Data Processing
You can use PersiaBatch
to declare a batch of data in various types, shapes and implications.
- Processing ID Type Feature
- Non-ID Type Feature and Label
- Processing Meta Data
- PersiaBatch Processing Complete Example
Processing ID Type Feature
An ID type feature is a sparse matrix that contains variable lengths of discrete values. PERSIA converts these discrete values to embeddings by looking them up from the embedding-worker
. The conversion rules are different for different id_type_feature
, see embedding config for more details.
In addition, PersiaBatch
only accepts IDTypeFeature
or IDTypeFeatureWithSingleID
with np.uint64
datatype.
ID Type Feature with Variable Length
The following code shows how to convert an id_type_feature
with variable length to LIL
sparse matrix with fixed sample size.
import numpy as np
from persia.embedding.data import PersiaBatch, IDTypeFeature

# One inner list per sample; samples may hold any number of IDs.
id_type_feature_names = ["gender", "user_id", "photo_id"]

gender_data = [
    [0],
    [1],
    [0],
    [0],
    [1],
]
user_id_data = [
    [100001, 100003, 100005, 100020],
    [100001],
    [100001, 200001, 300001],
    [400001, 100001],
    [100001],
]
photo_id_data = [
    [400032, 400031],
    [400032, 400332, 420032, 400332],
    [400032],
    [],  # an empty sample is supported, but it must still be present to keep the batch aligned
    [400032, 401032, 400732, 460032, 500032],
]
id_type_feature_data = [gender_data, user_id_data, photo_id_data]

batch_size = 5

# Build the LIL layout: for each feature, a list of 1-D np.uint64 arrays,
# one array per sample, then wrap the list in an IDTypeFeature.
id_type_features = []
for feature_name, feature_samples in zip(id_type_feature_names, id_type_feature_data):
    per_sample_ids = [
        np.asarray(feature_samples[sample_idx], dtype=np.uint64)
        for sample_idx in range(batch_size)
    ]
    id_type_features.append(IDTypeFeature(feature_name, per_sample_ids))
ID Type Feature with Single ID
The code below shows how to process data that has only one ID per sample.
import numpy as np
from persia.embedding.data import PersiaBatch, IDTypeFeatureWithSingleID

id_type_feature_names = [
    "gender", "user_id", "photo_id"
]

# Each row is one sample; each column is one id_type_feature.
# IDTypeFeatureWithSingleID requires exactly one np.uint64 ID per sample.
id_type_feature_data = np.array([
    [0, 100001, 200001],
    [1, 100002, 300002],
    [0, 100003, 400002],
    [0, 100005, 410002],
    [1, 100006, 400032],
], dtype=np.uint64)

batch_size = 5
start = 0

id_type_features = []
for id_type_feature_idx, id_type_feature_name in enumerate(id_type_feature_names):
    # Slice one column (length == batch_size) for this feature.
    # (An unused `id_type_feature = []` local was removed from the original.)
    id_type_features.append(
        IDTypeFeatureWithSingleID(
            id_type_feature_name,
            id_type_feature_data[start: start + batch_size, id_type_feature_idx],
        )
    )
Non-ID Type Feature and Label
Non-ID type features and labels are tensors with various data types and shapes. They must have the same batch size as the id_type_feature
in a PersiaBatch
.
The best practice is to stack data with the same type and then append it to non_id_type_features
, instead of appending one by one.
Datatype supported in NonIDTypeFeature
and Label
:
numpy.dtype |
---|
np.bool_ |
np.int8 |
np.int16 |
np.int32 |
np.int64 |
np.float32 |
np.float64 |
np.uint8 |
Here is an example:
import numpy as np
from persia.embedding.data import NonIDTypeFeature, Label

batch_size = 5

non_id_type_features = []
# add non_id_type_feature
# int8 image embedding produced by a DNN extractor.
non_id_type_features.append(NonIDTypeFeature(np.ones((batch_size, 256), dtype=np.int8)))
# General statistics such as average income, height, weight.
# Merge non_id_type_features of the same datatype into one array when possible.
# NOTE: the original `np.eye((batch_size, 3) dtype=...)` was a syntax error and
# an invalid call — np.eye takes N and M as separate arguments, not a tuple.
non_id_type_features.append(NonIDTypeFeature(np.eye(batch_size, 3, dtype=np.float32)))
# Image pixel data or remote-sensing data with multiple dimensions.
non_id_type_features.append(NonIDTypeFeature(np.ones((batch_size, 3, 224, 224), dtype=np.int8)))

labels = []
# add label
# Multi-label classification target. np.bool_ replaces np.bool, which was
# removed in NumPy 1.24.
labels.append(Label(np.ones((batch_size, 4), dtype=np.bool_)))
# Regression target.
labels.append(Label(np.ones((batch_size,), dtype=np.float32)))
Processing Meta Data
There is an optional meta field in PersiaBatch
to store unstructured data. You are able to serialize the object into bytes and add it into PersiaBatch
.
import json
import pickle
import time

import numpy as np  # was missing in the original snippet, but `np` is used below
from persia.embedding.data import PersiaBatch, IDTypeFeature  # fixed typo: was PesiaBatch

batch_size = 5
# A placeholder id_type_feature: empty ID lists, but still one entry per sample
# so the batch size is well defined.
id_type_features = [
    IDTypeFeature(
        "empty_id_type_feature_with_batch_size",
        [np.array([], dtype=np.uint64)] * batch_size,
    )
]

meta_info = {
    "batch_id": 100000000,
    "timestamp": time.time(),
}
# The meta field stores bytes, so encode the serialized JSON string.
meta_json_bytes = json.dumps(meta_info).encode("utf-8")
# Or serialize meta_info with pickle (pickle.dumps already returns bytes):
# meta_pickle_bytes = pickle.dumps(meta_info)

PersiaBatch(
    id_type_features,
    meta=meta_json_bytes
)
PersiaBatch Processing Complete Example
Here is a complete example of how to generate a PersiaBatch
from raw data:
import json
import time

import numpy as np
from persia.embedding.data import PersiaBatch, IDTypeFeature, NonIDTypeFeature, Label

batch_size = 5

# --- ID type features: sparse, variable number of IDs per sample ---
id_type_feature_names = [
    "gender", "user_id", "photo_id"
]
gender_data = [
    [0],
    [1],
    [0],
    [0],
    [1],
]
user_id_data = [
    [100001, 100003, 100005, 100020],
    [100001],
    [100001, 200001, 300001],
    [400001, 100001],
    [100001],
]
photo_id_data = [
    [400032, 400031],
    [400032, 400332, 420032, 400332],
    [400032],
    [],  # an empty sample is supported, but it must still be present to keep the batch aligned
    [400032, 401032, 400732, 460032, 500032],
]
id_type_feature_data = [
    gender_data, user_id_data, photo_id_data
]

id_type_features = []
for id_type_feature_idx, id_type_feature_name in enumerate(id_type_feature_names):
    # One 1-D np.uint64 array per sample (LIL layout).
    id_type_feature = []
    for batch_idx in range(batch_size):
        id_type_feature.append(
            np.array(
                id_type_feature_data[id_type_feature_idx][batch_idx: batch_idx + 1],
                dtype=np.uint64
            ).reshape(-1)
        )
    id_type_features.append(IDTypeFeature(id_type_feature_name, id_type_feature))

# --- Non-ID type features: dense tensors, same batch size ---
non_id_type_features = []
# int8 image embedding produced by a DNN extractor.
non_id_type_features.append(NonIDTypeFeature(np.ones((batch_size, 256), dtype=np.int8)))
# General statistics such as average income, height, weight; merge features of
# the same datatype into one array when possible.
# NOTE: the original `np.eye((batch_size, 3) dtype=...)` was a syntax error and
# an invalid call — np.eye takes N and M as separate arguments, not a tuple.
non_id_type_features.append(NonIDTypeFeature(np.eye(batch_size, 3, dtype=np.float32)))
# Image pixel data or remote-sensing data with multiple dimensions.
non_id_type_features.append(NonIDTypeFeature(np.ones((batch_size, 3, 224, 224), dtype=np.int8)))

# --- Labels ---
labels = []
# Multi-label classification target. np.bool_ replaces np.bool, which was
# removed in NumPy 1.24.
labels.append(Label(np.ones((batch_size, 4), dtype=np.bool_), name="ctr_label"))
# Regression task label.
labels.append(Label(np.ones((batch_size), dtype=np.float32), name="income_label"))

# --- Optional unstructured metadata, serialized by the caller ---
meta_info = {
    "batch_id": 100000000,
    "timestamp": time.time()
}

persia_batch = PersiaBatch(
    id_type_features,
    non_id_type_features=non_id_type_features,
    labels=labels,
    requires_grad=True,
    meta=json.dumps(meta_info)
)