Data Processing

You can use PersiaBatch to declare a batch of data with various datatypes, shapes, and semantics.

Processing ID Type Feature

An ID type feature is a sparse matrix that contains a variable number of discrete values per sample. PERSIA converts these discrete values to embeddings by looking them up on the embedding worker. The conversion rules can differ between id_type_features; see embedding config for more details.

In addition, PersiaBatch only accepts IDTypeFeature or IDTypeFeatureWithSingleID data with the np.uint64 datatype.
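
Raw IDs often arrive as Python ints or signed integers, so cast them to np.uint64 before wrapping them. A minimal sketch (the raw_user_ids values below are made-up placeholders):

import numpy as np

from persia.embedding.data import IDTypeFeature

# raw IDs collected as plain Python ints (or np.int64)
raw_user_ids = [[100001, 100003], [100001], [200001, 300001]]

# PersiaBatch requires np.uint64, so cast each sample explicitly
user_id_feature = IDTypeFeature(
    "user_id",
    [np.asarray(ids, dtype=np.uint64) for ids in raw_user_ids],
)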

ID Type Feature with Variable Length

The following code shows how to convert a variable-length id_type_feature into a LIL sparse matrix with a fixed sample size.

import numpy as np

from persia.embedding.data import PersiaBatch, IDTypeFeature


id_type_feature_names = [
    "gender", "user_id", "photo_id"
]

gender_data = [
    [0],
    [1],
    [0],
    [0],
    [1]
]

user_id_data = [
    [100001, 100003, 100005, 100020],
    [100001],
    [100001, 200001, 300001],
    [400001, 100001],
    [100001]
]

photo_id_data = [
    [400032, 400031],
    [400032, 400332, 420032, 400332,],
    [400032],
    [], # empty id_type_features are supported, but must still be added to keep the batch size consistent
    [400032, 401032, 400732, 460032, 500032]
]

id_type_feature_data = [
    gender_data, user_id_data, photo_id_data
]

batch_size = 5
id_type_features = []

for id_type_feature_idx, id_type_feature_name in enumerate(id_type_feature_names):
    id_type_feature = []
    for batch_idx in range(batch_size):
        id_type_feature.append(
            np.array(
                id_type_feature_data[id_type_feature_idx][batch_idx: batch_idx + 1],
                dtype=np.uint64
            ).reshape(-1)
        )
    id_type_features.append(
        IDTypeFeature(id_type_feature_name, id_type_feature)
    )

ID Type Feature with Single ID

The code below shows how to process data that has only one ID per sample.

import numpy as np

from persia.embedding.data import PersiaBatch, IDTypeFeatureWithSingleID


id_type_feature_names = [
    "gender", "user_id", "photo_id"
]

id_type_feature_data = np.array([
    [0, 100001, 200001],
    [1, 100002, 300002],
    [0, 100003, 400002],
    [0, 100005, 410002],
    [1, 100006, 400032],
], dtype=np.uint64)

batch_size = 5
start = 0
id_type_features = []

for id_type_feature_idx, id_type_feature_name in enumerate(id_type_feature_names):
    id_type_features.append(
        IDTypeFeatureWithSingleID(
            id_type_feature_name,
            id_type_feature_data[start: start + batch_size, id_type_feature_idx]
        )
    )

Non-ID Type Feature and Label

Non-ID type features and labels are tensors with various datatypes and shapes. They must have the same batch size as the id_type_features in a PersiaBatch.

The best practice is to stack features of the same datatype into one array and append that to non_id_type_features, instead of appending each feature one by one.
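
For example, here is a minimal sketch of this stacking (the income, height, and weight arrays are made-up placeholders):

import numpy as np

from persia.embedding.data import NonIDTypeFeature

batch_size = 5

# three per-sample float statistics kept as separate 1-D arrays
income = np.random.rand(batch_size).astype(np.float32)
height = np.random.rand(batch_size).astype(np.float32)
weight = np.random.rand(batch_size).astype(np.float32)

# stack them into one (batch_size, 3) float32 array and wrap it
# in a single NonIDTypeFeature instead of three separate ones
stats = np.stack([income, height, weight], axis=1)
non_id_type_features = [NonIDTypeFeature(stats)]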

Datatypes (numpy.dtype) supported in NonIDTypeFeature and Label:

np.bool
np.int8
np.int16
np.int32
np.int64
np.float32
np.float64
np.uint8

Here is an example:

import numpy as np

from persia.embedding.data import NonIDTypeFeature, Label

batch_size = 5

non_id_type_features = []
# add non_id_type_feature
# int8 image_embedding from DNN Extractor
non_id_type_features.append(NonIDTypeFeature(np.ones((batch_size, 256), dtype=np.int8)))
# general statistics such as average income, height, weight
# you can merge non_id_type_features with the same datatype together
non_id_type_features.append(NonIDTypeFeature(np.eye(batch_size, 3, dtype=np.float32)))
# image_pixel_data or RS data with multiple dimension
non_id_type_features.append(NonIDTypeFeature(np.ones((batch_size, 3, 224, 224), dtype=np.int8)))

labels = []
# add label
# multi-label classification
labels.append(Label(np.ones((batch_size, 4), dtype=np.bool_)))
# regression label
labels.append(Label(np.ones((batch_size), dtype=np.float32)))

Processing Meta Data

There is an optional meta field in PersiaBatch for storing unstructured data. You can serialize the object into bytes and add it to the PersiaBatch.

import json
import pickle
import time

import numpy as np

from persia.embedding.data import PersiaBatch, IDTypeFeature
batch_size = 5
id_type_features = [
    IDTypeFeature(
        "empty_id_type_feature_with_batch_size",
        [np.array([], dtype=np.uint64)] * batch_size)
]

meta_info = {
    "batch_id": 100000000,
    "timestamp": time.time()
}

meta_json_bytes = json.dumps(meta_info).encode("utf-8")
# or use pickle to serialize the meta_info
# meta_pickle_bytes = pickle.dumps(meta_info)

PersiaBatch(
    id_type_features,
    meta=meta_json_bytes
)

PersiaBatch Processing Complete Example

Here is a complete example of how to generate a PersiaBatch from raw data:

import json
import time

import numpy as np

from persia.embedding.data import PersiaBatch, IDTypeFeature, NonIDTypeFeature, Label

batch_size = 5

id_type_feature_names = [
    "gender", "user_id", "photo_id"
]

gender_data = [
    [0],
    [1],
    [0],
    [0],
    [1]
]

user_id_data = [
    [100001, 100003, 100005, 100020],
    [100001],
    [100001, 200001, 300001],
    [400001, 100001],
    [100001]
]

photo_id_data = [
    [400032, 400031],
    [400032, 400332, 420032, 400332,],
    [400032],
    [], # empty id_type_features are supported, but must still be added to keep the batch size consistent
    [400032, 401032, 400732, 460032, 500032]
]

id_type_feature_data = [
    gender_data, user_id_data, photo_id_data
]

id_type_features = []

for id_type_feature_idx, id_type_feature_name in enumerate(id_type_feature_names):
    id_type_feature = []
    for batch_idx in range(batch_size):
        id_type_feature.append(
            np.array(
                id_type_feature_data[id_type_feature_idx][batch_idx: batch_idx + 1],
                dtype=np.uint64
            ).reshape(-1)
        )
    id_type_features.append(IDTypeFeature(id_type_feature_name, id_type_feature))


non_id_type_features = []
# add non_id_type_feature
# int8 image_embedding from DNN Extractor
non_id_type_features.append(NonIDTypeFeature(np.ones((batch_size, 256), dtype=np.int8)))

# general statistics such as average income, height, weight
# you can merge non_id_type_features with the same datatype together
non_id_type_features.append(NonIDTypeFeature(np.eye(batch_size, 3, dtype=np.float32)))

# image_pixel_data or RS data with multiple dimension
non_id_type_features.append(NonIDTypeFeature(np.ones((batch_size, 3, 224, 224), dtype=np.int8)))

labels = []
# add label
# multi-label classification task label
labels.append(Label(np.ones((batch_size, 4), dtype=np.bool_), name="ctr_label"))

# regression task label
labels.append(Label(np.ones((batch_size), dtype=np.float32), name="income_label"))

meta_info = {
    "batch_id": 100000000,
    "timestamp": time.time()
}

persia_batch = PersiaBatch(
    id_type_features,
    non_id_type_features=non_id_type_features,
    labels=labels,
    requires_grad=True,
    meta=json.dumps(meta_info).encode("utf-8")
)
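
Once constructed, the persia_batch is typically sent from the data loader process to the trainers. A minimal sketch of doing so, assuming the DataCtx interface shown in PERSIA's training examples:

from persia.ctx import DataCtx

# forward the batch to the embedding and NN workers
with DataCtx() as ctx:
    ctx.send_data(persia_batch)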