generated from microsoft/python-package-template
Commit feb43df (1 parent: 4ab8c7b), showing 8 changed files with 215 additions and 20 deletions.
pyproject.toml:

@@ -7,15 +7,13 @@ name = "tf-tabular"
authors = [
    {name = "Mathias Claassen", email = "[email protected]"},
]
-description = "TODO"
+description = "TF Tabular simplifies the experimentation and preprocessing of tabular datasets for TensorFlow models."
readme = "README.md"
classifiers = [
    "Development Status :: 3 - Alpha",
    "Intended Audience :: Developers",
    "License :: OSI Approved :: MIT License",
    "Programming Language :: Python :: 3 :: Only",
    "Programming Language :: Python :: 3.8",
    "Programming Language :: Python :: 3.9",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11"
]

@@ -110,7 +108,7 @@ line-ending = "auto"

[tool.pytest.ini_options]
-addopts = "--cov-report xml:coverage.xml --cov src --cov-fail-under 0 --cov-append -m 'not integration'"
+addopts = "--cov-report html:coverage.html --cov src --cov-fail-under 0 --cov-append -m 'not integration'"
pythonpath = [
    "src"
]
New test file covering tf_tabular.utils.get_vocab:

@@ -0,0 +1,32 @@
import pandas as pd
from tf_tabular.utils import get_vocab


def test_get_vocab_string():
    df = pd.Series(["a", "b", "c", "a", "b", "c"])
    vocab = get_vocab(df)
    assert set(vocab) == set(["a", "b", "c"])


def test_get_vocab_max_size():
    df = pd.Series(["a", "b", "c", "a", "b"])
    vocab = get_vocab(df, max_size=2)
    assert set(vocab) == set(["a", "b"])


def test_get_vocab_int():
    df = pd.Series([1, 2, 3, 1, 2, 3])
    vocab = get_vocab(df)
    assert set(vocab) == set([1, 2, 3])


def test_exclude_none():
    df = pd.Series(["a", "b", "_none_"])
    vocab = get_vocab(df)
    assert set(vocab) == set(["a", "b"])


def test_vocab_lists():
    df = pd.Series([["a", "b"], ["c", "d"], ["a", "b"], ["c", "b"]])
    vocab = get_vocab(df)
    assert set(vocab) == set(["a", "b", "c", "d"])
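For context on what these tests pin down, below is a minimal sketch of a get_vocab implementation that would satisfy them. It is an illustrative assumption, not the actual code in tf_tabular.utils:

```python
import pandas as pd


def get_vocab(series: pd.Series, max_size=None):
    """Return the unique values of a Series, most frequent first."""
    counts = series.explode().value_counts()   # explode() flattens list-valued rows; scalars pass through
    counts = counts[counts.index != "_none_"]  # drop the "_none_" placeholder used for missing values
    if max_size is not None:
        counts = counts.iloc[:max_size]        # keep only the most frequent values
    return counts.index.tolist()
```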
New test file covering InputBuilder and categorical inputs:

@@ -0,0 +1,59 @@
import pytest
import pandas as pd
import numpy as np
from tensorflow.keras import Model
from tf_tabular.builder import InputBuilder


def test_input_builder_defaults():
    builder = InputBuilder()
    assert builder.input_specs == []
    assert builder.sequence_processor is None
    assert builder.combiner == "mean"
    assert builder.numeric_processor.num_projection is None


def test_add_categoricals_missing_params():
    builder = InputBuilder()
    pytest.raises(KeyError, builder.add_inputs_list, categoricals=["a", "b"])
    pytest.raises(KeyError, builder.add_inputs_list, categoricals=["a", "b"], vocabs={"a": [], "b": []})


def test_add_categoricals_with_embedding():
    builder = InputBuilder()
    builder.add_inputs_list(
        categoricals=["a", "b"], embedding_dims={"a": 10, "b": 20}, vocabs={"a": [1, 2, 3], "b": [4, 5, 6]}
    )
    assert len(builder.input_specs) == 2
    assert builder.input_specs[0].name == "a"
    assert builder.input_specs[1].name == "b"
    assert builder.input_specs[0].embedding_dim == 10
    assert builder.input_specs[1].embedding_dim == 20
    assert not builder.input_specs[0].is_sequence
    assert not builder.input_specs[1].is_sequence
    assert not builder.input_specs[0].is_multi_hot
    assert not builder.input_specs[1].is_multi_hot
    assert builder.input_specs[0].vocab == [1, 2, 3]
    assert builder.input_specs[1].vocab == [4, 5, 6]


def test_add_categoricals_with_embedding_df():
    builder = InputBuilder()
    emb_a = pd.DataFrame({"id": [1, 2, 3], "embedding": [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6], [0.7, 0.8, 0.9]]})
    emb_a["embedding"] = emb_a["embedding"].apply(np.array)
    builder.add_inputs_list(
        categoricals=["a"], embedding_dims={"a": 10}, vocabs={"a": [1, 2, 3]}, embedding_df={"a": emb_a}
    )
    inputs, output = builder.build_input_layers()
    model = Model(inputs=inputs, outputs=output)
    emb_layer = model.get_layer("a_emb")
    assert not emb_layer.trainable

    layer_embs = emb_layer.get_weights()[0]
    expected = np.stack(emb_a.embedding.values).astype(np.float32)

    assert layer_embs.shape == (4, 3)

    assert np.array_equal(layer_embs[1:], expected)
    # assert that the OOV embedding is the mean of the others
    assert np.allclose(layer_embs[0], emb_a.embedding.mean())
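To put these assertions in context, here is a rough usage sketch of the builder API exercised above, turning the same categorical configuration into a trainable model. Only add_inputs_list and build_input_layers come from the tests; the Dense head and compile settings are illustrative assumptions:

```python
from tensorflow.keras import Model, layers
from tf_tabular.builder import InputBuilder

# Declare two categorical columns with their vocabularies and embedding sizes,
# mirroring test_add_categoricals_with_embedding above.
builder = InputBuilder()
builder.add_inputs_list(
    categoricals=["a", "b"],
    embedding_dims={"a": 10, "b": 20},
    vocabs={"a": [1, 2, 3], "b": [4, 5, 6]},
)

# build_input_layers returns the Keras Input layers and the combined feature tensor.
inputs, output = builder.build_input_layers()

# Attach an arbitrary prediction head on top of the combined features (assumption).
head = layers.Dense(1, activation="sigmoid")(output)
model = Model(inputs=inputs, outputs=head)
model.compile(optimizer="adam", loss="binary_crossentropy")
```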
New test file covering numerical inputs and normalization:

@@ -0,0 +1,37 @@
import pytest
import numpy as np
from tensorflow.keras import Model
from tf_tabular.builder import InputBuilder


def test_add_numericals_with_normalization():
    builder = InputBuilder()
    params = {"a": {"sample": np.array([10, 4, 12])}, "b": {"mean": 3.1, "var": 1.0}}
    builder.add_inputs_list(numericals=["a", "b"], normalization_params=params)
    assert len(builder.input_specs) == 2
    assert builder.input_specs[0].name == "a"
    assert builder.input_specs[1].name == "b"
    assert not builder.input_specs[0].is_sequence
    assert not builder.input_specs[1].is_sequence
    assert np.array_equal(builder.input_specs[0].sample, params["a"]["sample"])
    assert builder.input_specs[1].mean == params["b"]["mean"]
    assert builder.input_specs[1].variance == params["b"]["var"]

    inputs, output = builder.build_input_layers()
    model = Model(inputs=inputs, outputs=output)
    assert model.get_layer("a_norm") is not None
    assert model.get_layer("b_norm") is not None


def test_add_numericals_no_norm():
    builder = InputBuilder()
    builder.add_inputs_list(numericals=["a"])
    assert len(builder.input_specs) == 1
    assert builder.input_specs[0].name == "a"
    assert builder.input_specs[0].sample is None
    assert builder.input_specs[0].mean is None
    assert builder.input_specs[0].variance is None

    inputs, output = builder.build_input_layers()
    model = Model(inputs=inputs, outputs=output)
    pytest.raises(ValueError, model.get_layer, "a_norm")
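For reference on the mean/var parameters asserted above: a Keras Normalization layer standardizes inputs as (x - mean) / sqrt(var). The snippet below shows that behaviour standalone with plain TensorFlow; whether tf_tabular builds its a_norm/b_norm layers exactly this way is an assumption inferred from the layer names:

```python
import numpy as np
import tensorflow as tf

# Normalization configured from known statistics, as for column "b" above.
norm_b = tf.keras.layers.Normalization(mean=3.1, variance=1.0)
print(norm_b(np.array([[3.1], [4.1]])).numpy())  # approximately [[0.0], [1.0]]

# Normalization adapted from a data sample, as for column "a" above.
norm_a = tf.keras.layers.Normalization(axis=None)
norm_a.adapt(np.array([10.0, 4.0, 12.0]))
print(norm_a(np.array([10.0, 4.0, 12.0])).numpy())  # roughly zero mean, unit variance
```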
New test file covering sequential and multi-hot inputs:

@@ -0,0 +1,49 @@
from tensorflow.keras import Model
from tf_tabular.builder import InputBuilder
from tf_tabular.sequence_processor import SequenceProcessor


def test_add_sequential_columns():
    builder = InputBuilder(sequence_processor=SequenceProcessor(attention_name="test_attn"))
    builder.add_inputs_list(
        categoricals=["a", "b"],
        embedding_dims={"a": 10, "b": 20},
        vocabs={"a": [1, 2, 3], "b": [4, 5, 6]},
        sequentials=["a"],
    )
    inputs, output = builder.build_input_layers()
    model = Model(inputs=inputs, outputs=output)
    assert model.get_layer("test_attn") is not None


def test_add_multihot_combiner_default():
    builder = InputBuilder()
    builder.add_inputs_list(categoricals=["a"], embedding_dims={"a": 10}, vocabs={"a": [1, 2, 3]}, multi_hots=["a"])
    inputs, output = builder.build_input_layers()
    model = Model(inputs=inputs, outputs=output)
    assert model.get_layer("a_emb").output_shape == (None, None, 10)
    assert model.get_layer("a_emb").trainable
    assert output.shape == (None, 10)
    assert model.get_layer("global_average_pooling1d_1") is not None


def test_add_multihot_combiner_max():
    builder = InputBuilder(combiner="max")
    builder.add_inputs_list(categoricals=["a"], embedding_dims={"a": 10}, vocabs={"a": [1, 2, 3]}, multi_hots=["a"])
    inputs, output = builder.build_input_layers()
    model = Model(inputs=inputs, outputs=output)
    assert model.get_layer("a_emb").output_shape == (None, None, 10)
    assert output.shape == (None, 10)
    assert model.get_layer("global_max_pooling1d") is not None
    assert model.get_layer("global_max_pooling1d").output_shape == (None, 10)


def test_add_multihot_combiner_sum():
    builder = InputBuilder(combiner="sum")
    builder.add_inputs_list(categoricals=["a"], embedding_dims={"a": 10}, vocabs={"a": [1, 2, 3]}, multi_hots=["a"])
    inputs, output = builder.build_input_layers()
    model = Model(inputs=inputs, outputs=output)
    assert model.get_layer("a_emb").output_shape == (None, None, 10)
    assert output.shape == (None, 10)
    assert model.get_layer("lambda") is not None
    assert model.get_layer("lambda").output_shape == (None, 10)
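For intuition about the combiner assertions above: each combiner collapses the (batch, num_items, embedding_dim) tensor of multi-hot embeddings over the items axis, using the pooling layers the tests look up by name. A small standalone illustration with plain Keras layers (the tensor values are made up for the example):

```python
import numpy as np
import tensorflow as tf

# A batch of one sample with two embedded items of dimension 3.
embs = np.array([[[1.0, 2.0, 3.0],
                  [3.0, 4.0, 5.0]]], dtype=np.float32)

mean_pool = tf.keras.layers.GlobalAveragePooling1D()(embs)                   # combiner="mean" -> [[2. 3. 4.]]
max_pool = tf.keras.layers.GlobalMaxPooling1D()(embs)                        # combiner="max"  -> [[3. 4. 5.]]
sum_pool = tf.keras.layers.Lambda(lambda t: tf.reduce_sum(t, axis=1))(embs)  # combiner="sum"  -> [[4. 6. 8.]]
```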