Commit
Remove prints for logging statements
mats-claassen committed May 8, 2024
1 parent 8b0055a commit 3c263cf
Showing 2 changed files with 17 additions and 7 deletions.
examples/sequential/preprocessing.py (18 changes: 12 additions & 6 deletions)
@@ -1,4 +1,5 @@
 from typing import List
+import logging
 
 import pandas as pd
 import numpy as np
@@ -7,6 +8,9 @@
 from tf_tabular.utils import get_vocab
 
 
+logger = logging.getLogger(__name__)
+
+
 def normalize_ratings_by_mean_user_rating(ratings: pd.DataFrame, user_id_column="user_id"):
     """Normalizes the ratings by subtracting the mean rating on a user basis.
@@ -25,12 +29,14 @@ def split_by_user(
     ratings: pd.DataFrame,
     max_y_cutoff: int,
     val_split: float = 0.2,
+    target_split: float = 0.2,
 ):
     """Split dataset by users.
     :param pd.DataFrame ratings: User ratings dataframe
     :param int max_y_cutoff: Max number of movies that will be used as targets per user
     :param float val_split: Validation dataset split, defaults to 0.2
+    :param float target_split: Percent of user actions to leave as prediction target, defaults to 0.2
     :return tuple (pd.DataFrame, pd.DataFrame): Train and validation datasets
     """
     ratings = ratings.sort_values(["user_id", "timestamp"])
@@ -40,7 +46,7 @@ def split_by_user(
     ratings = ratings[["user_id", "movie_id", "user_rating"]].groupby(["user_id"], as_index=False).agg(list)
 
     def cutoff(x):
-        return min(int(len(x) * val_split), max_y_cutoff)
+        return min(int(len(x) * target_split), max_y_cutoff)
 
     ratings["user_history"] = ratings["movie_id"].apply(lambda x: x[: -cutoff(x)])
     ratings["target_id"] = ratings["movie_id"].apply(lambda x: x[-cutoff(x) :])
@@ -55,14 +61,14 @@ def cutoff(x):
 
     np.random.shuffle(unique_users)
     num_users = len(unique_users)
-    print(f"Unique users: {num_users}")
-    val_users = unique_users[: int(num_users * 0.2)]
-    train_users = unique_users[int(num_users * 0.2) :]
+    logger.info(f"Unique users: {num_users}")
+    val_users = unique_users[: int(num_users * val_split)]
+    train_users = unique_users[int(num_users * val_split) :]
     train_set = ratings[ratings.user_id.isin(train_users)]
     val_set = ratings[ratings.user_id.isin(val_users)]
 
-    print(f"Train set size: {train_set.shape}")
-    print(f"Validation set size: {val_set.shape}")
+    logger.info(f"Train set size: {train_set.shape}")
+    logger.info(f"Validation set size: {val_set.shape}")
     return train_set, val_set
 
 
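For context, a minimal sketch (not part of the commit) of how the two fractions behave after this change: target_split trims the tail of each user's history into prediction targets, while val_split partitions users into train and validation sets. The numbers and variable names below are illustrative only.

import numpy as np

# Toy illustration of the split semantics assumed from the diff above.
target_split, val_split, max_y_cutoff = 0.2, 0.2, 5

# Per user: the last `cutoff` interactions become the prediction target.
history = list(range(20))  # one user's movie ids, oldest to newest
cutoff = min(int(len(history) * target_split), max_y_cutoff)  # -> 4
user_history, target_ids = history[:-cutoff], history[-cutoff:]
assert len(user_history) == 16 and len(target_ids) == 4

# Across users: a val_split fraction of *users* goes to validation.
users = np.arange(100)
np.random.shuffle(users)
val_users = users[: int(len(users) * val_split)]   # 20 users
train_users = users[int(len(users) * val_split):]  # 80 users
assert len(val_users) == 20 and len(train_users) == 80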
src/tf_tabular/input_spec.py (6 changes: 5 additions & 1 deletion)
@@ -1,11 +1,15 @@
 from enum import Enum
+import logging
 import numpy as np
 import pandas as pd
 import tensorflow as tf
 from typing import List
 from .utils import build_continuous_input, build_categorical_input
 
 
+logger = logging.getLogger(__name__)
+
+
 class ColumnType(Enum):
     NUMERIC = 1
     CATEGORICAL = 2
@@ -85,7 +89,7 @@ def __init__(self, name: str, norm_params: dict, is_sequence: bool = False):
             self.variance = norm_params["var"]
         else:
             # No normalization
-            print(f"No normalization parameters found for {name}. Not normalizing this column")
+            logger.info(f"No normalization parameters found for {name}. Not normalizing this column")
 
     def build_layer(self):
         """Builds the input layer stack for the numeric feature"""
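Since both modules now log through logging.getLogger(__name__), these messages are silent by default and only show up once the application configures logging. A minimal sketch of such a driver script follows; the per-module logger name is an assumption based on the file path above.

import logging

# Show INFO-level messages from all libraries, including tf_tabular.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(name)s %(levelname)s: %(message)s",
)

# Or adjust a single module's logger, e.g. the one created in
# src/tf_tabular/input_spec.py (name assumed to be "tf_tabular.input_spec").
logging.getLogger("tf_tabular.input_spec").setLevel(logging.WARNING)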
