Skip to content

Commit

Permalink
Merge pull request #46 from transferwise/tree
Browse files Browse the repository at this point in the history
A tree-based (non-overlapping) solver
  • Loading branch information
EgorKraevTransferwise committed May 7, 2024
2 parents 32af520 + f87bb99 commit ca74864
Show file tree
Hide file tree
Showing 11 changed files with 448 additions and 85 deletions.
45 changes: 39 additions & 6 deletions tests/test_fit.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
explain_timeseries,
)
from wise_pizza.segment_data import SegmentData
from wise_pizza.solver import solve_lasso, solve_lp
from wise_pizza.solve.solver import solve_lasso, solve_lp
from wise_pizza.time import create_time_basis
from wise_pizza.plotting_time import plot_time

Expand All @@ -33,7 +33,7 @@
# Too long, delete some values for quick starts, e.g. by deleting the parameters in nan_percent, size_one_percent
deltas_test_values = [
("totals", "split_fits", "force_dim", "extra_dim"), # how
("lp", "lasso"), # solver
("lp", "lasso", "tree"), # solver
(True,), # plot_is_static
(explain_changes_in_average, explain_changes_in_totals), # function
(0.0, 90.0), # nan_percent
Expand All @@ -44,7 +44,7 @@

# possible values for explain_levels
levels_test_values = [
("lp", "lasso"), # solver
("lp", "lasso", "tree"), # solver
(0.0, 90.0), # nan_percent
(0.0, 90.0), # size_one_percent
]
Expand Down Expand Up @@ -136,9 +136,9 @@ def test_categorical():
print("yay!")


@pytest.mark.parametrize("nan_percent", [0.0, 1.0])
def test_synthetic_template(nan_percent: float):
all_data = synthetic_data(init_len=1000)
@pytest.mark.parametrize("nan_percent, clustering", [[0.0, False], [1.0, False]])
def test_synthetic_template(nan_percent: float, clustering: bool):
all_data = synthetic_data(init_len=10000, dim_values=5)
data = all_data.data

data.loc[(data["dim0"] == 0) & (data["dim1"] == 1), "totals"] += 100
Expand All @@ -155,6 +155,7 @@ def test_synthetic_template(nan_percent: float):
min_segments=5,
verbose=1,
solver="lp",
cluster_values=clustering,
)
print("***")
for s in sf.segments:
Expand All @@ -167,6 +168,38 @@ def test_synthetic_template(nan_percent: float):
print("yay!")


@pytest.mark.parametrize("nan_percent", [0.0, 1.0])
def test_synthetic_template_tree(nan_percent: float):
    """Smoke-test ``explain_levels`` with ``solver="tree"`` on synthetic data.

    Two segment effects are added to the ``totals`` column before fitting;
    the resulting segments are only printed, not asserted on (see TODO).
    """
    synthetic = synthetic_data(init_len=1000)
    frame = synthetic.data

    # Plant two known effects: +200 on (dim0 == 0, dim1 == 1) and
    # +300 on (dim1 == 0, dim2 == 1).
    first_effect = (frame["dim0"] == 0) & (frame["dim1"] == 1)
    frame.loc[first_effect, "totals"] += 200
    second_effect = (frame["dim1"] == 0) & (frame["dim2"] == 1)
    frame.loc[second_effect, "totals"] += 300

    if nan_percent > 0:
        frame = values_to_nan(frame, nan_percent)

    fit_options = dict(
        dims=synthetic.dimensions,
        total_name=synthetic.segment_total,
        size_name=synthetic.segment_size,
        max_depth=2,
        min_segments=5,
        verbose=1,
        solver="tree",
    )
    sf = explain_levels(frame, **fit_options)

    print("***")
    for segment in sf.segments:
        print(segment)

    # TODO: insert appropriate asserts
    # assert abs(sf.segments[0]["coef"] - 300) < 2
    # assert abs(sf.segments[1]["coef"] - 100) < 2

    # sf.plot()
    print("yay!")


@pytest.mark.parametrize("nan_percent", [0.0, 1.0])
def test_synthetic_ts_template(nan_percent: float):
all_data = synthetic_ts_data(init_len=10000)
Expand Down
75 changes: 70 additions & 5 deletions wise_pizza/cluster.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
from typing import List, Dict, Tuple
from collections import defaultdict

import numpy as np
import pandas as pd
from sklearn.preprocessing import PowerTransformer
Expand All @@ -18,17 +21,27 @@ def guided_kmeans(X: np.ndarray, power_transform: bool = True) -> np.ndarray:
X = X.values

if power_transform:
if len(X[X > 0] > 1):
X[X > 0] = PowerTransformer(standardize=False).fit_transform(X[X > 0].reshape(-1, 1)).reshape(-1)
if len(X[X < 0] > 1):
X[X < 0] = -PowerTransformer(standardize=False).fit_transform(-X[X < 0].reshape(-1, 1)).reshape(-1)
if len(X[X > 0]) > 1:
X[X > 0] = (
PowerTransformer(standardize=False)
.fit_transform(X[X > 0].reshape(-1, 1))
.reshape(-1)
)
if len(X[X < 0]) > 1:
X[X < 0] = (
-PowerTransformer(standardize=False)
.fit_transform(-X[X < 0].reshape(-1, 1))
.reshape(-1)
)

best_score = -1
best_labels = None
best_n = -1
# If we allow 2 clusters, it almost always just splits positive vs negative - boring!
for n_clusters in range(3, int(len(X) / 2) + 1):
cluster_labels = KMeans(n_clusters=n_clusters, init="k-means++", n_init=10).fit_predict(X)
cluster_labels = KMeans(
n_clusters=n_clusters, init="k-means++", n_init=10
).fit_predict(X)
score = silhouette_score(X, cluster_labels)
# print(n_clusters, score)
if score > best_score:
Expand All @@ -45,3 +58,55 @@ def to_matrix(labels: np.ndarray) -> np.ndarray:
for i in labels.unique():
out[labels == i, i] = 1.0
return out


def make_clusters(dim_df: pd.DataFrame, dims: List[str]):
    """Build named clusters of values for each dimension with enough values.

    For every dimension in ``dims`` that has at least 6 distinct values, the
    ``totals`` and ``weights`` columns are summed per value, the ratio
    ``totals / weights`` is passed to ``guided_kmeans``, and the values that
    share a cluster label are joined with ``"@@"``.

    Args:
        dim_df: Frame containing the dimension columns plus ``"totals"`` and
            ``"weights"`` columns.
        dims: Names of the dimension columns to consider.

    Returns:
        Dict mapping ``"{dim}_cluster_{k}"`` (``k`` starting at 1 within each
        dimension) to the ``"@@"``-joined string of the values in that
        cluster. Clusters holding a single value are left out.
    """
    named_clusters = {}
    for dim in dims:
        # With fewer than 6 distinct values there is no point in clustering.
        if len(dim_df[dim].unique()) < 6:
            continue

        per_value = dim_df[[dim, "totals", "weights"]].groupby(dim, as_index=False).sum()
        per_value["avg"] = per_value["totals"] / per_value["weights"]
        # Only the first item returned by guided_kmeans (the labels) is used.
        per_value["cluster"], _ = guided_kmeans(per_value["avg"])

        joined_members = (
            per_value[["cluster", dim]]
            .groupby("cluster")
            .agg({dim: lambda members: "@@".join(members)})
            .values
        )

        # A joined string without the separator is a one-element cluster: drop it.
        multi_value = [c for c in joined_members.reshape(-1) if "@@" in c]

        # Short names are numbered from 1 within each dimension.
        for position, members in enumerate(multi_value, start=1):
            named_clusters[f"{dim}_cluster_{position}"] = members
    return named_clusters


def nice_cluster_names(x: List[Dict[str, List[str]]]) -> Tuple[List[Dict], Dict]:
    """Replace multi-value dimension entries with short cluster names.

    Each element of ``x`` maps a dimension name to a list of values. A list
    with more than one value is treated as a cluster and is replaced by a
    short name of the form ``"{dim}_cluster_{k}"``; a single-value list is
    replaced by that value itself.

    Cluster numbers are assigned in order of first appearance in ``x``, so
    the same input always produces the same names. (Numbering by iterating
    a ``set`` of strings would depend on string hashing, which can differ
    between interpreter runs.)

    Args:
        x: Segment definitions, each a dict of dimension name -> list of
            values. Value lists are indexed with ``[0]`` when they do not
            hold more than one element, so they are expected to be non-empty.

    Returns:
        A tuple ``(col_defs, cluster_names)`` where ``col_defs`` has one dict
        per element of ``x`` (dimension -> single value or short cluster
        name) and ``cluster_names`` maps every short cluster name to the
        ``"@@"``-joined string of the values it stands for.
    """
    # First pass: collect the distinct clusters per dimension. A dict is used
    # as an insertion-ordered set so that numbering is reproducible.
    cluster_strings = defaultdict(dict)
    for segment in x:
        for dim, values in segment.items():
            if len(values) > 1:
                cluster_strings[dim]["@@".join(values)] = None

    # Second pass: assign short names and build the reverse lookup.
    cluster_names = {}
    reverse_cluster_names = {}
    for dim, clusters in cluster_strings.items():
        reverse_cluster_names[dim] = {}
        for i, joined in enumerate(clusters, start=1):
            short_name = f"{dim}_cluster_{i}"
            cluster_names[short_name] = joined
            reverse_cluster_names[dim][joined] = short_name

    # Third pass: rewrite every segment definition using the short names.
    col_defs = []
    for segment in x:
        this_def = {}
        for dim, values in segment.items():
            if len(values) > 1:
                this_def[dim] = reverse_cluster_names[dim]["@@".join(values)]
            else:
                this_def[dim] = values[0]
        col_defs.append(this_def)

    return col_defs, cluster_names
9 changes: 8 additions & 1 deletion wise_pizza/explain.py
Original file line number Diff line number Diff line change
Expand Up @@ -361,6 +361,7 @@ def explain_timeseries(
max_depth: int = 2,
solver: str = "omp",
verbose: bool = False,
constrain_signs: bool = False,
cluster_values: bool = False,
time_basis: Optional[pd.DataFrame] = None,
fit_log_space: bool = False,
Expand Down Expand Up @@ -388,7 +389,10 @@ def explain_timeseries(
fit_sizes = True

if fit_log_space:
tf = LogTransform(offset=1, weight_pow_sc=log_space_weight_sc)
tf = LogTransform(
offset=1,
weight_pow_sc=log_space_weight_sc,
)
else:
tf = IdentityTransform()

Expand All @@ -415,6 +419,7 @@ def explain_timeseries(
max_depth=max_depth,
solver=solver,
verbose=verbose,
constrain_signs=constrain_signs,
cluster_values=cluster_values,
time_basis=time_basis,
)
Expand All @@ -441,6 +446,7 @@ def explain_timeseries(
max_depth=max_depth,
solver=solver,
verbose=verbose,
constrain_signs=constrain_signs,
cluster_values=cluster_values,
time_basis=time_basis,
)
Expand Down Expand Up @@ -477,6 +483,7 @@ def explain_timeseries(
max_depth=max_depth,
solver=solver,
verbose=verbose,
constrain_signs=constrain_signs,
cluster_values=cluster_values,
time_basis=time_basis,
)
Expand Down
Loading

0 comments on commit ca74864

Please sign in to comment.