Skip to content

Commit 702e548

Browse files
committed
Fix deduplication filtering; add new evaluation system for record linkage (RL); todo -> docs
1 parent 64a0448 commit 702e548

File tree

2 files changed

+51
-81
lines changed

2 files changed

+51
-81
lines changed

blockingpy/blocker.py

+48-80
Original file line numberDiff line numberDiff line change
@@ -257,13 +257,13 @@ def block(
257257
logger.info("===== creating graph =====\n")
258258

259259
if deduplication:
260-
x_df = x_df[x_df["y"] > x_df["x"]]
261-
# x_df['pair'] = x_df.apply(lambda row: tuple(sorted([row['y'], row['x']])), axis=1)
262-
# x_df = x_df.loc[x_df.groupby('pair')['dist'].idxmin()]
263-
# x_df = x_df.drop('pair', axis=1)
260+
x_df["pair"] = x_df.apply(lambda row: tuple(sorted([row["y"], row["x"]])), axis=1)
261+
x_df = x_df.loc[x_df.groupby("pair")["dist"].idxmin()]
262+
x_df = x_df.drop("pair", axis=1)
264263

265264
x_df["query_g"] = "q" + x_df["y"].astype(str)
266265
x_df["index_g"] = "q" + x_df["x"].astype(str)
266+
print(f"X shape: {x_df.shape}")
267267
else:
268268
x_df["query_g"] = "q" + x_df["y"].astype(str)
269269
x_df["index_g"] = "i" + x_df["x"].astype(str)
@@ -294,71 +294,36 @@ def block(
294294

295295
if true_blocks is not None:
296296
if not deduplication:
297-
pairs_to_eval = x_df[x_df["y"].isin(true_blocks["y"])][["x", "y", "block"]]
298-
pairs_to_eval = pairs_to_eval.merge(
299-
true_blocks[["x", "y"]], on=["x", "y"], how="left", indicator="both"
297+
candidate_pairs = list(itertools.product(list(range(len(x_dtm))), true_blocks["y"]))
298+
cp_df = pd.DataFrame(candidate_pairs, columns=["x", "y"])
299+
cp_df = cp_df.astype(int)
300+
comparison_df = (
301+
cp_df.merge(true_blocks, on=["x", "y"], how="left")
302+
.rename(columns={"block": "block_true"})
303+
.merge(x_df, on=["x", "y"], how="left")
304+
.rename(columns={"block": "block_pred"})
300305
)
301-
pairs_to_eval["both"] = np.where(pairs_to_eval["both"] == "both", 0, -1)
302-
true_blocks = true_blocks.merge(
303-
pairs_to_eval[["x", "y"]],
304-
on=["x", "y"],
305-
how="left",
306-
indicator="both",
306+
comparison_df["TP"] = (comparison_df["block_true"].notna()) & (
307+
comparison_df["block_pred"].notna()
307308
)
308-
true_blocks["both"] = np.where(true_blocks["both"] == "both", 0, 1)
309-
true_blocks["block"] += pairs_to_eval["block"].max()
310-
311-
to_concat = true_blocks[true_blocks["both"] == 1][["x", "y", "block", "both"]]
312-
pairs_to_eval = pd.concat([pairs_to_eval, to_concat], ignore_index=True)
313-
pairs_to_eval["row_id"] = range(len(pairs_to_eval))
314-
pairs_to_eval["x2"] = pairs_to_eval["x"] + pairs_to_eval["y"].max()
315-
316-
pairs_to_eval_long = pd.melt(
317-
pairs_to_eval[["y", "x2", "row_id", "block", "both"]],
318-
id_vars=["row_id", "block", "both"],
319-
)
320-
321-
filtered_df = pairs_to_eval_long[pairs_to_eval_long["both"] == 0].copy()
322-
filtered_df["group_id"] = filtered_df.groupby("block").ngroup()
323-
pairs_to_eval_long.loc[pairs_to_eval_long["both"] == 0, "block_id"] = filtered_df[
324-
"group_id"
325-
]
326-
pairs_to_eval_long.loc[pairs_to_eval_long["both"] == 0, "true_id"] = filtered_df[
327-
"group_id"
328-
]
329-
330-
block_id_max = pairs_to_eval_long["block_id"].max(skipna=True)
331-
pairs_to_eval_long.loc[pairs_to_eval_long["both"] == -1, "block_id"] = (
332-
block_id_max + pairs_to_eval_long.groupby("row_id").ngroup() + 1
309+
# CNL: Correct Non-Links, i.e. true negatives (pair has neither a true block nor a predicted block)
310+
comparison_df["CNL"] = (comparison_df["block_true"].isna()) & (
311+
comparison_df["block_pred"].isna()
333312
)
334-
block_id_max = pairs_to_eval_long["block_id"].max(skipna=True)
335-
# recreating R's rleid function
336-
pairs_to_eval_long["rleid"] = (
337-
pairs_to_eval_long["row_id"] != pairs_to_eval_long["row_id"].shift(1)
338-
).cumsum()
339-
pairs_to_eval_long.loc[
340-
(pairs_to_eval_long["both"] == 1) & (pairs_to_eval_long["block_id"].isna()),
341-
"block_id",
342-
] = (
343-
block_id_max + pairs_to_eval_long["rleid"]
313+
comparison_df["FP"] = (comparison_df["block_true"].isna()) & (
314+
comparison_df["block_pred"].notna()
344315
)
345-
346-
true_id_max = pairs_to_eval_long["true_id"].max(skipna=True)
347-
pairs_to_eval_long.loc[pairs_to_eval_long["both"] == 1, "true_id"] = (
348-
true_id_max + pairs_to_eval_long.groupby("row_id").ngroup() + 1
316+
comparison_df["FN"] = (comparison_df["block_true"].notna()) & (
317+
comparison_df["block_pred"].isna()
349318
)
350-
true_id_max = pairs_to_eval_long["true_id"].max(skipna=True)
351-
# recreating R's rleid function again
352-
pairs_to_eval_long["rleid"] = (
353-
pairs_to_eval_long["row_id"] != pairs_to_eval_long["row_id"].shift(1)
354-
).cumsum()
355-
pairs_to_eval_long.loc[
356-
(pairs_to_eval_long["both"] == -1) & (pairs_to_eval_long["true_id"].isna()),
357-
"true_id",
358-
] = (
359-
true_id_max + pairs_to_eval_long["rleid"]
319+
self.confusion = pd.DataFrame(
320+
[
321+
[comparison_df["CNL"].sum(), comparison_df["FN"].sum()],
322+
[comparison_df["FP"].sum(), comparison_df["TP"].sum()],
323+
],
324+
index=["Predicted Negative", "Predicted Positive"],
325+
columns=["Actual Negative", "Actual Positive"],
360326
)
361-
pairs_to_eval_long = pairs_to_eval_long.drop(columns=["rleid"], axis=1)
362327

363328
else:
364329
pairs_to_eval_long = (
@@ -369,24 +334,27 @@ def block(
369334
.rename(columns={"block": "true_id"})
370335
)
371336

372-
candidate_pairs = np.array(
373-
list(itertools.combinations(range(pairs_to_eval_long.shape[0]), 2))
374-
)
375-
block_id_array = pairs_to_eval_long["block_id"].to_numpy()
376-
true_id_array = pairs_to_eval_long["true_id"].to_numpy()
377-
same_block = (
378-
block_id_array[candidate_pairs[:, 0]] == block_id_array[candidate_pairs[:, 1]]
379-
)
380-
same_truth = (
381-
true_id_array[candidate_pairs[:, 0]] == true_id_array[candidate_pairs[:, 1]]
382-
)
337+
candidate_pairs = np.array(
338+
list(itertools.combinations(range(pairs_to_eval_long.shape[0]), 2))
339+
)
340+
341+
block_id_array = pairs_to_eval_long["block_id"].to_numpy()
342+
true_id_array = pairs_to_eval_long["true_id"].to_numpy()
343+
same_block = (
344+
block_id_array[candidate_pairs[:, 0]] == block_id_array[candidate_pairs[:, 1]]
345+
)
346+
same_truth = (
347+
true_id_array[candidate_pairs[:, 0]] == true_id_array[candidate_pairs[:, 1]]
348+
)
383349

384-
self.confusion = pd.crosstab(same_block, same_truth)
350+
self.confusion = pd.crosstab(same_block, same_truth)
351+
self.confusion.index = ["Predicted Negative", "Predicted Positive"]
352+
self.confusion.columns = ["Actual Negative", "Actual Positive"]
385353

386-
fp = self.confusion.loc[True, False]
387-
fn = self.confusion.loc[False, True]
388-
tp = self.confusion.loc[True, True]
389-
tn = self.confusion.loc[False, False]
354+
fp = self.confusion.iloc[1, 0]
355+
fn = self.confusion.iloc[0, 1]
356+
tp = self.confusion.iloc[1, 1]
357+
tn = self.confusion.iloc[0, 0]
390358

391359
recall = tp / (fn + tp) if (fn + tp) != 0 else 0
392360
precision = tp / (tp + fp) if (tp + fp) != 0 else 0
@@ -420,5 +388,5 @@ def block(
420388
eval_metrics=self.eval_metrics,
421389
confusion=self.confusion,
422390
colnames_xy=colnames_xy,
423-
graph=graph,
391+
graph=graph,
424392
)

blockingpy/blocking_result.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,8 @@ class BlockingResult:
5454
Names of columns used in blocking
5555
graph : networkx.Graph or None
5656
Network representation of blocking results if requested
57+
len_x : int
58+
Number of records in the original reference dataset
5759
5860
Notes
5961
-----
@@ -72,7 +74,7 @@ def __init__(
7274
eval_metrics: pd.Series | None,
7375
confusion: pd.DataFrame | None,
7476
colnames_xy: np.ndarray,
75-
graph: bool | None = False,
77+
graph: bool | None = False,
7678
) -> None:
7779
"""Initialize a BlockingResult instance."""
7880
self.result = x_df[["x", "y", "block", "dist"]]

0 commit comments

Comments
 (0)