@@ -257,13 +257,13 @@ def block(
257
257
logger .info ("===== creating graph =====\n " )
258
258
259
259
if deduplication :
260
- x_df = x_df [x_df ["y" ] > x_df ["x" ]]
261
- # x_df['pair'] = x_df.apply(lambda row: tuple(sorted([row['y'], row['x']])), axis=1)
262
- # x_df = x_df.loc[x_df.groupby('pair')['dist'].idxmin()]
263
- # x_df = x_df.drop('pair', axis=1)
260
+ x_df ["pair" ] = x_df .apply (lambda row : tuple (sorted ([row ["y" ], row ["x" ]])), axis = 1 )
261
+ x_df = x_df .loc [x_df .groupby ("pair" )["dist" ].idxmin ()]
262
+ x_df = x_df .drop ("pair" , axis = 1 )
264
263
265
264
x_df ["query_g" ] = "q" + x_df ["y" ].astype (str )
266
265
x_df ["index_g" ] = "q" + x_df ["x" ].astype (str )
266
+ print (f"X shape: { x_df .shape } " )
267
267
else :
268
268
x_df ["query_g" ] = "q" + x_df ["y" ].astype (str )
269
269
x_df ["index_g" ] = "i" + x_df ["x" ].astype (str )
@@ -294,71 +294,36 @@ def block(
294
294
295
295
if true_blocks is not None :
296
296
if not deduplication :
297
- pairs_to_eval = x_df [x_df ["y" ].isin (true_blocks ["y" ])][["x" , "y" , "block" ]]
298
- pairs_to_eval = pairs_to_eval .merge (
299
- true_blocks [["x" , "y" ]], on = ["x" , "y" ], how = "left" , indicator = "both"
297
+ candidate_pairs = list (itertools .product (list (range (len (x_dtm ))), true_blocks ["y" ]))
298
+ cp_df = pd .DataFrame (candidate_pairs , columns = ["x" , "y" ])
299
+ cp_df = cp_df .astype (int )
300
+ comparison_df = (
301
+ cp_df .merge (true_blocks , on = ["x" , "y" ], how = "left" )
302
+ .rename (columns = {"block" : "block_true" })
303
+ .merge (x_df , on = ["x" , "y" ], how = "left" )
304
+ .rename (columns = {"block" : "block_pred" })
300
305
)
301
- pairs_to_eval ["both" ] = np .where (pairs_to_eval ["both" ] == "both" , 0 , - 1 )
302
- true_blocks = true_blocks .merge (
303
- pairs_to_eval [["x" , "y" ]],
304
- on = ["x" , "y" ],
305
- how = "left" ,
306
- indicator = "both" ,
306
+ comparison_df ["TP" ] = (comparison_df ["block_true" ].notna ()) & (
307
+ comparison_df ["block_pred" ].notna ()
307
308
)
308
- true_blocks ["both" ] = np .where (true_blocks ["both" ] == "both" , 0 , 1 )
309
- true_blocks ["block" ] += pairs_to_eval ["block" ].max ()
310
-
311
- to_concat = true_blocks [true_blocks ["both" ] == 1 ][["x" , "y" , "block" , "both" ]]
312
- pairs_to_eval = pd .concat ([pairs_to_eval , to_concat ], ignore_index = True )
313
- pairs_to_eval ["row_id" ] = range (len (pairs_to_eval ))
314
- pairs_to_eval ["x2" ] = pairs_to_eval ["x" ] + pairs_to_eval ["y" ].max ()
315
-
316
- pairs_to_eval_long = pd .melt (
317
- pairs_to_eval [["y" , "x2" , "row_id" , "block" , "both" ]],
318
- id_vars = ["row_id" , "block" , "both" ],
319
- )
320
-
321
- filtered_df = pairs_to_eval_long [pairs_to_eval_long ["both" ] == 0 ].copy ()
322
- filtered_df ["group_id" ] = filtered_df .groupby ("block" ).ngroup ()
323
- pairs_to_eval_long .loc [pairs_to_eval_long ["both" ] == 0 , "block_id" ] = filtered_df [
324
- "group_id"
325
- ]
326
- pairs_to_eval_long .loc [pairs_to_eval_long ["both" ] == 0 , "true_id" ] = filtered_df [
327
- "group_id"
328
- ]
329
-
330
- block_id_max = pairs_to_eval_long ["block_id" ].max (skipna = True )
331
- pairs_to_eval_long .loc [pairs_to_eval_long ["both" ] == - 1 , "block_id" ] = (
332
- block_id_max + pairs_to_eval_long .groupby ("row_id" ).ngroup () + 1
309
+ # CNL -> Correct Non-Links / True Negative
310
+ comparison_df ["CNL" ] = (comparison_df ["block_true" ].isna ()) & (
311
+ comparison_df ["block_pred" ].isna ()
333
312
)
334
- block_id_max = pairs_to_eval_long ["block_id" ].max (skipna = True )
335
- # recreating R's rleid function
336
- pairs_to_eval_long ["rleid" ] = (
337
- pairs_to_eval_long ["row_id" ] != pairs_to_eval_long ["row_id" ].shift (1 )
338
- ).cumsum ()
339
- pairs_to_eval_long .loc [
340
- (pairs_to_eval_long ["both" ] == 1 ) & (pairs_to_eval_long ["block_id" ].isna ()),
341
- "block_id" ,
342
- ] = (
343
- block_id_max + pairs_to_eval_long ["rleid" ]
313
+ comparison_df ["FP" ] = (comparison_df ["block_true" ].isna ()) & (
314
+ comparison_df ["block_pred" ].notna ()
344
315
)
345
-
346
- true_id_max = pairs_to_eval_long ["true_id" ].max (skipna = True )
347
- pairs_to_eval_long .loc [pairs_to_eval_long ["both" ] == 1 , "true_id" ] = (
348
- true_id_max + pairs_to_eval_long .groupby ("row_id" ).ngroup () + 1
316
+ comparison_df ["FN" ] = (comparison_df ["block_true" ].notna ()) & (
317
+ comparison_df ["block_pred" ].isna ()
349
318
)
350
- true_id_max = pairs_to_eval_long ["true_id" ].max (skipna = True )
351
- # recreating R's rleid function again
352
- pairs_to_eval_long ["rleid" ] = (
353
- pairs_to_eval_long ["row_id" ] != pairs_to_eval_long ["row_id" ].shift (1 )
354
- ).cumsum ()
355
- pairs_to_eval_long .loc [
356
- (pairs_to_eval_long ["both" ] == - 1 ) & (pairs_to_eval_long ["true_id" ].isna ()),
357
- "true_id" ,
358
- ] = (
359
- true_id_max + pairs_to_eval_long ["rleid" ]
319
+ self .confusion = pd .DataFrame (
320
+ [
321
+ [comparison_df ["CNL" ].sum (), comparison_df ["FN" ].sum ()],
322
+ [comparison_df ["FP" ].sum (), comparison_df ["TP" ].sum ()],
323
+ ],
324
+ index = ["Predicted Negative" , "Predicted Positive" ],
325
+ columns = ["Actual Negative" , "Actual Positive" ],
360
326
)
361
- pairs_to_eval_long = pairs_to_eval_long .drop (columns = ["rleid" ], axis = 1 )
362
327
363
328
else :
364
329
pairs_to_eval_long = (
@@ -369,24 +334,27 @@ def block(
369
334
.rename (columns = {"block" : "true_id" })
370
335
)
371
336
372
- candidate_pairs = np .array (
373
- list (itertools .combinations (range (pairs_to_eval_long .shape [0 ]), 2 ))
374
- )
375
- block_id_array = pairs_to_eval_long ["block_id" ].to_numpy ()
376
- true_id_array = pairs_to_eval_long ["true_id" ].to_numpy ()
377
- same_block = (
378
- block_id_array [candidate_pairs [:, 0 ]] == block_id_array [candidate_pairs [:, 1 ]]
379
- )
380
- same_truth = (
381
- true_id_array [candidate_pairs [:, 0 ]] == true_id_array [candidate_pairs [:, 1 ]]
382
- )
337
+ candidate_pairs = np .array (
338
+ list (itertools .combinations (range (pairs_to_eval_long .shape [0 ]), 2 ))
339
+ )
340
+
341
+ block_id_array = pairs_to_eval_long ["block_id" ].to_numpy ()
342
+ true_id_array = pairs_to_eval_long ["true_id" ].to_numpy ()
343
+ same_block = (
344
+ block_id_array [candidate_pairs [:, 0 ]] == block_id_array [candidate_pairs [:, 1 ]]
345
+ )
346
+ same_truth = (
347
+ true_id_array [candidate_pairs [:, 0 ]] == true_id_array [candidate_pairs [:, 1 ]]
348
+ )
383
349
384
- self .confusion = pd .crosstab (same_block , same_truth )
350
+ self .confusion = pd .crosstab (same_block , same_truth )
351
+ self .confusion .index = ["Predicted Negative" , "Predicted Positive" ]
352
+ self .confusion .columns = ["Actual Negative" , "Actual Positive" ]
385
353
386
- fp = self .confusion .loc [ True , False ]
387
- fn = self .confusion .loc [ False , True ]
388
- tp = self .confusion .loc [ True , True ]
389
- tn = self .confusion .loc [ False , False ]
354
+ fp = self .confusion .iloc [ 1 , 0 ]
355
+ fn = self .confusion .iloc [ 0 , 1 ]
356
+ tp = self .confusion .iloc [ 1 , 1 ]
357
+ tn = self .confusion .iloc [ 0 , 0 ]
390
358
391
359
recall = tp / (fn + tp ) if (fn + tp ) != 0 else 0
392
360
precision = tp / (tp + fp ) if (tp + fp ) != 0 else 0
@@ -420,5 +388,5 @@ def block(
420
388
eval_metrics = self .eval_metrics ,
421
389
confusion = self .confusion ,
422
390
colnames_xy = colnames_xy ,
423
- graph = graph ,
391
+ graph = graph ,
424
392
)
0 commit comments