Merge pull request #59 from mitre/even_more_fixes
Additional fixes
dehall authored Jul 20, 2023
2 parents 6ec0195 + 1be2469 commit 38bd3e9
Showing 4 changed files with 30 additions and 18 deletions.
5 changes: 3 additions & 2 deletions data_analysis.py
@@ -4,6 +4,7 @@
 import time
 from datetime import datetime
 
+import numpy as np
 import pandas as pd
 
 from utils.data_reader import (
@@ -155,8 +156,8 @@ def top_N(series, N=0, lower_limit=1):
 
 
 def summary(series):
-    # 1. count the number of missing (null) entries
-    missing = series.isna().sum()
+    # 1. count the number of missing (null or blank string) entries
+    missing = series.replace(r"^\s*$", np.nan, regex=True).isna().sum()
 
     # 2. basic descriptive statistics on the length of the values
     length = series.str.len().describe().to_dict()
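
For context on this data_analysis.py change, a minimal sketch (toy data, not from the repo) of why the regex replace matters: pandas' isna() does not treat empty or whitespace-only strings as missing, so summary() now normalizes them to NaN before counting.

import numpy as np
import pandas as pd

series = pd.Series(["123 Main St", "", "   ", None])

# plain isna() only sees the explicit null
print(series.isna().sum())  # 1

# normalizing blank strings to NaN first, as summary() now does,
# counts them as missing too
print(series.replace(r"^\s*$", np.nan, regex=True).isna().sum())  # 3
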
29 changes: 17 additions & 12 deletions households/matching.py
@@ -360,7 +360,7 @@ def get_household_matches(
             pairs_writer.writerow(matching_pairs[i])
         print(f"[{datetime.now()}] Wrote matching pairs to {pairs_path}")
 
-    five_percent = int(len(matching_pairs) / 20)
+    five_percent = max(int(len(matching_pairs) / 20), 1)
     pos_to_pairs = {}
     # note: "for pair in matching_pairs:" had unexpectedly poor performance here
    for i in range(len(matching_pairs)):
@@ -407,30 +407,42 @@ def get_candidate_links(pii_lines, split_factor=4, exact_addresses=False, debug=
     # start with an empty index we can append to
     candidate_links = pd.MultiIndex.from_tuples([], names=[0, 1])
 
+    # only include lines with an address, since otherwise
+    # missing addresses will be considered a match ("" == "")
+    pii_lines_with_address = pii_lines[pii_lines.household_street_address != ""]
+
+    if len(pii_lines_with_address) == 0:
+        # essentially just a null check
+        # don't bother with the rest if we have no addresses
+        # this should never happen
+        return candidate_links
+
     # break up the dataframe into subframes,
     # and iterate over every pair of subframes.
     # we improve performance somewhat by only comparing looking forward,
     # that is, only comparing a given set of rows
     # against rows with higher indices.
-    for subset_A in np.array_split(pii_lines, split_factor):
+    for subset_A in np.array_split(pii_lines_with_address, split_factor):
         first_item_in_A = subset_A.index.min()
 
         # don't compare against earlier items
         # Note: this assumes that the index is the row number
         # (NOT the record_id/patid) and the df is sequential
         # this is currently the case in households.py#parse_source_file()
-        lines_to_compare = pii_lines[first_item_in_A:]
+        lines_to_compare = pii_lines_with_address[first_item_in_A:]
 
         # pick a sub split factor to give us ~same size subset_A and subset_B.
         # the idea is that there's some implicit overhead to splitting,
         # so don't split more than necessary
-        sub_split_factor = int(len(lines_to_compare) / len(subset_A))
+        sub_split_factor = max(int(len(lines_to_compare) / len(subset_A)), 1)
         for subset_B in np.array_split(lines_to_compare, sub_split_factor):
             if debug:
                 print(
                     f"[{datetime.now()}] Indexing rows "
                     f"[{subset_A.index.min()}..{subset_A.index.max()}]"
                     " against "
                     f"[{subset_B.index.min()}..{subset_B.index.max()}]"
                     f". {len(candidate_links)} candidates so far"
                 )
 
             # note pairs_subset and candidate_links are MultiIndexes
@@ -452,13 +464,6 @@
 
             gc.collect()
 
-    # rows with blank address match ("" == "") so drop those here
-    # TODO: ideally we wouldn't compare blank address lines in the first place
-    # but the indexing and splitting bits get complicated if we drop them earlier
-    blank_addresses = pii_lines[pii_lines["household_street_address"] == ""].index
-    candidate_links = candidate_links.drop(blank_addresses, level=0, errors="ignore")
-    candidate_links = candidate_links.drop(blank_addresses, level=1, errors="ignore")
-
     if debug:
         print(f"[{datetime.now()}] Found {len(candidate_links)} candidate pairs")

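
For context, a small pandas sketch (illustrative data and pairing loop, not from the repo) of the problem this refactor addresses: blank street addresses compare equal ("" == ""), so address-less records used to pair with each other and had to be dropped afterwards. Filtering them out up front means the spurious pairs are never generated.

import pandas as pd

# hypothetical mini PII table; rows 1 and 3 have no address
pii = pd.DataFrame(
    {"household_street_address": ["12 Oak St", "", "12 Oak St", ""]}
)

# naive all-pairs comparison: "" == "" counts as a match,
# so rows 1 and 3 pair up on nothing at all
naive = [
    (a, b)
    for a in pii.index
    for b in pii.index
    if a < b
    and pii.household_street_address[a] == pii.household_street_address[b]
]
print(naive)  # [(0, 2), (1, 3)] -- (1, 3) is spurious

# filtering first, as get_candidate_links() now does
with_address = pii[pii.household_street_address != ""]
filtered = [
    (a, b)
    for a in with_address.index
    for b in with_address.index
    if a < b
    and with_address.household_street_address[a]
    == with_address.household_street_address[b]
]
print(filtered)  # [(0, 2)]
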
@@ -509,7 +514,7 @@ def get_matching_pairs(
     matching_pairs = pd.MultiIndex.from_tuples([], names=[0, 1])
     # we know that we could support len(subset_A) in memory above,
     # so use the same amount here
-    len_subset_A = int(len(pii_lines) / split_factor)
+    len_subset_A = max(int(len(pii_lines) / split_factor), 1)
 
     # note: np.array_split had unexpectedly poor performance here for very large indices
     for i in range(0, len(candidate_links), len_subset_A):
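
The three max(..., 1) guards in this file share one rationale. A minimal sketch (toy sizes, not from the repo): when the input has fewer rows than the divisor, integer division yields 0, and np.array_split refuses 0 sections; likewise a range() step of 0 raises (see the loop over candidate_links above), and a five_percent of 0 would break any modulo-based progress check.

import numpy as np
import pandas as pd

df = pd.DataFrame({"x": range(3)})

# fewer rows than the divisor: integer division hits zero...
print(int(len(df) / 4))  # 0

# ...and numpy refuses to split into zero sections
try:
    np.array_split(df, int(len(df) / 4))
except ValueError as err:
    print(err)  # number sections must be larger than 0.

# clamping to at least 1 keeps small inputs working
for chunk in np.array_split(df, max(int(len(df) / 4), 1)):
    print(len(chunk))  # 3
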
2 changes: 1 addition & 1 deletion requirements.txt
@@ -6,7 +6,7 @@ clkhash>=0.16.0
 psycopg2>=2.8.3
 anonlink-client==0.1.5
 ijson>=3.1.2
-textdistance[extras]>=4.5.0
+textdistance>=4.5.0
 usaddress>=0.5.10
 pylint>=2.4.2
 tqdm>=4.36.1
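
One note on the requirements.txt change, as I understand textdistance's packaging: the [extras] option only pulls in optional compiled backends for speed, so the pure-Python API is unchanged. An illustrative call (not from this repo) behaves the same under either install:

import textdistance

# same call with or without the [extras] install option;
# extras merely swap in faster backends where available
print(textdistance.jaro_winkler("MARTHA", "MARHTA"))  # ~0.961
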
12 changes: 9 additions & 3 deletions utils/data_reader.py
@@ -124,14 +124,18 @@ def map_key(row, key):
     return row_key
 
 
-def empty_str_from_none(string):
-    if string is None:
+def empty_str_from_none(obj):
+    if obj is None:
         return ""
+    elif isinstance(obj, pd.Series):
+        return obj.fillna("")
     else:
-        return string
+        return obj
 
 
 def case_insensitive_lookup(row, key, version):
+    # IMPORTANT: this function gets called from extract.py and data_analysis.py
+    # with different types for `row`
     data_key = DATA_DICTIONARY[version][key]
     if isinstance(data_key, list):
         first_key = map_key(row, data_key[0])
@@ -141,6 +145,8 @@ def case_insensitive_lookup(row, key, version):
         if mapped_subkey:
             subdata = empty_str_from_none(row[mapped_subkey])
             data = data + " " + subdata
+    if isinstance(data, pd.Series):
+        data.name = key
 
     return data

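
For context on the data_reader.py change, a runnable sketch of the two call patterns the new comment warns about (which caller passes which type is my reading of the diff, not something it states): scalar lookups turn None into "", while Series lookups now turn missing values into "" via fillna.

import pandas as pd

def empty_str_from_none(obj):
    if obj is None:
        return ""
    elif isinstance(obj, pd.Series):
        return obj.fillna("")
    else:
        return obj

# scalar path, e.g. a caller working one row at a time
print(repr(empty_str_from_none(None)))     # ''
print(repr(empty_str_from_none("value")))  # 'value'

# Series path, e.g. a caller passing a whole column
column = pd.Series(["a", None, "c"])
print(empty_str_from_none(column).tolist())  # ['a', '', 'c']

Setting data.name = key on a Series result then labels the assembled lookup with its data-dictionary key, presumably so downstream consumers see a consistently named column.
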
