Simplify and cleanup #124

Merged 6 commits on Oct 10, 2023
Changes from all commits
177 changes: 87 additions & 90 deletions ranker/shared/ranker_obj.py
@@ -101,46 +101,51 @@ def rank(self, answers, jaccard_like=False):
]
return answers

def probes(self, r_node_ids):
# Identify Probes
#################
# Q Graph Connectivity Matrix
q_node_ids = list(self.qgraph["nodes"].keys())
n_q_nodes = len(q_node_ids)
q_conn = np.full((n_q_nodes, n_q_nodes), 0)
for e in self.qgraph["edges"].values():
e_sub = q_node_ids.index(e["subject"])
e_obj = q_node_ids.index(e["object"])
if e_sub is not None and e_obj is not None:
q_conn[e_sub, e_obj] = 1

# Determine probes based on connectivity
node_conn = np.sum(q_conn, 0) + np.sum(q_conn, 1).T
probe_nodes = []
for conn in range(np.max(node_conn)):
is_this_conn = node_conn == (conn + 1)
probe_nodes += list(np.where(is_this_conn)[0])
if len(probe_nodes) > 1:
break
q_probes = list(combinations(probe_nodes, 2))

# Converting qgraph inds to rgraph inds:
qr_index = defaultdict(list)
for node_id in r_node_ids:
qr_index[node_id[0]].append(r_node_ids.index(node_id))

probes = []
for probe in q_probes:
left = qr_index[q_node_ids[probe[0]]]
right = qr_index[q_node_ids[probe[1]]]
for le in left:
for ri in right:
probes.append((le, ri))

return probes
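
A minimal sketch, not part of the PR, of how this degree-based probe selection behaves on a toy three-node chain qgraph (node and edge names below are made up):

    import numpy as np
    from itertools import combinations

    qgraph = {
        "nodes": {"n0": {}, "n1": {}, "n2": {}},
        "edges": {
            "e01": {"subject": "n0", "object": "n1"},
            "e12": {"subject": "n1", "object": "n2"},
        },
    }

    q_node_ids = list(qgraph["nodes"].keys())
    q_conn = np.zeros((len(q_node_ids), len(q_node_ids)), dtype=int)
    for e in qgraph["edges"].values():
        q_conn[q_node_ids.index(e["subject"]), q_node_ids.index(e["object"])] = 1

    # Degree of each qnode: incoming plus outgoing edges.
    node_conn = np.sum(q_conn, 0) + np.sum(q_conn, 1)

    # Collect nodes from the lowest degree upward until at least two are found.
    probe_nodes = []
    for conn in range(np.max(node_conn)):
        probe_nodes += list(np.where(node_conn == conn + 1)[0])
        if len(probe_nodes) > 1:
            break

    print(list(combinations(probe_nodes, 2)))  # [(0, 2)] -- the chain's endpoints

The least-connected qnodes (here the endpoints of the chain) are paired first, so probes tend to span the graph rather than sit on its hub nodes.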

def score(self, answer, jaccard_like=False):
"""Compute answer score."""
# answer is a list of dicts with fields 'id' and 'bound'
r_node_ids, edges_all = self.get_rgraph(answer)

for i_analysis, edges in enumerate(edges_all):
# Identify Probes
#################
# Q Graph Connectivity Matrix
q_node_ids = list(self.qgraph["nodes"].keys())
n_q_nodes = len(q_node_ids)
q_conn = np.full((n_q_nodes, n_q_nodes), 0)
for e in self.qgraph["edges"].values():
e_sub = q_node_ids.index(e["subject"])
e_obj = q_node_ids.index(e["object"])
if e_sub is not None and e_obj is not None:
q_conn[e_sub, e_obj] = 1

# Determine probes based on connectivity
node_conn = np.sum(q_conn, 0) + np.sum(q_conn, 1).T
probe_nodes = []
for conn in range(np.max(node_conn)):
is_this_conn = node_conn == (conn + 1)
probe_nodes += list(np.where(is_this_conn)[0])
if len(probe_nodes) > 1:
break
q_probes = list(combinations(probe_nodes, 2))

# Converting qgraph inds to rgraph inds:
qr_index = defaultdict(list)
for node_id in r_node_ids[i_analysis]:
qr_index[node_id[0]].append(r_node_ids[i_analysis].index(node_id))

probes = []
for probe in q_probes:
left = qr_index[q_node_ids[probe[0]]]
right = qr_index[q_node_ids[probe[1]]]
for le in left:
for ri in right:
probes.append((le, ri))
probes = self.probes(r_node_ids[i_analysis])

laplacian = self.graph_laplacian((r_node_ids[i_analysis], edges), probes)
# If this still happens at this point, it is because a probe has a problem
@@ -162,72 +167,52 @@ def score(self, answer, jaccard_like=False):
def graph_laplacian(self, rgraph, probes):
"""Generate graph Laplacian."""
node_ids, edges = rgraph

# Compute the graph Laplacian for this case, removing duplicate sources for each edge in the result graph.
num_nodes = len(node_ids)
weight_dict = []
for subject_index in range(num_nodes):
weight_dict_i = []
for object_id in range(num_nodes):
weight_dict_i.append({})
weight_dict.append(weight_dict_i)

index = {node_id: node_ids.index(node_id) for node_id in node_ids}

# For each potential edge in the dense answer graph,
# build a dictionary of edge source / property values.
# Where there are redundant edges (same subject, object,
# source, and property type), take the max of the property values.
# This can happen when two KPs share an underlying data source
# but for some reason return different publication counts.
weight_dict = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(float))))
for edge in edges:
subject_id, object_id, edge_weight = (
edge["subject"],
edge["object"],
edge["weight"],
)
subject_index, object_index = index[subject_id], index[object_id]
for edge_source in edge_weight.keys():
for edge_property in edge_weight[edge_source].keys():
val = edge_weight[edge_source][edge_property]
if edge_source in weight_dict[subject_index][object_index]:
if (
edge_property
in weight_dict[subject_index][object_index][edge_source]
):
weight_dict[subject_index][object_index][edge_source][
edge_property
] = max(
weight_dict[subject_index][object_index][edge_source][
edge_property
],
val,
)
else:
weight_dict[subject_index][object_index][edge_source][
edge_property
] = val
else:
weight_dict[subject_index][object_index][edge_source] = {}
weight_dict[subject_index][object_index][edge_source][
edge_property
] = val
subject = edge["subject"]
object = edge["object"]
edge_weight = edge["weight"]

for edge_source, edge_properties in edge_weight.items():
for edge_property, edge_val in edge_properties.items():
weight_dict[subject][object][edge_source][edge_property] = \
max(weight_dict[subject][object][edge_source][edge_property], edge_val)
weight_dict[object][subject][edge_source][edge_property] = \
max(weight_dict[object][subject][edge_source][edge_property], edge_val)
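
A self-contained sketch, not part of the PR, of the max-merge this nested defaultdict enables; the source name and publication counts below are made up:

    from collections import defaultdict

    weight_dict = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(float))))

    edges = [
        {"subject": "a", "object": "b", "weight": {"infores:kp1": {"publications": 3.0}}},
        {"subject": "a", "object": "b", "weight": {"infores:kp1": {"publications": 5.0}}},
    ]

    for edge in edges:
        for source, properties in edge["weight"].items():
            for prop, val in properties.items():
                # Redundant (subject, object, source, property) entries collapse to their max.
                weight_dict[edge["subject"]][edge["object"]][source][prop] = max(
                    weight_dict[edge["subject"]][edge["object"]][source][prop], val
                )
                # Mirror the entry so the weight matrix built later is symmetric.
                weight_dict[edge["object"]][edge["subject"]][source][prop] = max(
                    weight_dict[edge["object"]][edge["subject"]][source][prop], val
                )

    print(weight_dict["a"]["b"]["infores:kp1"]["publications"])  # 5.0

Because every level is a defaultdict, the missing-key branches the old code needed disappear: reading an absent entry just yields 0.0, which max() then overwrites.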

qedge_qnode_ids = set(
[frozenset((e["subject"], e["object"])) for e in self.qedge_by_id.values()]
)

# Now go through these edges,
# turn each value into an edge weight,
# then calculate the graph Laplacian.
laplacian = np.zeros((num_nodes, num_nodes))
for subject_index in range(num_nodes):
q_node_id_subject = node_ids[subject_index][0]
for object_id in range(num_nodes):
q_node_id_object = node_ids[object_id][0]
weight_mat = np.zeros((num_nodes, num_nodes)) # For debugging
for i, sub_id_mapping in enumerate(node_ids):
q_node_id_subject = sub_id_mapping[0]
for j, obj_id_mapping in enumerate(node_ids):
q_node_id_object = obj_id_mapping[0]

edge_qnode_ids = frozenset((q_node_id_subject, q_node_id_object))

# Set default weight (or 0 when edge is not a qedge)
weight = (
self.DEFAULT_WEIGHT if edge_qnode_ids in qedge_qnode_ids else 0.0
)

for source in weight_dict[subject_index][object_id].keys():
for property in weight_dict[subject_index][object_id][
source
].keys():
source_w = weight_dict[subject_index][object_id][source][
property
]
for source, properties in weight_dict[sub_id_mapping][obj_id_mapping].items():
for property, source_w in properties.items():
source_weighted = source_w * source_weight(
source,
property,
@@ -238,12 +223,24 @@ def graph_laplacian(self, rgraph, probes):
source_weighted = 0.99999999
weight = weight + -1 / (np.log(source_weighted))
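
A quick illustration, with toy values, of the -1 / ln(w) transform applied above: source weights near 1 become very large edge weights, while weights near 0 contribute almost nothing (the 0.99999999 clamp keeps the logarithm nonzero and the division finite):

    import numpy as np

    for w in (0.1, 0.5, 0.9, 0.99999999):
        print(w, -1 / np.log(w))
    # 0.1 -> ~0.43, 0.5 -> ~1.44, 0.9 -> ~9.49, 0.99999999 -> ~1e8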

laplacian[subject_index, object_id] += -weight
laplacian[object_id, subject_index] += -weight
laplacian[subject_index, subject_index] += weight
laplacian[object_id, object_id] += weight
weight_mat[i, j] += weight # For debugging

laplacian[i, j] += -weight
laplacian[j, i] += -weight
laplacian[i, i] += weight
laplacian[j, j] += weight

# Using weight_mat you can calculate the Laplacian as well, but we already built it incrementally above.
# weight_row_sums = np.sum(weight_mat,axis=1)
# laplacian = -1 * weight_mat.copy()
# for i in range(num_nodes):
# laplacian[i, i] = weight_row_sums[i]
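
A sanity check, with made-up numbers, of the commented-out alternative: the graph Laplacian is the degree matrix minus the weight matrix, L = D - W. Note that the incremental loop above visits every ordered (i, j) pair, so with the mirrored weight_dict each symmetric contribution is accumulated twice; the result matches D - W up to that constant factor.

    import numpy as np

    # Toy symmetric weight matrix (values are made up).
    weight_mat = np.array([
        [0.0, 2.0, 0.0],
        [2.0, 0.0, 1.0],
        [0.0, 1.0, 0.0],
    ])

    laplacian = np.diag(weight_mat.sum(axis=1)) - weight_mat

    # Each row of a graph Laplacian sums to zero.
    assert np.allclose(laplacian.sum(axis=1), 0.0)
    print(laplacian)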

# Clean up the Laplacian (remove extra nodes, etc.).
# Sometimes, mostly because of a bug of some kind,
# there will be rows of all zeros in the Laplacian.
# This will cause numerical issues.
# We can remove these rows, as long as they aren't probes.
removal_candidate = np.all(np.abs(laplacian) == 0, axis=0)
# Don't permit removing probes
for probe in probes: