diff --git a/ranker/shared/ranker_obj.py b/ranker/shared/ranker_obj.py
index 2f0efe4..4679c16 100644
--- a/ranker/shared/ranker_obj.py
+++ b/ranker/shared/ranker_obj.py
@@ -101,46 +101,51 @@ def rank(self, answers, jaccard_like=False):
         ]
         return answers
 
+    def probes(self, r_node_ids):
+        # Identify Probes
+        #################
+        # Q Graph Connectivity Matrix
+        q_node_ids = list(self.qgraph["nodes"].keys())
+        n_q_nodes = len(q_node_ids)
+        q_conn = np.full((n_q_nodes, n_q_nodes), 0)
+        for e in self.qgraph["edges"].values():
+            # list.index() never returns None, so check membership up front and
+            # skip edges that reference nodes outside of the q graph
+            if e["subject"] in q_node_ids and e["object"] in q_node_ids:
+                e_sub = q_node_ids.index(e["subject"])
+                e_obj = q_node_ids.index(e["object"])
+                q_conn[e_sub, e_obj] = 1
+
+        # Determine probes based on connectivity
+        node_conn = np.sum(q_conn, 0) + np.sum(q_conn, 1).T
+        probe_nodes = []
+        for conn in range(np.max(node_conn)):
+            is_this_conn = node_conn == (conn + 1)
+            probe_nodes += list(np.where(is_this_conn)[0])
+            if len(probe_nodes) > 1:
+                break
+        q_probes = list(combinations(probe_nodes, 2))
+
+        # Converting qgraph inds to rgraph inds:
+        qr_index = defaultdict(list)
+        for i, node_id in enumerate(r_node_ids):
+            qr_index[node_id[0]].append(i)
+
+        probes = []
+        for probe in q_probes:
+            left = qr_index[q_node_ids[probe[0]]]
+            right = qr_index[q_node_ids[probe[1]]]
+            for le in left:
+                for ri in right:
+                    probes.append((le, ri))
+
+        return probes
+
     def score(self, answer, jaccard_like=False):
         """Compute answer score."""
         # answer is a list of dicts with fields 'id' and 'bound'
         r_node_ids, edges_all = self.get_rgraph(answer)
         for i_analysis, edges in enumerate(edges_all):
-            # Identify Probes
-            #################
-            # Q Graph Connectivity Matrix
-            q_node_ids = list(self.qgraph["nodes"].keys())
-            n_q_nodes = len(q_node_ids)
-            q_conn = np.full((n_q_nodes, n_q_nodes), 0)
-            for e in self.qgraph["edges"].values():
-                e_sub = q_node_ids.index(e["subject"])
-                e_obj = q_node_ids.index(e["object"])
-                if e_sub is not None and e_obj is not None:
-                    q_conn[e_sub, e_obj] = 1
-
-            # Determine probes based on connectivity
-            node_conn = np.sum(q_conn, 0) + np.sum(q_conn, 1).T
-            probe_nodes = []
-            for conn in range(np.max(node_conn)):
-                is_this_conn = node_conn == (conn + 1)
-                probe_nodes += list(np.where(is_this_conn)[0])
-                if len(probe_nodes) > 1:
-                    break
-            q_probes = list(combinations(probe_nodes, 2))
-
-            # Converting qgraph inds to rgraph inds:
-            qr_index = defaultdict(list)
-            for node_id in r_node_ids[i_analysis]:
-                qr_index[node_id[0]].append(r_node_ids[i_analysis].index(node_id))
-
-            probes = []
-            for probe in q_probes:
-                left = qr_index[q_node_ids[probe[0]]]
-                right = qr_index[q_node_ids[probe[1]]]
-                for le in left:
-                    for ri in right:
-                        probes.append((le, ri))
+            probes = self.probes(r_node_ids[i_analysis])
             laplacian = self.graph_laplacian((r_node_ids[i_analysis], edges), probes)
             # If this still happens at this point it is because a probe has a problem
@@ -162,58 +167,43 @@ def score(self, answer, jaccard_like=False):
     def graph_laplacian(self, rgraph, probes):
         """Generate graph Laplacian."""
         node_ids, edges = rgraph
-
-        # compute graph laplacian for this case while removing duplicate sources for each edge in the result graph
         num_nodes = len(node_ids)
-        weight_dict = []
-        for subject_index in range(num_nodes):
-            weight_dict_i = []
-            for object_id in range(num_nodes):
-                weight_dict_i.append({})
-            weight_dict.append(weight_dict_i)
-
-        index = {node_id: node_ids.index(node_id) for node_id in node_ids}
+
+        # For each potential edge in the dense answer graph, build a nested
+        # dictionary of edge source / property values.
+        # Where there are redundant edges (same subject, object, source, and
+        # property type), keep the max of the property values. This can happen
+        # when two KPs share the same underlying data source but, for whatever
+        # reason, return different publication counts.
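+        # (Illustrative shape only: a single edge's "weight" entry might look
+        # like {"infores:kp-a": {"publications": 7.0}}, where the source and
+        # property names here are hypothetical.)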
+        weight_dict = defaultdict(
+            lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(float)))
+        )
         for edge in edges:
-            subject_id, object_id, edge_weight = (
-                edge["subject"],
-                edge["object"],
-                edge["weight"],
-            )
-            subject_index, object_index = index[subject_id], index[object_id]
-            for edge_source in edge_weight.keys():
-                for edge_property in edge_weight[edge_source].keys():
-                    val = edge_weight[edge_source][edge_property]
-                    if edge_source in weight_dict[subject_index][object_index]:
-                        if (
-                            edge_property
-                            in weight_dict[subject_index][object_index][edge_source]
-                        ):
-                            weight_dict[subject_index][object_index][edge_source][
-                                edge_property
-                            ] = max(
-                                weight_dict[subject_index][object_index][edge_source][
-                                    edge_property
-                                ],
-                                val,
-                            )
-                        else:
-                            weight_dict[subject_index][object_index][edge_source][
-                                edge_property
-                            ] = val
-                    else:
-                        weight_dict[subject_index][object_index][edge_source] = {}
-                        weight_dict[subject_index][object_index][edge_source][
-                            edge_property
-                        ] = val
+            subject_id = edge["subject"]
+            object_id = edge["object"]
+            edge_weight = edge["weight"]
+
+            for edge_source, edge_properties in edge_weight.items():
+                for edge_property, edge_val in edge_properties.items():
+                    # Store symmetrically, keeping the max over redundant edges
+                    weight_dict[subject_id][object_id][edge_source][edge_property] = max(
+                        weight_dict[subject_id][object_id][edge_source][edge_property],
+                        edge_val,
+                    )
+                    weight_dict[object_id][subject_id][edge_source][edge_property] = max(
+                        weight_dict[object_id][subject_id][edge_source][edge_property],
+                        edge_val,
+                    )
 
         qedge_qnode_ids = set(
             [frozenset((e["subject"], e["object"])) for e in self.qedge_by_id.values()]
         )
+
+        # Now go through these edges, turn each source / property value into an
+        # edge weight, and then calculate the graph Laplacian.
         laplacian = np.zeros((num_nodes, num_nodes))
+        weight_mat = np.zeros((num_nodes, num_nodes))  # For debugging
-        for subject_index in range(num_nodes):
-            q_node_id_subject = node_ids[subject_index][0]
-            for object_id in range(num_nodes):
-                q_node_id_object = node_ids[object_id][0]
+        for i, sub_id_mapping in enumerate(node_ids):
+            q_node_id_subject = sub_id_mapping[0]
+            for j, obj_id_mapping in enumerate(node_ids):
+                q_node_id_object = obj_id_mapping[0]
+
                 edge_qnode_ids = frozenset((q_node_id_subject, q_node_id_object))
                 # Set default weight (or 0 when edge is not a qedge)
@@ -221,13 +211,8 @@ def graph_laplacian(self, rgraph, probes):
                     self.DEFAULT_WEIGHT if edge_qnode_ids in qedge_qnode_ids else 0.0
                 )
-                for source in weight_dict[subject_index][object_id].keys():
-                    for property in weight_dict[subject_index][object_id][
-                        source
-                    ].keys():
-                        source_w = weight_dict[subject_index][object_id][source][
-                            property
-                        ]
+                for source, properties in weight_dict[sub_id_mapping][obj_id_mapping].items():
+                    for property, source_w in properties.items():
                         source_weighted = source_w * source_weight(
                             source,
                             property,
@@ -238,12 +223,24 @@ def graph_laplacian(self, rgraph, probes):
                             source_weighted = 0.99999999
                         weight = weight + -1 / (np.log(source_weighted))
-                laplacian[subject_index, object_id] += -weight
-                laplacian[object_id, subject_index] += -weight
-                laplacian[subject_index, subject_index] += weight
-                laplacian[object_id, object_id] += weight
+                weight_mat[i, j] += weight  # For debugging
+
+                laplacian[i, j] += -weight
+                laplacian[j, i] += -weight
+                laplacian[i, i] += weight
+                laplacian[j, j] += weight
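+                # This is the standard graph Laplacian accumulation, L = D - W:
+                # each pairwise weight is subtracted off the diagonal and added
+                # to the diagonal entries (the row sums of W).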
+
+        # Using weight_mat you could also calculate the Laplacian here, but
+        # that was already done above:
+        # weight_row_sums = np.sum(weight_mat, axis=1)
+        # laplacian = -1 * weight_mat.copy()
+        # for i in range(num_nodes):
+        #     laplacian[i, i] = weight_row_sums[i]
 
         # Clean up Laplacian (remove extra nodes etc.)
+        # Sometimes, usually because of a bug of some kind, there will be rows
+        # of all zeros in the Laplacian, which causes numerical issues.
+        # We can remove these rows, as long as they aren't probes.
         removal_candidate = np.all(np.abs(laplacian) == 0, axis=0)
         # Don't permit removing probes
         for probe in probes:
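
For intuition (not part of this diff): the probes built above are index pairs into this Laplacian, and a pseudo-inverse reading gives the effective resistance between a probe pair, which is one way a Laplacian like this can be turned into a connectivity score. A minimal sketch, assuming NumPy; probe_resistance and the toy graph are hypothetical, not code from ranker_obj.py:

    import numpy as np

    def probe_resistance(laplacian, probe):
        """Effective resistance between two probe nodes via the pseudo-inverse."""
        linv = np.linalg.pinv(laplacian)
        i, j = probe
        return linv[i, i] + linv[j, j] - linv[i, j] - linv[j, i]

    # Toy chain 0 -- 1 -- 2 with unit edge weights, so L = D - W
    W = np.array([[0.0, 1.0, 0.0],
                  [1.0, 0.0, 1.0],
                  [0.0, 1.0, 0.0]])
    L = np.diag(W.sum(axis=1)) - W
    print(probe_resistance(L, (0, 2)))  # ~2.0: two unit "resistors" in series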