Skip to content

Commit 0a1b98d

Browse files
committed
in the middle of adding a generic distance fnc
1 parent ce08712 commit 0a1b98d

File tree

1 file changed

+72
-0
lines changed

1 file changed

+72
-0
lines changed

stringmeup/taxonomy.py

+72
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ def __init__(self, nodes_filename, names_filename):
6464
# "Memory" data structure to be populated at function calls
6565
# For faster response in case the same query is asked again
6666
self.lineages = {}
67+
self.distances = {}
6768
self.lca_mappings = {}
6869

6970
# Add nodes to self.taxonomy
@@ -293,6 +294,77 @@ def get_distance(self, tax_id_ancestor, tax_id):
293294

294295
return distance
295296

297+
def get_distance_generic(self, tax_id_1, tax_id_2):
    """
    Return the number of edges that must be traversed to get from tax_id_1
    to tax_id_2.

    Fnc get_distance() calculates a distance between an ancestor and a
    descendant, and might be legacy. This fnc covers the general case where
    neither tax_id has to be an ancestor of the other: the total distance is
    the sum of the distances from each tax_id up to their lowest common
    ancestor (LCA).

    Results are memoized in self.distances, a two-level dict keyed as
    self.distances[smaller_tax_id][larger_tax_id], so the same pair is only
    computed once regardless of argument order.

    Parameters
    ----------
    tax_id_1, tax_id_2
        The two tax_ids to measure the distance between. They must be
        mutually orderable (min()/max() is used to build the cache key).

    Returns
    -------
    The sum of self.get_distance(lca, tax_id_1) and
    self.get_distance(lca, tax_id_2).
    """
    # self.distances is ordered... smallest tax_id always goes first
    tax_id_small = min(tax_id_1, tax_id_2)
    tax_id_large = max(tax_id_1, tax_id_2)

    # Fetch (or create) the inner dict for the smaller tax_id, then look
    # for a previously stored distance to the larger tax_id.
    known_distances = self.distances.setdefault(tax_id_small, {})
    distance = known_distances.get(tax_id_large)

    # Do we need to calculate the distance?
    # (Compare against None explicitly: a cached distance of 0 is valid.)
    if distance is None:

        # Lowest common ancestor
        # NOTE(review): assumes get_lca(a, b) returns a single tax_id that
        # is an ancestor of (or equal to) both arguments -- confirm.
        lca = self.get_lca(tax_id_1, tax_id_2)

        # Sum of distances between both tax_ids and the LCA makes the
        # total distance
        distance_1 = self.get_distance(lca, tax_id_1)
        distance_2 = self.get_distance(lca, tax_id_2)
        distance = distance_1 + distance_2

        # Save the distance for faster response next time
        known_distances[tax_id_large] = distance

    return distance
367+
296368
def get_rank(self, tax_id_list):
297369
"""
298370
Returns the rank of each tax_id.

0 commit comments

Comments
 (0)