-
Notifications
You must be signed in to change notification settings - Fork 5
/
symbol_to_id_lookup.py
executable file
·130 lines (109 loc) · 4.39 KB
/
symbol_to_id_lookup.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#!/usr/bin/env python3
import sys
import os
import re
"""
This script accepts a list of gene or transcript symbol/synonyms and converts them into one or more
FlyBase IDs.
The synonyms file can be downloaded from:
ftp://ftp.flybase.org/releases/current/precomputed_files/synonyms/
Usage:
./symbol_to_id_lookup.py your_symbol_list.txt flybase_synonym_file.tsv > output.tsv
Assumptions:
* Only gene or transcript symbols/synonyms/names.
* Drosophila melanogaster only
Author: Josh Goodman <[email protected]>
"""
def insert_symbol(symbol: str, fbid: str, dict: dict) -> None:
    """
    Record that ``symbol`` maps to ``fbid`` in the lookup dictionary.

    The dictionary is modified in place. Each key is a symbol and each value
    is the set of FlyBase IDs seen for that symbol, so inserting the same
    pair twice is harmless and a symbol shared by several IDs keeps all of
    them. Empty (falsy) symbols are ignored.

    NOTE: the ``dict`` parameter shadows the builtin of the same name; the
    name is kept so existing keyword callers continue to work.

    :param symbol:str - A single symbol to insert into the dictionary.
    :param fbid:str - A single FlyBase ID.
    :param dict:dict - The dictionary reference to modify.
    :return: None
    """
    # Blank symbols never become lookup keys.
    if not symbol:
        return None
    # setdefault creates the set the first time a symbol is seen and returns
    # the existing set otherwise, so one lookup covers both cases.
    dict.setdefault(symbol, set()).add(fbid)
    return None
def generate_inverted_symbol_dict(sym_file: str) -> dict:
    """
    Generates an inverted dictionary of all symbols, synonyms, names, etc.
    as keys and a set of FBids as values.

    Only lines that start with ``FBgn`` or ``FBtr`` and whose first column
    after the ID is exactly ``Dmel`` contribute to the result. After the ID
    and the ``Dmel`` column, the tab-separated columns are read as: symbol,
    fullname, fullname synonyms, symbol synonyms. The symbol is required;
    the remaining columns are optional. A line that is too short to hold the
    required columns is reported on stderr and skipped.

    :param sym_file: str - The FlyBase synonyms file to parse (read as UTF-8).
    :return: The inverted symbol/synonym dictionary.
    """
    # Regex to split name synonyms on commas without spaces.
    # Commas without a trailing space indicate a new name.
    # Commas with a trailing space indicate a name with a comma in it.
    # e.g.
    #   my gene1,my gene2  -> ['my gene1', 'my gene2']
    #   my gene1, my gene2 -> ['my gene1, my gene2']
    comma_ns_re = re.compile(r",(?!\s)")

    symbol_dict = {}

    # Decode explicitly as UTF-8 so non-ASCII synonyms are read the same way
    # regardless of the platform's locale encoding.
    with open(sym_file, "r", encoding="utf-8") as file:
        for line in file:
            line = line.strip()
            # This script only cares about genes or transcripts, ignore the rest.
            if not line.startswith(("FBgn", "FBtr")):
                continue

            # Split out the ID column and all the others.
            fbid, *cols = line.split("\t")

            # Only these two index operations can raise IndexError, so they
            # are the only statements guarded.
            try:
                # Dmel only.
                if cols[0] != "Dmel":
                    continue
                # Symbol
                insert_symbol(cols[1], fbid, symbol_dict)
            except IndexError:
                print(f"Formatting problem found in line:\n{line}", file=sys.stderr)
                continue

            # Fullname (optional column; empty values are ignored by
            # insert_symbol).
            if len(cols) >= 3:
                insert_symbol(cols[2], fbid, symbol_dict)

            # Fullname synonyms and symbol synonyms (both optional). Slicing
            # yields only the columns that are actually present.
            for synonym_col in cols[3:5]:
                for syn in comma_ns_re.split(synonym_col):
                    insert_symbol(syn, fbid, symbol_dict)

    return symbol_dict
if __name__ == "__main__":
    # Read in arguments. Only the unpacking is guarded: a ValueError here
    # means fewer than two arguments were given. Guarding the whole run would
    # also catch unrelated ValueErrors (UnicodeDecodeError is one) raised
    # while reading the files and misreport them as a usage problem.
    try:
        symbols_to_check, fb_synonym = sys.argv[1:3]
    except ValueError:
        print(
            f"Usage: {os.path.basename(__file__)} your_symbols.txt fb_synonym.tsv",
            file=sys.stderr,
        )
        # Non-zero status so shells and pipelines can detect the failure.
        sys.exit(1)

    # Generate the inverted dictionary.
    inverted_symbol_dict = generate_inverted_symbol_dict(fb_synonym)

    # Open their symbol file and loop over it.
    with open(symbols_to_check, "r", encoding="utf-8") as file:
        for symbol in file:
            symbol = symbol.strip()
            # Fetch the ID set for the symbol in their list.
            ids = inverted_symbol_dict.get(symbol)
            if ids is None:
                # Symbol doesn't exist in dictionary: echo it with no IDs.
                print(f"{symbol}")
            else:
                # Sort the IDs so the output is stable between runs; set
                # iteration order for strings is not.
                print(f"{symbol}\t{','.join(sorted(ids))}")