-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathncbi.py
executable file
·125 lines (101 loc) · 3.01 KB
/
ncbi.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
#!/Library/Frameworks/Python.framework/Versions/3.6/bin/python3
from bs4 import BeautifulSoup
import string
import re
import requests
import json
from sys import stdin
"""
The file ncbi_genes_formatted.txt contains gene data:
1. gene stable id
2. gene name
3. phenotype description
4. hgnc id
5. entrezGene id
The five values for each gene occupy five consecutive lines, in that order.
Any value that is not specified is recorded as 'EMPTY'.
"""
# Patterns are compiled once, up front, instead of once per gene.  Raw
# strings keep "\s" from being parsed as an (invalid) string escape.
NO_ITEMS_FOUND = re.compile(r"No items found")
OFFICIAL_SYMBOL = re.compile(r"Official\s+Symbol")
OFFICIAL_FULL_NAME = re.compile(r"Official\s+Full\s+Name")

NCBI_GENE_URL = "https://www.ncbi.nlm.nih.gov/gene/"

# Input lines per gene: stable id, name, phenotype description, HGNC id,
# entrezGene id.
FIELDS_PER_GENE = 5


def stdin_lines():
    """Yield successive lines from standard input until end of input."""
    while True:
        try:
            line = input()
        except EOFError:
            return
        yield line


def group_records(lines, size=FIELDS_PER_GENE):
    """Group *lines* into consecutive tuples of *size* items.

    A trailing group with fewer than *size* items is dropped, so a gene
    whose lines are cut short by end of input is never processed.
    """
    record = []
    for line in lines:
        record.append(line)
        if len(record) == size:
            yield tuple(record)
            record = []


def new_gene_data(gene_stable_id, gene_name, pheno_desc, hgnc):
    """Return the initial record for one gene, before any scraping.

    The phenotype description is stored as a one-element list, or as an
    empty list when it is blank.  'Found in NCBI' starts out False.
    """
    return {
        'Phenotype Description': [pheno_desc] if pheno_desc.strip() else [],
        'Gene Name': gene_name,
        'HGNC ID': hgnc,
        'Gene Stable ID': gene_stable_id,
        'Found in NCBI': False,
    }


def parse_summary(summary, gene_data):
    """Copy the <dt>/<dd> pairs found under *summary* into *gene_data*.

    Each <dt> sets the key used for the <dd> elements that follow it.
    May raise AttributeError when a node lacks an expected attribute; the
    caller logs that case.
    """
    key = ''
    for node in summary.children:
        if node.name == 'dt':
            if OFFICIAL_SYMBOL.search(node.text) is not None:
                key = 'Official Symbol'
            elif OFFICIAL_FULL_NAME.search(node.text) is not None:
                key = 'Official Full Name'
            else:
                key = node.text
        elif node.name == 'dd':
            if key == 'Also known as':
                gene_data[key] = node.text.split('; ')
                gene_data['Found in NCBI'] = gene_data['Gene Name'] in gene_data[key]
            elif key in ('See related', 'Orthologs'):
                links = {}
                gene_data[key] = links
                for child in node.children:
                    try:
                        label = str(child.contents[0].strip(';'))
                        links[label] = child['href']
                    except AttributeError:
                        # Children without the expected attributes are skipped.
                        continue
            else:
                first = node.contents[0]
                if first == '\n':
                    # The value sits in the element after a leading newline.
                    gene_data[key] = node.contents[1].text
                else:
                    gene_data[key] = first


def fill_from_page(soup, entrez_gene_id, gene_data, error_file):
    """Update *gene_data* from a parsed gene page.

    Nothing is changed when the page title says "No items found".
    Problems are written to *error_file*, one entry per line, instead of
    aborting the run: a missing summary list, a summary that could not be
    walked, or a page without an 'Official Symbol'.
    """
    title = soup.find("title")
    # find() returns None for a page without a <title>; treat it as empty.
    title_text = title.text if title is not None else ""
    if NO_ITEMS_FOUND.search(title_text) is not None:
        return

    summary = soup.find('dl', {"id": 'summaryDl'})
    if summary is None:
        # Previously this fell into the AttributeError handler, which then
        # crashed on summary.prettify().
        error_file.write(entrez_gene_id + " no summaryDl element\n")
        return

    try:
        parse_summary(summary, gene_data)
    except AttributeError:
        error_file.write(entrez_gene_id + "\n")
        error_file.write(summary.prettify() + "\n")
        return

    if not gene_data['Found in NCBI']:
        # Fall back to comparing the input name with the official symbol.
        try:
            gene_data['Found in NCBI'] = gene_data['Gene Name'] == gene_data['Official Symbol']
        except KeyError:
            error_file.write(entrez_gene_id + " no Official Symbol\n")


def scrape_gene(entrez_gene_id, gene_data, error_file):
    """Download the NCBI gene page for *entrez_gene_id* and parse it."""
    # NOTE(review): no timeout is passed, so a stalled connection blocks
    # the run; request errors propagate to the caller unchanged.
    response = requests.get(NCBI_GENE_URL + entrez_gene_id)
    soup = BeautifulSoup(response.content, "lxml")
    fill_from_page(soup, entrez_gene_id, gene_data, error_file)


def main():
    """Read gene records from stdin, scrape each one, and write ncbi.out.

    Every gene is written as '"<entrezGene id>":' followed by its record
    as indented JSON, with no separator between genes.  Problems are
    logged to the file ncbi-errors.
    """
    # "with" closes both files even when an exception ends the run early.
    with open("ncbi-errors", 'w') as error_file, open("ncbi.out", "w") as out_file:
        for record in group_records(stdin_lines()):
            gene_stable_id, gene_name, pheno_desc, hgnc, entrez_gene_id = record
            gene_data = new_gene_data(gene_stable_id, gene_name, pheno_desc, hgnc)
            scrape_gene(entrez_gene_id, gene_data, error_file)
            out_file.write('"' + entrez_gene_id + '":')
            out_file.write(json.dumps(gene_data, sort_keys=False, indent=4))


if __name__ == "__main__":
    main()