-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathRebaseBlast.py
211 lines (166 loc) · 8.32 KB
/
RebaseBlast.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
#import os to work with UNIX commands
import os
#sys may be used if no input is found (program will close)
import sys
#import SeqIO for parsing fasta files
from Bio import SeqIO
#import Entrez and urllib to download assembly files from NCBI's FTP
from Bio import Entrez
import urllib
#import gzip and shutil for working with zipped assembly files
import gzip
import shutil
#import argparse for command-line parameters
import argparse
#create the command-line parameters for email (for Entrez) and input file
parser = argparse.ArgumentParser()
parser.add_argument('-e', '--email', help="Enter your email for Entrez here", required=True)
parser.add_argument('-i', '--input', help="Input file name (list of NCBI accession numbers, one on each line)")
args = parser.parse_args()
#set Entrez's email to the email provided
Entrez.email = args.email
#import csv for working with csv files
import csv
#read REBASE sequence files, as these records will need to be parsed for
#information on each top hit.
sequences='formattedSeqs.fasta'
print("Reading REBASE sequences from " + sequences + "...")
rebase_records=list(SeqIO.parse(sequences,'fasta'))
#read the genome accession #'s from the input file
#(specified in args.input parameter)
ids=[]
if args.input != None:
for line in open(args.input).readlines():
ids.append(line.strip())
#make a sequences directory to store all sequence fasta files in (if there isn't already one)
if not (os.path.isdir('sequences')):
os.system('mkdir sequences')
#this for loop takes each id and downloads it's assembly file from NCBI through FTP
idsused = 0 #count how many files have been downloaded so the user can follow the progress of the script
for id in ids:
idsused+=1
print("Downloading " + id + " from NCBI... (" + str(idsused) + ")")
#search NCBI assembly database by genome accession # (id), and retrieve only 1 record
handle = Entrez.esearch(db="assembly", term=id+"[id]", retmax='1')
record = Entrez.read(handle)
#get the assembly search id for the genome accession provided
searchid=record['IdList'][0]
#get the assembly summary/report using the assembly search id
esummary_handle=Entrez.esummary(db="assembly",id=searchid, report="full")
esummary_record = Entrez.read(esummary_handle,validate=False)
#get the FTP link for the assembly file
try:
url = esummary_record['DocumentSummarySet']['DocumentSummary'][0]['FtpPath_RefSeq']
except IndexError:
continue
#download the assembly file via FTP to the sequences directory
label = os.path.basename(url)
if not os.path.isfile('sequences/' + f'{label}.fna'):
link = os.path.join(url,label+'_genomic.fna.gz')
link=link.replace('\\','/') #reformat some slashes from the link
urllib.request.urlretrieve(link, 'sequences/' + f'{label}.fna.gz')
else:
print(id + " found in sequences directory.")
#open the output files
csv_output_name = "RMBlastCSV.csv"
csv_output=open(csv_output_name,'w')
fasta_output_name="RMBlastFasta.fasta"
fasta_output=open(fasta_output_name,'w')
#write the headers to the simple output file
csv_output.write('Genome,Contig,RMSystemName,SystemType,E-Value,% Identity,Query Coverage,Bitscore,Length\n')
#make a directory to hold the full BLAST results
if not (os.path.isdir('full_blast_results')):
os.system('mkdir full_blast_results')
#count the files BLASTed so the user can follow the progress of the script
fileCount = 0
#this for loop goes through each file in the sequences directory, unzips it, and BLASTs it against the local REBASE database created in the RebaseSetup script. It then writes information about the top hit to the output file.
for fileName in os.listdir("sequences"):
#if the file is zipped, unzip it
if fileName.endswith('.gz'):
unzippedFileName = fileName[:fileName.index('.gz')]
#unzip the .gz assembly files
with gzip.open("sequences/" + fileName,'rb') as f_in:
with open("sequences/" + unzippedFileName ,'wb') as f_out:
shutil.copyfileobj(f_in, f_out)
#delete the original zipped file
os.system('rm sequences/' + fileName)
#if the file is already unzipped, set its name to the unzipped variable
else:
unzippedFileName = fileName
#make a file name for holding the BLAST results for each file
temp_output_file='temp.csv'
fileCount += 1
print("Running " + unzippedFileName + "..." + "(" + str(fileCount) + ")")
#This blast command will generate a csv formatted output file containing the Query Seq-id, the subject seq-id, the e-value, the query coverage, the percent identity, the bitscore, and the alignment length for each hit.
#the local BLAST db created in RebaseSetup is named LocalRebaseDB
blast_command = 'blastn -ungapped -query sequences/' + unzippedFileName + ' -db LocalRebaseDB -out ' + temp_output_file + ' -outfmt "10 qseqid sseqid evalue pident qcovs bitscore length"'
os.system(blast_command)
#read the csv file and sort it by bitscore
with open(temp_output_file) as csvFile:
reader = csv.reader(csvFile, delimiter=',')
sortedList = sorted(reader,key=lambda x: float(x[5]),reverse=True) #blast output is sorted by bitscore
topHit=sortedList[0] #now we have just the top hit for the multi-fasta assembly
#sseqid is in index 1
topHitName=topHit[1]
#find the top hit in REBASE's records
for record in rebase_records:
if topHitName == record.id:
description = record.description #the description contains several pieces of information about the RM system
topHitSequence = str(record.seq) #get the sequence of the top hit
#the description is tab-delimitted, so split it to get each portion
hitInfo = description.split("\t")
#create a dictionary to store each piece of information
#(the description follows the format KEY:VALUE\tKEY:VALUE\t, etc.)
hitDict = {}
for info in hitInfo:
hitDict[info[:info.index(":")]] = info[info.index(":")+1:]
genomeName=unzippedFileName #genome name is the file name
contig=topHit[0] #contig where the top hit matched to is provided in index 0 of the temporary csv output
#get the name of the system and its type from the REBASE description
nameOfSystem = hitDict["REBASE"]
systemType = hitDict["EnzType"]
try:
proteinID = hitDict["ProteinId"]
except KeyError:
proteinID = "N/A"
eValue = topHit[2]
pIdent = topHit[3]
qCov = topHit[4]
bitscore = topHit[5]
length = topHit[6]
#write relevant output to the output files
csv_output.write(genomeName + ",")
csv_output.write(contig + ",")
csv_output.write(nameOfSystem + ",")
csv_output.write(systemType + ",")
csv_output.write(eValue + ",")
csv_output.write(pIdent + ",")
csv_output.write(qCov + ",")
csv_output.write(bitscore + ",")
csv_output.write(length + "\n")
fasta_output.write(">Genome: " + genomeName + "\t")
fasta_output.write("Contig: " + contig + "\t")
fasta_output.write("RMSystemName: " + nameOfSystem + "\t")
fasta_output.write("SystemType: " + systemType + "\t")
fasta_output.write("ProteinID: " + proteinID + "\t")
fasta_output.write("E-Val: " + eValue + "\t")
fasta_output.write("Bitscore: " + bitscore + "\n")
fasta_output.write(topHitSequence + "\n")
#write full BLAST output to a file and save it in the full_blast_results directory
with open("full_blast_results/" + unzippedFileName[:unzippedFileName.index('_ASM')] + ".csv", 'w') as fullBlastOutput:
fullBlastOutput.write('qseqid,sseqid,evalue,pident,qcovs,bitscore,length\n')
for line in open('temp.csv'):
fullBlastOutput.write(line)
#close the output files
csv_output.close()
fasta_output.close()
#if no files were found, notify the user
if fileCount == 0:
print("No input detected. FASTA files must be in a directory named 'sequences', or files may be downloaded from NCBI using --input (a file with accession numbers listed one per line)")
sys.exit()
#remove the temporary csv file
os.system('rm temp.csv')
print("Done!")
print("Output with more BLAST information (without sequences, csv formatted) can be found in " + csv_output_name + ".")
print("Output with sequences from REBASE and protein IDs (fasta formatted) can be found in " + fasta_output_name + ".")
print("Full BLAST results for each genome ran can be found in full_blast_results/")