-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathextract_align.py
138 lines (115 loc) · 5.92 KB
/
extract_align.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
from Bio import SeqIO
import argparse
import os
import re
import subprocess
import fileinput
from Bio.Seq import Seq
from Bio import SeqIO
from Bio.Alphabet import generic_dna
import sys
import pandas as pd
import numpy as np
import pybedtools
import pyfaidx
from pyfaidx import Fasta
import logging
pd.options.mode.chained_assignment = None # default='warn'
LOGGER = logging.getLogger(__name__)
## Set up input arguments
def get_args():
parser = argparse.ArgumentParser(description="Will process a blast output generated using a file of putative TEs (usually generated by RepeatModeler. For each putative consensus in the input putative TE library, it will generate an aligned file with N buffered instances from the queried genome, the input consensus, and, if requested, a new revised and extended consensus for inspection.", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('-g', '--genome', type=str, help='Name of the fasta formatted genome to be queried.', required=True)
parser.add_argument('-b', '--blastfile', type=str, help='Blast output to be used. Must be formatted using "outfmt 6".', required = True)
parser.add_argument('-l', '--library', type=str, help='Library of putative TE consensus sequences to be extracted and aligned. Must be in fasta format with no # or / in the headers.', required = True)
parser.add_argument('-b', '--buffer', type=int, help='Buffer size. The number of bp of flanking sequence for each hit to be extracted along with the hit. Optional, Default = 1000', default = 1000)
parser.add_argument('-n', '--hitnumber', type=str, help='The number of hits to be exracted. Optional. Default = 50.', default = 50)
parser.add_argument('-a', '--align', type=str, help='Align the output fasta file, y or n?. Default is y.', default = 'y')
parser.add_argument('-e', '--emboss', type=int, help='Generate a trimal/emboss consensus, y or n. Optional.', default = 'y')
parser.add_argument("-l", "--log_level", default="INFO")
args = parser.parse_args()
GENOME = args.genome
BLAST = args.blastfile
LIB = args.library
BUFFER = args.buffer
HITNUM = args.hitnumber
ALIGN = args.align
EMBOSS = args.emboss
LOG = args.log_level
return GENOME, BLAST, LIB, BUFFER, HITNUM, ALIGN, EMBOSS, LOG
## No longer needed --- Reverse complement function for selected blast hits
#def REVCOMP(INPUT):
# for record in SeqIO.parse(INPUT, 'fasta'):
# REVSEQ = record.reverse_complement(id='rc_' + record.id, description = 'add coordinates when actually working')
# return REVSEQ
## Create TE outfiles function. Creates files for populating with blast hits.
def CREATE_TE_OUTFILES(LIBRARY):
for record in SeqIO.parse(LIBRARY, 'fasta'):
NEWID = re.sub('#', '__', record.id)
NEWID = re.sub('/', '___', NEWID)
record.id = 'CONSENSUS-' + NEWID
record.description = ''
SeqIO.write(record, 'tefiles/' + NEWID + '.fa', 'fasta')
## Organize blast hits function. Will read in blast file, sort based on e-value and bitscore, ajd deterine top BUFFER hits for extraction.
def ORGANIZE_BLAST(BLAST, BUFFER, HITNUMBER):
##Read in blast data
BLASTDF = pd.read_table(BLAST, sep='\t', names=['QUERYNAME', 'SCAFFOLD', 'C', 'D', 'E', 'F', 'QUERYSTART', 'QUERYSTOP', 'SCAFSTART', 'SCAFSTOP', 'E-VALUE', 'BITSCORE'])
##Convert to bed format
BLASTBED = BLASTDF[['SCAFFOLD', 'SCAFSTART', 'SCAFSTOP', 'QUERYNAME', 'E-VALUE', 'BITSCORE']]
BLASTBED.insert(6, 'STRAND', '+')
BLASTBED.loc[BLASTBED.SCAFSTOP < BLASTDF.SCAFSTART, 'STRAND'] = '-'
BLASTBED.SCAFSTART, BLASTBED.SCAFSTOP = np.where(BLASTBED.SCAFSTART > BLASTBED.SCAFSTOP, [BLASTBED.SCAFSTOP, BLASTBED.SCAFSTART], [BLASTBED.SCAFSTART, BLASTBED.SCAFSTOP])
##Generate list of query names
QUERYLIST = BLASTBED.QUERYNAME.unique()
##Sort subsets of df based on query names, keep the top BUFFER hits, and save bedfiles
for QUERY in QUERYLIST:
QUERYFRAME = BLASTBED[BLASTBED['QUERYNAME'] == QUERY]
QUERYFRAME = QUERYFRAME.sort_values(by=['E-VALUE', 'BITSCORE'], ascending=[True, False])
QUERYFRAME = QUERYFRAME.head(HITNUM)
#QUERYFRAME.SCAFSTART = QUERYFRAME.SCAFSTART - BUFFER
#QUERYFRAME.SCAFSTOP = QUERYFRAME.SCAFSTOP + BUFFER
#If value in SCAFSTART is < 0, set to 0
#QUERYFRAME.SCAFSTART = np.where(QUERYFRAME.SCAFSTART < 0, 0, QUERYFRAME.SCAFSTART)
QUERYFRAME.to_csv('bedfiles/' + QUERY + '.bed', sep='\t', header=True, index=True)
pybedtools.BedTool.sequence(fi=fasta
####MAIN function
def main():
## Set up logging and script timing
# logging.basicConfig(format='')
# logging.getLogger().setLevel(getattr(logging, args.log_level.upper()))
# start_time = time.time()
# LOGGER.info("#\n# extract_align.py\n#")
## Set up directories
os.mkdir('tefiles')
os.mkdir('bedfiles')
os.mkdir('muscle')
os.mkdir('consensusfiles')
## Index the genome
GENOMEIDX = Fasta('GENOME')
##Get input arguments
GENOME, BLAST, LIB, BUFFER, HITNUM, ALIGN, EMBOSS = get_args()
print('Input genome = ' + GENOME)
print('Input blast file = ' + BLAST)
print('Input library file = ' + LIB)
print('Buffer size = ' + BUFFER)
print('Number of hits to target = ' + HITNUM)
##Determine optional arguments and print to screen.
if ALIGN == 'n' and EMBOSS == 'y':
print('Input is contradictory. Generating a new consensus with emboss requires muscle alignment.')
elif ALIGN == 'y' and EMBOSS == 'y':
print('Output files will be aligned and a new consensus will be generated with emboss and trimal.')
elif ALIGN == 'y' and EMBOSS == 'n':
print('Output files will be aligned but without a new emboss/trimal consensus.')
elif ALIGN == 'n' and EMBOSS == 'n':
print('Extractions will be made but no alignment.')
else:
print('Invalid input for either align, or emboss, or both.')
##Create TE out files to populate with blast hits
CREATE_TE_OUTFILES(LIB)
end_time = time.time()
LOGGER.info("Run time: " + str(datetime.timedelta(seconds=end_time-start_time)))
#
# Wrap script functionality in main() to avoid automatic execution
# when imported ( e.g. when help is called on file )
#
if __name__ =="__main__":main()