extract_align.py

from Bio import SeqIO
import argparse
import os
import re
import subprocess
import fileinput
from Bio.Seq import Seq
from Bio import SeqIO
from Bio.Alphabet import generic_dna
import sys
import pandas as pd
import numpy as np
import pybedtools
import pyfaidx
from pyfaidx import Fasta
import logging
pd.options.mode.chained_assignment = None  # default='warn'

LOGGER = logging.getLogger(__name__)

## Set up input arguments
def get_args():
	parser = argparse.ArgumentParser(description="Will process a blast output generated using a file of putative TEs (usually generated by RepeatModeler. For each putative consensus in the input putative TE library, it will generate an aligned file with N buffered instances from the queried genome, the input consensus, and, if requested, a new revised and extended consensus for inspection.", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
	parser.add_argument('-g', '--genome', type=str, help='Name of the fasta formatted genome to be queried.', required=True)
	parser.add_argument('-b', '--blastfile', type=str, help='Blast output to be used. Must be formatted using "outfmt 6".', required = True)
	parser.add_argument('-l', '--library', type=str, help='Library of putative TE consensus sequences to be extracted and aligned. Must be in fasta format with no # or / in the headers.', required = True)
	parser.add_argument('-b', '--buffer', type=int, help='Buffer size. The number of bp of flanking sequence for each hit to be extracted along with the hit. Optional, Default = 1000', default = 1000)
	parser.add_argument('-n', '--hitnumber', type=str, help='The number of hits to be exracted. Optional. Default = 50.', default = 50)
	parser.add_argument('-a', '--align', type=str, help='Align the output fasta file, y or n?. Default is y.', default = 'y')
	parser.add_argument('-e', '--emboss', type=int, help='Generate a trimal/emboss consensus, y or n. Optional.', default = 'y')
	parser.add_argument("-l", "--log_level", default="INFO")


	args = parser.parse_args()
	GENOME = args.genome
	BLAST = args.blastfile
	LIB = args.library
	BUFFER = args.buffer
	HITNUM = args.hitnumber
	ALIGN = args.align
	EMBOSS = args.emboss
	LOG = args.log_level

	return GENOME, BLAST, LIB, BUFFER, HITNUM, ALIGN, EMBOSS, LOG

## No longer needed --- Reverse complement function for selected blast hits
#def REVCOMP(INPUT):
#	for record in SeqIO.parse(INPUT, 'fasta'):
#		REVSEQ = record.reverse_complement(id='rc_' + record.id, description = 'add coordinates when actually working')
#		return REVSEQ

## Create TE outfiles function. Creates files for populating with blast hits.
def CREATE_TE_OUTFILES(LIBRARY):
	for record in SeqIO.parse(LIBRARY, 'fasta'):
		NEWID = re.sub('#', '__', record.id)
		NEWID = re.sub('/', '___', NEWID)
		record.id = 'CONSENSUS-' + NEWID
		record.description = ''
		SeqIO.write(record, 'tefiles/' + NEWID + '.fa', 'fasta')
				
## Organize blast hits function. Will read in blast file, sort based on e-value and bitscore, ajd deterine top BUFFER hits for extraction.
def ORGANIZE_BLAST(BLAST, BUFFER, HITNUMBER):
##Read in blast data
	BLASTDF = pd.read_table(BLAST, sep='\t', names=['QUERYNAME', 'SCAFFOLD', 'C', 'D', 'E', 'F', 'QUERYSTART', 'QUERYSTOP', 'SCAFSTART', 'SCAFSTOP', 'E-VALUE', 'BITSCORE'])
##Convert to bed format
	BLASTBED = BLASTDF[['SCAFFOLD', 'SCAFSTART', 'SCAFSTOP', 'QUERYNAME', 'E-VALUE', 'BITSCORE']]
	BLASTBED.insert(6, 'STRAND', '+')
	BLASTBED.loc[BLASTBED.SCAFSTOP < BLASTDF.SCAFSTART, 'STRAND'] = '-'
	BLASTBED.SCAFSTART, BLASTBED.SCAFSTOP = np.where(BLASTBED.SCAFSTART > BLASTBED.SCAFSTOP, [BLASTBED.SCAFSTOP, BLASTBED.SCAFSTART], [BLASTBED.SCAFSTART, BLASTBED.SCAFSTOP])
##Generate list of query names
	QUERYLIST = BLASTBED.QUERYNAME.unique()
##Sort subsets of df based on query names, keep the top BUFFER hits, and save bedfiles
	for QUERY in QUERYLIST:
		QUERYFRAME = BLASTBED[BLASTBED['QUERYNAME'] == QUERY]
		QUERYFRAME = QUERYFRAME.sort_values(by=['E-VALUE', 'BITSCORE'], ascending=[True, False])
		QUERYFRAME = QUERYFRAME.head(HITNUM)
		#QUERYFRAME.SCAFSTART = QUERYFRAME.SCAFSTART - BUFFER
		#QUERYFRAME.SCAFSTOP = QUERYFRAME.SCAFSTOP + BUFFER
		#If value in SCAFSTART is < 0, set to 0
		#QUERYFRAME.SCAFSTART = np.where(QUERYFRAME.SCAFSTART < 0, 0, QUERYFRAME.SCAFSTART)
		QUERYFRAME.to_csv('bedfiles/' + QUERY + '.bed', sep='\t', header=True, index=True)
		pybedtools.BedTool.sequence(fi=fasta
	

####MAIN function
def main():
## Set up logging and script timing
#    logging.basicConfig(format='')
#    logging.getLogger().setLevel(getattr(logging, args.log_level.upper()))
#    start_time = time.time()

#    LOGGER.info("#\n# extract_align.py\n#")
	
## Set up directories	
	os.mkdir('tefiles')
	os.mkdir('bedfiles')
	os.mkdir('muscle')
	os.mkdir('consensusfiles')
	
## Index the genome 
	GENOMEIDX = Fasta('GENOME')
	
##Get input arguments
	GENOME, BLAST, LIB, BUFFER, HITNUM, ALIGN, EMBOSS = get_args()
	print('Input genome = ' + GENOME)
	print('Input blast file = ' + BLAST)
	print('Input library file = ' + LIB)
	print('Buffer size = ' + BUFFER)
	print('Number of hits to target = ' + HITNUM)
	
##Determine optional arguments and print to screen.
	if ALIGN == 'n' and EMBOSS == 'y':
		print('Input is contradictory. Generating a new consensus with emboss requires muscle alignment.')
	elif ALIGN == 'y' and EMBOSS == 'y':
		print('Output files will be aligned and a new consensus will be generated with emboss and trimal.')
	elif ALIGN == 'y' and EMBOSS == 'n':
		print('Output files will be aligned but without a new emboss/trimal consensus.')
	elif ALIGN == 'n' and EMBOSS == 'n':
		print('Extractions will be made but no alignment.')
	else:
		print('Invalid input for either align, or emboss, or both.')

##Create TE out files to populate with blast hits
	CREATE_TE_OUTFILES(LIB)
	
	end_time = time.time()
    LOGGER.info("Run time: " + str(datetime.timedelta(seconds=end_time-start_time)))


#
# Wrap script functionality in main() to avoid automatic execution
# when imported ( e.g. when help is called on file )
#
if __name__ =="__main__":main()