diff --git a/src/python/ensembl/tools/anno/finalise_genset/cpc2.py b/src/python/ensembl/tools/anno/finalise_genset/cpc2.py new file mode 100644 index 0000000..946bb96 --- /dev/null +++ b/src/python/ensembl/tools/anno/finalise_genset/cpc2.py @@ -0,0 +1,73 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +check_file(cpc2_output_path) +cpc2_output_path = validation_dir / "cpc2.tsv" + + cpc2_volume = f"{validation_dir}/:/app:rw" + cpc2_cmd = [ + "singularity", + "exec", + "--bind", + str(cpc2_volume), + str(cpc2_bin), + "python3", + "/CPC2_standalone-1.0.1/bin/CPC2.py", + "-i", + str(cdna_file), + "--ORF", + "-o", + str(cpc2_output_path), + ] + logger.info(" ".join(cpc2_cmd)) + subprocess.run(cpc2_cmd, check=True) + cpc2_output_path = f"{str(cpc2_output_path)}.txt" + + +def read_cpc2_results(file_path): + + results = [] + + file_in = open(file_path) + line = file_in.readline() + while line: + line = line.rstrip() + match = re.search(r"^#ID", line) + if match: + line = file_in.readline() + continue + + eles = line.split("\t") + if not len(eles) == 9: + line = file_in.readline() + continue + + transcript_id = eles[0] + transcript_length = eles[1] + peptide_length = eles[2] + coding_probability = eles[7] + coding_potential = eles[8] + results.append( + [ + transcript_id, + coding_probability, + coding_potential, + transcript_length, + peptide_length, + ] + ) + line = file_in.readline() + file_in.close() + + return results \ No newline at end of file diff --git a/src/python/ensembl/tools/anno/finalise_genset/diamond.py b/src/python/ensembl/tools/anno/finalise_genset/diamond.py new file mode 100644 index 0000000..c63a533 --- /dev/null +++ b/src/python/ensembl/tools/anno/finalise_genset/diamond.py @@ -0,0 +1,101 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + logger.info("diamond validation") + diamond_results = None + if diamond_validation_db is not None: + diamond_output_dir = create_dir(validation_dir, "diamond_output") + diamond_validation( + diamond_validation_db, + amino_acid_file, + diamond_output_dir, + num_threads, + ) + diamond_results = read_diamond_results(diamond_output_dir) + + + def diamond_validation( + diamond_validation_db: Path, + amino_acid_file: Path, + diamond_output_dir: Path, + num_threads: int +)-> None: + + batched_protein_files = split_protein_file(amino_acid_file, diamond_output_dir, 100) + + pool = multiprocessing.Pool(int(num_threads)) # pylint: disable=consider-using-with + for batched_protein_file in batched_protein_files: + pool.apply_async( + multiprocess_diamond, + args=( + batched_protein_file, + diamond_output_dir, + diamond_validation_db, + ), + ) + pool.close() + pool.join() + + +def multiprocess_diamond( + batched_protein_file, + diamond_output_dir, + diamond_validation_db, +)->None: + + #batch_num = os.path.splitext(batched_protein_file)[0] + #batch_dir = os.path.dirname(batched_protein_file) + diamond_output_file = f"{batched_protein_file}.dmdout" + logger.info("Running diamond on %s :", batched_protein_file) + + diamond_cmd = [ + "diamond", + "blastp", + "--query", + batched_protein_file, + "--db", + diamond_validation_db, + "--out", + diamond_output_file, + ] + + logger.info(" ".join(diamond_cmd)) + subprocess.run(diamond_cmd) + subprocess.run(["mv", diamond_output_file, diamond_output_dir]) + + +def read_diamond_results(diamond_output_dir): + + results = [] + diamond_files = glob.glob(diamond_output_dir + "/*.dmdout") + for file_path in diamond_files: + file_in = open(file_path) + line = file_in.readline() + while line: + line = line.rstrip() + + eles = line.split("\t") + if not len(eles) == 12: + line = file_in.readline() + continue + + transcript_id = eles[0] + e_value = eles[10] + results.append([transcript_id, e_value]) + line = file_in.readline() + file_in.close() + + return results + \ No newline at end of file diff --git a/src/python/ensembl/tools/anno/finalise_genset/finalise_geneset_utils.py b/src/python/ensembl/tools/anno/finalise_genset/finalise_geneset_utils.py index 989da6b..178fbb4 100644 --- a/src/python/ensembl/tools/anno/finalise_genset/finalise_geneset_utils.py +++ b/src/python/ensembl/tools/anno/finalise_genset/finalise_geneset_utils.py @@ -12,210 +12,188 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +""" +Set of functions used to collect the results of the subpipelines and generate the final output +This logic will be revised in the nextflow pipeline +""" +__all__ = ["run_finalise_geneset"] + +import logging +import logging.config +import multiprocessing +import os +from pathlib import Path +import re +import subprocess +from typing import List +import shutil + +from ensembl.tools.anno.utils._utils import ( + check_file, + get_seq_region_length, + fasta_to_dict, + check_exe, + create_dir, + check_gtf_content, + split_protein_file +) + +logger = logging.getLogger(__name__) def run_finalise_geneset( - main_script_dir, - main_output_dir, - genome_file, - seq_region_names, - validation_type, - diamond_validation_db, - num_threads, -): - - if validation_type is None: - logger.info("Setting validation type to relaxed") - else: - logger.info("Setting validation type to " + validation_type) + main_script_dir: Path, + main_output_dir: Path, + genome_file: Path, + seq_region_names: List[str], + diamond_validation_db :Path, + validation_type: str = "relaxed", + num_threads: int = 1, + cpc2_bin: Path = Path("/hps/software/users/ensembl/genebuild/genebuild_virtual_user/singularity/test_cpc2.sif"), + rnasamba_bin: Path = Path("/hps/software/users/ensembl/genebuild/genebuild_virtual_user/singularity/rnasamba_latest.sif"), + rnasamba_weights: Path= Path("/nfs/production/flicek/ensembl/genebuild/genebuild_virtual_user/rnasamba_data/full_length_weights.hdf5"), - final_annotation_dir = utils.create_dir(main_output_dir, "annotation_output") - region_annotation_dir = utils.create_dir(final_annotation_dir, "initial_region_gtfs") - final_region_annotation_dir = utils.create_dir( +): + """Collect results + + :param main_script_dir: + :type main_script_dir: Path + :param main_output_dir: _description_ + :type main_output_dir: Path + :param genome_file: Path for genome fasta file + :type genome_file: Path + :param seq_region_names: list of seq region names + :type seq_region_names: List[str] + :param diamond_validation_db: Path for diamond validation db + :type diamond_validation_db: Path + :param validation_type: type of validation + :type validation_type: str, default "relaxed" + :param num_threads: Num of threads. Defaults to 1. + :type num_threads: int, default 1 + :param cpc2_bin: CPC2 software path + :type cpc2_bin: Path, default "/hps/software/users/ensembl/genebuild/genebuild_virtual_user/singularity/test_cpc2.sif" + :param rnasamba_bin: RNASamba software path + :type rnasamba_bin: Path, default "/hps/software/users/ensembl/genebuild/genebuild_virtual_user/singularity/rnasamba_latest.sif" + :param rnasamba_weights: RNASamba weights path + :type rnasamba_weights: Path, default "/nfs/production/flicek/ensembl/genebuild/genebuild_virtual_user/rnasamba_data/full_length_weights.hdf5" + + """ + + + final_annotation_dir = create_dir(main_output_dir, "annotation_output") + region_annotation_dir = create_dir(final_annotation_dir, "initial_region_gtfs") + final_region_annotation_dir = create_dir( final_annotation_dir, "final_region_gtfs" ) - utr_region_annotation_dir = utils.create_dir(final_annotation_dir, "utr_region_gtfs") - validation_dir = utils.create_dir(final_annotation_dir, "cds_validation") - seq_region_lengths = utils.get_seq_region_lengths(genome_file, 0) - - logger.info("Skip analysis if the gtf file already exists") - output_file = os.path.join(final_annotation_dir, "annotation.gtf") - if os.path.exists(output_file): - logger.info("final_annotation_dir exists") - transcript_count = utils.check_gtf_content(output_file, "transcript") + utr_region_annotation_dir = create_dir(final_annotation_dir, "utr_region_gtfs") + validation_dir = create_dir(final_annotation_dir, "cds_validation") + seq_region_lengths = get_seq_region_length(genome_file, 0) + output_file = final_annotation_dir / "annotation.gtf" + if output_file.exists(): + transcript_count = check_gtf_content(output_file, "transcript") if transcript_count > 0: - logger.info("Final gtf file exists") + logger.info("Final gtf file exists, skipping analysis") return - else: - logger.info("No gtf file, go on with the analysis") - # This used to be a list of output dirs and a loop which was neat, - # I'm coverting to a list of conditions as - # it's more straightforward with the renaming - # and having to merge scallop and stringtie - protein_annotation_raw = os.path.join( - main_output_dir, "genblast_output", "annotation.gtf" - ) - minimap2_annotation_raw = os.path.join( - main_output_dir, "minimap2_output", "annotation.gtf" - ) - stringtie_annotation_raw = os.path.join( - main_output_dir, "stringtie_output", "annotation.gtf" - ) - scallop_annotation_raw = os.path.join( - main_output_dir, "scallop_output", "annotation.gtf" - ) - busco_annotation_raw = os.path.join(main_output_dir, "busco_output", "annotation.gtf") - - transcript_selector_script = os.path.join( - main_script_dir, "support_scripts_perl", "select_best_transcripts.pl" - ) - finalise_geneset_script = os.path.join( - main_script_dir, "support_scripts_perl", "finalise_geneset.pl" - ) - clean_geneset_script = os.path.join( - main_script_dir, "support_scripts_perl", "clean_geneset.pl" - ) - clean_utrs_script = os.path.join( - main_script_dir, "support_scripts_perl", "clean_utrs_and_lncRNAs.pl" - ) - gtf_to_seq_script = os.path.join( - main_script_dir, "support_scripts_perl", "gtf_to_seq.pl" - ) - transcriptomic_annotation_raw = os.path.join( - final_annotation_dir, "transcriptomic_raw.gtf" - ) - file_out = open(transcriptomic_annotation_raw, "w+") - for transcriptomic_file in [ + protein_annotation_raw = main_output_dir / "genblast_output" / "annotation.gtf" + minimap2_annotation_raw = main_output_dir / "minimap2_output" / "annotation.gtf" + stringtie_annotation_raw = main_output_dir / "stringtie_output" / "annotation.gtf" + scallop_annotation_raw = main_output_dir / "scallop_output" / "annotation.gtf" + busco_annotation_raw = main_output_dir / "busco_output" / "annotation.gtf" + transcript_selector_script = main_output_dir / "support_scripts_perl" / "select_best_transcripts.pl" + finalise_geneset_script = main_output_dir / "support_scripts_perl" / "finalise_geneset.pl" + clean_geneset_script = main_output_dir / "support_scripts_perl" / "clean_geneset.pl" + clean_utrs_script = main_output_dir / "support_scripts_perl" / "clean_utrs_and_lncRNAs.pl" + gtf_to_seq_script = main_output_dir / "support_scripts_perl" / "gtf_to_seq.pl" + + transcriptomic_annotation_raw = main_output_dir / "transcriptomic_raw.gtf" + + with transcriptomic_annotation_raw.open("w") as file_out: + for transcriptomic_file in [ minimap2_annotation_raw, scallop_annotation_raw, stringtie_annotation_raw, - ]: - - if not os.path.exists(transcriptomic_file): - logger.info( - "No annotation.gtf file found in " + transcriptomic_file + ", skipping" - ) - continue - - file_in = open(transcriptomic_file) - line = file_in.readline() - while line: - print(line.rstrip(), file=file_out) - line = file_in.readline() - file_in.close() - file_out.close() + ]: + if not transcriptomic_file.exists(): + logger.info("No annotation.gtf file found in %s, skipping", transcriptomic_file) + continue - # Copy the raw files into the annotation dir, this is not needed - # as such, but collecting them in one place and relabelling is - # helpful for a user - if os.path.exists(busco_annotation_raw): - subprocess.run( - [ - "cp", - busco_annotation_raw, - os.path.join(final_annotation_dir, "busco_raw.gtf"), - ] - ) + with transcriptomic_file.open("r") as file_in: + for line in file_in: + file_out.write(line) - if os.path.exists(protein_annotation_raw): - subprocess.run( - [ - "cp", - protein_annotation_raw, - os.path.join(final_annotation_dir, "protein_raw.gtf"), - ] - ) + # Copy the raw files into the annotation dir + copy_raw_files(busco_annotation_raw, final_annotation_dir / "busco_raw.gtf") + copy_raw_files(protein_annotation_raw, final_annotation_dir / "protein_raw.gtf") - gtf_files = ["transcriptomic_raw.gtf", "protein_raw.gtf", "busco_raw.gtf"] + #select best transcript generic_select_cmd = [ "perl", - transcript_selector_script, + str(transcript_selector_script), "-genome_file", - genome_file, + str(genome_file), ] - pool = multiprocessing.Pool(int(num_threads)) + pool = multiprocessing.Pool(int(num_threads)) # pylint: disable=consider-using-with for seq_region_name in seq_region_names: # The selection script needs different params depending on # whether the seqs are from transcriptomic data or not - region_details = ( - seq_region_name + ".rs1" + ".re" + str(seq_region_lengths[seq_region_name]) - ) - transcriptomic_region_gtf_path = os.path.join( - region_annotation_dir, (region_details + ".trans.gtf") - ) - busco_region_gtf_path = os.path.join( - region_annotation_dir, (region_details + ".busco.gtf") - ) - protein_region_gtf_path = os.path.join( - region_annotation_dir, (region_details + ".protein.gtf") - ) - - if os.path.exists(transcriptomic_annotation_raw): - logger.info("Finalising transcriptomic data for: " + seq_region_name) - transcriptomic_annotation_select = re.sub( - "_raw.gtf", "_sel.gtf", transcriptomic_annotation_raw - ) + region_details = f"{seq_region_name}.rs1.re{seq_region_lengths[seq_region_name]}" + transcriptomic_region_gtf_path = region_annotation_dir / f"{region_details}.trans.gtf" + busco_region_gtf_path = region_annotation_dir / f"{region_details}.busco.gtf" + protein_region_gtf_path = region_annotation_dir / f"{region_details}.protein.gtf" + if transcriptomic_annotation_raw: cmd = generic_select_cmd.copy() cmd.extend( [ "-region_details", region_details, "-input_gtf_file", - transcriptomic_annotation_raw, + str(transcriptomic_annotation_raw), "-output_gtf_file", - transcriptomic_region_gtf_path, + str(transcriptomic_region_gtf_path), "-cds_search", "-final_biotype", "transcriptomic", ] ) - pool.apply_async(multiprocess_finalise_geneset, args=(cmd,)) - - if os.path.exists(busco_annotation_raw): - logger.info("Finalising BUSCO data for: " + seq_region_name) - busco_annotation_select = re.sub("_raw.gtf", "_sel.gtf", busco_annotation_raw) + pool.apply_async(run_command, args=(cmd,)) + if busco_annotation_raw: cmd = generic_select_cmd.copy() cmd.extend( [ "-region_details", region_details, "-input_gtf_file", - busco_annotation_raw, + str(busco_annotation_raw), "-output_gtf_file", - busco_region_gtf_path, + str(busco_region_gtf_path), "-all_cds_exons", "-final_biotype", "busco", ] ) - pool.apply_async(multiprocess_finalise_geneset, args=(cmd,)) - - if os.path.exists(protein_annotation_raw): - logger.info("Finalising protein data for: " + seq_region_name) - protein_annotation_select = re.sub( - "_raw.gtf", "_sel.gtf", protein_annotation_raw - ) + pool.apply_async(run_command, args=(cmd,)) + if protein_annotation_raw: cmd = generic_select_cmd.copy() cmd.extend( [ "-region_details", region_details, "-input_gtf_file", - protein_annotation_raw, + str(protein_annotation_raw), "-output_gtf_file", - protein_region_gtf_path, + str(protein_region_gtf_path), "-clean_transcripts", "-all_cds_exons", "-final_biotype", "protein", ] ) - pool.apply_async(multiprocess_finalise_geneset, args=(cmd,)) + pool.apply_async(run_command, args=(cmd,)) pool.close() pool.join() - # At this point we will have the region files for all the, + # At this point we will have the region files for each seq region and we can merge them merge_finalise_output_files( final_annotation_dir, region_annotation_dir, @@ -230,46 +208,48 @@ def run_finalise_geneset( ) # Create a single GTF file with all the selected transcripts - # now that they have proper ids - fully_merged_gtf_path = os.path.join( - final_annotation_dir, "all_selected_transcripts.gtf" - ) - fully_merged_gtf_out = open(fully_merged_gtf_path, "w+") - - merge_gtf_cmd = ["cat"] - merge_gtf_cmd.extend(glob.glob(final_annotation_dir + "/*_sel.gtf")) - subprocess.run(merge_gtf_cmd, stdout=fully_merged_gtf_out) - fully_merged_gtf_out.close() - + # now that they have proper ids + fully_merged_gtf_path = final_annotation_dir / "all_selected_transcripts.gtf" + # Collect all *_sel.gtf files + gtf_files = list(final_annotation_dir.glob("*_sel.gtf")) + + # Ensure there are files to merge + if not gtf_files: + raise FileNotFoundError("No *_sel.gtf files found in %s",final_annotation_dir) + + + try: + with fully_merged_gtf_path.open("w+") as fully_merged_gtf_out: + merge_gtf_cmd = ["cat"] + [str(gtf_file) for gtf_file in gtf_files] + subprocess.run(merge_gtf_cmd, stdout=fully_merged_gtf_out, check=True) + logger.info("Merged GTF files into %s", fully_merged_gtf_path) + except Exception as e: + print("An error occurred while merging GTF files: %s",e) + # Now collapse the gene set generic_finalise_cmd = [ "perl", - finalise_geneset_script, + str(finalise_geneset_script), "-genome_file", - genome_file, + str(genome_file), ] - pool = multiprocessing.Pool(int(num_threads)) + pool = multiprocessing.Pool(int(num_threads)) # pylint:disable=consider-using-with for seq_region_name in seq_region_names: - region_details = ( - seq_region_name + ".rs1" + ".re" + str(seq_region_lengths[seq_region_name]) - ) - final_region_gtf_path = os.path.join( - final_region_annotation_dir, (region_details + ".final.gtf") - ) - + region_details = f"{seq_region_name}.rs1.re{seq_region_lengths[seq_region_name]}" + final_region_gtf_path = final_region_annotation_dir / f"{region_details}.final.gtf" cmd = generic_finalise_cmd.copy() cmd.extend( [ "-region_details", region_details, "-input_gtf_file", - fully_merged_gtf_path, + str(fully_merged_gtf_path), "-output_gtf_file", - final_region_gtf_path, + str(final_region_gtf_path), ] ) - pool.apply_async(multiprocess_finalise_geneset, args=(cmd,)) + pool.apply_async(run_command, args=(cmd,)) pool.close() pool.join() @@ -280,11 +260,10 @@ def run_finalise_geneset( ".final.gtf", "prevalidation", ) - merged_gtf_file = os.path.join(final_annotation_dir, ("prevalidation_sel.gtf")) - merged_cdna_file = os.path.join(final_annotation_dir, ("prevalidation_sel.cdna.fa")) - merged_amino_acid_file = os.path.join( - final_annotation_dir, ("prevalidation_sel.prot.fa") - ) + merged_gtf_file = final_annotation_dir / "prevalidation_sel.gtf" + merged_cdna_file = final_annotation_dir / "prevalidation_sel.cdna.fa" + merged_amino_acid_file = final_annotation_dir /"prevalidation_sel.prot.fa" + #validation updated_gtf_lines = validate_coding_transcripts( merged_cdna_file, merged_amino_acid_file, @@ -293,7 +272,12 @@ def run_finalise_geneset( diamond_validation_db, merged_gtf_file, num_threads, + cpc2_bin, + rnasamba_bin, + rnasamba_weights ) + + #### postvalidation_gtf_file = os.path.join(final_annotation_dir, ("postvalidation.gtf")) file_out = open(postvalidation_gtf_file, "w+") for line in updated_gtf_lines: @@ -380,67 +364,43 @@ def run_finalise_geneset( def validate_coding_transcripts( - cdna_file, - amino_acid_file, - validation_dir, - validation_type, - diamond_validation_db, - gtf_file, - num_threads, + cdna_file:Path, + amino_acid_file:Path, + validation_dir:Path, + validation_type:str, + diamond_validation_db:Path, + gtf_file:Path, + num_threads:int, + cpc2_bin: Path = Path("/hps/software/users/ensembl/genebuild/genebuild_virtual_user/singularity/test_cpc2.sif"), + rnasamba_bin: Path = Path("/hps/software/users/ensembl/genebuild/genebuild_virtual_user/singularity/rnasamba_latest.sif"), + rnasamba_weights: Path= Path("/nfs/production/flicek/ensembl/genebuild/genebuild_virtual_user/rnasamba_data/full_length_weights.hdf5"), + ): + """Run validation on protein coding transcript + using CPC2, RNASamba, Diamond + + Args: + cdna_file (Path): Input cdna file + amino_acid_file (Path): Input protein file + validation_dir (Path): Validation output directory + validation_type (str): _description_ + diamond_validation_db (Path): _description_ + gtf_file (Path): _description_ + num_threads (int): number of threads + + Returns: + _type_: _description_ + """ logger.info("Running CDS validation with RNAsamba and CPC2") - rnasamba_weights = config["rnasamba"]["weights"] - rnasamba_output_path = os.path.join(validation_dir, "rnasamba.tsv.txt") - cpc2_output_path = os.path.join(validation_dir, "cpc2.tsv") - rnasamba_volume = validation_dir + "/:/app:rw" - rnasamba_cmd = [ - "singularity", - "exec", - "--bind", - rnasamba_volume, - config["rnasamba"]["software"], - "rnasamba", - "classify", - rnasamba_output_path, - cdna_file, - rnasamba_weights, - ] - logger.info(" ".join(rnasamba_cmd)) - subprocess.run(rnasamba_cmd) - cpc2_volume = validation_dir + "/:/app:rw" - cpc2_cmd = [ - "singularity", - "exec", - "--bind", - cpc2_volume, - config["cpc2"]["software"], - "python3", - "/CPC2_standalone-1.0.1/bin/CPC2.py", - "-i", - cdna_file, - "--ORF", - "-o", - cpc2_output_path, - ] - logger.info(" ".join(cpc2_cmd)) - subprocess.run(cpc2_cmd) - cpc2_output_path = cpc2_output_path + ".txt" - - check_file(rnasamba_output_path) - check_file(cpc2_output_path) - - logger.info("diamond validation") - diamond_results = None - if diamond_validation_db is not None: - diamond_output_dir = utils.create_dir(validation_dir, "diamond_output") - diamond_validation( - diamond_validation_db, - amino_acid_file, - diamond_output_dir, - num_threads, - ) - diamond_results = read_diamond_results(diamond_output_dir) + #rnasamba_weights = config["rnasamba"]["weights"] + + + + + + + logger.info("read results") rnasamba_results = read_rnasamba_results(rnasamba_output_path) @@ -455,139 +415,6 @@ def validate_coding_transcripts( return updated_gtf_lines -def diamond_validation( - diamond_validation_db, amino_acid_file, diamond_output_dir, num_threads -): - - batched_protein_files = split_protein_file(amino_acid_file, diamond_output_dir, 100) - - pool = multiprocessing.Pool(int(num_threads)) - for batched_protein_file in batched_protein_files: - pool.apply_async( - multiprocess_diamond, - args=( - batched_protein_file, - diamond_output_dir, - diamond_validation_db, - ), - ) - pool.close() - pool.join() - - -def multiprocess_diamond( - batched_protein_file, - diamond_output_dir, - diamond_validation_db, -): - - batch_num = os.path.splitext(batched_protein_file)[0] - batch_dir = os.path.dirname(batched_protein_file) - diamond_output_file = batched_protein_file + ".dmdout" - logger.info("Running diamond on " + batched_protein_file + ":") - - diamond_cmd = [ - "diamond", - "blastp", - "--query", - batched_protein_file, - "--db", - diamond_validation_db, - "--out", - diamond_output_file, - ] - - logger.info(" ".join(diamond_cmd)) - subprocess.run(diamond_cmd) - subprocess.run(["mv", diamond_output_file, diamond_output_dir]) -def read_rnasamba_results(file_path): - - results = [] - - file_in = open(file_path) - line = file_in.readline() - while line: - line = line.rstrip() - match = re.search(r"^sequence_name", line) - if match: - line = file_in.readline() - continue - - eles = line.split("\t") - if not len(eles) == 3: - line = file_in.readline() - continue - - transcript_id = eles[0] - coding_probability = eles[1] - coding_potential = eles[2] - results.append([transcript_id, coding_probability, coding_potential]) - line = file_in.readline() - file_in.close() - - return results - - -def read_cpc2_results(file_path): - - results = [] - - file_in = open(file_path) - line = file_in.readline() - while line: - line = line.rstrip() - match = re.search(r"^#ID", line) - if match: - line = file_in.readline() - continue - - eles = line.split("\t") - if not len(eles) == 9: - line = file_in.readline() - continue - - transcript_id = eles[0] - transcript_length = eles[1] - peptide_length = eles[2] - coding_probability = eles[7] - coding_potential = eles[8] - results.append( - [ - transcript_id, - coding_probability, - coding_potential, - transcript_length, - peptide_length, - ] - ) - line = file_in.readline() - file_in.close() - - return results - - -def read_diamond_results(diamond_output_dir): - - results = [] - diamond_files = glob.glob(diamond_output_dir + "/*.dmdout") - for file_path in diamond_files: - file_in = open(file_path) - line = file_in.readline() - while line: - line = line.rstrip() - - eles = line.split("\t") - if not len(eles) == 12: - line = file_in.readline() - continue - - transcript_id = eles[0] - e_value = eles[10] - results.append([transcript_id, e_value]) - line = file_in.readline() - file_in.close() - - return results def combine_results(rnasamba_results, cpc2_results, diamond_results): @@ -625,7 +452,7 @@ def combine_results(rnasamba_results, cpc2_results, diamond_results): transcript_id = result[0] e_value = result[1] # There seems to be an issue where there are a small number - # of sequences that don't make it into the cpc2/rnasamba output + # of sequences that don't make it into the cpc2/rnasamba oqutput # Should code in a system for this, but it would be good to # understand why it happens to begin with. Seems to be the same # number of missing seqs in both, so maybe a shared cut-off @@ -635,16 +462,20 @@ def combine_results(rnasamba_results, cpc2_results, diamond_results): return transcript_ids def merge_finalise_output_files( - final_annotation_dir, region_annotation_dir, extension, id_label -): - - gtf_files = glob.glob(region_annotation_dir + "/*" + extension) - - merged_gtf_file = os.path.join(final_annotation_dir, (id_label + "_sel.gtf")) - merged_cdna_file = os.path.join(final_annotation_dir, (id_label + "_sel.cdna.fa")) - merged_amino_acid_file = os.path.join( - final_annotation_dir, (id_label + "_sel.prot.fa") - ) + final_annotation_dir: Path, region_annotation_dir: Path, extension: str, id_label: str +)-> None: + """Merge the resulting gene models from all the seq regions for the specified extension + + Args: + final_annotation_dir (Path): Output directory + region_annotation_dir (Path): Input directory + extension (str): File extension + id_label (str): Type of data or process to merge + """ + gtf_files = list(region_annotation_dir.glob(f"*{extension}")) + merged_gtf_file = final_annotation_dir / f"{id_label}_sel.gtf" + merged_cdna_file = final_annotation_dir / f"{id_label}_sel.cdna.fa" + merged_amino_acid_file = final_annotation_dir / f"{id_label}_sel.prot.fa" # The below is not great, it's a bit messy because there might be # some cases where there aren't translations. So it's not as @@ -657,79 +488,177 @@ def merge_finalise_output_files( gene_id_counter = 0 transcript_id_counter = 0 - gtf_out = open(merged_gtf_file, "w+") - cdna_out = open(merged_cdna_file, "w+") - amino_acid_out = open(merged_amino_acid_file, "w+") - for gtf_file in gtf_files: - logger.info("GTF file: " + gtf_file) - cdna_seq_index = {} - amino_acid_seq_index = {} - cdna_file = gtf_file + ".cdna" - amino_acid_file = gtf_file + ".prot" - cdna_in = open(cdna_file) - amino_acid_in = open(amino_acid_file) - cdna_seq_index = fasta_to_dict(cdna_in.readlines()) - amino_acid_seq_index = fasta_to_dict(amino_acid_in.readlines()) - cdna_in.close() - amino_acid_in.close() - - current_gene_id = "" - gtf_in = open(gtf_file) - line = gtf_in.readline() - while line: - if re.search(r"^#", line): - line = gtf_in.readline() - continue - - eles = line.split("\t") - if not len(eles) == 9: - line = gtf_in.readline() - continue - - match = re.search(r'gene_id "([^"]+)".+transcript_id "([^"]+)"', line) - if match and eles[2] == "transcript": - transcript_id_counter += 1 + + with merged_gtf_file.open("w") as gtf_out, \ + merged_cdna_file.open("w") as cdna_out, \ + merged_amino_acid_file.open("w") as amino_acid_out: + + for gtf_file in gtf_files: + logger.info("GTF file: %s", gtf_file) + cdna_seq_index = {} + amino_acid_seq_index = {} + cdna_file = gtf_file.with_suffix(".cdna") + amino_acid_file = gtf_file.with_suffix(".prot") + if cdna_file.exists(): + with cdna_file.open() as cdna_in: + cdna_seq_index = fasta_to_dict(cdna_in.readlines()) + + if amino_acid_file.exists(): + with amino_acid_file.open() as amino_acid_in: + amino_acid_seq_index = fasta_to_dict(amino_acid_in.readlines()) + current_gene_id = "" + with gtf_file.open() as gtf_in: + for line in gtf_in: + if re.search(r"^#", line): + #if line.startswith("#"): + continue + eles = line.split("\t") + if len(eles) != 9: + continue + + match = re.search(r'gene_id "([^"]+)".+transcript_id "([^"]+)"', line) + if match and eles[2] == "transcript": + transcript_id_counter += 1 + gene_id = match.group(1) + transcript_id = match.group(2) + + if not current_gene_id: + gene_id_counter += 1 + current_gene_id = gene_id + + if gene_id != current_gene_id: + gene_id_counter += 1 + current_gene_id = gene_id + new_gene_id = f"{id_label}_{gene_id_counter}" + new_transcript_id = f"{id_label}_{transcript_id_counter}" + + line = re.sub(f'gene_id "{gene_id}"', f'gene_id "{new_gene_id}"', line) + line = re.sub(f'transcript_id "{transcript_id}"', f'transcript_id "{new_transcript_id}"', line) + gtf_out.write(line) + if eles[2] == "transcript": + new_header = f">{new_transcript_id}\n" + cdna_out.write(new_header + cdna_seq_index[transcript_id]) + if transcript_id in amino_acid_seq_index: + amino_acid_out.write(new_header + amino_acid_seq_index[transcript_id]) - gene_id = match.group(1) - transcript_id = match.group(2) - if not current_gene_id: - gene_id_counter += 1 - current_gene_id = gene_id - - if not gene_id == current_gene_id: - gene_id_counter += 1 - current_gene_id = gene_id - - new_gene_id = id_label + "_" + str(gene_id_counter) - new_transcript_id = id_label + "_" + str(transcript_id_counter) - line = re.sub( - 'gene_id "' + gene_id + '"', - ('gene_id "' + new_gene_id + '"'), - line, - ) - line = re.sub( - 'transcript_id "' + transcript_id + '"', - ('transcript_id "' + new_transcript_id + '"'), - line, - ) - gtf_out.write(line) - line = gtf_in.readline() - - if eles[2] == "transcript": - new_header = ">" + new_transcript_id + "\n" - cdna_out.write(new_header + cdna_seq_index[transcript_id]) +def multiprocess_finalise_geneset(cmd): - if transcript_id in amino_acid_seq_index: - amino_acid_out.write(new_header + amino_acid_seq_index[transcript_id]) + print(" ".join(cmd)) + subprocess.run(cmd) + + + +def run_command(cmd)->None: + """Submit command in multiprocessing action + + Args: + cmd (List[str]): command to execute + """ + try: + result = subprocess.run(cmd, check=True, capture_output=True, text=True) + logger.info("Command executed successfully: %s", cmd) + logger.info("Output: %s", result.stdout) + except subprocess.CalledProcessError as e: + logger.error("Command failed: %s", cmd) + logger.error("Error: %s", e.stderr) + +def copy_raw_files(raw_file: Path, destination_file :Path)-> None: + """Copy file in a new destination + + Args: + raw_file (Path): File to copy + destination_file (Path): New destination + """ + if raw_file.exists(): + try: + shutil.copy(raw_file, destination_file) + logger.info("Copied %s to %s", raw_file, destination_file) + except Exception as e: + logger.error("Failed to copy %s to %s: %s", raw_file, destination_file, e) + else: + logger.info("No file found at %s, skipping", raw_file) + + + + + + + + + + + + + + + + + + +def parse_args(): + """Parse command line arguments.""" + parser = argparse.ArgumentParser(description="Genblast arguments") + parser.add_argument( + "--masked_genome_file", required=True, help="Masked genome file path" + ) + parser.add_argument("--output_dir", required=True, help="Output directory path") + parser.add_argument("--protein_file", required=True, help="Path for the protein dataset") + parser.add_argument( + "--genblast_timeout_secs", type=int, default=10800, help="Genblast timeout period" + ) + parser.add_argument( + "--max_intron_length", type=int, required=True, help="Maximum intron length" + ) + parser.add_argument( + "--genblast_bin", + default="genblast", + help="Genblast executable path", + ) + parser.add_argument( + "--convert2blastmask_bin", + default="convert2blastmask", + help="convert2blastmask executable path", + ) + parser.add_argument( + "--makeblastdb_bin", + default="makeblastdb", + help="makeblastdb executable path", + ) + parser.add_argument("--num_threads", type=int, default=1, help="Number of threads") + parser.add_argument( + "--protein_set", + required=True, + choices=["uniprot", "orthodb"], + help="Protein set [uniprot, orthodb]", + ) + return parser.parse_args() - gtf_out.close() - cdna_out.close() - amino_acid_out.close() +def main(): + """Genblast's entry-point.""" + args = parse_args() + log_file_path = create_dir(args.output_dir, "log") / "genblast.log" + loginipath = Path(__file__).parents[6] / "conf" / "logging.conf" -def multiprocess_finalise_geneset(cmd): + logging.config.fileConfig( + loginipath, + defaults={"logfilename": str(log_file_path)}, + disable_existing_loggers=False, + ) - print(" ".join(cmd)) - subprocess.run(cmd) + run_genblast( + Path(args.masked_genome_file), + Path(args.output_dir), + Path(args.protein_file), + args.max_intron_length, + args.genblast_timeout_secs, + Path(args.genblast_bin), + Path(args.convert2blastmask_bin), + Path(args.makeblastdb_bin), + args.num_threads, + args.protein_set, + ) +if __name__ == "__main__": + main() diff --git a/src/python/ensembl/tools/anno/finalise_genset/rnasamba.py b/src/python/ensembl/tools/anno/finalise_genset/rnasamba.py new file mode 100644 index 0000000..1b06f75 --- /dev/null +++ b/src/python/ensembl/tools/anno/finalise_genset/rnasamba.py @@ -0,0 +1,59 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +check_file(rnasamba_output_path) +rnasamba_output_path = validation_dir / "rnasamba.tsv.txt" +rnasamba_volume = f"{validation_dir}/:/app:rw" + rnasamba_cmd = [ + "singularity", + "exec", + "--bind", + str(rnasamba_volume), + str(rnasamba_bin), + "rnasamba", + "classify", + str(rnasamba_output_path), + str(cdna_file), + str(rnasamba_weights), + ] + logger.info(" ".join(rnasamba_cmd)) + subprocess.run(rnasamba_cmd, check=True) + + + def read_rnasamba_results(file_path): + + results = [] + + file_in = open(file_path) + line = file_in.readline() + while line: + line = line.rstrip() + match = re.search(r"^sequence_name", line) + if match: + line = file_in.readline() + continue + + eles = line.split("\t") + if not len(eles) == 3: + line = file_in.readline() + continue + + transcript_id = eles[0] + coding_probability = eles[1] + coding_potential = eles[2] + results.append([transcript_id, coding_probability, coding_potential]) + line = file_in.readline() + file_in.close() + + return results \ No newline at end of file