moved split protein in utils
ens-ftricomi committed Dec 5, 2024
1 parent 042bc4b commit 7ee2b59
Showing 2 changed files with 98 additions and 65 deletions.
59 changes: 5 additions & 54 deletions src/python/ensembl/tools/anno/protein_annotation/genblast.py
@@ -38,18 +38,17 @@
import multiprocessing
import os
from pathlib import Path
import random
import re
import shutil
import signal
import subprocess
from typing import List
import argparse

from ensembl.tools.anno.utils._utils import (
check_exe,
create_dir,
check_gtf_content,
split_protein_file
)

logger = logging.getLogger(__name__)
@@ -65,7 +64,7 @@ def run_genblast(#pylint:disable=dangerous-default-value
convert2blastmask_bin: Path = Path("convert2blastmask"),
makeblastdb_bin: Path = Path("makeblastdb"),
num_threads: int = 1,
protein_set: str = ["uniprot", "orthodb"],
protein_set: str = "uniprot",
) -> None:
"""
@@ -102,6 +101,8 @@ def run_genblast(#pylint:disable=dangerous-default-value
check_exe(genblast_bin)
check_exe(convert2blastmask_bin)
check_exe(makeblastdb_bin)
if protein_set not in {"uniprot", "orthodb"}:
raise ValueError("protein_set must be either 'uniprot' or 'orthodb'")
if protein_set == "uniprot":
genblast_dir = create_dir(output_dir, "uniprot_output")
elif protein_set == "orthodb":
@@ -128,7 +129,7 @@ def run_genblast(#pylint:disable=dangerous-default-value
else:
_run_convert2blastmask(convert2blastmask_bin, masked_genome, asnb_file)
_run_makeblastdb(makeblastdb_bin, masked_genome, asnb_file)
batched_protein_files = _split_protein_file(
batched_protein_files = split_protein_file(
protein_dataset, genblast_dir, num_threads
)
pool = multiprocessing.Pool(num_threads) # pylint:disable=consider-using-with
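A minimal sketch of how the batch files returned by split_protein_file might be dispatched over the pool. The worker name genblast_worker and the file paths are hypothetical; the actual per-batch genBlast invocation is not shown in this hunk.

import multiprocessing
from pathlib import Path

from ensembl.tools.anno.utils._utils import split_protein_file

def genblast_worker(batch_file: Path) -> None:
    # Hypothetical stand-in for the real per-batch genBlast call.
    print(f"would run genBlast on {batch_file}")

if __name__ == "__main__":
    # Split the dataset into batches of four sequences, then process in parallel.
    batches = split_protein_file(Path("proteins.fa"), Path("genblast_output"), batch_size=4)
    with multiprocessing.Pool(4) as pool:
        pool.map(genblast_worker, batches)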
@@ -286,56 +287,6 @@ def _generate_genblast_gtf(genblast_dir: Path) -> None:
path.unlink()


def _split_protein_file(
protein_dataset: Path, output_dir: Path, batch_size: int = 20
) -> List:
"""
    Split the protein dataset file into batch files of batch_size sequences each,
    stored across 10 output bin directories.
    protein_dataset : Path of the protein dataset.
    output_dir : Output directory path.
    batch_size : Number of sequences per batch; it should equal the number of threads
    so that the batch files can be processed in parallel.
"""
batched_protein_files = []

for i in range(0, 10):
create_dir(output_dir, (f"bin_{i}"))
with open(protein_dataset,"r", encoding="utf8") as file_in:
seq_count = 0
batch_count = 0
current_record = ""
initial_seq = True
for line in file_in:
match = re.search(r">(.+)$", line)
            # Header line and not the first sequence: once batch_size sequences
            # have been stored, write them to a new batch file and reset current_record
if match and not initial_seq and seq_count % batch_size == 0:
bin_num = random.randint(0, 9)
batch_file = output_dir / f"bin_{bin_num}" / f"{batch_count}.fa"
with batch_file.open("w+") as file_out:
file_out.write(current_record)
batch_count += 1
seq_count += 1
current_record = line
batched_protein_files.append(batch_file)
            # Header line of the first sequence, or within a batch that is not yet full
elif match:
current_record += line
initial_seq = False
seq_count += 1
            # Sequence line: append to the current record
else:
current_record += line

if current_record:
bin_num = random.randint(0, 9)
batch_file = output_dir / f"bin_{bin_num}" / f"{batch_count}.fa"
with batch_file.open("w+") as file_out:
file_out.write(current_record)
batched_protein_files.append(batch_file)
return batched_protein_files


def _run_convert2blastmask(
convert2blastmask_bin: Path, masked_genome: Path, asnb_file: Path
104 changes: 93 additions & 11 deletions src/python/ensembl/tools/anno/utils/_utils.py
@@ -21,6 +21,7 @@
import os
from os import PathLike
from pathlib import Path
import random
import re
import subprocess
import shutil
@@ -81,8 +82,8 @@ def check_gtf_content(gtf_file: PathLike, content_obj: str) -> int:
Check number of transcript lines in the GTF
Arg:
        gtf_file: GTF file path
        content_obj: Object to detect and count in the GTF, i.e. transcript, repeat
    Return: Number of occurrences
"""
@@ -403,6 +404,95 @@ def check_file(file_path: Path) -> None:
if not file_path.is_file():
raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), file_path)

def fasta_to_dict(fasta_lines: List[str]) -> Dict[str, str]:
"""
    Store FASTA records in a dictionary mapping header to sequence.
    fasta_lines: List of lines read from a FASTA file.
"""
index = {}
header = None
seq = []

for line in fasta_lines:
line = line.strip()
if line.startswith(">"):
if header:
index[header] = ''.join(seq)
header_match = re.match(r">(.+)", line)
if header_match:
header = header_match.group(1)
seq = []
else:
raise ValueError(f"Invalid FASTA header line: {line}")
else:
seq.append(line)

if header:
index[header] = ''.join(seq)

return index
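A quick check of the parser's behaviour as written: the leading '>' is stripped from headers and multi-line sequences are concatenated.

records = [
    ">seq1 example protein",
    "MKTAYIAKQR",
    "QISFVKSHFS",
    ">seq2",
    "MESILDLQEL",
]
index = fasta_to_dict(records)
assert index["seq1 example protein"] == "MKTAYIAKQRQISFVKSHFS"
assert index["seq2"] == "MESILDLQEL"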
"""
def fasta_to_dict(fasta_lines:List[str]):
index = {}
for line in fasta_lines:
#it = iter(fasta_lines)
#for header in it:
match = re.search(r">(.+)\n$", line)
header = match.group(1)
seq = next(line)
index[header] = seq
return index
"""

def split_protein_file(
protein_dataset: Path, output_dir: Path, batch_size: int = 20
) -> List:
"""
    Split the protein dataset file into batch files of batch_size sequences each,
    stored across 10 output bin directories.
    protein_dataset : Path of the protein dataset.
    output_dir : Output directory path.
    batch_size : Number of sequences per batch; it should equal the number of threads
    so that the batch files can be processed in parallel.
"""
batched_protein_files = []

for i in range(0, 10):
create_dir(output_dir, (f"bin_{i}"))
with open(protein_dataset,"r", encoding="utf8") as file_in:
seq_count = 0
batch_count = 0
current_record = ""
initial_seq = True
for line in file_in:
match = re.search(r">(.+)$", line)
            # Header line and not the first sequence: once batch_size sequences
            # have been stored, write them to a new batch file and reset current_record
if match and not initial_seq and seq_count % batch_size == 0:
bin_num = random.randint(0, 9)
batch_file = output_dir / f"bin_{bin_num}" / f"{batch_count}.fa"
with batch_file.open("w+") as file_out:
file_out.write(current_record)
batch_count += 1
seq_count += 1
current_record = line
batched_protein_files.append(batch_file)
            # Header line of the first sequence, or within a batch that is not yet full
elif match:
current_record += line
initial_seq = False
seq_count += 1
            # Sequence line: append to the current record
else:
current_record += line

if current_record:
bin_num = random.randint(0, 9)
batch_file = output_dir / f"bin_{bin_num}" / f"{batch_count}.fa"
with batch_file.open("w+") as file_out:
file_out.write(current_record)
batched_protein_files.append(batch_file)
return batched_protein_files
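A short sketch of the helper in isolation, re-reading the batches with fasta_to_dict to confirm that no sequence is lost in the split; proteins.fa and the batches directory are illustrative.

from pathlib import Path

from ensembl.tools.anno.utils._utils import fasta_to_dict, split_protein_file

batch_files = split_protein_file(Path("proteins.fa"), Path("batches"), batch_size=2)
total = 0
for batch in batch_files:
    # Each batch file is itself valid FASTA, so the new parser can read it back.
    with batch.open() as handle:
        total += len(fasta_to_dict(handle.readlines()))
print(f"{total} sequences across {len(batch_files)} batch files")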

def load_results_to_ensembl_db(
main_script_dir,
@@ -1222,15 +1312,7 @@ def read_gtf_genes(gtf_file):
return gtf_genes


def fasta_to_dict(fasta_list):
index = {}
it = iter(fasta_list)
for header in it:
match = re.search(r">(.+)\n$", header)
header = match.group(1)
seq = next(it)
index[header] = seq
return index



# def merge_gtf_files(file_paths,id_label):
