Skip to content

Commit

Permalink
first splitting|
Browse files Browse the repository at this point in the history
  • Loading branch information
ens-ftricomi committed Dec 5, 2024
1 parent 7ee2b59 commit dab7a14
Show file tree
Hide file tree
Showing 4 changed files with 603 additions and 441 deletions.
73 changes: 73 additions & 0 deletions src/python/ensembl/tools/anno/finalise_genset/cpc2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
# See the NOTICE file distributed with this work for additional information
# regarding copyright ownership.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
check_file(cpc2_output_path)
cpc2_output_path = validation_dir / "cpc2.tsv"

cpc2_volume = f"{validation_dir}/:/app:rw"
cpc2_cmd = [
"singularity",
"exec",
"--bind",
str(cpc2_volume),
str(cpc2_bin),
"python3",
"/CPC2_standalone-1.0.1/bin/CPC2.py",
"-i",
str(cdna_file),
"--ORF",
"-o",
str(cpc2_output_path),
]
logger.info(" ".join(cpc2_cmd))
subprocess.run(cpc2_cmd, check=True)
cpc2_output_path = f"{str(cpc2_output_path)}.txt"


def read_cpc2_results(file_path):

results = []

file_in = open(file_path)
line = file_in.readline()
while line:
line = line.rstrip()
match = re.search(r"^#ID", line)
if match:
line = file_in.readline()
continue

eles = line.split("\t")
if not len(eles) == 9:
line = file_in.readline()
continue

transcript_id = eles[0]
transcript_length = eles[1]
peptide_length = eles[2]
coding_probability = eles[7]
coding_potential = eles[8]
results.append(
[
transcript_id,
coding_probability,
coding_potential,
transcript_length,
peptide_length,
]
)
line = file_in.readline()
file_in.close()

return results
101 changes: 101 additions & 0 deletions src/python/ensembl/tools/anno/finalise_genset/diamond.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
# See the NOTICE file distributed with this work for additional information
# regarding copyright ownership.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

logger.info("diamond validation")
diamond_results = None
if diamond_validation_db is not None:
diamond_output_dir = create_dir(validation_dir, "diamond_output")
diamond_validation(
diamond_validation_db,
amino_acid_file,
diamond_output_dir,
num_threads,
)
diamond_results = read_diamond_results(diamond_output_dir)


def diamond_validation(
diamond_validation_db: Path,
amino_acid_file: Path,
diamond_output_dir: Path,
num_threads: int
)-> None:

batched_protein_files = split_protein_file(amino_acid_file, diamond_output_dir, 100)

pool = multiprocessing.Pool(int(num_threads)) # pylint: disable=consider-using-with
for batched_protein_file in batched_protein_files:
pool.apply_async(
multiprocess_diamond,
args=(
batched_protein_file,
diamond_output_dir,
diamond_validation_db,
),
)
pool.close()
pool.join()


def multiprocess_diamond(
batched_protein_file,
diamond_output_dir,
diamond_validation_db,
)->None:

#batch_num = os.path.splitext(batched_protein_file)[0]
#batch_dir = os.path.dirname(batched_protein_file)
diamond_output_file = f"{batched_protein_file}.dmdout"
logger.info("Running diamond on %s :", batched_protein_file)

diamond_cmd = [
"diamond",
"blastp",
"--query",
batched_protein_file,
"--db",
diamond_validation_db,
"--out",
diamond_output_file,
]

logger.info(" ".join(diamond_cmd))
subprocess.run(diamond_cmd)
subprocess.run(["mv", diamond_output_file, diamond_output_dir])


def read_diamond_results(diamond_output_dir):

results = []
diamond_files = glob.glob(diamond_output_dir + "/*.dmdout")
for file_path in diamond_files:
file_in = open(file_path)
line = file_in.readline()
while line:
line = line.rstrip()

eles = line.split("\t")
if not len(eles) == 12:
line = file_in.readline()
continue

transcript_id = eles[0]
e_value = eles[10]
results.append([transcript_id, e_value])
line = file_in.readline()
file_in.close()

return results

Loading

0 comments on commit dab7a14

Please sign in to comment.