first splitting|

Ensembl · Dec 5, 2024 · dab7a14 · dab7a14
1 parent 7ee2b59
commit dab7a14
Show file tree

Hide file tree

Showing 4 changed files with 603 additions and 441 deletions.
diff --git a/src/python/ensembl/tools/anno/finalise_genset/cpc2.py b/src/python/ensembl/tools/anno/finalise_genset/cpc2.py
@@ -0,0 +1,73 @@
+# See the NOTICE file distributed with this work for additional information
+# regarding copyright ownership.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+check_file(cpc2_output_path)
+cpc2_output_path = validation_dir / "cpc2.tsv"
+
+    cpc2_volume = f"{validation_dir}/:/app:rw"
+    cpc2_cmd = [
+        "singularity",
+        "exec",
+        "--bind",
+        str(cpc2_volume),
+        str(cpc2_bin),
+        "python3",
+        "/CPC2_standalone-1.0.1/bin/CPC2.py",
+        "-i",
+        str(cdna_file),
+        "--ORF",
+        "-o",
+        str(cpc2_output_path),
+    ]
+    logger.info(" ".join(cpc2_cmd))
+    subprocess.run(cpc2_cmd, check=True)
+    cpc2_output_path = f"{str(cpc2_output_path)}.txt"
+
+
+def read_cpc2_results(file_path):
+
+    results = []
+
+    file_in = open(file_path)
+    line = file_in.readline()
+    while line:
+        line = line.rstrip()
+        match = re.search(r"^#ID", line)
+        if match:
+            line = file_in.readline()
+            continue
+
+        eles = line.split("\t")
+        if not len(eles) == 9:
+            line = file_in.readline()
+            continue
+
+        transcript_id = eles[0]
+        transcript_length = eles[1]
+        peptide_length = eles[2]
+        coding_probability = eles[7]
+        coding_potential = eles[8]
+        results.append(
+            [
+                transcript_id,
+                coding_probability,
+                coding_potential,
+                transcript_length,
+                peptide_length,
+            ]
+        )
+        line = file_in.readline()
+    file_in.close()
+
+    return results    
diff --git a/src/python/ensembl/tools/anno/finalise_genset/diamond.py b/src/python/ensembl/tools/anno/finalise_genset/diamond.py
@@ -0,0 +1,101 @@
+# See the NOTICE file distributed with this work for additional information
+# regarding copyright ownership.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+ logger.info("diamond validation")
+    diamond_results = None
+    if diamond_validation_db is not None:
+        diamond_output_dir = create_dir(validation_dir, "diamond_output")
+        diamond_validation(
+            diamond_validation_db,
+            amino_acid_file,
+            diamond_output_dir,
+            num_threads,
+        )
+        diamond_results = read_diamond_results(diamond_output_dir)
+
+
+        def diamond_validation(
+    diamond_validation_db: Path,
+    amino_acid_file: Path, 
+    diamond_output_dir: Path, 
+    num_threads: int
+)-> None:
+
+    batched_protein_files = split_protein_file(amino_acid_file, diamond_output_dir, 100)
+
+    pool = multiprocessing.Pool(int(num_threads)) # pylint: disable=consider-using-with
+    for batched_protein_file in batched_protein_files:
+        pool.apply_async(
+            multiprocess_diamond,
+            args=(
+                batched_protein_file,
+                diamond_output_dir,
+                diamond_validation_db,
+            ),
+        )
+    pool.close()
+    pool.join()
+
+
+def multiprocess_diamond(
+    batched_protein_file,
+    diamond_output_dir,
+    diamond_validation_db,
+)->None:
+
+    #batch_num = os.path.splitext(batched_protein_file)[0]
+    #batch_dir = os.path.dirname(batched_protein_file)
+    diamond_output_file = f"{batched_protein_file}.dmdout"
+    logger.info("Running diamond on %s :", batched_protein_file)
+
+    diamond_cmd = [
+        "diamond",
+        "blastp",
+        "--query",
+        batched_protein_file,
+        "--db",
+        diamond_validation_db,
+        "--out",
+        diamond_output_file,
+    ]
+
+    logger.info(" ".join(diamond_cmd))
+    subprocess.run(diamond_cmd)
+    subprocess.run(["mv", diamond_output_file, diamond_output_dir])
+
+
+def read_diamond_results(diamond_output_dir):
+
+    results = []
+    diamond_files = glob.glob(diamond_output_dir + "/*.dmdout")
+    for file_path in diamond_files:
+        file_in = open(file_path)
+        line = file_in.readline()
+        while line:
+            line = line.rstrip()
+
+            eles = line.split("\t")
+            if not len(eles) == 12:
+                line = file_in.readline()
+                continue
+
+            transcript_id = eles[0]
+            e_value = eles[10]
+            results.append([transcript_id, e_value])
+            line = file_in.readline()
+    file_in.close()
+
+    return results
+