Merge pull request #44 from jolespin/devel

blacklist eukaryotes
jolespin · Dec 28, 2023 · 8a582da · 8a582da
2 parents 3ed2910 + 2a28599
commit 8a582da
Show file tree

Hide file tree

Showing 3 changed files with 27 additions and 2 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -408,6 +408,7 @@ ________________________________________________________________
 <details>
 	<summary> <b>Daily Change Log:</b> </summary>
 
+* [2023.12.28] - Added `--blacklist` option to `compile_eukaryotic_classifications.py`. `classify-eukaryotic.py` exposes the same option and passes it through with a default value of `species:uncultured eukaryote`.
 * [2023.12.28] - Fixed critical error where `classify-eukaryotic.py` was trying to access a deprecated database file from MicroEuk_v2.
 * [2023.12.22] - Fixed minor error with `eukaryotic_gene_modeling_wrapper.py` not allowing for `Tiara` to run in backend.
 * [2023.12.21] - `GTDB-Tk` changed name of archaea summary file so VEBA was not adding this to final classification. Fixed this in `classify-prokaryotic.py`.

diff --git a/src/classify-eukaryotic.py b/src/classify-eukaryotic.py
@@ -154,6 +154,7 @@ def get_compile_cmd( input_filepaths, output_filepaths, output_directory, direct
         "-s {}".format(input_filepaths[1]),
         "--eukaryotic_database {}".format(opts.eukaryotic_database),
         "-o {}".format(output_filepaths[0]),
+        "-b '{}'".format(opts.blacklist),
         "--debug",
     ]
     # if opts.clusters:
@@ -658,6 +659,7 @@ def main(args=None):
     parser_consensus.add_argument("--retain_unannotated", type=int, default=1, help = "Consider unannotations (i.e., blank functions) in the scording system [Default: 1]")
     parser_consensus.add_argument("--unannotated_weight", type=float, default=0.382, help = "Weight for unannotations (i.e., blank functions) in the scording system? [Default: 0.382]")
     parser_consensus.add_argument("--representative_threshold", type=float, default=0.618, help = "Score to consider as representative [Default: 0.618]")
+    parser_consensus.add_argument("-b", "--blacklist", type=str, default="species:uncultured eukaryote", help="Comma-separated list of [taxon_level]:[blacklisted label]. Use 'NONE' for no black listed taxa. [Default: species:uncultured eukaryote]")
 
     # Options
     opts = parser.parse_args()

diff --git a/src/scripts/compile_eukaryotic_classifications.py b/src/scripts/compile_eukaryotic_classifications.py
@@ -6,7 +6,7 @@
 from tqdm import tqdm 
 
 __program__ = os.path.split(sys.argv[0])[-1]
-__version__ = "2023.12.14"
+__version__ = "2023.12.28"
 
 
 def main(args=None):
@@ -32,6 +32,8 @@ def main(args=None):
     parser.add_argument("--header", type=int, default=1, help="Include header in output {0=No, 1=Yes) [Default: 1]")
     parser.add_argument("--debug", action="store_true")
     parser.add_argument("--remove_genes_with_missing_values", action="store_true")
+    parser.add_argument("-b", "--blacklist", type=str, help="Comma-separated list of [taxon_level]:[blacklisted label] (e.g. species:uncultured eukaryote,genus:unclassified)")
+
     # parser.add_argument("--use_original_metaeuk_gene_identifiers", action="store_true")
 
     # Options
@@ -87,6 +89,22 @@ def main(args=None):
     gene_to_target = df_metaeuk["T_acc"]
     gene_to_source = gene_to_target.map(lambda id_target: target_to_source.get(id_target,np.nan))
 
+    # Blacklist
+    blacklisted_sources = set()
+    if opts.blacklist:
+        print("* The following taxa will be blacklisted (i.e., bitscores --> 0)", file=sys.stderr)
+        for item in opts.blacklist.split(","):
+            assert ":" in item
+            taxon_level, blacklist_label = item.split(":")
+            assert taxon_level in df_source_taxonomy.columns
+
+            sources = list()
+            for id_source, taxon_value in df_source_taxonomy[taxon_level].items():
+                if taxon_value == blacklist_label:
+                    sources.append(id_source)
+            blacklisted_sources.update(sources)
+            print(" * {}".format(item), sorted(sources), sep="\n", file=sys.stderr)
+
     if opts.scaffolds_to_bins:
         # Scaffolds -> Bins
         fp = opts.scaffolds_to_bins
@@ -138,6 +156,10 @@ def main(args=None):
     })
     df_gene_classifications.index.name = "id_gene"
 
+    if blacklisted_sources:
+        df_gene_classifications["bitscore_before_blacklist"] = df_gene_classifications["bitscore"].copy()
+        mask = df_gene_classifications["id_source"].map(lambda x: x in blacklisted_sources)
+        df_gene_classifications.loc[mask, "bitscore"] = 0.0
 
     # df_gene_classifications = pd.concat([
     #     gene_to_scaffold.to_frame("id_scaffold"),
@@ -170,7 +192,7 @@ def main(args=None):
 
     df_gene_classifications.to_csv(opts.output, sep="\t", header=bool(opts.header))
 
-
+