Skip to content

Commit

Permalink
Merge pull request #44 from jolespin/devel
Browse files Browse the repository at this point in the history
blacklist eukaryotes
  • Loading branch information
jolespin authored Dec 28, 2023
2 parents 3ed2910 + 2a28599 commit 8a582da
Show file tree
Hide file tree
Showing 3 changed files with 27 additions and 2 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -408,6 +408,7 @@ ________________________________________________________________
<details>
<summary> <b>Daily Change Log:</b> </summary>

* [2023.12.28] - Added `--blacklist` option to `compile_eukaryotic_classifications.py`; `classify-eukaryotic.py` exposes the same option with a default value of `species:uncultured eukaryote` and passes it through.
* [2023.12.28] - Fixed critical error where `classify-eukaryotic.py` was trying to access a deprecated database file from MicrEuk_v2.
* [2023.12.22] - Fixed minor error with `eukaryotic_gene_modeling_wrapper.py` not allowing `Tiara` to run in the backend.
* [2023.12.21] - `GTDB-Tk` changed the name of the archaea summary file, so VEBA was not adding it to the final classification. Fixed this in `classify-prokaryotic.py`.
Expand Down
2 changes: 2 additions & 0 deletions src/classify-eukaryotic.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,7 @@ def get_compile_cmd( input_filepaths, output_filepaths, output_directory, direct
"-s {}".format(input_filepaths[1]),
"--eukaryotic_database {}".format(opts.eukaryotic_database),
"-o {}".format(output_filepaths[0]),
"-b '{}'".format(opts.blacklist),
"--debug",
]
# if opts.clusters:
Expand Down Expand Up @@ -658,6 +659,7 @@ def main(args=None):
parser_consensus.add_argument("--retain_unannotated", type=int, default=1, help = "Consider unannotations (i.e., blank functions) in the scording system [Default: 1]")
parser_consensus.add_argument("--unannotated_weight", type=float, default=0.382, help = "Weight for unannotations (i.e., blank functions) in the scording system? [Default: 0.382]")
parser_consensus.add_argument("--representative_threshold", type=float, default=0.618, help = "Score to consider as representative [Default: 0.618]")
parser_consensus.add_argument("-b", "--blacklist", type=str, default="species:uncultured eukaryote", help="Comma-separated list of [taxon_level]:[blacklisted label]. Use 'NONE' for no black listed taxa. [Default: species:uncultured eukaryote]")

# Options
opts = parser.parse_args()
Expand Down
26 changes: 24 additions & 2 deletions src/scripts/compile_eukaryotic_classifications.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from tqdm import tqdm

__program__ = os.path.split(sys.argv[0])[-1]
__version__ = "2023.12.14"
__version__ = "2023.12.28"


def main(args=None):
Expand All @@ -32,6 +32,8 @@ def main(args=None):
parser.add_argument("--header", type=int, default=1, help="Include header in output {0=No, 1=Yes) [Default: 1]")
parser.add_argument("--debug", action="store_true")
parser.add_argument("--remove_genes_with_missing_values", action="store_true")
parser.add_argument("-b", "--blacklist", type=str, help="Comma-separated list of [taxon_level]:[blacklisted label] (e.g. species:uncultured eukaryote,genus:unclassified)")

# parser.add_argument("--use_original_metaeuk_gene_identifiers", action="store_true")

# Options
Expand Down Expand Up @@ -87,6 +89,22 @@ def main(args=None):
gene_to_target = df_metaeuk["T_acc"]
gene_to_source = gene_to_target.map(lambda id_target: target_to_source.get(id_target,np.nan))

# Blacklist
blacklisted_sources = set()
if opts.blacklist:
print("* The following taxa will be blacklisted (i.e., bitscores --> 0)", file=sys.stderr)
for item in opts.blacklist.split(","):
assert ":" in item
taxon_level, blacklist_label = item.split(":")
assert taxon_level in df_source_taxonomy.columns

sources = list()
for id_source, taxon_value in df_source_taxonomy[taxon_level].items():
if taxon_value == blacklist_label:
sources.append(id_source)
blacklisted_sources.update(sources)
print(" * {}".format(item), sorted(sources), sep="\n", file=sys.stderr)

if opts.scaffolds_to_bins:
# Scaffolds -> Bins
fp = opts.scaffolds_to_bins
Expand Down Expand Up @@ -138,6 +156,10 @@ def main(args=None):
})
df_gene_classifications.index.name = "id_gene"

if blacklisted_sources:
df_gene_classifications["bitscore_before_blacklist"] = df_gene_classifications["bitscore"].copy()
mask = df_gene_classifications["id_source"].map(lambda x: x in blacklisted_sources)
df_gene_classifications.loc[mask, "bitscore"] = 0.0

# df_gene_classifications = pd.concat([
# gene_to_scaffold.to_frame("id_scaffold"),
Expand Down Expand Up @@ -170,7 +192,7 @@ def main(args=None):

df_gene_classifications.to_csv(opts.output, sep="\t", header=bool(opts.header))





Expand Down

0 comments on commit 8a582da

Please sign in to comment.