forked from KoslickiLab/YACHT
-
Notifications
You must be signed in to change notification settings - Fork 0
/
run_YACHT.py
150 lines (133 loc) · 8.44 KB
/
run_YACHT.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
#!/usr/bin/env python
import os, sys
import numpy as np
import pandas as pd
from pathlib import Path
import srcs.hypothesis_recovery_src as hr
from scipy.sparse import load_npz
import argparse
import srcs.utils as utils
import json
import warnings
import zipfile
from pathlib import Path
warnings.filterwarnings("ignore")
from tqdm import tqdm
from loguru import logger
logger.remove()
logger.add(sys.stdout, format="{time:YYYY-MM-DD HH:mm:ss} - {level} - {message}", level="INFO")
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="This script estimates the abundance of microorganisms from a "
"reference database matrix and metagenomic sample.",
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--json', type=str, help='Path to a json file generated by make_training_data_from_sketches.py.', required=True)
parser.add_argument('--sample_file', help='Metagenomic sample in .sig.zip format', required=True)
parser.add_argument('--significance', type=float, help='Minimum probability of individual true negative.',
required=False, default=0.99)
parser.add_argument('--num_threads', type=int, help='Number of threads to use for parallelization.', required=False, default=16)
parser.add_argument('--keep_raw', action='store_true', help='Keep raw results in output file.')
parser.add_argument('--show_all', action='store_true', help='Show all organisms (no matter if present) in output file.')
parser.add_argument('--min_coverage_list', nargs="+", type=float, help='A list of percentages of unique k-mers covered by reads in the sample. '
'Each value should be between 0 and 1, with 0 being the most sensitive (and least '
'precise) and 1 being the most precise (and least sensitive).',
required=False, default=[1, 0.5, 0.1, 0.05, 0.01])
parser.add_argument('--out', type=str, help='path to output excel file', required=False, default=os.path.join(os.getcwd(), 'result.xlsx'))
# parse the arguments
args = parser.parse_args()
json_file_path = str(Path(args.json).absolute()) # path to json file
sample_file = str(Path(args.sample_file).absolute()) # location of sample.sig file
significance = args.significance # Minimum probability of individual true negative.
num_threads = args.num_threads # Number of threads to use for parallelization.
keep_raw = args.keep_raw # Keep raw results in output file.
show_all = args.show_all # Show all organisms (no matter if present) in output file.
min_coverage_list = args.min_coverage_list # a list of percentages of unique k-mers covered by reads in the sample.
out = str(Path(args.out).absolute()) # full path to output excel file
outdir = os.path.dirname(out) # path to output directory
out_filename = os.path.basename(out) # output filename
# check if the output filename is valid
if os.path.splitext(out_filename)[1] != '.xlsx':
raise ValueError(f'Output filename {out} is not a valid excel file. Please use .xlsx as the extension.')
# check if the json file exists
utils.check_file_existence(json_file_path, f'Config file {json_file_path} does not exist. '
f'Please run make_training_data_from_sketches.py first.')
# load the config file, ksize, and ani_thresh
config = json.load(open(json_file_path, 'r'))
manifest_file_path = config['manifest_file_path']
path_to_genome_temp_dir = config['intermediate_files_dir']
scale = config['scale']
ksize = config['ksize']
ani_thresh = config['ani_thresh']
# Make sure the output can be written to
if not os.access(outdir, os.W_OK):
raise FileNotFoundError(f"Cannot write to the location: {outdir}.")
# check if min_coverage is between 0 and 1
for x in min_coverage_list:
if not (0 <= x <= 1):
raise ValueError(f'One of values in the min_coverage_list you provided {x} is not between 0 and 1. Please check your input.')
# make sure all these files exist
utils.check_file_existence(manifest_file_path, f'The manifest file {manifest_file_path} '
f'does not exist. Please check if you are using the correct json file as input.')
# load the training data
logger.info('Loading the manifest file generated from the training data.')
manifest = pd.read_csv(manifest_file_path, sep='\t', header=0)
# check if there is a manifest in the sample sig file
with zipfile.ZipFile(sample_file, 'r') as zip_file:
if 'SOURMASH-MANIFEST.csv' not in zip_file.namelist():
raise FileNotFoundError(f'The input file {sample_file} appears to be missing a manifest associated with it. '
f'Try running: sourmash sig merge {sample_file} -o <new signature with the manifest present>. '
f'And then run YACHT using the output of that command.')
# load sample signature and its signature info
logger.info('Loading sample signature and its signature info.')
try:
sample_sig = utils.load_signature_with_ksize(sample_file, ksize)
except ValueError:
raise ValueError(f'Expected exactly one signature with ksize {ksize} in {sample_file}, found {len(sample_file)}. '
f'Likely you will need to do something like: sourmash sig merge {sample_file} -o <new signature with just one sketch in it>.')
sample_sig_info = utils.get_info_from_single_sig(sample_file, ksize)
# add sample signature info to the manifest
manifest['num_exclusive_kmers_in_sample_sketch'] = sample_sig_info[3]
manifest['num_total_kmers_in_sample_sketch'] = utils.get_num_kmers(sample_sig_info[2], sample_sig_info[3], sample_sig_info[4], scale=False)
manifest['sample_scale_factor'] = sample_sig_info[4]
manifest['min_coverage'] = 1.0
# check that the sample scale factor is the same as the genome scale factor for all organisms
if scale != sample_sig_info[4]:
raise ValueError(f'Sample scale factor does not equal genome scale factor. Please check your input.')
# compute hypothesis recovery
logger.info('Computing hypothesis recovery.')
sample_info_set = (sample_file, sample_sig)
min_coverage_list = list(set(min_coverage_list))
min_coverage_list.sort(reverse=True)
has_raw = False
if 1.0 not in min_coverage_list:
min_coverage_list = [1.0] + min_coverage_list
else:
has_raw = True
manifest_list = hr.hypothesis_recovery(manifest, sample_info_set, path_to_genome_temp_dir, min_coverage_list, scale, ksize, significance, ani_thresh, num_threads)
# remove unnecessary columns
remove_cols = ['md5sum', 'sample_scale_factor']
temp_manifest_list = []
for temp_manifest in manifest_list:
temp_manifest = temp_manifest[[col for col in temp_manifest.columns if col not in remove_cols]]
temp_manifest.rename(columns={'genome_scale_factor': 'scale_factor'}, inplace=True)
temp_manifest_list += [temp_manifest]
manifest_list = temp_manifest_list
# save the results into Excel file
logger.info(f'Saving results to {outdir}.')
# save the results with different min_coverage
with pd.ExcelWriter(out, engine='openpyxl', mode='w') as writer:
# save the raw results (i.e., min_coverage=1.0)
if keep_raw:
temp_mainifest = manifest_list[0].copy()
temp_mainifest.rename(columns={'acceptance_threshold_with_coverage': 'acceptance_threshold_wo_coverage',
'actual_confidence_with_coverage': 'actual_confidence_wo_coverage',
'alt_confidence_mut_rate_with_coverage': 'alt_confidence_mut_rate_wo_coverage'}, inplace=True)
temp_mainifest.to_excel(writer, sheet_name=f'raw_result', index=False)
# save the results with different min_coverage given by the user
if not has_raw:
min_coverage_list = min_coverage_list[1:]
new_manifest_list = manifest_list[1:]
for min_coverage, temp_mainifest in zip(min_coverage_list, manifest_list):
if not show_all:
temp_mainifest = temp_mainifest[temp_mainifest['in_sample_est'] == True]
temp_mainifest.to_excel(writer, sheet_name=f'min_coverage{min_coverage}', index=False)