diff --git a/.gitignore b/.gitignore index 6a42692..630bf7a 100644 --- a/.gitignore +++ b/.gitignore @@ -35,6 +35,8 @@ .settings/ .cproject .project +.autotools +.vscode/ src/.settings/ src/.cproject src/.project diff --git a/src/main/main.cpp b/src/main/main.cpp index d81bbef..3f47f2c 100644 --- a/src/main/main.cpp +++ b/src/main/main.cpp @@ -34,8 +34,6 @@ using namespace seqan3::literals; -//std::string gtdb_root = "D:\\gtdb_genomes_reps_r202"; - double cputime(void) { struct rusage r; @@ -55,7 +53,7 @@ int main(int argc, char const **argv) seqan3::argument_parser top_level_parser{"taxor", argc, argv, seqan3::update_notifications::off, {"build", "search", "profile"}}; - top_level_parser.info.version = "1.0.0"; + top_level_parser.info.version = "0.1.0"; try { @@ -87,189 +85,5 @@ int main(int argc, char const **argv) return error_code; - // Print debuging of bulk count method to fid error when building ixf - /*using sequence_file_t = seqan3::sequence_file_input>; - hixf::build_arguments args{}; - std::vector> bins{}; - - seqan3::interleaved_xor_filter<> ixf{64, 7000000}; - - bool success = false; - while (!success) - { - size_t bin_idx{0}; - for (int64_t pos = 1160; pos < 1170;++pos) - { - std::vector hashes{}; - hixf::read_from_temp_hash_file(pos, hashes); - if (hashes.empty()) - continue; - //bins.emplace_back(hashes); - success = iff.add_bin_elements(bin_idx, hashes); - std::cout << bin_idx << "\t" << hashes.size() << std::endl; - bin_idx++; - if (!success) - break; - } - if (!success) - { - iff.clear(); - iff.set_seed(); - std::cout << "Reset seed after bin " << bin_idx << std::endl; - } - } - - robin_hood::unordered_flat_set read_hashes{}; - for (auto && [seq] : sequence_file_t{"/media/jens/INTENSO/refseq-viral/2022-03-23_22-07-02/files.renamed/GCF_000839085.1_genomic.fna.gz"}) - for (auto hash : - seq - | seqan3::views::minimiser_hash(args.shape, - seqan3::window_size{args.window_size}, - seqan3::seed{hixf::adjust_seed(args.shape.count())})) - read_hashes.insert(hash); - std::vector c{}; - std::ranges::copy(read_hashes, std::back_inserter(c)); - typedef seqan3::interleaved_binary_fuse_filter<>::counting_agent_type TIFFAgent; - TIFFAgent ixf_count_agent = iff.counting_agent< uint64_t >(); - - auto result = ixf_count_agent.bulk_count(c); - seqan3::debug_stream << "Root result: " << result << "\n"; - - */ - - - //create_layout(); - - /*hixf::search_arguments search_args{}; - search_args.index_file = std::filesystem::path{"/media/jens/INTENSO/refseq-viral/2022-03-23_22-07-02/raptor_kmer.hixf"}; - search_args.query_file = std::filesystem::path{"/media/jens/INTENSO/refseq-viral/2022-03-23_22-07-02/files.renamed/GCF_000839085.1_genomic.fna.gz"}; - search_args.out_file = std::filesystem::path{"/media/jens/INTENSO/refseq-viral/2022-03-23_22-07-02/raptor.hixf_search.out"}; - search_args.compute_syncmer = false; - search_args.threshold = 0.2; - hixf::search_hixf(search_args); -*/ - - return 1; - -/* - - -// multi_interleaved_xor_filter mixf = load_multi_ixf_index("GTDB_r202.ixf"); -// std::cout << "Filter loaded successfully" << std::endl; - - //std::vector> bins2vec(mixf.count_single_filter()); - - //bins_to_species(mixf, bins2vec); - - - double ratio_sum = 0.0; - uint64_t nr_reads = 0; - std::vector sensitivity(mixf.species_vector.size()); - std::fill(sensitivity.begin(), sensitivity.end(), 0.0); - std::vector classified(mixf.species_vector.size()); - std::fill(classified.begin(), classified.end(), 0); - int count = 0; - for ( const auto& [seq, header, qual] : fin) - { - - if (seq.size() < 1000) - continue; - - // rhv holds hash vectors for forward and reverse complement - std::vector rhv = hashing::seq_to_syncmers(kmer_size, seq, sync_size, t_syncmer); - uint64_t strobe_nr = rhv.size(); -// TInterval ci = calculateCI(0.1, kmer_size, strobe_nr, 0.95); -// uint64_t threshold = strobe_nr - ci.second; - std::vector max_found(mixf.species_vector.size()); - std::fill(max_found.begin(), max_found.end(), 0); -// std::cout << rhv[0].first.size() << "\t" << seq.size() << std::endl; - - std::vector count_vectors = mixf.bulk_count(rhv); - - std::vector result(mixf.species_vector.size()); - for (uint64_t i = 0; i < result.size(); ++i) - { - result[i] = 0; - if (mixf.species_vector[i].filter_index == UINT16_MAX) - continue; - for (uint64_t spec_bin = mixf.species_vector[i].first_bin; spec_bin <= mixf.species_vector[i].last_bin; ++spec_bin) - { - result[i] += count_vectors[mixf.species_vector[i].filter_index][spec_bin]; - if (i == 355) - { - std::cout << spec_bin << "\t" << count_vectors[mixf.species_vector[i].filter_index][spec_bin] << "\t" << strobe_nr << std::endl; - } - } - } - - // find maximum matches between forward and reverse complement matches - for (int i = 0; i < max_found.size(); ++i) - { - if (result[i] > max_found[i]) - max_found[i] = result[i]; - } - - - //break; - - double max_ratio = 0.0; - - std::vector> potential_indexes{}; - for (int i = 0; i < max_found.size(); ++i) - { - - double ratio = (double) max_found[i] / (double) strobe_nr; - sensitivity[i] += ratio; - - // Why does 0.05 work so well? - if ( ratio > 0.1) - { - - if (ratio > max_ratio) - max_ratio = ratio; - - potential_indexes.emplace_back(std::make_pair(i, ratio)); - } - } - - int spec_count = 0; - for (const auto & p : potential_indexes) - { - if (p.second >= max_ratio) - { - classified[p.first] += 1; -// if (++spec_count > 1) -// std::cout << header << std::endl; - - } - } - - nr_reads++; -// if (nr_reads == 10) -// break; - } - - for (int i=0; i < sensitivity.size(); ++i) - { - sensitivity[i] /= (double) nr_reads; - } - - - for (uint64_t i = 0; i < mixf.species_vector.size(); ++i) - { - - - if (classified[i] > 1) - { - std::cout << i << "\t" << mixf.species_vector[i].name << "\t" << classified[i] << "\t" << sensitivity[i] << std::endl; - } - } - -// seqan3::debug_stream << sensitivity << "\n"; -// seqan3::debug_stream << classified << "\n"; - seqan3::debug_stream << nr_reads << "\n"; - - return 0; - */ } diff --git a/src/main/taxor_build.cpp b/src/main/taxor_build.cpp index 8374002..8d08b87 100644 --- a/src/main/taxor_build.cpp +++ b/src/main/taxor_build.cpp @@ -41,26 +41,17 @@ void set_up_subparser_layout(seqan3::argument_parser & parser, taxor::build::con parser.info.version = "0.1.0"; parser.info.author = "Jens-Uwe Ulrich"; parser.info.email = "jens-uwe.ulrich@hpi.de"; - parser.info.short_description = "Creates and HIXF index of a given set of fasta files"; + parser.info.short_description = "Creates an HIXF index of a given set of fasta files"; - parser.info.description.emplace_back("Creates an HIXF index using either kmers, syncmers or FracMinHash sketches"); + parser.info.description.emplace_back("Creates an HIXF index using either k-mers or syncmers"); parser.add_subsection("Main options:"); // ----------------------------------------------------------------------------------------------------------------- parser.add_option(config.input_file_name, - '\0', "input-file", "", - /* "Provide the prefix you used for the output prefix in the chopper count --output-prefix option. " - "If you have different means of estimating the k-mer counts of your input data, make sure that a " - "file [INPUT-PREFIX].count exists. It needs to be tab-separated and consist of two columns: " - "\"[filepath] [tab] [weight/count]\".",*/ + '\0', "input-file", "tab-separated-value file containing taxonomy information and reference file names", seqan3::option_spec::required); - parser.add_list_item("", "Example file:"); - parser.add_list_item("", "```"); - parser.add_list_item("", "GCF_000839185.1 https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/839/185/GCF_000839185.1_ViralProj14174 Cowpox virus k__Viruses;p__Nucleocytoviricota;c__Pokkesviricetes;o__Chitovirales;f__Poxviridae;g__Orthopoxvirus;s__Cowpox virus"); - parser.add_list_item("", "GCF_000860085.1 https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/860/085/GCF_000860085.1_ViralProj15241 Vaccinia virus k__Viruses;p__Nucleocytoviricota;c__Pokkesviricetes;o__Chitovirales;f__Poxviridae;g__Orthopoxvirus;s__Vaccinia virus"); - parser.add_list_item("", "```"); - parser.add_option(config.input_sequence_folder, '\0', "input-sequence_dir", "directory containing the fasta reference files"); + parser.add_option(config.input_sequence_folder, '\0', "input-sequence-dir", "directory containing the fasta reference files"); parser.add_option(config.output_file_name, '\0', "output-filename", "A file name for the resulting index."); @@ -74,8 +65,7 @@ void set_up_subparser_layout(seqan3::argument_parser & parser, taxor::build::con parser.add_option(config.threads, '\0', "threads", - "The number of threads to use. Currently, only merging of sketches is parallelized, so if option " - "--rearrange-user-bins is not set, --threads will have no effect.", + "The number of threads to use.", seqan3::option_spec::standard, seqan3::arithmetic_range_validator{static_cast(1), static_cast(32)}); @@ -83,9 +73,7 @@ void set_up_subparser_layout(seqan3::argument_parser & parser, taxor::build::con parser.add_flag(config.output_verbose_statistics, '\0', "output-verbose-statistics", - "Enable verbose statistics to be " - "printed to std::cout. If the flag --determine-best-tmax is not set, this flag is ignored " - "and has no effect.", + "Enable verbose statistics to be printed", seqan3::option_spec::hidden); parser.add_flag(config.debug, @@ -369,86 +357,6 @@ void build_hixf(taxor::build::configuration const config, store_index(args.out_path, index); } -uint64_t construct_and_query_ixf(size_t bins, size_t elements_per_bin) -{ - - - //ulrich::interleaved_xor_filter<> ixf(elems); - seqan3::interleaved_xor_filter<> ixf(bins, elements_per_bin); - std::vector elems{}; - while (true) - { - bool success = true; - for (int e = 0; e < bins ; ++e) - { - - //if (e % 7 == 0 || e % 15 == 0) - // continue; - - std::vector tmp{}; - for (size_t i = 0; i < elements_per_bin; ++i) - { - size_t key = (e*elements_per_bin) + i; - tmp.emplace_back(key); - } - success = ixf.add_bin_elements(e, tmp); - if (!success) - { - ixf.clear(); - ixf.set_seed(); - std::cout << e << std::endl; - break; - } - if (e == 2) - elems=std::move(tmp); - } - if (success) - break; - } - - //std::cout << "Built IXF in " << ixf_construct.elapsed() << " seconds" << std::endl; - - std::cout << ixf.bin_count() << std::endl; - std::cout << (double) ixf.bit_size() / (double) ixf.bin_count() << std::endl; - - typedef seqan3::interleaved_xor_filter<>::counting_agent_type< uint64_t > TIXFAgent; - - auto agent = ixf.membership_agent(); - auto & result = agent.bulk_contains(elems[0]); - seqan3::debug_stream << elems[0] << "\t" << result << '\n'; - TIXFAgent ixf_count_agent = ixf.counting_agent< uint64_t >(); - //auto result = count_agent.bulk_count(readHs); - auto ixf_result = ixf_count_agent.bulk_count(elems); - - seqan3::debug_stream << ixf_result << "\n"; -/* double fpr {0.0}; - for (int i =0; i < ixf_result.size(); ++i) - { - if (i == 2) - { - std::cout << ixf_result[i] << "/" << elems.size() << std::endl; - return ixf_result[i]; - continue; - } - fpr += (double) ixf_result[i] / (double) elements_per_bin; - } - fpr /= (double) (bins - 1); - std::cout << "FPR of the IXF: " << fpr << std::endl; - double mbytes = (double) ixf.bit_size() / (double) 8388608; - std::cout << "Size of the IXF: " << mbytes << " MBytes" << std::endl; - std::cout << "Queried " << elems.size() <<" keys in IXF in " << ixf_query.elapsed() << " seconds" << std::endl; - StopClock ixf_store{}; - ixf_store.start(); - std::string filter_file = "test.ixf"; - - std::ofstream os( filter_file, std::ios::binary ); - cereal::BinaryOutputArchive archive( os ); - archive( ixf ); - ixf_store.stop(); - std::cout << "Stored IXF in " << ixf_store.elapsed() << " seconds" << std::endl; - */ -} - int execute(seqan3::argument_parser & parser) { taxor::build::configuration config; diff --git a/src/main/taxor_profile.cpp b/src/main/taxor_profile.cpp index d2b8f0e..215d63e 100644 --- a/src/main/taxor_profile.cpp +++ b/src/main/taxor_profile.cpp @@ -29,14 +29,14 @@ void set_up_subparser_layout(seqan3::argument_parser & parser, taxor::profile::c seqan3::option_spec::required); parser.add_option(config.report_file, '\0', "cami-report-file", - "output file reporting genomic abundances in CAMI profiling format" - "This is the relative genome abundance in terms of the genome copy number for the respective TAXID in the overall sample." + "output file reporting genomic abundances in CAMI profiling format. " + "This is the relative genome abundance in terms of the genome copy number for the respective TAXID in the overall sample. " "Note that this is not identical to the relative abundance in terms of assigned base pairs.", seqan3::option_spec::required); parser.add_option(config.sequence_abundance_file, '\0', "seq-abundance-file", - "output file reporting sequence abundance in CAMI profiling format (including unclassified reads)" - "This is the relative sequence abundance in terms of sequenced base pairs for the respective TAXID in the overall sample." + "output file reporting sequence abundance in CAMI profiling format (including unclassified reads). " + "This is the relative sequence abundance in terms of sequenced base pairs for the respective TAXID in the overall sample. " "Note that this is not identical to the genomic abundance in terms of genome copy number for the respective TAXID."); parser.add_option(config.binning_file, '\0', "binning-file",