From 3a6f3138064266675a3507007e137d484373e3a2 Mon Sep 17 00:00:00 2001
From: Jens-Uwe Ulrich <jens-uwe.ulrich@hpi.de>
Date: Tue, 23 May 2023 13:21:57 +0200
Subject: [PATCH] update help messages

---
 .gitignore                 |   2 +
 src/main/main.cpp          | 188 +------------------------------------
 src/main/taxor_build.cpp   | 104 ++------------------
 src/main/taxor_profile.cpp |   8 +-
 4 files changed, 13 insertions(+), 289 deletions(-)

diff --git a/.gitignore b/.gitignore
index 6a42692..630bf7a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -35,6 +35,8 @@
 .settings/
 .cproject
 .project
+.autotools
+.vscode/
 src/.settings/
 src/.cproject
 src/.project
diff --git a/src/main/main.cpp b/src/main/main.cpp
index d81bbef..3f47f2c 100644
--- a/src/main/main.cpp
+++ b/src/main/main.cpp
@@ -34,8 +34,6 @@
 
 using namespace seqan3::literals;
 
-//std::string gtdb_root = "D:\\gtdb_genomes_reps_r202";
-
 double cputime(void)
 {
 	struct rusage r;
@@ -55,7 +53,7 @@ int main(int argc, char const **argv)
 
 	seqan3::argument_parser top_level_parser{"taxor", argc, argv, seqan3::update_notifications::off,
                                              {"build", "search", "profile"}};
-    top_level_parser.info.version = "1.0.0";
+    top_level_parser.info.version = "0.1.0";
 
     try
     {
@@ -87,189 +85,5 @@ int main(int argc, char const **argv)
 
     return error_code;
 
-	// Print debuging of bulk count method to fid error when building ixf
-	/*using sequence_file_t = seqan3::sequence_file_input<hixf::dna4_traits, seqan3::fields<seqan3::field::seq>>;
-	hixf::build_arguments args{};
-	std::vector<std::vector<size_t>> bins{};
-
-	seqan3::interleaved_xor_filter<> ixf{64, 7000000};
-	
-	bool success = false;
-	while (!success)
-	{
-		size_t bin_idx{0};
-		for (int64_t pos = 1160; pos < 1170;++pos)
-		{
-			std::vector<size_t> hashes{};
-			hixf::read_from_temp_hash_file(pos, hashes);
-			if (hashes.empty())
-				continue;
-			//bins.emplace_back(hashes);
-			success = iff.add_bin_elements(bin_idx, hashes);
-			std::cout << bin_idx << "\t" << hashes.size() << std::endl;
-			bin_idx++;
-			if (!success)
-				break;
-		}
-		if (!success)
-		{
-			iff.clear();
-            iff.set_seed();
-			std::cout << "Reset seed after bin " << bin_idx << std::endl;
-		}
-	}
-
-	robin_hood::unordered_flat_set<size_t> read_hashes{};
-	for (auto && [seq] : sequence_file_t{"/media/jens/INTENSO/refseq-viral/2022-03-23_22-07-02/files.renamed/GCF_000839085.1_genomic.fna.gz"})
-                for (auto hash :
-                     seq
-                         | seqan3::views::minimiser_hash(args.shape,
-                                                         seqan3::window_size{args.window_size},
-                                                         seqan3::seed{hixf::adjust_seed(args.shape.count())}))
-                    read_hashes.insert(hash);
-	std::vector<size_t> c{};
-    std::ranges::copy(read_hashes, std::back_inserter(c));
-	typedef seqan3::interleaved_binary_fuse_filter<>::counting_agent_type<u_int64_t> TIFFAgent;
-	TIFFAgent ixf_count_agent = iff.counting_agent< uint64_t >();
-
-	auto result = ixf_count_agent.bulk_count(c);
-    seqan3::debug_stream << "Root result: " << result << "\n";
-	
-	*/
-
-	
-	//create_layout();
-
-	/*hixf::search_arguments search_args{};
-	search_args.index_file = std::filesystem::path{"/media/jens/INTENSO/refseq-viral/2022-03-23_22-07-02/raptor_kmer.hixf"};
-	search_args.query_file = std::filesystem::path{"/media/jens/INTENSO/refseq-viral/2022-03-23_22-07-02/files.renamed/GCF_000839085.1_genomic.fna.gz"};
-	search_args.out_file = std::filesystem::path{"/media/jens/INTENSO/refseq-viral/2022-03-23_22-07-02/raptor.hixf_search.out"};
-	search_args.compute_syncmer = false;
-	search_args.threshold = 0.2;
-	hixf::search_hixf(search_args);
-*/
-
-	return 1;
-	
-/*
-	
-	
-//	multi_interleaved_xor_filter mixf = load_multi_ixf_index("GTDB_r202.ixf");
-//	std::cout << "Filter loaded successfully" << std::endl;
-	
-	//std::vector<std::vector<Species*>> bins2vec(mixf.count_single_filter());
-
-	//bins_to_species(mixf, bins2vec);
-	
-
-	double ratio_sum = 0.0;
-	uint64_t nr_reads = 0;
-	std::vector<double> sensitivity(mixf.species_vector.size());
-	std::fill(sensitivity.begin(), sensitivity.end(), 0.0);
-	std::vector<uint64_t> classified(mixf.species_vector.size());
-	std::fill(classified.begin(), classified.end(), 0);
-	int count = 0;
-	for ( const auto& [seq, header, qual] : fin)
-	{
-
-		if (seq.size() < 1000)
-			continue;
-
-		// rhv holds hash vectors for forward and reverse complement
-		std::vector<size_t> rhv = hashing::seq_to_syncmers(kmer_size, seq, sync_size, t_syncmer);
-		uint64_t strobe_nr = rhv.size();
-//		TInterval ci = calculateCI(0.1, kmer_size, strobe_nr, 0.95);
-//		uint64_t threshold = strobe_nr - ci.second;
-		std::vector<uint64_t> max_found(mixf.species_vector.size());
-		std::fill(max_found.begin(), max_found.end(), 0);
-//		std::cout << rhv[0].first.size() << "\t" << seq.size() << std::endl;
-		
-			std::vector<TIXFAgent::counting_vector> count_vectors = mixf.bulk_count(rhv);
-
-			std::vector<uint64_t> result(mixf.species_vector.size());
-			for (uint64_t i = 0; i < result.size(); ++i)
-			{
-				result[i] = 0;
-				if (mixf.species_vector[i].filter_index == UINT16_MAX)
-					continue;
-				for (uint64_t spec_bin = mixf.species_vector[i].first_bin; spec_bin <= mixf.species_vector[i].last_bin; ++spec_bin)
-				{
-					result[i] += count_vectors[mixf.species_vector[i].filter_index][spec_bin];
-					if (i == 355)
-					{
-						std::cout << spec_bin << "\t" << count_vectors[mixf.species_vector[i].filter_index][spec_bin] << "\t" << strobe_nr << std::endl;
-					}
-				}
-			}
-			
-			// find maximum matches between forward and reverse complement matches
-			for (int i = 0; i < max_found.size(); ++i)
-			{
-				if (result[i] > max_found[i])
-					max_found[i] = result[i];
-			}
-		
-
-		//break;
-
-		double max_ratio = 0.0;
-
-		std::vector<std::pair<uint64_t, double>> potential_indexes{};
-		for (int i = 0; i < max_found.size(); ++i)
-		{
-			
-			double ratio = (double) max_found[i] / (double) strobe_nr;
-			sensitivity[i] += ratio;
-
-			// Why does 0.05 work so well?
-			if ( ratio > 0.1)
-			{
-
-				if (ratio > max_ratio)
-					max_ratio = ratio;
-
-				potential_indexes.emplace_back(std::make_pair(i, ratio));
-			}
-		}
-
-		int spec_count = 0;
-		for (const auto & p : potential_indexes)
-		{
-			if (p.second >= max_ratio)
-			{
-				classified[p.first] += 1;
-//				if (++spec_count > 1)
-//					std::cout << header << std::endl;
-
-			}
-		}
-		
-		nr_reads++;
-//		if (nr_reads == 10)
-//			break;
-	}
-
-	for (int i=0; i < sensitivity.size(); ++i)
-	{
-		sensitivity[i] /= (double) nr_reads;
-	}
-
-
-	for (uint64_t i = 0; i < mixf.species_vector.size(); ++i)
-	{
-		
-		
-		if (classified[i] > 1)
-		{
-			std::cout << i << "\t" << mixf.species_vector[i].name << "\t" << classified[i] << "\t" << sensitivity[i] << std::endl;
-		}
-	}
-
-//	seqan3::debug_stream << sensitivity << "\n";
-//	seqan3::debug_stream << classified << "\n";
-	seqan3::debug_stream << nr_reads << "\n";
-
-	return 0;
-	*/
 }
 
diff --git a/src/main/taxor_build.cpp b/src/main/taxor_build.cpp
index 8374002..8d08b87 100644
--- a/src/main/taxor_build.cpp
+++ b/src/main/taxor_build.cpp
@@ -41,26 +41,17 @@ void set_up_subparser_layout(seqan3::argument_parser & parser, taxor::build::con
     parser.info.version = "0.1.0";
     parser.info.author = "Jens-Uwe Ulrich";
     parser.info.email = "jens-uwe.ulrich@hpi.de";
-    parser.info.short_description = "Creates and HIXF index of a given set of fasta files";
+    parser.info.short_description = "Creates an HIXF index of a given set of fasta files";
 
-    parser.info.description.emplace_back("Creates an HIXF index using either kmers, syncmers or FracMinHash sketches");
+    parser.info.description.emplace_back("Creates an HIXF index using either k-mers or syncmers");
 
     parser.add_subsection("Main options:");
     // -----------------------------------------------------------------------------------------------------------------
     parser.add_option(config.input_file_name,
-                      '\0', "input-file", "",
- /*                     "Provide the prefix you used for the output prefix in the chopper count --output-prefix option. "
-                      "If you have different means of estimating the k-mer counts of your input data, make sure that a "
-                      "file [INPUT-PREFIX].count exists. It needs to be tab-separated and consist of two columns: "
-                      "\"[filepath] [tab] [weight/count]\".",*/
+                      '\0', "input-file", "tab-separated-value file containing taxonomy information and reference file names",
                       seqan3::option_spec::required);
-    parser.add_list_item("", "Example file:");
-    parser.add_list_item("", "```");
-    parser.add_list_item("", "GCF_000839185.1 https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/839/185/GCF_000839185.1_ViralProj14174 Cowpox virus    k__Viruses;p__Nucleocytoviricota;c__Pokkesviricetes;o__Chitovirales;f__Poxviridae;g__Orthopoxvirus;s__Cowpox virus");
-    parser.add_list_item("", "GCF_000860085.1 https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/860/085/GCF_000860085.1_ViralProj15241 Vaccinia virus  k__Viruses;p__Nucleocytoviricota;c__Pokkesviricetes;o__Chitovirales;f__Poxviridae;g__Orthopoxvirus;s__Vaccinia virus");
-    parser.add_list_item("", "```");
 
-    parser.add_option(config.input_sequence_folder, '\0', "input-sequence_dir", "directory containing the fasta reference files");
+    parser.add_option(config.input_sequence_folder, '\0', "input-sequence-dir", "directory containing the fasta reference files");
 
     parser.add_option(config.output_file_name, '\0', "output-filename", "A file name for the resulting index.");
 
@@ -74,8 +65,7 @@ void set_up_subparser_layout(seqan3::argument_parser & parser, taxor::build::con
 
     parser.add_option(config.threads,
                       '\0', "threads",
-                      "The number of threads to use. Currently, only merging of sketches is parallelized, so if option "
-                      "--rearrange-user-bins is not set, --threads will have no effect.",
+                      "The number of threads to use.",
                       seqan3::option_spec::standard,
                       seqan3::arithmetic_range_validator{static_cast<size_t>(1), static_cast<size_t>(32)});
 
@@ -83,9 +73,7 @@ void set_up_subparser_layout(seqan3::argument_parser & parser, taxor::build::con
 
     parser.add_flag(config.output_verbose_statistics,
                     '\0', "output-verbose-statistics",
-                    "Enable verbose statistics to be "
-                    "printed to std::cout. If the flag --determine-best-tmax is not set, this flag is ignored "
-                    "and has no effect.",
+                    "Enable verbose statistics to be printed",
                     seqan3::option_spec::hidden);
 
     parser.add_flag(config.debug,
@@ -369,86 +357,6 @@ void build_hixf(taxor::build::configuration const config,
     store_index(args.out_path, index);
 }
 
-uint64_t construct_and_query_ixf(size_t bins, size_t elements_per_bin)
-{
-	
-	
-	//ulrich::interleaved_xor_filter<> ixf(elems);
-	seqan3::interleaved_xor_filter<> ixf(bins, elements_per_bin);
-	std::vector<size_t> elems{};
-	while (true)
-	{
-		bool success = true;
-		for (int e = 0; e < bins ; ++e)
-		{
-
-            //if (e % 7 == 0 || e % 15 == 0)
-            //    continue;
-
-			std::vector<size_t> tmp{};
-			for (size_t i = 0; i < elements_per_bin; ++i)
-			{
-				size_t key = (e*elements_per_bin) + i;
-				tmp.emplace_back(key);
-			}
-			success = ixf.add_bin_elements(e, tmp);
-			if (!success)
-			{
-				ixf.clear();
-				ixf.set_seed();
-				std::cout << e << std::endl;
-				break;
-			}
-			if (e == 2)
-				elems=std::move(tmp);
-		}
-		if (success)
-			break;
-	}
-	
-	//std::cout << "Built IXF in " << ixf_construct.elapsed() << " seconds" << std::endl;
-
-	std::cout << ixf.bin_count() << std::endl;
-	std::cout << (double) ixf.bit_size() / (double) ixf.bin_count() << std::endl;
-	
-	typedef seqan3::interleaved_xor_filter<>::counting_agent_type< uint64_t > TIXFAgent;
-
-	auto agent = ixf.membership_agent();
-    auto & result = agent.bulk_contains(elems[0]);
-    seqan3::debug_stream << elems[0]  << "\t" << result << '\n';
-	TIXFAgent ixf_count_agent = ixf.counting_agent< uint64_t >();
-	//auto result = count_agent.bulk_count(readHs);
-	auto ixf_result = ixf_count_agent.bulk_count(elems);
-	
-	seqan3::debug_stream << ixf_result << "\n";
-/*	double fpr {0.0};
-	for (int i =0; i < ixf_result.size(); ++i)
-	{
-		if (i == 2)
-		{
-			std::cout << ixf_result[i] << "/" << elems.size() << std::endl;
-			return ixf_result[i];
-			continue;
-		}
-		fpr += (double) ixf_result[i] / (double) elements_per_bin;
-	}
-	fpr /= (double) (bins - 1);
-	std::cout << "FPR of the IXF: " << fpr << std::endl;
-	double mbytes = (double) ixf.bit_size() / (double) 8388608;
-	std::cout << "Size of the IXF: " << mbytes << " MBytes" << std::endl;
-	std::cout << "Queried " << elems.size() <<" keys in IXF in " << ixf_query.elapsed() << " seconds" << std::endl;
-	StopClock ixf_store{};
-	ixf_store.start();
-	std::string filter_file = "test.ixf";
-	
-	std::ofstream               os( filter_file, std::ios::binary );
-    cereal::BinaryOutputArchive archive( os );
-    archive( ixf );  
-	ixf_store.stop();
-	std::cout << "Stored IXF in " << ixf_store.elapsed() << " seconds" << std::endl;
-    */
-}
-
 int execute(seqan3::argument_parser & parser)
 {
     taxor::build::configuration config;
diff --git a/src/main/taxor_profile.cpp b/src/main/taxor_profile.cpp
index d2b8f0e..215d63e 100644
--- a/src/main/taxor_profile.cpp
+++ b/src/main/taxor_profile.cpp
@@ -29,14 +29,14 @@ void set_up_subparser_layout(seqan3::argument_parser & parser, taxor::profile::c
                       seqan3::option_spec::required);
 
     parser.add_option(config.report_file, '\0', "cami-report-file", 
-                      "output file reporting genomic abundances in CAMI profiling format"
-                      "This is the relative genome abundance in terms of the genome copy number for the respective TAXID in the overall sample."
+                      "output file reporting genomic abundances in CAMI profiling format. "
+                      "This is the relative genome abundance in terms of the genome copy number for the respective TAXID in the overall sample. "
                       "Note that this is not identical to the relative abundance in terms of assigned base pairs.",
                       seqan3::option_spec::required);
 
     parser.add_option(config.sequence_abundance_file, '\0', "seq-abundance-file", 
-                      "output file reporting sequence abundance in CAMI profiling format (including unclassified reads)"
-                      "This is the relative sequence abundance in terms of sequenced base pairs for the respective TAXID in the overall sample."
+                      "output file reporting sequence abundance in CAMI profiling format (including unclassified reads). "
+                      "This is the relative sequence abundance in terms of sequenced base pairs for the respective TAXID in the overall sample. "
                       "Note that this is not identical to the genomic abundance in terms of genome copy number for the respective TAXID.");
 
     parser.add_option(config.binning_file, '\0', "binning-file",