Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions include/valik/build/index_factory.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,14 +69,15 @@ class index_factory
std::vector<std::string> header_paths = parse_bin_paths(*arguments, "header");
std::string shape_string{};
uint64_t window_size{};
size_t count{};
uint64_t distinct_count{};
uint64_t unique_count{};
uint64_t bin_size{};
entropy_ranking.reserve(header_paths.size());
for (auto && [file_name, bin_number] : seqan3::views::zip(header_paths, std::views::iota(0u)))
{
std::ifstream file_stream{file_name};
file_stream >> shape_string >> window_size >> count >> bin_size;
entropy_map.emplace_back(std::make_pair((size_t) bin_number, (double) count / (double) bin_size));
file_stream >> shape_string >> window_size >> unique_count >> distinct_count >> bin_size;
entropy_map.emplace_back(std::make_pair((size_t) bin_number, (double) unique_count / (double) bin_size));
}

std::ranges::sort(entropy_map.begin(), entropy_map.end(), [](const std::pair<size_t, double> &a, const std::pair<size_t, double> &b)
Expand Down
4 changes: 2 additions & 2 deletions include/valik/shared.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -93,8 +93,8 @@ struct build_arguments final : public split_arguments
bool manual_parameters{false};
bool input_is_minimiser{false};

uint8_t kmer_count_min_cutoff{2};
uint8_t kmer_count_max_cutoff{64};
uint8_t kmer_count_min_cutoff{0};
uint8_t kmer_count_max_cutoff{254};
bool use_filesize_dependent_cutoff{false};

std::filesystem::path ref_meta_path{};
Expand Down
4 changes: 2 additions & 2 deletions include/valik/split/metadata.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -559,8 +559,8 @@ struct metadata
segment_stats seg = segments[seg_id];
out_str << seg_id << '\t';
for (size_t ind : seg.seq_vec)
out_str << ind << '\t';
out_str << seg.start << '\t' << seg.len << '\n';
out_str << ind << ';';
out_str << '\t' << seg.start << '\t' << seg.len << '\n';
}

out_str << "$\n";
Expand Down
13 changes: 12 additions & 1 deletion src/argument_parsing/build.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ void init_build_parser(sharg::parser & parser, build_arguments & arguments)
.long_id = "kmer-count-min",
.description = "Only store k-mers with at least (>=) x occurrences. "
"Mutually exclusive with --use-filesize-dependent-cutoff.",
.validator = sharg::arithmetic_range_validator{0, 254}});
.validator = sharg::arithmetic_range_validator{1, 254}});
parser.add_option(arguments.kmer_count_max_cutoff,
sharg::config{.short_id = '\0',
.long_id = "kmer-count-max",
Expand Down Expand Up @@ -224,6 +224,8 @@ void run_build(sharg::parser & parser)
std::cout << "database size " << meta.total_len << "bp\n";
std::cout << "segment count " << meta.seg_count << '\n';
std::cout << "segment len " << std::to_string((uint64_t) std::round(meta.total_len / (double) meta.seg_count)) << "bp\n";
std::cout << "\n-----------Reference segments-----------\n";
std::cout << meta.to_string();
}


Expand Down Expand Up @@ -277,6 +279,15 @@ void run_build(sharg::parser & parser)
arguments.window_size = arguments.kmer_size;
}

if (parser.is_option_set("kmer-count-min") ||
parser.is_option_set("kmer-count-max"))
{
if (arguments.kmer_count_min_cutoff > arguments.kmer_count_max_cutoff)
throw sharg::parser_error{"Set --kmer-count-min <= --kmer-count-max."};
if (!arguments.input_is_minimiser)
seqan3::debug_stream << "[Warning] Arguments --kmer-count-min and --kmer-count-max are only compatible with minimisers (add --fast).\n";
}

if (arguments.kmer_size > arguments.window_size)
{
throw sharg::parser_error{"The k-mer size cannot be bigger than the window size."};
Expand Down
24 changes: 14 additions & 10 deletions src/prepare/compute_bin_size.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -70,22 +70,22 @@ void compute_minimiser(valik::build_arguments const & arguments)
distinct_minimisers.insert(hash);
});

uint64_t count{};

uint64_t distinct_kmer_count{0};
uint64_t unique_kmer_count{0};
{
//!TODO: apply k-mer count cutoffs in metagenome search
//!TODO: apply k-mer count cutoffs in metagenome search and count unique k-mers
std::ofstream outfile{minimiser_file, std::ios::binary};
for (auto && hash : distinct_minimisers)
{
outfile.write(reinterpret_cast<const char *>(&hash), sizeof(hash));
++count;
++distinct_kmer_count;
}
}

{
std::ofstream headerfile{header_file};
headerfile << arguments.shape.to_string() << '\t' << std::to_string(arguments.window_size) << '\t' <<
count << '\t' << seq_size << '\n';
unique_kmer_count << '\t' << distinct_kmer_count << '\t' << seq_size << '\n';
}

std::filesystem::remove(progress_file);
Expand Down Expand Up @@ -145,23 +145,26 @@ void compute_minimiser(valik::build_arguments const & arguments)
minimiser_table[value] = std::min<uint8_t>(254u, minimiser_table[value] + 1);
}

uint64_t count{};
uint64_t distinct_kmer_count{0};
uint64_t unique_kmer_count{0};
{
std::ofstream outfile{minimiser_file, std::ios::binary};
for (auto && [hash, occurrences] : minimiser_table)
{
if (occurrences >= arguments.kmer_count_min_cutoff && occurrences <= arguments.kmer_count_max_cutoff)
{
if (occurrences == 1)
++unique_kmer_count;
outfile.write(reinterpret_cast<const char *>(&hash), sizeof(hash));
++count;
++distinct_kmer_count;
}
}
}

{
std::ofstream headerfile{header_file};
headerfile << arguments.shape.to_string() << '\t' << std::to_string(arguments.window_size) << '\t' <<
count << '\t' << seg.len << '\n';
unique_kmer_count << '\t' << distinct_kmer_count << '\t' << seg.len << '\n';
}

std::filesystem::remove(progress_file);
Expand Down Expand Up @@ -244,12 +247,13 @@ size_t kmer_count_from_minimiser_files(std::vector<std::string> const & minimise

std::string shape_string{};
uint64_t window_size{};
size_t max_count{};
uint64_t max_unique{};
uint64_t max_count{};
uint64_t bin_size{};

biggest_file.replace_extension("header");
std::ifstream file_stream{biggest_file};
file_stream >> shape_string >> window_size >> max_count >> bin_size;
file_stream >> shape_string >> window_size >> max_unique >> max_count >> bin_size;

return max_count;
}
Expand Down
Loading