From dbc0cb8f53925554c6ff87a5eb9cc272d12de1fd Mon Sep 17 00:00:00 2001 From: Harun Mustafa Date: Tue, 8 Oct 2024 11:51:10 +0200 Subject: [PATCH] indicate colour breakpoints to support monotig graphs with sshash --- metagraph/src/cli/build.cpp | 6 +- metagraph/src/cli/config/config.cpp | 3 + metagraph/src/cli/config/config.hpp | 1 + metagraph/src/cli/query.cpp | 5 +- .../src/graph/alignment/annotation_buffer.cpp | 1 + metagraph/src/graph/annotated_dbg.cpp | 101 ++++++++++---- metagraph/src/graph/annotated_dbg.hpp | 7 +- .../representation/base/sequence_graph.hpp | 2 + .../graph/representation/canonical_dbg.hpp | 2 + .../graph/representation/hash/dbg_sshash.cpp | 127 +++++++++++++++++- .../graph/representation/hash/dbg_sshash.hpp | 12 +- .../tests/annotation/test_annotated_dbg.cpp | 22 +-- .../annotation/test_annotated_dbg_helpers.cpp | 1 + .../tests/graph/all/test_dbg_helpers.cpp | 72 ++++++++++ .../tests/graph/all/test_dbg_helpers.hpp | 8 ++ metagraph/tests/graph/test_canonical_dbg.cpp | 3 +- metagraph/tests/graph/test_dbg_canonical.cpp | 3 +- 17 files changed, 335 insertions(+), 41 deletions(-) diff --git a/metagraph/src/cli/build.cpp b/metagraph/src/cli/build.cpp index d58cbde119..6a193d7214 100644 --- a/metagraph/src/cli/build.cpp +++ b/metagraph/src/cli/build.cpp @@ -251,12 +251,16 @@ int build_graph(Config *config) { } } else if (config->graph_type == Config::GraphType::SSHASH && !config->dynamic) { - graph.reset(new DBGSSHash(files.at(0), config->k, config->graph_mode, config->num_chars)); if (files.size() > 1) { logger->error("DBGSSHash does not support multiple input files."); exit(1); } + graph.reset(new DBGSSHash(files.at(0), + config->k, + config->graph_mode, + config->num_chars, + config->is_monochromatic)); } else { //slower method switch (config->graph_type) { diff --git a/metagraph/src/cli/config/config.cpp b/metagraph/src/cli/config/config.cpp index 7312ff93ba..fd81cfd26b 100644 --- a/metagraph/src/cli/config/config.cpp +++ b/metagraph/src/cli/config/config.cpp @@ -151,6 +151,8 @@ Config::Config(int argc, char *argv[]) { dynamic = true; } else if (!strcmp(argv[i], "--mask-dummy")) { mark_dummy_kmers = true; + } else if (!strcmp(argv[i], "--is-monochromatic")) { + is_monochromatic = true; } else if (!strcmp(argv[i], "--anno-filename")) { filename_anno = true; } else if (!strcmp(argv[i], "--anno-header")) { @@ -972,6 +974,7 @@ if (advanced) { fprintf(stderr, "\t --mode \t\tk-mer indexing mode: basic / canonical / primary [basic]\n"); #endif fprintf(stderr, "\t --complete \t\tconstruct a complete graph (only for Bitmap graph) [off]\n"); + fprintf(stderr, "\t --is-monochromatic \t\tindicate that the input sequences are monochromatic (i.e., their colouring is constant) [off]\n"); fprintf(stderr, "\t --mem-cap-gb [INT] \tpreallocated buffer size in GB [1]\n"); if (advanced) { fprintf(stderr, "\t --dynamic \t\tuse dynamic build method [off]\n"); diff --git a/metagraph/src/cli/config/config.hpp b/metagraph/src/cli/config/config.hpp index b4848812d8..aae5304270 100644 --- a/metagraph/src/cli/config/config.hpp +++ b/metagraph/src/cli/config/config.hpp @@ -28,6 +28,7 @@ class Config { bool complete = false; bool dynamic = false; bool mark_dummy_kmers = false; + bool is_monochromatic = false; bool filename_anno = false; bool annotate_sequence_headers = false; bool to_adj_list = false; diff --git a/metagraph/src/cli/query.cpp b/metagraph/src/cli/query.cpp index cb7d2e35c9..fcb41c61b5 100644 --- a/metagraph/src/cli/query.cpp +++ b/metagraph/src/cli/query.cpp @@ -955,8 +955,9 @@ construct_query_graph(const AnnotatedDBG &anno_graph, #pragma omp parallel for num_threads(num_threads) for (size_t i = 0; i < contigs.size(); ++i) { contigs[i].second.reserve(contigs[i].first.length() - graph_init->get_k() + 1); - full_dbg.map_to_nodes(contigs[i].first, - [&](node_index node) { contigs[i].second.push_back(node); }); + call_annotated_nodes_offsets(full_dbg, contigs[i].first, [&](node_index node, int64_t) { + contigs[i].second.push_back(node); + }); } logger->trace("[Query graph construction] Contigs mapped to the full graph in {} sec", timer.elapsed()); diff --git a/metagraph/src/graph/alignment/annotation_buffer.cpp b/metagraph/src/graph/alignment/annotation_buffer.cpp index 4020f312a7..16d501385b 100644 --- a/metagraph/src/graph/alignment/annotation_buffer.cpp +++ b/metagraph/src/graph/alignment/annotation_buffer.cpp @@ -48,6 +48,7 @@ void AnnotationBuffer::fetch_queued_annotations() { for (const auto &path : queued_paths_) { std::vector base_path; + std::vector base_path_offsets; if (base_graph->get_mode() == DeBruijnGraph::CANONICAL) { // TODO: avoid this call of spell_path std::string query = spell_path(graph_, path); diff --git a/metagraph/src/graph/annotated_dbg.cpp b/metagraph/src/graph/annotated_dbg.cpp index 430365365a..6fc6ea257c 100644 --- a/metagraph/src/graph/annotated_dbg.cpp +++ b/metagraph/src/graph/annotated_dbg.cpp @@ -6,6 +6,7 @@ #include "annotation/representation/row_compressed/annotate_row_compressed.hpp" #include "annotation/int_matrix/base/int_matrix.hpp" #include "graph/representation/canonical_dbg.hpp" +#include "graph/representation/hash/dbg_sshash.hpp" #include "common/aligned_vector.hpp" #include "common/vectors/vector_algorithm.hpp" #include "common/vector_map.hpp" @@ -23,6 +24,21 @@ using Column = mtg::annot::matrix::BinaryMatrix::Column; typedef AnnotatedDBG::Label Label; typedef std::pair StringCountPair; +void call_annotated_nodes_offsets(const SequenceGraph &graph, + std::string_view sequence, + const std::function &callback) { + if (const auto *sshash = dynamic_cast(&graph)) { + if (sshash->is_monochromatic()) { + sshash->map_to_contigs_with_rc(sequence, [&](SequenceGraph::node_index i, int64_t offset, bool) { + callback(i, offset); + }); + return; + } + } + + graph.map_to_nodes(sequence, [&](SequenceGraph::node_index i) { callback(i, 0); }); +} + AnnotatedSequenceGraph ::AnnotatedSequenceGraph(std::shared_ptr graph, @@ -46,10 +62,9 @@ ::annotate_sequence(std::string_view sequence, std::vector indices; indices.reserve(sequence.size()); - - graph_->map_to_nodes(sequence, [&](node_index i) { + call_annotated_nodes_offsets(*graph_, sequence, [&](node_index i, int64_t) { if (i > 0) - indices.push_back(graph_to_anno_index(i)); + indices.emplace_back(graph_to_anno_index(i)); }); if (!indices.size()) @@ -80,9 +95,9 @@ ::annotate_sequences(const std::vector if (!indices.capacity()) indices.reserve(data[t].first.size()); - graph_->map_to_nodes(data[t].first, [&](node_index i) { + call_annotated_nodes_offsets(*graph_, data[t].first, [&](node_index i, int64_t) { if (i > 0) - indices.push_back(graph_to_anno_index(i)); + indices.emplace_back(graph_to_anno_index(i)); }); } @@ -117,10 +132,10 @@ void AnnotatedDBG::add_kmer_counts(std::string_view sequence, indices.reserve(sequence.size() - dbg_.get_k() + 1); size_t end = 0; - graph_->map_to_nodes(sequence, [&](node_index i) { + call_annotated_nodes_offsets(dbg_, sequence, [&](node_index i, int64_t) { // only insert indexes for matched k-mers and shift counts accordingly if (i > 0) { - indices.push_back(graph_to_anno_index(i)); + indices.emplace_back(graph_to_anno_index(i)); kmer_counts[indices.size() - 1] = kmer_counts[end++]; } }); @@ -139,15 +154,22 @@ void AnnotatedDBG::add_kmer_coord(std::string_view sequence, if (sequence.size() < dbg_.get_k()) return; - std::vector indices = map_to_nodes(dbg_, sequence); + std::vector indices; + std::vector offsets; + call_annotated_nodes_offsets(dbg_, sequence, [&](node_index i, int64_t offset) { + indices.emplace_back(i); + offsets.emplace_back(offset); + }); std::lock_guard lock(mutex_); + auto it = offsets.begin(); for (node_index i : indices) { // only insert coordinates for matched k-mers and increment the coordinates if (i > 0) - annotator_->add_label_coord(graph_to_anno_index(i), labels, coord); + annotator_->add_label_coord(graph_to_anno_index(i), labels, coord - *it); coord++; + ++it; } } @@ -156,10 +178,17 @@ void AnnotatedDBG::add_kmer_coords( assert(check_compatibility()); std::vector> ids; + std::vector> offsets; ids.reserve(data.size()); for (const auto &[sequence, labels, _] : data) { - if (sequence.size() >= dbg_.get_k()) - ids.push_back(map_to_nodes(dbg_, sequence)); + if (sequence.size() >= dbg_.get_k()) { + auto &id = ids.emplace_back(); + auto &offset = offsets.emplace_back(); + call_annotated_nodes_offsets(dbg_, sequence, [&](node_index i, int64_t o) { + id.emplace_back(i); + offset.emplace_back(o); + }); + } } std::lock_guard lock(mutex_); @@ -168,11 +197,13 @@ void AnnotatedDBG::add_kmer_coords( const auto &labels = std::get<1>(data[t]); uint64_t coord = std::get<2>(data[t]); + auto it = offsets[t].begin(); for (node_index i : ids[t]) { // only insert coordinates for matched k-mers and increment the coordinates if (i > 0) - annotator_->add_label_coord(graph_to_anno_index(i), labels, coord); + annotator_->add_label_coord(graph_to_anno_index(i), labels, coord - *it); coord++; + ++it; } } } @@ -200,10 +231,10 @@ void AnnotatedDBG::annotate_kmer_coords( coords[last].reserve(sequence.size() - dbg_.get_k() + 1); } - graph_->map_to_nodes(sequence, [&](node_index i) { + call_annotated_nodes_offsets(*graph_, sequence, [&](node_index i, int64_t o) { if (i > 0) { ids[last].push_back(graph_to_anno_index(i)); - coords[last].emplace_back(graph_to_anno_index(i), coord); + coords[last].emplace_back(graph_to_anno_index(i), coord - o); } coord++; }); @@ -238,7 +269,7 @@ std::vector