Skip to content

Commit

Permalink
first commit on contig annotation
Browse files Browse the repository at this point in the history
  • Loading branch information
hmusta committed Oct 2, 2024
1 parent 163b644 commit bfae7bc
Show file tree
Hide file tree
Showing 6 changed files with 122 additions and 17 deletions.
6 changes: 5 additions & 1 deletion metagraph/src/cli/build.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -251,12 +251,16 @@ int build_graph(Config *config) {
}

} else if (config->graph_type == Config::GraphType::SSHASH && !config->dynamic) {
graph.reset(new DBGSSHash(files.at(0), config->k, config->graph_mode, config->num_chars));
if (files.size() > 1) {
logger->error("DBGSSHash does not support multiple input files.");
exit(1);
}

graph.reset(new DBGSSHash(files.at(0),
config->k,
config->graph_mode,
config->num_chars,
config->is_monochromatic));
} else {
//slower method
switch (config->graph_type) {
Expand Down
3 changes: 3 additions & 0 deletions metagraph/src/cli/config/config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,8 @@ Config::Config(int argc, char *argv[]) {
dynamic = true;
} else if (!strcmp(argv[i], "--mask-dummy")) {
mark_dummy_kmers = true;
} else if (!strcmp(argv[i], "--is-monochromatic")) {
is_monochromatic = true;
} else if (!strcmp(argv[i], "--anno-filename")) {
filename_anno = true;
} else if (!strcmp(argv[i], "--anno-header")) {
Expand Down Expand Up @@ -972,6 +974,7 @@ if (advanced) {
fprintf(stderr, "\t --mode \t\tk-mer indexing mode: basic / canonical / primary [basic]\n");
#endif
fprintf(stderr, "\t --complete \t\tconstruct a complete graph (only for Bitmap graph) [off]\n");
fprintf(stderr, "\t --is-monochromatic \t\tindicate that the input sequences are monochromatic (i.e., their colouring is constant) [off]\n");
fprintf(stderr, "\t --mem-cap-gb [INT] \tpreallocated buffer size in GB [1]\n");
if (advanced) {
fprintf(stderr, "\t --dynamic \t\tuse dynamic build method [off]\n");
Expand Down
1 change: 1 addition & 0 deletions metagraph/src/cli/config/config.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ class Config {
bool complete = false;
bool dynamic = false;
bool mark_dummy_kmers = false;
bool is_monochromatic = false;
bool filename_anno = false;
bool annotate_sequence_headers = false;
bool to_adj_list = false;
Expand Down
18 changes: 17 additions & 1 deletion metagraph/src/graph/representation/canonical_dbg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ ::map_to_nodes_sequentially(std::string_view sequence,
path.reserve(sequence.size() - get_k() + 1);

if (const auto sshash = std::dynamic_pointer_cast<const DBGSSHash>(graph_)) {
sshash->map_to_nodes_with_rc<>(sequence, [&](node_index node, bool orientation) {
sshash->map_to_nodes_with_rc<true>(sequence, [&](node_index node, bool orientation) {
callback(node && orientation ? reverse_complement(node) : node);
}, terminate);
return;
Expand Down Expand Up @@ -156,6 +156,22 @@ ::map_to_nodes_sequentially(std::string_view sequence,
void CanonicalDBG::map_to_nodes(std::string_view sequence,
const std::function<void(node_index)> &callback,
const std::function<bool()> &terminate) const {
if (const auto sshash = std::dynamic_pointer_cast<const DBGSSHash>(graph_)) {
if (sshash->is_monochromatic()) {
sshash->map_to_contigs_with_rc<true>(
sequence,
[&](node_index node, uint64_t, bool orientation) {
if (orientation)
node = reverse_complement(node);

callback(get_base_node(node));
},
terminate
);
return;
}
}

map_to_nodes_sequentially(sequence, [&](node_index i) {
callback(get_base_node(i));
}, terminate);
Expand Down
94 changes: 80 additions & 14 deletions metagraph/src/graph/representation/hash/dbg_sshash.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,8 @@ size_t DBGSSHash::dict_size() const {
return std::visit([](const auto &d) { return d.size(); }, dict_);
}

DBGSSHash::DBGSSHash(size_t k, Mode mode) : k_(k), num_nodes_(0), mode_(mode) {
DBGSSHash::DBGSSHash(size_t k, Mode mode)
: k_(k), num_nodes_(0), mode_(mode), is_monochromatic_(false) {
size_t odd_k = (k_ | 1);

if (odd_k * bits_per_char <= 64) {
Expand All @@ -54,11 +55,16 @@ DBGSSHash::DBGSSHash(size_t k, Mode mode) : k_(k), num_nodes_(0), mode_(mode) {
}
}

DBGSSHash::DBGSSHash(const std::string &input_filename, size_t k, Mode mode, size_t num_chars)
DBGSSHash::DBGSSHash(const std::string &input_filename,
size_t k,
Mode mode,
size_t num_chars,
bool is_monochromatic)
: DBGSSHash(k, mode) {
if (k <= 1)
throw std::domain_error("k must be at least 2");

is_monochromatic_ = is_monochromatic;
sshash::build_configuration build_config;
build_config.k = k;

Expand Down Expand Up @@ -99,6 +105,26 @@ void DBGSSHash::add_sequence(std::string_view sequence,
throw std::logic_error("adding sequences not supported");
}

void DBGSSHash
::map_to_nodes_with_rc_advanced(std::string_view sequence,
const std::function<void(sshash::lookup_result)>& callback,
bool with_rc,
const std::function<bool()>& terminate) const {
if (terminate() || sequence.size() < k_)
return;

std::visit([&](const auto &dict) {
using kmer_t = get_kmer_t<decltype(dict)>;
kmer_t uint_kmer = sshash::util::string_to_uint_kmer<kmer_t>(sequence.data(), k_ - 1);
uint_kmer.pad_char();
for (size_t i = k_ - 1; i < sequence.size() && !terminate(); ++i) {
uint_kmer.drop_char();
uint_kmer.kth_char_or(k_ - 1, kmer_t::char_to_uint(sequence[i]));
callback(dict.lookup_advanced_uint(uint_kmer, with_rc));
}
}, dict_);
}

template <bool with_rc>
void DBGSSHash::map_to_nodes_with_rc(std::string_view sequence,
const std::function<void(node_index, bool)>& callback,
Expand All @@ -113,18 +139,11 @@ void DBGSSHash::map_to_nodes_with_rc(std::string_view sequence,
return;
}

std::visit([&](const auto &dict) {
using kmer_t = get_kmer_t<decltype(dict)>;
kmer_t uint_kmer = sshash::util::string_to_uint_kmer<kmer_t>(sequence.data(), k_ - 1);
uint_kmer.pad_char();
for (size_t i = k_ - 1; i < sequence.size() && !terminate(); ++i) {
uint_kmer.drop_char();
uint_kmer.kth_char_or(k_ - 1, kmer_t::char_to_uint(sequence[i]));
auto res = dict.lookup_advanced_uint(uint_kmer, with_rc);
callback(sshash_to_graph_index(res.kmer_id), res.kmer_orientation);
}
}, dict_);
map_to_nodes_with_rc_advanced(sequence, [&](sshash::lookup_result res) {
callback(sshash_to_graph_index(res.kmer_id), res.kmer_orientation);
}, with_rc, terminate);
}

template
void DBGSSHash::map_to_nodes_with_rc<true>(std::string_view,
const std::function<void(node_index, bool)>&,
Expand All @@ -134,10 +153,53 @@ void DBGSSHash::map_to_nodes_with_rc<false>(std::string_view,
const std::function<void(node_index, bool)>&,
const std::function<bool()>&) const;

/**
 * Map every k-mer of `sequence` to a contig-level coordinate.
 *
 * For each k-mer, `callback(node, offset, orientation)` is invoked where
 *  - `node` is the k-mer's graph index minus its `kmer_id_in_contig`
 *    (presumably the index of the first k-mer of the containing contig —
 *    confirm against sshash's lookup_result semantics), or `npos` if the
 *    k-mer was not found;
 *  - `offset` is `kmer_id_in_contig` for a hit and 0 for a miss;
 *  - `orientation` is the `kmer_orientation` reported by the lookup.
 *
 * If the graph has no nodes, `(npos, 0, false)` is reported for every k-mer
 * without touching the dictionary.
 *
 * @tparam with_rc   forwarded to the dictionary lookup
 * @param terminate  polled before each k-mer; `true` stops the traversal
 */
template <bool with_rc>
void DBGSSHash::map_to_contigs_with_rc(std::string_view sequence,
                                       const std::function<void(node_index, uint64_t, bool)>& callback,
                                       const std::function<bool()>& terminate) const {
    if (terminate())
        return;

    // No complete k-mer fits into the query.
    if (sequence.size() < k_)
        return;

    if (num_nodes() == 0) {
        // Empty graph: every k-mer is a miss.
        const size_t num_kmers = sequence.size() - k_ + 1;
        for (size_t i = 0; i < num_kmers; ++i) {
            if (terminate())
                break;

            callback(npos, 0, false);
        }
        return;
    }

    const auto report = [&](sshash::lookup_result res) {
        const node_index hit = sshash_to_graph_index(res.kmer_id);
        if (hit == npos) {
            // Miss: no contig offset to report.
            callback(npos, 0, res.kmer_orientation);
        } else {
            // Hit: step back by the k-mer's offset within its contig.
            callback(hit - res.kmer_id_in_contig,
                     res.kmer_id_in_contig,
                     res.kmer_orientation);
        }
    };

    map_to_nodes_with_rc_advanced(sequence, report, with_rc, terminate);
}

// Explicit instantiations of map_to_contigs_with_rc for both values of
// `with_rc`, so that callers which only see the declaration can link against
// either variant.
template
void DBGSSHash::map_to_contigs_with_rc<true>(std::string_view,
const std::function<void(node_index, uint64_t, bool)>&,
const std::function<bool()>&) const;
template
void DBGSSHash::map_to_contigs_with_rc<false>(std::string_view,
const std::function<void(node_index, uint64_t, bool)>&,
const std::function<bool()>&) const;

void DBGSSHash::map_to_nodes(std::string_view sequence,
const std::function<void(node_index)>& callback,
const std::function<bool()>& terminate) const {
if (mode_ != CANONICAL) {
if (is_monochromatic_) {
if (mode_ != CANONICAL) {
map_to_contigs_with_rc<false>(
sequence, [&](node_index node, uint64_t, bool) { callback(node); }, terminate);
} else {
map_to_contigs_with_rc<true>(
sequence, [&](node_index node, uint64_t, bool) { callback(node); }, terminate);
}
} else if (mode_ != CANONICAL) {
map_to_nodes_sequentially(sequence, callback, terminate);
} else {
map_to_nodes_with_rc<true>(
Expand Down Expand Up @@ -352,6 +414,7 @@ void DBGSSHash::serialize(std::ostream& out) const {
saver.visit(num_nodes_);
saver.visit(k_);
saver.visit(mode_);
saver.visit(is_monochromatic_);

if (num_nodes())
std::visit([&](const auto &d) { saver.visit(d); }, dict_);
Expand All @@ -368,12 +431,15 @@ bool DBGSSHash::load(std::istream &in) {
size_t num_nodes;
size_t k;
Mode mode;
bool is_monochromatic;
loader.visit(num_nodes);
loader.visit(k);
loader.visit(mode);
loader.visit(is_monochromatic);

*this = DBGSSHash(k, mode);
num_nodes_ = num_nodes;
is_monochromatic_ = is_monochromatic;

if (num_nodes_)
std::visit([&](auto &d) { d.visit(loader); }, dict_);
Expand Down
17 changes: 16 additions & 1 deletion metagraph/src/graph/representation/hash/dbg_sshash.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ class DBGSSHash : public DeBruijnGraph {
sshash::dictionary<kmer_t<KmerInt256>>>;

explicit DBGSSHash(size_t k, Mode mode = BASIC);
DBGSSHash(const std::string &input_filename, size_t k, Mode mode = BASIC, size_t num_chars = 0);
DBGSSHash(const std::string &input_filename, size_t k, Mode mode = BASIC, size_t num_chars = 0, bool is_monochromatic = false);

// SequenceGraph overrides
void add_sequence(
Expand All @@ -54,6 +54,12 @@ class DBGSSHash : public DeBruijnGraph {
const std::function<void(node_index, bool)>& callback,
const std::function<bool()>& terminate = []() { return false; }) const;

template <bool with_rc = true>
void map_to_contigs_with_rc(
std::string_view sequence,
const std::function<void(node_index, uint64_t, bool)>& callback,
const std::function<bool()>& terminate = []() { return false; }) const;

uint64_t num_nodes() const override;

bool load(std::istream& in);
Expand All @@ -72,6 +78,8 @@ class DBGSSHash : public DeBruijnGraph {

Mode get_mode() const override final { return mode_; }

bool is_monochromatic() const { return is_monochromatic_; }

node_index traverse(node_index node, char next_char) const override;
node_index traverse_back(node_index node, char prev_char) const override;

Expand Down Expand Up @@ -116,6 +124,13 @@ class DBGSSHash : public DeBruijnGraph {
size_t k_;
size_t num_nodes_;
Mode mode_;
bool is_monochromatic_;

void map_to_nodes_with_rc_advanced(
std::string_view sequence,
const std::function<void(sshash::lookup_result)>& callback,
bool with_rc,
const std::function<bool()>& terminate = []() { return false; }) const;

size_t dict_size() const;
};
Expand Down

0 comments on commit bfae7bc

Please sign in to comment.