Skip to content

Commit

Permalink
incomplete: protein k-mer support
Browse files Browse the repository at this point in the history
  • Loading branch information
hmusta committed May 30, 2024
1 parent 0120fae commit add9fff
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 5 deletions.
21 changes: 20 additions & 1 deletion metagraph/src/graph/representation/hash/dbg_sshash.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

#include "common/seq_tools/reverse_complement.hpp"
#include "common/threads/threading.hpp"
#include "kmer/kmer_extractor.hpp"


namespace mtg {
Expand Down Expand Up @@ -130,8 +129,12 @@ void DBGSSHash::map_to_nodes_with_rc(std::string_view sequence,
for (size_t i = k_ - 1; i < sequence.size() && !terminate(); ++i) {
uint_kmer.drop_char();
uint_kmer.kth_char_or(k_ - 1, kmer_t::char_to_uint(sequence[i]));
#if _PROTEIN_GRAPH
callback(sshash_to_graph_index(dict_.lookup_uint(uint_kmer)), false);
#else
auto res = dict_.lookup_advanced_uint(uint_kmer, true);
callback(sshash_to_graph_index(res.kmer_id), res.kmer_orientation);
#endif
}
}

Expand All @@ -140,19 +143,27 @@ DBGSSHash::node_index DBGSSHash::traverse(node_index node, char next_char) const
kmer_t new_kmer = sshash::util::string_to_uint_kmer<kmer_t>(string_kmer.c_str(), k_);
new_kmer.drop_char();
new_kmer.kth_char_or(k_ - 1, kmer_t::char_to_uint(next_char));
#if _PROTEIN_GRAPH
return sshash_to_graph_index(dict_.lookup_uint(new_kmer));
#else
auto res = dict_.lookup_advanced_uint(new_kmer, mode_ == CANONICAL);
node_index next = sshash_to_graph_index(res.kmer_id);
return res.kmer_orientation ? reverse_complement(next) : next;
#endif
}

// Finds the node reached from `node` when `prev_char` is used for a backward
// step.
//
// The k-mer of `node` is re-encoded as a kmer_t, `prev_char` is pushed in with
// append_char(), and the result is cut back to k_ characters with take_chars()
// (presumably this drops the old last character so that `prev_char` becomes
// the first one -- confirm against the sshash kmer_t implementation).
//
// @param node       node whose sequence is the starting k-mer
// @param prev_char  character to place in front of that k-mer
// @return the graph index the dictionary reports for the new k-mer. Outside
//         protein builds, the index is passed through reverse_complement()
//         whenever the lookup reports a set kmer_orientation.
DBGSSHash::node_index DBGSSHash::traverse_back(node_index node, char prev_char) const {
    const std::string node_seq = get_node_sequence(node);
    kmer_t pred_kmer = sshash::util::string_to_uint_kmer<kmer_t>(node_seq.c_str(), k_);
    pred_kmer.append_char(kmer_t::char_to_uint(prev_char));
    pred_kmer.take_chars(k_);

#if _PROTEIN_GRAPH
    // Protein builds use the plain lookup and return its converted result as is.
    return sshash_to_graph_index(dict_.lookup_uint(pred_kmer));
#else
    const bool canonical_mode = (mode_ == CANONICAL);
    const auto hit = dict_.lookup_advanced_uint(pred_kmer, canonical_mode);
    node_index found = sshash_to_graph_index(hit.kmer_id);
    if (hit.kmer_orientation)
        return reverse_complement(found);

    return found;
#endif
}

void DBGSSHash::adjacent_outgoing_nodes(node_index node,
Expand Down Expand Up @@ -254,18 +265,26 @@ DBGSSHash::node_index DBGSSHash::kmer_to_node(std::string_view kmer) const {
if (!num_nodes())
return npos;

#if _PROTEIN_GRAPH
return dict_.lookup(kmer.data());
#else
auto res = dict_.lookup_advanced(kmer.data(), mode_ == CANONICAL);
node_index node = sshash_to_graph_index(res.kmer_id);
return res.kmer_orientation ? reverse_complement(node) : node;
#endif
}

// Looks up `kmer` in the dictionary and reports both the matching node and
// the orientation flag of the match.
//
// @param kmer  k-mer to look up
// @return (npos, false) when the graph has no nodes. Otherwise a pair of the
//         graph index for the dictionary hit and the orientation flag: the
//         kmer_orientation reported by lookup_advanced() in non-protein
//         builds, and always false in protein builds.
//
// NOTE(review): kmer.data() is passed as a bare pointer and a string_view is
// not guaranteed to be null-terminated; this assumes the dictionary reads
// exactly k characters -- confirm against the sshash lookup implementation.
std::pair<DBGSSHash::node_index, bool>
DBGSSHash::kmer_to_node_with_rc(std::string_view kmer) const {
    if (!num_nodes())
        return std::make_pair(npos, false);

#if _PROTEIN_GRAPH
    // The plain lookup result is a single value, so the pair is built
    // explicitly here: the value goes through sshash_to_graph_index() like in
    // the other protein branches of this class, and the orientation is fixed
    // to false exactly as map_to_nodes_with_rc() reports it for protein builds.
    return std::make_pair(sshash_to_graph_index(dict_.lookup(kmer.data())), false);
#else
    auto res = dict_.lookup_advanced(kmer.data(), true);
    return std::make_pair(sshash_to_graph_index(res.kmer_id), res.kmer_orientation);
#endif
}

std::string DBGSSHash::get_node_sequence(node_index node) const {
Expand Down
7 changes: 7 additions & 0 deletions metagraph/src/graph/representation/hash/dbg_sshash.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,18 @@
#include "common/utils/string_utils.hpp"
#include "common/logger.hpp"
#include "graph/representation/base/sequence_graph.hpp"
#include "kmer/kmer_extractor.hpp"

namespace mtg::graph {

class DBGSSHash : public DeBruijnGraph {
#if _PROTEIN_GRAPH
using kmer_t = sshash::alpha_kmer_t<uint64_t,
kmer::KmerExtractor2Bit::bits_per_char,
kmer::alphabets::kAlphabetProtein>;
#else
using kmer_t = sshash::dna_uint_kmer_t<uint64_t>;
#endif

public:
explicit DBGSSHash(size_t k, Mode mode = BASIC);
Expand Down
8 changes: 4 additions & 4 deletions metagraph/src/kmer/alphabets.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ static_assert(kBOSSSigmaDNA5 > 1llu << (kBOSSBitsPerCharDNA5 - 1));



constexpr char kAlphabetProtein[] = "ABCDEFGHIJKLMNOPQRSTUVWYZX";
inline constexpr char kAlphabetProtein[] = "ABCDEFGHIJKLMNOPQRSTUVWYZX";
constexpr uint8_t kSigmaProtein = sizeof(kAlphabetProtein) - 1;
constexpr uint8_t kBitsPerCharProtein = log2<kSigmaProtein - 1>::value + 1;
constexpr uint8_t kCharToProtein[128] = {
Expand All @@ -106,7 +106,7 @@ static_assert(kSigmaProtein > 1llu << (kBitsPerCharProtein - 1));


//for case-specific DNA and RNA (U <-> T) data
constexpr char kAlphabetDNACaseSent[] = "ACGTNacgt";
inline constexpr char kAlphabetDNACaseSent[] = "ACGTNacgt";
constexpr uint8_t kSigmaDNACaseSent = sizeof(kAlphabetDNACaseSent) - 1;
constexpr uint8_t kBitsPerCharDNACaseSent = log2<kSigmaDNACaseSent - 1>::value + 1;
constexpr uint8_t kCharToDNACaseSent[128] = {
Expand All @@ -125,7 +125,7 @@ static_assert(kSigmaDNACaseSent > 1llu << (kBitsPerCharDNACaseSent - 1));


//for DNA and RNA (U <-> T) alphabets
constexpr char kAlphabetDNA[] = "ACGT";
inline constexpr char kAlphabetDNA[] = "ACGT";
constexpr uint8_t kSigmaDNA = sizeof(kAlphabetDNA) - 1;
constexpr uint8_t kBitsPerCharDNA = log2<kSigmaDNA - 1>::value + 1;
constexpr uint8_t kCharToDNA[128] = {
Expand All @@ -142,7 +142,7 @@ const std::vector<uint8_t> kComplementMapDNA = { 3, 2, 1, 0, 4 };
static_assert(kSigmaDNA <= 1llu << kBitsPerCharDNA);
static_assert(kSigmaDNA > 1llu << (kBitsPerCharDNA - 1));

constexpr char kAlphabetDNA5[] = "ACGTN";
inline constexpr char kAlphabetDNA5[] = "ACGTN";
constexpr uint8_t kSigmaDNA5 = sizeof(kAlphabetDNA5) - 1;
constexpr uint8_t kBitsPerCharDNA5 = log2<kSigmaDNA5 - 1>::value + 1;
static_assert(kSigmaDNA5 <= 1llu << kBitsPerCharDNA5);
Expand Down

0 comments on commit add9fff

Please sign in to comment.