From 1806c0c8f7aa4365f9f72c8ea51e947d1e93ccd9 Mon Sep 17 00:00:00 2001
From: Martin Steinegger
Date: Wed, 6 Dec 2023 15:44:53 +0900
Subject: [PATCH] Add padding module

---
 src/CommandDeclarations.h    |  1 +
 src/MMseqsBase.cpp           |  9 ++++++++-
 src/util/CMakeLists.txt      |  1 +
 src/util/makepaddedseqdb.cpp | 43 +++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 53 insertions(+), 1 deletion(-)
 create mode 100644 src/util/makepaddedseqdb.cpp

diff --git a/src/CommandDeclarations.h b/src/CommandDeclarations.h
index 503663424..e8eac296c 100644
--- a/src/CommandDeclarations.h
+++ b/src/CommandDeclarations.h
@@ -23,6 +23,7 @@ extern int convertkb(int argc, const char **argv, const Command& command);
 extern int convertmsa(int argc, const char **argv, const Command& command);
 extern int convertprofiledb(int argc, const char **argv, const Command& command);
 extern int createdb(int argc, const char **argv, const Command& command);
+extern int makepaddedseqdb(int argc, const char **argv, const Command& command);
 extern int createindex(int argc, const char **argv, const Command& command);
 extern int createlinindex(int argc, const char **argv, const Command& command);
 extern int createseqfiledb(int argc, const char **argv, const Command& command);
diff --git a/src/MMseqsBase.cpp b/src/MMseqsBase.cpp
index a8b8a8422..8ec4493eb 100644
--- a/src/MMseqsBase.cpp
+++ b/src/MMseqsBase.cpp
@@ -130,6 +130,13 @@ std::vector baseCommands = {
                 " ... | ",
                 CITATION_MMSEQS2, {{"fast[a|q]File[.gz|bz2]|stdin", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::VARIADIC, &DbValidator::flatfileStdinAndGeneric },
                                    {"sequenceDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile }}},
+        {"makepaddedseqdb", makepaddedseqdb, &par.onlyverbosity, COMMAND_HIDDEN,
+                "Generate a padded sequence DB",
+                "Generate a padded sequence DB",
+                "Martin Steinegger ",
+                " ",
+                CITATION_MMSEQS2, {{"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::NEED_HEADER, &DbValidator::sequenceDb },
+                                   {"sequenceIndexDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::sequenceDb }}},
         {"appenddbtoindex", appenddbtoindex, &par.appenddbtoindex, COMMAND_HIDDEN,
                 NULL,
                 NULL,
@@ -137,7 +144,7 @@ std::vector baseCommands = {
                 " ... ",
                 CITATION_MMSEQS2, {{"DB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::VARIADIC, &DbValidator::allDb },
                                    {"DB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::allDb }}},
-        {"indexdb", indexdb, &par.indexdb, COMMAND_HIDDEN,
+        {"indexdb", indexdb, &par.indexdb, COMMAND_HIDDEN,
                 NULL,
                 NULL,
                 "Martin Steinegger ",
diff --git a/src/util/CMakeLists.txt b/src/util/CMakeLists.txt
index 3b2638fbf..7fa7b0603 100644
--- a/src/util/CMakeLists.txt
+++ b/src/util/CMakeLists.txt
@@ -31,6 +31,7 @@ set(util_source_files
         util/filterdb.cpp
         util/gff2db.cpp
         util/renamedbkeys.cpp
+        util/makepaddedseqdb.cpp
         util/masksequence.cpp
         util/maskbygff.cpp
         util/mergeclusters.cpp
diff --git a/src/util/makepaddedseqdb.cpp b/src/util/makepaddedseqdb.cpp
new file mode 100644
index 000000000..de2224159
--- /dev/null
+++ b/src/util/makepaddedseqdb.cpp
@@ -0,0 +1,43 @@
+#include "Parameters.h"
+#include "DBReader.h"
+#include "DBWriter.h"
+#include "Debug.h"
+#include "Util.h"
+
+// Writes a copy of the input sequence DB in which every record is padded with
+// ' ' bytes up to the next multiple of ALIGN bytes.
+//
+// Reads par.db1 (opened with SORT_BY_LENGTH), walks it from the last index to
+// the first, writes each padded record under its original key to par.db2 and
+// finally calls softlinkDb for the SEQUENCE_ANCILLARY files of the input.
+// Returns EXIT_SUCCESS.
+int makepaddedseqdb(int argc, const char **argv, const Command &command) {
+    Parameters &par = Parameters::getInstance();
+    par.parseParameters(argc, argv, command, true, 0, 0);
+    DBReader<unsigned int> dbr(par.db1.c_str(), par.db1Index.c_str(), 1,
+                               DBReader<unsigned int>::USE_INDEX | DBReader<unsigned int>::USE_DATA);
+    dbr.open(DBReader<unsigned int>::SORT_BY_LENGTH);
+    DBWriter writer(par.db2.c_str(), par.db2Index.c_str(), 1, false, dbr.getDbtype());
+    writer.open();
+    // Scratch buffer for one padded record; reused across iterations.
+    std::string result;
+    const int ALIGN = 4;
+    for (long id = dbr.getSize() - 1; id >= 0; id--) {
+        unsigned int key = dbr.getDbKey(id);
+        char *data = dbr.getData(id, 0);
+        size_t seqLen = dbr.getSeqLen(id);
+        const size_t sequencepadding = (seqLen % ALIGN == 0) ? 0 : ALIGN - seqLen % ALIGN;
+        // assign() replaces the previous record, so the buffer never accumulates.
+        result.assign(data, seqLen);
+        result.append(sequencepadding, ' ');
+        // Write the padded buffer, not `data`: only seqLen bytes of `data` are
+        // sequence, so reading seqLen + sequencepadding bytes from it would emit
+        // whatever follows the sequence instead of the ' ' padding.
+        writer.writeData(result.c_str(), result.size(), key, 0, false);
+    }
+    writer.close(true);
+    DBReader<unsigned int>::softlinkDb(par.db1, par.db2, DBFiles::SEQUENCE_ANCILLARY);
+
+    dbr.close();
+    return EXIT_SUCCESS;
+}
\ No newline at end of file