Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 26 additions & 4 deletions include/zim/suggestion.h
Original file line number Diff line number Diff line change
Expand Up @@ -97,12 +97,18 @@ class LIBZIM_API SuggestionSearcher
*/
class LIBZIM_API SuggestionSearch
{
public:
public: // types
typedef std::vector<SuggestionItem> Results;

public: // functions
SuggestionSearch(SuggestionSearch&& s);
SuggestionSearch& operator=(SuggestionSearch&& s);
~SuggestionSearch();

/** Get a set of results for this search.
*
* Returns a subset of title suggestions for the requested range from
* the full set of results.
*
* @param start The beginning of the range to get
* (offset of the first result).
Expand All @@ -111,14 +117,30 @@ class LIBZIM_API SuggestionSearch
*/
const SuggestionResultSet getResults(int start, int maxResults) const;

/** Get the number of estimated results for this suggestion search.
/** Get spelling correction suggestions for this search.
*
* Returns spelling correction suggestions for the word containing the
* text edit location. In the current implementation, the text edit
* location is assumed to be at the end of the query string provided to
* the SuggestionSearch::suggest() method. In the future the text edit
* location will be indicated by a special code-point (e.g.
* carriage-return, form-feed or soft-hyphen) in the query string.
*
* @param maxCount The maximum number of results to return.
*/
Results getSpellingSuggestions(uint32_t maxCount) const;

/** Get the estimated count of title matches for this suggestion search.
*
* As the name suggest, it is a estimation of the number of results.
* As the name suggests, it is an estimation of the number of results.
* As a member of the initial API, the name of this method conceals
* the fact that only title suggestions are covered by it.
*/
int getEstimatedMatches() const;

private: // methods
SuggestionSearch(std::shared_ptr<SuggestionDataBase> p_internalDb, const std::string& query);
SuggestionSearch(std::shared_ptr<SuggestionDataBase> p_internalDb,
const std::string& query);

private: // data
std::shared_ptr<SuggestionDataBase> mp_internalDb;
Expand Down
175 changes: 172 additions & 3 deletions src/suggestion.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#include "suggestion_internal.h"
#include "fileimpl.h"
#include "tools.h"
#include "fs_unix.h"
#include "constants.h"

#if defined(ENABLE_XAPIAN)
Expand Down Expand Up @@ -198,9 +199,10 @@ void SuggestionSearcher::initDatabase()
mp_internalDb = std::make_shared<SuggestionDataBase>(m_archive, m_verbose);
}

SuggestionSearch::SuggestionSearch(std::shared_ptr<SuggestionDataBase> p_internalDb, const std::string& query)
: mp_internalDb(p_internalDb),
m_query(query)
SuggestionSearch::SuggestionSearch(std::shared_ptr<SuggestionDataBase> p_internalDb,
const std::string& query)
: mp_internalDb(p_internalDb)
, m_query(query)
#if defined(ENABLE_XAPIAN)
, mp_enquire(nullptr)
#endif // ENABLE_XAPIAN
Expand Down Expand Up @@ -252,6 +254,173 @@ const SuggestionResultSet SuggestionSearch::getResults(int start, int maxResults
return SuggestionResultSet(entryRange);
}

namespace
{

/**
 * Splits a suggestion query into the part that is kept verbatim and the
 * word that is currently being typed, so that a corrected word can be
 * spliced back into the full query.
 */
class QueryInfo
{
public:
  /**
   * Parse the query.
   *
   * The word being edited is everything after the last space character (or
   * the whole query when there is no space); it is empty when the query ends
   * with a space.
   */
  explicit QueryInfo(const std::string& query)
  {
    // XXX: the edit location (caret position) is assumed to be at the very
    // XXX: end of the query, hence the suffix is always empty.
    const std::string::size_type spacePos = query.rfind(' ');
    std::string::size_type wordStart = 0;
    if ( spacePos != std::string::npos ) {
      wordStart = spacePos + 1;
    }

    m_queryPrefix.assign(query, 0, wordStart);
    m_wordToComplete.assign(query, wordStart, std::string::npos);
    m_wordBeingEdited = m_wordToComplete;
    m_querySuffix.clear();
  }

  /** The word located at the edit position (possibly empty). */
  const std::string& wordBeingEdited() const { return m_wordBeingEdited; }

  /**
   * Rebuild the full query with the edited word replaced by
   * `correctedWord`.
   */
  std::string spellingSuggestion(const std::string& correctedWord) const {
    std::string fullQuery(m_queryPrefix);
    fullQuery += correctedWord;
    fullQuery += m_querySuffix;
    return fullQuery;
  }

private:
  std::string m_queryPrefix;     // text preceding the edited word
  std::string m_wordToComplete;  // last word of the query
  std::string m_wordBeingEdited; // currently identical to m_wordToComplete
  std::string m_querySuffix;     // text following the edited word (always "")
};

} // unnamed namespace

namespace suggestions
{

// Spelling corrections are compiled in only for Xapian-enabled, non-Windows
// builds.
// NOTE(review): presumably Windows is excluded because SpellingsDB relies on
// mkdtemp() and a /dev/shm path - confirm.
#if defined(LIBZIM_WITH_XAPIAN) && ! defined(_WIN32)
#define ENABLE_SPELLINGSDB
#endif

#ifdef ENABLE_SPELLINGSDB
/**
 * A spelling dictionary backed by a Xapian writable database that lives in
 * a freshly created temporary directory.
 *
 * The constructor registers the `term` of every entry of the supplied
 * collection as a known spelling. The destructor asks unix::FS::remove() to
 * delete the temporary directory. Instances cannot be copied.
 */
class SpellingsDB
{
public: // functions
// Builds the dictionary from the given terms.
explicit SpellingsDB(const TermCollection& terms);
~SpellingsDB();

// Non-copyable.
SpellingsDB(const SpellingsDB& ) = delete;
void operator=(const SpellingsDB& ) = delete;

// Returns at most one spelling correction for `word` (an empty vector when
// Xapian has no suggestion). Throws std::runtime_error when `maxCount` is
// greater than 1.
std::vector<std::string> getSpellingCorrections(const std::string& word, uint32_t maxCount) const;

private: // functions
// Creates a uniquely named temporary directory and returns its path.
// Throws std::runtime_error if mkdtemp() fails.
static std::string createTempDir();

private: // data
// Must be declared before impl_: the constructor's initializer for impl_
// builds the database path from tmpDirPath_, and members are initialized in
// declaration order.
const std::string tmpDirPath_;
// NOTE(review): mutable presumably so that const member functions can
// query the database - confirm it is actually required.
mutable Xapian::WritableDatabase impl_;
};

/**
 * Create a uniquely named temporary directory for the spellings database.
 *
 * @return The path of the newly created directory.
 * @throws std::runtime_error if mkdtemp() reports a failure.
 */
std::string SpellingsDB::createTempDir()
{
  // mkdtemp() replaces the trailing XXXXXX in place, hence a writable buffer.
  char pathTemplate[] = "/dev/shm/libzimspellingdb.XXXXXX";
  const char* const createdDir = mkdtemp(pathTemplate);
  if ( createdDir == nullptr ) {
    throw std::runtime_error("SpellingsDB: mkdtemp() failed");
  }
  return std::string(createdDir);
}

/**
 * Create a temporary directory, open a Xapian database (glass backend flag)
 * inside it and register the `term` of every entry of `terms` as a known
 * spelling.
 *
 * @param terms The terms that make up the spelling dictionary.
 */
SpellingsDB::SpellingsDB(const TermCollection& terms)
  : tmpDirPath_(createTempDir())
  , impl_(tmpDirPath_ + "/spellingdb.xapian", Xapian::DB_BACKEND_GLASS)
{
  for (auto entry = terms.begin(); entry != terms.end(); ++entry) {
    impl_.add_spelling(entry->term);
  }
}

// Asks unix::FS::remove() to delete the temporary directory created by the
// constructor.
// NOTE(review): a destructor body runs before the members are destroyed, so
// the directory is removed while impl_ (the Xapian database object) is still
// alive - confirm this is intended, or close impl_ first.
SpellingsDB::~SpellingsDB()
{
unix::FS::remove(tmpDirPath_);
}

/**
 * Look up spelling corrections for a single word.
 *
 * @param word     The (possibly misspelled) word to correct.
 * @param maxCount The maximum number of corrections to return. The current
 *                 implementation supports only 0 and 1.
 * @return An empty vector when `maxCount` is 0 or when Xapian returns no
 *         suggestion; otherwise a vector holding the single suggestion.
 * @throws std::runtime_error if more than one correction is requested.
 */
std::vector<std::string> SpellingsDB::getSpellingCorrections(const std::string& word, uint32_t maxCount) const {
  if ( maxCount > 1 ) {
    throw std::runtime_error("More than one spelling correction was requested");
  }

  std::vector<std::string> result;
  if ( maxCount == 0 ) {
    // Nothing was asked for: never return more results than requested.
    return result;
  }

  // Second argument of Xapian's get_spelling_suggestion(): the maximum edit
  // distance between `word` and a candidate correction.
  const unsigned maxEditDistance = 3;
  const auto term = impl_.get_spelling_suggestion(word, maxEditDistance);
  if ( !term.empty() ) {
    result.push_back(term);
  }
  return result;
}
#endif // ENABLE_SPELLINGSDB

} // namespace suggestions

// Defaulted out of line, after the SpellingsDB class definition above.
// NOTE(review): presumably needed because the header only forward-declares
// suggestions::SpellingsDB while holding it in a std::unique_ptr member.
// When ENABLE_SPELLINGSDB is not defined but LIBZIM_WITH_XAPIAN is,
// SpellingsDB stays incomplete at this point - verify that configuration
// still compiles.
SuggestionDataBase::~SuggestionDataBase() = default;

namespace
{

using namespace suggestions;

/**
 * Collect one term per document of the title database.
 *
 * For every document id from 1 up to the last document id of
 * `db.m_database`, value slot 0 of the document (used here as the title) is
 * recorded with a frequency of 1. Without Xapian support the collection
 * stays empty.
 *
 * @param db The suggestion database whose title index is scanned.
 * @return The collected terms, sorted with TermWithFreq::dictionaryPred.
 */
TermCollection getAllTerms(const SuggestionDataBase& db) {
  TermCollection allTerms;

#ifdef LIBZIM_WITH_XAPIAN
  const Xapian::Database& titleDb = db.m_database;
  // The upper bound is loop-invariant: query it once instead of on every
  // iteration.
  const Xapian::docid lastDocId = titleDb.get_lastdocid();
  for (Xapian::docid docid = 1; docid <= lastDocId; ++docid) {
    const auto doc = titleDb.get_document(docid);
    // Build the entry straight from the returned temporary so that the title
    // string is moved rather than copied.
    allTerms.push_back(TermWithFreq{doc.get_value(0), 1});
  }
#endif // LIBZIM_WITH_XAPIAN

  std::sort(allTerms.begin(), allTerms.end(), TermWithFreq::dictionaryPred);
  return allTerms;
}

} // unnamed namespace

/**
 * Get spelling corrections for `word` from the spellings database.
 *
 * The spellings database is built on first use, under m_spellingsDBMutex,
 * from all suggestion terms. When spelling support is compiled out or this
 * object has no database, an empty vector is returned.
 *
 * @param word     The word to correct.
 * @param maxCount The maximum number of corrections to return.
 */
std::vector<std::string> SuggestionDataBase::getSpellingCorrections(
    const std::string& word,
    uint32_t maxCount) const
{
  std::vector<std::string> corrections;

#ifdef ENABLE_SPELLINGSDB
  if ( hasDatabase() ) {
    // The lock covers both the lazy construction and the lookup itself.
    std::lock_guard<std::mutex> guard(m_spellingsDBMutex);
    if ( m_spellingsDB == nullptr ) {
      m_spellingsDB.reset(new SpellingsDB(getAllSuggestionTerms()));
    }
    corrections = m_spellingsDB->getSpellingCorrections(word, maxCount);
  }
#endif // ENABLE_SPELLINGSDB

  return corrections;
}

/**
 * Get the cached collection of all suggestion terms.
 *
 * The collection is computed with getAllTerms() while holding
 * m_suggestionTermsMutex whenever the cache is still empty.
 *
 * @return A reference to the cached collection.
 */
const TermCollection& SuggestionDataBase::getAllSuggestionTerms() const
{
  std::lock_guard<std::mutex> guard(m_suggestionTermsMutex);
  if ( !m_suggestionTerms.empty() ) {
    return m_suggestionTerms;
  }

  // An empty cache is treated as "not computed yet".
  m_suggestionTerms = getAllTerms(*this);
  return m_suggestionTerms;
}

/**
 * Get spelling correction suggestions for the word being edited in the
 * query of this search.
 *
 * Each correction obtained from the internal database is spliced back into
 * the query and wrapped in a SuggestionItem whose first two constructor
 * arguments are empty strings.
 *
 * @param maxCount The maximum number of results to return.
 * @return The suggestions; empty when the edited word is empty.
 */
SuggestionSearch::Results SuggestionSearch::getSpellingSuggestions(uint32_t maxCount) const {
  QueryInfo parsedQuery(m_query);
  const std::string& editedWord = parsedQuery.wordBeingEdited();

  SuggestionSearch::Results items;
  if ( editedWord.empty() ) {
    // Nothing to correct (e.g. the query is empty or ends with a space).
    return items;
  }

  const auto corrections = mp_internalDb->getSpellingCorrections(editedWord, maxCount);
  for (const auto& correctedWord : corrections) {
    const auto fullQuery = parsedQuery.spellingSuggestion(correctedWord);
    items.push_back(SuggestionItem("", "", fullQuery));
  }
  return items;
}

const void SuggestionSearch::forceRangeSuggestion() {
#if defined(ENABLE_XAPIAN)
mp_internalDb->m_database.close();
Expand Down
34 changes: 34 additions & 0 deletions src/suggestion_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,13 +34,40 @@
namespace zim
{

namespace suggestions
{

/**
 * A term paired with its frequency, plus the two orderings used to sort
 * collections of such pairs.
 */
struct TermWithFreq
{
  std::string term;
  uint32_t freq;

  /** Ordering by descending frequency: true when `t1` is more frequent. */
  static bool freqPred(const TermWithFreq& t1, const TermWithFreq& t2) {
    return t2.freq < t1.freq;
  }

  /** Ordering by ascending term (std::string comparison). */
  static bool dictionaryPred(const TermWithFreq& t1, const TermWithFreq& t2) {
    return t1.term.compare(t2.term) < 0;
  }
};

// A sequence of terms with their frequencies.
typedef std::vector<TermWithFreq> TermCollection;
// Forward declaration only; the class is defined in the implementation file
// when spelling support (ENABLE_SPELLINGSDB) is enabled.
class SpellingsDB;

} // namespace suggestions

/**
* A class to encapsulate a xapian title index and its archive and all the
* information we can gather from it.
*/
class SuggestionDataBase {
public: // methods
SuggestionDataBase(const Archive& archive, bool verbose);
~SuggestionDataBase();

const suggestions::TermCollection& getAllSuggestionTerms() const;
std::vector<std::string> getSpellingCorrections(const std::string& word,
uint32_t maxCount) const;

public: // data
// The archive to get suggestions from.
Expand All @@ -52,6 +79,9 @@ class SuggestionDataBase {
private: // data
std::mutex m_mutex;

mutable std::mutex m_suggestionTermsMutex;
mutable suggestions::TermCollection m_suggestionTerms;

#if defined(LIBZIM_WITH_XAPIAN)

public: // xapian based methods
Expand All @@ -77,6 +107,10 @@ class SuggestionDataBase {

private:
void initXapianDb();

mutable std::mutex m_spellingsDBMutex;
mutable std::unique_ptr<suggestions::SpellingsDB> m_spellingsDB;

#endif // LIBZIM_WITH_XAPIAN
};

Expand Down
6 changes: 6 additions & 0 deletions src/tools.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,12 @@ namespace zim {
**/
uint32_t LIBZIM_PRIVATE_API randomNumber(uint32_t max);

/**
 * Tell whether `str` begins with `prefix`.
 *
 * An empty prefix matches every string; a prefix longer than `str` never
 * matches.
 */
inline bool startsWith(const std::string& str, const std::string& prefix)
{
  if ( str.length() < prefix.length() ) {
    return false;
  }
  // Compare only the leading prefix.length() characters of str.
  return str.compare(0, prefix.length(), prefix) == 0;
}

std::vector<std::string> split(const std::string & str,
const std::string & delims=" *-");

Expand Down
Loading
Loading