-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #36 from jelmervdl/fasttext-option
Add fasttext as an option alongside cld2
- Loading branch information
Showing
16 changed files
with
243 additions
and
120 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,70 +1,7 @@ | ||
#include "src/lang.hh" | ||
#include "lang.hh" | ||
|
||
namespace warc2text { | ||
// hint = {content language code(s), tld, original encoding, CLD2::Language} | ||
const CLD2::CLDHints NO_HINT = {nullptr, nullptr, CLD2::UNKNOWN_ENCODING, CLD2::UNKNOWN_LANGUAGE}; | ||
const std::string LanguageDetector::kUnknownLanguageLabel = "unk"; | ||
|
||
bool detectLanguage(const std::string& text, std::unordered_map<std::string, std::string>& text_by_lang){ | ||
CLD2::Language langs[3] = {CLD2::UNKNOWN_LANGUAGE, CLD2::UNKNOWN_LANGUAGE, CLD2::UNKNOWN_LANGUAGE}; | ||
int percents[3] = {0,0,0}; | ||
double scores[3] = {0.0, 0.0, 0.0}; | ||
|
||
bool reliable = false; | ||
int text_bytes; | ||
int valid_prefix_bytes; | ||
|
||
CLD2::ResultChunkVector chunks; | ||
|
||
CLD2::ExtDetectLanguageSummaryCheckUTF8(text.data(), text.size(), true, &NO_HINT, 0, &langs[0], &percents[0], &scores[0], &chunks, &text_bytes, &reliable, &valid_prefix_bytes); | ||
|
||
text_by_lang.clear(); | ||
|
||
if (not reliable) return reliable; | ||
|
||
std::string* top1 = nullptr; | ||
std::string* top2 = nullptr; | ||
std::string* top3 = nullptr; | ||
|
||
if (langs[0] != CLD2::UNKNOWN_LANGUAGE and percents[0] > 0) { | ||
top1 = &text_by_lang[CLD2::LanguageCode(langs[0])]; | ||
top1->reserve(text.size() * (percents[0] + 1)); | ||
} | ||
|
||
if (langs[1] != CLD2::UNKNOWN_LANGUAGE and percents[1] > 0) { | ||
top2 = &text_by_lang[CLD2::LanguageCode(langs[1])]; | ||
top2->reserve(text.size() * (percents[1] + 1)); | ||
} | ||
|
||
if (langs[2] != CLD2::UNKNOWN_LANGUAGE and percents[2] > 0) { | ||
top3 = &text_by_lang[CLD2::LanguageCode(langs[2])]; | ||
top3->reserve(text.size() * (percents[2] + 1)); | ||
} | ||
|
||
for (const CLD2::ResultChunk& chunk : chunks) { | ||
std::string* ref = static_cast<CLD2::Language>(chunk.lang1) == langs[0] ? top1 : | ||
static_cast<CLD2::Language>(chunk.lang1) == langs[1] ? top2 : | ||
static_cast<CLD2::Language>(chunk.lang1) == langs[2] ? top3 : nullptr; | ||
if (ref == nullptr) continue; | ||
ref->append(text, chunk.offset, chunk.bytes); | ||
} | ||
|
||
// remove empty texts from text_by_lang | ||
// apparently it is possible that the reported percentage is > 0, but the language does not appear in chunks | ||
for (auto it = text_by_lang.cbegin(); it != text_by_lang.cend(); ){ | ||
if (it->second.size() == 0) text_by_lang.erase(it++); | ||
else ++it; | ||
} | ||
|
||
// TODO: do something with the scores? | ||
|
||
return reliable; | ||
} | ||
|
||
bool detectLanguage(const std::string& text, std::string& lang){ | ||
bool reliable = false; | ||
int valid_prefix_bytes = 0; | ||
CLD2::Language l = CLD2::DetectLanguageCheckUTF8(text.data(), text.size(), true, &reliable, &valid_prefix_bytes); | ||
lang = CLD2::LanguageCode(l); | ||
return reliable; | ||
} | ||
} // namespace warc2text |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,18 +1,49 @@ | ||
#ifndef WARC2TEXT_LANG_HH | ||
#define WARC2TEXT_LANG_HH | ||
|
||
#include <memory> | ||
#include <string> | ||
#include <unordered_map> | ||
#include <utility> | ||
#include "cld2/public/compact_lang_det.h" | ||
#include "cld2/public/encodings.h" | ||
|
||
namespace fasttext { | ||
class FastText; | ||
} // namespace fasttext | ||
|
||
namespace warc2text { | ||
// detect language of plain text, return top 3 languages | ||
bool detectLanguage(const std::string& text, std::unordered_map<std::string, std::string>& chunks); | ||
|
||
// detect top language of plain text | ||
bool detectLanguage(const std::string& text, std::string& lang); | ||
} | ||
// Abstract interface for language identification back-ends.
class LanguageDetector {
public:
    // Label used for text (chunks) that cannot reliably be identified
    static const std::string kUnknownLanguageLabel;

    // Virtual so implementations can be destroyed through a base pointer.
    virtual ~LanguageDetector() = default;

    // Detect the language(s) of plain `text`. Implementations store the text
    // (or parts of it) in `chunks`, keyed by a language label.
    virtual void detect(const std::string& text, std::unordered_map<std::string, std::string>& chunks) const = 0;
};
|
||
class FastTextDetector : public LanguageDetector { | ||
public: | ||
explicit FastTextDetector(const std::string &filename); | ||
virtual ~FastTextDetector(); | ||
virtual void detect(const std::string& text, std::unordered_map<std::string, std::string>& chunks) const; | ||
|
||
private: | ||
std::unique_ptr<fasttext::FastText> classifier_; | ||
}; | ||
|
||
class CLD2Detector : public LanguageDetector { | ||
public: | ||
virtual void detect(const std::string& text, std::unordered_map<std::string, std::string>& chunks) const; | ||
virtual ~CLD2Detector(); | ||
}; | ||
|
||
class CLD2MultiLangDetector : public LanguageDetector { | ||
public: | ||
virtual void detect(const std::string& text, std::unordered_map<std::string, std::string>& chunks) const; | ||
virtual ~CLD2MultiLangDetector(); | ||
}; | ||
|
||
} // namespace warc2text | ||
|
||
#endif |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
#include "src/lang.hh" | ||
#include "cld2/public/compact_lang_det.h" | ||
#include "cld2/public/encodings.h" | ||
|
||
namespace warc2text { | ||
// hint = {content language code(s), tld, original encoding, CLD2::Language} | ||
const CLD2::CLDHints NO_HINT = {nullptr, nullptr, CLD2::UNKNOWN_ENCODING, CLD2::UNKNOWN_LANGUAGE}; | ||
|
||
// Nothing to release; defined out of line to match the declaration in the header.
CLD2Detector::~CLD2Detector() = default;
|
||
void CLD2Detector::detect(const std::string& text, std::unordered_map<std::string, std::string>& text_by_lang) const { | ||
bool reliable = false; | ||
int valid_prefix_bytes = 0; | ||
CLD2::Language l = CLD2::DetectLanguageCheckUTF8(text.data(), text.size(), true, &reliable, &valid_prefix_bytes); | ||
text_by_lang[reliable ? CLD2::LanguageCode(l) : kUnknownLanguageLabel] = text; | ||
} | ||
|
||
// Nothing to release; defined out of line to match the declaration in the header.
CLD2MultiLangDetector::~CLD2MultiLangDetector() = default;
|
||
void CLD2MultiLangDetector::detect(const std::string& text, std::unordered_map<std::string, std::string>& text_by_lang) const { | ||
CLD2::Language langs[3] = {CLD2::UNKNOWN_LANGUAGE, CLD2::UNKNOWN_LANGUAGE, CLD2::UNKNOWN_LANGUAGE}; | ||
int percents[3] = {0,0,0}; | ||
double scores[3] = {0.0, 0.0, 0.0}; | ||
|
||
bool reliable = false; | ||
int text_bytes; | ||
int valid_prefix_bytes; | ||
|
||
CLD2::ResultChunkVector chunks; | ||
|
||
CLD2::ExtDetectLanguageSummaryCheckUTF8(text.data(), text.size(), true, &NO_HINT, 0, &langs[0], &percents[0], &scores[0], &chunks, &text_bytes, &reliable, &valid_prefix_bytes); | ||
|
||
text_by_lang.clear(); | ||
|
||
if (not reliable) { | ||
text_by_lang[kUnknownLanguageLabel] = text; | ||
return; | ||
} | ||
|
||
std::string* top1 = nullptr; | ||
std::string* top2 = nullptr; | ||
std::string* top3 = nullptr; | ||
|
||
if (langs[0] != CLD2::UNKNOWN_LANGUAGE and percents[0] > 0) { | ||
top1 = &text_by_lang[CLD2::LanguageCode(langs[0])]; | ||
top1->reserve(text.size() * (percents[0] + 1)); | ||
} | ||
|
||
if (langs[1] != CLD2::UNKNOWN_LANGUAGE and percents[1] > 0) { | ||
top2 = &text_by_lang[CLD2::LanguageCode(langs[1])]; | ||
top2->reserve(text.size() * (percents[1] + 1)); | ||
} | ||
|
||
if (langs[2] != CLD2::UNKNOWN_LANGUAGE and percents[2] > 0) { | ||
top3 = &text_by_lang[CLD2::LanguageCode(langs[2])]; | ||
top3->reserve(text.size() * (percents[2] + 1)); | ||
} | ||
|
||
for (const CLD2::ResultChunk& chunk : chunks) { | ||
std::string* ref = static_cast<CLD2::Language>(chunk.lang1) == langs[0] ? top1 : | ||
static_cast<CLD2::Language>(chunk.lang1) == langs[1] ? top2 : | ||
static_cast<CLD2::Language>(chunk.lang1) == langs[2] ? top3 : nullptr; | ||
if (ref == nullptr) continue; | ||
ref->append(text, chunk.offset, chunk.bytes); | ||
} | ||
|
||
// remove empty texts from text_by_lang | ||
// apparently it is possible that the reported percentage is > 0, but the language does not appear in chunks | ||
for (auto it = text_by_lang.cbegin(); it != text_by_lang.cend(); ){ | ||
if (it->second.size() == 0) text_by_lang.erase(it++); | ||
else ++it; | ||
} | ||
|
||
// TODO: do something with the scores? | ||
} | ||
} // namespace warc2text |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
#include "src/lang.hh" | ||
|
||
#include "fasttext.h" | ||
#include "util/exception.hh" | ||
|
||
#include <cstdlib> | ||
#include <cstring> | ||
|
||
namespace warc2text { | ||
|
||
FastTextDetector::FastTextDetector(const std::string &filename) | ||
: classifier_(new fasttext::FastText) { | ||
classifier_->loadModel(filename); | ||
} | ||
|
||
// Defined in this translation unit, where fasttext::FastText is a complete
// type, so the std::unique_ptr member can destroy the classifier.
FastTextDetector::~FastTextDetector() = default;
|
||
const char kLabelPrefix[] = "__label__"; | ||
|
||
void FastTextDetector::detect(const std::string& text, std::unordered_map<std::string, std::string>& chunks) const { | ||
const float kThreshold = 0.5f; | ||
std::vector<int32_t> words, labels; | ||
classifier_->getDictionary()->getStringNoNewline(text, words, labels); | ||
fasttext::Predictions predictions; | ||
classifier_->predict(1, words, predictions, kThreshold); | ||
if (predictions.empty()) { | ||
chunks[kUnknownLanguageLabel] = text; | ||
return; | ||
} | ||
|
||
// Labels look like __label__eng | ||
std::string label = classifier_->getDictionary()->getLabel(predictions[0].second); | ||
UTIL_THROW_IF2(strncmp(label.c_str(), kLabelPrefix, sizeof(kLabelPrefix) - 1), "Was expecting text classifier labels to begin with " << kLabelPrefix << " but they look like " << label); | ||
label.erase(0, sizeof(kLabelPrefix) - 1); | ||
|
||
// For better or worse, we're currently doing everything as one chunk. | ||
chunks[label] = text; | ||
} | ||
|
||
} // namespace warc2text |
Oops, something went wrong.