Merge pull request #36 from jelmervdl/fasttext-option
Add fasttext as an option alongside cld2
lpla authored Mar 14, 2023
2 parents 8607c00 + 518d10a commit eac887e
Showing 16 changed files with 243 additions and 120 deletions.
3 changes: 3 additions & 0 deletions .gitmodules
@@ -6,3 +6,6 @@
path = cld2
url = https://github.com/bitextor/cld2.git
ignore = dirty
[submodule "fasttext"]
path = fasttext
url = https://github.com/kpuatfb/fastText.git
8 changes: 7 additions & 1 deletion CMakeLists.txt
@@ -2,9 +2,12 @@ cmake_minimum_required(VERSION 2.8.3)

project(warc2text)

set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD 17)


set(CMAKE_CXX_FLAGS_DEBUG "-g")
set(CMAKE_CXX_FLAGS_RELEASE "-O3")
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3")
set(CMAKE_CXX_FLAGS "-Wall -Wextra -DBOOST_LOG_DYN_LINK ${CMAKE_CXX_FLAGS}")
if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -I /usr/local/opt/icu4c/include")
@@ -41,10 +44,12 @@ if (NOT SKIP_PREPROCESS_BUILD)
endif()

target_include_directories(warc2text_lib PUBLIC ${PREPROCESS_PATH})
target_include_directories(warc2text_lib PRIVATE fasttext/src)
#

# add libcld2.so
add_subdirectory(cld2)
add_subdirectory(fasttext EXCLUDE_FROM_ALL)
#

# define executables
@@ -53,6 +58,7 @@ target_link_libraries(warc2text
warc2text_lib
${Boost_LIBRARIES}
cld2_full
fasttext-static
)

include(GNUInstallDirs)
6 changes: 6 additions & 0 deletions README.md
@@ -28,11 +28,15 @@ mkdir build
cd build
cmake -DCMAKE_INSTALL_PREFIX=/your/prefix/path ..
# cmake .. -DCMAKE_BUILD_TYPE=Debug # for debug
# cmake .. -DICU_ROOT_DIR=$(brew --prefix icu4c)/lib # for macOS
make -j
make install
```

## Usage

**Note:** for WARCs with many languages you might hit the open file limit quite quickly, so it is advisable to increase it, e.g. `ulimit -n 8192`.

```
warc2text -o <output_folder> [ -f <output_files> ] [ --pdfpass <output_warc> ]
[ --paragraph-identification ] [ --tag-filters <filters_file> ] <warc_file>...
@@ -41,6 +45,8 @@ warc2text -o <output_folder> [ -f <output_files> ] [ --pdfpass <output_warc> ]
* `--files`/`-f` list of output files separated by commas (and without `.gz`); `text` and `url` are always written, while `mime` and `html` are optional
* `--pdfpass` WARC file where PDF records will be stored
* `--paragraph-identification` print the paragraph identifier for each sentence extracted from the HTML
* `--classifier` classifier to use: `cld2` or `fasttext`
* `--fasttext-model` path to the FastText model used by the `fasttext` classifier (see the example invocation after this option list)
* `--tag-filters` file containing filters that are used to eliminate matching documents
* `--invert-tag-filters` output only documents that match the filter
* `--url-filters` file containing regular expressions that match urls of documents to eliminate
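
As an illustration, an invocation selecting the FastText classifier could look like the sketch below; the model path and WARC file name are placeholders for this example, not files provided by the repository.

```
warc2text -o output/ --classifier fasttext --fasttext-model /path/to/lid.bin crawl.warc.gz
```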
1 change: 1 addition & 0 deletions fasttext
Submodule fasttext added at ffee8e
2 changes: 2 additions & 0 deletions src/CMakeLists.txt
@@ -26,6 +26,8 @@ add_library(warc2text_lib
record.cc
html.cc
lang.cc
lang_cld2.cc
lang_fasttext.cc
util.cc
bilangwriter.cc
xh_scanner.cc
31 changes: 8 additions & 23 deletions src/bilangwriter.cc
@@ -1,5 +1,6 @@
#include "bilangwriter.hh"
#include "util.hh"
#include "util/exception.hh"
#include <cassert>
#include <string>

@@ -50,6 +51,7 @@ namespace warc2text{

void GzipWriter::open(const std::string& filename) {
dest = std::fopen(filename.c_str(), "wb");
UTIL_THROW_IF(!dest, util::ErrnoException, "while creating " << filename);
}

void GzipWriter::write(const char* text, std::size_t size) {
@@ -108,39 +110,22 @@ namespace warc2text{
return result;
}

void BilangWriter::write(const Record& record, bool multilang, bool paragraph_identification) {
void BilangWriter::write(const Record& record, bool paragraph_identification) {
std::string base64text;
std::string base64html;

if (multilang) {
if (output_files.count("html") == 1)
util::encodeBase64(record.getPayload(), base64html);

if (output_files.count("html") == 1)
util::encodeBase64(record.getPayload(), base64html);

for (const auto& it : record.getTextByLangs()) {
std::string payload = it.second;

if (paragraph_identification) {
payload = get_paragraph_id(payload);
}

util::encodeBase64(payload, base64text);
this->write(it.first, base64text, record.getURL(), record.getHTTPcontentType(), base64html);
}

} else {
std::string payload = record.getPlainText();
for (const auto& it : record.getTextByLangs()) {
std::string payload = it.second;

if (paragraph_identification) {
payload = get_paragraph_id(payload);
}

util::encodeBase64(payload, base64text);

if (output_files.count("html") == 1)
util::encodeBase64(record.getPayload(), base64html);

this->write(record.getLanguage(), base64text, record.getURL(), record.getHTTPcontentType(), base64html);
this->write(it.first, base64text, record.getURL(), record.getHTTPcontentType(), base64html);
}
}

2 changes: 1 addition & 1 deletion src/bilangwriter.hh
@@ -57,7 +57,7 @@ namespace warc2text {
output_files(output_files)
{};

void write(const Record& record, bool multilang = false, bool paragraph_identification = false);
void write(const Record& record, bool paragraph_identification = false);

};

69 changes: 3 additions & 66 deletions src/lang.cc
@@ -1,70 +1,7 @@
#include "src/lang.hh"
#include "lang.hh"

namespace warc2text {
// hint = {content language code(s), tld, original encoding, CLD2::Language}
const CLD2::CLDHints NO_HINT = {nullptr, nullptr, CLD2::UNKNOWN_ENCODING, CLD2::UNKNOWN_LANGUAGE};
const std::string LanguageDetector::kUnknownLanguageLabel = "unk";

bool detectLanguage(const std::string& text, std::unordered_map<std::string, std::string>& text_by_lang){
CLD2::Language langs[3] = {CLD2::UNKNOWN_LANGUAGE, CLD2::UNKNOWN_LANGUAGE, CLD2::UNKNOWN_LANGUAGE};
int percents[3] = {0,0,0};
double scores[3] = {0.0, 0.0, 0.0};

bool reliable = false;
int text_bytes;
int valid_prefix_bytes;

CLD2::ResultChunkVector chunks;

CLD2::ExtDetectLanguageSummaryCheckUTF8(text.data(), text.size(), true, &NO_HINT, 0, &langs[0], &percents[0], &scores[0], &chunks, &text_bytes, &reliable, &valid_prefix_bytes);

text_by_lang.clear();

if (not reliable) return reliable;

std::string* top1 = nullptr;
std::string* top2 = nullptr;
std::string* top3 = nullptr;

if (langs[0] != CLD2::UNKNOWN_LANGUAGE and percents[0] > 0) {
top1 = &text_by_lang[CLD2::LanguageCode(langs[0])];
top1->reserve(text.size() * (percents[0] + 1));
}

if (langs[1] != CLD2::UNKNOWN_LANGUAGE and percents[1] > 0) {
top2 = &text_by_lang[CLD2::LanguageCode(langs[1])];
top2->reserve(text.size() * (percents[1] + 1));
}

if (langs[2] != CLD2::UNKNOWN_LANGUAGE and percents[2] > 0) {
top3 = &text_by_lang[CLD2::LanguageCode(langs[2])];
top3->reserve(text.size() * (percents[2] + 1));
}

for (const CLD2::ResultChunk& chunk : chunks) {
std::string* ref = static_cast<CLD2::Language>(chunk.lang1) == langs[0] ? top1 :
static_cast<CLD2::Language>(chunk.lang1) == langs[1] ? top2 :
static_cast<CLD2::Language>(chunk.lang1) == langs[2] ? top3 : nullptr;
if (ref == nullptr) continue;
ref->append(text, chunk.offset, chunk.bytes);
}

// remove empty texts from text_by_lang
// apparently it is possible that the reported percentage is > 0, but the language does not appear in chunks
for (auto it = text_by_lang.cbegin(); it != text_by_lang.cend(); ){
if (it->second.size() == 0) text_by_lang.erase(it++);
else ++it;
}

// TODO: do something with the scores?

return reliable;
}

bool detectLanguage(const std::string& text, std::string& lang){
bool reliable = false;
int valid_prefix_bytes = 0;
CLD2::Language l = CLD2::DetectLanguageCheckUTF8(text.data(), text.size(), true, &reliable, &valid_prefix_bytes);
lang = CLD2::LanguageCode(l);
return reliable;
}
} // namespace warc2text
47 changes: 39 additions & 8 deletions src/lang.hh
@@ -1,18 +1,49 @@
#ifndef WARC2TEXT_LANG_HH
#define WARC2TEXT_LANG_HH

#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include "cld2/public/compact_lang_det.h"
#include "cld2/public/encodings.h"

namespace fasttext {
class FastText;
} // namespace fasttext

namespace warc2text {
// detect language of plain text, return top 3 languages
bool detectLanguage(const std::string& text, std::unordered_map<std::string, std::string>& chunks);

// detect top language of plain text
bool detectLanguage(const std::string& text, std::string& lang);
}
class LanguageDetector {
public:
virtual ~LanguageDetector() {};

// detect language of plain text, return top languages
virtual void detect(const std::string& text, std::unordered_map<std::string, std::string>& chunks) const = 0;

// Label used for text (chunks) that cannot reliably be identified
static const std::string kUnknownLanguageLabel;
};

class FastTextDetector : public LanguageDetector {
public:
explicit FastTextDetector(const std::string &filename);
virtual ~FastTextDetector();
virtual void detect(const std::string& text, std::unordered_map<std::string, std::string>& chunks) const;

private:
std::unique_ptr<fasttext::FastText> classifier_;
};

class CLD2Detector : public LanguageDetector {
public:
virtual void detect(const std::string& text, std::unordered_map<std::string, std::string>& chunks) const;
virtual ~CLD2Detector();
};

class CLD2MultiLangDetector : public LanguageDetector {
public:
virtual void detect(const std::string& text, std::unordered_map<std::string, std::string>& chunks) const;
virtual ~CLD2MultiLangDetector();
};

} // namespace warc2text

#endif
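
The header above leaves the choice of detector to the caller. As a rough, hypothetical sketch (not code from this PR), a small factory could map the `--classifier` option onto these classes; the function name, the `multilang` flag, and the option plumbing are assumptions made only for illustration.

```cpp
#include <memory>
#include <stdexcept>
#include <string>

#include "src/lang.hh"

// Hypothetical helper: pick a LanguageDetector implementation from the
// --classifier / --fasttext-model options. `multilang` is assumed here to
// select the multi-language CLD2 variant; the real option handling may differ.
std::unique_ptr<warc2text::LanguageDetector> makeDetector(const std::string& classifier,
                                                          const std::string& fasttext_model,
                                                          bool multilang) {
    if (classifier == "fasttext")
        return std::make_unique<warc2text::FastTextDetector>(fasttext_model);
    if (classifier == "cld2" && multilang)
        return std::make_unique<warc2text::CLD2MultiLangDetector>();
    if (classifier == "cld2")
        return std::make_unique<warc2text::CLD2Detector>();
    throw std::invalid_argument("unknown --classifier value: " + classifier);
}
```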
76 changes: 76 additions & 0 deletions src/lang_cld2.cc
@@ -0,0 +1,76 @@
#include "src/lang.hh"
#include "cld2/public/compact_lang_det.h"
#include "cld2/public/encodings.h"

namespace warc2text {
// hint = {content language code(s), tld, original encoding, CLD2::Language}
const CLD2::CLDHints NO_HINT = {nullptr, nullptr, CLD2::UNKNOWN_ENCODING, CLD2::UNKNOWN_LANGUAGE};

CLD2Detector::~CLD2Detector() {}

void CLD2Detector::detect(const std::string& text, std::unordered_map<std::string, std::string>& text_by_lang) const {
bool reliable = false;
int valid_prefix_bytes = 0;
CLD2::Language l = CLD2::DetectLanguageCheckUTF8(text.data(), text.size(), true, &reliable, &valid_prefix_bytes);
text_by_lang[reliable ? CLD2::LanguageCode(l) : kUnknownLanguageLabel] = text;
}

CLD2MultiLangDetector::~CLD2MultiLangDetector() {}

void CLD2MultiLangDetector::detect(const std::string& text, std::unordered_map<std::string, std::string>& text_by_lang) const {
CLD2::Language langs[3] = {CLD2::UNKNOWN_LANGUAGE, CLD2::UNKNOWN_LANGUAGE, CLD2::UNKNOWN_LANGUAGE};
int percents[3] = {0,0,0};
double scores[3] = {0.0, 0.0, 0.0};

bool reliable = false;
int text_bytes;
int valid_prefix_bytes;

CLD2::ResultChunkVector chunks;

CLD2::ExtDetectLanguageSummaryCheckUTF8(text.data(), text.size(), true, &NO_HINT, 0, &langs[0], &percents[0], &scores[0], &chunks, &text_bytes, &reliable, &valid_prefix_bytes);

text_by_lang.clear();

if (not reliable) {
text_by_lang[kUnknownLanguageLabel] = text;
return;
}

std::string* top1 = nullptr;
std::string* top2 = nullptr;
std::string* top3 = nullptr;

if (langs[0] != CLD2::UNKNOWN_LANGUAGE and percents[0] > 0) {
top1 = &text_by_lang[CLD2::LanguageCode(langs[0])];
top1->reserve(text.size() * (percents[0] + 1));
}

if (langs[1] != CLD2::UNKNOWN_LANGUAGE and percents[1] > 0) {
top2 = &text_by_lang[CLD2::LanguageCode(langs[1])];
top2->reserve(text.size() * (percents[1] + 1));
}

if (langs[2] != CLD2::UNKNOWN_LANGUAGE and percents[2] > 0) {
top3 = &text_by_lang[CLD2::LanguageCode(langs[2])];
top3->reserve(text.size() * (percents[2] + 1));
}

for (const CLD2::ResultChunk& chunk : chunks) {
std::string* ref = static_cast<CLD2::Language>(chunk.lang1) == langs[0] ? top1 :
static_cast<CLD2::Language>(chunk.lang1) == langs[1] ? top2 :
static_cast<CLD2::Language>(chunk.lang1) == langs[2] ? top3 : nullptr;
if (ref == nullptr) continue;
ref->append(text, chunk.offset, chunk.bytes);
}

// remove empty texts from text_by_lang
// apparently it is possible that the reported percentage is > 0, but the language does not appear in chunks
for (auto it = text_by_lang.cbegin(); it != text_by_lang.cend(); ){
if (it->second.size() == 0) text_by_lang.erase(it++);
else ++it;
}

// TODO: do something with the scores?
}
} // namespace warc2text
40 changes: 40 additions & 0 deletions src/lang_fasttext.cc
@@ -0,0 +1,40 @@
#include "src/lang.hh"

#include "fasttext.h"
#include "util/exception.hh"

#include <cstdlib>
#include <cstring>

namespace warc2text {

FastTextDetector::FastTextDetector(const std::string &filename)
: classifier_(new fasttext::FastText) {
classifier_->loadModel(filename);
}

FastTextDetector::~FastTextDetector() {}

const char kLabelPrefix[] = "__label__";

void FastTextDetector::detect(const std::string& text, std::unordered_map<std::string, std::string>& chunks) const {
const float kThreshold = 0.5f;
std::vector<int32_t> words, labels;
classifier_->getDictionary()->getStringNoNewline(text, words, labels);
fasttext::Predictions predictions;
classifier_->predict(1, words, predictions, kThreshold);
if (predictions.empty()) {
chunks[kUnknownLanguageLabel] = text;
return;
}

// Labels look like __label__eng
std::string label = classifier_->getDictionary()->getLabel(predictions[0].second);
UTIL_THROW_IF2(strncmp(label.c_str(), kLabelPrefix, sizeof(kLabelPrefix) - 1), "Was expecting text classifier labels to begin with " << kLabelPrefix << " but they look like " << label);
label.erase(0, sizeof(kLabelPrefix) - 1);

// For better or worse, we're currently doing everything as one chunk.
chunks[label] = text;
}

} // namespace warc2text
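
For completeness, a minimal, hypothetical driver showing how the new detector is meant to be called; the model path and sample sentence are placeholders and the snippet is not part of this commit. `detect()` fills the map with a single entry, keyed either by the predicted label (e.g. `eng`) or by `unk` when the top prediction falls below the 0.5 threshold used above.

```cpp
#include <iostream>
#include <string>
#include <unordered_map>

#include "src/lang.hh"

int main() {
    // Placeholder model path; any fastText language-identification model.
    warc2text::FastTextDetector detector("/path/to/lid.bin");

    std::unordered_map<std::string, std::string> text_by_lang;
    detector.detect("Esto es una frase de ejemplo en español.", text_by_lang);

    // One chunk per detected language label (here: the whole text).
    for (const auto& entry : text_by_lang)
        std::cout << entry.first << "\t" << entry.second.size() << " bytes\n";
    return 0;
}
```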