diff --git a/.gitmodules b/.gitmodules
index 017fa01..7a8a2a7 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -6,3 +6,6 @@
 	path = cld2
 	url = https://github.com/bitextor/cld2.git
 	ignore = dirty
+[submodule "fasttext"]
+	path = fasttext
+	url = https://github.com/kpuatfb/fastText.git
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9e90b00..f3c3ed5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2,9 +2,12 @@ cmake_minimum_required(VERSION 2.8.3)
 
 project(warc2text)
 
-set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_FLAGS_DEBUG "-g")
+set(CMAKE_CXX_FLAGS_RELEASE "-O3")
+set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3")
 set(CMAKE_CXX_FLAGS "-Wall -Wextra -DBOOST_LOG_DYN_LINK ${CMAKE_CXX_FLAGS}")
 if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -I /usr/local/opt/icu4c/include")
@@ -41,10 +44,12 @@ if (NOT SKIP_PREPROCESS_BUILD)
 endif()
 
 target_include_directories(warc2text_lib PUBLIC ${PREPROCESS_PATH})
+target_include_directories(warc2text_lib PRIVATE fasttext/src)
 
 #
 # add libcld2.so
 add_subdirectory(cld2)
+add_subdirectory(fasttext EXCLUDE_FROM_ALL)
 
 #
 # define executables
@@ -53,6 +58,7 @@ target_link_libraries(warc2text
     warc2text_lib
     ${Boost_LIBRARIES}
     cld2_full
+    fasttext-static
 )
 
 include(GNUInstallDirs)
diff --git a/README.md b/README.md
index 2aad126..93c2efe 100644
--- a/README.md
+++ b/README.md
@@ -28,11 +28,15 @@ mkdir build
 cd build
 cmake -DCMAKE_INSTALL_PREFIX=/your/prefix/path ..
 # cmake .. -DCMAKE_BUILD_TYPE=Debug # for debug
+# cmake .. -DICU_ROOT_DIR=$(brew --prefix icu4c)/lib # for macOS
 make -j
 make install
 ```
 
 ## Usage
+
+**Note:** for WARCs with many languages you might hit the open file limit quite quickly, so it is advisable to raise it, e.g. `ulimit -n 8192`.
+
 ```
 warc2text -o <output_folder> [ -f <output_files> ] [ --pdfpass <output_warc> ] [ --paragraph-identification ] [ --tag-filters <filters_file> ] <warc_file> ...
 ```
@@ -41,6 +45,8 @@
 * `--files`/`-f` list of output files separated by commas (and without `.gz`); `text` and `url` are always written, while `mime` and `html` are optional
 * `--pdfpass` WARC file where PDF records will be stored
 * `--paragraph-identification` print the paragraph identifier for each sentence extracted from the HTML
+* `--classifier` classifier to use: `cld2` or `fasttext`
+* `--fasttext-model` path to the FastText model for the `fasttext` classifier
 * `--tag-filters` file containing filters that are used to eliminate matching documents
 * `--invert-tag-filters` output only documents that match the filter
 * `--url-filters` file containing regular expressions that match urls of documents to eliminate
diff --git a/fasttext b/fasttext
new file mode 160000
index 0000000..ffee8e4
--- /dev/null
+++ b/fasttext
@@ -0,0 +1 @@
+Subproject commit ffee8e4d72a4d2ecd859575007877d12acbee5b3
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index cfdd65b..6db72e0 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -26,6 +26,8 @@ add_library(warc2text_lib
     record.cc
     html.cc
     lang.cc
+    lang_cld2.cc
+    lang_fasttext.cc
    util.cc
     bilangwriter.cc
     xh_scanner.cc
diff --git a/src/bilangwriter.cc b/src/bilangwriter.cc
index a6f1936..e1b71cb 100644
--- a/src/bilangwriter.cc
+++ b/src/bilangwriter.cc
@@ -1,5 +1,6 @@
 #include "bilangwriter.hh"
 #include "util.hh"
+#include "util/exception.hh"
 #include <cassert>
 #include <string.h>
@@ -50,6 +51,7 @@ namespace warc2text{
 
     void GzipWriter::open(const std::string& filename) {
         dest = std::fopen(filename.c_str(), "wb");
+        UTIL_THROW_IF(!dest, util::ErrnoException, "while creating " << filename);
     }
 
     void GzipWriter::write(const char* text, std::size_t size) {
@@ -108,39 +110,22 @@ namespace warc2text{
         return result;
     }
 
-    void BilangWriter::write(const Record& record, bool multilang, bool paragraph_identification) {
+    void BilangWriter::write(const Record& record, bool paragraph_identification) {
         std::string base64text;
         std::string base64html;
-        if (multilang) {
+        if (output_files.count("html") == 1)
+            util::encodeBase64(record.getPayload(), base64html);
 
-            if (output_files.count("html") == 1)
-                util::encodeBase64(record.getPayload(), base64html);
-
-            for (const auto& it : record.getTextByLangs()) {
-                std::string payload = it.second;
-
-                if (paragraph_identification) {
-                    payload = get_paragraph_id(payload);
-                }
-
-                util::encodeBase64(payload, base64text);
-                this->write(it.first, base64text, record.getURL(), record.getHTTPcontentType(), base64html);
-            }
-
-        } else {
-            std::string payload = record.getPlainText();
+        for (const auto& it : record.getTextByLangs()) {
+            std::string payload = it.second;
 
             if (paragraph_identification) {
                 payload = get_paragraph_id(payload);
             }
 
             util::encodeBase64(payload, base64text);
-
-            if (output_files.count("html") == 1)
-                util::encodeBase64(record.getPayload(), base64html);
-
-            this->write(record.getLanguage(), base64text, record.getURL(), record.getHTTPcontentType(), base64html);
+            this->write(it.first, base64text, record.getURL(), record.getHTTPcontentType(), base64html);
         }
     }
 
diff --git a/src/bilangwriter.hh b/src/bilangwriter.hh
index 3b520f6..1b7f83e 100644
--- a/src/bilangwriter.hh
+++ b/src/bilangwriter.hh
@@ -57,7 +57,7 @@ namespace warc2text {
                 output_files(output_files)
             {};
 
-            void write(const Record& record, bool multilang = false, bool paragraph_identification = false);
+            void write(const Record& record, bool paragraph_identification = false);
 
     };
 
diff --git a/src/lang.cc b/src/lang.cc
index ff57913..19e52f0 100644
--- a/src/lang.cc
+++ b/src/lang.cc
@@ -1,70 +1,7 @@
-#include "src/lang.hh"
+#include "lang.hh"
 
 namespace warc2text {
-    // hint = {content language code(s), tld, original encoding, CLD2::Language}
-    const CLD2::CLDHints NO_HINT = {nullptr, nullptr, CLD2::UNKNOWN_ENCODING, CLD2::UNKNOWN_LANGUAGE};
+
+const std::string LanguageDetector::kUnknownLanguageLabel = "unk";
 
-    bool detectLanguage(const std::string& text, std::unordered_map<std::string, std::string>& text_by_lang){
-        CLD2::Language langs[3] = {CLD2::UNKNOWN_LANGUAGE, CLD2::UNKNOWN_LANGUAGE, CLD2::UNKNOWN_LANGUAGE};
-        int percents[3] = {0,0,0};
-        double scores[3] = {0.0, 0.0, 0.0};
-
-        bool reliable = false;
-        int text_bytes;
-        int valid_prefix_bytes;
-
-        CLD2::ResultChunkVector chunks;
-
-        CLD2::ExtDetectLanguageSummaryCheckUTF8(text.data(), text.size(), true, &NO_HINT, 0, &langs[0], &percents[0], &scores[0], &chunks, &text_bytes, &reliable, &valid_prefix_bytes);
-
-        text_by_lang.clear();
-
-        if (not reliable) return reliable;
-
-        std::string* top1 = nullptr;
-        std::string* top2 = nullptr;
-        std::string* top3 = nullptr;
-
-        if (langs[0] != CLD2::UNKNOWN_LANGUAGE and percents[0] > 0) {
-            top1 = &text_by_lang[CLD2::LanguageCode(langs[0])];
-            top1->reserve(text.size() * (percents[0] + 1));
-        }
-
-        if (langs[1] != CLD2::UNKNOWN_LANGUAGE and percents[1] > 0) {
-            top2 = &text_by_lang[CLD2::LanguageCode(langs[1])];
-            top2->reserve(text.size() * (percents[1] + 1));
-        }
-
-        if (langs[2] != CLD2::UNKNOWN_LANGUAGE and percents[2] > 0) {
-            top3 = &text_by_lang[CLD2::LanguageCode(langs[2])];
-            top3->reserve(text.size() * (percents[2] + 1));
-        }
-
-        for (const CLD2::ResultChunk& chunk : chunks) {
-            std::string* ref = static_cast<CLD2::Language>(chunk.lang1) == langs[0] ? top1 :
-                               static_cast<CLD2::Language>(chunk.lang1) == langs[1] ? top2 :
-                               static_cast<CLD2::Language>(chunk.lang1) == langs[2] ? top3 : nullptr;
-            if (ref == nullptr) continue;
-            ref->append(text, chunk.offset, chunk.bytes);
-        }
-
-        // remove empty texts from text_by_lang
-        // apparently it is possible that the reported percentage is > 0, but the language does not appear in chunks
-        for (auto it = text_by_lang.cbegin(); it != text_by_lang.cend(); ){
-            if (it->second.size() == 0) text_by_lang.erase(it++);
-            else ++it;
-        }
-
-        // TODO: do something with the scores?
-
-        return reliable;
-    }
-
-    bool detectLanguage(const std::string& text, std::string& lang){
-        bool reliable = false;
-        int valid_prefix_bytes = 0;
-        CLD2::Language l = CLD2::DetectLanguageCheckUTF8(text.data(), text.size(), true, &reliable, &valid_prefix_bytes);
-        lang = CLD2::LanguageCode(l);
-        return reliable;
-    }
 } // namespace warc2text
diff --git a/src/lang.hh b/src/lang.hh
index a00a4bb..3552892 100644
--- a/src/lang.hh
+++ b/src/lang.hh
@@ -1,18 +1,49 @@
 #ifndef WARC2TEXT_LANG_HH
 #define WARC2TEXT_LANG_HH
 
+#include <memory>
 #include <string>
 #include <unordered_map>
-#include <unordered_set>
-#include "cld2/public/compact_lang_det.h"
-#include "cld2/public/encodings.h"
+
+namespace fasttext {
+class FastText;
+} // namespace fasttext
 
 namespace warc2text {
-    // detect language of plain text, return top 3 languages
-    bool detectLanguage(const std::string& text, std::unordered_map<std::string, std::string>& chunks);
-    // detect top language of plain text
-    bool detectLanguage(const std::string& text, std::string& lang);
-}
+class LanguageDetector {
+  public:
+    virtual ~LanguageDetector() {};
+
+    // detect language of plain text, return top languages
+    virtual void detect(const std::string& text, std::unordered_map<std::string, std::string>& chunks) const = 0;
+
+    // Label used for text (chunks) that cannot reliably be identified
+    static const std::string kUnknownLanguageLabel;
+};
+
+class FastTextDetector : public LanguageDetector {
+  public:
+    explicit FastTextDetector(const std::string &filename);
+    virtual ~FastTextDetector();
+    virtual void detect(const std::string& text, std::unordered_map<std::string, std::string>& chunks) const;
+
+  private:
+    std::unique_ptr<fasttext::FastText> classifier_;
+};
+
+class CLD2Detector : public LanguageDetector {
+public:
+    virtual void detect(const std::string& text, std::unordered_map<std::string, std::string>& chunks) const;
+    virtual ~CLD2Detector();
+};
+
+class CLD2MultiLangDetector : public LanguageDetector {
+public:
+    virtual void detect(const std::string& text, std::unordered_map<std::string, std::string>& chunks) const;
+    virtual ~CLD2MultiLangDetector();
+};
+
+} // namespace warc2text
 
 #endif
diff --git a/src/lang_cld2.cc b/src/lang_cld2.cc
new file mode 100644
index 0000000..30e8cd9
--- /dev/null
+++ b/src/lang_cld2.cc
@@ -0,0 +1,76 @@
+#include "src/lang.hh"
+#include "cld2/public/compact_lang_det.h"
+#include "cld2/public/encodings.h"
+
+namespace warc2text {
+    // hint = {content language code(s), tld, original encoding, CLD2::Language}
+    const CLD2::CLDHints NO_HINT = {nullptr, nullptr, CLD2::UNKNOWN_ENCODING, CLD2::UNKNOWN_LANGUAGE};
+
+    CLD2Detector::~CLD2Detector() {}
+
+    void CLD2Detector::detect(const std::string& text, std::unordered_map<std::string, std::string>& text_by_lang) const {
+        bool reliable = false;
+        int valid_prefix_bytes = 0;
+        CLD2::Language l = CLD2::DetectLanguageCheckUTF8(text.data(), text.size(), true, &reliable, &valid_prefix_bytes);
+        text_by_lang[reliable ? CLD2::LanguageCode(l) : kUnknownLanguageLabel] = text;
+    }
+
+    CLD2MultiLangDetector::~CLD2MultiLangDetector() {}
+
+    void CLD2MultiLangDetector::detect(const std::string& text, std::unordered_map<std::string, std::string>& text_by_lang) const {
+        CLD2::Language langs[3] = {CLD2::UNKNOWN_LANGUAGE, CLD2::UNKNOWN_LANGUAGE, CLD2::UNKNOWN_LANGUAGE};
+        int percents[3] = {0,0,0};
+        double scores[3] = {0.0, 0.0, 0.0};
+
+        bool reliable = false;
+        int text_bytes;
+        int valid_prefix_bytes;
+
+        CLD2::ResultChunkVector chunks;
+
+        CLD2::ExtDetectLanguageSummaryCheckUTF8(text.data(), text.size(), true, &NO_HINT, 0, &langs[0], &percents[0], &scores[0], &chunks, &text_bytes, &reliable, &valid_prefix_bytes);
+
+        text_by_lang.clear();
+
+        if (not reliable) {
+            text_by_lang[kUnknownLanguageLabel] = text;
+            return;
+        }
+
+        std::string* top1 = nullptr;
+        std::string* top2 = nullptr;
+        std::string* top3 = nullptr;
+
+        if (langs[0] != CLD2::UNKNOWN_LANGUAGE and percents[0] > 0) {
+            top1 = &text_by_lang[CLD2::LanguageCode(langs[0])];
+            top1->reserve(text.size() * (percents[0] + 1));
+        }
+
+        if (langs[1] != CLD2::UNKNOWN_LANGUAGE and percents[1] > 0) {
+            top2 = &text_by_lang[CLD2::LanguageCode(langs[1])];
+            top2->reserve(text.size() * (percents[1] + 1));
+        }
+
+        if (langs[2] != CLD2::UNKNOWN_LANGUAGE and percents[2] > 0) {
+            top3 = &text_by_lang[CLD2::LanguageCode(langs[2])];
+            top3->reserve(text.size() * (percents[2] + 1));
+        }
+
+        for (const CLD2::ResultChunk& chunk : chunks) {
+            std::string* ref = static_cast<CLD2::Language>(chunk.lang1) == langs[0] ? top1 :
+                               static_cast<CLD2::Language>(chunk.lang1) == langs[1] ? top2 :
+                               static_cast<CLD2::Language>(chunk.lang1) == langs[2] ? top3 : nullptr;
+            if (ref == nullptr) continue;
+            ref->append(text, chunk.offset, chunk.bytes);
+        }
+
+        // remove empty texts from text_by_lang
+        // apparently it is possible that the reported percentage is > 0, but the language does not appear in chunks
+        for (auto it = text_by_lang.cbegin(); it != text_by_lang.cend(); ){
+            if (it->second.size() == 0) text_by_lang.erase(it++);
+            else ++it;
+        }
+
+        // TODO: do something with the scores?
+    }
+} // namespace warc2text
diff --git a/src/lang_fasttext.cc b/src/lang_fasttext.cc
new file mode 100644
index 0000000..0f6f955
--- /dev/null
+++ b/src/lang_fasttext.cc
@@ -0,0 +1,40 @@
+#include "src/lang.hh"
+
+#include "fasttext.h"
+#include "util/exception.hh"
+
+#include <cstring>
+#include <vector>
+
+namespace warc2text {
+
+FastTextDetector::FastTextDetector(const std::string &filename)
+  : classifier_(new fasttext::FastText) {
+    classifier_->loadModel(filename);
+}
+
+FastTextDetector::~FastTextDetector() {}
+
+const char kLabelPrefix[] = "__label__";
+
+void FastTextDetector::detect(const std::string& text, std::unordered_map<std::string, std::string>& chunks) const {
+    const float kThreshold = 0.5f;
+    std::vector<std::string> words, labels;
+    classifier_->getDictionary()->getStringNoNewline(text, words, labels);
+    fasttext::Predictions predictions;
+    classifier_->predict(1, words, predictions, kThreshold);
+    if (predictions.empty()) {
+        chunks[kUnknownLanguageLabel] = text;
+        return;
+    }
+
+    // Labels look like __label__eng
+    std::string label = classifier_->getDictionary()->getLabel(predictions[0].second);
+    UTIL_THROW_IF2(strncmp(label.c_str(), kLabelPrefix, sizeof(kLabelPrefix) - 1), "Was expecting text classifier labels to begin with " << kLabelPrefix << " but they look like " << label);
+    label.erase(0, sizeof(kLabelPrefix) - 1);
+
+    // For better or worse, we're currently doing everything as one chunk.
+    chunks[label] = text;
+}
+
+} // namespace warc2text
diff --git a/src/record.cc b/src/record.cc
index b398f61..57d2b75 100644
--- a/src/record.cc
+++ b/src/record.cc
@@ -239,10 +239,8 @@ namespace warc2text {
         return text_by_langs;
     }
 
-    int Record::detectLanguage(bool multilang){
-        if (not multilang) return warc2text::detectLanguage(plaintext, language);
-
-        warc2text::detectLanguage(plaintext, text_by_langs);
+    int Record::detectLanguage(LanguageDetector const &detector){
+        detector.detect(plaintext, text_by_langs);
         return text_by_langs.size();
     }
 
diff --git a/src/record.hh b/src/record.hh
index b66a740..13d9c7e 100644
--- a/src/record.hh
+++ b/src/record.hh
@@ -38,7 +38,7 @@ namespace warc2text {
             int cleanPayload();
             int cleanPayload(const util::umap_tag_filters_regex& tagFilters);
-            int detectLanguage(bool multilang);
+            int detectLanguage(LanguageDetector const &detector);
 
             static std::string readZipPayload(const std::string& content_type, const std::string& payload);
             static std::string isPayloadZip(const std::string& content_type, const std::string& uri);
diff --git a/src/warcpreprocessor.cc b/src/warcpreprocessor.cc
index f87fef0..1d90079 100644
--- a/src/warcpreprocessor.cc
+++ b/src/warcpreprocessor.cc
@@ -1,4 +1,5 @@
 #include "warcpreprocessor.hh"
+#include "src/lang.hh"
 #include "zipreader.hh"
 #include "util/compress.hh"
 #include <boost/log/trivial.hpp>
@@ -8,10 +9,12 @@ namespace warc2text {
     const std::unordered_set<std::string> WARCPreprocessor::removeExtensions = {".jpg", ".jpeg", ".gif", ".png", ".css", ".js", ".mp3", ".mp4", ".flv", ".wmv", ".gz", ".zip", ".rar" };
 
-    WARCPreprocessor::WARCPreprocessor(const std::string& outputFolder, const std::unordered_set<std::string>& output_files,
+    WARCPreprocessor::WARCPreprocessor(const LanguageDetector &detector,
+                                       const std::string& outputFolder, const std::unordered_set<std::string>& output_files,
                                        const std::string& pdf_warc_filename, const std::string& tagFiltersFile, bool invert,
-                                       const std::string& urlFiltersFile, bool multilang, bool encodeURLs,
+                                       const std::string& urlFiltersFile, bool encodeURLs,
                                        bool paragraph_identification) :
+        detector(detector),
         writer(outputFolder, output_files),
         totalRecords(0),
         textRecords(0),
@@ -22,7 +25,6 @@ namespace warc2text {
         tagFilters(),
         pdf_warc_filename(pdf_warc_filename),
         invert(invert),
-        multilang(multilang),
         encodeURLs(encodeURLs),
         paragraph_identification(paragraph_identification) {
         if (!tagFiltersFile.empty())
@@ -146,21 +148,28 @@ namespace warc2text {
             ++textRecords;
             textBytes += record.getPlainText().size();
 
-            n_langs = record.detectLanguage(multilang);
-            if (n_langs == 1) {
-                langBytes += record.getPlainText().size();
-            } else if (n_langs > 1) {
+            record.detectLanguage(detector);
+            n_langs = 0;
+            for (auto const &chunk : record.getTextByLangs()) {
+                // Don't count the unknown language chunks
+                if (chunk.first == LanguageDetector::kUnknownLanguageLabel)
+                    continue;
+
+                langBytes += chunk.second.size();
+                ++n_langs;
+            }
+
+            if (n_langs > 1) {
                 BOOST_LOG_TRIVIAL(trace) << "Record " << record.getURL() << ": multiple (" << n_langs << ") languages detected";
-                for (auto it : record.getTextByLangs())
-                    langBytes += it.second.size();
+            } else if (n_langs == 1) {
+            } else {
                 BOOST_LOG_TRIVIAL(trace) << "Record " << record.getURL() << ": language not detected";
-                continue;
             }
 
             langRecords += n_langs;
 
-            writer.write(record, multilang, paragraph_identification);
+            writer.write(record, paragraph_identification);
         }
 
         pdf_warc_writer.close();
     }
diff --git a/src/warcpreprocessor.hh b/src/warcpreprocessor.hh
index 8e8548f..b332523 100644
--- a/src/warcpreprocessor.hh
+++ b/src/warcpreprocessor.hh
@@ -2,6 +2,7 @@
 #define WARC2TEXT_WARCPREPROCESSOR_HH
 
 #include "record.hh"
+#include "src/lang.hh"
 #include "warcreader.hh"
 #include "bilangwriter.hh"
 #include "util.hh"
@@ -24,6 +25,7 @@ namespace warc2text {
 
     class WARCPreprocessor {
         private:
+            LanguageDetector const &detector;
             BilangWriter writer;
             unsigned int totalRecords;
             unsigned int textRecords;
@@ -35,7 +37,6 @@ namespace warc2text {
             boost::regex urlFilter;
             std::string pdf_warc_filename;
             bool invert;
-            bool multilang;
             bool encodeURLs;
             bool paragraph_identification;
 
@@ -43,9 +44,10 @@ namespace warc2text {
             bool URLfilter(const std::string& url);
 
         public:
-            explicit WARCPreprocessor(const std::string& outputFolder, const std::unordered_set<std::string>& output_files = {},
+            explicit WARCPreprocessor(LanguageDetector const &detector,
+                                      const std::string& outputFolder, const std::unordered_set<std::string>& output_files = {},
                                       const std::string& pdf_warc_filename = "", const std::string& tagFiltersFile = "",
-                                      bool invert = false, const std::string& urlFiltersFile = "", bool multilang = false,
+                                      bool invert = false, const std::string& urlFiltersFile = "",
                                       bool encodeURLs = false, bool paragraph_identification = false);
 
             void process(const std::string &filename);
             void printStatistics() const;
diff --git a/warc2text_main.cc b/warc2text_main.cc
index dac24e4..1f165ac 100644
--- a/warc2text_main.cc
+++ b/warc2text_main.cc
@@ -8,6 +8,7 @@
 #include <boost/algorithm/string/split.hpp>
 #include <boost/log/trivial.hpp>
 #include <boost/program_options.hpp>
+#include "src/lang.hh"
 #include "src/warcpreprocessor.hh"
 
 using namespace warc2text;
@@ -25,6 +26,8 @@ struct Options {
     std::string url_filters_filename;
     bool multilang{};
     bool encodeURLs{};
+    std::string classifier;
+    std::string fasttext_model;
 };
 
 void parseArgs(int argc, char *argv[], Options& out) {
@@ -43,6 +46,8 @@ void parseArgs(int argc, char *argv[], Options& out) {
         ("verbose,v", po::bool_switch(&out.verbose)->default_value(false), "Verbosity level")
         ("silent,s", po::bool_switch(&out.silent)->default_value(false))
         ("multilang", po::bool_switch(&out.multilang)->default_value(false), "Detect multiple languages in a single record")
+        ("classifier", po::value(&out.classifier)->default_value("cld2"), "Language classifier: cld2 or fasttext (default cld2)")
+        ("fasttext-model", po::value(&out.fasttext_model)->default_value(""), "Path to fasttext model")
         ("encode-urls", po::bool_switch(&out.encodeURLs)->default_value(false), "Encode URLs obtained from WARC records");
 
     po::positional_options_description pd;
@@ -58,6 +63,8 @@ void parseArgs(int argc, char *argv[], Options& out) {
             "    -f <output_files>          List of output files separated by commas\n"
             "                               Default (mandatory): \"url,text\"\n"
             "                               Optional values: \"mime,html\"\n"
+            "    --classifier <name>        Classifier to use: cld2 or fasttext\n"
+            "    --fasttext-model <file>    Path to FastText model for fasttext classifier\n"
             "    --multilang                Detect multiple languages in documents (up to 3),\n"
             "                               write as many text records as languages detected\n"
             "    --tag-filters <file>       File containing html tag filters\n"
@@ -94,9 +101,29 @@ int main(int argc, char *argv[]) {
     boost::algorithm::split(files_list, options.files, [](char c) {return c == ',';});
     std::unordered_set<std::string> output_files(files_list.begin(), files_list.end());
 
+    std::unique_ptr<LanguageDetector> detector;
+
+    if (options.classifier == "cld2") {
+        if (options.multilang) {
+            detector.reset(new CLD2MultiLangDetector());
+        } else {
+            detector.reset(new CLD2Detector());
+        }
+    } else if (options.classifier == "fasttext") {
+        if (options.multilang) {
+            BOOST_LOG_TRIVIAL(error) << "FastText classifier doesn't do multilang at the moment";
+            abort();
+        } else {
+            detector.reset(new FastTextDetector(options.fasttext_model));
+        }
+    } else {
+        BOOST_LOG_TRIVIAL(error) << "Unsupported classifier option";
+        abort();
+    }
+
     std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now();
-    WARCPreprocessor warcpproc(options.output, output_files, options.pdf_warc_filename, options.tag_filters_filename,
-                               options.tag_filters_invert, options.url_filters_filename, options.multilang,
+    WARCPreprocessor warcpproc(*detector, options.output, output_files, options.pdf_warc_filename, options.tag_filters_filename,
+                               options.tag_filters_invert, options.url_filters_filename,
                                options.encodeURLs, options.paragraph_identification);
     for (const std::string& file : options.warcs){
         warcpproc.process(file);
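
Usage note: with this change the language classifier is selected at runtime. A minimal invocation of the new options might look like the following sketch (the model path is hypothetical; any FastText language-identification model whose labels follow the `__label__xxx` convention should work):

```
warc2text --classifier fasttext --fasttext-model /path/to/lid_model.bin -o output_dir input.warc.gz
```

With the default `--classifier cld2`, the `--multilang` flag selects the CLD2MultiLangDetector; the `fasttext` classifier currently rejects `--multilang` and always emits a single chunk per record.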