From 144fdb051c84752e004f9802da4c250d2d0c074e Mon Sep 17 00:00:00 2001 From: Jelmer van der Linde Date: Wed, 8 Mar 2023 16:30:53 +0000 Subject: [PATCH 1/8] Add fasttext as an option alongside cld2 --- .gitmodules | 3 +++ CMakeLists.txt | 5 +++- README.md | 1 + fasttext | 1 + src/CMakeLists.txt | 3 ++- src/lang.hh | 47 +++++++++++++++++++++++++++++------ src/{lang.cc => lang_cld2.cc} | 24 +++++++++++------- src/lang_fasttext.cc | 40 +++++++++++++++++++++++++++++ src/record.cc | 6 ++--- src/record.hh | 2 +- src/warcpreprocessor.cc | 11 ++++---- src/warcpreprocessor.hh | 8 +++--- warc2text_main.cc | 31 +++++++++++++++++++++-- 13 files changed, 148 insertions(+), 34 deletions(-) create mode 160000 fasttext rename src/{lang.cc => lang_cld2.cc} (84%) create mode 100644 src/lang_fasttext.cc diff --git a/.gitmodules b/.gitmodules index 017fa01..7a8a2a7 100644 --- a/.gitmodules +++ b/.gitmodules @@ -6,3 +6,6 @@ path = cld2 url = https://github.com/bitextor/cld2.git ignore = dirty +[submodule "fasttext"] + path = fasttext + url = https://github.com/kpuatfb/fastText.git diff --git a/CMakeLists.txt b/CMakeLists.txt index 9e90b00..df965b8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 2.8.3) project(warc2text) -set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_FLAGS "-Wall -Wextra -DBOOST_LOG_DYN_LINK ${CMAKE_CXX_FLAGS}") @@ -41,10 +41,12 @@ if (NOT SKIP_PREPROCESS_BUILD) endif() target_include_directories(warc2text_lib PUBLIC ${PREPROCESS_PATH}) +target_include_directories(warc2text_lib PRIVATE fasttext/src) # # add libcld2.so add_subdirectory(cld2) +add_subdirectory(fasttext) # # define executables @@ -53,6 +55,7 @@ target_link_libraries(warc2text warc2text_lib ${Boost_LIBRARIES} cld2_full + fasttext-static ) include(GNUInstallDirs) diff --git a/README.md b/README.md index 2aad126..995cd93 100644 --- a/README.md +++ b/README.md @@ -28,6 +28,7 @@ mkdir build cd build cmake 
-DCMAKE_INSTALL_PREFIX=/your/prefix/path .. # cmake .. -DCMAKE_BUILD_TYPE=Debug # for debug +# cmake .. -DICU_ROOT_DIR=$(brew --prefix icu4c)/lib # for macOS make -j make install ``` diff --git a/fasttext b/fasttext new file mode 160000 index 0000000..ffee8e4 --- /dev/null +++ b/fasttext @@ -0,0 +1 @@ +Subproject commit ffee8e4d72a4d2ecd859575007877d12acbee5b3 diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index cfdd65b..c01951d 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -25,7 +25,8 @@ add_library(warc2text_lib warcreader.cc record.cc html.cc - lang.cc + lang_cld2.cc + lang_fasttext.cc util.cc bilangwriter.cc xh_scanner.cc diff --git a/src/lang.hh b/src/lang.hh index a00a4bb..8eb8abf 100644 --- a/src/lang.hh +++ b/src/lang.hh @@ -1,18 +1,49 @@ #ifndef WARC2TEXT_LANG_HH #define WARC2TEXT_LANG_HH +#include #include #include -#include -#include "cld2/public/compact_lang_det.h" -#include "cld2/public/encodings.h" + +namespace fasttext { +class FastText; +} // namespace fasttext namespace warc2text { - // detect language of plain text, return top 3 languages - bool detectLanguage(const std::string& text, std::unordered_map& chunks); - // detect top language of plain text - bool detectLanguage(const std::string& text, std::string& lang); -} +class LanguageDetector { + public: + virtual ~LanguageDetector() {}; + + // detect language of plain text, return top languages + virtual bool detect(const std::string& text, std::unordered_map& chunks) const = 0; +}; + +class FastTextDetector : public LanguageDetector { + public: + explicit FastTextDetector(const std::string &filename); + + virtual ~FastTextDetector(); + + // detect language of plain text, return top languages + virtual bool detect(const std::string& text, std::unordered_map& chunks) const; + + private: + std::unique_ptr classifier_; +}; + +class CLD2Detector : public LanguageDetector { +public: + virtual bool detect(const std::string& text, std::unordered_map& chunks) const; + virtual 
~CLD2Detector(); +}; + +class CLD2MultiLangDetector : public LanguageDetector { +public: + virtual bool detect(const std::string& text, std::unordered_map& chunks) const; + virtual ~CLD2MultiLangDetector(); +}; + +} // namespace warc2text #endif diff --git a/src/lang.cc b/src/lang_cld2.cc similarity index 84% rename from src/lang.cc rename to src/lang_cld2.cc index ff57913..88fc44e 100644 --- a/src/lang.cc +++ b/src/lang_cld2.cc @@ -1,10 +1,24 @@ #include "src/lang.hh" +#include "cld2/public/compact_lang_det.h" +#include "cld2/public/encodings.h" namespace warc2text { // hint = {content language code(s), tld, original encoding, CLD2::Language} const CLD2::CLDHints NO_HINT = {nullptr, nullptr, CLD2::UNKNOWN_ENCODING, CLD2::UNKNOWN_LANGUAGE}; - bool detectLanguage(const std::string& text, std::unordered_map& text_by_lang){ + CLD2Detector::~CLD2Detector() {} + + bool CLD2Detector::detect(const std::string& text, std::unordered_map& text_by_lang) const { + bool reliable = false; + int valid_prefix_bytes = 0; + CLD2::Language l = CLD2::DetectLanguageCheckUTF8(text.data(), text.size(), true, &reliable, &valid_prefix_bytes); + text_by_lang[CLD2::LanguageCode(l)] = text; + return reliable; + } + + CLD2MultiLangDetector::~CLD2MultiLangDetector() {} + + bool CLD2MultiLangDetector::detect(const std::string& text, std::unordered_map& text_by_lang) const { CLD2::Language langs[3] = {CLD2::UNKNOWN_LANGUAGE, CLD2::UNKNOWN_LANGUAGE, CLD2::UNKNOWN_LANGUAGE}; int percents[3] = {0,0,0}; double scores[3] = {0.0, 0.0, 0.0}; @@ -59,12 +73,4 @@ namespace warc2text { return reliable; } - - bool detectLanguage(const std::string& text, std::string& lang){ - bool reliable = false; - int valid_prefix_bytes = 0; - CLD2::Language l = CLD2::DetectLanguageCheckUTF8(text.data(), text.size(), true, &reliable, &valid_prefix_bytes); - lang = CLD2::LanguageCode(l); - return reliable; - } } // namespace warc2text diff --git a/src/lang_fasttext.cc b/src/lang_fasttext.cc new file mode 100644 index 
0000000..ec35fa5 --- /dev/null +++ b/src/lang_fasttext.cc @@ -0,0 +1,40 @@ +#include "src/lang.hh" + +#include "fasttext.h" + +#include +#include + +namespace warc2text { + +FastTextDetector::FastTextDetector(const std::string &filename) + : classifier_(new fasttext::FastText) { + classifier_->loadModel(filename); +} + +FastTextDetector::~FastTextDetector() {} + +const char kLabelPrefix[] = "__label__"; + +bool FastTextDetector::detect(const std::string& text, std::unordered_map& chunks) const { + const float kThreshold = 0.5f; + std::vector words, labels; + classifier_->getDictionary()->getStringNoNewline(text, words, labels); + fasttext::Predictions predictions; + classifier_->predict(1, words, predictions, kThreshold); + if (predictions.empty()) return false; + + // Labels look like __label__eng + std::string label = classifier_->getDictionary()->getLabel(predictions[0].second); + if (strncmp(label.c_str(), kLabelPrefix, sizeof(kLabelPrefix) - 1)) { + std::cerr << "Was expecting text classifier labels to begin with " << kLabelPrefix << " but they look like " << label << std::endl; + std::abort(); + } + label.erase(0, sizeof(kLabelPrefix) - 1); + + // For better or worse, we're currently doing everything as one chunk. 
+ chunks[label] = text; + return true; +} + +} // namespace warc2text diff --git a/src/record.cc b/src/record.cc index e2e9615..6233f09 100644 --- a/src/record.cc +++ b/src/record.cc @@ -233,10 +233,8 @@ namespace warc2text { return text_by_langs; } - int Record::detectLanguage(bool multilang){ - if (not multilang) return warc2text::detectLanguage(plaintext, language); - - warc2text::detectLanguage(plaintext, text_by_langs); + int Record::detectLanguage(LanguageDetector const &detector){ + detector.detect(plaintext, text_by_langs); return text_by_langs.size(); } diff --git a/src/record.hh b/src/record.hh index b66a740..13d9c7e 100644 --- a/src/record.hh +++ b/src/record.hh @@ -38,7 +38,7 @@ namespace warc2text { int cleanPayload(); int cleanPayload(const util::umap_tag_filters_regex& tagFilters); - int detectLanguage(bool multilang); + int detectLanguage(LanguageDetector const &detector); static std::string readZipPayload(const std::string& content_type, const std::string& payload); static std::string isPayloadZip(const std::string& content_type, const std::string& uri); diff --git a/src/warcpreprocessor.cc b/src/warcpreprocessor.cc index f87fef0..5aa7b9c 100644 --- a/src/warcpreprocessor.cc +++ b/src/warcpreprocessor.cc @@ -8,10 +8,12 @@ namespace warc2text { const std::unordered_set WARCPreprocessor::removeExtensions = {".jpg", ".jpeg", ".gif", ".png", ".css", ".js", ".mp3", ".mp4", ".flv", ".wmv", ".gz", ".zip", ".rar" }; - WARCPreprocessor::WARCPreprocessor(const std::string& outputFolder, const std::unordered_set& output_files, + WARCPreprocessor::WARCPreprocessor(const LanguageDetector &detector, + const std::string& outputFolder, const std::unordered_set& output_files, const std::string& pdf_warc_filename, const std::string& tagFiltersFile, bool invert, - const std::string& urlFiltersFile, bool multilang, bool encodeURLs, + const std::string& urlFiltersFile, bool encodeURLs, bool paragraph_identification) : + detector(detector), writer(outputFolder, 
output_files), totalRecords(0), textRecords(0), @@ -22,7 +24,6 @@ namespace warc2text { tagFilters(), pdf_warc_filename(pdf_warc_filename), invert(invert), - multilang(multilang), encodeURLs(encodeURLs), paragraph_identification(paragraph_identification) { if (!tagFiltersFile.empty()) @@ -146,7 +147,7 @@ namespace warc2text { ++textRecords; textBytes += record.getPlainText().size(); - n_langs = record.detectLanguage(multilang); + n_langs = record.detectLanguage(detector); if (n_langs == 1) { langBytes += record.getPlainText().size(); } else if (n_langs > 1) { @@ -160,7 +161,7 @@ namespace warc2text { langRecords += n_langs; - writer.write(record, multilang, paragraph_identification); + writer.write(record, true, paragraph_identification); } pdf_warc_writer.close(); } diff --git a/src/warcpreprocessor.hh b/src/warcpreprocessor.hh index 8e8548f..b332523 100644 --- a/src/warcpreprocessor.hh +++ b/src/warcpreprocessor.hh @@ -2,6 +2,7 @@ #define WARC2TEXT_WARCPREPROCESSOR_HH #include "record.hh" +#include "src/lang.hh" #include "warcreader.hh" #include "bilangwriter.hh" #include "util.hh" @@ -24,6 +25,7 @@ namespace warc2text { class WARCPreprocessor { private: + LanguageDetector const &detector; BilangWriter writer; unsigned int totalRecords; unsigned int textRecords; @@ -35,7 +37,6 @@ namespace warc2text { boost::regex urlFilter; std::string pdf_warc_filename; bool invert; - bool multilang; bool encodeURLs; bool paragraph_identification; @@ -43,9 +44,10 @@ namespace warc2text { bool URLfilter(const std::string& url); public: - explicit WARCPreprocessor(const std::string& outputFolder, const std::unordered_set& output_files = {}, + explicit WARCPreprocessor(LanguageDetector const &detector, + const std::string& outputFolder, const std::unordered_set& output_files = {}, const std::string& pdf_warc_filename = "", const std::string& tagFiltersFile = "", - bool invert = false, const std::string& urlFiltersFile = "", bool multilang = false, + bool invert = false, const 
std::string& urlFiltersFile = "", bool encodeURLs = false, bool paragraph_identification = false); void process(const std::string &filename); void printStatistics() const; diff --git a/warc2text_main.cc b/warc2text_main.cc index dac24e4..1f165ac 100644 --- a/warc2text_main.cc +++ b/warc2text_main.cc @@ -8,6 +8,7 @@ #include #include #include +#include "src/lang.hh" #include "src/warcpreprocessor.hh" using namespace warc2text; @@ -25,6 +26,8 @@ struct Options { std::string url_filters_filename; bool multilang{}; bool encodeURLs{}; + std::string classifier; + std::string fasttext_model; }; void parseArgs(int argc, char *argv[], Options& out) { @@ -43,6 +46,8 @@ void parseArgs(int argc, char *argv[], Options& out) { ("verbose,v", po::bool_switch(&out.verbose)->default_value(false), "Verbosity level") ("silent,s", po::bool_switch(&out.silent)->default_value(false)) ("multilang", po::bool_switch(&out.multilang)->default_value(false), "Detect multiple languages in a single record") + ("classifier", po::value(&out.classifier)->default_value("cld2"), "Language classifier: cld2 or fasttext (default cld2)") + ("fasttext-model", po::value(&out.fasttext_model)->default_value(""), "Path to fasttext model") ("encode-urls", po::bool_switch(&out.encodeURLs)->default_value(false), "Encode URLs obtained from WARC records"); po::positional_options_description pd; @@ -58,6 +63,8 @@ void parseArgs(int argc, char *argv[], Options& out) { " -f List of output files separated by commas\n" " Default (mandatory): \"url,text\"\n" " Optional values: \"mime,html\"\n" + " --classifier Classifier to use: cld2 or fasttext\n" + " --fasttext-model Path to FastText model for fasttext classifier\n" " --multilang Detect multiple languages in documents (up to 3),\n" " write as many text records as languages detected\n" " --tag-filters File containing html tag filters\n" @@ -94,9 +101,29 @@ int main(int argc, char *argv[]) { boost::algorithm::split(files_list, options.files, [](char c) {return c == 
',';}); std::unordered_set output_files(files_list.begin(), files_list.end()); + std::unique_ptr detector; + + if (options.classifier == "cld2") { + if (options.multilang) { + detector.reset(new CLD2MultiLangDetector()); + } else { + detector.reset(new CLD2Detector()); + } + } else if (options.classifier == "fasttext") { + if (options.multilang) { + BOOST_LOG_TRIVIAL(error) << "FastText classifier doesn't do multilang at the moment"; + abort(); + } else { + detector.reset(new FastTextDetector(options.fasttext_model)); + } + } else { + BOOST_LOG_TRIVIAL(error) << "Unsupported classifier option"; + abort(); + } + std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now(); - WARCPreprocessor warcpproc(options.output, output_files, options.pdf_warc_filename, options.tag_filters_filename, - options.tag_filters_invert, options.url_filters_filename, options.multilang, + WARCPreprocessor warcpproc(*detector, options.output, output_files, options.pdf_warc_filename, options.tag_filters_filename, + options.tag_filters_invert, options.url_filters_filename, options.encodeURLs, options.paragraph_identification); for (const std::string& file : options.warcs){ warcpproc.process(file); From 5858675123e0763623f22af53638eb8f117a4ac6 Mon Sep 17 00:00:00 2001 From: Jelmer van der Linde Date: Wed, 8 Mar 2023 17:00:24 +0000 Subject: [PATCH 2/8] Catch fopen errors --- src/bilangwriter.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/bilangwriter.cc b/src/bilangwriter.cc index a6f1936..39c2aa7 100644 --- a/src/bilangwriter.cc +++ b/src/bilangwriter.cc @@ -1,5 +1,6 @@ #include "bilangwriter.hh" #include "util.hh" +#include "util/exception.hh" #include #include @@ -50,6 +51,7 @@ namespace warc2text{ void GzipWriter::open(const std::string& filename) { dest = std::fopen(filename.c_str(), "wb"); + UTIL_THROW_IF(!dest, util::ErrnoException, "while creating " << filename); } void GzipWriter::write(const char* text, std::size_t size) { From 
7f05d337439d7606f017ade50a50776fa849868e Mon Sep 17 00:00:00 2001 From: Jelmer van der Linde Date: Fri, 10 Mar 2023 14:37:02 +0000 Subject: [PATCH 3/8] Treat unreliable language identification as an `unk` language This makes CLD2 and FastText behave the same regardless of the `--multilang` option --- src/CMakeLists.txt | 1 + src/lang.cc | 7 +++++++ src/lang.hh | 14 +++++++------- src/lang_cld2.cc | 14 +++++++------- src/lang_fasttext.cc | 14 +++++++------- src/warcpreprocessor.cc | 22 +++++++++++++++------- 6 files changed, 44 insertions(+), 28 deletions(-) create mode 100644 src/lang.cc diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index c01951d..6db72e0 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -25,6 +25,7 @@ add_library(warc2text_lib warcreader.cc record.cc html.cc + lang.cc lang_cld2.cc lang_fasttext.cc util.cc diff --git a/src/lang.cc b/src/lang.cc new file mode 100644 index 0000000..19e52f0 --- /dev/null +++ b/src/lang.cc @@ -0,0 +1,7 @@ +#include "lang.hh" + +namespace warc2text { + +const std::string LanguageDetector::kUnknownLanguageLabel = "unk"; + +} // namespace warc2text diff --git a/src/lang.hh b/src/lang.hh index 8eb8abf..3552892 100644 --- a/src/lang.hh +++ b/src/lang.hh @@ -16,17 +16,17 @@ class LanguageDetector { virtual ~LanguageDetector() {}; // detect language of plain text, return top languages - virtual bool detect(const std::string& text, std::unordered_map& chunks) const = 0; + virtual void detect(const std::string& text, std::unordered_map& chunks) const = 0; + + // Label used for text (chunks) that cannot reliably be identified + static const std::string kUnknownLanguageLabel; }; class FastTextDetector : public LanguageDetector { public: explicit FastTextDetector(const std::string &filename); - virtual ~FastTextDetector(); - - // detect language of plain text, return top languages - virtual bool detect(const std::string& text, std::unordered_map& chunks) const; + virtual void detect(const std::string& text, 
std::unordered_map& chunks) const; private: std::unique_ptr classifier_; @@ -34,13 +34,13 @@ class FastTextDetector : public LanguageDetector { class CLD2Detector : public LanguageDetector { public: - virtual bool detect(const std::string& text, std::unordered_map& chunks) const; + virtual void detect(const std::string& text, std::unordered_map& chunks) const; virtual ~CLD2Detector(); }; class CLD2MultiLangDetector : public LanguageDetector { public: - virtual bool detect(const std::string& text, std::unordered_map& chunks) const; + virtual void detect(const std::string& text, std::unordered_map& chunks) const; virtual ~CLD2MultiLangDetector(); }; diff --git a/src/lang_cld2.cc b/src/lang_cld2.cc index 88fc44e..30e8cd9 100644 --- a/src/lang_cld2.cc +++ b/src/lang_cld2.cc @@ -8,17 +8,16 @@ namespace warc2text { CLD2Detector::~CLD2Detector() {} - bool CLD2Detector::detect(const std::string& text, std::unordered_map& text_by_lang) const { + void CLD2Detector::detect(const std::string& text, std::unordered_map& text_by_lang) const { bool reliable = false; int valid_prefix_bytes = 0; CLD2::Language l = CLD2::DetectLanguageCheckUTF8(text.data(), text.size(), true, &reliable, &valid_prefix_bytes); - text_by_lang[CLD2::LanguageCode(l)] = text; - return reliable; + text_by_lang[reliable ? 
CLD2::LanguageCode(l) : kUnknownLanguageLabel] = text; } CLD2MultiLangDetector::~CLD2MultiLangDetector() {} - bool CLD2MultiLangDetector::detect(const std::string& text, std::unordered_map& text_by_lang) const { + void CLD2MultiLangDetector::detect(const std::string& text, std::unordered_map& text_by_lang) const { CLD2::Language langs[3] = {CLD2::UNKNOWN_LANGUAGE, CLD2::UNKNOWN_LANGUAGE, CLD2::UNKNOWN_LANGUAGE}; int percents[3] = {0,0,0}; double scores[3] = {0.0, 0.0, 0.0}; @@ -33,7 +32,10 @@ namespace warc2text { text_by_lang.clear(); - if (not reliable) return reliable; + if (not reliable) { + text_by_lang[kUnknownLanguageLabel] = text; + return; + } std::string* top1 = nullptr; std::string* top2 = nullptr; @@ -70,7 +72,5 @@ namespace warc2text { } // TODO: do something with the scores? - - return reliable; } } // namespace warc2text diff --git a/src/lang_fasttext.cc b/src/lang_fasttext.cc index ec35fa5..0f6f955 100644 --- a/src/lang_fasttext.cc +++ b/src/lang_fasttext.cc @@ -1,6 +1,7 @@ #include "src/lang.hh" #include "fasttext.h" +#include "util/exception.hh" #include #include @@ -16,25 +17,24 @@ FastTextDetector::~FastTextDetector() {} const char kLabelPrefix[] = "__label__"; -bool FastTextDetector::detect(const std::string& text, std::unordered_map& chunks) const { +void FastTextDetector::detect(const std::string& text, std::unordered_map& chunks) const { const float kThreshold = 0.5f; std::vector words, labels; classifier_->getDictionary()->getStringNoNewline(text, words, labels); fasttext::Predictions predictions; classifier_->predict(1, words, predictions, kThreshold); - if (predictions.empty()) return false; + if (predictions.empty()) { + chunks[kUnknownLanguageLabel] = text; + return; + } // Labels look like __label__eng std::string label = classifier_->getDictionary()->getLabel(predictions[0].second); - if (strncmp(label.c_str(), kLabelPrefix, sizeof(kLabelPrefix) - 1)) { - std::cerr << "Was expecting text classifier labels to begin with " << 
kLabelPrefix << " but they look like " << label << std::endl; - std::abort(); - } + UTIL_THROW_IF2(strncmp(label.c_str(), kLabelPrefix, sizeof(kLabelPrefix) - 1), "Was expecting text classifier labels to begin with " << kLabelPrefix << " but they look like " << label); label.erase(0, sizeof(kLabelPrefix) - 1); // For better or worse, we're currently doing everything as one chunk. chunks[label] = text; - return true; } } // namespace warc2text diff --git a/src/warcpreprocessor.cc b/src/warcpreprocessor.cc index 5aa7b9c..daa35ac 100644 --- a/src/warcpreprocessor.cc +++ b/src/warcpreprocessor.cc @@ -1,4 +1,5 @@ #include "warcpreprocessor.hh" +#include "src/lang.hh" #include "zipreader.hh" #include "util/compress.hh" #include @@ -147,16 +148,23 @@ namespace warc2text { ++textRecords; textBytes += record.getPlainText().size(); - n_langs = record.detectLanguage(detector); - if (n_langs == 1) { - langBytes += record.getPlainText().size(); - } else if (n_langs > 1) { + record.detectLanguage(detector); + n_langs = 0; + for (auto const &chunk : record.getTextByLangs()) { + // Don't count the unknown language chunks + if (chunk.first == LanguageDetector::kUnknownLanguageLabel) + continue; + + langBytes += chunk.second.size(); + ++n_langs; + } + + if (n_langs > 1) { BOOST_LOG_TRIVIAL(trace) << "Record " << record.getURL() << ": multiple (" << n_langs << ") languages detected"; - for (auto it : record.getTextByLangs()) - langBytes += it.second.size(); + } else if (n_langs == 1) { + } else { BOOST_LOG_TRIVIAL(trace) << "Record " << record.getURL() << ": language not detected"; - continue; } langRecords += n_langs; From f6966b29e79faba1c83fc2375926e41f56af71be Mon Sep 17 00:00:00 2001 From: Jelmer van der Linde Date: Fri, 10 Mar 2023 14:38:45 +0000 Subject: [PATCH 4/8] Remove the `multilang` option from `bilangwriter` --- src/bilangwriter.cc | 29 ++++++----------------------- src/bilangwriter.hh | 2 +- src/warcpreprocessor.cc | 2 +- 3 files changed, 8 insertions(+), 25 
deletions(-) diff --git a/src/bilangwriter.cc b/src/bilangwriter.cc index 39c2aa7..e1b71cb 100644 --- a/src/bilangwriter.cc +++ b/src/bilangwriter.cc @@ -110,39 +110,22 @@ namespace warc2text{ return result; } - void BilangWriter::write(const Record& record, bool multilang, bool paragraph_identification) { + void BilangWriter::write(const Record& record, bool paragraph_identification) { std::string base64text; std::string base64html; - if (multilang) { + if (output_files.count("html") == 1) + util::encodeBase64(record.getPayload(), base64html); - if (output_files.count("html") == 1) - util::encodeBase64(record.getPayload(), base64html); - - for (const auto& it : record.getTextByLangs()) { - std::string payload = it.second; - - if (paragraph_identification) { - payload = get_paragraph_id(payload); - } - - util::encodeBase64(payload, base64text); - this->write(it.first, base64text, record.getURL(), record.getHTTPcontentType(), base64html); - } - - } else { - std::string payload = record.getPlainText(); + for (const auto& it : record.getTextByLangs()) { + std::string payload = it.second; if (paragraph_identification) { payload = get_paragraph_id(payload); } util::encodeBase64(payload, base64text); - - if (output_files.count("html") == 1) - util::encodeBase64(record.getPayload(), base64html); - - this->write(record.getLanguage(), base64text, record.getURL(), record.getHTTPcontentType(), base64html); + this->write(it.first, base64text, record.getURL(), record.getHTTPcontentType(), base64html); } } diff --git a/src/bilangwriter.hh b/src/bilangwriter.hh index 3b520f6..1b7f83e 100644 --- a/src/bilangwriter.hh +++ b/src/bilangwriter.hh @@ -57,7 +57,7 @@ namespace warc2text { output_files(output_files) {}; - void write(const Record& record, bool multilang = false, bool paragraph_identification = false); + void write(const Record& record, bool paragraph_identification = false); }; diff --git a/src/warcpreprocessor.cc b/src/warcpreprocessor.cc index daa35ac..1d90079 100644 --- 
a/src/warcpreprocessor.cc +++ b/src/warcpreprocessor.cc @@ -169,7 +169,7 @@ namespace warc2text { langRecords += n_langs; - writer.write(record, true, paragraph_identification); + writer.write(record, paragraph_identification); } pdf_warc_writer.close(); } From 38bb6dd723d563ae1ea8518c4a79ece63bab5512 Mon Sep 17 00:00:00 2001 From: Jelmer van der Linde Date: Mon, 13 Mar 2023 12:06:44 +0000 Subject: [PATCH 5/8] Update options mentioned in README --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 995cd93..688cf9a 100644 --- a/README.md +++ b/README.md @@ -42,6 +42,8 @@ warc2text -o [ -f ] [ --pdfpass ] * `--files`/`-f` list of output files separated by commas (and without `.gz`); `text` and `url` are always written, while `mime` and `html` are optional * `--pdfpass` WARC file where PDF records will be stored * `--paragraph-identification` print the paragraph identifier for each sentence extracted from the HTML +* `--classifier` classifier to use: `cld2` or `fasttext`. +* `--fasttext-model` path to FastText model for fasttext classifier. * `--tag-filters` file containing filters that are used to eliminate matching documents * `--invert-tag-filters` output only documents that match the filter * `--url-filters` file containing regular expressions that match urls of documents to eliminate From 49b2467d9fee2532431038b83656046c4c4c6413 Mon Sep 17 00:00:00 2001 From: Jelmer van der Linde Date: Mon, 13 Mar 2023 12:07:51 +0000 Subject: [PATCH 6/8] Add note about ulimit --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index 688cf9a..93c2efe 100644 --- a/README.md +++ b/README.md @@ -34,6 +34,9 @@ make install ``` ## Usage + +**note:** for warcs with many languages you might hit the open file limit quite quickly. It is therefore advised to increase it, e.g. `ulimit -n 8192`. + ``` warc2text -o [ -f ] [ --pdfpass ] [ --paragraph-identification ] [ --tag-filters ] ... 
From 68f7eff402c123601a15595292cb23df5048740a Mon Sep 17 00:00:00 2001 From: Jelmer van der Linde Date: Mon, 13 Mar 2023 12:08:33 +0000 Subject: [PATCH 7/8] Add `-O3` for release builds It makes quite a difference! --- CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index df965b8..2b31e9a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,6 +5,9 @@ project(warc2text) set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_FLAGS_DEBUG "-g") +set(CMAKE_CXX_FLAGS_RELEASE "-O3") +set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3") set(CMAKE_CXX_FLAGS "-Wall -Wextra -DBOOST_LOG_DYN_LINK ${CMAKE_CXX_FLAGS}") if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -I /usr/local/opt/icu4c/include") From 518d10ab4e95bc8761f433fa8a89d3e69912abe5 Mon Sep 17 00:00:00 2001 From: Jelmer van der Linde Date: Tue, 14 Mar 2023 15:15:55 +0000 Subject: [PATCH 8/8] Don't install fasttext stuff on `make install` --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2b31e9a..f3c3ed5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -49,7 +49,7 @@ target_include_directories(warc2text_lib PRIVATE fasttext/src) # add libcld2.so add_subdirectory(cld2) -add_subdirectory(fasttext) +add_subdirectory(fasttext EXCLUDE_FROM_ALL) # # define executables