Merge pull request #36 from jelmervdl/fasttext-option
Add fasttext as an option alongside cld2
lpla authored Mar 14, 2023
2 parents 8607c00 + 518d10a commit eac887e
Showing 16 changed files with 243 additions and 120 deletions.
3 changes: 3 additions & 0 deletions .gitmodules
@@ -6,3 +6,6 @@
path = cld2
url = https://github.com/bitextor/cld2.git
ignore = dirty
[submodule "fasttext"]
path = fasttext
url = https://github.com/kpuatfb/fastText.git
8 changes: 7 additions & 1 deletion CMakeLists.txt
@@ -2,9 +2,12 @@ cmake_minimum_required(VERSION 2.8.3)

project(warc2text)

set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD 17)


set(CMAKE_CXX_FLAGS_DEBUG "-g")
set(CMAKE_CXX_FLAGS_RELEASE "-O3")
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3")
set(CMAKE_CXX_FLAGS "-Wall -Wextra -DBOOST_LOG_DYN_LINK ${CMAKE_CXX_FLAGS}")
if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -I /usr/local/opt/icu4c/include")
@@ -41,10 +44,12 @@ if (NOT SKIP_PREPROCESS_BUILD)
endif()

target_include_directories(warc2text_lib PUBLIC ${PREPROCESS_PATH})
target_include_directories(warc2text_lib PRIVATE fasttext/src)
#

# add libcld2.so
add_subdirectory(cld2)
add_subdirectory(fasttext EXCLUDE_FROM_ALL)
#

# define executables
@@ -53,6 +58,7 @@ target_link_libraries(warc2text
warc2text_lib
${Boost_LIBRARIES}
cld2_full
fasttext-static
)

include(GNUInstallDirs)
6 changes: 6 additions & 0 deletions README.md
@@ -28,11 +28,15 @@ mkdir build
cd build
cmake -DCMAKE_INSTALL_PREFIX=/your/prefix/path ..
# cmake .. -DCMAKE_BUILD_TYPE=Debug # for debug
# cmake .. -DICU_ROOT_DIR=$(brew --prefix icu4c)/lib # for macOS
make -j
make install
```

## Usage

**Note:** for WARCs with many languages you might hit the open file limit quite quickly, so it is advisable to increase it, e.g. `ulimit -n 8192`.

```
warc2text -o <output_folder> [ -f <output_files> ] [ --pdfpass <output_warc> ]
[ --paragraph-identification ] [ --tag-filters <filters_file> ] <warc_file>...
@@ -41,6 +45,8 @@ warc2text -o <output_folder> [ -f <output_files> ] [ --pdfpass <output_warc> ]
* `--files`/`-f` list of output files separated by commas (and without `.gz`); `text` and `url` are always written, while `mime` and `html` are optional
* `--pdfpass` WARC file where PDF records will be stored
* `--paragraph-identification` print the paragraph identifier for each sentence extracted from the HTML
* `--classifier` classifier to use: `cld2` or `fasttext`
* `--fasttext-model` path to the FastText model used by the `fasttext` classifier (see the example invocation after this option list)
* `--tag-filters` file containing filters that are used to eliminate matching documents
* `--invert-tag-filters` output only documents that match the filter
* `--url-filters` file containing regular expressions that match urls of documents to eliminate
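
As an illustration, an invocation selecting the FastText classifier could look like the sketch below; the model path and WARC file name are placeholders for this example, not files provided by the repository.

```
warc2text -o output/ --classifier fasttext --fasttext-model /path/to/lid.bin crawl.warc.gz
```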
1 change: 1 addition & 0 deletions fasttext
Submodule fasttext added at ffee8e
2 changes: 2 additions & 0 deletions src/CMakeLists.txt
@@ -26,6 +26,8 @@ add_library(warc2text_lib
record.cc
html.cc
lang.cc
lang_cld2.cc
lang_fasttext.cc
util.cc
bilangwriter.cc
xh_scanner.cc
31 changes: 8 additions & 23 deletions src/bilangwriter.cc
@@ -1,5 +1,6 @@
#include "bilangwriter.hh"
#include "util.hh"
#include "util/exception.hh"
#include <cassert>
#include <string>

@@ -50,6 +51,7 @@ namespace warc2text{

void GzipWriter::open(const std::string& filename) {
dest = std::fopen(filename.c_str(), "wb");
UTIL_THROW_IF(!dest, util::ErrnoException, "while creating " << filename);
}

void GzipWriter::write(const char* text, std::size_t size) {
@@ -108,39 +110,22 @@ namespace warc2text{
return result;
}

void BilangWriter::write(const Record& record, bool multilang, bool paragraph_identification) {
void BilangWriter::write(const Record& record, bool paragraph_identification) {
std::string base64text;
std::string base64html;

if (multilang) {
if (output_files.count("html") == 1)
util::encodeBase64(record.getPayload(), base64html);

if (output_files.count("html") == 1)
util::encodeBase64(record.getPayload(), base64html);

for (const auto& it : record.getTextByLangs()) {
std::string payload = it.second;

if (paragraph_identification) {
payload = get_paragraph_id(payload);
}

util::encodeBase64(payload, base64text);
this->write(it.first, base64text, record.getURL(), record.getHTTPcontentType(), base64html);
}

} else {
std::string payload = record.getPlainText();
for (const auto& it : record.getTextByLangs()) {
std::string payload = it.second;

if (paragraph_identification) {
payload = get_paragraph_id(payload);
}

util::encodeBase64(payload, base64text);

if (output_files.count("html") == 1)
util::encodeBase64(record.getPayload(), base64html);

this->write(record.getLanguage(), base64text, record.getURL(), record.getHTTPcontentType(), base64html);
this->write(it.first, base64text, record.getURL(), record.getHTTPcontentType(), base64html);
}
}

2 changes: 1 addition & 1 deletion src/bilangwriter.hh
@@ -57,7 +57,7 @@ namespace warc2text {
output_files(output_files)
{};

void write(const Record& record, bool multilang = false, bool paragraph_identification = false);
void write(const Record& record, bool paragraph_identification = false);

};

69 changes: 3 additions & 66 deletions src/lang.cc
@@ -1,70 +1,7 @@
#include "src/lang.hh"
#include "lang.hh"

namespace warc2text {
// hint = {content language code(s), tld, original encoding, CLD2::Language}
const CLD2::CLDHints NO_HINT = {nullptr, nullptr, CLD2::UNKNOWN_ENCODING, CLD2::UNKNOWN_LANGUAGE};
const std::string LanguageDetector::kUnknownLanguageLabel = "unk";

bool detectLanguage(const std::string& text, std::unordered_map<std::string, std::string>& text_by_lang){
CLD2::Language langs[3] = {CLD2::UNKNOWN_LANGUAGE, CLD2::UNKNOWN_LANGUAGE, CLD2::UNKNOWN_LANGUAGE};
int percents[3] = {0,0,0};
double scores[3] = {0.0, 0.0, 0.0};

bool reliable = false;
int text_bytes;
int valid_prefix_bytes;

CLD2::ResultChunkVector chunks;

CLD2::ExtDetectLanguageSummaryCheckUTF8(text.data(), text.size(), true, &NO_HINT, 0, &langs[0], &percents[0], &scores[0], &chunks, &text_bytes, &reliable, &valid_prefix_bytes);

text_by_lang.clear();

if (not reliable) return reliable;

std::string* top1 = nullptr;
std::string* top2 = nullptr;
std::string* top3 = nullptr;

if (langs[0] != CLD2::UNKNOWN_LANGUAGE and percents[0] > 0) {
top1 = &text_by_lang[CLD2::LanguageCode(langs[0])];
top1->reserve(text.size() * (percents[0] + 1));
}

if (langs[1] != CLD2::UNKNOWN_LANGUAGE and percents[1] > 0) {
top2 = &text_by_lang[CLD2::LanguageCode(langs[1])];
top2->reserve(text.size() * (percents[1] + 1));
}

if (langs[2] != CLD2::UNKNOWN_LANGUAGE and percents[2] > 0) {
top3 = &text_by_lang[CLD2::LanguageCode(langs[2])];
top3->reserve(text.size() * (percents[2] + 1));
}

for (const CLD2::ResultChunk& chunk : chunks) {
std::string* ref = static_cast<CLD2::Language>(chunk.lang1) == langs[0] ? top1 :
static_cast<CLD2::Language>(chunk.lang1) == langs[1] ? top2 :
static_cast<CLD2::Language>(chunk.lang1) == langs[2] ? top3 : nullptr;
if (ref == nullptr) continue;
ref->append(text, chunk.offset, chunk.bytes);
}

// remove empty texts from text_by_lang
// apparently it is possible that the reported percentage is > 0, but the language does not appear in chunks
for (auto it = text_by_lang.cbegin(); it != text_by_lang.cend(); ){
if (it->second.size() == 0) text_by_lang.erase(it++);
else ++it;
}

// TODO: do something with the scores?

return reliable;
}

bool detectLanguage(const std::string& text, std::string& lang){
bool reliable = false;
int valid_prefix_bytes = 0;
CLD2::Language l = CLD2::DetectLanguageCheckUTF8(text.data(), text.size(), true, &reliable, &valid_prefix_bytes);
lang = CLD2::LanguageCode(l);
return reliable;
}
} // namespace warc2text
47 changes: 39 additions & 8 deletions src/lang.hh
@@ -1,18 +1,49 @@
#ifndef WARC2TEXT_LANG_HH
#define WARC2TEXT_LANG_HH

#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include "cld2/public/compact_lang_det.h"
#include "cld2/public/encodings.h"

namespace fasttext {
class FastText;
} // namespace fasttext

namespace warc2text {
// detect language of plain text, return top 3 languages
bool detectLanguage(const std::string& text, std::unordered_map<std::string, std::string>& chunks);

// detect top language of plain text
bool detectLanguage(const std::string& text, std::string& lang);
}
class LanguageDetector {
public:
virtual ~LanguageDetector() {};

// detect language of plain text, return top languages
virtual void detect(const std::string& text, std::unordered_map<std::string, std::string>& chunks) const = 0;

// Label used for text (chunks) that cannot reliably be identified
static const std::string kUnknownLanguageLabel;
};

class FastTextDetector : public LanguageDetector {
public:
explicit FastTextDetector(const std::string &filename);
virtual ~FastTextDetector();
virtual void detect(const std::string& text, std::unordered_map<std::string, std::string>& chunks) const;

private:
std::unique_ptr<fasttext::FastText> classifier_;
};

class CLD2Detector : public LanguageDetector {
public:
virtual void detect(const std::string& text, std::unordered_map<std::string, std::string>& chunks) const;
virtual ~CLD2Detector();
};

class CLD2MultiLangDetector : public LanguageDetector {
public:
virtual void detect(const std::string& text, std::unordered_map<std::string, std::string>& chunks) const;
virtual ~CLD2MultiLangDetector();
};

} // namespace warc2text

#endif
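
The header above leaves the choice of detector to the caller. As a rough, hypothetical sketch (not code from this PR), a small factory could map the `--classifier` option onto these classes; the function name, the `multilang` flag, and the option plumbing are assumptions made only for illustration.

```cpp
#include <memory>
#include <stdexcept>
#include <string>

#include "src/lang.hh"

// Hypothetical helper: pick a LanguageDetector implementation from the
// --classifier / --fasttext-model options. `multilang` is assumed here to
// select the multi-language CLD2 variant; the real option handling may differ.
std::unique_ptr<warc2text::LanguageDetector> makeDetector(const std::string& classifier,
                                                          const std::string& fasttext_model,
                                                          bool multilang) {
    if (classifier == "fasttext")
        return std::make_unique<warc2text::FastTextDetector>(fasttext_model);
    if (classifier == "cld2" && multilang)
        return std::make_unique<warc2text::CLD2MultiLangDetector>();
    if (classifier == "cld2")
        return std::make_unique<warc2text::CLD2Detector>();
    throw std::invalid_argument("unknown --classifier value: " + classifier);
}
```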
76 changes: 76 additions & 0 deletions src/lang_cld2.cc
@@ -0,0 +1,76 @@
#include "src/lang.hh"
#include "cld2/public/compact_lang_det.h"
#include "cld2/public/encodings.h"

namespace warc2text {
// hint = {content language code(s), tld, original encoding, CLD2::Language}
const CLD2::CLDHints NO_HINT = {nullptr, nullptr, CLD2::UNKNOWN_ENCODING, CLD2::UNKNOWN_LANGUAGE};

CLD2Detector::~CLD2Detector() {}

void CLD2Detector::detect(const std::string& text, std::unordered_map<std::string, std::string>& text_by_lang) const {
bool reliable = false;
int valid_prefix_bytes = 0;
CLD2::Language l = CLD2::DetectLanguageCheckUTF8(text.data(), text.size(), true, &reliable, &valid_prefix_bytes);
text_by_lang[reliable ? CLD2::LanguageCode(l) : kUnknownLanguageLabel] = text;
}

CLD2MultiLangDetector::~CLD2MultiLangDetector() {}

void CLD2MultiLangDetector::detect(const std::string& text, std::unordered_map<std::string, std::string>& text_by_lang) const {
CLD2::Language langs[3] = {CLD2::UNKNOWN_LANGUAGE, CLD2::UNKNOWN_LANGUAGE, CLD2::UNKNOWN_LANGUAGE};
int percents[3] = {0,0,0};
double scores[3] = {0.0, 0.0, 0.0};

bool reliable = false;
int text_bytes;
int valid_prefix_bytes;

CLD2::ResultChunkVector chunks;

CLD2::ExtDetectLanguageSummaryCheckUTF8(text.data(), text.size(), true, &NO_HINT, 0, &langs[0], &percents[0], &scores[0], &chunks, &text_bytes, &reliable, &valid_prefix_bytes);

text_by_lang.clear();

if (not reliable) {
text_by_lang[kUnknownLanguageLabel] = text;
return;
}

std::string* top1 = nullptr;
std::string* top2 = nullptr;
std::string* top3 = nullptr;

if (langs[0] != CLD2::UNKNOWN_LANGUAGE and percents[0] > 0) {
top1 = &text_by_lang[CLD2::LanguageCode(langs[0])];
top1->reserve(text.size() * (percents[0] + 1));
}

if (langs[1] != CLD2::UNKNOWN_LANGUAGE and percents[1] > 0) {
top2 = &text_by_lang[CLD2::LanguageCode(langs[1])];
top2->reserve(text.size() * (percents[1] + 1));
}

if (langs[2] != CLD2::UNKNOWN_LANGUAGE and percents[2] > 0) {
top3 = &text_by_lang[CLD2::LanguageCode(langs[2])];
top3->reserve(text.size() * (percents[2] + 1));
}

for (const CLD2::ResultChunk& chunk : chunks) {
std::string* ref = static_cast<CLD2::Language>(chunk.lang1) == langs[0] ? top1 :
static_cast<CLD2::Language>(chunk.lang1) == langs[1] ? top2 :
static_cast<CLD2::Language>(chunk.lang1) == langs[2] ? top3 : nullptr;
if (ref == nullptr) continue;
ref->append(text, chunk.offset, chunk.bytes);
}

// remove empty texts from text_by_lang
// apparently it is possible that the reported percentage is > 0, but the language does not appear in chunks
for (auto it = text_by_lang.cbegin(); it != text_by_lang.cend(); ){
if (it->second.size() == 0) text_by_lang.erase(it++);
else ++it;
}

// TODO: do something with the scores?
}
} // namespace warc2text
40 changes: 40 additions & 0 deletions src/lang_fasttext.cc
@@ -0,0 +1,40 @@
#include "src/lang.hh"

#include "fasttext.h"
#include "util/exception.hh"

#include <cstdlib>
#include <cstring>

namespace warc2text {

FastTextDetector::FastTextDetector(const std::string &filename)
: classifier_(new fasttext::FastText) {
classifier_->loadModel(filename);
}

FastTextDetector::~FastTextDetector() {}

const char kLabelPrefix[] = "__label__";

void FastTextDetector::detect(const std::string& text, std::unordered_map<std::string, std::string>& chunks) const {
const float kThreshold = 0.5f;
std::vector<int32_t> words, labels;
classifier_->getDictionary()->getStringNoNewline(text, words, labels);
fasttext::Predictions predictions;
classifier_->predict(1, words, predictions, kThreshold);
if (predictions.empty()) {
chunks[kUnknownLanguageLabel] = text;
return;
}

// Labels look like __label__eng
std::string label = classifier_->getDictionary()->getLabel(predictions[0].second);
UTIL_THROW_IF2(strncmp(label.c_str(), kLabelPrefix, sizeof(kLabelPrefix) - 1), "Was expecting text classifier labels to begin with " << kLabelPrefix << " but they look like " << label);
label.erase(0, sizeof(kLabelPrefix) - 1);

// For better or worse, we're currently doing everything as one chunk.
chunks[label] = text;
}

} // namespace warc2text
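
For completeness, a minimal, hypothetical driver showing how the new detector is meant to be called; the model path and sample sentence are placeholders and the snippet is not part of this commit. `detect()` fills the map with a single entry, keyed either by the predicted label (e.g. `eng`) or by `unk` when the top prediction falls below the 0.5 threshold used above.

```cpp
#include <iostream>
#include <string>
#include <unordered_map>

#include "src/lang.hh"

int main() {
    // Placeholder model path; any fastText language-identification model.
    warc2text::FastTextDetector detector("/path/to/lid.bin");

    std::unordered_map<std::string, std::string> text_by_lang;
    detector.detect("Esto es una frase de ejemplo en español.", text_by_lang);

    // One chunk per detected language label (here: the whole text).
    for (const auto& entry : text_by_lang)
        std::cout << entry.first << "\t" << entry.second.size() << " bytes\n";
    return 0;
}
```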