Initial nl config for DH2023 demo
adrianeboyd committed May 4, 2023
1 parent f524724 commit a1fd639
Showing 3 changed files with 19 additions and 20 deletions.
9 changes: 4 additions & 5 deletions README.md
@@ -43,8 +43,7 @@ language or switch to `"latest"`.

#### OSCAR 21.09

-The dataset [`oscar-corpus/OSCAR-2109`](https://huggingface.co/datasets/oscar-corpus/OSCAR-2109)
-requires you to:
+The dataset [`oscar-corpus/OSCAR-2109`](https://huggingface.co/datasets/oscar-corpus/OSCAR-2109) requires you to:
- create a Hugging Face Hub account
- agree to the dataset terms to access: https://huggingface.co/datasets/oscar-corpus/OSCAR-2109
- authenticate with `huggingface-cli login`
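Because access to OSCAR-2109 is gated, the corpus can only be loaded after authenticating. A minimal sketch, assuming the Hugging Face `datasets` library and the subset naming from `project.yml` (`deduplicated_${vars.lang}`, here `deduplicated_nl`); this is only a quick access check, not part of the project's own scripts:

```python
# Hedged sketch: stream the gated OSCAR-2109 "deduplicated_nl" subset after
# running `huggingface-cli login`; use_auth_token=True reuses the stored token.
from itertools import islice

from datasets import load_dataset

dataset = load_dataset(
    "oscar-corpus/OSCAR-2109",
    "deduplicated_nl",   # subset pattern from project.yml: deduplicated_${vars.lang}
    split="train",
    streaming=True,      # avoid downloading the whole corpus up front
    use_auth_token=True,
)

# Peek at a few documents to confirm access and the "text" field used downstream.
for record in islice(dataset, 3):
    print(record["text"][:80])
```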
@@ -181,8 +180,8 @@ in the project directory.
| File | Source | Description |
| --- | --- | --- |
| `software/floret` | Git | |
-| `/scratch/vectors/downloaded/wikipedia/kowiki-20220201-pages-articles.xml.bz2` | URL | |
-| `/scratch/vectors/downloaded/opensubtitles/ko.txt.gz` | URL | |
-| `/scratch/vectors/downloaded/newscrawl/ko/news.2020.ko.shuffled.deduped.gz` | URL | |
+| `/scratch/vectors/downloaded/wikipedia/nlwiki-20230420-pages-articles.xml.bz2` | URL | |
+| `/scratch/vectors/downloaded/opensubtitles/nl.txt.gz` | URL | |
+| `/scratch/vectors/downloaded/newscrawl/nl/news.2022.nl.shuffled.deduped.gz` | URL | |

<!-- SPACY PROJECT: AUTO-GENERATED DOCS END (do not remove) -->
24 changes: 12 additions & 12 deletions project.yml
@@ -132,14 +132,14 @@ description: |
spacy_version: ">=3.2.0,<4.0.0"
vars:
name: "vectors"
lang: "ko"
lang: "nl"
n_process_tokenize: 16
# The defaults assume that you have a large hard drive mounted under /scratch.
downloaded_dir: "/scratch/vectors/downloaded"
extracted_dir: "/scratch/vectors/extracted"
tokenized_dir: "/scratch/vectors/tokenized"
-wikipedia_version: 20220201
-newscrawl_year: 2020
+wikipedia_version: 20230420
+newscrawl_year: 2022
oscar_dataset: "oscar-corpus/OSCAR-2109"
oscar_dataset_subset: "deduplicated_${vars.lang}"
# For "oscar" instead of OSCAR-2109 (no auth required).
@@ -150,17 +150,17 @@ vars:
vector_input_dir: "/scratch/vectors/input"
vector_model: "cbow"
# For languages with alphabets: minn/maxn 4/5 or 5/5 is a good starting point.
-vector_minn: 2
-vector_maxn: 3
+vector_minn: 4
+vector_maxn: 5
vector_epoch: 5
vector_dim: 300
vector_neg: 10
vector_bucket_md: 50000
vector_bucket_lg: 200000
-vector_min_count: 20
+vector_min_count: 150
vector_hash_count: 2
-vector_thread: 16
-vector_lr: 0.05
+vector_thread: 12
+vector_lr: 0.01

directories: ["software", "vectors"]
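The changed vars above are easiest to read next to a training call. A rough sketch using floret's Python bindings, which are modeled on the fastText API; the keyword names are assumptions based on that API, and the project itself runs the compiled `software/floret` binary via the training commands below, producing `vectors/nl_md.floret` and `vectors/nl_lg.floret`:

```python
# Rough sketch only: keyword names assumed from the fastText-style API that the
# floret Python bindings mirror; annotated with the corresponding project.yml vars.
import floret

model = floret.train_unsupervised(
    "/scratch/vectors/input/nl.txt",  # ${vars.vector_input_dir}/${vars.lang}.txt
    model="cbow",    # vector_model
    mode="floret",   # floret's hashed subword table instead of a plain word list
    hashCount=2,     # vector_hash_count
    bucket=50000,    # vector_bucket_md (200000 for the lg vectors)
    minn=4,          # vector_minn
    maxn=5,          # vector_maxn
    minCount=150,    # vector_min_count
    dim=300,         # vector_dim
    neg=10,          # vector_neg
    epoch=5,         # vector_epoch
    lr=0.01,         # vector_lr
    thread=12,       # vector_thread
)
model.save_model("vectors/nl_md.bin")  # the project's own commands export vectors/nl_md.floret
```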

@@ -304,8 +304,8 @@ commands:
- "${vars.tokenized_dir}/${vars.lang}_opensubtitles.txt"
- "${vars.tokenized_dir}/${vars.lang}_newscrawl_${vars.newscrawl_year}.txt"
- "${vars.tokenized_dir}/${vars.lang}_oscar_${vars.oscar_dataset_subset}.txt"
-outputs:
-- "${vars.vector_input_dir}/${vars.lang}.txt"
+# outputs:
+# - "${vars.vector_input_dir}/${vars.lang}.txt"

- name: "compile-floret"
help: "Compile floret"
@@ -334,7 +334,7 @@
-output vectors/${vars.lang}_md
deps:
- "software/floret"
- "${vars.vector_input_dir}/${vars.lang}.txt"
#- "${vars.vector_input_dir}/${vars.lang}.txt"
outputs:
- "vectors/${vars.lang}_md.floret"

@@ -358,7 +358,7 @@
-output vectors/${vars.lang}_lg
deps:
- "software/floret"
- "${vars.vector_input_dir}/${vars.lang}.txt"
#- "${vars.vector_input_dir}/${vars.lang}.txt"
outputs:
- "vectors/${vars.lang}_lg.floret"

6 changes: 3 additions & 3 deletions scripts/tokenize_resource.py
@@ -50,9 +50,9 @@ def main(

if input_file:
if max_texts > 0:
-texts = islice(open(input_file), max_texts)
+texts = islice(open(input_file, encoding="utf8"), max_texts)
else:
-texts = open(input_file)
+texts = open(input_file, encoding="utf8")
elif input_dataset:
dataset = load_dataset(
input_dataset,
@@ -66,7 +66,7 @@
else:
texts = (line["text"] for line in dataset)

-with open(output_file, "w") as output_fileh, Pool(processes=n_process) as pool:
+with open(output_file, "w", encoding="utf-8") as output_fileh, Pool(processes=n_process) as pool:
result = pool.imap(partial(tokenize_batch, nlp), chunked(texts, batch_size))
for lines in result:
output_fileh.writelines(lines)
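The encoding arguments added above matter because `open()` otherwise falls back to the platform's locale encoding (e.g. cp1252 on Windows), which can silently mangle Dutch diacritics. A self-contained sketch of the same read-tokenize-write pattern; `tokenize_batch` and `chunked` are simplified stand-ins for the script's own helpers:

```python
# Minimal sketch of the pattern in tokenize_resource.py: read UTF-8 text,
# tokenize batches in a process pool, write UTF-8 output.
from functools import partial
from itertools import islice
from multiprocessing import Pool

import spacy


def tokenize_batch(nlp, batch):
    # Stand-in for the script's tokenize_batch: one space-joined line of token
    # texts per input document.
    return [" ".join(tok.text for tok in doc) + "\n"
            for doc in nlp.pipe(line.strip() for line in batch)]


def chunked(iterable, size):
    # Stand-in for the script's chunked helper: yield lists of up to `size` items.
    it = iter(iterable)
    while batch := list(islice(it, size)):
        yield batch


if __name__ == "__main__":
    nlp = spacy.blank("nl")  # tokenizer-only pipeline
    with open("input.txt", encoding="utf8") as texts, \
            open("output.txt", "w", encoding="utf8") as out, \
            Pool(processes=4) as pool:
        for lines in pool.imap(partial(tokenize_batch, nlp), chunked(texts, 1000)):
            out.writelines(lines)
```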
