
Commit 2620839

Merge commit: 2 parents cd44263 + 9713e3d

1 file changed: +41 -12 lines changed


scripts/build_dictionary.py

@@ -22,9 +22,10 @@
 import gzip
 import os
 import string
-import sys
 from collections import Counter
 
+from nltk import data
+
 
 STRING_PUNCTUATION = tuple(string.punctuation)
 DIGETS = tuple(string.digits)
@@ -115,6 +116,22 @@ def build_word_frequency(filepath, language, output_path):
     return word_frequency
 
 
+def export_misfit_words(misfit_filepath, word_freq_filepath, word_frequency):
+    with load_file(word_freq_filepath, 'utf-8') as f:
+        source_word_frequency = json.load(f)
+
+    source_words = set(source_word_frequency.keys())
+    final_words = set(word_frequency.keys())
+
+    misfitted_words = source_words.difference(final_words)
+    misfitted_words = sorted(list(misfitted_words))
+
+    with open(misfit_filepath, 'w+') as file:
+        for word in misfitted_words:
+            file.write(word)
+            file.write('\n')
+
+
 def clean_english(word_frequency, filepath_exclude, filepath_include):
     """ Clean an English word frequency list
 
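
Note on the new helper: export_misfit_words() reloads the pre-clean word frequency JSON, takes the set difference of its keys against the cleaned dictionary, and writes the dropped ("misfit") words one per line. A minimal self-contained sketch of the same idea, using made-up data and a hypothetical output path in place of the script's load_file() helper and real files:

    # Illustrative stand-ins for the script's pre-clean and post-clean dictionaries.
    source_word_frequency = {"color": 500, "colour": 120, "teh": 3}   # before cleaning
    word_frequency = {"color": 500, "colour": 120}                    # after cleaning

    # Words present before cleaning but missing afterwards, sorted for a stable file.
    misfitted_words = sorted(set(source_word_frequency) - set(word_frequency))

    with open("en_misfit.txt", "w") as fobj:                          # hypothetical output path
        for word in misfitted_words:
            fobj.write(word + "\n")                                   # writes "teh"
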
@@ -576,20 +593,21 @@ def _parse_args():
 
     parser = argparse.ArgumentParser(description='Build a new dictionary (word frequency) using the OpenSubtitles2018 project')
     parser.add_argument("-l", "--language", required=True, help="The language being built", choices=['en', 'es', 'de', 'fr', 'pt', 'ru'])
-    parser.add_argument("-p", "--path", help="The path to the downloaded text file OR the saved word frequency json")
-    parser.add_argument("-P", "--parse_input", action="store_true", help="Add this if providing a text file to be parsed")
+    parser.add_argument("-f", "--file-path", help="The path to the downloaded text file OR the saved word frequency json")
+    parser.add_argument("-p", "--parse-input", action="store_true", help="Add this if providing a text file to be parsed")
+    parser.add_argument("-m", "--misfit-file", action="store_true", help="Create a file with the words that were removed from the dictionary")
 
     args = parser.parse_args()
 
     # validate that we have a path, if needed!
     if args.parse_input:
-        if not args.path:
+        if not args.file_path:
             raise Exception("A path is required if parsing a text file!")
 
-    if args.path:
-        args.path = os.path.abspath(os.path.realpath(args.path))
+    if args.file_path:
+        args.file_path = os.path.abspath(os.path.realpath(args.file_path))
 
-        if not os.path.exists(args.path):
+        if not os.path.exists(args.file_path):
             raise FileNotFoundError("File Not Found. A valid path is required if parsing a text file!")
 
     return args
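
A note for readers on the renamed options: argparse exposes hyphenated option strings as attributes with underscores, so --file-path, --parse-input, and --misfit-file become args.file_path, args.parse_input, and args.misfit_file, which is what the validation code above relies on. A small self-contained illustration (the parser and argv here are made up for the example, not part of the script):

    import argparse

    parser = argparse.ArgumentParser(description="naming demo")       # illustrative parser only
    parser.add_argument("-f", "--file-path")
    parser.add_argument("-m", "--misfit-file", action="store_true")

    args = parser.parse_args(["--file-path", "en_full.txt", "-m"])    # hypothetical argv
    print(args.file_path)    # "en_full.txt" -- hyphens become underscores on the namespace
    print(args.misfit_file)  # True
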
@@ -602,8 +620,9 @@ def _parse_args():
     script_path = os.path.dirname(os.path.abspath(__file__))
     module_path = os.path.abspath("{}/../".format(script_path))
     resources_path = os.path.abspath("{}/resources/".format(module_path))
-    exclude_filepath = os.path.abspath("{}/data/{}_exclude.txt".format(script_path, args.language))
-    include_filepath = os.path.abspath("{}/data/{}_include.txt".format(script_path, args.language))
+    data_path = os.path.abspath("{}/data/".format(script_path))
+    exclude_filepath = os.path.abspath("{}/{}_exclude.txt".format(data_path, args.language))
+    include_filepath = os.path.abspath("{}/{}_include.txt".format(data_path, args.language))
 
     print(script_path)
     print(module_path)
@@ -613,15 +632,20 @@ def _parse_args():
 
     # Should we re-process a file?
     if args.parse_input:
-        json_path = os.path.join(script_path, "data", "{}.json".format(args.language))
+        json_path = os.path.join(script_path, "data", "{}_full.json".format(args.language))
         print(json_path)
-        word_frequency = build_word_frequency(args.path, args.language, json_path)
+        word_frequency = build_word_frequency(args.file_path, args.language, json_path)
     else:
         json_path = os.path.join(script_path, "data", "{}_full.json.gz".format(args.language))
         print(json_path)
         with load_file(json_path, 'utf-8') as f:
             word_frequency = json.load(f)
 
+    # create include and exclude files before cleaning
+    for filepath in (include_filepath, exclude_filepath):
+        with open(filepath, 'a+'):
+            pass
+
     # clean up the dictionary
     if args.language == "en":
         word_frequency = clean_english(word_frequency, exclude_filepath, include_filepath)
@@ -637,4 +661,9 @@ def _parse_args():
         word_frequency = clean_russian(word_frequency, exclude_filepath, include_filepath)
 
     # export word frequency for review!
-    export_word_frequency(os.path.join(script_path, "{}.json".format(args.language)), word_frequency)
+    word_frequency_path = os.path.join(script_path, "{}.json".format(args.language))
+    export_word_frequency(word_frequency_path, word_frequency)
+
+    if args.misfit_file:
+        misfit_filepath = os.path.abspath("{}/{}_misfit.txt".format(data_path, args.language))
+        export_misfit_words(misfit_filepath, json_path, word_frequency)
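
Once a build is run with the new -m flag, the misfit file is a plain word list written under the script's data directory ({language}_misfit.txt), so reviewing it needs nothing special. A small sketch, assuming an English build has already produced data/en_misfit.txt:

    # Read back the misfit list written by export_misfit_words() and summarize it.
    with open("data/en_misfit.txt", encoding="utf-8") as fobj:
        removed_words = [line.strip() for line in fobj if line.strip()]

    print("{} words were removed during cleaning".format(len(removed_words)))
    print(removed_words[:10])   # first few removed words, for a quick sanity check
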
