22
22
import gzip
23
23
import os
24
24
import string
25
- import sys
26
25
from collections import Counter
27
26
27
+ from nltk import data
28
+
28
29
29
30
# Character classes used when tokenizing/filtering words.
STRING_PUNCTUATION = tuple(string.punctuation)
DIGETS = tuple(string.digits)  # NOTE(review): typo for "DIGITS" — kept as-is; other code references this name
@@ -115,6 +116,22 @@ def build_word_frequency(filepath, language, output_path):
115
116
return word_frequency
116
117
117
118
119
def export_misfit_words(misfit_filepath, word_freq_filepath, word_frequency):
    """ Export the words removed during cleaning ("misfits")

        Loads the source word frequency json and writes every word that is
        no longer present in the cleaned word_frequency dict, one word per
        line, to misfit_filepath.

        Args:
            misfit_filepath (str): destination text file for the dropped words
            word_freq_filepath (str): path to the source word frequency json \
            (opened via load_file, so it may be gzip-compressed)
            word_frequency (dict): the cleaned word frequency mapping
    """
    with load_file(word_freq_filepath, 'utf-8') as f:
        source_word_frequency = json.load(f)

    # words present in the source but dropped by the cleaning pass
    misfitted_words = sorted(set(source_word_frequency) - set(word_frequency))

    # fixed: original wrote '\n ' (newline + stray space) after each word and
    # shadowed the `file` builtin; write once with an explicit encoding
    with open(misfit_filepath, 'w', encoding='utf-8') as fobj:
        fobj.write('\n'.join(misfitted_words))
        fobj.write('\n')
118
135
def clean_english (word_frequency , filepath_exclude , filepath_include ):
119
136
""" Clean an English word frequency list
120
137
@@ -576,20 +593,21 @@ def _parse_args():
576
593
577
594
parser = argparse .ArgumentParser (description = 'Build a new dictionary (word frequency) using the OpenSubtitles2018 project' )
578
595
parser .add_argument ("-l" , "--language" , required = True , help = "The language being built" , choices = ['en' , 'es' , 'de' , 'fr' , 'pt' , 'ru' ])
579
- parser .add_argument ("-p" , "--path" , help = "The path to the downloaded text file OR the saved word frequency json" )
580
- parser .add_argument ("-P" , "--parse_input" , action = "store_true" , help = "Add this if providing a text file to be parsed" )
596
+ parser .add_argument ("-f" , "--file-path" , help = "The path to the downloaded text file OR the saved word frequency json" )
597
+ parser .add_argument ("-p" , "--parse-input" , action = "store_true" , help = "Add this if providing a text file to be parsed" )
598
+ parser .add_argument ("-m" , "--misfit-file" , action = "store_true" , help = "Create file with words which was removed from dictionary" )
581
599
582
600
args = parser .parse_args ()
583
601
584
602
# validate that we have a path, if needed!
585
603
if args .parse_input :
586
- if not args .path :
604
+ if not args .file_path :
587
605
raise Exception ("A path is required if parsing a text file!" )
588
606
589
- if args .path :
590
- args .path = os .path .abspath (os .path .realpath (args .path ))
607
+ if args .file_path :
608
+ args .file_path = os .path .abspath (os .path .realpath (args .file_path ))
591
609
592
- if not os .path .exists (args .path ):
610
+ if not os .path .exists (args .file_path ):
593
611
raise FileNotFoundError ("File Not Found. A valid path is required if parsing a text file!" )
594
612
595
613
return args
# Resolve the directories used below relative to this script's location.
script_path = os.path.dirname(os.path.abspath(__file__))
module_path = os.path.abspath("{}/../".format(script_path))
resources_path = os.path.abspath("{}/resources/".format(module_path))
# single data_path keeps the exclude/include/misfit file paths consistent
data_path = os.path.abspath("{}/data/".format(script_path))
exclude_filepath = os.path.abspath("{}/{}_exclude.txt".format(data_path, args.language))
include_filepath = os.path.abspath("{}/{}_include.txt".format(data_path, args.language))

print(script_path)
print(module_path)
# Should we re-process a file?
if args.parse_input:
    # fixed: "{}_full .json" contained a stray space, producing a filename
    # ("en_full .json") inconsistent with the "{}_full.json.gz" branch below
    json_path = os.path.join(script_path, "data", "{}_full.json".format(args.language))
    print(json_path)
    word_frequency = build_word_frequency(args.file_path, args.language, json_path)
else:
    json_path = os.path.join(script_path, "data", "{}_full.json.gz".format(args.language))
    print(json_path)
    with load_file(json_path, 'utf-8') as f:
        word_frequency = json.load(f)

# create include and exclude files before cleaning; 'a+' creates each file
# when missing without truncating any existing content
for filepath in (include_filepath, exclude_filepath):
    with open(filepath, 'a+'):
        pass

# clean up the dictionary
if args.language == "en":
    word_frequency = clean_english(word_frequency, exclude_filepath, include_filepath)
@@ -637,4 +661,9 @@ def _parse_args():
637
661
word_frequency = clean_russian (word_frequency , exclude_filepath , include_filepath )
638
662
639
663
# export word frequency for review!
word_frequency_path = os.path.join(script_path, "{}.json".format(args.language))
export_word_frequency(word_frequency_path, word_frequency)

# optionally dump the words that cleaning removed, for manual inspection
if args.misfit_file:
    misfit_filepath = os.path.abspath("{}/{}_misfit.txt".format(data_path, args.language))
    export_misfit_words(misfit_filepath, json_path, word_frequency)
0 commit comments