diff --git a/requirements.txt b/requirements.txt index 9033972..c32c905 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ -git+https://github.com/huridocs/pdf-document-layout-analysis@a834f712edf0fc0a4660de0270f3efbb08e0031c -git+https://github.com/huridocs/queue-processor@1875372bf9f6dcd1995a32c4e50ff92aa45f9ea8 +git+https://github.com/huridocs/pdf-document-layout-analysis@6262a95d1dd09055556a8e081eacfe02dd308b89 +git+https://github.com/huridocs/queue-processor@716ddf050c59035583b0852dc0b78a7860ce5c05 slugify==0.0.1 python-Levenshtein==0.25.1 tdda==2.0.9 @@ -15,7 +15,7 @@ mongomock==4.1.2 fasttext-wheel==0.9.2 rich==13.7.1 joblib==1.4.0 -tqdm==4.66.2 +tqdm==4.66.3 rapidfuzz==3.8.1 sentry_sdk==1.44.0 pymongo==4.6.3 diff --git a/src/Extractor.py b/src/Extractor.py index 17524b7..c0189d6 100755 --- a/src/Extractor.py +++ b/src/Extractor.py @@ -186,9 +186,11 @@ def get_suggestions(self) -> list[Suggestion]: if extractor_instance.get_name() != extractor_name: continue - message = f"Using {extractor_instance.get_name()} to calculate {len(prediction_samples)} suggestions" + suggestions = extractor_instance.get_suggestions(prediction_samples) + suggestions = [suggestion.mark_suggestion_if_empty() for suggestion in suggestions] + message = f"Using {extractor_instance.get_name()} to calculate {len(suggestions)} suggestions" send_logs(self.extraction_identifier, message) - return extractor_instance.get_suggestions(prediction_samples) + return suggestions send_logs(self.extraction_identifier, f"No extractor available", Severity.error) return [] diff --git a/src/data/LabeledData.py b/src/data/LabeledData.py index 92ad5c0..4a7a5aa 100644 --- a/src/data/LabeledData.py +++ b/src/data/LabeledData.py @@ -11,6 +11,7 @@ class LabeledData(BaseModel): entity_name: str = "" language_iso: str = "" label_text: str = "" + empty_value: bool = False values: list[Option] = list() source_text: str = "" page_width: float = 0 diff --git a/src/data/Suggestion.py b/src/data/Suggestion.py index bb46fe5..799bcf9 100644 --- a/src/data/Suggestion.py +++ b/src/data/Suggestion.py @@ -17,11 +17,24 @@ class Suggestion(BaseModel): xml_file_name: str = "" entity_name: str = "" text: str = "" + empty_suggestion: bool = False values: list[Option] = list() segment_text: str = "" page_number: int = 1 segments_boxes: list[SegmentBox] = list() + def is_empty(self): + if self.empty_suggestion: + return True + + return not self.text and not self.values + + def mark_suggestion_if_empty(self): + if self.is_empty(): + self.empty_suggestion = True + + return self + def to_dict(self): suggestion_dict = self.model_dump() suggestion_dict["segments_boxes"] = [x.to_dict() for x in self.segments_boxes] diff --git a/src/extractors/ToTextExtractor.py b/src/extractors/ToTextExtractor.py index 012f85f..6a25f6f 100644 --- a/src/extractors/ToTextExtractor.py +++ b/src/extractors/ToTextExtractor.py @@ -10,13 +10,6 @@ from data.Suggestion import Suggestion from extractors.ExtractorBase import ExtractorBase from extractors.ToTextExtractorMethod import ToTextExtractorMethod -from extractors.text_to_text_extractor.methods.DateParserMethod import DateParserMethod -from extractors.text_to_text_extractor.methods.DateParserWithBreaksMethod import DateParserWithBreaksMethod -from extractors.text_to_text_extractor.methods.InputWithoutSpaces import InputWithoutSpaces -from extractors.text_to_text_extractor.methods.MT5TrueCaseEnglishSpanishMethod import MT5TrueCaseEnglishSpanishMethod -from extractors.text_to_text_extractor.methods.RegexMethod import RegexMethod -from extractors.text_to_text_extractor.methods.RegexSubtractionMethod import RegexSubtractionMethod -from extractors.text_to_text_extractor.methods.SameInputOutputMethod import SameInputOutputMethod from send_logs import send_logs RETRAIN_SAMPLES_THRESHOLD = 250 @@ -115,8 +108,8 @@ def get_best_method(self, extraction_data: ExtractionData): try: performance = method_instance.performance(training_set, test_set) except Exception as e: - message = f"Error checking {method_instance.get_name()}: {e}" - send_logs(self.extraction_identifier, message, Severity.error) + message = f"Error checking {method_instance.get_name()}" + send_logs(self.extraction_identifier, message, Severity.error, e) performance = 0 performance_log += f"{method_instance.get_name()}: {round(performance, 2)}%\n" send_logs(self.extraction_identifier, f"Performance {method_instance.get_name()}: {performance}%") diff --git a/src/extractors/pdf_to_multi_option_extractor/PdfToMultiOptionExtractor.py b/src/extractors/pdf_to_multi_option_extractor/PdfToMultiOptionExtractor.py index 636dbdd..a15cb01 100644 --- a/src/extractors/pdf_to_multi_option_extractor/PdfToMultiOptionExtractor.py +++ b/src/extractors/pdf_to_multi_option_extractor/PdfToMultiOptionExtractor.py @@ -211,7 +211,7 @@ def get_method_performance( performance = method.get_performance(train_set, test_set) except Exception as e: severity = Severity.error if method.REPORT_ERRORS else Severity.info - send_logs(self.extraction_identifier, f"Error checking {method.get_name()}: {e}", severity) + send_logs(self.extraction_identifier, f"Error checking {method.get_name()}", severity, e) performance = 0 self.reset_extraction_data(train_set) diff --git a/src/extractors/pdf_to_multi_option_extractor/labeled_data/countries_in_favor_empty_labels/labels.json b/src/extractors/pdf_to_multi_option_extractor/labeled_data/countries_in_favor_empty_labels/labels.json new file mode 100644 index 0000000..a3a7802 --- /dev/null +++ b/src/extractors/pdf_to_multi_option_extractor/labeled_data/countries_in_favor_empty_labels/labels.json @@ -0,0 +1,2179 @@ +{ + "1695895074941fcy09egfwqw": [], + "1695895075594cwfvpr2xyab": [], + "1695895077479zge00r1yz2": [], + "1695895080076oprcrymxh0c": [], + "1695895080534nw6d5cxeoq": [], + "1695895086084fllr17uf4pe": [], + "1695895087330ebln6qfxjd7": [], + "1695895103069zwt3l908l": [], + "1695895106206nyuwt7z4my": [], + "1695895106939n2bbrf1l7ee": [], + "rightsdocs_res_45_20": [ + "Argentina", + "Australia", + "Austria", + "Bahamas", + "Brazil", + "Bulgaria", + "Chile", + "Czechia", + "Denmark", + "Fiji", + "Germany", + "Italy", + "Japan", + "Korea, Republic of", + "Marshall Islands", + "Netherlands", + "Peru", + "Poland", + "Slovakia", + "Spain", + "Ukraine", + "Uruguay" + ], + "rightsdocs_res_s25_1": [ + "Albania", + "Belgium", + "Botswana", + "El Salvador", + "France", + "Georgia", + "Germany", + "Korea, Republic of", + "Latvia", + "Maldives", + "Mexico", + "Mongolia", + "Morocco", + "Netherlands", + "North Macedonia", + "Portugal", + "Qatar", + "Saudi Arabia", + "Slovenia", + "Switzerland", + "Togo", + "United Arab Emirates", + "United Kingdom", + "Côte d'Ivoire" + ], + "rightsdocs_res_45_19": [ + "Argentina", + "Armenia", + "Australia", + "Austria", + "Bahamas", + "Brazil", + "Bulgaria", + "Chile", + "Czechia", + "Denmark", + "Fiji", + "Germany", + "Italy", + "Japan", + "Korea, Republic of", + "Marshall Islands", + "Mexico", + "Netherlands", + "Peru", + "Poland", + "Slovakia", + "Spain", + "Ukraine", + "Uruguay" + ], + "rightsdocs_res_43_10": [ + "Angola", + "Argentina", + "Bahrain", + "Bangladesh", + "Burkina Faso", + "Cameroon", + "Chile", + "Congo, the Democratic Republic of the", + "Eritrea", + "Fiji", + "India", + "Indonesia", + "Libya", + "Mauritania", + "Namibia", + "Nepal", + "Nigeria", + "Pakistan", + "Philippines", + "Qatar", + "Senegal", + "Somalia", + "Sudan", + "Togo", + "Uruguay", + "Venezuela, Bolivarian Republic of" + ], + "rightsdocs_res_43_15": [ + "Angola", + "Argentina", + "Armenia", + "Bahamas", + "Bahrain", + "Bangladesh", + "Burkina Faso", + "Cameroon", + "Congo, the Democratic Republic of the", + "Eritrea", + "Fiji", + "India", + "Indonesia", + "Namibia", + "Nepal", + "Nigeria", + "Pakistan", + "Philippines", + "Qatar", + "Senegal", + "Somalia", + "Sudan", + "Togo", + "Uruguay", + "Venezuela, Bolivarian Republic of" + ], + "rightsdocs_res_45_6": [ + "Afghanistan", + "Angola", + "Argentina", + "Bahamas", + "Bahrain", + "Bangladesh", + "Burkina Faso", + "Cameroon", + "Congo, the Democratic Republic of the", + "Eritrea", + "Fiji", + "India", + "Indonesia", + "Libya", + "Mauritania", + "Namibia", + "Nepal", + "Nigeria", + "Pakistan", + "Peru", + "Philippines", + "Qatar", + "Senegal", + "Somalia", + "Sudan", + "Togo", + "Venezuela, Bolivarian Republic of" + ], + "rightsdocs_res_42_9": [ + "Angola", + "Argentina", + "Bahamas", + "Bahrain", + "Bangladesh", + "Burkina Faso", + "Cameroon", + "Chile", + "China", + "Cuba", + "Egypt", + "Eritrea", + "Fiji", + "India", + "Iraq", + "Nepal", + "Nigeria", + "Pakistan", + "Peru", + "Philippines", + "Qatar", + "Rwanda", + "Saudi Arabia", + "Senegal", + "Somalia", + "South Africa", + "Togo", + "Tunisia", + "Uruguay" + ], + "rightsdocs_res_11926": [ + "Angola", + "Azerbaijan", + "Bangladesh", + "Bolivia, Plurinational State of", + "Brazil", + "Cameroon", + "China", + "Cuba", + "Djibouti", + "Egypt", + "Gabon", + "Ghana", + "Guatemala", + "India", + "Indonesia", + "Jordan", + "Madagascar", + "Malaysia", + "Mali", + "Mauritius", + "Mexico", + "Nicaragua", + "Nigeria", + "Pakistan", + "Peru", + "Philippines", + "Qatar", + "Russian Federation", + "Saudi Arabia", + "Senegal", + "South Africa", + "Sri Lanka", + "Uruguay", + "Zambia" + ], + "rightsdocs_res_3388": [ + "Algeria", + "Argentina", + "Benin", + "Botswana", + "Brazil", + "Burkina Faso", + "China", + "Congo", + "Costa Rica", + "Côte d'Ivoire", + "Cuba", + "Ethiopia", + "Gabon", + "India", + "Indonesia", + "Kazakhstan", + "Kenya", + "Kuwait", + "Maldives", + "Morocco", + "Namibia", + "Pakistan", + "Philippines", + "Russian Federation", + "Saudi Arabia", + "Sierra Leone", + "South Africa", + "United Arab Emirates", + "Venezuela, Bolivarian Republic of", + "Viet Nam" + ], + "rightsdocs_res_45_15": [ + "Argentina", + "Australia", + "Austria", + "Bahamas", + "Brazil", + "Bulgaria", + "Chile", + "Czechia", + "Denmark", + "Fiji", + "Germany", + "Italy", + "Korea, Republic of", + "Marshall Islands", + "Mexico", + "Netherlands", + "Peru", + "Poland", + "Qatar", + "Slovakia", + "Spain", + "Uruguay" + ], + "rightsdocs_res_12067": [ + "Angola", + "Azerbaijan", + "Bangladesh", + "Bolivia, Plurinational State of", + "Brazil", + "Cameroon", + "China", + "Cuba", + "Djibouti", + "Egypt", + "Gabon", + "Ghana", + "Guatemala", + "India", + "Indonesia", + "Jordan", + "Madagascar", + "Malaysia", + "Mali", + "Mauritius", + "Mexico", + "Nicaragua", + "Nigeria", + "Pakistan", + "Peru", + "Philippines", + "Qatar", + "Russian Federation", + "Saudi Arabia", + "Senegal", + "South Africa", + "Sri Lanka", + "Uruguay", + "Zambia" + ], + "rightsdocs_res_43_2": [ + "Afghanistan", + "Argentina", + "Australia", + "Austria", + "Bahamas", + "Brazil", + "Bulgaria", + "Chile", + "Czechia", + "Denmark", + "Fiji", + "Germany", + "Italy", + "Japan", + "Korea, Republic of", + "Marshall Islands", + "Mexico", + "Netherlands", + "Peru", + "Poland", + "Slovakia", + "Spain", + "Ukraine", + "Uruguay" + ], + "rightsdocs_res_42_28": [ + "Afghanistan", + "Angola", + "Argentina", + "Australia", + "Austria", + "Bahamas", + "Bangladesh", + "Brazil", + "Bulgaria", + "Burkina Faso", + "Chile", + "Congo, the Democratic Republic of the", + "Croatia", + "Czechia", + "Denmark", + "Fiji", + "Hungary", + "Iceland", + "India", + "Iraq", + "Italy", + "Japan", + "Mexico", + "Nepal", + "Pakistan", + "Peru", + "Rwanda", + "Senegal", + "Slovakia", + "South Africa", + "Spain", + "Togo", + "Tunisia", + "Ukraine", + "United Kingdom", + "Uruguay" + ], + "rightsdocs_res_874": [ + "Algeria", + "Argentina", + "Bangladesh", + "Bolivia, Plurinational State of", + "Botswana", + "Brazil", + "China", + "Congo", + "Côte d'Ivoire", + "Cuba", + "El Salvador", + "Ethiopia", + "Gabon", + "Ghana", + "India", + "Indonesia", + "Kazakhstan", + "Kenya", + "Maldives", + "Morocco", + "Namibia", + "Nigeria", + "Pakistan", + "Paraguay", + "Qatar", + "Russian Federation", + "Saudi Arabia", + "Sierra Leone", + "South Africa", + "United Arab Emirates", + "Venezuela, Bolivarian Republic of", + "Viet Nam" + ], + "rightsdocs_res_42_24": [ + "Angola", + "Argentina", + "Australia", + "Austria", + "Brazil", + "Bulgaria", + "Burkina Faso", + "Chile", + "Croatia", + "Czechia", + "Denmark", + "Fiji", + "Hungary", + "Iceland", + "Italy", + "Mexico", + "Nepal", + "Peru", + "Rwanda", + "Slovakia", + "South Africa", + "Spain", + "Togo", + "Ukraine", + "United Kingdom", + "Uruguay" + ], + "rightsdocs_res_41_2": [ + "Argentina", + "Australia", + "Austria", + "Bahamas", + "Bulgaria", + "Croatia", + "Czechia", + "Denmark", + "Fiji", + "Iceland", + "Italy", + "Mexico", + "Peru", + "Slovakia", + "Spain", + "Ukraine", + "United Kingdom", + "Uruguay" + ], + "rightsdocs_res_3570": [ + "Argentina", + "Austria", + "Benin", + "Botswana", + "Brazil", + "Burkina Faso", + "Chile", + "Costa Rica", + "Côte d'Ivoire", + "Czechia", + "Estonia", + "France", + "Germany", + "Ireland", + "Italy", + "Japan", + "Kazakhstan", + "Korea, Republic of", + "Maldives", + "Mexico", + "Montenegro", + "Morocco", + "North Macedonia", + "Peru", + "Philippines", + "Romania", + "Sierra Leone", + "United Arab Emirates", + "United Kingdom", + "United States" + ], + "rightsdocs_res_3352": [ + "Algeria", + "Argentina", + "Austria", + "Benin", + "Botswana", + "Brazil", + "Burkina Faso", + "Chile", + "China", + "Congo", + "Costa Rica", + "Côte d'Ivoire", + "Cuba", + "Czechia", + "Estonia", + "Ethiopia", + "France", + "Gabon", + "Germany", + "India", + "Indonesia", + "Ireland", + "Italy", + "Japan", + "Kazakhstan", + "Kenya", + "Korea, Republic of", + "Kuwait", + "Maldives", + "Mexico", + "Montenegro", + "Morocco", + "Namibia", + "North Macedonia", + "Pakistan", + "Peru", + "Philippines", + "Romania", + "Russian Federation", + "Saudi Arabia", + "Sierra Leone", + "South Africa", + "United Arab Emirates", + "United Kingdom", + "Venezuela, Bolivarian Republic of", + "Viet Nam" + ], + "rightsdocs_res_3522": [ + "Algeria", + "Argentina", + "Austria", + "Benin", + "Botswana", + "Brazil", + "Burkina Faso", + "Chile", + "China", + "Congo", + "Costa Rica", + "Côte d'Ivoire", + "Cuba", + "Czechia", + "Estonia", + "Ethiopia", + "France", + "Gabon", + "Germany", + "India", + "Indonesia", + "Ireland", + "Italy", + "Japan", + "Kazakhstan", + "Kenya", + "Korea, Republic of", + "Kuwait", + "Maldives", + "Mexico", + "Montenegro", + "Morocco", + "Namibia", + "North Macedonia", + "Pakistan", + "Peru", + "Philippines", + "Romania", + "Russian Federation", + "Saudi Arabia", + "Sierra Leone", + "South Africa", + "United Arab Emirates", + "United Kingdom", + "Venezuela, Bolivarian Republic of", + "Viet Nam" + ], + "rightsdocs_res_41_25": [ + "Australia", + "Austria", + "Bahamas", + "Bulgaria", + "Chile", + "Croatia", + "Czechia", + "Denmark", + "Fiji", + "Hungary", + "Iceland", + "Italy", + "Japan", + "Mexico", + "Peru", + "Rwanda", + "Slovakia", + "Spain", + "Ukraine", + "United Kingdom" + ], + "rightsdocs_res_3540": [ + "Algeria", + "Argentina", + "Austria", + "Benin", + "Botswana", + "Brazil", + "Burkina Faso", + "Chile", + "China", + "Congo", + "Costa Rica", + "Côte d'Ivoire", + "Cuba", + "Czechia", + "Estonia", + "Ethiopia", + "France", + "Gabon", + "Germany", + "India", + "Indonesia", + "Ireland", + "Italy", + "Japan", + "Kazakhstan", + "Kenya", + "Korea, Republic of", + "Kuwait", + "Maldives", + "Mexico", + "Montenegro", + "Morocco", + "Namibia", + "North Macedonia", + "Pakistan", + "Peru", + "Philippines", + "Romania", + "Russian Federation", + "Saudi Arabia", + "Sierra Leone", + "South Africa", + "United Arab Emirates", + "United Kingdom", + "Venezuela, Bolivarian Republic of", + "Viet Nam" + ], + "rightsdocs_res_42_3": [ + "Afghanistan", + "Argentina", + "Australia", + "Austria", + "Bahamas", + "Bahrain", + "Bangladesh", + "Brazil", + "Bulgaria", + "Burkina Faso", + "Chile", + "Croatia", + "Czechia", + "Denmark", + "Egypt", + "Eritrea", + "Fiji", + "Hungary", + "Iceland", + "Iraq", + "Italy", + "Mexico", + "Nigeria", + "Pakistan", + "Peru", + "Qatar", + "Rwanda", + "Saudi Arabia", + "Senegal", + "Slovakia", + "Somalia", + "South Africa", + "Spain", + "Togo", + "Tunisia", + "United Kingdom", + "Uruguay" + ], + "rightsdocs_res_45_31": [ + "Afghanistan", + "Angola", + "Argentina", + "Armenia", + "Australia", + "Austria", + "Bahamas", + "Brazil", + "Bulgaria", + "Burkina Faso", + "Chile", + "Congo, the Democratic Republic of the", + "Czechia", + "Denmark", + "Fiji", + "Germany", + "Italy", + "Japan", + "Korea, Republic of", + "Marshall Islands", + "Mexico", + "Nepal", + "Netherlands", + "Peru", + "Poland", + "Qatar", + "Senegal", + "Slovakia", + "Spain", + "Togo", + "Ukraine", + "Uruguay" + ], + "rightsdocs_res_45_14": [ + "Afghanistan", + "Angola", + "Bahamas", + "Bahrain", + "Bangladesh", + "Burkina Faso", + "Cameroon", + "Congo, the Democratic Republic of the", + "Eritrea", + "Fiji", + "India", + "Indonesia", + "Libya", + "Mauritania", + "Namibia", + "Nepal", + "Nigeria", + "Pakistan", + "Philippines", + "Qatar", + "Senegal", + "Somalia", + "Sudan", + "Togo", + "Venezuela, Bolivarian Republic of" + ], + "rightsdocs_res_42_26": [ + "Argentina", + "Australia", + "Austria", + "Bahamas", + "Brazil", + "Bulgaria", + "Chile", + "Croatia", + "Czechia", + "Denmark", + "Fiji", + "Hungary", + "Iceland", + "Italy", + "Japan", + "Mexico", + "Peru", + "Rwanda", + "Slovakia", + "Spain", + "Ukraine", + "United Kingdom", + "Uruguay" + ], + "rightsdocs_res_3035": [ + "Algeria", + "Argentina", + "Benin", + "Botswana", + "Brazil", + "Burkina Faso", + "China", + "Congo", + "Costa Rica", + "Côte d'Ivoire", + "Cuba", + "Ethiopia", + "Gabon", + "India", + "Indonesia", + "Kazakhstan", + "Kenya", + "Kuwait", + "Maldives", + "Morocco", + "Namibia", + "Pakistan", + "Philippines", + "Russian Federation", + "Saudi Arabia", + "Sierra Leone", + "South Africa", + "United Arab Emirates", + "Venezuela, Bolivarian Republic of", + "Viet Nam" + ], + "rightsdocs_res_12001": [ + "Angola", + "Azerbaijan", + "Bangladesh", + "Bolivia, Plurinational State of", + "Brazil", + "China", + "Cuba", + "Djibouti", + "Egypt", + "Gabon", + "Ghana", + "India", + "Indonesia", + "Jordan", + "Malaysia", + "Mali", + "Mauritius", + "Mexico", + "Nicaragua", + "Nigeria", + "Pakistan", + "Peru", + "Philippines", + "Qatar", + "Russian Federation", + "Saudi Arabia", + "Senegal", + "South Africa", + "Sri Lanka", + "Uruguay", + "Zambia" + ], + "rightsdocs_res_45_21": [ + "Argentina", + "Australia", + "Austria", + "Bahamas", + "Brazil", + "Bulgaria", + "Chile", + "Czechia", + "Denmark", + "Fiji", + "Germany", + "Italy", + "Japan", + "Korea, Republic of", + "Libya", + "Marshall Islands", + "Mexico", + "Netherlands", + "Peru", + "Poland", + "Qatar", + "Slovakia", + "Somalia", + "Spain", + "Togo", + "Ukraine", + "Uruguay" + ], + "rightsdocs_res_2898": [ + "Algeria", + "Argentina", + "Benin", + "Brazil", + "Burkina Faso", + "Chile", + "China", + "Congo", + "Costa Rica", + "Côte d'Ivoire", + "Cuba", + "Ethiopia", + "Gabon", + "India", + "Indonesia", + "Kazakhstan", + "Kenya", + "Morocco", + "Namibia", + "Pakistan", + "Peru", + "Philippines", + "Russian Federation", + "Saudi Arabia", + "Sierra Leone", + "South Africa", + "United Arab Emirates", + "Venezuela, Bolivarian Republic of", + "Viet Nam" + ], + "rightsdocs_res_3011": [ + "Algeria", + "Argentina", + "Benin", + "Botswana", + "Brazil", + "Burkina Faso", + "Chile", + "China", + "Congo", + "Costa Rica", + "Côte d'Ivoire", + "Cuba", + "Ethiopia", + "Gabon", + "India", + "Indonesia", + "Kazakhstan", + "Kenya", + "Kuwait", + "Maldives", + "Mexico", + "Morocco", + "Namibia", + "Pakistan", + "Peru", + "Philippines", + "Russian Federation", + "Saudi Arabia", + "Sierra Leone", + "South Africa", + "United Arab Emirates", + "Venezuela, Bolivarian Republic of", + "Viet Nam" + ], + "rightsdocs_res_12928": [ + "Algeria", + "Argentina", + "Brazil", + "Chile", + "China", + "Congo", + "Costa Rica", + "Côte d'Ivoire", + "Cuba", + "Ethiopia", + "India", + "Indonesia", + "Kazakhstan", + "Kenya", + "Kuwait", + "Maldives", + "Mexico", + "Morocco", + "Namibia", + "Pakistan", + "Peru", + "Philippines", + "Russian Federation", + "Saudi Arabia", + "Sierra Leone", + "South Africa", + "United Arab Emirates", + "Venezuela, Bolivarian Republic of", + "Viet Nam" + ], + "rightsdocs_res_44_19": [ + "Argentina", + "Australia", + "Austria", + "Brazil", + "Bulgaria", + "Chile", + "Czechia", + "Denmark", + "Fiji", + "Germany", + "Italy", + "Japan", + "Korea, Republic of", + "Libya", + "Marshall Islands", + "Mexico", + "Netherlands", + "Peru", + "Poland", + "Slovakia", + "Spain", + "Ukraine" + ], + "rightsdocs_res_41_3": [ + "Afghanistan", + "Angola", + "Bahamas", + "Bahrain", + "Bangladesh", + "Burkina Faso", + "Cameroon", + "China", + "Congo, the Democratic Republic of the", + "Cuba", + "Egypt", + "Eritrea", + "Fiji", + "India", + "Iraq", + "Nepal", + "Nigeria", + "Pakistan", + "Philippines", + "Qatar", + "Rwanda", + "Saudi Arabia", + "Senegal", + "Somalia", + "South Africa", + "Togo", + "Tunisia", + "Uruguay" + ], + "rightsdocs_res_43_3": [ + "Afghanistan", + "Angola", + "Argentina", + "Armenia", + "Bahrain", + "Bangladesh", + "Burkina Faso", + "Chile", + "Eritrea", + "Indonesia", + "Libya", + "Mauritania", + "Mexico", + "Namibia", + "Nigeria", + "Pakistan", + "Peru", + "Qatar", + "Senegal", + "Somalia", + "Sudan", + "Venezuela, Bolivarian Republic of" + ], + "1695895091619szj1a274hdq": [], + "1695895096917lo3dqcniefh": [], + "1695895098672drk2igpbu4m": [], + "rightsdocs_res_3546": [ + "Algeria", + "Argentina", + "Botswana", + "Brazil", + "Burkina Faso", + "Chile", + "China", + "Congo", + "Costa Rica", + "Cuba", + "Ethiopia", + "India", + "Indonesia", + "Kazakhstan", + "Kuwait", + "Mexico", + "Morocco", + "Namibia", + "Pakistan", + "Peru", + "Philippines", + "Russian Federation", + "Sierra Leone", + "South Africa", + "United Arab Emirates", + "Venezuela, Bolivarian Republic of", + "Viet Nam" + ], + "rightsdocs_res_42_23": [ + "Afghanistan", + "Angola", + "Bahamas", + "Bahrain", + "Bangladesh", + "Burkina Faso", + "Cameroon", + "China", + "Congo, the Democratic Republic of the", + "Cuba", + "Egypt", + "Eritrea", + "Fiji", + "India", + "Iraq", + "Nepal", + "Nigeria", + "Pakistan", + "Philippines", + "Qatar", + "Rwanda", + "Saudi Arabia", + "Senegal", + "Somalia", + "South Africa", + "Togo", + "Tunisia" + ], + "rightsdocs_res_859": [ + "Algeria", + "Argentina", + "Bangladesh", + "Bolivia, Plurinational State of", + "Botswana", + "Brazil", + "China", + "Congo", + "Côte d'Ivoire", + "Cuba", + "El Salvador", + "Ethiopia", + "Gabon", + "Ghana", + "India", + "Indonesia", + "Kazakhstan", + "Kenya", + "Maldives", + "Morocco", + "Namibia", + "Nigeria", + "Pakistan", + "Paraguay", + "Russian Federation", + "Saudi Arabia", + "Sierra Leone", + "South Africa", + "United Arab Emirates", + "Venezuela, Bolivarian Republic of", + "Viet Nam" + ], + "rightsdocs_res_45_5": [ + "Angola", + "Argentina", + "Bahamas", + "Bahrain", + "Bangladesh", + "Burkina Faso", + "Cameroon", + "Chile", + "Congo, the Democratic Republic of the", + "Eritrea", + "Fiji", + "India", + "Indonesia", + "Libya", + "Mauritania", + "Namibia", + "Nepal", + "Nigeria", + "Pakistan", + "Peru", + "Philippines", + "Qatar", + "Senegal", + "Somalia", + "Sudan", + "Togo", + "Venezuela, Bolivarian Republic of" + ], + "rightsdocs_res_41_23": [ + "Argentina", + "Australia", + "Austria", + "Bahamas", + "Brazil", + "Bulgaria", + "Chile", + "Croatia", + "Czechia", + "Denmark", + "Fiji", + "Hungary", + "Iceland", + "Italy", + "Japan", + "Mexico", + "Peru", + "Qatar", + "Rwanda", + "Saudi Arabia", + "Slovakia", + "Spain", + "Togo", + "Ukraine", + "United Kingdom", + "Uruguay" + ], + "rightsdocs_res_42_27": [ + "Argentina", + "Australia", + "Austria", + "Bahamas", + "Brazil", + "Bulgaria", + "Chile", + "Croatia", + "Czechia", + "Denmark", + "Fiji", + "Hungary", + "Iceland", + "Italy", + "Japan", + "Mexico", + "Peru", + "Qatar", + "Rwanda", + "Saudi Arabia", + "Slovakia", + "Somalia", + "Spain", + "Togo", + "Ukraine", + "United Kingdom", + "Uruguay" + ], + "rightsdocs_res_41_1": [ + "Argentina", + "Australia", + "Austria", + "Bahamas", + "Brazil", + "Bulgaria", + "Chile", + "Croatia", + "Czechia", + "Denmark", + "Fiji", + "Iceland", + "Italy", + "Japan", + "Mexico", + "Peru", + "Slovakia", + "Spain", + "Ukraine", + "United Kingdom", + "Uruguay" + ], + "rightsdocs_res_41_18": [ + "Argentina", + "Australia", + "Austria", + "Bahamas", + "Brazil", + "Bulgaria", + "Chile", + "Croatia", + "Cuba", + "Czechia", + "Denmark", + "Fiji", + "Iceland", + "Italy", + "Japan", + "Mexico", + "Nepal", + "Peru", + "Philippines", + "Rwanda", + "Slovakia", + "South Africa", + "Spain", + "Tunisia", + "Ukraine", + "United Kingdom", + "Uruguay" + ], + "rightsdocs_res_44_18": [ + "Afghanistan", + "Angola", + "Argentina", + "Bahamas", + "Bahrain", + "Bangladesh", + "Burkina Faso", + "Cameroon", + "Chile", + "Congo, the Democratic Republic of the", + "Eritrea", + "Fiji", + "India", + "Indonesia", + "Libya", + "Marshall Islands", + "Mauritania", + "Namibia", + "Nepal", + "Nigeria", + "Pakistan", + "Peru", + "Philippines", + "Qatar", + "Senegal", + "Somalia", + "Sudan", + "Togo", + "Uruguay", + "Venezuela, Bolivarian Republic of" + ], + "rightsdocs_res_3316": [ + "Algeria", + "Argentina", + "Benin", + "Botswana", + "Brazil", + "Burkina Faso", + "Chile", + "China", + "Congo", + "Costa Rica", + "Côte d'Ivoire", + "Cuba", + "Ethiopia", + "Gabon", + "India", + "Indonesia", + "Kazakhstan", + "Kenya", + "Kuwait", + "Maldives", + "Mexico", + "Morocco", + "Namibia", + "Pakistan", + "Peru", + "Philippines", + "Russian Federation", + "Saudi Arabia", + "Sierra Leone", + "South Africa", + "United Arab Emirates", + "Venezuela, Bolivarian Republic of", + "Viet Nam" + ], + "rightsdocs_res_862": [ + "Algeria", + "Argentina", + "Bangladesh", + "Bolivia, Plurinational State of", + "Botswana", + "Brazil", + "China", + "Congo", + "Côte d'Ivoire", + "Cuba", + "El Salvador", + "Ethiopia", + "Gabon", + "Ghana", + "India", + "Indonesia", + "Kazakhstan", + "Kenya", + "Maldives", + "Mexico", + "Morocco", + "Namibia", + "Nigeria", + "Pakistan", + "Paraguay", + "Qatar", + "Russian Federation", + "Saudi Arabia", + "Sierra Leone", + "South Africa", + "United Arab Emirates", + "Venezuela, Bolivarian Republic of", + "Viet Nam" + ], + "rightsdocs_res_42_2": [ + "Argentina", + "Australia", + "Austria", + "Bahamas", + "Brazil", + "Bulgaria", + "Chile", + "Croatia", + "Czechia", + "Denmark", + "Fiji", + "Hungary", + "Iceland", + "Italy", + "Mexico", + "Peru", + "Qatar", + "Slovakia", + "South Africa", + "Spain", + "United Kingdom", + "Uruguay" + ], + "rightsdocs_res_3576": [ + "Argentina", + "Austria", + "Benin", + "Botswana", + "Brazil", + "Burkina Faso", + "Chile", + "Costa Rica", + "Côte d'Ivoire", + "Czechia", + "Estonia", + "France", + "Gabon", + "Germany", + "Indonesia", + "Ireland", + "Italy", + "Japan", + "Korea, Republic of", + "Kuwait", + "Maldives", + "Mexico", + "Montenegro", + "Morocco", + "North Macedonia", + "Peru", + "Romania", + "Saudi Arabia", + "Sierra Leone", + "United Arab Emirates", + "United Kingdom", + "United States" + ], + "rightsdocs_res_44_21": [ + "Afghanistan", + "Argentina", + "Australia", + "Austria", + "Bahamas", + "Brazil", + "Bulgaria", + "Chile", + "Czechia", + "Denmark", + "Fiji", + "Germany", + "Italy", + "Japan", + "Korea, Republic of", + "Libya", + "Marshall Islands", + "Mexico", + "Netherlands", + "Peru", + "Poland", + "Qatar", + "Slovakia", + "Somalia", + "Spain", + "Togo", + "Ukraine", + "Uruguay" + ], + "rightsdocs_res_s27_1": [ + "Albania", + "Bangladesh", + "Belgium", + "Botswana", + "Brazil", + "Croatia", + "Egypt", + "El Salvador", + "Georgia", + "Germany", + "Ghana", + "Hungary", + "Indonesia", + "Iraq", + "Korea, Republic of", + "Kyrgyzstan", + "Latvia", + "Netherlands", + "Nigeria", + "Panama", + "Paraguay", + "Portugal", + "Qatar", + "Rwanda", + "Saudi Arabia", + "Slovenia", + "Switzerland", + "Togo", + "Tunisia", + "United Arab Emirates", + "United Kingdom", + "United States", + "Côte d'Ivoire" + ], + "rightsdocs_res_45_2": [ + "Angola", + "Burkina Faso", + "Cameroon", + "Eritrea", + "Fiji", + "Indonesia", + "Mexico", + "Namibia", + "Nepal", + "Pakistan", + "Philippines", + "Qatar", + "Sudan", + "Venezuela, Bolivarian Republic of" + ], + "rightsdocs_res_45_1": [ + "Afghanistan", + "Argentina", + "Australia", + "Austria", + "Brazil", + "Bulgaria", + "Chile", + "Czechia", + "Denmark", + "Fiji", + "Germany", + "Italy", + "Japan", + "Korea, Republic of", + "Marshall Islands", + "Mexico", + "Netherlands", + "Peru", + "Poland", + "Slovakia", + "Spain", + "Ukraine", + "Uruguay" + ], + "rightsdocs_res_s28_1": [ + "Afghanistan", + "Angola", + "Belgium", + "Brazil", + "Burundi", + "Chile", + "China", + "Congo, the Democratic Republic of the", + "Cuba", + "Ecuador", + "Egypt", + "Iraq", + "Kyrgyzstan", + "Mexico", + "Nepal", + "Nigeria", + "Pakistan", + "Peru", + "Philippines", + "Qatar", + "Saudi Arabia", + "Senegal", + "Slovenia", + "South Africa", + "Spain", + "Tunisia", + "United Arab Emirates", + "Venezuela, Bolivarian Republic of", + "Côte d'Ivoire" + ], + "rightsdocs_res_2801": [ + "Argentina", + "Austria", + "Benin", + "Botswana", + "Brazil", + "Burkina Faso", + "Chile", + "Costa Rica", + "Côte d'Ivoire", + "Czechia", + "Estonia", + "France", + "Gabon", + "Germany", + "Indonesia", + "Ireland", + "Italy", + "Japan", + "Korea, Republic of", + "Kuwait", + "Maldives", + "Mexico", + "Montenegro", + "Morocco", + "North Macedonia", + "Peru", + "Romania", + "Saudi Arabia", + "Sierra Leone", + "United Arab Emirates", + "United Kingdom", + "United States" + ], + "rightsdocs_res_41_22": [ + "Argentina", + "Australia", + "Austria", + "Brazil", + "Bulgaria", + "Chile", + "Croatia", + "Czechia", + "Denmark", + "Fiji", + "Hungary", + "Iceland", + "Italy", + "Japan", + "Mexico", + "Peru", + "Slovakia", + "Spain", + "Ukraine", + "United Kingdom" + ], + "rightsdocs_res_11983": [ + "Angola", + "Azerbaijan", + "Bangladesh", + "Bolivia, Plurinational State of", + "Brazil", + "Cameroon", + "China", + "Cuba", + "Djibouti", + "Egypt", + "Ghana", + "Guatemala", + "India", + "Indonesia", + "Jordan", + "Madagascar", + "Malaysia", + "Mali", + "Mauritius", + "Mexico", + "Nicaragua", + "Nigeria", + "Pakistan", + "Peru", + "Philippines", + "Qatar", + "Russian Federation", + "Saudi Arabia", + "Senegal", + "South Africa", + "Sri Lanka", + "Zambia" + ], + "rightsdocs_res_2880": [ + "Austria", + "Benin", + "Botswana", + "Chile", + "Costa Rica", + "Côte d'Ivoire", + "Czechia", + "Estonia", + "France", + "Germany", + "Ireland", + "Italy", + "Japan", + "Korea, Republic of", + "Maldives", + "Mexico", + "Montenegro", + "North Macedonia", + "Philippines", + "Romania", + "Sierra Leone", + "United Kingdom", + "United States" + ], + "rightsdocs_res_44_23": [ + "Afghanistan", + "Angola", + "Argentina", + "Armenia", + "Australia", + "Austria", + "Bahamas", + "Bahrain", + "Bangladesh", + "Brazil", + "Bulgaria", + "Burkina Faso", + "Chile", + "Congo, the Democratic Republic of the", + "Czechia", + "Denmark", + "Fiji", + "Germany", + "India", + "Indonesia", + "Italy", + "Japan", + "Korea, Republic of", + "Libya", + "Marshall Islands", + "Mexico", + "Nepal", + "Netherlands", + "Nigeria", + "Peru", + "Philippines", + "Poland", + "Qatar", + "Senegal", + "Slovakia", + "Somalia", + "Spain", + "Sudan", + "Togo", + "Ukraine", + "Uruguay" + ], + "rightsdocs_res_41_4": [ + "Afghanistan", + "Angola", + "Bahamas", + "Bahrain", + "Bangladesh", + "Brazil", + "Burkina Faso", + "Cameroon", + "Chile", + "China", + "Congo, the Democratic Republic of the", + "Cuba", + "Egypt", + "Eritrea", + "Fiji", + "India", + "Iraq", + "Mexico", + "Nepal", + "Nigeria", + "Pakistan", + "Peru", + "Philippines", + "Qatar", + "Rwanda", + "Saudi Arabia", + "Senegal", + "Somalia", + "South Africa", + "Togo", + "Tunisia", + "Uruguay" + ], + "rightsdocs_res_45_4": [ + "Angola", + "Argentina", + "Bahamas", + "Bahrain", + "Bangladesh", + "Burkina Faso", + "Cameroon", + "Eritrea", + "Fiji", + "India", + "Indonesia", + "Mauritania", + "Namibia", + "Nepal", + "Nigeria", + "Pakistan", + "Philippines", + "Qatar", + "Senegal", + "Sudan", + "Togo", + "Venezuela, Bolivarian Republic of" + ], + "rightsdocs_res_3531": [ + "Algeria", + "Argentina", + "Austria", + "Benin", + "Botswana", + "Brazil", + "Burkina Faso", + "Chile", + "China", + "Congo", + "Costa Rica", + "Côte d'Ivoire", + "Cuba", + "Czechia", + "Estonia", + "Ethiopia", + "France", + "Gabon", + "Germany", + "India", + "Indonesia", + "Ireland", + "Italy", + "Japan", + "Kazakhstan", + "Kenya", + "Korea, Republic of", + "Kuwait", + "Maldives", + "Mexico", + "Montenegro", + "Morocco", + "Namibia", + "North Macedonia", + "Pakistan", + "Peru", + "Philippines", + "Romania", + "Russian Federation", + "Saudi Arabia", + "Sierra Leone", + "South Africa", + "United Arab Emirates", + "United Kingdom", + "Venezuela, Bolivarian Republic of", + "Viet Nam" + ], + "rightsdocs_res_2832": [ + "Argentina", + "Austria", + "Benin", + "Botswana", + "Brazil", + "Chile", + "Costa Rica", + "Czechia", + "Estonia", + "France", + "Gabon", + "Germany", + "Ireland", + "Italy", + "Japan", + "Korea, Republic of", + "Maldives", + "Montenegro", + "North Macedonia", + "Peru", + "Romania", + "Sierra Leone", + "United Kingdom", + "United States" + ], + "rightsdocs_res_3594": [ + "Argentina", + "Austria", + "Benin", + "Botswana", + "Brazil", + "Chile", + "Costa Rica", + "Côte d'Ivoire", + "Czechia", + "Estonia", + "France", + "Germany", + "Ireland", + "Italy", + "Korea, Republic of", + "Mexico", + "Montenegro", + "North Macedonia", + "Peru", + "Romania", + "Sierra Leone", + "United Kingdom", + "United States" + ], + "rightsdocs_res_42_25": [ + "Argentina", + "Australia", + "Austria", + "Bahamas", + "Brazil", + "Bulgaria", + "Chile", + "Croatia", + "Czechia", + "Denmark", + "Hungary", + "Iceland", + "Italy", + "Japan", + "Peru", + "Slovakia", + "Spain", + "Ukraine", + "United Kingdom" + ], + "rightsdocs_res_877": [ + "Albania", + "Algeria", + "Argentina", + "Bolivia, Plurinational State of", + "Brazil", + "Congo", + "Côte d'Ivoire", + "El Salvador", + "Estonia", + "France", + "Gabon", + "Germany", + "Ireland", + "Kazakhstan", + "Latvia", + "Mexico", + "Montenegro", + "Namibia", + "Netherlands", + "North Macedonia", + "Paraguay", + "Portugal", + "Sierra Leone", + "South Africa", + "United Kingdom", + "Venezuela, Bolivarian Republic of" + ], + "rightsdocs_res_41_19": [ + "Afghanistan", + "Angola", + "Argentina", + "Bahamas", + "Bahrain", + "Bangladesh", + "Brazil", + "Burkina Faso", + "Cameroon", + "Chile", + "China", + "Congo, the Democratic Republic of the", + "Cuba", + "Egypt", + "Eritrea", + "Fiji", + "India", + "Iraq", + "Mexico", + "Nepal", + "Nigeria", + "Pakistan", + "Peru", + "Philippines", + "Qatar", + "Rwanda", + "Saudi Arabia", + "Senegal", + "Somalia", + "South Africa", + "Togo", + "Tunisia", + "Uruguay" + ], + "rightsdocs_res_42_1": [ + "Afghanistan", + "Angola", + "Argentina", + "Bahamas", + "Bahrain", + "Bangladesh", + "Burkina Faso", + "Cameroon", + "China", + "Congo, the Democratic Republic of the", + "Cuba", + "Egypt", + "Eritrea", + "Fiji", + "India", + "Iraq", + "Mexico", + "Nepal", + "Nigeria", + "Pakistan", + "Peru", + "Philippines", + "Qatar", + "Rwanda", + "Saudi Arabia", + "Senegal", + "South Africa", + "Togo", + "Tunisia", + "Uruguay" + ], + "rightsdocs_res_42_8": [ + "Angola", + "Bahamas", + "Bahrain", + "Bangladesh", + "Burkina Faso", + "Cameroon", + "China", + "Cuba", + "Egypt", + "Eritrea", + "Fiji", + "India", + "Iraq", + "Nepal", + "Nigeria", + "Pakistan", + "Philippines", + "Qatar", + "Rwanda", + "Saudi Arabia", + "Senegal", + "Somalia", + "South Africa", + "Tunisia", + "Uruguay" + ], + "rightsdocs_res_41_5": [ + "Afghanistan", + "Angola", + "Argentina", + "Bahamas", + "Bahrain", + "Bangladesh", + "Brazil", + "Burkina Faso", + "Cameroon", + "Chile", + "China", + "Congo, the Democratic Republic of the", + "Cuba", + "Egypt", + "Eritrea", + "Fiji", + "India", + "Iraq", + "Nepal", + "Nigeria", + "Pakistan", + "Peru", + "Philippines", + "Qatar", + "Rwanda", + "Saudi Arabia", + "Senegal", + "Somalia", + "South Africa", + "Togo", + "Tunisia", + "Uruguay" + ], + "rightsdocs_res_3564": [ + "Argentina", + "Austria", + "Botswana", + "Brazil", + "Chile", + "Costa Rica", + "Czechia", + "Estonia", + "France", + "Germany", + "Ireland", + "Italy", + "Japan", + "Korea, Republic of", + "Mexico", + "Montenegro", + "North Macedonia", + "Peru", + "Romania", + "United Kingdom", + "United States" + ], + "rightsdocs_res_42_4": [ + "Angola", + "Burkina Faso", + "Cameroon", + "China", + "Cuba", + "Egypt", + "Eritrea", + "Fiji", + "Iraq", + "Mexico", + "Nepal", + "Pakistan", + "Philippines", + "Qatar", + "Rwanda", + "South Africa", + "Tunisia", + "Uruguay" + ], + "1674746308503ltc72zcky3h": [], + "1674746294557s160p33rh8": [], + "1674746318601lt0zdzi7pwa": [], + "1674746323194yyzc4xfggfe": [], + "1674746342744g76s6nsbry": [], + "1674746343722fsshjp2gk8": [], + "1674746344261gwkjxmf7fm": [], + "1674746346307a5cgoj04hih": [], + "1674746348446iheahcd7txb": [], + "1674746350495e9n7id02hoe": [], + "1674746352071xwxjj1xv5z": [], + "1674746369407v35wueggc9": [], + "1674746374288u0pkrwh5mlh": [], + "1674746375983vqkmazyw5og": [], + "1674746398979mk72w75e329": [], + "1674746401722v5o7rpafzg": [], + "1674746412094vqvlhudtcqn": [], + "1674746428538v6u7xod401": [], + "1674746434707xeqs2we68h": [], + "1674746436573rzfve4pxg9": [], + "1674746438321eqijkzrm926": [], + "1674746439714ou4db4kqr59": [], + "1674746443261dphvbzivd7": [], + "1674746445584u8d26vxzrnh": [], + "1674746445906f5gxwtp911i": [], + "1674746446474kcn00ozwueo": [], + "1674746447876v8k2pxwrmmj": [], + "1674746450022td57wxgq74r": [], + "1674746452401u9jcq44r72l": [], + "1695894673681wog9xhfnbt": [], + "1695894704063p5fhued7tx": [], + "1695894738922hf107ngvqmp": [], + "1695894840228iecej4r3y7c": [], + "1695894864589tmcs9j07ji": [], + "1695894871414puw6hhmnoc9": [], + "1695894871656zm5p65d7ql8": [], + "1695894872356z8nj8tv50o": [], + "1695894872572fvw01r626g": [], + "1695894874549jcifsx56l4": [], + "1695895054538xqnwud856n": [], + "1695895058364cfjzcsvf3a4": [], + "1695895058564lhq9sjspm9": [], + "1695895064080v2fbywh9l29": [], + "1695895073106k7a2610zpsq": [], + "1695895109776gzfd9kmwfs": [], + "1695895113668u4qwn3694im": [] +} \ No newline at end of file diff --git a/src/extractors/pdf_to_multi_option_extractor/labeled_data/countries_in_favor_empty_labels/options.json b/src/extractors/pdf_to_multi_option_extractor/labeled_data/countries_in_favor_empty_labels/options.json new file mode 100644 index 0000000..1331857 --- /dev/null +++ b/src/extractors/pdf_to_multi_option_extractor/labeled_data/countries_in_favor_empty_labels/options.json @@ -0,0 +1,197 @@ +[ + "Kyrgyzstan", + "Saint Lucia", + "Korea, Democratic People's Republic of", + "Qatar", + "Italy", + "Slovakia", + "South Sudan", + "Eswatini", + "China", + "Togo", + "Lithuania", + "Ukraine", + "Germany", + "Oman", + "Malta", + "Mexico", + "Chad", + "Saint Kitts and Nevis", + "Barbados", + "Montenegro", + "Palestine, State of", + "Armenia", + "Haiti", + "Luxembourg", + "Vanuatu", + "Venezuela, Bolivarian Republic of", + "Colombia", + "Azerbaijan", + "Mauritania", + "Spain", + "Turkey", + "Sao Tome and Principe", + "Zambia", + "Myanmar", + "Czechia", + "Lao People's Democratic Republic", + "Bhutan", + "Ecuador", + "Slovenia", + "Jordan", + "Belgium", + "Hungary", + "Bulgaria", + "Tonga", + "Turkmenistan", + "Israel", + "Bahamas", + "Guinea-Bissau", + "Mali", + "Maldives", + "Marshall Islands", + "Indonesia", + "Morocco", + "Saint Vincent and the Grenadines", + "San Marino", + "Solomon Islands", + "Uruguay", + "Cameroon", + "Denmark", + "Liberia", + "Tuvalu", + "Portugal", + "Niger", + "Guyana", + "Congo", + "Nepal", + "Peru", + "Japan", + "Cuba", + "Algeria", + "Finland", + "Russian Federation", + "Sweden", + "Burundi", + "Thailand", + "Grenada", + "Belarus", + "Kiribati", + "Andorra", + "Monaco", + "Samoa", + "Papua New Guinea", + "Namibia", + "Bolivia, Plurinational State of", + "Saudi Arabia", + "Antigua and Barbuda", + "Norway", + "Timor-Leste", + "Comoros", + "Korea, Republic of", + "Kenya", + "Netherlands", + "Bahrain", + "Panama", + "Uzbekistan", + "Benin", + "Trinidad and Tobago", + "North Macedonia", + "Sudan", + "Cambodia", + "Canada", + "Chile", + "Nauru", + "United Arab Emirates", + "Guatemala", + "Liechtenstein", + "Sri Lanka", + "Congo, the Democratic Republic of the", + "Botswana", + "Equatorial Guinea", + "Guinea", + "Viet Nam", + "Holy See (Vatican City State)", + "Mauritius", + "Angola", + "Libya", + "United Kingdom", + "Central African Republic", + "Somalia", + "India", + "Cape Verde", + "New Zealand", + "Uganda", + "Nicaragua", + "Palau", + "Fiji", + "Poland", + "Romania", + "Lebanon", + "Philippines", + "Rwanda", + "Afghanistan", + "Zimbabwe", + "Austria", + "Brunei Darussalam", + "Eritrea", + "Mozambique", + "El Salvador", + "Burkina Faso", + "Bangladesh", + "Iran, Islamic Republic of", + "Latvia", + "Iceland", + "Kazakhstan", + "Micronesia, Federated States of", + "Cyprus", + "Ghana", + "Sierra Leone", + "Iraq", + "Suriname", + "Malawi", + "Georgia", + "Lesotho", + "Mongolia", + "Nigeria", + "Tanzania, United Republic of", + "Paraguay", + "Jamaica", + "Moldova, Republic of", + "Malaysia", + "Tajikistan", + "Costa Rica", + "Syrian Arab Republic", + "Switzerland", + "Singapore", + "Argentina", + "Estonia", + "Dominican Republic", + "Gabon", + "Kuwait", + "Australia", + "Egypt", + "Tunisia", + "Albania", + "South Africa", + "Greece", + "Senegal", + "Ethiopia", + "Honduras", + "Bosnia and Herzegovina", + "Pakistan", + "Belize", + "Seychelles", + "Yemen", + "Brazil", + "Gambia", + "Côte d'Ivoire", + "Croatia", + "Ireland", + "France", + "Serbia", + "Madagascar", + "Djibouti", + "United States", + "Dominica" +] diff --git a/src/extractors/pdf_to_multi_option_extractor/multi_option_extraction_methods/FastSegmentSelectorFuzzy95.py b/src/extractors/pdf_to_multi_option_extractor/multi_option_extraction_methods/FastSegmentSelectorFuzzy95.py index e921682..2a92752 100644 --- a/src/extractors/pdf_to_multi_option_extractor/multi_option_extraction_methods/FastSegmentSelectorFuzzy95.py +++ b/src/extractors/pdf_to_multi_option_extractor/multi_option_extraction_methods/FastSegmentSelectorFuzzy95.py @@ -17,7 +17,6 @@ class FastSegmentSelectorFuzzy95(PdfMultiOptionMethod): - REPORT_ERRORS = False threshold = 85 text_types = [TokenType.TEXT, TokenType.LIST_ITEM, TokenType.TITLE, TokenType.SECTION_HEADER, TokenType.CAPTION] diff --git a/src/extractors/pdf_to_multi_option_extractor/multi_option_extraction_methods/FastSegmentSelectorFuzzyCommas.py b/src/extractors/pdf_to_multi_option_extractor/multi_option_extraction_methods/FastSegmentSelectorFuzzyCommas.py index 5db80b5..b388add 100644 --- a/src/extractors/pdf_to_multi_option_extractor/multi_option_extraction_methods/FastSegmentSelectorFuzzyCommas.py +++ b/src/extractors/pdf_to_multi_option_extractor/multi_option_extraction_methods/FastSegmentSelectorFuzzyCommas.py @@ -7,14 +7,12 @@ class FastSegmentSelectorFuzzyCommas(FastSegmentSelectorFuzzy95): - REPORT_ERRORS = False + def train(self, multi_option_data: ExtractionData): + self.set_parameters(multi_option_data) + super().train(multi_option_data) + FuzzyCommas().train(multi_option_data) def predict(self, multi_option_data: ExtractionData) -> list[list[Option]]: self.set_parameters(multi_option_data) self.extraction_data = self.get_prediction_data(multi_option_data) return FuzzyCommas().predict(self.extraction_data) - - def train(self, multi_option_data: ExtractionData): - self.set_parameters(multi_option_data) - super().train(multi_option_data) - FuzzyCommas().train(multi_option_data) diff --git a/src/extractors/pdf_to_multi_option_extractor/multi_option_extraction_methods/FuzzyCommas.py b/src/extractors/pdf_to_multi_option_extractor/multi_option_extraction_methods/FuzzyCommas.py index 759e0bc..18fc676 100644 --- a/src/extractors/pdf_to_multi_option_extractor/multi_option_extraction_methods/FuzzyCommas.py +++ b/src/extractors/pdf_to_multi_option_extractor/multi_option_extraction_methods/FuzzyCommas.py @@ -34,7 +34,6 @@ def get_appearances_for_segments( text = pdf_segment.text_content texts_separated_by_comma = self.clean_texts(re.split(",|:| and ", text), False) for one_piece_text in texts_separated_by_comma: - appearance = self.get_appearances_one_segment(one_piece_text, aliases) if appearance: @@ -46,37 +45,37 @@ def get_appearances_for_segments( return appearances, not_found_texts def get_appearances_one_segment(self, text: str, aliases: dict[str, list[str]]) -> str: - for option in self.options_cleaned_words_sorted_by_length: - if len(text) < len(option) * 0.92 or len(text) > len(option) * 1.2: + for option_cleaned in self.options_cleaned_words_sorted_by_length: + if len(text) < len(option_cleaned) * 0.92 or len(text) > len(option_cleaned) * 1.2: continue - if fuzz.partial_ratio(option, self.clean_text(text, True)) >= self.threshold: - return self.options_cleaned[self.options_cleaned_words_sorted.index(option)] + if fuzz.partial_ratio(option_cleaned, self.clean_text(text, True)) >= self.threshold: + return self.options_cleaned[self.options_cleaned_words_sorted.index(option_cleaned)] - for option in self.options_cleaned_by_length: - if not aliases or option not in aliases: + for option_cleaned in self.options_cleaned_by_length: + if not aliases or option_cleaned not in aliases: continue - for alias in aliases[option]: + for alias in aliases[option_cleaned]: if rapidfuzz.fuzz.ratio(alias, text) > self.threshold: - return option + return option_cleaned return "" @staticmethod - def clean_text(text: str, sort: bool) -> str: + def clean_text(text: str, sort_words: bool) -> str: text = text.lower() text = "".join([letter for letter in text if letter.isalnum() or letter == " "]) - if sort: + if sort_words: text = " ".join(sorted(text.split())) else: text = " ".join(text.split()) return text - def clean_texts(self, texts: list[str], sort: bool) -> list[str]: - return list([self.clean_text(option, sort) for option in texts]) + def clean_texts(self, texts: list[str], sort_words: bool) -> list[str]: + return list([self.clean_text(option, sort_words) for option in texts]) def predict(self, multi_option_data: ExtractionData) -> list[list[Option]]: self.set_parameters(multi_option_data) @@ -84,6 +83,9 @@ def predict(self, multi_option_data: ExtractionData) -> list[list[Option]]: try: aliases = json.loads(self.get_aliases_path().read_text()) + if not aliases or not isinstance(aliases, dict): + raise FileNotFoundError + except FileNotFoundError: aliases = dict() @@ -138,7 +140,7 @@ def find_aliases(not_found_options: list[str], not_found_texts: list[str]) -> di return aliases def set_options_variants(self): - self.options_cleaned = self.clean_texts(texts=[x.label for x in self.options], sort=False) + self.options_cleaned = self.clean_texts(texts=[x.label for x in self.options], sort_words=False) self.options_cleaned_by_length = sorted(self.options_cleaned, key=lambda x: -len(x)) - self.options_cleaned_words_sorted = self.clean_texts(texts=[x.label for x in self.options], sort=True) + self.options_cleaned_words_sorted = self.clean_texts(texts=[x.label for x in self.options], sort_words=True) self.options_cleaned_words_sorted_by_length = sorted(self.options_cleaned_words_sorted, key=lambda x: -len(x)) diff --git a/src/extractors/pdf_to_multi_option_extractor/multi_option_extraction_methods/PreviousWordsSentenceSelectorFuzzyCommas.py b/src/extractors/pdf_to_multi_option_extractor/multi_option_extraction_methods/PreviousWordsSentenceSelectorFuzzyCommas.py index 6b863a1..1e0fb99 100644 --- a/src/extractors/pdf_to_multi_option_extractor/multi_option_extraction_methods/PreviousWordsSentenceSelectorFuzzyCommas.py +++ b/src/extractors/pdf_to_multi_option_extractor/multi_option_extraction_methods/PreviousWordsSentenceSelectorFuzzyCommas.py @@ -12,9 +12,6 @@ class PreviousWordsSentenceSelectorFuzzyCommas(SentenceSelectorFuzzyCommas): - - REPORT_ERRORS = False - def train(self, multi_option_data: ExtractionData): self.set_parameters(multi_option_data) extraction_data_by_sentences = self.get_extraction_data_by_sentence(multi_option_data) diff --git a/src/extractors/pdf_to_multi_option_extractor/multi_option_extraction_methods/SentenceSelectorFuzzyCommas.py b/src/extractors/pdf_to_multi_option_extractor/multi_option_extraction_methods/SentenceSelectorFuzzyCommas.py index 76e51e1..895fb8d 100644 --- a/src/extractors/pdf_to_multi_option_extractor/multi_option_extraction_methods/SentenceSelectorFuzzyCommas.py +++ b/src/extractors/pdf_to_multi_option_extractor/multi_option_extraction_methods/SentenceSelectorFuzzyCommas.py @@ -14,9 +14,6 @@ class SentenceSelectorFuzzyCommas(FastSegmentSelectorFuzzyCommas): - - REPORT_ERRORS = False - def train(self, multi_option_data: ExtractionData): self.set_parameters(multi_option_data) extraction_data_by_sentences = self.get_extraction_data_by_sentence(multi_option_data) diff --git a/src/extractors/pdf_to_text_extractor/methods/PdfToTextRegexMethod.py b/src/extractors/pdf_to_text_extractor/methods/PdfToTextRegexMethod.py index 13672b1..4abd9ae 100644 --- a/src/extractors/pdf_to_text_extractor/methods/PdfToTextRegexMethod.py +++ b/src/extractors/pdf_to_text_extractor/methods/PdfToTextRegexMethod.py @@ -9,7 +9,9 @@ class PdfToTextRegexMethod(ToTextExtractorMethod): def train(self, extraction_data: ExtractionData): - regex_list = rexpy.extract([x.labeled_data.label_text for x in extraction_data.samples]) + samples = [x.labeled_data.label_text for x in extraction_data.samples] + samples = [sample for sample in samples if sample] + regex_list = rexpy.extract(samples) regex_list = [regex[1:-1] for regex in regex_list] self.save_json("regex_list.json", regex_list) diff --git a/src/extractors/segment_selector/SegmentSelectorResults.py b/src/extractors/segment_selector/SegmentSelectorResults.py index c380364..507f4ac 100644 --- a/src/extractors/segment_selector/SegmentSelectorResults.py +++ b/src/extractors/segment_selector/SegmentSelectorResults.py @@ -4,10 +4,18 @@ class SegmentSelectorResults(BaseModel): method: str dataset: str + samples: int precision: float recall: float seconds: int @staticmethod def get_padding(): - return {"method": "right", "dataset": "right", "precision": "left", "recall": "left", "seconds": "left"} + return { + "method": "right", + "dataset": "right", + "samples": "left", + "precision": "left", + "recall": "left", + "seconds": "left", + } diff --git a/src/extractors/segment_selector/get_data_for_performance.py b/src/extractors/segment_selector/get_data_for_performance.py index 49d6d59..73b0985 100644 --- a/src/extractors/segment_selector/get_data_for_performance.py +++ b/src/extractors/segment_selector/get_data_for_performance.py @@ -19,6 +19,7 @@ "plan_many_date", "plan_many_title", "president", + "president_empty_labels", "rightdocs_titles", "secretary", "semantic_president", diff --git a/src/extractors/text_to_text_extractor/methods/RegexMethod.py b/src/extractors/text_to_text_extractor/methods/RegexMethod.py index 95eafc8..0522cac 100644 --- a/src/extractors/text_to_text_extractor/methods/RegexMethod.py +++ b/src/extractors/text_to_text_extractor/methods/RegexMethod.py @@ -11,7 +11,9 @@ class RegexMethod(ToTextExtractorMethod): def train(self, extraction_data: ExtractionData): - regex_list = rexpy.extract([x.labeled_data.label_text for x in extraction_data.samples]) + samples = [x.labeled_data.label_text for x in extraction_data.samples] + samples = [sample for sample in samples if sample] + regex_list = rexpy.extract(samples) regex_list = [regex[1:-1] for regex in regex_list] self.save_json("regex_list.json", regex_list) diff --git a/src/extractors/text_to_text_extractor/methods/test/test_regex_method.py b/src/extractors/text_to_text_extractor/methods/test/test_regex_method.py index 27913e0..5d20170 100644 --- a/src/extractors/text_to_text_extractor/methods/test/test_regex_method.py +++ b/src/extractors/text_to_text_extractor/methods/test/test_regex_method.py @@ -77,6 +77,24 @@ def test_predict_void(self): self.assertEqual(1, len(predictions)) self.assertEqual("", predictions[0]) + def test_regex_when_empty_labels(self): + sample_1 = [TrainingSample(labeled_data=LabeledData(label_text="123", language_iso="en"), tags_texts=[""])] + sample_2 = [TrainingSample(labeled_data=LabeledData(label_text="321", language_iso="en"), tags_texts=[""])] + empty_labels = [TrainingSample(labeled_data=LabeledData(label_text="", language_iso="en"), tags_texts=[""])] * 100 + + extraction_data = ExtractionData( + samples=sample_1 + empty_labels + sample_2, extraction_identifier=extraction_identifier + ) + + regex_method = RegexMethod(extraction_identifier) + + regex_method.train(extraction_data) + + texts = ["foo 555 var"] + predictions = regex_method.predict([PredictionSample.from_texts(texts)]) + self.assertEqual(1, len(predictions)) + self.assertEqual("555", predictions[0]) + def test_retrain(self): sample = [TrainingSample(labeled_data=LabeledData(label_text="1", language_iso="en"), tags_texts=[""])] extraction_data = ExtractionData(samples=sample, extraction_identifier=extraction_identifier) diff --git a/src/performance_pdf_to_multi_option_report.py b/src/performance_pdf_to_multi_option_report.py index dacc6e2..0af858b 100644 --- a/src/performance_pdf_to_multi_option_report.py +++ b/src/performance_pdf_to_multi_option_report.py @@ -41,6 +41,7 @@ "d4la_document_type": (44.07, "CleanBeginningDotDigits500_SingleLabelSetFit"), "cejil_secretary": (80.0, "FuzzyAll75"), "countries_in_favor": (96.89, "PreviousWordsSentenceSelectorFuzzyCommas"), + "countries_in_favor_empty_labels": (96.89, "PreviousWordsSentenceSelectorFuzzyCommas"), "cejil_judge": (92.86, "FuzzyLast"), } @@ -189,7 +190,7 @@ def get_predictions(dataset: ExtractionData) -> (list[list[int]], list[list[int] def get_mistakes() -> dict[str, (float, str)]: f1s_method_name = dict() - for dataset in get_multi_option_benchmark_data(filter_by=["cejil_judge"]): + for dataset in get_multi_option_benchmark_data(filter_by=[]): truth_one_hot, prediction_one_hot, method_name, test_samples = get_predictions(dataset) correct = 0 diff --git a/src/performance_segment_selector.py b/src/performance_segment_selector.py index e39951a..090051f 100644 --- a/src/performance_segment_selector.py +++ b/src/performance_segment_selector.py @@ -30,11 +30,12 @@ def print_results(results): average_precision = round(sum(precisions) / len(precisions), 2) average_recall = round(sum(recalls) / len(recalls), 2) average_seconds = round(sum(seconds) / len(seconds)) - + samples = sum([x.samples for x in results if x.method == method.get_name()]) results.append( SegmentSelectorResults( method=method.get_name(), dataset="Average", + samples=samples, precision=average_precision, recall=average_recall, seconds=average_seconds, @@ -68,6 +69,7 @@ def get_performance_segment_selector(): selector_results = SegmentSelectorResults( method=method.get_name(), dataset=dataset, + samples=len(pdfs_data), precision=round(100 * precision_score(truth, predicted_labels), 2), recall=round(100 * recall_score(truth, predicted_labels), 2), seconds=round(time() - start), @@ -75,7 +77,7 @@ def get_performance_segment_selector(): results.append(selector_results) - print_results(results) + print_results(results) if __name__ == "__main__": diff --git a/src/send_logs.py b/src/send_logs.py index fa148cf..96b3d7d 100644 --- a/src/send_logs.py +++ b/src/send_logs.py @@ -1,10 +1,26 @@ +import traceback + from config import config_logger from data.ExtractionIdentifier import ExtractionIdentifier from data.LogsMessage import Severity -def send_logs(extraction_identifier: ExtractionIdentifier, message: str, severity: Severity = Severity.info): - if severity == Severity.error: - config_logger.error(message + " for " + extraction_identifier.model_dump_json()) - else: +def send_logs( + extraction_identifier: ExtractionIdentifier, + message: str, + severity: Severity = Severity.info, + exception: Exception = None, +): + if severity != Severity.error: config_logger.info(message + " for " + extraction_identifier.model_dump_json()) + return + + try: + stacktrace_message = "\n".join(traceback.format_exception(type(exception), exception, exception.__traceback__)) + error_message = message + error_message += f"\nException type: {type(exception).__name__}" + error_message += f"\nException: {exception}" + error_message += f"\nStackTrace: {stacktrace_message}" + config_logger.error(error_message + " for " + extraction_identifier.model_dump_json()) + except: + config_logger.error(message + " for " + extraction_identifier.model_dump_json())