Skip to content

Commit

Permalink
Merge pull request #98 from huridocs/labeled-as-empty
Browse files (browse the repository at this point in the history)
Support labeled data with no value
  • Loading branch information
gabriel-piles authored Oct 18, 2024
2 parents 341aab9 + 44fbc99 commit 3525ade
Show file tree
Hide file tree
Showing 21 changed files with 2,481 additions and 53 deletions.
6 changes: 3 additions & 3 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
git+https://github.com/huridocs/pdf-document-layout-analysis@a834f712edf0fc0a4660de0270f3efbb08e0031c
git+https://github.com/huridocs/queue-processor@1875372bf9f6dcd1995a32c4e50ff92aa45f9ea8
git+https://github.com/huridocs/pdf-document-layout-analysis@6262a95d1dd09055556a8e081eacfe02dd308b89
git+https://github.com/huridocs/queue-processor@716ddf050c59035583b0852dc0b78a7860ce5c05
slugify==0.0.1
python-Levenshtein==0.25.1
tdda==2.0.9
Expand All @@ -15,7 +15,7 @@ mongomock==4.1.2
fasttext-wheel==0.9.2
rich==13.7.1
joblib==1.4.0
tqdm==4.66.2
tqdm==4.66.3
rapidfuzz==3.8.1
sentry_sdk==1.44.0
pymongo==4.6.3
Expand Down
6 changes: 4 additions & 2 deletions src/Extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,9 +186,11 @@ def get_suggestions(self) -> list[Suggestion]:
if extractor_instance.get_name() != extractor_name:
continue

message = f"Using {extractor_instance.get_name()} to calculate {len(prediction_samples)} suggestions"
suggestions = extractor_instance.get_suggestions(prediction_samples)
suggestions = [suggestion.mark_suggestion_if_empty() for suggestion in suggestions]
message = f"Using {extractor_instance.get_name()} to calculate {len(suggestions)} suggestions"
send_logs(self.extraction_identifier, message)
return extractor_instance.get_suggestions(prediction_samples)
return suggestions

send_logs(self.extraction_identifier, f"No extractor available", Severity.error)
return []
Expand Down
1 change: 1 addition & 0 deletions src/data/LabeledData.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ class LabeledData(BaseModel):
entity_name: str = ""
language_iso: str = ""
label_text: str = ""
empty_value: bool = False
values: list[Option] = list()
source_text: str = ""
page_width: float = 0
Expand Down
13 changes: 13 additions & 0 deletions src/data/Suggestion.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,24 @@ class Suggestion(BaseModel):
xml_file_name: str = ""
entity_name: str = ""
text: str = ""
empty_suggestion: bool = False
values: list[Option] = list()
segment_text: str = ""
page_number: int = 1
segments_boxes: list[SegmentBox] = list()

def is_empty(self) -> bool:
    """Report whether this suggestion carries no extracted content.

    Returns:
        True when ``empty_suggestion`` is already set, or when both ``text``
        and ``values`` are falsy; False otherwise.
    """
    # An explicit flag wins, even if text/values happen to be populated.
    if self.empty_suggestion:
        return True

    return not self.text and not self.values

def mark_suggestion_if_empty(self) -> "Suggestion":
    """Set ``empty_suggestion`` when ``is_empty()`` reports True.

    The flag is only ever switched on here, never cleared.

    Returns:
        This same instance (mutated in place), so the call can be used
        inside an expression such as a list comprehension.
    """
    if self.is_empty():
        self.empty_suggestion = True

    return self

def to_dict(self):
suggestion_dict = self.model_dump()
suggestion_dict["segments_boxes"] = [x.to_dict() for x in self.segments_boxes]
Expand Down
11 changes: 2 additions & 9 deletions src/extractors/ToTextExtractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,6 @@
from data.Suggestion import Suggestion
from extractors.ExtractorBase import ExtractorBase
from extractors.ToTextExtractorMethod import ToTextExtractorMethod
from extractors.text_to_text_extractor.methods.DateParserMethod import DateParserMethod
from extractors.text_to_text_extractor.methods.DateParserWithBreaksMethod import DateParserWithBreaksMethod
from extractors.text_to_text_extractor.methods.InputWithoutSpaces import InputWithoutSpaces
from extractors.text_to_text_extractor.methods.MT5TrueCaseEnglishSpanishMethod import MT5TrueCaseEnglishSpanishMethod
from extractors.text_to_text_extractor.methods.RegexMethod import RegexMethod
from extractors.text_to_text_extractor.methods.RegexSubtractionMethod import RegexSubtractionMethod
from extractors.text_to_text_extractor.methods.SameInputOutputMethod import SameInputOutputMethod
from send_logs import send_logs

RETRAIN_SAMPLES_THRESHOLD = 250
Expand Down Expand Up @@ -115,8 +108,8 @@ def get_best_method(self, extraction_data: ExtractionData):
try:
performance = method_instance.performance(training_set, test_set)
except Exception as e:
message = f"Error checking {method_instance.get_name()}: {e}"
send_logs(self.extraction_identifier, message, Severity.error)
message = f"Error checking {method_instance.get_name()}"
send_logs(self.extraction_identifier, message, Severity.error, e)
performance = 0
performance_log += f"{method_instance.get_name()}: {round(performance, 2)}%\n"
send_logs(self.extraction_identifier, f"Performance {method_instance.get_name()}: {performance}%")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -211,7 +211,7 @@ def get_method_performance(
performance = method.get_performance(train_set, test_set)
except Exception as e:
severity = Severity.error if method.REPORT_ERRORS else Severity.info
send_logs(self.extraction_identifier, f"Error checking {method.get_name()}: {e}", severity)
send_logs(self.extraction_identifier, f"Error checking {method.get_name()}", severity, e)
performance = 0

self.reset_extraction_data(train_set)
Expand Down
Loading

0 comments on commit 3525ade

Please sign in to comment.