diff --git a/__init__.py b/__init__.py
index b15dd86..7fd5631 100644
--- a/__init__.py
+++ b/__init__.py
@@ -1,8 +1,6 @@
-#!/usr/bin/env python3
-
 from calibre.customize import InterfaceActionBase
 
-VERSION = (3, 31, 1)
+VERSION = (3, 31, 2)
 
 
 class WordDumbDumb(InterfaceActionBase):
diff --git a/__main__.py b/__main__.py
index c566a35..b739240 100644
--- a/__main__.py
+++ b/__main__.py
@@ -1,5 +1,3 @@
-#!/usr/bin/env python3
-
 """
 Create X-Ray file on macOS: run this script in subprocess to bypass
 the ludicrous library validation
diff --git a/config.py b/config.py
index d9c5b46..b377ece 100644
--- a/config.py
+++ b/config.py
@@ -1,5 +1,3 @@
-#!/usr/bin/env python3
-
 import json
 import webbrowser
 from functools import partial
diff --git a/custom_lemmas.py b/custom_lemmas.py
index d04a3dd..9c04ed6 100644
--- a/custom_lemmas.py
+++ b/custom_lemmas.py
@@ -1,5 +1,3 @@
-#!/usr/bin/env python3
-
 import base64
 import sqlite3
 from functools import partial
diff --git a/custom_x_ray.py b/custom_x_ray.py
index 579cc0d..e6b7138 100644
--- a/custom_x_ray.py
+++ b/custom_x_ray.py
@@ -1,5 +1,3 @@
-#!/usr/bin/env python3
-
 import json
 from typing import TYPE_CHECKING, Any
 
diff --git a/data/deps.json b/data/deps.json
index 45b8798..4e6738e 100644
--- a/data/deps.json
+++ b/data/deps.json
@@ -1,7 +1,7 @@
 {
     "cupy": "12.3.0",
     "lxml": "5.1.0",
-    "rapidfuzz": "3.6.1",
+    "rapidfuzz": "3.6.2",
     "spacy_cpu_model": "3.7.0",
     "spacy_trf_model": "3.7.2",
     "thinc-apple-ops": "0.1.4",
diff --git a/database.py b/database.py
index b293730..5d515fe 100644
--- a/database.py
+++ b/database.py
@@ -1,4 +1,3 @@
-#!/usr/bin/env python3
 import sqlite3
 from pathlib import Path
 from typing import Iterator
diff --git a/deps.py b/deps.py
index dc24d94..3027b26 100644
--- a/deps.py
+++ b/deps.py
@@ -1,5 +1,3 @@
-#!/usr/bin/env python3
-
 import bz2
 import platform
 import shutil
diff --git a/epub.py b/epub.py
index 46e42d0..574d1d5 100644
--- a/epub.py
+++ b/epub.py
@@ -1,12 +1,10 @@
-#!/usr/bin/env python3
-
 import operator
 import re
 import shutil
 import sqlite3
 import zipfile
 from collections import defaultdict
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from functools import partial
 from html import escape, unescape
 from pathlib import Path
@@ -67,8 +65,17 @@ class Occurrence:
     paragraph_end: int
     word_start: int
     word_end: int
-    lemma: str = ""
-    entity_id: int = 0
+    entity_id: int = -1
+    sense_ids: tuple[int, ...] = ()
+
+
+@dataclass
+class Sense:
+    pos: str
+    short_def: str
+    full_def: str
+    example: str
+    ipas: list[str] = field(default_factory=list)
 
 
 class EPUB:
@@ -79,6 +86,7 @@ def __init__(
         wiki_commons: Wikimedia_Commons | None,
         wikidata: Wikidata | None,
         custom_x_ray: CustomX,
+        lemmas_conn: sqlite3.Connection | None,
     ) -> None:
         self.book_path = Path(book_path_str)
         self.mediawiki = mediawiki
@@ -97,10 +105,13 @@ def __init__(
         self.image_href_has_folder = False
         self.image_filenames: set[str] = set()
         self.custom_x_ray = custom_x_ray
-        self.lemmas: dict[str, int] = {}
-        self.lemma_id = 0
-        self.lemmas_conn: sqlite3.Connection | None = None
+        self.sense_id_dict: dict[tuple[int, ...], int] = {}
+        self.word_wise_id = 0
+        self.lemmas_conn: sqlite3.Connection | None = lemmas_conn
         self.prefs: Prefs = {}
+        self.lemma_lang: str = ""
+        self.gloss_lang: str = ""
+        self.gloss_source: str = ""
 
     def extract_epub(self) -> Iterator[tuple[str, tuple[int, int, Path]]]:
         from lxml import etree
@@ -221,24 +232,32 @@ def add_entity(
     def add_lemma(
         self,
         lemma: str,
+        pos: str,
         paragraph_start: int,
         paragraph_end: int,
         word_start: int,
         word_end: int,
         xhtml_path: Path,
     ) -> None:
+        sense_ids = self.find_sense_ids(lemma, pos)
+        if len(sense_ids) == 0:
+            return
+        if sense_ids in self.sense_id_dict:
+            ww_id = self.sense_id_dict[sense_ids]
+        else:
+            ww_id = self.word_wise_id
+            self.word_wise_id += 1
+            self.sense_id_dict[sense_ids] = ww_id
+
         self.entity_occurrences[xhtml_path].append(
             Occurrence(
                 paragraph_start=paragraph_start,
                 paragraph_end=paragraph_end,
                 word_start=word_start,
                 word_end=word_end,
-                lemma=lemma,
+                sense_ids=sense_ids,
             )
         )
-        if lemma not in self.lemmas:
-            self.lemmas[lemma] = self.lemma_id
-            self.lemma_id += 1
 
     def remove_entities(self, minimal_count: int) -> None:
         for entity, data in self.entities.copy().items():
@@ -252,26 +271,22 @@ def remove_entities(self, minimal_count: int) -> None:
                 self.removed_entity_ids.add(data["id"])
 
     def modify_epub(
-        self,
-        prefs: Prefs,
-        lemma_lang: str,
-        gloss_lang: str,
-        lemmas_conn: sqlite3.Connection | None,
-        has_multiple_ipas: bool,
+        self, prefs: Prefs, lemma_lang: str, gloss_lang: str, gloss_source: str
     ) -> None:
-        self.lemmas_conn = lemmas_conn
         self.prefs = prefs
-        self.has_multiple_ipas = has_multiple_ipas
+        self.lemma_lang = lemma_lang
+        self.gloss_lang = gloss_lang
+        self.gloss_source = gloss_source
         if self.entities:
             query_mediawiki(self.entities, self.mediawiki, prefs["search_people"])
         if self.wikidata:
             query_wikidata(self.entities, self.mediawiki, self.wikidata)
         if prefs["minimal_x_ray_count"] > 1:
             self.remove_entities(prefs["minimal_x_ray_count"])
-        self.create_x_ray_footnotes(prefs, lemma_lang)
-        self.insert_anchor_elements(lemma_lang)
-        if self.lemmas:
-            self.create_word_wise_footnotes(lemma_lang, gloss_lang)
+        self.create_x_ray_footnotes()
+        self.insert_anchor_elements()
+        if len(self.sense_id_dict) > 0:
+            self.create_word_wise_footnotes()
         self.modify_opf()
         self.zip_extract_folder()
         if self.mediawiki is not None:
@@ -280,12 +295,12 @@ def modify_epub(
             self.mediawiki.close()
         if self.wikidata is not None:
             self.wikidata.close()
         if self.wiki_commons is not None:
             self.wiki_commons.close()
-        if lemmas_conn is not None:
-            lemmas_conn.close()
+        if self.lemmas_conn is not None:
+            self.lemmas_conn.close()
 
-    def insert_anchor_elements(self, lemma_lang: str) -> None:
+    def insert_anchor_elements(self) -> None:
         css_rules = ""
-        if len(self.lemmas) > 0:
+        if len(self.sense_id_dict) > 0:
             css_rules += """
             body {line-height: 2.5;}
             ruby.wordwise {text-decoration: overline;}
@@ -300,7 +315,7 @@ def insert_anchor_elements(self, lemma_lang: str) -> None:
             """
 
         for xhtml_path, occurrences in self.entity_occurrences.items():
-            if self.entities and self.lemmas:
+            if len(self.entities) > 0 and self.lemmas_conn is not None:
                 occurrences = sorted(
                     occurrences,
                     key=operator.attrgetter("paragraph_start", "word_start"),
@@ -326,14 +341,14 @@ def insert_anchor_elements(self, lemma_lang: str) -> None:
                         paragraph_text[last_w_end : occurrence.word_start]
                     )
                     word = paragraph_text[occurrence.word_start : occurrence.word_end]
-                    if occurrence.lemma == "":
+                    if occurrence.entity_id != -1:
                         new_xhtml_str += (
                             f'<a epub:type="noteref" href="x_ray.xhtml#'
                             f'{occurrence.entity_id}">{escape(word)}</a>'
                         )
                     else:
                         new_xhtml_str += self.build_word_wise_tag(
-                            occurrence.lemma, word, lemma_lang
+                            occurrence.sense_ids, word
                         )
                     last_w_end = occurrence.word_end
                     if occurrence.paragraph_end != last_p_end:
@@ -355,39 +370,33 @@ def insert_anchor_elements(self, lemma_lang: str) -> None:
             )
             f.write(new_xhtml_str)
 
-    def build_word_wise_tag(self, lemma: str, word: str, lemma_lang: str) -> str:
-        if lemma not in self.lemmas:
-            return word
-        data = self.get_lemma_gloss(lemma, lemma_lang)
-        if not data:
-            del self.lemmas[lemma]
-            return word
-        short_def = data[0][0]
-        len_ratio = 3 if lemma_lang in CJK_LANGS else 2.5
-        lemma_id = self.lemmas[lemma]
+    def build_word_wise_tag(
+        self,
+        sense_ids: tuple[int, ...],
+        word: str,
+    ) -> str:
+        ww_id = self.sense_id_dict[sense_ids]
+        sense_list = self.get_sense_data(sense_ids[:1])
+        short_def = sense_list[0].short_def
+        len_ratio = 3 if self.lemma_lang in CJK_LANGS else 2.5
         if len(short_def) / len(word) > len_ratio:
             return (
                 '<a epub:type="noteref" class="wordwise" href="word_wise.xhtml#'
-                f'{lemma_id}">{escape(word)}</a>'
+                f'{ww_id}">{escape(word)}</a>'
             )
         else:
             return (
                 '<ruby class="wordwise"><a epub:type="noteref" href="word_wise.xhtml#'
-                f'{lemma_id}">{escape(word)}</a><rp>(</rp><rt>{escape(short_def)}</rt>'
+                f'{ww_id}">{escape(word)}</a><rp>(</rp><rt>{escape(short_def)}</rt>'
                 "<rp>)</rp></ruby>"
             )
 
-    def split_p_tags(self, intro: str) -> str:
-        intro = escape(intro)
-        p_tags = ""
-        for p_str in intro.splitlines():
-            p_tags += f"<p>{p_str}</p>"
-        return p_tags
-
-    def create_x_ray_footnotes(self, prefs: Prefs, lang: str) -> None:
+    def create_x_ray_footnotes(self) -> None:
         if self.mediawiki is None:  # just let mypy know it's not None
             return
-        source_name, source_link = x_ray_source(self.mediawiki.source_id, prefs, lang)
+        source_name, source_link = x_ray_source(
+            self.mediawiki.source_id, self.prefs, self.lemma_lang
+        )
         image_prefix = ""
         if self.xhtml_href_has_folder:
             image_prefix += "../"
@@ -396,7 +405,7 @@ def create_x_ray_footnotes(self, prefs: Prefs, lang: str) -> None:
         s = f"""<html xmlns="http://www.w3.org/1999/xhtml"
         xmlns:epub="http://www.idpf.org/2007/ops"
-        lang="{lang}" xml:lang="{lang}">
+        lang="{self.lemma_lang}" xml:lang="{self.lemma_lang}">
         <head><title>X-Ray</title><meta charset="utf-8"/></head>
         <body>
         """
@@ -405,11 +414,11 @@ def create_x_ray_footnotes(self, prefs: Prefs, lang: str) -> None:
                 custom_desc, custom_source_id, _ = custom_data
                 s += (
                     f'<aside id="{data["id"]}" epub:type="footnote">'
                 )
-            elif (prefs["search_people"] or data["label"] not in PERSON_LABELS) and (
-                intro_cache := self.mediawiki.get_cache(entity)
-            ):
+            elif (
+                self.prefs["search_people"] or data["label"] not in PERSON_LABELS
+            ) and (intro_cache := self.mediawiki.get_cache(entity)):
                 s += f'<aside id="{data["id"]}" epub:type="footnote">'