diff --git a/__init__.py b/__init__.py
index b15dd86..7fd5631 100644
--- a/__init__.py
+++ b/__init__.py
@@ -1,8 +1,6 @@
-#!/usr/bin/env python3
-
from calibre.customize import InterfaceActionBase
-VERSION = (3, 31, 1)
+VERSION = (3, 31, 2)
class WordDumbDumb(InterfaceActionBase):
diff --git a/__main__.py b/__main__.py
index c566a35..b739240 100644
--- a/__main__.py
+++ b/__main__.py
@@ -1,5 +1,3 @@
-#!/usr/bin/env python3
-
"""
Create X-Ray file on macOS: run this script in subprocess to bypass
the ludicrous library validation
diff --git a/config.py b/config.py
index d9c5b46..b377ece 100644
--- a/config.py
+++ b/config.py
@@ -1,5 +1,3 @@
-#!/usr/bin/env python3
-
import json
import webbrowser
from functools import partial
diff --git a/custom_lemmas.py b/custom_lemmas.py
index d04a3dd..9c04ed6 100644
--- a/custom_lemmas.py
+++ b/custom_lemmas.py
@@ -1,5 +1,3 @@
-#!/usr/bin/env python3
-
import base64
import sqlite3
from functools import partial
diff --git a/custom_x_ray.py b/custom_x_ray.py
index 579cc0d..e6b7138 100644
--- a/custom_x_ray.py
+++ b/custom_x_ray.py
@@ -1,5 +1,3 @@
-#!/usr/bin/env python3
-
import json
from typing import TYPE_CHECKING, Any
diff --git a/data/deps.json b/data/deps.json
index 45b8798..4e6738e 100644
--- a/data/deps.json
+++ b/data/deps.json
@@ -1,7 +1,7 @@
{
"cupy": "12.3.0",
"lxml": "5.1.0",
- "rapidfuzz": "3.6.1",
+ "rapidfuzz": "3.6.2",
"spacy_cpu_model": "3.7.0",
"spacy_trf_model": "3.7.2",
"thinc-apple-ops": "0.1.4",
diff --git a/database.py b/database.py
index b293730..5d515fe 100644
--- a/database.py
+++ b/database.py
@@ -1,4 +1,3 @@
-#!/usr/bin/env python3
import sqlite3
from pathlib import Path
from typing import Iterator
diff --git a/deps.py b/deps.py
index dc24d94..3027b26 100644
--- a/deps.py
+++ b/deps.py
@@ -1,5 +1,3 @@
-#!/usr/bin/env python3
-
import bz2
import platform
import shutil
diff --git a/epub.py b/epub.py
index 46e42d0..574d1d5 100644
--- a/epub.py
+++ b/epub.py
@@ -1,12 +1,10 @@
-#!/usr/bin/env python3
-
import operator
import re
import shutil
import sqlite3
import zipfile
from collections import defaultdict
-from dataclasses import dataclass
+from dataclasses import dataclass, field
from functools import partial
from html import escape, unescape
from pathlib import Path
@@ -67,8 +65,17 @@ class Occurrence:
paragraph_end: int
word_start: int
word_end: int
- lemma: str = ""
- entity_id: int = 0
+ entity_id: int = -1
+ sense_ids: tuple[int, ...] = ()
+
+
+@dataclass
+class Sense:
+ pos: str
+ short_def: str
+ full_def: str
+ example: str
+ ipas: list[str] = field(default_factory=list)
class EPUB:
@@ -79,6 +86,7 @@ def __init__(
wiki_commons: Wikimedia_Commons | None,
wikidata: Wikidata | None,
custom_x_ray: CustomX,
+ lemmas_conn: sqlite3.Connection | None,
) -> None:
self.book_path = Path(book_path_str)
self.mediawiki = mediawiki
@@ -97,10 +105,13 @@ def __init__(
self.image_href_has_folder = False
self.image_filenames: set[str] = set()
self.custom_x_ray = custom_x_ray
- self.lemmas: dict[str, int] = {}
- self.lemma_id = 0
- self.lemmas_conn: sqlite3.Connection | None = None
+ self.sense_id_dict: dict[tuple[int, ...], int] = {}
+ self.word_wise_id = 0
+ self.lemmas_conn: sqlite3.Connection | None = lemmas_conn
self.prefs: Prefs = {}
+ self.lemma_lang: str = ""
+ self.gloss_lang: str = ""
+ self.gloss_source: str = ""
def extract_epub(self) -> Iterator[tuple[str, tuple[int, int, Path]]]:
from lxml import etree
@@ -221,24 +232,32 @@ def add_entity(
def add_lemma(
self,
lemma: str,
+ pos: str,
paragraph_start: int,
paragraph_end: int,
word_start: int,
word_end: int,
xhtml_path: Path,
) -> None:
+ sense_ids = self.find_sense_ids(lemma, pos)
+ if len(sense_ids) == 0:
+ return
+ if sense_ids in self.sense_id_dict:
+ ww_id = self.sense_id_dict[sense_ids]
+ else:
+ ww_id = self.word_wise_id
+ self.word_wise_id += 1
+ self.sense_id_dict[sense_ids] = ww_id
+
self.entity_occurrences[xhtml_path].append(
Occurrence(
paragraph_start=paragraph_start,
paragraph_end=paragraph_end,
word_start=word_start,
word_end=word_end,
- lemma=lemma,
+ sense_ids=sense_ids,
)
)
- if lemma not in self.lemmas:
- self.lemmas[lemma] = self.lemma_id
- self.lemma_id += 1
def remove_entities(self, minimal_count: int) -> None:
for entity, data in self.entities.copy().items():
@@ -252,26 +271,22 @@ def remove_entities(self, minimal_count: int) -> None:
self.removed_entity_ids.add(data["id"])
def modify_epub(
- self,
- prefs: Prefs,
- lemma_lang: str,
- gloss_lang: str,
- lemmas_conn: sqlite3.Connection | None,
- has_multiple_ipas: bool,
+ self, prefs: Prefs, lemma_lang: str, gloss_lang: str, gloss_source: str
) -> None:
- self.lemmas_conn = lemmas_conn
self.prefs = prefs
- self.has_multiple_ipas = has_multiple_ipas
+ self.lemma_lang = lemma_lang
+ self.gloss_lang = gloss_lang
+ self.gloss_source = gloss_source
if self.entities:
query_mediawiki(self.entities, self.mediawiki, prefs["search_people"])
if self.wikidata:
query_wikidata(self.entities, self.mediawiki, self.wikidata)
if prefs["minimal_x_ray_count"] > 1:
self.remove_entities(prefs["minimal_x_ray_count"])
- self.create_x_ray_footnotes(prefs, lemma_lang)
- self.insert_anchor_elements(lemma_lang)
- if self.lemmas:
- self.create_word_wise_footnotes(lemma_lang, gloss_lang)
+ self.create_x_ray_footnotes()
+ self.insert_anchor_elements()
+ if len(self.sense_id_dict) > 0:
+ self.create_word_wise_footnotes()
self.modify_opf()
self.zip_extract_folder()
if self.mediawiki is not None:
@@ -280,12 +295,12 @@ def modify_epub(
self.wikidata.close()
if self.wiki_commons is not None:
self.wiki_commons.close()
- if lemmas_conn is not None:
- lemmas_conn.close()
+ if self.lemmas_conn is not None:
+ self.lemmas_conn.close()
- def insert_anchor_elements(self, lemma_lang: str) -> None:
+ def insert_anchor_elements(self) -> None:
css_rules = ""
- if len(self.lemmas) > 0:
+ if len(self.sense_id_dict) > 0:
css_rules += """
body {line-height: 2.5;}
ruby.wordwise {text-decoration: overline;}
@@ -300,7 +315,7 @@ def insert_anchor_elements(self, lemma_lang: str) -> None:
"""
for xhtml_path, occurrences in self.entity_occurrences.items():
- if self.entities and self.lemmas:
+ if len(self.entities) > 0 and self.lemmas_conn is not None:
occurrences = sorted(
occurrences,
key=operator.attrgetter("paragraph_start", "word_start"),
@@ -326,14 +341,14 @@ def insert_anchor_elements(self, lemma_lang: str) -> None:
paragraph_text[last_w_end : occurrence.word_start]
)
word = paragraph_text[occurrence.word_start : occurrence.word_end]
- if occurrence.lemma == "":
+ if occurrence.entity_id != -1:
new_xhtml_str += (
f'{escape(word)}'
)
else:
new_xhtml_str += self.build_word_wise_tag(
- occurrence.lemma, word, lemma_lang
+ occurrence.sense_ids, word
)
last_w_end = occurrence.word_end
if occurrence.paragraph_end != last_p_end:
@@ -355,39 +370,33 @@ def insert_anchor_elements(self, lemma_lang: str) -> None:
)
f.write(new_xhtml_str)
- def build_word_wise_tag(self, lemma: str, word: str, lemma_lang: str) -> str:
- if lemma not in self.lemmas:
- return word
- data = self.get_lemma_gloss(lemma, lemma_lang)
- if not data:
- del self.lemmas[lemma]
- return word
- short_def = data[0][0]
- len_ratio = 3 if lemma_lang in CJK_LANGS else 2.5
- lemma_id = self.lemmas[lemma]
+ def build_word_wise_tag(
+ self,
+ sense_ids: tuple[int, ...],
+ word: str,
+ ) -> str:
+ ww_id = self.sense_id_dict[sense_ids]
+ sense_list = self.get_sense_data(sense_ids[:1])
+ short_def = sense_list[0].short_def
+ len_ratio = 3 if self.lemma_lang in CJK_LANGS else 2.5
if len(short_def) / len(word) > len_ratio:
return (
'{escape(word)}'
+ f'{ww_id}">{escape(word)}'
)
else:
return (
'{escape(word)}"
)
- def split_p_tags(self, intro: str) -> str:
- intro = escape(intro)
- p_tags = ""
- for p_str in intro.splitlines():
- p_tags += f"
{p_str}
"
- return p_tags
-
- def create_x_ray_footnotes(self, prefs: Prefs, lang: str) -> None:
+ def create_x_ray_footnotes(self) -> None:
if self.mediawiki is None: # just let mypy know it's not None
return
- source_name, source_link = x_ray_source(self.mediawiki.source_id, prefs, lang)
+ source_name, source_link = x_ray_source(
+ self.mediawiki.source_id, self.prefs, self.lemma_lang
+ )
image_prefix = ""
if self.xhtml_href_has_folder:
image_prefix += "../"
@@ -396,7 +405,7 @@ def create_x_ray_footnotes(self, prefs: Prefs, lang: str) -> None:
s = f"""
+ lang="{self.lemma_lang}" xml:lang="{self.lemma_lang}">
X-Ray
"""
@@ -405,11 +414,11 @@ def create_x_ray_footnotes(self, prefs: Prefs, lang: str) -> None:
custom_desc, custom_source_id, _ = custom_data
s += (
f'"
- elif (prefs["search_people"] or data["label"] not in PERSON_LABELS) and (
- intro_cache := self.mediawiki.get_cache(entity)
- ):
+ elif (
+ self.prefs["search_people"] or data["label"] not in PERSON_LABELS
+ ) and (intro_cache := self.mediawiki.get_cache(entity)):
s += f'