Commit
Remove "use_pos" config option
The code now uses POS to find the gloss when it is available, which
should match more words, especially for languages like Korean for
which spaCy doesn't have a good lemmatizer.

Also update the Word Wise test files, because the old ones were
created without using POS.
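Roughly, the lookup now prefers the spaCy lemma and POS tag and only falls back to the surface form. Below is a minimal sketch against the lemma database schema visible in the diff; the helper name, the columns selected, and the exact fallback order are assumptions for illustration, not the plugin's code (the real lookup goes through `get_kindle_lemma_data()`).

```python
# Hypothetical helper showing POS-aware sense lookup; schema (lemmas, forms,
# senses with pos/enabled/difficulty) is taken from the queries in this commit.
import sqlite3


def find_sense(conn: sqlite3.Connection, lemma: str, word: str, pos: str):
    # Match either a dictionary lemma or an inflected form; when spaCy supplied
    # a POS tag, require the sense's POS to agree, otherwise accept any sense.
    query = """
        SELECT s.* FROM senses s
        JOIN lemmas l ON l.id = s.lemma_id
        WHERE l.lemma = :text AND s.enabled = 1
          AND (:pos = '' OR s.pos = :pos)
        UNION ALL
        SELECT s.* FROM senses s
        JOIN forms f ON f.lemma_id = s.lemma_id AND f.pos = s.pos
        WHERE f.form = :text AND s.enabled = 1
          AND (:pos = '' OR s.pos = :pos)
    """
    for text in (lemma, word):  # try the spaCy lemma first, then the surface form
        if text:
            row = conn.execute(query, {"text": text, "pos": pos}).fetchone()
            if row is not None:
                return row
    return None  # no enabled sense found; the token gets no gloss
```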
xxyzz committed Jul 28, 2024
1 parent 32e4559 commit 9af0c6e
Showing 8 changed files with 73 additions and 204 deletions.
2 changes: 1 addition & 1 deletion __init__.py
@@ -1,6 +1,6 @@
from calibre.customize import InterfaceActionBase

VERSION = (3, 32, 2)
VERSION = (3, 33, 0)


class WordDumbDumb(InterfaceActionBase):
6 changes: 0 additions & 6 deletions config.py
@@ -44,7 +44,6 @@
)

prefs = JSONConfig("plugins/worddumb")
prefs.defaults["use_pos"] = True
prefs.defaults["search_people"] = False
prefs.defaults["model_size"] = "md"
prefs.defaults["zh_wiki_variant"] = "cn"
@@ -94,10 +93,6 @@ def __init__(self):
)
vl.addWidget(custom_wiktionary_button)

self.use_pos_box = QCheckBox(_("Use POS type to find Word Wise definition"))
self.use_pos_box.setChecked(prefs["use_pos"])
vl.addWidget(self.use_pos_box)

self.search_people_box = QCheckBox(
_(
"Fetch X-Ray people descriptions from Wikipedia or other "
@@ -231,7 +226,6 @@ def open_github(self) -> None:

def save_settings(self) -> None:
prefs["python_path"] = self.python_path.text()
prefs["use_pos"] = self.use_pos_box.isChecked()
prefs["search_people"] = self.search_people_box.isChecked()
prefs["model_size"] = self.model_size_box.currentData()
prefs["zh_wiki_variant"] = self.zh_wiki_box.currentData()
118 changes: 19 additions & 99 deletions dump_lemmas.py
@@ -1,5 +1,4 @@
import sqlite3
from operator import itemgetter
from pathlib import Path

try:
@@ -27,10 +26,8 @@ def spacy_doc_path(
model_version: str,
lemma_lang: str,
is_kindle: bool,
is_phrase: bool,
plugin_path: Path,
prefs: Prefs,
use_lemma_matcher: bool,
):
import platform

@@ -42,10 +39,6 @@
f"{spacy_model or lemma_lang}_{'kindle' if is_kindle else 'wiktionary'}"
f"_{gloss_lang}_{model_version}_{py_version}"
)
if use_lemma_matcher:
if is_phrase:
path = path.with_name(path.name + "_phrase")
path = path.with_name(path.name + "_pos")
return path


@@ -60,17 +53,7 @@ def dump_spacy_docs(
insert_installed_libs(plugin_path)
import spacy

use_lemma_matcher = prefs["use_pos"] and lemma_lang != "zh" and spacy_model != ""
excluded_components = ["ner", "parser"]
if not use_lemma_matcher:
excluded_components.extend(
["tok2vec", "morphologizer", "tagger", "attribute_ruler", "lemmatizer"]
)
nlp = (
spacy.load(spacy_model, exclude=excluded_components)
if spacy_model != ""
else spacy.blank(lemma_lang)
)
nlp = spacy.load(spacy_model) if spacy_model != "" else spacy.blank(lemma_lang)
lemmas_conn = sqlite3.connect(db_path)
pkg_versions = load_plugin_json(plugin_path, "data/deps.json")
save_spacy_docs(
@@ -82,7 +65,6 @@
lemmas_conn,
plugin_path,
prefs,
use_lemma_matcher,
)
lemmas_conn.close()

@@ -96,95 +78,33 @@ def save_spacy_docs(
lemmas_conn: sqlite3.Connection,
plugin_path: Path,
prefs: Prefs,
use_lemma_matcher: bool,
):
from spacy.tokens import DocBin

phrases_doc_bin = DocBin(attrs=["LOWER"])
if use_lemma_matcher:
lemmas_doc_bin = DocBin(attrs=["LOWER"])
lemmas_doc_bin = DocBin(attrs=["LOWER"])
difficulty_limit = (
5 if is_kindle else prefs[f"{lemma_lang}_wiktionary_difficulty_limit"]
)
if use_lemma_matcher:
for doc in create_lemma_patterns_with_pos(
lemma_lang, lemmas_conn, nlp, difficulty_limit
):
if " " in doc.text:
phrases_doc_bin.add(doc)
else:
lemmas_doc_bin.add(doc)
else:
for doc in create_lemma_patterns_without_pos(
lemmas_conn, nlp, difficulty_limit
):
phrases_doc_bin.add(doc)

phrases_doc_bin.to_disk(
spacy_doc_path(
spacy_model,
model_version,
lemma_lang,
is_kindle,
True,
plugin_path,
prefs,
use_lemma_matcher,
)
)
if use_lemma_matcher:
lemmas_doc_bin.to_disk(
spacy_doc_path(
spacy_model,
model_version,
lemma_lang,
is_kindle,
False,
plugin_path,
prefs,
use_lemma_matcher,
)
)


def create_lemma_patterns_with_pos(lemma_lang, conn, nlp, difficulty_limit):
if lemma_lang == "zh":
query_sql = """
SELECT DISTINCT lemma
FROM lemmas l
JOIN senses s ON l.id = s.lemma_id AND enabled = 1 AND difficulty <= :difficulty
UNION ALL
SELECT DISTINCT form FROM forms f
JOIN senses s ON f.lemma_id = s.lemma_id AND f.pos = s.pos
AND enabled = 1 AND difficulty <= :difficulty
"""
else:
query_sql = """
SELECT DISTINCT lemma
FROM lemmas l
JOIN senses s ON l.id = s.lemma_id AND enabled = 1 AND difficulty <= :difficulty
UNION ALL
SELECT DISTINCT form
FROM lemmas l
JOIN forms f ON l.id = f.lemma_id
JOIN senses s ON l.id = s.lemma_id AND f.pos = s.pos
AND enabled = 1 AND difficulty <= :difficulty
"""
yield from nlp.pipe(
map(itemgetter(0), conn.execute(query_sql, {"difficulty": difficulty_limit}))
)


def create_lemma_patterns_without_pos(conn, nlp, difficulty_limit):
query_sql = """
SELECT DISTINCT lemma
FROM lemmas l JOIN senses s ON l.id = s.lemma_id
AND enabled = 1 AND difficulty <= :difficulty
FROM lemmas l
JOIN senses s ON l.id = s.lemma_id AND enabled = 1 AND difficulty <= :difficulty
UNION ALL
SELECT DISTINCT form
FROM forms f JOIN senses s ON f.lemma_id = s.lemma_id
AND f.pos = s.pos AND enabled = 1 AND difficulty <= :difficulty
FROM lemmas l
JOIN forms f ON l.id = f.lemma_id
JOIN senses s ON l.id = s.lemma_id AND f.pos = s.pos
AND enabled = 1 AND difficulty <= :difficulty
"""
yield from nlp.tokenizer.pipe(
map(itemgetter(0), conn.execute(query_sql, {"difficulty": difficulty_limit}))
for doc in nlp.tokenizer.pipe(
map(
lambda x: x[0].lower(),
lemmas_conn.execute(query_sql, {"difficulty": difficulty_limit}),
)
):
lemmas_doc_bin.add(doc)
lemmas_doc_bin.to_disk(
spacy_doc_path(
spacy_model, model_version, lemma_lang, is_kindle, plugin_path, prefs
)
)
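For reference, here is a self-contained sketch of the DocBin round trip that dump_lemmas.py and parse_job.py now share: entries are lower-cased, tokenized, serialized with only the LOWER attribute, then reloaded to build a case-insensitive PhraseMatcher. The word list and file name are made up for the example and are not the plugin's actual data.

```python
# Standalone illustration of the DocBin -> PhraseMatcher flow used above.
import spacy
from spacy.matcher import PhraseMatcher
from spacy.tokens import DocBin

nlp = spacy.blank("en")
words = ["serendipity", "take for granted"]  # stand-ins for DB lemmas/forms

# Dump: tokenize each lower-cased entry and store only the LOWER attribute.
doc_bin = DocBin(attrs=["LOWER"])
for doc in nlp.tokenizer.pipe(w.lower() for w in words):
    doc_bin.add(doc)
doc_bin.to_disk("lemmas_example.spacy")

# Load: rebuild the docs and register them as case-insensitive patterns.
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
patterns = list(DocBin().from_disk("lemmas_example.spacy").get_docs(nlp.vocab))
matcher.add("lemmas", patterns)

# Matches "Serendipity" and "take for granted" despite the different casing.
for span in matcher(nlp("Serendipity is easy to take for granted."), as_spans=True):
    print(span.text)
```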
79 changes: 18 additions & 61 deletions parse_job.py
@@ -5,7 +5,6 @@
import sqlite3
from dataclasses import asdict, dataclass
from html import escape, unescape
from itertools import chain
from pathlib import Path
from sqlite3 import Connection
from typing import Any, Iterator
@@ -237,7 +236,6 @@ def create_files(data: ParseJobData, prefs: Prefs, notif: Any) -> None:
nlp = load_spacy(
data.spacy_model,
data.book_path if data.create_x else None,
prefs["use_pos"],
data.book_lang,
)
lemmas_conn = None
@@ -250,7 +248,7 @@ def create_files(data: ParseJobData, prefs: Prefs, notif: Any) -> None:
else kindle_db_path(data.plugin_path, data.book_lang, prefs)
)
lemmas_conn = sqlite3.connect(lemmas_db_path)
lemma_matcher, phrase_matcher = create_spacy_matcher(
lemma_matcher = create_spacy_matcher(
nlp,
data.spacy_model,
data.book_lang,
@@ -316,13 +314,11 @@ def create_files(data: ParseJobData, prefs: Prefs, notif: Any) -> None:
epub_find_lemma(
doc,
lemma_matcher,
phrase_matcher,
start,
end,
interval_tree,
epub,
xhtml_path,
prefs["use_pos"],
)
supported_languages = load_languages_data(data.plugin_path)
gloss_lang = prefs["wiktionary_gloss_lang"]
@@ -371,7 +367,6 @@ def create_files(data: ParseJobData, prefs: Prefs, notif: Any) -> None:
kindle_find_lemma(
doc,
lemma_matcher,
phrase_matcher,
start,
data.mobi_codec,
escaped_text,
@@ -425,20 +420,9 @@ def index_in_escaped_text(
return None


def match_lemmas(doc, lemma_matcher, phrase_matcher):
from spacy.util import filter_spans

phrase_spans = phrase_matcher(doc, as_spans=True)
if lemma_matcher is not None:
return filter_spans(chain(phrase_spans, lemma_matcher(doc, as_spans=True)))
else:
return filter_spans(phrase_spans)


def kindle_find_lemma(
doc,
lemma_matcher,
phrase_matcher,
start,
mobi_codec,
escaped_text,
@@ -447,14 +431,14 @@ def kindle_find_lemma(
lemma_lang,
prefs,
):
from spacy.util import filter_spans

lemma_starts: set[int] = set()
for span in match_lemmas(doc, lemma_matcher, phrase_matcher):
lemma = getattr(span, "lemma_", "")
pos = getattr(span.doc[span.start], "pos_", "")
for span in filter_spans(lemma_matcher(doc, as_spans=True)):
data = get_kindle_lemma_data(
span.lemma_ if prefs["use_pos"] and lemma != "" else span.text,
getattr(span, "lemma_", ""),
span.text,
pos if prefs["use_pos"] and pos != "" else "",
getattr(span.doc[span.start], "pos_", ""),
lemmas_conn,
lemma_lang,
prefs,
@@ -476,15 +460,15 @@ def epub_find_lemma(
def epub_find_lemma(
doc,
lemma_matcher,
phrase_matcher,
paragraph_start,
paragraph_end,
interval_tree,
epub,
xhtml_path,
use_pos,
):
for span in match_lemmas(doc, lemma_matcher, phrase_matcher):
from spacy.util import filter_spans

for span in filter_spans(lemma_matcher(doc, as_spans=True)):
if interval_tree is not None and interval_tree.is_overlap(
Interval(span.start_char, span.end_char - 1)
):
@@ -493,7 +477,7 @@ def epub_find_lemma(
epub.add_lemma(
getattr(span, "lemma_", ""),
span.text,
spacy_to_wiktionary_pos(pos) if use_pos and pos != "" else "",
spacy_to_wiktionary_pos(pos) if pos != "" else "",
paragraph_start,
paragraph_end,
span.start_char,
@@ -738,19 +722,13 @@ def find_named_entity(
return intervals


def load_spacy(
model: str, book_path: str | None, use_pos: bool, lemma_lang: str
) -> Any:
def load_spacy(model: str, book_path: str | None, lemma_lang: str) -> Any:
import spacy

if model == "":
return spacy.blank(lemma_lang)

excluded_components = []
if not use_pos:
excluded_components.extend(
["tok2vec", "morphologizer", "tagger", "attribute_ruler", "lemmatizer"]
)
if book_path is None:
excluded_components.append("ner")

@@ -790,20 +768,11 @@ def create_spacy_matcher(
disabled_pipes = list(set(["ner", "parser", "senter"]) & set(nlp.pipe_names))
pkg_versions = load_plugin_json(plugin_path, "data/deps.json")
model_version = get_spacy_model_version(model, pkg_versions)
# Chinese words don't have inflection forms, only use phrase matcher
use_lemma_matcher = prefs["use_pos"] and lemma_lang != "zh" and model != ""
phrase_matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
phrases_doc_path = spacy_doc_path(
model,
model_version,
lemma_lang,
is_kindle,
True,
plugin_path,
prefs,
use_lemma_matcher,
lemma_matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
lemmas_doc_path = spacy_doc_path(
model, model_version, lemma_lang, is_kindle, plugin_path, prefs
)
if not phrases_doc_path.exists():
if not lemmas_doc_path.exists():
save_spacy_docs(
nlp,
model,
@@ -813,20 +782,8 @@
lemmas_conn,
plugin_path,
prefs,
use_lemma_matcher,
)
phrases_doc_bin = DocBin().from_disk(phrases_doc_path)
if use_lemma_matcher:
lemma_matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
lemmas_doc_path = spacy_doc_path(
model, model_version, lemma_lang, is_kindle, False, plugin_path, prefs, True
)
lemmas_doc_bin = DocBin().from_disk(lemmas_doc_path)

lemmas_doc_bin = DocBin().from_disk(lemmas_doc_path)
with nlp.select_pipes(disable=disabled_pipes):
phrase_matcher.add("phrases", phrases_doc_bin.get_docs(nlp.vocab))
if use_lemma_matcher:
lemma_matcher.add("lemmas", lemmas_doc_bin.get_docs(nlp.vocab))
return lemma_matcher, phrase_matcher
else:
return None, phrase_matcher
lemma_matcher.add("lemmas", lemmas_doc_bin.get_docs(nlp.vocab))
return lemma_matcher