diff --git a/pyglossary/entry.py b/pyglossary/entry.py
index d25cb759e..872263a3d 100644
--- a/pyglossary/entry.py
+++ b/pyglossary/entry.py
@@ -196,8 +196,8 @@ def getRawEntrySortKey(
 	key: Callable[[bytes], Any],
 ) -> Callable[[RawEntryType], Any]:
 	def newKey(x: RawEntryType) -> Any:
-		# x is rawEntry, so x[0] is list[str]: list of words (entry.l_word)
-		return key(x[0]) # type: ignore
+		# x is rawEntry, so x[2:] is list[bytes]: the words, decoded to str for key
+		return key([b.decode("utf-8") for b in x[2:]]) # type: ignore
 
 	return newKey
 
@@ -262,6 +262,13 @@ def l_word(self) -> list[str]:
 			return [self._word]
 		return self._word
 
+	@property
+	def lb_word(self) -> list[bytes]:
+		"""Returns list of the word and all the alternate words, UTF-8 encoded."""
+		if isinstance(self._word, str):
+			return [self._word.encode("utf-8")]
+		return [word.encode("utf-8") for word in self._word]
+
 	@property
 	def defi(self) -> str:
 		"""Returns string of definition."""
diff --git a/pyglossary/glossary_types.py b/pyglossary/glossary_types.py
index b5908e0e6..305922ee1 100644
--- a/pyglossary/glossary_types.py
+++ b/pyglossary/glossary_types.py
@@ -4,6 +4,7 @@
 from collections.abc import (
 	Callable,
 	Iterator,
+	Sequence,
 )
 
 # -*- coding: utf-8 -*-
@@ -30,10 +31,10 @@
 
 MultiStr: TypeAlias = "str | list[str]"
 
-# different types in order:
-# - without defiFormat
-# - with defiFormat
-RawEntryType: TypeAlias = tuple[list[str], bytes] | tuple[list[str], bytes, str]
+# rawEntry[0]: defiFormat as ASCII bytes, or b"" for the default format
+# rawEntry[1]: b_defi
+# rawEntry[2:]: b_word_list
+RawEntryType: TypeAlias = Sequence[bytes]
 
 
 class EntryType(typing.Protocol): # noqa: PLR0904
@@ -56,6 +57,9 @@ def s_word(self) -> str: ...
 	@property
 	def l_word(self) -> list[str]: ...
 
+	@property
+	def lb_word(self) -> list[bytes]: ...
+
 	@property
 	def defi(self) -> str: ...
 
diff --git a/pyglossary/glossary_v2.py b/pyglossary/glossary_v2.py
index a64c92ee0..416778f63 100644
--- a/pyglossary/glossary_v2.py
+++ b/pyglossary/glossary_v2.py
@@ -241,11 +241,7 @@ def _dataEntryToRaw(self, entry: DataEntry) -> RawEntryType:
 		b_fpath = b""
 		if self.tmpDataDir:
 			b_fpath = entry.save(self.tmpDataDir).encode("utf-8")
-		return (
-			[entry.getFileName()],
-			b_fpath,
-			"b",
-		)
+		return (b"b", b_fpath, entry.getFileName().encode("utf-8"))
 
 	def _entryToRaw(self, entry: EntryType) -> RawEntryType:
 		"""
@@ -256,22 +252,19 @@ def _entryToRaw(self, entry: EntryType) -> RawEntryType:
 			return self._dataEntryToRaw(cast("DataEntry", entry))
 
 		defiFormat = entry.defiFormat
-		if defiFormat and defiFormat != self._defaultDefiFormat:
-			return (entry.l_word, entry.b_defi, defiFormat)
-		return (entry.l_word, entry.b_defi)
+		if defiFormat is None or defiFormat == self._defaultDefiFormat:
+			defiFormat = ""
+
+		return [defiFormat.encode("ascii"), entry.b_defi] + entry.lb_word
 
 	def _entryFromRaw(self, rawEntry: RawEntryType) -> EntryType:
-		word = rawEntry[0]
+		defiFormat = rawEntry[0].decode("ascii") or self._defaultDefiFormat
 		defi = rawEntry[1].decode("utf-8")
-		if len(rawEntry) > 2: # noqa: PLR2004
-			defiFormat = rawEntry[2]
-			if defiFormat == "b":
-				fname = word
-				if isinstance(fname, list):
-					fname = fname[0] # NESTED 4
-				return DataEntry(fname, tmpPath=defi) # pyright: ignore[reportReturnType]
-		else:
-			defiFormat = self._defaultDefiFormat
+		word = [b.decode("utf-8") for b in rawEntry[2:]]
+
+		if defiFormat == "b":
+			# word is always a list here, so the file name is its first item
+			return DataEntry(word[0], tmpPath=defi) # pyright: ignore[reportReturnType]
 
 		return Entry(word, defi, defiFormat=defiFormat) # pyright: ignore[reportReturnType]
 
diff --git a/pyglossary/sq_entry_list.py b/pyglossary/sq_entry_list.py
index d0609922e..78e70523c 100644
--- a/pyglossary/sq_entry_list.py
+++ b/pyglossary/sq_entry_list.py
@@ -21,7 +21,6 @@
 import logging
 import os
 from os.path
import isfile
-from pickle import dumps, loads
 from typing import TYPE_CHECKING
 
 from .glossary_utils import Error
@@ -116,10 +115,11 @@ def __len__(self) -> int:
 		return self._len
 
 	def _encode(self, entry: EntryType) -> bytes:
-		return dumps(self._entryToRaw(entry))
+		# NUL-joined fields: a b"\x00" inside any field would add fields on decode
+		return b"\x00".join(self._entryToRaw(entry))
 
 	def _decode(self, data: bytes) -> EntryType:
-		return self._entryFromRaw(loads(data))
+		return self._entryFromRaw(data.split(b"\x00"))
 
 	def append(self, entry: EntryType) -> None:
 		self._cur.execute(