change structure of RawEntryType completely, improves RAM usage for indirect mode
ilius · Dec 6, 2024 · bcf4e11 · bcf4e11
1 parent eda901a
commit bcf4e11
Show file tree

Hide file tree

Showing 4 changed files with 32 additions and 27 deletions.
diff --git a/pyglossary/entry.py b/pyglossary/entry.py
@@ -196,8 +196,8 @@ def getRawEntrySortKey(
 		key: Callable[[bytes], Any],
 	) -> Callable[[RawEntryType], Any]:
 		def newKey(x: RawEntryType) -> Any:
-			# x is rawEntry, so x[0] is list[str]: list of words (entry.l_word)
-			return key(x[0])  # type: ignore
+			# x is rawEntry, so x[2:] is list[bytes]: list of words in bytes
+			return key([b.decode("utf-8") for b in x[2:]])  # type: ignore
 
 		return newKey
 
@@ -262,6 +262,13 @@ def l_word(self) -> list[str]:
 			return [self._word]
 		return self._word
 
+	@property
+	def lb_word(self) -> list[bytes]:
+		"""Returns list of the word and all the alternate words."""
+		if isinstance(self._word, str):
+			return [self._word.encode("utf-8")]
+		return [word.encode("utf-8") for word in self._word]
+
 	@property
 	def defi(self) -> str:
 		"""Returns string of definition."""

diff --git a/pyglossary/glossary_types.py b/pyglossary/glossary_types.py
@@ -4,6 +4,7 @@
 from collections.abc import (
 	Callable,
 	Iterator,
+	Sequence,
 )
 
 # -*- coding: utf-8 -*-
@@ -30,10 +31,10 @@
 
 MultiStr: TypeAlias = "str | list[str]"
 
-# different types in order:
-# - without defiFormat
-# - with defiFormat
-RawEntryType: TypeAlias = tuple[list[str], bytes] | tuple[list[str], bytes, str]
+# str(rawEntry[0]): defiFormat or ""
+# rawEntry[1]: b_defi
+# rawEntry[2:]: b_word_list
+RawEntryType: TypeAlias = Sequence[bytes]
 
 
 class EntryType(typing.Protocol):  # noqa: PLR0904
@@ -56,6 +57,9 @@ def s_word(self) -> str: ...
 	@property
 	def l_word(self) -> list[str]: ...
 
+	@property
+	def lb_word(self) -> list[bytes]: ...
+
 	@property
 	def defi(self) -> str: ...
 

diff --git a/pyglossary/glossary_v2.py b/pyglossary/glossary_v2.py
@@ -241,11 +241,7 @@ def _dataEntryToRaw(self, entry: DataEntry) -> RawEntryType:
 		b_fpath = b""
 		if self.tmpDataDir:
 			b_fpath = entry.save(self.tmpDataDir).encode("utf-8")
-		return (
-			[entry.getFileName()],
-			b_fpath,
-			"b",
-		)
+		return (b"b", b_fpath, entry.getFileName().encode("utf-8"))
 
 	def _entryToRaw(self, entry: EntryType) -> RawEntryType:
 		"""
@@ -256,22 +252,21 @@ def _entryToRaw(self, entry: EntryType) -> RawEntryType:
 			return self._dataEntryToRaw(cast("DataEntry", entry))
 
 		defiFormat = entry.defiFormat
-		if defiFormat and defiFormat != self._defaultDefiFormat:
-			return (entry.l_word, entry.b_defi, defiFormat)
-		return (entry.l_word, entry.b_defi)
+		if defiFormat is None or defiFormat == self._defaultDefiFormat:
+			defiFormat = ""
+
+		return [defiFormat.encode("ascii"), entry.b_defi] + entry.lb_word
 
 	def _entryFromRaw(self, rawEntry: RawEntryType) -> EntryType:
-		word = rawEntry[0]
+		defiFormat = rawEntry[0].decode("ascii") or self._defaultDefiFormat
 		defi = rawEntry[1].decode("utf-8")
-		if len(rawEntry) > 2:  # noqa: PLR2004
-			defiFormat = rawEntry[2]
-			if defiFormat == "b":
-				fname = word
-				if isinstance(fname, list):
-					fname = fname[0]  # NESTED 4
-				return DataEntry(fname, tmpPath=defi)  # pyright: ignore[reportReturnType]
-		else:
-			defiFormat = self._defaultDefiFormat
+		word = [b.decode("utf-8") for b in rawEntry[2:]]
+
+		if defiFormat == "b":
+			fname = word
+			if isinstance(fname, list):
+				fname = fname[0]  # NESTED 4
+			return DataEntry(fname, tmpPath=defi)  # pyright: ignore[reportReturnType]
 
 		return Entry(word, defi, defiFormat=defiFormat)  # pyright: ignore[reportReturnType]
 

diff --git a/pyglossary/sq_entry_list.py b/pyglossary/sq_entry_list.py
@@ -21,7 +21,6 @@
 import logging
 import os
 from os.path import isfile
-from pickle import dumps, loads
 from typing import TYPE_CHECKING
 
 from .glossary_utils import Error
@@ -116,10 +115,10 @@ def __len__(self) -> int:
 		return self._len
 
 	def _encode(self, entry: EntryType) -> bytes:
-		return dumps(self._entryToRaw(entry))
+		return b"\x00".join(self._entryToRaw(entry))
 
 	def _decode(self, data: bytes) -> EntryType:
-		return self._entryFromRaw(loads(data))
+		return self._entryFromRaw(data.split(b"\x00"))
 
 	def append(self, entry: EntryType) -> None:
 		self._cur.execute(