Skip to content

Commit

Permalink
stardict: some refactoring in writer.py
Browse files: browse the repository at this point in the history
  • Loading branch information
ilius committed Dec 7, 2024
1 parent 9ec81a1 commit 95d133f
Showing 1 changed file with 24 additions and 43 deletions.
67 changes: 24 additions & 43 deletions pyglossary/plugins/stardict/writer.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -133,6 +133,9 @@ def open(self, filename: str) -> None:
def write(self) -> Generator[None, EntryType, None]:
from pyglossary.os_utils import runDictzip

if not isdir(self._resDir):
os.mkdir(self._resDir)

if self._sametypesequence:
if self._merge_syns:
yield from self.writeCompactMergeSyns(self._sametypesequence)
Expand Down Expand Up @@ -207,8 +210,6 @@ def writeCompact(self, defiFormat: str) -> Generator[None, EntryType, None]:

t0 = now()
wordCount = 0
if not isdir(self._resDir):
os.mkdir(self._resDir)

entryIndex = -1
while True:
Expand All @@ -221,26 +222,22 @@ def writeCompact(self, defiFormat: str) -> Generator[None, EntryType, None]:
entryIndex += 1

words = entry.l_word # list of strs
word = words[0] # str
defi = self.fixDefi(entry.defi, defiFormat)
# defi is str

for alt in words[1:]:
altIndexList.append((alt.encode("utf-8"), entryIndex))

b_dictBlock = defi.encode("utf-8")
b_dictBlock = self.fixDefi(entry.defi, defiFormat).encode("utf-8")
dictFile.write(b_dictBlock)
blockLen = len(b_dictBlock)

b_idxBlock = (
word.encode("utf-8")
words[0].encode("utf-8")
+ b"\x00"
+ dictMarkToBytes(dictMark)
+ uint32ToBytes(blockLen)
+ uint32ToBytes(len(b_dictBlock))
)
idxFile.write(b_idxBlock)

dictMark += blockLen
dictMark += len(b_dictBlock)
wordCount += 1

if dictMark > dictMarkMax:
Expand Down Expand Up @@ -277,8 +274,6 @@ def writeGeneral(self) -> Generator[None, EntryType, None]:
t0 = now()
wordCount = 0
defiFormatCounter: typing.Counter[str] = Counter()
if not isdir(self._resDir):
os.mkdir(self._resDir)

dictMarkToBytes, dictMarkMax = self.dictMarkToBytesFunc()

Expand All @@ -300,26 +295,23 @@ def writeGeneral(self) -> Generator[None, EntryType, None]:
defiFormat = "m"

words = entry.l_word # list of strs
word = words[0] # str
defi = self.fixDefi(entry.defi, defiFormat)
# defi is str

for alt in words[1:]:
altIndexList.append((alt.encode("utf-8"), entryIndex))

b_dictBlock = (defiFormat + defi).encode("utf-8") + b"\x00"
b_defi = self.fixDefi(entry.defi, defiFormat).encode("utf-8")
b_dictBlock = defiFormat.encode("ascii") + b_defi + b"\x00"
dictFile.write(b_dictBlock)
blockLen = len(b_dictBlock)

b_idxBlock = (
word.encode("utf-8")
words[0].encode("utf-8")
+ b"\x00"
+ dictMarkToBytes(dictMark)
+ uint32ToBytes(blockLen)
+ uint32ToBytes(len(b_dictBlock))
)
idxFile.write(b_idxBlock)

dictMark += blockLen
dictMark += len(b_dictBlock)
wordCount += 1

if dictMark > dictMarkMax:
Expand Down Expand Up @@ -382,20 +374,18 @@ def writeCompactMergeSyns(
defiFormat - format of article definition: h - html, m - plain text
"""
log.debug(f"writeCompactMergeSyns: {defiFormat=}")
dictMark = 0

idxBlockList = self.newIdxList()
altIndexList = self.newSynList()

dictFile = open(self._filename + ".dict", "wb")

t0 = now()
if not isdir(self._resDir):
os.mkdir(self._resDir)

dictMarkToBytes, dictMarkMax = self.dictMarkToBytesFunc()

entryIndex = -1
dictMark = 0
while True:
entry = yield
if entry is None:
Expand All @@ -406,19 +396,15 @@ def writeCompactMergeSyns(
entryIndex += 1

words = entry.l_word # list of strs
word = words[0] # str
defi = self.fixDefi(entry.defi, defiFormat)
# defi is str

b_dictBlock = defi.encode("utf-8")
b_dictBlock = self.fixDefi(entry.defi, defiFormat).encode("utf-8")
dictFile.write(b_dictBlock)
blockLen = len(b_dictBlock)

blockData = dictMarkToBytes(dictMark) + uint32ToBytes(blockLen)
blockData = dictMarkToBytes(dictMark) + uint32ToBytes(len(b_dictBlock))
for word in words:
idxBlockList.append((word.encode("utf-8"), blockData))

dictMark += blockLen
dictMark += len(b_dictBlock)

if dictMark > dictMarkMax:
raise Error(
Expand All @@ -445,7 +431,6 @@ def writeGeneralMergeSyns(self) -> Generator[None, EntryType, None]:
sametypesequence option is not used.
"""
log.debug("writeGeneralMergeSyns")
dictMark = 0
idxBlockList = self.newIdxList()
altIndexList = self.newSynList()

Expand All @@ -454,12 +439,11 @@ def writeGeneralMergeSyns(self) -> Generator[None, EntryType, None]:
t0 = now()
wordCount = 0
defiFormatCounter: typing.Counter[str] = Counter()
if not isdir(self._resDir):
os.mkdir(self._resDir)

dictMarkToBytes, dictMarkMax = self.dictMarkToBytesFunc()

entryIndex = -1
dictMark = 0
while True:
entry = yield
if entry is None:
Expand All @@ -477,19 +461,16 @@ def writeGeneralMergeSyns(self) -> Generator[None, EntryType, None]:
defiFormat = "m"

words = entry.l_word # list of strs
word = words[0] # str
defi = self.fixDefi(entry.defi, defiFormat)
# defi is str

b_dictBlock = (defiFormat + defi).encode("utf-8") + b"\x00"
b_defi = self.fixDefi(entry.defi, defiFormat).encode("utf-8")
b_dictBlock = defiFormat.encode("ascii") + b_defi + b"\x00"
dictFile.write(b_dictBlock)
blockLen = len(b_dictBlock)

blockData = dictMarkToBytes(dictMark) + uint32ToBytes(blockLen)
blockData = dictMarkToBytes(dictMark) + uint32ToBytes(len(b_dictBlock))
for word in words:
idxBlockList.append((word.encode("utf-8"), blockData))

dictMark += blockLen
dictMark += len(b_dictBlock)

if dictMark > dictMarkMax:
raise Error(
Expand Down Expand Up @@ -564,9 +545,9 @@ def writeIfoFile(
ifo.append(("synwordcount", str(synWordCount)))

desc = glos.getInfo("description")
_copyright = glos.getInfo("copyright")
if _copyright:
desc = f"{_copyright}\n{desc}"
copyright_ = glos.getInfo("copyright")
if copyright_:
desc = f"{copyright_}\n{desc}"
publisher = glos.getInfo("publisher")
if publisher:
desc = f"Publisher: {publisher}\n{desc}"
Expand Down

0 comments on commit 95d133f

Please sign in to comment.