From 5cefaa93a0b8b736ffca264f46e91d1b33dba32e Mon Sep 17 00:00:00 2001 From: Saul Pwanson Date: Thu, 16 May 2024 14:13:27 -0700 Subject: [PATCH] [path-] fix undercounted progress for multibyte chars #2323 Co-authored-by: @midichef --- visidata/path.py | 8 ++++++-- visidata/threads.py | 40 ++++++++++++++++++++++++++++++---------- 2 files changed, 36 insertions(+), 12 deletions(-) diff --git a/visidata/path.py b/visidata/path.py index a45102a19..1d44a8662 100644 --- a/visidata/path.py +++ b/visidata/path.py @@ -8,7 +8,7 @@ from functools import wraps, lru_cache from visidata import vd -from visidata import VisiData, Progress +from visidata import VisiData, Progress, TextProgress vd.help_encoding = '''Common Encodings: @@ -97,12 +97,16 @@ def peek(self, size=-1): class FileProgress: 'Open file in binary mode and track read() progress.' def __init__(self, path, fp, mode='r', **kwargs): + 'kwargs has all open() kwargs' self.path = path self.fp = fp self.prog = None if 'r' in mode: gerund = 'reading' - self.prog = Progress(gerund=gerund, total=filesize(path)) + if 'b' in mode: + self.prog = Progress(gerund=gerund, total=filesize(path)) + else: + self.prog = TextProgress(gerund=gerund, total=filesize(path), encoding=kwargs.get('encoding')) elif 'w' in mode: gerund = 'writing' self.prog = Progress(gerund=gerund) diff --git a/visidata/threads.py b/visidata/threads.py index 95d2f2dc6..54e9a369f 100644 --- a/visidata/threads.py +++ b/visidata/threads.py @@ -36,7 +36,14 @@ def _execAsync(*args, **kwargs): return _decorator -class _Progress: +class Progress: + '''Maintain progress count as either an iterable wrapper, or a context manager. + + - *iterable*: wrapped iterable if used as an iterator. + - *gerund*: status text shown while this Progress is active. + - *total*: total count expected. + - *sheet*: specific sheet to associate this progress with. Default is sheet from current thread. 
+    '''
     def __init__(self, iterable=None, gerund="", total=None, sheet=None):
         self.iterable = iterable
         if total is None:
@@ -70,16 +77,41 @@ def __iter__(self):
                 yield item
                 self.made += 1
 
-@VisiData.global_api
-def Progress(vd, iterable=None, gerund="", total=None, sheet=None):
-    '''Maintain progress count as either an iterable wrapper, or a context manager.
 
-    - *iterable*: wrapped iterable if used as an iterator.
-    - *gerund*: status text shown while this Progress is active.
-    - *total*: total count expected.
-    - *sheet*: specific sheet to associate this progress with. Default is sheet from current thread.
-    '''
-    return _Progress(iterable=iterable, gerund=gerund, total=total, sheet=sheet)
+class TextProgress(Progress):
+    '''Progress for text-mode reads: scale character counts to estimated bytes.
+
+    addProgress() multiplies its count by an estimated bytes-per-character
+    ratio, which addSample() refines from text that has been read.
+
+    - *encoding*: encoding used to measure sampled text. None falls back to 'utf-8'.
+    - *kwargs*: passed through to Progress.
+    '''
+    def __init__(self, encoding='utf-8', **kwargs):
+        super().__init__(**kwargs)
+        # FileProgress passes kwargs.get('encoding'), which is None when no encoding was given.
+        self.encoding = encoding or 'utf-8'
+        self.est_sample = ''
+        self.est_charbytes = 1
+
+    def addProgress(self, n:int):
+        'Add *n* characters of progress, scaled by est_charbytes. Does nothing once made reaches total.'
+        if self.made < self.total:
+            return super().addProgress(n * self.est_charbytes)
+
+    def addSample(self, s:str):
+        'While the sample is shorter than made/1000, append up to 100 characters of *s* and recompute est_charbytes.'
+        # A short string can cause charbytes to be overestimated by 30%,
+        # due to the Byte Order Marker in encodings like utf-8-sig.
+        # Combining short strings into one big one lowers that error to < 1%.
+        if len(self.est_sample) < self.made/1000:
+            self.est_sample += s[:100]
+            if self.est_sample:  # an empty sample would divide by zero
+                self.est_charbytes = len(self.est_sample.encode(self.encoding)) / len(self.est_sample)
+
+
+vd.Progress = Progress
+vd.TextProgress = TextProgress
 
 
 @VisiData.api
@@ -452,6 +484,7 @@ def codestr(code):
 vd.addGlobals({
     'ThreadsSheet': ThreadsSheet,
     'Progress': Progress,
+    'TextProgress': TextProgress,
     'asynccache': asynccache,
     'asyncsingle': asyncsingle,
     'asyncignore': asyncignore,