
Commit 41c7eb2 (parent: 46320ea)

Add srcheaders module and refactor tokenization

Introduce a new module for handling Python source headers and refactor the tokenization process to use an iterator for improved efficiency. Remove the obsolete pyheader module.

6 files changed (+121, -82 lines)

omdev/.manifests.json

Lines changed: 12 additions & 0 deletions

```diff
@@ -203,6 +203,18 @@
       }
     }
   },
+  {
+    "module": ".py.srcheaders",
+    "attr": "_CLI_MODULE",
+    "file": "omdev/py/srcheaders.py",
+    "line": 64,
+    "value": {
+      "$.cli.types.CliModule": {
+        "cmd_name": "py/srcheaders",
+        "mod_name": "omdev.py.srcheaders"
+      }
+    }
+  },
   {
     "module": ".py.tools.importscan",
     "attr": "_CLI_MODULE",
```

omdev/py/attrdocs.py

Lines changed: 2 additions & 0 deletions

```diff
@@ -172,6 +172,8 @@ def _main() -> None:
         with open(args.file) as f:
             src = f.read()
 
+        #
+
         attr_docs = extract_attr_docs(src)
 
         #
```

omdev/py/srcheaders.py

Lines changed: 97 additions & 0 deletions (new file)

```python
import ast
import dataclasses as dc
import typing as ta

from omdev.tokens import all as tks
from omlish.lite.check import check


##


@dc.dataclass(frozen=True)
class PyHeaderLine:
    kind: ta.Literal['comment', 'string']
    src: str
    content: str
    line: int
    col: int


def get_py_header_lines(src: str) -> list[PyHeaderLine]:
    ret: list[PyHeaderLine] = []

    src_toks = tks.iter_src_to_tokens(src)
    hdr_toks = (
        tok
        for tok in src_toks
        if tok.name == 'COMMENT' or
        (tok.name not in tks.WS_NAMES and tok.name != 'NL')
    )

    for tok in hdr_toks:
        if tok.name == 'COMMENT':
            cs = tok.src.lstrip()
            check.state(cs.startswith('#'))
            cs = cs[1:].lstrip()
            ret.append(PyHeaderLine(
                kind='comment',
                src=tok.src,
                content=cs,
                line=check.isinstance(tok.line, int),
                col=check.isinstance(tok.utf8_byte_offset, int),
            ))

        elif tok.name == 'STRING':
            ss = ast.literal_eval(tok.src)
            ret.append(PyHeaderLine(
                kind='string',
                src=tok.src,
                content=ss,
                line=check.isinstance(tok.line, int),
                col=check.isinstance(tok.utf8_byte_offset, int),
            ))

        else:
            break

    return ret


##


# @omlish-manifest
_CLI_MODULE = {'$omdev.cli.types.CliModule': {
    'cmd_name': 'py/srcheaders',
    'mod_name': __name__,
}}


if __name__ == '__main__':
    def _main() -> None:
        import argparse

        parser = argparse.ArgumentParser()
        parser.add_argument('file')
        args = parser.parse_args()

        #

        with open(args.file) as f:
            src = f.read()

        #

        hls = get_py_header_lines(src)

        #

        import json

        print(json.dumps(
            [dc.asdict(hl) for hl in hls],
            indent=2,
        ))

    _main()
```
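For a sense of what the new module returns, here is a usage sketch; the sample source and the printed lines are illustrative, inferred from the `PyHeaderLine` dataclass above rather than taken from the commit:

```python
# Illustrative only: sample input and approximate output, not part of the commit.
from omdev.py.srcheaders import get_py_header_lines

src = '''\
#!/usr/bin/env python3
# @omlish-lite
"""Module docstring."""
import os
'''

for hl in get_py_header_lines(src):
    print(hl.kind, hl.line, repr(hl.content))

# Expected to print something like:
#   comment 1 '!/usr/bin/env python3'
#   comment 2 '@omlish-lite'
#   string 3 'Module docstring.'
```

Note that scanning stops at the first token that is neither a comment, a string, nor whitespace, so `import os` and everything after it is never visited.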

omdev/tokens/all.py

Lines changed: 1 addition & 0 deletions

```diff
@@ -30,6 +30,7 @@
 NON_CODING_TOKENS = TokenNames.NON_CODING_TOKENS  # noqa
 
 curly_escape = Tokenization.curly_escape  # noqa
+iter_src_to_tokens = Tokenization.iter_src_to_tokens  # noqa
 src_to_tokens = Tokenization.src_to_tokens  # noqa
 parse_string_literal = Tokenization.parse_string_literal  # noqa
 tokens_to_src = Tokenization.tokens_to_src  # noqa
```

omdev/tokens/tokenizert.py

Lines changed: 9 additions & 8 deletions

```diff
@@ -82,13 +82,12 @@ def _re_partition(cls, regex: ta.Pattern[str], s: str) -> ta.Tuple[str, str, str]:
             return (s, '', '')
 
     @classmethod
-    def src_to_tokens(cls, src: str) -> ta.List[Token]:
+    def iter_src_to_tokens(cls, src: str) -> ta.Iterator[Token]:
         tokenize_target = io.StringIO(src)
         lines = ('', *tokenize_target)
 
         tokenize_target.seek(0)
 
-        tokens = []
         last_line = 1
         last_col = 0
         end_offset = 0
@@ -106,20 +105,20 @@ def src_to_tokens(cls, src: str) -> ta.List[Token]:
                 while cls._ESCAPED_NL_RE.search(newtok):
                     ws, nl, newtok = cls._re_partition(cls._ESCAPED_NL_RE, newtok)
                     if ws:
-                        tokens.append(Token(TokenNames.UNIMPORTANT_WS, ws, last_line, end_offset))
+                        yield Token(TokenNames.UNIMPORTANT_WS, ws, last_line, end_offset)
                         end_offset += len(ws.encode())
-                    tokens.append(Token(TokenNames.ESCAPED_NL, nl, last_line, end_offset))
+                    yield Token(TokenNames.ESCAPED_NL, nl, last_line, end_offset)
                     end_offset = 0
                     last_line += 1
                 if newtok:
-                    tokens.append(Token(TokenNames.UNIMPORTANT_WS, newtok, sline, 0))
+                    yield Token(TokenNames.UNIMPORTANT_WS, newtok, sline, 0)
                     end_offset = len(newtok.encode())
                 else:
                     end_offset = 0
 
             elif scol > last_col:
                 newtok = line[last_col:scol]
-                tokens.append(Token(TokenNames.UNIMPORTANT_WS, newtok, sline, end_offset))
+                yield Token(TokenNames.UNIMPORTANT_WS, newtok, sline, end_offset)
                 end_offset += len(newtok.encode())
 
             tok_name = tokenize.tok_name[tok_type]
@@ -130,14 +129,16 @@ def src_to_tokens(cls, src: str) -> ta.List[Token]:
                     ecol += len(new_tok_text) - len(tok_text)
                     tok_text = new_tok_text
 
-            tokens.append(Token(tok_name, tok_text, sline, end_offset))
+            yield Token(tok_name, tok_text, sline, end_offset)
             last_line, last_col = eline, ecol
             if sline != eline:
                 end_offset = len(lines[last_line][:last_col].encode())
            else:
                 end_offset += len(tok_text.encode())
 
-        return tokens
+    @classmethod
+    def src_to_tokens(cls, src: str) -> ta.List[Token]:
+        return list(cls.iter_src_to_tokens(src))
 
     @classmethod
     def parse_string_literal(cls, src: str) -> ta.Tuple[str, str]:
```
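The point of the refactor shows up in callers like `get_py_header_lines`, which only ever needs the first few tokens of a file: a generator lets them stop tokenizing as soon as the header ends, while the old `src_to_tokens` is preserved as a thin `list(...)` wrapper. A minimal sketch of the difference (the input path is hypothetical):

```python
import itertools

from omdev.tokens import all as tks

with open('some_module.py') as f:  # hypothetical input file
    src = f.read()

# Before: the whole file is tokenized into a list up front.
head = tks.src_to_tokens(src)[:10]

# After: tokens are produced lazily, so consuming only the first ten
# stops the tokenizer early instead of walking the entire source.
head_lazy = list(itertools.islice(tks.iter_src_to_tokens(src), 10))
```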

x/pyheader.py

Lines changed: 0 additions & 74 deletions
This file was deleted.
