Skip to content

Commit 198044d

Browse files
authored
Add ty type checker to CI and fix type errors (fixes bug from PR #157) (#192)
* Add ty type checker to CI and fix type errors
  - Add ty (Astral's fast Python type checker) to GitHub CI workflow
  - Fix type annotations across all RAG apps:
    - Update load_data return types from list[str] to list[dict[str, Any]]
    - Fix base_rag_example.py to properly handle dict format from create_text_chunks
  - Fix type errors in leann-core:
    - chunking_utils.py: Add explicit type annotations
    - cli.py: Fix return type annotations for PDF extraction functions
    - interactive_utils.py: Fix readline import type handling
  - Fix type errors in apps:
    - wechat_history.py: Fix return type annotations
    - document_rag.py, code_rag.py: Replace **kwargs with explicit arguments
  - Add ty configuration to pyproject.toml

  This resolves the bug introduced in PR #157 where create_text_chunks() changed to return list[dict] but callers were not updated.
* Fix remaining ty type errors
  - Fix slack_mcp_reader.py channel parameter can be None
  - Fix embedding_compute.py ContextProp type issue
  - Fix searcher_base.py method override signatures
  - Fix chunking_utils.py chunk_text assignment
  - Fix slack_rag.py and twitter_rag.py return types
  - Fix email.py and image_rag.py method overrides
* Fix multimodal benchmark scripts type errors
  - Fix undefined LeannRetriever -> LeannMultiVector
  - Add proper type casts for HuggingFace Dataset iteration
  - Cast task config values to correct types
  - Add type annotations for dataset row dicts
* Enable ty check for multimodal scripts in CI

  All type errors in multimodal scripts have been fixed, so we can now include them in the CI type checking.
* Fix all test type errors and enable ty check on tests
  - Fix test_basic.py: search() takes str not list
  - Fix test_cli_prompt_template.py: add type: ignore for Mock assignments
  - Fix test_prompt_template_persistence.py: match BaseSearcher.search signature
  - Fix test_prompt_template_e2e.py: add type narrowing asserts after skip
  - Fix test_readme_examples.py: use explicit kwargs instead of **model_args
  - Fix metadata_filter.py: allow Optional[MetadataFilters]
  - Update CI to run ty check on tests
* Format code with ruff
* Format searcher_base.py
1 parent a2e5f52 commit 198044d

32 files changed

+261
-144
lines changed

.github/workflows/build-reusable.yml

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,9 +28,30 @@ jobs:
2828
run: |
2929
uv run --only-group lint pre-commit run --all-files --show-diff-on-failure
3030
31+
type-check:
32+
name: Type Check with ty
33+
runs-on: ubuntu-latest
34+
steps:
35+
- uses: actions/checkout@v4
36+
with:
37+
ref: ${{ inputs.ref }}
38+
submodules: recursive
39+
40+
- name: Install uv and Python
41+
uses: astral-sh/setup-uv@v6
42+
with:
43+
python-version: '3.11'
44+
45+
- name: Install ty
46+
run: uv tool install ty
47+
48+
- name: Run ty type checker
49+
run: |
50+
# Run ty on core packages, apps, and tests
51+
ty check packages/leann-core/src apps tests
3152
3253
build:
33-
needs: lint
54+
needs: [lint, type-check]
3455
name: Build ${{ matrix.os }} Python ${{ matrix.python }}
3556
strategy:
3657
matrix:

apps/base_rag_example.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
import argparse
77
from abc import ABC, abstractmethod
88
from pathlib import Path
9-
from typing import Any, Union
9+
from typing import Any
1010

1111
import dotenv
1212
from leann.api import LeannBuilder, LeannChat
@@ -257,8 +257,8 @@ def _add_specific_arguments(self, parser: argparse.ArgumentParser):
257257
pass
258258

259259
@abstractmethod
260-
async def load_data(self, args) -> list[Union[str, dict[str, Any]]]:
261-
"""Load data from the source. Returns list of text chunks (strings or dicts with 'text' key)."""
260+
async def load_data(self, args) -> list[dict[str, Any]]:
261+
"""Load data from the source. Returns list of text chunks as dicts with 'text' and 'metadata' keys."""
262262
pass
263263

264264
def get_llm_config(self, args) -> dict[str, Any]:
@@ -282,8 +282,8 @@ def get_llm_config(self, args) -> dict[str, Any]:
282282

283283
return config
284284

285-
async def build_index(self, args, texts: list[Union[str, dict[str, Any]]]) -> str:
286-
"""Build LEANN index from texts (accepts strings or dicts with 'text' key)."""
285+
async def build_index(self, args, texts: list[dict[str, Any]]) -> str:
286+
"""Build LEANN index from text chunks (dicts with 'text' and 'metadata' keys)."""
287287
index_path = str(Path(args.index_dir) / f"{self.default_index_name}.leann")
288288

289289
print(f"\n[Building Index] Creating {self.name} index...")

apps/browser_rag.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import os
77
import sys
88
from pathlib import Path
9+
from typing import Any
910

1011
# Add parent directory to path for imports
1112
sys.path.insert(0, str(Path(__file__).parent))
@@ -85,7 +86,7 @@ def _find_chrome_profiles(self) -> list[Path]:
8586

8687
return profiles
8788

88-
async def load_data(self, args) -> list[str]:
89+
async def load_data(self, args) -> list[dict[str, Any]]:
8990
"""Load browser history and convert to text chunks."""
9091
# Determine Chrome profiles
9192
if args.chrome_profile and not args.auto_find_profiles:

apps/chatgpt_rag.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
import sys
77
from pathlib import Path
8+
from typing import Any
89

910
# Add parent directory to path for imports
1011
sys.path.insert(0, str(Path(__file__).parent))
@@ -80,7 +81,7 @@ def _find_chatgpt_exports(self, export_path: Path) -> list[Path]:
8081

8182
return export_files
8283

83-
async def load_data(self, args) -> list[str]:
84+
async def load_data(self, args) -> list[dict[str, Any]]:
8485
"""Load ChatGPT export data and convert to text chunks."""
8586
export_path = Path(args.export_path)
8687

apps/claude_rag.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
import sys
77
from pathlib import Path
8+
from typing import Any
89

910
# Add parent directory to path for imports
1011
sys.path.insert(0, str(Path(__file__).parent))
@@ -80,7 +81,7 @@ def _find_claude_exports(self, export_path: Path) -> list[Path]:
8081

8182
return export_files
8283

83-
async def load_data(self, args) -> list[str]:
84+
async def load_data(self, args) -> list[dict[str, Any]]:
8485
"""Load Claude export data and convert to text chunks."""
8586
export_path = Path(args.export_path)
8687

apps/code_rag.py

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
import sys
88
from pathlib import Path
9+
from typing import Any
910

1011
# Add parent directory to path for imports
1112
sys.path.insert(0, str(Path(__file__).parent))
@@ -77,7 +78,7 @@ def _add_specific_arguments(self, parser):
7778
help="Try to preserve import statements in chunks (default: True)",
7879
)
7980

80-
async def load_data(self, args) -> list[str]:
81+
async def load_data(self, args) -> list[dict[str, Any]]:
8182
"""Load code files and convert to AST-aware chunks."""
8283
print(f"🔍 Scanning code repository: {args.repo_dir}")
8384
print(f"📁 Including extensions: {args.include_extensions}")
@@ -88,14 +89,6 @@ async def load_data(self, args) -> list[str]:
8889
if not repo_path.exists():
8990
raise ValueError(f"Repository directory not found: {args.repo_dir}")
9091

91-
# Load code files with filtering
92-
reader_kwargs = {
93-
"recursive": True,
94-
"encoding": "utf-8",
95-
"required_exts": args.include_extensions,
96-
"exclude_hidden": True,
97-
}
98-
9992
# Create exclusion filter
10093
def file_filter(file_path: str) -> bool:
10194
"""Filter out unwanted files and directories."""
@@ -120,8 +113,11 @@ def file_filter(file_path: str) -> bool:
120113
# Load documents with file filtering
121114
documents = SimpleDirectoryReader(
122115
args.repo_dir,
123-
file_extractor=None, # Use default extractors
124-
**reader_kwargs,
116+
file_extractor=None,
117+
recursive=True,
118+
encoding="utf-8",
119+
required_exts=args.include_extensions,
120+
exclude_hidden=True,
125121
).load_data(show_progress=True)
126122

127123
# Apply custom filtering

apps/document_rag.py

Lines changed: 8 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
import sys
77
from pathlib import Path
8-
from typing import Any, Union
8+
from typing import Any
99

1010
# Add parent directory to path for imports
1111
sys.path.insert(0, str(Path(__file__).parent))
@@ -52,7 +52,7 @@ def _add_specific_arguments(self, parser):
5252
help="Enable AST-aware chunking for code files in the data directory",
5353
)
5454

55-
async def load_data(self, args) -> list[Union[str, dict[str, Any]]]:
55+
async def load_data(self, args) -> list[dict[str, Any]]:
5656
"""Load documents and convert to text chunks."""
5757
print(f"Loading documents from: {args.data_dir}")
5858
if args.file_types:
@@ -66,16 +66,12 @@ async def load_data(self, args) -> list[Union[str, dict[str, Any]]]:
6666
raise ValueError(f"Data directory not found: {args.data_dir}")
6767

6868
# Load documents
69-
reader_kwargs = {
70-
"recursive": True,
71-
"encoding": "utf-8",
72-
}
73-
if args.file_types:
74-
reader_kwargs["required_exts"] = args.file_types
75-
76-
documents = SimpleDirectoryReader(args.data_dir, **reader_kwargs).load_data(
77-
show_progress=True
78-
)
69+
documents = SimpleDirectoryReader(
70+
args.data_dir,
71+
recursive=True,
72+
encoding="utf-8",
73+
required_exts=args.file_types if args.file_types else None,
74+
).load_data(show_progress=True)
7975

8076
if not documents:
8177
print(f"No documents found in {args.data_dir} with extensions {args.file_types}")

apps/email_data/email.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -127,11 +127,12 @@ class EmlxMboxReader(MboxReader):
127127

128128
def load_data(
129129
self,
130-
directory: Path,
130+
file: Path, # Note: for EmlxMboxReader, this is actually a directory
131131
extra_info: dict | None = None,
132132
fs: AbstractFileSystem | None = None,
133133
) -> list[Document]:
134134
"""Parse .emlx files from directory into strings using MboxReader logic."""
135+
directory = file # Rename for clarity - this is a directory of .emlx files
135136
import os
136137
import tempfile
137138

apps/email_rag.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
import sys
77
from pathlib import Path
8+
from typing import Any
89

910
# Add parent directory to path for imports
1011
sys.path.insert(0, str(Path(__file__).parent))
@@ -64,7 +65,7 @@ def _find_mail_directories(self) -> list[Path]:
6465

6566
return messages_dirs
6667

67-
async def load_data(self, args) -> list[str]:
68+
async def load_data(self, args) -> list[dict[str, Any]]:
6869
"""Load emails and convert to text chunks."""
6970
# Determine mail directories
7071
if args.mail_path:

apps/history_data/wechat_history.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ def check_api_available(self) -> bool:
8686
text=True,
8787
timeout=5,
8888
)
89-
return result.returncode == 0 and result.stdout.strip()
89+
return result.returncode == 0 and bool(result.stdout.strip())
9090
except Exception:
9191
return False
9292

@@ -314,7 +314,9 @@ def _concatenate_messages(
314314

315315
return concatenated_groups
316316

317-
def _create_concatenated_content(self, message_group: dict, contact_name: str) -> str:
317+
def _create_concatenated_content(
318+
self, message_group: dict, contact_name: str
319+
) -> tuple[str, str]:
318320
"""
319321
Create concatenated content from a group of messages.
320322

0 commit comments

Comments
 (0)