yichuan-w
diff --git a/‎.github/workflows/build-reusable.yml‎
Lines changed: 22 additions & 1 deletion b/‎.github/workflows/build-reusable.yml‎
Lines changed: 22 additions & 1 deletion
diff --git a/‎apps/base_rag_example.py‎
Lines changed: 5 additions & 5 deletions b/‎apps/base_rag_example.py‎
Lines changed: 5 additions & 5 deletions
diff --git a/‎apps/browser_rag.py‎
Lines changed: 2 additions & 1 deletion b/‎apps/browser_rag.py‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎apps/chatgpt_rag.py‎
Lines changed: 2 additions & 1 deletion b/‎apps/chatgpt_rag.py‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎apps/claude_rag.py‎
Lines changed: 2 additions & 1 deletion b/‎apps/claude_rag.py‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎apps/code_rag.py‎
Lines changed: 7 additions & 11 deletions b/‎apps/code_rag.py‎
Lines changed: 7 additions & 11 deletions
diff --git a/‎apps/document_rag.py‎
Lines changed: 8 additions & 12 deletions b/‎apps/document_rag.py‎
Lines changed: 8 additions & 12 deletions
diff --git a/‎apps/email_data/email.py‎
Lines changed: 2 additions & 1 deletion b/‎apps/email_data/email.py‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎apps/email_rag.py‎
Lines changed: 2 additions & 1 deletion b/‎apps/email_rag.py‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎apps/history_data/wechat_history.py‎
Lines changed: 4 additions & 2 deletions b/‎apps/history_data/wechat_history.py‎
Lines changed: 4 additions & 2 deletions
@@ -28,9 +28,30 @@ jobs:
         run: |
           uv run --only-group lint pre-commit run --all-files --show-diff-on-failure
 
+  type-check:
+    name: Type Check with ty
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref }}
+          submodules: recursive
+
+      - name: Install uv and Python
+        uses: astral-sh/setup-uv@v6
+        with:
+          python-version: '3.11'
+
+      - name: Install ty
+        run: uv tool install ty
+
+      - name: Run ty type checker
+        run: |
+          # Run ty on core packages, apps, and tests
+          ty check packages/leann-core/src apps tests
 
   build:
-    needs: lint
+    needs: [lint, type-check]
     name: Build ${{ matrix.os }} Python ${{ matrix.python }}
     strategy:
       matrix:
 
@@ -6,7 +6,7 @@
 import argparse
 from abc import ABC, abstractmethod
 from pathlib import Path
-from typing import Any, Union
+from typing import Any
 
 import dotenv
 from leann.api import LeannBuilder, LeannChat
@@ -257,8 +257,8 @@ def _add_specific_arguments(self, parser: argparse.ArgumentParser):
         pass
 
     @abstractmethod
-    async def load_data(self, args) -> list[Union[str, dict[str, Any]]]:
-        """Load data from the source. Returns list of text chunks (strings or dicts with 'text' key)."""
+    async def load_data(self, args) -> list[dict[str, Any]]:
+        """Load data from the source. Returns list of text chunks as dicts with 'text' and 'metadata' keys."""
         pass
 
     def get_llm_config(self, args) -> dict[str, Any]:
@@ -282,8 +282,8 @@ def get_llm_config(self, args) -> dict[str, Any]:
 
         return config
 
-    async def build_index(self, args, texts: list[Union[str, dict[str, Any]]]) -> str:
-        """Build LEANN index from texts (accepts strings or dicts with 'text' key)."""
+    async def build_index(self, args, texts: list[dict[str, Any]]) -> str:
+        """Build LEANN index from text chunks (dicts with 'text' and 'metadata' keys)."""
         index_path = str(Path(args.index_dir) / f"{self.default_index_name}.leann")
 
         print(f"\n[Building Index] Creating {self.name} index...")
 
@@ -6,6 +6,7 @@
 import os
 import sys
 from pathlib import Path
+from typing import Any
 
 # Add parent directory to path for imports
 sys.path.insert(0, str(Path(__file__).parent))
@@ -85,7 +86,7 @@ def _find_chrome_profiles(self) -> list[Path]:
 
         return profiles
 
-    async def load_data(self, args) -> list[str]:
+    async def load_data(self, args) -> list[dict[str, Any]]:
         """Load browser history and convert to text chunks."""
         # Determine Chrome profiles
         if args.chrome_profile and not args.auto_find_profiles:
 
@@ -5,6 +5,7 @@
 
 import sys
 from pathlib import Path
+from typing import Any
 
 # Add parent directory to path for imports
 sys.path.insert(0, str(Path(__file__).parent))
@@ -80,7 +81,7 @@ def _find_chatgpt_exports(self, export_path: Path) -> list[Path]:
 
         return export_files
 
-    async def load_data(self, args) -> list[str]:
+    async def load_data(self, args) -> list[dict[str, Any]]:
         """Load ChatGPT export data and convert to text chunks."""
         export_path = Path(args.export_path)
 
 
@@ -5,6 +5,7 @@
 
 import sys
 from pathlib import Path
+from typing import Any
 
 # Add parent directory to path for imports
 sys.path.insert(0, str(Path(__file__).parent))
@@ -80,7 +81,7 @@ def _find_claude_exports(self, export_path: Path) -> list[Path]:
 
         return export_files
 
-    async def load_data(self, args) -> list[str]:
+    async def load_data(self, args) -> list[dict[str, Any]]:
         """Load Claude export data and convert to text chunks."""
         export_path = Path(args.export_path)
 
 
@@ -6,6 +6,7 @@
 
 import sys
 from pathlib import Path
+from typing import Any
 
 # Add parent directory to path for imports
 sys.path.insert(0, str(Path(__file__).parent))
@@ -77,7 +78,7 @@ def _add_specific_arguments(self, parser):
             help="Try to preserve import statements in chunks (default: True)",
         )
 
-    async def load_data(self, args) -> list[str]:
+    async def load_data(self, args) -> list[dict[str, Any]]:
         """Load code files and convert to AST-aware chunks."""
         print(f"🔍 Scanning code repository: {args.repo_dir}")
         print(f"📁 Including extensions: {args.include_extensions}")
@@ -88,14 +89,6 @@ async def load_data(self, args) -> list[str]:
         if not repo_path.exists():
             raise ValueError(f"Repository directory not found: {args.repo_dir}")
 
-        # Load code files with filtering
-        reader_kwargs = {
-            "recursive": True,
-            "encoding": "utf-8",
-            "required_exts": args.include_extensions,
-            "exclude_hidden": True,
-        }
-
         # Create exclusion filter
         def file_filter(file_path: str) -> bool:
             """Filter out unwanted files and directories."""
@@ -120,8 +113,11 @@ def file_filter(file_path: str) -> bool:
             # Load documents with file filtering
             documents = SimpleDirectoryReader(
                 args.repo_dir,
-                file_extractor=None,  # Use default extractors
-                **reader_kwargs,
+                file_extractor=None,
+                recursive=True,
+                encoding="utf-8",
+                required_exts=args.include_extensions,
+                exclude_hidden=True,
             ).load_data(show_progress=True)
 
             # Apply custom filtering
 
@@ -5,7 +5,7 @@
 
 import sys
 from pathlib import Path
-from typing import Any, Union
+from typing import Any
 
 # Add parent directory to path for imports
 sys.path.insert(0, str(Path(__file__).parent))
@@ -52,7 +52,7 @@ def _add_specific_arguments(self, parser):
             help="Enable AST-aware chunking for code files in the data directory",
         )
 
-    async def load_data(self, args) -> list[Union[str, dict[str, Any]]]:
+    async def load_data(self, args) -> list[dict[str, Any]]:
         """Load documents and convert to text chunks."""
         print(f"Loading documents from: {args.data_dir}")
         if args.file_types:
@@ -66,16 +66,12 @@ async def load_data(self, args) -> list[Union[str, dict[str, Any]]]:
             raise ValueError(f"Data directory not found: {args.data_dir}")
 
         # Load documents
-        reader_kwargs = {
-            "recursive": True,
-            "encoding": "utf-8",
-        }
-        if args.file_types:
-            reader_kwargs["required_exts"] = args.file_types
-
-        documents = SimpleDirectoryReader(args.data_dir, **reader_kwargs).load_data(
-            show_progress=True
-        )
+        documents = SimpleDirectoryReader(
+            args.data_dir,
+            recursive=True,
+            encoding="utf-8",
+            required_exts=args.file_types if args.file_types else None,
+        ).load_data(show_progress=True)
 
         if not documents:
             print(f"No documents found in {args.data_dir} with extensions {args.file_types}")
 
@@ -127,11 +127,12 @@ class EmlxMboxReader(MboxReader):
 
     def load_data(
         self,
-        directory: Path,
+        file: Path,  # Note: for EmlxMboxReader, this is actually a directory
         extra_info: dict | None = None,
         fs: AbstractFileSystem | None = None,
     ) -> list[Document]:
         """Parse .emlx files from directory into strings using MboxReader logic."""
+        directory = file  # Local alias: the `file` parameter actually holds a directory of .emlx files
         import os
         import tempfile
 
 
@@ -5,6 +5,7 @@
 
 import sys
 from pathlib import Path
+from typing import Any
 
 # Add parent directory to path for imports
 sys.path.insert(0, str(Path(__file__).parent))
@@ -64,7 +65,7 @@ def _find_mail_directories(self) -> list[Path]:
 
         return messages_dirs
 
-    async def load_data(self, args) -> list[str]:
+    async def load_data(self, args) -> list[dict[str, Any]]:
         """Load emails and convert to text chunks."""
         # Determine mail directories
         if args.mail_path:
 
@@ -86,7 +86,7 @@ def check_api_available(self) -> bool:
                 text=True,
                 timeout=5,
             )
-            return result.returncode == 0 and result.stdout.strip()
+            return result.returncode == 0 and bool(result.stdout.strip())
         except Exception:
             return False
 
@@ -314,7 +314,9 @@ def _concatenate_messages(
 
         return concatenated_groups
 
-    def _create_concatenated_content(self, message_group: dict, contact_name: str) -> str:
+    def _create_concatenated_content(
+        self, message_group: dict, contact_name: str
+    ) -> tuple[str, str]:
         """
         Create concatenated content from a group of messages.