Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 9 additions & 5 deletions core/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,14 @@
from typing import Any, Dict, List, Literal, Optional

import tomli
from dotenv import load_dotenv
from pydantic import BaseModel
from pydantic_settings import BaseSettings

load_dotenv(override=True)
from utils.env_loader import load_local_env

# Default to loading from .env unless a secret manager (e.g., Infisical) is
# injecting variables.
load_local_env(override=True)


class ParserXMLSettings(BaseModel):
Expand Down Expand Up @@ -130,7 +133,7 @@ class Settings(BaseSettings):

# Mode configuration
MODE: Literal["cloud", "self_hosted"] = "cloud"
USE_LOCAL_ENV: bool = True
SECRET_MANAGER: Literal["env", "infisical"] = "env"

# API configuration
API_DOMAIN: str = "api.morphik.ai"
Expand Down Expand Up @@ -177,7 +180,7 @@ def dev_mode(self) -> bool: # pragma: no cover - compatibility shim
@lru_cache()
def get_settings() -> Settings:
"""Get cached settings instance."""
load_dotenv(override=True)
load_local_env(override=True)

# Load morphik.toml
with open("morphik.toml", "rb") as f:
Expand Down Expand Up @@ -363,13 +366,14 @@ def get_settings() -> Settings:
# Load morphik config
api_domain = config["morphik"].get("api_domain", "api.morphik.ai")
embedding_api_domain = config["morphik"].get("morphik_embedding_api_domain") or api_domain
secret_manager = config["morphik"].get("secret_manager", "env")

settings_dict.update(
{
"ENABLE_COLPALI": config["morphik"]["enable_colpali"],
"COLPALI_MODE": config["morphik"].get("colpali_mode", "local"),
"MODE": config["morphik"].get("mode", "cloud"),
"USE_LOCAL_ENV": config["morphik"].get("use_local_env", True),
"SECRET_MANAGER": secret_manager,
"API_DOMAIN": api_domain,
"MORPHIK_EMBEDDING_API_DOMAIN": embedding_api_domain,
}
Expand Down
13 changes: 13 additions & 0 deletions core/models/request.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,19 @@ class FolderDetailsRequest(BaseModel):
)


class FolderTreeRequest(BaseModel):
    """Request model for retrieving a hierarchical folder tree with documents."""

    # Root of the subtree to return; both fields are optional and default to None.
    folder_path: Optional[str] = Field(
        None,
        description="Base folder path to return. Use '/' or null for the full hierarchy.",
    )
    # Optional projection list applied to the documents of each folder node.
    document_fields: Optional[List[str]] = Field(
        None,
        description="Optional list of fields to include for documents in each folder node (dot notation supported).",
    )


class SearchDocumentsRequest(BaseModel):
"""Request model for searching documents by name"""

Expand Down
15 changes: 15 additions & 0 deletions core/models/responses.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,21 @@ class FolderDetailsResponse(BaseModel):
folders: List[FolderDetails]


class FolderTreeNode(BaseModel):
    """Nested folder tree entry including contained documents."""

    # Folder metadata. Every scalar field is optional and defaults to None, so a
    # node can be built without a backing folder record.
    id: Optional[str] = None
    name: Optional[str] = None
    full_path: Optional[str] = None
    description: Optional[str] = None
    depth: Optional[int] = None
    # Documents stored as plain dicts; each instance gets its own fresh list.
    documents: List[Dict[str, Any]] = Field(default_factory=list)
    # Child nodes, making the model recursive via the quoted forward reference.
    children: List["FolderTreeNode"] = Field(default_factory=list)


# Rebuild the model now that the class exists, so the "FolderTreeNode" forward
# reference used by ``children`` can be resolved.
FolderTreeNode.model_rebuild()


class RequeueIngestionResult(BaseModel):
"""Result information for an individual requeued ingestion job."""

Expand Down
173 changes: 172 additions & 1 deletion core/routes/folders.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,15 @@
from core.database.postgres_database import InvalidMetadataFilterError
from core.models.auth import AuthContext
from core.models.folders import Folder, FolderCreate, FolderSummary
from core.models.request import FolderDetailsRequest
from core.models.request import FolderDetailsRequest, FolderTreeRequest
from core.models.responses import (
DocumentAddToFolderResponse,
DocumentDeleteResponse,
FolderDeleteResponse,
FolderDetails,
FolderDetailsResponse,
FolderDocumentInfo,
FolderTreeNode,
)
from core.routes.utils import project_document_fields
from core.services.telemetry import TelemetryService
Expand Down Expand Up @@ -263,6 +264,176 @@ async def list_folder_summaries(auth: AuthContext = Depends(verify_token)) -> Li
raise HTTPException(status_code=500, detail=str(exc))


@router.post("/tree", response_model=FolderTreeNode)
async def get_folder_tree(
    request: FolderTreeRequest,
    auth: AuthContext = Depends(verify_token),
) -> FolderTreeNode:
    """
    Return a hierarchical folder tree (with documents) rooted at ``folder_path``.

    When ``folder_path`` is null or ``/``, the entire accessible hierarchy is returned.

    Args:
        request: Carries the optional base ``folder_path`` and the optional
            ``document_fields`` projection applied to every document.
        auth: Authentication context resolved by ``verify_token``.

    Returns:
        The root ``FolderTreeNode``; children and documents are sorted at every level.

    Raises:
        HTTPException: 400 when ``folder_path`` cannot be normalized, 404 when the
            requested base folder does not exist, 500 on any unexpected error.
    """

    try:
        folder_path = request.folder_path
        document_fields = request.document_fields
        normalized_path: Optional[str] = None
        if folder_path is not None:
            # The literal string "null" (any case) is treated like a missing path.
            if isinstance(folder_path, str) and folder_path.lower() == "null":
                folder_path = None
            else:
                try:
                    normalized_path = normalize_folder_path(folder_path)
                except ValueError as exc:
                    raise HTTPException(status_code=400, detail=str(exc)) from exc
                # "/" means "everything", which is represented as no scoping at all.
                if normalized_path == "/":
                    normalized_path = None

        base_path = normalized_path or "/"

        base_folder: Optional[Folder] = None
        if normalized_path:
            base_folder = await document_service.db.get_folder_by_full_path(normalized_path, auth)
            if not base_folder:
                raise HTTPException(status_code=404, detail=f"Folder {folder_path} not found")

        def _canonical_folder_path(folder: Folder) -> Optional[str]:
            """Normalize the folder's ``full_path`` (or ``name`` if unset); None if unusable."""
            raw_path = folder.full_path or folder.name
            if not raw_path:
                return None
            try:
                return normalize_folder_path(raw_path)
            except ValueError:
                return None

        def _parent_path(path: str) -> Optional[str]:
            """Return the parent of ``path``: None for the root, "/" for top-level paths."""
            if not path or path == "/":
                return None
            segments = [part for part in path.strip("/").split("/") if part]
            if len(segments) <= 1:
                return "/"
            return "/" + "/".join(segments[:-1])

        def _attach_child(parent: FolderTreeNode, child: FolderTreeNode) -> None:
            """Append ``child`` to ``parent`` unless a child with the same path is present."""
            if not any(existing.full_path == child.full_path for existing in parent.children):
                parent.children.append(child)

        def _make_node(path: str, folder: Optional[Folder]) -> FolderTreeNode:
            """Build a node for ``path``; without a folder record, derive the name from the path."""
            if folder:
                name = folder.name
                depth = folder.depth
            else:
                name = "/" if path == "/" else (path.strip("/").split("/")[-1] or "/")
                # Depth is only known for the synthetic root.
                depth = 0 if path == "/" else None
            return FolderTreeNode(
                id=folder.id if folder else None,
                name=name,
                full_path=path,
                description=folder.description if folder else None,
                depth=depth,
                documents=[],
                children=[],
            )

        all_folders = await document_service.db.list_folders(auth)
        folders_with_paths: List[tuple[str, Folder]] = []
        for folder in all_folders:
            path = _canonical_folder_path(folder)
            if path:
                folders_with_paths.append((path, folder))

        if normalized_path:
            # Keep only the base folder itself and its descendants. The trailing
            # slash on the prefix stops "/a" from matching "/ab".
            prefix = normalized_path.rstrip("/") + "/"
            folders_with_paths = [
                (path, folder)
                for path, folder in folders_with_paths
                if path == normalized_path or path.startswith(prefix)
            ]
            if base_folder:
                # Make sure the base folder is present even if the listing omitted it.
                base_folder_path = _canonical_folder_path(base_folder)
                if base_folder_path and all(path != base_folder_path for path, _ in folders_with_paths):
                    folders_with_paths.append((base_folder_path, base_folder))

        nodes_by_path: Dict[str, FolderTreeNode] = {
            path: _make_node(path, folder) for path, folder in folders_with_paths
        }

        root_node = nodes_by_path.get(base_path)
        if not root_node:
            root_node = _make_node(base_path, base_folder)
            nodes_by_path[base_path] = root_node

        # Link every node to its parent, shallowest paths first. Nodes whose
        # parent is not in the map hang directly off the root.
        for path in sorted(nodes_by_path.keys(), key=lambda p: (p.count("/"), p)):
            if path == base_path:
                continue
            node = nodes_by_path[path]
            parent_path = _parent_path(path)
            parent_node = nodes_by_path.get(parent_path)
            if not parent_node:
                parent_node = root_node
            _attach_child(parent_node, node)

        # Documents are only filtered by prefix when a base folder was requested.
        doc_system_filters = {"folder_path_prefix": base_path} if normalized_path else None
        document_result = await document_service.db.list_documents_flexible(
            auth=auth,
            skip=0,
            limit=None,
            system_filters=doc_system_filters,
            include_total_count=False,
            include_status_counts=False,
            include_folder_counts=False,
            return_documents=True,
            sort_by="filename",
            sort_direction="asc",
        )

        documents = document_result.get("documents", []) or []
        for document in documents:
            # Accept objects exposing model_dump(), objects exposing dict(), or mappings.
            if hasattr(document, "model_dump"):
                doc_dict = document.model_dump(mode="json")
            elif hasattr(document, "dict"):
                doc_dict = document.dict()
            else:
                doc_dict = dict(document)

            doc_path_raw = doc_dict.get("folder_path")
            try:
                doc_path = normalize_folder_path(doc_path_raw) if doc_path_raw is not None else None
            except ValueError:
                # Fall back to the raw value so the document is still placed somewhere.
                doc_path = doc_path_raw

            # Documents without a folder path are listed on the root node.
            target_path = doc_path or base_path
            target_node = nodes_by_path.get(target_path)
            if not target_node:
                # The document references a folder with no node yet: synthesize one.
                target_node = _make_node(target_path, None)
                nodes_by_path[target_path] = target_node
                parent_path = _parent_path(target_path or "/")
                parent_node = nodes_by_path.get(parent_path) or root_node
                _attach_child(parent_node, target_node)

            projected_doc = project_document_fields(doc_dict, document_fields)
            # Always expose folder_path, even when the projection dropped it.
            if doc_path_raw is not None and "folder_path" not in projected_doc:
                projected_doc["folder_path"] = doc_path_raw

            target_node.documents.append(projected_doc)

        def _sort_tree(node: FolderTreeNode) -> None:
            """Recursively sort children by (name, path) and documents by filename/external_id."""
            node.children.sort(key=lambda child: (child.name or "", child.full_path or ""))
            for child in node.children:
                _sort_tree(child)
            node.documents.sort(key=lambda doc: str(doc.get("filename") or doc.get("external_id") or ""))

        _sort_tree(root_node)
        return root_node

    except HTTPException:
        # Deliberate HTTP errors (400/404 above) pass through untouched.
        raise
    except Exception as exc:  # noqa: BLE001
        # logger.exception records the traceback alongside the message.
        logger.exception("Error building folder tree: %s", exc)
        raise HTTPException(status_code=500, detail=str(exc)) from exc


@router.post("/{folder_id_or_name:path}/documents/{document_id}", response_model=DocumentAddToFolderResponse)
@telemetry.track(operation_type="add_document_to_folder", metadata_resolver=telemetry.add_document_to_folder_metadata)
async def add_document_to_folder(
Expand Down
5 changes: 3 additions & 2 deletions core/services/user_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
import jwt
from sqlalchemy import or_, select

from utils.env_loader import load_local_env

from ..config import get_settings
from ..database.user_limits_db import UserLimitsDatabase
from ..models.tiers import AccountTier, get_tier_limits
Expand Down Expand Up @@ -206,9 +208,8 @@ async def record_usage(
import os

import stripe
from dotenv import load_dotenv

load_dotenv(override=True)
load_local_env(override=True)

# Get Stripe API key from environment variable
stripe_api_key = os.environ.get("STRIPE_API_KEY")
Expand Down
5 changes: 3 additions & 2 deletions ee/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,11 @@
from typing import List, Optional

import tomli
from dotenv import load_dotenv
from pydantic import Field
from pydantic_settings import BaseSettings

from utils.env_loader import load_local_env

# Determine the root directory of the EE features (assuming this file is in ee/config.py)
EE_ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
DEFAULT_TOKEN_STORAGE_PATH = os.path.join(EE_ROOT_DIR, "connector_tokens")
Expand Down Expand Up @@ -50,7 +51,7 @@ def get_ee_settings() -> EESettings:
2. ee.toml file.
3. Default values defined in EESettings model.
"""
load_dotenv(override=True)
load_local_env(override=True)

ee_toml_path = os.path.join(EE_ROOT_DIR, "ee.toml")
config_from_toml = {}
Expand Down
2 changes: 1 addition & 1 deletion morphik.docker.toml
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ colpali_pdf_dpi = 150 # DPI for PDF to image conversion in ColPali processing
[morphik]
enable_colpali = true
mode = "self_hosted" # "cloud" or "self_hosted"
use_local_env = true
secret_manager = "env" # "env" (default) or "infisical"
api_domain = "api.morphik.ai" # API domain for cloud URIs
# Only call the embedding API if colpali_mode is "api"
morphik_embedding_api_domain = "http://localhost:6000" # endpoint for multivector embedding service
Expand Down
2 changes: 1 addition & 1 deletion morphik.toml
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,7 @@ colpali_pdf_dpi = 150 # DPI for PDF to image conversion in ColPali processing
[morphik]
enable_colpali = true
mode = "self_hosted" # "cloud" or "self_hosted"
use_local_env = true
secret_manager = "env" # "env" (default) or "infisical"
api_domain = "api.morphik.ai" # API domain for cloud URIs
# Only call the embedding API if colpali_mode is "api"
morphik_embedding_api_domain = "http://localhost:6000" # endpoint for multivector embedding service
Expand Down
6 changes: 3 additions & 3 deletions start_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,10 @@
import requests
import tomli
import uvicorn
from dotenv import load_dotenv

from core.config import get_settings
from core.logging_config import setup_logging
from utils.env_loader import load_local_env

# Global variable to store the worker process
worker_process = None
Expand Down Expand Up @@ -275,8 +275,8 @@ def main():
if not args.skip_redis_check:
check_and_start_redis()

# Load environment variables from .env file
load_dotenv(override=True)
# Load environment variables from .env file if secrets aren't injected
load_local_env(override=True)

# Check if Ollama is required and running
if not args.skip_ollama_check:
Expand Down
Loading
Loading