add traditional phase pick method

Chuan1937 · Chuan1937 · commit 3b2cca5aa907 · 2025-12-27T16:35:51.000+08:00
diff --git a/README.md b/README.md
@@ -1,14 +1,15 @@
 # QuakeCore AI Agent
 
-这是一个基于 AI 的地震数据处理智能体框架。它允许用户上传 SEGY 文件，并通过自然语言与 AI 对话来分析文件结构和内容。
+这是一个基于 AI 的地震数据处理智能体框架。它允许用户上传 MiniSEED、SAC、SEG-Y、HDF5 等多种地震数据格式，并通过自然语言与 AI 对话来分析文件结构、获取统计信息或执行相位拾取。
 
 ## 功能特点
 
+*   **多格式支持**: 一次性接入 SEGY、MiniSEED、SAC、HDF5、NumPy 数组等主流格式，自动识别采样率与起始时间。
+*   **智能拾取**: 内置 STA/LTA、AIC、频率比、AR 模型等多种传统拾取算法，统一归一化评分并输出摘要。
+*   **HDF5 自适应读取**: 自动遍历数据集并解码自定义键名/起始时间字段，减少手动配置。
 *   **Web 界面**: 类似 GPT 的聊天界面，基于 Streamlit 构建。
-*   **SEGY 支持**: 支持上传和解析标准 SEGY 地震数据文件。
-*   **本地 AI**: 集成 LangChain 和 Ollama，支持在本地运行 LLM (如 Llama 3) 进行推理，保护数据隐私。
-*   **双模推理**: 也可切换至 DeepSeek API（OpenAI SDK 兼容），满足云端推理需求。
-*   **智能工具**: AI 可以自动调用工具读取 SEGY 头信息（文本头、二进制头）和道集数据。
+*   **本地/云端 AI**: 集成 LangChain + Ollama（本地）以及 DeepSeek API（云端），按需切换推理路径。
+*   **智能工具**: AI 可自动调用读取头信息、数据导出、相位拾取等工具，完成常见地震处理任务。
 
 ## 快速开始
 
@@ -57,25 +58,38 @@ streamlit run app.py
 1.  在左侧边栏选择推理方式：
     *   **本地 Ollama**：填入模型名称（默认 `qwen2.5:3b`），并确保服务正在运行。
     *   **DeepSeek API**：填入模型名称、Base URL 和 API Key（也可通过 `DEEPSEEK_API_KEY` 环境变量注入）。
-2.  在“数据源”区域上传 `.segy`/`.sgy` 文件，或直接指向仓库中的示例文件 `data/viking_small.segy`。
+2.  在“数据源”区域上传 `.segy`/`.sgy`/`.mseed`/`.sac`/`.h5` 文件，或直接使用仓库中的示例文件（如 `data/example.mseed`、`data/example.h5`、`data/viking_small.segy`）。
 3.  在聊天框中输入指令，例如：
     *   "读取segy文件，给我说明其内部的结构"
     *   "显示这个文件的文本头信息"
     *   "这个文件的采样率是多少？"
     *   "读取第0道的统计数据"
+    *   "对当前加载的波形做初至拾取"
     *   "将第100到200道导出为Excel文件，保存在data/convert/目录下"
 
+## 相位拾取工具
+
+在聊天对话中触发 `run_phase_picking` 工具后，Agent 会自动判断数据类型并运行多种传统拾取算法。可选参数：
+
+*   `source_type`: 数据来源（`mseed`、`sac`、`segy`、`hdf5`、`npy` 等），缺省时按文件扩展名或当前上下文推断。
+*   `dataset`: 针对 HDF5 指定数据集名称；留空时会自动遍历并选取首个可用数据集。
+*   `sampling_rate`: 当文件缺失采样率元数据时手动提供（单位 Hz）。
+
+结果会列出每条 Trace 的多种拾取方法（STA/LTA、AIC、频率比、AR 模型、特征阈值、自相关等），并附带统一归一化分数与综合摘要，便于快速确认最佳拾取时间。
+
 ## 项目结构
 
 *   `app.py`: Streamlit 前端主程序。
 *   `agent/`: AI 智能体相关代码。
-    *   `core.py`: Agent 初始化和配置。
-    *   `tools.py`: 定义 AI 可调用的 SEGY 处理工具。
+    *   `core.py`: Agent 初始化和配置，注册所有工具。
+    *   `tools.py`: 定义文件读取、转换、相位拾取等 LangChain 工具。
 *   `utils/`: 底层工具库。
-    *   `segy_handler.py`: 基于 `segyio` 的文件读取封装。
+    *   `segy_handler.py` / `miniseed_handler.py`: 针对不同格式的读取与转换封装。
+    *   `phase_picker.py`: 统一的波形预处理与多算法拾取实现。
+    *   `hdf5_handler.py`: 自适应数据集解析和导出逻辑。
 
 ## 扩展
 
 *   **模型支持**: 已内置 Ollama 与 DeepSeek，如需扩展更多 OpenAI 兼容接口，可在 `agent/core.py` 的 `_build_llm` 中新增 provider。
-*   **功能增强**: 在 `utils/segy_handler.py` 和 `agent/tools.py` 中添加更多地震处理算法（如频谱分析、增益控制等）。
+*   **功能增强**: 在 `utils/segy_handler.py`、`utils/phase_picker.py` 与 `agent/tools.py` 中扩展更多地震处理算法（如频谱分析、自动剪切、机器学习拾取等）。
 
diff --git a/agent/core.py b/agent/core.py
@@ -27,6 +27,7 @@
     convert_miniseed_to_numpy,
     convert_miniseed_to_hdf5,
     convert_miniseed_to_sac,
+    run_phase_picking,
 )
 
 Provider = Literal["deepseek", "ollama"]
@@ -98,6 +99,7 @@ def get_agent_executor(
         convert_miniseed_to_numpy,
         convert_miniseed_to_hdf5,
         convert_miniseed_to_sac,
+        run_phase_picking,
     ]
 
     template = '''Answer the following questions as best you can. You have access to the following tools:
diff --git a/agent/tools.py b/agent/tools.py
@@ -1,7 +1,10 @@
+from dataclasses import asdict
+
 from langchain.tools import tool
 from utils.segy_handler import SegyHandler
 from utils.miniseed_handler import MiniSEEDHandler
 from utils.hdf5_handler import HDF5Handler
+from utils.phase_picker import pick_phases, summarize_pick_results
 import json
 from typing import Union
 import numpy as np
@@ -71,6 +74,121 @@ def _coerce_int(value, *, allow_none=False, default=None, field_name="value"):
     raise ValueError(f"{field_name} must be an integer, got {value!r}")
 
 
+def _coerce_float(value, *, allow_none=False, default=None, field_name="value"):
+    """Convert ``value`` to a ``float``.
+
+    Args:
+        value: A Python/NumPy number, a numeric string, or ``None``.
+        allow_none: When true, ``None`` and the strings "none"/"null"/""
+            (case-insensitive, surrounding whitespace ignored) yield
+            ``default`` unchanged.
+        default: Fallback used when ``value`` is missing. With
+            ``allow_none=False`` a non-``None`` default is passed through
+            ``float()``.
+        field_name: Name used in error messages.
+
+    Returns:
+        The converted float, or ``default`` as described above.
+
+    Raises:
+        ValueError: If ``value`` is missing with no fallback, has an
+            unsupported type, or is a string that is not a valid number.
+    """
+    if value is None:
+        if allow_none:
+            return default
+        if default is not None:
+            return float(default)
+        raise ValueError(f"{field_name} must be provided")
+    if isinstance(value, (int, float, np.integer, np.floating)):
+        return float(value)
+    if isinstance(value, str):
+        lowered = value.strip()
+        if allow_none and lowered.lower() in {"none", "null", ""}:
+            return default
+        try:
+            return float(lowered)
+        except ValueError:
+            # Re-raise with the field name: callers surface this message
+            # directly, and the bare float() error does not say which
+            # argument was malformed.
+            raise ValueError(
+                f"{field_name} must be a float, got {value!r}"
+            ) from None
+    raise ValueError(f"{field_name} must be a float, got {value!r}")
+
+
+def _normalize_method_list(raw_value):
+    """Normalize a user-supplied list of method names.
+
+    Accepted inputs:
+        * ``None`` or a blank string -> ``None``.
+        * A JSON array string, e.g. ``'["sta_lta", "aic"]'``.
+        * A comma-separated string, optionally wrapped in brackets and/or
+          with quoted items, e.g. ``"sta_lta, aic"`` or ``"['sta_lta', 'aic']"``.
+        * A list, tuple or set (set iteration order is not defined).
+
+    Returns:
+        A list of non-empty, whitespace-stripped strings, or ``None`` when
+        nothing usable was supplied (including unsupported input types).
+    """
+    if raw_value is None:
+        return None
+    if isinstance(raw_value, str):
+        candidate = raw_value.strip()
+        if not candidate:
+            return None
+        if candidate.startswith("["):
+            try:
+                data = json.loads(candidate)
+            except json.JSONDecodeError:
+                # Bracketed but not valid JSON (bare or single-quoted items):
+                # drop the brackets so they do not stick to the first and
+                # last names in the comma split below.
+                candidate = candidate.strip("[]")
+            else:
+                if isinstance(data, list):
+                    return [str(item).strip() for item in data if str(item).strip()]
+        # Strip stray quotes left over from list-like text that was not JSON.
+        parts = (part.strip().strip("'\"").strip() for part in candidate.split(","))
+        return [part for part in parts if part]
+    if isinstance(raw_value, (list, tuple, set)):
+        return [str(item).strip() for item in raw_value if str(item).strip()]
+    return None
+
+
+def _parse_method_params(raw_value):
+    """Return per-method parameters as a dict, or ``None`` if unusable.
+
+    A dict is returned as-is. A string is stripped and decoded as JSON; the
+    result is returned only when it is a JSON object. Everything else
+    (``None``, blank strings, invalid JSON, non-object JSON, other types)
+    yields ``None`` rather than an error.
+    """
+    if isinstance(raw_value, dict):
+        return raw_value
+    if not isinstance(raw_value, str):
+        return None
+
+    text = raw_value.strip()
+    if not text:
+        return None
+
+    try:
+        decoded = json.loads(text)
+    except json.JSONDecodeError:
+        return None
+    return decoded if isinstance(decoded, dict) else None
+
+
+def _normalize_source_type(value: str | None):
+    """Map a user-supplied format name onto its canonical identifier.
+
+    Matching is case-insensitive and ignores surrounding whitespace and a
+    leading dot, so extension-style values such as ``".h5"`` are accepted.
+    Known aliases collapse to ``mseed``, ``segy``, ``hdf5``, ``npy``, ``npz``
+    or ``sac``; an unrecognised name is returned in its cleaned-up form.
+
+    Returns:
+        The canonical type string, or ``None`` when ``value`` is empty.
+    """
+    if not value:
+        return None
+    # str() guards against non-string values (e.g. from loosely typed tool
+    # input), which would otherwise fail on .lower().
+    normalized = str(value).strip().lower().lstrip(".")
+    if not normalized:
+        return None
+    alias = {
+        "miniseed": "mseed",
+        "mseed": "mseed",
+        "segy": "segy",
+        "seg-y": "segy",
+        "sgy": "segy",
+        "hdf5": "hdf5",
+        "h5": "hdf5",
+        "npy": "npy",
+        "npz": "npz",
+        "sac": "sac",
+    }
+    return alias.get(normalized, normalized)
+
+
+def _infer_file_type_from_path(path: str | None):
+    """Guess the data format from a path's file extension.
+
+    The extension is compared case-insensitively. Returns the canonical type
+    string, or ``None`` when ``path`` is empty or the extension is unknown.
+    """
+    if not path:
+        return None
+    _, suffix = os.path.splitext(path)
+    known_suffixes = {
+        ".segy": "segy",
+        ".sgy": "segy",
+        ".mseed": "mseed",
+        ".miniseed": "mseed",
+        ".h5": "hdf5",
+        ".hdf5": "hdf5",
+        ".npy": "npy",
+        ".npz": "npz",
+        ".sac": "sac",
+    }
+    return known_suffixes.get(suffix.lower())
+
+
+def _resolve_source_path(path: str | None, source_type: str | None):
+    """Pick the file to process and its type.
+
+    Returns a ``(path, file_type)`` tuple:
+
+    * An explicit ``path`` always wins; its type is the normalized
+      ``source_type`` if given, otherwise inferred from the extension.
+    * Without a path, the module-level ``CURRENT_*_PATH`` values are
+      consulted (presumably set when a file is loaded; they are defined
+      outside this block). With a ``source_type`` only the matching entry is
+      used; otherwise the first non-empty one in the order SEGY, MiniSEED,
+      HDF5.
+    * ``path`` is ``None`` when nothing suitable is available.
+    """
+    wanted = _normalize_source_type(source_type)
+    if path:
+        return path, wanted or _infer_file_type_from_path(path)
+
+    # Insertion order doubles as the fallback priority.
+    loaded = {
+        "segy": CURRENT_SEGY_PATH,
+        "mseed": CURRENT_MINISEED_PATH,
+        "hdf5": CURRENT_HDF5_PATH,
+    }
+
+    if wanted:
+        return (loaded.get(wanted) or None), wanted
+
+    for kind, candidate in loaded.items():
+        if candidate:
+            return candidate, kind
+    return None, None
+
+
 def _resolve_output_path(output_path: str | None, *, default_filename: str) -> str:
     """Resolve output file path.
 
@@ -603,3 +721,64 @@ def convert_miniseed_to_sac(params: Union[str, dict, None] = None):
     if "error" in result:
         return json.dumps(result, indent=2)
     return json.dumps(result, indent=2)
+
+# Run the classical (non-ML) phase-picking methods.
+@tool
+def run_phase_picking(params: Union[str, dict, None] = None):
+    """Run classical phase picking on the loaded file (SEGY/MiniSEED/HDF5/NumPy) or a specified path.
+
+    Args (supplied inside ``params``, as a dict or JSON string):
+        path: Optional file path; when omitted, a currently loaded file is used.
+        file_type / source_type: Optional format name (e.g. mseed, sac, segy, hdf5, npy).
+        dataset: Dataset name (HDF5).
+        sampling_rate: Sampling rate (for NumPy).
+        methods: Picking methods to run, as a list or comma-separated string.
+        method_params: Per-method options, as a dict or JSON object string.
+
+    Returns:
+        A JSON string with the keys ``file``, ``file_type``, ``count``,
+        ``results`` and ``summary``; a JSON ``{"error": ...}`` string if the
+        picker raises; or a plain-text message when the sampling rate is
+        invalid, no file is available, or nothing was picked.
+    """
+    parsed = _parse_param_dict(params)
+
+    requested_path = parsed.get("path")
+    source_type = parsed.get("file_type") or parsed.get("source_type")
+    dataset = parsed.get("dataset")
+
+    try:
+        sampling_rate = _coerce_float(
+            parsed.get("sampling_rate"),
+            allow_none=True,
+            default=None,
+            field_name="sampling_rate",
+        )
+    except ValueError as exc:
+        return str(exc)
+
+    methods = _normalize_method_list(parsed.get("methods"))
+    method_params = _parse_method_params(parsed.get("method_params"))
+
+    # _resolve_source_path already prefers the normalized source type and
+    # falls back to inference, so its second value is the final file type.
+    resolved_path, file_type = _resolve_source_path(requested_path, source_type)
+    if not resolved_path:
+        return "No suitable data file is currently loaded. Please upload or specify a file path."
+
+    try:
+        picks = pick_phases(
+            resolved_path,
+            file_type=file_type,
+            dataset=dataset,
+            sampling_rate=sampling_rate,
+            methods=methods,
+            method_params=method_params,
+        )
+    except Exception as exc:
+        return json.dumps({"error": str(exc)}, indent=2)
+
+    if not picks:
+        return "No phase arrivals were detected."
+
+    def _to_builtin(obj):
+        # NOTE(review): the pick result type lives in utils.phase_picker and
+        # is not visible here; this assumes it may carry NumPy scalars or
+        # arrays, which json cannot encode natively.
+        if isinstance(obj, np.generic):
+            return obj.item()
+        if isinstance(obj, np.ndarray):
+            return obj.tolist()
+        raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")
+
+    serialized = [asdict(item) for item in picks]
+    summary = summarize_pick_results(picks)
+    return json.dumps(
+        {
+            "file": resolved_path,
+            "file_type": file_type,
+            "count": len(serialized),
+            "results": serialized,
+            "summary": summary,
+        },
+        indent=2,
+        default=_to_builtin,
+    )
diff --git a/utils/hdf5_handler.py b/utils/hdf5_handler.py
@@ -5,6 +5,8 @@
 
 
 class HDF5Handler:
+    _PREFERRED_DATASETS = ("traces", "data", "dataset", "waveforms", "waveform", "values")
+
     def __init__(self, filepath: str):
         self.filepath = filepath
         self._validate_file()
@@ -35,19 +37,29 @@ def visitor(name, node):
         return datasets
 
     def _select_dataset(self, h5f, dataset_name: str | None):
+        def _candidate_names(name: str):
+            norm = str(name).strip()
+            if not norm:
+                return []
+            candidates = [norm]
+            if norm.startswith("/"):
+                candidates.append(norm.lstrip("/"))
+            else:
+                candidates.append(f"/{norm}")
+            return [c for c in candidates if c]
+
         if dataset_name:
-            if dataset_name in h5f:
-                node = h5f[dataset_name]
-                if isinstance(node, h5py.Dataset):
-                    return node, dataset_name
-                raise ValueError(f"Path {dataset_name} is not a dataset")
-            if dataset_name.startswith("/") and dataset_name[1:] in h5f:
-                node = h5f[dataset_name[1:]]
-                if isinstance(node, h5py.Dataset):
-                    return node, dataset_name
+            for candidate in _candidate_names(dataset_name):
+                node = h5f.get(candidate)
+                if node is not None and isinstance(node, h5py.Dataset):
+                    return node, candidate
             raise ValueError(f"Dataset {dataset_name} not found")
-        if "traces" in h5f and isinstance(h5f["traces"], h5py.Dataset):
-            return h5f["traces"], "traces"
+
+        for preferred in self._PREFERRED_DATASETS:
+            node = h5f.get(preferred)
+            if node is not None and isinstance(node, h5py.Dataset):
+                return node, preferred
+
         datasets = self._collect_datasets(h5f)
         if datasets:
             return datasets[0]
diff --git a/utils/phase_picker.py b/utils/phase_picker.py

Original file line number	Diff line number	Diff line change
`@@ -27,6 +27,7 @@`
`27`	`27`	`convert_miniseed_to_numpy,`
`28`	`28`	`convert_miniseed_to_hdf5,`
`29`	`29`	`convert_miniseed_to_sac,`
	`30`	`+ run_phase_picking,`
`30`	`31`	`)`
`31`	`32`
`32`	`33`	`Provider = Literal["deepseek", "ollama"]`
`@@ -98,6 +99,7 @@ def get_agent_executor(`
`98`	`99`	`convert_miniseed_to_numpy,`
`99`	`100`	`convert_miniseed_to_hdf5,`
`100`	`101`	`convert_miniseed_to_sac,`
	`102`	`+ run_phase_picking,`
`101`	`103`	`]`
`102`	`104`
`103`	`105`	`template = '''Answer the following questions as best you can. You have access to the following tools:`