GradientHQ
diff --git a/‎scripts/download_model_shard.sh‎
Lines changed: 45 additions & 0 deletions b/‎scripts/download_model_shard.sh‎
Lines changed: 45 additions & 0 deletions
diff --git a/‎scripts/download_shard.py‎
Lines changed: 81 additions & 0 deletions b/‎scripts/download_shard.py‎
Lines changed: 81 additions & 0 deletions
diff --git a/‎src/backend/server/static_config.py‎
Lines changed: 1 addition & 0 deletions b/‎src/backend/server/static_config.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎src/parallax/launch.py‎
Lines changed: 1 addition & 1 deletion b/‎src/parallax/launch.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/parallax/metal/indexer/kernel.py‎
Lines changed: 230 additions & 0 deletions b/‎src/parallax/metal/indexer/kernel.py‎
Lines changed: 230 additions & 0 deletions
@@ -0,0 +1,45 @@
+#!/bin/bash
+
+# Example usage of the download_shard.py script
+
+# Default values
+MODEL_REPO=${1:-"Qwen/Qwen2.5-7B-Instruct"}
+START_LAYER=${2:-0}
+END_LAYER=${3:-10}
+OUTPUT_DIR=${4}  # Optional, defaults to empty/unset
+
+# Get the directory where this script is located
+SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
+
+echo "========================================================"
+echo "Downloading shard for model: $MODEL_REPO"
+echo "Layers: [$START_LAYER, $END_LAYER)"
+if [ -z "$OUTPUT_DIR" ]; then
+    echo "Output Directory: Default Hugging Face Cache"
+    OUTPUT_ARG=""
+else
+    echo "Output Directory: $OUTPUT_DIR"
+    OUTPUT_ARG="--output-dir $OUTPUT_DIR"
+fi
+echo "========================================================"
+
+# Ensure PYTHONPATH includes src
+export PYTHONPATH="${PROJECT_ROOT}/src:${PYTHONPATH}"
+
+python "${SCRIPT_DIR}/download_shard.py" \
+    --model-repo "$MODEL_REPO" \
+    --start-layer "$START_LAYER" \
+    --end-layer "$END_LAYER" \
+    $OUTPUT_ARG
+
+if [ $? -eq 0 ]; then
+    echo "========================================================"
+    echo "Download completed successfully."
+    echo "========================================================"
+else
+    echo "========================================================"
+    echo "Download failed."
+    echo "========================================================"
+    exit 1
+fi
@@ -0,0 +1,81 @@
+import argparse
+import os
+import sys
+from pathlib import Path
+
+# Add src to sys.path to allow importing parallax modules
+# Assuming script is in scripts/ directory, so src is at ../src
+current_dir = Path(__file__).resolve().parent
+src_dir = current_dir.parent / "src"
+sys.path.append(str(src_dir))
+
+try:
+    from parallax.utils.selective_download import selective_model_download
+    from parallax_utils.logging_config import get_logger, set_log_level
+except ImportError:
+    print(
+        f"Error: Could not import parallax modules. Please ensure 'src' directory is in PYTHONPATH or script is located in 'scripts/'. Added path: {src_dir}"
+    )
+    sys.exit(1)
+
+logger = get_logger("download_shard")
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Download specific layers of a model from Hugging Face Hub."
+    )
+    parser.add_argument(
+        "--model-repo", type=str, required=True, help="Hugging Face model repository ID"
+    )
+    parser.add_argument(
+        "--start-layer", type=int, required=True, help="Start layer index (inclusive)"
+    )
+    parser.add_argument("--end-layer", type=int, required=True, help="End layer index (exclusive)")
+    parser.add_argument(
+        "--output-dir",
+        type=str,
+        required=False,
+        default=None,
+        help="Local directory to save the model. If not provided, uses default Hugging Face cache.",
+    )
+    parser.add_argument("--log-level", type=str, default="INFO", help="Logging level")
+
+    args = parser.parse_args()
+    set_log_level(args.log_level)
+
+    # Convert output_dir to absolute path if provided
+    if args.output_dir:
+        output_dir = os.path.abspath(args.output_dir)
+        logger.info(
+            f"Downloading model {args.model_repo} layers [{args.start_layer}, {args.end_layer}) to {output_dir}"
+        )
+    else:
+        output_dir = None
+        logger.info(
+            f"Downloading model {args.model_repo} layers [{args.start_layer}, {args.end_layer}) to default Hugging Face cache"
+        )
+
+    try:
+        # Note: selective_model_download uses 'cache_dir' argument which is usually passed to hf_hub_download.
+        # hf_hub_download uses cache_dir as the base for its cache structure (models--owner--repo/...).
+        # If the user wants to download DIRECTLY to output_dir without the HF cache structure,
+        # selective_download might need adjustment or we accept the HF cache structure.
+        # Based on selective_download.py implementation:
+        # It calls snapshot_download(..., cache_dir=cache_dir) or hf_hub_download(..., cache_dir=cache_dir).
+        # So it will create the standard HF cache structure inside output_dir.
+
+        model_path = selective_model_download(
+            repo_id=args.model_repo,
+            start_layer=args.start_layer,
+            end_layer=args.end_layer,
+            cache_dir=output_dir,
+        )
+        logger.info(f"Successfully downloaded/verified model shard. Cache location: {model_path}")
+    except Exception as e:
+        logger.error(f"Failed to download model shard: {e}")
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
@@ -42,6 +42,7 @@
     "deepseek-ai/DeepSeek-V3": "mlx-community/DeepSeek-V3-4bit",
     "deepseek-ai/DeepSeek-V2.5-1210": "mlx-community/DeepSeek-V2.5-1210-4bit",
     "deepseek-ai/DeepSeek-R1": "mlx-community/DeepSeek-R1-4bit",
+    "deepseek-ai/DeepSeek-V3.2": "mlx-community/DeepSeek-V3.2-4bit",
     # Qwen 2.5 Series
     "Qwen/Qwen2.5-0.5B-Instruct": "Qwen/Qwen2.5-0.5B-Instruct",
     "Qwen/Qwen2.5-1.5B-Instruct": "Qwen/Qwen2.5-1.5B-Instruct",
 
@@ -107,7 +107,7 @@ def _wait_executors_check_layer_change(shared_state: SharedState, executor_subpr
                 display_parallax_join(args.model_path)
             check_latest_release()
 
-            config = fetch_model_from_hf(args.model_path)
+            config = fetch_model_from_hf(args.model_path, local_files_only=args.use_hfcache)
             # only launch http server on head node
             if args.start_layer == 0:
                 http_server_process = launch_http_server(args)
 
@@ -0,0 +1,230 @@
+import os
+from typing import Dict, List, Optional
+
+import mlx.core as mx
+
+# Cache of compiled kernel objects, keyed by "<kernel name>_<type string>";
+# populated lazily by _get_kernel.
+_KERNELS: Dict[str, object] = {}
+
+
+def _get_metal_source(filename):
+    path = os.path.join(os.path.dirname(__file__), filename)
+    with open(path, "r") as f:
+        return f.read()
+
+
+def _type_to_string(dtype: mx.Dtype) -> str:
+    if dtype == mx.float32:
+        return "float"
+    elif dtype == mx.float16:
+        return "half"
+    elif dtype == mx.bfloat16:
+        return "bfloat16_t"
+    else:
+        raise ValueError(f"Unsupported dtype: {dtype}")
+
+
+def _get_kernel(
+    name: str,
+    filename: str,
+    input_names: List[str],
+    output_names: List[str],
+    dtype: mx.Dtype = mx.float32,
+):
+    type_str = _type_to_string(dtype)
+    kernel_key = f"{name}_{type_str}"
+
+    if kernel_key not in _KERNELS:
+        source = _get_metal_source(filename)
+        source = source.replace("{{T}}", type_str)
+
+        header = """
+#include <metal_stdlib>
+using namespace metal;
+"""
+        _KERNELS[kernel_key] = mx.fast.metal_kernel(
+            name=name,
+            input_names=input_names,
+            output_names=output_names,
+            source=source,
+            header=header,
+        )
+    return _KERNELS[kernel_key]
+
+
+def store_indexer_cache(
+    key: mx.array,
+    key_cache: mx.array,
+    block_tables: mx.array,
+    context_lengths: mx.array,
+    block_size: int,
+    layer_idx: int,
+    slot_mapping: Optional[mx.array] = None,
+):
+    dtype = key.dtype
+    # key: (batch, target_len, num_heads, head_dim) or flattened
+
+    if slot_mapping is None:
+        # Decode Mode
+        batch_size = key.shape[0]
+        if key.ndim == 4:
+            # (batch, 1, num_kv_heads, head_dim) -> (batch, num_kv_heads, head_dim)
+            if key.shape[1] == 1:
+                key = key.squeeze(1)
+            elif key.shape[2] == 1:
+                # Fallback for old layout (batch, num_kv_heads, 1, head_dim)
+                key = key.squeeze(2)
+
+        num_heads = key.shape[1]
+        head_dim = key.shape[2]
+
+        # Compute slot_mapping internally
+        indices = context_lengths - 1
+        block_indices_in_table = indices // block_size
+        offsets = indices % block_size
+        batch_indices = mx.arange(batch_size)
+        physical_block_numbers = block_tables[batch_indices, block_indices_in_table]
+        slot_mapping = physical_block_numbers.astype(mx.int32) * block_size + offsets.astype(
+            mx.int32
+        )
+
+        num_tokens = batch_size
+    else:
+        # Prefill Mode
+        if key.ndim == 4:
+            B, T, H, D = key.shape
+            key = key.reshape(B * T, H, D)
+
+        num_tokens = key.shape[0]
+        num_heads = key.shape[1]
+        head_dim = key.shape[2]
+
+    num_layers = key_cache.shape[0]
+    num_blocks = key_cache.shape[1]
+
+    key_stride = num_heads * head_dim
+
+    def mk_int(val):
+        return mx.array(val, dtype=mx.int32)
+
+    inputs = [
+        key,
+        key_cache,
+        slot_mapping,
+        mk_int(key_stride),
+        mk_int(num_heads),
+        mk_int(head_dim),
+        mk_int(block_size),
+        mk_int(layer_idx),
+        mk_int(num_layers),
+        mk_int(num_blocks),
+    ]
+
+    input_names = [
+        "key",
+        "key_cache",
+        "slot_mapping",
+        "key_stride",
+        "num_heads",
+        "head_dim",
+        "block_size",
+        "layer_idx",
+        "num_layers",
+        "num_blocks",
+    ]
+
+    kernel = _get_kernel(
+        name="store_key_kernel",
+        filename="store_key.metal",
+        input_names=input_names,
+        output_names=["dummy_out"],
+        dtype=dtype,
+    )
+
+    grid = (num_heads * head_dim, num_tokens, 1)
+    thread_group = (min(1024, num_heads * head_dim), 1, 1)
+
+    outputs = kernel(
+        inputs=inputs,
+        grid=grid,
+        threadgroup=thread_group,
+        output_shapes=[(num_tokens, num_heads * head_dim)],  # Dummy output
+        output_dtypes=[mx.float32],
+        verbose=False,
+    )
+    mx.eval(outputs)
+
+
+def q_dot_k(
+    q: mx.array,  # (num_heads, head_dim)
+    key_cache: mx.array,  # (L, B, H, BS, D)
+    block_table: mx.array,  # (max_blocks)
+    context_length: mx.array,  # scalar
+    block_size: int,
+    layer_idx: int,
+) -> mx.array:
+
+    if q.ndim > 2:
+        q = q.squeeze()  # Ensure (H, D)
+
+    num_heads = q.shape[0]
+    head_dim = q.shape[1]
+
+    num_layers = key_cache.shape[0]
+    num_total_blocks = key_cache.shape[1]
+    max_blocks = block_table.shape[0]
+
+    ctx_len = int(context_length.item())
+
+    def mk_int(val):
+        return mx.array(val, dtype=mx.int32)
+
+    inputs = [
+        q,
+        key_cache,
+        block_table,
+        mk_int(ctx_len),
+        mk_int(block_size),
+        mk_int(num_heads),
+        mk_int(head_dim),
+        mk_int(layer_idx),
+        mk_int(num_layers),
+        mk_int(num_total_blocks),
+        mk_int(max_blocks),
+    ]
+
+    input_names = [
+        "q",
+        "key_cache",
+        "block_table",
+        "context_len",
+        "block_size",
+        "num_heads",
+        "head_dim",
+        "layer_idx",
+        "num_layers",
+        "num_total_blocks",
+        "max_blocks",
+    ]
+
+    kernel = _get_kernel(
+        name="q_dot_k_kernel",
+        filename="q_dot_k.metal",
+        input_names=input_names,
+        output_names=["output"],
+        dtype=q.dtype,
+    )
+
+    # Grid: (block_size, num_heads, 1)
+    grid = (block_size, num_heads, 1)
+    thread_group = (min(1024, block_size), 1, 1)
+
+    outputs = kernel(
+        inputs=inputs,
+        grid=grid,
+        threadgroup=thread_group,
+        output_shapes=[(num_heads, ctx_len)],
+        output_dtypes=[mx.float32],  # Score is float32
+        verbose=False,
+    )
+
+    return outputs[0]