diff --git a/.github/workflows/kubeflow-components-test.yaml b/.github/workflows/kubeflow-components-test.yaml new file mode 100644 index 00000000..240ccf62 --- /dev/null +++ b/.github/workflows/kubeflow-components-test.yaml @@ -0,0 +1,51 @@ +name: Kubeflow Components Tests + +on: + pull_request: + paths: + - 'build/dockerfiles/kubeflow-components/**' + - '.github/workflows/kubeflow-components-test.yaml' + +jobs: + test-scripts: + name: Run BATS tests + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@93cb6efe18208431cddfb8368fd83d5badbf9bfd # v5.0.1 + + - name: Install BATS + run: | + sudo apt-get update + sudo apt-get install -y bats + + - name: Run push-modelkit tests + working-directory: build/dockerfiles/kubeflow-components + run: bats tests/push-modelkit.bats + + - name: Run unpack-modelkit tests + working-directory: build/dockerfiles/kubeflow-components + run: bats tests/unpack-modelkit.bats + + test-container-build: + name: Test container build + runs-on: ubuntu-latest + steps: + - name: Set up QEMU + uses: docker/setup-qemu-action@c7c53464625b32c7a7e944ae62b3e17d2b600130 # v3.7.0 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # v3.11.1 + + - name: Checkout + uses: actions/checkout@93cb6efe18208431cddfb8368fd83d5badbf9bfd # v5.0.1 + + - name: Check kubeflow components container build + uses: docker/build-push-action@263435318d21b8e681c14492fe198d362a7d2c83 # v6.18.0 + with: + platforms: linux/amd64,linux/arm64 + push: false + context: build/dockerfiles/kubeflow-components + file: build/dockerfiles/kubeflow-components/Dockerfile + build-args: | + KIT_BASE_IMAGE=ghcr.io/kitops-ml/kitops:next diff --git a/.github/workflows/next-container-build.yaml b/.github/workflows/next-container-build.yaml index 59133522..bfbe6466 100644 --- a/.github/workflows/next-container-build.yaml +++ b/.github/workflows/next-container-build.yaml @@ -12,6 +12,7 @@ env: IMAGE_NAME: ${{ github.repository }} INIT_IMAGE_NAME: ${{ github.repository }}-init KIT_SERVE_IMAGE: ${{ github.repository }}-kserve + KUBEFLOW_IMAGE: ${{ github.repository }}-kubeflow NEXT_TAG: next permissions: @@ -90,3 +91,20 @@ jobs: index:org.opencontainers.image.description=KitOps KServe container index:org.opencontainers.image.source=https://github.com/kitops-ml/kitops index:org.opencontainers.image.licenses=Apache-2.0 + + - name: Build and push Kubeflow Pipeline components container + id: build-kubeflow-container + uses: docker/build-push-action@263435318d21b8e681c14492fe198d362a7d2c83 # v6.18.0 + with: + platforms: linux/amd64,linux/arm64 + push: true + context: build/dockerfiles/kubeflow-components + file: build/dockerfiles/kubeflow-components/Dockerfile + build-args: | + KIT_BASE_IMAGE=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}@${{ steps.build-kit-container.outputs.digest }} + tags: | + ${{ env.REGISTRY }}/${{ env.KUBEFLOW_IMAGE }}:${{ env.NEXT_TAG }} + annotations: | + index:org.opencontainers.image.description=KitOps Kubeflow Pipeline Components + index:org.opencontainers.image.source=https://github.com/kitops-ml/kitops + index:org.opencontainers.image.licenses=Apache-2.0 diff --git a/.github/workflows/platform-release.yaml b/.github/workflows/platform-release.yaml index 4cc834e0..9a9588c8 100644 --- a/.github/workflows/platform-release.yaml +++ b/.github/workflows/platform-release.yaml @@ -21,6 +21,7 @@ env: IMAGE_NAME: ${{ github.repository }} INIT_IMAGE_NAME: ${{ github.repository }}-init KIT_SERVE_IMAGE: ${{ github.repository }}-kserve 
+ KUBEFLOW_IMAGE: ${{ github.repository }}-kubeflow permissions: contents: write @@ -431,6 +432,24 @@ jobs: index:org.opencontainers.image.source=https://github.com/kitops-ml/kitops index:org.opencontainers.image.licenses=Apache-2.0 + - name: Build and push Kubeflow Pipeline components container + id: build-kubeflow-container + uses: docker/build-push-action@263435318d21b8e681c14492fe198d362a7d2c83 # v6.18.0 + with: + platforms: linux/amd64,linux/arm64 + push: true + context: build/dockerfiles/kubeflow-components + file: build/dockerfiles/kubeflow-components/Dockerfile + build-args: | + KIT_BASE_IMAGE=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}@${{ steps.build-kit-container.outputs.digest }} + tags: | + ${{ env.REGISTRY }}/${{ env.KUBEFLOW_IMAGE }}:latest + ${{ env.REGISTRY }}/${{ env.KUBEFLOW_IMAGE }}:${{ github.ref_name }} + annotations: | + index:org.opencontainers.image.description=KitOps Kubeflow Pipeline Components + index:org.opencontainers.image.source=https://github.com/kitops-ml/kitops + index:org.opencontainers.image.licenses=Apache-2.0 + - name: Generate artifact attestation for base container uses: actions/attest-build-provenance@977bb373ede98d70efdf65b84cb5f73e068dcc2a # v3.0.0 with: @@ -451,3 +470,10 @@ jobs: subject-name: ${{ env.REGISTRY }}/${{ env.KIT_SERVE_IMAGE }} subject-digest: ${{ steps.build-kit-serve-container.outputs.digest }} push-to-registry: true + + - name: Generate artifact attestation for kubeflow container + uses: actions/attest-build-provenance@977bb373ede98d70efdf65b84cb5f73e068dcc2a # v3.0.0 + with: + subject-name: ${{ env.REGISTRY }}/${{ env.KUBEFLOW_IMAGE }} + subject-digest: ${{ steps.build-kubeflow-container.outputs.digest }} + push-to-registry: true diff --git a/build/dockerfiles/kubeflow-components/Dockerfile b/build/dockerfiles/kubeflow-components/Dockerfile new file mode 100644 index 00000000..2a518b0b --- /dev/null +++ b/build/dockerfiles/kubeflow-components/Dockerfile @@ -0,0 +1,32 @@ +# Multi-platform digest for Cosign v2.4.0 +ARG COSIGN_DIGEST=sha256:9d50ceb15f023eda8f58032849eedc0216236d2e2f4cfe1cdf97c00ae7798cfe +ARG KIT_BASE_IMAGE=ghcr.io/kitops-ml/kitops:next + +FROM gcr.io/projectsigstore/cosign@$COSIGN_DIGEST AS cosign-install +FROM $KIT_BASE_IMAGE + +# Install additional tools needed for scripts +USER 0 +RUN apk add --no-cache \ + bash \ + jq +USER 1001 + +# Copy cosign from multi-platform build +COPY --from=cosign-install /ko-app/cosign /usr/local/bin/cosign + +# Copy scripts (needs root for chmod) +USER 0 +COPY scripts/ /scripts/ +RUN chmod +x /scripts/*.sh +USER 1001 + +# Set working directory +WORKDIR /workspace + +# Default entrypoint +ENTRYPOINT ["/bin/bash"] + +LABEL org.opencontainers.image.description="KitOps Kubeflow Pipeline Components" +LABEL org.opencontainers.image.source="https://github.com/kitops-ml/kitops" +LABEL org.opencontainers.image.licenses="Apache-2.0" diff --git a/build/dockerfiles/kubeflow-components/README.md b/build/dockerfiles/kubeflow-components/README.md new file mode 100644 index 00000000..82bf9119 --- /dev/null +++ b/build/dockerfiles/kubeflow-components/README.md @@ -0,0 +1,316 @@ +# Kubeflow Pipeline ModelKit Components + +Kubeflow Pipeline components for packaging and deploying ML artifacts as KitOps ModelKits. + +## Components + +### push-modelkit + +Packages ML artifacts in a directory as a ModelKit and pushes it to an OCI registry. + +If a `Kitfile` exists in `modelkit_dir`, it is used as-is. Otherwise, one is auto-generated via `kit init`. 
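+
+Under the hood, the component's script drives the standard Kit CLI flow; the commands below are roughly what it runs (the reference and paths are illustrative):
+
+```bash
+kit init ./modelkit_dir --name "My Model"            # only when no Kitfile exists
+kit pack ./modelkit_dir -t registry.io/myorg/mymodel:v1
+kit push registry.io/myorg/mymodel:v1
+kit inspect registry.io/myorg/mymodel:v1 --remote    # resolves the pushed digest
+```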
+ +**Required inputs** + +- `registry` – Container registry host (e.g., `registry.io`) +- `repository` – Repository path (e.g., `myorg/mymodel`) +- `tag` – ModelKit tag (default: `latest`) +- `modelkit_dir` – Directory with model files (with or without `Kitfile`) + +**Optional metadata (for Kitfile)** + +- `modelkit_name` – ModelKit package name +- `modelkit_desc` – ModelKit description +- `modelkit_author` – ModelKit author + +**Optional attestation metadata** + +- `dataset_uri` – Dataset URI +- `code_repo` – Code repository URL +- `code_commit` – Code commit hash + +**Outputs** + +- `ref` – Tagged ModelKit reference (e.g., `registry.io/myorg/mymodel:v1`) +- `digest` – Digest-based ModelKit reference (e.g., `registry.io/myorg/mymodel@sha256:abc…`) + +### unpack-modelkit + +Pulls a ModelKit from a registry and extracts it. + +**Inputs** + +- `modelkit_reference` – ModelKit reference (e.g., `registry.io/repo:tag` or `registry.io/repo@sha256:…`) +- `extract_path` – Directory to extract contents (default: `/tmp/model`) + +**Outputs** + +- `model_path` – Directory where contents were extracted + +## Usage Examples + +Complete, runnable examples (including a full house-prices pipeline) are in the [`examples/`](examples/) directory. + +### Basic usage + +Training component that writes ML artifacts to a directory: + +```python +from kfp import dsl + +@dsl.component( + packages_to_install=['pandas', 'xgboost', 'scikit-learn'], + base_image='python:3.11-slim', +) +def train_model(modelkit_dir: dsl.Output[dsl.Artifact]): + """Train model and save to directory.""" + import os + import pickle + + model = train_your_model() + os.makedirs(modelkit_dir.path, exist_ok=True) + + with open(os.path.join(modelkit_dir.path, 'model.pkl'), 'wb') as f: + pickle.dump(model, f) + + save_dataset(os.path.join(modelkit_dir.path, 'predictions.csv')) + save_code(os.path.join(modelkit_dir.path, 'train.py')) + save_docs(os.path.join(modelkit_dir.path, 'README.md')) +``` + +Component to push the directory as a ModelKit: + +```python +from kfp import dsl, kubernetes + +@dsl.container_component +def push_modelkit( + registry: str, + repository: str, + tag: str, + input_modelkit_dir: dsl.Input[dsl.Artifact], + output_ref: dsl.Output[dsl.Artifact], + output_digest: dsl.Output[dsl.Artifact], + modelkit_name: str = '', + modelkit_desc: str = '', + modelkit_author: str = '', + dataset_uri: str = '', + code_repo: str = '', + code_commit: str = '', +): + return dsl.ContainerSpec( + image='ghcr.io/kitops-ml/kitops-kubeflow:latest', + command=['/bin/bash', '-c'], + args=[ + f'/scripts/push-modelkit.sh ' + f'"{registry}" "{repository}" "{tag}" ' + f'--modelkit-dir "{input_modelkit_dir.path}" ' + f'--name "{modelkit_name}" ' + f'--desc "{modelkit_desc}" ' + f'--author "{modelkit_author}" ' + f'--dataset-uri "{dataset_uri}" ' + f'--code-repo "{code_repo}" ' + f'--code-commit "{code_commit}" ' + f'&& cp /tmp/outputs/reference "{output_ref.path}" ' + f'&& cp /tmp/outputs/digest "{output_digest.path}"' + ], + ) +``` + +Simple end‑to‑end pipeline: + +```python +@dsl.pipeline( + name='simple-modelkit-pipeline', + description='Train and package as ModelKit', +) +def simple_pipeline( + registry: str = 'jozu.ml', + repository: str = 'team/model', + tag: str = 'latest', +): + train = train_model() + + push = push_modelkit( + registry=registry, + repository=repository, + tag=tag, + input_modelkit_dir=train.outputs['modelkit_dir'], + modelkit_name='My Model', + modelkit_desc='Description of my model', + modelkit_author='Data Science Team', + ) + + 
kubernetes.use_secret_as_volume( + push, + secret_name='docker-config', + mount_path='/home/user/.docker', + ) +``` + +### Using a custom Kitfile + +If you need full control, create a `Kitfile` alongside your artifacts: + +```python +@dsl.component(base_image='python:3.11-slim') +def train_with_kitfile(modelkit_dir: dsl.Output[dsl.Artifact]): + """Train and create custom Kitfile.""" + import os + + train_and_save_model(modelkit_dir.path) + + kitfile_content = """ +manifestVersion: 1.0 +package: + name: Custom Model + description: Model with custom configuration + authors: + - Data Science Team +model: + path: model.pkl +datasets: + - path: train.csv + - path: test.csv +code: + - path: train.py +docs: + - path: README.md +""" + with open(os.path.join(modelkit_dir.path, 'Kitfile'), 'w') as f: + f.write(kitfile_content) +``` + +When a `Kitfile` is present, the component uses it instead of generating one. + +### Pipeline with attestation + +```python +@dsl.pipeline( + name='production-pipeline', + description='Production pipeline with attestation', +) +def production_pipeline( + registry: str = 'jozu.ml', + repository: str = 'team/prod-model', + tag: str = 'v1.0.0', + dataset_uri: str = 's3://bucket/data.csv', + code_repo: str = 'github.com/org/repo', + code_commit: str = 'abc123', +): + train = train_model() + + push = push_modelkit( + registry=registry, + repository=repository, + tag=tag, + input_modelkit_dir=train.outputs['modelkit_dir'], + modelkit_name='Production Model', + modelkit_desc='Production model v1.0.0', + modelkit_author='ML Team', + dataset_uri=dataset_uri, + code_repo=code_repo, + code_commit=code_commit, + ) + + kubernetes.use_secret_as_volume( + push, + secret_name='docker-config', + mount_path='/home/user/.docker', + ) + kubernetes.use_secret_as_volume( + push, + secret_name='cosign-keys', + mount_path='/etc/cosign', + ) +``` + +## Secret Requirements + +### Registry credentials + +Create a Kubernetes secret with Docker registry credentials: + +```bash +kubectl create secret generic docker-config \ + --from-file=config.json="$HOME/.docker/config.json" \ + --namespace=kubeflow +``` + +Or: + +```bash +kubectl create secret docker-registry docker-config \ + --docker-server=jozu.ml \ + --docker-username=myuser \ + --docker-password=mypassword \ + --docker-email=user@example.com \ + --namespace=kubeflow +``` + +Mount in your pipeline (as shown above) using: + +```python +kubernetes.use_secret_as_volume( + push, + secret_name='docker-config', + mount_path='/home/user/.docker', +) +``` + +### Cosign keys (optional) + +For ModelKit attestation signing, create a secret with cosign keys: + +```bash +cosign generate-key-pair + +kubectl create secret generic cosign-keys \ + --from-file=cosign.key=cosign.key \ + --from-file=cosign.pub=cosign.pub \ + --namespace=kubeflow +``` + +Mount it as in the attestation pipeline example: + +```python +kubernetes.use_secret_as_volume( + push, + secret_name='cosign-keys', + mount_path='/etc/cosign', +) +``` + +If cosign keys are not available, the signing step logs a warning and continues. 
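+
+To verify a signed attestation later, check it against the public key. A sketch (verification must skip the transparency log, since the script signs with `--tlog-upload=false`):
+
+```bash
+cosign verify-attestation \
+  --key cosign.pub \
+  --type https://kitops.ml/attestation/v1 \
+  --insecure-ignore-tlog=true \
+  registry.io/myorg/mymodel@sha256:...
+```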
+
+## Troubleshooting
+
+### Authentication errors
+
+**Symptom:** `Failed to push ModelKit` or `401 Unauthorized`
+
+**Check:**
+
+```bash
+kubectl get secret docker-config -n kubeflow
+kubectl get secret docker-config -n kubeflow \
+  -o jsonpath='{.data.config\.json}' | base64 -d
+```
+
+`config.json` should contain registry auth for your host:
+
+```json
+{
+  "auths": {
+    "jozu.ml": {
+      "auth": "base64(username:password)"
+    }
+  }
+}
+```
+
+### Directory not found
+
+**Symptom:** `ModelKit directory does not exist`
+
+Ensure your training component creates `modelkit_dir.path` and writes artifacts into it (see `train_model` example above).
diff --git a/build/dockerfiles/kubeflow-components/components/push-modelkit/component.yaml b/build/dockerfiles/kubeflow-components/components/push-modelkit/component.yaml
new file mode 100644
index 00000000..238edf29
--- /dev/null
+++ b/build/dockerfiles/kubeflow-components/components/push-modelkit/component.yaml
@@ -0,0 +1,51 @@
+name: push-modelkit
+description: Package and push ML artifacts as a ModelKit
+
+inputs:
+  - {name: registry, type: String, description: 'Container registry host (e.g., registry.io)'}
+  - {name: repository, type: String, description: 'Repository path (e.g., myorg/mymodel)'}
+  - {name: tag, type: String, default: 'latest', description: 'ModelKit tag'}
+
+  - {name: modelkit_dir, type: String, description: 'Directory containing ML artifacts (with or without Kitfile)'}
+
+  - {name: modelkit_name, type: String, optional: true, description: 'Name for the ModelKit'}
+  - {name: modelkit_desc, type: String, optional: true, description: 'Description for the ModelKit'}
+  - {name: modelkit_author, type: String, optional: true, description: 'Author for the ModelKit'}
+
+  - {name: dataset_uri, type: String, optional: true, description: 'Dataset URI'}
+  - {name: code_repo, type: String, optional: true, description: 'Code repository URL'}
+  - {name: code_commit, type: String, optional: true, description: 'Code commit hash'}
+
+outputs:
+  - {name: ref, type: String, description: 'Tagged ModelKit reference (e.g., registry.io/repo:tag)'}
+  - {name: digest, type: String, description: 'Digest-based ModelKit reference (e.g., registry.io/repo@sha256:...)'}
+
+implementation:
+  container:
+    image: ghcr.io/kitops-ml/kitops-kubeflow:latest
+    command:
+      - /bin/bash
+      - -c
+      - |
+        /scripts/push-modelkit.sh \
+          "$0" "$1" "$2" \
+          --modelkit-dir "$3" \
+          ${4:+--name "$4"} \
+          ${5:+--desc "$5"} \
+          ${6:+--author "$6"} \
+          ${7:+--dataset-uri "$7"} \
+          ${8:+--code-repo "$8"} \
+          ${9:+--code-commit "$9"}
+      - {inputValue: registry}
+      - {inputValue: repository}
+      - {inputValue: tag}
+      - {inputValue: modelkit_dir}
+      - {inputValue: modelkit_name}
+      - {inputValue: modelkit_desc}
+      - {inputValue: modelkit_author}
+      - {inputValue: dataset_uri}
+      - {inputValue: code_repo}
+      - {inputValue: code_commit}
+    fileOutputs:
+      ref: /tmp/outputs/reference
+      digest: /tmp/outputs/digest
diff --git a/build/dockerfiles/kubeflow-components/components/unpack-modelkit/component.yaml b/build/dockerfiles/kubeflow-components/components/unpack-modelkit/component.yaml
new file mode 100644
index 00000000..f4faf93f
--- /dev/null
+++ b/build/dockerfiles/kubeflow-components/components/unpack-modelkit/component.yaml
@@ -0,0 +1,20 @@
+name: unpack-modelkit
+description: Unpack ModelKit artifacts from a registry
+
+inputs:
+  - {name: modelkit_reference, type: String, description: 'ModelKit reference (e.g., registry.io/repo:tag or registry.io/repo@sha256:...)'}
+  - {name: extract_path, type: String, default: '/tmp/model', description: 'Directory to extract ModelKit artifacts'}
+
+outputs:
+  - {name: model_path, type: String, description: 'Directory where ModelKit artifacts were extracted'}
+
+implementation:
+  container:
+    image: ghcr.io/kitops-ml/kitops-kubeflow:latest
+    command:
+      - /bin/bash
+      - /scripts/unpack-modelkit.sh
+      - {inputValue: modelkit_reference}
+      - {inputValue: extract_path}
+    fileOutputs:
+      model_path: /tmp/outputs/model_path
diff --git a/build/dockerfiles/kubeflow-components/examples/README.md b/build/dockerfiles/kubeflow-components/examples/README.md
new file mode 100644
index 00000000..d6c81181
--- /dev/null
+++ b/build/dockerfiles/kubeflow-components/examples/README.md
@@ -0,0 +1,27 @@
+# Kubeflow ModelKit Component Examples
+
+This directory contains runnable Kubeflow Pipeline examples that use the `push-modelkit` and `unpack-modelkit` components.
+
+## House Prices Pipeline (`house-prices-pipeline.py`)
+
+An end-to-end pipeline that:
+
+- Trains an XGBoost regression model
+- Writes model artifacts into a directory (model, code, docs)
+- Packages them as a ModelKit and pushes to an OCI registry
+- Optionally adds attestation metadata and cosign signing
+
+### How to Run
+
+```bash
+pip install kfp==2.14.3 kfp-kubernetes==2.14.3
+python house-prices-pipeline.py
+```
+
+Upload the generated `house-prices-with-modelkit.yaml` to the Kubeflow UI (or use the KFP SDK) to execute the pipeline.
+
+### After It Runs
+
+Use the `kit` CLI to pull, inspect, and unpack the resulting ModelKit.
+
+For full component reference, integration patterns, secret requirements, and troubleshooting, see the main Kubeflow components README in the parent directory.
diff --git a/build/dockerfiles/kubeflow-components/examples/house-prices-inference-service.yaml b/build/dockerfiles/kubeflow-components/examples/house-prices-inference-service.yaml
new file mode 100644
index 00000000..d0f119a8
--- /dev/null
+++ b/build/dockerfiles/kubeflow-components/examples/house-prices-inference-service.yaml
@@ -0,0 +1,14 @@
+apiVersion: serving.kserve.io/v1beta1
+kind: InferenceService
+metadata:
+  name: house-prices-model
+  namespace: modelkit-demo
+  annotations:
+    serving.kserve.io/deploymentMode: RawDeployment
+spec:
+  predictor:
+    model:
+      modelFormat:
+        name: sklearn
+      storageUri: kit://jozu.ml/demo/house-prices:latest
+      protocolVersion: v2
\ No newline at end of file
diff --git a/build/dockerfiles/kubeflow-components/examples/house-prices-pipeline.py b/build/dockerfiles/kubeflow-components/examples/house-prices-pipeline.py
new file mode 100644
index 00000000..c0d12878
--- /dev/null
+++ b/build/dockerfiles/kubeflow-components/examples/house-prices-pipeline.py
@@ -0,0 +1,245 @@
+"""
+Example Kubeflow Pipeline integrating the push-modelkit component with house prices training.
+
+This example demonstrates the directory-based approach, where the training component
+creates a directory with well-named files (model.pkl, predictions.csv, train.py, README.md)
+and the push-modelkit component packages the entire directory as a ModelKit.
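+
+Before running, create the `docker-config` registry secret described in the
+component README; the pipeline mounts it into the push step.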
+ +Uses KFP v2.14.3 +""" + +from kfp import dsl, kubernetes + +@dsl.component( + packages_to_install=['pandas', 'xgboost', 'scikit-learn'], + base_image='python:3.11-slim' +) +def train_house_prices( + modelkit_dir: dsl.Output[dsl.Artifact] +): + """Train house prices model with synthetic data.""" + import pandas as pd + import xgboost as xgb + from sklearn.model_selection import train_test_split + from sklearn.datasets import make_regression + import pickle + import os + + # Generate synthetic data + X, y = make_regression(n_samples=1000, n_features=10, noise=10, random_state=42) + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) + + # Convert to DataFrame + feature_names = [f'feature_{i}' for i in range(X.shape[1])] + X_train_df = pd.DataFrame(X_train, columns=feature_names) + X_test_df = pd.DataFrame(X_test, columns=feature_names) + + # Train model + model = xgb.XGBRegressor(n_estimators=100, max_depth=7, learning_rate=0.1, random_state=42) + model.fit(X_train_df, y_train) + + # Evaluate + train_score = model.score(X_train_df, y_train) + test_score = model.score(X_test_df, y_test) + print(f"Training R² score: {train_score:.4f}") + print(f"Test R² score: {test_score:.4f}") + + # Create directory for modelkit artifacts + os.makedirs(modelkit_dir.path, exist_ok=True) + + # Write files with proper names directly to the directory + model_file = os.path.join(modelkit_dir.path, 'model.pkl') + predictions_file = os.path.join(modelkit_dir.path, 'predictions.csv') + training_script_file = os.path.join(modelkit_dir.path, 'train.py') + readme_file = os.path.join(modelkit_dir.path, 'README.md') + + # Save model using pickle (avoids KFP UTF-8 issues with binary formats) + with open(model_file, 'wb') as f: + pickle.dump(model, f) + + # Generate predictions + predictions = model.predict(X_test_df) + pred_df = pd.DataFrame({'Id': range(len(predictions)), 'Prediction': predictions}) + pred_df.to_csv(predictions_file, index=False) + + # Save training script + with open(training_script_file, 'w') as f: + f.write("""# Training Script + +This model was trained using XGBoost on synthetic regression data. + +## Training Configuration +- Algorithm: XGBoost Gradient Boosting +- n_estimators: 100 +- max_depth: 7 +- learning_rate: 0.1 +- random_state: 42 + +## Data +- Training samples: 800 +- Test samples: 200 +- Features: 10 (synthetic) +""") + + # Generate README + with open(readme_file, 'w') as f: + f.write(f"""# House Prices Demo Model + +## Model Details +- **Framework**: XGBoost {xgb.__version__} +- **Algorithm**: Gradient Boosted Trees +- **Training R² Score**: {train_score:.4f} +- **Test R² Score**: {test_score:.4f} + +## Training Data +- Training samples: {len(X_train)} +- Test samples: {len(X_test)} +- Features: {X.shape[1]} (synthetic) + +## Usage +```python +import pickle +with open('model.pkl', 'rb') as f: + model = pickle.load(f) +predictions = model.predict(X_new) +``` + +--- +Packaged with KitOps +""") + + +@dsl.container_component +def push_modelkit( + registry: str, + repository: str, + tag: str, + output_ref: dsl.Output[dsl.Artifact], + output_digest: dsl.Output[dsl.Artifact], + input_modelkit_dir: dsl.Input[dsl.Artifact], + modelkit_name: str = '', + modelkit_desc: str = '', + modelkit_author: str = '', + dataset_uri: str = '', + code_repo: str = '', + code_commit: str = '' +): + """Package and push model as ModelKit with attestation. 
+
+    Outputs:
+        output_ref: Tagged reference (e.g., jozu.ml/repo:tag)
+        output_digest: Digest reference (e.g., jozu.ml/repo@sha256:...)
+    """
+    # Build the command using safe positional-argument passing
+    return dsl.ContainerSpec(
+        image='ghcr.io/kitops-ml/kitops-kubeflow:latest',
+        command=['/bin/bash', '-c'],
+        args=[
+            '''
+            export DOCKER_CONFIG=/home/user/.docker && \
+            /scripts/push-modelkit.sh \
+            "$0" "$1" "$2" \
+            --modelkit-dir "$3" \
+            ${4:+--name "$4"} \
+            ${5:+--desc "$5"} \
+            ${6:+--author "$6"} \
+            ${7:+--dataset-uri "$7"} \
+            ${8:+--code-repo "$8"} \
+            ${9:+--code-commit "$9"} \
+            && cp /tmp/outputs/reference "${10}" \
+            && cp /tmp/outputs/digest "${11}"
+            ''',
+            registry,
+            repository,
+            tag,
+            input_modelkit_dir.path,
+            modelkit_name,
+            modelkit_desc,
+            modelkit_author,
+            dataset_uri,
+            code_repo,
+            code_commit,
+            output_ref.path,
+            output_digest.path
+        ]
+    )
+
+@dsl.pipeline(
+    name='house-prices-with-modelkit',
+    description='Train house prices model and package as ModelKit'
+)
+def house_prices_pipeline(
+    registry: str = 'jozu.ml',
+    repository: str = 'demo/house-prices',
+    tag: str = 'latest',
+    dataset_source_uri: str = 'synthetic',
+    code_repo: str = 'github.com/kitops-ml/kitops',
+    code_commit: str = 'main'
+):
+    """
+    Complete pipeline that trains a house prices model and packages it as a ModelKit.
+
+    Args:
+        registry: Container registry (e.g., jozu.ml)
+        repository: Repository path for ModelKit (e.g., demo/house-prices)
+        tag: ModelKit tag
+        dataset_source_uri: Source URI of dataset for attestation
+        code_repo: Code repository for attestation
+        code_commit: Git commit hash for attestation
+    """
+
+    # Train model with synthetic data
+    train = train_house_prices()
+
+    # Package as ModelKit with directory of artifacts
+    push = push_modelkit(
+        registry=registry,
+        repository=repository,
+        tag=tag,
+        # Pass directory containing all artifacts
+        input_modelkit_dir=train.outputs['modelkit_dir'],
+        # Metadata
+        modelkit_name='House Prices Demo Model',
+        modelkit_desc='XGBoost model trained on synthetic data for KitOps demo',
+        modelkit_author='KitOps Team',
+        # Attestation metadata
+        dataset_uri=dataset_source_uri,
+        code_repo=code_repo,
+        code_commit=code_commit
+    )
+
+    # Mount docker-config secret for registry authentication
+    kubernetes.use_secret_as_volume(
+        push,
+        secret_name='docker-config',
+        mount_path='/home/user/.docker'
+    )
+
+
+if __name__ == '__main__':
+    import kfp
+
+    # Check KFP version and use appropriate compiler
+    kfp_version = kfp.__version__
+    print(f"Using KFP version: {kfp_version}")
+
+    if kfp_version.startswith('2.'):
+        from kfp import compiler
+        compiler.Compiler().compile(
+            pipeline_func=house_prices_pipeline,
+            package_path='house-prices-with-modelkit.yaml'
+        )
+    else:
+        # KFP v1 fallback; v1's compile() takes no pipeline_name argument. The
+        # v2-only imports above make this branch effectively unreachable anyway.
+        import kfp.compiler as compiler
+        compiler.Compiler().compile(
+            pipeline_func=house_prices_pipeline,
+            package_path='house-prices-with-modelkit.yaml'
+        )
+
+    print("Pipeline compiled successfully!")
+    print("Upload house-prices-with-modelkit.yaml to Kubeflow UI")
diff --git a/build/dockerfiles/kubeflow-components/examples/train-house-prices.yaml b/build/dockerfiles/kubeflow-components/examples/train-house-prices.yaml
new file mode 100644
index 00000000..0c478a42
--- /dev/null
+++ b/build/dockerfiles/kubeflow-components/examples/train-house-prices.yaml
@@ -0,0 +1,123 @@
+name: train-house-prices
+description: Train XGBoost model on synthetic house prices data
+
+outputs:
+  - {name: model_path, type: String}
+  - {name: 
predictions_path, type: String} + - {name: training_script_path, type: String} + - {name: readme_path, type: String} + +implementation: + container: + image: python:3.11-slim + command: + - sh + - -c + - | + pip install pandas xgboost scikit-learn && python - <<'EOF' + import pandas as pd + import xgboost as xgb + from sklearn.model_selection import train_test_split + from sklearn.datasets import make_regression + import pickle + import os + + # Create output directory + os.makedirs('/outputs', exist_ok=True) + + # Generate synthetic data + X, y = make_regression(n_samples=1000, n_features=10, noise=10, random_state=42) + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) + + # Convert to DataFrame + feature_names = [f'feature_{i}' for i in range(X.shape[1])] + X_train_df = pd.DataFrame(X_train, columns=feature_names) + X_test_df = pd.DataFrame(X_test, columns=feature_names) + + # Train model + model = xgb.XGBRegressor(n_estimators=100, max_depth=7, learning_rate=0.1, random_state=42) + model.fit(X_train_df, y_train) + + # Evaluate + train_score = model.score(X_train_df, y_train) + test_score = model.score(X_test_df, y_test) + print(f"Training R² score: {train_score:.4f}") + print(f"Test R² score: {test_score:.4f}") + + # Save model + model_path = '/outputs/model.pkl' + with open(model_path, 'wb') as f: + pickle.dump(model, f) + + # Generate predictions + predictions = model.predict(X_test_df) + predictions_path = '/outputs/predictions.csv' + pred_df = pd.DataFrame({'Id': range(len(predictions)), 'Prediction': predictions}) + pred_df.to_csv(predictions_path, index=False) + + # Save training script + training_script_path = '/outputs/train.py' + with open(training_script_path, 'w') as f: + f.write("""# Training Script + + This model was trained using XGBoost on synthetic regression data. 
+ + ## Training Configuration + - Algorithm: XGBoost Gradient Boosting + - n_estimators: 100 + - max_depth: 7 + - learning_rate: 0.1 + - random_state: 42 + + ## Data + - Training samples: 800 + - Test samples: 200 + - Features: 10 (synthetic) + """) + + # Generate README + readme_path = '/outputs/README.md' + with open(readme_path, 'w') as f: + f.write(f"""# House Prices Demo Model + + ## Model Details + - **Framework**: XGBoost {xgb.__version__} + - **Algorithm**: Gradient Boosted Trees + - **Training R² Score**: {train_score:.4f} + - **Test R² Score**: {test_score:.4f} + + ## Training Data + - Training samples: {len(X_train)} + - Test samples: {len(X_test)} + - Features: {X.shape[1]} (synthetic) + + ## Usage + ```python + import pickle + with open('model.pkl', 'rb') as f: + model = pickle.load(f) + predictions = model.predict(X_new) + ``` + + --- + Packaged with KitOps + """) + + # Create KFP outputs directory + os.makedirs('/tmp/kfp/outputs', exist_ok=True) + + # Write output paths for KFP + with open('/tmp/kfp/outputs/model_path', 'w') as f: + f.write(model_path) + with open('/tmp/kfp/outputs/predictions_path', 'w') as f: + f.write(predictions_path) + with open('/tmp/kfp/outputs/training_script_path', 'w') as f: + f.write(training_script_path) + with open('/tmp/kfp/outputs/readme_path', 'w') as f: + f.write(readme_path) + EOF + fileOutputs: + model_path: /tmp/kfp/outputs/model_path + predictions_path: /tmp/kfp/outputs/predictions_path + training_script_path: /tmp/kfp/outputs/training_script_path + readme_path: /tmp/kfp/outputs/readme_path diff --git a/build/dockerfiles/kubeflow-components/scripts/lib/common.sh b/build/dockerfiles/kubeflow-components/scripts/lib/common.sh new file mode 100644 index 00000000..bb5df0de --- /dev/null +++ b/build/dockerfiles/kubeflow-components/scripts/lib/common.sh @@ -0,0 +1,112 @@ +#!/bin/bash + +# Common library for workflow utilities + +# Environment variables with defaults +export LOG_LEVEL="${LOG_LEVEL:-INFO}" +export REQUEST_ID="${REQUEST_ID:-}" + +# Convert LOG_LEVEL to numeric value: DEBUG=0, INFO=1, WARN=2, ERROR=3 +case "$LOG_LEVEL" in + DEBUG) LOG_LEVEL_VALUE=0 ;; + INFO) LOG_LEVEL_VALUE=1 ;; + WARN) LOG_LEVEL_VALUE=2 ;; + ERROR) LOG_LEVEL_VALUE=3 ;; + *) LOG_LEVEL_VALUE=1 ;; # Default to INFO +esac +export LOG_LEVEL_VALUE + +# Logging functions +log_json() { + local level=$1 + local message=$2 + local extra="${3-}" + if [ -z "$extra" ]; then extra="{}"; fi + + local timestamp=$(date -u +"%Y-%m-%dT%H:%M:%SZ") + + jq -nc \ + --arg timestamp "$timestamp" \ + --arg level "$level" \ + --arg request_id "${REQUEST_ID:-}" \ + --arg message "$message" \ + --argjson extra "$extra" \ + '{timestamp: $timestamp, level: $level, request_id: $request_id, message: $message, extra: $extra}' +} + +log_debug() { + local extra="${2-}" + if [ -z "$extra" ]; then extra="{}"; fi + [ "$LOG_LEVEL_VALUE" -le 0 ] && log_json "DEBUG" "$1" "$extra" + return 0 +} + +log_info() { + local extra="${2-}" + if [ -z "$extra" ]; then extra="{}"; fi + [ "$LOG_LEVEL_VALUE" -le 1 ] && log_json "INFO" "$1" "$extra" +} + +log_warn() { + local extra="${2-}" + if [ -z "$extra" ]; then extra="{}"; fi + [ "$LOG_LEVEL_VALUE" -le 2 ] && log_json "WARN" "$1" "$extra" +} + +log_error() { + local extra="${2-}" + if [ -z "$extra" ]; then extra="{}"; fi + [ "$LOG_LEVEL_VALUE" -le 3 ] && log_json "ERROR" "$1" "$extra" >&2 +} + +# Print error message and exit +die() { + local extra="${2-}" + if [ -z "$extra" ]; then extra="{}"; fi + log_error "$1" "$extra" + exit 1 +} + +# Retry logic +retry() { + 
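+    # Usage: retry <max_attempts> <delay_seconds> <command> [args...]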
+    local max_attempts=$1
+    local delay=$2
+    shift 2
+    local attempt=1
+
+    while [ $attempt -le $max_attempts ]; do
+        log_debug "Attempting command (attempt $attempt/$max_attempts)"
+
+        if "$@"; then
+            return 0
+        fi
+
+        if [ $attempt -lt $max_attempts ]; then
+            log_warn "Command failed, retrying in ${delay}s" "{\"attempt\":$attempt}"
+            sleep $delay
+        fi
+
+        attempt=$((attempt + 1))
+    done
+
+    log_error "Command failed after $max_attempts attempts"
+    return 1
+}
+
+# Check required environment variables
+require_env() {
+    for var in "$@"; do
+        if [ -z "${!var:-}" ]; then
+            die "Required environment variable not set: $var"
+        fi
+    done
+}
+
+# Check required commands
+require_cmd() {
+    for cmd in "$@"; do
+        if ! command -v "$cmd" &> /dev/null; then
+            die "Required command not found: $cmd"
+        fi
+    done
+}
diff --git a/build/dockerfiles/kubeflow-components/scripts/push-modelkit.sh b/build/dockerfiles/kubeflow-components/scripts/push-modelkit.sh
new file mode 100755
index 00000000..fcf78409
--- /dev/null
+++ b/build/dockerfiles/kubeflow-components/scripts/push-modelkit.sh
@@ -0,0 +1,228 @@
+#!/bin/bash
+set -euo pipefail
+
+# Usage: /scripts/push-modelkit.sh <registry> <repository> <tag> --modelkit-dir <dir> [options]
+#
+# Arguments:
+#   <registry>            Container registry host (e.g., jozu.ml)
+#   <repository>          Repository path (e.g., myorg/mymodel)
+#   <tag>                 ModelKit tag
+#   --modelkit-dir <dir>  Directory with ML artifacts (with or without Kitfile)
+#
+# Options:
+#   --name <name>            ModelKit name
+#   --desc <desc>            ModelKit description
+#   --author <author>        ModelKit author
+#   --dataset-uri <uri>      Dataset URI
+#   --code-repo <repo>       Code repository
+#   --code-commit <commit>   Code commit
+#
+# Environment variables: `DOCKER_CONFIG` (path to .docker directory containing config.json)
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+source "${SCRIPT_DIR}/lib/common.sh"
+
+# Initialize variables
+REGISTRY=""
+REPOSITORY=""
+TAG=""
+MODELKIT_DIR=""
+MODELKIT_NAME=""
+MODELKIT_DESC=""
+MODELKIT_AUTHOR=""
+DATASET_URI=""
+CODE_REPO=""
+CODE_COMMIT=""
+
+# Parse arguments
+if [ $# -lt 3 ]; then
+    die "Usage: $0 <registry> <repository> <tag> --modelkit-dir <dir> [options]"
+fi
+
+# First three args are positional
+REGISTRY="$1"
+REPOSITORY="$2"
+TAG="$3"
+shift 3
+
+# Parse optional arguments
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --modelkit-dir)
+            MODELKIT_DIR="$2"
+            shift 2
+            ;;
+        --name)
+            MODELKIT_NAME="$2"
+            shift 2
+            ;;
+        --desc)
+            MODELKIT_DESC="$2"
+            shift 2
+            ;;
+        --author)
+            MODELKIT_AUTHOR="$2"
+            shift 2
+            ;;
+        --dataset-uri)
+            DATASET_URI="$2"
+            shift 2
+            ;;
+        --code-repo)
+            CODE_REPO="$2"
+            shift 2
+            ;;
+        --code-commit)
+            CODE_COMMIT="$2"
+            shift 2
+            ;;
+        *)
+            die "Unknown argument: $1"
+            ;;
+    esac
+done
+
+# Validate required arguments
+if [ -z "$MODELKIT_DIR" ]; then
+    die "Must specify --modelkit-dir"
+fi
+
+if [ ! -d "$MODELKIT_DIR" ]; then
+    die "ModelKit directory not found: $MODELKIT_DIR"
+fi
+
+# Construct ModelKit reference
+MODELKIT_REF="${REGISTRY}/${REPOSITORY}:${TAG}"
+
+log_info "Starting ModelKit push" "{\"reference\":\"$MODELKIT_REF\"}"
+
+require_cmd kit cosign jq
+
+# Disable kit update notifications
+kit version --show-update-notifications=false >/dev/null 2>&1 || true
+
+# Create output directory
+mkdir -p /tmp/outputs
+
+# Use the provided directory as working directory
+WORK_DIR="$MODELKIT_DIR"
+
+log_info "Using ModelKit directory" "{\"dir\":\"$MODELKIT_DIR\"}"
+
+# Check if a Kitfile exists; if not, run kit init
+if [ ! -f "$WORK_DIR/Kitfile" ] && [ ! -f "$WORK_DIR/kitfile" ] && [ ! -f "$WORK_DIR/.kitfile" ]; then
+    log_info "No Kitfile found, running kit init"
+
+    INIT_ARGS=()
+    [ -n "$MODELKIT_NAME" ] && INIT_ARGS+=(--name "$MODELKIT_NAME")
+    [ -n "$MODELKIT_DESC" ] && INIT_ARGS+=(--desc "$MODELKIT_DESC")
+    [ -n "$MODELKIT_AUTHOR" ] && INIT_ARGS+=(--author "$MODELKIT_AUTHOR")
+
+    kit init "$WORK_DIR" ${INIT_ARGS[@]+"${INIT_ARGS[@]}"} || die "Failed to initialize Kitfile"
+else
+    log_info "Found existing Kitfile"
+fi
+
+# Pack the ModelKit
+log_info "Packing ModelKit artifacts"
+kit pack "$WORK_DIR" -t "$MODELKIT_REF" || die "Failed to pack ModelKit"
+
+# Push to registry with retry
+log_info "Pushing to registry"
+retry 3 2 kit push "$MODELKIT_REF" || die "Failed to push ModelKit"
+
+# Fetch digest from registry
+log_debug "Fetching digest from registry"
+MODELKIT_DIGEST=""  # initialize so the emptiness check below is safe under `set -u`
+
+set +e
+INSPECT_OUTPUT=$(kit inspect "$MODELKIT_REF" --remote 2>&1)
+INSPECT_EXIT_CODE=$?
+set -e
+
+log_debug "Kit inspect completed" "{\"exit_code\":$INSPECT_EXIT_CODE}"
+
+if [ $INSPECT_EXIT_CODE -eq 0 ]; then
+    MODELKIT_DIGEST=$(echo "$INSPECT_OUTPUT" | jq -r '.digest' 2>/dev/null || echo "")
+fi
+
+if [ -z "$MODELKIT_DIGEST" ]; then
+    die "Could not determine ModelKit digest" "{\"reference\":\"$MODELKIT_REF\",\"exit_code\":$INSPECT_EXIT_CODE}"
+fi
+
+log_debug "ModelKit digest: $MODELKIT_DIGEST"
+
+# Construct full reference with digest
+FULL_REF="${REGISTRY}/${REPOSITORY}@${MODELKIT_DIGEST}"
+
+log_info "Push completed" "{\"reference\":\"$FULL_REF\"}"
+
+# Create in-toto attestation predicate
+ATTESTATION_PREDICATE=$(jq -nc \
+    --arg reference "$FULL_REF" \
+    --arg digest "$MODELKIT_DIGEST" \
+    --arg dataset_uri "$DATASET_URI" \
+    --arg code_repo "$CODE_REPO" \
+    --arg code_commit "$CODE_COMMIT" \
+    --arg timestamp "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
+    '{
+        predicateType: "https://kitops.ml/attestation/v1",
+        predicate: {
+            modelkit: {
+                reference: $reference,
+                digest: $digest
+            },
+            metadata: {
+                dataset_uri: $dataset_uri,
+                code_repo: $code_repo,
+                code_commit: $code_commit,
+                created_at: $timestamp
+            }
+        }
+    }')
+
+log_debug "Created attestation predicate"
+
+# Sign with cosign (non-fatal)
+if [ -f "/etc/cosign/cosign.key" ]; then
+    log_info "Signing and attaching attestation"
+
+    PREDICATE_FILE=$(mktemp)
+    echo "$ATTESTATION_PREDICATE" > "$PREDICATE_FILE"
+
+    if retry 3 2 cosign attest \
+        --key /etc/cosign/cosign.key \
+        --predicate "$PREDICATE_FILE" \
+        --tlog-upload=false \
+        --yes \
+        "$FULL_REF" 2>&1; then
+        log_info "Signed with cosign"
+    else
+        log_warn "Failed to sign with cosign, continuing"
+    fi
+
+    rm -f "$PREDICATE_FILE"
+else
+    log_warn "No cosign key found at /etc/cosign/cosign.key, skipping signing"
+fi
+
+# Output results
+# Write to KFP output files
+echo -n "$MODELKIT_REF" > /tmp/outputs/reference   # Tagged reference (e.g., jozu.ml/repo:tag)
+echo -n "$FULL_REF" > /tmp/outputs/digest          # Digest reference (e.g., jozu.ml/repo@sha256:...)
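+# Downstream KFP steps consume these two files as the component's `ref` and
+# `digest` outputs (mapped via fileOutputs in components/push-modelkit/component.yaml).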
+
+# Output JSON to stdout
+jq -n \
+    --arg reference "$FULL_REF" \
+    --arg digest "$MODELKIT_DIGEST" \
+    --arg timestamp "$(date -u +"%Y-%m-%dT%H:%M:%SZ")" \
+    '{
+        "reference": $reference,
+        "digest": $digest,
+        "timestamp": $timestamp,
+        "status": "success"
+    }'
+
+log_info "Push workflow completed"
diff --git a/build/dockerfiles/kubeflow-components/scripts/unpack-modelkit.sh b/build/dockerfiles/kubeflow-components/scripts/unpack-modelkit.sh
new file mode 100755
index 00000000..c8f4269a
--- /dev/null
+++ b/build/dockerfiles/kubeflow-components/scripts/unpack-modelkit.sh
@@ -0,0 +1,53 @@
+#!/bin/bash
+set -euo pipefail
+
+# Usage: /scripts/unpack-modelkit.sh <modelkit_reference> [extract_path]
+# Environment variables: `DOCKER_CONFIG` (path to .docker directory containing config.json)
+# Unpacks ModelKit artifacts to a directory
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+source "${SCRIPT_DIR}/lib/common.sh"
+
+# Validate arguments
+if [ $# -lt 1 ]; then
+    die "Usage: $0 <modelkit_reference> [extract_path]"
+fi
+
+MODELKIT_REF="$1"
+EXTRACT_PATH="${2:-/tmp/model}"
+
+log_info "Starting unpack" "{\"modelkit_reference\":\"$MODELKIT_REF\",\"extract_path\":\"$EXTRACT_PATH\"}"
+
+require_cmd kit jq
+require_env DOCKER_CONFIG
+
+# Disable kit update notifications to keep output clean
+kit version --show-update-notifications=false >/dev/null 2>&1 || true
+
+# Create output directory
+mkdir -p /tmp/outputs
+mkdir -p "$EXTRACT_PATH"
+
+# Unpack ModelKit with retry
+log_info "Unpacking"
+retry 3 2 kit unpack "$MODELKIT_REF" -d "$EXTRACT_PATH" || die "Failed to unpack ModelKit"
+
+log_info "Unpacked successfully" "{\"path\":\"$EXTRACT_PATH\"}"
+
+# Output results
+# Write to KFP output file
+echo -n "$EXTRACT_PATH" > /tmp/outputs/model_path
+
+# Output JSON to stdout
+jq -n \
+    --arg path "$EXTRACT_PATH" \
+    --arg reference "$MODELKIT_REF" \
+    --arg timestamp "$(date -u +"%Y-%m-%dT%H:%M:%SZ")" \
+    '{
+        "model_path": $path,
+        "modelkit_reference": $reference,
+        "timestamp": $timestamp,
+        "status": "success"
+    }'
+
+log_info "Unpack workflow completed"
diff --git a/build/dockerfiles/kubeflow-components/tests/push-modelkit.bats b/build/dockerfiles/kubeflow-components/tests/push-modelkit.bats
new file mode 100755
index 00000000..5f1fd5c3
--- /dev/null
+++ b/build/dockerfiles/kubeflow-components/tests/push-modelkit.bats
@@ -0,0 +1,330 @@
+#!/usr/bin/env bats
+
+# Path to the script under test
+SCRIPT_PATH="${BATS_TEST_DIRNAME}/../scripts/push-modelkit.sh"
+
+setup() {
+  # Create temporary directory for tests
+  export TEST_DIR="$(mktemp -d)"
+  export MODEL_DIR="$TEST_DIR/model"
+  export OUTPUT_DIR="/tmp/outputs"
+  export LOG_LEVEL="INFO"
+  export REQUEST_ID="test-push-modelkit"
+  export DOCKER_CONFIG="$TEST_DIR/.docker"
+
+  # Create mock model directory with Kitfile
+  mkdir -p "$MODEL_DIR"
+  echo "mock model content" > "$MODEL_DIR/model.bin"
+  cat > "$MODEL_DIR/Kitfile" << 'KITFILEEOF'
+manifestVersion: 1.0
+package:
+  name: test-model
+model:
+  path: model.bin
+KITFILEEOF
+
+  # Create mock docker config; the auth value is base64("MOCK_USER:MOCK_PASSWORD")
+  mkdir -p "$DOCKER_CONFIG"
+  cat > "$DOCKER_CONFIG/config.json" << 'DOCKEREOF'
+{"auths":{"registry.io":{"auth":"TU9DS19VU0VSOk1PQ0tfUEFTU1dPUkQ="}}}
+DOCKEREOF
+
+
+  # Create output directory
+  mkdir -p "$OUTPUT_DIR"
+
+  # Mock commands
+  export PATH="$TEST_DIR/bin:$PATH"
+  mkdir -p "$TEST_DIR/bin"
+
+  # Create mock kit command
+  cat > "$TEST_DIR/bin/kit" << 'EOF'
+#!/bin/bash
+# Mock kit command for testing
+
+if [[ "$1" == "version" ]]; then
+  echo "kitops version 
v1.0.0" + exit 0 +fi + +if [[ "$1" == "init" ]]; then + # Mock kit init - create a basic Kitfile + # Handle: kit init [--name NAME] [--desc DESC] [--author AUTHOR] + dir="$2" + shift 2 + # Consume optional flags + while [[ $# -gt 0 ]]; do + case $1 in + --name|--desc|--author) + shift 2 # Skip flag and value + ;; + *) + shift + ;; + esac + done + cat > "$dir/Kitfile" << 'INITEOF' +manifestVersion: 1.0 +package: + name: auto-generated +model: + path: model.bin +INITEOF + exit 0 +fi + +if [[ "$1" == "pack" ]]; then + # Mock pack output + echo "Packing model..." + exit 0 +fi + +if [[ "$1" == "push" ]]; then + # Mock push output with digest + echo "Pushed to registry" + echo "Digest: sha256:abc123def456789012345678901234567890123456789012345678901234" + exit 0 +fi + +if [[ "$1" == "inspect" ]]; then + # Mock inspect output + cat << 'INSPECTEOF' +{"digest":"sha256:abc123def456789012345678901234567890123456789012345678901234"} +INSPECTEOF + exit 0 +fi + +exit 1 +EOF + chmod +x "$TEST_DIR/bin/kit" + + # Create mock cosign command + cat > "$TEST_DIR/bin/cosign" << 'EOF' +#!/bin/bash +if [[ "$1" == "attest" ]]; then + echo "Signing attestation..." + exit 0 +fi +exit 1 +EOF + chmod +x "$TEST_DIR/bin/cosign" + + # Create mock jq command + cat > "$TEST_DIR/bin/jq" << 'EOF' +#!/bin/bash +exec /usr/bin/jq "$@" +EOF + chmod +x "$TEST_DIR/bin/jq" + + # Create failing kit command for error tests + cat > "$TEST_DIR/bin/kit-fail" << 'EOF' +#!/bin/bash +exit 1 +EOF + chmod +x "$TEST_DIR/bin/kit-fail" +} + +teardown() { + # Clean up temporary directory + rm -rf "$TEST_DIR" + rm -rf "$OUTPUT_DIR" + unset MODEL_DIR + unset OUTPUT_DIR + unset LOG_LEVEL + unset REQUEST_ID + unset DOCKER_CONFIG +} + +# Argument validation tests + +@test "fails when no arguments provided" { + run bash "$SCRIPT_PATH" + [ "$status" -eq 1 ] + [[ "$output" =~ "Usage:" ]] +} + +@test "fails when only registry provided" { + run bash "$SCRIPT_PATH" "registry.io" + [ "$status" -eq 1 ] + [[ "$output" =~ "Usage:" ]] +} + +@test "fails when only registry and repository provided" { + run bash "$SCRIPT_PATH" "registry.io" "myorg/mymodel" + [ "$status" -eq 1 ] + [[ "$output" =~ "Usage:" ]] +} + +@test "fails when no modelkit-dir specified" { + run bash "$SCRIPT_PATH" "registry.io" "myorg/mymodel" "v1" + [ "$status" -eq 1 ] + [[ "$output" =~ "Must specify --modelkit-dir" ]] +} + +# Directory mode tests + +@test "successfully packs and pushes from directory with Kitfile" { + run bash "$SCRIPT_PATH" "registry.io" "myorg/mymodel" "v1" --modelkit-dir "$MODEL_DIR" + [ "$status" -eq 0 ] + [[ "$output" =~ "Using ModelKit directory" ]] + [[ "$output" =~ "Packing ModelKit" ]] + [[ "$output" =~ "Pushing to registry" ]] +} + +@test "runs kit init when no Kitfile present" { + # Remove Kitfile + rm "$MODEL_DIR/Kitfile" + + run bash "$SCRIPT_PATH" "registry.io" "myorg/mymodel" "v1" --modelkit-dir "$MODEL_DIR" + [ "$status" -eq 0 ] + [[ "$output" =~ "No Kitfile found" ]] + [[ "$output" =~ "running kit init" ]] +} + +@test "recognizes lowercase kitfile" { + # Replace Kitfile with lowercase kitfile + mv "$MODEL_DIR/Kitfile" "$MODEL_DIR/kitfile" + + run bash "$SCRIPT_PATH" "registry.io" "myorg/mymodel" "v1" --modelkit-dir "$MODEL_DIR" + [ "$status" -eq 0 ] + [[ ! 
"$output" =~ "No Kitfile found" ]] + [[ "$output" =~ "Packing ModelKit" ]] +} + +@test "recognizes dotfile .kitfile" { + # Replace Kitfile with .kitfile + mv "$MODEL_DIR/Kitfile" "$MODEL_DIR/.kitfile" + + run bash "$SCRIPT_PATH" "registry.io" "myorg/mymodel" "v1" --modelkit-dir "$MODEL_DIR" + [ "$status" -eq 0 ] + [[ ! "$output" =~ "No Kitfile found" ]] + [[ "$output" =~ "Packing ModelKit" ]] +} + +@test "fails when directory does not exist" { + run bash "$SCRIPT_PATH" "registry.io" "myorg/mymodel" "v1" --modelkit-dir "/nonexistent" + [ "$status" -eq 1 ] + [[ "$output" =~ "ModelKit directory not found" ]] +} + +@test "handles directory with spaces" { + model_with_spaces="$TEST_DIR/model with spaces" + mkdir -p "$model_with_spaces" + echo "mock" > "$model_with_spaces/model.bin" + cat > "$model_with_spaces/Kitfile" << 'EOF' +manifestVersion: 1.0 +model: + path: model.bin +EOF + + run bash "$SCRIPT_PATH" "registry.io" "myorg/mymodel" "v1" --modelkit-dir "$model_with_spaces" + [ "$status" -eq 0 ] +} + +@test "passes metadata to kit init when no Kitfile exists" { + # Remove Kitfile + rm "$MODEL_DIR/Kitfile" + + run bash "$SCRIPT_PATH" "registry.io" "myorg/mymodel" "v1" \ + --modelkit-dir "$MODEL_DIR" \ + --name "My Model" \ + --desc "Test model" \ + --author "Test Author" + [ "$status" -eq 0 ] + [[ "$output" =~ "running kit init" ]] +} + +# Output validation tests + +@test "creates output files in /tmp/outputs" { + run bash "$SCRIPT_PATH" "registry.io" "myorg/mymodel" "v1" --modelkit-dir "$MODEL_DIR" + [ "$status" -eq 0 ] + [ -f "$OUTPUT_DIR/reference" ] + [ -f "$OUTPUT_DIR/digest" ] +} + +@test "output files contain correct values" { + run bash "$SCRIPT_PATH" "registry.io" "myorg/mymodel" "v1" --modelkit-dir "$MODEL_DIR" + [ "$status" -eq 0 ] + + ref_content=$(cat "$OUTPUT_DIR/reference") + digest_content=$(cat "$OUTPUT_DIR/digest") + + [[ "$ref_content" == "registry.io/myorg/mymodel:v1" ]] + [[ "$digest_content" =~ registry.io/myorg/mymodel@sha256:abc123def456 ]] +} + +@test "returns valid JSON output" { + run bash "$SCRIPT_PATH" "registry.io" "myorg/mymodel" "v1" --modelkit-dir "$MODEL_DIR" + [ "$status" -eq 0 ] + + # Extract final JSON output + json_output=$(echo "$output" | awk '/^{$/,/^}$/' | jq -s '.[] | select(.status != null)') + echo "$json_output" | jq -e '.reference' + echo "$json_output" | jq -e '.digest' + echo "$json_output" | jq -e '.status == "success"' +} + +# Attestation metadata tests + +@test "accepts attestation metadata flags" { + run bash "$SCRIPT_PATH" "registry.io" "myorg/mymodel" "v1" \ + --modelkit-dir "$MODEL_DIR" \ + --dataset-uri "s3://bucket/data" \ + --code-repo "github.com/org/repo" \ + --code-commit "abc123" + [ "$status" -eq 0 ] +} + +# Cosign signing tests + +@test "signs with cosign when key exists" { + mkdir -p /tmp/etc/cosign + echo "mock-key" > /tmp/etc/cosign/cosign.key + + run bash "$SCRIPT_PATH" "registry.io" "myorg/mymodel" "v1" --modelkit-dir "$MODEL_DIR" + [ "$status" -eq 0 ] + + rm -rf /tmp/etc/cosign +} + +@test "warns when cosign key not found" { + run bash "$SCRIPT_PATH" "registry.io" "myorg/mymodel" "v1" --modelkit-dir "$MODEL_DIR" + [ "$status" -eq 0 ] + [[ "$output" =~ "No cosign key found" ]] || [[ "$output" =~ "skipping signing" ]] +} + +# Error handling tests + +@test "fails when kit command is not found" { + export PATH="/usr/bin:/bin" + run bash "$SCRIPT_PATH" "registry.io" "myorg/mymodel" "v1" --modelkit-dir "$MODEL_DIR" + [ "$status" -eq 1 ] + [[ "$output" =~ "Required command not found: kit" ]] +} + +@test "retries on push failure 
and eventually fails" { + # Replace kit with failing version + mv "$TEST_DIR/bin/kit" "$TEST_DIR/bin/kit.bak" + mv "$TEST_DIR/bin/kit-fail" "$TEST_DIR/bin/kit" + + run bash "$SCRIPT_PATH" "registry.io" "myorg/mymodel" "v1" --modelkit-dir "$MODEL_DIR" + [ "$status" -eq 1 ] +} + +# Edge case tests + +@test "handles registry with port number" { + run bash "$SCRIPT_PATH" "registry.io:5000" "myorg/mymodel" "v1" --modelkit-dir "$MODEL_DIR" + [ "$status" -eq 0 ] +} + +@test "handles repository with nested path" { + run bash "$SCRIPT_PATH" "registry.io" "myorg/team/project/mymodel" "v1" --modelkit-dir "$MODEL_DIR" + [ "$status" -eq 0 ] +} + +@test "handles tag with special characters" { + run bash "$SCRIPT_PATH" "registry.io" "myorg/mymodel" "v1.0.0-rc1" --modelkit-dir "$MODEL_DIR" + [ "$status" -eq 0 ] +} diff --git a/build/dockerfiles/kubeflow-components/tests/unpack-modelkit.bats b/build/dockerfiles/kubeflow-components/tests/unpack-modelkit.bats new file mode 100755 index 00000000..ec7e2d5f --- /dev/null +++ b/build/dockerfiles/kubeflow-components/tests/unpack-modelkit.bats @@ -0,0 +1,188 @@ +#!/usr/bin/env bats + +# Path to the script under test +SCRIPT_PATH="${BATS_TEST_DIRNAME}/../scripts/unpack-modelkit.sh" + +setup() { + # Create temporary directory for tests + export TEST_DIR="$(mktemp -d)" + export EXTRACT_DIR="$TEST_DIR/extract" + export OUTPUT_DIR="/tmp/outputs" + export LOG_LEVEL="INFO" + export REQUEST_ID="test-unpack-modelkit" + export DOCKER_CONFIG="$TEST_DIR/.docker" + + # Create mock docker config + mkdir -p "$DOCKER_CONFIG" + cat > "$DOCKER_CONFIG/config.json" << 'DOCKEREOF' +{"auths":{"registry.io":{"auth":"TU9DS19VU0VSOk1PQ0tfUEFTU1dPUkQ="}}} # base64("MOCK_USER:MOCK_PASSWORD") +DOCKEREOF + + + # Create output directory + mkdir -p "$OUTPUT_DIR" + + # Mock commands + export PATH="$TEST_DIR/bin:$PATH" + mkdir -p "$TEST_DIR/bin" + + # Create mock kit command + cat > "$TEST_DIR/bin/kit" << 'EOF' +#!/bin/bash +# Mock kit command for testing + +if [[ "$1" == "version" ]]; then + echo "kitops version v1.0.0" + exit 0 +fi + +if [[ "$1" == "unpack" ]]; then + reference="$2" + # Parse -d flag for directory + shift 2 + while [[ $# -gt 0 ]]; do + case $1 in + -d) + dir="$2" + shift 2 + ;; + *) + shift + ;; + esac + done + + # Create mock unpacked content + mkdir -p "$dir" + echo "mock model content" > "$dir/model.bin" + echo "mock kitfile" > "$dir/Kitfile" + exit 0 +fi + +exit 1 +EOF + chmod +x "$TEST_DIR/bin/kit" + + # Create mock jq command + cat > "$TEST_DIR/bin/jq" << 'EOF' +#!/bin/bash +# Forward to real jq +exec /usr/bin/jq "$@" +EOF + chmod +x "$TEST_DIR/bin/jq" + + # Create failing kit command for error tests + cat > "$TEST_DIR/bin/kit-fail" << 'EOF' +#!/bin/bash +exit 1 +EOF + chmod +x "$TEST_DIR/bin/kit-fail" +} + +teardown() { + # Clean up temporary directory + rm -rf "$TEST_DIR" + rm -rf "$OUTPUT_DIR" + unset EXTRACT_DIR + unset OUTPUT_DIR + unset LOG_LEVEL + unset REQUEST_ID + unset DOCKER_CONFIG +} + +# Argument validation tests + +@test "fails when no arguments provided" { + run bash "$SCRIPT_PATH" + [ "$status" -eq 1 ] + [[ "$output" =~ "Usage:" ]] + [[ "$output" =~ "modelkit_reference" ]] +} + +@test "succeeds with only modelkit_reference (uses default extract path)" { + run bash "$SCRIPT_PATH" "registry.io/myorg/mymodel:v1" + [ "$status" -eq 0 ] + [[ "$output" =~ "Unpack workflow completed" ]] +} + +# ModelKit unpack tests + +@test "successfully unpacks modelkit to specified path" { + run bash "$SCRIPT_PATH" "registry.io/myorg/mymodel:v1" "$EXTRACT_DIR" + [ "$status" -eq 
0 ] + [[ "$output" =~ "Starting unpack" ]] + [[ "$output" =~ "Unpacking" ]] + [ -f "$EXTRACT_DIR/model.bin" ] +} + +@test "creates output file in /tmp/outputs" { + run bash "$SCRIPT_PATH" "registry.io/myorg/mymodel:v1" "$EXTRACT_DIR" + [ "$status" -eq 0 ] + [ -f "$OUTPUT_DIR/model_path" ] +} + +@test "output file contains correct value" { + run bash "$SCRIPT_PATH" "registry.io/myorg/mymodel:v1" "$EXTRACT_DIR" + [ "$status" -eq 0 ] + + path_content=$(cat "$OUTPUT_DIR/model_path") + [[ "$path_content" == "$EXTRACT_DIR" ]] +} + +@test "returns valid JSON output" { + run bash "$SCRIPT_PATH" "registry.io/myorg/mymodel:v1" "$EXTRACT_DIR" + [ "$status" -eq 0 ] + + # Extract final JSON output (the one with "status" field) + json_output=$(echo "$output" | awk '/^{$/,/^}$/' | jq -s '.[] | select(.status != null)') + echo "$json_output" | jq -e '.model_path' + echo "$json_output" | jq -e '.modelkit_reference' + echo "$json_output" | jq -e '.status == "success"' +} + +# Error handling tests + +@test "fails when kit command is not found" { + export PATH="/usr/bin:/bin" + run bash "$SCRIPT_PATH" "registry.io/myorg/mymodel:v1" "$EXTRACT_DIR" + [ "$status" -eq 1 ] + [[ "$output" =~ "Required command not found: kit" ]] +} + +@test "fails when DOCKER_CONFIG not set" { + unset DOCKER_CONFIG + run bash "$SCRIPT_PATH" "registry.io/myorg/mymodel:v1" "$EXTRACT_DIR" + [ "$status" -eq 1 ] + [[ "$output" =~ "Required environment variable not set: DOCKER_CONFIG" ]] +} + +@test "retries on unpack failure and eventually fails" { + # Replace kit with failing version + mv "$TEST_DIR/bin/kit" "$TEST_DIR/bin/kit.bak" + mv "$TEST_DIR/bin/kit-fail" "$TEST_DIR/bin/kit" + + run bash "$SCRIPT_PATH" "registry.io/myorg/mymodel:v1" "$EXTRACT_DIR" + [ "$status" -eq 1 ] +} + +# Edge case tests + +@test "handles paths with spaces" { + extract_with_spaces="$TEST_DIR/extract with spaces" + + run bash "$SCRIPT_PATH" "registry.io/myorg/mymodel:v1" "$extract_with_spaces" + [ "$status" -eq 0 ] + [ -d "$extract_with_spaces" ] + [ -f "$extract_with_spaces/model.bin" ] +} + +# Integration tests + +@test "creates extract directory if it does not exist" { + nonexistent="$TEST_DIR/nonexistent/deep/path" + + run bash "$SCRIPT_PATH" "registry.io/myorg/mymodel:v1" "$nonexistent" + [ "$status" -eq 0 ] + [ -d "$nonexistent" ] + [ -f "$nonexistent/model.bin" ] +}
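+
+# Additional check (a sketch): the script passes the reference straight through
+# to `kit unpack`, so a digest-based reference should behave exactly like a tag.
+@test "handles digest-based reference" {
+  run bash "$SCRIPT_PATH" "registry.io/myorg/mymodel@sha256:abc123" "$EXTRACT_DIR"
+  [ "$status" -eq 0 ]
+  [ -f "$EXTRACT_DIR/model.bin" ]
+}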