diff --git a/standalone/README.md b/standalone/README.md index f4c351f4..5e2ea9c0 100644 --- a/standalone/README.md +++ b/standalone/README.md @@ -77,15 +77,28 @@ The script requires information regarding the location and method for accessing * `--namespace`: The namespace in which the Kubernetes resources are located - **Required** * `--storage-class`: The storage class to use for the PVCs - **Optional** - Default: cluster default storage class. * `--nproc-per-node`: The number of processes to run per node - **Optional** - Default: 1. -* `--sdg-object-store-secret`: The name of the Kubernetes secret containing the SDG object store credentials. -* `--sdg-object-store-endpoint`: The endpoint of the object store. `SDG_OBJECT_STORE_ENDPOINT` environment variable can be used as well. -* `--sdg-object-store-bucket`: The bucket name in the object store. `SDG_OBJECT_STORE_BUCKET` environment variable can be used as well. -* `--sdg-object-store-access-key`: The access key for the object store. `SDG_OBJECT_STORE_ACCESS_KEY` environment variable can be used as well. -* `--sdg-object-store-secret-key`: The secret key for the object store. `SDG_OBJECT_STORE_SECRET_KEY` environment variable can be used as well. -* `--sdg-object-store-data-key`: The key for the SDG data in the object store. e.g., `sdg.tar.gz`. `SDG_OBJECT_STORE_DATA_KEY` environment variable can be used as well. +* `--sdg-object-store-secret`: The name of the Kubernetes secret containing the SDG object store + credentials. **Optional** - If not provided, the script will expect the provided CLI options to fetch the SDG data. +* `--sdg-object-store-endpoint`: The endpoint of the object store. `SDG_OBJECT_STORE_ENDPOINT` + environment variable can be used as well. **Optional** +* `--sdg-object-store-bucket`: The bucket name in the object store. `SDG_OBJECT_STORE_BUCKET` + environment variable can be used as well. **Required** - If `--sdg-object-store-secret` is not provided. 
+* `--sdg-object-store-access-key`: The access key for the object store. + `SDG_OBJECT_STORE_ACCESS_KEY` environment variable can be used as well. **Required** - If `--sdg-object-store-secret` is not provided. +* `--sdg-object-store-secret-key`: The secret key for the object store. + `SDG_OBJECT_STORE_SECRET_KEY` environment variable can be used as well. **Required** - If `--sdg-object-store-secret` is not provided. +* `--sdg-object-store-data-key`: The key for the SDG data in the object store. e.g., + `sdg.tar.gz`. `SDG_OBJECT_STORE_DATA_KEY` environment variable can be used as well. **Required** - If `--sdg-object-store-secret` is not provided. * `--sdg-object-store-verify-tls`: Whether to verify TLS for the object store endpoint (default: - true). `SDG_OBJECT_STORE_VERIFY_TLS` environment variable can be used as well. -* `--sdg-object-store-region`: The region of the object store. `SDG_OBJECT_STORE_REGION` environment variable can be used as well. + true). `SDG_OBJECT_STORE_VERIFY_TLS` environment variable can be used as well. **Optional** +* `--sdg-object-store-region`: The region of the object store. `SDG_OBJECT_STORE_REGION` environment + variable can be used as well. **Optional** +* `--eval-serving-endpoint`: Serving endpoint for evaluation. e.g: + http://serving.kubeflow.svc.cluster.local:8080/v1 - **Required** +* `--eval-serving-model-name`: The name of the model to use for evaluation. **Required** +* `--eval-serving-model-api-key`: The API key for the model to evaluate. `EVAL_SERVING_MODEL_API_KEY` + environment variable can be used as well. 
**Required** + ## Example End-To-End Workflow @@ -145,7 +158,12 @@ stringData: data_key: sdg.tar.gz EOF -./standalone run --namespace my-namespace --sdg-object-store-secret sdg-data +./standalone run \ + --namespace my-namespace \ + --eval-serving-endpoint http://serving.kubeflow.svc.cluster.local:8080/v1 \ + --eval-serving-model-name my-model \ + --eval-serving-model-api-key ***** \ + --sdg-object-store-secret sdg-data ``` > [!WARNING] @@ -162,6 +180,13 @@ The list of all supported keys: * `endpoint`: The endpoint of the object store, e.g: https://s3.openshift-storage.svc:443 - **Optional** * `region`: The region of the object store - **Optional** +> [!NOTE] +> The `--eval-serving-endpoint` and `--eval-serving-model-name` values will be stored in a ConfigMap +> named `eval-serving-details` in the same namespace as the resources that the script interacts +> with. (in this case, `my-namespace`) +> The `--eval-serving-model-api-key` value will be stored in a secret named `eval-serving-details` +> in the same namespace as the resources that the script interacts with. 
(in this case, `my-namespace`) + #### Running the Script Without Kubernetes Secret Alternatively, you can provide the necessary information directly via CLI options or environment, @@ -172,6 +197,9 @@ Secret named `sdg-object-store-credentials` in the same namespace as the resourc ```bash ./standalone run \ --namespace my-namespace \ + --eval-serving-endpoint http://serving.kubeflow.svc.cluster.local:8080/v1 \ + --eval-serving-model-name my-model \ + --eval-serving-model-api-key ***** \ --sdg-object-store-access-key key \ --sdg-object-store-secret-key key \ --sdg-object-store-bucket sdg-data \ @@ -184,7 +212,10 @@ If you don't use the official AWS S3 endpoint, you can provide additional inform ```bash ./standalone run \ - --namespace foo \ + --namespace my-namespace \ + --eval-serving-endpoint http://serving.kubeflow.svc.cluster.local:8080/v1 \ + --eval-serving-model-name my-model \ + --eval-serving-model-api-key ***** \ --sdg-object-store-access-key key \ --sdg-object-store-secret-key key \ --sdg-object-store-bucket sdg-data \ diff --git a/standalone/standalone.py b/standalone/standalone.py index 0a6ec622..f1c7484a 100755 --- a/standalone/standalone.py +++ b/standalone/standalone.py @@ -81,6 +81,25 @@ """ +EVAL_SERVING_NAME = "eval-serving-details" +EVAL_SERVING_DETAILS = """ +kind: ConfigMap +apiVersion: v1 +metadata: + name: {EVAL_SERVING_NAME} +data: + endpoint: {eval_serving_endpoint} + model: {eval_serving_model_name} +--- +apiVersion: v1 +kind: Secret +metadata: + name: {EVAL_SERVING_NAME} +type: Opaque +stringData: + api_key: {eval_serving_model_api_key} +""" + PYTORCH_TRAINING_JOB = """ apiVersion: kubeflow.org/v1 kind: PyTorchJob @@ -464,6 +483,30 @@ def show( help="Serving model for SDG - for SDG only", hidden=True, ) +@click.option( + "--eval-serving-endpoint", + type=str, + help=( + "Serving endpoint for evaluation." + "e.g. 
http://serving.kubeflow.svc.cluster.local:8080/v1" + ), + required=True, +) +@click.option( + "--eval-serving-model-name", + type=str, + help="The name of the model to use for evaluation.", + required=True, +) +@click.option( + "--eval-serving-model-api-key", + type=str, + help=( + "Serving model API key for evaluation. " "(EVAL_SERVING_MODEL_API_KEY env var)" + ), + envvar="EVAL_SERVING_MODEL_API_KEY", + required=True, +) @click.option( "--nproc-per-node", type=int, @@ -561,6 +604,9 @@ def run( storage_class: typing.Optional[str] = "standard", serving_endpoint: typing.Optional[str] = None, serving_model: typing.Optional[str] = None, + eval_serving_endpoint: typing.Optional[str] = None, + eval_serving_model_name: typing.Optional[str] = None, + eval_serving_model_api_key: typing.Optional[str] = None, nproc_per_node: typing.Optional[int] = 1, eval_type: typing.Optional[str] = None, training_phase: typing.Optional[str] = None, @@ -585,6 +631,9 @@ def run( storage_class (str): The storage class to use for the PersistentVolumeClaim. For SDG only. serving_endpoint (str): The serving endpoint for SDG. For SDG only. serving_model (str): The serving model for SDG. For SDG only. + eval_serving_endpoint (str): The serving endpoint for evaluation. For Evaluation only. + eval_serving_model_name (str): The serving model name for evaluation. For Evaluation only. + eval_serving_model_api_key (str): The serving model API key for evaluation. For Evaluation only. nproc_per_node (int): The number of processes per node. For training only. eval_type (str): The type of evaluation to run. training_phase (str): The type of training phase to run. 
@@ -609,6 +658,9 @@ def run( ctx.obj["storage_class"] = storage_class ctx.obj["serving_endpoint"] = serving_endpoint ctx.obj["serving_model"] = serving_model + ctx.obj["eval_serving_endpoint"] = eval_serving_endpoint + ctx.obj["eval_serving_model_name"] = eval_serving_model_name + ctx.obj["eval_serving_model_api_key"] = eval_serving_model_api_key ctx.obj["nproc_per_node"] = nproc_per_node ctx.obj["eval_type"] = eval_type ctx.obj["training_phase"] = training_phase @@ -1289,6 +1341,18 @@ def run_mt_bench_op( name=TRAINING_VOLUME_NAME, mount_path=TRAINING_PVC_MOUNT_PATH ), ], + env_from=[ + kubernetes.client.V1EnvFromSource( + config_map_ref=kubernetes.client.V1ConfigMapEnvSource( + name=EVAL_SERVING_NAME + ) + ), + kubernetes.client.V1EnvFromSource( + secret_ref=kubernetes.client.V1SecretEnvSource( + name=EVAL_SERVING_NAME + ) + ), + ], ) ] container = kubernetes.client.V1Container( @@ -1602,6 +1666,9 @@ def sdg_data_fetch( # Populate variables from context namespace = ctx.obj["namespace"] storage_class = ctx.obj["storage_class"] + eval_serving_endpoint = ctx.obj["eval_serving_endpoint"] + eval_serving_model_name = ctx.obj["eval_serving_model_name"] + eval_serving_model_api_key = ctx.obj["eval_serving_model_api_key"] sdg_object_store_endpoint = ctx.obj["sdg_object_store_endpoint"] sdg_object_store_bucket = ctx.obj["sdg_object_store_bucket"] sdg_object_store_access_key = ctx.obj["sdg_object_store_access_key"] @@ -1611,6 +1678,9 @@ def sdg_data_fetch( sdg_object_store_verify_tls = ctx.obj["sdg_object_store_verify_tls"] sdg_object_store_secret = ctx.obj["sdg_object_store_secret"] + # Make sure the endpoint is a valid URL + validate_url(eval_serving_endpoint) + # Check if all required arguments are provided if not sdg_object_store_secret: if not all( @@ -1704,6 +1774,33 @@ def decode_base64(data): "'bucket', 'access_key', 'secret_key', 'data_key'.", ) + # Create config map/secret with api_key, serving endpoint for evaluation + cms = list( + yaml.safe_load_all( + 
EVAL_SERVING_DETAILS.format( + eval_serving_endpoint=eval_serving_endpoint, + eval_serving_model_name=eval_serving_model_name, + eval_serving_model_api_key=eval_serving_model_api_key, + ) + ) + ) + for cm in cms: + try: + # if this is a ConfigMap + kind = cm["kind"] + if kind == "ConfigMap": + v1.create_namespaced_config_map(namespace=namespace, body=cm) + logger.info("Successfully created %s '%s' created.", kind, cm) + elif kind == "Secret": + # if this is a Secret + v1.create_namespaced_secret(namespace=namespace, body=cm) + logger.info("Successfully created %s '%s' created.", kind, cm) + except kubernetes.client.rest.ApiException as exc: + if exc.status == 409: + logger.info("%s '%s' already exists.", kind, cm["metadata"]["name"]) + else: + raise + # list of PVCs to create and their details pvcs = [ { @@ -1711,21 +1808,21 @@ def decode_base64(data): "namespace": namespace, "storage_class": storage_class, "access_modes": ["ReadWriteOnce"], - "size": "1Gi", + "size": "10Gi", # SDG Data set can be big so let's go with a safe size }, { "name": MODEL_PVC_NAME, "namespace": namespace, "storage_class": storage_class, "access_modes": ["ReadWriteOnce"], - "size": "50Gi", + "size": "100Gi", # Model can be big so let's go with a safe size }, { "name": TRAINING_PVC_NAME, "namespace": namespace, "storage_class": storage_class, "access_modes": ["ReadWriteMany"], - "size": "50Gi", + "size": "100Gi", # Training data can be big so let's go with a safe size }, ] for pvc in pvcs: diff --git a/standalone/standalone.tpl b/standalone/standalone.tpl index e8518410..b0d50424 100755 --- a/standalone/standalone.tpl +++ b/standalone/standalone.tpl @@ -66,6 +66,25 @@ KFP_MODEL_SERVER_CM = """ {{kfp_model_server_cm}} """ +EVAL_SERVING_NAME = "eval-serving-details" +EVAL_SERVING_DETAILS = """ +kind: ConfigMap +apiVersion: v1 +metadata: + name: {EVAL_SERVING_NAME} +data: + endpoint: {eval_serving_endpoint} + model: {eval_serving_model_name} +--- +apiVersion: v1 +kind: Secret +metadata: + 
name: {EVAL_SERVING_NAME} +type: Opaque +stringData: + api_key: {eval_serving_model_api_key} +""" + PYTORCH_TRAINING_JOB = """ apiVersion: kubeflow.org/v1 kind: PyTorchJob @@ -449,6 +468,30 @@ def show( help="Serving model for SDG - for SDG only", hidden=True, ) +@click.option( + "--eval-serving-endpoint", + type=str, + help=( + "Serving endpoint for evaluation." + "e.g. http://serving.kubeflow.svc.cluster.local:8080/v1" + ), + required=True, +) +@click.option( + "--eval-serving-model-name", + type=str, + help="The name of the model to use for evaluation.", + required=True, +) +@click.option( + "--eval-serving-model-api-key", + type=str, + help=( + "Serving model API key for evaluation. " "(EVAL_SERVING_MODEL_API_KEY env var)" + ), + envvar="EVAL_SERVING_MODEL_API_KEY", + required=True, +) @click.option( "--nproc-per-node", type=int, @@ -546,6 +589,9 @@ def run( storage_class: typing.Optional[str] = "standard", serving_endpoint: typing.Optional[str] = None, serving_model: typing.Optional[str] = None, + eval_serving_endpoint: typing.Optional[str] = None, + eval_serving_model_name: typing.Optional[str] = None, + eval_serving_model_api_key: typing.Optional[str] = None, nproc_per_node: typing.Optional[int] = 1, eval_type: typing.Optional[str] = None, training_phase: typing.Optional[str] = None, @@ -570,6 +616,9 @@ def run( storage_class (str): The storage class to use for the PersistentVolumeClaim. For SDG only. serving_endpoint (str): The serving endpoint for SDG. For SDG only. serving_model (str): The serving model for SDG. For SDG only. + eval_serving_endpoint (str): The serving endpoint for evaluation. For Evaluation only. + eval_serving_model_name (str): The serving model name for evaluation. For Evaluation only. + eval_serving_model_api_key (str): The serving model API key for evaluation. For Evaluation only. nproc_per_node (int): The number of processes per node. For training only. eval_type (str): The type of evaluation to run. 
training_phase (str): The type of training phase to run. @@ -594,6 +643,9 @@ def run( ctx.obj["storage_class"] = storage_class ctx.obj["serving_endpoint"] = serving_endpoint ctx.obj["serving_model"] = serving_model + ctx.obj["eval_serving_endpoint"] = eval_serving_endpoint + ctx.obj["eval_serving_model_name"] = eval_serving_model_name + ctx.obj["eval_serving_model_api_key"] = eval_serving_model_api_key ctx.obj["nproc_per_node"] = nproc_per_node ctx.obj["eval_type"] = eval_type ctx.obj["training_phase"] = training_phase @@ -1060,6 +1112,18 @@ def create_eval_job( name=TRAINING_VOLUME_NAME, mount_path=TRAINING_PVC_MOUNT_PATH ), ], + env_from=[ + kubernetes.client.V1EnvFromSource( + config_map_ref=kubernetes.client.V1ConfigMapEnvSource( + name=EVAL_SERVING_NAME + ) + ), + kubernetes.client.V1EnvFromSource( + secret_ref=kubernetes.client.V1SecretEnvSource( + name=EVAL_SERVING_NAME + ) + ), + ], ) ] container = kubernetes.client.V1Container( @@ -1373,6 +1437,9 @@ def sdg_data_fetch( # Populate variables from context namespace = ctx.obj["namespace"] storage_class = ctx.obj["storage_class"] + eval_serving_endpoint = ctx.obj["eval_serving_endpoint"] + eval_serving_model_name = ctx.obj["eval_serving_model_name"] + eval_serving_model_api_key = ctx.obj["eval_serving_model_api_key"] sdg_object_store_endpoint = ctx.obj["sdg_object_store_endpoint"] sdg_object_store_bucket = ctx.obj["sdg_object_store_bucket"] sdg_object_store_access_key = ctx.obj["sdg_object_store_access_key"] @@ -1382,6 +1449,9 @@ def sdg_data_fetch( sdg_object_store_verify_tls = ctx.obj["sdg_object_store_verify_tls"] sdg_object_store_secret = ctx.obj["sdg_object_store_secret"] + # Make sure the endpoint is a valid URL + validate_url(eval_serving_endpoint) + # Check if all required arguments are provided if not sdg_object_store_secret: if not all( @@ -1475,6 +1545,33 @@ def sdg_data_fetch( "'bucket', 'access_key', 'secret_key', 'data_key'.", ) + # Create config map/secret with api_key, serving endpoint for 
evaluation + cms = list( + yaml.safe_load_all( + EVAL_SERVING_DETAILS.format( + eval_serving_endpoint=eval_serving_endpoint, + eval_serving_model_name=eval_serving_model_name, + eval_serving_model_api_key=eval_serving_model_api_key, + ) + ) + ) + for cm in cms: + try: + # if this is a ConfigMap + kind = cm["kind"] + if kind == "ConfigMap": + v1.create_namespaced_config_map(namespace=namespace, body=cm) + logger.info("Successfully created %s '%s' created.", kind, cm) + elif kind == "Secret": + # if this is a Secret + v1.create_namespaced_secret(namespace=namespace, body=cm) + logger.info("Successfully created %s '%s' created.", kind, cm) + except kubernetes.client.rest.ApiException as exc: + if exc.status == 409: + logger.info("%s '%s' already exists.", kind, cm["metadata"]["name"]) + else: + raise + # list of PVCs to create and their details pvcs = [ { @@ -1482,21 +1579,21 @@ def sdg_data_fetch( "namespace": namespace, "storage_class": storage_class, "access_modes": ["ReadWriteOnce"], - "size": "1Gi", + "size": "10Gi", # SDG Data set can be big so let's go with a safe size }, { "name": MODEL_PVC_NAME, "namespace": namespace, "storage_class": storage_class, "access_modes": ["ReadWriteOnce"], - "size": "50Gi", + "size": "100Gi", # Model can be big so let's go with a safe size }, { "name": TRAINING_PVC_NAME, "namespace": namespace, "storage_class": storage_class, "access_modes": ["ReadWriteMany"], - "size": "50Gi", + "size": "100Gi", # Training data can be big so let's go with a safe size }, ] for pvc in pvcs: