Skip to content

Commit 74a661e

Browse files
authored
K8s IaC (#204)
* ConfigManagement for k8s IaC * Workload Principles for Image Push * k8s image push using buildkit
1 parent 4314cf9 commit 74a661e

File tree

25 files changed

+425
-168
lines changed

25 files changed

+425
-168
lines changed

.github/workflows/pytest.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,6 @@ jobs:
5050
run: |
5151
cd src/
5252
python -m pip install --upgrade pip wheel setuptools
53-
pip install torch==2.7.0+cpu -f https://download.pytorch.org/whl/cpu/torch
5453
pip install -e ".[all-test]"
5554
5655
- name: Run All Tests

.gitignore

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,8 +46,8 @@ __pycache__/
4646
**/.terraform*
4747
**/terraform.tfstate*
4848
**/*.pem
49-
opentofu/**/generated/*.*
50-
opentofu/**/generated/kubeconfig
49+
opentofu/**/stage/*.*
50+
opentofu/**/stage/kubeconfig
5151

5252
##############################################################################
5353
# Helm

helm/charts/server/templates/deployment.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -52,10 +52,10 @@ spec:
5252
httpGet:
5353
path: {{ include "getPath" . }}v1/liveness
5454
port: server-port
55-
initialDelaySeconds: 15
55+
initialDelaySeconds: 30
5656
periodSeconds: 30
57-
timeoutSeconds: 15
58-
failureThreshold: 6
57+
timeoutSeconds: 30
58+
failureThreshold: 10
5959
readinessProbe:
6060
httpGet:
6161
path: {{ include "getPath" . }}v1/readiness

opentofu/README.md

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1,8 @@
1-
zip -r ai-optimizer-stack.zip . -x "terraform*" ".terraform*" "*/terraform*" "*/.terraform*" "generated/*.*"
1+
# Packaging Stack
2+
3+
The IaC is packaged and attached to each release using GitHub Actions. Below is the manual procedure:
4+
5+
1. Zip the IaC into an archive:
6+
```bash
7+
zip -r ai-optimizer-stack.zip . -x "terraform*" ".terraform*" "*/terraform*" "*/.terraform*" "generated/*.*" "cfgmgt/stage/*.*" "cfgmgt/stage/kubeconfig"
8+
```

opentofu/cfgmgt/apply.py

Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
"""
2+
Copyright (c) 2025, Oracle and/or its affiliates.
3+
Licensed under the Universal Permissive License v1.0 as shown at http://oss.oracle.com/licenses/upl.
4+
"""
5+
# spell-checker:ignore kubeconfig
6+
7+
import subprocess
8+
import argparse
9+
import os
10+
import sys
11+
import time
12+
13+
# --- Constants ---
# Used both as the local Helm repository alias and as the chart name.
HELM_NAME = "ai-optimizer"
HELM_REPO = "https://oracle-samples.github.io/ai-optimizer/helm"
# Staged inputs (kubeconfig, Helm values, manifest) are read from a "stage"
# directory that sits next to this script.
STAGE_PATH = os.path.join(os.path.dirname(__file__), "stage")
# Set at import time so helm/kubectl child processes inherit the staged kubeconfig.
os.environ.update(KUBECONFIG=os.path.join(STAGE_PATH, "kubeconfig"))
18+
19+
20+
# --- Utility Functions ---
21+
def run_cmd(cmd, capture_output=True):
    """Run an external command and report its outcome without raising.

    Args:
        cmd: Argument list handed to ``subprocess.run`` (no shell is used).
        capture_output: When True, stdout and stderr are piped and returned.
            When False, the child inherits this process's streams and the
            returned strings are empty.

    Returns:
        A ``(stdout, stderr, returncode)`` tuple; both strings are stripped.
        If the command cannot be launched or ``subprocess`` raises, the tuple
        is ``("", <error text>, 1)``.
    """
    pipe = subprocess.PIPE if capture_output else None
    try:
        result = subprocess.run(cmd, stdout=pipe, stderr=pipe, text=True, check=False)
    except (OSError, subprocess.SubprocessError) as e:
        # A missing or non-executable binary raises OSError (FileNotFoundError /
        # PermissionError), which is not a SubprocessError; treat it as a
        # failed command so callers can report it or retry.
        return "", str(e), 1

    stdout = result.stdout.strip() if result.stdout else ""
    stderr = result.stderr.strip() if result.stderr else ""
    return stdout, stderr, result.returncode
36+
37+
38+
def retry(func, retries=3, delay=10):
    """Call ``func`` until it returns a truthy value or the attempts run out.

    Args:
        func: Zero-argument callable; a truthy return value means success.
        retries: Maximum number of calls to make.
        delay: Seconds to sleep between consecutive attempts.

    Returns:
        True as soon as one call succeeds. If every attempt fails, the
        process exits with status 1 instead of returning.
    """
    attempt = 0
    while attempt < retries:
        attempt += 1
        print(f"🔁 Attempt {attempt}/{retries}")
        if func():
            return True
        # No point in waiting after the final attempt.
        if attempt < retries:
            print(f"⏳ Retrying in {delay} seconds...")
            time.sleep(delay)

    print("🚨 Maximum retries reached. Exiting.")
    sys.exit(1)
49+
50+
51+
# --- Core Functionalities ---
52+
def helm_repo_add_if_missing():
    """Register the Helm repository and refresh the local repository index.

    Runs ``helm repo add`` and then ``helm repo update``. Both commands run
    with ``capture_output=False``, so Helm's own output is not captured by
    this script. Exits the process with status 1 if either command fails.
    """
    print(f"➕ Adding Helm repo '{HELM_NAME}'...")
    _, stderr, rc = run_cmd(["helm", "repo", "add", HELM_NAME, HELM_REPO], capture_output=False)
    if rc != 0:
        # Output is not captured here, so stderr only has content when
        # run_cmd itself reported an error; always include the exit code.
        detail = f"\n{stderr}" if stderr else ""
        print(f"❌ Failed to add repo (exit code {rc}){detail}")
        sys.exit(1)

    print("⬆️ Checking for Helm updates...")
    _, stderr, rc = run_cmd(["helm", "repo", "update"], capture_output=False)
    if rc != 0:
        detail = f"\n{stderr}" if stderr else ""
        print(f"❌ Failed to update repos (exit code {rc}){detail}")
        sys.exit(1)
    print(f"✅ Repo '{HELM_NAME}' added and updated.\n")
66+
67+
68+
def apply_helm_chart_inner(release_name, namespace):
    """Install or upgrade the Helm release once, using the staged values file.

    Args:
        release_name: Helm release name.
        namespace: Kubernetes namespace passed to ``--namespace``.

    Returns:
        True when ``helm upgrade --install`` exits with 0. False when the
        staged ``helm-values.yaml`` is missing or Helm reports a failure.
    """
    values_file = os.path.join(STAGE_PATH, "helm-values.yaml")
    if not os.path.isfile(values_file):
        print(f"⚠️ Values file not found: {values_file}")
        return False

    helm_repo_add_if_missing()

    # The chart is referenced as <repo alias>/<chart name>; both are HELM_NAME.
    chart_ref = f"{HELM_NAME}/{HELM_NAME}"
    helm_args = ["helm", "upgrade", "--install", release_name, chart_ref]
    helm_args += ["--namespace", namespace]
    helm_args += ["--values", values_file]

    print(f"🚀 Applying Helm chart '{HELM_NAME}' to namespace '{namespace}'...")
    out, err, status = run_cmd(helm_args)
    if status != 0:
        print(f"❌ Failed to apply Helm chart:\n{err}")
        return False

    print("✅ Helm chart applied:")
    print(f"Apply Helm Chart: {out}")
    return True
98+
99+
100+
def apply_helm_chart(release_name, namespace):
    """Apply the Helm chart through ``retry``, which exits if every attempt fails."""

    def _attempt():
        return apply_helm_chart_inner(release_name, namespace)

    retry(_attempt)
103+
104+
105+
def apply_manifest_inner():
    """Run ``kubectl apply`` once against the staged Kubernetes manifest.

    Returns:
        True when ``kubectl apply`` exits with 0. False when the staged
        ``k8s-manifest.yaml`` is missing or kubectl reports a failure.
    """
    manifest_path = os.path.join(STAGE_PATH, "k8s-manifest.yaml")
    if not os.path.isfile(manifest_path):
        print(f"⚠️ Manifest not found: {manifest_path}")
        return False

    print("🚀 Applying Kubernetes manifest: k8s-manifest.yaml")
    _, stderr, rc = run_cmd(["kubectl", "apply", "-f", manifest_path], capture_output=False)
    if rc == 0:
        print("✅ Manifest applied.\n")
        return True

    # kubectl's output is not captured here, so stderr only has content when
    # run_cmd itself reported an error; always include the exit code.
    detail = f"\n{stderr}" if stderr else ""
    print(f"❌ Failed to apply manifest (exit code {rc}){detail}")
    return False
120+
121+
122+
def apply_manifest():
    """Apply the staged manifest through ``retry``, which exits if every attempt fails."""
    retry(func=apply_manifest_inner)
125+
126+
127+
# --- Entry Point ---
128+
def main():
    """Parse the CLI arguments, then apply the manifest followed by the Helm chart."""
    cli = argparse.ArgumentParser(description="Apply a Helm chart and a Kubernetes manifest.")
    for arg_name, arg_help in (
        ("release_name", "Helm release name"),
        ("namespace", "Kubernetes namespace"),
    ):
        cli.add_argument(arg_name, help=arg_help)
    opts = cli.parse_args()

    # The manifest is applied before the chart is installed or upgraded.
    apply_manifest()
    apply_helm_chart(opts.release_name, opts.namespace)


if __name__ == "__main__":
    main()
File renamed without changes.

opentofu/modules/kubernetes/cfgmgt.tf

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
# Copyright (c) 2024, 2025, Oracle and/or its affiliates.
2+
# All rights reserved. The Universal Permissive License (UPL), Version 1.0 as shown at http://oss.oracle.com/licenses/upl
3+
# spell-checker: disable
4+
5+
# Helm values rendered from templates/helm_values.yaml.
locals {
  helm_values = templatefile("${path.module}/templates/helm_values.yaml", {
    label                    = var.label_prefix
    repository_server        = local.repository_server
    repository_client        = local.repository_client
    oci_tenancy              = var.tenancy_id
    oci_region               = var.region
    adb_ocid                 = var.adb_id
    adb_name                 = lower(var.adb_name)
    k8s_node_pool_gpu_deploy = var.k8s_node_pool_gpu_deploy
    lb_ip                    = var.lb.ip_address_details[0].ip_address
  })
}

# Kubernetes manifest rendered from templates/k8s_manifest.yaml.
locals {
  k8s_manifest = templatefile("${path.module}/templates/k8s_manifest.yaml", {
    label             = var.label_prefix
    repository_host   = local.repository_host
    repository_server = local.repository_server
    repository_client = local.repository_client
    compartment_ocid  = var.lb.compartment_id
    lb_ocid           = var.lb.id
    lb_subnet_ocid    = var.public_subnet_id
    # NOTE(review): named *_ocid but populated with the load balancer IP address;
    # confirm this is what the template expects.
    lb_ip_ocid   = var.lb.ip_address_details[0].ip_address
    lb_nsgs      = var.lb_nsg_id
    lb_min_shape = var.lb.shape_details[0].minimum_bandwidth_in_mbps
    lb_max_shape = var.lb.shape_details[0].maximum_bandwidth_in_mbps
    adb_name     = lower(var.adb_name)
    adb_password = var.adb_password
    adb_service  = format("%s_TP", var.adb_name)
    api_key      = random_string.api_key.result
  })
}
36+
37+
# Cluster kubeconfig, staged at the path cfgmgt/apply.py exports as KUBECONFIG.
resource "local_sensitive_file" "kubeconfig" {
  filename        = "${path.root}/cfgmgt/stage/kubeconfig"
  file_permission = 0600
  content         = data.oci_containerengine_cluster_kube_config.default_cluster_kube_config.content
}
42+
43+
# Rendered Helm values, staged as the helm-values.yaml file read by cfgmgt/apply.py.
resource "local_sensitive_file" "helm_values" {
  filename        = "${path.root}/cfgmgt/stage/helm-values.yaml"
  file_permission = 0600
  content         = local.helm_values
}
48+
49+
# Rendered Kubernetes manifest, staged as the k8s-manifest.yaml file read by cfgmgt/apply.py.
resource "local_sensitive_file" "k8s_manifest" {
  filename        = "${path.root}/cfgmgt/stage/k8s-manifest.yaml"
  file_permission = 0600
  content         = local.k8s_manifest
}
54+
55+
# Runs cfgmgt/apply.py once the staged files, node pools and add-ons exist.
resource "null_resource" "apply" {
  # timestamp() yields a new value on each run, so the trigger always changes.
  triggers = {
    always_run = timestamp()
  }

  depends_on = [
    local_sensitive_file.kubeconfig,
    local_sensitive_file.helm_values,
    local_sensitive_file.k8s_manifest,
    oci_containerengine_node_pool.default_node_pool_details,
    oci_containerengine_node_pool.gpu_node_pool_details,
    oci_containerengine_addon.oraoper_addon,
    oci_containerengine_addon.certmgr_addon,
    oci_containerengine_addon.ingress_addon
  ]

  # label_prefix is passed twice: apply.py takes a release name and a namespace.
  provisioner "local-exec" {
    command = <<EOT
python3 ${path.root}/cfgmgt/apply.py ${var.label_prefix} ${var.label_prefix}
EOT
  }
}

opentofu/modules/kubernetes/iam.tf

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,9 @@ resource "oci_identity_policy" "workers_policies" {
1717
name = format("%s-workers-policy", var.label_prefix)
1818
description = format("%s - K8s Workers", var.label_prefix)
1919
statements = [
20+
# Workload Principals specific to oracle-database-operator-system Namespace
2021
format("allow any-user to manage autonomous-database-family in compartment id %s where all {request.principal.type = 'workload', request.principal.namespace = 'oracle-database-operator-system', request.principal.service_account = 'default', request.principal.cluster_id = '%s'}", var.compartment_id, oci_containerengine_cluster.default_cluster.id),
21-
format("allow any-user to read objectstorage-namespaces in compartment id %s where all {request.principal.type = 'workload', request.principal.service_account = 'default', request.principal.cluster_id = '%s'}", var.compartment_id, oci_containerengine_cluster.default_cluster.id),
22-
format("allow any-user to inspect buckets in compartment id %s where all {request.principal.type = 'workload', request.principal.service_account = 'default', request.principal.cluster_id = '%s'}", var.compartment_id, oci_containerengine_cluster.default_cluster.id),
23-
format("allow any-user to read objects in compartment id %s where all {request.principal.type = 'workload', request.principal.service_account = 'default', request.principal.cluster_id = '%s'}", var.compartment_id, oci_containerengine_cluster.default_cluster.id),
22+
# Workload Principals specific to native-ingress-controller-system Namespace
2423
format("allow any-user to manage load-balancers in compartment id %s where all {request.principal.type = 'workload', request.principal.namespace = 'native-ingress-controller-system', request.principal.service_account = 'oci-native-ingress-controller', request.principal.cluster_id = '%s'}", var.compartment_id, oci_containerengine_cluster.default_cluster.id),
2524
format("allow any-user to use virtual-network-family in compartment id %s where all {request.principal.type = 'workload', request.principal.namespace = 'native-ingress-controller-system', request.principal.service_account = 'oci-native-ingress-controller', request.principal.cluster_id = '%s'}", var.compartment_id, oci_containerengine_cluster.default_cluster.id),
2625
format("allow any-user to manage cabundles in compartment id %s where all {request.principal.type = 'workload', request.principal.namespace = 'native-ingress-controller-system', request.principal.service_account = 'oci-native-ingress-controller', request.principal.cluster_id = '%s'}", var.compartment_id, oci_containerengine_cluster.default_cluster.id),
@@ -37,6 +36,12 @@ resource "oci_identity_policy" "workers_policies" {
3736
format("allow any-user to manage waf-family in compartment id %s where all {request.principal.type = 'workload', request.principal.namespace = 'native-ingress-controller-system', request.principal.service_account = 'oci-native-ingress-controller', request.principal.cluster_id = '%s'}", var.compartment_id, oci_containerengine_cluster.default_cluster.id),
3837
format("allow any-user to read cluster-family in compartment id %s where all {request.principal.type = 'workload', request.principal.namespace = 'native-ingress-controller-system', request.principal.service_account = 'oci-native-ingress-controller', request.principal.cluster_id = '%s'}", var.compartment_id, oci_containerengine_cluster.default_cluster.id),
3938
format("allow any-user to use tag-namespaces in compartment id %s where all {request.principal.type = 'workload', request.principal.namespace = 'native-ingress-controller-system', request.principal.service_account = 'oci-native-ingress-controller', request.principal.cluster_id = '%s'}", var.compartment_id, oci_containerengine_cluster.default_cluster.id),
39+
# Workload Principals specific to Custom Namespace
40+
format("allow any-user to read objectstorage-namespaces in compartment id %s where all {request.principal.type = 'workload', request.principal.namespace = '%s', request.principal.cluster_id = '%s'}", var.compartment_id, var.label_prefix, oci_containerengine_cluster.default_cluster.id),
41+
format("allow any-user to inspect buckets in compartment id %s where all {request.principal.type = 'workload', request.principal.namespace = '%s', request.principal.cluster_id = '%s'}", var.compartment_id, var.label_prefix, oci_containerengine_cluster.default_cluster.id),
42+
format("allow any-user to read objects in compartment id %s where all {request.principal.type = 'workload', request.principal.namespace = '%s', request.principal.cluster_id = '%s'}", var.compartment_id, var.label_prefix, oci_containerengine_cluster.default_cluster.id),
43+
format("allow any-user to manage repos in compartment id %s where all {request.principal.type = 'workload', request.principal.namespace = '%s', request.principal.cluster_id = '%s'}", var.compartment_id, var.label_prefix, oci_containerengine_cluster.default_cluster.id),
44+
# Instance Principals
4045
format("allow dynamic-group %s to use generative-ai-family in compartment id %s", oci_identity_dynamic_group.workers_dynamic_group.name, var.compartment_id),
4146
format("allow dynamic-group %s to manage repos in compartment id %s", oci_identity_dynamic_group.workers_dynamic_group.name, var.compartment_id),
4247
]

opentofu/modules/kubernetes/locals.tf

Lines changed: 3 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -11,36 +11,10 @@ locals {
1111
local.region_map,
1212
var.region
1313
)
14-
15-
server_repository = lower(format("%s.ocir.io/%s/%s", local.image_region, data.oci_objectstorage_namespace.objectstorage_namespace.namespace, oci_artifacts_container_repository.server_repository.display_name))
16-
client_repository = lower(format("%s.ocir.io/%s/%s", local.image_region, data.oci_objectstorage_namespace.objectstorage_namespace.namespace, oci_artifacts_container_repository.client_repository.display_name))
14+
repository_host = lower(format("%s.ocir.io", local.image_region))
15+
repository_server = lower(format("%s/%s/%s", local.repository_host, data.oci_objectstorage_namespace.objectstorage_namespace.namespace, oci_artifacts_container_repository.repository_server.display_name))
16+
repository_client = lower(format("%s/%s/%s", local.repository_host, data.oci_objectstorage_namespace.objectstorage_namespace.namespace, oci_artifacts_container_repository.repository_client.display_name))
1717
k8s_cluster_name = format("%s-k8s", var.label_prefix)
18-
helm_values = templatefile("${path.module}/templates/helm_values.yaml", {
19-
label = var.label_prefix
20-
server_repository = local.server_repository
21-
client_repository = local.client_repository
22-
oci_tenancy = var.tenancy_id
23-
oci_region = var.region
24-
adb_ocid = var.adb_id
25-
adb_name = lower(var.adb_name)
26-
k8s_node_pool_gpu_deploy = var.k8s_node_pool_gpu_deploy
27-
lb_ip = var.lb.ip_address_details[0].ip_address
28-
})
29-
30-
k8s_manifest = templatefile("${path.module}/templates/k8s_manifest.yaml", {
31-
label = var.label_prefix
32-
compartment_ocid = var.lb.compartment_id
33-
lb_ocid = var.lb.id
34-
lb_subnet_ocid = var.public_subnet_id
35-
lb_ip_ocid = var.lb.ip_address_details[0].ip_address
36-
lb_nsgs = var.lb_nsg_id
37-
lb_min_shape = var.lb.shape_details[0].minimum_bandwidth_in_mbps
38-
lb_max_shape = var.lb.shape_details[0].maximum_bandwidth_in_mbps
39-
adb_name = lower(var.adb_name)
40-
adb_password = var.adb_password
41-
adb_service = format("%s_TP", var.adb_name)
42-
api_key = random_string.api_key.result
43-
})
4418

4519
oke_worker_images = try({
4620
for k, v in data.oci_containerengine_node_pool_option.images.sources : v.image_id => merge(

opentofu/modules/kubernetes/main.tf

Lines changed: 2 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -9,38 +9,20 @@ resource "random_string" "api_key" {
99
}
1010

1111
// oci_artifacts_container_repository
12-
resource "oci_artifacts_container_repository" "server_repository" {
12+
resource "oci_artifacts_container_repository" "repository_server" {
1313
compartment_id = var.compartment_id
1414
display_name = lower(format("%s/server", var.label_prefix))
1515
is_immutable = false
1616
is_public = false
1717
}
1818

19-
resource "oci_artifacts_container_repository" "client_repository" {
19+
resource "oci_artifacts_container_repository" "repository_client" {
2020
compartment_id = var.compartment_id
2121
display_name = lower(format("%s/client", var.label_prefix))
2222
is_immutable = false
2323
is_public = false
2424
}
2525

26-
resource "local_sensitive_file" "kubeconfig" {
27-
content = data.oci_containerengine_cluster_kube_config.default_cluster_kube_config.content
28-
filename = "${path.root}/generated/kubeconfig"
29-
file_permission = 0600
30-
}
31-
32-
resource "local_sensitive_file" "helm_values" {
33-
content = local.helm_values
34-
filename = "${path.root}/generated/${var.label_prefix}-values.yaml"
35-
file_permission = 0600
36-
}
37-
38-
resource "local_sensitive_file" "k8s_manifest" {
39-
content = local.k8s_manifest
40-
filename = "${path.root}/generated/${var.label_prefix}-manifest.yaml"
41-
file_permission = 0600
42-
}
43-
4426
// Cluster
4527
resource "oci_containerengine_cluster" "default_cluster" {
4628
compartment_id = var.compartment_id

0 commit comments

Comments
 (0)