Skip to content

Commit a6ebea0

Browse files
Merge pull request #1190 from blublinsky/deployment-diagnostic
implement detailed deployment diagnostics
2 parents ba91c4c + d542d6f commit a6ebea0

File tree

11 files changed

+727
-120
lines changed

11 files changed

+727
-120
lines changed

api/v1alpha1/olsconfig_types.go

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,10 +50,91 @@ type FeatureGate string
5050

5151
// OLSConfigStatus defines the observed state of OLS deployment.
5252
type OLSConfigStatus struct {
53+
// Conditions represent the state of individual components
54+
// Always populated after first reconciliation
5355
// +operator-sdk:csv:customresourcedefinitions:type=status
5456
Conditions []metav1.Condition `json:"conditions"`
57+
58+
// OverallStatus provides a high-level summary of the entire system's health.
59+
// Aggregates all component conditions into a single status value.
60+
// - Ready: All components are healthy
61+
// - NotReady: At least one component is not ready (check conditions for details)
62+
// Always set after first reconciliation
63+
// +kubebuilder:validation:Enum=Ready;NotReady
64+
// +operator-sdk:csv:customresourcedefinitions:type=status
65+
OverallStatus OverallStatus `json:"overallStatus"`
66+
67+
// DiagnosticInfo provides detailed troubleshooting information when deployments fail.
68+
// Each entry contains pod-level error details for a specific component.
69+
// This array is automatically populated when deployments fail and cleared when they recover.
70+
// Only present during deployment failures.
71+
// +optional
72+
// +operator-sdk:csv:customresourcedefinitions:type=status
73+
DiagnosticInfo []PodDiagnostic `json:"diagnosticInfo,omitempty"`
74+
}
75+
76+
// PodDiagnostic describes a pod-level issue
77+
type PodDiagnostic struct {
78+
// FailedComponent identifies which component this diagnostic relates to,
79+
// using the same value as the type of the corresponding entry in Conditions (e.g., "ApiReady", "CacheReady").
80+
// This allows easy correlation between condition status and diagnostic details.
81+
FailedComponent string `json:"failedComponent"`
82+
83+
// PodName is the name of the pod with issues
84+
PodName string `json:"podName"`
85+
86+
// ContainerName is the container within the pod that failed
87+
// Empty if the issue is at the pod level (e.g., scheduling)
88+
// +optional
89+
ContainerName string `json:"containerName,omitempty"`
90+
91+
// Reason is the failure reason
92+
// Examples: ImagePullBackOff, CrashLoopBackOff, Unschedulable, OOMKilled
93+
Reason string `json:"reason"`
94+
95+
// Message provides detailed error information from Kubernetes
96+
Message string `json:"message"`
97+
98+
// ExitCode is the exit code of the terminated container (only set for container failures)
99+
// +optional
100+
ExitCode *int32 `json:"exitCode,omitempty"`
101+
102+
// Type indicates the diagnostic type
103+
// +kubebuilder:validation:Enum=ContainerWaiting;ContainerTerminated;PodScheduling;PodCondition
104+
Type DiagnosticType `json:"type"`
105+
106+
// LastUpdated is the timestamp when this diagnostic was collected
107+
LastUpdated metav1.Time `json:"lastUpdated"`
55108
}
56109

110+
// DiagnosticType categorizes the type of diagnostic
111+
// +kubebuilder:validation:Enum=ContainerWaiting;ContainerTerminated;PodScheduling;PodCondition
112+
type DiagnosticType string
113+
114+
const (
115+
DiagnosticTypeContainerWaiting DiagnosticType = "ContainerWaiting"
116+
DiagnosticTypeContainerTerminated DiagnosticType = "ContainerTerminated"
117+
DiagnosticTypePodScheduling DiagnosticType = "PodScheduling"
118+
DiagnosticTypePodCondition DiagnosticType = "PodCondition"
119+
)
120+
121+
// DeploymentStatus represents the status of a deployment check
122+
type DeploymentStatus string
123+
124+
const (
125+
DeploymentStatusReady DeploymentStatus = "Ready"
126+
DeploymentStatusProgressing DeploymentStatus = "Progressing"
127+
DeploymentStatusFailed DeploymentStatus = "Failed"
128+
)
129+
130+
// OverallStatus represents the aggregate status of the entire system
131+
type OverallStatus string
132+
133+
const (
134+
OverallStatusReady OverallStatus = "Ready"
135+
OverallStatusNotReady OverallStatus = "NotReady"
136+
)
137+
57138
// LLMSpec defines the desired state of the large language model (LLM).
58139
type LLMSpec struct {
59140
// +kubebuilder:validation:Required

api/v1alpha1/zz_generated.deepcopy.go

Lines changed: 28 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

bundle/manifests/lightspeed-operator.clusterserviceversion.yaml

Lines changed: 36 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -376,8 +376,26 @@ spec:
376376
displayName: Log level
377377
path: olsDataCollector.logLevel
378378
statusDescriptors:
379-
- displayName: Conditions
379+
- description: |-
380+
Conditions represent the state of individual components
381+
Always populated after first reconciliation
382+
displayName: Conditions
380383
path: conditions
384+
- description: |-
385+
DiagnosticInfo provides detailed troubleshooting information when deployments fail.
386+
Each entry contains pod-level error details for a specific component.
387+
This array is automatically populated when deployments fail and cleared when they recover.
388+
Only present during deployment failures.
389+
displayName: Diagnostic Info
390+
path: diagnosticInfo
391+
- description: |-
392+
OverallStatus provides a high-level summary of the entire system's health.
393+
Aggregates all component conditions into a single status value.
394+
- Ready: All components are healthy
395+
- NotReady: At least one component is not ready (check conditions for details)
396+
Always set after first reconciliation
397+
displayName: Overall Status
398+
path: overallStatus
381399
version: v1alpha1
382400
description: |-
383401
OpenShift Lightspeed Operator provides generative AI-based virtual assistant which integrates into the OpenShift web console. OpenShift Lightspeed can answer natural language questions related to OpenShift Container Platform.
@@ -399,6 +417,11 @@ spec:
399417
spec:
400418
clusterPermissions:
401419
- rules:
420+
- nonResourceURLs:
421+
- /ls-access
422+
- /ols-metrics-access
423+
verbs:
424+
- get
402425
- apiGroups:
403426
- ""
404427
resources:
@@ -414,6 +437,14 @@ spec:
414437
- patch
415438
- update
416439
- watch
440+
- apiGroups:
441+
- ""
442+
resources:
443+
- pods
444+
verbs:
445+
- get
446+
- list
447+
- watch
417448
- apiGroups:
418449
- ""
419450
resources:
@@ -549,7 +580,10 @@ spec:
549580
- clusterroles
550581
verbs:
551582
- create
583+
- get
552584
- list
585+
- patch
586+
- update
553587
- watch
554588
- apiGroups:
555589
- storage.k8s.io
@@ -590,7 +624,7 @@ spec:
590624
- --metrics-bind-address=:8443
591625
- --secure-metrics-server
592626
- --cert-dir=/etc/tls/private
593-
- --lcore-image=quay.io/openshift-lightspeed/lightspeed-stack:latest
627+
- --lcore-image=quay.io/lightspeed-core/lightspeed-stack:dev-latest
594628
- --use-lcore=false
595629
- --service-image=quay.io/openshift-lightspeed/lightspeed-service-api:latest
596630
- --console-image=quay.io/openshift-lightspeed/lightspeed-console-plugin:latest

bundle/manifests/ols.openshift.io_olsconfigs.yaml

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1307,6 +1307,9 @@ spec:
13071307
description: OLSConfigStatus defines the observed state of OLS deployment.
13081308
properties:
13091309
conditions:
1310+
description: |-
1311+
Conditions represent the state of individual components
1312+
Always populated after first reconciliation
13101313
items:
13111314
description: Condition contains details for one aspect of the current
13121315
state of this API Resource.
@@ -1362,8 +1365,85 @@ spec:
13621365
- type
13631366
type: object
13641367
type: array
1368+
diagnosticInfo:
1369+
description: |-
1370+
DiagnosticInfo provides detailed troubleshooting information when deployments fail.
1371+
Each entry contains pod-level error details for a specific component.
1372+
This array is automatically populated when deployments fail and cleared when they recover.
1373+
Only present during deployment failures.
1374+
items:
1375+
description: PodDiagnostic describes a pod-level issue
1376+
properties:
1377+
containerName:
1378+
description: |-
1379+
ContainerName is the container within the pod that failed
1380+
Empty if the issue is at the pod level (e.g., scheduling)
1381+
type: string
1382+
exitCode:
1383+
description: ExitCode for terminated containers (only set for
1384+
container failures)
1385+
format: int32
1386+
type: integer
1387+
failedComponent:
1388+
description: |-
1389+
FailedComponent identifies which component this diagnostic relates to,
1390+
using the same value as the type of the corresponding entry in Conditions (e.g., "ApiReady", "CacheReady").
1391+
This allows easy correlation between condition status and diagnostic details.
1392+
type: string
1393+
lastUpdated:
1394+
description: LastUpdated is the timestamp when this diagnostic
1395+
was collected
1396+
format: date-time
1397+
type: string
1398+
message:
1399+
description: Message provides detailed error information from
1400+
Kubernetes
1401+
type: string
1402+
podName:
1403+
description: PodName is the name of the pod with issues
1404+
type: string
1405+
reason:
1406+
description: |-
1407+
Reason is the failure reason
1408+
Examples: ImagePullBackOff, CrashLoopBackOff, Unschedulable, OOMKilled
1409+
type: string
1410+
type:
1411+
allOf:
1412+
- enum:
1413+
- ContainerWaiting
1414+
- ContainerTerminated
1415+
- PodScheduling
1416+
- PodCondition
1417+
- enum:
1418+
- ContainerWaiting
1419+
- ContainerTerminated
1420+
- PodScheduling
1421+
- PodCondition
1422+
description: Type indicates the diagnostic type
1423+
type: string
1424+
required:
1425+
- failedComponent
1426+
- lastUpdated
1427+
- message
1428+
- podName
1429+
- reason
1430+
- type
1431+
type: object
1432+
type: array
1433+
overallStatus:
1434+
description: |-
1435+
OverallStatus provides a high-level summary of the entire system's health.
1436+
Aggregates all component conditions into a single status value.
1437+
- Ready: All components are healthy
1438+
- NotReady: At least one component is not ready (check conditions for details)
1439+
Always set after first reconciliation
1440+
enum:
1441+
- Ready
1442+
- NotReady
1443+
type: string
13651444
required:
13661445
- conditions
1446+
- overallStatus
13671447
type: object
13681448
required:
13691449
- spec

0 commit comments

Comments
 (0)