Skip to content

Commit a4fc1da

Browse files
authored
Fix Database: Ready when pods are still pending (#88)
* refactor: move storage ready status assignment
* fix: database was ready when pods were pending
* fix: dead variable
* fix: code review nested blocks
* chore: bump chart version
* fix: review suggestions
1 parent 0dfe459 commit a4fc1da

File tree

4 files changed

+76
-37
lines changed

4 files changed

+76
-37
lines changed

deploy/ydb-operator/Chart.yaml

+2-2
Original file line number | Diff line number | Diff line change
@@ -15,10 +15,10 @@ type: application
1515
# This is the chart version. This version number should be incremented each time you make changes
1616
# to the chart and its templates, including the app version.
1717
# Versions are expected to follow Semantic Versioning (https://semver.org/)
18-
version: 0.4.15
18+
version: 0.4.16
1919

2020
# This is the version number of the application being deployed. This version number should be
2121
# incremented each time you make changes to the application. Versions are not expected to
2222
# follow Semantic Versioning. They should reflect the version the application is using.
2323
# It is recommended to use it with quotes.
24-
appVersion: "0.4.15"
24+
appVersion: "0.4.16"

internal/controllers/database/sync.go

+70-31
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,7 @@ import (
1919

2020
ydbv1alpha1 "github.com/ydb-platform/ydb-kubernetes-operator/api/v1alpha1"
2121
"github.com/ydb-platform/ydb-kubernetes-operator/internal/cms"
22+
"github.com/ydb-platform/ydb-kubernetes-operator/internal/labels"
2223
"github.com/ydb-platform/ydb-kubernetes-operator/internal/resources"
2324
)
2425

@@ -134,43 +135,72 @@ func (r *Reconciler) waitForClusterResources(ctx context.Context, database *reso
134135
return Continue, ctrl.Result{Requeue: false}, nil
135136
}
136137

137-
func (r *Reconciler) waitForStatefulSetToScale(ctx context.Context, database *resources.DatabaseBuilder) (bool, ctrl.Result, error) {
138-
r.Log.Info("running step waitForStatefulSetToScale")
138+
func (r *Reconciler) waitForStatefulSetToScale(
139+
ctx context.Context,
140+
database *resources.DatabaseBuilder,
141+
) (bool, ctrl.Result, error) {
142+
r.Log.Info("running step waitForStatefulSetToScale for Database")
139143

140-
if database.Spec.ServerlessResources == nil {
141-
found := &appsv1.StatefulSet{}
142-
err := r.Get(ctx, types.NamespacedName{
143-
Name: database.Name,
144-
Namespace: database.Namespace,
145-
}, found)
146-
if err != nil {
147-
if apierrors.IsNotFound(err) {
148-
return Stop, ctrl.Result{RequeueAfter: DefaultRequeueDelay}, nil
149-
}
150-
r.Recorder.Event(
151-
database,
152-
corev1.EventTypeNormal,
153-
"Syncing",
154-
fmt.Sprintf("Failed to get StatefulSets: %s", err),
155-
)
156-
return Stop, ctrl.Result{RequeueAfter: DefaultRequeueDelay}, err
144+
if database.Spec.ServerlessResources != nil {
145+
return Continue, ctrl.Result{Requeue: false}, nil
146+
}
147+
148+
found := &appsv1.StatefulSet{}
149+
err := r.Get(ctx, types.NamespacedName{
150+
Name: database.Name,
151+
Namespace: database.Namespace,
152+
}, found)
153+
if err != nil {
154+
if apierrors.IsNotFound(err) {
155+
return Stop, ctrl.Result{RequeueAfter: DefaultRequeueDelay}, nil
157156
}
157+
r.Recorder.Event(
158+
database,
159+
corev1.EventTypeNormal,
160+
"Syncing",
161+
fmt.Sprintf("Failed to get StatefulSets: %s", err),
162+
)
163+
return Stop, ctrl.Result{RequeueAfter: DefaultRequeueDelay}, err
164+
}
158165

159-
if found.Status.Replicas != database.Spec.Nodes {
160-
msg := fmt.Sprintf("Waiting for number of running pods to match expected: %d != %d",
161-
found.Status.Replicas,
162-
database.Spec.Nodes,
163-
)
164-
r.Recorder.Event(database, corev1.EventTypeNormal, "Provisioning", msg)
165-
database.Status.State = string(Provisioning)
166-
return r.setState(ctx, database)
166+
podLabels := labels.Common(database.Name, make(map[string]string))
167+
podLabels.Merge(map[string]string{
168+
labels.ComponentKey: labels.DynamicComponent,
169+
})
170+
171+
matchingLabels := client.MatchingLabels{}
172+
for k, v := range podLabels {
173+
matchingLabels[k] = v
174+
}
175+
176+
podList := &corev1.PodList{}
177+
opts := []client.ListOption{
178+
client.InNamespace(database.Namespace),
179+
matchingLabels,
180+
}
181+
182+
err = r.List(ctx, podList, opts...)
183+
if err != nil {
184+
r.Recorder.Event(
185+
database,
186+
corev1.EventTypeNormal,
187+
"Syncing",
188+
fmt.Sprintf("Failed to list cluster pods: %s", err),
189+
)
190+
database.Status.State = string(Provisioning)
191+
return Stop, ctrl.Result{RequeueAfter: DefaultRequeueDelay}, err
192+
}
193+
194+
runningPods := 0
195+
for _, e := range podList.Items {
196+
if e.Status.Phase == "Running" {
197+
runningPods++
167198
}
168199
}
169200

170-
if database.Status.State != string(Ready) &&
171-
meta.IsStatusConditionTrue(database.Status.Conditions, TenantInitializedCondition) {
172-
r.Recorder.Event(database, corev1.EventTypeNormal, "ResourcesReady", "Resource are ready and DB is initialized")
173-
database.Status.State = string(Ready)
201+
if runningPods != int(database.Spec.Nodes) {
202+
msg := fmt.Sprintf("Waiting for number of running dynamic pods to match expected: %d != %d", runningPods, database.Spec.Nodes)
203+
r.Recorder.Event(database, corev1.EventTypeNormal, string(Provisioning), msg)
174204
return r.setState(ctx, database)
175205
}
176206

@@ -372,6 +402,15 @@ func (r *Reconciler) handleTenantCreation(
372402
Reason: TenantInitializedReasonCompleted,
373403
Message: "Tenant creation is complete",
374404
})
405+
406+
r.Recorder.Event(
407+
database,
408+
corev1.EventTypeNormal,
409+
"DatabaseReady",
410+
"Database is initialized",
411+
)
412+
database.Status.State = string(Ready)
413+
375414
return r.setState(ctx, database)
376415
}
377416

internal/controllers/storage/init.go

+2-2
Original file line number | Diff line number | Diff line change
@@ -47,7 +47,7 @@ func (r *Reconciler) processSkipInitPipeline(storage *resources.StorageClusterBu
4747
storage,
4848
corev1.EventTypeNormal,
4949
"ResourcesReady",
50-
"Everything should be in sync",
50+
"All resources are ready",
5151
)
5252

5353
storage.Status.State = string(Ready)
@@ -120,7 +120,7 @@ func (r *Reconciler) runInitScripts(
120120
storage,
121121
corev1.EventTypeNormal,
122122
"ResourcesReady",
123-
"Everything should be in sync",
123+
"All resources are ready",
124124
)
125125
storage.Status.State = string(Ready)
126126

internal/controllers/storage/sync.go

+2-2
Original file line number | Diff line number | Diff line change
@@ -101,7 +101,7 @@ func (r *Reconciler) waitForStatefulSetToScale(
101101
ctx context.Context,
102102
storage *resources.StorageClusterBuilder,
103103
) (bool, ctrl.Result, error) {
104-
r.Log.Info("running step waitForStatefulSetToScale")
104+
r.Log.Info("running step waitForStatefulSetToScale for Storage")
105105
found := &appsv1.StatefulSet{}
106106
err := r.Get(ctx, types.NamespacedName{
107107
Name: storage.Name,
@@ -155,7 +155,7 @@ func (r *Reconciler) waitForStatefulSetToScale(
155155
}
156156

157157
if runningPods != int(storage.Spec.Nodes) {
158-
msg := fmt.Sprintf("Waiting for number of running pods to match expected: %d != %d", runningPods, storage.Spec.Nodes)
158+
msg := fmt.Sprintf("Waiting for number of running storage pods to match expected: %d != %d", runningPods, storage.Spec.Nodes)
159159
r.Recorder.Event(storage, corev1.EventTypeNormal, string(Provisioning), msg)
160160
storage.Status.State = string(Provisioning)
161161
return r.setState(ctx, storage)

0 commit comments

Comments
 (0)