Skip to content

Commit 7cdfdc0

Browse files
authored
CLOUDP-189434 Using Wait Steps instead of risky Steps (#1332)
* CLOUDP-189434 Using Wait Steps instead of risky Steps * Added more context in the comment
1 parent a7d6acd commit 7cdfdc0

File tree

9 files changed

+799
-179
lines changed

9 files changed

+799
-179
lines changed

cmd/readiness/main.go

Lines changed: 40 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@ import (
1010
"github.com/mongodb/mongodb-kubernetes-operator/pkg/readiness/config"
1111
"github.com/mongodb/mongodb-kubernetes-operator/pkg/readiness/headless"
1212
"github.com/mongodb/mongodb-kubernetes-operator/pkg/readiness/health"
13-
"github.com/mongodb/mongodb-kubernetes-operator/pkg/util/contains"
1413
"go.uber.org/zap/zapcore"
1514
"gopkg.in/natefinch/lumberjack.v2"
1615

@@ -25,12 +24,9 @@ const (
2524
mongodNotReadyIntervalMinutes = time.Minute * 1
2625
)
2726

28-
var riskySteps []string
2927
var logger *zap.SugaredLogger
3028

3129
func init() {
32-
riskySteps = []string{"WaitAllRsMembersUp", "WaitRsInit", "WaitCanUpdate"}
33-
3430
// By default, we log to the output (convenient for tests)
3531
cfg := zap.NewDevelopmentConfig()
3632
log, err := cfg.Build()
@@ -45,8 +41,7 @@ func init() {
4541
// The logic depends on if the pod is a standard MongoDB or an AppDB one.
4642
// - If MongoDB: then just the 'statuses[0].IsInGoalState` field is used to learn if the Agent has reached the goal
4743
// - if AppDB: the 'mmsStatus[0].lastGoalVersionAchieved' field is compared with the one from mounted automation config
48-
// Additionally if the previous check hasn't returned 'true' the "deadlock" case is checked to make sure the Agent is
49-
// not waiting for the other members.
44+
// Additionally if the previous check hasn't returned 'true' an additional check for wait steps is being performed
5045
func isPodReady(conf config.Config) (bool, error) {
5146
healthStatus, err := parseHealthStatus(conf.HealthStatusReader)
5247
if err != nil {
@@ -55,8 +50,8 @@ func isPodReady(conf config.Config) (bool, error) {
5550
}
5651

5752
// The 'statuses' file can be empty only for OM Agents
58-
if len(healthStatus.Healthiness) == 0 && !isHeadlessMode() {
59-
logger.Info("'statuses' is empty. We assume there is no automation config for the agent yet.")
53+
if len(healthStatus.Statuses) == 0 && !isHeadlessMode() {
54+
logger.Debug("'statuses' is empty. We assume there is no automation config for the agent yet. Returning ready.")
6055
return true, nil
6156
}
6257

@@ -73,33 +68,35 @@ func isPodReady(conf config.Config) (bool, error) {
7368
}
7469

7570
if inGoalState && inReadyState {
76-
logger.Info("Agent has reached goal state")
71+
logger.Info("The Agent has reached goal state. Returning ready.")
7772
return true, nil
7873
}
7974

8075
// Fallback logic: the agent is not in goal state and got stuck in some steps
81-
if !inGoalState && hasDeadlockedSteps(healthStatus) {
76+
if !inGoalState && isOnWaitingStep(healthStatus) {
77+
logger.Info("The Agent is on wait Step. Returning ready.")
8278
return true, nil
8379
}
8480

81+
logger.Info("Reached the end of the check. Returning not ready.")
8582
return false, nil
8683
}
8784

88-
// hasDeadlockedSteps returns true if the agent is stuck on waiting for the other agents
89-
func hasDeadlockedSteps(health health.Status) bool {
90-
currentStep := findCurrentStep(health.ProcessPlans)
85+
// isOnWaitingStep returns true if the agent is stuck on waiting for the other Agents or something else to happen.
86+
func isOnWaitingStep(health health.Status) bool {
87+
currentStep := findCurrentStep(health.MmsStatus)
9188
if currentStep != nil {
92-
return isDeadlocked(currentStep)
89+
return isWaitStep(currentStep)
9390
}
9491
return false
9592
}
9693

97-
// findCurrentStep returns the step which seems to be run by the Agent now. The step is always in the last plan
98-
// (see https://github.com/10gen/ops-manager-kubernetes/pull/401#discussion_r333071555) so we iterate over all the steps
99-
// there and find the last step which has "Started" non nil
100-
// (indeed this is not the perfect logic as sometimes the agent doesn't update the 'Started' as well - see
101-
// 'health-status-ok.json', but seems it works for finding deadlocks still
102-
// noinspection GoNilness
94+
// findCurrentStep returns the step which the Agent is working now.
95+
// The algorithm (described in https://github.com/10gen/ops-manager-kubernetes/pull/401#discussion_r333071555):
96+
// - Obtain the latest plan (the last one in the plans array)
97+
// - Find the last step, which has Started not nil and Completed nil. The Steps are processed as a tree in a BFS fashion.
98+
// The last element is very likely to be the Step the Agent is performing at the moment. There are some chances that
99+
// this is a waiting step, use isWaitStep to verify this.
103100
func findCurrentStep(processStatuses map[string]health.MmsDirectorStatus) *health.StepStatus {
104101
var currentPlan *health.PlanStatus
105102
if len(processStatuses) == 0 {
@@ -111,13 +108,14 @@ func findCurrentStep(processStatuses map[string]health.MmsDirectorStatus) *healt
111108
logger.Errorf("Only one process status is expected but got %d!", len(processStatuses))
112109
return nil
113110
}
111+
114112
// There is always only one process managed by the Agent - so there will be only one loop
115-
for k, v := range processStatuses {
116-
if len(v.Plans) == 0 {
117-
logger.Errorf("The process %s doesn't contain any plans!", k)
113+
for processName, processStatus := range processStatuses {
114+
if len(processStatus.Plans) == 0 {
115+
logger.Errorf("The process %s doesn't contain any plans!", processName)
118116
return nil
119117
}
120-
currentPlan = v.Plans[len(v.Plans)-1]
118+
currentPlan = processStatus.Plans[len(processStatus.Plans)-1]
121119
}
122120

123121
if currentPlan.Completed != nil {
@@ -129,7 +127,7 @@ func findCurrentStep(processStatuses map[string]health.MmsDirectorStatus) *healt
129127
var lastStartedStep *health.StepStatus
130128
for _, m := range currentPlan.Moves {
131129
for _, s := range m.Steps {
132-
if s.Started != nil {
130+
if s.Started != nil && s.Completed == nil {
133131
lastStartedStep = s
134132
}
135133
}
@@ -138,12 +136,23 @@ func findCurrentStep(processStatuses map[string]health.MmsDirectorStatus) *healt
138136
return lastStartedStep
139137
}
140138

141-
func isDeadlocked(status *health.StepStatus) bool {
139+
// isWaitStep returns true is the Agent is currently waiting for something to happen.
140+
//
141+
// Most of the time, the Agent waits for an initialization by other member of the cluster. In such case,
142+
// holding the rollout does not improve the overall system state. Even if the probe returns true too quickly
143+
// the worst thing that can happen is a short service interruption, which is still better than full service outage.
144+
//
145+
// The 15 seconds explanation:
146+
// - The status file is written every 10s but the Agent processes steps independently of it
147+
// - In order to avoid reacting on a newly added wait Step (as they can naturally go away), we're giving the Agent
148+
// at least 15 sends to spend on that Step.
149+
// - This hopefully prevents the Probe from flipping False to True too quickly.
150+
func isWaitStep(status *health.StepStatus) bool {
142151
// Some logic behind 15 seconds: the health status file is dumped each 10 seconds, so we are sure that if the agent
143152
// has been in the step for 10 seconds - this means it is waiting for the other hosts, and they are not available
144153
fifteenSecondsAgo := time.Now().Add(time.Duration(-15) * time.Second)
145-
if contains.String(riskySteps, status.Step) && status.Completed == nil && status.Started.Before(fifteenSecondsAgo) {
146-
logger.Infof("Indicated a possible deadlock, status: %s, started at %s but hasn't finished "+
154+
if status.IsWaitStep && status.Completed == nil && status.Started.Before(fifteenSecondsAgo) {
155+
logger.Debugf("Indicated a wait Step, status: %s, started at %s but hasn't finished "+
147156
"yet. Marking the probe as ready", status.Step, status.Started.Format(time.RFC3339))
148157
return true
149158
}
@@ -160,7 +169,7 @@ func isInGoalState(health health.Status, conf config.Config) (bool, error) {
160169
// performCheckOMMode does a general check if the Agent has reached the goal state - must be called when Agent is in
161170
// "OM mode"
162171
func performCheckOMMode(health health.Status) bool {
163-
for _, v := range health.Healthiness {
172+
for _, v := range health.Statuses {
164173
logger.Debug(v)
165174
if v.IsInGoalState {
166175
return true
@@ -231,10 +240,10 @@ func main() {
231240
// isInReadyState checks the MongoDB Server state. It returns true if the mongod process is up and its state
232241
// is PRIMARY or SECONDARY.
233242
func isInReadyState(health health.Status) bool {
234-
if len(health.Healthiness) == 0 {
243+
if len(health.Statuses) == 0 {
235244
return true
236245
}
237-
for _, processHealth := range health.Healthiness {
246+
for _, processHealth := range health.Statuses {
238247
// We know this loop should run only once, in Kubernetes there's
239248
// only 1 server managed per host.
240249
if !processHealth.ExpectedToBeUp {

0 commit comments

Comments
 (0)