@@ -10,7 +10,6 @@ import (
1010 "github.com/mongodb/mongodb-kubernetes-operator/pkg/readiness/config"
1111 "github.com/mongodb/mongodb-kubernetes-operator/pkg/readiness/headless"
1212 "github.com/mongodb/mongodb-kubernetes-operator/pkg/readiness/health"
13- "github.com/mongodb/mongodb-kubernetes-operator/pkg/util/contains"
1413 "go.uber.org/zap/zapcore"
1514 "gopkg.in/natefinch/lumberjack.v2"
1615
@@ -25,12 +24,9 @@ const (
2524 mongodNotReadyIntervalMinutes = time .Minute * 1
2625)
2726
28- var riskySteps []string
2927var logger * zap.SugaredLogger
3028
3129func init () {
32- riskySteps = []string {"WaitAllRsMembersUp" , "WaitRsInit" , "WaitCanUpdate" }
33-
3430 // By default, we log to the output (convenient for tests)
3531 cfg := zap .NewDevelopmentConfig ()
3632 log , err := cfg .Build ()
@@ -45,8 +41,7 @@ func init() {
4541// The logic depends on if the pod is a standard MongoDB or an AppDB one.
4642// - If MongoDB: then just the 'statuses[0].IsInGoalState` field is used to learn if the Agent has reached the goal
4743// - if AppDB: the 'mmsStatus[0].lastGoalVersionAchieved' field is compared with the one from mounted automation config
48- // Additionally if the previous check hasn't returned 'true' the "deadlock" case is checked to make sure the Agent is
49- // not waiting for the other members.
44+ // Additionally if the previous check hasn't returned 'true' an additional check for wait steps is being performed
5045func isPodReady (conf config.Config ) (bool , error ) {
5146 healthStatus , err := parseHealthStatus (conf .HealthStatusReader )
5247 if err != nil {
@@ -55,8 +50,8 @@ func isPodReady(conf config.Config) (bool, error) {
5550 }
5651
5752 // The 'statuses' file can be empty only for OM Agents
58- if len (healthStatus .Healthiness ) == 0 && ! isHeadlessMode () {
59- logger .Info ("'statuses' is empty. We assume there is no automation config for the agent yet." )
53+ if len (healthStatus .Statuses ) == 0 && ! isHeadlessMode () {
54+ logger .Debug ("'statuses' is empty. We assume there is no automation config for the agent yet. Returning ready ." )
6055 return true , nil
6156 }
6257
@@ -73,33 +68,35 @@ func isPodReady(conf config.Config) (bool, error) {
7368 }
7469
7570 if inGoalState && inReadyState {
76- logger .Info ("Agent has reached goal state" )
71+ logger .Info ("The Agent has reached goal state. Returning ready. " )
7772 return true , nil
7873 }
7974
8075 // Fallback logic: the agent is not in goal state and got stuck in some steps
81- if ! inGoalState && hasDeadlockedSteps (healthStatus ) {
76+ if ! inGoalState && isOnWaitingStep (healthStatus ) {
77+ logger .Info ("The Agent is on wait Step. Returning ready." )
8278 return true , nil
8379 }
8480
81+ logger .Info ("Reached the end of the check. Returning not ready." )
8582 return false , nil
8683}
8784
88- // hasDeadlockedSteps returns true if the agent is stuck on waiting for the other agents
89- func hasDeadlockedSteps (health health.Status ) bool {
90- currentStep := findCurrentStep (health .ProcessPlans )
85+ // isOnWaitingStep returns true if the agent is stuck on waiting for the other Agents or something else to happen.
86+ func isOnWaitingStep (health health.Status ) bool {
87+ currentStep := findCurrentStep (health .MmsStatus )
9188 if currentStep != nil {
92- return isDeadlocked (currentStep )
89+ return isWaitStep (currentStep )
9390 }
9491 return false
9592}
9693
97- // findCurrentStep returns the step which seems to be run by the Agent now. The step is always in the last plan
98- // (see https://github.com/10gen/ops-manager-kubernetes/pull/401#discussion_r333071555) so we iterate over all the steps
99- // there and find the last step which has "Started" non nil
100- // (indeed this is not the perfect logic as sometimes the agent doesn't update the 'Started' as well - see
101- // 'health-status-ok.json', but seems it works for finding deadlocks still
102- // noinspection GoNilness
94+ // findCurrentStep returns the step which the Agent is working now.
95+ // The algorithm (described in https://github.com/10gen/ops-manager-kubernetes/pull/401#discussion_r333071555):
96+ // - Obtain the latest plan (the last one in the plans array)
97+ // - Find the last step, which has Started not nil and Completed nil. The Steps are processed as a tree in a BFS fashion.
98+ // The last element is very likely to be the Step the Agent is performing at the moment. There are some chances that
99+ // this is a waiting step, use isWaitStep to verify this.
103100func findCurrentStep (processStatuses map [string ]health.MmsDirectorStatus ) * health.StepStatus {
104101 var currentPlan * health.PlanStatus
105102 if len (processStatuses ) == 0 {
@@ -111,13 +108,14 @@ func findCurrentStep(processStatuses map[string]health.MmsDirectorStatus) *healt
111108 logger .Errorf ("Only one process status is expected but got %d!" , len (processStatuses ))
112109 return nil
113110 }
111+
114112 // There is always only one process managed by the Agent - so there will be only one loop
115- for k , v := range processStatuses {
116- if len (v .Plans ) == 0 {
117- logger .Errorf ("The process %s doesn't contain any plans!" , k )
113+ for processName , processStatus := range processStatuses {
114+ if len (processStatus .Plans ) == 0 {
115+ logger .Errorf ("The process %s doesn't contain any plans!" , processName )
118116 return nil
119117 }
120- currentPlan = v .Plans [len (v .Plans )- 1 ]
118+ currentPlan = processStatus .Plans [len (processStatus .Plans )- 1 ]
121119 }
122120
123121 if currentPlan .Completed != nil {
@@ -129,7 +127,7 @@ func findCurrentStep(processStatuses map[string]health.MmsDirectorStatus) *healt
129127 var lastStartedStep * health.StepStatus
130128 for _ , m := range currentPlan .Moves {
131129 for _ , s := range m .Steps {
132- if s .Started != nil {
130+ if s .Started != nil && s . Completed == nil {
133131 lastStartedStep = s
134132 }
135133 }
@@ -138,12 +136,23 @@ func findCurrentStep(processStatuses map[string]health.MmsDirectorStatus) *healt
138136 return lastStartedStep
139137}
140138
141- func isDeadlocked (status * health.StepStatus ) bool {
139+ // isWaitStep returns true is the Agent is currently waiting for something to happen.
140+ //
141+ // Most of the time, the Agent waits for an initialization by other member of the cluster. In such case,
142+ // holding the rollout does not improve the overall system state. Even if the probe returns true too quickly
143+ // the worst thing that can happen is a short service interruption, which is still better than full service outage.
144+ //
145+ // The 15 seconds explanation:
146+ // - The status file is written every 10s but the Agent processes steps independently of it
147+ // - In order to avoid reacting on a newly added wait Step (as they can naturally go away), we're giving the Agent
148+ // at least 15 sends to spend on that Step.
149+ // - This hopefully prevents the Probe from flipping False to True too quickly.
150+ func isWaitStep (status * health.StepStatus ) bool {
142151 // Some logic behind 15 seconds: the health status file is dumped each 10 seconds, so we are sure that if the agent
143152 // has been in the step for 10 seconds - this means it is waiting for the other hosts, and they are not available
144153 fifteenSecondsAgo := time .Now ().Add (time .Duration (- 15 ) * time .Second )
145- if contains . String ( riskySteps , status .Step ) && status .Completed == nil && status .Started .Before (fifteenSecondsAgo ) {
146- logger .Infof ("Indicated a possible deadlock , status: %s, started at %s but hasn't finished " +
154+ if status .IsWaitStep && status .Completed == nil && status .Started .Before (fifteenSecondsAgo ) {
155+ logger .Debugf ("Indicated a wait Step , status: %s, started at %s but hasn't finished " +
147156 "yet. Marking the probe as ready" , status .Step , status .Started .Format (time .RFC3339 ))
148157 return true
149158 }
@@ -160,7 +169,7 @@ func isInGoalState(health health.Status, conf config.Config) (bool, error) {
160169// performCheckOMMode does a general check if the Agent has reached the goal state - must be called when Agent is in
161170// "OM mode"
162171func performCheckOMMode (health health.Status ) bool {
163- for _ , v := range health .Healthiness {
172+ for _ , v := range health .Statuses {
164173 logger .Debug (v )
165174 if v .IsInGoalState {
166175 return true
@@ -231,10 +240,10 @@ func main() {
231240// isInReadyState checks the MongoDB Server state. It returns true if the mongod process is up and its state
232241// is PRIMARY or SECONDARY.
233242func isInReadyState (health health.Status ) bool {
234- if len (health .Healthiness ) == 0 {
243+ if len (health .Statuses ) == 0 {
235244 return true
236245 }
237- for _ , processHealth := range health .Healthiness {
246+ for _ , processHealth := range health .Statuses {
238247 // We know this loop should run only once, in Kubernetes there's
239248 // only 1 server managed per host.
240249 if ! processHealth .ExpectedToBeUp {
0 commit comments