Skip to content

Commit

Permalink
fix: Set MaximumAcceptableUpdatedNonReadyToUpdatedReadyNodesRatio to …
Browse files Browse the repository at this point in the history
…0.11 and add cap of 5 non-ready nodes at once
  • Loading branch information
TwiN committed Jan 31, 2023
1 parent 18c35ba commit 38f4e72
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 13 deletions.
18 changes: 14 additions & 4 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,10 @@ import (
)

const (
MaximumFailedExecutionBeforePanic = 10 // Maximum number of allowed failed executions before panicking
MaximumAcceptableUpdatedNonReadyToUpdatedReadyNodesRatio = 0.15 // To help with larger clusters
MaximumFailedExecutionBeforePanic = 10 // Maximum number of allowed failed executions before panicking

// MaximumAcceptableUpdatedNonReadyToUpdatedReadyNodesRatio is the highest ratio of updated non-ready nodes to
// updated ready nodes that HasAcceptableNumberOfUpdatedNonReadyNodes still accepts. Being a ratio, it lets the
// tolerated number of non-ready nodes grow with the number of ready nodes, which helps with larger clusters.
MaximumAcceptableUpdatedNonReadyToUpdatedReadyNodesRatio = 0.11
// MaximumNumberOfUpdatedNonReadyNodes is the absolute cap on updated non-ready nodes: above it,
// HasAcceptableNumberOfUpdatedNonReadyNodes refuses to proceed regardless of the ratio.
MaximumNumberOfUpdatedNonReadyNodes = 5 // To prevent too many non-ready nodes from being taken into account when calculating resources available in one node
)

var (
Expand Down Expand Up @@ -143,7 +145,7 @@ func DoHandleRollingUpgrade(client k8s.ClientAPI, ec2Service ec2iface.EC2API, au
log.Printf("[%s] Skipping because ASG has a desired capacity of %d, but only has %d instances", aws.StringValue(autoScalingGroup.AutoScalingGroupName), aws.Int64Value(autoScalingGroup.DesiredCapacity), len(autoScalingGroup.Instances))
continue
}
if !HasAcceptableUpdatedNonReadyToUpdatedReadyNodesRatio(numberOfNonReadyUpdatedNodesOrInstances, len(updatedReadyNodes)) {
if !HasAcceptableNumberOfUpdatedNonReadyNodes(numberOfNonReadyUpdatedNodesOrInstances, len(updatedReadyNodes)) {
log.Printf("[%s] ASG has too many non-ready updated nodes/instances (%d), waiting until they become ready", aws.StringValue(autoScalingGroup.AutoScalingGroupName), numberOfNonReadyUpdatedNodesOrInstances)
continue
}
Expand Down Expand Up @@ -539,12 +541,20 @@ func compareLaunchTemplateVersions(targetTemplate *ec2.LaunchTemplate, lt1, lt2
return lt1version == lt2version
}

func HasAcceptableUpdatedNonReadyToUpdatedReadyNodesRatio(numberOfUpdatedNonReadyNodes, numberOfUpdatedReadyNodes int) bool {
// HasAcceptableNumberOfUpdatedNonReadyNodes reports whether the rolling upgrade may move on to the
// next step (drain & terminate an outdated node), given how many updated nodes are still non-ready
// and how many updated nodes are already ready.
//
// The verdict is reached in this order:
//   - no updated non-ready nodes: acceptable;
//   - non-ready nodes but no ready nodes at all: not acceptable;
//   - more non-ready nodes than MaximumNumberOfUpdatedNonReadyNodes: not acceptable;
//   - otherwise: acceptable only if the non-ready to ready ratio does not exceed
//     MaximumAcceptableUpdatedNonReadyToUpdatedReadyNodesRatio.
//
// Since the last rule is a ratio, a larger pool of updated ready nodes tolerates more simultaneous
// non-ready nodes (up to the cap), so the upgrade can ramp up the further along it is.
func HasAcceptableNumberOfUpdatedNonReadyNodes(numberOfUpdatedNonReadyNodes, numberOfUpdatedReadyNodes int) bool {
	switch {
	case numberOfUpdatedNonReadyNodes == 0:
		// Every updated node is ready; nothing is pending.
		return true
	case numberOfUpdatedReadyNodes == 0:
		// Non-ready nodes exist (previous case ruled out zero) and none are ready yet.
		return false
	case numberOfUpdatedNonReadyNodes > MaximumNumberOfUpdatedNonReadyNodes:
		// The absolute cap applies no matter how many nodes are ready.
		return false
	}
	nonReadyToReadyRatio := float64(numberOfUpdatedNonReadyNodes) / float64(numberOfUpdatedReadyNodes)
	return nonReadyToReadyRatio <= MaximumAcceptableUpdatedNonReadyToUpdatedReadyNodesRatio
}
27 changes: 18 additions & 9 deletions main_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -646,28 +646,37 @@ func TestHandleRollingUpgrade_withMixedInstancePolicyWhenOneOfTheInstanceTypesOv
}
}

func TestHasAcceptableUpdatedNonReadyToUpdatedReadyNodesRatio(t *testing.T) {
func TestHasAcceptableNumberOfUpdatedNonReadyNodes(t *testing.T) {
// false: there's too many non-ready nodes
// true: there's an acceptable amount of non-ready nodes given how many ready nodes there are
if HasAcceptableUpdatedNonReadyToUpdatedReadyNodesRatio(100, 0) {
if HasAcceptableNumberOfUpdatedNonReadyNodes(100, 0) {
t.Error("100NR/0R ready should not be acceptable")
}
if HasAcceptableUpdatedNonReadyToUpdatedReadyNodesRatio(50, 50) {
if HasAcceptableNumberOfUpdatedNonReadyNodes(50, 50) {
t.Error("50NR/50R should not be acceptable")
}
if !HasAcceptableUpdatedNonReadyToUpdatedReadyNodesRatio(5, 95) {
t.Error("5NR/95R should be acceptable")
if HasAcceptableNumberOfUpdatedNonReadyNodes(6, 10000) {
t.Error("6NR/10000R should not be acceptable, because MaximumNumberOfUpdatedNonReadyNodes is set to", MaximumNumberOfUpdatedNonReadyNodes)
}
if !HasAcceptableUpdatedNonReadyToUpdatedReadyNodesRatio(1, 99) {
if !HasAcceptableNumberOfUpdatedNonReadyNodes(5, 10000) {
t.Error("5NR/10000R should be acceptable")
}
if !HasAcceptableNumberOfUpdatedNonReadyNodes(4, 100) {
t.Error("4NR/100R should be acceptable")
}
if !HasAcceptableNumberOfUpdatedNonReadyNodes(1, 99) {
t.Error("1NR/99R should be acceptable")
}
if !HasAcceptableUpdatedNonReadyToUpdatedReadyNodesRatio(0, 100) {
if !HasAcceptableNumberOfUpdatedNonReadyNodes(0, 100) {
t.Error("0NR/100R should be acceptable")
}
if !HasAcceptableUpdatedNonReadyToUpdatedReadyNodesRatio(0, 1) {
if !HasAcceptableNumberOfUpdatedNonReadyNodes(0, 1) {
t.Error("0NR/1R should be acceptable")
}
if !HasAcceptableUpdatedNonReadyToUpdatedReadyNodesRatio(0, 0) {
if !HasAcceptableNumberOfUpdatedNonReadyNodes(0, 0) {
t.Error("0NR/0R should be acceptable")
}
if !HasAcceptableNumberOfUpdatedNonReadyNodes(1, 11) {
t.Error("1NR/11R should be acceptable")
}
}

0 comments on commit 38f4e72

Please sign in to comment.