Skip to content

Commit

Permalink
Fix / improve reboot duration calculation and add unit tests
Browse files Browse the repository at this point in the history
- deduplicate batch size calculations
- better structure and comments for reboot duration calculation
- fix usage of MaxTimeForNoPeersResponse in calculation
- unit tests for both

Signed-off-by: Marc Sluiter <[email protected]>
  • Loading branch information
slintes committed Jun 18, 2024
1 parent 5fdcaa2 commit 7ddf56a
Show file tree
Hide file tree
Showing 7 changed files with 380 additions and 55 deletions.
20 changes: 4 additions & 16 deletions pkg/apicheck/check.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (
"github.com/medik8s/self-node-remediation/pkg/peerhealth"
"github.com/medik8s/self-node-remediation/pkg/peers"
"github.com/medik8s/self-node-remediation/pkg/reboot"
"github.com/medik8s/self-node-remediation/pkg/utils"
)

type ApiConnectivityCheck struct {
Expand Down Expand Up @@ -142,22 +143,8 @@ func (c *ApiConnectivityCheck) getWorkerPeersResponse() peers.Response {
// nodesToAsk is being reduced in every iteration, iterate until no nodes left to ask
for i := 0; len(nodesToAsk) > 0; i++ {

// start asking a few nodes only in first iteration to cover the case we get a healthy / unhealthy result
nodesBatchCount := reboot.MinNodesNumberInBatch
if i > 0 {
// after that ask 10% of the cluster each time to check the api problem case
nodesBatchCount = len(nodesToAsk) / reboot.MaxBatchesAfterFirst
if nodesBatchCount < reboot.MinNodesNumberInBatch {
nodesBatchCount = reboot.MinNodesNumberInBatch
}
}

// but do not ask more than we have
if len(nodesToAsk) < nodesBatchCount {
nodesBatchCount = len(nodesToAsk)
}

chosenNodesAddresses := c.popNodes(&nodesToAsk, nodesBatchCount)
batchSize := utils.GetNextBatchSize(nrAllNodes, len(nodesToAsk))
chosenNodesAddresses := c.popNodes(&nodesToAsk, batchSize)
healthyResponses, unhealthyResponses, apiErrorsResponses, _ := c.getHealthStatusFromPeers(chosenNodesAddresses)
if healthyResponses+unhealthyResponses+apiErrorsResponses > 0 {
c.timeOfLastPeerResponse = time.Now()
Expand Down Expand Up @@ -189,6 +176,7 @@ func (c *ApiConnectivityCheck) getWorkerPeersResponse() peers.Response {

//we asked all peers
now := time.Now()
// MaxTimeForNoPeersResponse check prevents the node from being considered unhealthy in case of short network outages
if now.After(c.timeOfLastPeerResponse.Add(c.config.MaxTimeForNoPeersResponse)) {
c.config.Log.Error(fmt.Errorf("failed health check"), "Failed to get health status peers. Assuming unhealthy")
return peers.Response{IsHealthy: false, Reason: peers.UnHealthyBecauseNodeIsIsolated}
Expand Down
72 changes: 37 additions & 35 deletions pkg/reboot/calculator.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package reboot

import (
"context"
"sync"
"time"

"github.com/go-logr/logr"
Expand All @@ -19,8 +20,6 @@ import (

const (
MaxTimeForNoPeersResponse = 30 * time.Second
MinNodesNumberInBatch = 3
MaxBatchesAfterFirst = 10
)

type Calculator interface {
Expand All @@ -39,7 +38,8 @@ type calculator struct {
k8sClient client.Client
log logr.Logger
// storing the config here when reconciling it increases resilience in case of issues during remediation
snrConfig *v1alpha1.SelfNodeRemediationConfig
snrConfig *v1alpha1.SelfNodeRemediationConfig
snrConfigLock sync.RWMutex
}

func NewCalculator(k8sClient client.Client, log logr.Logger) Calculator {
Expand All @@ -50,12 +50,16 @@ func NewCalculator(k8sClient client.Client, log logr.Logger) Calculator {
}

func (r *calculator) SetConfig(config *v1alpha1.SelfNodeRemediationConfig) {
r.snrConfigLock.Lock()
defer r.snrConfigLock.Unlock()
r.snrConfig = config
}

// TODO add unit test!
func (r *calculator) GetRebootDuration(ctx context.Context, node *v1.Node) (time.Duration, error) {

r.snrConfigLock.Lock()
defer r.snrConfigLock.Unlock()
if r.snrConfig == nil {
return 0, errors.New("SelfNodeRemediationConfig not set yet, can't calculate minimum reboot duration")
}
Expand Down Expand Up @@ -96,57 +100,55 @@ func (r *calculator) calculateMinimumRebootDuration(ctx context.Context, watchdo

spec := r.snrConfig.Spec

// The minimum reboot duration consists of the duration to identify a node issue, and to trigger the reboot
// The minimum reboot duration consists of the duration
// 1) to detect API connectivity issue
// 2) to confirm issue with peers
// 3) to trigger the reboot

// 1. time for determine node issue
// a) API check duration ...
minTime := spec.ApiCheckInterval.Duration + spec.ApiServerTimeout.Duration
// 1. detect API connectivity issue
// a) max API check duration ...
apiCheckDuration := spec.ApiCheckInterval.Duration + spec.ApiServerTimeout.Duration
// b) ... times error threshold ...
minTime *= time.Duration(spec.MaxApiErrorThreshold)
// c) ... plus peer timeout
minTime += MaxTimeForNoPeersResponse
apiCheckDuration *= time.Duration(spec.MaxApiErrorThreshold)

// 2. plus time for asking peers (10% batches + 1st smaller batch)
// 2. confirm issue with peers
// a) nr of peer request batches (10% batches + 1st smaller batch) ...
numBatches, err := r.calcNumOfBatches(r.k8sClient, ctx)
if err != nil {
return 0, errors.Wrap(err, "failed to calculate number of batches")
}
minTime += time.Duration(numBatches)*spec.PeerDialTimeout.Duration + spec.PeerRequestTimeout.Duration

// 3. plus watchdog timeout
minTime += watchdogTimeout
peerRequestsDuration := time.Duration(numBatches)
// b) ... times max peer request duration
peerRequestsDuration *= spec.PeerDialTimeout.Duration + spec.PeerRequestTimeout.Duration
// c) in order to prevent false positives in case of temporary network issues,
// we don't consider nodes being unhealthy before MaxTimeForNoPeersResponse.
// So that's the minimum time we need for the peers check.
if peerRequestsDuration < MaxTimeForNoPeersResponse {
peerRequestsDuration = MaxTimeForNoPeersResponse
}

// 4. plus some buffer
minTime += 15 * time.Second
// 3. trigger the reboot
// a) watchdog timeout ...
rebootDuration := watchdogTimeout
// b) ... plus some buffer for actually rebooting
rebootDuration += 15 * time.Second

return minTime, nil
return apiCheckDuration + peerRequestsDuration + rebootDuration, nil
}

func (r *calculator) calcNumOfBatches(k8sClient client.Client, ctx context.Context) (int, error) {

// get all worker nodes
reqPeers, _ := labels.NewRequirement(commonlabels.WorkerRole, selection.Exists, []string{})
selector := labels.NewSelector()
selector = selector.Add(*reqPeers)

nodes := &v1.NodeList{}
// time for asking peers (10% batches + 1st smaller batch)
maxNumberOfBatches := MaxBatchesAfterFirst + 1
if err := k8sClient.List(ctx, nodes, client.MatchingLabelsSelector{Selector: selector}); err != nil {
return maxNumberOfBatches, errors.Wrap(err, "failed to list worker nodes")
return 0, errors.Wrap(err, "failed to list worker nodes")
}
workerNodesCount := len(nodes.Items)

var numberOfBatches int
switch {
//high number of workers: we need max batches (for example 53 nodes will be done in 11 batches -> 1 * 3 + 10 * 5 )
case workerNodesCount > maxNumberOfBatches*MinNodesNumberInBatch:
numberOfBatches = maxNumberOfBatches
//there are few enough nodes to use the min batch (for example 20 nodes will be done in 7 batches -> 1 * 3 + 6 * 3 )
default:
numberOfBatches = workerNodesCount / MinNodesNumberInBatch
if workerNodesCount%MinNodesNumberInBatch != 0 {
numberOfBatches++
}
}
return numberOfBatches, nil
workerNodesCount := len(nodes.Items)
batchCount := utils.GetNrOfBatches(workerNodesCount)
return batchCount, nil
}
128 changes: 128 additions & 0 deletions pkg/reboot/calculator_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
package reboot_test

import (
"context"
"strconv"
"time"

"github.com/medik8s/common/pkg/labels"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"

v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"sigs.k8s.io/controller-runtime/pkg/client"

"github.com/medik8s/self-node-remediation/api/v1alpha1"
"github.com/medik8s/self-node-remediation/pkg/utils"
)

var _ = Describe("Calculator tests", func() {

var snrConfig *v1alpha1.SelfNodeRemediationConfig
var watchdogTimeoutSeconds int
var unhealthyNode *v1.Node
var nrOfPeers int
var expectedRebootDurationSeconds int

BeforeEach(func() {
// just defaults, override values in tests
snrConfig = &v1alpha1.SelfNodeRemediationConfig{
ObjectMeta: metav1.ObjectMeta{
Namespace: "default",
Name: v1alpha1.ConfigCRName,
},
Spec: v1alpha1.SelfNodeRemediationConfigSpec{},
}
})

JustBeforeEach(func() {
Expect(k8sClient.Create(context.Background(), snrConfig)).To(Succeed())
DeferCleanup(func() {
Expect(k8sClient.Delete(context.Background(), snrConfig)).To(Succeed())
Eventually(func(g Gomega) {
err := k8sClient.Get(context.Background(), client.ObjectKeyFromObject(snrConfig), &v1alpha1.SelfNodeRemediationConfig{})
g.Expect(errors.IsNotFound(err)).To(BeTrue())
}, "5s", "1s").Should(Succeed())
calculator.SetConfig(nil)
})

unhealthyNode = getNode("unhealthy-node")
unhealthyNode.Annotations = map[string]string{
utils.WatchdogTimeoutSecondsAnnotation: strconv.Itoa(watchdogTimeoutSeconds),
}
Expect(k8sClient.Create(context.Background(), unhealthyNode)).To(Succeed())
DeferCleanup(func() {
Expect(k8sClient.Delete(context.Background(), unhealthyNode)).To(Succeed())
Eventually(func(g Gomega) {
err := k8sClient.Get(context.Background(), client.ObjectKeyFromObject(unhealthyNode), &v1.Node{})
g.Expect(errors.IsNotFound(err)).To(BeTrue())
}, "5s", "1s").Should(Succeed())
})

for i := 0; i < nrOfPeers; i++ {
peer := getNode("peer-node-" + strconv.Itoa(i))
peer.Labels[labels.WorkerRole] = ""
Expect(k8sClient.Create(context.Background(), peer)).To(Succeed())
DeferCleanup(func() {
Expect(k8sClient.Delete(context.Background(), peer)).To(Succeed())
Eventually(func(g Gomega) {
err := k8sClient.Get(context.Background(), client.ObjectKeyFromObject(peer), &v1.Node{})
g.Expect(errors.IsNotFound(err)).To(BeTrue())
}, "5s", "1s").Should(Succeed())
})
}
})

Context("with default SNRConfig, 2 peers, and 10s watchdog timeout", func() {
BeforeEach(func() {
watchdogTimeoutSeconds = 10
nrOfPeers = 2
// 3 * (15 + 5) (API server)
// + 30 (MaxTimeForNoPeersResponse)
// + 10 (Watchdog)
// + 15
expectedRebootDurationSeconds = 115
})
It("GetRebootTime should return correct value", func() {
Eventually(func() (time.Duration, error) {
return calculator.GetRebootDuration(context.Background(), unhealthyNode)
}, "5s", "200ms").Should(Equal(time.Duration(expectedRebootDurationSeconds) * time.Second))
})
})

Context("with modified SNRConfig, 20 peers, and 25s watchdog timeout", func() {
BeforeEach(func() {
// modify all values used by calculator
snrConfig.Spec.ApiCheckInterval = &metav1.Duration{Duration: 25 * time.Second}
snrConfig.Spec.ApiServerTimeout = &metav1.Duration{Duration: 7 * time.Second}
snrConfig.Spec.MaxApiErrorThreshold = 4

snrConfig.Spec.PeerDialTimeout = &metav1.Duration{Duration: 11 * time.Second}
snrConfig.Spec.PeerRequestTimeout = &metav1.Duration{Duration: 13 * time.Second}

watchdogTimeoutSeconds = 25
nrOfPeers = 20 // 7 batches

// 4 * (25 + 7) = 128 (API server)
// + 7 * (11 + 13) = 168 (Peers)
// + 25 (Watchdog)
// + 15
expectedRebootDurationSeconds = 336
})
It("GetRebootTime should return correct value", func() {
Eventually(func() (time.Duration, error) {
return calculator.GetRebootDuration(context.Background(), unhealthyNode)
}, "5s", "200ms").Should(Equal(time.Duration(expectedRebootDurationSeconds) * time.Second))
})
})
})

func getNode(name string) *v1.Node {
node := &v1.Node{}
node.Name = name
node.Labels = make(map[string]string)
node.Labels["kubernetes.io/hostname"] = name
return node
}
Loading

0 comments on commit 7ddf56a

Please sign in to comment.