Skip to content

Commit

Permalink
Some fixes and e2e test improvements:
Browse files Browse the repository at this point in the history
- re-enable and fix api check log tests in e2e test
  - use service IP for killing API connection
  - kill API connection on SNR DS pod
  - add peer check server logs and use them for test which can't
    get logs from unhealthy node's SNR agent pod
  - wait for pod deletion only, not restart (restart is caused by
    reboot, not SNR)
- refactor / cleanup e2e tests
- fix owner check / node name / machine name in peer check server
  and agent reconciler
- update sort-imports, which ignores generated files now
  • Loading branch information
clobrano authored and slintes committed Jul 9, 2024
1 parent 3239f53 commit 6f52c77
Show file tree
Hide file tree
Showing 59 changed files with 6,557 additions and 617 deletions.
6 changes: 3 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ KUSTOMIZE_VERSION = v5.3.0
ENVTEST_VERSION = v0.0.0-20240215124517-56159419231e

# versions at https://github.com/slintes/sort-imports/tags
SORT_IMPORTS_VERSION = v0.2.1
SORT_IMPORTS_VERSION = v0.3.0

# version at https://github.com/a8m/envsubst/releases
ENVSUBST_VERSION = v1.4.2
Expand Down Expand Up @@ -415,9 +415,9 @@ protoc-gen-go-grpc: ## Download protoc-gen-go-grpc locally if necessary.

.PHONY: e2e-test
e2e-test:
# KUBECONFIG must be set to the cluster, and PP needs to be deployed already
# KUBECONFIG must be set to the cluster, and SNR needs to be deployed already
# count arg makes the test ignoring cached test results
go test ./e2e -ginkgo.vv -test.v -timeout 60m -count=1 ${TEST_OPS}
go test ./e2e -ginkgo.vv -test.v -timeout 80m -count=1 ${TEST_OPS}

YQ = $(shell pwd)/bin/yq
.PHONY: yq
Expand Down
96 changes: 96 additions & 0 deletions controllers/owner_and_name.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
package controllers

import (
"context"
"errors"

"github.com/go-logr/logr"
commonAnnotations "github.com/medik8s/common/pkg/annotations"

metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"sigs.k8s.io/controller-runtime/pkg/client"

"github.com/openshift/api/machine/v1beta1"

"github.com/medik8s/self-node-remediation/api/v1alpha1"
)

// GetNodeName gets the node name:
// - if owned by NHC, or as fallback, from annotation or CR name
// - if owned by a Machine, from the Machine's node reference
func GetNodeName(ctx context.Context, c client.Client, snr *v1alpha1.SelfNodeRemediation, log logr.Logger) (string, error) {
// NHC has priority, so check it first: in case the SNR is owned by NHC, get the node name from annotation or CR name
if ownedByNHC, _ := IsOwnedByNHC(snr); ownedByNHC {
return getNodeNameDirect(snr), nil
}
// in case the SNR is owned by a Machine, we need to check the Machine's nodeRef
if ownedByMachine, ref := IsOwnedByMachine(snr); ownedByMachine {
return getNodeNameFromMachine(ctx, c, ref, snr.GetNamespace(), log)
}
// fallback: annotation or name
return getNodeNameDirect(snr), nil
}

func getNodeNameDirect(snr *v1alpha1.SelfNodeRemediation) string {
nodeName, isNodeNameAnnotationExist := snr.GetAnnotations()[commonAnnotations.NodeNameAnnotation]
if isNodeNameAnnotationExist {
return nodeName
}
return snr.GetName()
}

// IsOwnedByNHC checks if the SNR CR is owned by a NodeHealthCheck CR.
func IsOwnedByNHC(snr *v1alpha1.SelfNodeRemediation) (bool, *metav1.OwnerReference) {
for _, ownerRef := range snr.OwnerReferences {
if ownerRef.Kind == "NodeHealthCheck" {
return true, &ownerRef
}
}
return false, nil
}

// IsOwnedByMachine checks if the SNR CR is owned by a Machine CR.
func IsOwnedByMachine(snr *v1alpha1.SelfNodeRemediation) (bool, *metav1.OwnerReference) {
for _, ownerRef := range snr.OwnerReferences {
if ownerRef.Kind == "Machine" {
return true, &ownerRef
}
}
return false, nil
}

// IsSNRMatching checks if the SNR CR is matching the node or machine name,
// and additionally returns the node name for the SNR in case machineName is empty
func IsSNRMatching(ctx context.Context, c client.Client, snr *v1alpha1.SelfNodeRemediation, nodeName string, machineName string, log logr.Logger) (bool, string, error) {
if isOwnedByMachine, ref := IsOwnedByMachine(snr); isOwnedByMachine && machineName == ref.Name {
return true, "", nil
}
snrNodeName, err := GetNodeName(ctx, c, snr, log)
if err != nil {
log.Error(err, "failed to get node name from machine")
return false, "", err
}
return snrNodeName == nodeName, snrNodeName, nil
}

func getNodeNameFromMachine(ctx context.Context, c client.Client, ref *metav1.OwnerReference, ns string, log logr.Logger) (string, error) {
machine := &v1beta1.Machine{}
machineKey := client.ObjectKey{
Name: ref.Name,
Namespace: ns,
}

if err := c.Get(ctx, machineKey, machine); err != nil {
log.Error(err, "failed to get machine from SelfNodeRemediation CR owner ref",
"machine name", machineKey.Name, "namespace", machineKey.Namespace)
return "", err
}

if machine.Status.NodeRef == nil {
err := errors.New("nodeRef is nil")
log.Error(err, "failed to retrieve node from the unhealthy machine")
return "", err
}

return machine.Status.NodeRef.Name, nil
}
90 changes: 14 additions & 76 deletions controllers/selfnoderemediation_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ import (
"time"

"github.com/go-logr/logr"
commonAnnotations "github.com/medik8s/common/pkg/annotations"
"github.com/medik8s/common/pkg/events"
"github.com/medik8s/common/pkg/resources"
"github.com/pkg/errors"
Expand All @@ -39,8 +38,6 @@ import (
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"

"github.com/openshift/api/machine/v1beta1"

"github.com/medik8s/self-node-remediation/api/v1alpha1"
"github.com/medik8s/self-node-remediation/pkg/reboot"
"github.com/medik8s/self-node-remediation/pkg/utils"
Expand Down Expand Up @@ -172,8 +169,12 @@ func (r *SelfNodeRemediationReconciler) ReconcileAgent(ctx context.Context, req
return ctrl.Result{}, err
}

targetNodeName := getNodeName(snr)
if targetNodeName != r.MyNodeName {
snrMatches, targetNodeName, err := IsSNRMatching(ctx, r.Client, snr, r.MyNodeName, "", r.logger)
if err != nil {
r.logger.Error(err, "failed to check if SNR matches our node")
return ctrl.Result{}, err
}
if !snrMatches {
r.logger.Info("agent pod skipping remediation because node belongs to a different agent", "Agent node name", r.MyNodeName, "Remediated node name", targetNodeName)
return ctrl.Result{}, nil
}
Expand All @@ -183,7 +184,7 @@ func (r *SelfNodeRemediationReconciler) ReconcileAgent(ctx context.Context, req
switch phase {
case preRebootCompletedPhase:
r.logger.Info("node reboot not completed yet, start rebooting")
node, err := r.getNodeFromSnr(snr)
node, err := r.getNodeFromSnr(ctx, snr)
if err != nil {
r.logger.Info("didn't find node, eventing might be incomplete", "node name", targetNodeName)
}
Expand Down Expand Up @@ -254,7 +255,7 @@ func (r *SelfNodeRemediationReconciler) ReconcileManager(ctx context.Context, re
result := ctrl.Result{}
var err error

node, err := r.getNodeFromSnr(snr)
node, err := r.getNodeFromSnr(ctx, snr)
if err != nil {
if apiErrors.IsNotFound(err) {
r.logger.Info("couldn't find node matching remediation", "remediation name", snr.Name)
Expand Down Expand Up @@ -688,65 +689,20 @@ func (r *SelfNodeRemediationReconciler) setTimeAssumedRebooted(ctx context.Conte
}

// getNodeFromSnr returns the unhealthy node reported in the given snr
func (r *SelfNodeRemediationReconciler) getNodeFromSnr(snr *v1alpha1.SelfNodeRemediation) (*v1.Node, error) {
// SNR could be created by either machine based controller (e.g. MHC) or
// by a node based controller (e.g. NHC).
// In case snr is created with machine owner reference if NHC isn't it's owner it means
// it was created by a machine based controller (e.g. MHC).
if !IsOwnedByNHC(snr) {
for _, ownerRef := range snr.OwnerReferences {
if ownerRef.Kind == "Machine" {
return r.getNodeFromMachine(ownerRef, snr.Namespace)
}
}
}

// since we didn't find a Machine owner ref, we assume that SNR remediation contains the node's name either in the
// remediation name or in its annotation
node := &v1.Node{}
key := client.ObjectKey{
Name: getNodeName(snr),
Namespace: "",
}

if err := r.Get(context.TODO(), key, node); err != nil {
return nil, err
}

return node, nil
}

func (r *SelfNodeRemediationReconciler) getNodeFromMachine(ref metav1.OwnerReference, ns string) (*v1.Node, error) {
machine := &v1beta1.Machine{}
machineKey := client.ObjectKey{
Name: ref.Name,
Namespace: ns,
}

if err := r.Client.Get(context.Background(), machineKey, machine); err != nil {
r.logger.Error(err, "failed to get machine from SelfNodeRemediation CR owner ref",
"machine name", machineKey.Name, "namespace", machineKey.Namespace)
return nil, err
}

if machine.Status.NodeRef == nil {
err := errors.New("nodeRef is nil")
r.logger.Error(err, "failed to retrieve node from the unhealthy machine")
func (r *SelfNodeRemediationReconciler) getNodeFromSnr(ctx context.Context, snr *v1alpha1.SelfNodeRemediation) (*v1.Node, error) {
nodeName, err := GetNodeName(ctx, r.Client, snr, r.logger)
if err != nil {
return nil, err
}

node := &v1.Node{}
key := client.ObjectKey{
Name: machine.Status.NodeRef.Name,
Namespace: machine.Status.NodeRef.Namespace,
Name: nodeName,
Namespace: "",
}

if err := r.Get(context.Background(), key, node); err != nil {
r.logger.Error(err, "failed to retrieve node from the unhealthy machine",
"node name", node.Name, "machine name", machine.Name)
if err := r.Get(ctx, key, node); err != nil {
return nil, err
}

return node, nil
}

Expand Down Expand Up @@ -950,21 +906,3 @@ func (r *SelfNodeRemediationReconciler) getRuntimeStrategy(snr *v1alpha1.SelfNod
return remediationStrategy

}

func IsOwnedByNHC(snr *v1alpha1.SelfNodeRemediation) bool {
for _, ownerRef := range snr.OwnerReferences {
if ownerRef.Kind == "NodeHealthCheck" {
return true
}
}
return false
}

// getNodeName checks for the node name in SNR CR's annotation. If it does not exist it assumes the node name equals to SNR CR's name and returns it.
func getNodeName(snr *v1alpha1.SelfNodeRemediation) string {
nodeName, isNodeNameAnnotationExist := snr.GetAnnotations()[commonAnnotations.NodeNameAnnotation]
if isNodeNameAnnotationExist {
return nodeName
}
return snr.GetName()
}
Original file line number Diff line number Diff line change
Expand Up @@ -419,7 +419,7 @@ var _ = Describe("SNR Controller", func() {
verifyEvent("Warning", "RemediationCannotStart", "Could not get remediation target Node")
})
})
When("NHC isn set as owner in the remediation", func() {
When("NHC is set as owner in the remediation", func() {
BeforeEach(func() {
snr.OwnerReferences = append(snr.OwnerReferences, metav1.OwnerReference{Name: "nhc", Kind: "NodeHealthCheck", APIVersion: "remediation.medik8s.io/v1alpha1", UID: "12345"})
})
Expand Down Expand Up @@ -625,7 +625,7 @@ func removeUnschedulableTaint() {
func verifyNodeIsUnschedulable() *v1.Node {
By("Verify that node was marked as unschedulable")
node := &v1.Node{}
Eventually(func() (bool, error) {
EventuallyWithOffset(1, func() (bool, error) {
err := k8sClient.Client.Get(context.TODO(), unhealthyNodeNamespacedName, node)
return node.Spec.Unschedulable, err
}, 5*time.Second, 250*time.Millisecond).Should(BeTrue(), "node should be marked as unschedulable")
Expand Down
Loading

0 comments on commit 6f52c77

Please sign in to comment.