Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Controller Revision (Implementation of KEP #238) #277

Merged
merged 27 commits into from
Dec 28, 2024
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
db8a7c5
initial working prototype of controller revision
Edwinhr716 Nov 8, 2024
df4872b
update value of currentRevision after update is done
Edwinhr716 Nov 15, 2024
de9dd70
switched names from sts to lws
Edwinhr716 Nov 15, 2024
267af37
test changes
Edwinhr716 Nov 27, 2024
7389ea1
added unit, integration, and e2e tests
Edwinhr716 Dec 5, 2024
5a49f88
cleanup and added integration test for collisionCount
Edwinhr716 Dec 6, 2024
f28995a
fix lint
Edwinhr716 Dec 6, 2024
1f5adfa
changed from string to const on labels
Edwinhr716 Dec 9, 2024
1b978cf
changed revision logic based on updated design
Edwinhr716 Dec 16, 2024
522f1b0
removed status changes, cleaned up tests that referenced them. Implem…
Edwinhr716 Dec 20, 2024
9436aeb
addressed comments, refactored
Edwinhr716 Dec 23, 2024
8d3da19
rebased
Edwinhr716 Dec 23, 2024
5fa3b34
fixed failing tests and lint error
Edwinhr716 Dec 23, 2024
6a29f29
fixed lint, again
Edwinhr716 Dec 23, 2024
e9d6131
refactored revision code, added fix for PodGroupRestart bug and an e2…
Edwinhr716 Dec 26, 2024
7a19d85
addressed third round of comments
Edwinhr716 Dec 27, 2024
8b66d18
removed blank space
Edwinhr716 Dec 27, 2024
4d2ec47
addressed comments, round 4
Edwinhr716 Dec 27, 2024
949bc68
further changes from templateHash to RevisionKey
Edwinhr716 Dec 27, 2024
6b292d7
further changing from templateHash to revisionKey
Edwinhr716 Dec 28, 2024
a065e93
minor fixes
Edwinhr716 Dec 28, 2024
8b62baa
fixed all tests failing, still need to debug other tests
Edwinhr716 Dec 28, 2024
897e009
added log messages to listRevision for debugging
Edwinhr716 Dec 28, 2024
7822c18
fixed bug with getHighestRevision
Edwinhr716 Dec 28, 2024
cafadfc
added log messages for create revision function
Edwinhr716 Dec 28, 2024
a856b3d
removed fetch after creation, not needed
Edwinhr716 Dec 28, 2024
1faace7
removed e2e tests, will be added as integration tests intead
Edwinhr716 Dec 28, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 2 additions & 4 deletions api/leaderworkerset/v1/leaderworkerset_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,10 +58,8 @@ const (
// Worker pods will have an annotation that is the leader pod's name.
LeaderPodNameAnnotationKey string = "leaderworkerset.sigs.k8s.io/leader-name"

// SHAed leaderWorkerTemplate value for version tracking.
// This will be applied to all API objects including:
// leaderStatefulset, leaderPods, workerStatefulsets, workerPods.
TemplateRevisionHashKey string = "leaderworkerset.sigs.k8s.io/template-revision-hash"
// Hash to track the controller revision that matches an LWS object
RevisionKey string = "leaderworkerset.sigs.k8s.io/template-revision-hash"

// Environment variable added to all containers in the LeaderWorkerSet to
// address the leader via the headless service.
Expand Down
3 changes: 3 additions & 0 deletions config/rbac/role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ rules:
- apiGroups:
- apps
resources:
- controllerrevisions
- statefulsets
verbs:
- create
Expand All @@ -78,12 +79,14 @@ rules:
- apiGroups:
- apps
resources:
- controllerrevisions/finalizers
- statefulsets/finalizers
verbs:
- update
- apiGroups:
- apps
resources:
- controllerrevisions/status
- statefulsets/status
verbs:
- get
Expand Down
170 changes: 123 additions & 47 deletions pkg/controllers/leaderworkerset_controller.go

Large diffs are not rendered by default.

43 changes: 32 additions & 11 deletions pkg/controllers/leaderworkerset_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ limitations under the License.
package controllers

import (
"context"
"testing"

"github.com/google/go-cmp/cmp"
Expand All @@ -30,24 +31,40 @@ import (
"k8s.io/utils/ptr"

leaderworkerset "sigs.k8s.io/lws/api/leaderworkerset/v1"
"sigs.k8s.io/lws/pkg/utils"

"sigs.k8s.io/controller-runtime/pkg/client/fake"
revisionutils "sigs.k8s.io/lws/pkg/utils/revision"
testutils "sigs.k8s.io/lws/test/testutils"
)

func TestLeaderStatefulSetApplyConfig(t *testing.T) {
hash1 := utils.LeaderWorkerTemplateHash(testutils.BuildBasicLeaderWorkerSet("test-sample", "default").
client := fake.NewClientBuilder().Build()
lws1 := testutils.BuildBasicLeaderWorkerSet("test-sample", "default").
LeaderTemplateSpec(testutils.MakeLeaderPodSpec()).
WorkerTemplateSpec(testutils.MakeWorkerPodSpec()).Obj())
hash2 := utils.LeaderWorkerTemplateHash(testutils.BuildBasicLeaderWorkerSet("test-sample", "default").
WorkerTemplateSpec(testutils.MakeWorkerPodSpec()).Obj())
WorkerTemplateSpec(testutils.MakeWorkerPodSpec()).Obj()
cr1, err := revisionutils.NewRevision(context.TODO(), client, lws1, "")
if err != nil {
t.Fatal(err)
}
hash1 := revisionutils.GetRevisionKey(cr1)

lws2 := testutils.BuildBasicLeaderWorkerSet("test-sample", "default").
WorkerTemplateSpec(testutils.MakeWorkerPodSpec()).Obj()
cr2, err := revisionutils.NewRevision(context.TODO(), client, lws2, "")
if err != nil {
t.Fatal(err)
}
hash2 := revisionutils.GetRevisionKey(cr2)

tests := []struct {
name string
templateHash string
lws *leaderworkerset.LeaderWorkerSet
wantApplyConfig *appsapplyv1.StatefulSetApplyConfiguration
}{
{
name: "1 replica, size 1, with empty leader template, exclusive placement disabled",
name: "1 replica, size 1, with empty leader template, exclusive placement disabled",
templateHash: hash2,
lws: testutils.BuildBasicLeaderWorkerSet("test-sample", "default").
Replica(1).
RolloutStrategy(leaderworkerset.RolloutStrategy{
Expand Down Expand Up @@ -112,7 +129,8 @@ func TestLeaderStatefulSetApplyConfig(t *testing.T) {
},
},
{
name: "1 replica, size 2 , with empty leader template, exclusive placement enabled",
name: "1 replica, size 2 , with empty leader template, exclusive placement enabled",
templateHash: hash2,
lws: testutils.BuildBasicLeaderWorkerSet("test-sample", "default").
Annotation(map[string]string{
"leaderworkerset.sigs.k8s.io/exclusive-topology": "topologyKey",
Expand Down Expand Up @@ -180,7 +198,8 @@ func TestLeaderStatefulSetApplyConfig(t *testing.T) {
},
},
{
name: "2 replica, size 2, with leader template, exclusive placement enabled",
name: "2 replica, size 2, with leader template, exclusive placement enabled",
templateHash: hash1,
lws: testutils.BuildBasicLeaderWorkerSet("test-sample", "default").Annotation(map[string]string{
"leaderworkerset.sigs.k8s.io/exclusive-topology": "topologyKey",
}).Replica(2).
Expand Down Expand Up @@ -247,7 +266,8 @@ func TestLeaderStatefulSetApplyConfig(t *testing.T) {
},
},
{
name: "2 maxUnavailable, 1 maxSurge, with empty leader template, exclusive placement disabled",
name: "2 maxUnavailable, 1 maxSurge, with empty leader template, exclusive placement disabled",
templateHash: hash2,
lws: testutils.BuildBasicLeaderWorkerSet("test-sample", "default").
Replica(1).
RolloutStrategy(leaderworkerset.RolloutStrategy{
Expand Down Expand Up @@ -313,7 +333,8 @@ func TestLeaderStatefulSetApplyConfig(t *testing.T) {
},
},
{
name: "1 replica, size 2, with leader template, exclusive placement enabled, subgroupsize enabled",
name: "1 replica, size 2, with leader template, exclusive placement enabled, subgroupsize enabled",
templateHash: hash1,
lws: testutils.BuildBasicLeaderWorkerSet("test-sample", "default").Annotation(map[string]string{
leaderworkerset.SubGroupExclusiveKeyAnnotationKey: "topologyKey",
}).SubGroupSize(2).Replica(1).
Expand Down Expand Up @@ -383,7 +404,7 @@ func TestLeaderStatefulSetApplyConfig(t *testing.T) {
}
for _, tc := range tests {
t.Run(tc.name, func(t *testing.T) {
stsApplyConfig, err := constructLeaderStatefulSetApplyConfiguration(tc.lws, 0, *tc.lws.Spec.Replicas)
stsApplyConfig, err := constructLeaderStatefulSetApplyConfiguration(tc.lws, 0, *tc.lws.Spec.Replicas, tc.templateHash)
if err != nil {
t.Errorf("failed with error: %s", err.Error())
}
Expand Down
23 changes: 18 additions & 5 deletions pkg/controllers/pod_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ import (
acceleratorutils "sigs.k8s.io/lws/pkg/utils/accelerators"
controllerutils "sigs.k8s.io/lws/pkg/utils/controller"
podutils "sigs.k8s.io/lws/pkg/utils/pod"
revisionutils "sigs.k8s.io/lws/pkg/utils/revision"
statefulsetutils "sigs.k8s.io/lws/pkg/utils/statefulset"
)

Expand Down Expand Up @@ -118,8 +119,12 @@ func (r *PodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.R
log.V(2).Info("defer the creation of the worker statefulset because leader pod is not ready.")
return ctrl.Result{}, nil
}

statefulSet, err := constructWorkerStatefulSetApplyConfiguration(pod, leaderWorkerSet)
revision, err := revisionutils.GetRevision(ctx, r.Client, &leaderWorkerSet, pod.Labels[leaderworkerset.RevisionKey])
Edwinhr716 marked this conversation as resolved.
Show resolved Hide resolved
if err != nil {
log.Error(err, "Getting lws revisions")
return ctrl.Result{}, err
}
statefulSet, err := constructWorkerStatefulSetApplyConfiguration(pod, leaderWorkerSet, revision)
if err != nil {
return ctrl.Result{}, err
}
Expand Down Expand Up @@ -180,6 +185,10 @@ func (r *PodReconciler) handleRestartPolicy(ctx context.Context, pod corev1.Pod,
if err := r.Get(ctx, types.NamespacedName{Name: leaderPodName, Namespace: pod.Namespace}, &leader); err != nil {
return false, err
}
// Different revision key means that this pod will be deleted soon and alternative will be created with the matching key
if leader.Labels[leaderworkerset.RevisionKey] != pod.Labels[leaderworkerset.RevisionKey] {
Edwinhr716 marked this conversation as resolved.
Show resolved Hide resolved
return false, nil
}
} else {
leader = pod
}
Expand Down Expand Up @@ -259,8 +268,12 @@ func setControllerReferenceWithStatefulSet(owner metav1.Object, sts *appsapplyv1
}

// constructWorkerStatefulSetApplyConfiguration constructs the applied configuration for the leader StatefulSet
func constructWorkerStatefulSetApplyConfiguration(leaderPod corev1.Pod, lws leaderworkerset.LeaderWorkerSet) (*appsapplyv1.StatefulSetApplyConfiguration, error) {
podTemplateSpec := *lws.Spec.LeaderWorkerTemplate.WorkerTemplate.DeepCopy()
func constructWorkerStatefulSetApplyConfiguration(leaderPod corev1.Pod, lws leaderworkerset.LeaderWorkerSet, currentRevision *appsv1.ControllerRevision) (*appsapplyv1.StatefulSetApplyConfiguration, error) {
currentLws, err := revisionutils.ApplyRevision(&lws, currentRevision)
if err != nil {
return nil, err
}
podTemplateSpec := *currentLws.Spec.LeaderWorkerTemplate.WorkerTemplate.DeepCopy()
// construct pod template spec configuration
obj, err := runtime.DefaultUnstructuredConverter.ToUnstructured(&podTemplateSpec)
if err != nil {
Expand All @@ -280,7 +293,7 @@ func constructWorkerStatefulSetApplyConfiguration(leaderPod corev1.Pod, lws lead
leaderworkerset.GroupIndexLabelKey: leaderPod.Labels[leaderworkerset.GroupIndexLabelKey],
leaderworkerset.SetNameLabelKey: lws.Name,
leaderworkerset.GroupUniqueHashLabelKey: leaderPod.Labels[leaderworkerset.GroupUniqueHashLabelKey],
leaderworkerset.TemplateRevisionHashKey: leaderPod.Labels[leaderworkerset.TemplateRevisionHashKey],
leaderworkerset.RevisionKey: leaderPod.Labels[leaderworkerset.RevisionKey],
Edwinhr716 marked this conversation as resolved.
Show resolved Hide resolved
}

podTemplateApplyConfiguration.WithLabels(labelMap)
Expand Down
Loading