From 88956b96c9ba5e32b5aeb0fe77219f77166efd7f Mon Sep 17 00:00:00 2001 From: Sushmitha Ravikumar <58063229+sushrk@users.noreply.github.com> Date: Fri, 25 Oct 2024 15:26:43 -0700 Subject: [PATCH] Cherry pick- branch ENI operation op latency metrics (#487) * update branch ENI operation metrics & dev guide (#465) * measure branch ENI operation latency in seconds (#469) --- DEVELOPER_GUIDE.md | 8 +++-- Makefile | 9 +++++- config/controller/controller.yaml | 2 ++ pkg/provider/branch/provider.go | 54 +++++++++++++++++-------------- scripts/test/lib/config.sh | 2 +- 5 files changed, 46 insertions(+), 29 deletions(-) diff --git a/DEVELOPER_GUIDE.md b/DEVELOPER_GUIDE.md index 2e1603ab..4b48b99c 100644 --- a/DEVELOPER_GUIDE.md +++ b/DEVELOPER_GUIDE.md @@ -8,8 +8,12 @@ make toolchain # Install required to develop the project ## Testing a code change -Deploy your changes to a local development cluster and run the tests against it. You will need to allowlist your account -for ENI trunking before the deployment. +Deploy your changes to a local development cluster and run the tests against it. You will need to allowlist your account for ENI trunking before the deployment. 
+ +If you are testing on an EKS beta cluster, set +```sh +BETA_CLUSTER=true +``` ```sh make apply-dependencies # install the cert manager and certificate diff --git a/Makefile b/Makefile index 2297820b..34dfe68c 100644 --- a/Makefile +++ b/Makefile @@ -16,6 +16,8 @@ GOLANG_VERSION ?= $(shell cat .go-version) BUILD_IMAGE ?= public.ecr.aws/docker/library/golang:$(GOLANG_VERSION) GOARCH ?= amd64 PLATFORM ?= linux/amd64 +USER_ROLE_ARN ?= arn:aws:iam::$(AWS_ACCOUNT):role/VPCResourceControllerRole +BETA_CLUSTER ?= false help: ## Display help @awk 'BEGIN {FS = ":.*##"; printf "Usage:\n make \033[36m\033[0m\n"} /^[a-zA-Z_0-9-]+:.*?##/ { printf " \033[36m%-15s\033[0m %s\n", $$1, $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST) @@ -51,6 +53,11 @@ toolchain: ## Install developer toolchain ./hack/toolchain.sh apply: image check-deployment-env check-env ## Deploy controller to ~/.kube/config +ifeq ($(BETA_CLUSTER), true) + VPC_ID=$(shell aws eks describe-cluster --name ${CLUSTER_NAME} --region ${AWS_REGION} --endpoint https://api.beta.us-west-2.wesley.amazonaws.com --query "cluster.resourcesVpcConfig" --output json | jq '.vpcId') +else + VPC_ID=$(shell aws eks describe-cluster --name ${CLUSTER_NAME} --region ${AWS_REGION} --query "cluster.resourcesVpcConfig" --output json | jq '.vpcId') +endif eksctl create iamserviceaccount vpc-resource-controller --namespace kube-system --cluster ${CLUSTER_NAME} --region ${AWS_REGION} \ --role-name VPCResourceControllerRole \ --attach-policy-arn=arn:aws:iam::aws:policy/AdministratorAccess \ @@ -58,7 +65,7 @@ apply: image check-deployment-env check-env ## Deploy controller to ~/.kube/conf --approve kustomize build config/crd | kubectl apply -f - cd config/controller && kustomize edit set image controller=${IMAGE} - kustomize build config/default | sed "s|CLUSTER_NAME|${CLUSTER_NAME}|g;s|USER_ROLE_ARN|${USER_ROLE_ARN}|g" | kubectl apply -f - + kustomize build config/default | sed 
"s|CLUSTER_NAME|${CLUSTER_NAME}|g;s|USER_ROLE_ARN|${USER_ROLE_ARN}|g;s|VPC_ID|${VPC_ID}|g" | kubectl apply -f - kubectl patch rolebinding eks-vpc-resource-controller-rolebinding -n kube-system --patch '{"subjects":[{"kind":"ServiceAccount","name":"vpc-resource-controller","namespace":"kube-system"}]}' kubectl patch clusterrolebinding vpc-resource-controller-rolebinding --patch '{"subjects":[{"kind":"ServiceAccount","name":"vpc-resource-controller","namespace":"kube-system"}]}' diff --git a/config/controller/controller.yaml b/config/controller/controller.yaml index 951daf12..212693f5 100644 --- a/config/controller/controller.yaml +++ b/config/controller/controller.yaml @@ -33,6 +33,8 @@ spec: - --role-arn=USER_ROLE_ARN - --leader-elect - --metrics-bind-address=:8443 + - --introspect-bind-addr=:22775 + - --vpc-id=VPC_ID image: controller:latest name: controller resources: diff --git a/pkg/provider/branch/provider.go b/pkg/provider/branch/provider.go index 749b4bdc..b52f9504 100644 --- a/pkg/provider/branch/provider.go +++ b/pkg/provider/branch/provider.go @@ -45,34 +45,39 @@ import ( "sigs.k8s.io/controller-runtime/pkg/metrics" ) +const ( + operationCreateBranchENI = "create_branch_eni" + operationAnnotateBranchENI = "annotate_branch_eni" + operationInitTrunk = "init_trunk" + resourceCountLabel = "resource_count" + operationLabel = "branch_provider_operation" + + ReasonSecurityGroupRequested = "SecurityGroupRequested" + ReasonResourceAllocated = "ResourceAllocated" + ReasonBranchAllocationFailed = "BranchAllocationFailed" + ReasonBranchENIAnnotationFailed = "BranchENIAnnotationFailed" + + ReasonTrunkENICreationFailed = "TrunkENICreationFailed" +) + var ( branchProviderOperationsErrCount = prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "branch_provider_operations_err_count", Help: "The number of errors encountered for branch provider operations", }, - []string{"operation"}, + []string{operationLabel}, ) branchProviderOperationLatency = 
prometheus.NewSummaryVec( prometheus.SummaryOpts{ - Name: "branch_provider_operation_latency", - Help: "Branch Provider operations latency in ms", + Name: "branch_provider_operation_latency", + Help: "Branch Provider operations latency in seconds", + Objectives: map[float64]float64{0: 0, 0.5: 0.05, 0.9: 0.01, 0.99: 0.001, 1: 0}, }, - []string{"operation", "resource_count"}, + []string{operationLabel, resourceCountLabel}, ) - operationCreateBranchENI = "create_branch_eni" - operationCreateBranchENIAndAnnotate = "create_and_annotate_branch_eni" - operationInitTrunk = "init_trunk" - - ReasonSecurityGroupRequested = "SecurityGroupRequested" - ReasonResourceAllocated = "ResourceAllocated" - ReasonBranchAllocationFailed = "BranchAllocationFailed" - ReasonBranchENIAnnotationFailed = "BranchENIAnnotationFailed" - - ReasonTrunkENICreationFailed = "TrunkENICreationFailed" - deleteQueueRequeueRequest = ctrl.Result{RequeueAfter: time.Second * 30, Requeue: true} // NodeDeleteRequeueRequestDelay represents the time after which the resources belonging to a node will be cleaned @@ -80,9 +85,7 @@ var ( NodeDeleteRequeueRequestDelay = time.Minute * 5 prometheusRegistered = false -) -var ( ErrTrunkExistInCache = fmt.Errorf("trunk eni already exist in cache") ErrTrunkNotInCache = fmt.Errorf("trunk eni not present in cache") ) @@ -131,9 +134,9 @@ func prometheusRegister() { } } -// timeSinceMs returns the time since MS from the start time -func timeSinceMs(start time.Time) float64 { - return float64(time.Since(start).Milliseconds()) +// timeSinceSeconds returns the time elapsed in seconds from the start time +func timeSinceSeconds(start time.Time) float64 { + return float64(time.Since(start).Seconds()) } // InitResources initialized the resource for the given node name. 
The initialized trunk ENI is stored in @@ -172,9 +175,9 @@ func (b *branchENIProvider) InitResource(instance ec2.EC2Instance) error { utils.SendNodeEventWithNodeName(b.apiWrapper.K8sAPI, nodeName, utils.NodeTrunkFailedInitializationReason, "The node failed initializing trunk interface", v1.EventTypeNormal, b.log) branchProviderOperationsErrCount.WithLabelValues("init").Inc() - return fmt.Errorf("initalizing trunk, %w", err) + return fmt.Errorf("initializing trunk, %w", err) } - branchProviderOperationLatency.WithLabelValues(operationInitTrunk, "1").Observe(timeSinceMs(start)) + branchProviderOperationLatency.WithLabelValues(operationInitTrunk, "1").Observe(timeSinceSeconds(start)) // Add the Trunk ENI to cache if err := b.addTrunkToCache(nodeName, trunkENI); err != nil { @@ -367,7 +370,7 @@ func (b *branchENIProvider) CreateAndAnnotateResources(podNamespace string, podN } branchProviderOperationLatency.WithLabelValues(operationCreateBranchENI, strconv.Itoa(resourceCount)). - Observe(timeSinceMs(start)) + Observe(timeSinceSeconds(start)) jsonBytes, err := json.Marshal(branchENIs) if err != nil { @@ -377,6 +380,7 @@ func (b *branchENIProvider) CreateAndAnnotateResources(podNamespace string, podN return ctrl.Result{}, err } + start = time.Now() // Annotate the pod with the created resources err = b.apiWrapper.PodAPI.AnnotatePod(pod.Namespace, pod.Name, pod.UID, config.ResourceNamePodENI, string(jsonBytes)) @@ -393,8 +397,8 @@ func (b *branchENIProvider) CreateAndAnnotateResources(podNamespace string, podN b.apiWrapper.K8sAPI.BroadcastEvent(pod, ReasonResourceAllocated, fmt.Sprintf("Allocated %s to the pod", string(jsonBytes)), v1.EventTypeNormal) - branchProviderOperationLatency.WithLabelValues(operationCreateBranchENIAndAnnotate, strconv.Itoa(resourceCount)). - Observe(timeSinceMs(start)) + branchProviderOperationLatency.WithLabelValues(operationAnnotateBranchENI, strconv.Itoa(resourceCount)). 
+ Observe(timeSinceSeconds(start)) log.Info("created and annotated branch interface/s successfully", "branches", branchENIs) diff --git a/scripts/test/lib/config.sh b/scripts/test/lib/config.sh index 92ab08bf..6bd013c9 100644 --- a/scripts/test/lib/config.sh +++ b/scripts/test/lib/config.sh @@ -14,7 +14,7 @@ function add_suffix() { # IAM Role Name for Linux Node Role where VPC Resource Controller Runs. It should # have the Trunk Association Policy -TRUNK_ASSOC_POLICY_NAME=$(add_suffix "AssociateTrunkInterfcePolicy") +TRUNK_ASSOC_POLICY_NAME=$(add_suffix "AssociateTrunkInterfacePolicy") INSTANCE_ROLE_NAME=$(add_suffix "LinuxNodeRole") # IAM Role and it's Policy Names which have the permission to manage Trunk/Branch