Skip to content

Commit

Permalink
Cherry pick- branch ENI operation op latency metrics (#487)
Browse files Browse the repository at this point in the history
* update branch ENI operation metrics & dev guide (#465)

* measure branch ENI operation latency in seconds (#469)
  • Loading branch information
sushrk authored Oct 25, 2024
1 parent bbad908 commit 88956b9
Show file tree
Hide file tree
Showing 5 changed files with 46 additions and 29 deletions.
8 changes: 6 additions & 2 deletions DEVELOPER_GUIDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,12 @@ make toolchain # Install required to develop the project

## Testing a code change

Deploy your changes to a local development cluster and run the tests against it. You will need to allowlist your account
for ENI trunking before the deployment.
Deploy your changes to a local development cluster and run the tests against it. You will need to allowlist your account for ENI trunking before the deployment.

If you are testing on an EKS beta cluster, set
```sh
BETA_CLUSTER=true
```

```sh
make apply-dependencies # install the cert manager and certificate
Expand Down
9 changes: 8 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ GOLANG_VERSION ?= $(shell cat .go-version)
BUILD_IMAGE ?= public.ecr.aws/docker/library/golang:$(GOLANG_VERSION)
GOARCH ?= amd64
PLATFORM ?= linux/amd64
USER_ROLE_ARN ?= arn:aws:iam::$(AWS_ACCOUNT):role/VPCResourceControllerRole
BETA_CLUSTER ?= false

help: ## Display help
@awk 'BEGIN {FS = ":.*##"; printf "Usage:\n make \033[36m<target>\033[0m\n"} /^[a-zA-Z_0-9-]+:.*?##/ { printf " \033[36m%-15s\033[0m %s\n", $$1, $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST)
Expand Down Expand Up @@ -51,14 +53,19 @@ toolchain: ## Install developer toolchain
./hack/toolchain.sh

apply: image check-deployment-env check-env ## Deploy controller to ~/.kube/config
ifeq ($(BETA_CLUSTER), true)
VPC_ID=$(shell aws eks describe-cluster --name ${CLUSTER_NAME} --region ${AWS_REGION} --endpoint https://api.beta.us-west-2.wesley.amazonaws.com --query "cluster.resourcesVpcConfig" --output json | jq '.vpcId')
else
VPC_ID=$(shell aws eks describe-cluster --name ${CLUSTER_NAME} --region ${AWS_REGION} --query "cluster.resourcesVpcConfig" --output json | jq '.vpcId')
endif
eksctl create iamserviceaccount vpc-resource-controller --namespace kube-system --cluster ${CLUSTER_NAME} --region ${AWS_REGION} \
--role-name VPCResourceControllerRole \
--attach-policy-arn=arn:aws:iam::aws:policy/AdministratorAccess \
--override-existing-serviceaccounts \
--approve
kustomize build config/crd | kubectl apply -f -
cd config/controller && kustomize edit set image controller=${IMAGE}
kustomize build config/default | sed "s|CLUSTER_NAME|${CLUSTER_NAME}|g;s|USER_ROLE_ARN|${USER_ROLE_ARN}|g" | kubectl apply -f -
kustomize build config/default | sed "s|CLUSTER_NAME|${CLUSTER_NAME}|g;s|USER_ROLE_ARN|${USER_ROLE_ARN}|g;s|VPC_ID|${VPC_ID}|g" | kubectl apply -f -
kubectl patch rolebinding eks-vpc-resource-controller-rolebinding -n kube-system --patch '{"subjects":[{"kind":"ServiceAccount","name":"vpc-resource-controller","namespace":"kube-system"}]}'
kubectl patch clusterrolebinding vpc-resource-controller-rolebinding --patch '{"subjects":[{"kind":"ServiceAccount","name":"vpc-resource-controller","namespace":"kube-system"}]}'

Expand Down
2 changes: 2 additions & 0 deletions config/controller/controller.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ spec:
- --role-arn=USER_ROLE_ARN
- --leader-elect
- --metrics-bind-address=:8443
- --introspect-bind-addr=:22775
- --vpc-id=VPC_ID
image: controller:latest
name: controller
resources:
Expand Down
54 changes: 29 additions & 25 deletions pkg/provider/branch/provider.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,44 +45,47 @@ import (
"sigs.k8s.io/controller-runtime/pkg/metrics"
)

// Metric label values, metric label names, and event reasons used by the
// branch ENI provider.
const (
// Operation names recorded as the value of the operationLabel metric label.
operationCreateBranchENI = "create_branch_eni"
operationAnnotateBranchENI = "annotate_branch_eni"
operationInitTrunk = "init_trunk"
// Label names attached to the branch provider Prometheus metrics.
resourceCountLabel = "resource_count"
operationLabel = "branch_provider_operation"

// Event reasons for pod-level branch ENI handling. ReasonResourceAllocated is
// broadcast once the pod has been annotated; the others are presumably used
// on the matching request/failure paths — confirm against callers.
ReasonSecurityGroupRequested = "SecurityGroupRequested"
ReasonResourceAllocated = "ResourceAllocated"
ReasonBranchAllocationFailed = "BranchAllocationFailed"
ReasonBranchENIAnnotationFailed = "BranchENIAnnotationFailed"

// Event reason for trunk ENI creation failure (usage not visible in this
// excerpt — confirm against callers).
ReasonTrunkENICreationFailed = "TrunkENICreationFailed"
)

var (
branchProviderOperationsErrCount = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "branch_provider_operations_err_count",
Help: "The number of errors encountered for branch provider operations",
},
[]string{"operation"},
[]string{operationLabel},
)

branchProviderOperationLatency = prometheus.NewSummaryVec(
prometheus.SummaryOpts{
Name: "branch_provider_operation_latency",
Help: "Branch Provider operations latency in ms",
Name: "branch_provider_operation_latency",
Help: "Branch Provider operations latency in seconds",
Objectives: map[float64]float64{0: 0, 0.5: 0.05, 0.9: 0.01, 0.99: 0.001, 1: 0},
},
[]string{"operation", "resource_count"},
[]string{operationLabel, resourceCountLabel},
)

operationCreateBranchENI = "create_branch_eni"
operationCreateBranchENIAndAnnotate = "create_and_annotate_branch_eni"
operationInitTrunk = "init_trunk"

ReasonSecurityGroupRequested = "SecurityGroupRequested"
ReasonResourceAllocated = "ResourceAllocated"
ReasonBranchAllocationFailed = "BranchAllocationFailed"
ReasonBranchENIAnnotationFailed = "BranchENIAnnotationFailed"

ReasonTrunkENICreationFailed = "TrunkENICreationFailed"

deleteQueueRequeueRequest = ctrl.Result{RequeueAfter: time.Second * 30, Requeue: true}

// NodeDeleteRequeueRequestDelay represents the time after which the resources belonging to a node will be cleaned
// up after receiving the actual node delete event.
NodeDeleteRequeueRequestDelay = time.Minute * 5

prometheusRegistered = false
)

// Sentinel errors describing the state of the trunk ENI cache.
var (
// ErrTrunkExistInCache reports that a trunk ENI is already present in the cache.
ErrTrunkExistInCache = fmt.Errorf("trunk eni already exist in cache")
// ErrTrunkNotInCache reports that no trunk ENI was found in the cache.
ErrTrunkNotInCache = fmt.Errorf("trunk eni not present in cache")
)
Expand Down Expand Up @@ -131,9 +134,9 @@ func prometheusRegister() {
}
}

// timeSinceMs returns the time since MS from the start time
func timeSinceMs(start time.Time) float64 {
return float64(time.Since(start).Milliseconds())
// timeSinceSeconds returns the time elapsed since start, expressed in seconds
// as a float64 (fractional seconds are preserved). The result is negative if
// start lies in the future.
func timeSinceSeconds(start time.Time) float64 {
	// Duration.Seconds already yields a float64, so no conversion is required.
	return time.Since(start).Seconds()
}

// InitResource initializes the resource for the given node name. The initialized trunk ENI is stored in
Expand Down Expand Up @@ -172,9 +175,9 @@ func (b *branchENIProvider) InitResource(instance ec2.EC2Instance) error {

utils.SendNodeEventWithNodeName(b.apiWrapper.K8sAPI, nodeName, utils.NodeTrunkFailedInitializationReason, "The node failed initializing trunk interface", v1.EventTypeNormal, b.log)
branchProviderOperationsErrCount.WithLabelValues("init").Inc()
return fmt.Errorf("initalizing trunk, %w", err)
return fmt.Errorf("initializing trunk, %w", err)
}
branchProviderOperationLatency.WithLabelValues(operationInitTrunk, "1").Observe(timeSinceMs(start))
branchProviderOperationLatency.WithLabelValues(operationInitTrunk, "1").Observe(timeSinceSeconds(start))

// Add the Trunk ENI to cache
if err := b.addTrunkToCache(nodeName, trunkENI); err != nil {
Expand Down Expand Up @@ -367,7 +370,7 @@ func (b *branchENIProvider) CreateAndAnnotateResources(podNamespace string, podN
}

branchProviderOperationLatency.WithLabelValues(operationCreateBranchENI, strconv.Itoa(resourceCount)).
Observe(timeSinceMs(start))
Observe(timeSinceSeconds(start))

jsonBytes, err := json.Marshal(branchENIs)
if err != nil {
Expand All @@ -377,6 +380,7 @@ func (b *branchENIProvider) CreateAndAnnotateResources(podNamespace string, podN
return ctrl.Result{}, err
}

start = time.Now()
// Annotate the pod with the created resources
err = b.apiWrapper.PodAPI.AnnotatePod(pod.Namespace, pod.Name, pod.UID,
config.ResourceNamePodENI, string(jsonBytes))
Expand All @@ -393,8 +397,8 @@ func (b *branchENIProvider) CreateAndAnnotateResources(podNamespace string, podN
b.apiWrapper.K8sAPI.BroadcastEvent(pod, ReasonResourceAllocated,
fmt.Sprintf("Allocated %s to the pod", string(jsonBytes)), v1.EventTypeNormal)

branchProviderOperationLatency.WithLabelValues(operationCreateBranchENIAndAnnotate, strconv.Itoa(resourceCount)).
Observe(timeSinceMs(start))
branchProviderOperationLatency.WithLabelValues(operationAnnotateBranchENI, strconv.Itoa(resourceCount)).
Observe(timeSinceSeconds(start))

log.Info("created and annotated branch interface/s successfully", "branches", branchENIs)

Expand Down
2 changes: 1 addition & 1 deletion scripts/test/lib/config.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ function add_suffix() {

# IAM Role Name for Linux Node Role where VPC Resource Controller Runs. It should
# have the Trunk Association Policy
TRUNK_ASSOC_POLICY_NAME=$(add_suffix "AssociateTrunkInterfcePolicy")
TRUNK_ASSOC_POLICY_NAME=$(add_suffix "AssociateTrunkInterfacePolicy")
INSTANCE_ROLE_NAME=$(add_suffix "LinuxNodeRole")

# IAM Role and its Policy Names which have the permission to manage Trunk/Branch
Expand Down

0 comments on commit 88956b9

Please sign in to comment.