Commit

Adding scripts for DP installation of the controller and to run Cyclonus test (#36)

Adding scripts to deploy the NP controller on the dataplane with a custom image and to run cyclonus tests

Co-authored-by: Jayanth Varavani <[email protected]>
jaydeokar and jayanthvn authored Nov 21, 2023
1 parent 3281280 commit c668cb9
Showing 9 changed files with 370 additions and 0 deletions.
19 changes: 19 additions & 0 deletions Makefile
@@ -21,6 +21,9 @@ ARCHS ?= amd64 arm64
# IMG_SBOM defines the SBOM media type to use, we set to none since ECR doesn't support it yet
IMG_SBOM ?= none

# Disable the control plane network policy controller
DISABLE_CP_NETWORK_POLICY_CONTROLLER=false

# Get the currently used golang install path (in GOPATH/bin, unless GOBIN is set)
ifeq (,$(shell go env GOBIN))
GOBIN=$(shell go env GOPATH)/bin
@@ -189,3 +192,19 @@ format: ## Format all Go source code files.
-type f \
-name '*.go' \
-print0 | sort -z | xargs -0 -- goimports $(or $(FORMAT_FLAGS),-w) | wc -l | bc)

run-cyclonus-test: ## Runs cyclonus tests on an existing cluster. Call with CLUSTER_NAME=<name of your cluster> to execute cyclonus test
ifdef CLUSTER_NAME
CLUSTER_NAME=$(CLUSTER_NAME) DISABLE_CP_NETWORK_POLICY_CONTROLLER=$(DISABLE_CP_NETWORK_POLICY_CONTROLLER) ./scripts/run-cyclonus-tests.sh
else
@echo 'Pass CLUSTER_NAME parameter'
endif

.PHONY: deploy-controller-on-dataplane
deploy-controller-on-dataplane: ## Deploys the Network Policy controller on an existing cluster. Optionally call with AMAZON_NP_CONTROLLER=<Image URI> to update the image
AMAZON_NP_CONTROLLER=$(AMAZON_NP_CONTROLLER) ./scripts/update-controller-image-dataplane.sh

.PHONY: deploy-and-test
deploy-and-test: ## Deploys the Network Policy controller on an existing cluster and runs cyclonus tests. Call with CLUSTER_NAME=<name of the cluster> and AMAZON_NP_CONTROLLER=<Image URI>
$(MAKE) deploy-controller-on-dataplane AMAZON_NP_CONTROLLER=$(AMAZON_NP_CONTROLLER)
$(MAKE) run-cyclonus-test CLUSTER_NAME=$(CLUSTER_NAME) DISABLE_CP_NETWORK_POLICY_CONTROLLER=true
2 changes: 2 additions & 0 deletions config/controller/controller.yaml
@@ -18,6 +18,8 @@ spec:
spec:
containers:
- image: controller:latest
args:
- --enable-configmap-check=false
name: controller
securityContext:
allowPrivilegeEscalation: false
14 changes: 14 additions & 0 deletions scripts/README.md
@@ -0,0 +1,14 @@
## Scripts

This package contains shell scripts and libraries used for running e2e tests, along with some helper scripts.

- `run-cyclonus-tests.sh` runs cyclonus tests against an existing cluster and validates the output.
- `update-controller-image-dataplane.sh` deploys the `amazon-network-policy-controller-k8s` controller on the dataplane.

### Cyclonus tests
The `run-cyclonus-tests.sh` script runs the cyclonus suite against an existing cluster. It provides the option to disable the control plane `amazon-network-policy-controller-k8s` controller if you are deploying a custom/dev version of the controller on the dataplane. The script also skips CNI installation if the `SKIP_CNI_INSTALLATION` environment variable is set.
Use `make run-cyclonus-test` to run this script; an example invocation is shown below.
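
For example, a minimal invocation might look like this (the cluster name is a placeholder; `REGION` and `IP_FAMILY` default to `us-west-2` and `IPv4` inside the script):

```sh
# Run the cyclonus suite against an existing cluster
make run-cyclonus-test CLUSTER_NAME=my-cluster
```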

### Deploy Controller on Dataplane
The `update-controller-image-dataplane.sh` script installs the dataplane manifests for the `amazon-network-policy-controller-k8s` controller. It provides the option to run a custom/dev image of the controller on the dataplane if you set `AMAZON_NP_CONTROLLER` to the image URI.
Use `make deploy-controller-on-dataplane` to run this script, or `make deploy-and-test` to run this script followed by the cyclonus suite; see the sketch below.
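
For example, a sketch of deploying a dev image and then running the suite (the image URI and cluster name are placeholders, not published values):

```sh
# Deploy a custom controller image on the dataplane, then run cyclonus
make deploy-and-test \
  CLUSTER_NAME=my-cluster \
  AMAZON_NP_CONTROLLER=<account-id>.dkr.ecr.us-west-2.amazonaws.com/amazon-network-policy-controller-k8s:dev
```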
116 changes: 116 additions & 0 deletions scripts/lib/network-policy.sh
@@ -0,0 +1,116 @@

function load_addon_details() {

ADDON_NAME="vpc-cni"
echo "loading $ADDON_NAME addon details"
LATEST_ADDON_VERSION=$(aws eks describe-addon-versions $ENDPOINT_FLAG --addon-name $ADDON_NAME --kubernetes-version $K8S_VERSION | jq '.addons[0].addonVersions[0].addonVersion' -r)
EXISTING_SERVICE_ACCOUNT_ROLE_ARN=$(kubectl get serviceaccount -n kube-system aws-node -o json | jq '.metadata.annotations."eks.amazonaws.com/role-arn"' -r)
}

function wait_for_addon_status() {
local expected_status=$1
local retry_attempt=0
if [ "$expected_status" = "DELETED" ]; then
while aws eks describe-addon $ENDPOINT_FLAG --cluster-name $CLUSTER_NAME --addon-name $ADDON_NAME --region $REGION > /dev/null; do
if [ $retry_attempt -ge 30 ]; then
echo "failed to delete addon, qutting after too many attempts"
exit 1
fi
echo "addon is still not deleted"
sleep 5
((retry_attempt=retry_attempt+1))
done
echo "addon deleted"

sleep 10
return
fi

retry_attempt=0
while true
do
STATUS=$(aws eks describe-addon $ENDPOINT_FLAG --cluster-name "$CLUSTER_NAME" --addon-name $ADDON_NAME --region "$REGION" | jq -r '.addon.status')
if [ "$STATUS" = "$expected_status" ]; then
echo "addon status matches expected status"
return
fi

if [ $retry_attempt -ge 30 ]; then
echo "failed to get desired add-on status: $STATUS, qutting after too many attempts"
exit 1
fi
echo "addon status is not equal to $expected_status"
sleep 10
((retry_attempt=retry_attempt+1))
done
}

function install_network_policy_mao() {

local addon_version=$1
if DESCRIBE_ADDON=$(aws eks describe-addon $ENDPOINT_FLAG --cluster-name $CLUSTER_NAME --addon-name $ADDON_NAME --region $REGION); then
local current_addon_version=$(echo "$DESCRIBE_ADDON" | jq '.addon.addonVersion' -r)
echo "deleting the $current_addon_version"
aws eks delete-addon $ENDPOINT_FLAG --cluster-name $CLUSTER_NAME --addon-name $ADDON_NAME --region $REGION
wait_for_addon_status "DELETED"
fi

echo "Installing addon $addon_version with network policy enabled"

if [ "$EXISTING_SERVICE_ACCOUNT_ROLE_ARN" != "null" ]; then
SA_ROLE_ARN_ARG="--service-account-role-arn $EXISTING_SERVICE_ACCOUNT_ROLE_ARN"
fi

aws eks create-addon \
--cluster-name $CLUSTER_NAME \
--addon-name $ADDON_NAME \
--configuration-value '{"enableNetworkPolicy": "true"}' \
--resolve-conflicts OVERWRITE \
--addon-version $addon_version \
--region $REGION $ENDPOINT_FLAG $SA_ROLE_ARN_ARG

wait_for_addon_status "ACTIVE"
}

function install_network_policy_helm(){

helm repo add eks https://aws.github.io/eks-charts

if [[ $IP_FAMILY == "IPv4" ]]; then
ENABLE_IPv4=true
ENABLE_IPv6=false
ENABLE_PREFIX_DELEGATION=false
else
ENABLE_IPv4=false
ENABLE_IPv6=true
ENABLE_PREFIX_DELEGATION=true
fi

echo "Updating annotations and labels on existing resources"
for kind in daemonSet clusterRole clusterRoleBinding serviceAccount; do
echo "setting annotations and labels on $kind/aws-node"
kubectl -n kube-system annotate --overwrite $kind aws-node meta.helm.sh/release-name=aws-vpc-cni || echo "Unable to annotate $kind/aws-node"
kubectl -n kube-system annotate --overwrite $kind aws-node meta.helm.sh/release-namespace=kube-system || echo "Unable to annotate $kind/aws-node"
kubectl -n kube-system label --overwrite $kind aws-node app.kubernetes.io/managed-by=Helm || echo "Unable to label $kind/aws-node"
done

echo "Installing/Updating the aws-vpc-cni helm chart with enableNetworkPolicy=true"
helm upgrade --install aws-vpc-cni eks/aws-vpc-cni --wait --timeout 300s \
--namespace kube-system \
--set enableNetworkPolicy=true \
--set originalMatchLabels=true \
--set init.env.ENABLE_IPv6=$ENABLE_IPv6 \
--set image.env.ENABLE_IPv6=$ENABLE_IPv6 \
--set nodeAgent.enableIpv6=$ENABLE_IPv6 \
--set image.env.ENABLE_PREFIX_DELEGATION=$ENABLE_PREFIX_DELEGATION \
--set image.env.ENABLE_IPv4=$ENABLE_IPv4
}

function disable_cp_network_policy_controller() {

if kubectl get configmap amazon-vpc-cni -n kube-system > /dev/null; then
echo "Disabling Network Policy Controller on Control Plane"
kubectl patch configmap/amazon-vpc-cni -n kube-system --type merge -p '{"data":{"enable-network-policy-controller":"false"}}'
fi

}
20 changes: 20 additions & 0 deletions scripts/lib/tests.sh
@@ -0,0 +1,20 @@
function run_cyclonus_tests(){

kubectl create ns netpol
kubectl create clusterrolebinding cyclonus --clusterrole=cluster-admin --serviceaccount=netpol:cyclonus
kubectl create sa cyclonus -n netpol
kubectl apply -f ${DIR}/test/cyclonus-config.yaml -n netpol

echo "Executing cyclonus suite"
kubectl wait --for=condition=complete --timeout=240m -n netpol job.batch/cyclonus || echo "Job timed out after 4 hrs"
kubectl logs -n netpol job/cyclonus > ${DIR}/results.log

# Cleanup after test finishes
kubectl delete clusterrolebinding cyclonus
kubectl delete ns netpol

cat ${DIR}/results.log

echo "Verify results against expected"
# Use a brace group (not a subshell) so TEST_FAILED propagates to the caller
python3 ${DIR}/lib/verify_test_results.py -f ${DIR}/results.log -ip $IP_FAMILY || { echo "Cyclonus tests have failed"; TEST_FAILED=true; }
}
92 changes: 92 additions & 0 deletions scripts/lib/verify_test_results.py
@@ -0,0 +1,92 @@
import re
import sys
import argparse

def main():
parser = argparse.ArgumentParser()
parser.add_argument("-f", "--file-name",default="", dest="file_name",help="Cyclonus results log file")
parser.add_argument("-ip", "--ip-family",default="IPv4", dest="ip_family",help="IP Family of the cluster")
args = parser.parse_args()
verify_results(args.file_name,args.ip_family)

def verify_results(file_name,ip_family):

# Cyclonus runs 112 test cases in total, where each case has a number of sub-tests. AWS Network Policy doesn't support all of these sub-tests.
# expected_results maintains a mapping from test number to the number of sub-tests that are expected to pass on IPv4/IPv6 clusters.
# For test numbers not included in this map, all sub-tests are expected to pass.
if ip_family == "IPv6":
expected_results={ 2:80, 3:80, 8:80, 12:80, 23:80, 25:80, 26:80, 28:80,29:80, 31:77, 98:80, 102:72, 104:72, 106:72, 108:72, 111:80, 112:80 }
else:
expected_results={ 2:80, 3:80, 8:80, 12:80, 23:80, 25:80, 26:80, 28:80, 29:80, 31:80, 98:80, 111:80, 112:80 }

start="starting test case"
wrong="wrong"
ignored="ignored"
correct="correct"
delimiter=r':| |,|\n' # split on colon, space, comma, or newline
test_number=0
is_test_run_failed=False
step=0

# Open the log file in read-only mode
with open(file_name, 'r') as filedata:
for line in filedata:
# Checking if the keywords are found in the line
is_test_case_failed=False
if all(key in line for key in [wrong,ignored,correct]):
step+=1
words=re.split(delimiter, line)
count_wrong=int(words[words.index(wrong)-1])
count_correct=int(words[words.index(correct)-1])
count_ignored=int(words[words.index(ignored)-1])

# Expected correct count by default
expected_correct=count_wrong+count_correct+count_ignored

# Check if test results are expected
if test_number in expected_results.keys():

if isinstance(expected_results[test_number], dict):
expected_correct=expected_results[test_number][step]
else:
expected_correct=expected_results[test_number]
# In IPv6 clusters, test #31 depends on which nodes the pods run on, so we use ( < ) here instead of ( != )
if count_correct < expected_correct:
is_test_case_failed=True
elif count_wrong > 0:
is_test_case_failed=True

if is_test_case_failed:
# Mark the entire test run as failed since at least one test deviated from the expected results
is_test_run_failed=True
print("Test Number:{test_number} | step:{step} | Failed -> Correct:{count_correct} Expected:{expected_correct}".format(
test_number=test_number,
step=step,
count_correct=count_correct,
expected_correct=expected_correct
))
else:
print("Test Number:{test_number} | step:{step} | Passed -> Correct:{count_correct} Expected:{expected_correct}".format(
test_number=test_number,
step=step,
count_correct=count_correct,
expected_correct=expected_correct
))

# This denotes the start of a test case
elif start in line:
step=0
test_number=int(line.split("#")[1])
is_test_case_failed=False
else:
continue

# Fail the run if any test case failed or if all 112 test cases did not get executed
if is_test_run_failed or test_number != 112:
print("Test Run Failed. Check failures")
sys.exit(1)
else:
sys.exit(0)

if __name__ == "__main__":
main()
66 changes: 66 additions & 0 deletions scripts/run-cyclonus-tests.sh
@@ -0,0 +1,66 @@
#!/bin/bash

# The script runs Network Policy Cyclonus tests on an existing cluster
# Parameters:
# CLUSTER_NAME: name of the cluster
# KUBECONFIG: path to the kubeconfig file, default ~/.kube/config
# REGION: defaults to us-west-2
# IP_FAMILY: defaults to IPv4
# ADDON_VERSION: Optional, defaults to the latest version
# ENDPOINT: Optional
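#
# Example invocation (a sketch; the cluster name is a placeholder, and the
# script assumes the AWS CLI, kubectl, helm, and jq are installed):
#   CLUSTER_NAME=my-cluster IP_FAMILY=IPv4 \
#     DISABLE_CP_NETWORK_POLICY_CONTROLLER=true ./scripts/run-cyclonus-tests.sh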

set -Eeuo pipefail
DIR=$(cd "$(dirname "$0")"; pwd)

source ${DIR}/lib/network-policy.sh
source ${DIR}/lib/tests.sh

: "${ENDPOINT_FLAG:=""}"
: "${ENDPOINT:=""}"
: "${ADDON_VERSION:=""}"
: "${IP_FAMILY:="IPv4"}"
: "${REGION:="us-west-2"}"
: "${SKIP_CNI_INSTALLATION:="false"}"
: "${K8S_VERSION:=""}"
: "${DISABLE_CP_NETWORK_POLICY_CONTROLLER="false"}"

if [[ ! -z $ENDPOINT ]]; then
ENDPOINT_FLAG="--endpoint-url $ENDPOINT"
fi

if [[ -z $K8S_VERSION ]]; then
K8S_VERSION=$(aws eks describe-cluster $ENDPOINT_FLAG --name $CLUSTER_NAME --region $REGION | jq -r '.cluster.version')
fi

TEST_FAILED="false"

echo "Running Cyclonus e2e tests with the following variables
KUBECONFIG: $KUBECONFIG
CLUSTER_NAME: $CLUSTER_NAME
REGION: $REGION
IP_FAMILY: $IP_FAMILY
K8S_VERSION: $K8S_VERSION
Optional args
ENDPOINT: $ENDPOINT
"

if [[ $SKIP_CNI_INSTALLATION == "false" ]]; then
install_network_policy_helm
else
echo "Skipping CNI installation. Make sure you have enabled network policy support in your cluster before executing the test"
fi

if [[ $DISABLE_CP_NETWORK_POLICY_CONTROLLER == "true" ]]; then
echo "Disable CP Network Policy Controller on controller plane"
disable_cp_network_policy_controller
else
echo "Skip disabling CP Network Policy controller. Tests will be evaulated against control plane NP controller"
fi

run_cyclonus_tests

if [[ $TEST_FAILED == "true" ]]; then
echo "Test run failed, check failures"
exit 1
fi
18 changes: 18 additions & 0 deletions scripts/test/cyclonus-config.yaml
@@ -0,0 +1,18 @@
apiVersion: batch/v1
kind: Job
metadata:
name: cyclonus
spec:
backoffLimit: 0
template:
spec:
restartPolicy: Never
containers:
- command:
- ./cyclonus
- generate
- --cleanup-namespaces=true
name: cyclonus
imagePullPolicy: Always
image: mfenwick100/cyclonus:v0.5.3
serviceAccountName: cyclonus
23 changes: 23 additions & 0 deletions scripts/update-controller-image-dataplane.sh
@@ -0,0 +1,23 @@
#!/bin/bash
# Use this script to deploy the amazon-np-controller deployment on Dataplane nodes

# Parameters:
# KUBECONFIG: path to the kubeconfig file, default ~/.kube/config
# AMAZON_NP_CONTROLLER: controller image URI (optional)
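#
# Example invocation (a sketch; the image URI is a placeholder):
#   AMAZON_NP_CONTROLLER=<account-id>.dkr.ecr.us-west-2.amazonaws.com/amazon-network-policy-controller-k8s:dev \
#     ./scripts/update-controller-image-dataplane.sh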

set -e
DIR=$(cd "$(dirname "$0")"; pwd)

echo "Deploy the default Amazon Network Policy Controller on Dataplane"
kubectl apply -k config/default

if [[ ! -z $AMAZON_NP_CONTROLLER ]]; then
echo "Setting the Controller Image: $AMAZON_NP_CONTROLLER"
kubectl set image deployment.v1.apps/amazon-network-policy-controller-k8s controller=$AMAZON_NP_CONTROLLER -n kube-system
fi

echo "Restarting the Controller"
kubectl rollout restart deployment.v1.apps/amazon-network-policy-controller-k8s -n kube-system

echo "Ensuring Controller is Running on Dataplane"
kubectl rollout status deployment.v1.apps/amazon-network-policy-controller-k8s -n kube-system --timeout=2m || (echo "Amazon Network Policy controller is unhealthy" && exit 1)
