diff --git a/Makefile b/Makefile
index 68935b3..10ab3a9 100644
--- a/Makefile
+++ b/Makefile
@@ -21,6 +21,9 @@ ARCHS ?= amd64 arm64
 # IMG_SBOM defines the SBOM media type to use, we set to none since ECR doesn't support it yet
 IMG_SBOM ?= none
 
+# Disable the control plane network policy controller
+DISABLE_CP_NETWORK_POLICY_CONTROLLER=false
+
 # Get the currently used golang install path (in GOPATH/bin, unless GOBIN is set)
 ifeq (,$(shell go env GOBIN))
 GOBIN=$(shell go env GOPATH)/bin
@@ -189,3 +192,19 @@ format: ## Format all Go source code files.
 		-type f \
 		-name '*.go' \
 		-print0 | sort -z | xargs -0 -- goimports $(or $(FORMAT_FLAGS),-w) | wc -l | bc)
+
+run-cyclonus-test: ## Runs cyclonus tests on an existing cluster. Call with CLUSTER_NAME=<cluster-name> to execute the cyclonus test
+ifdef CLUSTER_NAME
+	CLUSTER_NAME=$(CLUSTER_NAME) DISABLE_CP_NETWORK_POLICY_CONTROLLER=$(DISABLE_CP_NETWORK_POLICY_CONTROLLER) ./scripts/run-cyclonus-tests.sh
+else
+	@echo 'Pass CLUSTER_NAME parameter'
+endif
+
+.PHONY: deploy-controller-on-dataplane
+deploy-controller-on-dataplane: ## Deploys the Network Policy controller on an existing cluster. Optionally call with AMAZON_NP_CONTROLLER=<image-uri> to update the image
+	AMAZON_NP_CONTROLLER=$(AMAZON_NP_CONTROLLER) ./scripts/update-controller-image-dataplane.sh
+
+.PHONY: deploy-and-test
+deploy-and-test: ## Deploys the Network Policy controller on an existing cluster and runs cyclonus tests. Call with CLUSTER_NAME=<cluster-name> and AMAZON_NP_CONTROLLER=<image-uri>
+	$(MAKE) deploy-controller-on-dataplane AMAZON_NP_CONTROLLER=$(AMAZON_NP_CONTROLLER)
+	$(MAKE) run-cyclonus-test CLUSTER_NAME=$(CLUSTER_NAME) DISABLE_CP_NETWORK_POLICY_CONTROLLER=true
diff --git a/config/controller/controller.yaml b/config/controller/controller.yaml
index aec76bf..cbcab73 100644
--- a/config/controller/controller.yaml
+++ b/config/controller/controller.yaml
@@ -18,6 +18,8 @@ spec:
     spec:
       containers:
       - image: controller:latest
+        args:
+        - --enable-configmap-check=false
         name: controller
         securityContext:
           allowPrivilegeEscalation: false
diff --git a/scripts/README.md b/scripts/README.md
new file mode 100644
index 0000000..343f9fd
--- /dev/null
+++ b/scripts/README.md
@@ -0,0 +1,14 @@
+## Scripts
+
+This package contains shell scripts and libraries used for running the e2e tests, plus some helper scripts.
+
+`run-cyclonus-tests.sh` runs cyclonus tests against an existing cluster and validates the output.
+`update-controller-image-dataplane.sh` deploys the `amazon-network-policy-controller-k8s` controller on the dataplane.
+
+### Cyclonus tests
+The `run-cyclonus-tests.sh` script runs the cyclonus suite against an existing cluster. It can optionally disable the control plane `amazon-network-policy-controller-k8s` controller, which is useful when you are testing a custom/dev version of the controller deployed on the dataplane. The script also skips the CNI installation if the `SKIP_CNI_INSTALLATION` environment variable is set.
+Use `make run-cyclonus-test` to run this script.
+
+### Deploy Controller on Dataplane
+The `update-controller-image-dataplane.sh` script installs the dataplane manifests for the `amazon-network-policy-controller-k8s` controller. To run a custom/dev image of the controller on the dataplane, set `AMAZON_NP_CONTROLLER` to the image URI.
+Use `make deploy-controller-on-dataplane` to run this script, or `make deploy-and-test` to run it followed by the cyclonus suite.
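+
+### Example invocations
+Assuming a cluster named `my-cluster` (substitute your own cluster name and controller image URI):
+```bash
+# Run the cyclonus suite against the control plane controller
+make run-cyclonus-test CLUSTER_NAME=my-cluster
+
+# Deploy a dev controller image to the dataplane, then run the suite against it
+make deploy-and-test CLUSTER_NAME=my-cluster AMAZON_NP_CONTROLLER=<controller-image-uri>
+```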
diff --git a/scripts/lib/network-policy.sh b/scripts/lib/network-policy.sh
new file mode 100644
index 0000000..b8f0e16
--- /dev/null
+++ b/scripts/lib/network-policy.sh
@@ -0,0 +1,116 @@
+
+function load_addon_details() {
+
+  ADDON_NAME="vpc-cni"
+  echo "loading $ADDON_NAME addon details"
+  LATEST_ADDON_VERSION=$(aws eks describe-addon-versions $ENDPOINT_FLAG --addon-name $ADDON_NAME --kubernetes-version $K8S_VERSION | jq '.addons[0].addonVersions[0].addonVersion' -r)
+  EXISTING_SERVICE_ACCOUNT_ROLE_ARN=$(kubectl get serviceaccount -n kube-system aws-node -o json | jq '.metadata.annotations."eks.amazonaws.com/role-arn"' -r)
+}
+
+function wait_for_addon_status() {
+  local expected_status=$1
+  local retry_attempt=0
+  if [ "$expected_status" = "DELETED" ]; then
+    while $(aws eks describe-addon $ENDPOINT_FLAG --cluster-name $CLUSTER_NAME --addon-name $ADDON_NAME --region $REGION >> /dev/null); do
+      if [ $retry_attempt -ge 30 ]; then
+        echo "failed to delete addon, quitting after too many attempts"
+        exit 1
+      fi
+      echo "addon is still not deleted"
+      sleep 5
+      ((retry_attempt=retry_attempt+1))
+    done
+    echo "addon deleted"
+
+    sleep 10
+    return
+  fi
+
+  retry_attempt=0
+  while true
+  do
+    STATUS=$(aws eks describe-addon $ENDPOINT_FLAG --cluster-name "$CLUSTER_NAME" --addon-name $ADDON_NAME --region "$REGION" | jq -r '.addon.status')
+    if [ "$STATUS" = "$expected_status" ]; then
+      echo "addon status matches expected status"
+      return
+    fi
+
+    if [ $retry_attempt -ge 30 ]; then
+      echo "failed to get desired add-on status: $STATUS, quitting after too many attempts"
+      exit 1
+    fi
+    echo "addon status is not equal to $expected_status"
+    sleep 10
+    ((retry_attempt=retry_attempt+1))
+  done
+}
+
+function install_network_policy_mao() {
+
+  local addon_version=$1
+  if DESCRIBE_ADDON=$(aws eks describe-addon $ENDPOINT_FLAG --cluster-name $CLUSTER_NAME --addon-name $ADDON_NAME --region $REGION); then
+    local current_addon_version=$(echo "$DESCRIBE_ADDON" | jq '.addon.addonVersion' -r)
+    echo "deleting the existing addon version $current_addon_version"
+    aws eks delete-addon $ENDPOINT_FLAG --cluster-name $CLUSTER_NAME --addon-name $ADDON_NAME --region $REGION
+    wait_for_addon_status "DELETED"
+  fi
+
+  echo "Installing addon $addon_version with network policy enabled"
+
+  if [ "$EXISTING_SERVICE_ACCOUNT_ROLE_ARN" != "null" ]; then
+    SA_ROLE_ARN_ARG="--service-account-role-arn $EXISTING_SERVICE_ACCOUNT_ROLE_ARN"
+  fi
+
+  aws eks create-addon \
+    --cluster-name $CLUSTER_NAME \
+    --addon-name $ADDON_NAME \
+    --configuration-value '{"enableNetworkPolicy": "true"}' \
+    --resolve-conflicts OVERWRITE \
+    --addon-version $addon_version \
+    --region $REGION $ENDPOINT_FLAG $SA_ROLE_ARN_ARG
+
+  wait_for_addon_status "ACTIVE"
+}
+
+function install_network_policy_helm(){
+
+  helm repo add eks https://aws.github.io/eks-charts
+
+  if [[ $IP_FAMILY == "IPv4" ]]; then
+    ENABLE_IPv4=true
+    ENABLE_IPv6=false
+    ENABLE_PREFIX_DELEGATION=false
+  else
+    ENABLE_IPv4=false
+    ENABLE_IPv6=true
+    ENABLE_PREFIX_DELEGATION=true
+  fi
+
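+  # The aws-node DaemonSet and its RBAC objects were created by the VPC CNI manifest/addon rather than by
+  # Helm, and Helm will not manage resources that lack its release metadata. The annotations and label set
+  # below let the aws-vpc-cni chart adopt the existing objects instead of failing on an ownership conflict.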
+  echo "Updating annotations and labels on existing resources"
+  for kind in daemonSet clusterRole clusterRoleBinding serviceAccount; do
+    echo "setting annotations and labels on $kind/aws-node"
+    kubectl -n kube-system annotate --overwrite $kind aws-node meta.helm.sh/release-name=aws-vpc-cni || echo "Unable to annotate $kind/aws-node"
+    kubectl -n kube-system annotate --overwrite $kind aws-node meta.helm.sh/release-namespace=kube-system || echo "Unable to annotate $kind/aws-node"
+    kubectl -n kube-system label --overwrite $kind aws-node app.kubernetes.io/managed-by=Helm || echo "Unable to label $kind/aws-node"
+  done
+
+  echo "Installing/Updating the aws-vpc-cni helm chart with enableNetworkPolicy=true"
+  helm upgrade --install aws-vpc-cni eks/aws-vpc-cni --wait --timeout 300s \
+    --namespace kube-system \
+    --set enableNetworkPolicy=true \
+    --set originalMatchLabels=true \
+    --set init.env.ENABLE_IPv6=$ENABLE_IPv6 \
+    --set image.env.ENABLE_IPv6=$ENABLE_IPv6 \
+    --set nodeAgent.enableIpv6=$ENABLE_IPv6 \
+    --set image.env.ENABLE_PREFIX_DELEGATION=$ENABLE_PREFIX_DELEGATION \
+    --set image.env.ENABLE_IPv4=$ENABLE_IPv4
+}
+
+function disable_cp_network_policy_controller() {
+
+  if kubectl get configmap amazon-vpc-cni -n kube-system > /dev/null; then
+    echo "Disabling Network Policy Controller on Control Plane"
+    kubectl patch configmap/amazon-vpc-cni -n kube-system --type merge -p '{"data":{"enable-network-policy-controller":"false"}}'
+  fi
+
+}
\ No newline at end of file
diff --git a/scripts/lib/tests.sh b/scripts/lib/tests.sh
new file mode 100644
index 0000000..b944840
--- /dev/null
+++ b/scripts/lib/tests.sh
@@ -0,0 +1,20 @@
+function run_cyclonus_tests(){
+
+  kubectl create ns netpol
+  kubectl create clusterrolebinding cyclonus --clusterrole=cluster-admin --serviceaccount=netpol:cyclonus
+  kubectl create sa cyclonus -n netpol
+  kubectl apply -f ${DIR}/test/cyclonus-config.yaml -n netpol
+
+  echo "Executing cyclonus suite"
+  kubectl wait --for=condition=complete --timeout=240m -n netpol job.batch/cyclonus || echo "Job timed out after 4 hrs"
+  kubectl logs -n netpol job/cyclonus > ${DIR}/results.log
+
+  # Cleanup after test finishes
+  kubectl delete clusterrolebinding cyclonus
+  kubectl delete ns netpol
+
+  cat ${DIR}/results.log
+
+  echo "Verifying results against expected results"
+  python3 ${DIR}/lib/verify_test_results.py -f ${DIR}/results.log -ip $IP_FAMILY || { echo "Cyclonus tests have failed"; TEST_FAILED=true; }
+}
diff --git a/scripts/lib/verify_test_results.py b/scripts/lib/verify_test_results.py
new file mode 100644
index 0000000..d10de26
--- /dev/null
+++ b/scripts/lib/verify_test_results.py
@@ -0,0 +1,92 @@
+import re
+import sys
+import argparse
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-f", "--file-name", default="", dest="file_name", help="Cyclonus results log file")
+    parser.add_argument("-ip", "--ip-family", default="IPv4", dest="ip_family", help="IP family of the cluster")
+    args = parser.parse_args()
+    verify_results(args.file_name, args.ip_family)
+
+def verify_results(file_name, ip_family):
+
+    # Cyclonus runs 112 test cases in total, where each case has a number of sub-tests. AWS Network Policy doesn't support all of these sub-tests.
+    # expected_results maintains a mapping from test number to the number of sub-tests that are expected to pass on IPv4/IPv6 clusters.
+    # For test numbers not included in this map, all sub-tests are expected to pass.
+    if ip_family == "IPv6":
+        expected_results={ 2:80, 3:80, 8:80, 12:80, 23:80, 25:80, 26:80, 28:80, 29:80, 31:77, 98:80, 102:72, 104:72, 106:72, 108:72, 111:80, 112:80 }
+    else:
+        expected_results={ 2:80, 3:80, 8:80, 12:80, 23:80, 25:80, 26:80, 28:80, 29:80, 31:80, 98:80, 111:80, 112:80 }
+
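+    # The parsing below keys off two kinds of lines in the cyclonus log (format inferred from the checks in this loop):
+    #   "starting test case #<n>"                     -> marks the beginning of test case <n>
+    #   "... <w> wrong, <i> ignored, <c> correct ..." -> per-step result counts for the current test case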
+    start="starting test case"
+    wrong="wrong"
+    ignored="ignored"
+    correct="correct"
+    delimiter=':|\ |,|\\n'
+    test_number=0
+    is_test_run_failed=False
+    step=0
+
+    # Open the log file in read-only mode
+    with open(file_name, 'r') as filedata:
+        for line in filedata:
+            # Checking if the keywords are found in the line
+            is_test_case_failed=False
+            if all(key in line for key in [wrong,ignored,correct]):
+                step+=1
+                words=re.split(delimiter, line)
+                count_wrong=int(words[words.index(wrong)-1])
+                count_correct=int(words[words.index(correct)-1])
+                count_ignored=int(words[words.index(ignored)-1])
+
+                # Expected correct count by default
+                expected_correct=count_wrong+count_correct+count_ignored
+
+                # Check if the test results match the expected results
+                if test_number in expected_results.keys():
+
+                    if isinstance(expected_results[test_number], dict):
+                        expected_correct=expected_results[test_number][step]
+                    else:
+                        expected_correct=expected_results[test_number]
+                    # In an IPv6 cluster, test #31 depends on which nodes the pods run on, so we use ( < ) here instead of ( != )
+                    if count_correct < expected_correct:
+                        is_test_case_failed=True
+                elif count_wrong > 0:
+                    is_test_case_failed=True
+
+                if is_test_case_failed:
+                    # Mark the entire test run as failed since at least one test deviated from the expected results
+                    is_test_run_failed=True
+                    print("Test Number:{test_number} | step:{step} | Failed -> Correct:{count_correct} Expected:{expected_correct}".format(
+                        test_number=test_number,
+                        step=step,
+                        count_correct=count_correct,
+                        expected_correct=expected_correct
+                    ))
+                else:
+                    print("Test Number:{test_number} | step:{step} | Passed -> Correct:{count_correct} Expected:{expected_correct}".format(
+                        test_number=test_number,
+                        step=step,
+                        count_correct=count_correct,
+                        expected_correct=expected_correct
+                    ))
+
+            # This denotes the start of a test case
+            elif start in line:
+                step=0
+                test_number=int(line.split("#")[1])
+                is_test_case_failed=False
+            else:
+                continue
+
+    # Fail the run if any test case failed or if all 112 test cases did not get executed
+    if is_test_run_failed or test_number != 112:
+        print("Test Run Failed. Check failures")
+        sys.exit(1)
+    else:
+        sys.exit(0)
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/scripts/run-cyclonus-tests.sh b/scripts/run-cyclonus-tests.sh
new file mode 100755
index 0000000..8348a6b
--- /dev/null
+++ b/scripts/run-cyclonus-tests.sh
@@ -0,0 +1,66 @@
+#!/bin/bash
+
+# This script runs the Network Policy Cyclonus tests on an existing cluster
+# Parameters:
+# CLUSTER_NAME: name of the cluster
+# KUBECONFIG: path to the kubeconfig file, default ~/.kube/config
+# REGION: defaults to us-west-2
+# IP_FAMILY: defaults to IPv4
+# ADDON_VERSION: Optional, defaults to the latest version
+# ENDPOINT: Optional
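+#
+# Example (values are placeholders):
+#   CLUSTER_NAME=my-cluster REGION=us-west-2 IP_FAMILY=IPv4 ./scripts/run-cyclonus-tests.sh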
Check failures") + sys.exit(1) + else: + sys.exit(0) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/scripts/run-cyclonus-tests.sh b/scripts/run-cyclonus-tests.sh new file mode 100755 index 0000000..8348a6b --- /dev/null +++ b/scripts/run-cyclonus-tests.sh @@ -0,0 +1,66 @@ +#!/bin/bash + +# The script runs Network Policy Cyclonus tests on a existing cluster +# Parameters: +# CLUSTER_NAME: name of the cluster +# KUBECONFIG: path to the kubeconfig file, default ~/.kube/config +# REGION: defaults to us-west-2 +# IP_FAMILY: defaults to IPv4 +# ADDON_VERSION: Optional, defaults to the latest version +# ENDPOINT: Optional + +set -euoE pipefail +DIR=$(cd "$(dirname "$0")"; pwd) + +source ${DIR}/lib/network-policy.sh +source ${DIR}/lib/tests.sh + +: "${ENDPOINT_FLAG:=""}" +: "${ENDPOINT:=""}" +: "${ADDON_VERSION:=""}" +: "${IP_FAMILY:="IPv4"}" +: "${REGION:="us-west-2"}" +: "${SKIP_CNI_INSTALLATION:="false"}" +: "${K8S_VERSION:=""}" +: "${DISABLE_CP_NETWORK_POLICY_CONTROLLER="false"}" + +if [[ ! -z $ENDPOINT ]]; then + ENDPOINT_FLAG="--endpoint-url $ENDPOINT" +fi + +if [[ -z $K8S_VERSION ]]; then + K8S_VERSION=$(aws eks describe-cluster $ENDPOINT_FLAG --name $CLUSTER_NAME --region $REGION | jq -r '.cluster.version') +fi + +TEST_FAILED="false" + +echo "Running Cyclonus e2e tests with the following variables +KUBECONFIG: $KUBECONFIG +CLUSTER_NAME: $CLUSTER_NAME +REGION: $REGION +IP_FAMILY: $IP_FAMILY +K8S_VERSION: $K8S_VERSION + +Optional args +ENDPOINT: $ENDPOINT +" + +if [[ $SKIP_CNI_INSTALLATION == "false" ]]; then + install_network_policy_helm +else + echo "Skipping CNI installation. Make sure you have enabled network policy support in your cluster before executing the test" +fi + +if [[ $DISABLE_CP_NETWORK_POLICY_CONTROLLER == "true" ]]; then + echo "Disable CP Network Policy Controller on controller plane" + disable_cp_network_policy_controller +else + echo "Skip disabling CP Network Policy controller. Tests will be evaulated against control plane NP controller" +fi + +run_cyclonus_tests + +if [[ $TEST_FAILED == "true" ]]; then + echo "Test run failed, check failures" + exit 1 +fi diff --git a/scripts/test/cyclonus-config.yaml b/scripts/test/cyclonus-config.yaml new file mode 100644 index 0000000..faf6dc8 --- /dev/null +++ b/scripts/test/cyclonus-config.yaml @@ -0,0 +1,18 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: cyclonus +spec: + backoffLimit: 0 + template: + spec: + restartPolicy: Never + containers: + - command: + - ./cyclonus + - generate + - --cleanup-namespaces=true + name: cyclonus + imagePullPolicy: Always + image: mfenwick100/cyclonus:v0.5.3 + serviceAccount: cyclonus \ No newline at end of file diff --git a/scripts/update-controller-image-dataplane.sh b/scripts/update-controller-image-dataplane.sh new file mode 100755 index 0000000..66a8c19 --- /dev/null +++ b/scripts/update-controller-image-dataplane.sh @@ -0,0 +1,23 @@ +#!/bin/bash +# Use this script to deploy the amazon-np-controller deployment on Dataplane nodes + +# Parameters: +# KUBECONFIG: path to the kubeconfig file, default ~/.kube/config +# AMAZON_NP_CONTROLLER: node agent image + +set -e +DIR=$(cd "$(dirname "$0")"; pwd) + +echo "Deploy the default Amazon Network Policy Controller on Dataplane" +kubectl apply -k config/default + +if [[ ! 
+
+set -e
+DIR=$(cd "$(dirname "$0")"; pwd)
+
+echo "Deploying the default Amazon Network Policy Controller on Dataplane"
+kubectl apply -k config/default
+
+if [[ ! -z $AMAZON_NP_CONTROLLER ]]; then
+  echo "Setting the Controller Image: $AMAZON_NP_CONTROLLER"
+  kubectl set image deployment.v1.apps/amazon-network-policy-controller-k8s -n kube-system controller=$AMAZON_NP_CONTROLLER
+fi
+
+echo "Restarting the Controller"
+kubectl rollout restart deployment.v1.apps/amazon-network-policy-controller-k8s -n kube-system
+
+echo "Ensuring Controller is Running on Dataplane"
+kubectl rollout status deployment.v1.apps/amazon-network-policy-controller-k8s -n kube-system --timeout=2m || (echo "Amazon Network Policy controller is unhealthy" && exit 1)