From 702fe01a6650ff3ca336b29b15d8b3291a4f5bd8 Mon Sep 17 00:00:00 2001 From: Lukas Metzner Date: Tue, 29 Oct 2024 12:40:40 +0100 Subject: [PATCH] feat: force pods with volumes to be scheduled on Cloud servers (#743) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Due to a bug in the scheduler a node with no driver instance might be picked and the volume is stuck in pending as the "no capacity -> reschedule" recovery is never triggered [[0]](https://github.com/kubernetes/kubernetes/pull/122109), [[1]](https://github.com/kubernetes-csi/external-provisioner/issues/544). - See #400 --------- Co-authored-by: lukasmetzner Co-authored-by: Julian Tölle --- chart/.snapshots/default.yaml | 4 +++ chart/.snapshots/full.values.yaml | 2 ++ chart/.snapshots/full.yaml | 4 +++ chart/templates/core/storageclass.yaml | 7 +++++ chart/values.yaml | 6 ++++ deploy/kubernetes/hcloud-csi.yml | 4 +++ docs/kubernetes/README.md | 39 ++++++++++++++++++++++++-- internal/driver/driver.go | 1 + internal/driver/node.go | 1 + 9 files changed, 66 insertions(+), 2 deletions(-) diff --git a/chart/.snapshots/default.yaml b/chart/.snapshots/default.yaml index 6fbcb4c8..7b2da5de 100644 --- a/chart/.snapshots/default.yaml +++ b/chart/.snapshots/default.yaml @@ -142,6 +142,10 @@ spec: operator: NotIn values: - "true" + - key: instance.hetzner.cloud/provided-by + operator: NotIn + values: + - robot tolerations: - effect: NoExecute operator: Exists diff --git a/chart/.snapshots/full.values.yaml b/chart/.snapshots/full.values.yaml index b766cfe2..51ac7aa2 100644 --- a/chart/.snapshots/full.values.yaml +++ b/chart/.snapshots/full.values.yaml @@ -370,3 +370,5 @@ storageClasses: - name: foobar defaultStorageClass: false reclaimPolicy: Keep + allowedTopologyCloudServer: false + diff --git a/chart/.snapshots/full.yaml b/chart/.snapshots/full.yaml index afcdf916..41186476 100644 --- a/chart/.snapshots/full.yaml +++ b/chart/.snapshots/full.yaml @@ -245,6 +245,10 @@ spec:
operator: NotIn values: - "true" + - key: instance.hetzner.cloud/provided-by + operator: NotIn + values: + - robot nodeSelector: foo: bar tolerations: diff --git a/chart/templates/core/storageclass.yaml b/chart/templates/core/storageclass.yaml index 1bc246d2..5771edca 100644 --- a/chart/templates/core/storageclass.yaml +++ b/chart/templates/core/storageclass.yaml @@ -10,6 +10,13 @@ provisioner: csi.hetzner.cloud volumeBindingMode: WaitForFirstConsumer allowVolumeExpansion: true reclaimPolicy: {{ $val.reclaimPolicy | quote }} +{{- if $val.allowedTopologyCloudServer }} +allowedTopologies: +- matchLabelExpressions: + - key: instance.hetzner.cloud/provided-by + values: + - "cloud" +{{- end }} --- {{- end }} {{- end }} \ No newline at end of file diff --git a/chart/values.yaml b/chart/values.yaml index 431f3169..3508ecb8 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -556,6 +556,10 @@ node: operator: NotIn values: - "true" + - key: "instance.hetzner.cloud/provided-by" + operator: NotIn + values: + - "robot" ## @param node.nodeSelector Node labels for node pods assignment ## ref: https://kubernetes.io/docs/user-guide/node-selection/ @@ -724,3 +728,5 @@ storageClasses: - name: hcloud-volumes defaultStorageClass: true reclaimPolicy: Delete + ## @param storageClass.allowedTopologyCloudServer Prevents pods from being scheduled on nodes, specifically Robot servers, where Hetzner volumes are unavailable + allowedTopologyCloudServer: false diff --git a/deploy/kubernetes/hcloud-csi.yml b/deploy/kubernetes/hcloud-csi.yml index f1078d97..a8ebfbd1 100644 --- a/deploy/kubernetes/hcloud-csi.yml +++ b/deploy/kubernetes/hcloud-csi.yml @@ -174,6 +174,10 @@ spec: operator: NotIn values: - "true" + - key: instance.hetzner.cloud/provided-by + operator: NotIn + values: + - robot tolerations: - effect: NoExecute operator: Exists diff --git a/docs/kubernetes/README.md b/docs/kubernetes/README.md index e80ad1dc..b73d0c99 100644 --- a/docs/kubernetes/README.md +++ 
b/docs/kubernetes/README.md @@ -106,7 +106,7 @@ metadata: stringData: encryption-passphrase: foobar ---- +--- apiVersion: storage.k8s.io/v1 kind: StorageClass @@ -209,8 +209,43 @@ $ kubectl apply -f https://raw.githubusercontent.com/hetznercloud/csi-driver/v2. ## Integration with Root Servers -Root servers can be part of the cluster, but the CSI plugin doesn't work there. Taint the root server as follows to skip that node for the DaemonSet. +Root servers can be part of the cluster, but the CSI plugin doesn't work there and the current behaviour of the scheduler can cause Pods to be stuck in `Pending`. +In the Helm Chart you can set `allowedTopologyCloudServer` to true to prevent pods from being scheduled on nodes, specifically Robot servers, where Hetzner volumes are unavailable. This value cannot be changed after the initial creation of a storage class. + +```yaml +storageClasses: + - name: hcloud-volumes + defaultStorageClass: true + reclaimPolicy: Delete + allowedTopologyCloudServer: true # <--- +``` + +To ensure proper topology evaluation, labels are needed to indicate whether a node is a cloud VM or a dedicated server from Robot. If you are using the `hcloud-cloud-controller-manager` version 1.21.0 or later, these labels are added automatically. Otherwise, you will need to label the nodes manually. + +### Adding labels manually + +**Cloud Servers** +```bash +kubectl label nodes <node name> instance.hetzner.cloud/provided-by=cloud +``` + +**Root Servers** +```bash +kubectl label nodes <node name> instance.hetzner.cloud/provided-by=robot +``` + + +### DEPRECATED: Old Label + +We prefer that you use our [new label](#adding-labels-manually). The label `instance.hetzner.cloud/is-root-server` will be deprecated in future releases.
+ +**Cloud Servers** +```bash +kubectl label nodes <node name> instance.hetzner.cloud/is-root-server=false +``` + +**Root Servers** ```bash kubectl label nodes <node name> instance.hetzner.cloud/is-root-server=true ``` diff --git a/internal/driver/driver.go b/internal/driver/driver.go index 86e1dcb8..f2a9db01 100644 --- a/internal/driver/driver.go +++ b/internal/driver/driver.go @@ -9,4 +9,5 @@ const ( DefaultVolumeSize = MinVolumeSize TopologySegmentLocation = PluginName + "/location" + ProvidedByLabel = "instance.hetzner.cloud/provided-by" ) diff --git a/internal/driver/node.go b/internal/driver/node.go index 5b084ea1..6716ff02 100644 --- a/internal/driver/node.go +++ b/internal/driver/node.go @@ -178,6 +178,7 @@ func (s *NodeService) NodeGetInfo(_ context.Context, _ *proto.NodeGetInfoRequest AccessibleTopology: &proto.Topology{ Segments: map[string]string{ TopologySegmentLocation: s.serverLocation, + ProvidedByLabel: "cloud", }, }, }