diff --git a/helm-charts/HPA.md b/helm-charts/HPA.md
index 4e27f751e..bbf2904f9 100644
--- a/helm-charts/HPA.md
+++ b/helm-charts/HPA.md
@@ -84,6 +84,10 @@ ChatQnA includes pre-configured values files for scaling the services.
 
 To enable HPA, add `-f chatqna/hpa-values.yaml` option to your `helm install` command line.
 
+If **CPU** versions of the TGI (and TEI) services are being scaled, resource requests and probe
+timings suitable for CPU usage need to be used. For that, add the `-f chatqna/cpu-values.yaml`
+option to your `helm install` command line.
+
 ### Verify
 
 To verify that horizontalPodAutoscaler options work, it's better to check that both inferencing
diff --git a/helm-charts/chatqna/cpu-values.yaml b/helm-charts/chatqna/cpu-values.yaml
new file mode 100644
index 000000000..b4c5ee5dd
--- /dev/null
+++ b/helm-charts/chatqna/cpu-values.yaml
@@ -0,0 +1,114 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+# Override CPU resource request and probe timing values in specific subcharts
+#
+# RESOURCES
+#
+# Resource requests that match actual resource usage (with enough slack)
+# are important when a service is scaled up, so that the right number of
+# pods gets scheduled to the right nodes.
+#
+# Because resource usage depends on the devices, model, data type
+# and SW versions used, and this top-level chart has overrides for them,
+# resource requests need to be specified here too.
+#
+# To test a service without resource requests, use "resources: {}".
+#
+# PROBES
+#
+# Inferencing pod startup / warmup takes *much* longer on CPUs than
+# on acceleration devices, and their responses are also slower,
+# especially when a node is running several instances of these services.
+#
+# Kubernetes restarting a pod before its startup finishes, or not
+# sending it queries because slow readiness responses keep it out of
+# the ready state, really does NOT help in getting faster responses.
+#
+# => probe timings need to be increased when running on CPU.
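+#
+# As an illustrative example of what these timings mean (numbers taken
+# from the TGI values below): a startupProbe with periodSeconds 5 and
+# failureThreshold 180 gives a pod up to 5s * 180 = 15 minutes to finish
+# model loading / warmup before Kubernetes restarts it.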
+
+tgi:
+  # TODO: add Helm value also for TGI data type option:
+  # https://github.com/opea-project/GenAIExamples/issues/330
+  LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
+
+  # Potentially suitable values for scaling CPU TGI 2.2 with Intel/neural-chat-7b-v3-3 @ 32-bit:
+  resources:
+    limits:
+      cpu: 8
+      memory: 70Gi
+    requests:
+      cpu: 6
+      memory: 65Gi
+
+  livenessProbe:
+    initialDelaySeconds: 8
+    periodSeconds: 8
+    failureThreshold: 24
+    timeoutSeconds: 4
+  readinessProbe:
+    initialDelaySeconds: 16
+    periodSeconds: 8
+    timeoutSeconds: 4
+  startupProbe:
+    initialDelaySeconds: 10
+    periodSeconds: 5
+    failureThreshold: 180
+    timeoutSeconds: 2
+
+teirerank:
+  RERANK_MODEL_ID: "BAAI/bge-reranker-base"
+
+  # Potentially suitable values for scaling CPU TEI v1.5 with BAAI/bge-reranker-base model:
+  resources:
+    limits:
+      cpu: 4
+      memory: 30Gi
+    requests:
+      cpu: 2
+      memory: 25Gi
+
+  livenessProbe:
+    initialDelaySeconds: 8
+    periodSeconds: 8
+    failureThreshold: 24
+    timeoutSeconds: 4
+  readinessProbe:
+    initialDelaySeconds: 8
+    periodSeconds: 8
+    timeoutSeconds: 4
+  startupProbe:
+    initialDelaySeconds: 5
+    periodSeconds: 5
+    failureThreshold: 120
+
+tei:
+  EMBEDDING_MODEL_ID: "BAAI/bge-base-en-v1.5"
+
+  # Potentially suitable values for scaling CPU TEI v1.5 with BAAI/bge-base-en-v1.5 model:
+  resources:
+    limits:
+      cpu: 4
+      memory: 4Gi
+    requests:
+      cpu: 2
+      memory: 3Gi
+
+  livenessProbe:
+    initialDelaySeconds: 5
+    periodSeconds: 5
+    failureThreshold: 24
+    timeoutSeconds: 2
+  readinessProbe:
+    initialDelaySeconds: 5
+    periodSeconds: 5
+    timeoutSeconds: 2
+  startupProbe:
+    initialDelaySeconds: 5
+    periodSeconds: 5
+    failureThreshold: 120
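
With the above values files, a CPU deployment with HPA could be installed along these lines
(a sketch only; the `chatqna` release name and local chart path are assumptions, adjust them
to your environment):

```console
helm install chatqna chatqna -f chatqna/hpa-values.yaml -f chatqna/cpu-values.yaml
```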