From be3620bb16cd940050fd5b71e5e583a3cece499d Mon Sep 17 00:00:00 2001
From: jerryzhuang
Date: Sat, 21 Dec 2024 10:21:39 +1100
Subject: [PATCH] feat: add qwen preset test (#788)

**Reason for Change**:
- remove outdated manifests
- replace LoadBalancer with ClusterIP; the tests no longer access the service IP directly

---------

Signed-off-by: jerryzhuang
Co-authored-by: Fei Guo
---
 .github/e2e-preset-configs.json                |  9 ++
 .github/workflows/e2e-preset-test.yml          | 54 +++++++-----
 .../workspace/models/supported_models.yaml     | 54 ++++++------
 .../falcon-40b-instruct-service.yaml           |  2 +-
 .../falcon-40b/falcon-40b-service.yaml         |  2 +-
 .../falcon-7b-instruct-service.yaml            |  2 +-
 .../falcon-7b-with-adapter/falcon-7b.yaml      | 51 ------------
 .../falcon-7b/falcon-7b-service.yaml           |  2 +-
 .../llama-2-13b-chat-service.yaml              |  2 +-
 .../llama-2-13b/llama-2-13b-service.yaml       |  2 +-
 .../llama-2-7b-chat-service.yaml               |  2 +-
 .../llama-2-7b/llama-2-7b-service.yaml         |  2 +-
 .../test/manifests/llama-headless.yaml         | 14 ----
 .../mistral-7b-instruct-service.yaml           |  2 +-
 .../mistral-7b/mistral-7b-service.yaml         |  2 +-
 .../test/manifests/phi-2/phi-2-service.yaml    |  2 +-
 .../phi-3-medium-128k-instruct-service.yaml    |  2 +-
 .../phi-3-medium-4k-instruct-service.yaml      |  2 +-
 .../phi-3-mini-128k-instruct-service.yaml      |  2 +-
 .../phi-3-mini-4k-instruct-service.yaml        |  2 +-
 .../phi-3-small-128k-instruct-service.yaml     |  2 +-
 .../phi-3-small-8k-instruct-service.yaml       |  2 +-
 .../qwen2-5-coder-7b-instruct-service.yaml     | 13 +++
 .../qwen2-5-coder-7b-instruct_hf.yaml          | 55 ++++++++++++
 .../qwen2-5-coder-7b-instruct_vllm.yaml        | 83 +++++++++++++++++++
 25 files changed, 234 insertions(+), 133 deletions(-)
 delete mode 100644 presets/workspace/test/manifests/falcon-7b-with-adapter/falcon-7b.yaml
 delete mode 100644 presets/workspace/test/manifests/llama-headless.yaml
 create mode 100644 presets/workspace/test/manifests/qwen2-5-coder-7b-instruct/qwen2-5-coder-7b-instruct-service.yaml
 create mode 100644 presets/workspace/test/manifests/qwen2-5-coder-7b-instruct/qwen2-5-coder-7b-instruct_hf.yaml
 create mode 100644 presets/workspace/test/manifests/qwen2-5-coder-7b-instruct/qwen2-5-coder-7b-instruct_vllm.yaml

diff --git a/.github/e2e-preset-configs.json b/.github/e2e-preset-configs.json
index a0c45bdcf..9d20b0e38 100644
--- a/.github/e2e-preset-configs.json
+++ b/.github/e2e-preset-configs.json
@@ -98,6 +98,15 @@
             "OSS": true,
             "loads_adapter": false
         },
+        {
+            "name": "qwen2.5-coder-7b-instruct",
+            "workload": "qwen2-5-coder-7b-instruct",
+            "node-count": 1,
+            "node-vm-size": "Standard_NC12s_v3",
+            "node-osdisk-size": 100,
+            "OSS": true,
+            "loads_adapter": false
+        },
         {
             "name": "llama-2-7b",
             "node-count": 1,
diff --git a/.github/workflows/e2e-preset-test.yml b/.github/workflows/e2e-preset-test.yml
index cb2ee2802..75d073d0f 100644
--- a/.github/workflows/e2e-preset-test.yml
+++ b/.github/workflows/e2e-preset-test.yml
@@ -170,6 +170,7 @@ jobs:
         run: |
           NAME_SUFFIX=${{ matrix.model.name }}
           NAME_SUFFIX_WITHOUT_DASHES=${NAME_SUFFIX//-/} # Removing all '-' symbols
+          NAME_SUFFIX_WITHOUT_DASHES=${NAME_SUFFIX_WITHOUT_DASHES//./} # Removing all '.' symbols
           if [ ${#NAME_SUFFIX_WITHOUT_DASHES} -gt 12 ]; then
             TRUNCATED_NAME_SUFFIX=${NAME_SUFFIX_WITHOUT_DASHES:0:12}
@@ -213,16 +214,21 @@
             fi
           fi

+      - name: Get testing workload
+        id: workload
+        run: |
+          WORKLOAD_NAME=${{ matrix.model.workload || matrix.model.name }}
+          echo "WORKLOAD_NAME=$WORKLOAD_NAME" >> $GITHUB_OUTPUT
+          echo "WORKLOAD_FILE_PREFIX=presets/workspace/test/manifests/$WORKLOAD_NAME/$WORKLOAD_NAME" >> $GITHUB_OUTPUT
+
       - name: Create Service
-        run: kubectl apply -f presets/workspace/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}-service.yaml
+        run: |
+          kubectl apply -f ${{steps.workload.outputs.WORKLOAD_FILE_PREFIX}}-service.yaml

       - name: Retrieve External Service IP
         id: get_ip
         run: |
-          while [[ -z $SERVICE_IP ]]; do
-            SERVICE_IP=$(kubectl get svc ${{ matrix.model.name }} -o=jsonpath='{.status.loadBalancer.ingress[0].ip}')
-            sleep 5
-          done
+          SERVICE_IP=$(kubectl get svc ${{steps.workload.outputs.WORKLOAD_NAME}} -o=jsonpath='{.spec.clusterIP}')
           echo "Service IP is $SERVICE_IP"
           echo "SERVICE_IP=$SERVICE_IP" >> $GITHUB_OUTPUT
@@ -235,36 +241,38 @@
       - name: Replace IP and Deploy Resource to K8s
         run: |
           POSTFIX=$(echo "${{ matrix.model.name }}" | grep -q "llama" && echo "" || echo "_${{ env.RUNTIME }}")
-          sed -i "s/MASTER_ADDR_HERE/${{ steps.get_ip.outputs.SERVICE_IP }}/g" presets/workspace/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}$POSTFIX.yaml
-          sed -i "s/TAG_HERE/${{ matrix.model.tag }}/g" presets/workspace/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}$POSTFIX.yaml
-          sed -i "s/REPO_HERE/${{ secrets.ACR_AMRT_USERNAME }}/g" presets/workspace/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}$POSTFIX.yaml
-          kubectl apply -f presets/workspace/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}$POSTFIX.yaml
+          WORKLOAD_FILE=${{steps.workload.outputs.WORKLOAD_FILE_PREFIX}}$POSTFIX.yaml
+
+          sed -i "s/MASTER_ADDR_HERE/${{ steps.get_ip.outputs.SERVICE_IP }}/g" $WORKLOAD_FILE
+          sed -i "s/TAG_HERE/${{ matrix.model.tag }}/g" $WORKLOAD_FILE
+          sed -i "s/REPO_HERE/${{ secrets.ACR_AMRT_USERNAME }}/g" $WORKLOAD_FILE
+          kubectl apply -f $WORKLOAD_FILE

       - name: Wait for Resource to be ready
         run: |
-          kubectl rollout status ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} --timeout=1800s
+          kubectl rollout status ${{steps.resource.outputs.RESOURCE_TYPE}}/${{steps.workload.outputs.WORKLOAD_NAME}} --timeout=1800s

       - name: Check Adapter Loading from Logs
         if: matrix.model.loads_adapter == true
         run: |
-          POD_NAME=$(kubectl get pods -l app=${{ matrix.model.name }} -o jsonpath="{.items[0].metadata.name}")
+          POD_NAME=$(kubectl get pods -l app=${{steps.workload.outputs.WORKLOAD_NAME}} -o jsonpath="{.items[0].metadata.name}")
           kubectl logs $POD_NAME | grep "Adapter added:" | grep "${{ matrix.model.expected_adapter }}" || (echo "Adapter not loaded or incorrect adapter loaded" && exit 1)

       - name: Install testing commands
         run: |
-          kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- apt-get update
-          kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- apt-get install -y curl
+          kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{steps.workload.outputs.WORKLOAD_NAME}} -- apt-get update
+          kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{steps.workload.outputs.WORKLOAD_NAME}} -- apt-get install -y curl

       - name: Test healthz endpoint
         run: |
-          kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- \
+          kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{steps.workload.outputs.WORKLOAD_NAME}} -- \
           curl -s http://localhost:5000/health

       - name: Test inference endpoint
         run: |
           echo "Testing inference for ${{ matrix.model.name }}"
           if [[ "${{ matrix.model.name }}" == *"llama"* && "${{ matrix.model.name }}" == *"-chat"* ]]; then
-            kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- \
+            kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{steps.workload.outputs.WORKLOAD_NAME}} -- \
             curl -s -X POST \
             -H "Content-Type: application/json" \
             -d '{
@@ -285,7 +293,7 @@
             }' \
             http://localhost:5000/chat
           elif [[ "${{ matrix.model.name }}" == *"llama"* ]]; then
-            kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- \
+            kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{steps.workload.outputs.WORKLOAD_NAME}} -- \
             curl -s -X POST \
             -H "Content-Type: application/json" \
             -d '{
@@ -301,7 +309,7 @@
             }' \
             http://localhost:5000/generate
           elif [[ "${{ env.RUNTIME }}" == *"vllm"* ]]; then
-            kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- \
+            kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{steps.workload.outputs.WORKLOAD_NAME}} -- \
             curl -s -X POST \
             -H "accept: application/json" \
             -H "Content-Type: application/json" \
             -d '{
@@ -320,7 +328,7 @@
             }' \
             http://localhost:5000/v1/chat/completions
           else
-            kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- \
+            kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{steps.workload.outputs.WORKLOAD_NAME}} -- \
             curl -s -X POST \
             -H "accept: application/json" \
             -H "Content-Type: application/json" \
             -d '{
@@ -367,15 +375,15 @@
             RESOURCE_TYPE=${{ steps.resource.outputs.RESOURCE_TYPE }}

             # Check and Delete K8s Resource (Deployment or StatefulSet)
-            if kubectl get $RESOURCE_TYPE ${{ matrix.model.name }} > /dev/null 2>&1; then
-              kubectl logs $RESOURCE_TYPE/${{ matrix.model.name }}
-              kubectl delete $RESOURCE_TYPE ${{ matrix.model.name }}
+            if kubectl get $RESOURCE_TYPE ${{steps.workload.outputs.WORKLOAD_NAME}} > /dev/null 2>&1; then
+              kubectl logs $RESOURCE_TYPE/${{steps.workload.outputs.WORKLOAD_NAME}}
+              kubectl delete $RESOURCE_TYPE ${{steps.workload.outputs.WORKLOAD_NAME}}
             fi
           fi

           # Check and Delete K8s Service if it exists
-          if kubectl get svc ${{ matrix.model.name }} > /dev/null 2>&1; then
-            kubectl delete svc ${{ matrix.model.name }}
+          if kubectl get svc ${{steps.workload.outputs.WORKLOAD_NAME}} > /dev/null 2>&1; then
+            kubectl delete svc ${{steps.workload.outputs.WORKLOAD_NAME}}
           fi

           # Check and Delete AKS Nodepool if it exists
diff --git a/presets/workspace/models/supported_models.yaml b/presets/workspace/models/supported_models.yaml
index 99a61a10e..eaa7dbb10 100644
--- a/presets/workspace/models/supported_models.yaml
+++ b/presets/workspace/models/supported_models.yaml
@@ -35,13 +35,14 @@ models:
     type: text-generation
    version: https://huggingface.co/tiiuae/falcon-7b/commit/898df1396f35e447d5fe44e0a3ccaaaa69f30d36
     runtime: tfs
-    tag: 0.0.7
+    tag: 0.0.8
   - name: falcon-7b-instruct
     type: text-generation
     version: https://huggingface.co/tiiuae/falcon-7b-instruct/commit/cf4b3c42ce2fdfe24f753f0f0d179202fea59c99
     runtime: tfs
-    tag: 0.0.7
+    tag: 0.0.8
     # Tag history:
+    # 0.0.8 - Support adapter and config file for VLLM runtime
     # 0.0.7 - Support VLLM runtime
     # 0.0.6 - Add Logging & Metrics Server
     # 0.0.5 - Tuning and Adapters
@@ -53,13 +54,14 @@ models:
     type: text-generation
     version: https://huggingface.co/tiiuae/falcon-40b/commit/4a70170c215b36a3cce4b4253f6d0612bb7d4146
     runtime: tfs
-    tag: 0.0.8
+    tag: 0.0.9
   - name: falcon-40b-instruct
     type: text-generation
     version: https://huggingface.co/tiiuae/falcon-40b-instruct/commit/ecb78d97ac356d098e79f0db222c9ce7c5d9ee5f
     runtime: tfs
-    tag: 0.0.8
+    tag: 0.0.9
     # Tag history for 40b models:
+    # 0.0.9 - Support adapter and config file for VLLM runtime
     # 0.0.8 - Support VLLM runtime
     # 0.0.7 - Add Logging & Metrics Server
     # 0.0.6 - Tuning and Adapters
@@ -74,13 +76,14 @@ models:
     type: text-generation
     version: https://huggingface.co/mistralai/Mistral-7B-v0.3/commit/d8cadc02ac76bd617a919d50b092e59d2d110aff
     runtime: tfs
-    tag: 0.0.8
+    tag: 0.0.9
   - name: mistral-7b-instruct
     type: text-generation
     version: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3/commit/e0bc86c23ce5aae1db576c8cca6f06f1f73af2db
     runtime: tfs
-    tag: 0.0.8
+    tag: 0.0.9
     # Tag history:
+    # 0.0.9 - Support adapter and config file for VLLM runtime
     # 0.0.8 - Support VLLM runtime
     # 0.0.7 - Add Logging & Metrics Server
     # 0.0.6 - Update model version and Address missing weights files fix
@@ -95,8 +98,9 @@ models:
     type: text-generation
     version: https://huggingface.co/microsoft/phi-2/commit/ef382358ec9e382308935a992d908de099b64c23
     runtime: tfs
-    tag: 0.0.6
+    tag: 0.0.7
     # Tag history:
+    # 0.0.7 - Support adapter and config file for VLLM runtime
     # 0.0.6 - Support VLLM runtime
     # 0.0.5 - Add Logging & Metrics Server
     # 0.0.4 - Tuning and Adapters
@@ -109,38 +113,24 @@ models:
     type: text-generation
     version: https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/commit/0a67737cc96d2554230f90338b163bc6380a2a85
     runtime: tfs
-    tag: 0.0.3
-    # Tag history:
-    # 0.0.3 - Support VLLM runtime
-    # 0.0.2 - Add Logging & Metrics Server
-    # 0.0.1 - Initial Release
-
+    tag: 0.0.4
   - name: phi-3-mini-128k-instruct
     type: text-generation
     version: https://huggingface.co/microsoft/Phi-3-mini-128k-instruct/commit/a90b62ae09941edff87a90ced39ba5807e6b2ade
     runtime: tfs
-    tag: 0.0.3
-    # Tag history:
-    # 0.0.3 - Support VLLM runtime
-    # 0.0.2 - Add Logging & Metrics Server
-    # 0.0.1 - Initial Release
-
+    tag: 0.0.4
   - name: phi-3-medium-4k-instruct
     type: text-generation
     version: https://huggingface.co/microsoft/Phi-3-medium-4k-instruct/commit/ae004ae82eb6eddc32906dfacb1d6dfea8f91996
     runtime: tfs
-    tag: 0.0.3
-    # Tag history:
-    # 0.0.3 - Support VLLM runtime
-    # 0.0.2 - Add Logging & Metrics Server
-    # 0.0.1 - Initial Release
-
+    tag: 0.0.4
   - name: phi-3-medium-128k-instruct
     type: text-generation
     version: https://huggingface.co/microsoft/Phi-3-medium-128k-instruct/commit/fa7d2aa4f5ea69b2e36b20d050cdae79c9bfbb3f
     runtime: tfs
-    tag: 0.0.3
+    tag: 0.0.4
     # Tag history:
+    # 0.0.4 - Support adapter and config file for VLLM runtime
     # 0.0.3 - Support VLLM runtime
     # 0.0.2 - Add Logging & Metrics Server
     # 0.0.1 - Initial Release
@@ -149,7 +139,15 @@ models:
     type: text-generation
     version: https://huggingface.co/microsoft/Phi-3.5-mini-instruct/commit/af0dfb8029e8a74545d0736d30cb6b58d2f0f3f0
     runtime: tfs
-    tag: 0.0.1
+    tag: 0.0.2
     # Tag history:
+    # 0.0.2 - Support adapter and config file for VLLM runtime
     # 0.0.1 - New Model! Support VLLM Runtime
-    
\ No newline at end of file
+
+  - name: qwen2.5-coder-7b-instruct
+    type: text-generation
+    version: https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct/commit/0eb6b1ed2d0c4306bc637d09ecef51e59d3dfe05
+    runtime: tfs
+    tag: 0.0.1
+    # Tag history:
+    # 0.0.1 - New Model!
\ No newline at end of file
diff --git a/presets/workspace/test/manifests/falcon-40b-instruct/falcon-40b-instruct-service.yaml b/presets/workspace/test/manifests/falcon-40b-instruct/falcon-40b-instruct-service.yaml
index 27f21ec46..fc357931a 100644
--- a/presets/workspace/test/manifests/falcon-40b-instruct/falcon-40b-instruct-service.yaml
+++ b/presets/workspace/test/manifests/falcon-40b-instruct/falcon-40b-instruct-service.yaml
@@ -9,5 +9,5 @@ spec:
     - protocol: TCP
       port: 80
       targetPort: 5000
-  type: LoadBalancer
+  type: ClusterIP
   publishNotReadyAddresses: true
diff --git a/presets/workspace/test/manifests/falcon-40b/falcon-40b-service.yaml b/presets/workspace/test/manifests/falcon-40b/falcon-40b-service.yaml
index 689361052..80ab4b539 100644
--- a/presets/workspace/test/manifests/falcon-40b/falcon-40b-service.yaml
+++ b/presets/workspace/test/manifests/falcon-40b/falcon-40b-service.yaml
@@ -9,5 +9,5 @@ spec:
     - protocol: TCP
       port: 80
       targetPort: 5000
-  type: LoadBalancer
+  type: ClusterIP
   publishNotReadyAddresses: true
diff --git a/presets/workspace/test/manifests/falcon-7b-instruct/falcon-7b-instruct-service.yaml b/presets/workspace/test/manifests/falcon-7b-instruct/falcon-7b-instruct-service.yaml
index 6acbe2405..2f27d46cb 100644
--- a/presets/workspace/test/manifests/falcon-7b-instruct/falcon-7b-instruct-service.yaml
+++ b/presets/workspace/test/manifests/falcon-7b-instruct/falcon-7b-instruct-service.yaml
@@ -9,5 +9,5 @@ spec:
     - protocol: TCP
       port: 80
       targetPort: 5000
-  type: LoadBalancer
+  type: ClusterIP
   publishNotReadyAddresses: true
diff --git a/presets/workspace/test/manifests/falcon-7b-with-adapter/falcon-7b.yaml b/presets/workspace/test/manifests/falcon-7b-with-adapter/falcon-7b.yaml
deleted file mode 100644
index 349a377a0..000000000
--- a/presets/workspace/test/manifests/falcon-7b-with-adapter/falcon-7b.yaml
+++ /dev/null
@@ -1,51 +0,0 @@
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: falcon-7b
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: falcon
-  template:
-    metadata:
-      labels:
-        app: falcon
-    spec:
-      volumes:
-        - name: adapter-volume
-          emptyDir: {}
-      initContainers:
-        - name: falcon-7b-adapter
-          image:
-          imagePullPolicy: Always
-          command: ["/bin/sh", "-c", "mkdir -p /mnt/adapter/falcon-7b-adapter && cp -r /data/* /mnt/adapter/falcon-7b-adapter"]
-          volumeMounts:
-            - name: adapter-volume
-              mountPath: /mnt/adapter
-      containers:
-        - name: falcon-container
-          image:
-          command:
-            - /bin/sh
-            - -c
-            - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16
-          resources:
-            requests:
-              nvidia.com/gpu: 2
-            limits:
-              nvidia.com/gpu: 2 # Requesting 2 GPUs
-          volumeMounts:
-            - name: adapter-volume
-              mountPath: /mnt/adapter
-          env:
-            - name: falcon-7b-adapter
-              value: "0.2"
-      tolerations:
-        - effect: NoSchedule
-          value: gpu
-          key: sku
-          operator: Equal
-        - effect: NoSchedule
-          key: nvidia.com/gpu
-          operator: Exists
diff --git a/presets/workspace/test/manifests/falcon-7b/falcon-7b-service.yaml b/presets/workspace/test/manifests/falcon-7b/falcon-7b-service.yaml
index acf56ba74..595e83942 100644
--- a/presets/workspace/test/manifests/falcon-7b/falcon-7b-service.yaml
+++ b/presets/workspace/test/manifests/falcon-7b/falcon-7b-service.yaml
@@ -9,5 +9,5 @@ spec:
     - protocol: TCP
       port: 80
       targetPort: 5000
-  type: LoadBalancer
+  type: ClusterIP
   publishNotReadyAddresses: true
diff --git a/presets/workspace/test/manifests/llama-2-13b-chat/llama-2-13b-chat-service.yaml b/presets/workspace/test/manifests/llama-2-13b-chat/llama-2-13b-chat-service.yaml
index 79efb227b..58720a91d 100644
--- a/presets/workspace/test/manifests/llama-2-13b-chat/llama-2-13b-chat-service.yaml
+++ b/presets/workspace/test/manifests/llama-2-13b-chat/llama-2-13b-chat-service.yaml
@@ -15,5 +15,5 @@ spec:
       protocol: TCP
       port: 29500
       targetPort: 29500
-  type: LoadBalancer
+  type: ClusterIP
   publishNotReadyAddresses: true
diff --git a/presets/workspace/test/manifests/llama-2-13b/llama-2-13b-service.yaml b/presets/workspace/test/manifests/llama-2-13b/llama-2-13b-service.yaml
index c3cd3bdb6..f43826a48 100644
--- a/presets/workspace/test/manifests/llama-2-13b/llama-2-13b-service.yaml
+++ b/presets/workspace/test/manifests/llama-2-13b/llama-2-13b-service.yaml
@@ -15,5 +15,5 @@ spec:
       protocol: TCP
       port: 29500
       targetPort: 29500
-  type: LoadBalancer
+  type: ClusterIP
   publishNotReadyAddresses: true
diff --git a/presets/workspace/test/manifests/llama-2-7b-chat/llama-2-7b-chat-service.yaml b/presets/workspace/test/manifests/llama-2-7b-chat/llama-2-7b-chat-service.yaml
index d661db59b..99fc7895d 100644
--- a/presets/workspace/test/manifests/llama-2-7b-chat/llama-2-7b-chat-service.yaml
+++ b/presets/workspace/test/manifests/llama-2-7b-chat/llama-2-7b-chat-service.yaml
@@ -10,5 +10,5 @@ spec:
     - protocol: TCP
       port: 80
       targetPort: 5000
-  type: LoadBalancer
+  type: ClusterIP
   publishNotReadyAddresses: true
diff --git a/presets/workspace/test/manifests/llama-2-7b/llama-2-7b-service.yaml b/presets/workspace/test/manifests/llama-2-7b/llama-2-7b-service.yaml
index 30967c332..d8dfb84c7 100644
--- a/presets/workspace/test/manifests/llama-2-7b/llama-2-7b-service.yaml
+++ b/presets/workspace/test/manifests/llama-2-7b/llama-2-7b-service.yaml
@@ -10,5 +10,5 @@ spec:
     - protocol: TCP
       port: 80
       targetPort: 5000
-  type: LoadBalancer
+  type: ClusterIP
   publishNotReadyAddresses: true
diff --git a/presets/workspace/test/manifests/llama-headless.yaml b/presets/workspace/test/manifests/llama-headless.yaml
deleted file mode 100644
index e0514564f..000000000
--- a/presets/workspace/test/manifests/llama-headless.yaml
+++ /dev/null
@@ -1,14 +0,0 @@
-apiVersion: v1
-kind: Service
-metadata:
-  name: llama-headless
-spec:
-  selector:
-    app: llama
-  clusterIP: None
-  ports:
-    - name: torchrun
-      protocol: TCP
-      port: 29500
-      targetPort: 29500
-  publishNotReadyAddresses: true
\ No newline at end of file
diff --git a/presets/workspace/test/manifests/mistral-7b-instruct/mistral-7b-instruct-service.yaml b/presets/workspace/test/manifests/mistral-7b-instruct/mistral-7b-instruct-service.yaml
index 31b9206bc..94627746d 100644
--- a/presets/workspace/test/manifests/mistral-7b-instruct/mistral-7b-instruct-service.yaml
+++ b/presets/workspace/test/manifests/mistral-7b-instruct/mistral-7b-instruct-service.yaml
@@ -9,5 +9,5 @@ spec:
     - protocol: TCP
       port: 80
       targetPort: 5000
-  type: LoadBalancer
+  type: ClusterIP
   publishNotReadyAddresses: true
diff --git a/presets/workspace/test/manifests/mistral-7b/mistral-7b-service.yaml b/presets/workspace/test/manifests/mistral-7b/mistral-7b-service.yaml
index 650422c7c..90ba3ec8f 100644
--- a/presets/workspace/test/manifests/mistral-7b/mistral-7b-service.yaml
+++ b/presets/workspace/test/manifests/mistral-7b/mistral-7b-service.yaml
@@ -9,5 +9,5 @@ spec:
     - protocol: TCP
       port: 80
       targetPort: 5000
-  type: LoadBalancer
+  type: ClusterIP
   publishNotReadyAddresses: true
diff --git a/presets/workspace/test/manifests/phi-2/phi-2-service.yaml b/presets/workspace/test/manifests/phi-2/phi-2-service.yaml
index b81036bd8..d0f99f9ad 100644
--- a/presets/workspace/test/manifests/phi-2/phi-2-service.yaml
+++ b/presets/workspace/test/manifests/phi-2/phi-2-service.yaml
@@ -9,5 +9,5 @@ spec:
     - protocol: TCP
       port: 80
       targetPort: 5000
-  type: LoadBalancer
+  type: ClusterIP
   publishNotReadyAddresses: true
diff --git a/presets/workspace/test/manifests/phi-3-medium-128k-instruct/phi-3-medium-128k-instruct-service.yaml b/presets/workspace/test/manifests/phi-3-medium-128k-instruct/phi-3-medium-128k-instruct-service.yaml
index 8162f342c..bab354ee9 100644
--- a/presets/workspace/test/manifests/phi-3-medium-128k-instruct/phi-3-medium-128k-instruct-service.yaml
+++ b/presets/workspace/test/manifests/phi-3-medium-128k-instruct/phi-3-medium-128k-instruct-service.yaml
@@ -9,5 +9,5 @@ spec:
     - protocol: TCP
       port: 80
       targetPort: 5000
-  type: LoadBalancer
+  type: ClusterIP
   publishNotReadyAddresses: true
diff --git a/presets/workspace/test/manifests/phi-3-medium-4k-instruct/phi-3-medium-4k-instruct-service.yaml b/presets/workspace/test/manifests/phi-3-medium-4k-instruct/phi-3-medium-4k-instruct-service.yaml
index b0fd7047c..60710504f 100644
--- a/presets/workspace/test/manifests/phi-3-medium-4k-instruct/phi-3-medium-4k-instruct-service.yaml
+++ b/presets/workspace/test/manifests/phi-3-medium-4k-instruct/phi-3-medium-4k-instruct-service.yaml
@@ -9,5 +9,5 @@ spec:
     - protocol: TCP
       port: 80
       targetPort: 5000
-  type: LoadBalancer
+  type: ClusterIP
   publishNotReadyAddresses: true
diff --git a/presets/workspace/test/manifests/phi-3-mini-128k-instruct/phi-3-mini-128k-instruct-service.yaml b/presets/workspace/test/manifests/phi-3-mini-128k-instruct/phi-3-mini-128k-instruct-service.yaml
index c4c613237..ef86aefb2 100644
--- a/presets/workspace/test/manifests/phi-3-mini-128k-instruct/phi-3-mini-128k-instruct-service.yaml
+++ b/presets/workspace/test/manifests/phi-3-mini-128k-instruct/phi-3-mini-128k-instruct-service.yaml
@@ -9,5 +9,5 @@ spec:
     - protocol: TCP
       port: 80
       targetPort: 5000
-  type: LoadBalancer
+  type: ClusterIP
   publishNotReadyAddresses: true
diff --git a/presets/workspace/test/manifests/phi-3-mini-4k-instruct/phi-3-mini-4k-instruct-service.yaml b/presets/workspace/test/manifests/phi-3-mini-4k-instruct/phi-3-mini-4k-instruct-service.yaml
index 3e7426ae8..0063f24aa 100644
--- a/presets/workspace/test/manifests/phi-3-mini-4k-instruct/phi-3-mini-4k-instruct-service.yaml
+++ b/presets/workspace/test/manifests/phi-3-mini-4k-instruct/phi-3-mini-4k-instruct-service.yaml
@@ -9,5 +9,5 @@ spec:
     - protocol: TCP
       port: 80
       targetPort: 5000
-  type: LoadBalancer
+  type: ClusterIP
   publishNotReadyAddresses: true
diff --git a/presets/workspace/test/manifests/phi-3-small-128k-instruct/phi-3-small-128k-instruct-service.yaml b/presets/workspace/test/manifests/phi-3-small-128k-instruct/phi-3-small-128k-instruct-service.yaml
index 86deb6985..a28bac071 100644
--- a/presets/workspace/test/manifests/phi-3-small-128k-instruct/phi-3-small-128k-instruct-service.yaml
+++ b/presets/workspace/test/manifests/phi-3-small-128k-instruct/phi-3-small-128k-instruct-service.yaml
@@ -9,5 +9,5 @@ spec:
     - protocol: TCP
       port: 80
       targetPort: 5000
-  type: LoadBalancer
+  type: ClusterIP
   publishNotReadyAddresses: true
diff --git a/presets/workspace/test/manifests/phi-3-small-8k-instruct/phi-3-small-8k-instruct-service.yaml b/presets/workspace/test/manifests/phi-3-small-8k-instruct/phi-3-small-8k-instruct-service.yaml
index 648de4337..17e031f87 100644
--- a/presets/workspace/test/manifests/phi-3-small-8k-instruct/phi-3-small-8k-instruct-service.yaml
+++ b/presets/workspace/test/manifests/phi-3-small-8k-instruct/phi-3-small-8k-instruct-service.yaml
@@ -9,5 +9,5 @@ spec:
     - protocol: TCP
       port: 80
       targetPort: 5000
-  type: LoadBalancer
+  type: ClusterIP
   publishNotReadyAddresses: true
diff --git a/presets/workspace/test/manifests/qwen2-5-coder-7b-instruct/qwen2-5-coder-7b-instruct-service.yaml b/presets/workspace/test/manifests/qwen2-5-coder-7b-instruct/qwen2-5-coder-7b-instruct-service.yaml
new file mode 100644
index 000000000..73637c99a
--- /dev/null
+++ b/presets/workspace/test/manifests/qwen2-5-coder-7b-instruct/qwen2-5-coder-7b-instruct-service.yaml
@@ -0,0 +1,13 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: qwen2-5-coder-7b-instruct
+spec:
+  selector:
+    app: qwen2-5-coder-7b-instruct
+  ports:
+    - protocol: TCP
+      port: 80
+      targetPort: 5000
+  type: ClusterIP
+  publishNotReadyAddresses: true
diff --git a/presets/workspace/test/manifests/qwen2-5-coder-7b-instruct/qwen2-5-coder-7b-instruct_hf.yaml b/presets/workspace/test/manifests/qwen2-5-coder-7b-instruct/qwen2-5-coder-7b-instruct_hf.yaml
new file mode 100644
index 000000000..e92d906d7
--- /dev/null
+++ b/presets/workspace/test/manifests/qwen2-5-coder-7b-instruct/qwen2-5-coder-7b-instruct_hf.yaml
@@ -0,0 +1,55 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: qwen2-5-coder-7b-instruct
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: qwen2-5-coder-7b-instruct
+  template:
+    metadata:
+      labels:
+        app: qwen2-5-coder-7b-instruct
+    spec:
+      containers:
+        - name: qwen2-5-coder-7b-instruct-container
+          image: REPO_HERE.azurecr.io/qwen2.5-coder-7b-instruct:TAG_HERE
+          command:
+            - /bin/sh
+            - -c
+            - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype auto --trust_remote_code
+          resources:
+            requests:
+              nvidia.com/gpu: 2
+            limits:
+              nvidia.com/gpu: 2
+          livenessProbe:
+            httpGet:
+              path: /health
+              port: 5000
+            initialDelaySeconds: 600 # 10 Min
+            periodSeconds: 10
+          readinessProbe:
+            httpGet:
+              path: /health
+              port: 5000
+            initialDelaySeconds: 30
+            periodSeconds: 10
+          volumeMounts:
+            - name: dshm
+              mountPath: /dev/shm
+      volumes:
+        - name: dshm
+          emptyDir:
+            medium: Memory
+      tolerations:
+        - effect: NoSchedule
+          key: sku
+          operator: Equal
+          value: gpu
+        - effect: NoSchedule
+          key: nvidia.com/gpu
+          operator: Exists
+      nodeSelector:
+        pool: qwen25coder7
\ No newline at end of file
diff --git a/presets/workspace/test/manifests/qwen2-5-coder-7b-instruct/qwen2-5-coder-7b-instruct_vllm.yaml b/presets/workspace/test/manifests/qwen2-5-coder-7b-instruct/qwen2-5-coder-7b-instruct_vllm.yaml
new file mode 100644
index 000000000..4c1e72510
--- /dev/null
+++ b/presets/workspace/test/manifests/qwen2-5-coder-7b-instruct/qwen2-5-coder-7b-instruct_vllm.yaml
@@ -0,0 +1,83 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: qwen2-5-coder-7b-instruct
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: qwen2-5-coder-7b-instruct
+  template:
+    metadata:
+      labels:
+        app: qwen2-5-coder-7b-instruct
+    spec:
+      containers:
+        - name: qwen2-5-coder-7b-instruct-container
+          image: REPO_HERE.azurecr.io/qwen2.5-coder-7b-instruct:TAG_HERE
+          command:
+            - /bin/sh
+            - -c
+            - python3 /workspace/vllm/inference_api.py --kaito-config-file /mnt/config/inference_config.yaml
+          resources:
+            requests:
+              nvidia.com/gpu: 2
+            limits:
+              nvidia.com/gpu: 2
+          livenessProbe:
+            httpGet:
+              path: /health
+              port: 5000
+            initialDelaySeconds: 600 # 10 Min
+            periodSeconds: 10
+          readinessProbe:
+            httpGet:
+              path: /health
+              port: 5000
+ initialDelaySeconds: 30 + periodSeconds: 10 + volumeMounts: + - name: dshm + mountPath: /dev/shm + - mountPath: /mnt/config + name: config-volume + volumes: + - name: dshm + emptyDir: + medium: Memory + - configMap: + defaultMode: 420 + name: qwen2-5-coder-7b-inference-params + name: config-volume + tolerations: + - effect: NoSchedule + key: sku + operator: Equal + value: gpu + - effect: NoSchedule + key: nvidia.com/gpu + operator: Exists + nodeSelector: + pool: qwen25coder7 +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: qwen2-5-coder-7b-inference-params +data: + inference_config.yaml: | + # Maximum number of steps to find the max available seq len fitting in the GPU memory. + max_probe_steps: 6 + + vllm: + cpu-offload-gb: 0 + gpu-memory-utilization: 0.95 + swap-space: 4 + served-model-name: test + dtype: float16 + tensor-parallel-size: 2 + + # max-seq-len-to-capture: 8192 + # num-scheduler-steps: 1 + # enable-chunked-prefill: false + # see https://docs.vllm.ai/en/stable/models/engine_args.html for more options.
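
--
Note on testing against ClusterIP services: since the services above no longer
get an external IP, endpoints are exercised from inside the pod, mirroring the
workflow's kubectl exec steps. A minimal manual smoke test for the new qwen
preset might look like the sketch below (illustrative only; it assumes the
manifests above are applied, the rollout is complete, and curl is installed in
the container, as the "Install testing commands" step does; the "model": "test"
field is an assumption matching the served-model-name in the ConfigMap above):

    WORKLOAD_NAME=qwen2-5-coder-7b-instruct

    # Health check: the inference server listens on port 5000 inside the pod.
    kubectl exec deployment/$WORKLOAD_NAME -- \
      curl -s http://localhost:5000/health

    # vLLM runtime: OpenAI-compatible chat completion; "test" matches the
    # served-model-name set in qwen2-5-coder-7b-inference-params.
    kubectl exec deployment/$WORKLOAD_NAME -- \
      curl -s -X POST \
      -H "Content-Type: application/json" \
      -d '{"model": "test", "messages": [{"role": "user", "content": "Write a hello world in Python"}], "max_tokens": 50}' \
      http://localhost:5000/v1/chat/completions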