From be3620bb16cd940050fd5b71e5e583a3cece499d Mon Sep 17 00:00:00 2001
From: jerryzhuang
Date: Sat, 21 Dec 2024 10:21:39 +1100
Subject: [PATCH] feat: add qwen preset test (#788)

**Reason for Change**:
- remove outdated manifests
- replace LoadBalancer with ClusterIP; the tests no longer access the service IP directly

---------

Signed-off-by: jerryzhuang
Co-authored-by: Fei Guo
---
 .github/e2e-preset-configs.json                |  9 ++
 .github/workflows/e2e-preset-test.yml          | 54 +++++++-----
 .../workspace/models/supported_models.yaml     | 54 ++++++------
 .../falcon-40b-instruct-service.yaml           |  2 +-
 .../falcon-40b/falcon-40b-service.yaml         |  2 +-
 .../falcon-7b-instruct-service.yaml            |  2 +-
 .../falcon-7b-with-adapter/falcon-7b.yaml      | 51 ------------
 .../falcon-7b/falcon-7b-service.yaml           |  2 +-
 .../llama-2-13b-chat-service.yaml              |  2 +-
 .../llama-2-13b/llama-2-13b-service.yaml       |  2 +-
 .../llama-2-7b-chat-service.yaml               |  2 +-
 .../llama-2-7b/llama-2-7b-service.yaml         |  2 +-
 .../test/manifests/llama-headless.yaml         | 14 ----
 .../mistral-7b-instruct-service.yaml           |  2 +-
 .../mistral-7b/mistral-7b-service.yaml         |  2 +-
 .../test/manifests/phi-2/phi-2-service.yaml    |  2 +-
 .../phi-3-medium-128k-instruct-service.yaml    |  2 +-
 .../phi-3-medium-4k-instruct-service.yaml      |  2 +-
 .../phi-3-mini-128k-instruct-service.yaml      |  2 +-
 .../phi-3-mini-4k-instruct-service.yaml        |  2 +-
 .../phi-3-small-128k-instruct-service.yaml     |  2 +-
 .../phi-3-small-8k-instruct-service.yaml       |  2 +-
 .../qwen2-5-coder-7b-instruct-service.yaml     | 13 +++
 .../qwen2-5-coder-7b-instruct_hf.yaml          | 55 ++++++++++++
 .../qwen2-5-coder-7b-instruct_vllm.yaml        | 83 +++++++++++++++++++
 25 files changed, 234 insertions(+), 133 deletions(-)
 delete mode 100644 presets/workspace/test/manifests/falcon-7b-with-adapter/falcon-7b.yaml
 delete mode 100644 presets/workspace/test/manifests/llama-headless.yaml
 create mode 100644 presets/workspace/test/manifests/qwen2-5-coder-7b-instruct/qwen2-5-coder-7b-instruct-service.yaml
 create mode 100644 presets/workspace/test/manifests/qwen2-5-coder-7b-instruct/qwen2-5-coder-7b-instruct_hf.yaml
 create mode 100644 presets/workspace/test/manifests/qwen2-5-coder-7b-instruct/qwen2-5-coder-7b-instruct_vllm.yaml

diff --git a/.github/e2e-preset-configs.json b/.github/e2e-preset-configs.json
index a0c45bdcf..9d20b0e38 100644
--- a/.github/e2e-preset-configs.json
+++ b/.github/e2e-preset-configs.json
@@ -98,6 +98,15 @@
             "OSS": true,
             "loads_adapter": false
         },
+        {
+            "name": "qwen2.5-coder-7b-instruct",
+            "workload": "qwen2-5-coder-7b-instruct",
+            "node-count": 1,
+            "node-vm-size": "Standard_NC12s_v3",
+            "node-osdisk-size": 100,
+            "OSS": true,
+            "loads_adapter": false
+        },
         {
             "name": "llama-2-7b",
             "node-count": 1,
diff --git a/.github/workflows/e2e-preset-test.yml b/.github/workflows/e2e-preset-test.yml
index cb2ee2802..75d073d0f 100644
--- a/.github/workflows/e2e-preset-test.yml
+++ b/.github/workflows/e2e-preset-test.yml
@@ -170,6 +170,7 @@ jobs:
         run: |
           NAME_SUFFIX=${{ matrix.model.name }}
           NAME_SUFFIX_WITHOUT_DASHES=${NAME_SUFFIX//-/} # Removing all '-' symbols
+          NAME_SUFFIX_WITHOUT_DASHES=${NAME_SUFFIX_WITHOUT_DASHES//./} # Removing all '.' symbols
           if [ ${#NAME_SUFFIX_WITHOUT_DASHES} -gt 12 ]; then
             TRUNCATED_NAME_SUFFIX=${NAME_SUFFIX_WITHOUT_DASHES:0:12}
@@ -213,16 +214,21 @@
             fi
           fi

+      - name: Get testing workload
+        id: workload
+        run: |
+          WORKLOAD_NAME=${{ matrix.model.workload || matrix.model.name }}
+          echo "WORKLOAD_NAME=$WORKLOAD_NAME" >> $GITHUB_OUTPUT
+          echo "WORKLOAD_FILE_PREFIX=presets/workspace/test/manifests/$WORKLOAD_NAME/$WORKLOAD_NAME" >> $GITHUB_OUTPUT
+
       - name: Create Service
-        run: kubectl apply -f presets/workspace/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}-service.yaml
+        run: |
+          kubectl apply -f ${{steps.workload.outputs.WORKLOAD_FILE_PREFIX}}-service.yaml

       - name: Retrieve External Service IP
         id: get_ip
         run: |
-          while [[ -z $SERVICE_IP ]]; do
-            SERVICE_IP=$(kubectl get svc ${{ matrix.model.name }} -o=jsonpath='{.status.loadBalancer.ingress[0].ip}')
-            sleep 5
-          done
+          SERVICE_IP=$(kubectl get svc ${{steps.workload.outputs.WORKLOAD_NAME}} -o=jsonpath='{.spec.clusterIP}')
           echo "Service IP is $SERVICE_IP"
           echo "SERVICE_IP=$SERVICE_IP" >> $GITHUB_OUTPUT
@@ -235,36 +241,38 @@
       - name: Replace IP and Deploy Resource to K8s
         run: |
           POSTFIX=$(echo "${{ matrix.model.name }}" | grep -q "llama" && echo "" || echo "_${{ env.RUNTIME }}")
-          sed -i "s/MASTER_ADDR_HERE/${{ steps.get_ip.outputs.SERVICE_IP }}/g" presets/workspace/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}$POSTFIX.yaml
-          sed -i "s/TAG_HERE/${{ matrix.model.tag }}/g" presets/workspace/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}$POSTFIX.yaml
-          sed -i "s/REPO_HERE/${{ secrets.ACR_AMRT_USERNAME }}/g" presets/workspace/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}$POSTFIX.yaml
-          kubectl apply -f presets/workspace/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}$POSTFIX.yaml
+          WORKLOAD_FILE=${{steps.workload.outputs.WORKLOAD_FILE_PREFIX}}$POSTFIX.yaml
+
+          sed -i "s/MASTER_ADDR_HERE/${{ steps.get_ip.outputs.SERVICE_IP }}/g" $WORKLOAD_FILE
+          sed -i "s/TAG_HERE/${{ matrix.model.tag }}/g" $WORKLOAD_FILE
+          sed -i "s/REPO_HERE/${{ secrets.ACR_AMRT_USERNAME }}/g" $WORKLOAD_FILE
+          kubectl apply -f $WORKLOAD_FILE

       - name: Wait for Resource to be ready
         run: |
-          kubectl rollout status ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} --timeout=1800s
+          kubectl rollout status ${{steps.resource.outputs.RESOURCE_TYPE}}/${{steps.workload.outputs.WORKLOAD_NAME}} --timeout=1800s

       - name: Check Adapter Loading from Logs
         if: matrix.model.loads_adapter == true
         run: |
-          POD_NAME=$(kubectl get pods -l app=${{ matrix.model.name }} -o jsonpath="{.items[0].metadata.name}")
+          POD_NAME=$(kubectl get pods -l app=${{steps.workload.outputs.WORKLOAD_NAME}} -o jsonpath="{.items[0].metadata.name}")
           kubectl logs $POD_NAME | grep "Adapter added:" | grep "${{ matrix.model.expected_adapter }}" || (echo "Adapter not loaded or incorrect adapter loaded" && exit 1)

       - name: Install testing commands
         run: |
-          kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- apt-get update
-          kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- apt-get install -y curl
+          kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{steps.workload.outputs.WORKLOAD_NAME}} -- apt-get update
+          kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{steps.workload.outputs.WORKLOAD_NAME}} -- apt-get install -y curl

       - name: Test healthz endpoint
         run: |
-          kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- \
+          kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{steps.workload.outputs.WORKLOAD_NAME}} -- \
           curl -s http://localhost:5000/health

       - name: Test inference endpoint
         run: |
           echo "Testing inference for ${{ matrix.model.name }}"
           if [[ "${{ matrix.model.name }}" == *"llama"* && "${{ matrix.model.name }}" == *"-chat"* ]]; then
-            kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- \
+            kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{steps.workload.outputs.WORKLOAD_NAME}} -- \
             curl -s -X POST \
             -H "Content-Type: application/json" \
             -d '{
@@ -285,7 +293,7 @@
             }' \
             http://localhost:5000/chat
           elif [[ "${{ matrix.model.name }}" == *"llama"* ]]; then
-            kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- \
+            kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{steps.workload.outputs.WORKLOAD_NAME}} -- \
             curl -s -X POST \
             -H "Content-Type: application/json" \
             -d '{
@@ -301,7 +309,7 @@
             }' \
             http://localhost:5000/generate
           elif [[ "${{ env.RUNTIME }}" == *"vllm"* ]]; then
-            kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- \
+            kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{steps.workload.outputs.WORKLOAD_NAME}} -- \
             curl -s -X POST \
             -H "accept: application/json" \
             -H "Content-Type: application/json" \
             -d '{
@@ -320,7 +328,7 @@
             }' \
             http://localhost:5000/v1/chat/completions
           else
-            kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- \
+            kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{steps.workload.outputs.WORKLOAD_NAME}} -- \
             curl -s -X POST \
             -H "accept: application/json" \
             -H "Content-Type: application/json" \
             -d '{
@@ -367,15 +375,15 @@
             RESOURCE_TYPE=${{ steps.resource.outputs.RESOURCE_TYPE }}

             # Check and Delete K8s Resource (Deployment or StatefulSet)
-            if kubectl get $RESOURCE_TYPE ${{ matrix.model.name }} > /dev/null 2>&1; then
-              kubectl logs $RESOURCE_TYPE/${{ matrix.model.name }}
-              kubectl delete $RESOURCE_TYPE ${{ matrix.model.name }}
+            if kubectl get $RESOURCE_TYPE ${{steps.workload.outputs.WORKLOAD_NAME}} > /dev/null 2>&1; then
+              kubectl logs $RESOURCE_TYPE/${{steps.workload.outputs.WORKLOAD_NAME}}
+              kubectl delete $RESOURCE_TYPE ${{steps.workload.outputs.WORKLOAD_NAME}}
             fi
           fi

           # Check and Delete K8s Service if it exists
-          if kubectl get svc ${{ matrix.model.name }} > /dev/null 2>&1; then
-            kubectl delete svc ${{ matrix.model.name }}
+          if kubectl get svc ${{steps.workload.outputs.WORKLOAD_NAME}} > /dev/null 2>&1; then
+            kubectl delete svc ${{steps.workload.outputs.WORKLOAD_NAME}}
           fi

           # Check and Delete AKS Nodepool if it exists
diff --git a/presets/workspace/models/supported_models.yaml b/presets/workspace/models/supported_models.yaml
index 99a61a10e..eaa7dbb10 100644
--- a/presets/workspace/models/supported_models.yaml
+++ b/presets/workspace/models/supported_models.yaml
@@ -35,13 +35,14 @@ models:
     type: text-generation
    version: https://huggingface.co/tiiuae/falcon-7b/commit/898df1396f35e447d5fe44e0a3ccaaaa69f30d36
     runtime: tfs
-    tag: 0.0.7
+    tag: 0.0.8
   - name: falcon-7b-instruct
     type: text-generation
     version: https://huggingface.co/tiiuae/falcon-7b-instruct/commit/cf4b3c42ce2fdfe24f753f0f0d179202fea59c99
     runtime: tfs
-    tag: 0.0.7
+    tag: 0.0.8
     # Tag history:
+    # 0.0.8 - Support adapter and config file for VLLM runtime
     # 0.0.7 - Support VLLM runtime
     # 0.0.6 - Add Logging & Metrics Server
     # 0.0.5 - Tuning and Adapters
@@ -53,13 +54,14 @@ models:
     type: text-generation
     version: https://huggingface.co/tiiuae/falcon-40b/commit/4a70170c215b36a3cce4b4253f6d0612bb7d4146
     runtime: tfs
-    tag: 0.0.8
+    tag: 0.0.9
   - name: falcon-40b-instruct
     type: text-generation
     version: https://huggingface.co/tiiuae/falcon-40b-instruct/commit/ecb78d97ac356d098e79f0db222c9ce7c5d9ee5f
     runtime: tfs
-    tag: 0.0.8
+    tag: 0.0.9
     # Tag history for 40b models:
+    # 0.0.9 - Support adapter and config file for VLLM runtime
     # 0.0.8 - Support VLLM runtime
     # 0.0.7 - Add Logging & Metrics Server
     # 0.0.6 - Tuning and Adapters
@@ -74,13 +76,14 @@ models:
     type: text-generation
     version: https://huggingface.co/mistralai/Mistral-7B-v0.3/commit/d8cadc02ac76bd617a919d50b092e59d2d110aff
     runtime: tfs
-    tag: 0.0.8
+    tag: 0.0.9
   - name: mistral-7b-instruct
     type: text-generation
     version: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3/commit/e0bc86c23ce5aae1db576c8cca6f06f1f73af2db
     runtime: tfs
-    tag: 0.0.8
+    tag: 0.0.9
     # Tag history:
+    # 0.0.9 - Support adapter and config file for VLLM runtime
     # 0.0.8 - Support VLLM runtime
     # 0.0.7 - Add Logging & Metrics Server
     # 0.0.6 - Update model version and Address missing weights files fix
@@ -95,8 +98,9 @@ models:
     type: text-generation
     version: https://huggingface.co/microsoft/phi-2/commit/ef382358ec9e382308935a992d908de099b64c23
     runtime: tfs
-    tag: 0.0.6
+    tag: 0.0.7
     # Tag history:
+    # 0.0.7 - Support adapter and config file for VLLM runtime
     # 0.0.6 - Support VLLM runtime
     # 0.0.5 - Add Logging & Metrics Server
     # 0.0.4 - Tuning and Adapters
@@ -109,38 +113,24 @@ models:
     type: text-generation
     version: https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/commit/0a67737cc96d2554230f90338b163bc6380a2a85
     runtime: tfs
-    tag: 0.0.3
-    # Tag history:
-    # 0.0.3 - Support VLLM runtime
-    # 0.0.2 - Add Logging & Metrics Server
-    # 0.0.1 - Initial Release
-
+    tag: 0.0.4
   - name: phi-3-mini-128k-instruct
     type: text-generation
     version: https://huggingface.co/microsoft/Phi-3-mini-128k-instruct/commit/a90b62ae09941edff87a90ced39ba5807e6b2ade
     runtime: tfs
-    tag: 0.0.3
-    # Tag history:
-    # 0.0.3 - Support VLLM runtime
-    # 0.0.2 - Add Logging & Metrics Server
-    # 0.0.1 - Initial Release
-
+    tag: 0.0.4
   - name: phi-3-medium-4k-instruct
     type: text-generation
     version: https://huggingface.co/microsoft/Phi-3-medium-4k-instruct/commit/ae004ae82eb6eddc32906dfacb1d6dfea8f91996
     runtime: tfs
-    tag: 0.0.3
-    # Tag history:
-    # 0.0.3 - Support VLLM runtime
-    # 0.0.2 - Add Logging & Metrics Server
-    # 0.0.1 - Initial Release
-
+    tag: 0.0.4
   - name: phi-3-medium-128k-instruct
     type: text-generation
     version: https://huggingface.co/microsoft/Phi-3-medium-128k-instruct/commit/fa7d2aa4f5ea69b2e36b20d050cdae79c9bfbb3f
     runtime: tfs
-    tag: 0.0.3
+    tag: 0.0.4
     # Tag history:
+    # 0.0.4 - Support adapter and config file for VLLM runtime
     # 0.0.3 - Support VLLM runtime
     # 0.0.2 - Add Logging & Metrics Server
     # 0.0.1 - Initial Release
@@ -149,7 +139,15 @@ models:
     type: text-generation
     version: https://huggingface.co/microsoft/Phi-3.5-mini-instruct/commit/af0dfb8029e8a74545d0736d30cb6b58d2f0f3f0
     runtime: tfs
-    tag: 0.0.1
+    tag: 0.0.2
     # Tag history:
+    # 0.0.2 - Support adapter and config file for VLLM runtime
     # 0.0.1 - New Model! Support VLLM Runtime
-    
\ No newline at end of file
+
+  - name: qwen2.5-coder-7b-instruct
+    type: text-generation
+    version: https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct/commit/0eb6b1ed2d0c4306bc637d09ecef51e59d3dfe05
+    runtime: tfs
+    tag: 0.0.1
+    # Tag history:
+    # 0.0.1 - New Model!
\ No newline at end of file
diff --git a/presets/workspace/test/manifests/falcon-40b-instruct/falcon-40b-instruct-service.yaml b/presets/workspace/test/manifests/falcon-40b-instruct/falcon-40b-instruct-service.yaml
index 27f21ec46..fc357931a 100644
--- a/presets/workspace/test/manifests/falcon-40b-instruct/falcon-40b-instruct-service.yaml
+++ b/presets/workspace/test/manifests/falcon-40b-instruct/falcon-40b-instruct-service.yaml
@@ -9,5 +9,5 @@ spec:
     - protocol: TCP
       port: 80
       targetPort: 5000
-  type: LoadBalancer
+  type: ClusterIP
   publishNotReadyAddresses: true
diff --git a/presets/workspace/test/manifests/falcon-40b/falcon-40b-service.yaml b/presets/workspace/test/manifests/falcon-40b/falcon-40b-service.yaml
index 689361052..80ab4b539 100644
--- a/presets/workspace/test/manifests/falcon-40b/falcon-40b-service.yaml
+++ b/presets/workspace/test/manifests/falcon-40b/falcon-40b-service.yaml
@@ -9,5 +9,5 @@ spec:
     - protocol: TCP
       port: 80
       targetPort: 5000
-  type: LoadBalancer
+  type: ClusterIP
   publishNotReadyAddresses: true
diff --git a/presets/workspace/test/manifests/falcon-7b-instruct/falcon-7b-instruct-service.yaml b/presets/workspace/test/manifests/falcon-7b-instruct/falcon-7b-instruct-service.yaml
index 6acbe2405..2f27d46cb 100644
--- a/presets/workspace/test/manifests/falcon-7b-instruct/falcon-7b-instruct-service.yaml
+++ b/presets/workspace/test/manifests/falcon-7b-instruct/falcon-7b-instruct-service.yaml
@@ -9,5 +9,5 @@ spec:
     - protocol: TCP
       port: 80
       targetPort: 5000
-  type: LoadBalancer
+  type: ClusterIP
   publishNotReadyAddresses: true
diff --git a/presets/workspace/test/manifests/falcon-7b-with-adapter/falcon-7b.yaml b/presets/workspace/test/manifests/falcon-7b-with-adapter/falcon-7b.yaml
deleted file mode 100644
index 349a377a0..000000000
--- a/presets/workspace/test/manifests/falcon-7b-with-adapter/falcon-7b.yaml
+++ /dev/null
@@ -1,51 +0,0 @@
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: falcon-7b
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: falcon
-  template:
-    metadata:
-      labels:
-        app: falcon
-    spec:
-      volumes:
-        - name: adapter-volume
-          emptyDir: {}
-      initContainers:
-        - name: falcon-7b-adapter
-          image:
-          imagePullPolicy: Always
-          command: ["/bin/sh", "-c", "mkdir -p /mnt/adapter/falcon-7b-adapter && cp -r /data/* /mnt/adapter/falcon-7b-adapter"]
-          volumeMounts:
-            - name: adapter-volume
-              mountPath: /mnt/adapter
-      containers:
-        - name: falcon-container
-          image:
-          command:
-            - /bin/sh
-            - -c
-            - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16
-          resources:
-            requests:
-              nvidia.com/gpu: 2
-            limits:
-              nvidia.com/gpu: 2 # Requesting 2 GPUs
-          volumeMounts:
-            - name: adapter-volume
-              mountPath: /mnt/adapter
-          env:
-            - name: falcon-7b-adapter
-              value: "0.2"
-      tolerations:
-        - effect: NoSchedule
-          value: gpu
-          key: sku
-          operator: Equal
-        - effect: NoSchedule
-          key: nvidia.com/gpu
-          operator: Exists
diff --git a/presets/workspace/test/manifests/falcon-7b/falcon-7b-service.yaml b/presets/workspace/test/manifests/falcon-7b/falcon-7b-service.yaml
index acf56ba74..595e83942 100644
--- a/presets/workspace/test/manifests/falcon-7b/falcon-7b-service.yaml
+++ b/presets/workspace/test/manifests/falcon-7b/falcon-7b-service.yaml
@@ -9,5 +9,5 @@ spec:
     - protocol: TCP
       port: 80
       targetPort: 5000
-  type: LoadBalancer
+  type: ClusterIP
   publishNotReadyAddresses: true
diff --git a/presets/workspace/test/manifests/llama-2-13b-chat/llama-2-13b-chat-service.yaml b/presets/workspace/test/manifests/llama-2-13b-chat/llama-2-13b-chat-service.yaml
index 79efb227b..58720a91d 100644
--- a/presets/workspace/test/manifests/llama-2-13b-chat/llama-2-13b-chat-service.yaml
+++ b/presets/workspace/test/manifests/llama-2-13b-chat/llama-2-13b-chat-service.yaml
@@ -15,5 +15,5 @@ spec:
       protocol: TCP
       port: 29500
       targetPort: 29500
-  type: LoadBalancer
+  type: ClusterIP
   publishNotReadyAddresses: true
diff --git a/presets/workspace/test/manifests/llama-2-13b/llama-2-13b-service.yaml b/presets/workspace/test/manifests/llama-2-13b/llama-2-13b-service.yaml
index c3cd3bdb6..f43826a48 100644
--- a/presets/workspace/test/manifests/llama-2-13b/llama-2-13b-service.yaml
+++ b/presets/workspace/test/manifests/llama-2-13b/llama-2-13b-service.yaml
@@ -15,5 +15,5 @@ spec:
       protocol: TCP
       port: 29500
       targetPort: 29500
-  type: LoadBalancer
+  type: ClusterIP
   publishNotReadyAddresses: true
diff --git a/presets/workspace/test/manifests/llama-2-7b-chat/llama-2-7b-chat-service.yaml b/presets/workspace/test/manifests/llama-2-7b-chat/llama-2-7b-chat-service.yaml
index d661db59b..99fc7895d 100644
--- a/presets/workspace/test/manifests/llama-2-7b-chat/llama-2-7b-chat-service.yaml
+++ b/presets/workspace/test/manifests/llama-2-7b-chat/llama-2-7b-chat-service.yaml
@@ -10,5 +10,5 @@ spec:
     - protocol: TCP
       port: 80
       targetPort: 5000
-  type: LoadBalancer
+  type: ClusterIP
   publishNotReadyAddresses: true
diff --git a/presets/workspace/test/manifests/llama-2-7b/llama-2-7b-service.yaml b/presets/workspace/test/manifests/llama-2-7b/llama-2-7b-service.yaml
index 30967c332..d8dfb84c7 100644
--- a/presets/workspace/test/manifests/llama-2-7b/llama-2-7b-service.yaml
+++ b/presets/workspace/test/manifests/llama-2-7b/llama-2-7b-service.yaml
@@ -10,5 +10,5 @@ spec:
     - protocol: TCP
       port: 80
       targetPort: 5000
-  type: LoadBalancer
+  type: ClusterIP
   publishNotReadyAddresses: true
diff --git a/presets/workspace/test/manifests/llama-headless.yaml b/presets/workspace/test/manifests/llama-headless.yaml
deleted file mode 100644
index e0514564f..000000000
--- a/presets/workspace/test/manifests/llama-headless.yaml
+++ /dev/null
@@ -1,14 +0,0 @@
-apiVersion: v1
-kind: Service
-metadata:
-  name: llama-headless
-spec:
-  selector:
-    app: llama
-  clusterIP: None
-  ports:
-    - name: torchrun
-      protocol: TCP
-      port: 29500
-      targetPort: 29500
-  publishNotReadyAddresses: true
\ No newline at end of file
diff --git a/presets/workspace/test/manifests/mistral-7b-instruct/mistral-7b-instruct-service.yaml b/presets/workspace/test/manifests/mistral-7b-instruct/mistral-7b-instruct-service.yaml
index 31b9206bc..94627746d 100644
--- a/presets/workspace/test/manifests/mistral-7b-instruct/mistral-7b-instruct-service.yaml
+++ b/presets/workspace/test/manifests/mistral-7b-instruct/mistral-7b-instruct-service.yaml
@@ -9,5 +9,5 @@ spec:
     - protocol: TCP
       port: 80
       targetPort: 5000
-  type: LoadBalancer
+  type: ClusterIP
   publishNotReadyAddresses: true
diff --git a/presets/workspace/test/manifests/mistral-7b/mistral-7b-service.yaml b/presets/workspace/test/manifests/mistral-7b/mistral-7b-service.yaml
index 650422c7c..90ba3ec8f 100644
--- a/presets/workspace/test/manifests/mistral-7b/mistral-7b-service.yaml
+++ b/presets/workspace/test/manifests/mistral-7b/mistral-7b-service.yaml
@@ -9,5 +9,5 @@ spec:
     - protocol: TCP
       port: 80
       targetPort: 5000
-  type: LoadBalancer
+  type: ClusterIP
   publishNotReadyAddresses: true
diff --git a/presets/workspace/test/manifests/phi-2/phi-2-service.yaml b/presets/workspace/test/manifests/phi-2/phi-2-service.yaml
index b81036bd8..d0f99f9ad 100644
--- a/presets/workspace/test/manifests/phi-2/phi-2-service.yaml
+++ b/presets/workspace/test/manifests/phi-2/phi-2-service.yaml
@@ -9,5 +9,5 @@ spec:
     - protocol: TCP
       port: 80
       targetPort: 5000
-  type: LoadBalancer
+  type: ClusterIP
   publishNotReadyAddresses: true
diff --git a/presets/workspace/test/manifests/phi-3-medium-128k-instruct/phi-3-medium-128k-instruct-service.yaml b/presets/workspace/test/manifests/phi-3-medium-128k-instruct/phi-3-medium-128k-instruct-service.yaml
index 8162f342c..bab354ee9 100644
--- a/presets/workspace/test/manifests/phi-3-medium-128k-instruct/phi-3-medium-128k-instruct-service.yaml
+++ b/presets/workspace/test/manifests/phi-3-medium-128k-instruct/phi-3-medium-128k-instruct-service.yaml
@@ -9,5 +9,5 @@ spec:
     - protocol: TCP
       port: 80
       targetPort: 5000
-  type: LoadBalancer
+  type: ClusterIP
   publishNotReadyAddresses: true
diff --git a/presets/workspace/test/manifests/phi-3-medium-4k-instruct/phi-3-medium-4k-instruct-service.yaml b/presets/workspace/test/manifests/phi-3-medium-4k-instruct/phi-3-medium-4k-instruct-service.yaml
index b0fd7047c..60710504f 100644
--- a/presets/workspace/test/manifests/phi-3-medium-4k-instruct/phi-3-medium-4k-instruct-service.yaml
+++ b/presets/workspace/test/manifests/phi-3-medium-4k-instruct/phi-3-medium-4k-instruct-service.yaml
@@ -9,5 +9,5 @@ spec:
     - protocol: TCP
       port: 80
       targetPort: 5000
-  type: LoadBalancer
+  type: ClusterIP
   publishNotReadyAddresses: true
diff --git a/presets/workspace/test/manifests/phi-3-mini-128k-instruct/phi-3-mini-128k-instruct-service.yaml b/presets/workspace/test/manifests/phi-3-mini-128k-instruct/phi-3-mini-128k-instruct-service.yaml
index c4c613237..ef86aefb2 100644
--- a/presets/workspace/test/manifests/phi-3-mini-128k-instruct/phi-3-mini-128k-instruct-service.yaml
+++ b/presets/workspace/test/manifests/phi-3-mini-128k-instruct/phi-3-mini-128k-instruct-service.yaml
@@ -9,5 +9,5 @@ spec:
     - protocol: TCP
       port: 80
       targetPort: 5000
-  type: LoadBalancer
+  type: ClusterIP
   publishNotReadyAddresses: true
diff --git a/presets/workspace/test/manifests/phi-3-mini-4k-instruct/phi-3-mini-4k-instruct-service.yaml b/presets/workspace/test/manifests/phi-3-mini-4k-instruct/phi-3-mini-4k-instruct-service.yaml
index 3e7426ae8..0063f24aa 100644
--- a/presets/workspace/test/manifests/phi-3-mini-4k-instruct/phi-3-mini-4k-instruct-service.yaml
+++ b/presets/workspace/test/manifests/phi-3-mini-4k-instruct/phi-3-mini-4k-instruct-service.yaml
@@ -9,5 +9,5 @@ spec:
     - protocol: TCP
       port: 80
       targetPort: 5000
-  type: LoadBalancer
+  type: ClusterIP
   publishNotReadyAddresses: true
diff --git a/presets/workspace/test/manifests/phi-3-small-128k-instruct/phi-3-small-128k-instruct-service.yaml b/presets/workspace/test/manifests/phi-3-small-128k-instruct/phi-3-small-128k-instruct-service.yaml
index 86deb6985..a28bac071 100644
--- a/presets/workspace/test/manifests/phi-3-small-128k-instruct/phi-3-small-128k-instruct-service.yaml
+++ b/presets/workspace/test/manifests/phi-3-small-128k-instruct/phi-3-small-128k-instruct-service.yaml
@@ -9,5 +9,5 @@ spec:
     - protocol: TCP
       port: 80
       targetPort: 5000
-  type: LoadBalancer
+  type: ClusterIP
   publishNotReadyAddresses: true
diff --git a/presets/workspace/test/manifests/phi-3-small-8k-instruct/phi-3-small-8k-instruct-service.yaml b/presets/workspace/test/manifests/phi-3-small-8k-instruct/phi-3-small-8k-instruct-service.yaml
index 648de4337..17e031f87 100644
--- a/presets/workspace/test/manifests/phi-3-small-8k-instruct/phi-3-small-8k-instruct-service.yaml
+++ b/presets/workspace/test/manifests/phi-3-small-8k-instruct/phi-3-small-8k-instruct-service.yaml
@@ -9,5 +9,5 @@ spec:
     - protocol: TCP
       port: 80
       targetPort: 5000
-  type: LoadBalancer
+  type: ClusterIP
   publishNotReadyAddresses: true
diff --git a/presets/workspace/test/manifests/qwen2-5-coder-7b-instruct/qwen2-5-coder-7b-instruct-service.yaml b/presets/workspace/test/manifests/qwen2-5-coder-7b-instruct/qwen2-5-coder-7b-instruct-service.yaml
new file mode 100644
index 000000000..73637c99a
--- /dev/null
+++ b/presets/workspace/test/manifests/qwen2-5-coder-7b-instruct/qwen2-5-coder-7b-instruct-service.yaml
@@ -0,0 +1,13 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: qwen2-5-coder-7b-instruct
+spec:
+  selector:
+    app: qwen2-5-coder-7b-instruct
+  ports:
+    - protocol: TCP
+      port: 80
+      targetPort: 5000
+  type: ClusterIP
+  publishNotReadyAddresses: true
diff --git a/presets/workspace/test/manifests/qwen2-5-coder-7b-instruct/qwen2-5-coder-7b-instruct_hf.yaml b/presets/workspace/test/manifests/qwen2-5-coder-7b-instruct/qwen2-5-coder-7b-instruct_hf.yaml
new file mode 100644
index 000000000..e92d906d7
--- /dev/null
+++ b/presets/workspace/test/manifests/qwen2-5-coder-7b-instruct/qwen2-5-coder-7b-instruct_hf.yaml
@@ -0,0 +1,55 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: qwen2-5-coder-7b-instruct
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: qwen2-5-coder-7b-instruct
+  template:
+    metadata:
+      labels:
+        app: qwen2-5-coder-7b-instruct
+    spec:
+      containers:
+        - name: qwen2-5-coder-7b-instruct-container
+          image: REPO_HERE.azurecr.io/qwen2.5-coder-7b-instruct:TAG_HERE
+          command:
+            - /bin/sh
+            - -c
+            - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype auto --trust_remote_code
+          resources:
+            requests:
+              nvidia.com/gpu: 2
+            limits:
+              nvidia.com/gpu: 2
+          livenessProbe:
+            httpGet:
+              path: /health
+              port: 5000
+            initialDelaySeconds: 600 # 10 Min
+            periodSeconds: 10
+          readinessProbe:
+            httpGet:
+              path: /health
+              port: 5000
+            initialDelaySeconds: 30
+            periodSeconds: 10
+          volumeMounts:
+            - name: dshm
+              mountPath: /dev/shm
+      volumes:
+        - name: dshm
+          emptyDir:
+            medium: Memory
+      tolerations:
+        - effect: NoSchedule
+          key: sku
+          operator: Equal
+          value: gpu
+        - effect: NoSchedule
+          key: nvidia.com/gpu
+          operator: Exists
+      nodeSelector:
+        pool: qwen25coder7
\ No newline at end of file
diff --git a/presets/workspace/test/manifests/qwen2-5-coder-7b-instruct/qwen2-5-coder-7b-instruct_vllm.yaml b/presets/workspace/test/manifests/qwen2-5-coder-7b-instruct/qwen2-5-coder-7b-instruct_vllm.yaml
new file mode 100644
index 000000000..4c1e72510
--- /dev/null
+++ b/presets/workspace/test/manifests/qwen2-5-coder-7b-instruct/qwen2-5-coder-7b-instruct_vllm.yaml
@@ -0,0 +1,83 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: qwen2-5-coder-7b-instruct
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: qwen2-5-coder-7b-instruct
+  template:
+    metadata:
+      labels:
+        app: qwen2-5-coder-7b-instruct
+    spec:
+      containers:
+        - name: qwen2-5-coder-7b-instruct-container
+          image: REPO_HERE.azurecr.io/qwen2.5-coder-7b-instruct:TAG_HERE
+          command:
+            - /bin/sh
+            - -c
+            - python3 /workspace/vllm/inference_api.py --kaito-config-file /mnt/config/inference_config.yaml
+          resources:
+            requests:
+              nvidia.com/gpu: 2
+            limits:
+              nvidia.com/gpu: 2
+          livenessProbe:
+            httpGet:
+              path: /health
+              port: 5000
+            initialDelaySeconds: 600 # 10 Min
+            periodSeconds: 10
+          readinessProbe:
+            httpGet:
+              path: /health
+              port: 5000
+ initialDelaySeconds: 30 + periodSeconds: 10 + volumeMounts: + - name: dshm + mountPath: /dev/shm + - mountPath: /mnt/config + name: config-volume + volumes: + - name: dshm + emptyDir: + medium: Memory + - configMap: + defaultMode: 420 + name: qwen2-5-coder-7b-inference-params + name: config-volume + tolerations: + - effect: NoSchedule + key: sku + operator: Equal + value: gpu + - effect: NoSchedule + key: nvidia.com/gpu + operator: Exists + nodeSelector: + pool: qwen25coder7 +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: qwen2-5-coder-7b-inference-params +data: + inference_config.yaml: | + # Maximum number of steps to find the max available seq len fitting in the GPU memory. + max_probe_steps: 6 + + vllm: + cpu-offload-gb: 0 + gpu-memory-utilization: 0.95 + swap-space: 4 + served-model-name: test + dtype: float16 + tensor-parallel-size: 2 + + # max-seq-len-to-capture: 8192 + # num-scheduler-steps: 1 + # enable-chunked-prefill: false + # see https://docs.vllm.ai/en/stable/models/engine_args.html for more options.
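
--
Note on testing against ClusterIP services: since the services above no longer
get an external IP, endpoints are exercised from inside the pod, mirroring the
workflow's kubectl exec steps. A minimal manual smoke test for the new qwen
preset might look like the sketch below (illustrative only; it assumes the
manifests above are applied, the rollout is complete, and curl is installed in
the container, as the "Install testing commands" step does; the "model": "test"
field is an assumption matching the served-model-name in the ConfigMap above):

    WORKLOAD_NAME=qwen2-5-coder-7b-instruct

    # Health check: the inference server listens on port 5000 inside the pod.
    kubectl exec deployment/$WORKLOAD_NAME -- \
      curl -s http://localhost:5000/health

    # vLLM runtime: OpenAI-compatible chat completion; "test" matches the
    # served-model-name set in qwen2-5-coder-7b-inference-params.
    kubectl exec deployment/$WORKLOAD_NAME -- \
      curl -s -X POST \
      -H "Content-Type: application/json" \
      -d '{"model": "test", "messages": [{"role": "user", "content": "Write a hello world in Python"}], "max_tokens": 50}' \
      http://localhost:5000/v1/chat/completions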