feat: add qwen preset test (#788)

**Reason for Change**:
- Remove outdated manifests.
- Replace LoadBalancer with ClusterIP; the service IP is no longer accessed directly.

---------

Signed-off-by: jerryzhuang <[email protected]>
Co-authored-by: Fei Guo <[email protected]>
kaito-project · Dec 20, 2024 · be3620b · be3620b
1 parent 2c1d5bf
commit be3620b
Show file tree

Hide file tree

Showing 25 changed files with 234 additions and 133 deletions.
diff --git a/.github/e2e-preset-configs.json b/.github/e2e-preset-configs.json
@@ -98,6 +98,15 @@
         "OSS": true,
         "loads_adapter": false
       },
+      {
+        "name": "qwen2.5-coder-7b-instruct",
+        "workload": "qwen2-5-coder-7b-instruct",
+        "node-count": 1,
+        "node-vm-size": "Standard_NC12s_v3",
+        "node-osdisk-size": 100,
+        "OSS": true,
+        "loads_adapter": false
+      },
       {
         "name": "llama-2-7b",
         "node-count": 1,

diff --git a/.github/workflows/e2e-preset-test.yml b/.github/workflows/e2e-preset-test.yml
@@ -170,6 +170,7 @@ jobs:
         run: |
             NAME_SUFFIX=${{ matrix.model.name }}
             NAME_SUFFIX_WITHOUT_DASHES=${NAME_SUFFIX//-/}  # Removing all '-' symbols
+            NAME_SUFFIX_WITHOUT_DASHES=${NAME_SUFFIX_WITHOUT_DASHES//./} # Removing all '.' symbols
 
             if [ ${#NAME_SUFFIX_WITHOUT_DASHES} -gt 12 ]; then
                 TRUNCATED_NAME_SUFFIX=${NAME_SUFFIX_WITHOUT_DASHES:0:12}
@@ -213,16 +214,21 @@ jobs:
                 fi
             fi
 
+      - name: Get testing workload
+        id: workload
+        run: |
+            WORKLOAD_NAME=${{ matrix.model.workload || matrix.model.name }}
+            echo "WORKLOAD_NAME=$WORKLOAD_NAME" >> $GITHUB_OUTPUT
+            echo "WORKLOAD_FILE_PREFIX=presets/workspace/test/manifests/$WORKLOAD_NAME/$WORKLOAD_NAME" >> $GITHUB_OUTPUT
+
       - name: Create Service
-        run: kubectl apply -f presets/workspace/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}-service.yaml
+        run: |
+            kubectl apply -f ${{steps.workload.outputs.WORKLOAD_FILE_PREFIX}}-service.yaml
       
       - name: Retrieve External Service IP
         id: get_ip
         run: |
-            while [[ -z $SERVICE_IP ]]; do 
-                SERVICE_IP=$(kubectl get svc ${{ matrix.model.name }} -o=jsonpath='{.status.loadBalancer.ingress[0].ip}')
-                sleep 5
-            done 
+            SERVICE_IP=$(kubectl get svc ${{steps.workload.outputs.WORKLOAD_NAME}} -o=jsonpath='{.spec.clusterIP}')
             echo "Service IP is $SERVICE_IP"
             echo "SERVICE_IP=$SERVICE_IP" >> $GITHUB_OUTPUT
 
@@ -235,36 +241,38 @@ jobs:
       - name: Replace IP and Deploy Resource to K8s
         run: |
             POSTFIX=$(echo "${{ matrix.model.name }}" | grep -q "llama" && echo "" || echo "_${{ env.RUNTIME }}")
-            sed -i "s/MASTER_ADDR_HERE/${{ steps.get_ip.outputs.SERVICE_IP }}/g" presets/workspace/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}$POSTFIX.yaml
-            sed -i "s/TAG_HERE/${{ matrix.model.tag }}/g" presets/workspace/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}$POSTFIX.yaml
-            sed -i "s/REPO_HERE/${{ secrets.ACR_AMRT_USERNAME }}/g" presets/workspace/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}$POSTFIX.yaml
-            kubectl apply -f presets/workspace/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}$POSTFIX.yaml
+            WORKLOAD_FILE=${{steps.workload.outputs.WORKLOAD_FILE_PREFIX}}$POSTFIX.yaml
+
+            sed -i "s/MASTER_ADDR_HERE/${{ steps.get_ip.outputs.SERVICE_IP }}/g" $WORKLOAD_FILE
+            sed -i "s/TAG_HERE/${{ matrix.model.tag }}/g" $WORKLOAD_FILE
+            sed -i "s/REPO_HERE/${{ secrets.ACR_AMRT_USERNAME }}/g" $WORKLOAD_FILE
+            kubectl apply -f $WORKLOAD_FILE
 
       - name: Wait for Resource to be ready
         run: |
-            kubectl rollout status ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} --timeout=1800s
+            kubectl rollout status ${{steps.resource.outputs.RESOURCE_TYPE}}/${{steps.workload.outputs.WORKLOAD_NAME}} --timeout=1800s
     
       - name: Check Adapter Loading from Logs
         if: matrix.model.loads_adapter == true
         run: |
-            POD_NAME=$(kubectl get pods -l app=${{ matrix.model.name }} -o jsonpath="{.items[0].metadata.name}")
+            POD_NAME=$(kubectl get pods -l app=${{steps.workload.outputs.WORKLOAD_NAME}} -o jsonpath="{.items[0].metadata.name}")
             kubectl logs $POD_NAME | grep "Adapter added:" | grep "${{ matrix.model.expected_adapter }}" || (echo "Adapter not loaded or incorrect adapter loaded" && exit 1)
 
       - name: Install testing commands
         run: |
-            kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- apt-get update
-            kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- apt-get install -y curl
+            kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{steps.workload.outputs.WORKLOAD_NAME}} -- apt-get update
+            kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{steps.workload.outputs.WORKLOAD_NAME}} -- apt-get install -y curl
 
       - name: Test healthz endpoint
         run: |
-            kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- \
+            kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{steps.workload.outputs.WORKLOAD_NAME}} -- \
             curl -s http://localhost:5000/health
 
       - name: Test inference endpoint
         run: |
             echo "Testing inference for ${{ matrix.model.name }}"
             if [[ "${{ matrix.model.name }}" == *"llama"* && "${{ matrix.model.name }}" == *"-chat"* ]]; then
-                kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- \
+                kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{steps.workload.outputs.WORKLOAD_NAME}} -- \
                 curl -s -X POST \
                 -H "Content-Type: application/json" \
                 -d '{
@@ -285,7 +293,7 @@ jobs:
                 }' \
                 http://localhost:5000/chat
             elif [[ "${{ matrix.model.name }}" == *"llama"* ]]; then
-                kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- \
+                kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{steps.workload.outputs.WORKLOAD_NAME}} -- \
                 curl -s -X POST \
                 -H "Content-Type: application/json" \
                 -d '{
@@ -301,7 +309,7 @@ jobs:
                 }' \
                 http://localhost:5000/generate
             elif [[ "${{ env.RUNTIME }}" == *"vllm"* ]]; then
-                kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- \
+                kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{steps.workload.outputs.WORKLOAD_NAME}} -- \
                 curl -s -X POST \
                 -H "accept: application/json" \
                 -H "Content-Type: application/json" \
@@ -320,7 +328,7 @@ jobs:
                     }' \
                 http://localhost:5000/v1/chat/completions
             else
-                kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- \
+                kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{steps.workload.outputs.WORKLOAD_NAME}} -- \
                 curl -s -X POST \
                 -H "accept: application/json" \
                 -H "Content-Type: application/json" \
@@ -367,15 +375,15 @@ jobs:
                 RESOURCE_TYPE=${{ steps.resource.outputs.RESOURCE_TYPE }}
                 
                 # Check and Delete K8s Resource (Deployment or StatefulSet)
-                if kubectl get $RESOURCE_TYPE ${{ matrix.model.name }} > /dev/null 2>&1; then
-                    kubectl logs $RESOURCE_TYPE/${{ matrix.model.name }}
-                    kubectl delete $RESOURCE_TYPE ${{ matrix.model.name }}
+                if kubectl get $RESOURCE_TYPE ${{steps.workload.outputs.WORKLOAD_NAME}} > /dev/null 2>&1; then
+                    kubectl logs $RESOURCE_TYPE/${{steps.workload.outputs.WORKLOAD_NAME}}
+                    kubectl delete $RESOURCE_TYPE ${{steps.workload.outputs.WORKLOAD_NAME}}
                 fi
             fi
 
             # Check and Delete K8s Service if it exists
-            if kubectl get svc ${{ matrix.model.name }} > /dev/null 2>&1; then
-                kubectl delete svc ${{ matrix.model.name }}
+            if kubectl get svc ${{steps.workload.outputs.WORKLOAD_NAME}} > /dev/null 2>&1; then
+                kubectl delete svc ${{steps.workload.outputs.WORKLOAD_NAME}}
             fi
         
             # Check and Delete AKS Nodepool if it exists            

diff --git a/presets/workspace/models/supported_models.yaml b/presets/workspace/models/supported_models.yaml
@@ -35,13 +35,14 @@ models:
     type: text-generation
     version: https://huggingface.co/tiiuae/falcon-7b/commit/898df1396f35e447d5fe44e0a3ccaaaa69f30d36
     runtime: tfs
-    tag: 0.0.7
+    tag: 0.0.8
   - name: falcon-7b-instruct
     type: text-generation
     version: https://huggingface.co/tiiuae/falcon-7b-instruct/commit/cf4b3c42ce2fdfe24f753f0f0d179202fea59c99
     runtime: tfs
-    tag: 0.0.7
+    tag: 0.0.8
     # Tag history:
+    # 0.0.8 - Support adapter and config file for VLLM runtime
     # 0.0.7 - Support VLLM runtime
     # 0.0.6 - Add Logging & Metrics Server
     # 0.0.5 - Tuning and Adapters
@@ -53,13 +54,14 @@ models:
     type: text-generation
     version: https://huggingface.co/tiiuae/falcon-40b/commit/4a70170c215b36a3cce4b4253f6d0612bb7d4146
     runtime: tfs
-    tag: 0.0.8
+    tag: 0.0.9
   - name: falcon-40b-instruct
     type: text-generation
     version: https://huggingface.co/tiiuae/falcon-40b-instruct/commit/ecb78d97ac356d098e79f0db222c9ce7c5d9ee5f
     runtime: tfs
-    tag: 0.0.8
+    tag: 0.0.9
     # Tag history for 40b models:
+    # 0.0.9 - Support adapter and config file for VLLM runtime
     # 0.0.8 - Support VLLM runtime
     # 0.0.7 - Add Logging & Metrics Server
     # 0.0.6 - Tuning and Adapters
@@ -74,13 +76,14 @@ models:
     type: text-generation 
     version: https://huggingface.co/mistralai/Mistral-7B-v0.3/commit/d8cadc02ac76bd617a919d50b092e59d2d110aff
     runtime: tfs
-    tag: 0.0.8
+    tag: 0.0.9
   - name: mistral-7b-instruct
     type: text-generation
     version: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3/commit/e0bc86c23ce5aae1db576c8cca6f06f1f73af2db
     runtime: tfs
-    tag: 0.0.8
+    tag: 0.0.9
     # Tag history:
+    # 0.0.9 - Support adapter and config file for VLLM runtime
     # 0.0.8 - Support VLLM runtime
     # 0.0.7 - Add Logging & Metrics Server
     # 0.0.6 - Update model version and Address missing weights files fix
@@ -95,8 +98,9 @@ models:
     type: text-generation 
     version: https://huggingface.co/microsoft/phi-2/commit/ef382358ec9e382308935a992d908de099b64c23
     runtime: tfs
-    tag: 0.0.6
+    tag: 0.0.7
     # Tag history:
+    # 0.0.7 - Support adapter and config file for VLLM runtime
     # 0.0.6 - Support VLLM runtime
     # 0.0.5 - Add Logging & Metrics Server
     # 0.0.4 - Tuning and Adapters
@@ -109,38 +113,24 @@ models:
     type: text-generation 
     version: https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/commit/0a67737cc96d2554230f90338b163bc6380a2a85
     runtime: tfs
-    tag: 0.0.3
-    # Tag history:
-    # 0.0.3 - Support VLLM runtime
-    # 0.0.2 - Add Logging & Metrics Server
-    # 0.0.1 - Initial Release
-
+    tag: 0.0.4
   - name: phi-3-mini-128k-instruct
     type: text-generation 
     version: https://huggingface.co/microsoft/Phi-3-mini-128k-instruct/commit/a90b62ae09941edff87a90ced39ba5807e6b2ade
     runtime: tfs
-    tag: 0.0.3
-    # Tag history:
-    # 0.0.3 - Support VLLM runtime
-    # 0.0.2 - Add Logging & Metrics Server
-    # 0.0.1 - Initial Release
-
+    tag: 0.0.4
   - name: phi-3-medium-4k-instruct
     type: text-generation
     version: https://huggingface.co/microsoft/Phi-3-medium-4k-instruct/commit/ae004ae82eb6eddc32906dfacb1d6dfea8f91996
     runtime: tfs
-    tag: 0.0.3
-    # Tag history:
-    # 0.0.3 - Support VLLM runtime
-    # 0.0.2 - Add Logging & Metrics Server
-    # 0.0.1 - Initial Release
-
+    tag: 0.0.4
   - name: phi-3-medium-128k-instruct
     type: text-generation
     version: https://huggingface.co/microsoft/Phi-3-medium-128k-instruct/commit/fa7d2aa4f5ea69b2e36b20d050cdae79c9bfbb3f
     runtime: tfs
-    tag: 0.0.3
+    tag: 0.0.4
     # Tag history:
+    # 0.0.4 - Support adapter and config file for VLLM runtime
     # 0.0.3 - Support VLLM runtime
     # 0.0.2 - Add Logging & Metrics Server
     # 0.0.1 - Initial Release
@@ -149,7 +139,15 @@ models:
     type: text-generation
     version: https://huggingface.co/microsoft/Phi-3.5-mini-instruct/commit/af0dfb8029e8a74545d0736d30cb6b58d2f0f3f0
     runtime: tfs
-    tag: 0.0.1
+    tag: 0.0.2
     # Tag history:
+    # 0.0.2 - Support adapter and config file for VLLM runtime
     # 0.0.1 - New Model! Support VLLM Runtime
-
+
+  - name: qwen2.5-coder-7b-instruct
+    type: text-generation
+    version: https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct/commit/0eb6b1ed2d0c4306bc637d09ecef51e59d3dfe05
+    runtime: tfs
+    tag: 0.0.1
+    # Tag history:
+    # 0.0.1 - New Model!
diff --git a/presets/workspace/test/manifests/falcon-40b-instruct/falcon-40b-instruct-service.yaml b/presets/workspace/test/manifests/falcon-40b-instruct/falcon-40b-instruct-service.yaml
@@ -9,5 +9,5 @@ spec:
   - protocol: TCP
     port: 80
     targetPort: 5000
-  type: LoadBalancer
+  type: ClusterIP
   publishNotReadyAddresses: true
diff --git a/presets/workspace/test/manifests/falcon-40b/falcon-40b-service.yaml b/presets/workspace/test/manifests/falcon-40b/falcon-40b-service.yaml
@@ -9,5 +9,5 @@ spec:
   - protocol: TCP
     port: 80
     targetPort: 5000
-  type: LoadBalancer
+  type: ClusterIP
   publishNotReadyAddresses: true
diff --git a/presets/workspace/test/manifests/falcon-7b-instruct/falcon-7b-instruct-service.yaml b/presets/workspace/test/manifests/falcon-7b-instruct/falcon-7b-instruct-service.yaml
@@ -9,5 +9,5 @@ spec:
   - protocol: TCP
     port: 80
     targetPort: 5000
-  type: LoadBalancer
+  type: ClusterIP
   publishNotReadyAddresses: true
diff --git a/presets/workspace/test/manifests/falcon-7b-with-adapter/falcon-7b.yaml b/presets/workspace/test/manifests/falcon-7b-with-adapter/falcon-7b.yaml
diff --git a/presets/workspace/test/manifests/falcon-7b/falcon-7b-service.yaml b/presets/workspace/test/manifests/falcon-7b/falcon-7b-service.yaml
@@ -9,5 +9,5 @@ spec:
   - protocol: TCP
     port: 80
     targetPort: 5000
-  type: LoadBalancer
+  type: ClusterIP
   publishNotReadyAddresses: true
diff --git a/presets/workspace/test/manifests/llama-2-13b-chat/llama-2-13b-chat-service.yaml b/presets/workspace/test/manifests/llama-2-13b-chat/llama-2-13b-chat-service.yaml
@@ -15,5 +15,5 @@ spec:
       protocol: TCP
       port: 29500
       targetPort: 29500
-  type: LoadBalancer
+  type: ClusterIP
   publishNotReadyAddresses: true
diff --git a/presets/workspace/test/manifests/llama-2-13b/llama-2-13b-service.yaml b/presets/workspace/test/manifests/llama-2-13b/llama-2-13b-service.yaml
@@ -15,5 +15,5 @@ spec:
       protocol: TCP
       port: 29500
       targetPort: 29500
-  type: LoadBalancer
+  type: ClusterIP
   publishNotReadyAddresses: true
diff --git a/presets/workspace/test/manifests/llama-2-7b-chat/llama-2-7b-chat-service.yaml b/presets/workspace/test/manifests/llama-2-7b-chat/llama-2-7b-chat-service.yaml
@@ -10,5 +10,5 @@ spec:
     - protocol: TCP
       port: 80
       targetPort: 5000
-  type: LoadBalancer
+  type: ClusterIP
   publishNotReadyAddresses: true
diff --git a/presets/workspace/test/manifests/llama-2-7b/llama-2-7b-service.yaml b/presets/workspace/test/manifests/llama-2-7b/llama-2-7b-service.yaml
@@ -10,5 +10,5 @@ spec:
     - protocol: TCP
       port: 80
       targetPort: 5000
-  type: LoadBalancer
+  type: ClusterIP
   publishNotReadyAddresses: true
diff --git a/presets/workspace/test/manifests/llama-headless.yaml b/presets/workspace/test/manifests/llama-headless.yaml