feat: add preset test for vllm (#694)
**Reason for Change**:

Add a preset e2e test for the vLLM runtime.

---------

Signed-off-by: jerryzhuang <[email protected]>
zhuangqh authored Nov 22, 2024
1 parent 9d19e8f commit 0087e09
Showing 36 changed files with 1,336 additions and 68 deletions.
12 changes: 6 additions & 6 deletions .github/e2e-preset-configs.json
@@ -4,7 +4,7 @@
{
"name": "falcon-7b",
"node-count": 1,
"node-vm-size": "Standard_NC12s_v3",
"node-vm-size": "Standard_NC6s_v3",
"node-osdisk-size": 100,
"OSS": true,
"loads_adapter": false
@@ -21,39 +21,39 @@
{
"name": "falcon-7b-instruct",
"node-count": 1,
"node-vm-size": "Standard_NC12s_v3",
"node-vm-size": "Standard_NC6s_v3",
"node-osdisk-size": 100,
"OSS": true,
"loads_adapter": false
},
{
"name": "falcon-40b",
"node-count": 1,
"node-vm-size": "Standard_NC24s_v3",
"node-vm-size": "Standard_NC48ads_A100_v4",
"node-osdisk-size": 400,
"OSS": true,
"loads_adapter": false
},
{
"name": "falcon-40b-instruct",
"node-count": 1,
"node-vm-size": "Standard_NC24s_v3",
"node-vm-size": "Standard_NC48ads_A100_v4",
"node-osdisk-size": 400,
"OSS": true,
"loads_adapter": false
},
{
"name": "mistral-7b",
"node-count": 1,
"node-vm-size": "Standard_NC12s_v3",
"node-vm-size": "Standard_NC6s_v3",
"node-osdisk-size": 100,
"OSS": true,
"loads_adapter": false
},
{
"name": "mistral-7b-instruct",
"node-count": 1,
"node-vm-size": "Standard_NC12s_v3",
"node-vm-size": "Standard_NC6s_v3",
"node-osdisk-size": 100,
"OSS": true,
"loads_adapter": false
69 changes: 51 additions & 18 deletions .github/workflows/e2e-preset-test.yml
@@ -15,12 +15,17 @@ on:
type: boolean
default: false
description: "Test all Phi models for E2E"
+test-on-vllm:
+type: boolean
+default: false
+description: "Test on VLLM runtime"

env:
GO_VERSION: "1.22"
BRANCH_NAME: ${{ github.head_ref || github.ref_name}}
FORCE_RUN_ALL: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.force-run-all == 'true' }}
FORCE_RUN_ALL_PHI: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.force-run-all-phi-models== 'true' }}
+RUNTIME: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.test-on-vllm == 'true') && 'vllm' || 'hf' }}

permissions:
id-token: write
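The new RUNTIME variable leans on the GitHub Actions `cond && 'vllm' || 'hf'` idiom, which behaves like a ternary: vLLM is selected only on a manual dispatch with test-on-vllm enabled, and every other trigger falls back to the Hugging Face runtime. A shell sketch of the equivalent logic (the INPUT_TEST_ON_VLLM name is illustrative, not from the workflow):

    # Sketch: how the RUNTIME expression resolves.
    if [[ "$GITHUB_EVENT_NAME" == "workflow_dispatch" && "$INPUT_TEST_ON_VLLM" == "true" ]]; then
      RUNTIME=vllm
    else
      RUNTIME=hf   # default for pushes, PRs, and dispatches without the flag
    fi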
@@ -229,10 +234,11 @@ jobs:
- name: Replace IP and Deploy Resource to K8s
run: |
sed -i "s/MASTER_ADDR_HERE/${{ steps.get_ip.outputs.SERVICE_IP }}/g" presets/workspace/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}.yaml
sed -i "s/TAG_HERE/${{ matrix.model.tag }}/g" presets/workspace/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}.yaml
sed -i "s/REPO_HERE/${{ secrets.ACR_AMRT_USERNAME }}/g" presets/workspace/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}.yaml
kubectl apply -f presets/workspace/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}.yaml
POSTFIX=$(echo "${{ matrix.model.name }}" | grep -q "llama" && echo "" || echo "_${{ env.RUNTIME }}")
sed -i "s/MASTER_ADDR_HERE/${{ steps.get_ip.outputs.SERVICE_IP }}/g" presets/workspace/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}$POSTFIX.yaml
sed -i "s/TAG_HERE/${{ matrix.model.tag }}/g" presets/workspace/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}$POSTFIX.yaml
sed -i "s/REPO_HERE/${{ secrets.ACR_AMRT_USERNAME }}/g" presets/workspace/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}$POSTFIX.yaml
kubectl apply -f presets/workspace/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}$POSTFIX.yaml
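The new POSTFIX line keys the manifest path off the model name: llama presets keep their original manifest, while every other preset picks up a runtime-suffixed one (`_vllm` or `_hf`). A quick illustration with assumed model names:

    # Non-llama preset with RUNTIME=vllm: suffix becomes "_vllm".
    echo "falcon-7b" | grep -q "llama" && echo "" || echo "_vllm"        # prints "_vllm"
    # Llama preset: suffix stays empty, the original manifest is used.
    echo "llama-2-7b-chat" | grep -q "llama" && echo "" || echo "_vllm"  # prints ""
    # falcon-7b therefore deploys presets/workspace/test/manifests/falcon-7b/falcon-7b_vllm.yaml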
- name: Wait for Resource to be ready
run: |
@@ -243,20 +249,27 @@
run: |
POD_NAME=$(kubectl get pods -l app=${{ matrix.model.name }} -o jsonpath="{.items[0].metadata.name}")
kubectl logs $POD_NAME | grep "Adapter added:" | grep "${{ matrix.model.expected_adapter }}" || (echo "Adapter not loaded or incorrect adapter loaded" && exit 1)
-- name: Test home endpoint
+- name: Install testing commands
run: |
-curl http://${{ steps.get_ip.outputs.SERVICE_IP }}:80/
+kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- apt-get update
+kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- apt-get install -y curl
- name: Test healthz endpoint
run: |
-curl http://${{ steps.get_ip.outputs.SERVICE_IP }}:80/healthz
+if [[ "${{ matrix.model.name }}" == *"llama"* ]]; then
+kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- \
+curl -s http://localhost:5000/healthz
+else
+kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- \
+curl -s http://localhost:5000/health
+fi
- name: Test inference endpoint
run: |
echo "Testing inference for ${{ matrix.model.name }}"
if [[ "${{ matrix.model.name }}" == *"llama"* && "${{ matrix.model.name }}" == *"-chat"* ]]; then
echo "Testing inference for ${{ matrix.model.name }}"
curl -X POST \
kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- \
curl -s -X POST \
-H "Content-Type: application/json" \
-d '{
"input_data": {
@@ -274,10 +287,10 @@
]
}
}' \
-http://${{ steps.get_ip.outputs.SERVICE_IP }}:80/chat
+http://localhost:5000/chat
elif [[ "${{ matrix.model.name }}" == *"llama"* ]]; then
echo "Testing inference for ${{ matrix.model.name }}"
curl -X POST \
kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- \
curl -s -X POST \
-H "Content-Type: application/json" \
-d '{
"prompts": [
@@ -290,10 +303,29 @@
"max_gen_len": 128
}
}' \
-http://${{ steps.get_ip.outputs.SERVICE_IP }}:80/generate
+http://localhost:5000/generate
+elif [[ "${{ env.RUNTIME }}" == *"vllm"* ]]; then
+kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- \
+curl -s -X POST \
+-H "accept: application/json" \
+-H "Content-Type: application/json" \
+-d '{
+"model": "test",
+"messages": [
+{
+"role": "system",
+"content": "You are a helpful assistant."
+},
+{
+"role": "user",
+"content": "Hello!"
+}
+]
+}' \
+http://localhost:5000/v1/chat/completions
else
echo "Testing inference for ${{ matrix.model.name }}"
curl -X POST \
kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- \
curl -s -X POST \
-H "accept: application/json" \
-H "Content-Type: application/json" \
-d '{
@@ -327,7 +359,7 @@
"remove_invalid_values":null
}
}' \
-http://${{ steps.get_ip.outputs.SERVICE_IP }}:80/chat
+http://localhost:5000/chat
fi
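In the vLLM branch the server speaks the OpenAI-compatible chat completions API, so a successful reply is a JSON object carrying a choices array. A hedged follow-up check, not part of this workflow (jq is available on GitHub-hosted runners; the deployment name is illustrative):

    # Assert the vLLM reply actually contains generated text.
    kubectl exec deployment/falcon-7b -- curl -s -X POST \
      -H "Content-Type: application/json" \
      -d '{"model": "test", "messages": [{"role": "user", "content": "Hello!"}]}' \
      http://localhost:5000/v1/chat/completions | jq -e '.choices[0].message.content'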
- name: Cleanup
@@ -340,6 +372,7 @@
# Check and Delete K8s Resource (Deployment or StatefulSet)
if kubectl get $RESOURCE_TYPE ${{ matrix.model.name }} > /dev/null 2>&1; then
+kubectl logs $RESOURCE_TYPE/${{ matrix.model.name }}
kubectl delete $RESOURCE_TYPE ${{ matrix.model.name }}
fi
fi
2 changes: 1 addition & 1 deletion .github/workflows/kind-cluster/determine_models.py
@@ -21,7 +21,7 @@ def read_yaml(file_path):
YAML_PR = read_yaml(supp_models_yaml)
# Format: {falcon-7b : {model_name:falcon-7b, type:text-generation, version: #, tag: #}}
MODELS = {model['name']: model for model in YAML_PR['models']}
KAITO_REPO_URL = "https://github.com/kaito-repo/kaito.git"
KAITO_REPO_URL = "https://github.com/kaito-project/kaito.git"

def set_multiline_output(name, value):
with open(os.environ['GITHUB_OUTPUT'], 'a') as fh:
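The body of set_multiline_output is collapsed in this diff; judging by its name and the GITHUB_OUTPUT handle, it presumably writes a multi-line step output using GitHub's heredoc-style delimiter convention. The shell equivalent of that convention, as a sketch (MODELS_JSON is a placeholder):

    # Emit a multi-line output named "matrix" for later steps to consume.
    {
      echo "matrix<<EOF"
      echo "$MODELS_JSON"
      echo "EOF"
    } >> "$GITHUB_OUTPUT"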
55 changes: 36 additions & 19 deletions presets/workspace/models/supported_models.yaml
@@ -34,13 +34,14 @@ models:
type: text-generation
version: https://huggingface.co/tiiuae/falcon-7b/commit/898df1396f35e447d5fe44e0a3ccaaaa69f30d36
runtime: tfs
-tag: 0.0.6
+tag: 0.0.7
- name: falcon-7b-instruct
type: text-generation
version: https://huggingface.co/tiiuae/falcon-7b-instruct/commit/cf4b3c42ce2fdfe24f753f0f0d179202fea59c99
runtime: tfs
-tag: 0.0.6
+tag: 0.0.7
# Tag history:
+# 0.0.7 - Support VLLM runtime
# 0.0.6 - Add Logging & Metrics Server
# 0.0.5 - Tuning and Adapters
# 0.0.4 - Adjust default model params (#310)
@@ -51,13 +52,14 @@
type: text-generation
version: https://huggingface.co/tiiuae/falcon-40b/commit/4a70170c215b36a3cce4b4253f6d0612bb7d4146
runtime: tfs
-tag: 0.0.7
+tag: 0.0.8
- name: falcon-40b-instruct
type: text-generation
version: https://huggingface.co/tiiuae/falcon-40b-instruct/commit/ecb78d97ac356d098e79f0db222c9ce7c5d9ee5f
runtime: tfs
-tag: 0.0.7
+tag: 0.0.8
# Tag history for 40b models:
+# 0.0.8 - Support VLLM runtime
# 0.0.7 - Add Logging & Metrics Server
# 0.0.6 - Tuning and Adapters
# 0.0.5 - Adjust default model params (#310)
@@ -69,15 +71,16 @@
# Mistral
- name: mistral-7b
type: text-generation
-version: https://huggingface.co/mistralai/Mistral-7B-v0.3/commit/c882233d224d27b727b3d9299b12a9aab9dda6f7
+version: https://huggingface.co/mistralai/Mistral-7B-v0.3/commit/d8cadc02ac76bd617a919d50b092e59d2d110aff
runtime: tfs
-tag: 0.0.7
+tag: 0.0.8
- name: mistral-7b-instruct
type: text-generation
-version: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3/commit/0417f4babd26db0b5ed07c1d0bc85658ab526ea3
+version: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3/commit/e0bc86c23ce5aae1db576c8cca6f06f1f73af2db
runtime: tfs
-tag: 0.0.7
+tag: 0.0.8
# Tag history:
+# 0.0.8 - Support VLLM runtime
# 0.0.7 - Add Logging & Metrics Server
# 0.0.6 - Update model version and Address missing weights files fix
# 0.0.5 - Tuning and Adapters
@@ -89,10 +92,11 @@
# Phi-2
- name: phi-2
type: text-generation
-version: https://huggingface.co/microsoft/phi-2/commit/b10c3eba545ad279e7208ee3a5d644566f001670
+version: https://huggingface.co/microsoft/phi-2/commit/ef382358ec9e382308935a992d908de099b64c23
runtime: tfs
-tag: 0.0.5
+tag: 0.0.6
# Tag history:
+# 0.0.6 - Support VLLM runtime
# 0.0.5 - Add Logging & Metrics Server
# 0.0.4 - Tuning and Adapters
# 0.0.3 - Adjust default model params (#310)
@@ -102,36 +106,49 @@
# Phi-3
- name: phi-3-mini-4k-instruct
type: text-generation
-version: https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/commit/d269012bea6fbe38ce7752c8940fea010eea3383
+version: https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/commit/0a67737cc96d2554230f90338b163bc6380a2a85
runtime: tfs
-tag: 0.0.2
+tag: 0.0.3
# Tag history:
+# 0.0.3 - Support VLLM runtime
# 0.0.2 - Add Logging & Metrics Server
# 0.0.1 - Initial Release

- name: phi-3-mini-128k-instruct
type: text-generation
-version: https://huggingface.co/microsoft/Phi-3-mini-128k-instruct/commit/5be6479b4bc06a081e8f4c6ece294241ccd32dec
+version: https://huggingface.co/microsoft/Phi-3-mini-128k-instruct/commit/a90b62ae09941edff87a90ced39ba5807e6b2ade
runtime: tfs
-tag: 0.0.2
+tag: 0.0.3
# Tag history:
+# 0.0.3 - Support VLLM runtime
# 0.0.2 - Add Logging & Metrics Server
# 0.0.1 - Initial Release

- name: phi-3-medium-4k-instruct
type: text-generation
-version: https://huggingface.co/microsoft/Phi-3-medium-4k-instruct/commit/d194e4e74ffad5a5e193e26af25bcfc80c7f1ffc
+version: https://huggingface.co/microsoft/Phi-3-medium-4k-instruct/commit/ae004ae82eb6eddc32906dfacb1d6dfea8f91996
runtime: tfs
-tag: 0.0.2
+tag: 0.0.3
# Tag history:
+# 0.0.3 - Support VLLM runtime
# 0.0.2 - Add Logging & Metrics Server
# 0.0.1 - Initial Release

- name: phi-3-medium-128k-instruct
type: text-generation
-version: https://huggingface.co/microsoft/Phi-3-medium-128k-instruct/commit/cae1d42b5577398fd1be9f0746052562ae552886
+version: https://huggingface.co/microsoft/Phi-3-medium-128k-instruct/commit/fa7d2aa4f5ea69b2e36b20d050cdae79c9bfbb3f
runtime: tfs
-tag: 0.0.2
+tag: 0.0.3
# Tag history:
+# 0.0.3 - Support VLLM runtime
# 0.0.2 - Add Logging & Metrics Server
# 0.0.1 - Initial Release

+- name: phi-3.5-mini-instruct
+type: text-generation
+version: https://huggingface.co/microsoft/Phi-3.5-mini-instruct/commit/af0dfb8029e8a74545d0736d30cb6b58d2f0f3f0
+runtime: tfs
+tag: 0.0.1
+# Tag history:
+# 0.0.1 - New Model! Support VLLM Runtime

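Each preset tag above is what the e2e workflow substitutes for TAG_HERE in the test manifests, which in turn fixes the image reference the pod pulls. A sketch with an illustrative registry name:

    # Stamp a manifest with the falcon-7b preset tag from supported_models.yaml.
    sed -i "s/TAG_HERE/0.0.7/g; s/REPO_HERE/myregistry/g" falcon-7b_vllm.yaml
    # resulting image reference: myregistry.azurecr.io/falcon-7b:0.0.7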
@@ -22,9 +22,9 @@ spec:
- accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16
resources:
requests:
-nvidia.com/gpu: 4 # Requesting 4 GPUs
+nvidia.com/gpu: 2
limits:
-nvidia.com/gpu: 4
+nvidia.com/gpu: 2
livenessProbe:
httpGet:
path: /health
@@ -0,0 +1,56 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: falcon-40b-instruct
+spec:
+  progressDeadlineSeconds: 1800
+  replicas: 1
+  selector:
+    matchLabels:
+      app: falcon
+  template:
+    metadata:
+      labels:
+        app: falcon
+    spec:
+      containers:
+        - name: falcon-container
+          image: REPO_HERE.azurecr.io/falcon-40b-instruct:TAG_HERE
+          command:
+            - /bin/sh
+            - -c
+            - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16
+          resources:
+            requests:
+              nvidia.com/gpu: 2
+            limits:
+              nvidia.com/gpu: 2
+          livenessProbe:
+            httpGet:
+              path: /health
+              port: 5000
+            initialDelaySeconds: 600 # 10 Min
+            periodSeconds: 10
+          readinessProbe:
+            httpGet:
+              path: /health
+              port: 5000
+            initialDelaySeconds: 30
+            periodSeconds: 10
+          volumeMounts:
+            - name: dshm
+              mountPath: /dev/shm
+      volumes:
+        - name: dshm
+          emptyDir:
+            medium: Memory
+      tolerations:
+        - effect: NoSchedule
+          key: sku
+          operator: Equal
+          value: gpu
+        - effect: NoSchedule
+          key: nvidia.com/gpu
+          operator: Exists
+      nodeSelector:
+        pool: falcon40bins
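The manifest's progressDeadlineSeconds of 1800 gives the rollout a 30-minute window, which the workflow's collapsed "Wait for Resource to be ready" step presumably polls against. A minimal sketch of such a wait (an assumption, not the workflow's actual command):

    # Block until the deployment is fully rolled out, or fail after 30 minutes.
    kubectl rollout status deployment/falcon-40b-instruct --timeout=1800s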