diff --git a/.github/e2e-preset-configs.json b/.github/e2e-preset-configs.json index 75c9e6710..972c5e7ad 100644 --- a/.github/e2e-preset-configs.json +++ b/.github/e2e-preset-configs.json @@ -4,7 +4,7 @@ { "name": "falcon-7b", "node-count": 1, - "node-vm-size": "Standard_NC12s_v3", + "node-vm-size": "Standard_NC6s_v3", "node-osdisk-size": 100, "OSS": true, "loads_adapter": false @@ -21,7 +21,7 @@ { "name": "falcon-7b-instruct", "node-count": 1, - "node-vm-size": "Standard_NC12s_v3", + "node-vm-size": "Standard_NC6s_v3", "node-osdisk-size": 100, "OSS": true, "loads_adapter": false @@ -29,7 +29,7 @@ { "name": "falcon-40b", "node-count": 1, - "node-vm-size": "Standard_NC24s_v3", + "node-vm-size": "Standard_NC48ads_A100_v4", "node-osdisk-size": 400, "OSS": true, "loads_adapter": false @@ -37,7 +37,7 @@ { "name": "falcon-40b-instruct", "node-count": 1, - "node-vm-size": "Standard_NC24s_v3", + "node-vm-size": "Standard_NC48ads_A100_v4", "node-osdisk-size": 400, "OSS": true, "loads_adapter": false @@ -45,7 +45,7 @@ { "name": "mistral-7b", "node-count": 1, - "node-vm-size": "Standard_NC12s_v3", + "node-vm-size": "Standard_NC6s_v3", "node-osdisk-size": 100, "OSS": true, "loads_adapter": false @@ -53,7 +53,7 @@ { "name": "mistral-7b-instruct", "node-count": 1, - "node-vm-size": "Standard_NC12s_v3", + "node-vm-size": "Standard_NC6s_v3", "node-osdisk-size": 100, "OSS": true, "loads_adapter": false diff --git a/.github/workflows/e2e-preset-test.yml b/.github/workflows/e2e-preset-test.yml index 574ee54d5..8c79783da 100644 --- a/.github/workflows/e2e-preset-test.yml +++ b/.github/workflows/e2e-preset-test.yml @@ -15,12 +15,17 @@ on: type: boolean default: false description: "Test all Phi models for E2E" + test-on-vllm: + type: boolean + default: false + description: "Test on VLLM runtime" env: GO_VERSION: "1.22" BRANCH_NAME: ${{ github.head_ref || github.ref_name}} FORCE_RUN_ALL: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.force-run-all == 'true' }} FORCE_RUN_ALL_PHI: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.force-run-all-phi-models== 'true' }} + RUNTIME: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.test-on-vllm == 'true') && 'vllm' || 'hf' }} permissions: id-token: write @@ -229,10 +234,11 @@ jobs: - name: Replace IP and Deploy Resource to K8s run: | - sed -i "s/MASTER_ADDR_HERE/${{ steps.get_ip.outputs.SERVICE_IP }}/g" presets/workspace/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}.yaml - sed -i "s/TAG_HERE/${{ matrix.model.tag }}/g" presets/workspace/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}.yaml - sed -i "s/REPO_HERE/${{ secrets.ACR_AMRT_USERNAME }}/g" presets/workspace/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}.yaml - kubectl apply -f presets/workspace/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}.yaml + POSTFIX=$(echo "${{ matrix.model.name }}" | grep -q "llama" && echo "" || echo "_${{ env.RUNTIME }}") + sed -i "s/MASTER_ADDR_HERE/${{ steps.get_ip.outputs.SERVICE_IP }}/g" presets/workspace/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}$POSTFIX.yaml + sed -i "s/TAG_HERE/${{ matrix.model.tag }}/g" presets/workspace/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}$POSTFIX.yaml + sed -i "s/REPO_HERE/${{ secrets.ACR_AMRT_USERNAME }}/g" presets/workspace/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}$POSTFIX.yaml + kubectl apply -f presets/workspace/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}$POSTFIX.yaml - name: Wait for Resource to be ready run: | @@ -243,20 +249,27 @@ jobs: run: | POD_NAME=$(kubectl get pods -l app=${{ matrix.model.name }} -o jsonpath="{.items[0].metadata.name}") kubectl logs $POD_NAME | grep "Adapter added:" | grep "${{ matrix.model.expected_adapter }}" || (echo "Adapter not loaded or incorrect adapter loaded" && exit 1) - - - name: Test home endpoint + + - name: Install testing commands run: | - curl http://${{ steps.get_ip.outputs.SERVICE_IP }}:80/ + kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- apt-get update + kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- apt-get install -y curl - name: Test healthz endpoint run: | - curl http://${{ steps.get_ip.outputs.SERVICE_IP }}:80/healthz - + if [[ "${{ matrix.model.name }}" == *"llama"* ]]; then + kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- \ + curl -s http://localhost:5000/healthz + else + kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- \ + curl -s http://localhost:5000/health + fi - name: Test inference endpoint run: | + echo "Testing inference for ${{ matrix.model.name }}" if [[ "${{ matrix.model.name }}" == *"llama"* && "${{ matrix.model.name }}" == *"-chat"* ]]; then - echo "Testing inference for ${{ matrix.model.name }}" - curl -X POST \ + kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- \ + curl -s -X POST \ -H "Content-Type: application/json" \ -d '{ "input_data": { @@ -274,10 +287,10 @@ jobs: ] } }' \ - http://${{ steps.get_ip.outputs.SERVICE_IP }}:80/chat + http://localhost:5000/chat elif [[ "${{ matrix.model.name }}" == *"llama"* ]]; then - echo "Testing inference for ${{ matrix.model.name }}" - curl -X POST \ + kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- \ + curl -s -X POST \ -H "Content-Type: application/json" \ -d '{ "prompts": [ @@ -290,10 +303,29 @@ jobs: "max_gen_len": 128 } }' \ - http://${{ steps.get_ip.outputs.SERVICE_IP }}:80/generate + http://localhost:5000/generate + elif [[ "${{ env.RUNTIME }}" == *"vllm"* ]]; then + kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- \ + curl -s -X POST \ + -H "accept: application/json" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "test", + "messages": [ + { + "role": "system", + "content": "You are a helpful assistant." + }, + { + "role": "user", + "content": "Hello!" + } + ] + }' \ + http://localhost:5000/v1/chat/completions else - echo "Testing inference for ${{ matrix.model.name }}" - curl -X POST \ + kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- \ + curl -s -X POST \ -H "accept: application/json" \ -H "Content-Type: application/json" \ -d '{ @@ -327,7 +359,7 @@ jobs: "remove_invalid_values":null } }' \ - http://${{ steps.get_ip.outputs.SERVICE_IP }}:80/chat + http://localhost:5000/chat fi - name: Cleanup @@ -340,6 +372,7 @@ jobs: # Check and Delete K8s Resource (Deployment or StatefulSet) if kubectl get $RESOURCE_TYPE ${{ matrix.model.name }} > /dev/null 2>&1; then + kubectl logs $RESOURCE_TYPE/${{ matrix.model.name }} kubectl delete $RESOURCE_TYPE ${{ matrix.model.name }} fi fi diff --git a/.github/workflows/kind-cluster/determine_models.py b/.github/workflows/kind-cluster/determine_models.py index 2ef84f1e0..555c1bc27 100644 --- a/.github/workflows/kind-cluster/determine_models.py +++ b/.github/workflows/kind-cluster/determine_models.py @@ -21,7 +21,7 @@ def read_yaml(file_path): YAML_PR = read_yaml(supp_models_yaml) # Format: {falcon-7b : {model_name:falcon-7b, type:text-generation, version: #, tag: #}} MODELS = {model['name']: model for model in YAML_PR['models']} -KAITO_REPO_URL = "https://github.com/kaito-repo/kaito.git" +KAITO_REPO_URL = "https://github.com/kaito-project/kaito.git" def set_multiline_output(name, value): with open(os.environ['GITHUB_OUTPUT'], 'a') as fh: diff --git a/presets/workspace/models/supported_models.yaml b/presets/workspace/models/supported_models.yaml index db49641b8..74044220f 100644 --- a/presets/workspace/models/supported_models.yaml +++ b/presets/workspace/models/supported_models.yaml @@ -34,13 +34,14 @@ models: type: text-generation version: https://huggingface.co/tiiuae/falcon-7b/commit/898df1396f35e447d5fe44e0a3ccaaaa69f30d36 runtime: tfs - tag: 0.0.6 + tag: 0.0.7 - name: falcon-7b-instruct type: text-generation version: https://huggingface.co/tiiuae/falcon-7b-instruct/commit/cf4b3c42ce2fdfe24f753f0f0d179202fea59c99 runtime: tfs - tag: 0.0.6 + tag: 0.0.7 # Tag history: + # 0.0.7 - Support VLLM runtime # 0.0.6 - Add Logging & Metrics Server # 0.0.5 - Tuning and Adapters # 0.0.4 - Adjust default model params (#310) @@ -51,13 +52,14 @@ models: type: text-generation version: https://huggingface.co/tiiuae/falcon-40b/commit/4a70170c215b36a3cce4b4253f6d0612bb7d4146 runtime: tfs - tag: 0.0.7 + tag: 0.0.8 - name: falcon-40b-instruct type: text-generation version: https://huggingface.co/tiiuae/falcon-40b-instruct/commit/ecb78d97ac356d098e79f0db222c9ce7c5d9ee5f runtime: tfs - tag: 0.0.7 + tag: 0.0.8 # Tag history for 40b models: + # 0.0.8 - Support VLLM runtime # 0.0.7 - Add Logging & Metrics Server # 0.0.6 - Tuning and Adapters # 0.0.5 - Adjust default model params (#310) @@ -69,15 +71,16 @@ models: # Mistral - name: mistral-7b type: text-generation - version: https://huggingface.co/mistralai/Mistral-7B-v0.3/commit/c882233d224d27b727b3d9299b12a9aab9dda6f7 + version: https://huggingface.co/mistralai/Mistral-7B-v0.3/commit/d8cadc02ac76bd617a919d50b092e59d2d110aff runtime: tfs - tag: 0.0.7 + tag: 0.0.8 - name: mistral-7b-instruct type: text-generation - version: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3/commit/0417f4babd26db0b5ed07c1d0bc85658ab526ea3 + version: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3/commit/e0bc86c23ce5aae1db576c8cca6f06f1f73af2db runtime: tfs - tag: 0.0.7 + tag: 0.0.8 # Tag history: + # 0.0.8 - Support VLLM runtime # 0.0.7 - Add Logging & Metrics Server # 0.0.6 - Update model version and Address missing weights files fix # 0.0.5 - Tuning and Adapters @@ -89,10 +92,11 @@ models: # Phi-2 - name: phi-2 type: text-generation - version: https://huggingface.co/microsoft/phi-2/commit/b10c3eba545ad279e7208ee3a5d644566f001670 + version: https://huggingface.co/microsoft/phi-2/commit/ef382358ec9e382308935a992d908de099b64c23 runtime: tfs - tag: 0.0.5 + tag: 0.0.6 # Tag history: + # 0.0.6 - Support VLLM runtime # 0.0.5 - Add Logging & Metrics Server # 0.0.4 - Tuning and Adapters # 0.0.3 - Adjust default model params (#310) @@ -102,36 +106,49 @@ models: # Phi-3 - name: phi-3-mini-4k-instruct type: text-generation - version: https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/commit/d269012bea6fbe38ce7752c8940fea010eea3383 + version: https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/commit/0a67737cc96d2554230f90338b163bc6380a2a85 runtime: tfs - tag: 0.0.2 + tag: 0.0.3 # Tag history: + # 0.0.3 - Support VLLM runtime # 0.0.2 - Add Logging & Metrics Server # 0.0.1 - Initial Release - name: phi-3-mini-128k-instruct type: text-generation - version: https://huggingface.co/microsoft/Phi-3-mini-128k-instruct/commit/5be6479b4bc06a081e8f4c6ece294241ccd32dec + version: https://huggingface.co/microsoft/Phi-3-mini-128k-instruct/commit/a90b62ae09941edff87a90ced39ba5807e6b2ade runtime: tfs - tag: 0.0.2 + tag: 0.0.3 # Tag history: + # 0.0.3 - Support VLLM runtime # 0.0.2 - Add Logging & Metrics Server # 0.0.1 - Initial Release - name: phi-3-medium-4k-instruct type: text-generation - version: https://huggingface.co/microsoft/Phi-3-medium-4k-instruct/commit/d194e4e74ffad5a5e193e26af25bcfc80c7f1ffc + version: https://huggingface.co/microsoft/Phi-3-medium-4k-instruct/commit/ae004ae82eb6eddc32906dfacb1d6dfea8f91996 runtime: tfs - tag: 0.0.2 + tag: 0.0.3 # Tag history: + # 0.0.3 - Support VLLM runtime # 0.0.2 - Add Logging & Metrics Server # 0.0.1 - Initial Release - name: phi-3-medium-128k-instruct type: text-generation - version: https://huggingface.co/microsoft/Phi-3-medium-128k-instruct/commit/cae1d42b5577398fd1be9f0746052562ae552886 + version: https://huggingface.co/microsoft/Phi-3-medium-128k-instruct/commit/fa7d2aa4f5ea69b2e36b20d050cdae79c9bfbb3f runtime: tfs - tag: 0.0.2 + tag: 0.0.3 # Tag history: + # 0.0.3 - Support VLLM runtime # 0.0.2 - Add Logging & Metrics Server - # 0.0.1 - Initial Release \ No newline at end of file + # 0.0.1 - Initial Release + + - name: phi-3.5-mini-instruct + type: text-generation + version: https://huggingface.co/microsoft/Phi-3.5-mini-instruct/commit/af0dfb8029e8a74545d0736d30cb6b58d2f0f3f0 + runtime: tfs + tag: 0.0.1 + # Tag history: + # 0.0.1 - New Model! Support VLLM Runtime + \ No newline at end of file diff --git a/presets/workspace/test/manifests/falcon-40b-instruct/falcon-40b-instruct.yaml b/presets/workspace/test/manifests/falcon-40b-instruct/falcon-40b-instruct.yaml index 37f3c6a6b..a44043894 100644 --- a/presets/workspace/test/manifests/falcon-40b-instruct/falcon-40b-instruct.yaml +++ b/presets/workspace/test/manifests/falcon-40b-instruct/falcon-40b-instruct.yaml @@ -22,9 +22,9 @@ spec: - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16 resources: requests: - nvidia.com/gpu: 4 # Requesting 4 GPUs + nvidia.com/gpu: 2 limits: - nvidia.com/gpu: 4 + nvidia.com/gpu: 2 livenessProbe: httpGet: path: /health diff --git a/presets/workspace/test/manifests/falcon-40b-instruct/falcon-40b-instruct_hf.yaml b/presets/workspace/test/manifests/falcon-40b-instruct/falcon-40b-instruct_hf.yaml new file mode 100644 index 000000000..a44043894 --- /dev/null +++ b/presets/workspace/test/manifests/falcon-40b-instruct/falcon-40b-instruct_hf.yaml @@ -0,0 +1,56 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: falcon-40b-instruct +spec: + progressDeadlineSeconds: 1800 + replicas: 1 + selector: + matchLabels: + app: falcon + template: + metadata: + labels: + app: falcon + spec: + containers: + - name: falcon-container + image: REPO_HERE.azurecr.io/falcon-40b-instruct:TAG_HERE + command: + - /bin/sh + - -c + - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16 + resources: + requests: + nvidia.com/gpu: 2 + limits: + nvidia.com/gpu: 2 + livenessProbe: + httpGet: + path: /health + port: 5000 + initialDelaySeconds: 600 # 10 Min + periodSeconds: 10 + readinessProbe: + httpGet: + path: /health + port: 5000 + initialDelaySeconds: 30 + periodSeconds: 10 + volumeMounts: + - name: dshm + mountPath: /dev/shm + volumes: + - name: dshm + emptyDir: + medium: Memory + tolerations: + - effect: NoSchedule + key: sku + operator: Equal + value: gpu + - effect: NoSchedule + key: nvidia.com/gpu + operator: Exists + nodeSelector: + pool: falcon40bins diff --git a/presets/workspace/test/manifests/falcon-40b-instruct/falcon-40b-instruct_vllm.yaml b/presets/workspace/test/manifests/falcon-40b-instruct/falcon-40b-instruct_vllm.yaml new file mode 100644 index 000000000..7b40cbac4 --- /dev/null +++ b/presets/workspace/test/manifests/falcon-40b-instruct/falcon-40b-instruct_vllm.yaml @@ -0,0 +1,56 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: falcon-40b-instruct +spec: + progressDeadlineSeconds: 1800 + replicas: 1 + selector: + matchLabels: + app: falcon + template: + metadata: + labels: + app: falcon + spec: + containers: + - name: falcon-container + image: REPO_HERE.azurecr.io/falcon-40b-instruct:TAG_HERE + command: + - /bin/sh + - -c + - python3 /workspace/vllm/inference_api.py --served-model-name test --dtype bfloat16 --chat-template /workspace/chat_templates/falcon-instruct.jinja --tensor-parallel-size 2 + resources: + requests: + nvidia.com/gpu: 2 + limits: + nvidia.com/gpu: 2 + livenessProbe: + httpGet: + path: /health + port: 5000 + initialDelaySeconds: 600 # 10 Min + periodSeconds: 10 + readinessProbe: + httpGet: + path: /health + port: 5000 + initialDelaySeconds: 30 + periodSeconds: 10 + volumeMounts: + - name: dshm + mountPath: /dev/shm + volumes: + - name: dshm + emptyDir: + medium: Memory + tolerations: + - effect: NoSchedule + key: sku + operator: Equal + value: gpu + - effect: NoSchedule + key: nvidia.com/gpu + operator: Exists + nodeSelector: + pool: falcon40bins diff --git a/presets/workspace/test/manifests/falcon-40b/falcon-40b.yaml b/presets/workspace/test/manifests/falcon-40b/falcon-40b.yaml index a1c11af0e..514d12e60 100644 --- a/presets/workspace/test/manifests/falcon-40b/falcon-40b.yaml +++ b/presets/workspace/test/manifests/falcon-40b/falcon-40b.yaml @@ -22,9 +22,9 @@ spec: - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16 resources: requests: - nvidia.com/gpu: 4 # Requesting 4 GPUs + nvidia.com/gpu: 2 limits: - nvidia.com/gpu: 4 + nvidia.com/gpu: 2 livenessProbe: httpGet: path: /health diff --git a/presets/workspace/test/manifests/falcon-40b/falcon-40b_hf.yaml b/presets/workspace/test/manifests/falcon-40b/falcon-40b_hf.yaml new file mode 100644 index 000000000..514d12e60 --- /dev/null +++ b/presets/workspace/test/manifests/falcon-40b/falcon-40b_hf.yaml @@ -0,0 +1,56 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: falcon-40b +spec: + progressDeadlineSeconds: 1800 + replicas: 1 + selector: + matchLabels: + app: falcon + template: + metadata: + labels: + app: falcon + spec: + containers: + - name: falcon-container + image: REPO_HERE.azurecr.io/falcon-40b:TAG_HERE + command: + - /bin/sh + - -c + - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16 + resources: + requests: + nvidia.com/gpu: 2 + limits: + nvidia.com/gpu: 2 + livenessProbe: + httpGet: + path: /health + port: 5000 + initialDelaySeconds: 600 # 10 Min + periodSeconds: 10 + readinessProbe: + httpGet: + path: /health + port: 5000 + initialDelaySeconds: 30 + periodSeconds: 10 + volumeMounts: + - name: dshm + mountPath: /dev/shm + volumes: + - name: dshm + emptyDir: + medium: Memory + tolerations: + - effect: NoSchedule + key: sku + operator: Equal + value: gpu + - effect: NoSchedule + key: nvidia.com/gpu + operator: Exists + nodeSelector: + pool: falcon40b diff --git a/presets/workspace/test/manifests/falcon-40b/falcon-40b_vllm.yaml b/presets/workspace/test/manifests/falcon-40b/falcon-40b_vllm.yaml new file mode 100644 index 000000000..7e74ac7a7 --- /dev/null +++ b/presets/workspace/test/manifests/falcon-40b/falcon-40b_vllm.yaml @@ -0,0 +1,56 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: falcon-40b +spec: + progressDeadlineSeconds: 1800 + replicas: 1 + selector: + matchLabels: + app: falcon + template: + metadata: + labels: + app: falcon + spec: + containers: + - name: falcon-container + image: REPO_HERE.azurecr.io/falcon-40b:TAG_HERE + command: + - /bin/sh + - -c + - python3 /workspace/vllm/inference_api.py --served-model-name test --dtype bfloat16 --chat-template /workspace/chat_templates/falcon-instruct.jinja --tensor-parallel-size 2 + resources: + requests: + nvidia.com/gpu: 2 + limits: + nvidia.com/gpu: 2 + livenessProbe: + httpGet: + path: /health + port: 5000 + initialDelaySeconds: 600 # 10 Min + periodSeconds: 10 + readinessProbe: + httpGet: + path: /health + port: 5000 + initialDelaySeconds: 30 + periodSeconds: 10 + volumeMounts: + - name: dshm + mountPath: /dev/shm + volumes: + - name: dshm + emptyDir: + medium: Memory + tolerations: + - effect: NoSchedule + key: sku + operator: Equal + value: gpu + - effect: NoSchedule + key: nvidia.com/gpu + operator: Exists + nodeSelector: + pool: falcon40b diff --git a/presets/workspace/test/manifests/falcon-7b-instruct/falcon-7b-instruct.yaml b/presets/workspace/test/manifests/falcon-7b-instruct/falcon-7b-instruct.yaml index cbf7f6f7f..399b78a3c 100644 --- a/presets/workspace/test/manifests/falcon-7b-instruct/falcon-7b-instruct.yaml +++ b/presets/workspace/test/manifests/falcon-7b-instruct/falcon-7b-instruct.yaml @@ -21,9 +21,9 @@ spec: - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16 resources: requests: - nvidia.com/gpu: 2 + nvidia.com/gpu: 1 limits: - nvidia.com/gpu: 2 # Requesting 2 GPUs + nvidia.com/gpu: 1 livenessProbe: httpGet: path: /health diff --git a/presets/workspace/test/manifests/falcon-7b-instruct/falcon-7b-instruct_hf.yaml b/presets/workspace/test/manifests/falcon-7b-instruct/falcon-7b-instruct_hf.yaml new file mode 100644 index 000000000..1b2092b36 --- /dev/null +++ b/presets/workspace/test/manifests/falcon-7b-instruct/falcon-7b-instruct_hf.yaml @@ -0,0 +1,56 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: falcon-7b-instruct +spec: + progressDeadlineSeconds: 1200 + replicas: 1 + selector: + matchLabels: + app: falcon + template: + metadata: + labels: + app: falcon + spec: + containers: + - name: falcon-container + image: REPO_HERE.azurecr.io/falcon-7b-instruct:TAG_HERE + command: + - /bin/sh + - -c + - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16 + resources: + requests: + nvidia.com/gpu: 1 + limits: + nvidia.com/gpu: 1 + livenessProbe: + httpGet: + path: /health + port: 5000 + initialDelaySeconds: 600 # 10 Min + periodSeconds: 10 + readinessProbe: + httpGet: + path: /health + port: 5000 + initialDelaySeconds: 30 + periodSeconds: 10 + volumeMounts: + - name: dshm + mountPath: /dev/shm + volumes: + - name: dshm + emptyDir: + medium: Memory + tolerations: + - effect: NoSchedule + key: sku + operator: Equal + value: gpu + - effect: NoSchedule + key: nvidia.com/gpu + operator: Exists + nodeSelector: + pool: falcon7binst diff --git a/presets/workspace/test/manifests/falcon-7b-instruct/falcon-7b-instruct_vllm.yaml b/presets/workspace/test/manifests/falcon-7b-instruct/falcon-7b-instruct_vllm.yaml new file mode 100644 index 000000000..4019d64f5 --- /dev/null +++ b/presets/workspace/test/manifests/falcon-7b-instruct/falcon-7b-instruct_vllm.yaml @@ -0,0 +1,56 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: falcon-7b-instruct +spec: + progressDeadlineSeconds: 1200 + replicas: 1 + selector: + matchLabels: + app: falcon + template: + metadata: + labels: + app: falcon + spec: + containers: + - name: falcon-container + image: REPO_HERE.azurecr.io/falcon-7b-instruct:TAG_HERE + command: + - /bin/sh + - -c + - python3 /workspace/vllm/inference_api.py --served-model-name test --dtype float16 --chat-template /workspace/chat_templates/falcon-instruct.jinja + resources: + requests: + nvidia.com/gpu: 1 + limits: + nvidia.com/gpu: 1 + livenessProbe: + httpGet: + path: /health + port: 5000 + initialDelaySeconds: 600 # 10 Min + periodSeconds: 10 + readinessProbe: + httpGet: + path: /health + port: 5000 + initialDelaySeconds: 30 + periodSeconds: 10 + volumeMounts: + - name: dshm + mountPath: /dev/shm + volumes: + - name: dshm + emptyDir: + medium: Memory + tolerations: + - effect: NoSchedule + key: sku + operator: Equal + value: gpu + - effect: NoSchedule + key: nvidia.com/gpu + operator: Exists + nodeSelector: + pool: falcon7binst diff --git a/presets/workspace/test/manifests/falcon-7b/falcon-7b.yaml b/presets/workspace/test/manifests/falcon-7b/falcon-7b.yaml index f985124ea..8e5786c6e 100644 --- a/presets/workspace/test/manifests/falcon-7b/falcon-7b.yaml +++ b/presets/workspace/test/manifests/falcon-7b/falcon-7b.yaml @@ -21,9 +21,9 @@ spec: - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16 resources: requests: - nvidia.com/gpu: 2 + nvidia.com/gpu: 1 limits: - nvidia.com/gpu: 2 # Requesting 2 GPUs + nvidia.com/gpu: 1 livenessProbe: httpGet: path: /health diff --git a/presets/workspace/test/manifests/falcon-7b/falcon-7b_hf.yaml b/presets/workspace/test/manifests/falcon-7b/falcon-7b_hf.yaml new file mode 100644 index 000000000..56a775fff --- /dev/null +++ b/presets/workspace/test/manifests/falcon-7b/falcon-7b_hf.yaml @@ -0,0 +1,56 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: falcon-7b +spec: + progressDeadlineSeconds: 1200 + replicas: 1 + selector: + matchLabels: + app: falcon + template: + metadata: + labels: + app: falcon + spec: + containers: + - name: falcon-container + image: REPO_HERE.azurecr.io/falcon-7b:TAG_HERE + command: + - /bin/sh + - -c + - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16 + resources: + requests: + nvidia.com/gpu: 1 + limits: + nvidia.com/gpu: 1 + livenessProbe: + httpGet: + path: /health + port: 5000 + initialDelaySeconds: 600 # 10 Min + periodSeconds: 10 + readinessProbe: + httpGet: + path: /health + port: 5000 + initialDelaySeconds: 30 + periodSeconds: 10 + volumeMounts: + - name: dshm + mountPath: /dev/shm + volumes: + - name: dshm + emptyDir: + medium: Memory + tolerations: + - effect: NoSchedule + key: sku + operator: Equal + value: gpu + - effect: NoSchedule + key: nvidia.com/gpu + operator: Exists + nodeSelector: + pool: falcon7b diff --git a/presets/workspace/test/manifests/falcon-7b/falcon-7b_vllm.yaml b/presets/workspace/test/manifests/falcon-7b/falcon-7b_vllm.yaml new file mode 100644 index 000000000..bceb14560 --- /dev/null +++ b/presets/workspace/test/manifests/falcon-7b/falcon-7b_vllm.yaml @@ -0,0 +1,56 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: falcon-7b +spec: + progressDeadlineSeconds: 1200 + replicas: 1 + selector: + matchLabels: + app: falcon + template: + metadata: + labels: + app: falcon + spec: + containers: + - name: falcon-container + image: REPO_HERE.azurecr.io/falcon-7b:TAG_HERE + command: + - /bin/sh + - -c + - python3 /workspace/vllm/inference_api.py --served-model-name test --dtype float16 --chat-template /workspace/chat_templates/falcon-instruct.jinja + resources: + requests: + nvidia.com/gpu: 1 + limits: + nvidia.com/gpu: 1 + livenessProbe: + httpGet: + path: /health + port: 5000 + initialDelaySeconds: 600 # 10 Min + periodSeconds: 10 + readinessProbe: + httpGet: + path: /health + port: 5000 + initialDelaySeconds: 30 + periodSeconds: 10 + volumeMounts: + - name: dshm + mountPath: /dev/shm + volumes: + - name: dshm + emptyDir: + medium: Memory + tolerations: + - effect: NoSchedule + key: sku + operator: Equal + value: gpu + - effect: NoSchedule + key: nvidia.com/gpu + operator: Exists + nodeSelector: + pool: falcon7b diff --git a/presets/workspace/test/manifests/llama-2-13b-chat/llama-2-13b-chat.yaml b/presets/workspace/test/manifests/llama-2-13b-chat/llama-2-13b-chat.yaml index 61a309821..973f6d238 100644 --- a/presets/workspace/test/manifests/llama-2-13b-chat/llama-2-13b-chat.yaml +++ b/presets/workspace/test/manifests/llama-2-13b-chat/llama-2-13b-chat.yaml @@ -35,7 +35,7 @@ spec: - | echo "MASTER_ADDR: $MASTER_ADDR" NODE_RANK=$(echo $HOSTNAME | grep -o '[^-]*$') - cd /workspace/llama/llama-2 && torchrun --nnodes 2 --nproc_per_node 1 --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port 29500 /workspace/tfs/inference_api.py + cd /workspace/llama/llama-2 && torchrun --nnodes 2 --nproc_per_node 1 --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port 29500 inference_api.py resources: limits: nvidia.com/gpu: "1" @@ -43,13 +43,13 @@ spec: nvidia.com/gpu: "1" livenessProbe: httpGet: - path: /health + path: /healthz port: 5000 initialDelaySeconds: 600 # 10 Min periodSeconds: 10 readinessProbe: httpGet: - path: /health + path: /healthz port: 5000 initialDelaySeconds: 30 periodSeconds: 10 diff --git a/presets/workspace/test/manifests/llama-2-13b/llama-2-13b.yaml b/presets/workspace/test/manifests/llama-2-13b/llama-2-13b.yaml index daff4cd0a..46c609bbb 100644 --- a/presets/workspace/test/manifests/llama-2-13b/llama-2-13b.yaml +++ b/presets/workspace/test/manifests/llama-2-13b/llama-2-13b.yaml @@ -35,7 +35,7 @@ spec: - | echo "MASTER_ADDR: $MASTER_ADDR" NODE_RANK=$(echo $HOSTNAME | grep -o '[^-]*$') - cd /workspace/llama/llama-2 && torchrun --nnodes 2 --nproc_per_node 1 --node_rank $NODE_RANK --master-addr $MASTER_ADDR --master-port 29500 /workspace/tfs/inference_api.py + cd /workspace/llama/llama-2 && torchrun --nnodes 2 --nproc_per_node 1 --node_rank $NODE_RANK --master-addr $MASTER_ADDR --master-port 29500 inference_api.py resources: limits: nvidia.com/gpu: "1" @@ -43,13 +43,13 @@ spec: nvidia.com/gpu: "1" livenessProbe: httpGet: - path: /health + path: /healthz port: 5000 initialDelaySeconds: 600 # 10 Min periodSeconds: 10 readinessProbe: httpGet: - path: /health + path: /healthz port: 5000 initialDelaySeconds: 30 periodSeconds: 10 diff --git a/presets/workspace/test/manifests/llama-2-7b-chat/llama-2-7b-chat.yaml b/presets/workspace/test/manifests/llama-2-7b-chat/llama-2-7b-chat.yaml index 61ec695dc..f26b003a8 100644 --- a/presets/workspace/test/manifests/llama-2-7b-chat/llama-2-7b-chat.yaml +++ b/presets/workspace/test/manifests/llama-2-7b-chat/llama-2-7b-chat.yaml @@ -19,7 +19,7 @@ spec: command: - /bin/sh - -c - - cd /workspace/llama/llama-2 && torchrun /workspace/tfs/inference_api.py + - cd /workspace/llama/llama-2 && torchrun inference_api.py resources: limits: nvidia.com/gpu: "1" @@ -27,13 +27,13 @@ spec: nvidia.com/gpu: "1" livenessProbe: httpGet: - path: /health + path: /healthz port: 5000 initialDelaySeconds: 600 # 10 Min periodSeconds: 10 readinessProbe: httpGet: - path: /health + path: /healthz port: 5000 initialDelaySeconds: 30 periodSeconds: 10 diff --git a/presets/workspace/test/manifests/llama-2-7b/llama-2-7b.yaml b/presets/workspace/test/manifests/llama-2-7b/llama-2-7b.yaml index af295b8db..f68d43c64 100644 --- a/presets/workspace/test/manifests/llama-2-7b/llama-2-7b.yaml +++ b/presets/workspace/test/manifests/llama-2-7b/llama-2-7b.yaml @@ -19,7 +19,7 @@ spec: command: - /bin/sh - -c - - cd /workspace/llama/llama-2 && torchrun /workspace/tfs/inference_api.py + - cd /workspace/llama/llama-2 && torchrun inference_api.py resources: limits: nvidia.com/gpu: "1" @@ -27,13 +27,13 @@ spec: nvidia.com/gpu: "1" livenessProbe: httpGet: - path: /health + path: /healthz port: 5000 initialDelaySeconds: 600 # 10 Min periodSeconds: 10 readinessProbe: httpGet: - path: /health + path: /healthz port: 5000 initialDelaySeconds: 30 periodSeconds: 10 diff --git a/presets/workspace/test/manifests/mistral-7b-instruct/mistral-7b-instruct.yaml b/presets/workspace/test/manifests/mistral-7b-instruct/mistral-7b-instruct.yaml index a64780db9..75179683f 100644 --- a/presets/workspace/test/manifests/mistral-7b-instruct/mistral-7b-instruct.yaml +++ b/presets/workspace/test/manifests/mistral-7b-instruct/mistral-7b-instruct.yaml @@ -21,9 +21,9 @@ spec: - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16 resources: requests: - nvidia.com/gpu: 2 + nvidia.com/gpu: 1 limits: - nvidia.com/gpu: 2 # Requesting 2 GPUs + nvidia.com/gpu: 1 livenessProbe: httpGet: path: /health diff --git a/presets/workspace/test/manifests/mistral-7b-instruct/mistral-7b-instruct_hf.yaml b/presets/workspace/test/manifests/mistral-7b-instruct/mistral-7b-instruct_hf.yaml new file mode 100644 index 000000000..75179683f --- /dev/null +++ b/presets/workspace/test/manifests/mistral-7b-instruct/mistral-7b-instruct_hf.yaml @@ -0,0 +1,55 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: mistral-7b-instruct +spec: + replicas: 1 + selector: + matchLabels: + app: mistral + template: + metadata: + labels: + app: mistral + spec: + containers: + - name: mistral-instruct-container + image: REPO_HERE.azurecr.io/mistral-7b-instruct:TAG_HERE + command: + - /bin/sh + - -c + - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16 + resources: + requests: + nvidia.com/gpu: 1 + limits: + nvidia.com/gpu: 1 + livenessProbe: + httpGet: + path: /health + port: 5000 + initialDelaySeconds: 600 # 10 Min + periodSeconds: 10 + readinessProbe: + httpGet: + path: /health + port: 5000 + initialDelaySeconds: 30 + periodSeconds: 10 + volumeMounts: + - name: dshm + mountPath: /dev/shm + volumes: + - name: dshm + emptyDir: + medium: Memory + tolerations: + - effect: NoSchedule + key: sku + operator: Equal + value: gpu + - effect: NoSchedule + key: nvidia.com/gpu + operator: Exists + nodeSelector: + pool: mistral7bins diff --git a/presets/workspace/test/manifests/mistral-7b-instruct/mistral-7b-instruct_vllm.yaml b/presets/workspace/test/manifests/mistral-7b-instruct/mistral-7b-instruct_vllm.yaml new file mode 100644 index 000000000..939d6c75b --- /dev/null +++ b/presets/workspace/test/manifests/mistral-7b-instruct/mistral-7b-instruct_vllm.yaml @@ -0,0 +1,55 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: mistral-7b-instruct +spec: + replicas: 1 + selector: + matchLabels: + app: mistral + template: + metadata: + labels: + app: mistral + spec: + containers: + - name: mistral-instruct-container + image: REPO_HERE.azurecr.io/mistral-7b-instruct:TAG_HERE + command: + - /bin/sh + - -c + - python3 /workspace/vllm/inference_api.py --served-model-name test --dtype float16 --chat-template /workspace/chat_templates/mistral-instruct.jinja + resources: + requests: + nvidia.com/gpu: 1 + limits: + nvidia.com/gpu: 1 + livenessProbe: + httpGet: + path: /health + port: 5000 + initialDelaySeconds: 600 # 10 Min + periodSeconds: 10 + readinessProbe: + httpGet: + path: /health + port: 5000 + initialDelaySeconds: 30 + periodSeconds: 10 + volumeMounts: + - name: dshm + mountPath: /dev/shm + volumes: + - name: dshm + emptyDir: + medium: Memory + tolerations: + - effect: NoSchedule + key: sku + operator: Equal + value: gpu + - effect: NoSchedule + key: nvidia.com/gpu + operator: Exists + nodeSelector: + pool: mistral7bins diff --git a/presets/workspace/test/manifests/mistral-7b/mistral-7b.yaml b/presets/workspace/test/manifests/mistral-7b/mistral-7b.yaml index 219f42ff5..3eff5594f 100644 --- a/presets/workspace/test/manifests/mistral-7b/mistral-7b.yaml +++ b/presets/workspace/test/manifests/mistral-7b/mistral-7b.yaml @@ -21,9 +21,9 @@ spec: - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16 resources: requests: - nvidia.com/gpu: 2 + nvidia.com/gpu: 1 limits: - nvidia.com/gpu: 2 # Requesting 2 GPUs + nvidia.com/gpu: 1 livenessProbe: httpGet: path: /health diff --git a/presets/workspace/test/manifests/mistral-7b/mistral-7b_hf.yaml b/presets/workspace/test/manifests/mistral-7b/mistral-7b_hf.yaml new file mode 100644 index 000000000..3eff5594f --- /dev/null +++ b/presets/workspace/test/manifests/mistral-7b/mistral-7b_hf.yaml @@ -0,0 +1,55 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: mistral-7b +spec: + replicas: 1 + selector: + matchLabels: + app: mistral + template: + metadata: + labels: + app: mistral + spec: + containers: + - name: mistral-container + image: REPO_HERE.azurecr.io/mistral-7b:TAG_HERE + command: + - /bin/sh + - -c + - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16 + resources: + requests: + nvidia.com/gpu: 1 + limits: + nvidia.com/gpu: 1 + livenessProbe: + httpGet: + path: /health + port: 5000 + initialDelaySeconds: 600 # 10 Min + periodSeconds: 10 + readinessProbe: + httpGet: + path: /health + port: 5000 + initialDelaySeconds: 30 + periodSeconds: 10 + volumeMounts: + - name: dshm + mountPath: /dev/shm + volumes: + - name: dshm + emptyDir: + medium: Memory + tolerations: + - effect: NoSchedule + key: sku + operator: Equal + value: gpu + - effect: NoSchedule + key: nvidia.com/gpu + operator: Exists + nodeSelector: + pool: mistral7b diff --git a/presets/workspace/test/manifests/mistral-7b/mistral-7b_vllm.yaml b/presets/workspace/test/manifests/mistral-7b/mistral-7b_vllm.yaml new file mode 100644 index 000000000..2bd945319 --- /dev/null +++ b/presets/workspace/test/manifests/mistral-7b/mistral-7b_vllm.yaml @@ -0,0 +1,55 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: mistral-7b +spec: + replicas: 1 + selector: + matchLabels: + app: mistral + template: + metadata: + labels: + app: mistral + spec: + containers: + - name: mistral-container + image: REPO_HERE.azurecr.io/mistral-7b:TAG_HERE + command: + - /bin/sh + - -c + - python3 /workspace/vllm/inference_api.py --served-model-name test --dtype float16 --chat-template /workspace/chat_templates/mistral-instruct.jinja + resources: + requests: + nvidia.com/gpu: 1 + limits: + nvidia.com/gpu: 1 + livenessProbe: + httpGet: + path: /health + port: 5000 + initialDelaySeconds: 600 # 10 Min + periodSeconds: 10 + readinessProbe: + httpGet: + path: /health + port: 5000 + initialDelaySeconds: 30 + periodSeconds: 10 + volumeMounts: + - name: dshm + mountPath: /dev/shm + volumes: + - name: dshm + emptyDir: + medium: Memory + tolerations: + - effect: NoSchedule + key: sku + operator: Equal + value: gpu + - effect: NoSchedule + key: nvidia.com/gpu + operator: Exists + nodeSelector: + pool: mistral7b diff --git a/presets/workspace/test/manifests/phi-2/phi-2_hf.yaml b/presets/workspace/test/manifests/phi-2/phi-2_hf.yaml new file mode 100644 index 000000000..cbc6f94e7 --- /dev/null +++ b/presets/workspace/test/manifests/phi-2/phi-2_hf.yaml @@ -0,0 +1,55 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: phi-2 +spec: + replicas: 1 + selector: + matchLabels: + app: phi-2 + template: + metadata: + labels: + app: phi-2 + spec: + containers: + - name: phi-2-container + image: REPO_HERE.azurecr.io/phi-2:TAG_HERE + command: + - /bin/sh + - -c + - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16 + resources: + requests: + nvidia.com/gpu: 1 + limits: + nvidia.com/gpu: 1 # Requesting 1 GPU + livenessProbe: + httpGet: + path: /health + port: 5000 + initialDelaySeconds: 600 # 10 Min + periodSeconds: 10 + readinessProbe: + httpGet: + path: /health + port: 5000 + initialDelaySeconds: 30 + periodSeconds: 10 + volumeMounts: + - name: dshm + mountPath: /dev/shm + volumes: + - name: dshm + emptyDir: + medium: Memory + tolerations: + - effect: NoSchedule + key: sku + operator: Equal + value: gpu + - effect: NoSchedule + key: nvidia.com/gpu + operator: Exists + nodeSelector: + pool: phi2 diff --git a/presets/workspace/test/manifests/phi-2/phi-2_vllm.yaml b/presets/workspace/test/manifests/phi-2/phi-2_vllm.yaml new file mode 100644 index 000000000..e77f21268 --- /dev/null +++ b/presets/workspace/test/manifests/phi-2/phi-2_vllm.yaml @@ -0,0 +1,55 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: phi-2 +spec: + replicas: 1 + selector: + matchLabels: + app: phi-2 + template: + metadata: + labels: + app: phi-2 + spec: + containers: + - name: phi-2-container + image: REPO_HERE.azurecr.io/phi-2:TAG_HERE + command: + - /bin/sh + - -c + - python3 /workspace/vllm/inference_api.py --served-model-name test --dtype float16 + resources: + requests: + nvidia.com/gpu: 1 + limits: + nvidia.com/gpu: 1 # Requesting 1 GPU + livenessProbe: + httpGet: + path: /health + port: 5000 + initialDelaySeconds: 600 # 10 Min + periodSeconds: 10 + readinessProbe: + httpGet: + path: /health + port: 5000 + initialDelaySeconds: 30 + periodSeconds: 10 + volumeMounts: + - name: dshm + mountPath: /dev/shm + volumes: + - name: dshm + emptyDir: + medium: Memory + tolerations: + - effect: NoSchedule + key: sku + operator: Equal + value: gpu + - effect: NoSchedule + key: nvidia.com/gpu + operator: Exists + nodeSelector: + pool: phi2 diff --git a/presets/workspace/test/manifests/phi-3-medium-128k-instruct/phi-3-medium-128k-instruct_hf.yaml b/presets/workspace/test/manifests/phi-3-medium-128k-instruct/phi-3-medium-128k-instruct_hf.yaml new file mode 100644 index 000000000..0adb122e4 --- /dev/null +++ b/presets/workspace/test/manifests/phi-3-medium-128k-instruct/phi-3-medium-128k-instruct_hf.yaml @@ -0,0 +1,55 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: phi-3-medium-128k-instruct +spec: + replicas: 1 + selector: + matchLabels: + app: phi-3-medium-128k-instruct + template: + metadata: + labels: + app: phi-3-medium-128k-instruct + spec: + containers: + - name: phi-3-medium-128k-instruct-container + image: REPO_HERE.azurecr.io/phi-3-medium-128k-instruct:TAG_HERE + command: + - /bin/sh + - -c + - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype auto --trust_remote_code + resources: + requests: + nvidia.com/gpu: 1 + limits: + nvidia.com/gpu: 1 # Requesting 1 GPU + livenessProbe: + httpGet: + path: /health + port: 5000 + initialDelaySeconds: 600 # 10 Min + periodSeconds: 10 + readinessProbe: + httpGet: + path: /health + port: 5000 + initialDelaySeconds: 30 + periodSeconds: 10 + volumeMounts: + - name: dshm + mountPath: /dev/shm + volumes: + - name: dshm + emptyDir: + medium: Memory + tolerations: + - effect: NoSchedule + key: sku + operator: Equal + value: gpu + - effect: NoSchedule + key: nvidia.com/gpu + operator: Exists + nodeSelector: + pool: phi3medium12 \ No newline at end of file diff --git a/presets/workspace/test/manifests/phi-3-medium-128k-instruct/phi-3-medium-128k-instruct_vllm.yaml b/presets/workspace/test/manifests/phi-3-medium-128k-instruct/phi-3-medium-128k-instruct_vllm.yaml new file mode 100644 index 000000000..5b93bde50 --- /dev/null +++ b/presets/workspace/test/manifests/phi-3-medium-128k-instruct/phi-3-medium-128k-instruct_vllm.yaml @@ -0,0 +1,55 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: phi-3-medium-128k-instruct +spec: + replicas: 1 + selector: + matchLabels: + app: phi-3-medium-128k-instruct + template: + metadata: + labels: + app: phi-3-medium-128k-instruct + spec: + containers: + - name: phi-3-medium-128k-instruct-container + image: REPO_HERE.azurecr.io/phi-3-medium-128k-instruct:TAG_HERE + command: + - /bin/sh + - -c + - python3 /workspace/vllm/inference_api.py --served-model-name test --dtype float16 --max-model-len 1024 --tensor-parallel-size 2 + resources: + requests: + nvidia.com/gpu: 2 + limits: + nvidia.com/gpu: 2 + livenessProbe: + httpGet: + path: /health + port: 5000 + initialDelaySeconds: 600 # 10 Min + periodSeconds: 10 + readinessProbe: + httpGet: + path: /health + port: 5000 + initialDelaySeconds: 30 + periodSeconds: 10 + volumeMounts: + - name: dshm + mountPath: /dev/shm + volumes: + - name: dshm + emptyDir: + medium: Memory + tolerations: + - effect: NoSchedule + key: sku + operator: Equal + value: gpu + - effect: NoSchedule + key: nvidia.com/gpu + operator: Exists + nodeSelector: + pool: phi3medium12 \ No newline at end of file diff --git a/presets/workspace/test/manifests/phi-3-medium-4k-instruct/phi-3-medium-4k-instruct_hf.yaml b/presets/workspace/test/manifests/phi-3-medium-4k-instruct/phi-3-medium-4k-instruct_hf.yaml new file mode 100644 index 000000000..1d0d64e47 --- /dev/null +++ b/presets/workspace/test/manifests/phi-3-medium-4k-instruct/phi-3-medium-4k-instruct_hf.yaml @@ -0,0 +1,55 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: phi-3-medium-4k-instruct +spec: + replicas: 1 + selector: + matchLabels: + app: phi-3-medium-4k-instruct + template: + metadata: + labels: + app: phi-3-medium-4k-instruct + spec: + containers: + - name: phi-3-medium-4k-instruct-container + image: REPO_HERE.azurecr.io/phi-3-medium-4k-instruct:TAG_HERE + command: + - /bin/sh + - -c + - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype auto --trust_remote_code + resources: + requests: + nvidia.com/gpu: 1 + limits: + nvidia.com/gpu: 1 # Requesting 1 GPU + livenessProbe: + httpGet: + path: /health + port: 5000 + initialDelaySeconds: 600 # 10 Min + periodSeconds: 10 + readinessProbe: + httpGet: + path: /health + port: 5000 + initialDelaySeconds: 30 + periodSeconds: 10 + volumeMounts: + - name: dshm + mountPath: /dev/shm + volumes: + - name: dshm + emptyDir: + medium: Memory + tolerations: + - effect: NoSchedule + key: sku + operator: Equal + value: gpu + - effect: NoSchedule + key: nvidia.com/gpu + operator: Exists + nodeSelector: + pool: phi3medium4k \ No newline at end of file diff --git a/presets/workspace/test/manifests/phi-3-medium-4k-instruct/phi-3-medium-4k-instruct_vllm.yaml b/presets/workspace/test/manifests/phi-3-medium-4k-instruct/phi-3-medium-4k-instruct_vllm.yaml new file mode 100644 index 000000000..3bdce8072 --- /dev/null +++ b/presets/workspace/test/manifests/phi-3-medium-4k-instruct/phi-3-medium-4k-instruct_vllm.yaml @@ -0,0 +1,55 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: phi-3-medium-4k-instruct +spec: + replicas: 1 + selector: + matchLabels: + app: phi-3-medium-4k-instruct + template: + metadata: + labels: + app: phi-3-medium-4k-instruct + spec: + containers: + - name: phi-3-medium-4k-instruct-container + image: REPO_HERE.azurecr.io/phi-3-medium-4k-instruct:TAG_HERE + command: + - /bin/sh + - -c + - python3 /workspace/vllm/inference_api.py --served-model-name test --dtype float16 --tensor-parallel-size 2 + resources: + requests: + nvidia.com/gpu: 2 + limits: + nvidia.com/gpu: 2 + livenessProbe: + httpGet: + path: /health + port: 5000 + initialDelaySeconds: 600 # 10 Min + periodSeconds: 10 + readinessProbe: + httpGet: + path: /health + port: 5000 + initialDelaySeconds: 30 + periodSeconds: 10 + volumeMounts: + - name: dshm + mountPath: /dev/shm + volumes: + - name: dshm + emptyDir: + medium: Memory + tolerations: + - effect: NoSchedule + key: sku + operator: Equal + value: gpu + - effect: NoSchedule + key: nvidia.com/gpu + operator: Exists + nodeSelector: + pool: phi3medium4k \ No newline at end of file diff --git a/presets/workspace/test/manifests/phi-3-mini-128k-instruct/phi-3-mini-128k-instruct_hf.yaml b/presets/workspace/test/manifests/phi-3-mini-128k-instruct/phi-3-mini-128k-instruct_hf.yaml new file mode 100644 index 000000000..cf8898015 --- /dev/null +++ b/presets/workspace/test/manifests/phi-3-mini-128k-instruct/phi-3-mini-128k-instruct_hf.yaml @@ -0,0 +1,55 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: phi-3-mini-128k-instruct +spec: + replicas: 1 + selector: + matchLabels: + app: phi-3-mini-128k-instruct + template: + metadata: + labels: + app: phi-3-mini-128k-instruct + spec: + containers: + - name: phi-3-mini-128k-instruct-container + image: REPO_HERE.azurecr.io/phi-3-mini-128k-instruct:TAG_HERE + command: + - /bin/sh + - -c + - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype auto --trust_remote_code + resources: + requests: + nvidia.com/gpu: 1 + limits: + nvidia.com/gpu: 1 # Requesting 1 GPU + livenessProbe: + httpGet: + path: /health + port: 5000 + initialDelaySeconds: 600 # 10 Min + periodSeconds: 10 + readinessProbe: + httpGet: + path: /health + port: 5000 + initialDelaySeconds: 30 + periodSeconds: 10 + volumeMounts: + - name: dshm + mountPath: /dev/shm + volumes: + - name: dshm + emptyDir: + medium: Memory + tolerations: + - effect: NoSchedule + key: sku + operator: Equal + value: gpu + - effect: NoSchedule + key: nvidia.com/gpu + operator: Exists + nodeSelector: + pool: phi3mini128k \ No newline at end of file diff --git a/presets/workspace/test/manifests/phi-3-mini-128k-instruct/phi-3-mini-128k-instruct_vllm.yaml b/presets/workspace/test/manifests/phi-3-mini-128k-instruct/phi-3-mini-128k-instruct_vllm.yaml new file mode 100644 index 000000000..f719bf96b --- /dev/null +++ b/presets/workspace/test/manifests/phi-3-mini-128k-instruct/phi-3-mini-128k-instruct_vllm.yaml @@ -0,0 +1,55 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: phi-3-mini-128k-instruct +spec: + replicas: 1 + selector: + matchLabels: + app: phi-3-mini-128k-instruct + template: + metadata: + labels: + app: phi-3-mini-128k-instruct + spec: + containers: + - name: phi-3-mini-128k-instruct-container + image: REPO_HERE.azurecr.io/phi-3-mini-128k-instruct:TAG_HERE + command: + - /bin/sh + - -c + - python3 /workspace/vllm/inference_api.py --served-model-name test --dtype float16 + resources: + requests: + nvidia.com/gpu: 1 + limits: + nvidia.com/gpu: 1 # Requesting 1 GPU + livenessProbe: + httpGet: + path: /health + port: 5000 + initialDelaySeconds: 600 # 10 Min + periodSeconds: 10 + readinessProbe: + httpGet: + path: /health + port: 5000 + initialDelaySeconds: 30 + periodSeconds: 10 + volumeMounts: + - name: dshm + mountPath: /dev/shm + volumes: + - name: dshm + emptyDir: + medium: Memory + tolerations: + - effect: NoSchedule + key: sku + operator: Equal + value: gpu + - effect: NoSchedule + key: nvidia.com/gpu + operator: Exists + nodeSelector: + pool: phi3mini128k \ No newline at end of file diff --git a/presets/workspace/test/manifests/phi-3-mini-4k-instruct/phi-3-mini-4k-instruct_hf.yaml b/presets/workspace/test/manifests/phi-3-mini-4k-instruct/phi-3-mini-4k-instruct_hf.yaml new file mode 100644 index 000000000..1d7069a38 --- /dev/null +++ b/presets/workspace/test/manifests/phi-3-mini-4k-instruct/phi-3-mini-4k-instruct_hf.yaml @@ -0,0 +1,55 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: phi-3-mini-4k-instruct +spec: + replicas: 1 + selector: + matchLabels: + app: phi-3-mini-4k-instruct + template: + metadata: + labels: + app: phi-3-mini-4k-instruct + spec: + containers: + - name: phi-3-mini-4k-instruct-container + image: REPO_HERE.azurecr.io/phi-3-mini-4k-instruct:TAG_HERE + command: + - /bin/sh + - -c + - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype auto --trust_remote_code + resources: + requests: + nvidia.com/gpu: 1 + limits: + nvidia.com/gpu: 1 # Requesting 1 GPU + livenessProbe: + httpGet: + path: /health + port: 5000 + initialDelaySeconds: 600 # 10 Min + periodSeconds: 10 + readinessProbe: + httpGet: + path: /health + port: 5000 + initialDelaySeconds: 30 + periodSeconds: 10 + volumeMounts: + - name: dshm + mountPath: /dev/shm + volumes: + - name: dshm + emptyDir: + medium: Memory + tolerations: + - effect: NoSchedule + key: sku + operator: Equal + value: gpu + - effect: NoSchedule + key: nvidia.com/gpu + operator: Exists + nodeSelector: + pool: phi3mini4kin \ No newline at end of file diff --git a/presets/workspace/test/manifests/phi-3-mini-4k-instruct/phi-3-mini-4k-instruct_vllm.yaml b/presets/workspace/test/manifests/phi-3-mini-4k-instruct/phi-3-mini-4k-instruct_vllm.yaml new file mode 100644 index 000000000..8d1275678 --- /dev/null +++ b/presets/workspace/test/manifests/phi-3-mini-4k-instruct/phi-3-mini-4k-instruct_vllm.yaml @@ -0,0 +1,55 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: phi-3-mini-4k-instruct +spec: + replicas: 1 + selector: + matchLabels: + app: phi-3-mini-4k-instruct + template: + metadata: + labels: + app: phi-3-mini-4k-instruct + spec: + containers: + - name: phi-3-mini-4k-instruct-container + image: REPO_HERE.azurecr.io/phi-3-mini-4k-instruct:TAG_HERE + command: + - /bin/sh + - -c + - python3 /workspace/vllm/inference_api.py --served-model-name test --dtype float16 + resources: + requests: + nvidia.com/gpu: 1 + limits: + nvidia.com/gpu: 1 # Requesting 1 GPU + livenessProbe: + httpGet: + path: /health + port: 5000 + initialDelaySeconds: 600 # 10 Min + periodSeconds: 10 + readinessProbe: + httpGet: + path: /health + port: 5000 + initialDelaySeconds: 30 + periodSeconds: 10 + volumeMounts: + - name: dshm + mountPath: /dev/shm + volumes: + - name: dshm + emptyDir: + medium: Memory + tolerations: + - effect: NoSchedule + key: sku + operator: Equal + value: gpu + - effect: NoSchedule + key: nvidia.com/gpu + operator: Exists + nodeSelector: + pool: phi3mini4kin \ No newline at end of file