diff --git a/.github/e2e-preset-configs.json b/.github/e2e-preset-configs.json
index 75c9e6710..972c5e7ad 100644
--- a/.github/e2e-preset-configs.json
+++ b/.github/e2e-preset-configs.json
@@ -4,7 +4,7 @@
       {
         "name": "falcon-7b",
         "node-count": 1,
-        "node-vm-size": "Standard_NC12s_v3",
+        "node-vm-size": "Standard_NC6s_v3",
         "node-osdisk-size": 100,
         "OSS": true,
         "loads_adapter": false
@@ -21,7 +21,7 @@
       {
         "name": "falcon-7b-instruct",
         "node-count": 1,
-        "node-vm-size": "Standard_NC12s_v3",
+        "node-vm-size": "Standard_NC6s_v3",
         "node-osdisk-size": 100,
         "OSS": true,
         "loads_adapter": false
@@ -29,7 +29,7 @@
       {
         "name": "falcon-40b",
         "node-count": 1,
-        "node-vm-size": "Standard_NC24s_v3",
+        "node-vm-size": "Standard_NC48ads_A100_v4",
         "node-osdisk-size": 400,
         "OSS": true,
         "loads_adapter": false
@@ -37,7 +37,7 @@
       {
         "name": "falcon-40b-instruct",
         "node-count": 1,
-        "node-vm-size": "Standard_NC24s_v3",
+        "node-vm-size": "Standard_NC48ads_A100_v4",
         "node-osdisk-size": 400,
         "OSS": true,
         "loads_adapter": false
@@ -45,7 +45,7 @@
       {
         "name": "mistral-7b",
         "node-count": 1,
-        "node-vm-size": "Standard_NC12s_v3",
+        "node-vm-size": "Standard_NC6s_v3",
         "node-osdisk-size": 100,
         "OSS": true,
         "loads_adapter": false
@@ -53,7 +53,7 @@
       {
         "name": "mistral-7b-instruct",
         "node-count": 1,
-        "node-vm-size": "Standard_NC12s_v3",
+        "node-vm-size": "Standard_NC6s_v3",
         "node-osdisk-size": 100,
         "OSS": true,
         "loads_adapter": false
diff --git a/.github/workflows/e2e-preset-test.yml b/.github/workflows/e2e-preset-test.yml
index 574ee54d5..8c79783da 100644
--- a/.github/workflows/e2e-preset-test.yml
+++ b/.github/workflows/e2e-preset-test.yml
@@ -15,12 +15,17 @@ on:
                 type: boolean
                 default: false
                 description: "Test all Phi models for E2E"
+            test-on-vllm:
+                type: boolean
+                default: false
+                description: "Test on VLLM runtime"
 
 env:
     GO_VERSION: "1.22"
     BRANCH_NAME: ${{ github.head_ref || github.ref_name}} 
     FORCE_RUN_ALL: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.force-run-all == 'true' }}
     FORCE_RUN_ALL_PHI:  ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.force-run-all-phi-models== 'true' }}
+    RUNTIME: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.test-on-vllm == 'true') && 'vllm' || 'hf' }}
 
 permissions:
     id-token: write
@@ -229,10 +234,11 @@ jobs:
       
       - name: Replace IP and Deploy Resource to K8s
         run: |
-            sed -i "s/MASTER_ADDR_HERE/${{ steps.get_ip.outputs.SERVICE_IP }}/g" presets/workspace/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}.yaml
-            sed -i "s/TAG_HERE/${{ matrix.model.tag }}/g" presets/workspace/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}.yaml
-            sed -i "s/REPO_HERE/${{ secrets.ACR_AMRT_USERNAME }}/g" presets/workspace/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}.yaml
-            kubectl apply -f presets/workspace/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}.yaml
+            POSTFIX=$(echo "${{ matrix.model.name }}" | grep -q "llama" && echo "" || echo "_${{ env.RUNTIME }}")  # llama manifests have no per-runtime variant; all others resolve to <name>_hf.yaml or <name>_vllm.yaml
+            sed -i "s/MASTER_ADDR_HERE/${{ steps.get_ip.outputs.SERVICE_IP }}/g" presets/workspace/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}$POSTFIX.yaml
+            sed -i "s/TAG_HERE/${{ matrix.model.tag }}/g" presets/workspace/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}$POSTFIX.yaml
+            sed -i "s/REPO_HERE/${{ secrets.ACR_AMRT_USERNAME }}/g" presets/workspace/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}$POSTFIX.yaml
+            kubectl apply -f presets/workspace/test/manifests/${{ matrix.model.name }}/${{ matrix.model.name }}$POSTFIX.yaml
 
       - name: Wait for Resource to be ready
         run: |
@@ -243,20 +249,27 @@ jobs:
         run: |
             POD_NAME=$(kubectl get pods -l app=${{ matrix.model.name }} -o jsonpath="{.items[0].metadata.name}")
             kubectl logs $POD_NAME | grep "Adapter added:" | grep "${{ matrix.model.expected_adapter }}" || (echo "Adapter not loaded or incorrect adapter loaded" && exit 1)
-          
-      - name: Test home endpoint
+
+      - name: Install testing commands
         run: |
-            curl http://${{ steps.get_ip.outputs.SERVICE_IP }}:80/
+            kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- apt-get update  # refresh package lists inside the model container
+            kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- apt-get install -y curl  # the endpoint tests run curl inside the container against localhost:5000
 
       - name: Test healthz endpoint
         run: |
-            curl http://${{ steps.get_ip.outputs.SERVICE_IP }}:80/healthz
-    
+            if [[ "${{ matrix.model.name }}" == *"llama"* ]]; then  # llama images are probed on /healthz, every other preset on /health
+                kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- \
+                curl -fsS http://localhost:5000/healthz  # -f: non-zero exit on HTTP >= 400 so an unhealthy server fails the step; -S keeps error output despite -s
+            else
+                kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- \
+                curl -fsS http://localhost:5000/health  # -f: non-zero exit on HTTP >= 400 so an unhealthy server fails the step; -S keeps error output despite -s
+            fi
       - name: Test inference endpoint
         run: |
+            echo "Testing inference for ${{ matrix.model.name }}"
             if [[ "${{ matrix.model.name }}" == *"llama"* && "${{ matrix.model.name }}" == *"-chat"* ]]; then
-                echo "Testing inference for ${{ matrix.model.name }}"
-                curl -X POST \
+                kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- \
+                curl -s -X POST \
                 -H "Content-Type: application/json" \
                 -d '{
                     "input_data": {
@@ -274,10 +287,10 @@ jobs:
                         ]
                     }
                 }' \
-                http://${{ steps.get_ip.outputs.SERVICE_IP }}:80/chat
+                http://localhost:5000/chat
             elif [[ "${{ matrix.model.name }}" == *"llama"* ]]; then
-                echo "Testing inference for ${{ matrix.model.name }}"
-                curl -X POST \
+                kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- \
+                curl -s -X POST \
                 -H "Content-Type: application/json" \
                 -d '{
                     "prompts": [
@@ -290,10 +303,29 @@ jobs:
                         "max_gen_len": 128
                     }
                 }' \
-                http://${{ steps.get_ip.outputs.SERVICE_IP }}:80/generate
+                http://localhost:5000/generate
+            elif [[ "${{ env.RUNTIME }}" == *"vllm"* ]]; then
+                kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- \
+                curl -s -X POST \
+                -H "accept: application/json" \
+                -H "Content-Type: application/json" \
+                -d '{
+                    "model": "test",
+                    "messages": [
+                    {
+                        "role": "system",
+                        "content": "You are a helpful assistant."
+                    },
+                    {
+                        "role": "user",
+                        "content": "Hello!"
+                    }
+                    ]
+                    }' \
+                http://localhost:5000/v1/chat/completions
             else
-                echo "Testing inference for ${{ matrix.model.name }}"
-                curl -X POST \
+                kubectl exec ${{steps.resource.outputs.RESOURCE_TYPE}}/${{ matrix.model.name }} -- \
+                curl -s -X POST \
                 -H "accept: application/json" \
                 -H "Content-Type: application/json" \
                 -d '{
@@ -327,7 +359,7 @@ jobs:
                             "remove_invalid_values":null
                         }
                     }' \
-                http://${{ steps.get_ip.outputs.SERVICE_IP }}:80/chat                
+                http://localhost:5000/chat
             fi
       
       - name: Cleanup
@@ -340,6 +372,7 @@ jobs:
                 
                 # Check and Delete K8s Resource (Deployment or StatefulSet)
                 if kubectl get $RESOURCE_TYPE ${{ matrix.model.name }} > /dev/null 2>&1; then
+                    kubectl logs $RESOURCE_TYPE/${{ matrix.model.name }} || true  # best-effort log dump: a failure here (e.g. container never started) must not abort the script before the delete below
                     kubectl delete $RESOURCE_TYPE ${{ matrix.model.name }}
                 fi
             fi
diff --git a/.github/workflows/kind-cluster/determine_models.py b/.github/workflows/kind-cluster/determine_models.py
index 2ef84f1e0..555c1bc27 100644
--- a/.github/workflows/kind-cluster/determine_models.py
+++ b/.github/workflows/kind-cluster/determine_models.py
@@ -21,7 +21,7 @@ def read_yaml(file_path):
 YAML_PR = read_yaml(supp_models_yaml)
 # Format: {falcon-7b : {model_name:falcon-7b, type:text-generation, version: #, tag: #}}
 MODELS = {model['name']: model for model in YAML_PR['models']}
-KAITO_REPO_URL = "https://github.com/kaito-repo/kaito.git"
+KAITO_REPO_URL = "https://github.com/kaito-project/kaito.git"
 
 def set_multiline_output(name, value):
     with open(os.environ['GITHUB_OUTPUT'], 'a') as fh:
diff --git a/presets/workspace/models/supported_models.yaml b/presets/workspace/models/supported_models.yaml
index db49641b8..74044220f 100644
--- a/presets/workspace/models/supported_models.yaml
+++ b/presets/workspace/models/supported_models.yaml
@@ -34,13 +34,14 @@ models:
     type: text-generation
     version: https://huggingface.co/tiiuae/falcon-7b/commit/898df1396f35e447d5fe44e0a3ccaaaa69f30d36
     runtime: tfs
-    tag: 0.0.6
+    tag: 0.0.7
   - name: falcon-7b-instruct
     type: text-generation
     version: https://huggingface.co/tiiuae/falcon-7b-instruct/commit/cf4b3c42ce2fdfe24f753f0f0d179202fea59c99
     runtime: tfs
-    tag: 0.0.6
+    tag: 0.0.7
     # Tag history:
+    # 0.0.7 - Support VLLM runtime
     # 0.0.6 - Add Logging & Metrics Server
     # 0.0.5 - Tuning and Adapters
     # 0.0.4 - Adjust default model params (#310)
@@ -51,13 +52,14 @@ models:
     type: text-generation
     version: https://huggingface.co/tiiuae/falcon-40b/commit/4a70170c215b36a3cce4b4253f6d0612bb7d4146
     runtime: tfs
-    tag: 0.0.7
+    tag: 0.0.8
   - name: falcon-40b-instruct
     type: text-generation
     version: https://huggingface.co/tiiuae/falcon-40b-instruct/commit/ecb78d97ac356d098e79f0db222c9ce7c5d9ee5f
     runtime: tfs
-    tag: 0.0.7
+    tag: 0.0.8
     # Tag history for 40b models:
+    # 0.0.8 - Support VLLM runtime
     # 0.0.7 - Add Logging & Metrics Server
     # 0.0.6 - Tuning and Adapters
     # 0.0.5 - Adjust default model params (#310)
@@ -69,15 +71,16 @@ models:
   # Mistral
   - name: mistral-7b
     type: text-generation 
-    version: https://huggingface.co/mistralai/Mistral-7B-v0.3/commit/c882233d224d27b727b3d9299b12a9aab9dda6f7
+    version: https://huggingface.co/mistralai/Mistral-7B-v0.3/commit/d8cadc02ac76bd617a919d50b092e59d2d110aff
     runtime: tfs
-    tag: 0.0.7
+    tag: 0.0.8
   - name: mistral-7b-instruct
     type: text-generation
-    version: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3/commit/0417f4babd26db0b5ed07c1d0bc85658ab526ea3
+    version: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3/commit/e0bc86c23ce5aae1db576c8cca6f06f1f73af2db
     runtime: tfs
-    tag: 0.0.7
+    tag: 0.0.8
     # Tag history:
+    # 0.0.8 - Support VLLM runtime
     # 0.0.7 - Add Logging & Metrics Server
     # 0.0.6 - Update model version and Address missing weights files fix
     # 0.0.5 - Tuning and Adapters
@@ -89,10 +92,11 @@ models:
   # Phi-2
   - name: phi-2
     type: text-generation 
-    version: https://huggingface.co/microsoft/phi-2/commit/b10c3eba545ad279e7208ee3a5d644566f001670
+    version: https://huggingface.co/microsoft/phi-2/commit/ef382358ec9e382308935a992d908de099b64c23
     runtime: tfs
-    tag: 0.0.5
+    tag: 0.0.6
     # Tag history:
+    # 0.0.6 - Support VLLM runtime
     # 0.0.5 - Add Logging & Metrics Server
     # 0.0.4 - Tuning and Adapters
     # 0.0.3 - Adjust default model params (#310)
@@ -102,36 +106,49 @@ models:
   # Phi-3
   - name: phi-3-mini-4k-instruct
     type: text-generation 
-    version: https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/commit/d269012bea6fbe38ce7752c8940fea010eea3383
+    version: https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/commit/0a67737cc96d2554230f90338b163bc6380a2a85
     runtime: tfs
-    tag: 0.0.2
+    tag: 0.0.3
     # Tag history:
+    # 0.0.3 - Support VLLM runtime
     # 0.0.2 - Add Logging & Metrics Server
     # 0.0.1 - Initial Release
 
   - name: phi-3-mini-128k-instruct
     type: text-generation 
-    version: https://huggingface.co/microsoft/Phi-3-mini-128k-instruct/commit/5be6479b4bc06a081e8f4c6ece294241ccd32dec
+    version: https://huggingface.co/microsoft/Phi-3-mini-128k-instruct/commit/a90b62ae09941edff87a90ced39ba5807e6b2ade
     runtime: tfs
-    tag: 0.0.2
+    tag: 0.0.3
     # Tag history:
+    # 0.0.3 - Support VLLM runtime
     # 0.0.2 - Add Logging & Metrics Server
     # 0.0.1 - Initial Release
 
   - name: phi-3-medium-4k-instruct
     type: text-generation
-    version: https://huggingface.co/microsoft/Phi-3-medium-4k-instruct/commit/d194e4e74ffad5a5e193e26af25bcfc80c7f1ffc
+    version: https://huggingface.co/microsoft/Phi-3-medium-4k-instruct/commit/ae004ae82eb6eddc32906dfacb1d6dfea8f91996
     runtime: tfs
-    tag: 0.0.2
+    tag: 0.0.3
     # Tag history:
+    # 0.0.3 - Support VLLM runtime
     # 0.0.2 - Add Logging & Metrics Server
     # 0.0.1 - Initial Release
 
   - name: phi-3-medium-128k-instruct
     type: text-generation
-    version: https://huggingface.co/microsoft/Phi-3-medium-128k-instruct/commit/cae1d42b5577398fd1be9f0746052562ae552886
+    version: https://huggingface.co/microsoft/Phi-3-medium-128k-instruct/commit/fa7d2aa4f5ea69b2e36b20d050cdae79c9bfbb3f
     runtime: tfs
-    tag: 0.0.2
+    tag: 0.0.3
     # Tag history:
+    # 0.0.3 - Support VLLM runtime
     # 0.0.2 - Add Logging & Metrics Server
-    # 0.0.1 - Initial Release
\ No newline at end of file
+    # 0.0.1 - Initial Release
+
+  - name: phi-3.5-mini-instruct
+    type: text-generation
+    version: https://huggingface.co/microsoft/Phi-3.5-mini-instruct/commit/af0dfb8029e8a74545d0736d30cb6b58d2f0f3f0
+    runtime: tfs
+    tag: 0.0.1
+    # Tag history:
+    # 0.0.1 - New Model! Support VLLM Runtime
+  
\ No newline at end of file
diff --git a/presets/workspace/test/manifests/falcon-40b-instruct/falcon-40b-instruct.yaml b/presets/workspace/test/manifests/falcon-40b-instruct/falcon-40b-instruct.yaml
index 37f3c6a6b..a44043894 100644
--- a/presets/workspace/test/manifests/falcon-40b-instruct/falcon-40b-instruct.yaml
+++ b/presets/workspace/test/manifests/falcon-40b-instruct/falcon-40b-instruct.yaml
@@ -22,9 +22,9 @@ spec:
           - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16
         resources:
           requests:
-            nvidia.com/gpu: 4  # Requesting 4 GPUs
+            nvidia.com/gpu: 2
           limits:
-            nvidia.com/gpu: 4
+            nvidia.com/gpu: 2
         livenessProbe:
           httpGet:
             path: /health
diff --git a/presets/workspace/test/manifests/falcon-40b-instruct/falcon-40b-instruct_hf.yaml b/presets/workspace/test/manifests/falcon-40b-instruct/falcon-40b-instruct_hf.yaml
new file mode 100644
index 000000000..a44043894
--- /dev/null
+++ b/presets/workspace/test/manifests/falcon-40b-instruct/falcon-40b-instruct_hf.yaml
@@ -0,0 +1,56 @@
+apiVersion: apps/v1  # E2E test manifest: falcon-40b-instruct served through /workspace/tfs/inference_api.py (the "_hf" runtime variant)
+kind: Deployment
+metadata:
+  name: falcon-40b-instruct
+spec:
+  progressDeadlineSeconds: 1800  # 30 min allowed for the rollout to make progress
+  replicas: 1
+  selector:
+    matchLabels:
+      app: falcon
+  template:
+    metadata:
+      labels:
+        app: falcon
+    spec:
+      containers:
+      - name: falcon-container
+        image: REPO_HERE.azurecr.io/falcon-40b-instruct:TAG_HERE  # REPO_HERE / TAG_HERE are placeholders replaced via sed by the e2e workflow before apply
+        command:
+          - /bin/sh
+          - -c
+          - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16
+        resources:
+          requests:
+            nvidia.com/gpu: 2
+          limits:
+            nvidia.com/gpu: 2  # kept equal to the request
+        livenessProbe:
+          httpGet:
+            path: /health
+            port: 5000
+          initialDelaySeconds: 600 # 10 min before the first liveness check
+          periodSeconds: 10
+        readinessProbe:
+          httpGet:
+            path: /health
+            port: 5000
+          initialDelaySeconds: 30  # readiness checks start after 30 s
+          periodSeconds: 10
+        volumeMounts:
+        - name: dshm
+          mountPath: /dev/shm  # mounts the "dshm" volume declared under volumes
+      volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory  # emptyDir held in RAM rather than on node disk
+      tolerations:
+      - effect: NoSchedule
+        key: sku
+        operator: Equal
+        value: gpu  # tolerates the sku=gpu:NoSchedule taint
+      - effect: NoSchedule
+        key: nvidia.com/gpu
+        operator: Exists  # tolerates any nvidia.com/gpu NoSchedule taint
+      nodeSelector:
+        pool: falcon40bins  # NOTE(review): presumably the label of the node pool provisioned for this model — confirm
diff --git a/presets/workspace/test/manifests/falcon-40b-instruct/falcon-40b-instruct_vllm.yaml b/presets/workspace/test/manifests/falcon-40b-instruct/falcon-40b-instruct_vllm.yaml
new file mode 100644
index 000000000..7b40cbac4
--- /dev/null
+++ b/presets/workspace/test/manifests/falcon-40b-instruct/falcon-40b-instruct_vllm.yaml
@@ -0,0 +1,56 @@
+apiVersion: apps/v1  # E2E test manifest: falcon-40b-instruct served through /workspace/vllm/inference_api.py (the "_vllm" runtime variant)
+kind: Deployment
+metadata:
+  name: falcon-40b-instruct
+spec:
+  progressDeadlineSeconds: 1800  # 30 min allowed for the rollout to make progress
+  replicas: 1
+  selector:
+    matchLabels:
+      app: falcon
+  template:
+    metadata:
+      labels:
+        app: falcon
+    spec:
+      containers:
+      - name: falcon-container
+        image: REPO_HERE.azurecr.io/falcon-40b-instruct:TAG_HERE  # REPO_HERE / TAG_HERE are placeholders replaced via sed by the e2e workflow before apply
+        command:  # --served-model-name test matches the "model" field the e2e workflow sends to /v1/chat/completions
+          - /bin/sh
+          - -c
+          - python3 /workspace/vllm/inference_api.py --served-model-name test --dtype bfloat16 --chat-template /workspace/chat_templates/falcon-instruct.jinja --tensor-parallel-size 2
+        resources:
+          requests:
+            nvidia.com/gpu: 2
+          limits:
+            nvidia.com/gpu: 2  # same count as --tensor-parallel-size 2 in the command
+        livenessProbe:
+          httpGet:
+            path: /health
+            port: 5000
+          initialDelaySeconds: 600 # 10 min before the first liveness check
+          periodSeconds: 10
+        readinessProbe:
+          httpGet:
+            path: /health
+            port: 5000
+          initialDelaySeconds: 30  # readiness checks start after 30 s
+          periodSeconds: 10
+        volumeMounts:
+        - name: dshm
+          mountPath: /dev/shm  # mounts the "dshm" volume declared under volumes
+      volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory  # emptyDir held in RAM rather than on node disk
+      tolerations:
+      - effect: NoSchedule
+        key: sku
+        operator: Equal
+        value: gpu  # tolerates the sku=gpu:NoSchedule taint
+      - effect: NoSchedule
+        key: nvidia.com/gpu
+        operator: Exists  # tolerates any nvidia.com/gpu NoSchedule taint
+      nodeSelector:
+        pool: falcon40bins  # NOTE(review): presumably the label of the node pool provisioned for this model — confirm
diff --git a/presets/workspace/test/manifests/falcon-40b/falcon-40b.yaml b/presets/workspace/test/manifests/falcon-40b/falcon-40b.yaml
index a1c11af0e..514d12e60 100644
--- a/presets/workspace/test/manifests/falcon-40b/falcon-40b.yaml
+++ b/presets/workspace/test/manifests/falcon-40b/falcon-40b.yaml
@@ -22,9 +22,9 @@ spec:
           - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16
         resources:
           requests:
-            nvidia.com/gpu: 4  # Requesting 4 GPUs
+            nvidia.com/gpu: 2
           limits:
-            nvidia.com/gpu: 4
+            nvidia.com/gpu: 2
         livenessProbe:
           httpGet:
             path: /health
diff --git a/presets/workspace/test/manifests/falcon-40b/falcon-40b_hf.yaml b/presets/workspace/test/manifests/falcon-40b/falcon-40b_hf.yaml
new file mode 100644
index 000000000..514d12e60
--- /dev/null
+++ b/presets/workspace/test/manifests/falcon-40b/falcon-40b_hf.yaml
@@ -0,0 +1,56 @@
+apiVersion: apps/v1  # E2E test manifest: falcon-40b served through /workspace/tfs/inference_api.py (the "_hf" runtime variant)
+kind: Deployment
+metadata:
+  name: falcon-40b
+spec:
+  progressDeadlineSeconds: 1800  # 30 min allowed for the rollout to make progress
+  replicas: 1
+  selector:
+    matchLabels:
+      app: falcon
+  template:
+    metadata:
+      labels:
+        app: falcon
+    spec:
+      containers:
+      - name: falcon-container
+        image: REPO_HERE.azurecr.io/falcon-40b:TAG_HERE  # REPO_HERE / TAG_HERE are placeholders replaced via sed by the e2e workflow before apply
+        command:
+          - /bin/sh
+          - -c
+          - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16
+        resources:
+          requests:
+            nvidia.com/gpu: 2
+          limits:
+            nvidia.com/gpu: 2  # kept equal to the request
+        livenessProbe:
+          httpGet:
+            path: /health
+            port: 5000
+          initialDelaySeconds: 600 # 10 min before the first liveness check
+          periodSeconds: 10
+        readinessProbe:
+          httpGet:
+            path: /health
+            port: 5000
+          initialDelaySeconds: 30  # readiness checks start after 30 s
+          periodSeconds: 10
+        volumeMounts:
+        - name: dshm
+          mountPath: /dev/shm  # mounts the "dshm" volume declared under volumes
+      volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory  # emptyDir held in RAM rather than on node disk
+      tolerations:
+      - effect: NoSchedule
+        key: sku
+        operator: Equal
+        value: gpu  # tolerates the sku=gpu:NoSchedule taint
+      - effect: NoSchedule
+        key: nvidia.com/gpu
+        operator: Exists  # tolerates any nvidia.com/gpu NoSchedule taint
+      nodeSelector:
+        pool: falcon40b  # NOTE(review): presumably the label of the node pool provisioned for this model — confirm
diff --git a/presets/workspace/test/manifests/falcon-40b/falcon-40b_vllm.yaml b/presets/workspace/test/manifests/falcon-40b/falcon-40b_vllm.yaml
new file mode 100644
index 000000000..7e74ac7a7
--- /dev/null
+++ b/presets/workspace/test/manifests/falcon-40b/falcon-40b_vllm.yaml
@@ -0,0 +1,56 @@
+apiVersion: apps/v1  # E2E test manifest: falcon-40b served through /workspace/vllm/inference_api.py (the "_vllm" runtime variant)
+kind: Deployment
+metadata:
+  name: falcon-40b
+spec:
+  progressDeadlineSeconds: 1800  # 30 min allowed for the rollout to make progress
+  replicas: 1
+  selector:
+    matchLabels:
+      app: falcon
+  template:
+    metadata:
+      labels:
+        app: falcon
+    spec:
+      containers:
+      - name: falcon-container
+        image: REPO_HERE.azurecr.io/falcon-40b:TAG_HERE  # REPO_HERE / TAG_HERE are placeholders replaced via sed by the e2e workflow before apply
+        command:  # --served-model-name test matches the "model" field the e2e workflow sends to /v1/chat/completions
+          - /bin/sh
+          - -c
+          - python3 /workspace/vllm/inference_api.py --served-model-name test --dtype bfloat16 --chat-template /workspace/chat_templates/falcon-instruct.jinja --tensor-parallel-size 2
+        resources:
+          requests:
+            nvidia.com/gpu: 2
+          limits:
+            nvidia.com/gpu: 2  # same count as --tensor-parallel-size 2 in the command
+        livenessProbe:
+          httpGet:
+            path: /health
+            port: 5000
+          initialDelaySeconds: 600 # 10 min before the first liveness check
+          periodSeconds: 10
+        readinessProbe:
+          httpGet:
+            path: /health
+            port: 5000
+          initialDelaySeconds: 30  # readiness checks start after 30 s
+          periodSeconds: 10
+        volumeMounts:
+        - name: dshm
+          mountPath: /dev/shm  # mounts the "dshm" volume declared under volumes
+      volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory  # emptyDir held in RAM rather than on node disk
+      tolerations:
+      - effect: NoSchedule
+        key: sku
+        operator: Equal
+        value: gpu  # tolerates the sku=gpu:NoSchedule taint
+      - effect: NoSchedule
+        key: nvidia.com/gpu
+        operator: Exists  # tolerates any nvidia.com/gpu NoSchedule taint
+      nodeSelector:
+        pool: falcon40b  # NOTE(review): presumably the label of the node pool provisioned for this model — confirm
diff --git a/presets/workspace/test/manifests/falcon-7b-instruct/falcon-7b-instruct.yaml b/presets/workspace/test/manifests/falcon-7b-instruct/falcon-7b-instruct.yaml
index cbf7f6f7f..399b78a3c 100644
--- a/presets/workspace/test/manifests/falcon-7b-instruct/falcon-7b-instruct.yaml
+++ b/presets/workspace/test/manifests/falcon-7b-instruct/falcon-7b-instruct.yaml
@@ -21,9 +21,9 @@ spec:
           - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16
         resources:
           requests:
-            nvidia.com/gpu: 2
+            nvidia.com/gpu: 1
           limits:
-            nvidia.com/gpu: 2  # Requesting 2 GPUs
+            nvidia.com/gpu: 1
         livenessProbe:
           httpGet:
             path: /health
diff --git a/presets/workspace/test/manifests/falcon-7b-instruct/falcon-7b-instruct_hf.yaml b/presets/workspace/test/manifests/falcon-7b-instruct/falcon-7b-instruct_hf.yaml
new file mode 100644
index 000000000..1b2092b36
--- /dev/null
+++ b/presets/workspace/test/manifests/falcon-7b-instruct/falcon-7b-instruct_hf.yaml
@@ -0,0 +1,56 @@
+apiVersion: apps/v1  # E2E test manifest: falcon-7b-instruct served through /workspace/tfs/inference_api.py (the "_hf" runtime variant)
+kind: Deployment
+metadata:
+  name: falcon-7b-instruct
+spec:
+  progressDeadlineSeconds: 1200  # 20 min allowed for the rollout to make progress
+  replicas: 1
+  selector:
+    matchLabels:
+      app: falcon
+  template:
+    metadata:
+      labels:
+        app: falcon
+    spec:
+      containers:
+      - name: falcon-container
+        image: REPO_HERE.azurecr.io/falcon-7b-instruct:TAG_HERE  # REPO_HERE / TAG_HERE are placeholders replaced via sed by the e2e workflow before apply
+        command:
+          - /bin/sh
+          - -c
+          - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16
+        resources:
+          requests:
+            nvidia.com/gpu: 1
+          limits:
+            nvidia.com/gpu: 1  # kept equal to the request
+        livenessProbe:
+          httpGet:
+            path: /health
+            port: 5000
+          initialDelaySeconds: 600 # 10 min before the first liveness check
+          periodSeconds: 10
+        readinessProbe:
+          httpGet:
+            path: /health
+            port: 5000
+          initialDelaySeconds: 30  # readiness checks start after 30 s
+          periodSeconds: 10
+        volumeMounts:
+        - name: dshm
+          mountPath: /dev/shm  # mounts the "dshm" volume declared under volumes
+      volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory  # emptyDir held in RAM rather than on node disk
+      tolerations:
+      - effect: NoSchedule
+        key: sku
+        operator: Equal
+        value: gpu  # tolerates the sku=gpu:NoSchedule taint
+      - effect: NoSchedule
+        key: nvidia.com/gpu
+        operator: Exists  # tolerates any nvidia.com/gpu NoSchedule taint
+      nodeSelector:
+        pool: falcon7binst  # NOTE(review): presumably the label of the node pool provisioned for this model — confirm
diff --git a/presets/workspace/test/manifests/falcon-7b-instruct/falcon-7b-instruct_vllm.yaml b/presets/workspace/test/manifests/falcon-7b-instruct/falcon-7b-instruct_vllm.yaml
new file mode 100644
index 000000000..4019d64f5
--- /dev/null
+++ b/presets/workspace/test/manifests/falcon-7b-instruct/falcon-7b-instruct_vllm.yaml
@@ -0,0 +1,56 @@
+apiVersion: apps/v1  # E2E test manifest: falcon-7b-instruct served through /workspace/vllm/inference_api.py (the "_vllm" runtime variant)
+kind: Deployment
+metadata:
+  name: falcon-7b-instruct
+spec:
+  progressDeadlineSeconds: 1200  # 20 min allowed for the rollout to make progress
+  replicas: 1
+  selector:
+    matchLabels:
+      app: falcon
+  template:
+    metadata:
+      labels:
+        app: falcon
+    spec:
+      containers:
+      - name: falcon-container
+        image: REPO_HERE.azurecr.io/falcon-7b-instruct:TAG_HERE  # REPO_HERE / TAG_HERE are placeholders replaced via sed by the e2e workflow before apply
+        command:  # --served-model-name test matches the "model" field the e2e workflow sends to /v1/chat/completions
+          - /bin/sh
+          - -c
+          - python3 /workspace/vllm/inference_api.py --served-model-name test --dtype float16 --chat-template /workspace/chat_templates/falcon-instruct.jinja
+        resources:
+          requests:
+            nvidia.com/gpu: 1
+          limits:
+            nvidia.com/gpu: 1  # kept equal to the request
+        livenessProbe:
+          httpGet:
+            path: /health
+            port: 5000
+          initialDelaySeconds: 600 # 10 min before the first liveness check
+          periodSeconds: 10
+        readinessProbe:
+          httpGet:
+            path: /health
+            port: 5000
+          initialDelaySeconds: 30  # readiness checks start after 30 s
+          periodSeconds: 10
+        volumeMounts:
+        - name: dshm
+          mountPath: /dev/shm  # mounts the "dshm" volume declared under volumes
+      volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory  # emptyDir held in RAM rather than on node disk
+      tolerations:
+      - effect: NoSchedule
+        key: sku
+        operator: Equal
+        value: gpu  # tolerates the sku=gpu:NoSchedule taint
+      - effect: NoSchedule
+        key: nvidia.com/gpu
+        operator: Exists  # tolerates any nvidia.com/gpu NoSchedule taint
+      nodeSelector:
+        pool: falcon7binst  # NOTE(review): presumably the label of the node pool provisioned for this model — confirm
diff --git a/presets/workspace/test/manifests/falcon-7b/falcon-7b.yaml b/presets/workspace/test/manifests/falcon-7b/falcon-7b.yaml
index f985124ea..8e5786c6e 100644
--- a/presets/workspace/test/manifests/falcon-7b/falcon-7b.yaml
+++ b/presets/workspace/test/manifests/falcon-7b/falcon-7b.yaml
@@ -21,9 +21,9 @@ spec:
           - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16
         resources:
           requests:
-            nvidia.com/gpu: 2
+            nvidia.com/gpu: 1
           limits:
-            nvidia.com/gpu: 2  # Requesting 2 GPUs
+            nvidia.com/gpu: 1
         livenessProbe:
           httpGet:
             path: /health
diff --git a/presets/workspace/test/manifests/falcon-7b/falcon-7b_hf.yaml b/presets/workspace/test/manifests/falcon-7b/falcon-7b_hf.yaml
new file mode 100644
index 000000000..56a775fff
--- /dev/null
+++ b/presets/workspace/test/manifests/falcon-7b/falcon-7b_hf.yaml
@@ -0,0 +1,56 @@
+apiVersion: apps/v1  # E2E test manifest: falcon-7b served through /workspace/tfs/inference_api.py (the "_hf" runtime variant)
+kind: Deployment
+metadata:
+  name: falcon-7b
+spec:
+  progressDeadlineSeconds: 1200  # 20 min allowed for the rollout to make progress
+  replicas: 1
+  selector:
+    matchLabels:
+      app: falcon
+  template:
+    metadata:
+      labels:
+        app: falcon
+    spec:
+      containers:
+      - name: falcon-container
+        image: REPO_HERE.azurecr.io/falcon-7b:TAG_HERE  # REPO_HERE / TAG_HERE are placeholders replaced via sed by the e2e workflow before apply
+        command:
+          - /bin/sh
+          - -c
+          - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16
+        resources:
+          requests:
+            nvidia.com/gpu: 1
+          limits:
+            nvidia.com/gpu: 1  # kept equal to the request
+        livenessProbe:
+          httpGet:
+            path: /health
+            port: 5000
+          initialDelaySeconds: 600 # 10 min before the first liveness check
+          periodSeconds: 10
+        readinessProbe:
+          httpGet:
+            path: /health
+            port: 5000
+          initialDelaySeconds: 30  # readiness checks start after 30 s
+          periodSeconds: 10
+        volumeMounts:
+        - name: dshm
+          mountPath: /dev/shm  # mounts the "dshm" volume declared under volumes
+      volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory  # emptyDir held in RAM rather than on node disk
+      tolerations:
+      - effect: NoSchedule
+        key: sku
+        operator: Equal
+        value: gpu  # tolerates the sku=gpu:NoSchedule taint
+      - effect: NoSchedule
+        key: nvidia.com/gpu
+        operator: Exists  # tolerates any nvidia.com/gpu NoSchedule taint
+      nodeSelector:
+        pool: falcon7b  # NOTE(review): presumably the label of the node pool provisioned for this model — confirm
diff --git a/presets/workspace/test/manifests/falcon-7b/falcon-7b_vllm.yaml b/presets/workspace/test/manifests/falcon-7b/falcon-7b_vllm.yaml
new file mode 100644
index 000000000..bceb14560
--- /dev/null
+++ b/presets/workspace/test/manifests/falcon-7b/falcon-7b_vllm.yaml
@@ -0,0 +1,56 @@
+apiVersion: apps/v1  # E2E test manifest: falcon-7b served through /workspace/vllm/inference_api.py (the "_vllm" runtime variant)
+kind: Deployment
+metadata:
+  name: falcon-7b
+spec:
+  progressDeadlineSeconds: 1200  # 20 min allowed for the rollout to make progress
+  replicas: 1
+  selector:
+    matchLabels:
+      app: falcon
+  template:
+    metadata:
+      labels:
+        app: falcon
+    spec:
+      containers:
+      - name: falcon-container
+        image: REPO_HERE.azurecr.io/falcon-7b:TAG_HERE  # REPO_HERE / TAG_HERE are placeholders replaced via sed by the e2e workflow before apply
+        command:  # --served-model-name test matches the "model" field the e2e workflow sends to /v1/chat/completions
+          - /bin/sh
+          - -c
+          - python3 /workspace/vllm/inference_api.py --served-model-name test --dtype float16 --chat-template /workspace/chat_templates/falcon-instruct.jinja
+        resources:
+          requests:
+            nvidia.com/gpu: 1
+          limits:
+            nvidia.com/gpu: 1  # kept equal to the request
+        livenessProbe:
+          httpGet:
+            path: /health
+            port: 5000
+          initialDelaySeconds: 600 # 10 min before the first liveness check
+          periodSeconds: 10
+        readinessProbe:
+          httpGet:
+            path: /health
+            port: 5000
+          initialDelaySeconds: 30  # readiness checks start after 30 s
+          periodSeconds: 10
+        volumeMounts:
+        - name: dshm
+          mountPath: /dev/shm  # mounts the "dshm" volume declared under volumes
+      volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory  # emptyDir held in RAM rather than on node disk
+      tolerations:
+      - effect: NoSchedule
+        key: sku
+        operator: Equal
+        value: gpu  # tolerates the sku=gpu:NoSchedule taint
+      - effect: NoSchedule
+        key: nvidia.com/gpu
+        operator: Exists  # tolerates any nvidia.com/gpu NoSchedule taint
+      nodeSelector:
+        pool: falcon7b  # NOTE(review): presumably the label of the node pool provisioned for this model — confirm
diff --git a/presets/workspace/test/manifests/llama-2-13b-chat/llama-2-13b-chat.yaml b/presets/workspace/test/manifests/llama-2-13b-chat/llama-2-13b-chat.yaml
index 61a309821..973f6d238 100644
--- a/presets/workspace/test/manifests/llama-2-13b-chat/llama-2-13b-chat.yaml
+++ b/presets/workspace/test/manifests/llama-2-13b-chat/llama-2-13b-chat.yaml
@@ -35,7 +35,7 @@ spec:
             - |
               echo "MASTER_ADDR: $MASTER_ADDR"
               NODE_RANK=$(echo $HOSTNAME | grep -o '[^-]*$')
-              cd /workspace/llama/llama-2 && torchrun --nnodes 2 --nproc_per_node 1 --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port 29500 /workspace/tfs/inference_api.py
+              cd /workspace/llama/llama-2 && torchrun --nnodes 2 --nproc_per_node 1 --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port 29500 inference_api.py
           resources:
             limits:
               nvidia.com/gpu: "1"
@@ -43,13 +43,13 @@ spec:
               nvidia.com/gpu: "1"
           livenessProbe:
             httpGet:
-              path: /health
+              path: /healthz  # NOTE(review): presumably the path served by the inference_api.py started from /workspace/llama/llama-2 - confirm
               port: 5000
             initialDelaySeconds: 600 # 10 Min
             periodSeconds: 10
           readinessProbe:
             httpGet:
-              path: /health
+              path: /healthz  # same endpoint as the liveness probe
               port: 5000
             initialDelaySeconds: 30
             periodSeconds: 10
diff --git a/presets/workspace/test/manifests/llama-2-13b/llama-2-13b.yaml b/presets/workspace/test/manifests/llama-2-13b/llama-2-13b.yaml
index daff4cd0a..46c609bbb 100644
--- a/presets/workspace/test/manifests/llama-2-13b/llama-2-13b.yaml
+++ b/presets/workspace/test/manifests/llama-2-13b/llama-2-13b.yaml
@@ -35,7 +35,7 @@ spec:
             - |
               echo "MASTER_ADDR: $MASTER_ADDR"
               NODE_RANK=$(echo $HOSTNAME | grep -o '[^-]*$')
-              cd /workspace/llama/llama-2 && torchrun --nnodes 2 --nproc_per_node 1 --node_rank $NODE_RANK --master-addr $MASTER_ADDR --master-port 29500 /workspace/tfs/inference_api.py
+              cd /workspace/llama/llama-2 && torchrun --nnodes 2 --nproc_per_node 1 --node_rank $NODE_RANK --master-addr $MASTER_ADDR --master-port 29500 inference_api.py
           resources:
             limits:
               nvidia.com/gpu: "1"
@@ -43,13 +43,13 @@ spec:
               nvidia.com/gpu: "1"
           livenessProbe:
             httpGet:
-              path: /health
+              path: /healthz  # NOTE(review): presumably the path served by the inference_api.py started from /workspace/llama/llama-2 - confirm
               port: 5000
             initialDelaySeconds: 600 # 10 Min
             periodSeconds: 10
           readinessProbe:
             httpGet:
-              path: /health
+              path: /healthz  # same endpoint as the liveness probe
               port: 5000
             initialDelaySeconds: 30
             periodSeconds: 10
diff --git a/presets/workspace/test/manifests/llama-2-7b-chat/llama-2-7b-chat.yaml b/presets/workspace/test/manifests/llama-2-7b-chat/llama-2-7b-chat.yaml
index 61ec695dc..f26b003a8 100644
--- a/presets/workspace/test/manifests/llama-2-7b-chat/llama-2-7b-chat.yaml
+++ b/presets/workspace/test/manifests/llama-2-7b-chat/llama-2-7b-chat.yaml
@@ -19,7 +19,7 @@ spec:
           command:
             - /bin/sh
             - -c
-            - cd /workspace/llama/llama-2 && torchrun /workspace/tfs/inference_api.py
+            - cd /workspace/llama/llama-2 && torchrun inference_api.py
           resources:
             limits:
               nvidia.com/gpu: "1"
@@ -27,13 +27,13 @@ spec:
               nvidia.com/gpu: "1"
           livenessProbe:
             httpGet:
-              path: /health
+              path: /healthz  # NOTE(review): presumably the path served by the inference_api.py started from /workspace/llama/llama-2 - confirm
               port: 5000
             initialDelaySeconds: 600 # 10 Min
             periodSeconds: 10
           readinessProbe:
             httpGet:
-              path: /health
+              path: /healthz  # same endpoint as the liveness probe
               port: 5000
             initialDelaySeconds: 30
             periodSeconds: 10
diff --git a/presets/workspace/test/manifests/llama-2-7b/llama-2-7b.yaml b/presets/workspace/test/manifests/llama-2-7b/llama-2-7b.yaml
index af295b8db..f68d43c64 100644
--- a/presets/workspace/test/manifests/llama-2-7b/llama-2-7b.yaml
+++ b/presets/workspace/test/manifests/llama-2-7b/llama-2-7b.yaml
@@ -19,7 +19,7 @@ spec:
           command:
             - /bin/sh
             - -c
-            - cd /workspace/llama/llama-2 && torchrun /workspace/tfs/inference_api.py
+            - cd /workspace/llama/llama-2 && torchrun inference_api.py
           resources:
             limits:
               nvidia.com/gpu: "1"
@@ -27,13 +27,13 @@ spec:
               nvidia.com/gpu: "1"
           livenessProbe:
             httpGet:
-              path: /health
+              path: /healthz  # NOTE(review): presumably the path served by the inference_api.py started from /workspace/llama/llama-2 - confirm
               port: 5000
             initialDelaySeconds: 600 # 10 Min
             periodSeconds: 10
           readinessProbe:
             httpGet:
-              path: /health
+              path: /healthz  # same endpoint as the liveness probe
               port: 5000
             initialDelaySeconds: 30
             periodSeconds: 10
diff --git a/presets/workspace/test/manifests/mistral-7b-instruct/mistral-7b-instruct.yaml b/presets/workspace/test/manifests/mistral-7b-instruct/mistral-7b-instruct.yaml
index a64780db9..75179683f 100644
--- a/presets/workspace/test/manifests/mistral-7b-instruct/mistral-7b-instruct.yaml
+++ b/presets/workspace/test/manifests/mistral-7b-instruct/mistral-7b-instruct.yaml
@@ -21,9 +21,9 @@ spec:
           - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16
         resources:
           requests:
-            nvidia.com/gpu: 2
+            nvidia.com/gpu: 1  # one GPU, in line with --num_processes 1 in the launch command
           limits:
-            nvidia.com/gpu: 2  # Requesting 2 GPUs
+            nvidia.com/gpu: 1  # limit kept equal to the request
         livenessProbe:
           httpGet:
             path: /health
diff --git a/presets/workspace/test/manifests/mistral-7b-instruct/mistral-7b-instruct_hf.yaml b/presets/workspace/test/manifests/mistral-7b-instruct/mistral-7b-instruct_hf.yaml
new file mode 100644
index 000000000..75179683f
--- /dev/null
+++ b/presets/workspace/test/manifests/mistral-7b-instruct/mistral-7b-instruct_hf.yaml
@@ -0,0 +1,55 @@
+apiVersion: apps/v1  # Test manifest: Deployment serving mistral-7b-instruct through /workspace/tfs/inference_api.py
+kind: Deployment
+metadata:
+  name: mistral-7b-instruct
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: mistral  # must equal the pod template label below
+  template:
+    metadata:
+      labels:
+        app: mistral
+    spec:
+      containers:
+      - name: mistral-instruct-container
+        image: REPO_HERE.azurecr.io/mistral-7b-instruct:TAG_HERE  # NOTE(review): REPO_HERE / TAG_HERE look like placeholders replaced before apply - confirm
+        command:  # shell-wrapped accelerate launch of the inference API: one process on one machine
+        - /bin/sh
+        - -c
+        - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16
+        resources:
+          requests:
+            nvidia.com/gpu: 1  # request matches the limit below: exactly one GPU
+          limits:
+            nvidia.com/gpu: 1
+        livenessProbe:
+          httpGet:
+            path: /health
+            port: 5000
+          initialDelaySeconds: 600 # 10 Min
+          periodSeconds: 10
+        readinessProbe:
+          httpGet:
+            path: /health
+            port: 5000
+          initialDelaySeconds: 30  # readiness checks begin after 30 s; liveness (above) waits 600 s
+          periodSeconds: 10
+        volumeMounts:
+        - name: dshm
+          mountPath: /dev/shm  # backed by the in-memory emptyDir volume "dshm"
+      volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory
+      tolerations:
+      - effect: NoSchedule  # tolerate the sku=gpu:NoSchedule taint
+        key: sku
+        operator: Equal
+        value: gpu
+      - effect: NoSchedule  # tolerate a nvidia.com/gpu NoSchedule taint with any value
+        key: nvidia.com/gpu
+        operator: Exists
+      nodeSelector:
+        pool: mistral7bins  # only schedule onto nodes labelled pool=mistral7bins
diff --git a/presets/workspace/test/manifests/mistral-7b-instruct/mistral-7b-instruct_vllm.yaml b/presets/workspace/test/manifests/mistral-7b-instruct/mistral-7b-instruct_vllm.yaml
new file mode 100644
index 000000000..939d6c75b
--- /dev/null
+++ b/presets/workspace/test/manifests/mistral-7b-instruct/mistral-7b-instruct_vllm.yaml
@@ -0,0 +1,55 @@
+apiVersion: apps/v1  # Test manifest: Deployment serving mistral-7b-instruct through /workspace/vllm/inference_api.py
+kind: Deployment
+metadata:
+  name: mistral-7b-instruct
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: mistral  # must equal the pod template label below
+  template:
+    metadata:
+      labels:
+        app: mistral
+    spec:
+      containers:
+      - name: mistral-instruct-container
+        image: REPO_HERE.azurecr.io/mistral-7b-instruct:TAG_HERE  # NOTE(review): REPO_HERE / TAG_HERE look like placeholders replaced before apply - confirm
+        command:  # shell-wrapped start of the vLLM inference API; the model is served under the name "test"
+        - /bin/sh
+        - -c
+        - python3 /workspace/vllm/inference_api.py --served-model-name test --dtype float16 --chat-template /workspace/chat_templates/mistral-instruct.jinja
+        resources:
+          requests:
+            nvidia.com/gpu: 1  # request matches the limit below: exactly one GPU
+          limits:
+            nvidia.com/gpu: 1
+        livenessProbe:
+          httpGet:
+            path: /health
+            port: 5000
+          initialDelaySeconds: 600 # 10 Min
+          periodSeconds: 10
+        readinessProbe:
+          httpGet:
+            path: /health
+            port: 5000
+          initialDelaySeconds: 30  # readiness checks begin after 30 s; liveness (above) waits 600 s
+          periodSeconds: 10
+        volumeMounts:
+        - name: dshm
+          mountPath: /dev/shm  # backed by the in-memory emptyDir volume "dshm"
+      volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory
+      tolerations:
+      - effect: NoSchedule  # tolerate the sku=gpu:NoSchedule taint
+        key: sku
+        operator: Equal
+        value: gpu
+      - effect: NoSchedule  # tolerate a nvidia.com/gpu NoSchedule taint with any value
+        key: nvidia.com/gpu
+        operator: Exists
+      nodeSelector:
+        pool: mistral7bins  # only schedule onto nodes labelled pool=mistral7bins
diff --git a/presets/workspace/test/manifests/mistral-7b/mistral-7b.yaml b/presets/workspace/test/manifests/mistral-7b/mistral-7b.yaml
index 219f42ff5..3eff5594f 100644
--- a/presets/workspace/test/manifests/mistral-7b/mistral-7b.yaml
+++ b/presets/workspace/test/manifests/mistral-7b/mistral-7b.yaml
@@ -21,9 +21,9 @@ spec:
           - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16
         resources:
           requests:
-            nvidia.com/gpu: 2
+            nvidia.com/gpu: 1  # one GPU, in line with --num_processes 1 in the launch command
           limits:
-            nvidia.com/gpu: 2  # Requesting 2 GPUs
+            nvidia.com/gpu: 1  # limit kept equal to the request
         livenessProbe:
           httpGet:
             path: /health
diff --git a/presets/workspace/test/manifests/mistral-7b/mistral-7b_hf.yaml b/presets/workspace/test/manifests/mistral-7b/mistral-7b_hf.yaml
new file mode 100644
index 000000000..3eff5594f
--- /dev/null
+++ b/presets/workspace/test/manifests/mistral-7b/mistral-7b_hf.yaml
@@ -0,0 +1,55 @@
+apiVersion: apps/v1  # Test manifest: Deployment serving mistral-7b through /workspace/tfs/inference_api.py
+kind: Deployment
+metadata:
+  name: mistral-7b
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: mistral  # must equal the pod template label below
+  template:
+    metadata:
+      labels:
+        app: mistral
+    spec:
+      containers:
+      - name: mistral-container
+        image: REPO_HERE.azurecr.io/mistral-7b:TAG_HERE  # NOTE(review): REPO_HERE / TAG_HERE look like placeholders replaced before apply - confirm
+        command:  # shell-wrapped accelerate launch of the inference API: one process on one machine
+        - /bin/sh
+        - -c
+        - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16
+        resources:
+          requests:
+            nvidia.com/gpu: 1  # request matches the limit below: exactly one GPU
+          limits:
+            nvidia.com/gpu: 1
+        livenessProbe:
+          httpGet:
+            path: /health
+            port: 5000
+          initialDelaySeconds: 600 # 10 Min
+          periodSeconds: 10
+        readinessProbe:
+          httpGet:
+            path: /health
+            port: 5000
+          initialDelaySeconds: 30  # readiness checks begin after 30 s; liveness (above) waits 600 s
+          periodSeconds: 10
+        volumeMounts:
+        - name: dshm
+          mountPath: /dev/shm  # backed by the in-memory emptyDir volume "dshm"
+      volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory
+      tolerations:
+      - effect: NoSchedule  # tolerate the sku=gpu:NoSchedule taint
+        key: sku
+        operator: Equal
+        value: gpu
+      - effect: NoSchedule  # tolerate a nvidia.com/gpu NoSchedule taint with any value
+        key: nvidia.com/gpu
+        operator: Exists
+      nodeSelector:
+        pool: mistral7b  # only schedule onto nodes labelled pool=mistral7b
diff --git a/presets/workspace/test/manifests/mistral-7b/mistral-7b_vllm.yaml b/presets/workspace/test/manifests/mistral-7b/mistral-7b_vllm.yaml
new file mode 100644
index 000000000..2bd945319
--- /dev/null
+++ b/presets/workspace/test/manifests/mistral-7b/mistral-7b_vllm.yaml
@@ -0,0 +1,55 @@
+apiVersion: apps/v1  # Test manifest: Deployment serving mistral-7b through /workspace/vllm/inference_api.py
+kind: Deployment
+metadata:
+  name: mistral-7b
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: mistral  # must equal the pod template label below
+  template:
+    metadata:
+      labels:
+        app: mistral
+    spec:
+      containers:
+      - name: mistral-container
+        image: REPO_HERE.azurecr.io/mistral-7b:TAG_HERE  # NOTE(review): REPO_HERE / TAG_HERE look like placeholders replaced before apply - confirm
+        command:  # shell-wrapped start of the vLLM inference API, model served as "test"; NOTE(review): base mistral-7b gets the mistral-instruct chat template - confirm intended
+        - /bin/sh
+        - -c
+        - python3 /workspace/vllm/inference_api.py --served-model-name test --dtype float16 --chat-template /workspace/chat_templates/mistral-instruct.jinja
+        resources:
+          requests:
+            nvidia.com/gpu: 1  # request matches the limit below: exactly one GPU
+          limits:
+            nvidia.com/gpu: 1
+        livenessProbe:
+          httpGet:
+            path: /health
+            port: 5000
+          initialDelaySeconds: 600 # 10 Min
+          periodSeconds: 10
+        readinessProbe:
+          httpGet:
+            path: /health
+            port: 5000
+          initialDelaySeconds: 30  # readiness checks begin after 30 s; liveness (above) waits 600 s
+          periodSeconds: 10
+        volumeMounts:
+        - name: dshm
+          mountPath: /dev/shm  # backed by the in-memory emptyDir volume "dshm"
+      volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory
+      tolerations:
+      - effect: NoSchedule  # tolerate the sku=gpu:NoSchedule taint
+        key: sku
+        operator: Equal
+        value: gpu
+      - effect: NoSchedule  # tolerate a nvidia.com/gpu NoSchedule taint with any value
+        key: nvidia.com/gpu
+        operator: Exists
+      nodeSelector:
+        pool: mistral7b  # only schedule onto nodes labelled pool=mistral7b
diff --git a/presets/workspace/test/manifests/phi-2/phi-2_hf.yaml b/presets/workspace/test/manifests/phi-2/phi-2_hf.yaml
new file mode 100644
index 000000000..cbc6f94e7
--- /dev/null
+++ b/presets/workspace/test/manifests/phi-2/phi-2_hf.yaml
@@ -0,0 +1,55 @@
+apiVersion: apps/v1  # Test manifest: Deployment serving phi-2 through /workspace/tfs/inference_api.py
+kind: Deployment
+metadata:
+  name: phi-2
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: phi-2  # must equal the pod template label below
+  template:
+    metadata:
+      labels:
+        app: phi-2
+    spec:
+      containers:
+      - name: phi-2-container
+        image: REPO_HERE.azurecr.io/phi-2:TAG_HERE  # NOTE(review): REPO_HERE / TAG_HERE look like placeholders replaced before apply - confirm
+        command:  # shell-wrapped accelerate launch of the inference API: one process on one machine
+        - /bin/sh
+        - -c
+        - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16
+        resources:
+          requests:
+            nvidia.com/gpu: 1  # request matches the limit below
+          limits:
+            nvidia.com/gpu: 1  # Requesting 1 GPU
+        livenessProbe:
+          httpGet:
+            path: /health
+            port: 5000
+          initialDelaySeconds: 600 # 10 Min
+          periodSeconds: 10
+        readinessProbe:
+          httpGet:
+            path: /health
+            port: 5000
+          initialDelaySeconds: 30  # readiness checks begin after 30 s; liveness (above) waits 600 s
+          periodSeconds: 10
+        volumeMounts:
+        - name: dshm
+          mountPath: /dev/shm  # backed by the in-memory emptyDir volume "dshm"
+      volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory
+      tolerations:
+      - effect: NoSchedule  # tolerate the sku=gpu:NoSchedule taint
+        key: sku
+        operator: Equal
+        value: gpu
+      - effect: NoSchedule  # tolerate a nvidia.com/gpu NoSchedule taint with any value
+        key: nvidia.com/gpu
+        operator: Exists
+      nodeSelector:
+        pool: phi2  # only schedule onto nodes labelled pool=phi2
diff --git a/presets/workspace/test/manifests/phi-2/phi-2_vllm.yaml b/presets/workspace/test/manifests/phi-2/phi-2_vllm.yaml
new file mode 100644
index 000000000..e77f21268
--- /dev/null
+++ b/presets/workspace/test/manifests/phi-2/phi-2_vllm.yaml
@@ -0,0 +1,55 @@
+apiVersion: apps/v1  # Test manifest: Deployment serving phi-2 through /workspace/vllm/inference_api.py
+kind: Deployment
+metadata:
+  name: phi-2
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: phi-2  # must equal the pod template label below
+  template:
+    metadata:
+      labels:
+        app: phi-2
+    spec:
+      containers:
+      - name: phi-2-container
+        image: REPO_HERE.azurecr.io/phi-2:TAG_HERE  # NOTE(review): REPO_HERE / TAG_HERE look like placeholders replaced before apply - confirm
+        command:  # shell-wrapped start of the vLLM inference API; the model is served under the name "test"
+        - /bin/sh
+        - -c
+        - python3 /workspace/vllm/inference_api.py --served-model-name test --dtype float16
+        resources:
+          requests:
+            nvidia.com/gpu: 1  # request matches the limit below
+          limits:
+            nvidia.com/gpu: 1  # Requesting 1 GPU
+        livenessProbe:
+          httpGet:
+            path: /health
+            port: 5000
+          initialDelaySeconds: 600 # 10 Min
+          periodSeconds: 10
+        readinessProbe:
+          httpGet:
+            path: /health
+            port: 5000
+          initialDelaySeconds: 30  # readiness checks begin after 30 s; liveness (above) waits 600 s
+          periodSeconds: 10
+        volumeMounts:
+        - name: dshm
+          mountPath: /dev/shm  # backed by the in-memory emptyDir volume "dshm"
+      volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory
+      tolerations:
+      - effect: NoSchedule  # tolerate the sku=gpu:NoSchedule taint
+        key: sku
+        operator: Equal
+        value: gpu
+      - effect: NoSchedule  # tolerate a nvidia.com/gpu NoSchedule taint with any value
+        key: nvidia.com/gpu
+        operator: Exists
+      nodeSelector:
+        pool: phi2  # only schedule onto nodes labelled pool=phi2
diff --git a/presets/workspace/test/manifests/phi-3-medium-128k-instruct/phi-3-medium-128k-instruct_hf.yaml b/presets/workspace/test/manifests/phi-3-medium-128k-instruct/phi-3-medium-128k-instruct_hf.yaml
new file mode 100644
index 000000000..0adb122e4
--- /dev/null
+++ b/presets/workspace/test/manifests/phi-3-medium-128k-instruct/phi-3-medium-128k-instruct_hf.yaml
@@ -0,0 +1,55 @@
+apiVersion: apps/v1  # Test manifest: Deployment serving phi-3-medium-128k-instruct through /workspace/tfs/inference_api.py
+kind: Deployment
+metadata:
+  name: phi-3-medium-128k-instruct
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: phi-3-medium-128k-instruct  # must equal the pod template label below
+  template:
+    metadata:
+      labels:
+        app: phi-3-medium-128k-instruct
+    spec:
+      containers:
+      - name: phi-3-medium-128k-instruct-container
+        image: REPO_HERE.azurecr.io/phi-3-medium-128k-instruct:TAG_HERE  # NOTE(review): REPO_HERE / TAG_HERE look like placeholders replaced before apply - confirm
+        command:  # shell-wrapped accelerate launch of the inference API: one process on one machine
+        - /bin/sh
+        - -c
+        - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype auto --trust_remote_code
+        resources:
+          requests:
+            nvidia.com/gpu: 1  # request matches the limit below
+          limits:
+            nvidia.com/gpu: 1  # Requesting 1 GPU
+        livenessProbe:
+          httpGet:
+            path: /health
+            port: 5000
+          initialDelaySeconds: 600 # 10 Min
+          periodSeconds: 10
+        readinessProbe:
+          httpGet:
+            path: /health
+            port: 5000
+          initialDelaySeconds: 30  # readiness checks begin after 30 s; liveness (above) waits 600 s
+          periodSeconds: 10
+        volumeMounts:
+        - name: dshm
+          mountPath: /dev/shm  # backed by the in-memory emptyDir volume "dshm"
+      volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory
+      tolerations:
+      - effect: NoSchedule  # tolerate the sku=gpu:NoSchedule taint
+        key: sku
+        operator: Equal
+        value: gpu
+      - effect: NoSchedule  # tolerate a nvidia.com/gpu NoSchedule taint with any value
+        key: nvidia.com/gpu
+        operator: Exists
+      nodeSelector:
+        pool: phi3medium12  # only schedule onto nodes labelled pool=phi3medium12
diff --git a/presets/workspace/test/manifests/phi-3-medium-128k-instruct/phi-3-medium-128k-instruct_vllm.yaml b/presets/workspace/test/manifests/phi-3-medium-128k-instruct/phi-3-medium-128k-instruct_vllm.yaml
new file mode 100644
index 000000000..5b93bde50
--- /dev/null
+++ b/presets/workspace/test/manifests/phi-3-medium-128k-instruct/phi-3-medium-128k-instruct_vllm.yaml
@@ -0,0 +1,55 @@
+apiVersion: apps/v1  # Test manifest: Deployment serving phi-3-medium-128k-instruct through /workspace/vllm/inference_api.py
+kind: Deployment
+metadata:
+  name: phi-3-medium-128k-instruct
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: phi-3-medium-128k-instruct  # must equal the pod template label below
+  template:
+    metadata:
+      labels:
+        app: phi-3-medium-128k-instruct
+    spec:
+      containers:
+      - name: phi-3-medium-128k-instruct-container
+        image: REPO_HERE.azurecr.io/phi-3-medium-128k-instruct:TAG_HERE  # NOTE(review): REPO_HERE / TAG_HERE look like placeholders replaced before apply - confirm
+        command:  # shell-wrapped start of the vLLM inference API; served as "test", context capped at 1024 tokens, split across 2 GPUs
+        - /bin/sh
+        - -c
+        - python3 /workspace/vllm/inference_api.py --served-model-name test --dtype float16 --max-model-len 1024 --tensor-parallel-size 2
+        resources:
+          requests:
+            nvidia.com/gpu: 2  # two GPUs, matching --tensor-parallel-size 2 in the command
+          limits:
+            nvidia.com/gpu: 2  # limit kept equal to the request
+        livenessProbe:
+          httpGet:
+            path: /health
+            port: 5000
+          initialDelaySeconds: 600 # 10 Min
+          periodSeconds: 10
+        readinessProbe:
+          httpGet:
+            path: /health
+            port: 5000
+          initialDelaySeconds: 30  # readiness checks begin after 30 s; liveness (above) waits 600 s
+          periodSeconds: 10
+        volumeMounts:
+        - name: dshm
+          mountPath: /dev/shm  # backed by the in-memory emptyDir volume "dshm"
+      volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory
+      tolerations:
+      - effect: NoSchedule  # tolerate the sku=gpu:NoSchedule taint
+        key: sku
+        operator: Equal
+        value: gpu
+      - effect: NoSchedule  # tolerate a nvidia.com/gpu NoSchedule taint with any value
+        key: nvidia.com/gpu
+        operator: Exists
+      nodeSelector:
+        pool: phi3medium12  # only schedule onto nodes labelled pool=phi3medium12
diff --git a/presets/workspace/test/manifests/phi-3-medium-4k-instruct/phi-3-medium-4k-instruct_hf.yaml b/presets/workspace/test/manifests/phi-3-medium-4k-instruct/phi-3-medium-4k-instruct_hf.yaml
new file mode 100644
index 000000000..1d0d64e47
--- /dev/null
+++ b/presets/workspace/test/manifests/phi-3-medium-4k-instruct/phi-3-medium-4k-instruct_hf.yaml
@@ -0,0 +1,55 @@
+apiVersion: apps/v1  # Test manifest: Deployment serving phi-3-medium-4k-instruct through /workspace/tfs/inference_api.py
+kind: Deployment
+metadata:
+  name: phi-3-medium-4k-instruct
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: phi-3-medium-4k-instruct  # must equal the pod template label below
+  template:
+    metadata:
+      labels:
+        app: phi-3-medium-4k-instruct
+    spec:
+      containers:
+      - name: phi-3-medium-4k-instruct-container
+        image: REPO_HERE.azurecr.io/phi-3-medium-4k-instruct:TAG_HERE  # NOTE(review): REPO_HERE / TAG_HERE look like placeholders replaced before apply - confirm
+        command:  # shell-wrapped accelerate launch of the inference API: one process on one machine
+        - /bin/sh
+        - -c
+        - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype auto --trust_remote_code
+        resources:
+          requests:
+            nvidia.com/gpu: 1  # request matches the limit below
+          limits:
+            nvidia.com/gpu: 1  # Requesting 1 GPU
+        livenessProbe:
+          httpGet:
+            path: /health
+            port: 5000
+          initialDelaySeconds: 600 # 10 Min
+          periodSeconds: 10
+        readinessProbe:
+          httpGet:
+            path: /health
+            port: 5000
+          initialDelaySeconds: 30  # readiness checks begin after 30 s; liveness (above) waits 600 s
+          periodSeconds: 10
+        volumeMounts:
+        - name: dshm
+          mountPath: /dev/shm  # backed by the in-memory emptyDir volume "dshm"
+      volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory
+      tolerations:
+      - effect: NoSchedule  # tolerate the sku=gpu:NoSchedule taint
+        key: sku
+        operator: Equal
+        value: gpu
+      - effect: NoSchedule  # tolerate a nvidia.com/gpu NoSchedule taint with any value
+        key: nvidia.com/gpu
+        operator: Exists
+      nodeSelector:
+        pool: phi3medium4k  # only schedule onto nodes labelled pool=phi3medium4k
diff --git a/presets/workspace/test/manifests/phi-3-medium-4k-instruct/phi-3-medium-4k-instruct_vllm.yaml b/presets/workspace/test/manifests/phi-3-medium-4k-instruct/phi-3-medium-4k-instruct_vllm.yaml
new file mode 100644
index 000000000..3bdce8072
--- /dev/null
+++ b/presets/workspace/test/manifests/phi-3-medium-4k-instruct/phi-3-medium-4k-instruct_vllm.yaml
@@ -0,0 +1,55 @@
+apiVersion: apps/v1  # Test manifest: Deployment serving phi-3-medium-4k-instruct through /workspace/vllm/inference_api.py
+kind: Deployment
+metadata:
+  name: phi-3-medium-4k-instruct
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: phi-3-medium-4k-instruct  # must equal the pod template label below
+  template:
+    metadata:
+      labels:
+        app: phi-3-medium-4k-instruct
+    spec:
+      containers:
+      - name: phi-3-medium-4k-instruct-container
+        image: REPO_HERE.azurecr.io/phi-3-medium-4k-instruct:TAG_HERE  # NOTE(review): REPO_HERE / TAG_HERE look like placeholders replaced before apply - confirm
+        command:  # shell-wrapped start of the vLLM inference API; served as "test", split across 2 GPUs
+        - /bin/sh
+        - -c
+        - python3 /workspace/vllm/inference_api.py --served-model-name test --dtype float16 --tensor-parallel-size 2
+        resources:
+          requests:
+            nvidia.com/gpu: 2  # two GPUs, matching --tensor-parallel-size 2 in the command
+          limits:
+            nvidia.com/gpu: 2  # limit kept equal to the request
+        livenessProbe:
+          httpGet:
+            path: /health
+            port: 5000
+          initialDelaySeconds: 600 # 10 Min
+          periodSeconds: 10
+        readinessProbe:
+          httpGet:
+            path: /health
+            port: 5000
+          initialDelaySeconds: 30  # readiness checks begin after 30 s; liveness (above) waits 600 s
+          periodSeconds: 10
+        volumeMounts:
+        - name: dshm
+          mountPath: /dev/shm  # backed by the in-memory emptyDir volume "dshm"
+      volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory
+      tolerations:
+      - effect: NoSchedule  # tolerate the sku=gpu:NoSchedule taint
+        key: sku
+        operator: Equal
+        value: gpu
+      - effect: NoSchedule  # tolerate a nvidia.com/gpu NoSchedule taint with any value
+        key: nvidia.com/gpu
+        operator: Exists
+      nodeSelector:
+        pool: phi3medium4k  # only schedule onto nodes labelled pool=phi3medium4k
diff --git a/presets/workspace/test/manifests/phi-3-mini-128k-instruct/phi-3-mini-128k-instruct_hf.yaml b/presets/workspace/test/manifests/phi-3-mini-128k-instruct/phi-3-mini-128k-instruct_hf.yaml
new file mode 100644
index 000000000..cf8898015
--- /dev/null
+++ b/presets/workspace/test/manifests/phi-3-mini-128k-instruct/phi-3-mini-128k-instruct_hf.yaml
@@ -0,0 +1,55 @@
+apiVersion: apps/v1  # Test manifest: Deployment serving phi-3-mini-128k-instruct through /workspace/tfs/inference_api.py
+kind: Deployment
+metadata:
+  name: phi-3-mini-128k-instruct
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: phi-3-mini-128k-instruct  # must equal the pod template label below
+  template:
+    metadata:
+      labels:
+        app: phi-3-mini-128k-instruct
+    spec:
+      containers:
+      - name: phi-3-mini-128k-instruct-container
+        image: REPO_HERE.azurecr.io/phi-3-mini-128k-instruct:TAG_HERE  # NOTE(review): REPO_HERE / TAG_HERE look like placeholders replaced before apply - confirm
+        command:  # shell-wrapped accelerate launch of the inference API: one process on one machine
+        - /bin/sh
+        - -c
+        - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype auto --trust_remote_code
+        resources:
+          requests:
+            nvidia.com/gpu: 1  # request matches the limit below
+          limits:
+            nvidia.com/gpu: 1  # Requesting 1 GPU
+        livenessProbe:
+          httpGet:
+            path: /health
+            port: 5000
+          initialDelaySeconds: 600 # 10 Min
+          periodSeconds: 10
+        readinessProbe:
+          httpGet:
+            path: /health
+            port: 5000
+          initialDelaySeconds: 30  # readiness checks begin after 30 s; liveness (above) waits 600 s
+          periodSeconds: 10
+        volumeMounts:
+        - name: dshm
+          mountPath: /dev/shm  # backed by the in-memory emptyDir volume "dshm"
+      volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory
+      tolerations:
+      - effect: NoSchedule  # tolerate the sku=gpu:NoSchedule taint
+        key: sku
+        operator: Equal
+        value: gpu
+      - effect: NoSchedule  # tolerate a nvidia.com/gpu NoSchedule taint with any value
+        key: nvidia.com/gpu
+        operator: Exists
+      nodeSelector:
+        pool: phi3mini128k  # only schedule onto nodes labelled pool=phi3mini128k
diff --git a/presets/workspace/test/manifests/phi-3-mini-128k-instruct/phi-3-mini-128k-instruct_vllm.yaml b/presets/workspace/test/manifests/phi-3-mini-128k-instruct/phi-3-mini-128k-instruct_vllm.yaml
new file mode 100644
index 000000000..f719bf96b
--- /dev/null
+++ b/presets/workspace/test/manifests/phi-3-mini-128k-instruct/phi-3-mini-128k-instruct_vllm.yaml
@@ -0,0 +1,55 @@
+apiVersion: apps/v1  # Test manifest: Deployment serving phi-3-mini-128k-instruct through /workspace/vllm/inference_api.py
+kind: Deployment
+metadata:
+  name: phi-3-mini-128k-instruct
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: phi-3-mini-128k-instruct  # must equal the pod template label below
+  template:
+    metadata:
+      labels:
+        app: phi-3-mini-128k-instruct
+    spec:
+      containers:
+      - name: phi-3-mini-128k-instruct-container
+        image: REPO_HERE.azurecr.io/phi-3-mini-128k-instruct:TAG_HERE  # NOTE(review): REPO_HERE / TAG_HERE look like placeholders replaced before apply - confirm
+        command:  # shell-wrapped start of the vLLM inference API; the model is served under the name "test"
+        - /bin/sh
+        - -c
+        - python3 /workspace/vllm/inference_api.py --served-model-name test --dtype float16
+        resources:
+          requests:
+            nvidia.com/gpu: 1  # request matches the limit below
+          limits:
+            nvidia.com/gpu: 1  # Requesting 1 GPU
+        livenessProbe:
+          httpGet:
+            path: /health
+            port: 5000
+          initialDelaySeconds: 600 # 10 Min
+          periodSeconds: 10
+        readinessProbe:
+          httpGet:
+            path: /health
+            port: 5000
+          initialDelaySeconds: 30  # readiness checks begin after 30 s; liveness (above) waits 600 s
+          periodSeconds: 10
+        volumeMounts:
+        - name: dshm
+          mountPath: /dev/shm  # backed by the in-memory emptyDir volume "dshm"
+      volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory
+      tolerations:
+      - effect: NoSchedule  # tolerate the sku=gpu:NoSchedule taint
+        key: sku
+        operator: Equal
+        value: gpu
+      - effect: NoSchedule  # tolerate a nvidia.com/gpu NoSchedule taint with any value
+        key: nvidia.com/gpu
+        operator: Exists
+      nodeSelector:
+        pool: phi3mini128k  # only schedule onto nodes labelled pool=phi3mini128k
diff --git a/presets/workspace/test/manifests/phi-3-mini-4k-instruct/phi-3-mini-4k-instruct_hf.yaml b/presets/workspace/test/manifests/phi-3-mini-4k-instruct/phi-3-mini-4k-instruct_hf.yaml
new file mode 100644
index 000000000..1d7069a38
--- /dev/null
+++ b/presets/workspace/test/manifests/phi-3-mini-4k-instruct/phi-3-mini-4k-instruct_hf.yaml
@@ -0,0 +1,55 @@
+apiVersion: apps/v1  # Test manifest: Deployment serving phi-3-mini-4k-instruct through /workspace/tfs/inference_api.py
+kind: Deployment
+metadata:
+  name: phi-3-mini-4k-instruct
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: phi-3-mini-4k-instruct  # must equal the pod template label below
+  template:
+    metadata:
+      labels:
+        app: phi-3-mini-4k-instruct
+    spec:
+      containers:
+      - name: phi-3-mini-4k-instruct-container
+        image: REPO_HERE.azurecr.io/phi-3-mini-4k-instruct:TAG_HERE  # NOTE(review): REPO_HERE / TAG_HERE look like placeholders replaced before apply - confirm
+        command:  # shell-wrapped accelerate launch of the inference API: one process on one machine
+        - /bin/sh
+        - -c
+        - accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype auto --trust_remote_code
+        resources:
+          requests:
+            nvidia.com/gpu: 1  # request matches the limit below
+          limits:
+            nvidia.com/gpu: 1  # Requesting 1 GPU
+        livenessProbe:
+          httpGet:
+            path: /health
+            port: 5000
+          initialDelaySeconds: 600 # 10 Min
+          periodSeconds: 10
+        readinessProbe:
+          httpGet:
+            path: /health
+            port: 5000
+          initialDelaySeconds: 30  # readiness checks begin after 30 s; liveness (above) waits 600 s
+          periodSeconds: 10
+        volumeMounts:
+        - name: dshm
+          mountPath: /dev/shm  # backed by the in-memory emptyDir volume "dshm"
+      volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory
+      tolerations:
+      - effect: NoSchedule  # tolerate the sku=gpu:NoSchedule taint
+        key: sku
+        operator: Equal
+        value: gpu
+      - effect: NoSchedule  # tolerate a nvidia.com/gpu NoSchedule taint with any value
+        key: nvidia.com/gpu
+        operator: Exists
+      nodeSelector:
+        pool: phi3mini4kin  # only schedule onto nodes labelled pool=phi3mini4kin
diff --git a/presets/workspace/test/manifests/phi-3-mini-4k-instruct/phi-3-mini-4k-instruct_vllm.yaml b/presets/workspace/test/manifests/phi-3-mini-4k-instruct/phi-3-mini-4k-instruct_vllm.yaml
new file mode 100644
index 000000000..8d1275678
--- /dev/null
+++ b/presets/workspace/test/manifests/phi-3-mini-4k-instruct/phi-3-mini-4k-instruct_vllm.yaml
@@ -0,0 +1,55 @@
+apiVersion: apps/v1  # Test manifest: Deployment serving phi-3-mini-4k-instruct through /workspace/vllm/inference_api.py
+kind: Deployment
+metadata:
+  name: phi-3-mini-4k-instruct
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: phi-3-mini-4k-instruct  # must equal the pod template label below
+  template:
+    metadata:
+      labels:
+        app: phi-3-mini-4k-instruct
+    spec:
+      containers:
+      - name: phi-3-mini-4k-instruct-container
+        image: REPO_HERE.azurecr.io/phi-3-mini-4k-instruct:TAG_HERE  # NOTE(review): REPO_HERE / TAG_HERE look like placeholders replaced before apply - confirm
+        command:  # shell-wrapped start of the vLLM inference API; the model is served under the name "test"
+        - /bin/sh
+        - -c
+        - python3 /workspace/vllm/inference_api.py --served-model-name test --dtype float16
+        resources:
+          requests:
+            nvidia.com/gpu: 1  # request matches the limit below
+          limits:
+            nvidia.com/gpu: 1  # Requesting 1 GPU
+        livenessProbe:
+          httpGet:
+            path: /health
+            port: 5000
+          initialDelaySeconds: 600 # 10 Min
+          periodSeconds: 10
+        readinessProbe:
+          httpGet:
+            path: /health
+            port: 5000
+          initialDelaySeconds: 30  # readiness checks begin after 30 s; liveness (above) waits 600 s
+          periodSeconds: 10
+        volumeMounts:
+        - name: dshm
+          mountPath: /dev/shm  # backed by the in-memory emptyDir volume "dshm"
+      volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory
+      tolerations:
+      - effect: NoSchedule  # tolerate the sku=gpu:NoSchedule taint
+        key: sku
+        operator: Equal
+        value: gpu
+      - effect: NoSchedule  # tolerate a nvidia.com/gpu NoSchedule taint with any value
+        key: nvidia.com/gpu
+        operator: Exists
+      nodeSelector:
+        pool: phi3mini4kin  # only schedule onto nodes labelled pool=phi3mini4kin