feat: support adaptive max_model_len #657

Merged · 7 commits · Nov 12, 2024
Changes from 5 commits
2 changes: 1 addition & 1 deletion docker/presets/models/tfs/Dockerfile
@@ -1,4 +1,4 @@
FROM python:3.10-slim@sha256:684b1aaf96a7942b3c3af438d162e0baa3510aa7af25ad76d238e0c746bdec79
FROM python:3.12-slim

ARG WEIGHTS_PATH
ARG MODEL_TYPE
2 changes: 2 additions & 0 deletions pkg/utils/test/testModel.go
@@ -16,6 +16,7 @@ func (*testModel) GetInferenceParameters() *model.PresetParam {
return &model.PresetParam{
GPUCountRequirement: "1",
ReadinessTimeout: time.Duration(30) * time.Minute,
BaseCommand: "python3",
}
}
func (*testModel) GetTuningParameters() *model.PresetParam {
@@ -37,6 +38,7 @@ func (*testDistributedModel) GetInferenceParameters() *model.PresetParam {
return &model.PresetParam{
GPUCountRequirement: "1",
ReadinessTimeout: time.Duration(30) * time.Minute,
BaseCommand: "python3",
}
}
func (*testDistributedModel) GetTuningParameters() *model.PresetParam {
4 changes: 2 additions & 2 deletions pkg/workspace/inference/preset-inferences.go
@@ -23,9 +23,9 @@ import (
)

const (
ProbePath = "/healthz"
ProbePath = "/health"
Port5000 = int32(5000)
InferenceFile = "inference_api.py"
InferenceFile = "/workspace/tfs/inference_api.py"
)

var (
6 changes: 3 additions & 3 deletions pkg/workspace/inference/preset-inferences_test.go
@@ -46,7 +46,7 @@ func TestCreatePresetInference(t *testing.T) {
workload: "Deployment",
// BaseCommand is python3 on the test model; no TorchRunParams, TorchRunRdzvParams, or ModelRunParams
// So the expected cmd consists of the shell command, the base command, and the inference file path
expectedCmd: "/bin/sh -c inference_api.py",
expectedCmd: "/bin/sh -c python3 /workspace/tfs/inference_api.py",
hasAdapters: false,
},

@@ -58,7 +58,7 @@ func TestCreatePresetInference(t *testing.T) {
c.On("Create", mock.IsType(context.TODO()), mock.IsType(&appsv1.StatefulSet{}), mock.Anything).Return(nil)
},
workload: "StatefulSet",
expectedCmd: "/bin/sh -c inference_api.py",
expectedCmd: "/bin/sh -c python3 /workspace/tfs/inference_api.py",
hasAdapters: false,
},

@@ -69,7 +69,7 @@ func TestCreatePresetInference(t *testing.T) {
c.On("Create", mock.IsType(context.TODO()), mock.IsType(&appsv1.Deployment{}), mock.Anything).Return(nil)
},
workload: "Deployment",
expectedCmd: "/bin/sh -c inference_api.py",
expectedCmd: "/bin/sh -c python3 /workspace/tfs/inference_api.py",
hasAdapters: true,
expectedVolume: "adapter-volume",
},
62 changes: 58 additions & 4 deletions presets/inference/vllm/inference_api.py
@@ -1,11 +1,14 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import logging
import gc
import os

import uvloop
import torch
from vllm.utils import FlexibleArgumentParser
import vllm.entrypoints.openai.api_server as api_server
from vllm.engine.llm_engine import (LLMEngine, EngineArgs, EngineConfig)

# Initialize logger
logger = logging.getLogger(__name__)
@@ -26,22 +29,73 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
# See https://docs.vllm.ai/en/latest/models/engine_args.html for more args
engine_default_args = {
"model": "/workspace/vllm/weights",
"cpu-offload-gb": 0,
"gpu-memory-utilization": 0.9,
"swap-space": 4,
"disable-log-stats": False,
"cpu_offload_gb": 0,
"gpu_memory_utilization": 0.95,
"swap_space": 4,
"disable_log_stats": False,
"uvicorn_log_level": "error"
}
parser.set_defaults(**engine_default_args)

return parser

def find_max_available_seq_len(engine_config: EngineConfig) -> int:
"""
Load model and run profiler to find max available seq len.
"""
# see https://github.com/vllm-project/vllm/blob/v0.6.3/vllm/engine/llm_engine.py#L335
executor_class = LLMEngine._get_executor_cls(engine_config)
executor = executor_class(
model_config=engine_config.model_config,
cache_config=engine_config.cache_config,
parallel_config=engine_config.parallel_config,
scheduler_config=engine_config.scheduler_config,
device_config=engine_config.device_config,
lora_config=engine_config.lora_config,
speculative_config=engine_config.speculative_config,
load_config=engine_config.load_config,
prompt_adapter_config=engine_config.prompt_adapter_config,
observability_config=engine_config.observability_config,
)

# see https://github.com/vllm-project/vllm/blob/v0.6.3/vllm/engine/llm_engine.py#L477
num_gpu_blocks, _ = executor.determine_num_available_blocks()

# release memory
del executor
gc.collect()
torch.cuda.empty_cache()

return engine_config.cache_config.block_size * num_gpu_blocks

if __name__ == "__main__":
parser = FlexibleArgumentParser(description='vLLM serving server')
parser = api_server.make_arg_parser(parser)
parser = make_arg_parser(parser)
args = parser.parse_args()

if args.max_model_len is None:
engine_args = EngineArgs.from_cli_args(args)
# Read the model config from the HF weights path.
# vLLM uses a different config parser for each model architecture
# and loads the result into a unified EngineConfig.
engine_config = engine_args.create_engine_config()

logger.info("Try run profiler to find max available seq len")
available_seq_len = find_max_available_seq_len(engine_config)
# see https://github.com/vllm-project/vllm/blob/v0.6.3/vllm/worker/worker.py#L262
if available_seq_len <= 0:
raise ValueError("No available memory for the cache blocks. "
"Try increasing `gpu_memory_utilization` when "
"initializing the engine.")
max_model_len = engine_config.model_config.max_model_len
if available_seq_len > max_model_len:
available_seq_len = max_model_len

if available_seq_len != max_model_len:
logger.info(f"Set max_model_len from {max_model_len} to {available_seq_len}")
args.max_model_len = available_seq_len

# Run the serving server
logger.info(f"Starting server on port {args.port}")
# See https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html for more
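
Taken together, the new logic above amounts to a clamp: vLLM's profiler reports how many KV-cache blocks fit in GPU memory, the usable sequence length is `block_size * num_gpu_blocks`, and `max_model_len` is lowered to that value only when the model's configured context length does not fit. A standalone sketch of that arithmetic (not part of this PR; the helper name and the numbers are illustrative):

```python
def clamp_max_model_len(block_size: int, num_gpu_blocks: int,
                        configured_max_model_len: int) -> int:
    """Mirror of the clamping logic above: cap max_model_len at the
    sequence length the profiled KV cache can actually hold."""
    available_seq_len = block_size * num_gpu_blocks
    if available_seq_len <= 0:
        raise ValueError("No available memory for the cache blocks. "
                         "Try increasing `gpu_memory_utilization` when "
                         "initializing the engine.")
    return min(available_seq_len, configured_max_model_len)

# Example: 16-token blocks, 2048 free GPU blocks, a 128k-context model.
# 16 * 2048 = 32768 < 131072, so max_model_len is lowered to 32768.
print(clamp_max_model_len(16, 2048, 131072))  # 32768
```

With this in place, the intent appears to be that the server no longer fails at startup when the configured context length exceeds what the KV cache can hold; it serves the largest length that fits instead.
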
45 changes: 45 additions & 0 deletions presets/models/phi3/model.go
@@ -28,19 +28,25 @@ func init() {
Name: PresetPhi3Medium128kModel,
Instance: &phi3MediumB,
})
plugin.KaitoModelRegister.Register(&plugin.Registration{
Name: PresetPhi3_5MiniInstruct,
Instance: &phi3_5MiniC,
})
}

var (
PresetPhi3Mini4kModel = "phi-3-mini-4k-instruct"
PresetPhi3Mini128kModel = "phi-3-mini-128k-instruct"
PresetPhi3Medium4kModel = "phi-3-medium-4k-instruct"
PresetPhi3Medium128kModel = "phi-3-medium-128k-instruct"
PresetPhi3_5MiniInstruct = "phi-3.5-mini-instruct"

PresetPhiTagMap = map[string]string{
"Phi3Mini4kInstruct": "0.0.2",
"Phi3Mini128kInstruct": "0.0.2",
"Phi3Medium4kInstruct": "0.0.2",
"Phi3Medium128kInstruct": "0.0.2",
"Phi3_5MiniInstruct": "0.0.1",
}

baseCommandPresetPhiInference = "accelerate launch"
@@ -130,6 +136,45 @@ func (*phi3Mini128KInst) SupportTuning() bool {
return true
}

var phi3_5MiniC phi3_5MiniInst

type phi3_5MiniInst struct{}

func (*phi3_5MiniInst) GetInferenceParameters() *model.PresetParam {
return &model.PresetParam{
ModelFamilyName: "Phi3_5",
ImageAccessMode: string(kaitov1alpha1.ModelImageAccessModePublic),
DiskStorageRequirement: "50Gi",
GPUCountRequirement: "1",
TotalGPUMemoryRequirement: "8Gi",
PerGPUMemoryRequirement: "0Gi", // We run Phi using native vertical model parallel, no per GPU memory requirement.
TorchRunParams: inference.DefaultAccelerateParams,
ModelRunParams: phiRunParams,
ReadinessTimeout: time.Duration(30) * time.Minute,
BaseCommand: baseCommandPresetPhiInference,
Tag: PresetPhiTagMap["Phi3_5MiniInstruct"],
}
}
func (*phi3_5MiniInst) GetTuningParameters() *model.PresetParam {
return &model.PresetParam{
ModelFamilyName: "Phi3_5",
ImageAccessMode: string(kaitov1alpha1.ModelImageAccessModePublic),
DiskStorageRequirement: "50Gi",
GPUCountRequirement: "1",
TotalGPUMemoryRequirement: "72Gi",
PerGPUMemoryRequirement: "72Gi",
// TorchRunParams: inference.DefaultAccelerateParams,
// ModelRunParams: phiRunParams,
ReadinessTimeout: time.Duration(30) * time.Minute,
BaseCommand: baseCommandPresetPhiTuning,
Tag: PresetPhiTagMap["Phi3_5MiniInstruct"],
}
}
func (*phi3_5MiniInst) SupportDistributedInference() bool { return false }
func (*phi3_5MiniInst) SupportTuning() bool {
return true
}

var phi3MediumA Phi3Medium4kInstruct

type Phi3Medium4kInstruct struct{}
2 changes: 1 addition & 1 deletion presets/models/supported_models.yaml
@@ -134,4 +134,4 @@ models:
tag: 0.0.2
# Tag history:
# 0.0.2 - Add Logging & Metrics Server
# 0.0.1 - Initial Release
# 0.0.1 - Initial Release
@@ -19,21 +19,21 @@ spec:
command:
- /bin/sh
- -c
- accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference_api.py --pipeline text-generation --torch_dtype bfloat16
- accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16
resources:
requests:
nvidia.com/gpu: 4 # Requesting 4 GPUs
limits:
nvidia.com/gpu: 4
livenessProbe:
httpGet:
path: /healthz
path: /health
port: 5000
initialDelaySeconds: 600 # 10 Min
periodSeconds: 10
readinessProbe:
httpGet:
path: /healthz
path: /health
port: 5000
initialDelaySeconds: 30
periodSeconds: 10
6 changes: 3 additions & 3 deletions presets/test/manifests/falcon-40b/falcon-40b.yaml
@@ -19,21 +19,21 @@ spec:
command:
- /bin/sh
- -c
- accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference_api.py --pipeline text-generation --torch_dtype bfloat16
- accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16
resources:
requests:
nvidia.com/gpu: 4 # Requesting 4 GPUs
limits:
nvidia.com/gpu: 4
livenessProbe:
httpGet:
path: /healthz
path: /health
port: 5000
initialDelaySeconds: 600 # 10 Min
periodSeconds: 10
readinessProbe:
httpGet:
path: /healthz
path: /health
port: 5000
initialDelaySeconds: 30
periodSeconds: 10
@@ -30,21 +30,21 @@ spec:
command:
- /bin/sh
- -c
- accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference_api.py --pipeline text-generation --torch_dtype bfloat16
- accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16
resources:
requests:
nvidia.com/gpu: 2
limits:
nvidia.com/gpu: 2 # Requesting 2 GPUs
livenessProbe:
httpGet:
path: /healthz
path: /health
port: 5000
initialDelaySeconds: 600 # 10 Min
periodSeconds: 10
readinessProbe:
httpGet:
path: /healthz
path: /health
port: 5000
initialDelaySeconds: 30
periodSeconds: 10
@@ -18,21 +18,21 @@ spec:
command:
- /bin/sh
- -c
- accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference_api.py --pipeline text-generation --torch_dtype bfloat16
- accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16
resources:
requests:
nvidia.com/gpu: 2
limits:
nvidia.com/gpu: 2 # Requesting 2 GPUs
livenessProbe:
httpGet:
path: /healthz
path: /health
port: 5000
initialDelaySeconds: 600 # 10 Min
periodSeconds: 10
readinessProbe:
httpGet:
path: /healthz
path: /health
port: 5000
initialDelaySeconds: 30
periodSeconds: 10
Expand Down
@@ -29,7 +29,7 @@ spec:
command:
- /bin/sh
- -c
- accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference_api.py --pipeline text-generation --torch_dtype bfloat16
- accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16
resources:
requests:
nvidia.com/gpu: 2
6 changes: 3 additions & 3 deletions presets/test/manifests/falcon-7b/falcon-7b.yaml
@@ -18,21 +18,21 @@ spec:
command:
- /bin/sh
- -c
- accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference_api.py --pipeline text-generation --torch_dtype bfloat16
- accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16
resources:
requests:
nvidia.com/gpu: 2
limits:
nvidia.com/gpu: 2 # Requesting 2 GPUs
livenessProbe:
httpGet:
path: /healthz
path: /health
port: 5000
initialDelaySeconds: 600 # 10 Min
periodSeconds: 10
readinessProbe:
httpGet:
path: /healthz
path: /health
port: 5000
initialDelaySeconds: 30
periodSeconds: 10
@@ -35,21 +35,21 @@ spec:
- |
echo "MASTER_ADDR: $MASTER_ADDR"
NODE_RANK=$(echo $HOSTNAME | grep -o '[^-]*$')
cd /workspace/llama/llama-2 && torchrun --nnodes 2 --nproc_per_node 1 --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port 29500 inference_api.py
cd /workspace/llama/llama-2 && torchrun --nnodes 2 --nproc_per_node 1 --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port 29500 /workspace/tfs/inference_api.py
resources:
limits:
nvidia.com/gpu: "1"
requests:
nvidia.com/gpu: "1"
livenessProbe:
httpGet:
path: /healthz
path: /health
port: 5000
initialDelaySeconds: 600 # 10 Min
periodSeconds: 10
readinessProbe:
httpGet:
path: /healthz
path: /health
port: 5000
initialDelaySeconds: 30
periodSeconds: 10