feat: support adaptive max_model_len #657

Merged · 7 commits · Nov 12, 2024
Changes from 5 commits
2 changes: 1 addition & 1 deletion docker/presets/models/tfs/Dockerfile
@@ -1,4 +1,4 @@
FROM python:3.10-slim@sha256:684b1aaf96a7942b3c3af438d162e0baa3510aa7af25ad76d238e0c746bdec79
FROM python:3.12-slim

ARG WEIGHTS_PATH
ARG MODEL_TYPE
2 changes: 2 additions & 0 deletions pkg/utils/test/testModel.go
@@ -16,6 +16,7 @@ func (*testModel) GetInferenceParameters() *model.PresetParam {
return &model.PresetParam{
GPUCountRequirement: "1",
ReadinessTimeout: time.Duration(30) * time.Minute,
BaseCommand: "python3",
}
}
func (*testModel) GetTuningParameters() *model.PresetParam {
@@ -37,6 +38,7 @@ func (*testDistributedModel) GetInferenceParameters() *model.PresetParam {
return &model.PresetParam{
GPUCountRequirement: "1",
ReadinessTimeout: time.Duration(30) * time.Minute,
BaseCommand: "python3",
}
}
func (*testDistributedModel) GetTuningParameters() *model.PresetParam {
4 changes: 2 additions & 2 deletions pkg/workspace/inference/preset-inferences.go
@@ -23,9 +23,9 @@ import (
)

const (
ProbePath = "/healthz"
ProbePath = "/health"
Port5000 = int32(5000)
InferenceFile = "inference_api.py"
InferenceFile = "/workspace/tfs/inference_api.py"
)

var (
6 changes: 3 additions & 3 deletions pkg/workspace/inference/preset-inferences_test.go
@@ -46,7 +46,7 @@ func TestCreatePresetInference(t *testing.T) {
workload: "Deployment",
// BaseCommand is python3 on the test model; no TorchRunParams, TorchRunRdzvParams, or ModelRunParams
// So the expected cmd consists of the shell command, the base command, and the inference file path
expectedCmd: "/bin/sh -c inference_api.py",
expectedCmd: "/bin/sh -c python3 /workspace/tfs/inference_api.py",
hasAdapters: false,
},

@@ -58,7 +58,7 @@ func TestCreatePresetInference(t *testing.T) {
c.On("Create", mock.IsType(context.TODO()), mock.IsType(&appsv1.StatefulSet{}), mock.Anything).Return(nil)
},
workload: "StatefulSet",
expectedCmd: "/bin/sh -c inference_api.py",
expectedCmd: "/bin/sh -c python3 /workspace/tfs/inference_api.py",
hasAdapters: false,
},

@@ -69,7 +69,7 @@ func TestCreatePresetInference(t *testing.T) {
c.On("Create", mock.IsType(context.TODO()), mock.IsType(&appsv1.Deployment{}), mock.Anything).Return(nil)
},
workload: "Deployment",
expectedCmd: "/bin/sh -c inference_api.py",
expectedCmd: "/bin/sh -c python3 /workspace/tfs/inference_api.py",
hasAdapters: true,
expectedVolume: "adapter-volume",
},
62 changes: 58 additions & 4 deletions presets/inference/vllm/inference_api.py
@@ -1,11 +1,14 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import logging
import gc
import os

import uvloop
import torch
from vllm.utils import FlexibleArgumentParser
import vllm.entrypoints.openai.api_server as api_server
from vllm.engine.llm_engine import (LLMEngine, EngineArgs, EngineConfig)

# Initialize logger
logger = logging.getLogger(__name__)
@@ -26,22 +29,73 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
# See https://docs.vllm.ai/en/latest/models/engine_args.html for more args
engine_default_args = {
"model": "/workspace/vllm/weights",
"cpu-offload-gb": 0,
"gpu-memory-utilization": 0.9,
"swap-space": 4,
"disable-log-stats": False,
"cpu_offload_gb": 0,
"gpu_memory_utilization": 0.95,
"swap_space": 4,
"disable_log_stats": False,
"uvicorn_log_level": "error"
}
parser.set_defaults(**engine_default_args)

return parser

def find_max_available_seq_len(engine_config: EngineConfig) -> int:
"""
Load model and run profiler to find max available seq len.
"""
# see https://github.com/vllm-project/vllm/blob/v0.6.3/vllm/engine/llm_engine.py#L335
executor_class = LLMEngine._get_executor_cls(engine_config)
executor = executor_class(
model_config=engine_config.model_config,
cache_config=engine_config.cache_config,
parallel_config=engine_config.parallel_config,
scheduler_config=engine_config.scheduler_config,
device_config=engine_config.device_config,
lora_config=engine_config.lora_config,
speculative_config=engine_config.speculative_config,
load_config=engine_config.load_config,
prompt_adapter_config=engine_config.prompt_adapter_config,
observability_config=engine_config.observability_config,
)

# see https://github.com/vllm-project/vllm/blob/v0.6.3/vllm/engine/llm_engine.py#L477
num_gpu_blocks, _ = executor.determine_num_available_blocks()

# release memory
del executor
gc.collect()
torch.cuda.empty_cache()

return engine_config.cache_config.block_size * num_gpu_blocks

if __name__ == "__main__":
parser = FlexibleArgumentParser(description='vLLM serving server')
parser = api_server.make_arg_parser(parser)
parser = make_arg_parser(parser)
args = parser.parse_args()

if args.max_model_len is None:
engine_args = EngineArgs.from_cli_args(args)
# Read the model config from the HF weights path.
# vLLM uses a different config parser for each model architecture
# and loads the result into a unified EngineConfig.
engine_config = engine_args.create_engine_config()

logger.info("Try run profiler to find max available seq len")
available_seq_len = find_max_available_seq_len(engine_config)
# see https://github.com/vllm-project/vllm/blob/v0.6.3/vllm/worker/worker.py#L262
if available_seq_len <= 0:
raise ValueError("No available memory for the cache blocks. "
"Try increasing `gpu_memory_utilization` when "
"initializing the engine.")
max_model_len = engine_config.model_config.max_model_len
if available_seq_len > max_model_len:
available_seq_len = max_model_len

if available_seq_len != max_model_len:
logger.info(f"Set max_model_len from {max_model_len} to {available_seq_len}")
args.max_model_len = available_seq_len

# Run the serving server
logger.info(f"Starting server on port {args.port}")
# See https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html for more
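
Taken together, the new logic above amounts to a clamp: vLLM's profiler reports how many KV-cache blocks fit in GPU memory, the usable sequence length is `block_size * num_gpu_blocks`, and `max_model_len` is lowered to that value only when the model's configured context length does not fit. A standalone sketch of that arithmetic (not part of this PR; the helper name and the numbers are illustrative):

```python
def clamp_max_model_len(block_size: int, num_gpu_blocks: int,
                        configured_max_model_len: int) -> int:
    """Mirror of the clamping logic above: cap max_model_len at the
    sequence length the profiled KV cache can actually hold."""
    available_seq_len = block_size * num_gpu_blocks
    if available_seq_len <= 0:
        raise ValueError("No available memory for the cache blocks. "
                         "Try increasing `gpu_memory_utilization` when "
                         "initializing the engine.")
    return min(available_seq_len, configured_max_model_len)

# Example: 16-token blocks, 2048 free GPU blocks, a 128k-context model.
# 16 * 2048 = 32768 < 131072, so max_model_len is lowered to 32768.
print(clamp_max_model_len(16, 2048, 131072))  # 32768
```

With this in place, the intent appears to be that the server no longer fails at startup when the configured context length exceeds what the KV cache can hold; it serves the largest length that fits instead.
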
45 changes: 45 additions & 0 deletions presets/models/phi3/model.go
@@ -28,19 +28,25 @@ func init() {
Name: PresetPhi3Medium128kModel,
Instance: &phi3MediumB,
})
plugin.KaitoModelRegister.Register(&plugin.Registration{
Name: PresetPhi3_5MiniInstruct,
Instance: &phi3_5MiniC,
})
}

var (
PresetPhi3Mini4kModel = "phi-3-mini-4k-instruct"
PresetPhi3Mini128kModel = "phi-3-mini-128k-instruct"
PresetPhi3Medium4kModel = "phi-3-medium-4k-instruct"
PresetPhi3Medium128kModel = "phi-3-medium-128k-instruct"
PresetPhi3_5MiniInstruct = "phi-3.5-mini-instruct"

PresetPhiTagMap = map[string]string{
"Phi3Mini4kInstruct": "0.0.2",
"Phi3Mini128kInstruct": "0.0.2",
"Phi3Medium4kInstruct": "0.0.2",
"Phi3Medium128kInstruct": "0.0.2",
"Phi3_5MiniInstruct": "0.0.1",
}

baseCommandPresetPhiInference = "accelerate launch"
@@ -130,6 +136,45 @@ func (*phi3Mini128KInst) SupportTuning() bool {
return true
}

var phi3_5MiniC phi3_5MiniInst

type phi3_5MiniInst struct{}

func (*phi3_5MiniInst) GetInferenceParameters() *model.PresetParam {
return &model.PresetParam{
ModelFamilyName: "Phi3_5",
ImageAccessMode: string(kaitov1alpha1.ModelImageAccessModePublic),
DiskStorageRequirement: "50Gi",
GPUCountRequirement: "1",
TotalGPUMemoryRequirement: "8Gi",
PerGPUMemoryRequirement: "0Gi", // We run Phi using native vertical model parallel, no per GPU memory requirement.
TorchRunParams: inference.DefaultAccelerateParams,
ModelRunParams: phiRunParams,
ReadinessTimeout: time.Duration(30) * time.Minute,
BaseCommand: baseCommandPresetPhiInference,
Tag: PresetPhiTagMap["Phi3_5MiniInstruct"],
}
}
func (*phi3_5MiniInst) GetTuningParameters() *model.PresetParam {
return &model.PresetParam{
ModelFamilyName: "Phi3_5",
ImageAccessMode: string(kaitov1alpha1.ModelImageAccessModePublic),
DiskStorageRequirement: "50Gi",
GPUCountRequirement: "1",
TotalGPUMemoryRequirement: "72Gi",
PerGPUMemoryRequirement: "72Gi",
// TorchRunParams: inference.DefaultAccelerateParams,
// ModelRunParams: phiRunParams,
ReadinessTimeout: time.Duration(30) * time.Minute,
BaseCommand: baseCommandPresetPhiTuning,
Tag: PresetPhiTagMap["Phi3_5MiniInstruct"],
}
}
func (*phi3_5MiniInst) SupportDistributedInference() bool { return false }
func (*phi3_5MiniInst) SupportTuning() bool {
return true
}

var phi3MediumA Phi3Medium4kInstruct

type Phi3Medium4kInstruct struct{}
2 changes: 1 addition & 1 deletion presets/models/supported_models.yaml
@@ -134,4 +134,4 @@ models:
tag: 0.0.2
# Tag history:
# 0.0.2 - Add Logging & Metrics Server
# 0.0.1 - Initial Release
# 0.0.1 - Initial Release
@@ -19,21 +19,21 @@ spec:
command:
- /bin/sh
- -c
- accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference_api.py --pipeline text-generation --torch_dtype bfloat16
- accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16
resources:
requests:
nvidia.com/gpu: 4 # Requesting 4 GPUs
limits:
nvidia.com/gpu: 4
livenessProbe:
httpGet:
path: /healthz
path: /health
port: 5000
initialDelaySeconds: 600 # 10 Min
periodSeconds: 10
readinessProbe:
httpGet:
path: /healthz
path: /health
port: 5000
initialDelaySeconds: 30
periodSeconds: 10
6 changes: 3 additions & 3 deletions presets/test/manifests/falcon-40b/falcon-40b.yaml
@@ -19,21 +19,21 @@ spec:
command:
- /bin/sh
- -c
- accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference_api.py --pipeline text-generation --torch_dtype bfloat16
- accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16
resources:
requests:
nvidia.com/gpu: 4 # Requesting 4 GPUs
limits:
nvidia.com/gpu: 4
livenessProbe:
httpGet:
path: /healthz
path: /health
port: 5000
initialDelaySeconds: 600 # 10 Min
periodSeconds: 10
readinessProbe:
httpGet:
path: /healthz
path: /health
port: 5000
initialDelaySeconds: 30
periodSeconds: 10
@@ -30,21 +30,21 @@ spec:
command:
- /bin/sh
- -c
- accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference_api.py --pipeline text-generation --torch_dtype bfloat16
- accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16
resources:
requests:
nvidia.com/gpu: 2
limits:
nvidia.com/gpu: 2 # Requesting 2 GPUs
livenessProbe:
httpGet:
path: /healthz
path: /health
port: 5000
initialDelaySeconds: 600 # 10 Min
periodSeconds: 10
readinessProbe:
httpGet:
path: /healthz
path: /health
port: 5000
initialDelaySeconds: 30
periodSeconds: 10
@@ -18,21 +18,21 @@ spec:
command:
- /bin/sh
- -c
- accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference_api.py --pipeline text-generation --torch_dtype bfloat16
- accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16
resources:
requests:
nvidia.com/gpu: 2
limits:
nvidia.com/gpu: 2 # Requesting 2 GPUs
livenessProbe:
httpGet:
path: /healthz
path: /health
port: 5000
initialDelaySeconds: 600 # 10 Min
periodSeconds: 10
readinessProbe:
httpGet:
path: /healthz
path: /health
port: 5000
initialDelaySeconds: 30
periodSeconds: 10
Expand Down
@@ -29,7 +29,7 @@ spec:
command:
- /bin/sh
- -c
- accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference_api.py --pipeline text-generation --torch_dtype bfloat16
- accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16
resources:
requests:
nvidia.com/gpu: 2
6 changes: 3 additions & 3 deletions presets/test/manifests/falcon-7b/falcon-7b.yaml
@@ -18,21 +18,21 @@ spec:
command:
- /bin/sh
- -c
- accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all inference_api.py --pipeline text-generation --torch_dtype bfloat16
- accelerate launch --num_processes 1 --num_machines 1 --machine_rank 0 --gpu_ids all /workspace/tfs/inference_api.py --pipeline text-generation --torch_dtype bfloat16
resources:
requests:
nvidia.com/gpu: 2
limits:
nvidia.com/gpu: 2 # Requesting 2 GPUs
livenessProbe:
httpGet:
path: /healthz
path: /health
port: 5000
initialDelaySeconds: 600 # 10 Min
periodSeconds: 10
readinessProbe:
httpGet:
path: /healthz
path: /health
port: 5000
initialDelaySeconds: 30
periodSeconds: 10
@@ -35,21 +35,21 @@ spec:
- |
echo "MASTER_ADDR: $MASTER_ADDR"
NODE_RANK=$(echo $HOSTNAME | grep -o '[^-]*$')
cd /workspace/llama/llama-2 && torchrun --nnodes 2 --nproc_per_node 1 --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port 29500 inference_api.py
cd /workspace/llama/llama-2 && torchrun --nnodes 2 --nproc_per_node 1 --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port 29500 /workspace/tfs/inference_api.py
resources:
limits:
nvidia.com/gpu: "1"
requests:
nvidia.com/gpu: "1"
livenessProbe:
httpGet:
path: /healthz
path: /health
port: 5000
initialDelaySeconds: 600 # 10 Min
periodSeconds: 10
readinessProbe:
httpGet:
path: /healthz
path: /health
port: 5000
initialDelaySeconds: 30
periodSeconds: 10