From 95a30f3185d51923fb90d39ed68a611b0e4eee77 Mon Sep 17 00:00:00 2001
From: hiyouga
Date: Tue, 23 Jan 2024 18:30:37 +0800
Subject: [PATCH] remove vllm patch

---
 requirements.txt                        |  4 +-
 src/imitater/config/config.py           |  2 -
 src/imitater/model/chat_model.py        |  4 --
 src/imitater/utils/vllm_monkey_patch.py | 68 -------------------------
 4 files changed, 2 insertions(+), 76 deletions(-)
 delete mode 100644 src/imitater/utils/vllm_monkey_patch.py

diff --git a/requirements.txt b/requirements.txt
index a3503c4..526d3a7 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
 numpy
 sse-starlette
-transformers
-vllm==0.2.6
+transformers>=4.34.0
+vllm>=0.2.6
diff --git a/src/imitater/config/config.py b/src/imitater/config/config.py
index 97c6d55..5b36d85 100644
--- a/src/imitater/config/config.py
+++ b/src/imitater/config/config.py
@@ -14,5 +14,3 @@ class Config:
     embed_model_path: str
     embed_model_device: List[int]
     embed_batch_size: int
-
-    enable_attn_bias: bool
diff --git a/src/imitater/model/chat_model.py b/src/imitater/model/chat_model.py
index 27e6261..94e2b74 100644
--- a/src/imitater/model/chat_model.py
+++ b/src/imitater/model/chat_model.py
@@ -4,7 +4,6 @@
 from vllm import AsyncEngineArgs, AsyncLLMEngine, SamplingParams
 
 from ..agent import get_agent
-from ..utils.vllm_monkey_patch import llama_attn_bias_monkey_patch
 
 
 if TYPE_CHECKING:
@@ -22,9 +21,6 @@ def __init__(self, config: "Config") -> None:
         self._load_generation_config()
 
     def _init_vllm_engine(self) -> None:
-        if self._config.enable_attn_bias:
-            llama_attn_bias_monkey_patch()
-
         engine_args = AsyncEngineArgs(model=self._config.chat_model_path, trust_remote_code=True)
         engine_args.tensor_parallel_size = len(self._config.chat_model_device)
         self._engine = AsyncLLMEngine.from_engine_args(engine_args)
diff --git a/src/imitater/utils/vllm_monkey_patch.py b/src/imitater/utils/vllm_monkey_patch.py
deleted file mode 100644
index 30cc6b4..0000000
--- a/src/imitater/utils/vllm_monkey_patch.py
+++ /dev/null
@@ -1,68 +0,0 @@
-from typing import Any, Dict, Optional
-
-import torch.nn as nn
-from vllm.model_executor.layers.attention import PagedAttention
-from vllm.model_executor.layers.linear import LinearMethodBase, QKVParallelLinear, RowParallelLinear
-from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.models.llama import LlamaAttention
-from vllm.model_executor.parallel_utils.parallel_state import get_tensor_model_parallel_world_size
-
-
-def __init__(
-    self: "LlamaAttention",
-    hidden_size: int,
-    num_heads: int,
-    num_kv_heads: int,
-    rope_theta: Optional[float] = 10000,
-    rope_scaling: Optional[Dict[str, Any]] = None,
-    max_position_embeddings: int = 8192,
-    linear_method: Optional[LinearMethodBase] = None,
-) -> None:
-    nn.Module.__init__(self)
-    self.hidden_size = hidden_size
-    tp_size = get_tensor_model_parallel_world_size()
-    self.total_num_heads = num_heads
-    assert self.total_num_heads % tp_size == 0
-    self.num_heads = self.total_num_heads // tp_size
-    self.total_num_kv_heads = num_kv_heads
-
-    if self.total_num_kv_heads >= tp_size:
-        # Number of KV heads is greater than TP size, so we partition
-        # the KV heads across multiple tensor parallel GPUs.
-        assert self.total_num_kv_heads % tp_size == 0
-    else:
-        # Number of KV heads is less than TP size, so we replicate
-        # the KV heads across multiple tensor parallel GPUs.
-        assert tp_size % self.total_num_kv_heads == 0
-
-    self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
-    self.head_dim = hidden_size // self.total_num_heads
-    self.q_size = self.num_heads * self.head_dim
-    self.kv_size = self.num_kv_heads * self.head_dim
-    self.scaling = self.head_dim**-0.5
-    self.rope_theta = rope_theta
-    self.max_position_embeddings = max_position_embeddings
-
-    self.qkv_proj = QKVParallelLinear(
-        hidden_size,
-        self.head_dim,
-        self.total_num_heads,
-        self.total_num_kv_heads,
-        bias=True,
-        linear_method=linear_method,
-    )
-    self.o_proj = RowParallelLinear(
-        self.total_num_heads * self.head_dim, hidden_size, bias=True, linear_method=linear_method
-    )
-    self.rotary_emb = get_rope(
-        self.head_dim,
-        rotary_dim=self.head_dim,
-        max_position=max_position_embeddings,
-        base=rope_theta,
-        rope_scaling=rope_scaling,
-    )
-    self.attn = PagedAttention(self.num_heads, self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads)
-
-
-def llama_attn_bias_monkey_patch():
-    LlamaAttention.__init__ = __init__
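
For context, what this commit deletes is a class-level monkey patch: `llama_attn_bias_monkey_patch` reassigned `LlamaAttention.__init__` so that the QKV and output projections were constructed with `bias=True`, gated behind the now-removed `enable_attn_bias` config flag. Below is a minimal, self-contained sketch of that general pattern (replace a constructor on the class before any instance is created). The names `Attention`, `patched_init`, `apply_attn_bias_patch`, and `AppConfig` are illustrative only and are not part of vllm or this repository.

```python
# Minimal sketch of the class-level monkey-patch pattern removed by this commit.
# All names here (Attention, patched_init, AppConfig) are hypothetical.
from dataclasses import dataclass


class Attention:
    """Stand-in for a library class whose constructor we want to override."""

    def __init__(self, hidden_size: int) -> None:
        self.hidden_size = hidden_size
        self.bias = False  # the stock constructor builds projections without bias


def patched_init(self: "Attention", hidden_size: int) -> None:
    """Replacement constructor that enables bias, mirroring the removed patch."""
    self.hidden_size = hidden_size
    self.bias = True


def apply_attn_bias_patch() -> None:
    # Swap the constructor on the class itself, so every later instantiation
    # (including ones made deep inside the library) picks up the new behavior.
    Attention.__init__ = patched_init


@dataclass
class AppConfig:
    enable_attn_bias: bool = False


def init_engine(config: AppConfig) -> Attention:
    if config.enable_attn_bias:
        apply_attn_bias_patch()  # must run before the class is instantiated
    return Attention(hidden_size=4096)


if __name__ == "__main__":
    print(init_engine(AppConfig(enable_attn_bias=True)).bias)  # prints: True
```

The key design point the sketch illustrates is ordering: because the patch mutates the class object itself, it must be applied before the engine constructs any attention layers, which is why the original call sat at the top of `_init_vllm_engine`.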