support infinity_emb

the-seeds · Feb 2, 2024 · 84a87f6 · 84a87f6
1 parent da73a03
commit 84a87f6
Show file tree

Hide file tree

Showing 19 changed files with 433 additions and 209 deletions.
diff --git a/.gitignore b/.gitignore
@@ -158,3 +158,6 @@ cython_debug/
 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
+
+# Config
+config/local.yaml
diff --git a/README.md b/README.md
@@ -5,45 +5,24 @@
 
 ## Usage
 
-Create a `.env` file in the root directory:
+### Install
 
-```
-.
-├── src
-└── .env
-```
 
+```bash
+pip install -U imitater
 ```
-# imitater
-AGENT_TYPE=react
-
-CHAT_MODEL_PATH=Qwen/Qwen-14B-Chat
-CHAT_MODEL_DEVICE=0
-CHAT_TEMPLATE_PATH=templates/qwen.jinja
-GENERATION_CONFIG_PATH=generation_config/qwen
-
-EMBED_MODEL_PATH=BAAI/bge-small-zh-v1.5
-EMBED_MODEL_DEVICE=1
-EMBED_BATCH_SIZE=16
 
-SERVICE_PORT=8010
+### Launch Server
 
-# tests
-OPENAI_BASE_URL=http://192.168.0.1:8010/v1
-OPENAI_API_KEY=0
+```bash
+python -m imitater.service.app -c config/example.yaml
 ```
 
 > [!NOTE]
 > [Chat template](https://huggingface.co/docs/transformers/chat_templating) is required for the chat models.
 
-## Launch Server
-
-```bash
-python src/launch.py
-```
-
-## Test Server
+### Test Server
 
 ```bash
-python tests/test_openai.py
+python tests/test_openai.py -c config/example.yaml
 ```
diff --git a/config/example.yaml b/config/example.yaml
@@ -0,0 +1,22 @@
+chat:
+  - name: gpt-3.5-turbo
+    path: Qwen/Qwen-14B-Chat
+    device:
+      - 0
+    maxlen: 4096
+    agent_type: react
+    template: templates/qwen.jinja
+    gen_config: generation_config/qwen
+    port: 8020
+
+embed:
+  - name: text-embedding-ada-002
+    path: BAAI/bge-base-zh-v1.5
+    device:
+      - 1
+    batch_size: 64
+    port: 8030
+
+service:
+  host: 127.0.0.1
+  port: 8010
diff --git a/pyproject.toml b/pyproject.toml
@@ -13,12 +13,13 @@ line-length = 119
 
 [tool.ruff.isort]
 lines-after-imports = 2
-known-first-party = ["imitater"]
 
 [isort]
 default_section = "FIRSTPARTY"
-known_first_party = "imitater"
 known_third_party = [
+    "infinity_emb",
+    "torch",
+    "transformers",
     "vllm"
 ]
 line_length = 119

diff --git a/requirements.txt b/requirements.txt
@@ -1,4 +1,5 @@
 numpy
 sse-starlette
-transformers>=4.34.0
-vllm>=0.2.6
+transformers>=4.37.2
+vllm>=0.3.0
+infinity-emb[torch]
diff --git a/setup.py b/setup.py
@@ -32,7 +32,7 @@ def main():
         url="https://github.com/the-seeds/imitater",
         package_dir={"": "src"},
         packages=find_packages("src"),
-        python_requires=">=3.8.0",
+        python_requires=">=3.9.0",
         install_requires=get_requires(),
         classifiers=[
             "Development Status :: 3 - Alpha",

diff --git a/src/imitater/__init__.py b/src/imitater/__init__.py
@@ -1,5 +1 @@
-from .service import Imitater
-
-
-__all__ = ["Imitater"]
-__version__ = "0.1.5"
+__version__ = "0.1.6"
diff --git a/src/imitater/config/__init__.py b/src/imitater/config/__init__.py
diff --git a/src/imitater/config/config.py b/src/imitater/config/config.py
diff --git a/src/imitater/model/__init__.py b/src/imitater/model/__init__.py
@@ -1,5 +1,5 @@
-from .chat_model import ChatModel
-from .embed_model import EmbedModel
+from .chat_model import ChatConfig, ChatModel
+from .embed_model import EmbedConfig, EmbedModel
 
 
-__all__ = ["ChatModel", "EmbedModel"]
+__all__ = ["ChatConfig", "ChatModel", "EmbedConfig", "EmbedModel"]
diff --git a/src/imitater/model/chat_model.py b/src/imitater/model/chat_model.py
@@ -1,42 +1,77 @@
-from typing import TYPE_CHECKING, Any, AsyncIterator, Dict, Generator, List, Tuple, Union
+from dataclasses import dataclass, fields
+from typing import TYPE_CHECKING, Any, AsyncIterator, Dict, Generator, List, Literal, Optional, Tuple, Union
 
 from transformers import AutoTokenizer, GenerationConfig
+from typing_extensions import Self
 from vllm import AsyncEngineArgs, AsyncLLMEngine, SamplingParams
 
 from ..agent import get_agent
 
 
 if TYPE_CHECKING:
+    from argparse import ArgumentParser, Namespace
+
     from vllm import RequestOutput
 
-    from ..config import Config
+
+@dataclass
+class ChatConfig:
+    name: str
+    path: str
+    device: List[int]
+    maxlen: int
+    agent_type: Literal["react", "aligned"]
+    template: Optional[str]
+    gen_config: Optional[str]
+    port: int
+
+    @staticmethod
+    def add_cli_args(parser: "ArgumentParser") -> None:
+        parser.add_argument("--name", type=str)
+        parser.add_argument("--path", type=str)
+        parser.add_argument("--device", type=int, nargs="+")
+        parser.add_argument("--maxlen", type=int, default=1024)
+        parser.add_argument("--agent_type", type=str, choices=["react", "aligned"], default="react")
+        parser.add_argument("--template", type=str, default=None)
+        parser.add_argument("--gen_config", type=str, default=None)
+        parser.add_argument("--port", type=int)
+
+    @classmethod
+    def from_cli_args(cls, args: "Namespace") -> Self:
+        attrs = [attr.name for attr in fields(cls)]
+        return cls(**{attr: getattr(args, attr) for attr in attrs})
 
 
 class ChatModel:
-    def __init__(self, config: "Config") -> None:
-        self._config = config
+    def __init__(self, config: "ChatConfig") -> None:
+        self.config = config
+        self.name = config.name
         self._agent = get_agent(config.agent_type)
         self._init_vllm_engine()
         self._load_tokenizer()
         self._load_generation_config()
 
     def _init_vllm_engine(self) -> None:
-        engine_args = AsyncEngineArgs(model=self._config.chat_model_path, trust_remote_code=True)
-        engine_args.tensor_parallel_size = len(self._config.chat_model_device)
+        engine_args = AsyncEngineArgs(
+            model=self.config.path,
+            trust_remote_code=True,
+            max_model_len=self.config.maxlen,
+            tensor_parallel_size=len(self.config.device),
+        )
         self._engine = AsyncLLMEngine.from_engine_args(engine_args)
 
     def _load_tokenizer(self) -> None:
-        self._tokenizer = AutoTokenizer.from_pretrained(self._config.chat_model_path, trust_remote_code=True)
-        if self._config.chat_template_path:
-            with open(self._config.chat_template_path, "r", encoding="utf-8") as f:
+        self._tokenizer = AutoTokenizer.from_pretrained(self.config.path, trust_remote_code=True)
+        if self.config.template:
+            with open(self.config.template, "r", encoding="utf-8") as f:
                 self._tokenizer.chat_template = f.read()
 
         if self._tokenizer.chat_template is None:
             print("Chat template is not found, use the default one.")
 
     def _load_generation_config(self) -> None:
         try:
-            generation_config_path = self._config.generation_config_path or self._config.chat_model_path
+            generation_config_path = self.config.gen_config or self.config.path
             self._generation_config = GenerationConfig.from_pretrained(generation_config_path)
         except Exception:
             self._generation_config = GenerationConfig(
@@ -62,7 +97,7 @@ def _load_generation_config(self) -> None:
             if eos_token_id != self._tokenizer.eos_token_id:
                 extra_special_tokens.append(self._tokenizer.convert_ids_to_tokens(eos_token_id))
 
-        self._engine.engine.tokenizer.add_special_tokens(
+        self._engine.engine.tokenizer.tokenizer.add_special_tokens(
             {"additional_special_tokens": extra_special_tokens}, replace_additional_special_tokens=False
         )
 

diff --git a/src/imitater/model/embed_model.py b/src/imitater/model/embed_model.py
@@ -1,49 +1,59 @@
-import asyncio
-from typing import TYPE_CHECKING, List, Optional
+from dataclasses import dataclass, fields
+from typing import TYPE_CHECKING, List
 
-import torch
-from transformers import AutoModel, AutoTokenizer
+from infinity_emb import AsyncEmbeddingEngine
+from typing_extensions import Self
 
 
 if TYPE_CHECKING:
-    from transformers import BatchEncoding, PreTrainedModel
+    from argparse import ArgumentParser, Namespace
 
-    from ..config import Config
+    from numpy import float32
+    from numpy.typing import NDArray
 
 
-@torch.inference_mode()
-def _get_embeddings(model: "PreTrainedModel", batch_encoding: "BatchEncoding") -> List[List[float]]:
-    output = model(**batch_encoding.to(model.device))
-    embeddings = output[0][:, 0]
-    embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1).tolist()
-    return embeddings
+@dataclass
+class EmbedConfig:
+    name: str
+    path: str
+    device: List[int]
+    batch_size: int
+    port: int
+
+    @staticmethod
+    def add_cli_args(parser: "ArgumentParser") -> None:
+        parser.add_argument("--name", type=str)
+        parser.add_argument("--path", type=str)
+        parser.add_argument("--device", type=int, nargs="+")
+        parser.add_argument("--batch_size", type=int, default=64)
+        parser.add_argument("--port", type=int)
+
+    @classmethod
+    def from_cli_args(cls, args: "Namespace") -> Self:
+        attrs = [attr.name for attr in fields(cls)]
+        return cls(**{attr: getattr(args, attr) for attr in attrs})
 
 
 class EmbedModel:
-    def __init__(self, config: "Config", max_tasks: Optional[int] = 5) -> None:
-        self._semaphore = asyncio.Semaphore(max_tasks)
-        self._batch_size = config.embed_batch_size
-        self._model: "PreTrainedModel" = AutoModel.from_pretrained(
-            config.embed_model_path,
-            device_map={"": config.embed_model_device[0]},
-            torch_dtype=torch.float16,
+    def __init__(self, config: "EmbedConfig") -> None:
+        self.config = config
+        self.name = config.name
+        if len(config.device) != 1:
+            raise ValueError("Embedding model only accepts one device.")
+
+        self._engine = AsyncEmbeddingEngine(
+            model_name_or_path=config.path,
+            batch_size=config.batch_size,
+            engine="torch",
+            device="cuda",
         )
-        self._model.eval()
-        self._tokenizer = AutoTokenizer.from_pretrained(config.embed_model_path)
-        self._tokenizer.padding_side = "right"
-
-    async def _run_task(self, batch_encoding: "BatchEncoding") -> List[List[float]]:
-        async with self._semaphore:
-            loop = asyncio.get_running_loop()
-            return await loop.run_in_executor(None, _get_embeddings, self._model, batch_encoding)
-
-    async def embed(self, texts: List[str]) -> List[List[float]]:
-        results = []
-        for i in range(0, len(texts), self._batch_size):
-            batch_encoding = self._tokenizer(
-                texts[i : i + self._batch_size], padding=True, truncation=True, return_tensors="pt"
-            )
-            embeddings = await self._run_task(batch_encoding)
-            results.extend(embeddings)
-
-        return results
+
+    async def startup(self) -> None:
+        await self._engine.astart()
+
+    async def shutdown(self) -> None:
+        await self._engine.astop()
+
+    async def embed(self, texts: List[str]) -> List["NDArray[float32]"]:
+        embeddings, _ = await self._engine.embed(texts)
+        return embeddings
diff --git a/src/imitater/service/__init__.py b/src/imitater/service/__init__.py
@@ -1,4 +0,0 @@
-from .app import Imitater
-
-
-__all__ = ["Imitater"]
Original file line number	Diff line number	Diff line change
		@@ -1,4 +0,0 @@
		from .app import Imitater


		__all__ = ["Imitater"]