Skip to content

Commit

Permalink
support infinity_emb
Browse files Browse the repository at this point in the history
  • Loading branch information
hiyouga committed Feb 2, 2024
1 parent da73a03 commit 84a87f6
Show file tree
Hide file tree
Showing 19 changed files with 433 additions and 209 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -158,3 +158,6 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

# Config
config/local.yaml
37 changes: 8 additions & 29 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,45 +5,24 @@

## Usage

Create a `.env` file in the root directory:
### Install

```
.
├── src
└── .env
```

```bash
pip install -U imitater
```
# imitater
AGENT_TYPE=react
CHAT_MODEL_PATH=Qwen/Qwen-14B-Chat
CHAT_MODEL_DEVICE=0
CHAT_TEMPLATE_PATH=templates/qwen.jinja
GENERATION_CONFIG_PATH=generation_config/qwen
EMBED_MODEL_PATH=BAAI/bge-small-zh-v1.5
EMBED_MODEL_DEVICE=1
EMBED_BATCH_SIZE=16

SERVICE_PORT=8010
### Launch Server

# tests
OPENAI_BASE_URL=http://192.168.0.1:8010/v1
OPENAI_API_KEY=0
```bash
python -m imitater.service.app -c config/example.yaml
```

> [!NOTE]
> [Chat template](https://huggingface.co/docs/transformers/chat_templating) is required for the chat models.
## Launch Server

```bash
python src/launch.py
```

## Test Server
### Test Server

```bash
python tests/test_openai.py
python tests/test_openai.py -c config/example.yaml
```
22 changes: 22 additions & 0 deletions config/example.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
chat:
- name: gpt-3.5-turbo
path: Qwen/Qwen-14B-Chat
device:
- 0
maxlen: 4096
agent_type: react
template: templates/qwen.jinja
gen_config: generation_config/qwen
port: 8020

embed:
- name: text-embedding-ada-002
path: BAAI/bge-base-zh-v1.5
device:
- 1
batch_size: 64
port: 8030

service:
host: 127.0.0.1
port: 8010
5 changes: 3 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,13 @@ line-length = 119

[tool.ruff.isort]
lines-after-imports = 2
known-first-party = ["imitater"]

[isort]
default_section = "FIRSTPARTY"
known_first_party = "imitater"
known_third_party = [
"infinity_emb",
"torch",
"transformers",
"vllm"
]
line_length = 119
Expand Down
5 changes: 3 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
numpy
sse-starlette
transformers>=4.34.0
vllm>=0.2.6
transformers>=4.37.2
vllm>=0.3.0
infinity-emb[torch]
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def main():
url="https://github.com/the-seeds/imitater",
package_dir={"": "src"},
packages=find_packages("src"),
python_requires=">=3.8.0",
python_requires=">=3.9.0",
install_requires=get_requires(),
classifiers=[
"Development Status :: 3 - Alpha",
Expand Down
6 changes: 1 addition & 5 deletions src/imitater/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1 @@
from .service import Imitater


__all__ = ["Imitater"]
__version__ = "0.1.5"
__version__ = "0.1.6"
4 changes: 0 additions & 4 deletions src/imitater/config/__init__.py

This file was deleted.

16 changes: 0 additions & 16 deletions src/imitater/config/config.py

This file was deleted.

6 changes: 3 additions & 3 deletions src/imitater/model/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from .chat_model import ChatModel
from .embed_model import EmbedModel
from .chat_model import ChatConfig, ChatModel
from .embed_model import EmbedConfig, EmbedModel


__all__ = ["ChatModel", "EmbedModel"]
__all__ = ["ChatConfig", "ChatModel", "EmbedConfig", "EmbedModel"]
57 changes: 46 additions & 11 deletions src/imitater/model/chat_model.py
Original file line number Diff line number Diff line change
@@ -1,42 +1,77 @@
from typing import TYPE_CHECKING, Any, AsyncIterator, Dict, Generator, List, Tuple, Union
from dataclasses import dataclass, fields
from typing import TYPE_CHECKING, Any, AsyncIterator, Dict, Generator, List, Literal, Optional, Tuple, Union

from transformers import AutoTokenizer, GenerationConfig
from typing_extensions import Self
from vllm import AsyncEngineArgs, AsyncLLMEngine, SamplingParams

from ..agent import get_agent


if TYPE_CHECKING:
from argparse import ArgumentParser, Namespace

from vllm import RequestOutput

from ..config import Config

@dataclass
class ChatConfig:
    """Settings for a single chat model, buildable from parsed CLI arguments."""

    # Model name; ``ChatModel`` copies it to its own ``name`` attribute.
    name: str
    # Model location handed to the vLLM engine args and to ``AutoTokenizer``.
    path: str
    # Device ids; ``ChatModel`` uses their count as the tensor-parallel size.
    device: List[int]
    # Forwarded to the vLLM engine args as ``max_model_len``.
    maxlen: int
    # Agent flavour resolved through ``get_agent``.
    agent_type: Literal["react", "aligned"]
    # Optional file whose contents replace the tokenizer's chat template.
    template: Optional[str]
    # Optional generation-config location; ``ChatModel`` falls back to ``path``.
    gen_config: Optional[str]
    # NOTE(review): the consumer of ``port`` is not visible here -- confirm usage.
    port: int

    @staticmethod
    def add_cli_args(parser: "ArgumentParser") -> None:
        """Register one ``--<field>`` option on ``parser`` per config field.

        Options without an explicit default resolve to ``None`` when omitted.
        """
        option_table = (
            ("--name", {"type": str}),
            ("--path", {"type": str}),
            ("--device", {"type": int, "nargs": "+"}),
            ("--maxlen", {"type": int, "default": 1024}),
            ("--agent_type", {"type": str, "choices": ["react", "aligned"], "default": "react"}),
            ("--template", {"type": str, "default": None}),
            ("--gen_config", {"type": str, "default": None}),
            ("--port", {"type": int}),
        )
        # Registration order is kept so the generated help text is unchanged.
        for flag, options in option_table:
            parser.add_argument(flag, **options)

    @classmethod
    def from_cli_args(cls, args: "Namespace") -> Self:
        """Build a config by reading every dataclass field off ``args``.

        Attributes on ``args`` that are not fields of this class are ignored;
        a missing field attribute raises ``AttributeError``.
        """
        values = {}
        for spec in fields(cls):
            values[spec.name] = getattr(args, spec.name)

        return cls(**values)


class ChatModel:
def __init__(self, config: "Config") -> None:
self._config = config
def __init__(self, config: "ChatConfig") -> None:
self.config = config
self.name = config.name
self._agent = get_agent(config.agent_type)
self._init_vllm_engine()
self._load_tokenizer()
self._load_generation_config()

def _init_vllm_engine(self) -> None:
engine_args = AsyncEngineArgs(model=self._config.chat_model_path, trust_remote_code=True)
engine_args.tensor_parallel_size = len(self._config.chat_model_device)
engine_args = AsyncEngineArgs(
model=self.config.path,
trust_remote_code=True,
max_model_len=self.config.maxlen,
tensor_parallel_size=len(self.config.device),
)
self._engine = AsyncLLMEngine.from_engine_args(engine_args)

def _load_tokenizer(self) -> None:
self._tokenizer = AutoTokenizer.from_pretrained(self._config.chat_model_path, trust_remote_code=True)
if self._config.chat_template_path:
with open(self._config.chat_template_path, "r", encoding="utf-8") as f:
self._tokenizer = AutoTokenizer.from_pretrained(self.config.path, trust_remote_code=True)
if self.config.template:
with open(self.config.template, "r", encoding="utf-8") as f:
self._tokenizer.chat_template = f.read()

if self._tokenizer.chat_template is None:
print("Chat template is not found, use the default one.")

def _load_generation_config(self) -> None:
try:
generation_config_path = self._config.generation_config_path or self._config.chat_model_path
generation_config_path = self.config.gen_config or self.config.path
self._generation_config = GenerationConfig.from_pretrained(generation_config_path)
except Exception:
self._generation_config = GenerationConfig(
Expand All @@ -62,7 +97,7 @@ def _load_generation_config(self) -> None:
if eos_token_id != self._tokenizer.eos_token_id:
extra_special_tokens.append(self._tokenizer.convert_ids_to_tokens(eos_token_id))

self._engine.engine.tokenizer.add_special_tokens(
self._engine.engine.tokenizer.tokenizer.add_special_tokens(
{"additional_special_tokens": extra_special_tokens}, replace_additional_special_tokens=False
)

Expand Down
86 changes: 48 additions & 38 deletions src/imitater/model/embed_model.py
Original file line number Diff line number Diff line change
@@ -1,49 +1,59 @@
import asyncio
from typing import TYPE_CHECKING, List, Optional
from dataclasses import dataclass, fields
from typing import TYPE_CHECKING, List

import torch
from transformers import AutoModel, AutoTokenizer
from infinity_emb import AsyncEmbeddingEngine
from typing_extensions import Self


if TYPE_CHECKING:
from transformers import BatchEncoding, PreTrainedModel
from argparse import ArgumentParser, Namespace

from ..config import Config
from numpy import float32
from numpy.typing import NDArray


@torch.inference_mode()
def _get_embeddings(model: "PreTrainedModel", batch_encoding: "BatchEncoding") -> List[List[float]]:
output = model(**batch_encoding.to(model.device))
embeddings = output[0][:, 0]
embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1).tolist()
return embeddings
@dataclass
class EmbedConfig:
    """Settings for a single embedding model, buildable from parsed CLI arguments."""

    # Model name; ``EmbedModel`` copies it to its own ``name`` attribute.
    name: str
    # Model location passed to the embedding engine as ``model_name_or_path``.
    path: str
    # Device ids; ``EmbedModel`` rejects anything other than exactly one entry.
    device: List[int]
    # Batch size forwarded to the embedding engine.
    batch_size: int
    # NOTE(review): the consumer of ``port`` is not visible here -- confirm usage.
    port: int

    @staticmethod
    def add_cli_args(parser: "ArgumentParser") -> None:
        """Register one ``--<field>`` option on ``parser`` per config field.

        Options without an explicit default resolve to ``None`` when omitted.
        """
        option_table = (
            ("--name", {"type": str}),
            ("--path", {"type": str}),
            ("--device", {"type": int, "nargs": "+"}),
            ("--batch_size", {"type": int, "default": 64}),
            ("--port", {"type": int}),
        )
        # Registration order is kept so the generated help text is unchanged.
        for flag, options in option_table:
            parser.add_argument(flag, **options)

    @classmethod
    def from_cli_args(cls, args: "Namespace") -> Self:
        """Build a config by reading every dataclass field off ``args``.

        Attributes on ``args`` that are not fields of this class are ignored;
        a missing field attribute raises ``AttributeError``.
        """
        values = {}
        for spec in fields(cls):
            values[spec.name] = getattr(args, spec.name)

        return cls(**values)


class EmbedModel:
def __init__(self, config: "Config", max_tasks: Optional[int] = 5) -> None:
self._semaphore = asyncio.Semaphore(max_tasks)
self._batch_size = config.embed_batch_size
self._model: "PreTrainedModel" = AutoModel.from_pretrained(
config.embed_model_path,
device_map={"": config.embed_model_device[0]},
torch_dtype=torch.float16,
def __init__(self, config: "EmbedConfig") -> None:
self.config = config
self.name = config.name
if len(config.device) != 1:
raise ValueError("Embedding model only accepts one device.")

self._engine = AsyncEmbeddingEngine(
model_name_or_path=config.path,
batch_size=config.batch_size,
engine="torch",
device="cuda",
)
self._model.eval()
self._tokenizer = AutoTokenizer.from_pretrained(config.embed_model_path)
self._tokenizer.padding_side = "right"

async def _run_task(self, batch_encoding: "BatchEncoding") -> List[List[float]]:
async with self._semaphore:
loop = asyncio.get_running_loop()
return await loop.run_in_executor(None, _get_embeddings, self._model, batch_encoding)

async def embed(self, texts: List[str]) -> List[List[float]]:
results = []
for i in range(0, len(texts), self._batch_size):
batch_encoding = self._tokenizer(
texts[i : i + self._batch_size], padding=True, truncation=True, return_tensors="pt"
)
embeddings = await self._run_task(batch_encoding)
results.extend(embeddings)

return results

async def startup(self) -> None:
await self._engine.astart()

async def shutdown(self) -> None:
await self._engine.astop()

async def embed(self, texts: List[str]) -> List["NDArray[float32]"]:
embeddings, _ = await self._engine.embed(texts)
return embeddings
4 changes: 0 additions & 4 deletions src/imitater/service/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +0,0 @@
from .app import Imitater


__all__ = ["Imitater"]
Loading

0 comments on commit 84a87f6

Please sign in to comment.