
Commit

Merge branch 'master' into tohtana/offload_states
tohtana authored Dec 13, 2024
2 parents d25c32c + be0a0e1 commit f5c0ef6
Showing 205 changed files with 37,422 additions and 39 deletions.
17 changes: 4 additions & 13 deletions applications/DeepSpeed-Chat/dschat/rlhf/rlhf_engine.py
@@ -268,31 +268,22 @@ def _init_reward(self, critic_model_name_or_path):
# If critic is ZeRO-3 then we use it for everything, otherwise assume we have enough memory
zero_stage = 0

ds_config = get_eval_ds_config(offload=self.args.offload,
ds_config = get_eval_ds_config(offload=self.args.offload_reward_model,
dtype=self.args.dtype,
stage=zero_stage)
ds_config[
'train_micro_batch_size_per_gpu'] = self.args.per_device_training_batch_size
ds_config[
'train_batch_size'] = self.args.per_device_training_batch_size * torch.distributed.get_world_size(
) * self.args.gradient_accumulation_steps

ds_eval_config = get_eval_ds_config(offload=False,
dtype=self.args.dtype,
stage=zero_stage)

# We need to set train batch size and micro batch size here to pass the sanity check of DeepSpeed engine.
ds_eval_config[
ds_config[
'train_micro_batch_size_per_gpu'] = self.args.per_device_training_batch_size
ds_eval_config[
ds_config[
'train_batch_size'] = self.args.per_device_training_batch_size * torch.distributed.get_world_size(
) * self.args.gradient_accumulation_steps

# Model
reward_model = create_critic_model(
model_name_or_path=critic_model_name_or_path,
tokenizer=self.tokenizer,
ds_config=ds_eval_config,
ds_config=ds_config,
num_padding_at_beginning=self.args.num_padding_at_beginning,
rlhf_training=True,
dropout=self.args.critic_dropout,
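For context on the batch-size assignments kept above: DeepSpeed's engine sanity-checks that train_batch_size equals train_micro_batch_size_per_gpu × data-parallel world size × gradient_accumulation_steps, which is why the single consolidated ds_config still fills in both fields. A minimal sketch of that arithmetic (function name and values are illustrative, not from the diff):

# Sketch: populate the two batch-size fields so DeepSpeed's consistency check passes.
# In the real code, world_size comes from torch.distributed.get_world_size().
def fill_batch_sizes(ds_config, per_device_batch=4, world_size=8, grad_accum_steps=2):
    ds_config["train_micro_batch_size_per_gpu"] = per_device_batch
    ds_config["train_batch_size"] = per_device_batch * world_size * grad_accum_steps
    return ds_config

print(fill_batch_sizes({}))  # {'train_micro_batch_size_per_gpu': 4, 'train_batch_size': 64}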
10 changes: 7 additions & 3 deletions applications/DeepSpeed-Chat/dschat/utils/data/data_utils.py
@@ -149,9 +149,13 @@ def __len__(self):
def __getitem__(self, idx):
if self.train_phase == 1:
return {
"input_ids": self.chosen_dataset[idx]["input_ids"],
"attention_mask": self.chosen_dataset[idx]["attention_mask"],
"labels": self.chosen_dataset[idx]["input_ids"]
"input_ids":
self.chosen_dataset[idx]["input_ids"],
"attention_mask":
self.chosen_dataset[idx]["attention_mask"],
"labels":
torch.where(self.chosen_dataset[idx]["attention_mask"].bool(),
self.chosen_dataset[idx]["input_ids"], -100)
}
elif self.train_phase == 2:
return self.chosen_dataset[idx]["input_ids"], self.chosen_dataset[idx]["attention_mask"], \
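The reworked labels entry masks padding positions with -100, the ignore_index that Hugging Face causal-LM losses (and torch.nn.CrossEntropyLoss by default) skip, so pad tokens no longer contribute to the phase-1 SFT loss. A small illustration with made-up token ids:

import torch

input_ids = torch.tensor([15496, 11, 995, 0, 0])  # hypothetical ids; trailing 0s are padding
attention_mask = torch.tensor([1, 1, 1, 0, 0])
labels = torch.where(attention_mask.bool(), input_ids, torch.full_like(input_ids, -100))
print(labels)  # tensor([15496, 11, 995, -100, -100])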
1 change: 1 addition & 0 deletions applications/DeepSpeed-Chat/dschat/utils/ds_utils.py
@@ -33,6 +33,7 @@ def get_train_ds_config(offload,
dtype_config = {"enabled": True}
zero_opt_dict = {
"stage": stage,
"overlap_comm": True,
"offload_param": {
"device": device
},
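The new overlap_comm flag asks ZeRO to overlap gradient reduction communication with backward-pass computation, which usually improves throughput at the cost of somewhat larger communication buffers. A trimmed sketch of where the flag sits in a DeepSpeed config (the other values are illustrative):

ds_config = {
    "train_micro_batch_size_per_gpu": 4,
    "gradient_accumulation_steps": 1,
    "zero_optimization": {
        "stage": 2,
        "overlap_comm": True,  # overlap gradient reduction with backward compute
    },
}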
Additional changed file (path not shown)
@@ -10,7 +10,7 @@
AutoModel,
)
from huggingface_hub import snapshot_download
from transformers.deepspeed import HfDeepSpeedConfig
from transformers.integrations.deepspeed import HfDeepSpeedConfig

from dschat.utils.model.reward_model import RewardModel
from dschat.utils.utils import load_state_dict_into_model, print_rank_0
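transformers relocated its DeepSpeed glue code, so HfDeepSpeedConfig now lives under transformers.integrations.deepspeed; the old transformers.deepspeed path was deprecated and later removed. If both old and new transformers releases had to be supported, a version-tolerant import along these lines could be used (a sketch, not part of the diff):

try:
    # newer transformers releases
    from transformers.integrations.deepspeed import HfDeepSpeedConfig
except ImportError:
    # older releases that still ship the legacy module path
    from transformers.deepspeed import HfDeepSpeedConfig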
Additional changed file (path not shown)
@@ -191,7 +191,13 @@ def parse_args():
parser.add_argument(
"--add_eot_token",
action='store_true',
help="Add <|endoftext|> as additional special token to tokenizer")
help="Add `eot_token` as additional special token to tokenizer")
parser.add_argument(
"--eot_token",
type=str,
default="<|endoftext|>",
help="Specify the format of the `eot_token`",
)
## Print loss
parser.add_argument('--print_loss',
action='store_true',
@@ -234,8 +240,7 @@ def main():
torch.distributed.barrier()

# load_hf_tokenizer will get the correct tokenizer and set padding tokens based on the model family
args.end_of_conversation_token = "<|endoftext|>"
additional_special_tokens = args.end_of_conversation_token if args.add_eot_token else None
additional_special_tokens = args.eot_token if args.add_eot_token else None
tokenizer = load_hf_tokenizer(args.model_name_or_path,
fast_tokenizer=True,
add_special_tokens=additional_special_tokens)
@@ -270,6 +275,7 @@ def main():
args.seed,
tokenizer,
args.max_seq_len,
end_of_conversation_token=tokenizer.eos_token,
sft_only_data_path=args.sft_only_data_path)
# DataLoaders creation:
if args.local_rank == -1:
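With the hard-coded "<|endoftext|>" removed, the end-of-conversation marker is now configurable: --add_eot_token switches the behaviour on and --eot_token chooses the string registered as an additional special token (defaulting to "<|endoftext|>"). Roughly what that registration amounts to on the tokenizer side (a sketch; the real wiring goes through load_hf_tokenizer, and the model name is illustrative):

from transformers import AutoTokenizer

eot_token = "<|endoftext|>"  # value of --eot_token
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-1.3b", use_fast=True)
tokenizer.add_special_tokens({"additional_special_tokens": [eot_token]})
# if new tokens were actually added, the model's embeddings must be resized to len(tokenizer)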
Additional changed file (path not shown)
@@ -246,6 +246,9 @@ def parse_args():
'--offload_reference_model',
action='store_true',
help='Enable ZeRO Offload techniques for reference model')
parser.add_argument('--offload_reward_model',
action='store_true',
help='Enable ZeRO Offload techniques for reward model')
parser.add_argument(
'--actor_zero_stage',
type=int,
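The new --offload_reward_model switch mirrors --offload_reference_model but targets the reward model's ZeRO config; it is the value _init_reward now passes to get_eval_ds_config in the first hunk above. Presumably it selects the offload device the same way the other offload flags do; a hedged, self-contained sketch:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--offload_reward_model',
                    action='store_true',
                    help='Enable ZeRO Offload techniques for reward model')
args = parser.parse_args(['--offload_reward_model'])

# assumption: like the other offload flags, enabling it targets CPU offload
offload_device = "cpu" if args.offload_reward_model else "none"
print(offload_device)  # cpu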
Additional changed file (path not shown)
@@ -15,7 +15,7 @@
os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)))
import data.DST as DST # default special tokens
from torch.utils.data import DataLoader
from transformers.deepspeed import HfDeepSpeedConfig
from transformers.integrations.deepspeed import HfDeepSpeedConfig
import numpy as np
from .vis_proj import VisProjection_vit, VisProjection_perceiver

Additional changed file (path not shown)
@@ -4,16 +4,19 @@
from .dummy_client import DummyClientConfig, DummyClient
from .fastgen_client import FastGenClientConfig, FastGenClient
from .vllm_client import vLLMClientConfig, vLLMClient
from .openai_client import openaiClientConfig, openaiClient

client_config_classes = {
"dummy": DummyClientConfig,
"azure_ml": AzureMLClientConfig,
"fastgen": FastGenClientConfig,
"vllm": vLLMClientConfig,
"openai": openaiClientConfig
}
client_classes = {
"dummy": DummyClient,
"azure_ml": AzureMLClient,
"fastgen": FastGenClient,
"vllm": vLLMClient,
"openai": openaiClient,
}
Additional changed file (path not shown)
@@ -0,0 +1,57 @@
import os
import json
import requests
import subprocess
import time
from typing import Any, Dict

from loguru import logger
from pydantic import Field

from .base import BaseClient
from ..config import BaseConfigModel
from ..prompt import Prompt


# client to test any openai API
class openaiClientConfig(BaseConfigModel):
model: str = Field(..., description="HuggingFace.co model name")
url: str = "http://127.0.0.1:26500/v1/completions"


class openaiClient(BaseClient):
def __init__(self, config: openaiClientConfig):
super().__init__(config)

def start_service(self) -> None:
pass

def stop_service(self) -> None:
pass

def prepare_request(self, prompt: Prompt) -> Dict[str, Any]:
api_url = self.config.url
headers = {
"User-Agent": "Benchmark Client",
"Content-Type": "application/json",
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
}
pload = {
"prompt": prompt.text,
"model": self.config.model,
"n": 1,
"use_beam_search": False,
"temperature": 1.0,
"top_p": 0.9,
"max_tokens": prompt.max_new_tokens,
"ignore_eos": False,
}
return {"url": api_url, "headers": headers, "json": pload, "timeout": 180}

def send_request(self, request_kwargs: Dict[str, Any]) -> Any:
response = requests.post(**request_kwargs)
output = json.loads(response.content)
return output

def process_response(self, raw_response: Any) -> str:
return raw_response["choices"][0]["text"]
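A rough usage sketch of the client defined above, assuming the benchmark's Prompt type exposes text and max_new_tokens fields and that an OpenAI-compatible completions endpoint is listening on the configured URL (all values illustrative):

config = openaiClientConfig(model="meta-llama/Llama-2-7b-hf",
                            url="http://127.0.0.1:26500/v1/completions")
client = openaiClient(config)

prompt = Prompt(text="DeepSpeed is", max_new_tokens=32)  # assumed constructor
request_kwargs = client.prepare_request(prompt)
raw = client.send_request(request_kwargs)                # POSTs to the completions endpoint
print(client.process_response(raw))                      # generated text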
84 changes: 83 additions & 1 deletion benchmarks/inference/mii/src/client.py
@@ -131,6 +131,80 @@ def get_response(response: requests.Response) -> List[str]:
)


# client talks with openai api
def call_openai(
input_tokens: str, max_new_tokens: int, args: argparse.Namespace
) -> ResponseDetails:

api_url = args.openai_api_url
headers = {
"User-Agent": "Benchmark Client",
"Content-Type": "application/json",
"Authorization": f"Bearer {args.openai_api_key}"
}

pload = {
"prompt": input_tokens,
"model": args.model,
"n": 1,
"use_beam_search": False,
"temperature": 1.0,
"top_p": 0.9,
"max_tokens": max_new_tokens,
"ignore_eos": False,
"stream": args.stream,
}

def clear_line(n: int = 1) -> None:
LINE_UP = "\033[1A"
LINE_CLEAR = "\x1b[2K"
for _ in range(n):
print(LINE_UP, end=LINE_CLEAR, flush=True)

def get_streaming_response(
response: requests.Response, time_last_token
) -> Iterable[List[str]]:
for chunk in response.iter_lines(
chunk_size=8192, decode_unicode=False, delimiter=b"data:"
):
if chunk:
plain=chunk.decode("utf-8")
if plain.strip() == "[DONE]":
continue
data = json.loads(plain)
output = data["choices"][0]["text"]
time_now = time.time()
yield output, time_now - time_last_token
time_last_token = time_now

# For non-streaming, but currently non-streaming is not fully implemented
def get_response(response: requests.Response) -> List[str]:
data = json.loads(response.content)
output = data["choices"][0]["text"]
return output

token_gen_time = []
start_time = time.time()
#response = requests.post(api_url, headers=headers, json=pload, stream=False)
response = requests.post(api_url, headers=headers, json=pload, stream=args.stream)
if args.stream:
output = ""
for h, t in get_streaming_response(response, start_time):
output += h
token_gen_time.append(t)
else:
output = get_response(response)

return ResponseDetails(
generated_tokens=output,
prompt=input_tokens,
start_time=start_time,
end_time=time.time(),
model_time=0,
token_gen_time=token_gen_time,
)


def call_aml(
input_tokens: str,
max_new_tokens: int,
@@ -205,7 +279,7 @@ def _run_parallel(
event_loop = asyncio.new_event_loop()
asyncio.set_event_loop(event_loop)

backend_call_fns = {"fastgen": call_fastgen, "vllm": call_vllm, "aml": call_aml}
backend_call_fns = {"fastgen": call_fastgen, "vllm": call_vllm, "aml": call_aml, "openai": call_openai}
call_fn = backend_call_fns[args.backend]

barrier.wait()
@@ -273,6 +347,14 @@ def run_client(args):
p.start()

tokenizer = AutoTokenizer.from_pretrained(args.model)

# make sure max_prompt_length is longer than the target prompt length
args.max_prompt_length = max(args.max_prompt_length, int(args.mean_prompt_length * 3))
# check if the all_text is longer than the max prompt length, if not expand it
global all_text
while len(tokenizer.tokenize(all_text)) < args.max_prompt_length:
all_text += all_text

query_generator = RandomQueryGenerator(all_text, tokenizer, seed=42)
request_text = query_generator.get_random_request_text(
args.mean_prompt_length,
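The streaming branch in call_openai above assumes OpenAI-style server-sent events: the response body is split on the b"data:" delimiter, each chunk carries a JSON payload with choices[0].text, and a literal [DONE] sentinel ends the stream. A minimal sketch of parsing one such chunk (payload contents are made up):

import json

chunk = b' {"choices": [{"text": " world"}]}\n\n'  # one piece after splitting on b"data:"
plain = chunk.decode("utf-8")
if plain.strip() != "[DONE]":
    print(json.loads(plain)["choices"][0]["text"])  # " world"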
9 changes: 6 additions & 3 deletions benchmarks/inference/mii/src/plot_effective_throughput.py
@@ -15,9 +15,10 @@

def get_args():
parser = argparse.ArgumentParser()
parser.add_argument("--backend", type=str, choices=["fastgen", "vllm"], default=["fastgen", "vllm"], \
parser.add_argument("--backend", type=str, choices=["fastgen", "vllm", "openai"], default=["fastgen", "vllm"], \
nargs="+", help="Specify the backends to generate plots for")
parser.add_argument("--log_dir", type=Path, default="./results")
parser.add_argument("--model", type=str)
parser.add_argument("--out_dir", type=Path, default="./plots/goodtput")
parser.add_argument("--sla_prompt_tokens_per_sec", type=int, default=512, help="SLA prompt tokens per second")
parser.add_argument("--sla_gen_tokens_per_sec", type=int, default=[1, 2, 3, 4, 6, 8], nargs="+", help="SLA generation tokens/s targets")
@@ -76,7 +77,7 @@ def validate_token_ema_latency_SLA(response_detail, sla_token_gen, ema_span):


def validate_prompt_latency_SLA(response_detail, sla_token_gen, f, sla_prompt_tokens_per_sec ):
tokenizer = get_tokenizer()
tokenizer = get_tokenizer(args.model)
prompt_length = len(tokenizer.tokenize(response_detail.prompt))
prompt_latency_SLA = prompt_length / sla_prompt_tokens_per_sec
if prompt_latency_SLA < response_detail.token_gen_time[0]:
@@ -137,7 +138,9 @@ def output_charts(args, model, tp_size, bs, replicas, sla_token_gen, prompt, gen
]

plt_cfg = {'vllm': {'label': 'vLLM', 'marker': 'x', 'color': 'orange'},\
'fastgen': {'label': 'DeepSpeed-FastGen', 'marker': 'o', 'color': 'blue'}}
'fastgen': {'label': 'DeepSpeed-FastGen', 'marker': 'o', 'color': 'blue'}, \
'openai': {'label': 'openai-API', 'marker': '+', 'color': 'red'}
}

for f in validate_funcs:
plt.figure()
11 changes: 7 additions & 4 deletions benchmarks/inference/mii/src/postprocess_results.py
@@ -49,10 +49,13 @@ def parse_args():
return args


def get_tokenizer():
def get_tokenizer(model=None):
global tokenizer
if tokenizer is None:
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
if model==None:
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
else:
tokenizer = AutoTokenizer.from_pretrained(model)
return tokenizer


@@ -78,8 +81,8 @@ def get_summary(args, response_details):

tokens_per_sec = mean(
[
(len(get_tokenizer().tokenize(r.prompt)) +
len(get_tokenizer().tokenize(r.generated_tokens)) if type(r.generated_tokens) == str
(len(get_tokenizer(args["model"]).tokenize(r.prompt)) +
len(get_tokenizer(args["model"]).tokenize(r.generated_tokens)) if type(r.generated_tokens) == str
else len(r.generated_tokens))
/ (r.end_time - r.start_time)
for r in response_details
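get_summary's throughput term above averages (prompt tokens + generated tokens) / wall-clock time per request, now tokenizing with the benchmarked model's own tokenizer instead of a hard-coded Llama-2 one. A worked example of a single request's contribution (numbers illustrative):

prompt_tokens, generated_tokens = 512, 128
start_time, end_time = 100.0, 105.0  # seconds
tokens_per_sec = (prompt_tokens + generated_tokens) / (end_time - start_time)
print(tokens_per_sec)  # 128.0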
8 changes: 8 additions & 0 deletions benchmarks/inference/mii/src/server.py
@@ -19,6 +19,7 @@ def start_server(args: argparse.Namespace) -> None:
"fastgen": start_fastgen_server,
"vllm": start_vllm_server,
"aml": start_aml_server,
"openai": start_openai_server,
}
start_fn = start_server_fns[args.backend]
start_fn(args)
@@ -90,12 +91,16 @@ def start_aml_server(args: argparse.Namespace) -> None:
"AML server start not implemented. Please use Azure Portal to start the server."
)

def start_openai_server(args: argparse.Namespace) -> None:
# openai api has no command to stop server
pass

def stop_server(args: argparse.Namespace) -> None:
stop_server_fns = {
"fastgen": stop_fastgen_server,
"vllm": stop_vllm_server,
"aml": stop_aml_server,
"openai": stop_openai_server,
}
stop_fn = stop_server_fns[args.backend]
stop_fn(args)
@@ -118,6 +123,9 @@ def stop_aml_server(args: argparse.Namespace) -> None:
"AML server stop not implemented. Please use Azure Portal to stop the server."
)

def stop_openai_server(args: argparse.Namespace) -> None:
# openai api has no command to stop server
pass

if __name__ == "__main__":
args = parse_args(server_args=True)
Diffs for the remaining changed files are not shown.
