Adds script as an example of a run of DS-FastGen #810

Merged · 13 commits · Nov 17, 2023
16 changes: 15 additions & 1 deletion benchmarks/inference/mii/README.md
@@ -11,7 +11,7 @@ python server.py [options] start

Use the -h option to view all available options. To stop the server, use this command:

```bash
python server.py stop
```
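
For reference, the `run_example.sh` script added in this PR starts a Llama 2 7B deployment with the following command before running the benchmark client:

```bash
# Start command copied from run_example.sh in this PR
python server.py --model_name meta-llama/Llama-2-7b-hf -d llama2-7b-tp1-b768 -m 1 -b 768 start
```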

@@ -30,3 +30,17 @@ The scripts mentioned below were used for generating the plots featured in our blog; an example invocation follows the list:
- `plot_th_lat.py`: This script generates charts for throughput and latency across different model sizes and client counts.
- `plot_effective_throughput.py`: Use this to chart effective throughput.
- `plot_latency_percentile.py`: This script will plot the 50th, 90th, and 95th percentile latencies.
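
As a sketch, the first two scripts can be invoked as follows, assuming the benchmark result JSON files are in the current directory (the default for `--log_dir`); the output directories shown are the defaults added in this PR:

```bash
# Throughput-latency charts (written to charts/throughput_latency by default)
python plot_th_lat.py --log_dir . --out_dir charts/throughput_latency

# Effective-throughput (goodput) charts (written to charts/goodtput by default)
python plot_effective_throughput.py --log_dir . --out_dir charts/goodtput

# Both scripts also accept --test (plot only the reduced run_example.sh configuration)
# and --no_vllm (skip the vLLM baseline).
```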

## Running an End-to-End Example

To quickly try the benchmark end to end and produce results, you can use `run_example.sh`. This script runs the benchmark with a specific configuration and generates the plots shown below in the `charts` directory. These plots correspond to Figure 8 of our blog [post](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen#f-other-hardware-platforms).

```bash
bash run_example.sh
```

<div align="center">
<img src="A6000_benchmarks_example.PNG" alt="" width="800"/><br>

*Figure 1: Throughput-latency curve and effective throughput of Llama 2 7b on an A6000 GPU. The client was run with 60 generation steps and an input prompt length of 2600.*<br>
</div>
40 changes: 30 additions & 10 deletions benchmarks/inference/mii/plot_effective_throughput.py
@@ -12,21 +12,30 @@
SLA_GEN_TOKENS_PER_SEC = [1, 2, 3, 4, 6, 8]
EMA_SPAN = 16

tp_sizes = {
tp_sizes_all = {
"7b": [1],
"70b": [4, 8],
"70b": [4, 8]
}

prompt_gen_pairs = [
tp_sizes_test = {
"7b": [1]
}

prompt_gen_pairs_all = [
(1200, 60),
(1200, 128),
(2600, 60),
(2600, 128),
]

prompt_gen_pairs_test = [
(2600, 60)
]

def get_args():
parser = argparse.ArgumentParser()
parser.add_argument("--test", action="store_true")
parser.add_argument("--no_vllm", action="store_true")
parser.add_argument("--log_dir", type=Path, default=".")
parser.add_argument("--out_dir", type=Path, default="charts/goodtput")
args = parser.parse_args()
@@ -96,7 +105,8 @@ def display_results(model_size, tp, bs, sla_token_gen, prompt, gen, log_dir, out
print(f"model: {model_size} Prompt: {prompt}, Generation: {gen}, TP: {tp} sla_token_gen: {sla_token_gen}")

mii_file_pattern = f"{log_dir}/logs.llama2-{model_size}-tp{tp}-b{bs}/llama2-{model_size}-tp{tp}-b{bs}_c*_p{prompt}_g{gen}.json"
vllm_file_pattern = f"{log_dir}/logs.vllm-llama2-{model_size}-tp{tp}/vllm-llama2-{model_size}-tp{tp}_c*_p{prompt}_g{gen}.json"
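# Build the vLLM log pattern only when the vLLM baseline is requested (--no_vllm skips it)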
if not args.no_vllm:
vllm_file_pattern = f"{log_dir}/logs.vllm-llama2-{model_size}-tp{tp}/vllm-llama2-{model_size}-tp{tp}_c*_p{prompt}_g{gen}.json"

validate_funcs = [
(validate_token_cum_latency_SLA, (), "cum"),
@@ -109,25 +119,28 @@ def display_results(model_size, tp, bs, sla_token_gen, prompt, gen, log_dir, out
client_num_list = sorted(list(mii_goodputs.keys()))
mii_goodputs_list = [mii_goodputs[client_num] for client_num in client_num_list]

vllm_goodputs, vllm_good_ratios = extract_values(vllm_file_pattern, sla_token_gen, f)
vllm_goodputs_list = [vllm_goodputs[client_num] for client_num in client_num_list]
if not args.no_vllm:
vllm_goodputs, vllm_good_ratios = extract_values(vllm_file_pattern, sla_token_gen, f)
vllm_goodputs_list = [vllm_goodputs[client_num] for client_num in client_num_list]

# print(f"MII {mii_goodputs_list} ratio={mii_good_ratios}")
# print(f"vLLM {vllm_goodputs_list} ratio={vllm_good_ratios}")

# Plotting the scatter plot
plt.figure(figsize=(7, 4))
plt.scatter(client_num_list, mii_goodputs_list, label=f"DeepSpeed-FastGen", marker="o", color="blue")
plt.scatter(client_num_list, vllm_goodputs_list, label=f"vLLM", marker="x", color="orange")
if not args.no_vllm:
plt.scatter(client_num_list, vllm_goodputs_list, label=f"vLLM", marker="x", color="orange")

fit_x_list = np.arange(min(client_num_list), max(client_num_list), 0.1)
mii_fit_model = np.polyfit(client_num_list, mii_goodputs_list, 4)
mii_model_fn = np.poly1d(mii_fit_model)
plt.plot(fit_x_list, mii_model_fn(fit_x_list), color="blue", alpha=0.5, linestyle="--")

vllm_fit_model = np.polyfit(client_num_list, vllm_goodputs_list, 4)
vllm_model_fn = np.poly1d(vllm_fit_model)
plt.plot(fit_x_list, vllm_model_fn(fit_x_list), color="orange", alpha=0.5, linestyle="--")
if not args.no_vllm:
vllm_fit_model = np.polyfit(client_num_list, vllm_goodputs_list, 4)
vllm_model_fn = np.poly1d(vllm_fit_model)
plt.plot(fit_x_list, vllm_model_fn(fit_x_list), color="orange", alpha=0.5, linestyle="--")

title = f"Effective throughput (SLA prompt: {SLA_PROMPT_TOKENS_PER_SEC} tokens/s, generation: {sla_token_gen} tokens/s)\n" \
+ f'Llama 2 {model_size.upper()} Prompt: {prompt}, Generation: {gen}, TP: {tp}'
@@ -148,6 +161,13 @@ def display_results(model_size, tp, bs, sla_token_gen, prompt, gen, log_dir, out
if __name__ == "__main__":
args = get_args()

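# --test plots only the reduced configuration exercised by run_example.sh; otherwise sweep all model sizes and prompt/gen pairs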
if args.test:
tp_sizes = tp_sizes_test
prompt_gen_pairs = prompt_gen_pairs_test
else:
tp_sizes = tp_sizes_all
prompt_gen_pairs = prompt_gen_pairs_all

for model_size, tps in tp_sizes.items():
for tp in tps:
for prompt, gen in prompt_gen_pairs:
46 changes: 32 additions & 14 deletions benchmarks/inference/mii/plot_th_lat.py
@@ -3,17 +3,25 @@
import argparse
from pathlib import Path
import numpy as np

from postprocess_results import read_json, get_summary

bs = 768

tp_sizes = {
tp_sizes_test = {
"7b": [1]
}

tp_sizes_all = {
"7b": [1],
"70b": [4, 8],
}

prompt_gen_pairs = [
prompt_gen_pairs_test = [
(2600, 60)
]

prompt_gen_pairs_all = [
(1200, 60),
(1200, 128),
(2600, 60),
@@ -22,7 +30,9 @@

def get_args():
parser = argparse.ArgumentParser()
parser.add_argument("--log_dir", type=Path, default="logs.release")
parser.add_argument("--test", action="store_true")
parser.add_argument("--no_vllm", action="store_true")
parser.add_argument("--log_dir", type=Path, default=".")
parser.add_argument("--out_dir", type=Path, default="charts/throughput_latency")
args = parser.parse_args()
return args
@@ -56,19 +66,22 @@ def output_charts(model_size, tp, bs, prompt, gen, log_dir, out_dir):
out_dir.mkdir(parents=True, exist_ok=True)

mii_file_pattern = f"{log_dir}/logs.llama2-{model_size}-tp{tp}-b{bs}/llama2-{model_size}-tp{tp}-b{bs}_c*_p{prompt}_g{gen}.json"
vllm_file_pattern = f"{log_dir}/logs.vllm-llama2-{model_size}-tp{tp}/vllm-llama2-{model_size}-tp{tp}_c*_p{prompt}_g{gen}.json"
if not args.no_vllm:
vllm_file_pattern = f"{log_dir}/logs.vllm-llama2-{model_size}-tp{tp}/vllm-llama2-{model_size}-tp{tp}_c*_p{prompt}_g{gen}.json"

_, mii_throughputs, mii_latencies = extract_values(mii_file_pattern)
_, vllm_throughputs, vllm_latencies = extract_values(vllm_file_pattern)
if not args.no_vllm:
_, vllm_throughputs, vllm_latencies = extract_values(vllm_file_pattern)

# Plotting the scatter plot
plt.figure(figsize=(6, 4))

plt.scatter(vllm_throughputs, vllm_latencies, label=f"vLLM", marker="x", color="orange")
fit_vllm_x_list = np.arange(min(vllm_throughputs), max(vllm_throughputs), 0.01)
vllm_vllm_model = np.polyfit(vllm_throughputs, vllm_latencies, 3)
vllm_model_fn = np.poly1d(vllm_vllm_model)
plt.plot(fit_vllm_x_list, vllm_model_fn(fit_vllm_x_list), color="orange", alpha=0.5, linestyle="--")

if not args.no_vllm:
plt.scatter(vllm_throughputs, vllm_latencies, label=f"vLLM", marker="x", color="orange")
fit_vllm_x_list = np.arange(min(vllm_throughputs), max(vllm_throughputs), 0.01)
vllm_vllm_model = np.polyfit(vllm_throughputs, vllm_latencies, 3)
vllm_model_fn = np.poly1d(vllm_vllm_model)
plt.plot(fit_vllm_x_list, vllm_model_fn(fit_vllm_x_list), color="orange", alpha=0.5, linestyle="--")

plt.scatter(mii_throughputs, mii_latencies, label=f"DeepSpeed FastGen", marker="o", color="blue")
fit_mii_x_list = np.arange(min(mii_throughputs), max(mii_throughputs), 0.01)
@@ -82,15 +95,20 @@ def output_charts(model_size, tp, bs, prompt, gen, log_dir, out_dir):
plt.legend()
plt.grid(True)
plt.tight_layout()
# plt.show()
out_file = out_dir / f"th_lat_curve_llama{model_size}_tp{tp}_p{prompt}g{gen}.png"
print(f"Saving {out_file}")
plt.savefig(out_file)


if __name__ == "__main__":
args = get_args()

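# As in plot_effective_throughput.py, --test selects the single configuration used by run_example.sh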
if args.test:
tp_sizes = tp_sizes_test
prompt_gen_pairs = prompt_gen_pairs_test
else:
tp_sizes = tp_sizes_all
prompt_gen_pairs = prompt_gen_pairs_all

for model_size, tps in tp_sizes.items():
for tp in tps:
for prompt, gen in prompt_gen_pairs:
23 changes: 2 additions & 21 deletions benchmarks/inference/mii/run_benchmark_client.py
@@ -80,35 +80,16 @@ def callback(response):
token_gen_time.append(time_now - time_last_token)
time_last_token = time_now

postprocess_config = {
"logit_processor": {
# "name": "TopP",
# "args": {
# "top_p": 0.9
# }
"name": "Temperature",
"args": {
"temperature": 0.9
}
},
"sampler": {
"name": "Logits"
},
"stop_criterion": {
"name": "EosGeneration"
}
}

time_last_token = start_time = time.time()
token_gen_time = []
if stream:
output_tokens = []
client.generate(
input_tokens, max_new_tokens=max_new_tokens, postprocess_config=postprocess_config,
input_tokens, max_new_tokens=max_new_tokens,
streaming_fn=callback)
else:
result = client.generate(
input_tokens, max_new_tokens=max_new_tokens, postprocess_config=postprocess_config)
input_tokens, max_new_tokens=max_new_tokens)
output_tokens = result.response[0]

return ResponseDetails(
19 changes: 19 additions & 0 deletions benchmarks/inference/mii/run_example.sh
@@ -0,0 +1,19 @@
### Run the server
RAGGED_BATCH_SIZE=768
PARAM_SIZES=(7b)
DEPLOYMENT_NAME=llama2-7b-tp1-b768
python server.py --model_name meta-llama/Llama-2-7b-hf -d ${DEPLOYMENT_NAME} -m 1 -b ${RAGGED_BATCH_SIZE} start

### This command will run the client with 60 generation steps and input prompt length of 2600
DEPLOYMENT_NAME=${DEPLOYMENT_NAME} PROMPT_LENGTH=2600 MAX_NEW_TOKENS=60 bash ./run_benchmark_client.sh

### Stop the server
echo "Stopping server"
python server.py -d ${DEPLOYMENT_NAME} stop
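# Wait for the server processes to shut down before generating the plots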
sleep 120

### Generate the plots
python plot_th_lat.py --log_dir . --test --no_vllm
python plot_effective_throughput.py --log_dir . --test --no_vllm

echo "Find the plots in the charts directory and the logs inside logs.llama2-7b-tp1-b768"