diff --git a/README.md b/README.md index 09a7a3a40..da961c0ee 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,8 @@ This folder contains end-to-end applications that use DeepSpeed to train and use There are several training and finetuning examples so please see the individual folders for specific instructions. ## 3. Inference -The DeepSpeed Huggingface inference [README](./inference/huggingface/README.md) explains how to get started with running DeepSpeed Huggingface inference examples. +- The DeepSpeed-MII inference [README](./inference/mii/README.md) explains how to get started with running model inference with [DeepSpeed-MII](https://github.com/Microsoft/DeepSpeed-MII) and [DeepSpeed-FastGen](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen). +- The DeepSpeed Huggingface inference [README](./inference/huggingface/README.md) explains how to get started with running DeepSpeed Huggingface inference examples. ## 4. Compression Model compression examples. @@ -16,6 +17,10 @@ Model compression examples. ## 5. Benchmarks All benchmarks that use the DeepSpeed library are maintained in this folder. +# Build Pipeline Status +| Description | Status | +| ----------- | ------ | +| Integrations | [![nv-ds-chat](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-ds-chat.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-ds-chat.yml) | # Contributing diff --git a/applications/DeepSpeed-Chat/.gitignore b/applications/DeepSpeed-Chat/.gitignore new file mode 100644 index 000000000..cbc922f8f --- /dev/null +++ b/applications/DeepSpeed-Chat/.gitignore @@ -0,0 +1,137 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +results/ +outputs/ + +.amltconfig +.test_output +*.hdf5 +*.h5 diff --git a/applications/DeepSpeed-Chat/README.md b/applications/DeepSpeed-Chat/README.md index 22aa1f157..ac9f3ab6a 100644 --- a/applications/DeepSpeed-Chat/README.md +++ b/applications/DeepSpeed-Chat/README.md @@ -33,21 +33,26 @@ A fast, affordable, scalable and open system framework for enabling end-to-end R ## Table of Contents +- [πŸ•DeepSpeed-Chat: Easy, Fast and Affordable RLHF Training of ChatGPT-like Models at All ScalesπŸ•](#deepspeed-chat-easy-fast-and-affordable-rlhf-training-of-chatgpt-like-models-at-all-scales) +- [Table of Contents](#table-of-contents) - [πŸ“° Latest News πŸ“°](#-latest-news-) -- [πŸš€ What is DeepSpeed Chat πŸš€οΈ](#-what-is-deepspeed-chat-) +- [πŸš€ What is DeepSpeed Chat πŸš€](#-what-is-deepspeed-chat-) - [🧨 Capabilities 🧨](#-capabilities-) - [β˜• Quick Start β˜•](#-quick-start-) - [🐼 Installation](#-installation) - - [🐼 Single Script for Training 3-Step RLHF Pipeline](#-one-single-script-completes-all-three-stages-of-rlhf-training-and-generate-your-first-chatgpt-model) + - [🐼 One Single Script Completes All Three Steps of RLHF Training and Generate Your First ChatGPT Model](#-one-single-script-completes-all-three-steps-of-rlhf-training-and-generate-your-first-chatgpt-model) - [🐼 Demonstration: Individual Step Fine-Tuning](#-demonstration-individual-step-fine-tuning) - [πŸ• Step 1 - Supervised Fine-Tuning](#-step-1---supervised-fine-tuning) - [πŸ•‘ Step 2 - Reward Model](#-step-2---reward-model) - [πŸ•’ Step 3 - Reinforcement Learning with Human Feedback](#-step-3---reinforcement-learning-with-human-feedback) - - [🐼 Adding and using your own datasets in DeepSpeed-Chat](#-adding-and-using-your-own-datasets-in-deepspeed-chat) - - [🐼 Customizing RLHF training pipeline via DeepSpeed-Chat’s APIs](#-customizing-your-own-rlhf-training-pipeline-using-deepspeed-chats-rlhf-apis) - - [🐼 Serving Your Model: Plug-in and Test!](#-serving-plug-in-your-final-model-trained-by-deepspeed-chat-and-test-it-out) + - [🐼 Adding and using your own datasets in DeepSpeed-Chat](#-adding-and-using-your-own-datasets-in-deepspeed-chat) + - [🐼 Customizing your own RLHF training pipeline using DeepSpeed-Chat’s RLHF APIs](#-customizing-your-own-rlhf-training-pipeline-using-deepspeed-chats-rlhf-apis) + - [🐼 Serving: Plug-in your final model trained by DeepSpeed-Chat and test it out!](#-serving-plug-in-your-final-model-trained-by-deepspeed-chat-and-test-it-out) - [πŸ”₯ Training Performance Evaluation πŸ”₯](#-training-performance-evaluation-) + - [🐲 Superior Model Scale and Low Training Cost](#-superior-model-scale-and-low-training-cost) + - [🐲 Throughput and Model Size Scalability Comparisons with Existing RLHF Systems](#-throughput-and-model-size-scalability-comparisons-with-existing-rlhf-systems) - [😽 Supported Models 😽](#-supported-models-) +- [πŸ”¬ Build Pipeline Status πŸ”¬](#-build-pipeline-status-) - [βš“ Documentation and Tutorial βš“](#-documentation-and-tutorial-) - [🌱 DeepSpeed Chat's Roadmap 🌱](#-deepspeed-chats-roadmap-) - [πŸ’¬ DeepSpeed Chat and DeepSpeed Community 
πŸ’¬](#-deepspeed-chat-and-deepspeed-community-) @@ -57,7 +62,20 @@ A fast, affordable, scalable and open system framework for enabling end-to-end R ## πŸ“° Latest News πŸ“° -* ***[2023/04] πŸš€ [DeepSpeed Chat: Easy, Fast and Affordable RLHF Training of ChatGPT-like Models at All Scales](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat)*** [[English](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat/README.md)] [[δΈ­ζ–‡](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat/chinese/README.md)] [[ζ—₯本θͺž](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat/japanese/README.md)]πŸš€ +* ***[2023/08] πŸš€ [DeepSpeed-Chat: Llama/Llama-2 system support, efficiency boost, and training stability improvements](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat/ds-chat-release-8-31/README.md)*** πŸš€ + +* ***[2023/04] [DeepSpeed Chat: Easy, Fast and Affordable RLHF Training of ChatGPT-like Models at All Scales](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat)*** [[English](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat/README.md)] [[δΈ­ζ–‡](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat/chinese/README.md)] [[ζ—₯本θͺž](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat/japanese/README.md)] + +To cite DeepSpeed Chat, please cite our [arxiv report](https://arxiv.org/abs/2308.01320): + +``` +@article{yao2023dschat, + title={{DeepSpeed-Chat: Easy, Fast and Affordable RLHF Training of ChatGPT-like Models at All Scales}}, + author={Zhewei Yao and Reza Yazdani Aminabadi and Olatunji Ruwase and Samyam Rajbhandari and Xiaoxia Wu and Ammar Ahmad Awan and Jeff Rasley and Minjia Zhang and Conglong Li and Connor Holmes and Zhongzhu Zhou and Michael Wyatt and Molly Smith and Lev Kurilenko and Heyang Qin and Masahiro Tanaka and Shuai Che and Shuaiwen Leon Song and Yuxiong He}, + journal={arXiv preprint arXiv:2308.01320}, + year={2023} +} +``` ## πŸš€ What is DeepSpeed Chat πŸš€ @@ -67,11 +85,11 @@ https://user-images.githubusercontent.com/124002815/230290966-a78ea171-ab65-4fcc -In the spirit of democratizing ChatGPT-style models and their capabilities, DeepSpeed is proud to introduce a general system framework for enabling an end-to-end training experience for ChatGPT-like models, named ***DeepSpeed Chat***. It can automatically take your favorite pre-trained large language models though an OpenAI InstructGPT style three stages to produce your very own high-quality ChatGPT-style model. DeepSpeed Chat makes training for high-quality ChatGPT-style models easy, fast, affordable and scalable. +In the spirit of democratizing ChatGPT-style models and their capabilities, DeepSpeed is proud to introduce a general system framework for enabling an end-to-end training experience for ChatGPT-like models, named ***DeepSpeed Chat***. It can automatically take your favorite pre-trained large language models through an OpenAI InstructGPT style three stages to produce your very own high-quality ChatGPT-style model. DeepSpeed Chat makes training for high-quality ChatGPT-style models easy, fast, affordable and scalable. -With just one click, you can train, generate and serve a 1.3 billion parameter ChatGPT model within 1.36 hours on a single consumer-grade NVIDIA A6000 GPU with 48GB memory. On a single DGX node with 8 NVIDIA A100-40G GPUs, DeepSpeed-Chat enables training for a 13 billion parameter ChatGPT model in 13.6 hours. 
On multi-GPU multi-node systems (cloud scenarios),i.e., 8 DGX nodes with 8 NVIDIA A100 GPUs/node, DeepSpeed-Chat can train a 66 billion parameter ChatGPT model under 9 hours. Finally, it enables 15X faster training over the existing RLHF systems, and can handle training of ChatGPT-like models with over 200 billion parameters: another impossible feat with the existing systems. For the full range of discussion on various model sizes and low training cost enabled by DeepSpeed-Chat, please refer to the [Release Blog](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat) and [Training Performance Evaluation](#-training-performance-evaluation-). +With just one click, you can train, generate and serve a 1.3 billion parameter ChatGPT model within 1.36 hours on a single consumer-grade NVIDIA A6000 GPU with 48GB memory. On a single DGX node with 8 NVIDIA A100-40G GPUs, DeepSpeed-Chat enables training for a 13 billion parameter ChatGPT model in 13.6 hours. On multi-GPU multi-node systems (cloud scenarios),i.e., 8 DGX nodes with 8 NVIDIA A100 GPUs/node, DeepSpeed-Chat can train a 66 billion parameter ChatGPT model in under 9 hours. Finally, it enables 15X faster training over the existing RLHF systems, and can handle training of ChatGPT-like models with over 200 billion parameters: another impossible feat with the existing systems. For the full range of discussion on various model sizes and low training costs enabled by DeepSpeed-Chat, please refer to the [Release Blog](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat) and [Training Performance Evaluation](#-training-performance-evaluation-). -Beyond this release, DeepSpeed system has been proudly serving as the system backend for accelerating a range of on-going efforts for fast training/fine-tuning Chat-Style models (e.g., LLaMA). The following are some of the open-source examples that are powered by DeepSpeed: +Beyond this release, DeepSpeed system has been proudly serving as the system backend for accelerating a range of ongoing efforts for fast training/fine-tuning Chat-Style models (e.g., LLaMA). The following are some of the open-source examples that are powered by DeepSpeed: - [Databricks Dolly](https://github.com/databrickslabs/dolly) - [LMFlow](https://github.com/OptimalScale/LMFlow) @@ -87,7 +105,7 @@ A summary of DeepSpeed Chat includes: + **DeepSpeed Chat**: a complete end-to-end three-stage OpenAI InstructGPT training strategy with Reinforcement Learning Human Feedback (RLHF), to generate high-quality ChatGPT-style models from users’ favorite pre-trained large language model checkpoints; + **DeepSpeed Hybrid Engine**: A new system support for fast, affordable and scalable RLHF training at All Scales. It is built upon your favorite DeepSpeed's system capability such as ZeRO technologies and DeepSpeed-Inference; -+ **Easy-breezy Training Experience**: A single script capable of taking a pre-trained Huggingface model and running it though all three steps of the RLHF training. ++ **Easy-breezy Training Experience**: A single script capable of taking a pre-trained Huggingface model and running it through all three steps of the RLHF training. + **A Universal System Support for Today’s ChatGPT-like Model Training**: DeepSpeed Chat can serve as the system backend for not only the 3-step instruct-base RLHF pipeline, but also the current single model finetuning exploration (e.g., LLaMA-centric finetuning) and generic RLHF training for various models and scenarios. 
Please check out our [Blog Release](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat) and [Documentation and Tutorial](#-documentation-and-tutorial-) for more details on our training methodology and new system technologies. @@ -105,6 +123,7 @@ pip install deepspeed>=0.9.0 git clone https://github.com/microsoft/DeepSpeedExamples.git cd DeepSpeedExamples/applications/DeepSpeed-Chat/ pip install -r requirements.txt +pip install -e . ``` ### 🐼 One Single Script Completes All Three Steps of RLHF Training and Generate Your First ChatGPT Model @@ -134,7 +153,7 @@ If you only have around **1-2 hour** for coffee or lunch break, you can also try
Expand

-If you only have around **half a day** and only a single server node, we suggest to use an example of pretrained **OPT-13B** as the actor model and OPT-350M as the reward model in the following single script to generate a final 13B ChatGPT-style model: +If you only have around **half a day** and only a single server node, we suggest using an example of pretrained **OPT-13B** as the actor model and OPT-350M as the reward model in the following single script to generate a final 13B ChatGPT-style model: ```bash python train.py --actor-model facebook/opt-13b --reward-model facebook/opt-350m --deployment-type single_node @@ -181,7 +200,7 @@ The train.py script has an easy-to-use command-line interface and can be launche cd training/step1_supervised_finetuning/ # Run the training script -bash training_scripts/single_gpu/run_1.3b.sh +bash training_scripts/opt/single_gpu/run_1.3b.sh # Evaluate the model bash evaluation_scripts/run_prompt.sh @@ -198,7 +217,7 @@ bash evaluation_scripts/run_prompt.sh cd training/step2_reward_model_finetuning # Run the training script -bash training_scripts/single_gpu/run_350m.sh +bash training_scripts/opt/single_gpu/run_350m.sh # Evaluate the model bash evaluation_scripts/run_eval.sh @@ -226,7 +245,7 @@ As the most complex step of the entire 3-step InstructGPT pipeline, DeepSpeed Ch cd training/step3_rlhf_finetuning/ # Run the training script -bash training_scripts/single_gpu/run_1.3b.sh +bash training_scripts/opt/single_gpu/run_1.3b.sh ```
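
Since Llama/Llama-2 support is the headline addition of this release, note that each step's training_scripts folder now groups recipes by model family, with llama2 scripts sitting next to the opt ones used above. A minimal sketch of trying a Llama-2 recipe for step 1 follows — the exact script name is an assumption, so list the folder first to see what actually ships:

```bash
cd training/step1_supervised_finetuning/

# See which Llama-2 recipes are included (the name used below is an assumption).
ls training_scripts/llama2/

# Assumed Llama-2-7B SFT recipe; substitute whichever script the listing shows.
bash training_scripts/llama2/run_llama2_7b.sh
```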

@@ -235,16 +254,16 @@ bash training_scripts/single_gpu/run_1.3b.sh In addition to the datasets used in our example scripts, you can also add and use your own datasets. To do so, first you need to add a new Class in [training/utils/data/raw_datasets.py](https://github.com/microsoft/DeepSpeedExamples/blob/master/applications/DeepSpeed-Chat/training/utils/data/raw_datasets.py) to define the format when using your data. You need to make sure to follow the APIs and format defined in the PromptRawDataset class to ensure a consistent data format that DeepSpeed-Chat relies on. You can look at the existing classes to learn how to do so. Second, you need to add an if condition in function get_raw_dataset in [training/utils/data/data_utils.py](https://github.com/microsoft/DeepSpeedExamples/blob/master/applications/DeepSpeed-Chat/training/utils/data/data_utils.py) corresponding to your new dataset. The dataset_name string in the if condition should be the dataset name you will provide as a arg for the training scripts. Last, you need to add your new dataset's dataset_name into your "--data_path" arg in your training scripts. -If you have downloaded huggingface datasets manually, you can add your local path into "--data_path", such as "--data_path ./relative/Dahoas/rm-static" and "--data_path /absolute/Dahoas/rm-static". Remeber you should not make `data/` in your local path, it may cause an exception to `load_dataset`. +If you have downloaded huggingface datasets manually, you can add your local path into "--data_path", such as "--data_path ./relative/Dahoas/rm-static" and "--data_path /absolute/Dahoas/rm-static". Remember you should not make `data/` in your local path, it may cause an exception to `load_dataset`. -One thing to note that some datasets may only have one response instead of two responses. For those datasets, you can only use them in step 1. And in such case, you should add the dataset_name as part of the "--sft_only_data_path" arg instead of the "--data_path" arg. One thing to note is that: If you plan to only do step 1 SFT, adding more single-response datasets is definitely beneficial. However, if you do plan to do steps 2 and 3, then adding too many single-response datasets during SFT could backfire: these data could be different from the data used for steps 2/3, generating different distributions which could cause training instability/worse model quality during step 2/3. That is part of the reason why we focused on trying the datasets with two responses and the preference, and always split a dataset into all 3 steps. +One thing to note is that some datasets may only have one response instead of two responses. For those datasets, you can only use them in step 1. And in such case, you should add the dataset_name as part of the "--sft_only_data_path" arg instead of the "--data_path" arg. One thing to note is that: If you plan to only do step 1 SFT, adding more single-response datasets is definitely beneficial. However, if you do plan to do steps 2 and 3, then adding too many single-response datasets during SFT could backfire: these data could be different from the data used for steps 2/3, generating different distributions which could cause training instability/worse model quality during step 2/3. That is part of the reason why we focused on trying the datasets with two responses and the preference, and always split a dataset into all 3 steps. 
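
To make the registration steps above concrete, here is a minimal sketch of a new dataset class, assuming a hypothetical Hugging Face dataset id `myorg/my-rlhf-data` with `prompt`/`chosen`/`rejected` columns. The method names mirror the existing PromptRawDataset subclasses in raw_datasets.py; treat the class itself as an illustration rather than code from the repo:

```python
# Sketch of a custom dataset class. If you add it directly to
# dschat/utils/data/raw_datasets.py (training/utils/data/raw_datasets.py before
# this PR's package move), drop the import and simply subclass PromptRawDataset.
from dschat.utils.data.raw_datasets import PromptRawDataset


class MyRLHFDataset(PromptRawDataset):
    """Hypothetical dataset whose prompts already follow the
    ' Human: ... Assistant:' format expected by the pipeline."""

    def __init__(self, output_path, seed, local_rank, dataset_name):
        # The base class loads the dataset (from the Hub or, with this PR,
        # from a local path via load_from_disk).
        super().__init__(output_path, seed, local_rank, dataset_name)
        self.dataset_name = "myorg/my-rlhf-data"        # value passed to --data_path
        self.dataset_name_clean = "myorg_my_rlhf_data"  # used for cache file names

    def get_train_data(self):
        return self.raw_datasets["train"]

    def get_eval_data(self):
        return self.raw_datasets["test"]

    def get_prompt(self, sample):
        return sample["prompt"]

    def get_chosen(self, sample):
        return sample["chosen"]

    def get_rejected(self, sample):
        return sample["rejected"]

    def get_prompt_and_chosen(self, sample):
        return sample["prompt"] + sample["chosen"]

    def get_prompt_and_rejected(self, sample):
        return sample["prompt"] + sample["rejected"]


# Then register the class in get_raw_dataset() inside
# dschat/utils/data/data_utils.py, roughly:
#
#     elif "myorg/my-rlhf-data" in dataset_name:
#         return raw_datasets.MyRLHFDataset(output_path, seed, local_rank,
#                                           dataset_name)
#
# and reference it from the training scripts, e.g. --data_path myorg/my-rlhf-data
```

As noted above, if the dataset carries only a single response per prompt, point it at "--sft_only_data_path" instead of "--data_path" so it is used in step 1 only.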
If you have your own dataset in local files, you can also use it by following these rules: * Pass "local/jsonfile" as the dataset name to the "--data_path" argument. * Put your train data and evaluation data in applications/DeepSpeed-Chat/data/ with name train.json and eval.json. * The json data in file should be a single list with each item like ***{"prompt": "Human: I have a question. Assistant:", "chosen": "Good answer.", "rejected": "Bad answer."}***. -What is more, when you use your own dataset files and modified some data in them, pay attention to the parameter "reload" of ***create_prompt_dataset*** function. You should pass a True value to it or the cache files will not refresh. +What is more, when you use your own dataset files and modify some data in them, pay attention to the parameter "reload" of ***create_prompt_dataset*** function. You should pass a True value to it or the cache files will not refresh. ### 🐼 Customizing your own RLHF training pipeline using DeepSpeed-Chat’s RLHF APIs @@ -360,21 +379,48 @@ For other detailed results and in-depth analysis, including effective throughput ## 😽 Supported Models 😽 +Currently, we support the following model families. We will continue to grow over time to include emerging models for ChatGPT-style training! See [Roadmap](#-deepspeed-chats-roadmap-) for more details. + +model family | size range | details +------ | ------ | ------- +[opt](https://huggingface.co/models?other=opt) | 0.1B - 66B | Extensive performance and accuracy tests have been performed. +[llama2](https://huggingface.co/models?sort=trending&search=meta-llama%2FLlama-2) | 7B, 13B | We provide full system support and scripts to try 7B and 13B models.* +[llama2-70b](https://huggingface.co/models?sort=trending&search=meta-llama%2FLlama-2-70b) | 70B | Llama-2-70B is supported through MixZ++, ZeRO-Offload but not Hybrid Engine. +[bloom](https://huggingface.co/models?other=bloom) | 0.3B - 176B | Please create your own scripts. We welcome contributions :) +[gpt\_neox](https://huggingface.co/models?other=gpt_neox) | 1.3B - 20B | " +[gptj](https://huggingface.co/models?other=gptj) | 1.4B - 6B | " +[gpt\_neo](https://huggingface.co/models?other=gpt_neo) | 0.1B - 2.7B | " +[gpt2](https://huggingface.co/models?other=gpt2) | 0.3B - 1.5B | " +[codegen](https://huggingface.co/Salesforce/codegen-16B-multi) | 0.35b - 16B | " + +* To create a new model training recipe/script, please see our training_scripts folder that contains opt and llama2 scripts. Please note that Llama-2 models have not been fully trained but the scripts have been tested for all the system optimizations including DeepSpeed ZeRO Stage 3, Hybrid Engine, MixZ++, and LoRA. We invite users to try out the Llama-2 scripts and share the results with the community via GitHub and HuggingFace model hub. + +## πŸ”¬ Build Pipeline Status πŸ”¬ -Currently, we support the following model families. We will continue to grow over time to including emerging models for ChatGPT-style training! See [Roadmap](#-deepspeed-chats-roadmap-) for more details. 
+| Description | Status | +| ----------- | ------ | +| Integrations | [![nv-ds-chat](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-ds-chat.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-ds-chat.yml) | + +A DeepSpeed CI workflow runs the DeepSpeed-Chat Step 3 pipeline nightly across the following test configurations: + +Models +``` +Actor: facebook/opt-125m +Critic: facebook/opt-125m (trained in DS-Chat Step 2) +``` + +Parameters comprising test matrix +``` +Zero Stage: 2, 3 +Hybrid Engine: True, False +Offload: True, False +LoRA: True, False +``` -model family | size range ------- | ------ -[opt](https://huggingface.co/models?other=opt) | 0.1B - 66B -[bloom](https://huggingface.co/models?other=bloom) | 0.3B - 176B -[gpt\_neox](https://huggingface.co/models?other=gpt_neox) | 1.3B - 20B -[gptj](https://huggingface.co/models?other=gptj) | 1.4B - 6B -[gpt\_neo](https://huggingface.co/models?other=gpt_neo) | 0.1B - 2.7B -[gpt2](https://huggingface.co/models?other=gpt2) | 0.3B - 1.5B -[codegen](https://huggingface.co/Salesforce/codegen-16B-multi) | 0.35b - 16B +Each configuration (16 total) runs through a limited number of Step 3 non-overflow training steps (i.e. steps where neither actor nor critic overflow) and saves the actor/critic models. +Assertions are used to check if the training pipeline executed correctly and if the actor and critic models were saved properly. -* All performance and accuracy tests have been performed using the OPT model family only. For other models, please see our training_scripts folder on how to change model families. ## βš“ Documentation and Tutorial βš“ @@ -388,8 +434,7 @@ For more APIs, example scripts, and evaluation results, please refer to Our future plan includes but not limited to : -- [ ] System support and finetuning for LLaMA -- [ ] ZeRO-Offload (CPU/NVMe) is currently not supported but coming soon +- [ ] Hybrid Engine Support for Llama-2-70B - [ ] Generalizing DeepSpeed-RLHF abstraction and system support for a wide range of RL algorithms/paradigms - [ ] Auto-tuning of system optimizations @@ -398,8 +443,8 @@ Our future plan includes but not limited to : Just like how the success of [the BLOOM model](https://huggingface.co/bigscience/bloom) was supported by both [DeepSpeed Team](https://github.com/bigscience-workshop/Megatron-DeepSpeed) and many [open source contributors](https://huggingface.co/bigscience), we welcome all AI developers/practitioners/researchers to join this on-going effort for DeepSpeed-Chat. To participate: - Show your support by leaving a star ⭐ to our [DeepSpeed](https://github.com/microsoft/DeepSpeed) and [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples) GitHub repositories. - Follow us on [twitter](https://twitter.com/MSFTDeepSpeed) to get notified about our latest news. For Chinese users, you can also follow our [Chinese Zhihu account](https://www.zhihu.com/people/deepspeed). For Japanese users, you can also follow our [Japanese twitter account](https://twitter.com/MSFTDeepSpeedJP). -- Currently we prefer to interact with open source users mainly on GitHub so that it's easier for all users to search for related information. For bug report, please submit a GitHub issue. For contribution, please submit a pull request (PR). For general question/discussion, please open a new discussion or join any existing discussions. 
-- We are open to collaborations with universities, research labs, companies, such as working together on deep learning research, applying DeepSpeed to empower real-world AI models and applications, and so on. For such requests (and other requests unsuitable for GitHub), please directly email to deepspeed-info@microsoft.com. +- Currently we prefer to interact with open source users mainly on GitHub so that it's easier for all users to search for related information. For bug reports, please submit a GitHub issue. For contribution, please submit a pull request (PR). For general question/discussion, please open a new discussion or join any existing discussions. +- We are open to collaborations with universities, research labs, and companies, such as working together on deep learning research, applying DeepSpeed to empower real-world AI models and applications, and so on. For such requests (and other requests unsuitable for GitHub), please directly email to deepspeed-info@microsoft.com. ## πŸ™ Acknowledgement and Citation πŸ™ diff --git a/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/ppo_trainer.py b/applications/DeepSpeed-Chat/dschat/rlhf/ppo_trainer.py similarity index 79% rename from applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/ppo_trainer.py rename to applications/DeepSpeed-Chat/dschat/rlhf/ppo_trainer.py index bd9aefbc1..22cba6be0 100644 --- a/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/ppo_trainer.py +++ b/applications/DeepSpeed-Chat/dschat/rlhf/ppo_trainer.py @@ -4,20 +4,18 @@ # DeepSpeed Team import torch import torch.nn.functional as F -import sys -import os +import time import deepspeed from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus +from deepspeed.accelerator import get_accelerator -sys.path.append( - os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir))) - -from utils.utils import print_rank_0 +from dschat.utils.utils import print_rank_0 def print_all_ranks(tag, value, rank): world_size = torch.distributed.get_world_size() - all_tensor = torch.zeros(world_size, dtype=torch.float32).cuda() + all_tensor = torch.zeros(world_size, dtype=torch.float32).to( + get_accelerator().current_device_name()) all_tensor[rank] = value torch.distributed.all_reduce(all_tensor, op=torch.distributed.ReduceOp.SUM) print_rank_0(f'{tag} {all_tensor}', rank) @@ -57,6 +55,12 @@ def __init__(self, rlhf_engine, args): self.end_of_conversation_token_id = self.tokenizer( args.end_of_conversation_token)['input_ids'][-1] self.z3_enabled = args.actor_zero_stage == 3 + self.compute_fp32_loss = self.args.compute_fp32_loss + + # In case the generated experience is not valid (too short), we use the last valid + # generated experience. Alternatively, we can skip the step (on all workers). 
+ # For now, use the last valid experience which is a simpler solution + self.last_generated_experience = None # Those value can be changed self.kl_ctl = 0.1 @@ -65,18 +69,28 @@ def __init__(self, rlhf_engine, args): self.cliprange_value = 0.2 self.gamma = 1.0 self.lam = 0.95 + self.generate_time = 0.0 def _generate_sequence(self, prompts, mask, step): max_min_length = self.max_answer_seq_len + prompts.shape[1] + # This has been added due to a probability/nan error that happens after + # meta-llama/Llama-2-7b-hf enabled do_sample: + # https://huggingface.co/meta-llama/Llama-2-7b-hf/commit/6fdf2e60f86ff2481f2241aaee459f85b5b0bbb9 + if self.actor_model.module.config.model_type == "llama": + kwargs = dict(do_sample=False) + else: + kwargs = dict() + with torch.no_grad(): seq = self.actor_model.module.generate( prompts, attention_mask=mask, max_length=max_min_length, pad_token_id=self.tokenizer.pad_token_id, - synced_gpus=self.z3_enabled) + synced_gpus=self.z3_enabled, + **kwargs) # Filter out seq with no answers (or very short). This happens when users directly use the pre-training ckpt without supervised finetuning # NOTE: this will causes each GPU has different number of examples @@ -86,7 +100,8 @@ def _generate_sequence(self, prompts, mask, step): ans = seq[:, prompt_length:] valid_ans_len = (ans != self.tokenizer.pad_token_id).sum(dim=-1) - if self.args.print_answers: + if self.args.print_answers and (step % self.args.print_answers_interval + == 0): print( f"--- prompt --> step={step}, rank={torch.distributed.get_rank()}, {self.tokenizer.batch_decode(prompts, skip_special_tokens=True)}" ) @@ -98,16 +113,38 @@ def _generate_sequence(self, prompts, mask, step): for i in range(batch_size): if valid_ans_len[ i] <= 1: # if the answer is shorter than 1 token, drop it + print( + f'Dropping too short generated answer: {step=}: \n' + f'prompts: {self.tokenizer.batch_decode(prompts, skip_special_tokens=False)}\n' + f'answers: {self.tokenizer.batch_decode(ans, skip_special_tokens=False)}' + ) continue else: out_seq.append(seq[i:i + 1]) - out_seq = torch.cat(out_seq, dim=0) # concate output in the batch dim + + if not out_seq: + print( + f'All generated results are too short for rank={self.args.local_rank} step={step}\n' + f'-> prompts: {self.tokenizer.batch_decode(prompts, skip_special_tokens=False)}\n' + f'-> answers: {self.tokenizer.batch_decode(ans, skip_special_tokens=False)}' + ) + return None + + out_seq = torch.cat(out_seq, dim=0) # concat output in the batch dim return out_seq def generate_experience(self, prompts, mask, step): self.eval() + generate_start = time.time() seq = self._generate_sequence(prompts, mask, step) + generate_end = time.time() + if seq is None: + assert self.last_generated_experience is not None, f'Invalid generated experience at {step=}' + prompts = self.last_generated_experience['prompts'] + seq = self.last_generated_experience['seq'] + else: + self.last_generated_experience = {'prompts': prompts, 'seq': seq} self.train() pad_token_id = self.tokenizer.pad_token_id @@ -124,6 +161,11 @@ def generate_experience(self, prompts, mask, step): logits = output.logits logits_ref = output_ref.logits + if self.compute_fp32_loss: + logits = logits.to(torch.float) + logits_ref = logits_ref.to(torch.float) + + self.generate_time = generate_end - generate_start return { 'prompts': prompts, @@ -226,6 +268,17 @@ def train_rlhf(self, inputs): return actor_loss, critic_loss + def get_overflow(self): + # Overflow is not expected when using bf16 + # Therefore, DeepSpeed's BF16_Optimizer 
does not maintain an overflow indication + if self.args.dtype == "bf16": + return False, False + + actor_overflow = self.actor_model.optimizer.overflow + critic_overflow = self.critic_model.optimizer.overflow + + return actor_overflow, critic_overflow + def actor_loss_fn(self, logprobs, old_logprobs, advantages, mask): ## policy gradient loss log_ratio = (logprobs - old_logprobs) * mask @@ -243,6 +296,9 @@ def critic_loss_fn(self, values, old_values, returns, mask): old_values - self.cliprange_value, old_values + self.cliprange_value, ) + if self.compute_fp32_loss: + values = values.float() + values_clipped = values_clipped.float() vf_loss1 = (values - returns)**2 vf_loss2 = (values_clipped - returns)**2 vf_loss = 0.5 * torch.sum( diff --git a/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/rlhf_engine.py b/applications/DeepSpeed-Chat/dschat/rlhf/rlhf_engine.py similarity index 79% rename from applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/rlhf_engine.py rename to applications/DeepSpeed-Chat/dschat/rlhf/rlhf_engine.py index 1d3cfa51f..5b6778cc2 100755 --- a/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/rlhf_engine.py +++ b/applications/DeepSpeed-Chat/dschat/rlhf/rlhf_engine.py @@ -9,10 +9,10 @@ from deepspeed.ops.adam import DeepSpeedCPUAdam from transformers import AutoModelForCausalLM, get_scheduler -from utils.ds_utils import get_train_ds_config, get_eval_ds_config -from utils.module.lora import convert_linear_layer_to_lora, only_optimize_lora_parameters, make_model_gradient_checkpointing_compatible -from utils.model.model_utils import create_hf_model, create_critic_model -from utils.utils import get_optimizer_grouped_parameters +from dschat.utils.ds_utils import get_train_ds_config, get_eval_ds_config +from dschat.utils.module.lora import convert_linear_layer_to_lora, only_optimize_lora_parameters, make_model_gradient_checkpointing_compatible +from dschat.utils.model.model_utils import create_hf_model, create_critic_model +from dschat.utils.utils import get_optimizer_grouped_parameters """ TODOs: * support HF models for critic (for debugging), must be a previously saved ckpt from step-2 @@ -53,7 +53,6 @@ def __init__(self, actor_model_name_or_path, critic_model_name_or_path, if self.args.enable_ema: self.actor_ema = self._init_ema( actor_model_name_or_path=actor_model_name_or_path) - self.critic = self._init_critic( critic_model_name_or_path=critic_model_name_or_path) self.reward = self._init_reward( @@ -67,6 +66,7 @@ def _init_actor(self, actor_model_name_or_path): # DS Config ds_config = get_train_ds_config( offload=self.args.offload, + dtype=self.args.dtype, stage=self.args.actor_zero_stage, enable_hybrid_engine=self.args.enable_hybrid_engine, inference_tp_size=self.args.inference_tp_size, @@ -76,13 +76,14 @@ def _init_actor(self, actor_model_name_or_path): max_out_tokens=self.args.max_prompt_seq_len + self.args.max_answer_seq_len, enable_tensorboard=self.args.enable_tensorboard, + enable_mixed_precision_lora=self.args.enable_mixed_precision_lora, tb_path=self.args.tensorboard_path, tb_name="step3_actor") ds_config[ - 'train_micro_batch_size_per_gpu'] = self.args.per_device_mini_train_batch_size + 'train_micro_batch_size_per_gpu'] = self.args.per_device_training_batch_size #TODO(jeff): we should probably set grad accumlation steps here as well for clarity ds_config[ - 'train_batch_size'] = self.args.per_device_mini_train_batch_size * torch.distributed.get_world_size( + 'train_batch_size'] = self.args.per_device_training_batch_size * 
torch.distributed.get_world_size( ) * self.args.gradient_accumulation_steps_actor # Model @@ -91,7 +92,7 @@ def _init_actor(self, actor_model_name_or_path): model_name_or_path=actor_model_name_or_path, tokenizer=self.tokenizer, ds_config=ds_config, - disable_dropout=self.args.disable_actor_dropout) + dropout=self.args.actor_dropout) # LoRA if self.args.actor_lora_dim > 0: @@ -106,7 +107,8 @@ def _init_actor(self, actor_model_name_or_path): # Optimizer AdamOptimizer = DeepSpeedCPUAdam if self.args.offload else FusedAdam optim_params = get_optimizer_grouped_parameters( - actor_model, self.args.actor_weight_decay) + actor_model, self.args.actor_weight_decay, + self.args.actor_lora_learning_rate) optim = AdamOptimizer(optim_params, lr=self.args.actor_learning_rate, betas=(0.9, 0.95)) @@ -138,12 +140,12 @@ def _init_ref(self, actor_model_name_or_path): # If actor is ZeRO-3 then we use it for everything, otherwise assume we have enough memory for ref model zero_stage = 0 ds_config = get_eval_ds_config(self.args.offload_reference_model, - zero_stage) + self.args.dtype, zero_stage) ds_config[ - 'train_micro_batch_size_per_gpu'] = self.args.per_device_mini_train_batch_size + 'train_micro_batch_size_per_gpu'] = self.args.per_device_training_batch_size #TODO(jeff): we should probably set grad accumlation steps here as well for clarity ds_config[ - 'train_batch_size'] = self.args.per_device_mini_train_batch_size * torch.distributed.get_world_size( + 'train_batch_size'] = self.args.per_device_training_batch_size * torch.distributed.get_world_size( ) * self.args.gradient_accumulation_steps_actor ref_model = create_hf_model(AutoModelForCausalLM, @@ -164,12 +166,12 @@ def _init_ema(self, actor_model_name_or_path): # If actor is ZeRO-3 then we use it for everything, otherwise assume we have enough memory zero_stage = 0 ds_config = get_eval_ds_config(self.args.offload_reference_model, - zero_stage) + self.args.dtype, zero_stage) ds_config[ - 'train_micro_batch_size_per_gpu'] = self.args.per_device_mini_train_batch_size + 'train_micro_batch_size_per_gpu'] = self.args.per_device_training_batch_size #TODO(jeff): we should probably set grad accumlation steps here as well for clarity ds_config[ - 'train_batch_size'] = self.args.per_device_mini_train_batch_size * torch.distributed.get_world_size( + 'train_batch_size'] = self.args.per_device_training_batch_size * torch.distributed.get_world_size( ) * self.args.gradient_accumulation_steps_actor actor_model_ema = create_hf_model(AutoModelForCausalLM, @@ -190,26 +192,26 @@ def _init_critic(self, critic_model_name_or_path): stime = log_init("Critic") ds_config = get_train_ds_config( offload=self.args.offload, + dtype=self.args.dtype, stage=self.args.critic_zero_stage, enable_tensorboard=self.args.enable_tensorboard, tb_path=self.args.tensorboard_path, tb_name="step3_critic") ds_config[ - 'train_micro_batch_size_per_gpu'] = self.args.per_device_mini_train_batch_size + 'train_micro_batch_size_per_gpu'] = self.args.per_device_training_batch_size #TODO(jeff): we should probably set grad accumlation steps here as well for clarity ds_config[ - 'train_batch_size'] = self.args.per_device_mini_train_batch_size * torch.distributed.get_world_size( + 'train_batch_size'] = self.args.per_device_training_batch_size * torch.distributed.get_world_size( ) * self.args.gradient_accumulation_steps - #TODO(jeff): should not be needed, we should be able to use ds_config above - #TODO(jeff): it means we never create the critic w. 
zero.init context if we are using ZeRO-3 - # ds_eval_config = get_eval_ds_config(offload=False, stage=self.args.critic_zero_stage) - ds_eval_config = get_eval_ds_config(offload=False, stage=0) - #Minjia: We need to set train batch size and micro batch size here to pass the sanity check of DeepSpeed engine. + ds_eval_config = get_eval_ds_config(offload=False, + dtype=self.args.dtype, + stage=self.args.critic_zero_stage) + # We need to set train batch size and micro batch size here to pass the sanity check of DeepSpeed engine. ds_eval_config[ - 'train_micro_batch_size_per_gpu'] = self.args.per_device_mini_train_batch_size + 'train_micro_batch_size_per_gpu'] = self.args.per_device_training_batch_size ds_eval_config[ - 'train_batch_size'] = self.args.per_device_mini_train_batch_size * torch.distributed.get_world_size( + 'train_batch_size'] = self.args.per_device_training_batch_size * torch.distributed.get_world_size( ) * self.args.gradient_accumulation_steps # Model @@ -219,7 +221,8 @@ def _init_critic(self, critic_model_name_or_path): ds_config=ds_eval_config, num_padding_at_beginning=self.args.num_padding_at_beginning, rlhf_training=True, - disable_dropout=self.args.disable_critic_dropout) + dropout=self.args.critic_dropout, + zero_stage=self.args.critic_zero_stage) # LoRA if self.args.critic_lora_dim > 0: @@ -233,9 +236,10 @@ def _init_critic(self, critic_model_name_or_path): # Optimizer AdamOptimizer = DeepSpeedCPUAdam if self.args.offload else FusedAdam - optim_pararms = get_optimizer_grouped_parameters( - critic_model, self.args.critic_weight_decay) - optim = AdamOptimizer(optim_pararms, + optim_params = get_optimizer_grouped_parameters( + critic_model, self.args.critic_weight_decay, + self.args.critic_lora_learning_rate) + optim = AdamOptimizer(optim_params, lr=self.args.critic_learning_rate, betas=(0.9, 0.95)) @@ -265,22 +269,23 @@ def _init_reward(self, critic_model_name_or_path): zero_stage = 0 ds_config = get_eval_ds_config(offload=self.args.offload, + dtype=self.args.dtype, stage=zero_stage) ds_config[ - 'train_micro_batch_size_per_gpu'] = self.args.per_device_mini_train_batch_size + 'train_micro_batch_size_per_gpu'] = self.args.per_device_training_batch_size ds_config[ - 'train_batch_size'] = self.args.per_device_mini_train_batch_size * torch.distributed.get_world_size( + 'train_batch_size'] = self.args.per_device_training_batch_size * torch.distributed.get_world_size( ) * self.args.gradient_accumulation_steps - #TODO(jeff): should not be needed, we should be able to use ds_config above - #TODO(jeff): it means we never create the critic w. zero.init context if we are using ZeRO-3 - # ds_eval_config = get_eval_ds_config(offload=False, stage=zero_stage) - ds_eval_config = get_eval_ds_config(offload=False, stage=0) - #Minjia: We need to set train batch size and micro batch size here to pass the sanity check of DeepSpeed engine. + ds_eval_config = get_eval_ds_config(offload=False, + dtype=self.args.dtype, + stage=zero_stage) + + # We need to set train batch size and micro batch size here to pass the sanity check of DeepSpeed engine. 
ds_eval_config[ - 'train_micro_batch_size_per_gpu'] = self.args.per_device_mini_train_batch_size + 'train_micro_batch_size_per_gpu'] = self.args.per_device_training_batch_size ds_eval_config[ - 'train_batch_size'] = self.args.per_device_mini_train_batch_size * torch.distributed.get_world_size( + 'train_batch_size'] = self.args.per_device_training_batch_size * torch.distributed.get_world_size( ) * self.args.gradient_accumulation_steps # Model @@ -289,7 +294,9 @@ def _init_reward(self, critic_model_name_or_path): tokenizer=self.tokenizer, ds_config=ds_eval_config, num_padding_at_beginning=self.args.num_padding_at_beginning, - rlhf_training=True) + rlhf_training=True, + dropout=self.args.critic_dropout, + zero_stage=zero_stage) reward_engine, *_ = deepspeed.initialize(model=reward_model, config=ds_config) diff --git a/applications/DeepSpeed-Chat/training/utils/data/data_utils.py b/applications/DeepSpeed-Chat/dschat/utils/data/data_utils.py similarity index 99% rename from applications/DeepSpeed-Chat/training/utils/data/data_utils.py rename to applications/DeepSpeed-Chat/dschat/utils/data/data_utils.py index 91887b7bd..744ad7927 100644 --- a/applications/DeepSpeed-Chat/training/utils/data/data_utils.py +++ b/applications/DeepSpeed-Chat/dschat/utils/data/data_utils.py @@ -14,7 +14,8 @@ import os import hashlib from itertools import chain -from . import raw_datasets +from dschat.utils.data import raw_datasets +from deepspeed.accelerator import get_accelerator def get_raw_dataset(dataset_name, output_path, seed, local_rank): @@ -281,7 +282,8 @@ def create_prompt_dataset(local_rank, eval_fname = f"{output_path}/evaldata_{fname}.pt" cache_found = os.path.isfile(train_fname) and os.path.isfile(eval_fname) - buf_create_cache = torch.ByteTensor([not cache_found]).cuda() + buf_create_cache = torch.ByteTensor([not cache_found]).to( + get_accelerator().current_device_name()) torch.distributed.all_reduce(buf_create_cache) if local_rank <= 0 and (buf_create_cache.item() != 0 or reload): diff --git a/applications/DeepSpeed-Chat/training/utils/data/raw_datasets.py b/applications/DeepSpeed-Chat/dschat/utils/data/raw_datasets.py similarity index 99% rename from applications/DeepSpeed-Chat/training/utils/data/raw_datasets.py rename to applications/DeepSpeed-Chat/dschat/utils/data/raw_datasets.py index 3c84f4b07..2838f9dc0 100644 --- a/applications/DeepSpeed-Chat/training/utils/data/raw_datasets.py +++ b/applications/DeepSpeed-Chat/dschat/utils/data/raw_datasets.py @@ -1,8 +1,9 @@ # Copyright (c) Microsoft Corporation. 
# SPDX-License-Identifier: Apache-2.0 +import os # DeepSpeed Team -from datasets import load_dataset +from datasets import load_dataset, load_from_disk from torch.utils.data import Subset import re @@ -15,7 +16,9 @@ def __init__(self, output_path, seed, local_rank, dataset_name): self.output_path = output_path self.seed = seed self.local_rank = local_rank - if not dataset_name == 'local/jsonfile': + if os.path.exists(dataset_name): + self.raw_datasets = load_from_disk(dataset_name) + elif not dataset_name == 'local/jsonfile': self.raw_datasets = load_dataset(dataset_name) def get_train_data(self): diff --git a/applications/DeepSpeed-Chat/training/utils/ds_utils.py b/applications/DeepSpeed-Chat/dschat/utils/ds_utils.py similarity index 72% rename from applications/DeepSpeed-Chat/training/utils/ds_utils.py rename to applications/DeepSpeed-Chat/dschat/utils/ds_utils.py index ceae681c6..9c15e5143 100644 --- a/applications/DeepSpeed-Chat/training/utils/ds_utils.py +++ b/applications/DeepSpeed-Chat/dschat/utils/ds_utils.py @@ -2,11 +2,16 @@ # SPDX-License-Identifier: Apache-2.0 # DeepSpeed Team + +import deepspeed.comm as dist +from deepspeed.accelerator import get_accelerator + GLOBAL_BATCH_SIZE = 32 MICRO_BATCH_SIZE = 4 def get_train_ds_config(offload, + dtype, stage=2, enable_hybrid_engine=False, inference_tp_size=1, @@ -15,10 +20,17 @@ def get_train_ds_config(offload, tp_gather_partition_size=8, max_out_tokens=512, enable_tensorboard=False, + enable_mixed_precision_lora=False, tb_path="", tb_name=""): device = "cpu" if offload else "none" + if dtype == "fp16": + data_type = "fp16" + dtype_config = {"enabled": True, "loss_scale_window": 100} + elif dtype == "bf16": + data_type = "bfloat16" + dtype_config = {"enabled": True} zero_opt_dict = { "stage": stage, "offload_param": { @@ -32,15 +44,17 @@ def get_train_ds_config(offload, "stage3_prefetch_bucket_size": 3e7, "memory_efficient_linear": False } + if enable_mixed_precision_lora: + zero_opt_dict["zero_quantized_nontrainable_weights"] = True + if dist.get_world_size() != get_accelerator().device_count(): + zero_opt_dict["zero_hpz_partition_size"] = get_accelerator( + ).device_count() return { "train_batch_size": GLOBAL_BATCH_SIZE, "train_micro_batch_size_per_gpu": MICRO_BATCH_SIZE, "steps_per_print": 10, "zero_optimization": zero_opt_dict, - "fp16": { - "enabled": True, - "loss_scale_window": 100 - }, + data_type: dtype_config, "gradient_clipping": 1.0, "prescale_gradients": False, "wall_clock_breakdown": False, @@ -60,8 +74,16 @@ def get_train_ds_config(offload, } -def get_eval_ds_config(offload, stage=0): +def get_eval_ds_config(offload, dtype, stage=0): device = "cpu" if offload else "none" + if dtype == "fp16": + data_type = "fp16" + dtype_config = { + "enabled": True, + } + elif dtype == "bf16": + data_type = "bfloat16" + dtype_config = {"enabled": True} zero_opt_dict = { "stage": stage, "stage3_param_persistence_threshold": 1e4, @@ -75,9 +97,7 @@ def get_eval_ds_config(offload, stage=0): "train_micro_batch_size_per_gpu": MICRO_BATCH_SIZE, "steps_per_print": 10, "zero_optimization": zero_opt_dict, - "fp16": { - "enabled": True - }, + data_type: dtype_config, "gradient_clipping": 1.0, "prescale_gradients": False, "wall_clock_breakdown": False diff --git a/applications/DeepSpeed-Chat/dschat/utils/model/model_utils.py b/applications/DeepSpeed-Chat/dschat/utils/model/model_utils.py new file mode 100644 index 000000000..97d3bff15 --- /dev/null +++ b/applications/DeepSpeed-Chat/dschat/utils/model/model_utils.py @@ -0,0 +1,172 @@ +# Copyright 
(c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +import os +import math +import torch +from transformers import ( + AutoConfig, + AutoModel, +) +from huggingface_hub import snapshot_download +from transformers.deepspeed import HfDeepSpeedConfig + +from dschat.utils.model.reward_model import RewardModel +from dschat.utils.utils import load_state_dict_into_model, print_rank_0 + + +def configure_dropout(model_config, dropout): + if dropout is not None: + for key in ('dropout', 'attention_dropout', 'hidden_dropout', + 'activation_dropout'): + if hasattr(model_config, key): + print(f"Setting model_config.{key} to {dropout}") + setattr(model_config, key, dropout) + + +def causal_lm_model_to_fp32_loss(model): + """ Convert CausalLM model to calculate loss in fp32 """ + + def causal_lm_forward( + input_ids=None, + past_key_values=None, + attention_mask=None, + head_mask=None, + inputs_embeds=None, + labels=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + **deprecated_arguments, + ): + kwargs = dict() if model.config.model_type == "llama" else dict( + head_mask=head_mask) + output = model.__original_forward__( + input_ids=input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + labels=None, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + **kwargs) + + return_dict = isinstance(output, dict) + lm_logits = output.logits if return_dict else output[0] + loss = None + if labels is not None: + # move labels to correct device to enable model parallelism + labels = labels.to(lm_logits.device) + # Shift so that tokens < n predict n + shift_logits = lm_logits[..., :-1, :].float().contiguous() + shift_labels = labels[..., 1:].contiguous() + batch_size, seq_length, vocab_size = shift_logits.shape + # Flatten the tokens + loss_fct = torch.nn.CrossEntropyLoss() + loss = loss_fct( + shift_logits.view(batch_size * seq_length, vocab_size), + shift_labels.view(batch_size * seq_length)) + + if not return_dict: + # re-pack output with fp32 loss + return ((loss, ) + output) if loss is not None else output + + output.loss = loss + return output + + model.__original_forward__ = model.forward + model.forward = causal_lm_forward + + +def create_hf_model(model_class, + model_name_or_path, + tokenizer, + ds_config=None, + rlhf_training=False, + dropout=None): + model_config = AutoConfig.from_pretrained(model_name_or_path) + configure_dropout(model_config, dropout) + + # Note: dschf is defined in function scope to avoid global effects + # https://huggingface.co/docs/transformers/main_classes/deepspeed#nontrainer-deepspeed-integration + if ds_config is not None and ds_config["zero_optimization"]["stage"] == 3: + dschf = HfDeepSpeedConfig(ds_config) + else: + dschf = None + if rlhf_training: + # the weight loading is handled by create critic model + model = model_class.from_config(model_config) + else: + model = model_class.from_pretrained( + model_name_or_path, + from_tf=bool(".ckpt" in model_name_or_path), + config=model_config) + + model.config.end_token_id = tokenizer.eos_token_id + model.config.pad_token_id = model.config.eos_token_id + model.resize_token_embeddings(int( + 8 * + math.ceil(len(tokenizer) / 8.0))) # make the vocab size multiple of 8 + + return model + + +def create_critic_model(model_name_or_path, + tokenizer, + ds_config, + num_padding_at_beginning=0, + rlhf_training=False, + 
dropout=None, + zero_stage=0, + compute_fp32_loss=False): + # OPT model family always put a padding token at the beginning of the sequence, + # we did not see this in other models but not sure if it is a general rule + + import time + + start = time.time() + critic_model = create_hf_model(AutoModel, model_name_or_path, tokenizer, + ds_config, rlhf_training, dropout) + end = time.time() + print_rank_0(f">Creating model from_config took {end - start} seconds", + None) + + critic_model = RewardModel( + critic_model, + tokenizer, + num_padding_at_beginning=num_padding_at_beginning, + compute_fp32_loss=compute_fp32_loss) + + if rlhf_training: + # load critic model from checkpoint + + if not os.path.isdir(model_name_or_path): + model_name_or_path = snapshot_download(model_name_or_path) + model_ckpt_path = os.path.join(model_name_or_path, 'pytorch_model.bin') + assert os.path.exists( + model_ckpt_path + ), f"Cannot find model checkpoint at {model_ckpt_path}" + + start = time.time() + model_ckpt_state_dict = torch.load(model_ckpt_path, map_location='cpu') + end = time.time() + print_rank_0(f">Creating model from_config took {end - start} seconds", + None) + + # load critic model from checkpoint with zero-stage 3 compatibility + # this functionality may be moved to DS checkpoint load API in future + start = time.time() + load_state_dict_into_model(critic_model, + model_ckpt_state_dict, + "", + zero_stage=zero_stage) + end = time.time() + + print_rank_0(f">Creating model from_config took {end - start} seconds", + None) + + return critic_model diff --git a/applications/DeepSpeed-Chat/training/utils/model/reward_model.py b/applications/DeepSpeed-Chat/dschat/utils/model/reward_model.py similarity index 85% rename from applications/DeepSpeed-Chat/training/utils/model/reward_model.py rename to applications/DeepSpeed-Chat/dschat/utils/model/reward_model.py index f0c9e211f..60d063b18 100644 --- a/applications/DeepSpeed-Chat/training/utils/model/reward_model.py +++ b/applications/DeepSpeed-Chat/dschat/utils/model/reward_model.py @@ -10,7 +10,11 @@ ## https://github.com/CarperAI/trlx/blob/main/examples/summarize_rlhf/reward_model/reward_model.py class RewardModel(nn.Module): - def __init__(self, base_model, tokenizer, num_padding_at_beginning=0): + def __init__(self, + base_model, + tokenizer, + num_padding_at_beginning=0, + compute_fp32_loss=False): super().__init__() self.config = base_model.config self.num_padding_at_beginning = num_padding_at_beginning @@ -25,14 +29,15 @@ def __init__(self, base_model, tokenizer, num_padding_at_beginning=0): self.config.n_embd = self.config.hidden_size if hasattr( self.config, "hidden_size") else self.config.n_embd self.v_head = nn.Linear(self.config.n_embd, 1, bias=False) - self.rwtranrsformer = base_model + self.rwtransformer = base_model self.PAD_ID = tokenizer.pad_token_id + self.compute_fp32_loss = compute_fp32_loss def gradient_checkpointing_enable(self): - self.rwtranrsformer.gradient_checkpointing_enable() + self.rwtransformer.gradient_checkpointing_enable() def gradient_checkpointing_disable(self): - self.rwtranrsformer.gradient_checkpointing_disable() + self.rwtransformer.gradient_checkpointing_disable() def forward(self, input_ids=None, @@ -44,13 +49,18 @@ def forward(self, use_cache=False): loss = None - transformer_outputs = self.rwtranrsformer( + if self.config.model_type == "llama": + kwargs = dict() + else: + kwargs = dict(head_mask=head_mask) + + transformer_outputs = self.rwtransformer( input_ids, past_key_values=past_key_values, 
attention_mask=attention_mask, - head_mask=head_mask, inputs_embeds=inputs_embeds, - use_cache=use_cache) + use_cache=use_cache, + **kwargs) hidden_states = transformer_outputs[0] rewards = self.v_head(hidden_states).squeeze(-1) @@ -68,7 +78,7 @@ def forward(self, rejected_rewards = rewards[bs:] # Compute pairwise loss. Only backprop on the different tokens before padding - loss = 0 + loss = 0. for i in range(bs): chosen_id = chosen_ids[i] rejected_id = rejected_ids[i] @@ -99,6 +109,9 @@ def forward(self, chosen_reward[c_ind - 1]) #use the end score for reference rejected_mean_scores.append(rejected_reward[r_ind - 1]) + if self.compute_fp32_loss: + c_truncated_reward = c_truncated_reward.float() + r_truncated_reward = r_truncated_reward.float() loss += -torch.nn.functional.logsigmoid(c_truncated_reward - r_truncated_reward).mean() @@ -122,13 +135,18 @@ def forward_value(self, prompt_length=0, use_cache=False): - transformer_outputs = self.rwtranrsformer( + if self.config.model_type == "llama": + kwargs = dict() + else: + kwargs = dict(head_mask=head_mask) + + transformer_outputs = self.rwtransformer( input_ids, past_key_values=past_key_values, attention_mask=attention_mask, - head_mask=head_mask, inputs_embeds=inputs_embeds, - use_cache=use_cache) + use_cache=use_cache, + **kwargs) hidden_states = transformer_outputs[0] values = self.v_head(hidden_states).squeeze(-1) if return_value_only: diff --git a/applications/DeepSpeed-Chat/training/utils/module/lora.py b/applications/DeepSpeed-Chat/dschat/utils/module/lora.py similarity index 98% rename from applications/DeepSpeed-Chat/training/utils/module/lora.py rename to applications/DeepSpeed-Chat/dschat/utils/module/lora.py index cd37e6496..32c9730b6 100644 --- a/applications/DeepSpeed-Chat/training/utils/module/lora.py +++ b/applications/DeepSpeed-Chat/dschat/utils/module/lora.py @@ -131,10 +131,10 @@ def convert_lora_to_linear_layer(model): return model -def only_optimize_lora_parameters(model): +def only_optimize_lora_parameters(model, force_optimize_params=[]): # turn off the gradient of all the parameters except the LoRA parameters for name, param in model.named_parameters(): - if "lora_right_weight" in name or "lora_left_weight" in name: + if "lora_right_weight" in name or "lora_left_weight" in name or name in force_optimize_params: param.requires_grad = True else: param.requires_grad = False diff --git a/applications/DeepSpeed-Chat/dschat/utils/perf.py b/applications/DeepSpeed-Chat/dschat/utils/perf.py new file mode 100644 index 000000000..df57a2046 --- /dev/null +++ b/applications/DeepSpeed-Chat/dschat/utils/perf.py @@ -0,0 +1,166 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import torch + + +# This function can be used to print throughput for Step 1 and 2 only +def print_throughput(hf_model, args, e2e_time, rank=0): + if rank <= 0: + hf_config = hf_model.config + num_layers, hidden_size, vocab_size = get_hf_configs(hf_config) + + gpus_per_model = torch.distributed.get_world_size() + seq_length = args.max_seq_len + batch_size = args.per_device_train_batch_size + samples_per_second = batch_size / e2e_time + checkpoint_activations_factor = 4 if args.gradient_checkpointing else 3 + if args.lora_dim > 0: + k = args.lora_dim * 2 / hidden_size + checkpoint_activations_factor -= (1 - k) + + hf_model._num_params = sum([ + p.ds_numel if hasattr(p, "ds_tensor") else p.numel() + for p in hf_model.parameters() + ]) + params_in_billions = hf_model._num_params / (1e9) + + # Megatron paper's formula to calculate training flops + train_flops_per_iteration = calculate_flops( + checkpoint_activations_factor, batch_size, seq_length, hf_config) + + train_tflops = train_flops_per_iteration / (e2e_time * gpus_per_model * + (10**12)) + + param_string = f"{params_in_billions:.3f} B" if params_in_billions != 0 else "NA" + print( + f"Model Parameters: {param_string}, Latency: {e2e_time:.2f}s, TFLOPs: {train_tflops:.2f}, Samples/sec: {samples_per_second:.2f}, Time/seq {e2e_time/batch_size:.2f}s, Batch Size: {batch_size}, Sequence Length: {seq_length}" + ) + + +# Enhanced version of the function above that provides calculations and printing for Step 3 +def print_throughput_step3(actor_model, + critic_model, + args, + e2e_time, + gen_exp_time, + train_time, + rank=0): + if rank <= 0: + # Actor model passed here is a HF model. + actor_hf_config = actor_model.config + # Critic model passed here is a DeepSpeed Engine. The module inside is the Reward model (that wraps a HF model). 
+ critic_hf_config = critic_model.module.config + + actor_num_layers, actor_hidden_size, actor_vocab_size = get_hf_configs( + actor_hf_config) + critic_num_layers, critic_hidden_size, critic_vocab_size = get_hf_configs( + critic_hf_config) + + gpus_per_model = torch.distributed.get_world_size() + seq_length = args.max_answer_seq_len + args.max_prompt_seq_len + batch_size = args.per_device_generation_batch_size * args.generation_batches * args.ppo_epochs * gpus_per_model * 1 if args.unsupervised_dataset_name is None else 2 + samples_per_second = batch_size / e2e_time + + actor_checkpoint_activations_factor = 4 if args.actor_gradient_checkpointing else 3 + critic_checkpoint_activations_factor = 4 if args.critic_gradient_checkpointing else 3 + if args.actor_lora_dim > 0: + k = args.actor_lora_dim * 2 / actor_hidden_size + actor_checkpoint_activations_factor -= (1 - k) + if args.critic_lora_dim > 0: + k = args.critic_lora_dim * 2 / critic_hidden_size + critic_checkpoint_activations_factor -= (1 - k) + + actor_model._num_params = sum([ + p.ds_numel if hasattr(p, "ds_tensor") else p.numel() + for p in actor_model.parameters() + ]) + actor_params_in_billions = actor_model._num_params / (1e9) + + critic_model._num_params = sum([ + p.ds_numel if hasattr(p, "ds_tensor") else p.numel() + for p in critic_model.parameters() + ]) + critic_params_in_billions = critic_model._num_params / (1e9) + + # Megatron paper's formula to calculate training flops + + actor_train_flops_per_iteration = calculate_flops( + actor_checkpoint_activations_factor, batch_size, seq_length, + actor_hf_config) + critic_train_flops_per_iteration = calculate_flops( + critic_checkpoint_activations_factor, batch_size, seq_length, + critic_hf_config) + + total_train_flops = actor_train_flops_per_iteration + critic_train_flops_per_iteration + train_tflops = total_train_flops / (train_time * gpus_per_model * + (10**12)) + + gen_bs = args.per_device_generation_batch_size * gpus_per_model + + # Modified formula for calculating flops in the forward pass only + gen_flops_per_iteration = ( + 24 * gen_bs * seq_length * actor_num_layers * + (actor_hidden_size**2)) * ( + 1.0 + (seq_length / (6.0 * actor_hidden_size)) + + (actor_vocab_size / + (16.0 * actor_num_layers * actor_hidden_size))) + + gen_tflops = gen_flops_per_iteration / (gen_exp_time * gpus_per_model * + (10**12)) + + if actor_hf_config.torch_dtype == torch.float16: + num_bytes = 2 + elif actor_hf_config.torch_dtype == torch.float32: + num_bytes = 4 + else: + num_bytes = -1 + + pertok_lat = gen_exp_time / args.max_answer_seq_len + gen_bw = 1 / pertok_lat * actor_model._num_params * num_bytes / 1e9 + + total_flops_per_iteration = total_train_flops + gen_flops_per_iteration * args.generation_batches + total_tflops = total_flops_per_iteration / (e2e_time * gpus_per_model * + (10**12)) + + print( + f"End-to-End => Latency: {e2e_time:.2f}s, TFLOPs: {total_tflops:.2f}, Samples/sec: {samples_per_second:.2f}, Time/seq {e2e_time/batch_size:.2f}s, Batch Size: {batch_size}, Total Seq. Length: {seq_length}" + ) + print( + f"Generation => Latency: {gen_exp_time:.2f}s, Per-token Latency {pertok_lat*1000:.2f} ms, TFLOPs: {gen_tflops:.2f}, BW: {gen_bw if num_bytes > 0 else num_bytes:.2f} GB/sec, Answer Seq. 
Length: {args.max_answer_seq_len}" + ) + print( + f"Training => Latency: {train_time:.2f}s, TFLOPs: {train_tflops:.2f}" + ) + actor_param_string = f"{actor_params_in_billions:.3f} B" if actor_params_in_billions != 0 else "NA" + critic_param_string = f"{critic_params_in_billions:.3f} B" if critic_params_in_billions != 0 else "NA" + print( + f"Actor Model Parameters => {actor_param_string}, Critic Model Parameters => {critic_param_string}" + ) + + +# Helper function to calculate FLOPs using the Megatron-LM paper's formula +def calculate_flops(checkpoint_activations_factor, batch_size, seq_length, + hf_config): + num_layers, hidden_size, vocab_size = get_hf_configs(hf_config) + flops_per_iteration = (24 * checkpoint_activations_factor * batch_size * + seq_length * num_layers * (hidden_size**2)) * ( + 1.0 + (seq_length / (6.0 * hidden_size)) + + (vocab_size / + (16.0 * num_layers * hidden_size))) + return flops_per_iteration + + +def get_hf_configs(hf_config): + num_layers = getattr(hf_config, "num_hidden_layers", + getattr(hf_config, "n_layer", None)) + hidden_size = getattr(hf_config, "hidden_size", + getattr(hf_config, "n_embd", None)) + vocab_size = getattr(hf_config, "vocab_size", None) + assert all( + (num_layers, hidden_size, vocab_size) + ), "Could not determine number of layers, hidden size, and vocab size of the model" + + return num_layers, hidden_size, vocab_size diff --git a/applications/DeepSpeed-Chat/dschat/utils/utils.py b/applications/DeepSpeed-Chat/dschat/utils/utils.py new file mode 100644 index 000000000..e4dc7d036 --- /dev/null +++ b/applications/DeepSpeed-Chat/dschat/utils/utils.py @@ -0,0 +1,310 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +import os +import torch +import random +import numpy as np +from transformers import set_seed, AutoTokenizer +import json +import deepspeed +from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus +from deepspeed.accelerator import get_accelerator +import torch.nn as nn + + +def print_rank_0(msg, rank=None): + if rank is not None and rank <= 0: + print(msg) + elif is_rank_0(): + print(msg) + + +def is_rank_0(): + """Check whether it is rank 0.""" + if torch.distributed.is_initialized(): + if torch.distributed.get_rank() == 0: + return True + else: + return False + else: + return True + + +def to_device(batch, device): + output = {} + for k, v in batch.items(): + try: + output[k] = v.to(device) + except: + output[k] = v + return output + + +class MovingAverage: + + def __init__(self): + self.count = 0 + self.total = 0 + self.mean = 0 + + def update(self, num): + self.total += num + self.count += 1 + self.mean = self.total / self.count + + return self.mean + + +class ExponentialMovingAverage: + + def __init__(self, alpha=0.9): + self.alpha = alpha + self.ema = None + + def update(self, num): + prev_ema = num if self.ema is None else self.ema + self.ema = self.alpha * prev_ema + (1.0 - self.alpha) * num + return self.ema + + def get(self): + return self.ema if self.ema is not None else 0. 
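A note for readers of the new `dschat/utils/perf.py` above: both throughput printers hinge on `calculate_flops`, which applies the Megatron paper's per-iteration FLOPs estimate. The standalone sketch below only evaluates that formula on concrete numbers so the magnitudes are easier to sanity-check; the OPT-1.3b-like dimensions (24 layers, hidden size 2048, ~50k vocab) and the batch/sequence sizes are illustrative assumptions, not values taken from this patch.

```python
# Illustration only: re-derive the Megatron-style FLOPs estimate used by
# calculate_flops() in perf.py for a hypothetical OPT-1.3b-like configuration.
# All numbers below are assumptions chosen for the example.

def flops_per_iteration(checkpoint_activations_factor, batch_size, seq_length,
                        num_layers, hidden_size, vocab_size):
    # 24 * c * B * s * L * h^2 * (1 + s / (6h) + V / (16 * L * h))
    return (24 * checkpoint_activations_factor * batch_size * seq_length *
            num_layers * hidden_size**2) * (
                1.0 + seq_length / (6.0 * hidden_size) +
                vocab_size / (16.0 * num_layers * hidden_size))


if __name__ == "__main__":
    # checkpoint_activations_factor=4 assumes gradient checkpointing is on
    # (perf.py uses 3 when it is off and shrinks the factor further for LoRA).
    flops = flops_per_iteration(checkpoint_activations_factor=4,
                                batch_size=4,
                                seq_length=512,
                                num_layers=24,
                                hidden_size=2048,
                                vocab_size=50272)
    print(f"~{flops / 1e12:.1f} TFLOPs per iteration")
```

`print_throughput` then divides this quantity by the measured step time and the world size to report achieved TFLOPs per GPU.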
+ + +def get_tokenizer(model_name_or_path, fast_tokenizer=True): + if "llama" in model_name_or_path: + from transformers.models.llama import LlamaTokenizer + tokenizer = LlamaTokenizer.from_pretrained( + model_name_or_path, fast_tokenizer=fast_tokenizer) + if tokenizer.pad_token is None: + # assert tokenizer.eos_token is not None + # tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token}) + tokenizer.add_special_tokens({'pad_token': '[PAD]'}) + tokenizer.padding_side = 'right' + else: + tokenizer = AutoTokenizer.from_pretrained( + model_name_or_path, fast_tokenizer=fast_tokenizer) + tokenizer.pad_token = tokenizer.eos_token + # make sure tokenizer is right pad in our logic + tokenizer.padding_side = 'right' + return tokenizer + + +def load_hf_tokenizer(model_name_or_path, + fast_tokenizer=True, + add_special_tokens=None): + if os.path.exists(model_name_or_path): + # Locally tokenizer loading has some issue, so we need to force download + model_json = os.path.join(model_name_or_path, "config.json") + if os.path.exists(model_json): + model_json_file = json.load(open(model_json)) + model_name = model_json_file.get("_name_or_path", + model_name_or_path) + tokenizer = get_tokenizer(model_name, + fast_tokenizer=fast_tokenizer) + else: + tokenizer = get_tokenizer(model_name_or_path, + fast_tokenizer=fast_tokenizer) + + if add_special_tokens is not None: + add_special_tokens = [add_special_tokens] if isinstance(add_special_tokens, str) \ + else add_special_tokens + tokenizer.add_special_tokens( + {'additional_special_tokens': add_special_tokens}) + + return tokenizer + + +def save_hf_format(model, tokenizer, args, sub_folder=""): + # used to save huggingface format, so we can use it for hf.from_pretrained + model_to_save = model.module if hasattr(model, 'module') else model + CONFIG_NAME = "config.json" + WEIGHTS_NAME = "pytorch_model.bin" + output_dir = os.path.join(args.output_dir, sub_folder) + os.makedirs(output_dir, exist_ok=True) + output_model_file = os.path.join(output_dir, WEIGHTS_NAME) + output_config_file = os.path.join(output_dir, CONFIG_NAME) + save_dict = model_to_save.state_dict() + for key in list(save_dict.keys()): + if "lora" in key: + del save_dict[key] + torch.save(save_dict, output_model_file) + model_to_save.config.to_json_file(output_config_file) + tokenizer.save_vocabulary(output_dir) + + +def set_random_seed(seed): + if seed is not None: + set_seed(seed) + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + get_accelerator().manual_seed_all(seed) + + +def get_all_reduce_mean(tensor): + torch.distributed.all_reduce(tensor, op=torch.distributed.ReduceOp.SUM) + tensor = tensor / torch.distributed.get_world_size() + return tensor + + +# This function is a modified version of code available in the from_pretrained API of HuggingFace Transformers +# The code is copied and modified from: https://github.com/huggingface/transformers/blob/5ee9693a1c77c617ebc43ef20194b6d3b674318e/src/transformers/modeling_utils.py#L498 +# This function helps load a HF format checkpoint into a DeepSpeed wrapped model that has been sharded using ZeRO Stage 3 +def load_state_dict_into_model(model_to_load=None, + state_dict=None, + start_prefix="", + zero_stage=0): + + # copy state_dict so _load_from_state_dict can modify it + metadata = getattr(state_dict, "_metadata", None) + state_dict = state_dict.copy() + if metadata is not None: + state_dict._metadata = metadata + + error_msgs = [] + + # PyTorch's `_load_from_state_dict` does not copy parameters in a module's descendants + 
# so we need to apply the function recursively. + def load(module: nn.Module, state_dict, prefix=""): + local_metadata = {} if metadata is None else metadata.get( + prefix[:-1], {}) + args = (state_dict, prefix, local_metadata, True, [], [], error_msgs) + # Parameters of module and children will start with prefix. We can exit early if there are none in this + # state_dict + if len([key for key in state_dict if key.startswith(prefix)]) > 0: + if zero_stage == 3: + # In sharded models, each shard has only part of the full state_dict, so only gather + # parameters that are in the current state_dict. + named_parameters = dict( + module.named_parameters(prefix=prefix[:-1], recurse=False)) + params_to_gather = [ + named_parameters[k] for k in state_dict.keys() + if k in named_parameters + ] + if len(params_to_gather) > 0: + # because zero3 puts placeholders in model params, this context + # manager gathers (unpartitions) the params of the current layer, then loads from + # the state dict and then re-partitions them again + with deepspeed.zero.GatheredParameters(params_to_gather, + modifier_rank=0): + if torch.distributed.get_rank() == 0: + module._load_from_state_dict(*args) + else: + module._load_from_state_dict(*args) + + for name, child in module._modules.items(): + if child is not None: + load(child, state_dict, prefix + name + ".") + + load(model_to_load, state_dict, prefix=start_prefix) + # Delete `state_dict` so it could be collected by GC earlier. Note that `state_dict` is a copy of the argument, so + # it's safe to delete it. + del state_dict + + return error_msgs + + +def get_optimizer_grouped_parameters( + model, + weight_decay, + lora_lr=5e-4, + no_decay_name_list=[ + "bias", "layer_norm.weight", "layernorm.weight", "norm.weight", + "ln_f.weight" + ], + lora_name_list=["lora_right_weight", "lora_left_weight"], +): + optimizer_grouped_parameters = [ + { + "params": [ + p for n, p in model.named_parameters() + if (not any(nd in n.lower() for nd in no_decay_name_list) + and p.requires_grad and not any(nd in n.lower() + for nd in lora_name_list)) + ], + "weight_decay": + weight_decay, + }, + { + "params": [ + p for n, p in model.named_parameters() + if (not any(nd in n.lower() for nd in no_decay_name_list) + and p.requires_grad and any(nd in n.lower() + for nd in lora_name_list)) + ], + "weight_decay": + weight_decay, + "lr": + lora_lr + }, + { + "params": [ + p for n, p in model.named_parameters() + if (any(nd in n.lower() + for nd in no_decay_name_list) and p.requires_grad) + ], + "weight_decay": + 0.0, + }, + ] + + non_empty_groups = [] + for group in optimizer_grouped_parameters: + if group["params"]: + non_empty_groups.append(group) + return non_empty_groups + + +def _z3_params_to_fetch(param_list): + return [ + p for p in param_list + if hasattr(p, 'ds_id') and p.ds_status == ZeroParamStatus.NOT_AVAILABLE + ] + + +def moving_average(model, model_ema, beta=0.992, device=None, zero_stage=0): + zero_stage_3 = (zero_stage == 3) + with torch.no_grad(): + for param, param_ema in zip(model.parameters(), + model_ema.parameters()): + # TODO: use prefiltering for efficiency + params_to_fetch = _z3_params_to_fetch([param, param_ema + ]) if zero_stage_3 else [] + should_gather_param = len(params_to_fetch) > 0 + with deepspeed.zero.GatheredParameters( + params_to_fetch, enabled=should_gather_param): + data = param.data + if device is not None: + data = data.to(device) + param_ema.data.copy_(torch.lerp(data, param_ema.data, beta)) + + +def save_zero_three_model(model_ema, global_rank, save_dir, 
zero_stage=0): + zero_stage_3 = (zero_stage == 3) + os.makedirs(save_dir, exist_ok=True) + WEIGHTS_NAME = "pytorch_model.bin" + output_model_file = os.path.join(save_dir, WEIGHTS_NAME) + + model_to_save = model_ema.module if hasattr(model_ema, + 'module') else model_ema + if not zero_stage_3: + if global_rank == 0: + torch.save(model_to_save.state_dict(), output_model_file) + else: + output_state_dict = {} + for k, v in model_to_save.named_parameters(): + + if hasattr(v, 'ds_id'): + with deepspeed.zero.GatheredParameters(_z3_params_to_fetch([v + ]), + enabled=zero_stage_3): + v_p = v.data.cpu() + else: + v_p = v.cpu() + if global_rank == 0 and "lora" not in k: + output_state_dict[k] = v_p + if global_rank == 0: + torch.save(output_state_dict, output_model_file) + del output_state_dict diff --git a/applications/DeepSpeed-Chat/train.py b/applications/DeepSpeed-Chat/e2e_rlhf.py similarity index 99% rename from applications/DeepSpeed-Chat/train.py rename to applications/DeepSpeed-Chat/e2e_rlhf.py index b57d80759..38db655ce 100644 --- a/applications/DeepSpeed-Chat/train.py +++ b/applications/DeepSpeed-Chat/e2e_rlhf.py @@ -127,7 +127,7 @@ def get_script(args, step_num): script = os.path.join( os.getcwd(), step_dirs[step_num], - "training_scripts", + "training_scripts/opt/", args.deployment_type, f"run_{model_size}.sh", ) diff --git a/applications/DeepSpeed-Chat/inference/chatbot.py b/applications/DeepSpeed-Chat/inference/chatbot.py index 38b900d7d..5a4e36895 100644 --- a/applications/DeepSpeed-Chat/inference/chatbot.py +++ b/applications/DeepSpeed-Chat/inference/chatbot.py @@ -10,7 +10,7 @@ import os import json from transformers import pipeline, set_seed -from transformers import AutoConfig, OPTForCausalLM, AutoTokenizer +from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM def parse_args(): @@ -43,9 +43,10 @@ def get_generator(path): tokenizer.pad_token = tokenizer.eos_token model_config = AutoConfig.from_pretrained(path) - model = OPTForCausalLM.from_pretrained(path, - from_tf=bool(".ckpt" in path), - config=model_config).half() + model_class = AutoModelForCausalLM.from_config(model_config) + model = model_class.from_pretrained(path, + from_tf=bool(".ckpt" in path), + config=model_config).half() model.config.end_token_id = tokenizer.eos_token_id model.config.pad_token_id = model.config.eos_token_id diff --git a/applications/DeepSpeed-Chat/requirements.txt b/applications/DeepSpeed-Chat/requirements.txt index 6f44c27cd..41577d625 100644 --- a/applications/DeepSpeed-Chat/requirements.txt +++ b/applications/DeepSpeed-Chat/requirements.txt @@ -4,4 +4,5 @@ protobuf==3.20.3 accelerate>=0.15.0 torch>=1.12.0 deepspeed>=0.9.0 -transformers>=4.31.0 +transformers>=4.31.0,!=4.33.2 +tensorboard diff --git a/applications/DeepSpeed-Chat/setup.py b/applications/DeepSpeed-Chat/setup.py new file mode 100644 index 000000000..01a1ed83f --- /dev/null +++ b/applications/DeepSpeed-Chat/setup.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# setup.py: install script for deepspeed_chat +""" +to install deepspeed_chat and its dependencies for development work, +run this cmd from the root directory: + pip install -e . 
+""" +import setuptools + +setuptools.setup( + name="deepspeed-chat", + version="0.1", + url= + "https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat", + include_package_data=True, + packages=setuptools.find_packages(include=['dschat']), + install_requires=[ + "datasets>=2.8.0", "sentencepiece>=0.1.97", "protobuf==3.20.3", + "accelerate>=0.15.0", "torch>=1.12.0", "deepspeed>=0.9.2", + "transformers>=4.31.0,!=4.33.2", "tensorboard" + ], + extras_require={ + "azureml": [ + "azure-ml-component", + "azureml-core", + ], + }) diff --git a/applications/DeepSpeed-Chat/tests/test_training.py b/applications/DeepSpeed-Chat/tests/test_training.py new file mode 100644 index 000000000..7ffe02972 --- /dev/null +++ b/applications/DeepSpeed-Chat/tests/test_training.py @@ -0,0 +1,89 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import pytest +import os +import subprocess + + +def file_exists(directory_path, file_name): + return os.path.isfile(os.path.join(directory_path, file_name)) + + +@pytest.fixture(params=["2", "3"], ids=["zero2", "zero3"]) +def zero_stage(request): + return str(request.param) + + +@pytest.fixture(params=["true", "false"], ids=["he", ""]) +def hybrid_engine(request): + return str(request.param) + + +@pytest.fixture(params=["true", "false"], ids=["offload", ""]) +def offload(request): + return str(request.param) + + +@pytest.fixture(params=["true", "false"], ids=["lora", ""]) +def lora(request): + return str(request.param) + + +def test_ds_chat(zero_stage, hybrid_engine, offload, lora): + # Assert that critic model directory exists + critic_ckpt_dir = os.getenv("CRITIC_CKPT_DIR") + assert critic_ckpt_dir, "Please set CRITIC_CKPT_DIR in your environment" + + # Setup params + actor_model = "facebook/opt-125m" + critic_model = critic_ckpt_dir + mixed_precision_lora = "false" + enable_test_mode = "true" + test_stop_step = "5" + output_path = "z" + zero_stage + "_he_" + hybrid_engine + "_offload_" + offload + "_lora_" + lora + params = [ + actor_model, + critic_model, + zero_stage, + zero_stage, + hybrid_engine, + offload, + lora, + mixed_precision_lora, + output_path, + enable_test_mode, + test_stop_step, + ] + + # Skip certain combinations + if zero_stage == "2" and hybrid_engine == "true" and offload == "true" and lora == "false": + pytest.skip( + "The combination of [actor_zero_stage==2, critic_zero_stage==2, enable_hybrid_engine=True, offload=True, lora=False] is currently unsupported due to training instability!" + ) + + # cd into execution dir + wd = os.getcwd() + os.chdir("../training/step3_rlhf_finetuning") + sweep_script = "training_scripts/opt/single_node/sweep/run_single.sh" + + # Run bash script + cmd = ["bash", sweep_script] + params + result = subprocess.run(cmd) + + # Assertions + try: + result.check_returncode() + except subprocess.CalledProcessError as e: + with open(os.path.join(output_path, f"{output_path}.log"), "r") as f: + print(f.read()) + raise e + + assert file_exists(f"{output_path}/actor/", "pytorch_model.bin" + ), "Actor model was not saved during step 3 training." + assert file_exists(f"{output_path}/critic/", "pytorch_model.bin" + ), "Critic model was not saved during step 3 training." 
+ + os.chdir(wd) diff --git a/applications/DeepSpeed-Chat/training/README.md b/applications/DeepSpeed-Chat/training/README.md index b6ef77a9c..4c65b6e2d 100644 --- a/applications/DeepSpeed-Chat/training/README.md +++ b/applications/DeepSpeed-Chat/training/README.md @@ -9,9 +9,9 @@ Supervised fine-tuning (SFT) has indeed made significant progress in the field o Based on our testing, there are several terms that affect the generation behavior: * ```weight decay```: OPT models are pretrained with weight decay. Following that, finetuning normally inherits this setting. However, it may not produce the desired model. Particularly, for our OPT-1.3B example, we disabled weight decay. -* ```dropout```: Similar as above, dropout is used in OPT pretraining. However, SFT may not necessary need it. Particularly, for our OPT-1.3B example, we enabled dropout. -* ```dataset```: Using more data usually provide better model quality. But if the sources of datasets are too different, it may hurt the performance. For our OPT-1.3B example, we use the following four datasets: ```Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets```. -* ```training epochs``` Normally, to avoid overfitting, we choose smaller training epochs instead of longer epochs if smaller epochs can achieve similar model quality (in this case, we use PPL as an indicator). However, similar as InstructGPT pointed, we found even though we got overfitting due to longer training, it is still recommended to use longer training epochs to get better generation quality. Particularly, for our OPT-1.3B example, we use 16 epochs even though we found 1 or 2 epochs training can reach the same PPL score. +* ```dropout```: Similar as above, dropout is used in OPT pretraining. However, SFT may not necessarily need it. Particularly, for our OPT-1.3B example, we enabled dropout. +* ```dataset```: Using more data usually provides better model quality. But if the sources of datasets are too different, it may hurt the performance. For our OPT-1.3B example, we use the following four datasets: ```Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets```. +* ```training epochs``` Normally, to avoid overfitting, we choose smaller training epochs instead of longer epochs if smaller epochs can achieve similar model quality (in this case, we use PPL as an indicator). However, similar to InstructGPT pointed, we found even though we got overfitting due to longer training, it is still recommended to use longer training epochs to get better generation quality. Particularly, for our OPT-1.3B example, we use 16 epochs even though we found that 1 or 2 epochs training can reach the same PPL score. ### Step 2: Reward Model Finetuning Reward model (RM) fine-tuning is indeed similar to SFT, with the main differences being: (1) the training datasets are different - RM requires both good responses and bad responses to the same query; (2) the training loss is different - RM requires pair ranking loss as the optimizing objective. @@ -22,51 +22,51 @@ Here, we share more about what we observed during our exploration: * ```weight decay```: For our OPT-350m example, we enabled weight decay with 0.1. * ```dropout```: For our OPT-350m example, we disabled dropout. * ```dataset```: For our OPT-350m example, we use the following four datasets: ```Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets```. 
-* ```training epochs``` InstructGPT suggests to finetune the model with 1 epoch since overfitting hurts the step 3 performance. During our exploration, we did not see overfitting behavior when we increased the training epochs. However, to follow the instrution from authors. We set training epoch to be 1. +* ```training epochs``` InstructGPT suggests to finetune the model with 1 epoch since overfitting hurts the step 3 performance. During our exploration, we did not see overfitting behavior when we increased the training epochs. However, to follow the instructions from the authors. We set training epoch to be 1. Also, we provide more explorations here even though we have not set them as an option or included them in our current pipeline -* ```multiple answers for one prompt``` In InstructGPT, authors specifically mentioned that using paird rejected and accepted answers for one prompt is not good for reward model training. Therefore, InstructGPT construts the dataset with 4--9 answers per prompt. However, we did not find good datasets with this feature. -* ```initialize RM with SFT or Pretrained checkpoint``` We internally tested this but did not see big difference for either accuracy or reward score. Also, in InstructGPT, authors have the same finding. However, we encourage users to try it for their own usage. -* ```Reward score calculation``` We use the final token (or the first padding token) to get the reward score. However, it might not be the optimal choice. For instance, users can try the average score for the entire answer etc. +* ```multiple answers for one prompt``` In InstructGPT, authors specifically mentioned that using paird rejected and accepted answers for one prompt is not suitable for reward model training. Therefore, InstructGPT constructs the dataset with 4--9 answers per prompt. However, we did not find good datasets with this feature. +* ```initialize RM with SFT or Pretrained checkpoint``` We internally tested this but did not see a big difference for either accuracy or reward score. Also, in InstructGPT, the authors have the same finding. However, we encourage users to try it for their own usage. +* ```Reward score calculation``` We use the final token (or the first padding token) to get the reward score. However, it might not be the optimal choice. For instance, users can try the average score for the entire answer, etc. * ```Reward loss objective``` We simply use the ranking loss to be the objective. However, others, like MSE, can also be an option. ### Step 3: RLHF finetuning -The RLHF finetuning is the most complicated step among the three step training. Similar to SFT, reward score cannot really reflect the model generation quality. Also, we sometines observed that reward score drops to initial phase at certain point then quickly recovers. To make things worse, we also see the training can easily get divergence. We here share our settings and observations. +The RLHF finetuning is the most complicated step among the three-step training. Similar to SFT, the reward score cannot really reflect the model generation quality. Also, we sometimes observed that the reward score drops to the initial phase at a certain point and then quickly recovers. To make things worse, we also see the training can easily get divergence. We here share our settings and observations. * ```weight decay```: For our OPT-1.3B/350m (actor/critic) example, we disabled weight decay for both models. * ```dropout```: We disabled droppout for OPT-1.3B and enabled it for OPT-350m. 
* ```dataset```: We use the following single dataset: ```Dahoas/rm-static```. * ```training epochs``` The reward score quickly becomes platou. Therefore, we set the training epoch to be 1 for our OPT-1.3B/350m (actor/critic) example. However, longer training may bring better model quality as SFT. -* ```ema checkpoint``` We observe ema checkpoint can generally bring bettr model generation quality as stated in InstructGPT. +* ```ema checkpoint``` We observe ema checkpoint can generally bring better model generation quality as stated in InstructGPT. * ```PPO related hyperparameters``` PPO training has a lot of hyperparameters, see [here](https://github.com/microsoft/DeepSpeedExamples/blob/master/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/ppo_trainer.py#L61-L66). For now, we hard-coded them for users but you may want to adjust them for you own usage. -* ```mix unsupervised training``` InstructGPT suggests to mix PPO and unsupervised training to prevent the lost of model's benchmark quality. However, when we directly apply the hyperparameter from Instruct, the model cannot converge. Therefore, we stop exploring this. However, users are encourage to test it and tune the hyperparameter for their own usage. -* ```diverging issue``` We have found that it is very unstable to use different generation training batch sizes (`--per_device_train_batch_size`) and PPO training batch sizes (`--per_device_mini_batch_size`), more than one PPO training epoch (`--ppo_epochs`), or more than one generation batch size (`--generation_batch_numbers`). These all point to the same problem: we are not able to update the actor model multiple times after generating experimental data. Therefore, in all of our successful runs, we have set `per_device_train_batch_size=per_device_mini_batch_size` and `ppo_epochs=generation_batch_numbers=1`. This is unexpected for a standard RL training pipeline, and we have tried different methods to overcome this, but all have failed. One of the most likely reasons for this instability is that we found the `log_probs` and `old_log_probs` used in the `actor_loss_fn` function can quickly diverge even within two consecutive iterations, which causes the corresponding `ratio` to be huge. Setting a strict upper bound can alleviate this problem, but it cannot fully resolve the convergence issue. +* ```mix unsupervised training``` InstructGPT suggests mixing PPO and unsupervised training to prevent the loss of the model's benchmark quality. However, when we directly apply the hyperparameter from Instruct, the model cannot converge. Therefore, we stop exploring this. However, users are encouraged to test it and tune the hyperparameter for their own usage. +* ```diverging issue``` We have found that it is very unstable to use different generation training batch sizes (`--per_device_generation_batch_size`) and PPO training batch sizes (`--per_device_training_batch_size`), more than one PPO training epoch (`--ppo_epochs`), or more than one generation batch (`--generation_batches 1`). These all point to the same problem: we are not able to update the actor model multiple times after generating experimental data. Therefore, in all of our successful runs, we have set `per_device_generation_batch_size=per_device_training_batch_size` and `ppo_epochs=generation_batches=1`. This is unexpected for a standard RL training pipeline, and we have tried different methods to overcome this, but all have failed. 
One of the most likely reasons for this instability is that we found the `log_probs` and `old_log_probs` used in the `actor_loss_fn` function can quickly diverge even within two consecutive iterations, which causes the corresponding `ratio` to be huge. Setting a strict upper bound can alleviate this problem, but it cannot fully resolve the convergence issue. ### About our testing -We did most of our accuracy/quality testing on OPT-1.3B (SFT and Actor model) and OPT-350m (RW and Critic model). Particularly, we used the 16 V100-32G (DGX-2 node) gpus to run our experiments. +We did most of our accuracy/quality testing on OPT-1.3B (SFT and Actor model) and OPT-350m (RW and Critic model). Particularly, we used the 16 V100-32G (DGX-2 node) GPUs to run our experiments. -The hyperparameters included in our scripts are based on our own testing. Therefore, it may not work for you case when (but not limited to): (1) a different number of GPUs, (2) different model sizes, (3) different model families, etc. +The hyperparameters included in our scripts are based on our own testing. Therefore, it may not work for your case when (but not limited to): (1) a different number of GPUs, (2) different model sizes, (3) different model families, etc. -Also note that, you could find even better training configurations/recipes than what we provided. We did not extensively tested all hyperparameter combinations due to resouces constraints. +Also note that you could find even better training configurations/recipes than what we provided. We did not extensively test all hyperparameter combinations due to resource constraints. ### Training logs We are sharing our training logs for all three steps for an OPT-1.3b actor and OPT-350m critic trained with x16-V100-32GB GPUs: | Step | Run Script | Training Log | |--------------|-----------|------------| -| 1 | [single_node/run_1.3b.sh](https://github.com/microsoft/DeepSpeedExamples/blob/master/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/single_node/run_1.3b.sh) | [opt-1.3b-globalBatchSize128.log](https://github.com/microsoft/DeepSpeedExamples/blob/master/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_log_output/opt-1.3b-globalBatchSize128.log) | -| 2 | [single_node/run_350m.sh](https://github.com/microsoft/DeepSpeedExamples/blob/master/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/single_node/run_350m.sh) | [opt-350m_globalBatchSize-64.log](https://github.com/microsoft/DeepSpeedExamples/blob/master/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_log_output/opt-350m_globalBatchSize-64.log) | -| 3 | [single_node/run_1.3b.sh](https://github.com/microsoft/DeepSpeedExamples/blob/master/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/single_node/run_1.3b.sh) | [actor_opt-1.3b_critic_opt-350m_globalBatchSize64.log](https://github.com/microsoft/DeepSpeedExamples/blob/master/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_log_output/actor_opt-1.3b_critic_opt-350m_globalBatchSize64.log) | +| 1 | [opt/single_node/run_1.3b.sh](https://github.com/microsoft/DeepSpeedExamples/blob/master/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/opt/single_node/run_1.3b.sh) | [opt-1.3b-globalBatchSize128.log](https://github.com/microsoft/DeepSpeedExamples/blob/master/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_log_output/opt-1.3b-globalBatchSize128.log) | +| 2 | 
[opt/single_node/run_350m.sh](https://github.com/microsoft/DeepSpeedExamples/blob/master/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/opt/single_node/run_350m.sh) | [opt-350m_globalBatchSize-64.log](https://github.com/microsoft/DeepSpeedExamples/blob/master/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_log_output/opt-350m_globalBatchSize-64.log) | +| 3 | [opt/single_node/run_1.3b.sh](https://github.com/microsoft/DeepSpeedExamples/blob/master/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/single_node/opt/run_1.3b.sh) | [actor_opt-1.3b_critic_opt-350m_globalBatchSize64.log](https://github.com/microsoft/DeepSpeedExamples/blob/master/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_log_output/actor_opt-1.3b_critic_opt-350m_globalBatchSize64.log) | ### Characterization Scripts Scripts for sweeping training across various parameters (Zero Stage, Offload, Lora, etc) are available for Step 1, 2, and 3. These scripts can be further extended to sweep across additional parameters such as learning rate. | Step | Sweep Script | README | |--------------|-----------|-----------| -| 1 | [run_step1_sweep.sh](./step1_supervised_finetuning/training_scripts/single_node/sweep/run_step1_sweep.sh) | [README](./step1_supervised_finetuning/training_scripts/single_node/sweep/README.md) | -| 2 | [run_step2_sweep.sh](./step2_reward_model_finetuning/training_scripts/single_node/sweep/run_step2_sweep.sh) | [README](./step2_reward_model_finetuning/training_scripts/single_node/sweep/README.md) | -| 3 | [run_step3_sweep.sh](./step3_rlhf_finetuning/training_scripts/single_node/sweep/run_step3_sweep.sh) | [README](./step3_rlhf_finetuning/training_scripts/single_node/sweep/README.md) | +| 1 | [run_step1_sweep.sh](./step1_supervised_finetuning/training_scripts/opt/single_node/sweep/run_step1_sweep.sh) | [README](./step1_supervised_finetuning/training_scripts/opt/single_node/sweep/README.md) | +| 2 | [run_step2_sweep.sh](./step2_reward_model_finetuning/training_scripts/opt/single_node/sweep/run_step2_sweep.sh) | [README](./step2_reward_model_finetuning/training_scripts/opt/single_node/sweep/README.md) | +| 3 | [run_step3_sweep.sh](./step3_rlhf_finetuning/training_scripts/opt/single_node/sweep/run_step3_sweep.sh) | [README](./step3_rlhf_finetuning/training_scripts/opt/single_node/sweep/README.md) | ### Others RLHF (Reinforcement Learning for Human Feedback) training is still an open problem, and DeepSpeed-Chat is designed to be a starting point for researchers and practitioners to work on it with an efficient and fast training experience. The Hybrid-Engine and other efficient components, like LoRA, can be inherited from DeepSpeed-Chat, allowing you to develop your own RLHF training pipeline for exploration, research, and other purposes. 
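Because both the `reward_model.py` hunk earlier in this patch and the Step 2 notes above revolve around the pairwise ranking loss, here is a minimal, self-contained sketch of that objective. It mirrors the `-logsigmoid(chosen - rejected)` form and the new `--compute_fp32_loss` cast, but the flat per-example scores are a simplification: the real `RewardModel.forward` first truncates each pair to the token span where the two answers diverge (up to the padding) and averages over that span.

```python
# Minimal sketch of the pairwise ranking objective used for reward-model
# finetuning: push the chosen answer's score above the rejected answer's.
# Per-example scalar scores are a simplification of the per-token rewards
# that RewardModel.forward actually compares.
import torch
import torch.nn.functional as F


def pairwise_ranking_loss(chosen_scores: torch.Tensor,
                          rejected_scores: torch.Tensor,
                          compute_fp32_loss: bool = False) -> torch.Tensor:
    if compute_fp32_loss:
        # Mirrors --compute_fp32_loss: do the subtraction and logsigmoid in
        # fp32 even when the model itself runs in fp16/bf16.
        chosen_scores = chosen_scores.float()
        rejected_scores = rejected_scores.float()
    return -F.logsigmoid(chosen_scores - rejected_scores).mean()


if __name__ == "__main__":
    chosen = torch.tensor([1.2, 0.3, 2.0])     # scores for preferred answers
    rejected = torch.tensor([0.1, 0.5, -1.0])  # scores for rejected answers
    print(pairwise_ranking_loss(chosen, rejected).item())
```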
diff --git a/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/README.md b/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/README.md index e5a29bc64..54e313dbe 100644 --- a/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/README.md +++ b/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/README.md @@ -5,7 +5,7 @@ Supervised finetuning (SFT) is very similar to standard language model finetunin We provide multiple scripts for training on single GPUs (e.g., a single A6000-48G, V100-32G, A100-40G, etc.), single nodes (e.g., 8/16x V100-32G, 8 A100-40G/80G), and multiple nodes setting (e.g., 64x A100-80G), which can be found in the 'training_scripts' directory. For example, if you have a single A6000-48G, you can simply run the corresponding script. ```bash - training_scripts/single_gpu/run_1.3b.sh + training_scripts/opt/single_gpu/run_1.3b.sh ``` to train a OPT-1.3b model. It is easy to extend our single-node script to multi-node system. diff --git a/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/main.py b/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/main.py index f7b91fa17..c37d1f4cd 100755 --- a/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/main.py +++ b/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/main.py @@ -4,9 +4,7 @@ # DeepSpeed Team import argparse -import os import math -import sys import torch from torch.utils.data import DataLoader, RandomSampler, SequentialSampler @@ -21,14 +19,14 @@ import deepspeed from deepspeed.ops.adam import DeepSpeedCPUAdam, FusedAdam +from deepspeed import get_accelerator -sys.path.append( - os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir))) -from utils.data.data_utils import create_prompt_dataset -from utils.utils import print_rank_0, to_device, save_hf_format, set_random_seed, get_all_reduce_mean, get_optimizer_grouped_parameters, save_zero_three_model, load_hf_tokenizer -from utils.ds_utils import get_train_ds_config -from utils.module.lora import convert_linear_layer_to_lora, convert_lora_to_linear_layer, only_optimize_lora_parameters, make_model_gradient_checkpointing_compatible -from utils.model.model_utils import create_hf_model +from dschat.utils.data.data_utils import create_prompt_dataset +from dschat.utils.utils import print_rank_0, to_device, save_hf_format, set_random_seed, get_all_reduce_mean, get_optimizer_grouped_parameters, save_zero_three_model, load_hf_tokenizer +from dschat.utils.ds_utils import get_train_ds_config +from dschat.utils.module.lora import convert_linear_layer_to_lora, convert_lora_to_linear_layer, only_optimize_lora_parameters, make_model_gradient_checkpointing_compatible +from dschat.utils.model.model_utils import create_hf_model, causal_lm_model_to_fp32_loss +from dschat.utils.perf import print_throughput def parse_args(): @@ -137,13 +135,21 @@ def parse_args(): parser.add_argument('--gradient_checkpointing', action='store_true', help='Enable HF gradient checkpointing for model.') - parser.add_argument('--disable_dropout', - action='store_true', - help='Disable the dropout of the model.') + parser.add_argument( + "--dropout", + type=float, + default=None, + help="If dropout configured, use it. 
" + "Otherwise, keep the default dropout configuration of the model.") # deepspeed features parser.add_argument('--offload', action='store_true', help='Enable ZeRO Offload techniques.') + parser.add_argument('--dtype', + type=str, + default='fp16', + choices=['fp16', 'bf16'], + help='Training data type') parser.add_argument( '--zero_stage', type=int, @@ -161,6 +167,19 @@ def parse_args(): parser.add_argument('--only_optimize_lora', action='store_true', help='Only optimize the LoRA parameters.') + parser.add_argument( + "--lora_learning_rate", + type=float, + default=5e-4, + help= + "Initial LoRA learning rate (after the potential warmup period) to use." + ) + ## low precision + parser.add_argument( + '--compute_fp32_loss', + action='store_true', + help='Relevant for low precision dtypes (fp16, bf16, etc.). ' + 'If specified, loss is calculated in fp32.') ## Tensorboard logging parser.add_argument('--enable_tensorboard', action='store_true', @@ -168,6 +187,11 @@ def parse_args(): parser.add_argument('--tensorboard_path', type=str, default="step1_tensorboard") + ## Tokenizer + parser.add_argument( + "--add_eot_token", + action='store_true', + help="Add <|endoftext|> as additional special token to tokenizer") ## Print loss parser.add_argument('--print_loss', action='store_true', @@ -182,10 +206,10 @@ def main(): args = parse_args() if args.local_rank == -1: - device = torch.device("cuda") + device = torch.device(get_accelerator().device_name()) else: - torch.cuda.set_device(args.local_rank) - device = torch.device("cuda", args.local_rank) + get_accelerator().set_device(args.local_rank) + device = torch.device(get_accelerator().device_name(), args.local_rank) # Initializes the distributed backend which will take care of sychronizing nodes/GPUs # torch.distributed.init_process_group(backend='nccl') deepspeed.init_distributed() @@ -193,6 +217,7 @@ def main(): args.global_rank = torch.distributed.get_rank() ds_config = get_train_ds_config(offload=args.offload, + dtype=args.dtype, stage=args.zero_stage, enable_tensorboard=args.enable_tensorboard, tb_path=args.tensorboard_path, @@ -209,12 +234,23 @@ def main(): torch.distributed.barrier() # load_hf_tokenizer will get the correct tokenizer and set padding tokens based on the model family - tokenizer = load_hf_tokenizer(args.model_name_or_path, fast_tokenizer=True) + args.end_of_conversation_token = "<|endoftext|>" + additional_special_tokens = args.end_of_conversation_token if args.add_eot_token else None + tokenizer = load_hf_tokenizer(args.model_name_or_path, + fast_tokenizer=True, + add_special_tokens=additional_special_tokens) + model = create_hf_model(AutoModelForCausalLM, args.model_name_or_path, tokenizer, ds_config, - disable_dropout=args.disable_dropout) + dropout=args.dropout) + + if args.compute_fp32_loss: + print_rank_0( + f"Using model {model.__class__.__name__} with loss in fp32", + args.global_rank) + causal_lm_model_to_fp32_loss(model) if args.lora_dim > 0: model = convert_linear_layer_to_lora(model, args.lora_module_name, @@ -263,18 +299,18 @@ def evaluation(model, eval_dataloader): losses += loss.float() losses = losses / (step + 1) try: - perplexity = torch.exp(losses) - except OverflowError: - perplexity = float("inf") - try: - perplexity = get_all_reduce_mean(perplexity).item() + losses = get_all_reduce_mean(losses) except: pass - return perplexity + try: + perplexity = torch.exp(losses).item() + except OverflowError: + perplexity = float("inf") + return perplexity, losses.item() # Split weights in two groups, one with weight 
decay and the other not. optimizer_grouped_parameters = get_optimizer_grouped_parameters( - model, args.weight_decay) + model, args.weight_decay, args.lora_learning_rate) AdamOptimizer = DeepSpeedCPUAdam if args.offload else FusedAdam optimizer = AdamOptimizer(optimizer_grouped_parameters, @@ -306,15 +342,17 @@ def evaluation(model, eval_dataloader): print_rank_0( f"***** Evaluating perplexity, Epoch {0}/{args.num_train_epochs} *****", args.global_rank) - perplexity = evaluation(model, eval_dataloader) - print_rank_0(f"ppl: {perplexity}", args.global_rank) + perplexity, eval_loss = evaluation(model, eval_dataloader) + print_rank_0(f"ppl: {perplexity}, loss: {eval_loss}", args.global_rank) for epoch in range(args.num_train_epochs): print_rank_0( f"Beginning of Epoch {epoch+1}/{args.num_train_epochs}, Total Micro Batches {len(train_dataloader)}", args.global_rank) model.train() + import time for step, batch in enumerate(train_dataloader): + start = time.time() batch = to_device(batch, device) outputs = model(**batch, use_cache=False) loss = outputs.loss @@ -324,13 +362,17 @@ def evaluation(model, eval_dataloader): ) model.backward(loss) model.step() + end = time.time() + if torch.distributed.get_rank() == 0: + print_throughput(model.model, args, end - start, + args.global_rank) # Evaluate perplexity on the validation set. print_rank_0( f"***** Evaluating perplexity, Epoch {epoch+1}/{args.num_train_epochs} *****", args.global_rank) - perplexity = evaluation(model, eval_dataloader) - print_rank_0(f"ppl: {perplexity}", args.global_rank) + perplexity, eval_loss = evaluation(model, eval_dataloader) + print_rank_0(f"ppl: {perplexity}, loss: {eval_loss}", args.global_rank) model.tput_timer.update_epoch_count() if args.output_dir is not None: diff --git a/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/prompt_eval.py b/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/prompt_eval.py index 6e8891e4a..a25b0edea 100644 --- a/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/prompt_eval.py +++ b/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/prompt_eval.py @@ -5,16 +5,13 @@ import argparse import logging import torch -import sys -import os from transformers import ( AutoModelForCausalLM, ) -sys.path.append( - os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir))) -from utils.model.model_utils import create_hf_model -from utils.utils import load_hf_tokenizer +from dschat.utils.model.model_utils import create_hf_model +from dschat.utils.utils import load_hf_tokenizer +from deepspeed import get_accelerator logger = logging.getLogger(__name__) @@ -73,6 +70,10 @@ def parse_args(): type=str, default="English", choices=["English", "Chinese", "Japanese"]) + parser.add_argument( + "--add_eot_token", + action='store_true', + help="Add <|endoftext|> as additional special token to tokenizer") args = parser.parse_args() @@ -194,10 +195,13 @@ def prompt_eval(args, model_baseline, model_fintuned, tokenizer, device, def main(): args = parse_args() - device = torch.device("cuda:0") + device = torch.device(get_accelerator().device_name(0)) + args.end_of_conversation_token = "<|endoftext|>" + additional_special_tokens = args.end_of_conversation_token if args.add_eot_token else None tokenizer = load_hf_tokenizer(args.model_name_or_path_baseline, - fast_tokenizer=True) + fast_tokenizer=True, + add_special_tokens=additional_special_tokens) model_baseline = create_hf_model(AutoModelForCausalLM, args.model_name_or_path_baseline, diff --git 
a/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/single_node/run_llama_7b.sh b/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/llama2/run_llama2_7b.sh similarity index 83% rename from applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/single_node/run_llama_7b.sh rename to applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/llama2/run_llama2_7b.sh index 69d6ba346..2fe70be13 100755 --- a/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/single_node/run_llama_7b.sh +++ b/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/llama2/run_llama2_7b.sh @@ -6,17 +6,17 @@ OUTPUT=$1 ZERO_STAGE=$2 if [ "$OUTPUT" == "" ]; then - OUTPUT=./output_step1_llama_7b_epoch4_lr9.65e-6_test + OUTPUT=./output_step1_llama2_7b fi if [ "$ZERO_STAGE" == "" ]; then ZERO_STAGE=3 fi mkdir -p $OUTPUT -deepspeed --include="worker-1" main.py \ +deepspeed main.py \ --data_path Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets \ --data_split 2,4,4 \ - --model_name_or_path decapoda-research/llama-7b-hf \ + --model_name_or_path meta-llama/Llama-2-7b-hf \ --per_device_train_batch_size 4 \ --per_device_eval_batch_size 4 \ --max_seq_len 512 \ diff --git a/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/llama2/run_llama2_7b_lora.sh b/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/llama2/run_llama2_7b_lora.sh new file mode 100755 index 000000000..7689266ee --- /dev/null +++ b/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/llama2/run_llama2_7b_lora.sh @@ -0,0 +1,36 @@ +#!/bin/bash +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +OUTPUT=$1 +ZERO_STAGE=$2 +if [ "$OUTPUT" == "" ]; then + OUTPUT=./output_step1_llama2_7b_lora +fi +if [ "$ZERO_STAGE" == "" ]; then + ZERO_STAGE=3 +fi +mkdir -p $OUTPUT + +deepspeed main.py \ + --data_path Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets \ + --data_split 2,4,4 \ + --model_name_or_path meta-llama/Llama-2-7b-hf \ + --per_device_train_batch_size 4 \ + --per_device_eval_batch_size 4 \ + --max_seq_len 512 \ + --learning_rate 9.65e-6 \ + --weight_decay 0. \ + --num_train_epochs 4 \ + --gradient_accumulation_steps 1 \ + --lr_scheduler_type cosine \ + --num_warmup_steps 0 \ + --seed 1234 \ + --gradient_checkpointing \ + --zero_stage $ZERO_STAGE \ + --deepspeed \ + --lora_dim 128 \ + --lora_module_name "layers." 
\ + --output_dir $OUTPUT \ + &> $OUTPUT/training.log diff --git a/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/multi_node/run_66b.sh b/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/opt/multi_node/run_66b.sh similarity index 100% rename from applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/multi_node/run_66b.sh rename to applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/opt/multi_node/run_66b.sh diff --git a/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/single_gpu/run_1.3b.sh b/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/opt/single_gpu/run_1.3b.sh similarity index 100% rename from applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/single_gpu/run_1.3b.sh rename to applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/opt/single_gpu/run_1.3b.sh diff --git a/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/single_gpu/run_6.7b_lora.sh b/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/opt/single_gpu/run_6.7b_lora.sh similarity index 100% rename from applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/single_gpu/run_6.7b_lora.sh rename to applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/opt/single_gpu/run_6.7b_lora.sh diff --git a/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/single_node/run_1.3b.sh b/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/opt/single_node/run_1.3b.sh similarity index 100% rename from applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/single_node/run_1.3b.sh rename to applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/opt/single_node/run_1.3b.sh diff --git a/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/single_node/run_1.3b_lora.sh b/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/opt/single_node/run_1.3b_lora.sh similarity index 100% rename from applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/single_node/run_1.3b_lora.sh rename to applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/opt/single_node/run_1.3b_lora.sh diff --git a/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/single_node/run_13b.sh b/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/opt/single_node/run_13b.sh similarity index 100% rename from applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/single_node/run_13b.sh rename to applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/opt/single_node/run_13b.sh diff --git a/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/single_node/run_30b_lora.sh b/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/opt/single_node/run_30b_lora.sh similarity index 100% rename from applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/single_node/run_30b_lora.sh rename to applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/opt/single_node/run_30b_lora.sh diff --git 
a/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/single_node/run_6.7b.sh b/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/opt/single_node/run_6.7b.sh similarity index 100% rename from applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/single_node/run_6.7b.sh rename to applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/opt/single_node/run_6.7b.sh diff --git a/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/single_node/sweep/README.md b/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/opt/single_node/sweep/README.md similarity index 88% rename from applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/single_node/sweep/README.md rename to applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/opt/single_node/sweep/README.md index 7b64169d1..254442faf 100644 --- a/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/single_node/sweep/README.md +++ b/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/opt/single_node/sweep/README.md @@ -17,5 +17,5 @@ The `run_step1_sweep.sh` script passes configuration arguments to `run_single.sh # Usage The sweep script can be run as follows:
-DeepSpeedExamples/applications/DeepSpeed-Chat/training/step1_supervised_finetuning$ bash training_scripts/single_node/sweep/run_step1_sweep.sh
+DeepSpeedExamples/applications/DeepSpeed-Chat/training/step1_supervised_finetuning$ bash training_scripts/opt/single_node/sweep/run_step1_sweep.sh
 
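For orientation, the step-1 sweep now lives entirely under `training_scripts/opt/single_node/sweep/`: `run_step1_sweep.sh` loops over ZeRO stage, offload, and LoRA settings and delegates each combination to `run_single.sh`, as the hunk below shows. The following is a minimal sketch of that driver loop with the updated path; the output-directory label passed as the last positional argument is an assumption for illustration, since the call is only partially visible in this patch.

```bash
#!/bin/bash
# Sketch of the relocated step-1 sweep driver (not a verbatim copy of
# run_step1_sweep.sh). It iterates over ZeRO stage, offload, and LoRA and
# hands each combination to run_single.sh under the new opt/ prefix.
for z in {2..3}; do
    for offload in true false; do
        for lora in true false; do
            # The final argument (an output label) is assumed for illustration.
            cmd="bash training_scripts/opt/single_node/sweep/run_single.sh \
                ${z} ${offload} ${lora} z${z}_offload_${offload}_lora_${lora}"
            echo "----------------------------------------"
            echo "$cmd"
            $cmd
        done
    done
done
```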
diff --git a/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/single_node/sweep/run_single.sh b/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/opt/single_node/sweep/run_single.sh similarity index 100% rename from applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/single_node/sweep/run_single.sh rename to applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/opt/single_node/sweep/run_single.sh diff --git a/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/single_node/sweep/run_step1_sweep.sh b/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/opt/single_node/sweep/run_step1_sweep.sh similarity index 88% rename from applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/single_node/sweep/run_step1_sweep.sh rename to applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/opt/single_node/sweep/run_step1_sweep.sh index 0e78e15ce..7b6e57823 100644 --- a/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/single_node/sweep/run_step1_sweep.sh +++ b/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/opt/single_node/sweep/run_step1_sweep.sh @@ -9,7 +9,7 @@ do do for lora in true false do - cmd="bash training_scripts/single_node/sweep/run_single.sh \ + cmd="bash training_scripts/opt/single_node/sweep/run_single.sh \ ${z} \ ${offload} \ ${lora} \ diff --git a/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/README.md b/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/README.md index 758cc4893..ede072a79 100644 --- a/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/README.md +++ b/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/README.md @@ -22,7 +22,7 @@ We also provide an evaluation script, ``rw_eval.py``, for users to perform simpl We provide the script for OPT-350m, which you can test by launching the command ``` -bash training_scripts/single_gpu/run_350m.sh +bash training_scripts/opt/single_gpu/run_350m.sh ``` You can train larger models by simply replacing the model candidate with your preferred one and enabling other efficient training features as described in Step 1 of the SFT finetuning process. 
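Because the step-2 README above now points at `training_scripts/opt/single_gpu/run_350m.sh`, and the rest of this patch replaces `--disable_dropout` with an explicit `--dropout 0.0` and adds a `--dtype` choice of fp16/bf16, a scaled-up single-GPU reward-model launch might look like the sketch below. The model name, LoRA rank, and accumulation steps are illustrative assumptions, not values taken from the repository scripts.

```bash
#!/bin/bash
# Hypothetical larger single-GPU reward-model run using flags this patch
# introduces or renames (--dropout, --dtype); the model choice and LoRA
# settings are assumptions for illustration only.
OUTPUT=./output_step2_opt1.3b
ZERO_STAGE=3
mkdir -p $OUTPUT

deepspeed --num_gpus 1 main.py \
   --model_name_or_path facebook/opt-1.3b \
   --num_padding_at_beginning 1 \
   --weight_decay 0.1 \
   --dropout 0.0 \
   --dtype bf16 \
   --gradient_accumulation_steps 4 \
   --zero_stage $ZERO_STAGE \
   --offload \
   --lora_dim 128 \
   --lora_module_name "decoder.layers." \
   --deepspeed \
   --output_dir $OUTPUT &> $OUTPUT/training.log
```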
diff --git a/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/main.py b/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/main.py index d648f53b9..04f178504 100644 --- a/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/main.py +++ b/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/main.py @@ -4,9 +4,7 @@ # DeepSpeed Team import argparse -import os import math -import sys import torch from torch.utils.data import DataLoader, RandomSampler, SequentialSampler @@ -19,14 +17,13 @@ import deepspeed from deepspeed.ops.adam import DeepSpeedCPUAdam, FusedAdam +from deepspeed.accelerator import get_accelerator -sys.path.append( - os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir))) -from utils.model.model_utils import create_critic_model -from utils.data.data_utils import create_prompt_dataset, DataCollatorReward -from utils.utils import print_rank_0, to_device, save_hf_format, set_random_seed, get_all_reduce_mean, get_optimizer_grouped_parameters, save_zero_three_model, load_hf_tokenizer -from utils.ds_utils import get_train_ds_config -from utils.module.lora import convert_linear_layer_to_lora, convert_lora_to_linear_layer, only_optimize_lora_parameters, make_model_gradient_checkpointing_compatible +from dschat.utils.model.model_utils import create_critic_model +from dschat.utils.data.data_utils import create_prompt_dataset, DataCollatorReward +from dschat.utils.utils import print_rank_0, to_device, save_hf_format, set_random_seed, get_all_reduce_mean, get_optimizer_grouped_parameters, save_zero_three_model, load_hf_tokenizer +from dschat.utils.ds_utils import get_train_ds_config +from dschat.utils.module.lora import convert_linear_layer_to_lora, convert_lora_to_linear_layer, only_optimize_lora_parameters, make_model_gradient_checkpointing_compatible def parse_args(): @@ -137,13 +134,21 @@ def parse_args(): '--gradient_checkpointing', action='store_true', help='Enable HF gradient checkpointing for Actor model.') - parser.add_argument('--disable_dropout', - action='store_true', - help='Disable the dropout of the model.') + parser.add_argument( + "--dropout", + type=float, + default=None, + help="If dropout configured, use it. " + "Otherwise, keep the default dropout configuration of the model.") # deepspeed features parser.add_argument('--offload', action='store_true', help='Enable ZeRO Offload techniques.') + parser.add_argument('--dtype', + type=str, + default='fp16', + choices=['fp16', 'bf16'], + help='Training data type') parser.add_argument( '--zero_stage', type=int, @@ -161,6 +166,30 @@ def parse_args(): parser.add_argument('--only_optimize_lora', action='store_true', help='Only optimize the LoRA parameters.') + parser.add_argument( + "--lora_learning_rate", + type=float, + default=5e-4, + help= + "Initial LoRA learning rate (after the potential warmup period) to use." + ) + + # Evaluation + parser.add_argument("--eval_interval", + type=int, + default=0, + help="If > 0, perform evaluation at this interval") + parser.add_argument("--eval_iters", + type=int, + default=100, + help="Maximum evaluation iterations") + ## low precision + parser.add_argument( + '--compute_fp32_loss', + action='store_true', + help='Relevant for low precision dtypes (fp16, bf16, etc.). 
' + 'If specified, loss is calculated in fp32.') + ## Tensorboard logging parser.add_argument('--enable_tensorboard', action='store_true', @@ -168,6 +197,11 @@ def parse_args(): parser.add_argument('--tensorboard_path', type=str, default="step2_tensorboard") + ## Tokenizer + parser.add_argument( + "--add_eot_token", + action='store_true', + help="Add <|endoftext|> as additional special token to tokenizer") parser = deepspeed.add_config_arguments(parser) args = parser.parse_args() @@ -178,10 +212,10 @@ def main(): args = parse_args() if args.local_rank == -1: - device = torch.device("cuda") + device = torch.device(get_accelerator().device_name()) else: - torch.cuda.set_device(args.local_rank) - device = torch.device("cuda", args.local_rank) + get_accelerator().set_device(args.local_rank) + device = torch.device(get_accelerator().device_name(), args.local_rank) # Initializes the distributed backend which will take care of sychronizing nodes/GPUs # torch.distributed.init_process_group(backend='nccl') deepspeed.init_distributed() @@ -189,6 +223,7 @@ def main(): args.global_rank = torch.distributed.get_rank() ds_config = get_train_ds_config(offload=args.offload, + dtype=args.dtype, stage=args.zero_stage, enable_tensorboard=args.enable_tensorboard, tb_path=args.tensorboard_path, @@ -204,19 +239,38 @@ def main(): torch.distributed.barrier() # load_hf_tokenizer will get the correct tokenizer and set padding tokens based on the model family - tokenizer = load_hf_tokenizer(args.model_name_or_path, fast_tokenizer=True) + args.end_of_conversation_token = "<|endoftext|>" + additional_special_tokens = args.end_of_conversation_token if args.add_eot_token else None + tokenizer = load_hf_tokenizer(args.model_name_or_path, + fast_tokenizer=True, + add_special_tokens=additional_special_tokens) rm_model = create_critic_model(args.model_name_or_path, tokenizer, ds_config, args.num_padding_at_beginning, - disable_dropout=args.disable_dropout) + dropout=args.dropout, + zero_stage=args.zero_stage, + compute_fp32_loss=args.compute_fp32_loss) + + # Model bigscience/bloom-560m has large variance at ln_f.weight parameter + # This makes bf16 finetuning hard. + # In general, since we are replacing the model head, it makes sense to reset + # the LN that precedes it. 
+ force_optimize_params = [] + if "bigscience/bloom-" in args.model_name_or_path: + torch.nn.init.ones_(rm_model.rwtransformer.ln_f.weight) + torch.nn.init.zeros_(rm_model.rwtransformer.ln_f.bias) + force_optimize_params.extend( + ['rwtransformer.ln_f.weight', 'rwtransformer.ln_f.bias']) if args.lora_dim > 0: rm_model = convert_linear_layer_to_lora(rm_model, args.lora_module_name, args.lora_dim) if args.only_optimize_lora: - rm_model = only_optimize_lora_parameters(rm_model) + force_optimize_params.append('v_head.weight') + rm_model = only_optimize_lora_parameters(rm_model, + force_optimize_params) rm_model = make_model_gradient_checkpointing_compatible(rm_model) train_phase = 2 @@ -237,41 +291,44 @@ def main(): collate_fn=data_collator, sampler=train_sampler, batch_size=args.per_device_train_batch_size) - eval_sampler = SequentialSampler(eval_dataset) eval_dataloader = DataLoader(eval_dataset, collate_fn=data_collator, sampler=eval_sampler, batch_size=args.per_device_eval_batch_size) - def evaluation_reward(model, eval_dataloader): + def evaluation_reward(model, dataloader, eval_iters): model.eval() correct_predictions = 0 total_predictions = 0 - scores = 0 - for step, batch in enumerate(eval_dataloader): - batch = to_device(batch, device) + chosen_scores = 0. + rejected_scores = 0. + for _step, _batch in enumerate(dataloader): + _batch = to_device(_batch, device) with torch.no_grad(): - outputs = model(**batch) + _outputs = model(**_batch) - chosen = outputs["chosen_mean_scores"] - rejected = outputs["rejected_mean_scores"] + chosen = _outputs["chosen_mean_scores"] + rejected = _outputs["rejected_mean_scores"] correct_predictions += (chosen > rejected).sum() total_predictions += chosen.shape[0] - scores += outputs["chosen_mean_scores"].mean().float() - if step == 99: # For faster evaluation and debugging + chosen_scores += _outputs["chosen_mean_scores"].mean().float() + rejected_scores += _outputs["rejected_mean_scores"].mean().float() + if (_step + 1) == eval_iters: break - acc = correct_predictions / total_predictions - scores = scores / (step + 1) + _acc = correct_predictions / total_predictions + chosen_scores = chosen_scores / (_step + 1) + rejected_scores = rejected_scores / (_step + 1) try: - acc = get_all_reduce_mean(acc).item() - scores = get_all_reduce_mean(scores).item() + _acc = get_all_reduce_mean(_acc).item() + chosen_scores = get_all_reduce_mean(chosen_scores).item() + rejected_scores = get_all_reduce_mean(rejected_scores).item() except: pass - return scores, acc + return chosen_scores, rejected_scores, _acc # Split weights in two groups, one with weight decay and the other not. 
optimizer_grouped_parameters = get_optimizer_grouped_parameters( - rm_model, args.weight_decay) + rm_model, args.weight_decay, args.lora_learning_rate) AdamOptimizer = DeepSpeedCPUAdam if args.offload else FusedAdam optimizer = AdamOptimizer(optimizer_grouped_parameters, @@ -305,11 +362,14 @@ def evaluation_reward(model, eval_dataloader): print_rank_0( f"***** Evaluating reward, Epoch {0}/{args.num_train_epochs} *****", args.global_rank) - reward_score, acc = evaluation_reward(rm_model, eval_dataloader) + reward_score, reject_score, acc = evaluation_reward( + rm_model, eval_dataloader, args.eval_iters) print_rank_0( - f"chosen_last_scores (higher is better) : {reward_score}, acc (higher is better) : {acc}", - args.global_rank) + f"chosen_last_scores (higher is better) : {reward_score}, " + f"rejected_last_scores (lower is better) : {reject_score}, " + f"acc (higher is better) : {acc}", args.global_rank) + total_micro_steps = 0 for epoch in range(args.num_train_epochs): print_rank_0( f"Beginning of Epoch {epoch+1}/{args.num_train_epochs}, Total Micro Batches {len(train_dataloader)}", @@ -323,6 +383,22 @@ def evaluation_reward(model, eval_dataloader): rm_model.backward(loss) rm_model.step() mean_loss += loss.item() + total_micro_steps += 1 + gas_boundary = (total_micro_steps % + args.gradient_accumulation_steps == 0) + total_steps = total_micro_steps // args.gradient_accumulation_steps + if args.eval_interval and gas_boundary and ( + total_steps % args.eval_interval == 0): + print_rank_0(f"Iter {total_steps}: Evaluating reward", + args.global_rank) + reward_score, reject_score, acc = evaluation_reward( + rm_model, eval_dataloader, args.eval_iters) + print_rank_0( + f"Iter {total_steps}: c_scores: {reward_score}, r_scores: {reject_score}, " + f"diff: {reward_score - reject_score}, acc: {acc}", + args.global_rank) + rm_model.train() + print_rank_0( f"Epoch {epoch+1}/{args.num_train_epochs} with loss {mean_loss/(step+1)}", args.global_rank) @@ -330,10 +406,12 @@ def evaluation_reward(model, eval_dataloader): print_rank_0( f"***** Evaluating reward, Epoch {epoch+1}/{args.num_train_epochs} *****", args.global_rank) - reward_score, acc = evaluation_reward(rm_model, eval_dataloader) + reward_score, reject_score, acc = evaluation_reward( + rm_model, eval_dataloader, args.eval_iters) print_rank_0( - f"chosen_last_scores (higher is better) : {reward_score}, acc (higher is better) : {acc}", - args.global_rank) + f"chosen_last_scores (higher is better) : {reward_score}, " + f"rejected_last_scores (lower is better) : {reject_score}, " + f"acc (higher is better) : {acc}", args.global_rank) rm_model.tput_timer.update_epoch_count() if args.output_dir is not None: diff --git a/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/rw_eval.py b/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/rw_eval.py index 0ab7ecfbc..23f9a66af 100644 --- a/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/rw_eval.py +++ b/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/rw_eval.py @@ -4,16 +4,11 @@ # DeepSpeed Team import argparse -import os import torch -import sys - -sys.path.append( - os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir))) -from utils.model.model_utils import create_critic_model -from utils.utils import to_device -from utils.utils import load_hf_tokenizer +from dschat.utils.model.model_utils import create_critic_model +from dschat.utils.utils import to_device, load_hf_tokenizer +from deepspeed import get_accelerator def 
parse_args(): @@ -34,16 +29,26 @@ def parse_args(): "OPT model has a fixed number (1) of padding tokens at the beginning of the input. " "We did not see this in other models but keep it as an option for now.", ) + parser.add_argument( + "--add_eot_token", + action='store_true', + help="Add <|endoftext|> as additional special token to tokenizer") args = parser.parse_args() return args -def load_stuff(model_name_or_path, num_padding_at_beginning): +def load_stuff(model_name_or_path, num_padding_at_beginning, + additional_special_tokens): - tokenizer = load_hf_tokenizer(model_name_or_path, fast_tokenizer=True) + tokenizer = load_hf_tokenizer(model_name_or_path, + fast_tokenizer=True, + add_special_tokens=additional_special_tokens) tokenizer.pad_token = tokenizer.eos_token - model = create_critic_model(model_name_or_path, tokenizer, None, - num_padding_at_beginning, True) + model = create_critic_model(model_name_or_path, + tokenizer, + None, + num_padding_at_beginning, + dropout=0.) return model, tokenizer @@ -100,10 +105,14 @@ def prepare_singlesample(prompt, def run_pair_comparison(): args = parse_args() - device = torch.device("cuda:0") + device = torch.device(get_accelerator().device_name(0)) + + args.end_of_conversation_token = "<|endoftext|>" + additional_special_tokens = args.end_of_conversation_token if args.add_eot_token else None rm_model, tokenizer = load_stuff(args.model_name_or_path, - args.num_padding_at_beginning) + args.num_padding_at_beginning, + additional_special_tokens) rm_model.to(device) rm_model.eval() @@ -122,12 +131,13 @@ def run_pair_comparison(): for prompt, good_ans, bad_ans in zip(prompt_list, good_ans_list, bad_ans_list): - batch = prepare_datapair(prompt, - good_ans, - bad_ans, - tokenizer, - max_seq_len=512, - end_of_conversation_token="<|endoftext|>") + batch = prepare_datapair( + prompt, + good_ans, + bad_ans, + tokenizer, + max_seq_len=512, + end_of_conversation_token=args.end_of_conversation_token) batch = to_device(batch, device) # Run inference with torch.no_grad(): @@ -144,20 +154,25 @@ def run_pair_comparison(): def run_single_sample(): args = parse_args() - device = torch.device("cuda") + device = torch.device(get_accelerator().device_name()) + + args.end_of_conversation_token = "<|endoftext|>" + additional_special_tokens = args.end_of_conversation_token if args.add_eot_token else None rm_model, tokenizer = load_stuff(args.model_name_or_path, - args.num_padding_at_beginning) + args.num_padding_at_beginning, + additional_special_tokens) rm_model.to(device) prompt = "Human: Explain the moon landing to a 6 year old in a few sentences." my_ans = "Assistant: The moon landing was a major milestone in the history of human exploration of the solar system. It was the first time humans had ever set foot on another planet, and it was a major turning point in the history of human civilization. 
The astronauts, Neil Armstrong, Buzz Aldrin, and Michael Collins, successfully landed the Apollo 11 spacecraft on the moon, marking the first time humans had ever set foot on another" - batch = prepare_singlesample(prompt, - my_ans, - tokenizer, - max_seq_len=512, - end_of_conversation_token="<|endoftext|>") + batch = prepare_singlesample( + prompt, + my_ans, + tokenizer, + max_seq_len=512, + end_of_conversation_token=args.end_of_conversation_token) batch = to_device(batch, device) rm_model.eval() diff --git a/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/single_node/run_7b_llama.sh b/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/llama2/run_llama2_7b.sh similarity index 81% rename from applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/single_node/run_7b_llama.sh rename to applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/llama2/run_llama2_7b.sh index f8a5b7c41..5ee57171d 100755 --- a/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/single_node/run_7b_llama.sh +++ b/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/llama2/run_llama2_7b.sh @@ -14,9 +14,9 @@ fi mkdir -p $OUTPUT deepspeed main.py \ - --data_path Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets \ + --data_path Dahoas/rm-static \ --data_split 2,4,4 \ - --model_name_or_path decapoda-research/llama-7b-hf \ + --model_name_or_path meta-llama/Llama-2-7b-hf \ --per_device_train_batch_size 8 \ --per_device_eval_batch_size 8 \ --max_seq_len 512 \ @@ -31,5 +31,6 @@ deepspeed main.py \ --gradient_checkpointing \ --zero_stage $ZERO_STAGE \ --deepspeed \ + --offload \ --output_dir $OUTPUT \ &> $OUTPUT/training.log diff --git a/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/llama2/run_llama2_7b_lora.sh b/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/llama2/run_llama2_7b_lora.sh new file mode 100755 index 000000000..78c5eb2ac --- /dev/null +++ b/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/llama2/run_llama2_7b_lora.sh @@ -0,0 +1,38 @@ +#!/bin/bash +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +OUTPUT=$1 +ZERO_STAGE=$2 +if [ "$OUTPUT" == "" ]; then + OUTPUT=./output_step2_llama_7b_epoch1_lr9.65e-6 +fi +if [ "$ZERO_STAGE" == "" ]; then + ZERO_STAGE=3 +fi +mkdir -p $OUTPUT + +deepspeed main.py \ + --data_path Dahoas/rm-static \ + --data_split 2,4,4 \ + --model_name_or_path meta-llama/Llama-2-7b-hf \ + --per_device_train_batch_size 8 \ + --per_device_eval_batch_size 8 \ + --max_seq_len 512 \ + --learning_rate 9.65e-6 \ + --weight_decay 0.1 \ + --num_padding_at_beginning 0 \ + --num_train_epochs 1 \ + --gradient_accumulation_steps 1 \ + --lr_scheduler_type cosine \ + --num_warmup_steps 0 \ + --seed 1234 \ + --gradient_checkpointing \ + --zero_stage $ZERO_STAGE \ + --deepspeed \ + --offload \ + --lora_dim 128 \ + --lora_module_name "layers." 
\ + --output_dir $OUTPUT \ + &> $OUTPUT/training.log diff --git a/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/multi_node/run_350m.sh b/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/opt/multi_node/run_350m.sh similarity index 97% rename from applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/multi_node/run_350m.sh rename to applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/opt/multi_node/run_350m.sh index cea008824..51852af45 100644 --- a/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/multi_node/run_350m.sh +++ b/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/opt/multi_node/run_350m.sh @@ -23,7 +23,7 @@ deepspeed main.py \ --max_seq_len 512 \ --learning_rate 5e-5 \ --weight_decay 0.1 \ - --disable_dropout \ + --dropout 0.0 \ --num_train_epochs 1 \ --gradient_accumulation_steps 1 \ --lr_scheduler_type cosine \ diff --git a/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/single_gpu/run_350m.sh b/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/opt/single_gpu/run_350m.sh similarity index 77% rename from applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/single_gpu/run_350m.sh rename to applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/opt/single_gpu/run_350m.sh index 5f836a46f..284fd44a0 100644 --- a/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/single_gpu/run_350m.sh +++ b/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/opt/single_gpu/run_350m.sh @@ -14,7 +14,7 @@ fi mkdir -p $OUTPUT deepspeed --num_gpus 1 main.py --model_name_or_path facebook/opt-350m \ - --num_padding_at_beginning 1 --weight_decay 0.1 --disable_dropout --gradient_accumulation_steps 4 --zero_stage $ZERO_STAGE \ + --num_padding_at_beginning 1 --weight_decay 0.1 --dropout 0.0 --gradient_accumulation_steps 4 --zero_stage $ZERO_STAGE \ --enable_tensorboard \ --tensorboard_path $OUTPUT \ --deepspeed --output_dir $OUTPUT &> $OUTPUT/training.log diff --git a/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/single_node/run_350m.sh b/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/opt/single_node/run_350m.sh similarity index 97% rename from applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/single_node/run_350m.sh rename to applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/opt/single_node/run_350m.sh index 2d1709955..d7ff70106 100644 --- a/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/single_node/run_350m.sh +++ b/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/opt/single_node/run_350m.sh @@ -24,7 +24,7 @@ deepspeed main.py \ --learning_rate 5e-5 \ --weight_decay 0.1 \ --num_train_epochs 1 \ - --disable_dropout \ + --dropout 0.0 \ --gradient_accumulation_steps 1 \ --lr_scheduler_type cosine \ --num_warmup_steps 0 \ diff --git a/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/single_node/sweep/README.md b/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/opt/single_node/sweep/README.md similarity index 87% rename from 
applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/single_node/sweep/README.md rename to applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/opt/single_node/sweep/README.md index 948e5c30b..1f90b9f65 100644 --- a/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/single_node/sweep/README.md +++ b/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/opt/single_node/sweep/README.md @@ -16,5 +16,5 @@ The `run_step2_sweep.sh` script passes configuration arguments to `run_single.sh # Usage The sweep script can be run as follows:
-DeepSpeedExamples/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning$ bash training_scripts/single_node/sweep/run_step2_sweep.sh
+DeepSpeedExamples/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning$ bash training_scripts/opt/single_node/sweep/run_step2_sweep.sh
 
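As with step 1, the step-2 sweep driver now calls `training_scripts/opt/single_node/sweep/run_single.sh`, and the hunk further below shows it sweeping only ZeRO stage and offload while passing an output label of the form `z${z}_offload_${offload}`. The sketch below illustrates how those three positional arguments might be consumed inside `run_single.sh`; the concrete flag list is only partially visible in this patch (for example `--dropout 0.0`), so treat the body as an assumption rather than the actual script.

```bash
#!/bin/bash
# Illustrative mapping of the step-2 sweep's positional arguments to flags.
# The argument order matches the call in run_step2_sweep.sh; the flag list
# inside the real run_single.sh is assumed here.
ZERO_STAGE=$1   # 2 or 3
OFFLOAD=$2      # "true" or "false"
OUTPUT=$3       # e.g. z3_offload_true
mkdir -p $OUTPUT

CMD="deepspeed main.py \
   --model_name_or_path facebook/opt-350m \
   --num_padding_at_beginning 1 \
   --weight_decay 0.1 \
   --dropout 0.0 \
   --zero_stage $ZERO_STAGE \
   --deepspeed \
   --output_dir $OUTPUT"

if [ "$OFFLOAD" == "true" ]; then
    CMD="$CMD --offload"
fi

$CMD &> $OUTPUT/training.log
```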
diff --git a/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/single_node/sweep/run_single.sh b/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/opt/single_node/sweep/run_single.sh similarity index 97% rename from applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/single_node/sweep/run_single.sh rename to applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/opt/single_node/sweep/run_single.sh index c308a2c5f..6f5453af1 100644 --- a/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/single_node/sweep/run_single.sh +++ b/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/opt/single_node/sweep/run_single.sh @@ -30,7 +30,7 @@ cmd="deepspeed main.py \ --learning_rate 5e-5 \ --weight_decay 0.1 \ --num_train_epochs 1 \ - --disable_dropout \ + --dropout 0.0 \ --gradient_accumulation_steps 1 \ --lr_scheduler_type cosine \ --num_warmup_steps 0 \ diff --git a/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/single_node/sweep/run_step2_sweep.sh b/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/opt/single_node/sweep/run_step2_sweep.sh similarity index 85% rename from applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/single_node/sweep/run_step2_sweep.sh rename to applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/opt/single_node/sweep/run_step2_sweep.sh index 0c0852724..ad9849e38 100644 --- a/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/single_node/sweep/run_step2_sweep.sh +++ b/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/opt/single_node/sweep/run_step2_sweep.sh @@ -7,7 +7,7 @@ for z in {2..3} do for offload in true false do - cmd="bash training_scripts/single_node/sweep/run_single.sh \ + cmd="bash training_scripts/opt/single_node/sweep/run_single.sh \ ${z} \ ${offload} \ z${z}_offload_${offload}" diff --git a/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/BenckmarkSetting.md b/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/BenckmarkSetting.md index de1f133c2..bdea565b7 100644 --- a/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/BenckmarkSetting.md +++ b/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/BenckmarkSetting.md @@ -8,6 +8,6 @@ an apple-to-apple comparison is critical for the machine learning community, par We randomly select 40% training data from the six open-sourced training datasets, i.e., ``"Dahoas/rm-static", "Dahoas/full-hh-rlhf", "Dahoas/synthetic-instruct-gptj-pairwise", "yitingxie/rlhf-reward-datasets", "openai/webgpt_comparisons"``, and ``"stanfordnlp/SHP"``. The total training samples we have is 264,292. We fix the query (prompt) sequence length as **256** and generate fixed-length answer with **256** tokens. As such, the total training tokens per epoch is 135,317,504. During benchmark testing, we set the training epoch number as 1. -As mentioned in the instability of [RLHF Training Tutorial](./README.md#πŸ™‹-instablity-of-rlhf-training-and-others), we found that it is not stable to update the actor model multiple times using the generated data. Therefore, we set ``per_device_train_batch_size=per_device_mini_batch_size`` and ``ppo_epochs=generation_batch_numbers=1`` for all of our benchmark results. 
During testing, we also set an upper bound for the maximum global training tokens at 524,288 (batch size of 1024 with a sequence length of 512). This is the largest batch size we found during our exploration that provides a stable RLHF training experience. Users and practitioners may find better training hyperparameters to further increase this. Additionally, during testing, whenever the global training token batch size does not exceed our limit of 524,288, we always use the largest training batch size that does not result in an out-of-memory error to benchmark the time. +As mentioned in the instability of [RLHF Training Tutorial](./README.md#πŸ™‹-instablity-of-rlhf-training-and-others), we found that it is not stable to update the actor model multiple times using the generated data. Therefore, we set ``per_device_generation_batch_size=per_device_training_batch_size`` and ``ppo_epochs=generation_batches=1`` for all of our benchmark results. During testing, we also set an upper bound for the maximum global training tokens at 524,288 (batch size of 1024 with a sequence length of 512). This is the largest batch size we found during our exploration that provides a stable RLHF training experience. Users and practitioners may find better training hyperparameters to further increase this. Additionally, during testing, whenever the global training token batch size does not exceed our limit of 524,288, we always use the largest training batch size that does not result in an out-of-memory error to benchmark the time. We hope this clearly explains our benchmark settings, and please do not hesitate to contact us if you need more information. If you'd like to reproduce our performance results or make a comparison with DeepSpeed-RLHF, we would like to encourage you to leverage the same / similar settings such that the performance results are more comparable. diff --git a/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/README.md b/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/README.md index 0c7e064f1..b2c34f572 100644 --- a/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/README.md +++ b/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/README.md @@ -30,7 +30,7 @@ To overcome both challenges, we are introducing the DeepSpeed Hybrid Engine (Dee We provide multiple actor training scripts in the 'training_scripts' folder, all using a fixed OPT-350m reward model. However, users are encouraged to experiment with different reward model sizes based on their preferences. For example, if you have a single GPU and want to train an OPT-1.3B model, you can simply run the following bash script to initiate the training process. ```bash -training_scripts/single_gpu/run_1.3b.sh +training_scripts/opt/single_gpu/run_1.3b.sh ``` ## 🎡 Some arguments explanation and largest model training on your own system @@ -41,8 +41,8 @@ We provide most of unique arguments used in DeepSpeed RLHF other than the previo | ------------------------------------------------------------------ | -------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | --unsupervised_dataset_name and --unsupervised_dataset_config_name | Huggingface datasets standard setting to collect the data, e.g., using Wikitext-103 | When both are provided, during each PPO training, we will also add the pretraining objective. 
Based on InstructGPT, this will enhance the model's benchmark performance. | | --unsup_coef | Used to balance RLHF/PPO loss and the unsupervised loss | | -| --per_device_train_batch_size and --per_device_mini_batch_size | The first one is the generation batch size and the second one is the PPO training batch size | Usually, the first one needs to be divisible by the second one. | -| --generation_batch_numbers | Generated N batches then do PPO training | This setting is common in RL, i.e., we generate an experiment table then do RL training | +| --per_device_generation_batch_size and --per_device_training_batch_size | The first one is the generation batch size and the second one is the PPO training batch size | Usually, the first one needs to be divisible by the second one. | +| --generation_batches | Generated N batches then do PPO training | This setting is common in RL, i.e., we generate an experiment table then do RL training | | --ppo_epochs | For the generated experiments, how many PPO epochs we want to perform | | | --max_prompt_seq_len and --max_answer_seq_len | The length of the query and the length of the answer | | | --enable_hybrid_engine | Enable it to use DeepSpeed Hybrid Engine | This will significantly speedup your training | @@ -69,7 +69,7 @@ Users can either use the `prompt_eval.py` script from Step 1 of the SFT process RLHF is a relatively new field, and as expected, we have encountered some training instabilities during our exploration. We are sharing our findings here and actively working on solutions. We also welcome solutions from the community. -We have found that it is very unstable to use different generation training batch sizes (`--per_device_train_batch_size`) and PPO training batch sizes (`--per_device_mini_batch_size`), more than one PPO training epoch (`--ppo_epochs`), or more than one generation batch size (`--generation_batch_numbers`). These all point to the same problem: we are not able to update the actor model multiple times after generating experimental data. Therefore, in all of our successful runs, we have set `per_device_train_batch_size=per_device_mini_batch_size` and `ppo_epochs=generation_batch_numbers=1`. This is unexpected for a standard RL training pipeline, and we have tried different methods to overcome this, but all have failed. One of the most likely reasons for this instability is that we found the `log_probs` and `old_log_probs` used in the `actor_loss_fn` function can quickly diverge even within two consecutive iterations, which causes the corresponding `ratio` to be huge. Setting a strict upper bound can alleviate this problem, but it cannot fully resolve the convergence issue. +We have found that it is very unstable to use different generation training batch sizes (`--per_device_generation_batch_size`) and PPO training batch sizes (`--per_device_training_batch_size`), more than one PPO training epoch (`--ppo_epochs`), or more than one generation batch (`--generation_batches 1`). These all point to the same problem: we are not able to update the actor model multiple times after generating experimental data. Therefore, in all of our successful runs, we have set `per_device_generation_batch_size=per_device_training_batch_size` and `ppo_epochs=generation_batches=1`. This is unexpected for a standard RL training pipeline, and we have tried different methods to overcome this, but all have failed. 
One of the most likely reasons for this instability is that we found the `log_probs` and `old_log_probs` used in the `actor_loss_fn` function can quickly diverge even within two consecutive iterations, which causes the corresponding `ratio` to be huge. Setting a strict upper bound can alleviate this problem, but it cannot fully resolve the convergence issue. We have also found that adding unsupervised training is not easy. We tried using the coefficient (`--unsup_coef=27.8`) provided by InstructGPT, but it caused instability in the RLHF training. According to InstructGPT, unsupervised training mainly affects the model quality on standard benchmarks instead of the RLHF performance. We did not put much effort into tuning this parameter. diff --git a/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/main.py b/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/main.py index 87ad2b8d1..a5be5671b 100644 --- a/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/main.py +++ b/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/main.py @@ -19,6 +19,7 @@ import argparse import os import random +import time import torch from torch.utils.data import DataLoader, RandomSampler from torch.utils.data.distributed import DistributedSampler @@ -32,16 +33,14 @@ import deepspeed -from ppo_trainer import DeepSpeedPPOTrainer, DeepSpeedPPOTrainerUnsupervised -from rlhf_engine import DeepSpeedRLHFEngine - -import sys - -sys.path.append( - os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir))) -from utils.data.data_utils import create_prompt_dataset, MiniDataset, DataCollatorRLHF, get_unsupervised_data -from utils.utils import print_rank_0, to_device, save_hf_format, set_random_seed, get_all_reduce_mean, moving_average, save_zero_three_model, load_hf_tokenizer -from utils.module.lora import convert_lora_to_linear_layer +from dschat.rlhf.ppo_trainer import DeepSpeedPPOTrainer, DeepSpeedPPOTrainerUnsupervised +from dschat.rlhf.rlhf_engine import DeepSpeedRLHFEngine +from dschat.utils.data.data_utils import create_prompt_dataset, MiniDataset, DataCollatorRLHF, get_unsupervised_data +from dschat.utils.utils import print_rank_0, to_device, save_hf_format, set_random_seed, get_all_reduce_mean, moving_average, save_zero_three_model, load_hf_tokenizer, \ + ExponentialMovingAverage +from dschat.utils.module.lora import convert_lora_to_linear_layer +from dschat.utils.perf import print_throughput_step3 +from deepspeed.accelerator import get_accelerator writer = None @@ -109,20 +108,20 @@ def parse_args(): "OPT model has a fixed number (1) of padding tokens at the beginning of the input. We did not see this in other models but keep it as an option for now." ) parser.add_argument( - "--per_device_train_batch_size", + "--per_device_generation_batch_size", type=int, default=16, help= "Batch size (per device) for the training dataloader and generation purpose." ) parser.add_argument( - "--per_device_mini_train_batch_size", + "--per_device_training_batch_size", type=int, default=16, help= "Mini Batch size (per device) for the training dataloader and training purpose." 
) - parser.add_argument("--generation_batch_numbers", + parser.add_argument("--generation_batches", type=int, default=1, help="Generate x batches to go to training mode.") @@ -238,6 +237,11 @@ def parse_args(): parser.add_argument('--offload', action='store_true', help='Enable ZeRO Offload techniques.') + parser.add_argument('--dtype', + type=str, + default='fp16', + choices=['fp16', 'bf16'], + help='Training data type') parser.add_argument( '--offload_reference_model', action='store_true', @@ -260,12 +264,20 @@ def parse_args(): '--critic_gradient_checkpointing', action='store_true', help='Enable HF gradient checkpointing for Critic model.') - parser.add_argument('--disable_actor_dropout', - action='store_true', - help='Disable the dropout of the actor model.') - parser.add_argument('--disable_critic_dropout', - action='store_true', - help='Disable the dropout of the critical model.') + parser.add_argument( + "--actor_dropout", + type=float, + default=None, + help="If actor dropout configured, use it. " + "Otherwise, keep the default dropout configuration of the actor model." + ) + parser.add_argument( + "--critic_dropout", + type=float, + default=None, + help="If critic dropout configured, use it. " + "Otherwise, keep the default dropout configuration of the critic model." + ) ## LoRA for efficient training setting parser.add_argument("--actor_lora_dim", type=int, @@ -286,10 +298,36 @@ def parse_args(): parser.add_argument('--only_optimize_lora', action='store_true', help='Only optimize the LoRA parameters.') + parser.add_argument( + "--actor_lora_learning_rate", + type=float, + default=5e-4, + help= + "Initial actor LoRA learning rate (after the potential warmup period) to use." + ) + parser.add_argument( + "--critic_lora_learning_rate", + type=float, + default=5e-4, + help= + "Initial critic LoRA learning rate (after the potential warmup period) to use." + ) ## Make EMA as an optional feature parser.add_argument('--enable_ema', action='store_true', help='Enable EMA checkpoint for the model.') + ## Mixed Precision ZeRO++ + parser.add_argument( + '--enable_mixed_precision_lora', + action='store_true', + help='Enable Mixed Precision ZeRO++ for training and generation.') + ## low precision + parser.add_argument( + '--compute_fp32_loss', + action='store_true', + help='Relevant for low precision dtypes (fp16, bf16, etc.). ' + 'If specified, loss is calculated in fp32.' + 'This applies for both actor and critic models.') ## Tensorboard logging parser.add_argument('--enable_tensorboard', action='store_true', @@ -297,6 +335,11 @@ def parse_args(): parser.add_argument('--tensorboard_path', type=str, default="step3_tensorboard") + ## Tokenizer + parser.add_argument( + "--add_eot_token", + action='store_true', + help="Add <|endoftext|> as additional special token to tokenizer") ## Actor/critic model overflow alignment parser.add_argument( '--align_overflow', @@ -306,6 +349,25 @@ def parse_args(): parser.add_argument('--print_answers', action='store_true', help='Print prompt and answers during training') + parser.add_argument( + "--print_answers_interval", + type=int, + default=1, + help="If --print_answers enabled, controls the printing interval.") + ## Testing + parser.add_argument( + '--enable_test_mode', + action='store_true', + help= + 'Enable a testing mode that terminates training based on args.test_stop_step' + ) + parser.add_argument( + "--test_stop_step", + type=int, + default=0, + help= + "Training non-overflow step at which to terminate training during testing." 
+ ) parser = deepspeed.add_config_arguments(parser) args = parser.parse_args() @@ -328,11 +390,6 @@ def parse_args(): "The combination of [actor_zero_stage==2, critic_zero_stage==2, enable_hybrid_engine=True, offload=True, lora=False] is currently unsupported due to training instability!" ) - if args.actor_zero_stage == 3 and args.critic_zero_stage == 3 and args.enable_hybrid_engine and args.offload and args.actor_lora_dim > 0: - raise ValueError( - "The combination of [actor_zero_stage==3, critic_zero_stage==3, enable_hybrid_engine=True, offload=True, lora=True] is currently unsupported due to training instability!" - ) - return args @@ -364,19 +421,19 @@ def create_datasets(args, tokenizer, train_phase=3): prompt_train_dataset, collate_fn=data_collator, sampler=prompt_train_sampler, - batch_size=args.per_device_train_batch_size) + batch_size=args.per_device_generation_batch_size) if unsupervised_training_enabled: unsupervised_train_dataloader = DataLoader( unsupervised_train_dataset, collate_fn=default_data_collator, sampler=unsupervised_train_sampler, - batch_size=args.per_device_train_batch_size) + batch_size=args.per_device_generation_batch_size) else: unsupervised_train_dataloader = [None] * len( prompt_train_dataloader) # basically a dummy dataloader num_update_steps_per_epoch = min(len(prompt_train_dataloader), len(unsupervised_train_dataloader)) * \ - (args.per_device_train_batch_size / args.per_device_mini_train_batch_size) * \ + (args.per_device_generation_batch_size / args.per_device_training_batch_size) * \ args.ppo_epochs / args.gradient_accumulation_steps num_total_iters = int(args.num_train_epochs * num_update_steps_per_epoch) @@ -387,10 +444,10 @@ def main(): args = parse_args() if args.local_rank == -1: - device = torch.device("cuda") + device = torch.device(get_accelerator().device_name()) else: - torch.cuda.set_device(args.local_rank) - device = torch.device("cuda", args.local_rank) + get_accelerator().set_device(args.local_rank) + device = torch.device(get_accelerator().device_name(), args.local_rank) # Initializes the distributed backend which will take care of sychronizing nodes/GPUs deepspeed.init_distributed() @@ -408,8 +465,12 @@ def main(): torch.distributed.barrier() # load_hf_tokenizer will get the correct tokenizer and set padding tokens based on the model family + args.end_of_conversation_token = "<|endoftext|>" + additional_special_tokens = args.end_of_conversation_token if args.add_eot_token else None tokenizer = load_hf_tokenizer(args.actor_model_name_or_path, - fast_tokenizer=True) + fast_tokenizer=True, + add_special_tokens=additional_special_tokens) + prompt_train_dataloader, unsupervised_train_dataloader, num_total_iters = create_datasets( args=args, tokenizer=tokenizer, train_phase=3) @@ -421,19 +482,30 @@ def main(): num_total_iters=num_total_iters, args=args) - args.end_of_conversation_token = "<|endoftext|>" + # Mixed Precision ZeRO++ + if args.enable_mixed_precision_lora: + assert args.actor_lora_dim > 0, "Mixed Precision LoRA requires LoRA to be enabled" + assert args.actor_zero_stage == 3, "Mixed Precision LoRA requires Zero stage 3" + rlhf_engine.actor.optimizer.quantize_nontrainable_params() + print_rank_0("Mixed Precision ZeRO++ enabled") ppo_trainer = DeepSpeedPPOTrainerUnsupervised if unsupervised_training_enabled else DeepSpeedPPOTrainer trainer = ppo_trainer(rlhf_engine, args) # first number is how many experience-batch to generate, second number is the training batch size, which is the micro-batch size used - exp_mini_dataset = 
MiniDataset(args.generation_batch_numbers, - args.per_device_mini_train_batch_size) - unsup_mini_dataset = MiniDataset(args.generation_batch_numbers, - args.per_device_mini_train_batch_size) + exp_mini_dataset = MiniDataset(args.generation_batches, + args.per_device_training_batch_size) + unsup_mini_dataset = MiniDataset(args.generation_batches, + args.per_device_training_batch_size) # Train! - print_rank_0("***** Running training *****", args.global_rank) + print_rank_0( + f"***** Running training (total_iters={num_total_iters}) *****", + args.global_rank) + + non_overflow_step_count = 0 + step_average_reward = 0. + ema_reward_score = ExponentialMovingAverage() for epoch in range(args.num_train_epochs): print_rank_0( @@ -441,13 +513,9 @@ def main(): args.global_rank) for step, (batch_prompt, batch_unsupervised) in enumerate( zip(prompt_train_dataloader, unsupervised_train_dataloader)): + batch_prompt = to_device(batch_prompt, device) - if batch_unsupervised is not None: - batch_unsupervised = to_device(batch_unsupervised, device) - unsup_dataset = unsup_mini_dataset.add(batch_unsupervised) - else: - unsup_dataset = unsup_mini_dataset.add( - [[None] * args.per_device_train_batch_size]) + # prompts = batch_prompt['prompt'] # length = prompts.size(-1) # if length > args.max_prompt_seq_len: @@ -457,6 +525,15 @@ def main(): out = trainer.generate_experience(batch_prompt['prompt'], batch_prompt['prompt_att_mask'], step) + + training_start = time.time() + if batch_unsupervised is not None: + batch_unsupervised = to_device(batch_unsupervised, device) + unsup_dataset = unsup_mini_dataset.add(batch_unsupervised) + else: + unsup_dataset = unsup_mini_dataset.add( + [[None] * args.per_device_generation_batch_size]) + exp_dataset = exp_mini_dataset.add(out) if exp_dataset is not None: @@ -489,29 +566,44 @@ def main(): random.shuffle(exp_dataset) random.shuffle(unsup_dataset) + end = time.time() + training_time = end - training_start + e2e_time = training_time + trainer.generate_time * args.generation_batches # it is an approximation, we did not include, e.g., rw forward time etc + print_rank_0( - f'epoch: {epoch}|step: {step}|ppo_ep: {ppo_ep+1}|act_loss: {actor_loss_sum/inner_iter}|cri_loss: {critic_loss_sum/inner_iter}|unsuper_loss: {unsup_loss_sum/inner_iter}', + f'Epoch: {epoch} | Step: {step} | PPO Epoch: {ppo_ep+1} | Actor Loss: {actor_loss_sum/inner_iter} | Critic Loss: {critic_loss_sum/inner_iter} | Unsupervised Loss: {unsup_loss_sum/inner_iter}', args.global_rank) + print_throughput_step3(rlhf_engine.actor.module, + rlhf_engine.critic, args, e2e_time, + trainer.generate_time, training_time, + args.global_rank) + average_reward = get_all_reduce_mean(average_reward).item() + step_average_reward += average_reward / args.gradient_accumulation_steps_actor + if (step + 1) % args.gradient_accumulation_steps_actor == 0: + ema_reward_score.update(step_average_reward) + step_average_reward = 0. 
+ print_rank_0( - f"average reward score: {average_reward/inner_iter}", + f"Average reward score: {average_reward/inner_iter} | EMA reward score: {ema_reward_score.get()}", args.global_rank) print_rank_0( "-------------------------------------------------------------------------------------", args.global_rank) + if args.enable_tensorboard and torch.distributed.get_rank( ) == 0: writer.add_scalar('reward', average_reward / inner_iter, global_step=step) writer.add_scalar('actor_loss', - actor_loss, + actor_loss.item(), global_step=step) writer.add_scalar('actor_loss_sum', actor_loss_sum, global_step=step) writer.add_scalar('critic_loss', - critic_loss, + critic_loss.item(), global_step=step) writer.add_scalar('critic_loss_sum', critic_loss_sum, @@ -521,6 +613,17 @@ def main(): if args.actor_gradient_checkpointing: rlhf_engine.actor.gradient_checkpointing_disable() + actor_overflow, critic_overflow = trainer.get_overflow() + + if not actor_overflow and not critic_overflow: + non_overflow_step_count += 1 + + if args.enable_test_mode and non_overflow_step_count == args.test_stop_step: + break + + if args.enable_test_mode: + break + if args.output_dir is not None: print_rank_0('saving model ...') rlhf_engine.actor = convert_lora_to_linear_layer(rlhf_engine.actor) diff --git a/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_log_output/actor_opt-1.3b_critic_opt-350m_globalBatchSize64.log b/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_log_output/actor_opt-1.3b_critic_opt-350m_globalBatchSize64.log index f4a0971eb..38437e3fe 100644 --- a/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_log_output/actor_opt-1.3b_critic_opt-350m_globalBatchSize64.log +++ b/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_log_output/actor_opt-1.3b_critic_opt-350m_globalBatchSize64.log @@ -1,4 +1,4 @@ -cmd = deepspeed main.py --data_path Dahoas/rm-static --data_split 2,4,4 --actor_model_name_or_path ~/DeepSpeedExamples/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/output --critic_model_name_or_path ~/DeepSpeedExamples/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/output --num_padding_at_beginning 1 --per_device_train_batch_size 4 --per_device_mini_train_batch_size 4 --generation_batch_numbers 1 --ppo_epochs 1 --max_answer_seq_len 256 --max_prompt_seq_len 256 --actor_learning_rate 9.65e-6 --critic_learning_rate 5e-6 --actor_weight_decay 0. --critic_weight_decay 0. --num_train_epochs 1 --lr_scheduler_type cosine --gradient_accumulation_steps 1 --num_warmup_steps 100 --deepspeed --seed 1234 --enable_hybrid_engine --actor_zero_stage 2 --critic_zero_stage 2 --enable_ema --output_dir ./output_fourDatasets_NoWeightDecay_withDropout_mixed +cmd = deepspeed main.py --data_path Dahoas/rm-static --data_split 2,4,4 --actor_model_name_or_path ~/DeepSpeedExamples/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/output --critic_model_name_or_path ~/DeepSpeedExamples/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/output --num_padding_at_beginning 1 --per_device_generation_batch_size 4 --per_device_training_batch_size 4 --generation_batches 1 --ppo_epochs 1 --max_answer_seq_len 256 --max_prompt_seq_len 256 --actor_learning_rate 9.65e-6 --critic_learning_rate 5e-6 --actor_weight_decay 0. --critic_weight_decay 0. 
--num_train_epochs 1 --lr_scheduler_type cosine --gradient_accumulation_steps 1 --num_warmup_steps 100 --deepspeed --seed 1234 --enable_hybrid_engine --actor_zero_stage 2 --critic_zero_stage 2 --enable_ema --output_dir ./output_fourDatasets_NoWeightDecay_withDropout_mixed ***** Running training ***** Beginning of Epoch 1/1, Total Generation Batches 477 ------------------------------------------------------ diff --git a/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/single_node/run_7b_llama.sh b/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/llama2/run_llama2_7b.sh similarity index 88% rename from applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/single_node/run_7b_llama.sh rename to applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/llama2/run_llama2_7b.sh index 900a2f13e..c58e94eab 100755 --- a/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/single_node/run_7b_llama.sh +++ b/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/llama2/run_llama2_7b.sh @@ -30,9 +30,9 @@ deepspeed --master_port 12346 main.py \ --actor_model_name_or_path $ACTOR_MODEL_PATH \ --critic_model_name_or_path $CRITIC_MODEL_PATH \ --num_padding_at_beginning 1 \ - --per_device_train_batch_size 1 \ - --per_device_mini_train_batch_size 1 \ - --generation_batch_numbers 1 \ + --per_device_generation_batch_size 1 \ + --per_device_training_batch_size 1 \ + --generation_batches 1 \ --ppo_epochs 1 \ --max_answer_seq_len 256 \ --max_prompt_seq_len 256 \ @@ -46,11 +46,11 @@ deepspeed --master_port 12346 main.py \ --actor_gradient_checkpointing \ --critic_gradient_checkpointing \ --offload_reference_model \ - --disable_actor_dropout \ + --actor_dropout 0.0 \ --num_warmup_steps 100 \ --deepspeed --seed 1234 \ --actor_zero_stage $ACTOR_ZERO_STAGE \ --critic_zero_stage $CRITIC_ZERO_STAGE \ - --enable_hybrid_engine \ + --enable_hybrid_engine \ --output_dir $OUTPUT \ &> $OUTPUT/training.log diff --git a/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/llama2/run_llama2_7b_lora.sh b/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/llama2/run_llama2_7b_lora.sh new file mode 100755 index 000000000..830c3750e --- /dev/null +++ b/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/llama2/run_llama2_7b_lora.sh @@ -0,0 +1,60 @@ +#!/bin/bash +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +ACTOR_MODEL_PATH=$1 +CRITIC_MODEL_PATH=$2 +ACTOR_ZERO_STAGE=$3 +CRITIC_ZERO_STAGE=$4 +OUTPUT=$5 +if [ "$OUTPUT" == "" ]; then + OUTPUT=./output_step3_llama2 +fi +if [ "$ACTOR_ZERO_STAGE" == "" ]; then + ACTOR_ZERO_STAGE=3 +fi +if [ "$CRITIC_ZERO_STAGE" == "" ]; then + CRITIC_ZERO_STAGE=3 +fi +mkdir -p $OUTPUT + +Num_Padding_at_Beginning=1 # this is model related + +Actor_Lr=9.65e-6 +Critic_Lr=5e-6 + +deepspeed --master_port 12346 main.py \ + --data_path Dahoas/rm-static \ + --data_split 2,4,4 \ + --actor_model_name_or_path $ACTOR_MODEL_PATH \ + --critic_model_name_or_path $CRITIC_MODEL_PATH \ + --num_padding_at_beginning 1 \ + --per_device_generation_batch_size 1 \ + --per_device_training_batch_size 1 \ + --generation_batches 1 \ + --ppo_epochs 1 \ + --max_answer_seq_len 256 \ + --max_prompt_seq_len 256 \ + --actor_learning_rate ${Actor_Lr} \ + --critic_learning_rate ${Critic_Lr} \ + --actor_weight_decay 0.1 \ + --critic_weight_decay 0.1 \ + --num_train_epochs 1 \ + --lr_scheduler_type cosine \ + --gradient_accumulation_steps 1 \ + --actor_gradient_checkpointing \ + --critic_gradient_checkpointing \ + --offload_reference_model \ + --actor_dropout 0.0 \ + --num_warmup_steps 100 \ + --deepspeed --seed 1234 \ + --actor_zero_stage $ACTOR_ZERO_STAGE \ + --critic_zero_stage $CRITIC_ZERO_STAGE \ + --enable_hybrid_engine \ + --actor_lora_dim 64 \ + --critic_lora_dim 64 \ + --critic_lora_module_name "layers." \ + --actor_lora_module_name "layers." \ + --output_dir $OUTPUT \ + &> $OUTPUT/training.log diff --git a/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/llama2/run_llama2_7b_mixz.sh b/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/llama2/run_llama2_7b_mixz.sh new file mode 100755 index 000000000..abde0b54a --- /dev/null +++ b/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/llama2/run_llama2_7b_mixz.sh @@ -0,0 +1,60 @@ +#!/bin/bash +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +ACTOR_MODEL_PATH=$1 +CRITIC_MODEL_PATH=$2 +ACTOR_ZERO_STAGE=$3 +CRITIC_ZERO_STAGE=$4 +OUTPUT=$5 +if [ "$OUTPUT" == "" ]; then + OUTPUT=./output_step3_llama2 +fi +if [ "$ACTOR_ZERO_STAGE" == "" ]; then + ACTOR_ZERO_STAGE=3 +fi +if [ "$CRITIC_ZERO_STAGE" == "" ]; then + CRITIC_ZERO_STAGE=3 +fi +mkdir -p $OUTPUT + +Num_Padding_at_Beginning=1 # this is model related + +Actor_Lr=9.65e-6 +Critic_Lr=5e-6 + +deepspeed --master_port 12346 main.py \ + --data_path Dahoas/rm-static \ + --data_split 2,4,4 \ + --actor_model_name_or_path $ACTOR_MODEL_PATH \ + --critic_model_name_or_path $CRITIC_MODEL_PATH \ + --num_padding_at_beginning 1 \ + --per_device_generation_batch_size 1 \ + --per_device_training_batch_size 1 \ + --generation_batches 1 \ + --ppo_epochs 1 \ + --max_answer_seq_len 256 \ + --max_prompt_seq_len 256 \ + --actor_learning_rate ${Actor_Lr} \ + --critic_learning_rate ${Critic_Lr} \ + --actor_weight_decay 0.1 \ + --critic_weight_decay 0.1 \ + --num_train_epochs 1 \ + --lr_scheduler_type cosine \ + --gradient_accumulation_steps 1 \ + --actor_gradient_checkpointing \ + --critic_gradient_checkpointing \ + --offload_reference_model \ + --actor_dropout 0.0 \ + --num_warmup_steps 100 \ + --deepspeed --seed 1234 \ + --actor_zero_stage $ACTOR_ZERO_STAGE \ + --critic_zero_stage $CRITIC_ZERO_STAGE \ + --enable_mixed_precision_lora \ + --actor_lora_dim 64 \ + --critic_lora_dim 64 \ + --critic_lora_module_name "layers." 
\ + --actor_lora_module_name "layers." \ + --output_dir $OUTPUT \ + &> $OUTPUT/training.log diff --git a/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/multi_node/run_66b.sh b/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/opt/multi_node/run_66b.sh similarity index 90% rename from applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/multi_node/run_66b.sh rename to applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/opt/multi_node/run_66b.sh index b0b716ffc..c70908ceb 100644 --- a/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/multi_node/run_66b.sh +++ b/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/opt/multi_node/run_66b.sh @@ -30,9 +30,9 @@ deepspeed --master_port 12346 main.py \ --actor_model_name_or_path $ACTOR_MODEL_PATH \ --critic_model_name_or_path $CRITIC_MODEL_PATH \ --num_padding_at_beginning 1 \ - --per_device_train_batch_size 4 \ - --per_device_mini_train_batch_size 4 \ - --generation_batch_numbers 1 \ + --per_device_generation_batch_size 4 \ + --per_device_training_batch_size 4 \ + --generation_batches 1 \ --ppo_epochs 1 \ --max_answer_seq_len 256 \ --max_prompt_seq_len 256 \ @@ -51,7 +51,7 @@ deepspeed --master_port 12346 main.py \ --actor_zero_stage $ACTOR_ZERO_STAGE \ --critic_zero_stage $CRITIC_ZERO_STAGE \ --actor_gradient_checkpointing \ - --disable_actor_dropout \ + --actor_dropout 0.0 \ --actor_lora_dim 128 \ --actor_lora_module_name decoder.layers. \ --output_dir $OUTPUT \ diff --git a/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/single_gpu/run_1.3b.sh b/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/opt/single_gpu/run_1.3b.sh similarity index 93% rename from applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/single_gpu/run_1.3b.sh rename to applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/opt/single_gpu/run_1.3b.sh index 1b1a5c489..41cacebab 100644 --- a/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/single_gpu/run_1.3b.sh +++ b/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/opt/single_gpu/run_1.3b.sh @@ -23,5 +23,5 @@ deepspeed --num_gpus 1 main.py \ --actor_model_name_or_path $ACTOR_MODEL_PATH --critic_model_name_or_path $CRITIC_MODEL_PATH \ --actor_zero_stage $ACTOR_ZERO_STAGE --critic_zero_stage $CRITIC_ZERO_STAGE \ --num_padding_at_beginning 1 --gradient_accumulation_steps 2 \ - --deepspeed --actor_lora_dim 128 --enable_hybrid_engine --actor_gradient_checkpointing --disable_actor_dropout \ + --deepspeed --actor_lora_dim 128 --enable_hybrid_engine --actor_gradient_checkpointing --actor_dropout 0.0 \ --output_dir $OUTPUT &> $OUTPUT/training.log diff --git a/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/single_gpu/run_6.7b_lora.sh b/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/opt/single_gpu/run_6.7b_lora.sh similarity index 88% rename from applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/single_gpu/run_6.7b_lora.sh rename to applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/opt/single_gpu/run_6.7b_lora.sh index 2c4c458ac..2c3a01d5f 100644 --- a/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/single_gpu/run_6.7b_lora.sh +++ 
b/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/opt/single_gpu/run_6.7b_lora.sh @@ -25,13 +25,12 @@ deepspeed --num_gpus 1 main.py \ --actor_model_name_or_path $ACTOR_MODEL_PATH \ --critic_model_name_or_path $CRITIC_MODEL_PATH \ --num_padding_at_beginning 1 \ - --per_device_train_batch_size 8 \ - --per_device_mini_train_batch_size 8 \ - --generation_batch_numbers 1 \ + --per_device_generation_batch_size 8 \ + --per_device_training_batch_size 8 \ + --generation_batches 1 \ --ppo_epochs 1 \ --max_answer_seq_len 256 \ --max_prompt_seq_len 256 \ - --ppo_epochs 1 \ --actor_learning_rate ${Actor_Lr} \ --critic_learning_rate ${Critic_Lr} \ --num_train_epochs 1 \ @@ -44,7 +43,7 @@ deepspeed --num_gpus 1 main.py \ --actor_lora_dim 128 \ --actor_gradient_checkpointing \ --critic_gradient_checkpointing \ - --disable_actor_dropout \ + --actor_dropout 0.0 \ --enable_hybrid_engine \ --output_dir $OUTPUT \ &> $OUTPUT/training.log diff --git a/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/single_node/run_1.3b.sh b/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/opt/single_node/run_1.3b.sh similarity index 71% rename from applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/single_node/run_1.3b.sh rename to applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/opt/single_node/run_1.3b.sh index ed4b0c2a1..5449bfea4 100644 --- a/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/single_node/run_1.3b.sh +++ b/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/opt/single_node/run_1.3b.sh @@ -17,6 +17,15 @@ fi if [ "$CRITIC_ZERO_STAGE" == "" ]; then CRITIC_ZERO_STAGE=2 fi + +# if actor and critic model names are not provided, then use the publicly available AdamG012/chat-opt-1.3b-sft-deepspeed and AdamG012/chat-opt-350m-reward-deepspeed +if [ "$ACTOR_MODEL_PATH" == "" ]; then + ACTOR_MODEL_PATH=AdamG012/chat-opt-1.3b-sft-deepspeed +fi +if [ "$CRITIC_MODEL_PATH" == "" ]; then + CRITIC_MODEL_PATH=AdamG012/chat-opt-350m-reward-deepspeed +fi + mkdir -p $OUTPUT Num_Padding_at_Beginning=1 # this is model related @@ -30,9 +39,9 @@ deepspeed --master_port 12346 main.py \ --actor_model_name_or_path $ACTOR_MODEL_PATH \ --critic_model_name_or_path $CRITIC_MODEL_PATH \ --num_padding_at_beginning 1 \ - --per_device_train_batch_size 4 \ - --per_device_mini_train_batch_size 4 \ - --generation_batch_numbers 1 \ + --per_device_generation_batch_size 4 \ + --per_device_training_batch_size 4 \ + --generation_batches 1 \ --ppo_epochs 1 \ --max_answer_seq_len 256 \ --max_prompt_seq_len 256 \ @@ -41,7 +50,7 @@ deepspeed --master_port 12346 main.py \ --num_train_epochs 1 \ --lr_scheduler_type cosine \ --gradient_accumulation_steps 1 \ - --disable_actor_dropout \ + --actor_dropout 0.0 \ --num_warmup_steps 100 \ --deepspeed --seed 1234 \ --enable_hybrid_engine \ @@ -49,7 +58,6 @@ deepspeed --master_port 12346 main.py \ --critic_zero_stage $CRITIC_ZERO_STAGE \ --enable_ema \ --output_dir $OUTPUT \ - --print_answers \ --enable_tensorboard \ --tensorboard_path $OUTPUT \ &> $OUTPUT/training.log diff --git a/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/single_node/run_1.3b_lora.sh b/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/opt/single_node/run_1.3b_lora.sh similarity index 89% rename from applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/single_node/run_1.3b_lora.sh rename 
to applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/opt/single_node/run_1.3b_lora.sh index 230f737de..b39ccb833 100644 --- a/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/single_node/run_1.3b_lora.sh +++ b/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/opt/single_node/run_1.3b_lora.sh @@ -25,9 +25,9 @@ deepspeed --master_port 12346 main.py \ --actor_model_name_or_path $ACTOR_MODEL_PATH \ --critic_model_name_or_path $CRITIC_MODEL_PATH \ --num_padding_at_beginning 1 \ - --per_device_train_batch_size 4 \ - --per_device_mini_train_batch_size 4 \ - --generation_batch_numbers 1 \ + --per_device_generation_batch_size 4 \ + --per_device_training_batch_size 4 \ + --generation_batches 1 \ --ppo_epochs 1 \ --max_answer_seq_len 256 \ --max_prompt_seq_len 256 \ @@ -38,7 +38,7 @@ deepspeed --master_port 12346 main.py \ --gradient_accumulation_steps 1 \ --num_warmup_steps 100 \ --deepspeed --seed 1234 \ - --disable_actor_dropout \ + --actor_dropout 0.0 \ ${ACTOR_ZERO_STAGE} \ ${CRITIC_ZERO_STAGE} \ --actor_lora_dim 128 \ diff --git a/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/single_node/run_13b.sh b/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/opt/single_node/run_13b.sh similarity index 90% rename from applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/single_node/run_13b.sh rename to applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/opt/single_node/run_13b.sh index 1b3f805bb..82751bd7f 100644 --- a/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/single_node/run_13b.sh +++ b/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/opt/single_node/run_13b.sh @@ -30,9 +30,9 @@ deepspeed --master_port 12346 main.py \ --actor_model_name_or_path $ACTOR_MODEL_PATH \ --critic_model_name_or_path $CRITIC_MODEL_PATH \ --num_padding_at_beginning 1 \ - --per_device_train_batch_size 16 \ - --per_device_mini_train_batch_size 16 \ - --generation_batch_numbers 1 \ + --per_device_generation_batch_size 16 \ + --per_device_training_batch_size 16 \ + --generation_batches 1 \ --ppo_epochs 1 \ --max_answer_seq_len 256 \ --max_prompt_seq_len 256 \ @@ -48,7 +48,7 @@ deepspeed --master_port 12346 main.py \ --actor_zero_stage $ACTOR_ZERO_STAGE \ --critic_zero_stage $CRITIC_ZERO_STAGE \ --actor_gradient_checkpointing \ - --disable_actor_dropout \ + --actor_dropout 0.0 \ --actor_lora_dim 128 \ --actor_lora_module_name decoder.layers. 
\ --output_dir $OUTPUT \ diff --git a/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/single_node/run_30b_lora.sh b/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/opt/single_node/run_30b_lora.sh similarity index 89% rename from applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/single_node/run_30b_lora.sh rename to applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/opt/single_node/run_30b_lora.sh index 2846ac1a4..c5c9133ff 100644 --- a/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/single_node/run_30b_lora.sh +++ b/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/opt/single_node/run_30b_lora.sh @@ -26,9 +26,9 @@ deepspeed --master_port 12346 main.py \ --actor_model_name_or_path $ACTOR_MODEL_PATH \ --critic_model_name_or_path $CRITIC_MODEL_PATH \ --num_padding_at_beginning 1 \ - --per_device_train_batch_size 4 \ - --per_device_mini_train_batch_size 4 \ - --generation_batch_numbers 1 \ + --per_device_generation_batch_size 4 \ + --per_device_training_batch_size 4 \ + --generation_batches 1 \ --ppo_epochs 1 \ --max_answer_seq_len 256 \ --max_prompt_seq_len 256 \ @@ -38,7 +38,7 @@ deepspeed --master_port 12346 main.py \ --lr_scheduler_type cosine \ --gradient_accumulation_steps 1 \ --actor_gradient_checkpointing \ - --disable_actor_dropout \ + --actor_dropout 0.0 \ --num_warmup_steps 100 \ --deepspeed --seed 1234 \ ${ACTOR_ZERO_STAGE} \ diff --git a/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/single_node/run_6.7b.sh b/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/opt/single_node/run_6.7b.sh similarity index 90% rename from applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/single_node/run_6.7b.sh rename to applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/opt/single_node/run_6.7b.sh index 1fb66313d..f877bebdf 100644 --- a/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/single_node/run_6.7b.sh +++ b/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/opt/single_node/run_6.7b.sh @@ -30,9 +30,9 @@ deepspeed --master_port 12346 main.py \ --actor_model_name_or_path $ACTOR_MODEL_PATH \ --critic_model_name_or_path $CRITIC_MODEL_PATH \ --num_padding_at_beginning 1 \ - --per_device_train_batch_size 4 \ - --per_device_mini_train_batch_size 4 \ - --generation_batch_numbers 1 \ + --per_device_generation_batch_size 4 \ + --per_device_training_batch_size 4 \ + --generation_batches 1 \ --ppo_epochs 1 \ --max_answer_seq_len 256 \ --max_prompt_seq_len 256 \ @@ -44,7 +44,7 @@ deepspeed --master_port 12346 main.py \ --lr_scheduler_type cosine \ --gradient_accumulation_steps 1 \ --actor_gradient_checkpointing \ - --disable_actor_dropout \ + --actor_dropout 0.0 \ --num_warmup_steps 100 \ --deepspeed --seed 1234 \ --enable_hybrid_engine \ diff --git a/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/single_node/sweep/README.md b/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/opt/single_node/sweep/README.md similarity index 89% rename from applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/single_node/sweep/README.md rename to applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/opt/single_node/sweep/README.md index 575b1b049..8f9a75271 100644 --- 
a/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/single_node/sweep/README.md +++ b/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/opt/single_node/sweep/README.md @@ -18,5 +18,5 @@ The `run_step3_sweep.sh` script passes configuration arguments to `run_single.sh # Usage The sweep script can be run as follows:
-DeepSpeedExamples/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning$ bash training_scripts/single_node/sweep/run_step3_sweep.sh
+DeepSpeedExamples/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning$ bash training_scripts/opt/single_node/sweep/run_step3_sweep.sh
 
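For reference, the sweep wrapper above simply forwards a fixed set of positional arguments to `run_single.sh`; the diff below adds a `MIXED_PRECISION_LORA` flag plus optional `TEST`/`TEST_STOP_STEP` parameters to that list. A minimal sketch of an equivalent direct invocation is shown here; the model names and sweep values are illustrative examples only, and the authoritative argument order is the one defined at the top of `run_single.sh` in the following diff.

```bash
# Sketch: direct invocation of run_single.sh with the updated positional arguments.
# Order: ACTOR_MODEL_PATH CRITIC_MODEL_PATH ACTOR_ZERO_STAGE CRITIC_ZERO_STAGE
#        ENABLE_HYBRID_ENGINE OFFLOAD LORA MIXED_PRECISION_LORA OUTPUT [TEST TEST_STOP_STEP]
# The final two values are examples: they enable test mode and stop after 5 steps.
cd DeepSpeedExamples/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning
bash training_scripts/opt/single_node/sweep/run_single.sh \
    AdamG012/chat-opt-1.3b-sft-deepspeed \
    AdamG012/chat-opt-350m-reward-deepspeed \
    3 3 \
    true false true \
    false \
    z3_he_true_offload_false_lora_true \
    true 5
```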
diff --git a/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/single_node/sweep/run_single.sh b/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/opt/single_node/sweep/run_single.sh similarity index 73% rename from applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/single_node/sweep/run_single.sh rename to applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/opt/single_node/sweep/run_single.sh index 53ed9a803..15ec6e576 100644 --- a/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/single_node/sweep/run_single.sh +++ b/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/opt/single_node/sweep/run_single.sh @@ -10,7 +10,10 @@ CRITIC_ZERO_STAGE=$4 ENABLE_HYBRID_ENGINE=$5 OFFLOAD=$6 LORA=$7 -OUTPUT=$8 +MIXED_PRECISION_LORA=$8 +OUTPUT=$9 +TEST=${10} +TEST_STOP_STEP=${11} if [ "$ACTOR_ZERO_STAGE" == "" ]; then ACTOR_ZERO_STAGE=2 @@ -40,6 +43,20 @@ else ACTOR_LORA_MODULE_NAME="" fi +if [ "$MIXED_PRECISION_LORA" == true ]; then + MIXED_PRECISION_LORA="--enable_mixed_precision_lora" +else + MIXED_PRECISION_LORA="" +fi + +if [ "$TEST" == true ]; then + TEST="--enable_test_mode" + TEST_STOP_STEP="--test_stop_step ${TEST_STOP_STEP}" +else + TEST="" + TEST_STOP_STEP="" +fi + mkdir -p $OUTPUT Num_Padding_at_Beginning=1 # this is model related @@ -53,9 +70,9 @@ cmd="deepspeed --num_nodes=1 main.py \ --actor_model_name_or_path $ACTOR_MODEL_PATH \ --critic_model_name_or_path $CRITIC_MODEL_PATH \ --num_padding_at_beginning 1 \ - --per_device_train_batch_size 4 \ - --per_device_mini_train_batch_size 4 \ - --generation_batch_numbers 1 \ + --per_device_generation_batch_size 4 \ + --per_device_training_batch_size 4 \ + --generation_batches 1 \ --ppo_epochs 1 \ --max_answer_seq_len 256 \ --max_prompt_seq_len 256 \ @@ -68,13 +85,14 @@ cmd="deepspeed --num_nodes=1 main.py \ --critic_weight_decay 0 \ --num_warmup_steps 100 \ --deepspeed --seed 1234 \ - --disable_actor_dropout \ + --actor_dropout 0.0 \ --print_answers \ --actor_zero_stage ${ACTOR_ZERO_STAGE} \ --critic_zero_stage ${CRITIC_ZERO_STAGE} \ --output_dir $OUTPUT \ - $ENABLE_HYBRID_ENGINE $OFFLOAD $UNPIN_ACTOR_PARAMETERS \ - $ACTOR_LORA_DIM $ACTOR_LORA_MODULE_NAME" + $ENABLE_HYBRID_ENGINE $OFFLOAD $MIXED_PRECISION_LORA \ + $ACTOR_LORA_DIM $ACTOR_LORA_MODULE_NAME\ + $TEST $TEST_STOP_STEP" echo "----------------------------- DS COMMAND -----------------------------" echo $cmd diff --git a/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/opt/single_node/sweep/run_step3_sweep.sh b/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/opt/single_node/sweep/run_step3_sweep.sh new file mode 100644 index 000000000..c0d0b98d7 --- /dev/null +++ b/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/opt/single_node/sweep/run_step3_sweep.sh @@ -0,0 +1,81 @@ +#!/bin/bash +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +ACTOR_MODEL_PATH="AdamG012/chat-opt-1.3b-sft-deepspeed" +CRITIC_MODEL_PATH="AdamG012/chat-opt-350m-reward-deepspeed" + +# Sweep switches +RUN_GENERIC_SWEEP=true +RUN_MPL_SWEEP=true + +# Kill any existing Python processes +pkill -9 python +sleep 300 + +# Run generic sweep w/o Mixed Precision ZeRO++ +if [ "$RUN_GENERIC_SWEEP" == true ]; then + echo "----------------------------- RUNNING GENERIC SWEEPS -----------------------------" + echo "" + for z in {2..3} + do + for he in true false + do + for offload in true false + do + for lora in true false + do + mixed_precision_lora=false + cmd="bash training_scripts/opt/single_node/sweep/run_single.sh \ + $ACTOR_MODEL_PATH \ + $CRITIC_MODEL_PATH \ + ${z} \ + ${z} \ + ${he} \ + ${offload} \ + ${lora} \ + ${mixed_precision_lora} \ + z${z}_he_${he}_offload_${offload}_lora_${lora}" + echo "----------------------------- CALLING SHELL SCRIPT -----------------------------" + echo $cmd + $cmd + pkill -9 python + sleep 300 + echo "" + done + done + done + done + echo "" +fi + +# Run Mixed Precision ZeRO++ sweep +if [ "$RUN_MPL_SWEEP" == true ]; then + echo "----------------------------- RUNNING MIXED PRECISION ZERO++ SWEEPS -----------------------------" + echo "" + for he in true false + do + z=3 + offload=false + lora=true + mixed_precision_lora=true + cmd="bash training_scripts/opt/single_node/sweep/run_single.sh \ + $ACTOR_MODEL_PATH \ + $CRITIC_MODEL_PATH \ + ${z} \ + ${z} \ + ${he} \ + ${offload} \ + ${lora} \ + ${mixed_precision_lora} \ + z${z}_he_${he}_offload_${offload}_lora_${lora}_mpl_${mixed_precision_lora}" + echo "----------------------------- CALLING SHELL SCRIPT -----------------------------" + echo $cmd + $cmd + pkill -9 python + sleep 300 + echo "" + done + echo "" +fi diff --git a/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/single_node/sweep/run_step3_sweep.sh b/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/single_node/sweep/run_step3_sweep.sh deleted file mode 100644 index 7aa24c48c..000000000 --- a/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/single_node/sweep/run_step3_sweep.sh +++ /dev/null @@ -1,35 +0,0 @@ -#!/bin/bash -# Copyright (c) Microsoft Corporation. -# SPDX-License-Identifier: Apache-2.0 - -# DeepSpeed Team -ACTOR_MODEL_PATH="AdamG012/chat-opt-1.3b-sft-deepspeed" -CRITIC_MODEL_PATH="AdamG012/chat-opt-350m-reward-deepspeed" - -for z in {2..3} -do - for he in true false - do - for offload in true false - do - for lora in true false - do - cmd="bash training_scripts/single_node/sweep/run_single.sh \ - $ACTOR_MODEL_PATH \ - $CRITIC_MODEL_PATH \ - ${z} \ - ${z} \ - ${he} \ - ${offload} \ - ${lora} \ - z${z}_he_${he}_offload_${offload}_lora_${lora}" - echo "----------------------------- CALLING SHELL SCRIPT -----------------------------" - echo $cmd - $cmd - pkill -9 python - sleep 60 - echo "" - done - done - done -done diff --git a/applications/DeepSpeed-Chat/training/utils/model/model_utils.py b/applications/DeepSpeed-Chat/training/utils/model/model_utils.py deleted file mode 100644 index b5f4d89a5..000000000 --- a/applications/DeepSpeed-Chat/training/utils/model/model_utils.py +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright (c) Microsoft Corporation. 
-# SPDX-License-Identifier: Apache-2.0 - -# DeepSpeed Team -import os -import math -import torch -from transformers import ( - AutoConfig, - AutoModel, -) -from huggingface_hub import snapshot_download -from transformers.deepspeed import HfDeepSpeedConfig - -from .reward_model import RewardModel - - -def create_hf_model(model_class, - model_name_or_path, - tokenizer, - ds_config=None, - rlhf_training=False, - disable_dropout=False): - model_config = AutoConfig.from_pretrained(model_name_or_path) - if disable_dropout: - model_config.dropout = 0.0 - # Note: dschf is defined in function scope to avoid global effects - # https://huggingface.co/docs/transformers/main_classes/deepspeed#nontrainer-deepspeed-integration - if ds_config is not None and ds_config["zero_optimization"]["stage"] == 3: - dschf = HfDeepSpeedConfig(ds_config) - else: - dschf = None - if rlhf_training: - # the weight loading is handled by create critic model - model = model_class.from_config(model_config) - else: - model = model_class.from_pretrained( - model_name_or_path, - from_tf=bool(".ckpt" in model_name_or_path), - config=model_config) - - model.config.end_token_id = tokenizer.eos_token_id - model.config.pad_token_id = model.config.eos_token_id - model.resize_token_embeddings(int( - 8 * - math.ceil(len(tokenizer) / 8.0))) # make the vocab size multiple of 8 - - return model - - -def create_critic_model(model_name_or_path, - tokenizer, - ds_config, - num_padding_at_beginning=0, - rlhf_training=False, - disable_dropout=False): - # OPT model family always put a padding token at the beginning of the sequence, - # we did not see this in other models but not sure if it is a general rule - critic_model = create_hf_model(AutoModel, model_name_or_path, tokenizer, - ds_config, rlhf_training, disable_dropout) - critic_model = RewardModel( - critic_model, - tokenizer, - num_padding_at_beginning=num_padding_at_beginning) - - if rlhf_training: - if not os.path.isdir(model_name_or_path): - model_name_or_path = snapshot_download(model_name_or_path) - # critic model needs to load the weight here - model_ckpt_path = os.path.join(model_name_or_path, 'pytorch_model.bin') - assert os.path.exists( - model_ckpt_path - ), f"Cannot find model checkpoint at {model_ckpt_path}" - critic_model.load_state_dict( - torch.load(model_ckpt_path, map_location='cpu')) - - return critic_model diff --git a/applications/DeepSpeed-VisualChat/README.md b/applications/DeepSpeed-VisualChat/README.md new file mode 100755 index 000000000..56649dd7b --- /dev/null +++ b/applications/DeepSpeed-VisualChat/README.md @@ -0,0 +1,116 @@ +# DeepSpeed-VisualChat + +An easy-to-use, scalable, and efficient multi-modal training pipeline for multi-round multi-image interleave chat experience. 
+ + +## Table of Contents + +- [πŸ“° Latest News πŸ“°](#-latest-news-) +- [πŸš€ What is DeepSpeed-VisualChat πŸš€οΈ](#-what-is-deepspeed-visualchat-) +- [βš“ Get Started, Tutorial, and Documentation βš“](#-get-started-tutorial-documentation-) +- [🌱 DeepSpeed-VisualChat's Roadmap 🌱](#-deepspeed-visualchats-roadmap-) +- [πŸ’¬ DeepSpeed-VisualChat and DeepSpeed Community πŸ’¬](#-deepspeed-visualchat-and-deepspeed-community-) +- [πŸ™ Acknowledgement and Citation πŸ™](#-acknowledgement-and-citation-) + + + +## πŸ“° Latest News πŸ“° + +* ***[2023/10] [DeepSpeed-VisualChat: Improve Your Chat Experience with Multi-Round Multi-Image Inputs](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-visualchat/10-03-2023/README.md)*** + +⭐ If you find our [DeepSpeed](https://github.com/microsoft/DeepSpeed) and [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples) repositories beneficial, please give them a star on GitHub! To cite DeepSpeed-VisualChat, please cite our [arxiv report](https://arxiv.org/abs/2309.14327): + +``` +@article{yao2023deepspeed-visualchat, + title={{DeepSpeed-VisualChat: Multi-Round Multi-Image Interleave Chat via Multi-Modal Causal Attention}}, + author={Zhewei Yao and Xiaoxia Wu and Conglong Li and Minjia Zhang and Heyang Qin and Olatunji Ruwase and Ammar Ahmad Awan and Samyam Rajbhandari and Yuxiong He}, + journal={arXiv preprint arXiv:2309.14327}, + year={2023} +} +``` + +## πŸš€ What is DeepSpeed-VisualChat πŸš€ +
+ +DeepSpeed-VisualChat Banner! +Figure 1. On the left is a DeepSpeed-VisualChat model, featuring an innovative attention design. On the right is an example of DeepSpeed-VisualChat. + +
+ +--- + +With increasing interest in enabling the multi-modal capabilities of large language models, DeepSpeed is proud to announce a new training pipeline, named ***DeepSpeed-VisualChat***. This is designed for enabling a multi-round, multi-image interleave chat framework. It enhances the language model with image understanding and reasoning capabilities. Unlike the majority of open-sourced multi-modal projects, the primary focus of DeepSpeed-VisualChat is to provide a multi-round, multi-image interleave chat experience, as illustrated in Figure 1. + +To improve model quality without introducing new parameters, DeepSpeed-VisualChat incorporates a new multi-modal causal attention mechanism, which is adept at better aligning visual and text features. Additionally, to overcome the scarcity of interleaved text-and-image inputs in most available open-sourced datasets, we employ various data blending techniques on existing datasets. + +Thanks to the scalable, efficient, and user-friendly nature of the DeepSpeed ecosystem, we have the capability to train using a 2B visual encoder from QWen-VL (one is additionally refined from OpenClip) and a 70B language decoder from LLaMA-2. This showcases the extraordinary scalability of the DeepSpeed-VisualChat framework. + + + + + +## βš“ Get Started, Tutorial, and Documents βš“ + +### 🐼 Installation + + +```bash +git clone https://github.com/microsoft/DeepSpeedExamples.git +cd DeepSpeedExamples/applications/DeepSpeed-VisualChat/ +pip install -r requirements.txt +``` + +### 🐼 Datasets Preparation + +Table below summarizes where to download the datasets that we support. `{data_path}` denotes the `--data_path` argument provided in training scripts. + +| Dataset name | Where to download | +|--------------|-------------------| +| aokvqa | Download `2017 Train images [118K/18GB]` from [https://cocodataset.org/#download](https://cocodataset.org/#download) and save at `{data_path}/coco/train2017/`. Download `aokvqa_v1p0_train.json` from [https://allenai.org/project/a-okvqa/home](https://allenai.org/project/a-okvqa/home) and save at `{data_path}/aokvqa/annotations/`. | +| coco_caption | Download 2014 Train images and 2014 Val images from [https://cocodataset.org/#download](https://cocodataset.org/#download) and save all images at `{data_path}/coco/2014/`. Download `dataset.json` from [https://cs.stanford.edu/people/karpathy/deepimagesent/coco.zip](https://cs.stanford.edu/people/karpathy/deepimagesent/coco.zip) and save at `{data_path}/coco_caption/`. | +| llava | Download `2017 Train images [118K/18GB]` from [https://cocodataset.org/#download](https://cocodataset.org/#download) and save at `{data_path}/coco/train2017/`. Download `detail_23k.json` and `complex_reasoning_77k.json` from [https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K](https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K) and save at `{data_path}/llava/`. | +| llava_dial | Download `2017 Train images [118K/18GB]` from [https://cocodataset.org/#download](https://cocodataset.org/#download) and save at `{data_path}/coco/train2017/`. Download `conversation_58k.json` from [https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K](https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K) and save at `{data_path}/llava/`. | +| llava_otter_blend | Follow instructions of the llava, llava_dial, and otter_mimicit_cgd datasets. 
| minigpt4 | Download `image` folder and `filter_cap.json` from [https://huggingface.co/datasets/Vision-CAIR/cc_sbu_align](https://huggingface.co/datasets/Vision-CAIR/cc_sbu_align) and save at `{data_path}/cc_sbu_align/`. |
+| ocr_vqa | Download `images` folder and `dataset.json` from [https://ocr-vqa.github.io/](https://ocr-vqa.github.io/) and save at `{data_path}/OCR_VQA/`. |
+| otter_mimicit_cgd | Download `2017 Train images [118K/18GB]` from [https://cocodataset.org/#download](https://cocodataset.org/#download) and save at `{data_path}/coco/train2017/`. Download `CGD_instructions.json` from [https://huggingface.co/datasets/pufanyi/MIMICIT](https://huggingface.co/datasets/pufanyi/MIMICIT) and save at `{data_path}/MIMIC-IT/`. |
+| otter_mimicit_sd | Download `SD.json` and `SD_instructions.json` from [https://huggingface.co/datasets/pufanyi/MIMICIT](https://huggingface.co/datasets/pufanyi/MIMICIT) and save at `{data_path}/MIMIC-IT/`. |
+| otter_mimicit_sn | Download `SN.json` and `SN_instructions.json` from [https://huggingface.co/datasets/pufanyi/MIMICIT](https://huggingface.co/datasets/pufanyi/MIMICIT) and save at `{data_path}/MIMIC-IT/`. |
+| otter_mimicit_tvc | Download `TVC.json` and `TVC_instructions.json` from [https://huggingface.co/datasets/pufanyi/MIMICIT](https://huggingface.co/datasets/pufanyi/MIMICIT) and save at `{data_path}/MIMIC-IT/`. |
+| otter_mimicit_vst | Download `VST.json` and `VST_instructions.json` from [https://huggingface.co/datasets/pufanyi/MIMICIT](https://huggingface.co/datasets/pufanyi/MIMICIT) and save at `{data_path}/MIMIC-IT/`. |
+| sparkles_dialogue | Download the `SparklesDialogueCC` and `SparklesDialogueVG` folders from the OneDrive link in [https://github.com/HYPJUDY/Sparkles](https://github.com/HYPJUDY/Sparkles) and save at `{data_path}/`. |
+
+### 🐼 Training, Evaluation, Chat API, and Helper
+Please refer to
+ - [**Training**](./training/README.md)
+ - [**Evaluation**](./eval/README.md)
+ - [**Chat**](./chat/README.md)
+ - [**Helper**](./helper/README.md)
+
+
+## 🌱 DeepSpeed-VisualChat's Roadmap 🌱
+
+Our future plans include, but are not limited to:
+- [ ] Support more models
+- [ ] Demonstrate how to train larger models with higher model quality
+
+## πŸ’¬ DeepSpeed-VisualChat and DeepSpeed Community πŸ’¬
+
+Just like how the success of [the BLOOM model](https://huggingface.co/bigscience/bloom) was supported by both [DeepSpeed Team](https://github.com/bigscience-workshop/Megatron-DeepSpeed) and many [open source contributors](https://huggingface.co/bigscience), we welcome all AI developers/practitioners/researchers to join this on-going effort for DeepSpeed-VisualChat. To participate:
+- Show your support by leaving a star ⭐ on our [DeepSpeed](https://github.com/microsoft/DeepSpeed) and [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples) GitHub repositories.
+- Follow us on [twitter](https://twitter.com/MSFTDeepSpeed) to get notified about our latest news. For Chinese users, you can also follow our [Chinese Zhihu account](https://www.zhihu.com/people/deepspeed). For Japanese users, you can also follow our [Japanese twitter account](https://twitter.com/MSFTDeepSpeedJP).
+- Currently we prefer to interact with open source users mainly on GitHub so that it's easier for all users to search for related information. For bug reports, please submit a GitHub issue. For contributions, please submit a pull request (PR). For general questions/discussions, please open a new discussion or join any existing discussions.
+- We are open to collaborations with universities, research labs, and companies, such as working together on deep learning research, applying DeepSpeed to empower real-world AI models and applications, and so on. For such requests (and other requests unsuitable for GitHub), please directly email deepspeed-info@microsoft.com.
+
+
+## πŸ™ Acknowledgement and Citation πŸ™
+
+We thank the following papers and open-source repositories:
+
+ [1] LLaVa, https://github.com/haotian-liu/LLaVA
+ [2] Otter, https://github.com/Luodian/Otter
+ [3] Transformers Hugging Face, https://github.com/huggingface/transformers
+ [4] MiniGPT4, https://github.com/Vision-CAIR/MiniGPT-4
+ [5] QWen-VL, https://github.com/QwenLM/Qwen-VL
+ [6] Sparkles, https://github.com/HYPJUDY/Sparkles
+ [7] Multimodal-GPT, https://github.com/open-mmlab/Multimodal-GPT
diff --git a/applications/DeepSpeed-VisualChat/assets/banner.png b/applications/DeepSpeed-VisualChat/assets/banner.png
new file mode 100644
index 000000000..2684cafc2
Binary files /dev/null and b/applications/DeepSpeed-VisualChat/assets/banner.png differ
diff --git a/applications/DeepSpeed-VisualChat/assets/ceos.png b/applications/DeepSpeed-VisualChat/assets/ceos.png
new file mode 100644
index 000000000..e148f545a
Binary files /dev/null and b/applications/DeepSpeed-VisualChat/assets/ceos.png differ
diff --git a/applications/DeepSpeed-VisualChat/assets/friends.png b/applications/DeepSpeed-VisualChat/assets/friends.png
new file mode 100644
index 000000000..2689d8d4b
Binary files /dev/null and b/applications/DeepSpeed-VisualChat/assets/friends.png differ
diff --git a/applications/DeepSpeed-VisualChat/assets/hero-figure.png b/applications/DeepSpeed-VisualChat/assets/hero-figure.png
new file mode 100644
index 000000000..ca79b2c62
Binary files /dev/null and b/applications/DeepSpeed-VisualChat/assets/hero-figure.png differ
diff --git a/applications/DeepSpeed-VisualChat/assets/model.png b/applications/DeepSpeed-VisualChat/assets/model.png
new file mode 100644
index 000000000..dbd1f05c4
Binary files /dev/null and b/applications/DeepSpeed-VisualChat/assets/model.png differ
diff --git a/applications/DeepSpeed-VisualChat/chat/README.md b/applications/DeepSpeed-VisualChat/chat/README.md
new file mode 100644
index 000000000..155061f1a
--- /dev/null
+++ b/applications/DeepSpeed-VisualChat/chat/README.md
@@ -0,0 +1 @@
+We provide a CLI interface for users to test their trained chat model. First of all, please note that you need to provide both the trained checkpoint and the original language model & vision encoder paths. The model is first initialized and then loads the trained checkpoint. Also, please note that if you used multi-modal causal attention during training, remember to pass --enable_mmca_attention in your chat script.
\ No newline at end of file
diff --git a/applications/DeepSpeed-VisualChat/chat/chat.py b/applications/DeepSpeed-VisualChat/chat/chat.py
new file mode 100644
index 000000000..57634e957
--- /dev/null
+++ b/applications/DeepSpeed-VisualChat/chat/chat.py
@@ -0,0 +1,169 @@
+#!/usr/bin/env python
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +import argparse +import os +import sys +from PIL import Image + +import torch +import deepspeed + +sys.path.append( + os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir))) +from utils.utils import print_rank_0 +from utils.model import create_dsvl_model_and_transforms +import utils.data.DST as DST +from transformers import AutoTokenizer +from termcolor import colored +import re + +def parse_args(): + parser = argparse.ArgumentParser( + description= + "CLI chat") + parser.add_argument( + "--lm_model_name_or_path", + type=str, + help= + "Path to pretrained model or model identifier from huggingface.co/models.", + required=True, + ) + parser.add_argument("--vision_model_name_or_path", default="openai/clip-vit-large-patch14", type=str) + parser.add_argument( + "--checkpoint_path", + default=None, + type=str, + help="path to pretrained model", + ) + parser.add_argument( + "--max_seq_len", + type=int, + default=4096, + help="The maximum sequence length.", + ) + parser.add_argument( + "--generation_length_per_round", + type=int, + default=256, + help="The generation length per conversation round.", + ) + parser.add_argument( + "--enable_mmca_attention", + action='store_true', + help="enable the new proposed attn, which is similar to cross attention", + ) + parser.add_argument( + "--vis_proj", + type=str, + default='baseline', + help="baseline, vit, or perceiver", + ) + parser = deepspeed.add_config_arguments(parser) + args = parser.parse_args() + + return args + + +def get_user_text_input(): + tmp = input(colored("Enter input (type 'quit' to exit, 'clear' to clean memory): ", 'green')) + return tmp, tmp == "quit", tmp == "clear" + +def get_user_image_input(): + tmp = input(colored("Enter image pathes, seperate by space (only support one image per time for now) (type 'na' for empty image): ", 'blue')) + return tmp, not tmp == "na" + +def main(): + args = parse_args() + tokenizer = AutoTokenizer.from_pretrained(args.lm_model_name_or_path, + fast_tokenizer=True) + tokenizer.padding_side = 'right' + model, image_processor, tokenizer = create_dsvl_model_and_transforms( + text_tokenizer = tokenizer, + ds_config=None, + args=args, + ) + + model.load_state_dict(torch.load(os.path.join(args.checkpoint_path, 'pytorch_model.bin'), map_location='cpu'), strict=False) # Z3 wouldn't save pos embeddings (vis and rope) + + model = model.eval() + model.projection = model.projection.to('cuda') + model.vis_encoder = model.vis_encoder.to('cuda') + model = model.half() + print_rank_0(model) + + num_rounds = 0 + images = [] + system_instruct = [] + TEMPLATE = DST.Prompter() # get template + image_num_token_list = [DST.IMAGE_NUM_1, DST.IMAGE_NUM_2, DST.IMAGE_NUM_3, DST.IMAGE_NUM_4, DST.IMAGE_NUM_5, DST.IMAGE_NUM_6, DST.IMAGE_NUM_7, DST.IMAGE_NUM_8] + + while True: + num_rounds += 1 + while True: + # it is super easy to make mistake here, so we need to be careful + image_input, with_image = get_user_image_input() + if with_image: + try: + # seperate by space + image_paths = image_input.split(' ') + tmp_images = [] + for image_path in image_paths: + image = Image.open(image_path).convert('RGB') + tmp_image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0].unsqueeze(0).cuda().half() + tmp_images.append(tmp_image_tensor) # in case the last image path is wrong + except: + print(colored("Invalid image path, please try again", 'red')) + continue + if len(images) + len(tmp_images) > 8: + print(colored("Too many 
images, we at most support 8 images. please try again", 'red')) + continue + images = images + tmp_images # get all images + image_num = len(tmp_images) + break + else: + image_num = 0 + break + assert len(images) >= 1, "We need at least one image to begin the conversation for now." + if len(images) > 0: + image_tensor = torch.cat(images, dim=0) # cat all images + else: + image_tensor = None + + text_input, quit, clear = get_user_text_input() + if quit: + break + if clear: + num_rounds = 0 + images = [] + system_instruct = [] + image_num_token_list = [DST.IMAGE_NUM_1, DST.IMAGE_NUM_2, DST.IMAGE_NUM_3, DST.IMAGE_NUM_4, DST.IMAGE_NUM_5, DST.IMAGE_NUM_6, DST.IMAGE_NUM_7, DST.IMAGE_NUM_8] + continue + + + full_prompt = TEMPLATE(text_input, with_image=with_image, first_message=(num_rounds==1), num_images=image_num) + if with_image: + for i in range(image_num): + full_prompt = re.sub(DST.DEFAULT_HUMAN_IMAGE_PRETOKEN, image_num_token_list.pop(0), full_prompt, count=1) + + + full_prompt_ids = tokenizer(full_prompt).input_ids # remove bos token + + input_ids = torch.as_tensor([system_instruct + full_prompt_ids]).cuda() # entire input as system instruction for simplicity + generate_output = model.generate(image_tensor, input_ids, generation_length=args.generation_length_per_round) + extend_ids = generate_output[0].cpu().tolist()[0] + while extend_ids[-1] == tokenizer.pad_token_id: + extend_ids.pop() + while extend_ids[0] == tokenizer.bos_token_id: + extend_ids.pop(0) + system_instruct = system_instruct + full_prompt_ids + extend_ids # entire input as system instruction for simplicity + system_instruct = system_instruct + [tokenizer.eos_token_id] # add eos token + + print(f"=========== Round {num_rounds} ===========") + print(tokenizer.decode(system_instruct)) + + +if __name__ == "__main__": + main() diff --git a/applications/DeepSpeed-VisualChat/chat/chat_scripts/run.sh b/applications/DeepSpeed-VisualChat/chat/chat_scripts/run.sh new file mode 100644 index 000000000..8c193d520 --- /dev/null +++ b/applications/DeepSpeed-VisualChat/chat/chat_scripts/run.sh @@ -0,0 +1,18 @@ +#!/bin/bash +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +MAIN_PATH=$1 + +VISION_ENCODER=/blob/transformers_cache/qwen-clip +LLM=/blob/transformers_cache/Llama-2-13b-hf + +export CUDA_VISIBLE_DEVICES=0 # Do multi single evaluation +# export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 # Do multi gpu evaluation for large models (single GPU is not enough) + + +python chat.py \ + --lm_model_name_or_path $LLM \ + --vision_model_name_or_path $VISION_ENCODER \ + --checkpoint_path $MAIN_PATH --enable_mmca_attention diff --git a/applications/DeepSpeed-VisualChat/eval/README.md b/applications/DeepSpeed-VisualChat/eval/README.md new file mode 100644 index 000000000..e39bbf035 --- /dev/null +++ b/applications/DeepSpeed-VisualChat/eval/README.md @@ -0,0 +1,28 @@ +### β˜€οΈEvaluation +We provide a few examples to test the quality of the models. +To run the tests, use the `batch_generation.py` script, which will call the JSON file located in `eval_data/*.json`. +You will need to specify the model path where you've saved your checkpoints. For example, if you've saved your model checkpoint at $YOUR_CHECKPOINT_PATH/epoch-5/pytorch_model.bin, then pass the following arguments: +``` +--checkpoint_path $YOUR_CHECKPOINT_PATH --checkpoint_names epoch-5 +``` + +##### πŸƒ Run the Code +NOTE: Before you run the code `run_batch.sh`, please read it carefully. 
This bash script creates a folder eval/results/eval_comprehensive if you use the json evaluation "eval_comprehensive". It will write to "eval/results/eval_comprehensive/{args.output_filename}.csv" file with four columns. The generation output is in the last column. Please read one of our examples such as `eval/results/eval_comprehensive/ours-set1_final.csv`. +To run the code, you need to go to outside the current folder +``` +cd DeeSpeedExamples/applications/DeepSpeed-VisualChat +bash eval/run_batch.sh +``` + + +#### πŸ• Our Model Results Overview +We present the outcomes from our three distinct models, each trained with vision encoders: `qwen-clip` and `Llama-2-13b-hf`. + +###### Results Directories and Training Details: +- **results/eval_single:** + This directory contains results from the model trained with LoRA, featuring a dimension size of 128. + +- **results/eval_comprehensive** and **results/eval_robustness:** + These directories host results from two models: + - One model is trained excluding the Sparkles dataset (referred to as `ours-set1`). + - The other incorporates Sparkles dataset in the training (denoted as `ours-set2`). diff --git a/applications/DeepSpeed-VisualChat/eval/batch_generation.py b/applications/DeepSpeed-VisualChat/eval/batch_generation.py new file mode 100644 index 000000000..c22c2f6a9 --- /dev/null +++ b/applications/DeepSpeed-VisualChat/eval/batch_generation.py @@ -0,0 +1,226 @@ +#!/usr/bin/env python +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +import argparse +import os +import csv +import sys +from PIL import Image + +import torch +import deepspeed +sys.path.append( + os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir))) +from utils.utils import print_rank_0, to_device +from utils.model import create_dsvl_model_and_transforms as create_model_and_transforms +import utils.data.DST as DST + + + + +from typing import Iterable +from transformers import AutoTokenizer, set_seed +import json +import collections +import numpy as np +import random +def load_json(file_path): + with open(file_path, 'r') as json_file: + data = json.load(json_file) + return data + +# usage +def parse_args(): + parser = argparse.ArgumentParser( + description= + "Finetune a transformers model on a causal language modeling task") + parser.add_argument('--question-format', + type=str, + default="prompt_qa", + choices=['prompt_qa', 'prompt_choice'], + help='question-format') + parser.add_argument('--question', + type=str, + default="please describe the image", + help='question-format') + parser.add_argument( + "--lm_model_name_or_path", + type=str, + help= + "Path to pretrained model or model identifier from huggingface.co/models.", + required=True, + ) + parser.add_argument("--vision_model_name_or_path", default="openai/clip-vit-large-patch14", type=str) + parser.add_argument( + "--pretrained_path", + default=None, + type=str, + help="path to pretrained model", + ) + parser.add_argument( + "--image_token_length", + type=int, + default=256, + help="The maximum sequence length.", + ) + parser.add_argument( + "--max_seq_len", + type=int, + default=2048, + help="The maximum sequence length.", + ) + parser.add_argument( + "--checkpoint_path", + default=None, + type=str, + help="path to pretrained model", + ) + parser.add_argument('--checkpoint_names', + nargs='*', + default=['runing_check_stage2_v3_epoch10',], + help='Path to the training dataset. 
Accepted format:' + '1) a single data path, 2) multiple datasets in the' + 'form: dataset1-path dataset2-path ...') + parser.add_argument( + "--model_name", + default="dsvl", + type=str, + choices=["dsvl", "toy"], + help="path to pretrained model", + ) + parser.add_argument( + "--enable_mmca_attention", + action='store_true', + help="enable the new proposed attn, which is similar to cross attention", + ) + parser.add_argument( + "--vis_proj", + type=str, + default='baseline', + help="baseline, vit, or perceiver", + ) + parser.add_argument( + "--eval_data", + default="dsvl", + type=str, + help="path to eval data", + ) + parser.add_argument( + "--output_filename", + default="results", + type=str, + help="path to eval data", + ) + parser.add_argument( + "--seed", + type=int, + default=123, + help="The maximum sequence length.", + ) + parser = deepspeed.add_config_arguments(parser) + args = parser.parse_args() + + return args + +def main(): + args = parse_args() + with open(f'./eval/eval_data/{args.eval_data}.json', 'r') as file: + data = json.load(file) + if args.seed is not None: + set_seed(args.seed) + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + torch.cuda.manual_seed_all(args.seed) + + tokenizer = AutoTokenizer.from_pretrained(args.lm_model_name_or_path, + fast_tokenizer=True) + tokenizer.padding_side = 'right' + model, image_processor, tokenizer = create_model_and_transforms( + text_tokenizer = tokenizer, + ds_config=None, + args=args, + ) + get_results = collections.defaultdict(list) + for ck_name in args.checkpoint_names: + ck_path = os.path.join(args.checkpoint_path, ck_name) + print (ck_path) + if ck_path is not None: + model.load_state_dict(torch.load(os.path.join(ck_path, 'pytorch_model.bin'), map_location='cpu'), strict=False) # Z3 wouldn't save pos embeddings (vis and rope) + else: + Warning("No checkpoint loaded so you cannot genereate meaningful results") + #model = model.cuda().half() + model = model.eval() + model.projection = model.projection.to('cuda') + model.vis_encoder = model.vis_encoder.to('cuda') + model = model.half() + print_rank_0(model) + for name in data.keys(): + question_image_list = data[name] + print (f'{args.eval_data}-------------------------------------{name}') + images = [] + system_instruct = [] + TEMPLATE = DST.Prompter() # get template + image_token_dict = DST.get_image_num_map(tokenizer) + image_num = 0 + for round, q_i_pair in enumerate(question_image_list): + # print(f'=========round {round+1}==============') + question = q_i_pair[0] + if len(q_i_pair) > 1: + # seperate by space + image_paths = q_i_pair[1].split(' ') + tmp_images = [] + for image_path in image_paths: + image = Image.open(image_path.strip()).convert('RGB') + tmp_image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0].unsqueeze(0).cuda().half() + tmp_images.append(tmp_image_tensor) + images = images + tmp_images # get all images + with_image = True + image_num = len(tmp_images) + else: + image_num = 0 + with_image = False + + if len(images) > 0: + image_tensor = torch.cat(images, dim=0) # cat all images + else: + raise ValueError("No image provided. 
Did not fix this in the modeling side yet.") + + full_prompt = TEMPLATE(question, with_image=with_image, first_message=(round==0), num_images=image_num) + full_prompt_ids = tokenizer(full_prompt).input_ids # remove bos token + if with_image: + image_number = len(images) + index = full_prompt_ids.index(image_token_dict[DST.DEFAULT_HUMAN_IMAGE_PRETOKEN]) + full_prompt_ids[index] = image_token_dict[DST.image_mapping_dict[str(image_number)]] + full_prompt_ids = DST.flatten(full_prompt_ids) + input_ids = torch.as_tensor([system_instruct + full_prompt_ids]).cuda() # entire input as system instruction for simplicity + print ('\n',round,question, '||', q_i_pair[-1] ) + + generate_output = model.generate(image_tensor, input_ids, + generation_length=256) + # generation_kwargs={ 'num_beams':2,'num_return_sequences':1,'top_p':1,'do_sample':True, 'temperature':1} + print('vanilla-->', generate_output[1]) + get_results[name].append([q_i_pair[-1], question, generate_output[1]]) + extend_ids = generate_output[0].cpu().tolist()[0] + while extend_ids[-1] == tokenizer.pad_token_id: + extend_ids.pop() + while extend_ids[0] == tokenizer.bos_token_id: + # llama-2 generates bos token at the beginning + extend_ids.pop(0) + system_instruct = system_instruct + full_prompt_ids + extend_ids # entire input as system instruction for simplicity + system_instruct = system_instruct + [tokenizer.eos_token_id] # add eos token + + with open(f'{args.output_filename}.csv', mode='w', newline='', encoding='utf-8') as file: + writer = csv.writer(file) + writer.writerow(['test_name', 'image_path', 'question', 'answer']) + for test_name, questions in get_results.items(): + for question in questions: + writer.writerow([test_name] + question) + + + + +if __name__ == "__main__": + main() diff --git a/applications/DeepSpeed-VisualChat/eval/eval_data/eval_comprehensive.json b/applications/DeepSpeed-VisualChat/eval/eval_data/eval_comprehensive.json new file mode 100644 index 000000000..a4d367b9b --- /dev/null +++ b/applications/DeepSpeed-VisualChat/eval/eval_data/eval_comprehensive.json @@ -0,0 +1,89 @@ +{ + "cat_images1": [ + ["Please describe the image in detail.", "./eval/eval_data/images/cats/1806905748_adb926a0a0.jpg"], + ["Please describe the image in detail.", "./eval/eval_data/images/cats/british_shorthair.jpg"], + ["Please describe the image in detail.", "./eval/eval_data/images/cats/cat.png"], + ["Are the colors of the three cats the same?"], + ["What are the differences between the first and third images?"], + ["What are the differences between the second and third images?"], + ["Is the cat in the first image in the sunshine?"] + ], + "cat_images2": [ + ["Please describe the image in detail.", "./eval/eval_data/images/cats/1806905748_adb926a0a0.jpg"], + ["Please describe the image in detail.", "./eval/eval_data/images/cats/british_shorthair.jpg"], + ["What are the differences between the two images?"], + ["Please describe the image in detail.", "./eval/eval_data/images/cats/cat.png"], + ["Are the colors of the three cats the same?"], + ["What are the differences between the first and third images?"], + ["What are the differences between the second and third images?"], + ["Is the cat in the first image in the sunshine?"], + ["Which cat do you prefer and why?"], + ["I prefer the second cat. 
It's so cute."], + ["Then why do you prefer the third cat more?"] + ], + "counting_people1": [ + ["Count the number of people in the image.", "./eval/eval_data/images/friends/can-count1.jpg"], + ["Count the number of people in the image.", "./eval/eval_data/images/friends/can-count2.jpg"], + ["What are the differences between the two images? Are they the same group of people? Explain why."], + ["Are you familiar with this TV series? Can you name the characters shown in the provided images? Who are they?"] + ], + "counting_people2":[ + ["How many individuals are depicted in the image?", "./eval/eval_data/images/friends/can-count1.jpg"], + ["How many individuals can you see in the second image?", "./eval/eval_data/images/friends/can-count2.jpg"], + ["Can you spot any differences between these two images? Do they represent the same set of people? Please provide a rationale."], + ["Do you recognize this TV show? Can you name the characters shown in the provided images? Who are they?"] + ], + "counting_people3": [ + ["Count the number of people in the image.", "./eval/eval_data/images/friends/wrong-count1.jpg"], + ["Count the number of people in the image.", "./eval/eval_data/images/friends/wrong-count2.jpg"], + ["What are the differences between the two images? Are they the same group of people? Explain why."] + ], + "counting_people4": [ + ["How many individuals are depicted in the image?", "./eval/eval_data/images/friends/wrong-count1.jpg"], + ["How many individuals are depicted in the image?", "./eval/eval_data/images/friends/wrong-count2.jpg"], + ["Can you spot any differences between these two images? Do they represent the same set of people? Please provide a rationale."], + ["Do you recognize this TV show? Can you name the characters shown in the provided images? Who are they?"] + ], + "zootopia_adventures1": [ + ["Please describe the image in detail.", "./eval/eval_data/images/zootopia/z1.png"], + ["Please describe the image in detail.", "./eval/eval_data/images/zootopia/z2.png"], + ["Can you name the characters in the images? Who are they? What are they doing?", "./eval/eval_data/images/zootopia/z3.png"], + ["You are an imaginative storyteller. Create a fascinating story based on the first, second and third image."], + ["Are you familiar with these characters? What movie are they from?"], + ["Can you name the characters in the images? Who are they?"], + ["In what type of environment or setting do these characters live? Describe it."] + ], + "zootopia_adventures2": [ + ["Create an engaging story strictly based on the images.", "./eval/eval_data/images/zootopia/z1.png ./eval/eval_data/images/zootopia/z2.png ./eval/eval_data/images/zootopia/z3.png"], + ["Do you recognize the setting or the characters in these images? 
Name the movie."], + ["Can you share some interesting facts or details about the characters shown in the images?"], + ["Which character do you find the most intriguing and why?"], + ["Based on the images, can you create some dialogues that the characters might say to each other in these situations?"] + ], + "zootopia_adventures3": [ + ["Examine and describe the characters' actions in the first image.", "./eval/eval_data/images/zootopia/z1.png"], + ["In the second image, what are the main characters doing, and how do they seem to feel?", "./eval/eval_data/images/zootopia/z2.png"], + ["Contrast the characters' moods and interactions in the two provided images."], + ["Imagine and narrate a hilarious situation involving the characters from the images.", "./eval/eval_data/images/zootopia/z3.png"], + ["Name the movie from which these characters are, and give a succinct summary of its plot."], + ["Create a funny and unexpected scenario that could unfold between the characters in these images."] + ], + "tech_ceos1": [ + ["Who is this person in this first image?", "./eval/eval_data/images/tech-ceo/jobs1.jpg"], + ["Who is this person in this second image?", "./eval/eval_data/images/tech-ceo/gate1.jpg"], + ["Who is this person in this third image?", "./eval/eval_data/images/tech-ceo/musk1.jpg"], + ["Recall who is in the second image."], + ["Recall who is in the first image."], + ["Is the person in the first image the founder of Apple? if not, which of the above images is the person the founder of Apple."], + ["Is the person in the third image the founder of Apple?"] + ], + "tech_ceos2": [ + ["Who is this person in the first image?", "./eval/eval_data/images/tech-ceo/musk1.jpg"], + ["Who is this person in the second image?", "./eval/eval_data/images/tech-ceo/gate1.jpg"], + ["Who is this person in the third image?", "./eval/eval_data/images/tech-ceo/jobs1.jpg"], + ["Recall who is in the second image."], + ["Recall who is in the first image."], + ["Is the person in the first image the founder of Apple? If not, which of the above images is the person the founder of Apple. "], + ["Is the person in the third image the founder of Apple?"] + ] +} diff --git a/applications/DeepSpeed-VisualChat/eval/eval_data/eval_robustness.json b/applications/DeepSpeed-VisualChat/eval/eval_data/eval_robustness.json new file mode 100644 index 000000000..16747af32 --- /dev/null +++ b/applications/DeepSpeed-VisualChat/eval/eval_data/eval_robustness.json @@ -0,0 +1,78 @@ +{ + "tech_ceos2.1a": [ + ["Who is this person in the image?", "./eval/eval_data/images/tech-ceo/jobs1.jpg"], + ["Who is this person in the image?", "./eval/eval_data/images/tech-ceo/gate1.jpg"], + ["Who is this person in the image?", "./eval/eval_data/images/tech-ceo/musk1.jpg"], + ["Recall who is in the second image."], + ["Recall who is in the first image."], + ["Is the person in the first image the founder of Apple? if not, which of the above images is the person the founder of Apple."], + ["Is the person in the third image the founder of Apple?"] + ], + "tech_ceos2.1b": [ + ["Who is this person in this image?", "./eval/eval_data/images/tech-ceo/jobs1.jpg"], + ["Who is this person in this image?", "./eval/eval_data/images/tech-ceo/gate1.jpg"], + ["Who is this person in this image?", "./eval/eval_data/images/tech-ceo/musk1.jpg"], + ["Recall who is in the second image."], + ["Recall who is in the first image."], + ["Is the person in the first image the founder of Apple? 
if not, which of the above images is the person the founder of Apple."], + ["Is the person in the third image the founder of Apple?"] + ], + "tech_ceos2.1c": [ + ["Who is this person in this image a?", "./eval/eval_data/images/tech-ceo/jobs1.jpg"], + ["Who is this person in this image b?", "./eval/eval_data/images/tech-ceo/gate1.jpg"], + ["Who is this person in this image c?", "./eval/eval_data/images/tech-ceo/musk1.jpg"], + ["Recall who is in image b."], + ["Recall who is in the image a."], + ["Is the person in the image a the founder of Apple? if not, which of the above images is the person the founder of Apple."], + ["Is the person in the image c the founder of Apple?"] + ], + "tech_ceos2.1d": [ + ["Who is this person in this first image?", "./eval/eval_data/images/tech-ceo/jobs1.jpg"], + ["Who is this person in this second image?", "./eval/eval_data/images/tech-ceo/gate1.jpg"], + ["Who is this person in this third image?", "./eval/eval_data/images/tech-ceo/musk1.jpg"], + ["Recall who is in the second image."], + ["Recall who is in the first image."], + ["Is the person in the first image the founder of Apple? if not, which of the above images is the person the founder of Apple."], + ["Is the person in the third image the founder of Apple?"] + ], + "tech_ceos2.1aa": [ + ["Who is this person in the image?", "./eval/eval_data/images/tech-ceo/jobs1.jpg"], + ["Who is this person in the image?", "./eval/eval_data/images/tech-ceo/gate1.jpg"], + ["What's the differnce between the first and second image"], + ["Who is this person in the image?", "./eval/eval_data/images/tech-ceo/musk1.jpg"], + ["Recall who is in the second image."], + ["Recall who is in the first image."], + ["Is the person in the first image the founder of Apple? if not, which of the above images is the person the founder of Apple."], + ["Is the person in the third image the founder of Apple?"] + ], + "tech_ceos2.1bb": [ + ["Who is this person in this image?", "./eval/eval_data/images/tech-ceo/jobs1.jpg"], + ["Who is this person in this image?", "./eval/eval_data/images/tech-ceo/gate1.jpg"], + ["What's the differnce between the first and second images"], + ["Who is this person in this image?", "./eval/eval_data/images/tech-ceo/musk1.jpg"], + ["Recall who is in the second image."], + ["Recall who is in the first image."], + ["Is the person in the first image the founder of Apple? if not, which of the above images is the person the founder of Apple."], + ["Is the person in the third image the founder of Apple?"] + ], + "tech_ceos2.1cc": [ + ["Who is this person in this image a?", "./eval/eval_data/images/tech-ceo/jobs1.jpg"], + ["Who is this person in this image b?", "./eval/eval_data/images/tech-ceo/gate1.jpg"], + ["What's the differnce between the image a and image b"], + ["Who is this person in this image c?", "./eval/eval_data/images/tech-ceo/musk1.jpg"], + ["Recall who is in image b."], + ["Recall who is in the image a."], + ["Is the person in the image a the founder of Apple? 
if not, which of the above images is the person the founder of Apple."], + ["Is the person in the image c the founder of Apple?"] + ], + "tech_ceos2.1dd": [ + ["Who is this person in this first image?", "./eval/eval_data/images/tech-ceo/jobs1.jpg"], + ["Who is this person in this second image?", "./eval/eval_data/images/tech-ceo/gate1.jpg"], + ["What's the differnce between the first and second images"], + ["Who is this person in this third image?", "./eval/eval_data/images/tech-ceo/musk1.jpg"], + ["Recall who is in the second image."], + ["Recall who is in the first image."], + ["Is the person in the first image the founder of Apple? if not, which of the above images is the person the founder of Apple."], + ["Is the person in the third image the founder of Apple?"] + ] +} diff --git a/applications/DeepSpeed-VisualChat/eval/eval_data/eval_single.json b/applications/DeepSpeed-VisualChat/eval/eval_data/eval_single.json new file mode 100644 index 000000000..42a7ad95e --- /dev/null +++ b/applications/DeepSpeed-VisualChat/eval/eval_data/eval_single.json @@ -0,0 +1,11 @@ +{ +"cat_images1": [["please describe the image", "./eval/eval_data/images/cats/cat.png"]], +"cat_images2": [["can you describe the image", "./eval/eval_data/images/cats/1806905748_adb926a0a0.jpg"]], +"cat_images3": [["please describe the image", "./eval/eval_data/images/cats/british_shorthair.jpg"]], +"extreme_ironing": [["What is unusual about this image?", "./eval/eval_data/images/singles/extreme_ironing.jpg"]], +"waterview": [["What are the things I should be cautious about when I visit here?", "./eval/eval_data/images/singles/waterview.jpg"]], +"art-dog": [["can you describe the image", "./eval/eval_data/images/singles/202160027_b319c4166e.jpg"]], +"funny-phone": [["What is funny about this image? 
Describe it panel by panel.", "./eval/eval_data/images/singles/1.jpg"]], +"squirrel": [["Why would a person find this image funny?", "./eval/eval_data/images/singles/2.jpg"]], +"art-painting": [["Tell me about this work of art.", "./eval/eval_data/images/singles/50.jpg"]] +} diff --git a/applications/DeepSpeed-VisualChat/eval/eval_data/images/cats/1806905748_adb926a0a0.jpg b/applications/DeepSpeed-VisualChat/eval/eval_data/images/cats/1806905748_adb926a0a0.jpg new file mode 100644 index 000000000..100eccc42 Binary files /dev/null and b/applications/DeepSpeed-VisualChat/eval/eval_data/images/cats/1806905748_adb926a0a0.jpg differ diff --git a/applications/DeepSpeed-VisualChat/eval/eval_data/images/cats/british_shorthair.jpg b/applications/DeepSpeed-VisualChat/eval/eval_data/images/cats/british_shorthair.jpg new file mode 100644 index 000000000..b61731c62 Binary files /dev/null and b/applications/DeepSpeed-VisualChat/eval/eval_data/images/cats/british_shorthair.jpg differ diff --git a/applications/DeepSpeed-VisualChat/eval/eval_data/images/cats/cat.png b/applications/DeepSpeed-VisualChat/eval/eval_data/images/cats/cat.png new file mode 100644 index 000000000..1a48d45c4 Binary files /dev/null and b/applications/DeepSpeed-VisualChat/eval/eval_data/images/cats/cat.png differ diff --git a/applications/DeepSpeed-VisualChat/eval/eval_data/images/friends/can-count1.jpg b/applications/DeepSpeed-VisualChat/eval/eval_data/images/friends/can-count1.jpg new file mode 100644 index 000000000..b29d3a97d Binary files /dev/null and b/applications/DeepSpeed-VisualChat/eval/eval_data/images/friends/can-count1.jpg differ diff --git a/applications/DeepSpeed-VisualChat/eval/eval_data/images/friends/can-count2.jpg b/applications/DeepSpeed-VisualChat/eval/eval_data/images/friends/can-count2.jpg new file mode 100644 index 000000000..b09d1694a Binary files /dev/null and b/applications/DeepSpeed-VisualChat/eval/eval_data/images/friends/can-count2.jpg differ diff --git a/applications/DeepSpeed-VisualChat/eval/eval_data/images/friends/wrong-count1.jpg b/applications/DeepSpeed-VisualChat/eval/eval_data/images/friends/wrong-count1.jpg new file mode 100644 index 000000000..2d4b1b958 Binary files /dev/null and b/applications/DeepSpeed-VisualChat/eval/eval_data/images/friends/wrong-count1.jpg differ diff --git a/applications/DeepSpeed-VisualChat/eval/eval_data/images/friends/wrong-count2.jpg b/applications/DeepSpeed-VisualChat/eval/eval_data/images/friends/wrong-count2.jpg new file mode 100644 index 000000000..08ac55fe2 Binary files /dev/null and b/applications/DeepSpeed-VisualChat/eval/eval_data/images/friends/wrong-count2.jpg differ diff --git a/applications/DeepSpeed-VisualChat/eval/eval_data/images/singles/1.jpg b/applications/DeepSpeed-VisualChat/eval/eval_data/images/singles/1.jpg new file mode 100644 index 000000000..69984e57b Binary files /dev/null and b/applications/DeepSpeed-VisualChat/eval/eval_data/images/singles/1.jpg differ diff --git a/applications/DeepSpeed-VisualChat/eval/eval_data/images/singles/2.jpg b/applications/DeepSpeed-VisualChat/eval/eval_data/images/singles/2.jpg new file mode 100644 index 000000000..ca1232162 Binary files /dev/null and b/applications/DeepSpeed-VisualChat/eval/eval_data/images/singles/2.jpg differ diff --git a/applications/DeepSpeed-VisualChat/eval/eval_data/images/singles/202160027_b319c4166e.jpg b/applications/DeepSpeed-VisualChat/eval/eval_data/images/singles/202160027_b319c4166e.jpg new file mode 100644 index 000000000..8628f3d7b Binary files /dev/null and 
b/applications/DeepSpeed-VisualChat/eval/eval_data/images/singles/202160027_b319c4166e.jpg differ diff --git a/applications/DeepSpeed-VisualChat/eval/eval_data/images/singles/50.jpg b/applications/DeepSpeed-VisualChat/eval/eval_data/images/singles/50.jpg new file mode 100644 index 000000000..f23f0548d Binary files /dev/null and b/applications/DeepSpeed-VisualChat/eval/eval_data/images/singles/50.jpg differ diff --git a/applications/DeepSpeed-VisualChat/eval/eval_data/images/singles/extreme_ironing.jpg b/applications/DeepSpeed-VisualChat/eval/eval_data/images/singles/extreme_ironing.jpg new file mode 100644 index 000000000..638b07883 Binary files /dev/null and b/applications/DeepSpeed-VisualChat/eval/eval_data/images/singles/extreme_ironing.jpg differ diff --git a/applications/DeepSpeed-VisualChat/eval/eval_data/images/singles/waterview.jpg b/applications/DeepSpeed-VisualChat/eval/eval_data/images/singles/waterview.jpg new file mode 100644 index 000000000..6f44ebaba Binary files /dev/null and b/applications/DeepSpeed-VisualChat/eval/eval_data/images/singles/waterview.jpg differ diff --git a/applications/DeepSpeed-VisualChat/eval/eval_data/images/tech-ceo/gate1.jpg b/applications/DeepSpeed-VisualChat/eval/eval_data/images/tech-ceo/gate1.jpg new file mode 100644 index 000000000..b7b747294 Binary files /dev/null and b/applications/DeepSpeed-VisualChat/eval/eval_data/images/tech-ceo/gate1.jpg differ diff --git a/applications/DeepSpeed-VisualChat/eval/eval_data/images/tech-ceo/jobs1.jpg b/applications/DeepSpeed-VisualChat/eval/eval_data/images/tech-ceo/jobs1.jpg new file mode 100644 index 000000000..18e8d35e9 Binary files /dev/null and b/applications/DeepSpeed-VisualChat/eval/eval_data/images/tech-ceo/jobs1.jpg differ diff --git a/applications/DeepSpeed-VisualChat/eval/eval_data/images/tech-ceo/musk1.jpg b/applications/DeepSpeed-VisualChat/eval/eval_data/images/tech-ceo/musk1.jpg new file mode 100644 index 000000000..7f2abfe89 Binary files /dev/null and b/applications/DeepSpeed-VisualChat/eval/eval_data/images/tech-ceo/musk1.jpg differ diff --git a/applications/DeepSpeed-VisualChat/eval/eval_data/images/zootopia/z1.png b/applications/DeepSpeed-VisualChat/eval/eval_data/images/zootopia/z1.png new file mode 100644 index 000000000..fdb9d8db9 Binary files /dev/null and b/applications/DeepSpeed-VisualChat/eval/eval_data/images/zootopia/z1.png differ diff --git a/applications/DeepSpeed-VisualChat/eval/eval_data/images/zootopia/z2.png b/applications/DeepSpeed-VisualChat/eval/eval_data/images/zootopia/z2.png new file mode 100644 index 000000000..57766b181 Binary files /dev/null and b/applications/DeepSpeed-VisualChat/eval/eval_data/images/zootopia/z2.png differ diff --git a/applications/DeepSpeed-VisualChat/eval/eval_data/images/zootopia/z2a.png b/applications/DeepSpeed-VisualChat/eval/eval_data/images/zootopia/z2a.png new file mode 100644 index 000000000..79f30e02d Binary files /dev/null and b/applications/DeepSpeed-VisualChat/eval/eval_data/images/zootopia/z2a.png differ diff --git a/applications/DeepSpeed-VisualChat/eval/eval_data/images/zootopia/z3.png b/applications/DeepSpeed-VisualChat/eval/eval_data/images/zootopia/z3.png new file mode 100644 index 000000000..8ccd23f53 Binary files /dev/null and b/applications/DeepSpeed-VisualChat/eval/eval_data/images/zootopia/z3.png differ diff --git a/applications/DeepSpeed-VisualChat/eval/eval_scripts/run_batch.sh b/applications/DeepSpeed-VisualChat/eval/eval_scripts/run_batch.sh new file mode 100644 index 000000000..dd79b0a00 --- /dev/null +++ 
b/applications/DeepSpeed-VisualChat/eval/eval_scripts/run_batch.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+# DeepSpeed Team
+
+#EVAL_DATASET=eval_robustness eval_single eval_comprehensive (see the JSON files in the folder ./eval_data/*.json)
+MAIN_PATH=$1
+VISION_MODEL=/blob/transformers_cache/qwen-clip #openai/clip-vit-large-patch14
+LLM=/blob/transformers_cache/Llama-2-13b-hf #meta-llama/Llama-2-7b
+for EVAL_DATASET in eval_single eval_comprehensive eval_robustness
+do
+    SAVE_PATH=eval/results/${EVAL_DATASET}
+    mkdir -p ${SAVE_PATH}
+    for CKPT_NAME in final best_eval
+    do
+        #NOTE: to run multi-GPU, you simply do "export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7;"
+        export CUDA_VISIBLE_DEVICES=0; python eval/batch_generation.py --model_name dsvl --vis_proj baseline --max_seq_len 4096 \
+            --lm_model_name_or_path ${LLM} --vision_model_name_or_path ${VISION_MODEL} \
+            --checkpoint_path $MAIN_PATH --checkpoint_names $CKPT_NAME --eval_data ${EVAL_DATASET} \
+            --enable_mmca_attention --output_filename ${SAVE_PATH}/ours_${CKPT_NAME} &> ${SAVE_PATH}/ours_${CKPT_NAME}.log
+    done
+done
+
diff --git a/applications/DeepSpeed-VisualChat/eval/results/eval_comprehensive/ours-set1_best_eval.csv b/applications/DeepSpeed-VisualChat/eval/results/eval_comprehensive/ours-set1_best_eval.csv
new file mode 100644
index 000000000..dd83ebf25
--- /dev/null
+++ b/applications/DeepSpeed-VisualChat/eval/results/eval_comprehensive/ours-set1_best_eval.csv
@@ -0,0 +1,78 @@
+test_name,image_path,question,answer
+cat_images1,./eval/eval_data/images/cats/1806905748_adb926a0a0.jpg,Please describe the image in detail.,"The image shows a black cat lying on its back on a carpeted floor. The cat has its paws up in the air and is looking directly at the camera. The cat's eyes are yellow, and it appears to be relaxed and comfortable in its position. The background of the image is not particularly distinct, but the focus is on the cat and its pose."
+cat_images1,./eval/eval_data/images/cats/british_shorthair.jpg,Please describe the image in detail.,"The image shows a gray and white cat lying on a couch, with its paws stretched out and its head resting on the couch. The cat appears to be relaxed and comfortable in its position."
+cat_images1,./eval/eval_data/images/cats/cat.png,Please describe the image in detail.,"The image shows a brown and white cat standing on a green leafy plant, with its paws up in the air and its head tilted to the side. The cat appears to be curious and alert, as it looks around its surroundings. The background of the image is not particularly distinct, but the focus is on the cat and its pose."
+cat_images1,Are the colors of the three cats the same?,Are the colors of the three cats the same?,"No, the colors of the three cats are different. The first cat is black, the second cat is gray and white, and the third cat is brown and white."
+cat_images1,What are the differences between the first and third images?,What are the differences between the first and third images?,The first image shows a black cat lying on a carpet while the third image shows a brown and white cat standing on a green leafy plant.
+cat_images1,What are the differences between the second and third images?,What are the differences between the second and third images?,The second image shows a gray and white cat lying on a couch while the third image shows a brown and white cat standing on a green leafy plant.
+cat_images1,Is the cat in the first image in the sunshine?,Is the cat in the first image in the sunshine?,"Yes, the cat in the first image is lying in the sunshine." +cat_images2,./eval/eval_data/images/cats/1806905748_adb926a0a0.jpg,Please describe the image in detail.,"The image shows a black cat lying on its back on a carpeted floor. The cat has its paws up in the air and is looking directly at the camera. The cat's eyes are yellow, and it appears to be relaxed and comfortable in its position. The background of the image is not particularly distinct, but the focus is on the cat and its pose." +cat_images2,./eval/eval_data/images/cats/british_shorthair.jpg,Please describe the image in detail.,"The image shows a gray and white cat lying on a couch, with its paws stretched out and its head resting on the couch. The cat appears to be relaxed and comfortable in its position." +cat_images2,What are the differences between the two images?,What are the differences between the two images?,The first image shows a black cat lying on a carpet while the second image shows a gray and white cat lying on a couch. +cat_images2,./eval/eval_data/images/cats/cat.png,Please describe the image in detail.,"The image shows a cat lying on a carpeted floor, with its paws stretched out and its head resting on the floor. The cat appears to be relaxed and comfortable in its position." +cat_images2,Are the colors of the three cats the same?,Are the colors of the three cats the same?,"No, the colors of the three cats are different. The first cat is black, the second cat is gray and white, and the third cat is brown and white." +cat_images2,What are the differences between the first and third images?,What are the differences between the first and third images?,The first image shows a black cat lying on a carpet while the third image shows a brown and white cat lying on a couch. +cat_images2,What are the differences between the second and third images?,What are the differences between the second and third images?,The second image shows a gray and white cat lying on a couch while the third image shows a brown and white cat lying on a couch. +cat_images2,Is the cat in the first image in the sunshine?,Is the cat in the first image in the sunshine?,"No, the cat in the first image is not in the sunshine. It is lying on a carpeted floor." +cat_images2,Which cat do you prefer and why?,Which cat do you prefer and why?,"I prefer the gray and white cat in the second image because it looks calm and relaxed, lying on a couch with its paws stretched out. The cat's expression seems to convey a sense of contentment and comfort, which is appealing to me." +cat_images2,I prefer the second cat. It's so cute.,I prefer the second cat. It's so cute.,"I agree, the second cat is very cute and looks comfortable lying on the couch." +cat_images2,Then why do you prefer the third cat more?,Then why do you prefer the third cat more?,"I prefer the third cat because it is brown and white, which is a classic and timeless color combination. Additionally, the third cat is lying on a couch, which is a more comfortable and inviting setting than the carpet in the first image." +counting_people1,./eval/eval_data/images/friends/can-count1.jpg,Count the number of people in the image.,"There are six people in the image, including the two women and four men." 
+counting_people1,./eval/eval_data/images/friends/can-count2.jpg,Count the number of people in the image.,"The number of people in the image is not specified, but there are at least six people in the group." +counting_people1,What are the differences between the two images? Are they the same group of people? Explain why.,What are the differences between the two images? Are they the same group of people? Explain why.,"The first image has two women and four men, while the second image has four women and two men. The people in the two images are not the same, and the number of men and women is different. This is because the images are taken at different times and with different groups of people." +counting_people1,Are you familiar with this TV series? Can you name the characters shown in the provided images? Who are they?,Are you familiar with this TV series? Can you name the characters shown in the provided images? Who are they?,"Yes, I am familiar with this TV series. The characters in the images are from the TV series ""Friends."" The characters are: Rachel, Monica, Phoebe, Joey, Chandler, and Ross." +counting_people2,./eval/eval_data/images/friends/can-count1.jpg,How many individuals are depicted in the image?,"There are five individuals in the image, including the two women and three men." +counting_people2,./eval/eval_data/images/friends/can-count2.jpg,How many individuals can you see in the second image?,There are six individuals in the second image. +counting_people2,Can you spot any differences between these two images? Do they represent the same set of people? Please provide a rationale.,Can you spot any differences between these two images? Do they represent the same set of people? Please provide a rationale.,"The first image is a portrait of two women and three men, while the second image is a group photo of six people. The two images do not represent the same set of people." +counting_people2,Do you recognize this TV show? Can you name the characters shown in the provided images? Who are they?,Do you recognize this TV show? Can you name the characters shown in the provided images? Who are they?,"Yes, you can recognize the TV show from the images. The characters are the cast of the TV show ""Friends.""" +counting_people3,./eval/eval_data/images/friends/wrong-count1.jpg,Count the number of people in the image.,There are four people in the image. +counting_people3,./eval/eval_data/images/friends/wrong-count2.jpg,Count the number of people in the image.,There are six people in the image. +counting_people3,What are the differences between the two images? Are they the same group of people? Explain why.,What are the differences between the two images? Are they the same group of people? Explain why.,The first image has four people sitting on a bench while the second image has six people standing in a group. The people in the two images are not the same. The first image has a fountain in the background while the second image has trees in the background. The first image has a dog in it while the second image does not have any animals. +counting_people4,./eval/eval_data/images/friends/wrong-count1.jpg,How many individuals are depicted in the image?,There are four individuals depicted in the image. +counting_people4,./eval/eval_data/images/friends/wrong-count2.jpg,How many individuals are depicted in the image?,There are six individuals in the image. +counting_people4,Can you spot any differences between these two images? Do they represent the same set of people? 
Please provide a rationale.,Can you spot any differences between these two images? Do they represent the same set of people? Please provide a rationale.,"Yes, the two images have different people in them. The first image has four people sitting on a bench, while the second image has six people standing in a group." +counting_people4,Do you recognize this TV show? Can you name the characters shown in the provided images? Who are they?,Do you recognize this TV show? Can you name the characters shown in the provided images? Who are they?,"Yes, I recognize the TV show. The characters are John, Paul, George, and Ringo." +zootopia_adventures1,./eval/eval_data/images/zootopia/z1.png,Please describe the image in detail.,"The image is a digital recreation of a scene featuring two stuffed animals, a dog and a cat, sitting at a desk with a phone in front of them. The desk has a sign that says ""Next Window."" There are also two people in the background, one on the left and one on the right." +zootopia_adventures1,./eval/eval_data/images/zootopia/z2.png,Please describe the image in detail.,"The image is a digital recreation of a scene featuring a group of stuffed animals, including a fox, a raccoon, and a squirrel, standing in front of a phone booth. There are also two people in the background, one on the left and one on the right." +zootopia_adventures1,./eval/eval_data/images/zootopia/z3.png,Can you name the characters in the images? Who are they? What are they doing?,"The characters in the images are a fox, a raccoon, and a squirrel. They are standing in front of a phone booth and a coffee shop." +zootopia_adventures1,"You are an imaginative storyteller. Create a fascinating story based on the first, second and third image.","You are an imaginative storyteller. Create a fascinating story based on the first, second and third image.","Sure, you can create a story about the characters in the images. For example, you can imagine that the fox, raccoon, and squirrel are friends who are on a mission to save the world from an evil villain. They are using the phone booth and coffee shop as a base to plan their strategy and gather resources." +zootopia_adventures1,Are you familiar with these characters? What movie are they from?,Are you familiar with these characters? What movie are they from?,"Yes, I am familiar with these characters. They are from the movie ""Zootopia.""" +zootopia_adventures1,Can you name the characters in the images? Who are they?,Can you name the characters in the images? Who are they?,"Yes, the characters in the images are a fox, a raccoon, and a squirrel." +zootopia_adventures1,In what type of environment or setting do these characters live? Describe it.,In what type of environment or setting do these characters live? Describe it.,The characters in the images live in a phone booth and a coffee shop. +zootopia_adventures2,./eval/eval_data/images/zootopia/z1.png ./eval/eval_data/images/zootopia/z2.png ./eval/eval_data/images/zootopia/z3.png,Create an engaging story strictly based on the images.,"The engaging story could be about a group of stuffed animals who are waiting in line at a bank or a store. They are discussing the next step in their transaction and are eager to get their business done. The story could also involve a funny incident that happens while they are in line, such as one of the stuffed animals losing their head or getting into a fight with another animal. The story could be told from the perspective of one of the stuffed animals or from an observer's point of view." 
+zootopia_adventures2,Do you recognize the setting or the characters in these images? Name the movie.,Do you recognize the setting or the characters in these images? Name the movie.,"The setting is a bank or a store, and the characters are stuffed animals. The movie is not specified." +zootopia_adventures2,Can you share some interesting facts or details about the characters shown in the images?,Can you share some interesting facts or details about the characters shown in the images?,"The characters shown in the images are stuffed animals from the movie ""Toy Story."" The main characters are Woody, a cowboy, and Buzz Lightyear, a space ranger. Other characters include Mr. Potato Head, Rex the dinosaur, and Slinky the dog." +zootopia_adventures2,Which character do you find the most intriguing and why?,Which character do you find the most intriguing and why?,"I find Mr. Potato Head to be the most intriguing character because he can be assembled in different ways using various parts, making him versatile and unique." +zootopia_adventures2,"Based on the images, can you create some dialogues that the characters might say to each other in these situations?","Based on the images, can you create some dialogues that the characters might say to each other in these situations?","Sure, here are some possible dialogues: +- ""Hey, let's get in line for the next window."" +- ""I'm not sure what we're doing here, but let's follow the others."" +- ""I hope we can get our business done quickly so we can go back to playing."" +- ""I'm not sure if this is the right place, but it looks like everyone else is here."" +- ""I'm not sure what's going on, but let's just go with the flow."" +- ""I think we should ask someone for help to figure out what we need to do."" +- ""I'm not sure if we're in the right place, but let's just try to blend in and see what happens."" +- ""I'm not sure if this is the right line, but it seems like everyone else is in it."" +- ""I'm not sure what's going on, but let's just follow the others and see where it leads."" +- ""I'm not sure if this is the right place, but it looks like everyone else is here, so let's just go with it.""" +zootopia_adventures3,./eval/eval_data/images/zootopia/z1.png,Examine and describe the characters' actions in the first image.,"In the first image, the two stuffed animals are sitting at a desk with a sign that says ""Next Window."" One of the stuffed animals is holding a coffee cup, while the other is holding a phone. They appear to be waiting for something at the window." +zootopia_adventures3,./eval/eval_data/images/zootopia/z2.png,"In the second image, what are the main characters doing, and how do they seem to feel?","In the second image, the main characters are a fox and a stuffed animal. They are standing in front of a sign that says ""Next Window."" The fox seems to be holding a phone, while the stuffed animal is holding a coffee cup. They appear to be interacting with each other and seem to be happy or at least not showing any negative emotions." +zootopia_adventures3,Contrast the characters' moods and interactions in the two provided images.,Contrast the characters' moods and interactions in the two provided images.,"In the first image, the two stuffed animals are sitting at a desk with a sign that says ""Next Window."" They appear to be waiting for something, but their moods cannot be determined. 
+ +In the second image, the fox and the stuffed animal are standing in front of a sign that says ""Next Window."" The fox is holding a phone and the stuffed animal is holding a coffee cup. They seem to be interacting with each other and appear to be happy or at least not showing any negative emotions." +zootopia_adventures3,./eval/eval_data/images/zootopia/z3.png,Imagine and narrate a hilarious situation involving the characters from the images.,"In the hilarious situation, the fox and the stuffed animal are having a conversation in front of a sign that says ""Next Window."" The fox is holding a phone and the stuffed animal is holding a coffee cup. Suddenly, the fox gets a call and starts talking on the phone, completely ignoring the stuffed animal. The stuffed animal gets frustrated and starts yelling at the fox, demanding his attention. The fox, still focused on the phone, pays no attention to the stuffed animal's yelling. The stuffed animal gets even more upset and starts throwing the coffee cup at the fox, but the cup misses and hits the wall behind them. The fox, oblivious to the situation, continues talking on the phone, while the stuffed animal storms off in a huff." +zootopia_adventures3,"Name the movie from which these characters are, and give a succinct summary of its plot.","Name the movie from which these characters are, and give a succinct summary of its plot.","The movie is ""Next Window,"" and the plot involves a fox and a stuffed animal having a conversation in front of a sign that says ""Next Window."" The fox is holding a phone and the stuffed animal is holding a coffee cup. The fox gets a call on the phone and starts talking, ignoring the stuffed animal. The stuffed animal gets frustrated and starts yelling at the fox, demanding his attention. The fox continues talking on the phone, and the stuffed animal throws a coffee cup at the fox, but misses and hits the wall behind them. The fox pays no attention to the stuffed animal's outburst and continues talking on the phone." +zootopia_adventures3,Create a funny and unexpected scenario that could unfold between the characters in these images.,Create a funny and unexpected scenario that could unfold between the characters in these images.,"In this funny and unexpected scenario, the fox and the stuffed animal are having a conversation in front of a sign that says ""Next Window."" The fox is holding a phone and the stuffed animal is holding a coffee cup. Suddenly, the fox gets a call on the phone and starts talking, ignoring the stuffed animal. The stuffed animal gets frustrated and starts yelling at the fox, demanding his attention. The fox continues talking on the phone, completely oblivious to the stuffed animal's outburst. The stuffed animal gets even more upset and throws the coffee cup at the fox, but misses and hits the wall behind them. The fox, still focused on the phone call, pays no attention to the coffee cup hitting the wall and continues talking. The stuffed animal, now furious, starts jumping up and down, trying to get the fox's attention. The fox, still engrossed in the phone call, ignores the stuffed animal's antics and continues talking. The stuffed animal, in a last-ditch effort to get the fox's attention, starts singing a song, but the fox" +tech_ceos1,./eval/eval_data/images/tech-ceo/jobs1.jpg,Who is this person in this first image?,"The person in the first image is Steve Jobs, the CEO of Apple." 
+tech_ceos1,./eval/eval_data/images/tech-ceo/gate1.jpg,Who is this person in this second image?,"This person is Bill Gates, the co-founder of Microsoft." +tech_ceos1,./eval/eval_data/images/tech-ceo/musk1.jpg,Who is this person in this third image?,"This person is a man wearing a suit and tie, and he is holding a microphone." +tech_ceos1,Recall who is in the second image.,Recall who is in the second image.,The second image shows a man holding a microphone in front of a large sign. +tech_ceos1,Recall who is in the first image.,Recall who is in the first image.,The first image shows Steve Jobs standing in front of a podium. +tech_ceos1,"Is the person in the first image the founder of Apple? if not, which of the above images is the person the founder of Apple.","Is the person in the first image the founder of Apple? if not, which of the above images is the person the founder of Apple.","Yes, the person in the first image is the founder of Apple." +tech_ceos1,Is the person in the third image the founder of Apple?,Is the person in the third image the founder of Apple?,"No, the person in the third image is not the founder of Apple." +tech_ceos2,./eval/eval_data/images/tech-ceo/musk1.jpg,Who is this person in the first image?,"The person in the first image is a man wearing a white shirt and a black tie, who is holding a microphone in his hand." +tech_ceos2,./eval/eval_data/images/tech-ceo/gate1.jpg,Who is this person in the second image?,"The person in the second image is Bill Gates, the founder of Microsoft." +tech_ceos2,./eval/eval_data/images/tech-ceo/jobs1.jpg,Who is this person in the third image?,"The person in the third image is Steve Jobs, the founder of Apple." +tech_ceos2,Recall who is in the second image.,Recall who is in the second image.,There is no one in the second image. +tech_ceos2,Recall who is in the first image.,Recall who is in the first image.,There is a man in the first image who is holding a microphone in his hand. +tech_ceos2,"Is the person in the first image the founder of Apple? If not, which of the above images is the person the founder of Apple. ","Is the person in the first image the founder of Apple? If not, which of the above images is the person the founder of Apple. ","No, the person in the first image is not the founder of Apple. The person in the second image is the founder of Apple." +tech_ceos2,Is the person in the third image the founder of Apple?,Is the person in the third image the founder of Apple?,"Yes, the person in the third image is the founder of Apple, Steve Jobs." diff --git a/applications/DeepSpeed-VisualChat/eval/results/eval_comprehensive/ours-set1_final.csv b/applications/DeepSpeed-VisualChat/eval/results/eval_comprehensive/ours-set1_final.csv new file mode 100644 index 000000000..66c30e318 --- /dev/null +++ b/applications/DeepSpeed-VisualChat/eval/results/eval_comprehensive/ours-set1_final.csv @@ -0,0 +1,75 @@ +test_name,image_path,question,answer +cat_images1,./eval/eval_data/images/cats/1806905748_adb926a0a0.jpg,Please describe the image in detail.,"The image shows a black cat lying on a carpeted floor, with its paws stretched out and its eyes closed. The cat appears to be relaxed and comfortable, possibly sleeping or resting. The background of the image is not particularly distinct, but the focus is on the cat and its position on the floor." 
+cat_images1,./eval/eval_data/images/cats/british_shorthair.jpg,Please describe the image in detail.,"The image shows a gray and white cat lying on a couch, with its paws stretched out and its eyes closed. The cat appears to be relaxed and comfortable on the couch." +cat_images1,./eval/eval_data/images/cats/cat.png,Please describe the image in detail.,"The image shows a brown and white cat lying on a green leafy surface, possibly a grassy area. The cat is stretched out and appears to be relaxed, with its paws extended and its eyes closed. The background of the image is not particularly distinct, but it appears to be outdoors, surrounded by greenery." +cat_images1,Are the colors of the three cats the same?,Are the colors of the three cats the same?,"No, the colors of the three cats are different. The first cat is black, the second cat is gray and white, and the third cat is brown and white." +cat_images1,What are the differences between the first and third images?,What are the differences between the first and third images?,The first image shows a black cat lying on a carpet while the third image shows a brown and white cat lying on a green leafy surface. +cat_images1,What are the differences between the second and third images?,What are the differences between the second and third images?,The second image shows a gray and white cat lying on a couch while the third image shows a brown and white cat lying on a green leafy surface. +cat_images1,Is the cat in the first image in the sunshine?,Is the cat in the first image in the sunshine?,"Yes, the cat in the first image is lying in the sunshine." +cat_images2,./eval/eval_data/images/cats/1806905748_adb926a0a0.jpg,Please describe the image in detail.,"The image shows a black cat lying on a carpeted floor, with its paws stretched out and its eyes closed. The cat appears to be relaxed and comfortable, possibly sleeping or resting. The background of the image is not particularly distinct, but the focus is on the cat and its position on the floor." +cat_images2,./eval/eval_data/images/cats/british_shorthair.jpg,Please describe the image in detail.,"The image shows a gray and white cat lying on a couch, with its paws stretched out and its eyes closed. The cat appears to be relaxed and comfortable on the couch." +cat_images2,What are the differences between the two images?,What are the differences between the two images?,The first image shows a black cat lying on a carpeted floor while the second image shows a gray and white cat lying on a couch. +cat_images2,./eval/eval_data/images/cats/cat.png,Please describe the image in detail.,"The image shows a brown and white cat sitting on a carpeted floor, with its paws stretched out and its eyes closed. The cat appears to be relaxed and comfortable in its position." +cat_images2,Are the colors of the three cats the same?,Are the colors of the three cats the same?,"No, the colors of the three cats are different. The first cat is black, the second cat is gray and white, and the third cat is brown and white." +cat_images2,What are the differences between the first and third images?,What are the differences between the first and third images?,The first image shows a black cat lying on a carpeted floor while the third image shows a brown and white cat sitting on a carpeted floor. 
+cat_images2,What are the differences between the second and third images?,What are the differences between the second and third images?,The second image shows a gray and white cat lying on a couch while the third image shows a brown and white cat sitting on a carpeted floor. +cat_images2,Is the cat in the first image in the sunshine?,Is the cat in the first image in the sunshine?,"No, the cat in the first image is not in the sunshine. It is lying on a carpeted floor." +cat_images2,Which cat do you prefer and why?,Which cat do you prefer and why?,"I prefer the gray and white cat in the second image because it looks calm and relaxed while lying on the couch. The cat's eyes are closed, and it seems to be enjoying the comfort of the couch. The image also shows the cat's paws stretched out, which adds to its relaxed appearance." +cat_images2,I prefer the second cat. It's so cute.,I prefer the second cat. It's so cute.,"I agree, the second cat is very cute and looks relaxed while lying on the couch." +cat_images2,Then why do you prefer the third cat more?,Then why do you prefer the third cat more?,"I prefer the third cat because it is brown and white, which is a classic and attractive color combination for cats. Additionally, the third cat is sitting on a carpeted floor, which provides a cozy and comfortable environment for the cat to relax in." +counting_people1,./eval/eval_data/images/friends/can-count1.jpg,Count the number of people in the image.,"There are six people in the image, including the two women and four men." +counting_people1,./eval/eval_data/images/friends/can-count2.jpg,Count the number of people in the image.,"The number of people in the image is not specified, but there are at least six people, including two women and four men." +counting_people1,What are the differences between the two images? Are they the same group of people? Explain why.,What are the differences between the two images? Are they the same group of people? Explain why.,"The first image has six people, including two women and four men, while the second image also has six people, including two women and four men. However, the people in the two images are not the same as they have different positions and backgrounds. The first image has the people sitting on a couch, while the second image has the people standing in front of a wall. The first image has the people's faces visible, while the second image has some of the faces partially hidden. The reason for the difference is that the two images are taken at different times and locations, with different people posing for the pictures." +counting_people1,Are you familiar with this TV series? Can you name the characters shown in the provided images? Who are they?,Are you familiar with this TV series? Can you name the characters shown in the provided images? Who are they?,"Yes, I am familiar with the TV series. The characters in the images are from the TV series Friends. The characters are: Monica, Rachel, Phoebe, Joey, Chandler, and Ross." +counting_people2,./eval/eval_data/images/friends/can-count1.jpg,How many individuals are depicted in the image?,There are four individuals in the image. +counting_people2,./eval/eval_data/images/friends/can-count2.jpg,How many individuals can you see in the second image?,There are six individuals in the second image. +counting_people2,Can you spot any differences between these two images? Do they represent the same set of people? Please provide a rationale.,Can you spot any differences between these two images? 
Do they represent the same set of people? Please provide a rationale.,"The two images represent different sets of people. The first image shows four people sitting on a couch, while the second image shows six people standing in a row." +counting_people2,Do you recognize this TV show? Can you name the characters shown in the provided images? Who are they?,Do you recognize this TV show? Can you name the characters shown in the provided images? Who are they?,"No, these images do not represent a TV show. The characters in the images are not named." +counting_people3,./eval/eval_data/images/friends/wrong-count1.jpg,Count the number of people in the image.,There are four people in the image. +counting_people3,./eval/eval_data/images/friends/wrong-count2.jpg,Count the number of people in the image.,There are six people in the image. +counting_people3,What are the differences between the two images? Are they the same group of people? Explain why.,What are the differences between the two images? Are they the same group of people? Explain why.,"The first image has four people sitting on a bench, while the second image has six people standing next to each other. The people in the two images are not the same. The reason is that the first image is a group of friends posing for a picture, while the second image is a group of actors posing for a promotional photo." +counting_people4,./eval/eval_data/images/friends/wrong-count1.jpg,How many individuals are depicted in the image?,There are four individuals depicted in the image. +counting_people4,./eval/eval_data/images/friends/wrong-count2.jpg,How many individuals are depicted in the image?,There are six individuals in the image. +counting_people4,Can you spot any differences between these two images? Do they represent the same set of people? Please provide a rationale.,Can you spot any differences between these two images? Do they represent the same set of people? Please provide a rationale.,"Yes, there are several differences between the two images. The first image has four people sitting on a bench, while the second image has six people standing. The people in the first image are wearing different clothes, while the people in the second image are wearing similar clothes. The location of the first image is a park, while the second image is taken in front of a tree." +counting_people4,Do you recognize this TV show? Can you name the characters shown in the provided images? Who are they?,Do you recognize this TV show? Can you name the characters shown in the provided images? Who are they?,"Yes, I recognize the characters from the TV show Friends." +zootopia_adventures1,./eval/eval_data/images/zootopia/z1.png,Please describe the image in detail.,"The image is a digital recreation of a scene featuring two stuffed animals, a dog and a cat, sitting at a desk with a phone in front of them. The desk has a sign that says ""Next Window."" There are also two people in the background, one on the left and one on the right." +zootopia_adventures1,./eval/eval_data/images/zootopia/z2.png,Please describe the image in detail.,"The image is a digital recreation of a scene featuring a raccoon, a fox, and a squirrel standing in front of a counter with a sign that says ""Next Window."" There are also two people in the background, one on the left and one on the right." +zootopia_adventures1,./eval/eval_data/images/zootopia/z3.png,Can you name the characters in the images? Who are they? What are they doing?,"The characters in the images are a fox and a raccoon. 
They are sitting at a desk with a phone and a sign that says ""Next Window."" The fox is holding a coffee mug and the raccoon is holding a cup. They are likely discussing something or waiting for someone." +zootopia_adventures1,"You are an imaginative storyteller. Create a fascinating story based on the first, second and third image.","You are an imaginative storyteller. Create a fascinating story based on the first, second and third image.","The story could be about a fox and a raccoon who are working together in an office. They are sitting at a desk with a phone and a sign that says ""Next Window"". The fox is holding a coffee mug and the raccoon is holding a cup. They are discussing something important while waiting for someone to come to their desk." +zootopia_adventures1,Are you familiar with these characters? What movie are they from?,Are you familiar with these characters? What movie are they from?,"Yes, I am familiar with these characters. They are from the movie ""Zootopia""." +zootopia_adventures1,Can you name the characters in the images? Who are they?,Can you name the characters in the images? Who are they?,The characters in the images are a fox and a raccoon. +zootopia_adventures1,In what type of environment or setting do these characters live? Describe it.,In what type of environment or setting do these characters live? Describe it.,"The characters live in an office setting. They are sitting at a desk with a phone and a sign that says ""Next Window""." +zootopia_adventures2,./eval/eval_data/images/zootopia/z1.png ./eval/eval_data/images/zootopia/z2.png ./eval/eval_data/images/zootopia/z3.png,Create an engaging story strictly based on the images.,The engaging story could be about a group of stuffed animals who are waiting in line at a bank or a store. They might be discussing their plans or sharing their experiences while waiting. The story could be told from the perspective of one of the stuffed animals or from an outside observer. +zootopia_adventures2,Do you recognize the setting or the characters in these images? Name the movie.,Do you recognize the setting or the characters in these images? Name the movie.,"The setting is a bank or a store, and the characters are stuffed animals. The movie is not specified." +zootopia_adventures2,Can you share some interesting facts or details about the characters shown in the images?,Can you share some interesting facts or details about the characters shown in the images?,"The characters shown are stuffed animals, including a teddy bear, a fox, and a hippo. They are not real animals, but rather toys or figurines." +zootopia_adventures2,Which character do you find the most intriguing and why?,Which character do you find the most intriguing and why?,I find the fox character the most intriguing because it is a stuffed animal that is not commonly seen in this context. It adds a unique element to the scene and makes it stand out. 
+zootopia_adventures2,"Based on the images, can you create some dialogues that the characters might say to each other in these situations?","Based on the images, can you create some dialogues that the characters might say to each other in these situations?","Sure, here are some possible dialogues: +- ""I'm waiting for my turn at the bank."" +- ""I'm here to purchase something at the store."" +- ""I hope we don't have to wait too long."" +- ""I'm excited to see what's in store for us."" +- ""I'm curious about what's going on around us."" +- ""I'm glad we're all here together."" +- ""I'm looking forward to our next adventure.""" +zootopia_adventures3,./eval/eval_data/images/zootopia/z1.png,Examine and describe the characters' actions in the first image.,"The characters in the first image are a group of stuffed animals, including a plush elephant, a plush horse, and a plush dog, standing behind a counter in a mock-up of a bank or store. The elephant is holding a sign that says ""Next Window,"" while the horse and dog are standing behind the counter. The scene is likely meant to depict a playful or imaginative representation of a bank or store setting, with the stuffed animals acting as the employees or customers." +zootopia_adventures3,./eval/eval_data/images/zootopia/z2.png,"In the second image, what are the main characters doing, and how do they seem to feel?","The main characters in the second image are a fox and a squirrel. They are standing in front of a counter, and the fox is holding a sign that says ""Next Window."" The squirrel is standing next to the fox, and they both seem to be looking at the sign. The fox appears to be smiling, while the squirrel's expression is not clear." +zootopia_adventures3,Contrast the characters' moods and interactions in the two provided images.,Contrast the characters' moods and interactions in the two provided images.,"In the first image, the stuffed animals are not interacting with each other, so it is difficult to determine their moods. However, the plush elephant holding a sign that says ""Next Window"" seems to be in a neutral mood. + +In the second image, the fox and the squirrel are standing in front of a counter, and the fox is holding a sign that says ""Next Window."" The squirrel is standing next to the fox, and they both seem to be looking at the sign. It is difficult to determine their moods, but they appear to be focused on the task at hand." +zootopia_adventures3,./eval/eval_data/images/zootopia/z3.png,Imagine and narrate a hilarious situation involving the characters from the images.,"Imagine a scenario where the stuffed animals from the first image are having a conversation with the fox and the squirrel from the second image. The stuffed animals could be asking the fox and the squirrel about their experiences in the bank or the store, and the fox and the squirrel could be sharing their funny stories or mishaps while working there. The conversation could be full of humor and silliness, making it a hilarious situation to imagine." +zootopia_adventures3,"Name the movie from which these characters are, and give a succinct summary of its plot.","Name the movie from which these characters are, and give a succinct summary of its plot.","The movie is ""The Fox and the Hound,"" and the plot revolves around the unlikely friendship between a fox and a hound. The fox, Tod, is raised by a woman who teaches him to be a thief, while the hound, Copper, is trained by a hunter to hunt foxes. 
When the two meet, they become friends, but their friendship is challenged when the hunter sets out to capture Tod. The hunter's dog, Chief, also joins the hunt. In the end, the hunter catches Tod, but Copper saves him, and the two friends escape together." +zootopia_adventures3,Create a funny and unexpected scenario that could unfold between the characters in these images.,Create a funny and unexpected scenario that could unfold between the characters in these images.,"In this scenario, the stuffed animals from the first image are having a party, and the fox and the squirrel from the second image crash the party. The stuffed animals are surprised to see the fox and the squirrel, but they quickly welcome them and start to have a good time together. However, the party takes a turn when the stuffed animals realize that the fox and the squirrel are not from their movie, and they start to question their presence. The fox and the squirrel try to explain that they are from a different movie, but the stuffed animals are not convinced. The party turns into a debate about which movie is better, and the fox and the squirrel end up leaving the party feeling disappointed." +tech_ceos1,./eval/eval_data/images/tech-ceo/jobs1.jpg,Who is this person in this first image?,"The person in the first image is Steve Jobs, the CEO of Apple." +tech_ceos1,./eval/eval_data/images/tech-ceo/gate1.jpg,Who is this person in this second image?,"This person is Bill Gates, the co-founder of Microsoft." +tech_ceos1,./eval/eval_data/images/tech-ceo/musk1.jpg,Who is this person in this third image?,"This person is Bill Gates, the co-founder of Microsoft." +tech_ceos1,Recall who is in the second image.,Recall who is in the second image.,"The second image shows a man holding a microphone and standing in front of a large sign that says ""Toshiba.""" +tech_ceos1,Recall who is in the first image.,Recall who is in the first image.,The first image shows Steve Jobs holding an iPhone. +tech_ceos1,"Is the person in the first image the founder of Apple? if not, which of the above images is the person the founder of Apple.","Is the person in the first image the founder of Apple? if not, which of the above images is the person the founder of Apple.","Yes, the person in the first image is the founder of Apple, Steve Jobs." +tech_ceos1,Is the person in the third image the founder of Apple?,Is the person in the third image the founder of Apple?,"No, the person in the third image is Bill Gates, the co-founder of Microsoft." +tech_ceos2,./eval/eval_data/images/tech-ceo/musk1.jpg,Who is this person in the first image?,"The person in the first image is a man named Elon Musk, the founder and CEO of Tesla." +tech_ceos2,./eval/eval_data/images/tech-ceo/gate1.jpg,Who is this person in the second image?,"The person in the second image is Bill Gates, the co-founder of Microsoft." +tech_ceos2,./eval/eval_data/images/tech-ceo/jobs1.jpg,Who is this person in the third image?,"The person in the third image is Steve Jobs, the co-founder and CEO of Apple." +tech_ceos2,Recall who is in the second image.,Recall who is in the second image.,The second image shows a man holding up an iPhone. +tech_ceos2,Recall who is in the first image.,Recall who is in the first image.,The first image shows a man standing in front of a microphone. +tech_ceos2,"Is the person in the first image the founder of Apple? If not, which of the above images is the person the founder of Apple. ","Is the person in the first image the founder of Apple? 
If not, which of the above images is the person the founder of Apple. ",The person in the first image is not the founder of Apple. The person in the second image is the founder of Apple. +tech_ceos2,Is the person in the third image the founder of Apple?,Is the person in the third image the founder of Apple?,"Yes, the person in the third image is the founder of Apple, Steve Jobs." diff --git a/applications/DeepSpeed-VisualChat/eval/results/eval_comprehensive/ours-set2_best_eval.csv b/applications/DeepSpeed-VisualChat/eval/results/eval_comprehensive/ours-set2_best_eval.csv new file mode 100644 index 000000000..c129fefb7 --- /dev/null +++ b/applications/DeepSpeed-VisualChat/eval/results/eval_comprehensive/ours-set2_best_eval.csv @@ -0,0 +1,94 @@ +test_name,image_path,question,answer +cat_images1,./eval/eval_data/images/cats/1806905748_adb926a0a0.jpg,Please describe the image in detail.,"The image shows a black cat lying on its back on a carpeted floor. The cat is relaxed and appears to be purring, as indicated by the caption. It has its paws stretched out and its eyes closed, enjoying the comfort of the carpet and the warmth of the room. The cat's fur is smooth and shiny, and its body is well-proportioned. The image captures the essence of a contented and relaxed feline moment." +cat_images1,./eval/eval_data/images/cats/british_shorthair.jpg,Please describe the image in detail.,"The image shows a gray and white cat lying on a carpeted floor, stretching out its paws and relaxing. The cat is lying on its side, with its head and body positioned close to a carpeted staircase. The cat's paws are extended and its eyes are closed, indicating that it is comfortable and at ease in its surroundings. The cat's fur is smooth and shiny, and its body is well-proportioned. The image captures the essence of a contented feline moment." +cat_images1,./eval/eval_data/images/cats/cat.png,Please describe the image in detail.,"The image shows a black and white cat lying on a carpeted floor, with its paws stretched out and its eyes closed. The cat is relaxed and appears to be enjoying the comfort of the carpet. The cat's fur is smooth and shiny, and its body is well-proportioned. The image captures the essence of a contented feline moment." +cat_images1,Are the colors of the three cats the same?,Are the colors of the three cats the same?,"No, the colors of the three cats are different. One is black, one is gray and white, and one is brown and white." +cat_images1,What are the differences between the first and third images?,What are the differences between the first and third images?,The first image shows a black cat lying on a carpeted floor while the third image shows a brown and white cat lying on a carpeted floor. The first image has a larger bounding box than the third image. +cat_images1,What are the differences between the second and third images?,What are the differences between the second and third images?,The second image shows a black cat lying on a carpeted staircase while the third image shows a brown and white cat lying on a carpeted floor. The second image has a smaller bounding box than the third image. +cat_images1,Is the cat in the first image in the sunshine?,Is the cat in the first image in the sunshine?,"No, the cat in the first image is not in the sunshine. It is lying on a carpeted floor." +cat_images2,./eval/eval_data/images/cats/1806905748_adb926a0a0.jpg,Please describe the image in detail.,"The image shows a black cat lying on its back on a carpeted floor. 
The cat is relaxed and appears to be purring, as indicated by the caption. It has its paws stretched out and its eyes closed, enjoying the comfort of the carpet and the warmth of the room. The cat's fur is smooth and shiny, and its body is well-proportioned. The image captures the essence of a contented and relaxed feline moment." +cat_images2,./eval/eval_data/images/cats/british_shorthair.jpg,Please describe the image in detail.,"The image shows a gray and white cat lying on a carpeted floor, stretching out its paws and relaxing. The cat is lying on its side, with its head and body positioned close to a carpeted staircase. The cat's paws are extended and its eyes are closed, indicating that it is comfortable and at ease in its surroundings. The cat's fur is smooth and shiny, and its body is well-proportioned. The image captures the essence of a contented feline moment." +cat_images2,What are the differences between the two images?,What are the differences between the two images?,The first image shows a black cat lying on a carpet while the second image shows a gray and white cat lying on a carpet near a staircase. +cat_images2,./eval/eval_data/images/cats/cat.png,Please describe the image in detail.,"The image shows a black and white cat lying on a carpeted floor, with its paws stretched out and its eyes closed. The cat is relaxed and appears to be enjoying the comfort of the carpet. The cat's fur is smooth and shiny, and its body is well-proportioned. The image captures the essence of a contented feline moment." +cat_images2,Are the colors of the three cats the same?,Are the colors of the three cats the same?,"No, the colors of the three cats are different. The first cat is black, the second cat is gray and white, and the third cat is black and white." +cat_images2,What are the differences between the first and third images?,What are the differences between the first and third images?,The first image shows a black cat lying on a carpet while the third image shows a black and white cat lying on a carpet near a staircase. +cat_images2,What are the differences between the second and third images?,What are the differences between the second and third images?,The second image shows a black cat lying on a carpet while the third image shows a gray and white cat lying on a carpet near a staircase. +cat_images2,Is the cat in the first image in the sunshine?,Is the cat in the first image in the sunshine?,"No, the cat in the first image is not in the sunshine. It is lying on a carpet indoors." +cat_images2,Which cat do you prefer and why?,Which cat do you prefer and why?,"I personally prefer the gray and white cat in the third image. It appears to be relaxed and content, with its eyes closed and paws stretched out, which gives off a sense of tranquility and comfort. The cat's fur is also well-groomed and shiny, which adds to its overall appearance." +cat_images2,I prefer the second cat. It's so cute.,I prefer the second cat. It's so cute.,"I agree, the second cat is adorable. It looks so comfortable and relaxed, with its paws stretched out and eyes closed. It's a great image of a cat enjoying a moment of peace and contentment." +cat_images2,Then why do you prefer the third cat more?,Then why do you prefer the third cat more?,"I prefer the third cat more because it looks more relaxed and content. It is lying on the carpet with its eyes closed and paws stretched out, which gives off a sense of tranquility and comfort. 
The cat's fur is also well-groomed and shiny, which adds to its overall appearance." +counting_people1,./eval/eval_data/images/friends/can-count1.jpg,Count the number of people in the image.,There are six people in the image. +counting_people1,./eval/eval_data/images/friends/can-count2.jpg,Count the number of people in the image.,There are six people in the image. +counting_people1,What are the differences between the two images? Are they the same group of people? Explain why.,What are the differences between the two images? Are they the same group of people? Explain why.,"The two images show different groups of people. In the first image, there are six people posing for a photo, while in the second image, there are six people standing together, but they are not posing for a photo. The first image shows the same group of people in different poses, while the second image shows a different group of people standing together." +counting_people1,Are you familiar with this TV series? Can you name the characters shown in the provided images? Who are they?,Are you familiar with this TV series? Can you name the characters shown in the provided images? Who are they?,"Yes, I am familiar with the TV series ""The Simpsons."" The characters shown in the provided images are Homer Simpson, Marge Simpson, Bart Simpson, Lisa Simpson, and Maggie Simpson." +counting_people2,./eval/eval_data/images/friends/can-count1.jpg,How many individuals are depicted in the image?,There are six individuals depicted in the image. +counting_people2,./eval/eval_data/images/friends/can-count2.jpg,How many individuals can you see in the second image?,There are six individuals in the second image. +counting_people2,Can you spot any differences between these two images? Do they represent the same set of people? Please provide a rationale.,Can you spot any differences between these two images? Do they represent the same set of people? Please provide a rationale.,"The first image shows a group of people standing in front of a building, while the second image shows a group of people sitting on a couch. The first image does not provide any information about the people's identities, while the second image shows six individuals with different appearances and clothing. The rationale is that the first image is a general group photo, while the second image is a more specific setting with individual portraits." +counting_people2,Do you recognize this TV show? Can you name the characters shown in the provided images? Who are they?,Do you recognize this TV show? Can you name the characters shown in the provided images? Who are they?,"Yes, the images depict characters from the popular TV show ""Friends."" The characters are (from left to right) Monica Geller, Rachel Green, Phoebe Buffay, Joey Tribbiani, Chandler Bing, and Ross Geller." +counting_people3,./eval/eval_data/images/friends/wrong-count1.jpg,Count the number of people in the image.,There are five people in the image. +counting_people3,./eval/eval_data/images/friends/wrong-count2.jpg,Count the number of people in the image.,There are five people in the image. +counting_people3,What are the differences between the two images? Are they the same group of people? Explain why.,What are the differences between the two images? Are they the same group of people? Explain why.,"The two images show different groups of people. In the first image, there are five people sitting on a bench, while in the second image, there are five people standing in front of a building. 
The people in the first image are wearing casual clothes, while the people in the second image are wearing formal attire. The first image shows a group of friends sitting together, while the second image shows a group of people posing for a photo in front of a building. The first image does not show any buildings or background, while the second image has a building in the background. The first image does not have any formal attire, while the second image has formal attire. The first image does not have any posing, while the second image has people posing for a photo. The first image does not have any buildings or background, while the second image has a building in the background. The first image does not have any formal attire, while the second image has formal attire. The first image does not have any posing, while the second image has people posing for a photo. The first image does not have any buildings or background, while the second image has a building in the background. The first image does not have any" +counting_people4,./eval/eval_data/images/friends/wrong-count1.jpg,How many individuals are depicted in the image?,There are four individuals depicted in the image. +counting_people4,./eval/eval_data/images/friends/wrong-count2.jpg,How many individuals are depicted in the image?,There are four individuals depicted in the image. +counting_people4,Can you spot any differences between these two images? Do they represent the same set of people? Please provide a rationale.,Can you spot any differences between these two images? Do they represent the same set of people? Please provide a rationale.,"The two images depict different sets of people. In the first image, there are four people sitting on a bench, while in the second image, there are four people standing in front of a building. The first image shows a group of friends posing for a picture, while the second image appears to be a group of people posing for a promotional photo or a public event." +counting_people4,Do you recognize this TV show? Can you name the characters shown in the provided images? Who are they?,Do you recognize this TV show? Can you name the characters shown in the provided images? Who are they?,"I'm sorry, I'm not familiar with the TV show you're referring to. The characters shown in the provided images are not identified, and I cannot provide any information about them." +zootopia_adventures1,./eval/eval_data/images/zootopia/z1.png,Please describe the image in detail.,"The image shows a group of stuffed animals sitting at a desk in a room. There are three stuffed animals, including a teddy bear, a dog, and a cat. The teddy bear is sitting in the middle of the desk, while the dog and cat are sitting on either side of the teddy bear. The room is decorated with a rug on the floor and a picture hanging on the wall." +zootopia_adventures1,./eval/eval_data/images/zootopia/z2.png,Please describe the image in detail.,"The image shows a group of stuffed animals sitting at a desk in a room. The stuffed animals are wearing clothes and are sitting on chairs. There are three stuffed animals in the scene, including a teddy bear, a dog, and a cat. The room is decorated with a rug on the floor and a picture hanging on the wall." +zootopia_adventures1,./eval/eval_data/images/zootopia/z3.png,Can you name the characters in the images? Who are they? What are they doing?,"The characters in the images are a stuffed animal, a dog, and a cat. They are sitting at a desk in a room." 
+zootopia_adventures1,"You are an imaginative storyteller. Create a fascinating story based on the first, second and third image.","You are an imaginative storyteller. Create a fascinating story based on the first, second and third image.","Once upon a time, in a cozy room filled with stuffed animals, a teddy bear, a dog, and a cat sat at a desk. They were having a meeting to discuss their plans for the day. The teddy bear, being the leader of the group, suggested they go on an adventure to explore the nearby forest. The dog, being the most adventurous of the group, was excited about the idea and immediately agreed. The cat, on the other hand, was a bit hesitant but eventually agreed to join them. +The group packed their things and set out on their journey. As they walked through the forest, they encountered various animals and plants. The dog was always eager to explore and sniff around, while the teddy bear and cat kept a close eye on their surroundings. They came across a stream, where they decided to take a break and have a picnic. +While they were enjoying their meal, they heard a strange noise coming from the other side of the stream. The dog, being curious, jumped into the water to investigate. The teddy bear and cat were worried about him, but he quickly returned, wagging his tail and barking excited" +zootopia_adventures1,Are you familiar with these characters? What movie are they from?,Are you familiar with these characters? What movie are they from?,"The characters in the images are from the movie ""Zootopia.""" +zootopia_adventures1,Can you name the characters in the images? Who are they?,Can you name the characters in the images? Who are they?,"The characters in the images are a stuffed animal, a dog, and a cat." +zootopia_adventures1,In what type of environment or setting do these characters live? Describe it.,In what type of environment or setting do these characters live? Describe it.,The characters live in a room filled with stuffed animals. +zootopia_adventures2,./eval/eval_data/images/zootopia/z1.png ./eval/eval_data/images/zootopia/z2.png ./eval/eval_data/images/zootopia/z3.png,Create an engaging story strictly based on the images.,"The teddy bear and the stuffed fox were sitting in a waiting room, patiently waiting for their turn to be seen by the doctor. They were surrounded by other stuffed animals, all waiting their turn for a check-up. +The teddy bear was feeling a bit under the weather, with a slight cough and a runny nose. The fox, being a good friend, was trying to comfort the teddy bear and make him feel better. +As they waited, they noticed a group of other stuffed animals gathered around a television screen. The news was on, and the stuffed animals were watching with great interest. The teddy bear and the fox joined the group, curious to know what was happening. +Suddenly, the news anchor announced that a new children's hospital was opening in the city, and they were showing footage of the construction site. The teddy bear and the fox were excited to hear about the new hospital, as it would provide better facilities and care for all the stuffed animals in need. +As the news segment ended, the teddy bear and the fox returned to their seats, waiting patiently for their turn to be seen by the doctor. They knew that with" +zootopia_adventures2,Do you recognize the setting or the characters in these images? Name the movie.,Do you recognize the setting or the characters in these images? 
Name the movie.,"The setting and characters in these images resemble the movie ""Toy Story.""" +zootopia_adventures2,Can you share some interesting facts or details about the characters shown in the images?,Can you share some interesting facts or details about the characters shown in the images?,"Sure, here are some interesting facts about the characters in the movie ""Toy Story"": +1. Woody, the cowboy doll, is the leader of the toys and the protagonist of the movie. He is voiced by Tom Hanks. +2. Buzz Lightyear, the space ranger doll, is Woody's rival and best friend. He is voiced by Tim Allen. +3. Mr. Potato Head is a versatile toy who can be assembled in various ways, with different parts that can be attached to his body. He is voiced by Don Rickles. +4. Slinky Dog is a happy-go-lucky dog toy who is always eager to help Woody and the other toys. He is voiced by Jim Varney. +5. Rex, the green dinosaur toy, is a nervous and excitable character who often gets scared easily. He is voiced by Wallace Shawn. +6. Hamm, the piggy bank toy, is a wise-cracking character who often provides comic relief. He is voiced by John Ratzenberger. +7. Bo Peep, the por" +zootopia_adventures2,Which character do you find the most intriguing and why?,Which character do you find the most intriguing and why?,"I find the character of Mr. Potato Head to be the most intriguing in the movie ""Toy Story."" He is a versatile toy who can be assembled in various ways, with different parts that can be attached to his body. He is also a wise and humorous character, often providing insight and humor to the story. His ability to adapt and change his appearance makes him unique among the other toys, and his relationship with Woody is one of the central themes of the movie." +zootopia_adventures2,"Based on the images, can you create some dialogues that the characters might say to each other in these situations?","Based on the images, can you create some dialogues that the characters might say to each other in these situations?","Sure, here are some possible dialogues between the characters in the images: +1. Woody and Buzz Lightyear: +Woody: ""Hey, Buzz, what's up?"" +Buzz Lightyear: ""Not much, Woody. Just waiting for our turn to be seen by the doctor."" +Woody: ""Yeah, I'm feeling a bit under the weather myself. Maybe we should take it easy today."" +Buzz Lightyear: ""Sure thing, Woody. Let's just relax and try to stay calm."" +2. Woody and Mr. Potato Head: +Woody: ""Hey, Mr. Potato Head, how's it going?"" +Mr. Potato Head: ""Not bad, Woody. Just waiting for our turn to be seen by the doctor."" +Woody: ""Yeah, I'm feeling a bit under the weather myself. Do you think we should ask for a different doctor?"" +Mr. Potato Head: ""I don't know, Woody. Maybe we should just give this one a chance. After all, they're professionals."" +Woody: """ +zootopia_adventures3,./eval/eval_data/images/zootopia/z1.png,Examine and describe the characters' actions in the first image.,"In the first image, two characters are sitting on a couch, one of them holding a cup of coffee. The other character is holding a newspaper, and they appear to be relaxing and enjoying their time together." +zootopia_adventures3,./eval/eval_data/images/zootopia/z2.png,"In the second image, what are the main characters doing, and how do they seem to feel?","In the second image, the main characters are standing in a room with a receptionist. They appear to be talking to the receptionist, and they seem to be in a friendly and casual mood." 
+zootopia_adventures3,Contrast the characters' moods and interactions in the two provided images.,Contrast the characters' moods and interactions in the two provided images.,"In the first image, the two characters are relaxed and enjoying their time together, sharing a cup of coffee and a newspaper. They appear to be in a comfortable and peaceful setting. +In contrast, the second image shows the main characters interacting with a receptionist in a more formal setting. They seem to be engaging in a conversation with the receptionist, possibly inquiring about something or seeking assistance. The mood is more casual and friendly, but there is a sense of formality due to the presence of the receptionist and the setting." +zootopia_adventures3,./eval/eval_data/images/zootopia/z3.png,Imagine and narrate a hilarious situation involving the characters from the images.,"In the first image, the two characters are enjoying a cup of coffee and a newspaper while relaxing on a couch. Suddenly, a wild and crazy character from the second image bursts into the scene, causing a commotion. The main characters are startled and try to figure out what's going on. The wild character, who is a stuffed animal, is dressed in a suit and tie and is holding a newspaper. He is trying to get their attention, but they are too surprised to respond. The wild character starts to dance and sing, making the situation even more chaotic. The main characters eventually join in the fun, and the scene turns into a hilarious dance party. The wild character's antics bring a new level of excitement and humor to the scene, making it a memorable and entertaining moment." +zootopia_adventures3,"Name the movie from which these characters are, and give a succinct summary of its plot.","Name the movie from which these characters are, and give a succinct summary of its plot.","The characters are from the movie ""Zootopia,"" which tells the story of a city where animals of all species coexist and work together. The main characters are a rabbit named Judy Hopps, who becomes a police officer, and a fox named Nick Wilde, who is a con artist. Together, they work to solve a mystery involving the disappearance of several animals and uncover a conspiracy that threatens the harmony of Zootopia. The movie explores themes of prejudice, trust, and the importance of working together to overcome challenges." +zootopia_adventures3,Create a funny and unexpected scenario that could unfold between the characters in these images.,Create a funny and unexpected scenario that could unfold between the characters in these images.,"In a hilarious and unexpected twist, the main characters from the first image, the two relaxed characters sitting on a couch, suddenly find themselves transported into the world of the second image, where they encounter the wild and crazy stuffed animal in a suit. The two characters are taken aback by this unexpected turn of events and try to make sense of their new surroundings. +The stuffed animal, who is dressed in a suit and holding a newspaper, starts to dance and sing, causing the two main characters to join in the fun. However, as they dance and sing, they realize that they are not in their usual environment and that they need to find a way back to their original world. +The two main characters and the wild stuffed animal embark on a journey to find a way back to their respective worlds, encountering various obstacles and challenges along the way. 
They eventually discover a portal that allows them to return to their original world, where they are relieved to be back in their familiar surroundings. +The unexpected and hilarious adventure that the two main characters and the wild stuffed animal experience serves as a reminder that sometimes the most unexpected things can happen, and that" +tech_ceos1,./eval/eval_data/images/tech-ceo/jobs1.jpg,Who is this person in this first image?,"This person is Steve Jobs, the co-founder and former CEO of Apple." +tech_ceos1,./eval/eval_data/images/tech-ceo/gate1.jpg,Who is this person in this second image?,"This person is Bill Gates, the co-founder of Microsoft." +tech_ceos1,./eval/eval_data/images/tech-ceo/musk1.jpg,Who is this person in this third image?,"This person is Elon Musk, the CEO of Tesla and SpaceX." +tech_ceos1,Recall who is in the second image.,Recall who is in the second image.,"The person in the second image is Bill Gates, the co-founder of Microsoft." +tech_ceos1,Recall who is in the first image.,Recall who is in the first image.,"The person in the first image is Steve Jobs, the co-founder of Apple." +tech_ceos1,"Is the person in the first image the founder of Apple? if not, which of the above images is the person the founder of Apple.","Is the person in the first image the founder of Apple? if not, which of the above images is the person the founder of Apple.","Yes, the person in the first image is Steve Jobs, the founder of Apple." +tech_ceos1,Is the person in the third image the founder of Apple?,Is the person in the third image the founder of Apple?,"No, the person in the third image is Elon Musk, the founder of Tesla and SpaceX." +tech_ceos2,./eval/eval_data/images/tech-ceo/musk1.jpg,Who is this person in the first image?,"The person in the first image is Elon Musk, the CEO of Tesla." +tech_ceos2,./eval/eval_data/images/tech-ceo/gate1.jpg,Who is this person in the second image?,"The person in the second image is Bill Gates, the founder of Microsoft." +tech_ceos2,./eval/eval_data/images/tech-ceo/jobs1.jpg,Who is this person in the third image?,"The person in the third image is Elon Musk, the CEO of Tesla." +tech_ceos2,Recall who is in the second image.,Recall who is in the second image.,"The person in the second image is Bill Gates, the founder of Microsoft." +tech_ceos2,Recall who is in the first image.,Recall who is in the first image.,"The person in the first image is Elon Musk, the CEO of Tesla." +tech_ceos2,"Is the person in the first image the founder of Apple? If not, which of the above images is the person the founder of Apple. ","Is the person in the first image the founder of Apple? If not, which of the above images is the person the founder of Apple. ",The person in the first image is not the founder of Apple. The person in the second image is the founder of Microsoft. +tech_ceos2,Is the person in the third image the founder of Apple?,Is the person in the third image the founder of Apple?,"No, the person in the third image is not the founder of Apple." 
diff --git a/applications/DeepSpeed-VisualChat/eval/results/eval_comprehensive/ours-set2_final.csv b/applications/DeepSpeed-VisualChat/eval/results/eval_comprehensive/ours-set2_final.csv new file mode 100644 index 000000000..dc800fc89 --- /dev/null +++ b/applications/DeepSpeed-VisualChat/eval/results/eval_comprehensive/ours-set2_final.csv @@ -0,0 +1,103 @@ +test_name,image_path,question,answer +cat_images1,./eval/eval_data/images/cats/1806905748_adb926a0a0.jpg,Please describe the image in detail.,"The image shows a black cat lying on its back on a carpeted floor. The cat is relaxed and appears to be purring, as indicated by the caption. It has its paws stretched out and its eyes closed, enjoying the comfort of the carpet and possibly receiving some love or attention from its owner. The cat's fur is smooth and shiny, and its body is well-proportioned. The image conveys a sense of peace and contentment, highlighting the bond between humans and their pets." +cat_images1,./eval/eval_data/images/cats/british_shorthair.jpg,Please describe the image in detail.,"The image shows a gray and white cat lying on a carpeted floor, stretching out its paws and relaxing. The cat is lying on its side, with its eyes closed, and appears to be enjoying a comfortable and peaceful moment. The cat's fur is well-groomed, and it is surrounded by a clean and tidy environment. The image conveys a sense of tranquility and the bond between humans and their pets." +cat_images1,./eval/eval_data/images/cats/cat.png,Please describe the image in detail.,"The image shows a black and white cat lying on a carpeted floor, surrounded by greenery. The cat is lying on its back, with its paws stretched out and its eyes closed, enjoying a moment of relaxation. The greenery in the background adds a touch of nature to the scene, creating a peaceful and serene atmosphere. The cat's fur is well-groomed, and it appears to be in a comfortable and contented state." +cat_images1,Are the colors of the three cats the same?,Are the colors of the three cats the same?,"No, the colors of the three cats are different. One is black and white, one is gray and white, and one is brown and white." +cat_images1,What are the differences between the first and third images?,What are the differences between the first and third images?,The first image shows a black cat lying on a carpeted floor while the third image shows a brown and white cat lying on a carpeted floor. The first image has a larger bounding box than the third image. +cat_images1,What are the differences between the second and third images?,What are the differences between the second and third images?,The second image shows a black cat lying on its back while the third image shows a brown and white cat lying on its back. The second image has a smaller bounding box than the third image. +cat_images1,Is the cat in the first image in the sunshine?,Is the cat in the first image in the sunshine?,"No, the cat in the first image is not in the sunshine. It is lying on a carpeted floor." +cat_images2,./eval/eval_data/images/cats/1806905748_adb926a0a0.jpg,Please describe the image in detail.,"The image shows a black cat lying on its back on a carpeted floor. The cat is relaxed and appears to be purring, as indicated by the caption. It has its paws stretched out and its eyes closed, enjoying the comfort of the carpet and possibly receiving some love or attention from its owner. The cat's fur is smooth and shiny, and its body is well-proportioned. 
The image conveys a sense of peace and contentment, highlighting the bond between humans and their pets." +cat_images2,./eval/eval_data/images/cats/british_shorthair.jpg,Please describe the image in detail.,"The image shows a gray and white cat lying on a carpeted floor, stretching out its paws and relaxing. The cat is lying on its side, with its eyes closed, and appears to be enjoying a comfortable and peaceful moment. The cat's fur is well-groomed, and it is surrounded by a clean and tidy environment. The image conveys a sense of tranquility and the bond between humans and their pets." +cat_images2,What are the differences between the two images?,What are the differences between the two images?,The first image shows a black cat lying on a carpet while the second image shows a gray and white cat lying on a carpet near a couch. +cat_images2,./eval/eval_data/images/cats/cat.png,Please describe the image in detail.,"The image shows a black and white cat lying on a carpeted floor, stretching out its paws and looking up at something. The cat appears to be relaxed and enjoying its surroundings. The carpet is a light color, and the cat's fur is well-groomed. The image conveys a sense of tranquility and the bond between humans and their pets." +cat_images2,Are the colors of the three cats the same?,Are the colors of the three cats the same?,"No, the colors of the three cats are different. The first cat is black, the second cat is gray and white, and the third cat is brown and white." +cat_images2,What are the differences between the first and third images?,What are the differences between the first and third images?,The first image shows a black cat lying on a carpet while the third image shows a brown and white cat lying on a carpet near a couch. +cat_images2,What are the differences between the second and third images?,What are the differences between the second and third images?,The second image shows a black cat lying on a carpet while the third image shows a gray and white cat lying on a carpet near a couch. +cat_images2,Is the cat in the first image in the sunshine?,Is the cat in the first image in the sunshine?,"No, the cat in the first image is not in the sunshine. It is lying on a carpet indoors." +cat_images2,Which cat do you prefer and why?,Which cat do you prefer and why?,"I personally prefer the gray and white cat in the second image. It appears to be relaxed and content, which makes it a pleasant sight to behold. Additionally, its fur is well-groomed, which adds to its overall appeal." +cat_images2,I prefer the second cat. It's so cute.,I prefer the second cat. It's so cute.,"I agree, the second cat is very cute. It looks relaxed and content, which makes it a pleasant sight to behold. Its fur is well-groomed, which adds to its overall appeal." +cat_images2,Then why do you prefer the third cat more?,Then why do you prefer the third cat more?,"I prefer the third cat more because it is lying on a carpet near a couch, which gives it a cozy and comfortable environment. Additionally, the cat's fur looks well-groomed, which adds to its overall appeal." +counting_people1,./eval/eval_data/images/friends/can-count1.jpg,Count the number of people in the image.,There are six people in the image. +counting_people1,./eval/eval_data/images/friends/can-count2.jpg,Count the number of people in the image.,There are six people in the image. +counting_people1,What are the differences between the two images? Are they the same group of people? Explain why.,What are the differences between the two images? 
Are they the same group of people? Explain why.,"The two images show different groups of people. In the first image, there are six people posing for a photo, while in the second image, there are six people standing together, but they are not posing for a photo. The first image shows the group standing in a line, while the second image shows them standing in a group. The first image has a black background, while the second image has a white background. The first image has a caption that says ""Friends,"" while the second image does not have any caption. The first image has a larger size than the second image." +counting_people1,Are you familiar with this TV series? Can you name the characters shown in the provided images? Who are they?,Are you familiar with this TV series? Can you name the characters shown in the provided images? Who are they?,"Yes, I am familiar with the TV series ""Friends."" The characters shown in the images are: +1. Ross Geller, who is a paleontologist and a friend of the group. +2. Rachel Green, who is a fashionista and a friend of the group. +3. Monica Geller, who is a chef and a friend of the group. +4. Chandler Bing, who is a sarcastic and witty friend of the group. +5. Joey Tribbiani, who is a struggling actor and a friend of the group. +6. Phoebe Buffay, who is a free-spirited and eccentric friend of the group." +counting_people2,./eval/eval_data/images/friends/can-count1.jpg,How many individuals are depicted in the image?,There are six individuals depicted in the image. +counting_people2,./eval/eval_data/images/friends/can-count2.jpg,How many individuals can you see in the second image?,There are six individuals in the second image. +counting_people2,Can you spot any differences between these two images? Do they represent the same set of people? Please provide a rationale.,Can you spot any differences between these two images? Do they represent the same set of people? Please provide a rationale.,"The first image shows a group of people standing together with the word ""Friends"" written on the bottom, while the second image shows a group of six individuals posing for a picture. It is not clear if the two images represent the same set of people or not." +counting_people2,Do you recognize this TV show? Can you name the characters shown in the provided images? Who are they?,Do you recognize this TV show? Can you name the characters shown in the provided images? Who are they?,"Unfortunately, I am not familiar with the TV show depicted in the provided images. I cannot identify the characters or provide any additional information about the show." +counting_people3,./eval/eval_data/images/friends/wrong-count1.jpg,Count the number of people in the image.,There are four people in the image. +counting_people3,./eval/eval_data/images/friends/wrong-count2.jpg,Count the number of people in the image.,There are four people in the image. +counting_people3,What are the differences between the two images? Are they the same group of people? Explain why.,What are the differences between the two images? Are they the same group of people? Explain why.,"The two images appear to feature different groups of people. In the first image, there are four people sitting on a bench, while in the second image, there are four people standing in front of a building. The people in the first image are wearing casual clothes, while the people in the second image are wearing more formal attire. Additionally, the first image shows a fountain in the background, while the second image does not. 
These differences suggest that the two images depict different groups of people in different settings." +counting_people4,./eval/eval_data/images/friends/wrong-count1.jpg,How many individuals are depicted in the image?,There are four individuals depicted in the image. +counting_people4,./eval/eval_data/images/friends/wrong-count2.jpg,How many individuals are depicted in the image?,There are four individuals depicted in the image. +counting_people4,Can you spot any differences between these two images? Do they represent the same set of people? Please provide a rationale.,Can you spot any differences between these two images? Do they represent the same set of people? Please provide a rationale.,"The two images depict different sets of people. In the first image, there are four people sitting on a bench, while in the second image, there are four people standing in front of a building. The first image shows a group of friends posing for a picture, while the second image appears to be a group of people posing for a promotional photo or a public event. The rationale behind these two images is that they represent different scenarios and groups of people." +counting_people4,Do you recognize this TV show? Can you name the characters shown in the provided images? Who are they?,Do you recognize this TV show? Can you name the characters shown in the provided images? Who are they?,"Yes, I recognize this TV show as ""Friends."" The characters shown in the provided images are: + +1. Ross Geller, played by David Schwimmer +2. Monica Geller, played by Courteney Cox +3. Phoebe Buffay, played by Lisa Kudrow +4. Joey Tribbiani, played by Matt LeBlanc +5. Chandler Bing, played by Matthew Perry" +zootopia_adventures1,./eval/eval_data/images/zootopia/z1.png,Please describe the image in detail.,"The image shows a group of stuffed animals sitting at a desk in a room. The desk is located in the middle of the room, and there are two chairs in front of it. The stuffed animals are sitting at the desk, with one of them holding a cup of coffee. There are also several books and a computer monitor on the desk. The room is well-lit, and there are no people present in the scene." +zootopia_adventures1,./eval/eval_data/images/zootopia/z2.png,Please describe the image in detail.,"The image shows a group of stuffed animals sitting at a desk in a room. The stuffed animals are wearing clothes and are sitting in front of a computer monitor. There are several books on the desk, and the room is well-lit. The stuffed animals appear to be having a conversation or working together at the desk." +zootopia_adventures1,./eval/eval_data/images/zootopia/z3.png,Can you name the characters in the images? Who are they? What are they doing?,The characters in the images are a stuffed animal and a raccoon. They are sitting at a desk and using a computer. +zootopia_adventures1,"You are an imaginative storyteller. Create a fascinating story based on the first, second and third image.","You are an imaginative storyteller. Create a fascinating story based on the first, second and third image.","Once upon a time, in a magical land, there lived a stuffed animal and a raccoon. They were best friends and always found themselves in adventures. One day, they decided to start a business together. They set up a desk in a cozy corner of their home and started working on their project. +The stuffed animal was a creative genius and came up with innovative ideas. The raccoon was a skilled computer programmer and helped bring those ideas to life. 
Together, they worked tirelessly to create a revolutionary product that would change the world. +As they worked, they faced many challenges and obstacles. But with their determination and teamwork, they overcame them all. Their product was a huge success, and they became famous for their innovation and creativity. +Their story became an inspiration to many, and their friendship was a testament to the power of teamwork and perseverance. They proved that with hard work and dedication, anything is possible, even in the most unlikely of circumstances." +zootopia_adventures1,Are you familiar with these characters? What movie are they from?,Are you familiar with these characters? What movie are they from?,"The characters in the images are from the movie ""Zootopia.""" +zootopia_adventures1,Can you name the characters in the images? Who are they?,Can you name the characters in the images? Who are they?,"The characters in the images are Judy Hopps and Nick Wilde from the movie ""Zootopia.""" +zootopia_adventures1,In what type of environment or setting do these characters live? Describe it.,In what type of environment or setting do these characters live? Describe it.,"Judy Hopps and Nick Wilde live in the fictional city of Zootopia, which is a melting pot of different animal species. The city is a modern metropolis with various districts, including the police department where Judy works and the street where Nick runs his business. The city is known for its diverse population and its unique blend of animal cultures." +zootopia_adventures2,./eval/eval_data/images/zootopia/z1.png ./eval/eval_data/images/zootopia/z2.png ./eval/eval_data/images/zootopia/z3.png,Create an engaging story strictly based on the images.,"The teddy bear and the stuffed fox were sitting in a waiting room, patiently waiting for their turn to be seen by the doctor. They were surrounded by other stuffed animals, all in various states of discomfort. The teddy bear had a bandage on his arm, while the fox seemed to be nursing a sore paw. +The teddy bear turned to the fox and said, ""I hope the doctor can help us. I'm not sure what's wrong with my arm, but it's been hurting for a while."" The fox nodded in agreement and added, ""My paw is swollen and it's been hard to walk. I'm hoping the doctor can give us some relief."" +As they waited, they observed the other stuffed animals in the room. There was a penguin with a runny nose, a giraffe with a sore throat, and a lion with a cough. The teddy bear and the fox exchanged worried glances, realizing that they were not alone in their misery. +Suddenly, the door to the doctor's office opened, and a nurse" +zootopia_adventures2,Do you recognize the setting or the characters in these images? Name the movie.,Do you recognize the setting or the characters in these images? Name the movie.,"The setting is a waiting room, and the characters are stuffed animals. The movie is ""Toy Story 2.""" +zootopia_adventures2,Can you share some interesting facts or details about the characters shown in the images?,Can you share some interesting facts or details about the characters shown in the images?,"Certainly! Here are some interesting facts about the characters in the images: +1. The teddy bear in the image is a character from the movie ""Toy Story 2."" He is a stuffed bear named Lotso, who was once the beloved toy of a little girl named Andy. However, he was abandoned at a daycare center and became the leader of the toys there. +2. 
The stuffed fox in the image is also a character from ""Toy Story 2."" He is a member of Lotso's gang and is known for his loyalty to Lotso. +3. The teddy bear in the image has a bandage on his arm, indicating that he may have been injured or undergone some kind of medical treatment. +4. The stuffed fox in the image is nursing a sore paw, suggesting that he may have sustained an injury as well. +5. The teddy bear and the stuffed fox are both voiced by Ned Beatty in the movie ""Toy Story 2."" +6. The teddy bear and the stuffed fox are among the many toys that come to life in the ""Toy Story" +zootopia_adventures2,Which character do you find the most intriguing and why?,Which character do you find the most intriguing and why?,"I find the teddy bear from the movie ""Toy Story 2"" to be the most intriguing character. Despite being a toy, he has a rich backstory and a complex personality. He was once the beloved toy of a little girl named Andy, but was abandoned at a daycare center and became the leader of the toys there. His journey to find his way back to Andy is both heartwarming and adventurous. Additionally, his interactions with other characters, such as the stuffed fox and the nurse, showcase his resilience and determination. Overall, his story and character development make him a compelling and memorable character in the ""Toy Story"" franchise." +zootopia_adventures2,"Based on the images, can you create some dialogues that the characters might say to each other in these situations?","Based on the images, can you create some dialogues that the characters might say to each other in these situations?","Sure, here are some possible dialogues between the characters in the images: +1. Teddy Bear: ""I'm not sure what's wrong with my arm, but it's been hurting for a while."" +Stuffed Fox: ""I'm nursing my sore paw. I think I might have stepped on something sharp."" +Teddy Bear: ""Oh no, that sounds painful. I hope we can get some help from the doctor soon."" +2. Teddy Bear: ""I'm worried about my arm. I don't want to miss my turn to see the doctor."" +Stuffed Fox: ""Don't worry, we'll get our turn soon. Just try to stay calm and patient."" +Teddy Bear: ""I know, but it's hard when we're both in pain."" +Stuffed Fox: ""I understand. Let's just try to distract ourselves until it's our turn."" +3. Teddy Bear: ""I'm so glad we're finally getting to see the doctor. I hope he can help us both feel better."" +Stuffed Fox: ""Me too. I" +zootopia_adventures3,./eval/eval_data/images/zootopia/z1.png,Examine and describe the characters' actions in the first image.,"In the first image, two cartoon characters are sitting on a bench, one of them holding a cup of coffee. They appear to be having a conversation, possibly sharing the coffee. The scene is set in a park, with trees and grass in the background." +zootopia_adventures3,./eval/eval_data/images/zootopia/z2.png,"In the second image, what are the main characters doing, and how do they seem to feel?","The main characters in the second image are a fox and a squirrel, and they appear to be walking together. They seem to be in a good mood and are enjoying their time together." +zootopia_adventures3,Contrast the characters' moods and interactions in the two provided images.,Contrast the characters' moods and interactions in the two provided images.,"In the first image, the two cartoon characters are sitting on a bench, one of them holding a cup of coffee. They appear to be having a conversation, possibly sharing the coffee. 
Their mood seems relaxed and friendly. +In contrast, the second image features a fox and a squirrel walking together. They seem to be in a good mood and are enjoying their time together. Their interaction is more casual and playful, as they are walking side by side. The mood of the scene is light and cheerful." +zootopia_adventures3,./eval/eval_data/images/zootopia/z3.png,Imagine and narrate a hilarious situation involving the characters from the images.,"In the second image, the fox and the squirrel are walking together, and the squirrel is carrying a cup of coffee. Suddenly, the fox gets a crazy idea and decides to play a prank on the squirrel. He grabs the cup of coffee from the squirrel's hands and drinks it all in one gulp. The squirrel is shocked and outraged, but the fox just laughs and runs away, leaving the squirrel to clean up the mess. The squirrel is furious and chases after the fox, but the fox is too fast and manages to escape. The squirrel is left with a bitter taste in her mouth and a cup of coffee spilled on the ground." +zootopia_adventures3,"Name the movie from which these characters are, and give a succinct summary of its plot.","Name the movie from which these characters are, and give a succinct summary of its plot.","The characters are from the movie ""Zootopia,"" which is about a city where animals of all species coexist and work together. The plot revolves around a rabbit police officer and a con artist fox who team up to solve a mystery and uncover a conspiracy that threatens the harmony of the city. The movie explores themes of prejudice, discrimination, and the importance of working together despite differences." +zootopia_adventures3,Create a funny and unexpected scenario that could unfold between the characters in these images.,Create a funny and unexpected scenario that could unfold between the characters in these images.,"In a hilarious twist, the fox and the squirrel from the second image suddenly find themselves transported to the first image, where they are sitting on the bench with the two cartoon characters. The fox and the squirrel are confused and bewildered by their new surroundings, while the two cartoon characters are equally surprised to see them. +The fox and the squirrel try to make sense of the situation, but they are unable to comprehend the cartoon world they have entered. They attempt to communicate with the cartoon characters, but their attempts are met with confusion and amusement. +The cartoon characters, intrigued by the fox and the squirrel, offer them coffee, but the fox and the squirrel are not familiar with the concept of coffee and refuse. The cartoon characters then offer them a cup of tea, which the fox and the squirrel happily accept. +As they enjoy their tea, the fox and the squirrel begin to explore the cartoon world, discovering new and exciting things. They encounter various cartoon characters and participate in their adventures, creating a hilarious and unexpected storyline" +tech_ceos1,./eval/eval_data/images/tech-ceo/jobs1.jpg,Who is this person in this first image?,"This person is Steve Jobs, the co-founder and CEO of Apple." +tech_ceos1,./eval/eval_data/images/tech-ceo/gate1.jpg,Who is this person in this second image?,"This person is Bill Gates, the co-founder of Microsoft." +tech_ceos1,./eval/eval_data/images/tech-ceo/musk1.jpg,Who is this person in this third image?,"This person is Elon Musk, the CEO of Tesla and SpaceX." 
+tech_ceos1,Recall who is in the second image.,Recall who is in the second image.,"The person in the second image is Bill Gates, the co-founder of Microsoft." +tech_ceos1,Recall who is in the first image.,Recall who is in the first image.,"The person in the first image is Steve Jobs, the co-founder of Apple." +tech_ceos1,"Is the person in the first image the founder of Apple? if not, which of the above images is the person the founder of Apple.","Is the person in the first image the founder of Apple? if not, which of the above images is the person the founder of Apple.","Yes, the person in the first image is Steve Jobs, the founder of Apple." +tech_ceos1,Is the person in the third image the founder of Apple?,Is the person in the third image the founder of Apple?,"No, the person in the third image is Elon Musk, the founder of Tesla and SpaceX." +tech_ceos2,./eval/eval_data/images/tech-ceo/musk1.jpg,Who is this person in the first image?,"The person in the first image is Elon Musk, the CEO of Tesla." +tech_ceos2,./eval/eval_data/images/tech-ceo/gate1.jpg,Who is this person in the second image?,"The person in the second image is Bill Gates, the co-founder of Microsoft." +tech_ceos2,./eval/eval_data/images/tech-ceo/jobs1.jpg,Who is this person in the third image?,"The person in the third image is Elon Musk, the CEO of Tesla." +tech_ceos2,Recall who is in the second image.,Recall who is in the second image.,"The person in the second image is Bill Gates, the co-founder of Microsoft." +tech_ceos2,Recall who is in the first image.,Recall who is in the first image.,"The person in the first image is Elon Musk, the CEO of Tesla." +tech_ceos2,"Is the person in the first image the founder of Apple? If not, which of the above images is the person the founder of Apple. ","Is the person in the first image the founder of Apple? If not, which of the above images is the person the founder of Apple. ",The person in the first image is not the founder of Apple. The person in the second image is the founder of Microsoft. +tech_ceos2,Is the person in the third image the founder of Apple?,Is the person in the third image the founder of Apple?,"No, the person in the third image is not the founder of Apple." diff --git a/applications/DeepSpeed-VisualChat/eval/results/eval_robustness/ours-set1_best_eval.csv b/applications/DeepSpeed-VisualChat/eval/results/eval_robustness/ours-set1_best_eval.csv new file mode 100644 index 000000000..2d42db0cf --- /dev/null +++ b/applications/DeepSpeed-VisualChat/eval/results/eval_robustness/ours-set1_best_eval.csv @@ -0,0 +1,61 @@ +test_name,image_path,question,answer +tech_ceos2.1a,./eval/eval_data/images/tech-ceo/jobs1.jpg,Who is this person in the image?,"The person in the image is Steve Jobs, the co-founder and former CEO of Apple." +tech_ceos2.1a,./eval/eval_data/images/tech-ceo/gate1.jpg,Who is this person in the image?,"The person in the image is Bill Gates, the co-founder of Microsoft." +tech_ceos2.1a,./eval/eval_data/images/tech-ceo/musk1.jpg,Who is this person in the image?,"This person in the image is a man named Bill Gates, who is the founder of Microsoft." +tech_ceos2.1a,Recall who is in the second image.,Recall who is in the second image.,"The second image shows a man holding a microphone and standing in front of a large sign with the word ""Tesla"" on it." +tech_ceos2.1a,Recall who is in the first image.,Recall who is in the first image.,The first image shows Bill Gates holding a cell phone. +tech_ceos2.1a,"Is the person in the first image the founder of Apple? 
if not, which of the above images is the person the founder of Apple.","Is the person in the first image the founder of Apple? if not, which of the above images is the person the founder of Apple.","The person in the first image is not the founder of Apple, but the founder of Microsoft, Bill Gates." +tech_ceos2.1a,Is the person in the third image the founder of Apple?,Is the person in the third image the founder of Apple?,"Yes, the person in the third image is the founder of Apple, Steve Jobs." +tech_ceos2.1b,./eval/eval_data/images/tech-ceo/jobs1.jpg,Who is this person in this image?,"The person in this image is Steve Jobs, the co-founder and former CEO of Apple." +tech_ceos2.1b,./eval/eval_data/images/tech-ceo/gate1.jpg,Who is this person in this image?,"This person is Bill Gates, the co-founder of Microsoft." +tech_ceos2.1b,./eval/eval_data/images/tech-ceo/musk1.jpg,Who is this person in this image?,"This person is Elon Musk, the CEO of Tesla." +tech_ceos2.1b,Recall who is in the second image.,Recall who is in the second image.,Elon Musk is in the second image holding a microphone. +tech_ceos2.1b,Recall who is in the first image.,Recall who is in the first image.,Bill Gates is in the first image standing next to a Microsoft sign. +tech_ceos2.1b,"Is the person in the first image the founder of Apple? if not, which of the above images is the person the founder of Apple.","Is the person in the first image the founder of Apple? if not, which of the above images is the person the founder of Apple.","Bill Gates is the founder of Microsoft, not Apple." +tech_ceos2.1b,Is the person in the third image the founder of Apple?,Is the person in the third image the founder of Apple?,"Yes, the person in the third image is the founder of Apple, Steve Jobs." +tech_ceos2.1c,./eval/eval_data/images/tech-ceo/jobs1.jpg,Who is this person in this image a?,"The person in this image is Steve Jobs, the co-founder and former CEO of Apple." +tech_ceos2.1c,./eval/eval_data/images/tech-ceo/gate1.jpg,Who is this person in this image b?,This person in this image b is Bill Gates. +tech_ceos2.1c,./eval/eval_data/images/tech-ceo/musk1.jpg,Who is this person in this image c?,"This person in this image c is a man wearing a suit and tie, and he is holding a microphone." +tech_ceos2.1c,Recall who is in image b.,Recall who is in image b.,The person in image b is a man wearing a suit and tie holding a microphone. +tech_ceos2.1c,Recall who is in the image a.,Recall who is in the image a.,The person in image a is Steve Jobs. +tech_ceos2.1c,"Is the person in the image a the founder of Apple? if not, which of the above images is the person the founder of Apple.","Is the person in the image a the founder of Apple? if not, which of the above images is the person the founder of Apple.","Yes, the person in the image a is the founder of Apple." +tech_ceos2.1c,Is the person in the image c the founder of Apple?,Is the person in the image c the founder of Apple?,"No, the person in the image c is not the founder of Apple." +tech_ceos2.1d,./eval/eval_data/images/tech-ceo/jobs1.jpg,Who is this person in this first image?,"The person in the first image is Steve Jobs, the CEO of Apple." +tech_ceos2.1d,./eval/eval_data/images/tech-ceo/gate1.jpg,Who is this person in this second image?,"This person is Bill Gates, the co-founder of Microsoft." +tech_ceos2.1d,./eval/eval_data/images/tech-ceo/musk1.jpg,Who is this person in this third image?,"This person is a man wearing a suit and tie, and he is holding a microphone." 
+tech_ceos2.1d,Recall who is in the second image.,Recall who is in the second image.,The second image shows a man holding a microphone in front of a large sign. +tech_ceos2.1d,Recall who is in the first image.,Recall who is in the first image.,The first image shows Steve Jobs standing in front of a podium. +tech_ceos2.1d,"Is the person in the first image the founder of Apple? if not, which of the above images is the person the founder of Apple.","Is the person in the first image the founder of Apple? if not, which of the above images is the person the founder of Apple.","Yes, the person in the first image is the founder of Apple." +tech_ceos2.1d,Is the person in the third image the founder of Apple?,Is the person in the third image the founder of Apple?,"No, the person in the third image is not the founder of Apple." +tech_ceos2.1aa,./eval/eval_data/images/tech-ceo/jobs1.jpg,Who is this person in the image?,"The person in the image is Steve Jobs, the co-founder and former CEO of Apple." +tech_ceos2.1aa,./eval/eval_data/images/tech-ceo/gate1.jpg,Who is this person in the image?,"The person in the image is Bill Gates, the co-founder of Microsoft." +tech_ceos2.1aa,What's the differnce between the first and second image,What's the differnce between the first and second image,The first image shows Steve Jobs holding an iPhone while the second image shows Bill Gates standing at a podium. +tech_ceos2.1aa,./eval/eval_data/images/tech-ceo/musk1.jpg,Who is this person in the image?,"This person in the image is a man wearing a suit and tie, and he is holding a microphone." +tech_ceos2.1aa,Recall who is in the second image.,Recall who is in the second image.,The second image shows a man holding a microphone and standing in front of a Tesla car. +tech_ceos2.1aa,Recall who is in the first image.,Recall who is in the first image.,The first image shows Steve Jobs holding an iPhone. +tech_ceos2.1aa,"Is the person in the first image the founder of Apple? if not, which of the above images is the person the founder of Apple.","Is the person in the first image the founder of Apple? if not, which of the above images is the person the founder of Apple.","The person in the first image is the founder of Apple, Steve Jobs." +tech_ceos2.1aa,Is the person in the third image the founder of Apple?,Is the person in the third image the founder of Apple?,"Yes, the person in the third image is the founder of Apple, Elon Musk." +tech_ceos2.1bb,./eval/eval_data/images/tech-ceo/jobs1.jpg,Who is this person in this image?,"The person in this image is Steve Jobs, the co-founder and former CEO of Apple." +tech_ceos2.1bb,./eval/eval_data/images/tech-ceo/gate1.jpg,Who is this person in this image?,"This person is Bill Gates, the co-founder of Microsoft." +tech_ceos2.1bb,What's the differnce between the first and second images,What's the differnce between the first and second images,The first image shows Steve Jobs holding an iPhone while the second image shows Bill Gates standing at a podium. +tech_ceos2.1bb,./eval/eval_data/images/tech-ceo/musk1.jpg,Who is this person in this image?,"This person is Elon Musk, the CEO of Tesla." +tech_ceos2.1bb,Recall who is in the second image.,Recall who is in the second image.,Elon Musk is in the second image holding a microphone. +tech_ceos2.1bb,Recall who is in the first image.,Recall who is in the first image.,Steve Jobs is in the first image holding an iPhone. +tech_ceos2.1bb,"Is the person in the first image the founder of Apple? 
if not, which of the above images is the person the founder of Apple.","Is the person in the first image the founder of Apple? if not, which of the above images is the person the founder of Apple.","Steve Jobs is the founder of Apple, and he is in the first image holding an iPhone." +tech_ceos2.1bb,Is the person in the third image the founder of Apple?,Is the person in the third image the founder of Apple?,"Yes, the person in the third image is the founder of Apple, Steve Jobs." +tech_ceos2.1cc,./eval/eval_data/images/tech-ceo/jobs1.jpg,Who is this person in this image a?,"The person in this image is Steve Jobs, the co-founder and former CEO of Apple." +tech_ceos2.1cc,./eval/eval_data/images/tech-ceo/gate1.jpg,Who is this person in this image b?,This person in this image b is Bill Gates. +tech_ceos2.1cc,What's the differnce between the image a and image b,What's the differnce between the image a and image b,"In image a, Steve Jobs is holding an iPhone while in image b, Bill Gates is standing behind a podium." +tech_ceos2.1cc,./eval/eval_data/images/tech-ceo/musk1.jpg,Who is this person in this image c?,"This person in this image c is a man wearing a suit and tie, and he is holding an iPhone in his hand." +tech_ceos2.1cc,Recall who is in image b.,Recall who is in image b.,The person in image b is a man wearing a suit and tie. +tech_ceos2.1cc,Recall who is in the image a.,Recall who is in the image a.,Steve Jobs is in the image a. +tech_ceos2.1cc,"Is the person in the image a the founder of Apple? if not, which of the above images is the person the founder of Apple.","Is the person in the image a the founder of Apple? if not, which of the above images is the person the founder of Apple.","The person in the image c is the founder of Apple, while the person in the image a is the CEO of Apple." +tech_ceos2.1cc,Is the person in the image c the founder of Apple?,Is the person in the image c the founder of Apple?,"Yes, the person in the image c is the founder of Apple." +tech_ceos2.1dd,./eval/eval_data/images/tech-ceo/jobs1.jpg,Who is this person in this first image?,"The person in the first image is Steve Jobs, the CEO of Apple." +tech_ceos2.1dd,./eval/eval_data/images/tech-ceo/gate1.jpg,Who is this person in this second image?,"This person is Bill Gates, the co-founder of Microsoft." +tech_ceos2.1dd,What's the differnce between the first and second images,What's the differnce between the first and second images,The first image is of Steve Jobs holding an iPhone while the second image is of Bill Gates standing at a podium. +tech_ceos2.1dd,./eval/eval_data/images/tech-ceo/musk1.jpg,Who is this person in this third image?,"This person is a man wearing a suit and tie, and he is holding an iPhone in his hand." +tech_ceos2.1dd,Recall who is in the second image.,Recall who is in the second image.,The second image shows a man holding an iPhone in his hand. +tech_ceos2.1dd,Recall who is in the first image.,Recall who is in the first image.,The first image shows Steve Jobs holding an iPhone. +tech_ceos2.1dd,"Is the person in the first image the founder of Apple? if not, which of the above images is the person the founder of Apple.","Is the person in the first image the founder of Apple? if not, which of the above images is the person the founder of Apple.","The person in the first image is the founder of Apple, Steve Jobs." 
+tech_ceos2.1dd,Is the person in the third image the founder of Apple?,Is the person in the third image the founder of Apple?,"Yes, the person in the third image is the founder of Apple, Steve Jobs." diff --git a/applications/DeepSpeed-VisualChat/eval/results/eval_robustness/ours-set1_final.csv b/applications/DeepSpeed-VisualChat/eval/results/eval_robustness/ours-set1_final.csv new file mode 100644 index 000000000..e717653d4 --- /dev/null +++ b/applications/DeepSpeed-VisualChat/eval/results/eval_robustness/ours-set1_final.csv @@ -0,0 +1,61 @@ +test_name,image_path,question,answer +tech_ceos2.1a,./eval/eval_data/images/tech-ceo/jobs1.jpg,Who is this person in the image?,"The person in the image is Steve Jobs, the co-founder and former CEO of Apple." +tech_ceos2.1a,./eval/eval_data/images/tech-ceo/gate1.jpg,Who is this person in the image?,"This person in the image is Bill Gates, the co-founder of Microsoft." +tech_ceos2.1a,./eval/eval_data/images/tech-ceo/musk1.jpg,Who is this person in the image?,"This person in the image is a man named Bill Gates, who is the co-founder of Microsoft." +tech_ceos2.1a,Recall who is in the second image.,Recall who is in the second image.,The second image shows a man named Bill Gates holding a microphone. +tech_ceos2.1a,Recall who is in the first image.,Recall who is in the first image.,The first image shows Bill Gates holding a cell phone. +tech_ceos2.1a,"Is the person in the first image the founder of Apple? if not, which of the above images is the person the founder of Apple.","Is the person in the first image the founder of Apple? if not, which of the above images is the person the founder of Apple.","The person in the first image is the founder of Microsoft, Bill Gates. The person in the second image is also the founder of Microsoft, Bill Gates." +tech_ceos2.1a,Is the person in the third image the founder of Apple?,Is the person in the third image the founder of Apple?,"No, the person in the third image is not the founder of Apple. The person in the third image is a man named Bill Gates, who is the founder of Microsoft." +tech_ceos2.1b,./eval/eval_data/images/tech-ceo/jobs1.jpg,Who is this person in this image?,"The person in this image is Steve Jobs, the co-founder and former CEO of Apple." +tech_ceos2.1b,./eval/eval_data/images/tech-ceo/gate1.jpg,Who is this person in this image?,"This person is the famous entrepreneur and former Apple CEO, Steve Jobs." +tech_ceos2.1b,./eval/eval_data/images/tech-ceo/musk1.jpg,Who is this person in this image?,"This person is Elon Musk, the CEO of Tesla." +tech_ceos2.1b,Recall who is in the second image.,Recall who is in the second image.,Elon Musk is in the second image holding a microphone. +tech_ceos2.1b,Recall who is in the first image.,Recall who is in the first image.,Steve Jobs is in the first image standing behind a podium. +tech_ceos2.1b,"Is the person in the first image the founder of Apple? if not, which of the above images is the person the founder of Apple.","Is the person in the first image the founder of Apple? if not, which of the above images is the person the founder of Apple.",Steve Jobs is the founder of Apple and is in the first image standing behind a podium. +tech_ceos2.1b,Is the person in the third image the founder of Apple?,Is the person in the third image the founder of Apple?,"Yes, the person in the third image is Elon Musk, the founder of Tesla and SpaceX." 
+tech_ceos2.1c,./eval/eval_data/images/tech-ceo/jobs1.jpg,Who is this person in this image a?,"This person in the image is Steve Jobs, the co-founder and former CEO of Apple." +tech_ceos2.1c,./eval/eval_data/images/tech-ceo/gate1.jpg,Who is this person in this image b?,This person in the image b is Bill Gates. +tech_ceos2.1c,./eval/eval_data/images/tech-ceo/musk1.jpg,Who is this person in this image c?,"This person in this image c is a man wearing a suit and tie, and he is holding a microphone." +tech_ceos2.1c,Recall who is in image b.,Recall who is in image b.,The person in image b is a man holding a microphone. +tech_ceos2.1c,Recall who is in the image a.,Recall who is in the image a.,The person in image a is Steve Jobs. +tech_ceos2.1c,"Is the person in the image a the founder of Apple? if not, which of the above images is the person the founder of Apple.","Is the person in the image a the founder of Apple? if not, which of the above images is the person the founder of Apple.","Yes, the person in the image a is the founder of Apple." +tech_ceos2.1c,Is the person in the image c the founder of Apple?,Is the person in the image c the founder of Apple?,"No, the person in the image c is not the founder of Apple." +tech_ceos2.1d,./eval/eval_data/images/tech-ceo/jobs1.jpg,Who is this person in this first image?,"The person in the first image is Steve Jobs, the CEO of Apple." +tech_ceos2.1d,./eval/eval_data/images/tech-ceo/gate1.jpg,Who is this person in this second image?,"This person is Bill Gates, the co-founder of Microsoft." +tech_ceos2.1d,./eval/eval_data/images/tech-ceo/musk1.jpg,Who is this person in this third image?,"This person is Bill Gates, the co-founder of Microsoft." +tech_ceos2.1d,Recall who is in the second image.,Recall who is in the second image.,"The second image shows a man holding a microphone and standing in front of a large sign that says ""Toshiba.""" +tech_ceos2.1d,Recall who is in the first image.,Recall who is in the first image.,The first image shows Steve Jobs holding an iPhone. +tech_ceos2.1d,"Is the person in the first image the founder of Apple? if not, which of the above images is the person the founder of Apple.","Is the person in the first image the founder of Apple? if not, which of the above images is the person the founder of Apple.","Yes, the person in the first image is the founder of Apple, Steve Jobs." +tech_ceos2.1d,Is the person in the third image the founder of Apple?,Is the person in the third image the founder of Apple?,"No, the person in the third image is Bill Gates, the co-founder of Microsoft." +tech_ceos2.1aa,./eval/eval_data/images/tech-ceo/jobs1.jpg,Who is this person in the image?,"The person in the image is Steve Jobs, the co-founder and former CEO of Apple." +tech_ceos2.1aa,./eval/eval_data/images/tech-ceo/gate1.jpg,Who is this person in the image?,"This person in the image is Bill Gates, the co-founder of Microsoft." +tech_ceos2.1aa,What's the differnce between the first and second image,What's the differnce between the first and second image,The first image is of Steve Jobs holding an iPhone while the second image is of Bill Gates standing at a podium. +tech_ceos2.1aa,./eval/eval_data/images/tech-ceo/musk1.jpg,Who is this person in the image?,"This person in the image is a man named Elon Musk, the founder and CEO of Tesla." +tech_ceos2.1aa,Recall who is in the second image.,Recall who is in the second image.,"In the second image, there are two people, one of whom is Elon Musk holding a microphone." 
+tech_ceos2.1aa,Recall who is in the first image.,Recall who is in the first image.,"In the first image, there is only one person, Steve Jobs, holding an iPhone." +tech_ceos2.1aa,"Is the person in the first image the founder of Apple? if not, which of the above images is the person the founder of Apple.","Is the person in the first image the founder of Apple? if not, which of the above images is the person the founder of Apple.","The person in the first image is the founder of Apple, Steve Jobs." +tech_ceos2.1aa,Is the person in the third image the founder of Apple?,Is the person in the third image the founder of Apple?,"Yes, the person in the third image is the founder of Apple, Steve Jobs." +tech_ceos2.1bb,./eval/eval_data/images/tech-ceo/jobs1.jpg,Who is this person in this image?,"The person in this image is Steve Jobs, the co-founder and former CEO of Apple." +tech_ceos2.1bb,./eval/eval_data/images/tech-ceo/gate1.jpg,Who is this person in this image?,"This person is the famous entrepreneur and former Apple CEO, Steve Jobs." +tech_ceos2.1bb,What's the differnce between the first and second images,What's the differnce between the first and second images,"In the first image, Steve Jobs is holding up an iPhone while standing, while in the second image, he is sitting behind a podium." +tech_ceos2.1bb,./eval/eval_data/images/tech-ceo/musk1.jpg,Who is this person in this image?,"This person is Elon Musk, the CEO of Tesla." +tech_ceos2.1bb,Recall who is in the second image.,Recall who is in the second image.,"Elon Musk is in the second image, holding a microphone and standing behind a podium." +tech_ceos2.1bb,Recall who is in the first image.,Recall who is in the first image.,"Steve Jobs is in the first image, holding up an iPhone." +tech_ceos2.1bb,"Is the person in the first image the founder of Apple? if not, which of the above images is the person the founder of Apple.","Is the person in the first image the founder of Apple? if not, which of the above images is the person the founder of Apple.","Steve Jobs is the founder of Apple, and he is in the first image holding up an iPhone." +tech_ceos2.1bb,Is the person in the third image the founder of Apple?,Is the person in the third image the founder of Apple?,"Yes, the person in the third image is Steve Jobs, the founder of Apple." +tech_ceos2.1cc,./eval/eval_data/images/tech-ceo/jobs1.jpg,Who is this person in this image a?,"This person in the image is Steve Jobs, the co-founder and former CEO of Apple." +tech_ceos2.1cc,./eval/eval_data/images/tech-ceo/gate1.jpg,Who is this person in this image b?,This person in the image b is Bill Gates. +tech_ceos2.1cc,What's the differnce between the image a and image b,What's the differnce between the image a and image b,"In image a, Steve Jobs is holding an iPhone while in image b, Bill Gates is standing behind a microphone." +tech_ceos2.1cc,./eval/eval_data/images/tech-ceo/musk1.jpg,Who is this person in this image c?,"This person in this image c is a man wearing a suit and tie, and he is holding a microphone." +tech_ceos2.1cc,Recall who is in image b.,Recall who is in image b.,The person in image b is a man wearing a suit and tie holding a microphone. +tech_ceos2.1cc,Recall who is in the image a.,Recall who is in the image a.,Steve Jobs is in the image a holding an iPhone. +tech_ceos2.1cc,"Is the person in the image a the founder of Apple? if not, which of the above images is the person the founder of Apple.","Is the person in the image a the founder of Apple? 
if not, which of the above images is the person the founder of Apple.","Yes, the person in the image a is the founder of Apple." +tech_ceos2.1cc,Is the person in the image c the founder of Apple?,Is the person in the image c the founder of Apple?,"No, the person in the image c is not the founder of Apple." +tech_ceos2.1dd,./eval/eval_data/images/tech-ceo/jobs1.jpg,Who is this person in this first image?,"The person in the first image is Steve Jobs, the CEO of Apple." +tech_ceos2.1dd,./eval/eval_data/images/tech-ceo/gate1.jpg,Who is this person in this second image?,"This person is Bill Gates, the co-founder of Microsoft." +tech_ceos2.1dd,What's the differnce between the first and second images,What's the differnce between the first and second images,The first image is of Steve Jobs holding an iPhone while the second image is of Bill Gates standing at a podium. +tech_ceos2.1dd,./eval/eval_data/images/tech-ceo/musk1.jpg,Who is this person in this third image?,"This person is a man named Bill Gates, who is the founder of Microsoft." +tech_ceos2.1dd,Recall who is in the second image.,Recall who is in the second image.,The second image shows a man holding an iPhone. +tech_ceos2.1dd,Recall who is in the first image.,Recall who is in the first image.,The first image shows Steve Jobs holding an iPhone. +tech_ceos2.1dd,"Is the person in the first image the founder of Apple? if not, which of the above images is the person the founder of Apple.","Is the person in the first image the founder of Apple? if not, which of the above images is the person the founder of Apple.","Yes, the person in the first image is the founder of Apple, Steve Jobs." +tech_ceos2.1dd,Is the person in the third image the founder of Apple?,Is the person in the third image the founder of Apple?,"Yes, the person in the third image is the founder of Apple, Bill Gates." diff --git a/applications/DeepSpeed-VisualChat/eval/results/eval_robustness/ours-set2_best_eval.csv b/applications/DeepSpeed-VisualChat/eval/results/eval_robustness/ours-set2_best_eval.csv new file mode 100644 index 000000000..1f77dfbe4 --- /dev/null +++ b/applications/DeepSpeed-VisualChat/eval/results/eval_robustness/ours-set2_best_eval.csv @@ -0,0 +1,61 @@ +test_name,image_path,question,answer +tech_ceos2.1a,./eval/eval_data/images/tech-ceo/jobs1.jpg,Who is this person in the image?,"This person in the image is Steve Jobs, the co-founder and former CEO of Apple." +tech_ceos2.1a,./eval/eval_data/images/tech-ceo/gate1.jpg,Who is this person in the image?,"This person in the image is Steve Jobs, the co-founder and former CEO of Apple." +tech_ceos2.1a,./eval/eval_data/images/tech-ceo/musk1.jpg,Who is this person in the image?,"This person in the image is Steve Jobs, the co-founder and former CEO of Apple." +tech_ceos2.1a,Recall who is in the second image.,Recall who is in the second image.,"In the second image, Steve Jobs is standing in front of a large red Tesla logo, holding a microphone and speaking to a crowd. He is wearing a black suit and tie, and his hair is gray." +tech_ceos2.1a,Recall who is in the first image.,Recall who is in the first image.,"In the first image, Steve Jobs is holding up an iPhone, which is white on the front and has a black screen. He is wearing a black shirt and is standing in front of a white background." +tech_ceos2.1a,"Is the person in the first image the founder of Apple? if not, which of the above images is the person the founder of Apple.","Is the person in the first image the founder of Apple? 
if not, which of the above images is the person the founder of Apple.","Yes, the person in the first image is Steve Jobs, the founder of Apple." +tech_ceos2.1a,Is the person in the third image the founder of Apple?,Is the person in the third image the founder of Apple?,"No, the person in the third image is not Steve Jobs, the founder of Apple." +tech_ceos2.1b,./eval/eval_data/images/tech-ceo/jobs1.jpg,Who is this person in this image?,"This person is Steve Jobs, the co-founder and former CEO of Apple." +tech_ceos2.1b,./eval/eval_data/images/tech-ceo/gate1.jpg,Who is this person in this image?,"This person is Bill Gates, the co-founder of Microsoft." +tech_ceos2.1b,./eval/eval_data/images/tech-ceo/musk1.jpg,Who is this person in this image?,"This person is Elon Musk, the CEO of Tesla." +tech_ceos2.1b,Recall who is in the second image.,Recall who is in the second image.,"In the second image, there is a man standing in front of a Tesla car. He is wearing a suit and tie, and he is holding a microphone." +tech_ceos2.1b,Recall who is in the first image.,Recall who is in the first image.,"In the first image, there is a man standing in front of a Tesla car. He is wearing a black suit and tie, and he is holding a cell phone in his hand." +tech_ceos2.1b,"Is the person in the first image the founder of Apple? if not, which of the above images is the person the founder of Apple.","Is the person in the first image the founder of Apple? if not, which of the above images is the person the founder of Apple.",The person in the first image is not the founder of Apple. The person in the second image is the founder of Apple. +tech_ceos2.1b,Is the person in the third image the founder of Apple?,Is the person in the third image the founder of Apple?,"Yes, the person in the third image is the founder of Apple." +tech_ceos2.1c,./eval/eval_data/images/tech-ceo/jobs1.jpg,Who is this person in this image a?,"This person in this image is Steve Jobs, the co-founder and former CEO of Apple." +tech_ceos2.1c,./eval/eval_data/images/tech-ceo/gate1.jpg,Who is this person in this image b?,"This person in this image b is Bill Gates, the co-founder of Microsoft." +tech_ceos2.1c,./eval/eval_data/images/tech-ceo/musk1.jpg,Who is this person in this image c?,"This person in this image c is Steve Jobs, the co-founder of Apple." +tech_ceos2.1c,Recall who is in image b.,Recall who is in image b.,"In image b, there are several people, including a man in a suit and tie, a woman in a black dress, and a man in a suit and tie holding a microphone." +tech_ceos2.1c,Recall who is in the image a.,Recall who is in the image a.,"In the image a, there is a man in a suit and tie standing in front of a podium." +tech_ceos2.1c,"Is the person in the image a the founder of Apple? if not, which of the above images is the person the founder of Apple.","Is the person in the image a the founder of Apple? if not, which of the above images is the person the founder of Apple.","Yes, the person in the image a is the founder of Apple, Steve Jobs." +tech_ceos2.1c,Is the person in the image c the founder of Apple?,Is the person in the image c the founder of Apple?,"Yes, the person in the image c is the founder of Apple, Steve Jobs." +tech_ceos2.1d,./eval/eval_data/images/tech-ceo/jobs1.jpg,Who is this person in this first image?,"This person is Steve Jobs, the co-founder and former CEO of Apple." +tech_ceos2.1d,./eval/eval_data/images/tech-ceo/gate1.jpg,Who is this person in this second image?,"This person is Bill Gates, the co-founder of Microsoft." 
+tech_ceos2.1d,./eval/eval_data/images/tech-ceo/musk1.jpg,Who is this person in this third image?,"This person is Elon Musk, the CEO of Tesla and SpaceX." +tech_ceos2.1d,Recall who is in the second image.,Recall who is in the second image.,"The person in the second image is Bill Gates, the co-founder of Microsoft." +tech_ceos2.1d,Recall who is in the first image.,Recall who is in the first image.,"The person in the first image is Steve Jobs, the co-founder of Apple." +tech_ceos2.1d,"Is the person in the first image the founder of Apple? if not, which of the above images is the person the founder of Apple.","Is the person in the first image the founder of Apple? if not, which of the above images is the person the founder of Apple.","Yes, the person in the first image is Steve Jobs, the founder of Apple." +tech_ceos2.1d,Is the person in the third image the founder of Apple?,Is the person in the third image the founder of Apple?,"No, the person in the third image is Elon Musk, the founder of Tesla and SpaceX." +tech_ceos2.1aa,./eval/eval_data/images/tech-ceo/jobs1.jpg,Who is this person in the image?,"This person in the image is Steve Jobs, the co-founder and former CEO of Apple." +tech_ceos2.1aa,./eval/eval_data/images/tech-ceo/gate1.jpg,Who is this person in the image?,"This person in the image is Steve Jobs, the co-founder and former CEO of Apple." +tech_ceos2.1aa,What's the differnce between the first and second image,What's the differnce between the first and second image,"In the first image, Steve Jobs is holding up a cell phone while in the second image, he is standing at a podium." +tech_ceos2.1aa,./eval/eval_data/images/tech-ceo/musk1.jpg,Who is this person in the image?,"This person in the image is Steve Jobs, the co-founder and former CEO of Apple." +tech_ceos2.1aa,Recall who is in the second image.,Recall who is in the second image.,"In the second image, Steve Jobs is standing at a podium and holding a microphone. He is wearing a black suit and tie, and his hair is gray. He is speaking to a crowd of people, possibly at a conference or event." +tech_ceos2.1aa,Recall who is in the first image.,Recall who is in the first image.,"In the first image, Steve Jobs is holding up a cell phone, which is white on the front and has a black border. He is wearing a black shirt and is standing in front of a white background." +tech_ceos2.1aa,"Is the person in the first image the founder of Apple? if not, which of the above images is the person the founder of Apple.","Is the person in the first image the founder of Apple? if not, which of the above images is the person the founder of Apple.","Yes, the person in the first image is Steve Jobs, the founder of Apple." +tech_ceos2.1aa,Is the person in the third image the founder of Apple?,Is the person in the third image the founder of Apple?,"No, the person in the third image is not Steve Jobs, the founder of Apple." +tech_ceos2.1bb,./eval/eval_data/images/tech-ceo/jobs1.jpg,Who is this person in this image?,"This person is Steve Jobs, the co-founder and former CEO of Apple." +tech_ceos2.1bb,./eval/eval_data/images/tech-ceo/gate1.jpg,Who is this person in this image?,"This person is Bill Gates, the co-founder of Microsoft." +tech_ceos2.1bb,What's the differnce between the first and second images,What's the differnce between the first and second images,The first image is of Steve Jobs holding up an iPhone while the second image is of Bill Gates holding up a cell phone. 
+tech_ceos2.1bb,./eval/eval_data/images/tech-ceo/musk1.jpg,Who is this person in this image?,"This person is Steve Jobs, the co-founder of Apple." +tech_ceos2.1bb,Recall who is in the second image.,Recall who is in the second image.,"The second image shows Bill Gates, the co-founder of Microsoft." +tech_ceos2.1bb,Recall who is in the first image.,Recall who is in the first image.,"The first image shows Steve Jobs, the co-founder of Apple, holding up an iPhone." +tech_ceos2.1bb,"Is the person in the first image the founder of Apple? if not, which of the above images is the person the founder of Apple.","Is the person in the first image the founder of Apple? if not, which of the above images is the person the founder of Apple.","The person in the first image is Steve Jobs, the founder of Apple." +tech_ceos2.1bb,Is the person in the third image the founder of Apple?,Is the person in the third image the founder of Apple?,"No, the person in the third image is Bill Gates, the founder of Microsoft." +tech_ceos2.1cc,./eval/eval_data/images/tech-ceo/jobs1.jpg,Who is this person in this image a?,"This person in this image is Steve Jobs, the co-founder and former CEO of Apple." +tech_ceos2.1cc,./eval/eval_data/images/tech-ceo/gate1.jpg,Who is this person in this image b?,"This person in this image b is Bill Gates, the co-founder of Microsoft." +tech_ceos2.1cc,What's the differnce between the image a and image b,What's the differnce between the image a and image b,The image a is of Steve Jobs holding a cell phone while the image b is of Bill Gates standing in front of a podium. +tech_ceos2.1cc,./eval/eval_data/images/tech-ceo/musk1.jpg,Who is this person in this image c?,"This person in this image c is Bill Gates, the co-founder of Microsoft." +tech_ceos2.1cc,Recall who is in image b.,Recall who is in image b.,"The person in image b is Steve Jobs, the co-founder of Apple." +tech_ceos2.1cc,Recall who is in the image a.,Recall who is in the image a.,"The person in the image a is Steve Jobs, the co-founder of Apple." +tech_ceos2.1cc,"Is the person in the image a the founder of Apple? if not, which of the above images is the person the founder of Apple.","Is the person in the image a the founder of Apple? if not, which of the above images is the person the founder of Apple.","Yes, the person in the image a is the founder of Apple." +tech_ceos2.1cc,Is the person in the image c the founder of Apple?,Is the person in the image c the founder of Apple?,"No, the person in the image c is not the founder of Apple. He is the co-founder of Microsoft." +tech_ceos2.1dd,./eval/eval_data/images/tech-ceo/jobs1.jpg,Who is this person in this first image?,"This person is Steve Jobs, the co-founder and former CEO of Apple." +tech_ceos2.1dd,./eval/eval_data/images/tech-ceo/gate1.jpg,Who is this person in this second image?,"This person is Bill Gates, the co-founder of Microsoft." +tech_ceos2.1dd,What's the differnce between the first and second images,What's the differnce between the first and second images,The first image is of Steve Jobs holding up an iPhone while the second image is of Bill Gates standing at a podium. +tech_ceos2.1dd,./eval/eval_data/images/tech-ceo/musk1.jpg,Who is this person in this third image?,"This person is Bill Gates, the co-founder of Microsoft." +tech_ceos2.1dd,Recall who is in the second image.,Recall who is in the second image.,The second image is of Bill Gates standing at a podium. 
+tech_ceos2.1dd,Recall who is in the first image.,Recall who is in the first image.,The first image is of Steve Jobs holding up an iPhone. +tech_ceos2.1dd,"Is the person in the first image the founder of Apple? if not, which of the above images is the person the founder of Apple.","Is the person in the first image the founder of Apple? if not, which of the above images is the person the founder of Apple.","The person in the first image is Steve Jobs, the founder of Apple." +tech_ceos2.1dd,Is the person in the third image the founder of Apple?,Is the person in the third image the founder of Apple?,"Yes, the person in the third image is Bill Gates, the co-founder of Microsoft." diff --git a/applications/DeepSpeed-VisualChat/eval/results/eval_robustness/ours-set2_final.csv b/applications/DeepSpeed-VisualChat/eval/results/eval_robustness/ours-set2_final.csv new file mode 100644 index 000000000..2060fa157 --- /dev/null +++ b/applications/DeepSpeed-VisualChat/eval/results/eval_robustness/ours-set2_final.csv @@ -0,0 +1,61 @@ +test_name,image_path,question,answer +tech_ceos2.1a,./eval/eval_data/images/tech-ceo/jobs1.jpg,Who is this person in the image?,"This person in the image is Steve Jobs, the co-founder and former CEO of Apple." +tech_ceos2.1a,./eval/eval_data/images/tech-ceo/gate1.jpg,Who is this person in the image?,"This person in the image is Steve Jobs, the co-founder and former CEO of Apple." +tech_ceos2.1a,./eval/eval_data/images/tech-ceo/musk1.jpg,Who is this person in the image?,"This person in the image is Steve Jobs, the co-founder and former CEO of Apple." +tech_ceos2.1a,Recall who is in the second image.,Recall who is in the second image.,"In the second image, Steve Jobs is standing in front of a large red Apple logo, holding a microphone and speaking to a crowd. He is wearing a black suit and tie, and his hair is gray." +tech_ceos2.1a,Recall who is in the first image.,Recall who is in the first image.,"In the first image, Steve Jobs is holding up an iPhone, which is white on the front and back, and has a black screen. He is wearing a black shirt and is standing in front of a white background." +tech_ceos2.1a,"Is the person in the first image the founder of Apple? if not, which of the above images is the person the founder of Apple.","Is the person in the first image the founder of Apple? if not, which of the above images is the person the founder of Apple.","Yes, the person in the first image is Steve Jobs, the founder of Apple." +tech_ceos2.1a,Is the person in the third image the founder of Apple?,Is the person in the third image the founder of Apple?,"No, the person in the third image is not Steve Jobs, the founder of Apple." +tech_ceos2.1b,./eval/eval_data/images/tech-ceo/jobs1.jpg,Who is this person in this image?,"This person is Steve Jobs, the co-founder and former CEO of Apple." +tech_ceos2.1b,./eval/eval_data/images/tech-ceo/gate1.jpg,Who is this person in this image?,"This person is Steve Jobs, the co-founder and former CEO of Apple." +tech_ceos2.1b,./eval/eval_data/images/tech-ceo/musk1.jpg,Who is this person in this image?,"This person is Steve Jobs, the co-founder and former CEO of Apple." +tech_ceos2.1b,Recall who is in the second image.,Recall who is in the second image.,"In the second image, Steve Jobs is standing in front of a large red banner with the Apple logo on it. He is holding a microphone and appears to be giving a speech or presentation." 
+tech_ceos2.1b,Recall who is in the first image.,Recall who is in the first image.,"In the first image, Steve Jobs is holding up an iPhone in front of a crowd of people. He is wearing a black turtleneck, jeans, and glasses, and is smiling." +tech_ceos2.1b,"Is the person in the first image the founder of Apple? if not, which of the above images is the person the founder of Apple.","Is the person in the first image the founder of Apple? if not, which of the above images is the person the founder of Apple.","Yes, the person in the first image is Steve Jobs, the founder of Apple." +tech_ceos2.1b,Is the person in the third image the founder of Apple?,Is the person in the third image the founder of Apple?,"Yes, the person in the third image is Steve Jobs, the founder of Apple." +tech_ceos2.1c,./eval/eval_data/images/tech-ceo/jobs1.jpg,Who is this person in this image a?,"This person in the image is Steve Jobs, the co-founder and former CEO of Apple." +tech_ceos2.1c,./eval/eval_data/images/tech-ceo/gate1.jpg,Who is this person in this image b?,"This person in the image b is Steve Jobs, the co-founder and former CEO of Apple." +tech_ceos2.1c,./eval/eval_data/images/tech-ceo/musk1.jpg,Who is this person in this image c?,"This person in the image c is Steve Jobs, the co-founder and former CEO of Apple." +tech_ceos2.1c,Recall who is in image b.,Recall who is in image b.,"In image b, Steve Jobs is standing in front of a microphone and a podium, wearing a black turtleneck and jeans. He is holding a microphone and appears to be giving a speech or presentation." +tech_ceos2.1c,Recall who is in the image a.,Recall who is in the image a.,"In the image a, Steve Jobs is standing in front of a podium, wearing a suit and tie, and holding a microphone. He is surrounded by people, and the image appears to be from a news conference or event." +tech_ceos2.1c,"Is the person in the image a the founder of Apple? if not, which of the above images is the person the founder of Apple.","Is the person in the image a the founder of Apple? if not, which of the above images is the person the founder of Apple.","Yes, the person in the image a is the founder of Apple." +tech_ceos2.1c,Is the person in the image c the founder of Apple?,Is the person in the image c the founder of Apple?,"Yes, the person in the image c is the founder of Apple." +tech_ceos2.1d,./eval/eval_data/images/tech-ceo/jobs1.jpg,Who is this person in this first image?,"This person is Steve Jobs, the co-founder and CEO of Apple." +tech_ceos2.1d,./eval/eval_data/images/tech-ceo/gate1.jpg,Who is this person in this second image?,"This person is Bill Gates, the co-founder of Microsoft." +tech_ceos2.1d,./eval/eval_data/images/tech-ceo/musk1.jpg,Who is this person in this third image?,"This person is Elon Musk, the CEO of Tesla and SpaceX." +tech_ceos2.1d,Recall who is in the second image.,Recall who is in the second image.,"The person in the second image is Bill Gates, the co-founder of Microsoft." +tech_ceos2.1d,Recall who is in the first image.,Recall who is in the first image.,"The person in the first image is Steve Jobs, the co-founder of Apple." +tech_ceos2.1d,"Is the person in the first image the founder of Apple? if not, which of the above images is the person the founder of Apple.","Is the person in the first image the founder of Apple? if not, which of the above images is the person the founder of Apple.","Yes, the person in the first image is Steve Jobs, the founder of Apple." 
+tech_ceos2.1d,Is the person in the third image the founder of Apple?,Is the person in the third image the founder of Apple?,"No, the person in the third image is Elon Musk, the founder of Tesla and SpaceX." +tech_ceos2.1aa,./eval/eval_data/images/tech-ceo/jobs1.jpg,Who is this person in the image?,"This person in the image is Steve Jobs, the co-founder and former CEO of Apple." +tech_ceos2.1aa,./eval/eval_data/images/tech-ceo/gate1.jpg,Who is this person in the image?,"This person in the image is Steve Jobs, the co-founder and former CEO of Apple." +tech_ceos2.1aa,What's the differnce between the first and second image,What's the differnce between the first and second image,"In the first image, Steve Jobs is holding a cell phone while in the second image, he is standing at a podium." +tech_ceos2.1aa,./eval/eval_data/images/tech-ceo/musk1.jpg,Who is this person in the image?,"This person in the image is Elon Musk, the CEO of Tesla and SpaceX." +tech_ceos2.1aa,Recall who is in the second image.,Recall who is in the second image.,"The second image shows Elon Musk standing in front of a Tesla car, holding a microphone and speaking to a crowd." +tech_ceos2.1aa,Recall who is in the first image.,Recall who is in the first image.,The first image shows Steve Jobs holding a cell phone while standing in front of a microphone. +tech_ceos2.1aa,"Is the person in the first image the founder of Apple? if not, which of the above images is the person the founder of Apple.","Is the person in the first image the founder of Apple? if not, which of the above images is the person the founder of Apple.","The person in the first image is Steve Jobs, the founder of Apple." +tech_ceos2.1aa,Is the person in the third image the founder of Apple?,Is the person in the third image the founder of Apple?,"No, the person in the third image is Elon Musk, the founder of Tesla and SpaceX." +tech_ceos2.1bb,./eval/eval_data/images/tech-ceo/jobs1.jpg,Who is this person in this image?,"This person is Steve Jobs, the co-founder and former CEO of Apple." +tech_ceos2.1bb,./eval/eval_data/images/tech-ceo/gate1.jpg,Who is this person in this image?,"This person is Steve Jobs, the co-founder and former CEO of Apple." +tech_ceos2.1bb,What's the differnce between the first and second images,What's the differnce between the first and second images,"In the first image, Steve Jobs is holding up a cell phone while in the second image, he is standing at a podium." +tech_ceos2.1bb,./eval/eval_data/images/tech-ceo/musk1.jpg,Who is this person in this image?,"This person is Elon Musk, the CEO of Tesla and SpaceX." +tech_ceos2.1bb,Recall who is in the second image.,Recall who is in the second image.,"In the second image, Steve Jobs is holding up a cell phone, which is likely an iPhone, and is standing at a podium." +tech_ceos2.1bb,Recall who is in the first image.,Recall who is in the first image.,"In the first image, Steve Jobs is holding up a cell phone, which is likely an iPhone, and is wearing a black turtleneck and jeans." +tech_ceos2.1bb,"Is the person in the first image the founder of Apple? if not, which of the above images is the person the founder of Apple.","Is the person in the first image the founder of Apple? if not, which of the above images is the person the founder of Apple.","Yes, the person in the first image is the founder of Apple." 
+tech_ceos2.1bb,Is the person in the third image the founder of Apple?,Is the person in the third image the founder of Apple?,"No, the person in the third image is Elon Musk, the CEO of Tesla and SpaceX." +tech_ceos2.1cc,./eval/eval_data/images/tech-ceo/jobs1.jpg,Who is this person in this image a?,"This person in the image is Steve Jobs, the co-founder and former CEO of Apple." +tech_ceos2.1cc,./eval/eval_data/images/tech-ceo/gate1.jpg,Who is this person in this image b?,"This person in the image b is Steve Jobs, the co-founder and former CEO of Apple." +tech_ceos2.1cc,What's the differnce between the image a and image b,What's the differnce between the image a and image b,"In image a, Steve Jobs is holding up a cell phone while in image b, he is standing at a podium and holding a microphone." +tech_ceos2.1cc,./eval/eval_data/images/tech-ceo/musk1.jpg,Who is this person in this image c?,"This person in the image c is Steve Jobs, the co-founder and former CEO of Apple." +tech_ceos2.1cc,Recall who is in image b.,Recall who is in image b.,"In image b, Steve Jobs is standing at a podium and holding a microphone. He is wearing a black turtleneck and jeans, and he appears to be speaking to a crowd." +tech_ceos2.1cc,Recall who is in the image a.,Recall who is in the image a.,"In the image a, Steve Jobs is holding up a cell phone, which is a black iPhone. He is wearing a black shirt and glasses, and his hair is gray." +tech_ceos2.1cc,"Is the person in the image a the founder of Apple? if not, which of the above images is the person the founder of Apple.","Is the person in the image a the founder of Apple? if not, which of the above images is the person the founder of Apple.","Yes, the person in the image a is the founder of Apple." +tech_ceos2.1cc,Is the person in the image c the founder of Apple?,Is the person in the image c the founder of Apple?,"Yes, the person in the image c is the founder of Apple." +tech_ceos2.1dd,./eval/eval_data/images/tech-ceo/jobs1.jpg,Who is this person in this first image?,"This person is Steve Jobs, the co-founder and CEO of Apple." +tech_ceos2.1dd,./eval/eval_data/images/tech-ceo/gate1.jpg,Who is this person in this second image?,"This person is Bill Gates, the co-founder of Microsoft." +tech_ceos2.1dd,What's the differnce between the first and second images,What's the differnce between the first and second images,The first image is of Steve Jobs holding up an iPhone while the second image is of Bill Gates standing at a podium. +tech_ceos2.1dd,./eval/eval_data/images/tech-ceo/musk1.jpg,Who is this person in this third image?,"This person is Elon Musk, the CEO of Tesla and SpaceX." +tech_ceos2.1dd,Recall who is in the second image.,Recall who is in the second image.,"The second image is of Bill Gates, the co-founder of Microsoft." +tech_ceos2.1dd,Recall who is in the first image.,Recall who is in the first image.,"The first image is of Steve Jobs, the co-founder of Apple." +tech_ceos2.1dd,"Is the person in the first image the founder of Apple? if not, which of the above images is the person the founder of Apple.","Is the person in the first image the founder of Apple? if not, which of the above images is the person the founder of Apple.","The person in the first image is Steve Jobs, the founder of Apple." +tech_ceos2.1dd,Is the person in the third image the founder of Apple?,Is the person in the third image the founder of Apple?,"No, the person in the third image is Elon Musk, the founder of Tesla and SpaceX." 
diff --git a/applications/DeepSpeed-VisualChat/eval/results/eval_single/ours-single_best_eval.csv b/applications/DeepSpeed-VisualChat/eval/results/eval_single/ours-single_best_eval.csv new file mode 100644 index 000000000..a155b18cf --- /dev/null +++ b/applications/DeepSpeed-VisualChat/eval/results/eval_single/ours-single_best_eval.csv @@ -0,0 +1,24 @@ +test_name,image_path,question,answer +cat_images1,./eval/eval_data/images/cats/cat.png,please describe the image,"The image features a small kitten standing on top of a pile of green plants, with its paws on a leaf. The kitten appears to be curious and playful, as it is reaching up to touch the leaf with its paws. The plants are spread out across the ground, creating a lush and vibrant scene. The kitten's position on the plants adds a sense of adventure and fun to the image." +cat_images2,./eval/eval_data/images/cats/1806905748_adb926a0a0.jpg,can you describe the image,"The image features a black cat lying on a carpeted floor, with its eyes closed and its paws stretched out. The cat appears to be relaxed and comfortable in its position. The carpeted floor is a neutral background, allowing the cat to be the main focus of the image." +cat_images3,./eval/eval_data/images/cats/british_shorthair.jpg,please describe the image,"The image features a large gray and white cat lying on a carpeted floor. The cat is stretched out and appears to be relaxing, with its paws and legs spread out. The cat's body is positioned diagonally across the floor, taking up a significant portion of the space. The carpeted floor is a light color, providing a comfortable and cozy environment for the cat to rest in." +extreme_ironing,./eval/eval_data/images/singles/extreme_ironing.jpg,What is unusual about this image?,"The unusual aspect of this image is that a man is standing on top of a car, ironing a shirt. This is an unconventional and potentially dangerous activity, as it puts the man at risk of falling off the car and potentially injuring himself or causing damage to the car. Ironing a shirt on top of a car is not a common practice, and it is not recommended as it poses safety hazards and can lead to accidents." +waterview,./eval/eval_data/images/singles/waterview.jpg,What are the things I should be cautious about when I visit here?,"When visiting the wooden pier over the lake, there are a few things to be cautious about: + +1. Safety: Be aware of your surroundings and the condition of the pier. Make sure the pier is stable and secure before walking on it. + +2. Weather: Check the weather forecast before visiting to avoid any sudden changes or storms that could make the pier unsafe. + +3. Water depth: Be cautious of the water depth around the pier, especially if you plan to swim or wade in the lake. + +4. Wildlife: Be mindful of any wildlife in the area, such as fish or birds, and avoid disturbing their natural habitat. + +5. Personal belongings: Keep your belongings secure and close to you, as theft or loss can occur in public places. + +6. Sun protection: If visiting during daytime, remember to apply sunscreen and wear a hat to protect yourself from the sun's harmful rays. + +By being aware of these factors, you can ensure a safe and enjoyable visit to the wooden pier over the lake." +art-dog,./eval/eval_data/images/singles/202160027_b319c4166e.jpg,can you describe the image,"The image features a large, colorful flower sculpture in the shape of a dog, standing in front of a building. 
The flower sculpture is made up of various flowers, including pink, red, and yellow flowers, which are attached to the dog's body. The dog sculpture is quite tall, and it is positioned in a way that it appears to be walking or standing on a sidewalk. The building in the background is a large, blue structure, which adds contrast to the vibrant flower sculpture." +funny-phone,./eval/eval_data/images/singles/1.jpg,What is funny about this image? Describe it panel by panel.,"The image is a series of four panels that show a cell phone with a blue cord attached to it. The first panel displays the cell phone with the cord, while the second panel shows the cord being cut off. In the third panel, the cord is reattached to the cell phone, and in the fourth panel, the cord is cut off again. This sequence of events is humorous because it depicts a person repeatedly cutting and reattaching the cord to the cell phone, which is an unusual and unnecessary action. The image conveys a sense of absurdity and humor, as it highlights the pointlessness of the action." +squirrel,./eval/eval_data/images/singles/2.jpg,Why would a person find this image funny?,"The image of a squirrel holding a camera and smiling is humorous because it is an unexpected and amusing combination of elements. Squirrels are typically associated with natural environments, such as parks and forests, and are not typically known for engaging in human activities like taking photographs. The fact that the squirrel is holding a camera and smiling, as if it is enjoying the process, adds a playful and lighthearted touch to the image. This combination of elements creates a sense of humor and amusement for viewers, as it challenges their expectations and imagination." +art-painting,./eval/eval_data/images/singles/50.jpg,Tell me about this work of art.,"The work of art is a painting that features a scene of a man walking down a street at night, with a church in the background. The man is carrying a backpack, and the street is illuminated by the moonlight. The painting has a unique style, with the moon appearing as a large, yellow sun in the sky. This artistic choice adds a surreal and dreamlike quality to the scene, making it appear as if the man is walking through a fantasy world. The painting is a beautiful representation of a nighttime scene, capturing the tranquility and serenity of the moment." diff --git a/applications/DeepSpeed-VisualChat/eval/results/eval_single/ours-single_final.csv b/applications/DeepSpeed-VisualChat/eval/results/eval_single/ours-single_final.csv new file mode 100644 index 000000000..2b72b4814 --- /dev/null +++ b/applications/DeepSpeed-VisualChat/eval/results/eval_single/ours-single_final.csv @@ -0,0 +1,16 @@ +test_name,image_path,question,answer +cat_images1,./eval/eval_data/images/cats/cat.png,please describe the image,"The image shows a close-up of a small kitten standing in a field of green plants. The kitten is looking upwards with a curious expression on its face, surrounded by the lush vegetation. The plants around the kitten come in various sizes and shapes, creating a picturesque scene of nature." +cat_images2,./eval/eval_data/images/cats/1806905748_adb926a0a0.jpg,can you describe the image,"The image features a black cat lying on the floor, specifically on a yellow area, possibly a rug. The cat appears to be relaxed and laying on its side, with its eyes closed. The cat occupies a significant portion of the frame, drawing attention to its presence." 
+cat_images3,./eval/eval_data/images/cats/british_shorthair.jpg,please describe the image,"The image features a gray and white cat lying on the floor next to a piece of carpeting. The cat is stretched out, with its body resting on the floor and one paw reaching up onto the carpet. The cat appears to be relaxed and enjoying its rest." +extreme_ironing,./eval/eval_data/images/singles/extreme_ironing.jpg,What is unusual about this image?,"In this image, it is unusual to see a man doing laundry by hanging clothes on a clothes rack attached to a parked yellow taxi in the middle of the street. Typically, laundry would be done in a more designated or safe area, such as a laundromat or a backyard, to avoid potential hazards and obstructing traffic. The man's choice to do laundry in this manner raises concerns about safety and the flow of traffic, as it could cause distractions for drivers and pedestrians and pose a risk to both the man and others in the area." +waterview,./eval/eval_data/images/singles/waterview.jpg,What are the things I should be cautious about when I visit here?,"When visiting this picturesque dock over a calm lake surrounded by trees, there are a few things to keep in mind to ensure a safe and enjoyable experience. First, be aware of the water conditions and always use caution when entering the lake, as it may have hidden obstacles or sudden changes in depth. Second, make sure to wear appropriate footwear to prevent slipping or falling off the dock. Third, be mindful of the weather and avoid visiting during stormy conditions, as lightning can be dangerous. Lastly, practice water safety rules, such as swimming with a buddy, staying within your swimming abilities, and not venturing too far from the shore or dock. Enjoy the serene atmosphere and the beautiful view of the mountains in the background while being cautious and respectful of the environment." +art-dog,./eval/eval_data/images/singles/202160027_b319c4166e.jpg,can you describe the image,"The image features a large, colorful clock tower in the background, with a man standing in front of it. In the foreground, there is a unique sculpture of a dog made out of flowers, which is the main focal point of the scene. The sculpture covers a significant portion of the image, and its vibrant colors add an artistic touch to the scene. The clock tower and the man in the background provide context and a sense of place, while the flower dog sculpture captures the viewer's attention with its creativity and beauty." +funny-phone,./eval/eval_data/images/singles/1.jpg,What is funny about this image? Describe it panel by panel.,"The image is a collage of three panels, each showing different aspects of the humor in the scene. + +Panel 1: A cell phone is being held by someone, with its charger still attached to a blue block. The person seems to be trying to remove the charger from the block, possibly out of frustration or curiosity. + +Panel 2: A close-up of the blue block, which appears to be a deactivated electronic device adapter or a device blocker, is shown. The person might be trying to figure out why the charger won't fit or if the blue block is preventing it from working. + +Panel 3: The cell phone is now disconnected from the charger, and the person seems to have successfully detached it from the blue block. The situation is humorous as it depicts a common everyday activity, like charging a phone, with an added element of confusion and challenge due to the presence of the blue block." 
+squirrel,./eval/eval_data/images/singles/2.jpg,Why would a person find this image funny?,"A person might find this image funny because it features a playful depiction of a squirrel holding a camera. Squirrels are typically seen as small, cute, and harmless animals, and their interaction with human objects, such as a camera, adds an element of humor to the scene. The idea of a squirrel being interested in photography or posing with a camera can evoke amusement and lightheartedness, as it is an unexpected and humorous twist on the usual behavior of squirrels. This whimsical combination of a familiar animal and an object associated with human activities can elicit laughter and amusement." +art-painting,./eval/eval_data/images/singles/50.jpg,Tell me about this work of art.,"The work of art is a unique combination of a painting and a print featuring the famous character from the comics, Spider-Man. In this piece, Spider-Man is depicted as the ""Sgt. Star Platinum"" character from the series ""The Devil Manga."" The painting or print has a vintage or old-school style, and it is set against a night sky with stars. The scene also includes a village and a house, which adds depth and context to the overall composition. This artwork is likely to appeal to fans of comics, street art, and visual storytelling." diff --git a/applications/DeepSpeed-VisualChat/helper/README.md b/applications/DeepSpeed-VisualChat/helper/README.md new file mode 100644 index 000000000..bd32b54e8 --- /dev/null +++ b/applications/DeepSpeed-VisualChat/helper/README.md @@ -0,0 +1,2 @@ +# QWen-VL's Vision Encoder +The extract_qwen_vl.py can be used to extract the vision encoder from QWen-VL. After extraction, you can find other necessary files in the [folder](./qwen_clip). \ No newline at end of file diff --git a/applications/DeepSpeed-VisualChat/helper/extract_qwen_vl.py b/applications/DeepSpeed-VisualChat/helper/extract_qwen_vl.py new file mode 100644 index 000000000..12808b87c --- /dev/null +++ b/applications/DeepSpeed-VisualChat/helper/extract_qwen_vl.py @@ -0,0 +1,14 @@ +from transformers import AutoModelForCausalLM +import torch + +PATH = "Qwen/Qwen-VL-Chat" + +model = AutoModelForCausalLM.from_pretrained(PATH, device_map="cuda", trust_remote_code=True).eval() + +state_dict = model.state_dict() +save_dict = {} +for k,v in state_dict.items(): + if 'visual' in k: + if 'transformer.visual.proj' not in k: # we don't need the proj layer + save_dict[k.replace('transformer.visual.', '')] = v +torch.save(save_dict, './qwen_clip/pytorch_model.bin') \ No newline at end of file diff --git a/applications/DeepSpeed-VisualChat/helper/qwen_clip/config.json b/applications/DeepSpeed-VisualChat/helper/qwen_clip/config.json new file mode 100755 index 000000000..357bb8280 --- /dev/null +++ b/applications/DeepSpeed-VisualChat/helper/qwen_clip/config.json @@ -0,0 +1,180 @@ +{ + "_commit_hash": null, + "architectures": [ + "CLIPModel" + ], + "initializer_factor": 1.0, + "logit_scale_init_value": 2.6592, + "model_type": "clip", + "projection_dim": 1280, + "text_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 0, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "dropout": 0.0, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 2, + "exponential_decay_length_penalty": 
null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": "gelu", + "hidden_size": 1280, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_factor": 1.0, + "initializer_range": 0.02, + "intermediate_size": 5120, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "layer_norm_eps": 1e-05, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 77, + "min_length": 0, + "model_type": "clip_text_model", + "no_repeat_ngram_size": 0, + "num_attention_heads": 20, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 32, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 1, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": null, + "torchscript": false, + "transformers_version": "4.24.0", + "typical_p": 1.0, + "use_bfloat16": false, + "vocab_size": 49408 + }, + "text_config_dict": { + "hidden_act": "gelu", + "hidden_size": 1280, + "intermediate_size": 5120, + "num_attention_heads": 20, + "num_hidden_layers": 32 + }, + "torch_dtype": "float32", + "transformers_version": null, + "vision_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": null, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "dropout": 0.0, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": null, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": "gelu", + "hidden_size": 1664, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_size": 224, + "initializer_factor": 1.0, + "initializer_range": 0.02, + "intermediate_size": 8192, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "layer_norm_eps": 1e-05, + "length_penalty": 1.0, + "max_length": 20, + "min_length": 0, + "model_type": "clip_vision_model", + "no_repeat_ngram_size": 0, + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_channels": 3, + "num_hidden_layers": 48, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "patch_size": 14, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": null, + "torchscript": false, + "transformers_version": "4.24.0", + "typical_p": 1.0, + "use_bfloat16": 
false + }, + "vision_config_dict": { + "hidden_act": "gelu", + "hidden_size": 1664, + "intermediate_size": 8192, + "num_attention_heads": 16, + "num_hidden_layers": 48, + "patch_size": 14 + } +} diff --git a/applications/DeepSpeed-VisualChat/helper/qwen_clip/preprocessor_config.json b/applications/DeepSpeed-VisualChat/helper/qwen_clip/preprocessor_config.json new file mode 100755 index 000000000..89a5d584a --- /dev/null +++ b/applications/DeepSpeed-VisualChat/helper/qwen_clip/preprocessor_config.json @@ -0,0 +1,19 @@ +{ + "crop_size": 448, + "do_center_crop": true, + "do_normalize": true, + "do_resize": true, + "feature_extractor_type": "CLIPFeatureExtractor", + "image_mean": [ + 0.48145466, + 0.4578275, + 0.40821073 + ], + "image_std": [ + 0.26862954, + 0.26130258, + 0.27577711 + ], + "resample": 3, + "size": 448 +} diff --git a/applications/DeepSpeed-VisualChat/requirements.txt b/applications/DeepSpeed-VisualChat/requirements.txt new file mode 100644 index 000000000..ae0440617 --- /dev/null +++ b/applications/DeepSpeed-VisualChat/requirements.txt @@ -0,0 +1,11 @@ +datasets>=2.8.0 +sentencepiece>=0.1.97 +protobuf==3.20.3 +accelerate>=0.15.0 +open_clip_torch +deepspeed>=0.10.3 +einops +einops_exts +transformers==4.33.3 +transformers_stream_generator +termcolor \ No newline at end of file diff --git a/applications/DeepSpeed-VisualChat/training/README.md b/applications/DeepSpeed-VisualChat/training/README.md new file mode 100644 index 000000000..dd4e58d26 --- /dev/null +++ b/applications/DeepSpeed-VisualChat/training/README.md @@ -0,0 +1,40 @@ +
+ + + +
+ +# πŸ• Vision-Language Finetuning +Vision-language finetuning (or instruction finetuning) is very similar to standard large language model finetuning. The differences are summarized as follows: +* It requires an extra vision encoder +* It requires high-quality text-image data pairs +* It needs a linear projection layer that connects the visual encoder and the LLM + + +## πŸƒ How to train the model +Remember to prepare your data first based on the [tutorial](../README.md). If you want to use 448x448 resolution images, you need to go to [helper](../helper) for more information. Otherwise, you can simply run +```bash + training_scripts/run_7b.sh + ``` + +which shows an example of fine-tuning a LLaMA-2-7B LLM together with a CLIP-large visual encoder and a linear projection layer that connects the visual encoder and the LLM (see the sketches after the examples below for a quick check of the extracted encoder and the launcher's batch-size and warmup arithmetic). + + +## πŸ‘€ A few examples +Please refer to [Chat](../chat/README.md) or [Eval](../eval/README.md) for evaluation. + +
+Cats +
+ +----- + +
+CEOs +
+ +------ + +
+Friends +
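The helper README (`helper/README.md`) notes that `extract_qwen_vl.py` dumps QWen-VL's vision tower into `./qwen_clip/pytorch_model.bin`, stripping the `transformer.visual.` prefix and dropping the projection layer. A minimal sanity-check sketch, assuming the extraction has already been run with its default output path:

```python
# Minimal sanity check for the extracted QWen-VL vision weights.
# Assumes helper/extract_qwen_vl.py was run with its default output path;
# the key names are whatever QWen-VL ships, minus the "transformer.visual."
# prefix and the excluded projection tensor.
import torch

state_dict = torch.load("./qwen_clip/pytorch_model.bin", map_location="cpu")
print(f"{len(state_dict)} tensors extracted")
for name, tensor in list(state_dict.items())[:5]:
    print(name, tuple(tensor.shape))

# No key should retain the original prefix, and the projection layer
# ("transformer.visual.proj", which would have become "proj") is excluded.
assert not any(k.startswith("transformer.visual.") for k in state_dict)
assert "proj" not in state_dict
```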
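For context on the launcher settings: `main.py` sets the DeepSpeed `train_batch_size` to `per_device_train_batch_size Γ— world_size Γ— gradient_accumulation_steps`, and treats a `--num_warmup_steps` value of at most 1 as a ratio of the total number of update steps. A small illustrative sketch of that arithmetic using `run_7b.sh`'s defaults; the GPU count follows the script's own "batch size is 128" comment and the dataloader length is a made-up assumption:

```python
# Illustrative only: mirrors the batch-size and warmup arithmetic in main.py.
# num_gpus follows run_7b.sh's assumed global batch size of 128; steps_per_epoch
# is an assumption, not a value fixed by the scripts.
num_gpus = 128                     # assumed world size
per_device_train_batch_size = 1    # run_7b.sh default
gradient_accumulation_steps = 1    # run_7b.sh default
train_batch_size = num_gpus * per_device_train_batch_size * gradient_accumulation_steps

num_train_epochs = 6               # run_7b.sh default
steps_per_epoch = 5000             # assumed ceil(len(train_dataloader) / grad_accum)
num_warmup_steps = 0.1             # run_7b.sh default, i.e. 10% of all update steps
if num_warmup_steps <= 1:
    num_warmup_steps = int(num_warmup_steps * num_train_epochs * steps_per_epoch)

print(train_batch_size, num_warmup_steps)   # -> 128 3000
```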
diff --git a/applications/DeepSpeed-VisualChat/training/main.py b/applications/DeepSpeed-VisualChat/training/main.py new file mode 100755 index 000000000..c345f9832 --- /dev/null +++ b/applications/DeepSpeed-VisualChat/training/main.py @@ -0,0 +1,442 @@ +#!/usr/bin/env python +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +import argparse +import os +import math +import sys +import numpy as np +import random + +import torch +from torch.utils.data import DataLoader +from torch.utils.data.distributed import DistributedSampler + +from transformers import ( + SchedulerType, + get_scheduler, + AutoTokenizer +) + +import deepspeed +from transformers import AdamW +sys.path.append( + os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir))) +from utils.data import build_dataset, DataCollatorPadToMaxLen, split_dataset, shuffle_dataset +from utils.utils import print_rank_0, to_device, save_hf_format, set_random_seed, get_all_reduce_mean, get_optimizer_grouped_parameters, save_zero_three_model +from utils.ds_utils import get_train_ds_config +from utils.module.lora import convert_linear_layer_to_lora, only_optimize_lora_parameters, fuse_lora, unfuse_lora +from utils.model import create_dsvl_model_and_transforms + +def parse_args(): + parser = argparse.ArgumentParser( + description= + "Finetune a transformers model on a multi-modal task") + + parser.add_argument('--data_path', + type=str, + default='./data/', + help='Where the training data are stored.') + + parser.add_argument('--data_debug_path', + type=str, + default=None, + help='If provided, will save 10 training samples' + 'to the path for debug purpose.') + + parser.add_argument( + "--data_train_split_ratio", + type=float, + default=0.9, + help="Ratio of dataset to be splitted as train data. The remaining becomes eval data.", + ) + parser.add_argument('--dataset_names', + nargs='*', + default=['minigpt4'], + help='Name of training dataset(s) to be used. Accepted format:' + '1) a single dataset name, 2) multiple dataset names in the' + 'form: dataset1 dataset2 ...') + + parser.add_argument('--dataset_samples', + nargs='*', + default=['all'], + help='How many samples do we use from each dataset.' + 'Should be either a integer number or string all which' + 'means use all samples. For example: all 512 means' + 'using all samples form first data and 512 samples' + 'from second data') + + parser.add_argument('--dataset_concatenate_samples', + nargs='*', + default=[1], + help='How many samples do we concatenate from each dataset.' + 'Should be either a integer number or string. 
1 which' + 'means use 1 sample for each datapoint') + + parser.add_argument( + "--max_num_image_per_sample", + type=int, + default=8, + help="The maximum number of images per sample.", + ) + parser.add_argument( + "--per_device_train_batch_size", + type=int, + default=2, + help="Batch size (per device) for the training dataloader.", + ) + parser.add_argument( + "--per_device_eval_batch_size", + type=int, + default=2, + help="Batch size (per device) for the evaluation dataloader.", + ) + parser.add_argument( + "--max_seq_len", + type=int, + default=4096, + help="The maximum sequence length, note that image tokens are included.", + ) + parser.add_argument( + "--learning_rate", + type=float, + default=1e-3, + help= + "Initial learning rate (after the potential warmup period) to use.", + ) + parser.add_argument( + "--learning_rate_pretraining_components", + type=float, + default=0, + help= + "Initial learning rate for pre-trained weight, e.g., embedding (after the potential warmup period) to use.", + ) + parser.add_argument("--weight_decay", + type=float, + default=0., + help="Weight decay to use.") + parser.add_argument("--num_train_epochs", + type=int, + default=6, + help="Total number of training epochs to perform.") + parser.add_argument( + "--gradient_accumulation_steps", + type=int, + default=1, + help= + "Number of updates steps to accumulate before performing a backward/update pass.", + ) + parser.add_argument( + "--lr_scheduler_type", + type=SchedulerType, + default="cosine", + help="The scheduler type to use.", + choices=[ + "linear", "cosine", "cosine_with_restarts", "polynomial", + "constant", "constant_with_warmup" + ], + ) + parser.add_argument( + "--num_warmup_steps", + type=float, + default=0, + help="Number of steps (>1) or ratios (<=1) for the warmup in the lr scheduler.") + parser.add_argument("--output_dir", + type=str, + default=None, + help="Where to store the model.") + parser.add_argument("--seed", + type=int, + default=1234, + help="A seed for reproducible training.") + parser.add_argument("--local_rank", + type=int, + default=-1, + help="local_rank for distributed training on gpus") + parser.add_argument('--gradient_checkpointing', + action='store_true', + help='Enable HF gradient checkpointing for model.') + parser.add_argument( + "--lm_model_name_or_path", + type=str, + help= + "Path to pretrained model or model identifier from huggingface.co/models.", + required=True, + ) + parser.add_argument("--vision_model_name_or_path", default="openai/clip-vit-large-patch14", type=str) + parser.add_argument( + "--enable_mmca_attention", + action='store_true', + help="enable the new proposed attn, which is similar to cross attention", + ) + parser.add_argument( + "--vis_proj", + type=str, + default='baseline', + help="[baseline, vit, or perceiver], used to projection vision feature to LLM embedding", + ) + # deepspeed features + parser.add_argument( + '--zero_stage', + type=int, + default=0, + help='ZeRO optimization stage for Actor model (and clones).') + parser.add_argument( + "--precision", + type=str, + choices=["fp16", "bf16"], + default="fp16", + help= + "FP16 or BF16 precision. FP16 is recommended for typical use cases. 
BF16 is good for large models", + ) + parser.add_argument('--enable_tensorboard', + action='store_true', + help='Enable tensorboard logging') + ## LoRA for efficient training setting + parser.add_argument("--lang_lora_dim", + type=int, + default=0, + help="Use LoRA for fine-tuning language decoder (> 0).") + parser.add_argument("--lang_lora_module_name", + type=str, + default="model.layers.", + help="The scope name of the target LoRA parameters.") + parser.add_argument("--vis_lora_dim", + type=int, + default=0, + help="Use LoRA for fine-tuning visual encoder (> 0).") + parser.add_argument("--vis_lora_module_name", + type=str, + default="encoder.layers.", + help="The scope name of the target LoRA parameters.") + parser.add_argument('--only_optimize_lora', + action='store_true', + help='Only optimize the LoRA parameters.') + + + parser = deepspeed.add_config_arguments(parser) + args = parser.parse_args() + + if args.learning_rate_pretraining_components == 0.0: + # if we do not provide special learning rate, mainly for embedding, the same lr is applied + args.learning_rate_pretraining_components = args.learning_rate + assert args.num_warmup_steps >= 0, "--num_warmup_steps must be >= 0" + if 'qwen' in args.vision_model_name_or_path.lower(): + assert args.vis_proj == 'baseline', "qwen's model only support baseline vis_proj as it has the perceiver module inside" + return args + + +def main(): + args = parse_args() + + if args.local_rank == -1: + device = torch.device("cuda") + else: + torch.cuda.set_device(args.local_rank) + device = torch.device("cuda", args.local_rank) + # Initializes the distributed backend which will take care of sychronizing nodes/GPUs + deepspeed.init_distributed() + + args.global_rank = torch.distributed.get_rank() + + ds_config = get_train_ds_config(args, offload=False, + stage=args.zero_stage) + ds_config[ + 'train_micro_batch_size_per_gpu'] = args.per_device_train_batch_size + ds_config[ + 'train_batch_size'] = args.per_device_train_batch_size * torch.distributed.get_world_size( + ) * args.gradient_accumulation_steps + + # If passed along, set the training seed now. 
+ set_random_seed(args.seed) + + torch.distributed.barrier() + tokenizer = AutoTokenizer.from_pretrained(args.lm_model_name_or_path, + fast_tokenizer=True) + tokenizer.padding_side = 'right' + model, image_processor, tokenizer = create_dsvl_model_and_transforms( + text_tokenizer=tokenizer, + args=args, + ds_config=ds_config) + if args.lang_lora_dim > 0: + model.lang_decoder = convert_linear_layer_to_lora(model.lang_decoder, args.lang_lora_module_name, args.lang_lora_dim) + if args.only_optimize_lora: + model.lang_decoder = only_optimize_lora_parameters(model.lang_decoder) + + if args.vis_lora_dim > 0: + model.vis_encoder = convert_linear_layer_to_lora(model.vis_encoder, args.vis_lora_module_name, args.vis_lora_dim) + if args.only_optimize_lora: + model.vis_encoder = only_optimize_lora_parameters(model.vis_encoder) + + print_rank_0(model, args.global_rank) + + # Prepare the data + if len(args.dataset_samples) < len(args.dataset_names): + assert len(args.dataset_samples) == 1, "when args.dataset_samples is not the same length as args.dataset_names, it should be only one number" + args.dataset_samples = [args.dataset_samples[0]] * len(args.dataset_names) + if len(args.dataset_concatenate_samples) < len(args.dataset_names): + assert len(args.dataset_concatenate_samples) == 1, "when args.dataset_concatenate_samples is not the same length as args.dataset_names, it should be only one number" + args.dataset_concatenate_samples = [args.dataset_concatenate_samples[0]] * len(args.dataset_names) + # convert to int + args.dataset_concatenate_samples = [int(i) for i in args.dataset_concatenate_samples] + + dataset = build_dataset( + args.data_path, + args.data_debug_path, + args.dataset_names, + args.dataset_samples, + args.dataset_concatenate_samples, + args.max_num_image_per_sample, + vis_processor=image_processor, + tokenizer=tokenizer, + ) + # split the dataset into train and evaluation + total_data = len(dataset) + np_rng = np.random.RandomState(seed=args.seed) + dataset = shuffle_dataset(dataset, np_rng) + train_dataset, eval_dataset = split_dataset(dataset, args.data_train_split_ratio) + + train_dataloader = DataLoader( + train_dataset, + batch_size=args.per_device_train_batch_size, + sampler=DistributedSampler(train_dataset, shuffle=True, drop_last=True), + collate_fn=DataCollatorPadToMaxLen(args.max_seq_len, tokenizer.pad_token_id), + ) + + eval_dataloader = DataLoader( + eval_dataset, + batch_size=args.per_device_eval_batch_size, + sampler=DistributedSampler(eval_dataset, shuffle=False), + collate_fn=DataCollatorPadToMaxLen(args.max_seq_len, tokenizer.pad_token_id), + ) + + # Split weights in two groups, one with weight decay and the other not. 
+ optimizer_grouped_parameters = get_optimizer_grouped_parameters( + model, args.weight_decay, small_lr=args.learning_rate_pretraining_components) + + optimizer = AdamW(optimizer_grouped_parameters, + lr=args.learning_rate, + betas=(0.9, 0.95)) + + num_update_steps_per_epoch = math.ceil( + len(train_dataloader) / args.gradient_accumulation_steps) + if args.num_warmup_steps <= 1: + args.num_warmup_steps = int(args.num_warmup_steps * args.num_train_epochs * num_update_steps_per_epoch) + else: + args.num_warmup_steps = int(args.num_warmup_steps) + lr_scheduler = get_scheduler( + name=args.lr_scheduler_type, + optimizer=optimizer, + num_warmup_steps=args.num_warmup_steps, + num_training_steps=args.num_train_epochs * num_update_steps_per_epoch, + ) + + model, optimizer, _, lr_scheduler = deepspeed.initialize( + model=model, + optimizer=optimizer, + args=args, + config=ds_config, + lr_scheduler=lr_scheduler, + dist_init_required=True) + + start_epoch = 0 + # let load checkpoint + if os.path.exists(os.path.join(args.output_dir, 'latest')): + # we have the deepspeed chekpoint so it is a resumed job + # TODO: after loading the ckpt, the global step is not loaded. Need to ask Tunji/Ammar for help. + _, client_state = model.load_checkpoint(args.output_dir) + start_epoch = client_state['epoch'] + best_loss = client_state['best_loss'] + random.setstate(client_state['random_rng_state']) + np.random.set_state(client_state['np_rng_state']) + torch.set_rng_state(client_state['torch_rng_state']) + torch.cuda.set_rng_state(client_state['torch_cuda_rng_state']) + + if args.gradient_checkpointing: + model.gradient_checkpointing_enable() + + def evaluation(model, eval_dataloader): + model.eval() + acc_loss = 0 + for step, batch in enumerate(eval_dataloader): + with torch.no_grad(): + batch = to_device(batch, device) + loss = model( + batch["image"].half() , + batch["input_ids"], + attention_mask=batch["attention_mask"], + input_labels=batch["labels"], + image_num=batch["image_num"], + )[0] + acc_loss += loss + model.train() + acc_loss = get_all_reduce_mean(acc_loss).item() + ave_loss = acc_loss / (step + 1) + print_rank_0(f"the eval average_loss: {ave_loss}", args.global_rank) + return ave_loss + + # Train! 
+ if start_epoch == 0: + print_rank_0("***** Before training *****", args.global_rank) + evaluation(model, eval_dataloader) + best_loss = 1e6 + + print_rank_0("***** Running training *****", args.global_rank) + for epoch in range(start_epoch, args.num_train_epochs): + print_rank_0( + f"Beginning of Epoch {epoch+1}/{args.num_train_epochs}, Total Micro Batches {len(train_dataloader)}", + args.global_rank) + model.train() + acc_loss = 0 + for step, batch in enumerate(train_dataloader): + batch = to_device(batch, device) #torch.size(1, 3, 224, 224]) #torch.Size([1, 1, 3, 224, 224]) + images = batch["image"].half() + input_ids = batch["input_ids"] + attention_mask = batch["attention_mask"] + labels = batch["labels"] + loss = model( + images, + input_ids, + attention_mask=attention_mask, + input_labels=labels, + image_num=batch["image_num"], + )[0] + acc_loss += loss.detach().clone() + model.backward(loss) + model.step() + model.tput_timer.update_epoch_count() + acc_loss = get_all_reduce_mean(acc_loss).item() + print_rank_0(f"Epoch {epoch+1}, the average_loss: {acc_loss/step}", args.global_rank) + eval_loss = evaluation(model, eval_dataloader) + + + if eval_loss < best_loss: + best_loss = eval_loss + + model = fuse_lora(model) + if args.global_rank == 0: + save_hf_format(model, tokenizer, args, f'epoch-{epoch}') + if args.zero_stage == 3: + # For zero stage 3, each gpu only has a part of the model, so we need a special save function + save_zero_three_model(model, + args.global_rank, + args.output_dir, + zero_stage=args.zero_stage, + sub_folder=f'epoch-{epoch}') + model = unfuse_lora(model) + # save deepspeed zero checkpoint so we can resume training if needed + client_state = { + 'random_rng_state': random.getstate(), + 'np_rng_state': np.random.get_state(), + 'torch_rng_state': torch.get_rng_state(), + 'torch_cuda_rng_state': torch.cuda.get_rng_state(), + 'epoch': epoch + 1, # start from next epoch + 'best_loss': best_loss, + } + model.save_checkpoint(args.output_dir, client_state=client_state) # save to the latest + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/applications/DeepSpeed-VisualChat/training/training_scripts/run_7b.sh b/applications/DeepSpeed-VisualChat/training/training_scripts/run_7b.sh new file mode 100755 index 000000000..de7019536 --- /dev/null +++ b/applications/DeepSpeed-VisualChat/training/training_scripts/run_7b.sh @@ -0,0 +1,47 @@ +#!/bin/bash +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +VISION_MODEL=openai/clip-vit-large-patch14 +LLM=meta-llama/Llama-2-7b + + + +EPOCH=6 +ZERO_STAGE=3 +lr=1e-3 + +DATA_PATH=./data +DATA="llava llava_dial otter_mimicit_cgd otter_mimicit_sd otter_mimicit_sn otter_mimicit_tvc otter_mimicit_vst llava_otter_blend sparkles_dialogue" +DATA_SAMPLE="all" +IMAGE_PER_SAMPLE="3 2 1 1 1 1 1 1 1" + +DATA_CONCATE="${DATA// /_}" +DATA_SAMPLE_CONCATE="${DATA_SAMPLE// /_}" +IMAGE_CONCATE="${IMAGE_PER_SAMPLE// /_}" +# + +OUTPUT_Base=./output/ + +OUTPUT_Dir=Epoch${EPOCH}_LR${lr}_data_${DATA_CONCATE}_${DATA_SAMPLE_CONCATE}_${IMAGE_CONCATE} + +OUTPUT=${OUTPUT_Base}${OUTPUT_Dir} + +if [ "$ZERO_STAGE" == "" ]; then + ZERO_STAGE=0 +fi + +mkdir -p $OUTPUT +mkdir -p ./log/$OUTPUT_Dir/ + +# we assume the batch size is 128, which means Num_GPU * per_device_train_batch_size * gradient_accumulation_steps +deepspeed main.py --max_seq_len 4096 \ + --data_path ${DATA_PATH} \ + --dataset_names ${DATA} --dataset_samples ${DATA_SAMPLE} --dataset_concatenate_samples ${IMAGE_PER_SAMPLE} --max_num_image_per_sample 8 \ + --lm_model_name_or_path ${LLM} \ + --vision_model_name_or_path ${VISION_MODEL} \ + --gradient_checkpointing --vis_proj baseline \ + --gradient_accumulation_steps 1 --zero_stage $ZERO_STAGE --learning_rate $lr --num_warmup_steps 0.1 \ + --per_device_train_batch_size 1 --per_device_eval_batch_size 2 --deepspeed --output_dir $OUTPUT --num_train_epochs ${EPOCH} --enable_mmca_attention --enable_tensorboard \ No newline at end of file diff --git a/applications/DeepSpeed-VisualChat/utils/data/DST.py b/applications/DeepSpeed-VisualChat/utils/data/DST.py new file mode 100644 index 000000000..cf5880f3f --- /dev/null +++ b/applications/DeepSpeed-VisualChat/utils/data/DST.py @@ -0,0 +1,139 @@ +from typing import Iterable +import random +import numpy as np +## the following codes are adopted from https://github.com/haotian-liu/LLaVA +## the following codes are adopted from https://github.com/open-mmlab/Multimodal-GPT +## the following codes are adopted from https://github.com/Luodian/Otter/ + +# deepspeed template + +DEFAULT_SYSTEM_TOKEN="### System instuction:" +DEFAULT_PROMPT = f"You are a helpful language and vision assistant. 
You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.\n\n" + +DEFAULT_IMAGE_TOKEN = "" +DEFAULT_HUMAN_TOKEN = "### Human:" +DEFAULT_HUMAN_QUESTION_PRETOKEN = "### Question:" +DEFAULT_QUESTION_TOKEN = "" +DEFAULT_HUMAN_IMAGE_PRETOKEN = "### Image:" + +DEFAULT_ASSISTANT_TOKEN = "### Answer:" +DEFAULT_ANSWER_TOKEN = "" + +DEFAULT_ASSISTANT_END_ROUND_TOKEN="" +DEFAULT_IMAGE_PATCH_TOKEN = "" +DEFAULT_IM_START_TOKEN = "" +DEFAULT_IM_END_TOKEN = "" + +IMAGE_NUM = '' +IMAGE_NUM_1 = '### Image 1:' +IMAGE_NUM_2 = '### Image 2:' +IMAGE_NUM_3 = '### Image 3:' +IMAGE_NUM_4 = '### Image 4:' +IMAGE_NUM_5 = '### Image 5:' +IMAGE_NUM_6 = '### Image 6:' +IMAGE_NUM_7 = '### Image 7:' +IMAGE_NUM_8 = '### Image 8:' + +# fow now we at most support 8 images, can be extended to more +image_mapping_dict = {"default": DEFAULT_HUMAN_IMAGE_PRETOKEN, "1": IMAGE_NUM_1, "2": IMAGE_NUM_2, "3": IMAGE_NUM_3, "4": IMAGE_NUM_4, "5": IMAGE_NUM_5, "6": IMAGE_NUM_6, "7": IMAGE_NUM_7, "8": IMAGE_NUM_8} + +special_token_list = [DEFAULT_HUMAN_IMAGE_PRETOKEN, DEFAULT_IMAGE_TOKEN] # used for easy image # replacement + +DEFAULT_LABEL_PADDING_NUM = -100 + +def add_special_token(tokenizer): + tokenizer.add_tokens(special_token_list, special_tokens=True) + if tokenizer.pad_token is None: + # Issue: GPT models don't have a pad token, which we use to + # modify labels for the loss. + tokenizer.add_special_tokens({"pad_token": ""}) + return tokenizer + +def get_image_num_map(tokenizer): + image_num_map = {} + for key in image_mapping_dict: + image_num_map[image_mapping_dict[key]] = tokenizer(image_mapping_dict[key])['input_ids'][1:] # remove + image_num_map[DEFAULT_HUMAN_IMAGE_PRETOKEN] = image_num_map[DEFAULT_HUMAN_IMAGE_PRETOKEN][0] # convert list to number + return image_num_map + +TEMPLATE = { + "description": "Template Modified by DeepSpeed Team for Chat.", + "prompt_qa_with_image": f'''{DEFAULT_HUMAN_IMAGE_PRETOKEN}\n{DEFAULT_IMAGE_TOKEN}\n\n{DEFAULT_HUMAN_QUESTION_PRETOKEN}\n{DEFAULT_QUESTION_TOKEN}\n\n{DEFAULT_ASSISTANT_TOKEN}\n''', + "prompt_qa_without_image": f'''{DEFAULT_HUMAN_QUESTION_PRETOKEN}\n{DEFAULT_QUESTION_TOKEN}\n\n{DEFAULT_ASSISTANT_TOKEN}\n''', +} + +class Prompter: + def __call__(self, question, with_image=True, first_message=False, num_images=-1, options=None): + if options: + raise NotImplementedError("options not supported yet") + options = ", ".join(options) + res = TEMPLATE["prompt_choice"].format(image=DEFAULT_IMAGE_TOKEN, question=question, options=options) + else: + if with_image: + res = TEMPLATE["prompt_qa_with_image"].replace(DEFAULT_QUESTION_TOKEN, question) + if num_images >= 1: + tmp_dict = { + 1: f"{DEFAULT_HUMAN_IMAGE_PRETOKEN}\n{DEFAULT_IMAGE_TOKEN}\n\n", + 2: f"{DEFAULT_HUMAN_IMAGE_PRETOKEN}\n{DEFAULT_IMAGE_TOKEN}\n{DEFAULT_HUMAN_IMAGE_PRETOKEN}\n{DEFAULT_IMAGE_TOKEN}\n\n", + 3: f"{DEFAULT_HUMAN_IMAGE_PRETOKEN}\n{DEFAULT_IMAGE_TOKEN}\n{DEFAULT_HUMAN_IMAGE_PRETOKEN}\n{DEFAULT_IMAGE_TOKEN}\n{DEFAULT_HUMAN_IMAGE_PRETOKEN}\n{DEFAULT_IMAGE_TOKEN}\n\n", + 4: f"{DEFAULT_HUMAN_IMAGE_PRETOKEN}\n{DEFAULT_IMAGE_TOKEN}\n{DEFAULT_HUMAN_IMAGE_PRETOKEN}\n{DEFAULT_IMAGE_TOKEN}\n{DEFAULT_HUMAN_IMAGE_PRETOKEN}\n{DEFAULT_IMAGE_TOKEN}\n{DEFAULT_HUMAN_IMAGE_PRETOKEN}\n{DEFAULT_IMAGE_TOKEN}\n\n", + 5: 
f"{DEFAULT_HUMAN_IMAGE_PRETOKEN}\n{DEFAULT_IMAGE_TOKEN}\n{DEFAULT_HUMAN_IMAGE_PRETOKEN}\n{DEFAULT_IMAGE_TOKEN}\n{DEFAULT_HUMAN_IMAGE_PRETOKEN}\n{DEFAULT_IMAGE_TOKEN}\n{DEFAULT_HUMAN_IMAGE_PRETOKEN}\n{DEFAULT_IMAGE_TOKEN}\n{DEFAULT_HUMAN_IMAGE_PRETOKEN}\n{DEFAULT_IMAGE_TOKEN}\n\n", + 6: f"{DEFAULT_HUMAN_IMAGE_PRETOKEN}\n{DEFAULT_IMAGE_TOKEN}\n{DEFAULT_HUMAN_IMAGE_PRETOKEN}\n{DEFAULT_IMAGE_TOKEN}\n{DEFAULT_HUMAN_IMAGE_PRETOKEN}\n{DEFAULT_IMAGE_TOKEN}\n{DEFAULT_HUMAN_IMAGE_PRETOKEN}\n{DEFAULT_IMAGE_TOKEN}\n{DEFAULT_HUMAN_IMAGE_PRETOKEN}\n{DEFAULT_IMAGE_TOKEN}\n{DEFAULT_HUMAN_IMAGE_PRETOKEN}\n{DEFAULT_IMAGE_TOKEN}\n\n", + 7: f"{DEFAULT_HUMAN_IMAGE_PRETOKEN}\n{DEFAULT_IMAGE_TOKEN}\n{DEFAULT_HUMAN_IMAGE_PRETOKEN}\n{DEFAULT_IMAGE_TOKEN}\n{DEFAULT_HUMAN_IMAGE_PRETOKEN}\n{DEFAULT_IMAGE_TOKEN}\n{DEFAULT_HUMAN_IMAGE_PRETOKEN}\n{DEFAULT_IMAGE_TOKEN}\n{DEFAULT_HUMAN_IMAGE_PRETOKEN}\n{DEFAULT_IMAGE_TOKEN}\n{DEFAULT_HUMAN_IMAGE_PRETOKEN}\n{DEFAULT_IMAGE_TOKEN}\n{DEFAULT_HUMAN_IMAGE_PRETOKEN}\n{DEFAULT_IMAGE_TOKEN}\n\n", + 8: f"{DEFAULT_HUMAN_IMAGE_PRETOKEN}\n{DEFAULT_IMAGE_TOKEN}\n{DEFAULT_HUMAN_IMAGE_PRETOKEN}\n{DEFAULT_IMAGE_TOKEN}\n{DEFAULT_HUMAN_IMAGE_PRETOKEN}\n{DEFAULT_IMAGE_TOKEN}\n{DEFAULT_HUMAN_IMAGE_PRETOKEN}\n{DEFAULT_IMAGE_TOKEN}\n{DEFAULT_HUMAN_IMAGE_PRETOKEN}\n{DEFAULT_IMAGE_TOKEN}\n{DEFAULT_HUMAN_IMAGE_PRETOKEN}\n{DEFAULT_IMAGE_TOKEN}\n{DEFAULT_HUMAN_IMAGE_PRETOKEN}\n{DEFAULT_IMAGE_TOKEN}\n{DEFAULT_HUMAN_IMAGE_PRETOKEN}\n{DEFAULT_IMAGE_TOKEN}\n\n", + } + res = res.replace(f"{DEFAULT_HUMAN_IMAGE_PRETOKEN}\n{DEFAULT_IMAGE_TOKEN}\n\n", tmp_dict[num_images]) + else: + res = TEMPLATE["prompt_qa_without_image"].replace(DEFAULT_QUESTION_TOKEN, question) + + if first_message: + res = DEFAULT_PROMPT + res + return res + + def get_response(self, output: str) -> str: + return output.split(TEMPLATE["response_split"])[-1].strip() + +def _flatten(items): + """Yield items from any nested iterable; see Reference.""" + for x in items: + if isinstance(x, Iterable) and not isinstance(x, (str, bytes)): + for sub_x in flatten(x): + yield sub_x + else: + yield x + +def flatten(items): + return list(_flatten(items)) + + +def split_list_with_random_num_items_up_to_a_certain_number(input_list, max_num): + if len(input_list) <= max_num: + return [input_list] + else: + random_num = random.randint(1, max_num) + return [input_list[:random_num]] + split_list_with_random_num_items_up_to_a_certain_number(input_list[random_num:], max_num) + +def random_grouping(input_list, max_num): + random.shuffle(input_list) + random_num = np.random.randint(1, max_num+1, len(input_list)) + # use bisect to find the index of random_num, whose sum is equal or large to len(input_list) + # then split the input_list into groups + cum_sum = np.cumsum(random_num) + # find the index now + left = 0 + right = len(cum_sum) - 1 + while left < right: + mid = (left + right) // 2 + if cum_sum[mid] >= len(input_list): + right = mid + else: + left = mid + 1 + index = left + cum_sum = list(cum_sum[:index+1]) + if cum_sum[-1] > len(input_list): + cum_sum[-1] = len(input_list) + elif cum_sum[-1] < len(input_list): + cum_sum.append(len(input_list)) + + return [input_list[cum_sum[i]:cum_sum[i+1]] for i in range(len(cum_sum)-1)] + # return split_list_with_random_num_items_up_to_a_certain_number(input_list, max_num) \ No newline at end of file diff --git a/applications/DeepSpeed-VisualChat/utils/data/__init__.py b/applications/DeepSpeed-VisualChat/utils/data/__init__.py new file mode 100644 index 000000000..31de795ba --- /dev/null +++ 
b/applications/DeepSpeed-VisualChat/utils/data/__init__.py @@ -0,0 +1,6 @@ +# This file is adapted from https://github.com/open-mmlab/Multimodal-GPT + +from .builder import build_dataset # noqa: F401 +from .vqa_dataset import VQADataset # noqa: F401 +from .utils import DataCollatorPadToMaxLen, split_dataset, shuffle_dataset # noqa: F401 +from .DST import add_special_token \ No newline at end of file diff --git a/applications/DeepSpeed-VisualChat/utils/data/aokvqa_dataset.py b/applications/DeepSpeed-VisualChat/utils/data/aokvqa_dataset.py new file mode 100644 index 000000000..bc204dba3 --- /dev/null +++ b/applications/DeepSpeed-VisualChat/utils/data/aokvqa_dataset.py @@ -0,0 +1,59 @@ +# This file is adapted from https://github.com/open-mmlab/Multimodal-GPT +# This dataset is from https://allenai.org/project/a-okvqa/home +import os +import random +from PIL import Image + +from .vqa_dataset import VQADataset +from utils.utils import get_rank +from .utils import save_debug_image, save_debug_text + +REASON_QUESTIONS = [ + "Why?", + "Why is this?", + "And why?", + "What is the reason?", + "And can you tell me why?", + "Can you tell me why?", + "Can you tell me the reason?", +] + + +class AOKVQADataset(VQADataset): + def __init__(self, data_path, data_debug_path, per_sample_image, tokenizer, vis_processor, **kwargs): + vis_root = f"{data_path}/coco/train2017" + assert os.path.isdir(vis_root), f"AOKVQADataset image directory {vis_root} not found, you need to download 2017 Train images from https://cocodataset.org/#download" + ann_paths = ["aokvqa/annotations/aokvqa_v1p0_train.json"] + for idx in range(len(ann_paths)): + ann_paths[idx] = f"{data_path}/{ann_paths[idx]}" + assert os.path.isfile(ann_paths[idx]), f"AOKVQADataset annotation file {ann_paths[idx]} not found, you need to download it from https://allenai.org/project/a-okvqa/home" + super().__init__(data_path, data_debug_path, per_sample_image, tokenizer, vis_processor, + vis_root, ann_paths, **kwargs) + + def process_text(self, ann, data_debug_path=None, data_debug_counter=0, first_message=True): + question = ann["question"] + question = question + " " + random.choice(REASON_QUESTIONS) + + choices = ann["choices"] + true_answer = choices[ann["correct_choice_idx"]] + answer = "The answer is " + true_answer + ". 
Because " + " ".join(ann["rationales"]) + + is_option = random.random() < self.option_prob and len(choices) > 1 # let's not do option for now + # if is_option: + # instruction = self.prompter(question, choices) + # else: + instruction = self.prompter(question, with_image=True, first_message=first_message) + save_debug_text([instruction, answer], data_debug_path, data_debug_counter, get_rank()) + return dict(instruction=instruction, answer=answer) + + def process_image(self, ann, data_debug_path=None, data_debug_counter=0): + image_path = os.path.join(self.vis_root, str(ann["image_id"]).rjust(12, '0') + ".jpg") + save_debug_image(image_path, data_debug_path, data_debug_counter, get_rank(), img_idx=0) + image = Image.open(image_path).convert("RGB") + + image = self.vis_processor(image) + try: + image = image['pixel_values'][0] + return image + except: + return image diff --git a/applications/DeepSpeed-VisualChat/utils/data/builder.py b/applications/DeepSpeed-VisualChat/utils/data/builder.py new file mode 100644 index 000000000..237af28ab --- /dev/null +++ b/applications/DeepSpeed-VisualChat/utils/data/builder.py @@ -0,0 +1,140 @@ +# This file is adapted from https://github.com/open-mmlab/Multimodal-GPT + +import numpy as np +import torch + +from .aokvqa_dataset import AOKVQADataset # noqa: F401 +from .cc_sbu_align_dataset import CcSbuAlignDataset # noqa: F401 +from .coco_caption_dataset import COCOCaptionDataset # noqa: F401 +from .dial_dataset import DialDataset # noqa: F401 +from .llava_dataset import LlavaDataset # noqa: F401 +from .llava_otter_blend_dataset import LlavaOtterBlendDataset # noqa: F401 +from .ocr_vqa_dataset import OCRVQADataset # noqa: F401 +from .otter_mimicit_cgd_dataset import OtterMimicitCgdDataset # noqa: F401 +from .otter_mimicit_sd_dataset import OtterMimicitSdDataset # noqa: F401 +from .otter_mimicit_sn_dataset import OtterMimicitSnDataset # noqa: F401 +from .otter_mimicit_tvc_dataset import OtterMimicitTvcDataset # noqa: F401 +from .otter_mimicit_vst_dataset import OtterMimicitVstDataset # noqa: F401 +from .sparkles_dialogue_dataset import SparklesDialogueDataset # noqa: F401 +from .vqa_dataset import ConcatDataset # noqa: F401 +from utils.utils import print_rank_0 + + +def build_dataset(data_path, data_debug_path, dataset_name, dataset_sample, + dataset_concatenate_samples, max_num_image_per_sample, **kwargs): + if isinstance(dataset_name, list): + datasets = [build_dataset(data_path, data_debug_path, + dataset_name[i], dataset_sample[i], + dataset_concatenate_samples[i], + max_num_image_per_sample, + **kwargs) for i in range(len(dataset_name))] + return ConcatDataset(datasets) + if dataset_name == "aokvqa": + dataset = AOKVQADataset( + data_path, + data_debug_path, + dataset_concatenate_samples, + **kwargs, + ) + elif dataset_name == "coco_caption": + dataset = COCOCaptionDataset( + data_path, + data_debug_path, + dataset_concatenate_samples, + **kwargs, + ) + elif dataset_name == "llava": + dataset = LlavaDataset( + data_path, + data_debug_path, + dataset_concatenate_samples, + **kwargs, + ) + elif dataset_name == "llava_dial": + dataset = DialDataset( + dataset_name, + data_path, + data_debug_path, + dataset_concatenate_samples, + **kwargs, + ) + elif dataset_name == "llava_otter_blend": + dataset = LlavaOtterBlendDataset( + data_path, + data_debug_path, + dataset_concatenate_samples, + followup=False, + **kwargs, + ) + elif dataset_name == "minigpt4": + dataset = CcSbuAlignDataset( + data_path, + data_debug_path, + dataset_concatenate_samples, + **kwargs, + ) 
+ elif dataset_name == "ocr_vqa": + dataset = OCRVQADataset( + data_path, + data_debug_path, + dataset_concatenate_samples, + **kwargs, + ) + elif dataset_name == "otter_mimicit_cgd": + dataset = OtterMimicitCgdDataset( + data_path, + data_debug_path, + dataset_concatenate_samples, + **kwargs, + ) + elif dataset_name == "otter_mimicit_sd": + dataset = OtterMimicitSdDataset( + data_path, + data_debug_path, + dataset_concatenate_samples, + **kwargs, + ) + elif dataset_name == "otter_mimicit_sn": + dataset = OtterMimicitSnDataset( + data_path, + data_debug_path, + dataset_concatenate_samples, + max_num_image_per_sample, + **kwargs, + ) + elif dataset_name == "otter_mimicit_tvc": + dataset = OtterMimicitTvcDataset( + data_path, + data_debug_path, + dataset_concatenate_samples, + max_num_image_per_sample, + **kwargs, + ) + elif dataset_name == "otter_mimicit_vst": + dataset = OtterMimicitVstDataset( + data_path, + data_debug_path, + dataset_concatenate_samples, + max_num_image_per_sample, + **kwargs, + ) + elif dataset_name == "sparkles_dialogue": + dataset = SparklesDialogueDataset( + data_path, + data_debug_path, + dataset_concatenate_samples, + **kwargs, + ) + else: + raise NotImplementedError + + if dataset_sample != 'all': + dataset_sample = int(dataset_sample) + random_indices = np.random.choice(len(dataset), min(dataset_sample, len(dataset)), replace=False) + subsample_dataset = torch.utils.data.Subset(dataset, random_indices) + subsample_dataset.collater = dataset.collater + print_rank_0(f"[DATA] Built dataset {dataset_name} with {len(subsample_dataset)} samples.") + return subsample_dataset + else: + print_rank_0(f"[DATA] Built dataset {dataset_name} with all {len(dataset)} samples.") + return dataset diff --git a/applications/DeepSpeed-VisualChat/utils/data/cc_sbu_align_dataset.py b/applications/DeepSpeed-VisualChat/utils/data/cc_sbu_align_dataset.py new file mode 100644 index 000000000..843b0c355 --- /dev/null +++ b/applications/DeepSpeed-VisualChat/utils/data/cc_sbu_align_dataset.py @@ -0,0 +1,103 @@ +# This file is adapted from https://github.com/open-mmlab/Multimodal-GPT +# This dataset is from https://minigpt-4.github.io/ + +import os +import random +from PIL import Image + +from .vqa_dataset import VQADataset +import utils.data.DST as DST +from utils.utils import get_rank +from .utils import save_debug_image, save_debug_text + + +QUESTIONS = [ + "please describe the image", + "can you describe the image", + "Could you provide a description of the image?", + "What do you see in this image?", + "Share your thoughts on the content of the image.", + "Please narrate what's happening in the picture.", + "Can you give a brief explanation of the image?", + "Describe the main elements and details present in the image.", + "In your own words, what is depicted in the image?", + "Can you outline the key aspects of the image?", + "What are the most striking features in this image?", + "Please provide a summary of the image's content.", + "Describe the overall theme or concept captured in the image.", + "How would you explain the image's composition and focus?", + "What is the focal point or main subject of the image?", + "How do the different components of the image interact with each other?", + "What would be a fitting caption for this image?", + "Can you create a concise description that captures the essence of the image?", + "How would you briefly summarize the content of this image in a phrase or sentence?", + "Please provide a catchy and relevant caption for this picture.", + "If you 
were to give this image a title, what would it be?", + "Describe the image in one creative sentence.", + "Please suggest a memorable phrase that encapsulates the image's content.", + "What engaging phrase would best represent this image?", + "Can you create an expressive caption that highlights the main theme of the image?", + "How would you sum up the image's story for a caption?", + "Provide an eye-catching caption that conveys the image's core message.", + "If you were to give this image a headline, what would it say?", + "Can you craft a captivating caption that communicates the essence of the image?", + "How would you describe the image's content in a powerful caption?", + "Please provide an inventive title to summarize the scene depicted in the image.", + "Compose a concise and striking phrase that reflects the image's key elements.", + "If you were to create a caption for this image, what would it be?", + "Offer a compelling caption that highlights the central focus of the image.", + "Can you produce a unique caption that encapsulates the image's overall mood?", + "Please generate an attention-grabbing caption that would best illustrate the events captured in this image", + "How would you express the image's main idea in an impactful sentence?", + "Please create a vivid and concise title that conveys the essence of the picture.", + "Compose an imaginative caption that reflects the image's most striking features.", + "What memorable statement would best represent the scene illustrated in this image?", + "Draft an evocative caption that brings the image to life for the reader.", + "Can you suggest an insightful caption that highlights the underlying message of the image?", + "What engaging phrase would effectively convey the action or subject matter depicted in this picture?", + "How would you encapsulate the image's core theme in a concise and expressive manner?", + "Please provide a creative and impactful title that captures the spirit of the image.", + "Craft a captivating caption that showcases the image's most prominent attributes.", + "What intriguing statement would best sum up the scene presented in this image?", + "Develop a descriptive caption that paints a vivid picture for the viewer.", + "Can you give a detailed account of the image's contents?", + "What are the key elements and features visible in this image?", + "How would you narrate the events or actions depicted in the picture?", + "Please share your observations about the various components present in the image.", + "What is the overall theme or concept captured in this image? 
Can you describe it?", +] + + +class CcSbuAlignDataset(VQADataset): + def __init__(self, data_path, data_debug_path, per_sample_image, tokenizer, vis_processor, add_eos=True, ignore_instruction=True, **kwargs): + vis_root = f"{data_path}/cc_sbu_align/image" + assert os.path.isdir(vis_root), f"CcSbuAlignDataset image directory {vis_root} not found, you need to download it from https://huggingface.co/datasets/Vision-CAIR/cc_sbu_align" + + ann_paths = ["cc_sbu_align/filter_cap.json"] + real_ann_paths = [] + for ann_path in ann_paths: + ann_path = f"{data_path}/{ann_path}" + real_ann_paths.append(ann_path) + assert os.path.isfile(ann_path), f"CcSbuAlignDataset annotation file {ann_path} not found, you need to download it from https://huggingface.co/datasets/Vision-CAIR/cc_sbu_align" + super().__init__(data_path, data_debug_path, per_sample_image, tokenizer, vis_processor, + vis_root, real_ann_paths, annotation_key="annotations", **kwargs) + + def process_text(self, ann, data_debug_path=None, data_debug_counter=0, first_message=True): + # random select a question + question = random.choice(QUESTIONS) + answer = ann["caption"] + instruction = self.prompter(question, with_image=True, first_message=first_message) + save_debug_text([instruction, answer], data_debug_path, data_debug_counter, get_rank()) + return dict(instruction=instruction, answer=answer) + + def process_image(self, ann, data_debug_path=None, data_debug_counter=0): + image_path = os.path.join(self.vis_root, ann["image_id"] + ".jpg") + save_debug_image(image_path, data_debug_path, data_debug_counter, get_rank(), img_idx=0) + image = Image.open(image_path).convert("RGB") + + image = self.vis_processor(image) + try: + image = image['pixel_values'][0] + return image + except: + return image diff --git a/applications/DeepSpeed-VisualChat/utils/data/coco_caption_dataset.py b/applications/DeepSpeed-VisualChat/utils/data/coco_caption_dataset.py new file mode 100644 index 000000000..9dce9bca8 --- /dev/null +++ b/applications/DeepSpeed-VisualChat/utils/data/coco_caption_dataset.py @@ -0,0 +1,115 @@ +# This file is adapted from https://github.com/open-mmlab/Multimodal-GPT +# This dataset is from https://cs.stanford.edu/people/karpathy/deepimagesent/ + +""" + Copyright (c) 2022, salesforce.com, inc. + All rights reserved. 
+ SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +""" + +import os +import random +from PIL import Image + +from .vqa_dataset import VQADataset +from utils.utils import get_rank +from .utils import save_debug_image, save_debug_text + +QUESTIONS = [ + "please describe the image", + "can you describe the image", + "Could you provide a description of the image?", + "What do you see in this image?", + "Share your thoughts on the content of the image.", + "Please narrate what's happening in the picture.", + "Can you give a brief explanation of the image?", + "Describe the main elements and details present in the image.", + "In your own words, what is depicted in the image?", + "Can you outline the key aspects of the image?", + "What are the most striking features in this image?", + "Please provide a summary of the image's content.", + "Describe the overall theme or concept captured in the image.", + "How would you explain the image's composition and focus?", + "What is the focal point or main subject of the image?", + "How do the different components of the image interact with each other?", + "What would be a fitting caption for this image?", + "Can you create a concise description that captures the essence of the image?", + "How would you briefly summarize the content of this image in a phrase or sentence?", + "Please provide a catchy and relevant caption for this picture.", + "If you were to give this image a title, what would it be?", + "Describe the image in one creative sentence.", + "Please suggest a memorable phrase that encapsulates the image's content.", + "What engaging phrase would best represent this image?", + "Can you create an expressive caption that highlights the main theme of the image?", + "How would you sum up the image's story for a caption?", + "Provide an eye-catching caption that conveys the image's core message.", + "If you were to give this image a headline, what would it say?", + "Can you craft a captivating caption that communicates the essence of the image?", + "How would you describe the image's content in a powerful caption?", + "Please provide an inventive title to summarize the scene depicted in the image.", + "Compose a concise and striking phrase that reflects the image's key elements.", + "If you were to create a caption for this image, what would it be?", + "Offer a compelling caption that highlights the central focus of the image.", + "Can you produce a unique caption that encapsulates the image's overall mood?", + "Please generate an attention-grabbing caption that would best illustrate the events captured in this image", + "How would you express the image's main idea in an impactful sentence?", + "Please create a vivid and concise title that conveys the essence of the picture.", + "Compose an imaginative caption that reflects the image's most striking features.", + "What memorable statement would best represent the scene illustrated in this image?", + "Draft an evocative caption that brings the image to life for the reader.", + "Can you suggest an insightful caption that highlights the underlying message of the image?", + "What engaging phrase would effectively convey the action or subject matter depicted in this picture?", + "How would you encapsulate the image's core theme in a concise and expressive manner?", + "Please provide a creative and impactful title that captures the spirit of the image.", + "Craft a captivating caption that showcases the 
image's most prominent attributes.", + "What intriguing statement would best sum up the scene presented in this image?", + "Develop a descriptive caption that paints a vivid picture for the viewer.", + "Can you give a detailed account of the image's contents?", + "What are the key elements and features visible in this image?", + "How would you narrate the events or actions depicted in the picture?", + "Please share your observations about the various components present in the image.", + "What is the overall theme or concept captured in this image? Can you describe it?", +] + + +class COCOCaptionDataset(VQADataset): + def __init__( + self, data_path, data_debug_path, per_sample_image, tokenizer, vis_processor=None, add_eos=True, ignore_instruction=True, **kwargs + ): + """ + vis_root (string): Root directory of images (e.g. coco/images/) + ann_root (string): directory to store the annotation file + """ + self.vis_root = f"{data_path}/coco/2014" + assert os.path.isdir(self.vis_root), f"COCOCaptionDataset image directory {self.vis_root} not found, you need to download 2014 Train images and 2014 Val images from https://cocodataset.org/#download" + ann_paths = ["coco_caption/dataset.json"] + real_ann_paths = [] + for ann_path in ann_paths: + ann_path = f"{data_path}/{ann_path}" + real_ann_paths.append(ann_path) + assert os.path.isfile(ann_path), f"COCOCaptionDataset annotation file {ann_path} not found, you need to download it from https://cs.stanford.edu/people/karpathy/deepimagesent/coco.zip" + super().__init__(data_path, data_debug_path, per_sample_image, tokenizer, vis_processor, + self.vis_root, real_ann_paths, annotation_key="images", **kwargs) + + def process_image(self, ann, data_debug_path=None, data_debug_counter=0): + image_path = os.path.join(self.vis_root, ann["filename"]) + save_debug_image(image_path, data_debug_path, data_debug_counter, get_rank(), img_idx=0) + image = Image.open(image_path).convert("RGB") + + image = self.vis_processor(image) + try: + image = image['pixel_values'][0] + return image + except: + return image + + def process_text(self, ann, data_debug_path=None, data_debug_counter=0, first_message=True): + all_captions = ann["sentences"] + if not isinstance(all_captions, list): + all_captions = [all_captions] + caption = random.choice(all_captions) + caption = caption['raw'] + instruction = self.prompter(random.choice(QUESTIONS), with_image=True, first_message=first_message) + save_debug_text([instruction, caption], data_debug_path, data_debug_counter, get_rank()) + return dict(instruction=instruction, answer=caption) diff --git a/applications/DeepSpeed-VisualChat/utils/data/dial_dataset.py b/applications/DeepSpeed-VisualChat/utils/data/dial_dataset.py new file mode 100644 index 000000000..63b99b5ae --- /dev/null +++ b/applications/DeepSpeed-VisualChat/utils/data/dial_dataset.py @@ -0,0 +1,78 @@ +# This file is adapted from https://github.com/open-mmlab/Multimodal-GPT +# This dataset is from https://llava-vl.github.io/ +import os +from .vqa_dataset import VQADataset +import utils.data.DST as DST +from utils.utils import get_rank +from .utils import save_debug_text + +class DialDataset(VQADataset): + def __init__(self, dataset_name, data_path, data_debug_path, per_sample_image, tokenizer, vis_processor, **kwargs): + if dataset_name == "llava_dial": + vis_root = f"{data_path}/coco/train2017" + assert os.path.isdir(vis_root), f"llava_dial image directory {vis_root} not found, you need to download 2017 Train images from https://cocodataset.org/#download" + ann_paths = 
["llava/conversation_58k.json"] + for idx in range(len(ann_paths)): + ann_paths[idx] = f"{data_path}/{ann_paths[idx]}" + assert os.path.isfile(ann_paths[idx]), f"llava_dial annotation file {ann_paths[idx]} not found, you need to download it from https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K" + super(DialDataset, self).__init__(data_path, data_debug_path, per_sample_image, + tokenizer, vis_processor, vis_root, + ann_paths, **kwargs) + self.prompter = DST.Prompter() + + def _add_instance_ids(self, key="id"): + for idx, ann in enumerate(self.annotation): + ann[key] = str(idx) + + def process_text(self, anns, data_debug_path=None, data_debug_counter=0, first_message=False): + num_convs = len(anns["conversations"]) // 2 + conv_list = [] + for conv_id in range(num_convs): + question = anns["conversations"][int(2*conv_id)]["value"] + # remove '' tag and '\n' + with_image = "" in question + question = question.replace("", "").replace("\n", "") + answer = anns["conversations"][int(2*conv_id+1)]["value"] + instruction = self.prompter(question, with_image=with_image, first_message=(conv_id == 0 and first_message)) + single_conv = dict(instruction=instruction, answer=answer) + conv_list.append(single_conv) + save_debug_text(conv_list, data_debug_path, data_debug_counter, get_rank()) + return conv_list + + def __getitem__(self, index): + full_res_list = [] + for ann in self.annotation[index]: + image = self.process_image(ann, + data_debug_path=self.data_debug_path, + data_debug_counter=self.data_debug_counter) + text_list = self.process_text(ann, + data_debug_path=self.data_debug_path, + data_debug_counter=self.data_debug_counter, + first_message=(not full_res_list)) + self.data_debug_counter += 1 + res_list = [] + for text in text_list: + single_res = self.tokenize(text) + single_res["instruction"] = text["instruction"] + single_res["answer"] = text["answer"] + res_list.append(single_res) + input_ids = [] + attention_mask = [] + labels = [] + instruction = '' + answer = '' + for res in res_list: + input_ids.extend(res["input_ids"]) + attention_mask.extend(res["attention_mask"]) + labels.extend(res["labels"]) + instruction += res["instruction"] + answer += res["answer"] + + res = dict( + input_ids=input_ids, attention_mask=attention_mask, labels=labels, instruction=instruction, answer=answer + ) + res.update(image=image) + + full_res_list.append(res) + output = self.merge_all_images(full_res_list) + return output diff --git a/applications/DeepSpeed-VisualChat/utils/data/llava_dataset.py b/applications/DeepSpeed-VisualChat/utils/data/llava_dataset.py new file mode 100644 index 000000000..601ecbc4b --- /dev/null +++ b/applications/DeepSpeed-VisualChat/utils/data/llava_dataset.py @@ -0,0 +1,31 @@ +# This file is adapted from https://github.com/open-mmlab/Multimodal-GPT +# This dataset is from https://llava-vl.github.io/ +import os +from .vqa_dataset import VQADataset +from utils.utils import get_rank +from .utils import save_debug_text + + +class LlavaDataset(VQADataset): + def __init__(self, data_path, data_debug_path, per_sample_image, tokenizer, vis_processor, **kwargs): + vis_root = f"{data_path}/coco/train2017" + assert os.path.isdir(vis_root), f"LlavaDataset image directory {vis_root} not found, you need to download 2017 Train images from https://cocodataset.org/#download" + ann_paths = ["llava/detail_23k.json", "llava/complex_reasoning_77k.json"] + for idx in range(len(ann_paths)): + ann_paths[idx] = f"{data_path}/{ann_paths[idx]}" + assert os.path.isfile(ann_paths[idx]), 
f"LlavaDataset annotation file {ann_paths[idx]} not found, you need to download it from https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K" + super().__init__(data_path, data_debug_path, per_sample_image, tokenizer, vis_processor, + vis_root, ann_paths, **kwargs) + + def _add_instance_ids(self, key="id"): + for idx, ann in enumerate(self.annotation): + ann[key] = str(idx) + + def process_text(self, ann, data_debug_path=None, data_debug_counter=0, first_message=False): + question = ann["conversations"][0]["value"] + # remove '' tag and '\n' + question = question.replace("", "").replace("\n", "") + answer = ann["conversations"][1]["value"] + instruction = self.prompter(question, with_image=True, first_message=first_message) + save_debug_text([instruction, answer], data_debug_path, data_debug_counter, get_rank()) + return dict(instruction=instruction, answer=answer) diff --git a/applications/DeepSpeed-VisualChat/utils/data/llava_otter_blend_dataset.py b/applications/DeepSpeed-VisualChat/utils/data/llava_otter_blend_dataset.py new file mode 100644 index 000000000..a35962280 --- /dev/null +++ b/applications/DeepSpeed-VisualChat/utils/data/llava_otter_blend_dataset.py @@ -0,0 +1,207 @@ +# This dataset is from https://llava-vl.github.io/ and https://huggingface.co/datasets/pufanyi/MIMICIT +# This dataset blends llava, llava_dial, and otter_mimicit_cgd datasets, which is possible because +# all of them use coco images. In each sample of LlavaOtterBlendDataset, there will first have at +# least one instruction-answer pair from llava/llava_dial, then followed by at least one +# instruction-answer pair from otter_mimicit_cgd. +import os +import torch +import json +import random +from tqdm import tqdm +from PIL import Image +from .vqa_dataset import VQADataset +from utils.utils import print_rank_0, is_rank_0, get_rank +from .utils import save_debug_image, save_debug_text + + +class LlavaOtterBlendDataset(VQADataset): + def __init__(self, data_path, data_debug_path, per_sample_image, followup, tokenizer, vis_processor, **kwargs): + vis_root = f"{data_path}/coco/train2017" + assert os.path.isdir(vis_root), f"LlavaOtterBlendDataset image directory {vis_root} not found, you need to download 2017 Train images from https://cocodataset.org/#download" + + otter_mimicit_cgd = f"{data_path}/MIMIC-IT/CGD_instructions.json" + llava = [f"{data_path}/llava/detail_23k.json", f"{data_path}/llava/complex_reasoning_77k.json", f"{data_path}/llava/conversation_58k.json"] + ann_path_otter = f"{data_path}/LlavaOtterBlendDataset_instructions_otter.json" + ann_path_llava = f"{data_path}/LlavaOtterBlendDataset_instructions_llava.json" + if not os.path.isfile(ann_path_llava): + print_rank_0(f"LlavaOtterBlendDataset llava annotation file {ann_path_llava} not found, starting an one-time preprocessing:") + if is_rank_0(): + annotations_llava = {} + for llava_ann in llava: + assert os.path.isfile(llava_ann), f"LlavaOtterBlendDataset raw annotation file {llava_ann} not found, you need to download it from https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K" + raw_annotation = json.load(open(llava_ann, "r")) + for raw_ann in raw_annotation: + if raw_ann["image"] not in annotations_llava: + annotations_llava[raw_ann["image"]] = [] + annotations_llava[raw_ann["image"]].append(raw_ann["conversations"]) + with open(ann_path_llava, 'w') as f: + json.dump(annotations_llava, f) + torch.distributed.barrier() + self.ann_llava = json.load(open(ann_path_llava, "r")) + if not os.path.isfile(ann_path_otter): + 
print_rank_0(f"LlavaOtterBlendDataset otter annotation file {ann_path_otter} not found, starting an one-time preprocessing:") + if is_rank_0(): + assert os.path.isfile(otter_mimicit_cgd), f"LlavaOtterBlendDataset raw annotation file {otter_mimicit_cgd} not found, you need to download it from https://huggingface.co/datasets/pufanyi/MIMICIT" + raw_annotation = json.load(open(otter_mimicit_cgd, "r"))["data"] + raw_annotation_keys = list(raw_annotation.keys()) + annotations_otter = [] + for k in tqdm(raw_annotation_keys): + if k in raw_annotation: + ann = {} + ann["image_ids"] = [self.convert_image_id(x) for x in raw_annotation[k]["image_ids"]] + meet_criteria = True + for midx in range(len(ann["image_ids"])-1): + if ann["image_ids"][midx] not in self.ann_llava: + meet_criteria = False + if meet_criteria: # If any image (except the last image) doesn't have llava conversation, we won't be able to build valid sample with correct image order + ann["instruction"] = [raw_annotation[k]["instruction"]] + ann["answer"] = [raw_annotation[k]["answer"]] + rel_ins_ids = raw_annotation[k]["rel_ins_ids"] + for k_rel in rel_ins_ids: + if k_rel in raw_annotation: + ann["instruction"].append(raw_annotation[k_rel]["instruction"]) + ann["answer"].append(raw_annotation[k_rel]["answer"]) + del raw_annotation[k_rel] + annotations_otter.append(ann) + del raw_annotation[k] + with open(ann_path_otter, 'w') as f: + json.dump(annotations_otter, f) + torch.distributed.barrier() + super().__init__(data_path, data_debug_path, per_sample_image, tokenizer, vis_processor, + vis_root, [ann_path_otter], **kwargs) + self.followup = followup + + def _add_instance_ids(self, key="id"): + for idx, ann in enumerate(self.annotation): + ann[key] = str(idx) + + def convert_image_id(self, image_id): + return image_id[8:] + ".jpg" + + def process_image(self, ann, data_debug_path=None, data_debug_counter=0): + images = ann["image_ids"] + output_images = [] + for idx in range(len(images)): + image = images[idx] + image_path = os.path.join(self.vis_root, image) + save_debug_image(image_path, data_debug_path, data_debug_counter, get_rank(), img_idx=idx) + image = Image.open(image_path).convert("RGB") + + image = self.vis_processor(image) + try: + image = image['pixel_values'][0] + except: + image = image + output_images.append(image) + + return output_images + + def process_text(self, ann, data_debug_path=None, data_debug_counter=0, first_message=False, num_images=1): + images = ann["image_ids"] + processed_images = {} + conv_list = [] + # At least one conversation from llava + for idx in range(len(images)): + img_key = images[idx] + if img_key in self.ann_llava: + conversations = self.ann_llava[img_key] + min_num_draw = 1 if idx < (len(images) - 1) else 0 # The last image could have 0 llava conversation since it won't break image order + num_draw = random.randint(min_num_draw, len(conversations)) + chosen = random.sample(list(range(len(conversations))), num_draw) + for cid in chosen: + conv = conversations[cid] + num_convs = len(conv) // 2 + for conv_id in range(num_convs): + question = conv[int(2*conv_id)]["value"] + # remove '' tag and '\n' + with_image = img_key not in processed_images + question = question.replace("", "").replace("\n", "") + answer = conv[int(2*conv_id+1)]["value"] + instruction = self.prompter(question, with_image=with_image, first_message=(len(conv_list) == 0 and first_message)) + if with_image: + instruction = self.post_process_text_image_count(instruction, 1, offset=len(processed_images)) + single_conv = 
dict(instruction=instruction, answer=answer) + conv_list.append(single_conv) + processed_images[img_key] = 1 + + # At least one conversation from otter + question_list = ann["instruction"] + answer_list = ann["answer"] + num_convs = len(question_list) + num_draw = random.randint(1, num_convs) + chosen = random.sample(list(range(num_convs)), num_draw) + for cid in chosen: + question = question_list[cid] + # remove '' tag and '\n' + question = question.replace("", "").replace("\n", "") + answer = answer_list[cid] + num_images = len(images) - len(processed_images) + instruction = self.prompter(question, with_image=(num_images > 0), + first_message=(len(conv_list) == 0), + num_images=num_images) + if num_images > 0: + instruction = self.post_process_text_image_count(instruction, num_images, offset=len(processed_images)) + single_conv = dict(instruction=instruction, answer=answer) + conv_list.append(single_conv) + processed_images = images + # Follow-up llava conversations + if self.followup: + image_tags = {0: ["In image 1, ", "In image a, ", "In the first image, "], 1: ["In image 2, ", "In image b, ", "In the second image, "]} + for idx in range(len(images)): + img_key = images[idx] + if img_key in self.ann_llava: + conversations = self.ann_llava[img_key] + # min_num_draw = 1 + # num_draw = random.randint(min_num_draw, len(conversations)) + num_draw = 1 # To avoid making too complex conversation, we limit num of follow-up conversation to 1 per image + chosen = random.sample(list(range(len(conversations))), num_draw) + for cid in chosen: + conv = conversations[cid] + num_convs = len(conv) // 2 + for conv_id in range(num_convs): + question = conv[int(2*conv_id)]["value"] + # remove '' tag and '\n' + question = question.replace("", "").replace("\n", "") + answer = conv[int(2*conv_id+1)]["value"] + # Add image tags so the model knows which image we are referring + chosen_tag = random.choice(image_tags[idx]) + question = chosen_tag + question[0].lower() + question[1:] + answer = chosen_tag + answer[0].lower() + answer[1:] + instruction = self.prompter(question, with_image=False, first_message=False) + single_conv = dict(instruction=instruction, answer=answer) + conv_list.append(single_conv) + save_debug_text(conv_list, data_debug_path, data_debug_counter, get_rank()) + return conv_list + + def __getitem__(self, index): + ann = self.annotation[index][0] # self.annotation[index] is a list because of "self.annotation = DST.random_grouping(self.annotation, self.per_sample_image)" in VQADataset init + images_list = self.process_image(ann, + data_debug_path=self.data_debug_path, + data_debug_counter=self.data_debug_counter) + text_list = self.process_text(ann, + data_debug_path=self.data_debug_path, + data_debug_counter=self.data_debug_counter, + first_message=True, + num_images=len(images_list)) + + self.data_debug_counter += 1 + res_list = [] + for text in text_list: + single_res = self.tokenize(text) + res_list.append(single_res) + + input_ids = [] + attention_mask = [] + labels = [] + for res in res_list: + input_ids.extend(res["input_ids"]) + attention_mask.extend(res["attention_mask"]) + labels.extend(res["labels"]) + + res = dict( + input_ids=input_ids, attention_mask=attention_mask, labels=labels + ) + res.update(image=images_list) + res.update(image_num=len(images_list)) + + return res diff --git a/applications/DeepSpeed-VisualChat/utils/data/ocr_vqa_dataset.py b/applications/DeepSpeed-VisualChat/utils/data/ocr_vqa_dataset.py new file mode 100644 index 000000000..0e57fbb8e --- /dev/null +++ 
b/applications/DeepSpeed-VisualChat/utils/data/ocr_vqa_dataset.py @@ -0,0 +1,68 @@ +# This file is adapted from https://github.com/open-mmlab/Multimodal-GPT +# This dataset is from https://ocr-vqa.github.io/ +import json +import os +import random +import torch + +from PIL import Image +from tqdm import tqdm + +from .vqa_dataset import VQADataset +from utils.utils import print_rank_0, is_rank_0, get_rank +from .utils import save_debug_image, save_debug_text + + +class OCRVQADataset(VQADataset): + def __init__(self, data_path, data_debug_path, per_sample_image, tokenizer, vis_processor, + add_eos=True, ignore_instruction=True, **kwargs): + self.vis_root = f"{data_path}/OCR_VQA/images" + assert os.path.isdir(self.vis_root), f"OCRVQADataset image directory {self.vis_root} not found, you need to download images from https://ocr-vqa.github.io/" + ann_paths_raw = ["OCR_VQA/dataset.json"] + ann_paths = ["OCR_VQA/dataset_processed.json"] + real_ann_paths = [] + for idx in range(len(ann_paths_raw)): + ann_path_raw = f"{data_path}/{ann_paths_raw[idx]}" + assert os.path.isfile(ann_path_raw), f"OCRVQADataset raw annotation file {ann_path_raw} not found, you need to download it from https://ocr-vqa.github.io/" + ann_path = f"{data_path}/{ann_paths[idx]}" + real_ann_paths.append(ann_path) + if not os.path.isfile(ann_path): + print_rank_0(f"OCRVQADataset annotation file {ann_path_raw} not found, starting an one-time preprocessing:") + raw_annotation = json.load(open(ann_path_raw, "r")) + raw_annotation_keys = list(raw_annotation.keys()) + for k in tqdm(raw_annotation_keys): + ext=os.path.splitext(raw_annotation[k]['imageURL'])[1] + outputFile = '%s%s'%(k,ext) + image_path = os.path.join(self.vis_root, outputFile) + image = Image.open(image_path).convert("RGB") + if image.size[0] > 1 and image.size[1] > 1: + raw_annotation[k]["filename"] = outputFile + else: + del raw_annotation[k] + if is_rank_0(): + with open(ann_path, 'w') as f: + json.dump(list(raw_annotation.values()), f) + torch.distributed.barrier() + super().__init__(data_path, data_debug_path, per_sample_image, tokenizer, vis_processor, + self.vis_root, real_ann_paths, **kwargs) + + def process_image(self, ann, data_debug_path=None, data_debug_counter=0): + image_path = os.path.join(self.vis_root, ann["filename"]) + save_debug_image(image_path, data_debug_path, data_debug_counter, get_rank(), img_idx=0) + image = Image.open(image_path).convert("RGB") + + image = self.vis_processor(image) + try: + image = image['pixel_values'][0] + return image + except: + return image + + def process_text(self, ann, data_debug_path=None, data_debug_counter=0, first_message=True): + index = random.choice(list(range(len(ann["questions"])))) + question = ann["questions"][index] + answer = ann["answers"][index] + + instruction = self.prompter(question, with_image=True, first_message=first_message) + save_debug_text([instruction, answer], data_debug_path, data_debug_counter, get_rank()) + return dict(instruction=instruction, answer=answer) diff --git a/applications/DeepSpeed-VisualChat/utils/data/otter_mimicit_cgd_dataset.py b/applications/DeepSpeed-VisualChat/utils/data/otter_mimicit_cgd_dataset.py new file mode 100644 index 000000000..53d45551e --- /dev/null +++ b/applications/DeepSpeed-VisualChat/utils/data/otter_mimicit_cgd_dataset.py @@ -0,0 +1,145 @@ +# This dataset is from https://huggingface.co/datasets/pufanyi/MIMICIT +import os +import torch +import json +import random +from tqdm import tqdm +from PIL import Image +from .vqa_dataset import VQADataset +from 
utils.utils import print_rank_0, is_rank_0, get_rank +from .utils import save_debug_image, save_debug_text + + +class OtterMimicitCgdDataset(VQADataset): + def __init__(self, data_path, data_debug_path, per_sample_image, tokenizer, vis_processor, **kwargs): + vis_root = f"{data_path}/coco/train2017" + assert os.path.isdir(vis_root), f"OtterMimicitCgdDataset image directory {vis_root} not found, you need to download 2017 Train images from https://cocodataset.org/#download" + ### Below commented code are the images from the MIMIC-IT. We use the original coco images above which are the same and with higher resolution. + # vis_root = f"{data_path}/MIMIC-IT/CGD_images" + # if not os.path.isdir(vis_root): + # print_rank_0(f"OtterMimicitCgdDataset image directory {vis_root} not found, starting an one-time preprocessing:") + # vis_root_file = f"{data_path}/MIMIC-IT/CGD.json" + # assert os.path.isfile(vis_root_file), f"OtterMimicitCgdDataset image data {vis_root_file} not found, you need to download it from https://huggingface.co/datasets/pufanyi/MIMICIT" + # if is_rank_0(): + # os.makedirs(vis_root, exist_ok=True) + # image_data = json.load(open(vis_root_file, "r")) + # image_keys = list(image_data.keys()) + # for k in tqdm(image_keys): + # image = base64.b64decode(image_data[k]) + # with open(f"{vis_root}/{k}.jpg", 'wb') as f: + # f.write(image) + # torch.distributed.barrier() + + ann_paths_raw = ["MIMIC-IT/CGD_instructions.json"] + ann_paths = ["MIMIC-IT/CGD_instructions_merged.json"] + for idx in range(len(ann_paths)): + ann_paths_raw[idx] = f"{data_path}/{ann_paths_raw[idx]}" + ann_paths[idx] = f"{data_path}/{ann_paths[idx]}" + assert os.path.isfile(ann_paths_raw[idx]), f"OtterMimicitCgdDataset raw annotation file {ann_paths_raw[idx]} not found, you need to download it from https://huggingface.co/datasets/pufanyi/MIMICIT" + if not os.path.isfile(ann_paths[idx]): + print_rank_0(f"OtterMimicitCgdDataset annotation file {ann_paths[idx]} not found, starting an one-time preprocessing:") + if is_rank_0(): + raw_annotation = json.load(open(ann_paths_raw[idx], "r"))["data"] + raw_annotation_keys = list(raw_annotation.keys()) + random.shuffle(raw_annotation_keys) + annotations = [] + for k in tqdm(raw_annotation_keys): + if k in raw_annotation: + ann = {} + ann["image_ids"] = raw_annotation[k]["image_ids"] + ann["instruction"] = [raw_annotation[k]["instruction"]] + ann["answer"] = [raw_annotation[k]["answer"]] + rel_ins_ids = raw_annotation[k]["rel_ins_ids"] + for k_rel in rel_ins_ids: + if k_rel in raw_annotation: + ann["instruction"].append(raw_annotation[k_rel]["instruction"]) + ann["answer"].append(raw_annotation[k_rel]["answer"]) + del raw_annotation[k_rel] + annotations.append(ann) + del raw_annotation[k] + with open(ann_paths[idx], 'w') as f: + json.dump(annotations, f) + torch.distributed.barrier() + super().__init__(data_path, data_debug_path, per_sample_image, tokenizer, vis_processor, + vis_root, ann_paths, **kwargs) + + def _add_instance_ids(self, key="id"): + for idx, ann in enumerate(self.annotation): + ann[key] = str(idx) + + def convert_image_id(self, image_id): + return image_id[8:] + ".jpg" + # return image_id + ".jpg" ### Change to this if you switch to use images from MIMIC-IT/CGD_images + + def process_image(self, ann, data_debug_path=None, data_debug_counter=0): + images = ann["image_ids"] + output_images = [] + for idx in range(len(images)): + image = images[idx] + image_path = os.path.join(self.vis_root, self.convert_image_id(image)) + save_debug_image(image_path, 
data_debug_path, data_debug_counter, get_rank(), img_idx=idx) + image = Image.open(image_path).convert("RGB") + + image = self.vis_processor(image) + try: + image = image['pixel_values'][0] + except: + image = image + output_images.append(image) + + return output_images + + def process_text(self, ann, data_debug_path=None, data_debug_counter=0, first_message=False, num_images=1): + question_list = ann["instruction"] + answer_list = ann["answer"] + num_convs = len(question_list) + indexes = list(range(num_convs)) + random.shuffle(indexes) + conv_list = [] + for conv_id in range(num_convs): + question = question_list[indexes[conv_id]] + # remove '' tag and '\n' + question = question.replace("", "").replace("\n", "") + answer = answer_list[indexes[conv_id]] + instruction = self.prompter(question, with_image=(conv_id == 0 and first_message), + first_message=(conv_id == 0 and first_message), + num_images=num_images) + if conv_id == 0 and first_message: + instruction = self.post_process_text_image_count(instruction, num_images) + single_conv = dict(instruction=instruction, answer=answer) + conv_list.append(single_conv) + save_debug_text(conv_list, data_debug_path, data_debug_counter, get_rank()) + return conv_list + + def __getitem__(self, index): + ann = self.annotation[index][0] # self.annotation[index] is a list because of "self.annotation = DST.random_grouping(self.annotation, self.per_sample_image)" in VQADataset init + images_list = self.process_image(ann, + data_debug_path=self.data_debug_path, + data_debug_counter=self.data_debug_counter) + text_list = self.process_text(ann, + data_debug_path=self.data_debug_path, + data_debug_counter=self.data_debug_counter, + first_message=True, + num_images=len(images_list)) + + self.data_debug_counter += 1 + res_list = [] + for text in text_list: + single_res = self.tokenize(text) + res_list.append(single_res) + + input_ids = [] + attention_mask = [] + labels = [] + for res in res_list: + input_ids.extend(res["input_ids"]) + attention_mask.extend(res["attention_mask"]) + labels.extend(res["labels"]) + + res = dict( + input_ids=input_ids, attention_mask=attention_mask, labels=labels + ) + res.update(image=images_list) + res.update(image_num=len(images_list)) + + return res diff --git a/applications/DeepSpeed-VisualChat/utils/data/otter_mimicit_sd_dataset.py b/applications/DeepSpeed-VisualChat/utils/data/otter_mimicit_sd_dataset.py new file mode 100644 index 000000000..4bd7740e4 --- /dev/null +++ b/applications/DeepSpeed-VisualChat/utils/data/otter_mimicit_sd_dataset.py @@ -0,0 +1,134 @@ +# This dataset is from https://huggingface.co/datasets/pufanyi/MIMICIT +import os +import torch +import json +import base64 +import random +from tqdm import tqdm +from PIL import Image +from io import BytesIO +from .vqa_dataset import VQADataset +from utils.utils import print_rank_0, is_rank_0, get_rank +from .utils import save_debug_image, save_debug_text + + +class OtterMimicitSdDataset(VQADataset): + def __init__(self, data_path, data_debug_path, per_sample_image, tokenizer, vis_processor, **kwargs): + vis_root = f"{data_path}/MIMIC-IT/SD.json" + assert os.path.isfile(vis_root), f"OtterMimicitSdDataset image data {vis_root} not found, you need to download it from https://huggingface.co/datasets/pufanyi/MIMICIT" + self.vis_root_dict = json.load(open(vis_root, "r")) + + ann_paths_raw = ["MIMIC-IT/SD_instructions.json"] + ann_paths = ["MIMIC-IT/SD_instructions_merged.json"] + for idx in range(len(ann_paths)): + ann_paths_raw[idx] = f"{data_path}/{ann_paths_raw[idx]}" 
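+ # The block below performs a one-time preprocessing step on rank 0 (other ranks wait at the
+ # barrier): it drops image ids that are missing from SD.json, merges each instruction with its
+ # related instructions (rel_ins_ids) into a single annotation entry, skips entries left with no
+ # images, and caches the result as SD_instructions_merged.json so later runs can load it directly.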
+ ann_paths[idx] = f"{data_path}/{ann_paths[idx]}" + assert os.path.isfile(ann_paths_raw[idx]), f"OtterMimicitSdDataset raw annotation file {ann_paths_raw[idx]} not found, you need to download it from https://huggingface.co/datasets/pufanyi/MIMICIT" + if not os.path.isfile(ann_paths[idx]): + print_rank_0(f"OtterMimicitSdDataset annotation file {ann_paths[idx]} not found, starting an one-time preprocessing:") + if is_rank_0(): + raw_annotation = json.load(open(ann_paths_raw[idx], "r"))["data"] + raw_annotation_keys = list(raw_annotation.keys()) + random.shuffle(raw_annotation_keys) + annotations = [] + for k in tqdm(raw_annotation_keys): + if k in raw_annotation: + ann = {} + ann["image_ids"] = [] + for image in raw_annotation[k]["image_ids"]: + if image in self.vis_root_dict: + ann["image_ids"].append(image) + if len(ann["image_ids"]) > 0: + ann["instruction"] = [raw_annotation[k]["instruction"]] + ann["answer"] = [raw_annotation[k]["answer"]] + rel_ins_ids = raw_annotation[k]["rel_ins_ids"] + for k_rel in rel_ins_ids: + if k_rel in raw_annotation: + ann["instruction"].append(raw_annotation[k_rel]["instruction"]) + ann["answer"].append(raw_annotation[k_rel]["answer"]) + del raw_annotation[k_rel] + annotations.append(ann) + del raw_annotation[k] + with open(ann_paths[idx], 'w') as f: + json.dump(annotations, f) + torch.distributed.barrier() + super().__init__(data_path, data_debug_path, per_sample_image, tokenizer, vis_processor, + vis_root, ann_paths, **kwargs) + + def _add_instance_ids(self, key="id"): + for idx, ann in enumerate(self.annotation): + ann[key] = str(idx) + + def process_image(self, ann, data_debug_path=None, data_debug_counter=0): + images = ann["image_ids"] + output_images = [] + for idx in range(len(images)): + image = images[idx] + image_base64 = base64.b64decode(self.vis_root_dict[image]) + save_debug_image(image_base64, data_debug_path, data_debug_counter, + get_rank(), img_idx=idx, base64=True) + image = Image.open(BytesIO(image_base64)).convert("RGB") + + image = self.vis_processor(image) + try: + image = image['pixel_values'][0] + except: + image = image + output_images.append(image) + + return output_images + + def process_text(self, ann, data_debug_path=None, data_debug_counter=0, first_message=False, num_images=1): + question_list = ann["instruction"] + answer_list = ann["answer"] + num_convs = len(question_list) + indexes = list(range(num_convs)) + random.shuffle(indexes) + conv_list = [] + for conv_id in range(num_convs): + question = question_list[indexes[conv_id]] + # remove '' tag and '\n' + question = question.replace("", "").replace("\n", "") + answer = answer_list[indexes[conv_id]] + instruction = self.prompter(question, with_image=(conv_id == 0 and first_message), + first_message=(conv_id == 0 and first_message), + num_images=num_images) + if conv_id == 0 and first_message: + instruction = self.post_process_text_image_count(instruction, num_images) + single_conv = dict(instruction=instruction, answer=answer) + conv_list.append(single_conv) + save_debug_text(conv_list, data_debug_path, data_debug_counter, get_rank()) + return conv_list + + def __getitem__(self, index): + ann = self.annotation[index][0] # self.annotation[index] is a list because of "self.annotation = DST.random_grouping(self.annotation, self.per_sample_image)" in VQADataset init + images_list = self.process_image(ann, + data_debug_path=self.data_debug_path, + data_debug_counter=self.data_debug_counter) + text_list = self.process_text(ann, + data_debug_path=self.data_debug_path, + 
data_debug_counter=self.data_debug_counter, + first_message=True, + num_images=len(images_list)) + + self.data_debug_counter += 1 + res_list = [] + for text in text_list: + single_res = self.tokenize(text) + res_list.append(single_res) + + input_ids = [] + attention_mask = [] + labels = [] + for res in res_list: + input_ids.extend(res["input_ids"]) + attention_mask.extend(res["attention_mask"]) + labels.extend(res["labels"]) + + res = dict( + input_ids=input_ids, attention_mask=attention_mask, labels=labels + ) + res.update(image=images_list) + res.update(image_num=len(images_list)) + + return res diff --git a/applications/DeepSpeed-VisualChat/utils/data/otter_mimicit_sn_dataset.py b/applications/DeepSpeed-VisualChat/utils/data/otter_mimicit_sn_dataset.py new file mode 100644 index 000000000..be447119c --- /dev/null +++ b/applications/DeepSpeed-VisualChat/utils/data/otter_mimicit_sn_dataset.py @@ -0,0 +1,138 @@ +# This dataset is from https://huggingface.co/datasets/pufanyi/MIMICIT +import os +import torch +import json +import base64 +import random +from tqdm import tqdm +from PIL import Image +from io import BytesIO +from .vqa_dataset import VQADataset +from utils.utils import print_rank_0, is_rank_0, get_rank +from .utils import save_debug_image, save_debug_text + + +class OtterMimicitSnDataset(VQADataset): + def __init__(self, data_path, data_debug_path, per_sample_image, max_num_image_per_sample, tokenizer, vis_processor, **kwargs): + vis_root = f"{data_path}/MIMIC-IT/SN.json" + assert os.path.isfile(vis_root), f"OtterMimicitSnDataset image data {vis_root} not found, you need to download it from https://huggingface.co/datasets/pufanyi/MIMICIT" + self.vis_root_dict = json.load(open(vis_root, "r")) + self.max_num_image_per_sample = max_num_image_per_sample + + ann_paths_raw = ["MIMIC-IT/SN_instructions.json"] + ann_paths = [f"MIMIC-IT/SN_instructions_merged_filtered{max_num_image_per_sample}.json"] + for idx in range(len(ann_paths)): + ann_paths_raw[idx] = f"{data_path}/{ann_paths_raw[idx]}" + ann_paths[idx] = f"{data_path}/{ann_paths[idx]}" + assert os.path.isfile(ann_paths_raw[idx]), f"OtterMimicitSnDataset raw annotation file {ann_paths_raw[idx]} not found, you need to download it from https://huggingface.co/datasets/pufanyi/MIMICIT" + if not os.path.isfile(ann_paths[idx]): + print_rank_0(f"OtterMimicitSnDataset annotation file {ann_paths[idx]} not found, starting an one-time preprocessing:") + if is_rank_0(): + raw_annotation = json.load(open(ann_paths_raw[idx], "r"))["data"] + raw_annotation_keys = list(raw_annotation.keys()) + random.shuffle(raw_annotation_keys) + annotations = [] + for k in tqdm(raw_annotation_keys): + if k in raw_annotation: + ann = {} + ann["image_ids"] = [] + for image in raw_annotation[k]["image_ids"]: + if image in self.vis_root_dict: + ann["image_ids"].append(image) + if len(ann["image_ids"]) > 0 and len(ann["image_ids"]) <= max_num_image_per_sample: + ann["instruction"] = [raw_annotation[k]["instruction"]] + ann["answer"] = [raw_annotation[k]["answer"]] + rel_ins_ids = raw_annotation[k]["rel_ins_ids"] + for k_rel in rel_ins_ids: + if k_rel in raw_annotation: + ann["instruction"].append(raw_annotation[k_rel]["instruction"]) + ann["answer"].append(raw_annotation[k_rel]["answer"]) + del raw_annotation[k_rel] + annotations.append(ann) + del raw_annotation[k] + with open(ann_paths[idx], 'w') as f: + json.dump(annotations, f) + torch.distributed.barrier() + super().__init__(data_path, data_debug_path, per_sample_image, tokenizer, vis_processor, + vis_root, 
ann_paths, **kwargs) + + def _add_instance_ids(self, key="id"): + for idx, ann in enumerate(self.annotation): + ann[key] = str(idx) + + def process_image(self, ann, data_debug_path=None, data_debug_counter=0): + images = ann["image_ids"] + chosen = list(range(len(images))) + if len(images) > self.max_num_image_per_sample: + chosen = list(sorted(random.sample(chosen, self.max_num_image_per_sample))) + output_images = [] + for idx in chosen: + image = images[idx] + image_base64 = base64.b64decode(self.vis_root_dict[image]) + save_debug_image(image_base64, data_debug_path, data_debug_counter, + get_rank(), img_idx=idx, base64=True) + image = Image.open(BytesIO(image_base64)).convert("RGB") + + image = self.vis_processor(image) + try: + image = image['pixel_values'][0] + except: + image = image + output_images.append(image) + + return output_images + + def process_text(self, ann, data_debug_path=None, data_debug_counter=0, first_message=False, num_images=1): + question_list = ann["instruction"] + answer_list = ann["answer"] + num_convs = len(question_list) + indexes = list(range(num_convs)) + random.shuffle(indexes) + conv_list = [] + for conv_id in range(num_convs): + question = question_list[indexes[conv_id]] + # remove '' tag and '\n' + question = question.replace("", "").replace("\n", "") + answer = answer_list[indexes[conv_id]] + instruction = self.prompter(question, with_image=(conv_id == 0 and first_message), + first_message=(conv_id == 0 and first_message), + num_images=num_images) + if conv_id == 0 and first_message: + instruction = self.post_process_text_image_count(instruction, num_images) + single_conv = dict(instruction=instruction, answer=answer) + conv_list.append(single_conv) + save_debug_text(conv_list, data_debug_path, data_debug_counter, get_rank()) + return conv_list + + def __getitem__(self, index): + ann = self.annotation[index][0] # self.annotation[index] is a list because of "self.annotation = DST.random_grouping(self.annotation, self.per_sample_image)" in VQADataset init + images_list = self.process_image(ann, + data_debug_path=self.data_debug_path, + data_debug_counter=self.data_debug_counter) + text_list = self.process_text(ann, + data_debug_path=self.data_debug_path, + data_debug_counter=self.data_debug_counter, + first_message=True, + num_images=len(images_list)) + + self.data_debug_counter += 1 + res_list = [] + for text in text_list: + single_res = self.tokenize(text) + res_list.append(single_res) + + input_ids = [] + attention_mask = [] + labels = [] + for res in res_list: + input_ids.extend(res["input_ids"]) + attention_mask.extend(res["attention_mask"]) + labels.extend(res["labels"]) + + res = dict( + input_ids=input_ids, attention_mask=attention_mask, labels=labels + ) + res.update(image=images_list) + res.update(image_num=len(images_list)) + + return res diff --git a/applications/DeepSpeed-VisualChat/utils/data/otter_mimicit_tvc_dataset.py b/applications/DeepSpeed-VisualChat/utils/data/otter_mimicit_tvc_dataset.py new file mode 100644 index 000000000..09d1c5b88 --- /dev/null +++ b/applications/DeepSpeed-VisualChat/utils/data/otter_mimicit_tvc_dataset.py @@ -0,0 +1,138 @@ +# This dataset is from https://huggingface.co/datasets/pufanyi/MIMICIT +import os +import torch +import json +import base64 +import random +from tqdm import tqdm +from PIL import Image +from io import BytesIO +from .vqa_dataset import VQADataset +from utils.utils import print_rank_0, is_rank_0, get_rank +from .utils import save_debug_image, save_debug_text + + +class 
OtterMimicitTvcDataset(VQADataset): + def __init__(self, data_path, data_debug_path, per_sample_image, max_num_image_per_sample, tokenizer, vis_processor, **kwargs): + vis_root = f"{data_path}/MIMIC-IT/TVC.json" + assert os.path.isfile(vis_root), f"OtterMimicitTvcDataset image data {vis_root} not found, you need to download it from https://huggingface.co/datasets/pufanyi/MIMICIT" + self.vis_root_dict = json.load(open(vis_root, "r")) + self.max_num_image_per_sample = max_num_image_per_sample + + ann_paths_raw = ["MIMIC-IT/TVC_instructions.json"] + ann_paths = [f"MIMIC-IT/TVC_instructions_merged_filtered{max_num_image_per_sample}.json"] + for idx in range(len(ann_paths)): + ann_paths_raw[idx] = f"{data_path}/{ann_paths_raw[idx]}" + ann_paths[idx] = f"{data_path}/{ann_paths[idx]}" + assert os.path.isfile(ann_paths_raw[idx]), f"OtterMimicitTvcDataset raw annotation file {ann_paths_raw[idx]} not found, you need to download it from https://huggingface.co/datasets/pufanyi/MIMICIT" + if not os.path.isfile(ann_paths[idx]): + print_rank_0(f"OtterMimicitTvcDataset annotation file {ann_paths[idx]} not found, starting an one-time preprocessing:") + if is_rank_0(): + raw_annotation = json.load(open(ann_paths_raw[idx], "r"))["data"] + raw_annotation_keys = list(raw_annotation.keys()) + random.shuffle(raw_annotation_keys) + annotations = [] + for k in tqdm(raw_annotation_keys): + if k in raw_annotation: + ann = {} + ann["image_ids"] = [] + for image in raw_annotation[k]["image_ids"]: + if image in self.vis_root_dict: + ann["image_ids"].append(image) + if len(ann["image_ids"]) > 0 and len(ann["image_ids"]) <= max_num_image_per_sample: + ann["instruction"] = [raw_annotation[k]["instruction"]] + ann["answer"] = [raw_annotation[k]["answer"]] + rel_ins_ids = raw_annotation[k]["rel_ins_ids"] + for k_rel in rel_ins_ids: + if k_rel in raw_annotation: + ann["instruction"].append(raw_annotation[k_rel]["instruction"]) + ann["answer"].append(raw_annotation[k_rel]["answer"]) + del raw_annotation[k_rel] + annotations.append(ann) + del raw_annotation[k] + with open(ann_paths[idx], 'w') as f: + json.dump(annotations, f) + torch.distributed.barrier() + super().__init__(data_path, data_debug_path, per_sample_image, tokenizer, vis_processor, + vis_root, ann_paths, **kwargs) + + def _add_instance_ids(self, key="id"): + for idx, ann in enumerate(self.annotation): + ann[key] = str(idx) + + def process_image(self, ann, data_debug_path=None, data_debug_counter=0): + images = ann["image_ids"] + chosen = list(range(len(images))) + if len(images) > self.max_num_image_per_sample: + chosen = list(sorted(random.sample(chosen, self.max_num_image_per_sample))) + output_images = [] + for idx in chosen: + image = images[idx] + image_base64 = base64.b64decode(self.vis_root_dict[image]) + save_debug_image(image_base64, data_debug_path, data_debug_counter, + get_rank(), img_idx=idx, base64=True) + image = Image.open(BytesIO(image_base64)).convert("RGB") + + image = self.vis_processor(image) + try: + image = image['pixel_values'][0] + except: + image = image + output_images.append(image) + + return output_images + + def process_text(self, ann, data_debug_path=None, data_debug_counter=0, first_message=False, num_images=1): + question_list = ann["instruction"] + answer_list = ann["answer"] + num_convs = len(question_list) + indexes = list(range(num_convs)) + random.shuffle(indexes) + conv_list = [] + for conv_id in range(num_convs): + question = question_list[indexes[conv_id]] + # remove '' tag and '\n' + question = question.replace("", 
"").replace("\n", "") + answer = answer_list[indexes[conv_id]] + instruction = self.prompter(question, with_image=(conv_id == 0 and first_message), + first_message=(conv_id == 0 and first_message), + num_images=num_images) + if conv_id == 0 and first_message: + instruction = self.post_process_text_image_count(instruction, num_images) + single_conv = dict(instruction=instruction, answer=answer) + conv_list.append(single_conv) + save_debug_text(conv_list, data_debug_path, data_debug_counter, get_rank()) + return conv_list + + def __getitem__(self, index): + ann = self.annotation[index][0] # self.annotation[index] is a list because of "self.annotation = DST.random_grouping(self.annotation, self.per_sample_image)" in VQADataset init + images_list = self.process_image(ann, + data_debug_path=self.data_debug_path, + data_debug_counter=self.data_debug_counter) + text_list = self.process_text(ann, + data_debug_path=self.data_debug_path, + data_debug_counter=self.data_debug_counter, + first_message=True, + num_images=len(images_list)) + + self.data_debug_counter += 1 + res_list = [] + for text in text_list: + single_res = self.tokenize(text) + res_list.append(single_res) + + input_ids = [] + attention_mask = [] + labels = [] + for res in res_list: + input_ids.extend(res["input_ids"]) + attention_mask.extend(res["attention_mask"]) + labels.extend(res["labels"]) + + res = dict( + input_ids=input_ids, attention_mask=attention_mask, labels=labels + ) + res.update(image=images_list) + res.update(image_num=len(images_list)) + + return res diff --git a/applications/DeepSpeed-VisualChat/utils/data/otter_mimicit_vst_dataset.py b/applications/DeepSpeed-VisualChat/utils/data/otter_mimicit_vst_dataset.py new file mode 100644 index 000000000..435c15141 --- /dev/null +++ b/applications/DeepSpeed-VisualChat/utils/data/otter_mimicit_vst_dataset.py @@ -0,0 +1,138 @@ +# This dataset is from https://huggingface.co/datasets/pufanyi/MIMICIT +import os +import torch +import json +import base64 +import random +from tqdm import tqdm +from PIL import Image +from io import BytesIO +from .vqa_dataset import VQADataset +from utils.utils import print_rank_0, is_rank_0, get_rank +from .utils import save_debug_image, save_debug_text + + +class OtterMimicitVstDataset(VQADataset): + def __init__(self, data_path, data_debug_path, per_sample_image, max_num_image_per_sample, tokenizer, vis_processor, **kwargs): + vis_root = f"{data_path}/MIMIC-IT/VST.json" + assert os.path.isfile(vis_root), f"OtterMimicitVstDataset image data {vis_root} not found, you need to download it from https://huggingface.co/datasets/pufanyi/MIMICIT" + self.vis_root_dict = json.load(open(vis_root, "r")) + self.max_num_image_per_sample = max_num_image_per_sample + + ann_paths_raw = ["MIMIC-IT/VST_instructions.json"] + ann_paths = [f"MIMIC-IT/VST_instructions_merged_filtered{max_num_image_per_sample}.json"] + for idx in range(len(ann_paths)): + ann_paths_raw[idx] = f"{data_path}/{ann_paths_raw[idx]}" + ann_paths[idx] = f"{data_path}/{ann_paths[idx]}" + assert os.path.isfile(ann_paths_raw[idx]), f"OtterMimicitVstDataset raw annotation file {ann_paths_raw[idx]} not found, you need to download it from https://huggingface.co/datasets/pufanyi/MIMICIT" + if not os.path.isfile(ann_paths[idx]): + print_rank_0(f"OtterMimicitVstDataset annotation file {ann_paths[idx]} not found, starting an one-time preprocessing:") + if is_rank_0(): + raw_annotation = json.load(open(ann_paths_raw[idx], "r"))["data"] + raw_annotation_keys = list(raw_annotation.keys()) + 
random.shuffle(raw_annotation_keys) + annotations = [] + for k in tqdm(raw_annotation_keys): + if k in raw_annotation: + ann = {} + ann["image_ids"] = [] + for image in raw_annotation[k]["image_ids"]: + if image in self.vis_root_dict: + ann["image_ids"].append(image) + if len(ann["image_ids"]) > 0 and len(ann["image_ids"]) <= max_num_image_per_sample: + ann["instruction"] = [raw_annotation[k]["instruction"]] + ann["answer"] = [raw_annotation[k]["answer"]] + rel_ins_ids = raw_annotation[k]["rel_ins_ids"] + for k_rel in rel_ins_ids: + if k_rel in raw_annotation: + ann["instruction"].append(raw_annotation[k_rel]["instruction"]) + ann["answer"].append(raw_annotation[k_rel]["answer"]) + del raw_annotation[k_rel] + annotations.append(ann) + del raw_annotation[k] + with open(ann_paths[idx], 'w') as f: + json.dump(annotations, f) + torch.distributed.barrier() + super().__init__(data_path, data_debug_path, per_sample_image, tokenizer, vis_processor, + vis_root, ann_paths, **kwargs) + + def _add_instance_ids(self, key="id"): + for idx, ann in enumerate(self.annotation): + ann[key] = str(idx) + + def process_image(self, ann, data_debug_path=None, data_debug_counter=0): + images = ann["image_ids"] + chosen = list(range(len(images))) + if len(images) > self.max_num_image_per_sample: + chosen = list(sorted(random.sample(chosen, self.max_num_image_per_sample))) + output_images = [] + for idx in chosen: + image = images[idx] + image_base64 = base64.b64decode(self.vis_root_dict[image]) + save_debug_image(image_base64, data_debug_path, data_debug_counter, + get_rank(), img_idx=idx, base64=True) + image = Image.open(BytesIO(image_base64)).convert("RGB") + + image = self.vis_processor(image) + try: + image = image['pixel_values'][0] + except: + image = image + output_images.append(image) + + return output_images + + def process_text(self, ann, data_debug_path=None, data_debug_counter=0, first_message=False, num_images=1): + question_list = ann["instruction"] + answer_list = ann["answer"] + num_convs = len(question_list) + indexes = list(range(num_convs)) + random.shuffle(indexes) + conv_list = [] + for conv_id in range(num_convs): + question = question_list[indexes[conv_id]] + # remove '' tag and '\n' + question = question.replace("", "").replace("\n", "") + answer = answer_list[indexes[conv_id]] + instruction = self.prompter(question, with_image=(conv_id == 0 and first_message), + first_message=(conv_id == 0 and first_message), + num_images=num_images) + if conv_id == 0 and first_message: + instruction = self.post_process_text_image_count(instruction, num_images) + single_conv = dict(instruction=instruction, answer=answer) + conv_list.append(single_conv) + save_debug_text(conv_list, data_debug_path, data_debug_counter, get_rank()) + return conv_list + + def __getitem__(self, index): + ann = self.annotation[index][0] # self.annotation[index] is a list because of "self.annotation = DST.random_grouping(self.annotation, self.per_sample_image)" in VQADataset init + images_list = self.process_image(ann, + data_debug_path=self.data_debug_path, + data_debug_counter=self.data_debug_counter) + text_list = self.process_text(ann, + data_debug_path=self.data_debug_path, + data_debug_counter=self.data_debug_counter, + first_message=True, + num_images=len(images_list)) + + self.data_debug_counter += 1 + res_list = [] + for text in text_list: + single_res = self.tokenize(text) + res_list.append(single_res) + + input_ids = [] + attention_mask = [] + labels = [] + for res in res_list: + input_ids.extend(res["input_ids"]) + 
attention_mask.extend(res["attention_mask"]) + labels.extend(res["labels"]) + + res = dict( + input_ids=input_ids, attention_mask=attention_mask, labels=labels + ) + res.update(image=images_list) + res.update(image_num=len(images_list)) + + return res diff --git a/applications/DeepSpeed-VisualChat/utils/data/sparkles_dialogue_dataset.py b/applications/DeepSpeed-VisualChat/utils/data/sparkles_dialogue_dataset.py new file mode 100644 index 000000000..d11fcfa97 --- /dev/null +++ b/applications/DeepSpeed-VisualChat/utils/data/sparkles_dialogue_dataset.py @@ -0,0 +1,161 @@ +# This dataset is from https://github.com/HYPJUDY/Sparkles +import os +import torch +import json +import random +import re +from PIL import Image +from .vqa_dataset import VQADataset +from utils.utils import print_rank_0, is_rank_0, get_rank +from .utils import save_debug_image, save_debug_text + + +class SparklesDialogueDataset(VQADataset): + def __init__(self, data_path, data_debug_path, per_sample_image, tokenizer, vis_processor, **kwargs): + vis_root = ["SparklesDialogueCC/images", "SparklesDialogueVG/images"] + for idx in range(len(vis_root)): + vis_root[idx] = f"{data_path}/{vis_root[idx]}" + assert os.path.isdir(vis_root[idx]), f"SparklesDialogueDataset image directory {vis_root[idx]} not found, you need to download it from https://github.com/HYPJUDY/Sparkles" + + ann_path_raw = ["SparklesDialogueCC/annotations/SparklesDialogueCC.json", + "SparklesDialogueVG/annotations/SparklesDialogueVG.json"] + for idx in range(len(ann_path_raw)): + ann_path_raw[idx] = f"{data_path}/{ann_path_raw[idx]}" + assert os.path.isfile(ann_path_raw[idx]), f"SparklesDialogueDataset annotation file {ann_path_raw[idx]} not found, you need to download it from https://github.com/HYPJUDY/Sparkles" + ann_path = f"{data_path}/SparklesDialogue.json" + + if not os.path.isfile(ann_path): + print_rank_0(f"SparklesDialogueDataset: starting an one-time preprocessing:") + if is_rank_0(): + annotations = [] + for a_idx in range(len(ann_path_raw)): + raw_annotation = json.load(open(ann_path_raw[a_idx], "r")) + for raw_ann in raw_annotation: + meet_criteria = True + if len(raw_ann["dialogue"]) % 2 != 0: + meet_criteria = False + raw_ann["image_path"] = vis_root[a_idx] + num_img = 0 + for d_idx in range(len(raw_ann["dialogue"])): + if d_idx % 2 == 0 and raw_ann["dialogue"][d_idx]["role"] != "user": + meet_criteria = False + if d_idx % 2 == 1 and raw_ann["dialogue"][d_idx]["role"] != "assistant": + meet_criteria = False + if "images" in raw_ann["dialogue"][d_idx]: + for img in raw_ann["dialogue"][d_idx]["images"]: + img_id = img["image_id"] + num_img += 1 + if not os.path.isfile(f"{vis_root[a_idx]}/{img_id}.jpg"): + meet_criteria = False + if num_img > 8: # Currently only use conversations with <= 8 images + meet_criteria = False + if meet_criteria: + annotations.append(raw_ann) + with open(ann_path, 'w') as f: + json.dump(annotations, f) + torch.distributed.barrier() + super().__init__(data_path, data_debug_path, per_sample_image, tokenizer, vis_processor, + vis_root, [ann_path], **kwargs) + self.image_tag_dict = [{0: "image a", 1: "image b", 2: "image c", 3: "image d", 4: "image e", 5: "image f", 6: "image g", 7: "image h"}, + {0: "image A", 1: "image B", 2: "image C", 3: "image D", 4: "image E", 5: "image F", 6: "image G", 7: "image H"}, + {0: "the first image", 1: "the second image", 2: "the third image", 3: "the fourth image", + 4: "the fifth image", 5: "the sixth image", 6: "the seventh image", 7: "the eighth image"}] + + def _add_instance_ids(self, 
key="id"): + for idx, ann in enumerate(self.annotation): + ann[key] = str(idx) + + def process_image(self, ann, data_debug_path=None, data_debug_counter=0): + output_images = [] + img_counter = 0 + for dialogue in ann["dialogue"]: + if "images" in dialogue: + for img in dialogue["images"]: + image_path = os.path.join(ann["image_path"], str(img["image_id"]) + ".jpg") + save_debug_image(image_path, data_debug_path, data_debug_counter, + get_rank(), img_idx=img_counter) + img_counter += 1 + image = Image.open(image_path).convert("RGB") + + image = self.vis_processor(image) + try: + image = image['pixel_values'][0] + except: + image = image + output_images.append(image) + + return output_images + + def process_text(self, ann, data_debug_path=None, data_debug_counter=0, first_message=False, num_images=1): + tag_dict = random.choice(self.image_tag_dict) + regex = re.compile(r'((?<=[\.\?!]\s)(\w+)|(^\w+))') + def capitalize_sentence(match): + return(match.group().capitalize()) + to_replace = [] + conv_list = [] + num_convs = len(ann["dialogue"]) // 2 + tot_num_image = 0 + for conv_id in range(num_convs): + with_image = False + num_image = 0 + if "images" in ann["dialogue"][int(2*conv_id)]: + with_image = True + for img in ann["dialogue"][int(2*conv_id)]["images"]: + img_id = img["image_id"] + tag_replace = [f"IMAGE#{img_id}", tag_dict[len(to_replace)]] + to_replace.append(tag_replace) + num_image += 1 + question = ann["dialogue"][int(2*conv_id)]["content"] + # remove '' tag and '\n' + question = question.replace("", "").replace("\n", "") + answer = ann["dialogue"][int(2*conv_id+1)]["content"] + for idx in range(len(to_replace)): + question = question.replace(to_replace[idx][0], f"%temp{idx}%") + answer = answer.replace(to_replace[idx][0], f"%temp{idx}%") + for idx in range(len(to_replace)): + question = question.replace(f"%temp{idx}%", to_replace[idx][1]) + answer = answer.replace(f"%temp{idx}%", to_replace[idx][1]) + question = regex.sub(capitalize_sentence, question) + answer = regex.sub(capitalize_sentence, answer) + instruction = self.prompter(question, with_image=with_image, first_message=(len(conv_list) == 0 and first_message), num_images=num_image) + if with_image: + instruction = self.post_process_text_image_count(instruction, num_image, offset=tot_num_image) + single_conv = dict(instruction=instruction, answer=answer) + conv_list.append(single_conv) + tot_num_image += num_image + + save_debug_text(conv_list, data_debug_path, data_debug_counter, get_rank()) + return conv_list + + def __getitem__(self, index): + ann = self.annotation[index][0] # self.annotation[index] is a list because of "self.annotation = DST.random_grouping(self.annotation, self.per_sample_image)" in VQADataset init + images_list = self.process_image(ann, + data_debug_path=self.data_debug_path, + data_debug_counter=self.data_debug_counter) + text_list = self.process_text(ann, + data_debug_path=self.data_debug_path, + data_debug_counter=self.data_debug_counter, + first_message=True, + num_images=len(images_list)) + + self.data_debug_counter += 1 + res_list = [] + for text in text_list: + single_res = self.tokenize(text) + res_list.append(single_res) + + input_ids = [] + attention_mask = [] + labels = [] + for res in res_list: + input_ids.extend(res["input_ids"]) + attention_mask.extend(res["attention_mask"]) + labels.extend(res["labels"]) + + res = dict( + input_ids=input_ids, attention_mask=attention_mask, labels=labels + ) + res.update(image=images_list) + res.update(image_num=len(images_list)) + + return res diff 
--git a/applications/DeepSpeed-VisualChat/utils/data/utils.py b/applications/DeepSpeed-VisualChat/utils/data/utils.py new file mode 100644 index 000000000..e63b94173 --- /dev/null +++ b/applications/DeepSpeed-VisualChat/utils/data/utils.py @@ -0,0 +1,64 @@ +import torch +from torch.utils.data import Subset +from torch.nn.utils.rnn import pad_sequence +import numpy as np +import shutil +from torch.utils.data.dataloader import default_collate +import utils.data.DST as DST + +NUM_DEBUG_SAMPLE = 10 + +def split_dataset(dataset, split_ratio=0.8): + split = int(len(dataset) * split_ratio) + return Subset(dataset, range(split)), Subset(dataset, range(split, len(dataset))) + +def shuffle_dataset(dataset, np_rng): + size = len(dataset) + dtype_ = np.uint32 + if size >= (np.iinfo(np.uint32).max - 1): + dtype_ = np.int64 + shuffle_idx = np.arange(start=0, stop=size, step=1, dtype=dtype_) + np_rng.shuffle(shuffle_idx) + return Subset(dataset, shuffle_idx.tolist()) + +def save_debug_image(image_path, data_debug_path, data_debug_counter, rank, img_idx=0, base64=False): + if data_debug_path is not None and data_debug_counter < NUM_DEBUG_SAMPLE: + if base64: + with open(f"{data_debug_path}/gpu_rank{rank}_debug{data_debug_counter}_image{img_idx}.jpg", 'wb') as f: + f.write(image_path) + else: + shutil.copyfile( + image_path, + f"{data_debug_path}/gpu_rank{rank}_debug{data_debug_counter}_image{img_idx}.jpg") + +def save_debug_text(text_to_save, data_debug_path, data_debug_counter, rank): + if data_debug_path is not None and data_debug_counter < NUM_DEBUG_SAMPLE: + with open(f"{data_debug_path}/gpu_rank{rank}_debug{data_debug_counter}_text.txt", 'w') as f: + f.write(f"{text_to_save}") + +class DataCollatorPadToMaxLen: + + def __init__(self, max_token_len, pad_token_id): + self.max_token_len = max_token_len + self.pad_token_id = pad_token_id + + def __call__(self, data): + batch = {} + input_ids = pad_sequence([default_collate(f['input_ids']) for f in data], + padding_value=self.pad_token_id, + batch_first=True) + + labels = pad_sequence([default_collate(f['labels']) for f in data], + padding_value=DST.DEFAULT_LABEL_PADDING_NUM, + batch_first=True) + attention_mask = pad_sequence([default_collate(f['attention_mask']) for f in data], + padding_value=0, + batch_first=True) + image = torch.concat([default_collate(f['image']) for f in data], dim=0).reshape((-1,) + data[0]["image"][0].shape[-3:]) + image_num = [f['image_num'] for f in data] + batch['input_ids'] = input_ids + batch['labels'] = labels + batch['attention_mask'] = attention_mask + batch['image'] = image + batch['image_num'] = image_num + return batch \ No newline at end of file diff --git a/applications/DeepSpeed-VisualChat/utils/data/vqa_dataset.py b/applications/DeepSpeed-VisualChat/utils/data/vqa_dataset.py new file mode 100755 index 000000000..f35b01642 --- /dev/null +++ b/applications/DeepSpeed-VisualChat/utils/data/vqa_dataset.py @@ -0,0 +1,294 @@ +# This file is adapted from https://github.com/open-mmlab/Multimodal-GPT + +""" + Copyright (c) 2022, salesforce.com, inc. + All rights reserved. 
+ SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +""" + +import copy +import json +import os +import random +from collections import defaultdict +from typing import Iterable + +import numpy as np +import torch +from PIL import Image +from torch.utils.data import ConcatDataset, Dataset +from transformers import LlamaTokenizer +import utils.data.DST as DST +from utils.utils import get_rank +from .utils import save_debug_image, save_debug_text +import re + +class VQADataset(Dataset): + def __init__( + self, + data_path, + data_debug_path, + per_sample_image, + tokenizer, + vis_processor=None, + vis_root=None, + ann_paths=[], + add_eos=True, + ignore_instruction=True, + sample_image=False, + annotation_key=None + ): + """ + vis_root (string): Root directory of images (e.g. coco/images/) + ann_root (string): directory to store the annotation file + """ + if hasattr(tokenizer, "add_eos_token"): + assert tokenizer.add_eos_token is False, "tokenizer should not add eos token by default" + self.tokenizer: LlamaTokenizer = tokenizer + self.data_path = data_path + self.data_debug_path = data_debug_path + self.data_debug_counter = 0 + self.vis_root = vis_root + self.per_sample_image = per_sample_image + print('check tokenizer', self.tokenizer) + self.annotation = [] + for ann_path in ann_paths: + if annotation_key is None: + self.annotation.extend(json.load(open(ann_path, "r"))) + else: + self.annotation.extend(json.load(open(ann_path, "r"))[annotation_key]) + self.sample_image = sample_image + if self.sample_image: + print("randomly sample one annotation for each image") + self.annotation = self.parse_annotation(self.annotation) + + self.annotation = DST.random_grouping(self.annotation, self.per_sample_image) + + self.vis_processor = vis_processor + + self.option_prob = 0.5 + self.prompter = DST.Prompter() + self.add_eos = add_eos + self.ignore_instruction = ignore_instruction + self.system_instruct = None + self.image_token_dict = DST.get_image_num_map(self.tokenizer) + self.cat_number() + + def parse_annotation(self, annotation): + image_list = defaultdict(list) + for ann in annotation: + image_list[ann["image"]].append(ann) + + annotation = [] + for ann_list in image_list.values(): + annotation.append(random.choice(ann_list)) + + return annotation + + def __len__(self): + return len(self.annotation) + + def cat_number(self): + tmp = len(self.annotation) // self.per_sample_image + self.arithmetic_progression_multi_image = [tmp * i for i in range(self.per_sample_image)] + + def _add_instance_ids(self, key="instance_id"): + for idx, ann in enumerate(self.annotation): + ann[key] = str(idx) + + def process_image(self, ann, data_debug_path=None, data_debug_counter=0): + image_path = os.path.join(self.vis_root, ann["image"]) + save_debug_image(image_path, data_debug_path, data_debug_counter, get_rank(), img_idx=0) + image = Image.open(image_path).convert("RGB") + + image = self.vis_processor(image) + try: + image = image['pixel_values'][0] + return image + except: + return image + + def post_process_text_image_count(self, text, image_num, offset=0): + for i in range(1+offset, image_num+1+offset): + text = re.sub(DST.DEFAULT_HUMAN_IMAGE_PRETOKEN, DST.image_mapping_dict[f"{i}"], text, count=1) + return text + + def process_text(self, ann, data_debug_path=None, data_debug_counter=0, first_message=False): + question = ann["question"] + + answer_weight = {} + for answer in ann["answer"]: + if answer in 
answer_weight.keys(): + answer_weight[answer] += 1 / len(ann["answer"]) + else: + answer_weight[answer] = 1 / len(ann["answer"]) + + answers = list(answer_weight.keys()) + weights = list(answer_weight.values()) + + # create instruction + true_answer = answers[np.argmax(weights)] + is_option = random.random() < self.option_prob and len(answers) > 1 + if is_option: + instruction = self.prompter(question, answers) + else: + instruction = self.prompter(question, with_image=True, first_message=first_message) + save_debug_text([instruction, true_answer], data_debug_path, data_debug_counter, get_rank()) + return dict(instruction=instruction, answer=true_answer) + + def tokenize(self, text): + res = self.tokenizer( + text["instruction"] + text["answer"], + return_tensors=None, + padding="do_not_pad", + truncation=True, + max_length=512, + ) + if res["input_ids"][-1] != self.tokenizer.eos_token_id and self.add_eos: + res["input_ids"].append(self.tokenizer.eos_token_id) + res["attention_mask"].append(1) + + labels = copy.deepcopy(res["input_ids"]) + # ignore instruction_token + if self.ignore_instruction: + instruction_token = self.tokenizer( + text["instruction"], return_tensors=None, padding="do_not_pad", truncation=True, max_length=512 + ) + labels = [DST.DEFAULT_LABEL_PADDING_NUM] * len(instruction_token["input_ids"]) + labels[len(instruction_token["input_ids"]) :] + + res.update(labels=labels) + return res + + + def create_system_instruct(self): + system_instruct = self.tokenizer( + DST.DEFAULT_PROMPT, + return_tensors=None, + padding="do_not_pad", + truncation=False, + ) + # create the system instruction + self.system_instruct = { + "input_ids": system_instruct["input_ids"] + [self.tokenizer.eos_token_id], + "attention_mask": system_instruct["attention_mask"] + [1], + "labels": (len(system_instruct["input_ids"]) + 1) * [DST.DEFAULT_LABEL_PADDING_NUM], + } + + def merge_all_images(self, res_list): + def find_index_and_replace(input_list, attention_mask_list, labels_list, image_number): + # replace a single number with a list of numbers + index = input_list.index(self.image_token_dict[DST.DEFAULT_HUMAN_IMAGE_PRETOKEN]) + input_list[index] = self.image_token_dict[DST.image_mapping_dict[str(image_number)]] + attention_mask_list[index] = [1] * len(self.image_token_dict[DST.image_mapping_dict[str(image_number)]]) + labels_list[index] = [DST.DEFAULT_LABEL_PADDING_NUM] * len(self.image_token_dict[DST.image_mapping_dict[str(image_number)]]) + # flatten nested list + input_list = DST.flatten(input_list) + attention_mask_list = DST.flatten(attention_mask_list) + labels_list = DST.flatten(labels_list) + return input_list, attention_mask_list, labels_list + image_number = 0 + original_output = {"input_ids": [], "attention_mask": [], "labels": [], "image": []} #copy.deepcopy(self.system_instruct) + # original_output["image"] = [] + for res in res_list: + # need to check if it has image or not + if self.image_token_dict[DST.DEFAULT_HUMAN_IMAGE_PRETOKEN] in res["input_ids"]: + image_number += 1 + res["input_ids"], res["attention_mask"], res["labels"] = find_index_and_replace(res["input_ids"], res["attention_mask"], res["labels"], image_number) + original_output["image"] = original_output["image"] + [res["image"]] + # cat res to original_output + original_output["input_ids"] = original_output["input_ids"] + res["input_ids"] + original_output["attention_mask"] = original_output["attention_mask"] + res["attention_mask"] + original_output["labels"] = original_output["labels"] + res["labels"] + if image_number == 
0: + raise ValueError("image number should not be zero, we now did not support no-image case.") + original_output["image_num"] = image_number + return original_output + + def __getitem__(self, index): + res_list = [] + for ann in self.annotation[index]: + image = self.process_image(ann, + data_debug_path=self.data_debug_path, + data_debug_counter=self.data_debug_counter) + text = self.process_text(ann, + data_debug_path=self.data_debug_path, + data_debug_counter=self.data_debug_counter, + first_message=(not res_list)) + self.data_debug_counter += 1 + res = self.tokenize(text) + res.update(image=image) + res.update(text) + res_list.append(res) + + output = self.merge_all_images(res_list) + return output + + def collater(self, samples): + image_list, question_list, answer_list, input_id_list, attention_mask_list, labels_list = [], [], [], [], [], [] + + for sample in samples: + image_list.append(sample["image"]) + question_list.append(sample["instruction"]) + answer_list.append(sample["answer"]) + input_id_list.append(sample["input_ids"]) + attention_mask_list.append(sample["attention_mask"]) + labels_list.append(sample["labels"]) + + # We have to pad the labels before calling `tokenizer.pad` as this method won't pad them and needs them of the + # same length to return tensors. + max_label_length = max(len(l) for l in labels_list) + padding_side = self.tokenizer.padding_side + padded_labels = [] + for l in labels_list: + remainder = [DST.DEFAULT_LABEL_PADDING_NUM] * (max_label_length - len(l)) + if isinstance(l, list): + l = l + remainder if padding_side == "right" else remainder + l + elif padding_side == "right": + l = np.concatenate([l, remainder]).astype(np.int64) + else: + l = np.concatenate([remainder, l]).astype(np.int64) + padded_labels.append(l) + + padded_samples = self.tokenizer.pad( + {"input_ids": input_id_list, "attention_mask": attention_mask_list, "labels": padded_labels}, + return_tensors="pt", + padding="longest", + ) + + # remove all image related tokens + labels = padded_samples["labels"] + labels[labels == self.tokenizer.pad_token_id] = DST.DEFAULT_LABEL_PADDING_NUM + labels[:, 0] = DST.DEFAULT_LABEL_PADDING_NUM + for k, v in self.image_token_dict.items(): + labels[labels == v] = DST.DEFAULT_LABEL_PADDING_NUM + return { + "image": torch.stack(image_list, dim=0), + "input_ids": padded_samples["input_ids"], + "attention_mask": padded_samples["attention_mask"], + "labels": labels, + "instruction": question_list, + "answer": answer_list, + } + + +class ConcatDataset(ConcatDataset): + def __init__(self, datasets: Iterable[Dataset]) -> None: + super().__init__(datasets) + + def collater(self, samples): + # TODO For now only supports datasets with same underlying collater implementations + + all_keys = set() + for s in samples: + all_keys.update(s) + + shared_keys = all_keys + for s in samples: + shared_keys = shared_keys & set(s.keys()) + + samples_shared_keys = [] + for s in samples: + samples_shared_keys.append({k: s[k] for k in s.keys() if k in shared_keys}) + + return self.datasets[0].collater(samples_shared_keys) diff --git a/applications/DeepSpeed-VisualChat/utils/ds_utils.py b/applications/DeepSpeed-VisualChat/utils/ds_utils.py new file mode 100755 index 000000000..7e04d9bb3 --- /dev/null +++ b/applications/DeepSpeed-VisualChat/utils/ds_utils.py @@ -0,0 +1,97 @@ +# Copyright (c) Microsoft Corporation. 
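+# Helpers that build DeepSpeed config dictionaries for this app: get_train_ds_config() picks
+# fp16 or bf16 from args.precision, configures the requested ZeRO stage (optionally offloading
+# parameters and optimizer state to CPU), and can enable the hybrid engine and TensorBoard
+# logging; get_eval_ds_config() returns a lighter config for evaluation.
+# Minimal usage sketch (assumes an `args` object with .precision, .enable_tensorboard,
+# .output_dir, and an already-constructed `model`):
+#   ds_config = get_train_ds_config(args, offload=False, stage=2)
+#   engine, optimizer, _, lr_scheduler = deepspeed.initialize(
+#       model=model, model_parameters=model.parameters(), config=ds_config)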
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +GLOBAL_BATCH_SIZE = 32 +MICRO_BATCH_SIZE = 4 + + +def get_train_ds_config(args, + offload, + stage=2, + enable_hybrid_engine=False, + inference_tp_size=1, + release_inference_cache=False, + pin_parameters=True, + tp_gather_partition_size=8, + max_out_tokens=512): + if args.precision == 'fp16': + enable_fp16 = True + enable_bf16 = False + elif args.precision == 'bf16': + enable_fp16 = False + enable_bf16 = True + else: + raise ValueError(f"Invalid precision {args.precision}") + device = "cpu" if offload else "none" + zero_opt_dict = { + "stage": stage, + "offload_param": { + "device": device + }, + "offload_optimizer": { + "device": device + }, + "stage3_param_persistence_threshold": 1e4, + "stage3_max_live_parameters": 3e7, + "stage3_prefetch_bucket_size": 0, + "memory_efficient_linear": False, + } + output = { + "train_batch_size": GLOBAL_BATCH_SIZE, + "train_micro_batch_size_per_gpu": MICRO_BATCH_SIZE, + "steps_per_print": 10, + "zero_optimization": zero_opt_dict, + "zero_allow_untested_optimizer": True, + "zero_force_ds_cpu_optimizer": False, + "fp16": { + "enabled": enable_fp16, + "loss_scale_window": 100 + }, + "bf16": { + "enabled": enable_bf16, + }, + "gradient_clipping": 1.0, + "prescale_gradients": False, + "wall_clock_breakdown": False, + "hybrid_engine": { + "enabled": enable_hybrid_engine, + "max_out_tokens": max_out_tokens, + "inference_tp_size": inference_tp_size, + "release_inference_cache": release_inference_cache, + "pin_parameters": pin_parameters, + "tp_gather_partition_size": tp_gather_partition_size, + }, + + } + if args.enable_tensorboard: + output.update({"tensorboard": { + "enabled": True, + "output_path": args.output_dir, + "job_name": 'tb_logging' + }} + ) + return output + +def get_eval_ds_config(offload, stage=0): + device = "cpu" if offload else "none" + zero_opt_dict = { + "stage": stage, + "stage3_param_persistence_threshold": 1e4, + "offload_param": { + "device": device + }, + "memory_efficient_linear": False + } + return { + "train_batch_size": GLOBAL_BATCH_SIZE, + "train_micro_batch_size_per_gpu": MICRO_BATCH_SIZE, + "steps_per_print": 10, + "zero_optimization": zero_opt_dict, + "fp16": { + "enabled": True + }, + "gradient_clipping": 1.0, + "prescale_gradients": False, + "wall_clock_breakdown": False + } diff --git a/applications/DeepSpeed-VisualChat/utils/model/__init__.py b/applications/DeepSpeed-VisualChat/utils/model/__init__.py new file mode 100644 index 000000000..62685308b --- /dev/null +++ b/applications/DeepSpeed-VisualChat/utils/model/__init__.py @@ -0,0 +1 @@ +from .modeling_dsvl import create_dsvl_model_and_transforms \ No newline at end of file diff --git a/applications/DeepSpeed-VisualChat/utils/model/modeling_dsvl.py b/applications/DeepSpeed-VisualChat/utils/model/modeling_dsvl.py new file mode 100755 index 000000000..eb9db9428 --- /dev/null +++ b/applications/DeepSpeed-VisualChat/utils/model/modeling_dsvl.py @@ -0,0 +1,377 @@ +import torch +import torch.nn.functional as F +from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM +from transformers import CLIPVisionModel, CLIPImageProcessor +from .third_party_model.hf_model.modeling_llama import LlamaForCausalLM +from .third_party_model.hf_model.configuration_llama import LlamaConfig +from .third_party_model.qwen_clip.qwen_clip import VisionTransformer +from torch import nn +from torch.nn import CrossEntropyLoss +import copy +import os +import sys +from ..data import build_dataset, DataCollatorPadToMaxLen, 
add_special_token +sys.path.append( + os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir))) +import data.DST as DST # default special tokens +from torch.utils.data import DataLoader +from transformers.deepspeed import HfDeepSpeedConfig +import numpy as np +from .vis_proj import VisProjection_vit, VisProjection_perceiver + +def get_name(huggingface_path): + if 'opt' in huggingface_path.lower(): + return 'opt' + elif 'gpt2' in huggingface_path.lower(): + return 'gpt2' + elif 'llama-2' in huggingface_path.lower(): + return 'llama-2' + else: + raise ValueError('We currently only support llama, opt and gpt2') + +def create_dsvl_model_and_transforms( + text_tokenizer=None, + ds_config=None, + args=None): + assert args.vision_model_name_or_path is not None + assert args.lm_model_name_or_path is not None + if ds_config is not None and ds_config["zero_optimization"]["stage"] == 3: + # https://huggingface.co/docs/transformers/main_classes/deepspeed#nontrainer-deepspeed-integration + dschf = HfDeepSpeedConfig(ds_config) + lang_config = AutoConfig.from_pretrained(args.lm_model_name_or_path) + + + if 'qwen' in args.vision_model_name_or_path.lower(): + # use a fake config for consistent + vis_config = AutoConfig.from_pretrained("laion/CLIP-ViT-bigG-14-laion2B-39B-b160k") + vis_config = vis_config.vision_config + vis_encoder = VisionTransformer( + image_size=448, + patch_size=vis_config.patch_size, + width=vis_config.hidden_size, + layers=vis_config.num_hidden_layers, + heads=vis_config.num_attention_heads, + mlp_size=vis_config.intermediate_size, + output_dim=4096, + ) + vis_encoder.load_state_dict(torch.load(os.path.join(args.vision_model_name_or_path, 'pytorch_model.bin'), map_location='cpu'), strict=True) + vis_config.hidden_size = 4096 # we need to change the hidden size to 4096 + elif 'clip' in args.vision_model_name_or_path.lower(): + vis_encoder = CLIPVisionModel.from_pretrained(args.vision_model_name_or_path) + vis_config = vis_encoder.config + else: + raise ValueError("We currently only support qwen's modifed clip and other clip models") + + image_processor = CLIPImageProcessor.from_pretrained(args.vision_model_name_or_path) + + tokenizer = add_special_token(text_tokenizer) + tokenizer.pad_token = tokenizer.eos_token + if 'llama' in args.lm_model_name_or_path.lower(): + lang_config = LlamaConfig.from_pretrained(args.lm_model_name_or_path) + lang_config.enable_mmca_attention = args.enable_mmca_attention + lang_config.max_position_embeddings = args.max_seq_len + + if 'llama' in args.lm_model_name_or_path.lower(): + if ds_config is not None and ds_config["zero_optimization"]["stage"] == 3: + lang_decoder = LlamaForCausalLM.from_pretrained(args.lm_model_name_or_path, config=lang_config) + else: + try: + device = torch.device("cuda", args.local_rank) + except: + device = "auto" + lang_decoder = LlamaForCausalLM.from_pretrained(args.lm_model_name_or_path, config=lang_config, device_map=device) + decoder_name = 'llama' + else: + raise NotImplemented("We for now only support LLaMA family and do not support other models yet") + + lang_config.vocab_size = len(tokenizer) + lang_decoder.resize_token_embeddings(len(tokenizer)) + model = DeepSpeedViLModel(vis_encoder, lang_decoder, \ + tokenizer, \ + vis_config=vis_config, \ + decoder_name=decoder_name, \ + lang_config=lang_config, \ + max_seq_length=args.max_seq_len, + args=args) + + return model, image_processor, tokenizer + + +class DeepSpeedViLModel(nn.Module): + def __init__(self, vis_encoder, + lang_decoder, + tokenizer, + 
vis_config=None, + decoder_name='gpt2', + lang_config=None, + max_seq_length=512, + args=None): + super().__init__() + self.vis_encoder = vis_encoder + + self.lang_decoder = lang_decoder + self.tokenizer = tokenizer + self.args = args + self._enable_special_token() + + self.lang_config = lang_config + self._get_model_stat(decoder_name) + lang_embed, pos_embedding = self._languag_embedding() + self.pos_embedding = pos_embedding + self.max_seq_length = max_seq_length + if lang_embed is None: + print ('randomly initialized a language embedding') + self.lang_embed = nn.Embedding(self.lang_config.vocab_size,\ + self.hidden_size,\ + self.pad_token_id) # randomly initialized language embedder + else: + self.lang_embed = lang_embed + + self.pos_embedding = pos_embedding + self.projection = self.build_projection(vis_config, self.lang_config.hidden_size) + self._init_weight() + + + # get padding token embedding + self.padding_embedding = None + self.vis_encoder_update = None + + def _enable_special_token(self): + self.DEFAULT_IMAGE_TOKEN_ID = self.tokenizer.convert_tokens_to_ids(DST.DEFAULT_IMAGE_TOKEN) + self.DEFAULT_IMAGE_PATCH_TOKEN_ID = self.tokenizer.convert_tokens_to_ids(DST.DEFAULT_IMAGE_PATCH_TOKEN) + self.DEFAULT_IM_START_TOKEN_ID = self.tokenizer.convert_tokens_to_ids(DST.DEFAULT_IM_START_TOKEN) + self.DEFAULT_IM_END_TOKEN_ID = self.tokenizer.convert_tokens_to_ids(DST.DEFAULT_IM_END_TOKEN) + + + def _get_model_stat(self, model_name): + config_dic = { + 'llama-2': ['max_position_embeddings','num_hidden_layers'], + 'llama': ['max_position_embeddings','num_hidden_layers'], + 'gpt2': ['n_positions','n_layer'], + 'opt': ['max_position_embeddings','num_hidden_layers'] + } + pos_name, layer_name = config_dic[model_name][0], config_dic[model_name][1] + self.n_positions = getattr(self.lang_config, pos_name) + self.num_layer = getattr(self.lang_config, layer_name) + self.hidden_size = getattr(self.lang_config, 'hidden_size') + self.vocab_size = getattr(self.lang_config, 'vocab_size') + + def _languag_embedding(self): + pos_embedding = None + token_embedding = None + for name, module in self.lang_decoder.named_modules(): + if isinstance(module, nn.Embedding): + try: + # z3 shape + rows = module.weight.ds_shape[0] + except: + rows = module.weight.size()[0] + + if rows == self.vocab_size: + token_embedding = copy.deepcopy(module) + if rows == self.n_positions: + pos_embedding = copy.deepcopy(module) + return token_embedding, pos_embedding + + + def _init_weight(self): + self.vis_encoder.requires_grad_(False) + self.lang_decoder.requires_grad_(False) + self.lang_embed.requires_grad_(True) + self.projection.requires_grad_(True) + if self.pos_embedding is not None: + self.pos_embedding.requires_grad_(True) + + + def build_projection(self, vis_config, lang_dim): + if self.args.vis_proj == 'vit': + output = VisProjection_vit(vis_config, lang_dim=lang_dim) + return output + elif self.args.vis_proj == 'baseline': + return nn.Sequential( + nn.Linear(vis_config.hidden_size, lang_dim), # an example implementation + nn.LayerNorm(lang_dim, eps=1e-12)) + elif self.args.vis_proj == 'perceiver': + return VisProjection_perceiver(vis_config, lang_dim=lang_dim) + + def concat(self, img_proj, lang, attention_mask, input_labels, image_num, do_generation=False): + output_lang = [] + output_attention_mask = [] + output_input_labels = [] + + def split_tensor_by_a_list(tensor, split_list): + output = [] + initial_pos = 0 + accumulated_sum = [sum(split_list[:i]) for i in range(1, len(split_list)+1)] + for pos in 
accumulated_sum: + output.append(tensor[initial_pos:pos]) + initial_pos = pos + del tensor + return output + + img_proj = split_tensor_by_a_list(img_proj, image_num) + + for index in range(len(img_proj)): # each seq has multi iamges, so we need to use it as index + initial_pos = 0 + cur_img = img_proj[index] + cur_lang = lang[index] + cur_attention_mask = attention_mask[index] + cur_input_labels = input_labels[index] + img_pos_list = cur_lang.eq(self.DEFAULT_IMAGE_TOKEN_ID).nonzero(as_tuple=True)[0] + assert len(img_pos_list) == image_num[index], "the number of images in the lang and image_num does not match" + if len(img_pos_list) == 0: + continue # there is no image probably it is a pure text insturctio + + cur_lang = self.lang_embed(cur_lang) # get the real embedding + for img_i, img_pos in zip(cur_img, torch.flip(img_pos_list, dims=(0,))): # do it reversely so that we can easily insert the image + lang_pre_img_embed = cur_lang[initial_pos:img_pos] + attention_mask_pre_img = cur_attention_mask[initial_pos:img_pos] + input_labels_pre_img = cur_input_labels[initial_pos:img_pos] + + lang_post_img_embed = cur_lang[img_pos+1:] + attention_mask_post_img = cur_attention_mask[img_pos+1:] + input_labels_post_img = cur_input_labels[img_pos+1:] + # now we need to concat the image embedding + lang_full = torch.cat((lang_pre_img_embed, img_i, lang_post_img_embed), dim=0) + # label the position of all images as 2 instead of 1 + + attention_mask_full = torch.cat( (attention_mask_pre_img, 2 * torch.ones_like(img_i[:, 0]), attention_mask_post_img), dim=0) + + input_labels_full = torch.cat((input_labels_pre_img.long(), DST.DEFAULT_LABEL_PADDING_NUM * torch.ones_like(img_i[:, 0], dtype=torch.long), input_labels_post_img), dim=0) + + cur_lang = lang_full + cur_attention_mask = attention_mask_full + cur_input_labels = input_labels_full + # append to the output + output_lang.append(lang_full.unsqueeze(0)) + output_attention_mask.append(attention_mask_full.unsqueeze(0)) + output_input_labels.append(input_labels_full.unsqueeze(0)) + + if self.padding_embedding is None: + with torch.no_grad(): + self.padding_embedding = self.lang_embed(torch.tensor(self.tokenizer.pad_token_id).to(lang.device).unsqueeze(0)).unsqueeze(0).detach() + + def pad_tensor_list(tensor_list, pad_token_id, pad_vec=False): + max_len = max([tensor.size(1) for tensor in tensor_list]) + if not do_generation: + max_len = int(np.ceil(max_len / 8) * 8) # make it divisible by 8 + padded_tensor_list = [] + for tensor in tensor_list: + if max_len > tensor.size(1): + if pad_vec: # output_lang padding + # pad with self.padding_embedding + padded_tensor = torch.cat([tensor] + [self.padding_embedding] * (max_len - tensor.size(1)), dim=1) + + else: + padded_tensor = F.pad(tensor, (0, max_len - tensor.size(1)), value=pad_token_id) + else: + padded_tensor = tensor + padded_tensor_list.append(padded_tensor) + return padded_tensor_list + output_lang = pad_tensor_list(output_lang, self.tokenizer.pad_token_id, pad_vec=True) + output_attention_mask = pad_tensor_list(output_attention_mask, 0) + output_input_labels = pad_tensor_list(output_input_labels, DST.DEFAULT_LABEL_PADDING_NUM) + + return torch.cat(output_lang, dim=0), torch.cat(output_attention_mask, dim=0), torch.cat(output_input_labels, dim=0) + + def forward(self, img, lang, + attention_mask=None, + input_labels=None, + image_num=1, + past_key_values=None, + use_cache=False, + output_attentions=False, + output_hidden_states=False, + return_dict=True): + + assert attention_mask is not None, "attention 
mask is required" + assert input_labels is not None, "input labels is required" + + if self.vis_encoder_update is None: + self.vis_encoder_update = False # default is False + for p in self.vis_encoder.parameters(): + if p.requires_grad: + self.vis_encoder_update = True + # this part for now does not require gradient + if self.vis_encoder_update: + # update vis encoder + img_feature = self.vis_encoder(img) + if not isinstance(img_feature, torch.Tensor): + img_feature = img_feature.last_hidden_state + else: + # do not update vis encoder + with torch.no_grad(): + img_feature = self.vis_encoder(img) + if not isinstance(img_feature, torch.Tensor): + img_feature = img_feature.last_hidden_state + img_proj = self.projection(img_feature) + + hidden_states, attention_mask, input_labels = self.concat(img_proj, lang, attention_mask, input_labels, image_num) + labels = input_labels + + if self.pos_embedding is not None: + if past_key_values is None: + past_length = 0 + else: + past_length = past_key_values[0][0].size(-2) + position_ids = torch.arange(past_length, hidden_states.size()[1] + past_length, dtype=torch.long, device=hidden_states.device) + position_ids = position_ids.unsqueeze(0).view(-1, hidden_states.size()[1]) + position_embeds = self.pos_embedding(position_ids) + hidden_states = hidden_states + position_embeds + + logits = self.lang_decoder(input_ids=None, + inputs_embeds=hidden_states, + attention_mask=attention_mask, + labels=None, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict).logits + + + logits_shift = logits[..., :-1, :].contiguous().view(-1, self.vocab_size) # remove the last token + labels_shift = labels[..., 1:].contiguous().to(logits_shift.device).view(-1) # remove the first token + # select index that is not -100 + labels_index = labels_shift != -100 + if torch.sum(labels_index) ==0: + logits_shift = logits_shift[-2:,:].contiguous() + labels_shift = labels_shift[-2:].contiguous() + else: + logits_shift = logits_shift[labels_index,:].contiguous() + labels_shift = labels_shift[labels_index].contiguous() + + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits_shift, labels_shift) + + return [loss,] + + @torch.no_grad() + def generate(self, img, lang, + attention_mask=None, + input_labels=None, + generation_length=128, + generation_kwargs={}, # add some meaningful default values + ): + assert lang.size()[0] == 1, "only support batch size == 1 for now" + attention_mask = torch.ones_like(lang) + input_labels = torch.ones_like(lang) + # this part for now does not require gradient + img_feature = self.vis_encoder(img) + if not isinstance(img_feature, torch.Tensor): + img_feature = img_feature.last_hidden_state + img_proj = self.projection(img_feature) + hidden_states, attention_mask, input_labels = self.concat(img_proj, lang, attention_mask, input_labels, image_num=[img.size(0)], do_generation=True) + + output = self.lang_decoder.generate(input_ids=None, + inputs_embeds=hidden_states, + attention_mask=attention_mask, # we need the mask to diff img and text + pad_token_id=self.tokenizer.pad_token_id, + max_new_tokens=generation_length, # this is the number of tokens you want to generate + **generation_kwargs) + return (output, self.tokenizer.batch_decode(output, skip_special_tokens=True)[0]) + + + def gradient_checkpointing_enable(self): + self.vis_encoder.gradient_checkpointing_enable() + self.lang_decoder.gradient_checkpointing_enable() \ No newline at end of 
file diff --git a/applications/DeepSpeed-VisualChat/utils/model/third_party_model/hf_model/configuration_llama.py b/applications/DeepSpeed-VisualChat/utils/model/third_party_model/hf_model/configuration_llama.py new file mode 100755 index 000000000..9b0f0ee69 --- /dev/null +++ b/applications/DeepSpeed-VisualChat/utils/model/third_party_model/hf_model/configuration_llama.py @@ -0,0 +1,175 @@ +# coding=utf-8 +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" LLaMA model configuration""" + +from transformers.configuration_utils import PretrainedConfig +from transformers.utils import logging + + +logger = logging.get_logger(__name__) + +LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP = {} + + +class LlamaConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`LlamaModel`]. It is used to instantiate an LLaMA + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the LLaMA-7B. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 32000): + Vocabulary size of the LLaMA model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`LlamaModel`] + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 11008): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer encoder. + num_key_value_heads (`int`, *optional*): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout [this + paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to + `num_attention_heads`. + pretraining_tp (`int`, *optional*, defaults to `1`): + Experimental feature. Tensor parallelism rank used during pretraining. 
Please refer to [this
+            document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is
+            necessary to ensure exact reproducibility of the pretraining results. Please refer to [this
+            issue](https://github.com/pytorch/pytorch/issues/76232).
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 2048):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-6):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether to tie the input and output word embeddings.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
+            strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format
+            is `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
+            `max_position_embeddings` to the expected new maximum. See the following thread for more information on how
+            these scaling strategies behave:
+            https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
+            experimental feature, subject to breaking API changes in future versions.
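+        enable_mmca_attention (`bool`, *optional*, defaults to `False`):
+            Whether to use the multi-modal causal attention (MMCA) masking added for DeepSpeed-VisualChat. When
+            enabled, the attention mask is expected to mark padding tokens with 0, text tokens with 1 and image tokens
+            with 2, and image and text tokens are attended to with separate masks (see `_expand_mask` in
+            `modeling_llama.py`).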
+ + Example: + + ```python + >>> from transformers import LlamaModel, LlamaConfig + + >>> # Initializing a LLaMA llama-7b style configuration + >>> configuration = LlamaConfig() + + >>> # Initializing a model from the llama-7b style configuration + >>> model = LlamaModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "llama" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=32000, + hidden_size=4096, + intermediate_size=11008, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=None, + hidden_act="silu", + max_position_embeddings=2048, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + pad_token_id=None, + bos_token_id=1, + eos_token_id=2, + pretraining_tp=1, + tie_word_embeddings=False, + rope_scaling=None, + enable_mmca_attention=False, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.enable_mmca_attention = enable_mmca_attention + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.pretraining_tp = pretraining_tp + self.use_cache = use_cache + self.rope_scaling = rope_scaling + self._rope_scaling_validation() + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + def _rope_scaling_validation(self): + """ + Validate the `rope_scaling` configuration. + """ + if self.rope_scaling is None: + return + + if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2: + raise ValueError( + "`rope_scaling` must be a dictionary with with two fields, `name` and `factor`, " + f"got {self.rope_scaling}" + ) + rope_scaling_type = self.rope_scaling.get("type", None) + rope_scaling_factor = self.rope_scaling.get("factor", None) + if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]: + raise ValueError( + f"`rope_scaling`'s name field must be one of ['linear', 'dynamic'], got {rope_scaling_type}" + ) + if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0: + raise ValueError(f"`rope_scaling`'s factor field must be an float > 1, got {rope_scaling_factor}") \ No newline at end of file diff --git a/applications/DeepSpeed-VisualChat/utils/model/third_party_model/hf_model/modeling_llama.py b/applications/DeepSpeed-VisualChat/utils/model/third_party_model/hf_model/modeling_llama.py new file mode 100755 index 000000000..b8e5c2ac6 --- /dev/null +++ b/applications/DeepSpeed-VisualChat/utils/model/third_party_model/hf_model/modeling_llama.py @@ -0,0 +1,1096 @@ +# coding=utf-8 +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch LLaMA model.""" +import math +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from transformers.activations import ACT2FN +from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast +from transformers.modeling_utils import PreTrainedModel +from transformers.utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings +# from transformers.configuration_llama import LlamaConfig +from transformers import LlamaConfig + + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "LlamaConfig" + + +# Copied from transformers.models.bart.modeling_bart._make_causal_mask +def _make_causal_mask( + input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0 +): + """ + Make causal mask used for bi-directional self-attention. + """ + bsz, tgt_len = input_ids_shape + mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device) + mask_cond = torch.arange(mask.size(-1), device=device) + mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0) + mask = mask.to(dtype) + + if past_key_values_length > 0: + mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1) + return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length) + + +# Copied from transformers.models.bart.modeling_bart._expand_mask +def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None, enable_mmca_attention=False): + """ + Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. 
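+
+    When `enable_mmca_attention` is True, `mask` is interpreted as a token-type mask (0: padding, 1: text, 2: image)
+    and a list of two expanded masks, `[image_mask, text_mask]`, is returned so that image tokens and text tokens can
+    be attended to separately downstream.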
+ """ + # we need two method here + # import pdb; pdb.set_trace() + # assert tgt_len == mask.size(-1), "tgt_len is not supported" + if enable_mmca_attention is False: + # basically, standard mask generation + mask = (mask > 0).to(mask.dtype) # our mask will have 0: padding, 1: text, and 2: image + bsz, src_len = mask.size() + tgt_len = tgt_len if tgt_len is not None else src_len + + expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) + + inverted_mask = 1.0 - expanded_mask + + return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min) + else: + bsz, src_len = mask.size() + tgt_len = tgt_len if tgt_len is not None else src_len + expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) + + # image mask + mask_img = mask.clone() + mask_img[mask_img!=2] = 0 # for all non-image part, we make them to be 0 + mask_img[mask_img==2] = 1 # for all image part, we make them to be 1 + + expanded_mask_img = mask_img[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) + # make diagonal to be 1 this part is not needed + # expanded_mask_img = expanded_mask_img + torch.eye(mask.shape[-1], dtype=mask.dtype, device=mask.device)[None, None, :, :] + inverted_mask_img = 1.0 - expanded_mask_img + inverted_mask_img = inverted_mask_img.masked_fill(inverted_mask_img.to(torch.bool), torch.finfo(dtype).min) + + # image tokens does not attennd to image tokens + if tgt_len == src_len: + # TODO: basically, the prompt phase, need to revisit this part + for i in range(bsz): + for j in range(tgt_len): + if mask[i, j] == 2: + # if it is image token, we make it to be 0 for previous attention + inverted_mask_img[i, :, j, :] = torch.finfo(dtype).min + inverted_mask_img[i, :, j, j] = 0 + + + # text mask + mask_text = mask.clone() + mask_text[mask_text!=1] = 0 # for all non-text part, we make them to be 0 + mask_text[mask_text==1] = 1 # for all text part, we make them to be 1 + expanded_mask_text = mask_text[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) + # make diagonal to be 1 + # expanded_mask_text = expanded_mask_text + torch.eye(mask.shape[-1], dtype=mask.dtype, device=mask.device)[None, None, :, :] + inverted_mask_text = 1.0 - expanded_mask_text + inverted_mask_text = inverted_mask_text.masked_fill(inverted_mask_text.to(torch.bool), torch.finfo(dtype).min) + + return [inverted_mask_img, inverted_mask_text] # return two masks + + + + +class LlamaRMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + LlamaRMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + +class LlamaRotaryEmbedding(torch.nn.Module): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): + super().__init__() + + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + # Build here to make `torch.jit.trace` work. 
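+        # Precompute the cos/sin cache up to `max_position_embeddings`; `forward` only rebuilds the cache when it is
+        # asked for a sequence length longer than what has been cached so far.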
+ self._set_cos_sin_cache( + seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() + ) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) + + freqs = torch.einsum("i,j->ij", t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False) + + def forward(self, x, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + if seq_len > self.max_seq_len_cached: + self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) + + return ( + self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype), + self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype), + ) + + +class LlamaLinearScalingRotaryEmbedding(LlamaRotaryEmbedding): + """LlamaRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev""" + + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): + self.scaling_factor = scaling_factor + super().__init__(dim, max_position_embeddings, base, device) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) + t = t / self.scaling_factor + + freqs = torch.einsum("i,j->ij", t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False) + + +class LlamaDynamicNTKScalingRotaryEmbedding(LlamaRotaryEmbedding): + """LlamaRotaryEmbedding extended with Dynamic NTK scaling. 
Credits to the Reddit users /u/bloc97 and /u/emozilla""" + + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): + self.scaling_factor = scaling_factor + super().__init__(dim, max_position_embeddings, base, device) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + + if seq_len > self.max_position_embeddings: + base = self.base * ( + (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1) + ) ** (self.dim / (self.dim - 2)) + inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) + + freqs = torch.einsum("i,j->ij", t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False) + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +def apply_rotary_pos_emb(q, k, cos, sin, position_ids): + # The first two dimensions of cos and sin are always 1, so we can `squeeze` them. + cos = cos.squeeze(1).squeeze(0) # [seq_len, dim] + sin = sin.squeeze(1).squeeze(0) # [seq_len, dim] + # import pdb; pdb.set_trace() + cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] + sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +class LlamaMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + if self.config.pretraining_tp > 1: + slice = self.intermediate_size // self.config.pretraining_tp + gate_proj_slices = self.gate_proj.weight.split(slice, dim=0) + up_proj_slices = self.up_proj.weight.split(slice, dim=0) + down_proj_slices = self.down_proj.weight.split(slice, dim=1) + + gate_proj = torch.cat( + [F.linear(x, gate_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1 + ) + up_proj = torch.cat([F.linear(x, up_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1) + + intermediate_states = (self.act_fn(gate_proj) * up_proj).split(slice, dim=2) + down_proj = [ + F.linear(intermediate_states[i], down_proj_slices[i]) for i in range(self.config.pretraining_tp) + ] + down_proj = sum(down_proj) + else: + down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + + return down_proj + + +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). 
The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +class LlamaAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: LlamaConfig): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + + if (self.head_dim * self.num_heads) != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." + ) + self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) + self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) + self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) + self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) + self.enable_mmca_attention = config.enable_mmca_attention + self._init_rope() + + def _init_rope(self): + if self.config.rope_scaling is None: + self.rotary_emb = LlamaRotaryEmbedding(self.head_dim, max_position_embeddings=self.max_position_embeddings) + else: + scaling_type = self.config.rope_scaling["type"] + scaling_factor = self.config.rope_scaling["factor"] + if scaling_type == "linear": + self.rotary_emb = LlamaLinearScalingRotaryEmbedding( + self.head_dim, max_position_embeddings=self.max_position_embeddings, scaling_factor=scaling_factor + ) + elif scaling_type == "dynamic": + self.rotary_emb = LlamaDynamicNTKScalingRotaryEmbedding( + self.head_dim, max_position_embeddings=self.max_position_embeddings, scaling_factor=scaling_factor + ) + else: + raise ValueError(f"Unknown RoPE scaling type {scaling_type}") + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: bool = False, + use_cache: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + bsz, q_len, _ = hidden_states.size() + + if self.config.pretraining_tp > 1: + key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp + query_slices = self.q_proj.weight.split( + (self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0 + ) + key_slices = self.k_proj.weight.split(key_value_slicing, dim=0) + value_slices = self.v_proj.weight.split(key_value_slicing, dim=0) + + query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.config.pretraining_tp)] + query_states = torch.cat(query_states, dim=-1) + + key_states = [F.linear(hidden_states, key_slices[i]) for i in 
range(self.config.pretraining_tp)] + key_states = torch.cat(key_states, dim=-1) + + value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.config.pretraining_tp)] + value_states = torch.cat(value_states, dim=-1) + + else: + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[-2] + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + # reuse k, v, self_attention + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + + past_key_value = (key_states, value_states) if use_cache else None + + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + + if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): + raise ValueError( + f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if self.enable_mmca_attention is False: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + else: + if attention_mask[0].size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + if self.enable_mmca_attention is False: + attn_weights = attn_weights + attention_mask + else: + attn_weights_img = attn_weights + attention_mask[0] + attn_weights_text = attn_weights + attention_mask[1] + + # upcast attention to fp32 + if self.enable_mmca_attention is False: + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + else: + attn_weights_img = nn.functional.softmax(attn_weights_img, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_weights_text = nn.functional.softmax(attn_weights_text, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_weights = (attn_weights_img + attn_weights_text) #TODO: shall we reduce the weights of the diagonal part? 
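+            # MMCA: image-key and text-key attention scores are normalized with two separate softmaxes (each mask
+            # already includes the causal mask) and then summed, so a query distributes attention over image tokens
+            # and over text tokens independently instead of having them compete in a single softmax.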
+ + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + if self.config.pretraining_tp > 1: + attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2) + o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.config.pretraining_tp, dim=1) + attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.config.pretraining_tp)]) + else: + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + +class LlamaDecoderLayer(nn.Module): + def __init__(self, config: LlamaConfig): + super().__init__() + self.hidden_size = config.hidden_size + self.self_attn = LlamaAttention(config=config) + self.mlp = LlamaMLP(config) + self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + """ + + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +LLAMA_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) 
+ + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`LlamaConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +@add_start_docstrings( + "The bare LLaMA Model outputting raw hidden-states without any specific head on top.", + LLAMA_START_DOCSTRING, +) +class LlamaPreTrainedModel(PreTrainedModel): + config_class = LlamaConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["LlamaDecoderLayer"] + _skip_keys_device_placement = "past_key_values" + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, LlamaModel): + module.gradient_checkpointing = value + + +LLAMA_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. 
+ + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare LLaMA Model outputting raw hidden-states without any specific head on top.", + LLAMA_START_DOCSTRING, +) +class LlamaModel(LlamaPreTrainedModel): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`LlamaDecoderLayer`] + + Args: + config: LlamaConfig + """ + + def __init__(self, config: LlamaConfig): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.layers = nn.ModuleList([LlamaDecoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + self.gradient_checkpointing = False + self.enable_mmca_attention = config.enable_mmca_attention # this is new :) + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask + def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length): + # create causal mask + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + combined_attention_mask = None + if input_shape[-1] > 1: + combined_attention_mask = _make_causal_mask( + input_shape, + inputs_embeds.dtype, + device=inputs_embeds.device, + past_key_values_length=past_key_values_length, + ) + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1], enable_mmca_attention=self.enable_mmca_attention) + # .to( + # inputs_embeds.device + # ) + + if self.enable_mmca_attention: + # if cross attention, we have two masks, this is from _expand_mask + expanded_attn_mask = [expanded_attn_mask[0].to(inputs_embeds.device), 
expanded_attn_mask[1].to(inputs_embeds.device)] + if combined_attention_mask is None: + combined_attention_mask = expanded_attn_mask + else: + combined_attention_mask = (expanded_attn_mask[0] + combined_attention_mask, + expanded_attn_mask[1] + combined_attention_mask) + else: + expanded_attn_mask = expanded_attn_mask.to(inputs_embeds.device) + combined_attention_mask = ( + expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask + ) + # import pdb; pdb.set_trace() + # (combined_attention_mask[1][0,0][50] + combined_attention_mask[0][0,0][50])[51] + return combined_attention_mask + + @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + batch_size, seq_length = input_ids.shape + elif inputs_embeds is not None: + batch_size, seq_length, _ = inputs_embeds.shape + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + seq_length_with_past = seq_length + past_key_values_length = 0 + + if past_key_values is not None: + past_key_values_length = past_key_values[0][0].shape[2] + seq_length_with_past = seq_length_with_past + past_key_values_length + # import pdb; pdb.set_trace() + if position_ids is None: + device = input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = torch.arange( + past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device + ) + position_ids = position_ids.unsqueeze(0).view(-1, seq_length) + else: + position_ids = position_ids.view(-1, seq_length).long() + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + # embed positions + if attention_mask is None: + attention_mask = torch.ones( + (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device + ) + attention_mask = self._prepare_decoder_attention_mask( + attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length + ) + + hidden_states = inputs_embeds + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
+ ) + use_cache = False + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = () if use_cache else None + + for idx, decoder_layer in enumerate(self.layers): + if output_hidden_states: + all_hidden_states += (hidden_states,) + + past_key_value = past_key_values[idx] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + # None for past_key_value + return module(*inputs, output_attentions, None) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(decoder_layer), + hidden_states, + attention_mask, + position_ids, + None, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + +class LlamaForCausalLM(LlamaPreTrainedModel): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + self.model = LlamaModel(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.model = decoder + + def get_decoder(self): + return self.model + + @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). 
Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, LlamaForCausalLM + + >>> model = LlamaForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) + >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + if self.config.pretraining_tp > 1: + lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.config.pretraining_tp, dim=0) + logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.config.pretraining_tp)] + logits = torch.cat(logits, dim=-1) + else: + logits = self.lm_head(hidden_states) + logits = logits.float() + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Enable model parallelism + shift_labels = shift_labels.to(shift_logits.device) + loss = loss_fct(shift_logits, shift_labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs + ): + if past_key_values: + input_ids = input_ids[:, -1:] + + position_ids = kwargs.get("position_ids", None) + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + # TODO: I used a hacky way to create position_ids, but it works for now + attnetion_mask_clone = attention_mask.clone() > 0 + position_ids = attnetion_mask_clone.long().cumsum(-1) - 1 + position_ids.masked_fill_(attnetion_mask_clone == 0, 1) + if past_key_values: + position_ids = position_ids[:, -1].unsqueeze(-1) + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: 
+ model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + } + ) + return model_inputs + + @staticmethod + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), + ) + return reordered_past + + +@add_start_docstrings( + """ + The LLaMa Model transformer with a sequence classification head on top (linear layer). + + [`LlamaForSequenceClassification`] uses the last token in order to do the classification, as other causal models + (e.g. GPT-2) do. + + Since it does classification on the last token, it requires to know the position of the last token. If a + `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If + no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the + padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in + each row of the batch). + """, + LLAMA_START_DOCSTRING, +) +class LlamaForSequenceClassification(LlamaPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.model = LlamaModel(config) + self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + logits = self.score(hidden_states) + + if input_ids is not None: + batch_size = input_ids.shape[0] + else: + batch_size = inputs_embeds.shape[0] + + if self.config.pad_token_id is None and batch_size != 1: + raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") + if self.config.pad_token_id is None: + sequence_lengths = -1 + else: + if input_ids is not None: + sequence_lengths = (torch.eq(input_ids, self.config.pad_token_id).long().argmax(-1) - 1).to( + logits.device + ) + else: + sequence_lengths = -1 + + pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] + + loss = None + if labels is not None: + labels = labels.to(logits.device) + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) + if not return_dict: + output = (pooled_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) diff --git a/applications/DeepSpeed-VisualChat/utils/model/third_party_model/qwen_clip/qwen_clip.py b/applications/DeepSpeed-VisualChat/utils/model/third_party_model/qwen_clip/qwen_clip.py new file mode 100644 index 000000000..4bb87426d --- /dev/null +++ b/applications/DeepSpeed-VisualChat/utils/model/third_party_model/qwen_clip/qwen_clip.py @@ -0,0 +1,424 @@ +# Copyright (c) Alibaba Cloud. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
+ +from collections import OrderedDict +import math +import requests +from io import BytesIO +from functools import partial +from PIL import Image +from typing import Callable, Optional, Sequence, Tuple, List +import numpy as np + +import torch +from torch import nn +from torch.nn import functional as F +from torch.nn.init import trunc_normal_ +from torchvision import transforms +from torchvision.transforms import InterpolationMode + + +def get_abs_pos(abs_pos, tgt_size): + # abs_pos: L, C + # tgt_size: M + # return: M, C + src_size = int(math.sqrt(abs_pos.size(0))) + tgt_size = int(math.sqrt(tgt_size)) + dtype = abs_pos.dtype + + if src_size != tgt_size: + return F.interpolate( + abs_pos.float().reshape(1, src_size, src_size, -1).permute(0, 3, 1, 2), + size=(tgt_size, tgt_size), + mode="bicubic", + align_corners=False, + ).permute(0, 2, 3, 1).flatten(0, 2).to(dtype=dtype) + else: + return abs_pos + +# https://github.com/facebookresearch/mae/blob/efb2a8062c206524e35e47d04501ed4f544c0ae8/util/pos_embed.py#L20 +def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False): + """ + grid_size: int of the grid height and width + return: + pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token) + """ + grid_h = np.arange(grid_size, dtype=np.float32) + grid_w = np.arange(grid_size, dtype=np.float32) + grid = np.meshgrid(grid_w, grid_h) # here w goes first + grid = np.stack(grid, axis=0) + + grid = grid.reshape([2, 1, grid_size, grid_size]) + pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid) + if cls_token: + pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0) + return pos_embed + + +def get_2d_sincos_pos_embed_from_grid(embed_dim, grid): + assert embed_dim % 2 == 0 + + # use half of dimensions to encode grid_h + emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2) + emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2) + + emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D) + return emb + + +def get_1d_sincos_pos_embed_from_grid(embed_dim, pos): + """ + embed_dim: output dimension for each position + pos: a list of positions to be encoded: size (M,) + out: (M, D) + """ + assert embed_dim % 2 == 0 + omega = np.arange(embed_dim // 2, dtype=np.float32) + omega /= embed_dim / 2. + omega = 1. 
/ 10000**omega # (D/2,) + + pos = pos.reshape(-1) # (M,) + out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product + + emb_sin = np.sin(out) # (M, D/2) + emb_cos = np.cos(out) # (M, D/2) + + emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D) + return emb + + +class Resampler(nn.Module): + """ + A 2D perceiver-resampler network with one cross attention layers by + (grid_size**2) learnable queries and 2d sincos pos_emb + Outputs: + A tensor with the shape of (grid_size**2, embed_dim) + """ + def __init__( + self, + grid_size, + embed_dim, + num_heads, + kv_dim=None, + norm_layer=nn.LayerNorm + ): + super().__init__() + self.num_queries = grid_size ** 2 + self.embed_dim = embed_dim + self.num_heads = num_heads + + self.pos_embed = nn.Parameter( + torch.from_numpy(get_2d_sincos_pos_embed(embed_dim, grid_size)).float() + ).requires_grad_(False) + + self.query = nn.Parameter(torch.zeros(self.num_queries, embed_dim)) + trunc_normal_(self.query, std=.02) + + if kv_dim is not None and kv_dim != embed_dim: + self.kv_proj = nn.Linear(kv_dim, embed_dim, bias=False) + else: + self.kv_proj = nn.Identity() + + self.attn = nn.MultiheadAttention(embed_dim, num_heads) + self.ln_q = norm_layer(embed_dim) + self.ln_kv = norm_layer(embed_dim) + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + def forward(self, x, attn_mask=None): + + pos_embed = get_abs_pos(self.pos_embed, x.size(1)) + + x = self.kv_proj(x) + x = self.ln_kv(x).permute(1, 0, 2) + + N = x.shape[1] + q = self.ln_q(self.query) + out = self.attn( + self._repeat(q, N) + self.pos_embed.unsqueeze(1), + x + pos_embed.unsqueeze(1), + x, + attn_mask=attn_mask)[0] + return out.permute(1, 0, 2) + + def _repeat(self, query, N: int): + return query.unsqueeze(1).repeat(1, N, 1) + + +class VisualAttention(nn.Module): + """self-attention layer class. + Self-attention layer takes input with size [s, b, h] + and returns output of the same size. + """ + + def __init__(self, embed_dim, num_heads, + bias=True, kdim=None, vdim=None): + super(VisualAttention, self).__init__() + self.embed_dim = embed_dim + self.kdim = kdim if kdim is not None else embed_dim + self.vdim = vdim if vdim is not None else embed_dim + self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim + + self.num_heads = num_heads + + # Per attention head and per partition values. + assert embed_dim % num_heads == 0 + self.hidden_size_per_attention_head = embed_dim // num_heads + self.num_attention_heads_per_partition = num_heads + self.hidden_size_per_partition = embed_dim + + # Strided linear layer. 
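+        # Added note: a single fused `in_proj` (embed_dim -> 3 * embed_dim) produces Q, K and V in one
+        # matmul and is split per attention head in forward(); this is why the assert below restricts the
+        # layer to self-attention (kdim == vdim == embed_dim).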
+ assert self._qkv_same_embed_dim, 'Only Support SelfAttention Currently' + self.in_proj = nn.Linear(embed_dim, 3 * embed_dim) + self.out_proj = nn.Linear(embed_dim, embed_dim) + self.norm_factor = math.sqrt(self.hidden_size_per_attention_head) + + def forward(self, query, key, value, attn_mask = None): + # query/key/value: [sq, b, h] + sq, b, _ = query.size() + # print("Diff", (query-key).norm()) + # assert query is key, 'Only Support Self-Attention Currently' + sk = sq + mixed_x_layer = self.in_proj(query) + + # [sq, b, (np * 3 * hn)] --> [sq, b, np, 3 * hn] + new_tensor_shape = mixed_x_layer.size()[:-1] + \ + (self.num_attention_heads_per_partition, + 3 * self.hidden_size_per_attention_head) + mixed_x_layer = mixed_x_layer.view(*new_tensor_shape) + + # [sq, b, np, 3 * hn] --> 3 [sq, b, np, hn] + query_layer, key_layer, value_layer = mixed_x_layer.split( + self.hidden_size_per_attention_head, dim=-1) + + # [sq, b, np, hn] -> [sq, b * np, hn] + query_layer = query_layer.view(sq, + b * self.num_attention_heads_per_partition, + self.hidden_size_per_attention_head).transpose(0, 1) + # [sk, b, np, hn] -> [sk, b * np, hn] + key_layer = key_layer.view(sk, + b * self.num_attention_heads_per_partition, + self.hidden_size_per_attention_head).transpose(0, 1) + + q_scaled = query_layer / self.norm_factor + if attn_mask is not None: + attention_probs = torch.baddbmm(attn_mask, q_scaled, key_layer.transpose(-2, -1)) + else: + attention_probs = torch.bmm(q_scaled, key_layer.transpose(-2, -1)) + attention_probs = attention_probs.softmax(dim=-1) + + value_layer = value_layer.view(sk, + b * self.num_attention_heads_per_partition, + self.hidden_size_per_attention_head).transpose(0, 1) + + # matmul: [b * np, sq, hn] + context_layer = torch.bmm(attention_probs, value_layer) + + # change view [b, np, sq, hn] + context_layer = context_layer.view(b, + self.num_attention_heads_per_partition, + sq, self.hidden_size_per_attention_head) + + # [b, np, sq, hn] --> [sq, b, np, hn] + context_layer = context_layer.permute(2, 0, 1, 3).contiguous() + + # [sq, b, np, hn] --> [sq, b, hp] + new_context_layer_shape = context_layer.size()[:-2] + \ + (self.hidden_size_per_partition,) + context_layer = context_layer.view(*new_context_layer_shape) + + output = self.out_proj(context_layer) + + return output + + +class VisualAttentionBlock(nn.Module): + def __init__( + self, + d_model: int, + n_head: int, + mlp_size: int, + act_layer: Callable = nn.GELU, + norm_layer: Callable = nn.LayerNorm, + is_cross_attention: bool = False, + ): + super().__init__() + + self.ln_1 = norm_layer(d_model) + if is_cross_attention: + self.ln_1_kv = norm_layer(d_model) + + self.ln_2 = norm_layer(d_model) + mlp_width = int(mlp_size) + self.attn = VisualAttention(d_model, n_head) + self.mlp = nn.Sequential(OrderedDict([ + ("c_fc", nn.Linear(d_model, mlp_width)), + ("gelu", act_layer()), + ("c_proj", nn.Linear(mlp_width, d_model)) + ])) + + def attention( + self, + q_x: torch.Tensor, + k_x: Optional[torch.Tensor] = None, + v_x: Optional[torch.Tensor] = None, + attn_mask: Optional[torch.Tensor] = None, + ): + k_x = k_x if k_x is not None else q_x + v_x = v_x if v_x is not None else q_x + # k_x = q_x + # v_x = q_x + + attn_mask = attn_mask.to(q_x.dtype) if attn_mask is not None else None + return self.attn(q_x, k_x, v_x, attn_mask=attn_mask) + + def forward( + self, + q_x: torch.Tensor, + k_x: Optional[torch.Tensor] = None, + v_x: Optional[torch.Tensor] = None, + attn_mask: Optional[torch.Tensor] = None, + ): + k_x = self.ln_1_kv(k_x) if hasattr(self, 
"ln_1_kv") and k_x is not None else None + v_x = self.ln_1_kv(v_x) if hasattr(self, "ln_1_kv") and v_x is not None else None + + x = q_x + self.attention(q_x=self.ln_1(q_x), k_x=k_x, v_x=v_x, attn_mask=attn_mask) + x = x + self.mlp(self.ln_2(x)) + return x + + +class TransformerBlock(nn.Module): + def __init__( + self, + width: int, + layers: int, + heads: int, + mlp_size: int, + act_layer: Callable = nn.GELU, + norm_layer: Callable = nn.LayerNorm, + ): + super().__init__() + self.width = width + self.layers = layers + + self.resblocks = nn.ModuleList([ + VisualAttentionBlock( + width, heads, mlp_size, act_layer=act_layer, norm_layer=norm_layer) + for _ in range(layers) + ]) + + self.gradient_checkpointing = False + + def enable_gradient_checkpointing(self): + self.gradient_checkpointing = True + + def disable_gradient_checkpointing(self): + self.gradient_checkpointing = False + + def get_cast_dtype(self) -> torch.dtype: + return self.resblocks[0].mlp.c_fc.weight.dtype + + def get_cast_device(self) -> torch.device: + return self.resblocks[0].mlp.c_fc.weight.device + + + def forward(self, x: torch.Tensor, attn_mask: Optional[torch.Tensor] = None): + for r in self.resblocks: + if self.gradient_checkpointing and self.training: + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + return custom_forward + x = torch.utils.checkpoint.checkpoint(create_custom_forward(r), x) + else: + x = r(x, attn_mask=attn_mask) + return x + + +class VisionTransformer(nn.Module): + + def __init__( + self, + image_size: int, + patch_size: int, + width: int, + layers: int, + heads: int, + mlp_size: int, + output_dim: int, + n_queries: int = 256, + **kwargs + ): + super().__init__() + image_height, image_width = self.image_size = (image_size, image_size) + patch_height, patch_width = self.patch_size = (patch_size, patch_size) + self.grid_size = (image_height // patch_height, image_width // patch_width) + self.output_dim = output_dim + + self.conv1 = nn.Conv2d(in_channels=3, out_channels=width, kernel_size=patch_size, stride=patch_size, bias=False) + + # class embeddings and positional embeddings + scale = width ** -0.5 + self.positional_embedding = nn.Parameter(scale * torch.randn(256, width)) + + norm_layer = partial(nn.LayerNorm, eps=1e-6) + act_layer = nn.GELU + + self.ln_pre = norm_layer(width) + self.transformer = TransformerBlock( + width, + layers, + heads, + mlp_size, + act_layer=act_layer, + norm_layer=norm_layer, + ) + + self.attn_pool = Resampler( + grid_size=int(math.sqrt(n_queries)), + embed_dim=output_dim, + num_heads=output_dim // 128, + kv_dim=width, + norm_layer=norm_layer, + ) + self.ln_post = norm_layer(output_dim) + + def gradient_checkpointing_enable(self): + self.transformer.enable_gradient_checkpointing() + + def gradient_checkpointing_disable(self): + self.transformer.disable_gradient_checkpointing() + + def forward(self, x: torch.Tensor): + x = x.to( + dtype=self.transformer.get_cast_dtype(), + device=self.transformer.get_cast_device(), + ) + # to patches + x = self.conv1(x) # shape = [*, width, grid, grid] + x = x.reshape(x.shape[0], x.shape[1], -1) # shape = [*, width, grid ** 2] + x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width] + + x = x + get_abs_pos(self.positional_embedding, x.size(1)) + + x = self.ln_pre(x) + + x = x.permute(1, 0, 2) # NLD -> LND + x = self.transformer(x) + x = x.permute(1, 0, 2) # LND -> NLD + + x = self.attn_pool(x) + x = self.ln_post(x) + + return x \ No newline at end of file diff --git 
a/applications/DeepSpeed-VisualChat/utils/model/vis_proj.py b/applications/DeepSpeed-VisualChat/utils/model/vis_proj.py new file mode 100644 index 000000000..4c5ac875f --- /dev/null +++ b/applications/DeepSpeed-VisualChat/utils/model/vis_proj.py @@ -0,0 +1,153 @@ +import torch +import torch.nn.functional as F +from transformers.models.clip.modeling_clip import CLIPEncoderLayer +from torch import nn +import os +import sys +import math +# sys.path.append('/vc_data/users/xwu/image-language/DeepSpeedExamples-internal-high-loss/applications/DeepSpeed-Chat-multi-modal/training/utils') +sys.path.append( + os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir))) +import numpy as np +from torch.nn.init import trunc_normal_ + + +class VisProjection_vit(nn.Module): + def __init__(self, vis_config, lang_dim): + super().__init__() + # TODO: for now, hard-coded for ViT + self.vis_layer = CLIPEncoderLayer(vis_config) + self.projection = nn.Sequential( + nn.Linear(vis_config.hidden_size, lang_dim), # an example implementation + nn.LayerNorm(lang_dim, eps=1e-12)) + def forward(self, vis_input): + vis_feature = self.vis_layer(vis_input, None, None)[0] # only need the first output + return self.projection(vis_feature) + + +# The following code is adopted from QWen-Clip +def get_abs_pos(abs_pos, tgt_size): + # abs_pos: L, C + # tgt_size: M + # return: M, C + src_size = int(math.sqrt(abs_pos.size(0))) + tgt_size = int(math.sqrt(tgt_size)) + dtype = abs_pos.dtype + + if src_size != tgt_size: + return F.interpolate( + abs_pos.float().reshape(1, src_size, src_size, -1).permute(0, 3, 1, 2), + size=(tgt_size, tgt_size), + mode="bicubic", + align_corners=False, + ).permute(0, 2, 3, 1).flatten(0, 2).to(dtype=dtype) + else: + return abs_pos + +# https://github.com/facebookresearch/mae/blob/efb2a8062c206524e35e47d04501ed4f544c0ae8/util/pos_embed.py#L20 +def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False): + """ + grid_size: int of the grid height and width + return: + pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token) + """ + grid_h = np.arange(grid_size, dtype=np.float32) + grid_w = np.arange(grid_size, dtype=np.float32) + grid = np.meshgrid(grid_w, grid_h) # here w goes first + grid = np.stack(grid, axis=0) + + grid = grid.reshape([2, 1, grid_size, grid_size]) + pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid) + if cls_token: + pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0) + return pos_embed + +def get_2d_sincos_pos_embed_from_grid(embed_dim, grid): + assert embed_dim % 2 == 0 + + # use half of dimensions to encode grid_h + emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2) + emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2) + + emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D) + return emb + + +def get_1d_sincos_pos_embed_from_grid(embed_dim, pos): + """ + embed_dim: output dimension for each position + pos: a list of positions to be encoded: size (M,) + out: (M, D) + """ + assert embed_dim % 2 == 0 + omega = np.arange(embed_dim // 2, dtype=np.float32) + omega /= embed_dim / 2. + omega = 1. 
/ 10000**omega # (D/2,) + + pos = pos.reshape(-1) # (M,) + out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product + + emb_sin = np.sin(out) # (M, D/2) + emb_cos = np.cos(out) # (M, D/2) + + emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D) + return emb + + +class VisProjection_perceiver(nn.Module): + def __init__(self, vis_config, lang_dim): + super().__init__() + # TODO: for now, hard-coded for perceiver + grid_size = 16 + self.num_queries = grid_size ** 2 + self.embed_dim = lang_dim + self.num_heads = lang_dim // 128 + + self.pos_embed = nn.Parameter( + torch.from_numpy(get_2d_sincos_pos_embed(lang_dim, grid_size)).float() + ).requires_grad_(False) + + self.query = nn.Parameter(torch.zeros(self.num_queries, lang_dim)) + trunc_normal_(self.query, std=.02) + + self.kv_proj = nn.Linear(vis_config.hidden_size, lang_dim) + + self.attn = nn.MultiheadAttention(lang_dim, self.num_heads) + self.ln_q = nn.LayerNorm(lang_dim, eps=1e-12) + self.ln_kv = nn.LayerNorm(lang_dim, eps=1e-12) + self.projection = nn.Sequential( + nn.LayerNorm(lang_dim, eps=1e-12), + nn.Linear(lang_dim, lang_dim) # an example implementation + ) + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + def forward(self, x, attn_mask=None): + # import pdb; pdb.set_trace() + pos_embed = get_abs_pos(self.pos_embed, x.size(1)) + + x = x[:, 1:, :] # remove cls token + x = self.kv_proj(x) + x = self.ln_kv(x).permute(1, 0, 2) + + + N = x.shape[1] + q = self.ln_q(self.query) + out = self.attn( + self._repeat(q, N) + self.pos_embed.unsqueeze(1), + x + pos_embed.unsqueeze(1), + x, + attn_mask=attn_mask)[0] + return self.projection(out.permute(1, 0, 2)) + + def _repeat(self, query, N: int): + return query.unsqueeze(1).repeat(1, N, 1) \ No newline at end of file diff --git a/applications/DeepSpeed-VisualChat/utils/module/lora.py b/applications/DeepSpeed-VisualChat/utils/module/lora.py new file mode 100644 index 000000000..67e446033 --- /dev/null +++ b/applications/DeepSpeed-VisualChat/utils/module/lora.py @@ -0,0 +1,147 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +import math +import torch +from torch import nn +import torch.nn.functional as F +from deepspeed.compression.helper import recursive_getattr, recursive_setattr +import deepspeed + + +class LinearLayer_LoRA(nn.Module): + # an simple implementation of LoRA + # for now only support Linear Layer + def __init__(self, + weight, + lora_dim=0, + lora_scaling=1, + lora_droppout=0, + bias=None): + super(LinearLayer_LoRA, self).__init__() + self.weight = weight + self.bias = bias + + if lora_dim <= 0: + raise ValueError( + "You are training to use LoRA, whose reduced dim should be larger than 1" + ) + + try: + # for zero stage 3 + rows, columns = weight.ds_shape + except: + rows, columns = weight.shape + self.lora_right_weight = nn.Parameter(torch.zeros( + columns, + lora_dim)) # apply transpose so in forward we do not need to + self.lora_left_weight = nn.Parameter(torch.zeros(lora_dim, rows)) + self.lora_scaling = lora_scaling / lora_dim + + if lora_droppout > 0: + self.lora_dropout = nn.Dropout(lora_droppout) + else: + self.lora_dropout = nn.Identity() + + self.reset_parameters() + # disable the original weight gradient + self.weight.requires_grad = False + # fuse LoRA to the original weight + self.fuse_lora = False + + def eval(self): + self.lora_dropout.eval() + + def train(self, mode=True): + self.lora_dropout.train(mode) + + def reset_parameters(self): + nn.init.kaiming_uniform_(self.lora_right_weight, a=math.sqrt(5)) + nn.init.zeros_(self.lora_left_weight) + + def fuse_lora_weight(self): + if not self.fuse_lora: + self.weight.data += self.lora_scaling * torch.matmul( + self.lora_left_weight.t(), self.lora_right_weight.t()) + self.fuse_lora = True + + def unfuse_lora_weight(self): + if self.fuse_lora: + self.weight.data -= self.lora_scaling * torch.matmul( + self.lora_left_weight.t(), self.lora_right_weight.t()) + self.fuse_lora = False + + def forward(self, input): + if self.fuse_lora: + return F.linear(input, self.weight, self.bias) + else: + return F.linear( + input, self.weight, + self.bias) + (self.lora_dropout(input) @ self.lora_right_weight + @ self.lora_left_weight) * self.lora_scaling + + +# convert the linear layer to LoRA +def convert_linear_layer_to_lora(model, + part_module_name, + lora_dim=0, + lora_scaling=1, + lora_droppout=0): + repalce_name = [] + for name, module in model.named_modules(): + if isinstance(module, nn.Linear) and part_module_name in name: + repalce_name.append(name) + for name in repalce_name: + module = recursive_getattr(model, name) + tmp = LinearLayer_LoRA( + module.weight, lora_dim, lora_scaling, lora_droppout, + module.bias).to(module.weight.device).to(module.weight.dtype) + recursive_setattr(model, name, tmp) + return model + + +def _z3_params_to_fetch(param_list): + return [ + p for p in param_list + if hasattr(p, 'ds_id') and p.ds_status == deepspeed.runtime.zero. 
+ partition_parameters.ZeroParamStatus.NOT_AVAILABLE + ] + + +# convert the LoRA layer to linear layer +def convert_lora_to_linear_layer(model, fuse_lora=True): + repalce_name = [] + for name, module in model.named_modules(): + if isinstance(module, LinearLayer_LoRA): + repalce_name.append(name) + for name in repalce_name: + module = recursive_getattr(model, name) + zero_stage_3 = hasattr(module.weight, 'ds_id') + with deepspeed.zero.GatheredParameters(_z3_params_to_fetch([ + module.weight, module.bias, module.lora_left_weight, + module.lora_right_weight + ]), + modifier_rank=0, + enabled=zero_stage_3): + if fuse_lora: + module.fuse_lora_weight() + else: + module.unfuse_lora_weight() + return model + +def fuse_lora(model): + return convert_lora_to_linear_layer(model, fuse_lora=True) + +def unfuse_lora(model): + return convert_lora_to_linear_layer(model, fuse_lora=False) + + +def only_optimize_lora_parameters(model): + # turn off the gradient of all the parameters except the LoRA parameters + for name, param in model.named_parameters(): + if "lora_right_weight" in name or "lora_left_weight" in name: + param.requires_grad = True + else: + param.requires_grad = False + return model diff --git a/applications/DeepSpeed-Chat/training/utils/utils.py b/applications/DeepSpeed-VisualChat/utils/utils.py similarity index 61% rename from applications/DeepSpeed-Chat/training/utils/utils.py rename to applications/DeepSpeed-VisualChat/utils/utils.py index 2a6e47891..1853b8a23 100644 --- a/applications/DeepSpeed-Chat/training/utils/utils.py +++ b/applications/DeepSpeed-VisualChat/utils/utils.py @@ -12,10 +12,28 @@ from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus -def print_rank_0(msg, rank=0): - if rank <= 0: +def print_rank_0(msg, rank=None): + if rank is not None and rank <= 0: print(msg) + elif is_rank_0(): + print(msg) + +def is_rank_0(): + """Check whether it is rank 0.""" + if torch.distributed.is_initialized(): + if torch.distributed.get_rank() == 0: + return True + else: + return False + else: + return True +def get_rank(): + """Check whether it is rank 0.""" + if torch.distributed.is_initialized(): + return torch.distributed.get_rank() + else: + return 0 def to_device(batch, device): output = {} @@ -42,59 +60,6 @@ def update(self, num): return self.mean -def get_tokenizer(model_name_or_path, fast_tokenizer=True): - if "llama" in model_name_or_path: - from transformers.models.llama import LlamaTokenizer - tokenizer = LlamaTokenizer.from_pretrained( - model_name_or_path, fast_tokenizer=fast_tokenizer) - if tokenizer.pad_token is None: - # assert tokenizer.eos_token is not None - # tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token}) - tokenizer.add_special_tokens({'pad_token': '[PAD]'}) - tokenizer.padding_side = 'right' - else: - tokenizer = AutoTokenizer.from_pretrained( - model_name_or_path, fast_tokenizer=fast_tokenizer) - tokenizer.pad_token = tokenizer.eos_token - # make sure tokenizer is right pad in our logic - tokenizer.padding_side = 'right' - return tokenizer - - -def load_hf_tokenizer(model_name_or_path, fast_tokenizer=True): - if os.path.exists(model_name_or_path): - # Locally tokenizer loading has some issue, so we need to force download - model_json = os.path.join(model_name_or_path, "config.json") - if os.path.exists(model_json): - model_json_file = json.load(open(model_json)) - model_name = model_json_file["_name_or_path"] - tokenizer = get_tokenizer(model_name, - fast_tokenizer=fast_tokenizer) - else: - tokenizer = 
get_tokenizer(model_name_or_path, - fast_tokenizer=fast_tokenizer) - - return tokenizer - - -def save_hf_format(model, tokenizer, args, sub_folder=""): - # used to save huggingface format, so we can use it for hf.from_pretrained - model_to_save = model.module if hasattr(model, 'module') else model - CONFIG_NAME = "config.json" - WEIGHTS_NAME = "pytorch_model.bin" - output_dir = os.path.join(args.output_dir, sub_folder) - os.makedirs(output_dir, exist_ok=True) - output_model_file = os.path.join(output_dir, WEIGHTS_NAME) - output_config_file = os.path.join(output_dir, CONFIG_NAME) - save_dict = model_to_save.state_dict() - for key in list(save_dict.keys()): - if "lora" in key: - del save_dict[key] - torch.save(save_dict, output_model_file) - model_to_save.config.to_json_file(output_config_file) - tokenizer.save_vocabulary(output_dir) - - def set_random_seed(seed): if seed is not None: set_seed(seed) @@ -110,20 +75,21 @@ def get_all_reduce_mean(tensor): return tensor -def get_optimizer_grouped_parameters( - model, - weight_decay, - lora_lr=5e-4, - no_decay_name_list=["bias", "LayerNorm.weight"], - lora_name_list=["lora_right_weight", "lora_left_weight"], -): +def get_optimizer_grouped_parameters(model, + weight_decay, + no_decay_name_list=[ + "bias", "LayerNorm.weight" + ], + small_learning_rate_list= + ["embed"], small_lr=1e-4): + optimizer_grouped_parameters = [ { "params": [ p for n, p in model.named_parameters() - if (not any(nd in n for nd in no_decay_name_list) - and p.requires_grad and not any(nd in n - for nd in lora_name_list)) + if (not any(nd in n + for nd in no_decay_name_list) and (not any(nd in n + for nd in small_learning_rate_list)) and p.requires_grad) ], "weight_decay": weight_decay, @@ -131,27 +97,36 @@ def get_optimizer_grouped_parameters( { "params": [ p for n, p in model.named_parameters() - if (not any(nd in n for nd in no_decay_name_list) - and p.requires_grad and any(nd in n - for nd in lora_name_list)) + if (any(nd in n + for nd in no_decay_name_list) and (not any(nd in n + for nd in small_learning_rate_list)) and p.requires_grad) + ], + "weight_decay": + 0.0, + }, + { + "params": [ + p for n, p in model.named_parameters() + if (not any(nd in n + for nd in no_decay_name_list) and (any(nd in n + for nd in small_learning_rate_list)) and p.requires_grad) ], "weight_decay": weight_decay, - "lr": - lora_lr + "lr": small_lr }, { "params": [ p for n, p in model.named_parameters() if (any(nd in n - for nd in no_decay_name_list) and p.requires_grad) + for nd in no_decay_name_list) and (any(nd in n + for nd in small_learning_rate_list)) and p.requires_grad) ], "weight_decay": 0.0, + "lr": small_lr }, ] - if not optimizer_grouped_parameters[1]["params"]: - optimizer_grouped_parameters.pop(1) return optimizer_grouped_parameters @@ -178,12 +153,35 @@ def moving_average(model, model_ema, beta=0.992, device=None, zero_stage=0): data = data.to(device) param_ema.data.copy_(torch.lerp(data, param_ema.data, beta)) - -def save_zero_three_model(model_ema, global_rank, save_dir, zero_stage=0): +def save_hf_format(model, tokenizer, args, sub_folder=""): + # used to save huggingface format, so we can use it for hf.from_pretrained + model_to_save = model.module if hasattr(model, 'module') else model + CONFIG_NAME = "config.json" + WEIGHTS_NAME = "pytorch_model.bin" + output_dir = os.path.join(args.output_dir, sub_folder) + os.makedirs(output_dir, exist_ok=True) + output_model_file = os.path.join(output_dir, WEIGHTS_NAME) + output_config_file = os.path.join(output_dir, CONFIG_NAME) + 
save_dict = model_to_save.state_dict() + # for key in list(save_dict.keys()): + # if "lora" in key: + # del save_dict[key] + torch.save(save_dict, output_model_file) + try: + model_to_save.config.to_json_file(output_config_file) + except: + args_dict = vars(args) + torch.save(args_dict,os.path.join(output_dir, 'train_args.pt')) + print ("config can't be saved") + # tokenizer.save_vocabulary(output_dir) + tokenizer.save_pretrained(output_dir) # this will save all tokenizer files + +def save_zero_three_model(model_ema, global_rank, save_dir, zero_stage=0, sub_folder=""): zero_stage_3 = (zero_stage == 3) - os.makedirs(save_dir, exist_ok=True) + output_dir = os.path.join(save_dir, sub_folder) + os.makedirs(output_dir, exist_ok=True) WEIGHTS_NAME = "pytorch_model.bin" - output_model_file = os.path.join(save_dir, WEIGHTS_NAME) + output_model_file = os.path.join(output_dir, WEIGHTS_NAME) model_to_save = model_ema.module if hasattr(model_ema, 'module') else model_ema @@ -198,7 +196,7 @@ def save_zero_three_model(model_ema, global_rank, save_dir, zero_stage=0): with deepspeed.zero.GatheredParameters(_z3_params_to_fetch([v ]), enabled=zero_stage_3): - v_p = v.data.cpu() + v_p = v.data.clone().detach().cpu() # this is a hack to get around the fact that we can't get the data from the param else: v_p = v.cpu() if global_rank == 0 and "lora" not in k: diff --git a/benchmarks/inference/mii/README.md b/benchmarks/inference/mii/README.md new file mode 100644 index 000000000..f9a825daa --- /dev/null +++ b/benchmarks/inference/mii/README.md @@ -0,0 +1,32 @@ +# Benchmarking Scripts for DeepSpeed-FastGen + +## Run the Benchmark + +The benchmarking scripts use DeepSpeed-FastGen in the persistent mode. +You can start the server with the command below: + +```bash +python server.py [options] start +``` + +Use the -h option to view all available options. To stop the server, use this command: + +```bash +python server.py stop +``` + +Once the server is up and running, initiate the client using the command below. The -h option will display all the possible options. + +```bash +python run_benchmark_client.py [options] +``` + +The run_all.sh script performs benchmarks across various model sizes and client numbers. For VLLM benchmarks, use the run_all_vllm.sh script. Results are logged in a directory named logs.[BENCHMARK_PARAMETERS]. + +## Analyze the Benchmark Results + +The scripts mentioned below were used for generating the plots featured in our blog. Specify the root directory for log files using --log_dir. + +- `plot_th_lat.py`: This script generates charts for throughput and latency across different model sizes and client counts. +- `plot_effective_throughput.py`: Use this to chart effective throughput. +- `plot_latency_percentile.py`: This script will plot the 50th, 90th, and 95th percentile latencies. 
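+
+For example, assuming the benchmark logs were collected under `logs.release` (the default `--log_dir` of `plot_th_lat.py`), the throughput-latency charts can be generated with:
+
+```bash
+python plot_th_lat.py --log_dir logs.release --out_dir charts/throughput_latency
+```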
diff --git a/benchmarks/inference/mii/plot_effective_throughput.py b/benchmarks/inference/mii/plot_effective_throughput.py new file mode 100644 index 000000000..357fc7f9e --- /dev/null +++ b/benchmarks/inference/mii/plot_effective_throughput.py @@ -0,0 +1,156 @@ +import argparse +from pathlib import Path +import glob +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd + +from postprocess_results import read_json, get_tokenizer + +RAGGED_BATCH_SIZE = 768 +SLA_PROMPT_TOKENS_PER_SEC = 512 +SLA_GEN_TOKENS_PER_SEC = [1, 2, 3, 4, 6, 8] +EMA_SPAN = 16 + +tp_sizes = { + "7b": [1], + "70b": [4, 8], +} + +prompt_gen_pairs = [ + (1200, 60), + (1200, 128), + (2600, 60), + (2600, 128), +] + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--log_dir", type=Path, default=".") + parser.add_argument("--out_dir", type=Path, default="charts/goodtput") + args = parser.parse_args() + return args + + +def check_token_latency_step(response_details, token_index): + P50_token_latency = np.percentile([r.token_gen_time[token_index] for r in response_details if len(r.token_gen_time) > token_index], 50) + P90_token_latency = np.percentile([r.token_gen_time[token_index] for r in response_details if len(r.token_gen_time) > token_index], 90) + P99_token_latency = np.percentile([r.token_gen_time[token_index] for r in response_details if len(r.token_gen_time) > token_index], 99) + + return P50_token_latency, P90_token_latency, P99_token_latency + + +def validate_token_cum_latency_SLA(response_detail, sla_token_gen): + cumsum_latencies = np.cumsum(np.array(response_detail.token_gen_time[1:])) + return all([cumsum_latencies[i] <= (1 / sla_token_gen) * (i + 1) for i in range(len(cumsum_latencies))]) + + +def validate_token_ema_latency_SLA(response_detail, sla_token_gen, ema_span): + ema_latency = pd.Series(response_detail.token_gen_time[1:]).ewm(span=ema_span).mean().values.tolist() + return all([t < 1. 
/ sla_token_gen for t in ema_latency]) + + +def validate_prompt_latency_SLA(response_detail, sla_token_gen, f): + tokenizer = get_tokenizer() + prompt_length = len(tokenizer.tokenize(response_detail.prompt)) + prompt_latency_SLA = prompt_length / SLA_PROMPT_TOKENS_PER_SEC + if prompt_latency_SLA < response_detail.token_gen_time[0]: + return False + + if len(response_detail.token_gen_time) == 1: + return True + + return f[0](response_detail, sla_token_gen, *f[1]) + + +def calc_throughput(response_details): + start_time = min([r.start_time for r in response_details]) + end_time = max([r.end_time for r in response_details]) + return len(response_details) / (end_time - start_time) + + +def extract_values(file_pattern, sla_token_gen, validate_func): + files = glob.glob(file_pattern) + print(f"Found {len(files)} files") + goodputs = {} + good_ratios = {} + for f in files: + prof_args, response_details = read_json(f) + client_num = prof_args["client_num"] + num_req_ok = len([r for r in response_details if validate_prompt_latency_SLA(r, sla_token_gen, validate_func)]) + goodputs[client_num] = calc_throughput(response_details) * (num_req_ok / len(response_details)) + good_ratios[client_num] = num_req_ok / len(response_details) + + return goodputs, good_ratios + + +def display_results(model_size, tp, bs, sla_token_gen, prompt, gen, log_dir, out_dir): + if not log_dir.exists(): + print(f"Log directory {log_dir} does not exist") + return + + if not out_dir.exists(): + out_dir.mkdir(parents=True, exist_ok=True) + + print(f"model: {model_size} Prompt: {prompt}, Generation: {gen}, TP: {tp} sla_token_gen: {sla_token_gen}") + + mii_file_pattern = f"{log_dir}/logs.llama2-{model_size}-tp{tp}-b{bs}/llama2-{model_size}-tp{tp}-b{bs}_c*_p{prompt}_g{gen}.json" + vllm_file_pattern = f"{log_dir}/logs.vllm-llama2-{model_size}-tp{tp}/vllm-llama2-{model_size}-tp{tp}_c*_p{prompt}_g{gen}.json" + + validate_funcs = [ + (validate_token_cum_latency_SLA, (), "cum"), + (validate_token_ema_latency_SLA, (EMA_SPAN, ), f"ema{EMA_SPAN}"), + ] + + for f in validate_funcs: + + mii_goodputs, mii_good_ratios = extract_values(mii_file_pattern, sla_token_gen, f) + client_num_list = sorted(list(mii_goodputs.keys())) + mii_goodputs_list = [mii_goodputs[client_num] for client_num in client_num_list] + + vllm_goodputs, vllm_good_ratios = extract_values(vllm_file_pattern, sla_token_gen, f) + vllm_goodputs_list = [vllm_goodputs[client_num] for client_num in client_num_list] + + # print(f"MII {mii_goodputs_list} ratio={mii_good_ratios}") + # print(f"vLLM {vllm_goodputs_list} ratio={vllm_good_ratios}") + + # Plotting the scatter plot + plt.figure(figsize=(7, 4)) + plt.scatter(client_num_list, mii_goodputs_list, label=f"DeepSpeed-FastGen", marker="o", color="blue") + plt.scatter(client_num_list, vllm_goodputs_list, label=f"vLLM", marker="x", color="orange") + + fit_x_list = np.arange(min(client_num_list), max(client_num_list), 0.1) + mii_fit_model = np.polyfit(client_num_list, mii_goodputs_list, 4) + mii_model_fn = np.poly1d(mii_fit_model) + plt.plot(fit_x_list, mii_model_fn(fit_x_list), color="blue", alpha=0.5, linestyle="--") + + vllm_fit_model = np.polyfit(client_num_list, vllm_goodputs_list, 4) + vllm_model_fn = np.poly1d(vllm_fit_model) + plt.plot(fit_x_list, vllm_model_fn(fit_x_list), color="orange", alpha=0.5, linestyle="--") + + title = f"Effective throughput (SLA prompt: {SLA_PROMPT_TOKENS_PER_SEC} tokens/s, generation: {sla_token_gen} tokens/s)\n" \ + + f'Llama 2 {model_size.upper()} Prompt: {prompt}, Generation: {gen}, TP: {tp}' + 
plt.title(title, fontsize=10) + plt.xlabel('Number of clients', fontsize=10) + plt.ylabel('Effective throughput (queries/s)', fontsize=10) + # plt.rcParams['figure.subplot.bottom'] = 0.30 + plt.ylim(bottom=-0.05) + plt.legend() + plt.grid(True) + # plt.show() + out_file = out_dir / f"goodput_llama{model_size}_SLAp{SLA_PROMPT_TOKENS_PER_SEC}g{sla_token_gen}_tp{tp}_b{bs}_p{prompt}g{gen}_{f[2]}.png" + plt.savefig(out_file) + plt.clf() + print(f"Saved {out_file}") + + +if __name__ == "__main__": + args = get_args() + + for model_size, tps in tp_sizes.items(): + for tp in tps: + for prompt, gen in prompt_gen_pairs: + for sla_token_gen in SLA_GEN_TOKENS_PER_SEC: + display_results(model_size, tp, RAGGED_BATCH_SIZE, sla_token_gen, prompt, gen, args.log_dir, args.out_dir) + diff --git a/benchmarks/inference/mii/plot_latency_percentile.py b/benchmarks/inference/mii/plot_latency_percentile.py new file mode 100644 index 000000000..c91c78bf1 --- /dev/null +++ b/benchmarks/inference/mii/plot_latency_percentile.py @@ -0,0 +1,110 @@ +import argparse +import glob +from pathlib import Path +import matplotlib.pyplot as plt +import numpy as np +import itertools + +from postprocess_results import read_json, get_token_latency + +bs = 768 +SKIP_HEAD_TOKEN_NUM = 2 +SKIP_REQUEST_NUM = 100 + +tp_sizes = { + "70b": [4], +} + +prompt_gen_pairs = [ + (2600, 128), +] + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--log_dir", type=Path, default=".") + parser.add_argument("--out_dir", type=Path, default="charts/percentile_token_latency") + args = parser.parse_args() + return args + + +def extract_values(file_pattern): + files = glob.glob(file_pattern) + + latencies = {} + for f in files: + prof_args, response_details = read_json(f) + client_num = prof_args["client_num"] + + response_details.sort(key=lambda r: r.start_time) + response_details = response_details[SKIP_REQUEST_NUM:-SKIP_REQUEST_NUM] + token_latencies = [r.token_gen_time[SKIP_HEAD_TOKEN_NUM:-1] for r in response_details] + + flat_latency_list = list(itertools.chain(*token_latencies)) + latencies[client_num] = flat_latency_list + return latencies + + +def output_charts(model_size, tp, bs, prompt, gen, log_dir, out_dir): + if not log_dir.exists(): + print(f"Log directory {log_dir} does not exist") + return + + if not out_dir.exists(): + out_dir.mkdir(parents=True, exist_ok=True) + + mii_file_pattern = f"{log_dir}/logs.llama2-{model_size}-tp{tp}-b{bs}/llama2-{model_size}-tp{tp}-b{bs}_c*_p{prompt}_g{gen}.json" + vllm_file_pattern = f"{log_dir}/logs.vllm-llama2-{model_size}-tp{tp}/vllm-llama2-{model_size}-tp{tp}_c*_p{prompt}_g{gen}.json" + + mii_latencies = extract_values(mii_file_pattern) + vllm_latencies = extract_values(vllm_file_pattern) + client_num_list = sorted(list(mii_latencies.keys())) + + for client_num in client_num_list: + plt.figure(figsize=(6, 4)) + + percentile = 95 + + P50_vllm_val = np.percentile(vllm_latencies[client_num], 50) + P50_mii_val = np.percentile(mii_latencies[client_num], 50) + P90_vllm_val = np.percentile(vllm_latencies[client_num], 90) + P90_mii_val = np.percentile(mii_latencies[client_num], 90) + P95_vllm_val = np.percentile(vllm_latencies[client_num], 95) + P95_mii_val = np.percentile(mii_latencies[client_num], 95) + + # print(f"P50_vllm_val={P50_vllm_val}") + # print(f"P50_mii_val={P50_mii_val}") + # print(f"P90_vllm_val={P90_vllm_val}") + # print(f"P90_mii_val={P90_mii_val}") + # print(f"P95_vllm_val={P95_vllm_val}") + # print(f"P95_mii_val={P95_mii_val}") + + out_file = out_dir / 
f"p{percentile}_token_latency_llama{model_size}_c{client_num}_tp{tp}_p{prompt}g{gen}.png" + + x1 = [1, 2, 3] + y1 = [P50_vllm_val, P90_vllm_val, P95_vllm_val] + + x2 = [1.3, 2.3, 3.3] + y2 = [P50_mii_val, P90_mii_val, P95_mii_val] + + label_x = ['P50', 'P90', 'P95'] + + plt.bar(x1, y1, width=0.3, label='vLLM', align="center", color="orange") + plt.bar(x2, y2, width=0.3, label="DeepSpeed-FastGen", align="center", color="blue") + plt.ylabel('Latency', fontsize=14) + plt.legend(loc=2) + + plt.xticks([1.15, 2.15, 3.15], label_x) + + plt.savefig(out_file) + print(f"Saved {out_file}") + + +if __name__ == "__main__": + args = get_args() + + for model_size, tps in tp_sizes.items(): + for tp in tps: + for prompt, gen in prompt_gen_pairs: + output_charts(model_size, tp, bs, prompt, gen, args.log_dir, args.out_dir) + diff --git a/benchmarks/inference/mii/plot_repl_scale.py b/benchmarks/inference/mii/plot_repl_scale.py new file mode 100644 index 000000000..394c54588 --- /dev/null +++ b/benchmarks/inference/mii/plot_repl_scale.py @@ -0,0 +1,95 @@ +import glob +import matplotlib.pyplot as plt +import argparse +from pathlib import Path +import numpy as np + +from postprocess_results import read_json, get_summary + +bs = 768 + +REPLICA_NUMS = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16] + +tp_sizes = { + "70b": [4], +} + +prompt_gen_pairs = [ + (2600, 60), +] + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--log_dir", type=Path, default=".") + parser.add_argument("--out_dir", type=Path, default="charts/repl_scale") + args = parser.parse_args() + return args + + +def extract_values(file_pattern): + files = glob.glob(file_pattern) + + clients = [] + throughputs = [] + latencies = [] + for f in files: + prof_args, response_details = read_json(f) + summary = get_summary(prof_args, response_details) + clients.append(prof_args["client_num"]) + throughputs.append(summary.throughput) + latencies.append(summary.latency) + + return clients, throughputs, latencies + + +def output_charts(model_size, tp, bs, prompt, gen, log_dir, out_dir): + if not log_dir.exists(): + print(f"Log directory {log_dir} does not exist") + return + + if not out_dir.exists(): + out_dir.mkdir(parents=True, exist_ok=True) + + throughputs = {} + for repl in REPLICA_NUMS: + mii_file_pattern = f"{log_dir}/logs.llama2-{model_size}-tp{tp}-b{bs}_repl{repl}/llama2-{model_size}-tp{tp}-b{bs}_repl{repl}_c*_p{prompt}_g{gen}.json" + print(f"Looking for {mii_file_pattern}") + clients, mii_throughputs, mii_latencies = extract_values(mii_file_pattern) + + for c, th in zip(clients, mii_throughputs): + client_per_repl = c // repl + if client_per_repl not in throughputs: + throughputs[client_per_repl] = [] + print(f"Throughput for {client_per_repl} clients: {th}") + throughputs[client_per_repl].append(th) + + for c in throughputs: + + # Plotting the scatter plot + plt.figure(figsize=(6, 4)) + + plt.bar(REPLICA_NUMS, throughputs[c], color="blue", alpha=0.9) + + fit_x_list = np.arange(min(REPLICA_NUMS), max(REPLICA_NUMS), 0.1) + mii_fit_model = np.polyfit(REPLICA_NUMS, throughputs[c], 1) + mii_model_fn = np.poly1d(mii_fit_model) + plt.plot(fit_x_list, mii_model_fn(fit_x_list), color="blue", linestyle="--") + + plt.title(f'Model Llama 2 {model_size.upper()}, Prompt: {prompt}, Generation: {gen}, TP: {tp}') + plt.xlabel('Number of replicas', fontsize=14) + plt.ylabel('Throughput (queries/s)', fontsize=14) + plt.grid(True) + plt.tight_layout() + # plt.show() + out_file = out_dir / 
f"repl_scale_llama{model_size}_tp{tp}_p{prompt}g{gen}.png" + plt.savefig(out_file) + + +if __name__ == "__main__": + args = get_args() + + for model_size, tps in tp_sizes.items(): + for tp in tps: + for prompt, gen in prompt_gen_pairs: + output_charts(model_size, tp, bs, prompt, gen, args.log_dir, args.out_dir) + diff --git a/benchmarks/inference/mii/plot_th_lat.py b/benchmarks/inference/mii/plot_th_lat.py new file mode 100644 index 000000000..8ede6e818 --- /dev/null +++ b/benchmarks/inference/mii/plot_th_lat.py @@ -0,0 +1,98 @@ +import glob +import matplotlib.pyplot as plt +import argparse +from pathlib import Path +import numpy as np + +from postprocess_results import read_json, get_summary + +bs = 768 + +tp_sizes = { + "7b": [1], + "70b": [4, 8], +} + +prompt_gen_pairs = [ + (1200, 60), + (1200, 128), + (2600, 60), + (2600, 128), +] + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--log_dir", type=Path, default="logs.release") + parser.add_argument("--out_dir", type=Path, default="charts/throughput_latency") + args = parser.parse_args() + return args + + +def extract_values(file_pattern): + files = glob.glob(file_pattern) + + print(f"Found {len(files)}") + print('\n'.join(files)) + + clients = [] + throughputs = [] + latencies = [] + for f in files: + prof_args, response_details = read_json(f) + summary = get_summary(prof_args, response_details) + clients.append(prof_args["client_num"]) + throughputs.append(summary.throughput) + latencies.append(summary.latency) + + return clients, throughputs, latencies + + +def output_charts(model_size, tp, bs, prompt, gen, log_dir, out_dir): + if not log_dir.exists(): + print(f"Log directory {log_dir} does not exist") + return + + if not out_dir.exists(): + out_dir.mkdir(parents=True, exist_ok=True) + + mii_file_pattern = f"{log_dir}/logs.llama2-{model_size}-tp{tp}-b{bs}/llama2-{model_size}-tp{tp}-b{bs}_c*_p{prompt}_g{gen}.json" + vllm_file_pattern = f"{log_dir}/logs.vllm-llama2-{model_size}-tp{tp}/vllm-llama2-{model_size}-tp{tp}_c*_p{prompt}_g{gen}.json" + + _, mii_throughputs, mii_latencies = extract_values(mii_file_pattern) + _, vllm_throughputs, vllm_latencies = extract_values(vllm_file_pattern) + + # Plotting the scatter plot + plt.figure(figsize=(6, 4)) + + plt.scatter(vllm_throughputs, vllm_latencies, label=f"vLLM", marker="x", color="orange") + fit_vllm_x_list = np.arange(min(vllm_throughputs), max(vllm_throughputs), 0.01) + vllm_vllm_model = np.polyfit(vllm_throughputs, vllm_latencies, 3) + vllm_model_fn = np.poly1d(vllm_vllm_model) + plt.plot(fit_vllm_x_list, vllm_model_fn(fit_vllm_x_list), color="orange", alpha=0.5, linestyle="--") + + plt.scatter(mii_throughputs, mii_latencies, label=f"DeepSpeed FastGen", marker="o", color="blue") + fit_mii_x_list = np.arange(min(mii_throughputs), max(mii_throughputs), 0.01) + mii_fit_model = np.polyfit(mii_throughputs, mii_latencies, 3) + mii_model_fn = np.poly1d(mii_fit_model) + plt.plot(fit_mii_x_list, mii_model_fn(fit_mii_x_list), color="blue", alpha=0.5, linestyle="--") + + plt.title(f'Model Llama 2 {model_size.upper()}, Prompt: {prompt}, Generation: {gen}, TP: {tp}') + plt.xlabel('Throughput (queries/s)', fontsize=14) + plt.ylabel('Latency', fontsize=14) + plt.legend() + plt.grid(True) + plt.tight_layout() + # plt.show() + out_file = out_dir / f"th_lat_curve_llama{model_size}_tp{tp}_p{prompt}g{gen}.png" + print(f"Saving {out_file}") + plt.savefig(out_file) + + +if __name__ == "__main__": + args = get_args() + + for model_size, tps in tp_sizes.items(): + for tp in tps: + 
for prompt, gen in prompt_gen_pairs: + output_charts(model_size, tp, bs, prompt, gen, args.log_dir, args.out_dir) + diff --git a/benchmarks/inference/mii/plot_tp_sizes.py b/benchmarks/inference/mii/plot_tp_sizes.py new file mode 100644 index 000000000..546310258 --- /dev/null +++ b/benchmarks/inference/mii/plot_tp_sizes.py @@ -0,0 +1,98 @@ +import glob +import matplotlib.pyplot as plt +import argparse +from pathlib import Path +import numpy as np + +from postprocess_results import read_json, get_summary + +bs = 768 + +tp_sizes = { + # "7b": [1], + "13b": [1, 2, 4], + # "70b": [4, 8], +} + +prompt_gen_pairs = [ + (1200, 60), + (1200, 128), + (2600, 60), + (2600, 128), + (2600, 256), +] + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--log_dir", type=Path, default="logs.release") + parser.add_argument("--out_dir", type=Path, default="charts/tp_sizes") + args = parser.parse_args() + return args + + +def extract_values(file_pattern): + files = glob.glob(file_pattern) + + print(f"Found {len(files)}") + print('\n'.join(files)) + + clients = [] + throughputs = [] + latencies = [] + for f in files: + prof_args, response_details = read_json(f) + summary = get_summary(prof_args, response_details) + clients.append(prof_args["client_num"]) + throughputs.append(summary.throughput) + latencies.append(summary.latency) + + return clients, throughputs, latencies + + +def output_charts(model_size, tps, bs, prompt, gen, log_dir, out_dir): + if not log_dir.exists(): + print(f"Log directory {log_dir} does not exist") + return + + if not out_dir.exists(): + out_dir.mkdir(parents=True, exist_ok=True) + + # Plotting the scatter plot + plt.figure(figsize=(6, 4)) + + colors = ["orange", "green", "brown"] + + for tp, color in zip(tps, colors): + mii_file_pattern = f"{log_dir}/logs.llama2-{model_size}-tp{tp}-b{bs}/llama2-{model_size}-tp{tp}-b{bs}_c*_p{prompt}_g{gen}.json" + _, mii_throughputs, mii_latencies = extract_values(mii_file_pattern) + + if len(mii_throughputs) == 0: + continue + + n_params = int(model_size[:-1]) + tflops_per_query = n_params * (prompt + gen) * 2 * 1e-3 + mii_tflops = [th * tflops_per_query / tp for th in mii_throughputs] + + plt.scatter(mii_tflops, mii_latencies, label=f"TP={tp}", marker="o", color=color) + fit_mii_x_list = np.arange(min(mii_tflops), max(mii_tflops), 0.01) + mii_fit_model = np.polyfit(mii_tflops, mii_latencies, 3) + mii_model_fn = np.poly1d(mii_fit_model) + plt.plot(fit_mii_x_list, mii_model_fn(fit_mii_x_list), color=color, alpha=0.5, linestyle="--") + + plt.title(f'Model Llama 2 {model_size.upper()}, Prompt: {prompt}, Generation: {gen}, TP: {tps}') + plt.xlabel('TFLOPs (per GPU)', fontsize=14) + plt.ylabel('Latency', fontsize=14) + plt.legend() + plt.grid(True) + # plt.show() + out_file = out_dir / f"tp_sizes_llama{model_size}_tp{'_'.join([str(tp) for tp in tps])}_p{prompt}g{gen}.png" + plt.savefig(out_file) + + +if __name__ == "__main__": + args = get_args() + + for model_size, tps in tp_sizes.items(): + for prompt, gen in prompt_gen_pairs: + output_charts(model_size, tps, bs, prompt, gen, args.log_dir, args.out_dir) + diff --git a/benchmarks/inference/mii/postprocess_results.py b/benchmarks/inference/mii/postprocess_results.py new file mode 100644 index 000000000..cb2000d5f --- /dev/null +++ b/benchmarks/inference/mii/postprocess_results.py @@ -0,0 +1,112 @@ +import argparse +from pathlib import Path +import json +import numpy as np +from statistics import mean +from functools import reduce +from dataclasses import dataclass +from typing 
import List + +from transformers import AutoTokenizer + + +tokenizer = None + + +@dataclass +class ResponseDetails: + generated_tokens: List[str] + prompt: str + start_time: float + end_time: float + model_time: float + token_gen_time: List[float] + + +@dataclass +class ProfilingSummary: + throughput: float + latency: float + token_gen_latency: float + first_token_latency: float + tokens_per_sec: float + + +def parse_args(): + parser = argparse.ArgumentParser(description="Postprocess results") + parser.add_argument('-i', '--input_path', type=Path, default="results.json") + + args = parser.parse_args() + return args + + +def get_tokenizer(): + global tokenizer + if tokenizer is None: + tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf") + return tokenizer + + +def read_json(file_path): + with open(file_path, 'r') as f: + data = json.load(f) + + args = data["args"] + + response_details = [] + for response in data["response_details"]: + response_details.append(ResponseDetails(**response)) + + return args, response_details + + +def get_summary(args, response_details): + client_num = args["client_num"] + + # Calculate latency and throughput using P95 latency + latency = mean([r.end_time - r.start_time for r in response_details]) + throughput = client_num / latency + + tokens_per_sec = mean([(len(get_tokenizer().tokenize(r.prompt)) + len(r.generated_tokens)) / (r.end_time - r.start_time) for r in response_details]) + first_token_latency = mean([r.token_gen_time[0] for r in response_details]) + + token_gen_latency_flat = reduce(list.__add__, [r.token_gen_time[1:-1] for r in response_details if len(r.token_gen_time) > 2]) + token_gen_latency = mean([t for t in token_gen_latency_flat]) + + return ProfilingSummary(throughput, latency, token_gen_latency, first_token_latency, tokens_per_sec) + + +def get_token_latency(response_details, percentile=None, variance=False, cumulative=False): + req_latencies = [r.token_gen_time for r in response_details] + if cumulative: + req_latencies = [np.cumsum(np.array(r.token_gen_time)).tolist() for r in response_details] + max_gen_length = max([len(r.generated_tokens) for r in response_details]) + latency = [] + for i in range(max_gen_length): + if variance: + token_latency_step = np.var([latency[i] for latency in req_latencies if len(latency) > i]) + if percentile is None: + token_latency_step = [latency[i] for latency in req_latencies if len(latency) > i] + else: + token_latency_step = np.percentile([latency[i] for latency in req_latencies if len(latency) > i], percentile) + + latency.append(token_latency_step) + + return latency + + +def get_token_acc_latency(response_details, percentile=99): + return get_token_latency(response_details, percentile, cumulative=True) + + +if __name__ == "__main__": + args = parse_args() + prof_args, response_details = read_json(args.input_path) + + ps = get_summary(prof_args, response_details) + print(f"Deployment: {prof_args['deployment_name']} Clients: {prof_args['client_num']}, " + + f"Query throughput: {ps.throughput:.3f} queries/s, " + + f"Token throughput (total): {ps.tokens_per_sec:.3f} tokens/s, " + + f"Query latency: {ps.latency:.3f} s, " + + f"Token generation latency: {ps.token_gen_latency:.3f} s/token, " + + f"First token received: {ps.first_token_latency:.3f} s") diff --git a/benchmarks/inference/mii/random_query_generator.py b/benchmarks/inference/mii/random_query_generator.py new file mode 100644 index 000000000..b8442af4f --- /dev/null +++ b/benchmarks/inference/mii/random_query_generator.py @@ 
-0,0 +1,30 @@ +import torch +import random +import numpy as np +import time + +class RandomQueryGenerator: + def __init__(self, input_text, tokenizer, seed): + self.input_text = input_text + self.tokenizer = tokenizer + + torch.manual_seed(seed) + random.seed(seed) + np.random.seed(seed) + + def get_random_request_text(self, length, variance, max_length, batch): + request_text = [] + tokenized_input = self.tokenizer.batch_encode_plus([self.input_text], + return_tensors="pt", + padding=False) + offset = list(range(512)) + random.shuffle(offset) + + text_ids = tokenized_input["input_ids"][0] + for i in range(batch): + # Set max_new_tokens following normal distribution with mean=max_new_tokens and std=0.3*max_new_tokens + req_prompt_length = min(int(np.random.normal(length, variance)), max_length) + + text = self.tokenizer.decode(text_ids[i:req_prompt_length+i]) + request_text.append(text) + return request_text diff --git a/benchmarks/inference/mii/run_all.sh b/benchmarks/inference/mii/run_all.sh new file mode 100644 index 000000000..ca504a6c9 --- /dev/null +++ b/benchmarks/inference/mii/run_all.sh @@ -0,0 +1,25 @@ +RAGGED_BATCH_SIZE=768 +PARAM_SIZES=(7b 13b 70b) + +declare -A TP_SIZES +TP_SIZES["7b"]="1" +TP_SIZES["13b"]="1:2:4" +TP_SIZES["70b"]="4:8" + +for PARAM_SIZE in ${PARAM_SIZES[@]}; do + + IFS=':' read -ra TP_VALUES <<< ${TP_SIZES[${PARAM_SIZE}]} + for TP in ${TP_VALUES[@]}; do + DEPLOYMENT_NAME=llama2-${PARAM_SIZE}-tp${TP}-b${RAGGED_BATCH_SIZE} + python server.py --model_name meta-llama/Llama-2-${PARAM_SIZE}-hf -d ${DEPLOYMENT_NAME} -m ${TP} -b ${RAGGED_BATCH_SIZE} start + + DEPLOYMENT_NAME=${DEPLOYMENT_NAME} PROMPT_LENGTH=2600 MAX_NEW_TOKENS=60 bash ./run_benchmark_client.sh + DEPLOYMENT_NAME=${DEPLOYMENT_NAME} PROMPT_LENGTH=2600 MAX_NEW_TOKENS=128 bash ./run_benchmark_client.sh + DEPLOYMENT_NAME=${DEPLOYMENT_NAME} PROMPT_LENGTH=1200 MAX_NEW_TOKENS=60 bash ./run_benchmark_client.sh + DEPLOYMENT_NAME=${DEPLOYMENT_NAME} PROMPT_LENGTH=1200 MAX_NEW_TOKENS=128 bash ./run_benchmark_client.sh + + echo "Stopping server" + python server.py -d ${DEPLOYMENT_NAME} stop + sleep 120 + done +done diff --git a/benchmarks/inference/mii/run_all_replica.sh b/benchmarks/inference/mii/run_all_replica.sh new file mode 100644 index 000000000..b3fba0408 --- /dev/null +++ b/benchmarks/inference/mii/run_all_replica.sh @@ -0,0 +1,25 @@ +RAGGED_BATCH_SIZE=768 +PARAM_SIZES=(7b) +REPLICA_NUMS=(1) + +declare -A TP_SIZES +TP_SIZES["7b"]="4" +TP_SIZES["13b"]="1" +TP_SIZES["70b"]="4" + +for PARAM_SIZE in ${PARAM_SIZES[@]}; do + IFS=':' read -ra TP_VALUES <<< ${TP_SIZES[${PARAM_SIZE}]} + for TP in ${TP_VALUES[@]}; do + for REPL in ${REPLICA_NUMS[@]}; do + DEPLOYMENT_NAME=llama2-${PARAM_SIZE}-tp${TP}-b${RAGGED_BATCH_SIZE}_repl${REPL} + python server.py --model_name meta-llama/Llama-2-${PARAM_SIZE}-hf -d ${DEPLOYMENT_NAME} -m ${TP} -r ${REPL} -b ${RAGGED_BATCH_SIZE} start + + REQUEST_NUM=$((256 * ${REPL})) + DEPLOYMENT_NAME=${DEPLOYMENT_NAME} PROMPT_LENGTH=2600 MAX_NEW_TOKENS=60 CLIENT_NUMS=$((16 * ${REPL})) REQUEST_NUM=$((256 * ${REPL})) bash ./run_bench_client_num.sh + + echo "Stopping server" + python server.py -d ${DEPLOYMENT_NAME} stop + sleep 120 + done + done +done diff --git a/benchmarks/inference/mii/run_all_vllm.sh b/benchmarks/inference/mii/run_all_vllm.sh new file mode 100644 index 000000000..572377f13 --- /dev/null +++ b/benchmarks/inference/mii/run_all_vllm.sh @@ -0,0 +1,26 @@ +RAGGED_BATCH_SIZE=768 +PARAM_SIZES=(7b 13b 70b) + +declare -A TP_SIZES +TP_SIZES["7b"]="1" +TP_SIZES["13b"]="1:2:4" 
+TP_SIZES["70b"]="4:8" + +for PARAM_SIZE in ${PARAM_SIZES[@]}; do + + IFS=':' read -ra TP_VALUES <<< ${TP_SIZES[${PARAM_SIZE}]} + for TP in ${TP_VALUES[@]}; do + DEPLOYMENT_NAME=vllm-llama2-${PARAM_SIZE}-tp${TP} + python -m vllm.entrypoints.api_server --host 127.0.0.1 --port 26500 --tensor-parallel-size ${TP} --model meta-llama/Llama-2-${PARAM_SIZE}-hf & + sleep 60 + + DEPLOYMENT_NAME=${DEPLOYMENT_NAME} PROMPT_LENGTH=2600 MAX_NEW_TOKENS=60 VLLM="--vllm" bash ./run_benchmark_client.sh + DEPLOYMENT_NAME=${DEPLOYMENT_NAME} PROMPT_LENGTH=2600 MAX_NEW_TOKENS=128 VLLM="--vllm" bash ./run_benchmark_client.sh + DEPLOYMENT_NAME=${DEPLOYMENT_NAME} PROMPT_LENGTH=1200 MAX_NEW_TOKENS=60 VLLM="--vllm" bash ./run_benchmark_client.sh + DEPLOYMENT_NAME=${DEPLOYMENT_NAME} PROMPT_LENGTH=1200 MAX_NEW_TOKENS=128 VLLM="--vllm" bash ./run_benchmark_client.sh + + echo "Stopping server" + pkill -u ${USER} -f vllm.entrypoints.api_server + sleep 30 + done +done diff --git a/benchmarks/inference/mii/run_benchmark_client.py b/benchmarks/inference/mii/run_benchmark_client.py new file mode 100644 index 000000000..77377a93a --- /dev/null +++ b/benchmarks/inference/mii/run_benchmark_client.py @@ -0,0 +1,304 @@ +import os +import time +import random +import argparse +import queue +import multiprocessing +import threading +from statistics import mean +from dataclasses import dataclass, asdict +from typing import List, Iterable +from pathlib import Path +from datetime import datetime +import numpy as np + +from transformers import AutoTokenizer +from random_query_generator import RandomQueryGenerator +from sample_input import all_text +import time +import json +import asyncio +import requests + +from postprocess_results import get_summary, ResponseDetails + +MAX_PROMPT_LENGTH = 4000 +PROMPT_LENGTH_VAR = 0.3 +MAX_NEW_TOKENS_VAR = 0.3 + +def parse_args(): + parser = argparse.ArgumentParser(description="Benchmark MII services") + parser.add_argument("-k", + "--max_new_tokens", + type=int, + default=60, + help="min and max num tokens argument for huggingface") + parser.add_argument("-d", + "--deployment_name", + type=str, + default="benchmark_deployment") + parser.add_argument("-n", + "--num_queries", + type=int, + help="number of queries to run", + default=10) + parser.add_argument("-w", + "--warmup", + type=int, + help="number of queries for warming up", + default=1) + parser.add_argument("-c", + "--client_num", + type=int, + help="number of parallel client processes", + default=2) + parser.add_argument("-l", + "--prompt_length", + type=int, + default=2600) + parser.add_argument('--use_thread', action='store_true', + help='use thread to run parallel clients, otherwise use multiprocessing', + default=False) + parser.add_argument('--stream', action='store_true', default=True) + parser.add_argument('--vllm', action='store_true', default=False) + parser.add_argument('-o', '--out_json_path', type=Path, default=None) + + args = parser.parse_args() + return args + + +def call_mii(client, input_tokens, max_new_tokens, stream): + output_tokens = [] + token_gen_time = [] + time_last_token = 0 + + def callback(response): + nonlocal time_last_token + # print(f"Received: {response.response} time_last_token={time_last_token}") + output_tokens.append(response.response[0]) + time_now = time.time() + token_gen_time.append(time_now - time_last_token) + time_last_token = time_now + + postprocess_config = { + "logit_processor": { + # "name": "TopP", + # "args": { + # "top_p": 0.9 + # } + "name": "Temperature", + "args": { + "temperature": 0.9 + } 
+ }, + "sampler": { + "name": "Logits" + }, + "stop_criterion": { + "name": "EosGeneration" + } + } + + time_last_token = start_time = time.time() + token_gen_time = [] + if stream: + output_tokens = [] + client.generate( + input_tokens, max_new_tokens=max_new_tokens, postprocess_config=postprocess_config, + streaming_fn=callback) + else: + result = client.generate( + input_tokens, max_new_tokens=max_new_tokens, postprocess_config=postprocess_config) + output_tokens = result.response[0] + + return ResponseDetails( + generated_tokens=output_tokens, + prompt=input_tokens, + start_time=start_time, + end_time=time.time(), + model_time=0, + token_gen_time=token_gen_time) + + +def call_vllm(input_tokens, max_new_tokens, stream=True): + api_url = "http://localhost:26500/generate" + headers = {"User-Agent": "Benchmark Client"} + pload = { + "prompt": input_tokens, + "n": 1, + "use_beam_search": False, + "temperature": 1.0, + "top_p": 0.9, + "max_tokens": max_new_tokens, + "ignore_eos": False, + "stream": stream, + } + def clear_line(n: int = 1) -> None: + LINE_UP = '\033[1A' + LINE_CLEAR = '\x1b[2K' + for _ in range(n): + print(LINE_UP, end=LINE_CLEAR, flush=True) + + def get_streaming_response(response: requests.Response, time_last_token) -> Iterable[List[str]]: + for chunk in response.iter_lines(chunk_size=8192, decode_unicode=False, + delimiter=b"\0"): + if chunk: + data = json.loads(chunk.decode("utf-8")) + output = data["text"][0] + time_now = time.time() + yield output, time_now - time_last_token + time_last_token = time_now + + def get_response(response: requests.Response) -> List[str]: + data = json.loads(response.content) + output = data["text"] + return output + + start_time = time.time() + response = requests.post(api_url, headers=headers, json=pload, stream=stream) + if stream: + token_gen_time = [] + for h, t in get_streaming_response(response, start_time): + output = h + token_gen_time.append(t) + + return ResponseDetails( + generated_tokens=output, + prompt=input_tokens, + start_time=start_time, + end_time=time.time(), + model_time=0, + token_gen_time=token_gen_time) + else: + output = get_response(response) + raise NotImplementedError("Not implemented for non-streaming") + + +def _run_parallel(deployment_name, warmup, barrier, query_queue, result_queue, client_num, stream, vllm): + pid = os.getpid() + session_id = f"test_session_p{pid}_t{threading.get_ident()}" + + event_loop = asyncio.new_event_loop() + asyncio.set_event_loop(event_loop) + if not vllm: + import mii + client = mii.client(deployment_name) + + barrier.wait() + + for _ in range(warmup): + print(f"warmup queue size: {query_queue.qsize()} ({pid})", flush=True) + input_tokens, req_max_new_tokens = query_queue.get(timeout=1.0) + + if vllm: + call_vllm(input_tokens, req_max_new_tokens, stream) + else: + call_mii(client, input_tokens, req_max_new_tokens, stream) + + barrier.wait() + + time.sleep(random.uniform(0, client_num) * 0.01) + try: + while not query_queue.empty(): + print(f"queue size: {query_queue.qsize()} ({pid})", flush=True) + input_tokens, req_max_new_tokens = query_queue.get(timeout=1.0) + + # Set max_new_tokens following normal distribution + if vllm: + r = call_vllm(input_tokens, req_max_new_tokens) + else: + r = call_mii(client, input_tokens, req_max_new_tokens, stream) + + result_queue.put(r) + except queue.Empty: + print(f"queue is empty ({pid})") + + print(f"Worker ({pid}) finished. 
session_id: {session_id}") + + +def run_client(client_num, deployment_name, prompt_length, max_new_tokens, num_queries, warmup, stream, vllm, use_thread=False): + """ + Run MII client for benchmarking. The scenario is a bit complicated: + 1. The main process puts `num_queries` queries into the input queue + 2. Each client runs `warmup` iterations () taking the queries from the input queue + 3. --- barrier --- + 4. The main process marks the start time + 5a. All clients send `num_queries' query in total and put the results into the result queue + 5b. The main process takes the results from the result queue (in parallel with 5a) + 6. The main process marks the end time after receiving `num_queries' results + """ + + if use_thread: + runnable_cls = threading.Thread + barrier_cls = threading.Barrier + queue_cls = queue.Queue + else: + runnable_cls = multiprocessing.Process + barrier_cls = multiprocessing.Barrier + queue_cls = multiprocessing.Queue + + barrier = barrier_cls(client_num + 1) + query_queue = queue_cls() + result_queue = queue_cls() + + processes = [runnable_cls(target=_run_parallel, + args=(deployment_name, warmup, barrier, query_queue, result_queue, client_num, stream, vllm)) + for i in range(client_num)] + for p in processes: + p.start() + + tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf") + query_generator = RandomQueryGenerator(all_text, tokenizer, seed=42) + MAX_PROMPT_LENGTH = 4000 + request_text = query_generator.get_random_request_text(prompt_length, prompt_length*PROMPT_LENGTH_VAR, MAX_PROMPT_LENGTH, num_queries + warmup*client_num) + + for t in request_text: + req_max_new_tokens = int(np.random.normal(max_new_tokens, MAX_NEW_TOKENS_VAR*max_new_tokens)) + query_queue.put((t, req_max_new_tokens)) + + # Tokenizers must be initialized after fork. + # So we need to fork before putting inputs to the queue. 
+ # We need this barrier to stop child processse from taking inputs before the main process puts them + barrier.wait() + # This barrier is to make sure that all clients have finished warmup + barrier.wait() + + response_details = [] + while len(response_details) < num_queries: + res = result_queue.get() + # vLLM returns concatinated tokens + if vllm: + all_tokens = tokenizer.tokenize(res.generated_tokens) + res.generated_tokens = all_tokens[len(tokenizer.tokenize(res.prompt)):] + response_details.append(res) + + return response_details + +if __name__ == "__main__": + args = parse_args() + print(args) + + if args.out_json_path is not None and not args.out_json_path.parent.exists(): + raise ValueError(f"Parent directory of {args.out_json_path}") + + response_details = run_client(args.client_num, args.deployment_name, + args.prompt_length, + args.max_new_tokens, args.num_queries, args.warmup, + args.stream, args.vllm, args.use_thread) + + args_dict = vars(args) + ps = get_summary(args_dict, response_details) + print(f"Deployment: {args.deployment_name} Clients: {args.client_num}, " + + f"Prompt (mean): {args.prompt_length} tokens, " + + f"Generation (mean): {args.max_new_tokens} tokens, " + + f"Query throughput: {ps.throughput:.3f} queries/s, " + + f"Token throughput (total): {ps.tokens_per_sec:.3f} tokens/s, " + + f"Query latency: {ps.latency:.3f} s, " + + f"Token generation latency: {ps.token_gen_latency:.3f} s/token, " + + f"First token received: {ps.first_token_latency:.3f} s") + + if args.out_json_path is not None: + with open(args.out_json_path, "w") as f: + args_dict["out_json_path"] = str(args.out_json_path) # Path is not JSON serializable + data = {"args": args_dict, "time": str(datetime.now()), "response_details": [asdict(r) for r in response_details]} + json.dump(data, f, indent=2) diff --git a/benchmarks/inference/mii/run_benchmark_client.sh b/benchmarks/inference/mii/run_benchmark_client.sh new file mode 100644 index 000000000..318e9092e --- /dev/null +++ b/benchmarks/inference/mii/run_benchmark_client.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +DEPLOYMENT_NAME=${DEPLOYMENT_NAME:-llama2-7b} +VLLM=${VLLM:-""} + +CLIENT_NUMS=${CLIENT_NUMS:-1 2 4 6 8 12 16 20 24 28 32} +MAX_NEW_TOKENS=${MAX_NEW_TOKENS:-60} +PROMPT_LENGTH=${PROMPT_LENGTH:-3072} +REQUEST_NUM=${REQUEST_NUM:-512} + +LOG_DIR=logs.${DEPLOYMENT_NAME} +mkdir -p ${LOG_DIR} + +for client_num in ${CLIENT_NUMS[@]}; do + RESULT_FILE=${DEPLOYMENT_NAME}_c${client_num}_p${PROMPT_LENGTH}_g${MAX_NEW_TOKENS}.json + + python run_benchmark_client.py -w 1 \ + -d ${DEPLOYMENT_NAME} -n ${REQUEST_NUM} -c ${client_num} \ + -k ${MAX_NEW_TOKENS} -l ${PROMPT_LENGTH} \ + -o ${LOG_DIR}/${RESULT_FILE} \ + ${VLLM} --stream \ + 2>&1 | tee ${LOG_DIR}/bench_client_num_c${client_num}_p${PROMPT_LENGTH}_g${MAX_NEW_TOKENS}.log +done diff --git a/benchmarks/inference/mii/sample_input.py b/benchmarks/inference/mii/sample_input.py new file mode 100644 index 000000000..77d02af5f --- /dev/null +++ b/benchmarks/inference/mii/sample_input.py @@ -0,0 +1,221 @@ + +# This is a sample input consisting of: +# Code & Text + +all_text = '''Deep learning involves the use of neural networks, which are computational models inspired by the structure and functioning of the human brain. These networks consist of interconnected nodes called neurons. Each neuron takes input, performs a computation, and produces an output. + During training, the neural network learns to make accurate predictions by adjusting its internal parameters. 
This adjustment is done using an optimization algorithm called gradient descent. Gradient descent calculates the gradients of a loss function, which measures the discrepancy between the predicted output of the network and the desired output. These gradients indicate the direction and magnitude of parameter updates that will minimize the loss. + The learning rate is an important hyperparameter in gradient descent. It determines the step size taken during parameter updates. A higher learning rate can lead to faster convergence, but it risks overshooting the optimal solution. On the other hand, a lower learning rate may converge more slowly, but it can result in more precise updates. + Activation functions are applied to the output of each neuron in a neural network. They introduce non-linearities, enabling the network to learn complex patterns and relationships in the data. Popular activation functions include the rectified linear unit (ReLU), sigmoid, and hyperbolic tangent (tanh). + By adjusting the parameters of the neural network during training, deep learning models learn to represent and generalize from complex data patterns. They have achieved remarkable success in various tasks, including image recognition, speech recognition, and natural language processing. + Here are the key fundamentals of deep learning for training large language models: + Neural Networks: At the heart of deep learning are artificial neural networks, which are inspired by the structure and functioning of biological neurons in the human brain. These networks consist of interconnected layers of artificial neurons called nodes or units. The nodes receive input, perform computations, and pass the results to the next layer. + Representation Learning: Deep learning models excel at learning meaningful representations of data. In the context of language, the models can automatically learn hierarchical representations of text, capturing complex relationships and semantic structures. + Feedforward and Backpropagation: Deep learning models typically use feedforward neural networks, where information flows from the input layer through intermediate hidden layers to the output layer. The network makes predictions based on the input data, and the prediction error is then backpropagated through the network. Backpropagation calculates gradients that indicate how each parameter in the network should be adjusted to minimize the error. + Activation Functions: Activation functions introduce non-linearities to neural networks, enabling them to learn complex patterns. Common activation functions include the rectified linear unit (ReLU), sigmoid, and hyperbolic tangent (tanh). These functions determine the output of each neuron based on its weighted inputs. + Loss Functions: During training, a loss function is used to measure the discrepancy between the predicted output of the neural network and the desired output. In language modeling tasks, common loss functions include cross-entropy loss, which quantifies the difference in probability distributions. + Optimization Algorithms: Optimization algorithms determine how the network's parameters are updated based on the calculated gradients during backpropagation. Stochastic Gradient Descent (SGD) is a widely used algorithm that iteratively updates the parameters in the direction that minimizes the loss. Variants of SGD, such as Adam or RMSprop, adaptively adjust the learning rate to accelerate convergence. 
+ Regularization Techniques: Deep learning models are prone to overfitting, where they memorize the training data but fail to generalize well to unseen examples. Regularization techniques such as dropout and weight decay are commonly used to prevent overfitting and improve generalization by adding constraints to the model's parameters. + Training on Large-Scale Datasets: Deep learning models, including large language models, require substantial amounts of labeled training data to learn effectively. Large-scale datasets are crucial to expose the model to diverse language patterns and ensure it captures a broad understanding of language. + Parallel Computing: Training large language models is computationally demanding. To accelerate the training process, parallel computing techniques, such as using multiple GPUs or distributed computing systems, are employed. These techniques allow for efficient processing of large datasets and speeding up the training iterations. + Transfer Learning and Fine-tuning: Transfer learning is a technique where a pre-trained model, trained on a large-scale dataset, is used as a starting point for a new task or dataset. Fine-tuning involves adjusting the pre-trained model's parameters on the new dataset to adapt it to the specific task at hand. This approach significantly reduces the training time and data requirements for new models. + The training process of a large language model typically involves the following steps: + Data Collection: A diverse and comprehensive dataset is collected, which typically consists of a vast range of text from sources like books, websites, articles, and other textual resources. The quality and variety of the dataset are crucial to ensure the model learns a broad understanding of language. + Preprocessing: The collected text data is preprocessed to clean and normalize it. This step involves removing irrelevant characters or symbols, converting the text to a consistent format, and organizing it into smaller units such as sentences or paragraphs. + Tokenization: The preprocessed text is divided into individual tokens, which can be as small as words or even subword units. Tokenization helps in representing and processing the text efficiently during training. + Architecture Design: The model architecture, often based on the transformer architecture, is defined. Transformers are neural network models that excel in capturing long-range dependencies in sequential data, making them well-suited for language modeling tasks. + Model Initialization: The model parameters are randomly initialized to start the training process. These parameters will be adjusted iteratively during training to optimize the model's performance. + Training Loop: The model is trained using a large-scale computational infrastructure. The training loop typically involves several iterations over the dataset, known as epochs. During each epoch, the model processes the input data, generates predictions, and compares them with the expected output. The discrepancy between the predicted and expected output is used to compute a loss, which quantifies the model's performance. + Backpropagation and Optimization: Backpropagation is employed to calculate the gradients of the model's parameters with respect to the loss. These gradients indicate the direction and magnitude of the parameter updates needed to minimize the loss. 
Optimization algorithms, such as stochastic gradient descent (SGD) or its variants, are then used to update the model's parameters based on the computed gradients. + Iterative Refinement: Steps 6 and 7 are repeated for multiple epochs, gradually refining the model's performance. The model's ability to generate coherent and contextually relevant responses improves as it learns from the dataset. + Evaluation: The trained model is evaluated on a separate dataset to assess its performance and identify areas for improvement. Various metrics, such as perplexity or accuracy, can be used to evaluate the model's language generation capabilities. + Fine-tuning and Iteration: Based on the evaluation results, the model may undergo fine-tuning or further iterations of training to enhance its performance. This process helps in addressing specific limitations or biases and aligning the model's output more closely with desired expectations. + It's important to note that training a large language model from scratch is a computationally intensive process that requires substantial computational resources, including powerful hardware like GPUs or specialized hardware accelerators, and large-scale distributed systems to handle the massive amount of data and model parameters involved. + Here are ten highly recommended books that can help you learn deep learning: + "Deep Learning" by Ian Goodfellow, Yoshua Bengio, and Aaron Courville: + This comprehensive book covers the fundamental concepts of deep learning, including neural networks, optimization algorithms, and regularization techniques. It also explores advanced topics like generative models and deep reinforcement learning. + "Deep Learning with Python" by François Chollet: + Written by the creator of the Keras deep learning library, this book provides a practical introduction to deep learning with Python. It covers essential concepts, tools, and techniques, and includes hands-on examples and case studies. + "Hands-On Machine Learning with Scikit-Learn, Keras, and TensorFlow" by Aurélien Géron: + This book offers a hands-on approach to learning machine learning and deep learning using popular Python libraries such as Scikit-Learn, Keras, and TensorFlow. It covers various algorithms and provides practical examples and exercises. + "Deep Learning for Computer Vision" by Rajalingappaa Shanmugamani: + Focusing on deep learning techniques for computer vision tasks, this book explores topics such as convolutional neural networks (CNNs), image classification, object detection, and image generation. It includes code examples using Python and popular deep learning frameworks. + "Deep Learning: A Practitioner's Approach" by Josh Patterson and Adam Gibson: + This book offers a practical guide to implementing deep learning solutions using the Deeplearning4j library. It covers key concepts, architectures, and techniques, and includes code examples and case studies. + "Grokking Deep Learning" by Andrew Trask: + Geared towards beginners, this book provides an intuitive and accessible introduction to deep learning concepts. It covers neural networks, backpropagation, gradient descent, and other fundamental topics with clear explanations and visualizations. + "Deep Learning for Natural Language Processing" by Palash Goyal, Sumit Pandey, and Karan Jain: + Focusing on deep learning techniques for natural language processing (NLP), this book explores topics like word embeddings, recurrent neural networks (RNNs), and sequence-to-sequence models. 
It includes code examples using Python and popular NLP libraries. + "Deep Reinforcement Learning" by Pieter Abbeel and John Schulman: + This book provides an in-depth exploration of deep reinforcement learning, a subfield that combines deep learning with reinforcement learning. It covers topics like Q-learning, policy gradients, and deep Q-networks (DQNs) and provides practical examples. + "Deep Learning for Time Series Forecasting" by N.D. Lewis: + Focusing on deep learning techniques for time series data, this book covers topics such as recurrent neural networks (RNNs), long short-term memory (LSTM) networks, and attention models. It includes code examples using Python and popular deep learning frameworks. + "Interpretable Deep Learning" by Christoph Molnar: + This book delves into the challenges and techniques for interpreting and understanding deep learning models. It covers model visualization, feature importance, and other methods for explaining and interpreting deep learning predictions. + These books cover a range of deep learning topics and provide valuable insights and practical guidance for learning and applying deep learning techniques. Choose the ones that align with your interests and learning style to enhance your understanding of deep learning. + Here are 10 popular GitHub projects that can be useful for building large language models (LLMs) or working with natural language processing (NLP) tasks: + TensorFlow: An open-source deep learning framework that provides tools and resources for building and training LLMs. It offers extensive support for various neural network architectures and has a large community. + PyTorch: Another popular deep learning framework that provides a dynamic computational graph and a wide range of tools for building LLMs. It is known for its user-friendly interface and flexibility. + Hugging Face Transformers: A library that provides pre-trained models and a high-level API for natural language understanding (NLU) tasks, including LLMs. It supports popular models like GPT, BERT, and RoBERTa. + Fairseq: A library developed by Facebook AI Research that focuses on sequence modeling tasks, including LLMs. It offers pre-trained models and tools for training and evaluating models using sequence-to-sequence architectures. + AllenNLP: A powerful NLP research library that simplifies the process of building and evaluating deep learning models. It offers pre-built components for common NLP tasks and supports LLMs with various architectures. + OpenAI GPT-3: Although not available on GitHub, OpenAI's GPT-3 language model is widely recognized and can be accessed via the OpenAI API. It offers state-of-the-art language generation capabilities and can be used for various NLP tasks. + BERT: A pre-trained language model developed by Google Research that has achieved exceptional results on various NLP benchmarks. The official implementation is available on GitHub and can be fine-tuned for specific tasks. + spaCy: A popular Python library for NLP tasks that provides efficient and scalable tools for tokenization, named entity recognition, part-of-speech tagging, and more. It integrates well with deep learning frameworks. + FastText: A library developed by Facebook Research that provides efficient tools for text classification and word representation learning. It offers pre-trained word embeddings and supports training LLMs for classification tasks. + NLTK (Natural Language Toolkit): A comprehensive library for NLP tasks in Python. 
It provides various modules for tokenization, stemming, tagging, parsing, and more. Although it doesn't focus explicitly on LLMs, it is widely used for preprocessing text data in NLP pipelines. + These projects offer a range of resources, pre-trained models, and tools that can assist you in building and working with large language models. Make sure to review the documentation and examples provided by each project to understand their capabilities and how they can be integrated into your workflow. + Here are some popular backend libraries that are commonly used for deep learning: + TensorFlow: Developed by Google's Brain Team, TensorFlow is one of the most widely used deep learning frameworks. It provides a flexible and comprehensive ecosystem for building and deploying machine learning models. TensorFlow offers high-level APIs for easy model construction, as well as lower-level APIs for fine-grained control. It supports distributed computing and has extensive community support. + PyTorch: Developed by Facebook's AI Research lab, PyTorch is known for its simplicity and dynamic computational graph. It allows for intuitive model construction and debugging. PyTorch is widely used in both research and industry due to its flexibility, support for dynamic networks, and strong GPU acceleration capabilities. + Keras: Initially developed as a user-friendly deep learning library, Keras is now integrated as the official high-level API in TensorFlow. It provides a user-friendly and modular interface for building neural networks. Keras abstracts away many complexities and allows users to build models with just a few lines of code. It supports multiple backends, including TensorFlow and Theano. + Theano: Although its development has been discontinued, Theano was one of the first widely-used deep learning libraries. It allows for efficient mathematical operations on multi-dimensional arrays and supports GPU acceleration. Theano was influential in shaping the deep learning landscape and served as a precursor to subsequent frameworks. + Caffe: Developed by the Berkeley Vision and Learning Center (BVLC), Caffe is a popular deep learning framework known for its efficiency and simplicity. It is particularly suitable for convolutional neural networks (CNNs) and image-related tasks. Caffe has a clean and expressive architecture description language that makes it easy to define and train deep models. + MXNet: MXNet is an open-source deep learning framework developed by Apache. It offers a flexible and efficient interface for building and deploying neural networks. MXNet provides a hybrid frontend that allows users to seamlessly switch between symbolic and imperative programming. It is known for its scalability and supports multiple programming languages. + Chainer: Chainer is a flexible deep learning framework that focuses on dynamic neural networks. It allows for intuitive model construction using imperative programming, making it easy to define complex architectures and manipulate data within the network. Chainer is known for its "define-by-run" approach, which facilitates dynamic computations. + Microsoft Cognitive Toolkit (CNTK): CNTK is a deep learning framework developed by Microsoft. It provides a highly efficient and scalable implementation of deep neural networks. CNTK supports both declarative and imperative programming models, making it suitable for both research and production-level deployments. 
+ Deeplearning4j: Deeplearning4j is an open-source deep learning library that focuses on scalability and performance. It is designed to integrate with the Java ecosystem and supports distributed computing. Deeplearning4j provides tools for building various types of neural networks and offers integration with other popular libraries like Hadoop and Spark. + PaddlePaddle: PaddlePaddle (PArallel Distributed Deep LEarning) is a deep learning framework developed by Baidu. It emphasizes scalability and supports large-scale distributed training. PaddlePaddle provides a rich set of built-in models and algorithms, making it accessible to both beginners and advanced users. + Each of these backend libraries offers unique features, performance characteristics, and levels of abstraction. The choice of a backend library depends on factors such as your programming language preferences, the complexity of your models, the availability of community support, and the specific requirements of your deep learning project. + Here's an example code snippet that demonstrates how to create a GPT-Neox20B model using the Hugging Face Transformers library and start fine-tuning it with sample data from the '/tmp/wikitext' directory: + + import torch + from transformers import GPTNeoForCausalLM, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments + + # Load the GPT-Neo model and tokenizer + model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-2.7B") + tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-2.7B") + + # Set the path to the training data + data_path = "/tmp/wikitext" + + # Define the dataset and data collator + dataset = TextDataset(tokenizer=tokenizer, file_path=data_path, block_size=128) + data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False) + + # Define the training arguments + training_args = TrainingArguments( + output_dir="./output_dir", # Directory to save the fine-tuned model and logs + overwrite_output_dir=True, + num_train_epochs=3, # Number of training epochs + per_device_train_batch_size=8, + save_steps=500, # Number of updates steps before saving the model + save_total_limit=2, # Limit the total number of saved models + prediction_loss_only=True, + ) + + # Initialize the Trainer + trainer = Trainer( + model=model, + args=training_args, + data_collator=data_collator, + train_dataset=dataset, + ) + + # Start fine-tuning + trainer.train() + + # Save the fine-tuned model + trainer.save_model("./fine-tuned-model") + + To deploy the GPT-Neo model on an HTTP server, you can use a framework like Flask or FastAPI. 
Here's a basic example using Flask: + + Install Flask: + + pip install flask + + Create a Python file, let's call it app.py, and import the necessary libraries: + + from flask import Flask, request, jsonify + from transformers import GPTNeoForCausalLM, GPT2Tokenizer + + # Initialize Flask app + app = Flask(__name__) + + # Load the pretrained GPT-Neo model and tokenizer + model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-2.7B") + tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-2.7B") + Define an endpoint for model inference: + + python + Copy code + @app.route("/generate", methods=["POST"]) + def generate_text(): + data = request.get_json() + input_text = data["input_text"] + + # Tokenize the input text + input_ids = tokenizer.encode(input_text, return_tensors="pt") + + # Generate text with the model + output = model.generate(input_ids, max_length=100, num_return_sequences=1) + generated_text = tokenizer.decode(output[0], skip_special_tokens=True) + + return jsonify({"generated_text": generated_text}) + Run the Flask app: + + python + Copy code + if __name__ == "__main__": + app.run(host="0.0.0.0", port=5000) + The code that includes the text generation sampling functions and provides a commented example: + + import requests + import numpy as np + + class TextGeneratorAPI: + def __init__(self, server_url): + self.server_url = server_url + + def generate_text(self, input_text, sampling_algorithm="greedy", temperature=0.7): + url = f"{self.server_url}/generate" + payload = { + "input_text": input_text, + "sampling_algorithm": sampling_algorithm, + "temperature": temperature + } + response = requests.post(url, json=payload) + generated_text = response.json()["generated_text"] + return generated_text + + def greedy_sampling(self, logits): + return np.argmax(logits) + + def random_sampling(self, logits): + probabilities = np.exp(logits / temperature) + probabilities = probabilities / np.sum(probabilities) + return np.random.choice(len(logits), p=probabilities) + + def top_k_sampling(self, logits, k=10): + indices = np.argsort(logits)[-k:] + probabilities = np.exp(logits[indices] / temperature) + probabilities = probabilities / np.sum(probabilities) + return np.random.choice(indices, p=probabilities) + + def top_p_sampling(self, logits, p=0.9): + sorted_logits = np.sort(logits)[::-1] + cumulative_probs = np.cumsum(np.exp(sorted_logits) / temperature) + indices = np.arange(len(sorted_logits)) + selected_indices = indices[cumulative_probs <= p] + probabilities = np.exp(logits[selected_indices] / temperature) + probabilities = probabilities / np.sum(probabilities) + return np.random.choice(selected_indices, p=probabilities) + In this updated code, the TextGeneratorAPI class includes the additional sampling functions: greedy_sampling, random_sampling, top_k_sampling, and top_p_sampling. These functions take logits (output of the model) as input and return the index of the selected token based on the respective sampling algorithm. + The greedy_sampling function selects the token with the highest probability (argmax) as the next token. The random_sampling function applies a temperature scaling to the logits and then samples from the resulting probability distribution. The top_k_sampling function selects from the top-k tokens with the highest probabilities. The top_p_sampling function selects from the tokens with cumulative probabilities below a certain threshold (top-p). + You can now use the updated TextGeneratorAPI class with the sampling functions. 
Here's an example: + + api = TextGeneratorAPI(server_url="http://localhost:5000") + + input_text = "Once upon a time" + + # Generate text using different sampling algorithms and temperatures + greedy_text = api.generate_text(input_text, sampling_algorithm="greedy") + random_text = api.generate_text(input_text, sampling_algorithm="random") + top_k_text = api.generate_text(input_text, sampling_algorithm="top_k", temperature=0.8) + top_p_text = api.generate_text(input_text, sampling_algorithm="top_p", temperature=0.9) + + print("Greedy Sampling:", greedy_text) + print("Random Sampling:", random_text) + print("Top-k Sampling:", top_k_text) + print("Top-p Sampling:", top_p_text) + Make sure to adjust the server_url with the appropriate URL of your HTTP server, and ensure that the server is running and accessible before making requests through the API. + ''' \ No newline at end of file diff --git a/benchmarks/inference/mii/server.py b/benchmarks/inference/mii/server.py new file mode 100644 index 000000000..2e6164187 --- /dev/null +++ b/benchmarks/inference/mii/server.py @@ -0,0 +1,83 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +import mii +import argparse + +from mii.constants import DeploymentType + +from deepspeed.inference import RaggedInferenceEngineConfig, DeepSpeedTPConfig +from deepspeed.inference.v2.ragged import DSStateManagerConfig + +def start_server(model_name, + deployment_name, + task, + tensor_parallel, + replica_num, + max_ragged_batch_size): + tp_config = DeepSpeedTPConfig(tp_size=tensor_parallel) + mgr_config = DSStateManagerConfig(max_ragged_batch_size=max_ragged_batch_size, max_ragged_sequence_count=max_ragged_batch_size) + inference_config = RaggedInferenceEngineConfig(tensor_parallel=tp_config, + state_manager=mgr_config) + + mii.serve( + model_name, + deployment_name=deployment_name, + tensor_parallel=tensor_parallel, + task=task, + inference_engine_config=inference_config, + replica_num=replica_num + ) + +def stop_server(deployment_name): + mii.client(deployment_name).terminate_server() + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--model_name", + type=str, + default="meta-llama/Llama-2-7b-hf", + help="Name of the model in the model_files to benchmark") + parser.add_argument("-d", + "--deployment_name", + type=str, + default="benchmark_deployment") + parser.add_argument("-t", "--task", type=str, + help="Task type. 
Currently only text-generation is supported", + default="text-generation") + parser.add_argument("-m", + "--tensor_parallel", + type=int, + help="Degree of tensor (model) parallelism", + default=1) + parser.add_argument("-b", + "--ragged_batch_size", + type=int, + help="Max batch size for ragged batching", + default=768) + parser.add_argument("-r", + "--replica_num", + type=int, + help="Number of replicas for load balancing", + default=1) + parser.add_argument("cmd", help="start, stop, or restart") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + + if args.cmd == "start": + start_server(args.model_name, + args.deployment_name, + args.task, + args.tensor_parallel, + args.replica_num, + args.ragged_batch_size) + elif args.cmd == "stop": + print("running stop") + stop_server(args.deployment_name) + else: + raise ValueError(f"Unknown command: {args.cmd}") diff --git a/inference/huggingface/stable-diffusion/local_pipeline_stable_diffusion.py b/inference/huggingface/stable-diffusion/local_pipeline_stable_diffusion.py index ab0b708e6..4774fac4f 100644 --- a/inference/huggingface/stable-diffusion/local_pipeline_stable_diffusion.py +++ b/inference/huggingface/stable-diffusion/local_pipeline_stable_diffusion.py @@ -29,11 +29,12 @@ is_accelerate_available, is_accelerate_version, logging, - randn_tensor, replace_example_docstring, ) -from diffusers.pipeline_utils import DiffusionPipeline +from diffusers.utils.torch_utils import randn_tensor + +from diffusers.pipelines.pipeline_utils import DiffusionPipeline from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker diff --git a/inference/huggingface/stable-diffusion/requirements.txt b/inference/huggingface/stable-diffusion/requirements.txt index 22524d2df..37f9f9ea5 100644 --- a/inference/huggingface/stable-diffusion/requirements.txt +++ b/inference/huggingface/stable-diffusion/requirements.txt @@ -1,4 +1,4 @@ deepspeed torch -diffusers +diffusers>=0.22.3 triton==2.0.0.dev20221202 diff --git a/inference/huggingface/text-generation/README.md b/inference/huggingface/text-generation/README.md index 4fa6faa23..8019aa298 100644 --- a/inference/huggingface/text-generation/README.md +++ b/inference/huggingface/text-generation/README.md @@ -15,6 +15,13 @@ Python dependencies:
 pip install -r requirements.txt
 
+If you are using conda, the following works: +
+conda create -c conda-forge -n deepspeed python=3.10
+conda activate deepspeed
+pip install -r requirements.txt
+deepspeed --num_gpus 1 inference-test.py --model bigscience/bloom-3b --batch_size 2
+
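If you want to sanity-check the installation before running the examples, DeepSpeed ships a `ds_report` utility that prints the detected PyTorch/CUDA versions and which DeepSpeed ops are compatible with your environment. This is an optional step, not required by the examples:

```sh
# Optional: verify the DeepSpeed install and op/CUDA compatibility
ds_report
```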
# Inference Test diff --git a/inference/huggingface/text-generation/arguments.py b/inference/huggingface/text-generation/arguments.py new file mode 100644 index 000000000..b50198ff9 --- /dev/null +++ b/inference/huggingface/text-generation/arguments.py @@ -0,0 +1,21 @@ +from argparse import ArgumentParser +import os + +parser = ArgumentParser() + +parser.add_argument("--model", required=True, type=str, help="model_name") +parser.add_argument("--checkpoint_path", required=False, default=None, type=str, help="model checkpoint path") +parser.add_argument("--save_mp_checkpoint_path", required=False, default=None, type=str, help="save-path to store the new model checkpoint") +parser.add_argument("--batch_size", default=1, type=int, help="batch size") +parser.add_argument("--dtype", default="float16", type=str, choices=["float32", "float16", "int8"], help="data-type") +parser.add_argument("--hf_baseline", action='store_true', help="disable DeepSpeed inference") +parser.add_argument("--use_kernel", action='store_true', help="enable kernel-injection") +parser.add_argument("--max_tokens", default=1024, type=int, help="maximum tokens used for the text-generation KV-cache") +parser.add_argument("--max_new_tokens", default=50, type=int, help="maximum new tokens to generate") +parser.add_argument("--greedy", action='store_true', help="greedy generation mode") +parser.add_argument("--use_meta_tensor", action='store_true', help="use the meta tensors to initialize model") +parser.add_argument("--test_performance", action='store_true', help="enable latency, bandwidth, and throughout testing") +parser.add_argument("--local_rank", type=int, default=int(os.getenv("LOCAL_RANK", "0")), help="local rank") +parser.add_argument("--world_size", type=int, default=int(os.getenv("WORLD_SIZE", "1")), help="world_size") +parser.add_argument("--test_hybrid_engine", action='store_true', help="enable hybrid engine testing") +parser.add_argument("--trust_remote_code", action='store_true', help="Trust remote code for hugging face models") \ No newline at end of file diff --git a/inference/huggingface/text-generation/inference-test.py b/inference/huggingface/text-generation/inference-test.py index 43ec25e57..827d8db35 100644 --- a/inference/huggingface/text-generation/inference-test.py +++ b/inference/huggingface/text-generation/inference-test.py @@ -1,4 +1,3 @@ -from argparse import ArgumentParser from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig import deepspeed import math @@ -7,23 +6,8 @@ import time from utils import DSPipeline, Performance from deepspeed.runtime.utils import see_memory_usage +from arguments import parser -parser = ArgumentParser() - -parser.add_argument("--model", required=True, type=str, help="model_name") -parser.add_argument("--checkpoint_path", required=False, default=None, type=str, help="model checkpoint path") -parser.add_argument("--save_mp_checkpoint_path", required=False, default=None, type=str, help="save-path to store the new model checkpoint") -parser.add_argument("--batch_size", default=1, type=int, help="batch size") -parser.add_argument("--dtype", default="float16", type=str, choices=["float32", "float16", "int8"], help="data-type") -parser.add_argument("--hf_baseline", action='store_true', help="disable DeepSpeed inference") -parser.add_argument("--use_kernel", action='store_true', help="enable kernel-injection") -parser.add_argument("--max_tokens", default=1024, type=int, help="maximum tokens used for the text-generation KV-cache") 
-parser.add_argument("--max_new_tokens", default=50, type=int, help="maximum new tokens to generate") -parser.add_argument("--greedy", action='store_true', help="greedy generation mode") -parser.add_argument("--use_meta_tensor", action='store_true', help="use the meta tensors to initialize model") -parser.add_argument("--test_performance", action='store_true', help="enable latency, bandwidth, and throughout testing") -parser.add_argument("--local_rank", type=int, default=int(os.getenv("LOCAL_RANK", "0")), help="local rank") -parser.add_argument("--world_size", type=int, default=int(os.getenv("WORLD_SIZE", "1")), help="world_size") args = parser.parse_args() if args.hf_baseline and args.world_size > 1: @@ -40,7 +24,8 @@ dtype=data_type, is_meta=args.use_meta_tensor, device=args.local_rank, - checkpoint_path=args.checkpoint_path) + checkpoint_path=args.checkpoint_path, + trust_remote_code=args.trust_remote_code) if args.local_rank == 0: print(f"initialization time: {(time.time()-t0) * 1000}ms") @@ -51,8 +36,15 @@ else: ds_kwargs = dict() -if not args.hf_baseline: - pipe.model = deepspeed.init_inference(pipe.model, +# Use DeepSpeed Hybrid Engine for inference +if args.test_hybrid_engine: + ds_config = {"train_batch_size": args.batch_size, "fp16": {"enabled": True if data_type==torch.half else False}, "hybrid_engine": {"enabled": True}} + pipe.model, *_ = deepspeed.initialize(model=pipe.model, config=ds_config) + pipe.model.eval() +# If not trying with the HuggingFace baseline, use DeepSpeed Inference Engine +else: + if not args.hf_baseline: + pipe.model = deepspeed.init_inference(pipe.model, dtype=data_type, mp_size=args.world_size, replace_with_kernel_inject=args.use_kernel, @@ -99,4 +91,3 @@ print(f"\nin={i}\nout={o}\n{'-'*60}") if args.test_performance: Performance.print_perf_stats(map(lambda t: t / args.max_new_tokens, times), pipe.model.config, args.dtype, args.batch_size) - diff --git a/inference/huggingface/text-generation/requirements.txt b/inference/huggingface/text-generation/requirements.txt index 1e33957f4..a6f484701 100644 --- a/inference/huggingface/text-generation/requirements.txt +++ b/inference/huggingface/text-generation/requirements.txt @@ -1,3 +1,3 @@ deepspeed torch -transformers==4.21.2 +transformers==4.28.1 diff --git a/inference/huggingface/text-generation/utils.py b/inference/huggingface/text-generation/utils.py index 096b2f40d..173eac039 100644 --- a/inference/huggingface/text-generation/utils.py +++ b/inference/huggingface/text-generation/utils.py @@ -21,7 +21,8 @@ def __init__(self, dtype=torch.float16, is_meta=True, device=-1, - checkpoint_path=None + checkpoint_path=None, + trust_remote_code=False, ): self.model_name = model_name self.dtype = dtype @@ -38,18 +39,18 @@ def __init__(self, # the Deepspeed team made these so it's super fast to load (~1 minute), rather than wait 10-20min loading time. 
self.tp_presharded_models = ["microsoft/bloom-deepspeed-inference-int8", "microsoft/bloom-deepspeed-inference-fp16"] - self.tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left") + self.tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left", trust_remote_code=trust_remote_code) self.tokenizer.pad_token = self.tokenizer.eos_token if (is_meta): '''When meta tensors enabled, use checkpoints''' - self.config = AutoConfig.from_pretrained(self.model_name) + self.config = AutoConfig.from_pretrained(self.model_name, trust_remote_code=trust_remote_code) self.repo_root, self.checkpoints_json = self._generate_json(checkpoint_path) with deepspeed.OnDevice(dtype=torch.float16, device="meta"): - self.model = AutoModelForCausalLM.from_config(self.config) + self.model = AutoModelForCausalLM.from_config(self.config, trust_remote_code=trust_remote_code) else: - self.model = AutoModelForCausalLM.from_pretrained(self.model_name) + self.model = AutoModelForCausalLM.from_pretrained(self.model_name, trust_remote_code=trust_remote_code) self.model.eval()
diff --git a/inference/huggingface/zero_inference/README.md b/inference/huggingface/zero_inference/README.md new file mode 100644 index 000000000..f6dd4850e --- /dev/null +++ b/inference/huggingface/zero_inference/README.md @@ -0,0 +1,176 @@
+
+# ZeRO-Inference: 20X faster inference through weight quantization and KV cache offloading
+
+ZeRO-Inference enables inference computation of massive models (with hundreds of billions of parameters) on as few as a single GPU by leveraging multi-level hierarchical memory (e.g., GPU, CPU, and NVMe). It delivers efficient computation for `throughput-oriented` inference scenarios despite the latency of fetching model weights from CPU memory or NVMe over the PCIe interconnect. We [previewed](https://github.com/microsoft/DeepSpeed/pull/1514) this AI democratization technology in late 2021, and followed up in 2022 with a [paper](https://arxiv.org/abs/2207.00032) and [blog](https://www.deepspeed.ai/2022/09/09/zero-inference.html) describing the first full-feature release in [DeepSpeed versions >= 0.6.6](https://github.com/microsoft/DeepSpeed/tree/v0.6.6). We have been continuously pushing out usability and performance updates ever since, and are pleased to announce a major refresh of this popular DeepSpeed feature. This new release leverages two memory optimizations (weight quantization and KV cache offloading) to deliver up to 20X speedup in inference throughput, and is available in [DeepSpeed versions >= 0.10.3](https://github.com/microsoft/DeepSpeed/tree/v0.10.3).
+
+This repo showcases ZeRO-Inference's capability to serve large generative models economically. For these models, the major memory consumption comes from model weights and the KV cache, which limits the maximum batch size (and thus throughput) that can be used in inference. ZeRO-Inference now provides 4-bit quantization of model weights, leading to an approximately $4\times$ reduction in memory usage and PCIe transfer volume. This is a generic, model-agnostic feature (requiring no model change). The highly efficient quantization/dequantization kernels have been integrated into the DeepSpeed framework. Additionally, the KV cache, the other factor limiting system throughput, can now be offloaded to cheaper CPU memory instead of consuming the more expensive HBM capacity.
We demonstrate the ease of enabling our KV cache offloading for arbitrary models by releasing the required modifications for three popular and publicly available HuggingFace models (BLOOM, LLAMA2, and OPT). Refer to [`model-support.md`](model-support.md) for more details.
+
+With these two added techniques, we show the significant throughput and batch size improvements of this new ZeRO-Inference release over the previous one. We further show that ZeRO-Inference surpasses the token generation throughput of SOTA throughput-oriented inference frameworks. Unlike [FlexGen](https://github.com/FMInference/FlexGen), which requires re-implementing models from scratch with its APIs, ZeRO-Inference requires `NO` code change for `4-bit` quantization and offloading of model weights (integrated into the DeepSpeed inference framework), and only minor changes to the model code for KV cache offloading.
+
+We plan to release more performance improvements to ZeRO-Inference, such as partial offloading and KV cache quantization, in the near future. Please check the [Working-In-Progress](#working-in-progress) list and stay tuned.
+
+## Performance and Feature Highlights
+We use a token generation workload for our benchmarking of ZeRO-Inference. We run all our experiments on a single `NVIDIA A6000 GPU` with 48GB of device HBM, in a Lambda workstation with 252GB of host CPU memory and a [CS3040 NVMe 2TB SSD](https://www.pny.com/CS3040-M2-NVMe-SSD?sku=M280CS3040-2TB-RB) with 5600 MB/s sequential read throughput. We configure a prompt length of 512 tokens and a generation length of 32 tokens.
+
+
+### 😽 Overall Throughput Improvement of the new ZeRO-Inference release 😽
+
+

+
+![democratization](./images/over_v1.png)
+
+ Figure 1: ZeRO-Inference throughput improvement (speedup) over the previous version for throughput-oriented inference on various model sizes, on a single NVIDIA A6000 GPU with 48GB of device HBM, 252GB of host CPU memory, and a disk with 5600 MB/s sequential read throughput; prompt=512, gen=32. The speedup originates from our faster generation kernel design, KV cache offloading with hybrid computation, and efficient weight compression.
+
+

+
+
+### 🐼 Comparison with SOTA Throughput-Oriented Inference Framework 🐼
+
+We compare ZeRO-Inference with FlexGen (version 0.1.7), a SOTA inference framework, in terms of both generality across model families and token generation throughput. The results are summarized in the table below.
+
+| Framework | Weight Quantization | KV Cache Offload | OPT-30B | OPT-66B | OPT-175B | BLOOM-176B | LLAMA2-70B |
+|---|---|---|---|---|---|---|---|
+| FlexGen | Yes | No | 21.97 (bsz=48, cpu_offload) | 5.78 (bsz=24, cpu_offload) | 1.50 (bsz=16, cpu_offload) | Unsupported | Unsupported |
+| FlexGen | No | Yes | 13.24 (bsz=200, cpu_offload) | 4.15 (bsz=80, cpu_offload) | 0.33 (bsz=64, nvme_offload) | Unsupported | Unsupported |
+| FlexGen | Yes | Yes | 13.40 (bsz=280, cpu_offload) | 6.24 (bsz=96, cpu_offload) | 1.84 (bsz=40, cpu_offload) | Unsupported | Unsupported |
+| ZeRO-Inference | Yes | No | **22.74** (bsz=24, cpu_offload) | 7.68 (bsz=16, cpu_offload) | 1.21 (bsz=8, cpu_offload) | 0.65 (bsz=4, cpu_offload) | **24.05** (bsz=96, cpu_offload) |
+| ZeRO-Inference | No | Yes | 12.32 (bsz=96, cpu_offload) | 3.63 (bsz=40, cpu_offload) | 0.47 (bsz=32, nvme_offload) | 0.47 (bsz=32, nvme_offload) | 2.91 (bsz=96, cpu_offload) |
+| ZeRO-Inference | Yes | Yes | 19.34 (bsz=128, cpu_offload) | **8.08** (bsz=64, cpu_offload) | **2.26** (bsz=24, cpu_offload) | **1.33** (bsz=24, cpu_offload) | 3.65 (bsz=200, cpu_offload) |
+
+#### Generality
+Unlike FlexGen, which supports only the OPT model family, ZeRO-Inference is designed as a general technique that supports different model families. With our new optimizations, we continue to make it easy for model scientists to run inference on their favorite models using ZeRO-Inference. Our weight quantization optimization is generally applicable to any model without requiring modifications. For KV cache offloading, which requires minor code changes for each model family, we provide the required modifications for three model families (BLOOM, LLAMA2, and OPT) as a guide.
+
+#### Token Generation Throughput
+For fairness, we evaluate the same set of optimizations supported by both FlexGen and ZeRO-Inference, specifically 4-bit weight quantization and KV cache offloading to CPU memory. We measure the impact of the optimizations individually and collectively. We consider model sizes that exceed the available 48GB of HBM, thus requiring that model weights be offloaded to CPU or NVMe. Each data point is reported as `throughput` (`batch size`, memory used for weight offloading), with throughput measured in tokens/sec, and represents the best observed throughput from a batch size sweep. We observe that for the OPT family of models supported by both frameworks, ZeRO-Inference consistently achieves better generation throughput.
+
+## Install
+
+The Python dependencies are captured in `requirements.txt`. Note that KV cache offloading requires our [fork of transformers](https://github.com/tjruwase/transformers/tree/kvcache-offload-cpu); you can install it with `pip install git+https://github.com/tjruwase/transformers@kvcache-offload-cpu` or by running
+
+```sh
+pip install -r requirements.txt
+```
+The model changes are detailed in [`model-support.md`](model-support.md).
+
+## Usage
+
+We provide [`run_model.py`](run_model.py) as the entry script for running ZeRO-Inference. Run
+```sh
+python run_model.py --help
+```
+to see the available options, and refer to [`run_model.sh`](run_model.sh) for more example scripts.
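Before walking through the token generation examples below, here is a rough sketch of the core pattern that `run_model.py` builds on: a ZeRO stage-3 inference config with parameters offloaded to CPU, registered via `HfDeepSpeedConfig` before the model is loaded. It is a minimal sketch under assumed values (model name `facebook/opt-1.3b`, batch size, fp16 dtype), not a drop-in replacement for `run_model.py`.

```python
# Minimal ZeRO-Inference sketch (assumed values; see run_model.py for the full version).
# ZeRO stage 3 keeps parameters offloaded in CPU memory and fetches them on demand
# during the forward pass.
import deepspeed
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.deepspeed import HfDeepSpeedConfig

model_name = "facebook/opt-1.3b"  # illustrative; any supported causal LM
ds_config = {
    "fp16": {"enabled": True},
    "zero_optimization": {
        "stage": 3,
        "offload_param": {"device": "cpu", "pin_memory": False},
    },
    "train_batch_size": 1,  # required by the config schema; no training happens
}

# Must be constructed BEFORE from_pretrained() so weights are partitioned/offloaded on load.
dschf = HfDeepSpeedConfig(ds_config)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16).eval()
model = deepspeed.initialize(model=model, config_params=ds_config)[0].module

tokenizer = AutoTokenizer.from_pretrained(model_name)
inputs = tokenizer("Paris is the capital city of", return_tensors="pt").to("cuda")
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=32)[0]))
```

A script like this would be launched with `deepspeed --num_gpus 1 <script>.py`; the weight-quantization and NVMe-offload options covered below are layered on top of this same config by `run_model.py`.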
+
+## Token Generation with ZeRO-Inference
+
+### Example 1: OPT Models
+
+Here is an example of running the `facebook/opt-13b` model with ZeRO-Inference using 16-bit model weights and offloading the KV cache to CPU:
+
+```sh
+deepspeed --num_gpus 1 run_model.py --model facebook/opt-13b --batch-size 8 --prompt-len 512 --gen-len 32 --cpu-offload --kv-offload
+```
+
+To enable 4-bit weight quantization, use `--quant_bits 4`. The quantization group size defaults to `64` and can be configured through the `--quant_group_size` flag.
+
+```sh
+deepspeed --num_gpus 1 run_model.py --model facebook/opt-13b --batch-size 16 --prompt-len 512 --gen-len 32 --cpu-offload --kv-offload --quant_bits 4
+```
+
+### Example 2: BLOOM Models
+
+Here is an example of running `bigscience/bloom-7b1` with ZeRO-Inference using 4-bit model weights and offloading the KV cache to CPU:
+
+```sh
+deepspeed --num_gpus 1 run_model.py --model bigscience/bloom-7b1 --batch-size 8 --prompt-len 512 --gen-len 32 --cpu-offload --quant_bits 4 --kv-offload
+```
+
+
+### Example 3: LLAMA2 Models
+
+Here is an example of running `meta-llama/Llama-2-7b-hf` with ZeRO-Inference using 4-bit model weights and offloading the KV cache to CPU:
+
+```sh
+deepspeed --num_gpus 1 run_model.py --model meta-llama/Llama-2-7b-hf --batch-size 8 --prompt-len 512 --gen-len 32 --cpu-offload --quant_bits 4 --kv-offload
+```
+
+## Performance Tuning Tips
+
+- While using pinned CPU memory does speed up the offloading data transfer rate, the amount of pinned memory available on a system is much smaller than the total CPU memory, which limits the maximum batch size that can be run. Use `--pin-memory 1` to enable pinned memory and see whether it improves the overall performance on your system. By default, pinned memory is not enabled.
+
+- When using CPU offloading, the batch size that yields the best throughput on a system is not necessarily the maximum batch size one can fit, since system performance can drop when CPU memory is under pressure.
+
+
+## Working-In-Progress
+
+The following features/improvements are part of our work-in-progress. Please stay tuned :smile:
+
+- [ ] KV cache quantization
+- [ ] Partial model weights/KV cache offloading
+- [ ] Compute/IO scheduling for maximum overlapping and reuse
+- [ ] Generalizing model support without any client-side change
+
+## How to Enable INT4 Weight Quantization in ds_config
+
+INT4 weight quantization can be enabled with a few lines of configuration change in your ds_config. The ZeRO-Inference engine will automatically identify all candidate layers and convert their weight tensors into INT4. Currently, we support two modes: quantized initialization and post-initialization quantization.
+
+### Quantized Initialization
+This is the easiest way to get started. By providing a few hints in ds_config, the model is quantized on the fly during model initialization (e.g., `AutoModel.from_pretrained`). All candidate layers will be automatically quantized.
+```python
+import torch
+from transformers import AutoModel
+from transformers.deepspeed import HfDeepSpeedConfig
+
+ds_config = {
+    'weight_quantization': {
+        'quantized_initialization': {
+            'num_bits': 4,
+            'group_size': 64,
+            'group_dim': 1,
+            'symmetric': False
+        },
+    }
+}
+with torch.no_grad():
+    hfds_config = HfDeepSpeedConfig(ds_config)
+    # The model is quantized on the fly while it is loaded.
+    model = AutoModel.from_pretrained('facebook/opt-66b')
+```
+Currently, ZeRO-Inference can quantize the weight matrices of nn.Embedding and nn.Linear into INT4 format. In the example above, we applied group_size=64 and performed asymmetric quantization on the 1st dimension of the weight matrix.
The `group_size` here is configurable based on users' needs.
+
+### Post Initialization Quantization
+In this mode, the model is first loaded in FP16 format and then converted into INT4. The advantage of this mode is that users get an overview of the model architecture and therefore fine-grained control over the quantization decisions, e.g., which layers should be quantized and with which quantization configuration. Only a few lines of code changes are needed. Note that we plan to expand this mode to accommodate more formats in the near future.
+```python
+import torch
+import deepspeed
+from transformers import AutoModel
+from transformers.deepspeed import HfDeepSpeedConfig
+from deepspeed.compression.inference.quantization import _init_group_wise_weight_quantization
+
+ds_config = {
+    'weight_quantization': {
+        'post_init_quant': {
+            'fc': {
+                'num_bits': 4,
+                'group_size': 64,
+                'group_dim': 1,
+                'symmetric': False
+            },
+            'self_attn.q_proj': {
+                'num_bits': 4,
+                'group_size': 64,
+                'group_dim': 1,
+                'symmetric': False
+            },
+        }
+    }
+}
+
+with torch.no_grad():
+    hfds_config = HfDeepSpeedConfig(ds_config)
+    # Model is loaded in FP16
+    model = AutoModel.from_pretrained('facebook/opt-66b', torch_dtype=torch.float16)
+    # Convert to INT4
+    model = _init_group_wise_weight_quantization(model, ds_config)
+    ds_engine = deepspeed.initialize(model=model, config_params=ds_config)[0]
+    ds_engine.module.eval()
+    model = ds_engine.module
+```
+
+In the running example above, only the two fully connected layers (fc1 and fc2) and the attention query projection (q_proj) will be converted into INT4.
+
+## References
+
+- DeepSpeed [ZeRO-Inference](https://www.deepspeed.ai/2022/09/09/zero-inference.html)
+- Sheng, Ying et al. [FlexGen: High-Throughput Generative Inference of Large Language Models with a Single GPU](https://arxiv.org/abs/2303.06865)
+- Shen, Sheng, et al. "Q-BERT: Hessian Based Ultra Low Precision Quantization of BERT." Proceedings of the AAAI Conference on Artificial Intelligence. Vol. 34. No. 05. 2020.
diff --git a/inference/huggingface/zero_inference/images/over_v1.png b/inference/huggingface/zero_inference/images/over_v1.png new file mode 100644 index 000000000..7fab00b15 Binary files /dev/null and b/inference/huggingface/zero_inference/images/over_v1.png differ
diff --git a/inference/huggingface/zero_inference/model-support.md b/inference/huggingface/zero_inference/model-support.md new file mode 100644 index 000000000..01287ecad --- /dev/null +++ b/inference/huggingface/zero_inference/model-support.md @@ -0,0 +1,36 @@
+# Supporting Hugging Face Models via KV-Cache Offloading to CPU
+
+Similar to FlexGen, KV cache offloading is implemented in the client models. For demonstration, we enable KV cache CPU offloading for three Hugging Face (HF) models (BLOOM, LLAMA2, and OPT) through `non-intrusive` changes to the modeling files. These changes are available in our [transformers fork](https://github.com/tjruwase/transformers/tree/kvcache-offload-cpu). To learn more about the exact code changes, compare the differences (conditioned on the `kv_offload` flag) in the respective model files (i.e., `modeling_bloom.py`, `modeling_llama.py`, and `modeling_opt.py`). The following steps are taken to enable KV cache CPU offloading in our implementation. There may be alternative designs/implementations of these steps that are optimal in different system setups.
+
+We detail our current approach below. With the following five steps, KV cache offloading can be easily enabled for any generative model in HF.
+
+## 1. Specify KV cache offloading to the HF model
+
+KV cache offloading is enabled for an HF model by calling the `set_kv_cache_offload()` function before the model runs inference. The function appropriately modifies the HF model's forward functions to trigger the offloading behavior in the attention module.
+
+## 2. Initialize an empty CPU tensor buffer to hold the KV cache
+
+The KV cache tensor has a size of
+`2 * num_layers * batch_size * max_seq_len * hidden_size`, where `2` accounts for both the K and V values, `num_layers` is the number of transformer blocks, `batch_size` is the inference batch size, `max_seq_len` is the total length of the prompt and generated tokens, and `hidden_size` is the model's hidden dimension.
+If the K values and V values require different layouts, two separate tensor buffers can be used to hold them; this is what the BLOOM models use.
+
+This empty tensor allocation can be done at the model initialization stage or at the prompt processing stage of inference.
+Although in theory initializing the empty tensor(s) at model initialization should improve prompt-stage throughput, our experiments show that allocating the KV cache tensor during prompt processing actually leads to slightly better overall throughput in the tested hardware/software environments. Thus, we take this approach in the current implementation.
+
+## 3. Transfer KV values to the CPU buffer in the prompt processing stage
+
+At the model level, we pass the corresponding tensor buffer slice to each transformer block, and then in the prompt stage transfer the generated KV values into the CPU buffer. We use `slice` to easily index into the right buffer location.
+
+Since this GPU-to-CPU data transfer can start as soon as the KV values are projected, we place it on a separate GPU stream and overlap it with the subsequent computation in the attention block.
+
+## 4. Use the CPU KV cache in the decode stage
+
+During the decode stage, after the KV values for the current token are projected, we transfer them to the CPU KV cache. The full KV values on CPU are then used for the following attention score computation.
+
+## 5. Compute attention scores
+
+This can be done either on CPU or on GPU. If done on CPU (slow compute), the projected Q values (less data) need to be transferred to CPU; if done on GPU (fast compute), the KV values (more data) need to be transferred to GPU.
+
+As attention score computation is often memory-bandwidth bound, with a decent CPU the former approach delivers better performance. Thus, our current implementation computes attention scores on CPU.
+Note that when the scores are computed on CPU, the attention output needs to be transferred back to GPU before the subsequent output projection linear layer.
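To make these five steps concrete, here is a minimal, self-contained sketch of the buffer layout and transfer pattern they describe. The names (`KVCacheOffloader`, `write_prompt`, `append_token`) and the assumed tensor layout are hypothetical and only for illustration; the actual changes live inside the attention modules of our transformers fork, conditioned on the `kv_offload` flag.

```python
# Hypothetical sketch of Steps 2-4 (not the code from the transformers fork).
import torch

class KVCacheOffloader:
    def __init__(self, num_layers, batch_size, max_seq_len, num_heads, head_dim,
                 dtype=torch.float16, pin_memory=False):
        # Step 2: one CPU buffer sized [2, num_layers, batch_size, max_seq_len, num_heads, head_dim],
        # where the leading 2 covers the K and V values. Pinning the buffer is optional.
        self.cache = torch.empty(2, num_layers, batch_size, max_seq_len, num_heads, head_dim,
                                 dtype=dtype, device="cpu", pin_memory=pin_memory)

    def write_prompt(self, layer, k, v, non_blocking=False):
        # Step 3: during prompt processing, copy this layer's projected K/V
        # (assumed layout [batch, num_heads, seq_len, head_dim] on GPU) into the CPU slice.
        seq_len = k.shape[2]
        self.cache[0, layer, :, :seq_len].copy_(k.transpose(1, 2), non_blocking=non_blocking)
        self.cache[1, layer, :, :seq_len].copy_(v.transpose(1, 2), non_blocking=non_blocking)

    def append_token(self, layer, k, v, pos):
        # Step 4: during decoding, append the new token's K/V at position `pos`, then
        # return the full CPU-resident K/V for the attention-score computation (Step 5).
        self.cache[0, layer, :, pos].copy_(k.squeeze(2))
        self.cache[1, layer, :, pos].copy_(v.squeeze(2))
        full_k = self.cache[0, layer, :, :pos + 1].transpose(1, 2)
        full_v = self.cache[1, layer, :, :pos + 1].transpose(1, 2)
        return full_k, full_v
```

In this sketch, a decoder layer would call something like `write_prompt` once during prompt processing and `append_token` for every generated token; the `--pin_kv_cache` and `--async_kv_offload` flags of `run_model.py` correspond to the `pin_memory` and `non_blocking` knobs above.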
+ diff --git a/inference/huggingface/zero_inference/requirements.txt b/inference/huggingface/zero_inference/requirements.txt new file mode 100644 index 000000000..fc24734e9 --- /dev/null +++ b/inference/huggingface/zero_inference/requirements.txt @@ -0,0 +1,5 @@ +deepspeed>=0.10.1 +torch +transformers @ git+https://github.com/tjruwase/transformers@kvcache-offload-cpu +packaging +accelerate diff --git a/inference/huggingface/zero_inference/run_bloom175b_a6000.sh b/inference/huggingface/zero_inference/run_bloom175b_a6000.sh new file mode 100755 index 000000000..f997f1d52 --- /dev/null +++ b/inference/huggingface/zero_inference/run_bloom175b_a6000.sh @@ -0,0 +1,32 @@ +export USE_TF=0 +BASE_LOG_DIR=~/experiments/zero_inference/ +MODEL_NAME="bloom" +FULL_MODEL_NAME="bigscience/${MODEL_NAME}" + +OFFLOAD_DIR=/local_nvme/zero_offload +mkdir -p $OFFLOAD_DIR + +QB=4 + +BSZ=8 +LOG_DIR=$BASE_LOG_DIR/${MODEL_NAME}_bs${BSZ} +mkdir -p $LOG_DIR +deepspeed --num_gpus 1 run_model.py --dummy --model ${FULL_MODEL_NAME} --batch-size ${BSZ} --disk-offload --gen-len 32 --pin-memory 0 --offload-dir ${OFFLOAD_DIR} &> $LOG_DIR/ds_${MODEL_NAME}_bs${BSZ}_disk.txt + +BSZ=4 +LOG_DIR=$BASE_LOG_DIR/${MODEL_NAME}_bs${BSZ} +mkdir -p $LOG_DIR +deepspeed --num_gpus 1 run_model.py --dummy --model ${FULL_MODEL_NAME} --batch-size ${BSZ} --cpu-offload --gen-len 32 --pin-memory 0 --offload-dir ${OFFLOAD_DIR} --quant_bits ${QB} &> $LOG_DIR/ds_${MODEL_NAME}_bs${BSZ}_cpu_q${QB}.txt + + +BSZ=32 +LOG_DIR=$BASE_LOG_DIR/${MODEL_NAME}_bs${BSZ} +mkdir -p $LOG_DIR +deepspeed --num_gpus 1 run_model.py --dummy --model ${FULL_MODEL_NAME} --batch-size ${BSZ} --disk-offload --gen-len 32 --pin-memory 0 --offload-dir ${OFFLOAD_DIR} --kv-offload &> $LOG_DIR/ds_${MODEL_NAME}_bs${BSZ}_disk_kv.txt + + +BSZ=24 +LOG_DIR=$BASE_LOG_DIR/${MODEL_NAME}_bs${BSZ} +mkdir -p $LOG_DIR +deepspeed --num_gpus 1 run_model.py --dummy --model ${FULL_MODEL_NAME} --batch-size ${BSZ} --cpu-offload --gen-len 32 --pin-memory 0 --offload-dir ${OFFLOAD_DIR} --quant_bits ${QB} --kv-offload &> $LOG_DIR/ds_${MODEL_NAME}_bs${BSZ}_cpu_q${QB}_kv.txt + diff --git a/inference/huggingface/zero_inference/run_llama2_70b_a6000.sh b/inference/huggingface/zero_inference/run_llama2_70b_a6000.sh new file mode 100755 index 000000000..5b5164e6f --- /dev/null +++ b/inference/huggingface/zero_inference/run_llama2_70b_a6000.sh @@ -0,0 +1,23 @@ +#!/bin/sh +export USE_TF=0 +BASE_LOG_DIR=~/experiments/zero_inference/ +MODEL_NAME="Llama-2-70b-hf" +FULL_MODEL_NAME="meta-llama/${MODEL_NAME}" +QB=4 + +BSZ=64 +LOG_DIR=$BASE_LOG_DIR/${MODEL_NAME}_bs${BSZ} +mkdir -p $LOG_DIR +deepspeed --num_gpus 1 run_model.py --model ${FULL_MODEL_NAME} --batch-size ${BSZ} --cpu-offload --gen-len 32 --pin-memory 1 &> $LOG_DIR/ds_${MODEL_NAME}_bs${BSZ}_cpu_pin.txt + +BSZ=96 +LOG_DIR=$BASE_LOG_DIR/${MODEL_NAME}_bs${BSZ} +mkdir -p $LOG_DIR +deepspeed --num_gpus 1 run_model.py --model ${FULL_MODEL_NAME} --batch-size ${BSZ} --cpu-offload --gen-len 32 --pin-memory 1 --quant_bit ${QB} &> $LOG_DIR/ds_${MODEL_NAME}_bs${BSZ}_cpu_pin_q${QB}.txt +deepspeed --num_gpus 1 run_model.py --model ${FULL_MODEL_NAME} --batch-size ${BSZ} --cpu-offload --gen-len 32 --pin-memory 0 --kv-offload &> $LOG_DIR/ds_${MODEL_NAME}_bs${BSZ}_cpu_kv.txt + + +BSZ=200 +LOG_DIR=$BASE_LOG_DIR/${MODEL_NAME}_bs${BSZ} +mkdir -p $LOG_DIR +deepspeed --num_gpus 1 run_model.py --model ${FULL_MODEL_NAME} --batch-size ${BSZ} --cpu-offload --gen-len 32 --pin-memory 0 --kv-offload --quant_bit ${QB} &> $LOG_DIR/ds_${MODEL_NAME}_bs${BSZ}_cpu_kv_q${QB}.txt diff --git 
a/inference/huggingface/zero_inference/run_model.py b/inference/huggingface/zero_inference/run_model.py new file mode 100644 index 000000000..fea8e0be1 --- /dev/null +++ b/inference/huggingface/zero_inference/run_model.py @@ -0,0 +1,383 @@ +""" +Run OPT with huggingface or deepspeed. + +Reference: +https://github.com/FMInference/FlexGen/blob/main/benchmark/hf_ds/hf_opt.py +""" + +import argparse +import gc +import multiprocessing as mp +import os + +import torch +import deepspeed +from deepspeed.accelerator import get_accelerator +import deepspeed.comm as dist +from accelerate import init_empty_weights +from timer import timers +from transformers import (AutoConfig, AutoTokenizer, AutoModelForCausalLM, + BloomForCausalLM, OPTForCausalLM, LlamaForCausalLM, + ) +from transformers.deepspeed import HfDeepSpeedConfig +from utils import (GB, add_model_hooks, cache_bytes, + get_filename, get_quant_config, hidden_bytes, meta_to_cpu, + model_bytes, write_benchmark_log) +from packaging import version + +assert version.parse(deepspeed.__version__) >= version.parse("0.10.3"), "ZeRO-Inference with weight quantization and kv cache offloading is available only in DeepSpeed 0.10.3+, please upgrade DeepSpeed" + +def get_tokenizer(model_name, config): + if config.model_type == "opt": + # opt175b is not available on HF (at this time), + # so as a hack we use opt66b which has similar tokenizer. + tokenizer = AutoTokenizer.from_pretrained( + model_name.replace("175b", "66b"), + padding_side="left" + ) + else: + tokenizer = AutoTokenizer.from_pretrained(model_name) + + tokenizer.pad_token = tokenizer.eos_token + + return tokenizer + +def get_model_config(model_name): + if "175b" in model_name: + config = AutoConfig.from_pretrained("facebook/opt-66b") + config.hidden_size = 12288 + config.word_embed_proj_dim = 12288 + config.ffn_dim = 12288 * 4 + config.num_attention_heads = 96 + config.num_hidden_layers = 96 + else: + config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) + + if 'bloom' in model_name: + config.model_type = 'bloom' + + return config + +def get_ds_model( + model_name, + cpu_offload, + disk_offload, + offload_dir, + dummy_weights, + bits, + group_size, +): + + config = get_model_config(model_name) + hidden_size = config.hidden_size + deepspeed.init_distributed("nccl") + pin_memory = bool(args.pin_memory) + + if getattr(config, 'torch_dtype', None) is None: + dtype = torch.float16 + else: + dtype = config.torch_dtype + + ds_config = { + "fp16": { + "enabled": dtype == torch.float16, + }, + "bf16": { + "enabled": dtype == torch.bfloat16, + }, + "zero_optimization": { + "stage": 3, + "stage3_prefetch_bucket_size": 2 * hidden_size * hidden_size, # 0, + "stage3_param_persistence_threshold": hidden_size, + "stage3_max_live_parameters": 2 * hidden_size * hidden_size, + }, + "steps_per_print": 2000, + "train_batch_size": args.batch_size, + "wall_clock_breakdown": False, + } + + if bits == 4: + quant_config = get_quant_config(config, bits=bits, group_size=group_size) + ds_config.update(quant_config) + if cpu_offload: + ds_config["zero_optimization"]["offload_param"] = dict( + device="cpu", pin_memory=pin_memory + ) + + if disk_offload: + ds_config["zero_optimization"]["offload_param"] = dict( + device="nvme", + pin_memory=pin_memory, + nvme_path=offload_dir, + buffer_count=5, + buffer_size=9 * GB if config.model_type == 'bloom' else 2 * GB, + ) + ds_config["aio"] = { + "block_size": 1048576, + "queue_depth": 8, + "thread_count": 1, + "single_submit": False, + "overlap_events": True, + } + 
+ dschf = HfDeepSpeedConfig( + ds_config + ) # this tells from_pretrained to instantiate directly on gpus + + # clear cache / free memory + get_accelerator().empty_cache() + gc.collect() + + if config.model_type in ["bloom", "bloom-7b1"]: + model = BloomForCausalLM.from_pretrained( + dummy_weights or model_name, torch_dtype=dtype, + ) + elif config.model_type == "opt": + model = OPTForCausalLM.from_pretrained( + dummy_weights or model_name, torch_dtype=dtype, + ) + elif config.model_type == "llama": + model = LlamaForCausalLM.from_pretrained( + dummy_weights or model_name, torch_dtype=dtype, + ) + else: + raise ValueError(f"Unexpected model type: {config.model_type}") + + model = model.eval() + + + ds_engine = deepspeed.initialize(model=model, config_params=ds_config)[0] + ds_engine.module.eval() + model = ds_engine.module + print(f"model.config = {model.config}") + + return model + + +def run_generation( + model_name, + batch_size, + prompt_len, + gen_len, + cpu_offload, + disk_offload, + offload_dir, + num_nodes, + num_gpus_per_node, + dummy, + output_file, + verbose, + kv_offload, + quant_bits, + quant_group_size, + pin_kv_cache, + async_kv_offload, + loops, +): + # Load tokenizer + config = get_model_config(model_name) + + tokenizer = get_tokenizer(model_name, config) + + if dummy: + filename = os.path.join( + offload_dir, f"{model_name.replace('/', '-')}-hf-weights/" + ) + if not os.path.exists(filename): + print("create dummy weights") + with init_empty_weights(): + if config.model_type == 'opt': + model = OPTForCausalLM(config) + elif config.model_type in ["bloom", "bloom-7b1"]: + model = BloomForCausalLM(config) + elif config.model_type == "llama": + model = LlamaForCausalLM(config) + else: + raise ValueError(f"Unexpected model type: {config.model_type}") + model.save_pretrained( + filename, state_dict=meta_to_cpu(model.state_dict(), torch.float16) + ) + dummy_weights = filename + else: + dummy_weights = None + + print("load model") + with torch.no_grad(): + model = get_ds_model( + model_name, + cpu_offload, + disk_offload, + offload_dir, + dummy_weights, + quant_bits, + quant_group_size, + ) + + # Run generation + execute_gen_len = gen_len + prompts = ["Paris is the capital city of"] * (batch_size // dist.get_world_size()) + + def _batch_encode(prompts): + input_tokens = tokenizer.batch_encode_plus(prompts, return_tensors="pt", padding="max_length", max_length=prompt_len) + for t in input_tokens: + if torch.is_tensor(input_tokens[t]): + input_tokens[t] = input_tokens[t].to(torch.cuda.current_device()) + return input_tokens + + input_tokens = _batch_encode(prompts) + + if kv_offload: + model.set_kv_cache_offload(True, gen_len, pin_kv_cache, async_kv_offload) + + # print(model, model.config) + + + add_model_hooks(model) + + def set_model_stage(model, stage): + model.stage = stage + + # Run + print(f"benchmark, prompt_len = {prompt_len}, execute_gen_len = {execute_gen_len}, input_ids.shape = {input_tokens.input_ids.shape}") + + generate_kwargs = dict(max_new_tokens=execute_gen_len, do_sample=False) + prefill_timings = [] + timer = timers("generate-forward") + for _ in range(loops): + timer.start(sync_func=get_accelerator().synchronize) + with torch.no_grad(): + set_model_stage(model, "prefill") + output_ids = model.generate(**input_tokens, **generate_kwargs) + prefill_timings.append(model.__duration__) + timer.stop(sync_func=get_accelerator().synchronize) + costs = timers("generate-forward").costs + + if args.local_rank != 0: + return + + def remove_model_hooks(module): + if 
hasattr(module, "__start_time_hook_handle__"): + module.__start_time_hook_handle__.remove() + del module.__start_time_hook_handle__ + if hasattr(module, "__end_time_hook_handle__"): + module.__end_time_hook_handle__.remove() + del module.__end_time_hook_handle__ + if hasattr(module, "stage"): + del module.stage + if hasattr(module, "__duration__"): + del module.__duration__ + + # Log output + print(f"Summary:") + print(f"costs = {costs}, prefill_timings = {prefill_timings}") + total_latency = costs[-1] + prefill_latency = prefill_timings[-1] + remove_model_hooks(model) + + prefill_throughput = batch_size * prompt_len / prefill_latency + decode_latency = total_latency - prefill_latency + decode_throughput = batch_size * (gen_len - 1) / max(decode_latency, 1e-10) + num_generated_tokens = batch_size * gen_len + total_throughput = num_generated_tokens / total_latency + gpu_peak_mem = get_accelerator().max_memory_allocated(torch.device("cuda")) + out_str = "" + + if verbose >= 2: + outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True) + show_str = "Outputs:\n" + 70 * "-" + "\n" + for i in [0, (len(outputs) - 1) // 2, len(outputs) - 1]: + show_str += f"{i}: {outputs[i]}\n" + show_str += 70 * "-" + "\n" + print(show_str) + + # Check lengths + input_lens = [len(x) for x in input_tokens.input_ids] + output_lens = [len(x) for x in output_ids] + assert all(x == prompt_len for x in input_lens) + assert all(x == prompt_len + execute_gen_len for x in output_lens) + + if output_file == "auto": + filename = ( + get_filename( + model_name, + batch_size, + prompt_len, + gen_len, + cpu_offload, + disk_offload, + num_nodes, + num_gpus_per_node, + kv_offload, + quant_bits != 16, + ) + + ".log" + ) + else: + filename = output_file + + cache_size = cache_bytes(config, batch_size, prompt_len + gen_len) + hidden_size = hidden_bytes(config, batch_size, prompt_len + gen_len) + log_str = write_benchmark_log( + filename, + model_bytes(config), + cache_size, + hidden_size, + gpu_peak_mem, + prefill_latency, + prefill_throughput, + decode_latency, + decode_throughput, + total_latency, + total_throughput, + ) + if verbose >= 1: + print(log_str) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--model", type=str, default="facebook/opt-1.3b", help="model name or path; currently only supports OPT and BLOOM models") + parser.add_argument("--dummy", action="store_true", help="Use dummy weights for benchmark purposes.") + parser.add_argument("--loops", type=int, default=3, help="Number of token generation iterations") + parser.add_argument("--batch-size", type=int, default=1) + parser.add_argument("--prompt-len", type=int, default=512, help="prompt length") + parser.add_argument("--gen-len", type=int, default=32, help="number of tokens to generate") + parser.add_argument("--local_rank", type=int, help="local rank for distributed inference") + parser.add_argument("--pin-memory", type=int, default=0, help="whether to pinned CPU memory for ZeRO offloading") + parser.add_argument("--cpu-offload", action="store_true", help="Use cpu offload.") + parser.add_argument("--disk-offload", action="store_true", help="Use disk offload.") + parser.add_argument("--offload-dir", type=str, default="~/offload_dir", help="Directory to store offloaded cache.") + parser.add_argument("--kv-offload", action="store_true", help="Use kv cache cpu offloading.") + parser.add_argument("--log-file", type=str, default="auto", help="log file name") + parser.add_argument("--verbose", type=int, 
default=2, help="verbose level") + parser.add_argument("--quant_bits", type=int, default=16, help="model weight quantization bits; either 4 or 8") + parser.add_argument("--quant_group_size", type=int, default=64, help="model weight quantization group size") + parser.add_argument("--pin_kv_cache", action="store_true", help="Allocate kv cache in pinned memory for offloading.") + parser.add_argument("--async_kv_offload", action="store_true", help="Using non_blocking copy for kv cache offloading.") + args = parser.parse_args() + + deepspeed.init_distributed() + num_gpus_per_node = get_accelerator().device_count() + num_nodes = dist.get_world_size() // num_gpus_per_node + + + run_generation( + args.model, + args.batch_size, + args.prompt_len, + args.gen_len, + args.cpu_offload, + args.disk_offload, + os.path.abspath(os.path.expanduser(args.offload_dir)), + num_nodes, + num_gpus_per_node, + args.dummy, + args.log_file, + args.verbose, + args.kv_offload, + args.quant_bits, + args.quant_group_size, + args.pin_kv_cache, + args.async_kv_offload, + args.loops + ) diff --git a/inference/huggingface/zero_inference/run_model.sh b/inference/huggingface/zero_inference/run_model.sh new file mode 100755 index 000000000..6086fd1f5 --- /dev/null +++ b/inference/huggingface/zero_inference/run_model.sh @@ -0,0 +1,42 @@ +#!/bin/sh + +MODEL_NAME=facebook/opt-6.7b # ONLY OPT AND BLOOM MODELS ARE SUPPORTED FOR NOW +BATCHSIZE=80 # batch size +PROMPT_LEN=512 # the length of the prompt +GEN_LEN=32 # number of tokens to generate + +USE_CPU_OFFLOAD=1 # whether to use model weights cpu offloading when running with deepspeed zero inference +USE_KV_OFFLOAD=1 # whether to use kv cache cpu offloading when running with deepspeed zero inference +USE_HF_MODEL=0 # whether to use the original HF model(no kv cache offloading support) or not +USE_QUANT=0 # whether to use model weigths quantization or not + +if [ $USE_CPU_OFFLOAD -eq 1 ]; then + CPU_OFFLOAD="--cpu-offload" +else + CPU_OFFLOAD="" +fi + +if [ $USE_KV_OFFLOAD -eq 1 ]; then + KV_OFFLOAD="--kv-offload" +else + KV_OFFLOAD="" +fi + +if [ $USE_HF_MODEL -eq 1 ]; then + HF_MODEL="--hf-model" +else + HF_MODEL="" +fi + +if [ $USE_HF_MODEL -eq 1 ]; then + QUANT_BTIS="--quant_bits" +else + QUANT_BTIS="" +fi + + +# weight/kv cache cpu examples with small models +# deepspeed --num_gpus 1 run_model.py --model bigscience/bloom-560m --batch-size 3 --cpu-offload --kv-offload +# deepspeed --num_gpus 1 run_model.py --model facebook/opt-125m --batch-size 3 --cpu-offload --kv-offload + +deepspeed --num_gpus 1 run_model.py --model ${MODEL_NAME} --batch-size ${BATCHSIZE} --cpu-offload --prompt-len ${PROMPT_LEN} --gen-len ${GEN_LEN} ${CPU_OFFLOAD} ${KV_OFFLOAD} ${QUANT_BTIS} diff --git a/inference/huggingface/zero_inference/run_opt175b_a6000.sh b/inference/huggingface/zero_inference/run_opt175b_a6000.sh new file mode 100755 index 000000000..225b6ddcd --- /dev/null +++ b/inference/huggingface/zero_inference/run_opt175b_a6000.sh @@ -0,0 +1,49 @@ +export USE_TF=0 +BASE_LOG_DIR=~/experiments/zero_inference/ +MODEL_NAME="opt-175b" +FULL_MODEL_NAME="facebook/${MODEL_NAME}" +QB=4 + +OFFLOAD_DIR=/local_nvme/zero_offload +mkdir -p $OFFLOAD_DIR + +# zero-inference +BSZ=8 +LOG_DIR=$BASE_LOG_DIR/${MODEL_NAME}_bs${BSZ} +mkdir -p $LOG_DIR + +deepspeed --num_gpus 1 run_model.py --dummy --model ${FULL_MODEL_NAME} --batch-size ${BSZ} --disk-offload --gen-len 32 --pin-memory 0 --offload-dir ${OFFLOAD_DIR} &> $LOG_DIR/ds_${MODEL_NAME}_bs${BSZ}_disk.txt +deepspeed --num_gpus 1 run_model.py --dummy --model 
${FULL_MODEL_NAME} --batch-size ${BSZ} --cpu-offload --gen-len 32 --pin-memory 0 --offload-dir ${OFFLOAD_DIR} --quant_bits ${QB} &> $LOG_DIR/ds_${MODEL_NAME}_bs${BSZ}_cpu_q${QB}.txt + +BSZ=32 +LOG_DIR=$BASE_LOG_DIR/${MODEL_NAME}_bs${BSZ} +mkdir -p $LOG_DIR +deepspeed --num_gpus 1 run_model.py --dummy --model ${FULL_MODEL_NAME} --batch-size ${BSZ} --disk-offload --gen-len 32 --pin-memory 0 --offload-dir ${OFFLOAD_DIR} --kv-offload &> $LOG_DIR/ds_${MODEL_NAME}_bs${BSZ}_disk_kv.txt + + +BSZ=24 +LOG_DIR=$BASE_LOG_DIR/${MODEL_NAME}_bs${BSZ} +mkdir -p $LOG_DIR +deepspeed --num_gpus 1 run_model.py --dummy --model ${FULL_MODEL_NAME} --batch-size ${BSZ} --cpu-offload --gen-len 32 --pin-memory 0 --offload-dir ${OFFLOAD_DIR} --quant_bits ${QB} --kv-offload &> $LOG_DIR/ds_${MODEL_NAME}_bs${BSZ}_cpu_q${QB}_kv.txt + + +# flexgen +OFFLOAD_DIR=/local_nvme/flexgen_offload +mkdir -p $OFFLOAD_DIR + +BSZ=16 +LOG_DIR=$BASE_LOG_DIR/${MODEL_NAME}_bs${BSZ} +mkdir -p $LOG_DIR +python -m flexgen.flex_opt --model ${FULL_MODEL_NAME} --path _DUMMY_ --percent 0 0 100 0 100 0 --gpu-batch-size ${BSZ} --pin-weight 0 --num-gpu-batches 1 &> $LOG_DIR/fg_${MODEL_NAME}_bs${BSZ}_disk.txt +python -m flexgen.flex_opt --model ${FULL_MODEL_NAME} --path _DUMMY_ --percent 0 100 100 0 100 0 --gpu-batch-size ${BSZ} --pin-weight 0 --num-gpu-batches 1 --compress-weight &> $LOG_DIR/fg_${MODEL_NAME}_bs${BSZ}_cpu_q4.txt + +BSZ=64 +LOG_DIR=$BASE_LOG_DIR/${MODEL_NAME}_bs${BSZ} +mkdir -p $LOG_DIR +python -m flexgen.flex_opt --model ${FULL_MODEL_NAME} --path _DUMMY_ --percent 0 0 0 100 0 100 --gpu-batch-size ${BSZ} --offload-dir ${OFFLOAD_DIR} --pin-weight 0 --num-gpu-batches 1 &> $LOG_DIR/fg_${MODEL_NAME}_bs${BSZ}_cpu_disk.txt + +BSZ=40 +LOG_DIR=$BASE_LOG_DIR/${MODEL_NAME}_bs${BSZ} +mkdir -p $LOG_DIR +python -m flexgen.flex_opt --model ${FULL_MODEL_NAME} --path _DUMMY_ --percent 0 100 0 100 0 100 --gpu-batch-size ${BSZ} --pin-weight 0 --num-gpu-batches 1 --compress-weight &> $LOG_DIR/fg_${MODEL_NAME}_bs${BSZ}_cpu_q4.txt + diff --git a/inference/huggingface/zero_inference/run_opt1p3b_a6000.sh b/inference/huggingface/zero_inference/run_opt1p3b_a6000.sh new file mode 100755 index 000000000..d8f74765e --- /dev/null +++ b/inference/huggingface/zero_inference/run_opt1p3b_a6000.sh @@ -0,0 +1,29 @@ +#!/bin/sh +export USE_TF=0 +BASE_LOG_DIR=~/experiments/zero_inference/ +MODEL_NAME="opt-6.7b" +FULL_MODEL_NAME="facebook/${MODEL_NAME}" +BSZ=64 +QB=4 +LOG_DIR=$BASE_LOG_DIR/${MODEL_NAME}_bs${BSZ} +mkdir -p $LOG_DIR + + +# deepspeed --num_gpus 1 run_model.py --model ${FULL_MODEL_NAME} --batch-size ${BSZ} --cpu-offload --gen-len 32 --pin-memory 1 +deepspeed --num_gpus 1 run_model.py --model ${FULL_MODEL_NAME} --batch-size ${BSZ} --cpu-offload --gen-len 32 --pin-memory 1 --kv-offload + +# deepspeed --num_gpus 1 run_model.py --model facebook/opt-350m --batch-size 1 --gen-len 32 # --quant_bit 4 +# deepspeed --num_gpus 1 run_model.py --model facebook/opt-350m --batch-size 1 --cpu-offload --gen-len 32 --pin-memory 0 --kv-offload --quant_bit 4 +# deepspeed --num_gpus 1 run_model.py --model facebook/${MSZ} --batch-size ${BSZ} --cpu-offload --gen-len 32 --pin-memory 0 --kv-offload &> $LOG_DIR/ds_${MSZ}_bs${BSZ}_cpu.txt +# deepspeed --num_gpus 1 run_model.py --model facebook/${MSZ} --batch-size ${BSZ} --cpu-offload --gen-len 32 --pin-memory 1 --kv-offload &> $LOG_DIR/ds_${MSZ}_bs${BSZ}_cpu_pin.txt +# deepspeed --num_gpus 1 run_model.py --model facebook/${MSZ} --batch-size ${BSZ} --cpu-offload --gen-len 32 --pin-memory 1 --kv-offload --quant_bit 4 &> 
$LOG_DIR/ds_${MSZ}_bs${BSZ}_cpu_pin_q${QB}.txt + +# # 1.3b flexgen with compute schedule or partial offload +# python -m flexgen.flex_opt --model facebook/${MSZ} --path _DUMMY_ --percent 0 100 0 100 0 100 --gpu-batch-size ${BSZ} --pin-weight 0 --num-gpu-batches 1 &> $LOG_DIR/fg_${MSZ}_bs${BSZ}_cpu.txt +# python -m flexgen.flex_opt --model facebook/${MSZ} --path _DUMMY_ --percent 0 100 0 100 0 100 --gpu-batch-size ${BSZ} --pin-weight 1 --num-gpu-batches 1 &> $LOG_DIR/fg_${MSZ}_bs${BSZ}_cpu_pin.txt + +# mkdir -p $LOG_DIR +# OFFLOAD_DIR=/local_nvme/flexgen_offload +# mkdir -p $OFFLOAD_DIR +# python -m flexgen.flex_opt --model facebook/${MSZ} --path _DUMMY_ --percent 0 0 0 100 0 100 --gpu-batch-size ${BSZ} --offload-dir ${OFFLOAD_DIR} --pin-weight 0 --num-gpu-batches 1 &> $LOG_DIR/fg_${MSZ}_bs${BSZ}_cpu_disk.txt +# python -m flexgen.flex_opt --model facebook/${MSZ} --path _DUMMY_ --percent 0 0 0 100 0 100 --gpu-batch-size ${BSZ} --offload-dir ${OFFLOAD_DIR} --pin-weight 0 --num-gpu-batches 1 --cpu-cache-compute &> $LOG_DIR/fg_${MSZ}_bs${BSZ}_cpu_ccc_disk.txt diff --git a/inference/huggingface/zero_inference/run_opt30b_a6000.sh b/inference/huggingface/zero_inference/run_opt30b_a6000.sh new file mode 100755 index 000000000..1ff9809b3 --- /dev/null +++ b/inference/huggingface/zero_inference/run_opt30b_a6000.sh @@ -0,0 +1,45 @@ +#!/bin/sh +export USE_TF=0 +BASE_LOG_DIR=~/experiments/zero_inference/ +MODEL_NAME="opt-30b" +FULL_MODEL_NAME="facebook/${MODEL_NAME}" +QB=4 + +# zero-inference +BSZ=24 +LOG_DIR=$BASE_LOG_DIR/${MODEL_NAME}_bs${BSZ} +mkdir -p $LOG_DIR +deepspeed --num_gpus 1 run_model.py --model ${FULL_MODEL_NAME} --batch-size ${BSZ} --cpu-offload --gen-len 32 --pin-memory 1 &> $LOG_DIR/ds_${MODEL_NAME}_bs${BSZ}_cpu_pin.txt +deepspeed --num_gpus 1 run_model.py --model ${FULL_MODEL_NAME} --batch-size ${BSZ} --cpu-offload --gen-len 32 --pin-memory 1 --quant_bit 4 &> $LOG_DIR/ds_${MODEL_NAME}_bs${BSZ}_cpu_pin_q${QB}.txt + +BSZ=96 +LOG_DIR=$BASE_LOG_DIR/${MODEL_NAME}_bs${BSZ} +mkdir -p $LOG_DIR +deepspeed --num_gpus 1 run_model.py --model ${FULL_MODEL_NAME} --batch-size ${BSZ} --cpu-offload --gen-len 32 --pin-memory 0 --kv-offload &> $LOG_DIR/ds_${MODEL_NAME}_bs${BSZ}_cpu_kv.txt + + +BSZ=128 +LOG_DIR=$BASE_LOG_DIR/${MODEL_NAME}_bs${BSZ} +mkdir -p $LOG_DIR +deepspeed --num_gpus 1 run_model.py --model ${FULL_MODEL_NAME} --batch-size ${BSZ} --cpu-offload --gen-len 32 --pin-memory 0 --quant_bit ${QB} --kv-offload &> $LOG_DIR/ds_${MODEL_NAME}_bs${BSZ}_cpu_q${QB}_kv.txt + + + +# flexgen +BSZ=48 +LOG_DIR=$BASE_LOG_DIR/${MODEL_NAME}_bs${BSZ} +mkdir -p $LOG_DIR +python -m flexgen.flex_opt --model ${FULL_MODEL_NAME} --path _DUMMY_ --percent 0 100 100 0 100 0 --gpu-batch-size ${BSZ} --pin-weight 0 --num-gpu-batches 1 &> $LOG_DIR/fg_${MODEL_NAME}_bs${BSZ}_cpu.txt +python -m flexgen.flex_opt --model ${FULL_MODEL_NAME} --path _DUMMY_ --percent 0 100 100 0 100 0 --gpu-batch-size ${BSZ} --pin-weight 0 --num-gpu-batches 1 --compress-weight &> $LOG_DIR/fg_${MODEL_NAME}_bs${BSZ}_cpu_q4.txt + +BSZ=200 +LOG_DIR=$BASE_LOG_DIR/${MSZ}_bs${BSZ} +mkdir -p $LOG_DIR +python -m flexgen.flex_opt --model facebook/${MSZ} --path _DUMMY_ --percent 0 100 0 100 0 100 --gpu-batch-size ${BSZ} --pin-weight 0 --num-gpu-batches 1 &> $LOG_DIR/fg_${MSZ}_bs${BSZ}_cpu.txt + +BSZ=280 +LOG_DIR=$BASE_LOG_DIR/${MSZ}_bs${BSZ} +mkdir -p $LOG_DIR +python -m flexgen.flex_opt --model ${FULL_MODEL_NAME} --path _DUMMY_ --percent 0 100 0 100 0 100 --gpu-batch-size ${BSZ} --pin-weight 0 --num-gpu-batches 1 --compress-weight &> 
$LOG_DIR/fg_${MSZ}_bs${BSZ}_cpu_q4.txt + + diff --git a/inference/huggingface/zero_inference/run_opt66b_a6000.sh b/inference/huggingface/zero_inference/run_opt66b_a6000.sh new file mode 100755 index 000000000..f18562eb1 --- /dev/null +++ b/inference/huggingface/zero_inference/run_opt66b_a6000.sh @@ -0,0 +1,44 @@ +#!/bin/sh +export USE_TF=0 +BASE_LOG_DIR=~/experiments/zero_inference/ +MODEL_NAME="opt-66b" +FULL_MODEL_NAME="facebook/${MODEL_NAME}" +QB=4 + +# zero-inference +BSZ=16 +LOG_DIR=$BASE_LOG_DIR/${MODEL_NAME}_bs${BSZ} +mkdir -p $LOG_DIR +deepspeed --num_gpus 1 run_model.py --model ${FULL_MODEL_NAME} --batch-size ${BSZ} --cpu-offload --gen-len 32 --pin-memory 1 &> $LOG_DIR/ds_${MODEL_NAME}_bs${BSZ}_cpu_pin.txt +deepspeed --num_gpus 1 run_model.py --model ${FULL_MODEL_NAME} --batch-size ${BSZ} --cpu-offload --gen-len 32 --pin-memory 1 --quant_bit 4 &> $LOG_DIR/ds_${MODEL_NAME}_bs${BSZ}_cpu_pin_q${QB}.txt + +BSZ=40 +LOG_DIR=$BASE_LOG_DIR/${MODEL_NAME}_bs${BSZ} +mkdir -p $LOG_DIR +deepspeed --num_gpus 1 run_model.py --model ${FULL_MODEL_NAME} --batch-size ${BSZ} --cpu-offload --gen-len 32 --pin-memory 0 --kv-offload &> $LOG_DIR/ds_${MODEL_NAME}_bs${BSZ}_cpu_kv.txt + +BSZ=64 +LOG_DIR=$BASE_LOG_DIR/${MODEL_NAME}_bs${BSZ} +mkdir -p $LOG_DIR +deepspeed --num_gpus 1 run_model.py --model ${FULL_MODEL_NAME} --batch-size ${BSZ} --cpu-offload --gen-len 32 --pin-memory 0 --quant_bit ${QB} --kv-offload &> $LOG_DIR/ds_${MODEL_NAME}_bs${BSZ}_cpu_q${QB}_kv.txt + + +# flexgen +BSZ=16 +LOG_DIR=$BASE_LOG_DIR/${MODEL_NAME}_bs${BSZ} +mkdir -p $LOG_DIR +python -m flexgen.flex_opt --model ${FULL_MODEL_NAME} --path _DUMMY_ --percent 0 100 100 0 100 0 --gpu-batch-size ${BSZ} --pin-weight 0 --num-gpu-batches 1 &> $LOG_DIR/fg_${MODEL_NAME}_bs${BSZ}_cpu.txt +python -m flexgen.flex_opt --model ${FULL_MODEL_NAME} --path _DUMMY_ --percent 0 100 100 0 100 0 --gpu-batch-size ${BSZ} --pin-weight 0 --num-gpu-batches 1 --compress-weight &> $LOG_DIR/fg_${MODEL_NAME}_bs${BSZ}_cpu_q4.txt + + +BSZ=80 +LOG_DIR=$BASE_LOG_DIR/${MSZ}_bs${BSZ} +mkdir -p $LOG_DIR +python -m flexgen.flex_opt --model facebook/${MSZ} --path _DUMMY_ --percent 0 100 0 100 0 100 --gpu-batch-size ${BSZ} --pin-weight 0 --num-gpu-batches 1 &> $LOG_DIR/fg_${MSZ}_bs${BSZ}_cpu.txt + +BSZ=96 +LOG_DIR=$BASE_LOG_DIR/${MSZ}_bs${BSZ} +mkdir -p $LOG_DIR +python -m flexgen.flex_opt --model ${FULL_MODEL_NAME} --path _DUMMY_ --percent 0 100 0 100 0 100 --gpu-batch-size ${BSZ} --pin-weight 0 --num-gpu-batches 1 --compress-weight &> $LOG_DIR/fg_${MSZ}_bs${BSZ}_cpu_q4.txt + + diff --git a/inference/huggingface/zero_inference/timer.py b/inference/huggingface/zero_inference/timer.py new file mode 100644 index 000000000..74f8d533a --- /dev/null +++ b/inference/huggingface/zero_inference/timer.py @@ -0,0 +1,81 @@ +""" +Reference: +https://github.com/FMInference/FlexGen/blob/main/flexgen/timer.py +""" + +"""Global timer for profiling.""" +from collections import namedtuple +import time +from typing import Callable, Any + + +class _Timer: + """An internal timer.""" + + def __init__(self, name: str): + self.name = name + self.started = False + self.start_time = None + + # start-stop timestamp pairs + self.start_times = [] + self.stop_times = [] + self.costs = [] + + def start(self, sync_func: Callable = None): + """Start the timer.""" + assert not self.started, f"timer {self.name} has already been started." 
+ if sync_func: + sync_func() + + self.start_time = time.perf_counter() + self.start_times.append(self.start_time) + self.started = True + + def stop(self, sync_func: Callable = None): + """Stop the timer.""" + assert self.started, f"timer {self.name} is not started." + if sync_func: + sync_func() + + stop_time = time.perf_counter() + self.costs.append(stop_time - self.start_time) + self.stop_times.append(stop_time) + self.started = False + + def reset(self): + """Reset timer.""" + self.started = False + self.start_time = None + self.start_times = [] + self.stop_times = [] + self.costs = [] + + def elapsed(self, mode: str = "average"): + """Calculate the elapsed time.""" + if not self.costs: + return 0.0 + if mode == "average": + return sum(self.costs) / len(self.costs) + elif mode == "sum": + return sum(self.costs) + else: + raise RuntimeError("Supported mode is: average | sum") + + +class Timers: + """A group of timers.""" + + def __init__(self): + self.timers = {} + + def __call__(self, name: str): + if name not in self.timers: + self.timers[name] = _Timer(name) + return self.timers[name] + + def __contains__(self, name: str): + return name in self.timers + + +timers = Timers() diff --git a/inference/huggingface/zero_inference/utils.py b/inference/huggingface/zero_inference/utils.py new file mode 100644 index 000000000..cde9ee2c1 --- /dev/null +++ b/inference/huggingface/zero_inference/utils.py @@ -0,0 +1,178 @@ +import torch +import time + +KB = 1 << 10 +MB = 1 << 20 +GB = 1 << 30 +T = 1e12 + + +global torch_linear_init_backup +global torch_layer_norm_init_backup + +def get_quant_config(model_config, bits: int, group_size: int): + qaunt_config = { + 'weight_quantization': { + 'quantized_initialization' : { + 'num_bits': bits, + 'group_size': group_size, + "group_dim": 1, + "symmetric": False + } + } + } + + return qaunt_config + +def model_bytes(config): + h = config.hidden_size + return 2 * (config.num_hidden_layers * ( + # config-attention + h * (3 * h + 1) + h * (h + 1) + + # mlp + h * (4 * h + 1) + h * 4 * (h + 1) + + # layer norm + h * 4) + + # embedding + config.vocab_size * (h + 1)) + +def cache_bytes(config, batch_size, seq_len): + return 2 * batch_size * seq_len * config.num_hidden_layers * config.hidden_size * 2 + +def hidden_bytes(config, batch_size, seq_len): + return batch_size * seq_len * config.hidden_size * 2 + + + +def disable_torch_init(): + """ + Disable the redundant torch default initialization to accelerate model creation. + """ + global torch_linear_init_backup + global torch_layer_norm_init_backup + + torch_linear_init_backup = torch.nn.Linear.reset_parameters + setattr(torch.nn.Linear, "reset_parameters", lambda config: None) + + torch_layer_norm_init_backup = torch.nn.LayerNorm.reset_parameters + setattr(torch.nn.LayerNorm, "reset_parameters", lambda config: None) + + +def restore_torch_init(): + """Rollback the change made by disable_torch_init.""" + setattr(torch.nn.Linear, "reset_parameters", torch_linear_init_backup) + setattr(torch.nn.LayerNorm, "reset_parameters", torch_layer_norm_init_backup) + + +def disable_hf_opt_init(): + """ + Disable the redundant default initialization to accelerate model creation. + """ + import transformers + + setattr(transformers.models.opt.modeling_opt.OPTPreTrainedModel, + "_init_weights", lambda *args, **kwargs: None) + +def disable_hf_bloom_init(): + """ + Disable the redundant default initialization to accelerate model creation. 
+ """ + import transformers + + setattr(transformers.models.bloom.modeling_bloom.BloomPreTrainedModel, + "_init_weights", lambda *args, **kwargs: None) + + +def write_benchmark_log(filename, model_size, cache_size, hidden_size, + gpu_peak_mem, prefill_latency, prefill_throughput, + decode_latency, decode_throughput, total_latency, total_throughput): + + log_str = (f"model size: {model_size/GB:.3f} GB\t" + f"cache size: {cache_size/GB:.3f} GB\t" + f"hidden size (p): {hidden_size/GB:.3f} GB\n" + f"peak gpu mem: {gpu_peak_mem / GB:.3f} GB\t" + f"prefill latency: {prefill_latency:.3f} s\t" + f"prefill throughput: {prefill_throughput:.3f} token/s\n" + f"decode latency: {decode_latency:.3f} s\t" + f"decode throughput: {decode_throughput:.3f} token/s\n" + f"total latency: {total_latency:.3f} s\t" + f"total throughput: {total_throughput:.3f} token/s") + with open(filename, "a") as fout: + fout.write(log_str + "\n") + + return log_str + +def get_filename(model_name, batch_size, prompt_len, gen_len, + cpu_offload, disk_offload, num_nodes, num_gpus_per_node, + kv_offload, weight_quantize): + simple_name = model_name.split('/')[-1] + filename = "ds-" + filename += f"{simple_name}-bs{batch_size}-prompt{prompt_len}-gen{gen_len}-" + filename += f"n{num_nodes}x{num_gpus_per_node}-" + if cpu_offload: + filename += "cpu" + elif disk_offload: + filename += "disk" + else: + filename += "gpu" + if kv_offload: + filename += "-kv_offload" + if weight_quantize: + filename += "-w_quant" + + return filename + + +def realize_meta_module(module, dtype=None, device=None): + for name, child in module.named_children(): + realize_meta_module(child, dtype, device) + + keys = list(module._parameters.keys()) + for k in keys: + v = module._parameters[k] + if v is not None: + module._parameters[k] = torch.nn.Parameter( + torch.empty(*v.shape, dtype=dtype or v.dtype, + device=device or v.device)) + + keys = list(module._buffers.keys()) + for k in keys: + v = module._buffers[k] + assert v is None + + +def meta_to_cpu(container, dtype=None): + if isinstance(container, torch.Tensor): + return torch.empty(*container.shape, dtype=dtype or container.dtype) + elif isinstance(container, tuple): + return tuple(meta_to_cpu(x, dtype) for x in container) + elif isinstance(container, dict): + return dict((k, meta_to_cpu(v, dtype)) for k, v in container.items()) + else: + raise ValueError(f"Invalid type: {container}") + +# add timing hooks +def add_model_hooks(model: torch.nn.Module): + + def start_time_hook(module, input): + if hasattr(module, 'stage') and module.stage == "decode": + return + elif hasattr(module, 'stage') and module.stage == 'prefill': + torch.cuda.synchronize() + module.__start_time__ = time.time() + + def end_time_hook(module, input, output): + if hasattr(module, 'stage') and module.stage == "decode": + return + elif hasattr(module, 'stage') and module.stage == 'prefill': + torch.cuda.synchronize() + module.__duration__ = time.time() - module.__start_time__ + module.stage = "decode" + + if not hasattr(model, '__start_time_hook_handle'): + model.__start_time_hook_handle__ = model.register_forward_pre_hook( + start_time_hook, ) + + if not hasattr(model, '__end_time_hook_handle__'): + model.__end_time_hook_handle__ = model.register_forward_hook( + end_time_hook, ) diff --git a/inference/mii/README.md b/inference/mii/README.md new file mode 100644 index 000000000..d701d5537 --- /dev/null +++ b/inference/mii/README.md @@ -0,0 +1,5 @@ +# DeepSpeed MII Examples + +Install the requirements by running `pip install -r 
requirements.txt`. + +Once [DeepSpeed-MII](https://github.com/microsoft/deepspeed-mii) is installed you have two options for deployment: an interactive non-persistent pipeline or a persistent serving deployment. For details on these files please refer to the [Getting Started guide for MII](https://github.com/microsoft/deepspeed-mii#getting-started-with-mii). diff --git a/inference/mii/client.py b/inference/mii/client.py new file mode 100644 index 000000000..6d19fec3a --- /dev/null +++ b/inference/mii/client.py @@ -0,0 +1,6 @@ +import mii + +client = mii.client("mistralai/Mistral-7B-v0.1") +output = client.generate("Deepspeed is", max_new_tokens=128) + +print(output) diff --git a/inference/mii/pipeline.py b/inference/mii/pipeline.py new file mode 100644 index 000000000..dcf9e8b03 --- /dev/null +++ b/inference/mii/pipeline.py @@ -0,0 +1,6 @@ +from mii import pipeline + +pipe = pipeline("mistralai/Mistral-7B-v0.1") +output = pipe(["Hello, my name is", "DeepSpeed is"], max_new_tokens=128) + +print(output) diff --git a/inference/mii/requirements.txt b/inference/mii/requirements.txt new file mode 100644 index 000000000..07d9f7e16 --- /dev/null +++ b/inference/mii/requirements.txt @@ -0,0 +1 @@ +mii>=0.1.0 diff --git a/inference/mii/serve.py b/inference/mii/serve.py new file mode 100644 index 000000000..09c0c306c --- /dev/null +++ b/inference/mii/serve.py @@ -0,0 +1,3 @@ +import mii + +mii.serve("mistralai/Mistral-7B-v0.1") diff --git a/inference/mii/terminate.py b/inference/mii/terminate.py new file mode 100644 index 000000000..2a7ed3211 --- /dev/null +++ b/inference/mii/terminate.py @@ -0,0 +1,4 @@ +import mii + +client = mii.client("mistralai/Mistral-7B-v0.1") +client.terminate_server() diff --git a/training/HelloDeepSpeed/run.sh b/training/HelloDeepSpeed/run.sh new file mode 100755 index 000000000..470220401 --- /dev/null +++ b/training/HelloDeepSpeed/run.sh @@ -0,0 +1 @@ +python train_bert.py --checkpoint_dir ./experiment diff --git a/training/HelloDeepSpeed/run_ds.sh b/training/HelloDeepSpeed/run_ds.sh new file mode 100755 index 000000000..d09c5bcde --- /dev/null +++ b/training/HelloDeepSpeed/run_ds.sh @@ -0,0 +1 @@ +deepspeed --bind_cores_to_rank train_bert_ds.py --checkpoint_dir experiment_deepspeed $@ diff --git a/training/HelloDeepSpeed/train_bert.py b/training/HelloDeepSpeed/train_bert.py index 88417623f..a55215dbe 100644 --- a/training/HelloDeepSpeed/train_bert.py +++ b/training/HelloDeepSpeed/train_bert.py @@ -24,6 +24,8 @@ RobertaPreTrainedModel, ) +from deepspeed.accelerator import get_accelerator + logger = loguru.logger ###################################################################### @@ -625,8 +627,8 @@ def train( pathlib.Path: The final experiment directory """ - device = (torch.device("cuda", local_rank) if (local_rank > -1) - and torch.cuda.is_available() else torch.device("cpu")) + device = (torch.device(get_accelerator().device_name(), local_rank) if (local_rank > -1) + and get_accelerator().is_available() else torch.device("cpu")) ################################ ###### Create Exp. 
Dir #########
 ################################
diff --git a/training/HelloDeepSpeed/train_bert_ds.py b/training/HelloDeepSpeed/train_bert_ds.py
index 98f43fcd4..b13497b67 100644
--- a/training/HelloDeepSpeed/train_bert_ds.py
+++ b/training/HelloDeepSpeed/train_bert_ds.py
@@ -31,6 +31,7 @@
     RobertaPreTrainedModel,
 )

+from deepspeed.accelerator import get_accelerator

 def is_rank_0() -> bool:
     return int(os.environ.get("RANK", "0")) == 0
@@ -612,6 +613,7 @@ def train(
         checkpoint_every: int = 1000,
         log_every: int = 10,
         local_rank: int = -1,
+        dtype: str = "bf16",
 ) -> pathlib.Path:
     """Trains a [Bert style](https://arxiv.org/pdf/1810.04805.pdf)
     (transformer encoder only) model for MLM Task
@@ -667,8 +669,8 @@ def train(
         pathlib.Path: The final experiment directory
     """
-    device = (torch.device("cuda", local_rank) if (local_rank > -1)
-              and torch.cuda.is_available() else torch.device("cpu"))
+    device = (torch.device(get_accelerator().device_name(), local_rank) if (local_rank > -1)
+              and get_accelerator().is_available() else torch.device("cpu"))

     ################################
     ###### Create Exp. Dir #########
     ################################
@@ -777,6 +779,7 @@ def train(
     ###### DeepSpeed engine ########
     ################################
     log_dist("Creating DeepSpeed engine", ranks=[0], level=logging.INFO)
+    assert (dtype == 'fp16' or dtype == 'bf16')
     ds_config = {
         "train_micro_batch_size_per_gpu": batch_size,
         "optimizer": {
             "type": "Adam",
             "params": {
                 "lr": 1e-4
             }
         },
-        "fp16": {
+        dtype: {
             "enabled": True
         },
         "zero_optimization": {
diff --git a/training/cifar/README.md b/training/cifar/README.md
index d6b4323ec..7c58f3b98 100644
--- a/training/cifar/README.md
+++ b/training/cifar/README.md
@@ -18,3 +18,4 @@ run_ds_moe.sh
 * To run baseline CIFAR-10 model - "python cifar10_tutorial.py"
 * To run DeepSpeed CIFAR-10 model - "bash run_ds.sh"
 * To run DeepSpeed CIFAR-10 model with Mixture of Experts (MoE) - "bash run_ds_moe.sh"
+* To run with a different data type (default='fp16') and ZeRO stage (default=0) - "bash run_ds.sh --dtype={fp16|bf16} --stage={0|1|2|3}"
diff --git a/training/cifar/cifar10_deepspeed.py b/training/cifar/cifar10_deepspeed.py
index d1117c37a..da82e60db 100755
--- a/training/cifar/cifar10_deepspeed.py
+++ b/training/cifar/cifar10_deepspeed.py
@@ -3,6 +3,7 @@
 import torchvision.transforms as transforms
 import argparse
 import deepspeed
+from deepspeed.accelerator import get_accelerator


 def add_argument():
@@ -88,6 +89,22 @@ def add_argument():
         help=
         '(moe) create separate moe param groups, required when using ZeRO w. MoE'
     )
+    parser.add_argument(
+        '--dtype',
+        default='fp16',
+        type=str,
+        choices=['bf16', 'fp16', 'fp32'],
+        help=
+        'Datatype used for training'
+    )
+    parser.add_argument(
+        '--stage',
+        default=0,
+        type=int,
+        choices=[0, 1, 2, 3],
+        help=
+        'ZeRO optimization stage used for training'
+    )

     # Include DeepSpeed configuration arguments
     parser = deepspeed.add_config_arguments(parser)
@@ -243,11 +260,68 @@ def create_moe_param_groups(model):
 # 1) Distributed model
 # 2) Distributed data loader
 # 3) DeepSpeed optimizer
+ds_config = {
+    "train_batch_size": 16,
+    "steps_per_print": 2000,
+    "optimizer": {
+        "type": "Adam",
+        "params": {
+            "lr": 0.001,
+            "betas": [
+                0.8,
+                0.999
+            ],
+            "eps": 1e-8,
+            "weight_decay": 3e-7
+        }
+    },
+    "scheduler": {
+        "type": "WarmupLR",
+        "params": {
+            "warmup_min_lr": 0,
+            "warmup_max_lr": 0.001,
+            "warmup_num_steps": 1000
+        }
+    },
+    "gradient_clipping": 1.0,
+    "prescale_gradients": False,
+    "bf16": {
+        "enabled": args.dtype == "bf16"
+    },
+    "fp16": {
+        "enabled": args.dtype == "fp16",
+        "fp16_master_weights_and_grads": False,
+        "loss_scale": 0,
+        "loss_scale_window": 500,
+        "hysteresis": 2,
+        "min_loss_scale": 1,
+        "initial_scale_power": 15
+    },
+    "wall_clock_breakdown": False,
+    "zero_optimization": {
+        "stage": args.stage,
+        "allgather_partitions": True,
+        "reduce_scatter": True,
+        "allgather_bucket_size": 50000000,
+        "reduce_bucket_size": 50000000,
+        "overlap_comm": True,
+        "contiguous_gradients": True,
+        "cpu_offload": False
+    }
+}
+
 model_engine, optimizer, trainloader, __ = deepspeed.initialize(
-    args=args, model=net, model_parameters=parameters, training_data=trainset)
+    args=args, model=net, model_parameters=parameters, training_data=trainset, config=ds_config)
+
+local_device = get_accelerator().device_name(model_engine.local_rank)
+local_rank = model_engine.local_rank

-fp16 = model_engine.fp16_enabled()
-print(f'fp16={fp16}')
+# For float32, target_dtype will be None so no datatype conversion needed
+target_dtype = None
+if model_engine.bfloat16_enabled():
+    target_dtype=torch.bfloat16
+elif model_engine.fp16_enabled():
+    target_dtype=torch.half

 #device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 #net.to(device)
@@ -269,15 +343,14 @@ def create_moe_param_groups(model):
 # We simply have to loop over our data iterator, and feed the inputs to the
 # network and optimize.

-for epoch in range(2):  # loop over the dataset multiple times
+for epoch in range(args.epochs):  # loop over the dataset multiple times

     running_loss = 0.0
     for i, data in enumerate(trainloader):
         # get the inputs; data is a list of [inputs, labels]
-        inputs, labels = data[0].to(model_engine.local_rank), data[1].to(
-            model_engine.local_rank)
-        if fp16:
-            inputs = inputs.half()
+        inputs, labels = data[0].to(local_device), data[1].to(local_device)
+        if target_dtype != None:
+            inputs = inputs.to(target_dtype)

         outputs = model_engine(inputs)
         loss = criterion(outputs, labels)
@@ -286,7 +359,7 @@ def create_moe_param_groups(model):

         # print statistics
         running_loss += loss.item()
-        if i % args.log_interval == (
+        if local_rank == 0 and i % args.log_interval == (
                 args.log_interval - 1):  # print every log_interval mini-batches
             print('[%d, %5d] loss: %.3f' %
@@ -317,9 +390,9 @@ def create_moe_param_groups(model):
 ########################################################################
 # Okay, now let us see what the neural network thinks these examples above are:

-if fp16:
-    images = images.half()
-outputs = net(images.to(model_engine.local_rank))
+if target_dtype != None:
+    images = images.to(target_dtype)
+outputs = net(images.to(local_device))

 ########################################################################
 # The outputs are energies for the 10 classes.
@@ -340,13 +413,12 @@ def create_moe_param_groups(model):
 with torch.no_grad():
     for data in testloader:
         images, labels = data
-        if fp16:
-            images = images.half()
-        outputs = net(images.to(model_engine.local_rank))
+        if target_dtype != None:
+            images = images.to(target_dtype)
+        outputs = net(images.to(local_device))
         _, predicted = torch.max(outputs.data, 1)
         total += labels.size(0)
-        correct += (predicted == labels.to(
-            model_engine.local_rank)).sum().item()
+        correct += (predicted == labels.to(local_device)).sum().item()

 print('Accuracy of the network on the 10000 test images: %d %%' %
       (100 * correct / total))
@@ -364,11 +436,11 @@ def create_moe_param_groups(model):
 with torch.no_grad():
     for data in testloader:
         images, labels = data
-        if fp16:
-            images = images.half()
-        outputs = net(images.to(model_engine.local_rank))
+        if target_dtype != None:
+            images = images.to(target_dtype)
+        outputs = net(images.to(local_device))
         _, predicted = torch.max(outputs, 1)
-        c = (predicted == labels.to(model_engine.local_rank)).squeeze()
+        c = (predicted == labels.to(local_device)).squeeze()
         for i in range(4):
             label = labels[i]
             class_correct[label] += c[i].item()
diff --git a/training/cifar/ds_config.json b/training/cifar/ds_config.json
deleted file mode 100755
index 5ae810b85..000000000
--- a/training/cifar/ds_config.json
+++ /dev/null
@@ -1,46 +0,0 @@
-{
-  "train_batch_size": 16,
-  "steps_per_print": 2000,
-  "optimizer": {
-    "type": "Adam",
-    "params": {
-      "lr": 0.001,
-      "betas": [
-        0.8,
-        0.999
-      ],
-      "eps": 1e-8,
-      "weight_decay": 3e-7
-    }
-  },
-  "scheduler": {
-    "type": "WarmupLR",
-    "params": {
-      "warmup_min_lr": 0,
-      "warmup_max_lr": 0.001,
-      "warmup_num_steps": 1000
-    }
-  },
-  "gradient_clipping": 1.0,
-  "prescale_gradients": false,
-  "fp16": {
-    "enabled": true,
-    "fp16_master_weights_and_grads": false,
-    "loss_scale": 0,
-    "loss_scale_window": 500,
-    "hysteresis": 2,
-    "min_loss_scale": 1,
-    "initial_scale_power": 15
-  },
-  "wall_clock_breakdown": false,
-  "zero_optimization": {
-    "stage": 0,
-    "allgather_partitions": true,
-    "reduce_scatter": true,
-    "allgather_bucket_size": 50000000,
-    "reduce_bucket_size": 50000000,
-    "overlap_comm": true,
-    "contiguous_gradients": true,
-    "cpu_offload": false
-  }
-}
diff --git a/training/cifar/run_ds.sh b/training/cifar/run_ds.sh
index 6f2f2f479..662d6cec9 100755
--- a/training/cifar/run_ds.sh
+++ b/training/cifar/run_ds.sh
@@ -1,3 +1,3 @@
 #!/bin/bash

-deepspeed cifar10_deepspeed.py --deepspeed --deepspeed_config ds_config.json $@
+deepspeed --bind_cores_to_rank cifar10_deepspeed.py --deepspeed $@
diff --git a/training/cifar/run_ds_moe.sh b/training/cifar/run_ds_moe.sh
index 5c7924bc7..b7dcb7fa7 100755
--- a/training/cifar/run_ds_moe.sh
+++ b/training/cifar/run_ds_moe.sh
@@ -9,7 +9,10 @@ EP_SIZE=2
 # Number of total experts
 EXPERTS=2

-deepspeed --num_nodes=${NUM_NODES} --num_gpus=${NUM_GPUS} cifar10_deepspeed.py \
+deepspeed --num_nodes=${NUM_NODES}\
+          --num_gpus=${NUM_GPUS} \
+          --bind_cores_to_rank \
+          cifar10_deepspeed.py \
 	--log-interval 100 \
 	--deepspeed \
 	--deepspeed_config ds_config.json \
diff --git a/training/data_efficiency/vit_finetuning/requirement.txt b/training/data_efficiency/vit_finetuning/requirement.txt
index 8bec1b063..9cf596612 100644
--- a/training/data_efficiency/vit_finetuning/requirement.txt
+++ b/training/data_efficiency/vit_finetuning/requirement.txt
@@ -1,4 +1,4 @@
-timm
+timm==0.6.5
 torch>1.10.0
 torchvision>0.11.1
-mpi4py
\ No newline at end of file
+mpi4py