diff --git a/evaluation/inference/human_eval/run_human_eval.py b/evaluation/inference/human_eval/run_human_eval.py
index 3a9c2f0fa..5ca29dcf6 100644
--- a/evaluation/inference/human_eval/run_human_eval.py
+++ b/evaluation/inference/human_eval/run_human_eval.py
@@ -9,9 +9,9 @@ from human_eval.evaluation import evaluate_functional_correctness
 
 
 parser = argparse.ArgumentParser()
-parser.add_argument("--model", "-m", type=str, help="evaluation model name")
-parser.add_argument("--max-tokens", type=int, default=256, help="max new tokens")
-parser.add_argument("--num-samples-per-task", type=int, default=10, help="number of samples to gen/eval per task")
+parser.add_argument("--model", "-m", type=str, default="codellama/CodeLlama-7b-Python-hf", help="evaluation model name")
+parser.add_argument("--max-tokens", type=int, default=512, help="max new tokens")
+parser.add_argument("--num-samples-per-task", type=int, default=20, help="number of samples to gen/eval per task")
 parser.add_argument("--local_rank", type=int, default=int(os.getenv("LOCAL_RANK", "0")), help="local rank")
 args = parser.parse_args()
 
@@ -27,7 +27,9 @@ def generate_samples(generation_function):
         for _ in range(args.num_samples_per_task)
     ]
     return samples
-# TODO (lekurile): Move these functions to utils in DSE repo (TBD)
+
+print("Loading Problems")
+problems = read_problems("human-eval/data/HumanEval.jsonl.gz")
 
 print("Initializing HuggingFace Pipeline")
 device = torch.device(get_accelerator().device_name(args.local_rank))
@@ -36,18 +38,22 @@ def generate_samples(generation_function):
                      max_length=args.max_tokens,
                      return_full_text=False)
 
-print("Initializing DeepSpeed-MII Pipeline")
-mii_pipe = mii.pipeline(args.model)
-
-print("Loading Problems")
-problems = read_problems("human-eval/data/HumanEval.jsonl.gz")
-
 print("Generating Base Samples")
 base_samples = generate_samples(generate_base_completion)
 
+print("Base Pipeline Teardown")
+del base_pipe
+torch.cuda.empty_cache()
+
+print("Initializing DeepSpeed-MII Pipeline")
+mii_pipe = mii.pipeline(args.model)
+
 print("Generating MII Samples")
 mii_samples = generate_samples(generate_mii_completion)
 
+print("MII Pipeline Teardown")
+mii_pipe.destroy()
+
 print("Writing Samples")
 write_jsonl("base_samples.jsonl", base_samples)
 write_jsonl("mii_samples.jsonl", mii_samples)
@@ -59,7 +65,5 @@ def generate_samples(generation_function):
 print(f"Base Results = {base_results}")
 print(f"MII Results = {mii_results}")
 
-print("Executing Assertions")
 for key in base_results.keys():
-    assert numpy.allclose(base_results[key], mii_results[key], rtol=0.2), \
-        f"Base result: {base_results[key]}, MII result: {mii_results[key]}, outside of rtol."
+    print(f"{key} - Base Result: {base_results[key]}, MII result: {mii_results[key]}")