From 5bba361928291cf53d47edfa0b0a4da39e312aa8 Mon Sep 17 00:00:00 2001 From: Moshe Island Date: Fri, 15 Sep 2023 11:53:52 +0300 Subject: [PATCH] deepspeed-chat: fix weight decay configuration Current default name used to detect LN layers is "LayerNorm.weight". This does not work for the following models: - opt: uses "layer_norm" - llama: uses "norm" and "layernorm" - bloom: uses "layernorm" and "ln_f" Therefore, modify the default names to accommodate the above. Also, compare names in lower-case to capture models with different capitalization. Change-Id: I5b805df2663c62daf3d9c8a31a973742e344e76b Signed-off-by: Moshe Island --- .../DeepSpeed-Chat/training/utils/utils.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/applications/DeepSpeed-Chat/training/utils/utils.py b/applications/DeepSpeed-Chat/training/utils/utils.py index b5cfb8d6d..a9a84b618 100644 --- a/applications/DeepSpeed-Chat/training/utils/utils.py +++ b/applications/DeepSpeed-Chat/training/utils/utils.py @@ -174,15 +174,18 @@ def get_optimizer_grouped_parameters( model, weight_decay, lora_lr=5e-4, - no_decay_name_list=["bias", "LayerNorm.weight"], + no_decay_name_list=[ + "bias", "layer_norm.weight", "layernorm.weight", "norm.weight", + "ln_f.weight" + ], lora_name_list=["lora_right_weight", "lora_left_weight"], ): optimizer_grouped_parameters = [ { "params": [ p for n, p in model.named_parameters() - if (not any(nd in n for nd in no_decay_name_list) - and p.requires_grad and not any(nd in n + if (not any(nd in n.lower() for nd in no_decay_name_list) - and p.requires_grad and not any(nd in n.lower() + and p.requires_grad and not any(nd in n.lower() for nd in lora_name_list)) ], "weight_decay": @@ -191,8 +194,8 @@ def get_optimizer_grouped_parameters( { "params": [ p for n, p in model.named_parameters() - if (not any(nd in n for nd in no_decay_name_list) - and p.requires_grad and any(nd in n + if (not any(nd in n.lower() for nd in no_decay_name_list) + and p.requires_grad and any(nd in n.lower() for nd in lora_name_list)) ],
"weight_decay": @@ -203,7 +206,7 @@ def get_optimizer_grouped_parameters( { "params": [ p for n, p in model.named_parameters() - if (any(nd in n + if (any(nd in n.lower() for nd in no_decay_name_list) and p.requires_grad) ], "weight_decay":