From 5bba361928291cf53d47edfa0b0a4da39e312aa8 Mon Sep 17 00:00:00 2001 From: Moshe Island Date: Fri, 15 Sep 2023 11:53:52 +0300 Subject: [PATCH] deepspeed-chat: fix weight decay configuration Current default name used to detect LN layers is "LayerNorm.weight". This does not work for the following models: - opt: uses "layer_norm" - llama: uses "norm" and "layernorm" - bloom: uses "layernorm" and "ln_f" Therefore, modify the default names to accommodate the above. Also, compare names in lower-case to capture models with different capitalization. Change-Id: I5b805df2663c62daf3d9c8a31a973742e344e76b Signed-off-by: Moshe Island --- .../DeepSpeed-Chat/training/utils/utils.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/applications/DeepSpeed-Chat/training/utils/utils.py b/applications/DeepSpeed-Chat/training/utils/utils.py index b5cfb8d6d..a9a84b618 100644 --- a/applications/DeepSpeed-Chat/training/utils/utils.py +++ b/applications/DeepSpeed-Chat/training/utils/utils.py @@ -174,15 +174,18 @@ def get_optimizer_grouped_parameters( model, weight_decay, lora_lr=5e-4, - no_decay_name_list=["bias", "LayerNorm.weight"], + no_decay_name_list=[ + "bias", "layer_norm.weight", "layernorm.weight", "norm.weight", + "ln_f.weight" + ], lora_name_list=["lora_right_weight", "lora_left_weight"], ): optimizer_grouped_parameters = [ { "params": [ p for n, p in model.named_parameters() - if (not any(nd in n for nd in no_decay_name_list) - and p.requires_grad and not any(nd in n + if (not any(nd in n.lower() for nd in no_decay_name_list) - and p.requires_grad and not any(nd in n.lower() + and p.requires_grad and not any(nd in n.lower() for nd in lora_name_list)) ], "weight_decay": @@ -191,8 +194,8 @@ def get_optimizer_grouped_parameters( { "params": [ p for n, p in model.named_parameters() - if (not any(nd in n for nd in no_decay_name_list) - and p.requires_grad and any(nd in n + if (not any(nd in n.lower() for nd in no_decay_name_list) + and p.requires_grad and any(nd in n.lower() for nd in lora_name_list)) ],
"weight_decay": @@ -203,7 +206,7 @@ def get_optimizer_grouped_parameters( { "params": [ p for n, p in model.named_parameters() - if (any(nd in n + if (any(nd in n.lower() for nd in no_decay_name_list) and p.requires_grad) ], "weight_decay":