Fix softmax scale (#903)
hanzhi713 authored Dec 24, 2024
1 parent f91709f · commit e4ff72c
Showing 1 changed file with 3 additions and 3 deletions.
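
For context, the change concerns the scale applied to attention logits before the softmax. A minimal plain-JAX sketch (hypothetical names and shapes, not the Pallas kernels edited below) of where that scale enters:

    import jax
    import jax.numpy as jnp

    def reference_attention(q, k, v, softmax_scale):
        # The scale multiplies the q @ k^T logits before the softmax;
        # conventionally softmax_scale = 1 / sqrt(head_dim).
        logits = jnp.einsum("qd,kd->qk", q, k) * softmax_scale
        weights = jax.nn.softmax(logits, axis=-1)
        return weights @ v

If the scale never reaches the kernel, the logits are effectively unscaled (scale 1.0), which changes the attention distribution.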
axlearn/common/flash_attention/tpu_attention.py (3 additions & 3 deletions)
@@ -594,7 +594,7 @@ def lm_index_map(batch_index, head_index, q_seq_index, _):
         _flash_attention_kernel,
         causal=causal,
         mask_value=DEFAULT_MASK_VALUE,
-        softmax_scale=softmax_scale,
+        sm_scale=softmax_scale,
         block_k=block_k,
         kv_seq_len=kv_seq_len,
     )
@@ -878,7 +878,7 @@ def dkv_index_map(batch_index, head_index, kv_seq_index, _):
         _flash_attention_dkv_kernel,
         block_q=block_q,
         block_k=block_k,
-        softmax_scale=softmax_scale,
+        sm_scale=softmax_scale,
         causal=causal,
         mask_value=mask_value,
         q_seq_len=q_seq_len,
@@ -1068,7 +1068,7 @@ def kv_segment_ids_index_map(batch_index, head_index, q_seq_index, kv_seq_index)
 
     kernel = functools.partial(
         _flash_attention_dq_kernel,
-        softmax_scale=softmax_scale,
+        sm_scale=softmax_scale,
         causal=causal,
         mask_value=mask_value,
         block_k=block_k,
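
All three hunks make the same one-line change: the keyword passed through functools.partial is renamed from softmax_scale= to sm_scale=, presumably to match the parameter name the kernels actually declare. Note that functools.partial does not validate keyword names when the partial is built, so a mismatched name only surfaces when the kernel is finally invoked; a minimal sketch with a hypothetical kernel signature:

    import functools

    def _kernel(q, *, sm_scale, causal):
        # Hypothetical stand-in for a Pallas kernel that expects `sm_scale`.
        del causal
        return q * sm_scale

    softmax_scale = 64 ** -0.5  # e.g. head_dim = 64

    good = functools.partial(_kernel, sm_scale=softmax_scale, causal=False)
    bad = functools.partial(_kernel, softmax_scale=softmax_scale, causal=False)

    print(good(1.0))   # 0.125
    try:
        bad(1.0)       # the wrong keyword fails only here, at call time
    except TypeError as err:
        print(err)     # unexpected keyword argument 'softmax_scale'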
