attnmask #13
by Kamichanw - opened
- modeling_llada.py +15 -11
- tokenizer_config.json +1 -1
modeling_llada.py CHANGED

@@ -654,7 +654,7 @@ class LLaDABlock(nn.Module):
             q,
             k,
             v,
-            attn_mask=
+            attn_mask=attn_mask,
             dropout_p=dropout_p,
             is_causal=False,
         )

@@ -665,6 +665,7 @@ class LLaDABlock(nn.Module):
         k: torch.Tensor,
         v: torch.Tensor,
         attention_bias: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
         layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
         use_cache: bool = False,
     ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:

@@ -712,7 +713,7 @@ class LLaDABlock(nn.Module):
             q,
             k,
             v,
-            attn_mask=
+            attn_mask=attention_mask,
             dropout_p=0.0 if not self.training else self.config.attention_dropout,
             is_causal=False,
         )

@@ -785,6 +786,7 @@ class LLaDASequentialBlock(LLaDABlock):
         self,
         x: torch.Tensor,
         attention_bias: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
         layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
         use_cache: bool = False,
     ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:

@@ -805,10 +807,10 @@ class LLaDASequentialBlock(LLaDABlock):
         # Get attention scores.
         if self._activation_checkpoint_fn is not None:
             att, cache = self._activation_checkpoint_fn(  # type: ignore
-                self.attention, q, k, v, attention_bias, layer_past=layer_past, use_cache=use_cache
+                self.attention, q, k, v, attention_bias, attention_mask, layer_past=layer_past, use_cache=use_cache
             )
         else:
-            att, cache = self.attention(q, k, v, attention_bias, layer_past=layer_past, use_cache=use_cache)
+            att, cache = self.attention(q, k, v, attention_bias, attention_mask, layer_past=layer_past, use_cache=use_cache)

         # Add attention scores.
         # shape: (B, T, C)

@@ -887,6 +889,7 @@ class LLaDALlamaBlock(LLaDABlock):
         self,
         x: torch.Tensor,
         attention_bias: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
         layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
         use_cache: bool = False,
     ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:

@@ -905,10 +908,10 @@ class LLaDALlamaBlock(LLaDABlock):
         # Get attention scores.
         if self._activation_checkpoint_fn is not None:
             att, cache = self._activation_checkpoint_fn(  # type: ignore
-                self.attention, q, k, v, attention_bias, layer_past=layer_past, use_cache=use_cache
+                self.attention, q, k, v, attention_bias, attention_mask, layer_past=layer_past, use_cache=use_cache
             )
         else:
-            att, cache = self.attention(q, k, v, attention_bias, layer_past=layer_past, use_cache=use_cache)
+            att, cache = self.attention(q, k, v, attention_bias, attention_mask, layer_past=layer_past, use_cache=use_cache)

         # Add attention scores.
         # shape: (B, T, C)

@@ -977,6 +980,7 @@ class LLaDABlockGroup(nn.ModuleList):
         self,
         x: torch.Tensor,
         attention_bias: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
         layers_past: Optional[List[Tuple[torch.Tensor, torch.Tensor]]] = None,
         use_cache: bool = False,
     ) -> Tuple[torch.Tensor, Optional[List[Tuple[torch.Tensor, torch.Tensor]]]]:

@@ -1001,11 +1005,11 @@ class LLaDABlockGroup(nn.ModuleList):
             ):
                 # shape: (batch_size, seq_len, d_model)
                 x, cache = self._activation_checkpoint_fn(  # type: ignore
-                    block, x, attention_bias=attention_bias, layer_past=layer_past, use_cache=use_cache
+                    block, x, attention_bias=attention_bias, attention_mask=attention_mask, layer_past=layer_past, use_cache=use_cache
                 )
             else:
                 # shape: (batch_size, seq_len, d_model)
-                x, cache = block(x, attention_bias=attention_bias, layer_past=layer_past, use_cache=use_cache)
+                x, cache = block(x, attention_bias=attention_bias, attention_mask=attention_mask, layer_past=layer_past, use_cache=use_cache)
             if attn_key_values is not None:
                 assert cache is not None
                 attn_key_values.append(cache)

@@ -1308,11 +1312,11 @@ class LLaDAModel(nn.Module):
                 ):
                     # shape: (batch_size, seq_len, d_model)
                     x, cache = self._activation_checkpoint_fn(
-                        block, x, attention_bias=attention_bias, layer_past=layer_past, use_cache=use_cache
+                        block, x, attention_bias=attention_bias, attention_mask=attention_mask, layer_past=layer_past, use_cache=use_cache
                     )
                 else:
                     # shape: (batch_size, seq_len, d_model)
-                    x, cache = block(x, attention_bias=attention_bias, layer_past=layer_past, use_cache=use_cache)
+                    x, cache = block(x, attention_bias=attention_bias, attention_mask=attention_mask, layer_past=layer_past, use_cache=use_cache)
                 if attn_key_values is not None:
                     assert cache is not None
                     attn_key_values.append(cache)

@@ -1330,7 +1334,7 @@ class LLaDAModel(nn.Module):
                     ]
                 )
                 x, cache = block_group(
-                    x, attention_bias=attention_bias, layers_past=layers_past, use_cache=use_cache
+                    x, attention_bias=attention_bias, attention_mask=attention_mask, layers_past=layers_past, use_cache=use_cache
                 )
                 if attn_key_values is not None:
                     assert cache is not None
tokenizer_config.json CHANGED

@@ -2164,7 +2164,7 @@
     "<|number_end|>"
   ],
   "bos_token": "<|startoftext|>",
-  "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}",
+  "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{%- if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{%- endif %}",
   "clean_up_tokenization_spaces": false,
   "cls_token": "[CLS]",
   "eos_token": "<|endoftext|>",
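The template change only affects the trailing assistant header: with the new template it is emitted only when add_generation_prompt is set. A quick check with the standard apply_chat_template API (the repo path below is a placeholder, assuming the tokenizer is loaded from this repository):

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("path/to/this/repo", trust_remote_code=True)
    messages = [{"role": "user", "content": "Hello"}]

    # New template: '<|start_header_id|>assistant<|end_header_id|>\n\n' is appended
    # only when add_generation_prompt=True.
    with_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    without_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
    print(with_prompt.endswith("<|end_header_id|>\n\n"))  # True
    print(without_prompt.endswith("<|eot_id|>"))          # True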