[CI] Compatible with paddle.where #9534

Merged · 2 commits · Dec 2, 2024
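The change is the same throughout: any mask handed to paddle.where as the condition is first cast to bool, via Tensor.to("bool") or Tensor.astype("bool") (both spellings perform the same dtype cast here), presumably because the Paddle build used in CI rejects non-boolean conditions. A minimal standalone sketch of the masked_fill pattern most of the diffs touch, assuming a recent Paddle 2.x release; the sample tensors are illustrative, not taken from the PR:

import paddle

def masked_fill(x, mask, value):
    # paddle.where expects a bool condition; cast defensively in case the
    # mask arrives as an int or float tensor.
    y = paddle.full(x.shape, value, x.dtype)
    return paddle.where(mask.astype("bool"), y, x)

x = paddle.zeros([2, 3], dtype="float32")
mask = paddle.to_tensor([[1, 0, 1], [0, 1, 0]], dtype="int64")  # non-bool mask
out = masked_fill(x, mask, -1e4)  # entries where mask is 1 become -1e4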
llm/experimental/ernie-3.5-se/modeling.py (1 addition, 1 deletion)

@@ -135,7 +135,7 @@ class BFloatFInfo:
 
 def masked_fill(x, mask, value):
     y = paddle.full(x.shape, value, x.dtype)
-    return paddle.where(mask, y, x)
+    return paddle.where(mask.to("bool"), y, x)
 
 
 def scaled_dot_product_attention(
paddlenlp/data/data_collator.py (2 additions, 1 deletion)

@@ -571,7 +571,7 @@ def paddle_mask_tokens(self, inputs: Any, special_tokens_mask: Optional[Any] = N
 
         def masked_fill(x, mask, value):
            y = paddle.full(x.shape, value, x.dtype)
-            return paddle.where(mask, y, x)
+            return paddle.where(mask.to("bool"), y, x)
 
         # probability_matrix.masked_fill_(special_tokens_mask, value=0.0)
         probability_matrix = masked_fill(probability_matrix, special_tokens_mask, value=0.0)

@@ -789,6 +789,7 @@ def paddle_mask_tokens(self, inputs: Any, mask_labels: Any) -> Tuple[Any, Any]:
         ]
 
         def masked_fill(x, mask, value):
+            mask = mask.astype("bool")
             y = paddle.full(x.shape, value, x.dtype)
             return paddle.where(mask, y, x)
 
paddlenlp/transformers/bloom/modeling.py (1 addition, 1 deletion)

@@ -855,7 +855,7 @@ def _prepare_attn_mask(
         # Attention score will be cast to float32 in the following calculation, therefore we set attention_mask dtype as float32
         zero = paddle.zeros(expanded_attn_mask.shape, dtype=paddle.float32)
         neg_inf = paddle.full(expanded_attn_mask.shape, paddle.finfo(paddle.float32).min, dtype=paddle.float32)
-        expanded_attn_mask = paddle.where(expanded_attn_mask, zero, neg_inf)
+        expanded_attn_mask = paddle.where(expanded_attn_mask.to("bool"), zero, neg_inf)
         batch_size, num_heads, sq_len, kv_len = expanded_attn_mask.shape
         return expanded_attn_mask.reshape([batch_size * num_heads, sq_len, kv_len])
 
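The attention-mask helpers follow a second recurring pattern: a boolean keep/drop mask is turned into an additive float mask, 0 where attention is allowed and a very large negative value where it is not, and the mask must now be bool before the paddle.where call. A standalone sketch of that conversion under the same Paddle assumption (names and shapes are illustrative):

import paddle

def to_additive_mask(keep_mask, dtype="float32"):
    # keep_mask: True/1 where attention is allowed, False/0 where it is masked out.
    # Cast to bool for paddle.where, then map True -> 0.0 and
    # False -> the most negative float32 value (roughly -3.4e38).
    keep_mask = keep_mask.astype("bool")
    return paddle.where(keep_mask, 0.0, paddle.finfo(paddle.float32).min).astype(dtype)

mask = paddle.to_tensor([[1, 1, 0, 0]], dtype="int64")
additive = to_additive_mask(mask)  # added to attention scores before softmax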
paddlenlp/transformers/codegen/modeling.py (1 addition, 1 deletion)

@@ -135,7 +135,7 @@ def _attn(self, query, key, value, attention_mask=None):
         attn_weights = attn_weights / self.scale_attn
         mask_value = paddle.to_tensor(-1e4, dtype=attn_weights.dtype)
         # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`.
-        attn_weights = paddle.where(causal_mask, attn_weights, mask_value)
+        attn_weights = paddle.where(causal_mask.to("bool"), attn_weights, mask_value)
 
         if attention_mask is not None:
             # Apply the attention mask
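In the codegen and gptj attention, paddle.where keeps attn_weights wherever the causal mask is set and substitutes a large negative constant elsewhere; only the cast of causal_mask is new. A rough sketch of that step, with the mask construction simplified and not taken from these models:

import paddle

seq_len = 4
attn_weights = paddle.randn([seq_len, seq_len], dtype="float32")

# Lower-triangular causal mask: position i may only attend to positions <= i.
causal_mask = paddle.tril(paddle.ones([seq_len, seq_len], dtype="float32"))

mask_value = paddle.to_tensor(-1e4, dtype=attn_weights.dtype)
# The cast makes the condition bool, as paddle.where now requires.
attn_weights = paddle.where(causal_mask.astype("bool"), attn_weights, mask_value)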
paddlenlp/transformers/gemma/modeling.py (1 addition, 1 deletion)

@@ -1135,7 +1135,7 @@ def _prepare_decoder_attention_mask(attention_mask, input_shape, past_key_values
         else:
             expanded_attn_mask = _make_causal_mask(input_shape, past_key_values_length=past_key_values_length)
         # Convert bool attention_mask to float attention mask, which will be added to attention_scores later
-        expanded_attn_mask = paddle.where(expanded_attn_mask, 0.0, paddle.finfo(dtype).min).astype(dtype)
+        expanded_attn_mask = paddle.where(expanded_attn_mask.to("bool"), 0.0, paddle.finfo(dtype).min).astype(dtype)
         return expanded_attn_mask
 
     @paddle.jit.not_to_static
paddlenlp/transformers/gptj/modeling.py (1 addition, 1 deletion)

@@ -152,7 +152,7 @@ def _attn(
         # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`.
         # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device`
         mask_value = paddle.to_tensor(mask_value, dtype=attn_weights.dtype, place=attn_weights.place)
-        attn_weights = paddle.where(causal_mask, attn_weights, mask_value)
+        attn_weights = paddle.where(causal_mask.to("bool"), attn_weights, mask_value)
 
         attn_weights = attn_weights / self.scale_attn
 
paddlenlp/transformers/mixtral/modeling.py (3 additions, 3 deletions)

@@ -299,7 +299,7 @@ def scaled_dot_product_attention(
 
 def masked_fill(x, mask, value):
     y = paddle.full(x.shape, value, x.dtype)
-    return paddle.where(mask, y, x)
+    return paddle.where(mask.to("bool"), y, x)
 
 
 def is_casual_mask(attention_mask):

@@ -519,7 +519,7 @@ def forward(self, hidden_states):
         # this will be used to easily index which expert is going to be sollicitated.
         # shape: [num_experts, top_k, batch_size * seq_len]
         expert_mask = F.one_hot(selected_experts, num_classes=self.num_experts).transpose([2, 1, 0])
-
+        expert_mask = expert_mask.to("bool")
         # Loop over all available experts in the model and perform the computation on each expert.
         for expert_id in range(self.num_experts):
             expert_layer = self.experts[expert_id]

@@ -1098,7 +1098,7 @@ def _prepare_decoder_attention_mask(attention_mask, input_shape, past_key_values
                 past_key_values_length=past_key_values_length,
             )
         # Convert bool attention_mask to float attention mask, which will be added to attention_scores later
-        expanded_attn_mask = paddle.where(expanded_attn_mask, 0.0, paddle.finfo(dtype).min).astype(dtype)
+        expanded_attn_mask = paddle.where(expanded_attn_mask.to("bool"), 0.0, paddle.finfo(dtype).min).astype(dtype)
         return expanded_attn_mask
 
     @paddle.jit.not_to_static
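The mixtral MoE routing change is slightly different: F.one_hot returns a float tensor, so expert_mask is cast to bool once, right after it is built, before being used to look up which tokens each expert should process. A small sketch of that routing step with made-up sizes (the surrounding expert computation is omitted):

import paddle
import paddle.nn.functional as F

num_experts, top_k, num_tokens = 4, 2, 6
router_logits = paddle.randn([num_tokens, num_experts])
_, selected_experts = paddle.topk(router_logits, top_k, axis=-1)  # [num_tokens, top_k]

# F.one_hot produces float32; transpose to [num_experts, top_k, num_tokens].
expert_mask = F.one_hot(selected_experts, num_classes=num_experts).transpose([2, 1, 0])
expert_mask = expert_mask.astype("bool")  # cast once so downstream mask ops see bool

# Coordinates of the tokens routed to expert 0 (single-argument paddle.where
# returns the indices of True elements).
idx, top_x = paddle.where(expert_mask[0])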
paddlenlp/transformers/qwen2/modeling.py (2 additions, 2 deletions)

@@ -233,7 +233,7 @@ def scaled_dot_product_attention(
 
 def masked_fill(x, mask, value):
     y = paddle.full(x.shape, value, x.dtype)
-    return paddle.where(mask, y, x)
+    return paddle.where(mask.to("bool"), y, x)
 
 
 def is_casual_mask(attention_mask):

@@ -1020,7 +1020,7 @@ def _prepare_decoder_attention_mask(attention_mask, input_shape, past_key_values
                 past_key_values_length=past_key_values_length,
             )
         # Convert bool attention_mask to float attention mask, which will be added to attention_scores later
-        expanded_attn_mask = paddle.where(expanded_attn_mask, 0.0, paddle.finfo(dtype).min).astype(dtype)
+        expanded_attn_mask = paddle.where(expanded_attn_mask.to("bool"), 0.0, paddle.finfo(dtype).min).astype(dtype)
         return expanded_attn_mask
 
     @paddle.jit.not_to_static
paddlenlp/transformers/qwen2_moe/modeling.py (2 additions, 2 deletions)

@@ -300,7 +300,7 @@ def scaled_dot_product_attention(
 
 def masked_fill(x, mask, value):
     y = paddle.full(x.shape, value, x.dtype)
-    return paddle.where(mask, y, x)
+    return paddle.where(mask.to("bool"), y, x)
 
 
 def is_casual_mask(attention_mask):

@@ -1124,7 +1124,7 @@ def _prepare_decoder_attention_mask(attention_mask, input_shape, past_key_values
                 past_key_values_length=past_key_values_length,
             )
         # Convert bool attention_mask to float attention mask, which will be added to attention_scores later
-        expanded_attn_mask = paddle.where(expanded_attn_mask, 0.0, paddle.finfo(dtype).min).astype(dtype)
+        expanded_attn_mask = paddle.where(expanded_attn_mask.to("bool"), 0.0, paddle.finfo(dtype).min).astype(dtype)
         return expanded_attn_mask
 
     @paddle.jit.not_to_static