diff --git a/llm/experimental/ernie-3.5-se/modeling.py b/llm/experimental/ernie-3.5-se/modeling.py
index c4ce1e72ea6a..2033a9632133 100644
--- a/llm/experimental/ernie-3.5-se/modeling.py
+++ b/llm/experimental/ernie-3.5-se/modeling.py
@@ -135,7 +135,7 @@ class BFloatFInfo:
 
 def masked_fill(x, mask, value):
     y = paddle.full(x.shape, value, x.dtype)
-    return paddle.where(mask, y, x)
+    return paddle.where(mask.to("bool"), y, x)
 
 
 def scaled_dot_product_attention(
diff --git a/paddlenlp/data/data_collator.py b/paddlenlp/data/data_collator.py
index a6be66ebbba8..78d3b3517ca0 100644
--- a/paddlenlp/data/data_collator.py
+++ b/paddlenlp/data/data_collator.py
@@ -571,7 +571,7 @@ def paddle_mask_tokens(self, inputs: Any, special_tokens_mask: Optional[Any] = N
 
         def masked_fill(x, mask, value):
             y = paddle.full(x.shape, value, x.dtype)
-            return paddle.where(mask, y, x)
+            return paddle.where(mask.to("bool"), y, x)
 
         # probability_matrix.masked_fill_(special_tokens_mask, value=0.0)
         probability_matrix = masked_fill(probability_matrix, special_tokens_mask, value=0.0)
@@ -789,6 +789,7 @@ def paddle_mask_tokens(self, inputs: Any, mask_labels: Any) -> Tuple[Any, Any]:
         ]
 
         def masked_fill(x, mask, value):
+            mask = mask.astype("bool")
             y = paddle.full(x.shape, value, x.dtype)
             return paddle.where(mask, y, x)
diff --git a/paddlenlp/transformers/bloom/modeling.py b/paddlenlp/transformers/bloom/modeling.py
index f18b88f406e0..01bb9781c97a 100755
--- a/paddlenlp/transformers/bloom/modeling.py
+++ b/paddlenlp/transformers/bloom/modeling.py
@@ -855,7 +855,7 @@ def _prepare_attn_mask(
         # Attention score will be cast to float32 in the following calculation, therefore we set attention_mask dtype as float32
         zero = paddle.zeros(expanded_attn_mask.shape, dtype=paddle.float32)
         neg_inf = paddle.full(expanded_attn_mask.shape, paddle.finfo(paddle.float32).min, dtype=paddle.float32)
-        expanded_attn_mask = paddle.where(expanded_attn_mask, zero, neg_inf)
+        expanded_attn_mask = paddle.where(expanded_attn_mask.to("bool"), zero, neg_inf)
         batch_size, num_heads, sq_len, kv_len = expanded_attn_mask.shape
         return expanded_attn_mask.reshape([batch_size * num_heads, sq_len, kv_len])
diff --git a/paddlenlp/transformers/codegen/modeling.py b/paddlenlp/transformers/codegen/modeling.py
index 2759b203c24f..187afda437af 100644
--- a/paddlenlp/transformers/codegen/modeling.py
+++ b/paddlenlp/transformers/codegen/modeling.py
@@ -135,7 +135,7 @@ def _attn(self, query, key, value, attention_mask=None):
         attn_weights = attn_weights / self.scale_attn
         mask_value = paddle.to_tensor(-1e4, dtype=attn_weights.dtype)
         # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`.
-        attn_weights = paddle.where(causal_mask, attn_weights, mask_value)
+        attn_weights = paddle.where(causal_mask.to("bool"), attn_weights, mask_value)
 
         if attention_mask is not None:
             # Apply the attention mask
diff --git a/paddlenlp/transformers/gemma/modeling.py b/paddlenlp/transformers/gemma/modeling.py
index 783f0bbb0d36..1aa75ece7a21 100644
--- a/paddlenlp/transformers/gemma/modeling.py
+++ b/paddlenlp/transformers/gemma/modeling.py
@@ -1135,7 +1135,7 @@ def _prepare_decoder_attention_mask(attention_mask, input_shape, past_key_values
         else:
             expanded_attn_mask = _make_causal_mask(input_shape, past_key_values_length=past_key_values_length)
         # Convert bool attention_mask to float attention mask, which will be added to attention_scores later
-        expanded_attn_mask = paddle.where(expanded_attn_mask, 0.0, paddle.finfo(dtype).min).astype(dtype)
+        expanded_attn_mask = paddle.where(expanded_attn_mask.to("bool"), 0.0, paddle.finfo(dtype).min).astype(dtype)
         return expanded_attn_mask
 
     @paddle.jit.not_to_static
diff --git a/paddlenlp/transformers/gptj/modeling.py b/paddlenlp/transformers/gptj/modeling.py
index df8ea5e7f1e2..ad49ffff8b8a 100644
--- a/paddlenlp/transformers/gptj/modeling.py
+++ b/paddlenlp/transformers/gptj/modeling.py
@@ -152,7 +152,7 @@ def _attn(
         # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`.
         # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device`
         mask_value = paddle.to_tensor(mask_value, dtype=attn_weights.dtype, place=attn_weights.place)
-        attn_weights = paddle.where(causal_mask, attn_weights, mask_value)
+        attn_weights = paddle.where(causal_mask.to("bool"), attn_weights, mask_value)
 
         attn_weights = attn_weights / self.scale_attn
diff --git a/paddlenlp/transformers/mixtral/modeling.py b/paddlenlp/transformers/mixtral/modeling.py
index 232833bd9538..4bd884502a05 100644
--- a/paddlenlp/transformers/mixtral/modeling.py
+++ b/paddlenlp/transformers/mixtral/modeling.py
@@ -299,7 +299,7 @@ def scaled_dot_product_attention(
 
 def masked_fill(x, mask, value):
     y = paddle.full(x.shape, value, x.dtype)
-    return paddle.where(mask, y, x)
+    return paddle.where(mask.to("bool"), y, x)
 
 
 def is_casual_mask(attention_mask):
@@ -519,7 +519,7 @@ def forward(self, hidden_states):
         # this will be used to easily index which expert is going to be sollicitated.
         # shape: [num_experts, top_k, batch_size * seq_len]
         expert_mask = F.one_hot(selected_experts, num_classes=self.num_experts).transpose([2, 1, 0])
-
+        expert_mask = expert_mask.to("bool")
         # Loop over all available experts in the model and perform the computation on each expert.
         for expert_id in range(self.num_experts):
             expert_layer = self.experts[expert_id]
@@ -1098,7 +1098,7 @@ def _prepare_decoder_attention_mask(attention_mask, input_shape, past_key_values
                 past_key_values_length=past_key_values_length,
             )
         # Convert bool attention_mask to float attention mask, which will be added to attention_scores later
-        expanded_attn_mask = paddle.where(expanded_attn_mask, 0.0, paddle.finfo(dtype).min).astype(dtype)
+        expanded_attn_mask = paddle.where(expanded_attn_mask.to("bool"), 0.0, paddle.finfo(dtype).min).astype(dtype)
         return expanded_attn_mask
 
     @paddle.jit.not_to_static
diff --git a/paddlenlp/transformers/qwen2/modeling.py b/paddlenlp/transformers/qwen2/modeling.py
index 95061f55f15d..3ab1be5b3d81 100644
--- a/paddlenlp/transformers/qwen2/modeling.py
+++ b/paddlenlp/transformers/qwen2/modeling.py
@@ -233,7 +233,7 @@ def scaled_dot_product_attention(
 
 def masked_fill(x, mask, value):
     y = paddle.full(x.shape, value, x.dtype)
-    return paddle.where(mask, y, x)
+    return paddle.where(mask.to("bool"), y, x)
 
 
 def is_casual_mask(attention_mask):
@@ -1020,7 +1020,7 @@ def _prepare_decoder_attention_mask(attention_mask, input_shape, past_key_values
                 past_key_values_length=past_key_values_length,
             )
         # Convert bool attention_mask to float attention mask, which will be added to attention_scores later
-        expanded_attn_mask = paddle.where(expanded_attn_mask, 0.0, paddle.finfo(dtype).min).astype(dtype)
+        expanded_attn_mask = paddle.where(expanded_attn_mask.to("bool"), 0.0, paddle.finfo(dtype).min).astype(dtype)
         return expanded_attn_mask
 
     @paddle.jit.not_to_static
diff --git a/paddlenlp/transformers/qwen2_moe/modeling.py b/paddlenlp/transformers/qwen2_moe/modeling.py
index 18507c1d5dc7..501e79673c8e 100644
--- a/paddlenlp/transformers/qwen2_moe/modeling.py
+++ b/paddlenlp/transformers/qwen2_moe/modeling.py
@@ -300,7 +300,7 @@ def scaled_dot_product_attention(
 
 def masked_fill(x, mask, value):
     y = paddle.full(x.shape, value, x.dtype)
-    return paddle.where(mask, y, x)
+    return paddle.where(mask.to("bool"), y, x)
 
 
 def is_casual_mask(attention_mask):
@@ -1124,7 +1124,7 @@ def _prepare_decoder_attention_mask(attention_mask, input_shape, past_key_values
                 past_key_values_length=past_key_values_length,
             )
         # Convert bool attention_mask to float attention mask, which will be added to attention_scores later
-        expanded_attn_mask = paddle.where(expanded_attn_mask, 0.0, paddle.finfo(dtype).min).astype(dtype)
+        expanded_attn_mask = paddle.where(expanded_attn_mask.to("bool"), 0.0, paddle.finfo(dtype).min).astype(dtype)
         return expanded_attn_mask
 
     @paddle.jit.not_to_static
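Note on the recurring change: `paddle.where` expects its condition to be a boolean tensor, while the masks at these call sites are often built as integer or float tensors (e.g. the `F.one_hot` output in the MoE router, or 0/1 attention masks), so each site now casts with `.to("bool")` / `.astype("bool")` before selecting. Below is a minimal standalone sketch of the patched `masked_fill` pattern; the example tensors and values are illustrative only and not taken from the repository.

```python
import paddle

def masked_fill(x, mask, value):
    # paddle.where expects a boolean condition, so cast masks that arrive
    # as int/float tensors to bool before selecting.
    y = paddle.full(x.shape, value, x.dtype)
    return paddle.where(mask.to("bool"), y, x)

# Illustrative usage with a non-bool (int64) mask, similar to the collator case:
probs = paddle.to_tensor([0.15, 0.15, 0.15])
special_tokens_mask = paddle.to_tensor([1, 0, 1], dtype="int64")
print(masked_fill(probs, special_tokens_mask, 0.0))  # masked positions become 0.0
```

Casting at the call site keeps these helpers tolerant of whatever dtype the upstream mask happens to carry, rather than relying on implicit conversion inside `paddle.where`.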