diff --git a/ppdiffusers/deploy/sd3/infer_dygraph_paddle.py b/ppdiffusers/deploy/sd3/infer_dygraph_paddle.py new file mode 100644 index 000000000..14d1f5f24 --- /dev/null +++ b/ppdiffusers/deploy/sd3/infer_dygraph_paddle.py @@ -0,0 +1,264 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import os +import time +import warnings + +import cv2 +import numpy as np +import paddle +from PIL import Image +from tqdm.auto import trange + +from ppdiffusers import ( + FlowMatchEulerDiscreteScheduler, + DDIMScheduler, + DDPMScheduler, + DEISMultistepScheduler, + DPMSolverMultistepScheduler, + DPMSolverSinglestepScheduler, + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + HeunDiscreteScheduler, + KDPM2AncestralDiscreteScheduler, + KDPM2DiscreteScheduler, + LMSDiscreteScheduler, + PNDMScheduler, + StableDiffusion3Pipeline, + UniPCMultistepScheduler, +) +from ppdiffusers.utils import load_image + + + +def strtobool(v): + if isinstance(v, bool): + return v + if v.lower() in ("yes", "true", "t", "y", "1"): + return True + elif v.lower() in ("no", "false", "f", "n", "0"): + return False + else: + raise ValueError( + f"Truthy value expected: got {v} but expected one of yes/no, true/false, t/f, y/n, 1/0 (case insensitive)." 
+ ) + + +def change_scheduler(self, scheduler_type="ddim"): + self.orginal_scheduler_config = self.scheduler.config + scheduler_type = scheduler_type.lower() + if scheduler_type == "flow": + scheduler = FlowMatchEulerDiscreteScheduler.from_config(self.orginal_scheduler_config, skip_prk_steps=True) + elif scheduler_type == "pndm": + scheduler = PNDMScheduler.from_config(self.orginal_scheduler_config, skip_prk_steps=True) + elif scheduler_type == "lms": + scheduler = LMSDiscreteScheduler.from_config(self.orginal_scheduler_config) + elif scheduler_type == "heun": + scheduler = HeunDiscreteScheduler.from_config(self.orginal_scheduler_config) + elif scheduler_type == "euler": + scheduler = EulerDiscreteScheduler.from_config(self.orginal_scheduler_config) + elif scheduler_type == "euler-ancestral": + scheduler = EulerAncestralDiscreteScheduler.from_config(self.orginal_scheduler_config) + elif scheduler_type == "dpm-multi": + scheduler = DPMSolverMultistepScheduler.from_config(self.orginal_scheduler_config) + elif scheduler_type == "dpm-single": + scheduler = DPMSolverSinglestepScheduler.from_config(self.orginal_scheduler_config) + elif scheduler_type == "kdpm2-ancestral": + scheduler = KDPM2AncestralDiscreteScheduler.from_config(self.orginal_scheduler_config) + elif scheduler_type == "kdpm2": + scheduler = KDPM2DiscreteScheduler.from_config(self.orginal_scheduler_config) + elif scheduler_type == "unipc-multi": + scheduler = UniPCMultistepScheduler.from_config(self.orginal_scheduler_config) + elif scheduler_type == "ddim": + scheduler = DDIMScheduler.from_config( + self.orginal_scheduler_config, + steps_offset=1, + clip_sample=False, + set_alpha_to_one=False, + ) + elif scheduler_type == "ddpm": + scheduler = DDPMScheduler.from_config( + self.orginal_scheduler_config, + ) + elif scheduler_type == "deis-multi": + scheduler = DEISMultistepScheduler.from_config( + self.orginal_scheduler_config, + ) + else: + raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!") + return scheduler + + +def parse_arguments(): + + parser = argparse.ArgumentParser() + parser.add_argument( + "--pretrained_model_name_or_path", + type=str, + default="stabilityai/stable-diffusion-3-medium-diffusers", + help="Path to the `diffusers` checkpoint to convert (either a local directory or on the bos).", + ) + parser.add_argument( + "--inference_steps", + type=int, + default=50, + help="The number of unet inference steps.", + ) + parser.add_argument( + "--benchmark_steps", + type=int, + default=10, + help="The number of performance benchmark steps.", + ) + parser.add_argument( + "--task_name", + type=str, + default="all", + choices=[ + "text2img", + "img2img", + "inpaint_legacy", + "all", + ], + help="The task can be one of [text2img, img2img, inpaint_legacy, all]. ", + ) + parser.add_argument( + "--parse_prompt_type", + type=str, + default="raw", + choices=[ + "raw", + "lpw", + ], + help="The parse_prompt_type can be one of [raw, lpw]. ", + ) + parser.add_argument("--use_fp16", type=strtobool, default=True, help="Wheter to use FP16 mode") + parser.add_argument( + "--attention_type", type=str, default="raw", choices=["raw", "cutlass", "flash", "all"], help="attention_type." + ) + parser.add_argument("--device_id", type=int, default=0, help="The selected gpu id. 
-1 means use cpu") + parser.add_argument( + "--scheduler", + type=str, + default="euler-ancestral", + choices=[ + "flow", + "pndm", + "lms", + "euler", + "euler-ancestral", + "dpm-multi", + "dpm-single", + "unipc-multi", + "ddim", + "ddpm", + "deis-multi", + "heun", + "kdpm2-ancestral", + "kdpm2", + ], + help="The scheduler type of stable diffusion.", + ) + parser.add_argument("--height", type=int, default=512, help="Height of input image") + parser.add_argument("--width", type=int, default=512, help="Width of input image") + parser.add_argument("--strength", type=float, default=1.0, help="Strength for img2img / inpaint") + return parser.parse_args() + + +def main(args): + + seed = 1024 + paddle_dtype = paddle.float16 if args.use_fp16 else paddle.float32 + pipe = StableDiffusion3Pipeline.from_pretrained( + args.pretrained_model_name_or_path, + safety_checker=None, + feature_extractor=None, + requires_safety_checker=False, + paddle_dtype=paddle_dtype, + ) + scheduler = change_scheduler(pipe, args.scheduler) + pipe.scheduler = scheduler + + if args.attention_type == "all": + args.attention_type = ["raw", "cutlass", "flash"] + else: + args.attention_type = [args.attention_type] + + for attention_type in args.attention_type: + if attention_type == "raw": + pipe.disable_xformers_memory_efficient_attention() + else: + try: + pipe.enable_xformers_memory_efficient_attention(attention_type) + except Exception as e: + if attention_type == "flash": + warnings.warn( + "Attention type flash is not supported on your GPU! We need to use 3060、3070、3080、3090、4060、4070、4080、4090、A30、A100 etc." + ) + continue + else: + raise ValueError(e) + + if not args.use_fp16 and attention_type == "flash": + print("Flash attention is not supported dtype=float32! Please use float16 or bfloat16. We will skip this!") + continue + + width = args.width + height = args.height + pipe.set_progress_bar_config(disable=False) + + folder = f"paddle_attn_{attention_type}_fp16" if args.use_fp16 else f"paddle_attn_{attention_type}_fp32" + os.makedirs(folder, exist_ok=True) + if args.task_name in ["text2img", "all"]: + init_image = load_image( + "https://paddlenlp.bj.bcebos.com/models/community/junnyu/develop/control_bird_canny_demo.png" + ) + # text2img + prompt = "bird" + time_costs = [] + # warmup + pipe( + prompt, + num_inference_steps=10, + height=height, + width=width, + ) + print("==> Test text2img performance.") + for step in trange(args.benchmark_steps): + start = time.time() + paddle.seed(seed) + images = pipe( + prompt, + num_inference_steps=args.inference_steps, + height=height, + width=width, + ).images + latency = time.time() - start + time_costs += [latency] + # print(f"No {step:3d} time cost: {latency:2f} s") + print( + f"Attention type: {attention_type}, " + f"Use fp16: {'true' if args.use_fp16 else 'false'}, " + f"Mean iter/sec: {1 / (np.mean(time_costs) / args.inference_steps):2f} it/s, " + f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, " + f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s." + ) + images[0].save(f"{folder}/text2img.png") + + +if __name__ == "__main__": + args = parse_arguments() + main(args) diff --git a/ppdiffusers/deploy/sd3/infer_dygraph_torch.py b/ppdiffusers/deploy/sd3/infer_dygraph_torch.py new file mode 100644 index 000000000..14c547b56 --- /dev/null +++ b/ppdiffusers/deploy/sd3/infer_dygraph_torch.py @@ -0,0 +1,325 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import os +import time + +import torch + +# torch.nn.functional.scaled_dot_product_attention_ = torch.nn.functional.scaled_dot_product_attention +# delattr(torch.nn.functional, "scaled_dot_product_attention") + +import cv2 +import numpy as np +from diffusers import ( + FlowMatchEulerDiscreteScheduler, + DDIMScheduler, + DDPMScheduler, + DEISMultistepScheduler, + DPMSolverMultistepScheduler, + DPMSolverSinglestepScheduler, + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + HeunDiscreteScheduler, + KDPM2AncestralDiscreteScheduler, + KDPM2DiscreteScheduler, + LMSDiscreteScheduler, + PNDMScheduler, + StableDiffusion3Pipeline, + UniPCMultistepScheduler, +) +from diffusers.models.attention_processor import AttnProcessor, AttnProcessor2_0 +from diffusers.utils import load_image +from PIL import Image +from tqdm.auto import trange + + + +def strtobool(v): + if isinstance(v, bool): + return v + if v.lower() in ("yes", "true", "t", "y", "1"): + return True + elif v.lower() in ("no", "false", "f", "n", "0"): + return False + else: + raise ValueError( + f"Truthy value expected: got {v} but expected one of yes/no, true/false, t/f, y/n, 1/0 (case insensitive)." + ) + + +def change_scheduler(self, scheduler_type="ddim"): + self.orginal_scheduler_config = self.scheduler.config + scheduler_type = scheduler_type.lower() + if scheduler_type == "flow": + scheduler = FlowMatchEulerDiscreteScheduler.from_config(self.orginal_scheduler_config, skip_prk_steps=True) + elif scheduler_type == "pndm": + scheduler = PNDMScheduler.from_config(self.orginal_scheduler_config, skip_prk_steps=True) + elif scheduler_type == "lms": + scheduler = LMSDiscreteScheduler.from_config(self.orginal_scheduler_config) + elif scheduler_type == "heun": + scheduler = HeunDiscreteScheduler.from_config(self.orginal_scheduler_config) + elif scheduler_type == "euler": + scheduler = EulerDiscreteScheduler.from_config(self.orginal_scheduler_config) + elif scheduler_type == "euler-ancestral": + scheduler = EulerAncestralDiscreteScheduler.from_config(self.orginal_scheduler_config) + elif scheduler_type == "dpm-multi": + scheduler = DPMSolverMultistepScheduler.from_config(self.orginal_scheduler_config) + elif scheduler_type == "dpm-single": + scheduler = DPMSolverSinglestepScheduler.from_config(self.orginal_scheduler_config) + elif scheduler_type == "kdpm2-ancestral": + scheduler = KDPM2AncestralDiscreteScheduler.from_config(self.orginal_scheduler_config) + elif scheduler_type == "kdpm2": + scheduler = KDPM2DiscreteScheduler.from_config(self.orginal_scheduler_config) + elif scheduler_type == "unipc-multi": + scheduler = UniPCMultistepScheduler.from_config(self.orginal_scheduler_config) + elif scheduler_type == "ddim": + scheduler = DDIMScheduler.from_config( + self.orginal_scheduler_config, + steps_offset=1, + clip_sample=False, + set_alpha_to_one=False, + ) + elif scheduler_type == "ddpm": + scheduler = DDPMScheduler.from_config( + self.orginal_scheduler_config, + ) + elif 
scheduler_type == "deis-multi": + scheduler = DEISMultistepScheduler.from_config( + self.orginal_scheduler_config, + ) + else: + raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!") + return scheduler + + +def parse_arguments(): + + parser = argparse.ArgumentParser() + parser.add_argument( + "--pretrained_model_name_or_path", + type=str, + default="stabilityai/stable-diffusion-3-medium-diffusers", + help="Path to the `diffusers` checkpoint to convert (either a local directory or on the bos).", + ) + parser.add_argument( + "--inference_steps", + type=int, + default=50, + help="The number of unet inference steps.", + ) + parser.add_argument( + "--benchmark_steps", + type=int, + default=10, + help="The number of performance benchmark steps.", + ) + parser.add_argument( + "--task_name", + type=str, + default="all", + choices=[ + "text2img", + "img2img", + "inpaint_legacy", + "all", + ], + help="The task can be one of [text2img, img2img, inpaint_legacy, all]. ", + ) + parser.add_argument( + "--parse_prompt_type", + type=str, + default="raw", + choices=[ + "raw", + "lpw", + ], + help="The parse_prompt_type can be one of [raw, lpw]. ", + ) + parser.add_argument( + "--channels_last", + type=strtobool, + default=False, + help="Wheter to use channels_last", + ) + parser.add_argument("--use_fp16", type=strtobool, default=True, help="Wheter to use FP16 mode") + parser.add_argument("--tf32", type=strtobool, default=True, help="tf32") + parser.add_argument("--compile", type=strtobool, default=False, help="compile") + parser.add_argument( + "--attention_type", + type=str, + default="sdp", + choices=[ + "raw", + "sdp", + ], + help="attention_type.", + ) + parser.add_argument("--device_id", type=int, default=0, help="The selected gpu id. -1 means use cpu") + parser.add_argument( + "--scheduler", + type=str, + default="euler-ancestral", + choices=[ + "flow", + "pndm", + "lms", + "euler", + "euler-ancestral", + "dpm-multi", + "dpm-single", + "unipc-multi", + "ddim", + "ddpm", + "deis-multi", + "heun", + "kdpm2-ancestral", + "kdpm2", + ], + help="The scheduler type of stable diffusion.", + ) + parser.add_argument("--height", type=int, default=512, help="Height of input image") + parser.add_argument("--width", type=int, default=512, help="Width of input image") + parser.add_argument("--strength", type=float, default=1.0, help="Strength for img2img / inpaint") + return parser.parse_args() + + +def attn_processors(self): + processors = {} + + def fn_recursive_add_processors(name: str, module, processors): + if hasattr(module, "set_processor"): + processors[f"{name}.processor"] = module.processor + + for sub_name, child in module.named_children(): + fn_recursive_add_processors(f"{name}.{sub_name}", child, processors) + + return processors + + for name, module in self.named_children(): + fn_recursive_add_processors(name, module, processors) + + return processors + + +def set_attn_processor(self, processor): + count = len(attn_processors(self).keys()) + + if isinstance(processor, dict) and len(processor) != count: + raise ValueError( + f"A dict of processors was passed, but the number of processors {len(processor)} does not match the" + f" number of attention layers: {count}. Please make sure to pass {count} processor classes." 
+ ) + + def fn_recursive_attn_processor(name: str, module, processor): + if hasattr(module, "set_processor"): + if not isinstance(processor, dict): + module.set_processor(processor) + else: + module.set_processor(processor.pop(f"{name}.processor")) + + for sub_name, child in module.named_children(): + fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor) + + for name, module in self.named_children(): + fn_recursive_attn_processor(name, module, processor) + + +def main(args): + if args.tf32: + torch.backends.cuda.matmul.allow_tf32 = True + else: + torch.backends.cuda.matmul.allow_tf32 = False + + seed = 1024 + torch_dtype = torch.float16 if args.use_fp16 else torch.float32 + pipe = StableDiffusion3Pipeline.from_pretrained( + args.pretrained_model_name_or_path, + safety_checker=None, + feature_extractor=None, + requires_safety_checker=False, + torch_dtype=torch_dtype, + ) + scheduler = change_scheduler(pipe, args.scheduler) + pipe.scheduler = scheduler + if args.device_id >= 0: + pipe.to(f"cuda:{args.device_id}") + + if args.attention_type == "all": + args.attention_type = ["raw", "sdp"] + else: + args.attention_type = [args.attention_type] + + for attention_type in args.attention_type: + # attn_prrocessor_cls = AttnProcessor if attention_type == "raw" else AttnProcessor2_0 + # if attention_type == "sdp": + # torch.nn.functional.scaled_dot_product_attention = torch.nn.functional.scaled_dot_product_attention_ + # set_attn_processor(pipe.transformer, attn_prrocessor_cls()) + # set_attn_processor(pipe.vae, attn_prrocessor_cls()) + + # if args.channels_last: + # pipe.transformer.to(memory_format=torch.channels_last) + + # if args.compile: + # print("Run torch compile") + # pipe.unet = torch.compile(pipe.transformer, mode="reduce-overhead", fullgraph=True) + + width = args.width + height = args.height + pipe.set_progress_bar_config(disable=False) + + folder = f"torch_attn_{attention_type}_fp16" if args.use_fp16 else f"torch_attn_{attention_type}_fp32" + os.makedirs(folder, exist_ok=True) + if args.task_name in ["text2img", "all"]: + init_image = load_image( + "https://paddlenlp.bj.bcebos.com/models/community/junnyu/develop/control_bird_canny_demo.png" + ) + # text2img + prompt = "bird" + time_costs = [] + # warmup + pipe( + prompt, + num_inference_steps=10, + height=height, + width=width, + ) + print("==> Test text2img performance.") + for step in trange(args.benchmark_steps): + start = time.time() + torch.cuda.manual_seed(seed) + images = pipe( + prompt, + num_inference_steps=args.inference_steps, + height=height, + width=width, + ).images + latency = time.time() - start + time_costs += [latency] + # print(f"No {step:3d} time cost: {latency:2f} s") + print( + f"Attention type: {attention_type}, " + f"Use fp16: {'true' if args.use_fp16 else 'false'}, " + f"Mean iter/sec: {1 / (np.mean(time_costs) / args.inference_steps):2f} it/s, " + f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, " + f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s." + ) + images[0].save(f"{folder}/text2img.png") + + + +if __name__ == "__main__": + args = parse_arguments() + main(args) diff --git a/ppdiffusers/deploy/sd3/scripts/benchmark_paddle.sh b/ppdiffusers/deploy/sd3/scripts/benchmark_paddle.sh new file mode 100644 index 000000000..a0c2d8d45 --- /dev/null +++ b/ppdiffusers/deploy/sd3/scripts/benchmark_paddle.sh @@ -0,0 +1,32 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# attention raw fp16
+python infer_dygraph_paddle.py --scheduler "flow" --task_name all --attention_type raw --use_fp16 True --inference_steps 50 --height 1024 --width 1024 --benchmark_steps 10
+
+# attention cutlass fp16
+python infer_dygraph_paddle.py --scheduler "flow" --task_name all --attention_type cutlass --use_fp16 True --inference_steps 50 --height 1024 --width 1024 --benchmark_steps 10
+
+# attention flash fp16
+python infer_dygraph_paddle.py --scheduler "flow" --task_name all --attention_type flash --use_fp16 True --inference_steps 50 --height 1024 --width 1024 --benchmark_steps 10
+
+
+# attention raw fp32
+python infer_dygraph_paddle.py --scheduler "flow" --task_name all --attention_type raw --use_fp16 False --inference_steps 50 --height 1024 --width 1024 --benchmark_steps 10
+
+# attention cutlass fp32
+python infer_dygraph_paddle.py --scheduler "flow" --task_name all --attention_type cutlass --use_fp16 False --inference_steps 50 --height 1024 --width 1024 --benchmark_steps 10
+
+# attention flash fp32
+python infer_dygraph_paddle.py --scheduler "flow" --task_name all --attention_type flash --use_fp16 False --inference_steps 50 --height 1024 --width 1024 --benchmark_steps 10
diff --git a/ppdiffusers/deploy/sd3/scripts/benchmark_torch.sh b/ppdiffusers/deploy/sd3/scripts/benchmark_torch.sh
new file mode 100644
index 000000000..020c54969
--- /dev/null
+++ b/ppdiffusers/deploy/sd3/scripts/benchmark_torch.sh
@@ -0,0 +1,21 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# sd3 does not support attention raw
+
+# attention sdp fp16
+python infer_dygraph_torch.py --scheduler "flow" --task_name all --attention_type sdp --use_fp16 True --inference_steps 50 --height 1024 --width 1024 --benchmark_steps 10
+
+# attention sdp fp32
+python infer_dygraph_torch.py --scheduler "flow" --task_name all --attention_type sdp --use_fp16 False --inference_steps 50 --height 1024 --width 1024 --benchmark_steps 10
\ No newline at end of file
diff --git a/ppdiffusers/ppdiffusers/loaders/deprecate.py b/ppdiffusers/ppdiffusers/loaders/deprecate.py
index 2b88f5aeb..e1e72424b 100644
--- a/ppdiffusers/ppdiffusers/loaders/deprecate.py
+++ b/ppdiffusers/ppdiffusers/loaders/deprecate.py
@@ -19,7 +19,7 @@ def text_encoder_lora_state_dict(text_encoder):
     deprecate(
         "text_encoder_load_state_dict in `models`",
-        "0.27.0",
+        "0.45.0",
         "`text_encoder_lora_state_dict` is deprecated and will be removed in 0.27.0.
Make sure to retrieve the weights using `get_peft_model`. See https://huggingface.co/docs/peft/v0.6.2/en/quicktour#peftmodel for more information.", ) state_dict = {} @@ -45,7 +45,7 @@ def text_encoder_lora_state_dict(text_encoder): def text_encoder_attn_modules(text_encoder): deprecate( "text_encoder_attn_modules in `models`", - "0.27.0", + "0.45.0", "`text_encoder_lora_state_dict` is deprecated and will be removed in 0.27.0. Make sure to retrieve the weights using `get_peft_model`. See https://huggingface.co/docs/peft/v0.6.2/en/quicktour#peftmodel for more information.", ) from ppdiffusers.transformers import CLIPTextModel, CLIPTextModelWithProjection diff --git a/ppdiffusers/ppdiffusers/models/attention_processor.py b/ppdiffusers/ppdiffusers/models/attention_processor.py index 3d6f1659b..3d20e0140 100644 --- a/ppdiffusers/ppdiffusers/models/attention_processor.py +++ b/ppdiffusers/ppdiffusers/models/attention_processor.py @@ -372,7 +372,7 @@ def set_processor(self, processor: "AttnProcessor", _remove_lora: bool = False) if not USE_PEFT_BACKEND and hasattr(self, "processor") and _remove_lora and self.to_q.lora_layer is not None: deprecate( "set_processor to offload LoRA", - "0.26.0", + "0.45.0", "In detail, removing LoRA layers via calling `set_default_attn_processor` is deprecated. Please make sure to call `pipe.unload_lora_weights()` instead.", ) # TODO(Patrick, Sayak) - this can be deprecated once PEFT LoRA integration is complete @@ -1645,7 +1645,7 @@ def __call__(self, attn: Attention, hidden_states: paddle.Tensor, *args, **kwarg self_cls_name = self.__class__.__name__ deprecate( self_cls_name, - "0.26.0", + "0.45.0", ( f"Make sure use {self_cls_name[4:]} instead by setting" "LoRA layers to `self.{to_q,to_k,to_v,to_out[0]}.lora_layer` respectively. This will be done automatically when using" @@ -1724,7 +1724,7 @@ def __call__(self, attn: Attention, hidden_states: paddle.Tensor, *args, **kwarg self_cls_name = self.__class__.__name__ deprecate( self_cls_name, - "0.26.0", + "0.45.0", ( f"Make sure use {self_cls_name[4:]} instead by setting" "LoRA layers to `self.{to_q,to_k,to_v,add_k_proj,add_v_proj,to_out[0]}.lora_layer` respectively. This will be done automatically when using" @@ -1783,7 +1783,7 @@ def __call__(self, attn: Attention, hidden_states: paddle.Tensor, *args, **kwarg self_cls_name = self.__class__.__name__ deprecate( self_cls_name, - "0.26.0", + "0.45.0", ( f"Make sure use {self_cls_name[4:]} instead by setting" "LoRA layers to `self.{to_q,to_k,to_v,add_k_proj,add_v_proj,to_out[0]}.lora_layer` respectively. 
This will be done automatically when using" diff --git a/ppdiffusers/ppdiffusers/peft/tuners/lora/layer.py b/ppdiffusers/ppdiffusers/peft/tuners/lora/layer.py index 907a92776..80debd760 100644 --- a/ppdiffusers/ppdiffusers/peft/tuners/lora/layer.py +++ b/ppdiffusers/ppdiffusers/peft/tuners/lora/layer.py @@ -165,7 +165,7 @@ def reset_lora_parameters(self, adapter_name, init_lora_weights): else: raise ValueError(f"Unknown initialization {init_lora_weights=}") nn.init.zeros_(self.lora_B[adapter_name].weight) - if adapter_name in self.lora_embedding_A.keys(): + if adapter_name in dict(self.lora_embedding_A).keys(): # initialize a the same way as the default for nn.linear and b to zero nn.init.zeros_(self.lora_embedding_A[adapter_name]) nn.init.normal_(self.lora_embedding_B[adapter_name]) diff --git a/ppdiffusers/ppdiffusers/peft/tuners/tuners_utils.py b/ppdiffusers/ppdiffusers/peft/tuners/tuners_utils.py index 66467f94c..5f643a017 100644 --- a/ppdiffusers/ppdiffusers/peft/tuners/tuners_utils.py +++ b/ppdiffusers/ppdiffusers/peft/tuners/tuners_utils.py @@ -416,6 +416,8 @@ def set_adapter(self, adapter_names: str | list[str]) -> None: # Deactivate grads on the inactive adapter and activate grads on the active adapter for layer_name in self.adapter_layer_names: module_dict = getattr(self, layer_name) + if isinstance(module_dict, paddle.nn.ParameterDict): + module_dict = dict(module_dict) for key, layer in module_dict.items(): if key in adapter_names: # Note: It is possible that not a single layer is called with requires_grad_(True) here. This may diff --git a/ppdiffusers/ppdiffusers/pipelines/pipeline_utils.py b/ppdiffusers/ppdiffusers/pipelines/pipeline_utils.py index 1ba2d67ab..ec9c5e4c4 100644 --- a/ppdiffusers/ppdiffusers/pipelines/pipeline_utils.py +++ b/ppdiffusers/ppdiffusers/pipelines/pipeline_utils.py @@ -688,10 +688,10 @@ def to(self, *args, **kwargs): paddle_dtype = kwargs.pop("paddle_dtype", None) if paddle_dtype is not None: - deprecate("paddle_dtype", "0.35.0", "") + deprecate("paddle_dtype", "0.45.0", "") paddle_device = kwargs.pop("paddle_device", None) if paddle_device is not None: - deprecate("paddle_device", "0.35.0", "") + deprecate("paddle_device", "0.45.0", "") dtype_kwarg = kwargs.pop("dtype", None) device_kwarg = kwargs.pop("device", None) diff --git a/tests/test_tipc/dygraph/dp/stable_diffusion_3/N1C1/stable_diffusion_3-dreambooth_ft_bs1_fp16_DP.sh b/tests/test_tipc/dygraph/dp/stable_diffusion_3/N1C1/stable_diffusion_3-dreambooth_ft_bs1_fp16_DP.sh new file mode 100644 index 000000000..2d2242d30 --- /dev/null +++ b/tests/test_tipc/dygraph/dp/stable_diffusion_3/N1C1/stable_diffusion_3-dreambooth_ft_bs1_fp16_DP.sh @@ -0,0 +1,27 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +model_item=stable_diffusion_3-dreambooth_ft +model=stable_diffusion_3 +bs_item=1 +fp_item=fp16 +run_mode=DP +device_num=N1C1 +max_iter=1000 +num_workers=0 + +# get data +bash ./test_tipc/dygraph/dp/${model}/benchmark_common/prepare.sh +# run +bash ./test_tipc/dygraph/dp/${model}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1; diff --git a/tests/test_tipc/dygraph/dp/stable_diffusion_3/N1C1/stable_diffusion_3-dreambooth_ft_bs4_fp16_DP.sh b/tests/test_tipc/dygraph/dp/stable_diffusion_3/N1C1/stable_diffusion_3-dreambooth_ft_bs4_fp16_DP.sh new file mode 100644 index 000000000..1fc7d081b --- /dev/null +++ b/tests/test_tipc/dygraph/dp/stable_diffusion_3/N1C1/stable_diffusion_3-dreambooth_ft_bs4_fp16_DP.sh @@ -0,0 +1,27 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +model_item=stable_diffusion_3-dreambooth_ft +model=stable_diffusion_3 +bs_item=4 +fp_item=fp16 +run_mode=DP +device_num=N1C1 +max_iter=1000 +num_workers=0 + +# get data +bash ./test_tipc/dygraph/dp/${model}/benchmark_common/prepare.sh +# run +bash ./test_tipc/dygraph/dp/${model}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1; diff --git a/tests/test_tipc/dygraph/dp/stable_diffusion_3/N1C1/stable_diffusion_3-dreambooth_lora_bs1_fp16_DP.sh b/tests/test_tipc/dygraph/dp/stable_diffusion_3/N1C1/stable_diffusion_3-dreambooth_lora_bs1_fp16_DP.sh new file mode 100644 index 000000000..6d561ee48 --- /dev/null +++ b/tests/test_tipc/dygraph/dp/stable_diffusion_3/N1C1/stable_diffusion_3-dreambooth_lora_bs1_fp16_DP.sh @@ -0,0 +1,27 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +model_item=stable_diffusion_3-dreambooth_lora +model=stable_diffusion_3 +bs_item=1 +fp_item=fp16 +run_mode=DP +device_num=N1C1 +max_iter=1000 +num_workers=0 + +# get data +bash ./test_tipc/dygraph/dp/${model}/benchmark_common/prepare.sh +# run +bash ./test_tipc/dygraph/dp/${model}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1; diff --git a/tests/test_tipc/dygraph/dp/stable_diffusion_3/N1C1/stable_diffusion_3-dreambooth_lora_bs4_fp16_DP.sh b/tests/test_tipc/dygraph/dp/stable_diffusion_3/N1C1/stable_diffusion_3-dreambooth_lora_bs4_fp16_DP.sh new file mode 100644 index 000000000..13d90c9ba --- /dev/null +++ b/tests/test_tipc/dygraph/dp/stable_diffusion_3/N1C1/stable_diffusion_3-dreambooth_lora_bs4_fp16_DP.sh @@ -0,0 +1,27 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +model_item=stable_diffusion_3-dreambooth_lora +model=stable_diffusion_3 +bs_item=4 +fp_item=fp16 +run_mode=DP +device_num=N1C1 +max_iter=1000 +num_workers=0 + +# get data +bash ./test_tipc/dygraph/dp/${model}/benchmark_common/prepare.sh +# run +bash ./test_tipc/dygraph/dp/${model}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1; diff --git a/tests/test_tipc/dygraph/dp/stable_diffusion_3/N1C8/stable_diffusion_3-dreambooth_ft_bs1_fp16_DP.sh b/tests/test_tipc/dygraph/dp/stable_diffusion_3/N1C8/stable_diffusion_3-dreambooth_ft_bs1_fp16_DP.sh new file mode 100644 index 000000000..fe454c0ef --- /dev/null +++ b/tests/test_tipc/dygraph/dp/stable_diffusion_3/N1C8/stable_diffusion_3-dreambooth_ft_bs1_fp16_DP.sh @@ -0,0 +1,27 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +model_item=stable_diffusion_3-dreambooth_ft +model=stable_diffusion_3 +bs_item=1 +fp_item=fp16 +run_mode=DP +device_num=N1C8 +max_iter=1000 +num_workers=0 + +# get data +bash ./test_tipc/dygraph/dp/${model}/benchmark_common/prepare.sh +# run +bash ./test_tipc/dygraph/dp/${model}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1; diff --git a/tests/test_tipc/dygraph/dp/stable_diffusion_3/N1C8/stable_diffusion_3-dreambooth_ft_bs4_fp16_DP.sh b/tests/test_tipc/dygraph/dp/stable_diffusion_3/N1C8/stable_diffusion_3-dreambooth_ft_bs4_fp16_DP.sh new file mode 100644 index 000000000..ded063935 --- /dev/null +++ b/tests/test_tipc/dygraph/dp/stable_diffusion_3/N1C8/stable_diffusion_3-dreambooth_ft_bs4_fp16_DP.sh @@ -0,0 +1,27 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +model_item=stable_diffusion_3-dreambooth_ft +model=stable_diffusion_3 +bs_item=4 +fp_item=fp16 +run_mode=DP +device_num=N1C8 +max_iter=1000 +num_workers=0 + +# get data +bash ./test_tipc/dygraph/dp/${model}/benchmark_common/prepare.sh +# run +bash ./test_tipc/dygraph/dp/${model}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1; diff --git a/tests/test_tipc/dygraph/dp/stable_diffusion_3/N1C8/stable_diffusion_3-dreambooth_lora_bs1_fp16_DP.sh b/tests/test_tipc/dygraph/dp/stable_diffusion_3/N1C8/stable_diffusion_3-dreambooth_lora_bs1_fp16_DP.sh new file mode 100644 index 000000000..686428ad5 --- /dev/null +++ b/tests/test_tipc/dygraph/dp/stable_diffusion_3/N1C8/stable_diffusion_3-dreambooth_lora_bs1_fp16_DP.sh @@ -0,0 +1,27 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +model_item=stable_diffusion_3-dreambooth_lora +model=stable_diffusion_3 +bs_item=1 +fp_item=fp16 +run_mode=DP +device_num=N1C8 +max_iter=1000 +num_workers=0 + +# get data +bash ./test_tipc/dygraph/dp/${model}/benchmark_common/prepare.sh +# run +bash ./test_tipc/dygraph/dp/${model}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1; diff --git a/tests/test_tipc/dygraph/dp/stable_diffusion_3/N1C8/stable_diffusion_3-dreambooth_lora_bs4_fp16_DP.sh b/tests/test_tipc/dygraph/dp/stable_diffusion_3/N1C8/stable_diffusion_3-dreambooth_lora_bs4_fp16_DP.sh new file mode 100644 index 000000000..ead2da890 --- /dev/null +++ b/tests/test_tipc/dygraph/dp/stable_diffusion_3/N1C8/stable_diffusion_3-dreambooth_lora_bs4_fp16_DP.sh @@ -0,0 +1,27 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +model_item=stable_diffusion_3-dreambooth_lora +model=stable_diffusion_3 +bs_item=4 +fp_item=fp16 +run_mode=DP +device_num=N1C8 +max_iter=1000 +num_workers=0 + +# get data +bash ./test_tipc/dygraph/dp/${model}/benchmark_common/prepare.sh +# run +bash ./test_tipc/dygraph/dp/${model}/benchmark_common/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_iter} ${num_workers} 2>&1; diff --git a/tests/test_tipc/dygraph/dp/stable_diffusion_3/benchmark_common/analysis_log.py b/tests/test_tipc/dygraph/dp/stable_diffusion_3/benchmark_common/analysis_log.py new file mode 100644 index 000000000..7606e8adc --- /dev/null +++ b/tests/test_tipc/dygraph/dp/stable_diffusion_3/benchmark_common/analysis_log.py @@ -0,0 +1,155 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
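+
+# Log analyzer for the SD3 TIPC benchmark: extracts it/s and loss values from the
+# training log, smooths the loss with an exponential moving average, and writes a
+# single benchmark-record JSON to the result log file.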
+
+import json
+import os
+import re
+import sys
+
+from numpy import mean
+
+
+class TimeAnalyzer(object):
+    def __init__(self, filename, keyword=None, loss_keyword=None):
+        if filename is None:
+            raise Exception("Please specify the filename!")
+
+        if keyword is None:
+            raise Exception("Please specify the keyword!")
+
+        self.filename = filename
+        self.keyword = keyword
+        self.loss_keyword = loss_keyword
+
+    def get_ips(self):
+        ips_list = []
+        loss_list = []
+        loss_value = None
+        with open(self.filename, "r") as f_object:
+            lines = f_object.read().splitlines()
+            for line in lines:
+                if self.keyword not in line:
+                    continue
+                try:
+                    # Extract ips (iterations per second).
+                    ips_match = re.search(r'(\d+\.\d+)it/s', line)
+                    if ips_match:
+                        ips = float(ips_match.group(1))
+                        ips_list.append(ips)
+
+                    # Extract the loss value.
+                    loss_match = re.search(r'loss=(\d+\.\d+)', line)
+                    if loss_match:
+                        loss = float(loss_match.group(1))
+                        loss_list.append(loss)
+                        loss_value = loss
+                except Exception as exc:
+                    print("line is: {}; failed".format(line))
+                    print("Exception: {}".format(exc))
+        if loss_value is None:
+            loss_value = -1
+
+        def ewma(data, alpha):
+            # Exponentially weighted moving average of the loss curve.
+            smoothed_data = []
+            for i, value in enumerate(data):
+                if i == 0:
+                    smoothed_data.append(value)
+                else:
+                    smoothed_value = alpha * value + (1 - alpha) * smoothed_data[-1]
+                    smoothed_data.append(smoothed_value)
+            return smoothed_data
+
+        smoothed_loss = ewma(loss_list, 0.9)[-1] if loss_list else -1
+        return mean(ips_list[4:]), loss_value, smoothed_loss
+
+
+def analyze(model_item, log_file, res_log_file, device_num, bs, fp_item):
+    analyzer = TimeAnalyzer(log_file, 'Steps:', None)
+    ips, convergence_value, smoothed_value = analyzer.get_ips()
+    ips = round(ips, 3)
+    ngpus = int(re.findall(r"\d+", device_num)[-1])
+    batch_size = int(re.findall(r"\d+", str(bs))[-1])
+    print("----ips: ", ips, "ngpus", ngpus, "batch_size", batch_size)
+    # Convert per-iteration throughput to samples/sec across all cards.
+    ips *= batch_size
+    ips *= ngpus
+    run_mode = "DP"
+
+    model_name = model_item + "_" + "bs" + str(bs) + "_" + fp_item + "_" + run_mode
+    info = {
+        "model_branch": os.getenv("model_branch"),
+        "model_commit": os.getenv("model_commit"),
+        "model_name": model_name,
+        "batch_size": bs,
+        "fp_item": fp_item,
+        "run_mode": run_mode,
+        "convergence_value": convergence_value,
+        "smoothed_value": smoothed_value,
+        "convergence_key": "",
+        "ips": ips,
+        "speed_unit": "sample/sec",
+        "device_num": device_num,
+        "model_run_time": os.getenv("model_run_time"),
+        "frame_commit": "",
+        "frame_version": os.getenv("frame_version"),
+    }
+    json_info = json.dumps(info)
+    print(json_info)
+    with open(res_log_file, "w") as of:
+        of.write(json_info)
+
+
+if __name__ == "__main__":
+    if len(sys.argv) != 7:
+        print("Usage: " + sys.argv[0] + " model_item log_file res_log_file device_num bs fp_item")
+        sys.exit()
+
+    model_item = sys.argv[1]
+    log_file = sys.argv[2]
+    res_log_file = sys.argv[3]
+    device_num = sys.argv[4]
+    bs = int(sys.argv[5])
+    fp_item = sys.argv[6]
+
+    analyze(model_item, log_file, res_log_file, device_num, bs, fp_item)
\ No newline at end of file
diff --git a/tests/test_tipc/dygraph/dp/stable_diffusion_3/benchmark_common/prepare.sh b/tests/test_tipc/dygraph/dp/stable_diffusion_3/benchmark_common/prepare.sh
new file mode 100644
index 000000000..6b6dbdf7c
--- /dev/null
+++ b/tests/test_tipc/dygraph/dp/stable_diffusion_3/benchmark_common/prepare.sh
@@ -0,0 +1,59 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+if [ ! -d "stable-diffusion-3-medium-diffusers-paddle-init" ]; then
+    echo "Downloading stable-diffusion-3-medium-diffusers-paddle-init.tar.gz..."
+    wget https://bj.bcebos.com/paddlenlp/models/community/westfish/sd3_benchmark/stable-diffusion-3-medium-diffusers-paddle-init.tar.gz
+    echo "Extracting stable-diffusion-3-medium-diffusers-paddle-init.tar.gz..."
+    tar -zxvf stable-diffusion-3-medium-diffusers-paddle-init.tar.gz
+else
+    echo "Directory stable-diffusion-3-medium-diffusers-paddle-init already exists. Skipping download."
+fi
+
+if [ ! -d "dog" ]; then
+    echo "Downloading dog.zip..."
+    wget https://paddlenlp.bj.bcebos.com/models/community/westfish/develop-sdxl/dog.zip
+    echo "Unzipping dog.zip..."
+    unzip dog.zip
+else
+    echo "Directory dog already exists. Skipping download."
+fi
+
+
+RUN_SETUP=${RUN_SETUP:-"true"}
+if [ "$RUN_SETUP" = "true" ]; then
+    echo "Running setup and installation steps..."
+
+    export PYTHONPATH=$(dirname "$PWD"):$PYTHONPATH
+    python -m pip install --upgrade pip
+    # python -m pip install paddlepaddle-gpu==3.0.0b2 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/
+    python -m pip install einops
+    python -m pip install -r ../requirements.txt
+    python -m pip install --upgrade pybind11 regex sentencepiece tqdm visualdl attrdict easydict pyyaml paddlesde
+    python -m pip install paddlenlp==3.0.0b2
+    python -m pip install huggingface-hub==0.23.0
+
+    # uninstall any released ppdiffusers and install the develop ppdiffusers from source
+    python -m pip uninstall -y ppdiffusers
+    cd ../ppdiffusers/
+    python -m pip install -e .
+    cd -
+    cd ../ppdiffusers/examples/dreambooth
+    pip install -r requirements_sd3.txt
+    cd -
+    python -m pip list
+else
+    echo "fast mode, skipping setup and installation steps as RUN_SETUP is set to false."
+fi
diff --git a/tests/test_tipc/dygraph/dp/stable_diffusion_3/benchmark_common/run_benchmark.sh b/tests/test_tipc/dygraph/dp/stable_diffusion_3/benchmark_common/run_benchmark.sh
new file mode 100644
index 000000000..7d6e3c26b
--- /dev/null
+++ b/tests/test_tipc/dygraph/dp/stable_diffusion_3/benchmark_common/run_benchmark.sh
@@ -0,0 +1,188 @@
+#!/usr/bin/env bash
+
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Test training benchmark for a model.
+# Usage: bash benchmark/run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num}
+function _set_params(){
+    model_item=${1:-"stable_diffusion_3-dreambooth_ft"}   # (required) model item, e.g. fastscnn|segformer_b0|ocrnet_hrnetw48
+    base_batch_size=${2:-"1"}       # (required) for single-process static-graph runs this is the per-card batch size; multiply by the card count at training time
+    fp_item=${3:-"fp32"}            # (required) fp32|fp16|bf16
+    run_mode=${4:-"DP"}             # (required) MP model parallel | DP data parallel | PP pipeline parallel | hybrid parallel DP1-MP1-PP1 | DP1-MP4-PP1
+    device_num=${5:-"N1C1"}         # (required) number of cards used, N1C1|N1C8|N4C32 (4 nodes, 32 cards)
+    profiling=${PROFILING:-"false"} # (required) profiling switch, off by default, passed in as a global variable
+
+    model_repo="PaddleMIX"          # (required) name of the model suite
+    speed_unit="sample/sec"         # (required) unit of the speed metric
+    skip_steps=0                    # (required) log parsing: number of unstable warm-up steps to skip
+    keyword="ips:"                  # (required) log parsing: keyword marking the lines that contain performance data
+    convergence_key="loss:"         # (optional) log parsing: keyword marking the lines that contain convergence data, e.g. convergence_key="loss:"
+    max_iter=${6:-"20"}             # (optional) keep the run within about 5 minutes; if code changes are needed to stop early, submit a PR to the suite; may also be max_epoch
+    num_workers=${7:-"5"}           # (optional)
+    is_large_model=False            # (optional) False for ordinary models; set to True for large models where only a single ips value is taken
+
+    # The following is generic launch logic; usually no changes are needed.
+    model_name=${model_item}_bs${base_batch_size}_${fp_item}_${run_mode}  # (required) do not change the format; keep it aligned with the competitor model names
+    device=${CUDA_VISIBLE_DEVICES//,/ }
+    arr=(${device})
+    num_gpu_devices=${#arr[*]}
+    run_log_path=${TRAIN_LOG_DIR:-$(pwd)}  # (required) TRAIN_LOG_DIR is set as a global variable by the benchmark framework
+    profiling_log_path=${PROFILING_LOG_DIR:-$(pwd)}  # (required) PROFILING_LOG_DIR is set as a global variable by the benchmark framework
+    speed_log_path=${LOG_PATH_INDEX_DIR:-$(pwd)}
+
+    train_log_file=${run_log_path}/${model_repo}_${model_name}_${device_num}_log
+    profiling_log_file=${profiling_log_path}/${model_repo}_${model_name}_${device_num}_profiling
+    speed_log_file=${speed_log_path}/${model_repo}_${model_name}_${device_num}_speed
+}
+
+function _train(){
+    batch_size=${base_batch_size}  # for multi-card single-process runs, compute the multi-card batch size here in _train
+    echo "current CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}, model_name=${model_name}, device_num=${device_num}, is profiling=${profiling}"
+    if [ ${profiling} = "true" ];then
+        add_options="--profiler_options=\"batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile\""
+        log_file=${profiling_log_file}
+    else
+        add_options=""
+        log_file=${train_log_file}
+    fi
+
+    # native dynamic graph
+    export FLAG_FUSED_LINEAR=0
+    export FLAGS_conv_workspace_size_limit=4096
+
+    export FLAGS_cudnn_deterministic=True
+    env |grep FLAG
+
+    if [ ${fp_item} = "fp32" ]; then
+        fp_item_cmd="no"
+    else
+        fp_item_cmd=${fp_item}
+    fi
+    echo "------------"
+    ls;
+    echo "------------"
+
+
+    if [ ${model_item} = "stable_diffusion_3-dreambooth_ft" ];then
+        train_cmd="
+            ../ppdiffusers/examples/dreambooth/train_dreambooth_sd3.py \
+            --pretrained_model_name_or_path=stable-diffusion-3-medium-diffusers-paddle-init \
+            --instance_data_dir=dog \
+            --output_dir=trained-sd3 \
+            --mixed_precision=${fp_item_cmd} \
+            --instance_prompt=a-photo-of-sks-dog \
+            --resolution=512 \
+            --train_batch_size=${batch_size} \
+            --gradient_accumulation_steps=4 \
+            --learning_rate=5e-5 \
            --report_to=tensorboard \
+            --lr_scheduler=constant \
+            --lr_warmup_steps=0 \
+            --max_train_steps=${max_iter} \
+            --validation_prompt=A-photo-of-sks-dog-in-a-bucket \
+            --validation_epochs=100 \
+            --num_validation_images 1 \
+            --seed=0 \
+            --checkpointing_steps=10000
+            "
+    else
+        export USE_PEFT_BACKEND=True
+        train_cmd="
+            ../ppdiffusers/examples/dreambooth/train_dreambooth_lora_sd3.py \
+            --pretrained_model_name_or_path=stable-diffusion-3-medium-diffusers-paddle-init \
+            --instance_data_dir=dog \
+            --output_dir=trained-sd3-lora \
+            --mixed_precision=${fp_item_cmd} \
+            --instance_prompt=a-photo-of-sks-dog \
+            --resolution=512 \
+            --train_batch_size=${batch_size} \
+            --gradient_accumulation_steps=4 \
+            --learning_rate=5e-5 \
+            --report_to=tensorboard \
+            --lr_scheduler=constant \
+            --lr_warmup_steps=0 \
+            --max_train_steps=${max_iter} \
+            --validation_prompt=A-photo-of-sks-dog-in-a-bucket \
+            --validation_epochs=100 \
+            --num_validation_images 1 \
+            --seed=0 \
+            --checkpointing_steps=10000
+            "
+    fi
+
+    # The following is generic launch logic; usually no changes are needed.
+    case ${run_mode} in
+    DP) if [[ ${device_num} = "N1C1" ]];then
+            echo "run ${run_mode} "
+            train_cmd="python -u ${train_cmd}"
+        else
+            rm -rf ./mylog   # remove the old log directory before launching
+            train_cmd="python -u -m paddle.distributed.launch --log_dir=./mylog --gpus=$CUDA_VISIBLE_DEVICES \
+                ${train_cmd}"
+        fi
+        ;;
+    DP1-MP1-PP1) echo "run run_mode: DP1-MP1-PP1" ;;
+    *) echo "choose run_mode "; exit 1;
+    esac
+
+    echo "train_cmd: ${train_cmd}  log_file: ${log_file}"
+    RUN_SLOW=${RUN_SLOW:-"true"}
+    if [ "$RUN_SLOW" = "true" ]; then
+        timeout 30m ${train_cmd} > ${log_file} 2>&1
+    else
+        echo "fast mode, only run 3m"
+        timeout 3m ${train_cmd} > ${log_file} 2>&1
+    fi
+    # eval ${train_cmd}
+    # eval "timeout 30m ${train_cmd} > ${log_file} 2>&1"
+    if [ $? -ne 0 ];then
+        echo -e "${model_name}, FAIL"
+    else
+        echo -e "${model_name}, SUCCESS"
+    fi
+    # kill -9 `ps -ef|grep 'python'|awk '{print $2}'`
+
+    if [ ${device_num} != "N1C1" -a -d mylog ]; then
+        rm ${log_file}
+        cp mylog/workerlog.0 ${log_file}
+    fi
+    echo ${train_cmd} >> ${log_file}
+    cat ${log_file}
+}
+
+function _analysis_log(){
+    # cd -
+    analysis_log_cmd="python test_tipc/dygraph/dp/stable_diffusion_3/benchmark_common/analysis_log.py \
+        ${model_item} ${log_file} ${speed_log_file} ${device_num} ${base_batch_size} ${fp_item}"
+    echo ${analysis_log_cmd}
+    eval ${analysis_log_cmd}
+}
+
+_set_params $@
+str_tmp=$(echo `pip list|grep paddlepaddle-gpu|awk -F ' ' '{print $2}'`)
+export frame_version=${str_tmp%%.post*}
+export frame_commit=$(echo `python -c "import paddle;print(paddle.version.commit)"`)
+export model_branch=`git symbolic-ref HEAD 2>/dev/null | cut -d"/" -f 3`
+export model_commit=$(git log|head -n1|awk '{print $2}')
+echo "---------frame_version is ${frame_version}"
+echo "---------Paddle commit is ${frame_commit}"
+echo "---------Model commit is ${model_commit}"
+echo "---------model_branch is ${model_branch}"
+
+job_bt=`date '+%Y%m%d%H%M%S'`
+_train
+job_et=`date '+%Y%m%d%H%M%S'`
+export model_run_time=$((${job_et}-${job_bt}))
+_analysis_log
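+
+# Overall flow: _set_params derives the run/profiling/speed log paths from the CLI
+# arguments and benchmark environment variables, _train launches the DreamBooth
+# full-finetune or LoRA training command under a timeout and captures its log, and
+# _analysis_log turns that log into a benchmark JSON record via analysis_log.py.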