Upstreaming MLPerf punet changes, server/harness support. #799
Open
monorimet wants to merge 92 commits into main (base) from merge_punet_sdxl
Changes from all commits (92 commits)
a3376d9 (eagarvey-amd) Bump punet revision to d30d6ff
7cabac0 (eagarvey-amd) Enable punet t2i test.
7dfd4c8 (eagarvey-amd) Use formatted strings as input to printer.
1cd3ee9 (eagarvey-amd) Rework sdxl test to setup with a pipeline, fix unloading submodels, f…
1a90abd (eagarvey-amd) Add switch for punet preprocessing flags
b70318d (eagarvey-amd) Xfail punet e2e test.
2d7ebcd (eagarvey-amd) Fixups to sdxl test arguments
feebc87 (eagarvey-amd) Fix flagset arg and enable vae encode.
af7782b (eagarvey-amd) Enable VAE encode validation, mark as xfail
eff59a9 (eagarvey-amd) Fix formatting
63fb053 (eagarvey-amd) fix runner function name in old sd test.
aff48ab (eagarvey-amd) Fix xfail syntax.
b10ad8d (eagarvey-amd) Update unet script for compile function signature change
321d21d (IanNod) Update punet to 4d4f955
2de912e (monorimet) Disable vulkan test on MI250 runner.
9fdc07f (eagarvey-amd) Change tqdm disable conditions and deepcopy model map on init.
b20be32 (monorimet) Don't break workarounds for model path
02705a9 (eagarvey-amd) Fix for passing a path as attn_spec.
9229aed (eagarvey-amd) Bump punet revision to defeb489fe2bb17b77d587924db9e58048a8c140
f09ef4a (eagarvey-amd) Move JIT cpu scheduling load helpers inside conditional.
bbcc424 (eagarvey-amd) formatting
1f19c7f (eagarvey-amd) Don't pass benchmark as an export arg.
39c0c00 (saienduri) Changes so no external downloads. (#781)
3c59b25 (saienduri) fix so that we check exact paths as well for is_prepared (#782)
2e9de46 (IanNod) Update punet to 60edc91
aa0ac2b (saienduri) Vae weight path none check (#784)
6556a36 (monorimet) Bump punet to mi300_all_sym_8_step10 (62785ea)
2c49cb6 (saienduri) Changes so that the default run without quant docker will work as wel…
cb911b1 (eagarvey-amd) Bump punet to 361df65844e0a7c766484707c57f6248cea9587f
d857f77 (saienduri) Sync flags to sdxl-scripts repo (#786)
37548f2 (nithinsubbiah) Integrate int8 tk kernels (#783)
25b2462 (monorimet) Update punet revision to deterministic version (42e9407)
0e57b4e (saienduri) Integration of tk kernels into pipeline (#789)
920dbf5 (saienduri) Update unet horizontal fusion flag (#790)
6f16731 (saienduri) Revert "Update unet horizontal fusion flag (#790)"
15dbd93 (nithinsubbiah) [tk kernel] Add support to match kernel with number of arguments and …
0c02652 (monorimet) Add functionality to SD pipeline and abstracted components for saving…
3fd954b (nithinsubbiah) Remove download links for tk kernels and instead specify kernel direc…
7f8a2b0 (saienduri) Update to best iteration on unet weights (#794)
bf63aec (nithinsubbiah) Add missing tk_kernel_args arg in function calls (#795)
a74d98e (saienduri) update hash for config file
925cd0c (eagarvey-amd) Fix formatting
7715fd0 (eagarvey-amd) Point to sdxl-vae-fix branch of iree-turbine.
e276c78 (eagarvey-amd) Add SD3 to sd_pipeline
de5d3de (monorimet) Update test_models.yml
d0d3ae6 (eagarvey-amd) Remove default in mmdit export args.
403fe47 (eagarvey-amd) set vae_harness to False in sdxl test.
0ac6b64 (eagarvey-amd) Switch to main branch of iree-turbine
1a41394 (monorimet) Update sd3_vae.py
493f260 (monorimet) Remove preprocess arg that fails to parse.
711403c (eagarvey-amd) SD3 updates, CLI arguments for multi-device
e554da8 (eagarvey-amd) Tweaks to requirements, scheduler filenames
cdd2f66 (monorimet) xfail stateless llama test
d23a45b (eagarvey-amd) Flag updates and parametrize a few more args.
7ecfece (eagarvey-amd) Merge branch 'merge_punet_sdxl' of https://github.com/nod-ai/SHARK-Tu…
2d7a92e (eagarvey-amd) Update SDXL tests, README for running on GFX942
18bffdb (eagarvey-amd) Fix vae script CLI and revert precision changes to sd3 text encoders …
df85dca (eagarvey-amd) Merge branch 'merge_punet_sdxl' of https://github.com/nod-ai/SHARK-Tu…
674128e (eagarvey-amd) Small fixes to compile modes and requirements
4d6198b (eagarvey-amd) Adds explicit model arch flag, remove commented code
f3e3fe3 (eagarvey-amd) Fix formatting
2ed8037 (monorimet) Merge branch 'main' into merge_punet_sdxl
7adfc7a (eagarvey-amd) Fix formatting
ff2c3c9 (monorimet) Update test_models.yml
afdb8d6 (eagarvey-amd) Decompose CLIP attention
a4e67e8 (eagarvey-amd) decompose implementation for clip
35517d9 (eagarvey-amd) Add decompose clip flag to pipe e2e test
6ca109a (eagarvey-amd) Add attention decomposition mechanism to sdxl clip exports.
453fb38 (eagarvey-amd) Update compile options for sdxl
c0be575 (eagarvey-amd) Decompose VAE for cpu
e3cd69d (eagarvey-amd) skip i8 punet test on cpu
e3e1dcb (eagarvey-amd) Don't use spec for clip by default
56d6ee7 (monorimet) Revert change to attention spec handling in sdxl test
d330564 (monorimet) Don't use td spec for clip bs2 export test
ffba3ea (monorimet) disable attn spec usage for sdxl bs2 on mi250 tests
fad7e6e (monorimet) Update test_models.yml
05fa32d (monorimet) Update test_models.yml
0291d43 (eagarvey-amd) Small fixes to SDXL inference pipeline/exports/compile
e337f2a (eagarvey-amd) Pin torch to 2.4.1
0fd8ad0 (eagarvey-amd) Largely disables attn spec usage.
e1c4ac2 (eagarvey-amd) Update canonicalization pass name, decouple model validation from pip…
61bb4ef (eagarvey-amd) Don't use punet spec.
dfb9474 (eagarvey-amd) Remove default/mfma/wmma specs from sd compile utils.
9fe20a6 (eagarvey-amd) Guard path check for attn spec
f39b2d2 (eagarvey-amd) Separate punet run
d3c8e80 (eagarvey-amd) typo fixes
40808db (eagarvey-amd) Filename fixes, explicit input dtypes for i8 punet
e630d39 (eagarvey-amd) Update CPU test configuration.
fc6d018 (eagarvey-amd) Decompose VAE for cpu
7d50dc8 (eagarvey-amd) Change compile flag reporting to CLI input
f140926 (eagarvey-amd) formatting
67e6558 (eagarvey-amd) Rework prompt encoder export on aot.export API
@@ -1,16 +1,16 @@
 protobuf
 gguf
-transformers==4.37.1
+transformers==4.43.3
 torchsde
 accelerate
 peft
 safetensors>=0.4.0
 diffusers @ git+https://github.com/nod-ai/[email protected]
 brevitas @ git+https://github.com/Xilinx/brevitas.git@6695e8df7f6a2c7715b9ed69c4b78157376bb60b
 # turbine tank downloading/uploading
 azure-storage-blob
 # microsoft/phi model
 einops
 pytest
 scipy
 shark-turbine @ git+https://github.com/iree-org/iree-turbine.git@main
 -e git+https://github.com/nod-ai/sharktank.git@main#egg=sharktank&subdirectory=sharktank
models/turbine_models/custom_models/sd3_inference/diffusers_ref.py (49 additions, 0 deletions)
@@ -0,0 +1,49 @@
from diffusers import StableDiffusion3Pipeline
import torch
from datetime import datetime as dt


def run_diffusers_cpu(
    hf_model_name,
    prompt,
    negative_prompt,
    guidance_scale,
    seed,
    height,
    width,
    num_inference_steps,
):
    from diffusers import StableDiffusion3Pipeline

    pipe = StableDiffusion3Pipeline.from_pretrained(
        hf_model_name, torch_dtype=torch.float32
    )
    pipe = pipe.to("cpu")
    generator = torch.Generator().manual_seed(int(seed))

    image = pipe(
        prompt=prompt,
        negative_prompt=negative_prompt,
        num_inference_steps=num_inference_steps,
        guidance_scale=guidance_scale,
        height=height,
        width=width,
        generator=generator,
    ).images[0]
    timestamp = dt.now().strftime("%Y-%m-%d_%H-%M-%S")
    image.save(f"diffusers_reference_output_{timestamp}.png")


if __name__ == "__main__":
    from turbine_models.custom_models.sd_inference.sd_cmd_opts import args

    run_diffusers_cpu(
        args.hf_model_name,
        args.prompt,
        args.negative_prompt,
        args.guidance_scale,
        args.seed,
        args.height,
        args.width,
        args.num_inference_steps,
    )
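The review discussion on this file concerns comparing output numerics between backends. As a minimal sketch of what a direct pixel-level comparison could look like (assuming both images have already been decoded to equal-shaped 2-D pixel grids; `mean_abs_error` is a hypothetical helper for illustration, not part of this PR):

```python
def mean_abs_error(img_a, img_b):
    # img_a, img_b: equal-shaped 2-D lists of pixel values
    # (e.g. one grayscale channel, or one channel of an RGB image).
    total, count = 0.0, 0
    for row_a, row_b in zip(img_a, img_b):
        for pa, pb in zip(row_a, row_b):
            total += abs(pa - pb)
            count += 1
    return total / count


# Example: two 1x2 "images" differing by 0.5 in one pixel.
err = mean_abs_error([[0.0, 1.0]], [[0.0, 0.5]])  # → 0.25
```

A per-pixel metric like this is only meaningful when the backends are expected to be near-bit-identical; as noted in review, divergent numerics across CPU/GPU make distribution-level scores (FID/CLIP) the more robust comparison.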
Review comment:
Should we integrate this with a test that outputs image numerics we can compare against? I know you saw some significantly different numerics between CPU and the various GPU backends, so a direct comparison may be difficult; maybe some FID/CLIP scores?
Reply:
Doing a faithful comparison against the diffusers reference is a larger problem; we are really best off investing in real CLIP/FID scores on a validation dataset. This diffusers reference is just a hold-over/sanity check for now, and I don't even trust it to give us a decent baseline on CPU. We could leave it out for now, but I'd rather keep it so we have something ready for comparison with diffusers on ROCm/CUDA.
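For context on the FID suggestion above: the full Fréchet Inception Distance compares Gaussian fits of InceptionV3 feature distributions, including a matrix square root of the covariance product. The sketch below is a deliberately simplified, pure-Python variant that assumes diagonal covariances and takes pre-extracted feature vectors as input (the feature-extraction step is out of scope here); it illustrates the formula, not a production metric.

```python
import math


def _mean_var(feats):
    # Per-dimension mean and population variance of a list of feature vectors.
    n = len(feats)
    dim = len(feats[0])
    means = [sum(f[d] for f in feats) / n for d in range(dim)]
    vars_ = [sum((f[d] - means[d]) ** 2 for f in feats) / n for d in range(dim)]
    return means, vars_


def diag_fid(feats_a, feats_b):
    # Fréchet distance between two Gaussians with diagonal covariance:
    #   ||mu_a - mu_b||^2 + sum_d(var_a[d] + var_b[d] - 2*sqrt(var_a[d]*var_b[d]))
    mu_a, var_a = _mean_var(feats_a)
    mu_b, var_b = _mean_var(feats_b)
    dist = sum((ma - mb) ** 2 for ma, mb in zip(mu_a, mu_b))
    dist += sum(a + b - 2.0 * math.sqrt(a * b) for a, b in zip(var_a, var_b))
    return dist


# Identical feature sets give distance 0; shifted means increase it.
same = diag_fid([[0.0, 0.0], [1.0, 1.0]], [[0.0, 0.0], [1.0, 1.0]])  # → 0.0
```

In practice one would use an established implementation (e.g. a metrics library's FID with full covariance) rather than this approximation; the point is only that such a score compares output distributions, sidestepping the pixel-level divergence between backends.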