From 929e457b174fde48914715e1464626bfaed001e1 Mon Sep 17 00:00:00 2001
From: Robert Haase <haesleinhuepf@users.noreply.github.com>
Date: Sat, 31 Jul 2021 15:56:35 +0200
Subject: [PATCH 1/6] basic multi-GPU support

works with push, execute, execute_separable_kernel, copy and gaussian_blur only so far
---
 pyclesperanto_prototype/_tier0/__init__.py    |  2 +-
 pyclesperanto_prototype/_tier0/_create.py     | 12 +++---
 pyclesperanto_prototype/_tier0/_device.py     |  8 ++++
 pyclesperanto_prototype/_tier0/_execute.py    |  1 -
 pyclesperanto_prototype/_tier0/_push.py       |  8 ++--
 pyclesperanto_prototype/_tier0/_pycl.py       |  3 +-
 pyclesperanto_prototype/_tier0/_utils.py      |  3 ++
 pyclesperanto_prototype/_tier1/_copy.py       |  8 ++--
 .../_tier1/_execute_separable_kernel.py       | 26 +++++++------
 .../_tier1/_gaussian_blur.py                  |  9 ++---
 tests/test_multi_gpu_support.py               | 38 +++++++++++++++++++
 11 files changed, 86 insertions(+), 32 deletions(-)
 create mode 100644 pyclesperanto_prototype/_tier0/_utils.py
 create mode 100644 tests/test_multi_gpu_support.py

diff --git a/pyclesperanto_prototype/_tier0/__init__.py b/pyclesperanto_prototype/_tier0/__init__.py
index df04a6fb..c074d101 100644
--- a/pyclesperanto_prototype/_tier0/__init__.py
+++ b/pyclesperanto_prototype/_tier0/__init__.py
@@ -35,7 +35,7 @@
 from ._plugin_function import plugin_function
 from ._types import Image
 from ._cl_info import cl_info
-from ._device import get_device, select_device, set_device_scoring_key
+from ._device import get_device, select_device, set_device_scoring_key, new_device, Device
 from ._cl_image import create_image, empty_image_like, empty_image
 from ._available_device_names import available_device_names
 from ._set_wait_for_kernel_finish import set_wait_for_kernel_finish
\ No newline at end of file
diff --git a/pyclesperanto_prototype/_tier0/_create.py b/pyclesperanto_prototype/_tier0/_create.py
index 01c2da40..05af7f8f 100644
--- a/pyclesperanto_prototype/_tier0/_create.py
+++ b/pyclesperanto_prototype/_tier0/_create.py
@@ -1,8 +1,8 @@
 from ._pycl import OCLArray
 import numpy as np
+from ._device import Device, get_device
 
-
-def create(dimensions, dtype=np.float32):
+def create(dimensions, dtype=np.float32, device:Device = None):
 
     """
     Convenience method for creating images on the GPU. This method basically does the same as in CLIJ:
@@ -18,18 +18,20 @@ def create(dimensions, dtype=np.float32):
         if isinstance(dimensions, OCLArray)
         else tuple(dimensions)  # reverses a list/tuple
     )
-    return OCLArray.empty(dimensions, dtype)
+    if device is None:
+        device = get_device()
+    return device.empty(dimensions, dtype)
 
 def create_zyx(dimensions):
     return create(dimensions[::-1])
 
-def create_like(*args):
+def create_like(*args, device:Device = None):
     dimensions = args[0]
     if isinstance(dimensions, OCLArray):
         dimensions = dimensions.shape
     elif isinstance(dimensions, np.ndarray):
         dimensions = dimensions.shape[::-1]
-    return create(dimensions)
+    return create(dimensions, device=device)
 
 def create_binary_like(*args):
     dimensions = args[0]
diff --git a/pyclesperanto_prototype/_tier0/_device.py b/pyclesperanto_prototype/_tier0/_device.py
index dc34ffdc..27d7c415 100644
--- a/pyclesperanto_prototype/_tier0/_device.py
+++ b/pyclesperanto_prototype/_tier0/_device.py
@@ -1,6 +1,9 @@
 import pyopencl as cl
+from pyopencl import array
 from typing import Callable, List, Optional
 from functools import lru_cache
+from ._utils import prepare
+import numpy as np
 
 # TODO: we should discuss whether this collection is actually the best thing to pass
 # around. might be better to work lower level with contexts...
@@ -25,6 +28,11 @@ def program_from_source(self, source):
         from ._program import OCLProgram
         return OCLProgram(src_str=source, dev=self)
 
+    def from_array(self, arr, *args, **kwargs):
+        return array.to_device(self.queue, prepare(arr), *args, **kwargs)
+
+    def empty(self, shape, dtype=np.float32):
+        return array.empty(self.queue, shape, dtype)
 
 def score_device(dev: cl.Device) -> float:
     score = 4e12 if dev.type == cl.device_type.GPU else 2e12
diff --git a/pyclesperanto_prototype/_tier0/_execute.py b/pyclesperanto_prototype/_tier0/_execute.py
index 0573dddc..e5ddd395 100644
--- a/pyclesperanto_prototype/_tier0/_execute.py
+++ b/pyclesperanto_prototype/_tier0/_execute.py
@@ -98,7 +98,6 @@ def execute(anchor, opencl_kernel_filename, kernel_name, global_size, parameters
     # time_stamp = time.time()
     defines = ["#define MAX_ARRAY_SIZE 1000"]
 
-
     if image_size_independent_kernel_compilation:
         defines.extend([
             "#define GET_IMAGE_WIDTH(image_key) image_size_ ## image_key ## _width",
diff --git a/pyclesperanto_prototype/_tier0/_push.py b/pyclesperanto_prototype/_tier0/_push.py
index 1847ad5a..69855a59 100644
--- a/pyclesperanto_prototype/_tier0/_push.py
+++ b/pyclesperanto_prototype/_tier0/_push.py
@@ -1,8 +1,8 @@
 import numpy as np
 from ._pycl import OCLArray
+from ._device import Device, get_device
 
-
-def push(any_array):
+def push(any_array, device : Device = None):
     """Copies an image to GPU memory and returns its handle
 
     .. deprecated:: 0.6.0
@@ -36,7 +36,9 @@ def push(any_array):
         any_array = np.asarray(any_array.get())
 
     float_arr = any_array.astype(np.float32)
-    return OCLArray.from_array(float_arr)
+    if device is None:
+        device = get_device()
+    return device.from_array(float_arr)
 
 
 def push_zyx(any_array):
diff --git a/pyclesperanto_prototype/_tier0/_pycl.py b/pyclesperanto_prototype/_tier0/_pycl.py
index f7f9ef09..d47e1dc9 100644
--- a/pyclesperanto_prototype/_tier0/_pycl.py
+++ b/pyclesperanto_prototype/_tier0/_pycl.py
@@ -6,6 +6,7 @@
 from pyopencl import characterize
 from pyopencl import array
 from ._device import get_device
+from ._utils import prepare
 
 """ Below here, vendored from GPUtools
 Copyright (c) 2016, Martin Weigert
@@ -105,8 +106,6 @@ def _wrap_OCLArray(cls):
     WRAPPER
     """
 
-    def prepare(arr):
-        return np.require(arr, None, "C")
 
     @classmethod
     def from_array(cls, arr, *args, **kwargs):
diff --git a/pyclesperanto_prototype/_tier0/_utils.py b/pyclesperanto_prototype/_tier0/_utils.py
new file mode 100644
index 00000000..56a7d2c9
--- /dev/null
+++ b/pyclesperanto_prototype/_tier0/_utils.py
@@ -0,0 +1,3 @@
+import numpy as np
+def prepare(arr):
+    return np.require(arr, None, "C")
diff --git a/pyclesperanto_prototype/_tier1/_copy.py b/pyclesperanto_prototype/_tier1/_copy.py
index 6dfcaf16..72094be6 100644
--- a/pyclesperanto_prototype/_tier1/_copy.py
+++ b/pyclesperanto_prototype/_tier1/_copy.py
@@ -1,9 +1,9 @@
 from .._tier0 import execute
 from .._tier0 import plugin_function
-from .._tier0 import Image
+from .._tier0 import Image, Device
 
 @plugin_function
-def copy(source : Image, destination : Image = None):
+def copy(source : Image, destination : Image = None, device: Device = None):
     """Copies an image.
     
     <pre>f(x) = x</pre> 
@@ -12,6 +12,8 @@ def copy(source : Image, destination : Image = None):
     ----------
     source : Image
     destination : Image
+    device : Device, optional
+        OpenCL-device to operate on
     
     Returns
     -------
@@ -33,5 +35,5 @@ def copy(source : Image, destination : Image = None):
         "src":source
     }
 
-    execute(__file__, '../clij-opencl-kernels/kernels/copy_' + str(len(destination.shape)) + 'd_x.cl', 'copy_' + str(len(destination.shape)) + 'd', destination.shape, parameters)
+    execute(__file__, '../clij-opencl-kernels/kernels/copy_' + str(len(destination.shape)) + 'd_x.cl', 'copy_' + str(len(destination.shape)) + 'd', destination.shape, parameters, device=device)
     return destination
diff --git a/pyclesperanto_prototype/_tier1/_execute_separable_kernel.py b/pyclesperanto_prototype/_tier1/_execute_separable_kernel.py
index 17d6cdb8..d28a313b 100644
--- a/pyclesperanto_prototype/_tier1/_execute_separable_kernel.py
+++ b/pyclesperanto_prototype/_tier1/_execute_separable_kernel.py
@@ -1,19 +1,21 @@
 from pyclesperanto_prototype._tier0._create import create_like
 from pyclesperanto_prototype._tier1 import copy
 from pyclesperanto_prototype._tier0._execute import execute
+from pyclesperanto_prototype._tier0._device import Device
+
 from ._set import set
 
-def execute_separable_kernel(src, dst, anchor, opencl_kernel_filename, kernel_name, kernel_size_x, kernel_size_y, kernel_size_z, sigma_x, sigma_y, sigma_z, dimensions) :
+def execute_separable_kernel(src, dst, anchor, opencl_kernel_filename, kernel_name, kernel_size_x, kernel_size_y, kernel_size_z, sigma_x, sigma_y, sigma_z, dimensions, device : Device = None) :
 
     n = [kernel_size_x, kernel_size_y, kernel_size_z]
     sigma = [sigma_x, sigma_y, sigma_z]
 
     # todo: ensure that temp1 and temp2 become of type float
-    temp1 = create_like(src);
-    temp2 = create_like(src);
+    temp1 = create_like(src, device=device)
+    temp2 = create_like(src, device=device)
 
     if (sigma[0] > 0) :
-        param_src = src;
+        param_src = src
         if (dimensions == 2):
             param_dst = temp1
         else :
@@ -27,13 +29,13 @@ def execute_separable_kernel(src, dst, anchor, opencl_kernel_filename, kernel_na
             "s": float(sigma[0])
         }
 
-        execute(anchor, opencl_kernel_filename, kernel_name, src.shape, parameters)
+        execute(anchor, opencl_kernel_filename, kernel_name, src.shape, parameters, device=device)
 
     else :
         if (dimensions == 2):
-            copy(src, temp1)
+            copy(src, temp1, device=device)
         else :
-            copy(src, temp2)
+            copy(src, temp2, device=device)
 
     if (sigma[1] > 0) :
         if (dimensions == 2):
@@ -51,12 +53,12 @@ def execute_separable_kernel(src, dst, anchor, opencl_kernel_filename, kernel_na
             "s": float(sigma[1])
         }
 
-        execute(anchor, opencl_kernel_filename, kernel_name, src.shape, parameters)
+        execute(anchor, opencl_kernel_filename, kernel_name, src.shape, parameters, device=device)
     else :
         if (dimensions == 2):
-            copy(temp1, dst)
+            copy(temp1, dst, device=device)
         else :
-            copy(temp2, temp1)
+            copy(temp2, temp1, device=device)
 
     if (dimensions == 3):
         if (sigma[2] > 0):
@@ -68,8 +70,8 @@ def execute_separable_kernel(src, dst, anchor, opencl_kernel_filename, kernel_na
                 "N": int(n[2]),
                 "s": float(sigma[2])
             }
-            execute(anchor, opencl_kernel_filename, kernel_name, src.shape, parameters)
+            execute(anchor, opencl_kernel_filename, kernel_name, src.shape, parameters, device=device)
         else:
-            copy(temp1, dst)
+            copy(temp1, dst, device=device)
 
     return dst
\ No newline at end of file
diff --git a/pyclesperanto_prototype/_tier1/_gaussian_blur.py b/pyclesperanto_prototype/_tier1/_gaussian_blur.py
index 50b4156f..6cfebede 100644
--- a/pyclesperanto_prototype/_tier1/_gaussian_blur.py
+++ b/pyclesperanto_prototype/_tier1/_gaussian_blur.py
@@ -1,10 +1,10 @@
 from .._tier0 import sigma_to_kernel_size
 from .._tier0 import plugin_function
-from .._tier0 import Image
+from .._tier0 import Image, Device
 from ._execute_separable_kernel import execute_separable_kernel
 
 @plugin_function(categories=['filter', 'denoise', 'in assistant'], priority=1)
-def gaussian_blur(source : Image, destination : Image = None, sigma_x : float = 0, sigma_y : float = 0, sigma_z : float = 0):
+def gaussian_blur(source : Image, destination : Image = None, sigma_x : float = 0, sigma_y : float = 0, sigma_z : float = 0, device : Device = None):
     """Computes the Gaussian blurred image of an image given sigma values
     in X, Y and Z. 
     
@@ -34,8 +34,6 @@ def gaussian_blur(source : Image, destination : Image = None, sigma_x : float =
     ----------
     .. [1] https://clij.github.io/clij2-docs/reference_gaussianBlur3D
     """
-
-
     kernel_size_x = sigma_to_kernel_size(sigma_x)
     kernel_size_y = sigma_to_kernel_size(sigma_y)
     kernel_size_z = sigma_to_kernel_size(sigma_z)
@@ -52,7 +50,8 @@ def gaussian_blur(source : Image, destination : Image = None, sigma_x : float =
         sigma_x,
         sigma_y,
         sigma_z,
-        len(destination.shape)
+        len(destination.shape),
+        device=device
     )
 
     return destination
diff --git a/tests/test_multi_gpu_support.py b/tests/test_multi_gpu_support.py
new file mode 100644
index 00000000..4d302da8
--- /dev/null
+++ b/tests/test_multi_gpu_support.py
@@ -0,0 +1,38 @@
+import numpy as np
+import pyclesperanto_prototype as cle
+def test_single_gpu_support():
+    dev1 = cle.new_device("RTX")
+
+    print(dev1)
+
+    image = np.random.random((2048,2048,10))
+    gpu_input1 = cle.push(image, device=dev1)
+
+    gpu_output1 = cle.create_like(gpu_input1, device=dev1)
+
+    cle.gaussian_blur(gpu_input1, gpu_output1, sigma_x=1, sigma_y=1, device=dev1)
+
+    output1 = cle.pull(gpu_output1)
+    assert not np.allclose(output1, image)
+
+def test_multi_gpu_support():
+    dev1 = cle.new_device("RTX")
+    dev2 = cle.new_device("gfx")
+
+    print(dev1)
+    print(dev2)
+
+    image = np.random.random((2048,2048,10))
+    gpu_input1 = cle.push(image, device=dev1)
+    gpu_input2 = cle.push(image, device=dev2)
+
+    gpu_output1 = cle.create_like(gpu_input1, device=dev1)
+    gpu_output2 = cle.create_like(gpu_input2, device=dev2)
+
+    cle.gaussian_blur(gpu_input1, gpu_output1, sigma_x=1, sigma_y=1, device=dev1)
+    cle.gaussian_blur(gpu_input2, gpu_output2, sigma_x=1, sigma_y=1, device=dev2)
+
+    output1 = cle.pull(gpu_output1)
+    output2 = cle.pull(gpu_output2)
+
+    assert(np.allclose(output1, output2))

From 358f84762965c78edc584da82df6d83524610dc3 Mon Sep 17 00:00:00 2001
From: Robert Haase <haesleinhuepf@users.noreply.github.com>
Date: Sat, 31 Jul 2021 16:16:22 +0200
Subject: [PATCH 2/6] output_creators accept a device now

---
 pyclesperanto_prototype/_tier0/_create.py     | 84 +++++++++----------
 pyclesperanto_prototype/_tier0/_device.py     |  1 -
 .../_tier0/_plugin_function.py                | 14 +++-
 tests/test_multi_gpu_support.py               | 26 +++++-
 4 files changed, 78 insertions(+), 47 deletions(-)

diff --git a/pyclesperanto_prototype/_tier0/_create.py b/pyclesperanto_prototype/_tier0/_create.py
index 05af7f8f..24ed7f38 100644
--- a/pyclesperanto_prototype/_tier0/_create.py
+++ b/pyclesperanto_prototype/_tier0/_create.py
@@ -22,8 +22,8 @@ def create(dimensions, dtype=np.float32, device:Device = None):
         device = get_device()
     return device.empty(dimensions, dtype)
 
-def create_zyx(dimensions):
-    return create(dimensions[::-1])
+def create_zyx(dimensions, device:Device = None):
+    return create(dimensions[::-1], device=device)
 
 def create_like(*args, device:Device = None):
     dimensions = args[0]
@@ -33,42 +33,42 @@ def create_like(*args, device:Device = None):
         dimensions = dimensions.shape[::-1]
     return create(dimensions, device=device)
 
-def create_binary_like(*args):
+def create_binary_like(*args, device:Device = None):
     dimensions = args[0]
     if isinstance(dimensions, OCLArray):
         dimensions = dimensions.shape
     elif isinstance(dimensions, np.ndarray):
         dimensions = dimensions.shape[::-1]
-    return create(dimensions, np.uint8)
+    return create(dimensions, np.uint8, device=device)
 
-def create_labels_like(*args):
+def create_labels_like(*args, device:Device = None):
     dimensions = args[0]
     if isinstance(dimensions, OCLArray):
         dimensions = dimensions.shape
     elif isinstance(dimensions, np.ndarray):
         dimensions = dimensions.shape[::-1]
-    return create(dimensions, np.uint32)
+    return create(dimensions, np.uint32, device=device)
 
-def create_pointlist_from_labelmap(input:OCLArray, *args):
+def create_pointlist_from_labelmap(input:OCLArray, *args, device:Device = None):
     from .._tier2 import maximum_of_all_pixels
     number_of_labels = int(maximum_of_all_pixels(input))
     number_of_dimensions = len(input.shape)
     
-    return create([number_of_dimensions, number_of_labels])
+    return create([number_of_dimensions, number_of_labels], device=device)
 
-def create_vector_from_labelmap(input: OCLArray, *args):
+def create_vector_from_labelmap(input: OCLArray, *args, device:Device = None):
     from .._tier2 import maximum_of_all_pixels
     number_of_labels = int(maximum_of_all_pixels(input)) + 1
 
-    return create([1, number_of_labels])
+    return create([1, number_of_labels], device=device)
 
-def create_matrix_from_pointlists(pointlist1:OCLArray, pointlist2:OCLArray):
+def create_matrix_from_pointlists(pointlist1:OCLArray, pointlist2:OCLArray, device:Device = None):
     width = pointlist1.shape[1] + 1
     height = pointlist2.shape[1] + 1
 
-    return create([width, height])
+    return create([width, height], device=device)
 
-def create_from_pointlist(pointlist: OCLArray, *args):
+def create_from_pointlist(pointlist: OCLArray, *args, device:Device = None):
     from .._tier1 import maximum_x_projection
     from .._tier0 import pull
 
@@ -76,73 +76,73 @@ def create_from_pointlist(pointlist: OCLArray, *args):
     max_pos = max_pos[0]
 
     if len(max_pos) == 3:  # 3D image requested
-        destination = create([max_pos[2] + 1, max_pos[1] + 1, max_pos[0] + 1])
+        destination = create([max_pos[2] + 1, max_pos[1] + 1, max_pos[0] + 1], device=device)
     elif len(max_pos) == 2:  # 2D image requested
-        destination = create([max_pos[1] + 1, max_pos[0] + 1])
+        destination = create([max_pos[1] + 1, max_pos[0] + 1], device=device)
     else:
         raise Exception("Size not supported: " + str(max_pos))
     return destination
 
-def create_square_matrix_from_pointlist(pointlist1:OCLArray):
+def create_square_matrix_from_pointlist(pointlist1:OCLArray, device:Device = None):
     width = pointlist1.shape[1] + 1
 
-    return create([width, width])
+    return create([width, width], device=device)
 
 
-def create_square_matrix_from_labelmap(labelmap: OCLArray):
+def create_square_matrix_from_labelmap(labelmap: OCLArray, device:Device = None):
     from .._tier2 import maximum_of_all_pixels
     width = int(maximum_of_all_pixels(labelmap) + 1)
 
-    return create([width, width])
+    return create([width, width], device=device)
 
 
-def create_square_matrix_from_two_labelmaps(labelmap1: OCLArray, labelmap2: OCLArray):
+def create_square_matrix_from_two_labelmaps(labelmap1: OCLArray, labelmap2: OCLArray, device:Device = None):
     from .._tier2 import maximum_of_all_pixels
     width = int(maximum_of_all_pixels(labelmap1) + 1)
     height = int(maximum_of_all_pixels(labelmap2) + 1)
 
-    return create([height, width])
+    return create([height, width],device=device)
 
 
-def create_vector_from_square_matrix(square_matrix : OCLArray, *args):
-    return create([1, square_matrix.shape[0]])
+def create_vector_from_square_matrix(square_matrix : OCLArray, *args, device:Device = None):
+    return create([1, square_matrix.shape[0]], device=device)
 
 
-def create_2d_xy(input):
+def create_2d_xy(input, device:Device = None):
     if len(input.shape) == 3:
-        return create([input.shape[2], input.shape[1]])
+        return create([input.shape[2], input.shape[1]], device=device)
     else:
-        return create([input.shape[1], input.shape[0]])
+        return create([input.shape[1], input.shape[0]], device=device)
 
-def create_2d_yx(input):
+def create_2d_yx(input, device:Device = None):
     if len(input.shape) == 3:
-        return create([input.shape[1], input.shape[2]])
+        return create([input.shape[1], input.shape[2]], device=device)
     else:
-        return create([input.shape[0], 1])
+        return create([input.shape[0], 1], device=device)
 
-def create_2d_zy(input):
+def create_2d_zy(input, device:Device = None):
     if len(input.shape) == 3:
-        return create([input.shape[0], input.shape[1]])
+        return create([input.shape[0], input.shape[1]], device=device)
     else:
-        return create([1, input.shape[0]])
+        return create([1, input.shape[0]], device=device)
 
-def create_2d_yz(input):
+def create_2d_yz(input, device:Device = None):
     if len(input.shape) == 3:
-        return create([input.shape[1], input.shape[0]])
+        return create([input.shape[1], input.shape[0]], device=device)
     else:
-        return create([input.shape[0], 1])
+        return create([input.shape[0], 1], device=device)
 
-def create_2d_zx(input):
+def create_2d_zx(input, device:Device = None):
     if len(input.shape) == 3:
-        return create([input.shape[0], input.shape[2]])
+        return create([input.shape[0], input.shape[2]], device=device)
     else:
-        return create([1, input.shape[1]])
+        return create([1, input.shape[1]], device=device)
 
-def create_2d_xz(input):
+def create_2d_xz(input, device:Device = None):
     if len(input.shape) == 3:
-        return create([input.shape[2], input.shape[0]])
+        return create([input.shape[2], input.shape[0]], device=device)
     else:
-        return create([input.shape[1], 1])
+        return create([input.shape[1], 1], device=device)
 
-def create_none(*args):
+def create_none(*args, device:Device = None):
     return None
\ No newline at end of file
diff --git a/pyclesperanto_prototype/_tier0/_device.py b/pyclesperanto_prototype/_tier0/_device.py
index 27d7c415..167317f8 100644
--- a/pyclesperanto_prototype/_tier0/_device.py
+++ b/pyclesperanto_prototype/_tier0/_device.py
@@ -118,4 +118,3 @@ def set_device_scoring_key(func: Callable[[cl.Device], int]) -> None:
     except Exception as e:
         raise ValueError(f"Scoring algorithm invalid: {e}")
     _current_device.score_key = func
-
diff --git a/pyclesperanto_prototype/_tier0/_plugin_function.py b/pyclesperanto_prototype/_tier0/_plugin_function.py
index 921c0b77..84cf2e24 100644
--- a/pyclesperanto_prototype/_tier0/_plugin_function.py
+++ b/pyclesperanto_prototype/_tier0/_plugin_function.py
@@ -7,6 +7,7 @@
 from ._create import create_like
 from ._types import Image, is_image
 from ._push import push
+from ._device import get_device
 
 
 @curry
@@ -48,6 +49,8 @@ def plugin_function(
     function.categories = categories
     function.priority = priority
 
+
+
     @wraps(function)
     def worker_function(*args, **kwargs):
         sig = inspect.signature(function)
@@ -59,13 +62,20 @@ def worker_function(*args, **kwargs):
         # https://docs.python.org/3/library/inspect.html#inspect.BoundArguments.apply_defaults
         bound.apply_defaults()
 
+        # determine on which GPU the operation should be executed and
+        # potentially, output images should be created on
+        if 'device' in kwargs.keys():
+            device = kwargs['device']
+        else:
+            device = get_device()
+
         # copy images to GPU, and create output array if necessary
         for key, value in bound.arguments.items():
             if is_image(value):
                 bound.arguments[key] = push(value)
             if key in sig.parameters and sig.parameters[key].annotation is Image and value is None:
-                sig = inspect.signature(output_creator)
-                bound.arguments[key] = output_creator(*bound.args[:len(sig.parameters)])
+                sig = inspect.signature(output_creator)            # -1 because we add device by hand
+                bound.arguments[key] = output_creator(*bound.args[:len(sig.parameters) - 1], device=device)
 
         # call the decorated function
         return function(*bound.args, **bound.kwargs)
diff --git a/tests/test_multi_gpu_support.py b/tests/test_multi_gpu_support.py
index 4d302da8..d0d1ed40 100644
--- a/tests/test_multi_gpu_support.py
+++ b/tests/test_multi_gpu_support.py
@@ -15,6 +15,7 @@ def test_single_gpu_support():
     output1 = cle.pull(gpu_output1)
     assert not np.allclose(output1, image)
 
+
 def test_multi_gpu_support():
     dev1 = cle.new_device("RTX")
     dev2 = cle.new_device("gfx")
@@ -22,7 +23,7 @@ def test_multi_gpu_support():
     print(dev1)
     print(dev2)
 
-    image = np.random.random((2048,2048,10))
+    image = np.random.random((2048, 2048, 10))
     gpu_input1 = cle.push(image, device=dev1)
     gpu_input2 = cle.push(image, device=dev2)
 
@@ -35,4 +36,25 @@ def test_multi_gpu_support():
     output1 = cle.pull(gpu_output1)
     output2 = cle.pull(gpu_output2)
 
-    assert(np.allclose(output1, output2))
+    assert (np.allclose(output1, output2))
+
+
+def test_multi_gpu_support_output_creators():
+    dev1 = cle.new_device("RTX")
+    dev2 = cle.new_device("gfx")
+
+    print(dev1)
+    print(dev2)
+
+    image = np.random.random((2048, 2048, 10))
+    gpu_input1 = cle.push(image, device=dev1)
+    gpu_input2 = cle.push(image, device=dev2)
+
+    gpu_output1 = cle.gaussian_blur(gpu_input1, sigma_x=1, sigma_y=1, device=dev1)
+    gpu_output2 = cle.gaussian_blur(gpu_input2, sigma_x=1, sigma_y=1, device=dev2)
+
+    output1 = cle.pull(gpu_output1)
+    output2 = cle.pull(gpu_output2)
+
+    assert (np.allclose(output1, output2))
+

From 5296717f9db2c454b5097cac0b3c4ce4c440da35 Mon Sep 17 00:00:00 2001
From: Robert Haase <haesleinhuepf@users.noreply.github.com>
Date: Sat, 31 Jul 2021 16:46:02 +0200
Subject: [PATCH 3/6] push must also send data to the same device by  default

---
 pyclesperanto_prototype/_tier0/_plugin_function.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyclesperanto_prototype/_tier0/_plugin_function.py b/pyclesperanto_prototype/_tier0/_plugin_function.py
index 84cf2e24..c2d21ea2 100644
--- a/pyclesperanto_prototype/_tier0/_plugin_function.py
+++ b/pyclesperanto_prototype/_tier0/_plugin_function.py
@@ -72,7 +72,7 @@ def worker_function(*args, **kwargs):
         # copy images to GPU, and create output array if necessary
         for key, value in bound.arguments.items():
             if is_image(value):
-                bound.arguments[key] = push(value)
+                bound.arguments[key] = push(value, device=device)
             if key in sig.parameters and sig.parameters[key].annotation is Image and value is None:
                 sig = inspect.signature(output_creator)            # -1 because we add device by hand
                 bound.arguments[key] = output_creator(*bound.args[:len(sig.parameters) - 1], device=device)

From 1fd4e7bdf52bad2bf192cf9533c7761741c15941 Mon Sep 17 00:00:00 2001
From: Robert Haase <haesleinhuepf@users.noreply.github.com>
Date: Sat, 31 Jul 2021 18:00:48 +0200
Subject: [PATCH 4/6] multi-threading technically works; not sure about
 performance

---
 tests/test_multi_threading.py | 48 +++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)
 create mode 100644 tests/test_multi_threading.py

diff --git a/tests/test_multi_threading.py b/tests/test_multi_threading.py
new file mode 100644
index 00000000..b022f40d
--- /dev/null
+++ b/tests/test_multi_threading.py
@@ -0,0 +1,48 @@
+import pyclesperanto_prototype as cle
+import numpy as np
+import time
+
+def test_multi_gpu_threading():
+    import threading
+    import time
+
+    class myThread(threading.Thread):
+        def __init__(self, threadID, name, counter, image):
+            threading.Thread.__init__(self)
+            self.threadID = threadID
+            self.name = name
+            self.counter = counter
+            self.device = cle.new_device(name)
+            self.image = cle.push(image, device=self.device)
+
+        def run(self):
+            print("Starting " + self.name)
+            self.print_time()
+            print("Exiting " + self.name)
+
+        def print_time(self):
+            while self.counter:
+                self.image = cle.gaussian_blur(self.image, sigma_x=15, sigma_y=15, sigma_z=15, device=self.device)
+                time.sleep(0.1)
+                print("%s: %s gaussian blur" % (self.name, time.ctime(time.time())))
+                self.counter -= 1
+
+    image = np.random.random((100, 100, 10))
+
+    # Create new threads
+    thread1 = myThread(1, "RTX", 15, image) # RTX gpu
+    thread2 = myThread(2, "gfx", 12, image) # AMD gpu
+    thread3 = myThread(3, "Intel", 11, image) # Intel gpu
+
+    # Start new Threads
+    thread1.start()
+    thread2.start()
+    thread3.start()
+
+    print("Exiting Main Thread")
+    thread1.join()
+    thread2.join()
+    thread3.join()
+
+    assert False
+

From 2ce44985a6adc711f0bba5a78880fb0f49758f79 Mon Sep 17 00:00:00 2001
From: Robert Haase <haesleinhuepf@users.noreply.github.com>
Date: Sat, 31 Jul 2021 18:43:36 +0200
Subject: [PATCH 5/6] demo showing 2x speedup with parallelization on a RTX
 3050 Ti mobile

---
 tests/test_multi_threading.py | 61 ++++++++++++++++++++++++++++++++++-
 1 file changed, 60 insertions(+), 1 deletion(-)

diff --git a/tests/test_multi_threading.py b/tests/test_multi_threading.py
index b022f40d..2a7572d7 100644
--- a/tests/test_multi_threading.py
+++ b/tests/test_multi_threading.py
@@ -2,6 +2,8 @@
 import numpy as np
 import time
 
+
+
 def test_multi_gpu_threading():
     import threading
     import time
@@ -39,10 +41,67 @@ def print_time(self):
     thread2.start()
     thread3.start()
 
-    print("Exiting Main Thread")
     thread1.join()
     thread2.join()
     thread3.join()
 
+    print("Exiting Main Thread")
+
+def test_slinge_gpu_multi_threading():
+    import threading
+    import time
+
+    class myThread(threading.Thread):
+        def __init__(self, threadID, name, counter, image):
+            threading.Thread.__init__(self)
+            self.threadID = threadID
+            self.name = "T" + str(threadID)
+            self.counter = counter
+            self.device = cle.new_device(name)
+            self.image = cle.push(image, device=self.device)
+
+        def run(self):
+            #print("Starting " + self.name)
+            self.print_time()
+            #print("Exiting " + self.name)
+
+        def print_time(self):
+            while self.counter:
+                self.image = cle.gaussian_blur(self.image, sigma_x=25, sigma_y=25, sigma_z=25, device=self.device)
+                time.sleep(0.1) # this is important; other threads can use this time better
+                #print("%s: %s gaussian blur" % (self.name, time.ctime(time.time())))
+                self.counter -= 1
+
+    image = np.random.random((100, 100, 10))
+
+    num_tasks = 10
+    gpu_name = "RTX"
+
+    start_time = time.time()
+    # Create new threads
+    threads = [myThread(i + 1, gpu_name, 15, image) for i in range(num_tasks)]
+
+    # Start new Threads
+    for thread in threads:
+        thread.start()
+
+    # wait for finish
+    for thread in threads:
+        thread.join()
+    print("Parallel all done after ", time.time() - start_time, "s")
+
+    # Create new threads
+    start_time = time.time()
+    threads = [myThread(i + 1, gpu_name, 15, image) for i in range(num_tasks)]
+
+    for thread in threads:
+        # Start new Threads
+        thread.start()
+        # wait for finish
+        thread.join()
+
+    print("Sequential all done after ", time.time() - start_time, "s")
+
+
     assert False
 

From 9450673a2a42838069ca7af8328c972ce597213e Mon Sep 17 00:00:00 2001
From: Robert Haase <haesleinhuepf@users.noreply.github.com>
Date: Sat, 31 Jul 2021 18:53:19 +0200
Subject: [PATCH 6/6] make test pass

---
 tests/test_multi_threading.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_multi_threading.py b/tests/test_multi_threading.py
index 2a7572d7..d4206908 100644
--- a/tests/test_multi_threading.py
+++ b/tests/test_multi_threading.py
@@ -103,5 +103,5 @@ def print_time(self):
     print("Sequential all done after ", time.time() - start_time, "s")
 
 
-    assert False
+    #assert False