Merge pull request #292 from Blosc/lazy-guess

Reductions to be preserved in persistent lazy expressions
Blosc · Oct 19, 2024 · 93f9d77 · 93f9d77
2 parents df10d36 + bf15e21
commit 93f9d77
Show file tree

Hide file tree

Showing 36 changed files with 467 additions and 244 deletions.
diff --git a/.github/workflows/cibuildwheels.yml b/.github/workflows/cibuildwheels.yml
@@ -135,7 +135,8 @@ jobs:
           cd ./dist
           tar -xzf blosc2-*.tar.gz
           cd ./blosc2-*/
-          pip install -e .[test] --break-system-packages
+          pip install pip --upgrade
+          pip install --break-system-packages -e .[test]
 
       - name: Test sdist package with pytest
         run: |

diff --git a/README.rst b/README.rst
@@ -106,7 +106,7 @@ Here it is a simple example:
     expr = ((a ** 3 + blosc2.sin(c * 2)) < b) & (c > 0)
 
     # Evaluate and get a NDArray as result
-    out = expr.eval()
+    out = expr.compute()
     print(out.info)
 
 As you can see, the `NDArray` instances are very similar to NumPy arrays, but behind the scenes,
@@ -123,7 +123,7 @@ you can achieve when the operands fit comfortably in memory:
 In this case, the performance is somewhat below that of top-tier libraries like Numexpr or Numba,
 but it is still quite good. Using CPUs with more cores than the M2 could further reduce the
 performance gap. One important point to note is that the memory consumption when
-using the `LazyArray.eval()` method is very low because the output is an `NDArray` object, which
+using the `LazyArray.compute()` method is very low because the output is an `NDArray` object, which
 is compressed and stored in memory by default.  On the other hand, the `LazyArray.__getitem__()`
 method returns an actual NumPy array, so it is not recommended for large datasets, as it can consume
 a significant amount of memory (though it may still be convenient for small outputs).

diff --git a/bench/io.py b/bench/io.py
@@ -13,7 +13,6 @@
 
 import blosc2
 
-
 CUBE_SIDE = 128
 
 class MmapBenchmarking:

diff --git a/bench/lazyarray-expr-small-dask.ipynb b/bench/lazyarray-expr-small-dask.ipynb
@@ -143,7 +143,7 @@
    "source": [
     "%%mprof_run 1.lazyexpr::eval-LZ4-1\n",
     "# Evaluate and get a NDArray as result\n",
-    "out = expr.eval()"
+    "out = expr.compute()"
    ]
   },
   {
@@ -452,7 +452,7 @@
    "source": [
     "%%mprof_run 2.lazyexpr::eval-nocompr\n",
     "# Evaluate and get a NDArray as result\n",
-    "out4 = expr.eval()"
+    "out4 = expr.compute()"
    ]
   },
   {

diff --git a/bench/ndarray/broadcast_expr.py b/bench/ndarray/broadcast_expr.py
@@ -36,8 +36,8 @@
     c = a * b
     # print(f"Elapsed time (expr): {time() - t0:.6f} s")
     t0 = time()
-    # d = c.eval(cparams=dict(codec=codec, clevel=5), chunks=(chunks, chunks), blocks=(blocks, blocks))
-    d = c.eval(cparams=dict(codec=codec, clevel=5))
+    # d = c.compute(cparams=dict(codec=codec, clevel=5), chunks=(chunks, chunks), blocks=(blocks, blocks))
+    d = c.compute(cparams=dict(codec=codec, clevel=5))
     print(f"Elapsed time (eval): {time() - t0:.6f} s")
     # print(d[:])
     print(f"cratio: {d.schunk.cratio:.2f}x")

diff --git a/bench/ndarray/eval_expr_numba.py b/bench/ndarray/eval_expr_numba.py
@@ -18,7 +18,6 @@
 
 import blosc2
 
-
 shape = (5000, 10_000)
 chunks = [500, 10_000]
 blocks = [4, 10_000]
@@ -112,7 +111,7 @@ def udf_numba(inputs, output, offset):
     b2expr = expr.replace("sin", "blosc2.sin").replace("cos", "blosc2.cos")
     c = eval(b2expr, b2vardict)
     t0 = time()
-    d = c.eval()
+    d = c.compute()
     print("LazyExpr+eval took %.3f s" % (time() - t0))
     # Check
     np.testing.assert_allclose(d[:], npres, rtol=rtol, atol=atol)
@@ -139,7 +138,7 @@ def udf_numba(inputs, output, offset):
     # actual benchmark
     # compute() uses the udf function as a prefilter
     t0 = time()
-    res = expr_.eval()
+    res = expr_.compute()
     print("LazyUDF+eval took %.3f s" % (time() - t0))
     np.testing.assert_allclose(res[...], npres, rtol=rtol, atol=atol)
     # getitem uses the same compiled function but as a postfilter
@@ -152,7 +151,7 @@ def udf_numba(inputs, output, offset):
                            chunks=chunks, blocks=blocks, cparams=cparams)
     # getitem but using chunked evaluation
     t0 = time()
-    res = expr_.eval()
+    res = expr_.compute()
     print("LazyUDF+chunked_eval took %.3f s" % (time() - t0))
     np.testing.assert_allclose(res[...], npres, rtol=rtol, atol=atol)
     t0 = time()

diff --git a/bench/ndarray/eval_fields.py b/bench/ndarray/eval_fields.py
@@ -6,12 +6,12 @@
 # LICENSE file in the root directory of this source tree)
 #######################################################################
 
-import numpy as np
-
-import blosc2
 from time import time
 
 import numexpr as ne
+import numpy as np
+
+import blosc2
 
 shape = (4_000, 5_000)
 chunks = (10, 5_000)
@@ -49,7 +49,7 @@
 c = a**2 + b**2 > 2 * a * b + 1
 # Evaluate: output is a NDArray
 t0 = time()
-d = c.eval(cparams=cparams)
+d = c.compute(cparams=cparams)
 t = time() - t0
 print(f"Time to evaluate field expression (eval): {t:.3f} s; {nps.nbytes/2**30/t:.2f} GB/s")
 

diff --git a/bench/ndarray/eval_where.py b/bench/ndarray/eval_where.py
@@ -6,12 +6,12 @@
 # LICENSE file in the root directory of this source tree)
 #######################################################################
 
-import numpy as np
-
-import blosc2
 from time import time
 
 import numexpr as ne
+import numpy as np
+
+import blosc2
 
 shape = (4_000, 5_000)
 chunks = (10, 5_000)
@@ -49,7 +49,7 @@
 # Evaluate: output is a NDArray
 t0 = time()
 c = a**2 + b**2 > 2 * a * b + 1
-d = c.where(0, 1).eval(cparams=cparams)
+d = c.where(0, 1).compute(cparams=cparams)
 t = time() - t0
 print(f"Time to evaluate where expression (eval): {t:.3f} s; {nps.nbytes/2**30/t:.3f} GB/s")
 
@@ -69,13 +69,13 @@
 
 # Evaluate and get row values: output is a NDArray
 t0 = time()
-npd = s[a**2 + b**2 > 2 * a * b + 1].eval(cparams=cparams)
+npd = s[a**2 + b**2 > 2 * a * b + 1].compute(cparams=cparams)
 t = time() - t0
 print(f"Time to get row values (eval): {t:.3f} s; {nps.nbytes/2**30/t:.3f} GB/s")
 
 # Evaluate and get row values: output is a NDArray
 t0 = time()
-npd = s['a**2 + b**2 > 2 * a * b + 1'].eval(cparams=cparams)
+npd = s['a**2 + b**2 > 2 * a * b + 1'].compute(cparams=cparams)
 t = time() - t0
 print(f"Time to get row values (eval, string): {t:.3f} s; {nps.nbytes/2**30/t:.3f} GB/s")
 

diff --git a/bench/ndarray/lazyarray-expr.ipynb b/bench/ndarray/lazyarray-expr.ipynb
@@ -181,7 +181,7 @@
    "source": [
     "%%mprof_run 0.lazyexpr::mmap-warmup\n",
     "# Warm memory-map cache\n",
-    "out1 = expr.eval()"
+    "out1 = expr.compute()"
    ]
   },
   {
@@ -206,7 +206,7 @@
    "source": [
     "%%mprof_run 1.lazyexpr::eval\n",
     "# Evaluate and get a NDArray as result\n",
-    "out1 = expr.eval()"
+    "out1 = expr.compute()"
    ]
   },
   {
@@ -325,7 +325,7 @@
    "outputs": [],
    "source": [
     "# Warm numba jit and compile\n",
-    "out1 = lzyudf.eval()"
+    "out1 = lzyudf.compute()"
    ]
   },
   {
@@ -341,7 +341,7 @@
    "outputs": [],
    "source": [
     "#%%mprof_run 2.lazyudf::eval\n",
-    "#out2 = lzyudf.eval()"
+    "#out2 = lzyudf.compute()"
    ]
   },
   {
@@ -451,7 +451,7 @@
    "source": [
     "%%mprof_run 6.lazyexpr::eval-second-time\n",
     "# Evaluate and get a NDArray as result\n",
-    "out6 = expr.eval()"
+    "out6 = expr.compute()"
    ]
   },
   {
@@ -476,7 +476,7 @@
    "source": [
     "%%mprof_run 6.lazyexpr::eval-second-time\n",
     "# Evaluate and get a NDArray as result\n",
-    "out6 = expr.eval()"
+    "out6 = expr.compute()"
    ]
   },
   {

diff --git a/bench/ndarray/reduce_expr.py b/bench/ndarray/reduce_expr.py
@@ -17,7 +17,6 @@
 
 import blosc2
 
-
 shape = (50, 100, 10_000)
 chunks = [5, 100, 10_000]
 blocks = [4, 10, 1_000]
@@ -63,7 +62,7 @@
     b2expr = expr.replace("sin", "blosc2.sin").replace("cos", "blosc2.cos")
     c = eval(b2expr, b2vardict)
     t0 = time()
-    d = c.eval()
+    d = c.compute()
     d = d.sum(axis=axis)  #, dtype=npres.dtype)
     print("LazyExpr+eval took %.3f s" % (time() - t0))
     # Check

diff --git a/bench/pack_tensor.py b/bench/pack_tensor.py
@@ -11,7 +11,6 @@
 Packaging tensors (PyTorch, TensorFlow) larger than 2 GB.
 """
 
-import io
 import sys
 import time
 
@@ -20,6 +19,7 @@
 import torch
 
 import blosc2
+import io
 
 NREP = 1
 # N = int(5e8 + 2**27)  # larger than 2 GB

diff --git a/doc/getting_started/tutorials/03.lazyarray-expressions.ipynb b/doc/getting_started/tutorials/03.lazyarray-expressions.ipynb
@@ -116,7 +116,7 @@
     }
    ],
    "source": [
-    "d = c.eval()  # evaluate the expression\n",
+    "d = c.compute()  # evaluate the expression\n",
     "print(f\"Class: {type(d)}\")\n",
     "print(f\"Compression ratio: {d.schunk.cratio:.2f}x\")"
    ]
@@ -148,7 +148,7 @@
     "cparams = blosc2.CParams(\n",
     "    codec=blosc2.Codec.ZSTD, filters=[blosc2.Filter.BITSHUFFLE], clevel=9, filters_meta=[0]\n",
     ")\n",
-    "d = c.eval(cparams=cparams)\n",
+    "d = c.compute(cparams=cparams)\n",
     "print(f\"Compression ratio: {d.schunk.cratio:.2f}x\")"
    ]
   },
@@ -262,7 +262,7 @@
     }
    ],
    "source": [
-    "d2 = c2.eval()\n",
+    "d2 = c2.compute()\n",
     "print(f\"Compression ratio: {d2.schunk.cratio:.2f}x\")"
    ]
   },
@@ -602,7 +602,7 @@
    ],
    "source": [
     "c2 = a + b2\n",
-    "d2 = c2.eval()\n",
+    "d2 = c2.compute()\n",
     "print(f\"Compression ratio: {d2.schunk.cratio:.2f}x, shape: {d2.shape}\")"
    ]
   },

diff --git a/doc/getting_started/tutorials/03.lazyarray-udf.ipynb b/doc/getting_started/tutorials/03.lazyarray-udf.ipynb
@@ -185,7 +185,7 @@
     }
    ],
    "source": [
-    "c = b.eval(urlpath=\"res.b2nd\", mode=\"w\")\n",
+    "c = b.compute(urlpath=\"res.b2nd\", mode=\"w\")\n",
     "print(f\"Class: {type(c)}\")\n",
     "print(c.info)"
    ]

diff --git a/examples/ndarray/broadcast_expr.py b/examples/ndarray/broadcast_expr.py
@@ -37,7 +37,7 @@
 # print(d, d.shape, d.dtype)
 # print(d.expression, d.operands)
 assert isinstance(d, blosc2.LazyExpr)
-e = d.eval()
+e = d.compute()
 print(e)
 assert isinstance(d, blosc2.LazyExpr)
 # Check

diff --git a/examples/ndarray/c2array_expr.py b/examples/ndarray/c2array_expr.py
@@ -32,5 +32,5 @@
 np.testing.assert_allclose(c[:], a[:] + b[:])
 
 # Get an NDArray instance instead of a NumPy array
-ndarr = c.eval()
+ndarr = c.compute()
 np.testing.assert_allclose(ndarr[:], a[:] + b[:])
diff --git a/examples/ndarray/eval_expr.py → examples/ndarray/compute_expr.py b/examples/ndarray/eval_expr.py → examples/ndarray/compute_expr.py
@@ -25,7 +25,7 @@
 # Get a LazyExpr instance
 c = a**2 + b**2 + 2 * a * b + 1
 # Evaluate: output is a NDArray
-d = c.eval()
+d = c.compute()
 # Check
 assert isinstance(d, blosc2.NDArray)
 assert np.allclose(d[:], npc)
@@ -54,7 +54,7 @@
 dc = blosc2.open("c.b2nd")
 
 # Evaluate: output is a NDArray
-dc2 = dc.eval()
+dc2 = dc.compute()
 # Check
 assert isinstance(dc2, blosc2.NDArray)
 assert np.allclose(dc2[:], npc)

diff --git a/examples/ndarray/eval_fields.py → examples/ndarray/compute_fields.py b/examples/ndarray/eval_fields.py → examples/ndarray/compute_fields.py
@@ -30,7 +30,7 @@
 c = a**2 + b**2 > 2 * a * b + 1
 
 # Evaluate: output is a NDArray
-d = c.eval()
+d = c.compute()
 # Check
 assert isinstance(d, blosc2.NDArray)
 assert np.allclose(d[:], npc)

diff --git a/examples/ndarray/eval_udf_numba.py → examples/ndarray/compute_udf_numba.py b/examples/ndarray/eval_udf_numba.py → examples/ndarray/compute_udf_numba.py
@@ -29,7 +29,7 @@ def func_numba(inputs_tuple, output, offset):
 
 lazyarray = blosc2.lazyudf(func_numba, (npa,), npa.dtype)
 print(lazyarray.info)
-res = lazyarray.eval()
+res = lazyarray.compute()
 print(res.info)
 np.testing.assert_allclose(res[...], npc)
 print("Numba + LazyArray evaluated correctly!")
diff --git a/examples/ndarray/eval_where.py → examples/ndarray/compute_where.py b/examples/ndarray/eval_where.py → examples/ndarray/compute_where.py
diff --git a/examples/ndarray/general_expressions.py b/examples/ndarray/general_expressions.py
@@ -24,17 +24,17 @@
 
 # Get a LazyExpr instance with all NDArray operands
 c = blosc2.lazyexpr("a**2 + b**2 + 2 * a * b + 1", {"a": a, "b": b})
-d = c.eval()
+d = c.compute()
 assert np.allclose(d[:], npc)
 
 # A LazyExpr instance with a mix of NDArray and NumPy operands
 c = blosc2.lazyexpr("a**2 + b**2 + 2 * a * b + 1", {"a": npa, "b": b})
-d = c.eval()
+d = c.compute()
 assert np.allclose(d[:], npc)
 
 # A LazyExpr instance with all NumPy operands
 c = blosc2.lazyexpr("a**2 + b**2 + 2 * a * b + 1", {"a": npa, "b": npb})
-d = c.eval()
+d = c.compute()
 assert np.allclose(d[:], npc)
 
 # Evaluate partial slices

diff --git a/examples/ndarray/reduce_expr.py b/examples/ndarray/reduce_expr.py
@@ -34,7 +34,7 @@
 # print(d, d.shape, d.dtype)
 # print(d.expression, d.operands)
 assert isinstance(d, blosc2.LazyExpr)
-e = d.eval()
+e = d.compute()
 # print(e)
 assert isinstance(d, blosc2.LazyExpr)
 # Check