Skip to content

Commit

Permalink
Merge pull request #292 from Blosc/lazy-guess
Browse files Browse the repository at this point in the history
Reductions to be preserved in persistent lazy expressions
  • Loading branch information
FrancescAlted authored Oct 19, 2024
2 parents df10d36 + bf15e21 commit 93f9d77
Show file tree
Hide file tree
Showing 36 changed files with 467 additions and 244 deletions.
3 changes: 2 additions & 1 deletion .github/workflows/cibuildwheels.yml
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,8 @@ jobs:
cd ./dist
tar -xzf blosc2-*.tar.gz
cd ./blosc2-*/
pip install -e .[test] --break-system-packages
pip install pip --upgrade
pip install --break-system-packages -e .[test]
- name: Test sdist package with pytest
run: |
Expand Down
4 changes: 2 additions & 2 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ Here it is a simple example:
expr = ((a ** 3 + blosc2.sin(c * 2)) < b) & (c > 0)
# Evaluate and get a NDArray as result
out = expr.eval()
out = expr.compute()
print(out.info)
As you can see, the `NDArray` instances are very similar to NumPy arrays, but behind the scenes,
Expand All @@ -123,7 +123,7 @@ you can achieve when the operands fit comfortably in memory:
In this case, the performance is somewhat below that of top-tier libraries like Numexpr or Numba,
but it is still quite good. Using CPUs with more cores than the M2 could further reduce the
performance gap. One important point to note is that the memory consumption when
using the `LazyArray.eval()` method is very low because the output is an `NDArray` object, which
using the `LazyArray.compute()` method is very low because the output is an `NDArray` object, which
is compressed and stored in memory by default. On the other hand, the `LazyArray.__getitem__()`
method returns an actual NumPy array, so it is not recommended for large datasets, as it can consume
a significant amount of memory (though it may still be convenient for small outputs).
Expand Down
1 change: 0 additions & 1 deletion bench/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@

import blosc2


CUBE_SIDE = 128

class MmapBenchmarking:
Expand Down
4 changes: 2 additions & 2 deletions bench/lazyarray-expr-small-dask.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@
"source": [
"%%mprof_run 1.lazyexpr::eval-LZ4-1\n",
"# Evaluate and get a NDArray as result\n",
"out = expr.eval()"
"out = expr.compute()"
]
},
{
Expand Down Expand Up @@ -452,7 +452,7 @@
"source": [
"%%mprof_run 2.lazyexpr::eval-nocompr\n",
"# Evaluate and get a NDArray as result\n",
"out4 = expr.eval()"
"out4 = expr.compute()"
]
},
{
Expand Down
4 changes: 2 additions & 2 deletions bench/ndarray/broadcast_expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,8 @@
c = a * b
# print(f"Elapsed time (expr): {time() - t0:.6f} s")
t0 = time()
# d = c.eval(cparams=dict(codec=codec, clevel=5), chunks=(chunks, chunks), blocks=(blocks, blocks))
d = c.eval(cparams=dict(codec=codec, clevel=5))
# d = c.compute(cparams=dict(codec=codec, clevel=5), chunks=(chunks, chunks), blocks=(blocks, blocks))
d = c.compute(cparams=dict(codec=codec, clevel=5))
print(f"Elapsed time (eval): {time() - t0:.6f} s")
# print(d[:])
print(f"cratio: {d.schunk.cratio:.2f}x")
Expand Down
7 changes: 3 additions & 4 deletions bench/ndarray/eval_expr_numba.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@

import blosc2


shape = (5000, 10_000)
chunks = [500, 10_000]
blocks = [4, 10_000]
Expand Down Expand Up @@ -112,7 +111,7 @@ def udf_numba(inputs, output, offset):
b2expr = expr.replace("sin", "blosc2.sin").replace("cos", "blosc2.cos")
c = eval(b2expr, b2vardict)
t0 = time()
d = c.eval()
d = c.compute()
print("LazyExpr+eval took %.3f s" % (time() - t0))
# Check
np.testing.assert_allclose(d[:], npres, rtol=rtol, atol=atol)
Expand All @@ -139,7 +138,7 @@ def udf_numba(inputs, output, offset):
# actual benchmark
# compute() uses the udf function as a prefilter
t0 = time()
res = expr_.eval()
res = expr_.compute()
print("LazyUDF+eval took %.3f s" % (time() - t0))
np.testing.assert_allclose(res[...], npres, rtol=rtol, atol=atol)
# getitem uses the same compiled function but as a postfilter
Expand All @@ -152,7 +151,7 @@ def udf_numba(inputs, output, offset):
chunks=chunks, blocks=blocks, cparams=cparams)
# getitem but using chunked evaluation
t0 = time()
res = expr_.eval()
res = expr_.compute()
print("LazyUDF+chunked_eval took %.3f s" % (time() - t0))
np.testing.assert_allclose(res[...], npres, rtol=rtol, atol=atol)
t0 = time()
Expand Down
8 changes: 4 additions & 4 deletions bench/ndarray/eval_fields.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,12 @@
# LICENSE file in the root directory of this source tree)
#######################################################################

import numpy as np

import blosc2
from time import time

import numexpr as ne
import numpy as np

import blosc2

shape = (4_000, 5_000)
chunks = (10, 5_000)
Expand Down Expand Up @@ -49,7 +49,7 @@
c = a**2 + b**2 > 2 * a * b + 1
# Evaluate: output is a NDArray
t0 = time()
d = c.eval(cparams=cparams)
d = c.compute(cparams=cparams)
t = time() - t0
print(f"Time to evaluate field expression (eval): {t:.3f} s; {nps.nbytes/2**30/t:.2f} GB/s")

Expand Down
12 changes: 6 additions & 6 deletions bench/ndarray/eval_where.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,12 @@
# LICENSE file in the root directory of this source tree)
#######################################################################

import numpy as np

import blosc2
from time import time

import numexpr as ne
import numpy as np

import blosc2

shape = (4_000, 5_000)
chunks = (10, 5_000)
Expand Down Expand Up @@ -49,7 +49,7 @@
# Evaluate: output is a NDArray
t0 = time()
c = a**2 + b**2 > 2 * a * b + 1
d = c.where(0, 1).eval(cparams=cparams)
d = c.where(0, 1).compute(cparams=cparams)
t = time() - t0
print(f"Time to evaluate where expression (eval): {t:.3f} s; {nps.nbytes/2**30/t:.3f} GB/s")

Expand All @@ -69,13 +69,13 @@

# Evaluate and get row values: output is a NDArray
t0 = time()
npd = s[a**2 + b**2 > 2 * a * b + 1].eval(cparams=cparams)
npd = s[a**2 + b**2 > 2 * a * b + 1].compute(cparams=cparams)
t = time() - t0
print(f"Time to get row values (eval): {t:.3f} s; {nps.nbytes/2**30/t:.3f} GB/s")

# Evaluate and get row values: output is a NDArray
t0 = time()
npd = s['a**2 + b**2 > 2 * a * b + 1'].eval(cparams=cparams)
npd = s['a**2 + b**2 > 2 * a * b + 1'].compute(cparams=cparams)
t = time() - t0
print(f"Time to get row values (eval, string): {t:.3f} s; {nps.nbytes/2**30/t:.3f} GB/s")

Expand Down
12 changes: 6 additions & 6 deletions bench/ndarray/lazyarray-expr.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@
"source": [
"%%mprof_run 0.lazyexpr::mmap-warmup\n",
"# Warm memory-map cache\n",
"out1 = expr.eval()"
"out1 = expr.compute()"
]
},
{
Expand All @@ -206,7 +206,7 @@
"source": [
"%%mprof_run 1.lazyexpr::eval\n",
"# Evaluate and get a NDArray as result\n",
"out1 = expr.eval()"
"out1 = expr.compute()"
]
},
{
Expand Down Expand Up @@ -325,7 +325,7 @@
"outputs": [],
"source": [
"# Warm numba jit and compile\n",
"out1 = lzyudf.eval()"
"out1 = lzyudf.compute()"
]
},
{
Expand All @@ -341,7 +341,7 @@
"outputs": [],
"source": [
"#%%mprof_run 2.lazyudf::eval\n",
"#out2 = lzyudf.eval()"
"#out2 = lzyudf.compute()"
]
},
{
Expand Down Expand Up @@ -451,7 +451,7 @@
"source": [
"%%mprof_run 6.lazyexpr::eval-second-time\n",
"# Evaluate and get a NDArray as result\n",
"out6 = expr.eval()"
"out6 = expr.compute()"
]
},
{
Expand All @@ -476,7 +476,7 @@
"source": [
"%%mprof_run 6.lazyexpr::eval-second-time\n",
"# Evaluate and get a NDArray as result\n",
"out6 = expr.eval()"
"out6 = expr.compute()"
]
},
{
Expand Down
3 changes: 1 addition & 2 deletions bench/ndarray/reduce_expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@

import blosc2


shape = (50, 100, 10_000)
chunks = [5, 100, 10_000]
blocks = [4, 10, 1_000]
Expand Down Expand Up @@ -63,7 +62,7 @@
b2expr = expr.replace("sin", "blosc2.sin").replace("cos", "blosc2.cos")
c = eval(b2expr, b2vardict)
t0 = time()
d = c.eval()
d = c.compute()
d = d.sum(axis=axis) #, dtype=npres.dtype)
print("LazyExpr+eval took %.3f s" % (time() - t0))
# Check
Expand Down
2 changes: 1 addition & 1 deletion bench/pack_tensor.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
Packaging tensors (PyTorch, TensorFlow) larger than 2 GB.
"""

import io
import sys
import time

Expand All @@ -20,6 +19,7 @@
import torch

import blosc2
import io

NREP = 1
# N = int(5e8 + 2**27) # larger than 2 GB
Expand Down
8 changes: 4 additions & 4 deletions doc/getting_started/tutorials/03.lazyarray-expressions.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@
}
],
"source": [
"d = c.eval() # evaluate the expression\n",
"d = c.compute() # evaluate the expression\n",
"print(f\"Class: {type(d)}\")\n",
"print(f\"Compression ratio: {d.schunk.cratio:.2f}x\")"
]
Expand Down Expand Up @@ -148,7 +148,7 @@
"cparams = blosc2.CParams(\n",
" codec=blosc2.Codec.ZSTD, filters=[blosc2.Filter.BITSHUFFLE], clevel=9, filters_meta=[0]\n",
")\n",
"d = c.eval(cparams=cparams)\n",
"d = c.compute(cparams=cparams)\n",
"print(f\"Compression ratio: {d.schunk.cratio:.2f}x\")"
]
},
Expand Down Expand Up @@ -262,7 +262,7 @@
}
],
"source": [
"d2 = c2.eval()\n",
"d2 = c2.compute()\n",
"print(f\"Compression ratio: {d2.schunk.cratio:.2f}x\")"
]
},
Expand Down Expand Up @@ -602,7 +602,7 @@
],
"source": [
"c2 = a + b2\n",
"d2 = c2.eval()\n",
"d2 = c2.compute()\n",
"print(f\"Compression ratio: {d2.schunk.cratio:.2f}x, shape: {d2.shape}\")"
]
},
Expand Down
2 changes: 1 addition & 1 deletion doc/getting_started/tutorials/03.lazyarray-udf.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@
}
],
"source": [
"c = b.eval(urlpath=\"res.b2nd\", mode=\"w\")\n",
"c = b.compute(urlpath=\"res.b2nd\", mode=\"w\")\n",
"print(f\"Class: {type(c)}\")\n",
"print(c.info)"
]
Expand Down
2 changes: 1 addition & 1 deletion examples/ndarray/broadcast_expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
# print(d, d.shape, d.dtype)
# print(d.expression, d.operands)
assert isinstance(d, blosc2.LazyExpr)
e = d.eval()
e = d.compute()
print(e)
assert isinstance(d, blosc2.LazyExpr)
# Check
Expand Down
2 changes: 1 addition & 1 deletion examples/ndarray/c2array_expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,5 +32,5 @@
np.testing.assert_allclose(c[:], a[:] + b[:])

# Get an NDArray instance instead of a NumPy array
ndarr = c.eval()
ndarr = c.compute()
np.testing.assert_allclose(ndarr[:], a[:] + b[:])
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
# Get a LazyExpr instance
c = a**2 + b**2 + 2 * a * b + 1
# Evaluate: output is a NDArray
d = c.eval()
d = c.compute()
# Check
assert isinstance(d, blosc2.NDArray)
assert np.allclose(d[:], npc)
Expand Down Expand Up @@ -54,7 +54,7 @@
dc = blosc2.open("c.b2nd")

# Evaluate: output is a NDArray
dc2 = dc.eval()
dc2 = dc.compute()
# Check
assert isinstance(dc2, blosc2.NDArray)
assert np.allclose(dc2[:], npc)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
c = a**2 + b**2 > 2 * a * b + 1

# Evaluate: output is a NDArray
d = c.eval()
d = c.compute()
# Check
assert isinstance(d, blosc2.NDArray)
assert np.allclose(d[:], npc)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def func_numba(inputs_tuple, output, offset):

lazyarray = blosc2.lazyudf(func_numba, (npa,), npa.dtype)
print(lazyarray.info)
res = lazyarray.eval()
res = lazyarray.compute()
print(res.info)
np.testing.assert_allclose(res[...], npc)
print("Numba + LazyArray evaluated correctly!")
File renamed without changes.
6 changes: 3 additions & 3 deletions examples/ndarray/general_expressions.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,17 +24,17 @@

# Get a LazyExpr instance with all NDArray operands
c = blosc2.lazyexpr("a**2 + b**2 + 2 * a * b + 1", {"a": a, "b": b})
d = c.eval()
d = c.compute()
assert np.allclose(d[:], npc)

# A LazyExpr instance with a mix of NDArray and NumPy operands
c = blosc2.lazyexpr("a**2 + b**2 + 2 * a * b + 1", {"a": npa, "b": b})
d = c.eval()
d = c.compute()
assert np.allclose(d[:], npc)

# A LazyExpr instance with all NumPy operands
c = blosc2.lazyexpr("a**2 + b**2 + 2 * a * b + 1", {"a": npa, "b": npb})
d = c.eval()
d = c.compute()
assert np.allclose(d[:], npc)

# Evaluate partial slices
Expand Down
2 changes: 1 addition & 1 deletion examples/ndarray/reduce_expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
# print(d, d.shape, d.dtype)
# print(d.expression, d.operands)
assert isinstance(d, blosc2.LazyExpr)
e = d.eval()
e = d.compute()
# print(e)
assert isinstance(d, blosc2.LazyExpr)
# Check
Expand Down
Loading

0 comments on commit 93f9d77

Please sign in to comment.