diff --git a/docs/api/api_core.rst b/docs/api/api_core.rst
index fdeef3e3..8e13160b 100644
--- a/docs/api/api_core.rst
+++ b/docs/api/api_core.rst
@@ -15,6 +15,7 @@ Measures of Entropy
     entropy_gc
     entropy_gauss
     entropy_bin
+    entropy_hist
     entropy_knn
     entropy_kernel
     prepare_for_it
diff --git a/docs/theory.rst b/docs/theory.rst
index 2c084ff1..cacb93d1 100644
--- a/docs/theory.rst
+++ b/docs/theory.rst
@@ -66,10 +66,15 @@ problems due to size effects when using a small data set and variables exploring
 A more complicated and common scenario is the one of continuous variables. To estimate
 the entropy of a continuous variable, different methods are implemented in the toolbox:
 
-* Binning method, that consists in binning the continuous data in a discrete set of bins.
-  In this way, variables are discretized and the entropy can be computed as described above.
-  This procedure can be performed in many different
-  ways :cite:`endres2005bayesian, darbellay1999estimation, fraser1986independent`.
+* Histogram estimator, which consists in binning the continuous data into a
+  discrete set of bins. In this way, variables are discretized and the entropy
+  can be computed as described above, correcting for the bin size.
+
+* Binning method, which estimates the entropy of a discrete variable by
+  estimating the probability of each possible value in a frequentist approach.
+  Note that this procedure can also be applied to continuous variables after
+  discretization, which can be performed in many different ways
+  :cite:`endres2005bayesian, darbellay1999estimation, fraser1986independent`.
 
 * K-Nearest Neighbors (KNN), that estimates the probability distribution by considering
   the K-nearest neighbors of each data point :cite:`kraskov2004estimating`.
diff --git a/examples/it/plot_entropies.py b/examples/it/plot_entropies.py
index 396c1cf6..cb6ba476 100644
--- a/examples/it/plot_entropies.py
+++ b/examples/it/plot_entropies.py
@@ -14,6 +14,7 @@
 """
 # %%
+
 import numpy as np
 
 from hoi.core import get_entropy
 
@@ -34,7 +35,7 @@
 # list of estimators to compare
 metrics = {
     "GC": get_entropy("gc"),
-    "Gaussian": get_entropy("gauss"),
+    "Gaussian": get_entropy(method="gauss"),
     "KNN-3": get_entropy("knn", k=3),
     "Kernel": get_entropy("kernel"),
 }
diff --git a/examples/it/plot_mi.py b/examples/it/plot_mi.py
index ac58147f..454dc6c5 100644
--- a/examples/it/plot_mi.py
+++ b/examples/it/plot_mi.py
@@ -13,6 +13,8 @@
 4. See if the computed MI converge toward the theoretical value
 """
 
+# %%
+
 import numpy as np
 from functools import partial
 
@@ -35,10 +37,10 @@
 # create a special function for the binning approach as it requires binary data
 mi_binning_fcn = get_mi("binning", base=2)
 
-
 def mi_binning(x, y, **kwargs):
     x = digitize(x.T, **kwargs).T
     y = digitize(y.T, **kwargs).T
+
     return mi_binning_fcn(x, y)
 
 
diff --git a/hoi/core/__init__.py b/hoi/core/__init__.py
index 4bcd4f9f..39b7dcb0 100644
--- a/hoi/core/__init__.py
+++ b/hoi/core/__init__.py
@@ -4,6 +4,7 @@
     entropy_gc,
     entropy_gauss,
     entropy_bin,
+    entropy_hist,
     entropy_knn,
     prepare_for_it,
     entropy_kernel,
diff --git a/hoi/core/entropies.py b/hoi/core/entropies.py
index 34a21166..63f5c7f0 100644
--- a/hoi/core/entropies.py
+++ b/hoi/core/entropies.py
@@ -26,15 +26,18 @@ def get_entropy(method="gc", **kwargs):
 
     Parameters
     ----------
-    method : {'gc', 'gauss', 'binning', 'knn', 'kernel'}
+    method : {'gc', 'gauss', 'binning', 'histogram', 'knn', 'kernel'}
         Name of the method to compute entropy. Use either :
 
             * 'gc': gaussian copula entropy [default].
              See :func:`hoi.core.entropy_gc`
            * 'gauss': gaussian entropy. See :func:`hoi.core.entropy_gauss`
-            * 'binning': binning-based estimator of entropy. Note that to
-              use this estimator, the data have be to discretized. See
+            * 'binning': estimator to use for discrete variables. See
              :func:`hoi.core.entropy_bin`
+            * 'histogram': estimator based on binning the data to estimate
+              the probability distribution of the variables and then
+              compute the differential entropy. For more details see
+              :func:`hoi.core.entropy_hist`
            * 'knn': k-nearest neighbor estimator. See
              :func:`hoi.core.entropy_knn`
            * 'kernel': kernel-based estimator of entropy
@@ -58,6 +61,8 @@
         return entropy_gauss
     elif method == "binning":
         return partial(entropy_bin, **kwargs)
+    elif method == "histogram":
+        return partial(entropy_hist, **kwargs)
     elif method == "knn":
         return partial(entropy_knn, **kwargs)
     elif method == "kernel":
@@ -102,7 +107,9 @@ def prepare_for_it(data, method, samples=None, **kwargs):
             "data dtype should be integer. Check that you discretized your"
             " data. If so, use `data.astype(int)`"
         )
-    elif (method in ["kernel", "gc", "knn"]) and (data.dtype != float):
+    elif (method in ["kernel", "gc", "knn", "histogram"]) and (
+        data.dtype != float
+    ):
         raise ValueError(f"data dtype should be float, not {data.dtype}")
 
     # -------------------------------------------------------------------------
@@ -266,7 +273,9 @@ def entropy_bin(x: jnp.array, base: int = 2) -> jnp.array:
     hx : float
         Entropy of x (in bits)
     """
+
     n_features, n_samples = x.shape
+
     # here, we count the number of possible multiplets. The worst is that each
     # trial is unique. So we can prepare the output to be at most (n_samples,)
     # and if trials are repeated, just set to zero it's going to be compensated
@@ -275,9 +284,63 @@
         x, return_counts=True, size=n_samples, axis=1, fill_value=0
     )[1]
     probs = counts / n_samples
+
     return jax.scipy.special.entr(probs).sum() / jnp.log(base)
 
 
+###############################################################################
+###############################################################################
+# HISTOGRAM
+###############################################################################
+###############################################################################
+
+
+@partial(jax.jit, static_argnums=(1, 2))
+def entropy_hist(x: jnp.array, base: float = 2, n_bins: int = 8) -> jnp.array:
+    """Entropy using a histogram estimator.
+
+    Parameters
+    ----------
+    x : array_like
+        Input data of shape (n_features, n_samples). The data can be
+        continuous, as it is binned internally.
+    base : float | 2
+        The logarithmic base to use. Default is base 2.
+    n_bins : int | 8
+        The number of bins to be considered in the binning process.
+
+    Returns
+    -------
+    hx : float
+        Entropy of x (in bits)
+    """
+
+    # bin volume (product of the per-feature bin widths)
+    bins_arr = (x.max(axis=1) - x.min(axis=1)) / n_bins
+    bin_s = jnp.prod(bins_arr)
+
+    # binning of the data
+    x_min, x_max = x.min(axis=1, keepdims=True), x.max(axis=1, keepdims=True)
+    dx = (x_max - x_min) / n_bins
+    x_binned = ((x - x_min) / dx).astype(int)
+    x_binned = jnp.minimum(x_binned, n_bins - 1).astype(int)
+
+    n_features, n_samples = x_binned.shape
+
+    # here, we count the number of possible multiplets. The worst is that each
+    # trial is unique. So we can prepare the output to be at most (n_samples,)
+    # and if trials are repeated, just set to zero it's going to be compensated
+    # by the entr() function
+
+    counts = jnp.unique(
+        x_binned, return_counts=True, size=n_samples, axis=1, fill_value=0
+    )[1]
+
+    probs = counts / n_samples
+
+    return bin_s * jax.scipy.special.entr(probs / bin_s).sum() / jnp.log(base)
+
+
 ###############################################################################
 ###############################################################################
 # KNN
diff --git a/hoi/core/mi.py b/hoi/core/mi.py
index 2809e9e5..f149fb0e 100644
--- a/hoi/core/mi.py
+++ b/hoi/core/mi.py
@@ -20,7 +20,7 @@ def get_mi(method="gc", **kwargs):
 
     Parameters
     ----------
-    method : {'gc', 'binning', 'knn', 'kernel'}
+    method : {'gc', 'gauss', 'binning', 'knn', 'kernel'}
         Name of the method to compute mutual-information.
     kwargs : dict | {}
         Additional arguments sent to the mutual-information function.
diff --git a/hoi/core/tests/test_entropies.py b/hoi/core/tests/test_entropies.py
index 82bc4a12..5441be3c 100644
--- a/hoi/core/tests/test_entropies.py
+++ b/hoi/core/tests/test_entropies.py
@@ -5,6 +5,7 @@
     entropy_gc,
     entropy_gauss,
     entropy_bin,
+    entropy_hist,
     entropy_knn,
     entropy_kernel,
     get_entropy,
@@ -24,7 +25,7 @@ def custom_est(x):
 
 
 # method names
-names = ["gc", "gauss", "knn", "kernel", "binning", custom_est]
+names = ["gc", "gauss", "knn", "kernel", "binning", "histogram", custom_est]
 
 
 # Smoke tests
@@ -46,6 +47,13 @@ def test_entropy_bin(self, x):
         assert hx.dtype == np.float32
         assert hx.shape == ()
 
+    @pytest.mark.parametrize("x", [x1, x2, j1, j2])
+    def test_entropy_hist(self, x):
+        hx = entropy_hist(x)
+        hx = np.asarray(hx)
+        assert hx.dtype == np.float32
+        assert hx.shape == ()
+
     @pytest.mark.parametrize("x", [x1, x2, j1, j2])
     def test_entropy_kernel(self, x):
         hx = entropy_kernel(x)
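
Below is a minimal usage sketch (not part of the diff) showing how the new histogram estimator plugs into the existing get_entropy factory, next to the Gaussian estimator and the closed-form entropy of a standard normal. It assumes this branch is installed and that entropies are returned in bits, as the docstrings above state; the choice of n_bins=16 is arbitrary and is passed through get_entropy the same way the existing example passes k=3 to the KNN estimator.

    import numpy as np

    from hoi.core import get_entropy

    # data of shape (n_features, n_samples), as expected by the entropy estimators
    rng = np.random.default_rng(0)
    x = rng.standard_normal((1, 10_000))

    # closed-form differential entropy of N(0, 1) in bits: 0.5 * log2(2 * pi * e)
    h_theory = 0.5 * np.log2(2 * np.pi * np.e)

    # histogram estimator introduced by this diff vs. the existing Gaussian one
    h_hist = get_entropy("histogram", n_bins=16)(x)
    h_gauss = get_entropy("gauss")(x)

    print(f"theory    : {h_theory:.3f} bits")
    print(f"histogram : {float(h_hist):.3f} bits")
    print(f"gauss     : {float(h_gauss):.3f} bits")

Both estimates should fall near the theoretical value of roughly 2.05 bits; how close the histogram one gets depends on n_bins and on the number of samples, so checking its convergence for a fixed n_bins as the sample size grows would be a natural extension of the smoke test added in test_entropies.py.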