Histogram estimator correction #53

Merged: 19 commits, Sep 3, 2024
1 change: 1 addition & 0 deletions docs/api/api_core.rst
@@ -15,6 +15,7 @@ Measures of Entropy
entropy_gc
entropy_gauss
entropy_bin
entropy_hist
entropy_knn
entropy_kernel
prepare_for_it
13 changes: 9 additions & 4 deletions docs/theory.rst
@@ -66,10 +66,15 @@ problems due to size effects when using a small data set and variables exploring
A more complicated and common scenario is the one of continuous variables. To estimate the
entropy of a continuous variable, different methods are implemented in the toolbox:

* Binning method, that consists in binning the continuous data in a discrete set of bins.
In this way, variables are discretized and the entropy can be computed as described above.
This procedure can be performed in many different
ways :cite:`endres2005bayesian, darbellay1999estimation, fraser1986independent`.
* Histogram estimator, which consists in binning the continuous data into a
discrete set of bins. In this way, variables are discretized and the entropy
can be computed as described above, correcting for the bin size (this
correction is illustrated in the sketch below).

* Binning method, which estimates the entropy of a discrete variable by
estimating the probability of each possible value in a frequentist approach.
Note that this procedure can also be performed on continuous variables after
discretization, in many different ways
:cite:`endres2005bayesian, darbellay1999estimation, fraser1986independent`.

* K-Nearest Neighbors (KNN), that estimates the probability distribution by considering the
K-nearest neighbors of each data point :cite:`kraskov2004estimating`.
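As a minimal illustration of the bin-size correction mentioned in the histogram bullet above, the sketch below (plain NumPy, independent of the toolbox; the sample size and number of bins are illustrative) bins a Gaussian sample, computes the discrete plug-in entropy, and shifts it by the log of the bin width to approximate the differential entropy:

import numpy as np

rng = np.random.default_rng(0)
x = rng.normal(size=100_000)              # one continuous variable
n_bins = 64

# bin the data and estimate the probability of each bin
counts, edges = np.histogram(x, bins=n_bins)
probs = counts / counts.sum()
dx = edges[1] - edges[0]                  # bin width

# discrete (plug-in) entropy of the binned variable, in bits
nz = probs > 0
h_discrete = -np.sum(probs[nz] * np.log2(probs[nz]))

# histogram estimate of the differential entropy: correct by the bin width
h_differential = h_discrete + np.log2(dx)

# analytical differential entropy of a standard Gaussian, in bits
h_true = 0.5 * np.log2(2 * np.pi * np.e)
print(h_differential, h_true)             # the two values should be close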
3 changes: 2 additions & 1 deletion examples/it/plot_entropies.py
@@ -14,6 +14,7 @@
"""

# %%

import numpy as np

from hoi.core import get_entropy
@@ -34,7 +35,7 @@
# list of estimators to compare
metrics = {
"GC": get_entropy("gc"),
"Gaussian": get_entropy("gauss"),
"Gaussian": get_entropy(method="gauss"),
"KNN-3": get_entropy("knn", k=3),
"Kernel": get_entropy("kernel"),
Collaborator comment:
Can you add an example using the binning estimator in the plot_entropies example? Something like in the plot_mi example:

def ent_binning(x):
    x, bin_size = digitize(x)
    fcn_binning = get_entropy("binning", bin_size=bin_size)
    return fcn_binning(x)

And then can you paste the plot comparing estimators inside this conversation please?

}
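A possible shape for the addition requested in the review comment above; this is only a sketch: the hoi.utils import path for digitize is assumed from plot_mi, and while the reviewer's snippet unpacks a bin size from digitize, plot_mi uses its return value directly, so the sketch follows the latter and leaves entropy_bin with its default arguments:

from hoi.core import get_entropy
from hoi.utils import digitize   # import path assumed, not shown in this diff

def ent_binning(x):
    # discretize the continuous data, then apply the discrete binning estimator
    x_binned = digitize(x)
    fcn_binning = get_entropy("binning")
    return fcn_binning(x_binned)

# add the estimator to the dictionary compared in this example
metrics["Binning"] = ent_binning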
4 changes: 3 additions & 1 deletion examples/it/plot_mi.py
@@ -13,6 +13,8 @@
4. See if the computed MI converge toward the theoretical value
"""

# %%

import numpy as np
from functools import partial

@@ -35,10 +37,10 @@
# create a special function for the binning approach as it requires binary data
mi_binning_fcn = get_mi("binning", base=2)


def mi_binning(x, y, **kwargs):
x = digitize(x.T, **kwargs).T
y = digitize(y.T, **kwargs).T

return mi_binning_fcn(x, y)


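For reference, a minimal, hedged usage sketch of the mi_binning wrapper defined above (the array shapes and the default digitize keywords are assumptions for illustration only):

import numpy as np

# two correlated continuous variables of shape (n_features, n_samples)
rng = np.random.default_rng(0)
x = rng.normal(size=(1, 1000))
y = x + 0.5 * rng.normal(size=(1, 1000))

# the wrapper digitizes both inputs before calling the binning MI estimator
mi = mi_binning(x, y)
print(mi)   # mutual information estimate, in bits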
1 change: 1 addition & 0 deletions hoi/core/__init__.py
@@ -4,6 +4,7 @@
entropy_gc,
entropy_gauss,
entropy_bin,
entropy_hist,
entropy_knn,
prepare_for_it,
entropy_kernel,
71 changes: 67 additions & 4 deletions hoi/core/entropies.py
@@ -26,15 +26,18 @@ def get_entropy(method="gc", **kwargs):

Parameters
----------
method : {'gc', 'gauss', 'binning', 'knn', 'kernel'}
method : {'gc', 'gauss', 'binning', 'histogram', 'knn', 'kernel'}
Name of the method to compute entropy. Use either :

* 'gc': gaussian copula entropy [default]. See
:func:`hoi.core.entropy_gc`
* 'gauss': gaussian entropy. See :func:`hoi.core.entropy_gauss`
* 'binning': binning-based estimator of entropy. Note that to
use this estimator, the data have be to discretized. See
* 'binning': estimator to use for discrete variables. See
:func:`hoi.core.entropy_bin`
* 'histogram': estimator based on binning the data to estimate
the probability distribution of the variables, and then
compute the differential entropy. For more details, see
:func:`hoi.core.entropy_hist`
* 'knn': k-nearest neighbor estimator. See
:func:`hoi.core.entropy_knn`
* 'kernel': kernel-based estimator of entropy
@@ -58,6 +61,8 @@
return entropy_gauss
elif method == "binning":
return partial(entropy_bin, **kwargs)
elif method == "histogram":
return partial(entropy_hist, **kwargs)
elif method == "knn":
return partial(entropy_knn, **kwargs)
elif method == "kernel":
@@ -102,7 +107,9 @@ def prepare_for_it(data, method, samples=None, **kwargs):
"data dtype should be integer. Check that you discretized your"
" data. If so, use `data.astype(int)`"
)
elif (method in ["kernel", "gc", "knn"]) and (data.dtype != float):
elif (method in ["kernel", "gc", "knn", "histogram"]) and (
data.dtype != float
):
raise ValueError(f"data dtype should be float, not {data.dtype}")

# -------------------------------------------------------------------------
@@ -266,7 +273,9 @@ def entropy_bin(x: jnp.array, base: int = 2) -> jnp.array:
hx : float
Entropy of x (in bits)
"""

n_features, n_samples = x.shape

# here, we count the number of possible multiplets. The worst is that each
# trial is unique. So we can prepare the output to be at most (n_samples,)
# and if trials are repeated, just set to zero it's going to be compensated
Expand All @@ -275,9 +284,63 @@ def entropy_bin(x: jnp.array, base: int = 2) -> jnp.array:
x, return_counts=True, size=n_samples, axis=1, fill_value=0
)[1]
probs = counts / n_samples

return jax.scipy.special.entr(probs).sum() / jnp.log(base)


###############################################################################
###############################################################################
# HISTOGRAM
###############################################################################
###############################################################################


@partial(jax.jit, static_argnums=(1, 2))
def entropy_hist(x: jnp.array, base: float = 2, n_bins: int = 8) -> jnp.array:
"""Entropy using binning.

Parameters
----------
x : array_like
Input data of shape (n_features, n_samples). The data can be
continuous; they are binned internally into n_bins bins
base : float | 2
The logarithmic base to use. Default is base 2.
n_bins : int | 8
The number of bins to use when discretizing the data

Returns
-------
hx : float
Entropy of x (in bits)
"""

# bin volume computation (product of the per-feature bin widths)
bins_arr = (x.max(axis=1) - x.min(axis=1)) / n_bins
bin_s = jnp.prod(bins_arr)

# binning of the data
x_min, x_max = x.min(axis=1, keepdims=True), x.max(axis=1, keepdims=True)
dx = (x_max - x_min) / n_bins
x_binned = ((x - x_min) / dx).astype(int)
x_binned = jnp.minimum(x_binned, n_bins - 1).astype(int)

n_features, n_samples = x_binned.shape

# here, we count the number of possible multiplets. The worst is that each
# trial is unique. So we can prepare the output to be at most (n_samples,)
# and if trials are repeated, just set to zero it's going to be compensated
# by the entr() function

counts = jnp.unique(
x_binned, return_counts=True, size=n_samples, axis=1, fill_value=0
)[1]

probs = counts / n_samples

return bin_s * jax.scipy.special.entr(probs / bin_s).sum() / jnp.log(base)


###############################################################################
###############################################################################
# KNN
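To tie the pieces of entropies.py above together, here is a hedged end-to-end sketch of the new estimator; the sample size, the number of bins, and the comparison against the analytic Gaussian value are illustrative, and the call signature follows get_entropy and entropy_hist as added in this diff:

import numpy as np
from hoi.core import get_entropy

# continuous Gaussian data of shape (n_features, n_samples); float dtype is
# expected for the 'histogram' method
x = np.random.randn(1, 10_000).astype(np.float32)

# build the histogram-based estimator with a chosen number of bins
entropy_fcn = get_entropy(method="histogram", n_bins=32)
hx = entropy_fcn(x)

# analytical differential entropy of a standard Gaussian, in bits
h_true = 0.5 * np.log2(2 * np.pi * np.e)
print(float(hx), h_true)   # the estimate should be close to the analytic value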
2 changes: 1 addition & 1 deletion hoi/core/mi.py
@@ -20,7 +20,7 @@ def get_mi(method="gc", **kwargs):

Parameters
----------
method : {'gc', 'binning', 'knn', 'kernel'}
method : {'gc', 'gauss', 'binning', 'knn', 'kernel'}
Name of the method to compute mutual-information.
kwargs : dict | {}
Additional arguments sent to the mutual-information function.
10 changes: 9 additions & 1 deletion hoi/core/tests/test_entropies.py
@@ -5,6 +5,7 @@
entropy_gc,
entropy_gauss,
entropy_bin,
entropy_hist,
entropy_knn,
entropy_kernel,
get_entropy,
@@ -24,7 +25,7 @@ def custom_est(x):


# method names
names = ["gc", "gauss", "knn", "kernel", "binning", custom_est]
names = ["gc", "gauss", "knn", "kernel", "binning", "histogram", custom_est]


# Smoke tests
@@ -46,6 +47,13 @@ def test_entropy_bin(self, x):
assert hx.dtype == np.float32
assert hx.shape == ()

@pytest.mark.parametrize("x", [x1, x2, j1, j2])
def test_entropy_hist(self, x):
hx = entropy_hist(x)
hx = np.asarray(hx)
assert hx.dtype == np.float32
assert hx.shape == ()

@pytest.mark.parametrize("x", [x1, x2, j1, j2])
def test_entropy_kernel(self, x):
hx = entropy_kernel(x)
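Finally, a hedged, self-contained version of the new smoke test that can be run outside pytest; the random data shape is an assumption, since the x1, x2, j1, j2 fixtures are defined above the visible hunk:

import numpy as np
from hoi.core import entropy_hist

# continuous data with 5 features and 100 samples, float32 as used by jax
x = np.random.rand(5, 100).astype(np.float32)

hx = np.asarray(entropy_hist(x))
assert hx.dtype == np.float32
assert hx.shape == ()
print(hx)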