Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handle NaNs #213

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
89 changes: 73 additions & 16 deletions mapclassify/classifiers.py
Original file line number Diff line number Diff line change
Expand Up @@ -508,6 +508,34 @@ def load_example():
return calemp.load()


def _jenks_caspall(y, k):
x = y.copy()
q = quantile(x, k)
solving = True
xb, cnts = bin1d(x, q)
if x.ndim == 1:
x.shape = (x.size, 1)
n, _k = x.shape
xm = [np.median(x[xb==i]) for i in np.unique(xb)]
xb0 = xb.copy()
q = xm
it = 0
rk = list(range(k))
while solving:
xb = np.zeros(xb0.shape, int)
d = abs(x - q)
xb = d.argmin(axis=1)
if (xb0 == xb).all():
solving = False
else:
xb0 = xb
it += 1
q = np.array([np.median(x[xb==i]) for i in rk])
cuts = np.array([max(x[xb==i]) for i in np.unique(xb)])
cuts.shape = (len(cuts),)
return cuts, it


def _kmeans(y, k=5, n_init=10):
"""
Helper function to do k-means in one dimension.
Expand Down Expand Up @@ -680,7 +708,9 @@ class MapClassifier:
"""

def __init__(self, y):
y = np.asarray(y).flatten()
y = np.asarray(y).flatten().astype(float)

self.n = len(y)
self.name = "Map Classifier"
self.fmt = FMT
self.y = y
Expand All @@ -696,15 +726,23 @@ def set_fmt(self, fmt):
fmt = property(get_fmt, set_fmt)

def _summary(self):
yb = self.yb
yb = self.yb[self.mask]
self.classes = [np.nonzero(yb == c)[0].tolist() for c in range(self.k)]
self.tss = self.get_tss()
self.adcm = self.get_adcm()
self.gadf = self.get_gadf()

def _classify(self):
mask = ~np.isnan(self.y)
_y = self.y
self.y = self.y[mask]
self._set_bins()
self.yb, self.counts = bin1d(self.y, self.bins)
yb, self.counts = bin1d(self.y, self.bins)
self.yb = np.zeros(_y.shape, np.int8)
self.yb[mask] = yb
self.yb[~mask] = -1
self.y = _y
self.mask = mask

def _update(self, data, *args, **kwargs):
"""
Expand Down Expand Up @@ -965,7 +1003,7 @@ def get_adcm(self):

def get_gadf(self):
"""Goodness of absolute deviation of fit."""
adam = (np.abs(self.y - np.median(self.y))).sum()
adam = (np.abs(self.y[self.mask] - np.median(self.y[self.mask]))).sum()
# return 1 if array is invariant
gadf = 1 if adam == 0 else 1 - self.adcm / adam
return gadf
Expand Down Expand Up @@ -2028,16 +2066,24 @@ def __init__(self, y, k=K, pct=0.10, truncate=True):
pct = 1000.0 / n
ids = np.random.randint(0, n, int(n * pct))
y = np.asarray(y)
yr = y[ids]
mask = ~np.isnan(y)
yr = y[mask][ids]
yr[-1] = max(y) # make sure we have the upper bound
yr[0] = min(y) # make sure we have the min
self.original_y = y
self.pct = pct
self._truncated = truncate
self.yr = yr
self.yr_n = yr.size
MapClassifier.__init__(self, yr)
self.yb, self.counts = bin1d(y, self.bins)
#MapClassifier.__init__(self, yr)
bins = _fisher_jenks_means(yr, k)


yb, self.counts = bin1d(y[mask], bins)
self.yb = np.zeros(y.shape, np.uint8)
self.yb[mask] = yb
self.yb[~mask] = -1
self.bins = bins
self.name = "FisherJenksSampled"
self.y = y
self._summary() # have to recalculate summary stats
Expand All @@ -2046,6 +2092,9 @@ def _set_bins(self):
fj = FisherJenks(self.y, self.k)
self.bins = fj.bins

def _summary(self):
pass

def update(self, y=None, inplace=False, **kwargs):
"""
Add data or change classification parameters.
Expand Down Expand Up @@ -2219,27 +2268,35 @@ class JenksCaspallSampled(MapClassifier):

def __init__(self, y, k=K, pct=0.10):
self.k = k
n = y.size
mask = ~np.isnan(y)
n = y[mask].size
if pct * n > 1000:
pct = 1000.0 / n
ids = np.random.randint(0, n, int(n * pct))
y = np.asarray(y)
yr = y[ids]
yr[0] = max(y) # make sure we have the upper bound
yr = y[mask][ids]
yr[0] = max(y[mask]) # make sure we have the upper bound
# fit on sample
bins, its = _jenks_caspall(yr, k)
self.original_y = y
self.pct = pct
self.yr = yr
self.yr_n = yr.size
MapClassifier.__init__(self, yr)
self.yb, self.counts = bin1d(y, self.bins)
self.mask = mask
# transform all values
yb, counts = bin1d(y[mask], bins)
self.yb = np.zeros(y.shape, np.uint8)
self.yb[mask] = yb
self.yb[~mask] = -1
self.name = "JenksCaspallSampled"
self.y = y
self.bins = bins
self.counts = counts
self.iterations = its
self._summary() # have to recalculate summary stats

def _set_bins(self):
jc = JenksCaspall(self.y, self.k)
self.bins = jc.bins
self.iterations = jc.iterations
def _summary(self):
pass

def update(self, y=None, inplace=False, **kwargs):
"""
Expand Down
21 changes: 21 additions & 0 deletions mapclassify/tests/test_mapclassify.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,13 @@ def test_quantile_k(self):
numpy.testing.assert_almost_equal(k, len(quantile(y, k)))
assert k == len(quantile(y, k))

def test_quantile_nan(self):
numpy.random.seed(4414)
y = numpy.random.normal(0, 11, size=11)
y[10] = numpy.NaN
quants = Quantiles(y, k=3)
known_yb = numpy.array([0, 1, 0, 1, 0, 2, 0, 2, 1, 2, -1])
numpy.testing.assert_allclose(quants.yb, known_yb, rtol=RTOL)

class TestUpdate:
def setup_method(self):
Expand Down Expand Up @@ -525,6 +532,9 @@ def test_MaximumBreaks(self):
class TestFisherJenks:
def setup_method(self):
self.V = load_example()
v = load_example().to_list()
v.append(numpy.NaN)
self.VNAN = numpy.array(v)

def test_FisherJenks(self):
fj = FisherJenks(self.V)
Expand All @@ -536,6 +546,17 @@ def test_FisherJenks(self):
fj.counts, numpy.array([49, 3, 4, 1, 1])
)

def test_FisherJenksNAN(self):
fj = FisherJenks(self.VNAN)
assert fj.adcm == 799.24000000000001
numpy.testing.assert_array_almost_equal(
fj.bins, numpy.array([75.29, 192.05, 370.5, 722.85, 4111.45])
)
numpy.testing.assert_array_almost_equal(
fj.counts, numpy.array([49, 3, 4, 1, 1])
)
numpy.testing.assert_equal(fj.yb[-1], -1)


class TestJenksCaspall:
def setup_method(self):
Expand Down
Loading