diff --git a/mapclassify/classifiers.py b/mapclassify/classifiers.py
index 515a0dc0..1c7666a9 100644
--- a/mapclassify/classifiers.py
+++ b/mapclassify/classifiers.py
@@ -508,6 +508,34 @@ def load_example():
     return calemp.load()
 
 
+def _jenks_caspall(y, k):
+    x = y.copy()
+    q = quantile(x, k)
+    solving = True
+    xb, cnts = bin1d(x, q)
+    if x.ndim == 1:
+        x.shape = (x.size, 1)
+    n, _k = x.shape
+    xm = [np.median(x[xb==i]) for i in np.unique(xb)]
+    xb0 = xb.copy()
+    q = xm
+    it = 0
+    rk = list(range(k))
+    while solving:
+        xb = np.zeros(xb0.shape, int)
+        d = abs(x - q)
+        xb = d.argmin(axis=1)
+        if (xb0 == xb).all():
+            solving = False
+        else:
+            xb0 = xb
+        it += 1
+        q = np.array([np.median(x[xb==i]) for i in rk])
+    cuts = np.array([max(x[xb==i]) for i in np.unique(xb)])
+    cuts.shape = (len(cuts),)
+    return cuts, it
+
+
 def _kmeans(y, k=5, n_init=10):
     """
     Helper function to do k-means in one dimension.
@@ -680,7 +708,9 @@ class MapClassifier:
     """
 
     def __init__(self, y):
-        y = np.asarray(y).flatten()
+        y = np.asarray(y).flatten().astype(float)
+
+        self.n = len(y)
         self.name = "Map Classifier"
         self.fmt = FMT
         self.y = y
@@ -696,15 +726,23 @@ def set_fmt(self, fmt):
     fmt = property(get_fmt, set_fmt)
 
     def _summary(self):
-        yb = self.yb
+        yb = self.yb[self.mask]
         self.classes = [np.nonzero(yb == c)[0].tolist() for c in range(self.k)]
         self.tss = self.get_tss()
         self.adcm = self.get_adcm()
         self.gadf = self.get_gadf()
 
     def _classify(self):
+        mask = ~np.isnan(self.y)
+        _y = self.y
+        self.y = self.y[mask]
         self._set_bins()
-        self.yb, self.counts = bin1d(self.y, self.bins)
+        yb, self.counts = bin1d(self.y, self.bins)
+        self.yb = np.zeros(_y.shape, np.int8)
+        self.yb[mask] = yb
+        self.yb[~mask] = -1
+        self.y = _y
+        self.mask = mask
 
     def _update(self, data, *args, **kwargs):
         """
@@ -965,7 +1003,7 @@ def get_adcm(self):
 
     def get_gadf(self):
         """Goodness of absolute deviation of fit."""
-        adam = (np.abs(self.y - np.median(self.y))).sum()
+        adam = (np.abs(self.y[self.mask] - np.median(self.y[self.mask]))).sum()
         # return 1 if array is invariant
         gadf = 1 if adam == 0 else 1 - self.adcm / adam
         return gadf
@@ -2028,7 +2066,8 @@ def __init__(self, y, k=K, pct=0.10, truncate=True):
             pct = 1000.0 / n
         ids = np.random.randint(0, n, int(n * pct))
         y = np.asarray(y)
-        yr = y[ids]
+        mask = ~np.isnan(y)
+        yr = y[mask][ids]
         yr[-1] = max(y)  # make sure we have the upper bound
         yr[0] = min(y)  # make sure we have the min
         self.original_y = y
@@ -2036,8 +2075,15 @@ def __init__(self, y, k=K, pct=0.10, truncate=True):
         self._truncated = truncate
         self.yr = yr
         self.yr_n = yr.size
-        MapClassifier.__init__(self, yr)
-        self.yb, self.counts = bin1d(y, self.bins)
+        #MapClassifier.__init__(self, yr)
+        bins = _fisher_jenks_means(yr, k)
+
+
+        yb, self.counts = bin1d(y[mask], bins)
+        self.yb = np.zeros(y.shape, np.uint8)
+        self.yb[mask] = yb
+        self.yb[~mask] = -1
+        self.bins = bins
         self.name = "FisherJenksSampled"
         self.y = y
         self._summary()  # have to recalculate summary stats
@@ -2046,6 +2092,9 @@ def _set_bins(self):
         fj = FisherJenks(self.y, self.k)
         self.bins = fj.bins
 
+    def _summary(self):
+        pass
+
     def update(self, y=None, inplace=False, **kwargs):
         """
         Add data or change classification parameters.
@@ -2219,27 +2268,35 @@ class JenksCaspallSampled(MapClassifier):
 
     def __init__(self, y, k=K, pct=0.10):
         self.k = k
-        n = y.size
+        mask = ~np.isnan(y)
+        n = y[mask].size
         if pct * n > 1000:
             pct = 1000.0 / n
         ids = np.random.randint(0, n, int(n * pct))
         y = np.asarray(y)
-        yr = y[ids]
-        yr[0] = max(y)  # make sure we have the upper bound
+        yr = y[mask][ids]
+        yr[0] = max(y[mask])  # make sure we have the upper bound
+        # fit on sample
+        bins, its = _jenks_caspall(yr, k)
         self.original_y = y
         self.pct = pct
         self.yr = yr
         self.yr_n = yr.size
-        MapClassifier.__init__(self, yr)
-        self.yb, self.counts = bin1d(y, self.bins)
+        self.mask = mask
+        # transform all values
+        yb, counts = bin1d(y[mask], bins)
+        self.yb = np.zeros(y.shape, np.uint8)
+        self.yb[mask] = yb
+        self.yb[~mask] = -1
         self.name = "JenksCaspallSampled"
         self.y = y
+        self.bins = bins
+        self.counts = counts
+        self.iterations = its
         self._summary()  # have to recalculate summary stats
 
-    def _set_bins(self):
-        jc = JenksCaspall(self.y, self.k)
-        self.bins = jc.bins
-        self.iterations = jc.iterations
+    def _summary(self):
+        pass
 
     def update(self, y=None, inplace=False, **kwargs):
         """
diff --git a/mapclassify/tests/test_mapclassify.py b/mapclassify/tests/test_mapclassify.py
index dc06e190..c3b53e90 100644
--- a/mapclassify/tests/test_mapclassify.py
+++ b/mapclassify/tests/test_mapclassify.py
@@ -28,6 +28,13 @@ def test_quantile_k(self):
             numpy.testing.assert_almost_equal(k, len(quantile(y, k)))
             assert k == len(quantile(y, k))
 
+    def test_quantile_nan(self):
+        numpy.random.seed(4414)
+        y = numpy.random.normal(0, 11, size=11)
+        y[10] = numpy.NaN
+        quants = Quantiles(y, k=3)
+        known_yb = numpy.array([0, 1, 0, 1, 0, 2, 0, 2, 1, 2, -1])
+        numpy.testing.assert_allclose(quants.yb, known_yb, rtol=RTOL)
 
 class TestUpdate:
     def setup_method(self):
@@ -525,6 +532,9 @@ def test_MaximumBreaks(self):
 class TestFisherJenks:
     def setup_method(self):
         self.V = load_example()
+        v = load_example().to_list()
+        v.append(numpy.NaN)
+        self.VNAN = numpy.array(v)
 
     def test_FisherJenks(self):
         fj = FisherJenks(self.V)
@@ -536,6 +546,17 @@ def test_FisherJenks(self):
             fj.counts, numpy.array([49, 3, 4, 1, 1])
         )
 
+    def test_FisherJenksNAN(self):
+        fj = FisherJenks(self.VNAN)
+        assert fj.adcm == 799.24000000000001
+        numpy.testing.assert_array_almost_equal(
+            fj.bins, numpy.array([75.29, 192.05, 370.5, 722.85, 4111.45])
+        )
+        numpy.testing.assert_array_almost_equal(
+            fj.counts, numpy.array([49, 3, 4, 1, 1])
+        )
+        numpy.testing.assert_equal(fj.yb[-1], -1)
+
 
 class TestJenksCaspall:
     def setup_method(self):
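For reviewers, a minimal usage sketch of the NaN handling this patch introduces, mirroring the new test_quantile_nan test above. The seed, input values, and the -1 sentinel label for missing observations come from the diff; the print calls and their comments are illustrative only.

    import numpy as np
    from mapclassify import Quantiles

    np.random.seed(4414)
    y = np.random.normal(0, 11, size=11)
    y[10] = np.nan  # one missing observation

    # With this patch, NaN values are masked out before binning and are
    # assigned the sentinel class label -1 in yb.
    q = Quantiles(y, k=3)
    print(q.yb)      # [0, 1, 0, 1, 0, 2, 0, 2, 1, 2, -1], as asserted in test_quantile_nan
    print(q.counts)  # counts are computed over the 10 non-missing values only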