Skip to content

Commit

Permalink
Remove V cache (#6)
Browse files Browse the repository at this point in the history
Disable V cache; compress directions file
  • Loading branch information
crusaderky authored Apr 29, 2019
1 parent 51e6bcb commit d96a9d8
Show file tree
Hide file tree
Showing 12 changed files with 41 additions and 21,265 deletions.
3 changes: 0 additions & 3 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,3 @@ pyscenarios/version.py

doc/_build
_build

# Precompiled SOBOL directions
pyscenarios/resources/*.npy
5 changes: 0 additions & 5 deletions .stickler.yml

This file was deleted.

3 changes: 1 addition & 2 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
include pyscenarios/resources/*.txt
exclude pyscenarios/resources/*.npy
include pyscenarios/*.txt.xz
include LICENSE
recursive-include doc *
prune doc/_build
Expand Down
1 change: 1 addition & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ v0.2.0 (unreleased)
- 'rng' parameter in copula functions is now case insensitive
- Work around regression in IT copula with dask >= 1.1
(`dask#4739 <https://github.com/dask/dask/issues/4739>`_)
- Smaller binary package; simplified setup
- Explicit CI tests for Windows, Python 3.5.0, and Python 3.7
- Mandatory flake8 and mypy in CI
- Changed license to Apache 2.0
Expand Down
10 changes: 5 additions & 5 deletions notebooks/tail_dependence.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,10 @@
"cov = [[1.0, 0.5],\n",
" [0.5, 1.0]]\n",
"\n",
"s3 = norm_cdf(t_copula(cov, 3, 100000, rng='SOBOL'))\n",
"s9 = norm_cdf(t_copula(cov, 9, 100000, rng='SOBOL'))\n",
"s999 = norm_cdf(t_copula(cov, 999, 100000, rng='SOBOL'))\n",
"sg = norm_cdf(gaussian_copula(cov, 100000, rng='SOBOL'))\n",
"s3 = norm_cdf(t_copula(cov, 3, 100000, rng='Sobol'))\n",
"s9 = norm_cdf(t_copula(cov, 9, 100000, rng='Sobol'))\n",
"s999 = norm_cdf(t_copula(cov, 999, 100000, rng='Sobol'))\n",
"sg = norm_cdf(gaussian_copula(cov, 100000, rng='Sobol'))\n",
"\n",
"q = np.arange(.001, 1, .001)\n",
"d3 = tail_dependence(s3[:, 0], s3[:, 1], q)\n",
Expand Down Expand Up @@ -71,7 +71,7 @@
" [0.5, 0.5, 1.0, 0.5],\n",
" [0.5, 0.5, 0.5, 1.0]]\n",
"\n",
"s = norm_cdf(t_copula(cov, [3, 3, 999, 999], 100000, rng='SOBOL'))\n",
"s = norm_cdf(t_copula(cov, [3, 3, 999, 999], 100000, rng='Sobol'))\n",
"\n",
"d33 = tail_dependence(s[:, 0], s[:, 1], q)\n",
"d39 = tail_dependence(s[:, 1], s[:, 2], q)\n",
Expand Down
Empty file removed pyscenarios/kernels/__init__.py
Empty file.
Binary file added pyscenarios/new-joe-kuo-6.21201.txt.xz
Binary file not shown.
Empty file removed pyscenarios/resources/__init__.py
Empty file.
21,201 changes: 0 additions & 21,201 deletions pyscenarios/resources/new-joe-kuo-6.21201.txt

This file was deleted.

62 changes: 25 additions & 37 deletions pyscenarios/sobol.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@
`Stephen Joe and Frances Y. Kuo <http://web.maths.unsw.edu.au/~fkuo/sobol/>`_.
Directions are based on :file:`new-joe-kuo-6.21201` from the URL above.
"""
import lzma
import pkg_resources
from functools import lru_cache
from typing import Tuple, Union, cast
from typing import Iterator, Tuple, Union, cast

import numpy as np
import dask.array as da
Expand All @@ -17,28 +17,30 @@

__all__ = ('sobol', 'max_dimensions')

DIRECTIONS = 'new-joe-kuo-6.21201'
DIRECTIONS = 'new-joe-kuo-6.21201.txt.xz'


def calc_v() -> None:
"""Precalculate V array from the original author's file and then store the
result to disk, in the same directory of this script. This function is
invoked by ``setup.py build_ext``.
"""
import os.path
fdata = pkg_resources.resource_string(
'pyscenarios.resources', DIRECTIONS + '.txt').decode('ascii')
directions = _load_directions(fdata)
v = _calc_v_kernel(directions)
v_cache = None


# This is dirty, but this function is exclusively invoked by setup.py
output_fname = os.path.join(
os.path.dirname(__file__), 'resources', DIRECTIONS + '.npy')
np.save(output_fname, v)
print("Generated Sobol V matrix: %s" % output_fname)
def load_v() -> np.ndarray:
"""Load V from the original author's file. This function is executed
automatically the first time you call the :func:`sobol` function.
When using a dask backend, the V array is only loaded when
actually needed by the kernel; this results in smaller pickle files.
When using dask distributed, V is loaded locally on the workers instead of
being transferred over the network.
"""
global v_cache
if v_cache is None:
with pkg_resources.resource_stream('pyscenarios', DIRECTIONS) as fh:
with lzma.open(fh, 'rt') as zfh:
directions = _load_directions(zfh)
v_cache = _calc_v_kernel(directions)
return v_cache


def _load_directions(fdata: str) -> np.ndarray:
def _load_directions(fh: Iterator[str]) -> np.ndarray:
"""Load input file containing direction numbers.
The file must be one of those available on the website of the
original author, or formatted like one.
Expand All @@ -49,15 +51,15 @@ def _load_directions(fdata: str) -> np.ndarray:
Column 0 contains the a values, while columns 1+ contain the m values.
The m values are padded on the right with zeros.
"""
rows = [row.split() for row in fdata.splitlines()]
rows = [row.split() for row in fh]

# Add padding at end of rows
# Drop first 2 columns
# Replace header with element for d=1
rowlen = max(len(row) for row in rows) - 2
rowlen = len(rows[-1])
for row in rows:
row[:] = row[2:] + ['0'] * (rowlen - len(row) + 2)
rows[0] = ['0'] + ['1'] * (rowlen - 1)
row[:] = row[2:] + ['0'] * (rowlen - len(row))
rows[0] = ['0'] + ['1'] * (rowlen - 3)
return np.array(rows, dtype='uint32')


Expand Down Expand Up @@ -91,20 +93,6 @@ def _calc_v_kernel(directions: np.ndarray) -> np.ndarray:
return v


@lru_cache(None)
def load_v() -> np.ndarray:
"""Load V from the on-disk cache. This function is executed
automatically the first time you call the :func:`sobol` function.
When using a dask backend, the V array is only loaded when
actually needed by the kernel; this results in smaller pickle files.
When using dask distributed, V is loaded locally on the workers instead of
being transferred over the network.
"""
buf = pkg_resources.resource_stream(
'pyscenarios.resources', DIRECTIONS + '.npy')
return np.load(buf)


def _sobol_kernel(samples: int, dimensions: int, s0: int, d0: int
) -> np.ndarray:
"""Numba kernel for :func:`sobol`
Expand Down
14 changes: 7 additions & 7 deletions pyscenarios/tests/test_copula.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def test_gaussian_mersenne_da():
])
def test_gaussian_sobol(chunks, expect_chunks):
actual = gaussian_copula(cov, samples=4, seed=123, chunks=chunks,
rng='SOBOL')
rng='Sobol')
expect = [[ 0. , 0. , 0. ], # noqa
[-0.67448975, -0.31303751, -1.15262386],
[ 0.67448975, 0.31303751, 1.15262386], # noqa
Expand Down Expand Up @@ -81,7 +81,7 @@ def test_student_t_mersenne_da():
])
def test_student_t_sobol(chunks, expect_chunks):
actual = t_copula(cov, df=3, samples=4, seed=123, chunks=chunks,
rng='SOBOL')
rng='Sobol')
expect = [[ 0. , 0. , 0. ], # noqa
[-0.90292647, -0.44513114, -1.38033019],
[ 0.51756147, 0.24504617, 0.84650386], # noqa
Expand Down Expand Up @@ -122,7 +122,7 @@ def test_it_mersenne_da():
])
def test_it_sobol(chunks, expect_chunks):
actual = t_copula(cov, df=[3, 4, 5], samples=4, seed=123,
chunks=chunks, rng='SOBOL')
chunks=chunks, rng='Sobol')
expect = [[ 0. , 0. , 0. ], # noqa
[-0.90292647, -0.41928686, -1.35361744],
[ 0.51756147, 0.25248047, 0.91032037], # noqa
Expand All @@ -138,14 +138,14 @@ def test_it_sobol(chunks, expect_chunks):
'func,kwargs', [
(gaussian_copula, {'rng': 'Mersenne Twister'}),
(gaussian_copula, {'rng': 'Mersenne Twister', 'chunks': (4096, 2)}),
(gaussian_copula, {'rng': 'SOBOL'}),
(gaussian_copula, {'rng': 'Sobol'}),
(t_copula, {'df': 8, 'rng': 'Mersenne Twister'}),
(t_copula, {'df': 8, 'rng': 'Mersenne Twister', 'chunks': (4096, 2)}),
(t_copula, {'df': 8, 'rng': 'SOBOL'}),
(t_copula, {'df': 8, 'rng': 'Sobol'}),
(t_copula, {'df': [8, 9, 10], 'rng': 'Mersenne Twister'}),
(t_copula, {'df': [8, 9, 10], 'rng': 'Mersenne Twister',
'chunks': (4096, 2)}),
(t_copula, {'df': [8, 9, 10], 'rng': 'SOBOL'}),
(t_copula, {'df': [8, 9, 10], 'rng': 'Sobol'}),
])


Expand Down Expand Up @@ -185,7 +185,7 @@ def test_cov_roundtrip(func, kwargs):
(999, [.13, .13, .13]),
([3, 3, 999, 999], [.33, .08, .13])
])
@pytest.mark.parametrize('rng', ['Mersenne Twister', 'SOBOL'])
@pytest.mark.parametrize('rng', ['Mersenne Twister', 'Sobol'])
@pytest.mark.parametrize('chunks', [None, (65536, 1)])
def test_tail_dependence(df, expect_td, rng, chunks):
cov2 = [[1.0, 0.5, 0.5, 0.5],
Expand Down
7 changes: 2 additions & 5 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,9 +114,6 @@ def write_version_py(filename=None):
if write_version:
write_version_py()

import pyscenarios.sobol # noqa: E402
pyscenarios.sobol.calc_v()


setup(name=DISTNAME,
version=FULLVERSION,
Expand All @@ -130,5 +127,5 @@ def write_version_py(filename=None):
tests_require=TESTS_REQUIRE,
python_requires='>=3.5.0',
url=URL,
packages=find_packages(),
package_data={'pyscenarios': ['tests/data/*', 'resources/*.npy']})
package_data={'pyscenarios': ['*.txt.xz']},
packages=find_packages())

0 comments on commit d96a9d8

Please sign in to comment.