diff --git a/.github/workflows/run-test-push.yml b/.github/workflows/run-test-push.yml index c92940c8..317e0e1e 100644 --- a/.github/workflows/run-test-push.yml +++ b/.github/workflows/run-test-push.yml @@ -13,11 +13,11 @@ jobs: runs-on: "ubuntu-latest" strategy: matrix: - python-version: ["3.13"] # latest only + python-version: ["3.14"] # latest only fail-fast: false name: Linux Python ${{ matrix.python-version }} steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v6 with: fetch-depth: 0 - uses: conda-incubator/setup-miniconda@v3 @@ -29,14 +29,9 @@ jobs: use-mamba: true - run: conda --version - run: python -V - - name: Install development version of NCAS-CMS/Pyfive:h5netcdf - run: | - cd .. - git clone https://github.com/NCAS-CMS/pyfive.git - cd pyfive - git checkout h5netcdf - pip install -e . - run: pip install -e . - run: conda list - - run: pytest -n 2 --junitxml=report-1.xml - - uses: codecov/codecov-action@v3 + # Flake8 fails quite heavily; we should move to pre-commit+Ruff here, anyway + # - run: flake8 --exclude tests,doc --max-line-length 120 --ignore F405,F401 + - run: pytest -n 2 -m "not slow" --junitxml=report-1.xml + - uses: codecov/codecov-action@v6 diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml index f0719b1b..232ced5d 100644 --- a/.github/workflows/run-tests.yml +++ b/.github/workflows/run-tests.yml @@ -4,7 +4,6 @@ on: push: branches: - main - - pyfive schedule: - cron: '0 0 * * *' # nightly @@ -18,11 +17,11 @@ jobs: runs-on: "ubuntu-latest" strategy: matrix: - python-version: ["3.10", "3.11", "3.12", "3.13"] + python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"] fail-fast: false name: Linux Python ${{ matrix.python-version }} steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v6 with: fetch-depth: 0 - uses: conda-incubator/setup-miniconda@v3 @@ -34,13 +33,6 @@ jobs: use-mamba: true - run: conda --version - run: python -V - - name: Install development version of NCAS-CMS/Pyfive:h5netcdf 
- run: | - cd .. - git clone https://github.com/NCAS-CMS/pyfive.git - cd pyfive - git checkout h5netcdf - pip install -e . - run: conda list - run: pip install -e . - run: conda list @@ -50,11 +42,11 @@ jobs: runs-on: "macos-latest" strategy: matrix: - python-version: ["3.10", "3.11", "3.12", "3.13"] + python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"] fail-fast: false name: OSX Python ${{ matrix.python-version }} steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v6 with: fetch-depth: 0 - uses: conda-incubator/setup-miniconda@v3 @@ -66,14 +58,7 @@ jobs: use-mamba: true - run: conda --version - run: python -V - - name: Install development version of NCAS-CMS/Pyfive:h5netcdf - run: | - cd .. - git clone https://github.com/NCAS-CMS/pyfive.git - cd pyfive - git checkout h5netcdf - pip install -e . - run: conda list - run: mamba install -c conda-forge git - run: pip install -e . - - run: pytest + - run: pytest -n 2 diff --git a/.gitignore b/.gitignore index 703cf466..3dec0d85 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,8 @@ tests/*.pyc* tests/__pycache__/ .eggs ActiveStorage.egg-info +activestorage.egg-info +PyActiveStorage.egg-info # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/README.md b/README.md index b40355f5..1172af4f 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,18 @@ [![Maintenance](https://img.shields.io/badge/Maintained%3F-yes-green.svg)](https://GitHub.com/Naereen/StrapDown.js/graphs/commit-activity) +[![Documentation Status](https://app.readthedocs.org/projects/pyactivestorage/badge/?version=latest)](https://pyactivestorage.readthedocs.io/en/latest/?badge=latest) [![made-with-python](https://img.shields.io/badge/Made%20with-Python-1f425f.svg)](https://www.python.org/) -[![Test](https://github.com/valeriupredoi/PyActiveStorage/actions/workflows/run-tests.yml/badge.svg)](https://github.com/valeriupredoi/PyActiveStorage/actions/workflows/run-tests.yml) 
-[![codecov](https://codecov.io/gh/valeriupredoi/PyActiveStorage/branch/main/graph/badge.svg?token=1VGKP4L3S3)](https://codecov.io/gh/valeriupredoi/PyActiveStorage) +[![Test](https://github.com/NCAS-CMS/PyActiveStorage/actions/workflows/run-tests.yml/badge.svg)](https://github.com/NCAS-CMS/PyActiveStorage/actions/workflows/run-tests.yml) +[![codecov](https://codecov.io/gh/NCAS-CMS/PyActiveStorage/graph/badge.svg?token=1olGjnvAOp)](https://codecov.io/gh/NCAS-CMS/PyActiveStorage) +[![Anaconda-Server Badge](https://anaconda.org/conda-forge/pyactivestorage/badges/version.svg)](https://anaconda.org/conda-forge/pyactivestorage) -![pyactivestoragelogo](https://github.com/valeriupredoi/PyActiveStorage/blob/main/doc/figures/PyActiveStorage-logo-complete.jpg) +![pyactivestoragelogo](https://raw.githubusercontent.com/NCAS-CMS/PyActiveStorage/main/doc/figures/PyActiveStorage-logo-complete.jpg) -## Active Storage Prototype +## PyActiveStorage + +- [Latest documentation on ReadTheDocs (RTD)](https://pyactivestorage.readthedocs.io/en/latest/) +- [RTD latest builds](https://app.readthedocs.org/projects/pyactivestorage/) +- [GHA Tests](https://github.com/NCAS-CMS/PyActiveStorage/actions) +- [conda-forge feedstock](https://github.com/conda-forge/pyactivestorage-feedstock) ### Create virtual environment @@ -29,31 +36,40 @@ pip install -e . pytest -n 2 ``` -Python versions supported: (3.9 EOL but no more testing with it), 3.10, 3.11, 3.12, 3.13. Fully compatible with `numpy >=2.0.0`. +### Main dependencies + +- Python versions supported: 3.10, 3.11, 3.12, 3.13, 3.14. Fully compatible with `numpy >=2.0.0`. +- [Pyfive](https://anaconda.org/conda-forge/pyfive) needs to be pinned `>=0.5.0` (first fully upgraded Pyfive version). ## Active Storage Data Interface This package provides -1. the class `Active`, which is a shimmy to NetCDF4 (and HDF5) storage via kerchunk metadata and the zarr indexer. It does not however, use zarr for the actual read. +1. 
the class `Active`, which is a shimmy to NetCDF4 (and HDF5) via a [`Pyfive.File`](https://github.com/NCAS-CMS/pyfive) file object 2. The actual reads are done in the methods of `storage.py` or `reductionist.py`, which are called from within an `Active.__getitem__`. -Example usage is in the file `tests/test_harness.py`, but it's basically this simple: +Example usage is in the test files, depending on the case: + +- [`tests/test_harness.py`](https://github.com/NCAS-CMS/PyActiveStorage/blob/main/tests/test_harness.py) +- [`test_real_s3.py`](https://github.com/NCAS-CMS/PyActiveStorage/blob/main/tests/test_real_s3.py) +- [`test_real_https.py`](https://github.com/NCAS-CMS/PyActiveStorage/blob/main/tests/test_real_https.py) + +but it's basically this simple: ```python -active = Active(self.testfile, "data") -active.method = "mean" -result = active[0:2, 4:6, 7:9] +active = Active(file.Path | Pyfive.Dataset, ncvar="some_var") +active._version = 2 +result = active.mean[0:2, 4:6, 7:9] ``` -where `result` will be the mean of the appropriate slice of the hyperslab in `var`. +where `result` will be the mean of the appropriate slice of the hyperslab in `some_var` variable data. There are some (relatively obsolete) documents from our exploration of zarr internals in the docs4understanding, but they are not germane to the usage of the Active class. ## Storage types PyActiveStorage is designed to interact with various storage backends. -The storage backend is specified using the `storage_type` argument to `Active` constructor. +The storage backend is automatically detected, but can still be specified using the `interface_type` argument to the `Active` constructor. There are two main integration points for a storage backend: #. Load netCDF metadata @@ -62,26 +78,30 @@ There are two main integration points for a storage backend: ### Local file The default storage backend is a local file. -To use a local file, use a `storage_type` of `None`, which is its default value. 
+To use a local file, use an `interface_type` of `None`, which is its default value. netCDF metadata is loaded using the [netCDF4](https://pypi.org/project/netCDF4/) library. The chunk reductions are implemented in `activestorage.storage` using NumPy. ### S3-compatible object store -We now have support for Active runs with netCDF4 files on S3, from [PR 89](https://github.com/valeriupredoi/PyActiveStorage/pull/89). +We now have support for Active runs with netCDF4 files on S3, from [PR 89](https://github.com/NCAS-CMS/PyActiveStorage/pull/89). To achieve this we integrate with [Reductionist](https://github.com/stackhpc/reductionist-rs), an S3 Active Storage Server. Reductionist is typically deployed "near" to an S3-compatible object store and provides an API to perform numerical reductions on object data. -To use Reductionist, use a `storage_type` of `s3`. +To use Reductionist, use an `interface_type` of `s3`. To load metadata, netCDF files are opened using `s3fs`, with `h5netcdf` used to put the open file (which is nothing more than a memory view of the netCDF file) into an hdf5/netCDF-like object format. Chunk reductions are implemented in `activestorage.reductionist`, with each operation resulting in an API request to the Reductionist server. From there on, `Active` works as per normal. +### HTTPS-compatible on an NGINX server + +The same infrastructure as for S3, but the file is passed in as an `https` URI. + ## Testing overview -We have written unit and integration tests, and employ a coverage measurement tool - Codecov, see PyActiveStorage [test coverage](https://app.codecov.io/gh/valeriupredoi/PyActiveStorage) with current coverage of 87%; our Continuous Integration (CI) testing is deployed on [Github Actions](https://github.com/valeriupredoi/PyActiveStorage/actions), and we have nightly tests that run the entire testing suite, to be able to detect any issues introduced by updated versions of our dependencies. 
Github Actions (GA) tests also test the integration of various storage types we currently support; as such, we have dedicated tests that test Active Storage with S3 storage (by creating and running a MinIO client from within the test, and deploying and testing PyActiveStorage with data shipped to the S3 client). +We have written unit and integration tests, and employ a coverage measurement tool - Codecov, see PyActiveStorage [test coverage](https://app.codecov.io/gh/NCAS-CMS/PyActiveStorage) with current coverage of 87%; our Continuous Integration (CI) testing is deployed on [Github Actions](https://github.com/NCAS-CMS/PyActiveStorage/actions), and we have nightly tests that run the entire testing suite, to be able to detect any issues introduced by updated versions of our dependencies. Github Actions (GA) tests also test the integration of various storage types we currently support; as such, we have dedicated tests that test Active Storage with S3 storage (by creating and running a MinIO client from within the test, and deploying and testing PyActiveStorage with data shipped to the S3 client). -Of particular interest are performance tests, and we have started using tests that measure system run time and resident memory (RES); we use ``pytest-monitor`` for this purpose, inside the GA CI testing environemnt. So far, performance testing showed us that HDF5 chunking is paramount for performance `ie` a large number of small HDF5 chunks leads to very long system run times, and high memory consumption; however, larger HDF5 chunks significantly increase performance – as an example, running PyActiveStorage on an uncompressed netCDF4 file of size 1GB on disk (500x500x500 data elements, float64 each), with optimal HDF5 chunking (eg 75 data elements per chunk, on each dimesnional axis) takes order 0.1s for a local POSIX storage and 0.3s for the case when the file is on an S3 server; the same run needs only order approx. 
100MB of RES memory for each of the two storage options see [test result](https://github.com/valeriupredoi/PyActiveStorage/actions/runs/6313871715/job/17142905423?pr=146); the same types of runs with much smaller HDF5 chunks (eg 20x smaller) will need order a factor of 300 more time to complete, and order a few GB of RES memory. +Of particular interest are performance tests, and we have started using tests that measure system run time and resident memory (RES); we use ``pytest-monitor`` for this purpose, inside the GA CI testing environment. So far, performance testing showed us that HDF5 chunking is paramount for performance `ie` a large number of small HDF5 chunks leads to very long system run times, and high memory consumption; however, larger HDF5 chunks significantly increase performance – as an example, running PyActiveStorage on an uncompressed netCDF4 file of size 1GB on disk (500x500x500 data elements, float64 each), with optimal HDF5 chunking (eg 75 data elements per chunk, on each dimensional axis) takes order 0.1s for a local POSIX storage and 0.3s for the case when the file is on an S3 server; the same run needs only order approx. 100MB of RES memory for each of the two storage options see [test result](https://github.com/NCAS-CMS/PyActiveStorage/actions/runs/6313871715/job/17142905423?pr=146); the same types of runs with much smaller HDF5 chunks (eg 20x smaller) will need order a factor of 300 more time to complete, and order a few GB of RES memory. ## Testing HDF5 chunking @@ -152,11 +172,14 @@ Kerchunking needs ~200MB same as Active in total - kerchunking is memory-dominan ## Documentation -See available Sphinx [documentation](https://htmlpreview.github.io/?https://github.com/valeriupredoi/PyActiveStorage/blob/main/doc/build/index.html). To build locally the documentation run: +See available Sphinx [documentation](https://pyactivestorage.readthedocs.io/en/latest/). 
To build locally the documentation run: ``` sphinx-build -Ea doc doc/build ``` + +Docs are webhooked to build on Pull Requests, and pushes. + ## Code coverage (test coverage) -We monitor test coverage via the [Codecov app](https://app.codecov.io/gh/valeriupredoi/PyActiveStorage) and employ a bot that displays coverage changes introduced in every PR; the bot posts a comment directly to the PR, in which coverage variations introduced by the proposed code changes are displayed. +We monitor test coverage via the [Codecov app](https://app.codecov.io/gh/NCAS-CMS/PyActiveStorage) and employ a bot that displays coverage changes introduced in every PR; the bot posts a comment directly to the PR, in which coverage variations introduced by the proposed code changes are displayed. diff --git a/activestorage/active.py b/activestorage/active.py index 761cf3b6..73c734f7 100644 --- a/activestorage/active.py +++ b/activestorage/active.py @@ -1,568 +1,20 @@ -import concurrent.futures -import os -import numpy as np -import pathlib -import urllib -import pyfive -import time -from pyfive.h5d import StoreInfo - -import s3fs - -from activestorage.config import * from activestorage import reductionist -from activestorage.storage import reduce_chunk, reduce_opens3_chunk -from activestorage.hdf2numcodec import decode_filters - - -def load_from_s3(uri, storage_options=None): - """ - Load a netCDF4-like object from S3. - - First, set up an S3 filesystem with s3fs.S3FileSystem. - Then open the uri with this FS -> s3file - s3file is a File-like object: a memory view but wih all the metadata - gubbins inside it (no data!) 
- calling >> ds = netCDF4.Dataset(s3file) << - will throw a FileNotFoundError because the netCDF4 library is always looking for - a local file, resulting in [Errno 2] No such file or directory: - '' - instead, we use h5netcdf: https://github.com/h5netcdf/h5netcdf - a Python binder straight to HDF5-netCDF4 interface, that doesn't need a "local" file - - storage_options: kwarg dict containing S3 credentials passed straight to Active - """ - if storage_options is None: # use pre-configured S3 credentials - fs = s3fs.S3FileSystem(key=S3_ACCESS_KEY, # eg "minioadmin" for Minio - secret=S3_SECRET_KEY, # eg "minioadmin" for Minio - client_kwargs={'endpoint_url': S3_URL}) # eg "http://localhost:9000" for Minio - else: - fs = s3fs.S3FileSystem(**storage_options) # use passed-in dictionary - - t1=time.time() - s3file = fs.open(uri, 'rb') - t2=time.time() - ds = pyfive.File(s3file) - t3=time.time() - print(f"Dataset loaded from S3 with s3fs and Pyfive: {uri} ({t2-t1:.2},{t3-t2:.2})") - return ds - -def _metricise(method): - """ Decorator for class methods loads into metric_data""" - def timed(self, *args, **kw): - ts = time.time() - metric_name='' - if '__metric_name' in kw: - metric_name = kw['__metric_name'] - del kw['__metric_name'] - result = method(self,*args, **kw) - te = time.time() - if metric_name: - self.metric_data[metric_name] = te-ts - return result - return timed - - -def get_missing_attributes(ds): - """" - Load all the missing attributes we need from a netcdf file - """ - - def hfix(x): - ''' - return item if single element list/array - see https://github.com/h5netcdf/h5netcdf/issues/116 - ''' - if x is None: - return x - if not np.isscalar(x) and len(x) == 1: - return x[0] - return x - - _FillValue = hfix(ds.attrs.get('_FillValue')) - missing_value = ds.attrs.get('missing_value') - valid_min = hfix(ds.attrs.get('valid_min')) - valid_max = hfix(ds.attrs.get('valid_max')) - valid_range = hfix(ds.attrs.get('valid_range')) - if valid_max is not None or valid_min 
is not None: - if valid_range is not None: - raise ValueError( - "Invalid combination in the file of valid_min, " - "valid_max, valid_range: " - f"{valid_min}, {valid_max}, {valid_range}" - ) - elif valid_range is not None: - valid_min, valid_max = valid_range - - return _FillValue, missing_value, valid_min, valid_max - -class Active: - """ - Instantiates an interface to active storage which contains either zarr files - or HDF5 (NetCDF4) files. - - This is Verson 1 which simply provides support for standard read operations, but done via - explicit reads within this class rather than within the underlying format libraries. - - Version 2 will add methods for actual active storage. - - """ - def __new__(cls, *args, **kwargs): - """Store reduction methods.""" - instance = super().__new__(cls) - instance._methods = { - "min": np.min, - "max": np.max, - "sum": np.sum, - # For the unweighted mean we calulate the sum and divide - # by the number of non-missing elements - "mean": np.sum, - } - return instance - - def __init__( - self, - uri, - ncvar, - storage_type=None, - max_threads=100, - storage_options=None, - active_storage_url=None - ): - """ - Instantiate with a NetCDF4 dataset URI and the variable of interest within that file. - (We need the variable, because we need variable specific metadata from within that - file, however, if that information is available at instantiation, it can be provided - using keywords and avoid a metadata read.) - - :param storage_options: s3fs.S3FileSystem options - :param active_storage_url: Reductionist server URL - """ - # Assume NetCDF4 for now - self.uri = uri - if self.uri is None: - raise ValueError(f"Must use a valid file for uri. 
Got {uri}") - - # still allow for a passable storage_type - # for special cases eg "special-POSIX" ie DDN - if not storage_type and storage_options is not None: - storage_type = urllib.parse.urlparse(uri).scheme - self.storage_type = storage_type - - # get storage_options - self.storage_options = storage_options - self.active_storage_url = active_storage_url - - # basic check on file - if not os.path.isfile(self.uri) and not self.storage_type: - raise ValueError(f"Must use existing file for uri. {self.uri} not found") - - self.ncvar = ncvar - if self.ncvar is None: - raise ValueError("Must set a netCDF variable name to slice") - - self._version = 1 - self._components = False - self._method = None - self._max_threads = max_threads - self.missing = None - self.ds = None - self.metric_data = {} - self.data_read = 0 - - @_metricise - def __load_nc_file(self): - """ Get the netcdf file and it's b-tree""" - ncvar = self.ncvar - # in all cases we need an open netcdf file to get at attributes - # we keep it open because we need it's b-tree - if self.storage_type is None: - nc = pyfive.File(self.uri) - elif self.storage_type == "s3": - nc = load_from_s3(self.uri, self.storage_options) - self.filename = self.uri - - self.ds = nc[ncvar] - - def __get_missing_attributes(self): - if self.ds is None: - self.__load_nc_file() - return get_missing_attributes(self.ds) - - def __getitem__(self, index): - """ - Provides support for a standard get item. - #FIXME-BNL: Why is the argument index? - """ - self.metric_data = {} - if self.ds is None: - self.__load_nc_file(__metric_name='load nc time') - #self.__metricise('Load','__load_nc_file') - - self.missing = self.__get_missing_attributes() - - self.data_read = 0 - - if self.method is None and self._version == 0: - - # No active operation - return self._get_vanilla(index, __metric_name='vanilla_time') - - elif self._version == 1: - - #FIXME: is the difference between version 1 and 2 still honoured? 
- return self._get_selection(index, __metric_name='selection 1 time (s)') - - elif self._version == 2: - - return self._get_selection(index, __metric_name='selection 2 time (s)') - - else: - raise ValueError(f'Version {self._version} not supported') - - @_metricise - def _get_vanilla(self, index): - """ - Get the data without any active operation - """ - data = self.ds[index] - data = self._mask_data(data) - return data - - @property - def components(self): - """Return or set the components flag. - - If True and `method` is not `None` then return the processed - result in a dictionary that includes a processed value and the - sample size, from which the final result can be calculated. - - """ - return self._components - - @components.setter - def components(self, value): - self._components = bool(value) - - @property - def method(self): - """Return or set the active method. - - The active method to apply when retrieving a subspace of the - data. By default the data is returned unprocessed. Valid - methods are: - - ========== ================================================== - *method* Description - ========== ================================================== - ``'min'`` The minumum - - ``'max'`` The maximum - - ``'mean'`` The unweighted mean - - ``'sum'`` The unweighted sum - ========== ================================================== - - """ - return self._methods.get(self._method) - - @method.setter - def method(self, value): - if value is not None and value not in self._methods: - raise ValueError(f"Bad 'method': {value}. Choose from min/max/mean/sum.") - - self._method = value - - @property - def ncvar(self): - """Return or set the netCDF variable name.""" - return self._ncvar - - @ncvar.setter - def ncvar(self, value): - self._ncvar = value - - - def _get_active(self, method, *args): - """ - *args defines a slice of data. 
This method loops over each of the chunks - necessary to extract the parts of the slice, and asks the active storage - to apply the method to each part. It then applies the method to - the partial results and returns a value is if method had been applied to - an array returned via getitem. - """ - raise NotImplementedError - - @_metricise - def _get_selection(self, *args): - """ - At this point we have a Dataset object, but all the important information about - how to use it is in the attribute DataoobjectDataset class. Here we gather - metadata from the dataset instance and then continue with the dataobjects instance. - """ - - # stick this here for later, to discuss with David - keepdims = True - - name = self.ds.name - dtype = np.dtype(self.ds.dtype) - # hopefully fix pyfive to get a dtype directly - array = pyfive.indexing.ZarrArrayStub(self.ds.shape, self.ds.chunks) - ds = self.ds.id - - self.metric_data['args'] = args - self.metric_data['dataset shape'] = self.ds.shape - self.metric_data['dataset chunks'] = self.ds.chunks - if ds.filter_pipeline is None: - compressor, filters = None, None - else: - compressor, filters = decode_filters(ds.filter_pipeline , dtype.itemsize, name) - - indexer = pyfive.indexing.OrthogonalIndexer(*args, array) - out_shape = indexer.shape - #stripped_indexer = [(a, b, c) for a,b,c in indexer] - drop_axes = indexer.drop_axes and keepdims - - # we use array._chunks rather than ds.chunks, as the latter is none in the case of - # unchunked data, and we need to tell the storage the array dimensions in this case. - return self._from_storage(ds, indexer, array._chunks, out_shape, dtype, compressor, filters, drop_axes) - - def _from_storage(self, ds, indexer, chunks, out_shape, out_dtype, compressor, filters, drop_axes): - method = self.method - - if method is not None: - out = [] - counts = [] - else: - out = np.empty(out_shape, dtype=out_dtype, order=ds._order) - counts = None # should never get touched with no method! 
- - # Create a shared session object. - if self.storage_type == "s3" and self._version==2: - if self.storage_options is not None: - key, secret = None, None - if "key" in self.storage_options: - key = self.storage_options["key"] - if "secret" in self.storage_options: - secret = self.storage_options["secret"] - if key and secret: - session = reductionist.get_session(key, secret, - S3_ACTIVE_STORAGE_CACERT) - else: - session = reductionist.get_session(S3_ACCESS_KEY, S3_SECRET_KEY, - S3_ACTIVE_STORAGE_CACERT) - else: - session = reductionist.get_session(S3_ACCESS_KEY, S3_SECRET_KEY, - S3_ACTIVE_STORAGE_CACERT) - else: - session = None - - # Process storage chunks using a thread pool. - # Because we do this, we need to read the dataset b-tree now, not as we go, so - # it is already in cache. If we remove the thread pool from here, we probably - # wouldn't need to do it before the first one. - - if ds.chunks is not None: - t1 = time.time() - # ds._get_chunk_addresses() - t2 = time.time() - t1 - self.metric_data['indexing time (s)'] = t2 - # self.metric_data['chunk number'] = len(ds._zchunk_index) - chunk_count = 0 - t1 = time.time() - with concurrent.futures.ThreadPoolExecutor(max_workers=self._max_threads) as executor: - futures = [] - # Submit chunks for processing. - for chunk_coords, chunk_selection, out_selection in indexer: - future = executor.submit( - self._process_chunk, - session, ds, chunks, chunk_coords, chunk_selection, - counts, out_selection, compressor, filters, drop_axes=drop_axes) - futures.append(future) - # Wait for completion. 
- for future in concurrent.futures.as_completed(futures): - try: - result = future.result() - except Exception as exc: - raise - else: - chunk_count +=1 - if method is not None: - result, count = result - out.append(result) - counts.append(count) - else: - # store selected data in output - result, selection = result - out[selection] = result - - if method is not None: - # Apply the method (again) to aggregate the result - out = method(out) - shape1 = (1,) * len(out_shape) - - if self._components: - # Return a dictionary of components containing the - # reduced data and the sample size ('n'). (Rationale: - # cf-python needs the sample size for all reductions; - # see the 'mtol' parameter of cf.Field.collapse.) - # - # Note that in all components must always have the - # same number of dimensions as the original array, - # i.e. 'drop_axes' is always considered False, - # regardless of its setting. (Rationale: dask - # reductions require the per-dask-chunk partial - # reductions to retain these dimensions so that - # partial results can be concatenated correctly.) - out = out.reshape(shape1) - - n = np.sum(counts).reshape(shape1) - if self._method == "mean": - # For the average, the returned component is - # "sum", not "mean" - out = {"sum": out, "n": n} - else: - out = {self._method: out, "n": n} - else: - # Return the reduced data as a numpy array. For most - # methods the data is already in this form. - if self._method == "mean": - # For the average, it is actually the sum that has - # been created, so we need to divide by the sample - # size. 
- out = out / np.sum(counts).reshape(shape1) - - t2 = time.time() - self.metric_data['reduction time (s)'] = t2-t1 - self.metric_data['chunks processed'] = chunk_count - self.metric_data['storage read (B)'] = self.data_read - return out - - def _get_endpoint_url(self): - """Return the endpoint_url of an S3 object store, or `None`""" - endpoint_url = self.storage_options.get('endpoint_url') - if endpoint_url is not None: - return endpoint_url - - client_kwargs = self.storage_options.get('client_kwargs') - if client_kwargs: - endpoint_url = client_kwargs.get('endpoint_url') - if endpoint_url is not None: - return endpoint_url - - return f"http://{urllib.parse.urlparse(self.filename).netloc}" - - def _process_chunk(self, session, ds, chunks, chunk_coords, chunk_selection, counts, - out_selection, compressor, filters, drop_axes=None): - """ - Obtain part or whole of a chunk. - - This is done by taking binary data from storage and filling - the output array. - - Note the need to use counts for some methods - #FIXME: Do, we, it's not actually used? 
- - """ - - # retrieve coordinates from chunk index - storeinfo = ds.get_chunk_info_from_chunk_coord(chunk_coords) - offset, size = storeinfo.byte_offset, storeinfo.size - self.data_read += size - - if self.storage_type == 's3' and self._version == 1: - - tmp, count = reduce_opens3_chunk(ds._fh, offset, size, compressor, filters, - self.missing, ds.dtype, - chunks, ds._order, - chunk_selection, method=self.method - ) - - elif self.storage_type == "s3" and self._version==2: - # S3: pass in pre-configured storage options (credentials) - # print("S3 rfile is:", self.filename) - parsed_url = urllib.parse.urlparse(self.filename) - bucket = parsed_url.netloc - object = parsed_url.path - - # for certain S3 servers rfile needs to contain the bucket eg "bucket/filename" - # as a result the parser above finds empty string bucket - if bucket == "": - bucket = os.path.dirname(object) - object = os.path.basename(object) - # print("S3 bucket:", bucket) - # print("S3 file:", object) - if self.storage_options is None: - # for the moment we need to force ds.dtype to be a numpy type - tmp, count = reductionist.reduce_chunk(session, - S3_ACTIVE_STORAGE_URL, - S3_URL, - bucket, object, offset, - size, compressor, filters, - self.missing, np.dtype(ds.dtype), - chunks, - ds._order, - chunk_selection, - operation=self._method) - else: - # special case for "anon=True" buckets that work only with e.g. 
- # fs = s3fs.S3FileSystem(anon=True, client_kwargs={'endpoint_url': S3_URL}) - # where file uri = bucketX/fileY.mc - print("S3 Storage options to Reductionist:", self.storage_options) - if self.storage_options.get("anon", None) == True: - bucket = os.path.dirname(parsed_url.path) # bucketX - object = os.path.basename(parsed_url.path) # fileY - print("S3 anon=True Bucket and File:", bucket, object) - tmp, count = reductionist.reduce_chunk(session, - self.active_storage_url, - self._get_endpoint_url(), - bucket, object, offset, - size, compressor, filters, - self.missing, np.dtype(ds.dtype), - chunks, - ds._order, - chunk_selection, - operation=self._method) - elif self.storage_type=='ActivePosix' and self.version==2: - # This is where the DDN Fuse and Infinia wrappers go - raise NotImplementedError - else: - # note there is an ongoing discussion about this interface, and what it returns - # see https://github.com/valeriupredoi/PyActiveStorage/issues/33 - # so neither the returned data or the interface should be considered stable - # although we will version changes. 
- tmp, count = reduce_chunk(self.filename, offset, size, compressor, filters, - self.missing, ds.dtype, - chunks, ds._order, - chunk_selection, method=self.method) - - if self.method is not None: - return tmp, count - else: - if drop_axes: - tmp = np.squeeze(tmp, axis=drop_axes) - return tmp, out_selection - - def _mask_data(self, data): - """ - Missing values obtained at initial getitem, and are used here to - mask data, if necessary - """ - if self.missing is None: - self.missing = self.__get_missing_attributes() - _FillValue, missing_value, valid_min, valid_max = self.missing - - if _FillValue is not None: - data = np.ma.masked_equal(data, _FillValue) - - if missing_value is not None: - data = np.ma.masked_equal(data, missing_value) - - if valid_max is not None: - data = np.ma.masked_greater(data, valid_max) - - if valid_min is not None: - data = np.ma.masked_less(data, valid_min) - - return data +from activestorage.core import Active +from activestorage.helpers import ( + get_endpoint_url, + get_missing_attributes, + load_from_https, + load_from_s3, + return_interface_type, +) + + +__all__ = [ + "Active", + "load_from_https", + "load_from_s3", + "get_missing_attributes", + "get_endpoint_url", + "return_interface_type", + "reductionist", +] diff --git a/activestorage/backends.py b/activestorage/backends.py new file mode 100644 index 00000000..c0f30f09 --- /dev/null +++ b/activestorage/backends.py @@ -0,0 +1,244 @@ +from __future__ import annotations + +import os +import urllib + +import numpy as np + +from activestorage import reductionist +from activestorage.config import ( + S3_ACCESS_KEY, + S3_ACTIVE_STORAGE_CACERT, + S3_ACTIVE_STORAGE_URL, + S3_SECRET_KEY, + S3_URL, +) +from activestorage.core import ChunkRequest, ChunkResult, SelectionRequest, SelectionResult, StorageBackend +from activestorage.helpers import get_endpoint_url +from activestorage.storage import reduce_chunk, reduce_opens3_chunk +from activestorage.strategies import ChunkedLocalStrategy, 
ChunkedRemoteStrategy, WholeArrayStrategy + + +class CacheAwareBackend(StorageBackend): + def is_range_cached(self, fh, offset, size) -> bool: + return False + + +class ReductionistBackend(CacheAwareBackend): + def __init__(self, active): + super().__init__(active) + self.execution_strategy = ChunkedRemoteStrategy() + + def build_url(self, filename, storage_options): + raise NotImplementedError + + def reduce_selection(self, request: SelectionRequest) -> SelectionResult: + raise NotImplementedError("Whole-array reduction not implemented for ReductionistBackend") + + +class LocalBackend(StorageBackend): + def __init__(self, active): + super().__init__(active) + self.execution_strategy = ChunkedLocalStrategy() + + def reduce_chunk(self, request: ChunkRequest) -> ChunkResult: + simulate_cbor = bool((self._active.storage_options or {}).get("local_simulate_cbor")) + method = self._active._methods.get(request.method) if request.method else None + missing = ( + request.missing.fill_value, + request.missing.missing_value, + request.missing.valid_min, + request.missing.valid_max, + ) + + # Remote datasets loaded through fsspec/pyfive need fh-based reads, + # not open(uri, "rb") on the URL string. 
+ parsed = urllib.parse.urlparse(str(request.uri)) + if parsed.scheme in ("http", "https", "s3"): + fh = self._active._format.file_handle + data, count = reduce_opens3_chunk( + fh, + request.offset, + request.size, + request.compressor, + request.filters, + missing, + request.dtype, + request.chunks, + request.order, + request.chunk_selection, + method=method, + axis=request.axis, + ) + else: + data, count = reduce_chunk( + request.uri, + request.offset, + request.size, + request.compressor, + request.filters, + missing, + request.dtype, + request.chunks, + request.order, + request.chunk_selection, + method=method, + axis=request.axis, + ) + if simulate_cbor: + payload = reductionist.encode_result(data, count) + data, count = reductionist.decode_result_buffer(payload) + return ChunkResult(data=data, count=count, out_selection=()) + + +class S3Backend(ReductionistBackend): + def get_session(self): + opts = self._active.storage_options or {} + key = opts.get("key", S3_ACCESS_KEY) + secret = opts.get("secret", S3_SECRET_KEY) + return reductionist.get_session(key, secret, S3_ACTIVE_STORAGE_CACERT) + + def _resolve_bucket_object(self, filename, storage_options): + parsed = urllib.parse.urlparse(filename) + bucket = parsed.netloc + obj = parsed.path + + if bucket == "": + bucket = os.path.dirname(obj) + obj = os.path.basename(obj) + + if storage_options is not None and storage_options.get("anon", None) is True: + bucket = os.path.dirname(parsed.path) + obj = os.path.basename(parsed.path) + + return bucket, obj + + def reduce_chunk(self, request: ChunkRequest) -> ChunkResult: + if self._active._version == 1: + fh = self._active._format.file_handle + method = self._active._methods.get(request.method) if request.method else None + data, count = reduce_opens3_chunk( + fh, + request.offset, + request.size, + request.compressor, + request.filters, + ( + request.missing.fill_value, + request.missing.missing_value, + request.missing.valid_min, + request.missing.valid_max, + ), + 
request.dtype, + request.chunks, + request.order, + request.chunk_selection, + method=method, + axis=request.axis, + ) + return ChunkResult(data=data, count=count, out_selection=()) + + bucket, obj = self._resolve_bucket_object(request.uri, self._active.storage_options) + if self._active.storage_options is None: + endpoint = S3_URL + server = S3_ACTIVE_STORAGE_URL + else: + endpoint = get_endpoint_url(self._active.storage_options, request.uri) + server = self._active.active_storage_url or S3_ACTIVE_STORAGE_URL + + endpoint = str(endpoint).rstrip("/") + bucket = str(bucket).strip("/") + obj = str(obj).lstrip("/") + if bucket: + source = f"{endpoint}/{bucket}/{obj}" + else: + source = f"{endpoint}/{obj}" + + session = self.get_session() + data, count = reductionist.reduce_chunk( + session, + server, + source, + request.offset, + request.size, + request.compressor, + request.filters, + ( + request.missing.fill_value, + request.missing.missing_value, + request.missing.valid_min, + request.missing.valid_max, + ), + np.dtype(request.dtype), + request.chunks, + request.order, + request.chunk_selection, + axis=request.axis, + operation=request.method, + interface_type='s3', + option_disable_chunk_cache=self._active._option_disable_chunk_cache, + ) + self.close_session(session) + return ChunkResult(data=data, count=count, out_selection=()) + + +class HttpsBackend(ReductionistBackend): + def get_session(self): + opts = self._active.storage_options or {} + username = opts.get("username") + password = opts.get("password") + return reductionist.get_session(username, password, None) + + def build_url(self, filename, storage_options): + return filename + + def reduce_chunk(self, request: ChunkRequest) -> ChunkResult: + session = self.get_session() + data, count = reductionist.reduce_chunk( + session, + self._active.active_storage_url, + request.uri, + request.offset, + request.size, + request.compressor, + request.filters, + ( + request.missing.fill_value, + 
request.missing.missing_value, + request.missing.valid_min, + request.missing.valid_max, + ), + np.dtype(request.dtype), + request.chunks, + request.order, + request.chunk_selection, + axis=request.axis, + operation=request.method, + interface_type='https', + option_disable_chunk_cache=self._active._option_disable_chunk_cache, + ) + self.close_session(session) + return ChunkResult(data=data, count=count, out_selection=()) + + +class P5RemBackend(StorageBackend): + def __init__(self, active): + super().__init__(active) + self.execution_strategy = WholeArrayStrategy() + + def get_session(self): + return None + + def reduce_chunk(self, request: ChunkRequest) -> ChunkResult: + raise NotImplementedError("Chunked p5rem path is not implemented") + + def reduce_selection(self, request: SelectionRequest) -> SelectionResult: + data = self._active.ds[request.selection] + method = self._active._methods.get(request.method) if request.method else None + if method is None: + return SelectionResult(data=data, n=None) + reduced = method(data, axis=request.axis, keepdims=True) + n = np.size(data) + if request.method == "mean": + return SelectionResult(data=reduced, n=n) + return SelectionResult(data=reduced, n=n) diff --git a/activestorage/core.py b/activestorage/core.py new file mode 100644 index 00000000..1f6c4807 --- /dev/null +++ b/activestorage/core.py @@ -0,0 +1,428 @@ +from __future__ import annotations + +import os +from abc import ABC, abstractmethod +from dataclasses import dataclass +from typing import Any, Optional + +import numpy as np + +from activestorage.helpers import get_endpoint_url, get_missing_attributes, return_interface_type + + +@dataclass +class MissingAttributes: + fill_value: Any = None + missing_value: Any = None + valid_min: Any = None + valid_max: Any = None + + +@dataclass +class ChunkMetadata: + dtype: Any + shape: tuple + chunks: tuple + compressor: Any + filters: list + order: str + filename: str + + +@dataclass +class ChunkRequest: + uri: str + 
offset: int + size: int + dtype: Any + chunks: tuple + order: str + compressor: Any + filters: list + chunk_selection: tuple + missing: MissingAttributes + method: Optional[str] + axis: Optional[tuple] + + +@dataclass +class SelectionRequest: + uri: str + variable: str + selection: tuple + method: Optional[str] + axis: Optional[tuple] + missing: MissingAttributes + compressor: Any + filters: list + + +@dataclass +class ChunkResult: + data: Any + count: Optional[int] + out_selection: tuple + + +@dataclass +class SelectionResult: + data: Any + n: Optional[int] + + +class StorageFormat(ABC): + @abstractmethod + def open(self, uri, storage_options): + raise NotImplementedError + + @abstractmethod + def get_variable(self, ncvar): + raise NotImplementedError + + @abstractmethod + def get_missing_attributes(self) -> MissingAttributes: + raise NotImplementedError + + @abstractmethod + def get_indexer(self, selection): + raise NotImplementedError + + @abstractmethod + def get_chunk_metadata(self) -> ChunkMetadata: + raise NotImplementedError + + @abstractmethod + def close(self): + raise NotImplementedError + + +class StorageBackend(ABC): + supported_methods = {"min", "max", "sum", "mean"} + + def __init__(self, active): + self._active = active + self.execution_strategy = None + + def get_session(self): + return None + + def close_session(self, session): + return None + + @abstractmethod + def reduce_chunk(self, request: ChunkRequest) -> ChunkResult: + raise NotImplementedError + + def reduce_selection(self, request: SelectionRequest) -> SelectionResult: + raise NotImplementedError("Whole-array reduction not implemented for this backend") + + +class ExecutionStrategy(ABC): + @abstractmethod + def execute( + self, + backend, + session, + chunk_metadata, + indexer, + missing, + method, + need_counts, + axis, + ): + raise NotImplementedError + + +def _select_backend(interface_type, version): + from activestorage.backends import HttpsBackend, LocalBackend, P5RemBackend, 
S3Backend + + backends = { + (None, 0): LocalBackend, + (None, 1): LocalBackend, + (None, 2): LocalBackend, + ("s3", 0): S3Backend, + ("s3", 1): S3Backend, + ("s3", 2): S3Backend, + ("https", 0): HttpsBackend, + ("https", 1): LocalBackend, + ("https", 2): HttpsBackend, + ("p5rem", 0): P5RemBackend, + ("p5rem", 1): P5RemBackend, + ("p5rem", 2): P5RemBackend, + } + backend = backends.get((interface_type, version)) + if backend is None: + raise ValueError( + f"No backend registered for interface_type={interface_type!r}, " + f"version={version}. Available: {list(backends)}" + ) + return backend + + +def _select_format(dataset): + from activestorage.formats import KerchunkFormat, P5RemFormat, PyfiveFormat, ZarrFormat + + lower = str(dataset).lower() + if lower.endswith(".kerchunk"): + return KerchunkFormat + if lower.endswith(".zarr"): + return ZarrFormat + return PyfiveFormat + + +class Active: + def __new__(cls, *args, **kwargs): + instance = super().__new__(cls) + instance._methods = { + "min": np.ma.min, + "max": np.ma.max, + "sum": np.ma.sum, + "mean": np.ma.sum, + } + return instance + + def __init__( + self, + uri, + ncvar=None, + storage_type=None, + interface_type=None, + max_threads=100, + storage_options=None, + active_storage_url=None, + axis=None, + option_disable_chunk_cache=False, + ): + self.uri = uri + if self.uri is None: + raise ValueError(f"Must use a valid file for uri. Got {uri}") + + # Keep source URI when a dataset/variable object is provided. 
+ is_pathlike = isinstance(uri, (str, bytes, os.PathLike)) + source_uri = uri + if not is_pathlike: + file_obj = getattr(uri, "file", None) + fh = getattr(file_obj, "_fh", None) + source_uri = ( + getattr(fh, "path", None) + or getattr(fh, "url", None) + or str(uri) + ) + + # interface_type is an alias for storage_type + if interface_type is not None: + storage_type = interface_type + + self.storage_type = storage_type or return_interface_type(source_uri) + self.storage_options = storage_options or {} + if self.storage_type is None: + # Backward-compatible inference for bare S3 object paths like + # "bucket/key.nc" when only storage_options indicate S3 access. + # Only infer when the URI is not an existing local file. + s3_hints = {"key", "secret", "anon", "client_kwargs", "endpoint_url"} + if any(k in self.storage_options for k in s3_hints): + if not (is_pathlike and os.path.isfile(str(uri))): + self.storage_type = "s3" + self._option_disable_chunk_cache = bool(option_disable_chunk_cache) + self.active_storage_url = active_storage_url + + # Allow passing dataset/variable objects directly (ncvar optional). + is_file_object = not is_pathlike + if is_pathlike and not os.path.isfile(self.uri) and not self.storage_type: + raise ValueError(f"Must use existing file for uri. 
{self.uri} not found") + + # When uri is a dataset object, ncvar can be None (user will select variable via indexing) + if ncvar is None and not is_file_object: + raise ValueError("Must set a netCDF variable name to slice") + + self._ncvar = ncvar + self._version = 1 + self._components = False + self._method = None + self._axis = (axis,) if isinstance(axis, int) else axis + self._max_threads = max_threads + self.metric_data = {} + self.data_read = 0 + + self._format = _select_format(source_uri)() + self._format._storage_type = self.storage_type + if is_file_object: + # uri is already a pyfive.Group or similar + self._format._dataset = uri + self._format._uri = str(source_uri) + self.ds = uri + else: + self._format.open(uri, self.storage_options) + if ncvar is not None: + self.ds = self._format.get_variable(ncvar) + else: + self.ds = None + self.missing = None + + self._refresh_backend() + + @property + def ncvar(self): + return self._ncvar + + @ncvar.setter + def ncvar(self, value): + self._ncvar = value + + @property + def interface_type(self): + return self.storage_type + + def _refresh_backend(self): + backend_cls = _select_backend(self.storage_type, self._version) + self._backend = backend_cls(self) + + @property + def components(self): + return self._components + + @components.setter + def components(self, value): + self._components = bool(value) + + @property + def method(self): + return self._methods.get(self._method) + + @method.setter + def method(self, value): + if value is not None and value not in self._methods: + raise ValueError(f"Bad 'method': {value}. 
Choose from min/max/mean/sum.") + self._method = value + + def register_method(self, name, func): + self._methods[name] = func + + def min(self, axis=None): + self._method = "min" + self._axis = (axis,) if isinstance(axis, int) else axis + return self + + def max(self, axis=None): + self._method = "max" + self._axis = (axis,) if isinstance(axis, int) else axis + return self + + def mean(self, axis=None): + self._method = "mean" + self._axis = (axis,) if isinstance(axis, int) else axis + return self + + def sum(self, axis=None): + self._method = "sum" + self._axis = (axis,) if isinstance(axis, int) else axis + return self + + def __getitem__(self, index): + self.metric_data = {} + if self._version not in (0, 1, 2): + raise ValueError(f"Version {self._version} not supported") + + self._refresh_backend() + self.missing = self._format.get_missing_attributes() + self.data_read = 0 + + if self.method is None and self._version == 0: + return self._get_vanilla(index) + return self._get_selection(index) + + def _get_vanilla(self, index): + data = self.ds[index] + return self._mask_data(data) + + def _get_active(self, method, *args): + raise NotImplementedError + + def _get_selection(self, selection): + chunk_metadata = self._format.get_chunk_metadata() + indexer = self._format.get_indexer(selection) + ndim = len(chunk_metadata.shape) + axis = self._axis + if axis is None: + axis = tuple(range(ndim)) + else: + # Validate axis values; normalise negative indices for internal use. 
+ normalised = [] + for i in axis: + if not (-ndim <= i < ndim): + raise ValueError( + f"axis {i} is out of bounds for array of dimension {ndim}" + ) + normalised.append(i % ndim) + axis = tuple(normalised) + + session = self._backend.get_session() + try: + need_counts = self._components or self._method == "mean" + return self._from_storage( + session, + chunk_metadata, + indexer, + self.missing, + self._method, + need_counts, + axis, + ) + finally: + self._backend.close_session(session) + + def _from_storage(self, session, chunk_metadata, indexer, missing, method, need_counts, axis): + return self._backend.execution_strategy.execute( + self._backend, + session, + chunk_metadata, + indexer, + missing, + method, + need_counts, + axis, + ) + + def _process_chunk(self, request: ChunkRequest) -> ChunkResult: + return self._backend.reduce_chunk(request) + + def _get_endpoint_url(self): + return get_endpoint_url(self.storage_options, self.uri) + + def _mask_data(self, data): + if self.missing is None: + self.missing = get_missing_attributes(self.ds) + + if isinstance(self.missing, MissingAttributes): + fill_value = self.missing.fill_value + missing_value = self.missing.missing_value + valid_min = self.missing.valid_min + valid_max = self.missing.valid_max + else: + fill_value, missing_value, valid_min, valid_max = self.missing + + def _as_scalar(value): + if value is None: + return None + if not np.isscalar(value): + try: + if len(value) == 1: + return value[0] + except TypeError: + pass + return value + + fill_value = _as_scalar(fill_value) + missing_value = _as_scalar(missing_value) + valid_min = _as_scalar(valid_min) + valid_max = _as_scalar(valid_max) + + if fill_value is not None: + data = np.ma.masked_equal(data, fill_value) + if missing_value is not None: + data = np.ma.masked_equal(data, missing_value) + if valid_max is not None: + data = np.ma.masked_greater(data, valid_max) + if valid_min is not None: + data = np.ma.masked_less(data, valid_min) + return data 
diff --git a/activestorage/formats.py b/activestorage/formats.py new file mode 100644 index 00000000..4192543d --- /dev/null +++ b/activestorage/formats.py @@ -0,0 +1,164 @@ +from __future__ import annotations + +import numpy as np +import pyfive + +from activestorage.core import ChunkMetadata, MissingAttributes, StorageFormat +from activestorage.hdf2numcodec import decode_filters +from activestorage.helpers import get_missing_attributes + + +class PyfiveFormat(StorageFormat): + def __init__(self): + self._dataset_file = None + self._dataset = None + self._uri = None + + def open(self, uri, storage_options): + self._uri = str(uri) + scheme = "" + if "://" in self._uri: + scheme = self._uri.split("://", 1)[0] + storage_type = getattr(self, '_storage_type', None) or scheme + if storage_type == "s3": + from activestorage import active as active_module + + self._dataset_file = active_module.load_from_s3(self._uri, storage_options) + elif storage_type in ("http", "https"): + from activestorage import active as active_module + + self._dataset_file = active_module.load_from_https(self._uri, storage_options) + else: + self._dataset_file = pyfive.File(self._uri) + + def get_variable(self, ncvar): + self._dataset = self._dataset_file[ncvar] + return self._dataset + + def get_missing_attributes(self) -> MissingAttributes: + missing = get_missing_attributes(self._dataset) + return MissingAttributes(*missing) + + def get_indexer(self, selection): + if not isinstance(selection, tuple): + selection = (selection,) + array = pyfive.indexing.ZarrArrayStub(self._dataset.shape, self._dataset.chunks) + return pyfive.indexing.OrthogonalIndexer(selection, array) + + def get_chunk_metadata(self) -> ChunkMetadata: + dataset = self._dataset + ds = dataset.id + dtype = np.dtype(dataset.dtype) + array = pyfive.indexing.ZarrArrayStub(dataset.shape, dataset.chunks) + if ds.filter_pipeline is None: + compressor, filters = None, None + else: + compressor, filters = 
decode_filters(ds.filter_pipeline, dtype.itemsize, dataset.name) + + return ChunkMetadata( + dtype=dtype, + shape=dataset.shape, + chunks=array._chunks, + compressor=compressor, + filters=filters, + order=ds._order, + filename=self._uri, + ) + + def get_chunk_offset_size(self, chunk_coords): + storeinfo = self._dataset.id.get_chunk_info_from_chunk_coord(chunk_coords) + return storeinfo.byte_offset, storeinfo.size + + @property + def file_handle(self): + return self._dataset.id._fh + + def close(self): + return None + + +class ZarrFormat(StorageFormat): + def __init__(self): + self._array = None + + def open(self, uri, storage_options): + import zarr + + self._array = zarr.open(uri, mode="r") + + def get_variable(self, ncvar): + return self._array + + def get_missing_attributes(self) -> MissingAttributes: + attrs = self._array.attrs + return MissingAttributes( + attrs.get("_FillValue"), + attrs.get("missing_value"), + attrs.get("valid_min"), + attrs.get("valid_max"), + ) + + def get_indexer(self, selection): + if not isinstance(selection, tuple): + selection = (selection,) + return pyfive.indexing.OrthogonalIndexer(selection, self._array) + + def get_chunk_metadata(self) -> ChunkMetadata: + return ChunkMetadata( + dtype=np.dtype(self._array.dtype), + shape=self._array.shape, + chunks=self._array.chunks, + compressor=getattr(self._array, "compressor", None), + filters=getattr(self._array, "filters", None), + order=getattr(self._array, "order", "C"), + filename="", + ) + + def close(self): + return None + + +class P5RemFormat(StorageFormat): + def __init__(self): + self._file = None + self._dataset = None + + def open(self, uri, storage_options): + self._file = uri + + def get_variable(self, ncvar): + self._dataset = self._file[ncvar] + return self._dataset + + def get_missing_attributes(self) -> MissingAttributes: + attrs = self._dataset.attrs + return MissingAttributes( + attrs.get("_FillValue"), + attrs.get("missing_value"), + attrs.get("valid_min"), + 
attrs.get("valid_max"), + ) + + def get_indexer(self, selection): + if not isinstance(selection, tuple): + selection = (selection,) + array = pyfive.indexing.ZarrArrayStub(self._dataset.shape, self._dataset.chunks) + return pyfive.indexing.OrthogonalIndexer(selection, array) + + def get_chunk_metadata(self) -> ChunkMetadata: + return ChunkMetadata( + dtype=np.dtype(self._dataset.dtype), + shape=self._dataset.shape, + chunks=self._dataset.chunks, + compressor=None, + filters=None, + order="C", + filename="", + ) + + def close(self): + return None + + +class KerchunkFormat(ZarrFormat): + pass diff --git a/activestorage/helpers.py b/activestorage/helpers.py new file mode 100644 index 00000000..3f8267ab --- /dev/null +++ b/activestorage/helpers.py @@ -0,0 +1,116 @@ +import time +import urllib + +import aiohttp +import fsspec +import numpy as np +import pyfive +import s3fs + +from activestorage.config import S3_ACCESS_KEY, S3_SECRET_KEY, S3_URL + + +def load_from_https(uri, storage_options=None): + """ + Load a pyfive.high_level.Dataset from a netCDF4 file on an https server. + Works for both http and https endpoints. + """ + try: + if storage_options is None: + client_kwargs = {'auth': None} + fs = fsspec.filesystem('http', **client_kwargs) + http_file = fs.open(uri, 'rb') + else: + username = storage_options.get("username", None) + password = storage_options.get("password", None) + client_kwargs = { + 'auth': aiohttp.BasicAuth(username, password) if username and password else None + } + fs = fsspec.filesystem('http', **client_kwargs) + http_file = fs.open(uri, 'rb') + except FileNotFoundError as exc: + # fsspec wraps all failures as FileNotFoundError. + # Distinguish by cause: connection-level errors (bad hostname, refused) + # have an OSError cause; HTTP-level errors (404) do not. 
+ if isinstance(exc.__cause__, OSError): + raise ValueError(f"Failed to access HTTPS dataset: {uri}") from exc + raise + + ds = pyfive.File(http_file) + print(f"Dataset loaded from https with Pyfive: {uri}") + return ds + + +def load_from_s3(uri, storage_options=None): + """Load a pyfive file-like dataset from S3.""" + if storage_options is None: + fs = s3fs.S3FileSystem( + key=S3_ACCESS_KEY, + secret=S3_SECRET_KEY, + client_kwargs={"endpoint_url": S3_URL}, + ) + else: + fs = s3fs.S3FileSystem(**storage_options) + + t1 = time.time() + s3file = fs.open(uri, "rb") + t2 = time.time() + ds = pyfive.File(s3file) + t3 = time.time() + print(f"Dataset loaded from S3 with s3fs and Pyfive: {uri} ({t2-t1:.2},{t3-t2:.2})") + return ds + + +def get_missing_attributes(ds): + """Load missing-value related attributes from a dataset variable.""" + + def _hfix(x): + if x is None: + return x + if not np.isscalar(x) and len(x) == 1: + return x[0] + return x + + fill_value = _hfix(ds.attrs.get("_FillValue")) + missing_value = ds.attrs.get("missing_value") + valid_min = _hfix(ds.attrs.get("valid_min")) + valid_max = _hfix(ds.attrs.get("valid_max")) + valid_range = _hfix(ds.attrs.get("valid_range")) + + if valid_max is not None or valid_min is not None: + if valid_range is not None: + raise ValueError( + "Invalid combination in the file of valid_min, " + "valid_max, valid_range: " + f"{valid_min}, {valid_max}, {valid_range}" + ) + elif valid_range is not None: + valid_min, valid_max = valid_range + + return fill_value, missing_value, valid_min, valid_max + + +def get_endpoint_url(storage_options, filename): + """Return endpoint URL from storage options or infer from URI.""" + if not storage_options: + return f"http://{urllib.parse.urlparse(filename).netloc}" + + endpoint_url = storage_options.get("endpoint_url") + if endpoint_url is not None: + return endpoint_url + + client_kwargs = storage_options.get("client_kwargs") + if client_kwargs: + endpoint_url = 
client_kwargs.get("endpoint_url") + if endpoint_url is not None: + return endpoint_url + + return f"http://{urllib.parse.urlparse(filename).netloc}" + + +def return_interface_type(uri): + """Infer interface type from URI scheme.""" + scheme = urllib.parse.urlparse(str(uri)).scheme + if scheme in ("s3", "https"): + return scheme + return None diff --git a/activestorage/reductionist.py b/activestorage/reductionist.py index 13c3974b..bd05472c 100644 --- a/activestorage/reductionist.py +++ b/activestorage/reductionist.py @@ -1,18 +1,22 @@ """Reductionist S3 Active Storage server storage interface module.""" +import cbor2 as cbor import collections.abc import http.client import json -import requests -import numcodecs -import numpy as np import sys import typing +import numcodecs +import numpy as np +import requests + + DEBUG = 0 -def get_session(username: str, password: str, cacert: typing.Optional[str]) -> requests.Session: +def get_session(username: str, password: str, + cacert: typing.Optional[str]) -> requests.Session: """Create and return a client session object. :param username: S3 username / access key @@ -20,21 +24,37 @@ def get_session(username: str, password: str, cacert: typing.Optional[str]) -> r :returns: a client session object. """ session = requests.Session() + # TODO Stack-HPC + # we need to allow Anon buckets. though this + # will break connection to data server + # if username is None and password is None: + # return session session.auth = (username, password) session.verify = cacert or False return session -def reduce_chunk(session, server, source, bucket, object, - offset, size, compression, filters, missing, dtype, shape, - order, chunk_selection, operation): +def reduce_chunk(session, + server, + url, + offset, + size, + compression, + filters, + missing, + dtype, + shape, + order, + chunk_selection, + axis, + operation, + interface_type=None, + option_disable_chunk_cache=False): """Perform a reduction on a chunk using Reductionist. 
:param server: Reductionist server URL :param cacert: Reductionist CA certificate path - :param source: S3 URL - :param bucket: S3 bucket - :param object: S3 object + :param url: object URL :param offset: offset of data in object :param size: size of data in object :param compression: optional `numcodecs.abc.Codec` compression codec @@ -49,16 +69,31 @@ def reduce_chunk(session, server, source, bucket, object, 1), slice(1, 3, 1), slice(0, 1, 1)) this defines the part of the chunk which is to be obtained or operated upon. + :param axis: tuple of the axes to be reduced (non-negative integers) :param operation: name of operation to perform + :param interface_type: optional testing flag to allow HTTPS reduction + :param option_disable_chunk_cache: optional turn off chunk cache :returns: the reduced data as a numpy array or scalar :raises ReductionistError: if the request to Reductionist fails """ - request_data = build_request_data(source, bucket, object, offset, size, compression, filters, missing, dtype, shape, order, chunk_selection) + request_data = build_request_data(url, + offset, + size, + compression, + filters, + missing, + dtype, + shape, + order, + chunk_selection, + axis, + interface_type=interface_type, + option_disable_chunk_cache=option_disable_chunk_cache) if DEBUG: print(f"Reductionist request data dictionary: {request_data}") api_operation = "sum" if operation == "mean" else operation or "select" - url = f'{server}/v1/{api_operation}/' + url = f'{server}/v2/{api_operation}/' response = request(session, url, request_data) if response.ok: @@ -80,6 +115,7 @@ def encode_byte_order(dtype): def encode_selection(selection): """Encode a chunk selection in a JSON-compatible format.""" + def encode_slice(s): if isinstance(s, slice): return [s.start, s.stop, s.step] @@ -118,13 +154,21 @@ def encode_missing(missing): missing_value = fill_value or missing_value if missing_value: if isinstance(missing_value, collections.abc.Sequence): - return {"missing_values": 
[encode_dvalue(v) for v in missing_value]} + return { + "missing_values": [encode_dvalue(v) for v in missing_value] + } elif isinstance(missing_value, np.ndarray): - return {"missing_values": [encode_dvalue(v) for v in missing_value]} + return { + "missing_values": [encode_dvalue(v) for v in missing_value] + } else: return {"missing_value": encode_dvalue(missing_value)} if valid_min and valid_max: - return {"valid_range": [encode_dvalue(valid_min), encode_dvalue(valid_max)]} + return { + "valid_range": + [encode_dvalue(valid_min), + encode_dvalue(valid_max)] + } if valid_min: return {"valid_min": encode_dvalue(valid_min)} if valid_max: @@ -132,14 +176,23 @@ def encode_missing(missing): assert False, "Expected missing values not found" -def build_request_data(source: str, bucket: str, object: str, offset: int, - size: int, compression, filters, missing, dtype, shape, - order, selection) -> dict: +def build_request_data(url: str, + offset: int, + size: int, + compression, + filters, + missing, + dtype, + shape, + order, + selection, + axis, + interface_type=None, + option_disable_chunk_cache=False) -> dict: """Build request data for Reductionist API.""" request_data = { - 'source': source, - 'bucket': bucket, - 'object': object, + 'interface_type': interface_type if interface_type else "s3", + 'url': url, 'dtype': dtype.name, 'byte_order': encode_byte_order(dtype), 'offset': int(offset), @@ -159,6 +212,11 @@ def build_request_data(source: str, bucket: str, object: str, offset: int, request_data["filters"] = encode_filters(filters) if any(missing): request_data["missing"] = encode_missing(missing) + if option_disable_chunk_cache: + request_data["option_disable_chunk_cache"] = True + + if axis is not None: + request_data['axis'] = axis return {k: v for k, v in request_data.items() if v is not None} @@ -172,21 +230,59 @@ def request(session: requests.Session, url: str, request_data: dict): return response -def decode_result(response): - """Decode a successful response, 
return as a 2-tuple of (numpy array or scalar, count).""" - dtype = response.headers['x-activestorage-dtype'] - shape = json.loads(response.headers['x-activestorage-shape']) - result = np.frombuffer(response.content, dtype=dtype) +def encode_result(data, count): + """Encode a reduction result using the same CBOR payload shape as Reductionist.""" + result = np.ma.getdata(np.asanyarray(data)) + # Serialise count: numpy arrays/scalars must be converted to plain Python types. + if isinstance(count, np.ndarray): + serialised_count = count.tolist() + elif isinstance(count, (np.integer, np.floating)): + serialised_count = count.item() + else: + serialised_count = count + reduction_result = { + "bytes": result.tobytes(), + "dtype": result.dtype.name, + "shape": list(result.shape), + "count": serialised_count, + } + return cbor.dumps(reduction_result) + + +def decode_result_buffer(buffer): + """Decode a CBOR-encoded Reductionist result buffer.""" + return decode_result_payload(cbor.loads(buffer)) + + +def decode_result_payload(reduction_result): + """Decode a Reductionist result mapping into a 2-tuple of (array, count).""" + dtype = reduction_result['dtype'] + shape = reduction_result['shape'] if "shape" in reduction_result else None + + result = np.frombuffer(reduction_result['bytes'], dtype=dtype) result = result.reshape(shape) - count = json.loads(response.headers['x-activestorage-count']) + + count = reduction_result.get('count') + if count is None: + return result, count + + result = np.ma.masked_where(count == 0, result) + return result, count +def decode_result(response): + """Decode a successful response, return as a 2-tuple of (numpy array or scalar, count).""" + return decode_result_buffer(response.content) + + class ReductionistError(Exception): """Exception for Reductionist failures.""" def __init__(self, status_code, error): - super(ReductionistError, self).__init__(f"Reductionist error: HTTP {status_code}: {error}") + super( + ReductionistError, + 
self).__init__(f"Reductionist error: HTTP {status_code}: {error}") def decode_and_raise_error(response): diff --git a/activestorage/storage.py b/activestorage/storage.py index 80a575ba..0197aa2a 100644 --- a/activestorage/storage.py +++ b/activestorage/storage.py @@ -3,9 +3,41 @@ from numcodecs.compat import ensure_ndarray -def reduce_chunk(rfile, - offset, size, compression, filters, missing, dtype, shape, - order, chunk_selection, method=None): +def _apply_missing_mask(data, missing): + """Apply missing-value masks to *data* without compressing/flattening.""" + fill_value, missing_value, valid_min, valid_max = missing + + def _as_scalar(value): + if value is None: + return None + if not np.isscalar(value): + try: + if len(value) == 1: + return value[0] + except TypeError: + pass + return value + + fill_value = _as_scalar(fill_value) + missing_value = _as_scalar(missing_value) + valid_min = _as_scalar(valid_min) + valid_max = _as_scalar(valid_max) + + data = np.ma.array(data) + if fill_value is not None: + data = np.ma.masked_equal(data, fill_value) + if missing_value is not None: + data = np.ma.masked_equal(data, missing_value) + if valid_max is not None: + data = np.ma.masked_greater(data, valid_max) + if valid_min is not None: + data = np.ma.masked_less(data, valid_min) + return data + + +def reduce_chunk(rfile, + offset, size, compression, filters, missing, dtype, shape, + order, chunk_selection, method=None, axis=None): """ We do our own read of chunks and decoding etc rfile - the actual file with the data @@ -43,11 +75,19 @@ def reduce_chunk(rfile, tmp = chunk[chunk_selection] if method: if missing != (None, None, None, None): - tmp = remove_missing(tmp, missing) - # Check on size of tmp; method(empty) fails or gives incorrect - # results + if axis is None: + # Flatten to valid elements (original behaviour for all-axes reduction). + tmp = remove_missing(tmp, missing) + else: + # Keep array structure so axis-specific reduction is possible. 
+ tmp = _apply_missing_mask(tmp, missing) if tmp.size: - return method(tmp), tmp.size + if axis is None: + return method(tmp), tmp.size + else: + result = method(tmp, axis=axis, keepdims=True) + count = np.ma.count(tmp, axis=axis, keepdims=True) + return result, count else: return tmp, 0 else: @@ -80,13 +120,29 @@ def remove_missing(data, missing): """ fill_value, missing_value, valid_min, valid_max = missing - if fill_value: + def _as_scalar(value): + if value is None: + return None + if not np.isscalar(value): + try: + if len(value) == 1: + return value[0] + except TypeError: + pass + return value + + fill_value = _as_scalar(fill_value) + missing_value = _as_scalar(missing_value) + valid_min = _as_scalar(valid_min) + valid_max = _as_scalar(valid_max) + + if fill_value is not None: data = np.ma.masked_equal(data, fill_value) - if missing_value: + if missing_value is not None: data = np.ma.masked_equal(data, missing_value) - if valid_max: + if valid_max is not None: data = np.ma.masked_greater(data, valid_max) - if valid_min: + if valid_min is not None: data = np.ma.masked_less(data, valid_min) data = np.ma.compressed(data) @@ -105,7 +161,7 @@ def read_block(open_file, offset, size): def reduce_opens3_chunk(fh, offset, size, compression, filters, missing, dtype, shape, - order, chunk_selection, method=None): + order, chunk_selection, method=None, axis=None): """ Same function as reduce_chunk, but this mimics what is done deep in the bowels of H5py/pyfive. 
The reason for doing this is @@ -125,10 +181,18 @@ def reduce_opens3_chunk(fh, tmp = chunk[chunk_selection] if method: if missing != (None, None, None, None): - tmp = remove_missing(tmp, missing) + if axis is None: + tmp = remove_missing(tmp, missing) + else: + tmp = _apply_missing_mask(tmp, missing) # check on size of tmp; method(empty) returns nan if tmp.any(): - return method(tmp), tmp.size + if axis is None: + return method(tmp), tmp.size + else: + result = method(tmp, axis=axis, keepdims=True) + count = np.ma.count(tmp, axis=axis, keepdims=True) + return result, count else: return tmp, None else: diff --git a/activestorage/strategies.py b/activestorage/strategies.py new file mode 100644 index 00000000..4a3faadf --- /dev/null +++ b/activestorage/strategies.py @@ -0,0 +1,159 @@ +from __future__ import annotations + +from concurrent.futures import ThreadPoolExecutor, as_completed + +import numpy as np + +from activestorage.core import ChunkRequest, SelectionRequest + + +class ChunkedLocalStrategy: + def _iter_chunk_requests(self, backend, chunk_metadata, indexer, missing, method, axis): + for chunk_coords, chunk_selection, out_selection in indexer: + offset, size = backend._active._format.get_chunk_offset_size(chunk_coords) + request = ChunkRequest( + uri=chunk_metadata.filename, + offset=offset, + size=size, + dtype=chunk_metadata.dtype, + chunks=chunk_metadata.chunks, + order=chunk_metadata.order, + compressor=chunk_metadata.compressor, + filters=chunk_metadata.filters, + chunk_selection=chunk_selection, + missing=missing, + method=method, + axis=axis, + ) + yield chunk_coords, out_selection, request + + def _store_chunk_result(self, backend, out, counts, axis, need_counts, method, chunk_coords, out_selection, result): + if method is not None: + out_sel = list(out_selection) + for i in axis: + n = chunk_coords[i] + out_sel[i] = slice(n, n + 1) + out[tuple(out_sel)] = result.data + if need_counts: + counts[tuple(out_sel)] = result.count + return + + 
out[out_selection] = result.data + + def execute( + self, + backend, + session, + chunk_metadata, + indexer, + missing, + method, + need_counts, + axis, + ): + out_shape = list(indexer.shape) + if method is not None: + nchunks = [] + for dim in indexer.dim_indexers: + nchunks.append(getattr(dim, "nchunks", 1)) + for i in axis: + out_shape[i] = nchunks[i] + + out = np.ma.empty(out_shape, dtype=chunk_metadata.dtype, order=chunk_metadata.order) + out.mask = True + counts = None + if need_counts: + counts = np.ma.empty(out_shape, dtype="int64", order=chunk_metadata.order) + counts.mask = True + + max_threads = max(1, getattr(backend._active, "_max_threads", 1) or 1) + + if max_threads == 1: + for chunk_coords, out_selection, request in self._iter_chunk_requests( + backend, chunk_metadata, indexer, missing, method, axis + ): + result = backend.reduce_chunk(request) + self._store_chunk_result( + backend, + out, + counts, + axis, + need_counts, + method, + chunk_coords, + out_selection, + result, + ) + else: + with ThreadPoolExecutor(max_workers=max_threads) as executor: + futures = { + executor.submit(backend.reduce_chunk, request): (chunk_coords, out_selection) + for chunk_coords, out_selection, request in self._iter_chunk_requests( + backend, chunk_metadata, indexer, missing, method, axis + ) + } + for future in as_completed(futures): + chunk_coords, out_selection = futures[future] + result = future.result() + self._store_chunk_result( + backend, + out, + counts, + axis, + need_counts, + method, + chunk_coords, + out_selection, + result, + ) + + if method is None: + return out + + reducer = backend._active.method + reduced = reducer(out, axis=axis, keepdims=True) + if not need_counts: + return reduced + + n = np.ma.sum(counts, axis=axis, keepdims=True) + if backend._active.components: + key = "sum" if method == "mean" else method + return {key: reduced, "n": n} + if method == "mean": + return reduced / n + return reduced + + +class 
ChunkedRemoteStrategy(ChunkedLocalStrategy): + pass + + +class WholeArrayStrategy: + def execute( + self, + backend, + session, + chunk_metadata, + indexer, + missing, + method, + need_counts, + axis, + ): + request = SelectionRequest( + uri=chunk_metadata.filename, + variable=backend._active.ncvar, + selection=getattr(indexer, "selection", ()), + method=method, + axis=axis, + missing=missing, + compressor=chunk_metadata.compressor, + filters=chunk_metadata.filters, + ) + result = backend.reduce_selection(request) + if backend._active.components and result.n is not None: + key = "sum" if method == "mean" else method + return {key: result.data, "n": result.n} + if method == "mean" and result.n: + return result.data / result.n + return result.data diff --git a/bnl/bnl_test.py b/bnl/bnl_test.py index fd9e4d29..57936ef2 100644 --- a/bnl/bnl_test.py +++ b/bnl/bnl_test.py @@ -7,7 +7,6 @@ import s3fs from activestorage.active import Active -from activestorage.config import * def mytest(): diff --git a/bnl/common_cl_a.nc b/bnl/common_cl_a.nc new file mode 100644 index 00000000..0b73775c Binary files /dev/null and b/bnl/common_cl_a.nc differ diff --git a/bnl/test_missing_gubbins.py b/bnl/test_missing_gubbins.py index 65f353a5..e54b04be 100644 --- a/bnl/test_missing_gubbins.py +++ b/bnl/test_missing_gubbins.py @@ -1,11 +1,13 @@ -from activestorage import dummy_data as dd +import sys +from pathlib import Path +sys.path.insert(0, str(Path(__file__).parent.parent / 'tests')) +import dummy_data as dd from activestorage import Active import pyfive import numpy.ma as ma import numpy as np import os -from activestorage.config import * from pathlib import Path import s3fs diff --git a/codecov.yml b/codecov.yml index 66c9e1c8..9f742648 100644 --- a/codecov.yml +++ b/codecov.yml @@ -1,3 +1,5 @@ +codecov: + token: 562143a4-8321-437a-bb94-a4c4a5455a1a coverage: status: project: off diff --git a/docs4understanding/active_refactor_v7.puml b/docs4understanding/active_refactor_v7.puml new 
file mode 100644 index 00000000..4998eb60 --- /dev/null +++ b/docs4understanding/active_refactor_v7.puml @@ -0,0 +1,305 @@ +@startuml active_refactor_v7_core + +skinparam classAttributeIconSize 0 +skinparam classFontStyle bold +skinparam packageStyle rectangle + +' ------------------------------------------------------------------ +' Diagram 1: core only +' ------------------------------------------------------------------ + +package "activestorage.core" { + + class SelectionRequest <> { + -- + + uri : str + + variable : str + + selection : tuple + + method : str + + axis : tuple + + missing : MissingAttributes + + compressor : object + + filters : list + } + + class ChunkRequest <> { + -- + + uri : str + + offset : int + + size : int + + dtype : str + + chunks : tuple + + order : str + + compressor : object + + filters : list + + chunk_selection : tuple + + missing : MissingAttributes + + method : str + + axis : tuple + } + + class MissingAttributes <> { + -- + + fill_value + + missing_value + + valid_min + + valid_max + } + + class ChunkMetadata <> { + -- + + dtype + + shape : tuple + + chunks : tuple + + compressor + + filters : list + + order : str + + filename : str + } + + class ChunkResult <> { + -- + + data : np.ma.array + + count : int + + out_selection : tuple + } + + class SelectionResult <> { + -- + + data : np.ma.array | dict + + n : int + } + + abstract class StorageFormat { + -- + + {abstract} open(uri, storage_options) + + {abstract} get_variable(ncvar) : dataset + + {abstract} get_missing_attributes() : MissingAttributes + + {abstract} get_indexer(selection) : indexer + + {abstract} get_chunk_metadata() : ChunkMetadata + + {abstract} close() + } + + abstract class StorageBackend { + + supported_methods : set + + execution_strategy : ExecutionStrategy + -- + + get_session() + + close_session(session) + + {abstract} reduce_chunk(request: ChunkRequest) : ChunkResult + + reduce_selection(request: SelectionRequest) : SelectionResult + } + + abstract 
class ExecutionStrategy { + + {abstract} execute(backend, session,\n chunk_metadata, indexer,\n missing, method,\n need_counts, axis) : array | dict + } + + class "_select_backend(interface_type, version)" as SelectBackend <> { + Returns StorageBackend subclass + from _BACKENDS dict. + } + + class "_select_format(dataset)" as SelectFormat <> { + Returns StorageFormat subclass + from _FORMATS dict. + } + + enum "_BACKENDS" as BackendRegistry { + (None, 1/2) => LocalBackend + ("s3", 1/2) => S3Backend + ("https", 2) => HttpsBackend + ("p5rem", *) => P5RemBackend + } + + enum "_FORMATS" as FormatRegistry { + pyfive.Dataset => PyfiveFormat + zarr.Array => ZarrFormat + p5rem.File => P5RemFormat + ".nc"/".he5" => PyfiveFormat + ".zarr" => ZarrFormat + ".kerchunk" => KerchunkFormat + } + + class Active { + - _methods : dict + - _method : str + - _axis : tuple + - _max_threads : int + - _components : bool + - _backend : StorageBackend + - _format : StorageFormat + -- + + __init__(..., max_threads=100, ...) + + register_method(name, func) + + method : property + + components : property + + min(axis) : self + + max(axis) : self + + mean(axis) : self + + sum(axis) : self + -- + + __getitem__(index) + - _get_vanilla(index) + - _from_storage(...) + - _mask_data(data) + } + + BackendRegistry .. SelectBackend + FormatRegistry .. 
SelectFormat + + Active o-- StorageBackend : _backend + Active o-- StorageFormat : _format + Active ..> SelectBackend : __init__ + Active ..> SelectFormat : __init__ + Active ..> ExecutionStrategy : delegates via backend.execution_strategy + + StorageBackend o-- ExecutionStrategy : execution_strategy + + StorageBackend ..> ChunkRequest : consumes + StorageBackend ..> ChunkResult : produces + StorageBackend ..> SelectionRequest : consumes + StorageBackend ..> SelectionResult : produces + + StorageFormat ..> ChunkMetadata : produces + StorageFormat ..> MissingAttributes : produces + + ChunkRequest *-- MissingAttributes + SelectionRequest *-- MissingAttributes +} + +@enduml + +@startuml active_refactor_v7_impls + +skinparam classAttributeIconSize 0 +skinparam classFontStyle bold +skinparam packageStyle rectangle + +' ------------------------------------------------------------------ +' Diagram 2: formats/backends/strategies +' Core base classes are shown via stereotypes. +' ------------------------------------------------------------------ + +package "activestorage.formats" { + + class PyfiveFormat <> { + + open(uri, storage_options) + + get_variable(ncvar) + + get_missing_attributes() : MissingAttributes + + get_indexer(selection) + + get_chunk_metadata() : ChunkMetadata + + close() + } + + class ZarrFormat <> { + + open(uri, storage_options) + + get_variable(ncvar) + + get_missing_attributes() : MissingAttributes + + get_indexer(selection) + + get_chunk_metadata() : ChunkMetadata + + close() + } + + class P5RemFormat <> { + + open(uri, storage_options) + + get_variable(ncvar) + + get_missing_attributes() : MissingAttributes + + get_indexer(selection) + + get_chunk_metadata() : ChunkMetadata + + close() + } + + class KerchunkFormat <> { + + open(uri, storage_options) + + get_variable(ncvar) + + get_missing_attributes() : MissingAttributes + + get_indexer(selection) + + get_chunk_metadata() : ChunkMetadata + + close() + } +} + +package "activestorage.backends" { + + 
abstract class CacheAwareBackend <> { + + is_range_cached(fh, offset, size) : bool + - _get_block_size(fh) : int + - _get_cache_dict(fh) : dict + } + + abstract class ReductionistBackend <> { + + execution_strategy : ChunkedRemoteStrategy + -- + + {abstract} build_url(filename, storage_options) : str + + {abstract} get_session() : session + + reduce_chunk(request: ChunkRequest) : ChunkResult + + reduce_selection(request: SelectionRequest) : SelectionResult + } + + class LocalBackend <> { + + execution_strategy : ChunkedLocalStrategy + -- + + reduce_chunk(request: ChunkRequest) : ChunkResult + } + + class S3Backend <> { + + execution_strategy : ChunkedRemoteStrategy + -- + + build_url(filename, storage_options) : str + + get_session() : session + - _resolve_bucket_object(filename, opts) + } + + class HttpsBackend <> { + + execution_strategy : ChunkedRemoteStrategy + -- + + build_url(filename, storage_options) : str + + get_session() : session + } + + class P5RemBackend <> { + + execution_strategy : WholeArrayStrategy + -- + + get_session() : p5rem.Client + + reduce_chunk(request: ChunkRequest) : ChunkResult + + reduce_selection(request: SelectionRequest) : SelectionResult + } + + CacheAwareBackend <|-- ReductionistBackend + ReductionistBackend <|-- S3Backend + ReductionistBackend <|-- HttpsBackend +} + +package "activestorage.strategies" { + + class ChunkedLocalStrategy <> { + + execute(...) + - _iter_chunk_requests(...) + - _store_chunk_result(...) + -- + Uses ThreadPoolExecutor when\nactive._max_threads > 1 + } + + class ChunkedRemoteStrategy <> { + + execute(...) + -- + Inherits threaded execution from\nChunkedLocalStrategy + } + + class WholeArrayStrategy <> { + + execute(...) 
+ } + + ChunkedRemoteStrategy ..> CacheAwareBackend : inspects cache via + WholeArrayStrategy ..> CacheAwareBackend : inspects cache via + ChunkedLocalStrategy ..> ChunkRequest : builds + ChunkedRemoteStrategy ..> ChunkRequest : builds + WholeArrayStrategy ..> SelectionRequest : builds + +} + +ChunkRequest -[hidden]- CacheAwareBackend + + + +@enduml diff --git a/docs4understanding/active_refactor_v8_formats_adapters.puml b/docs4understanding/active_refactor_v8_formats_adapters.puml new file mode 100644 index 00000000..a7db2a42 --- /dev/null +++ b/docs4understanding/active_refactor_v8_formats_adapters.puml @@ -0,0 +1,141 @@ +@startuml active_refactor_v8_formats_adapters + +skinparam classAttributeIconSize 0 +skinparam classFontStyle bold +skinparam packageStyle rectangle + +' ------------------------------------------------------------------ +' StorageFormat adapters: thin wrappers around external library objects +' ------------------------------------------------------------------ + +package "activestorage.core" { + + abstract class StorageFormat { + -- + + {abstract} open(uri, storage_options) + + {abstract} get_variable(ncvar) : dataset + + {abstract} get_missing_attributes() : MissingAttributes + + {abstract} get_indexer(selection) : indexer + + {abstract} get_chunk_metadata() : ChunkMetadata + + {abstract} close() + } +} + +package "activestorage.formats" { + + ' Adapters implementing the StorageFormat contract + ' Each wraps an external library object and delegates to it + + class PyfiveFormat <> { + - _dataset : pyfive.Dataset + -- + + open(uri, storage_options) + + get_variable(ncvar) : dataset + + get_missing_attributes() : MissingAttributes + + get_indexer(selection) + + get_chunk_metadata() : ChunkMetadata + + close() + } + + class ZarrFormat <> { + - _array : zarr.Array + -- + + open(uri, storage_options) + + get_variable(ncvar) : dataset + + get_missing_attributes() : MissingAttributes + + get_indexer(selection) + + get_chunk_metadata() : ChunkMetadata 
+ + close() + } + + class P5RemFormat <> { + - _file : p5rem.File + -- + + open(uri, storage_options) + + get_variable(ncvar) : dataset + + get_missing_attributes() : MissingAttributes + + get_indexer(selection) + + get_chunk_metadata() : ChunkMetadata + + close() + } + + class KerchunkFormat <> { + - _zarr : zarr.Array + -- + + open(uri, storage_options) + + get_variable(ncvar) : dataset + + get_missing_attributes() : MissingAttributes + + get_indexer(selection) + + get_chunk_metadata() : ChunkMetadata + + close() + } + + + +note top of PyfiveFormat + Simple wrapper around pyfive.Dataset. + Delegates chunk queries to Dataset's + built-in HDF5 machinery. No methods + to add — just translate the interface. +end note + +note top of ZarrFormat + Wraps zarr.Array. get_chunk_metadata() + reads Zarr's array attributes. + get_indexer() uses Zarr's indexing. +end note + +note top of P5RemFormat + Wraps p5rem.File. Delegates most queries + to the remote p5rem server via the client. + Thin wrapper around the network interface. +end note + +note top of KerchunkFormat + Wraps zarr.Array created from Kerchunk + JSON metadata files. Identical to ZarrFormat + in delegation, but semantic distinction + in how the zarr object was created. 
+end note +} + +package "external libraries" { + + class "pyfive.Dataset" as PyfiveDataset { + attrs : dict + dimensions : dict + variables : dict + -- + + __getitem__(key) + + __getattr__(name) + } + + class "zarr.Array" as ZarrArray { + attrs : dict + chunks : tuple + dtype : dtype + shape : tuple + -- + + __getitem__(key) + + [other zarr methods] + } + + class "p5rem.File" as P5RemFile { + variables : dict + dimensions : dict + attrs : dict + -- + + __getitem__(key) + + connect() + } + + +' Composition: adapters wrap and delegate to external objects +PyfiveFormat *-- PyfiveDataset : wraps +ZarrFormat *-- ZarrArray : wraps +P5RemFormat *-- P5RemFile : wraps +KerchunkFormat *-- ZarrArray : wraps (kerchunk metadata) + + +} +@enduml diff --git a/docs4understanding/axis_flow_v1.puml b/docs4understanding/axis_flow_v1.puml new file mode 100644 index 00000000..ddd52c14 --- /dev/null +++ b/docs4understanding/axis_flow_v1.puml @@ -0,0 +1,98 @@ +@startuml axis_flow_v1 + +title "Axis Parameter Flow Through Reduction Pipeline (Path-Specific)" + +actor User + +participant "Active\n.min/.max/.mean/.sum\n(axis=N)" as Active +participant "Active\n._get_selection()" as GetSelection +participant "Active\n._from_storage()" as FromStorage +participant "ExecutionStrategy\n.execute()" as Strategy +participant "ChunkedLocalStrategy\n.execute()" as LocalStrat +participant "ChunkedRemoteStrategy\n.execute()" as RemoteStrat +participant "WholeArrayStrategy\n.execute()" as WholeStrat +participant "StorageBackend\n.reduce_chunk()" as ChunkBackend +participant "StorageBackend\n.reduce_selection()" as SelBackend +participant "Reductionist\nv2 API" as Reductionist + +User ->> Active: active.mean(axis=0)[:] + +note over Active + axis parameter stored in + Active._axis = (axis,) if int + or = axis if tuple/None + chunk parallelism via + Active._max_threads +end note + +Active ->> GetSelection: _get_selection(selection) +note over GetSelection + Sets axis from Internal state + axis = self._axis + 
if axis is None: + axis = tuple(range(ndim)) +end note + +GetSelection ->> FromStorage: _from_storage(..., axis=axis) + +FromStorage ->> Strategy: strategy.execute(\n ..., axis=axis) + +alt Local chunked path (LocalBackend + ChunkedLocalStrategy) + Strategy ->> LocalStrat: execute(..., axis=axis) + alt max_threads > 1 + LocalStrat ->> LocalStrat: submit per-chunk tasks via\nThreadPoolExecutor(max_workers=max_threads) + LocalStrat ->> ChunkBackend: reduce_chunk(request) [parallel tasks] + ChunkBackend -->> LocalStrat: ChunkResult(data, count) [as futures complete] + else max_threads == 1 + LocalStrat ->> LocalStrat: for chunk in indexer:\nChunkRequest(axis=axis) + LocalStrat ->> ChunkBackend: reduce_chunk(request) + ChunkBackend -->> LocalStrat: ChunkResult(data, count) + end + LocalStrat ->> LocalStrat: accumulate chunk outputs\nand finalize reduction + LocalStrat -->> FromStorage: final_result +else Remote chunked path (S3/Https + ChunkedRemoteStrategy) + Strategy ->> RemoteStrat: execute(..., axis=axis) + alt max_threads > 1 + RemoteStrat ->> RemoteStrat: submit per-chunk tasks via\nThreadPoolExecutor(max_workers=max_threads) + RemoteStrat ->> ChunkBackend: reduce_chunk(request) [parallel tasks] + else max_threads == 1 + RemoteStrat ->> RemoteStrat: for chunk in indexer:\nChunkRequest(axis=axis) + RemoteStrat ->> ChunkBackend: reduce_chunk(request) + end + + note over ChunkBackend + S3Backend / HttpsBackend: + passes axis=request.axis + to Reductionist v2 API + end note + + ChunkBackend ->> Reductionist: reduce_chunk(\n..., axis=request.axis, operation='mean') + + note over Reductionist + Reductionist v2 API: + - axis tuple is applied server-side + - returns per-chunk reduced result + - examples: + axis=(0,1) -> shape[2] + axis=(2,) -> shape[0,1] + axis=None -> scalar + end note + + Reductionist -->> ChunkBackend: ChunkResult(data, count) + ChunkBackend -->> RemoteStrat: ChunkResult(data, count) [as futures complete] + RemoteStrat ->> RemoteStrat: accumulate 
chunk outputs\nand finalize reduction + RemoteStrat -->> FromStorage: final_result +else Whole-array path (P5Rem + WholeArrayStrategy) + Strategy ->> WholeStrat: execute(..., axis=axis) + WholeStrat ->> SelBackend: reduce_selection(SelectionRequest(axis=axis)) + SelBackend -->> WholeStrat: SelectionResult(data, n) + WholeStrat -->> FromStorage: final_result +end + +FromStorage ->> GetSelection: final_result + +GetSelection ->> Active: final_result + +Active ->> User: results[axis_reduced_shape] + +@enduml diff --git a/docs4understanding/axis_support_architecture_v1.puml b/docs4understanding/axis_support_architecture_v1.puml new file mode 100644 index 00000000..cd99562f --- /dev/null +++ b/docs4understanding/axis_support_architecture_v1.puml @@ -0,0 +1,186 @@ +@startuml axis_support_v1 + +title "Axis Parameter Support Architecture" + +skinparam classAttributeIconSize 0 +skinparam classFontStyle bold +skinparam packageStyle rectangle + +package "activestorage.core" { + + class Active { + - _axis: tuple | None + - _max_threads: int + -- + + min(axis: int | tuple | None) : self + + max(axis: int | tuple | None) : self + + mean(axis: int | tuple | None) : self + + sum(axis: int | tuple | None) : self + - _get_selection(selection) : array + - _from_storage(..., axis: tuple) : array + -- + **Axis Flow:** + User calls method(axis=N) → + stores in _axis → + uses in _get_selection() → + passes to _from_storage()\n + chunk dispatch policy from _max_threads + } + + class ChunkRequest { + -- + + axis: tuple | None + -- + **Axis contains:** + - Tuple of dimensions to reduce + - Example: (0, 1) to reduce first 2 dims + - None means reduce all (scalar result) + } + + class SelectionRequest { + -- + + axis: tuple | None + -- + **When used:** + - Whole-array reduction paths + - Less common than chunked paths + } + + class ChunkResult { + -- + + data: np.ndarray | dict + + count: int | None + -- + **Return values depend on axis:** + - axis=(0,1): shape reduced by 2 dims + - 
axis=None: scalar or dict of scalars + - components=True: returns dict with 'sum' or 'mean' and 'n' + } + + Active --> ChunkRequest : creates via\nstrategy + Active --> SelectionRequest : creates via\nstrategy + ChunkRequest --> ChunkResult : received from\nbackend + +note left of Active + **Example Usage:** + // Reduce along first axis only + active = Active("file.nc", "temperature") + result = active.mean(axis=0)[:] + + // Result shape: (10,10) if original (100,10,10) + + // Reduce all axes to scalar + result = active.mean(axis=None)[:] + // Result: scalar value + + // Multiple axes + result = active.sum(axis=(0,2))[:] + // Reduces axes 0 and 2, keeps axis 1 +end note + +note bottom of SelectionRequest +This is not implemented +at the moment, will likely +be implemented first in p5rem +backend. +end note + +} + +package "activestorage.backends" { + + abstract class StorageBackend { + -- + + reduce_chunk(request: ChunkRequest) : ChunkResult + + reduce_selection(request: SelectionRequest) : SelectionResult + -- + **Axis Handling:** + LocalBackend: ignores axis + (uses storage.reduce_chunk) + S3Backend: passes axis=request.axis + to reductionist.reduce_chunk() + HttpsBackend: passes axis=request.axis + to reductionist.reduce_chunk() + } + + class ChunkedLocalStrategy { + + execute(...) + -- + if max_threads > 1:\nThreadPoolExecutor dispatch + else:\nserial chunk loop + } + + class ChunkedRemoteStrategy { + + execute(...) 
+ -- + if max_threads > 1:\nThreadPoolExecutor dispatch + else:\nserial chunk loop + } + + class LocalBackend { + + reduce_chunk(request: ChunkRequest) + -- + Does NOT use axis parameter + (local numpy operations on full chunks) + } + + class S3Backend { + + reduce_chunk(request: ChunkRequest) + -- + Passes axis=request.axis + to reductionist.reduce_chunk() + } + + class HttpsBackend { + + reduce_chunk(request: ChunkRequest) + -- + Passes axis=request.axis + to reductionist.reduce_chunk() + } + + + StorageBackend <|-- LocalBackend + StorageBackend <|-- S3Backend + StorageBackend <|-- HttpsBackend + LocalBackend --> ChunkedLocalStrategy : execution_strategy + S3Backend --> ChunkedRemoteStrategy : execution_strategy + HttpsBackend --> ChunkedRemoteStrategy : execution_strategy + +} + +package "activestorage.reductionist (v2 API)" { + + class ReductionistAPI { + + reduce_chunk( + session, + server, + source, + offset, size, + ..., + axis: tuple | None, + operation: str, + ...) + -- + **Axis Handling:** + - Performs reduction on remote server + - Axis=(0,1): reduces first 2 dims + - Axis=None: returns scalar + - Returns shaped result for chunked reductions + } + + note right of ReductionistAPI + **V2 Protocol Features:** + - Added axis parameter support + - Axis-aware server-side reduction + - Enables efficient dimensionality reduction + - Works with all compression/filter types + end note +} + +S3Backend --> ReductionistAPI : calls with\naxis=request.axis +HttpsBackend --> ReductionistAPI : calls with\naxis=request.axis + + + + +@enduml diff --git a/environment.yml b/environment.yml index 7fafb21a..61efdd1b 100644 --- a/environment.yml +++ b/environment.yml @@ -5,30 +5,27 @@ channels: - nodefaults dependencies: - - python >=3.9 - - dask !=2024.8.0 # github.com/dask/dask/issues/11296 + - python >=3.10 + - pyfive >=1.1.1 + - cbor2 - fsspec - h5netcdf - - h5py # needed by Kerchunk - - kerchunk >=0.2.4 # issues with numcodecs in 0.2.3 and API change in 0.2.4 - netcdf4 - 
numcodecs >=0.12 # github.com/valeriupredoi/PyActiveStorage/issues/162 - - numpy !=1.24.3 # severe masking bug + - numpy >=2 - pip !=21.3 - requests - s3fs >=2024.2.0 # loose s3fs deps leading to old aiobotocore for <2024.2.0 - # pin Zarr to avoid using old KVStore interface - # see github.com/zarr-developers/zarr-python/issues/1362 - - zarr >=2.13.6 # KVStore to FSStore + - ujson # Python packages for testing + - flake8 - moto # mock S3 tests - pytest - pytest-cov >=2.10.1 - pytest-html !=2.1.0 - pytest-metadata >=1.5.1 - pytest-xdist - # Python packages needed for building docs - # re-add when we deploy the docs - # - autodocsumm >=0.2.2 - # - sphinx >=5 - # - sphinx_rtd_theme + # docs + - autodocsumm + - sphinx + - sphinx_rtd_theme diff --git a/figs/docs4understanding/active_refactor_v7_core.svg b/figs/docs4understanding/active_refactor_v7_core.svg new file mode 100644 index 00000000..5def3533 --- /dev/null +++ b/figs/docs4understanding/active_refactor_v7_core.svg @@ -0,0 +1 @@ +activestoragecore«dataclass»SelectionRequest+uri : str+variable : str+selection : tuple+method : str+axis : tuple+missing : MissingAttributes+compressor : object+filters : list«dataclass»ChunkRequest+uri : str+offset : int+size : int+dtype : str+chunks : tuple+order : str+compressor : object+filters : list+chunk_selection : tuple+missing : MissingAttributes+method : str+axis : tuple«dataclass»MissingAttributes+fill_value+missing_value+valid_min+valid_max«dataclass»ChunkMetadata+dtype+shape : tuple+chunks : tuple+compressor+filters : list+order : str+filename : str«dataclass»ChunkResult+data : np.ma.array+count : int+out_selection : tuple«dataclass»SelectionResult+data : np.ma.array | dict+n : intStorageFormat+open(uri, storage_options)+get_variable(ncvar) : dataset+get_missing_attributes() : MissingAttributes+get_indexer(selection) : indexer+get_chunk_metadata() : ChunkMetadata+close()StorageBackend+supported_methods : set+execution_strategy : 
ExecutionStrategy+get_session()+close_session(session)+reduce_chunk(request: ChunkRequest) : ChunkResult+reduce_selection(request: SelectionRequest) : SelectionResultExecutionStrategy+execute(backend, session,chunk_metadata, indexer,missing, method,need_counts, axis) : array | dict«function»_select_backend(interface_type, version)Returns StorageBackend subclassfrom _BACKENDS dict.«function»_select_format(dataset)Returns StorageFormat subclassfrom _FORMATS dict._BACKENDS(None, 1/2) => LocalBackend("s3", 1/2) => S3Backend("https", 2) => HttpsBackend("p5rem", *) => P5RemBackend_FORMATSpyfive.Dataset => PyfiveFormatzarr.Array => ZarrFormatp5rem.File => P5RemFormat".nc"/".he5" => PyfiveFormat".zarr" => ZarrFormat".kerchunk" => KerchunkFormatActive-_methods : dict-_method : str-_axis : tuple-_components : bool-_backend : StorageBackend-_format : StorageFormat+register_method(name, func)+method : property+components : property+min(axis) : self+max(axis) : self+mean(axis) : self+sum(axis) : self+__getitem__(index)-_get_vanilla(index)-_from_storage(...)-_mask_data(data)_backend_format__init____init__delegates via backend.execution_strategyexecution_strategyconsumesproducesconsumesproducesproducesproduces \ No newline at end of file diff --git a/figs/docs4understanding/axis_flow_v1.svg b/figs/docs4understanding/axis_flow_v1.svg new file mode 100644 index 00000000..8c86a6a5 --- /dev/null +++ b/figs/docs4understanding/axis_flow_v1.svg @@ -0,0 +1 @@ +Axis Parameter Flow Through Reduction Pipeline (Path-Specific)Axis Parameter Flow Through Reduction Pipeline 
(Path-Specific)UserActiveActiveActiveExecutionStrategyChunkedLocalStrategyChunkedRemoteStrategyWholeArrayStrategyStorageBackendStorageBackendReductionistUserUserActive.min/.max/.mean/.sum(axis=N)Active.min/.max/.mean/.sum(axis=N)Active._get_selection()Active._get_selection()Active._from_storage()Active._from_storage()ExecutionStrategy.execute()ExecutionStrategy.execute()ChunkedLocalStrategy.execute()ChunkedLocalStrategy.execute()ChunkedRemoteStrategy.execute()ChunkedRemoteStrategy.execute()WholeArrayStrategy.execute()WholeArrayStrategy.execute()StorageBackend.reduce_chunk()StorageBackend.reduce_chunk()StorageBackend.reduce_selection()StorageBackend.reduce_selection()Reductionistv2 APIReductionistv2 APIactive.mean(axis=0)[:]axis parameter stored inActive._axis = (axis,) if intor = axis if tuple/None_get_selection(selection)Sets axis from Internal stateaxis = self._axisif axis is None:axis = tuple(range(ndim))_from_storage(..., axis=axis)strategy.execute(..., axis=axis)alt[Local chunked path (LocalBackend + ChunkedLocalStrategy)]execute(..., axis=axis)for chunk in indexer:ChunkRequest(axis=axis)reduce_chunk(request)ChunkResult(data, count)accumulate chunk outputsand finalize reductionfinal_result[Remote chunked path (S3/Https + ChunkedRemoteStrategy)]execute(..., axis=axis)for chunk in indexer:ChunkRequest(axis=axis)reduce_chunk(request)S3Backend / HttpsBackend:passes axis=request.axisto Reductionist v2 APIreduce_chunk(..., axis=request.axis, operation='mean')Reductionist v2 API:- axis tuple is applied server-side- returns per-chunk reduced result- examples:axis=(0,1) -> shape[2]axis=(2,) -> shape[0,1]axis=None -> scalarChunkResult(data, count)ChunkResult(data, count)accumulate chunk outputsand finalize reductionfinal_result[Whole-array path (P5Rem + WholeArrayStrategy)]execute(..., axis=axis)reduce_selection(SelectionRequest(axis=axis))SelectionResult(data, n)final_resultfinal_resultfinal_resultresults[axis_reduced_shape] \ No newline at end of file diff --git 
a/figs/docs4understanding/axis_support_v1.svg b/figs/docs4understanding/axis_support_v1.svg new file mode 100644 index 00000000..3a9607a7 --- /dev/null +++ b/figs/docs4understanding/axis_support_v1.svg @@ -0,0 +1 @@ +Axis Parameter Support ArchitectureAxis Parameter Support Architectureactivestoragecorebackendsreductionist (v2 API)Active-_axis: tuple | None+min(axis: int | tuple | None) : self+max(axis: int | tuple | None) : self+mean(axis: int | tuple | None) : self+sum(axis: int | tuple | None) : self-_get_selection(selection) : array-_from_storage(..., axis: tuple) : arrayAxis Flow:User calls method(axis=N) →stores in _axis →uses in _get_selection() →passes to _from_storage()ChunkRequest+axis: tuple | NoneAxis contains:-Tuple of dimensions to reduce-Example: (0, 1) to reduce first 2 dims-None means reduce all (scalar result)SelectionRequest+axis: tuple | NoneWhen used:-Whole-array reduction paths-Less common than chunked pathsChunkResult+data: np.ndarray | dict+count: int | NoneReturn values depend on axis:-axis=(0,1): shape reduced by 2 dims-axis=None: scalar or dict of scalars-components=True: returns dict with 'sum' or 'mean' and 'n'Example Usage:// Reduce along first axis onlyactive = Active("file.nc", "temperature")result = active.mean(axis=0)[:] // Result shape: (10,10) if original (100,10,10) // Reduce all axes to scalarresult = active.mean(axis=None)[:]// Result: scalar value // Multiple axesresult = active.sum(axis=(0,2))[:]// Reduces axes 0 and 2, keeps axis 1This is not implementedat the moment, will likelybe implemented first in p5rembackend.StorageBackend+reduce_chunk(request: ChunkRequest) : ChunkResult+reduce_selection(request: SelectionRequest) : SelectionResultAxis Handling:LocalBackend: ignores axis(uses storage.reduce_chunk)S3Backend: passes axis=request.axisto reductionist.reduce_chunk()HttpsBackend: passes axis=request.axisto reductionist.reduce_chunk()LocalBackend+reduce_chunk(request: ChunkRequest)Does NOT use axis parameter(local numpy 
operations on full chunks)S3Backend+reduce_chunk(request: ChunkRequest)Passes axis=request.axisto reductionist.reduce_chunk()HttpsBackend+reduce_chunk(request: ChunkRequest)Passes axis=request.axisto reductionist.reduce_chunk()ReductionistAPI+reduce_chunk(session,server,source,offset, size,...,axis: tuple | None,operation: str,...)Axis Handling:-Performs reduction on remote server-Axis=(0,1): reduces first 2 dims-Axis=None: returns scalar-Returns shaped result for chunked reductionsV2 Protocol Features:- Added axis parameter support- Axis-aware server-side reduction- Enables efficient dimensionality reduction- Works with all compression/filter typescreates viastrategycreates viastrategyreceived frombackendcalls withaxis=request.axiscalls withaxis=request.axis \ No newline at end of file diff --git a/pas_testing_bnl_4vs.tgz b/pas_testing_bnl_4vs.tgz new file mode 100644 index 00000000..cc56bb16 Binary files /dev/null and b/pas_testing_bnl_4vs.tgz differ diff --git a/pyproject.toml b/pyproject.toml index 941ddb78..1ec0da96 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,3 +1,79 @@ [build-system] requires = ["setuptools >= 40.6.0", "wheel", "setuptools_scm>=6.2"] build-backend = "setuptools.build_meta" + +[project] +name = "ActiveStorage" +dynamic = ["version"] +description = "" +readme = {file = "README.md", content-type = "text/markdown"} +classifiers = [ + "Development Status :: 0 - Prototype", + "Environment :: Console", + "Intended Audience :: Developers", + "Intended Audience :: Science/Research", + "Natural Language :: English", + "Operating System :: POSIX :: Linux", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Topic :: Scientific/Engineering", + "Topic :: Scientific/Engineering :: Atmospheric Science", + "Topic :: Scientific/Engineering :: GIS", + "Topic :: Scientific/Engineering 
:: Hydrology", + "Topic :: Scientific/Engineering :: Physics", +] +dependencies = [ + "dask!=2024.8.0", # github.com/dask/dask/issues/11296 + "fsspec", + "h5netcdf", + "h5py", + "kerchunk>=0.2.4", + "netcdf4", + "numcodecs>=0.12", # github/issues/162 + "numpy!=1.24.3", # severe masking bug + "requests", + "s3fs>=2024.2.0", + "zarr>=2.13.3", # github.com/zarr-developers/zarr-python/issues/1362 +] + +[project.optional-dependencies] +test = [ + "moto", + "pytest", + "pytest-cov>=2.10.1", + "pytest-html!=2.1.0", + "pytest-metadata>=1.5.1", + "pytest-xdist", +] + +[tool.setuptools] +packages = ["activestorage"] +include-package-data = true + +[tool.setuptools_scm] + +[tool.pytest.ini_options] +addopts = """ + -m 'not slow' + --ignore=old_code/ + --ignore=tests/s3_exploratory + --ignore=bnl + --cov=activestorage + --cov-report=xml:test-reports/coverage.xml + --cov-report=html:test-reports/coverage_html + --html=test-reports/report.html +""" +markers = [ + "slow: marks tests as slow (deselect with '-m \"not slow\"')", +] + +[tool.coverage.run] +parallel = true + +[tool.coverage.report] +exclude_lines = [ + "if __name__ == .__main__.:", +] diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 1cfa061e..00000000 --- a/setup.cfg +++ /dev/null @@ -1,32 +0,0 @@ -[tool:pytest] -addopts = -# --doctest-modules - --ignore=old_code/ - --ignore=tests/s3_exploratory - --ignore=bnl - --cov=activestorage - --cov-report=xml:test-reports/coverage.xml - --cov-report=html:test-reports/coverage_html - --html=test-reports/report.html - -#[flake8] -#exclude = -# doc/conf.py - -[coverage:run] -parallel = true -[coverage:report] -exclude_lines = - if __name__ == .__main__.: - -#[pydocstyle] -#convention = numpy - -#[isort] -#multi_line_output = 3 -#include_trailing_comma = true - -#[yapf] -#based_on_style = pep8 -# see https://github.com/google/yapf/issues/744 -#blank_line_before_nested_class_or_def = true diff --git a/setup.py b/setup.py deleted file mode 100644 index 
deafdeba..00000000 --- a/setup.py +++ /dev/null @@ -1,100 +0,0 @@ -import json -import os -import re -import sys -from pathlib import Path - -from setuptools import Command, setup - -PACKAGES = [ - 'activestorage', -] - -REQUIREMENTS = { - # Installation script (this file) dependencies - 'setup': [ - 'setuptools_scm', - ], - # Installation dependencies - # Use with pip install . to install from source - 'install': [ - 'dask!=2024.8.0', # github.com/dask/dask/issues/11296 - 'fsspec', - 'h5netcdf', - 'h5py', # needed by Kerchunk - 'kerchunk>=0.2.4', - 'netcdf4', - 'numcodecs>=0.12', # github/issues/162 - 'numpy!=1.24.3', # severe masking bug - 'requests', - 's3fs>=2024.2.0', # see environment.yml for pin reason - # pin Zarr to use new FSStore instead of KVStore - 'zarr>=2.13.3', # github.com/zarr-developers/zarr-python/issues/1362 - # for testing - 'moto', # mock S3 tests - 'pytest', - 'pytest-cov>=2.10.1', - 'pytest-html!=2.1.0', - 'pytest-metadata>=1.5.1', - 'pytest-xdist', - # for documentation - # re-add when we deploy the docs - # 'autodocsumm', - # 'sphinx>=5', - # 'sphinx_rtd_theme', - ], -} - - -def discover_python_files(paths, ignore): - """Discover Python files.""" - - def _ignore(path): - """Return True if `path` should be ignored, False otherwise.""" - return any(re.match(pattern, path) for pattern in ignore) - - for path in sorted(set(paths)): - for root, _, files in os.walk(path): - if _ignore(path): - continue - for filename in files: - filename = os.path.join(root, filename) - if (filename.lower().endswith('.py') - and not _ignore(filename)): - yield filename - - -setup( - name='ActiveStorage', - author="", - description="", - long_description="", - long_description_content_type='text/markdown', - url='', - download_url='', - license='', - classifiers=[ - 'Development Status :: 0 - Prototype', - 'Environment :: Console', - 'Intended Audience :: Developers', - 'Intended Audience :: Science/Research', - 'Natural Language :: English', - 'Operating System 
:: POSIX :: Linux', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.10', - 'Programming Language :: Python :: 3.11', - 'Programming Language :: Python :: 3.12', - 'Programming Language :: Python :: 3.13', - 'Topic :: Scientific/Engineering', - 'Topic :: Scientific/Engineering :: Atmospheric Science', - 'Topic :: Scientific/Engineering :: GIS', - 'Topic :: Scientific/Engineering :: Hydrology', - 'Topic :: Scientific/Engineering :: Physics', - ], - packages=PACKAGES, - # Include all version controlled files - include_package_data=True, - setup_requires=REQUIREMENTS['setup'], - install_requires=REQUIREMENTS['install'], - zip_safe=False, -) diff --git a/test_fillvalue.nc b/test_fillvalue.nc new file mode 100644 index 00000000..74117d32 Binary files /dev/null and b/test_fillvalue.nc differ diff --git a/test_missing.nc b/test_missing.nc new file mode 100644 index 00000000..dc214064 Binary files /dev/null and b/test_missing.nc differ diff --git a/test_validmax.nc b/test_validmax.nc new file mode 100644 index 00000000..b9d4af8e Binary files /dev/null and b/test_validmax.nc differ diff --git a/test_validmin.nc b/test_validmin.nc new file mode 100644 index 00000000..f8d4e1fc Binary files /dev/null and b/test_validmin.nc differ diff --git a/test_validrange.nc b/test_validrange.nc new file mode 100644 index 00000000..b47fc413 Binary files /dev/null and b/test_validrange.nc differ diff --git a/test_vanilla.nc b/test_vanilla.nc new file mode 100644 index 00000000..2816c7ce Binary files /dev/null and b/test_vanilla.nc differ diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 00000000..66173aec --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1 @@ +# Test package diff --git a/activestorage/dummy_data.py b/tests/dummy_data.py similarity index 100% rename from activestorage/dummy_data.py rename to tests/dummy_data.py diff --git a/tests/s3_exploratory/__init__.py b/tests/s3_exploratory/__init__.py new file mode 100644 index 
00000000..4537b13f --- /dev/null +++ b/tests/s3_exploratory/__init__.py @@ -0,0 +1 @@ +# S3 exploratory tests diff --git a/tests/s3_exploratory/test_s3_arrange_files.py b/tests/s3_exploratory/test_s3_arrange_files.py index 8117ce17..b43cd26d 100644 --- a/tests/s3_exploratory/test_s3_arrange_files.py +++ b/tests/s3_exploratory/test_s3_arrange_files.py @@ -7,12 +7,13 @@ import tempfile from activestorage.active import Active -from activestorage.dummy_data import make_vanilla_ncdata +from .. import dummy_data +from ..dummy_data import make_vanilla_ncdata from numpy.testing import assert_allclose, assert_array_equal from pathlib import Path -from config_minio import * +from .config_minio import * # HDF5 chunking is paramount for performance # many small chunks slow down the process by factors of hundreds diff --git a/tests/s3_exploratory/test_s3_performance.py b/tests/s3_exploratory/test_s3_performance.py index c47d2ab7..99e84588 100644 --- a/tests/s3_exploratory/test_s3_performance.py +++ b/tests/s3_exploratory/test_s3_performance.py @@ -6,7 +6,7 @@ from pathlib import Path from activestorage.active import Active -from config_minio import * +from .config_minio import * @pytest.fixture diff --git a/tests/s3_exploratory/test_s3_reduction.py b/tests/s3_exploratory/test_s3_reduction.py index 3546a528..1e99f5e4 100644 --- a/tests/s3_exploratory/test_s3_reduction.py +++ b/tests/s3_exploratory/test_s3_reduction.py @@ -5,14 +5,15 @@ import tempfile from activestorage.active import Active -from activestorage.dummy_data import make_vanilla_ncdata +from .. 
import dummy_data +from ..dummy_data import make_vanilla_ncdata import activestorage.storage as st from activestorage.reductionist import reduce_chunk as reductionist_reduce_chunk from activestorage.reductionist import get_session from numpy.testing import assert_allclose, assert_array_equal from pathlib import Path -from config_minio import * +from .config_minio import * def make_tempfile(): diff --git a/tests/test_bigger_data.py b/tests/test_bigger_data.py index 2173558d..6fe2bd85 100644 --- a/tests/test_bigger_data.py +++ b/tests/test_bigger_data.py @@ -8,10 +8,10 @@ import s3fs from activestorage.active import Active -from activestorage.config import * +from activestorage.config import USE_S3 from pyfive.core import InvalidHDF5File as InvalidHDF5Err -import utils +from . import utils @pytest.fixture @@ -169,12 +169,8 @@ def test_native_emac_model_fails(test_data_path): with pytest.raises(InvalidHDF5Err): active[...] else: - active = Active(uri, "aps_ave") - active._version = 2 - active.method = "mean" - active.components = True with pytest.raises(InvalidHDF5Err): - result2 = active[4:5, 1:2] + active = Active(uri, "aps_ave") def test_cesm2_native(test_data_path): diff --git a/tests/test_byte_order.py b/tests/test_byte_order.py index 5e2148e4..6782bf35 100644 --- a/tests/test_byte_order.py +++ b/tests/test_byte_order.py @@ -4,10 +4,11 @@ from netCDF4 import Dataset from activestorage.active import Active -from activestorage.config import * -from activestorage.dummy_data import make_byte_order_ncdata +from activestorage.config import USE_S3 +from . import dummy_data +from .dummy_data import make_byte_order_ncdata -import utils +from . import utils def check_dataset_byte_order(temp_file: str, ncvar: str, byte_order: str): @@ -36,6 +37,7 @@ def test_byte_order(tmp_path: str, byte_order: str): """ Test use of datasets with different byte orders (endianness). 
""" + test_file = create_byte_order_dataset(tmp_path, byte_order) active = Active(test_file, 'data', utils.get_storage_type()) diff --git a/tests/test_compression.py b/tests/test_compression.py index 0db5ad55..d788d6d3 100644 --- a/tests/test_compression.py +++ b/tests/test_compression.py @@ -6,10 +6,12 @@ from pathlib import Path from activestorage.active import Active, load_from_s3 -from activestorage.config import * -from activestorage.dummy_data import make_compressed_ncdata +from activestorage.config import USE_S3, S3_ACCESS_KEY, S3_SECRET_KEY, S3_URL, S3_ACTIVE_STORAGE_URL +from . import dummy_data +from .dummy_data import make_compressed_ncdata -import utils +from . import utils +import h5py, pyfive def check_dataset_filters(temp_file: str, ncvar: str, compression: str, shuffle: bool): @@ -75,6 +77,23 @@ def test_compression_and_filters(tmp_path: str, compression: str, shuffle: bool) result = active[0:2,4:6,7:9] assert result == 740.0 +@pytest.mark.parametrize('compression', ['zlib']) +@pytest.mark.parametrize('shuffle', [False, True]) +def test_filter_pipeline(tmp_path: str, compression: str, shuffle: bool): + """ + Test the filter pipeline looks the way we expect it to. 
+ """ + test_file = create_compressed_dataset(tmp_path, compression, shuffle) + with pyfive.File(test_file) as pfile: + ds = pfile['data'] + pipeline = ds.compression_opts + + with h5py.File(test_file) as hfile: + ds1 = hfile['data'] + hpipeline = ds1.compression_opts + assert pipeline == hpipeline + + @pytest.mark.parametrize("storage_options, active_storage_url", storage_options_paramlist) def test_compression_and_filters_cmip6_data(storage_options, active_storage_url): diff --git a/tests/test_compression_remote_reductionist.py b/tests/test_compression_remote_reductionist.py index 888dbcff..e5d0d3db 100644 --- a/tests/test_compression_remote_reductionist.py +++ b/tests/test_compression_remote_reductionist.py @@ -6,11 +6,12 @@ from pathlib import Path from activestorage.active import Active, load_from_s3 -from activestorage.config import * -from activestorage.dummy_data import make_compressed_ncdata +from activestorage.config import USE_S3, S3_BUCKET, REMOTE_RED, S3_ACTIVE_STORAGE_URL +from . import dummy_data +from .dummy_data import make_compressed_ncdata from activestorage.reductionist import ReductionistError as RedErr -import utils +from . import utils # Bryan's S3 machine + Bryan's reductionist diff --git a/tests/test_data/test1.nc b/tests/test_data/test1.nc new file mode 100644 index 00000000..1aa6fc51 Binary files /dev/null and b/tests/test_data/test1.nc differ diff --git a/tests/test_harness.py b/tests/test_harness.py index 315a3b46..0924cd5e 100644 --- a/tests/test_harness.py +++ b/tests/test_harness.py @@ -6,10 +6,11 @@ import unittest from activestorage.active import Active -from activestorage.config import * -from activestorage.dummy_data import make_vanilla_ncdata +from activestorage.config import USE_S3 +from . import dummy_data +from .dummy_data import make_vanilla_ncdata -import utils +from . 
import utils def create_test_dataset(tmp_path): diff --git a/tests/test_missing.py b/tests/test_missing.py index 934d9039..5d9b46b4 100644 --- a/tests/test_missing.py +++ b/tests/test_missing.py @@ -16,10 +16,9 @@ from netCDF4 import Dataset from activestorage.active import Active, load_from_s3 -from activestorage.config import * -from activestorage import dummy_data as dd - -import utils +from activestorage.config import USE_S3 +from . import dummy_data as dd +from . import utils def load_dataset(testfile): diff --git a/tests/test_real_https.py b/tests/test_real_https.py new file mode 100644 index 00000000..8ad727a3 --- /dev/null +++ b/tests/test_real_https.py @@ -0,0 +1,135 @@ +import numpy as np +import pytest + +from requests.exceptions import MissingSchema +from activestorage.active import Active, load_from_https + + +def test_https(): + """ + Run a https test with a small enough file for the test + not to be marked as slow. We test all aspects here. + File: https://esgf.ceda.ac.uk/thredds/fileServer/esg_cmip6/CMIP6/CMIP/MOHC/UKESM1-1-LL/piControl/r1i1p1f2/Amon/ta/gn/latest/ta_Amon_UKESM1-1-LL_piControl_r1i1p1f2_gn_274301-274912.nc + Size: 75 MiB, variable: ta + Entire test uses at most 400M RES memory. 
+ """ + test_file_uri = "https://esgf.ceda.ac.uk/thredds/fileServer/esg_cmip6/CMIP6/CMIP/MOHC/UKESM1-1-LL/piControl/r1i1p1f2/Amon/ta/gn/latest/ta_Amon_UKESM1-1-LL_piControl_r1i1p1f2_gn_274301-274912.nc" + active_storage_url = "https://reductionist.jasmin.ac.uk/" # Wacasoft new Reductionist + + # v1: all local + active = Active(test_file_uri, "ta") + active._version = 1 + result = active.min()[0:3, 4:6, 7:9] + print("Result is", result) + assert result == np.array([220.3180694580078], dtype="float32") + + # v2: declared storage type, no activa storage URL + active = Active(test_file_uri, "ta", + interface_type="https", ) + active._version = 2 + with pytest.raises(MissingSchema): + result = active.min()[0:3, 4:6, 7:9] + + # v2: declared storage type + active = Active(test_file_uri, "ta", + interface_type="https", + active_storage_url=active_storage_url, + option_disable_chunk_cache=True) + active._version = 2 + result = active.min()[0:3, 4:6, 7:9] + print("Result is", result) + assert result == np.array([220.3180694580078], dtype="float32") + + # v2: inferred storage type + active = Active(test_file_uri, "ta", + active_storage_url=active_storage_url, + option_disable_chunk_cache=True) + active._version = 2 + result = active.min()[0:3, 4:6, 7:9] + print("Result is", result) + assert result == np.array([220.3180694580078], dtype="float32") + + # set these as fixed floats + f_1 = 176.882080078125 + f_2 = 190.227783203125 + + # v2: inferred storage type, pop axis + active = Active(test_file_uri, "ta", + interface_type="https", + active_storage_url=active_storage_url, + option_disable_chunk_cache=True) + active._version = 2 + result = active.min(axis=(0, 1))[:] + print("Result is", result) + print("Result shape is", result.shape) + assert result.shape == (1, 1, 144, 192) + assert result[0, 0, 0, 0] == f_1 + assert result[0, 0, 143, 191] == f_2 + + # load dataset with Pyfive + dataset = load_from_https(test_file_uri) + av = dataset['ta'] + r_min = np.min(av[:], axis=(0, 
1)) + # NOTE the difference in shapes: + # - Reductionist: (1, 1, 144, 192) + # - numpy: (144, 192) + # Contents is identical though. + print(r_min) + assert r_min[0, 0] == f_1 + assert r_min[143, 191] == f_2 + + # basic auth on; username and password + # should work with both Active and Reductionist but we + # don't have such an NGINX-auth-ed file yet + active = Active(test_file_uri, "ta", + interface_type="https", + storage_options={"username": None, "password": None}, + active_storage_url=active_storage_url, + option_disable_chunk_cache=True) + active._version = 2 + result = active.min(axis=(0, 1))[:] + print("Result is", result) + print("Result shape is", result.shape) + assert result.shape == (1, 1, 144, 192) + assert result[0, 0, 0, 0] == f_1 + assert result[0, 0, 143, 191] == f_2 + + # run with pyfive.Dataset instead of File + dataset = load_from_https(test_file_uri) + av = dataset['ta'] + active = Active(av, + active_storage_url=active_storage_url) + active._version = 2 + print("Interface type", active.interface_type) + result = active.min(axis=(0, 1))[:] + print("Result is", result) + print("Result shape is", result.shape) + assert result.shape == (1, 1, 144, 192) + assert result[0, 0, 0, 0] == f_1 + assert result[0, 0, 143, 191] == f_2 + + +def test_https_implicit_storage_file_not_found(): + """ + Run a true test with a https FILE that is not found. + Code raises a very descriptive exception via fsspec. + Keep test to capture any changes in behaviour. + """ + test_file_uri = "https://esgf.ceda.ac.uk/thredds/fileServer/esg_cmip6/CMIP6/AerChemMIP/MOHC/UKESM1-0-LL/ssp370SST-lowNTCF/r1i1p1f2/Amon/cl/gn/latest/cl_Amon_UKESM1-0-LL_ssp370SST-lowNTCF_r1i1p1f2_gn_205001-209912.ncx" + + with pytest.raises(FileNotFoundError): + active = Active(test_file_uri, "cl") + active._version = 1 + result = active.min()[0:3, 4:6, 7:9] + + +def test_https_implicit_storage_wrong_url(): + """ + Run a true test with a bogus URL. 
+ """ + test_file_uri = "https://esgf.cedacow.ac.uk/thredds/fileServer/esg_cmip6/CMIP6/AerChemMIP/MOHC/UKESM1-0-LL/ssp370SST-lowNTCF/r1i1p1f2/Amon/cl/gn/latest/cl_Amon_UKESM1-0-LL_ssp370SST-lowNTCF_r1i1p1f2_gn_205001-209912.nc" + + with pytest.raises(ValueError): + active = Active(test_file_uri, "cl") + active._version = 1 + result = active.min[0:3, 4:6, 7:9] diff --git a/tests/test_real_https_extras.py b/tests/test_real_https_extras.py new file mode 100644 index 00000000..1b26c7b0 --- /dev/null +++ b/tests/test_real_https_extras.py @@ -0,0 +1,83 @@ +import numpy as np +import pytest + +from activestorage.active import Active, load_from_https + + +@pytest.mark.skip( + reason="save time: test_https_implicit_storage is more general.") +def test_https_v1(): + """Run a true test with a https FILE.""" + test_file_uri = "https://esgf.ceda.ac.uk/thredds/fileServer/esg_cmip6/CMIP6/AerChemMIP/MOHC/UKESM1-0-LL/ssp370SST-lowNTCF/r1i1p1f2/Amon/cl/gn/latest/cl_Amon_UKESM1-0-LL_ssp370SST-lowNTCF_r1i1p1f2_gn_205001-209912.nc" + + active = Active(test_file_uri, "cl", interface_type="https") + active._version = 1 + result = active.min()[0:3, 4:6, 7:9] + print("Result is", result) + assert result == np.array([0.6909787], dtype="float32") + + +@pytest.mark.skip(reason="save time: 2xdata = 2xtime compared to test_https.") +def test_https_v1_100years_file(): + """Run a true test with a https FILE.""" + test_file_uri = "https://esgf.ceda.ac.uk/thredds/fileServer/esg_cmip6/CMIP6/CMIP/MOHC/UKESM1-1-LL/historical/r1i1p1f2/Amon/pr/gn/latest/pr_Amon_UKESM1-1-LL_historical_r1i1p1f2_gn_195001-201412.nc" + active = Active(test_file_uri, "pr") + active._version = 1 + result = active.min()[0:3, 4:6, 7:9] + print("Result is", result) + assert result == np.array([5.4734613e-07], dtype="float32") + + +@pytest.mark.slow +def test_https_bigger_file(): + """Run a true test with a https FILE.""" + test_file_uri = 
"https://esgf.ceda.ac.uk/thredds/fileServer/esg_cmip6/CMIP6/AerChemMIP/MOHC/UKESM1-0-LL/ssp370SST-lowNTCF/r1i1p1f2/Amon/cl/gn/latest/cl_Amon_UKESM1-0-LL_ssp370SST-lowNTCF_r1i1p1f2_gn_205001-209912.nc" + active_storage_url = "https://reductionist.jasmin.ac.uk/" # Wacasoft new Reductionist + active = Active(test_file_uri, "cl", + active_storage_url=active_storage_url, + option_disable_chunk_cache=True) + active._version = 2 + result = active.min()[0:3, 4:6, 7:9] + print("Result is", result) + assert result == np.array([0.6909787], dtype="float32") + + +@pytest.mark.slow +def test_https_implicit_storage(): + """Run a true test with a https FILE.""" + test_file_uri = "https://esgf.ceda.ac.uk/thredds/fileServer/esg_cmip6/CMIP6/AerChemMIP/MOHC/UKESM1-0-LL/ssp370SST-lowNTCF/r1i1p1f2/Amon/cl/gn/latest/cl_Amon_UKESM1-0-LL_ssp370SST-lowNTCF_r1i1p1f2_gn_205001-209912.nc" + + active = Active(test_file_uri, "cl") + active._version = 1 + result = active.min()[0:3, 4:6, 7:9] + print("Result is", result) + assert result == np.array([0.6909787], dtype="float32") + + +@pytest.mark.skip( + reason="save time: test_https_dataset_implicit_storage is more general.") +def test_https_dataset(): + """Run a true test with a https DATASET.""" + test_file_uri = "https://esgf.ceda.ac.uk/thredds/fileServer/esg_cmip6/CMIP6/AerChemMIP/MOHC/UKESM1-0-LL/ssp370SST-lowNTCF/r1i1p1f2/Amon/cl/gn/latest/cl_Amon_UKESM1-0-LL_ssp370SST-lowNTCF_r1i1p1f2_gn_205001-209912.nc" + dataset = load_from_https(test_file_uri) + av = dataset['cl'] + + active = Active(av, interface_type="https") + active._version = 1 + result = active.min()[0:3, 4:6, 7:9] + print("Result is", result) + assert result == np.array([0.6909787], dtype="float32") + + +@pytest.mark.slow +def test_https_dataset_implicit_storage(): + """Run a true test with a https DATASET.""" + test_file_uri = 
"https://esgf.ceda.ac.uk/thredds/fileServer/esg_cmip6/CMIP6/AerChemMIP/MOHC/UKESM1-0-LL/ssp370SST-lowNTCF/r1i1p1f2/Amon/cl/gn/latest/cl_Amon_UKESM1-0-LL_ssp370SST-lowNTCF_r1i1p1f2_gn_205001-209912.nc" + dataset = load_from_https(test_file_uri) + av = dataset['cl'] + + active = Active(av) + active._version = 1 + result = active.min()[0:3, 4:6, 7:9] + print("Result is", result) + assert result == np.array([0.6909787], dtype="float32") diff --git a/tests/test_real_s3.py b/tests/test_real_s3.py new file mode 100644 index 00000000..a0d52fae --- /dev/null +++ b/tests/test_real_s3.py @@ -0,0 +1,156 @@ +import os + +import numpy as np +import pytest + +from activestorage.active import Active, load_from_s3 +from activestorage.reductionist import ReductionistError + + +S3_BUCKET = "bnl" + + +def test_anon_s3(): + """Test a very basic but real S3 ANON access.""" + active_storage_url = "https://reductionist.jasmin.ac.uk/" # Wacasoft + bigger_file = "CMIP6-test.nc" # tas; 15 (time) x 143 x 144 + + test_file_uri = os.path.join( + "esmvaltool-zarr", + bigger_file + ) + print("S3 Test file path:", test_file_uri) + + # no secrets - just living life + active = Active(test_file_uri, 'tas', + storage_options={ + "anon": True, + 'client_kwargs': { + 'endpoint_url': "https://uor-aces-o.s3-ext.jc.rl.ac.uk" + } + }, + active_storage_url=active_storage_url, + option_disable_chunk_cache=True) + active._version = 2 + with pytest.raises(ReductionistError): + result = active.min()[:] + assert result == 197.69595 + + +def test_s3_small_file(): + """Run an S3 test on a small file.""" + storage_options = { + 'key': "f2d55c6dcfc7618b2c34e00b58df3cef", + 'secret': + "$/'#M{0{/4rVhp%n^(XeX$q@y#&(NM3W1->~N.Q6VP.5[@bLpi='nt]AfH)>78pT", + 'client_kwargs': { + 'endpoint_url': "https://uor-aces-o.s3-ext.jc.rl.ac.uk" + }, + } + active_storage_url = "https://reductionist.jasmin.ac.uk/" # Wacasoft + bigger_file = "CMIP6-test.nc" # tas; 15 (time) x 143 x 144 + + test_file_uri = os.path.join(S3_BUCKET, 
bigger_file) + print("S3 Test file path:", test_file_uri) + active = Active(test_file_uri, + 'tas', + storage_options=storage_options, + active_storage_url=active_storage_url, + option_disable_chunk_cache=True) + active._version = 2 + result = active.min()[0:3, 4:6, 7:9] + print("Result is", result) + assert result == 222.09129333496094 + + +def test_s3_small_dataset(): + """Run an S3 test on a small file.""" + storage_options = { + 'key': "f2d55c6dcfc7618b2c34e00b58df3cef", + 'secret': + "$/'#M{0{/4rVhp%n^(XeX$q@y#&(NM3W1->~N.Q6VP.5[@bLpi='nt]AfH)>78pT", + 'client_kwargs': { + 'endpoint_url': "https://uor-aces-o.s3-ext.jc.rl.ac.uk" + }, + } + active_storage_url = "https://reductionist.jasmin.ac.uk/" # Wacasoft + bigger_file = "CMIP6-test.nc" # tas; 15 (time) x 143 x 144 + + test_file_uri = os.path.join(S3_BUCKET, bigger_file) + # load and pass dataset + dataset = load_from_s3(test_file_uri, storage_options=storage_options) + av = dataset['tas'] + active = Active(av, + storage_options=storage_options, + active_storage_url=active_storage_url, + option_disable_chunk_cache=True) + active._version = 2 + result = active.min()[0:3, 4:6, 7:9] + print("Result is", result) + assert result == 222.09129333496094 + + +@pytest.mark.slow +def test_s3_dataset(): + """Run somewhat as the 'gold' test.""" + storage_options = { + 'key': "f2d55c6dcfc7618b2c34e00b58df3cef", + 'secret': + "$/'#M{0{/4rVhp%n^(XeX$q@y#&(NM3W1->~N.Q6VP.5[@bLpi='nt]AfH)>78pT", + 'client_kwargs': { + 'endpoint_url': "https://uor-aces-o.s3-ext.jc.rl.ac.uk" + }, + } + active_storage_url = "https://reductionist.jasmin.ac.uk/" # Wacasoft + bigger_file = "ch330a.pc19790301-def.nc" # 17GB 64 HDF5 chunks + + test_file_uri = os.path.join(S3_BUCKET, bigger_file) + print("S3 Test file path:", test_file_uri) + + # file: explicit interface_type + active = Active(test_file_uri, + 'UM_m01s16i202_vn1106', + interface_type="s3", + storage_options=storage_options, + active_storage_url=active_storage_url, + 
option_disable_chunk_cache=True) + active._version = 2 + result = active.min()[0:3, 4:6, 7:9] # standardized slice + print("Result is", result) + assert result == 5098.625 + + # file: implicit interface_type + active = Active(test_file_uri, + 'UM_m01s16i202_vn1106', + storage_options=storage_options, + active_storage_url=active_storage_url, + option_disable_chunk_cache=True) + active._version = 2 + result = active.min()[0:3, 4:6, 7:9] # standardized slice + print("Result is", result) + assert result == 5098.625 + + # load dataset + dataset = load_from_s3(test_file_uri, storage_options=storage_options) + av = dataset['UM_m01s16i202_vn1106'] + + # dataset: explicit interface_type + active = Active(av, + interface_type="s3", + storage_options=storage_options, + active_storage_url=active_storage_url, + option_disable_chunk_cache=True) + active._version = 2 + result = active.min()[0:3, 4:6, 7:9] # standardized slice + print("Result is", result) + assert result == 5098.625 + + # dataset: implicit interface_type + active = Active(av, + storage_options=storage_options, + active_storage_url=active_storage_url, + option_disable_chunk_cache=True) + active._version = 2 + result = active.min()[0:3, 4:6, 7:9] # standardized slice + print("Result is", result) + assert result == 5098.625 diff --git a/tests/test_real_s3_with_axes.py b/tests/test_real_s3_with_axes.py new file mode 100644 index 00000000..57f09395 --- /dev/null +++ b/tests/test_real_s3_with_axes.py @@ -0,0 +1,175 @@ +import os +import numpy as np +import pyfive +import pytest + +from activestorage.active import Active + + +S3_BUCKET = "bnl" + +def build_active_test1_file(): + """Run an integration test with real data off S3 but with a small file.""" + storage_options = { + 'key': "f2d55c6dcfc7618b2c34e00b58df3cef", + 'secret': "$/'#M{0{/4rVhp%n^(XeX$q@y#&(NM3W1->~N.Q6VP.5[@bLpi='nt]AfH)>78pT", + 'client_kwargs': {'endpoint_url': "https://uor-aces-o.s3-ext.jc.rl.ac.uk"}, # final proxy + } + active_storage_url = 
"https://reductionist.jasmin.ac.uk/" # Wacasoft new Reductionist + bigger_file = "test1.nc" # tas; 15 (time) x 143 x 144 + + test_file_uri = os.path.join( + S3_BUCKET, + bigger_file + ) + print("S3 Test file path:", test_file_uri) + active = Active(test_file_uri, 'tas', interface_type="s3", + storage_options=storage_options, + active_storage_url=active_storage_url, + option_disable_chunk_cache=True) + + active._version = 2 + + return active + + +def build_active_small_file(): + """Run an integration test with real data off S3 but with a small file.""" + storage_options = { + 'key': "f2d55c6dcfc7618b2c34e00b58df3cef", + 'secret': "$/'#M{0{/4rVhp%n^(XeX$q@y#&(NM3W1->~N.Q6VP.5[@bLpi='nt]AfH)>78pT", + 'client_kwargs': {'endpoint_url': "https://uor-aces-o.s3-ext.jc.rl.ac.uk"}, # final proxy + } + active_storage_url = "https://reductionist.jasmin.ac.uk/" # Wacasoft new Reductionist + bigger_file = "CMIP6-test.nc" # tas; 15 (time) x 143 x 144 + + test_file_uri = os.path.join( + S3_BUCKET, + bigger_file + ) + print("S3 Test file path:", test_file_uri) + active = Active(test_file_uri, 'tas', interface_type="s3", + storage_options=storage_options, + active_storage_url=active_storage_url, + option_disable_chunk_cache=True) + + active._version = 2 + + return active + + +def test_small_file_axis_0_1(): + """Fails: activestorage.reductionist.ReductionistError: Reductionist error: HTTP 502: -""" + active = build_active_small_file() + result = active.min(axis=(0, 1))[:] + print("Reductionist final result", result) + assert min(result[0][0]) == 197.69595 + + +def test_test1_file_axis_0_1(): + """Fails: activestorage.reductionist.ReductionistError: Reductionist error: HTTP 502: -""" + active = build_active_test1_file() + result = active.min(axis=(0, 1))[:] + print("Reductionist final result", result) + assert min(result[0][0]) == 198.82859802246094 + + +def test_small_file_axis_0_1_compare_with_numpy(): + """Fails: activestorage.reductionist.ReductionistError: Reductionist error: 
HTTP 502: -""" + active = build_active_small_file() + result = active.min(axis=(0, 1))[:] + print("Reductionist final result", result) + + # use numpy and local test data + ds = pyfive.File("tests/test_data/CMIP6-test.nc")["tas"] + minarr= np.min(ds[:], axis=(0, 1), keepdims=True) + print(len(minarr)) # 144 + print(min(minarr)) # 197.69595 + assert np.min(result) == np.min(minarr) + np.testing.assert_array_equal(result, minarr) + + +def build_active(): + """Run an integration test with real data off S3.""" + storage_options = { + 'key': "f2d55c6dcfc7618b2c34e00b58df3cef", + 'secret': "$/'#M{0{/4rVhp%n^(XeX$q@y#&(NM3W1->~N.Q6VP.5[@bLpi='nt]AfH)>78pT", + 'client_kwargs': {'endpoint_url': "https://uor-aces-o.s3-ext.jc.rl.ac.uk"}, # final proxy + } + active_storage_url = "https://reductionist.jasmin.ac.uk/" # Wacasoft new Reductionist + bigger_file = "da193a_25_6hr_t_pt_cordex__198807-198807.nc" # m01s30i111 ## older 3GB 30 chunks + + test_file_uri = os.path.join( + S3_BUCKET, + bigger_file + ) + print("S3 Test file path:", test_file_uri) + active = Active(test_file_uri, 'm01s30i111', interface_type="s3", # 'm01s06i247_4', interface_type="s3", + storage_options=storage_options, + active_storage_url=active_storage_url, + option_disable_chunk_cache=True) + + active._version = 2 + + return active + + +## Active loads a 4dim dataset +## Loaded dataset +## default axis arg (when axis=None): 'axis': (0, 1, 2, 3) + +def test_no_axis(): + """ + Fails: it should pass: 'axis': (0, 1, 2, 3) default + are fine! 
+ + activestorage.reductionist.ReductionistError: Reductionist error: HTTP 400: {"error": {"message": "request data is not valid", "caused_by": ["__all__: Validation error: Number of reduction axes must be less than length of shape - to reduce over all axes omit the axis field completely [{}]"]}} + """ + active = build_active() + result = active.min()[:] + assert result == [[[[164.8125]]]] + + +@pytest.mark.skip(reason="HIGHMEM: Reductionist returns a lot of response") +# TODO this test gobbles large amounts of memory - it shouldn't - it should +# perform like a standard global min - return a single number +def test_no_axis_2(): + """ + Fails: it should pass: 'axis': (0, 1, 2, 3) default + are fine! Just as no axis is defined - global stats returned. + """ + active = build_active() + result = active.min(axis=())[:] + assert result == [[[[164.8125]]]] + + +@pytest.mark.skip(reason="HIGHMEM: Reductionist returns a lot of response") +# TODO test on a machine with lots of memory +def test_axis_0(): + active = build_active() + result = active.min(axis=(0, ))[:] + assert result == [[[[164.8125]]]] + + +def test_axis_0_1(): + """Passes fine.""" + active = build_active() + result = active.min(axis=(0, 1))[:] + assert result.shape == (1, 1, 324, 432) + assert result[0, 0, 0, 0] == 173.39794921875 + assert result[0, 0, 0, 431] == 173.395263671875 + + +@pytest.mark.skip(reason="HIGHMEM: Reductionist returns a lot of response") +# TODO test on a machine with lots of memory +def test_axis_1(): + active = build_active() + result = active.min(axis=(1, ))[:] + assert result == [[[[164.8125]]]] + + +def test_axis_0_1_2(): + """Passes fine.""" + active = build_active() + result = active.min(axis=(0, 1, 2))[:] + assert result[0][0][0][0] == 171.05126953125 diff --git a/tests/test_reductionist_json.py b/tests/test_reductionist_json.py index c7cc09c0..e8ed0255 100644 --- a/tests/test_reductionist_json.py +++ b/tests/test_reductionist_json.py @@ -1,16 +1,17 @@ -import pyfive -from 
activestorage.active import Active, get_missing_attributes -from activestorage.hdf2numcodec import decode_filters +import json + import numpy as np +import pyfive +from .test_bigger_data import save_cl_file_with_a from activestorage import reductionist -from activestorage.active import load_from_s3 -from activestorage.config import * -from test_bigger_data import save_cl_file_with_a +from activestorage.active import Active, get_missing_attributes, load_from_s3 +from activestorage.config import USE_S3 +from activestorage.hdf2numcodec import decode_filters -import json class MockActive: + def __init__(self, f, v): if USE_S3: self.f = load_from_s3(f) @@ -18,15 +19,18 @@ def __init__(self, f, v): self.f = pyfive.File(f) ds = self.f[v] self.dtype = np.dtype(ds.dtype) - self.array = pyfive.indexing.ZarrArrayStub(ds.shape, ds.chunks or ds.shape) + self.array = pyfive.indexing.ZarrArrayStub(ds.shape, ds.chunks + or ds.shape) self.missing = get_missing_attributes(ds) ds = ds.id self.ds = ds + def __getitem__(self, args): if self.ds.filter_pipeline is None: compressor, filters = None, None else: - compressor, filters = decode_filters(self.ds.filter_pipeline , self.dtype.itemsize, 'a') + compressor, filters = decode_filters(self.ds.filter_pipeline, + self.dtype.itemsize, 'a') if self.ds.chunks is not None: self.ds._get_chunk_addresses() @@ -34,15 +38,17 @@ def __getitem__(self, args): for chunk_coords, chunk_selection, out_selection in indexer: storeinfo = self.ds.get_chunk_info_from_chunk_coord(chunk_coords) offset, size = storeinfo.byte_offset, storeinfo.size - jd = reductionist.build_request_data('a','b','c', - offset, size, compressor, filters, self.missing, self.dtype, - self.array._chunks,self.ds._order,chunk_selection) + jd = reductionist.build_request_data( + 'a', offset, size, compressor, filters, self.missing, + self.dtype, self.array._chunks, self.ds._order, + chunk_selection, tuple(range(len(self.array._chunks)))) js = json.dumps(jd) return None + def 
test_build_request(tmp_path): ncfile, v = save_cl_file_with_a(tmp_path), 'cl' - A = MockActive(ncfile,v) + A = MockActive(ncfile, v) x = A[4:5, 1:2] # not interested in what is returned, checking that the request builds ok diff --git a/tests/unit/__init__.py b/tests/unit/__init__.py new file mode 100644 index 00000000..4a5d2636 --- /dev/null +++ b/tests/unit/__init__.py @@ -0,0 +1 @@ +# Unit tests package diff --git a/tests/unit/test_active.py b/tests/unit/test_active.py index 25f988cc..0d2dd838 100644 --- a/tests/unit/test_active.py +++ b/tests/unit/test_active.py @@ -5,7 +5,7 @@ from activestorage.active import Active from activestorage.active import load_from_s3 -from activestorage.config import * +from activestorage.config import USE_S3 from botocore.exceptions import EndpointConnectionError as botoExc from botocore.exceptions import NoCredentialsError as NoCredsExc diff --git a/tests/unit/test_active_axis.py b/tests/unit/test_active_axis.py new file mode 100644 index 00000000..aa502842 --- /dev/null +++ b/tests/unit/test_active_axis.py @@ -0,0 +1,148 @@ +import itertools + +import netCDF4 +import numpy as np +import pytest + +from activestorage.active import Active + + +def axis_combinations(ndim): + """Create axes permutations""" + return [None] + [ + axes for n in range(1, ndim + 1) + for axes in itertools.permutations(range(ndim), n) + ] + + +rfile = "tests/test_data/test1.nc" +ncvar = 'tas' +# Dimensions +# netcdf test1 { +# dimensions: +# time = 12 ; +# bounds = 2 ; +# lat = 64 ; +# lon = 128 ; +ref = netCDF4.Dataset(rfile)[ncvar][...] 
+ + +@pytest.mark.parametrize("index", ( + Ellipsis, + (slice(6, 7), slice(None), slice(None)), + (slice(None), slice(0, 64, 3), slice(None)), + (slice(None), slice(None), slice(0, 128, 4)), + (slice(6, 7), slice(0, 64, 3), slice(0, 128, 4)), + (slice(1, 11, 2), slice(0, 64, 3), slice(0, 128, 4)), + (slice(None), [0, 1, 5, 7, 30, 31], slice(None)), + (slice(None), [0, 1, 5, 7, 30, 31, 50, 51, 53], slice(None)), +)) +def test_active_axis_reduction(index): + """Unit test for class:Active axis combinations.""" + for axis in axis_combinations(ref.ndim): + for method, numpy_func in zip( + ("mean", "sum", "min", "max"), + (np.ma.mean, np.ma.sum, np.ma.min, np.ma.max)): + print(axis, index, method) + + r = numpy_func(ref[index], axis=axis, keepdims=True) + + active = Active(rfile, ncvar, axis=axis) + active.method = method + x = active[index] + + assert x.shape == r.shape + assert (x.mask == r.mask).all() + assert np.ma.allclose(x, r) + + # Test dictionary components output + # re-add method + active.components = True + active.method = method + + rn = np.ma.count(ref[index], axis=axis, keepdims=True) + + x = active[index] + + xn = x["n"] + assert xn.shape == rn.shape + assert (xn == rn).all() + + if method == "mean": + method = "sum" + r = np.ma.sum(ref[index], axis=axis, keepdims=True) + + x = x[method] + assert x.shape == r.shape + assert (x.mask == r.mask).all() + assert np.ma.allclose(x, r) + + +def test_active_axis_format_1(): + """Unit test for class:Active axis format.""" + active1 = Active(rfile, ncvar, axis=[0, 2]) + active2 = Active(rfile, ncvar, axis=(-1, -3)) + + x1 = active2.mean()[...] + x2 = active2.mean()[...] + + assert x1.shape == x2.shape + assert (x1.mask == x2.mask).all() + assert np.ma.allclose(x1, x2) + + +def test_active_axis_format_new_api(): + """Unit test for class:Active axis format with Numpy-style API.""" + active1 = Active(rfile, ncvar) + active2 = Active(rfile, ncvar) + + x1 = active2.mean(axis=(0, 2))[...] 
+ assert active2._axis == (0, 2) + x2 = active2.mean(axis=(-1, -3))[...] + assert active2._axis == (-1, -3) + + assert x1.shape == x2.shape + assert (x1.mask == x2.mask).all() + assert np.ma.allclose(x1, x2) + + xmin = active2.min(axis=(0, 2))[...] + xmax = active2.max(axis=(0, 2))[...] + assert xmin[0][0][0] == 209.44680786132812 + assert xmax[0][0][0] == 255.54661560058594 + + # use offline old case + active2._version = 1 + xmin = active2.min(axis=(0, 1))[...] + assert xmin[0][0][0] == 217.1494140625 + + +def test_active_axis_format_new_api_cmip6_file(): + """Unit test for class:Active axis format with Numpy-style API.""" + # file is NOT chunked = 1 chunk + cmip6_file = "tests/test_data/CMIP6-test.nc" + ncvar = 'tas' + active = Active(cmip6_file, ncvar) + active._version = 1 + xmin = active.min(axis=(0, 1))[...] + assert xmin[0][0][0] == 206.40918 + + +def test_active_axis_format_2(): + """Unit test for class:Active axis format.""" + # Disallow out-of-range axes + active = Active(rfile, ncvar, axis=(0, 3)) + active.method = "mean" + + with pytest.raises(ValueError): + active[...] + + +def test_active_axis_index(): + """Unit test for class:Active axis format.""" + # Disallow reductions when the index drops an axis (i.e. index + # contains an integer) + active = Active(rfile, ncvar) + active.method = "mean" + + with pytest.raises(IndexError): + active[0] diff --git a/tests/unit/test_dummy_data.py b/tests/unit/test_dummy_data.py index ff2a7fba..6b135360 100644 --- a/tests/unit/test_dummy_data.py +++ b/tests/unit/test_dummy_data.py @@ -2,7 +2,7 @@ import numpy as np import pytest -from activestorage import dummy_data as dd +from .. 
import dummy_data as dd from netCDF4 import Dataset diff --git a/tests/unit/test_reductionist.py b/tests/unit/test_reductionist.py index 043ccbd6..5d850f7e 100644 --- a/tests/unit/test_reductionist.py +++ b/tests/unit/test_reductionist.py @@ -1,24 +1,30 @@ import os +import sys +from unittest import mock + +import cbor2 as cbor import numcodecs import numpy as np import pytest import requests -import sys -from unittest import mock from activestorage import reductionist def make_response(content, status_code, dtype=None, shape=None, count=None): - response = requests.Response() - response._content = content - response.status_code = status_code + reduction_result = { + "bytes": content + } if dtype: - response.headers["x-activestorage-dtype"] = dtype + reduction_result["dtype"] = dtype if shape: - response.headers["x-activestorage-shape"] = shape + reduction_result["shape"] = shape if count: - response.headers["x-activestorage-count"] = count + reduction_result["count"] = count + print("Reduction result", reduction_result) + response = requests.Response() + response._content = cbor.dumps(reduction_result) + response.status_code = status_code return response @@ -26,7 +32,7 @@ def make_response(content, status_code, dtype=None, shape=None, count=None): def test_reduce_chunk_defaults(mock_request): """Unit test for reduce_chunk with default arguments.""" result = np.int32(134351386) - response = make_response(result.tobytes(), 200, "int32", "[]", "2") + response = make_response(result.tobytes(), 200, "int32", [], 2) mock_request.return_value = response active_url = "https://s3.example.com" @@ -44,6 +50,7 @@ def test_reduce_chunk_defaults(mock_request): dtype = np.dtype("int32") shape = None order = None + axis = None chunk_selection = None operation = "min" @@ -52,42 +59,38 @@ def test_reduce_chunk_defaults(mock_request): # FIXME this is hacky and comes from peasantly setting the cacert to False in reductionist.py assert not session.verify - tmp, count = 
reductionist.reduce_chunk(session, active_url, - s3_url, bucket, object, offset, - size, compression, filters, missing, - dtype, shape, order, - chunk_selection, operation) + tmp, count = reductionist.reduce_chunk(session, active_url, s3_url, + offset, size, compression, + filters, missing, dtype, shape, + order, chunk_selection, axis, + operation) assert tmp == result assert count == 2 - expected_url = f"{active_url}/v1/{operation}/" + expected_url = f"{active_url}/v2/{operation}/" expected_data = { - "source": s3_url, - "bucket": bucket, - "object": object, + "interface_type": "s3", + "url": s3_url, "dtype": "int32", - 'offset':0, - 'size':0, "byte_order": sys.byteorder, + 'offset': 0, + 'size': 0, } mock_request.assert_called_once_with(session, expected_url, expected_data) -@pytest.mark.parametrize( - "compression, filters", - [ - ( - numcodecs.Zlib(), - [numcodecs.Shuffle()], - ), - ] -) +@pytest.mark.parametrize("compression, filters", [ + ( + numcodecs.Zlib(), + [numcodecs.Shuffle()], + ), +]) @mock.patch.object(reductionist, 'request') def test_reduce_chunk_compression(mock_request, compression, filters): """Unit test for reduce_chunk with compression and filter arguments.""" result = np.int32(134351386) - response = make_response(result.tobytes(), 200, "int32", "[]", "2") + response = make_response(result.tobytes(), 200, "int32", [], 2) mock_request.return_value = response active_url = "https://s3.example.com" @@ -103,78 +106,104 @@ def test_reduce_chunk_compression(mock_request, compression, filters): dtype = np.dtype("int32") shape = (32, ) order = "C" + axis = (0, ) chunk_selection = [slice(0, 2, 1)] operation = "min" session = reductionist.get_session(access_key, secret_key, cacert) assert session.verify == cacert - tmp, count = reductionist.reduce_chunk(session, active_url, - s3_url, bucket, object, offset, - size, compression, filters, missing, - dtype, shape, order, - chunk_selection, operation) + tmp, count = reductionist.reduce_chunk(session, 
active_url, s3_url, + offset, size, compression, + filters, missing, dtype, shape, + order, chunk_selection, axis, + operation) assert tmp == result assert count == 2 - expected_url = f"{active_url}/v1/{operation}/" + expected_url = f"{active_url}/v2/{operation}/" expected_data = { - "source": s3_url, - "bucket": bucket, - "object": object, - "dtype": "int32", - "byte_order": sys.byteorder, - "offset": offset, - "size": size, - "order": order, - "shape": shape, - "selection": [[chunk_selection[0].start, - chunk_selection[0].stop, - chunk_selection[0].step]], - "compression": {"id": compression.codec_id}, - "filters": [{"id": filter.codec_id, "element_size": filter.elementsize} - for filter in filters], + "interface_type": "s3", + "url": + s3_url, + "dtype": + "int32", + "byte_order": + sys.byteorder, + "offset": + offset, + "size": + size, + "order": + order, + "shape": + shape, + "selection": [[ + chunk_selection[0].start, chunk_selection[0].stop, + chunk_selection[0].step + ]], + "compression": { + "id": compression.codec_id + }, + "filters": [{ + "id": filter.codec_id, + "element_size": filter.elementsize + } for filter in filters], + "axis": + axis, } mock_request.assert_called_once_with(session, expected_url, expected_data) -@pytest.mark.parametrize( - "missing", - [ - ( - (np.float32(42.), None, None, None), - {"missing_value": np.float64(42.)}, - ), - ( - (None, np.float32(-42.), None, None), - {"missing_value": np.float64(-42.)}, - ), - ( - (None, [np.float32(42.), np.float32(-42.)], None, None), - {"missing_values": [np.float64(42.), np.float64(-42.)]}, - ), - ( - (None, None, np.float32(-1e6), None), - {"valid_min": np.float64(np.float32(-1e6))}, - ), - ( - (None, None, None, np.float32(1e6)), - {"valid_max": np.float64(np.float32(1e6))}, - ), - ( - (None, None, np.float32(-1e6), np.float32(1e6)), - {"valid_range": [np.float64(np.float32(-1e6)), np.float64(np.float32(1e6))]}, - ), - ] -) +@pytest.mark.parametrize("missing", [ + ( + (np.float32(42.), None, 
None, None), + { + "missing_value": np.float64(42.) + }, + ), + ( + (None, np.float32(-42.), None, None), + { + "missing_value": np.float64(-42.) + }, + ), + ( + (None, [np.float32(42.), np.float32(-42.)], None, None), + { + "missing_values": [np.float64(42.), + np.float64(-42.)] + }, + ), + ( + (None, None, np.float32(-1e6), None), + { + "valid_min": np.float64(np.float32(-1e6)) + }, + ), + ( + (None, None, None, np.float32(1e6)), + { + "valid_max": np.float64(np.float32(1e6)) + }, + ), + ( + (None, None, np.float32(-1e6), np.float32(1e6)), + { + "valid_range": + [np.float64(np.float32(-1e6)), + np.float64(np.float32(1e6))] + }, + ), +]) @mock.patch.object(reductionist, 'request') def test_reduce_chunk_missing(mock_request, missing): """Unit test for reduce_chunk with missing data.""" reduce_arg, api_arg = missing result = np.float32(-42.) - response = make_response(result.tobytes(), 200, "float32", "[]", "2") + response = make_response(result.tobytes(), 200, "float32", [], 2) mock_request.return_value = response active_url = "https://s3.example.com" @@ -192,34 +221,45 @@ def test_reduce_chunk_missing(mock_request, missing): dtype = np.dtype("float32").newbyteorder() shape = (32, ) order = "C" + axis = (0, ) chunk_selection = [slice(0, 2, 1)] operation = "min" session = reductionist.get_session(access_key, secret_key, cacert) tmp, count = reductionist.reduce_chunk(session, active_url, s3_url, - bucket, object, offset, size, - compression, filters, missing, - dtype, shape, order, - chunk_selection, operation) + offset, size, compression, + filters, missing, dtype, shape, + order, chunk_selection, axis, + operation) assert tmp == result assert count == 2 - expected_url = f"{active_url}/v1/{operation}/" + expected_url = f"{active_url}/v2/{operation}/" expected_data = { - "source": s3_url, - "bucket": bucket, - "object": object, - "dtype": "float32", - "byte_order": "little" if sys.byteorder == "big" else "big", - "offset": offset, - "size": size, - "order": order, - 
"shape": shape, - "selection": [[chunk_selection[0].start, - chunk_selection[0].stop, - chunk_selection[0].step]], - "missing": api_arg, + "interface_type": "s3", + "url": + s3_url, + "dtype": + "float32", + "byte_order": + "little" if sys.byteorder == "big" else "big", + "offset": + offset, + "size": + size, + "order": + order, + "shape": + shape, + "selection": [[ + chunk_selection[0].start, chunk_selection[0].stop, + chunk_selection[0].step + ]], + "missing": + api_arg, + "axis": + axis, } mock_request.assert_called_once_with(session, expected_url, expected_data) @@ -229,6 +269,7 @@ def test_reduce_chunk_not_found(mock_request): """Unit test for reduce_chunk testing 404 response.""" result = b'"Not found"' response = make_response(result, 404) + print("Response", response, type(response)) mock_request.return_value = response active_url = "https://s3.example.com" @@ -245,16 +286,17 @@ def test_reduce_chunk_not_found(mock_request): missing = [] dtype = np.dtype("int32") shape = (32, ) + axis = (0, ) order = "C" chunk_selection = [slice(0, 2, 1)] operation = "min" session = reductionist.get_session(access_key, secret_key, cacert) with pytest.raises(reductionist.ReductionistError) as exc: - reductionist.reduce_chunk(session, active_url, s3_url, bucket, - object, offset, size, compression, filters, - missing, dtype, shape, order, - chunk_selection, operation) - + reductionist.reduce_chunk(session, active_url, s3_url, + offset, size, compression, filters, missing, + dtype, shape, order, chunk_selection, axis, + operation) - assert str(exc.value) == 'Reductionist error: HTTP 404: "Not found"' + print("Not found exc from reductionist", str(exc.value)) + assert str(exc.value) == 'Reductionist error: HTTP 404: -' diff --git a/tests/unit/test_storage_types.py b/tests/unit/test_storage_types.py index 70b23559..b84410af 100644 --- a/tests/unit/test_storage_types.py +++ b/tests/unit/test_storage_types.py @@ -12,9 +12,11 @@ import activestorage.active +import 
activestorage.backends from activestorage.active import Active -from activestorage.config import * -from activestorage.dummy_data import make_vanilla_ncdata +from activestorage.config import S3_ACTIVE_STORAGE_URL, S3_URL +from .. import dummy_data +from ..dummy_data import make_vanilla_ncdata import activestorage.reductionist import activestorage.storage @@ -36,9 +38,7 @@ def load_from_s3(uri, storage_options=None): def reduce_chunk( session, server, - source, - bucket, - object, + url, offset, size, compressor, @@ -48,7 +48,10 @@ def reduce_chunk( shape, order, chunk_selection, + axis, operation, + interface_type=None, + option_disable_chunk_cache=False, ): return activestorage.storage.reduce_chunk( test_file, @@ -95,8 +98,6 @@ def reduce_chunk( S3_URL, mock.ANY, mock.ANY, - mock.ANY, - mock.ANY, None, None, (None, None, None, None), @@ -104,7 +105,9 @@ def reduce_chunk( mock.ANY, "C", mock.ANY, + axis=(0, 1, 2), operation="max", + interface_type="s3", ) @@ -129,6 +132,32 @@ def load_from_s3(uri, storage_options=None): assert np.max(result) == 999.0 +@mock.patch.object( + activestorage.backends.reductionist, + "decode_result_buffer", + wraps=activestorage.backends.reductionist.decode_result_buffer, +) +@mock.patch.object( + activestorage.backends.reductionist, + "encode_result", + wraps=activestorage.backends.reductionist.encode_result, +) +def test_local_simulate_cbor_response(mock_encode, mock_decode, tmp_path): + """Test local reduction with Reductionist-like CBOR response decoding enabled.""" + test_file = str(tmp_path / "test.nc") + make_vanilla_ncdata(test_file) + + active = Active(test_file, "data", storage_options={"local_simulate_cbor": True}) + active._version = 2 + active._method = "max" + + result = active[::] + + assert result == 999.0 + assert mock_encode.called + assert mock_decode.called + + @pytest.mark.skip(reason="No more valid file load in Active") @mock.patch.object(activestorage.active, "load_from_s3") def test_s3_load_failure(mock_load): 
diff --git a/tests/unit/test_strategies.py b/tests/unit/test_strategies.py new file mode 100644 index 00000000..4f420e2f --- /dev/null +++ b/tests/unit/test_strategies.py @@ -0,0 +1,121 @@ +import threading +import time +from types import SimpleNamespace + +import numpy as np + +from activestorage.core import ChunkMetadata, ChunkResult, MissingAttributes +from activestorage.strategies import ChunkedLocalStrategy, ChunkedRemoteStrategy + + +class FakeFormat: + def get_chunk_offset_size(self, chunk_coords): + return chunk_coords[0], 1 + + +class FakeDimIndexer: + def __init__(self, nchunks): + self.nchunks = nchunks + + +class FakeIndexer: + def __init__(self, nchunks): + self.shape = (nchunks,) + self.dim_indexers = [FakeDimIndexer(nchunks)] + self._nchunks = nchunks + + def __iter__(self): + for i in range(self._nchunks): + yield (i,), (slice(0, 1, 1),), (slice(i, i + 1, 1),) + + +class FakeBackend: + def __init__(self, max_threads): + self._active = SimpleNamespace( + _format=FakeFormat(), + _max_threads=max_threads, + method=np.ma.max, + components=False, + ) + self.thread_ids = set() + + def reduce_chunk(self, request): + self.thread_ids.add(threading.get_ident()) + time.sleep(0.01) + return ChunkResult( + data=np.array([request.offset], dtype=np.float64), + count=1, + out_selection=(), + ) + + +def make_chunk_metadata(): + return ChunkMetadata( + dtype=np.dtype("float64"), + shape=(4,), + chunks=(1,), + compressor=None, + filters=None, + order="C", + filename="fake.nc", + ) + + +def test_chunked_local_strategy_parallel_threads(): + strategy = ChunkedLocalStrategy() + backend = FakeBackend(max_threads=4) + + result = strategy.execute( + backend, + session=None, + chunk_metadata=make_chunk_metadata(), + indexer=FakeIndexer(4), + missing=MissingAttributes(), + method="max", + need_counts=False, + axis=(0,), + ) + + assert result.shape == (1,) + assert result[0] == 3.0 + assert len(backend.thread_ids) > 1 + + +def test_chunked_remote_strategy_parallel_threads(): + 
strategy = ChunkedRemoteStrategy() + backend = FakeBackend(max_threads=4) + + result = strategy.execute( + backend, + session=None, + chunk_metadata=make_chunk_metadata(), + indexer=FakeIndexer(4), + missing=MissingAttributes(), + method="max", + need_counts=False, + axis=(0,), + ) + + assert result.shape == (1,) + assert result[0] == 3.0 + assert len(backend.thread_ids) > 1 + + +def test_chunked_strategy_serial_when_single_thread(): + strategy = ChunkedLocalStrategy() + backend = FakeBackend(max_threads=1) + + result = strategy.execute( + backend, + session=None, + chunk_metadata=make_chunk_metadata(), + indexer=FakeIndexer(4), + missing=MissingAttributes(), + method="max", + need_counts=False, + axis=(0,), + ) + + assert result.shape == (1,) + assert result[0] == 3.0 + assert len(backend.thread_ids) == 1 \ No newline at end of file diff --git a/tests/utils.py b/tests/utils.py index d312cf58..e2cd1096 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -4,7 +4,7 @@ import s3fs -from activestorage.config import * +from activestorage.config import USE_S3, S3_URL, S3_ACCESS_KEY, S3_SECRET_KEY, S3_BUCKET def get_storage_type(): if USE_S3: