diff --git a/README.md b/README.md
index 9eb9493..50613e1 100644
--- a/README.md
+++ b/README.md
@@ -19,6 +19,7 @@ To download all the data used for the benchmark run the following commands:
 uv run python -m climatebenchpress.data_loader.datasets.esa_biomass_cci
 uv run python -m climatebenchpress.data_loader.datasets.cams
 uv run python -m climatebenchpress.data_loader.datasets.ifs_uncompressed
+uv run python -m climatebenchpress.data_loader.datasets.ifs_humidity
 uv run python -m climatebenchpress.data_loader.datasets.nextgems
 uv run python -m climatebenchpress.data_loader.datasets.cmip6.access_ta
 uv run python -m climatebenchpress.data_loader.datasets.cmip6.access_tos
diff --git a/src/climatebenchpress/data_loader/datasets/all.py b/src/climatebenchpress/data_loader/datasets/all.py
index 0f3700e..8b7361a 100644
--- a/src/climatebenchpress/data_loader/datasets/all.py
+++ b/src/climatebenchpress/data_loader/datasets/all.py
@@ -4,5 +4,6 @@
 from .cmip6.all import *
 from .era5 import *
 from .esa_biomass_cci import *
+from .ifs_humidity import *
 from .ifs_uncompressed import *
 from .nextgems import *
diff --git a/src/climatebenchpress/data_loader/datasets/ifs_humidity.py b/src/climatebenchpress/data_loader/datasets/ifs_humidity.py
new file mode 100644
index 0000000..8f51d5c
--- /dev/null
+++ b/src/climatebenchpress/data_loader/datasets/ifs_humidity.py
@@ -0,0 +1,98 @@
+__all__ = ["IFSHumidityDataset"]
+
+import argparse
+from pathlib import Path
+
+import xarray as xr
+
+from .. import (
+    monitor,
+    open_downloaded_canonicalized_dataset,
+    open_downloaded_tiny_canonicalized_dataset,
+)
+from .abc import Dataset
+from .ifs_uncompressed import load_hplp_data, regrid_to_regular
+
+
+class IFSHumidityDataset(Dataset):
+    """Dataset for the humidity field of the uncompressed IFS data.
+
+    Contains data from the [hplp](https://apps.ecmwf.int/ifs-experiments/rd/hplp/)
+    experiment from the Integrated Forecasting System (IFS) model. Crucially,
+    this dataset contains uncompressed 64-bit floating point data.
+    """
+
+    name = "ifs-humidity"
+
+    @staticmethod
+    def download(download_path: Path, progress: bool = True):
+        """Download the ``q`` field, regrid it, and store it as Zarr.
+
+        The data is loaded with ``load_hplp_data``, regridded from the O400
+        grid to a 0.25 x 0.25 grid, and written to ``ifs_humidity.zarr``
+        inside ``download_path``. Nothing is done if the ``download.done``
+        marker already exists in ``download_path``; the marker is created
+        once the Zarr store has been written.
+        """
+        donefile = download_path / "download.done"
+        if donefile.exists():
+            return
+
+        ds = load_hplp_data(leveltype="ml", gridtype="reduced_gg", step=0)
+        ds = ds[["q"]]
+        ds_regridded = regrid_to_regular(
+            ds,
+            in_grid={"grid": "O400"},
+            out_grid={"grid": [0.25, 0.25]},
+        )
+        downloadfile = download_path / "ifs_humidity.zarr"
+        with monitor.progress_bar(progress):
+            ds_regridded.to_zarr(
+                downloadfile, mode="w", encoding=dict(), compute=False
+            ).compute()
+
+        # Mark the download as complete only after the write finished, so
+        # the guard above can skip the download and regridding on later calls.
+        donefile.touch()
+
+    @staticmethod
+    def open(download_path: Path) -> xr.Dataset:
+        """Open the stored Zarr dataset with CF axis attributes set.
+
+        Keeps only the first time step, rechunks so that each chunk spans
+        the full latitude, longitude, and time extent and just over half of
+        the levels, and sets the axis / standard-name attributes.
+        """
+        ds = xr.open_dataset(download_path / "ifs_humidity.zarr")
+        num_levels = ds["level"].size
+        ds = ds.isel(time=slice(0, 1)).chunk(
+            {
+                "latitude": -1,
+                "longitude": -1,
+                "time": -1,
+                "level": (num_levels // 2) + 1,
+            }
+        )
+
+        # Needed to make the dataset CF-compliant.
+        ds.longitude.attrs["axis"] = "X"
+        ds.latitude.attrs["axis"] = "Y"
+        ds.level.attrs["axis"] = "Z"
+        ds.time.attrs["standard_name"] = "time"
+        return ds
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--basepath", type=Path, default=Path())
+    args = parser.parse_args()
+
+    ds = open_downloaded_canonicalized_dataset(
+        IFSHumidityDataset, basepath=args.basepath
+    )
+    open_downloaded_tiny_canonicalized_dataset(
+        IFSHumidityDataset, basepath=args.basepath
+    )
+
+    for v, da in ds.items():
+        print(f"- {v}: {da.dims}")
diff --git a/src/climatebenchpress/data_loader/datasets/ifs_uncompressed.py b/src/climatebenchpress/data_loader/datasets/ifs_uncompressed.py
index adece58..3e0a7d8 100644
--- a/src/climatebenchpress/data_loader/datasets/ifs_uncompressed.py
+++ b/src/climatebenchpress/data_loader/datasets/ifs_uncompressed.py
@@ -126,14 +126,27 @@ def regrid_to_regular(ds, in_grid, out_grid):
     """
     out_data = {var: [] for var in ds.data_vars}
     for var in ds.data_vars:
+        var_has_level = "level" in ds[var].dims
         for time in ds.time:
-            r = earthkit.regrid.interpolate(
-                ds[var].sel(time=time).values,
-                in_grid=in_grid,
-                out_grid=out_grid,
-                method="linear",
-            )
-            out_data[var].append(r)
+            if var_has_level:
+                level_data = []
+                for level in ds[var].level:
+                    r = earthkit.regrid.interpolate(
+                        ds[var].sel(time=time, level=level).values,
+                        in_grid=in_grid,
+                        out_grid=out_grid,
+                        method="linear",
+                    )
+                    level_data.append(r)
+                out_data[var].append(level_data)
+            else:
+                r = earthkit.regrid.interpolate(
+                    ds[var].sel(time=time).values,
+                    in_grid=in_grid,
+                    out_grid=out_grid,
+                    method="linear",
+                )
+                out_data[var].append(r)
 
     dx = out_grid["grid"][0]
     assert (
@@ -146,13 +159,16 @@ def regrid_to_regular(ds, in_grid, out_grid):
         "latitude": lats,
         "longitude": lons,
     }
-    out_ds = xr.Dataset(
-        {
-            var: (("time", "latitude", "longitude"), out_data[var])
-            for var in ds.data_vars
-        },
-        coords=coords,
-    )
+
+    data_vars = {}
+    for var in ds.data_vars:
+        if "level" in ds[var].dims:
+            coords["level"] = ds[var].level
+            data_vars[var] = (("time", "level", "latitude", "longitude"), out_data[var])
+        else:
+            data_vars[var] = (("time", "latitude", "longitude"), out_data[var])
+
+    out_ds = xr.Dataset(data_vars, coords=coords)
     return out_ds
 
 