diff --git a/openeo_driver/save_result.py b/openeo_driver/save_result.py index 35ecc139..bb7d0154 100644 --- a/openeo_driver/save_result.py +++ b/openeo_driver/save_result.py @@ -144,6 +144,12 @@ class WorkspaceExport: merge: Optional[str] +def clean_parquet_column_name(name: str): + # Could not find official SPEC for this, but cleaning just for consistency + # https://geoparquet.org/releases/v1.0.0/#column-metadata + return re.sub(r"[()/\\]", "_", name) + + def get_temp_file(suffix="", prefix="openeo-pydrvr-"): # TODO: make sure temp files are cleaned up when read _, filename = tempfile.mkstemp(suffix=suffix, prefix=prefix) @@ -631,7 +637,9 @@ def to_geoparquet(self, destination: Optional[str] = None) -> str: # TODO: avoid accessing _geometries to combine _regions and CSV gdf = self._regions._geometries - gpd.GeoDataFrame(stats.join(gdf, on='feature_index')).to_parquet(filename) + (gpd.GeoDataFrame(stats.join(gdf, on='feature_index')) + .rename(columns=clean_parquet_column_name) + .to_parquet(filename)) return filename def to_driver_vector_cube(self) -> DriverVectorCube: @@ -837,7 +845,7 @@ def to_geoparquet(self, destination: Optional[str] = None) -> str: (gdf .join(stats.set_index('feature_index'), on='feature_index') - .rename(columns=lambda col_name: re.sub(r"\W", "_", col_name)) # adhere to naming restriction [A-Za-z0-9_] + .rename(columns=clean_parquet_column_name) .to_parquet(filename)) # TODO: Is naming restriction required for parquet files? diff --git a/tests/data/aggregate_spatial_spatial_cube_name_collision/1.csv b/tests/data/aggregate_spatial_spatial_cube_name_collision/1.csv new file mode 100644 index 00000000..fb0f313f --- /dev/null +++ b/tests/data/aggregate_spatial_spatial_cube_name_collision/1.csv @@ -0,0 +1,3 @@ +feature_index,avg(band_0),avg_band_0_,avg(band_2) +1,4645.719597475695,4865.467252259935,5177.803342998465 +0,4646.262612301313,4865.926572218383,5178.517363510712 diff --git a/tests/test_save_result_parquet.py b/tests/test_save_result_parquet.py index 95b436a1..e17ebb74 100644 --- a/tests/test_save_result_parquet.py +++ b/tests/test_save_result_parquet.py @@ -1,3 +1,5 @@ +import pytest + from openeo_driver.datacube import DriverVectorCube from openeo_driver.save_result import AggregatePolygonSpatialResult from .data import get_path @@ -27,6 +29,20 @@ def test_save_aggregate_polygon_spatial_result(tmp_path): } +def test_save_column_name(tmp_path): + csv_dir = get_path("aggregate_spatial_spatial_cube_name_collision") + + vector_cube = DriverVectorCube(gpd.read_file(str(get_path("geojson/FeatureCollection02.json")))) + + output_file = tmp_path / "test.parquet" + + spatial_result = AggregatePolygonSpatialResult(csv_dir, regions=vector_cube, format="Parquet") + + with pytest.raises(ValueError) as exc_info: + spatial_result.to_geoparquet(destination=str(output_file)) + assert "Duplicate column names found" in str(exc_info.value) + + def test_write_driver_vector_cube_to_parquet(tmp_path): vector_cube = DriverVectorCube(gpd.read_file(str(get_path("geojson/FeatureCollection02.json")))) vector_cube.write_assets(tmp_path / "dummy", format="Parquet")