From 7a75087a08d55cd820e844fb560dd70c2ea04d44 Mon Sep 17 00:00:00 2001 From: Matthew Iannucci Date: Thu, 21 Nov 2024 16:30:52 -0500 Subject: [PATCH] Collection Metadata (#62) * Initial metadata implementation * Initial extents with dask arrays * More accurate to the spec version of the metadata * Update readme * More tests * Suggestions, formatting * Use projected dataset for extents * Better coordinate handling in projections * Adjust typing * Better parameter filtering * Extract metadata logic placeholder * forgot metadata file * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Add a whole bunch of pydantic * Full pydantic typesafe collection metadata --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- README.md | 4 +- tests/test_cf_router.py | 90 ++++++- xpublish_edr/geometry/common.py | 54 +++- xpublish_edr/metadata.py | 420 ++++++++++++++++++++++++++++++++ xpublish_edr/plugin.py | 25 +- 5 files changed, 569 insertions(+), 24 deletions(-) create mode 100644 xpublish_edr/metadata.py diff --git a/README.md b/README.md index d5d09fa..4854854 100644 --- a/README.md +++ b/README.md @@ -55,9 +55,9 @@ This package attempts to follow [the spec](https://docs.ogc.org/is/19-086r6/19-0 ### [collections](https://docs.ogc.org/is/19-086r6/19-086r6.html#_e55ba0f5-8f24-4f1b-a7e3-45775e39ef2e) and Resource Paths Support -`xpublish-edr` does not currently support the `/collections/{collectionId}/query` path template described in the spec. Instead the path resource appears as `/{dataset_id}/query`. This is because of the path structure of xpublish. +`xpublish-edr` does not currently support the `/collections/{collectionId}/query` path template described in the spec. Instead the path resource appears as `/{dataset_id}/edr/{query}`. This is because of the path structure of xpublish. In the future, if `xpublish` supports [`DataTree`](https://docs.xarray.dev/en/stable/generated/xarray.DataTree.html) it could provide a path to supporting the spec compliant `collections` resource path. -In the future, when `xpublish` supports [`DataTree`](https://docs.xarray.dev/en/stable/generated/xarray.DataTree.html) it will provide a path to supporting the spec compliant `collections` resource path. + However, despite the collections resource not existing, this implementation supports [collection metadata](https://docs.ogc.org/is/19-086r6/19-086r6.html#_5d07dde9-231a-4652-a1f3-dd036c337bdc) at the dataset level through the `/{dataset_id}/edr/` resource. ### Supported Queries diff --git a/tests/test_cf_router.py b/tests/test_cf_router.py index f1b5dc7..76cf29e 100644 --- a/tests/test_cf_router.py +++ b/tests/test_cf_router.py @@ -62,6 +62,74 @@ def test_cf_area_formats(cf_client): assert "csv" in data, "csv is not a valid format" +def test_cf_metadata_query(cf_client): + response = cf_client.get("/datasets/air/edr/") + assert response.status_code == 200, "Response did not return successfully" + data = response.json() + + assert data["id"] == "air", "The id should be air" + assert data["title"] == "4x daily NMC reanalysis (1948)", "The title is incorrect" + assert ( + data["description"] + == "Data is from NMC initialized reanalysis\n(4x/day). These are the 0.9950 sigma level values." + ), "The description is incorrect" + assert data["crs"] == ["EPSG:4326"], "The crs is incorrect" + assert set(data["output_formats"]) == { + "cf_covjson", + "nc", + "netcdf4", + "nc4", + "netcdf", + "csv", + "geojson", + }, "The output formats are incorrect" + assert ( + "position" in data["data_queries"] and "area" in data["data_queries"] + ), "The data queries are incorrect" + + assert ( + "temporal" and "spatial" in data["extent"] + ), "Temporal and spatial extents should be present in extent" + assert ( + "vertical" not in data["extent"] + ), "Vertical extent should not be present in extent" + + assert data["extent"]["temporal"]["interval"] == [ + "2013-01-01T00:00:00", + "2013-01-01T18:00:00", + ], "Temporal interval is incorrect" + assert ( + data["extent"]["temporal"]["values"][0] + == "2013-01-01T00:00:00/2013-01-01T18:00:00" + ), "Temporal values are incorrect" + + assert data["extent"]["spatial"]["bbox"] == [ + [200.0, 15.0, 322.5, 75.0], + ], "Spatial bbox is incorrect" + assert data["extent"]["spatial"]["crs"] == "EPSG:4326", "Spatial CRS is incorrect" + + assert "air" in data["parameter_names"], "Air parameter should be present" + assert "lat" not in data["parameter_names"], "lat should not be present" + assert "lon" not in data["parameter_names"], "lon should not be present" + + +def test_cf_metadata_query_temp_smoke_test(cf_client): + response = cf_client.get("/datasets/temp/edr/") + assert response.status_code == 200, "Response did not return successfully" + data = response.json() + + assert data["id"] == "temp", "The id should be temp" + for key in ( + "title", + "description", + "crs", + "extent", + "output_formats", + "data_queries", + ): + assert key in data, f"Key {key} is not a top level key in the metadata response" + + def test_cf_position_query(cf_client, cf_air_dataset, cf_temp_dataset): x = 204 y = 44 @@ -122,14 +190,20 @@ def test_cf_position_query(cf_client, cf_air_dataset, cf_temp_dataset): axes = data["domain"]["axes"] - npt.assert_array_almost_equal( - axes["x"]["values"], - [[x]], - ), "Did not select nearby x coordinate" - npt.assert_array_almost_equal( - axes["y"]["values"], - [[y]], - ), "Did not select a nearby y coordinate" + ( + npt.assert_array_almost_equal( + axes["x"]["values"], + [[x]], + ), + "Did not select nearby x coordinate", + ) + ( + npt.assert_array_almost_equal( + axes["y"]["values"], + [[y]], + ), + "Did not select a nearby y coordinate", + ) temp_range = data["ranges"]["temp"] assert temp_range["type"] == "NdArray", "Response range should be a NdArray" diff --git a/xpublish_edr/geometry/common.py b/xpublish_edr/geometry/common.py index bd2e00c..a96a446 100644 --- a/xpublish_edr/geometry/common.py +++ b/xpublish_edr/geometry/common.py @@ -4,6 +4,7 @@ import itertools from functools import lru_cache +from typing import Union import pyproj import xarray as xr @@ -16,6 +17,9 @@ transformer_from_crs = lru_cache(pyproj.Transformer.from_crs) +DEFAULT_CRS = pyproj.CRS.from_epsg(4326) + + def coord_is_regular(da: xr.DataArray) -> bool: """ Check if the DataArray has a regular grid @@ -30,6 +34,20 @@ def is_regular_xy_coords(ds: xr.Dataset) -> bool: return coord_is_regular(ds.cf["X"]) and coord_is_regular(ds.cf["Y"]) +def spatial_bounds(ds: xr.Dataset) -> tuple[float, float, float, float]: + """ + Get the spatial bounds of the dataset, naively, in whatever CRS it is in + """ + x = ds.cf["X"] + min_x = float(x.min().values) + max_x = float(x.max().values) + + y = ds.cf["Y"] + min_y = float(y.min().values) + max_y = float(y.max().values) + return min_x, min_y, max_x, max_y + + def dataset_crs(ds: xr.Dataset) -> pyproj.CRS: grid_mapping_names = ds.cf.grid_mapping_names if len(grid_mapping_names) == 0: @@ -61,12 +79,15 @@ def project_geometry(ds: xr.Dataset, geometry_crs: str, geometry: Geometry) -> G return transform(transformer.transform, geometry) -def project_dataset(ds: xr.Dataset, query_crs: str) -> xr.Dataset: +def project_dataset(ds: xr.Dataset, query_crs: Union[str, pyproj.CRS]) -> xr.Dataset: """ Project the dataset to the given CRS """ data_crs = dataset_crs(ds) - target_crs = pyproj.CRS.from_string(query_crs) + if isinstance(query_crs, pyproj.CRS): + target_crs = query_crs + else: + target_crs = pyproj.CRS.from_string(query_crs) if data_crs == target_crs: return ds @@ -76,15 +97,23 @@ def project_dataset(ds: xr.Dataset, query_crs: str) -> xr.Dataset: always_xy=True, ) - # TODO: Handle rotated pole - cf_coords = target_crs.coordinate_system.to_cf() + # Unpack the coordinates + try: + X = ds.cf["X"] + Y = ds.cf["Y"] + except KeyError: + # If the dataset has multiple X axes, we can try to find the right one + source_cf_coords = data_crs.coordinate_system.to_cf() - # Get the new X and Y coordinates - target_y_coord = next(coord for coord in cf_coords if coord["axis"] == "Y") - target_x_coord = next(coord for coord in cf_coords if coord["axis"] == "X") + source_x_coord = next( + coord["standard_name"] for coord in source_cf_coords if coord["axis"] == "X" + ) + source_y_coord = next( + coord["standard_name"] for coord in source_cf_coords if coord["axis"] == "Y" + ) - X = ds.cf["X"] - Y = ds.cf["Y"] + X = ds.cf[source_x_coord] + Y = ds.cf[source_y_coord] # Transform the coordinates # If the data is vectorized, we just transform the points in full @@ -104,6 +133,13 @@ def project_dataset(ds: xr.Dataset, query_crs: str) -> xr.Dataset: c for c in ds.coords if x_dim in ds[c].dims or y_dim in ds[c].dims ] + # TODO: Handle rotated pole + target_cf_coords = target_crs.coordinate_system.to_cf() + + # Get the new X and Y coordinates + target_x_coord = next(coord for coord in target_cf_coords if coord["axis"] == "X") + target_y_coord = next(coord for coord in target_cf_coords if coord["axis"] == "Y") + target_x_coord_name = target_x_coord["standard_name"] target_y_coord_name = target_y_coord["standard_name"] diff --git a/xpublish_edr/metadata.py b/xpublish_edr/metadata.py new file mode 100644 index 0000000..fccb4a7 --- /dev/null +++ b/xpublish_edr/metadata.py @@ -0,0 +1,420 @@ +from typing import Literal, Optional + +import pyproj +import xarray as xr +from pydantic import BaseModel, Field + +from xpublish_edr.geometry.common import ( + DEFAULT_CRS, + dataset_crs, + project_dataset, + spatial_bounds, +) + + +class CRSDetails(BaseModel): + """OGC EDR CRS metadata + + https://docs.ogc.org/is/19-086r6/19-086r6.html#_7124ec17-6401-4eb7-ba1d-8ec329b7e677 + """ + + crs: str + wkt: str + + +class VariablesMetadata(BaseModel): + """OGC EDR Variables metadata + + https://docs.ogc.org/is/19-086r6/19-086r6.html#_1b54f97a-e1dc-4920-b8b4-e4981554138d + """ + + title: Optional[str] = None + description: Optional[str] = None + query_type: Optional[str] = None + coords: Optional[dict] = None + output_formats: Optional[list[str]] = None + default_output_format: Optional[str] = None + crs_details: Optional[list[CRSDetails]] = None + + +class Link(BaseModel): + """OGC EDR Link metadata + + https://docs.ogc.org/is/19-086r6/19-086r6.html#_ea77762b-89c0-4704-b845-748efc66e597 + """ + + href: str + rel: str + type_: Optional[str] = Field(None, serialization_alias="type") + hreflang: Optional[str] = None + title: Optional[str] = None + length: Optional[int] = None + templated: Optional[bool] = None + variables: Optional[VariablesMetadata] = None + + +class SpatialExtent(BaseModel): + """OGC EDR Spatial Extent metadata + + https://docs.ogc.org/is/19-086r6/19-086r6.html#_0afff399-4d8e-4a9b-961b-cab841d23cc1 + """ + + bbox: list[list[float]] + crs: str + + +class TemporalExtent(BaseModel): + """OGC EDR Temporal Extent metadata + + https://docs.ogc.org/is/19-086r6/19-086r6.html#_8f4c9f38-bc6a-4b98-8fd9-772e42d60ab2 + """ + + interval: list[str] + values: list[str] + trs: str + + +class VerticalExtent(BaseModel): + """OGC EDR Vertical Extent metadata + + https://docs.ogc.org/is/19-086r6/19-086r6.html#_52bf970b-315a-4a09-8b92-51757b584a62 + """ + + interval: list[float] + values: list[float] + vrs: str + + +class Extent(BaseModel): + """OGC EDR Extent metadata + + https://docs.ogc.org/is/19-086r6/19-086r6.html#_2a2d533f-6efe-48df-8056-2eca9deb848f + """ + + spatial: SpatialExtent + temporal: Optional[TemporalExtent] = None + vertical: Optional[VerticalExtent] = None + + +class EDRQueryMetadata(BaseModel): + """OGC EDR Query metadata + + https://docs.ogc.org/is/19-086r6/19-086r6.html#_9a6620ce-6093-4b1b-8f68-2e2c04a13746 + """ + + link: Link + + +class DataQueries(BaseModel): + """OGC EDR Data Queries metadata + + https://docs.ogc.org/is/19-086r6/19-086r6.html#_df2c080b-949c-40c3-ad14-d20228270c2d + """ + + position: Optional[EDRQueryMetadata] = None + radius: Optional[EDRQueryMetadata] = None + area: Optional[EDRQueryMetadata] = None + cube: Optional[EDRQueryMetadata] = None + trajectory: Optional[EDRQueryMetadata] = None + corridor: Optional[EDRQueryMetadata] = None + item: Optional[EDRQueryMetadata] = None + location: Optional[EDRQueryMetadata] = None + + +class SymbolMetadata(BaseModel): + """OGC EDR Symbol metadata + + https://docs.ogc.org/is/19-086r6/19-086r6.html#_3e50c10c-85bd-46d9-8e09-1c5fffffb055 + """ + + title: Optional[str] = None + description: Optional[str] = None + value: Optional[str] = None + type_: Optional[str] = Field(None, serialization_alias="type") + + +class UnitMetadata(BaseModel): + """OGC EDR Unit metadata + + https://docs.ogc.org/is/19-086r6/19-086r6.html#_5378d779-6a38-4607-9051-6f12c3d3107b + """ + + label: str + symbol: SymbolMetadata + + +class MeasurementType(BaseModel): + """OGC EDR Measurement Type metadata + + https://docs.ogc.org/is/19-086r6/19-086r6.html#_c81181d6-fd09-454e-9c00-a3bb3b21d592 + """ + + method: str + duration: str + + +class ObservedProperty(BaseModel): + """OGC EDR Observed Property metadata + + https://docs.ogc.org/is/19-086r6/19-086r6.html#_7e053ab4-5cde-4a5c-a8be-acc6495f9eb5 + """ + + id: Optional[str] = None + label: str + description: Optional[str] = None + + +class Parameter(BaseModel): + """OGC EDR Parameter metadata + + https://docs.ogc.org/is/19-086r6/19-086r6.html#_da400aef-f6ee-4d08-b36c-2f535d581d53 + """ + + id: Optional[str] = None + type_: Literal["Parameter"] = Field("Parameter", serialization_alias="type") + label: Optional[str] = None + description: Optional[str] = None + data_type: Optional[str] = Field(None, serialization_alias="data-type") + unit: Optional[UnitMetadata] = None + observed_property: ObservedProperty = Field( + ..., + serialization_alias="observedProperty", + ) + extent: Optional[Extent] = None + measurement_type: Optional[MeasurementType] = Field( + None, + serialization_alias="measurementType", + ) + + +class Collection(BaseModel): + """OGC EDR Collection metadata + + https://docs.ogc.org/is/19-086r6/19-086r6.html#_b6db449c-4ca7-4117-9bf4-241984cef569 + """ + + links: list[Link] + id: str + title: str + description: str + keywords: list[str] + extent: Extent + data_queries: DataQueries + crs: list[str] + output_formats: list[str] + parameter_names: dict[str, Parameter] + + +def crs_details(crs: pyproj.CRS) -> CRSDetails: + """ + Return CF version of EDR CRS metadata + """ + return CRSDetails(crs=crs.to_string(), wkt=crs.to_wkt()) + + +def unit(unit: str) -> UnitMetadata: + """ + Return CF version of EDR Unit metadata + """ + return UnitMetadata( + label=unit, + symbol=SymbolMetadata( + value=unit, + type="unit", + ), + ) + + +def parameter(da: xr.DataArray) -> Parameter: + """ + Return CF version of EDR Parameter metadata for a given xarray variable + """ + name = da.attrs.get("name", None) + standard_name = da.attrs.get("standard_name", name if name else "") + observed_property = ObservedProperty( + label=standard_name, + description=da.attrs.get("long_name", ""), + ) + return Parameter( + label=standard_name, + type_="Parameter", + description=da.attrs.get("long_name", ""), + data_type=da.dtype.name, + unit=unit(da.attrs.get("units", "")), + observed_property=observed_property, + ) + + +def spatial_extent(ds: xr.Dataset, crs: pyproj.CRS) -> SpatialExtent: + """Extract the spatial extent from the dataset into collection metadata specific format""" + bounds = spatial_bounds(ds) + + return SpatialExtent( + bbox=[bounds], + crs=crs.to_string(), + ) + + +def temporal_extent(ds: xr.Dataset) -> Optional[TemporalExtent]: + """Extract the temporal extent from the dataset into collection metadata specific format""" + if "T" not in ds.cf: + return None + + t = ds.cf["T"] + time_min = t.min().dt.strftime("%Y-%m-%dT%H:%M:%S").values + time_max = t.max().dt.strftime("%Y-%m-%dT%H:%M:%S").values + return TemporalExtent( + interval=[str(time_min), str(time_max)], + values=[f"{time_min}/{time_max}"], + trs='TIMECRS["DateTime",TDATUM["Gregorian Calendar"],CS[TemporalDateTime,1],AXIS["Time (T)",unspecified]]', # noqa + ) + + +def vertical_extent(ds: xr.Dataset) -> Optional[VerticalExtent]: + """Extract the vertical extent from the dataset into collection metadata specific format""" + if "Z" not in ds.cf: + return None + + z = ds.cf["Z"] + elevations = z.values + units = z.attrs.get("units", "unknown") + positive = z.attrs.get("positive", "up") + min_z = elevations.min() + max_z = elevations.max() + elevation_values = ",".join([str(e) for e in elevations]) + + return VerticalExtent( + interval=[min_z, max_z], + values=elevation_values, + vrs=f"VERTCRS[VERT_CS['unknown'],AXIS['Z',{positive}],UNIT[{units},1]]", # noqa + ) + + +def extent(ds: xr.Dataset, crs: pyproj.CRS) -> Extent: + """ + Extract the extent from the dataset into collection metadata specific format + """ + spatial = spatial_extent(ds, crs) + temporal = temporal_extent(ds) + vertical = vertical_extent(ds) + + return Extent( + spatial=spatial, + temporal=temporal, + vertical=vertical, + ) + + +def extract_parameters(ds: xr.Dataset) -> dict[str, Parameter]: + """ + Extract the parameters from the dataset into collection metadata specific format + """ + return {k: parameter(v) for k, v in ds.variables.items() if "axis" not in v.attrs} + + +def position_query_description( + output_formats: list[str], + crs_details: list[CRSDetails], +) -> EDRQueryMetadata: + """ + Return CF version of EDR Position Query metadata + """ + return EDRQueryMetadata( + link=Link( + href="/edr/position?coords={coords}", + hreflang="en", + rel="data", + templated=True, + variables=VariablesMetadata( + title="Position query", + description="Returns position data based on WKT `POINT(lon lat)` or `MULTIPOINT(lon lat, ...)` coordinates", # noqa + query_type="position", + coords={ + "type": "string", + "description": "WKT `POINT(lon lat)` or `MULTIPOINT(lon lat, ...)` coordinates", # noqa + "required": True, + }, + output_formats=output_formats, + default_output_format="cf_covjson", + crs_details=crs_details, + ), + ), + ) + + +def area_query_description( + output_formats: list[str], + crs_details: list[CRSDetails], +) -> EDRQueryMetadata: + """ + Return CF version of EDR Area Query metadata + """ + return EDRQueryMetadata( + link=Link( + href="/edr/area?coords={coords}", + hreflang="en", + rel="data", + templated=True, + variables=VariablesMetadata( + title="Area query", + description="Returns data in a polygon based on WKT `POLYGON(lon lat, ...)` coordinates", # noqa + query_type="position", + coords={ + "type": "string", + "description": "WKT `POLYGON(lon lat, ...)` coordinates", + "required": True, + }, + output_formats=output_formats, + default_output_format="cf_covjson", + crs_details=crs_details, + ), + ), + ) + + +def collection_metadata(ds: xr.Dataset, output_formats: list[str]) -> Collection: + """ + Returns the collection metadata for the dataset + There is no nested hierarchy in our router right now, so instead we return the metadata + for the current dataset as the a single collection. See the spec for more information: + https://docs.ogc.org/is/19-086r6/19-086r6.html#_162817c2-ccd7-43c9-b1ea-ad3aea1b4d6b + """ + id = ds.attrs.get("_xpublish_id", "unknown") + title = ds.attrs.get("title", "unknown") + description = ds.attrs.get("description", "no description") + + crs = dataset_crs(ds) + + # We will use the dataset's CRS as the default CRS, but use 4326 for the extents + # since it is always available + projected_ds = project_dataset(ds, DEFAULT_CRS) + + extents = extent(projected_ds, crs) + + parameters = extract_parameters(ds) + + supported_crs = [ + crs_details(crs), + ] + + # 4326 is always available + if crs != DEFAULT_CRS: + supported_crs.append( + crs_details(DEFAULT_CRS), + ) + + return Collection( + links=[], + id=id, + title=title, + description=description, + keywords=[], + extent=extents, + data_queries=DataQueries( + position=position_query_description(output_formats, supported_crs), + area=area_query_description(output_formats, supported_crs), + ), + crs=[crs.to_string()], + output_formats=output_formats, + parameter_names=parameters, + ) diff --git a/xpublish_edr/plugin.py b/xpublish_edr/plugin.py index fc8fd71..ba20d79 100644 --- a/xpublish_edr/plugin.py +++ b/xpublish_edr/plugin.py @@ -14,10 +14,11 @@ from xpublish_edr.geometry.common import project_dataset from xpublish_edr.geometry.position import select_by_position from xpublish_edr.logger import logger +from xpublish_edr.metadata import collection_metadata from xpublish_edr.query import EDRQuery, edr_query -def position_formats(): +def output_formats(): """ Return response format functions from registered `xpublish_edr_position_formats` entry_points @@ -57,7 +58,7 @@ def get_position_formats(): """ Returns the various supported formats for position queries """ - formats = {key: value.__doc__ for key, value in position_formats().items()} + formats = {key: value.__doc__ for key, value in output_formats().items()} return formats @@ -69,7 +70,7 @@ def get_area_formats(): """ Returns the various supported formats for area queries """ - formats = {key: value.__doc__ for key, value in position_formats().items()} + formats = {key: value.__doc__ for key, value in output_formats().items()} return formats @@ -80,6 +81,20 @@ def dataset_router(self, deps: Dependencies): """Register dataset level router for EDR endpoints""" router = APIRouter(prefix=self.app_router_prefix, tags=self.dataset_router_tags) + @router.get("/", summary="Collection metadata") + def get_collection_metadata(dataset: xr.Dataset = Depends(deps.dataset)): + """ + Returns the collection metadata for the dataset + + There is no nested hierarchy in our router right now, so instead we return the metadata + for the current dataset as the a single collection. See the spec for more information: + https://docs.ogc.org/is/19-086r6/19-086r6.html#_162817c2-ccd7-43c9-b1ea-ad3aea1b4d6b + """ + available_output_formats = list(output_formats().keys()) + return collection_metadata(dataset, available_output_formats).dict( + exclude_none=True, + ) + @router.get("/position", summary="Position query") def get_position( request: Request, @@ -126,7 +141,7 @@ def get_position( if query.format: try: - format_fn = position_formats()[query.format] + format_fn = output_formats()[query.format] except KeyError: raise HTTPException( 404, @@ -182,7 +197,7 @@ def get_area( if query.format: try: - format_fn = position_formats()[query.format] + format_fn = output_formats()[query.format] except KeyError: raise HTTPException( 404,