#911 dcat distribution model fix inspect api updates (#912)
* Updating the release version in pyproject.toml

* test commit

* WIP

* WIP

* WIP

* WIP

* WIP

* WIP

* Tidy up

* tidy up

* Working

* Added comments

* fixed pyright errors

* more pyright

* Changed #csvqb to #qbDataSet

* PR comments addressed

* poetry lock

* poetry lock

* oops

* small change

---------

Co-authored-by: Auto-version-incrementer <none@none.com>
SarahJohnsonONS and Auto-version-incrementer authored Jul 17, 2024
1 parent 2fcf99c commit be725e5
Showing 42 changed files with 1,501 additions and 245 deletions.
2 changes: 1 addition & 1 deletion Dockerfile
@@ -10,7 +10,7 @@ RUN apt-get update && apt-get install -y default-jre

# Install Apache JENA CLI tools for this.
WORKDIR /
-ADD https://downloads.apache.org/jena/binaries/apache-jena-4.9.0.tar.gz /apache-jena.tar.gz
+ADD https://downloads.apache.org/jena/binaries/apache-jena-5.0.0.tar.gz /apache-jena.tar.gz
RUN tar xvfz /apache-jena.tar.gz && \
cd /apache-jena-* && \
cp -r * /usr/local/ && \
8 changes: 4 additions & 4 deletions poetry.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -31,7 +31,7 @@ uritemplate = "^4.1.1"
# This constraint can be lifted once the fix has been published, by removing urllib3, as it is a dependency of other packages.
urllib3 = "^1.0.0"
python-dateutil = "^2.8.2"
csvcubed-models = "<=0.1.9"
csvcubed-models = "^0.1.10"
platformdirs = "^3.5.0"
numpy = "<2.0.0"

51 changes: 36 additions & 15 deletions src/csvcubed/cli/inspectcsvw/metadataprinter.py
@@ -64,7 +64,7 @@ class MetadataPrinter:
    csvw_type_str: str = field(init=False)
    primary_csv_url: str = field(init=False)
    dataset: DataFrame = field(init=False)
-
+    result_build_minor_version: int = field(init=False)
    result_catalog_metadata: CatalogMetadataResult = field(init=False)
    result_column_component_infos: List[ColumnComponentInfo] = field(init=False)
    primary_cube_table_identifiers: CubeTableIdentifiers = field(init=False)
@@ -97,13 +97,28 @@ def get_csvw_type_str(csvw_type: CSVWType) -> str:
        else:
            raise InputNotSupportedException()

+    def get_build_minor_version(self) -> int:
+        """Return the minor version of csvcubed used to build the cube."""
+        # TODO Create a class to store csvcubed version information similar to readers/cubeconfig/schema_versions.py
+        build_info = self.state.csvw_repository.get_build_information()[0]
+        csvcubed_version = build_info.github_url.split("/")[-1]
+        csvcubed_version_parts = csvcubed_version.split(".")
+        csvcubed_minor_version = int(csvcubed_version_parts[1])
+        return csvcubed_minor_version
+
    def get_primary_csv_url(self) -> str:
        """Return the csv_url for the primary table in the graph."""
        primary_metadata = self.state.csvw_repository.get_primary_catalog_metadata()
        if isinstance(self.state, DataCubeRepository):
-            return self.state.get_cube_identifiers_for_data_set(
-                primary_metadata.dataset_uri
-            ).csv_url
+            # Get csv_url based on whether cube identifiers are recorded against the dataset or the distribution.
+            if self.result_build_minor_version >= 5:
+                return self.state.get_cube_identifiers_for_dataset(
+                    primary_metadata.distribution_uri
+                ).csv_url
+            else:
+                return self.state.get_cube_identifiers_for_dataset(
+                    primary_metadata.dataset_uri
+                ).csv_url
        elif isinstance(self.state, CodeListRepository):
            return self.state.get_table_identifiers_for_concept_scheme(
                primary_metadata.dataset_uri
@@ -142,16 +157,20 @@ def generate_general_results(self):

        self.csvw_type_str = self.get_csvw_type_str(csvw_type)
        self.result_catalog_metadata = csvw_repository.get_primary_catalog_metadata()
+        if isinstance(self.state, DataCubeRepository):
+            self.result_build_minor_version = self.get_build_minor_version()
        self.primary_csv_url = self.get_primary_csv_url()
        self.dataset = load_csv_to_dataframe(
            csvw_repository.csvw_json_path, Path(self.primary_csv_url)
        )
        self.result_dataset_observations_info = get_dataset_observations_info(
            self.dataset,
            csvw_type,
-            self.state.get_shape_for_csv(self.primary_csv_url)
-            if isinstance(self.state, DataCubeRepository)
-            else None,
+            (
+                self.state.get_shape_for_csv(self.primary_csv_url)
+                if isinstance(self.state, DataCubeRepository)
+                else None
+            ),
        )

    def get_datacube_results(self):
@@ -242,14 +261,16 @@ def _get_column_component_info_for_output(
                "Type": c.column_type.name,
                "Required": c.column_definition.required,
                "Property URL": c.column_definition.property_url,
-                "Observations Column Titles": ""
-                if c.component is None
-                else ", ".join(
-                    [
-                        c.title
-                        for c in c.component.used_by_observed_value_columns
-                        if c.title is not None
-                    ]
+                "Observations Column Titles": (
+                    ""
+                    if c.component is None
+                    else ", ".join(
+                        [
+                            c.title
+                            for c in c.component.used_by_observed_value_columns
+                            if c.title is not None
+                        ]
+                    )
                ),
            }
            for c in column_component_infos
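The parsing in `get_build_minor_version` above drives the new dataset-vs-distribution branch in `get_primary_csv_url`. A minimal sketch of that parsing, assuming the recorded `github_url` ends with the release tag (the exact URL shape is not shown in this diff):

```python
# Hypothetical build-information URL; only the trailing version tag matters here.
github_url = "https://github.com/GSS-Cogs/csvcubed/releases/tag/0.5.2"

csvcubed_version = github_url.split("/")[-1]          # "0.5.2"
minor_version = int(csvcubed_version.split(".")[1])   # 5

# Cubes built by csvcubed >= 0.5 record their cube identifiers against the
# dcat:Distribution rather than the dcat:Dataset, hence the `>= 5` check.
print(minor_version >= 5)  # True
```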
2 changes: 1 addition & 1 deletion src/csvcubed/inspect/inspectortable.py
@@ -49,7 +49,7 @@ def _get_shape(self) -> CubeShape:
    def _get_dataset_uri(self) -> str:
        return self.data_cube_repository.get_cube_identifiers_for_csv(
            self.csv_url
-        ).data_set_url
+        ).dataset_url

    def _get_columns(self) -> OrderedDict[str, DataCubeColumn]:
        columns = OrderedDict[str, DataCubeColumn]()
17 changes: 17 additions & 0 deletions src/csvcubed/inspect/sparql_handler/csvw_repository.py
@@ -17,6 +17,7 @@
from csvcubed.inspect.sparql_handler.sparqlquerymanager import (
    ask_is_csvw_code_list,
    ask_is_csvw_qb_dataset,
+    select_build_information,
    select_column_definitions,
    select_csvw_catalog_metadata,
    select_table_schema_properties,
@@ -25,6 +26,7 @@
from csvcubed.models.inspect.sparqlresults import (
    CatalogMetadataResult,
    ColumnDefinition,
+    CsvcubedVersionResult,
    TableSchemaPropertiesResult,
)
from csvcubed.utils.dict import get_from_dict_ensure_exists
@@ -99,6 +101,14 @@ def _table_schema_properties(self) -> Dict[str, TableSchemaPropertiesResult]:
            results_dict[result.csv_url] = result
        return results_dict

+    @cached_property
+    def build_information(self) -> List[CsvcubedVersionResult]:
+        """
+        Cached property for the select_build_information query.
+        """
+        results = select_build_information(self.rdf_graph)
+        return results
+
    def get_column_definitions_for_csv(self, csv_url: str) -> List[ColumnDefinition]:
        """
        Returns the `ColumnDefinition`s for a given csv file, raises a KeyError if the csv_url
@@ -129,3 +139,10 @@ def get_table_info_for_csv_url(self, csv_url: str) -> TableSchemaPropertiesResult:
            self._table_schema_properties, csv_url
        )
        return result
+
+    def get_build_information(self) -> List[CsvcubedVersionResult]:
+        """
+        Returns the csvcubed build activity and GitHub version used to build a given cube.
+        """
+        result: List[CsvcubedVersionResult] = self.build_information
+        return result
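A minimal sketch of the caching pattern used by `build_information` and `get_build_information` above: `functools.cached_property` runs the SPARQL query once per repository instance and serves the stored result afterwards (the query here is simulated):

```python
from functools import cached_property
from typing import List


class ExampleRepository:
    """Stand-in for CsvWRepository; the SPARQL query below is simulated."""

    @cached_property
    def build_information(self) -> List[str]:
        # The real repository calls select_build_information(self.rdf_graph) here.
        print("running select_build_information query...")
        return ["csvcubed 0.5.2"]

    def get_build_information(self) -> List[str]:
        return self.build_information


repo = ExampleRepository()
repo.get_build_information()  # runs the (simulated) query and caches the result
repo.get_build_information()  # served from the cache; no second query
```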
20 changes: 10 additions & 10 deletions src/csvcubed/inspect/sparql_handler/data_cube_repository.py
@@ -22,7 +22,7 @@
from csvcubed.inspect.sparql_handler.csvw_repository import CsvWRepository
from csvcubed.inspect.sparql_handler.sparqlquerymanager import (
    select_csvw_dsd_qube_components,
-    select_data_set_dsd_and_csv_url,
+    select_dataset_dsd_and_csv_url,
    select_dsd_code_list_and_cols,
    select_is_pivoted_shape_data_set,
    select_labels_for_resource_uris,
@@ -95,7 +95,7 @@ def _cube_table_identifiers(self) -> Dict[str, CubeTableIdentifiers]:
        Maps from csv_url to the identifiers.
        """
-        results = select_data_set_dsd_and_csv_url(self.csvw_repository.rdf_graph)
+        results = select_dataset_dsd_and_csv_url(self.csvw_repository.rdf_graph)
        results_dict: Dict[str, CubeTableIdentifiers] = {}
        for result in results:
            results_dict[result.csv_url] = result
@@ -188,26 +188,26 @@ def get_units(self) -> List[UnitResult]:

    def get_cube_identifiers_for_csv(self, csv_url: str) -> CubeTableIdentifiers:
        """
-        Get csv url, data set uri, data set label and DSD uri for the given csv url.
+        Get the CSV URL, dataset URI and DSD URI for the given CSV URL.
        """
        result: CubeTableIdentifiers = get_from_dict_ensure_exists(
            self._cube_table_identifiers, csv_url
        )
        return result

-    def get_cube_identifiers_for_data_set(
-        self, data_set_uri: str
+    def get_cube_identifiers_for_dataset(
+        self, dataset_uri: str
    ) -> CubeTableIdentifiers:
        """
-        Get csv url, data set uri, data set label and DSD uri for the given data set uri.
+        Get the CSV URL, dataset URI and DSD URI for the given dataset URI.
        """

        result = first(
            self._cube_table_identifiers.values(),
-            lambda i: i.data_set_url == data_set_uri,
+            lambda i: i.dataset_url == dataset_uri,
        )
        if result is None:
-            raise KeyError(f"Could not find the data_set with URI '{data_set_uri}'.")
+            raise KeyError(f"Could not find the dataset with URI '{dataset_uri}'.")

        return result

@@ -321,8 +321,8 @@ def get_primary_csv_url(self) -> str:
        data cube.
        """
        primary_catalog_metadata = self.csvw_repository.get_primary_catalog_metadata()
-        return self.get_cube_identifiers_for_data_set(
-            primary_catalog_metadata.dataset_uri
+        return self.get_cube_identifiers_for_dataset(
+            primary_catalog_metadata.distribution_uri
        ).csv_url

    def get_dataframe(
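A self-contained sketch of the lookup `get_cube_identifiers_for_dataset` performs, with a plain generator standing in for csvcubed's `first` helper; the field names match the diff, while the URIs and CSV name are invented:

```python
from dataclasses import dataclass
from typing import Dict, Optional


@dataclass
class CubeTableIdentifiers:
    csv_url: str
    dataset_url: str
    dsd_uri: str


identifiers: Dict[str, CubeTableIdentifiers] = {
    "cube.csv": CubeTableIdentifiers(
        "cube.csv", "http://example.com/dataset", "http://example.com/dsd"
    )
}


def get_cube_identifiers_for_dataset(dataset_uri: str) -> CubeTableIdentifiers:
    # Scan the csv_url-keyed map for the entry whose dataset_url matches.
    result: Optional[CubeTableIdentifiers] = next(
        (i for i in identifiers.values() if i.dataset_url == dataset_uri), None
    )
    if result is None:
        raise KeyError(f"Could not find the dataset with URI '{dataset_uri}'.")
    return result


print(get_cube_identifiers_for_dataset("http://example.com/dataset").csv_url)  # cube.csv
```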
@@ -0,0 +1,10 @@
+PREFIX prov: <http://www.w3.org/ns/prov#>
+PREFIX qb: <http://purl.org/linked-data/cube#>
+
+SELECT ?dataset ?buildActivity ?csvcubedVersion
+WHERE {
+    ?dataset a qb:DataSet;
+        prov:wasGeneratedBy ?buildActivity.
+    ?buildActivity a prov:Activity;
+        prov:used ?csvcubedVersion.
+}
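A self-contained demonstration of what the new `select_build_information` query above matches, run with rdflib against a tiny in-memory graph; the dataset and activity URIs are invented for illustration:

```python
import rdflib

# Invented example data: a qb:DataSet generated by a prov:Activity that
# "used" a particular csvcubed release.
turtle = """
@prefix prov: <http://www.w3.org/ns/prov#> .
@prefix qb:   <http://purl.org/linked-data/cube#> .

<http://example.com/dataset> a qb:DataSet ;
    prov:wasGeneratedBy <http://example.com/build-activity> .

<http://example.com/build-activity> a prov:Activity ;
    prov:used <https://github.com/GSS-Cogs/csvcubed/releases/tag/0.5.2> .
"""

query = """
PREFIX prov: <http://www.w3.org/ns/prov#>
PREFIX qb: <http://purl.org/linked-data/cube#>

SELECT ?dataset ?buildActivity ?csvcubedVersion
WHERE {
    ?dataset a qb:DataSet;
        prov:wasGeneratedBy ?buildActivity.
    ?buildActivity a prov:Activity;
        prov:used ?csvcubedVersion.
}
"""

graph = rdflib.Graph()
graph.parse(data=turtle, format="turtle")

for row in graph.query(query):
    print(row.dataset, row.csvcubedVersion)
# http://example.com/dataset https://github.com/GSS-Cogs/csvcubed/releases/tag/0.5.2
```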
@@ -1,9 +1,10 @@
PREFIX dcat: <http://www.w3.org/ns/dcat#>
PREFIX dcterms: <http://purl.org/dc/terms/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+PREFIX prov: <http://www.w3.org/ns/prov#>

-SELECT ?graph ?dataset ?title ?label ?issued ?modified ?comment ?description
-    ?license ?creator ?publisher ?landingPages ?themes ?keywords ?contactPoint
+SELECT ?graph ?dataset ?distribution ?title ?label ?issued ?modified ?comment ?description
+    ?license ?creator ?publisher ?landingPages ?themes ?keywords ?contactPoint ?buildActivity
    ?identifier
WHERE {
    {
@@ -20,14 +21,15 @@ WHERE {
        rdfs:label ?label;
        dcterms:issued ?issued;
        dcterms:modified ?modified.
-
+    OPTIONAL { ?dataset dcat:distribution ?distribution }.
    OPTIONAL { ?dataset rdfs:comment ?comment }.
    OPTIONAL { ?dataset dcterms:description ?description }.
    OPTIONAL { ?dataset dcterms:license ?license }.
    OPTIONAL { ?dataset dcterms:creator ?creator }.
    OPTIONAL { ?dataset dcterms:publisher ?publisher }.
    OPTIONAL { ?dataset dcat:contactPoint ?contactPoint }.
    OPTIONAL { ?dataset dcterms:identifier ?identifier }.
+    OPTIONAL { ?dataset prov:wasGeneratedBy ?buildActivity }.
}

{
18 changes: 16 additions & 2 deletions src/csvcubed/inspect/sparql_handler/sparqlquerymanager.py
@@ -26,6 +26,7 @@
    CatalogMetadataResult,
    CodelistsResult,
    ColumnDefinition,
+    CsvcubedVersionResult,
    CSVWTableSchemaFileDependenciesResult,
    CubeTableIdentifiers,
    IsPivotedShapeResult,
@@ -34,6 +35,7 @@
    QubeComponentsResult,
    TableSchemaPropertiesResult,
    UnitResult,
+    map_build_activity_results,
    map_catalog_metadata_results,
    map_codelists_sparql_result,
    map_column_definition_results,
@@ -89,6 +91,8 @@ class SPARQLQueryName(Enum):

    SELECT_LABELS_FOR_RESOURCE_URIS = "select_labels_for_resource_uris"

+    SELECT_BUILD_INFORMATION = "select_build_information"
+

def _get_query_string_from_file(query_type: SPARQLQueryName) -> str:
    """
@@ -169,7 +173,7 @@ def select_csvw_catalog_metadata(
    return map_catalog_metadata_results(results)


-def select_data_set_dsd_and_csv_url(
+def select_dataset_dsd_and_csv_url(
    rdf_graph: rdflib.ConjunctiveGraph,
) -> List[CubeTableIdentifiers]:
    """
@@ -251,7 +255,7 @@ def _cube_table_identifiers_to_values_binding(
        rows=[
            [
                Literal(uris.csv_url, datatype=XSD.anyURI),
-                URIRef(uris.data_set_url),
+                URIRef(uris.dataset_url),
                URIRef(uris.dsd_uri),
            ]
            for uris in csv_dsd_dataset_uris
@@ -399,3 +403,13 @@ def select_column_definitions(
    )

    return map_column_definition_results(results)
+
+
+def select_build_information(rdf_graph: rdflib.Graph) -> List[CsvcubedVersionResult]:
+    """
+    Selects the csvcubed build activity and GitHub version used to build a given cube.
+    """
+    results: List[ResultRow] = select(
+        _get_query_string_from_file(SPARQLQueryName.SELECT_BUILD_INFORMATION), rdf_graph
+    )
+    return map_build_activity_results(results)
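A hedged sketch of how `_get_query_string_from_file` plausibly resolves a `SPARQLQueryName` member such as `SELECT_BUILD_INFORMATION` to its `.sparql` file; the directory layout is an assumption, not something this diff shows:

```python
from enum import Enum
from pathlib import Path


class SPARQLQueryName(Enum):
    SELECT_BUILD_INFORMATION = "select_build_information"


def get_query_string(query: SPARQLQueryName, queries_dir: Path) -> str:
    # e.g. queries_dir / "select_build_information.sparql" (assumed layout)
    return (queries_dir / f"{query.value}.sparql").read_text()
```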
9 changes: 9 additions & 0 deletions src/csvcubed/models/cube/qb/catalog.py
@@ -2,6 +2,7 @@
Catalog Metadata (DCAT)
-----------------------
"""
+
import json
from dataclasses import dataclass, field
from datetime import date, datetime, time
@@ -76,6 +77,14 @@ def get_description(self) -> Optional[str]:
    def get_identifier(self) -> str:
        return self.identifier or self.title

+    def configure_dcat_distribution(self, distribution: dcat.Distribution) -> None:
+        dt_now = datetime.now()
+        dt_issued = _convert_date_to_date_time(self.dataset_issued or dt_now)
+        distribution.issued = dt_issued
+        distribution.label = distribution.title = self.title
+        distribution.identifier = self.get_identifier()
+        distribution.creator = self.creator_uri
+
    def configure_dcat_dataset(self, dataset: dcat.Dataset) -> None:
        dt_now = datetime.now()
        dt_issued = _convert_date_to_date_time(self.dataset_issued or dt_now)
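A hedged usage sketch of the new `configure_dcat_distribution` helper; the `Distribution` class below is a minimal stand-in for the `dcat.Distribution` model from csvcubed-models, whose real API this diff does not show:

```python
from dataclasses import dataclass
from datetime import datetime
from typing import Optional


@dataclass
class Distribution:
    """Stand-in for csvcubedmodels' dcat.Distribution; field names assumed."""

    uri: str
    issued: Optional[datetime] = None
    label: Optional[str] = None
    title: Optional[str] = None
    identifier: Optional[str] = None
    creator: Optional[str] = None


# Mirrors the assignments configure_dcat_distribution makes from CatalogMetadata;
# the URI and metadata values are invented for illustration.
distribution = Distribution(uri="http://example.com/dataset#distribution")
distribution.issued = datetime.now()
distribution.label = distribution.title = "Example Cube"
distribution.identifier = "example-cube"
distribution.creator = "http://example.com/creator"

print(distribution)
```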
