-
Notifications
You must be signed in to change notification settings - Fork 7
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add a datasource field to the DesignWorkflow object #968
Changes from 1 commit
f53a021
b03680b
e777535
f5f7538
169e809
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1 @@ | ||
__version__ = "3.8.0" | ||
__version__ = "3.9.0" |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,5 @@ | ||
"""Tools for working with Descriptors.""" | ||
from abc import abstractmethod | ||
from typing import Type, List, Mapping, Optional, Union | ||
from uuid import UUID | ||
|
||
|
@@ -7,6 +8,7 @@ | |
from citrine._serialization.serializable import Serializable | ||
from citrine.informatics.descriptors import Descriptor | ||
from citrine.resources.file_link import FileLink | ||
from citrine.resources.gemtables import GemTable | ||
|
||
__all__ = ['DataSource', | ||
'CSVDataSource', | ||
|
@@ -34,13 +36,39 @@ def get_type(cls, data) -> Type[Serializable]: | |
if "type" not in data: | ||
raise ValueError("Can only get types from dicts with a 'type' key") | ||
types: List[Type[Serializable]] = [ | ||
CSVDataSource, GemTableDataSource, ExperimentDataSourceRef | ||
CSVDataSource, GemTableDataSource, ExperimentDataSourceRef, SnapshotDataSource | ||
] | ||
res = next((x for x in types if x.typ == data["type"]), None) | ||
if res is None: | ||
raise ValueError("Unrecognized type: {}".format(data["type"])) | ||
return res | ||
|
||
@property | ||
@abstractmethod | ||
def _data_source_type(self) -> str: | ||
"""The data source type string, which is the leading term of the data_source_id.""" | ||
|
||
@classmethod | ||
def from_data_source_id(cls, data_source_id: str) -> "DataSource": | ||
"""Build a DataSource from a datasource_id.""" | ||
terms = data_source_id.split("::") | ||
types: List[Type[Serializable]] = [ | ||
CSVDataSource, GemTableDataSource, ExperimentDataSourceRef, SnapshotDataSource | ||
] | ||
res = next((x for x in types if x._data_source_type == terms[0]), None) | ||
if res is None: | ||
raise ValueError("Unrecognized type: {}".format(terms[0])) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nitpick: f-string. |
||
return res._data_source_id_builder(*terms[1:]) | ||
|
||
@classmethod | ||
@abstractmethod | ||
def _data_source_id_builder(cls, *args) -> "DataSource": | ||
"""Build a DataSource based on a parsed data_source_id.""" | ||
|
||
@abstractmethod | ||
def to_data_source_id(self) -> str: | ||
"""Generate the data_source_id for this DataSource.""" | ||
|
||
|
||
class CSVDataSource(Serializable['CSVDataSource'], DataSource): | ||
"""A data source based on a CSV file stored on the data platform. | ||
|
@@ -65,6 +93,8 @@ class CSVDataSource(Serializable['CSVDataSource'], DataSource): | |
properties.String, properties.Object(Descriptor), "column_definitions") | ||
identifiers = properties.Optional(properties.List(properties.String), "identifiers") | ||
|
||
_data_source_type = "csv" | ||
|
||
def __init__(self, | ||
*, | ||
file_link: FileLink, | ||
|
@@ -74,6 +104,20 @@ def __init__(self, | |
self.column_definitions = column_definitions | ||
self.identifiers = identifiers | ||
|
||
@classmethod | ||
def _data_source_id_builder(cls, *args) -> DataSource: | ||
# TODO Figure out how to populate the column definitions | ||
return CSVDataSource( | ||
file_link=FileLink(url=args[0], filename=args[1]), | ||
column_definitions={} | ||
) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There is no obvious method to identify what the column definitions and identifiers would be from just the DataSourceId. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Would it be too disruptive to issue a warning here, so the user knows that the data source they're getting is incomplete? Actually, maybe the warning should come when they access the Either way, I'd like if we can alert them this isn't a completely accurate representation. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, using a warning makes sense. I don't think people are using CSVDataSources anymore, but... |
||
|
||
def to_data_source_id(self) -> str: | ||
"""Generate the data_source_id for this DataSource.""" | ||
return "::".join( | ||
str(x) for x in [self._data_source_type, self.file_link.url, self.file_link.filename] | ||
) | ||
|
||
|
||
class GemTableDataSource(Serializable['GemTableDataSource'], DataSource): | ||
"""A data source based on a GEM Table hosted on the data platform. | ||
|
@@ -92,13 +136,37 @@ class GemTableDataSource(Serializable['GemTableDataSource'], DataSource): | |
table_id = properties.UUID("table_id") | ||
table_version = properties.Integer("table_version") | ||
|
||
_data_source_type = "gemd" | ||
|
||
def __init__(self, | ||
*, | ||
table_id: UUID, | ||
table_version: Union[int, str]): | ||
self.table_id: UUID = table_id | ||
self.table_version: Union[int, str] = table_version | ||
|
||
@classmethod | ||
def _data_source_id_builder(cls, *args) -> DataSource: | ||
return GemTableDataSource(table_id=UUID(args[0]), table_version=args[1]) | ||
|
||
def to_data_source_id(self) -> str: | ||
"""Generate the data_source_id for this DataSource.""" | ||
return "::".join( | ||
str(x) for x in [self._data_source_type, self.table_id, self.table_version] | ||
) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nitpick: what's the benefit of this over an f-string?
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. My thought when writing was that we have a series of identifiers joined by |
||
|
||
@classmethod | ||
def from_gemtable(cls, table: GemTable) -> "GemTableDataSource": | ||
"""Generate a DataSource that corresponds to a GemTable. | ||
|
||
Parameters | ||
---------- | ||
table: GemTable | ||
The GemTable object to reference | ||
|
||
""" | ||
return GemTableDataSource(table_id=table.uid, table_version=table.version) | ||
|
||
|
||
class ExperimentDataSourceRef(Serializable['ExperimentDataSourceRef'], DataSource): | ||
"""A reference to a data source based on an experiment result hosted on the data platform. | ||
|
@@ -113,5 +181,42 @@ class ExperimentDataSourceRef(Serializable['ExperimentDataSourceRef'], DataSourc | |
typ = properties.String('type', default='experiments_data_source', deserializable=False) | ||
datasource_id = properties.UUID("datasource_id") | ||
|
||
_data_source_type = "experiments" | ||
|
||
def __init__(self, *, datasource_id: UUID): | ||
self.datasource_id: UUID = datasource_id | ||
|
||
@classmethod | ||
def _data_source_id_builder(cls, *args) -> DataSource: | ||
return ExperimentDataSourceRef(datasource_id=UUID(args[0])) | ||
|
||
def to_data_source_id(self) -> str: | ||
"""Generate the data_source_id for this DataSource.""" | ||
return "::".join(str(x) for x in [self._data_source_type, self.datasource_id]) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same nitpick as above. |
||
|
||
|
||
class SnapshotDataSource(Serializable['SnapshotDataSource'], DataSource): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This will presumably be useful soon, since it's the next step after Multistep Materials tables. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, Bill and I were just talking about what's needed to properly support training, etc on snapshots. |
||
"""A reference to a data source based on a Snapshot on the data platform. | ||
|
||
Parameters | ||
---------- | ||
snapshot_id: UUID | ||
Unique identifier for the Snapshot Data Source | ||
|
||
""" | ||
|
||
typ = properties.String('type', default='snapshot_data_source', deserializable=False) | ||
snapshot_id = properties.UUID("snapshot_id") | ||
|
||
_data_source_type = "snapshot" | ||
|
||
def __init__(self, *, snapshot_id: UUID): | ||
self.snapshot_id = snapshot_id | ||
|
||
@classmethod | ||
def _data_source_id_builder(cls, *args) -> DataSource: | ||
return SnapshotDataSource(snapshot_id=UUID(args[0])) | ||
|
||
def to_data_source_id(self) -> str: | ||
"""Generate the data_source_id for this DataSource.""" | ||
return "::".join(str(x) for x in [self._data_source_type, self.snapshot_id]) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same as above. |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,6 +3,7 @@ | |
|
||
from citrine._rest.resource import Resource | ||
from citrine._serialization import properties | ||
from citrine.informatics.data_sources import DataSource | ||
from citrine.informatics.workflows.workflow import Workflow | ||
from citrine.resources.design_execution import DesignExecutionCollection | ||
from citrine._rest.ai_resource_metadata import AIResourceMetadata | ||
|
@@ -31,11 +32,12 @@ class DesignWorkflow(Resource['DesignWorkflow'], Workflow, AIResourceMetadata): | |
design_space_id = properties.Optional(properties.UUID, 'design_space_id') | ||
predictor_id = properties.Optional(properties.UUID, 'predictor_id') | ||
predictor_version = properties.Optional( | ||
properties.Union([properties.Integer(), properties.String()]), 'predictor_version') | ||
properties.Union([properties.Integer, properties.String]), 'predictor_version') | ||
branch_root_id: Optional[UUID] = properties.Optional(properties.UUID, 'branch_root_id') | ||
""":Optional[UUID]: Root ID of the branch that contains this workflow.""" | ||
branch_version: Optional[int] = properties.Optional(properties.Integer, 'branch_version') | ||
""":Optional[int]: Version number of the branch that contains this workflow.""" | ||
data_source = properties.Optional(properties.Object(DataSource), "data_source") | ||
|
||
status_description = properties.String('status_description', serializable=False) | ||
""":str: more detailed description of the workflow's status""" | ||
|
@@ -50,20 +52,54 @@ def __init__(self, | |
design_space_id: Optional[UUID] = None, | ||
predictor_id: Optional[UUID] = None, | ||
predictor_version: Optional[Union[int, str]] = None, | ||
data_source: Optional[DataSource] = None, | ||
description: Optional[str] = None): | ||
self.name = name | ||
self.design_space_id = design_space_id | ||
self.predictor_id = predictor_id | ||
self.predictor_version = predictor_version | ||
self.data_source = data_source | ||
self.description = description | ||
|
||
def __str__(self): | ||
return '<DesignWorkflow {!r}>'.format(self.name) | ||
|
||
@classmethod | ||
def _pre_build(cls, data: dict) -> dict: | ||
"""Run data modification before building.""" | ||
data_source_id = data.pop("data_source_id", None) | ||
if data_source_id is not None: | ||
data["data_source"] = DataSource.from_data_source_id(data_source_id).dump() | ||
return data | ||
|
||
def _post_dump(self, data: dict) -> dict: | ||
"""Run data modification after dumping.""" | ||
data_source = data.pop("data_source", None) | ||
if data_source is not None: | ||
data["data_source_id"] = DataSource.build(data_source).to_data_source_id() | ||
else: | ||
data["data_source_id"] = None | ||
return data | ||
|
||
Comment on lines
+67
to
+83
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This uses the existing serde hooks to filter out |
||
@property | ||
def design_executions(self) -> DesignExecutionCollection: | ||
"""Return a resource representing all visible executions of this workflow.""" | ||
if getattr(self, 'project_id', None) is None: | ||
raise AttributeError('Cannot initialize execution without project reference!') | ||
return DesignExecutionCollection( | ||
project_id=self.project_id, session=self._session, workflow_id=self.uid) | ||
|
||
@property | ||
def data_source_id(self) -> Optional[str]: | ||
"""A resource referencing the workflow's data source.""" | ||
if self.data_source is None: | ||
return None | ||
else: | ||
return self.data_source.to_data_source_id() | ||
|
||
@data_source_id.setter | ||
def data_source_id(self, value: Optional[str]): | ||
if value is None: | ||
self.data_source = None | ||
else: | ||
self.data_source = DataSource.from_data_source_id(value) |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -949,16 +949,3 @@ def predictor_evaluation_workflow_dict(generic_entity, example_cv_evaluator_dict | |
"evaluators": [example_cv_evaluator_dict, example_holdout_evaluator_dict] | ||
}) | ||
return ret | ||
|
||
@pytest.fixture | ||
def design_workflow_dict(generic_entity): | ||
ret = generic_entity.copy() | ||
ret.update({ | ||
"name": "Example Design Workflow", | ||
"description": "A description! Of the Design Workflow! So you know what it's for!", | ||
"design_space_id": str(uuid.uuid4()), | ||
"predictor_id": str(uuid.uuid4()), | ||
"predictor_version": random.randint(1, 10), | ||
"branch_id": str(uuid.uuid4()), | ||
}) | ||
return ret | ||
Comment on lines
-952
to
-964
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Having two different DesignWorkflow test objects (a fixture & a Factory) made testing on the updated object complicated. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nitpick: maybe lift this list to the class level, as it's duplicated above.