From 45ce05e082993b9751b5a01b87ac1d6848193d58 Mon Sep 17 00:00:00 2001 From: Ryota Egashira Date: Thu, 13 Feb 2025 13:13:18 -0800 Subject: [PATCH 01/59] feat(ingestion) Adding vertexAI ingestion source --- metadata-ingestion/setup.py | 2 + .../src/datahub/ingestion/source/vertexai.py | 185 ++++++++++++++++++ .../tests/unit/test_vertexai_source.py | 135 +++++++++++++ 3 files changed, 322 insertions(+) create mode 100644 metadata-ingestion/src/datahub/ingestion/source/vertexai.py create mode 100644 metadata-ingestion/tests/unit/test_vertexai_source.py diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index 3bf8917f23f891..edc8afdb220a8f 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -528,6 +528,7 @@ "sigma": sqlglot_lib | {"requests"}, "sac": sac, "neo4j": {"pandas", "neo4j"}, + "vertexai": {"google-cloud-aiplatform>=1.80.0"}, } # This is mainly used to exclude plugins from the Docker image. @@ -796,6 +797,7 @@ "sac = datahub.ingestion.source.sac.sac:SACSource", "cassandra = datahub.ingestion.source.cassandra.cassandra:CassandraSource", "neo4j = datahub.ingestion.source.neo4j.neo4j_source:Neo4jSource", + "vertexai = datahub.ingestion.source.vertexai:VertexAISource", ], "datahub.ingestion.transformer.plugins": [ "pattern_cleanup_ownership = datahub.ingestion.transformer.pattern_cleanup_ownership:PatternCleanUpOwnership", diff --git a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py new file mode 100644 index 00000000000000..03d59a022e8472 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py @@ -0,0 +1,185 @@ +from dataclasses import dataclass +from typing import Any, Callable, Iterable, Optional, TypeVar, Union +import logging + +from google.cloud import aiplatform +from google.cloud.aiplatform.models import Model, VersionInfo + +from pydantic.fields import Field + +import datahub.emitter.mce_builder as builder +from datahub.configuration.source_common import EnvConfigMixin +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.api.decorators import ( + SupportStatus, + capability, + config_class, + platform_name, + support_status, +) +from datahub.ingestion.api.source import Source, SourceCapability, SourceReport +from datahub.ingestion.api.workunit import MetadataWorkUnit +from datahub.metadata.schema_classes import ( + GlobalTagsClass, + MLHyperParamClass, + MLMetricClass, + MLModelGroupPropertiesClass, + MLModelPropertiesClass, + TagAssociationClass, + TagPropertiesClass, + VersionTagClass, + _Aspect, +) + +T = TypeVar("T") + +logger = logging.getLogger(__name__) +class VertexAIConfig(EnvConfigMixin): + + project_id: str = Field( + description=( + "Project ID" + ) + ) + region: str = Field( + description=( + "Region" + ), + ) + bucket_uri: Optional[str] = Field( + default=None, + description=( + "Bucket URI" + ), + ) + + model_name_separator: str = Field( + default="_", + description="A string which separates model name from its version (e.g. 
model_1 or model-1)", + ) + + +@platform_name("vertexai") +@config_class(VertexAIConfig) +@support_status(SupportStatus.TESTING) +@capability( + SourceCapability.DESCRIPTIONS, + "Extract descriptions for vertexai Registered Models and Model Versions", +) +@capability(SourceCapability.TAGS, "Extract tags for vertexai Registered Model Stages") +class VertexAISource(Source): + platform = "vertexai" + + def __init__(self, ctx: PipelineContext, config: VertexAIConfig): + super().__init__(ctx) + self.config = config + self.report = SourceReport() + aiplatform.init(project=config.project_id, location=config.region) + self.client = aiplatform + + + def get_report(self) -> SourceReport: + return self.report + + def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: + yield from self._get_ml_model_workunits() + + def _create_workunit(self, urn: str, aspect: _Aspect) -> MetadataWorkUnit: + """ + Utility to create an MCP workunit. + """ + return MetadataChangeProposalWrapper( + entityUrn=urn, + aspect=aspect, + ).as_workunit() + + def _get_ml_model_workunits(self) -> Iterable[MetadataWorkUnit]: + """ + Traverse each Registered Model in Model Registry and generate a corresponding workunit. + """ + registered_models = self.client.Model.list() + for model in registered_models: + yield self._get_ml_group_workunit(model) + model_versions = model.versioning_registry.list_versions() + for model_version in model_versions: + yield self._get_ml_model_properties_workunit( + registered_model=model, + model_version=model_version + ) + + def _get_ml_group_workunit( + self, + registered_model: Model, + ) -> MetadataWorkUnit: + """ + Generate an MLModelGroup workunit for an MLflow Registered Model. + """ + ml_model_group_urn = self._make_ml_model_group_urn(registered_model) + ml_model_group_properties = MLModelGroupPropertiesClass( + name=registered_model.name, + description=registered_model.description, + createdAt=registered_model.create_time, + ) + wu = self._create_workunit( + urn=ml_model_group_urn, + aspect=ml_model_group_properties, + ) + return wu + + def _make_ml_model_group_urn(self, registered_model: Model) -> str: + urn = builder.make_ml_model_group_urn( + platform=self.platform, + group_name=registered_model.name, + env=self.config.env, + ) + return urn + + def _get_ml_model_properties_workunit( + self, + registered_model: Model, + model_version: VersionInfo, + ) -> MetadataWorkUnit: + """ + Generate an MLModel workunit for an Vertex Model Version. + Every Model Version is a DataHub MLModel entity associated with an MLModelGroup corresponding to a Registered Model. 
+ """ + ml_model_group_urn = self._make_ml_model_group_urn(registered_model) + ml_model_urn = self._make_ml_model_urn(model_version) + + training_job = None + training_metrics = None + hyperparams = None + try: + job = registered_model.training_job + training_job = [job] + except RuntimeError as e: + logger.info("No training job found for model %s", registered_model.name) + + ml_model_properties = MLModelPropertiesClass( + name=registered_model.name, + description=model_version.version_description, + date=model_version.version_create_time, + version=VersionTagClass(versionTag=str(model_version.version_id)), + hyperParams=hyperparams, + trainingMetrics=training_metrics, + trainingJobs=training_job, + groups=[ml_model_group_urn], + # tags=list(model_version.tags.keys()), + # customProperties=model_version.tags, + # externalUrl=self._make_external_url(model_version), + ) + + wu = self._create_workunit(urn=ml_model_urn, aspect=ml_model_properties) + return wu + + def _make_ml_model_urn(self, model_version: VersionInfo) -> str: + urn = builder.make_ml_model_urn( + platform=self.platform, + model_name=f"{model_version.model_display_name}{self.config.model_name_separator}{model_version.version_id}", + env=self.config.env, + ) + return urn + + + diff --git a/metadata-ingestion/tests/unit/test_vertexai_source.py b/metadata-ingestion/tests/unit/test_vertexai_source.py new file mode 100644 index 00000000000000..121bab6eb7da30 --- /dev/null +++ b/metadata-ingestion/tests/unit/test_vertexai_source.py @@ -0,0 +1,135 @@ +from typing import List +from unittest.mock import MagicMock, patch + +import pytest +from google.cloud import aiplatform +from google.cloud.aiplatform.models import Model, VersionInfo +from google.protobuf import timestamp_pb2 + +from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.source.vertexai import VertexAISource, VertexAIConfig + +@pytest.fixture +def project_id() -> str: + return "acryl-poc" + +@pytest.fixture +def region() -> str: + return "us-west2" + +@pytest.fixture +def source(project_id: str, region: str) -> VertexAISource: + return VertexAISource( + ctx=PipelineContext(run_id="vertexai-source-test"), + config=VertexAIConfig(project_id=project_id, region=region), + ) + + +@pytest.fixture +def real_model(source: VertexAISource) -> Model: + """ + Fixture for the model that is actually registered in the Vertex AI Model Registry + use mock_models for local testing purpose + """ + model_name = "projects/872197881936/locations/us-west2/models/3583871344875405312" + return Model(model_name=model_name) + + +@pytest.fixture +def model_version( + source: VertexAISource, + real_model: Model, +) -> VersionInfo: + version = "1" + return VersionInfo( + version_id=version, + version_description="test", + # how to create timestamp_pb2.Timestamp using current time? 
+ version_create_time=timestamp_pb2.Timestamp().GetCurrentTime(), + version_update_time=timestamp_pb2.Timestamp().GetCurrentTime(), + model_display_name=real_model.name, + model_resource_name=real_model.resource_name, + ) +@pytest.fixture +def mock_models()-> List[Model]: + mock_model_1 = MagicMock(spec=Model) + mock_model_1.name = "mock_prediction_model_1" + mock_model_1.create_time = timestamp_pb2.Timestamp().GetCurrentTime() + mock_model_1.update_time = timestamp_pb2.Timestamp().GetCurrentTime() + mock_model_1.version_id = "1" + mock_model_1.display_name = "mock_prediction_model_1_display_name" + + mock_model_2 = MagicMock(spec=Model) + mock_model_2.name = "mock_prediction_model_2" + + mock_model_2.create_time = timestamp_pb2.Timestamp().GetCurrentTime() + mock_model_2.update_time = timestamp_pb2.Timestamp().GetCurrentTime() + mock_model_2.version_id = "1" + mock_model_2.display_name = "mock_prediction_model_2_display_name" + + return [mock_model_1, mock_model_2] + +@pytest.fixture +def mock_model(): + mock_model_1 = MagicMock(spec=Model) + mock_model_1.name = "mock_prediction_model_1" + mock_model_1.create_time = timestamp_pb2.Timestamp().GetCurrentTime() + mock_model_1.update_time = timestamp_pb2.Timestamp().GetCurrentTime() + mock_model_1.version_id = "1" + mock_model_1.display_name = "mock_prediction_model_1_display_name" + return mock_model_1 + +def test_mock_model_workunit(source, mock_model, model_version): + wu = source._get_ml_model_properties_workunit( + registered_model=mock_model, + model_version=model_version, + ) + aspect = wu.metadata.aspect + # aspect is MLModelPropertiesClass + print(aspect) + assert aspect.description == model_version.version_description + assert aspect.date == model_version.version_create_time + +@pytest.mark.skip(reason="Skipping, this is for debugging purpose") +def test_real_model_workunit(source, real_model, model_version): + """ + Disabled as default + Use real model registered in the Vertex AI Model Registry + """ + wu = source._get_ml_model_properties_workunit( + registered_model=real_model, + model_version=model_version, + ) + aspect = wu.metadata.aspect + # aspect is MLModelPropertiesClass + assert aspect.description == model_version.version_description + assert aspect.date == model_version.version_create_time + assert aspect.hyperParams is None + assert aspect.trainingMetrics is None + + +@patch('google.cloud.aiplatform.Model.list') +def test_mock_models_workunits(mock_list, source, real_model, model_version, mock_models): + mock_list.return_value=mock_models + wcs = [wc for wc in source._get_ml_model_workunits()] + assert len(wcs) == 2 + # aspect is MLModelGroupPropertiesClass + assert wcs[0].metadata.aspect.name == mock_models[0].name + assert wcs[0].metadata.aspect.description == mock_models[0].description + assert wcs[0].metadata.aspect.createdAt == mock_models[0].create_time + assert wcs[1].metadata.aspect.name == mock_models[1].name + assert wcs[1].metadata.aspect.description == mock_models[1].description + assert wcs[1].metadata.aspect.createdAt == mock_models[1].create_time + + +def test_config_model_name_separator(source, model_version): + name_version_sep = "+" + source.config.model_name_separator = name_version_sep + expected_model_name = ( + f"{model_version.model_display_name}{name_version_sep}{model_version.version_id}" + ) + expected_urn = f"urn:li:mlModel:(urn:li:dataPlatform:vertexai,{expected_model_name},{source.config.env})" + + urn = source._make_ml_model_urn(model_version) + + assert urn == expected_urn From 
9a1355d0067460f31f09fb9bf3d6f76c400362a8 Mon Sep 17 00:00:00 2001 From: Ryota Egashira Date: Thu, 13 Feb 2025 13:21:31 -0800 Subject: [PATCH 02/59] lintfix --- .../src/datahub/ingestion/source/vertexai.py | 67 +++++++------------ .../tests/unit/test_vertexai_source.py | 39 ++++++----- 2 files changed, 48 insertions(+), 58 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py index 03d59a022e8472..cbcee2ddc439bd 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py +++ b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py @@ -1,10 +1,8 @@ -from dataclasses import dataclass -from typing import Any, Callable, Iterable, Optional, TypeVar, Union import logging +from typing import Iterable, Optional, TypeVar from google.cloud import aiplatform from google.cloud.aiplatform.models import Model, VersionInfo - from pydantic.fields import Field import datahub.emitter.mce_builder as builder @@ -21,13 +19,8 @@ from datahub.ingestion.api.source import Source, SourceCapability, SourceReport from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.metadata.schema_classes import ( - GlobalTagsClass, - MLHyperParamClass, - MLMetricClass, MLModelGroupPropertiesClass, MLModelPropertiesClass, - TagAssociationClass, - TagPropertiesClass, VersionTagClass, _Aspect, ) @@ -35,23 +28,16 @@ T = TypeVar("T") logger = logging.getLogger(__name__) -class VertexAIConfig(EnvConfigMixin): - project_id: str = Field( - description=( - "Project ID" - ) - ) + +class VertexAIConfig(EnvConfigMixin): + project_id: str = Field(description=("Project ID")) region: str = Field( - description=( - "Region" - ), + description=("Region"), ) bucket_uri: Optional[str] = Field( default=None, - description=( - "Bucket URI" - ), + description=("Bucket URI"), ) model_name_separator: str = Field( @@ -78,7 +64,6 @@ def __init__(self, ctx: PipelineContext, config: VertexAIConfig): aiplatform.init(project=config.project_id, location=config.region) self.client = aiplatform - def get_report(self) -> SourceReport: return self.report @@ -104,22 +89,22 @@ def _get_ml_model_workunits(self) -> Iterable[MetadataWorkUnit]: model_versions = model.versioning_registry.list_versions() for model_version in model_versions: yield self._get_ml_model_properties_workunit( - registered_model=model, - model_version=model_version + model=model, model_version=model_version ) def _get_ml_group_workunit( self, - registered_model: Model, + model: Model, ) -> MetadataWorkUnit: """ - Generate an MLModelGroup workunit for an MLflow Registered Model. + Generate an MLModelGroup workunit for a VertexAI Model. 
+ """ - ml_model_group_urn = self._make_ml_model_group_urn(registered_model) + ml_model_group_urn = self._make_ml_model_group_urn(model) ml_model_group_properties = MLModelGroupPropertiesClass( - name=registered_model.name, - description=registered_model.description, - createdAt=registered_model.create_time, + name=model.name, + description=model.description, + createdAt=model.create_time, ) wu = self._create_workunit( urn=ml_model_group_urn, @@ -127,37 +112,38 @@ def _get_ml_group_workunit( ) return wu - def _make_ml_model_group_urn(self, registered_model: Model) -> str: + def _make_ml_model_group_urn(self, model: Model) -> str: urn = builder.make_ml_model_group_urn( platform=self.platform, - group_name=registered_model.name, + group_name=model.name, env=self.config.env, ) return urn def _get_ml_model_properties_workunit( self, - registered_model: Model, + model: Model, model_version: VersionInfo, ) -> MetadataWorkUnit: """ - Generate an MLModel workunit for an Vertex Model Version. - Every Model Version is a DataHub MLModel entity associated with an MLModelGroup corresponding to a Registered Model. + Generate an MLModel workunit for an VertexAI Model Version. + Every Model Version is a DataHub MLModel entity associated with an MLModelGroup + corresponding to a Registered Model in VertexAI Model Registry. """ - ml_model_group_urn = self._make_ml_model_group_urn(registered_model) + ml_model_group_urn = self._make_ml_model_group_urn(model) ml_model_urn = self._make_ml_model_urn(model_version) training_job = None training_metrics = None hyperparams = None try: - job = registered_model.training_job + job = model.training_job training_job = [job] - except RuntimeError as e: - logger.info("No training job found for model %s", registered_model.name) + except RuntimeError: + logger.info("No training job found for model %s", model.name) ml_model_properties = MLModelPropertiesClass( - name=registered_model.name, + name=model.name, description=model_version.version_description, date=model_version.version_create_time, version=VersionTagClass(versionTag=str(model_version.version_id)), @@ -180,6 +166,3 @@ def _make_ml_model_urn(self, model_version: VersionInfo) -> str: env=self.config.env, ) return urn - - - diff --git a/metadata-ingestion/tests/unit/test_vertexai_source.py b/metadata-ingestion/tests/unit/test_vertexai_source.py index 121bab6eb7da30..d4b8eb53371e1c 100644 --- a/metadata-ingestion/tests/unit/test_vertexai_source.py +++ b/metadata-ingestion/tests/unit/test_vertexai_source.py @@ -2,21 +2,23 @@ from unittest.mock import MagicMock, patch import pytest -from google.cloud import aiplatform from google.cloud.aiplatform.models import Model, VersionInfo from google.protobuf import timestamp_pb2 from datahub.ingestion.api.common import PipelineContext -from datahub.ingestion.source.vertexai import VertexAISource, VertexAIConfig +from datahub.ingestion.source.vertexai import VertexAIConfig, VertexAISource + @pytest.fixture def project_id() -> str: return "acryl-poc" + @pytest.fixture def region() -> str: return "us-west2" + @pytest.fixture def source(project_id: str, region: str) -> VertexAISource: return VertexAISource( @@ -28,8 +30,8 @@ def source(project_id: str, region: str) -> VertexAISource: @pytest.fixture def real_model(source: VertexAISource) -> Model: """ - Fixture for the model that is actually registered in the Vertex AI Model Registry - use mock_models for local testing purpose + Fixture for the model that is actually registered in the Vertex AI Model Registry + use mock_models for 
local testing purpose """ model_name = "projects/872197881936/locations/us-west2/models/3583871344875405312" return Model(model_name=model_name) @@ -38,7 +40,7 @@ def real_model(source: VertexAISource) -> Model: @pytest.fixture def model_version( source: VertexAISource, - real_model: Model, + real_model: Model, ) -> VersionInfo: version = "1" return VersionInfo( @@ -50,8 +52,10 @@ def model_version( model_display_name=real_model.name, model_resource_name=real_model.resource_name, ) + + @pytest.fixture -def mock_models()-> List[Model]: +def mock_models() -> List[Model]: mock_model_1 = MagicMock(spec=Model) mock_model_1.name = "mock_prediction_model_1" mock_model_1.create_time = timestamp_pb2.Timestamp().GetCurrentTime() @@ -69,6 +73,7 @@ def mock_models()-> List[Model]: return [mock_model_1, mock_model_2] + @pytest.fixture def mock_model(): mock_model_1 = MagicMock(spec=Model) @@ -79,9 +84,10 @@ def mock_model(): mock_model_1.display_name = "mock_prediction_model_1_display_name" return mock_model_1 + def test_mock_model_workunit(source, mock_model, model_version): wu = source._get_ml_model_properties_workunit( - registered_model=mock_model, + model=mock_model, model_version=model_version, ) aspect = wu.metadata.aspect @@ -90,14 +96,15 @@ def test_mock_model_workunit(source, mock_model, model_version): assert aspect.description == model_version.version_description assert aspect.date == model_version.version_create_time + @pytest.mark.skip(reason="Skipping, this is for debugging purpose") def test_real_model_workunit(source, real_model, model_version): """ - Disabled as default - Use real model registered in the Vertex AI Model Registry + Disabled as default + Use real model registered in the Vertex AI Model Registry """ wu = source._get_ml_model_properties_workunit( - registered_model=real_model, + model=real_model, model_version=model_version, ) aspect = wu.metadata.aspect @@ -108,9 +115,11 @@ def test_real_model_workunit(source, real_model, model_version): assert aspect.trainingMetrics is None -@patch('google.cloud.aiplatform.Model.list') -def test_mock_models_workunits(mock_list, source, real_model, model_version, mock_models): - mock_list.return_value=mock_models +@patch("google.cloud.aiplatform.Model.list") +def test_mock_models_workunits( + mock_list, source, real_model, model_version, mock_models +): + mock_list.return_value = mock_models wcs = [wc for wc in source._get_ml_model_workunits()] assert len(wcs) == 2 # aspect is MLModelGroupPropertiesClass @@ -125,9 +134,7 @@ def test_mock_models_workunits(mock_list, source, real_model, model_version, moc def test_config_model_name_separator(source, model_version): name_version_sep = "+" source.config.model_name_separator = name_version_sep - expected_model_name = ( - f"{model_version.model_display_name}{name_version_sep}{model_version.version_id}" - ) + expected_model_name = f"{model_version.model_display_name}{name_version_sep}{model_version.version_id}" expected_urn = f"urn:li:mlModel:(urn:li:dataPlatform:vertexai,{expected_model_name},{source.config.env})" urn = source._make_ml_model_urn(model_version) From 04315d4383c57c783618b2ec63a4f5e9e2dd5d84 Mon Sep 17 00:00:00 2001 From: Ryota Egashira Date: Thu, 13 Feb 2025 13:26:32 -0800 Subject: [PATCH 03/59] minor comment change --- .../src/datahub/ingestion/source/vertexai.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py index 
cbcee2ddc439bd..170031c420a05e 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py +++ b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py @@ -31,13 +31,13 @@ class VertexAIConfig(EnvConfigMixin): - project_id: str = Field(description=("Project ID")) + project_id: str = Field(description=("Project ID in Google Cloud Platform")) region: str = Field( - description=("Region"), + description=("Region of your project in Google Cloud Platform"), ) bucket_uri: Optional[str] = Field( default=None, - description=("Bucket URI"), + description=("Bucket URI used in your GCP project"), ) model_name_separator: str = Field( @@ -53,7 +53,7 @@ class VertexAIConfig(EnvConfigMixin): SourceCapability.DESCRIPTIONS, "Extract descriptions for vertexai Registered Models and Model Versions", ) -@capability(SourceCapability.TAGS, "Extract tags for vertexai Registered Model Stages") +@capability(SourceCapability.TAGS, "Extract tags for VertexAI Registered Model Stages") class VertexAISource(Source): platform = "vertexai" @@ -81,7 +81,7 @@ def _create_workunit(self, urn: str, aspect: _Aspect) -> MetadataWorkUnit: def _get_ml_model_workunits(self) -> Iterable[MetadataWorkUnit]: """ - Traverse each Registered Model in Model Registry and generate a corresponding workunit. + Fetch List of Models in Model Registry and generate a corresponding workunit. """ registered_models = self.client.Model.list() for model in registered_models: From e3a17b562e329735038d99e2f9d3aa2fed2ca713 Mon Sep 17 00:00:00 2001 From: Ryota Egashira Date: Thu, 13 Feb 2025 13:31:29 -0800 Subject: [PATCH 04/59] minor --- metadata-ingestion/src/datahub/ingestion/source/vertexai.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py index 170031c420a05e..815450102ba946 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py +++ b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py @@ -37,7 +37,7 @@ class VertexAIConfig(EnvConfigMixin): ) bucket_uri: Optional[str] = Field( default=None, - description=("Bucket URI used in your GCP project"), + description=("Bucket URI used in your project"), ) model_name_separator: str = Field( From 2a5ea5838a6ac8595db4bb28a0fc1331f37e6e91 Mon Sep 17 00:00:00 2001 From: Ryota Egashira Date: Thu, 13 Feb 2025 13:51:19 -0800 Subject: [PATCH 05/59] minor change in unit test --- .../tests/unit/test_vertexai_source.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/metadata-ingestion/tests/unit/test_vertexai_source.py b/metadata-ingestion/tests/unit/test_vertexai_source.py index d4b8eb53371e1c..e7ec577da037a8 100644 --- a/metadata-ingestion/tests/unit/test_vertexai_source.py +++ b/metadata-ingestion/tests/unit/test_vertexai_source.py @@ -116,7 +116,7 @@ def test_real_model_workunit(source, real_model, model_version): @patch("google.cloud.aiplatform.Model.list") -def test_mock_models_workunits( +def test_mock_models_and_versions_workunits( mock_list, source, real_model, model_version, mock_models ): mock_list.return_value = mock_models @@ -131,6 +131,19 @@ def test_mock_models_workunits( assert wcs[1].metadata.aspect.createdAt == mock_models[1].create_time +@pytest.mark.skip(reason="Skipping, this is for debugging purpose") +def test_real_models_and_versions_workunits(source): + """ + Disabled as default + Use real model registered in the Vertex AI Model Registry + """ + wcs = [wc for wc in 
source._get_ml_model_workunits()] + assert len(wcs) == 2 + # aspect is MLModelGroupPropertiesClass or MLModelPropertiesClass + # assert using real name in GCP model registry + # assert wcs[0].metadata.aspect.name == "mock_prediction_model_1" + + def test_config_model_name_separator(source, model_version): name_version_sep = "+" source.config.model_name_separator = name_version_sep From 3739c2040dcde3e95ebf248d39c0030a5ee16a1f Mon Sep 17 00:00:00 2001 From: Ryota Egashira Date: Tue, 18 Feb 2025 14:28:00 -0800 Subject: [PATCH 06/59] Adding sources and documents --- .../app/ingest/source/builder/sources.json | 7 ++ .../docs/sources/vertexai/README.md | 1 + .../docs/sources/vertexai/vertexai_pre.md | 109 ++++++++++++++++++ .../docs/sources/vertexai/vertexai_recipe.yml | 10 ++ 4 files changed, 127 insertions(+) create mode 100644 metadata-ingestion/docs/sources/vertexai/README.md create mode 100644 metadata-ingestion/docs/sources/vertexai/vertexai_pre.md create mode 100644 metadata-ingestion/docs/sources/vertexai/vertexai_recipe.yml diff --git a/datahub-web-react/src/app/ingest/source/builder/sources.json b/datahub-web-react/src/app/ingest/source/builder/sources.json index 102cce0f491e36..43fd1c28ea94f7 100644 --- a/datahub-web-react/src/app/ingest/source/builder/sources.json +++ b/datahub-web-react/src/app/ingest/source/builder/sources.json @@ -333,5 +333,12 @@ "description": "Import Nodes and Relationships from Neo4j.", "docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/neo4j/", "recipe": "source:\n type: 'neo4j'\n config:\n uri: 'neo4j+ssc://host:7687'\n username: 'neo4j'\n password: 'password'\n env: 'PROD'\n\nsink:\n type: \"datahub-rest\"\n config:\n server: 'http://localhost:8080'" + }, + { + "urn": "urn:li:dataPlatform:vertexai", + "name": "vertexai", + "displayName": "VertexAI", + "docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/vertexai/", + "recipe": "source:\n type: vertexai\n config:\n tracking_uri: tracking_uri" } ] diff --git a/metadata-ingestion/docs/sources/vertexai/README.md b/metadata-ingestion/docs/sources/vertexai/README.md new file mode 100644 index 00000000000000..07bc128a6007d6 --- /dev/null +++ b/metadata-ingestion/docs/sources/vertexai/README.md @@ -0,0 +1 @@ +Ingesting metadata from VertexAI requires using the **vertexai** module. diff --git a/metadata-ingestion/docs/sources/vertexai/vertexai_pre.md b/metadata-ingestion/docs/sources/vertexai/vertexai_pre.md new file mode 100644 index 00000000000000..d6efe9334f7567 --- /dev/null +++ b/metadata-ingestion/docs/sources/vertexai/vertexai_pre.md @@ -0,0 +1,109 @@ +### Prerequisites +To understand how BigQuery ingestion needs to be set up, first familiarize yourself with the concepts in the diagram below: +

+_(diagram images not included)_
+
+ +There are two important concepts to understand and identify: +- *Extractor Project*: This is the project associated with a service-account, whose credentials you will be configuring in the connector. The connector uses this service-account to run jobs (including queries) within the project. +- *Bigquery Projects* are the projects from which table metadata, lineage, usage, and profiling data need to be collected. By default, the extractor project is included in the list of projects that DataHub collects metadata from, but you can control that by passing in a specific list of project ids that you want to collect metadata from. Read the configuration section below to understand how to limit the list of projects that DataHub extracts metadata from. + +#### Create a datahub profile in GCP +1. Create a custom role for datahub as per [BigQuery docs](https://cloud.google.com/iam/docs/creating-custom-roles#creating_a_custom_role). +2. Follow the sections below to grant permissions to this role on this project and other projects. + +##### Basic Requirements (needed for metadata ingestion) +1. Identify your Extractor Project where the service account will run queries to extract metadata. + +| permission                       | Description                                                                                                                         | Capability                                                               | +|----------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------| +| `bigquery.jobs.create`           | Run jobs (e.g. queries) within the project. *This only needs for the extractor project where the service account belongs*           |                                                                                                               | +| `bigquery.jobs.list`             | Manage the queries that the service account has sent. *This only needs for the extractor project where the service account belongs* |                                                                                                               | +| `bigquery.readsessions.create`   | Create a session for streaming large results. *This only needs for the extractor project where the service account belongs*         |                                                                                                               | +| `bigquery.readsessions.getData` | Get data from the read session. *This only needs for the extractor project where the service account belongs*                       | +2. Grant the following permissions to the Service Account on every project where you would like to extract metadata from + +:::info + +If you have multiple projects in your BigQuery setup, the role should be granted these permissions in each of the projects. + +::: +| Permission | Description | Capability | Default GCP Role Which Contains This Permission | +|----------------------------------|-----------------------------------------------------------------------------------------------------------------|-------------------------------------|---------------------------------------------------------------------------| +| `bigquery.datasets.get` | Retrieve metadata about a dataset. 
| Table Metadata Extraction | [roles/bigquery.metadataViewer](https://cloud.google.com/bigquery/docs/access-control#bigquery.metadataViewer) | +| `bigquery.datasets.getIamPolicy` | Read a dataset's IAM permissions. | Table Metadata Extraction | [roles/bigquery.metadataViewer](https://cloud.google.com/bigquery/docs/access-control#bigquery.metadataViewer) | +| `bigquery.tables.list` | List BigQuery tables. | Table Metadata Extraction | [roles/bigquery.metadataViewer](https://cloud.google.com/bigquery/docs/access-control#bigquery.metadataViewer) | +| `bigquery.tables.get` | Retrieve metadata for a table. | Table Metadata Extraction | [roles/bigquery.metadataViewer](https://cloud.google.com/bigquery/docs/access-control#bigquery.metadataViewer) | +| `bigquery.routines.get` | Get Routines. Needs to retrieve metadata for a table from system table. | Table Metadata Extraction | [roles/bigquery.metadataViewer](https://cloud.google.com/bigquery/docs/access-control#bigquery.metadataViewer) | +| `bigquery.routines.list` | List Routines. Needs to retrieve metadata for a table from system table. | Table Metadata Extraction | [roles/bigquery.metadataViewer](https://cloud.google.com/bigquery/docs/access-control#bigquery.metadataViewer) | +| `resourcemanager.projects.get` | Retrieve project names and metadata. | Table Metadata Extraction | [roles/bigquery.metadataViewer](https://cloud.google.com/bigquery/docs/access-control#bigquery.metadataViewer) | +| `bigquery.jobs.listAll` | List all jobs (queries) submitted by any user. Needs for Lineage extraction. | Lineage Extraction/Usage Extraction | [roles/bigquery.resourceViewer](https://cloud.google.com/bigquery/docs/access-control#bigquery.resourceViewer) | +| `logging.logEntries.list` | Fetch log entries for lineage/usage data. Not required if `use_exported_bigquery_audit_metadata` is enabled. | Lineage Extraction/Usage Extraction | [roles/logging.privateLogViewer](https://cloud.google.com/logging/docs/access-control#logging.privateLogViewer) | +| `logging.privateLogEntries.list` | Fetch log entries for lineage/usage data. Not required if `use_exported_bigquery_audit_metadata` is enabled. | Lineage Extraction/Usage Extraction | [roles/logging.privateLogViewer](https://cloud.google.com/logging/docs/access-control#logging.privateLogViewer) | +| `bigquery.tables.getData` | Access table data to extract storage size, last updated at, data profiles etc. | Profiling | | +| `datacatalog.policyTags.get` | *Optional* Get policy tags for columns with associated policy tags. This permission is required only if `extract_policy_tags_from_catalog` is enabled. | Policy Tag Extraction | [roles/datacatalog.viewer](https://cloud.google.com/data-catalog/docs/access-control#permissions-and-roles) | + + +#### Create a service account in the Extractor Project + +1. Setup a ServiceAccount as per [BigQuery docs](https://cloud.google.com/iam/docs/creating-managing-service-accounts#iam-service-accounts-create-console) + and assign the previously created role to this service account. +2. Download a service account JSON keyfile. 
+ Example credential file: + +```json +{ + "type": "service_account", + "project_id": "project-id-1234567", + "private_key_id": "d0121d0000882411234e11166c6aaa23ed5d74e0", + "private_key": "-----BEGIN PRIVATE KEY-----\nMIIyourkey\n-----END PRIVATE KEY-----", + "client_email": "test@suppproject-id-1234567.iam.gserviceaccount.com", + "client_id": "113545814931671546333", + "auth_uri": "https://accounts.google.com/o/oauth2/auth", + "token_uri": "https://oauth2.googleapis.com/token", + "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs", + "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/test%suppproject-id-1234567.iam.gserviceaccount.com" +} +``` + +3. To provide credentials to the source, you can either: + + Set an environment variable: + + ```sh + $ export GOOGLE_APPLICATION_CREDENTIALS="/path/to/keyfile.json" + ``` + + _or_ + + Set credential config in your source based on the credential json file. For example: + + ```yml + credential: + project_id: project-id-1234567 + private_key_id: "d0121d0000882411234e11166c6aaa23ed5d74e0" + private_key: "-----BEGIN PRIVATE KEY-----\nMIIyourkey\n-----END PRIVATE KEY-----\n" + client_email: "test@suppproject-id-1234567.iam.gserviceaccount.com" + client_id: "123456678890" + ``` + +##### Profiling Requirements + +To profile BigQuery external tables backed by Google Drive document, you need to grant document's "Viewer" access to service account's email address (`client_email` in credentials json file). To find the Google Drive document linked to BigQuery table, open the BigQuery console, locate the needed table, select "Details" from the drop-down menu in the top-right corner and refer "Source" field . To share access of Google Drive document, open the document, click "Share" in the top-right corner, add the service account's email address that needs "Viewer" access. ![Google Drive Sharing Dialog](https://github.com/datahub-project/static-assets/raw/main/imgs/integrations/bigquery/google_drive_share.png) + +### Lineage Computation Details + +When `use_exported_bigquery_audit_metadata` is set to `true`, lineage information will be computed using exported bigquery logs. On how to setup exported bigquery audit logs, refer to the following [docs](https://cloud.google.com/bigquery/docs/reference/auditlogs#defining_a_bigquery_log_sink_using_gcloud) on BigQuery audit logs. Note that only protoPayloads with "type.googleapis.com/google.cloud.audit.BigQueryAuditMetadata" are supported by the current ingestion version. The `bigquery_audit_metadata_datasets` parameter will be used only if `use_exported_bigquery_audit_metadat` is set to `true`. + +Note: the `bigquery_audit_metadata_datasets` parameter receives a list of datasets, in the format $PROJECT.$DATASET. This way queries from a multiple number of projects can be used to compute lineage information. + +Note: Since bigquery source also supports dataset level lineage, the auth client will require additional permissions to be able to access the google audit logs. Refer the permissions section in bigquery-usage section below which also accesses the audit logs. + +### Profiling Details + +For performance reasons, we only profile the latest partition for partitioned tables and the latest shard for sharded tables. +You can set partition explicitly with `partition.partition_datetime` property if you want, though note that partition config will be applied to all partitioned tables. + +### Caveats + +- For materialized views, lineage is dependent on logs being retained. 
If your GCP logging is retained for 30 days (default) and 30 days have passed since the creation of the materialized view we won't be able to get lineage for them. diff --git a/metadata-ingestion/docs/sources/vertexai/vertexai_recipe.yml b/metadata-ingestion/docs/sources/vertexai/vertexai_recipe.yml new file mode 100644 index 00000000000000..91d8e2a86fee48 --- /dev/null +++ b/metadata-ingestion/docs/sources/vertexai/vertexai_recipe.yml @@ -0,0 +1,10 @@ +source: + type: vertexai + config: + project_id: "acryl-poc" + region: "us-west2" + +sink: + type: "datahub-rest" + config: + server: "http://localhost:8080" From 520eda61e569904bcb0f88f77e1b78b3665d291b Mon Sep 17 00:00:00 2001 From: Ryota Egashira Date: Tue, 18 Feb 2025 14:39:03 -0800 Subject: [PATCH 07/59] delete unnecessary file --- .../docs/sources/vertexai/vertexai_pre.md | 109 ------------------ 1 file changed, 109 deletions(-) delete mode 100644 metadata-ingestion/docs/sources/vertexai/vertexai_pre.md diff --git a/metadata-ingestion/docs/sources/vertexai/vertexai_pre.md b/metadata-ingestion/docs/sources/vertexai/vertexai_pre.md deleted file mode 100644 index d6efe9334f7567..00000000000000 --- a/metadata-ingestion/docs/sources/vertexai/vertexai_pre.md +++ /dev/null @@ -1,109 +0,0 @@ -### Prerequisites -To understand how BigQuery ingestion needs to be set up, first familiarize yourself with the concepts in the diagram below: -

-_(diagram images not included)_
-
- -There are two important concepts to understand and identify: -- *Extractor Project*: This is the project associated with a service-account, whose credentials you will be configuring in the connector. The connector uses this service-account to run jobs (including queries) within the project. -- *Bigquery Projects* are the projects from which table metadata, lineage, usage, and profiling data need to be collected. By default, the extractor project is included in the list of projects that DataHub collects metadata from, but you can control that by passing in a specific list of project ids that you want to collect metadata from. Read the configuration section below to understand how to limit the list of projects that DataHub extracts metadata from. - -#### Create a datahub profile in GCP -1. Create a custom role for datahub as per [BigQuery docs](https://cloud.google.com/iam/docs/creating-custom-roles#creating_a_custom_role). -2. Follow the sections below to grant permissions to this role on this project and other projects. - -##### Basic Requirements (needed for metadata ingestion) -1. Identify your Extractor Project where the service account will run queries to extract metadata. - -| permission                       | Description                                                                                                                         | Capability                                                               | -|----------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------| -| `bigquery.jobs.create`           | Run jobs (e.g. queries) within the project. *This only needs for the extractor project where the service account belongs*           |                                                                                                               | -| `bigquery.jobs.list`             | Manage the queries that the service account has sent. *This only needs for the extractor project where the service account belongs* |                                                                                                               | -| `bigquery.readsessions.create`   | Create a session for streaming large results. *This only needs for the extractor project where the service account belongs*         |                                                                                                               | -| `bigquery.readsessions.getData` | Get data from the read session. *This only needs for the extractor project where the service account belongs*                       | -2. Grant the following permissions to the Service Account on every project where you would like to extract metadata from - -:::info - -If you have multiple projects in your BigQuery setup, the role should be granted these permissions in each of the projects. - -::: -| Permission | Description | Capability | Default GCP Role Which Contains This Permission | -|----------------------------------|-----------------------------------------------------------------------------------------------------------------|-------------------------------------|---------------------------------------------------------------------------| -| `bigquery.datasets.get` | Retrieve metadata about a dataset. 
| Table Metadata Extraction | [roles/bigquery.metadataViewer](https://cloud.google.com/bigquery/docs/access-control#bigquery.metadataViewer) | -| `bigquery.datasets.getIamPolicy` | Read a dataset's IAM permissions. | Table Metadata Extraction | [roles/bigquery.metadataViewer](https://cloud.google.com/bigquery/docs/access-control#bigquery.metadataViewer) | -| `bigquery.tables.list` | List BigQuery tables. | Table Metadata Extraction | [roles/bigquery.metadataViewer](https://cloud.google.com/bigquery/docs/access-control#bigquery.metadataViewer) | -| `bigquery.tables.get` | Retrieve metadata for a table. | Table Metadata Extraction | [roles/bigquery.metadataViewer](https://cloud.google.com/bigquery/docs/access-control#bigquery.metadataViewer) | -| `bigquery.routines.get` | Get Routines. Needs to retrieve metadata for a table from system table. | Table Metadata Extraction | [roles/bigquery.metadataViewer](https://cloud.google.com/bigquery/docs/access-control#bigquery.metadataViewer) | -| `bigquery.routines.list` | List Routines. Needs to retrieve metadata for a table from system table. | Table Metadata Extraction | [roles/bigquery.metadataViewer](https://cloud.google.com/bigquery/docs/access-control#bigquery.metadataViewer) | -| `resourcemanager.projects.get` | Retrieve project names and metadata. | Table Metadata Extraction | [roles/bigquery.metadataViewer](https://cloud.google.com/bigquery/docs/access-control#bigquery.metadataViewer) | -| `bigquery.jobs.listAll` | List all jobs (queries) submitted by any user. Needs for Lineage extraction. | Lineage Extraction/Usage Extraction | [roles/bigquery.resourceViewer](https://cloud.google.com/bigquery/docs/access-control#bigquery.resourceViewer) | -| `logging.logEntries.list` | Fetch log entries for lineage/usage data. Not required if `use_exported_bigquery_audit_metadata` is enabled. | Lineage Extraction/Usage Extraction | [roles/logging.privateLogViewer](https://cloud.google.com/logging/docs/access-control#logging.privateLogViewer) | -| `logging.privateLogEntries.list` | Fetch log entries for lineage/usage data. Not required if `use_exported_bigquery_audit_metadata` is enabled. | Lineage Extraction/Usage Extraction | [roles/logging.privateLogViewer](https://cloud.google.com/logging/docs/access-control#logging.privateLogViewer) | -| `bigquery.tables.getData` | Access table data to extract storage size, last updated at, data profiles etc. | Profiling | | -| `datacatalog.policyTags.get` | *Optional* Get policy tags for columns with associated policy tags. This permission is required only if `extract_policy_tags_from_catalog` is enabled. | Policy Tag Extraction | [roles/datacatalog.viewer](https://cloud.google.com/data-catalog/docs/access-control#permissions-and-roles) | - - -#### Create a service account in the Extractor Project - -1. Setup a ServiceAccount as per [BigQuery docs](https://cloud.google.com/iam/docs/creating-managing-service-accounts#iam-service-accounts-create-console) - and assign the previously created role to this service account. -2. Download a service account JSON keyfile. 
- Example credential file: - -```json -{ - "type": "service_account", - "project_id": "project-id-1234567", - "private_key_id": "d0121d0000882411234e11166c6aaa23ed5d74e0", - "private_key": "-----BEGIN PRIVATE KEY-----\nMIIyourkey\n-----END PRIVATE KEY-----", - "client_email": "test@suppproject-id-1234567.iam.gserviceaccount.com", - "client_id": "113545814931671546333", - "auth_uri": "https://accounts.google.com/o/oauth2/auth", - "token_uri": "https://oauth2.googleapis.com/token", - "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs", - "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/test%suppproject-id-1234567.iam.gserviceaccount.com" -} -``` - -3. To provide credentials to the source, you can either: - - Set an environment variable: - - ```sh - $ export GOOGLE_APPLICATION_CREDENTIALS="/path/to/keyfile.json" - ``` - - _or_ - - Set credential config in your source based on the credential json file. For example: - - ```yml - credential: - project_id: project-id-1234567 - private_key_id: "d0121d0000882411234e11166c6aaa23ed5d74e0" - private_key: "-----BEGIN PRIVATE KEY-----\nMIIyourkey\n-----END PRIVATE KEY-----\n" - client_email: "test@suppproject-id-1234567.iam.gserviceaccount.com" - client_id: "123456678890" - ``` - -##### Profiling Requirements - -To profile BigQuery external tables backed by Google Drive document, you need to grant document's "Viewer" access to service account's email address (`client_email` in credentials json file). To find the Google Drive document linked to BigQuery table, open the BigQuery console, locate the needed table, select "Details" from the drop-down menu in the top-right corner and refer "Source" field . To share access of Google Drive document, open the document, click "Share" in the top-right corner, add the service account's email address that needs "Viewer" access. ![Google Drive Sharing Dialog](https://github.com/datahub-project/static-assets/raw/main/imgs/integrations/bigquery/google_drive_share.png) - -### Lineage Computation Details - -When `use_exported_bigquery_audit_metadata` is set to `true`, lineage information will be computed using exported bigquery logs. On how to setup exported bigquery audit logs, refer to the following [docs](https://cloud.google.com/bigquery/docs/reference/auditlogs#defining_a_bigquery_log_sink_using_gcloud) on BigQuery audit logs. Note that only protoPayloads with "type.googleapis.com/google.cloud.audit.BigQueryAuditMetadata" are supported by the current ingestion version. The `bigquery_audit_metadata_datasets` parameter will be used only if `use_exported_bigquery_audit_metadat` is set to `true`. - -Note: the `bigquery_audit_metadata_datasets` parameter receives a list of datasets, in the format $PROJECT.$DATASET. This way queries from a multiple number of projects can be used to compute lineage information. - -Note: Since bigquery source also supports dataset level lineage, the auth client will require additional permissions to be able to access the google audit logs. Refer the permissions section in bigquery-usage section below which also accesses the audit logs. - -### Profiling Details - -For performance reasons, we only profile the latest partition for partitioned tables and the latest shard for sharded tables. -You can set partition explicitly with `partition.partition_datetime` property if you want, though note that partition config will be applied to all partitioned tables. - -### Caveats - -- For materialized views, lineage is dependent on logs being retained. 
If your GCP logging is retained for 30 days (default) and 30 days have passed since the creation of the materialized view we won't be able to get lineage for them. From c320a6c9748f8e472c796334dd9fca2e9ffa8289 Mon Sep 17 00:00:00 2001 From: Ryota Egashira Date: Sat, 22 Feb 2025 08:43:37 -0800 Subject: [PATCH 08/59] fetch list of training jobs --- .../src/datahub/ingestion/source/vertexai.py | 249 ++++++++++++++++-- 1 file changed, 233 insertions(+), 16 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py index 815450102ba946..b915f954aca6f7 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py +++ b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py @@ -1,11 +1,16 @@ import logging -from typing import Iterable, Optional, TypeVar +import time +from typing import Iterable, Optional, TypeVar, List from google.cloud import aiplatform +from google.cloud.aiplatform import AutoMLTabularTrainingJob, CustomJob, AutoMLTextTrainingJob, AutoMLImageTrainingJob, \ + AutoMLVideoTrainingJob, AutoMLForecastingTrainingJob from google.cloud.aiplatform.models import Model, VersionInfo +from google.cloud.aiplatform.training_jobs import _TrainingJob from pydantic.fields import Field import datahub.emitter.mce_builder as builder +from datahub.api.entities.dataprocess.dataprocess_instance import DataProcessInstance from datahub.configuration.source_common import EnvConfigMixin from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api.common import PipelineContext @@ -18,10 +23,13 @@ ) from datahub.ingestion.api.source import Source, SourceCapability, SourceReport from datahub.ingestion.api.workunit import MetadataWorkUnit +from datahub.metadata._schema_classes import DataProcessInstancePropertiesClass, AuditStampClass, \ + DataProcessInstanceInputClass from datahub.metadata.schema_classes import ( MLModelGroupPropertiesClass, MLModelPropertiesClass, VersionTagClass, + MLModelDeploymentPropertiesClass, _Aspect, ) @@ -56,6 +64,7 @@ class VertexAIConfig(EnvConfigMixin): @capability(SourceCapability.TAGS, "Extract tags for VertexAI Registered Model Stages") class VertexAISource(Source): platform = "vertexai" + vertexai_base_url = "https://console.cloud.google.com/vertex-ai" def __init__(self, ctx: PipelineContext, config: VertexAIConfig): super().__init__(ctx) @@ -68,7 +77,11 @@ def get_report(self) -> SourceReport: return self.report def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: + # Fetch Models, Model Versions a from Model Registry yield from self._get_ml_model_workunits() + # Fetch Training Jobs + yield from self._get_training_job_workunit() + # TODO Fetch Experiments and Experiment Runs def _create_workunit(self, urn: str, aspect: _Aspect) -> MetadataWorkUnit: """ @@ -79,32 +92,81 @@ def _create_workunit(self, urn: str, aspect: _Aspect) -> MetadataWorkUnit: aspect=aspect, ).as_workunit() + def _validate_training_job(self, model: Model) -> bool: + """ + Validate Model Has Valid Training Job + """ + job = model.training_job + if not job: + return False + + try: + # when model has ref to training job, but field is not accessible, it is not valid + name = job.name + return True + except RuntimeError: + logger.info("Job name is not accessible, not valid training job for %s ", model.name) + + return False + def _get_ml_model_workunits(self) -> Iterable[MetadataWorkUnit]: """ Fetch List of Models in Model Registry and generate a corresponding 
workunit. """ + registered_models = self.client.Model.list() for model in registered_models: + # create work unit for Model Group (= Model in VertexAI) yield self._get_ml_group_workunit(model) model_versions = model.versioning_registry.list_versions() for model_version in model_versions: + # create work unit for Training Job (if Model has reference to Training Job) + if self._validate_training_job(model): + yield self._get_data_process_properties_workunit(model.training_job) + + # create work unit for Model (= Model Version in VertexAI) yield self._get_ml_model_properties_workunit( model=model, model_version=model_version ) + def _get_training_job_workunit(self) -> Iterable[MetadataWorkUnit]: + """ + Fetches training jobs from Vertex AI and generates corresponding workunits. + This method retrieves various types of training jobs from Vertex AI, including + CustomJob, CustomTrainingJob, CustomContainerTrainingJob, CustomPythonPackageTrainingJob, + AutoMLTabularTrainingJob, AutoMLTextTrainingJob, AutoMLImageTrainingJob, AutoMLVideoTrainingJob, + and AutoMLForecastingTrainingJob. For each job, it generates workunits containing metadata + about the job, its inputs, and its outputs. + """ + yield from self._get_data_process_workunit(self.client.CustomJob.list()) + yield from self._get_data_process_workunit(self.client.CustomTrainingJob.list()) + yield from self._get_data_process_workunit(self.client.CustomContainerTrainingJob.list()) + yield from self._get_data_process_workunit(self.client.CustomPythonPackageTrainingJob.list()) + yield from self._get_data_process_workunit(self.client.AutoMLTabularTrainingJob.list()) + yield from self._get_data_process_workunit(self.client.AutoMLTextTrainingJob.list()) + yield from self._get_data_process_workunit(self.client.AutoMLImageTrainingJob.list()) + yield from self._get_data_process_workunit(self.client.AutoMLVideoTrainingJob.list()) + yield from self._get_data_process_workunit(self.client.AutoMLForecastingTrainingJob.list()) + + def _get_data_process_workunit(self, jobs: List[_TrainingJob]) -> Iterable[MetadataWorkUnit]: + for job in jobs: + yield self._get_data_process_properties_workunit(job) + yield from self._get_job_output_workunit(job) + yield from self._get_job_input_workunit(job) + def _get_ml_group_workunit( self, model: Model, ) -> MetadataWorkUnit: """ Generate an MLModelGroup workunit for a VertexAI Model. 
- """ ml_model_group_urn = self._make_ml_model_group_urn(model) ml_model_group_properties = MLModelGroupPropertiesClass( name=model.name, description=model.description, createdAt=model.create_time, + trainingJobs=[], ) wu = self._create_workunit( urn=ml_model_group_urn, @@ -120,6 +182,107 @@ def _make_ml_model_group_urn(self, model: Model) -> str: ) return urn + def _get_data_process_properties_workunit(self, job: _TrainingJob) -> MetadataWorkUnit: + """ + Generate a work unit for VertexAI Training Job + """ + created_time = job.start_time or int(time.time() * 1000) + created_actor = "" + # created_actor = ( + # f"urn:li:platformResource:{job, "user"}" if getattr(job, "user") else "" + # ) + + job_id = self._make_vertexai_name(entity_type="job", entity_id=job.name) + entityUrn = builder.make_data_process_instance_urn(job_id) + aspect = DataProcessInstancePropertiesClass( + name=job_id, + created=AuditStampClass( + time=created_time, + actor=created_actor, + ), + externalUrl=self._make_job_external_url(job), + customProperties={"displayName": job.display_name}, + ) + + return self._create_workunit(urn=entityUrn, aspect=aspect) + + def _is_automl_job(self, job: _TrainingJob) -> bool: + return ((isinstance(job, AutoMLTabularTrainingJob) or + isinstance(job, AutoMLTextTrainingJob) or + isinstance(job, AutoMLImageTrainingJob) or + isinstance(job, AutoMLVideoTrainingJob)) or + isinstance(job, AutoMLForecastingTrainingJob)) + + def _get_job_output_workunit(self, job: _TrainingJob) -> Iterable[MetadataWorkUnit]: + """ + This method creates work units that link the training job to the model version + that it produces. It checks if the job configuration contains a model to upload, + and if so, it generates a work unit for the model version with the training job + as part of its properties. + """ + + job_conf = job.to_dict() + if ("modelToUpload" in job_conf and "name" in job_conf["modelToUpload"] and job_conf["modelToUpload"]["name"]): + + model_id = job_conf["modelToUpload"]["name"].split("/")[-1] + model_version = job_conf["modelToUpload"]["versionId"] + model_display_name = job_conf["modelToUpload"]["displayName"] + entity_id = f"{model_id}{self.config.model_name_separator}{model_version}" + model_version = self._make_vertexai_name(entity_type="model", entity_id=entity_id) + + model_version_urn = builder.make_ml_model_urn( + platform=self.platform, + model_name=model_version, + env=self.config.env, + ) + + job_urn = self._make_job_urn(job) + + aspect = MLModelPropertiesClass( + trainingJobs=[job_urn], + customProperties={"displayName": model_display_name} + ) + yield self._create_workunit(urn=model_version_urn, aspect=aspect) + + def _get_job_input_workunit(self, job: _TrainingJob) -> Iterable[MetadataWorkUnit]: + """ + Generate work units for the input data of a training job. + This method checks if the training job is an AutoML job and if it has an input dataset + configuration. If so, it creates a work unit for the input dataset. + """ + + if self._is_automl_job(job): + job_conf = job.to_dict() + if "inputDataConfig" in job_conf and "datasetId" in job_conf["inputDataConfig"]: + # Create URN of Input Dataset for Training Job + dataset_id = job_conf["inputDataConfig"]["datasetId"] + if dataset_id: + yield self._get_data_process_input_workunit(job, dataset_id) + + def _get_data_process_input_workunit(self, job: _TrainingJob, dataset_id: str) -> MetadataWorkUnit: + """ + This method creates a work unit for the input dataset of a training job. 
It constructs the URN + for the input dataset and the training job, and then creates a DataProcessInstanceInputClass aspect + to link the input dataset to the training job. + """ + + # Create URN of Input Dataset for Training Job + dataset_name = self._make_vertexai_name(entity_type="dataset", entity_id=dataset_id) + dataset_urn = builder.make_dataset_urn( + platform=self.platform, + name=dataset_name, + env=self.config.env, + ) + + # Create URN of Training Job + job_id = self._make_vertexai_name(entity_type="job", entity_id=job.name) + entityUrn = builder.make_data_process_instance_urn(job_id) + aspect = DataProcessInstanceInputClass( + inputs=[dataset_urn] + ) + return self._create_workunit(urn=entityUrn, aspect=aspect) + + def _get_ml_model_properties_workunit( self, model: Model, @@ -128,41 +291,95 @@ def _get_ml_model_properties_workunit( """ Generate an MLModel workunit for an VertexAI Model Version. Every Model Version is a DataHub MLModel entity associated with an MLModelGroup - corresponding to a Registered Model in VertexAI Model Registry. + corresponding to a registered Model in VertexAI Model Registry. """ ml_model_group_urn = self._make_ml_model_group_urn(model) - ml_model_urn = self._make_ml_model_urn(model_version) + model_name = self._make_vertexai_name(entity_type="model", entity_id=model.name) + ml_model_urn = self._make_ml_model_urn(model_version, model_name=model_name) - training_job = None + training_job_names = None training_metrics = None hyperparams = None - try: - job = model.training_job - training_job = [job] - except RuntimeError: - logger.info("No training job found for model %s", model.name) + + if self._validate_training_job(model): + training_job_names = [model.training_job.name] ml_model_properties = MLModelPropertiesClass( name=model.name, description=model_version.version_description, - date=model_version.version_create_time, + created=model_version.version_create_time, + lastModified=model_version.version_update_time, version=VersionTagClass(versionTag=str(model_version.version_id)), hyperParams=hyperparams, trainingMetrics=training_metrics, - trainingJobs=training_job, - groups=[ml_model_group_urn], + groups=[ml_model_group_urn], # link model version to model group + trainingJobs=training_job_names if training_job_names else None, # link to training job + deployments=[], # link to model registry and endpoint + externalUrl=self._make_model_version_external_url(model) # tags=list(model_version.tags.keys()), # customProperties=model_version.tags, - # externalUrl=self._make_external_url(model_version), ) wu = self._create_workunit(urn=ml_model_urn, aspect=ml_model_properties) return wu - def _make_ml_model_urn(self, model_version: VersionInfo) -> str: + + + def _make_ml_model_urn(self, model_version: VersionInfo, model_name:str) -> str: urn = builder.make_ml_model_urn( platform=self.platform, - model_name=f"{model_version.model_display_name}{self.config.model_name_separator}{model_version.version_id}", + model_name=f"{model_name}{self.config.model_name_separator}{model_version.version_id}", env=self.config.env, ) return urn + + def _make_job_urn(self, job: _TrainingJob) -> str: + job_id = self._make_vertexai_name(entity_type="job", entity_id=job.name) + urn = builder.make_data_process_instance_urn( + dataProcessInstanceId=job_id + ) + return urn + + + def _make_vertexai_name(self, + entity_type:str, + entity_id:str, + separator:str=".") -> str: + return f"{self.config.project_id}{separator}{entity_type}{separator}{entity_id}" + + + def 
_make_job_external_url(self, job: _TrainingJob): + """ + Model external URL in Vertex AI + Sample URLs: + https://console.cloud.google.com/vertex-ai/training/training-pipelines?project=acryl-poc&trainingPipelineId=5401695018589093888 + https://console.cloud.google.com/vertex-ai/models/locations/us-west2/models/812468724182286336/versions/default?backTo=training&trainingPipelineId=5401695018589093888&project=acryl-poc + """ + entity_type = "training" + external_url = (f"{self.vertexai_base_url}/{entity_type}/training-pipelines?trainingPipelineId={job.name}" + f"?project={self.config.project_id}") + return external_url + + def _make_model_external_url(self, model: Model): + """ + Model external URL in Vertex AI + Sample URL: + https://console.cloud.google.com/vertex-ai/models/locations/us-west2/models/812468724182286336?project=acryl-poc + """ + entity_type = "models" + external_url = (f"{self.vertexai_base_url}/{entity_type}/locations/{self.config.region}/{entity_type}/{model.name}" + f"?project={self.config.project_id}") + return external_url + + def _make_model_version_external_url(self, model: Model): + """ + Model Version external URL in Vertex AI + Sample URL: + https://console.cloud.google.com/vertex-ai/models/locations/us-west2/models/812468724182286336/versions/1?project=acryl-poc + """ + entity_type = "models" + external_url = (f"{self.vertexai_base_url}/{entity_type}/locations/{self.config.region}/{entity_type}/{model.name}" + f"/versions/{model.version_id}" + f"?project={self.config.project_id}") + return external_url + From bc9e451ba9bd7684e78fc08bd7e14562269548c4 Mon Sep 17 00:00:00 2001 From: Ryota Egashira Date: Sat, 22 Feb 2025 20:14:50 -0800 Subject: [PATCH 09/59] adding comments --- .../src/datahub/ingestion/source/vertexai.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py index b915f954aca6f7..49e945755d9fd3 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py +++ b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py @@ -77,6 +77,11 @@ def get_report(self) -> SourceReport: return self.report def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: + """ + Main Function to fetch and yields workunits for various VertexAI resources. 
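# Editor's sketch (illustrative, not part of the patch): the sample URL above joins
# its query parameters with "&", while the f-strings append a second "?", so the
# console link may be built more safely with urlencode. Assumes
# self.vertexai_base_url is the Vertex AI console base URL referenced above.
from urllib.parse import urlencode  # stdlib; would live with the other imports

def _make_job_external_url(self, job: _TrainingJob) -> str:
    query = urlencode(
        {"trainingPipelineId": job.name, "project": self.config.project_id}
    )
    return f"{self.vertexai_base_url}/training/training-pipelines?{query}"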
+ - Models and Model Versions from the Model Registry + - Training Jobs + """ # Fetch Models, Model Versions a from Model Registry yield from self._get_ml_model_workunits() # Fetch Training Jobs @@ -177,7 +182,7 @@ def _get_ml_group_workunit( def _make_ml_model_group_urn(self, model: Model) -> str: urn = builder.make_ml_model_group_urn( platform=self.platform, - group_name=model.name, + group_name=self._make_vertexai_name("model",model.name), env=self.config.env, ) return urn @@ -202,6 +207,7 @@ def _get_data_process_properties_workunit(self, job: _TrainingJob) -> MetadataWo ), externalUrl=self._make_job_external_url(job), customProperties={"displayName": job.display_name}, + type= ) return self._create_workunit(urn=entityUrn, aspect=aspect) @@ -307,6 +313,8 @@ def _get_ml_model_properties_workunit( ml_model_properties = MLModelPropertiesClass( name=model.name, description=model_version.version_description, + customProperties={"displayName": model_version.model_display_name + + self.config.model_name_separator + model_version.version_id}, created=model_version.version_create_time, lastModified=model_version.version_update_time, version=VersionTagClass(versionTag=str(model_version.version_id)), From 960129ba0fe7357564528a3efffa23fad1d7e05b Mon Sep 17 00:00:00 2001 From: Ryota Egashira Date: Wed, 12 Feb 2025 13:21:35 -0800 Subject: [PATCH 10/59] feat(ingest): add vertex AI sample data ingestion --- .../app/ingest/source/IngestionSourceList.tsx | 5 - datahub-web-react/src/images/vertexai.png | Bin 0 -> 12122 bytes .../examples/ai/dh_ai_client.py | 99 +++++++++++- .../ai/vertexai_example_experiment.py | 141 ++++++++++++++++++ .../ai/vertexai_example_training_job.py | 116 ++++++++++++++ .../bootstrap_mcps/data-platforms.yaml | 10 ++ 6 files changed, 359 insertions(+), 12 deletions(-) create mode 100644 datahub-web-react/src/images/vertexai.png create mode 100644 metadata-ingestion/examples/ai/vertexai_example_experiment.py create mode 100644 metadata-ingestion/examples/ai/vertexai_example_training_job.py diff --git a/datahub-web-react/src/app/ingest/source/IngestionSourceList.tsx b/datahub-web-react/src/app/ingest/source/IngestionSourceList.tsx index 8cc3845b1b338b..f9d0e48e1f0867 100644 --- a/datahub-web-react/src/app/ingest/source/IngestionSourceList.tsx +++ b/datahub-web-react/src/app/ingest/source/IngestionSourceList.tsx @@ -98,11 +98,6 @@ export const IngestionSourceList = () => { const [sort, setSort] = useState(); const [hideSystemSources, setHideSystemSources] = useState(true); - // When source filter changes, reset page to 1 - useEffect(() => { - setPage(1); - }, [sourceFilter]); - /** * Show or hide system ingestion sources using a hidden command S command. 
*/ diff --git a/datahub-web-react/src/images/vertexai.png b/datahub-web-react/src/images/vertexai.png new file mode 100644 index 0000000000000000000000000000000000000000..93b43b7d61200f37ea094f77eff6af7db7408cc9 GIT binary patch literal 12122 zcmb8Vbx<5%@GnY`;O_3exCD2H;Ie4g;0}Qx!NS7go?rHlU3GjM8XOc96nsq$6+;vhRN%iK78-IU zuFd%v1?8ERzK*f#)6>)a!{gBfVqy1TXz9Lt`o4MW4mxsIJABvh>#l9$4*utUa{XcJ z7;$m)h-^!Hi2nKemhro;-}e)15Bq0`%iG7f?T3!3`^M3`frb0sQv?F>`1ks8WgpQx zepfYk_oeT){O8^GnfsZ|hl6v(<`H6S^&$P+jeF6RaoVLu+=UY4LM`S(H|YYDa~V;4 zeRKCXwf?Ykf^f>aQjEGVOTUb3yt%!9JiS8dfBG?Zk9d3xEWftOxV*l7d_X*=wBDRt zJR)anhVH^^u5}VGLaMIYC+~i5JdCb9ysNyP-Foo*dhK0$r4@fMy!5bsh$w{Jjx0ae zWL+vpU*z`O=5*iMf4)j+x^XSI@+!W1^W`eP_tqfg!a4uS{Nv@_{bR|G+pNypkL@>W z2MGVN>)zS>=!ToRkvqq{t0?F-Z1#R}7h&737lVSrH>|0mXdL+Lur~r}X_WVchziOI z)PKYOyS<<5n2m1AkX_#BVaj^;k=QuZ%>>;mvWm=%JzjjW(z(mM$yv3^q-v&Z^gjIE zXm{QcZfZEHd*eok&3Bo*Pl+P-Utf{UzF*k56xt?4$Ba>;OaI+9?kxd(gv$#{eN{+r zCrwIfeAP!fNLISMx~H=~{bTc^fOMYGsbT54`HWTwM-oG-Rqx!lTa2>+NT0mTy|6$@ z&H85vH@MOIZn~)GwsWB&)^dO*T38n1WD1k~Jr9t3 z7#YEj6~Rv$ zAxnt8)}6vQ-6Otk2|Beo{Z|bUOR#ebY7F)?)tu-^>P=dqMI{Vm<)h|50sDZpC=Q=tz!BB(hSiW&#e*$kd_wdfs2WDkRvv^X(Dt? zlWb;qdL9=o)tD{Z0D@Rl^QiKhjj_mQY|9214{5J+^3IjC% zr;ld;H;!f>gtGllPo0T&L1;GQ@C9s>268E~3S(-a?9i^9rSmT{v8#Hl_{<&dJ@%!@ zEdFwJvwwRYadn<`2E?-d0CJfIwrx8-ttQCeWyV8=Y%2M}y8*F1ynwJr25POMV40Kq zn--?WFFrxuq(Xv6@I%yzs(h&tV?1k~K6e>;-0UVtw z0c^Ct*hr6S9SJJ4y8KbSF{GNKSrg>GV8i3X??tDcA#&KX&=CY=_oAT!uPJWTdA{GxyS2h0zM(%yahD# zp_JM(YCfsSNX~F!wD{Fm^EXB-wWg-uJn9B`bNqmQb z={Z6G!9RUzu3wX-Al}Aa9^#!Vt+vGt2J{ceILNeisVcseKxV06Sz7jdgCJ_RiB+BhW!v4>)(vi#borQ8oY$cV>@)hJ~_VEH*|N% zSC$*w5g>m;+UyVb^!WiWEZ^MGysjJ)|f712MZX#im?ZBIOh$Yx7iv>eJ_80diZe`>{`PG z7gnadYG^Sv$^OFb4wciQ^uj9U=7$SIIAyEFo}8NsEK{p+@?RmU3Ug8vqa6&L$A>Kj zw#9%%^dPpB0BFQDEqJ>E02i)FmrjNj+%}+{*kY#$rMnaRLw&iqS{745=Bbta;579s zfyc9r5MonR*~ISwn3npE7=3RHq8-wTx33*Z)}n&F0|6F_l>1?hfTHgV`ya_;>d;a_ zZzIe^lS(Y-`9+MYU>v6Xtx;(h@~w(ML(u`eAAjlmeL@Y1^28hypvu7XeGvUECJ!{_ z!d%(!!Wbe8OShsQjwdX&<=i({Y?=)m>g5M%?(?`EOk3kzrOU~Z9RWs6K?TSNXq($i zG2-**rR1q*QwwtU&_xC%!_zbbKh!LW-eHuY@1&*sC(36Cd&UVk?5c4-c?hI8D>D3d z-6=~ij>VlSvox_Q==AQ7eBaNd0A_-F!{;1wVX|aO58P3}z+ic5UNmuhZJsJ+w5gI3 zuh*AWO{g8b1oS^sJe|B!);%Y`X5v)^|9^%mIvw8V$vyouSvG3ZI_LeR-F*!{BCN#f zQF~*IbmYRmcPy~5ReY!7RD0kOw^KvPp&2Whs)jJ&AdvrIzyg~?l8ri3bzUK=`6437 z4hMO_zNN_McHq;Br5Wv@7!>_>lYhOhg3AFi3~evQ7e?#Au}L-d91m`Icz1s}mo=Zy z18;~Yyw8sK3B(Y9(8%0(vOgY=$*bDc2T74;fXvf_`|yH3?2^pZCaB&mKy>21PVE&( zGgRE|Q2ZEck$YYu4ES3MpAq<4$g}<25jK}btT*2Ef=sJOy*Ov}_tE*()YRNwNr$o8 z?8oW9;G3njZP6@H&f6V3G9iKl(OF&b(MS1LNvC_biad-G9l*(_qy0zAamaC-XRC^v z*PJwUrX(`L2$M8~pnSxs=jZr97UAlIbXNI`8gYUZM-#aNv`v zP4@9)CIe1c0`HDZZKqFjcQ*&w;lj*Y$2;HVWUjwk9C#|qKOJd#9ffE?#4f+dtE%f& z;#D!4X#`dYCd{HCfa5#Cb(bxf-RvOM%M!uyHojr#z-LhH-dciT!{!DPB@DeJC(V91ot-- zk%kXomqE^@0ZlTF3QF*6(=^b;$Hqi=861&f!jV*Y!p3;c_9Dj`R9{pKMIT8%cyZ8+ z(E*z;ASH)S#DPkJF?Hf^?XIb_)4Pd88$aPCuvysI6O(hau_VF{Z8VRYb$tw@sgIQv z8|5xKL4#03kwmsa7#;&~l6$laq_p*!$rW9p{{?zZ2)UsNYocQcJZj%9Z#w~1>QYOe z&y7i%*q+irGKh1n{&7+4*mGa>TDdU68i%I~N0?(EU-a9eLh(ai?1SdG%PYU)FMwS& z9Ps`d6#F;~#ka66RBp}b;qtG)@H#4t@|>AIliA~9r%tcn6szG21Zs1lwM!v$9j1i5 z1LHaq{hE+_qYJyM$736!Ur&q!FP-Bv=+#UU!0vA~%i7z}$mQ>iQiW2_G!>f|ArEtS z7y%-}bvVL3x-)d>CJyG~FW>Wv3uw@g(g|G0gi(*XR3(lSh@4@K4`z~VQ|59#6m~rF zYs?&Y=L=t!4H}Gd6DE#ip_}X{=w@NeRWP|FevjHvRrE;*qfLh zU{FsDeiA`o9|&aky2Zc-jIc(3h=e(Qr>y#pCpR?U%A6YH*YFo49R+Dv+q(HBldH85 zWS{qoOVFuJURqW9WuB|FQzR5CYvV}>f5%XbF*qdAfPb`HO>&@LZr^xi=n4yWUrn*; zI*3g7>*0;LWp@tC2+upJ2|EVAp&DKCHF0d*A_2?dyacM;m$U>I7sG~f&SCMYY1Wf2 z1kY`=poRAG37IY3TdRIDlvnkpU^_aB#6Id@Iv5;m6(_K{;!qfBwC9c`UNfWVDXP_q 
zL%8DH&Um;n_WxS-XqD1JOmNj$s@7p3aME&g)g1)21JU=6rT_q*DF7Fm=5sLd(3y9|f7X+C#(aGrn({_Y#@ z-C+@q8`)UI^71x8z&l{NI337CrhN*^)7(+fTHoY@DIo`l+c9M{=Z$!VVLebVvR7jz zQyEAM(%b?O-X{fo9h%4}4lb#9+J9inglS`I$2*#K9b$H;(VJmdYj=2^WL-Qtd+Hr$ zL{O4tmbN`lZWQy9VF(u^$>;Cd*u{2mzT9>Xep-E44fC~ImoR%~V*w=TA*kRt%LvOO z!53&h&l$1S4p%yRfnpGwR1mk< z=yH4ky$!l?Cwiw8jcrft;Wp3iCp_M#qlRhVJvK)2wdWnVTdy>gQh+_s;|j#y@vb zeUuEqEL#uonMn&TMAfzjRuJMAH_Fdid6(B?i4ZYbM34(tZ~Ut$g7MXHg0hQz$gL-o zzTGJRD26}&7FPH2>3+Qcwc3IGnc>Oh5*xx=!~slI)_eaNd+go$m7E#Bo2DPuD(>Se zGAQGf?m!!DlH-mA{>wub>t9|Hjz!_3*?OH--+*S|S4U+xO!*Q~@X4*Qx&3u3^t`}1 z@=lUYU0X2SmfF-itHY!%Um9a6uH9+uvS2AZj@ut6Aq=!;HC;pqKwTjM=s!pGi1sa; zfDA3W8$B)cz8usAnbe{GQXCBiFHua&$!M2h}!0`5FEoM`Ac zq+qeAF4^Q-estck@K1|-8Ox*Jg6#Y(#=s5{Y@)2scN_um=N{@t&#R%6Jjxq*gj5%6CVoaV!Z(5Uq<1R1)AExw@^K(M)$IWkoZzv!xGJ)K9 z?ka2geS#uHc)b*$Bk{MD&1@rc_sV|$^3QP%!y>Z|9m$h32pW=Qcn*nY#=CVL9?u%? zHT~Y`$FRpgo$F1QZg68_YS#(hz5P2zWqP4g_Gwkn&6x5ibOc^(d}XxIK+t2d<~%|0 zDwAUpstuH>CH;jD>m2bf@<8DSQo6l3F4LeUD8ritYS+DX_`?Cfd$lXNG6hx0yW_E# zP#@Y=n}0HYIDW$#NvW`O*|73siJyQLVj?%KndwSrdK!@wbXKMJLeq~|QYe5c>^E>F zuw+$LV@u5~e4ntCio_g#iUkA-oUbCT>xW<%~N~rir0-D^My`%xAc0! z%_$FEm1`iVEqyfO{ML})dK<-^l476ntzDywA1C=+cI`peH06C+%jmvH-}ba-kOq*5DX~65TbF1_+x?G39W;!SDROvt*cjiYR@FfCz zWW*|!iqk;U8ea|~*V5A5U$>KOrSAT}ks`Xx@bz5fefIpxCbIBB5Jb3&w2YRPw$-S2 zO|+Cj|M$bPCkvhKq3&$ePy$|+OsJpGE21akY(_q`AKZQJ>$uvpg8w*=3OtC+*64j^j3)2ITNL{nk6?L|Th&<# z4Lo$wemM5_jVY17rR`!0JNLhBOQd9yq6hH>Vk!%8Ru?~2IUgOVU|H*PJ`(jt?z`pQ37$(7w_)krcf_D?`836mM0dtXt^U*QQ}==9f3}|Z z5_r2DDVd&X_J?JP{%xmK6+0g!=^-suJ)y1Wll`9iA5G}yB9W4A%hua~4_}{{%)e*8 zUtK;p=B3}#8SmtWB??*V0}>@JBUD#u8?1r*BfsYmi_<+lyo)uk%8pj>_8USD?-?St zd*;x$dqtno9F81`!BSPseENlfteIDT1NvPiMQye_CB!UGY&*kuAN zbjPW@57-v)WY4bo4oVMGGilWWhMc%-dO7SN!~ayj9Cl@EQ0f`0VzV)%vHAzq98;?{ zP|N?7ms5BtUZZ~t`ZSoFnXxwJ{feC4JPO;lHZ+{eX1yh;x;WS9GT80vm?z>6#A?S6mD9}w<$IB#5 zagvK6)aF+-_LA}?^IL^gI=iAq+~a-x7}INb179gI=C}iPO_?J)CbP^AQFI184n!G$ zDwrVvJ25R_D3eL*QjOLY7csT*2jHQ>-fT&ylTxO(7VO)0c}Z2!mq2cR=bOF;)G4(U9M&XKj8S79LA2g~ zc@d*IXM_+k+4mE zVa6OQkIv?<4;fQ>)!(R&jugwOpBOlAT@QAWpO-g(v@vth{_S#+AI%R(>Je=^&gyzBy(cyrT5FT z5dD;NlkU`+Lvjf#p`R+18pw}o`z_ALq7Er3R?K;0O=&1a=nSx999lMEme4qEW4it>}aW7b8>rx%rZ z?dQ;BhBtJGH%Jl;)JFkYs=YinbIa`(ZgO;~sr+6cg=kEi^#RUsz0zmHXc|#+XcFOqU5=03CKZbT5SBcCw@#?Kf`8-nqR`a~K|DFg-NX z_rNc1Esns=a~&jTtoBQq=nIJ7WtW2wpzb*lx-~PE(}-Od?=@9Sd8nKn9*2x1E+3zk zl5iCJ%LW6(6ko(KO!~h%a6EK6J#@^LY?j8CMSeib2|t{Jw-ftu?&dX}D7^S```#(b zf#X)aWS4v^BJEadjA16NutRlhF|=dyWqWU9g5cESkk3W*c~%@W{mDqL+UO!S{m;xx zX?Wvbp}#hlt}*JkA`OAH@}MAvE5&j8OE1D0TiUT?^R)&1BFY;f)`W5u#1;USwZtd*%f&O8RNiY19 zj<*=8O;|7*I?(d=L#XWS4#4{!^h?7_qHo=14yA%P19i%GEu>)OuaV`8=f3d*e`-c6 z#&j6j()!fO+bWxlIa$GBwP>EFuDM_tOsOr?vOCk~b3;*uQtP{hO)Yhn;%*xlhTt1N zVrv>lm0q`O{izBTIzRYdu%i#AHBX-C+XIf%!+p6Ki1%ervx($)5Nrve>nn&y`tbzp z*b!UnEK+yLFG?!D5_SAUKv>|&#>$*+&^_(qm>{?jsI5=;d?KUiN#UgbXZljO_|y|U z!|Ok2bYJOS+@F_OTTzgGHt;KXJkmNDs(5|Rct?!xOE+@{Viakyh?XqE&wI^^?)&-` zr2e+|_}bh3k%LX&)I19nj#F(!GNR9h*|?m0&_C*AYuMMylt}i~5UQIeG23xbI)7pp z@APL)GdTO%lVFVb&__>q!wB;Ue9&evO=b&oZ%WGb9@X z*s;xbv-I;Xh;rD95$kdr<~dZPce4K5PX-j9sRyhiqw)%We&@6RhqBClt44&py7t~Z z381AqZ>W_;5fBy8`%sAuBNenf!QgVUQ8zNCgv79sJmfgh4`2yFB-jZLy@D*+fy#>L*8Qc*vk= zE9dplE=KL~bh3E9c!78~`qwJDnQ5=O3kF~?7uuElkCkFuPYmm|3}cKgK_+EfLStz}!tzE{0#6iR%Rg8)OK`nAhG#K-<5ot9hODR3WpqOOY^>;}l zQ1F(lz(M|4Ui5DS1Hs@|OSHo}!*l4<@G>=`wEnFDUHqybI5J+J2#OQittiADrXbf# zV~iUo79pXaVTmrg`1Ew!ef8BwCdb@q^{`{MN`L^(7xw58hMU^u3U3AE4lV1T2;Y8r z`OL|GgIqB%GdV#5(+&&12NAui+|*-9hOT-ZV5($B7RZ~w)b?K+xEy}D&3A5&o z^NyG3YqyCRnTCVd(QAHQByvaVgmg6Te?NJ%de@5AVz3Bc@bxg7_eq+=lq1&utR%?2 
z{1B^5!JwrRSn9U6DS~S%TH7?Hc=fcc6By=}{f#nKhMlh~M|mkV)}4>OED0pxr0#L_ z6J7w<2~2JFj$ND$C(7QGRk(TrnH*t95<>ZDs zfs0?ysNvpHF7ic`?Cnk{EU&aS zFYJ^{@5P4=3GUC!-Cd2(qtBNL@?Kb??dU6haM%((8sxuQO3zy7YVDWLcg{3fILJmwU zB1#feSBraZCB=ETF6ec z_oWl)P$(cXWQVP1t>J{85iqk&6>W5-p{w#LkW7muoSTwjvYvHdZ%*&?``p<0Q!Kr} zYF6evU#svpC8sR!->@Xh`#ZmP-4h&4goK<6M*B{&GjSGk$!L))L~%}0o?{nN>8@NE z>;AwcEVq!l15_a{r9Efs)_@Dgc0BAEGe#lh#GGnaCtEg|Yyu#G2n zd*9vj+nXMR11mI1V%!wLC{Fn_hc`l5ln-V`tp8?|_}%sh?8jg(7~j1w(n#x1cc=53 zM129JughkOC~8}|ed=>9gqz;M{xSt9HMcD3K;-?!J|g9~Ov=AWq{DaC5qKM($qU^z zU;Zgoo5xsl8P*_8MvM|J+$h;@s_>Xm0|TQtWd^tJQTlGZ{iLVS2IEZYCw{_8yi3<_ zPx7S~In5;j|CcakKgUYn@3 zgBb<8&xPx@-$z7G)ijofWRemSF+6kKYhlFvHF%<~zr%HlRzl_=N&IZh0J@)jdQ){- zQl{Jb;_%l{*{hRy{GVA3FYNxEC5C2>RyLJ<`{R3AyzLn3?cSKHA+nbCyxA{jKna@b zd=#b5z2vrs3}dOeyF;y>pfv+3X1>1vK%=b@cV`l$4kx)G)l)L$NS?H=^CvQE=r(oJ zh5}@8vN~;Shw}0pb7Q+tcdLX}T05&VYwW0WXE&4NAot-hG}DvJ8O7^cruF}d zEK_Z>2T$Y;C7*J6Rjcz^Z(OM&AB%fADbU))vL>7!F4O_wEY7IP{&r{4zW?3vpR<=o zM={Bpy#buB*Cu}bWiO;i{`Zu_i1K&-;Q+`e%}(AmaOcnE)hiWRd7s z?&jNeVg|<~*4Md9Y_^wXHxyHT0jh~A=6d?~JncvT_^Q1^!Bb7L@J?1)zt{f{^znf=k4 z>xCNMn8=&^Ml{R&SW@P|-}noHZ`4z?VMREQud{ngTLHWuQLq>gGF7fqBx~L*o)tST z^sSR0iLe;vg7gY9g!y{r_AUf#r)s#hP|;J|;m#-0EOaTcCTB#PN~c`!%&zt!SU)*C ziOp6t4Yo%-;{jd^;2!%QkQA?&x$v=+HD~cO=eU1 zAyf1OOc?*6nCmEpsx5vv%qtArT#-X(#3SZ}ojfTWL!N+^}v$jbg~PpfAJJyFD2%o+ebxGeQA`VEnbl7>cSkDLlMAz(V*JKRNtkDG)>Vgjmt@jsx1oxh8W)F_f_>JtCE zRefcu4SV)OoAmow=B#9@w?l9DY|r_9-CUb5=z=0l*P`qOJNS4#bZ}T>oUXO(1{$k3K#+gL-UKd zP)-CA7V)3llZv0|Bn8oZZhm?PdHt*{97vqpC$djQ`CZ4g8J4#A#y_t88B}=u96+#< z`)6qk=X>tM_Brge^~A8=?4AY69YfO24zB7W_`oW}`Q?~w$pM<+LB(0)@NM)fzqz*$ x+b&}F7YMXs#`#FJXaD|}&Ga;@rf>59Y|m6?0hTC&iT(dXG*xv~YL#rk{|hT^c1ZvL literal 0 HcmV?d00001 diff --git a/metadata-ingestion/examples/ai/dh_ai_client.py b/metadata-ingestion/examples/ai/dh_ai_client.py index b51c8dd050cf1b..7c65f54f5dfeb9 100644 --- a/metadata-ingestion/examples/ai/dh_ai_client.py +++ b/metadata-ingestion/examples/ai/dh_ai_client.py @@ -290,6 +290,62 @@ def create_model( logger.info(f"Created model: {model_urn}") return str(model_urn) + def create_training_job( + self, + run_id: str, + properties: Optional[models.DataProcessInstancePropertiesClass] = None, + training_run_properties: Optional[models.MLTrainingRunPropertiesClass] = None, + run_result: Optional[str] = None, + start_timestamp: Optional[int] = None, + end_timestamp: Optional[int] = None, + **kwargs: Any, + ) -> str: + """Create a training job with properties and events.""" + dpi_urn = f"urn:li:dataProcessInstance:{run_id}" + + # Create basic properties and aspects + aspects = [ + ( + properties + or self._create_properties_class( + models.DataProcessInstancePropertiesClass, kwargs + ) + ), + models.SubTypesClass(typeNames=["ML Training Run"]), + ] + + # Add training run properties if provided + if training_run_properties: + aspects.append(training_run_properties) + + # Handle run events + current_time = int(time.time() * 1000) + start_ts = start_timestamp or current_time + end_ts = end_timestamp or current_time + + # Create events + aspects.append( + self._create_run_event( + status=DataProcessRunStatusClass.STARTED, timestamp=start_ts + ) + ) + + if run_result: + aspects.append( + self._create_run_event( + status=DataProcessRunStatusClass.COMPLETE, + timestamp=end_ts, + result=run_result, + duration_millis=end_ts - start_ts, + ) + ) + + # Create and emit MCPs + mcps = [self._create_mcp(dpi_urn, aspect) for aspect in aspects] + self._emit_mcps(mcps) + logger.info(f"Created training job: {dpi_urn}") + return dpi_urn + def create_experiment( 
self, experiment_id: str, @@ -382,28 +438,48 @@ def create_dataset(self, name: str, platform: str, **kwargs: Any) -> str: raise ValueError(f"Failed to create dataset URN for {name}") return dataset.urn - def add_run_to_model(self, model_urn: str, run_urn: str) -> None: - """Add a run to a model while preserving existing properties.""" + def _add_process_to_model(self, model_urn: str, process_urn: str) -> None: + """Add a DataProcessInstance to a model while preserving existing properties.""" self._update_entity_properties( entity_urn=model_urn, aspect_type=models.MLModelPropertiesClass, - updates={"trainingJobs": run_urn}, + updates={"trainingJobs": process_urn}, entity_type="mlModel", skip_properties=["trainingJobs"], ) + + def add_run_to_model(self, model_urn: str, run_urn: str) -> None: + """Add a run to a model while preserving existing properties.""" + self._add_process_to_model(model_urn, run_urn) logger.info(f"Added run {run_urn} to model {model_urn}") - def add_run_to_model_group(self, model_group_urn: str, run_urn: str) -> None: - """Add a run to a model group while preserving existing properties.""" + def add_job_to_model(self, model_urn: str, job_urn: str) -> None: + """Add a job to a model while preserving existing properties.""" + self._add_process_to_model(model_urn, job_urn) + logger.info(f"Added training job {job_urn} to model {model_urn}") + + def _add_process_to_model_group( + self, model_group_urn: str, process_urn: str + ) -> None: + """Add DatapProcessInstance to a model group while preserving existing properties.""" self._update_entity_properties( entity_urn=model_group_urn, aspect_type=models.MLModelGroupPropertiesClass, - updates={"trainingJobs": run_urn}, + updates={"trainingJobs": process_urn}, entity_type="mlModelGroup", skip_properties=["trainingJobs"], ) + + def add_run_to_model_group(self, model_group_urn: str, run_urn: str) -> None: + """Add a run to a model group while preserving existing properties.""" + self._add_process_to_model_group(model_group_urn, run_urn) logger.info(f"Added run {run_urn} to model group {model_group_urn}") + def add_job_to_model_group(self, model_group_urn: str, job_urn: str) -> None: + """Add a job to a model group while preserving existing properties.""" + self._add_process_to_model_group(model_group_urn, job_urn) + logger.info(f"Added job {job_urn} to model group {model_group_urn}") + def add_model_to_model_group(self, model_urn: str, group_urn: str) -> None: """Add a model to a group while preserving existing properties""" self._update_entity_properties( @@ -423,7 +499,9 @@ def add_run_to_experiment(self, run_urn: str, experiment_urn: str) -> None: self._emit_mcps(mcp) logger.info(f"Added run {run_urn} to experiment {experiment_urn}") - def add_input_datasets_to_run(self, run_urn: str, dataset_urns: List[str]) -> None: + def _add_input_datasets_to_process( + self, run_urn: str, dataset_urns: List[str] + ) -> None: """Add input datasets to a run""" mcp = self._create_mcp( entity_urn=run_urn, @@ -432,8 +510,15 @@ def add_input_datasets_to_run(self, run_urn: str, dataset_urns: List[str]) -> No aspect=DataProcessInstanceInput(inputs=dataset_urns), ) self._emit_mcps(mcp) + + def add_input_datasets_to_run(self, run_urn: str, dataset_urns: List[str]) -> None: + self._add_input_datasets_to_process(run_urn, dataset_urns) logger.info(f"Added input datasets to run {run_urn}") + def add_input_datasets_to_job(self, job_urn: str, dataset_urns: List[str]) -> None: + self._add_input_datasets_to_process(job_urn, dataset_urns) + 
logger.info(f"Added input datasets to training job {job_urn}") + def add_output_datasets_to_run(self, run_urn: str, dataset_urns: List[str]) -> None: """Add output datasets to a run""" mcp = self._create_mcp( diff --git a/metadata-ingestion/examples/ai/vertexai_example_experiment.py b/metadata-ingestion/examples/ai/vertexai_example_experiment.py new file mode 100644 index 00000000000000..264b428a880aa0 --- /dev/null +++ b/metadata-ingestion/examples/ai/vertexai_example_experiment.py @@ -0,0 +1,141 @@ +import argparse + +from dh_ai_client import DatahubAIClient + +import datahub.metadata.schema_classes as models +from datahub.metadata.com.linkedin.pegasus2avro.dataprocess import RunResultType + + +def create_experiment_example(client: DatahubAIClient) -> None: + experiment_urn = client.create_experiment( + experiment_id="table_classification_experiment", + properties=models.ContainerPropertiesClass( + name="Tabular classification Experiment", + description="Experiment for tabular classification", + customProperties={"team": "forecasting"}, + created=models.TimeStampClass( + time=1628580000000, actor="urn:li:corpuser:datahub" + ), + lastModified=models.TimeStampClass( + time=1628580000000, actor="urn:li:corpuser:datahub" + ), + ), + ) + + # Create a training run + run_urn = client.create_training_run( + run_id="simple_training_run", + properties=models.DataProcessInstancePropertiesClass( + name="Simple Training Run", + created=models.AuditStampClass( + time=1628580000000, actor="urn:li:corpuser:datahub" + ), + customProperties={"team": "forecasting"}, + ), + training_run_properties=models.MLTrainingRunPropertiesClass( + id="simple_training_run", + outputUrls=["gc://my-bucket/output"], + trainingMetrics=[models.MLMetricClass(name="accuracy", value="0.9")], + hyperParams=[models.MLHyperParamClass(name="learning_rate", value="0.01")], + externalUrl="https:localhost:5000", + ), + run_result=RunResultType.FAILURE, + start_timestamp=1628580000000, + end_timestamp=1628580001000, + ) + + # Create model group + model_group_urn = client.create_model_group( + group_id="AutoML-prediction-model-group", + properties=models.MLModelGroupPropertiesClass( + name="AutoML training", + description="Tabular classification prediction models", + created=models.TimeStampClass( + time=1628580000000, actor="urn:li:corpuser:datahub" + ), + ), + ) + + # Creating a model + model_urn = client.create_model( + model_id="automl-prediction-model", + properties=models.MLModelPropertiesClass( + name="AutoML training", + description="Tabular classification prediction models", + customProperties={"team": "forecasting"}, + trainingMetrics=[ + models.MLMetricClass(name="accuracy", value="0.9"), + models.MLMetricClass(name="precision", value="0.8"), + ], + hyperParams=[ + models.MLHyperParamClass(name="learning_rate", value="0.01"), + models.MLHyperParamClass(name="batch_size", value="32"), + ], + externalUrl="https:localhost:5000", + created=models.TimeStampClass( + time=1628580000000, actor="urn:li:corpuser:datahub" + ), + lastModified=models.TimeStampClass( + time=1628580000000, actor="urn:li:corpuser:datahub" + ), + tags=["forecasting", "prediction"], + ), + version="3583871344875405312", + alias="champion", + ) + + # Create datasets + input_dataset_urn = client.create_dataset( + platform="gcs", + name="table_input", + ) + + output_dataset_urn = client.create_dataset( + platform="gcs", + name="table_output", + ) + + # Add run to experiment + client.add_run_to_experiment(run_urn=run_urn, experiment_urn=experiment_urn) + + # Add 
run to model + client.add_run_to_model( + model_urn=model_urn, + run_urn=run_urn, + ) + + # add run to model group + client.add_run_to_model_group( + model_group_urn=model_group_urn, + run_urn=run_urn, + ) + + # Add input and output datasets to run + client.add_input_datasets_to_run( + run_urn=run_urn, dataset_urns=[str(input_dataset_urn)] + ) + + client.add_output_datasets_to_run( + run_urn=run_urn, dataset_urns=[str(output_dataset_urn)] + ) + + +if __name__ == "__main__": + # Example usage + parser = argparse.ArgumentParser() + parser.add_argument("--token", required=False, help="DataHub access token") + parser.add_argument( + "--server_url", + required=False, + default="http://localhost:8080", + help="DataHub server URL (defaults to http://localhost:8080)", + ) + parser.add_argument("--platform", default="vertexai", help="platform name") + args = parser.parse_args() + + # Create Client + client = DatahubAIClient( + token=args.token, server_url=args.server_url, platform=args.platform + ) + + create_experiment_example(client) diff --git a/metadata-ingestion/examples/ai/vertexai_example_training_job.py b/metadata-ingestion/examples/ai/vertexai_example_training_job.py new file mode 100644 index 00000000000000..25febf1b74e3ff --- /dev/null +++ b/metadata-ingestion/examples/ai/vertexai_example_training_job.py @@ -0,0 +1,116 @@ +import argparse + +from dh_ai_client import DatahubAIClient + +import datahub.metadata.schema_classes as models +from datahub.metadata.com.linkedin.pegasus2avro.dataprocess import RunResultType + + +def create_training_job_example(client: DatahubAIClient) -> None: + # Create Training Job + training_job_urn = client.create_training_job( + run_id="train-petfinder-automl-job", + properties=models.DataProcessInstancePropertiesClass( + name="Training Job", + created=models.AuditStampClass( + time=1628580000000, actor="urn:li:corpuser:datahub" + ), + customProperties={"team": "classification"}, + ), + training_run_properties=models.MLTrainingRunPropertiesClass( + id="train-petfinder-automl-job", + outputUrls=["gc://my-bucket/output"], + trainingMetrics=[models.MLMetricClass(name="accuracy", value="0.9")], + hyperParams=[models.MLHyperParamClass(name="learning_rate", value="0.01")], + externalUrl="https:localhost:5000", + ), + run_result=RunResultType.FAILURE, + start_timestamp=1628580000000, + end_timestamp=1628580001000, + ) + + # Create model group + model_group_urn = client.create_model_group( + group_id="AutoML-prediction-model-group", + properties=models.MLModelGroupPropertiesClass( + name="AutoML training", + description="Tabular classification prediction models", + created=models.TimeStampClass( + time=1628580000000, actor="urn:li:corpuser:datahub" + ), + ), + ) + + # Creating a model with metrics + model_urn = client.create_model( + model_id="automl-prediction-model", + properties=models.MLModelPropertiesClass( + name="AutoML training", + description="Tabular classification prediction models", + customProperties={"team": "forecasting"}, + trainingMetrics=[ + models.MLMetricClass(name="accuracy", value="0.9"), + models.MLMetricClass(name="precision", value="0.8"), + ], + hyperParams=[ + models.MLHyperParamClass(name="learning_rate", value="0.01"), + models.MLHyperParamClass(name="batch_size", value="32"), + ], + externalUrl="https:localhost:5000", + created=models.TimeStampClass( + time=1628580000000, actor="urn:li:corpuser:datahub" + ), + lastModified=models.TimeStampClass( + time=1628580000000, actor="urn:li:corpuser:datahub" + ), + tags=["forecasting", 
"prediction"], + ), + version="3583871344875405312", + alias="champion", + ) + + # Create datasets + input_dataset_urn = client.create_dataset( + platform="gcs", + name="classification_input_data", + ) + + # Add model to model group + client.add_model_to_model_group(model_urn=model_urn, group_urn=model_group_urn) + + # Add training job to model + client.add_job_to_model( + model_urn=model_urn, + job_urn=training_job_urn, + ) + + # add training job to model group + client.add_job_to_model_group( + model_group_urn=model_group_urn, + job_urn=training_job_urn, + ) + + # Add input and output datasets to run + client.add_input_datasets_to_job( + job_urn=training_job_urn, dataset_urns=[str(input_dataset_urn)] + ) + + +if __name__ == "__main__": + # Example usage + parser = argparse.ArgumentParser() + parser.add_argument("--token", required=False, help="DataHub access token") + parser.add_argument( + "--server_url", + required=False, + default="http://localhost:8080", + help="DataHub server URL (defaults to http://localhost:8080)", + ) + parser.add_argument("--platform", default="vertexai", help="platform name") + args = parser.parse_args() + # Create Client + client = DatahubAIClient( + token=args.token, server_url=args.server_url, platform=args.platform + ) + + create_training_job_example(client) diff --git a/metadata-service/configuration/src/main/resources/bootstrap_mcps/data-platforms.yaml b/metadata-service/configuration/src/main/resources/bootstrap_mcps/data-platforms.yaml index 4c4a7c2183073c..a4a4e426258d4a 100644 --- a/metadata-service/configuration/src/main/resources/bootstrap_mcps/data-platforms.yaml +++ b/metadata-service/configuration/src/main/resources/bootstrap_mcps/data-platforms.yaml @@ -747,4 +747,14 @@ displayName: Neo4j type: OTHERS logoUrl: "/assets/platforms/neo4j.png" +- entityUrn: urn:li:dataPlatform:vertexai + entityType: dataPlatform + aspectName: dataPlatformInfo + changeType: UPSERT + aspect: + datasetNameDelimiter: "." 
+ name: vertexai + displayName: vertexai + type: OTHERS + logoUrl: "/assets/platforms/vertexai.png" From 95712f5c3987b69f309d5fe8ec38b6ac7aefa74d Mon Sep 17 00:00:00 2001 From: Ryota Egashira Date: Mon, 24 Feb 2025 01:00:53 -0800 Subject: [PATCH 11/59] Update vertexai.py --- .../src/datahub/ingestion/source/vertexai.py | 112 +++++++++++------- 1 file changed, 72 insertions(+), 40 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py index 49e945755d9fd3..bd3eff560c3d44 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py +++ b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py @@ -4,7 +4,7 @@ from google.cloud import aiplatform from google.cloud.aiplatform import AutoMLTabularTrainingJob, CustomJob, AutoMLTextTrainingJob, AutoMLImageTrainingJob, \ - AutoMLVideoTrainingJob, AutoMLForecastingTrainingJob + AutoMLVideoTrainingJob, AutoMLForecastingTrainingJob, Endpoint from google.cloud.aiplatform.models import Model, VersionInfo from google.cloud.aiplatform.training_jobs import _TrainingJob from pydantic.fields import Field @@ -24,7 +24,7 @@ from datahub.ingestion.api.source import Source, SourceCapability, SourceReport from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.metadata._schema_classes import DataProcessInstancePropertiesClass, AuditStampClass, \ - DataProcessInstanceInputClass + DataProcessInstanceInputClass, TimeStampClass from datahub.metadata.schema_classes import ( MLModelGroupPropertiesClass, MLModelPropertiesClass, @@ -37,7 +37,6 @@ logger = logging.getLogger(__name__) - class VertexAIConfig(EnvConfigMixin): project_id: str = Field(description=("Project ID in Google Cloud Platform")) region: str = Field( @@ -72,6 +71,7 @@ def __init__(self, ctx: PipelineContext, config: VertexAIConfig): self.report = SourceReport() aiplatform.init(project=config.project_id, location=config.region) self.client = aiplatform + self.endpoints = None def get_report(self) -> SourceReport: return self.report @@ -110,7 +110,7 @@ def _validate_training_job(self, model: Model) -> bool: name = job.name return True except RuntimeError: - logger.info("Job name is not accessible, not valid training job for %s ", model.name) + logger.info(f"cannot fetch training job name, not valid for model (name:{model.display_name} id:{model.name})") return False @@ -118,21 +118,23 @@ def _get_ml_model_workunits(self) -> Iterable[MetadataWorkUnit]: """ Fetch List of Models in Model Registry and generate a corresponding workunit. 
""" - registered_models = self.client.Model.list() for model in registered_models: # create work unit for Model Group (= Model in VertexAI) yield self._get_ml_group_workunit(model) model_versions = model.versioning_registry.list_versions() for model_version in model_versions: + # create work unit for Training Job (if Model has reference to Training Job) if self._validate_training_job(model): - yield self._get_data_process_properties_workunit(model.training_job) + logger.info( + f"generating TrainingJob work unit for model: {model_version.model_display_name}") + yield from self._get_data_process_properties_workunit(model.training_job) # create work unit for Model (= Model Version in VertexAI) - yield self._get_ml_model_properties_workunit( - model=model, model_version=model_version - ) + logger.info(f"generating MLProperties work unit for model (name: {model.display_name} id:{model.name})") + yield self._get_ml_model_properties_workunit(model=model, model_version=model_version) + def _get_training_job_workunit(self) -> Iterable[MetadataWorkUnit]: """ @@ -143,14 +145,23 @@ def _get_training_job_workunit(self) -> Iterable[MetadataWorkUnit]: and AutoMLForecastingTrainingJob. For each job, it generates workunits containing metadata about the job, its inputs, and its outputs. """ + logger.info(f"fetching a list of CustomJobs") yield from self._get_data_process_workunit(self.client.CustomJob.list()) + logger.info(f"fetching a list of CustomTrainingJobs") yield from self._get_data_process_workunit(self.client.CustomTrainingJob.list()) + logger.info(f"fetching a list of CustomContainerTrainingJob") yield from self._get_data_process_workunit(self.client.CustomContainerTrainingJob.list()) + logger.info(f"fetching a list of CustomPythonPackageTrainingJob") yield from self._get_data_process_workunit(self.client.CustomPythonPackageTrainingJob.list()) + logger.info(f"fetching a list of AutoMLTabularTrainingJobs") yield from self._get_data_process_workunit(self.client.AutoMLTabularTrainingJob.list()) + logger.info(f"fetching a list of AutoMLTextTrainingJobs") yield from self._get_data_process_workunit(self.client.AutoMLTextTrainingJob.list()) + logger.info(f"fetching a list of AutoMLImageTrainingJobs") yield from self._get_data_process_workunit(self.client.AutoMLImageTrainingJob.list()) + logger.info(f"fetching a list of AutoMLVideoTrainingJobs") yield from self._get_data_process_workunit(self.client.AutoMLVideoTrainingJob.list()) + logger.info(f"fetching a list of AutoMLForecastingTrainingJobs") yield from self._get_data_process_workunit(self.client.AutoMLForecastingTrainingJob.list()) def _get_data_process_workunit(self, jobs: List[_TrainingJob]) -> Iterable[MetadataWorkUnit]: @@ -164,14 +175,14 @@ def _get_ml_group_workunit( model: Model, ) -> MetadataWorkUnit: """ - Generate an MLModelGroup workunit for a VertexAI Model. + Generate an MLModelGroup work unit for a VertexAI Model. 
""" ml_model_group_urn = self._make_ml_model_group_urn(model) ml_model_group_properties = MLModelGroupPropertiesClass( - name=model.name, + name=self._make_vertexai_name("model_group", model.name), description=model.description, - createdAt=model.create_time, - trainingJobs=[], + createdAt=int(model.create_time.timestamp()), + customProperties={"displayName": model.display_name} ) wu = self._create_workunit( urn=ml_model_group_urn, @@ -191,11 +202,8 @@ def _get_data_process_properties_workunit(self, job: _TrainingJob) -> MetadataWo """ Generate a work unit for VertexAI Training Job """ - created_time = job.start_time or int(time.time() * 1000) - created_actor = "" - # created_actor = ( - # f"urn:li:platformResource:{job, "user"}" if getattr(job, "user") else "" - # ) + created_time = int(job.start_time.timestamp()) or int(time.time() * 1000) + created_actor = f"urn:li:platformResource:{self.platform}" job_id = self._make_vertexai_name(entity_type="job", entity_id=job.name) entityUrn = builder.make_data_process_instance_urn(job_id) @@ -206,10 +214,10 @@ def _get_data_process_properties_workunit(self, job: _TrainingJob) -> MetadataWo actor=created_actor, ), externalUrl=self._make_job_external_url(job), - customProperties={"displayName": job.display_name}, - type= + customProperties={"displayName": job.display_name} ) + logging.info(f"generating data process instance for training job: {entityUrn}") return self._create_workunit(urn=entityUrn, aspect=aspect) def _is_automl_job(self, job: _TrainingJob) -> bool: @@ -243,6 +251,9 @@ def _get_job_output_workunit(self, job: _TrainingJob) -> Iterable[MetadataWorkUn ) job_urn = self._make_job_urn(job) + logger.info(f"found that training job: {job.display_name} generated a model (name:{model_display_name} id:{model_id})") + + aspect = MLModelPropertiesClass( trainingJobs=[job_urn], @@ -262,6 +273,8 @@ def _get_job_input_workunit(self, job: _TrainingJob) -> Iterable[MetadataWorkUni if "inputDataConfig" in job_conf and "datasetId" in job_conf["inputDataConfig"]: # Create URN of Input Dataset for Training Job dataset_id = job_conf["inputDataConfig"]["datasetId"] + logger.info(f"found that training job {job.display_name} used input dataset: {dataset_id}") + if dataset_id: yield self._get_data_process_input_workunit(job, dataset_id) @@ -286,52 +299,71 @@ def _get_data_process_input_workunit(self, job: _TrainingJob, dataset_id: str) - aspect = DataProcessInstanceInputClass( inputs=[dataset_urn] ) + logger.info(f"generating input dataset {dataset_urn}") + return self._create_workunit(urn=entityUrn, aspect=aspect) - def _get_ml_model_properties_workunit( - self, - model: Model, - model_version: VersionInfo, - ) -> MetadataWorkUnit: + def _get_ml_model_properties_workunit(self,model: Model,model_version: VersionInfo) -> MetadataWorkUnit: """ Generate an MLModel workunit for an VertexAI Model Version. Every Model Version is a DataHub MLModel entity associated with an MLModelGroup corresponding to a registered Model in VertexAI Model Registry. 
""" + logging.info(f"starting model work unit for model {model.name}") + ml_model_group_urn = self._make_ml_model_group_urn(model) model_name = self._make_vertexai_name(entity_type="model", entity_id=model.name) ml_model_urn = self._make_ml_model_urn(model_version, model_name=model_name) + model_version_name = f"{model_name}{self.config.model_name_separator}{model_version.version_id}" training_job_names = None - training_metrics = None - hyperparams = None - if self._validate_training_job(model): - training_job_names = [model.training_job.name] + # if self._validate_training_job(model): + # training_job_names = [model.training_job.name] + + endpoint:Optional[Endpoint] = self._search_endpoint(model) + endpoint_urn = builder.make_ml_model_deployment_urn( + platform=self.platform, + deployment_name=self._make_vertexai_name("endpoint", endpoint.display_name), + env=self.config.env + ) if endpoint else None ml_model_properties = MLModelPropertiesClass( - name=model.name, + name=model_version_name, description=model_version.version_description, customProperties={"displayName": model_version.model_display_name + - self.config.model_name_separator + model_version.version_id}, - created=model_version.version_create_time, - lastModified=model_version.version_update_time, + self.config.model_name_separator + model_version.version_id, + "resourceName": model.resource_name}, + created=TimeStampClass(model_version.version_create_time.second), + lastModified=TimeStampClass(model_version.version_update_time.second), version=VersionTagClass(versionTag=str(model_version.version_id)), - hyperParams=hyperparams, - trainingMetrics=training_metrics, groups=[ml_model_group_urn], # link model version to model group - trainingJobs=training_job_names if training_job_names else None, # link to training job - deployments=[], # link to model registry and endpoint - externalUrl=self._make_model_version_external_url(model) - # tags=list(model_version.tags.keys()), - # customProperties=model_version.tags, + # trainingJobs=training_job_names if training_job_names else None, # link to training job + deployments=[endpoint_urn] if endpoint is not None else [], # link to model registry and endpoint + externalUrl=self._make_model_version_external_url(model), ) wu = self._create_workunit(urn=ml_model_urn, aspect=ml_model_properties) + logging.info(f"created model version {ml_model_properties.name} associated with group {ml_model_group_urn}") + return wu + def _search_endpoint(self, model: Model) -> Optional[Endpoint]: + """ + Search for an endpoint associated with the model. 
+ """ + + if self.endpoints is None: + self.endpoints = self.client.Endpoint.list() + for endpoint in self.endpoints: + deployed_models = endpoint.list_models() + if model.resource_name in deployed_models: + return endpoint + + return None + def _make_ml_model_urn(self, model_version: VersionInfo, model_name:str) -> str: urn = builder.make_ml_model_urn( From 78d184bc744d55930e99897641db9ea426bfc4bc Mon Sep 17 00:00:00 2001 From: Ryota Egashira Date: Mon, 24 Feb 2025 11:57:36 -0800 Subject: [PATCH 12/59] added endopint workunit creation and refactored --- .../src/datahub/ingestion/source/vertexai.py | 142 ++++++++++-------- 1 file changed, 83 insertions(+), 59 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py index bd3eff560c3d44..04ebb52c2dc130 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py +++ b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py @@ -1,16 +1,21 @@ import logging import time -from typing import Iterable, Optional, TypeVar, List +from typing import Iterable, List, Optional, TypeVar from google.cloud import aiplatform -from google.cloud.aiplatform import AutoMLTabularTrainingJob, CustomJob, AutoMLTextTrainingJob, AutoMLImageTrainingJob, \ - AutoMLVideoTrainingJob, AutoMLForecastingTrainingJob, Endpoint +from google.cloud.aiplatform import ( + AutoMLForecastingTrainingJob, + AutoMLImageTrainingJob, + AutoMLTabularTrainingJob, + AutoMLTextTrainingJob, + AutoMLVideoTrainingJob, + Endpoint, +) from google.cloud.aiplatform.models import Model, VersionInfo from google.cloud.aiplatform.training_jobs import _TrainingJob from pydantic.fields import Field import datahub.emitter.mce_builder as builder -from datahub.api.entities.dataprocess.dataprocess_instance import DataProcessInstance from datahub.configuration.source_common import EnvConfigMixin from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api.common import PipelineContext @@ -23,13 +28,17 @@ ) from datahub.ingestion.api.source import Source, SourceCapability, SourceReport from datahub.ingestion.api.workunit import MetadataWorkUnit -from datahub.metadata._schema_classes import DataProcessInstancePropertiesClass, AuditStampClass, \ - DataProcessInstanceInputClass, TimeStampClass +from datahub.metadata._schema_classes import ( + AuditStampClass, + DataProcessInstanceInputClass, + DataProcessInstancePropertiesClass, + TimeStampClass, +) from datahub.metadata.schema_classes import ( + MLModelDeploymentPropertiesClass, MLModelGroupPropertiesClass, MLModelPropertiesClass, VersionTagClass, - MLModelDeploymentPropertiesClass, _Aspect, ) @@ -78,7 +87,7 @@ def get_report(self) -> SourceReport: def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: """ - Main Function to fetch and yields workunits for various VertexAI resources. + Main Function to fetch and yields work units for various VertexAI resources. - Models and Model Versions from the Model Registry - Training Jobs """ @@ -116,7 +125,7 @@ def _validate_training_job(self, model: Model) -> bool: def _get_ml_model_workunits(self) -> Iterable[MetadataWorkUnit]: """ - Fetch List of Models in Model Registry and generate a corresponding workunit. + Fetch List of Models in Model Registry and generate a corresponding work unit. 
""" registered_models = self.client.Model.list() for model in registered_models: @@ -133,35 +142,35 @@ def _get_ml_model_workunits(self) -> Iterable[MetadataWorkUnit]: # create work unit for Model (= Model Version in VertexAI) logger.info(f"generating MLProperties work unit for model (name: {model.display_name} id:{model.name})") - yield self._get_ml_model_properties_workunit(model=model, model_version=model_version) + yield from self._get_ml_model_endpoint_workunit(model=model, model_version=model_version) def _get_training_job_workunit(self) -> Iterable[MetadataWorkUnit]: """ - Fetches training jobs from Vertex AI and generates corresponding workunits. + Fetches training jobs from Vertex AI and generates corresponding work units. This method retrieves various types of training jobs from Vertex AI, including CustomJob, CustomTrainingJob, CustomContainerTrainingJob, CustomPythonPackageTrainingJob, AutoMLTabularTrainingJob, AutoMLTextTrainingJob, AutoMLImageTrainingJob, AutoMLVideoTrainingJob, - and AutoMLForecastingTrainingJob. For each job, it generates workunits containing metadata + and AutoMLForecastingTrainingJob. For each job, it generates work units containing metadata about the job, its inputs, and its outputs. """ - logger.info(f"fetching a list of CustomJobs") + logger.info("Fetching a list of CustomJobs from VertexAI server") yield from self._get_data_process_workunit(self.client.CustomJob.list()) - logger.info(f"fetching a list of CustomTrainingJobs") + logger.info("fetching a list of CustomTrainingJobs from VertexAI server") yield from self._get_data_process_workunit(self.client.CustomTrainingJob.list()) - logger.info(f"fetching a list of CustomContainerTrainingJob") + logger.info("fetching a list of CustomContainerTrainingJob from VertexAI server") yield from self._get_data_process_workunit(self.client.CustomContainerTrainingJob.list()) - logger.info(f"fetching a list of CustomPythonPackageTrainingJob") + logger.info("fetching a list of CustomPythonPackageTrainingJob from VertexAI server") yield from self._get_data_process_workunit(self.client.CustomPythonPackageTrainingJob.list()) - logger.info(f"fetching a list of AutoMLTabularTrainingJobs") + logger.info("fetching a list of AutoMLTabularTrainingJobs from VertexAI server") yield from self._get_data_process_workunit(self.client.AutoMLTabularTrainingJob.list()) - logger.info(f"fetching a list of AutoMLTextTrainingJobs") + logger.info("fetching a list of AutoMLTextTrainingJobs from VertexAI server") yield from self._get_data_process_workunit(self.client.AutoMLTextTrainingJob.list()) - logger.info(f"fetching a list of AutoMLImageTrainingJobs") + logger.info("fetching a list of AutoMLImageTrainingJobs from VertexAI server") yield from self._get_data_process_workunit(self.client.AutoMLImageTrainingJob.list()) - logger.info(f"fetching a list of AutoMLVideoTrainingJobs") + logger.info("fetching a list of AutoMLVideoTrainingJobs from VertexAI server") yield from self._get_data_process_workunit(self.client.AutoMLVideoTrainingJob.list()) - logger.info(f"fetching a list of AutoMLForecastingTrainingJobs") + logger.info("fetching a list of AutoMLForecastingTrainingJobs from VertexAI server") yield from self._get_data_process_workunit(self.client.AutoMLForecastingTrainingJob.list()) def _get_data_process_workunit(self, jobs: List[_TrainingJob]) -> Iterable[MetadataWorkUnit]: @@ -227,6 +236,13 @@ def _is_automl_job(self, job: _TrainingJob) -> bool: isinstance(job, AutoMLVideoTrainingJob)) or isinstance(job, AutoMLForecastingTrainingJob)) + 
def _search_model_version(self, model:Model, version_id:str) -> Optional[VersionInfo]: + for version in model.versioning_registry.list_versions(): + if version.version_id == version_id: + return version + return None + + def _get_job_output_workunit(self, job: _TrainingJob) -> Iterable[MetadataWorkUnit]: """ This method creates work units that link the training job to the model version @@ -239,27 +255,16 @@ def _get_job_output_workunit(self, job: _TrainingJob) -> Iterable[MetadataWorkUn if ("modelToUpload" in job_conf and "name" in job_conf["modelToUpload"] and job_conf["modelToUpload"]["name"]): model_id = job_conf["modelToUpload"]["name"].split("/")[-1] - model_version = job_conf["modelToUpload"]["versionId"] - model_display_name = job_conf["modelToUpload"]["displayName"] - entity_id = f"{model_id}{self.config.model_name_separator}{model_version}" - model_version = self._make_vertexai_name(entity_type="model", entity_id=entity_id) - - model_version_urn = builder.make_ml_model_urn( - platform=self.platform, - model_name=model_version, - env=self.config.env, - ) - + model_version_str = job_conf["modelToUpload"]["versionId"] job_urn = self._make_job_urn(job) - logger.info(f"found that training job: {job.display_name} generated a model (name:{model_display_name} id:{model_id})") - - - aspect = MLModelPropertiesClass( - trainingJobs=[job_urn], - customProperties={"displayName": model_display_name} - ) - yield self._create_workunit(urn=model_version_urn, aspect=aspect) + model = Model(model_name=job_conf["modelToUpload"]["name"]) + model_version = self._search_model_version(model, model_version_str) + if model and model_version: + logger.info( + f"found that training job: {job.display_name} generated " + f"a model (name:{model.display_name} id:{model_version_str})") + yield from self._get_ml_model_endpoint_workunit(model, model_version, job_urn) def _get_job_input_workunit(self, job: _TrainingJob) -> Iterable[MetadataWorkUnit]: """ @@ -303,8 +308,40 @@ def _get_data_process_input_workunit(self, job: _TrainingJob, dataset_id: str) - return self._create_workunit(urn=entityUrn, aspect=aspect) + def _get_ml_model_endpoint_workunit(self, model: Model, model_version: VersionInfo, + training_job_urn: Optional[str] = None) -> Iterable[MetadataWorkUnit]: - def _get_ml_model_properties_workunit(self,model: Model,model_version: VersionInfo) -> MetadataWorkUnit: + """ + Generate an MLModel and Endopint workunit for an VertexAI Model Version. 
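# Editor's sketch (illustrative, not part of the patch): _search_model_version
# above can stop at the first match by using next() over the version registry.
def _search_model_version(
    self, model: Model, version_id: str
) -> Optional[VersionInfo]:
    return next(
        (
            version
            for version in model.versioning_registry.list_versions()
            if version.version_id == version_id
        ),
        None,
    )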
+ """ + logging.info(f"starting model work unit for model {model.name}") + + ml_model_group_urn = self._make_ml_model_group_urn(model) + model_name = self._make_vertexai_name(entity_type="model", entity_id=model.name) + ml_model_urn = self._make_ml_model_urn(model_version, model_name=model_name) + model_version_name = f"{model_name}{self.config.model_name_separator}{model_version.version_id}" + + endpoint: Optional[Endpoint] = self._search_endpoint(model) + endpoint_urn = None + + if endpoint: + endpoint_urn = builder.make_ml_model_deployment_urn( + platform=self.platform, + deployment_name=self._make_vertexai_name("endpoint", endpoint.display_name), + env=self.config.env + ) if endpoint else None + ml_deployment_properties = MLModelDeploymentPropertiesClass( + description=model.description, + createdAt=int(endpoint.create_time.timestamp()), + version=VersionTagClass(versionTag=str(model_version.version_id)), + customProperties={"displayName": endpoint.display_name} + ) + yield self._create_workunit(urn=endpoint_urn, aspect=ml_deployment_properties) + + yield self._get_ml_model_properties_workunit(model, model_version,training_job_urn, endpoint_urn) + + def _get_ml_model_properties_workunit(self, model: Model, model_version: VersionInfo, + training_job_urn:Optional[str] = None, endpoint_urn:Optional[str] = None) -> MetadataWorkUnit: """ Generate an MLModel workunit for an VertexAI Model Version. Every Model Version is a DataHub MLModel entity associated with an MLModelGroup @@ -314,20 +351,9 @@ def _get_ml_model_properties_workunit(self,model: Model,model_version: VersionIn ml_model_group_urn = self._make_ml_model_group_urn(model) model_name = self._make_vertexai_name(entity_type="model", entity_id=model.name) - ml_model_urn = self._make_ml_model_urn(model_version, model_name=model_name) model_version_name = f"{model_name}{self.config.model_name_separator}{model_version.version_id}" + ml_model_urn = self._make_ml_model_urn(model_version, model_name=model_name) - training_job_names = None - - # if self._validate_training_job(model): - # training_job_names = [model.training_job.name] - - endpoint:Optional[Endpoint] = self._search_endpoint(model) - endpoint_urn = builder.make_ml_model_deployment_urn( - platform=self.platform, - deployment_name=self._make_vertexai_name("endpoint", endpoint.display_name), - env=self.config.env - ) if endpoint else None ml_model_properties = MLModelPropertiesClass( name=model_version_name, @@ -339,15 +365,14 @@ def _get_ml_model_properties_workunit(self,model: Model,model_version: VersionIn lastModified=TimeStampClass(model_version.version_update_time.second), version=VersionTagClass(versionTag=str(model_version.version_id)), groups=[ml_model_group_urn], # link model version to model group - # trainingJobs=training_job_names if training_job_names else None, # link to training job - deployments=[endpoint_urn] if endpoint is not None else [], # link to model registry and endpoint + trainingJobs=[training_job_urn] if training_job_urn else None, # link to training job + deployments=[endpoint_urn] if endpoint_urn else [], # link to model registry and endpoint externalUrl=self._make_model_version_external_url(model), ) - wu = self._create_workunit(urn=ml_model_urn, aspect=ml_model_properties) - logging.info(f"created model version {ml_model_properties.name} associated with group {ml_model_group_urn}") + # logging.info(f"created model version {ml_model_properties.name} associated with group {ml_model_group_urn}") + return self._create_workunit(urn=ml_model_urn, 
aspect=ml_model_properties) - return wu def _search_endpoint(self, model: Model) -> Optional[Endpoint]: @@ -393,7 +418,6 @@ def _make_job_external_url(self, job: _TrainingJob): Model external URL in Vertex AI Sample URLs: https://console.cloud.google.com/vertex-ai/training/training-pipelines?project=acryl-poc&trainingPipelineId=5401695018589093888 - https://console.cloud.google.com/vertex-ai/models/locations/us-west2/models/812468724182286336/versions/default?backTo=training&trainingPipelineId=5401695018589093888&project=acryl-poc """ entity_type = "training" external_url = (f"{self.vertexai_base_url}/{entity_type}/training-pipelines?trainingPipelineId={job.name}" From d746a4c466e6b6f9ff2054ac2359afa813fd4f43 Mon Sep 17 00:00:00 2001 From: Ryota Egashira Date: Mon, 24 Feb 2025 12:01:16 -0800 Subject: [PATCH 13/59] commit temporarily --- metadata-ingestion/docs/sources/vertexai/vertexai_recipe.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/metadata-ingestion/docs/sources/vertexai/vertexai_recipe.yml b/metadata-ingestion/docs/sources/vertexai/vertexai_recipe.yml index 91d8e2a86fee48..840e5fd6c4b5d5 100644 --- a/metadata-ingestion/docs/sources/vertexai/vertexai_recipe.yml +++ b/metadata-ingestion/docs/sources/vertexai/vertexai_recipe.yml @@ -8,3 +8,4 @@ sink: type: "datahub-rest" config: server: "http://localhost:8080" + token: "eyJhbGciOiJIUzI1NiJ9.eyJhY3RvclR5cGUiOiJVU0VSIiwiYWN0b3JJZCI6ImRhdGFodWIiLCJ0eXBlIjoiUEVSU09OQUwiLCJ2ZXJzaW9uIjoiMiIsImp0aSI6IjU5OTBhZjRjLTFiOTEtNDg1Zi1iNDk3LTJmZjVlODA0ODY3YSIsInN1YiI6ImRhdGFodWIiLCJleHAiOjE3NDI4OTA1NDgsImlzcyI6ImRhdGFodWItbWV0YWRhdGEtc2VydmljZSJ9.yDZzG_Kes9GCYIJeLRNibzTryyzIXG_ve6o3VcDByMo" From 5fbe0e511c01d955c763061be46160d35437a76b Mon Sep 17 00:00:00 2001 From: Ryota Egashira Date: Mon, 24 Feb 2025 12:06:12 -0800 Subject: [PATCH 14/59] lintfix --- .../src/datahub/ingestion/source/vertexai.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py index 04ebb52c2dc130..d32d157968ead8 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py +++ b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py @@ -117,9 +117,10 @@ def _validate_training_job(self, model: Model) -> bool: try: # when model has ref to training job, but field is not accessible, it is not valid name = job.name + logger.debug((f"can fetch training job name: {name} for model: (name:{model.display_name} id:{model.name})")) return True except RuntimeError: - logger.info(f"cannot fetch training job name, not valid for model (name:{model.display_name} id:{model.name})") + logger.debug(f"cannot fetch training job name, not valid for model (name:{model.display_name} id:{model.name})") return False @@ -254,7 +255,6 @@ def _get_job_output_workunit(self, job: _TrainingJob) -> Iterable[MetadataWorkUn job_conf = job.to_dict() if ("modelToUpload" in job_conf and "name" in job_conf["modelToUpload"] and job_conf["modelToUpload"]["name"]): - model_id = job_conf["modelToUpload"]["name"].split("/")[-1] model_version_str = job_conf["modelToUpload"]["versionId"] job_urn = self._make_job_urn(job) @@ -314,12 +314,6 @@ def _get_ml_model_endpoint_workunit(self, model: Model, model_version: VersionIn """ Generate an MLModel and Endopint workunit for an VertexAI Model Version. 
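# Editor's note (illustrative, not part of the patch): the vertexai_recipe.yml
# hunk above commits a personal access token in plain text; DataHub recipes can
# resolve such values from environment variables instead. The variable name below
# is only an example.
sink:
  type: "datahub-rest"
  config:
    server: "http://localhost:8080"
    token: "${DATAHUB_GMS_TOKEN}"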
""" - logging.info(f"starting model work unit for model {model.name}") - - ml_model_group_urn = self._make_ml_model_group_urn(model) - model_name = self._make_vertexai_name(entity_type="model", entity_id=model.name) - ml_model_urn = self._make_ml_model_urn(model_version, model_name=model_name) - model_version_name = f"{model_name}{self.config.model_name_separator}{model_version.version_id}" endpoint: Optional[Endpoint] = self._search_endpoint(model) endpoint_urn = None From 9f8e8a3fb3a479cf42a7153d9212d0af583640be Mon Sep 17 00:00:00 2001 From: Ryota Egashira Date: Mon, 24 Feb 2025 12:26:22 -0800 Subject: [PATCH 15/59] removing unnecesary commits --- .../app/ingest/source/IngestionSourceList.tsx | 5 + .../examples/ai/dh_ai_client.py | 99 +----------- .../ai/vertexai_example_experiment.py | 141 ------------------ .../ai/vertexai_example_training_job.py | 116 -------------- .../src/datahub/ingestion/source/vertexai.py | 6 +- 5 files changed, 15 insertions(+), 352 deletions(-) delete mode 100644 metadata-ingestion/examples/ai/vertexai_example_experiment.py delete mode 100644 metadata-ingestion/examples/ai/vertexai_example_training_job.py diff --git a/datahub-web-react/src/app/ingest/source/IngestionSourceList.tsx b/datahub-web-react/src/app/ingest/source/IngestionSourceList.tsx index f9d0e48e1f0867..8cc3845b1b338b 100644 --- a/datahub-web-react/src/app/ingest/source/IngestionSourceList.tsx +++ b/datahub-web-react/src/app/ingest/source/IngestionSourceList.tsx @@ -98,6 +98,11 @@ export const IngestionSourceList = () => { const [sort, setSort] = useState(); const [hideSystemSources, setHideSystemSources] = useState(true); + // When source filter changes, reset page to 1 + useEffect(() => { + setPage(1); + }, [sourceFilter]); + /** * Show or hide system ingestion sources using a hidden command S command. 
*/ diff --git a/metadata-ingestion/examples/ai/dh_ai_client.py b/metadata-ingestion/examples/ai/dh_ai_client.py index 7c65f54f5dfeb9..b51c8dd050cf1b 100644 --- a/metadata-ingestion/examples/ai/dh_ai_client.py +++ b/metadata-ingestion/examples/ai/dh_ai_client.py @@ -290,62 +290,6 @@ def create_model( logger.info(f"Created model: {model_urn}") return str(model_urn) - def create_training_job( - self, - run_id: str, - properties: Optional[models.DataProcessInstancePropertiesClass] = None, - training_run_properties: Optional[models.MLTrainingRunPropertiesClass] = None, - run_result: Optional[str] = None, - start_timestamp: Optional[int] = None, - end_timestamp: Optional[int] = None, - **kwargs: Any, - ) -> str: - """Create a training job with properties and events.""" - dpi_urn = f"urn:li:dataProcessInstance:{run_id}" - - # Create basic properties and aspects - aspects = [ - ( - properties - or self._create_properties_class( - models.DataProcessInstancePropertiesClass, kwargs - ) - ), - models.SubTypesClass(typeNames=["ML Training Run"]), - ] - - # Add training run properties if provided - if training_run_properties: - aspects.append(training_run_properties) - - # Handle run events - current_time = int(time.time() * 1000) - start_ts = start_timestamp or current_time - end_ts = end_timestamp or current_time - - # Create events - aspects.append( - self._create_run_event( - status=DataProcessRunStatusClass.STARTED, timestamp=start_ts - ) - ) - - if run_result: - aspects.append( - self._create_run_event( - status=DataProcessRunStatusClass.COMPLETE, - timestamp=end_ts, - result=run_result, - duration_millis=end_ts - start_ts, - ) - ) - - # Create and emit MCPs - mcps = [self._create_mcp(dpi_urn, aspect) for aspect in aspects] - self._emit_mcps(mcps) - logger.info(f"Created training job: {dpi_urn}") - return dpi_urn - def create_experiment( self, experiment_id: str, @@ -438,48 +382,28 @@ def create_dataset(self, name: str, platform: str, **kwargs: Any) -> str: raise ValueError(f"Failed to create dataset URN for {name}") return dataset.urn - def _add_process_to_model(self, model_urn: str, process_urn: str) -> None: - """Add a DataProcessInstance to a model while preserving existing properties.""" + def add_run_to_model(self, model_urn: str, run_urn: str) -> None: + """Add a run to a model while preserving existing properties.""" self._update_entity_properties( entity_urn=model_urn, aspect_type=models.MLModelPropertiesClass, - updates={"trainingJobs": process_urn}, + updates={"trainingJobs": run_urn}, entity_type="mlModel", skip_properties=["trainingJobs"], ) - - def add_run_to_model(self, model_urn: str, run_urn: str) -> None: - """Add a run to a model while preserving existing properties.""" - self._add_process_to_model(model_urn, run_urn) logger.info(f"Added run {run_urn} to model {model_urn}") - def add_job_to_model(self, model_urn: str, job_urn: str) -> None: - """Add a job to a model while preserving existing properties.""" - self._add_process_to_model(model_urn, job_urn) - logger.info(f"Added training job {job_urn} to model {model_urn}") - - def _add_process_to_model_group( - self, model_group_urn: str, process_urn: str - ) -> None: - """Add DatapProcessInstance to a model group while preserving existing properties.""" + def add_run_to_model_group(self, model_group_urn: str, run_urn: str) -> None: + """Add a run to a model group while preserving existing properties.""" self._update_entity_properties( entity_urn=model_group_urn, aspect_type=models.MLModelGroupPropertiesClass, - 
updates={"trainingJobs": process_urn}, + updates={"trainingJobs": run_urn}, entity_type="mlModelGroup", skip_properties=["trainingJobs"], ) - - def add_run_to_model_group(self, model_group_urn: str, run_urn: str) -> None: - """Add a run to a model group while preserving existing properties.""" - self._add_process_to_model_group(model_group_urn, run_urn) logger.info(f"Added run {run_urn} to model group {model_group_urn}") - def add_job_to_model_group(self, model_group_urn: str, job_urn: str) -> None: - """Add a job to a model group while preserving existing properties.""" - self._add_process_to_model_group(model_group_urn, job_urn) - logger.info(f"Added job {job_urn} to model group {model_group_urn}") - def add_model_to_model_group(self, model_urn: str, group_urn: str) -> None: """Add a model to a group while preserving existing properties""" self._update_entity_properties( @@ -499,9 +423,7 @@ def add_run_to_experiment(self, run_urn: str, experiment_urn: str) -> None: self._emit_mcps(mcp) logger.info(f"Added run {run_urn} to experiment {experiment_urn}") - def _add_input_datasets_to_process( - self, run_urn: str, dataset_urns: List[str] - ) -> None: + def add_input_datasets_to_run(self, run_urn: str, dataset_urns: List[str]) -> None: """Add input datasets to a run""" mcp = self._create_mcp( entity_urn=run_urn, @@ -510,15 +432,8 @@ def _add_input_datasets_to_process( aspect=DataProcessInstanceInput(inputs=dataset_urns), ) self._emit_mcps(mcp) - - def add_input_datasets_to_run(self, run_urn: str, dataset_urns: List[str]) -> None: - self._add_input_datasets_to_process(run_urn, dataset_urns) logger.info(f"Added input datasets to run {run_urn}") - def add_input_datasets_to_job(self, job_urn: str, dataset_urns: List[str]) -> None: - self._add_input_datasets_to_process(job_urn, dataset_urns) - logger.info(f"Added input datasets to training job {job_urn}") - def add_output_datasets_to_run(self, run_urn: str, dataset_urns: List[str]) -> None: """Add output datasets to a run""" mcp = self._create_mcp( diff --git a/metadata-ingestion/examples/ai/vertexai_example_experiment.py b/metadata-ingestion/examples/ai/vertexai_example_experiment.py deleted file mode 100644 index 264b428a880aa0..00000000000000 --- a/metadata-ingestion/examples/ai/vertexai_example_experiment.py +++ /dev/null @@ -1,141 +0,0 @@ -import argparse - -from dh_ai_client import DatahubAIClient - -import datahub.metadata.schema_classes as models -from datahub.metadata.com.linkedin.pegasus2avro.dataprocess import RunResultType - - -def create_experiment_example(client: DatahubAIClient) -> None: - experiment_urn = client.create_experiment( - experiment_id="table_classification_experiment", - properties=models.ContainerPropertiesClass( - name="Tabular classification Experiment", - description="Experiment for tabular classification", - customProperties={"team": "forecasting"}, - created=models.TimeStampClass( - time=1628580000000, actor="urn:li:corpuser:datahub" - ), - lastModified=models.TimeStampClass( - time=1628580000000, actor="urn:li:corpuser:datahub" - ), - ), - ) - - # Create a training run - run_urn = client.create_training_run( - run_id="simple_training_run", - properties=models.DataProcessInstancePropertiesClass( - name="Simple Training Run", - created=models.AuditStampClass( - time=1628580000000, actor="urn:li:corpuser:datahub" - ), - customProperties={"team": "forecasting"}, - ), - training_run_properties=models.MLTrainingRunPropertiesClass( - id="simple_training_run", - outputUrls=["gc://my-bucket/output"], - 
trainingMetrics=[models.MLMetricClass(name="accuracy", value="0.9")], - hyperParams=[models.MLHyperParamClass(name="learning_rate", value="0.01")], - externalUrl="https:localhost:5000", - ), - run_result=RunResultType.FAILURE, - start_timestamp=1628580000000, - end_timestamp=1628580001000, - ) - - # Create model group - model_group_urn = client.create_model_group( - group_id="AutoML-prediction-model-group", - properties=models.MLModelGroupPropertiesClass( - name="AutoML training", - description="Tabular classification prediction models", - created=models.TimeStampClass( - time=1628580000000, actor="urn:li:corpuser:datahub" - ), - ), - ) - - # Creating a model - model_urn = client.create_model( - model_id="automl-prediction-model", - properties=models.MLModelPropertiesClass( - name="AutoML training", - description="Tabular classification prediction models", - customProperties={"team": "forecasting"}, - trainingMetrics=[ - models.MLMetricClass(name="accuracy", value="0.9"), - models.MLMetricClass(name="precision", value="0.8"), - ], - hyperParams=[ - models.MLHyperParamClass(name="learning_rate", value="0.01"), - models.MLHyperParamClass(name="batch_size", value="32"), - ], - externalUrl="https:localhost:5000", - created=models.TimeStampClass( - time=1628580000000, actor="urn:li:corpuser:datahub" - ), - lastModified=models.TimeStampClass( - time=1628580000000, actor="urn:li:corpuser:datahub" - ), - tags=["forecasting", "prediction"], - ), - version="3583871344875405312", - alias="champion", - ) - - # Create datasets - input_dataset_urn = client.create_dataset( - platform="gcs", - name="table_input", - ) - - output_dataset_urn = client.create_dataset( - platform="gcs", - name="table_output", - ) - - # Add run to experiment - client.add_run_to_experiment(run_urn=run_urn, experiment_urn=experiment_urn) - - # Add run to model - client.add_run_to_model( - model_urn=model_urn, - run_urn=run_urn, - ) - - # add run to model group - client.add_run_to_model_group( - model_group_urn=model_group_urn, - run_urn=run_urn, - ) - - # Add input and output datasets to run - client.add_input_datasets_to_run( - run_urn=run_urn, dataset_urns=[str(input_dataset_urn)] - ) - - client.add_output_datasets_to_run( - run_urn=run_urn, dataset_urns=[str(output_dataset_urn)] - ) - - -if __name__ == "__main__": - # Example usage - parser = argparse.ArgumentParser() - parser.add_argument("--token", required=False, help="DataHub access token") - parser.add_argument( - "--server_url", - required=False, - default="http://localhost:8080", - help="DataHub server URL (defaults to http://localhost:8080)", - ) - parser.add_argument("--platform", default="vertexai", help="platform name") - args = parser.parse_args() - - # Create Client - client = DatahubAIClient( - token=args.token, server_url=args.server_url, platform=args.platform - ) - - create_experiment_example(client) diff --git a/metadata-ingestion/examples/ai/vertexai_example_training_job.py b/metadata-ingestion/examples/ai/vertexai_example_training_job.py deleted file mode 100644 index 25febf1b74e3ff..00000000000000 --- a/metadata-ingestion/examples/ai/vertexai_example_training_job.py +++ /dev/null @@ -1,116 +0,0 @@ -import argparse - -from dh_ai_client import DatahubAIClient - -import datahub.metadata.schema_classes as models -from datahub.metadata.com.linkedin.pegasus2avro.dataprocess import RunResultType - - -def create_training_job_example(client: DatahubAIClient) -> None: - # Create Training Job - training_job_urn = client.create_training_job( - 
run_id="train-petfinder-automl-job", - properties=models.DataProcessInstancePropertiesClass( - name="Training Job", - created=models.AuditStampClass( - time=1628580000000, actor="urn:li:corpuser:datahub" - ), - customProperties={"team": "classification"}, - ), - training_run_properties=models.MLTrainingRunPropertiesClass( - id="train-petfinder-automl-job", - outputUrls=["gc://my-bucket/output"], - trainingMetrics=[models.MLMetricClass(name="accuracy", value="0.9")], - hyperParams=[models.MLHyperParamClass(name="learning_rate", value="0.01")], - externalUrl="https:localhost:5000", - ), - run_result=RunResultType.FAILURE, - start_timestamp=1628580000000, - end_timestamp=1628580001000, - ) - - # Create model group - model_group_urn = client.create_model_group( - group_id="AutoML-prediction-model-group", - properties=models.MLModelGroupPropertiesClass( - name="AutoML training", - description="Tabular classification prediction models", - created=models.TimeStampClass( - time=1628580000000, actor="urn:li:corpuser:datahub" - ), - ), - ) - - # Creating a model with metrics - model_urn = client.create_model( - model_id="automl-prediction-model", - properties=models.MLModelPropertiesClass( - name="AutoML training", - description="Tabular classification prediction models", - customProperties={"team": "forecasting"}, - trainingMetrics=[ - models.MLMetricClass(name="accuracy", value="0.9"), - models.MLMetricClass(name="precision", value="0.8"), - ], - hyperParams=[ - models.MLHyperParamClass(name="learning_rate", value="0.01"), - models.MLHyperParamClass(name="batch_size", value="32"), - ], - externalUrl="https:localhost:5000", - created=models.TimeStampClass( - time=1628580000000, actor="urn:li:corpuser:datahub" - ), - lastModified=models.TimeStampClass( - time=1628580000000, actor="urn:li:corpuser:datahub" - ), - tags=["forecasting", "prediction"], - ), - version="3583871344875405312", - alias="champion", - ) - - # Create datasets - input_dataset_urn = client.create_dataset( - platform="gcs", - name="classification_input_data", - ) - - # Add model to model group - client.add_model_to_model_group(model_urn=model_urn, group_urn=model_group_urn) - - # Add training job to model - client.add_job_to_model( - model_urn=model_urn, - job_urn=training_job_urn, - ) - - # add training job to model group - client.add_job_to_model_group( - model_group_urn=model_group_urn, - job_urn=training_job_urn, - ) - - # Add input and output datasets to run - client.add_input_datasets_to_job( - job_urn=training_job_urn, dataset_urns=[str(input_dataset_urn)] - ) - - -if __name__ == "__main__": - # Example usage - parser = argparse.ArgumentParser() - parser.add_argument("--token", required=False, help="DataHub access token") - parser.add_argument( - "--server_url", - required=False, - default="http://localhost:8080", - help="DataHub server URL (defaults to http://localhost:8080)", - ) - parser.add_argument("--platform", default="vertexai", help="platform name") - args = parser.parse_args() - # Create Client - client = DatahubAIClient( - token=args.token, server_url=args.server_url, platform=args.platform - ) - - create_training_job_example(client) diff --git a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py index d32d157968ead8..bcc96a3bf33252 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py +++ b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py @@ -227,7 +227,7 @@ def _get_data_process_properties_workunit(self, job: 
_TrainingJob) -> MetadataWo customProperties={"displayName": job.display_name} ) - logging.info(f"generating data process instance for training job: {entityUrn}") + logging.info(f"Generating data process instance for training job: {entityUrn}") return self._create_workunit(urn=entityUrn, aspect=aspect) def _is_automl_job(self, job: _TrainingJob) -> bool: @@ -253,7 +253,7 @@ def _get_job_output_workunit(self, job: _TrainingJob) -> Iterable[MetadataWorkUn """ job_conf = job.to_dict() - if ("modelToUpload" in job_conf and "name" in job_conf["modelToUpload"] and job_conf["modelToUpload"]["name"]): + if "modelToUpload" in job_conf and "name" in job_conf["modelToUpload"] and job_conf["modelToUpload"]["name"]: model_version_str = job_conf["modelToUpload"]["versionId"] job_urn = self._make_job_urn(job) @@ -312,7 +312,7 @@ def _get_ml_model_endpoint_workunit(self, model: Model, model_version: VersionIn training_job_urn: Optional[str] = None) -> Iterable[MetadataWorkUnit]: """ - Generate an MLModel and Endopint workunit for an VertexAI Model Version. + Generate an MLModel and Endpoint work unit for an VertexAI Model Version. """ endpoint: Optional[Endpoint] = self._search_endpoint(model) From 85d1830dd92ab7700d90b7a186b296ba0a651152 Mon Sep 17 00:00:00 2001 From: Ryota Egashira Date: Mon, 24 Feb 2025 12:29:43 -0800 Subject: [PATCH 16/59] cleanup recipe --- metadata-ingestion/docs/sources/vertexai/vertexai_recipe.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/metadata-ingestion/docs/sources/vertexai/vertexai_recipe.yml b/metadata-ingestion/docs/sources/vertexai/vertexai_recipe.yml index 840e5fd6c4b5d5..91d8e2a86fee48 100644 --- a/metadata-ingestion/docs/sources/vertexai/vertexai_recipe.yml +++ b/metadata-ingestion/docs/sources/vertexai/vertexai_recipe.yml @@ -8,4 +8,3 @@ sink: type: "datahub-rest" config: server: "http://localhost:8080" - token: "eyJhbGciOiJIUzI1NiJ9.eyJhY3RvclR5cGUiOiJVU0VSIiwiYWN0b3JJZCI6ImRhdGFodWIiLCJ0eXBlIjoiUEVSU09OQUwiLCJ2ZXJzaW9uIjoiMiIsImp0aSI6IjU5OTBhZjRjLTFiOTEtNDg1Zi1iNDk3LTJmZjVlODA0ODY3YSIsInN1YiI6ImRhdGFodWIiLCJleHAiOjE3NDI4OTA1NDgsImlzcyI6ImRhdGFodWItbWV0YWRhdGEtc2VydmljZSJ9.yDZzG_Kes9GCYIJeLRNibzTryyzIXG_ve6o3VcDByMo" From aae68934a3379cc76a27342c5d1bf91fcf98222b Mon Sep 17 00:00:00 2001 From: Ryota Egashira Date: Mon, 24 Feb 2025 12:45:55 -0800 Subject: [PATCH 17/59] minor change in config --- .../src/datahub/ingestion/source/vertexai.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py index bcc96a3bf33252..35e639086a87b9 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py +++ b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py @@ -56,6 +56,11 @@ class VertexAIConfig(EnvConfigMixin): description=("Bucket URI used in your project"), ) + vertexai_url: Optional[str] = Field( + default="https://console.cloud.google.com/vertex-ai", + description=("VertexUI URI"), + ) + model_name_separator: str = Field( default="_", description="A string which separates model name from its version (e.g. 
model_1 or model-1)", @@ -72,7 +77,6 @@ class VertexAIConfig(EnvConfigMixin): @capability(SourceCapability.TAGS, "Extract tags for VertexAI Registered Model Stages") class VertexAISource(Source): platform = "vertexai" - vertexai_base_url = "https://console.cloud.google.com/vertex-ai" def __init__(self, ctx: PipelineContext, config: VertexAIConfig): super().__init__(ctx) @@ -414,7 +418,7 @@ def _make_job_external_url(self, job: _TrainingJob): https://console.cloud.google.com/vertex-ai/training/training-pipelines?project=acryl-poc&trainingPipelineId=5401695018589093888 """ entity_type = "training" - external_url = (f"{self.vertexai_base_url}/{entity_type}/training-pipelines?trainingPipelineId={job.name}" + external_url = (f"{self.config.vertexai_url}/{entity_type}/training-pipelines?trainingPipelineId={job.name}" f"?project={self.config.project_id}") return external_url @@ -425,7 +429,7 @@ def _make_model_external_url(self, model: Model): https://console.cloud.google.com/vertex-ai/models/locations/us-west2/models/812468724182286336?project=acryl-poc """ entity_type = "models" - external_url = (f"{self.vertexai_base_url}/{entity_type}/locations/{self.config.region}/{entity_type}/{model.name}" + external_url = (f"{self.config.vertexai_url}/{entity_type}/locations/{self.config.region}/{entity_type}/{model.name}" f"?project={self.config.project_id}") return external_url @@ -436,7 +440,7 @@ def _make_model_version_external_url(self, model: Model): https://console.cloud.google.com/vertex-ai/models/locations/us-west2/models/812468724182286336/versions/1?project=acryl-poc """ entity_type = "models" - external_url = (f"{self.vertexai_base_url}/{entity_type}/locations/{self.config.region}/{entity_type}/{model.name}" + external_url = (f"{self.config.vertexai_url}/{entity_type}/locations/{self.config.region}/{entity_type}/{model.name}" f"/versions/{model.version_id}" f"?project={self.config.project_id}") return external_url From 764f8fdcbcce5af41732b8e6a08169950c336a20 Mon Sep 17 00:00:00 2001 From: Ryota Egashira Date: Mon, 24 Feb 2025 13:38:37 -0800 Subject: [PATCH 18/59] fixing dataset --- .../src/datahub/ingestion/source/vertexai.py | 79 +++++++++++++++---- 1 file changed, 62 insertions(+), 17 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py index 35e639086a87b9..37d4c846169573 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py +++ b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py @@ -9,8 +9,9 @@ AutoMLTabularTrainingJob, AutoMLTextTrainingJob, AutoMLVideoTrainingJob, - Endpoint, + Endpoint, TabularDataset, TextDataset, ) +from google.cloud.aiplatform.datasets import _Dataset from google.cloud.aiplatform.models import Model, VersionInfo from google.cloud.aiplatform.training_jobs import _TrainingJob from pydantic.fields import Field @@ -32,8 +33,9 @@ AuditStampClass, DataProcessInstanceInputClass, DataProcessInstancePropertiesClass, - TimeStampClass, + TimeStampClass, DatasetPropertiesClass, ) +from datahub.metadata.com.linkedin.pegasus2avro.dataset import DatasetProperties from datahub.metadata.schema_classes import ( MLModelDeploymentPropertiesClass, MLModelGroupPropertiesClass, @@ -142,11 +144,11 @@ def _get_ml_model_workunits(self) -> Iterable[MetadataWorkUnit]: # create work unit for Training Job (if Model has reference to Training Job) if self._validate_training_job(model): logger.info( - f"generating TrainingJob work unit for model: 
{model_version.model_display_name}") + f"Generating TrainingJob work unit for model: {model_version.model_display_name}") yield from self._get_data_process_properties_workunit(model.training_job) # create work unit for Model (= Model Version in VertexAI) - logger.info(f"generating MLProperties work unit for model (name: {model.display_name} id:{model.name})") + logger.info(f"Generating work unit for model (name: {model.display_name} id:{model.name})") yield from self._get_ml_model_endpoint_workunit(model=model, model_version=model_version) @@ -161,21 +163,21 @@ def _get_training_job_workunit(self) -> Iterable[MetadataWorkUnit]: """ logger.info("Fetching a list of CustomJobs from VertexAI server") yield from self._get_data_process_workunit(self.client.CustomJob.list()) - logger.info("fetching a list of CustomTrainingJobs from VertexAI server") + logger.info("Fetching a list of CustomTrainingJobs from VertexAI server") yield from self._get_data_process_workunit(self.client.CustomTrainingJob.list()) - logger.info("fetching a list of CustomContainerTrainingJob from VertexAI server") + logger.info("Fetching a list of CustomContainerTrainingJobs from VertexAI server") yield from self._get_data_process_workunit(self.client.CustomContainerTrainingJob.list()) - logger.info("fetching a list of CustomPythonPackageTrainingJob from VertexAI server") + logger.info("Fetching a list of CustomPythonPackageTrainingJob from VertexAI server") yield from self._get_data_process_workunit(self.client.CustomPythonPackageTrainingJob.list()) - logger.info("fetching a list of AutoMLTabularTrainingJobs from VertexAI server") + logger.info("Fetching a list of AutoMLTabularTrainingJobs from VertexAI server") yield from self._get_data_process_workunit(self.client.AutoMLTabularTrainingJob.list()) - logger.info("fetching a list of AutoMLTextTrainingJobs from VertexAI server") + logger.info("Fetching a list of AutoMLTextTrainingJobs from VertexAI server") yield from self._get_data_process_workunit(self.client.AutoMLTextTrainingJob.list()) - logger.info("fetching a list of AutoMLImageTrainingJobs from VertexAI server") + logger.info("Fetching a list of AutoMLImageTrainingJobs from VertexAI server") yield from self._get_data_process_workunit(self.client.AutoMLImageTrainingJob.list()) - logger.info("fetching a list of AutoMLVideoTrainingJobs from VertexAI server") + logger.info("Fetching a list of AutoMLVideoTrainingJobs from VertexAI server") yield from self._get_data_process_workunit(self.client.AutoMLVideoTrainingJob.list()) - logger.info("fetching a list of AutoMLForecastingTrainingJobs from VertexAI server") + logger.info("Fetching a list of AutoMLForecastingTrainingJobs from VertexAI server") yield from self._get_data_process_workunit(self.client.AutoMLForecastingTrainingJob.list()) def _get_data_process_workunit(self, jobs: List[_TrainingJob]) -> Iterable[MetadataWorkUnit]: @@ -270,6 +272,39 @@ def _get_job_output_workunit(self, job: _TrainingJob) -> Iterable[MetadataWorkUn f"a model (name:{model.display_name} id:{model_version_str})") yield from self._get_ml_model_endpoint_workunit(model, model_version, job_urn) + + def _search_dataset(self, dataset_id: str) -> Optional[DatasetProperties]: + for ds in self.client.datasets.TextDataset.list(): + if ds.name == dataset_id: + return ds + for ds in self.client.datasets.TabularDataset.list(): + if ds.name == dataset_id: + return ds + for ds in self.client.datasets.ImageDataset.list(): + if ds.name == dataset_id: + return ds + for ds in 
self.client.datasets.TimeSeriesDataset.list(): + if ds.name == dataset_id: + return ds + for ds in self.client.datasets.VideoDataset.list(): + if ds.name == dataset_id: + return ds + return None + + def _make_dataset_aspect(self, ds: _Dataset) -> Optional[DatasetPropertiesClass]: + aspect = DatasetPropertiesClass( + name=self._make_vertexai_name("dataset", ds.name), + created=TimeStampClass(time=int(ds.create_time.timestamp())), + description=f"Dataset: {ds.display_name} for training job", + customProperties={"displayName": ds.display_name, + "resourceName": ds.resource_name, + }, + qualifiedName=ds.resource_name + ) + return aspect + + + def _get_job_input_workunit(self, job: _TrainingJob) -> Iterable[MetadataWorkUnit]: """ Generate work units for the input data of a training job. @@ -285,9 +320,9 @@ def _get_job_input_workunit(self, job: _TrainingJob) -> Iterable[MetadataWorkUni logger.info(f"found that training job {job.display_name} used input dataset: {dataset_id}") if dataset_id: - yield self._get_data_process_input_workunit(job, dataset_id) + yield from self._get_data_process_input_workunit(job, dataset_id) - def _get_data_process_input_workunit(self, job: _TrainingJob, dataset_id: str) -> MetadataWorkUnit: + def _get_data_process_input_workunit(self, job: _TrainingJob, dataset_id: str) -> Iterable[MetadataWorkUnit]: """ This method creates a work unit for the input dataset of a training job. It constructs the URN for the input dataset and the training job, and then creates a DataProcessInstanceInputClass aspect @@ -302,15 +337,25 @@ def _get_data_process_input_workunit(self, job: _TrainingJob, dataset_id: str) - env=self.config.env, ) + dataset = self._search_dataset(dataset_id) + if dataset: + if isinstance(dataset, TabularDataset): + aspect = DatasetPropertiesClass( + name=dataset_name, + description=f"Input dataset for training job: {job.display_name}", + ) + aspect = self._make_dataset_aspect(dataset) + if aspect: + yield self._create_workunit(urn=dataset_urn, aspect=aspect) + # Create URN of Training Job job_id = self._make_vertexai_name(entity_type="job", entity_id=job.name) entityUrn = builder.make_data_process_instance_urn(job_id) - aspect = DataProcessInstanceInputClass( + dp_aspect = DataProcessInstanceInputClass( inputs=[dataset_urn] ) logger.info(f"generating input dataset {dataset_urn}") - - return self._create_workunit(urn=entityUrn, aspect=aspect) + yield self._create_workunit(urn=entityUrn, aspect=dp_aspect) def _get_ml_model_endpoint_workunit(self, model: Model, model_version: VersionInfo, training_job_urn: Optional[str] = None) -> Iterable[MetadataWorkUnit]: From 29ddcffbd8f64c84a70e0eb6865a2a56ce3f9b8c Mon Sep 17 00:00:00 2001 From: Ryota Egashira Date: Mon, 24 Feb 2025 13:42:16 -0800 Subject: [PATCH 19/59] adding comments for dataset --- .../src/datahub/ingestion/source/vertexai.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py index 37d4c846169573..e22d1cb61971ab 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py +++ b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py @@ -274,6 +274,11 @@ def _get_job_output_workunit(self, job: _TrainingJob) -> Iterable[MetadataWorkUn def _search_dataset(self, dataset_id: str) -> Optional[DatasetProperties]: + """ + Search for a dataset by its ID in Vertex AI. 
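+        The lookup compares the given ID against each dataset's resource ID (ds.name), not its display name.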
+ This method iterates through different types of datasets (Text, Tabular, Image, + TimeSeries, and Video) to find a dataset that matches the given dataset ID. + """ for ds in self.client.datasets.TextDataset.list(): if ds.name == dataset_id: return ds @@ -292,6 +297,9 @@ def _search_dataset(self, dataset_id: str) -> Optional[DatasetProperties]: return None def _make_dataset_aspect(self, ds: _Dataset) -> Optional[DatasetPropertiesClass]: + """ + Create a DatasetPropertiesClass aspect for a given Vertex AI dataset. + """ aspect = DatasetPropertiesClass( name=self._make_vertexai_name("dataset", ds.name), created=TimeStampClass(time=int(ds.create_time.timestamp())), From 437e7d2c4ed5713f09aacde851b47923ea3bebd1 Mon Sep 17 00:00:00 2001 From: Ryota Egashira Date: Mon, 24 Feb 2025 13:44:59 -0800 Subject: [PATCH 20/59] minor fix --- metadata-ingestion/src/datahub/ingestion/source/vertexai.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py index e22d1cb61971ab..ec9212ed0280f8 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py +++ b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py @@ -9,7 +9,8 @@ AutoMLTabularTrainingJob, AutoMLTextTrainingJob, AutoMLVideoTrainingJob, - Endpoint, TabularDataset, TextDataset, + Endpoint, + TabularDataset, ) from google.cloud.aiplatform.datasets import _Dataset from google.cloud.aiplatform.models import Model, VersionInfo @@ -33,7 +34,8 @@ AuditStampClass, DataProcessInstanceInputClass, DataProcessInstancePropertiesClass, - TimeStampClass, DatasetPropertiesClass, + DatasetPropertiesClass, + TimeStampClass, ) from datahub.metadata.com.linkedin.pegasus2avro.dataset import DatasetProperties from datahub.metadata.schema_classes import ( From a2a1f0a6fc1311921a196c4bbb6c4e534735b316 Mon Sep 17 00:00:00 2001 From: Ryota Egashira Date: Mon, 24 Feb 2025 14:50:36 -0800 Subject: [PATCH 21/59] adding vertex to dev requirements in setup.py --- metadata-ingestion/setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index edc8afdb220a8f..2a6f3fe013c5ac 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -675,6 +675,7 @@ "sac", "cassandra", "neo4j", + "vertexai", ] if plugin for dependency in plugins[plugin] From bf869da478302d2fe36be044a3fd0436b5442ccb Mon Sep 17 00:00:00 2001 From: Ryota Egashira Date: Mon, 24 Feb 2025 15:20:15 -0800 Subject: [PATCH 22/59] minor fix --- .../src/datahub/ingestion/source/vertexai.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py index ec9212ed0280f8..220ed457a132c3 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py +++ b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py @@ -275,7 +275,7 @@ def _get_job_output_workunit(self, job: _TrainingJob) -> Iterable[MetadataWorkUn yield from self._get_ml_model_endpoint_workunit(model, model_version, job_urn) - def _search_dataset(self, dataset_id: str) -> Optional[DatasetProperties]: + def _search_dataset(self, dataset_id: str) -> Optional[_Dataset]: """ Search for a dataset by its ID in Vertex AI. 
This method iterates through different types of datasets (Text, Tabular, Image, @@ -349,11 +349,6 @@ def _get_data_process_input_workunit(self, job: _TrainingJob, dataset_id: str) - dataset = self._search_dataset(dataset_id) if dataset: - if isinstance(dataset, TabularDataset): - aspect = DatasetPropertiesClass( - name=dataset_name, - description=f"Input dataset for training job: {job.display_name}", - ) aspect = self._make_dataset_aspect(dataset) if aspect: yield self._create_workunit(urn=dataset_urn, aspect=aspect) @@ -364,7 +359,7 @@ def _get_data_process_input_workunit(self, job: _TrainingJob, dataset_id: str) - dp_aspect = DataProcessInstanceInputClass( inputs=[dataset_urn] ) - logger.info(f"generating input dataset {dataset_urn}") + logger.info(f"generating input dataset {dataset_name}") yield self._create_workunit(urn=entityUrn, aspect=dp_aspect) def _get_ml_model_endpoint_workunit(self, model: Model, model_version: VersionInfo, From c1f24b790a37a6aaaf3fdeb628d38296ee291bfd Mon Sep 17 00:00:00 2001 From: Ryota Egashira Date: Mon, 24 Feb 2025 18:12:25 -0800 Subject: [PATCH 23/59] caching dataset list acquisitions --- .../src/datahub/ingestion/source/vertexai.py | 29 +++++++++---------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py index 220ed457a132c3..483d6e3749bb08 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py +++ b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py @@ -89,6 +89,7 @@ def __init__(self, ctx: PipelineContext, config: VertexAIConfig): aiplatform.init(project=config.project_id, location=config.region) self.client = aiplatform self.endpoints = None + self.datasets = None def get_report(self) -> SourceReport: return self.report @@ -281,21 +282,19 @@ def _search_dataset(self, dataset_id: str) -> Optional[_Dataset]: This method iterates through different types of datasets (Text, Tabular, Image, TimeSeries, and Video) to find a dataset that matches the given dataset ID. 
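        On the first call, the dataset lists are fetched once and cached in self.datasets; later lookups reuse the cache.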
""" - for ds in self.client.datasets.TextDataset.list(): - if ds.name == dataset_id: - return ds - for ds in self.client.datasets.TabularDataset.list(): - if ds.name == dataset_id: - return ds - for ds in self.client.datasets.ImageDataset.list(): - if ds.name == dataset_id: - return ds - for ds in self.client.datasets.TimeSeriesDataset.list(): - if ds.name == dataset_id: - return ds - for ds in self.client.datasets.VideoDataset.list(): - if ds.name == dataset_id: - return ds + + if self.datasets is None: + self.datasets = [] + self.datasets.extend(self.client.datasets.TextDataset.list()) + self.datasets.extend(self.client.datasets.TabularDataset.list()) + self.datasets.extend(self.client.datasets.ImageDataset.list()) + self.datasets.extend(self.client.datasets.TimeSeriesDataset.list()) + self.datasets.extend(self.client.datasets.VideoDataset.list()) + + for dataset in self.datasets: + if dataset.name == dataset_id: + return dataset + return None def _make_dataset_aspect(self, ds: _Dataset) -> Optional[DatasetPropertiesClass]: From 453688d294265fa67b6ec2257bc9da29f9e0fef3 Mon Sep 17 00:00:00 2001 From: Ryota Egashira Date: Tue, 25 Feb 2025 10:14:14 -0800 Subject: [PATCH 24/59] review comment on dataset --- .../src/datahub/ingestion/source/vertexai.py | 44 +++++++++---------- 1 file changed, 20 insertions(+), 24 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py index 483d6e3749bb08..465af025852d09 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py +++ b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py @@ -1,5 +1,6 @@ import logging import time +from collections import defaultdict from typing import Iterable, List, Optional, TypeVar from google.cloud import aiplatform @@ -60,18 +61,7 @@ class VertexAIConfig(EnvConfigMixin): description=("Bucket URI used in your project"), ) - vertexai_url: Optional[str] = Field( - default="https://console.cloud.google.com/vertex-ai", - description=("VertexUI URI"), - ) - - model_name_separator: str = Field( - default="_", - description="A string which separates model name from its version (e.g. 
model_1 or model-1)", - ) - - -@platform_name("vertexai") +@platform_name("Vertex AI") @config_class(VertexAIConfig) @support_status(SupportStatus.TESTING) @capability( @@ -80,7 +70,8 @@ class VertexAIConfig(EnvConfigMixin): ) @capability(SourceCapability.TAGS, "Extract tags for VertexAI Registered Model Stages") class VertexAISource(Source): - platform = "vertexai" + platform: str = "vertexai" + model_name_separator = "_" def __init__(self, ctx: PipelineContext, config: VertexAIConfig): super().__init__(ctx) @@ -91,6 +82,7 @@ def __init__(self, ctx: PipelineContext, config: VertexAIConfig): self.endpoints = None self.datasets = None + def get_report(self) -> SourceReport: return self.report @@ -124,7 +116,8 @@ def _validate_training_job(self, model: Model) -> bool: return False try: - # when model has ref to training job, but field is not accessible, it is not valid + # when model has ref to training job, but field is sometimes not accessible and RunTImeError thrown when accessed + # if RunTimeError is not thrown, it is valid and proceed name = job.name logger.debug((f"can fetch training job name: {name} for model: (name:{model.display_name} id:{model.name})")) return True @@ -284,18 +277,21 @@ def _search_dataset(self, dataset_id: str) -> Optional[_Dataset]: """ if self.datasets is None: - self.datasets = [] - self.datasets.extend(self.client.datasets.TextDataset.list()) - self.datasets.extend(self.client.datasets.TabularDataset.list()) - self.datasets.extend(self.client.datasets.ImageDataset.list()) - self.datasets.extend(self.client.datasets.TimeSeriesDataset.list()) - self.datasets.extend(self.client.datasets.VideoDataset.list()) + self.datasets = defaultdict(lambda: None) + for ds in self.client.datasets.TextDataset.list(): + self.datasets[ds.name] = ds + for ds in self.client.datasets.TabularDataset.list(): + self.datasets[ds.name] = ds + for ds in self.client.datasets.TabularDataset.list(): + self.datasets[ds.name] = ds + for ds in self.client.datasets.TimeSeriesDataset.list(): + self.datasets[ds.name] = ds + for ds in self.client.datasets.VideoDataset.list(): + self.datasets[ds.name] = ds + + return self.datasets[dataset_id] - for dataset in self.datasets: - if dataset.name == dataset_id: - return dataset - return None def _make_dataset_aspect(self, ds: _Dataset) -> Optional[DatasetPropertiesClass]: """ From be03cf59980e4240770ca904b88a4ef2847486cf Mon Sep 17 00:00:00 2001 From: Ryota Egashira Date: Tue, 25 Feb 2025 10:21:27 -0800 Subject: [PATCH 25/59] minor chagne --- metadata-ingestion/src/datahub/ingestion/source/vertexai.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py index 465af025852d09..bf2e0fb406dbdd 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py +++ b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py @@ -282,7 +282,7 @@ def _search_dataset(self, dataset_id: str) -> Optional[_Dataset]: self.datasets[ds.name] = ds for ds in self.client.datasets.TabularDataset.list(): self.datasets[ds.name] = ds - for ds in self.client.datasets.TabularDataset.list(): + for ds in self.client.datasets.ImageDataset.list(): self.datasets[ds.name] = ds for ds in self.client.datasets.TimeSeriesDataset.list(): self.datasets[ds.name] = ds From 8c76435694791f3f50526128474e4f78c4f1089f Mon Sep 17 00:00:00 2001 From: Ryota Egashira Date: Tue, 25 Feb 2025 12:05:59 -0800 Subject: [PATCH 26/59] change name --- 
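Note (not part of the patch): a minimal illustrative sketch of how the console URL for a model version is assembled from the re-added vertexai_url config field together with region and project_id. The project, region, and IDs below are placeholder sample values taken from the docstrings, not real configuration.

    # Illustrative only; mirrors _make_model_version_external_url with sample values.
    vertexai_url = "https://console.cloud.google.com/vertex-ai"   # default of the vertexai_url config field
    region, project_id = "us-west2", "acryl-poc"                  # sample values from the docstring examples
    model_id, version_id = "812468724182286336", "1"              # sample Vertex AI model/version IDs

    external_url = (
        f"{vertexai_url}/models/locations/{region}/models/{model_id}"
        f"/versions/{version_id}?project={project_id}"
    )
    # -> https://console.cloud.google.com/vertex-ai/models/locations/us-west2/models/812468724182286336/versions/1?project=acryl-poc
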
.../src/datahub/ingestion/source/vertexai.py | 21 +++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py index bf2e0fb406dbdd..fa3b444267a33c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py +++ b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py @@ -13,6 +13,7 @@ Endpoint, TabularDataset, ) +from google.cloud.aiplatform.base import VertexAiResourceNoun from google.cloud.aiplatform.datasets import _Dataset from google.cloud.aiplatform.models import Model, VersionInfo from google.cloud.aiplatform.training_jobs import _TrainingJob @@ -36,8 +37,9 @@ DataProcessInstanceInputClass, DataProcessInstancePropertiesClass, DatasetPropertiesClass, - TimeStampClass, + TimeStampClass, SubTypesClass, ) +from datahub.metadata.com.linkedin.pegasus2avro.common import SubTypes from datahub.metadata.com.linkedin.pegasus2avro.dataset import DatasetProperties from datahub.metadata.schema_classes import ( MLModelDeploymentPropertiesClass, @@ -61,6 +63,11 @@ class VertexAIConfig(EnvConfigMixin): description=("Bucket URI used in your project"), ) + vertexai_url: Optional[str] = Field( + default="https://console.cloud.google.com/vertex-ai", + description=("VertexUI URI"), + ) + @platform_name("Vertex AI") @config_class(VertexAIConfig) @support_status(SupportStatus.TESTING) @@ -226,9 +233,11 @@ def _get_data_process_properties_workunit(self, job: _TrainingJob) -> MetadataWo actor=created_actor, ), externalUrl=self._make_job_external_url(job), - customProperties={"displayName": job.display_name} + customProperties={"displayName": job.display_name}, + ) + logging.info(f"Generating data process instance for training job: {entityUrn}") return self._create_workunit(urn=entityUrn, aspect=aspect) @@ -269,7 +278,7 @@ def _get_job_output_workunit(self, job: _TrainingJob) -> Iterable[MetadataWorkUn yield from self._get_ml_model_endpoint_workunit(model, model_version, job_urn) - def _search_dataset(self, dataset_id: str) -> Optional[_Dataset]: + def _search_dataset(self, dataset_id: str) -> Optional[VertexAiResourceNoun]: """ Search for a dataset by its ID in Vertex AI. 
This method iterates through different types of datasets (Text, Tabular, Image, @@ -394,7 +403,7 @@ def _get_ml_model_properties_workunit(self, model: Model, model_version: Version ml_model_group_urn = self._make_ml_model_group_urn(model) model_name = self._make_vertexai_name(entity_type="model", entity_id=model.name) - model_version_name = f"{model_name}{self.config.model_name_separator}{model_version.version_id}" + model_version_name = f"{model_name}{self.model_name_separator}{model_version.version_id}" ml_model_urn = self._make_ml_model_urn(model_version, model_name=model_name) @@ -402,7 +411,7 @@ def _get_ml_model_properties_workunit(self, model: Model, model_version: Version name=model_version_name, description=model_version.version_description, customProperties={"displayName": model_version.model_display_name + - self.config.model_name_separator + model_version.version_id, + self.model_name_separator + model_version.version_id, "resourceName": model.resource_name}, created=TimeStampClass(model_version.version_create_time.second), lastModified=TimeStampClass(model_version.version_update_time.second), @@ -436,7 +445,7 @@ def _search_endpoint(self, model: Model) -> Optional[Endpoint]: def _make_ml_model_urn(self, model_version: VersionInfo, model_name:str) -> str: urn = builder.make_ml_model_urn( platform=self.platform, - model_name=f"{model_name}{self.config.model_name_separator}{model_version.version_id}", + model_name=f"{model_name}{self.model_name_separator}{model_version.version_id}", env=self.config.env, ) return urn From 33a19c9ca8d0b96cc6dd582888b1faf055c279b0 Mon Sep 17 00:00:00 2001 From: Ryota Egashira Date: Tue, 25 Feb 2025 12:43:57 -0800 Subject: [PATCH 27/59] lint fix --- metadata-ingestion/src/datahub/ingestion/source/vertexai.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py index fa3b444267a33c..ba485edbd9249e 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py +++ b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py @@ -11,7 +11,6 @@ AutoMLTextTrainingJob, AutoMLVideoTrainingJob, Endpoint, - TabularDataset, ) from google.cloud.aiplatform.base import VertexAiResourceNoun from google.cloud.aiplatform.datasets import _Dataset @@ -37,10 +36,8 @@ DataProcessInstanceInputClass, DataProcessInstancePropertiesClass, DatasetPropertiesClass, - TimeStampClass, SubTypesClass, + TimeStampClass, ) -from datahub.metadata.com.linkedin.pegasus2avro.common import SubTypes -from datahub.metadata.com.linkedin.pegasus2avro.dataset import DatasetProperties from datahub.metadata.schema_classes import ( MLModelDeploymentPropertiesClass, MLModelGroupPropertiesClass, From b76ec256b64f0f53c337d57e870ca31522b3c39b Mon Sep 17 00:00:00 2001 From: Ryota Egashira Date: Tue, 25 Feb 2025 13:36:55 -0800 Subject: [PATCH 28/59] Refactor code to use auto_workunit --- .../src/datahub/ingestion/source/vertexai.py | 338 +++++++++++------- 1 file changed, 206 insertions(+), 132 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py index ba485edbd9249e..717e03aff36898 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py +++ b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py @@ -13,7 +13,6 @@ Endpoint, ) from google.cloud.aiplatform.base import VertexAiResourceNoun -from google.cloud.aiplatform.datasets import _Dataset 
from google.cloud.aiplatform.models import Model, VersionInfo from google.cloud.aiplatform.training_jobs import _TrainingJob from pydantic.fields import Field @@ -30,12 +29,14 @@ support_status, ) from datahub.ingestion.api.source import Source, SourceCapability, SourceReport +from datahub.ingestion.api.source_helpers import auto_workunit from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.metadata._schema_classes import ( AuditStampClass, DataProcessInstanceInputClass, DataProcessInstancePropertiesClass, DatasetPropertiesClass, + SubTypesClass, TimeStampClass, ) from datahub.metadata.schema_classes import ( @@ -43,13 +44,13 @@ MLModelGroupPropertiesClass, MLModelPropertiesClass, VersionTagClass, - _Aspect, ) T = TypeVar("T") logger = logging.getLogger(__name__) + class VertexAIConfig(EnvConfigMixin): project_id: str = Field(description=("Project ID in Google Cloud Platform")) region: str = Field( @@ -65,6 +66,7 @@ class VertexAIConfig(EnvConfigMixin): description=("VertexUI URI"), ) + @platform_name("Vertex AI") @config_class(VertexAIConfig) @support_status(SupportStatus.TESTING) @@ -86,7 +88,6 @@ def __init__(self, ctx: PipelineContext, config: VertexAIConfig): self.endpoints = None self.datasets = None - def get_report(self) -> SourceReport: return self.report @@ -102,15 +103,6 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: yield from self._get_training_job_workunit() # TODO Fetch Experiments and Experiment Runs - def _create_workunit(self, urn: str, aspect: _Aspect) -> MetadataWorkUnit: - """ - Utility to create an MCP workunit. - """ - return MetadataChangeProposalWrapper( - entityUrn=urn, - aspect=aspect, - ).as_workunit() - def _validate_training_job(self, model: Model) -> bool: """ Validate Model Has Valid Training Job @@ -123,10 +115,16 @@ def _validate_training_job(self, model: Model) -> bool: # when model has ref to training job, but field is sometimes not accessible and RunTImeError thrown when accessed # if RunTimeError is not thrown, it is valid and proceed name = job.name - logger.debug((f"can fetch training job name: {name} for model: (name:{model.display_name} id:{model.name})")) + logger.debug( + ( + f"can fetch training job name: {name} for model: (name:{model.display_name} id:{model.name})" + ) + ) return True except RuntimeError: - logger.debug(f"cannot fetch training job name, not valid for model (name:{model.display_name} id:{model.name})") + logger.debug( + f"cannot fetch training job name, not valid for model (name:{model.display_name} id:{model.name})" + ) return False @@ -137,20 +135,25 @@ def _get_ml_model_workunits(self) -> Iterable[MetadataWorkUnit]: registered_models = self.client.Model.list() for model in registered_models: # create work unit for Model Group (= Model in VertexAI) - yield self._get_ml_group_workunit(model) + yield from self._get_ml_group_workunit(model) model_versions = model.versioning_registry.list_versions() for model_version in model_versions: - # create work unit for Training Job (if Model has reference to Training Job) if self._validate_training_job(model): logger.info( - f"Generating TrainingJob work unit for model: {model_version.model_display_name}") - yield from self._get_data_process_properties_workunit(model.training_job) + f"Generating TrainingJob work unit for model: {model_version.model_display_name}" + ) + yield from self._get_data_process_properties_workunit( + model.training_job + ) # create work unit for Model (= Model Version in VertexAI) - logger.info(f"Generating work unit for 
model (name: {model.display_name} id:{model.name})") - yield from self._get_ml_model_endpoint_workunit(model=model, model_version=model_version) - + logger.info( + f"Generating work unit for model (name: {model.display_name} id:{model.name})" + ) + yield from self._get_ml_model_endpoint_workunit( + model=model, model_version=model_version + ) def _get_training_job_workunit(self) -> Iterable[MetadataWorkUnit]: """ @@ -165,56 +168,81 @@ def _get_training_job_workunit(self) -> Iterable[MetadataWorkUnit]: yield from self._get_data_process_workunit(self.client.CustomJob.list()) logger.info("Fetching a list of CustomTrainingJobs from VertexAI server") yield from self._get_data_process_workunit(self.client.CustomTrainingJob.list()) - logger.info("Fetching a list of CustomContainerTrainingJobs from VertexAI server") - yield from self._get_data_process_workunit(self.client.CustomContainerTrainingJob.list()) - logger.info("Fetching a list of CustomPythonPackageTrainingJob from VertexAI server") - yield from self._get_data_process_workunit(self.client.CustomPythonPackageTrainingJob.list()) + logger.info( + "Fetching a list of CustomContainerTrainingJobs from VertexAI server" + ) + yield from self._get_data_process_workunit( + self.client.CustomContainerTrainingJob.list() + ) + logger.info( + "Fetching a list of CustomPythonPackageTrainingJob from VertexAI server" + ) + yield from self._get_data_process_workunit( + self.client.CustomPythonPackageTrainingJob.list() + ) logger.info("Fetching a list of AutoMLTabularTrainingJobs from VertexAI server") - yield from self._get_data_process_workunit(self.client.AutoMLTabularTrainingJob.list()) + yield from self._get_data_process_workunit( + self.client.AutoMLTabularTrainingJob.list() + ) logger.info("Fetching a list of AutoMLTextTrainingJobs from VertexAI server") - yield from self._get_data_process_workunit(self.client.AutoMLTextTrainingJob.list()) + yield from self._get_data_process_workunit( + self.client.AutoMLTextTrainingJob.list() + ) logger.info("Fetching a list of AutoMLImageTrainingJobs from VertexAI server") - yield from self._get_data_process_workunit(self.client.AutoMLImageTrainingJob.list()) + yield from self._get_data_process_workunit( + self.client.AutoMLImageTrainingJob.list() + ) logger.info("Fetching a list of AutoMLVideoTrainingJobs from VertexAI server") - yield from self._get_data_process_workunit(self.client.AutoMLVideoTrainingJob.list()) - logger.info("Fetching a list of AutoMLForecastingTrainingJobs from VertexAI server") - yield from self._get_data_process_workunit(self.client.AutoMLForecastingTrainingJob.list()) + yield from self._get_data_process_workunit( + self.client.AutoMLVideoTrainingJob.list() + ) + logger.info( + "Fetching a list of AutoMLForecastingTrainingJobs from VertexAI server" + ) + yield from self._get_data_process_workunit( + self.client.AutoMLForecastingTrainingJob.list() + ) - def _get_data_process_workunit(self, jobs: List[_TrainingJob]) -> Iterable[MetadataWorkUnit]: + def _get_data_process_workunit( + self, jobs: List[_TrainingJob] + ) -> Iterable[MetadataWorkUnit]: for job in jobs: - yield self._get_data_process_properties_workunit(job) + yield from self._get_data_process_properties_workunit(job) yield from self._get_job_output_workunit(job) yield from self._get_job_input_workunit(job) def _get_ml_group_workunit( self, model: Model, - ) -> MetadataWorkUnit: + ) -> Iterable[MetadataWorkUnit]: """ Generate an MLModelGroup work unit for a VertexAI Model. 
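        The group is emitted once per registered model and referenced from each of its model versions via the groups field.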
""" ml_model_group_urn = self._make_ml_model_group_urn(model) - ml_model_group_properties = MLModelGroupPropertiesClass( - name=self._make_vertexai_name("model_group", model.name), - description=model.description, - createdAt=int(model.create_time.timestamp()), - customProperties={"displayName": model.display_name} - ) - wu = self._create_workunit( - urn=ml_model_group_urn, - aspect=ml_model_group_properties, + + mcp = MetadataChangeProposalWrapper( + entityUrn=ml_model_group_urn, + aspect=MLModelGroupPropertiesClass( + name=self._make_vertexai_name("model_group", model.name), + description=model.description, + createdAt=int(model.create_time.timestamp()), + customProperties={"displayName": model.display_name}, + ), ) - return wu + + yield from auto_workunit([mcp]) def _make_ml_model_group_urn(self, model: Model) -> str: urn = builder.make_ml_model_group_urn( platform=self.platform, - group_name=self._make_vertexai_name("model",model.name), + group_name=self._make_vertexai_name("model", model.name), env=self.config.env, ) return urn - def _get_data_process_properties_workunit(self, job: _TrainingJob) -> MetadataWorkUnit: + def _get_data_process_properties_workunit( + self, job: _TrainingJob + ) -> Iterable[MetadataWorkUnit]: """ Generate a work unit for VertexAI Training Job """ @@ -223,7 +251,10 @@ def _get_data_process_properties_workunit(self, job: _TrainingJob) -> MetadataWo job_id = self._make_vertexai_name(entity_type="job", entity_id=job.name) entityUrn = builder.make_data_process_instance_urn(job_id) - aspect = DataProcessInstancePropertiesClass( + + prop_mcp = MetadataChangeProposalWrapper( + entityUrn=entityUrn, + aspect=DataProcessInstancePropertiesClass( name=job_id, created=AuditStampClass( time=created_time, @@ -231,27 +262,32 @@ def _get_data_process_properties_workunit(self, job: _TrainingJob) -> MetadataWo ), externalUrl=self._make_job_external_url(job), customProperties={"displayName": job.display_name}, + ), + ) - ) - + jobtype = job.__class__.__name__ + subtype_mcp = MetadataChangeProposalWrapper( + entityUrn=entityUrn, aspect=SubTypesClass(typeNames=[f"{jobtype}"]) + ) - logging.info(f"Generating data process instance for training job: {entityUrn}") - return self._create_workunit(urn=entityUrn, aspect=aspect) + yield from auto_workunit([prop_mcp, subtype_mcp]) def _is_automl_job(self, job: _TrainingJob) -> bool: - return ((isinstance(job, AutoMLTabularTrainingJob) or - isinstance(job, AutoMLTextTrainingJob) or - isinstance(job, AutoMLImageTrainingJob) or - isinstance(job, AutoMLVideoTrainingJob)) or - isinstance(job, AutoMLForecastingTrainingJob)) - - def _search_model_version(self, model:Model, version_id:str) -> Optional[VersionInfo]: + return ( + isinstance(job, AutoMLTabularTrainingJob) + or isinstance(job, AutoMLTextTrainingJob) + or isinstance(job, AutoMLImageTrainingJob) + or isinstance(job, AutoMLVideoTrainingJob) + ) or isinstance(job, AutoMLForecastingTrainingJob) + + def _search_model_version( + self, model: Model, version_id: str + ) -> Optional[VersionInfo]: for version in model.versioning_registry.list_versions(): if version.version_id == version_id: return version return None - def _get_job_output_workunit(self, job: _TrainingJob) -> Iterable[MetadataWorkUnit]: """ This method creates work units that link the training job to the model version @@ -261,8 +297,11 @@ def _get_job_output_workunit(self, job: _TrainingJob) -> Iterable[MetadataWorkUn """ job_conf = job.to_dict() - if "modelToUpload" in job_conf and "name" in job_conf["modelToUpload"] and 
job_conf["modelToUpload"]["name"]: - + if ( + "modelToUpload" in job_conf + and "name" in job_conf["modelToUpload"] + and job_conf["modelToUpload"]["name"] + ): model_version_str = job_conf["modelToUpload"]["versionId"] job_urn = self._make_job_urn(job) @@ -271,9 +310,11 @@ def _get_job_output_workunit(self, job: _TrainingJob) -> Iterable[MetadataWorkUn if model and model_version: logger.info( f"found that training job: {job.display_name} generated " - f"a model (name:{model.display_name} id:{model_version_str})") - yield from self._get_ml_model_endpoint_workunit(model, model_version, job_urn) - + f"a model (name:{model.display_name} id:{model_version_str})" + ) + yield from self._get_ml_model_endpoint_workunit( + model, model_version, job_urn + ) def _search_dataset(self, dataset_id: str) -> Optional[VertexAiResourceNoun]: """ @@ -297,9 +338,9 @@ def _search_dataset(self, dataset_id: str) -> Optional[VertexAiResourceNoun]: return self.datasets[dataset_id] - - - def _make_dataset_aspect(self, ds: _Dataset) -> Optional[DatasetPropertiesClass]: + def _get_dataset_workunit( + self, urn: str, ds: VertexAiResourceNoun + ) -> Iterable[MetadataWorkUnit]: """ Create a DatasetPropertiesClass aspect for a given Vertex AI dataset. """ @@ -307,33 +348,42 @@ def _make_dataset_aspect(self, ds: _Dataset) -> Optional[DatasetPropertiesClass] name=self._make_vertexai_name("dataset", ds.name), created=TimeStampClass(time=int(ds.create_time.timestamp())), description=f"Dataset: {ds.display_name} for training job", - customProperties={"displayName": ds.display_name, - "resourceName": ds.resource_name, - }, - qualifiedName=ds.resource_name + customProperties={ + "displayName": ds.display_name, + "resourceName": ds.resource_name, + }, + qualifiedName=ds.resource_name, ) - return aspect + mcp = MetadataChangeProposalWrapper(entityUrn=urn, aspect=aspect) + yield from auto_workunit([mcp]) def _get_job_input_workunit(self, job: _TrainingJob) -> Iterable[MetadataWorkUnit]: """ - Generate work units for the input data of a training job. - This method checks if the training job is an AutoML job and if it has an input dataset - configuration. If so, it creates a work unit for the input dataset. - """ + Generate work units for the input data of a training job. + This method checks if the training job is an AutoML job and if it has an input dataset + configuration. If so, it creates a work unit for the input dataset. + """ if self._is_automl_job(job): job_conf = job.to_dict() - if "inputDataConfig" in job_conf and "datasetId" in job_conf["inputDataConfig"]: + if ( + "inputDataConfig" in job_conf + and "datasetId" in job_conf["inputDataConfig"] + ): # Create URN of Input Dataset for Training Job dataset_id = job_conf["inputDataConfig"]["datasetId"] - logger.info(f"found that training job {job.display_name} used input dataset: {dataset_id}") + logger.info( + f"found that training job {job.display_name} used input dataset: {dataset_id}" + ) if dataset_id: yield from self._get_data_process_input_workunit(job, dataset_id) - def _get_data_process_input_workunit(self, job: _TrainingJob, dataset_id: str) -> Iterable[MetadataWorkUnit]: + def _get_data_process_input_workunit( + self, job: _TrainingJob, dataset_id: str + ) -> Iterable[MetadataWorkUnit]: """ This method creates a work unit for the input dataset of a training job. 
It constructs the URN for the input dataset and the training job, and then creates a DataProcessInstanceInputClass aspect @@ -341,7 +391,9 @@ def _get_data_process_input_workunit(self, job: _TrainingJob, dataset_id: str) - """ # Create URN of Input Dataset for Training Job - dataset_name = self._make_vertexai_name(entity_type="dataset", entity_id=dataset_id) + dataset_name = self._make_vertexai_name( + entity_type="dataset", entity_id=dataset_id + ) dataset_urn = builder.make_dataset_urn( platform=self.platform, name=dataset_name, @@ -350,47 +402,61 @@ def _get_data_process_input_workunit(self, job: _TrainingJob, dataset_id: str) - dataset = self._search_dataset(dataset_id) if dataset: - aspect = self._make_dataset_aspect(dataset) - if aspect: - yield self._create_workunit(urn=dataset_urn, aspect=aspect) + yield from self._get_dataset_workunit(urn=dataset_urn, ds=dataset) # Create URN of Training Job job_id = self._make_vertexai_name(entity_type="job", entity_id=job.name) - entityUrn = builder.make_data_process_instance_urn(job_id) - dp_aspect = DataProcessInstanceInputClass( - inputs=[dataset_urn] + mcp = MetadataChangeProposalWrapper( + entityUrn=builder.make_data_process_instance_urn(job_id), + aspect=DataProcessInstanceInputClass(inputs=[dataset_urn]), ) logger.info(f"generating input dataset {dataset_name}") - yield self._create_workunit(urn=entityUrn, aspect=dp_aspect) - - def _get_ml_model_endpoint_workunit(self, model: Model, model_version: VersionInfo, - training_job_urn: Optional[str] = None) -> Iterable[MetadataWorkUnit]: + yield from auto_workunit([mcp]) + def _get_ml_model_endpoint_workunit( + self, + model: Model, + model_version: VersionInfo, + training_job_urn: Optional[str] = None, + ) -> Iterable[MetadataWorkUnit]: + """ + Generate an MLModel and Endpoint work unit for an VertexAI Model Version. """ - Generate an MLModel and Endpoint work unit for an VertexAI Model Version. 
- """ endpoint: Optional[Endpoint] = self._search_endpoint(model) endpoint_urn = None if endpoint: endpoint_urn = builder.make_ml_model_deployment_urn( - platform=self.platform, - deployment_name=self._make_vertexai_name("endpoint", endpoint.display_name), - env=self.config.env - ) if endpoint else None - ml_deployment_properties = MLModelDeploymentPropertiesClass( + platform=self.platform, + deployment_name=self._make_vertexai_name( + "endpoint", endpoint.display_name + ), + env=self.config.env, + ) + deployment_aspect = MLModelDeploymentPropertiesClass( description=model.description, createdAt=int(endpoint.create_time.timestamp()), version=VersionTagClass(versionTag=str(model_version.version_id)), - customProperties={"displayName": endpoint.display_name} + customProperties={"displayName": endpoint.display_name}, + ) + + mcp = MetadataChangeProposalWrapper( + entityUrn=endpoint_urn, aspect=deployment_aspect ) - yield self._create_workunit(urn=endpoint_urn, aspect=ml_deployment_properties) + yield from auto_workunit([mcp]) - yield self._get_ml_model_properties_workunit(model, model_version,training_job_urn, endpoint_urn) + yield from self._get_ml_model_properties_workunit( + model, model_version, training_job_urn, endpoint_urn + ) - def _get_ml_model_properties_workunit(self, model: Model, model_version: VersionInfo, - training_job_urn:Optional[str] = None, endpoint_urn:Optional[str] = None) -> MetadataWorkUnit: + def _get_ml_model_properties_workunit( + self, + model: Model, + model_version: VersionInfo, + training_job_urn: Optional[str] = None, + endpoint_urn: Optional[str] = None, + ) -> Iterable[MetadataWorkUnit]: """ Generate an MLModel workunit for an VertexAI Model Version. Every Model Version is a DataHub MLModel entity associated with an MLModelGroup @@ -400,29 +466,38 @@ def _get_ml_model_properties_workunit(self, model: Model, model_version: Version ml_model_group_urn = self._make_ml_model_group_urn(model) model_name = self._make_vertexai_name(entity_type="model", entity_id=model.name) - model_version_name = f"{model_name}{self.model_name_separator}{model_version.version_id}" + model_version_name = ( + f"{model_name}{self.model_name_separator}{model_version.version_id}" + ) ml_model_urn = self._make_ml_model_urn(model_version, model_name=model_name) - ml_model_properties = MLModelPropertiesClass( name=model_version_name, description=model_version.version_description, - customProperties={"displayName": model_version.model_display_name + - self.model_name_separator + model_version.version_id, - "resourceName": model.resource_name}, + customProperties={ + "displayName": model_version.model_display_name + + self.model_name_separator + + model_version.version_id, + "resourceName": model.resource_name, + }, created=TimeStampClass(model_version.version_create_time.second), lastModified=TimeStampClass(model_version.version_update_time.second), version=VersionTagClass(versionTag=str(model_version.version_id)), - groups=[ml_model_group_urn], # link model version to model group - trainingJobs=[training_job_urn] if training_job_urn else None, # link to training job - deployments=[endpoint_urn] if endpoint_urn else [], # link to model registry and endpoint + groups=[ml_model_group_urn], # link model version to model group + trainingJobs=[training_job_urn] + if training_job_urn + else None, # link to training job + deployments=[endpoint_urn] + if endpoint_urn + else [], # link to model registry and endpoint externalUrl=self._make_model_version_external_url(model), ) # logging.info(f"created 
model version {ml_model_properties.name} associated with group {ml_model_group_urn}") - return self._create_workunit(urn=ml_model_urn, aspect=ml_model_properties) - - + mcp = MetadataChangeProposalWrapper( + entityUrn=ml_model_urn, aspect=ml_model_properties + ) + yield from auto_workunit([mcp]) def _search_endpoint(self, model: Model) -> Optional[Endpoint]: """ @@ -438,8 +513,7 @@ def _search_endpoint(self, model: Model) -> Optional[Endpoint]: return None - - def _make_ml_model_urn(self, model_version: VersionInfo, model_name:str) -> str: + def _make_ml_model_urn(self, model_version: VersionInfo, model_name: str) -> str: urn = builder.make_ml_model_urn( platform=self.platform, model_name=f"{model_name}{self.model_name_separator}{model_version.version_id}", @@ -449,19 +523,14 @@ def _make_ml_model_urn(self, model_version: VersionInfo, model_name:str) -> str: def _make_job_urn(self, job: _TrainingJob) -> str: job_id = self._make_vertexai_name(entity_type="job", entity_id=job.name) - urn = builder.make_data_process_instance_urn( - dataProcessInstanceId=job_id - ) + urn = builder.make_data_process_instance_urn(dataProcessInstanceId=job_id) return urn - - def _make_vertexai_name(self, - entity_type:str, - entity_id:str, - separator:str=".") -> str: + def _make_vertexai_name( + self, entity_type: str, entity_id: str, separator: str = "." + ) -> str: return f"{self.config.project_id}{separator}{entity_type}{separator}{entity_id}" - def _make_job_external_url(self, job: _TrainingJob): """ Model external URL in Vertex AI @@ -469,8 +538,10 @@ def _make_job_external_url(self, job: _TrainingJob): https://console.cloud.google.com/vertex-ai/training/training-pipelines?project=acryl-poc&trainingPipelineId=5401695018589093888 """ entity_type = "training" - external_url = (f"{self.config.vertexai_url}/{entity_type}/training-pipelines?trainingPipelineId={job.name}" - f"?project={self.config.project_id}") + external_url = ( + f"{self.config.vertexai_url}/{entity_type}/training-pipelines?trainingPipelineId={job.name}" + f"?project={self.config.project_id}" + ) return external_url def _make_model_external_url(self, model: Model): @@ -480,8 +551,10 @@ def _make_model_external_url(self, model: Model): https://console.cloud.google.com/vertex-ai/models/locations/us-west2/models/812468724182286336?project=acryl-poc """ entity_type = "models" - external_url = (f"{self.config.vertexai_url}/{entity_type}/locations/{self.config.region}/{entity_type}/{model.name}" - f"?project={self.config.project_id}") + external_url = ( + f"{self.config.vertexai_url}/{entity_type}/locations/{self.config.region}/{entity_type}/{model.name}" + f"?project={self.config.project_id}" + ) return external_url def _make_model_version_external_url(self, model: Model): @@ -491,8 +564,9 @@ def _make_model_version_external_url(self, model: Model): https://console.cloud.google.com/vertex-ai/models/locations/us-west2/models/812468724182286336/versions/1?project=acryl-poc """ entity_type = "models" - external_url = (f"{self.config.vertexai_url}/{entity_type}/locations/{self.config.region}/{entity_type}/{model.name}" - f"/versions/{model.version_id}" - f"?project={self.config.project_id}") + external_url = ( + f"{self.config.vertexai_url}/{entity_type}/locations/{self.config.region}/{entity_type}/{model.name}" + f"/versions/{model.version_id}" + f"?project={self.config.project_id}" + ) return external_url - From c7d51653d76f6552796253b467379d83c8bc69bd Mon Sep 17 00:00:00 2001 From: Ryota Egashira Date: Tue, 25 Feb 2025 14:33:07 -0800 Subject: [PATCH 
29/59] flattern make_vertexai_name --- .../src/datahub/ingestion/source/vertexai.py | 79 +++++++++++++------ 1 file changed, 55 insertions(+), 24 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py index 717e03aff36898..7529d9b0068e67 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py +++ b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py @@ -141,7 +141,7 @@ def _get_ml_model_workunits(self) -> Iterable[MetadataWorkUnit]: # create work unit for Training Job (if Model has reference to Training Job) if self._validate_training_job(model): logger.info( - f"Generating TrainingJob work unit for model: {model_version.model_display_name}" + f"Ingesting a training job for a model: {model_version.model_display_name}" ) yield from self._get_data_process_properties_workunit( model.training_job @@ -149,7 +149,7 @@ def _get_ml_model_workunits(self) -> Iterable[MetadataWorkUnit]: # create work unit for Model (= Model Version in VertexAI) logger.info( - f"Generating work unit for model (name: {model.display_name} id:{model.name})" + f"Ingesting a model (name: {model.display_name} id:{model.name})" ) yield from self._get_ml_model_endpoint_workunit( model=model, model_version=model_version @@ -223,7 +223,7 @@ def _get_ml_group_workunit( mcp = MetadataChangeProposalWrapper( entityUrn=ml_model_group_urn, aspect=MLModelGroupPropertiesClass( - name=self._make_vertexai_name("model_group", model.name), + name=self._make_vertexai_model_group_name(model.name), description=model.description, createdAt=int(model.create_time.timestamp()), customProperties={"displayName": model.display_name}, @@ -235,7 +235,7 @@ def _get_ml_group_workunit( def _make_ml_model_group_urn(self, model: Model) -> str: urn = builder.make_ml_model_group_urn( platform=self.platform, - group_name=self._make_vertexai_name("model", model.name), + group_name=self._make_vertexai_model_name(model.name), env=self.config.env, ) return urn @@ -249,7 +249,7 @@ def _get_data_process_properties_workunit( created_time = int(job.start_time.timestamp()) or int(time.time() * 1000) created_actor = f"urn:li:platformResource:{self.platform}" - job_id = self._make_vertexai_name(entity_type="job", entity_id=job.name) + job_id = self._make_vertexai_job_name(entity_id=job.name) entityUrn = builder.make_data_process_instance_urn(job_id) prop_mcp = MetadataChangeProposalWrapper( @@ -344,20 +344,31 @@ def _get_dataset_workunit( """ Create a DatasetPropertiesClass aspect for a given Vertex AI dataset. 
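This patch replaces the generic `_make_vertexai_name` with per-entity helpers, so every Vertex AI entity is named `<project_id>.<entity_type>.<entity_id>` before it is handed to a URN builder. A hedged sketch of that convention follows; the project and entity ids are made up, and the URN shapes in the comments are approximate.

```python
# Sketch of the flattened naming helpers and how their output feeds URN builders.
import datahub.emitter.mce_builder as builder

PROJECT_ID = "example-project"  # placeholder, not a real GCP project


def make_vertexai_model_group_name(entity_id: str, separator: str = ".") -> str:
    return f"{PROJECT_ID}{separator}model_group{separator}{entity_id}"


def make_vertexai_job_name(entity_id: str, separator: str = ".") -> str:
    return f"{PROJECT_ID}{separator}job{separator}{entity_id}"


group_urn = builder.make_ml_model_group_urn(
    platform="vertexai",
    group_name=make_vertexai_model_group_name("1234567890"),
    env="PROD",
)
job_urn = builder.make_data_process_instance_urn(
    dataProcessInstanceId=make_vertexai_job_name("9876543210")
)

# Roughly: urn:li:mlModelGroup:(urn:li:dataPlatform:vertexai,example-project.model_group.1234567890,PROD)
print(group_urn)
# Roughly: urn:li:dataProcessInstance:example-project.job.9876543210
print(job_urn)
```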
""" - aspect = DatasetPropertiesClass( - name=self._make_vertexai_name("dataset", ds.name), - created=TimeStampClass(time=int(ds.create_time.timestamp())), - description=f"Dataset: {ds.display_name} for training job", - customProperties={ - "displayName": ds.display_name, - "resourceName": ds.resource_name, - }, - qualifiedName=ds.resource_name, + + mcps = [] + mcps.append( + MetadataChangeProposalWrapper( + entityUrn=urn, + aspect=DatasetPropertiesClass( + name=self._make_vertexai_dataset_name(ds.name), + created=TimeStampClass(time=int(ds.create_time.timestamp())), + description=f"Dataset: {ds.display_name} for training job", + customProperties={ + "displayName": ds.display_name, + "resourceName": ds.resource_name, + }, + qualifiedName=ds.resource_name, + ), + ) ) - mcp = MetadataChangeProposalWrapper(entityUrn=urn, aspect=aspect) + mcps.append( + MetadataChangeProposalWrapper( + entityUrn=urn, aspect=SubTypesClass(typeNames=["Dataset"]) + ) + ) - yield from auto_workunit([mcp]) + yield from auto_workunit(mcps) def _get_job_input_workunit(self, job: _TrainingJob) -> Iterable[MetadataWorkUnit]: """ @@ -391,9 +402,7 @@ def _get_data_process_input_workunit( """ # Create URN of Input Dataset for Training Job - dataset_name = self._make_vertexai_name( - entity_type="dataset", entity_id=dataset_id - ) + dataset_name = self._make_vertexai_dataset_name(entity_id=dataset_id) dataset_urn = builder.make_dataset_urn( platform=self.platform, name=dataset_name, @@ -405,7 +414,7 @@ def _get_data_process_input_workunit( yield from self._get_dataset_workunit(urn=dataset_urn, ds=dataset) # Create URN of Training Job - job_id = self._make_vertexai_name(entity_type="job", entity_id=job.name) + job_id = self._make_vertexai_job_name(entity_id=job.name) mcp = MetadataChangeProposalWrapper( entityUrn=builder.make_data_process_instance_urn(job_id), aspect=DataProcessInstanceInputClass(inputs=[dataset_urn]), @@ -429,8 +438,8 @@ def _get_ml_model_endpoint_workunit( if endpoint: endpoint_urn = builder.make_ml_model_deployment_urn( platform=self.platform, - deployment_name=self._make_vertexai_name( - "endpoint", endpoint.display_name + deployment_name=self._make_vertexai_endpoint_name( + entity_id=endpoint.display_name ), env=self.config.env, ) @@ -465,7 +474,7 @@ def _get_ml_model_properties_workunit( logging.info(f"starting model work unit for model {model.name}") ml_model_group_urn = self._make_ml_model_group_urn(model) - model_name = self._make_vertexai_name(entity_type="model", entity_id=model.name) + model_name = self._make_vertexai_model_name(entity_id=model.name) model_version_name = ( f"{model_name}{self.model_name_separator}{model_version.version_id}" ) @@ -522,7 +531,7 @@ def _make_ml_model_urn(self, model_version: VersionInfo, model_name: str) -> str return urn def _make_job_urn(self, job: _TrainingJob) -> str: - job_id = self._make_vertexai_name(entity_type="job", entity_id=job.name) + job_id = self._make_vertexai_job_name(entity_id=job.name) urn = builder.make_data_process_instance_urn(dataProcessInstanceId=job_id) return urn @@ -531,6 +540,28 @@ def _make_vertexai_name( ) -> str: return f"{self.config.project_id}{separator}{entity_type}{separator}{entity_id}" + def _make_vertexai_model_group_name( + self, entity_id: str, separator: str = "." 
+ ) -> str: + entity_type = "model_group" + return f"{self.config.project_id}{separator}{entity_type}{separator}{entity_id}" + + def _make_vertexai_endpoint_name(self, entity_id: str, separator: str = ".") -> str: + entity_type = "endpoint" + return f"{self.config.project_id}{separator}{entity_type}{separator}{entity_id}" + + def _make_vertexai_model_name(self, entity_id: str, separator: str = ".") -> str: + entity_type = "model" + return f"{self.config.project_id}{separator}{entity_type}{separator}{entity_id}" + + def _make_vertexai_dataset_name(self, entity_id: str, separator: str = ".") -> str: + entity_type = "dataset" + return f"{self.config.project_id}{separator}{entity_type}{separator}{entity_id}" + + def _make_vertexai_job_name(self, entity_id: str, separator: str = ".") -> str: + entity_type = "job" + return f"{self.config.project_id}{separator}{entity_type}{separator}{entity_id}" + def _make_job_external_url(self, job: _TrainingJob): """ Model external URL in Vertex AI From 482c159f85fff4f51011540590d99f6a70474241 Mon Sep 17 00:00:00 2001 From: Ryota Egashira Date: Tue, 25 Feb 2025 15:44:54 -0800 Subject: [PATCH 30/59] lint type error is fixed --- .../src/datahub/ingestion/source/vertexai.py | 92 ++++++++++--------- 1 file changed, 48 insertions(+), 44 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py index 7529d9b0068e67..696ff44bbd5e52 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py +++ b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py @@ -1,6 +1,5 @@ import logging import time -from collections import defaultdict from typing import Iterable, List, Optional, TypeVar from google.cloud import aiplatform @@ -85,8 +84,8 @@ def __init__(self, ctx: PipelineContext, config: VertexAIConfig): self.report = SourceReport() aiplatform.init(project=config.project_id, location=config.region) self.client = aiplatform - self.endpoints = None - self.datasets = None + self.endpoints: Optional[List[Endpoint]] = None + self.datasets: Optional[dict] = None def get_report(self) -> SourceReport: return self.report @@ -100,7 +99,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: # Fetch Models, Model Versions a from Model Registry yield from self._get_ml_model_workunits() # Fetch Training Jobs - yield from self._get_training_job_workunit() + yield from self._get_training_jobs_workunit() # TODO Fetch Experiments and Experiment Runs def _validate_training_job(self, model: Model) -> bool: @@ -143,9 +142,10 @@ def _get_ml_model_workunits(self) -> Iterable[MetadataWorkUnit]: logger.info( f"Ingesting a training job for a model: {model_version.model_display_name}" ) - yield from self._get_data_process_properties_workunit( - model.training_job - ) + if model.training_job: + yield from self._get_data_process_properties_workunit( + model.training_job + ) # create work unit for Model (= Model Version in VertexAI) logger.info( @@ -155,7 +155,7 @@ def _get_ml_model_workunits(self) -> Iterable[MetadataWorkUnit]: model=model, model_version=model_version ) - def _get_training_job_workunit(self) -> Iterable[MetadataWorkUnit]: + def _get_training_jobs_workunit(self) -> Iterable[MetadataWorkUnit]: """ Fetches training jobs from Vertex AI and generates corresponding work units. 
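The per-job-type loops that follow are written out explicitly. As a design note, the same traversal could be table-driven; the sketch below assumes each class exposes the `.list()` call used in this file and is illustrative only.

```python
# Table-driven variant of the per-job-type listing loops; purely illustrative.
from typing import Any, Iterable

from google.cloud import aiplatform

TRAINING_JOB_TYPES = [
    aiplatform.CustomJob,
    aiplatform.CustomTrainingJob,
    aiplatform.CustomContainerTrainingJob,
    aiplatform.CustomPythonPackageTrainingJob,
    aiplatform.AutoMLTabularTrainingJob,
    aiplatform.AutoMLTextTrainingJob,
    aiplatform.AutoMLImageTrainingJob,
    aiplatform.AutoMLVideoTrainingJob,
    aiplatform.AutoMLForecastingTrainingJob,
]


def iter_training_jobs() -> Iterable[Any]:
    # Same call shape as self.client.<JobClass>.list() in the loops below.
    for job_cls in TRAINING_JOB_TYPES:
        yield from job_cls.list()
```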
This method retrieves various types of training jobs from Vertex AI, including @@ -165,48 +165,51 @@ def _get_training_job_workunit(self) -> Iterable[MetadataWorkUnit]: about the job, its inputs, and its outputs. """ logger.info("Fetching a list of CustomJobs from VertexAI server") - yield from self._get_data_process_workunit(self.client.CustomJob.list()) + for job in self.client.CustomJob.list(): + yield from self._get_training_job_workunit(job) + logger.info("Fetching a list of CustomTrainingJobs from VertexAI server") - yield from self._get_data_process_workunit(self.client.CustomTrainingJob.list()) + for job in self.client.CustomTrainingJob.list(): + yield from self._get_training_job_workunit(job) + logger.info( "Fetching a list of CustomContainerTrainingJobs from VertexAI server" ) - yield from self._get_data_process_workunit( - self.client.CustomContainerTrainingJob.list() - ) + for job in self.client.CustomContainerTrainingJob.list(): + yield from self._get_training_job_workunit(job) + logger.info( "Fetching a list of CustomPythonPackageTrainingJob from VertexAI server" ) - yield from self._get_data_process_workunit( - self.client.CustomPythonPackageTrainingJob.list() - ) + for job in self.client.CustomPythonPackageTrainingJob.list(): + yield from self._get_training_job_workunit(job) + logger.info("Fetching a list of AutoMLTabularTrainingJobs from VertexAI server") - yield from self._get_data_process_workunit( - self.client.AutoMLTabularTrainingJob.list() - ) + for job in self.client.AutoMLTabularTrainingJob.list(): + yield from self._get_training_job_workunit(job) + logger.info("Fetching a list of AutoMLTextTrainingJobs from VertexAI server") - yield from self._get_data_process_workunit( - self.client.AutoMLTextTrainingJob.list() - ) + for job in self.client.AutoMLTextTrainingJob.list(): + yield from self._get_training_job_workunit(job) + logger.info("Fetching a list of AutoMLImageTrainingJobs from VertexAI server") - yield from self._get_data_process_workunit( - self.client.AutoMLImageTrainingJob.list() - ) + for job in self.client.AutoMLImageTrainingJob.list(): + yield from self._get_training_job_workunit(job) + logger.info("Fetching a list of AutoMLVideoTrainingJobs from VertexAI server") - yield from self._get_data_process_workunit( - self.client.AutoMLVideoTrainingJob.list() - ) + for job in self.client.AutoMLVideoTrainingJob.list(): + yield from self._get_training_job_workunit(job) + logger.info( "Fetching a list of AutoMLForecastingTrainingJobs from VertexAI server" ) - yield from self._get_data_process_workunit( - self.client.AutoMLForecastingTrainingJob.list() - ) + for job in self.client.AutoMLForecastingTrainingJob.list(): + yield from self._get_training_job_workunit(job) - def _get_data_process_workunit( - self, jobs: List[_TrainingJob] + def _get_training_job_workunit( + self, job: VertexAiResourceNoun ) -> Iterable[MetadataWorkUnit]: - for job in jobs: + if isinstance(job, _TrainingJob): yield from self._get_data_process_properties_workunit(job) yield from self._get_job_output_workunit(job) yield from self._get_job_input_workunit(job) @@ -241,12 +244,13 @@ def _make_ml_model_group_urn(self, model: Model) -> str: return urn def _get_data_process_properties_workunit( - self, job: _TrainingJob + self, job: VertexAiResourceNoun ) -> Iterable[MetadataWorkUnit]: """ Generate a work unit for VertexAI Training Job """ - created_time = int(job.start_time.timestamp()) or int(time.time() * 1000) + + created_time = int(job.create_time.timestamp()) or int(time.time() * 1000) 
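A note on the `created_time` expression just above: `datetime.timestamp()` yields epoch seconds while the `time.time() * 1000` fallback yields milliseconds, and DataHub audit stamps are typically epoch milliseconds. The helper below keeps both branches in milliseconds; it is offered as a sketch, not part of the patch.

```python
# Hedged helper: primary value and fallback are both epoch milliseconds.
import time
from datetime import datetime, timezone
from typing import Optional


def to_epoch_millis(dt: Optional[datetime]) -> int:
    if dt is None:
        return int(time.time() * 1000)
    return int(dt.timestamp() * 1000)


if __name__ == "__main__":
    print(to_epoch_millis(datetime(2025, 2, 25, tzinfo=timezone.utc)))
    print(to_epoch_millis(None))
```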
created_actor = f"urn:li:platformResource:{self.platform}" job_id = self._make_vertexai_job_name(entity_id=job.name) @@ -324,7 +328,7 @@ def _search_dataset(self, dataset_id: str) -> Optional[VertexAiResourceNoun]: """ if self.datasets is None: - self.datasets = defaultdict(lambda: None) + self.datasets = dict() for ds in self.client.datasets.TextDataset.list(): self.datasets[ds.name] = ds for ds in self.client.datasets.TabularDataset.list(): @@ -336,7 +340,7 @@ def _search_dataset(self, dataset_id: str) -> Optional[VertexAiResourceNoun]: for ds in self.client.datasets.VideoDataset.list(): self.datasets[ds.name] = ds - return self.datasets[dataset_id] + return self.datasets[dataset_id] if dataset_id in self.datasets else None def _get_dataset_workunit( self, urn: str, ds: VertexAiResourceNoun @@ -562,40 +566,40 @@ def _make_vertexai_job_name(self, entity_id: str, separator: str = ".") -> str: entity_type = "job" return f"{self.config.project_id}{separator}{entity_type}{separator}{entity_id}" - def _make_job_external_url(self, job: _TrainingJob): + def _make_job_external_url(self, job: VertexAiResourceNoun) -> str: """ Model external URL in Vertex AI Sample URLs: https://console.cloud.google.com/vertex-ai/training/training-pipelines?project=acryl-poc&trainingPipelineId=5401695018589093888 """ entity_type = "training" - external_url = ( + external_url: str = ( f"{self.config.vertexai_url}/{entity_type}/training-pipelines?trainingPipelineId={job.name}" f"?project={self.config.project_id}" ) return external_url - def _make_model_external_url(self, model: Model): + def _make_model_external_url(self, model: Model) -> str: """ Model external URL in Vertex AI Sample URL: https://console.cloud.google.com/vertex-ai/models/locations/us-west2/models/812468724182286336?project=acryl-poc """ entity_type = "models" - external_url = ( + external_url: str = ( f"{self.config.vertexai_url}/{entity_type}/locations/{self.config.region}/{entity_type}/{model.name}" f"?project={self.config.project_id}" ) return external_url - def _make_model_version_external_url(self, model: Model): + def _make_model_version_external_url(self, model: Model) -> str: """ Model Version external URL in Vertex AI Sample URL: https://console.cloud.google.com/vertex-ai/models/locations/us-west2/models/812468724182286336/versions/1?project=acryl-poc """ entity_type = "models" - external_url = ( + external_url: str = ( f"{self.config.vertexai_url}/{entity_type}/locations/{self.config.region}/{entity_type}/{model.name}" f"/versions/{model.version_id}" f"?project={self.config.project_id}" From 1032630c12161464204449660d664de7d84e527b Mon Sep 17 00:00:00 2001 From: Ryota Egashira Date: Wed, 26 Feb 2025 10:18:11 -0800 Subject: [PATCH 31/59] adding credentail config --- .../docs/sources/vertexai/vertexai_pre.md | 47 +++++++++++++++++ .../docs/sources/vertexai/vertexai_recipe.yml | 7 +++ .../ingestion/source/common/configs.py | 50 ++++++++++++++++++ .../src/datahub/ingestion/source/vertexai.py | 51 +++++++++++++------ 4 files changed, 140 insertions(+), 15 deletions(-) create mode 100644 metadata-ingestion/docs/sources/vertexai/vertexai_pre.md create mode 100644 metadata-ingestion/src/datahub/ingestion/source/common/configs.py diff --git a/metadata-ingestion/docs/sources/vertexai/vertexai_pre.md b/metadata-ingestion/docs/sources/vertexai/vertexai_pre.md new file mode 100644 index 00000000000000..98f176728be60f --- /dev/null +++ b/metadata-ingestion/docs/sources/vertexai/vertexai_pre.md @@ -0,0 +1,47 @@ + + +#### Credential to access to GCP +1. 
Follow the section on credentials to access Vertex AI [GCP docs](https://cloud.google.com/docs/authentication/provide-credentials-adc#how-to). + +#### Create a service account in the Extractor Project + +1. Setup a ServiceAccount as per [VertexAI docs](https://cloud.google.com/iam/docs/creating-managing-service-accounts#iam-service-accounts-create-console) + and assign the previously created role to this service account. +2. Download a service account JSON keyfile. + Example credential file: + +```json +{ + "type": "service_account", + "project_id": "project-id-1234567", + "private_key_id": "d0121d0000882411234e11166c6aaa23ed5d74e0", + "private_key": "-----BEGIN PRIVATE KEY-----\nMIIyourkey\n-----END PRIVATE KEY-----", + "client_email": "test@suppproject-id-1234567.iam.gserviceaccount.com", + "client_id": "113545814931671546333", + "auth_uri": "https://accounts.google.com/o/oauth2/auth", + "token_uri": "https://oauth2.googleapis.com/token", + "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs", + "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/test%suppproject-id-1234567.iam.gserviceaccount.com" +} +``` + +3. To provide credentials to the source, you can either: + + Set an environment variable: + + ```sh + $ export GOOGLE_APPLICATION_CREDENTIALS="/path/to/keyfile.json" + ``` + + _or_ + + Set credential config in your source based on the credential json file. For example: + + ```yml + credential: + project_id: project-id-1234567 + private_key_id: "d0121d0000882411234e11166c6aaa23ed5d74e0" + private_key: "-----BEGIN PRIVATE KEY-----\nMIIyourkey\n-----END PRIVATE KEY-----\n" + client_email: "test@suppproject-id-1234567.iam.gserviceaccount.com" + client_id: "123456678890" + ``` diff --git a/metadata-ingestion/docs/sources/vertexai/vertexai_recipe.yml b/metadata-ingestion/docs/sources/vertexai/vertexai_recipe.yml index 91d8e2a86fee48..1b38fecbea4d96 100644 --- a/metadata-ingestion/docs/sources/vertexai/vertexai_recipe.yml +++ b/metadata-ingestion/docs/sources/vertexai/vertexai_recipe.yml @@ -3,8 +3,15 @@ source: config: project_id: "acryl-poc" region: "us-west2" +# credential: +# project_id: "project_id" +# private_key: "private_key" +# private_key_id: "project_key_id" +# client_email: "client_email" +# client_id: "client_id" sink: type: "datahub-rest" config: server: "http://localhost:8080" + token: "eyJhbGciOiJIUzI1NiJ9.eyJhY3RvclR5cGUiOiJVU0VSIiwiYWN0b3JJZCI6ImRhdGFodWIiLCJ0eXBlIjoiUEVSU09OQUwiLCJ2ZXJzaW9uIjoiMiIsImp0aSI6IjU5OTBhZjRjLTFiOTEtNDg1Zi1iNDk3LTJmZjVlODA0ODY3YSIsInN1YiI6ImRhdGFodWIiLCJleHAiOjE3NDI4OTA1NDgsImlzcyI6ImRhdGFodWItbWV0YWRhdGEtc2VydmljZSJ9.yDZzG_Kes9GCYIJeLRNibzTryyzIXG_ve6o3VcDByMo" \ No newline at end of file diff --git a/metadata-ingestion/src/datahub/ingestion/source/common/configs.py b/metadata-ingestion/src/datahub/ingestion/source/common/configs.py new file mode 100644 index 00000000000000..6dc0884a4d0717 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/common/configs.py @@ -0,0 +1,50 @@ +import json +import tempfile +from typing import Any, Dict, Optional + +from pydantic import Field, root_validator + +from datahub.configuration import ConfigModel +from datahub.configuration.validate_multiline_string import pydantic_multiline_string + + +class GCPCredential(ConfigModel): + project_id: str = Field(description="Project id to set the credentials") + private_key_id: str = Field(description="Private key id") + private_key: str = Field( + description="Private key in a form of '-----BEGIN PRIVATE 
KEY-----\\nprivate-key\\n-----END PRIVATE KEY-----\\n'" + ) + client_email: str = Field(description="Client email") + client_id: str = Field(description="Client Id") + auth_uri: str = Field( + default="https://accounts.google.com/o/oauth2/auth", + description="Authentication uri", + ) + token_uri: str = Field( + default="https://oauth2.googleapis.com/token", description="Token uri" + ) + auth_provider_x509_cert_url: str = Field( + default="https://www.googleapis.com/oauth2/v1/certs", + description="Auth provider x509 certificate url", + ) + type: str = Field(default="service_account", description="Authentication type") + client_x509_cert_url: Optional[str] = Field( + default=None, + description="If not set it will be default to https://www.googleapis.com/robot/v1/metadata/x509/client_email", + ) + + _fix_private_key_newlines = pydantic_multiline_string("private_key") + + @root_validator(skip_on_failure=True) + def validate_config(cls, values: Dict[str, Any]) -> Dict[str, Any]: + if values.get("client_x509_cert_url") is None: + values["client_x509_cert_url"] = ( + f"https://www.googleapis.com/robot/v1/metadata/x509/{values['client_email']}" + ) + return values + + def create_credential_temp_file(self) -> str: + with tempfile.NamedTemporaryFile(delete=False) as fp: + cred_json = json.dumps(self.dict(), indent=4, separators=(",", ": ")) + fp.write(cred_json.encode()) + return fp.name diff --git a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py index 696ff44bbd5e52..30715c1bd6cb09 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py +++ b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py @@ -1,6 +1,7 @@ import logging +import os import time -from typing import Iterable, List, Optional, TypeVar +from typing import Any, Iterable, List, Optional, TypeVar from google.cloud import aiplatform from google.cloud.aiplatform import ( @@ -13,7 +14,7 @@ ) from google.cloud.aiplatform.base import VertexAiResourceNoun from google.cloud.aiplatform.models import Model, VersionInfo -from google.cloud.aiplatform.training_jobs import _TrainingJob +from pydantic import PrivateAttr from pydantic.fields import Field import datahub.emitter.mce_builder as builder @@ -30,6 +31,7 @@ from datahub.ingestion.api.source import Source, SourceCapability, SourceReport from datahub.ingestion.api.source_helpers import auto_workunit from datahub.ingestion.api.workunit import MetadataWorkUnit +from datahub.ingestion.source.common.configs import GCPCredential from datahub.metadata._schema_classes import ( AuditStampClass, DataProcessInstanceInputClass, @@ -51,6 +53,9 @@ class VertexAIConfig(EnvConfigMixin): + credential: Optional[GCPCredential] = Field( + default=None, description="GCP credential information" + ) project_id: str = Field(description=("Project ID in Google Cloud Platform")) region: str = Field( description=("Region of your project in Google Cloud Platform"), @@ -59,12 +64,23 @@ class VertexAIConfig(EnvConfigMixin): default=None, description=("Bucket URI used in your project"), ) - vertexai_url: Optional[str] = Field( default="https://console.cloud.google.com/vertex-ai", description=("VertexUI URI"), ) + _credentials_path: Optional[str] = PrivateAttr(None) + + def __init__(self, **data: Any): + super().__init__(**data) + + if self.credential: + self._credentials_path = self.credential.create_credential_temp_file() + logger.debug( + f"Creating temporary credential file at {self._credentials_path}" + ) + 
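The credential handling here boils down to writing the service-account fields to a temporary JSON file and pointing Application Default Credentials at it through `GOOGLE_APPLICATION_CREDENTIALS`, which the next statement sets. A standalone sketch of that mechanism, with placeholder values only:

```python
# Sketch: serialize service-account fields to a temp file and expose it to ADC.
import json
import os
import tempfile


def install_adc_credentials(credential: dict) -> str:
    with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as fp:
        json.dump(credential, fp, indent=4)
        path = fp.name
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = path
    return path


if __name__ == "__main__":
    # Placeholder fields only; a real keyfile also has private_key, client_email, etc.
    print(install_adc_credentials({"type": "service_account", "project_id": "example"}))
```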
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = self._credentials_path + @platform_name("Vertex AI") @config_class(VertexAIConfig) @@ -209,10 +225,9 @@ def _get_training_jobs_workunit(self) -> Iterable[MetadataWorkUnit]: def _get_training_job_workunit( self, job: VertexAiResourceNoun ) -> Iterable[MetadataWorkUnit]: - if isinstance(job, _TrainingJob): - yield from self._get_data_process_properties_workunit(job) - yield from self._get_job_output_workunit(job) - yield from self._get_job_input_workunit(job) + yield from self._get_data_process_properties_workunit(job) + yield from self._get_job_output_workunit(job) + yield from self._get_job_input_workunit(job) def _get_ml_group_workunit( self, @@ -276,7 +291,7 @@ def _get_data_process_properties_workunit( yield from auto_workunit([prop_mcp, subtype_mcp]) - def _is_automl_job(self, job: _TrainingJob) -> bool: + def _is_automl_job(self, job: VertexAiResourceNoun) -> bool: return ( isinstance(job, AutoMLTabularTrainingJob) or isinstance(job, AutoMLTextTrainingJob) @@ -292,7 +307,9 @@ def _search_model_version( return version return None - def _get_job_output_workunit(self, job: _TrainingJob) -> Iterable[MetadataWorkUnit]: + def _get_job_output_workunit( + self, job: VertexAiResourceNoun + ) -> Iterable[MetadataWorkUnit]: """ This method creates work units that link the training job to the model version that it produces. It checks if the job configuration contains a model to upload, @@ -313,7 +330,7 @@ def _get_job_output_workunit(self, job: _TrainingJob) -> Iterable[MetadataWorkUn model_version = self._search_model_version(model, model_version_str) if model and model_version: logger.info( - f"found that training job: {job.display_name} generated " + f" found a training job: {job.display_name} generated " f"a model (name:{model.display_name} id:{model_version_str})" ) yield from self._get_ml_model_endpoint_workunit( @@ -374,7 +391,9 @@ def _get_dataset_workunit( yield from auto_workunit(mcps) - def _get_job_input_workunit(self, job: _TrainingJob) -> Iterable[MetadataWorkUnit]: + def _get_job_input_workunit( + self, job: VertexAiResourceNoun + ) -> Iterable[MetadataWorkUnit]: """ Generate work units for the input data of a training job. This method checks if the training job is an AutoML job and if it has an input dataset @@ -390,14 +409,14 @@ def _get_job_input_workunit(self, job: _TrainingJob) -> Iterable[MetadataWorkUni # Create URN of Input Dataset for Training Job dataset_id = job_conf["inputDataConfig"]["datasetId"] logger.info( - f"found that training job {job.display_name} used input dataset: {dataset_id}" + f" found a training job: {job.display_name} used input dataset: {id: dataset_id}" ) if dataset_id: yield from self._get_data_process_input_workunit(job, dataset_id) def _get_data_process_input_workunit( - self, job: _TrainingJob, dataset_id: str + self, job: VertexAiResourceNoun, dataset_id: str ) -> Iterable[MetadataWorkUnit]: """ This method creates a work unit for the input dataset of a training job. 
It constructs the URN @@ -423,7 +442,9 @@ def _get_data_process_input_workunit( entityUrn=builder.make_data_process_instance_urn(job_id), aspect=DataProcessInstanceInputClass(inputs=[dataset_urn]), ) - logger.info(f"generating input dataset {dataset_name}") + logger.info( + f" found training job :{job.display_name} used input dataset : {dataset_name}" + ) yield from auto_workunit([mcp]) def _get_ml_model_endpoint_workunit( @@ -534,7 +555,7 @@ def _make_ml_model_urn(self, model_version: VersionInfo, model_name: str) -> str ) return urn - def _make_job_urn(self, job: _TrainingJob) -> str: + def _make_job_urn(self, job: VertexAiResourceNoun) -> str: job_id = self._make_vertexai_job_name(entity_id=job.name) urn = builder.make_data_process_instance_urn(dataProcessInstanceId=job_id) return urn From 616b76ade0b987c7426dc4554546c1d27575f0dc Mon Sep 17 00:00:00 2001 From: Ryota Egashira Date: Wed, 26 Feb 2025 10:32:35 -0800 Subject: [PATCH 32/59] refactor and changed GCP credential to pass project_id --- .../docs/sources/vertexai/vertexai_pre.md | 3 +- .../docs/sources/vertexai/vertexai_recipe.yml | 1 - .../ingestion/source/common/configs.py | 50 ----------------- .../src/datahub/ingestion/source/vertexai.py | 56 +++++++++++++++++-- 4 files changed, 53 insertions(+), 57 deletions(-) delete mode 100644 metadata-ingestion/src/datahub/ingestion/source/common/configs.py diff --git a/metadata-ingestion/docs/sources/vertexai/vertexai_pre.md b/metadata-ingestion/docs/sources/vertexai/vertexai_pre.md index 98f176728be60f..98047482299a49 100644 --- a/metadata-ingestion/docs/sources/vertexai/vertexai_pre.md +++ b/metadata-ingestion/docs/sources/vertexai/vertexai_pre.md @@ -5,7 +5,7 @@ #### Create a service account in the Extractor Project -1. Setup a ServiceAccount as per [VertexAI docs](https://cloud.google.com/iam/docs/creating-managing-service-accounts#iam-service-accounts-create-console) +1. Setup a ServiceAccount as per [GCP docs](https://cloud.google.com/iam/docs/creating-managing-service-accounts#iam-service-accounts-create-console) and assign the previously created role to this service account. 2. Download a service account JSON keyfile. 
Example credential file: @@ -39,7 +39,6 @@ ```yml credential: - project_id: project-id-1234567 private_key_id: "d0121d0000882411234e11166c6aaa23ed5d74e0" private_key: "-----BEGIN PRIVATE KEY-----\nMIIyourkey\n-----END PRIVATE KEY-----\n" client_email: "test@suppproject-id-1234567.iam.gserviceaccount.com" diff --git a/metadata-ingestion/docs/sources/vertexai/vertexai_recipe.yml b/metadata-ingestion/docs/sources/vertexai/vertexai_recipe.yml index 1b38fecbea4d96..6692a9a9b44420 100644 --- a/metadata-ingestion/docs/sources/vertexai/vertexai_recipe.yml +++ b/metadata-ingestion/docs/sources/vertexai/vertexai_recipe.yml @@ -4,7 +4,6 @@ source: project_id: "acryl-poc" region: "us-west2" # credential: -# project_id: "project_id" # private_key: "private_key" # private_key_id: "project_key_id" # client_email: "client_email" diff --git a/metadata-ingestion/src/datahub/ingestion/source/common/configs.py b/metadata-ingestion/src/datahub/ingestion/source/common/configs.py deleted file mode 100644 index 6dc0884a4d0717..00000000000000 --- a/metadata-ingestion/src/datahub/ingestion/source/common/configs.py +++ /dev/null @@ -1,50 +0,0 @@ -import json -import tempfile -from typing import Any, Dict, Optional - -from pydantic import Field, root_validator - -from datahub.configuration import ConfigModel -from datahub.configuration.validate_multiline_string import pydantic_multiline_string - - -class GCPCredential(ConfigModel): - project_id: str = Field(description="Project id to set the credentials") - private_key_id: str = Field(description="Private key id") - private_key: str = Field( - description="Private key in a form of '-----BEGIN PRIVATE KEY-----\\nprivate-key\\n-----END PRIVATE KEY-----\\n'" - ) - client_email: str = Field(description="Client email") - client_id: str = Field(description="Client Id") - auth_uri: str = Field( - default="https://accounts.google.com/o/oauth2/auth", - description="Authentication uri", - ) - token_uri: str = Field( - default="https://oauth2.googleapis.com/token", description="Token uri" - ) - auth_provider_x509_cert_url: str = Field( - default="https://www.googleapis.com/oauth2/v1/certs", - description="Auth provider x509 certificate url", - ) - type: str = Field(default="service_account", description="Authentication type") - client_x509_cert_url: Optional[str] = Field( - default=None, - description="If not set it will be default to https://www.googleapis.com/robot/v1/metadata/x509/client_email", - ) - - _fix_private_key_newlines = pydantic_multiline_string("private_key") - - @root_validator(skip_on_failure=True) - def validate_config(cls, values: Dict[str, Any]) -> Dict[str, Any]: - if values.get("client_x509_cert_url") is None: - values["client_x509_cert_url"] = ( - f"https://www.googleapis.com/robot/v1/metadata/x509/{values['client_email']}" - ) - return values - - def create_credential_temp_file(self) -> str: - with tempfile.NamedTemporaryFile(delete=False) as fp: - cred_json = json.dumps(self.dict(), indent=4, separators=(",", ": ")) - fp.write(cred_json.encode()) - return fp.name diff --git a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py index 30715c1bd6cb09..cebea400702a0a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py +++ b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py @@ -1,7 +1,9 @@ +import json import logging import os +import tempfile import time -from typing import Any, Iterable, List, Optional, TypeVar +from typing import Any, Iterable, List, 
Optional, TypeVar, Dict from google.cloud import aiplatform from google.cloud.aiplatform import ( @@ -14,11 +16,13 @@ ) from google.cloud.aiplatform.base import VertexAiResourceNoun from google.cloud.aiplatform.models import Model, VersionInfo -from pydantic import PrivateAttr +from pydantic import PrivateAttr, root_validator from pydantic.fields import Field import datahub.emitter.mce_builder as builder +from datahub.configuration import ConfigModel from datahub.configuration.source_common import EnvConfigMixin +from datahub.configuration.validate_multiline_string import pydantic_multiline_string from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.api.decorators import ( @@ -31,7 +35,6 @@ from datahub.ingestion.api.source import Source, SourceCapability, SourceReport from datahub.ingestion.api.source_helpers import auto_workunit from datahub.ingestion.api.workunit import MetadataWorkUnit -from datahub.ingestion.source.common.configs import GCPCredential from datahub.metadata._schema_classes import ( AuditStampClass, DataProcessInstanceInputClass, @@ -51,6 +54,51 @@ logger = logging.getLogger(__name__) +class GCPCredential(ConfigModel): + + private_key_id: str = Field(description="Private key id") + private_key: str = Field( + description="Private key in a form of '-----BEGIN PRIVATE KEY-----\\nprivate-key\\n-----END PRIVATE KEY-----\\n'" + ) + client_email: str = Field(description="Client email") + client_id: str = Field(description="Client Id") + auth_uri: str = Field( + default="https://accounts.google.com/o/oauth2/auth", + description="Authentication uri", + ) + token_uri: str = Field( + default="https://oauth2.googleapis.com/token", description="Token uri" + ) + auth_provider_x509_cert_url: str = Field( + default="https://www.googleapis.com/oauth2/v1/certs", + description="Auth provider x509 certificate url", + ) + type: str = Field(default="service_account", description="Authentication type") + client_x509_cert_url: Optional[str] = Field( + default=None, + description="If not set it will be default to https://www.googleapis.com/robot/v1/metadata/x509/client_email", + ) + + _fix_private_key_newlines = pydantic_multiline_string("private_key") + + @root_validator(skip_on_failure=True) + def validate_config(cls, values: Dict[str, Any]) -> Dict[str, Any]: + if values.get("client_x509_cert_url") is None: + values["client_x509_cert_url"] = ( + f"https://www.googleapis.com/robot/v1/metadata/x509/{values['client_email']}" + ) + return values + + def create_credential_temp_file(self, project_id:str) -> str: + + # Adding project_id from the top level config + configs = self.dict() + configs["project_id"] = project_id + with tempfile.NamedTemporaryFile(delete=False) as fp: + cred_json = json.dumps(configs, indent=4, separators=(",", ": ")) + fp.write(cred_json.encode()) + return fp.name + class VertexAIConfig(EnvConfigMixin): credential: Optional[GCPCredential] = Field( @@ -75,7 +123,7 @@ def __init__(self, **data: Any): super().__init__(**data) if self.credential: - self._credentials_path = self.credential.create_credential_temp_file() + self._credentials_path = self.credential.create_credential_temp_file(self.project_id) logger.debug( f"Creating temporary credential file at {self._credentials_path}" ) From 1dcfce12f58e7a239941f3e41d6e0d596eb4c70a Mon Sep 17 00:00:00 2001 From: Ryota Egashira Date: Wed, 26 Feb 2025 12:52:42 -0800 Subject: [PATCH 33/59] Adding more unit test case coverage, fixed lint and 
test case --- .../src/datahub/ingestion/source/vertexai.py | 74 +++--- .../tests/unit/test_vertexai_source.py | 229 +++++++++++------- 2 files changed, 185 insertions(+), 118 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py index cebea400702a0a..e82f25fc72371b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py +++ b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py @@ -3,7 +3,7 @@ import os import tempfile import time -from typing import Any, Iterable, List, Optional, TypeVar, Dict +from typing import Any, Dict, Iterable, List, Optional, TypeVar from google.cloud import aiplatform from google.cloud.aiplatform import ( @@ -35,18 +35,16 @@ from datahub.ingestion.api.source import Source, SourceCapability, SourceReport from datahub.ingestion.api.source_helpers import auto_workunit from datahub.ingestion.api.workunit import MetadataWorkUnit -from datahub.metadata._schema_classes import ( +from datahub.metadata.schema_classes import ( AuditStampClass, DataProcessInstanceInputClass, DataProcessInstancePropertiesClass, DatasetPropertiesClass, - SubTypesClass, - TimeStampClass, -) -from datahub.metadata.schema_classes import ( MLModelDeploymentPropertiesClass, MLModelGroupPropertiesClass, MLModelPropertiesClass, + SubTypesClass, + TimeStampClass, VersionTagClass, ) @@ -54,8 +52,8 @@ logger = logging.getLogger(__name__) -class GCPCredential(ConfigModel): +class GCPCredential(ConfigModel): private_key_id: str = Field(description="Private key id") private_key: str = Field( description="Private key in a form of '-----BEGIN PRIVATE KEY-----\\nprivate-key\\n-----END PRIVATE KEY-----\\n'" @@ -89,8 +87,7 @@ def validate_config(cls, values: Dict[str, Any]) -> Dict[str, Any]: ) return values - def create_credential_temp_file(self, project_id:str) -> str: - + def create_credential_temp_file(self, project_id: str) -> str: # Adding project_id from the top level config configs = self.dict() configs["project_id"] = project_id @@ -123,7 +120,9 @@ def __init__(self, **data: Any): super().__init__(**data) if self.credential: - self._credentials_path = self.credential.create_credential_temp_file(self.project_id) + self._credentials_path = self.credential.create_credential_temp_file( + self.project_id + ) logger.debug( f"Creating temporary credential file at {self._credentials_path}" ) @@ -291,7 +290,9 @@ def _get_ml_group_workunit( aspect=MLModelGroupPropertiesClass( name=self._make_vertexai_model_group_name(model.name), description=model.description, - createdAt=int(model.create_time.timestamp()), + createdAt=int(model.create_time.timestamp()) + if model.create_time + else None, customProperties={"displayName": model.display_name}, ), ) @@ -313,7 +314,11 @@ def _get_data_process_properties_workunit( Generate a work unit for VertexAI Training Job """ - created_time = int(job.create_time.timestamp()) or int(time.time() * 1000) + created_time = ( + int(job.create_time.timestamp()) + if job.create_time + else int(time.time() * 1000) + ) created_actor = f"urn:li:platformResource:{self.platform}" job_id = self._make_vertexai_job_name(entity_id=job.name) @@ -328,13 +333,15 @@ def _get_data_process_properties_workunit( actor=created_actor, ), externalUrl=self._make_job_external_url(job), - customProperties={"displayName": job.display_name}, + customProperties={ + "displayName": job.display_name, + "jobType": job.__class__.__name__, + }, ), ) - jobtype = job.__class__.__name__ subtype_mcp = 
MetadataChangeProposalWrapper( - entityUrn=entityUrn, aspect=SubTypesClass(typeNames=[f"{jobtype}"]) + entityUrn=entityUrn, aspect=SubTypesClass(typeNames=["Training Job"]) ) yield from auto_workunit([prop_mcp, subtype_mcp]) @@ -457,7 +464,7 @@ def _get_job_input_workunit( # Create URN of Input Dataset for Training Job dataset_id = job_conf["inputDataConfig"]["datasetId"] logger.info( - f" found a training job: {job.display_name} used input dataset: {id: dataset_id}" + f" found a training job: {job.display_name} used input dataset id: {dataset_id}" ) if dataset_id: @@ -480,20 +487,19 @@ def _get_data_process_input_workunit( env=self.config.env, ) - dataset = self._search_dataset(dataset_id) + dataset = self._search_dataset(dataset_id) if dataset_id else None if dataset: yield from self._get_dataset_workunit(urn=dataset_urn, ds=dataset) - - # Create URN of Training Job - job_id = self._make_vertexai_job_name(entity_id=job.name) - mcp = MetadataChangeProposalWrapper( - entityUrn=builder.make_data_process_instance_urn(job_id), - aspect=DataProcessInstanceInputClass(inputs=[dataset_urn]), - ) - logger.info( - f" found training job :{job.display_name} used input dataset : {dataset_name}" - ) - yield from auto_workunit([mcp]) + # Create URN of Training Job + job_id = self._make_vertexai_job_name(entity_id=job.name) + mcp = MetadataChangeProposalWrapper( + entityUrn=builder.make_data_process_instance_urn(job_id), + aspect=DataProcessInstanceInputClass(inputs=[dataset_urn]), + ) + logger.info( + f" found training job :{job.display_name} used input dataset : {dataset_name}" + ) + yield from auto_workunit([mcp]) def _get_ml_model_endpoint_workunit( self, @@ -562,8 +568,12 @@ def _get_ml_model_properties_workunit( + model_version.version_id, "resourceName": model.resource_name, }, - created=TimeStampClass(model_version.version_create_time.second), - lastModified=TimeStampClass(model_version.version_update_time.second), + created=TimeStampClass(model_version.version_create_time.second) + if model_version.version_create_time + else None, + lastModified=TimeStampClass(model_version.version_update_time.second) + if model_version.version_update_time + else None, version=VersionTagClass(versionTag=str(model_version.version_id)), groups=[ml_model_group_urn], # link model version to model group trainingJobs=[training_job_urn] @@ -631,7 +641,9 @@ def _make_vertexai_dataset_name(self, entity_id: str, separator: str = ".") -> s entity_type = "dataset" return f"{self.config.project_id}{separator}{entity_type}{separator}{entity_id}" - def _make_vertexai_job_name(self, entity_id: str, separator: str = ".") -> str: + def _make_vertexai_job_name( + self, entity_id: Optional[str], separator: str = "." 
+ ) -> str: entity_type = "job" return f"{self.config.project_id}{separator}{entity_type}{separator}{entity_id}" diff --git a/metadata-ingestion/tests/unit/test_vertexai_source.py b/metadata-ingestion/tests/unit/test_vertexai_source.py index e7ec577da037a8..ae97a637eaa7e2 100644 --- a/metadata-ingestion/tests/unit/test_vertexai_source.py +++ b/metadata-ingestion/tests/unit/test_vertexai_source.py @@ -2,20 +2,80 @@ from unittest.mock import MagicMock, patch import pytest +from google.cloud.aiplatform.base import VertexAiResourceNoun from google.cloud.aiplatform.models import Model, VersionInfo from google.protobuf import timestamp_pb2 from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.source.vertexai import VertexAIConfig, VertexAISource +from datahub.metadata.com.linkedin.pegasus2avro.ml.metadata import ( + MLModelGroupProperties, + MLModelProperties, +) +from datahub.metadata.schema_classes import ( + DataProcessInstanceInputClass, + DataProcessInstancePropertiesClass, + SubTypesClass, +) + + +@pytest.fixture +def mock_model() -> Model: + mock_model_1 = MagicMock(spec=Model) + mock_model_1.name = "mock_prediction_model_1" + mock_model_1.create_time = timestamp_pb2.Timestamp().GetCurrentTime() + mock_model_1.update_time = timestamp_pb2.Timestamp().GetCurrentTime() + mock_model_1.version_id = "1" + mock_model_1.display_name = "mock_prediction_model_1_display_name" + return mock_model_1 + + +@pytest.fixture +def mock_models() -> List[Model]: + mock_model_1 = MagicMock(spec=Model) + mock_model_1.name = "mock_prediction_model_1" + mock_model_1.create_time = timestamp_pb2.Timestamp().GetCurrentTime() + mock_model_1.update_time = timestamp_pb2.Timestamp().GetCurrentTime() + mock_model_1.version_id = "1" + mock_model_1.display_name = "mock_prediction_model_1_display_name" + mock_model_1.description = "mock_prediction_model_1_description" + + mock_model_2 = MagicMock(spec=Model) + mock_model_2.name = "mock_prediction_model_2" + + mock_model_2.create_time = timestamp_pb2.Timestamp().GetCurrentTime() + mock_model_2.update_time = timestamp_pb2.Timestamp().GetCurrentTime() + mock_model_2.version_id = "1" + mock_model_2.display_name = "mock_prediction_model_2_display_name" + mock_model_2.description = "mock_prediction_model_1_description" + + return [mock_model_1, mock_model_2] + + +@pytest.fixture +def mock_training_job() -> VertexAiResourceNoun: + mock_training_job = MagicMock(spec=VertexAiResourceNoun) + mock_training_job.name = "mock_training_job" + mock_training_job.create_time = timestamp_pb2.Timestamp().GetCurrentTime() + mock_training_job.update_time = timestamp_pb2.Timestamp().GetCurrentTime() + mock_training_job.display_name = "mock_training_job_display_name" + mock_training_job.description = "mock_training_job_description" + return mock_training_job @pytest.fixture def project_id() -> str: + """ + Replace with your GCP Project ID + """ return "acryl-poc" @pytest.fixture def region() -> str: + """ + Replace with your GCP region s + """ return "us-west2" @@ -31,7 +91,8 @@ def source(project_id: str, region: str) -> VertexAISource: def real_model(source: VertexAISource) -> Model: """ Fixture for the model that is actually registered in the Vertex AI Model Registry - use mock_models for local testing purpose + Use mock_model for local testing purpose, but this fixture is provided to use real model for debugging. + Replace model name with your real model when using this fixture. 
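The unit tests below drive the source entirely through mocks. A hedged sketch of the wiring is included here; the `PipelineContext` run id, the test name, and the empty `Model.list()` result are illustrative, and no GCP project or credentials are needed to run it.

```python
# Sketch of constructing the source for a unit test with Model.list mocked out.
from unittest.mock import MagicMock, patch

from datahub.ingestion.api.common import PipelineContext
from datahub.ingestion.source.vertexai import VertexAIConfig, VertexAISource


def build_test_source() -> VertexAISource:
    config = VertexAIConfig(project_id="acryl-poc", region="us-west2")
    return VertexAISource(
        ctx=PipelineContext(run_id="vertexai-unit-test"), config=config
    )


@patch("google.cloud.aiplatform.Model.list", return_value=[])
def test_no_models_yields_no_model_workunits(mock_list: MagicMock) -> None:
    source = build_test_source()
    assert list(source._get_ml_model_workunits()) == []
```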
""" model_name = "projects/872197881936/locations/us-west2/models/3583871344875405312" return Model(model_name=model_name) @@ -40,116 +101,110 @@ def real_model(source: VertexAISource) -> Model: @pytest.fixture def model_version( source: VertexAISource, - real_model: Model, + mock_model: Model, ) -> VersionInfo: version = "1" return VersionInfo( version_id=version, version_description="test", - # how to create timestamp_pb2.Timestamp using current time? version_create_time=timestamp_pb2.Timestamp().GetCurrentTime(), version_update_time=timestamp_pb2.Timestamp().GetCurrentTime(), - model_display_name=real_model.name, - model_resource_name=real_model.resource_name, + model_display_name=mock_model.name, + model_resource_name=mock_model.resource_name, ) -@pytest.fixture -def mock_models() -> List[Model]: - mock_model_1 = MagicMock(spec=Model) - mock_model_1.name = "mock_prediction_model_1" - mock_model_1.create_time = timestamp_pb2.Timestamp().GetCurrentTime() - mock_model_1.update_time = timestamp_pb2.Timestamp().GetCurrentTime() - mock_model_1.version_id = "1" - mock_model_1.display_name = "mock_prediction_model_1_display_name" - - mock_model_2 = MagicMock(spec=Model) - mock_model_2.name = "mock_prediction_model_2" - - mock_model_2.create_time = timestamp_pb2.Timestamp().GetCurrentTime() - mock_model_2.update_time = timestamp_pb2.Timestamp().GetCurrentTime() - mock_model_2.version_id = "1" - mock_model_2.display_name = "mock_prediction_model_2_display_name" - - return [mock_model_1, mock_model_2] - - -@pytest.fixture -def mock_model(): - mock_model_1 = MagicMock(spec=Model) - mock_model_1.name = "mock_prediction_model_1" - mock_model_1.create_time = timestamp_pb2.Timestamp().GetCurrentTime() - mock_model_1.update_time = timestamp_pb2.Timestamp().GetCurrentTime() - mock_model_1.version_id = "1" - mock_model_1.display_name = "mock_prediction_model_1_display_name" - return mock_model_1 +@patch("google.cloud.aiplatform.Model.list") +def test_get_ml_model_workunits( + mock_list: List[Model], source: VertexAISource, mock_models: List[Model] +) -> None: + assert hasattr(mock_list, "return_value") # this check needed to go ground lint + mock_list.return_value = mock_models + wcs = [wc for wc in source._get_ml_model_workunits()] + assert len(wcs) == 2 + # aspect is MLModelGroupPropertiesClass -def test_mock_model_workunit(source, mock_model, model_version): - wu = source._get_ml_model_properties_workunit( - model=mock_model, - model_version=model_version, + assert hasattr(wcs[0].metadata, "aspect") + aspect = wcs[0].metadata.aspect + assert isinstance(aspect, MLModelGroupProperties) + assert ( + aspect.name == f"{source._make_vertexai_model_group_name(mock_models[0].name)}" ) - aspect = wu.metadata.aspect - # aspect is MLModelPropertiesClass - print(aspect) - assert aspect.description == model_version.version_description - assert aspect.date == model_version.version_create_time - + assert aspect.description == mock_models[0].description -@pytest.mark.skip(reason="Skipping, this is for debugging purpose") -def test_real_model_workunit(source, real_model, model_version): - """ - Disabled as default - Use real model registered in the Vertex AI Model Registry - """ - wu = source._get_ml_model_properties_workunit( - model=real_model, - model_version=model_version, + assert hasattr(wcs[1].metadata, "aspect") + aspect = wcs[1].metadata.aspect + assert isinstance(aspect, MLModelGroupProperties) + assert ( + aspect.name == f"{source._make_vertexai_model_group_name(mock_models[1].name)}" + ) + assert 
aspect.description == mock_models[1].description + + +def test_get_ml_model_properties_workunit( + source: VertexAISource, mock_model: Model, model_version: VersionInfo +) -> None: + wu = [ + wu for wu in source._get_ml_model_properties_workunit(mock_model, model_version) + ] + assert len(wu) == 1 + assert hasattr(wu[0].metadata, "aspect") + aspect = wu[0].metadata.aspect + assert isinstance(aspect, MLModelProperties) + assert ( + aspect.name + == f"{source._make_vertexai_model_name(mock_model.name)}_{mock_model.version_id}" ) - aspect = wu.metadata.aspect - # aspect is MLModelPropertiesClass assert aspect.description == model_version.version_description assert aspect.date == model_version.version_create_time assert aspect.hyperParams is None - assert aspect.trainingMetrics is None -@patch("google.cloud.aiplatform.Model.list") -def test_mock_models_and_versions_workunits( - mock_list, source, real_model, model_version, mock_models -): - mock_list.return_value = mock_models - wcs = [wc for wc in source._get_ml_model_workunits()] - assert len(wcs) == 2 - # aspect is MLModelGroupPropertiesClass - assert wcs[0].metadata.aspect.name == mock_models[0].name - assert wcs[0].metadata.aspect.description == mock_models[0].description - assert wcs[0].metadata.aspect.createdAt == mock_models[0].create_time - assert wcs[1].metadata.aspect.name == mock_models[1].name - assert wcs[1].metadata.aspect.description == mock_models[1].description - assert wcs[1].metadata.aspect.createdAt == mock_models[1].create_time +def test_get_data_process_properties_workunit( + source: VertexAISource, mock_training_job: VertexAiResourceNoun +) -> None: + for wu in source._get_data_process_properties_workunit(mock_training_job): + assert hasattr(wu.metadata, "aspect") + aspect = wu.metadata.aspect + if isinstance(aspect, DataProcessInstancePropertiesClass): + assert ( + aspect.name + == f"{source._make_vertexai_job_name(mock_training_job.name)}" + ) + assert aspect.externalUrl == source._make_job_external_url( + mock_training_job + ) + elif isinstance(aspect, SubTypesClass): + assert "Training Job" in aspect.typeNames + + +def test_get_data_process_input_workunit( + source: VertexAISource, mock_training_job: VertexAiResourceNoun +) -> None: + for wu in source._get_data_process_input_workunit(mock_training_job, "12345"): + assert hasattr(wu.metadata, "aspect") + aspect = wu.metadata.aspect + assert isinstance(aspect, DataProcessInstanceInputClass) + assert len(aspect.inputs) == 1 @pytest.mark.skip(reason="Skipping, this is for debugging purpose") -def test_real_models_and_versions_workunits(source): +def test_real_model_workunit( + source: VertexAISource, real_model: Model, model_version: VersionInfo +) -> None: """ Disabled as default Use real model registered in the Vertex AI Model Registry """ - wcs = [wc for wc in source._get_ml_model_workunits()] - assert len(wcs) == 2 - # aspect is MLModelGroupPropertiesClass or MLModelPropertiesClass - # assert using real name in GCP model registry - # assert wcs[0].metadata.aspect.name == "mock_prediction_model_1" - - -def test_config_model_name_separator(source, model_version): - name_version_sep = "+" - source.config.model_name_separator = name_version_sep - expected_model_name = f"{model_version.model_display_name}{name_version_sep}{model_version.version_id}" - expected_urn = f"urn:li:mlModel:(urn:li:dataPlatform:vertexai,{expected_model_name},{source.config.env})" - - urn = source._make_ml_model_urn(model_version) - - assert urn == expected_urn + for wu in 
source._get_ml_model_properties_workunit( + model=real_model, model_version=model_version + ): + assert hasattr(wu.metadata, "aspect") + aspect = wu.metadata.aspect + assert isinstance(aspect, MLModelProperties) + # aspect is MLModelPropertiesClass + assert aspect.description == model_version.version_description + assert aspect.date == model_version.version_create_time + assert aspect.hyperParams is None + assert aspect.trainingMetrics is None From f16c8f527bff4b3e2bbee617ac07c61b1991ac70 Mon Sep 17 00:00:00 2001 From: Ryota Egashira Date: Wed, 26 Feb 2025 13:26:55 -0800 Subject: [PATCH 34/59] fix platform name --- metadata-ingestion/src/datahub/ingestion/source/vertexai.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py index e82f25fc72371b..2ec4e139663291 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py +++ b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py @@ -129,7 +129,7 @@ def __init__(self, **data: Any): os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = self._credentials_path -@platform_name("Vertex AI") +@platform_name("VertexAI") @config_class(VertexAIConfig) @support_status(SupportStatus.TESTING) @capability( From 1de43a0c14375b38bd9fc19d23c1c68bf8d52768 Mon Sep 17 00:00:00 2001 From: Ryota Egashira Date: Wed, 26 Feb 2025 14:48:18 -0800 Subject: [PATCH 35/59] fixed _get_data_process_input_workunit test case --- .../tests/unit/test_vertexai_source.py | 35 ++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/metadata-ingestion/tests/unit/test_vertexai_source.py b/metadata-ingestion/tests/unit/test_vertexai_source.py index ae97a637eaa7e2..ef57f18f5c691e 100644 --- a/metadata-ingestion/tests/unit/test_vertexai_source.py +++ b/metadata-ingestion/tests/unit/test_vertexai_source.py @@ -179,9 +179,42 @@ def test_get_data_process_properties_workunit( assert "Training Job" in aspect.typeNames +@patch("google.cloud.aiplatform.datasets.TextDataset.list") +@patch("google.cloud.aiplatform.datasets.TabularDataset.list") +@patch("google.cloud.aiplatform.datasets.ImageDataset.list") +@patch("google.cloud.aiplatform.datasets.TimeSeriesDataset.list") +@patch("google.cloud.aiplatform.datasets.VideoDataset.list") def test_get_data_process_input_workunit( - source: VertexAISource, mock_training_job: VertexAiResourceNoun + mock_text_list: List[VertexAiResourceNoun], + mock_tabular_list: List[VertexAiResourceNoun], + mock_image_list: List[VertexAiResourceNoun], + mock_time_series_list: List[VertexAiResourceNoun], + mock_video_list: List[VertexAiResourceNoun], + source: VertexAISource, + mock_training_job: VertexAiResourceNoun, ) -> None: + # Mocking all the dataset list + assert hasattr( + mock_text_list, "return_value" + ) # this check needed to go ground lint + mock_text_list.return_value = [] + assert hasattr( + mock_tabular_list, "return_value" + ) # this check needed to go ground lint + mock_tabular_list.return_value = [] + assert hasattr( + mock_video_list, "return_value" + ) # this check needed to go ground lint + mock_video_list.return_value = [] + assert hasattr( + mock_time_series_list, "return_value" + ) # this check needed to go ground lint + mock_time_series_list.return_value = [] + assert hasattr( + mock_image_list, "return_value" + ) # this check needed to go ground lint + mock_image_list.return_value = [] + for wu in source._get_data_process_input_workunit(mock_training_job, "12345"): assert 
hasattr(wu.metadata, "aspect") aspect = wu.metadata.aspect From ea577cbf949e9828214551c3e37f73f6d3c9b913 Mon Sep 17 00:00:00 2001 From: Ryota Egashira Date: Wed, 26 Feb 2025 17:06:48 -0800 Subject: [PATCH 36/59] Adding subtype and container to dataset and training job --- .../src/datahub/ingestion/source/vertexai.py | 180 +++++++++++++----- 1 file changed, 128 insertions(+), 52 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py index 2ec4e139663291..174db8eda0ff0e 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py +++ b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py @@ -20,10 +20,12 @@ from pydantic.fields import Field import datahub.emitter.mce_builder as builder +import datahub.emitter.mcp_builder from datahub.configuration import ConfigModel from datahub.configuration.source_common import EnvConfigMixin from datahub.configuration.validate_multiline_string import pydantic_multiline_string from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.emitter.mcp_builder import ProjectIdKey from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.api.decorators import ( SupportStatus, @@ -37,6 +39,7 @@ from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.metadata.schema_classes import ( AuditStampClass, + ContainerClass, DataProcessInstanceInputClass, DataProcessInstancePropertiesClass, DatasetPropertiesClass, @@ -159,12 +162,26 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: - Models and Model Versions from the Model Registry - Training Jobs """ - # Fetch Models, Model Versions a from Model Registry + + # Ingest Project + yield from self._get_project_workunit() + # Fetch and Ingest Models, Model Versions a from Model Registry yield from self._get_ml_model_workunits() - # Fetch Training Jobs + # Fetch and Ingest Training Jobs yield from self._get_training_jobs_workunit() # TODO Fetch Experiments and Experiment Runs + def _get_project_workunit(self) -> Iterable[MetadataWorkUnit]: + container_key = ProjectIdKey( + project_id=self.config.project_id, platform=self.platform + ) + + yield from datahub.emitter.mcp_builder.gen_containers( + container_key=container_key, + name=self.config.project_id, + sub_types=["Project"], + ) + def _validate_training_job(self, model: Model) -> bool: """ Validate Model Has Valid Training Job @@ -290,7 +307,7 @@ def _get_ml_group_workunit( aspect=MLModelGroupPropertiesClass( name=self._make_vertexai_model_group_name(model.name), description=model.description, - createdAt=int(model.create_time.timestamp()) + createdAt=int(model.create_time.timestamp() * 1000) if model.create_time else None, customProperties={"displayName": model.display_name}, @@ -322,29 +339,48 @@ def _get_data_process_properties_workunit( created_actor = f"urn:li:platformResource:{self.platform}" job_id = self._make_vertexai_job_name(entity_id=job.name) - entityUrn = builder.make_data_process_instance_urn(job_id) - - prop_mcp = MetadataChangeProposalWrapper( - entityUrn=entityUrn, - aspect=DataProcessInstancePropertiesClass( - name=job_id, - created=AuditStampClass( - time=created_time, - actor=created_actor, + job_urn = builder.make_data_process_instance_urn(job_id) + + mcps = [] + mcps.append( + MetadataChangeProposalWrapper( + entityUrn=job_urn, + aspect=DataProcessInstancePropertiesClass( + name=job_id, + created=AuditStampClass( + time=created_time, + actor=created_actor, + ), + 
externalUrl=self._make_job_external_url(job), + customProperties={ + "displayName": job.display_name, + "jobType": job.__class__.__name__, + }, ), - externalUrl=self._make_job_external_url(job), - customProperties={ - "displayName": job.display_name, - "jobType": job.__class__.__name__, - }, - ), + ) + ) + + mcps.append( + MetadataChangeProposalWrapper( + entityUrn=job_urn, aspect=SubTypesClass(typeNames=["Training Job"]) + ) + ) + + # Create a container for Project as parent of the dataset + container_key = ProjectIdKey( + project_id=self.config.project_id, platform=self.platform ) - subtype_mcp = MetadataChangeProposalWrapper( - entityUrn=entityUrn, aspect=SubTypesClass(typeNames=["Training Job"]) + mcps.append( + MetadataChangeProposalWrapper( + entityUrn=job_urn, + aspect=ContainerClass( + container=container_key.as_urn(), + ), + ) ) - yield from auto_workunit([prop_mcp, subtype_mcp]) + yield from auto_workunit(mcps) def _is_automl_job(self, job: VertexAiResourceNoun) -> bool: return ( @@ -415,19 +451,20 @@ def _search_dataset(self, dataset_id: str) -> Optional[VertexAiResourceNoun]: return self.datasets[dataset_id] if dataset_id in self.datasets else None def _get_dataset_workunit( - self, urn: str, ds: VertexAiResourceNoun + self, dataset_urn: str, ds: VertexAiResourceNoun ) -> Iterable[MetadataWorkUnit]: """ Create a DatasetPropertiesClass aspect for a given Vertex AI dataset. """ + # Create aspects for the dataset mcps = [] mcps.append( MetadataChangeProposalWrapper( - entityUrn=urn, + entityUrn=dataset_urn, aspect=DatasetPropertiesClass( name=self._make_vertexai_dataset_name(ds.name), - created=TimeStampClass(time=int(ds.create_time.timestamp())), + created=TimeStampClass(time=int(ds.create_time.timestamp() * 1000)), description=f"Dataset: {ds.display_name} for training job", customProperties={ "displayName": ds.display_name, @@ -437,13 +474,25 @@ def _get_dataset_workunit( ), ) ) - mcps.append( MetadataChangeProposalWrapper( - entityUrn=urn, aspect=SubTypesClass(typeNames=["Dataset"]) + entityUrn=dataset_urn, aspect=SubTypesClass(typeNames=["Dataset"]) ) ) + # Create a container for Project as parent of the dataset + container_key = ProjectIdKey( + project_id=self.config.project_id, platform=self.platform + ) + + mcps.append( + MetadataChangeProposalWrapper( + entityUrn=dataset_urn, + aspect=ContainerClass( + container=container_key.as_urn(), + ), + ) + ) yield from auto_workunit(mcps) def _get_job_input_workunit( @@ -489,7 +538,7 @@ def _get_data_process_input_workunit( dataset = self._search_dataset(dataset_id) if dataset_id else None if dataset: - yield from self._get_dataset_workunit(urn=dataset_urn, ds=dataset) + yield from self._get_dataset_workunit(dataset_urn=dataset_urn, ds=dataset) # Create URN of Training Job job_id = self._make_vertexai_job_name(entity_id=job.name) mcp = MetadataChangeProposalWrapper( @@ -501,6 +550,49 @@ def _get_data_process_input_workunit( ) yield from auto_workunit([mcp]) + def _get_endpoint_workunit( + self, endpoint: Endpoint, model: Model, model_version: VersionInfo + ) -> Iterable[MetadataWorkUnit]: + endpoint_urn = builder.make_ml_model_deployment_urn( + platform=self.platform, + deployment_name=self._make_vertexai_endpoint_name( + entity_id=endpoint.display_name + ), + env=self.config.env, + ) + deployment_aspect = MLModelDeploymentPropertiesClass( + description=model.description, + createdAt=int(endpoint.create_time.timestamp() * 1000), + version=VersionTagClass(versionTag=str(model_version.version_id)), + customProperties={"displayName": 
endpoint.display_name}, + ) + + mcps = [] + mcps.append( + MetadataChangeProposalWrapper( + entityUrn=endpoint_urn, aspect=deployment_aspect + ) + ) + + mcps.append( + MetadataChangeProposalWrapper( + entityUrn=endpoint_urn, + aspect=ContainerClass( + container=ProjectIdKey( + project_id=self.config.project_id, platform=self.platform + ).as_urn(), + ), + ) + ) + + mcps.append( + MetadataChangeProposalWrapper( + entityUrn=endpoint_urn, aspect=SubTypesClass(typeNames=["Endpoint"]) + ) + ) + + yield from auto_workunit(mcps) + def _get_ml_model_endpoint_workunit( self, model: Model, @@ -515,24 +607,7 @@ def _get_ml_model_endpoint_workunit( endpoint_urn = None if endpoint: - endpoint_urn = builder.make_ml_model_deployment_urn( - platform=self.platform, - deployment_name=self._make_vertexai_endpoint_name( - entity_id=endpoint.display_name - ), - env=self.config.env, - ) - deployment_aspect = MLModelDeploymentPropertiesClass( - description=model.description, - createdAt=int(endpoint.create_time.timestamp()), - version=VersionTagClass(versionTag=str(model_version.version_id)), - customProperties={"displayName": endpoint.display_name}, - ) - - mcp = MetadataChangeProposalWrapper( - entityUrn=endpoint_urn, aspect=deployment_aspect - ) - yield from auto_workunit([mcp]) + yield from self._get_endpoint_workunit(endpoint, model, model_version) yield from self._get_ml_model_properties_workunit( model, model_version, training_job_urn, endpoint_urn @@ -552,14 +627,14 @@ def _get_ml_model_properties_workunit( """ logging.info(f"starting model work unit for model {model.name}") - ml_model_group_urn = self._make_ml_model_group_urn(model) + model_group_urn = self._make_ml_model_group_urn(model) model_name = self._make_vertexai_model_name(entity_id=model.name) model_version_name = ( f"{model_name}{self.model_name_separator}{model_version.version_id}" ) - ml_model_urn = self._make_ml_model_urn(model_version, model_name=model_name) + model_urn = self._make_ml_model_urn(model_version, model_name=model_name) - ml_model_properties = MLModelPropertiesClass( + model_aspect = MLModelPropertiesClass( name=model_version_name, description=model_version.version_description, customProperties={ @@ -575,7 +650,7 @@ def _get_ml_model_properties_workunit( if model_version.version_update_time else None, version=VersionTagClass(versionTag=str(model_version.version_id)), - groups=[ml_model_group_urn], # link model version to model group + groups=[model_group_urn], # link model version to model group trainingJobs=[training_job_urn] if training_job_urn else None, # link to training job @@ -585,11 +660,12 @@ def _get_ml_model_properties_workunit( externalUrl=self._make_model_version_external_url(model), ) + mcps = [] # logging.info(f"created model version {ml_model_properties.name} associated with group {ml_model_group_urn}") - mcp = MetadataChangeProposalWrapper( - entityUrn=ml_model_urn, aspect=ml_model_properties + mcps.append( + MetadataChangeProposalWrapper(entityUrn=model_urn, aspect=model_aspect) ) - yield from auto_workunit([mcp]) + yield from auto_workunit(mcps) def _search_endpoint(self, model: Model) -> Optional[Endpoint]: """ From 46ff5261a5ba2d9c79020ccbd608f69021144377 Mon Sep 17 00:00:00 2001 From: Ryota Egashira Date: Thu, 27 Feb 2025 13:51:10 -0800 Subject: [PATCH 37/59] fix UI issue on timestamp and refactor --- .../src/datahub/ingestion/source/vertexai.py | 55 +++++++++++++++++-- .../tests/unit/test_vertexai_source.py | 35 ++++++++++++ 2 files changed, 85 insertions(+), 5 deletions(-) diff --git 
a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py index 174db8eda0ff0e..5ab74d50ffabef 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py +++ b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py @@ -37,6 +37,9 @@ from datahub.ingestion.api.source import Source, SourceCapability, SourceReport from datahub.ingestion.api.source_helpers import auto_workunit from datahub.ingestion.api.workunit import MetadataWorkUnit +from datahub.metadata.com.linkedin.pegasus2avro.ml.metadata import ( + MLTrainingRunProperties, +) from datahub.metadata.schema_classes import ( AuditStampClass, ContainerClass, @@ -307,9 +310,14 @@ def _get_ml_group_workunit( aspect=MLModelGroupPropertiesClass( name=self._make_vertexai_model_group_name(model.name), description=model.description, - createdAt=int(model.create_time.timestamp() * 1000) + created=TimeStampClass(time=int(model.create_time.timestamp() * 1000)) if model.create_time else None, + lastModified=TimeStampClass( + time=int(model.update_time.timestamp() * 1000) + ) + if model.update_time + else None, customProperties={"displayName": model.display_name}, ), ) @@ -332,7 +340,7 @@ def _get_data_process_properties_workunit( """ created_time = ( - int(job.create_time.timestamp()) + int(job.create_time.timestamp() * 1000) if job.create_time else int(time.time() * 1000) ) @@ -360,12 +368,31 @@ def _get_data_process_properties_workunit( ) ) + mcps.append( + MetadataChangeProposalWrapper( + entityUrn=job_urn, + aspect=MLTrainingRunProperties( + externalUrl=self._make_job_external_url(job), id=job.name + ), + ) + ) + mcps.append( MetadataChangeProposalWrapper( entityUrn=job_urn, aspect=SubTypesClass(typeNames=["Training Job"]) ) ) + # mcps.append( + # MetadataChangeProposalWrapper( + # entityUrn=job_urn, + # aspect=DataProcessInstanceRunEventClass( + # status=DataProcessRunStatusClass.COMPLETE, + # timestampMillis=0 + # ) + # ) + # ) + # Create a container for Project as parent of the dataset container_key = ProjectIdKey( project_id=self.config.project_id, platform=self.platform @@ -465,7 +492,7 @@ def _get_dataset_workunit( aspect=DatasetPropertiesClass( name=self._make_vertexai_dataset_name(ds.name), created=TimeStampClass(time=int(ds.create_time.timestamp() * 1000)), - description=f"Dataset: {ds.display_name} for training job", + description=f"Dataset: {ds.display_name}", customProperties={ "displayName": ds.display_name, "resourceName": ds.resource_name, @@ -643,10 +670,14 @@ def _get_ml_model_properties_workunit( + model_version.version_id, "resourceName": model.resource_name, }, - created=TimeStampClass(model_version.version_create_time.second) + created=TimeStampClass( + int(model_version.version_create_time.timestamp() * 1000) + ) if model_version.version_create_time else None, - lastModified=TimeStampClass(model_version.version_update_time.second) + lastModified=TimeStampClass( + int(model_version.version_update_time.timestamp() * 1000) + ) if model_version.version_update_time else None, version=VersionTagClass(versionTag=str(model_version.version_id)), @@ -658,6 +689,7 @@ def _get_ml_model_properties_workunit( if endpoint_urn else [], # link to model registry and endpoint externalUrl=self._make_model_version_external_url(model), + type="ML Model", ) mcps = [] @@ -665,6 +697,19 @@ def _get_ml_model_properties_workunit( mcps.append( MetadataChangeProposalWrapper(entityUrn=model_urn, aspect=model_aspect) ) + + # Create a container for Project as parent of the 
dataset + # mcps.append( + # MetadataChangeProposalWrapper( + # entityUrn=model_urn, + # aspect=ContainerClass( + # container=ProjectIdKey( + # project_id=self.config.project_id, + # platform=self.platform).as_urn(), + # ), + # ) + # ) + yield from auto_workunit(mcps) def _search_endpoint(self, model: Model) -> Optional[Endpoint]: diff --git a/metadata-ingestion/tests/unit/test_vertexai_source.py b/metadata-ingestion/tests/unit/test_vertexai_source.py index ef57f18f5c691e..ef7a019a4e2d15 100644 --- a/metadata-ingestion/tests/unit/test_vertexai_source.py +++ b/metadata-ingestion/tests/unit/test_vertexai_source.py @@ -2,8 +2,10 @@ from unittest.mock import MagicMock, patch import pytest +from google.cloud import aiplatform from google.cloud.aiplatform.base import VertexAiResourceNoun from google.cloud.aiplatform.models import Model, VersionInfo +from google.cloud.aiplatform.training_jobs import _TrainingJob from google.protobuf import timestamp_pb2 from datahub.ingestion.api.common import PipelineContext @@ -98,6 +100,25 @@ def real_model(source: VertexAISource) -> Model: return Model(model_name=model_name) +@pytest.fixture +def real_autoML_tabular_job(source: VertexAISource) -> _TrainingJob: + """ + Fixture for the training job that is actually registered in the Vertex AI Model Registry + Use mock_training_job for local testing purpose, but this fixture is provided to use real training job for debugging. + Replace training job name with your real training job when using this fixture. + """ + + # Initialize the AI Platform client + aiplatform.init(project=source.config.project_id, location=source.config.region) + + # Retrieve the custom training job by its resource name + # resource_name format 'projects/your-project-id/locations/your-location/trainingPipelines/your-training-job-id') + job = aiplatform.AutoMLTabularTrainingJob.get( + resource_name="projects/872197881936/locations/us-west2/trainingPipelines/5401695018589093888" + ) + return job + + @pytest.fixture def model_version( source: VertexAISource, @@ -241,3 +262,17 @@ def test_real_model_workunit( assert aspect.date == model_version.version_create_time assert aspect.hyperParams is None assert aspect.trainingMetrics is None + + +@pytest.mark.skip(reason="Skipping, this is for debugging purpose") +def test_real_get_data_process_properties( + source: VertexAISource, real_autoML_tabular_job: _TrainingJob +) -> None: + for wu in source._get_data_process_properties_workunit(real_autoML_tabular_job): + assert hasattr(wu.metadata, "aspect") + aspect = wu.metadata.aspect + if isinstance(aspect, DataProcessInstancePropertiesClass): + # aspect is DataProcessInstancePropertiesClass + assert aspect.externalUrl == source._make_job_external_url( + real_autoML_tabular_job + ) From 7b0fb70ac644c2950a5733628b7dab2aad37f549 Mon Sep 17 00:00:00 2001 From: Ryota Egashira Date: Thu, 27 Feb 2025 13:54:38 -0800 Subject: [PATCH 38/59] removed token --- metadata-ingestion/docs/sources/vertexai/vertexai_recipe.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/metadata-ingestion/docs/sources/vertexai/vertexai_recipe.yml b/metadata-ingestion/docs/sources/vertexai/vertexai_recipe.yml index 6692a9a9b44420..d97d5d82e3e666 100644 --- a/metadata-ingestion/docs/sources/vertexai/vertexai_recipe.yml +++ b/metadata-ingestion/docs/sources/vertexai/vertexai_recipe.yml @@ -13,4 +13,3 @@ sink: type: "datahub-rest" config: server: "http://localhost:8080" - token: 
"eyJhbGciOiJIUzI1NiJ9.eyJhY3RvclR5cGUiOiJVU0VSIiwiYWN0b3JJZCI6ImRhdGFodWIiLCJ0eXBlIjoiUEVSU09OQUwiLCJ2ZXJzaW9uIjoiMiIsImp0aSI6IjU5OTBhZjRjLTFiOTEtNDg1Zi1iNDk3LTJmZjVlODA0ODY3YSIsInN1YiI6ImRhdGFodWIiLCJleHAiOjE3NDI4OTA1NDgsImlzcyI6ImRhdGFodWItbWV0YWRhdGEtc2VydmljZSJ9.yDZzG_Kes9GCYIJeLRNibzTryyzIXG_ve6o3VcDByMo" \ No newline at end of file From cf9c242c91d633f83aae606f1671d7343622af3b Mon Sep 17 00:00:00 2001 From: Ryota Egashira Date: Fri, 28 Feb 2025 02:08:37 -0800 Subject: [PATCH 39/59] Adding integration test for VertexAI --- metadata-ingestion/setup.py | 1 + .../vertexai/test_vertexai_source.py | 170 ++++++++++++ .../vertexai/vertexai_mcps_golden.json | 253 ++++++++++++++++++ 3 files changed, 424 insertions(+) create mode 100644 metadata-ingestion/tests/integration/vertexai/test_vertexai_source.py create mode 100644 metadata-ingestion/tests/integration/vertexai/vertexai_mcps_golden.json diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index f5f474accc2825..de680b6974a95a 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -710,6 +710,7 @@ "mariadb", "redash", "vertica", + "vertexai" ] if plugin for dependency in plugins[plugin] diff --git a/metadata-ingestion/tests/integration/vertexai/test_vertexai_source.py b/metadata-ingestion/tests/integration/vertexai/test_vertexai_source.py new file mode 100644 index 00000000000000..ad0274ed41af31 --- /dev/null +++ b/metadata-ingestion/tests/integration/vertexai/test_vertexai_source.py @@ -0,0 +1,170 @@ +from pathlib import Path +from typing import Any, Dict, List, TypeVar +from unittest.mock import MagicMock, patch + +import pytest +from _pytest.config import Config +from google.cloud.aiplatform import Model +from google.cloud.aiplatform.base import VertexAiResourceNoun +from google.protobuf import timestamp_pb2 + +from datahub.ingestion.run.pipeline import Pipeline +from tests.test_helpers import mce_helpers + +T = TypeVar("T") + + +@pytest.fixture +def project_id() -> str: + return "test-project-id" + + +@pytest.fixture +def region() -> str: + return "us-west2" + + +@pytest.fixture +def sink_file_path(tmp_path: Path) -> str: + return str(tmp_path / "vertexai_source_mcps.json") + + +@pytest.fixture +def pipeline_config( + project_id: str, region: str, sink_file_path: str +) -> Dict[str, Any]: + source_type = "vertexai" + return { + "run_id": "vertexai-source-test", + "source": { + "type": source_type, + "config": { + "project_id": project_id, + "region": region, + }, + }, + "sink": { + "type": "file", + "config": { + "filename": sink_file_path, + }, + }, + } + + +@pytest.fixture +def mock_models() -> List[Model]: + mock_model_1 = MagicMock(spec=Model) + mock_model_1.name = "mock_prediction_model_1" + mock_model_1.create_time = timestamp_pb2.Timestamp().GetCurrentTime() + mock_model_1.update_time = timestamp_pb2.Timestamp().GetCurrentTime() + mock_model_1.version_id = "1" + mock_model_1.display_name = "mock_prediction_model_1_display_name" + mock_model_1.description = "mock_prediction_model_1_description" + + mock_model_2 = MagicMock(spec=Model) + mock_model_2.name = "mock_prediction_model_2" + + mock_model_2.create_time = timestamp_pb2.Timestamp().GetCurrentTime() + mock_model_2.update_time = timestamp_pb2.Timestamp().GetCurrentTime() + mock_model_2.version_id = "1" + mock_model_2.display_name = "mock_prediction_model_2_display_name" + mock_model_2.description = "mock_prediction_model_1_description" + + return [mock_model_1, mock_model_2] + + +@pytest.fixture +def mock_training_jobs() -> 
List[VertexAiResourceNoun]: + mock_training_job = MagicMock(spec=VertexAiResourceNoun) + mock_training_job.name = "mock_training_job" + mock_training_job.create_time = timestamp_pb2.Timestamp().GetCurrentTime() + mock_training_job.update_time = timestamp_pb2.Timestamp().GetCurrentTime() + mock_training_job.display_name = "mock_training_job_display_name" + mock_training_job.description = "mock_training_job_description" + return [mock_training_job] + + +@patch("google.cloud.aiplatform.init") +@patch("google.cloud.aiplatform.Model.list") +@patch("google.cloud.aiplatform.datasets.TextDataset.list") +@patch("google.cloud.aiplatform.datasets.TabularDataset.list") +@patch("google.cloud.aiplatform.datasets.ImageDataset.list") +@patch("google.cloud.aiplatform.datasets.TimeSeriesDataset.list") +@patch("google.cloud.aiplatform.datasets.VideoDataset.list") +@patch("google.cloud.aiplatform.CustomJob.list") +@patch("google.cloud.aiplatform.CustomTrainingJob.list") +@patch("google.cloud.aiplatform.CustomContainerTrainingJob.list") +@patch("google.cloud.aiplatform.CustomPythonPackageTrainingJob.list") +@patch("google.cloud.aiplatform.AutoMLTabularTrainingJob.list") +@patch("google.cloud.aiplatform.AutoMLTextTrainingJob.list") +@patch("google.cloud.aiplatform.AutoMLImageTrainingJob.list") +@patch("google.cloud.aiplatform.AutoMLVideoTrainingJob.list") +def test_vertexai_source_ingestion( + mock_automl_video_job_list: List[VertexAiResourceNoun], + mock_automl_image_list: List[VertexAiResourceNoun], + mock_automl_text_job_list: List[VertexAiResourceNoun], + mock_automl_tabular_job_list: List[VertexAiResourceNoun], + mock_custom_python_job_list: List[VertexAiResourceNoun], + mock_custom_container_job_list: List[VertexAiResourceNoun], + mock_custom_training_job_list: List[VertexAiResourceNoun], + mock_custom_job_list: List[VertexAiResourceNoun], + mock_video_ds_list: List[VertexAiResourceNoun], + mock_time_series_ds_list: List[VertexAiResourceNoun], + mock_image_ds_list: List[VertexAiResourceNoun], + mock_tabular_ds_list: List[VertexAiResourceNoun], + mock_text_ds_list: List[VertexAiResourceNoun], + mock_model_list: List[Model], + mock_init: MagicMock, + pytestconfig: Config, + sink_file_path: str, + pipeline_config: Dict[str, Any], + mock_models: List[Model], + mock_training_jobs: List[VertexAiResourceNoun], +) -> None: + assert hasattr(mock_model_list, "return_value") + mock_model_list.return_value = mock_models + assert hasattr(mock_text_ds_list, "return_value") + mock_text_ds_list.return_value = [] + assert hasattr(mock_tabular_ds_list, "return_value") + mock_tabular_ds_list.return_value = [] + assert hasattr(mock_image_ds_list, "return_value") + mock_image_ds_list.return_value = [] + assert hasattr(mock_time_series_ds_list, "return_value") + mock_time_series_ds_list.return_value = [] + assert hasattr(mock_video_ds_list, "return_value") + mock_video_ds_list.return_value = [] + assert hasattr(mock_custom_job_list, "return_value") + mock_custom_job_list.return_value = mock_training_jobs + assert hasattr(mock_custom_training_job_list, "return_value") + mock_custom_training_job_list.return_value = [] + assert hasattr(mock_custom_container_job_list, "return_value") + mock_custom_container_job_list.return_value = [] + assert hasattr(mock_custom_python_job_list, "return_value") + mock_custom_python_job_list.return_value = [] + assert hasattr(mock_automl_tabular_job_list, "return_value") + mock_automl_tabular_job_list.return_value = [] + assert hasattr(mock_automl_text_job_list, "return_value") + 
mock_automl_text_job_list.return_value = [] + assert hasattr(mock_automl_image_list, "return_value") + mock_automl_image_list.return_value = [] + assert hasattr(mock_automl_video_job_list, "return_value") + mock_automl_video_job_list.return_value = [] + + golden_file_path = ( + pytestconfig.rootpath / "tests/integration/vertexai/vertexai_mcps_golden.json" + ) + + print(f"mcps file path: {str(sink_file_path)}") + print(f"golden file path: {str(golden_file_path)}") + + pipeline = Pipeline.create(pipeline_config) + pipeline.run() + pipeline.pretty_print_summary() + pipeline.raise_from_status() + + mce_helpers.check_golden_file( + pytestconfig=pytestconfig, + output_path=sink_file_path, + golden_path=golden_file_path, + ) diff --git a/metadata-ingestion/tests/integration/vertexai/vertexai_mcps_golden.json b/metadata-ingestion/tests/integration/vertexai/vertexai_mcps_golden.json new file mode 100644 index 00000000000000..45b87513754b65 --- /dev/null +++ b/metadata-ingestion/tests/integration/vertexai/vertexai_mcps_golden.json @@ -0,0 +1,253 @@ +[ +{ + "entityType": "container", + "entityUrn": "urn:li:container:29746a9030349f4340ed74b46913dab6", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "vertexai", + "project_id": "test-project-id" + }, + "name": "test-project-id" + } + }, + "systemMetadata": { + "lastObserved": 1740736756728, + "runId": "vertexai-source-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:29746a9030349f4340ed74b46913dab6", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1740736756729, + "runId": "vertexai-source-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:29746a9030349f4340ed74b46913dab6", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:vertexai" + } + }, + "systemMetadata": { + "lastObserved": 1740736756729, + "runId": "vertexai-source-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:29746a9030349f4340ed74b46913dab6", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Project" + ] + } + }, + "systemMetadata": { + "lastObserved": 1740736756729, + "runId": "vertexai-source-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:29746a9030349f4340ed74b46913dab6", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [] + } + }, + "systemMetadata": { + "lastObserved": 1740736756730, + "runId": "vertexai-source-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "mlModelGroup", + "entityUrn": "urn:li:mlModelGroup:(urn:li:dataPlatform:vertexai,test-project-id.model.mock_prediction_model_1,PROD)", + "changeType": "UPSERT", + "aspectName": "mlModelGroupProperties", + "aspect": { + "json": { + "customProperties": { + "displayName": "mock_prediction_model_1_display_name" + }, + "name": "test-project-id.model_group.mock_prediction_model_1", + "description": "mock_prediction_model_1_description" + } + }, + "systemMetadata": { + "lastObserved": 1740736756731, + "runId": "vertexai-source-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": 
"mlModelGroup", + "entityUrn": "urn:li:mlModelGroup:(urn:li:dataPlatform:vertexai,test-project-id.model.mock_prediction_model_2,PROD)", + "changeType": "UPSERT", + "aspectName": "mlModelGroupProperties", + "aspect": { + "json": { + "customProperties": { + "displayName": "mock_prediction_model_2_display_name" + }, + "name": "test-project-id.model_group.mock_prediction_model_2", + "description": "mock_prediction_model_1_description" + } + }, + "systemMetadata": { + "lastObserved": 1740736756732, + "runId": "vertexai-source-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:test-project-id.job.mock_training_job", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceProperties", + "aspect": { + "json": { + "customProperties": { + "displayName": "mock_training_job_display_name", + "jobType": "VertexAiResourceNoun" + }, + "externalUrl": "https://console.cloud.google.com/vertex-ai/training/training-pipelines?trainingPipelineId=mock_training_job?project=test-project-id", + "name": "test-project-id.job.mock_training_job", + "created": { + "time": 1740736756731, + "actor": "urn:li:platformResource:vertexai" + } + } + }, + "systemMetadata": { + "lastObserved": 1740736758306, + "runId": "vertexai-source-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:test-project-id.job.mock_training_job", + "changeType": "UPSERT", + "aspectName": "mlTrainingRunProperties", + "aspect": { + "json": { + "customProperties": {}, + "externalUrl": "https://console.cloud.google.com/vertex-ai/training/training-pipelines?trainingPipelineId=mock_training_job?project=test-project-id", + "id": "mock_training_job" + } + }, + "systemMetadata": { + "lastObserved": 1740736758308, + "runId": "vertexai-source-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:test-project-id.job.mock_training_job", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Training Job" + ] + } + }, + "systemMetadata": { + "lastObserved": 1740736758309, + "runId": "vertexai-source-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:test-project-id.job.mock_training_job", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:29746a9030349f4340ed74b46913dab6" + } + }, + "systemMetadata": { + "lastObserved": 1740736758311, + "runId": "vertexai-source-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:test-project-id.job.mock_training_job", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1740736758312, + "runId": "vertexai-source-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "mlModelGroup", + "entityUrn": "urn:li:mlModelGroup:(urn:li:dataPlatform:vertexai,test-project-id.model.mock_prediction_model_1,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1740736758312, + "runId": "vertexai-source-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "mlModelGroup", + "entityUrn": 
"urn:li:mlModelGroup:(urn:li:dataPlatform:vertexai,test-project-id.model.mock_prediction_model_2,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1740736758313, + "runId": "vertexai-source-test", + "lastRunId": "no-run-id-provided" + } +} +] \ No newline at end of file From 398c380f1ef71074b91acf5170274dbecc55e098 Mon Sep 17 00:00:00 2001 From: Ryota Egashira Date: Fri, 28 Feb 2025 08:47:08 -0800 Subject: [PATCH 40/59] Adding unit test cases --- .../app/ingest/source/builder/sources.json | 2 +- .../src/datahub/ingestion/source/vertexai.py | 1 - .../tests/unit/test_vertexai_source.py | 73 ++++++++++++++++++- 3 files changed, 73 insertions(+), 3 deletions(-) diff --git a/datahub-web-react/src/app/ingest/source/builder/sources.json b/datahub-web-react/src/app/ingest/source/builder/sources.json index 43fd1c28ea94f7..c2882214830268 100644 --- a/datahub-web-react/src/app/ingest/source/builder/sources.json +++ b/datahub-web-react/src/app/ingest/source/builder/sources.json @@ -339,6 +339,6 @@ "name": "vertexai", "displayName": "VertexAI", "docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/vertexai/", - "recipe": "source:\n type: vertexai\n config:\n tracking_uri: tracking_uri" + "recipe": "source:\n type: vertexai\n config:\n project_id: # you GCP project ID \n region: # region where your GCP project resides \n # Credentials\n # Add GCP credentials" } ] diff --git a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py index 5ab74d50ffabef..5a613588c3c678 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py +++ b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py @@ -119,7 +119,6 @@ class VertexAIConfig(EnvConfigMixin): default="https://console.cloud.google.com/vertex-ai", description=("VertexUI URI"), ) - _credentials_path: Optional[str] = PrivateAttr(None) def __init__(self, **data: Any): diff --git a/metadata-ingestion/tests/unit/test_vertexai_source.py b/metadata-ingestion/tests/unit/test_vertexai_source.py index ef7a019a4e2d15..2464c4bb28ac22 100644 --- a/metadata-ingestion/tests/unit/test_vertexai_source.py +++ b/metadata-ingestion/tests/unit/test_vertexai_source.py @@ -1,10 +1,11 @@ +from datetime import datetime from typing import List from unittest.mock import MagicMock, patch import pytest from google.cloud import aiplatform from google.cloud.aiplatform.base import VertexAiResourceNoun -from google.cloud.aiplatform.models import Model, VersionInfo +from google.cloud.aiplatform.models import Endpoint, Model, VersionInfo from google.cloud.aiplatform.training_jobs import _TrainingJob from google.protobuf import timestamp_pb2 @@ -17,6 +18,7 @@ from datahub.metadata.schema_classes import ( DataProcessInstanceInputClass, DataProcessInstancePropertiesClass, + MLModelDeploymentPropertiesClass, SubTypesClass, ) @@ -65,6 +67,15 @@ def mock_training_job() -> VertexAiResourceNoun: return mock_training_job +@pytest.fixture +def mock_endpoint() -> Endpoint: + mock_endpoint = MagicMock(spec=Endpoint) + mock_endpoint.description = "test endpoint" + mock_endpoint.create_time = datetime.now() + mock_endpoint.display_name = "test endpoint display name" + return mock_endpoint + + @pytest.fixture def project_id() -> str: """ @@ -182,6 +193,23 @@ def test_get_ml_model_properties_workunit( assert aspect.hyperParams is None +def test_get_endpoint_workunit( + source: VertexAISource, + 
mock_endpoint: Endpoint, + mock_model: Model, + model_version: VersionInfo, +) -> None: + for wu in source._get_endpoint_workunit(mock_endpoint, mock_model, model_version): + assert hasattr(wu.metadata, "aspect") + aspect = wu.metadata.aspect + if isinstance(aspect, MLModelDeploymentPropertiesClass): + assert aspect.description == mock_model.description + assert aspect.customProperties == { + "displayName": mock_endpoint.display_name + } + assert aspect.createdAt == int(mock_endpoint.create_time.timestamp() * 1000) + + def test_get_data_process_properties_workunit( source: VertexAISource, mock_training_job: VertexAiResourceNoun ) -> None: @@ -243,6 +271,49 @@ def test_get_data_process_input_workunit( assert len(aspect.inputs) == 1 +def test_vertexai_config_init(): + config_data = { + "project_id": "test-project", + "region": "us-central1", + "bucket_uri": "gs://test-bucket", + "vertexai_url": "https://console.cloud.google.com/vertex-ai", + "credential": { + "private_key_id": "test-key-id", + "private_key": "-----BEGIN PRIVATE KEY-----\ntest-private-key\n-----END PRIVATE KEY-----\n", + "client_email": "test-email@test-project.iam.gserviceaccount.com", + "client_id": "test-client-id", + "auth_uri": "https://accounts.google.com/o/oauth2/auth", + "token_uri": "https://oauth2.googleapis.com/token", + "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs", + "type": "service_account", + }, + } + + config = VertexAIConfig(**config_data) + + assert config.project_id == "test-project" + assert config.region == "us-central1" + assert config.bucket_uri == "gs://test-bucket" + assert config.vertexai_url == "https://console.cloud.google.com/vertex-ai" + assert config.credential is not None + assert config.credential.private_key_id == "test-key-id" + assert ( + config.credential.private_key + == "-----BEGIN PRIVATE KEY-----\ntest-private-key\n-----END PRIVATE KEY-----\n" + ) + assert ( + config.credential.client_email + == "test-email@test-project.iam.gserviceaccount.com" + ) + assert config.credential.client_id == "test-client-id" + assert config.credential.auth_uri == "https://accounts.google.com/o/oauth2/auth" + assert config.credential.token_uri == "https://oauth2.googleapis.com/token" + assert ( + config.credential.auth_provider_x509_cert_url + == "https://www.googleapis.com/oauth2/v1/certs" + ) + + @pytest.mark.skip(reason="Skipping, this is for debugging purpose") def test_real_model_workunit( source: VertexAISource, real_model: Model, model_version: VersionInfo From 4703cd98d289618d3d4d64ba08217e605b357910 Mon Sep 17 00:00:00 2001 From: Ryota Egashira Date: Fri, 28 Feb 2025 10:54:52 -0800 Subject: [PATCH 41/59] increasing unit test coverage --- .../src/datahub/ingestion/source/vertexai.py | 17 +---- .../vertexai/test_vertexai_source.py | 4 + .../tests/unit/test_vertexai_source.py | 76 +++++++++++++++++++ 3 files changed, 82 insertions(+), 15 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py index 5a613588c3c678..d38d265e292af5 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py +++ b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py @@ -3,7 +3,7 @@ import os import tempfile import time -from typing import Any, Dict, Iterable, List, Optional, TypeVar +from typing import Any, Iterable, List, Optional, TypeVar from google.cloud import aiplatform from google.cloud.aiplatform import ( @@ -16,7 +16,7 @@ ) from google.cloud.aiplatform.base import 
VertexAiResourceNoun from google.cloud.aiplatform.models import Model, VersionInfo -from pydantic import PrivateAttr, root_validator +from pydantic import PrivateAttr from pydantic.fields import Field import datahub.emitter.mce_builder as builder @@ -85,14 +85,6 @@ class GCPCredential(ConfigModel): _fix_private_key_newlines = pydantic_multiline_string("private_key") - @root_validator(skip_on_failure=True) - def validate_config(cls, values: Dict[str, Any]) -> Dict[str, Any]: - if values.get("client_x509_cert_url") is None: - values["client_x509_cert_url"] = ( - f"https://www.googleapis.com/robot/v1/metadata/x509/{values['client_email']}" - ) - return values - def create_credential_temp_file(self, project_id: str) -> str: # Adding project_id from the top level config configs = self.dict() @@ -738,11 +730,6 @@ def _make_job_urn(self, job: VertexAiResourceNoun) -> str: urn = builder.make_data_process_instance_urn(dataProcessInstanceId=job_id) return urn - def _make_vertexai_name( - self, entity_type: str, entity_id: str, separator: str = "." - ) -> str: - return f"{self.config.project_id}{separator}{entity_type}{separator}{entity_id}" - def _make_vertexai_model_group_name( self, entity_id: str, separator: str = "." ) -> str: diff --git a/metadata-ingestion/tests/integration/vertexai/test_vertexai_source.py b/metadata-ingestion/tests/integration/vertexai/test_vertexai_source.py index ad0274ed41af31..8ce58607cf1e25 100644 --- a/metadata-ingestion/tests/integration/vertexai/test_vertexai_source.py +++ b/metadata-ingestion/tests/integration/vertexai/test_vertexai_source.py @@ -100,7 +100,9 @@ def mock_training_jobs() -> List[VertexAiResourceNoun]: @patch("google.cloud.aiplatform.AutoMLTextTrainingJob.list") @patch("google.cloud.aiplatform.AutoMLImageTrainingJob.list") @patch("google.cloud.aiplatform.AutoMLVideoTrainingJob.list") +@patch("google.cloud.aiplatform.AutoMLForecastingTrainingJob.list") def test_vertexai_source_ingestion( + mock_automl_forecasting_job_list: List[VertexAiResourceNoun], mock_automl_video_job_list: List[VertexAiResourceNoun], mock_automl_image_list: List[VertexAiResourceNoun], mock_automl_text_job_list: List[VertexAiResourceNoun], @@ -150,6 +152,8 @@ def test_vertexai_source_ingestion( mock_automl_image_list.return_value = [] assert hasattr(mock_automl_video_job_list, "return_value") mock_automl_video_job_list.return_value = [] + assert hasattr(mock_automl_forecasting_job_list, "return_value") + mock_automl_forecasting_job_list.return_value = [] golden_file_path = ( pytestconfig.rootpath / "tests/integration/vertexai/vertexai_mcps_golden.json" diff --git a/metadata-ingestion/tests/unit/test_vertexai_source.py b/metadata-ingestion/tests/unit/test_vertexai_source.py index 2464c4bb28ac22..20c972b524f454 100644 --- a/metadata-ingestion/tests/unit/test_vertexai_source.py +++ b/metadata-ingestion/tests/unit/test_vertexai_source.py @@ -9,8 +9,10 @@ from google.cloud.aiplatform.training_jobs import _TrainingJob from google.protobuf import timestamp_pb2 +from datahub.emitter.mcp_builder import ProjectIdKey from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.source.vertexai import VertexAIConfig, VertexAISource +from datahub.metadata._schema_classes import ContainerClass from datahub.metadata.com.linkedin.pegasus2avro.ml.metadata import ( MLModelGroupProperties, MLModelProperties, @@ -31,6 +33,9 @@ def mock_model() -> Model: mock_model_1.update_time = timestamp_pb2.Timestamp().GetCurrentTime() mock_model_1.version_id = "1" mock_model_1.display_name = 
"mock_prediction_model_1_display_name" + mock_model_1.resource_name = ( + "projects/872197881936/locations/us-west2/models/3583871344875405312" + ) return mock_model_1 @@ -314,6 +319,77 @@ def test_vertexai_config_init(): ) +@patch("google.cloud.aiplatform.CustomJob.list") +@patch("google.cloud.aiplatform.CustomTrainingJob.list") +@patch("google.cloud.aiplatform.CustomContainerTrainingJob.list") +@patch("google.cloud.aiplatform.CustomPythonPackageTrainingJob.list") +@patch("google.cloud.aiplatform.AutoMLTabularTrainingJob.list") +@patch("google.cloud.aiplatform.AutoMLTextTrainingJob.list") +@patch("google.cloud.aiplatform.AutoMLImageTrainingJob.list") +@patch("google.cloud.aiplatform.AutoMLVideoTrainingJob.list") +@patch("google.cloud.aiplatform.AutoMLForecastingTrainingJob.list") +def test_get_training_jobs_workunit( + mock_automl_forecasting_job_list: List[VertexAiResourceNoun], + mock_automl_video_job_list: List[VertexAiResourceNoun], + mock_automl_image_list: List[VertexAiResourceNoun], + mock_automl_text_job_list: List[VertexAiResourceNoun], + mock_automl_tabular_job_list: List[VertexAiResourceNoun], + mock_custom_python_job_list: List[VertexAiResourceNoun], + mock_custom_container_job_list: List[VertexAiResourceNoun], + mock_custom_training_job_list: List[VertexAiResourceNoun], + mock_custom_job_list: List[VertexAiResourceNoun], + source: VertexAISource, + mock_training_job: VertexAiResourceNoun, +) -> None: + assert hasattr(mock_custom_job_list, "return_value") + mock_custom_job_list.return_value = [mock_training_job] + assert hasattr(mock_custom_training_job_list, "return_value") + mock_custom_training_job_list.return_value = [] + assert hasattr(mock_custom_container_job_list, "return_value") + mock_custom_container_job_list.return_value = [] + assert hasattr(mock_custom_python_job_list, "return_value") + mock_custom_python_job_list.return_value = [] + assert hasattr(mock_automl_tabular_job_list, "return_value") + mock_automl_tabular_job_list.return_value = [] + assert hasattr(mock_automl_text_job_list, "return_value") + mock_automl_text_job_list.return_value = [] + assert hasattr(mock_automl_image_list, "return_value") + mock_automl_image_list.return_value = [] + assert hasattr(mock_automl_video_job_list, "return_value") + mock_automl_video_job_list.return_value = [] + assert hasattr(mock_automl_forecasting_job_list, "return_value") + mock_automl_forecasting_job_list.return_value = [] + + container_key = ProjectIdKey( + project_id=source.config.project_id, platform=source.platform + ) + + for wc in source._get_training_jobs_workunit(): + assert hasattr(wc.metadata, "aspect") + aspect = wc.metadata.aspect + if isinstance(aspect, DataProcessInstancePropertiesClass): + assert ( + aspect.name + == f"{source.config.project_id}.job.{mock_training_job.name}" + ) + assert ( + aspect.customProperties["displayName"] == mock_training_job.display_name + ) + if isinstance(aspect, SubTypesClass): + assert aspect.typeNames == ["Training Job"] + + if isinstance(aspect, ContainerClass): + assert aspect.container == container_key.as_urn() + + +def test_make_model_external_url(mock_model: Model, source: VertexAISource) -> None: + assert ( + source._make_model_external_url(mock_model) + == f"{source.config.vertexai_url}/models/locations/{source.config.region}/models/{mock_model.name}" + f"?project={source.config.project_id}" + ) + + @pytest.mark.skip(reason="Skipping, this is for debugging purpose") def test_real_model_workunit( source: VertexAISource, real_model: Model, model_version: VersionInfo 
From ba26abb1656c4070ef56e61870f7c3ed0dd9af40 Mon Sep 17 00:00:00 2001 From: Ryota Egashira Date: Fri, 28 Feb 2025 14:14:50 -0800 Subject: [PATCH 42/59] adding more unit tests --- .../tests/unit/test_vertexai_source.py | 46 ++++++++++++++++++- 1 file changed, 44 insertions(+), 2 deletions(-) diff --git a/metadata-ingestion/tests/unit/test_vertexai_source.py b/metadata-ingestion/tests/unit/test_vertexai_source.py index 20c972b524f454..300ad3c6050096 100644 --- a/metadata-ingestion/tests/unit/test_vertexai_source.py +++ b/metadata-ingestion/tests/unit/test_vertexai_source.py @@ -4,20 +4,22 @@ import pytest from google.cloud import aiplatform +from google.cloud.aiplatform import AutoMLTabularTrainingJob from google.cloud.aiplatform.base import VertexAiResourceNoun from google.cloud.aiplatform.models import Endpoint, Model, VersionInfo from google.cloud.aiplatform.training_jobs import _TrainingJob from google.protobuf import timestamp_pb2 +import datahub.emitter.mce_builder as builder from datahub.emitter.mcp_builder import ProjectIdKey from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.source.vertexai import VertexAIConfig, VertexAISource -from datahub.metadata._schema_classes import ContainerClass from datahub.metadata.com.linkedin.pegasus2avro.ml.metadata import ( MLModelGroupProperties, MLModelProperties, ) from datahub.metadata.schema_classes import ( + ContainerClass, DataProcessInstanceInputClass, DataProcessInstancePropertiesClass, MLModelDeploymentPropertiesClass, @@ -72,6 +74,28 @@ def mock_training_job() -> VertexAiResourceNoun: return mock_training_job +@pytest.fixture +def mock_dataset() -> VertexAiResourceNoun: + mock_training_job = MagicMock(spec=VertexAiResourceNoun) + mock_training_job.name = "mock_dataset" + mock_training_job.create_time = timestamp_pb2.Timestamp().GetCurrentTime() + mock_training_job.update_time = timestamp_pb2.Timestamp().GetCurrentTime() + mock_training_job.display_name = "mock_dataset_display_name" + mock_training_job.description = "mock_dataset_description" + return mock_training_job + + +@pytest.fixture +def mock_training_automl_job() -> AutoMLTabularTrainingJob: + mock_automl_job = MagicMock(spec=AutoMLTabularTrainingJob) + mock_automl_job.name = "mock_auto_automl_tabular_job" + mock_automl_job.create_time = timestamp_pb2.Timestamp().GetCurrentTime() + mock_automl_job.update_time = timestamp_pb2.Timestamp().GetCurrentTime() + mock_automl_job.display_name = "mock_auto_automl_tabular_job_display_name" + mock_automl_job.description = "mock_auto_automl_tabular_job_display_name" + return mock_automl_job + + @pytest.fixture def mock_endpoint() -> Endpoint: mock_endpoint = MagicMock(spec=Endpoint) @@ -340,6 +364,7 @@ def test_get_training_jobs_workunit( mock_custom_job_list: List[VertexAiResourceNoun], source: VertexAISource, mock_training_job: VertexAiResourceNoun, + mock_training_automl_job: AutoMLTabularTrainingJob, ) -> None: assert hasattr(mock_custom_job_list, "return_value") mock_custom_job_list.return_value = [mock_training_job] @@ -350,7 +375,7 @@ def test_get_training_jobs_workunit( assert hasattr(mock_custom_python_job_list, "return_value") mock_custom_python_job_list.return_value = [] assert hasattr(mock_automl_tabular_job_list, "return_value") - mock_automl_tabular_job_list.return_value = [] + mock_automl_tabular_job_list.return_value = [mock_training_automl_job] assert hasattr(mock_automl_text_job_list, "return_value") mock_automl_text_job_list.return_value = [] assert hasattr(mock_automl_image_list, "return_value") 
@@ -364,6 +389,12 @@ def test_get_training_jobs_workunit( project_id=source.config.project_id, platform=source.platform ) + """ + Test the retrieval of training jobs work units from Vertex AI. + This function mocks customJob and AutoMLTabularTrainingJob, + and verifies the properties of the work units + """ + for wc in source._get_training_jobs_workunit(): assert hasattr(wc.metadata, "aspect") aspect = wc.metadata.aspect @@ -371,9 +402,11 @@ def test_get_training_jobs_workunit( assert ( aspect.name == f"{source.config.project_id}.job.{mock_training_job.name}" + or f"{source.config.project_id}.job.{mock_training_automl_job.name}" ) assert ( aspect.customProperties["displayName"] == mock_training_job.display_name + or mock_training_automl_job.display_name ) if isinstance(aspect, SubTypesClass): assert aspect.typeNames == ["Training Job"] @@ -390,6 +423,15 @@ def test_make_model_external_url(mock_model: Model, source: VertexAISource) -> N ) +def test_make_job_urn( + mock_training_job: VertexAiResourceNoun, source: VertexAISource +) -> None: + assert ( + source._make_job_urn(mock_training_job) + == f"{builder.make_data_process_instance_urn(source._make_vertexai_job_name(mock_training_job.name))}" + ) + + @pytest.mark.skip(reason="Skipping, this is for debugging purpose") def test_real_model_workunit( source: VertexAISource, real_model: Model, model_version: VersionInfo From 84ebae09224f4a4951ace0cf9e5104b86e406986 Mon Sep 17 00:00:00 2001 From: Ryota Egashira Date: Sun, 2 Mar 2025 21:39:46 -0800 Subject: [PATCH 43/59] fixed review comments --- .../docs/sources/vertexai/vertexai_recipe.yml | 2 +- .../src/datahub/ingestion/source/vertexai.py | 335 ++++++++---------- .../vertexai/test_vertexai_source.py | 150 +++----- .../vertexai/vertexai_mcps_golden.json | 111 +----- .../tests/unit/test_vertexai_source.py | 10 +- 5 files changed, 210 insertions(+), 398 deletions(-) diff --git a/metadata-ingestion/docs/sources/vertexai/vertexai_recipe.yml b/metadata-ingestion/docs/sources/vertexai/vertexai_recipe.yml index d97d5d82e3e666..d517537cd85eab 100644 --- a/metadata-ingestion/docs/sources/vertexai/vertexai_recipe.yml +++ b/metadata-ingestion/docs/sources/vertexai/vertexai_recipe.yml @@ -4,7 +4,7 @@ source: project_id: "acryl-poc" region: "us-west2" # credential: -# private_key: "private_key" +# private_key: '-----BEGIN PRIVATE KEY-----\\nprivate-key\\n-----END PRIVATE KEY-----\\n' # private_key_id: "project_key_id" # client_email: "client_email" # client_id: "client_id" diff --git a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py index d38d265e292af5..f3586875430181 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py +++ b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py @@ -1,6 +1,5 @@ import json import logging -import os import tempfile import time from typing import Any, Iterable, List, Optional, TypeVar @@ -16,16 +15,17 @@ ) from google.cloud.aiplatform.base import VertexAiResourceNoun from google.cloud.aiplatform.models import Model, VersionInfo +from google.oauth2 import service_account from pydantic import PrivateAttr from pydantic.fields import Field import datahub.emitter.mce_builder as builder -import datahub.emitter.mcp_builder +from datahub._codegen.aspect import _Aspect from datahub.configuration import ConfigModel from datahub.configuration.source_common import EnvConfigMixin from datahub.configuration.validate_multiline_string import pydantic_multiline_string from datahub.emitter.mcp 
import MetadataChangeProposalWrapper -from datahub.emitter.mcp_builder import ProjectIdKey +from datahub.emitter.mcp_builder import ProjectIdKey, gen_containers from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.api.decorators import ( SupportStatus, @@ -123,10 +123,9 @@ def __init__(self, **data: Any): logger.debug( f"Creating temporary credential file at {self._credentials_path}" ) - os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = self._credentials_path -@platform_name("VertexAI") +@platform_name("Vertex AI", id="vertexai") @config_class(VertexAIConfig) @support_status(SupportStatus.TESTING) @capability( @@ -142,7 +141,18 @@ def __init__(self, ctx: PipelineContext, config: VertexAIConfig): super().__init__(ctx) self.config = config self.report = SourceReport() - aiplatform.init(project=config.project_id, location=config.region) + + credentials = ( + service_account.Credentials.from_service_account_file( + self.config._credentials_path + ) + if self.config.credential + else None + ) + + aiplatform.init( + project=config.project_id, location=config.region, credentials=credentials + ) self.client = aiplatform self.endpoints: Optional[List[Endpoint]] = None self.datasets: Optional[dict] = None @@ -158,25 +168,25 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: """ # Ingest Project - yield from self._get_project_workunit() + yield from self._get_project_workunits() # Fetch and Ingest Models, Model Versions a from Model Registry yield from self._get_ml_model_workunits() # Fetch and Ingest Training Jobs yield from self._get_training_jobs_workunit() # TODO Fetch Experiments and Experiment Runs - def _get_project_workunit(self) -> Iterable[MetadataWorkUnit]: + def _get_project_workunits(self) -> Iterable[MetadataWorkUnit]: container_key = ProjectIdKey( project_id=self.config.project_id, platform=self.platform ) - yield from datahub.emitter.mcp_builder.gen_containers( + yield from gen_containers( container_key=container_key, name=self.config.project_id, sub_types=["Project"], ) - def _validate_training_job(self, model: Model) -> bool: + def _has_training_job(self, model: Model) -> bool: """ Validate Model Has Valid Training Job """ @@ -212,12 +222,12 @@ def _get_ml_model_workunits(self) -> Iterable[MetadataWorkUnit]: model_versions = model.versioning_registry.list_versions() for model_version in model_versions: # create work unit for Training Job (if Model has reference to Training Job) - if self._validate_training_job(model): + if self._has_training_job(model): logger.info( f"Ingesting a training job for a model: {model_version.model_display_name}" ) if model.training_job: - yield from self._get_data_process_properties_workunit( + yield from self._get_data_process_properties_workunits( model.training_job ) @@ -238,52 +248,27 @@ def _get_training_jobs_workunit(self) -> Iterable[MetadataWorkUnit]: and AutoMLForecastingTrainingJob. For each job, it generates work units containing metadata about the job, its inputs, and its outputs. 
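A minimal sketch of the credential wiring this hunk introduces: the service-account JSON is written to a temporary file, turned into Credentials, and handed to aiplatform.init(). The helper name, inline temp-file handling, and placeholder arguments are illustrative, not part of the source.

import json
import tempfile

from google.cloud import aiplatform
from google.oauth2 import service_account


def init_vertexai(cred_dict: dict, project_id: str, region: str) -> None:
    # Persist the service-account payload so it can be loaded by file path.
    with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as fp:
        json.dump(cred_dict, fp)
        credentials_path = fp.name
    credentials = service_account.Credentials.from_service_account_file(credentials_path)
    # Initialize the Vertex AI client with the explicit credentials.
    aiplatform.init(project=project_id, location=region, credentials=credentials)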
""" - logger.info("Fetching a list of CustomJobs from VertexAI server") - for job in self.client.CustomJob.list(): - yield from self._get_training_job_workunit(job) - - logger.info("Fetching a list of CustomTrainingJobs from VertexAI server") - for job in self.client.CustomTrainingJob.list(): - yield from self._get_training_job_workunit(job) - - logger.info( - "Fetching a list of CustomContainerTrainingJobs from VertexAI server" - ) - for job in self.client.CustomContainerTrainingJob.list(): - yield from self._get_training_job_workunit(job) - - logger.info( - "Fetching a list of CustomPythonPackageTrainingJob from VertexAI server" - ) - for job in self.client.CustomPythonPackageTrainingJob.list(): - yield from self._get_training_job_workunit(job) - - logger.info("Fetching a list of AutoMLTabularTrainingJobs from VertexAI server") - for job in self.client.AutoMLTabularTrainingJob.list(): - yield from self._get_training_job_workunit(job) - - logger.info("Fetching a list of AutoMLTextTrainingJobs from VertexAI server") - for job in self.client.AutoMLTextTrainingJob.list(): - yield from self._get_training_job_workunit(job) - - logger.info("Fetching a list of AutoMLImageTrainingJobs from VertexAI server") - for job in self.client.AutoMLImageTrainingJob.list(): - yield from self._get_training_job_workunit(job) - - logger.info("Fetching a list of AutoMLVideoTrainingJobs from VertexAI server") - for job in self.client.AutoMLVideoTrainingJob.list(): - yield from self._get_training_job_workunit(job) - - logger.info( - "Fetching a list of AutoMLForecastingTrainingJobs from VertexAI server" - ) - for job in self.client.AutoMLForecastingTrainingJob.list(): - yield from self._get_training_job_workunit(job) + class_names = [ + "CustomJob", + "CustomTrainingJob", + "CustomContainerTrainingJob", + "CustomPythonPackageTrainingJob", + "AutoMLTabularTrainingJob", + "AutoMLTextTrainingJob", + "AutoMLImageTrainingJob", + "AutoMLVideoTrainingJob", + "AutoMLForecastingTrainingJob", + ] + # Iterate over class names and call the list() function + for class_name in class_names: + logger.info(f"Fetching a list of {class_name}s from VertexAI server") + for job in getattr(self.client, class_name).list(): + yield from self._get_training_job_workunit(job) def _get_training_job_workunit( self, job: VertexAiResourceNoun ) -> Iterable[MetadataWorkUnit]: - yield from self._get_data_process_properties_workunit(job) + yield from self._get_data_process_properties_workunits(job) yield from self._get_job_output_workunit(job) yield from self._get_job_input_workunit(job) @@ -323,7 +308,7 @@ def _make_ml_model_group_urn(self, model: Model) -> str: ) return urn - def _get_data_process_properties_workunit( + def _get_data_process_properties_workunits( self, job: VertexAiResourceNoun ) -> Iterable[MetadataWorkUnit]: """ @@ -340,65 +325,45 @@ def _get_data_process_properties_workunit( job_id = self._make_vertexai_job_name(entity_id=job.name) job_urn = builder.make_data_process_instance_urn(job_id) - mcps = [] - mcps.append( - MetadataChangeProposalWrapper( - entityUrn=job_urn, - aspect=DataProcessInstancePropertiesClass( - name=job_id, - created=AuditStampClass( - time=created_time, - actor=created_actor, - ), - externalUrl=self._make_job_external_url(job), - customProperties={ - "displayName": job.display_name, - "jobType": job.__class__.__name__, - }, + aspects: List[_Aspect] = list() + aspects.append( + DataProcessInstancePropertiesClass( + name=job_id, + created=AuditStampClass( + time=created_time, + actor=created_actor, ), + 
externalUrl=self._make_job_external_url(job), + customProperties={ + "displayName": job.display_name, + "jobType": job.__class__.__name__, + }, ) ) - mcps.append( - MetadataChangeProposalWrapper( - entityUrn=job_urn, - aspect=MLTrainingRunProperties( - externalUrl=self._make_job_external_url(job), id=job.name - ), + aspects.append( + MLTrainingRunProperties( + externalUrl=self._make_job_external_url(job), id=job.name ) ) + aspects.append(SubTypesClass(typeNames=["Training Job"])) - mcps.append( - MetadataChangeProposalWrapper( - entityUrn=job_urn, aspect=SubTypesClass(typeNames=["Training Job"]) - ) - ) + aspects.append(ContainerClass(container=self._get_project_container().as_urn())) - # mcps.append( - # MetadataChangeProposalWrapper( - # entityUrn=job_urn, - # aspect=DataProcessInstanceRunEventClass( + # TO BE ADDED + # aspects.append( + # DataProcessInstanceRunEventClass( # status=DataProcessRunStatusClass.COMPLETE, # timestampMillis=0 - # ) # ) - # ) + # } - # Create a container for Project as parent of the dataset - container_key = ProjectIdKey( - project_id=self.config.project_id, platform=self.platform + yield from auto_workunit( + MetadataChangeProposalWrapper.construct_many(job_urn, aspects=aspects) ) - mcps.append( - MetadataChangeProposalWrapper( - entityUrn=job_urn, - aspect=ContainerClass( - container=container_key.as_urn(), - ), - ) - ) - - yield from auto_workunit(mcps) + def _get_project_container(self) -> ProjectIdKey: + return ProjectIdKey(project_id=self.config.project_id, platform=self.platform) def _is_automl_job(self, job: VertexAiResourceNoun) -> bool: return ( @@ -406,7 +371,8 @@ def _is_automl_job(self, job: VertexAiResourceNoun) -> bool: or isinstance(job, AutoMLTextTrainingJob) or isinstance(job, AutoMLImageTrainingJob) or isinstance(job, AutoMLVideoTrainingJob) - ) or isinstance(job, AutoMLForecastingTrainingJob) + or isinstance(job, AutoMLForecastingTrainingJob) + ) def _search_model_version( self, model: Model, version_id: str @@ -453,20 +419,23 @@ def _search_dataset(self, dataset_id: str) -> Optional[VertexAiResourceNoun]: TimeSeries, and Video) to find a dataset that matches the given dataset ID. 
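A condensed sketch of the cached dataset lookup the hunk below adopts, assuming each class under aiplatform.datasets exposes a list() classmethod; the module-level cache and helper name are illustrative only.

from typing import Optional

from google.cloud import aiplatform
from google.cloud.aiplatform.base import VertexAiResourceNoun

DATASET_CLASS_NAMES = [
    "TextDataset", "TabularDataset", "ImageDataset", "TimeSeriesDataset", "VideoDataset",
]
_dataset_cache: dict = {}


def find_dataset(dataset_id: str) -> Optional[VertexAiResourceNoun]:
    # Populate the cache once by listing every dataset type, then look up by id.
    if not _dataset_cache:
        for class_name in DATASET_CLASS_NAMES:
            for ds in getattr(aiplatform.datasets, class_name).list():
                _dataset_cache[ds.name] = ds
    return _dataset_cache.get(dataset_id)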
""" + dataset_types = [ + "TextDataset", + "TabularDataset", + "ImageDataset", + "TimeSeriesDataset", + "VideoDataset", + ] + if self.datasets is None: self.datasets = dict() - for ds in self.client.datasets.TextDataset.list(): - self.datasets[ds.name] = ds - for ds in self.client.datasets.TabularDataset.list(): - self.datasets[ds.name] = ds - for ds in self.client.datasets.ImageDataset.list(): - self.datasets[ds.name] = ds - for ds in self.client.datasets.TimeSeriesDataset.list(): - self.datasets[ds.name] = ds - for ds in self.client.datasets.VideoDataset.list(): - self.datasets[ds.name] = ds - - return self.datasets[dataset_id] if dataset_id in self.datasets else None + + for dtype in dataset_types: + dataset_class = getattr(self.client.datasets, dtype) + for ds in dataset_class.list(): + self.datasets[ds.name] = ds + + return self.datasets.get(dataset_id) def _get_dataset_workunit( self, dataset_urn: str, ds: VertexAiResourceNoun @@ -476,42 +445,27 @@ def _get_dataset_workunit( """ # Create aspects for the dataset - mcps = [] - mcps.append( - MetadataChangeProposalWrapper( - entityUrn=dataset_urn, - aspect=DatasetPropertiesClass( - name=self._make_vertexai_dataset_name(ds.name), - created=TimeStampClass(time=int(ds.create_time.timestamp() * 1000)), - description=f"Dataset: {ds.display_name}", - customProperties={ - "displayName": ds.display_name, - "resourceName": ds.resource_name, - }, - qualifiedName=ds.resource_name, - ), - ) - ) - mcps.append( - MetadataChangeProposalWrapper( - entityUrn=dataset_urn, aspect=SubTypesClass(typeNames=["Dataset"]) + aspects: List[_Aspect] = list() + aspects.append( + DatasetPropertiesClass( + name=self._make_vertexai_dataset_name(ds.name), + created=TimeStampClass(time=int(ds.create_time.timestamp() * 1000)), + description=f"Dataset: {ds.display_name}", + customProperties={ + "displayName": ds.display_name, + "resourceName": ds.resource_name, + }, + qualifiedName=ds.resource_name, ) ) - # Create a container for Project as parent of the dataset - container_key = ProjectIdKey( - project_id=self.config.project_id, platform=self.platform - ) + aspects.append(SubTypesClass(typeNames=["Dataset"])) - mcps.append( - MetadataChangeProposalWrapper( - entityUrn=dataset_urn, - aspect=ContainerClass( - container=container_key.as_urn(), - ), - ) + # Create a container for Project as parent of the dataset + aspects.append(ContainerClass(container=self._get_project_container().as_urn())) + yield from auto_workunit( + MetadataChangeProposalWrapper.construct_many(dataset_urn, aspects=aspects) ) - yield from auto_workunit(mcps) def _get_job_input_workunit( self, job: VertexAiResourceNoun @@ -531,7 +485,7 @@ def _get_job_input_workunit( # Create URN of Input Dataset for Training Job dataset_id = job_conf["inputDataConfig"]["datasetId"] logger.info( - f" found a training job: {job.display_name} used input dataset id: {dataset_id}" + f"Found input dataset (id: {dataset_id}) for training job ({job.display_name})" ) if dataset_id: @@ -564,7 +518,7 @@ def _get_data_process_input_workunit( aspect=DataProcessInstanceInputClass(inputs=[dataset_urn]), ) logger.info( - f" found training job :{job.display_name} used input dataset : {dataset_name}" + f"Found input dataset ({dataset_name}) for training job ({job.display_name})" ) yield from auto_workunit([mcp]) @@ -596,9 +550,7 @@ def _get_endpoint_workunit( MetadataChangeProposalWrapper( entityUrn=endpoint_urn, aspect=ContainerClass( - container=ProjectIdKey( - project_id=self.config.project_id, platform=self.platform - ).as_urn(), + 
container=self._get_project_container().as_urn(), ), ) ) @@ -652,56 +604,53 @@ def _get_ml_model_properties_workunit( ) model_urn = self._make_ml_model_urn(model_version, model_name=model_name) - model_aspect = MLModelPropertiesClass( - name=model_version_name, - description=model_version.version_description, - customProperties={ - "displayName": model_version.model_display_name - + self.model_name_separator - + model_version.version_id, - "resourceName": model.resource_name, - }, - created=TimeStampClass( - int(model_version.version_create_time.timestamp() * 1000) - ) - if model_version.version_create_time - else None, - lastModified=TimeStampClass( - int(model_version.version_update_time.timestamp() * 1000) + aspects: List[_Aspect] = list() + + aspects.append( + MLModelPropertiesClass( + name=model_version_name, + description=model_version.version_description, + customProperties={ + "displayName": model_version.model_display_name + + self.model_name_separator + + model_version.version_id, + "resourceName": model.resource_name, + }, + created=TimeStampClass( + int(model_version.version_create_time.timestamp() * 1000) + ) + if model_version.version_create_time + else None, + lastModified=TimeStampClass( + int(model_version.version_update_time.timestamp() * 1000) + ) + if model_version.version_update_time + else None, + version=VersionTagClass(versionTag=str(model_version.version_id)), + groups=[model_group_urn], # link model version to model group + trainingJobs=[training_job_urn] + if training_job_urn + else None, # link to training job + deployments=[endpoint_urn] + if endpoint_urn + else [], # link to model registry and endpoint + externalUrl=self._make_model_version_external_url(model), + type="ML Model", ) - if model_version.version_update_time - else None, - version=VersionTagClass(versionTag=str(model_version.version_id)), - groups=[model_group_urn], # link model version to model group - trainingJobs=[training_job_urn] - if training_job_urn - else None, # link to training job - deployments=[endpoint_urn] - if endpoint_urn - else [], # link to model registry and endpoint - externalUrl=self._make_model_version_external_url(model), - type="ML Model", - ) - - mcps = [] - # logging.info(f"created model version {ml_model_properties.name} associated with group {ml_model_group_urn}") - mcps.append( - MetadataChangeProposalWrapper(entityUrn=model_urn, aspect=model_aspect) ) - # Create a container for Project as parent of the dataset - # mcps.append( - # MetadataChangeProposalWrapper( - # entityUrn=model_urn, - # aspect=ContainerClass( - # container=ProjectIdKey( - # project_id=self.config.project_id, - # platform=self.platform).as_urn(), - # ), + # TO BE ADDED: Create a container for Project as parent of the dataset + # aspects.append( + # ContainerClass( + # container=self._get_project_container().as_urn(), # ) # ) - yield from auto_workunit(mcps) + yield from auto_workunit( + MetadataChangeProposalWrapper.construct_many( + entityUrn=model_urn, aspects=aspects + ) + ) def _search_endpoint(self, model: Model) -> Optional[Endpoint]: """ diff --git a/metadata-ingestion/tests/integration/vertexai/test_vertexai_source.py b/metadata-ingestion/tests/integration/vertexai/test_vertexai_source.py index 8ce58607cf1e25..fdbcf9a3d1e682 100644 --- a/metadata-ingestion/tests/integration/vertexai/test_vertexai_source.py +++ b/metadata-ingestion/tests/integration/vertexai/test_vertexai_source.py @@ -1,46 +1,36 @@ +import contextlib from pathlib import Path from typing import Any, Dict, List, TypeVar from 
unittest.mock import MagicMock, patch import pytest -from _pytest.config import Config from google.cloud.aiplatform import Model from google.cloud.aiplatform.base import VertexAiResourceNoun from google.protobuf import timestamp_pb2 +from pytest import Config from datahub.ingestion.run.pipeline import Pipeline from tests.test_helpers import mce_helpers T = TypeVar("T") - -@pytest.fixture -def project_id() -> str: - return "test-project-id" - - -@pytest.fixture -def region() -> str: - return "us-west2" - +PROJECT_ID = "test-project-id" +REGION = "us-west2" @pytest.fixture def sink_file_path(tmp_path: Path) -> str: return str(tmp_path / "vertexai_source_mcps.json") -@pytest.fixture -def pipeline_config( - project_id: str, region: str, sink_file_path: str -) -> Dict[str, Any]: +def get_pipeline_config(sink_file_path: str) -> Dict[str, Any]: source_type = "vertexai" return { "run_id": "vertexai-source-test", "source": { "type": source_type, "config": { - "project_id": project_id, - "region": region, + "project_id": PROJECT_ID, + "region": REGION, }, }, "sink": { @@ -85,90 +75,54 @@ def mock_training_jobs() -> List[VertexAiResourceNoun]: return [mock_training_job] -@patch("google.cloud.aiplatform.init") -@patch("google.cloud.aiplatform.Model.list") -@patch("google.cloud.aiplatform.datasets.TextDataset.list") -@patch("google.cloud.aiplatform.datasets.TabularDataset.list") -@patch("google.cloud.aiplatform.datasets.ImageDataset.list") -@patch("google.cloud.aiplatform.datasets.TimeSeriesDataset.list") -@patch("google.cloud.aiplatform.datasets.VideoDataset.list") -@patch("google.cloud.aiplatform.CustomJob.list") -@patch("google.cloud.aiplatform.CustomTrainingJob.list") -@patch("google.cloud.aiplatform.CustomContainerTrainingJob.list") -@patch("google.cloud.aiplatform.CustomPythonPackageTrainingJob.list") -@patch("google.cloud.aiplatform.AutoMLTabularTrainingJob.list") -@patch("google.cloud.aiplatform.AutoMLTextTrainingJob.list") -@patch("google.cloud.aiplatform.AutoMLImageTrainingJob.list") -@patch("google.cloud.aiplatform.AutoMLVideoTrainingJob.list") -@patch("google.cloud.aiplatform.AutoMLForecastingTrainingJob.list") def test_vertexai_source_ingestion( - mock_automl_forecasting_job_list: List[VertexAiResourceNoun], - mock_automl_video_job_list: List[VertexAiResourceNoun], - mock_automl_image_list: List[VertexAiResourceNoun], - mock_automl_text_job_list: List[VertexAiResourceNoun], - mock_automl_tabular_job_list: List[VertexAiResourceNoun], - mock_custom_python_job_list: List[VertexAiResourceNoun], - mock_custom_container_job_list: List[VertexAiResourceNoun], - mock_custom_training_job_list: List[VertexAiResourceNoun], - mock_custom_job_list: List[VertexAiResourceNoun], - mock_video_ds_list: List[VertexAiResourceNoun], - mock_time_series_ds_list: List[VertexAiResourceNoun], - mock_image_ds_list: List[VertexAiResourceNoun], - mock_tabular_ds_list: List[VertexAiResourceNoun], - mock_text_ds_list: List[VertexAiResourceNoun], - mock_model_list: List[Model], - mock_init: MagicMock, pytestconfig: Config, sink_file_path: str, - pipeline_config: Dict[str, Any], mock_models: List[Model], mock_training_jobs: List[VertexAiResourceNoun], ) -> None: - assert hasattr(mock_model_list, "return_value") - mock_model_list.return_value = mock_models - assert hasattr(mock_text_ds_list, "return_value") - mock_text_ds_list.return_value = [] - assert hasattr(mock_tabular_ds_list, "return_value") - mock_tabular_ds_list.return_value = [] - assert hasattr(mock_image_ds_list, "return_value") - mock_image_ds_list.return_value 
= [] - assert hasattr(mock_time_series_ds_list, "return_value") - mock_time_series_ds_list.return_value = [] - assert hasattr(mock_video_ds_list, "return_value") - mock_video_ds_list.return_value = [] - assert hasattr(mock_custom_job_list, "return_value") - mock_custom_job_list.return_value = mock_training_jobs - assert hasattr(mock_custom_training_job_list, "return_value") - mock_custom_training_job_list.return_value = [] - assert hasattr(mock_custom_container_job_list, "return_value") - mock_custom_container_job_list.return_value = [] - assert hasattr(mock_custom_python_job_list, "return_value") - mock_custom_python_job_list.return_value = [] - assert hasattr(mock_automl_tabular_job_list, "return_value") - mock_automl_tabular_job_list.return_value = [] - assert hasattr(mock_automl_text_job_list, "return_value") - mock_automl_text_job_list.return_value = [] - assert hasattr(mock_automl_image_list, "return_value") - mock_automl_image_list.return_value = [] - assert hasattr(mock_automl_video_job_list, "return_value") - mock_automl_video_job_list.return_value = [] - assert hasattr(mock_automl_forecasting_job_list, "return_value") - mock_automl_forecasting_job_list.return_value = [] - - golden_file_path = ( - pytestconfig.rootpath / "tests/integration/vertexai/vertexai_mcps_golden.json" - ) - - print(f"mcps file path: {str(sink_file_path)}") - print(f"golden file path: {str(golden_file_path)}") - - pipeline = Pipeline.create(pipeline_config) - pipeline.run() - pipeline.pretty_print_summary() - pipeline.raise_from_status() - - mce_helpers.check_golden_file( - pytestconfig=pytestconfig, - output_path=sink_file_path, - golden_path=golden_file_path, - ) + mocks = {} + with contextlib.ExitStack() as exit_stack: + for path_to_mock in [ + "google.cloud.aiplatform.init", + "google.cloud.aiplatform.Model.list", + "google.cloud.aiplatform.datasets.TextDataset.list", + "google.cloud.aiplatform.datasets.TabularDataset.list", + "google.cloud.aiplatform.datasets.ImageDataset.list", + "google.cloud.aiplatform.datasets.TimeSeriesDataset.list", + "google.cloud.aiplatform.datasets.VideoDataset.list", + "google.cloud.aiplatform.CustomJob.list", + "google.cloud.aiplatform.CustomTrainingJob.list", + "google.cloud.aiplatform.CustomContainerTrainingJob.list", + "google.cloud.aiplatform.CustomPythonPackageTrainingJob.list", + "google.cloud.aiplatform.AutoMLTabularTrainingJob.list", + "google.cloud.aiplatform.AutoMLTextTrainingJob.list", + "google.cloud.aiplatform.AutoMLImageTrainingJob.list", + "google.cloud.aiplatform.AutoMLVideoTrainingJob.list", + "google.cloud.aiplatform.AutoMLForecastingTrainingJob.list", + ]: + mock = exit_stack.enter_context(patch(path_to_mock)) + if path_to_mock == "google.cloud.aiplatform.Model.list": + mock.return_value = mock_models + else: + mock.return_value = [] + mocks[path_to_mock] = mock + + golden_file_path = ( + pytestconfig.rootpath + / "tests/integration/vertexai/vertexai_mcps_golden.json" + ) + + print(f"mcps file path: {str(sink_file_path)}") + print(f"golden file path: {str(golden_file_path)}") + + pipeline = Pipeline.create(get_pipeline_config(sink_file_path)) + pipeline.run() + pipeline.pretty_print_summary() + pipeline.raise_from_status() + + mce_helpers.check_golden_file( + pytestconfig=pytestconfig, + output_path=sink_file_path, + golden_path=golden_file_path, + ) diff --git a/metadata-ingestion/tests/integration/vertexai/vertexai_mcps_golden.json b/metadata-ingestion/tests/integration/vertexai/vertexai_mcps_golden.json index 45b87513754b65..7dbe1f89f7643a 100644 --- 
a/metadata-ingestion/tests/integration/vertexai/vertexai_mcps_golden.json +++ b/metadata-ingestion/tests/integration/vertexai/vertexai_mcps_golden.json @@ -14,7 +14,7 @@ } }, "systemMetadata": { - "lastObserved": 1740736756728, + "lastObserved": 1740975346432, "runId": "vertexai-source-test", "lastRunId": "no-run-id-provided" } @@ -30,7 +30,7 @@ } }, "systemMetadata": { - "lastObserved": 1740736756729, + "lastObserved": 1740975346433, "runId": "vertexai-source-test", "lastRunId": "no-run-id-provided" } @@ -46,7 +46,7 @@ } }, "systemMetadata": { - "lastObserved": 1740736756729, + "lastObserved": 1740975346433, "runId": "vertexai-source-test", "lastRunId": "no-run-id-provided" } @@ -64,7 +64,7 @@ } }, "systemMetadata": { - "lastObserved": 1740736756729, + "lastObserved": 1740975346433, "runId": "vertexai-source-test", "lastRunId": "no-run-id-provided" } @@ -80,7 +80,7 @@ } }, "systemMetadata": { - "lastObserved": 1740736756730, + "lastObserved": 1740975346433, "runId": "vertexai-source-test", "lastRunId": "no-run-id-provided" } @@ -100,7 +100,7 @@ } }, "systemMetadata": { - "lastObserved": 1740736756731, + "lastObserved": 1740975346434, "runId": "vertexai-source-test", "lastRunId": "no-run-id-provided" } @@ -120,100 +120,7 @@ } }, "systemMetadata": { - "lastObserved": 1740736756732, - "runId": "vertexai-source-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataProcessInstance", - "entityUrn": "urn:li:dataProcessInstance:test-project-id.job.mock_training_job", - "changeType": "UPSERT", - "aspectName": "dataProcessInstanceProperties", - "aspect": { - "json": { - "customProperties": { - "displayName": "mock_training_job_display_name", - "jobType": "VertexAiResourceNoun" - }, - "externalUrl": "https://console.cloud.google.com/vertex-ai/training/training-pipelines?trainingPipelineId=mock_training_job?project=test-project-id", - "name": "test-project-id.job.mock_training_job", - "created": { - "time": 1740736756731, - "actor": "urn:li:platformResource:vertexai" - } - } - }, - "systemMetadata": { - "lastObserved": 1740736758306, - "runId": "vertexai-source-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataProcessInstance", - "entityUrn": "urn:li:dataProcessInstance:test-project-id.job.mock_training_job", - "changeType": "UPSERT", - "aspectName": "mlTrainingRunProperties", - "aspect": { - "json": { - "customProperties": {}, - "externalUrl": "https://console.cloud.google.com/vertex-ai/training/training-pipelines?trainingPipelineId=mock_training_job?project=test-project-id", - "id": "mock_training_job" - } - }, - "systemMetadata": { - "lastObserved": 1740736758308, - "runId": "vertexai-source-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataProcessInstance", - "entityUrn": "urn:li:dataProcessInstance:test-project-id.job.mock_training_job", - "changeType": "UPSERT", - "aspectName": "subTypes", - "aspect": { - "json": { - "typeNames": [ - "Training Job" - ] - } - }, - "systemMetadata": { - "lastObserved": 1740736758309, - "runId": "vertexai-source-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataProcessInstance", - "entityUrn": "urn:li:dataProcessInstance:test-project-id.job.mock_training_job", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:29746a9030349f4340ed74b46913dab6" - } - }, - "systemMetadata": { - "lastObserved": 1740736758311, - "runId": "vertexai-source-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": 
"dataProcessInstance", - "entityUrn": "urn:li:dataProcessInstance:test-project-id.job.mock_training_job", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false - } - }, - "systemMetadata": { - "lastObserved": 1740736758312, + "lastObserved": 1740975346435, "runId": "vertexai-source-test", "lastRunId": "no-run-id-provided" } @@ -229,7 +136,7 @@ } }, "systemMetadata": { - "lastObserved": 1740736758312, + "lastObserved": 1740975346436, "runId": "vertexai-source-test", "lastRunId": "no-run-id-provided" } @@ -245,7 +152,7 @@ } }, "systemMetadata": { - "lastObserved": 1740736758313, + "lastObserved": 1740975346436, "runId": "vertexai-source-test", "lastRunId": "no-run-id-provided" } diff --git a/metadata-ingestion/tests/unit/test_vertexai_source.py b/metadata-ingestion/tests/unit/test_vertexai_source.py index 300ad3c6050096..4891850ce69b28 100644 --- a/metadata-ingestion/tests/unit/test_vertexai_source.py +++ b/metadata-ingestion/tests/unit/test_vertexai_source.py @@ -13,7 +13,10 @@ import datahub.emitter.mce_builder as builder from datahub.emitter.mcp_builder import ProjectIdKey from datahub.ingestion.api.common import PipelineContext -from datahub.ingestion.source.vertexai import VertexAIConfig, VertexAISource +from datahub.ingestion.source.vertexai import ( + VertexAIConfig, + VertexAISource, +) from datahub.metadata.com.linkedin.pegasus2avro.ml.metadata import ( MLModelGroupProperties, MLModelProperties, @@ -242,7 +245,7 @@ def test_get_endpoint_workunit( def test_get_data_process_properties_workunit( source: VertexAISource, mock_training_job: VertexAiResourceNoun ) -> None: - for wu in source._get_data_process_properties_workunit(mock_training_job): + for wu in source._get_data_process_properties_workunits(mock_training_job): assert hasattr(wu.metadata, "aspect") aspect = wu.metadata.aspect if isinstance(aspect, DataProcessInstancePropertiesClass): @@ -394,7 +397,6 @@ def test_get_training_jobs_workunit( This function mocks customJob and AutoMLTabularTrainingJob, and verifies the properties of the work units """ - for wc in source._get_training_jobs_workunit(): assert hasattr(wc.metadata, "aspect") aspect = wc.metadata.aspect @@ -457,7 +459,7 @@ def test_real_model_workunit( def test_real_get_data_process_properties( source: VertexAISource, real_autoML_tabular_job: _TrainingJob ) -> None: - for wu in source._get_data_process_properties_workunit(real_autoML_tabular_job): + for wu in source._get_data_process_properties_workunits(real_autoML_tabular_job): assert hasattr(wu.metadata, "aspect") aspect = wu.metadata.aspect if isinstance(aspect, DataProcessInstancePropertiesClass): From 5472929e18641041ac693eb5c295afe4136b545b Mon Sep 17 00:00:00 2001 From: Ryota Egashira Date: Mon, 3 Mar 2025 11:39:28 -0800 Subject: [PATCH 44/59] fixed review comments, adding unit test cases --- .../docs/sources/vertexai/vertexai_pre.md | 2 +- .../src/datahub/ingestion/source/vertexai.py | 134 ++++----- ...st_vertexai_source.py => test_vertexai.py} | 9 +- .../tests/unit/test_vertexai_source.py | 266 +++++++++--------- 4 files changed, 186 insertions(+), 225 deletions(-) rename metadata-ingestion/tests/integration/vertexai/{test_vertexai_source.py => test_vertexai.py} (95%) diff --git a/metadata-ingestion/docs/sources/vertexai/vertexai_pre.md b/metadata-ingestion/docs/sources/vertexai/vertexai_pre.md index 98047482299a49..73c9fb4454a2c4 100644 --- a/metadata-ingestion/docs/sources/vertexai/vertexai_pre.md +++ b/metadata-ingestion/docs/sources/vertexai/vertexai_pre.md @@ 
-3,7 +3,7 @@ #### Credential to access to GCP 1. Follow the section on credentials to access Vertex AI [GCP docs](https://cloud.google.com/docs/authentication/provide-credentials-adc#how-to). -#### Create a service account in the Extractor Project +#### Create a service account and assign roles 1. Setup a ServiceAccount as per [GCP docs](https://cloud.google.com/iam/docs/creating-managing-service-accounts#iam-service-accounts-create-console) and assign the previously created role to this service account. diff --git a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py index f3586875430181..fca1248f075fbf 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py +++ b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py @@ -85,10 +85,11 @@ class GCPCredential(ConfigModel): _fix_private_key_newlines = pydantic_multiline_string("private_key") - def create_credential_temp_file(self, project_id: str) -> str: + def create_credential_temp_file(self, project_id: Optional[str] = None) -> str: # Adding project_id from the top level config configs = self.dict() - configs["project_id"] = project_id + if project_id: + configs["project_id"] = project_id with tempfile.NamedTemporaryFile(delete=False) as fp: cred_json = json.dumps(configs, indent=4, separators=(",", ": ")) fp.write(cred_json.encode()) @@ -168,14 +169,14 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: """ # Ingest Project - yield from self._get_project_workunits() + yield from self._gen_project_workunits() # Fetch and Ingest Models, Model Versions a from Model Registry - yield from self._get_ml_model_workunits() + yield from self._get_ml_models_workunits() # Fetch and Ingest Training Jobs - yield from self._get_training_jobs_workunit() + yield from self._get_training_jobs_workunits() # TODO Fetch Experiments and Experiment Runs - def _get_project_workunits(self) -> Iterable[MetadataWorkUnit]: + def _gen_project_workunits(self) -> Iterable[MetadataWorkUnit]: container_key = ProjectIdKey( project_id=self.config.project_id, platform=self.platform ) @@ -186,60 +187,25 @@ def _get_project_workunits(self) -> Iterable[MetadataWorkUnit]: sub_types=["Project"], ) - def _has_training_job(self, model: Model) -> bool: - """ - Validate Model Has Valid Training Job - """ - job = model.training_job - if not job: - return False - - try: - # when model has ref to training job, but field is sometimes not accessible and RunTImeError thrown when accessed - # if RunTimeError is not thrown, it is valid and proceed - name = job.name - logger.debug( - ( - f"can fetch training job name: {name} for model: (name:{model.display_name} id:{model.name})" - ) - ) - return True - except RuntimeError: - logger.debug( - f"cannot fetch training job name, not valid for model (name:{model.display_name} id:{model.name})" - ) - - return False - - def _get_ml_model_workunits(self) -> Iterable[MetadataWorkUnit]: + def _get_ml_models_workunits(self) -> Iterable[MetadataWorkUnit]: """ Fetch List of Models in Model Registry and generate a corresponding work unit. 
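A short sketch of the aspect-batching pattern this patch standardizes on: collect the aspects for one entity, wrap them with MetadataChangeProposalWrapper.construct_many, and emit each as a work unit. The entity and container URNs here are placeholders.

from typing import Iterable

from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.ingestion.api.workunit import MetadataWorkUnit
from datahub.metadata.schema_classes import ContainerClass, SubTypesClass


def gen_entity_workunits(entity_urn: str, container_urn: str) -> Iterable[MetadataWorkUnit]:
    # Several aspects for the same URN go out as one batch of MCPs.
    aspects = [
        SubTypesClass(typeNames=["Training Job"]),
        ContainerClass(container=container_urn),
    ]
    for mcp in MetadataChangeProposalWrapper.construct_many(entity_urn, aspects=aspects):
        yield mcp.as_workunit()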
""" registered_models = self.client.Model.list() for model in registered_models: # create work unit for Model Group (= Model in VertexAI) - yield from self._get_ml_group_workunit(model) + yield from self._gen_ml_group_workunits(model) model_versions = model.versioning_registry.list_versions() for model_version in model_versions: - # create work unit for Training Job (if Model has reference to Training Job) - if self._has_training_job(model): - logger.info( - f"Ingesting a training job for a model: {model_version.model_display_name}" - ) - if model.training_job: - yield from self._get_data_process_properties_workunits( - model.training_job - ) - # create work unit for Model (= Model Version in VertexAI) logger.info( f"Ingesting a model (name: {model.display_name} id:{model.name})" ) - yield from self._get_ml_model_endpoint_workunit( + yield from self._gen_ml_model_endpoint_workunits( model=model, model_version=model_version ) - def _get_training_jobs_workunit(self) -> Iterable[MetadataWorkUnit]: + def _get_training_jobs_workunits(self) -> Iterable[MetadataWorkUnit]: """ Fetches training jobs from Vertex AI and generates corresponding work units. This method retrieves various types of training jobs from Vertex AI, including @@ -263,16 +229,16 @@ def _get_training_jobs_workunit(self) -> Iterable[MetadataWorkUnit]: for class_name in class_names: logger.info(f"Fetching a list of {class_name}s from VertexAI server") for job in getattr(self.client, class_name).list(): - yield from self._get_training_job_workunit(job) + yield from self._get_training_job_workunits(job) - def _get_training_job_workunit( + def _get_training_job_workunits( self, job: VertexAiResourceNoun ) -> Iterable[MetadataWorkUnit]: - yield from self._get_data_process_properties_workunits(job) - yield from self._get_job_output_workunit(job) - yield from self._get_job_input_workunit(job) + yield from self._generate_data_process_workunits(job) + yield from self._get_job_output_workunits(job) + yield from self._get_job_input_workunits(job) - def _get_ml_group_workunit( + def _gen_ml_group_workunits( self, model: Model, ) -> Iterable[MetadataWorkUnit]: @@ -308,7 +274,7 @@ def _make_ml_model_group_urn(self, model: Model) -> str: ) return urn - def _get_data_process_properties_workunits( + def _generate_data_process_workunits( self, job: VertexAiResourceNoun ) -> Iterable[MetadataWorkUnit]: """ @@ -382,7 +348,7 @@ def _search_model_version( return version return None - def _get_job_output_workunit( + def _get_job_output_workunits( self, job: VertexAiResourceNoun ) -> Iterable[MetadataWorkUnit]: """ @@ -408,7 +374,7 @@ def _get_job_output_workunit( f" found a training job: {job.display_name} generated " f"a model (name:{model.display_name} id:{model_version_str})" ) - yield from self._get_ml_model_endpoint_workunit( + yield from self._gen_ml_model_endpoint_workunits( model, model_version, job_urn ) @@ -437,7 +403,7 @@ def _search_dataset(self, dataset_id: str) -> Optional[VertexAiResourceNoun]: return self.datasets.get(dataset_id) - def _get_dataset_workunit( + def _get_dataset_workunits( self, dataset_urn: str, ds: VertexAiResourceNoun ) -> Iterable[MetadataWorkUnit]: """ @@ -449,7 +415,9 @@ def _get_dataset_workunit( aspects.append( DatasetPropertiesClass( name=self._make_vertexai_dataset_name(ds.name), - created=TimeStampClass(time=int(ds.create_time.timestamp() * 1000)), + created=TimeStampClass(time=int(ds.create_time.timestamp() * 1000)) + if ds.create_time + else None, description=f"Dataset: {ds.display_name}", customProperties={ 
"displayName": ds.display_name, @@ -467,7 +435,7 @@ def _get_dataset_workunit( MetadataChangeProposalWrapper.construct_many(dataset_urn, aspects=aspects) ) - def _get_job_input_workunit( + def _get_job_input_workunits( self, job: VertexAiResourceNoun ) -> Iterable[MetadataWorkUnit]: """ @@ -489,9 +457,9 @@ def _get_job_input_workunit( ) if dataset_id: - yield from self._get_data_process_input_workunit(job, dataset_id) + yield from self._gen_input_dataset_workunits(job, dataset_id) - def _get_data_process_input_workunit( + def _gen_input_dataset_workunits( self, job: VertexAiResourceNoun, dataset_id: str ) -> Iterable[MetadataWorkUnit]: """ @@ -510,7 +478,7 @@ def _get_data_process_input_workunit( dataset = self._search_dataset(dataset_id) if dataset_id else None if dataset: - yield from self._get_dataset_workunit(dataset_urn=dataset_urn, ds=dataset) + yield from self._get_dataset_workunits(dataset_urn=dataset_urn, ds=dataset) # Create URN of Training Job job_id = self._make_vertexai_job_name(entity_id=job.name) mcp = MetadataChangeProposalWrapper( @@ -522,7 +490,7 @@ def _get_data_process_input_workunit( ) yield from auto_workunit([mcp]) - def _get_endpoint_workunit( + def _gen_endpoint_workunits( self, endpoint: Endpoint, model: Model, model_version: VersionInfo ) -> Iterable[MetadataWorkUnit]: endpoint_urn = builder.make_ml_model_deployment_urn( @@ -532,38 +500,30 @@ def _get_endpoint_workunit( ), env=self.config.env, ) - deployment_aspect = MLModelDeploymentPropertiesClass( - description=model.description, - createdAt=int(endpoint.create_time.timestamp() * 1000), - version=VersionTagClass(versionTag=str(model_version.version_id)), - customProperties={"displayName": endpoint.display_name}, - ) - mcps = [] - mcps.append( - MetadataChangeProposalWrapper( - entityUrn=endpoint_urn, aspect=deployment_aspect + aspects: List[_Aspect] = list() + aspects.append( + MLModelDeploymentPropertiesClass( + description=model.description, + createdAt=int(endpoint.create_time.timestamp() * 1000), + version=VersionTagClass(versionTag=str(model_version.version_id)), + customProperties={"displayName": endpoint.display_name}, ) ) - mcps.append( - MetadataChangeProposalWrapper( - entityUrn=endpoint_urn, - aspect=ContainerClass( - container=self._get_project_container().as_urn(), - ), + aspects.append( + ContainerClass( + container=self._get_project_container().as_urn(), ) ) - mcps.append( - MetadataChangeProposalWrapper( - entityUrn=endpoint_urn, aspect=SubTypesClass(typeNames=["Endpoint"]) - ) - ) + aspects.append(SubTypesClass(typeNames=["Endpoint"])) - yield from auto_workunit(mcps) + yield from auto_workunit( + MetadataChangeProposalWrapper.construct_many(endpoint_urn, aspects=aspects) + ) - def _get_ml_model_endpoint_workunit( + def _gen_ml_model_endpoint_workunits( self, model: Model, model_version: VersionInfo, @@ -577,13 +537,13 @@ def _get_ml_model_endpoint_workunit( endpoint_urn = None if endpoint: - yield from self._get_endpoint_workunit(endpoint, model, model_version) + yield from self._gen_endpoint_workunits(endpoint, model, model_version) - yield from self._get_ml_model_properties_workunit( + yield from self._gen_ml_model_workunits( model, model_version, training_job_urn, endpoint_urn ) - def _get_ml_model_properties_workunit( + def _gen_ml_model_workunits( self, model: Model, model_version: VersionInfo, diff --git a/metadata-ingestion/tests/integration/vertexai/test_vertexai_source.py b/metadata-ingestion/tests/integration/vertexai/test_vertexai.py similarity index 95% rename from 
metadata-ingestion/tests/integration/vertexai/test_vertexai_source.py rename to metadata-ingestion/tests/integration/vertexai/test_vertexai.py index fdbcf9a3d1e682..9ada49a556b9fb 100644 --- a/metadata-ingestion/tests/integration/vertexai/test_vertexai_source.py +++ b/metadata-ingestion/tests/integration/vertexai/test_vertexai.py @@ -17,6 +17,7 @@ PROJECT_ID = "test-project-id" REGION = "us-west2" + @pytest.fixture def sink_file_path(tmp_path: Path) -> str: return str(tmp_path / "vertexai_source_mcps.json") @@ -81,9 +82,8 @@ def test_vertexai_source_ingestion( mock_models: List[Model], mock_training_jobs: List[VertexAiResourceNoun], ) -> None: - mocks = {} with contextlib.ExitStack() as exit_stack: - for path_to_mock in [ + for func_to_mock in [ "google.cloud.aiplatform.init", "google.cloud.aiplatform.Model.list", "google.cloud.aiplatform.datasets.TextDataset.list", @@ -101,12 +101,11 @@ def test_vertexai_source_ingestion( "google.cloud.aiplatform.AutoMLVideoTrainingJob.list", "google.cloud.aiplatform.AutoMLForecastingTrainingJob.list", ]: - mock = exit_stack.enter_context(patch(path_to_mock)) - if path_to_mock == "google.cloud.aiplatform.Model.list": + mock = exit_stack.enter_context(patch(func_to_mock)) + if func_to_mock == "google.cloud.aiplatform.Model.list": mock.return_value = mock_models else: mock.return_value = [] - mocks[path_to_mock] = mock golden_file_path = ( pytestconfig.rootpath diff --git a/metadata-ingestion/tests/unit/test_vertexai_source.py b/metadata-ingestion/tests/unit/test_vertexai_source.py index 4891850ce69b28..70692a7ac4860e 100644 --- a/metadata-ingestion/tests/unit/test_vertexai_source.py +++ b/metadata-ingestion/tests/unit/test_vertexai_source.py @@ -1,3 +1,5 @@ +import contextlib +import json from datetime import datetime from typing import List from unittest.mock import MagicMock, patch @@ -29,6 +31,9 @@ SubTypesClass, ) +PROJECT_ID = "acryl-poc" +REGION = "us-west2" + @pytest.fixture def mock_model() -> Model: @@ -79,13 +84,13 @@ def mock_training_job() -> VertexAiResourceNoun: @pytest.fixture def mock_dataset() -> VertexAiResourceNoun: - mock_training_job = MagicMock(spec=VertexAiResourceNoun) - mock_training_job.name = "mock_dataset" - mock_training_job.create_time = timestamp_pb2.Timestamp().GetCurrentTime() - mock_training_job.update_time = timestamp_pb2.Timestamp().GetCurrentTime() - mock_training_job.display_name = "mock_dataset_display_name" - mock_training_job.description = "mock_dataset_description" - return mock_training_job + mock_dataset = MagicMock(spec=VertexAiResourceNoun) + mock_dataset.name = "mock_dataset" + mock_dataset.create_time = timestamp_pb2.Timestamp().GetCurrentTime() + mock_dataset.update_time = timestamp_pb2.Timestamp().GetCurrentTime() + mock_dataset.display_name = "mock_dataset_display_name" + mock_dataset.description = "mock_dataset_description" + return mock_dataset @pytest.fixture @@ -109,26 +114,10 @@ def mock_endpoint() -> Endpoint: @pytest.fixture -def project_id() -> str: - """ - Replace with your GCP Project ID - """ - return "acryl-poc" - - -@pytest.fixture -def region() -> str: - """ - Replace with your GCP region s - """ - return "us-west2" - - -@pytest.fixture -def source(project_id: str, region: str) -> VertexAISource: +def source() -> VertexAISource: return VertexAISource( ctx=PipelineContext(run_id="vertexai-source-test"), - config=VertexAIConfig(project_id=project_id, region=region), + config=VertexAIConfig(project_id=PROJECT_ID, region=REGION), ) @@ -185,7 +174,7 @@ def test_get_ml_model_workunits( assert 
hasattr(mock_list, "return_value") # this check needed to go ground lint mock_list.return_value = mock_models - wcs = [wc for wc in source._get_ml_model_workunits()] + wcs = [wc for wc in source._get_ml_models_workunits()] assert len(wcs) == 2 # aspect is MLModelGroupPropertiesClass @@ -209,9 +198,7 @@ def test_get_ml_model_workunits( def test_get_ml_model_properties_workunit( source: VertexAISource, mock_model: Model, model_version: VersionInfo ) -> None: - wu = [ - wu for wu in source._get_ml_model_properties_workunit(mock_model, model_version) - ] + wu = [wu for wu in source._gen_ml_model_workunits(mock_model, model_version)] assert len(wu) == 1 assert hasattr(wu[0].metadata, "aspect") aspect = wu[0].metadata.aspect @@ -231,7 +218,7 @@ def test_get_endpoint_workunit( mock_model: Model, model_version: VersionInfo, ) -> None: - for wu in source._get_endpoint_workunit(mock_endpoint, mock_model, model_version): + for wu in source._gen_endpoint_workunits(mock_endpoint, mock_model, model_version): assert hasattr(wu.metadata, "aspect") aspect = wu.metadata.aspect if isinstance(aspect, MLModelDeploymentPropertiesClass): @@ -240,12 +227,17 @@ def test_get_endpoint_workunit( "displayName": mock_endpoint.display_name } assert aspect.createdAt == int(mock_endpoint.create_time.timestamp() * 1000) + elif isinstance(aspect, ContainerClass): + assert aspect.container == source._get_project_container().as_urn() + + elif isinstance(aspect, SubTypesClass): + assert aspect.typeNames == ["Endpoint"] def test_get_data_process_properties_workunit( source: VertexAISource, mock_training_job: VertexAiResourceNoun ) -> None: - for wu in source._get_data_process_properties_workunits(mock_training_job): + for wu in source._generate_data_process_workunits(mock_training_job): assert hasattr(wu.metadata, "aspect") aspect = wu.metadata.aspect if isinstance(aspect, DataProcessInstancePropertiesClass): @@ -256,51 +248,37 @@ def test_get_data_process_properties_workunit( assert aspect.externalUrl == source._make_job_external_url( mock_training_job ) + assert ( + aspect.customProperties["displayName"] == mock_training_job.display_name + ) elif isinstance(aspect, SubTypesClass): assert "Training Job" in aspect.typeNames -@patch("google.cloud.aiplatform.datasets.TextDataset.list") -@patch("google.cloud.aiplatform.datasets.TabularDataset.list") -@patch("google.cloud.aiplatform.datasets.ImageDataset.list") -@patch("google.cloud.aiplatform.datasets.TimeSeriesDataset.list") -@patch("google.cloud.aiplatform.datasets.VideoDataset.list") def test_get_data_process_input_workunit( - mock_text_list: List[VertexAiResourceNoun], - mock_tabular_list: List[VertexAiResourceNoun], - mock_image_list: List[VertexAiResourceNoun], - mock_time_series_list: List[VertexAiResourceNoun], - mock_video_list: List[VertexAiResourceNoun], source: VertexAISource, mock_training_job: VertexAiResourceNoun, ) -> None: - # Mocking all the dataset list - assert hasattr( - mock_text_list, "return_value" - ) # this check needed to go ground lint - mock_text_list.return_value = [] - assert hasattr( - mock_tabular_list, "return_value" - ) # this check needed to go ground lint - mock_tabular_list.return_value = [] - assert hasattr( - mock_video_list, "return_value" - ) # this check needed to go ground lint - mock_video_list.return_value = [] - assert hasattr( - mock_time_series_list, "return_value" - ) # this check needed to go ground lint - mock_time_series_list.return_value = [] - assert hasattr( - mock_image_list, "return_value" - ) # this check needed to go 
ground lint - mock_image_list.return_value = [] - - for wu in source._get_data_process_input_workunit(mock_training_job, "12345"): - assert hasattr(wu.metadata, "aspect") - aspect = wu.metadata.aspect - assert isinstance(aspect, DataProcessInstanceInputClass) - assert len(aspect.inputs) == 1 + with contextlib.ExitStack() as exit_stack: + for func_to_mock in [ + "google.cloud.aiplatform.init", + "google.cloud.aiplatform.datasets.TextDataset.list", + "google.cloud.aiplatform.datasets.TabularDataset.list", + "google.cloud.aiplatform.datasets.ImageDataset.list", + "google.cloud.aiplatform.datasets.TimeSeriesDataset.list", + "google.cloud.aiplatform.datasets.VideoDataset.list", + ]: + mock = exit_stack.enter_context(patch(func_to_mock)) + if func_to_mock == "google.cloud.aiplatform.CustomJob.list": + mock.return_value = [mock_training_job] + else: + mock.return_value = [] + + for wu in source._gen_input_dataset_workunits(mock_training_job, "12345"): + assert hasattr(wu.metadata, "aspect") + aspect = wu.metadata.aspect + assert isinstance(aspect, DataProcessInstanceInputClass) + assert len(aspect.inputs) == 1 def test_vertexai_config_init(): @@ -345,76 +323,100 @@ def test_vertexai_config_init(): == "https://www.googleapis.com/oauth2/v1/certs" ) + assert config._credentials_path is not None + with open(config._credentials_path, "r") as file: + content = json.loads(file.read()) + assert content["project_id"] == "test-project" + assert content["private_key_id"] == "test-key-id" + assert content["private_key_id"] == "test-key-id" + assert ( + content["private_key"] + == "-----BEGIN PRIVATE KEY-----\ntest-private-key\n-----END PRIVATE KEY-----\n" + ) + assert ( + content["client_email"] == "test-email@test-project.iam.gserviceaccount.com" + ) + assert content["client_id"] == "test-client-id" + assert content["auth_uri"] == "https://accounts.google.com/o/oauth2/auth" + assert content["token_uri"] == "https://oauth2.googleapis.com/token" + assert ( + content["auth_provider_x509_cert_url"] + == "https://www.googleapis.com/oauth2/v1/certs" + ) + -@patch("google.cloud.aiplatform.CustomJob.list") -@patch("google.cloud.aiplatform.CustomTrainingJob.list") -@patch("google.cloud.aiplatform.CustomContainerTrainingJob.list") -@patch("google.cloud.aiplatform.CustomPythonPackageTrainingJob.list") -@patch("google.cloud.aiplatform.AutoMLTabularTrainingJob.list") -@patch("google.cloud.aiplatform.AutoMLTextTrainingJob.list") -@patch("google.cloud.aiplatform.AutoMLImageTrainingJob.list") -@patch("google.cloud.aiplatform.AutoMLVideoTrainingJob.list") -@patch("google.cloud.aiplatform.AutoMLForecastingTrainingJob.list") def test_get_training_jobs_workunit( - mock_automl_forecasting_job_list: List[VertexAiResourceNoun], - mock_automl_video_job_list: List[VertexAiResourceNoun], - mock_automl_image_list: List[VertexAiResourceNoun], - mock_automl_text_job_list: List[VertexAiResourceNoun], - mock_automl_tabular_job_list: List[VertexAiResourceNoun], - mock_custom_python_job_list: List[VertexAiResourceNoun], - mock_custom_container_job_list: List[VertexAiResourceNoun], - mock_custom_training_job_list: List[VertexAiResourceNoun], - mock_custom_job_list: List[VertexAiResourceNoun], source: VertexAISource, mock_training_job: VertexAiResourceNoun, mock_training_automl_job: AutoMLTabularTrainingJob, ) -> None: - assert hasattr(mock_custom_job_list, "return_value") - mock_custom_job_list.return_value = [mock_training_job] - assert hasattr(mock_custom_training_job_list, "return_value") - mock_custom_training_job_list.return_value = [] 
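A minimal sketch of the ExitStack-based patching these tests switch to, which replaces the long stacks of @patch decorators; the patch targets and empty return values are illustrative.

import contextlib
from unittest.mock import patch


def test_training_jobs_are_listed() -> None:
    targets = [
        "google.cloud.aiplatform.init",
        "google.cloud.aiplatform.CustomJob.list",
        "google.cloud.aiplatform.AutoMLTabularTrainingJob.list",
    ]
    with contextlib.ExitStack() as exit_stack:
        # enter_context keeps every mock alive until the with-block exits.
        mocks = {target: exit_stack.enter_context(patch(target)) for target in targets}
        for target, mock in mocks.items():
            if target != "google.cloud.aiplatform.init":
                mock.return_value = []
        # ... construct the source and assert on its emitted work units here ...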
- assert hasattr(mock_custom_container_job_list, "return_value") - mock_custom_container_job_list.return_value = [] - assert hasattr(mock_custom_python_job_list, "return_value") - mock_custom_python_job_list.return_value = [] - assert hasattr(mock_automl_tabular_job_list, "return_value") - mock_automl_tabular_job_list.return_value = [mock_training_automl_job] - assert hasattr(mock_automl_text_job_list, "return_value") - mock_automl_text_job_list.return_value = [] - assert hasattr(mock_automl_image_list, "return_value") - mock_automl_image_list.return_value = [] - assert hasattr(mock_automl_video_job_list, "return_value") - mock_automl_video_job_list.return_value = [] - assert hasattr(mock_automl_forecasting_job_list, "return_value") - mock_automl_forecasting_job_list.return_value = [] - - container_key = ProjectIdKey( - project_id=source.config.project_id, platform=source.platform + with contextlib.ExitStack() as exit_stack: + for func_to_mock in [ + "google.cloud.aiplatform.init", + "google.cloud.aiplatform.CustomJob.list", + "google.cloud.aiplatform.CustomTrainingJob.list", + "google.cloud.aiplatform.CustomContainerTrainingJob.list", + "google.cloud.aiplatform.CustomPythonPackageTrainingJob.list", + "google.cloud.aiplatform.AutoMLTabularTrainingJob.list", + "google.cloud.aiplatform.AutoMLImageTrainingJob.list", + "google.cloud.aiplatform.AutoMLTextTrainingJob.list", + "google.cloud.aiplatform.AutoMLVideoTrainingJob.list", + "google.cloud.aiplatform.AutoMLForecastingTrainingJob.list", + ]: + mock = exit_stack.enter_context(patch(func_to_mock)) + if func_to_mock == "google.cloud.aiplatform.CustomJob.list": + mock.return_value = [mock_training_job] + else: + mock.return_value = [] + + container_key = ProjectIdKey( + project_id=source.config.project_id, platform=source.platform + ) + + """ + Test the retrieval of training jobs work units from Vertex AI. + This function mocks customJob and AutoMLTabularTrainingJob, + and verifies the properties of the work units + """ + for wc in source._get_training_jobs_workunits(): + assert hasattr(wc.metadata, "aspect") + aspect = wc.metadata.aspect + if isinstance(aspect, DataProcessInstancePropertiesClass): + assert ( + aspect.name + == f"{source.config.project_id}.job.{mock_training_job.name}" + or f"{source.config.project_id}.job.{mock_training_automl_job.name}" + ) + assert ( + aspect.customProperties["displayName"] + == mock_training_job.display_name + or mock_training_automl_job.display_name + ) + if isinstance(aspect, SubTypesClass): + assert aspect.typeNames == ["Training Job"] + + if isinstance(aspect, ContainerClass): + assert aspect.container == container_key.as_urn() + + +def test_get_dataset_workunit( + mock_dataset: VertexAiResourceNoun, source: VertexAISource +) -> None: + dataset_urn = builder.make_dataset_urn( + platform=source.platform, + name=mock_dataset.name, + env=source.config.env, ) - - """ - Test the retrieval of training jobs work units from Vertex AI. 
- This function mocks customJob and AutoMLTabularTrainingJob, - and verifies the properties of the work units - """ - for wc in source._get_training_jobs_workunit(): - assert hasattr(wc.metadata, "aspect") - aspect = wc.metadata.aspect + for wu in source._get_dataset_workunits(dataset_urn=dataset_urn, ds=mock_dataset): + assert hasattr(wu.metadata, "aspect") + aspect = wu.metadata.aspect if isinstance(aspect, DataProcessInstancePropertiesClass): - assert ( - aspect.name - == f"{source.config.project_id}.job.{mock_training_job.name}" - or f"{source.config.project_id}.job.{mock_training_automl_job.name}" - ) - assert ( - aspect.customProperties["displayName"] == mock_training_job.display_name - or mock_training_automl_job.display_name - ) - if isinstance(aspect, SubTypesClass): - assert aspect.typeNames == ["Training Job"] - - if isinstance(aspect, ContainerClass): - assert aspect.container == container_key.as_urn() + assert aspect.name == f"{source._make_vertexai_job_name(mock_dataset.name)}" + assert aspect.customProperties["displayName"] == mock_dataset.display_name + elif isinstance(aspect, ContainerClass): + assert aspect.container == source._get_project_container().as_urn() + elif isinstance(aspect, SubTypesClass): + assert aspect.typeNames == ["Dataset"] def test_make_model_external_url(mock_model: Model, source: VertexAISource) -> None: @@ -442,7 +444,7 @@ def test_real_model_workunit( Disabled as default Use real model registered in the Vertex AI Model Registry """ - for wu in source._get_ml_model_properties_workunit( + for wu in source._gen_ml_model_workunits( model=real_model, model_version=model_version ): assert hasattr(wu.metadata, "aspect") @@ -459,7 +461,7 @@ def test_real_model_workunit( def test_real_get_data_process_properties( source: VertexAISource, real_autoML_tabular_job: _TrainingJob ) -> None: - for wu in source._get_data_process_properties_workunits(real_autoML_tabular_job): + for wu in source._generate_data_process_workunits(real_autoML_tabular_job): assert hasattr(wu.metadata, "aspect") aspect = wu.metadata.aspect if isinstance(aspect, DataProcessInstancePropertiesClass): From 0eeeb7281e6b5c1fc09c39a469f12f0d353424a5 Mon Sep 17 00:00:00 2001 From: Ryota Egashira Date: Mon, 3 Mar 2025 12:06:50 -0800 Subject: [PATCH 45/59] minor change --- .../src/datahub/ingestion/source/vertexai.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py index fca1248f075fbf..2e0255f1158a9f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py +++ b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py @@ -371,8 +371,8 @@ def _get_job_output_workunits( model_version = self._search_model_version(model, model_version_str) if model and model_version: logger.info( - f" found a training job: {job.display_name} generated " - f"a model (name:{model.display_name} id:{model_version_str})" + f"Found output model (name:{model.display_name} id:{model_version_str}) " + f"for training job: {job.display_name}" ) yield from self._gen_ml_model_endpoint_workunits( model, model_version, job_urn @@ -486,9 +486,13 @@ def _gen_input_dataset_workunits( aspect=DataProcessInstanceInputClass(inputs=[dataset_urn]), ) logger.info( - f"Found input dataset ({dataset_name}) for training job ({job.display_name})" + f"Found the name of input dataset ({dataset_name}) with dataset id ({dataset_id})" ) yield from auto_workunit([mcp]) + else: + 
logger.error( + f"Unable to find the name of input dataset ({dataset_name}) with dataset id ({dataset_id})" + ) def _gen_endpoint_workunits( self, endpoint: Endpoint, model: Model, model_version: VersionInfo From 6c43ecc51c22c267923d7551436872f1982b503f Mon Sep 17 00:00:00 2001 From: Ryota Egashira Date: Mon, 3 Mar 2025 12:27:28 -0800 Subject: [PATCH 46/59] Change BigQueryCredentail to common function: GCPCredential --- .../source/bigquery_v2/bigquery_config.py | 48 +---------------- .../ingestion/source/common/credentials.py | 53 +++++++++++++++++++ .../src/datahub/ingestion/source/vertexai.py | 42 +-------------- .../integration/fivetran/test_fivetran.py | 4 +- 4 files changed, 58 insertions(+), 89 deletions(-) create mode 100644 metadata-ingestion/src/datahub/ingestion/source/common/credentials.py diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py index 57bfa2e3090d31..c93667c852e043 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py @@ -1,8 +1,6 @@ -import json import logging import os import re -import tempfile from datetime import timedelta from typing import Any, Dict, List, Optional, Union @@ -17,10 +15,10 @@ PlatformInstanceConfigMixin, ) from datahub.configuration.validate_field_removal import pydantic_removed_field -from datahub.configuration.validate_multiline_string import pydantic_multiline_string from datahub.ingestion.glossary.classification_mixin import ( ClassificationSourceConfigMixin, ) +from datahub.ingestion.source.common.credentials import GCPCredential from datahub.ingestion.source.data_lake_common.path_spec import PathSpec from datahub.ingestion.source.sql.sql_config import SQLCommonConfig, SQLFilterConfig from datahub.ingestion.source.state.stateful_ingestion_base import ( @@ -107,50 +105,8 @@ class BigQueryUsageConfig(BaseUsageConfig): ) -class BigQueryCredential(ConfigModel): - project_id: str = Field(description="Project id to set the credentials") - private_key_id: str = Field(description="Private key id") - private_key: str = Field( - description="Private key in a form of '-----BEGIN PRIVATE KEY-----\\nprivate-key\\n-----END PRIVATE KEY-----\\n'" - ) - client_email: str = Field(description="Client email") - client_id: str = Field(description="Client Id") - auth_uri: str = Field( - default="https://accounts.google.com/o/oauth2/auth", - description="Authentication uri", - ) - token_uri: str = Field( - default="https://oauth2.googleapis.com/token", description="Token uri" - ) - auth_provider_x509_cert_url: str = Field( - default="https://www.googleapis.com/oauth2/v1/certs", - description="Auth provider x509 certificate url", - ) - type: str = Field(default="service_account", description="Authentication type") - client_x509_cert_url: Optional[str] = Field( - default=None, - description="If not set it will be default to https://www.googleapis.com/robot/v1/metadata/x509/client_email", - ) - - _fix_private_key_newlines = pydantic_multiline_string("private_key") - - @root_validator(skip_on_failure=True) - def validate_config(cls, values: Dict[str, Any]) -> Dict[str, Any]: - if values.get("client_x509_cert_url") is None: - values["client_x509_cert_url"] = ( - f"https://www.googleapis.com/robot/v1/metadata/x509/{values['client_email']}" - ) - return values - - def create_credential_temp_file(self) -> str: - with 
tempfile.NamedTemporaryFile(delete=False) as fp: - cred_json = json.dumps(self.dict(), indent=4, separators=(",", ": ")) - fp.write(cred_json.encode()) - return fp.name - - class BigQueryConnectionConfig(ConfigModel): - credential: Optional[BigQueryCredential] = Field( + credential: Optional[GCPCredential] = Field( default=None, description="BigQuery credential informations" ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/common/credentials.py b/metadata-ingestion/src/datahub/ingestion/source/common/credentials.py new file mode 100644 index 00000000000000..66b20485cb2d50 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/common/credentials.py @@ -0,0 +1,53 @@ +import json +import tempfile +from typing import Any, Dict, Optional + +from pydantic import Field, root_validator + +from datahub.configuration import ConfigModel +from datahub.configuration.validate_multiline_string import pydantic_multiline_string + + +class GCPCredential(ConfigModel): + project_id: Optional[str] = Field(description="Project id to set the credentials") + private_key_id: str = Field(description="Private key id") + private_key: str = Field( + description="Private key in a form of '-----BEGIN PRIVATE KEY-----\\nprivate-key\\n-----END PRIVATE KEY-----\\n'" + ) + client_email: str = Field(description="Client email") + client_id: str = Field(description="Client Id") + auth_uri: str = Field( + default="https://accounts.google.com/o/oauth2/auth", + description="Authentication uri", + ) + token_uri: str = Field( + default="https://oauth2.googleapis.com/token", description="Token uri" + ) + auth_provider_x509_cert_url: str = Field( + default="https://www.googleapis.com/oauth2/v1/certs", + description="Auth provider x509 certificate url", + ) + type: str = Field(default="service_account", description="Authentication type") + client_x509_cert_url: Optional[str] = Field( + default=None, + description="If not set it will be default to https://www.googleapis.com/robot/v1/metadata/x509/client_email", + ) + + _fix_private_key_newlines = pydantic_multiline_string("private_key") + + @root_validator(skip_on_failure=True) + def validate_config(cls, values: Dict[str, Any]) -> Dict[str, Any]: + if values.get("client_x509_cert_url") is None: + values["client_x509_cert_url"] = ( + f"https://www.googleapis.com/robot/v1/metadata/x509/{values['client_email']}" + ) + return values + + def create_credential_temp_file(self, project_id: Optional[str] = None) -> str: + configs = self.dict() + if project_id: + configs["project_id"] = project_id + with tempfile.NamedTemporaryFile(delete=False) as fp: + cred_json = json.dumps(self.dict(), indent=4, separators=(",", ": ")) + fp.write(cred_json.encode()) + return fp.name diff --git a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py index 2e0255f1158a9f..8f025b9eeebb1b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py +++ b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py @@ -1,6 +1,4 @@ -import json import logging -import tempfile import time from typing import Any, Iterable, List, Optional, TypeVar @@ -21,9 +19,7 @@ import datahub.emitter.mce_builder as builder from datahub._codegen.aspect import _Aspect -from datahub.configuration import ConfigModel from datahub.configuration.source_common import EnvConfigMixin -from datahub.configuration.validate_multiline_string import pydantic_multiline_string from datahub.emitter.mcp import MetadataChangeProposalWrapper from 
datahub.emitter.mcp_builder import ProjectIdKey, gen_containers from datahub.ingestion.api.common import PipelineContext @@ -37,6 +33,7 @@ from datahub.ingestion.api.source import Source, SourceCapability, SourceReport from datahub.ingestion.api.source_helpers import auto_workunit from datahub.ingestion.api.workunit import MetadataWorkUnit +from datahub.ingestion.source.common.credentials import GCPCredential from datahub.metadata.com.linkedin.pegasus2avro.ml.metadata import ( MLTrainingRunProperties, ) @@ -59,43 +56,6 @@ logger = logging.getLogger(__name__) -class GCPCredential(ConfigModel): - private_key_id: str = Field(description="Private key id") - private_key: str = Field( - description="Private key in a form of '-----BEGIN PRIVATE KEY-----\\nprivate-key\\n-----END PRIVATE KEY-----\\n'" - ) - client_email: str = Field(description="Client email") - client_id: str = Field(description="Client Id") - auth_uri: str = Field( - default="https://accounts.google.com/o/oauth2/auth", - description="Authentication uri", - ) - token_uri: str = Field( - default="https://oauth2.googleapis.com/token", description="Token uri" - ) - auth_provider_x509_cert_url: str = Field( - default="https://www.googleapis.com/oauth2/v1/certs", - description="Auth provider x509 certificate url", - ) - type: str = Field(default="service_account", description="Authentication type") - client_x509_cert_url: Optional[str] = Field( - default=None, - description="If not set it will be default to https://www.googleapis.com/robot/v1/metadata/x509/client_email", - ) - - _fix_private_key_newlines = pydantic_multiline_string("private_key") - - def create_credential_temp_file(self, project_id: Optional[str] = None) -> str: - # Adding project_id from the top level config - configs = self.dict() - if project_id: - configs["project_id"] = project_id - with tempfile.NamedTemporaryFile(delete=False) as fp: - cred_json = json.dumps(configs, indent=4, separators=(",", ": ")) - fp.write(cred_json.encode()) - return fp.name - - class VertexAIConfig(EnvConfigMixin): credential: Optional[GCPCredential] = Field( default=None, description="GCP credential information" diff --git a/metadata-ingestion/tests/integration/fivetran/test_fivetran.py b/metadata-ingestion/tests/integration/fivetran/test_fivetran.py index 9f53ae2382d406..2f3e6a30cb5e09 100644 --- a/metadata-ingestion/tests/integration/fivetran/test_fivetran.py +++ b/metadata-ingestion/tests/integration/fivetran/test_fivetran.py @@ -9,7 +9,7 @@ from datahub.configuration.common import ConfigurationWarning from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.run.pipeline import Pipeline -from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryCredential +from datahub.ingestion.source.common.credentials import GCPCredential from datahub.ingestion.source.fivetran.config import ( BigQueryDestinationConfig, FivetranSourceConfig, @@ -398,7 +398,7 @@ def test_fivetran_snowflake_destination_config(): @freeze_time(FROZEN_TIME) def test_fivetran_bigquery_destination_config(): bigquery_dest = BigQueryDestinationConfig( - credential=BigQueryCredential( + credential=GCPCredential( private_key_id="testprivatekey", project_id="test-project", client_email="fivetran-connector@test-project.iam.gserviceaccount.com", From 1f64a95d65167c773cb74ef845163404f11634e4 Mon Sep 17 00:00:00 2001 From: Ryota Egashira Date: Mon, 3 Mar 2025 13:43:48 -0800 Subject: [PATCH 47/59] fixed one unit test case failure, and naming chagne --- .../ingestion/source/common/credentials.py 
| 2 +- .../src/datahub/ingestion/source/vertexai.py | 40 ++++++++++++------- .../tests/unit/test_vertexai_source.py | 4 +- 3 files changed, 28 insertions(+), 18 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/common/credentials.py b/metadata-ingestion/src/datahub/ingestion/source/common/credentials.py index 66b20485cb2d50..a1c9cac9319c8c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/common/credentials.py +++ b/metadata-ingestion/src/datahub/ingestion/source/common/credentials.py @@ -48,6 +48,6 @@ def create_credential_temp_file(self, project_id: Optional[str] = None) -> str: if project_id: configs["project_id"] = project_id with tempfile.NamedTemporaryFile(delete=False) as fp: - cred_json = json.dumps(self.dict(), indent=4, separators=(",", ": ")) + cred_json = json.dumps(configs, indent=4, separators=(",", ": ")) fp.write(cred_json.encode()) return fp.name diff --git a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py index 8f025b9eeebb1b..ef2c04a49dc8a5 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py +++ b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py @@ -194,7 +194,7 @@ def _get_training_jobs_workunits(self) -> Iterable[MetadataWorkUnit]: def _get_training_job_workunits( self, job: VertexAiResourceNoun ) -> Iterable[MetadataWorkUnit]: - yield from self._generate_data_process_workunits(job) + yield from self._gen_data_process_workunits(job) yield from self._get_job_output_workunits(job) yield from self._get_job_input_workunits(job) @@ -207,9 +207,9 @@ def _gen_ml_group_workunits( """ ml_model_group_urn = self._make_ml_model_group_urn(model) - mcp = MetadataChangeProposalWrapper( - entityUrn=ml_model_group_urn, - aspect=MLModelGroupPropertiesClass( + aspects: List[_Aspect] = list() + aspects.append( + MLModelGroupPropertiesClass( name=self._make_vertexai_model_group_name(model.name), description=model.description, created=TimeStampClass(time=int(model.create_time.timestamp() * 1000)) @@ -221,10 +221,18 @@ def _gen_ml_group_workunits( if model.update_time else None, customProperties={"displayName": model.display_name}, - ), + ) ) - yield from auto_workunit([mcp]) + # TODO add following when metadata model for mlgroup is updated (these aspects not supported currently) + # aspects.append(SubTypesClass(typeNames=["Training Job"])) + # aspects.append(ContainerClass(container=self._get_project_container().as_urn())) + + yield from auto_workunit( + MetadataChangeProposalWrapper.construct_many( + ml_model_group_urn, aspects=aspects + ) + ) def _make_ml_model_group_urn(self, model: Model) -> str: urn = builder.make_ml_model_group_urn( @@ -234,7 +242,7 @@ def _make_ml_model_group_urn(self, model: Model) -> str: ) return urn - def _generate_data_process_workunits( + def _gen_data_process_workunits( self, job: VertexAiResourceNoun ) -> Iterable[MetadataWorkUnit]: """ @@ -276,7 +284,7 @@ def _generate_data_process_workunits( aspects.append(ContainerClass(container=self._get_project_container().as_urn())) - # TO BE ADDED + # TODO add status of the job # aspects.append( # DataProcessInstanceRunEventClass( # status=DataProcessRunStatusClass.COMPLETE, @@ -438,17 +446,19 @@ def _gen_input_dataset_workunits( dataset = self._search_dataset(dataset_id) if dataset_id else None if dataset: + logger.info( + f"Found the name of input dataset ({dataset_name}) with dataset id ({dataset_id})" + ) + # Yield aspect of input dataset yield from 
self._get_dataset_workunits(dataset_urn=dataset_urn, ds=dataset) - # Create URN of Training Job + + # Yield aspect(DataProcessInstanceInputClass) of training job job_id = self._make_vertexai_job_name(entity_id=job.name) - mcp = MetadataChangeProposalWrapper( + yield MetadataChangeProposalWrapper( entityUrn=builder.make_data_process_instance_urn(job_id), aspect=DataProcessInstanceInputClass(inputs=[dataset_urn]), - ) - logger.info( - f"Found the name of input dataset ({dataset_name}) with dataset id ({dataset_id})" - ) - yield from auto_workunit([mcp]) + ).as_workunit() + else: logger.error( f"Unable to find the name of input dataset ({dataset_name}) with dataset id ({dataset_id})" diff --git a/metadata-ingestion/tests/unit/test_vertexai_source.py b/metadata-ingestion/tests/unit/test_vertexai_source.py index 70692a7ac4860e..1dee362312e08a 100644 --- a/metadata-ingestion/tests/unit/test_vertexai_source.py +++ b/metadata-ingestion/tests/unit/test_vertexai_source.py @@ -237,7 +237,7 @@ def test_get_endpoint_workunit( def test_get_data_process_properties_workunit( source: VertexAISource, mock_training_job: VertexAiResourceNoun ) -> None: - for wu in source._generate_data_process_workunits(mock_training_job): + for wu in source._gen_data_process_workunits(mock_training_job): assert hasattr(wu.metadata, "aspect") aspect = wu.metadata.aspect if isinstance(aspect, DataProcessInstancePropertiesClass): @@ -461,7 +461,7 @@ def test_real_model_workunit( def test_real_get_data_process_properties( source: VertexAISource, real_autoML_tabular_job: _TrainingJob ) -> None: - for wu in source._generate_data_process_workunits(real_autoML_tabular_job): + for wu in source._gen_data_process_workunits(real_autoML_tabular_job): assert hasattr(wu.metadata, "aspect") aspect = wu.metadata.aspect if isinstance(aspect, DataProcessInstancePropertiesClass): From b5592861ba2991f5cd2d2ab1cf47596473092c87 Mon Sep 17 00:00:00 2001 From: Ryota Egashira Date: Mon, 3 Mar 2025 14:15:36 -0800 Subject: [PATCH 48/59] Added Enum and refactoring --- .../src/datahub/ingestion/source/vertexai.py | 24 ++++++++++++------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py index ef2c04a49dc8a5..3ff2c22c41339c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py +++ b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py @@ -50,6 +50,7 @@ TimeStampClass, VersionTagClass, ) +from datahub.utilities.str_enum import StrEnum T = TypeVar("T") @@ -86,6 +87,15 @@ def __init__(self, **data: Any): ) +class MLTypes(StrEnum): + # Generic SubTypes + TRAINING_JOB = "Training Job" + MODEL = "ML Model" + MODEL_GROUP = "ML Model Group" + ENDPOINT = "Endpoint" + DATASET = "Dataset" + + @platform_name("Vertex AI", id="vertexai") @config_class(VertexAIConfig) @support_status(SupportStatus.TESTING) @@ -137,12 +147,8 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: # TODO Fetch Experiments and Experiment Runs def _gen_project_workunits(self) -> Iterable[MetadataWorkUnit]: - container_key = ProjectIdKey( - project_id=self.config.project_id, platform=self.platform - ) - yield from gen_containers( - container_key=container_key, + container_key=self._get_project_container(), name=self.config.project_id, sub_types=["Project"], ) @@ -225,7 +231,7 @@ def _gen_ml_group_workunits( ) # TODO add following when metadata model for mlgroup is updated (these aspects not supported currently) - # 
aspects.append(SubTypesClass(typeNames=["Training Job"])) + # aspects.append(SubTypesClass(typeNames=[MLTypes.MODEL_GROUP])) # aspects.append(ContainerClass(container=self._get_project_container().as_urn())) yield from auto_workunit( @@ -280,7 +286,7 @@ def _gen_data_process_workunits( externalUrl=self._make_job_external_url(job), id=job.name ) ) - aspects.append(SubTypesClass(typeNames=["Training Job"])) + aspects.append(SubTypesClass(typeNames=[MLTypes.TRAINING_JOB])) aspects.append(ContainerClass(container=self._get_project_container().as_urn())) @@ -395,7 +401,7 @@ def _get_dataset_workunits( ) ) - aspects.append(SubTypesClass(typeNames=["Dataset"])) + aspects.append(SubTypesClass(typeNames=[MLTypes.DATASET])) # Create a container for Project as parent of the dataset aspects.append(ContainerClass(container=self._get_project_container().as_urn())) @@ -491,7 +497,7 @@ def _gen_endpoint_workunits( ) ) - aspects.append(SubTypesClass(typeNames=["Endpoint"])) + aspects.append(SubTypesClass(typeNames=[MLTypes.ENDPOINT])) yield from auto_workunit( MetadataChangeProposalWrapper.construct_many(endpoint_urn, aspects=aspects) From 4edd575daef06934fba54845a23aeed0f2fa0bef Mon Sep 17 00:00:00 2001 From: Ryota Egashira Date: Mon, 3 Mar 2025 15:02:49 -0800 Subject: [PATCH 49/59] add comment --- metadata-ingestion/src/datahub/ingestion/source/vertexai.py | 1 - 1 file changed, 1 deletion(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py index 3ff2c22c41339c..b57d4a57399ef9 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py +++ b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py @@ -88,7 +88,6 @@ def __init__(self, **data: Any): class MLTypes(StrEnum): - # Generic SubTypes TRAINING_JOB = "Training Job" MODEL = "ML Model" MODEL_GROUP = "ML Model Group" From 57650256b2444c7b52db41a1ab43920bd2403a08 Mon Sep 17 00:00:00 2001 From: Ryota Egashira Date: Tue, 4 Mar 2025 09:46:37 -0800 Subject: [PATCH 50/59] fixed review comments --- .../docs/sources/vertexai/README.md | 1 - .../docs/sources/vertexai/vertexai_pre.md | 18 +- .../docs/sources/vertexai/vertexai_recipe.yml | 3 +- .../source/bigquery_v2/bigquery_config.py | 2 +- ...edentials.py => gcp_credentials_config.py} | 0 .../src/datahub/ingestion/source/vertexai.py | 375 ++++++++++++------ .../integration/fivetran/test_fivetran.py | 2 +- .../integration/vertexai/test_vertexai.py | 42 +- .../vertexai/vertexai_mcps_golden.json | 204 +++++++++- .../tests/unit/test_vertexai_source.py | 110 +++-- 10 files changed, 540 insertions(+), 217 deletions(-) delete mode 100644 metadata-ingestion/docs/sources/vertexai/README.md rename metadata-ingestion/src/datahub/ingestion/source/common/{credentials.py => gcp_credentials_config.py} (100%) diff --git a/metadata-ingestion/docs/sources/vertexai/README.md b/metadata-ingestion/docs/sources/vertexai/README.md deleted file mode 100644 index 07bc128a6007d6..00000000000000 --- a/metadata-ingestion/docs/sources/vertexai/README.md +++ /dev/null @@ -1 +0,0 @@ -Ingesting metadata from VertexAI requires using the **vertexai** module. diff --git a/metadata-ingestion/docs/sources/vertexai/vertexai_pre.md b/metadata-ingestion/docs/sources/vertexai/vertexai_pre.md index 73c9fb4454a2c4..c4a9c7924fb731 100644 --- a/metadata-ingestion/docs/sources/vertexai/vertexai_pre.md +++ b/metadata-ingestion/docs/sources/vertexai/vertexai_pre.md @@ -1,14 +1,16 @@ +Ingesting metadata from VertexAI requires using the **Vertex AI** module. 
+#### Prerequisites
+Please refer to the [Vertex AI documentation](https://cloud.google.com/vertex-ai/docs) for basic information on Vertex AI.

-#### Credential to access to GCP
-1. Follow the section on credentials to access Vertex AI [GCP docs](https://cloud.google.com/docs/authentication/provide-credentials-adc#how-to).
+#### Credentials to access GCP
+Please read the [GCP docs](https://cloud.google.com/docs/authentication/provide-credentials-adc#how-to) to understand how to set up Application Default Credentials for GCP.

#### Create a service account and assign roles

-1. Setup a ServiceAccount as per [GCP docs](https://cloud.google.com/iam/docs/creating-managing-service-accounts#iam-service-accounts-create-console)
- and assign the previously created role to this service account.
-2. Download a service account JSON keyfile.
- Example credential file:
+1. Set up a ServiceAccount as per the [GCP docs](https://cloud.google.com/iam/docs/creating-managing-service-accounts#iam-service-accounts-create-console) and assign the previously created role to this service account.
+2. Download a service account JSON keyfile.
+- Example credential file:

```json
{
@@ -27,7 +29,7 @@

3. To provide credentials to the source, you can either:

- Set an environment variable:
+- Set an environment variable:

```sh
$ export GOOGLE_APPLICATION_CREDENTIALS="/path/to/keyfile.json"

_or_

- Set credential config in your source based on the credential json file. For example:
+- Set credential config in your source based on the credential json file. For example:

```yml
credential:
diff --git a/metadata-ingestion/docs/sources/vertexai/vertexai_recipe.yml b/metadata-ingestion/docs/sources/vertexai/vertexai_recipe.yml
index d517537cd85eab..78135700225dc4 100644
--- a/metadata-ingestion/docs/sources/vertexai/vertexai_recipe.yml
+++ b/metadata-ingestion/docs/sources/vertexai/vertexai_recipe.yml
@@ -3,7 +3,8 @@ source:
config:
project_id: "acryl-poc"
region: "us-west2"
-# credential:
+# Note that either GOOGLE_APPLICATION_CREDENTIALS or the credential section below is required for authentication.
+# credential: # private_key: '-----BEGIN PRIVATE KEY-----\\nprivate-key\\n-----END PRIVATE KEY-----\\n' # private_key_id: "project_key_id" # client_email: "client_email" diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py index c93667c852e043..1f777feeccf781 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py @@ -18,7 +18,7 @@ from datahub.ingestion.glossary.classification_mixin import ( ClassificationSourceConfigMixin, ) -from datahub.ingestion.source.common.credentials import GCPCredential +from datahub.ingestion.source.common.gcp_credentials_config import GCPCredential from datahub.ingestion.source.data_lake_common.path_spec import PathSpec from datahub.ingestion.source.sql.sql_config import SQLCommonConfig, SQLFilterConfig from datahub.ingestion.source.state.stateful_ingestion_base import ( diff --git a/metadata-ingestion/src/datahub/ingestion/source/common/credentials.py b/metadata-ingestion/src/datahub/ingestion/source/common/gcp_credentials_config.py similarity index 100% rename from metadata-ingestion/src/datahub/ingestion/source/common/credentials.py rename to metadata-ingestion/src/datahub/ingestion/source/common/gcp_credentials_config.py diff --git a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py index b57d4a57399ef9..3b723d5c1c0d7e 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py +++ b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py @@ -1,7 +1,9 @@ +import dataclasses import logging import time from typing import Any, Iterable, List, Optional, TypeVar +from google.api_core.exceptions import GoogleAPICallError from google.cloud import aiplatform from google.cloud.aiplatform import ( AutoMLForecastingTrainingJob, @@ -33,7 +35,7 @@ from datahub.ingestion.api.source import Source, SourceCapability, SourceReport from datahub.ingestion.api.source_helpers import auto_workunit from datahub.ingestion.api.workunit import MetadataWorkUnit -from datahub.ingestion.source.common.credentials import GCPCredential +from datahub.ingestion.source.common.gcp_credentials_config import GCPCredential from datahub.metadata.com.linkedin.pegasus2avro.ml.metadata import ( MLTrainingRunProperties, ) @@ -93,6 +95,15 @@ class MLTypes(StrEnum): MODEL_GROUP = "ML Model Group" ENDPOINT = "Endpoint" DATASET = "Dataset" + PROJECT = "Project" + + +@dataclasses.dataclass +class TrainingJobMetadata: + job: VertexAiResourceNoun + input_dataset: Optional[VertexAiResourceNoun] = None + output_model: Optional[Model] = None + output_model_version: Optional[VersionInfo] = None @platform_name("Vertex AI", id="vertexai") @@ -100,12 +111,11 @@ class MLTypes(StrEnum): @support_status(SupportStatus.TESTING) @capability( SourceCapability.DESCRIPTIONS, - "Extract descriptions for vertexai Registered Models and Model Versions", + "Extract descriptions for Vertex AI Registered Models and Model Versions", ) -@capability(SourceCapability.TAGS, "Extract tags for VertexAI Registered Model Stages") +@capability(SourceCapability.TAGS, "Extract tags for Vertex AI Registered Model Stages") class VertexAISource(Source): platform: str = "vertexai" - model_name_separator = "_" def __init__(self, ctx: PipelineContext, config: VertexAIConfig): super().__init__(ctx) @@ -126,6 +136,7 @@ def 
__init__(self, ctx: PipelineContext, config: VertexAIConfig): self.client = aiplatform self.endpoints: Optional[List[Endpoint]] = None self.datasets: Optional[dict] = None + self.model_name_separator = "_" def get_report(self) -> SourceReport: return self.report @@ -149,7 +160,7 @@ def _gen_project_workunits(self) -> Iterable[MetadataWorkUnit]: yield from gen_containers( container_key=self._get_project_container(), name=self.config.project_id, - sub_types=["Project"], + sub_types=[MLTypes.PROJECT], ) def _get_ml_models_workunits(self) -> Iterable[MetadataWorkUnit]: @@ -199,9 +210,81 @@ def _get_training_jobs_workunits(self) -> Iterable[MetadataWorkUnit]: def _get_training_job_workunits( self, job: VertexAiResourceNoun ) -> Iterable[MetadataWorkUnit]: - yield from self._gen_data_process_workunits(job) - yield from self._get_job_output_workunits(job) - yield from self._get_job_input_workunits(job) + job_meta: TrainingJobMetadata = self._get_training_job_metadata(job) + # Create DataProcessInstance for the training job + yield from self._gen_training_job_workunits(job_meta) + # Create ML Model entity for output ML model of this training job + yield from self._gen_output_model_workunits(job_meta) + + def _gen_output_model_workunits( + self, job_meta: TrainingJobMetadata + ) -> Iterable[MetadataWorkUnit]: + if job_meta.output_model and job_meta.output_model_version: + job = job_meta.job + job_urn = builder.make_data_process_instance_urn( + self._make_vertexai_job_name(entity_id=job.name) + ) + + yield from self._gen_ml_model_endpoint_workunits( + job_meta.output_model, job_meta.output_model_version, job_urn + ) + + def _gen_training_job_workunits( + self, job_meta: TrainingJobMetadata + ) -> Iterable[MetadataWorkUnit]: + """ + Generate a work unit for VertexAI Training Job + """ + job = job_meta.job + job_id = self._make_vertexai_job_name(entity_id=job.name) + job_urn = builder.make_data_process_instance_urn(job_id) + + created_time = ( + int(job.create_time.timestamp() * 1000) + if job.create_time + else int(time.time() * 1000) + ) + created_actor = f"urn:li:platformResource:{self.platform}" + + aspects: List[_Aspect] = list() + aspects.append( + DataProcessInstancePropertiesClass( + name=job_id, + created=AuditStampClass( + time=created_time, + actor=created_actor, + ), + externalUrl=self._make_job_external_url(job), + customProperties={ + "displayName": job.display_name, + "jobType": job.__class__.__name__, + }, + ) + ) + aspects.append( + MLTrainingRunProperties( + externalUrl=self._make_job_external_url(job), id=job.name + ) + ) + aspects.append(SubTypesClass(typeNames=[MLTypes.TRAINING_JOB])) + aspects.append(ContainerClass(container=self._get_project_container().as_urn())) + + # If Training job has Input Dataset + if job_meta.input_dataset: + dataset_urn = builder.make_dataset_urn( + platform=self.platform, + name=self._make_vertexai_dataset_name( + entity_id=job_meta.input_dataset.name + ), + env=self.config.env, + ) + aspects.append( + DataProcessInstanceInputClass(inputs=[dataset_urn]), + ) + + yield from auto_workunit( + MetadataChangeProposalWrapper.construct_many(job_urn, aspects=aspects) + ) def _gen_ml_group_workunits( self, @@ -247,59 +330,59 @@ def _make_ml_model_group_urn(self, model: Model) -> str: ) return urn - def _gen_data_process_workunits( - self, job: VertexAiResourceNoun - ) -> Iterable[MetadataWorkUnit]: - """ - Generate a work unit for VertexAI Training Job - """ - - created_time = ( - int(job.create_time.timestamp() * 1000) - if job.create_time - else 
int(time.time() * 1000) - ) - created_actor = f"urn:li:platformResource:{self.platform}" - - job_id = self._make_vertexai_job_name(entity_id=job.name) - job_urn = builder.make_data_process_instance_urn(job_id) - - aspects: List[_Aspect] = list() - aspects.append( - DataProcessInstancePropertiesClass( - name=job_id, - created=AuditStampClass( - time=created_time, - actor=created_actor, - ), - externalUrl=self._make_job_external_url(job), - customProperties={ - "displayName": job.display_name, - "jobType": job.__class__.__name__, - }, - ) - ) - - aspects.append( - MLTrainingRunProperties( - externalUrl=self._make_job_external_url(job), id=job.name - ) - ) - aspects.append(SubTypesClass(typeNames=[MLTypes.TRAINING_JOB])) - - aspects.append(ContainerClass(container=self._get_project_container().as_urn())) - - # TODO add status of the job - # aspects.append( - # DataProcessInstanceRunEventClass( - # status=DataProcessRunStatusClass.COMPLETE, - # timestampMillis=0 - # ) - # } - - yield from auto_workunit( - MetadataChangeProposalWrapper.construct_many(job_urn, aspects=aspects) - ) + # def _gen_data_process_workunits( + # self, job: VertexAiResourceNoun + # ) -> Iterable[MetadataWorkUnit]: + # """ + # Generate a work unit for VertexAI Training Job + # """ + # + # created_time = ( + # int(job.create_time.timestamp() * 1000) + # if job.create_time + # else int(time.time() * 1000) + # ) + # created_actor = f"urn:li:platformResource:{self.platform}" + # + # job_id = self._make_vertexai_job_name(entity_id=job.name) + # job_urn = builder.make_data_process_instance_urn(job_id) + # + # aspects: List[_Aspect] = list() + # aspects.append( + # DataProcessInstancePropertiesClass( + # name=job_id, + # created=AuditStampClass( + # time=created_time, + # actor=created_actor, + # ), + # externalUrl=self._make_job_external_url(job), + # customProperties={ + # "displayName": job.display_name, + # "jobType": job.__class__.__name__, + # }, + # ) + # ) + # + # aspects.append( + # MLTrainingRunProperties( + # externalUrl=self._make_job_external_url(job), id=job.name + # ) + # ) + # aspects.append(SubTypesClass(typeNames=[MLTypes.TRAINING_JOB])) + # + # aspects.append(ContainerClass(container=self._get_project_container().as_urn())) + # + # # TODO add status of the job + # # aspects.append( + # # DataProcessInstanceRunEventClass( + # # status=DataProcessRunStatusClass.COMPLETE, + # # timestampMillis=0 + # # ) + # # } + # + # yield from auto_workunit( + # MetadataChangeProposalWrapper.construct_many(job_urn, aspects=aspects) + # ) def _get_project_container(self) -> ProjectIdKey: return ProjectIdKey(project_id=self.config.project_id, platform=self.platform) @@ -321,35 +404,35 @@ def _search_model_version( return version return None - def _get_job_output_workunits( - self, job: VertexAiResourceNoun - ) -> Iterable[MetadataWorkUnit]: - """ - This method creates work units that link the training job to the model version - that it produces. It checks if the job configuration contains a model to upload, - and if so, it generates a work unit for the model version with the training job - as part of its properties. 
- """ - - job_conf = job.to_dict() - if ( - "modelToUpload" in job_conf - and "name" in job_conf["modelToUpload"] - and job_conf["modelToUpload"]["name"] - ): - model_version_str = job_conf["modelToUpload"]["versionId"] - job_urn = self._make_job_urn(job) - - model = Model(model_name=job_conf["modelToUpload"]["name"]) - model_version = self._search_model_version(model, model_version_str) - if model and model_version: - logger.info( - f"Found output model (name:{model.display_name} id:{model_version_str}) " - f"for training job: {job.display_name}" - ) - yield from self._gen_ml_model_endpoint_workunits( - model, model_version, job_urn - ) + # def _get_job_output_workunits( + # self, job: VertexAiResourceNoun + # ) -> Iterable[MetadataWorkUnit]: + # """ + # This method creates work units that link the training job to the model version + # that it produces. It checks if the job configuration contains a model to upload, + # and if so, it generates a work unit for the model version with the training job + # as part of its properties. + # """ + # + # job_conf = job.to_dict() + # if ( + # "modelToUpload" in job_conf + # and "name" in job_conf["modelToUpload"] + # and job_conf["modelToUpload"]["name"] + # ): + # model_version_str = job_conf["modelToUpload"]["versionId"] + # job_urn = self._make_job_urn(job) + # + # model = Model(model_name=job_conf["modelToUpload"]["name"]) + # model_version = self._search_model_version(model, model_version_str) + # if model and model_version: + # logger.info( + # f"Found output model (name:{model.display_name} id:{model_version_str}) " + # f"for training job: {job.display_name}" + # ) + # yield from self._gen_ml_model_endpoint_workunits( + # model, model_version, job_urn + # ) def _search_dataset(self, dataset_id: str) -> Optional[VertexAiResourceNoun]: """ @@ -374,7 +457,7 @@ def _search_dataset(self, dataset_id: str) -> Optional[VertexAiResourceNoun]: for ds in dataset_class.list(): self.datasets[ds.name] = ds - return self.datasets.get(dataset_id) + return self.datasets.get(dataset_id) if dataset_id in self.datasets else None def _get_dataset_workunits( self, dataset_urn: str, ds: VertexAiResourceNoun @@ -408,17 +491,47 @@ def _get_dataset_workunits( MetadataChangeProposalWrapper.construct_many(dataset_urn, aspects=aspects) ) - def _get_job_input_workunits( + # def _get_job_input_workunits( + # self, job: VertexAiResourceNoun + # ) -> Iterable[MetadataWorkUnit]: + # """ + # Generate work units for the input data of a training job. + # This method checks if the training job is an AutoML job and if it has an input dataset + # configuration. If so, it creates a work unit for the input dataset. + # """ + # + # if self._is_automl_job(job): + # job_conf = job.to_dict() + # if ( + # "inputDataConfig" in job_conf + # and "datasetId" in job_conf["inputDataConfig"] + # ): + # # Create URN of Input Dataset for Training Job + # dataset_id = job_conf["inputDataConfig"]["datasetId"] + # logger.info( + # f"Found input dataset (id: {dataset_id}) for training job ({job.display_name})" + # ) + # + # if dataset_id: + # yield from self._gen_input_dataset_workunits(job, dataset_id) + + def _get_training_job_metadata( self, job: VertexAiResourceNoun - ) -> Iterable[MetadataWorkUnit]: + ) -> TrainingJobMetadata: """ - Generate work units for the input data of a training job. - This method checks if the training job is an AutoML job and if it has an input dataset - configuration. If so, it creates a work unit for the input dataset. 
+ Retrieve metadata for a given Vertex AI training job. + This method extracts metadata for a Vertex AI training job, including input datasets + and output models. It checks if the job is an AutoML job and retrieves the relevant + input dataset and output model information. """ + job_meta = TrainingJobMetadata(job=job) + + # Check if the job is an AutoML job if self._is_automl_job(job): job_conf = job.to_dict() + + # Check if input dataset is present in the job configuration if ( "inputDataConfig" in job_conf and "datasetId" in job_conf["inputDataConfig"] @@ -430,7 +543,34 @@ def _get_job_input_workunits( ) if dataset_id: - yield from self._gen_input_dataset_workunits(job, dataset_id) + job_meta.input_dataset = ( + self._search_dataset(dataset_id) if dataset_id else None + ) + + # Check if output model is present in the job configuration + if ( + "modelToUpload" in job_conf + and "name" in job_conf["modelToUpload"] + and job_conf["modelToUpload"]["name"] + and job_conf["modelToUpload"]["versionId"] + ): + model_version_str = job_conf["modelToUpload"]["versionId"] + try: + model = Model(model_name=job_conf["modelToUpload"]["name"]) + model_version = self._search_model_version(model, model_version_str) + if model and model_version: + logger.info( + f"Found output model (name:{model.display_name} id:{model_version_str}) " + f"for training job: {job.display_name}" + ) + job_meta.output_model = model + job_meta.output_model_version = model_version + except GoogleAPICallError: + logger.error( + f"Error while fetching model version {model_version_str}" + ) + + return job_meta def _gen_input_dataset_workunits( self, job: VertexAiResourceNoun, dataset_id: str @@ -619,28 +759,30 @@ def _make_job_urn(self, job: VertexAiResourceNoun) -> str: return urn def _make_vertexai_model_group_name( - self, entity_id: str, separator: str = "." + self, + entity_id: str, ) -> str: - entity_type = "model_group" - return f"{self.config.project_id}{separator}{entity_type}{separator}{entity_id}" + separator: str = "." + return f"{self.config.project_id}{separator}model_group{separator}{entity_id}" - def _make_vertexai_endpoint_name(self, entity_id: str, separator: str = ".") -> str: - entity_type = "endpoint" - return f"{self.config.project_id}{separator}{entity_type}{separator}{entity_id}" + def _make_vertexai_endpoint_name(self, entity_id: str) -> str: + separator: str = "." + return f"{self.config.project_id}{separator}endpoint{separator}{entity_id}" - def _make_vertexai_model_name(self, entity_id: str, separator: str = ".") -> str: - entity_type = "model" - return f"{self.config.project_id}{separator}{entity_type}{separator}{entity_id}" + def _make_vertexai_model_name(self, entity_id: str) -> str: + separator: str = "." + return f"{self.config.project_id}{separator}model{separator}{entity_id}" - def _make_vertexai_dataset_name(self, entity_id: str, separator: str = ".") -> str: - entity_type = "dataset" - return f"{self.config.project_id}{separator}{entity_type}{separator}{entity_id}" + def _make_vertexai_dataset_name(self, entity_id: str) -> str: + separator: str = "." + return f"{self.config.project_id}{separator}dataset{separator}{entity_id}" def _make_vertexai_job_name( - self, entity_id: Optional[str], separator: str = "." + self, + entity_id: Optional[str], ) -> str: - entity_type = "job" - return f"{self.config.project_id}{separator}{entity_type}{separator}{entity_id}" + separator: str = "." 
+ return f"{self.config.project_id}{separator}job{separator}{entity_id}" def _make_job_external_url(self, job: VertexAiResourceNoun) -> str: """ @@ -648,9 +790,8 @@ def _make_job_external_url(self, job: VertexAiResourceNoun) -> str: Sample URLs: https://console.cloud.google.com/vertex-ai/training/training-pipelines?project=acryl-poc&trainingPipelineId=5401695018589093888 """ - entity_type = "training" external_url: str = ( - f"{self.config.vertexai_url}/{entity_type}/training-pipelines?trainingPipelineId={job.name}" + f"{self.config.vertexai_url}/training/training-pipelines?trainingPipelineId={job.name}" f"?project={self.config.project_id}" ) return external_url @@ -661,9 +802,8 @@ def _make_model_external_url(self, model: Model) -> str: Sample URL: https://console.cloud.google.com/vertex-ai/models/locations/us-west2/models/812468724182286336?project=acryl-poc """ - entity_type = "models" external_url: str = ( - f"{self.config.vertexai_url}/{entity_type}/locations/{self.config.region}/{entity_type}/{model.name}" + f"{self.config.vertexai_url}/models/locations/{self.config.region}/models/{model.name}" f"?project={self.config.project_id}" ) return external_url @@ -674,9 +814,8 @@ def _make_model_version_external_url(self, model: Model) -> str: Sample URL: https://console.cloud.google.com/vertex-ai/models/locations/us-west2/models/812468724182286336/versions/1?project=acryl-poc """ - entity_type = "models" external_url: str = ( - f"{self.config.vertexai_url}/{entity_type}/locations/{self.config.region}/{entity_type}/{model.name}" + f"{self.config.vertexai_url}/models/locations/{self.config.region}/models/{model.name}" f"/versions/{model.version_id}" f"?project={self.config.project_id}" ) diff --git a/metadata-ingestion/tests/integration/fivetran/test_fivetran.py b/metadata-ingestion/tests/integration/fivetran/test_fivetran.py index 2f3e6a30cb5e09..1aab4e9419014f 100644 --- a/metadata-ingestion/tests/integration/fivetran/test_fivetran.py +++ b/metadata-ingestion/tests/integration/fivetran/test_fivetran.py @@ -9,7 +9,7 @@ from datahub.configuration.common import ConfigurationWarning from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.run.pipeline import Pipeline -from datahub.ingestion.source.common.credentials import GCPCredential +from datahub.ingestion.source.common.gcp_credentials_config import GCPCredential from datahub.ingestion.source.fivetran.config import ( BigQueryDestinationConfig, FivetranSourceConfig, diff --git a/metadata-ingestion/tests/integration/vertexai/test_vertexai.py b/metadata-ingestion/tests/integration/vertexai/test_vertexai.py index 9ada49a556b9fb..cbee5db8d22de9 100644 --- a/metadata-ingestion/tests/integration/vertexai/test_vertexai.py +++ b/metadata-ingestion/tests/integration/vertexai/test_vertexai.py @@ -4,8 +4,7 @@ from unittest.mock import MagicMock, patch import pytest -from google.cloud.aiplatform import Model -from google.cloud.aiplatform.base import VertexAiResourceNoun +from google.cloud.aiplatform import AutoMLTabularTrainingJob, CustomJob, Model from google.protobuf import timestamp_pb2 from pytest import Config @@ -43,8 +42,7 @@ def get_pipeline_config(sink_file_path: str) -> Dict[str, Any]: } -@pytest.fixture -def mock_models() -> List[Model]: +def gen_mock_models() -> List[Model]: mock_model_1 = MagicMock(spec=Model) mock_model_1.name = "mock_prediction_model_1" mock_model_1.create_time = timestamp_pb2.Timestamp().GetCurrentTime() @@ -65,23 +63,28 @@ def mock_models() -> List[Model]: return [mock_model_1, mock_model_2] 
-@pytest.fixture -def mock_training_jobs() -> List[VertexAiResourceNoun]: - mock_training_job = MagicMock(spec=VertexAiResourceNoun) +def gen_mock_training_custom_job() -> CustomJob: + mock_training_job = MagicMock(spec=CustomJob) mock_training_job.name = "mock_training_job" mock_training_job.create_time = timestamp_pb2.Timestamp().GetCurrentTime() mock_training_job.update_time = timestamp_pb2.Timestamp().GetCurrentTime() mock_training_job.display_name = "mock_training_job_display_name" mock_training_job.description = "mock_training_job_description" - return [mock_training_job] + + return mock_training_job + + +def gen_mock_training_automl_job() -> AutoMLTabularTrainingJob: + mock_automl_job = MagicMock(spec=AutoMLTabularTrainingJob) + mock_automl_job.name = "mock_auto_automl_tabular_job" + mock_automl_job.create_time = timestamp_pb2.Timestamp().GetCurrentTime() + mock_automl_job.update_time = timestamp_pb2.Timestamp().GetCurrentTime() + mock_automl_job.display_name = "mock_auto_automl_tabular_job_display_name" + mock_automl_job.description = "mock_auto_automl_tabular_job_display_name" + return mock_automl_job -def test_vertexai_source_ingestion( - pytestconfig: Config, - sink_file_path: str, - mock_models: List[Model], - mock_training_jobs: List[VertexAiResourceNoun], -) -> None: +def test_vertexai_source_ingestion(pytestconfig: Config, sink_file_path: str) -> None: with contextlib.ExitStack() as exit_stack: for func_to_mock in [ "google.cloud.aiplatform.init", @@ -103,7 +106,16 @@ def test_vertexai_source_ingestion( ]: mock = exit_stack.enter_context(patch(func_to_mock)) if func_to_mock == "google.cloud.aiplatform.Model.list": - mock.return_value = mock_models + mock.return_value = gen_mock_models() + elif func_to_mock == "google.cloud.aiplatform.CustomJob.list": + mock.return_value = [ + gen_mock_training_custom_job(), + gen_mock_training_automl_job(), + ] + elif ( + func_to_mock == "google.cloud.aiplatform.AutoMLTabularTrainingJob.list" + ): + mock.return_value = [gen_mock_training_automl_job()] else: mock.return_value = [] diff --git a/metadata-ingestion/tests/integration/vertexai/vertexai_mcps_golden.json b/metadata-ingestion/tests/integration/vertexai/vertexai_mcps_golden.json index 7dbe1f89f7643a..146b7e70c78849 100644 --- a/metadata-ingestion/tests/integration/vertexai/vertexai_mcps_golden.json +++ b/metadata-ingestion/tests/integration/vertexai/vertexai_mcps_golden.json @@ -14,7 +14,7 @@ } }, "systemMetadata": { - "lastObserved": 1740975346432, + "lastObserved": 1741107353572, "runId": "vertexai-source-test", "lastRunId": "no-run-id-provided" } @@ -30,7 +30,7 @@ } }, "systemMetadata": { - "lastObserved": 1740975346433, + "lastObserved": 1741107353573, "runId": "vertexai-source-test", "lastRunId": "no-run-id-provided" } @@ -46,7 +46,7 @@ } }, "systemMetadata": { - "lastObserved": 1740975346433, + "lastObserved": 1741107353573, "runId": "vertexai-source-test", "lastRunId": "no-run-id-provided" } @@ -64,7 +64,7 @@ } }, "systemMetadata": { - "lastObserved": 1740975346433, + "lastObserved": 1741107353573, "runId": "vertexai-source-test", "lastRunId": "no-run-id-provided" } @@ -80,7 +80,7 @@ } }, "systemMetadata": { - "lastObserved": 1740975346433, + "lastObserved": 1741107353574, "runId": "vertexai-source-test", "lastRunId": "no-run-id-provided" } @@ -100,7 +100,7 @@ } }, "systemMetadata": { - "lastObserved": 1740975346434, + "lastObserved": 1741107353574, "runId": "vertexai-source-test", "lastRunId": "no-run-id-provided" } @@ -120,7 +120,193 @@ } }, "systemMetadata": { - 
"lastObserved": 1740975346435, + "lastObserved": 1741107354209, + "runId": "vertexai-source-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:test-project-id.job.mock_training_job", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceProperties", + "aspect": { + "json": { + "customProperties": { + "displayName": "mock_training_job_display_name", + "jobType": "CustomJob" + }, + "externalUrl": "https://console.cloud.google.com/vertex-ai/training/training-pipelines?trainingPipelineId=mock_training_job?project=test-project-id", + "name": "test-project-id.job.mock_training_job", + "created": { + "time": 1741107354209, + "actor": "urn:li:platformResource:vertexai" + } + } + }, + "systemMetadata": { + "lastObserved": 1741107356953, + "runId": "vertexai-source-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:test-project-id.job.mock_training_job", + "changeType": "UPSERT", + "aspectName": "mlTrainingRunProperties", + "aspect": { + "json": { + "customProperties": {}, + "externalUrl": "https://console.cloud.google.com/vertex-ai/training/training-pipelines?trainingPipelineId=mock_training_job?project=test-project-id", + "id": "mock_training_job" + } + }, + "systemMetadata": { + "lastObserved": 1741107356956, + "runId": "vertexai-source-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:test-project-id.job.mock_training_job", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Training Job" + ] + } + }, + "systemMetadata": { + "lastObserved": 1741107356958, + "runId": "vertexai-source-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:test-project-id.job.mock_training_job", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:29746a9030349f4340ed74b46913dab6" + } + }, + "systemMetadata": { + "lastObserved": 1741107356958, + "runId": "vertexai-source-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:test-project-id.job.mock_auto_automl_tabular_job", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceProperties", + "aspect": { + "json": { + "customProperties": { + "displayName": "mock_auto_automl_tabular_job_display_name", + "jobType": "AutoMLTabularTrainingJob" + }, + "externalUrl": "https://console.cloud.google.com/vertex-ai/training/training-pipelines?trainingPipelineId=mock_auto_automl_tabular_job?project=test-project-id", + "name": "test-project-id.job.mock_auto_automl_tabular_job", + "created": { + "time": 1741107356952, + "actor": "urn:li:platformResource:vertexai" + } + } + }, + "systemMetadata": { + "lastObserved": 1741107356959, + "runId": "vertexai-source-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:test-project-id.job.mock_auto_automl_tabular_job", + "changeType": "UPSERT", + "aspectName": "mlTrainingRunProperties", + "aspect": { + "json": { + "customProperties": {}, + "externalUrl": "https://console.cloud.google.com/vertex-ai/training/training-pipelines?trainingPipelineId=mock_auto_automl_tabular_job?project=test-project-id", + "id": 
"mock_auto_automl_tabular_job" + } + }, + "systemMetadata": { + "lastObserved": 1741107356960, + "runId": "vertexai-source-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:test-project-id.job.mock_auto_automl_tabular_job", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Training Job" + ] + } + }, + "systemMetadata": { + "lastObserved": 1741107356960, + "runId": "vertexai-source-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:test-project-id.job.mock_auto_automl_tabular_job", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:29746a9030349f4340ed74b46913dab6" + } + }, + "systemMetadata": { + "lastObserved": 1741107356961, + "runId": "vertexai-source-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:test-project-id.job.mock_auto_automl_tabular_job", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1741107356961, + "runId": "vertexai-source-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:test-project-id.job.mock_training_job", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1741107356962, "runId": "vertexai-source-test", "lastRunId": "no-run-id-provided" } @@ -136,7 +322,7 @@ } }, "systemMetadata": { - "lastObserved": 1740975346436, + "lastObserved": 1741107356962, "runId": "vertexai-source-test", "lastRunId": "no-run-id-provided" } @@ -152,7 +338,7 @@ } }, "systemMetadata": { - "lastObserved": 1740975346436, + "lastObserved": 1741107356963, "runId": "vertexai-source-test", "lastRunId": "no-run-id-provided" } diff --git a/metadata-ingestion/tests/unit/test_vertexai_source.py b/metadata-ingestion/tests/unit/test_vertexai_source.py index 1dee362312e08a..b310dc18b6e696 100644 --- a/metadata-ingestion/tests/unit/test_vertexai_source.py +++ b/metadata-ingestion/tests/unit/test_vertexai_source.py @@ -35,8 +35,7 @@ REGION = "us-west2" -@pytest.fixture -def mock_model() -> Model: +def gen_mock_model() -> Model: mock_model_1 = MagicMock(spec=Model) mock_model_1.name = "mock_prediction_model_1" mock_model_1.create_time = timestamp_pb2.Timestamp().GetCurrentTime() @@ -49,8 +48,7 @@ def mock_model() -> Model: return mock_model_1 -@pytest.fixture -def mock_models() -> List[Model]: +def gen_mock_models() -> List[Model]: mock_model_1 = MagicMock(spec=Model) mock_model_1.name = "mock_prediction_model_1" mock_model_1.create_time = timestamp_pb2.Timestamp().GetCurrentTime() @@ -71,8 +69,7 @@ def mock_models() -> List[Model]: return [mock_model_1, mock_model_2] -@pytest.fixture -def mock_training_job() -> VertexAiResourceNoun: +def gen_mock_training_job() -> VertexAiResourceNoun: mock_training_job = MagicMock(spec=VertexAiResourceNoun) mock_training_job.name = "mock_training_job" mock_training_job.create_time = timestamp_pb2.Timestamp().GetCurrentTime() @@ -82,8 +79,7 @@ def mock_training_job() -> VertexAiResourceNoun: return mock_training_job -@pytest.fixture -def mock_dataset() -> VertexAiResourceNoun: +def gen_mock_dataset() -> VertexAiResourceNoun: mock_dataset 
= MagicMock(spec=VertexAiResourceNoun) mock_dataset.name = "mock_dataset" mock_dataset.create_time = timestamp_pb2.Timestamp().GetCurrentTime() @@ -93,8 +89,7 @@ def mock_dataset() -> VertexAiResourceNoun: return mock_dataset -@pytest.fixture -def mock_training_automl_job() -> AutoMLTabularTrainingJob: +def gen_mock_training_automl_job() -> AutoMLTabularTrainingJob: mock_automl_job = MagicMock(spec=AutoMLTabularTrainingJob) mock_automl_job.name = "mock_auto_automl_tabular_job" mock_automl_job.create_time = timestamp_pb2.Timestamp().GetCurrentTime() @@ -104,8 +99,7 @@ def mock_training_automl_job() -> AutoMLTabularTrainingJob: return mock_automl_job -@pytest.fixture -def mock_endpoint() -> Endpoint: +def gen_mock_endpoint() -> Endpoint: mock_endpoint = MagicMock(spec=Endpoint) mock_endpoint.description = "test endpoint" mock_endpoint.create_time = datetime.now() @@ -113,6 +107,18 @@ def mock_endpoint() -> Endpoint: return mock_endpoint +def gen_mock_model_version(mock_model: Model) -> VersionInfo: + version = "1" + return VersionInfo( + version_id=version, + version_description="test", + version_create_time=timestamp_pb2.Timestamp().GetCurrentTime(), + version_update_time=timestamp_pb2.Timestamp().GetCurrentTime(), + model_display_name=mock_model.name, + model_resource_name=mock_model.resource_name, + ) + + @pytest.fixture def source() -> VertexAISource: return VertexAISource( @@ -121,8 +127,7 @@ def source() -> VertexAISource: ) -@pytest.fixture -def real_model(source: VertexAISource) -> Model: +def gen_real_model(source: VertexAISource) -> Model: """ Fixture for the model that is actually registered in the Vertex AI Model Registry Use mock_model for local testing purpose, but this fixture is provided to use real model for debugging. @@ -132,8 +137,7 @@ def real_model(source: VertexAISource) -> Model: return Model(model_name=model_name) -@pytest.fixture -def real_autoML_tabular_job(source: VertexAISource) -> _TrainingJob: +def gen_real_autoML_tabular_job(source: VertexAISource) -> _TrainingJob: """ Fixture for the training job that is actually registered in the Vertex AI Model Registry Use mock_training_job for local testing purpose, but this fixture is provided to use real training job for debugging. 
@@ -151,26 +155,9 @@ def real_autoML_tabular_job(source: VertexAISource) -> _TrainingJob: return job -@pytest.fixture -def model_version( - source: VertexAISource, - mock_model: Model, -) -> VersionInfo: - version = "1" - return VersionInfo( - version_id=version, - version_description="test", - version_create_time=timestamp_pb2.Timestamp().GetCurrentTime(), - version_update_time=timestamp_pb2.Timestamp().GetCurrentTime(), - model_display_name=mock_model.name, - model_resource_name=mock_model.resource_name, - ) - - @patch("google.cloud.aiplatform.Model.list") -def test_get_ml_model_workunits( - mock_list: List[Model], source: VertexAISource, mock_models: List[Model] -) -> None: +def test_get_ml_model_workunits(mock_list: List[Model], source: VertexAISource) -> None: + mock_models = gen_mock_models() assert hasattr(mock_list, "return_value") # this check needed to go ground lint mock_list.return_value = mock_models @@ -196,8 +183,10 @@ def test_get_ml_model_workunits( def test_get_ml_model_properties_workunit( - source: VertexAISource, mock_model: Model, model_version: VersionInfo + source: VertexAISource, ) -> None: + mock_model = gen_mock_model() + model_version = gen_mock_model_version(mock_model) wu = [wu for wu in source._gen_ml_model_workunits(mock_model, model_version)] assert len(wu) == 1 assert hasattr(wu[0].metadata, "aspect") @@ -214,10 +203,10 @@ def test_get_ml_model_properties_workunit( def test_get_endpoint_workunit( source: VertexAISource, - mock_endpoint: Endpoint, - mock_model: Model, - model_version: VersionInfo, ) -> None: + mock_model = gen_mock_model() + model_version = gen_mock_model_version(mock_model) + mock_endpoint = gen_mock_endpoint() for wu in source._gen_endpoint_workunits(mock_endpoint, mock_model, model_version): assert hasattr(wu.metadata, "aspect") aspect = wu.metadata.aspect @@ -234,10 +223,9 @@ def test_get_endpoint_workunit( assert aspect.typeNames == ["Endpoint"] -def test_get_data_process_properties_workunit( - source: VertexAISource, mock_training_job: VertexAiResourceNoun -) -> None: - for wu in source._gen_data_process_workunits(mock_training_job): +def test_get_data_process_properties_workunit(source: VertexAISource) -> None: + mock_training_job = gen_mock_training_job() + for wu in source._get_training_job_workunits(mock_training_job): assert hasattr(wu.metadata, "aspect") aspect = wu.metadata.aspect if isinstance(aspect, DataProcessInstancePropertiesClass): @@ -255,10 +243,8 @@ def test_get_data_process_properties_workunit( assert "Training Job" in aspect.typeNames -def test_get_data_process_input_workunit( - source: VertexAISource, - mock_training_job: VertexAiResourceNoun, -) -> None: +def test_get_data_process_input_workunit(source: VertexAISource) -> None: + mock_training_job = gen_mock_training_job() with contextlib.ExitStack() as exit_stack: for func_to_mock in [ "google.cloud.aiplatform.init", @@ -347,9 +333,9 @@ def test_vertexai_config_init(): def test_get_training_jobs_workunit( source: VertexAISource, - mock_training_job: VertexAiResourceNoun, - mock_training_automl_job: AutoMLTabularTrainingJob, ) -> None: + mock_training_job = gen_mock_training_job() + mock_training_automl_job = gen_mock_training_automl_job() with contextlib.ExitStack() as exit_stack: for func_to_mock in [ "google.cloud.aiplatform.init", @@ -399,9 +385,8 @@ def test_get_training_jobs_workunit( assert aspect.container == container_key.as_urn() -def test_get_dataset_workunit( - mock_dataset: VertexAiResourceNoun, source: VertexAISource -) -> None: +def 
test_get_dataset_workunit(source: VertexAISource) -> None: + mock_dataset = gen_mock_dataset() dataset_urn = builder.make_dataset_urn( platform=source.platform, name=mock_dataset.name, @@ -419,7 +404,8 @@ def test_get_dataset_workunit( assert aspect.typeNames == ["Dataset"] -def test_make_model_external_url(mock_model: Model, source: VertexAISource) -> None: +def test_make_model_external_url(source: VertexAISource) -> None: + mock_model = gen_mock_model() assert ( source._make_model_external_url(mock_model) == f"{source.config.vertexai_url}/models/locations/{source.config.region}/models/{mock_model.name}" @@ -427,9 +413,8 @@ def test_make_model_external_url(mock_model: Model, source: VertexAISource) -> N ) -def test_make_job_urn( - mock_training_job: VertexAiResourceNoun, source: VertexAISource -) -> None: +def test_make_job_urn(source: VertexAISource) -> None: + mock_training_job = gen_mock_training_job() assert ( source._make_job_urn(mock_training_job) == f"{builder.make_data_process_instance_urn(source._make_vertexai_job_name(mock_training_job.name))}" @@ -437,13 +422,13 @@ def test_make_job_urn( @pytest.mark.skip(reason="Skipping, this is for debugging purpose") -def test_real_model_workunit( - source: VertexAISource, real_model: Model, model_version: VersionInfo -) -> None: +def test_real_model_workunit(source: VertexAISource) -> None: """ Disabled as default Use real model registered in the Vertex AI Model Registry """ + real_model = gen_real_model(source) + model_version = gen_mock_model_version(real_model) for wu in source._gen_ml_model_workunits( model=real_model, model_version=model_version ): @@ -458,10 +443,9 @@ def test_real_model_workunit( @pytest.mark.skip(reason="Skipping, this is for debugging purpose") -def test_real_get_data_process_properties( - source: VertexAISource, real_autoML_tabular_job: _TrainingJob -) -> None: - for wu in source._gen_data_process_workunits(real_autoML_tabular_job): +def test_real_get_data_process_properties(source: VertexAISource) -> None: + real_autoML_tabular_job = gen_real_autoML_tabular_job(source) + for wu in source._get_training_job_workunits(real_autoML_tabular_job): assert hasattr(wu.metadata, "aspect") aspect = wu.metadata.aspect if isinstance(aspect, DataProcessInstancePropertiesClass): From 4b093653270167e0339a2d4a9bef0b8828288f0d Mon Sep 17 00:00:00 2001 From: Ryota Egashira Date: Tue, 4 Mar 2025 09:47:39 -0800 Subject: [PATCH 51/59] delete test case using real model --- .../tests/unit/test_vertexai_source.py | 63 ------------------- 1 file changed, 63 deletions(-) diff --git a/metadata-ingestion/tests/unit/test_vertexai_source.py b/metadata-ingestion/tests/unit/test_vertexai_source.py index b310dc18b6e696..cb4057bf4cd287 100644 --- a/metadata-ingestion/tests/unit/test_vertexai_source.py +++ b/metadata-ingestion/tests/unit/test_vertexai_source.py @@ -126,35 +126,6 @@ def source() -> VertexAISource: config=VertexAIConfig(project_id=PROJECT_ID, region=REGION), ) - -def gen_real_model(source: VertexAISource) -> Model: - """ - Fixture for the model that is actually registered in the Vertex AI Model Registry - Use mock_model for local testing purpose, but this fixture is provided to use real model for debugging. - Replace model name with your real model when using this fixture. 
- """ - model_name = "projects/872197881936/locations/us-west2/models/3583871344875405312" - return Model(model_name=model_name) - - -def gen_real_autoML_tabular_job(source: VertexAISource) -> _TrainingJob: - """ - Fixture for the training job that is actually registered in the Vertex AI Model Registry - Use mock_training_job for local testing purpose, but this fixture is provided to use real training job for debugging. - Replace training job name with your real training job when using this fixture. - """ - - # Initialize the AI Platform client - aiplatform.init(project=source.config.project_id, location=source.config.region) - - # Retrieve the custom training job by its resource name - # resource_name format 'projects/your-project-id/locations/your-location/trainingPipelines/your-training-job-id') - job = aiplatform.AutoMLTabularTrainingJob.get( - resource_name="projects/872197881936/locations/us-west2/trainingPipelines/5401695018589093888" - ) - return job - - @patch("google.cloud.aiplatform.Model.list") def test_get_ml_model_workunits(mock_list: List[Model], source: VertexAISource) -> None: mock_models = gen_mock_models() @@ -419,37 +390,3 @@ def test_make_job_urn(source: VertexAISource) -> None: source._make_job_urn(mock_training_job) == f"{builder.make_data_process_instance_urn(source._make_vertexai_job_name(mock_training_job.name))}" ) - - -@pytest.mark.skip(reason="Skipping, this is for debugging purpose") -def test_real_model_workunit(source: VertexAISource) -> None: - """ - Disabled as default - Use real model registered in the Vertex AI Model Registry - """ - real_model = gen_real_model(source) - model_version = gen_mock_model_version(real_model) - for wu in source._gen_ml_model_workunits( - model=real_model, model_version=model_version - ): - assert hasattr(wu.metadata, "aspect") - aspect = wu.metadata.aspect - assert isinstance(aspect, MLModelProperties) - # aspect is MLModelPropertiesClass - assert aspect.description == model_version.version_description - assert aspect.date == model_version.version_create_time - assert aspect.hyperParams is None - assert aspect.trainingMetrics is None - - -@pytest.mark.skip(reason="Skipping, this is for debugging purpose") -def test_real_get_data_process_properties(source: VertexAISource) -> None: - real_autoML_tabular_job = gen_real_autoML_tabular_job(source) - for wu in source._get_training_job_workunits(real_autoML_tabular_job): - assert hasattr(wu.metadata, "aspect") - aspect = wu.metadata.aspect - if isinstance(aspect, DataProcessInstancePropertiesClass): - # aspect is DataProcessInstancePropertiesClass - assert aspect.externalUrl == source._make_job_external_url( - real_autoML_tabular_job - ) From eb261c3ee26a8e87d2712d2d00472d2027cb9dcb Mon Sep 17 00:00:00 2001 From: Ryota Egashira Date: Tue, 4 Mar 2025 09:58:13 -0800 Subject: [PATCH 52/59] delete commented out code --- .../src/datahub/ingestion/source/vertexai.py | 108 ------------------ .../tests/unit/test_vertexai_source.py | 3 +- 2 files changed, 1 insertion(+), 110 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py index 3b723d5c1c0d7e..a561ca8916d7aa 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py +++ b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py @@ -330,60 +330,6 @@ def _make_ml_model_group_urn(self, model: Model) -> str: ) return urn - # def _gen_data_process_workunits( - # self, job: VertexAiResourceNoun - # ) -> Iterable[MetadataWorkUnit]: - 
# """ - # Generate a work unit for VertexAI Training Job - # """ - # - # created_time = ( - # int(job.create_time.timestamp() * 1000) - # if job.create_time - # else int(time.time() * 1000) - # ) - # created_actor = f"urn:li:platformResource:{self.platform}" - # - # job_id = self._make_vertexai_job_name(entity_id=job.name) - # job_urn = builder.make_data_process_instance_urn(job_id) - # - # aspects: List[_Aspect] = list() - # aspects.append( - # DataProcessInstancePropertiesClass( - # name=job_id, - # created=AuditStampClass( - # time=created_time, - # actor=created_actor, - # ), - # externalUrl=self._make_job_external_url(job), - # customProperties={ - # "displayName": job.display_name, - # "jobType": job.__class__.__name__, - # }, - # ) - # ) - # - # aspects.append( - # MLTrainingRunProperties( - # externalUrl=self._make_job_external_url(job), id=job.name - # ) - # ) - # aspects.append(SubTypesClass(typeNames=[MLTypes.TRAINING_JOB])) - # - # aspects.append(ContainerClass(container=self._get_project_container().as_urn())) - # - # # TODO add status of the job - # # aspects.append( - # # DataProcessInstanceRunEventClass( - # # status=DataProcessRunStatusClass.COMPLETE, - # # timestampMillis=0 - # # ) - # # } - # - # yield from auto_workunit( - # MetadataChangeProposalWrapper.construct_many(job_urn, aspects=aspects) - # ) - def _get_project_container(self) -> ProjectIdKey: return ProjectIdKey(project_id=self.config.project_id, platform=self.platform) @@ -404,36 +350,6 @@ def _search_model_version( return version return None - # def _get_job_output_workunits( - # self, job: VertexAiResourceNoun - # ) -> Iterable[MetadataWorkUnit]: - # """ - # This method creates work units that link the training job to the model version - # that it produces. It checks if the job configuration contains a model to upload, - # and if so, it generates a work unit for the model version with the training job - # as part of its properties. - # """ - # - # job_conf = job.to_dict() - # if ( - # "modelToUpload" in job_conf - # and "name" in job_conf["modelToUpload"] - # and job_conf["modelToUpload"]["name"] - # ): - # model_version_str = job_conf["modelToUpload"]["versionId"] - # job_urn = self._make_job_urn(job) - # - # model = Model(model_name=job_conf["modelToUpload"]["name"]) - # model_version = self._search_model_version(model, model_version_str) - # if model and model_version: - # logger.info( - # f"Found output model (name:{model.display_name} id:{model_version_str}) " - # f"for training job: {job.display_name}" - # ) - # yield from self._gen_ml_model_endpoint_workunits( - # model, model_version, job_urn - # ) - def _search_dataset(self, dataset_id: str) -> Optional[VertexAiResourceNoun]: """ Search for a dataset by its ID in Vertex AI. @@ -491,30 +407,6 @@ def _get_dataset_workunits( MetadataChangeProposalWrapper.construct_many(dataset_urn, aspects=aspects) ) - # def _get_job_input_workunits( - # self, job: VertexAiResourceNoun - # ) -> Iterable[MetadataWorkUnit]: - # """ - # Generate work units for the input data of a training job. - # This method checks if the training job is an AutoML job and if it has an input dataset - # configuration. If so, it creates a work unit for the input dataset. 
- # """ - # - # if self._is_automl_job(job): - # job_conf = job.to_dict() - # if ( - # "inputDataConfig" in job_conf - # and "datasetId" in job_conf["inputDataConfig"] - # ): - # # Create URN of Input Dataset for Training Job - # dataset_id = job_conf["inputDataConfig"]["datasetId"] - # logger.info( - # f"Found input dataset (id: {dataset_id}) for training job ({job.display_name})" - # ) - # - # if dataset_id: - # yield from self._gen_input_dataset_workunits(job, dataset_id) - def _get_training_job_metadata( self, job: VertexAiResourceNoun ) -> TrainingJobMetadata: diff --git a/metadata-ingestion/tests/unit/test_vertexai_source.py b/metadata-ingestion/tests/unit/test_vertexai_source.py index cb4057bf4cd287..3bb822274a2ed2 100644 --- a/metadata-ingestion/tests/unit/test_vertexai_source.py +++ b/metadata-ingestion/tests/unit/test_vertexai_source.py @@ -5,11 +5,9 @@ from unittest.mock import MagicMock, patch import pytest -from google.cloud import aiplatform from google.cloud.aiplatform import AutoMLTabularTrainingJob from google.cloud.aiplatform.base import VertexAiResourceNoun from google.cloud.aiplatform.models import Endpoint, Model, VersionInfo -from google.cloud.aiplatform.training_jobs import _TrainingJob from google.protobuf import timestamp_pb2 import datahub.emitter.mce_builder as builder @@ -126,6 +124,7 @@ def source() -> VertexAISource: config=VertexAIConfig(project_id=PROJECT_ID, region=REGION), ) + @patch("google.cloud.aiplatform.Model.list") def test_get_ml_model_workunits(mock_list: List[Model], source: VertexAISource) -> None: mock_models = gen_mock_models() From e6feb8a35ff31ba1fa1ac2e53183a2862ddbc23b Mon Sep 17 00:00:00 2001 From: Ryota Egashira Date: Tue, 4 Mar 2025 12:54:57 -0800 Subject: [PATCH 53/59] consolidate use of auto_workunit and change func output to mcps --- .../src/datahub/ingestion/source/vertexai.py | 179 +++++++-------- .../tests/unit/test_vertexai_source.py | 209 ++++++++---------- 2 files changed, 174 insertions(+), 214 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py index a561ca8916d7aa..97a4b777fd8f59 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py +++ b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py @@ -151,9 +151,9 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: # Ingest Project yield from self._gen_project_workunits() # Fetch and Ingest Models, Model Versions a from Model Registry - yield from self._get_ml_models_workunits() + yield from auto_workunit(self._get_ml_models_mcps()) # Fetch and Ingest Training Jobs - yield from self._get_training_jobs_workunits() + yield from auto_workunit(self._get_training_jobs_mcps()) # TODO Fetch Experiments and Experiment Runs def _gen_project_workunits(self) -> Iterable[MetadataWorkUnit]: @@ -163,25 +163,25 @@ def _gen_project_workunits(self) -> Iterable[MetadataWorkUnit]: sub_types=[MLTypes.PROJECT], ) - def _get_ml_models_workunits(self) -> Iterable[MetadataWorkUnit]: + def _get_ml_models_mcps(self) -> Iterable[MetadataChangeProposalWrapper]: """ Fetch List of Models in Model Registry and generate a corresponding work unit. 
""" registered_models = self.client.Model.list() for model in registered_models: # create work unit for Model Group (= Model in VertexAI) - yield from self._gen_ml_group_workunits(model) + yield from self._gen_ml_group_mcps(model) model_versions = model.versioning_registry.list_versions() for model_version in model_versions: # create work unit for Model (= Model Version in VertexAI) logger.info( f"Ingesting a model (name: {model.display_name} id:{model.name})" ) - yield from self._gen_ml_model_endpoint_workunits( + yield from self._gen_ml_model_endpoint_mcps( model=model, model_version=model_version ) - def _get_training_jobs_workunits(self) -> Iterable[MetadataWorkUnit]: + def _get_training_jobs_mcps(self) -> Iterable[MetadataChangeProposalWrapper]: """ Fetches training jobs from Vertex AI and generates corresponding work units. This method retrieves various types of training jobs from Vertex AI, including @@ -205,33 +205,35 @@ def _get_training_jobs_workunits(self) -> Iterable[MetadataWorkUnit]: for class_name in class_names: logger.info(f"Fetching a list of {class_name}s from VertexAI server") for job in getattr(self.client, class_name).list(): - yield from self._get_training_job_workunits(job) + yield from self._get_training_job_mcps(job) - def _get_training_job_workunits( + def _get_training_job_mcps( self, job: VertexAiResourceNoun - ) -> Iterable[MetadataWorkUnit]: + ) -> Iterable[MetadataChangeProposalWrapper]: job_meta: TrainingJobMetadata = self._get_training_job_metadata(job) # Create DataProcessInstance for the training job - yield from self._gen_training_job_workunits(job_meta) + yield from self._gen_training_job_mcps(job_meta) + # Create Dataset entity for Input Dataset of Training job + yield from self._get_input_dataset_mcps(job_meta) # Create ML Model entity for output ML model of this training job - yield from self._gen_output_model_workunits(job_meta) + yield from self._gen_output_model_mcps(job_meta) - def _gen_output_model_workunits( + def _gen_output_model_mcps( self, job_meta: TrainingJobMetadata - ) -> Iterable[MetadataWorkUnit]: + ) -> Iterable[MetadataChangeProposalWrapper]: if job_meta.output_model and job_meta.output_model_version: job = job_meta.job job_urn = builder.make_data_process_instance_urn( self._make_vertexai_job_name(entity_id=job.name) ) - yield from self._gen_ml_model_endpoint_workunits( + yield from self._gen_ml_model_endpoint_mcps( job_meta.output_model, job_meta.output_model_version, job_urn ) - def _gen_training_job_workunits( + def _gen_training_job_mcps( self, job_meta: TrainingJobMetadata - ) -> Iterable[MetadataWorkUnit]: + ) -> Iterable[MetadataChangeProposalWrapper]: """ Generate a work unit for VertexAI Training Job """ @@ -282,14 +284,14 @@ def _gen_training_job_workunits( DataProcessInstanceInputClass(inputs=[dataset_urn]), ) - yield from auto_workunit( - MetadataChangeProposalWrapper.construct_many(job_urn, aspects=aspects) + yield from MetadataChangeProposalWrapper.construct_many( + job_urn, aspects=aspects ) - def _gen_ml_group_workunits( + def _gen_ml_group_mcps( self, model: Model, - ) -> Iterable[MetadataWorkUnit]: + ) -> Iterable[MetadataChangeProposalWrapper]: """ Generate an MLModelGroup work unit for a VertexAI Model. 
""" @@ -316,10 +318,8 @@ def _gen_ml_group_workunits( # aspects.append(SubTypesClass(typeNames=[MLTypes.MODEL_GROUP])) # aspects.append(ContainerClass(container=self._get_project_container().as_urn())) - yield from auto_workunit( - MetadataChangeProposalWrapper.construct_many( - ml_model_group_urn, aspects=aspects - ) + yield from MetadataChangeProposalWrapper.construct_many( + ml_model_group_urn, aspects=aspects ) def _make_ml_model_group_urn(self, model: Model) -> str: @@ -375,37 +375,48 @@ def _search_dataset(self, dataset_id: str) -> Optional[VertexAiResourceNoun]: return self.datasets.get(dataset_id) if dataset_id in self.datasets else None - def _get_dataset_workunits( - self, dataset_urn: str, ds: VertexAiResourceNoun - ) -> Iterable[MetadataWorkUnit]: + def _get_input_dataset_mcps( + self, job_meta: TrainingJobMetadata + ) -> Iterable[MetadataChangeProposalWrapper]: """ Create a DatasetPropertiesClass aspect for a given Vertex AI dataset. """ + ds = job_meta.input_dataset - # Create aspects for the dataset - aspects: List[_Aspect] = list() - aspects.append( - DatasetPropertiesClass( - name=self._make_vertexai_dataset_name(ds.name), - created=TimeStampClass(time=int(ds.create_time.timestamp() * 1000)) - if ds.create_time - else None, - description=f"Dataset: {ds.display_name}", - customProperties={ - "displayName": ds.display_name, - "resourceName": ds.resource_name, - }, - qualifiedName=ds.resource_name, + if ds: + # Create URN of Input Dataset for Training Job + dataset_name = self._make_vertexai_dataset_name(entity_id=ds.name) + dataset_urn = builder.make_dataset_urn( + platform=self.platform, + name=dataset_name, + env=self.config.env, ) - ) - aspects.append(SubTypesClass(typeNames=[MLTypes.DATASET])) + # Create aspects for the dataset + aspects: List[_Aspect] = list() + aspects.append( + DatasetPropertiesClass( + name=self._make_vertexai_dataset_name(ds.name), + created=TimeStampClass(time=int(ds.create_time.timestamp() * 1000)) + if ds.create_time + else None, + description=f"Dataset: {ds.display_name}", + customProperties={ + "displayName": ds.display_name, + "resourceName": ds.resource_name, + }, + qualifiedName=ds.resource_name, + ) + ) - # Create a container for Project as parent of the dataset - aspects.append(ContainerClass(container=self._get_project_container().as_urn())) - yield from auto_workunit( - MetadataChangeProposalWrapper.construct_many(dataset_urn, aspects=aspects) - ) + aspects.append(SubTypesClass(typeNames=[MLTypes.DATASET])) + # Create a container for Project as parent of the dataset + aspects.append( + ContainerClass(container=self._get_project_container().as_urn()) + ) + yield from MetadataChangeProposalWrapper.construct_many( + dataset_urn, aspects=aspects + ) def _get_training_job_metadata( self, job: VertexAiResourceNoun @@ -435,9 +446,12 @@ def _get_training_job_metadata( ) if dataset_id: - job_meta.input_dataset = ( - self._search_dataset(dataset_id) if dataset_id else None - ) + input_ds = self._search_dataset(dataset_id) + if input_ds: + logger.info( + f"Found the name of input dataset ({input_ds.display_name}) with dataset id ({dataset_id})" + ) + job_meta.input_dataset = input_ds # Check if output model is present in the job configuration if ( @@ -464,46 +478,9 @@ def _get_training_job_metadata( return job_meta - def _gen_input_dataset_workunits( - self, job: VertexAiResourceNoun, dataset_id: str - ) -> Iterable[MetadataWorkUnit]: - """ - This method creates a work unit for the input dataset of a training job. 
It constructs the URN - for the input dataset and the training job, and then creates a DataProcessInstanceInputClass aspect - to link the input dataset to the training job. - """ - - # Create URN of Input Dataset for Training Job - dataset_name = self._make_vertexai_dataset_name(entity_id=dataset_id) - dataset_urn = builder.make_dataset_urn( - platform=self.platform, - name=dataset_name, - env=self.config.env, - ) - - dataset = self._search_dataset(dataset_id) if dataset_id else None - if dataset: - logger.info( - f"Found the name of input dataset ({dataset_name}) with dataset id ({dataset_id})" - ) - # Yield aspect of input dataset - yield from self._get_dataset_workunits(dataset_urn=dataset_urn, ds=dataset) - - # Yield aspect(DataProcessInstanceInputClass) of training job - job_id = self._make_vertexai_job_name(entity_id=job.name) - yield MetadataChangeProposalWrapper( - entityUrn=builder.make_data_process_instance_urn(job_id), - aspect=DataProcessInstanceInputClass(inputs=[dataset_urn]), - ).as_workunit() - - else: - logger.error( - f"Unable to find the name of input dataset ({dataset_name}) with dataset id ({dataset_id})" - ) - - def _gen_endpoint_workunits( + def _gen_endpoint_mcps( self, endpoint: Endpoint, model: Model, model_version: VersionInfo - ) -> Iterable[MetadataWorkUnit]: + ) -> Iterable[MetadataChangeProposalWrapper]: endpoint_urn = builder.make_ml_model_deployment_urn( platform=self.platform, deployment_name=self._make_vertexai_endpoint_name( @@ -530,16 +507,16 @@ def _gen_endpoint_workunits( aspects.append(SubTypesClass(typeNames=[MLTypes.ENDPOINT])) - yield from auto_workunit( - MetadataChangeProposalWrapper.construct_many(endpoint_urn, aspects=aspects) + yield from MetadataChangeProposalWrapper.construct_many( + endpoint_urn, aspects=aspects ) - def _gen_ml_model_endpoint_workunits( + def _gen_ml_model_endpoint_mcps( self, model: Model, model_version: VersionInfo, training_job_urn: Optional[str] = None, - ) -> Iterable[MetadataWorkUnit]: + ) -> Iterable[MetadataChangeProposalWrapper]: """ Generate an MLModel and Endpoint work unit for an VertexAI Model Version. """ @@ -548,19 +525,19 @@ def _gen_ml_model_endpoint_workunits( endpoint_urn = None if endpoint: - yield from self._gen_endpoint_workunits(endpoint, model, model_version) + yield from self._gen_endpoint_mcps(endpoint, model, model_version) - yield from self._gen_ml_model_workunits( + yield from self._gen_ml_model_mcps( model, model_version, training_job_urn, endpoint_urn ) - def _gen_ml_model_workunits( + def _gen_ml_model_mcps( self, model: Model, model_version: VersionInfo, training_job_urn: Optional[str] = None, endpoint_urn: Optional[str] = None, - ) -> Iterable[MetadataWorkUnit]: + ) -> Iterable[MetadataChangeProposalWrapper]: """ Generate an MLModel workunit for an VertexAI Model Version. 
Every Model Version is a DataHub MLModel entity associated with an MLModelGroup @@ -617,10 +594,8 @@ def _gen_ml_model_workunits( # ) # ) - yield from auto_workunit( - MetadataChangeProposalWrapper.construct_many( - entityUrn=model_urn, aspects=aspects - ) + yield from MetadataChangeProposalWrapper.construct_many( + entityUrn=model_urn, aspects=aspects ) def _search_endpoint(self, model: Model) -> Optional[Endpoint]: diff --git a/metadata-ingestion/tests/unit/test_vertexai_source.py b/metadata-ingestion/tests/unit/test_vertexai_source.py index 3bb822274a2ed2..27784882b429bf 100644 --- a/metadata-ingestion/tests/unit/test_vertexai_source.py +++ b/metadata-ingestion/tests/unit/test_vertexai_source.py @@ -11,9 +11,10 @@ from google.protobuf import timestamp_pb2 import datahub.emitter.mce_builder as builder -from datahub.emitter.mcp_builder import ProjectIdKey from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.source.vertexai import ( + MLTypes, + TrainingJobMetadata, VertexAIConfig, VertexAISource, ) @@ -126,25 +127,25 @@ def source() -> VertexAISource: @patch("google.cloud.aiplatform.Model.list") -def test_get_ml_model_workunits(mock_list: List[Model], source: VertexAISource) -> None: +def test_get_ml_model_mcps(mock_list: List[Model], source: VertexAISource) -> None: mock_models = gen_mock_models() assert hasattr(mock_list, "return_value") # this check needed to go ground lint mock_list.return_value = mock_models - wcs = [wc for wc in source._get_ml_models_workunits()] - assert len(wcs) == 2 + mcps = [mcp for mcp in source._get_ml_models_mcps()] + assert len(mcps) == 2 # aspect is MLModelGroupPropertiesClass - assert hasattr(wcs[0].metadata, "aspect") - aspect = wcs[0].metadata.aspect + assert hasattr(mcps[0], "aspect") + aspect = mcps[0].aspect assert isinstance(aspect, MLModelGroupProperties) assert ( aspect.name == f"{source._make_vertexai_model_group_name(mock_models[0].name)}" ) assert aspect.description == mock_models[0].description - assert hasattr(wcs[1].metadata, "aspect") - aspect = wcs[1].metadata.aspect + assert hasattr(mcps[1], "aspect") + aspect = mcps[1].aspect assert isinstance(aspect, MLModelGroupProperties) assert ( aspect.name == f"{source._make_vertexai_model_group_name(mock_models[1].name)}" @@ -152,15 +153,15 @@ def test_get_ml_model_workunits(mock_list: List[Model], source: VertexAISource) assert aspect.description == mock_models[1].description -def test_get_ml_model_properties_workunit( +def test_get_ml_model_properties_mcps( source: VertexAISource, ) -> None: mock_model = gen_mock_model() model_version = gen_mock_model_version(mock_model) - wu = [wu for wu in source._gen_ml_model_workunits(mock_model, model_version)] - assert len(wu) == 1 - assert hasattr(wu[0].metadata, "aspect") - aspect = wu[0].metadata.aspect + mcp = [mcp for mcp in source._gen_ml_model_mcps(mock_model, model_version)] + assert len(mcp) == 1 + assert hasattr(mcp[0], "aspect") + aspect = mcp[0].aspect assert isinstance(aspect, MLModelProperties) assert ( aspect.name @@ -171,15 +172,15 @@ def test_get_ml_model_properties_workunit( assert aspect.hyperParams is None -def test_get_endpoint_workunit( +def test_get_endpoint_mcps( source: VertexAISource, ) -> None: mock_model = gen_mock_model() model_version = gen_mock_model_version(mock_model) mock_endpoint = gen_mock_endpoint() - for wu in source._gen_endpoint_workunits(mock_endpoint, mock_model, model_version): - assert hasattr(wu.metadata, "aspect") - aspect = wu.metadata.aspect + for mcp in 
source._gen_endpoint_mcps(mock_endpoint, mock_model, model_version): + assert hasattr(mcp, "aspect") + aspect = mcp.aspect if isinstance(aspect, MLModelDeploymentPropertiesClass): assert aspect.description == mock_model.description assert aspect.customProperties == { @@ -192,37 +193,23 @@ def test_get_endpoint_workunit( elif isinstance(aspect, SubTypesClass): assert aspect.typeNames == ["Endpoint"] - -def test_get_data_process_properties_workunit(source: VertexAISource) -> None: - mock_training_job = gen_mock_training_job() - for wu in source._get_training_job_workunits(mock_training_job): - assert hasattr(wu.metadata, "aspect") - aspect = wu.metadata.aspect - if isinstance(aspect, DataProcessInstancePropertiesClass): - assert ( - aspect.name - == f"{source._make_vertexai_job_name(mock_training_job.name)}" - ) - assert aspect.externalUrl == source._make_job_external_url( - mock_training_job - ) - assert ( - aspect.customProperties["displayName"] == mock_training_job.display_name - ) - elif isinstance(aspect, SubTypesClass): - assert "Training Job" in aspect.typeNames - - -def test_get_data_process_input_workunit(source: VertexAISource) -> None: +def test_get_training_jobs_mcps( + source: VertexAISource, +) -> None: mock_training_job = gen_mock_training_job() + mock_training_automl_job = gen_mock_training_automl_job() with contextlib.ExitStack() as exit_stack: for func_to_mock in [ "google.cloud.aiplatform.init", - "google.cloud.aiplatform.datasets.TextDataset.list", - "google.cloud.aiplatform.datasets.TabularDataset.list", - "google.cloud.aiplatform.datasets.ImageDataset.list", - "google.cloud.aiplatform.datasets.TimeSeriesDataset.list", - "google.cloud.aiplatform.datasets.VideoDataset.list", + "google.cloud.aiplatform.CustomJob.list", + "google.cloud.aiplatform.CustomTrainingJob.list", + "google.cloud.aiplatform.CustomContainerTrainingJob.list", + "google.cloud.aiplatform.CustomPythonPackageTrainingJob.list", + "google.cloud.aiplatform.AutoMLTabularTrainingJob.list", + "google.cloud.aiplatform.AutoMLImageTrainingJob.list", + "google.cloud.aiplatform.AutoMLTextTrainingJob.list", + "google.cloud.aiplatform.AutoMLVideoTrainingJob.list", + "google.cloud.aiplatform.AutoMLForecastingTrainingJob.list", ]: mock = exit_stack.enter_context(patch(func_to_mock)) if func_to_mock == "google.cloud.aiplatform.CustomJob.list": @@ -230,11 +217,65 @@ def test_get_data_process_input_workunit(source: VertexAISource) -> None: else: mock.return_value = [] - for wu in source._gen_input_dataset_workunits(mock_training_job, "12345"): - assert hasattr(wu.metadata, "aspect") - aspect = wu.metadata.aspect - assert isinstance(aspect, DataProcessInstanceInputClass) - assert len(aspect.inputs) == 1 + """ + Test the retrieval of training jobs work units from Vertex AI. 
+ This function mocks customJob and AutoMLTabularTrainingJob, + and verifies the properties of the work units + """ + for mcp in source._get_training_jobs_mcps(): + assert hasattr(mcp, "aspect") + aspect = mcp.aspect + if isinstance(aspect, DataProcessInstancePropertiesClass): + assert ( + aspect.name + == f"{source.config.project_id}.job.{mock_training_job.name}" + or f"{source.config.project_id}.job.{mock_training_automl_job.name}" + ) + assert ( + aspect.customProperties["displayName"] + == mock_training_job.display_name + or mock_training_automl_job.display_name + ) + if isinstance(aspect, SubTypesClass): + assert aspect.typeNames == [MLTypes.TRAINING_JOB] + + if isinstance(aspect, ContainerClass): + assert aspect.container == source._get_project_container().as_urn() + + +def test_gen_training_job_mcps(source: VertexAISource) -> None: + mock_training_job = gen_mock_training_job() + mock_dataset = gen_mock_dataset() + mock_job = gen_mock_training_job() + job_meta = TrainingJobMetadata(mock_job, input_dataset=mock_dataset) + + for mcp in source._gen_training_job_mcps(job_meta): + assert hasattr(mcp, "aspect") + aspect = mcp.aspect + if isinstance(aspect, DataProcessInstancePropertiesClass): + assert ( + aspect.name + == f"{source.config.project_id}.job.{mock_training_job.name}" + ) + assert ( + aspect.customProperties["displayName"] == mock_training_job.display_name + ) + if isinstance(aspect, SubTypesClass): + assert aspect.typeNames == [MLTypes.TRAINING_JOB] + + if isinstance(aspect, ContainerClass): + assert aspect.container == source._get_project_container().as_urn() + + if isinstance(aspect, DataProcessInstanceInputClass): + dataset_name = source._make_vertexai_dataset_name( + entity_id=mock_dataset.name + ) + dataset_urn = builder.make_dataset_urn( + platform=source.platform, + name=dataset_name, + env=source.config.env, + ) + assert aspect.inputs == [dataset_urn] def test_vertexai_config_init(): @@ -301,70 +342,14 @@ def test_vertexai_config_init(): ) -def test_get_training_jobs_workunit( - source: VertexAISource, -) -> None: - mock_training_job = gen_mock_training_job() - mock_training_automl_job = gen_mock_training_automl_job() - with contextlib.ExitStack() as exit_stack: - for func_to_mock in [ - "google.cloud.aiplatform.init", - "google.cloud.aiplatform.CustomJob.list", - "google.cloud.aiplatform.CustomTrainingJob.list", - "google.cloud.aiplatform.CustomContainerTrainingJob.list", - "google.cloud.aiplatform.CustomPythonPackageTrainingJob.list", - "google.cloud.aiplatform.AutoMLTabularTrainingJob.list", - "google.cloud.aiplatform.AutoMLImageTrainingJob.list", - "google.cloud.aiplatform.AutoMLTextTrainingJob.list", - "google.cloud.aiplatform.AutoMLVideoTrainingJob.list", - "google.cloud.aiplatform.AutoMLForecastingTrainingJob.list", - ]: - mock = exit_stack.enter_context(patch(func_to_mock)) - if func_to_mock == "google.cloud.aiplatform.CustomJob.list": - mock.return_value = [mock_training_job] - else: - mock.return_value = [] - - container_key = ProjectIdKey( - project_id=source.config.project_id, platform=source.platform - ) - - """ - Test the retrieval of training jobs work units from Vertex AI. 
- This function mocks customJob and AutoMLTabularTrainingJob, - and verifies the properties of the work units - """ - for wc in source._get_training_jobs_workunits(): - assert hasattr(wc.metadata, "aspect") - aspect = wc.metadata.aspect - if isinstance(aspect, DataProcessInstancePropertiesClass): - assert ( - aspect.name - == f"{source.config.project_id}.job.{mock_training_job.name}" - or f"{source.config.project_id}.job.{mock_training_automl_job.name}" - ) - assert ( - aspect.customProperties["displayName"] - == mock_training_job.display_name - or mock_training_automl_job.display_name - ) - if isinstance(aspect, SubTypesClass): - assert aspect.typeNames == ["Training Job"] - - if isinstance(aspect, ContainerClass): - assert aspect.container == container_key.as_urn() - - -def test_get_dataset_workunit(source: VertexAISource) -> None: +def test_get_input_dataset_mcps(source: VertexAISource) -> None: mock_dataset = gen_mock_dataset() - dataset_urn = builder.make_dataset_urn( - platform=source.platform, - name=mock_dataset.name, - env=source.config.env, - ) - for wu in source._get_dataset_workunits(dataset_urn=dataset_urn, ds=mock_dataset): - assert hasattr(wu.metadata, "aspect") - aspect = wu.metadata.aspect + mock_job = gen_mock_training_job() + job_meta = TrainingJobMetadata(mock_job, input_dataset=mock_dataset) + + for mcp in source._get_input_dataset_mcps(job_meta): + assert hasattr(mcp, "aspect") + aspect = mcp.aspect if isinstance(aspect, DataProcessInstancePropertiesClass): assert aspect.name == f"{source._make_vertexai_job_name(mock_dataset.name)}" assert aspect.customProperties["displayName"] == mock_dataset.display_name From b31d0f6c988c8fdaaccd96ea824477c72b27f17d Mon Sep 17 00:00:00 2001 From: Ryota Egashira Date: Tue, 4 Mar 2025 13:17:09 -0800 Subject: [PATCH 54/59] fix comment --- .../src/datahub/ingestion/source/vertexai.py | 22 +++++++++---------- .../tests/unit/test_vertexai_source.py | 1 + 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py index 97a4b777fd8f59..0bb4b360b62eeb 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py +++ b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py @@ -143,7 +143,7 @@ def get_report(self) -> SourceReport: def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: """ - Main Function to fetch and yields work units for various VertexAI resources. + Main Function to fetch and yields mcps for various VertexAI resources. - Models and Model Versions from the Model Registry - Training Jobs """ @@ -165,15 +165,15 @@ def _gen_project_workunits(self) -> Iterable[MetadataWorkUnit]: def _get_ml_models_mcps(self) -> Iterable[MetadataChangeProposalWrapper]: """ - Fetch List of Models in Model Registry and generate a corresponding work unit. + Fetch List of Models in Model Registry and generate a corresponding mcp. 
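        Roughly, the registry is walked as below, so each VertexAI Model becomes an
        MLModelGroup and each of its versions becomes an MLModel (names as in this module):

            for model in self.client.Model.list():
                yield from self._gen_ml_group_mcps(model)                   # -> MLModelGroup
                for version in model.versioning_registry.list_versions():
                    yield from self._gen_ml_model_endpoint_mcps(            # -> MLModel (+ Endpoint)
                        model=model, model_version=version
                    )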
""" registered_models = self.client.Model.list() for model in registered_models: - # create work unit for Model Group (= Model in VertexAI) + # create mcp for Model Group (= Model in VertexAI) yield from self._gen_ml_group_mcps(model) model_versions = model.versioning_registry.list_versions() for model_version in model_versions: - # create work unit for Model (= Model Version in VertexAI) + # create mcp for Model (= Model Version in VertexAI) logger.info( f"Ingesting a model (name: {model.display_name} id:{model.name})" ) @@ -183,11 +183,11 @@ def _get_ml_models_mcps(self) -> Iterable[MetadataChangeProposalWrapper]: def _get_training_jobs_mcps(self) -> Iterable[MetadataChangeProposalWrapper]: """ - Fetches training jobs from Vertex AI and generates corresponding work units. + Fetches training jobs from Vertex AI and generates corresponding mcps. This method retrieves various types of training jobs from Vertex AI, including CustomJob, CustomTrainingJob, CustomContainerTrainingJob, CustomPythonPackageTrainingJob, AutoMLTabularTrainingJob, AutoMLTextTrainingJob, AutoMLImageTrainingJob, AutoMLVideoTrainingJob, - and AutoMLForecastingTrainingJob. For each job, it generates work units containing metadata + and AutoMLForecastingTrainingJob. For each job, it generates mcps containing metadata about the job, its inputs, and its outputs. """ class_names = [ @@ -235,7 +235,7 @@ def _gen_training_job_mcps( self, job_meta: TrainingJobMetadata ) -> Iterable[MetadataChangeProposalWrapper]: """ - Generate a work unit for VertexAI Training Job + Generate a mcp for VertexAI Training Job """ job = job_meta.job job_id = self._make_vertexai_job_name(entity_id=job.name) @@ -293,7 +293,7 @@ def _gen_ml_group_mcps( model: Model, ) -> Iterable[MetadataChangeProposalWrapper]: """ - Generate an MLModelGroup work unit for a VertexAI Model. + Generate an MLModelGroup mcp for a VertexAI Model. """ ml_model_group_urn = self._make_ml_model_group_urn(model) @@ -518,7 +518,7 @@ def _gen_ml_model_endpoint_mcps( training_job_urn: Optional[str] = None, ) -> Iterable[MetadataChangeProposalWrapper]: """ - Generate an MLModel and Endpoint work unit for an VertexAI Model Version. + Generate an MLModel and Endpoint mcp for an VertexAI Model Version. """ endpoint: Optional[Endpoint] = self._search_endpoint(model) @@ -539,11 +539,11 @@ def _gen_ml_model_mcps( endpoint_urn: Optional[str] = None, ) -> Iterable[MetadataChangeProposalWrapper]: """ - Generate an MLModel workunit for an VertexAI Model Version. + Generate an MLModel mcp for an VertexAI Model Version. Every Model Version is a DataHub MLModel entity associated with an MLModelGroup corresponding to a registered Model in VertexAI Model Registry. 
""" - logging.info(f"starting model work unit for model {model.name}") + logging.info(f"generating model mcp for {model.name}") model_group_urn = self._make_ml_model_group_urn(model) model_name = self._make_vertexai_model_name(entity_id=model.name) diff --git a/metadata-ingestion/tests/unit/test_vertexai_source.py b/metadata-ingestion/tests/unit/test_vertexai_source.py index 27784882b429bf..566884ee42bc94 100644 --- a/metadata-ingestion/tests/unit/test_vertexai_source.py +++ b/metadata-ingestion/tests/unit/test_vertexai_source.py @@ -193,6 +193,7 @@ def test_get_endpoint_mcps( elif isinstance(aspect, SubTypesClass): assert aspect.typeNames == ["Endpoint"] + def test_get_training_jobs_mcps( source: VertexAISource, ) -> None: From 99269aa1544bd1cc0f0c290d2e69fd38c98282b7 Mon Sep 17 00:00:00 2001 From: Ryota Egashira Date: Wed, 5 Mar 2025 01:28:00 -0800 Subject: [PATCH 55/59] Add POJO for model and change logic of model extraction and mcps creation --- .../src/datahub/ingestion/source/vertexai.py | 182 ++++++++++-------- .../tests/unit/test_vertexai_source.py | 10 +- 2 files changed, 113 insertions(+), 79 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py index 0bb4b360b62eeb..ab384dc0c0566e 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py +++ b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py @@ -1,6 +1,7 @@ import dataclasses import logging import time +from collections import defaultdict from typing import Any, Iterable, List, Optional, TypeVar from google.api_core.exceptions import GoogleAPICallError @@ -106,6 +107,14 @@ class TrainingJobMetadata: output_model_version: Optional[VersionInfo] = None +@dataclasses.dataclass +class ModelMetadata: + model: Model + model_version: VersionInfo + training_job_urn: Optional[str] = None + endpoints: Optional[List[Endpoint]] = None + + @platform_name("Vertex AI", id="vertexai") @config_class(VertexAIConfig) @support_status(SupportStatus.TESTING) @@ -134,9 +143,8 @@ def __init__(self, ctx: PipelineContext, config: VertexAIConfig): project=config.project_id, location=config.region, credentials=credentials ) self.client = aiplatform - self.endpoints: Optional[List[Endpoint]] = None + self.endpoints: Optional[dict] = None self.datasets: Optional[dict] = None - self.model_name_separator = "_" def get_report(self) -> SourceReport: return self.report @@ -177,10 +185,28 @@ def _get_ml_models_mcps(self) -> Iterable[MetadataChangeProposalWrapper]: logger.info( f"Ingesting a model (name: {model.display_name} id:{model.name})" ) - yield from self._gen_ml_model_endpoint_mcps( + yield from self._get_ml_model_mcps( model=model, model_version=model_version ) + def _get_ml_model_mcps( + self, model: Model, model_version: VersionInfo + ) -> Iterable[MetadataChangeProposalWrapper]: + model_meta: ModelMetadata = self._get_ml_model_metadata(model, model_version) + # Create ML Model Entity + yield from self._gen_ml_model_mcps(model_meta) + # Create Endpoint Entity + yield from self._gen_endpoint_mcps(model_meta) + + def _get_ml_model_metadata( + self, model: Model, model_version: VersionInfo + ) -> ModelMetadata: + model_meta = ModelMetadata(model=model, model_version=model_version) + # Search for endpoints associated with the model + endpoints = self._search_endpoint(model) + model_meta.endpoints = endpoints + return model_meta + def _get_training_jobs_mcps(self) -> Iterable[MetadataChangeProposalWrapper]: """ Fetches training jobs from 
Vertex AI and generates corresponding mcps. @@ -227,8 +253,12 @@ def _gen_output_model_mcps( self._make_vertexai_job_name(entity_id=job.name) ) - yield from self._gen_ml_model_endpoint_mcps( - job_meta.output_model, job_meta.output_model_version, job_urn + yield from self._gen_ml_model_mcps( + ModelMetadata( + model=job_meta.output_model, + model_version=job_meta.output_model_version, + training_job_urn=job_urn, + ) ) def _gen_training_job_mcps( @@ -479,79 +509,81 @@ def _get_training_job_metadata( return job_meta def _gen_endpoint_mcps( - self, endpoint: Endpoint, model: Model, model_version: VersionInfo + self, model_meta: ModelMetadata ) -> Iterable[MetadataChangeProposalWrapper]: - endpoint_urn = builder.make_ml_model_deployment_urn( - platform=self.platform, - deployment_name=self._make_vertexai_endpoint_name( - entity_id=endpoint.display_name - ), - env=self.config.env, - ) - - aspects: List[_Aspect] = list() - aspects.append( - MLModelDeploymentPropertiesClass( - description=model.description, - createdAt=int(endpoint.create_time.timestamp() * 1000), - version=VersionTagClass(versionTag=str(model_version.version_id)), - customProperties={"displayName": endpoint.display_name}, - ) - ) + model: Model = model_meta.model + model_version: VersionInfo = model_meta.model_version + + if model_meta.endpoints: + for endpoint in model_meta.endpoints: + endpoint_urn = builder.make_ml_model_deployment_urn( + platform=self.platform, + deployment_name=self._make_vertexai_endpoint_name( + entity_id=endpoint.display_name + ), + env=self.config.env, + ) - aspects.append( - ContainerClass( - container=self._get_project_container().as_urn(), - ) - ) + aspects: List[_Aspect] = list() + aspects.append( + MLModelDeploymentPropertiesClass( + description=model.description, + createdAt=int(endpoint.create_time.timestamp() * 1000), + version=VersionTagClass( + versionTag=str(model_version.version_id) + ), + customProperties={"displayName": endpoint.display_name}, + ) + ) - aspects.append(SubTypesClass(typeNames=[MLTypes.ENDPOINT])) + # TODO add followings when metadata for MLModelDeployment is updated (these aspects not supported currently) + # aspects.append( + # ContainerClass(container=self._get_project_container().as_urn()) + # ) + # aspects.append(SubTypesClass(typeNames=[MLTypes.ENDPOINT])) - yield from MetadataChangeProposalWrapper.construct_many( - endpoint_urn, aspects=aspects - ) + yield from MetadataChangeProposalWrapper.construct_many( + endpoint_urn, aspects=aspects + ) - def _gen_ml_model_endpoint_mcps( - self, - model: Model, - model_version: VersionInfo, - training_job_urn: Optional[str] = None, + def _gen_ml_model_mcps( + self, ModelMetadata: ModelMetadata ) -> Iterable[MetadataChangeProposalWrapper]: """ Generate an MLModel and Endpoint mcp for an VertexAI Model Version. 
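        The MLModelProperties aspect emitted downstream is what ties the version back to
        its group, training job and serving endpoint; roughly (only the linkage fields shown):

            MLModelPropertiesClass(
                groups=[model_group_urn],
                trainingJobs=[training_job_urn] if training_job_urn else None,
                deployments=[endpoint_urn] if endpoint_urn else [],
            )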
""" - endpoint: Optional[Endpoint] = self._search_endpoint(model) - endpoint_urn = None - - if endpoint: - yield from self._gen_endpoint_mcps(endpoint, model, model_version) + model: Model = ModelMetadata.model + model_version: VersionInfo = ModelMetadata.model_version + training_job_urn: Optional[str] = ModelMetadata.training_job_urn + endpoints: Optional[List[Endpoint]] = ModelMetadata.endpoints + endpoint_urns: List[str] = list() - yield from self._gen_ml_model_mcps( - model, model_version, training_job_urn, endpoint_urn - ) - - def _gen_ml_model_mcps( - self, - model: Model, - model_version: VersionInfo, - training_job_urn: Optional[str] = None, - endpoint_urn: Optional[str] = None, - ) -> Iterable[MetadataChangeProposalWrapper]: - """ - Generate an MLModel mcp for an VertexAI Model Version. - Every Model Version is a DataHub MLModel entity associated with an MLModelGroup - corresponding to a registered Model in VertexAI Model Registry. - """ logging.info(f"generating model mcp for {model.name}") + # Generate list of endpoint URL + if endpoints: + for endpoint in endpoints: + logger.info( + f"found endpoint ({endpoint.display_name}) for model ({model.resource_name})" + ) + endpoint_urns.append( + builder.make_ml_model_deployment_urn( + platform=self.platform, + deployment_name=self._make_vertexai_endpoint_name( + entity_id=endpoint.display_name + ), + env=self.config.env, + ) + ) + + # Create URN for Model and Model Version model_group_urn = self._make_ml_model_group_urn(model) model_name = self._make_vertexai_model_name(entity_id=model.name) - model_version_name = ( - f"{model_name}{self.model_name_separator}{model_version.version_id}" - ) + model_version_name = f"{model_name}_{model_version.version_id}" model_urn = self._make_ml_model_urn(model_version, model_name=model_name) + # Create aspects for ML Model aspects: List[_Aspect] = list() aspects.append( @@ -559,9 +591,7 @@ def _gen_ml_model_mcps( name=model_version_name, description=model_version.version_description, customProperties={ - "displayName": model_version.model_display_name - + self.model_name_separator - + model_version.version_id, + "displayName": f"{model_version.model_display_name}_{model_version.version_id}", "resourceName": model.resource_name, }, created=TimeStampClass( @@ -579,15 +609,13 @@ def _gen_ml_model_mcps( trainingJobs=[training_job_urn] if training_job_urn else None, # link to training job - deployments=[endpoint_urn] - if endpoint_urn - else [], # link to model registry and endpoint + deployments=endpoint_urns, externalUrl=self._make_model_version_external_url(model), type="ML Model", ) ) - # TO BE ADDED: Create a container for Project as parent of the dataset + # TODO Add a container for Project as parent of the dataset # aspects.append( # ContainerClass( # container=self._get_project_container().as_urn(), @@ -598,24 +626,24 @@ def _gen_ml_model_mcps( entityUrn=model_urn, aspects=aspects ) - def _search_endpoint(self, model: Model) -> Optional[Endpoint]: + def _search_endpoint(self, model: Model) -> List[Endpoint]: """ Search for an endpoint associated with the model. 
""" - if self.endpoints is None: - self.endpoints = self.client.Endpoint.list() - for endpoint in self.endpoints: - deployed_models = endpoint.list_models() - if model.resource_name in deployed_models: - return endpoint + endpoint_dict = defaultdict(list) + for endpoint in self.client.Endpoint.list(): + for resource in endpoint.list_models(): + endpoint_dict[resource.model].append(endpoint) + self.endpoints = endpoint_dict - return None + endpoints = self.endpoints[model.resource_name] + return endpoints def _make_ml_model_urn(self, model_version: VersionInfo, model_name: str) -> str: urn = builder.make_ml_model_urn( platform=self.platform, - model_name=f"{model_name}{self.model_name_separator}{model_version.version_id}", + model_name=f"{model_name}_{model_version.version_id}", env=self.config.env, ) return urn diff --git a/metadata-ingestion/tests/unit/test_vertexai_source.py b/metadata-ingestion/tests/unit/test_vertexai_source.py index 566884ee42bc94..5985be5b77f6ba 100644 --- a/metadata-ingestion/tests/unit/test_vertexai_source.py +++ b/metadata-ingestion/tests/unit/test_vertexai_source.py @@ -14,6 +14,7 @@ from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.source.vertexai import ( MLTypes, + ModelMetadata, TrainingJobMetadata, VertexAIConfig, VertexAISource, @@ -158,7 +159,8 @@ def test_get_ml_model_properties_mcps( ) -> None: mock_model = gen_mock_model() model_version = gen_mock_model_version(mock_model) - mcp = [mcp for mcp in source._gen_ml_model_mcps(mock_model, model_version)] + model_meta = ModelMetadata(mock_model, model_version) + mcp = [mcp for mcp in source._gen_ml_model_mcps(model_meta)] assert len(mcp) == 1 assert hasattr(mcp[0], "aspect") aspect = mcp[0].aspect @@ -178,7 +180,11 @@ def test_get_endpoint_mcps( mock_model = gen_mock_model() model_version = gen_mock_model_version(mock_model) mock_endpoint = gen_mock_endpoint() - for mcp in source._gen_endpoint_mcps(mock_endpoint, mock_model, model_version): + model_meta = ModelMetadata( + model=mock_model, model_version=model_version, endpoints=[mock_endpoint] + ) + + for mcp in source._gen_endpoint_mcps(model_meta): assert hasattr(mcp, "aspect") aspect = mcp.aspect if isinstance(aspect, MLModelDeploymentPropertiesClass): From f900f6dbabe4cbcfd2c6562909d3469ea2e9c3e9 Mon Sep 17 00:00:00 2001 From: Ryota Egashira Date: Wed, 5 Mar 2025 01:54:35 -0800 Subject: [PATCH 56/59] use datetime_to_ts_millis helper --- .../src/datahub/ingestion/source/vertexai.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py index ab384dc0c0566e..a165de6fc383f7 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py +++ b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py @@ -1,7 +1,7 @@ import dataclasses import logging -import time from collections import defaultdict +from datetime import datetime from typing import Any, Iterable, List, Optional, TypeVar from google.api_core.exceptions import GoogleAPICallError @@ -54,6 +54,7 @@ VersionTagClass, ) from datahub.utilities.str_enum import StrEnum +from datahub.utilities.time import datetime_to_ts_millis T = TypeVar("T") @@ -272,9 +273,9 @@ def _gen_training_job_mcps( job_urn = builder.make_data_process_instance_urn(job_id) created_time = ( - int(job.create_time.timestamp() * 1000) + datetime_to_ts_millis(job.create_time) if job.create_time - else int(time.time() * 1000) + else 
datetime_to_ts_millis(datetime.now()) ) created_actor = f"urn:li:platformResource:{self.platform}" @@ -332,11 +333,11 @@ def _gen_ml_group_mcps( MLModelGroupPropertiesClass( name=self._make_vertexai_model_group_name(model.name), description=model.description, - created=TimeStampClass(time=int(model.create_time.timestamp() * 1000)) + created=TimeStampClass(time=datetime_to_ts_millis(model.create_time)) if model.create_time else None, lastModified=TimeStampClass( - time=int(model.update_time.timestamp() * 1000) + time=datetime_to_ts_millis(model.update_time) ) if model.update_time else None, @@ -427,7 +428,7 @@ def _get_input_dataset_mcps( aspects.append( DatasetPropertiesClass( name=self._make_vertexai_dataset_name(ds.name), - created=TimeStampClass(time=int(ds.create_time.timestamp() * 1000)) + created=TimeStampClass(time=datetime_to_ts_millis(ds.create_time)) if ds.create_time else None, description=f"Dataset: {ds.display_name}", @@ -528,7 +529,7 @@ def _gen_endpoint_mcps( aspects.append( MLModelDeploymentPropertiesClass( description=model.description, - createdAt=int(endpoint.create_time.timestamp() * 1000), + createdAt=datetime_to_ts_millis(endpoint.create_time), version=VersionTagClass( versionTag=str(model_version.version_id) ), @@ -595,12 +596,12 @@ def _gen_ml_model_mcps( "resourceName": model.resource_name, }, created=TimeStampClass( - int(model_version.version_create_time.timestamp() * 1000) + datetime_to_ts_millis(model_version.version_create_time) ) if model_version.version_create_time else None, lastModified=TimeStampClass( - int(model_version.version_update_time.timestamp() * 1000) + datetime_to_ts_millis(model_version.version_update_time) ) if model_version.version_update_time else None, From 5c46c595d72534d1e86756bc9b150d1dfe491bf8 Mon Sep 17 00:00:00 2001 From: Ryota Egashira Date: Wed, 5 Mar 2025 10:19:00 -0800 Subject: [PATCH 57/59] refactored unit test case for better assertion --- .../src/datahub/ingestion/source/vertexai.py | 4 +- .../vertexai/vertexai_mcps_golden.json | 127 ++++++++++++++---- .../tests/unit/test_vertexai_source.py | 127 +++++++++++++----- 3 files changed, 199 insertions(+), 59 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py index a165de6fc383f7..acff82bcf4dd35 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py +++ b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py @@ -356,7 +356,7 @@ def _gen_ml_group_mcps( def _make_ml_model_group_urn(self, model: Model) -> str: urn = builder.make_ml_model_group_urn( platform=self.platform, - group_name=self._make_vertexai_model_name(model.name), + group_name=self._make_vertexai_model_group_name(model.name), env=self.config.env, ) return urn @@ -520,7 +520,7 @@ def _gen_endpoint_mcps( endpoint_urn = builder.make_ml_model_deployment_urn( platform=self.platform, deployment_name=self._make_vertexai_endpoint_name( - entity_id=endpoint.display_name + entity_id=endpoint.name ), env=self.config.env, ) diff --git a/metadata-ingestion/tests/integration/vertexai/vertexai_mcps_golden.json b/metadata-ingestion/tests/integration/vertexai/vertexai_mcps_golden.json index 146b7e70c78849..f1a14f2b5ba9de 100644 --- a/metadata-ingestion/tests/integration/vertexai/vertexai_mcps_golden.json +++ b/metadata-ingestion/tests/integration/vertexai/vertexai_mcps_golden.json @@ -14,7 +14,7 @@ } }, "systemMetadata": { - "lastObserved": 1741107353572, + "lastObserved": 1741198547912, "runId": "vertexai-source-test", 
"lastRunId": "no-run-id-provided" } @@ -30,7 +30,7 @@ } }, "systemMetadata": { - "lastObserved": 1741107353573, + "lastObserved": 1741198547912, "runId": "vertexai-source-test", "lastRunId": "no-run-id-provided" } @@ -46,7 +46,7 @@ } }, "systemMetadata": { - "lastObserved": 1741107353573, + "lastObserved": 1741198547913, "runId": "vertexai-source-test", "lastRunId": "no-run-id-provided" } @@ -64,7 +64,7 @@ } }, "systemMetadata": { - "lastObserved": 1741107353573, + "lastObserved": 1741198547913, "runId": "vertexai-source-test", "lastRunId": "no-run-id-provided" } @@ -80,14 +80,14 @@ } }, "systemMetadata": { - "lastObserved": 1741107353574, + "lastObserved": 1741198547913, "runId": "vertexai-source-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "mlModelGroup", - "entityUrn": "urn:li:mlModelGroup:(urn:li:dataPlatform:vertexai,test-project-id.model.mock_prediction_model_1,PROD)", + "entityUrn": "urn:li:mlModelGroup:(urn:li:dataPlatform:vertexai,test-project-id.model_group.mock_prediction_model_1,PROD)", "changeType": "UPSERT", "aspectName": "mlModelGroupProperties", "aspect": { @@ -100,14 +100,14 @@ } }, "systemMetadata": { - "lastObserved": 1741107353574, + "lastObserved": 1741198547914, "runId": "vertexai-source-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "mlModelGroup", - "entityUrn": "urn:li:mlModelGroup:(urn:li:dataPlatform:vertexai,test-project-id.model.mock_prediction_model_2,PROD)", + "entityUrn": "urn:li:mlModelGroup:(urn:li:dataPlatform:vertexai,test-project-id.model_group.mock_prediction_model_2,PROD)", "changeType": "UPSERT", "aspectName": "mlModelGroupProperties", "aspect": { @@ -120,7 +120,7 @@ } }, "systemMetadata": { - "lastObserved": 1741107354209, + "lastObserved": 1741198547916, "runId": "vertexai-source-test", "lastRunId": "no-run-id-provided" } @@ -139,13 +139,13 @@ "externalUrl": "https://console.cloud.google.com/vertex-ai/training/training-pipelines?trainingPipelineId=mock_training_job?project=test-project-id", "name": "test-project-id.job.mock_training_job", "created": { - "time": 1741107354209, + "time": 1741198547915, "actor": "urn:li:platformResource:vertexai" } } }, "systemMetadata": { - "lastObserved": 1741107356953, + "lastObserved": 1741198547917, "runId": "vertexai-source-test", "lastRunId": "no-run-id-provided" } @@ -163,7 +163,7 @@ } }, "systemMetadata": { - "lastObserved": 1741107356956, + "lastObserved": 1741198547917, "runId": "vertexai-source-test", "lastRunId": "no-run-id-provided" } @@ -181,7 +181,7 @@ } }, "systemMetadata": { - "lastObserved": 1741107356958, + "lastObserved": 1741198547917, "runId": "vertexai-source-test", "lastRunId": "no-run-id-provided" } @@ -197,7 +197,7 @@ } }, "systemMetadata": { - "lastObserved": 1741107356958, + "lastObserved": 1741198547918, "runId": "vertexai-source-test", "lastRunId": "no-run-id-provided" } @@ -216,13 +216,13 @@ "externalUrl": "https://console.cloud.google.com/vertex-ai/training/training-pipelines?trainingPipelineId=mock_auto_automl_tabular_job?project=test-project-id", "name": "test-project-id.job.mock_auto_automl_tabular_job", "created": { - "time": 1741107356952, + "time": 1741198547917, "actor": "urn:li:platformResource:vertexai" } } }, "systemMetadata": { - "lastObserved": 1741107356959, + "lastObserved": 1741198547919, "runId": "vertexai-source-test", "lastRunId": "no-run-id-provided" } @@ -240,7 +240,7 @@ } }, "systemMetadata": { - "lastObserved": 1741107356960, + "lastObserved": 1741198547919, "runId": "vertexai-source-test", "lastRunId": "no-run-id-provided" } @@ 
-258,7 +258,7 @@ } }, "systemMetadata": { - "lastObserved": 1741107356960, + "lastObserved": 1741198547920, "runId": "vertexai-source-test", "lastRunId": "no-run-id-provided" } @@ -274,7 +274,84 @@ } }, "systemMetadata": { - "lastObserved": 1741107356961, + "lastObserved": 1741198547920, + "runId": "vertexai-source-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:test-project-id.job.mock_auto_automl_tabular_job", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceProperties", + "aspect": { + "json": { + "customProperties": { + "displayName": "mock_auto_automl_tabular_job_display_name", + "jobType": "AutoMLTabularTrainingJob" + }, + "externalUrl": "https://console.cloud.google.com/vertex-ai/training/training-pipelines?trainingPipelineId=mock_auto_automl_tabular_job?project=test-project-id", + "name": "test-project-id.job.mock_auto_automl_tabular_job", + "created": { + "time": 1741198547919, + "actor": "urn:li:platformResource:vertexai" + } + } + }, + "systemMetadata": { + "lastObserved": 1741198547920, + "runId": "vertexai-source-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:test-project-id.job.mock_auto_automl_tabular_job", + "changeType": "UPSERT", + "aspectName": "mlTrainingRunProperties", + "aspect": { + "json": { + "customProperties": {}, + "externalUrl": "https://console.cloud.google.com/vertex-ai/training/training-pipelines?trainingPipelineId=mock_auto_automl_tabular_job?project=test-project-id", + "id": "mock_auto_automl_tabular_job" + } + }, + "systemMetadata": { + "lastObserved": 1741198547921, + "runId": "vertexai-source-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:test-project-id.job.mock_auto_automl_tabular_job", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Training Job" + ] + } + }, + "systemMetadata": { + "lastObserved": 1741198547921, + "runId": "vertexai-source-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:test-project-id.job.mock_auto_automl_tabular_job", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:29746a9030349f4340ed74b46913dab6" + } + }, + "systemMetadata": { + "lastObserved": 1741198547922, "runId": "vertexai-source-test", "lastRunId": "no-run-id-provided" } @@ -290,7 +367,7 @@ } }, "systemMetadata": { - "lastObserved": 1741107356961, + "lastObserved": 1741198547922, "runId": "vertexai-source-test", "lastRunId": "no-run-id-provided" } @@ -306,14 +383,14 @@ } }, "systemMetadata": { - "lastObserved": 1741107356962, + "lastObserved": 1741198547922, "runId": "vertexai-source-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "mlModelGroup", - "entityUrn": "urn:li:mlModelGroup:(urn:li:dataPlatform:vertexai,test-project-id.model.mock_prediction_model_1,PROD)", + "entityUrn": "urn:li:mlModelGroup:(urn:li:dataPlatform:vertexai,test-project-id.model_group.mock_prediction_model_1,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -322,14 +399,14 @@ } }, "systemMetadata": { - "lastObserved": 1741107356962, + "lastObserved": 1741198547923, "runId": "vertexai-source-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "mlModelGroup", - "entityUrn": 
"urn:li:mlModelGroup:(urn:li:dataPlatform:vertexai,test-project-id.model.mock_prediction_model_2,PROD)", + "entityUrn": "urn:li:mlModelGroup:(urn:li:dataPlatform:vertexai,test-project-id.model_group.mock_prediction_model_2,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -338,7 +415,7 @@ } }, "systemMetadata": { - "lastObserved": 1741107356963, + "lastObserved": 1741198547923, "runId": "vertexai-source-test", "lastRunId": "no-run-id-provided" } diff --git a/metadata-ingestion/tests/unit/test_vertexai_source.py b/metadata-ingestion/tests/unit/test_vertexai_source.py index 5985be5b77f6ba..5d50526590bb0c 100644 --- a/metadata-ingestion/tests/unit/test_vertexai_source.py +++ b/metadata-ingestion/tests/unit/test_vertexai_source.py @@ -19,6 +19,7 @@ VertexAIConfig, VertexAISource, ) +from datahub.metadata._schema_classes import MLTrainingRunPropertiesClass from datahub.metadata.com.linkedin.pegasus2avro.ml.metadata import ( MLModelGroupProperties, MLModelProperties, @@ -133,25 +134,37 @@ def test_get_ml_model_mcps(mock_list: List[Model], source: VertexAISource) -> No assert hasattr(mock_list, "return_value") # this check needed to go ground lint mock_list.return_value = mock_models - mcps = [mcp for mcp in source._get_ml_models_mcps()] - assert len(mcps) == 2 - # aspect is MLModelGroupPropertiesClass + # Running _get_ml_models_mcps + actual_mcps = [mcp for mcp in source._get_ml_models_mcps()] - assert hasattr(mcps[0], "aspect") - aspect = mcps[0].aspect - assert isinstance(aspect, MLModelGroupProperties) - assert ( - aspect.name == f"{source._make_vertexai_model_group_name(mock_models[0].name)}" - ) - assert aspect.description == mock_models[0].description + actual_urns = [mcp.entityUrn for mcp in actual_mcps] + expected_urns = [] + for mock_model in mock_models: + expected_urns.append( + builder.make_ml_model_group_urn( + platform=source.platform, + group_name=source._make_vertexai_model_group_name(mock_model.name), + env=source.config.env, + ) + ) - assert hasattr(mcps[1], "aspect") - aspect = mcps[1].aspect - assert isinstance(aspect, MLModelGroupProperties) - assert ( - aspect.name == f"{source._make_vertexai_model_group_name(mock_models[1].name)}" - ) - assert aspect.description == mock_models[1].description + # expect 2 model groups + assert actual_urns == expected_urns + + for mcp in actual_mcps: + assert hasattr(mcp, "aspect") + aspect = mcp.aspect + if isinstance(aspect, MLModelGroupProperties): + assert ( + aspect.name + == f"{source._make_vertexai_model_group_name(mock_models[0].name)}" + or aspect.name + == f"{source._make_vertexai_model_group_name(mock_models[1].name)}" + ) + assert ( + aspect.description == mock_models[0].description + or aspect.description == mock_models[1].description + ) def test_get_ml_model_properties_mcps( @@ -160,6 +173,8 @@ def test_get_ml_model_properties_mcps( mock_model = gen_mock_model() model_version = gen_mock_model_version(mock_model) model_meta = ModelMetadata(mock_model, model_version) + + # Run _gen_ml_model_mcps mcp = [mcp for mcp in source._gen_ml_model_mcps(model_meta)] assert len(mcp) == 1 assert hasattr(mcp[0], "aspect") @@ -184,6 +199,21 @@ def test_get_endpoint_mcps( model=mock_model, model_version=model_version, endpoints=[mock_endpoint] ) + # Run _gen_endpoint_mcps + actual_mcps = [mcp for mcp in source._gen_endpoint_mcps(model_meta)] + actual_urns = [mcp.entityUrn for mcp in actual_mcps] + endpoint_urn = builder.make_ml_model_deployment_urn( + platform=source.platform, + 
deployment_name=source._make_vertexai_endpoint_name( + entity_id=mock_endpoint.name + ), + env=source.config.env, + ) + + expected_urns = [endpoint_urn] * 1 + # expect 1 endpoint urn + assert actual_urns == expected_urns + for mcp in source._gen_endpoint_mcps(model_meta): assert hasattr(mcp, "aspect") aspect = mcp.aspect @@ -193,11 +223,11 @@ def test_get_endpoint_mcps( "displayName": mock_endpoint.display_name } assert aspect.createdAt == int(mock_endpoint.create_time.timestamp() * 1000) - elif isinstance(aspect, ContainerClass): - assert aspect.container == source._get_project_container().as_urn() - - elif isinstance(aspect, SubTypesClass): - assert aspect.typeNames == ["Endpoint"] + # TODO: Add following when container/subtype supported + # elif isinstance(aspect, ContainerClass): + # assert aspect.container == source._get_project_container().as_urn() + # elif isinstance(aspect, SubTypesClass): + # assert aspect.typeNames == ["Endpoint"] def test_get_training_jobs_mcps( @@ -229,7 +259,19 @@ def test_get_training_jobs_mcps( This function mocks customJob and AutoMLTabularTrainingJob, and verifies the properties of the work units """ - for mcp in source._get_training_jobs_mcps(): + + # Run _get_training_jobs_mcps + actual_mcps = [mcp for mcp in source._get_training_jobs_mcps()] + actual_urns = [mcp.entityUrn for mcp in actual_mcps] + expected_urns = [ + builder.make_data_process_instance_urn( + source._make_vertexai_job_name(mock_training_job.name) + ) + ] * 4 # expect 4 aspects + + assert actual_urns == expected_urns + + for mcp in actual_mcps: assert hasattr(mcp, "aspect") aspect = mcp.aspect if isinstance(aspect, DataProcessInstancePropertiesClass): @@ -243,6 +285,11 @@ def test_get_training_jobs_mcps( == mock_training_job.display_name or mock_training_automl_job.display_name ) + if isinstance(aspect, MLTrainingRunPropertiesClass): + assert aspect.id == mock_training_job.name + assert aspect.externalUrl == source._make_job_external_url( + mock_training_job + ) if isinstance(aspect, SubTypesClass): assert aspect.typeNames == [MLTypes.TRAINING_JOB] @@ -256,7 +303,24 @@ def test_gen_training_job_mcps(source: VertexAISource) -> None: mock_job = gen_mock_training_job() job_meta = TrainingJobMetadata(mock_job, input_dataset=mock_dataset) - for mcp in source._gen_training_job_mcps(job_meta): + actual_mcps = [mcp for mcp in source._gen_training_job_mcps(job_meta)] + actual_urns = [mcp.entityUrn for mcp in actual_mcps] + expected_urns = [ + builder.make_data_process_instance_urn( + source._make_vertexai_job_name(mock_training_job.name) + ) + ] * 5 # expect 5 aspects under the same urn for the job + + assert actual_urns == expected_urns + + dataset_name = source._make_vertexai_dataset_name(entity_id=mock_dataset.name) + dataset_urn = builder.make_dataset_urn( + platform=source.platform, + name=dataset_name, + env=source.config.env, + ) + + for mcp in actual_mcps: assert hasattr(mcp, "aspect") aspect = mcp.aspect if isinstance(aspect, DataProcessInstancePropertiesClass): @@ -267,6 +331,12 @@ def test_gen_training_job_mcps(source: VertexAISource) -> None: assert ( aspect.customProperties["displayName"] == mock_training_job.display_name ) + if isinstance(aspect, MLTrainingRunPropertiesClass): + assert aspect.id == mock_training_job.name + assert aspect.externalUrl == source._make_job_external_url( + mock_training_job + ) + if isinstance(aspect, SubTypesClass): assert aspect.typeNames == [MLTypes.TRAINING_JOB] @@ -274,14 +344,6 @@ def test_gen_training_job_mcps(source: VertexAISource) -> None: assert 
aspect.container == source._get_project_container().as_urn() if isinstance(aspect, DataProcessInstanceInputClass): - dataset_name = source._make_vertexai_dataset_name( - entity_id=mock_dataset.name - ) - dataset_urn = builder.make_dataset_urn( - platform=source.platform, - name=dataset_name, - env=source.config.env, - ) assert aspect.inputs == [dataset_urn] @@ -354,6 +416,7 @@ def test_get_input_dataset_mcps(source: VertexAISource) -> None: mock_job = gen_mock_training_job() job_meta = TrainingJobMetadata(mock_job, input_dataset=mock_dataset) + # Run _get_input_dataset_mcps for mcp in source._get_input_dataset_mcps(job_meta): assert hasattr(mcp, "aspect") aspect = mcp.aspect From 1772b7e458692ea69b5b54803fc602b576745418 Mon Sep 17 00:00:00 2001 From: Ryota Egashira Date: Wed, 5 Mar 2025 12:25:00 -0800 Subject: [PATCH 58/59] Modified integration test to cover relationship between job to dataset, and job to model --- .../src/datahub/ingestion/source/vertexai.py | 25 +- .../integration/vertexai/test_vertexai.py | 48 +- .../vertexai/vertexai_mcps_golden.json | 443 ++++++++++++++++-- 3 files changed, 463 insertions(+), 53 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py index acff82bcf4dd35..d1d0b9043b18e4 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/vertexai.py +++ b/metadata-ingestion/src/datahub/ingestion/source/vertexai.py @@ -463,15 +463,14 @@ def _get_training_job_metadata( # Check if the job is an AutoML job if self._is_automl_job(job): - job_conf = job.to_dict() - # Check if input dataset is present in the job configuration if ( - "inputDataConfig" in job_conf - and "datasetId" in job_conf["inputDataConfig"] + hasattr(job, "_gca_resource") + and hasattr(job._gca_resource, "input_data_config") + and hasattr(job._gca_resource.input_data_config, "dataset_id") ): # Create URN of Input Dataset for Training Job - dataset_id = job_conf["inputDataConfig"]["datasetId"] + dataset_id = job._gca_resource.input_data_config.dataset_id logger.info( f"Found input dataset (id: {dataset_id}) for training job ({job.display_name})" ) @@ -485,15 +484,13 @@ def _get_training_job_metadata( job_meta.input_dataset = input_ds # Check if output model is present in the job configuration - if ( - "modelToUpload" in job_conf - and "name" in job_conf["modelToUpload"] - and job_conf["modelToUpload"]["name"] - and job_conf["modelToUpload"]["versionId"] + if hasattr(job, "_gca_resource") and hasattr( + job._gca_resource, "model_to_upload" ): - model_version_str = job_conf["modelToUpload"]["versionId"] + model_version_str = job._gca_resource.model_to_upload.version_id + model_name = job._gca_resource.model_to_upload.name try: - model = Model(model_name=job_conf["modelToUpload"]["name"]) + model = Model(model_name=model_name) model_version = self._search_model_version(model, model_version_str) if model and model_version: logger.info( @@ -506,7 +503,6 @@ def _get_training_job_metadata( logger.error( f"Error while fetching model version {model_version_str}" ) - return job_meta def _gen_endpoint_mcps( @@ -592,7 +588,8 @@ def _gen_ml_model_mcps( name=model_version_name, description=model_version.version_description, customProperties={ - "displayName": f"{model_version.model_display_name}_{model_version.version_id}", + "displayName": f"{model_version.model_display_name}", + "versionId": f"{model_version.version_id}", "resourceName": model.resource_name, }, created=TimeStampClass( diff --git 
a/metadata-ingestion/tests/integration/vertexai/test_vertexai.py b/metadata-ingestion/tests/integration/vertexai/test_vertexai.py index cbee5db8d22de9..740f6d1dcbe37f 100644 --- a/metadata-ingestion/tests/integration/vertexai/test_vertexai.py +++ b/metadata-ingestion/tests/integration/vertexai/test_vertexai.py @@ -5,10 +5,13 @@ import pytest from google.cloud.aiplatform import AutoMLTabularTrainingJob, CustomJob, Model +from google.cloud.aiplatform.base import VertexAiResourceNoun +from google.cloud.aiplatform.models import VersionInfo from google.protobuf import timestamp_pb2 from pytest import Config from datahub.ingestion.run.pipeline import Pipeline +from datahub.ingestion.source.vertexai import TrainingJobMetadata from tests.test_helpers import mce_helpers T = TypeVar("T") @@ -50,6 +53,7 @@ def gen_mock_models() -> List[Model]: mock_model_1.version_id = "1" mock_model_1.display_name = "mock_prediction_model_1_display_name" mock_model_1.description = "mock_prediction_model_1_description" + mock_model_1.resource_name = "projects/123/locations/us-central1/models/456" mock_model_2 = MagicMock(spec=Model) mock_model_2.name = "mock_prediction_model_2" @@ -59,6 +63,7 @@ def gen_mock_models() -> List[Model]: mock_model_2.version_id = "1" mock_model_2.display_name = "mock_prediction_model_2_display_name" mock_model_2.description = "mock_prediction_model_1_description" + mock_model_2.resource_name = "projects/123/locations/us-central1/models/789" return [mock_model_1, mock_model_2] @@ -84,7 +89,35 @@ def gen_mock_training_automl_job() -> AutoMLTabularTrainingJob: return mock_automl_job +def gen_mock_model_version(mock_model: Model) -> VersionInfo: + version = "1" + return VersionInfo( + version_id=version, + version_description="test", + version_create_time=timestamp_pb2.Timestamp().GetCurrentTime(), + version_update_time=timestamp_pb2.Timestamp().GetCurrentTime(), + model_display_name=mock_model.name, + model_resource_name=mock_model.resource_name, + ) + + +def gen_mock_dataset() -> VertexAiResourceNoun: + mock_dataset = MagicMock(spec=VertexAiResourceNoun) + mock_dataset.name = "mock_dataset" + mock_dataset.create_time = timestamp_pb2.Timestamp().GetCurrentTime() + mock_dataset.update_time = timestamp_pb2.Timestamp().GetCurrentTime() + mock_dataset.display_name = "mock_dataset_display_name" + mock_dataset.description = "mock_dataset_description" + mock_dataset.resource_name = "projects/123/locations/us-central1/datasets/456" + return mock_dataset + + def test_vertexai_source_ingestion(pytestconfig: Config, sink_file_path: str) -> None: + mock_automl_job = gen_mock_training_automl_job() + mock_models = gen_mock_models() + mock_model_version = gen_mock_model_version(mock_models[0]) + mock_dataset = gen_mock_dataset() + with contextlib.ExitStack() as exit_stack: for func_to_mock in [ "google.cloud.aiplatform.init", @@ -103,8 +136,10 @@ def test_vertexai_source_ingestion(pytestconfig: Config, sink_file_path: str) -> "google.cloud.aiplatform.AutoMLImageTrainingJob.list", "google.cloud.aiplatform.AutoMLVideoTrainingJob.list", "google.cloud.aiplatform.AutoMLForecastingTrainingJob.list", + "datahub.ingestion.source.vertexai.VertexAISource._get_training_job_metadata", ]: mock = exit_stack.enter_context(patch(func_to_mock)) + if func_to_mock == "google.cloud.aiplatform.Model.list": mock.return_value = gen_mock_models() elif func_to_mock == "google.cloud.aiplatform.CustomJob.list": @@ -115,7 +150,18 @@ def test_vertexai_source_ingestion(pytestconfig: Config, sink_file_path: str) -> elif ( func_to_mock 
== "google.cloud.aiplatform.AutoMLTabularTrainingJob.list" ): - mock.return_value = [gen_mock_training_automl_job()] + mock.return_value = [mock_automl_job] + elif ( + func_to_mock + == "datahub.ingestion.source.vertexai.VertexAISource._get_training_job_metadata" + ): + mock.return_value = TrainingJobMetadata( + job=mock_automl_job, + input_dataset=mock_dataset, + output_model=mock_models[0], + output_model_version=mock_model_version, + ) + else: mock.return_value = [] diff --git a/metadata-ingestion/tests/integration/vertexai/vertexai_mcps_golden.json b/metadata-ingestion/tests/integration/vertexai/vertexai_mcps_golden.json index f1a14f2b5ba9de..1c6d57b423971e 100644 --- a/metadata-ingestion/tests/integration/vertexai/vertexai_mcps_golden.json +++ b/metadata-ingestion/tests/integration/vertexai/vertexai_mcps_golden.json @@ -14,7 +14,7 @@ } }, "systemMetadata": { - "lastObserved": 1741198547912, + "lastObserved": 1741205717242, "runId": "vertexai-source-test", "lastRunId": "no-run-id-provided" } @@ -30,7 +30,7 @@ } }, "systemMetadata": { - "lastObserved": 1741198547912, + "lastObserved": 1741205717242, "runId": "vertexai-source-test", "lastRunId": "no-run-id-provided" } @@ -46,7 +46,7 @@ } }, "systemMetadata": { - "lastObserved": 1741198547913, + "lastObserved": 1741205717243, "runId": "vertexai-source-test", "lastRunId": "no-run-id-provided" } @@ -64,7 +64,7 @@ } }, "systemMetadata": { - "lastObserved": 1741198547913, + "lastObserved": 1741205717243, "runId": "vertexai-source-test", "lastRunId": "no-run-id-provided" } @@ -80,7 +80,7 @@ } }, "systemMetadata": { - "lastObserved": 1741198547913, + "lastObserved": 1741205717243, "runId": "vertexai-source-test", "lastRunId": "no-run-id-provided" } @@ -100,7 +100,7 @@ } }, "systemMetadata": { - "lastObserved": 1741198547914, + "lastObserved": 1741205717244, "runId": "vertexai-source-test", "lastRunId": "no-run-id-provided" } @@ -120,57 +120,57 @@ } }, "systemMetadata": { - "lastObserved": 1741198547916, + "lastObserved": 1741205717245, "runId": "vertexai-source-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataProcessInstance", - "entityUrn": "urn:li:dataProcessInstance:test-project-id.job.mock_training_job", + "entityUrn": "urn:li:dataProcessInstance:test-project-id.job.mock_auto_automl_tabular_job", "changeType": "UPSERT", "aspectName": "dataProcessInstanceProperties", "aspect": { "json": { "customProperties": { - "displayName": "mock_training_job_display_name", - "jobType": "CustomJob" + "displayName": "mock_auto_automl_tabular_job_display_name", + "jobType": "AutoMLTabularTrainingJob" }, - "externalUrl": "https://console.cloud.google.com/vertex-ai/training/training-pipelines?trainingPipelineId=mock_training_job?project=test-project-id", - "name": "test-project-id.job.mock_training_job", + "externalUrl": "https://console.cloud.google.com/vertex-ai/training/training-pipelines?trainingPipelineId=mock_auto_automl_tabular_job?project=test-project-id", + "name": "test-project-id.job.mock_auto_automl_tabular_job", "created": { - "time": 1741198547915, + "time": 1741205717245, "actor": "urn:li:platformResource:vertexai" } } }, "systemMetadata": { - "lastObserved": 1741198547917, + "lastObserved": 1741205717246, "runId": "vertexai-source-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataProcessInstance", - "entityUrn": "urn:li:dataProcessInstance:test-project-id.job.mock_training_job", + "entityUrn": "urn:li:dataProcessInstance:test-project-id.job.mock_auto_automl_tabular_job", "changeType": "UPSERT", "aspectName": 
"mlTrainingRunProperties", "aspect": { "json": { "customProperties": {}, - "externalUrl": "https://console.cloud.google.com/vertex-ai/training/training-pipelines?trainingPipelineId=mock_training_job?project=test-project-id", - "id": "mock_training_job" + "externalUrl": "https://console.cloud.google.com/vertex-ai/training/training-pipelines?trainingPipelineId=mock_auto_automl_tabular_job?project=test-project-id", + "id": "mock_auto_automl_tabular_job" } }, "systemMetadata": { - "lastObserved": 1741198547917, + "lastObserved": 1741205717246, "runId": "vertexai-source-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataProcessInstance", - "entityUrn": "urn:li:dataProcessInstance:test-project-id.job.mock_training_job", + "entityUrn": "urn:li:dataProcessInstance:test-project-id.job.mock_auto_automl_tabular_job", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -181,14 +181,89 @@ } }, "systemMetadata": { - "lastObserved": 1741198547917, + "lastObserved": 1741205717246, "runId": "vertexai-source-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataProcessInstance", - "entityUrn": "urn:li:dataProcessInstance:test-project-id.job.mock_training_job", + "entityUrn": "urn:li:dataProcessInstance:test-project-id.job.mock_auto_automl_tabular_job", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:29746a9030349f4340ed74b46913dab6" + } + }, + "systemMetadata": { + "lastObserved": 1741205717247, + "runId": "vertexai-source-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:test-project-id.job.mock_auto_automl_tabular_job", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceInput", + "aspect": { + "json": { + "inputs": [ + "urn:li:dataset:(urn:li:dataPlatform:vertexai,test-project-id.dataset.mock_dataset,PROD)" + ] + } + }, + "systemMetadata": { + "lastObserved": 1741205717247, + "runId": "vertexai-source-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertexai,test-project-id.dataset.mock_dataset,PROD)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "json": { + "customProperties": { + "displayName": "mock_dataset_display_name", + "resourceName": "projects/123/locations/us-central1/datasets/456" + }, + "name": "test-project-id.dataset.mock_dataset", + "qualifiedName": "projects/123/locations/us-central1/datasets/456", + "description": "Dataset: mock_dataset_display_name", + "tags": [] + } + }, + "systemMetadata": { + "lastObserved": 1741205717247, + "runId": "vertexai-source-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertexai,test-project-id.dataset.mock_dataset,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Dataset" + ] + } + }, + "systemMetadata": { + "lastObserved": 1741205717248, + "runId": "vertexai-source-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertexai,test-project-id.dataset.mock_dataset,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { @@ -197,7 +272,63 @@ } }, "systemMetadata": { - "lastObserved": 1741198547918, + "lastObserved": 1741205717248, + "runId": "vertexai-source-test", + "lastRunId": "no-run-id-provided" + } +}, 
+{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertexai,test-project-id.dataset.mock_dataset,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:29746a9030349f4340ed74b46913dab6", + "urn": "urn:li:container:29746a9030349f4340ed74b46913dab6" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1741205717248, + "runId": "vertexai-source-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "mlModel", + "entityUrn": "urn:li:mlModel:(urn:li:dataPlatform:vertexai,test-project-id.model.mock_prediction_model_1_1,PROD)", + "changeType": "UPSERT", + "aspectName": "mlModelProperties", + "aspect": { + "json": { + "customProperties": { + "displayName": "mock_prediction_model_1", + "versionId": "1", + "resourceName": "projects/123/locations/us-central1/models/456" + }, + "externalUrl": "https://console.cloud.google.com/vertex-ai/models/locations/us-west2/models/mock_prediction_model_1/versions/1?project=test-project-id", + "trainingJobs": [ + "urn:li:dataProcessInstance:test-project-id.job.mock_auto_automl_tabular_job" + ], + "name": "test-project-id.model.mock_prediction_model_1_1", + "description": "test", + "version": { + "versionTag": "1" + }, + "type": "ML Model", + "tags": [], + "deployments": [], + "groups": [ + "urn:li:mlModelGroup:(urn:li:dataPlatform:vertexai,test-project-id.model_group.mock_prediction_model_1,PROD)" + ] + } + }, + "systemMetadata": { + "lastObserved": 1741205717249, "runId": "vertexai-source-test", "lastRunId": "no-run-id-provided" } @@ -216,13 +347,13 @@ "externalUrl": "https://console.cloud.google.com/vertex-ai/training/training-pipelines?trainingPipelineId=mock_auto_automl_tabular_job?project=test-project-id", "name": "test-project-id.job.mock_auto_automl_tabular_job", "created": { - "time": 1741198547917, + "time": 1741205717249, "actor": "urn:li:platformResource:vertexai" } } }, "systemMetadata": { - "lastObserved": 1741198547919, + "lastObserved": 1741205717249, "runId": "vertexai-source-test", "lastRunId": "no-run-id-provided" } @@ -240,7 +371,7 @@ } }, "systemMetadata": { - "lastObserved": 1741198547919, + "lastObserved": 1741205717250, "runId": "vertexai-source-test", "lastRunId": "no-run-id-provided" } @@ -258,7 +389,7 @@ } }, "systemMetadata": { - "lastObserved": 1741198547920, + "lastObserved": 1741205717250, "runId": "vertexai-source-test", "lastRunId": "no-run-id-provided" } @@ -274,7 +405,117 @@ } }, "systemMetadata": { - "lastObserved": 1741198547920, + "lastObserved": 1741205717250, + "runId": "vertexai-source-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:test-project-id.job.mock_auto_automl_tabular_job", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceInput", + "aspect": { + "json": { + "inputs": [ + "urn:li:dataset:(urn:li:dataPlatform:vertexai,test-project-id.dataset.mock_dataset,PROD)" + ] + } + }, + "systemMetadata": { + "lastObserved": 1741205717251, + "runId": "vertexai-source-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertexai,test-project-id.dataset.mock_dataset,PROD)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "json": { + "customProperties": { + "displayName": "mock_dataset_display_name", + "resourceName": "projects/123/locations/us-central1/datasets/456" + }, + "name": 
"test-project-id.dataset.mock_dataset", + "qualifiedName": "projects/123/locations/us-central1/datasets/456", + "description": "Dataset: mock_dataset_display_name", + "tags": [] + } + }, + "systemMetadata": { + "lastObserved": 1741205717251, + "runId": "vertexai-source-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertexai,test-project-id.dataset.mock_dataset,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Dataset" + ] + } + }, + "systemMetadata": { + "lastObserved": 1741205717251, + "runId": "vertexai-source-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertexai,test-project-id.dataset.mock_dataset,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:29746a9030349f4340ed74b46913dab6" + } + }, + "systemMetadata": { + "lastObserved": 1741205717252, + "runId": "vertexai-source-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "mlModel", + "entityUrn": "urn:li:mlModel:(urn:li:dataPlatform:vertexai,test-project-id.model.mock_prediction_model_1_1,PROD)", + "changeType": "UPSERT", + "aspectName": "mlModelProperties", + "aspect": { + "json": { + "customProperties": { + "displayName": "mock_prediction_model_1", + "versionId": "1", + "resourceName": "projects/123/locations/us-central1/models/456" + }, + "externalUrl": "https://console.cloud.google.com/vertex-ai/models/locations/us-west2/models/mock_prediction_model_1/versions/1?project=test-project-id", + "trainingJobs": [ + "urn:li:dataProcessInstance:test-project-id.job.mock_auto_automl_tabular_job" + ], + "name": "test-project-id.model.mock_prediction_model_1_1", + "description": "test", + "version": { + "versionTag": "1" + }, + "type": "ML Model", + "tags": [], + "deployments": [], + "groups": [ + "urn:li:mlModelGroup:(urn:li:dataPlatform:vertexai,test-project-id.model_group.mock_prediction_model_1,PROD)" + ] + } + }, + "systemMetadata": { + "lastObserved": 1741205717252, "runId": "vertexai-source-test", "lastRunId": "no-run-id-provided" } @@ -293,13 +534,13 @@ "externalUrl": "https://console.cloud.google.com/vertex-ai/training/training-pipelines?trainingPipelineId=mock_auto_automl_tabular_job?project=test-project-id", "name": "test-project-id.job.mock_auto_automl_tabular_job", "created": { - "time": 1741198547919, + "time": 1741205717252, "actor": "urn:li:platformResource:vertexai" } } }, "systemMetadata": { - "lastObserved": 1741198547920, + "lastObserved": 1741205717253, "runId": "vertexai-source-test", "lastRunId": "no-run-id-provided" } @@ -317,7 +558,7 @@ } }, "systemMetadata": { - "lastObserved": 1741198547921, + "lastObserved": 1741205717253, "runId": "vertexai-source-test", "lastRunId": "no-run-id-provided" } @@ -335,7 +576,23 @@ } }, "systemMetadata": { - "lastObserved": 1741198547921, + "lastObserved": 1741205717253, + "runId": "vertexai-source-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:test-project-id.job.mock_auto_automl_tabular_job", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:29746a9030349f4340ed74b46913dab6" + } + }, + "systemMetadata": { + "lastObserved": 1741205717254, "runId": "vertexai-source-test", "lastRunId": "no-run-id-provided" } @@ -344,6 
+601,65 @@ "entityType": "dataProcessInstance", "entityUrn": "urn:li:dataProcessInstance:test-project-id.job.mock_auto_automl_tabular_job", "changeType": "UPSERT", + "aspectName": "dataProcessInstanceInput", + "aspect": { + "json": { + "inputs": [ + "urn:li:dataset:(urn:li:dataPlatform:vertexai,test-project-id.dataset.mock_dataset,PROD)" + ] + } + }, + "systemMetadata": { + "lastObserved": 1741205717254, + "runId": "vertexai-source-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertexai,test-project-id.dataset.mock_dataset,PROD)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "json": { + "customProperties": { + "displayName": "mock_dataset_display_name", + "resourceName": "projects/123/locations/us-central1/datasets/456" + }, + "name": "test-project-id.dataset.mock_dataset", + "qualifiedName": "projects/123/locations/us-central1/datasets/456", + "description": "Dataset: mock_dataset_display_name", + "tags": [] + } + }, + "systemMetadata": { + "lastObserved": 1741205717254, + "runId": "vertexai-source-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertexai,test-project-id.dataset.mock_dataset,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Dataset" + ] + } + }, + "systemMetadata": { + "lastObserved": 1741205717255, + "runId": "vertexai-source-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertexai,test-project-id.dataset.mock_dataset,PROD)", + "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { @@ -351,7 +667,42 @@ } }, "systemMetadata": { - "lastObserved": 1741198547922, + "lastObserved": 1741205717255, + "runId": "vertexai-source-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "mlModel", + "entityUrn": "urn:li:mlModel:(urn:li:dataPlatform:vertexai,test-project-id.model.mock_prediction_model_1_1,PROD)", + "changeType": "UPSERT", + "aspectName": "mlModelProperties", + "aspect": { + "json": { + "customProperties": { + "displayName": "mock_prediction_model_1", + "versionId": "1", + "resourceName": "projects/123/locations/us-central1/models/456" + }, + "externalUrl": "https://console.cloud.google.com/vertex-ai/models/locations/us-west2/models/mock_prediction_model_1/versions/1?project=test-project-id", + "trainingJobs": [ + "urn:li:dataProcessInstance:test-project-id.job.mock_auto_automl_tabular_job" + ], + "name": "test-project-id.model.mock_prediction_model_1_1", + "description": "test", + "version": { + "versionTag": "1" + }, + "type": "ML Model", + "tags": [], + "deployments": [], + "groups": [ + "urn:li:mlModelGroup:(urn:li:dataPlatform:vertexai,test-project-id.model_group.mock_prediction_model_1,PROD)" + ] + } + }, + "systemMetadata": { + "lastObserved": 1741205717255, "runId": "vertexai-source-test", "lastRunId": "no-run-id-provided" } @@ -367,14 +718,30 @@ } }, "systemMetadata": { - "lastObserved": 1741198547922, + "lastObserved": 1741205717256, "runId": "vertexai-source-test", "lastRunId": "no-run-id-provided" } }, { - "entityType": "dataProcessInstance", - "entityUrn": "urn:li:dataProcessInstance:test-project-id.job.mock_training_job", + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertexai,test-project-id.dataset.mock_dataset,PROD)", + "changeType": "UPSERT", + 
"aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1741205717256, + "runId": "vertexai-source-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "mlModel", + "entityUrn": "urn:li:mlModel:(urn:li:dataPlatform:vertexai,test-project-id.model.mock_prediction_model_1_1,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -383,7 +750,7 @@ } }, "systemMetadata": { - "lastObserved": 1741198547922, + "lastObserved": 1741205717256, "runId": "vertexai-source-test", "lastRunId": "no-run-id-provided" } @@ -399,7 +766,7 @@ } }, "systemMetadata": { - "lastObserved": 1741198547923, + "lastObserved": 1741205717257, "runId": "vertexai-source-test", "lastRunId": "no-run-id-provided" } @@ -415,7 +782,7 @@ } }, "systemMetadata": { - "lastObserved": 1741198547923, + "lastObserved": 1741205717257, "runId": "vertexai-source-test", "lastRunId": "no-run-id-provided" } From 8e40b7c32c2cb9e6e4c811eb9e697aed70c15796 Mon Sep 17 00:00:00 2001 From: Ryota Egashira Date: Wed, 5 Mar 2025 12:33:40 -0800 Subject: [PATCH 59/59] fix import error in test case --- metadata-ingestion/tests/unit/test_vertexai_source.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metadata-ingestion/tests/unit/test_vertexai_source.py b/metadata-ingestion/tests/unit/test_vertexai_source.py index 5d50526590bb0c..c3dd3bd6964809 100644 --- a/metadata-ingestion/tests/unit/test_vertexai_source.py +++ b/metadata-ingestion/tests/unit/test_vertexai_source.py @@ -19,7 +19,6 @@ VertexAIConfig, VertexAISource, ) -from datahub.metadata._schema_classes import MLTrainingRunPropertiesClass from datahub.metadata.com.linkedin.pegasus2avro.ml.metadata import ( MLModelGroupProperties, MLModelProperties, @@ -29,6 +28,7 @@ DataProcessInstanceInputClass, DataProcessInstancePropertiesClass, MLModelDeploymentPropertiesClass, + MLTrainingRunPropertiesClass, SubTypesClass, )