From 4b8f3dc79a9a7d0be64d040fd61261801fe91a10 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Mon, 3 Mar 2025 15:11:41 -0800 Subject: [PATCH 1/3] feat(sdk): add support for institutional memory links --- metadata-ingestion/src/datahub/sdk/_shared.py | 85 ++++++++++++++++- metadata-ingestion/src/datahub/sdk/dataset.py | 6 ++ .../test_dataset_complex_golden.json | 28 ++++++ .../test_dataset_ingestion_golden.json | 28 ++++++ .../test_links_add_remove_golden.json | 93 +++++++++++++++++++ .../tests/unit/sdk_v2/test_dataset.py | 48 ++++++++++ 6 files changed, 286 insertions(+), 2 deletions(-) create mode 100644 metadata-ingestion/tests/unit/sdk_v2/dataset_golden/test_links_add_remove_golden.json diff --git a/metadata-ingestion/src/datahub/sdk/_shared.py b/metadata-ingestion/src/datahub/sdk/_shared.py index b061fd7aa63a76..c43f65b16b8d88 100644 --- a/metadata-ingestion/src/datahub/sdk/_shared.py +++ b/metadata-ingestion/src/datahub/sdk/_shared.py @@ -7,6 +7,7 @@ Callable, List, Optional, + Sequence, Tuple, Union, ) @@ -49,6 +50,8 @@ ActorUrn: TypeAlias = Union[CorpUserUrn, CorpGroupUrn] +_DEFAULT_ACTOR_URN = CorpUserUrn("__ingestion").urn() + def make_time_stamp(ts: Optional[datetime]) -> Optional[models.TimeStampClass]: if ts is None: @@ -438,8 +441,7 @@ def _parse_glossary_term_association_class( def _terms_audit_stamp(self) -> models.AuditStampClass: return models.AuditStampClass( time=0, - # TODO figure out what to put here - actor=CorpUserUrn("__ingestion").urn(), + actor=_DEFAULT_ACTOR_URN, ) def set_terms(self, terms: TermsInputType) -> None: @@ -493,3 +495,82 @@ def domain(self) -> Optional[DomainUrn]: def set_domain(self, domain: DomainInputType) -> None: domain_urn = DomainUrn.from_string(domain) # basically a type assertion self._set_aspect(models.DomainsClass(domains=[str(domain_urn)])) + + +LinkInputType: TypeAlias = Union[ + str, + Tuple[str, str], # url, description + models.InstitutionalMemoryMetadataClass, +] +LinksInputType: TypeAlias = Sequence[LinkInputType] + + +class HasInstitutionalMemory(Entity): + __slots__ = () + + def _ensure_institutional_memory( + self, + ) -> List[models.InstitutionalMemoryMetadataClass]: + return self._setdefault_aspect( + models.InstitutionalMemoryClass(elements=[]) + ).elements + + @property + def links(self) -> Optional[List[models.InstitutionalMemoryMetadataClass]]: + if institutional_memory := self._get_aspect(models.InstitutionalMemoryClass): + return institutional_memory.elements + return None + + @classmethod + def _institutional_memory_audit_stamp(self) -> models.AuditStampClass: + return models.AuditStampClass( + time=0, + actor=_DEFAULT_ACTOR_URN, + ) + + @classmethod + def _parse_link_association_class( + cls, link: LinkInputType + ) -> models.InstitutionalMemoryMetadataClass: + if isinstance(link, models.InstitutionalMemoryMetadataClass): + return link + elif isinstance(link, str): + return models.InstitutionalMemoryMetadataClass( + url=link, + description=link, + createStamp=cls._institutional_memory_audit_stamp(), + ) + elif isinstance(link, tuple) and len(link) == 2: + url, description = link + return models.InstitutionalMemoryMetadataClass( + url=url, + description=description, + createStamp=cls._institutional_memory_audit_stamp(), + ) + else: + assert_never(link) + + def set_links(self, links: LinksInputType) -> None: + self._set_aspect( + models.InstitutionalMemoryClass( + elements=[self._parse_link_association_class(link) for link in links] + ) + ) + + @classmethod + def _link_key(self, link: models.InstitutionalMemoryMetadataClass) -> str: + return link.url + + def add_link(self, link: LinkInputType) -> None: + add_list_unique( + self._ensure_institutional_memory(), + self._link_key, + self._parse_link_association_class(link), + ) + + def remove_link(self, link: LinkInputType) -> None: + remove_list_unique( + self._ensure_institutional_memory(), + self._link_key, + self._parse_link_association_class(link), + ) diff --git a/metadata-ingestion/src/datahub/sdk/dataset.py b/metadata-ingestion/src/datahub/sdk/dataset.py index c367aa79cbcc91..88ab9fbae4dc2d 100644 --- a/metadata-ingestion/src/datahub/sdk/dataset.py +++ b/metadata-ingestion/src/datahub/sdk/dataset.py @@ -23,11 +23,13 @@ DomainInputType, HasContainer, HasDomain, + HasInstitutionalMemory, HasOwnership, HasPlatformInstance, HasSubtype, HasTags, HasTerms, + LinksInputType, OwnersInputType, ParentContainerInputType, TagInputType, @@ -425,6 +427,7 @@ class Dataset( HasTags, HasTerms, HasDomain, + HasInstitutionalMemory, Entity, ): __slots__ = () @@ -453,6 +456,7 @@ def __init__( parent_container: ParentContainerInputType | Unset = unset, subtype: Optional[str] = None, owners: Optional[OwnersInputType] = None, + links: Optional[LinksInputType] = None, tags: Optional[TagsInputType] = None, terms: Optional[TermsInputType] = None, # TODO structured_properties @@ -499,6 +503,8 @@ def __init__( self.set_subtype(subtype) if owners is not None: self.set_owners(owners) + if links is not None: + self.set_links(links) if tags is not None: self.set_tags(tags) if terms is not None: diff --git a/metadata-ingestion/tests/unit/sdk_v2/dataset_golden/test_dataset_complex_golden.json b/metadata-ingestion/tests/unit/sdk_v2/dataset_golden/test_dataset_complex_golden.json index 1e2e3f7e1de1e9..29960691043bcc 100644 --- a/metadata-ingestion/tests/unit/sdk_v2/dataset_golden/test_dataset_complex_golden.json +++ b/metadata-ingestion/tests/unit/sdk_v2/dataset_golden/test_dataset_complex_golden.json @@ -175,6 +175,34 @@ } } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,my_instance.my_db.my_schema.my_table,PROD)", + "changeType": "UPSERT", + "aspectName": "institutionalMemory", + "aspect": { + "json": { + "elements": [ + { + "url": "https://example.com/doc1", + "description": "https://example.com/doc1", + "createStamp": { + "time": 0, + "actor": "urn:li:corpuser:__ingestion" + } + }, + { + "url": "https://example.com/doc2", + "description": "Documentation 2", + "createStamp": { + "time": 0, + "actor": "urn:li:corpuser:__ingestion" + } + } + ] + } + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,my_instance.my_db.my_schema.my_table,PROD)", diff --git a/metadata-ingestion/tests/unit/sdk_v2/dataset_golden/test_dataset_ingestion_golden.json b/metadata-ingestion/tests/unit/sdk_v2/dataset_golden/test_dataset_ingestion_golden.json index 2ef7daafc13781..8a76ae000569ad 100644 --- a/metadata-ingestion/tests/unit/sdk_v2/dataset_golden/test_dataset_ingestion_golden.json +++ b/metadata-ingestion/tests/unit/sdk_v2/dataset_golden/test_dataset_ingestion_golden.json @@ -182,6 +182,34 @@ } } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,my_instance.my_db.my_schema.my_table,PROD)", + "changeType": "UPSERT", + "aspectName": "institutionalMemory", + "aspect": { + "json": { + "elements": [ + { + "url": "https://example.com/doc1", + "description": "https://example.com/doc1", + "createStamp": { + "time": 0, + "actor": "urn:li:corpuser:__ingestion" + } + }, + { + "url": "https://example.com/doc2", + "description": "Documentation 2", + "createStamp": { + "time": 0, + "actor": "urn:li:corpuser:__ingestion" + } + } + ] + } + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,my_instance.my_db.my_schema.my_table,PROD)", diff --git a/metadata-ingestion/tests/unit/sdk_v2/dataset_golden/test_links_add_remove_golden.json b/metadata-ingestion/tests/unit/sdk_v2/dataset_golden/test_links_add_remove_golden.json new file mode 100644 index 00000000000000..fa9d830cb88338 --- /dev/null +++ b/metadata-ingestion/tests/unit/sdk_v2/dataset_golden/test_links_add_remove_golden.json @@ -0,0 +1,93 @@ +[ +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,proj.dataset.table,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:bigquery" + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,proj.dataset.table,PROD)", + "changeType": "UPSERT", + "aspectName": "schemaMetadata", + "aspect": { + "json": { + "schemaName": "", + "platform": "urn:li:dataPlatform:bigquery", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.schema.Schemaless": {} + }, + "fields": [ + { + "fieldPath": "field1", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field2", + "nullable": false, + "description": "field2 description", + "type": { + "type": { + "com.linkedin.schema.NullType": {} + } + }, + "nativeDataType": "int64", + "recursive": false, + "isPartOfKey": false + } + ] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,proj.dataset.table,PROD)", + "changeType": "UPSERT", + "aspectName": "institutionalMemory", + "aspect": { + "json": { + "elements": [ + { + "url": "https://example.com/doc2", + "description": "Documentation 2", + "createStamp": { + "time": 0, + "actor": "urn:li:corpuser:__ingestion" + } + }, + { + "url": "https://example.com/doc3", + "description": "Documentation 3", + "createStamp": { + "time": 0, + "actor": "urn:li:corpuser:__ingestion" + } + } + ] + } + } +} +] \ No newline at end of file diff --git a/metadata-ingestion/tests/unit/sdk_v2/test_dataset.py b/metadata-ingestion/tests/unit/sdk_v2/test_dataset.py index 0e1ab2a2b3dba3..d16a1152aff5ee 100644 --- a/metadata-ingestion/tests/unit/sdk_v2/test_dataset.py +++ b/metadata-ingestion/tests/unit/sdk_v2/test_dataset.py @@ -113,6 +113,10 @@ def _build_complex_dataset() -> Dataset: owners=[ CorpUserUrn("admin@datahubproject.io"), ], + links=[ + "https://example.com/doc1", + ("https://example.com/doc2", "Documentation 2"), + ], tags=[ TagUrn("tag1"), TagUrn("tag2"), @@ -178,6 +182,12 @@ def _build_complex_dataset() -> Dataset: assert d["field2"].terms is not None assert len(d["field2"].terms) == 2 + # Add assertions for links + assert d.links is not None + assert len(d.links) == 2 + assert d.links[0].url == "https://example.com/doc1" + assert d.links[1].url == "https://example.com/doc2" + return d @@ -369,3 +379,41 @@ def test_browse_path() -> None: assert d.browse_path == path assert_entity_golden(d, _GOLDEN_DIR / "test_browse_path_golden.json") + + +def test_links_add_remove() -> None: + d = Dataset( + platform="bigquery", + name="proj.dataset.table", + schema=[ + ("field1", "string"), + ("field2", "int64", "field2 description"), + ], + links=[ + "https://example.com/doc1", + ("https://example.com/doc2", "Documentation 2"), + ], + ) + + # Test initial state + assert d.links is not None + assert len(d.links) == 2 + assert d.links[0].url == "https://example.com/doc1" + assert d.links[0].description == "https://example.com/doc1" + assert d.links[1].url == "https://example.com/doc2" + assert d.links[1].description == "Documentation 2" + + # Test link add/remove flows + for _ in range(2): # Second iteration should be a no-op + d.add_link(("https://example.com/doc3", "Documentation 3")) + assert len(d.links) == 3 + assert d.links[2].url == "https://example.com/doc3" + assert d.links[2].description == "Documentation 3" + + for _ in range(2): # Second iteration should be a no-op + d.remove_link("https://example.com/doc1") + assert len(d.links) == 2 + assert d.links[0].url == "https://example.com/doc2" + assert d.links[1].url == "https://example.com/doc3" + + assert_entity_golden(d, _GOLDEN_DIR / "test_links_add_remove_golden.json") From a50538438c1bb4bf59a572291635e5403f316afc Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Mon, 3 Mar 2025 15:19:17 -0800 Subject: [PATCH 2/3] add support for containers --- .../src/datahub/sdk/container.py | 6 ++++++ metadata-ingestion/src/datahub/sdk/dataset.py | 2 +- .../test_container_complex_golden.json | 20 +++++++++++++++++++ .../tests/unit/sdk_v2/test_container.py | 13 ++++++------ .../tests/unit/sdk_v2/test_dataset.py | 10 ++++------ 5 files changed, 38 insertions(+), 13 deletions(-) diff --git a/metadata-ingestion/src/datahub/sdk/container.py b/metadata-ingestion/src/datahub/sdk/container.py index d2c449a6d2166b..b5b1ce3815b6ed 100644 --- a/metadata-ingestion/src/datahub/sdk/container.py +++ b/metadata-ingestion/src/datahub/sdk/container.py @@ -20,11 +20,13 @@ DomainInputType, HasContainer, HasDomain, + HasInstitutionalMemory, HasOwnership, HasPlatformInstance, HasSubtype, HasTags, HasTerms, + LinksInputType, OwnersInputType, ParentContainerInputType, TagsInputType, @@ -41,6 +43,7 @@ class Container( HasSubtype, HasContainer, HasOwnership, + HasInstitutionalMemory, HasTags, HasTerms, HasDomain, @@ -71,6 +74,7 @@ def __init__( parent_container: Auto | ParentContainerInputType | None = auto, subtype: Optional[str] = None, owners: Optional[OwnersInputType] = None, + links: Optional[LinksInputType] = None, tags: Optional[TagsInputType] = None, terms: Optional[TermsInputType] = None, domain: Optional[DomainInputType] = None, @@ -133,6 +137,8 @@ def __init__( self.set_subtype(subtype) if owners is not None: self.set_owners(owners) + if links is not None: + self.set_links(links) if tags is not None: self.set_tags(tags) if terms is not None: diff --git a/metadata-ingestion/src/datahub/sdk/dataset.py b/metadata-ingestion/src/datahub/sdk/dataset.py index 88ab9fbae4dc2d..2367c93e4e7832 100644 --- a/metadata-ingestion/src/datahub/sdk/dataset.py +++ b/metadata-ingestion/src/datahub/sdk/dataset.py @@ -424,10 +424,10 @@ class Dataset( HasSubtype, HasContainer, HasOwnership, + HasInstitutionalMemory, HasTags, HasTerms, HasDomain, - HasInstitutionalMemory, Entity, ): __slots__ = () diff --git a/metadata-ingestion/tests/unit/sdk_v2/container_golden/test_container_complex_golden.json b/metadata-ingestion/tests/unit/sdk_v2/container_golden/test_container_complex_golden.json index 6735cccdf5e879..3ed3faadad8fbb 100644 --- a/metadata-ingestion/tests/unit/sdk_v2/container_golden/test_container_complex_golden.json +++ b/metadata-ingestion/tests/unit/sdk_v2/container_golden/test_container_complex_golden.json @@ -104,6 +104,26 @@ } } }, +{ + "entityType": "container", + "entityUrn": "urn:li:container:66c5ac35f0bfc521dee6f7d9533a8056", + "changeType": "UPSERT", + "aspectName": "institutionalMemory", + "aspect": { + "json": { + "elements": [ + { + "url": "https://example.com/doc1", + "description": "https://example.com/doc1", + "createStamp": { + "time": 0, + "actor": "urn:li:corpuser:__ingestion" + } + } + ] + } + } +}, { "entityType": "container", "entityUrn": "urn:li:container:66c5ac35f0bfc521dee6f7d9533a8056", diff --git a/metadata-ingestion/tests/unit/sdk_v2/test_container.py b/metadata-ingestion/tests/unit/sdk_v2/test_container.py index 314db153d8a6b0..85d5731de74f15 100644 --- a/metadata-ingestion/tests/unit/sdk_v2/test_container.py +++ b/metadata-ingestion/tests/unit/sdk_v2/test_container.py @@ -43,6 +43,8 @@ def test_container_basic() -> None: assert c.platform.platform_name == "bigquery" assert c.platform_instance is None assert c.browse_path == [] + assert c.owners is None + assert c.links is None assert c.tags is None assert c.terms is None assert c.created is None @@ -95,6 +97,7 @@ def test_container_complex() -> None: owners=[ CorpUserUrn("admin@datahubproject.io"), ], + links=["https://example.com/doc1"], tags=[ TagUrn("tag1"), TagUrn("tag2"), @@ -134,12 +137,10 @@ def test_container_complex() -> None: # Check standard aspects. assert c.subtype == "Schema" + assert c.owners is not None and len(c.owners) == 1 + assert c.links is not None and len(c.links) == 1 + assert c.tags is not None and len(c.tags) == 2 + assert c.terms is not None and len(c.terms) == 1 assert c.domain == DomainUrn("Marketing") - assert c.tags is not None - assert len(c.tags) == 2 - assert c.terms is not None - assert len(c.terms) == 1 - assert c.owners is not None - assert len(c.owners) == 1 assert_entity_golden(c, _GOLDEN_DIR / "test_container_complex_golden.json") diff --git a/metadata-ingestion/tests/unit/sdk_v2/test_dataset.py b/metadata-ingestion/tests/unit/sdk_v2/test_dataset.py index d16a1152aff5ee..e4cd5246406d8e 100644 --- a/metadata-ingestion/tests/unit/sdk_v2/test_dataset.py +++ b/metadata-ingestion/tests/unit/sdk_v2/test_dataset.py @@ -152,13 +152,11 @@ def _build_complex_dataset() -> Dataset: # Check standard aspects. assert d.subtype == "Table" + assert d.owners is not None and len(d.owners) == 1 + assert d.links is not None and len(d.links) == 2 + assert d.tags is not None and len(d.tags) == 2 + assert d.terms is not None and len(d.terms) == 1 assert d.domain == DomainUrn("Marketing") - assert d.tags is not None - assert len(d.tags) == 2 - assert d.terms is not None - assert len(d.terms) == 1 - assert d.owners is not None - assert len(d.owners) == 1 assert len(d.schema) == 2 From 8ebc2df37b932547165f94909fd34129ed9ae6ae Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Mon, 3 Mar 2025 15:20:38 -0800 Subject: [PATCH 3/3] add comment --- metadata-ingestion/src/datahub/sdk/_shared.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/metadata-ingestion/src/datahub/sdk/_shared.py b/metadata-ingestion/src/datahub/sdk/_shared.py index c43f65b16b8d88..c5999c2296a247 100644 --- a/metadata-ingestion/src/datahub/sdk/_shared.py +++ b/metadata-ingestion/src/datahub/sdk/_shared.py @@ -508,6 +508,10 @@ def set_domain(self, domain: DomainInputType) -> None: class HasInstitutionalMemory(Entity): __slots__ = () + # Internally the aspect is called institutionalMemory, and so much of the code + # uses that name. However, the public-facing API is called "links", since + # that's what we call these in the UI. + def _ensure_institutional_memory( self, ) -> List[models.InstitutionalMemoryMetadataClass]: