From 9cb5886d6dffb4cf45cf125dad6cd637e75a5e53 Mon Sep 17 00:00:00 2001 From: Hyejin Yoon <0327jane@gmail.com> Date: Thu, 6 Mar 2025 04:59:21 +0900 Subject: [PATCH 01/10] fix(ui): change tags to properties in ml model view (#12789) --- .../src/app/entity/mlModelGroup/profile/ModelGroupModels.tsx | 4 ++-- .../app/entityV2/mlModelGroup/profile/ModelGroupModels.tsx | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/datahub-web-react/src/app/entity/mlModelGroup/profile/ModelGroupModels.tsx b/datahub-web-react/src/app/entity/mlModelGroup/profile/ModelGroupModels.tsx index a443a5d30fa31b..db7e54be99a81b 100644 --- a/datahub-web-react/src/app/entity/mlModelGroup/profile/ModelGroupModels.tsx +++ b/datahub-web-react/src/app/entity/mlModelGroup/profile/ModelGroupModels.tsx @@ -125,8 +125,8 @@ export default function MLGroupModels() { }, }, { - title: 'Tags', - key: 'tags', + title: 'Properties', + key: 'properties', width: 200, render: (_: any, record: any) => { const tags = record.properties?.tags || []; diff --git a/datahub-web-react/src/app/entityV2/mlModelGroup/profile/ModelGroupModels.tsx b/datahub-web-react/src/app/entityV2/mlModelGroup/profile/ModelGroupModels.tsx index 6517584b2a7278..ac43b4f17b9102 100644 --- a/datahub-web-react/src/app/entityV2/mlModelGroup/profile/ModelGroupModels.tsx +++ b/datahub-web-react/src/app/entityV2/mlModelGroup/profile/ModelGroupModels.tsx @@ -128,8 +128,8 @@ export default function MLGroupModels() { }, }, { - title: 'Tags', - key: 'tags', + title: 'Properties', + key: 'properties', width: 200, render: (_: any, record: any) => { const tags = record.properties?.tags || []; From 1068e2b512bd51a61ca850c95d1a5bfbd9205352 Mon Sep 17 00:00:00 2001 From: Chris Collins Date: Wed, 5 Mar 2025 17:45:28 -0500 Subject: [PATCH 02/10] fix(ui) Fix changing color and icon for domains in UI (#12792) --- .../graphql/types/domain/DomainType.java | 3 +- .../src/app/domain/DomainsList.tsx | 1 + datahub-web-react/src/app/domain/utils.ts | 1 
+ .../header/IconPicker/IconColorPicker.tsx | 48 ++----------------- datahub-web-react/src/graphql/domain.graphql | 6 +++ .../src/graphql/fragments.graphql | 9 ++++ .../src/graphql/glossaryNode.graphql | 3 ++ datahub-web-react/src/graphql/preview.graphql | 3 ++ datahub-web-react/src/graphql/search.graphql | 3 ++ .../src/main/resources/entity-registry.yml | 2 + 10 files changed, 33 insertions(+), 46 deletions(-) diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/domain/DomainType.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/domain/DomainType.java index 7afe1018004e02..65d4332196c6df 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/domain/DomainType.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/domain/DomainType.java @@ -40,7 +40,8 @@ public class DomainType Constants.OWNERSHIP_ASPECT_NAME, Constants.INSTITUTIONAL_MEMORY_ASPECT_NAME, Constants.STRUCTURED_PROPERTIES_ASPECT_NAME, - Constants.FORMS_ASPECT_NAME); + Constants.FORMS_ASPECT_NAME, + Constants.DISPLAY_PROPERTIES_ASPECT_NAME); private final EntityClient _entityClient; public DomainType(final EntityClient entityClient) { diff --git a/datahub-web-react/src/app/domain/DomainsList.tsx b/datahub-web-react/src/app/domain/DomainsList.tsx index b1095726808fe8..37476bdc2cfe42 100644 --- a/datahub-web-react/src/app/domain/DomainsList.tsx +++ b/datahub-web-react/src/app/domain/DomainsList.tsx @@ -196,6 +196,7 @@ export const DomainsList = () => { }, ownership: null, entities: null, + displayProperties: null, }, pageSize, ); diff --git a/datahub-web-react/src/app/domain/utils.ts b/datahub-web-react/src/app/domain/utils.ts index 8273c33e2c41df..e39e97ac86bb6e 100644 --- a/datahub-web-react/src/app/domain/utils.ts +++ b/datahub-web-react/src/app/domain/utils.ts @@ -72,6 +72,7 @@ export const updateListDomainsCache = ( children: null, dataProducts: null, parentDomains: null, + 
displayProperties: null, }, 1000, parentDomain, diff --git a/datahub-web-react/src/app/entityV2/shared/containers/profile/header/IconPicker/IconColorPicker.tsx b/datahub-web-react/src/app/entityV2/shared/containers/profile/header/IconPicker/IconColorPicker.tsx index 714e54439e5fbc..574b012241f174 100644 --- a/datahub-web-react/src/app/entityV2/shared/containers/profile/header/IconPicker/IconColorPicker.tsx +++ b/datahub-web-react/src/app/entityV2/shared/containers/profile/header/IconPicker/IconColorPicker.tsx @@ -1,5 +1,4 @@ import { Input, Modal } from 'antd'; -import { debounce } from 'lodash'; import React from 'react'; import styled from 'styled-components'; @@ -53,13 +52,6 @@ const IconColorPicker: React.FC = ({ const [stagedColor, setStagedColor] = React.useState(color || '#000000'); const [stagedIcon, setStagedIcon] = React.useState(icon || 'account_circle'); - // a debounced version of updateDisplayProperties that takes in the same arguments - // eslint-disable-next-line react-hooks/exhaustive-deps - const debouncedUpdateDisplayProperties = React.useCallback( - debounce((...args) => updateDisplayProperties(...args).then(() => setTimeout(() => refetch(), 1000)), 500), - [], - ); - return ( = ({ }, }, }, - }); + }).then(() => refetch()); onChangeColor?.(stagedColor); onChangeIcon?.(stagedIcon); onClose(); @@ -93,44 +85,10 @@ const IconColorPicker: React.FC = ({ marginBottom: 30, marginTop: 15, }} - onChange={(e) => { - setStagedColor(e.target.value); - debouncedUpdateDisplayProperties?.({ - variables: { - urn, - input: { - colorHex: e.target.value, - icon: { - iconLibrary: IconLibrary.Material, - name: stagedIcon, - style: 'Outlined', - }, - }, - }, - }); - }} + onChange={(e) => setStagedColor(e.target.value)} /> Choose an icon for {name || 'Domain'} - { - console.log('picking icon', i); - debouncedUpdateDisplayProperties?.({ - variables: { - urn, - input: { - colorHex: stagedColor, - icon: { - iconLibrary: IconLibrary.Material, - name: 
capitalize(snakeToCamel(i)), - style: 'Outlined', - }, - }, - }, - }); - setStagedIcon(i); - }} - /> + setStagedIcon(i)} /> ); }; diff --git a/datahub-web-react/src/graphql/domain.graphql b/datahub-web-react/src/graphql/domain.graphql index 1a26e6313916cf..f945590a461c89 100644 --- a/datahub-web-react/src/graphql/domain.graphql +++ b/datahub-web-react/src/graphql/domain.graphql @@ -40,6 +40,9 @@ query getDomain($urn: String!) { forms { ...formsFields } + displayProperties { + ...displayPropertiesFields + } ...domainEntitiesFields ...notes } @@ -64,6 +67,9 @@ query listDomains($input: ListDomainsInput!) { ownership { ...ownershipFields } + displayProperties { + ...displayPropertiesFields + } ...domainEntitiesFields } } diff --git a/datahub-web-react/src/graphql/fragments.graphql b/datahub-web-react/src/graphql/fragments.graphql index dacf5e4c900ae7..fe2b5fddd7ab95 100644 --- a/datahub-web-react/src/graphql/fragments.graphql +++ b/datahub-web-react/src/graphql/fragments.graphql @@ -229,6 +229,9 @@ fragment parentNodesFields on ParentNodesResult { properties { name } + displayProperties { + ...displayPropertiesFields + } } } @@ -238,6 +241,9 @@ fragment parentDomainsFields on ParentDomainsResult { urn type ... on Domain { + displayProperties { + ...displayPropertiesFields + } properties { name description @@ -1259,6 +1265,9 @@ fragment entityDomain on DomainAssociation { ...parentDomainsFields } ...domainEntitiesFields + displayProperties { + ...displayPropertiesFields + } } associatedUrn } diff --git a/datahub-web-react/src/graphql/glossaryNode.graphql b/datahub-web-react/src/graphql/glossaryNode.graphql index 38202872306fbd..9c452c4868f5e8 100644 --- a/datahub-web-react/src/graphql/glossaryNode.graphql +++ b/datahub-web-react/src/graphql/glossaryNode.graphql @@ -54,6 +54,9 @@ query getGlossaryNode($urn: String!) 
{ } } } + displayProperties { + ...displayPropertiesFields + } ...notes } } diff --git a/datahub-web-react/src/graphql/preview.graphql b/datahub-web-react/src/graphql/preview.graphql index 8000f59f2bf258..fbd12935ebb720 100644 --- a/datahub-web-react/src/graphql/preview.graphql +++ b/datahub-web-react/src/graphql/preview.graphql @@ -341,6 +341,9 @@ fragment entityPreview on Entity { parentDomains { ...parentDomainsFields } + displayProperties { + ...displayPropertiesFields + } ...domainEntitiesFields } ... on Container { diff --git a/datahub-web-react/src/graphql/search.graphql b/datahub-web-react/src/graphql/search.graphql index 89ef2709504d53..14e5bc408dad8c 100644 --- a/datahub-web-react/src/graphql/search.graphql +++ b/datahub-web-react/src/graphql/search.graphql @@ -845,6 +845,9 @@ fragment searchResultsWithoutSchemaField on Entity { parentDomains { ...parentDomainsFields } + displayProperties { + ...displayPropertiesFields + } ...domainEntitiesFields structuredProperties { properties { diff --git a/metadata-models/src/main/resources/entity-registry.yml b/metadata-models/src/main/resources/entity-registry.yml index ea9939ad6b4078..87dc120a788e57 100644 --- a/metadata-models/src/main/resources/entity-registry.yml +++ b/metadata-models/src/main/resources/entity-registry.yml @@ -239,6 +239,7 @@ entities: - structuredProperties - forms - testResults + - displayProperties - name: container doc: A container of related data assets. 
category: core @@ -300,6 +301,7 @@ entities: - forms - testResults - subTypes + - displayProperties - name: dataHubIngestionSource category: internal keyAspect: dataHubIngestionSourceKey From cf0dc3ac6bf99dd71b02ac2bb4ff005ce7acad93 Mon Sep 17 00:00:00 2001 From: ryota-cloud Date: Wed, 5 Mar 2025 14:52:39 -0800 Subject: [PATCH 03/10] Support container in ML Model Group, Model and Deployment (#12793) --- metadata-models/src/main/resources/entity-registry.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/metadata-models/src/main/resources/entity-registry.yml b/metadata-models/src/main/resources/entity-registry.yml index 87dc120a788e57..cf33d97426ecfc 100644 --- a/metadata-models/src/main/resources/entity-registry.yml +++ b/metadata-models/src/main/resources/entity-registry.yml @@ -377,6 +377,7 @@ entities: - testResults - versionProperties - subTypes + - container - name: mlModelGroup category: core keyAspect: mlModelGroupKey @@ -396,6 +397,7 @@ entities: - forms - testResults - subTypes + - container - name: mlModelDeployment category: core keyAspect: mlModelDeploymentKey @@ -407,6 +409,7 @@ entities: - globalTags - dataPlatformInstance - testResults + - container - name: mlFeatureTable category: core keyAspect: mlFeatureTableKey From 1d1ed78be7322151448242c87d860c2f01bbe7de Mon Sep 17 00:00:00 2001 From: Hyejin Yoon <0327jane@gmail.com> Date: Thu, 6 Mar 2025 08:41:18 +0900 Subject: [PATCH 04/10] docs: update mlflow ingestion docs to include new concept mappings (#12791) Co-authored-by: Harshal Sheth --- metadata-ingestion/docs/sources/mlflow/mlflow_pre.md | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/metadata-ingestion/docs/sources/mlflow/mlflow_pre.md b/metadata-ingestion/docs/sources/mlflow/mlflow_pre.md index fc499a7a3b2b8c..b4991ffeecafda 100644 --- a/metadata-ingestion/docs/sources/mlflow/mlflow_pre.md +++ b/metadata-ingestion/docs/sources/mlflow/mlflow_pre.md @@ -2,8 +2,10 @@ This ingestion source maps the following 
MLflow Concepts to DataHub Concepts: -| Source Concept | DataHub Concept | Notes | -|:---------------------------------------------------------------------------------:|:-------------------------------------------------------------------------------------------:|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| [`Registered Model`](https://mlflow.org/docs/latest/model-registry.html#concepts) | [`MlModelGroup`](https://datahubproject.io/docs/generated/metamodel/entities/mlmodelgroup/) | The name of a Model Group is the same as a Registered Model's name (e.g. my_mlflow_model) | -| [`Model Version`](https://mlflow.org/docs/latest/model-registry.html#concepts) | [`MlModel`](https://datahubproject.io/docs/generated/metamodel/entities/mlmodel/) | The name of a Model is `{registered_model_name}{model_name_separator}{model_version}` (e.g. my_mlflow_model_1 for Registered Model named my_mlflow_model and Version 1, my_mlflow_model_2, etc.) | -| [`Model Stage`](https://mlflow.org/docs/latest/model-registry.html#concepts) | [`Tag`](https://datahubproject.io/docs/generated/metamodel/entities/tag/) | The mapping between Model Stages and generated Tags is the following:
- Production: mlflow_production
- Staging: mlflow_staging
- Archived: mlflow_archived
- None: mlflow_none | +| Source Concept | DataHub Concept | Notes | +|:-----------------------------------------------------------------------------------:|:---------------------------------------------------------------------------------------------------------:|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| [`Registered Model`](https://mlflow.org/docs/latest/model-registry/#registered-model) | [`MlModelGroup`](https://datahubproject.io/docs/generated/metamodel/entities/mlmodelgroup/) | The name of a Model Group is the same as a Registered Model's name (e.g. my_mlflow_model). Registered Models serve as containers for multiple versions of the same model in MLflow. | +| [`Model Version`](https://mlflow.org/docs/latest/model-registry/#model-version) | [`MlModel`](https://datahubproject.io/docs/generated/metamodel/entities/mlmodel/) | The name of a Model is `{registered_model_name}{model_name_separator}{model_version}` (e.g. my_mlflow_model_1 for Registered Model named my_mlflow_model and Version 1, my_mlflow_model_2, etc.). Each Model Version represents a specific iteration of a model with its own artifacts and metadata. | +| [`Experiment`](https://mlflow.org/docs/latest/tracking/#experiments) | [`Container`](https://datahubproject.io/docs/generated/metamodel/entities/container/) | Each Experiment in MLflow is mapped to a Container in DataHub. Experiments organize related runs and serve as logical groupings for model development iterations, allowing tracking of parameters, metrics, and artifacts. 
| +| [`Run`](https://mlflow.org/docs/latest/tracking/#runs) | [`DataProcessInstance`](https://datahubproject.io/docs/generated/metamodel/entities/dataprocessinstance/) | Captures the run's execution details, parameters, metrics, and lineage to a model. | +| [`Model Stage`](https://mlflow.org/docs/latest/model-registry/#deprecated-using-model-stages) | [`Tag`](https://datahubproject.io/docs/generated/metamodel/entities/tag/) | The mapping between Model Stages and generated Tags is the following:
- Production: mlflow_production
- Staging: mlflow_staging
- Archived: mlflow_archived
- None: mlflow_none. Model Stages indicate the deployment status of each version. | \ No newline at end of file From 484faee243034114ebd9212b8d169b66fa81c828 Mon Sep 17 00:00:00 2001 From: Jay <159848059+jayacryl@users.noreply.github.com> Date: Wed, 5 Mar 2025 16:57:38 -0800 Subject: [PATCH 05/10] fix(web) move form entity sidebar to right to align with cloud (#12796) --- .../app/entity/shared/entityForm/FormByEntity.tsx | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/datahub-web-react/src/app/entity/shared/entityForm/FormByEntity.tsx b/datahub-web-react/src/app/entity/shared/entityForm/FormByEntity.tsx index 61a70c145bd115..a6c3fdae71b2ed 100644 --- a/datahub-web-react/src/app/entity/shared/entityForm/FormByEntity.tsx +++ b/datahub-web-react/src/app/entity/shared/entityForm/FormByEntity.tsx @@ -63,13 +63,14 @@ export default function FormByEntity({ formUrn }: Props) { - }} - backgroundColor="white" - alignLeft - />
+ {selectedEntityData && ( + }} + backgroundColor="white" + /> + )} From 6d4744f93b829455bdf699ee8a7549f312044f21 Mon Sep 17 00:00:00 2001 From: Chakru <161002324+chakru-r@users.noreply.github.com> Date: Thu, 6 Mar 2025 06:28:46 +0530 Subject: [PATCH 06/10] doc(iceberg): iceberg doc updates (#12787) Co-authored-by: Shirshanka Das --- docs/iceberg-catalog.md | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/docs/iceberg-catalog.md b/docs/iceberg-catalog.md index c11a2fb270f015..a4461a2cfc4a02 100644 --- a/docs/iceberg-catalog.md +++ b/docs/iceberg-catalog.md @@ -46,6 +46,8 @@ Before starting, ensure you have: DH_ICEBERG_DATA_ROOT="s3://your-bucket/path" ``` + The `DH_ICEBERG_CLIENT_ID` is the `AWS_ACCESS_KEY_ID` and `DH_ICEBERG_CLIENT_SECRET` is the `AWS_SECRET_ACCESS_KEY` + 4. If using pyiceberg, configure pyiceberg to use your local datahub using one of its supported ways. For example, create `~/.pyiceberg.yaml` with ```commandline catalog: @@ -124,8 +126,10 @@ You can create Iceberg tables using PyIceberg with a defined schema. Here's an e -Connect to the DataHub Iceberg Catalog using Spark SQL by defining `$GMS_HOST`, `$GMS_PORT`, `$WAREHOUSE` to connect to and `$USER_PAT` - the DataHub Personal Access Token used to connect to the catalog: -When datahub is running locally, set `GMS_HOST` to `localhost` and `GMS_PORT` to `8080`. +Connect to the DataHub Iceberg Catalog using Spark SQL by defining `$GMS_HOST`, `$GMS_PORT`, `$WAREHOUSE` to connect to and `$USER_PAT` - the DataHub Personal Access Token used to connect to the catalog. +When using DataHub Cloud (Acryl), the Iceberg Catalog URL is `https://.acryl.io/gms/iceberg/` +If you're running DataHub locally, set `GMS_HOST` to `localhost` and `GMS_PORT` to `8080`. 
+ For this example, set `WAREHOUSE` to `arctic_warehouse` ```cli @@ -518,4 +522,4 @@ A: Check that: - [Apache Iceberg Documentation](https://iceberg.apache.org/) - [PyIceberg Documentation](https://py.iceberg.apache.org/) -- [DataHub Documentation](https://datahubproject.io/docs/) \ No newline at end of file +- [DataHub Documentation](https://datahubproject.io/docs/) From ba8affbc7ab8c9f8e6fc2d7e346c27b436deb90e Mon Sep 17 00:00:00 2001 From: Hyejin Yoon <0327jane@gmail.com> Date: Thu, 6 Mar 2025 12:23:22 +0900 Subject: [PATCH 07/10] docs: add exporting from source to write mcp guide (#12800) --- docs/advanced/writing-mcps.md | 43 +++++++++++++++++++++++++++++++---- 1 file changed, 38 insertions(+), 5 deletions(-) diff --git a/docs/advanced/writing-mcps.md b/docs/advanced/writing-mcps.md index a90bff77ea555c..a74385df2f5488 100644 --- a/docs/advanced/writing-mcps.md +++ b/docs/advanced/writing-mcps.md @@ -39,15 +39,48 @@ For example, if you want to understand the structure of entities in your DataHub ## Saving MCPs to a file -### Exporting rom DataHub Instance +### Exporting from Ingestion Source -You can export MCPs directly from your DataHub instance using a recipe file. This is useful when you want to: +You can export MCPs from an ingestion source (such as BigQuery, Snowflake, etc.) to a file using the `file` sink type in your recipe. This approach is useful when you want to: -- Examine existing entities in your DataHub instance +- Save MCPs for later ingestion +- Examine existing entities in the source +- Debug ingestion issues + +To get started, create a recipe file (e.g., `export_mcps.yaml`) specifying your target source and the file `sink` type: + +```yaml +source: + type: bigquery # Replace with your source type + config: + ... 
# Add your source configuration here +sink: + type: "file" + config: + filename: "mcps.json" +``` + +Run the ingestion with the following command: + +```python +datahub ingest -c export_mcps.yaml +``` + +This command will extract all entities from your source and write them to `mcps.json` in MCP format. + +For more details about the `file` sink type, please refer to [Metadata File](../../metadata-ingestion/sink_docs/metadata-file.md) + +### Exporting from DataHub Instance + +You can also export MCPs directly from an existing DataHub instance using a similar recipe approach. This method is particularly useful when you need to: + +- Examine entities already in your DataHub instance - Create test cases based on real data - Debug entity relationships -First, create a recipe file (e.g., `export_mcps.yaml`): +The process is similar to exporting from an ingestion source, with the only difference being that you'll use `datahub` as the source type. +Create a recipe file (e.g., `export_mcps.yaml`) with this configuration: + ```yaml source: @@ -69,7 +102,7 @@ Run the ingestion: datahub ingest -c export_mcps.yaml ``` -This will write all the entities from your DataHub instance to `mcps.json` in MCP format. +This will extract all entities from your DataHub instance and save them to `mcps.json` in MCP format. 
### Creating MCPs with Python SDK From 4714f46f11a949fac21d36a4bcf711fcc33ba870 Mon Sep 17 00:00:00 2001 From: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com> Date: Thu, 6 Mar 2025 19:37:18 +0530 Subject: [PATCH 08/10] feat(ingest/redshift): support for datashares lineage (#12660) Co-authored-by: Harshal Sheth --- .../docs/sources/redshift/README.md | 0 .../docs/sources/redshift/redshift_pre.md | 104 ++++ .../api/entities/common/serialized_value.py | 7 +- .../ingestion/source/redshift/config.py | 4 + .../ingestion/source/redshift/datashares.py | 236 +++++++++ .../ingestion/source/redshift/lineage.py | 8 +- .../ingestion/source/redshift/lineage_v2.py | 11 +- .../ingestion/source/redshift/profile.py | 2 +- .../ingestion/source/redshift/query.py | 158 ++++-- .../ingestion/source/redshift/redshift.py | 113 ++-- .../source/redshift/redshift_schema.py | 172 ++++++- .../ingestion/source/redshift/report.py | 3 + .../unit/redshift/test_redshift_datashares.py | 486 ++++++++++++++++++ 13 files changed, 1183 insertions(+), 121 deletions(-) delete mode 100644 metadata-ingestion/docs/sources/redshift/README.md create mode 100644 metadata-ingestion/docs/sources/redshift/redshift_pre.md create mode 100644 metadata-ingestion/src/datahub/ingestion/source/redshift/datashares.py create mode 100644 metadata-ingestion/tests/unit/redshift/test_redshift_datashares.py diff --git a/metadata-ingestion/docs/sources/redshift/README.md b/metadata-ingestion/docs/sources/redshift/README.md deleted file mode 100644 index e69de29bb2d1d6..00000000000000 diff --git a/metadata-ingestion/docs/sources/redshift/redshift_pre.md b/metadata-ingestion/docs/sources/redshift/redshift_pre.md new file mode 100644 index 00000000000000..ad54de6811db3f --- /dev/null +++ b/metadata-ingestion/docs/sources/redshift/redshift_pre.md @@ -0,0 +1,104 @@ +### Prerequisites + +This source needs to access system tables that require extra permissions. 
+To grant these permissions, please alter your datahub Redshift user the following way: +```sql +ALTER USER datahub_user WITH SYSLOG ACCESS UNRESTRICTED; +GRANT SELECT ON pg_catalog.svv_table_info to datahub_user; +GRANT SELECT ON pg_catalog.svl_user_info to datahub_user; +``` + +To ingest datashares lineage, ingestion user for both producer and consumer namespace would need alter/share +access to datashare. See [svv_datashares](https://docs.aws.amazon.com/redshift/latest/dg/r_SVV_DATASHARES.html) +docs for more information. +```sql +GRANT SHARE ON to datahub_user +``` + +:::note + +Giving a user unrestricted access to system tables gives the user visibility to data generated by other users. For example, STL_QUERY and STL_QUERYTEXT contain the full text of INSERT, UPDATE, and DELETE statements. + +::: + +### Concept mapping +| Source Concept | DataHub Concept | Notes | +| -------------- | --------------------------------------------------------- | ------------------ | +| `"redshift"` | [Data Platform](../../metamodel/entities/dataPlatform.md) | | +| Database | [Container](../../metamodel/entities/container.md) | Subtype `Database` | +| Schema | [Container](../../metamodel/entities/container.md) | Subtype `Schema` | +| Table | [Dataset](../../metamodel/entities/dataset.md) | Subtype `Table` | +| View | [Dataset](../../metamodel/entities/dataset.md) | Subtype `View` | + + +### Ingestion of multiple redshift databases, namespaces + +- If multiple databases are present in the Redshift namespace (or provisioned cluster), +you would need to set up a separate ingestion per database. + +- Ingestion recipes of all databases in a particular redshift namespace should use same platform instance. + +- If you've multiple redshift namespaces that you want to ingest within DataHub, it is highly recommended that +you specify a platform_instance equivalent to namespace in recipe. 
It can be same as namespace id or other +human readable name however it should be unique across all your redshift namespaces. + + +### Lineage + +There are multiple lineage collector implementations as Redshift does not support table lineage out of the box. + +#### stl_scan_based +The stl_scan based collector uses Redshift's [stl_insert](https://docs.aws.amazon.com/redshift/latest/dg/r_STL_INSERT.html) and [stl_scan](https://docs.aws.amazon.com/redshift/latest/dg/r_STL_SCAN.html) system tables to +discover lineage between tables. +Pros: +- Fast +- Reliable + +Cons: +- Does not work with Spectrum/external tables because those scans do not show up in stl_scan table. +- If a table is depending on a view then the view won't be listed as dependency. Instead the table will be connected with the view's dependencies. + +#### sql_based +The sql_based based collector uses Redshift's [stl_insert](https://docs.aws.amazon.com/redshift/latest/dg/r_STL_INSERT.html) to discover all the insert queries +and uses sql parsing to discover the dependencies. + +Pros: +- Works with Spectrum tables +- Views are connected properly if a table depends on it + +Cons: +- Slow. +- Less reliable as the query parser can fail on certain queries + +#### mixed +Using both collector above and first applying the sql based and then the stl_scan based one. + +Pros: +- Works with Spectrum tables +- Views are connected properly if a table depends on it +- A bit more reliable than the sql_based one only + +Cons: +- Slow +- May be incorrect at times as the query parser can fail on certain queries + +:::note + +The redshift stl redshift tables which are used for getting data lineage retain at most seven days of log history, and sometimes closer to 2-5 days. This means you cannot extract lineage from queries issued outside that window. 
+ +::: + +### Datashares Lineage +This is enabled by default, can be disabled via setting `include_share_lineage: False` + +It is mandatory to run redshift ingestion of datashare producer namespace at least once so that lineage +shows up correctly after datashare consumer namespace is ingested. + +### Profiling +Profiling runs sql queries on the redshift cluster to get statistics about the tables. To be able to do that, the user needs to have read access to the tables that should be profiled. + +If you don't want to grant read access to the tables you can enable table level profiling which will get table statistics without reading the data. +```yaml +profiling: + profile_table_level_only: true +``` \ No newline at end of file diff --git a/metadata-ingestion/src/datahub/api/entities/common/serialized_value.py b/metadata-ingestion/src/datahub/api/entities/common/serialized_value.py index 0f13ea04ab0753..feed42cd2fde34 100644 --- a/metadata-ingestion/src/datahub/api/entities/common/serialized_value.py +++ b/metadata-ingestion/src/datahub/api/entities/common/serialized_value.py @@ -1,6 +1,6 @@ import json import logging -from typing import Dict, Optional, Type, Union +from typing import Dict, Optional, Type, TypeVar, Union from avrogen.dict_wrapper import DictWrapper from pydantic import BaseModel @@ -13,6 +13,7 @@ _REMAPPED_SCHEMA_TYPES = { k.replace("pegasus2avro.", ""): v for k, v in SCHEMA_TYPES.items() } +T = TypeVar("T", bound=BaseModel) class SerializedResourceValue(BaseModel): @@ -83,8 +84,8 @@ def as_pegasus_object(self) -> DictWrapper: ) def as_pydantic_object( - self, model_type: Type[BaseModel], validate_schema_ref: bool = False - ) -> BaseModel: + self, model_type: Type[T], validate_schema_ref: bool = False + ) -> T: """ Parse the blob into a Pydantic-defined Python object based on the schema type and schema ref. 
diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/config.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/config.py index 932ada0a908b28..80b834edb3940d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/config.py @@ -128,6 +128,10 @@ class RedshiftConfig( default=True, description="Whether lineage should be collected from copy commands", ) + include_share_lineage: bool = Field( + default=True, + description="Whether lineage should be collected from datashares", + ) include_usage_statistics: bool = Field( default=False, diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/datashares.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/datashares.py new file mode 100644 index 00000000000000..a4e7d509fda5e4 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/datashares.py @@ -0,0 +1,236 @@ +from typing import Dict, Iterable, List, Optional, Union + +from pydantic import BaseModel + +from datahub.api.entities.platformresource.platform_resource import ( + ElasticPlatformResourceQuery, + PlatformResource, + PlatformResourceKey, + PlatformResourceSearchFields, +) +from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.ingestion.graph.client import DataHubGraph +from datahub.ingestion.source.redshift.config import RedshiftConfig +from datahub.ingestion.source.redshift.redshift_schema import ( + InboundDatashare, + OutboundDatashare, + PartialInboundDatashare, + RedshiftTable, + RedshiftView, +) +from datahub.ingestion.source.redshift.report import RedshiftReport +from datahub.sql_parsing.sql_parsing_aggregator import KnownLineageMapping +from datahub.utilities.search_utils import LogicalOperator + + +class OutboundSharePlatformResource(BaseModel): + namespace: str + platform_instance: Optional[str] 
+ env: str + source_database: str + share_name: str + + def get_key(self) -> str: + return f"{self.namespace}.{self.share_name}" + + +PLATFORM_RESOURCE_TYPE = "OUTBOUND_DATASHARE" + + +class RedshiftDatasharesHelper: + """ + Redshift datashares lineage generation relies on PlatformResource entity + to identify the producer namespace and its platform_instance and env + + Ingestion of any database in namespace will + A. generate PlatformResource entity for all outbound shares in namespace. + B. generate lineage with upstream tables from another namespace, if the database + is created from an inbound share + + """ + + def __init__( + self, + config: RedshiftConfig, + report: RedshiftReport, + graph: Optional[DataHubGraph], + ): + self.platform = "redshift" + self.config = config + self.report = report + self.graph = graph + + def to_platform_resource( + self, shares: List[OutboundDatashare] + ) -> Iterable[MetadataChangeProposalWrapper]: + if not shares: + self.report.outbound_shares_count = 0 + return + + self.report.outbound_shares_count = len(shares) + # Producer namespace will be current namespace for all + # outbound data shares + + for share in shares: + producer_namespace = share.producer_namespace + try: + platform_resource_key = PlatformResourceKey( + platform=self.platform, + platform_instance=self.config.platform_instance, + resource_type=PLATFORM_RESOURCE_TYPE, + primary_key=share.get_key(), + ) + + value = OutboundSharePlatformResource( + namespace=producer_namespace, + platform_instance=self.config.platform_instance, + env=self.config.env, + source_database=share.source_database, + share_name=share.share_name, + ) + + platform_resource = PlatformResource.create( + key=platform_resource_key, + value=value, + secondary_keys=[share.share_name, share.producer_namespace], + ) + + yield from platform_resource.to_mcps() + + except Exception as exc: + self.report.warning( + title="Downstream lineage to outbound datashare may not work", + message="Failed to 
generate platform resource for outbound datashares", + context=f"Namespace {share.producer_namespace} Share {share.share_name}", + exc=exc, + ) + + def generate_lineage( + self, + share: Union[InboundDatashare, PartialInboundDatashare], + tables: Dict[str, List[Union[RedshiftTable, RedshiftView]]], + ) -> Iterable[KnownLineageMapping]: + upstream_share = self.find_upstream_share(share) + + if not upstream_share: + return + + for schema in tables: + for table in tables[schema]: + dataset_urn = self.gen_dataset_urn( + f"{share.consumer_database}.{schema}.{table.name}", + self.config.platform_instance, + self.config.env, + ) + + upstream_dataset_urn = self.gen_dataset_urn( + f"{upstream_share.source_database}.{schema}.{table.name}", + upstream_share.platform_instance, + upstream_share.env, + ) + + yield KnownLineageMapping( + upstream_urn=upstream_dataset_urn, downstream_urn=dataset_urn + ) + + def find_upstream_share( + self, share: Union[InboundDatashare, PartialInboundDatashare] + ) -> Optional[OutboundSharePlatformResource]: + if not self.graph: + self.report.warning( + title="Upstream lineage of inbound datashare will be missing", + message="Missing datahub graph. Either use the datahub-rest sink or " + "set the top-level datahub_api config in the recipe", + ) + else: + resources = self.get_platform_resources(self.graph, share) + + if len(resources) == 0 or ( + not any( + [ + resource.resource_info is not None + and resource.resource_info.resource_type + == PLATFORM_RESOURCE_TYPE + for resource in resources + ] + ) + ): + self.report.info( + title="Upstream lineage of inbound datashare will be missing", + message="Missing platform resource for share. " + "Setup redshift ingestion for namespace if not already done. 
If ingestion is setup, " + "check whether ingestion user has ALTER/SHARE permission to share.", + context=share.get_description(), + ) + else: + # Ideally we should get only one resource as primary key is namespace+share + # and type is "OUTBOUND_DATASHARE" + for resource in resources: + try: + assert ( + resource.resource_info is not None + and resource.resource_info.value is not None + ) + return resource.resource_info.value.as_pydantic_object( + OutboundSharePlatformResource, True + ) + except Exception as e: + self.report.warning( + title="Upstream lineage of inbound datashare will be missing", + message="Failed to parse platform resource for outbound datashare", + context=share.get_description(), + exc=e, + ) + + return None + + def get_platform_resources( + self, + graph: DataHubGraph, + share: Union[InboundDatashare, PartialInboundDatashare], + ) -> List[PlatformResource]: + # NOTE: ideally we receive InboundDatashare and not PartialInboundDatashare. + # however due to varchar(128) type of database table that captures datashare options + # we may receive only partial information about inbound share + # Alternate option to get InboundDatashare using svv_datashares requires superuser + if isinstance(share, PartialInboundDatashare): + return list( + PlatformResource.search_by_filters( + graph, + ElasticPlatformResourceQuery.create_from() + .group(LogicalOperator.AND) + .add_field_match( + PlatformResourceSearchFields.RESOURCE_TYPE, + PLATFORM_RESOURCE_TYPE, + ) + .add_field_match( + PlatformResourceSearchFields.PLATFORM, self.platform + ) + .add_field_match( + PlatformResourceSearchFields.SECONDARY_KEYS, + share.share_name, + ) + .add_wildcard( + PlatformResourceSearchFields.SECONDARY_KEYS.field_name, + f"{share.producer_namespace_prefix}*", + ) + .end(), + ) + ) + return list( + PlatformResource.search_by_key( + graph, key=share.get_key(), primary=True, is_exact=True + ) + ) + + # TODO: Refactor and move to new RedshiftIdentifierBuilder class + def 
gen_dataset_urn( + self, datahub_dataset_name: str, platform_instance: Optional[str], env: str + ) -> str: + return make_dataset_urn_with_platform_instance( + platform=self.platform, + name=datahub_dataset_name, + platform_instance=platform_instance, + env=env, + ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage.py index 4b3d238a13261c..3b9b069902542a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage.py @@ -813,9 +813,13 @@ def get_lineage( ) tablename = table.name - if table.type == "EXTERNAL_TABLE": + if ( + table.is_external_table + and schema.is_external_schema + and schema.external_platform + ): # external_db_params = schema.option - upstream_platform = schema.type.lower() + upstream_platform = schema.external_platform.lower() catalog_upstream = UpstreamClass( mce_builder.make_dataset_urn_with_platform_instance( upstream_platform, diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage_v2.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage_v2.py index 4b84c25965a994..1f43d0c3bf4887 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage_v2.py @@ -401,11 +401,14 @@ def _process_external_tables( ) -> None: for schema_name, tables in all_tables[self.database].items(): for table in tables: - if table.type == "EXTERNAL_TABLE": - schema = db_schemas[self.database][schema_name] - + schema = db_schemas[self.database][schema_name] + if ( + table.is_external_table + and schema.is_external_schema + and schema.external_platform + ): # external_db_params = schema.option - upstream_platform = schema.type.lower() + upstream_platform = schema.external_platform.lower() table_urn = mce_builder.make_dataset_urn_with_platform_instance( 
self.platform, diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/profile.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/profile.py index 6f611fa6741879..4ef79062670415 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/profile.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/profile.py @@ -48,7 +48,7 @@ def get_workunits( if not self.config.schema_pattern.allowed(schema): continue for table in tables[db].get(schema, {}): - if table.type == "EXTERNAL_TABLE": + if table.is_external_table: if not self.config.profiling.profile_external_tables: # Case 1: If user did not tell us to profile external tables, simply log this. self.report.profiling_skipped_other[schema] += 1 diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/query.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/query.py index 62f7d0a3901c7a..0c101dbb5ec676 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/query.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/query.py @@ -31,40 +31,62 @@ def get_temp_table_clause(table_name: str) -> List[str]: AND (datname <> ('template1')::name) """ - list_schemas: str = """SELECT distinct n.nspname AS "schema_name", - 'local' as schema_type, - null as schema_owner_name, - '' as schema_option, - null as external_database - FROM pg_catalog.pg_class c - LEFT JOIN pg_catalog.pg_namespace n ON n.oid = c.relnamespace - JOIN pg_catalog.pg_user u ON u.usesysid = c.relowner - WHERE c.relkind IN ('r','v','m','S','f') - AND n.nspname !~ '^pg_' - AND n.nspname != 'information_schema' -UNION ALL -SELECT schemaname as schema_name, - CASE s.eskind - WHEN '1' THEN 'GLUE' - WHEN '2' THEN 'HIVE' - WHEN '3' THEN 'POSTGRES' - WHEN '4' THEN 'REDSHIFT' - ELSE 'OTHER' - END as schema_type, - -- setting user_name to null as we don't use it now now and it breaks backward compatibility due to additional permission need - -- usename as schema_owner_name, - null 
as schema_owner_name, - esoptions as schema_option, - databasename as external_database + # NOTE: although schema owner id is available in tables, we do not use it + # as getting username from id requires access to pg_catalog.pg_user_info + # which is available only to superusers. + # NOTE: Need union here instead of using svv_all_schemas, in order to get + # external platform related lineage + # NOTE: Using database_name filter for svv_redshift_schemas, as otherwise + # schemas from other shared databases also show up. + @staticmethod + def list_schemas(database: str) -> str: + return f""" + SELECT + schema_name, + schema_type, + schema_option, + cast(null as varchar(256)) as external_platform, + cast(null as varchar(256)) as external_database + FROM svv_redshift_schemas + WHERE database_name = '{database}' + AND schema_name != 'pg_catalog' and schema_name != 'information_schema' + UNION ALL + SELECT + schemaname as schema_name, + 'external' as schema_type, + esoptions as schema_option, + CASE s.eskind + WHEN '1' THEN 'GLUE' + WHEN '2' THEN 'HIVE' + WHEN '3' THEN 'POSTGRES' + WHEN '4' THEN 'REDSHIFT' + ELSE 'OTHER' + END as external_platform, + databasename as external_database FROM SVV_EXTERNAL_SCHEMAS as s - -- inner join pg_catalog.pg_user_info as i on i.usesysid = s.esowner ORDER BY SCHEMA_NAME; """ + @staticmethod + def get_database_details(database): + return f"""\ + select + database_name, + database_type, + database_options + from svv_redshift_databases + where database_name='{database}';""" + + # NOTE: although table owner id is available in tables, we do not use it + # as getting username from id requires access to pg_catalog.pg_user_info + # which is available only to superusers. 
+ # NOTE: Tables from shared database are not available in pg_catalog.pg_class @staticmethod def list_tables( - skip_external_tables: bool = False, + skip_external_tables: bool = False, is_shared_database: bool = False ) -> str: + # NOTE: it looks like description is available only in pg_description + # So this remains preferrred way tables_query = """ SELECT CASE c.relkind WHEN 'r' THEN 'TABLE' @@ -83,8 +105,6 @@ def list_tables( WHEN 8 THEN 'ALL' END AS "diststyle", c.relowner AS "owner_id", - -- setting user_name to null as we don't use it now now and it breaks backward compatibility due to additional permission need - -- u.usename AS "owner_name", null as "owner_name", TRIM(TRAILING ';' FROM pg_catalog.pg_get_viewdef (c.oid,TRUE)) AS "view_definition", pg_catalog.array_to_string(c.relacl,'\n') AS "privileges", @@ -98,11 +118,11 @@ def list_tables( LEFT JOIN pg_catalog.pg_namespace n ON n.oid = c.relnamespace LEFT JOIN pg_class_info as ci on c.oid = ci.reloid LEFT JOIN pg_catalog.pg_description pgd ON pgd.objsubid = 0 AND pgd.objoid = c.oid - -- JOIN pg_catalog.pg_user u ON u.usesysid = c.relowner WHERE c.relkind IN ('r','v','m','S','f') AND n.nspname !~ '^pg_' AND n.nspname != 'information_schema' """ + external_tables_query = """ SELECT 'EXTERNAL_TABLE' as tabletype, NULL AS "schema_oid", @@ -125,13 +145,62 @@ def list_tables( ORDER BY "schema", "relname" """ - if skip_external_tables: + shared_database_tables_query = """ + SELECT table_type as tabletype, + NULL AS "schema_oid", + schema_name AS "schema", + NULL AS "rel_oid", + table_name AS "relname", + NULL as "creation_time", + NULL AS "diststyle", + table_owner AS "owner_id", + NULL AS "owner_name", + NULL AS "view_definition", + table_acl AS "privileges", + NULL as "location", + NULL as parameters, + NULL as input_format, + NULL As output_format, + NULL as serde_parameters, + NULL as table_description + FROM svv_redshift_tables + ORDER BY "schema", + "relname" +""" + if is_shared_database: + return 
shared_database_tables_query + elif skip_external_tables: return tables_query else: return f"{tables_query} UNION {external_tables_query}" - # Why is this unused. Is this a bug? - list_columns: str = """ + @staticmethod + def list_columns(is_shared_database: bool = False) -> str: + if is_shared_database: + return """ + SELECT + schema_name as "schema", + table_name as "table_name", + column_name as "name", + encoding as "encode", + -- Spectrum represents data types differently. + -- Standardize, so we can infer types. + data_type AS "type", + distkey as "distkey", + sortkey as "sortkey", + (case when is_nullable = 'no' then TRUE else FALSE end) as "notnull", + null as "comment", + null as "adsrc", + ordinal_position as "attnum", + data_type AS "format_type", + column_default as "default", + null as "schema_oid", + null as "table_oid" + FROM SVV_REDSHIFT_COLUMNS + WHERE 1 and schema = '{schema_name}' + ORDER BY "schema", "table_name", "attnum" +""" + return """ SELECT n.nspname as "schema", c.relname as "table_name", @@ -362,6 +431,29 @@ def list_insert_create_queries_sql( ) -> str: raise NotImplementedError + @staticmethod + def list_outbound_datashares() -> str: + return """SELECT \ + share_type, \ + share_name, \ + trim(producer_namespace) as producer_namespace, \ + source_database \ + FROM svv_datashares + WHERE share_type='OUTBOUND'\ + """ + + @staticmethod + def get_inbound_datashare(database: str) -> str: + return f"""SELECT \ + share_type, \ + share_name, \ + trim(producer_namespace) as producer_namespace, \ + consumer_database \ + FROM svv_datashares + WHERE share_type='INBOUND' + AND consumer_database= '{database}'\ + """ + class RedshiftProvisionedQuery(RedshiftCommonQuery): @staticmethod diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py index cce282c71056a2..5a357001884423 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py 
+++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py @@ -33,7 +33,10 @@ TestableSource, TestConnectionReport, ) -from datahub.ingestion.api.source_helpers import create_dataset_props_patch_builder +from datahub.ingestion.api.source_helpers import ( + auto_workunit, + create_dataset_props_patch_builder, +) from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.glossary.classification_mixin import ( ClassificationHandler, @@ -45,6 +48,7 @@ DatasetSubTypes, ) from datahub.ingestion.source.redshift.config import RedshiftConfig +from datahub.ingestion.source.redshift.datashares import RedshiftDatasharesHelper from datahub.ingestion.source.redshift.exception import handle_redshift_exceptions_yield from datahub.ingestion.source.redshift.lineage import RedshiftLineageExtractor from datahub.ingestion.source.redshift.lineage_v2 import RedshiftSqlLineageV2 @@ -52,6 +56,7 @@ from datahub.ingestion.source.redshift.redshift_data_reader import RedshiftDataReader from datahub.ingestion.source.redshift.redshift_schema import ( RedshiftColumn, + RedshiftDatabase, RedshiftDataDictionary, RedshiftSchema, RedshiftTable, @@ -150,76 +155,6 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource): - Table, row, and column statistics via optional SQL profiling - Table lineage - Usage statistics - - ### Prerequisites - - This source needs to access system tables that require extra permissions. - To grant these permissions, please alter your datahub Redshift user the following way: - ```sql - ALTER USER datahub_user WITH SYSLOG ACCESS UNRESTRICTED; - GRANT SELECT ON pg_catalog.svv_table_info to datahub_user; - GRANT SELECT ON pg_catalog.svl_user_info to datahub_user; - ``` - - :::note - - Giving a user unrestricted access to system tables gives the user visibility to data generated by other users. For example, STL_QUERY and STL_QUERYTEXT contain the full text of INSERT, UPDATE, and DELETE statements. 
- - ::: - - ### Lineage - - There are multiple lineage collector implementations as Redshift does not support table lineage out of the box. - - #### stl_scan_based - The stl_scan based collector uses Redshift's [stl_insert](https://docs.aws.amazon.com/redshift/latest/dg/r_STL_INSERT.html) and [stl_scan](https://docs.aws.amazon.com/redshift/latest/dg/r_STL_SCAN.html) system tables to - discover lineage between tables. - Pros: - - Fast - - Reliable - - Cons: - - Does not work with Spectrum/external tables because those scans do not show up in stl_scan table. - - If a table is depending on a view then the view won't be listed as dependency. Instead the table will be connected with the view's dependencies. - - #### sql_based - The sql_based based collector uses Redshift's [stl_insert](https://docs.aws.amazon.com/redshift/latest/dg/r_STL_INSERT.html) to discover all the insert queries - and uses sql parsing to discover the dependencies. - - Pros: - - Works with Spectrum tables - - Views are connected properly if a table depends on it - - Cons: - - Slow. - - Less reliable as the query parser can fail on certain queries - - #### mixed - Using both collector above and first applying the sql based and then the stl_scan based one. - - Pros: - - Works with Spectrum tables - - Views are connected properly if a table depends on it - - A bit more reliable than the sql_based one only - - Cons: - - Slow - - May be incorrect at times as the query parser can fail on certain queries - - :::note - - The redshift stl redshift tables which are used for getting data lineage retain at most seven days of log history, and sometimes closer to 2-5 days. This means you cannot extract lineage from queries issued outside that window. - - ::: - - ### Profiling - Profiling runs sql queries on the redshift cluster to get statistics about the tables. To be able to do that, the user needs to have read access to the tables that should be profiled. 
- - If you don't want to grant read access to the tables you can enable table level profiling which will get table statistics without reading the data. - ```yaml - profiling: - profile_table_level_only: true - ``` """ # TODO: Replace with standardized types in sql_types.py @@ -330,6 +265,9 @@ def __init__(self, config: RedshiftConfig, ctx: PipelineContext): self.config: RedshiftConfig = config self.report: RedshiftReport = RedshiftReport() self.classification_handler = ClassificationHandler(self.config, self.report) + self.datashares_helper = RedshiftDatasharesHelper( + self.config, self.report, self.ctx.graph + ) self.platform = "redshift" self.domain_registry = None if self.config.domain: @@ -361,6 +299,7 @@ def __init__(self, config: RedshiftConfig, ctx: PipelineContext): is_serverless=self.config.is_serverless ) + self.db: Optional[RedshiftDatabase] = None self.db_tables: Dict[str, Dict[str, List[RedshiftTable]]] = {} self.db_views: Dict[str, Dict[str, List[RedshiftView]]] = {} self.db_schemas: Dict[str, Dict[str, RedshiftSchema]] = {} @@ -424,6 +363,11 @@ def get_workunits_internal(self) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit database = self.config.database logger.info(f"Processing db {database}") + + self.db = self.data_dictionary.get_database_details(connection, database) + self.report.is_shared_database = ( + self.db is not None and self.db.is_shared_database + ) with self.report.new_stage(METADATA_EXTRACTION): self.db_tables[database] = defaultdict() self.db_views[database] = defaultdict() @@ -563,7 +507,9 @@ def process_schema( schema_columns: Dict[str, Dict[str, List[RedshiftColumn]]] = {} schema_columns[schema.name] = self.data_dictionary.get_columns_for_schema( - conn=connection, schema=schema + conn=connection, + schema=schema, + is_shared_database=self.report.is_shared_database, ) if self.config.include_tables: @@ -887,6 +833,7 @@ def cache_tables_and_views(self, connection, database): tables, views = 
self.data_dictionary.get_tables_and_views( conn=connection, skip_external_tables=self.config.skip_external_tables, + is_shared_database=self.report.is_shared_database, ) for schema in tables: if not is_schema_allowed( @@ -1029,6 +976,28 @@ def extract_lineage_v2( database: str, lineage_extractor: RedshiftSqlLineageV2, ) -> Iterable[MetadataWorkUnit]: + if self.config.include_share_lineage: + outbound_shares = self.data_dictionary.get_outbound_datashares(connection) + yield from auto_workunit( + self.datashares_helper.to_platform_resource(list(outbound_shares)) + ) + + if self.db and self.db.is_shared_database: + inbound_share = self.db.get_inbound_share() + if inbound_share is None: + self.report.warning( + title="Upstream lineage of inbound datashare will be missing", + message="Database options do not contain sufficient information", + context=f"Database: {database}, Options {self.db.options}", + ) + else: + for known_lineage in self.datashares_helper.generate_lineage( + inbound_share, self.get_all_tables()[database] + ): + lineage_extractor.aggregator.add(known_lineage) + + # TODO: distinguish between definition level lineage and audit log based lineage + # definition level lineage should never be skipped if not self._should_ingest_lineage(): return diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift_schema.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift_schema.py index 594f88dd521ad5..73456d445d0847 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift_schema.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift_schema.py @@ -1,7 +1,8 @@ import logging +import re from dataclasses import dataclass, field from datetime import datetime, timezone -from typing import Dict, Iterable, List, Optional, Tuple +from typing import Dict, Iterable, List, Optional, Tuple, Union import redshift_connector @@ -41,6 +42,10 @@ class RedshiftTable(BaseTable): serde_parameters: 
Optional[str] = None last_altered: Optional[datetime] = None + @property + def is_external_table(self) -> bool: + return self.type == "EXTERNAL_TABLE" + @dataclass class RedshiftView(BaseTable): @@ -51,6 +56,10 @@ class RedshiftView(BaseTable): size_in_bytes: Optional[int] = None rows_count: Optional[int] = None + @property + def is_external_table(self) -> bool: + return self.type == "EXTERNAL_TABLE" + @dataclass class RedshiftSchema: @@ -59,8 +68,102 @@ class RedshiftSchema: type: str owner: Optional[str] = None option: Optional[str] = None + external_platform: Optional[str] = None external_database: Optional[str] = None + @property + def is_external_schema(self) -> bool: + return self.type == "external" + + +@dataclass +class PartialInboundDatashare: + share_name: str + producer_namespace_prefix: str + consumer_database: str + + def get_description(self) -> str: + return ( + f"Namespace Prefix {self.producer_namespace_prefix} Share {self.share_name}" + ) + + +@dataclass +class OutboundDatashare: + share_name: str + producer_namespace: str + source_database: str + + def get_key(self) -> str: + return f"{self.producer_namespace}.{self.share_name}" + + +@dataclass +class InboundDatashare: + share_name: str + producer_namespace: str + consumer_database: str + + def get_key(self) -> str: + return f"{self.producer_namespace}.{self.share_name}" + + def get_description(self) -> str: + return f"Namespace {self.producer_namespace} Share {self.share_name}" + + +@dataclass +class RedshiftDatabase: + name: str + type: str + options: Optional[str] = None + + @property + def is_shared_database(self) -> bool: + return self.type == "shared" + + # NOTE: ideally options are in form + # {"datashare_name":"xxx","datashare_producer_account":"1234","datashare_producer_namespace":"yyy"} + # however due to varchar(128) type of database table that captures options + # we may receive only partial information about inbound share + def get_inbound_share( + self, + ) -> 
Optional[Union[InboundDatashare, PartialInboundDatashare]]: + if not self.is_shared_database or not self.options: + return None + + # Convert into single regex ?? + share_name_match = re.search(r'"datashare_name"\s*:\s*"([^"]*)"', self.options) + namespace_match = re.search( + r'"datashare_producer_namespace"\s*:\s*"([^"]*)"', self.options + ) + partial_namespace_match = re.search( + r'"datashare_producer_namespace"\s*:\s*"([^"]*)$', self.options + ) + + if not share_name_match: + # We will always at least get share name + return None + + share_name = share_name_match.group(1) + if namespace_match: + return InboundDatashare( + share_name=share_name, + producer_namespace=namespace_match.group(1), + consumer_database=self.name, + ) + elif partial_namespace_match: + return PartialInboundDatashare( + share_name=share_name, + producer_namespace_prefix=partial_namespace_match.group(1), + consumer_database=self.name, + ) + else: + return PartialInboundDatashare( + share_name=share_name, + producer_namespace_prefix="", + consumer_database=self.name, + ) + @dataclass class RedshiftExtraTableMeta: @@ -141,13 +244,31 @@ def get_databases(conn: redshift_connector.Connection) -> List[str]: return [db[0] for db in dbs] + @staticmethod + def get_database_details( + conn: redshift_connector.Connection, database: str + ) -> Optional[RedshiftDatabase]: + cursor = RedshiftDataDictionary.get_query_result( + conn, + RedshiftCommonQuery.get_database_details(database), + ) + + row = cursor.fetchone() + if row is None: + return None + return RedshiftDatabase( + name=database, + type=row[1], + options=row[2], + ) + @staticmethod def get_schemas( conn: redshift_connector.Connection, database: str ) -> List[RedshiftSchema]: cursor = RedshiftDataDictionary.get_query_result( conn, - RedshiftCommonQuery.list_schemas.format(database_name=database), + RedshiftCommonQuery.list_schemas(database), ) schemas = cursor.fetchall() @@ -158,8 +279,8 @@ def get_schemas( database=database, 
name=schema[field_names.index("schema_name")], type=schema[field_names.index("schema_type")], - owner=schema[field_names.index("schema_owner_name")], option=schema[field_names.index("schema_option")], + external_platform=schema[field_names.index("external_platform")], external_database=schema[field_names.index("external_database")], ) for schema in schemas @@ -203,6 +324,7 @@ def get_tables_and_views( self, conn: redshift_connector.Connection, skip_external_tables: bool = False, + is_shared_database: bool = False, ) -> Tuple[Dict[str, List[RedshiftTable]], Dict[str, List[RedshiftView]]]: tables: Dict[str, List[RedshiftTable]] = {} views: Dict[str, List[RedshiftView]] = {} @@ -213,7 +335,10 @@ def get_tables_and_views( cur = RedshiftDataDictionary.get_query_result( conn, - RedshiftCommonQuery.list_tables(skip_external_tables=skip_external_tables), + RedshiftCommonQuery.list_tables( + skip_external_tables=skip_external_tables, + is_shared_database=is_shared_database, + ), ) field_names = [i[0] for i in cur.description] db_tables = cur.fetchall() @@ -358,11 +483,15 @@ def get_schema_fields_for_column( @staticmethod def get_columns_for_schema( - conn: redshift_connector.Connection, schema: RedshiftSchema + conn: redshift_connector.Connection, + schema: RedshiftSchema, + is_shared_database: bool = False, ) -> Dict[str, List[RedshiftColumn]]: cursor = RedshiftDataDictionary.get_query_result( conn, - RedshiftCommonQuery.list_columns.format(schema_name=schema.name), + RedshiftCommonQuery.list_columns( + is_shared_database=is_shared_database + ).format(schema_name=schema.name), ) table_columns: Dict[str, List[RedshiftColumn]] = {} @@ -508,3 +637,34 @@ def get_alter_table_commands( start_time=row[field_names.index("start_time")], ) rows = cursor.fetchmany() + + @staticmethod + def get_outbound_datashares( + conn: redshift_connector.Connection, + ) -> Iterable[OutboundDatashare]: + cursor = conn.cursor() + cursor.execute(RedshiftCommonQuery.list_outbound_datashares()) + for 
item in cursor.fetchall(): + yield OutboundDatashare( + share_name=item[1], + producer_namespace=item[2], + source_database=item[3], + ) + + # NOTE: this is not used right now as it requires superuser privilege + # We can use this in future if the permissions are lowered. + @staticmethod + def get_inbound_datashare( + conn: redshift_connector.Connection, + database: str, + ) -> Optional[InboundDatashare]: + cursor = conn.cursor() + cursor.execute(RedshiftCommonQuery.get_inbound_datashare(database)) + item = cursor.fetchone() + if item: + return InboundDatashare( + share_name=item[1], + producer_namespace=item[2], + consumer_database=item[3], + ) + return None diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/report.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/report.py index 2748f2a588a930..047df69555b737 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/report.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/report.py @@ -60,5 +60,8 @@ class RedshiftReport( sql_aggregator: Optional[SqlAggregatorReport] = None lineage_phases_timer: Dict[str, PerfTimer] = field(default_factory=dict) + is_shared_database: bool = False + outbound_shares_count: Optional[int] = None + def report_dropped(self, key: str) -> None: self.filtered.append(key) diff --git a/metadata-ingestion/tests/unit/redshift/test_redshift_datashares.py b/metadata-ingestion/tests/unit/redshift/test_redshift_datashares.py new file mode 100644 index 00000000000000..3167341f4a0b90 --- /dev/null +++ b/metadata-ingestion/tests/unit/redshift/test_redshift_datashares.py @@ -0,0 +1,486 @@ +import json +from typing import Dict, List, Union +from unittest.mock import patch + +from datahub.api.entities.platformresource.platform_resource import ( + PlatformResource, + PlatformResourceKey, +) +from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph +from datahub.ingestion.source.redshift.config import RedshiftConfig +from 
datahub.ingestion.source.redshift.datashares import ( + InboundDatashare, + OutboundDatashare, + OutboundSharePlatformResource, + RedshiftDatasharesHelper, + RedshiftTable, + RedshiftView, +) +from datahub.ingestion.source.redshift.redshift_schema import PartialInboundDatashare +from datahub.ingestion.source.redshift.report import RedshiftReport +from datahub.metadata.schema_classes import ( + PlatformResourceInfoClass, + SerializedValueClass, + SerializedValueContentTypeClass, +) +from datahub.sql_parsing.sql_parsing_aggregator import KnownLineageMapping + + +def get_redshift_config(): + return RedshiftConfig( + host_port="localhost:5439", + database="XXXXXXX", + username="XXXXXXXXX", + password="XXXX_password", + platform_instance="consumer_instance", + ) + + +def get_datahub_graph(): + """ + Mock DataHubGraph instance for testing purposes. + """ + graph = DataHubGraph(DatahubClientConfig(server="xxxx")) + return graph + + +class TestDatasharesHelper: + def test_generate_lineage_success(self): + """ + Test generate_lineage method when share and graph exist, resources are found, + and upstream namespace and database are successfully identified. 
+ """ + # Setup + config = get_redshift_config() + report = RedshiftReport() + graph = get_datahub_graph() + helper = RedshiftDatasharesHelper(config, report, graph) + + # Mock input data + share = InboundDatashare( + producer_namespace="producer_namespace", + share_name="test_share", + consumer_database="consumer_db", + ) + tables: Dict[str, List[Union[RedshiftTable, RedshiftView]]] = { + "schema1": [ + RedshiftTable(name="table1", comment=None, created=None), + RedshiftTable(name="table2", comment=None, created=None), + ], + "schema2": [RedshiftTable(name="table3", comment=None, created=None)], + } + + # Mock PlatformResource.search_by_key + def mock_search_by_key(*args, **kwargs): + resource = PlatformResource.create( + key=PlatformResourceKey( + platform="redshift", + platform_instance="producer_instance", + resource_type="OUTBOUND_DATASHARE", + primary_key="producer_namespace.some_share", + ), + value=OutboundSharePlatformResource( + namespace="producer_namespace", + platform_instance="producer_instance", + env="PROD", + source_database="producer_db", + share_name="test_share", + ), + ) + + return [resource] + + with patch.object(PlatformResource, "search_by_key") as mocked_method: + mocked_method.side_effect = mock_search_by_key + result = list(helper.generate_lineage(share, tables)) + # Assert + assert len(result) == 3 + expected_mappings = [ + KnownLineageMapping( + upstream_urn="urn:li:dataset:(urn:li:dataPlatform:redshift,producer_instance.producer_db.schema1.table1,PROD)", + downstream_urn="urn:li:dataset:(urn:li:dataPlatform:redshift,consumer_instance.consumer_db.schema1.table1,PROD)", + ), + KnownLineageMapping( + upstream_urn="urn:li:dataset:(urn:li:dataPlatform:redshift,producer_instance.producer_db.schema1.table2,PROD)", + downstream_urn="urn:li:dataset:(urn:li:dataPlatform:redshift,consumer_instance.consumer_db.schema1.table2,PROD)", + ), + KnownLineageMapping( + 
upstream_urn="urn:li:dataset:(urn:li:dataPlatform:redshift,producer_instance.producer_db.schema2.table3,PROD)", + downstream_urn="urn:li:dataset:(urn:li:dataPlatform:redshift,consumer_instance.consumer_db.schema2.table3,PROD)", + ), + ] + assert result == expected_mappings + + def test_generate_lineage_success_partial_inbound_share(self): + """ + Test generate_lineage method when share and graph exist, resources are found, + and upstream namespace and database are successfully identified. + """ + # Setup + config = get_redshift_config() + report = RedshiftReport() + graph = get_datahub_graph() + helper = RedshiftDatasharesHelper(config, report, graph) + + # Mock input data + share = PartialInboundDatashare( + producer_namespace_prefix="producer_na", + share_name="test_share", + consumer_database="consumer_db", + ) + tables: Dict[str, List[Union[RedshiftTable, RedshiftView]]] = { + "schema1": [ + RedshiftTable(name="table1", comment=None, created=None), + RedshiftTable(name="table2", comment=None, created=None), + ], + "schema2": [RedshiftTable(name="table3", comment=None, created=None)], + } + + # Mock PlatformResource.search_by_key + def mock_search_by_filters(*args, **kwargs): + resource = PlatformResource.create( + key=PlatformResourceKey( + platform="redshift", + platform_instance="producer_instance", + resource_type="OUTBOUND_DATASHARE", + primary_key="producer_namespace.some_share", + ), + value=OutboundSharePlatformResource( + namespace="producer_namespace", + platform_instance="producer_instance", + env="PROD", + source_database="producer_db", + share_name="test_share", + ), + ) + + return [resource] + + with patch.object(PlatformResource, "search_by_filters") as mocked_method: + mocked_method.side_effect = mock_search_by_filters + result = list(helper.generate_lineage(share, tables)) + # Assert + assert len(result) == 3 + expected_mappings = [ + KnownLineageMapping( + 
upstream_urn="urn:li:dataset:(urn:li:dataPlatform:redshift,producer_instance.producer_db.schema1.table1,PROD)", + downstream_urn="urn:li:dataset:(urn:li:dataPlatform:redshift,consumer_instance.consumer_db.schema1.table1,PROD)", + ), + KnownLineageMapping( + upstream_urn="urn:li:dataset:(urn:li:dataPlatform:redshift,producer_instance.producer_db.schema1.table2,PROD)", + downstream_urn="urn:li:dataset:(urn:li:dataPlatform:redshift,consumer_instance.consumer_db.schema1.table2,PROD)", + ), + KnownLineageMapping( + upstream_urn="urn:li:dataset:(urn:li:dataPlatform:redshift,producer_instance.producer_db.schema2.table3,PROD)", + downstream_urn="urn:li:dataset:(urn:li:dataPlatform:redshift,consumer_instance.consumer_db.schema2.table3,PROD)", + ), + ] + assert result == expected_mappings + + def test_generate_lineage_missing_graph_reports_warning(self): + """ + Test generate_lineage when share is provided but graph is not available. + + This test verifies that the method correctly handles the case where an InboundDatashare + is provided, but the DataHubGraph is not available. It should set the + self.is_shared_database flag to True and log a warning about missing upstream lineage. + """ + # Setup + config = get_redshift_config() + report = RedshiftReport() + graph = None + helper = RedshiftDatasharesHelper(config, report, graph) + + share = InboundDatashare( + producer_namespace="test_namespace", + share_name="test_share", + consumer_database="test_db", + ) + tables: Dict[str, List[Union[RedshiftTable, RedshiftView]]] = {} + + # Execute + list(helper.generate_lineage(share, tables)) + + # Assert + assert len(report.warnings) == 1 + + assert ( + list(report.warnings)[0].title + == "Upstream lineage of inbound datashare will be missing" + ) + assert "Missing datahub graph" in list(report.warnings)[0].message + + def test_generate_lineage_missing_producer_platform_resource(self): + """ + Test generate_lineage when share is provided, graph exists, but no resources are found. 
+ + This test verifies that the method handles the case where an inbound datashare is provided, + the DataHubGraph is available, but no platform resources are found for the producer namespace. + It should result in a warning being reported and no lineage mappings being generated. + """ + # Setup + config = get_redshift_config() + report = RedshiftReport() + graph = get_datahub_graph() + helper = RedshiftDatasharesHelper(config, report, graph) + + # Create a mock InboundDatashare + share = InboundDatashare( + share_name="test_share", + producer_namespace="test_namespace", + consumer_database="test_db", + ) + + # Create mock tables + tables: Dict[str, List[Union[RedshiftTable, RedshiftView]]] = { + "schema1": [RedshiftTable(name="table1", created=None, comment=None)] + } + + # Mock the PlatformResource.search_by_key to return an empty list + with patch.object(PlatformResource, "search_by_key") as mocked_method: + mocked_method.return_value = [] + result = list(helper.generate_lineage(share, tables)) + + # Assertions + assert len(result) == 0, "No lineage mappings should be generated" + assert len(report.infos) == 1 + assert ( + list(report.infos)[0].title + == "Upstream lineage of inbound datashare will be missing" + ) + assert "Missing platform resource" in list(report.infos)[0].message + + def test_generate_lineage_malformed_share_platform_resource(self): + """ + Test generate_lineage method when share and graph exist, resources are found, + but upstream_share is None due to error in parsing resource info. + + This test verifies that the method handles the case where the upstream namespace is found, + but we failed to parse the value. + It should result in a warning being reported and no lineage mappings being generated. 
+ """ + # Setup + config = get_redshift_config() + report = RedshiftReport() + graph = get_datahub_graph() + helper = RedshiftDatasharesHelper(config, report, graph) + + share = InboundDatashare( + producer_namespace="produer_namespace", + share_name="test_share", + consumer_database="consumer_db", + ) + tables: Dict[str, List[Union[RedshiftTable, RedshiftView]]] = { + "schema1": [RedshiftTable(name="table1", comment=None, created=None)] + } + + # Mock PlatformResource.search_by_key to return a resource + def mock_search_by_key(*args, **kwargs): + resource = PlatformResource.create( + key=PlatformResourceKey( + platform="redshift", + platform_instance="producer_instance", + resource_type="OUTBOUND_DATASHARE", + primary_key="producer_namespace.some_share", + ), + value={ + "namespace": "producer_namespace", + "platform_instance": "producer_instance", + "env": "PROD", + "outbound_share_name_to_source_database": {}, # Empty dict to simulate missing share + }, + ) + + return [resource] + + with patch.object(PlatformResource, "search_by_key") as mocked_method: + mocked_method.side_effect = mock_search_by_key + result = list(helper.generate_lineage(share, tables)) + + # Assert + assert len(result) == 0 + assert len(report.warnings) == 1 + assert ( + list(report.warnings)[0].title + == "Upstream lineage of inbound datashare will be missing" + ) + assert ( + "Failed to parse platform resource for outbound datashare" + in list(report.warnings)[0].message + ) + + def test_generate_lineage_shared_database_with_no_tables(self): + """ + Test generate_lineage with valid share but empty tables dictionary. 
+ """ + config = get_redshift_config() + report = RedshiftReport() + graph = get_datahub_graph() # Mock or actual instance + helper = RedshiftDatasharesHelper(config, report, graph) + + share = InboundDatashare( + producer_namespace="producer_namespace", + consumer_database="db", + share_name="share", + ) + tables: Dict[str, List[Union[RedshiftTable, RedshiftView]]] = {} + + with patch.object(PlatformResource, "search_by_key") as mocked_method: + mocked_method.return_value = [] + result = list(helper.generate_lineage(share, tables)) + + assert len(result) == 0 + + def test_to_platform_resource_success(self): + """ + Test the to_platform_resource method when shares list is not empty. + + This test verifies that the method correctly processes a non-empty list of OutboundDatashare objects, + generates the appropriate PlatformResource, and yields the expected MetadataChangeProposalWrapper objects. + It also checks that the outbound_shares_count in the report is set correctly. + """ + # Setup + config = RedshiftConfig( + host_port="test_host", + database="test_db", + username="test_user", + password="test_pass", + platform_instance="test_instance", + ) + report = RedshiftReport() + helper = RedshiftDatasharesHelper(config, report, None) + + # Create test data + shares = [ + OutboundDatashare( + producer_namespace="test_namespace", + share_name="share1", + source_database="db1", + ), + OutboundDatashare( + producer_namespace="test_namespace", + share_name="share2", + source_database="db2", + ), + ] + + # Execute the method + result = list(helper.to_platform_resource(shares)) + + # Assertions + assert len(result) > 0, ( + "Expected at least one MetadataChangeProposalWrapper to be yielded" + ) + assert report.outbound_shares_count == 2, ( + "Expected outbound_shares_count to be 2" + ) + + # Check the content of the first MetadataChangeProposalWrapper + first_mcp = result[0] + assert first_mcp.entityType == "platformResource", ( + "Expected entityType to be 
platformResource" + ) + assert first_mcp.aspectName == "platformResourceInfo", ( + "Expected aspectName to be platformResourceInfo" + ) + + info = first_mcp.aspect + assert isinstance(info, PlatformResourceInfoClass) + + assert info.resourceType == "OUTBOUND_DATASHARE" + assert info.primaryKey == "test_namespace.share1" + + assert isinstance(info.value, SerializedValueClass) + assert info.value.contentType == SerializedValueContentTypeClass.JSON + assert info.value.blob == json.dumps( + { + "namespace": "test_namespace", + "platform_instance": "test_instance", + "env": "PROD", + "source_database": "db1", + "share_name": "share1", + }, + ).encode("utf-8") + + # Check the content of the first MetadataChangeProposalWrapper + fourth_mcp = result[3] + assert fourth_mcp.entityType == "platformResource", ( + "Expected entityType to be platformResource" + ) + assert fourth_mcp.aspectName == "platformResourceInfo", ( + "Expected aspectName to be platformResourceInfo" + ) + + info = fourth_mcp.aspect + assert isinstance(info, PlatformResourceInfoClass) + + assert info.resourceType == "OUTBOUND_DATASHARE" + assert info.primaryKey == "test_namespace.share2" + + assert isinstance(info.value, SerializedValueClass) + assert info.value.contentType == SerializedValueContentTypeClass.JSON + assert info.value.blob == json.dumps( + { + "namespace": "test_namespace", + "platform_instance": "test_instance", + "env": "PROD", + "source_database": "db2", + "share_name": "share2", + }, + ).encode("utf-8") + + def test_to_platform_resource_edge_case_single_share(self): + """ + Test the to_platform_resource method with a single share. + This edge case should still produce a valid result. 
+ """ + config = get_redshift_config() + report = RedshiftReport() + helper = RedshiftDatasharesHelper(config, report, None) + + share = OutboundDatashare( + producer_namespace="test", share_name="share1", source_database="db1" + ) + + result = list(helper.to_platform_resource([share])) + + assert len(result) > 0 + assert report.outbound_shares_count == 1 + + def test_to_platform_resource_empty_input(self): + """ + Test the to_platform_resource method with an empty list of shares. + This should set the outbound_shares_count to 0 and return an empty iterable. + """ + config = get_redshift_config() + report = RedshiftReport() + helper = RedshiftDatasharesHelper(config, report, None) + + result = list(helper.to_platform_resource([])) + + assert len(result) == 0 + assert report.outbound_shares_count == 0 + + def test_to_platform_resource_exception_handling(self): + """ + Test the exception handling in the to_platform_resource method. + This should catch the exception and add a warning to the report. + """ + config = get_redshift_config() + report = RedshiftReport() + helper = RedshiftDatasharesHelper(config, report, None) + + # Create a share with invalid data to trigger an exception + invalid_share = OutboundDatashare( + producer_namespace=None, # type:ignore + share_name="x", + source_database="y", + ) + + list(helper.to_platform_resource([invalid_share])) + + assert len(report.warnings) == 1 + assert ( + list(report.warnings)[0].title + == "Downstream lineage to outbound datashare may not work" + ) From a700448bad373bf4447b7d1d3f683e15e6a0f249 Mon Sep 17 00:00:00 2001 From: Jonny Dixon <45681293+acrylJonny@users.noreply.github.com> Date: Thu, 6 Mar 2025 06:30:10 -0800 Subject: [PATCH 09/10] feat(ingestion/business-glossary): Automatically generate predictable glossary term and node URNs when incompatible URL characters are specified in term and node names. 
(#12673) --- docs/how/updating-datahub.md | 2 + .../datahub-business-glossary.md | 254 +++++----- .../source/metadata/business_glossary.py | 48 +- .../custom_ownership_urns_golden.json | 28 +- .../glossary_events_golden.json | 127 ++--- ...ultiple_owners_different_types_golden.json | 18 +- .../multiple_owners_same_type_golden.json | 18 +- .../single_owner_types_golden.json | 48 +- .../test_business_glossary.py | 32 +- .../url_cleaning_events_golden.json | 446 ++++++++++++++++++ .../url_cleaning_glossary.yml | 31 ++ .../remote/golden/remote_glossary_golden.json | 69 +-- .../tests/unit/test_business_glossary.py | 85 ++++ 13 files changed, 938 insertions(+), 268 deletions(-) create mode 100644 metadata-ingestion/tests/integration/business-glossary/url_cleaning_events_golden.json create mode 100644 metadata-ingestion/tests/integration/business-glossary/url_cleaning_glossary.yml create mode 100644 metadata-ingestion/tests/unit/test_business_glossary.py diff --git a/docs/how/updating-datahub.md b/docs/how/updating-datahub.md index adb86c1bce1b37..47c6343858a0fc 100644 --- a/docs/how/updating-datahub.md +++ b/docs/how/updating-datahub.md @@ -20,6 +20,8 @@ This file documents any backwards-incompatible changes in DataHub and assists pe ### Breaking Changes +- #12673: Business Glossary ID generation has been modified to handle special characters and URL cleaning. When `enable_auto_id` is false (default), IDs are now generated by cleaning the name (converting spaces to hyphens, removing special characters except periods which are used as path separators) while preserving case. This may result in different IDs being generated for terms with special characters. + - #12580: The OpenAPI source handled nesting incorrectly. 12580 fixes it to create proper nested field paths, however, this will re-write the incorrect schemas of existing OpenAPI runs. - #12408: The `platform` field in the DataPlatformInstance GraphQL type is removed. 
Clients need to retrieve the platform via the optional `dataPlatformInstance` field. diff --git a/metadata-ingestion/docs/sources/business-glossary/datahub-business-glossary.md b/metadata-ingestion/docs/sources/business-glossary/datahub-business-glossary.md index 3433a853ea9b05..211115fd39163f 100644 --- a/metadata-ingestion/docs/sources/business-glossary/datahub-business-glossary.md +++ b/metadata-ingestion/docs/sources/business-glossary/datahub-business-glossary.md @@ -24,7 +24,8 @@ nodes: # list of child **Glossa Example **GlossaryNode**: ```yaml -- name: Shipping # name of the node +- name: "Shipping" # name of the node + id: "Shipping-Logistics" # (optional) custom identifier for the node description: Provides terms related to the shipping domain # description of the node owners: # (optional) owners contains 2 nested fields users: # (optional) a list of user IDs @@ -43,7 +44,8 @@ Example **GlossaryNode**: Example **GlossaryTerm**: ```yaml -- name: FullAddress # name of the term +- name: "Full Address" # name of the term + id: "Full-Address-Details" # (optional) custom identifier for the term description: A collection of information to give the location of a building or plot of land. # description of the term owners: # (optional) owners contains 2 nested fields users: # (optional) a list of user IDs @@ -67,10 +69,86 @@ Example **GlossaryTerm**: domain: "urn:li:domain:Logistics" # (optional) domain name or domain urn ``` -To see how these all work together, check out this comprehensive example business glossary file below: +## ID Management and URL Generation + +The business glossary provides two primary ways to manage term and node identifiers: + +1. **Custom IDs**: You can explicitly specify an ID for any term or node using the `id` field. 
This is recommended for terms that need stable, predictable identifiers: + ```yaml + terms: + - name: "Response Time" + id: "support-response-time" # Explicit ID + description: "Target time to respond to customer inquiries" + ``` + +2. **Automatic ID Generation**: When no ID is specified, the system will generate one based on the `enable_auto_id` setting: + - With `enable_auto_id: false` (default): + - Node and term names are converted to URL-friendly format + - Spaces within names are replaced with hyphens + - Special characters are removed (except hyphens) + - Case is preserved + - Multiple hyphens are collapsed to single ones + - Path components (node/term hierarchy) are joined with periods + - Example: Node "Customer Support" with term "Response Time" → "Customer-Support.Response-Time" + + - With `enable_auto_id: true`: + - Generates GUID-based IDs + - Recommended for guaranteed uniqueness + - Required for terms with non-ASCII characters + +Here's how path-based ID generation works: +```yaml +nodes: + - name: "Customer Support" # Node ID: Customer-Support + terms: + - name: "Response Time" # Term ID: Customer-Support.Response-Time + description: "Response SLA" + + - name: "First Reply" # Term ID: Customer-Support.First-Reply + description: "Initial response" + + - name: "Product Feedback" # Node ID: Product-Feedback + terms: + - name: "Response Time" # Term ID: Product-Feedback.Response-Time + description: "Feedback response" +``` + +**Important Notes**: +- Periods (.) 
are used exclusively as path separators between nodes and terms +- Periods in term or node names themselves will be removed +- Each component of the path (node names, term names) is cleaned independently: + - Spaces to hyphens + - Special characters removed + - Case preserved +- The cleaned components are then joined with periods to form the full path +- Non-ASCII characters in any component trigger automatic GUID generation +- Once an ID is created (either manually or automatically), it cannot be easily changed +- All references to a term (in `inherits`, `contains`, etc.) must use its correct ID +- Moving terms in the hierarchy does NOT update their IDs: + - The ID retains its original path components even after moving + - This can lead to IDs that don't match the current location + - Consider using `enable_auto_id: true` if you plan to reorganize your glossary +- For terms that other terms will reference, consider using explicit IDs or enable auto_id + +Example of how different names are handled: +```yaml +nodes: + - name: "Data Services" # Node ID: Data-Services + terms: + # Basic term name + - name: "Response Time" # Term ID: Data-Services.Response-Time + description: "SLA metrics" + + # Term name with special characters + - name: "API @ Response" # Term ID: Data-Services.API-Response + description: "API metrics" + + # Term with non-ASCII (triggers GUID) + - name: "パフォーマンス" # Term ID will be a 32-character GUID + description: "Performance" +``` -
-Example business glossary file +To see how these all work together, check out this comprehensive example business glossary file below: ```yaml version: "1" @@ -80,172 +158,108 @@ owners: - mjames url: "https://github.com/datahub-project/datahub/" nodes: - - name: Classification + - name: "Data Classification" + id: "Data-Classification" # Custom ID for stable references description: A set of terms related to Data Classification knowledge_links: - label: Wiki link for classification url: "https://en.wikipedia.org/wiki/Classification" terms: - - name: Sensitive + - name: "Sensitive Data" # Will generate: Data-Classification.Sensitive-Data description: Sensitive Data custom_properties: is_confidential: "false" - - name: Confidential + - name: "Confidential Information" # Will generate: Data-Classification.Confidential-Information description: Confidential Data custom_properties: is_confidential: "true" - - name: HighlyConfidential + - name: "Highly Confidential" # Will generate: Data-Classification.Highly-Confidential description: Highly Confidential Data custom_properties: is_confidential: "true" domain: Marketing - - name: PersonalInformation + + - name: "Personal Information" description: All terms related to personal information owners: users: - mjames terms: - - name: Email - ## An example of using an id to pin a term to a specific guid - ## See "how to generate custom IDs for your terms" section below - # id: "urn:li:glossaryTerm:41516e310acbfd9076fffc2c98d2d1a3" + - name: "Email" # Will generate: Personal-Information.Email description: An individual's email address inherits: - - Classification.Confidential + - Data-Classification.Confidential # References parent node path owners: groups: - Trust and Safety - - name: Address + - name: "Address" # Will generate: Personal-Information.Address description: A physical address - - name: Gender + - name: "Gender" # Will generate: Personal-Information.Gender description: The gender identity of the individual inherits: 
- - Classification.Sensitive - - name: Shipping - description: Provides terms related to the shipping domain - owners: - users: - - njones - groups: - - logistics - terms: - - name: FullAddress - description: A collection of information to give the location of a building or plot of land. - owners: - users: - - njones - groups: - - logistics - term_source: "EXTERNAL" - source_ref: FIBO - source_url: "https://www.google.com" - inherits: - - Privacy.PII - contains: - - Shipping.ZipCode - - Shipping.CountryCode - - Shipping.StreetAddress - related_terms: - - Housing.Kitchen.Cutlery - custom_properties: - - is_used_for_compliance_tracking: "true" - knowledge_links: - - url: "https://en.wikipedia.org/wiki/Address" - label: Wiki link - domain: "urn:li:domain:Logistics" - knowledge_links: - - label: Wiki link for shipping - url: "https://en.wikipedia.org/wiki/Freight_transport" - - name: ClientsAndAccounts + - Data-Classification.Sensitive # References parent node path + + - name: "Clients And Accounts" description: Provides basic concepts such as account, account holder, account provider, relationship manager that are commonly used by financial services providers to describe customers and to determine counterparty identities owners: groups: - finance + type: DATAOWNER terms: - - name: Account + - name: "Account" # Will generate: Clients-And-Accounts.Account description: Container for records associated with a business arrangement for regular transactions and services term_source: "EXTERNAL" source_ref: FIBO source_url: "https://spec.edmcouncil.org/fibo/ontology/FBC/ProductsAndServices/ClientsAndAccounts/Account" inherits: - - Classification.HighlyConfidential + - Data-Classification.Highly-Confidential # References parent node path contains: - - ClientsAndAccounts.Balance - - name: Balance + - Clients-And-Accounts.Balance # References term in same node + - name: "Balance" # Will generate: Clients-And-Accounts.Balance description: Amount of money available or owed 
term_source: "EXTERNAL" source_ref: FIBO source_url: "https://spec.edmcouncil.org/fibo/ontology/FBC/ProductsAndServices/ClientsAndAccounts/Balance" - - name: Housing - description: Provides terms related to the housing domain - owners: - users: - - mjames - groups: - - interior - nodes: - - name: Colors - description: "Colors that are used in Housing construction" - terms: - - name: Red - description: "red color" - term_source: "EXTERNAL" - source_ref: FIBO - source_url: "https://spec.edmcouncil.org/fibo/ontology/FBC/ProductsAndServices/ClientsAndAccounts/Account" - - - name: Green - description: "green color" - term_source: "EXTERNAL" - source_ref: FIBO - source_url: "https://spec.edmcouncil.org/fibo/ontology/FBC/ProductsAndServices/ClientsAndAccounts/Account" - - - name: Pink - description: pink color - term_source: "EXTERNAL" - source_ref: FIBO - source_url: "https://spec.edmcouncil.org/fibo/ontology/FBC/ProductsAndServices/ClientsAndAccounts/Account" + + - name: "KPIs" + description: Common Business KPIs terms: - - name: WindowColor - description: Supported window colors - term_source: "EXTERNAL" - source_ref: FIBO - source_url: "https://spec.edmcouncil.org/fibo/ontology/FBC/ProductsAndServices/ClientsAndAccounts/Account" - values: - - Housing.Colors.Red - - Housing.Colors.Pink + - name: "CSAT %" # Will generate: KPIs.CSAT + description: Customer Satisfaction Score +``` - - name: Kitchen - description: a room or area where food is prepared and cooked. - term_source: "EXTERNAL" - source_ref: FIBO - source_url: "https://spec.edmcouncil.org/fibo/ontology/FBC/ProductsAndServices/ClientsAndAccounts/Account" +## Custom ID Specification - - name: Spoon - description: an implement consisting of a small, shallow oval or round bowl on a long handle, used for eating, stirring, and serving food. 
- term_source: "EXTERNAL" - source_ref: FIBO - source_url: "https://spec.edmcouncil.org/fibo/ontology/FBC/ProductsAndServices/ClientsAndAccounts/Account" - related_terms: - - Housing.Kitchen - knowledge_links: - - url: "https://en.wikipedia.org/wiki/Spoon" - label: Wiki link -``` -
+Custom IDs can be specified in two ways, both of which are fully supported and acceptable: -Source file linked [here](https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/examples/bootstrap_data/business_glossary.yml). +1. Just the ID portion (simpler approach): +```yaml +terms: + - name: "Email" + id: "company-email" # Will become urn:li:glossaryTerm:company-email + description: "Company email address" +``` -## Generating custom IDs for your terms +2. Full URN format: +```yaml +terms: + - name: "Email" + id: "urn:li:glossaryTerm:company-email" + description: "Company email address" +``` -IDs are normally inferred from the glossary term/node's name, see the `enable_auto_id` config. But, if you need a stable -identifier, you can generate a custom ID for your term. It should be unique across the entire Glossary. +Both methods are valid and will work correctly. The system will automatically handle the URN prefix if you specify just the ID portion. -Here's an example ID: -`id: "urn:li:glossaryTerm:41516e310acbfd9076fffc2c98d2d1a3"` +The same applies for nodes: +```yaml +nodes: + - name: "Communications" + id: "internal-comms" # Will become urn:li:glossaryNode:internal-comms + description: "Internal communication methods" +``` -A note of caution: once you select a custom ID, it cannot be easily changed. +Note: Once you select a custom ID, it cannot be easily changed. ## Compatibility -Compatible with version 1 of business glossary format. -The source will be evolved as we publish newer versions of this format. +Compatible with version 1 of business glossary format. The source will be evolved as newer versions of this format are published. 
\ No newline at end of file diff --git a/metadata-ingestion/src/datahub/ingestion/source/metadata/business_glossary.py b/metadata-ingestion/src/datahub/ingestion/source/metadata/business_glossary.py index 26a0331e1e5767..2cf68ea4c7b65a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/metadata/business_glossary.py +++ b/metadata-ingestion/src/datahub/ingestion/source/metadata/business_glossary.py @@ -1,5 +1,6 @@ import logging import pathlib +import re import time from dataclasses import dataclass, field from typing import Any, Dict, Iterable, List, Optional, TypeVar, Union @@ -118,17 +119,58 @@ def version_must_be_1(cls, v): return v +def clean_url(text: str) -> str: + """ + Clean text for use in URLs by: + 1. Replacing spaces with hyphens + 2. Removing special characters (preserving hyphens and periods) + 3. Collapsing multiple hyphens and periods into single ones + """ + # Replace spaces with hyphens + text = text.replace(" ", "-") + # Remove special characters except hyphens and periods + text = re.sub(r"[^a-zA-Z0-9\-.]", "", text) + # Collapse multiple hyphens into one + text = re.sub(r"-+", "-", text) + # Collapse multiple periods into one + text = re.sub(r"\.+", ".", text) + # Remove leading/trailing hyphens and periods + text = text.strip("-.") + return text + + def create_id(path: List[str], default_id: Optional[str], enable_auto_id: bool) -> str: + """ + Create an ID for a glossary node or term. 
+ + Args: + path: List of path components leading to this node/term + default_id: Optional manually specified ID + enable_auto_id: Whether to generate GUIDs + """ if default_id is not None: - return default_id # No need to create id from path as default_id is provided + return default_id # Use explicitly provided ID id_: str = ".".join(path) - if UrnEncoder.contains_extended_reserved_char(id_): - enable_auto_id = True + # Check for non-ASCII characters before cleaning + if any(ord(c) > 127 for c in id_): + return datahub_guid({"path": id_}) if enable_auto_id: + # Generate GUID for auto_id mode id_ = datahub_guid({"path": id_}) + else: + # Clean the URL for better readability when not using auto_id + id_ = clean_url(id_) + + # Force auto_id if the cleaned URL still contains problematic characters + if UrnEncoder.contains_extended_reserved_char(id_): + logger.warning( + f"ID '{id_}' contains problematic characters after URL cleaning. Falling back to GUID generation for stability." + ) + id_ = datahub_guid({"path": id_}) + return id_ diff --git a/metadata-ingestion/tests/integration/business-glossary/custom_ownership_urns_golden.json b/metadata-ingestion/tests/integration/business-glossary/custom_ownership_urns_golden.json index 2fc3de77efd8eb..6db8dfd5dcf340 100644 --- a/metadata-ingestion/tests/integration/business-glossary/custom_ownership_urns_golden.json +++ b/metadata-ingestion/tests/integration/business-glossary/custom_ownership_urns_golden.json @@ -2,7 +2,7 @@ { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.GlossaryNodeSnapshot": { - "urn": "urn:li:glossaryNode:Custom URN Types", + "urn": "urn:li:glossaryNode:Custom-URN-Types", "aspects": [ { "com.linkedin.pegasus2avro.glossary.GlossaryNodeInfo": { @@ -42,21 +42,21 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "datahub-business-glossary-2020_04_14-07_00_00-dlsmlo", + "runId": "datahub-business-glossary-2020_04_14-07_00_00-ugsgt3", "lastRunId": "no-run-id-provided" } 
}, { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.GlossaryTermSnapshot": { - "urn": "urn:li:glossaryTerm:Custom URN Types.Mixed URN Types", + "urn": "urn:li:glossaryTerm:Custom-URN-Types.Mixed-URN-Types", "aspects": [ { "com.linkedin.pegasus2avro.glossary.GlossaryTermInfo": { "customProperties": {}, "name": "Mixed URN Types", "definition": "Term with custom URN types", - "parentNode": "urn:li:glossaryNode:Custom URN Types", + "parentNode": "urn:li:glossaryNode:Custom-URN-Types", "termSource": "INTERNAL", "sourceRef": "DataHub", "sourceUrl": "https://github.com/datahub-project/datahub/" @@ -88,21 +88,21 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "datahub-business-glossary-2020_04_14-07_00_00-dlsmlo", + "runId": "datahub-business-glossary-2020_04_14-07_00_00-ugsgt3", "lastRunId": "no-run-id-provided" } }, { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.GlossaryTermSnapshot": { - "urn": "urn:li:glossaryTerm:Custom URN Types.Mixed Standard and URN", + "urn": "urn:li:glossaryTerm:Custom-URN-Types.Mixed-Standard-and-URN", "aspects": [ { "com.linkedin.pegasus2avro.glossary.GlossaryTermInfo": { "customProperties": {}, "name": "Mixed Standard and URN", "definition": "Term with both standard and URN types", - "parentNode": "urn:li:glossaryNode:Custom URN Types", + "parentNode": "urn:li:glossaryNode:Custom-URN-Types", "termSource": "INTERNAL", "sourceRef": "DataHub", "sourceUrl": "https://github.com/datahub-project/datahub/" @@ -133,13 +133,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "datahub-business-glossary-2020_04_14-07_00_00-dlsmlo", + "runId": "datahub-business-glossary-2020_04_14-07_00_00-ugsgt3", "lastRunId": "no-run-id-provided" } }, { "entityType": "glossaryNode", - "entityUrn": "urn:li:glossaryNode:Custom URN Types", + "entityUrn": "urn:li:glossaryNode:Custom-URN-Types", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -149,13 +149,13 @@ }, "systemMetadata": 
{ "lastObserved": 1586847600000, - "runId": "datahub-business-glossary-2020_04_14-07_00_00-dlsmlo", + "runId": "datahub-business-glossary-2020_04_14-07_00_00-ugsgt3", "lastRunId": "no-run-id-provided" } }, { "entityType": "glossaryTerm", - "entityUrn": "urn:li:glossaryTerm:Custom URN Types.Mixed Standard and URN", + "entityUrn": "urn:li:glossaryTerm:Custom-URN-Types.Mixed-Standard-and-URN", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -165,13 +165,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "datahub-business-glossary-2020_04_14-07_00_00-dlsmlo", + "runId": "datahub-business-glossary-2020_04_14-07_00_00-ugsgt3", "lastRunId": "no-run-id-provided" } }, { "entityType": "glossaryTerm", - "entityUrn": "urn:li:glossaryTerm:Custom URN Types.Mixed URN Types", + "entityUrn": "urn:li:glossaryTerm:Custom-URN-Types.Mixed-URN-Types", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -181,7 +181,7 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "datahub-business-glossary-2020_04_14-07_00_00-dlsmlo", + "runId": "datahub-business-glossary-2020_04_14-07_00_00-ugsgt3", "lastRunId": "no-run-id-provided" } } diff --git a/metadata-ingestion/tests/integration/business-glossary/glossary_events_golden.json b/metadata-ingestion/tests/integration/business-glossary/glossary_events_golden.json index f3ff1114853c35..65feadde5d5f4f 100644 --- a/metadata-ingestion/tests/integration/business-glossary/glossary_events_golden.json +++ b/metadata-ingestion/tests/integration/business-glossary/glossary_events_golden.json @@ -21,6 +21,7 @@ "type": "DEVELOPER" } ], + "ownerTypes": {}, "lastModified": { "time": 0, "actor": "urn:li:corpuser:unknown" @@ -32,7 +33,7 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "datahub-business-glossary-2020_04_14-07_00_00", + "runId": "datahub-business-glossary-2020_04_14-07_00_00-h7iopd", "lastRunId": "no-run-id-provided" } }, @@ -58,7 +59,7 @@ }, "systemMetadata": { 
"lastObserved": 1586847600000, - "runId": "datahub-business-glossary-2020_04_14-07_00_00", + "runId": "datahub-business-glossary-2020_04_14-07_00_00-h7iopd", "lastRunId": "no-run-id-provided" } }, @@ -88,6 +89,7 @@ "type": "DEVELOPER" } ], + "ownerTypes": {}, "lastModified": { "time": 0, "actor": "urn:li:corpuser:unknown" @@ -99,7 +101,7 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "datahub-business-glossary-2020_04_14-07_00_00", + "runId": "datahub-business-glossary-2020_04_14-07_00_00-h7iopd", "lastRunId": "no-run-id-provided" } }, @@ -125,7 +127,7 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "datahub-business-glossary-2020_04_14-07_00_00", + "runId": "datahub-business-glossary-2020_04_14-07_00_00-h7iopd", "lastRunId": "no-run-id-provided" } }, @@ -155,6 +157,7 @@ "type": "DEVELOPER" } ], + "ownerTypes": {}, "lastModified": { "time": 0, "actor": "urn:li:corpuser:unknown" @@ -166,13 +169,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "datahub-business-glossary-2020_04_14-07_00_00", + "runId": "datahub-business-glossary-2020_04_14-07_00_00-h7iopd", "lastRunId": "no-run-id-provided" } }, { "entityType": "glossaryTerm", - "entityUrn": "urn:li:glossaryTerm:Classification.Highly Confidential", + "entityUrn": "urn:li:glossaryTerm:Classification.Highly-Confidential", "changeType": "UPSERT", "aspectName": "domains", "aspect": { @@ -184,14 +187,14 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "datahub-business-glossary-2020_04_14-07_00_00", + "runId": "datahub-business-glossary-2020_04_14-07_00_00-h7iopd", "lastRunId": "no-run-id-provided" } }, { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.GlossaryTermSnapshot": { - "urn": "urn:li:glossaryTerm:Classification.Highly Confidential", + "urn": "urn:li:glossaryTerm:Classification.Highly-Confidential", "aspects": [ { "com.linkedin.pegasus2avro.glossary.GlossaryTermInfo": { @@ -214,6 +217,7 @@ "type": "DEVELOPER" } ], 
+ "ownerTypes": {}, "lastModified": { "time": 0, "actor": "urn:li:corpuser:unknown" @@ -225,14 +229,14 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "datahub-business-glossary-2020_04_14-07_00_00", + "runId": "datahub-business-glossary-2020_04_14-07_00_00-h7iopd", "lastRunId": "no-run-id-provided" } }, { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.GlossaryNodeSnapshot": { - "urn": "urn:li:glossaryNode:Personal Information", + "urn": "urn:li:glossaryNode:Personal-Information", "aspects": [ { "com.linkedin.pegasus2avro.glossary.GlossaryNodeInfo": { @@ -249,6 +253,7 @@ "type": "DATAOWNER" } ], + "ownerTypes": {}, "lastModified": { "time": 0, "actor": "urn:li:corpuser:unknown" @@ -260,21 +265,21 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "datahub-business-glossary-2020_04_14-07_00_00", + "runId": "datahub-business-glossary-2020_04_14-07_00_00-h7iopd", "lastRunId": "no-run-id-provided" } }, { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.GlossaryTermSnapshot": { - "urn": "urn:li:glossaryTerm:Personal Information.Email", + "urn": "urn:li:glossaryTerm:Personal-Information.Email", "aspects": [ { "com.linkedin.pegasus2avro.glossary.GlossaryTermInfo": { "customProperties": {}, "name": "Email", "definition": "An individual's email address", - "parentNode": "urn:li:glossaryNode:Personal Information", + "parentNode": "urn:li:glossaryNode:Personal-Information", "termSource": "INTERNAL", "sourceRef": "DataHub", "sourceUrl": "https://github.com/datahub-project/datahub/" @@ -295,6 +300,7 @@ "type": "DEVELOPER" } ], + "ownerTypes": {}, "lastModified": { "time": 0, "actor": "urn:li:corpuser:unknown" @@ -306,21 +312,21 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "datahub-business-glossary-2020_04_14-07_00_00", + "runId": "datahub-business-glossary-2020_04_14-07_00_00-h7iopd", "lastRunId": "no-run-id-provided" } }, { "proposedSnapshot": { 
"com.linkedin.pegasus2avro.metadata.snapshot.GlossaryTermSnapshot": { - "urn": "urn:li:glossaryTerm:Personal Information.Address", + "urn": "urn:li:glossaryTerm:Personal-Information.Address", "aspects": [ { "com.linkedin.pegasus2avro.glossary.GlossaryTermInfo": { "customProperties": {}, "name": "Address", "definition": "A physical address", - "parentNode": "urn:li:glossaryNode:Personal Information", + "parentNode": "urn:li:glossaryNode:Personal-Information", "termSource": "INTERNAL", "sourceRef": "DataHub", "sourceUrl": "https://github.com/datahub-project/datahub/" @@ -334,6 +340,7 @@ "type": "DATAOWNER" } ], + "ownerTypes": {}, "lastModified": { "time": 0, "actor": "urn:li:corpuser:unknown" @@ -345,21 +352,21 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "datahub-business-glossary-2020_04_14-07_00_00", + "runId": "datahub-business-glossary-2020_04_14-07_00_00-h7iopd", "lastRunId": "no-run-id-provided" } }, { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.GlossaryTermSnapshot": { - "urn": "urn:li:glossaryTerm:Personal Information.Gender", + "urn": "urn:li:glossaryTerm:Personal-Information.Gender", "aspects": [ { "com.linkedin.pegasus2avro.glossary.GlossaryTermInfo": { "customProperties": {}, "name": "Gender", "definition": "The gender identity of the individual", - "parentNode": "urn:li:glossaryNode:Personal Information", + "parentNode": "urn:li:glossaryNode:Personal-Information", "termSource": "INTERNAL", "sourceRef": "DataHub", "sourceUrl": "https://github.com/datahub-project/datahub/" @@ -380,6 +387,7 @@ "type": "DATAOWNER" } ], + "ownerTypes": {}, "lastModified": { "time": 0, "actor": "urn:li:corpuser:unknown" @@ -391,14 +399,14 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "datahub-business-glossary-2020_04_14-07_00_00", + "runId": "datahub-business-glossary-2020_04_14-07_00_00-h7iopd", "lastRunId": "no-run-id-provided" } }, { "proposedSnapshot": { 
"com.linkedin.pegasus2avro.metadata.snapshot.GlossaryNodeSnapshot": { - "urn": "urn:li:glossaryNode:Clients And Accounts", + "urn": "urn:li:glossaryNode:Clients-And-Accounts", "aspects": [ { "com.linkedin.pegasus2avro.glossary.GlossaryNodeInfo": { @@ -416,6 +424,7 @@ "typeUrn": "urn:li:ownershipType:my_cutom_type" } ], + "ownerTypes": {}, "lastModified": { "time": 0, "actor": "urn:li:corpuser:unknown" @@ -427,21 +436,21 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "datahub-business-glossary-2020_04_14-07_00_00", + "runId": "datahub-business-glossary-2020_04_14-07_00_00-h7iopd", "lastRunId": "no-run-id-provided" } }, { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.GlossaryTermSnapshot": { - "urn": "urn:li:glossaryTerm:Clients And Accounts.Account", + "urn": "urn:li:glossaryTerm:Clients-And-Accounts.Account", "aspects": [ { "com.linkedin.pegasus2avro.glossary.GlossaryTermInfo": { "customProperties": {}, "name": "Account", "definition": "Container for records associated with a business arrangement for regular transactions and services", - "parentNode": "urn:li:glossaryNode:Clients And Accounts", + "parentNode": "urn:li:glossaryNode:Clients-And-Accounts", "termSource": "EXTERNAL", "sourceRef": "FIBO", "sourceUrl": "https://spec.edmcouncil.org/fibo/ontology/FBC/ProductsAndServices/ClientsAndAccounts/Account" @@ -450,10 +459,10 @@ { "com.linkedin.pegasus2avro.glossary.GlossaryRelatedTerms": { "isRelatedTerms": [ - "urn:li:glossaryTerm:Classification.Highly Confidential" + "urn:li:glossaryTerm:Classification.Highly-Confidential" ], "hasRelatedTerms": [ - "urn:li:glossaryTerm:Clients And Accounts.Balance" + "urn:li:glossaryTerm:Clients-And-Accounts.Balance" ] } }, @@ -466,6 +475,7 @@ "typeUrn": "urn:li:ownershipType:my_cutom_type" } ], + "ownerTypes": {}, "lastModified": { "time": 0, "actor": "urn:li:corpuser:unknown" @@ -477,21 +487,21 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": 
"datahub-business-glossary-2020_04_14-07_00_00", + "runId": "datahub-business-glossary-2020_04_14-07_00_00-h7iopd", "lastRunId": "no-run-id-provided" } }, { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.GlossaryTermSnapshot": { - "urn": "urn:li:glossaryTerm:Clients And Accounts.Balance", + "urn": "urn:li:glossaryTerm:Clients-And-Accounts.Balance", "aspects": [ { "com.linkedin.pegasus2avro.glossary.GlossaryTermInfo": { "customProperties": {}, "name": "Balance", "definition": "Amount of money available or owed", - "parentNode": "urn:li:glossaryNode:Clients And Accounts", + "parentNode": "urn:li:glossaryNode:Clients-And-Accounts", "termSource": "EXTERNAL", "sourceRef": "FIBO", "sourceUrl": "https://spec.edmcouncil.org/fibo/ontology/FBC/ProductsAndServices/ClientsAndAccounts/Balance" @@ -506,6 +516,7 @@ "typeUrn": "urn:li:ownershipType:my_cutom_type" } ], + "ownerTypes": {}, "lastModified": { "time": 0, "actor": "urn:li:corpuser:unknown" @@ -517,7 +528,7 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "datahub-business-glossary-2020_04_14-07_00_00", + "runId": "datahub-business-glossary-2020_04_14-07_00_00-h7iopd", "lastRunId": "no-run-id-provided" } }, @@ -541,6 +552,7 @@ "type": "DEVELOPER" } ], + "ownerTypes": {}, "lastModified": { "time": 0, "actor": "urn:li:corpuser:unknown" @@ -552,14 +564,14 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "datahub-business-glossary-2020_04_14-07_00_00", + "runId": "datahub-business-glossary-2020_04_14-07_00_00-h7iopd", "lastRunId": "no-run-id-provided" } }, { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.GlossaryTermSnapshot": { - "urn": "urn:li:glossaryTerm:4faf1eed790370f65942f2998a7993d6", + "urn": "urn:li:glossaryTerm:KPIs.CSAT", "aspects": [ { "com.linkedin.pegasus2avro.glossary.GlossaryTermInfo": { @@ -580,6 +592,7 @@ "type": "DEVELOPER" } ], + "ownerTypes": {}, "lastModified": { "time": 0, "actor": "urn:li:corpuser:unknown" @@ -591,7 +604,7 
@@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "datahub-business-glossary-2020_04_14-07_00_00", + "runId": "datahub-business-glossary-2020_04_14-07_00_00-h7iopd", "lastRunId": "no-run-id-provided" } }, @@ -607,13 +620,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "datahub-business-glossary-2020_04_14-07_00_00", + "runId": "datahub-business-glossary-2020_04_14-07_00_00-h7iopd", "lastRunId": "no-run-id-provided" } }, { "entityType": "glossaryNode", - "entityUrn": "urn:li:glossaryNode:Clients And Accounts", + "entityUrn": "urn:li:glossaryNode:Clients-And-Accounts", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -623,7 +636,7 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "datahub-business-glossary-2020_04_14-07_00_00", + "runId": "datahub-business-glossary-2020_04_14-07_00_00-h7iopd", "lastRunId": "no-run-id-provided" } }, @@ -639,13 +652,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "datahub-business-glossary-2020_04_14-07_00_00", + "runId": "datahub-business-glossary-2020_04_14-07_00_00-h7iopd", "lastRunId": "no-run-id-provided" } }, { "entityType": "glossaryNode", - "entityUrn": "urn:li:glossaryNode:Personal Information", + "entityUrn": "urn:li:glossaryNode:Personal-Information", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -655,13 +668,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "datahub-business-glossary-2020_04_14-07_00_00", + "runId": "datahub-business-glossary-2020_04_14-07_00_00-h7iopd", "lastRunId": "no-run-id-provided" } }, { "entityType": "glossaryTerm", - "entityUrn": "urn:li:glossaryTerm:4faf1eed790370f65942f2998a7993d6", + "entityUrn": "urn:li:glossaryTerm:Classification.Confidential", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -671,13 +684,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "datahub-business-glossary-2020_04_14-07_00_00", + "runId": 
"datahub-business-glossary-2020_04_14-07_00_00-h7iopd", "lastRunId": "no-run-id-provided" } }, { "entityType": "glossaryTerm", - "entityUrn": "urn:li:glossaryTerm:Classification.Confidential", + "entityUrn": "urn:li:glossaryTerm:Classification.Highly-Confidential", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -687,13 +700,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "datahub-business-glossary-2020_04_14-07_00_00", + "runId": "datahub-business-glossary-2020_04_14-07_00_00-h7iopd", "lastRunId": "no-run-id-provided" } }, { "entityType": "glossaryTerm", - "entityUrn": "urn:li:glossaryTerm:Classification.Highly Confidential", + "entityUrn": "urn:li:glossaryTerm:Classification.Sensitive", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -703,13 +716,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "datahub-business-glossary-2020_04_14-07_00_00", + "runId": "datahub-business-glossary-2020_04_14-07_00_00-h7iopd", "lastRunId": "no-run-id-provided" } }, { "entityType": "glossaryTerm", - "entityUrn": "urn:li:glossaryTerm:Classification.Sensitive", + "entityUrn": "urn:li:glossaryTerm:Clients-And-Accounts.Account", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -719,13 +732,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "datahub-business-glossary-2020_04_14-07_00_00", + "runId": "datahub-business-glossary-2020_04_14-07_00_00-h7iopd", "lastRunId": "no-run-id-provided" } }, { "entityType": "glossaryTerm", - "entityUrn": "urn:li:glossaryTerm:Clients And Accounts.Account", + "entityUrn": "urn:li:glossaryTerm:Clients-And-Accounts.Balance", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -735,13 +748,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "datahub-business-glossary-2020_04_14-07_00_00", + "runId": "datahub-business-glossary-2020_04_14-07_00_00-h7iopd", "lastRunId": "no-run-id-provided" } }, { "entityType": 
"glossaryTerm", - "entityUrn": "urn:li:glossaryTerm:Clients And Accounts.Balance", + "entityUrn": "urn:li:glossaryTerm:KPIs.CSAT", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -751,13 +764,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "datahub-business-glossary-2020_04_14-07_00_00", + "runId": "datahub-business-glossary-2020_04_14-07_00_00-h7iopd", "lastRunId": "no-run-id-provided" } }, { "entityType": "glossaryTerm", - "entityUrn": "urn:li:glossaryTerm:Personal Information.Address", + "entityUrn": "urn:li:glossaryTerm:Personal-Information.Address", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -767,13 +780,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "datahub-business-glossary-2020_04_14-07_00_00", + "runId": "datahub-business-glossary-2020_04_14-07_00_00-h7iopd", "lastRunId": "no-run-id-provided" } }, { "entityType": "glossaryTerm", - "entityUrn": "urn:li:glossaryTerm:Personal Information.Email", + "entityUrn": "urn:li:glossaryTerm:Personal-Information.Email", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -783,13 +796,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "datahub-business-glossary-2020_04_14-07_00_00", + "runId": "datahub-business-glossary-2020_04_14-07_00_00-h7iopd", "lastRunId": "no-run-id-provided" } }, { "entityType": "glossaryTerm", - "entityUrn": "urn:li:glossaryTerm:Personal Information.Gender", + "entityUrn": "urn:li:glossaryTerm:Personal-Information.Gender", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -799,7 +812,7 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "datahub-business-glossary-2020_04_14-07_00_00", + "runId": "datahub-business-glossary-2020_04_14-07_00_00-h7iopd", "lastRunId": "no-run-id-provided" } } diff --git a/metadata-ingestion/tests/integration/business-glossary/multiple_owners_different_types_golden.json 
b/metadata-ingestion/tests/integration/business-glossary/multiple_owners_different_types_golden.json index 4cec348708291a..b6b1ec101a39da 100644 --- a/metadata-ingestion/tests/integration/business-glossary/multiple_owners_different_types_golden.json +++ b/metadata-ingestion/tests/integration/business-glossary/multiple_owners_different_types_golden.json @@ -2,7 +2,7 @@ { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.GlossaryNodeSnapshot": { - "urn": "urn:li:glossaryNode:Different Owner Types", + "urn": "urn:li:glossaryNode:Different-Owner-Types", "aspects": [ { "com.linkedin.pegasus2avro.glossary.GlossaryNodeInfo": { @@ -47,21 +47,21 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "datahub-business-glossary-2020_04_14-07_00_00-2te9j9", + "runId": "datahub-business-glossary-2020_04_14-07_00_00-8vduoq", "lastRunId": "no-run-id-provided" } }, { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.GlossaryTermSnapshot": { - "urn": "urn:li:glossaryTerm:Different Owner Types.Mixed Ownership", + "urn": "urn:li:glossaryTerm:Different-Owner-Types.Mixed-Ownership", "aspects": [ { "com.linkedin.pegasus2avro.glossary.GlossaryTermInfo": { "customProperties": {}, "name": "Mixed Ownership", "definition": "Term with different owner types", - "parentNode": "urn:li:glossaryNode:Different Owner Types", + "parentNode": "urn:li:glossaryNode:Different-Owner-Types", "termSource": "INTERNAL", "sourceRef": "DataHub", "sourceUrl": "https://github.com/datahub-project/datahub/" @@ -99,13 +99,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "datahub-business-glossary-2020_04_14-07_00_00-2te9j9", + "runId": "datahub-business-glossary-2020_04_14-07_00_00-8vduoq", "lastRunId": "no-run-id-provided" } }, { "entityType": "glossaryNode", - "entityUrn": "urn:li:glossaryNode:Different Owner Types", + "entityUrn": "urn:li:glossaryNode:Different-Owner-Types", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -115,13 
+115,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "datahub-business-glossary-2020_04_14-07_00_00-2te9j9", + "runId": "datahub-business-glossary-2020_04_14-07_00_00-8vduoq", "lastRunId": "no-run-id-provided" } }, { "entityType": "glossaryTerm", - "entityUrn": "urn:li:glossaryTerm:Different Owner Types.Mixed Ownership", + "entityUrn": "urn:li:glossaryTerm:Different-Owner-Types.Mixed-Ownership", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -131,7 +131,7 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "datahub-business-glossary-2020_04_14-07_00_00-2te9j9", + "runId": "datahub-business-glossary-2020_04_14-07_00_00-8vduoq", "lastRunId": "no-run-id-provided" } } diff --git a/metadata-ingestion/tests/integration/business-glossary/multiple_owners_same_type_golden.json b/metadata-ingestion/tests/integration/business-glossary/multiple_owners_same_type_golden.json index 9342682510d84b..a3db411db32eca 100644 --- a/metadata-ingestion/tests/integration/business-glossary/multiple_owners_same_type_golden.json +++ b/metadata-ingestion/tests/integration/business-glossary/multiple_owners_same_type_golden.json @@ -2,7 +2,7 @@ { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.GlossaryNodeSnapshot": { - "urn": "urn:li:glossaryNode:Multiple Owners", + "urn": "urn:li:glossaryNode:Multiple-Owners", "aspects": [ { "com.linkedin.pegasus2avro.glossary.GlossaryNodeInfo": { @@ -47,21 +47,21 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "datahub-business-glossary-2020_04_14-07_00_00-0l66l7", + "runId": "datahub-business-glossary-2020_04_14-07_00_00-iuvo6j", "lastRunId": "no-run-id-provided" } }, { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.GlossaryTermSnapshot": { - "urn": "urn:li:glossaryTerm:Multiple Owners.Multiple Dev Owners", + "urn": "urn:li:glossaryTerm:Multiple-Owners.Multiple-Dev-Owners", "aspects": [ { "com.linkedin.pegasus2avro.glossary.GlossaryTermInfo": { 
"customProperties": {}, "name": "Multiple Dev Owners", "definition": "Term owned by multiple developers", - "parentNode": "urn:li:glossaryNode:Multiple Owners", + "parentNode": "urn:li:glossaryNode:Multiple-Owners", "termSource": "INTERNAL", "sourceRef": "DataHub", "sourceUrl": "https://github.com/datahub-project/datahub/" @@ -103,13 +103,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "datahub-business-glossary-2020_04_14-07_00_00-0l66l7", + "runId": "datahub-business-glossary-2020_04_14-07_00_00-iuvo6j", "lastRunId": "no-run-id-provided" } }, { "entityType": "glossaryNode", - "entityUrn": "urn:li:glossaryNode:Multiple Owners", + "entityUrn": "urn:li:glossaryNode:Multiple-Owners", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -119,13 +119,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "datahub-business-glossary-2020_04_14-07_00_00-0l66l7", + "runId": "datahub-business-glossary-2020_04_14-07_00_00-iuvo6j", "lastRunId": "no-run-id-provided" } }, { "entityType": "glossaryTerm", - "entityUrn": "urn:li:glossaryTerm:Multiple Owners.Multiple Dev Owners", + "entityUrn": "urn:li:glossaryTerm:Multiple-Owners.Multiple-Dev-Owners", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -135,7 +135,7 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "datahub-business-glossary-2020_04_14-07_00_00-0l66l7", + "runId": "datahub-business-glossary-2020_04_14-07_00_00-iuvo6j", "lastRunId": "no-run-id-provided" } } diff --git a/metadata-ingestion/tests/integration/business-glossary/single_owner_types_golden.json b/metadata-ingestion/tests/integration/business-glossary/single_owner_types_golden.json index 006e77f523a10b..420f829bbf50b6 100644 --- a/metadata-ingestion/tests/integration/business-glossary/single_owner_types_golden.json +++ b/metadata-ingestion/tests/integration/business-glossary/single_owner_types_golden.json @@ -2,7 +2,7 @@ { "proposedSnapshot": { 
"com.linkedin.pegasus2avro.metadata.snapshot.GlossaryNodeSnapshot": { - "urn": "urn:li:glossaryNode:Single Owner Types", + "urn": "urn:li:glossaryNode:Single-Owner-Types", "aspects": [ { "com.linkedin.pegasus2avro.glossary.GlossaryNodeInfo": { @@ -31,21 +31,21 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "datahub-business-glossary-2020_04_14-07_00_00-ruwyic", + "runId": "datahub-business-glossary-2020_04_14-07_00_00-bx72oe", "lastRunId": "no-run-id-provided" } }, { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.GlossaryTermSnapshot": { - "urn": "urn:li:glossaryTerm:Single Owner Types.Developer Owned", + "urn": "urn:li:glossaryTerm:Single-Owner-Types.Developer-Owned", "aspects": [ { "com.linkedin.pegasus2avro.glossary.GlossaryTermInfo": { "customProperties": {}, "name": "Developer Owned", "definition": "Term owned by developer", - "parentNode": "urn:li:glossaryNode:Single Owner Types", + "parentNode": "urn:li:glossaryNode:Single-Owner-Types", "termSource": "INTERNAL", "sourceRef": "DataHub", "sourceUrl": "https://github.com/datahub-project/datahub/" @@ -71,21 +71,21 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "datahub-business-glossary-2020_04_14-07_00_00-ruwyic", + "runId": "datahub-business-glossary-2020_04_14-07_00_00-bx72oe", "lastRunId": "no-run-id-provided" } }, { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.GlossaryTermSnapshot": { - "urn": "urn:li:glossaryTerm:Single Owner Types.Data Owner Owned", + "urn": "urn:li:glossaryTerm:Single-Owner-Types.Data-Owner-Owned", "aspects": [ { "com.linkedin.pegasus2avro.glossary.GlossaryTermInfo": { "customProperties": {}, "name": "Data Owner Owned", "definition": "Term owned by data owner", - "parentNode": "urn:li:glossaryNode:Single Owner Types", + "parentNode": "urn:li:glossaryNode:Single-Owner-Types", "termSource": "INTERNAL", "sourceRef": "DataHub", "sourceUrl": "https://github.com/datahub-project/datahub/" @@ -111,21 +111,21 @@ 
}, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "datahub-business-glossary-2020_04_14-07_00_00-ruwyic", + "runId": "datahub-business-glossary-2020_04_14-07_00_00-bx72oe", "lastRunId": "no-run-id-provided" } }, { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.GlossaryTermSnapshot": { - "urn": "urn:li:glossaryTerm:Single Owner Types.Producer Owned", + "urn": "urn:li:glossaryTerm:Single-Owner-Types.Producer-Owned", "aspects": [ { "com.linkedin.pegasus2avro.glossary.GlossaryTermInfo": { "customProperties": {}, "name": "Producer Owned", "definition": "Term owned by producer", - "parentNode": "urn:li:glossaryNode:Single Owner Types", + "parentNode": "urn:li:glossaryNode:Single-Owner-Types", "termSource": "INTERNAL", "sourceRef": "DataHub", "sourceUrl": "https://github.com/datahub-project/datahub/" @@ -151,21 +151,21 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "datahub-business-glossary-2020_04_14-07_00_00-ruwyic", + "runId": "datahub-business-glossary-2020_04_14-07_00_00-bx72oe", "lastRunId": "no-run-id-provided" } }, { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.GlossaryTermSnapshot": { - "urn": "urn:li:glossaryTerm:Single Owner Types.Stakeholder Owned", + "urn": "urn:li:glossaryTerm:Single-Owner-Types.Stakeholder-Owned", "aspects": [ { "com.linkedin.pegasus2avro.glossary.GlossaryTermInfo": { "customProperties": {}, "name": "Stakeholder Owned", "definition": "Term owned by stakeholder", - "parentNode": "urn:li:glossaryNode:Single Owner Types", + "parentNode": "urn:li:glossaryNode:Single-Owner-Types", "termSource": "INTERNAL", "sourceRef": "DataHub", "sourceUrl": "https://github.com/datahub-project/datahub/" @@ -191,13 +191,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "datahub-business-glossary-2020_04_14-07_00_00-ruwyic", + "runId": "datahub-business-glossary-2020_04_14-07_00_00-bx72oe", "lastRunId": "no-run-id-provided" } }, { "entityType": "glossaryNode", - 
"entityUrn": "urn:li:glossaryNode:Single Owner Types", + "entityUrn": "urn:li:glossaryNode:Single-Owner-Types", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -207,13 +207,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "datahub-business-glossary-2020_04_14-07_00_00-ruwyic", + "runId": "datahub-business-glossary-2020_04_14-07_00_00-bx72oe", "lastRunId": "no-run-id-provided" } }, { "entityType": "glossaryTerm", - "entityUrn": "urn:li:glossaryTerm:Single Owner Types.Data Owner Owned", + "entityUrn": "urn:li:glossaryTerm:Single-Owner-Types.Data-Owner-Owned", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -223,13 +223,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "datahub-business-glossary-2020_04_14-07_00_00-ruwyic", + "runId": "datahub-business-glossary-2020_04_14-07_00_00-bx72oe", "lastRunId": "no-run-id-provided" } }, { "entityType": "glossaryTerm", - "entityUrn": "urn:li:glossaryTerm:Single Owner Types.Developer Owned", + "entityUrn": "urn:li:glossaryTerm:Single-Owner-Types.Developer-Owned", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -239,13 +239,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "datahub-business-glossary-2020_04_14-07_00_00-ruwyic", + "runId": "datahub-business-glossary-2020_04_14-07_00_00-bx72oe", "lastRunId": "no-run-id-provided" } }, { "entityType": "glossaryTerm", - "entityUrn": "urn:li:glossaryTerm:Single Owner Types.Producer Owned", + "entityUrn": "urn:li:glossaryTerm:Single-Owner-Types.Producer-Owned", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -255,13 +255,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "datahub-business-glossary-2020_04_14-07_00_00-ruwyic", + "runId": "datahub-business-glossary-2020_04_14-07_00_00-bx72oe", "lastRunId": "no-run-id-provided" } }, { "entityType": "glossaryTerm", - "entityUrn": "urn:li:glossaryTerm:Single Owner Types.Stakeholder Owned", + 
"entityUrn": "urn:li:glossaryTerm:Single-Owner-Types.Stakeholder-Owned", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -271,7 +271,7 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "datahub-business-glossary-2020_04_14-07_00_00-ruwyic", + "runId": "datahub-business-glossary-2020_04_14-07_00_00-bx72oe", "lastRunId": "no-run-id-provided" } } diff --git a/metadata-ingestion/tests/integration/business-glossary/test_business_glossary.py b/metadata-ingestion/tests/integration/business-glossary/test_business_glossary.py index 74cf9aa3b528f2..87d8c94a1d3950 100644 --- a/metadata-ingestion/tests/integration/business-glossary/test_business_glossary.py +++ b/metadata-ingestion/tests/integration/business-glossary/test_business_glossary.py @@ -4,7 +4,6 @@ from freezegun import freeze_time from datahub.ingestion.run.pipeline import Pipeline -from datahub.ingestion.source.metadata import business_glossary from tests.test_helpers import mce_helpers FROZEN_TIME = "2020-04-14 07:00:00" @@ -200,6 +199,31 @@ def test_custom_ownership_urns( @freeze_time(FROZEN_TIME) -def test_auto_id_creation_on_reserved_char(): - id_: str = business_glossary.create_id(["pii", "secure % password"], None, False) - assert id_ == "24baf9389cc05c162c7148c96314d733" +@pytest.mark.integration +def test_url_cleaning( + mock_datahub_graph_instance, + pytestconfig, + tmp_path, + mock_time, +): + """Test URL cleaning functionality when auto_id is disabled""" + test_resources_dir = pytestconfig.rootpath / "tests/integration/business-glossary" + output_mces_path: str = f"{tmp_path}/url_cleaning_events.json" + golden_mces_path: str = f"{test_resources_dir}/url_cleaning_events_golden.json" + + pipeline = Pipeline.create( + get_default_recipe( + glossary_yml_file_path=f"{test_resources_dir}/url_cleaning_glossary.yml", + event_output_file_path=output_mces_path, + enable_auto_id=False, + ) + ) + pipeline.ctx.graph = mock_datahub_graph_instance + pipeline.run() + 
pipeline.raise_from_status() + + mce_helpers.check_golden_file( + pytestconfig, + output_path=output_mces_path, + golden_path=golden_mces_path, + ) diff --git a/metadata-ingestion/tests/integration/business-glossary/url_cleaning_events_golden.json b/metadata-ingestion/tests/integration/business-glossary/url_cleaning_events_golden.json new file mode 100644 index 00000000000000..0ff27423c55058 --- /dev/null +++ b/metadata-ingestion/tests/integration/business-glossary/url_cleaning_events_golden.json @@ -0,0 +1,446 @@ +[ +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.GlossaryNodeSnapshot": { + "urn": "urn:li:glossaryNode:URL-Testing", + "aspects": [ + { + "com.linkedin.pegasus2avro.glossary.GlossaryNodeInfo": { + "customProperties": {}, + "definition": "Testing URL cleaning functionality", + "name": "URL Testing" + } + }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:mjames", + "type": "DEVELOPER" + } + ], + "ownerTypes": {}, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "datahub-business-glossary-2020_04_14-07_00_00-4alqef", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.GlossaryTermSnapshot": { + "urn": "urn:li:glossaryTerm:URL-Testing.Basic-Term-With-Spaces", + "aspects": [ + { + "com.linkedin.pegasus2avro.glossary.GlossaryTermInfo": { + "customProperties": {}, + "name": "Basic Term With Spaces", + "definition": "Testing basic space replacement", + "parentNode": "urn:li:glossaryNode:URL-Testing", + "termSource": "INTERNAL", + "sourceRef": "DataHub", + "sourceUrl": "https://github.com/datahub-project/datahub/" + } + }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:mjames", + "type": "DEVELOPER" + } + ], + "ownerTypes": {}, + "lastModified": { + "time": 
0, + "actor": "urn:li:corpuser:unknown" + } + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "datahub-business-glossary-2020_04_14-07_00_00-4alqef", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.GlossaryTermSnapshot": { + "urn": "urn:li:glossaryTerm:URL-Testing.SpecialCharacters", + "aspects": [ + { + "com.linkedin.pegasus2avro.glossary.GlossaryTermInfo": { + "customProperties": {}, + "name": "Special@#$Characters!", + "definition": "Testing special character removal", + "parentNode": "urn:li:glossaryNode:URL-Testing", + "termSource": "INTERNAL", + "sourceRef": "DataHub", + "sourceUrl": "https://github.com/datahub-project/datahub/" + } + }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:mjames", + "type": "DEVELOPER" + } + ], + "ownerTypes": {}, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "datahub-business-glossary-2020_04_14-07_00_00-4alqef", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.GlossaryTermSnapshot": { + "urn": "urn:li:glossaryTerm:URL-Testing.MixedCase-Term", + "aspects": [ + { + "com.linkedin.pegasus2avro.glossary.GlossaryTermInfo": { + "customProperties": {}, + "name": "MixedCase Term", + "definition": "Testing case preservation", + "parentNode": "urn:li:glossaryNode:URL-Testing", + "termSource": "INTERNAL", + "sourceRef": "DataHub", + "sourceUrl": "https://github.com/datahub-project/datahub/" + } + }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:mjames", + "type": "DEVELOPER" + } + ], + "ownerTypes": {}, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, 
+ "runId": "datahub-business-glossary-2020_04_14-07_00_00-4alqef", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.GlossaryTermSnapshot": { + "urn": "urn:li:glossaryTerm:URL-Testing.Multiple-Spaces", + "aspects": [ + { + "com.linkedin.pegasus2avro.glossary.GlossaryTermInfo": { + "customProperties": {}, + "name": "Multiple Spaces", + "definition": "Testing multiple space handling", + "parentNode": "urn:li:glossaryNode:URL-Testing", + "termSource": "INTERNAL", + "sourceRef": "DataHub", + "sourceUrl": "https://github.com/datahub-project/datahub/" + } + }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:mjames", + "type": "DEVELOPER" + } + ], + "ownerTypes": {}, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "datahub-business-glossary-2020_04_14-07_00_00-4alqef", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.GlossaryTermSnapshot": { + "urn": "urn:li:glossaryTerm:URL-Testing.Term.With.Special-Chars", + "aspects": [ + { + "com.linkedin.pegasus2avro.glossary.GlossaryTermInfo": { + "customProperties": {}, + "name": "Term.With.Special-Chars", + "definition": "Testing mixed special characters", + "parentNode": "urn:li:glossaryNode:URL-Testing", + "termSource": "INTERNAL", + "sourceRef": "DataHub", + "sourceUrl": "https://github.com/datahub-project/datahub/" + } + }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:mjames", + "type": "DEVELOPER" + } + ], + "ownerTypes": {}, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "datahub-business-glossary-2020_04_14-07_00_00-4alqef", + "lastRunId": "no-run-id-provided" + 
} +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.GlossaryTermSnapshot": { + "urn": "urn:li:glossaryTerm:URL-Testing.Special-At-Start", + "aspects": [ + { + "com.linkedin.pegasus2avro.glossary.GlossaryTermInfo": { + "customProperties": {}, + "name": "@#$Special At Start", + "definition": "Testing leading special characters", + "parentNode": "urn:li:glossaryNode:URL-Testing", + "termSource": "INTERNAL", + "sourceRef": "DataHub", + "sourceUrl": "https://github.com/datahub-project/datahub/" + } + }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:mjames", + "type": "DEVELOPER" + } + ], + "ownerTypes": {}, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "datahub-business-glossary-2020_04_14-07_00_00-4alqef", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.GlossaryTermSnapshot": { + "urn": "urn:li:glossaryTerm:URL-Testing.Numbers-123", + "aspects": [ + { + "com.linkedin.pegasus2avro.glossary.GlossaryTermInfo": { + "customProperties": {}, + "name": "Numbers 123", + "definition": "Testing numbers in term names", + "parentNode": "urn:li:glossaryNode:URL-Testing", + "termSource": "INTERNAL", + "sourceRef": "DataHub", + "sourceUrl": "https://github.com/datahub-project/datahub/" + } + }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:mjames", + "type": "DEVELOPER" + } + ], + "ownerTypes": {}, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "datahub-business-glossary-2020_04_14-07_00_00-4alqef", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "glossaryNode", + "entityUrn": "urn:li:glossaryNode:URL-Testing", + "changeType": "UPSERT", + 
"aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "datahub-business-glossary-2020_04_14-07_00_00-4alqef", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "glossaryTerm", + "entityUrn": "urn:li:glossaryTerm:URL-Testing.Basic-Term-With-Spaces", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "datahub-business-glossary-2020_04_14-07_00_00-4alqef", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "glossaryTerm", + "entityUrn": "urn:li:glossaryTerm:URL-Testing.MixedCase-Term", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "datahub-business-glossary-2020_04_14-07_00_00-4alqef", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "glossaryTerm", + "entityUrn": "urn:li:glossaryTerm:URL-Testing.Multiple-Spaces", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "datahub-business-glossary-2020_04_14-07_00_00-4alqef", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "glossaryTerm", + "entityUrn": "urn:li:glossaryTerm:URL-Testing.Numbers-123", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "datahub-business-glossary-2020_04_14-07_00_00-4alqef", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "glossaryTerm", + "entityUrn": "urn:li:glossaryTerm:URL-Testing.Special-At-Start", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 
1586847600000, + "runId": "datahub-business-glossary-2020_04_14-07_00_00-4alqef", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "glossaryTerm", + "entityUrn": "urn:li:glossaryTerm:URL-Testing.SpecialCharacters", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "datahub-business-glossary-2020_04_14-07_00_00-4alqef", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "glossaryTerm", + "entityUrn": "urn:li:glossaryTerm:URL-Testing.Term.With.Special-Chars", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "datahub-business-glossary-2020_04_14-07_00_00-4alqef", + "lastRunId": "no-run-id-provided" + } +} +] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/business-glossary/url_cleaning_glossary.yml b/metadata-ingestion/tests/integration/business-glossary/url_cleaning_glossary.yml new file mode 100644 index 00000000000000..64c8ee6837f894 --- /dev/null +++ b/metadata-ingestion/tests/integration/business-glossary/url_cleaning_glossary.yml @@ -0,0 +1,31 @@ +# tests/integration/business-glossary/url_cleaning_glossary.yml +version: "1" +source: DataHub +owners: + users: + - mjames +url: "https://github.com/datahub-project/datahub/" +nodes: + - name: "URL Testing" + description: "Testing URL cleaning functionality" + terms: + - name: "Basic Term With Spaces" + description: "Testing basic space replacement" + + - name: "Special@#$Characters!" 
+ description: "Testing special character removal" + + - name: "MixedCase Term" + description: "Testing case preservation" + + - name: "Multiple Spaces" + description: "Testing multiple space handling" + + - name: "Term.With.Special-Chars" + description: "Testing mixed special characters" + + - name: "@#$Special At Start" + description: "Testing leading special characters" + + - name: "Numbers 123" + description: "Testing numbers in term names" \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/remote/golden/remote_glossary_golden.json b/metadata-ingestion/tests/integration/remote/golden/remote_glossary_golden.json index a3adcb7639712b..d482c135cfa2da 100644 --- a/metadata-ingestion/tests/integration/remote/golden/remote_glossary_golden.json +++ b/metadata-ingestion/tests/integration/remote/golden/remote_glossary_golden.json @@ -21,6 +21,7 @@ "type": "DEVELOPER" } ], + "ownerTypes": {}, "lastModified": { "time": 0, "actor": "urn:li:corpuser:unknown" @@ -88,6 +89,7 @@ "type": "DEVELOPER" } ], + "ownerTypes": {}, "lastModified": { "time": 0, "actor": "urn:li:corpuser:unknown" @@ -155,6 +157,7 @@ "type": "DEVELOPER" } ], + "ownerTypes": {}, "lastModified": { "time": 0, "actor": "urn:li:corpuser:unknown" @@ -172,7 +175,7 @@ }, { "entityType": "glossaryTerm", - "entityUrn": "urn:li:glossaryTerm:Classification.Highly Confidential", + "entityUrn": "urn:li:glossaryTerm:Classification.Highly-Confidential", "changeType": "UPSERT", "aspectName": "domains", "aspect": { @@ -191,7 +194,7 @@ { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.GlossaryTermSnapshot": { - "urn": "urn:li:glossaryTerm:Classification.Highly Confidential", + "urn": "urn:li:glossaryTerm:Classification.Highly-Confidential", "aspects": [ { "com.linkedin.pegasus2avro.glossary.GlossaryTermInfo": { @@ -214,6 +217,7 @@ "type": "DEVELOPER" } ], + "ownerTypes": {}, "lastModified": { "time": 0, "actor": "urn:li:corpuser:unknown" @@ -232,7 +236,7 @@ { "proposedSnapshot": 
{ "com.linkedin.pegasus2avro.metadata.snapshot.GlossaryNodeSnapshot": { - "urn": "urn:li:glossaryNode:Personal Information", + "urn": "urn:li:glossaryNode:Personal-Information", "aspects": [ { "com.linkedin.pegasus2avro.glossary.GlossaryNodeInfo": { @@ -249,6 +253,7 @@ "type": "DEVELOPER" } ], + "ownerTypes": {}, "lastModified": { "time": 0, "actor": "urn:li:corpuser:unknown" @@ -267,14 +272,14 @@ { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.GlossaryTermSnapshot": { - "urn": "urn:li:glossaryTerm:Personal Information.Email", + "urn": "urn:li:glossaryTerm:Personal-Information.Email", "aspects": [ { "com.linkedin.pegasus2avro.glossary.GlossaryTermInfo": { "customProperties": {}, "name": "Email", "definition": "An individual's email address", - "parentNode": "urn:li:glossaryNode:Personal Information", + "parentNode": "urn:li:glossaryNode:Personal-Information", "termSource": "INTERNAL", "sourceRef": "DataHub", "sourceUrl": "https://github.com/datahub-project/datahub/" @@ -295,6 +300,7 @@ "type": "DEVELOPER" } ], + "ownerTypes": {}, "lastModified": { "time": 0, "actor": "urn:li:corpuser:unknown" @@ -313,14 +319,14 @@ { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.GlossaryTermSnapshot": { - "urn": "urn:li:glossaryTerm:Personal Information.Address", + "urn": "urn:li:glossaryTerm:Personal-Information.Address", "aspects": [ { "com.linkedin.pegasus2avro.glossary.GlossaryTermInfo": { "customProperties": {}, "name": "Address", "definition": "A physical address", - "parentNode": "urn:li:glossaryNode:Personal Information", + "parentNode": "urn:li:glossaryNode:Personal-Information", "termSource": "INTERNAL", "sourceRef": "DataHub", "sourceUrl": "https://github.com/datahub-project/datahub/" @@ -334,6 +340,7 @@ "type": "DEVELOPER" } ], + "ownerTypes": {}, "lastModified": { "time": 0, "actor": "urn:li:corpuser:unknown" @@ -352,14 +359,14 @@ { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.GlossaryTermSnapshot": { - 
"urn": "urn:li:glossaryTerm:Personal Information.Gender", + "urn": "urn:li:glossaryTerm:Personal-Information.Gender", "aspects": [ { "com.linkedin.pegasus2avro.glossary.GlossaryTermInfo": { "customProperties": {}, "name": "Gender", "definition": "The gender identity of the individual", - "parentNode": "urn:li:glossaryNode:Personal Information", + "parentNode": "urn:li:glossaryNode:Personal-Information", "termSource": "INTERNAL", "sourceRef": "DataHub", "sourceUrl": "https://github.com/datahub-project/datahub/" @@ -380,6 +387,7 @@ "type": "DEVELOPER" } ], + "ownerTypes": {}, "lastModified": { "time": 0, "actor": "urn:li:corpuser:unknown" @@ -398,7 +406,7 @@ { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.GlossaryNodeSnapshot": { - "urn": "urn:li:glossaryNode:Clients And Accounts", + "urn": "urn:li:glossaryNode:Clients-And-Accounts", "aspects": [ { "com.linkedin.pegasus2avro.glossary.GlossaryNodeInfo": { @@ -415,6 +423,7 @@ "type": "DEVELOPER" } ], + "ownerTypes": {}, "lastModified": { "time": 0, "actor": "urn:li:corpuser:unknown" @@ -433,14 +442,14 @@ { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.GlossaryTermSnapshot": { - "urn": "urn:li:glossaryTerm:Clients And Accounts.Account", + "urn": "urn:li:glossaryTerm:Clients-And-Accounts.Account", "aspects": [ { "com.linkedin.pegasus2avro.glossary.GlossaryTermInfo": { "customProperties": {}, "name": "Account", "definition": "Container for records associated with a business arrangement for regular transactions and services", - "parentNode": "urn:li:glossaryNode:Clients And Accounts", + "parentNode": "urn:li:glossaryNode:Clients-And-Accounts", "termSource": "EXTERNAL", "sourceRef": "FIBO", "sourceUrl": "https://spec.edmcouncil.org/fibo/ontology/FBC/ProductsAndServices/ClientsAndAccounts/Account" @@ -449,10 +458,10 @@ { "com.linkedin.pegasus2avro.glossary.GlossaryRelatedTerms": { "isRelatedTerms": [ - "urn:li:glossaryTerm:Classification.Highly Confidential" + 
"urn:li:glossaryTerm:Classification.Highly-Confidential" ], "hasRelatedTerms": [ - "urn:li:glossaryTerm:Clients And Accounts.Balance" + "urn:li:glossaryTerm:Clients-And-Accounts.Balance" ] } }, @@ -464,6 +473,7 @@ "type": "DEVELOPER" } ], + "ownerTypes": {}, "lastModified": { "time": 0, "actor": "urn:li:corpuser:unknown" @@ -482,14 +492,14 @@ { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.GlossaryTermSnapshot": { - "urn": "urn:li:glossaryTerm:Clients And Accounts.Balance", + "urn": "urn:li:glossaryTerm:Clients-And-Accounts.Balance", "aspects": [ { "com.linkedin.pegasus2avro.glossary.GlossaryTermInfo": { "customProperties": {}, "name": "Balance", "definition": "Amount of money available or owed", - "parentNode": "urn:li:glossaryNode:Clients And Accounts", + "parentNode": "urn:li:glossaryNode:Clients-And-Accounts", "termSource": "EXTERNAL", "sourceRef": "FIBO", "sourceUrl": "https://spec.edmcouncil.org/fibo/ontology/FBC/ProductsAndServices/ClientsAndAccounts/Balance" @@ -503,6 +513,7 @@ "type": "DEVELOPER" } ], + "ownerTypes": {}, "lastModified": { "time": 0, "actor": "urn:li:corpuser:unknown" @@ -538,6 +549,7 @@ "type": "DEVELOPER" } ], + "ownerTypes": {}, "lastModified": { "time": 0, "actor": "urn:li:corpuser:unknown" @@ -556,7 +568,7 @@ { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.GlossaryTermSnapshot": { - "urn": "urn:li:glossaryTerm:4faf1eed790370f65942f2998a7993d6", + "urn": "urn:li:glossaryTerm:KPIs.CSAT", "aspects": [ { "com.linkedin.pegasus2avro.glossary.GlossaryTermInfo": { @@ -577,6 +589,7 @@ "type": "DEVELOPER" } ], + "ownerTypes": {}, "lastModified": { "time": 0, "actor": "urn:li:corpuser:unknown" @@ -610,7 +623,7 @@ }, { "entityType": "glossaryNode", - "entityUrn": "urn:li:glossaryNode:Clients And Accounts", + "entityUrn": "urn:li:glossaryNode:Clients-And-Accounts", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -642,7 +655,7 @@ }, { "entityType": "glossaryNode", - "entityUrn": 
"urn:li:glossaryNode:Personal Information", + "entityUrn": "urn:li:glossaryNode:Personal-Information", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -658,7 +671,7 @@ }, { "entityType": "glossaryTerm", - "entityUrn": "urn:li:glossaryTerm:4faf1eed790370f65942f2998a7993d6", + "entityUrn": "urn:li:glossaryTerm:Classification.Confidential", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -674,7 +687,7 @@ }, { "entityType": "glossaryTerm", - "entityUrn": "urn:li:glossaryTerm:Classification.Confidential", + "entityUrn": "urn:li:glossaryTerm:Classification.Highly-Confidential", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -690,7 +703,7 @@ }, { "entityType": "glossaryTerm", - "entityUrn": "urn:li:glossaryTerm:Classification.Highly Confidential", + "entityUrn": "urn:li:glossaryTerm:Classification.Sensitive", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -706,7 +719,7 @@ }, { "entityType": "glossaryTerm", - "entityUrn": "urn:li:glossaryTerm:Classification.Sensitive", + "entityUrn": "urn:li:glossaryTerm:Clients-And-Accounts.Account", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -722,7 +735,7 @@ }, { "entityType": "glossaryTerm", - "entityUrn": "urn:li:glossaryTerm:Clients And Accounts.Account", + "entityUrn": "urn:li:glossaryTerm:Clients-And-Accounts.Balance", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -738,7 +751,7 @@ }, { "entityType": "glossaryTerm", - "entityUrn": "urn:li:glossaryTerm:Clients And Accounts.Balance", + "entityUrn": "urn:li:glossaryTerm:KPIs.CSAT", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -754,7 +767,7 @@ }, { "entityType": "glossaryTerm", - "entityUrn": "urn:li:glossaryTerm:Personal Information.Address", + "entityUrn": "urn:li:glossaryTerm:Personal-Information.Address", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -770,7 +783,7 @@ }, { "entityType": "glossaryTerm", - "entityUrn": "urn:li:glossaryTerm:Personal 
Information.Email", + "entityUrn": "urn:li:glossaryTerm:Personal-Information.Email", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -786,7 +799,7 @@ }, { "entityType": "glossaryTerm", - "entityUrn": "urn:li:glossaryTerm:Personal Information.Gender", + "entityUrn": "urn:li:glossaryTerm:Personal-Information.Gender", "changeType": "UPSERT", "aspectName": "status", "aspect": { diff --git a/metadata-ingestion/tests/unit/test_business_glossary.py b/metadata-ingestion/tests/unit/test_business_glossary.py new file mode 100644 index 00000000000000..2c55317af701d0 --- /dev/null +++ b/metadata-ingestion/tests/unit/test_business_glossary.py @@ -0,0 +1,85 @@ +from datahub.ingestion.source.metadata.business_glossary import clean_url, create_id + + +def test_clean_url(): + """Test the clean_url function with various input cases""" + test_cases = [ + ("Basic Term", "Basic-Term"), + ("Term With Spaces", "Term-With-Spaces"), + ("Special@#$Characters!", "SpecialCharacters"), + ("MixedCase Term", "MixedCase-Term"), + ("Multiple Spaces", "Multiple-Spaces"), + ("Term-With-Hyphens", "Term-With-Hyphens"), + ("Term.With.Dots", "Term.With.Dots"), # Preserve periods + ("Term_With_Underscores", "TermWithUnderscores"), + ("123 Numeric Term", "123-Numeric-Term"), + ("@#$Special At Start", "Special-At-Start"), + ("-Leading-Trailing-", "Leading-Trailing"), + ("Multiple...Periods", "Multiple.Periods"), # Test multiple periods + ("Mixed-Hyphens.Periods", "Mixed-Hyphens.Periods"), # Test mixed separators + ] + + for input_str, expected in test_cases: + result = clean_url(input_str) + assert result == expected, ( + f"Expected '{expected}' for input '{input_str}', got '{result}'" + ) + + +def test_clean_url_edge_cases(): + """Test clean_url function with edge cases""" + test_cases = [ + ("", ""), # Empty string + (" ", ""), # Single space + (" ", ""), # Multiple spaces + ("@#$%", ""), # Only special characters + ("A", "A"), # Single character + ("A B", "A-B"), # Two characters with 
space + ("A.B", "A.B"), # Period separator + ("...", ""), # Only periods + (".Leading.Trailing.", "Leading.Trailing"), # Leading/trailing periods + ] + + for input_str, expected in test_cases: + result = clean_url(input_str) + assert result == expected, ( + f"Expected '{expected}' for input '{input_str}', got '{result}'" + ) + + +def test_create_id_url_cleaning(): + """Test create_id function's URL cleaning behavior""" + # Test basic URL cleaning + id_ = create_id(["pii", "secure % password"], None, False) + assert id_ == "pii.secure-password" + + # Test with multiple path components + id_ = create_id(["Term One", "Term Two", "Term Three"], None, False) + assert id_ == "Term-One.Term-Two.Term-Three" + + # Test with path components containing periods + id_ = create_id(["Term.One", "Term.Two"], None, False) + assert id_ == "Term.One.Term.Two" + + +def test_create_id_with_special_chars(): + """Test create_id function's handling of special characters""" + # Test with non-ASCII characters (should trigger auto_id) + id_ = create_id(["pii", "secure パスワード"], None, False) + assert len(id_) == 32 # GUID length + assert id_.isalnum() # Should only contain alphanumeric characters + + # Test with characters that aren't periods or hyphens + id_ = create_id(["test", "special@#$chars"], None, False) + assert id_ == "test.specialchars" + + +def test_create_id_with_default(): + """Test create_id function with default_id parameter""" + # Test that default_id is respected + id_ = create_id(["any", "path"], "custom-id", False) + assert id_ == "custom-id" + + # Test with URN as default_id + id_ = create_id(["any", "path"], "urn:li:glossaryTerm:custom-id", False) + assert id_ == "urn:li:glossaryTerm:custom-id" From fcabe88962669df54482934a0b8ef6432603d12b Mon Sep 17 00:00:00 2001 From: Jonny Dixon <45681293+acrylJonny@users.noreply.github.com> Date: Thu, 6 Mar 2025 14:32:03 +0000 Subject: [PATCH 10/10] fix(ingestion/oracle): Improved foreign key handling (#11867) Co-authored-by: Harshal 
Sheth --- .../datahub/ingestion/source/sql/oracle.py | 156 +- .../oracle/golden_test_error_handling.json | 1386 +++++++++++++++++ .../tests/integration/oracle/test_oracle.py | 127 +- 3 files changed, 1602 insertions(+), 67 deletions(-) create mode 100644 metadata-ingestion/tests/integration/oracle/golden_test_error_handling.json diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/oracle.py b/metadata-ingestion/src/datahub/ingestion/source/sql/oracle.py index 9042f9a63a3ae9..16e382278154e1 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/oracle.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/oracle.py @@ -152,6 +152,7 @@ def __init__(self, inspector_instance: Inspector): self.exclude_tablespaces: Tuple[str, str] = ("SYSTEM", "SYSAUX") def get_db_name(self) -> str: + db_name = None try: # Try to retrieve current DB name by executing query db_name = self._inspector_instance.bind.execute( @@ -159,7 +160,12 @@ def get_db_name(self) -> str: ).scalar() return str(db_name) except sqlalchemy.exc.DatabaseError as e: - logger.error("Error fetching DB name: " + str(e)) + self.report.failure( + title="Error fetching database name using sys_context.", + message="database_fetch_error", + context=db_name, + exc=e, + ) return "" def get_schema_names(self) -> List[str]: @@ -326,8 +332,8 @@ def get_columns( try: coltype = ischema_names[coltype]() except KeyError: - logger.warning( - f"Did not recognize type {coltype} of column {colname}" + logger.info( + f"Unrecognized column datatype {coltype} of column {colname}" ) coltype = sqltypes.NULLTYPE @@ -379,8 +385,8 @@ def get_table_comment(self, table_name: str, schema: Optional[str] = None) -> Di COMMENT_SQL = """ SELECT comments FROM dba_tab_comments - WHERE table_name = CAST(:table_name AS VARCHAR(128)) - AND owner = CAST(:schema_name AS VARCHAR(128)) + WHERE table_name = :table_name + AND owner = :schema_name """ c = self._inspector_instance.bind.execute( @@ -397,79 +403,93 @@ def 
_get_constraint_data( text = ( "SELECT" - "\nac.constraint_name," # 0 - "\nac.constraint_type," # 1 - "\nloc.column_name AS local_column," # 2 - "\nrem.table_name AS remote_table," # 3 - "\nrem.column_name AS remote_column," # 4 - "\nrem.owner AS remote_owner," # 5 - "\nloc.position as loc_pos," # 6 - "\nrem.position as rem_pos," # 7 - "\nac.search_condition," # 8 - "\nac.delete_rule" # 9 - "\nFROM dba_constraints%(dblink)s ac," - "\ndba_cons_columns%(dblink)s loc," - "\ndba_cons_columns%(dblink)s rem" - "\nWHERE ac.table_name = CAST(:table_name AS VARCHAR2(128))" - "\nAND ac.constraint_type IN ('R','P', 'U', 'C')" + "\nac.constraint_name," + "\nac.constraint_type," + "\nacc.column_name AS local_column," + "\nNULL AS remote_table," + "\nNULL AS remote_column," + "\nNULL AS remote_owner," + "\nacc.position AS loc_pos," + "\nNULL AS rem_pos," + "\nac.search_condition," + "\nac.delete_rule" + "\nFROM dba_constraints ac" + "\nJOIN dba_cons_columns acc" + "\nON ac.owner = acc.owner" + "\nAND ac.constraint_name = acc.constraint_name" + "\nAND ac.table_name = acc.table_name" + "\nWHERE ac.table_name = :table_name" + "\nAND ac.constraint_type IN ('P', 'U', 'C')" ) if schema is not None: params["owner"] = schema - text += "\nAND ac.owner = CAST(:owner AS VARCHAR2(128))" + text += "\nAND ac.owner = :owner" + # Splitting into queries with UNION ALL for execution efficiency text += ( - "\nAND ac.owner = loc.owner" - "\nAND ac.constraint_name = loc.constraint_name" - "\nAND ac.r_owner = rem.owner(+)" - "\nAND ac.r_constraint_name = rem.constraint_name(+)" - "\nAND (rem.position IS NULL or loc.position=rem.position)" - "\nORDER BY ac.constraint_name, loc.position" + "\nUNION ALL" + "\nSELECT" + "\nac.constraint_name," + "\nac.constraint_type," + "\nacc.column_name AS local_column," + "\nac.r_table_name AS remote_table," + "\nrcc.column_name AS remote_column," + "\nac.r_owner AS remote_owner," + "\nacc.position AS loc_pos," + "\nrcc.position AS rem_pos," + 
"\nac.search_condition," + "\nac.delete_rule" + "\nFROM dba_constraints ac" + "\nJOIN dba_cons_columns acc" + "\nON ac.owner = acc.owner" + "\nAND ac.constraint_name = acc.constraint_name" + "\nAND ac.table_name = acc.table_name" + "\nLEFT JOIN dba_cons_columns rcc" + "\nON ac.r_owner = rcc.owner" + "\nAND ac.r_constraint_name = rcc.constraint_name" + "\nAND acc.position = rcc.position" + "\nWHERE ac.table_name = :table_name" + "\nAND ac.constraint_type = 'R'" ) - text = text % {"dblink": dblink} + if schema is not None: + text += "\nAND ac.owner = :owner" + + text += "\nORDER BY constraint_name, loc_pos" + rp = self._inspector_instance.bind.execute(sql.text(text), params) - constraint_data = rp.fetchall() - return constraint_data + return rp.fetchall() def get_pk_constraint( self, table_name: str, schema: Optional[str] = None, dblink: str = "" ) -> Dict: - denormalized_table_name = self._inspector_instance.dialect.denormalize_name( - table_name - ) - assert denormalized_table_name - - schema = self._inspector_instance.dialect.denormalize_name( - schema or self.default_schema_name - ) - - if schema is None: - schema = self._inspector_instance.dialect.default_schema_name - pkeys = [] constraint_name = None - constraint_data = self._get_constraint_data( - denormalized_table_name, schema, dblink - ) - for row in constraint_data: - ( - cons_name, - cons_type, - local_column, - remote_table, - remote_column, - remote_owner, - ) = row[0:2] + tuple( - [self._inspector_instance.dialect.normalize_name(x) for x in row[2:6]] + try: + for row in self._get_constraint_data(table_name, schema, dblink): + if row[1] == "P": # constraint_type is 'P' for primary key + if constraint_name is None: + constraint_name = ( + self._inspector_instance.dialect.normalize_name(row[0]) + ) + col_name = self._inspector_instance.dialect.normalize_name( + row[2] + ) # local_column + pkeys.append(col_name) + except Exception as e: + self.report.warning( + title="Failed to Process Primary Keys", + 
message=(
+                    f"Unable to process primary key constraints for {schema}.{table_name}. "
+                    "Ensure SELECT access on DBA_CONSTRAINTS and DBA_CONS_COLUMNS."
+                ),
+                context=f"{schema}.{table_name}",
+                exc=e,
+            )
-            if cons_type == "P":
-                if constraint_name is None:
-                    constraint_name = self._inspector_instance.dialect.normalize_name(
-                        cons_name
-                    )
-                pkeys.append(local_column)
+            # Return empty constraint if we can't process it
+            return {"constrained_columns": [], "name": None}
 
         return {"constrained_columns": pkeys, "name": constraint_name}
 
@@ -527,6 +547,16 @@ def fkey_rec():
                 f"dba_cons_columns{dblink} - does the user have "
                 "proper rights to the table?"
             )
+            self.report.warning(
+                title="Missing Table Permissions",
+                message=(
+                    f"Unable to query table_name from dba_cons_columns{dblink}. "
+                    "This usually indicates insufficient permissions on the target table. "
+                    f"Foreign key relationships will not be detected for {schema}.{table_name}. "
+                    "Please ensure the user has SELECT privileges on dba_cons_columns."
+ ), + context=f"{schema}.{table_name}", + ) rec = fkeys[cons_name] rec["name"] = cons_name @@ -573,8 +603,8 @@ def get_view_definition( text = "SELECT text FROM dba_views WHERE view_name=:view_name" if schema is not None: - text += " AND owner = :schema" - params["schema"] = schema + params["owner"] = schema + text += "\nAND owner = :owner" rp = self._inspector_instance.bind.execute(sql.text(text), params).scalar() diff --git a/metadata-ingestion/tests/integration/oracle/golden_test_error_handling.json b/metadata-ingestion/tests/integration/oracle/golden_test_error_handling.json new file mode 100644 index 00000000000000..c542cb6a20190a --- /dev/null +++ b/metadata-ingestion/tests/integration/oracle/golden_test_error_handling.json @@ -0,0 +1,1386 @@ +[ +{ + "entityType": "container", + "entityUrn": "urn:li:container:0e497517e191d344b0c403231bc708d0", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "oracle", + "env": "PROD", + "database": "OraDoc" + }, + "name": "OraDoc", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "oracle-2022_02_03-07_00_00-pfauuo", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:0e497517e191d344b0c403231bc708d0", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "oracle-2022_02_03-07_00_00-pfauuo", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:0e497517e191d344b0c403231bc708d0", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:oracle" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "oracle-2022_02_03-07_00_00-pfauuo", + "lastRunId": "no-run-id-provided" + } +}, +{ + 
"entityType": "container", + "entityUrn": "urn:li:container:0e497517e191d344b0c403231bc708d0", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Database" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "oracle-2022_02_03-07_00_00-pfauuo", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:0e497517e191d344b0c403231bc708d0", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "oracle-2022_02_03-07_00_00-pfauuo", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:937a38ee28b69ecae38665c5e842d0ad", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:0e497517e191d344b0c403231bc708d0" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "oracle-2022_02_03-07_00_00-pfauuo", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:937a38ee28b69ecae38665c5e842d0ad", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "oracle", + "env": "PROD", + "database": "OraDoc", + "schema": "schema1" + }, + "name": "schema1", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "oracle-2022_02_03-07_00_00-pfauuo", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:937a38ee28b69ecae38665c5e842d0ad", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "oracle-2022_02_03-07_00_00-pfauuo", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": 
"container", + "entityUrn": "urn:li:container:937a38ee28b69ecae38665c5e842d0ad", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:oracle" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "oracle-2022_02_03-07_00_00-pfauuo", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:937a38ee28b69ecae38665c5e842d0ad", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Schema" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "oracle-2022_02_03-07_00_00-pfauuo", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:937a38ee28b69ecae38665c5e842d0ad", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:0e497517e191d344b0c403231bc708d0", + "urn": "urn:li:container:0e497517e191d344b0c403231bc708d0" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "oracle-2022_02_03-07_00_00-pfauuo", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:oracle,schema1.test1,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:937a38ee28b69ecae38665c5e842d0ad" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "oracle-2022_02_03-07_00_00-pfauuo", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:oracle,schema1.test1,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": {}, + 
"name": "test1", + "description": "Some mock comment here ...", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "schema1.test1", + "platform": "urn:li:dataPlatform:oracle", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "mock column name", + "nullable": true, + "description": "mock comment for column", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "NUMBER", + "recursive": false, + "isPartOfKey": true + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "oracle-2022_02_03-07_00_00-pfauuo", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:oracle,schema1.test1,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "oracle-2022_02_03-07_00_00-pfauuo", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:oracle,schema1.test1,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:0e497517e191d344b0c403231bc708d0", + "urn": "urn:li:container:0e497517e191d344b0c403231bc708d0" + }, + { + "id": "urn:li:container:937a38ee28b69ecae38665c5e842d0ad", + "urn": "urn:li:container:937a38ee28b69ecae38665c5e842d0ad" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "oracle-2022_02_03-07_00_00-pfauuo", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + 
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:oracle,schema1.test2,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:937a38ee28b69ecae38665c5e842d0ad" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "oracle-2022_02_03-07_00_00-pfauuo", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:oracle,schema1.test2,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": {}, + "name": "test2", + "description": "Some mock comment here ...", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "schema1.test2", + "platform": "urn:li:dataPlatform:oracle", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "mock column name", + "nullable": true, + "description": "mock comment for column", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "NUMBER", + "recursive": false, + "isPartOfKey": true + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "oracle-2022_02_03-07_00_00-pfauuo", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:oracle,schema1.test2,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": 
"oracle-2022_02_03-07_00_00-pfauuo", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:oracle,schema1.test2,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:0e497517e191d344b0c403231bc708d0", + "urn": "urn:li:container:0e497517e191d344b0c403231bc708d0" + }, + { + "id": "urn:li:container:937a38ee28b69ecae38665c5e842d0ad", + "urn": "urn:li:container:937a38ee28b69ecae38665c5e842d0ad" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "oracle-2022_02_03-07_00_00-pfauuo", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:oracle,schema1.view1,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:937a38ee28b69ecae38665c5e842d0ad" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "oracle-2022_02_03-07_00_00-pfauuo", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:oracle,schema1.view1,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "is_view": "True", + "view_definition": "CREATE VIEW mock_view AS\n SELECT\n mock_column1,\n mock_column2\n FROM mock_table" + }, + "name": "view1", + "description": "Some mock comment here ...", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "schema1.view1", + "platform": "urn:li:dataPlatform:oracle", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + 
"hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "mock column name", + "nullable": true, + "description": "mock comment for column", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "NUMBER", + "recursive": false, + "isPartOfKey": false + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "oracle-2022_02_03-07_00_00-pfauuo", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:oracle,schema1.view1,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "View" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "oracle-2022_02_03-07_00_00-pfauuo", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:oracle,schema1.view1,PROD)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "CREATE VIEW mock_view AS\n SELECT\n mock_column1,\n mock_column2\n FROM mock_table", + "viewLanguage": "SQL" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "oracle-2022_02_03-07_00_00-pfauuo", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:oracle,schema1.view1,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:0e497517e191d344b0c403231bc708d0", + "urn": "urn:li:container:0e497517e191d344b0c403231bc708d0" + }, + { + "id": "urn:li:container:937a38ee28b69ecae38665c5e842d0ad", + "urn": "urn:li:container:937a38ee28b69ecae38665c5e842d0ad" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": 
"oracle-2022_02_03-07_00_00-pfauuo", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:1965527855ae77f259a8ddea2b8eed2f", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:0e497517e191d344b0c403231bc708d0" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "oracle-2022_02_03-07_00_00-pfauuo", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:1965527855ae77f259a8ddea2b8eed2f", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "oracle", + "env": "PROD", + "database": "OraDoc", + "schema": "schema2" + }, + "name": "schema2", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "oracle-2022_02_03-07_00_00-pfauuo", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:1965527855ae77f259a8ddea2b8eed2f", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "oracle-2022_02_03-07_00_00-pfauuo", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:1965527855ae77f259a8ddea2b8eed2f", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:oracle" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "oracle-2022_02_03-07_00_00-pfauuo", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:1965527855ae77f259a8ddea2b8eed2f", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Schema" + ] + } + }, + "systemMetadata": { + "lastObserved": 
1643871600000, + "runId": "oracle-2022_02_03-07_00_00-pfauuo", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:1965527855ae77f259a8ddea2b8eed2f", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:0e497517e191d344b0c403231bc708d0", + "urn": "urn:li:container:0e497517e191d344b0c403231bc708d0" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "oracle-2022_02_03-07_00_00-pfauuo", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:oracle,schema2.test3,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:1965527855ae77f259a8ddea2b8eed2f" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "oracle-2022_02_03-07_00_00-pfauuo", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:oracle,schema2.test3,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": {}, + "name": "test3", + "description": "Some mock comment here ...", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "schema2.test3", + "platform": "urn:li:dataPlatform:oracle", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "mock column name", + "nullable": true, + "description": "mock comment for column", + "type": { + "type": 
{ + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "NUMBER", + "recursive": false, + "isPartOfKey": true + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "oracle-2022_02_03-07_00_00-pfauuo", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:oracle,schema2.test3,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "oracle-2022_02_03-07_00_00-pfauuo", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:oracle,schema2.test3,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:0e497517e191d344b0c403231bc708d0", + "urn": "urn:li:container:0e497517e191d344b0c403231bc708d0" + }, + { + "id": "urn:li:container:1965527855ae77f259a8ddea2b8eed2f", + "urn": "urn:li:container:1965527855ae77f259a8ddea2b8eed2f" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "oracle-2022_02_03-07_00_00-pfauuo", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:oracle,schema2.test4,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:1965527855ae77f259a8ddea2b8eed2f" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "oracle-2022_02_03-07_00_00-pfauuo", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:oracle,schema2.test4,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + 
{ + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": {}, + "name": "test4", + "description": "Some mock comment here ...", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "schema2.test4", + "platform": "urn:li:dataPlatform:oracle", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "mock column name", + "nullable": true, + "description": "mock comment for column", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "NUMBER", + "recursive": false, + "isPartOfKey": true + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "oracle-2022_02_03-07_00_00-pfauuo", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:oracle,schema2.test4,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "oracle-2022_02_03-07_00_00-pfauuo", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:oracle,schema2.test4,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:0e497517e191d344b0c403231bc708d0", + "urn": "urn:li:container:0e497517e191d344b0c403231bc708d0" + }, + { + "id": "urn:li:container:1965527855ae77f259a8ddea2b8eed2f", + "urn": "urn:li:container:1965527855ae77f259a8ddea2b8eed2f" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": 
"oracle-2022_02_03-07_00_00-pfauuo", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:oracle,schema2.view1,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:1965527855ae77f259a8ddea2b8eed2f" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "oracle-2022_02_03-07_00_00-pfauuo", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:oracle,schema2.view1,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "is_view": "True", + "view_definition": "CREATE VIEW mock_view AS\n SELECT\n mock_column1,\n mock_column2\n FROM mock_table" + }, + "name": "view1", + "description": "Some mock comment here ...", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "schema2.view1", + "platform": "urn:li:dataPlatform:oracle", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "mock column name", + "nullable": true, + "description": "mock comment for column", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "NUMBER", + "recursive": false, + "isPartOfKey": false + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "oracle-2022_02_03-07_00_00-pfauuo", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:oracle,schema2.view1,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "View" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "oracle-2022_02_03-07_00_00-pfauuo", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:oracle,schema2.view1,PROD)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "CREATE VIEW mock_view AS\n SELECT\n mock_column1,\n mock_column2\n FROM mock_table", + "viewLanguage": "SQL" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "oracle-2022_02_03-07_00_00-pfauuo", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:oracle,schema2.view1,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:0e497517e191d344b0c403231bc708d0", + "urn": "urn:li:container:0e497517e191d344b0c403231bc708d0" + }, + { + "id": "urn:li:container:1965527855ae77f259a8ddea2b8eed2f", + "urn": "urn:li:container:1965527855ae77f259a8ddea2b8eed2f" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "oracle-2022_02_03-07_00_00-pfauuo", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:oracle,schema1.view1,PROD)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "json": { + "upstreams": [ + { + "auditStamp": { + "time": 1643871600000, + "actor": "urn:li:corpuser:_ingestion" + }, + "created": { + "time": 0, + "actor": "urn:li:corpuser:_ingestion" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:oracle,mock_table,PROD)", + "type": "VIEW", + "query": 
"urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aoracle%2Cschema1.view1%2CPROD%29" + } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:oracle,mock_table,PROD),MOCK_COLUMN1)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:oracle,schema1.view1,PROD),MOCK_COLUMN1)" + ], + "confidenceScore": 0.2, + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aoracle%2Cschema1.view1%2CPROD%29" + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:oracle,mock_table,PROD),MOCK_COLUMN2)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:oracle,schema1.view1,PROD),MOCK_COLUMN2)" + ], + "confidenceScore": 0.2, + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aoracle%2Cschema1.view1%2CPROD%29" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "oracle-2022_02_03-07_00_00-pfauuo", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aoracle%2Cschema1.view1%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "queryProperties", + "aspect": { + "json": { + "statement": { + "value": "CREATE VIEW mock_view AS\nSELECT\n mock_column1,\n mock_column2\nFROM mock_table", + "language": "SQL" + }, + "source": "SYSTEM", + "created": { + "time": 0, + "actor": "urn:li:corpuser:_ingestion" + }, + "lastModified": { + "time": 1643871600000, + "actor": "urn:li:corpuser:_ingestion" + } + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "oracle-2022_02_03-07_00_00-pfauuo", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": 
"urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aoracle%2Cschema1.view1%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "querySubjects", + "aspect": { + "json": { + "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:oracle,mock_table,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:oracle,mock_table,PROD),MOCK_COLUMN1)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:oracle,mock_table,PROD),MOCK_COLUMN2)" + }, + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:oracle,schema1.view1,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:oracle,schema1.view1,PROD),MOCK_COLUMN1)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:oracle,schema1.view1,PROD),MOCK_COLUMN2)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "oracle-2022_02_03-07_00_00-pfauuo", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aoracle%2Cschema1.view1%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:oracle" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "oracle-2022_02_03-07_00_00-pfauuo", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:oracle,schema2.view1,PROD)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "json": { + "upstreams": [ + { + "auditStamp": { + "time": 1643871600000, + "actor": "urn:li:corpuser:_ingestion" + }, + "created": { + "time": 0, + "actor": "urn:li:corpuser:_ingestion" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:oracle,mock_table,PROD)", + "type": "VIEW", + "query": 
"urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aoracle%2Cschema2.view1%2CPROD%29" + } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:oracle,mock_table,PROD),MOCK_COLUMN1)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:oracle,schema2.view1,PROD),MOCK_COLUMN1)" + ], + "confidenceScore": 0.2, + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aoracle%2Cschema2.view1%2CPROD%29" + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:oracle,mock_table,PROD),MOCK_COLUMN2)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:oracle,schema2.view1,PROD),MOCK_COLUMN2)" + ], + "confidenceScore": 0.2, + "query": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aoracle%2Cschema2.view1%2CPROD%29" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "oracle-2022_02_03-07_00_00-pfauuo", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aoracle%2Cschema2.view1%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "queryProperties", + "aspect": { + "json": { + "statement": { + "value": "CREATE VIEW mock_view AS\nSELECT\n mock_column1,\n mock_column2\nFROM mock_table", + "language": "SQL" + }, + "source": "SYSTEM", + "created": { + "time": 0, + "actor": "urn:li:corpuser:_ingestion" + }, + "lastModified": { + "time": 1643871600000, + "actor": "urn:li:corpuser:_ingestion" + } + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "oracle-2022_02_03-07_00_00-pfauuo", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": 
"urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aoracle%2Cschema2.view1%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "querySubjects", + "aspect": { + "json": { + "subjects": [ + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:oracle,mock_table,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:oracle,mock_table,PROD),MOCK_COLUMN1)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:oracle,mock_table,PROD),MOCK_COLUMN2)" + }, + { + "entity": "urn:li:dataset:(urn:li:dataPlatform:oracle,schema2.view1,PROD)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:oracle,schema2.view1,PROD),MOCK_COLUMN1)" + }, + { + "entity": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:oracle,schema2.view1,PROD),MOCK_COLUMN2)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "oracle-2022_02_03-07_00_00-pfauuo", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aoracle%2Cschema2.view1%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:oracle" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "oracle-2022_02_03-07_00_00-pfauuo", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aoracle%2Cschema1.view1%2CPROD%29", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "oracle-2022_02_03-07_00_00-pfauuo", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "query", + "entityUrn": "urn:li:query:view_urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aoracle%2Cschema2.view1%2CPROD%29", + 
"changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "oracle-2022_02_03-07_00_00-pfauuo", + "lastRunId": "no-run-id-provided" + } +} +] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/oracle/test_oracle.py b/metadata-ingestion/tests/integration/oracle/test_oracle.py index 4541bb8ac65bff..48f6fd596c9623 100644 --- a/metadata-ingestion/tests/integration/oracle/test_oracle.py +++ b/metadata-ingestion/tests/integration/oracle/test_oracle.py @@ -1,14 +1,28 @@ +from typing import Any from unittest import mock -from unittest.mock import MagicMock +from unittest.mock import MagicMock, patch import pytest from freezegun import freeze_time +from sqlalchemy import exc -from tests.integration.oracle.common import OracleTestCaseBase # type: ignore +from datahub.ingestion.api.source import StructuredLogLevel +from datahub.ingestion.source.sql.oracle import OracleInspectorObjectWrapper +from tests.integration.oracle.common import ( # type: ignore[import-untyped] + OracleSourceMockDataBase, + OracleTestCaseBase, +) FROZEN_TIME = "2022-02-03 07:00:00" +class OracleErrorHandlingMockData(OracleSourceMockDataBase): + def get_data(self, *args: Any, **kwargs: Any) -> Any: + if isinstance(args[0], str) and "sys_context" in args[0]: + raise exc.DatabaseError("statement", [], "Mock DB Error") + return super().get_data(*args, **kwargs) + + class OracleIntegrationTestCase(OracleTestCaseBase): def apply_mock_data(self, mock_create_engine, mock_inspect, mock_event): mock_event.listen.return_value = None @@ -37,9 +51,104 @@ def apply(self, mock_create_engine, mock_inspect, mock_event): super().apply() +class TestOracleSourceErrorHandling(OracleIntegrationTestCase): + def __init__(self, pytestconfig, tmp_path): + super().__init__( + pytestconfig=pytestconfig, + tmp_path=tmp_path, + golden_file_name="golden_test_error_handling.json", + 
output_file_name="oracle_mce_output_error_handling.json", + add_database_name_to_urn=False, + ) + self.default_mock_data = OracleErrorHandlingMockData() + + def test_get_db_name_error_handling(self): + inspector = MagicMock() + inspector.bind.execute.side_effect = exc.DatabaseError( + "statement", [], "Mock DB Error" + ) + inspector_wrapper = OracleInspectorObjectWrapper(inspector) + + db_name = inspector_wrapper.get_db_name() + + assert db_name == "" + assert len(inspector_wrapper.report.failures) == 1 + error = inspector_wrapper.report.failures[0] + assert error.impact.name == StructuredLogLevel.ERROR.name + assert error.message == "database_fetch_error" + + def test_get_pk_constraint_error_handling(self): + inspector = MagicMock() + inspector.dialect.normalize_name.side_effect = lambda x: x + inspector.dialect.denormalize_name.side_effect = lambda x: x + inspector_wrapper = OracleInspectorObjectWrapper(inspector) + + with patch.object( + inspector_wrapper, "_get_constraint_data" + ) as mock_get_constraint: + mock_get_constraint.side_effect = Exception("Mock constraint error") + + result = inspector_wrapper.get_pk_constraint("test_table", "test_schema") + + assert result == {"constrained_columns": [], "name": None} + assert len(inspector_wrapper.report.failures) == 1 + error = inspector_wrapper.report.failures[0] + assert error.impact.name == StructuredLogLevel.ERROR.name + assert "Error processing primary key constraints" in error.message + + def test_get_foreign_keys_missing_table_warning(self): + inspector = MagicMock() + inspector.dialect.normalize_name.side_effect = lambda x: x + inspector.dialect.denormalize_name.side_effect = lambda x: x + inspector_wrapper = OracleInspectorObjectWrapper(inspector) + + mock_data = [ + ( + "FK1", + "R", + "local_col", + None, + "remote_col", + "remote_owner", + 1, + 1, + None, + "NO ACTION", + ) + ] + + with patch.object( + inspector_wrapper, "_get_constraint_data" + ) as mock_get_constraint: + 
mock_get_constraint.return_value = mock_data + + inspector_wrapper.get_foreign_keys("test_table", "test_schema") + + assert len(inspector_wrapper.report.warnings) == 1 + warning = inspector_wrapper.report.warnings[0] + assert warning.message == "Unable to query table_name from dba_cons_columns" + + def test_get_table_comment_with_cast(self): + inspector = MagicMock() + inspector.dialect.normalize_name.side_effect = lambda x: x + inspector.dialect.denormalize_name.side_effect = lambda x: x + inspector_wrapper = OracleInspectorObjectWrapper(inspector) + + mock_comment = "Test table comment" + inspector.bind.execute.return_value.scalar.return_value = mock_comment + + result = inspector_wrapper.get_table_comment("test_table", "test_schema") + + assert result == {"text": mock_comment} + execute_args = inspector.bind.execute.call_args[0] + sql_text = str(execute_args[0]) + assert "CAST(:table_name AS VARCHAR(128))" in sql_text + assert "CAST(:schema_name AS VARCHAR(128))" in sql_text + + @freeze_time(FROZEN_TIME) @pytest.mark.integration -def test_oracle_source_integration_with_out_database(pytestconfig, tmp_path, mock_time): +def test_oracle_source_integration_with_out_database(pytestconfig, tmp_path): oracle_source_integration_test = OracleIntegrationTestCase( pytestconfig=pytestconfig, tmp_path=tmp_path, @@ -52,7 +161,7 @@ def test_oracle_source_integration_with_out_database(pytestconfig, tmp_path, moc @freeze_time(FROZEN_TIME) @pytest.mark.integration -def test_oracle_source_integration_with_database(pytestconfig, tmp_path, mock_time): +def test_oracle_source_integration_with_database(pytestconfig, tmp_path): oracle_source_integration_test = OracleIntegrationTestCase( pytestconfig=pytestconfig, tmp_path=tmp_path, @@ -61,3 +170,13 @@ def test_oracle_source_integration_with_database(pytestconfig, tmp_path, mock_ti add_database_name_to_urn=True, ) oracle_source_integration_test.apply() + + +@freeze_time(FROZEN_TIME) +@pytest.mark.integration +def 
test_oracle_source_error_handling(pytestconfig, tmp_path): + test_case = TestOracleSourceErrorHandling( + pytestconfig=pytestconfig, + tmp_path=tmp_path, + ) + test_case.apply()