From 389a404bf1e220f7b22b6fa1cd9053be685a2442 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Sat, 22 Feb 2025 11:35:43 -0800 Subject: [PATCH 01/45] chore(ingest): bump sqlglot version to latest (#12696) --- metadata-ingestion/setup.py | 6 +++--- .../src/datahub/sql_parsing/_sqlglot_patch.py | 12 ++---------- .../src/datahub/utilities/unified_diff.py | 6 +++++- 3 files changed, 10 insertions(+), 14 deletions(-) diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index 9a421fc92f2e4..a1750e11212b8 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -99,9 +99,9 @@ sqlglot_lib = { # We heavily monkeypatch sqlglot. - # Prior to the patching, we originally maintained an acryl-sqlglot fork: - # https://github.com/tobymao/sqlglot/compare/main...hsheth2:sqlglot:main?expand=1 - "sqlglot[rs]==25.32.1", + # We used to maintain an acryl-sqlglot fork: https://github.com/tobymao/sqlglot/compare/main...hsheth2:sqlglot:main?expand=1 + # but no longer do. + "sqlglot[rs]==26.6.0", "patchy==2.8.0", } diff --git a/metadata-ingestion/src/datahub/sql_parsing/_sqlglot_patch.py b/metadata-ingestion/src/datahub/sql_parsing/_sqlglot_patch.py index 55f30b576b44e..af1a8865497d8 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/_sqlglot_patch.py +++ b/metadata-ingestion/src/datahub/sql_parsing/_sqlglot_patch.py @@ -172,17 +172,9 @@ class Node(sqlglot.lineage.Node): derived_tables = [ source.expression.parent for source in scope.sources.values() -@@ -254,6 +257,7 @@ def to_node( - if dt.comments and dt.comments[0].startswith("source: ") - } - -+ c: exp.Column - for c in source_columns: - table = c.table - source = scope.sources.get(table) @@ -281,8 +285,21 @@ def to_node( - # it means this column's lineage is unknown. This can happen if the definition of a source used in a query - # is not passed into the `sources` map. + # is unknown. This can happen if the definition of a source used in a query is not + # passed into the `sources` map. 
source = source or exp.Placeholder() + + subfields = [] diff --git a/metadata-ingestion/src/datahub/utilities/unified_diff.py b/metadata-ingestion/src/datahub/utilities/unified_diff.py index c896fd4df4d8f..2512a7fd6c5ab 100644 --- a/metadata-ingestion/src/datahub/utilities/unified_diff.py +++ b/metadata-ingestion/src/datahub/utilities/unified_diff.py @@ -2,8 +2,12 @@ from dataclasses import dataclass from typing import List, Tuple +from datahub.cli.env_utils import get_boolean_env_variable + +_debug_diff = get_boolean_env_variable("DATAHUB_DEBUG_DIFF_PATCHER") + logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) +logger.setLevel(logging.DEBUG if _debug_diff else logging.INFO) _LOOKAROUND_LINES = 300 From 38e4cfdf2c8b3392b72fc31574932758e78c06a7 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Sat, 22 Feb 2025 16:54:59 -0800 Subject: [PATCH 02/45] chore(docs): disable wheel builds in docs site (#12709) --- docs-website/.gitignore | 1 + docs-website/build.gradle | 1 - docs-website/docusaurus.config.js | 2 +- docs-website/generateDocsDir.ts | 23 ----------------------- docs-website/package.json | 4 ++-- 5 files changed, 4 insertions(+), 27 deletions(-) diff --git a/docs-website/.gitignore b/docs-website/.gitignore index 482a7644292ae..8befa30d04b8a 100644 --- a/docs-website/.gitignore +++ b/docs-website/.gitignore @@ -3,6 +3,7 @@ /docs /genDocs /genStatic +# note that genStatic is no longer used # Generated GraphQL /graphql/combined.graphql diff --git a/docs-website/build.gradle b/docs-website/build.gradle index 1be790695e87e..1cc02977e58e9 100644 --- a/docs-website/build.gradle +++ b/docs-website/build.gradle @@ -83,7 +83,6 @@ task yarnInstall(type: YarnTask) { task yarnGenerate(type: YarnTask, dependsOn: [yarnInstall, generateGraphQLSchema, generateJsonSchema, ':metadata-ingestion:modelDocGen', ':metadata-ingestion:docGen', - ':python-build:buildWheels', ]) { inputs.files(projectMdFiles) outputs.cacheIf { true } diff --git 
a/docs-website/docusaurus.config.js b/docs-website/docusaurus.config.js index 4349c9b0bc44a..a5f794065e698 100644 --- a/docs-website/docusaurus.config.js +++ b/docs-website/docusaurus.config.js @@ -11,7 +11,7 @@ module.exports = { favicon: "img/favicon.ico", organizationName: "datahub-project", // Usually your GitHub org/user name. projectName: "datahub", // Usually your repo name. - staticDirectories: ["static", "genStatic"], + staticDirectories: ["static"], stylesheets: ["https://fonts.googleapis.com/css2?family=Manrope:wght@400;500;700&display=swap"], headTags: [ { diff --git a/docs-website/generateDocsDir.ts b/docs-website/generateDocsDir.ts index 3a14baee073c2..0b7a0a780cce1 100644 --- a/docs-website/generateDocsDir.ts +++ b/docs-website/generateDocsDir.ts @@ -16,7 +16,6 @@ const GITHUB_BROWSE_URL = "https://github.com/datahub-project/datahub/blob/master"; const OUTPUT_DIRECTORY = "docs"; -const STATIC_DIRECTORY = "genStatic/artifacts"; const SIDEBARS_DEF_PATH = "./sidebars.js"; const sidebars = require(SIDEBARS_DEF_PATH); @@ -607,25 +606,6 @@ function write_markdown_file( } } -function copy_python_wheels(): void { - // Copy the built wheel files to the static directory. - // Everything is copied to the python-build directory first, so - // we just need to copy from there. - const wheel_dir = "../python-build/wheels"; - - const wheel_output_directory = path.join(STATIC_DIRECTORY, "wheels"); - fs.mkdirSync(wheel_output_directory, { recursive: true }); - - const wheel_files = fs.readdirSync(wheel_dir); - for (const wheel_file of wheel_files) { - const src = path.join(wheel_dir, wheel_file); - const dest = path.join(wheel_output_directory, wheel_file); - - // console.log(`Copying artifact ${src} to ${dest}...`); - fs.copyFileSync(src, dest); - } -} - (async function main() { for (const filepath of markdown_files) { //console.log("Processing:", filepath); @@ -680,8 +660,5 @@ function copy_python_wheels(): void { ); } } - - // Generate static directory. 
- copy_python_wheels(); // TODO: copy over the source json schemas + other artifacts. })(); diff --git a/docs-website/package.json b/docs-website/package.json index 52f04e46c42aa..258196e57621f 100644 --- a/docs-website/package.json +++ b/docs-website/package.json @@ -14,8 +14,8 @@ "_generate-python-sdk": "cd sphinx && make md", "_generate-docs-dir-script": "ts-node -O '{ \"lib\": [\"es2020\"], \"target\": \"es6\" }' generateDocsDir.ts", "_generate-docs": "rm -rf docs && mkdir docs && yarn _generate-graphql && yarn _generate-python-sdk && yarn run _generate-docs-dir-script", - "generate": "rm -rf genDocs genStatic && mkdir genDocs genStatic && yarn _generate-docs && mv docs/* genDocs/ && rmdir docs", - "generate-rsync": "mkdir -p genDocs genStatic && yarn _generate-docs && rsync -v --checksum -r -h -i --delete docs/ genDocs && rm -rf docs", + "generate": "rm -rf genDocs && mkdir genDocs && yarn _generate-docs && mv docs/* genDocs/ && rmdir docs", + "generate-rsync": "mkdir -p genDocs && yarn _generate-docs && rsync -v --checksum -r -h -i --delete docs/ genDocs && rm -rf docs", "lint": "prettier -w generateDocsDir.ts sidebars.js src/pages/index.js", "lint-check": "prettier -l generateDocsDir.ts sidebars.js src/pages/index.js", "lint-fix": "prettier --write generateDocsDir.ts sidebars.js src/pages/index.js", From 7bee19c834fb4b46339875fa711712b6d488900a Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Sat, 22 Feb 2025 16:55:07 -0800 Subject: [PATCH 03/45] chore(docs): archive v0.14.1 docs (#12708) --- .../DocsVersionDropdownNavbarItem.js | 77 ++++++++----------- docs-website/versions.json | 3 +- 2 files changed, 35 insertions(+), 45 deletions(-) diff --git a/docs-website/src/theme/NavbarItem/DocsVersionDropdownNavbarItem.js b/docs-website/src/theme/NavbarItem/DocsVersionDropdownNavbarItem.js index c9e5098a016c2..150af40073270 100644 --- a/docs-website/src/theme/NavbarItem/DocsVersionDropdownNavbarItem.js +++ 
b/docs-website/src/theme/NavbarItem/DocsVersionDropdownNavbarItem.js @@ -11,6 +11,37 @@ import styles from "./styles.module.scss"; const getVersionMainDoc = (version) => version.docs.find((doc) => doc.id === version.mainDocId); +const archivedVersionLinks = [ + { + label: '0.14.1', + to: 'https://docs-website-8jkm4uler-acryldata.vercel.app/docs/0.14.1/features', + }, + { + label: '0.14.0', + to: 'https://docs-website-eue2qafvn-acryldata.vercel.app/docs/features', + }, + { + label: '0.13.1', + to: 'https://docs-website-psat3nzgi-acryldata.vercel.app/docs/features', + }, + { + label: '0.13.0', + to: 'https://docs-website-lzxh86531-acryldata.vercel.app/docs/features', + }, + { + label: '0.12.1', + to: 'https://docs-website-2uuxmgza2-acryldata.vercel.app/docs/features', + }, + { + label: '0.11.0', + to: 'https://docs-website-irpoe2osc-acryldata.vercel.app/docs/features', + }, + { + label: '0.10.5', + to: 'https://docs-website-1gv2yzn9d-acryldata.vercel.app/docs/features', + }, +] + export default function DocsVersionDropdownNavbarItem({ mobile, docsPluginId = 'default', @@ -42,54 +73,14 @@ export default function DocsVersionDropdownNavbarItem({ type: 'html', value: '', }, - { - value: ` - 0.14.0 - - - `, - type: "html", - }, - { - value: ` - 0.13.1 - - - `, - type: "html", - }, - { - value: ` - 0.13.0 - - - `, - type: "html", - }, - { + ...archivedVersionLinks.map((version) => ({ value: ` - 0.12.1 + ${version.label} `, type: "html", - }, - { - value: ` - 0.11.0 - - - `, - type: "html", - }, - { - value: ` - 0.10.5 - - - `, - type: "html", - }, + })), ]; const items = [...dropdownItemsBefore, ...versionLinks, ...archivedVersions, ...dropdownItemsAfter]; diff --git a/docs-website/versions.json b/docs-website/versions.json index ab242fa47dce5..bb4ea391dc624 100644 --- a/docs-website/versions.json +++ b/docs-website/versions.json @@ -1,4 +1,3 @@ [ - "0.15.0", - "0.14.1" + "0.15.0" ] From d1494c22521d7fbb7101427ccb90aca7108eecc0 Mon Sep 17 00:00:00 2001 From: Aseem Bansal 
Date: Mon, 24 Feb 2025 14:07:57 +0530 Subject: [PATCH 04/45] dev: upgrade ruff, remove 2 ruff ignore rules (#12702) --- metadata-ingestion-modules/airflow-plugin/setup.py | 2 +- metadata-ingestion-modules/dagster-plugin/setup.py | 2 +- metadata-ingestion-modules/gx-plugin/setup.py | 2 +- metadata-ingestion-modules/prefect-plugin/setup.py | 2 +- metadata-ingestion/pyproject.toml | 2 -- metadata-ingestion/setup.py | 2 +- .../api/source_helpers/test_incremental_lineage_helper.py | 4 +++- .../tests/unit/api/source_helpers/test_source_helpers.py | 8 ++++++-- metadata-ingestion/tests/unit/test_iceberg.py | 4 ++-- metadata-service/iceberg-catalog/requirements.txt | 2 +- smoke-test/requirements.txt | 2 +- 11 files changed, 18 insertions(+), 14 deletions(-) diff --git a/metadata-ingestion-modules/airflow-plugin/setup.py b/metadata-ingestion-modules/airflow-plugin/setup.py index d03ed824c9a26..74714b1a71840 100644 --- a/metadata-ingestion-modules/airflow-plugin/setup.py +++ b/metadata-ingestion-modules/airflow-plugin/setup.py @@ -72,7 +72,7 @@ def get_long_description(): *base_requirements, *mypy_stubs, "coverage>=5.1", - "ruff==0.9.2", + "ruff==0.9.7", "mypy==1.10.1", # pydantic 1.8.2 is incompatible with mypy 0.910. # See https://github.com/samuelcolvin/pydantic/pull/3175#issuecomment-995382910. diff --git a/metadata-ingestion-modules/dagster-plugin/setup.py b/metadata-ingestion-modules/dagster-plugin/setup.py index 8de25f59edbf6..b472dc8f3bb41 100644 --- a/metadata-ingestion-modules/dagster-plugin/setup.py +++ b/metadata-ingestion-modules/dagster-plugin/setup.py @@ -54,7 +54,7 @@ def get_long_description(): "dagster-snowflake >= 0.11.0", "dagster-snowflake-pandas >= 0.11.0", "coverage>=5.1", - "ruff==0.9.2", + "ruff==0.9.7", "mypy>=1.4.0", # pydantic 1.8.2 is incompatible with mypy 0.910. # See https://github.com/samuelcolvin/pydantic/pull/3175#issuecomment-995382910. 
diff --git a/metadata-ingestion-modules/gx-plugin/setup.py b/metadata-ingestion-modules/gx-plugin/setup.py index 43495673a7ff1..f362e91fc980f 100644 --- a/metadata-ingestion-modules/gx-plugin/setup.py +++ b/metadata-ingestion-modules/gx-plugin/setup.py @@ -59,7 +59,7 @@ def get_long_description(): *base_requirements, *mypy_stubs, "coverage>=5.1", - "ruff==0.9.1", + "ruff==0.9.7", "mypy>=1.4.0", # pydantic 1.8.2 is incompatible with mypy 0.910. # See https://github.com/samuelcolvin/pydantic/pull/3175#issuecomment-995382910. diff --git a/metadata-ingestion-modules/prefect-plugin/setup.py b/metadata-ingestion-modules/prefect-plugin/setup.py index 87feb810b8e5a..f83b90e32266c 100644 --- a/metadata-ingestion-modules/prefect-plugin/setup.py +++ b/metadata-ingestion-modules/prefect-plugin/setup.py @@ -56,7 +56,7 @@ def get_long_description(): *base_requirements, *mypy_stubs, "coverage>=5.1", - "ruff==0.9.2", + "ruff==0.9.7", "mypy>=1.4.0", # pydantic 1.8.2 is incompatible with mypy 0.910. # See https://github.com/samuelcolvin/pydantic/pull/3175#issuecomment-995382910. 
diff --git a/metadata-ingestion/pyproject.toml b/metadata-ingestion/pyproject.toml index c3ec6cb830dc0..c309926dd9689 100644 --- a/metadata-ingestion/pyproject.toml +++ b/metadata-ingestion/pyproject.toml @@ -36,14 +36,12 @@ extend-ignore = [ "E111", # Handled by formatter "E114", # Handled by formatter "E117", # Handled by formatter - "E203", # Ignore whitespace before ':' (matches Black) "B019", # Allow usages of functools.lru_cache "B008", # Allow function call in argument defaults "RUF012", # mutable-class-default; incompatible with pydantic "RUF015", # unnecessary-iterable-allocation-for-first-element # TODO: Enable these later "B006", # Mutable args - "B017", # Do not assert blind exception "B904", # Checks for raise statements in exception handlers that lack a from clause ] diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index a1750e11212b8..3bf8917f23f89 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -598,7 +598,7 @@ lint_requirements = { # This is pinned only to avoid spurious errors in CI. # We should make an effort to keep it up to date. 
- "ruff==0.9.2", + "ruff==0.9.7", "mypy==1.10.1", } diff --git a/metadata-ingestion/tests/unit/api/source_helpers/test_incremental_lineage_helper.py b/metadata-ingestion/tests/unit/api/source_helpers/test_incremental_lineage_helper.py index 6dd2332fec2ba..adac19acd6f2a 100644 --- a/metadata-ingestion/tests/unit/api/source_helpers/test_incremental_lineage_helper.py +++ b/metadata-ingestion/tests/unit/api/source_helpers/test_incremental_lineage_helper.py @@ -25,13 +25,15 @@ def make_lineage_aspect( dataset_name: str, upstreams: List[str], timestamp: int = 0, - columns: List[str] = [], + columns: Optional[List[str]] = None, include_cll: bool = False, ) -> models.UpstreamLineageClass: """ Generates dataset properties and upstream lineage aspects with simple column to column lineage between current dataset and all upstreams """ + if not columns: + columns = [] dataset_urn = make_dataset_urn(platform, dataset_name) return models.UpstreamLineageClass( diff --git a/metadata-ingestion/tests/unit/api/source_helpers/test_source_helpers.py b/metadata-ingestion/tests/unit/api/source_helpers/test_source_helpers.py index cdfd24554e5e5..4a33a07a7f6e3 100644 --- a/metadata-ingestion/tests/unit/api/source_helpers/test_source_helpers.py +++ b/metadata-ingestion/tests/unit/api/source_helpers/test_source_helpers.py @@ -1,6 +1,6 @@ import logging from datetime import datetime -from typing import List, Union +from typing import List, Optional, Union import pytest from freezegun import freeze_time @@ -269,7 +269,11 @@ def test_auto_empty_dataset_usage_statistics_invalid_timestamp( ] -def get_sample_mcps(mcps_to_append: List = []) -> List[MetadataChangeProposalWrapper]: +def get_sample_mcps( + mcps_to_append: Optional[List] = None, +) -> List[MetadataChangeProposalWrapper]: + if not mcps_to_append: + mcps_to_append = [] mcps = [ MetadataChangeProposalWrapper( entityUrn="urn:li:dataset:(urn:li:dataPlatform:dbt,abc.foo.bar,PROD)", diff --git a/metadata-ingestion/tests/unit/test_iceberg.py 
b/metadata-ingestion/tests/unit/test_iceberg.py index 48524450caf36..12a7228ab792c 100644 --- a/metadata-ingestion/tests/unit/test_iceberg.py +++ b/metadata-ingestion/tests/unit/test_iceberg.py @@ -566,7 +566,7 @@ def list_tables(self, namespace: str) -> Iterable[Tuple[str, str]]: class MockCatalogExceptionListingNamespaces(MockCatalog): def list_namespaces(self) -> Iterable[Tuple[str]]: - raise Exception() + raise Exception("Test exception") def test_exception_while_listing_namespaces() -> None: @@ -574,7 +574,7 @@ def test_exception_while_listing_namespaces() -> None: mock_catalog = MockCatalogExceptionListingNamespaces({}) with patch( "datahub.ingestion.source.iceberg.iceberg.IcebergSourceConfig.get_catalog" - ) as get_catalog, pytest.raises(Exception): + ) as get_catalog, pytest.raises(Exception, match="Test exception"): get_catalog.return_value = mock_catalog [*source.get_workunits_internal()] diff --git a/metadata-service/iceberg-catalog/requirements.txt b/metadata-service/iceberg-catalog/requirements.txt index 76f97b8061020..ef28e48091048 100644 --- a/metadata-service/iceberg-catalog/requirements.txt +++ b/metadata-service/iceberg-catalog/requirements.txt @@ -4,5 +4,5 @@ pyspark==3.5.3 -e ../../metadata-ingestion[iceberg-catalog] # libaries for linting below this mypy==1.5.1 -ruff==0.9.6 +ruff==0.9.7 diff --git a/smoke-test/requirements.txt b/smoke-test/requirements.txt index f1fbdac68f067..71c9d59ffae70 100644 --- a/smoke-test/requirements.txt +++ b/smoke-test/requirements.txt @@ -10,7 +10,7 @@ pytest-xdist networkx # libaries for linting below this mypy==1.5.1 -ruff==0.9.2 +ruff==0.9.7 # stub version are copied from metadata-ingestion/setup.py and that should be the source of truth types-requests>=2.28.11.6,<=2.31.0.3 types-PyYAML From 16ef1ac1749952faffe07be30424cb3b61ea9a15 Mon Sep 17 00:00:00 2001 From: david-leifker <114954101+david-leifker@users.noreply.github.com> Date: Mon, 24 Feb 2025 06:30:45 -0600 Subject: [PATCH 05/45] feat(validation): enable 
validation trim options (#12712) --- .../entity/ebean/batch/ChangeItemImpl.java | 2 +- .../entity/ebean/batch/MCLItemImpl.java | 4 +- .../entity/validation/ValidationApiUtils.java | 26 +-- .../linkedin/metadata/entity/EntityUtils.java | 2 +- .../metadata/entity/ValidationUtilsTest.java | 2 + .../resources/entity/EntityResource.java | 6 +- metadata-utils/build.gradle | 2 + .../utils/RecordTemplateValidator.java | 38 +++- .../utils/RecordTemplateValidatorTest.java | 208 ++++++++++++++++++ 9 files changed, 268 insertions(+), 22 deletions(-) create mode 100644 metadata-utils/src/test/java/com/linkedin/metadata/utils/RecordTemplateValidatorTest.java diff --git a/metadata-io/metadata-io-api/src/main/java/com/linkedin/metadata/entity/ebean/batch/ChangeItemImpl.java b/metadata-io/metadata-io-api/src/main/java/com/linkedin/metadata/entity/ebean/batch/ChangeItemImpl.java index 2b16982407f29..2415870585e4e 100644 --- a/metadata-io/metadata-io-api/src/main/java/com/linkedin/metadata/entity/ebean/batch/ChangeItemImpl.java +++ b/metadata-io/metadata-io-api/src/main/java/com/linkedin/metadata/entity/ebean/batch/ChangeItemImpl.java @@ -231,7 +231,7 @@ private static RecordTemplate convertToRecordTemplate( aspect = GenericRecordUtils.deserializeAspect( mcp.getAspect().getValue(), mcp.getAspect().getContentType(), aspectSpec); - ValidationApiUtils.validateOrThrow(aspect); + ValidationApiUtils.validateTrimOrThrow(aspect); } catch (ModelConversionException e) { throw new RuntimeException( String.format( diff --git a/metadata-io/metadata-io-api/src/main/java/com/linkedin/metadata/entity/ebean/batch/MCLItemImpl.java b/metadata-io/metadata-io-api/src/main/java/com/linkedin/metadata/entity/ebean/batch/MCLItemImpl.java index 85923a28a64be..db24a107c5dd7 100644 --- a/metadata-io/metadata-io-api/src/main/java/com/linkedin/metadata/entity/ebean/batch/MCLItemImpl.java +++ b/metadata-io/metadata-io-api/src/main/java/com/linkedin/metadata/entity/ebean/batch/MCLItemImpl.java @@ -133,7 +133,7 @@ 
private static Pair convertToRecordTemplate( aspect = GenericRecordUtils.deserializeAspect( mcl.getAspect().getValue(), mcl.getAspect().getContentType(), aspectSpec); - ValidationApiUtils.validateOrThrow(aspect); + ValidationApiUtils.validateTrimOrThrow(aspect); } else { aspect = null; } @@ -144,7 +144,7 @@ private static Pair convertToRecordTemplate( mcl.getPreviousAspectValue().getValue(), mcl.getPreviousAspectValue().getContentType(), aspectSpec); - ValidationApiUtils.validateOrThrow(prevAspect); + ValidationApiUtils.validateTrimOrThrow(prevAspect); } else { prevAspect = null; } diff --git a/metadata-io/metadata-io-api/src/main/java/com/linkedin/metadata/entity/validation/ValidationApiUtils.java b/metadata-io/metadata-io-api/src/main/java/com/linkedin/metadata/entity/validation/ValidationApiUtils.java index f301be3115b01..8b5d8d696688e 100644 --- a/metadata-io/metadata-io-api/src/main/java/com/linkedin/metadata/entity/validation/ValidationApiUtils.java +++ b/metadata-io/metadata-io-api/src/main/java/com/linkedin/metadata/entity/validation/ValidationApiUtils.java @@ -36,6 +36,17 @@ public static void validateOrThrow(RecordTemplate record) { }); } + public static void validateTrimOrThrow(RecordTemplate record) { + RecordTemplateValidator.validateTrim( + record, + validationResult -> { + throw new ValidationException( + String.format( + "Failed to validate record with class %s: %s", + record.getClass().getName(), validationResult.getMessages().toString())); + }); + } + public static void validateUrn(@Nonnull EntityRegistry entityRegistry, @Nonnull final Urn urn) { UrnValidationUtil.validateUrn( entityRegistry, @@ -45,19 +56,6 @@ public static void validateUrn(@Nonnull EntityRegistry entityRegistry, @Nonnull System.getenv().getOrDefault(STRICT_URN_VALIDATION_ENABLED, "false")))); } - /** - * Validates a {@link RecordTemplate} and logs a warning if validation fails. - * - * @param record record to be validated.ailure. 
- */ - public static void validateOrWarn(RecordTemplate record) { - RecordTemplateValidator.validate( - record, - validationResult -> { - log.warn(String.format("Failed to validate record %s against its schema.", record)); - }); - } - public static AspectSpec validate(EntitySpec entitySpec, String aspectName) { if (aspectName == null || aspectName.isEmpty()) { throw new UnsupportedOperationException( @@ -95,7 +93,7 @@ public static void validateRecordTemplate( EntityApiUtils.buildKeyAspect(entityRegistry, urn), resultFunction, validator); if (aspect != null) { - RecordTemplateValidator.validate(aspect, resultFunction, validator); + RecordTemplateValidator.validateTrim(aspect, resultFunction, validator); } } } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityUtils.java b/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityUtils.java index 5f9546fe99cc1..9d7d0f9eba864 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityUtils.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityUtils.java @@ -278,7 +278,7 @@ public static List toSystemAspects( // Read Validate systemAspects.forEach( systemAspect -> - RecordTemplateValidator.validate( + RecordTemplateValidator.validateTrim( systemAspect.getRecordTemplate(), validationFailure -> log.warn( diff --git a/metadata-io/src/test/java/com/linkedin/metadata/entity/ValidationUtilsTest.java b/metadata-io/src/test/java/com/linkedin/metadata/entity/ValidationUtilsTest.java index f89d599ccc12a..5e9ef839e36b7 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/entity/ValidationUtilsTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/entity/ValidationUtilsTest.java @@ -44,6 +44,8 @@ public void testValidateOrThrowThrowsOnMissingUnrecognizedField() { rawMap.put("extraField", 1); Status status = new Status(rawMap); assertThrows(ValidationException.class, () -> ValidationApiUtils.validateOrThrow(status)); + // this one should work + 
ValidationApiUtils.validateTrimOrThrow(status); } @Test diff --git a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityResource.java b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityResource.java index 1241d5085c461..a3494f73c1bf5 100644 --- a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityResource.java +++ b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityResource.java @@ -8,7 +8,7 @@ import static com.linkedin.metadata.authorization.ApiOperation.DELETE; import static com.linkedin.metadata.authorization.ApiOperation.EXISTS; import static com.linkedin.metadata.authorization.ApiOperation.READ; -import static com.linkedin.metadata.entity.validation.ValidationApiUtils.validateOrThrow; +import static com.linkedin.metadata.entity.validation.ValidationApiUtils.validateTrimOrThrow; import static com.linkedin.metadata.entity.validation.ValidationUtils.*; import static com.linkedin.metadata.resources.restli.RestliConstants.*; import static com.linkedin.metadata.search.utils.SearchUtils.*; @@ -286,7 +286,7 @@ public Task ingest( } try { - validateOrThrow(entity); + validateTrimOrThrow(entity); } catch (ValidationException e) { throw new RestLiServiceException(HttpStatus.S_422_UNPROCESSABLE_ENTITY, e); } @@ -333,7 +333,7 @@ public Task batchIngest( for (Entity entity : entities) { try { - validateOrThrow(entity); + validateTrimOrThrow(entity); } catch (ValidationException e) { throw new RestLiServiceException(HttpStatus.S_422_UNPROCESSABLE_ENTITY, e); } diff --git a/metadata-utils/build.gradle b/metadata-utils/build.gradle index 5507e43c084a1..5e38e005e0e7b 100644 --- a/metadata-utils/build.gradle +++ b/metadata-utils/build.gradle @@ -34,6 +34,8 @@ dependencies { testImplementation project(':test-models') testImplementation project(path: ':test-models', configuration: 'testDataTemplate') 
testImplementation externalDependency.testng + testImplementation externalDependency.mockito + testImplementation externalDependency.mockitoInline testImplementation project(':metadata-operation-context') constraints { diff --git a/metadata-utils/src/main/java/com/linkedin/metadata/utils/RecordTemplateValidator.java b/metadata-utils/src/main/java/com/linkedin/metadata/utils/RecordTemplateValidator.java index fb2e0d553d519..ed8dd8419001f 100644 --- a/metadata-utils/src/main/java/com/linkedin/metadata/utils/RecordTemplateValidator.java +++ b/metadata-utils/src/main/java/com/linkedin/metadata/utils/RecordTemplateValidator.java @@ -21,6 +21,12 @@ public class RecordTemplateValidator { CoercionMode.NORMAL, UnrecognizedFieldMode.DISALLOW); + private static final ValidationOptions TRIM_VALIDATION_OPTIONS = + new ValidationOptions( + RequiredMode.CAN_BE_ABSENT_IF_HAS_DEFAULT, + CoercionMode.NORMAL, + UnrecognizedFieldMode.TRIM); + private static final UrnValidator URN_VALIDATOR = new UrnValidator(); /** @@ -37,10 +43,25 @@ public static void validate( } } + /** + * Validates a {@link RecordTemplate} and applies a function if validation fails. Extra fields are + * trimmed. + * + * @param record record to be validated.failure. + */ + public static void validateTrim( + RecordTemplate record, Consumer onValidationFailure) { + final ValidationResult result = + ValidateDataAgainstSchema.validate(record, TRIM_VALIDATION_OPTIONS, URN_VALIDATOR); + if (!result.isValid()) { + onValidationFailure.accept(result); + } + } + /** * Validates a {@link RecordTemplate} and applies a function if validation fails * - * @param record record to be validated.ailure. + * @param record record to be validated.failure. */ public static void validate( RecordTemplate record, Consumer onValidationFailure, Validator validator) { @@ -51,5 +72,20 @@ public static void validate( } } + /** + * Validates a {@link RecordTemplate} and applies a function if validation fails Extra fields are + * trimmed. 
+ * + * @param record record to be validated.failure. + */ + public static void validateTrim( + RecordTemplate record, Consumer onValidationFailure, Validator validator) { + final ValidationResult result = + ValidateDataAgainstSchema.validate(record, TRIM_VALIDATION_OPTIONS, validator); + if (!result.isValid()) { + onValidationFailure.accept(result); + } + } + private RecordTemplateValidator() {} } diff --git a/metadata-utils/src/test/java/com/linkedin/metadata/utils/RecordTemplateValidatorTest.java b/metadata-utils/src/test/java/com/linkedin/metadata/utils/RecordTemplateValidatorTest.java new file mode 100644 index 0000000000000..f6fa03feb458d --- /dev/null +++ b/metadata-utils/src/test/java/com/linkedin/metadata/utils/RecordTemplateValidatorTest.java @@ -0,0 +1,208 @@ +package com.linkedin.metadata.utils; + +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.Mockito.*; + +import com.linkedin.data.schema.validation.ValidateDataAgainstSchema; +import com.linkedin.data.schema.validation.ValidationOptions; +import com.linkedin.data.schema.validation.ValidationResult; +import com.linkedin.data.schema.validator.Validator; +import com.linkedin.data.template.RecordTemplate; +import java.util.function.Consumer; +import org.mockito.Mock; +import org.mockito.MockitoAnnotations; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +public class RecordTemplateValidatorTest { + + @Mock private RecordTemplate mockRecord; + + @Mock private ValidationResult mockValidationResult; + + @Mock private Consumer mockValidationFailureHandler; + + @Mock private Validator mockValidator; + + @BeforeMethod + public void setup() { + MockitoAnnotations.openMocks(this); + } + + @Test + public void testValidate_WhenValidationSucceeds_DoesNotCallFailureHandler() { + // Arrange + try (var mockedStatic = mockStatic(ValidateDataAgainstSchema.class)) { + when(mockValidationResult.isValid()).thenReturn(true); + mockedStatic + .when( + () -> + 
ValidateDataAgainstSchema.validate( + any(RecordTemplate.class), + any(ValidationOptions.class), + any(Validator.class))) + .thenReturn(mockValidationResult); + + // Act + RecordTemplateValidator.validate(mockRecord, mockValidationFailureHandler); + + // Assert + verify(mockValidationFailureHandler, never()).accept(any(ValidationResult.class)); + } + } + + @Test + public void testValidate_WhenValidationFails_CallsFailureHandler() { + // Arrange + try (var mockedStatic = mockStatic(ValidateDataAgainstSchema.class)) { + when(mockValidationResult.isValid()).thenReturn(false); + mockedStatic + .when( + () -> + ValidateDataAgainstSchema.validate( + any(RecordTemplate.class), + any(ValidationOptions.class), + any(Validator.class))) + .thenReturn(mockValidationResult); + + // Act + RecordTemplateValidator.validate(mockRecord, mockValidationFailureHandler); + + // Assert + verify(mockValidationFailureHandler).accept(mockValidationResult); + } + } + + @Test + public void testValidateTrim_WhenValidationSucceeds_DoesNotCallFailureHandler() { + // Arrange + try (var mockedStatic = mockStatic(ValidateDataAgainstSchema.class)) { + when(mockValidationResult.isValid()).thenReturn(true); + mockedStatic + .when( + () -> + ValidateDataAgainstSchema.validate( + any(RecordTemplate.class), + any(ValidationOptions.class), + any(Validator.class))) + .thenReturn(mockValidationResult); + + // Act + RecordTemplateValidator.validateTrim(mockRecord, mockValidationFailureHandler); + + // Assert + verify(mockValidationFailureHandler, never()).accept(any(ValidationResult.class)); + } + } + + @Test + public void testValidateTrim_WhenValidationFails_CallsFailureHandler() { + // Arrange + try (var mockedStatic = mockStatic(ValidateDataAgainstSchema.class)) { + when(mockValidationResult.isValid()).thenReturn(false); + mockedStatic + .when( + () -> + ValidateDataAgainstSchema.validate( + any(RecordTemplate.class), + any(ValidationOptions.class), + any(Validator.class))) + 
.thenReturn(mockValidationResult); + + // Act + RecordTemplateValidator.validateTrim(mockRecord, mockValidationFailureHandler); + + // Assert + verify(mockValidationFailureHandler).accept(mockValidationResult); + } + } + + @Test + public void testValidateWithCustomValidator_WhenValidationSucceeds_DoesNotCallFailureHandler() { + // Arrange + try (var mockedStatic = mockStatic(ValidateDataAgainstSchema.class)) { + when(mockValidationResult.isValid()).thenReturn(true); + mockedStatic + .when( + () -> + ValidateDataAgainstSchema.validate( + any(RecordTemplate.class), + any(ValidationOptions.class), + any(Validator.class))) + .thenReturn(mockValidationResult); + + // Act + RecordTemplateValidator.validate(mockRecord, mockValidationFailureHandler, mockValidator); + + // Assert + verify(mockValidationFailureHandler, never()).accept(any(ValidationResult.class)); + } + } + + @Test + public void testValidateWithCustomValidator_WhenValidationFails_CallsFailureHandler() { + // Arrange + try (var mockedStatic = mockStatic(ValidateDataAgainstSchema.class)) { + when(mockValidationResult.isValid()).thenReturn(false); + mockedStatic + .when( + () -> + ValidateDataAgainstSchema.validate( + any(RecordTemplate.class), + any(ValidationOptions.class), + any(Validator.class))) + .thenReturn(mockValidationResult); + + // Act + RecordTemplateValidator.validate(mockRecord, mockValidationFailureHandler, mockValidator); + + // Assert + verify(mockValidationFailureHandler).accept(mockValidationResult); + } + } + + @Test + public void + testValidateTrimWithCustomValidator_WhenValidationSucceeds_DoesNotCallFailureHandler() { + // Arrange + try (var mockedStatic = mockStatic(ValidateDataAgainstSchema.class)) { + when(mockValidationResult.isValid()).thenReturn(true); + mockedStatic + .when( + () -> + ValidateDataAgainstSchema.validate( + any(RecordTemplate.class), + any(ValidationOptions.class), + any(Validator.class))) + .thenReturn(mockValidationResult); + + // Act + 
RecordTemplateValidator.validateTrim(mockRecord, mockValidationFailureHandler, mockValidator); + + // Assert + verify(mockValidationFailureHandler, never()).accept(any(ValidationResult.class)); + } + } + + @Test + public void testValidateTrimWithCustomValidator_WhenValidationFails_CallsFailureHandler() { + // Arrange + try (var mockedStatic = mockStatic(ValidateDataAgainstSchema.class)) { + when(mockValidationResult.isValid()).thenReturn(false); + mockedStatic + .when( + () -> + ValidateDataAgainstSchema.validate( + any(RecordTemplate.class), + any(ValidationOptions.class), + any(Validator.class))) + .thenReturn(mockValidationResult); + + // Act + RecordTemplateValidator.validateTrim(mockRecord, mockValidationFailureHandler, mockValidator); + + // Assert + verify(mockValidationFailureHandler).accept(mockValidationResult); + } + } +} From 005a9b0f0d38343706864c1a1486efe710aa5a73 Mon Sep 17 00:00:00 2001 From: david-leifker <114954101+david-leifker@users.noreply.github.com> Date: Mon, 24 Feb 2025 12:13:20 -0600 Subject: [PATCH 06/45] fix(tracing): handle noop mcl (#12713) --- .../metadata/utils/SystemMetadataUtils.java | 21 ++ .../utils/SystemMetadataUtilsTest.java | 177 +++++++++++++++ metadata-io/build.gradle | 1 + .../linkedin/metadata/entity/AspectDao.java | 13 +- .../metadata/entity/EntityServiceImpl.java | 4 +- .../metadata/trace/TraceServiceImpl.java | 14 +- .../metadata/entity/AspectDaoTest.java | 189 +++++++++++++++- ...rtTest.java => EntityServiceImplTest.java} | 214 +++++++++++++++++- .../metadata/entity/EntityServiceTest.java | 6 +- .../metadata/trace/TraceServiceImplTest.java | 45 ++++ 10 files changed, 673 insertions(+), 11 deletions(-) create mode 100644 entity-registry/src/test/java/com/linkedin/metadata/utils/SystemMetadataUtilsTest.java rename metadata-io/src/test/java/com/linkedin/metadata/entity/{EntityServiceImplApplyUpsertTest.java => EntityServiceImplTest.java} (62%) diff --git 
a/entity-registry/src/main/java/com/linkedin/metadata/utils/SystemMetadataUtils.java b/entity-registry/src/main/java/com/linkedin/metadata/utils/SystemMetadataUtils.java index 93a87ceb3bc55..33244f4a1400e 100644 --- a/entity-registry/src/main/java/com/linkedin/metadata/utils/SystemMetadataUtils.java +++ b/entity-registry/src/main/java/com/linkedin/metadata/utils/SystemMetadataUtils.java @@ -4,12 +4,14 @@ import com.datahub.util.RecordUtils; import com.linkedin.data.template.SetMode; +import com.linkedin.data.template.StringMap; import com.linkedin.mxe.SystemMetadata; import javax.annotation.Nullable; import lombok.extern.slf4j.Slf4j; @Slf4j public class SystemMetadataUtils { + private static final String NO_OP_KEY = "isNoOp"; private SystemMetadataUtils() {} @@ -42,4 +44,23 @@ public static SystemMetadata parseSystemMetadata(String jsonSystemMetadata) { } return RecordUtils.toRecordTemplate(SystemMetadata.class, jsonSystemMetadata); } + + public static boolean isNoOp(@Nullable SystemMetadata systemMetadata) { + if (systemMetadata != null && systemMetadata.hasProperties()) { + return Boolean.parseBoolean(systemMetadata.getProperties().getOrDefault(NO_OP_KEY, "false")); + } + + return false; + } + + @Nullable + public static SystemMetadata setNoOp(@Nullable SystemMetadata systemMetadata, boolean isNoOp) { + if (systemMetadata != null) { + if (!systemMetadata.hasProperties()) { + systemMetadata.setProperties(new StringMap()); + } + systemMetadata.getProperties().put(NO_OP_KEY, String.valueOf(isNoOp)); + } + return systemMetadata; + } } diff --git a/entity-registry/src/test/java/com/linkedin/metadata/utils/SystemMetadataUtilsTest.java b/entity-registry/src/test/java/com/linkedin/metadata/utils/SystemMetadataUtilsTest.java new file mode 100644 index 0000000000000..af0ba9974014b --- /dev/null +++ b/entity-registry/src/test/java/com/linkedin/metadata/utils/SystemMetadataUtilsTest.java @@ -0,0 +1,177 @@ +package com.linkedin.metadata.utils; + +import static 
com.linkedin.metadata.Constants.DEFAULT_RUN_ID; +import static org.testng.Assert.*; + +import com.linkedin.data.template.StringMap; +import com.linkedin.mxe.SystemMetadata; +import org.testng.annotations.Test; + +public class SystemMetadataUtilsTest { + + @Test + public void testCreateDefaultSystemMetadata() { + SystemMetadata metadata = SystemMetadataUtils.createDefaultSystemMetadata(); + + assertNotNull(metadata); + assertEquals(metadata.getRunId(), DEFAULT_RUN_ID); + assertTrue(metadata.hasLastObserved()); + assertTrue(metadata.getLastObserved() > 0); + } + + @Test + public void testCreateDefaultSystemMetadataWithRunId() { + String customRunId = "custom-run-id"; + SystemMetadata metadata = SystemMetadataUtils.createDefaultSystemMetadata(customRunId); + + assertNotNull(metadata); + assertEquals(metadata.getRunId(), customRunId); + assertTrue(metadata.hasLastObserved()); + assertTrue(metadata.getLastObserved() > 0); + } + + @Test + public void testGenerateSystemMetadataIfEmpty() { + // Test with null input + SystemMetadata nullMetadata = SystemMetadataUtils.generateSystemMetadataIfEmpty(null); + assertNotNull(nullMetadata); + assertEquals(nullMetadata.getRunId(), DEFAULT_RUN_ID); + assertTrue(nullMetadata.hasLastObserved()); + + // Test with existing metadata + SystemMetadata existingMetadata = + new SystemMetadata().setRunId("existing-run").setLastObserved(1234567890L); + SystemMetadata result = SystemMetadataUtils.generateSystemMetadataIfEmpty(existingMetadata); + + assertEquals(result.getRunId(), "existing-run"); + assertEquals(result.getLastObserved(), 1234567890L); + } + + @Test + public void testParseSystemMetadata() { + // Test null input + SystemMetadata nullResult = SystemMetadataUtils.parseSystemMetadata(null); + assertNotNull(nullResult); + assertEquals(nullResult.getRunId(), DEFAULT_RUN_ID); + + // Test empty string input + SystemMetadata emptyResult = SystemMetadataUtils.parseSystemMetadata(""); + assertNotNull(emptyResult); + 
assertEquals(emptyResult.getRunId(), DEFAULT_RUN_ID); + + // Test valid JSON input + String validJson = "{\"runId\":\"test-run\",\"lastObserved\":1234567890}"; + SystemMetadata jsonResult = SystemMetadataUtils.parseSystemMetadata(validJson); + assertNotNull(jsonResult); + assertEquals(jsonResult.getRunId(), "test-run"); + assertEquals(jsonResult.getLastObserved(), 1234567890L); + } + + @Test + public void testIsNoOp() { + // Test null metadata + assertFalse(SystemMetadataUtils.isNoOp(null)); + + // Test metadata without properties + SystemMetadata emptyMetadata = new SystemMetadata(); + assertFalse(SystemMetadataUtils.isNoOp(emptyMetadata)); + + // Test metadata with isNoOp=true + SystemMetadata noOpMetadata = new SystemMetadata(); + StringMap properties = new StringMap(); + properties.put("isNoOp", "true"); + noOpMetadata.setProperties(properties); + assertTrue(SystemMetadataUtils.isNoOp(noOpMetadata)); + + // Test metadata with isNoOp=false + properties.put("isNoOp", "false"); + assertFalse(SystemMetadataUtils.isNoOp(noOpMetadata)); + } + + @Test + public void testSetNoOp() { + // Test with null metadata + assertNull(SystemMetadataUtils.setNoOp(null, true)); + + // Test setting noOp to true + SystemMetadata metadata = new SystemMetadata(); + SystemMetadata result = SystemMetadataUtils.setNoOp(metadata, true); + assertNotNull(result); + assertTrue(result.hasProperties()); + assertNotNull(result.getProperties()); + assertEquals(result.getProperties().get("isNoOp"), "true"); + + // Test setting noOp to false + result = SystemMetadataUtils.setNoOp(metadata, false); + assertNotNull(result); + assertTrue(result.hasProperties()); + assertNotNull(result.getProperties()); + assertEquals(result.getProperties().get("isNoOp"), "false"); + + // Test with existing properties + StringMap existingProps = new StringMap(); + existingProps.put("otherKey", "value"); + metadata.setProperties(existingProps); + result = SystemMetadataUtils.setNoOp(metadata, true); + 
assertNotNull(result); + assertEquals(result.getProperties().get("otherKey"), "value"); + assertEquals(result.getProperties().get("isNoOp"), "true"); + } + + @Test + public void testGenerateSystemMetadataIfEmpty_NullInput() { + SystemMetadata result = SystemMetadataUtils.generateSystemMetadataIfEmpty(null); + + assertNotNull(result); + assertEquals(DEFAULT_RUN_ID, result.getRunId()); + assertNotNull(result.getLastObserved()); + assertTrue(result.getLastObserved() > 0); + } + + @Test + public void testGenerateSystemMetadataIfEmpty_NoRunId() { + SystemMetadata input = new SystemMetadata().setLastObserved(1234567890L); + + SystemMetadata result = SystemMetadataUtils.generateSystemMetadataIfEmpty(input); + + assertNotNull(result); + assertEquals(DEFAULT_RUN_ID, result.getRunId()); + assertEquals(1234567890L, result.getLastObserved().longValue()); + } + + @Test + public void testGenerateSystemMetadataIfEmpty_NoLastObserved() { + SystemMetadata input = new SystemMetadata().setRunId("custom-run-id"); + + SystemMetadata result = SystemMetadataUtils.generateSystemMetadataIfEmpty(input); + + assertNotNull(result); + assertEquals("custom-run-id", result.getRunId()); + assertNotNull(result.getLastObserved()); + assertTrue(result.getLastObserved() > 0); + } + + @Test + public void testGenerateSystemMetadataIfEmpty_ZeroLastObserved() { + SystemMetadata input = new SystemMetadata().setRunId("custom-run-id").setLastObserved(0L); + + SystemMetadata result = SystemMetadataUtils.generateSystemMetadataIfEmpty(input); + + assertNotNull(result); + assertEquals("custom-run-id", result.getRunId()); + assertNotNull(result.getLastObserved()); + assertTrue(result.getLastObserved() > 0); + } + + @Test + public void testGenerateSystemMetadataIfEmpty_AllFieldsPopulated() { + SystemMetadata input = + new SystemMetadata().setRunId("custom-run-id").setLastObserved(1234567890L); + + SystemMetadata result = SystemMetadataUtils.generateSystemMetadataIfEmpty(input); + + assertNotNull(result); + 
assertEquals("custom-run-id", result.getRunId()); + assertEquals(1234567890L, result.getLastObserved().longValue()); + } +} diff --git a/metadata-io/build.gradle b/metadata-io/build.gradle index 8efb96d723845..6dcd2c4bbd91b 100644 --- a/metadata-io/build.gradle +++ b/metadata-io/build.gradle @@ -97,6 +97,7 @@ dependencies { testImplementation externalDependency.springBootTest testImplementation spec.product.pegasus.restliServer testImplementation externalDependency.ebeanTest + testImplementation externalDependency.opentelemetrySdk // logback >=1.3 required due to `testcontainers` only testImplementation 'ch.qos.logback:logback-classic:1.4.7' diff --git a/metadata-io/src/main/java/com/linkedin/metadata/entity/AspectDao.java b/metadata-io/src/main/java/com/linkedin/metadata/entity/AspectDao.java index cb838180ffa3d..0715c7c8e54c7 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/entity/AspectDao.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/entity/AspectDao.java @@ -10,6 +10,7 @@ import com.linkedin.metadata.entity.ebean.EbeanAspectV2; import com.linkedin.metadata.entity.ebean.PartitionedStream; import com.linkedin.metadata.entity.restoreindices.RestoreIndicesArgs; +import com.linkedin.metadata.utils.SystemMetadataUtils; import com.linkedin.metadata.utils.metrics.MetricUtils; import com.linkedin.mxe.SystemMetadata; import com.linkedin.util.Pair; @@ -142,15 +143,19 @@ default Pair, Optional> saveLatestAspect( .equals(currentVersion0.getSystemMetadataVersion())) { inserted = insertAspect(txContext, latestAspect.getDatabaseAspect().get(), targetVersion); - - // add trace - overwrite if version incremented - newAspect.setSystemMetadata(opContext.withTraceId(newAspect.getSystemMetadata(), true)); } // update version 0 Optional updated = Optional.empty(); + boolean isNoOp = + Objects.equals(currentVersion0.getRecordTemplate(), newAspect.getRecordTemplate()); + if (!Objects.equals(currentVersion0.getSystemMetadata(), newAspect.getSystemMetadata()) - 
|| !Objects.equals(currentVersion0.getRecordTemplate(), newAspect.getRecordTemplate())) { + || !isNoOp) { + // update no-op used for tracing + SystemMetadataUtils.setNoOp(newAspect.getSystemMetadata(), isNoOp); + // add trace - overwrite if version incremented + newAspect.setSystemMetadata(opContext.withTraceId(newAspect.getSystemMetadata(), true)); updated = updateAspect(txContext, newAspect); } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceImpl.java b/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceImpl.java index f86179360a17a..1d4581218e09a 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceImpl.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceImpl.java @@ -76,6 +76,7 @@ import com.linkedin.metadata.utils.EntityApiUtils; import com.linkedin.metadata.utils.GenericRecordUtils; import com.linkedin.metadata.utils.PegasusUtils; +import com.linkedin.metadata.utils.SystemMetadataUtils; import com.linkedin.metadata.utils.metrics.MetricUtils; import com.linkedin.mxe.MetadataAuditOperation; import com.linkedin.mxe.MetadataChangeLog; @@ -2050,7 +2051,8 @@ public Optional, Boolean>> conditionallyProduceMCLAsync( Urn entityUrn, AuditStamp auditStamp, AspectSpec aspectSpec) { - boolean isNoOp = Objects.equals(oldAspect, newAspect); + boolean isNoOp = + SystemMetadataUtils.isNoOp(newSystemMetadata) || Objects.equals(oldAspect, newAspect); if (!isNoOp || alwaysEmitChangeLog || shouldAspectEmitChangeLog(aspectSpec)) { log.info("Producing MCL for ingested aspect {}, urn {}", aspectSpec.getName(), entityUrn); diff --git a/metadata-io/src/main/java/com/linkedin/metadata/trace/TraceServiceImpl.java b/metadata-io/src/main/java/com/linkedin/metadata/trace/TraceServiceImpl.java index 47eddba432246..f8352a68d9665 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/trace/TraceServiceImpl.java +++ 
b/metadata-io/src/main/java/com/linkedin/metadata/trace/TraceServiceImpl.java @@ -11,6 +11,7 @@ import com.linkedin.metadata.run.AspectRowSummary; import com.linkedin.metadata.systemmetadata.SystemMetadataService; import com.linkedin.metadata.systemmetadata.TraceService; +import com.linkedin.metadata.utils.SystemMetadataUtils; import com.linkedin.mxe.FailedMetadataChangeProposal; import com.linkedin.mxe.SystemMetadata; import com.linkedin.util.Pair; @@ -168,7 +169,12 @@ private Map> tracePrimaryInParall String aspectName = aspectEntry.getKey(); if (traceId.equals(systemTraceId)) { - aspectStatuses.put(aspectName, TraceStorageStatus.ok(TraceWriteStatus.ACTIVE_STATE)); + if (SystemMetadataUtils.isNoOp(systemMetadata)) { + aspectStatuses.put(aspectName, TraceStorageStatus.ok(TraceWriteStatus.NO_OP)); + } else { + aspectStatuses.put( + aspectName, TraceStorageStatus.ok(TraceWriteStatus.ACTIVE_STATE)); + } } else if (traceTimestampMillis <= extractTimestamp(systemTraceId, createdOnMillis)) { aspectStatuses.put( aspectName, TraceStorageStatus.ok(TraceWriteStatus.HISTORIC_STATE)); @@ -421,7 +427,9 @@ private static Map mergeStatus( storageEntry -> { String aspectName = storageEntry.getKey(); TraceStorageStatus primaryStatus = storageEntry.getValue(); - TraceStorageStatus searchStatus = searchAspectStatus.get(aspectName); + TraceStorageStatus searchStatus = + searchAspectStatus.getOrDefault( + aspectName, TraceStorageStatus.ok(TraceWriteStatus.PENDING)); TraceStatus traceStatus = TraceStatus.builder() .primaryStorage(primaryStatus) @@ -448,7 +456,7 @@ private static Map mergeStatus( } private static boolean isSuccess( - TraceStorageStatus primaryStatus, TraceStorageStatus searchStatus) { + @Nonnull TraceStorageStatus primaryStatus, @Nonnull TraceStorageStatus searchStatus) { return !TraceWriteStatus.ERROR.equals(primaryStatus.getWriteStatus()) && !TraceWriteStatus.ERROR.equals(searchStatus.getWriteStatus()); } diff --git 
a/metadata-io/src/test/java/com/linkedin/metadata/entity/AspectDaoTest.java b/metadata-io/src/test/java/com/linkedin/metadata/entity/AspectDaoTest.java index 46aebbdf33988..db9c12aad66a6 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/entity/AspectDaoTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/entity/AspectDaoTest.java @@ -4,6 +4,8 @@ import static com.linkedin.metadata.Constants.STATUS_ASPECT_NAME; import static org.testng.Assert.assertEquals; import static org.testng.Assert.assertFalse; +import static org.testng.Assert.assertNotEquals; +import static org.testng.Assert.assertNull; import static org.testng.Assert.assertTrue; import com.datahub.util.RecordUtils; @@ -11,6 +13,7 @@ import com.linkedin.common.urn.Urn; import com.linkedin.common.urn.UrnUtils; import com.linkedin.data.template.SetMode; +import com.linkedin.data.template.StringMap; import com.linkedin.metadata.aspect.EntityAspect; import com.linkedin.metadata.aspect.SystemAspect; import com.linkedin.metadata.entity.ebean.EbeanAspectV2; @@ -19,10 +22,19 @@ import com.linkedin.metadata.entity.restoreindices.RestoreIndicesArgs; import com.linkedin.metadata.models.EntitySpec; import com.linkedin.metadata.utils.AuditStampUtils; +import com.linkedin.metadata.utils.SystemMetadataUtils; import com.linkedin.mxe.SystemMetadata; import com.linkedin.util.Pair; +import io.datahubproject.metadata.context.ObjectMapperContext; import io.datahubproject.metadata.context.OperationContext; +import io.datahubproject.metadata.context.TraceContext; import io.datahubproject.test.metadata.context.TestOperationContexts; +import io.opentelemetry.api.OpenTelemetry; +import io.opentelemetry.api.trace.Span; +import io.opentelemetry.api.trace.SpanContext; +import io.opentelemetry.api.trace.Tracer; +import io.opentelemetry.sdk.OpenTelemetrySdk; +import io.opentelemetry.sdk.trace.SdkTracerProvider; import java.util.List; import java.util.Map; import java.util.Optional; @@ -39,7 +51,18 @@ public class 
AspectDaoTest { private final OperationContext opContext = - TestOperationContexts.systemContextNoSearchAuthorization(); + TestOperationContexts.systemContextTraceNoSearchAuthorization( + () -> ObjectMapperContext.DEFAULT, + () -> { + // Set up OpenTelemetry SDK for testing + SdkTracerProvider tracerProvider = SdkTracerProvider.builder().build(); + OpenTelemetry openTelemetry = + OpenTelemetrySdk.builder().setTracerProvider(tracerProvider).build(); + + // Create a tracer + Tracer tracer = openTelemetry.getTracer("test-tracer"); + return TraceContext.builder().tracer(tracer).build(); + }); private final EntitySpec corpUserEntitySpec = opContext.getEntityRegistry().getEntitySpec(CORP_USER_ENTITY_NAME); @@ -208,6 +231,170 @@ public void testSaveLatestAspect_ThrowsOnNullNewVersion() { aspectDao.saveLatestAspect(opContext, null, null, newAspect); } + @Test + public void testSaveLatestAspect_NoOpTracingSet() { + // Setup + SystemAspect currentAspect = createSystemAspect("1"); + SystemAspect newAspect = createSystemAspect("1"); + SystemAspect dbAspect = createSystemAspect("1"); + currentAspect.setDatabaseAspect(dbAspect); + + // Execute + Pair, Optional> result = + aspectDao.saveLatestAspect(opContext, txContext, currentAspect, newAspect); + + // Verify + // Should not have any changes since it's a true no-op (same version and content) + assertFalse(result.getFirst().isPresent(), "Should not have inserted previous version"); + assertFalse(result.getSecond().isPresent(), "Should not have updated current version"); + + // The input aspect should not be modified since no update occurred + assertNull( + newAspect.getSystemMetadata().getProperties(), + "SystemMetadata should not be modified for no-op case with no update"); + } + + @Test + public void testSaveLatestAspect_NoOpWithMetadataChange() { + opContext.withSpan( + "testSaveLatestAspect_NoOpWithMetadataChange", + () -> { + // Verify span context is valid + SpanContext currentSpanContext = Span.current().getSpanContext(); 
+ assertTrue(currentSpanContext.isValid(), "Span context should be valid"); + + // Setup + SystemAspect currentAspect = createSystemAspect("1"); + SystemAspect newAspect = createSystemAspect("1"); + SystemAspect dbAspect = createSystemAspect("1"); + currentAspect.setDatabaseAspect(dbAspect); + + // Modify system metadata but keep same content + newAspect + .getSystemMetadata() + .setLastObserved(newAspect.getSystemMetadata().getLastObserved() + 1); + + // Execute + Pair, Optional> result = + aspectDao.saveLatestAspect(opContext, txContext, currentAspect, newAspect); + + // Verify + assertFalse(result.getFirst().isPresent(), "Should not have inserted previous version"); + assertTrue(result.getSecond().isPresent(), "Should have updated current version"); + + SystemMetadata updatedMetadata = + RecordUtils.toRecordTemplate( + SystemMetadata.class, result.getSecond().get().getSystemMetadata()); + + assertTrue( + SystemMetadataUtils.isNoOp(updatedMetadata), + "NoOp should be true for metadata-only change"); + assertTrue( + updatedMetadata.getProperties().containsKey("telemetryTraceId"), + "TraceId should be set"); + }); + } + + @Test + public void testSaveLatestAspect_ContentChangeTracing() { + opContext.withSpan( + "testSaveLatestAspect_ContentChangeTracing", + () -> { + // Verify span context is valid + SpanContext currentSpanContext = Span.current().getSpanContext(); + assertTrue(currentSpanContext.isValid(), "Span context should be valid"); + + // Setup + SystemAspect currentAspect = createSystemAspect("1"); + SystemAspect newAspect = createSystemAspect("2"); + // Modify the content to ensure it's not a no-op + Status newStatus = new Status().setRemoved(true); + newAspect.setRecordTemplate(newStatus); + + SystemAspect dbAspect = createSystemAspect("1"); + currentAspect.setDatabaseAspect(dbAspect); + + // Execute + Pair, Optional> result = + aspectDao.saveLatestAspect(opContext, txContext, currentAspect, newAspect); + + // Verify + 
assertTrue(result.getFirst().isPresent(), "Should have inserted previous version"); + assertTrue(result.getSecond().isPresent(), "Should have updated current version"); + + SystemMetadata updatedMetadata = + RecordUtils.toRecordTemplate( + SystemMetadata.class, result.getSecond().get().getSystemMetadata()); + + assertFalse( + SystemMetadataUtils.isNoOp(updatedMetadata), + "NoOp should be false for content change"); + assertTrue( + updatedMetadata.getProperties().containsKey("telemetryTraceId"), + "TraceId should be set"); + + // Verify previous version's metadata is unchanged + SystemMetadata previousMetadata = + RecordUtils.toRecordTemplate( + SystemMetadata.class, result.getFirst().get().getSystemMetadata()); + assertFalse( + SystemMetadataUtils.isNoOp(previousMetadata), + "Previous version should not have NoOp flag"); + }); + } + + @Test + public void testSaveLatestAspect_TraceIdPropagation() { + opContext.withSpan( + "testSaveLatestAspect_TraceIdPropagation", + () -> { + // Verify span context is valid + SpanContext currentSpanContext = Span.current().getSpanContext(); + assertTrue(currentSpanContext.isValid(), "Span context should be valid"); + + // Setup + String existingTraceId = "existing-trace-123"; + SystemAspect currentAspect = createSystemAspect("1"); + currentAspect + .getSystemMetadata() + .setProperties(new StringMap(Map.of("telemetryTraceId", existingTraceId))); + + SystemAspect newAspect = createSystemAspect("2"); + // Set a different trace ID to verify overwrite behavior + newAspect + .getSystemMetadata() + .setProperties(new StringMap(Map.of("telemetryTraceId", "new-trace-456"))); + + SystemAspect dbAspect = createSystemAspect("1"); + dbAspect + .getSystemMetadata() + .setProperties(new StringMap(Map.of("telemetryTraceId", existingTraceId))); + currentAspect.setDatabaseAspect(dbAspect); + + // Execute + Pair, Optional> result = + aspectDao.saveLatestAspect(opContext, txContext, currentAspect, newAspect); + + // Verify + 
assertTrue(result.getSecond().isPresent(), "Should have updated current version"); + + SystemMetadata updatedMetadata = + RecordUtils.toRecordTemplate( + SystemMetadata.class, result.getSecond().get().getSystemMetadata()); + + assertTrue( + updatedMetadata.getProperties().containsKey("telemetryTraceId"), + "TraceId should be set"); + assertNotEquals( + updatedMetadata.getProperties().get("telemetryTraceId"), + existingTraceId, + "TraceId should be overwritten for version increment"); + assertFalse( + updatedMetadata.getProperties().get("telemetryTraceId").contains("-trace-"), + "TraceId should match operation context and not the test trace ids"); + }); + } + // Concrete implementation for testing default methods private class TestAspectDao implements AspectDao { diff --git a/metadata-io/src/test/java/com/linkedin/metadata/entity/EntityServiceImplApplyUpsertTest.java b/metadata-io/src/test/java/com/linkedin/metadata/entity/EntityServiceImplTest.java similarity index 62% rename from metadata-io/src/test/java/com/linkedin/metadata/entity/EntityServiceImplApplyUpsertTest.java rename to metadata-io/src/test/java/com/linkedin/metadata/entity/EntityServiceImplTest.java index 3300416e21593..cf75a01ef71a8 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/entity/EntityServiceImplApplyUpsertTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/entity/EntityServiceImplTest.java @@ -1,35 +1,90 @@ package com.linkedin.metadata.entity; +import static com.linkedin.metadata.Constants.STATUS_ASPECT_NAME; +import static com.linkedin.metadata.Constants.UPSTREAM_LINEAGE_ASPECT_NAME; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.never; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertFalse; import static 
org.testng.Assert.assertNotNull; import static org.testng.Assert.assertNull; import static org.testng.Assert.assertTrue; import com.datahub.util.RecordUtils; import com.linkedin.common.AuditStamp; +import com.linkedin.common.Status; +import com.linkedin.common.urn.Urn; import com.linkedin.common.urn.UrnUtils; import com.linkedin.data.template.DataTemplateUtil; +import com.linkedin.data.template.RecordTemplate; +import com.linkedin.dataset.UpstreamLineage; import com.linkedin.identity.CorpUserInfo; import com.linkedin.metadata.AspectGenerationUtils; import com.linkedin.metadata.aspect.SystemAspect; import com.linkedin.metadata.aspect.batch.ChangeMCP; +import com.linkedin.metadata.config.PreProcessHooks; import com.linkedin.metadata.entity.ebean.EbeanAspectV2; import com.linkedin.metadata.entity.ebean.EbeanSystemAspect; import com.linkedin.metadata.entity.ebean.batch.ChangeItemImpl; +import com.linkedin.metadata.event.EventProducer; import com.linkedin.metadata.models.registry.EntityRegistry; +import com.linkedin.metadata.utils.GenericRecordUtils; +import com.linkedin.metadata.utils.SystemMetadataUtils; +import com.linkedin.mxe.MetadataChangeProposal; import com.linkedin.mxe.SystemMetadata; +import com.linkedin.util.Pair; import io.datahubproject.metadata.context.OperationContext; import io.datahubproject.test.metadata.context.TestOperationContexts; import java.sql.Timestamp; import java.util.Optional; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.Future; +import org.testng.annotations.BeforeMethod; import org.testng.annotations.Test; -public class EntityServiceImplApplyUpsertTest { +public class EntityServiceImplTest { private final AuditStamp TEST_AUDIT_STAMP = AspectGenerationUtils.createAuditStamp(); private final OperationContext opContext = TestOperationContexts.systemContextNoSearchAuthorization(); private final EntityRegistry testEntityRegistry = opContext.getEntityRegistry(); + private static final Urn TEST_URN = 
UrnUtils.getUrn("urn:li:corpuser:EntityServiceImplTest"); + + private EventProducer mockEventProducer; + private Status oldAspect; + private Status newAspect; + private EntityServiceImpl entityService; + private MetadataChangeProposal testMCP; + + @BeforeMethod + public void setup() throws Exception { + mockEventProducer = mock(EventProducer.class); + + // Initialize common test objects + entityService = + new EntityServiceImpl( + mock(AspectDao.class), mockEventProducer, false, mock(PreProcessHooks.class), 0, true); + + // Create test aspects + oldAspect = new Status().setRemoved(false); + newAspect = new Status().setRemoved(true); + + testMCP = + new MetadataChangeProposal() + .setEntityUrn(TEST_URN) + .setEntityType(TEST_URN.getEntityType()) + .setAspectName(STATUS_ASPECT_NAME) + .setAspect(GenericRecordUtils.serializeAspect(newAspect)); + + when(mockEventProducer.produceMetadataChangeLog( + any(OperationContext.class), any(), any(), any())) + .thenReturn(CompletableFuture.completedFuture(null)); + } + @Test public void testApplyUpsertNoOp() throws Exception { // Set up initial system metadata @@ -293,4 +348,161 @@ public void testApplyUpsertNullVersionException() { assertEquals(changeMCP.getNextAspectVersion(), 1); assertEquals(changeMCP.getSystemMetadata().getVersion(), "1"); } + + @Test + public void testNoMCLWhenSystemMetadataIsNoOp() { + // Arrange + SystemMetadata systemMetadata = SystemMetadataUtils.createDefaultSystemMetadata(); + SystemMetadataUtils.setNoOp(systemMetadata, true); // Makes it a no-op + + // Act + Optional, Boolean>> result = + entityService.conditionallyProduceMCLAsync( + opContext, + oldAspect, + null, // oldSystemMetadata + newAspect, + systemMetadata, + testMCP, + TEST_URN, + TEST_AUDIT_STAMP, + opContext + .getEntityRegistry() + .getEntitySpec(TEST_URN.getEntityType()) + .getAspectSpec(STATUS_ASPECT_NAME)); + + // Assert + assertFalse(result.isPresent(), "Should not produce MCL when system metadata is no-op"); + 
verify(mockEventProducer, never()).produceMetadataChangeLog(any(), any(), any()); + } + + @Test + public void testNoMCLWhenAspectsAreEqual() { + // Arrange + RecordTemplate sameAspect = newAspect; + + // Act + Optional, Boolean>> result = + entityService.conditionallyProduceMCLAsync( + opContext, + sameAspect, + null, // oldSystemMetadata + sameAspect, + SystemMetadataUtils.createDefaultSystemMetadata(), + testMCP, + TEST_URN, + TEST_AUDIT_STAMP, + opContext + .getEntityRegistry() + .getEntitySpec(TEST_URN.getEntityType()) + .getAspectSpec(STATUS_ASPECT_NAME)); + + // Assert + assertFalse(result.isPresent(), "Should not produce MCL when aspects are equal"); + verify(mockEventProducer, never()).produceMetadataChangeLog(any(), any(), any()); + } + + @Test + public void testProducesMCLWhenChangesExist() { + // Arrange + SystemMetadata systemMetadata = SystemMetadataUtils.createDefaultSystemMetadata(); + SystemMetadataUtils.setNoOp(systemMetadata, false); // Makes it not a no-op + + // Act + Optional, Boolean>> result = + entityService.conditionallyProduceMCLAsync( + opContext, + oldAspect, + null, // oldSystemMetadata + newAspect, + systemMetadata, + testMCP, + TEST_URN, + TEST_AUDIT_STAMP, + opContext + .getEntityRegistry() + .getEntitySpec(TEST_URN.getEntityType()) + .getAspectSpec(STATUS_ASPECT_NAME)); + + // Assert + assertTrue(result.isPresent(), "Should produce MCL when changes exist"); + verify(mockEventProducer, times(1)) + .produceMetadataChangeLog(any(OperationContext.class), any(), any(), any()); + } + + @Test + public void testAlwaysEmitChangeLogFlag() { + // Arrange + entityService = + new EntityServiceImpl( + mock(AspectDao.class), + mockEventProducer, + true, // alwaysEmitChangeLog set to true + mock(PreProcessHooks.class), + 0, + true); + + RecordTemplate sameAspect = newAspect; + + // Act + Optional, Boolean>> result = + entityService.conditionallyProduceMCLAsync( + opContext, + sameAspect, + null, // oldSystemMetadata + sameAspect, // Same aspect + 
SystemMetadataUtils.createDefaultSystemMetadata(), + testMCP, + TEST_URN, + TEST_AUDIT_STAMP, + opContext + .getEntityRegistry() + .getEntitySpec(TEST_URN.getEntityType()) + .getAspectSpec(STATUS_ASPECT_NAME)); + + // Assert + assertTrue( + result.isPresent(), + "Should produce MCL when alwaysEmitChangeLog is true, regardless of no-op status"); + verify(mockEventProducer, times(1)) + .produceMetadataChangeLog(any(OperationContext.class), any(), any(), any()); + } + + @Test + public void testAspectWithLineageRelationship() { + // Arrange + Urn datasetUrn = + UrnUtils.getUrn( + "urn:li:dataset:(urn:li:dataPlatform:test,testAspectWithLineageRelationship,PROD)"); + UpstreamLineage sameLineageAspect = new UpstreamLineage(); + MetadataChangeProposal datasetMCP = + new MetadataChangeProposal() + .setEntityUrn(datasetUrn) + .setEntityType(datasetUrn.getEntityType()) + .setAspectName(UPSTREAM_LINEAGE_ASPECT_NAME) + .setAspect(GenericRecordUtils.serializeAspect(sameLineageAspect)); + + // Act + Optional, Boolean>> result = + entityService.conditionallyProduceMCLAsync( + opContext, + sameLineageAspect, + null, // oldSystemMetadata + sameLineageAspect, // Same aspect + SystemMetadataUtils.createDefaultSystemMetadata(), + datasetMCP, + datasetUrn, + TEST_AUDIT_STAMP, + opContext + .getEntityRegistry() + .getEntitySpec(datasetUrn.getEntityType()) + .getAspectSpec(UPSTREAM_LINEAGE_ASPECT_NAME)); + + // Assert + assertTrue( + result.isPresent(), + "Should produce MCL when aspect has lineage relationship, regardless of no-op status"); + verify(mockEventProducer, times(1)) + .produceMetadataChangeLog(any(OperationContext.class), any(), any(), any()); + } } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/entity/EntityServiceTest.java b/metadata-io/src/test/java/com/linkedin/metadata/entity/EntityServiceTest.java index 49d665654db7b..426446a0c3178 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/entity/EntityServiceTest.java +++ 
b/metadata-io/src/test/java/com/linkedin/metadata/entity/EntityServiceTest.java @@ -1352,6 +1352,7 @@ public void testIngestGetLatestAspect() throws AssertionError { EntityAspect readAspectDao2 = _aspectDao.getAspect(entityUrn.toString(), aspectName, 0); assertTrue(DataTemplateUtil.areEqual(writeAspect2, readAspect2)); + SystemMetadataUtils.setNoOp(expectedMetadata2, false); assertTrue( DataTemplateUtil.areEqual( SystemMetadataUtils.parseSystemMetadata(readAspectDao2.getSystemMetadata()), @@ -1393,7 +1394,9 @@ public void testIngestGetLatestEnvelopedAspect() throws Exception { SystemMetadata metadata2 = AspectGenerationUtils.createSystemMetadata(1635792689, "run-456", null, "2"); SystemMetadata expectedMetadata2 = - AspectGenerationUtils.createSystemMetadata(1635792689, "run-456", "run-123", "2"); + SystemMetadataUtils.setNoOp( + AspectGenerationUtils.createSystemMetadata(1635792689, "run-456", "run-123", "2"), + false); List items = List.of( @@ -1594,6 +1597,7 @@ public void testIngestSameAspect() throws AssertionError { SystemMetadataUtils.parseSystemMetadata(readAspectDao2.getSystemMetadata()), metadata1)); + SystemMetadataUtils.setNoOp(expectedMetadata2, true); assertTrue( DataTemplateUtil.areEqual( SystemMetadataUtils.parseSystemMetadata(readAspectDao2.getSystemMetadata()), diff --git a/metadata-io/src/test/java/com/linkedin/metadata/trace/TraceServiceImplTest.java b/metadata-io/src/test/java/com/linkedin/metadata/trace/TraceServiceImplTest.java index 1ec575e5567a9..32631eb247b1d 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/trace/TraceServiceImplTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/trace/TraceServiceImplTest.java @@ -25,6 +25,7 @@ import com.linkedin.metadata.entity.EntityService; import com.linkedin.metadata.run.AspectRowSummary; import com.linkedin.metadata.systemmetadata.SystemMetadataService; +import com.linkedin.metadata.utils.SystemMetadataUtils; import com.linkedin.mxe.FailedMetadataChangeProposal; import 
com.linkedin.mxe.MetadataChangeProposal; import com.linkedin.mxe.SystemMetadata; @@ -347,4 +348,48 @@ public void testTraceWithFailedMessage() throws Exception { "java.lang.IllegalArgumentException"); assertFalse(status.isSuccess()); } + + @Test + public void testTraceWithNoOpState() throws Exception { + // Arrange + Map> aspectNames = + Collections.singletonMap(TEST_URN, Collections.singletonList(ASPECT_NAME)); + + // Create system metadata with NO_OP state + SystemMetadata systemMetadata = new SystemMetadata(); + systemMetadata.setProperties( + new StringMap(Map.of(TraceContext.TELEMETRY_TRACE_KEY, TEST_TRACE_ID))); + SystemMetadataUtils.setNoOp(systemMetadata, true); // Set NO_OP flag + + // Create enveloped aspect with NO_OP system metadata + EnvelopedAspect envelopedAspect = new EnvelopedAspect(); + envelopedAspect.setCreated(new AuditStamp().setTime(Instant.now().toEpochMilli())); + envelopedAspect.setSystemMetadata(systemMetadata); + + // Set up entity response + EntityResponse entityResponse = new EntityResponse(); + entityResponse.setAspects( + new EnvelopedAspectMap(Collections.singletonMap(ASPECT_NAME, envelopedAspect))); + entityResponse.setEntityName(TEST_URN.getEntityType()); + entityResponse.setUrn(TEST_URN); + + // Mock entity service response + when(entityService.getEntitiesV2(any(), anyString(), anySet(), anySet(), anyBoolean())) + .thenReturn(Collections.singletonMap(TEST_URN, entityResponse)); + + // Act + Map> result = + traceService.trace(operationContext, TEST_TRACE_ID, aspectNames, false, false); + + // Assert + assertNotNull(result); + assertTrue(result.containsKey(TEST_URN)); + Map urnStatus = result.get(TEST_URN); + assertTrue(urnStatus.containsKey(ASPECT_NAME)); + + TraceStatus status = urnStatus.get(ASPECT_NAME); + assertEquals(status.getPrimaryStorage().getWriteStatus(), TraceWriteStatus.NO_OP); + assertEquals(status.getSearchStorage().getWriteStatus(), TraceWriteStatus.NO_OP); + assertTrue(status.isSuccess()); + } } From 
8dfd8fbbaf5bdeff787294c7f0823126423de05c Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Mon, 24 Feb 2025 11:57:51 -0800 Subject: [PATCH 07/45] feat(ingest): migrate Cassandra source to new SDK (#12695) --- metadata-ingestion/scripts/avro_codegen.py | 2 +- .../src/datahub/ingestion/run/pipeline.py | 5 + .../ingestion/source/cassandra/cassandra.py | 385 ++- .../source/cassandra/cassandra_api.py | 15 +- .../src/datahub/sdk/__init__.py | 41 +- metadata-ingestion/src/datahub/sdk/_entity.py | 19 +- .../src/datahub/sdk/container.py | 4 +- metadata-ingestion/src/datahub/sdk/dataset.py | 8 +- .../cassandra/cassandra_mcps_golden.json | 2186 +++++++++-------- .../integration/cassandra/docker-compose.yml | 3 +- .../integration/cassandra/test_cassandra.py | 28 +- 11 files changed, 1435 insertions(+), 1261 deletions(-) diff --git a/metadata-ingestion/scripts/avro_codegen.py b/metadata-ingestion/scripts/avro_codegen.py index 7e75cba983381..2fe2729349944 100644 --- a/metadata-ingestion/scripts/avro_codegen.py +++ b/metadata-ingestion/scripts/avro_codegen.py @@ -714,7 +714,7 @@ def from_key_aspect(cls, key_aspect: "{key_aspect_class}") -> "{class_name}": code += f""" @property def {field_name(field)}(self) -> {field_type(field)}: - return self.entity_ids[{i}] + return self._entity_ids[{i}] """ return code diff --git a/metadata-ingestion/src/datahub/ingestion/run/pipeline.py b/metadata-ingestion/src/datahub/ingestion/run/pipeline.py index ea266f67a9c3d..e43693f8ac9d7 100644 --- a/metadata-ingestion/src/datahub/ingestion/run/pipeline.py +++ b/metadata-ingestion/src/datahub/ingestion/run/pipeline.py @@ -43,6 +43,7 @@ SystemMetadataTransformer, ) from datahub.ingestion.transformer.transform_registry import transform_registry +from datahub.sdk._attribution import KnownAttribution, change_default_attribution from datahub.telemetry import stats from datahub.telemetry.telemetry import telemetry_instance from datahub.utilities._custom_package_loader import model_version_name @@ -410,6 
+411,10 @@ def run(self) -> None: ) ) + self.exit_stack.enter_context( + change_default_attribution(KnownAttribution.INGESTION) + ) + self.final_status = PipelineStatus.UNKNOWN self._notify_reporters_on_ingestion_start() callback = None diff --git a/metadata-ingestion/src/datahub/ingestion/source/cassandra/cassandra.py b/metadata-ingestion/src/datahub/ingestion/source/cassandra/cassandra.py index 062c64d45767f..9966d333fdc17 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/cassandra/cassandra.py +++ b/metadata-ingestion/src/datahub/ingestion/source/cassandra/cassandra.py @@ -1,19 +1,14 @@ import dataclasses import json import logging -from typing import Any, Dict, Iterable, List, Optional +from typing import Any, Dict, Iterable, List, Optional, Union from datahub.emitter.mce_builder import ( - make_data_platform_urn, - make_dataplatform_instance_urn, make_dataset_urn_with_platform_instance, make_schema_field_urn, ) -from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.emitter.mcp_builder import ( ContainerKey, - add_dataset_to_container, - gen_containers, ) from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.api.decorators import ( @@ -31,6 +26,7 @@ CassandraColumn, CassandraEntities, CassandraKeyspace, + CassandraSharedDatasetFields, CassandraTable, CassandraView, ) @@ -51,24 +47,21 @@ from datahub.ingestion.source.state.stateful_ingestion_base import ( StatefulIngestionSourceBase, ) -from datahub.metadata.com.linkedin.pegasus2avro.common import StatusClass from datahub.metadata.com.linkedin.pegasus2avro.schema import ( SchemaField, - SchemaMetadata, ) from datahub.metadata.schema_classes import ( - DataPlatformInstanceClass, DatasetLineageTypeClass, - DatasetPropertiesClass, FineGrainedLineageClass, FineGrainedLineageDownstreamTypeClass, FineGrainedLineageUpstreamTypeClass, - OtherSchemaClass, - SubTypesClass, UpstreamClass, UpstreamLineageClass, ViewPropertiesClass, ) +from datahub.sdk._entity import 
Entity +from datahub.sdk.container import Container +from datahub.sdk.dataset import Dataset logger = logging.getLogger(__name__) @@ -133,6 +126,13 @@ def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: def get_workunits_internal( self, ) -> Iterable[MetadataWorkUnit]: + for metadata in self._get_metadata(): + if isinstance(metadata, MetadataWorkUnit): + yield metadata + else: + yield from metadata.as_workunits() + + def _get_metadata(self) -> Iterable[Union[MetadataWorkUnit, Entity]]: if not self.cassandra_api.authenticate(): return keyspaces: List[CassandraKeyspace] = self.cassandra_api.get_keyspaces() @@ -145,7 +145,7 @@ def get_workunits_internal( self.report.report_dropped(keyspace_name) continue - yield from self._generate_keyspace_container(keyspace) + yield self._generate_keyspace_container(keyspace) try: yield from self._extract_tables_from_keyspace(keyspace_name) @@ -170,21 +170,20 @@ def get_workunits_internal( if self.config.is_profiling_enabled(): yield from self.profiler.get_workunits(self.cassandra_data) - def _generate_keyspace_container( - self, keyspace: CassandraKeyspace - ) -> Iterable[MetadataWorkUnit]: + def _generate_keyspace_container(self, keyspace: CassandraKeyspace) -> Container: keyspace_container_key = self._generate_keyspace_container_key( keyspace.keyspace_name ) - yield from gen_containers( - container_key=keyspace_container_key, - name=keyspace.keyspace_name, + + return Container( + keyspace_container_key, + display_name=keyspace.keyspace_name, qualified_name=keyspace.keyspace_name, + subtype=DatasetContainerSubTypes.KEYSPACE, extra_properties={ "durable_writes": str(keyspace.durable_writes), "replication": json.dumps(keyspace.replication), }, - sub_types=[DatasetContainerSubTypes.KEYSPACE], ) def _generate_keyspace_container_key(self, keyspace_name: str) -> ContainerKey: @@ -196,105 +195,55 @@ def _generate_keyspace_container_key(self, keyspace_name: str) -> ContainerKey: ) # get all tables for a given 
keyspace, iterate over them to extract column metadata - def _extract_tables_from_keyspace( - self, keyspace_name: str - ) -> Iterable[MetadataWorkUnit]: + def _extract_tables_from_keyspace(self, keyspace_name: str) -> Iterable[Dataset]: self.cassandra_data.keyspaces.append(keyspace_name) tables: List[CassandraTable] = self.cassandra_api.get_tables(keyspace_name) for table in tables: - # define the dataset urn for this table to be used downstream - table_name: str = table.table_name - dataset_name: str = f"{keyspace_name}.{table_name}" - - if not self.config.table_pattern.allowed(dataset_name): - self.report.report_dropped(dataset_name) - continue - - self.cassandra_data.tables.setdefault(keyspace_name, []).append(table_name) - self.report.report_entity_scanned(dataset_name, ent_type="Table") - - dataset_urn = make_dataset_urn_with_platform_instance( - platform=self.platform, - name=dataset_name, - env=self.config.env, - platform_instance=self.config.platform_instance, + dataset = self._generate_table(keyspace_name, table) + if dataset: + yield dataset + + def _generate_table( + self, keyspace_name: str, table: CassandraTable + ) -> Optional[Dataset]: + table_name: str = table.table_name + dataset_name: str = f"{keyspace_name}.{table_name}" + + self.report.report_entity_scanned(dataset_name, ent_type="Table") + if not self.config.table_pattern.allowed(dataset_name): + self.report.report_dropped(dataset_name) + return None + + self.cassandra_data.tables.setdefault(keyspace_name, []).append(table_name) + + schema_fields = None + try: + schema_fields = self._extract_columns_from_table(keyspace_name, table_name) + except Exception as e: + self.report.failure( + message="Failed to extract columns from table", + context=dataset_name, + exc=e, ) - # 1. Extract columns from table, then construct and emit the schemaMetadata aspect. 
- try: - yield from self._extract_columns_from_table( - keyspace_name, table_name, dataset_urn - ) - except Exception as e: - self.report.failure( - message="Failed to extract columns from table", - context=table_name, - exc=e, - ) - - yield MetadataChangeProposalWrapper( - entityUrn=dataset_urn, - aspect=StatusClass(removed=False), - ).as_workunit() - - yield MetadataChangeProposalWrapper( - entityUrn=dataset_urn, - aspect=SubTypesClass( - typeNames=[ - DatasetSubTypes.TABLE, - ] - ), - ).as_workunit() - - yield MetadataChangeProposalWrapper( - entityUrn=dataset_urn, - aspect=DatasetPropertiesClass( - name=table_name, - qualifiedName=f"{keyspace_name}.{table_name}", - description=table.comment, - customProperties={ - "bloom_filter_fp_chance": str(table.bloom_filter_fp_chance), - "caching": json.dumps(table.caching), - "compaction": json.dumps(table.compaction), - "compression": json.dumps(table.compression), - "crc_check_chance": str(table.crc_check_chance), - "dclocal_read_repair_chance": str( - table.dclocal_read_repair_chance - ), - "default_time_to_live": str(table.default_time_to_live), - "extensions": json.dumps(table.extensions), - "gc_grace_seconds": str(table.gc_grace_seconds), - "max_index_interval": str(table.max_index_interval), - "min_index_interval": str(table.min_index_interval), - "memtable_flush_period_in_ms": str( - table.memtable_flush_period_in_ms - ), - "read_repair_chance": str(table.read_repair_chance), - "speculative_retry": str(table.speculative_retry), - }, - ), - ).as_workunit() - - yield from add_dataset_to_container( - container_key=self._generate_keyspace_container_key(keyspace_name), - dataset_urn=dataset_urn, - ) - - if self.config.platform_instance: - yield MetadataChangeProposalWrapper( - entityUrn=dataset_urn, - aspect=DataPlatformInstanceClass( - platform=make_data_platform_urn(self.platform), - instance=make_dataplatform_instance_urn( - self.platform, self.config.platform_instance - ), - ), - ).as_workunit() + return Dataset( + 
platform=self.platform, + name=dataset_name, + env=self.config.env, + platform_instance=self.config.platform_instance, + subtype=DatasetSubTypes.TABLE, + parent_container=self._generate_keyspace_container_key(keyspace_name), + schema=schema_fields, + display_name=table_name, + qualified_name=dataset_name, + description=table.comment, + custom_properties=self._get_dataset_custom_props(table), + ) # get all columns for a given table, iterate over them to extract column metadata def _extract_columns_from_table( - self, keyspace_name: str, table_name: str, dataset_urn: str - ) -> Iterable[MetadataWorkUnit]: + self, keyspace_name: str, table_name: str + ) -> Optional[List[SchemaField]]: column_infos: List[CassandraColumn] = self.cassandra_api.get_columns( keyspace_name, table_name ) @@ -305,147 +254,117 @@ def _extract_columns_from_table( self.report.report_warning( message="Table has no columns, skipping", context=table_name ) - return + return None + # Tricky: we also save the column info to a global store. 
jsonable_column_infos: List[Dict[str, Any]] = [] for column in column_infos: self.cassandra_data.columns.setdefault(table_name, []).append(column) jsonable_column_infos.append(dataclasses.asdict(column)) - schema_metadata: SchemaMetadata = SchemaMetadata( - schemaName=table_name, - platform=make_data_platform_urn(self.platform), - version=0, - hash="", - platformSchema=OtherSchemaClass( - rawSchema=json.dumps(jsonable_column_infos) - ), - fields=schema_fields, - ) - - yield MetadataChangeProposalWrapper( - entityUrn=dataset_urn, - aspect=schema_metadata, - ).as_workunit() + return schema_fields - def _extract_views_from_keyspace( - self, keyspace_name: str - ) -> Iterable[MetadataWorkUnit]: + def _extract_views_from_keyspace(self, keyspace_name: str) -> Iterable[Dataset]: views: List[CassandraView] = self.cassandra_api.get_views(keyspace_name) for view in views: - view_name: str = view.view_name - dataset_name: str = f"{keyspace_name}.{view_name}" - self.report.report_entity_scanned(dataset_name) - dataset_urn: str = make_dataset_urn_with_platform_instance( - platform=self.platform, - name=dataset_name, - env=self.config.env, - platform_instance=self.config.platform_instance, + dataset = self._generate_view(keyspace_name, view) + if dataset: + yield dataset + + def _generate_view( + self, keyspace_name: str, view: CassandraView + ) -> Optional[Dataset]: + view_name: str = view.view_name + dataset_name: str = f"{keyspace_name}.{view_name}" + + self.report.report_entity_scanned(dataset_name, ent_type="View") + if not self.config.table_pattern.allowed(dataset_name): + # TODO: Maybe add a view_pattern instead of reusing table_pattern? 
+ self.report.report_dropped(dataset_name) + return None + + schema_fields = None + try: + schema_fields = self._extract_columns_from_table(keyspace_name, view_name) + except Exception as e: + self.report.failure( + message="Failed to extract columns from views", + context=view_name, + exc=e, ) - yield MetadataChangeProposalWrapper( - entityUrn=dataset_urn, - aspect=StatusClass(removed=False), - ).as_workunit() - - yield MetadataChangeProposalWrapper( - entityUrn=dataset_urn, - aspect=SubTypesClass( - typeNames=[ - DatasetSubTypes.VIEW, - ] - ), - ).as_workunit() - - yield MetadataChangeProposalWrapper( - entityUrn=dataset_urn, - aspect=ViewPropertiesClass( + dataset = Dataset( + platform=self.platform, + name=dataset_name, + env=self.config.env, + platform_instance=self.config.platform_instance, + subtype=DatasetSubTypes.VIEW, + parent_container=self._generate_keyspace_container_key(keyspace_name), + schema=schema_fields, + display_name=view_name, + qualified_name=dataset_name, + description=view.comment, + custom_properties=self._get_dataset_custom_props(view), + extra_aspects=[ + ViewPropertiesClass( materialized=True, viewLogic=view.where_clause, # Use the WHERE clause as view logic viewLanguage="CQL", # Use "CQL" as the language ), - ).as_workunit() - - yield MetadataChangeProposalWrapper( - entityUrn=dataset_urn, - aspect=DatasetPropertiesClass( - name=view_name, - qualifiedName=f"{keyspace_name}.{view_name}", - description=view.comment, - customProperties={ - "bloom_filter_fp_chance": str(view.bloom_filter_fp_chance), - "caching": json.dumps(view.caching), - "compaction": json.dumps(view.compaction), - "compression": json.dumps(view.compression), - "crc_check_chance": str(view.crc_check_chance), - "include_all_columns": str(view.include_all_columns), - "dclocal_read_repair_chance": str( - view.dclocal_read_repair_chance - ), - "default_time_to_live": str(view.default_time_to_live), - "extensions": json.dumps(view.extensions), - "gc_grace_seconds": 
str(view.gc_grace_seconds), - "max_index_interval": str(view.max_index_interval), - "min_index_interval": str(view.min_index_interval), - "memtable_flush_period_in_ms": str( - view.memtable_flush_period_in_ms - ), - "read_repair_chance": str(view.read_repair_chance), - "speculative_retry": str(view.speculative_retry), - }, - ), - ).as_workunit() + ], + ) - try: - yield from self._extract_columns_from_table( - keyspace_name, view_name, dataset_urn - ) - except Exception as e: - self.report.failure( - message="Failed to extract columns from views", - context=view_name, - exc=e, + # Construct and emit lineage off of 'base_table_name' + # NOTE: we don't need to use 'base_table_id' since table is always in same keyspace, see https://docs.datastax.com/en/cql-oss/3.3/cql/cql_reference/cqlCreateMaterializedView.html#cqlCreateMaterializedView__keyspace-name + upstream_urn: str = make_dataset_urn_with_platform_instance( + platform=self.platform, + name=f"{keyspace_name}.{view.base_table_name}", + env=self.config.env, + platform_instance=self.config.platform_instance, + ) + fineGrainedLineages = self.get_upstream_fields_of_field_in_datasource( + view_name, str(dataset.urn), upstream_urn + ) + upstream_lineage = UpstreamLineageClass( + upstreams=[ + UpstreamClass( + dataset=upstream_urn, + type=DatasetLineageTypeClass.VIEW, ) + ], + fineGrainedLineages=fineGrainedLineages, + ) - # Construct and emit lineage off of 'base_table_name' - # NOTE: we don't need to use 'base_table_id' since table is always in same keyspace, see https://docs.datastax.com/en/cql-oss/3.3/cql/cql_reference/cqlCreateMaterializedView.html#cqlCreateMaterializedView__keyspace-name - upstream_urn: str = make_dataset_urn_with_platform_instance( - platform=self.platform, - name=f"{keyspace_name}.{view.table_name}", - env=self.config.env, - platform_instance=self.config.platform_instance, - ) - fineGrainedLineages = self.get_upstream_fields_of_field_in_datasource( - view_name, dataset_urn, upstream_urn - ) - 
yield MetadataChangeProposalWrapper( - entityUrn=dataset_urn, - aspect=UpstreamLineageClass( - upstreams=[ - UpstreamClass( - dataset=upstream_urn, - type=DatasetLineageTypeClass.VIEW, - ) - ], - fineGrainedLineages=fineGrainedLineages, - ), - ).as_workunit() - - yield from add_dataset_to_container( - container_key=self._generate_keyspace_container_key(keyspace_name), - dataset_urn=dataset_urn, + dataset.set_upstreams(upstream_lineage) + + return dataset + + def _get_dataset_custom_props( + self, dataset: CassandraSharedDatasetFields + ) -> Dict[str, str]: + props = { + "bloom_filter_fp_chance": str(dataset.bloom_filter_fp_chance), + "caching": json.dumps(dataset.caching), + "compaction": json.dumps(dataset.compaction), + "compression": json.dumps(dataset.compression), + "crc_check_chance": str(dataset.crc_check_chance), + "dclocal_read_repair_chance": str(dataset.dclocal_read_repair_chance), + "default_time_to_live": str(dataset.default_time_to_live), + "extensions": json.dumps(dataset.extensions), + "gc_grace_seconds": str(dataset.gc_grace_seconds), + "max_index_interval": str(dataset.max_index_interval), + "min_index_interval": str(dataset.min_index_interval), + "memtable_flush_period_in_ms": str(dataset.memtable_flush_period_in_ms), + "read_repair_chance": str(dataset.read_repair_chance), + "speculative_retry": str(dataset.speculative_retry), + } + if isinstance(dataset, CassandraView): + props.update( + { + "include_all_columns": str(dataset.include_all_columns), + } ) - - if self.config.platform_instance: - yield MetadataChangeProposalWrapper( - entityUrn=dataset_urn, - aspect=DataPlatformInstanceClass( - platform=make_data_platform_urn(self.platform), - instance=make_dataplatform_instance_urn( - self.platform, self.config.platform_instance - ), - ), - ).as_workunit() + return props def get_upstream_fields_of_field_in_datasource( self, table_name: str, dataset_urn: str, upstream_urn: str diff --git 
a/metadata-ingestion/src/datahub/ingestion/source/cassandra/cassandra_api.py b/metadata-ingestion/src/datahub/ingestion/source/cassandra/cassandra_api.py index 4cf0613762aab..c1a813eb6ee34 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/cassandra/cassandra_api.py +++ b/metadata-ingestion/src/datahub/ingestion/source/cassandra/cassandra_api.py @@ -23,9 +23,9 @@ class CassandraKeyspace: @dataclass -class CassandraTable: +class CassandraSharedDatasetFields: keyspace_name: str - table_name: str + bloom_filter_fp_chance: Optional[float] caching: Optional[Dict[str, str]] comment: Optional[str] @@ -43,6 +43,11 @@ class CassandraTable: speculative_retry: Optional[str] +@dataclass +class CassandraTable(CassandraSharedDatasetFields): + table_name: str + + @dataclass class CassandraColumn: keyspace_name: str @@ -55,8 +60,10 @@ class CassandraColumn: @dataclass -class CassandraView(CassandraTable): +class CassandraView(CassandraSharedDatasetFields): view_name: str + + base_table_name: str include_all_columns: Optional[bool] where_clause: str = "" @@ -261,7 +268,7 @@ def get_views(self, keyspace_name: str) -> List[CassandraView]: views = self.get(CassandraQueries.GET_VIEWS_QUERY, [keyspace_name]) view_list = [ CassandraView( - table_name=row.base_table_name, + base_table_name=row.base_table_name, keyspace_name=row.keyspace_name, view_name=row.view_name, bloom_filter_fp_chance=row.bloom_filter_fp_chance, diff --git a/metadata-ingestion/src/datahub/sdk/__init__.py b/metadata-ingestion/src/datahub/sdk/__init__.py index 54bd18c323047..ec7ecf4ce0688 100644 --- a/metadata-ingestion/src/datahub/sdk/__init__.py +++ b/metadata-ingestion/src/datahub/sdk/__init__.py @@ -1,7 +1,7 @@ -import warnings +import types import datahub.metadata.schema_classes as models -from datahub.errors import ExperimentalWarning, SdkUsageError +from datahub.errors import SdkUsageError from datahub.ingestion.graph.config import DatahubClientConfig from datahub.metadata.urns import ( ChartUrn, @@ 
-21,13 +21,30 @@ from datahub.sdk.dataset import Dataset from datahub.sdk.main_client import DataHubClient -warnings.warn( - "The new datahub SDK (e.g. datahub.sdk.*) is experimental. " - "Our typical backwards-compatibility and stability guarantees do not apply to this code. " - "When it's promoted to stable, the import path will change " - "from `from datahub.sdk import ...` to `from datahub import ...`.", - ExperimentalWarning, - stacklevel=2, -) -del warnings -del ExperimentalWarning +# We want to print out the warning if people do `from datahub.sdk import X`. +# But we don't want to print out warnings if they're doing a more direct +# import like `from datahub.sdk.container import Container`, since that's +# what our internal code does. +_vars = {} +for _name, _value in list(locals().items()): + if not _name.startswith("_") and ( + _name == "models" or not isinstance(_value, types.ModuleType) + ): + _vars[_name] = _value + del locals()[_name] + + +def __getattr__(name): + import warnings + + from datahub.errors import ExperimentalWarning + + warnings.warn( + "The new datahub SDK (e.g. datahub.sdk.*) is experimental. " + "Our typical backwards-compatibility and stability guarantees do not apply to this code. 
" + "When it's promoted to stable, the import path will change " + "from `from datahub.sdk import ...` to `from datahub import ...`.", + ExperimentalWarning, + stacklevel=2, + ) + return _vars[name] diff --git a/metadata-ingestion/src/datahub/sdk/_entity.py b/metadata-ingestion/src/datahub/sdk/_entity.py index 071affc27eff2..f5887e4e0fb80 100644 --- a/metadata-ingestion/src/datahub/sdk/_entity.py +++ b/metadata-ingestion/src/datahub/sdk/_entity.py @@ -1,5 +1,7 @@ +from __future__ import annotations + import abc -from typing import List, Optional, Type, Union +from typing import TYPE_CHECKING, List, Optional, Type, Union from typing_extensions import Self @@ -10,6 +12,12 @@ from datahub.metadata.urns import Urn from datahub.utilities.urns._urn_base import _SpecificUrn +if TYPE_CHECKING: + from datahub.ingestion.api.workunit import MetadataWorkUnit + + +ExtraAspectsType = Union[None, List[AspectTypeVar]] + class Entity: __slots__ = ("_urn", "_prev_aspects", "_aspects") @@ -87,5 +95,14 @@ def _as_mcps( ) return mcps + def as_workunits(self) -> List[MetadataWorkUnit]: + return [mcp.as_workunit() for mcp in self._as_mcps()] + + def _set_extra_aspects(self, extra_aspects: ExtraAspectsType) -> None: + # TODO: Add validation to ensure that an "extra aspect" does not conflict + # with / get overridden by a standard aspect. 
+ for aspect in extra_aspects or []: + self._set_aspect(aspect) + def __repr__(self) -> str: return f"{self.__class__.__name__}('{self.urn}')" diff --git a/metadata-ingestion/src/datahub/sdk/container.py b/metadata-ingestion/src/datahub/sdk/container.py index e9ecc1989e995..ec4d6521c6088 100644 --- a/metadata-ingestion/src/datahub/sdk/container.py +++ b/metadata-ingestion/src/datahub/sdk/container.py @@ -16,7 +16,7 @@ ContainerUrn, Urn, ) -from datahub.sdk._entity import Entity +from datahub.sdk._entity import Entity, ExtraAspectsType from datahub.sdk._shared import ( DomainInputType, HasContainer, @@ -74,6 +74,7 @@ def __init__( tags: Optional[TagsInputType] = None, terms: Optional[TermsInputType] = None, domain: Optional[DomainInputType] = None, + extra_aspects: ExtraAspectsType = None, ): # Hack: while the type annotations say container_key is always a ContainerKey, # we allow ContainerUrn to make the graph-based constructor work. @@ -82,6 +83,7 @@ def __init__( else: urn = ContainerUrn.from_string(container_key.as_urn()) super().__init__(urn) + self._set_extra_aspects(extra_aspects) # This needs to come first to ensure that the display name is registered. 
self._ensure_container_props(name=display_name) diff --git a/metadata-ingestion/src/datahub/sdk/dataset.py b/metadata-ingestion/src/datahub/sdk/dataset.py index bb7306a1acc1c..6d241627e58d1 100644 --- a/metadata-ingestion/src/datahub/sdk/dataset.py +++ b/metadata-ingestion/src/datahub/sdk/dataset.py @@ -2,7 +2,7 @@ import warnings from datetime import datetime -from typing import Dict, List, Optional, Tuple, Type, Union +from typing import Dict, List, Optional, Sequence, Tuple, Type, Union from typing_extensions import Self, TypeAlias, assert_never @@ -18,7 +18,7 @@ from datahub.ingestion.source.sql.sql_types import resolve_sql_type from datahub.metadata.urns import DatasetUrn, SchemaFieldUrn, Urn from datahub.sdk._attribution import is_ingestion_attribution -from datahub.sdk._entity import Entity +from datahub.sdk._entity import Entity, ExtraAspectsType from datahub.sdk._shared import ( DatasetUrnOrStr, DomainInputType, @@ -47,7 +47,7 @@ models.SchemaFieldClass, ] SchemaFieldsInputType: TypeAlias = Union[ - List[SchemaFieldInputType], + Sequence[SchemaFieldInputType], models.SchemaMetadataClass, ] @@ -457,6 +457,7 @@ def __init__( terms: Optional[TermsInputType] = None, # TODO structured_properties domain: Optional[DomainInputType] = None, + extra_aspects: ExtraAspectsType = None, # Dataset-specific aspects. 
schema: Optional[SchemaFieldsInputType] = None, upstreams: Optional[models.UpstreamLineageClass] = None, @@ -468,6 +469,7 @@ def __init__( env=env, ) super().__init__(urn) + self._set_extra_aspects(extra_aspects) self._set_platform_instance(urn.platform, platform_instance) diff --git a/metadata-ingestion/tests/integration/cassandra/cassandra_mcps_golden.json b/metadata-ingestion/tests/integration/cassandra/cassandra_mcps_golden.json index 1823a218ada2e..fb0bca406b925 100644 --- a/metadata-ingestion/tests/integration/cassandra/cassandra_mcps_golden.json +++ b/metadata-ingestion/tests/integration/cassandra/cassandra_mcps_golden.json @@ -1,64 +1,66 @@ [ { "entityType": "container", - "entityUrn": "urn:li:container:e88cdfeb0f0ec790300527f9ea34ee05", + "entityUrn": "urn:li:container:9debadbaaa1a46f8ff193a388a363cfd", "changeType": "UPSERT", "aspectName": "containerProperties", "aspect": { "json": { "customProperties": { "platform": "cassandra", + "instance": "dev_instance", "env": "PROD", - "keyspace": "cass_test_1", + "keyspace": "example_keyspace", "durable_writes": "True", "replication": "{\"class\": \"org.apache.cassandra.locator.SimpleStrategy\", \"replication_factor\": \"1\"}" }, - "name": "cass_test_1", - "qualifiedName": "cass_test_1", + "name": "example_keyspace", + "qualifiedName": "example_keyspace", "env": "PROD" } }, "systemMetadata": { - "lastObserved": 1731579516869, + "lastObserved": 1739924675276, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "container", - "entityUrn": "urn:li:container:e88cdfeb0f0ec790300527f9ea34ee05", + "entityUrn": "urn:li:container:9debadbaaa1a46f8ff193a388a363cfd", "changeType": "UPSERT", - "aspectName": "dataPlatformInstance", + "aspectName": "status", "aspect": { "json": { - "platform": "urn:li:dataPlatform:cassandra" + "removed": false } }, "systemMetadata": { - "lastObserved": 1731309924399, + "lastObserved": 1739924675277, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { 
"entityType": "container", - "entityUrn": "urn:li:container:e88cdfeb0f0ec790300527f9ea34ee05", + "entityUrn": "urn:li:container:9debadbaaa1a46f8ff193a388a363cfd", "changeType": "UPSERT", - "aspectName": "status", + "aspectName": "dataPlatformInstance", "aspect": { "json": { - "removed": false + "platform": "urn:li:dataPlatform:cassandra", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:cassandra,dev_instance)" } }, "systemMetadata": { - "lastObserved": 1731309924399, + "lastObserved": 1739924675278, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "container", - "entityUrn": "urn:li:container:e88cdfeb0f0ec790300527f9ea34ee05", + "entityUrn": "urn:li:container:9debadbaaa1a46f8ff193a388a363cfd", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -69,174 +71,40 @@ } }, "systemMetadata": { - "lastObserved": 1731309924400, + "lastObserved": 1739924675280, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "container", - "entityUrn": "urn:li:container:e88cdfeb0f0ec790300527f9ea34ee05", + "entityUrn": "urn:li:container:9debadbaaa1a46f8ff193a388a363cfd", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { - "path": [] - } - }, - "systemMetadata": { - "lastObserved": 1731309924400, - "runId": "cassandra-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,cass_test_1.information,PROD)", - "changeType": "UPSERT", - "aspectName": "schemaMetadata", - "aspect": { - "json": { - "schemaName": "information", - "platform": "urn:li:dataPlatform:cassandra", - "version": 0, - "created": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "hash": "", - "platformSchema": { - "com.linkedin.schema.OtherSchema": { - "rawSchema": "[{\"keyspace_name\": \"cass_test_1\", \"table_name\": \"information\", 
\"column_name\": \"details\", \"type\": \"text\", \"clustering_order\": \"none\", \"kind\": \"regular\", \"position\": -1}, {\"keyspace_name\": \"cass_test_1\", \"table_name\": \"information\", \"column_name\": \"last_updated\", \"type\": \"timestamp\", \"clustering_order\": \"none\", \"kind\": \"regular\", \"position\": -1}, {\"keyspace_name\": \"cass_test_1\", \"table_name\": \"information\", \"column_name\": \"person_id\", \"type\": \"int\", \"clustering_order\": \"none\", \"kind\": \"partition_key\", \"position\": 0}]" - } - }, - "fields": [ - { - "fieldPath": "details", - "nullable": true, - "type": { - "type": { - "com.linkedin.schema.StringType": {} - } - }, - "nativeDataType": "text", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "last_updated", - "nullable": true, - "type": { - "type": { - "com.linkedin.schema.TimeType": {} - } - }, - "nativeDataType": "timestamp", - "recursive": false, - "isPartOfKey": false - }, + "path": [ { - "fieldPath": "person_id", - "nullable": true, - "type": { - "type": { - "com.linkedin.schema.NumberType": {} - } - }, - "nativeDataType": "int", - "recursive": false, - "isPartOfKey": false + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:cassandra,dev_instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:cassandra,dev_instance)" } ] } }, "systemMetadata": { - "lastObserved": 1731591019538, - "runId": "cassandra-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,cass_test_1.information,PROD)", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false - } - }, - "systemMetadata": { - "lastObserved": 1731309924405, - "runId": "cassandra-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,cass_test_1.information,PROD)", - "changeType": "UPSERT", - "aspectName": 
"subTypes", - "aspect": { - "json": { - "typeNames": [ - "Table" - ] - } - }, - "systemMetadata": { - "lastObserved": 1731309924405, - "runId": "cassandra-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,cass_test_1.information,PROD)", - "changeType": "UPSERT", - "aspectName": "datasetProperties", - "aspect": { - "json": { - "customProperties": { - "bloom_filter_fp_chance": "0.01", - "caching": "{\"keys\": \"ALL\", \"rows_per_partition\": \"NONE\"}", - "compaction": "{\"class\": \"org.apache.cassandra.db.compaction.SizeTieredCompactionStrategy\", \"max_threshold\": \"32\", \"min_threshold\": \"4\"}", - "compression": "{\"chunk_length_in_kb\": \"16\", \"class\": \"org.apache.cassandra.io.compress.LZ4Compressor\"}", - "crc_check_chance": "1.0", - "dclocal_read_repair_chance": "0.0", - "default_time_to_live": "0", - "extensions": "{}", - "gc_grace_seconds": "864000", - "max_index_interval": "2048", - "min_index_interval": "128", - "memtable_flush_period_in_ms": "0", - "read_repair_chance": "0.0", - "speculative_retry": "99p" - }, - "name": "information", - "qualifiedName": "cass_test_1.information", - "description": "", - "tags": [] - } - }, - "systemMetadata": { - "lastObserved": 1731591019540, + "lastObserved": 1739924675281, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.all_data_types,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.all_data_types,PROD)", "changeType": "UPSERT", "aspectName": "schemaMetadata", "aspect": { "json": { - "schemaName": "all_data_types", + "schemaName": "", "platform": "urn:li:dataPlatform:cassandra", "version": 0, "created": { @@ -249,9 +117,7 @@ }, "hash": "", "platformSchema": { - "com.linkedin.schema.OtherSchema": { - "rawSchema": "[{\"keyspace_name\": 
\"example_keyspace\", \"table_name\": \"all_data_types\", \"column_name\": \"ascii_column\", \"type\": \"ascii\", \"clustering_order\": \"none\", \"kind\": \"regular\", \"position\": -1}, {\"keyspace_name\": \"example_keyspace\", \"table_name\": \"all_data_types\", \"column_name\": \"bigint_column\", \"type\": \"bigint\", \"clustering_order\": \"none\", \"kind\": \"regular\", \"position\": -1}, {\"keyspace_name\": \"example_keyspace\", \"table_name\": \"all_data_types\", \"column_name\": \"blob_column\", \"type\": \"blob\", \"clustering_order\": \"none\", \"kind\": \"regular\", \"position\": -1}, {\"keyspace_name\": \"example_keyspace\", \"table_name\": \"all_data_types\", \"column_name\": \"boolean_column\", \"type\": \"boolean\", \"clustering_order\": \"none\", \"kind\": \"regular\", \"position\": -1}, {\"keyspace_name\": \"example_keyspace\", \"table_name\": \"all_data_types\", \"column_name\": \"date_column\", \"type\": \"date\", \"clustering_order\": \"none\", \"kind\": \"regular\", \"position\": -1}, {\"keyspace_name\": \"example_keyspace\", \"table_name\": \"all_data_types\", \"column_name\": \"decimal_column\", \"type\": \"decimal\", \"clustering_order\": \"none\", \"kind\": \"regular\", \"position\": -1}, {\"keyspace_name\": \"example_keyspace\", \"table_name\": \"all_data_types\", \"column_name\": \"double_column\", \"type\": \"double\", \"clustering_order\": \"none\", \"kind\": \"regular\", \"position\": -1}, {\"keyspace_name\": \"example_keyspace\", \"table_name\": \"all_data_types\", \"column_name\": \"float_column\", \"type\": \"float\", \"clustering_order\": \"none\", \"kind\": \"regular\", \"position\": -1}, {\"keyspace_name\": \"example_keyspace\", \"table_name\": \"all_data_types\", \"column_name\": \"frozen_list_column\", \"type\": \"frozen>\", \"clustering_order\": \"none\", \"kind\": \"regular\", \"position\": -1}, {\"keyspace_name\": \"example_keyspace\", \"table_name\": \"all_data_types\", \"column_name\": \"frozen_map_column\", \"type\": 
\"frozen>\", \"clustering_order\": \"none\", \"kind\": \"regular\", \"position\": -1}, {\"keyspace_name\": \"example_keyspace\", \"table_name\": \"all_data_types\", \"column_name\": \"frozen_set_column\", \"type\": \"frozen>\", \"clustering_order\": \"none\", \"kind\": \"regular\", \"position\": -1}, {\"keyspace_name\": \"example_keyspace\", \"table_name\": \"all_data_types\", \"column_name\": \"id\", \"type\": \"uuid\", \"clustering_order\": \"none\", \"kind\": \"partition_key\", \"position\": 0}, {\"keyspace_name\": \"example_keyspace\", \"table_name\": \"all_data_types\", \"column_name\": \"inet_column\", \"type\": \"inet\", \"clustering_order\": \"none\", \"kind\": \"regular\", \"position\": -1}, {\"keyspace_name\": \"example_keyspace\", \"table_name\": \"all_data_types\", \"column_name\": \"int_column\", \"type\": \"int\", \"clustering_order\": \"none\", \"kind\": \"regular\", \"position\": -1}, {\"keyspace_name\": \"example_keyspace\", \"table_name\": \"all_data_types\", \"column_name\": \"list_column\", \"type\": \"list\", \"clustering_order\": \"none\", \"kind\": \"regular\", \"position\": -1}, {\"keyspace_name\": \"example_keyspace\", \"table_name\": \"all_data_types\", \"column_name\": \"map_column\", \"type\": \"map\", \"clustering_order\": \"none\", \"kind\": \"regular\", \"position\": -1}, {\"keyspace_name\": \"example_keyspace\", \"table_name\": \"all_data_types\", \"column_name\": \"set_column\", \"type\": \"set\", \"clustering_order\": \"none\", \"kind\": \"regular\", \"position\": -1}, {\"keyspace_name\": \"example_keyspace\", \"table_name\": \"all_data_types\", \"column_name\": \"smallint_column\", \"type\": \"smallint\", \"clustering_order\": \"none\", \"kind\": \"regular\", \"position\": -1}, {\"keyspace_name\": \"example_keyspace\", \"table_name\": \"all_data_types\", \"column_name\": \"text_column\", \"type\": \"text\", \"clustering_order\": \"none\", \"kind\": \"regular\", \"position\": -1}, {\"keyspace_name\": \"example_keyspace\", 
\"table_name\": \"all_data_types\", \"column_name\": \"time_column\", \"type\": \"time\", \"clustering_order\": \"none\", \"kind\": \"regular\", \"position\": -1}, {\"keyspace_name\": \"example_keyspace\", \"table_name\": \"all_data_types\", \"column_name\": \"timestamp_column\", \"type\": \"timestamp\", \"clustering_order\": \"none\", \"kind\": \"regular\", \"position\": -1}, {\"keyspace_name\": \"example_keyspace\", \"table_name\": \"all_data_types\", \"column_name\": \"timeuuid_column\", \"type\": \"timeuuid\", \"clustering_order\": \"none\", \"kind\": \"regular\", \"position\": -1}, {\"keyspace_name\": \"example_keyspace\", \"table_name\": \"all_data_types\", \"column_name\": \"tinyint_column\", \"type\": \"tinyint\", \"clustering_order\": \"none\", \"kind\": \"regular\", \"position\": -1}, {\"keyspace_name\": \"example_keyspace\", \"table_name\": \"all_data_types\", \"column_name\": \"tuple_column\", \"type\": \"frozen>\", \"clustering_order\": \"none\", \"kind\": \"regular\", \"position\": -1}, {\"keyspace_name\": \"example_keyspace\", \"table_name\": \"all_data_types\", \"column_name\": \"uuid_column\", \"type\": \"uuid\", \"clustering_order\": \"none\", \"kind\": \"regular\", \"position\": -1}, {\"keyspace_name\": \"example_keyspace\", \"table_name\": \"all_data_types\", \"column_name\": \"varchar_column\", \"type\": \"text\", \"clustering_order\": \"none\", \"kind\": \"regular\", \"position\": -1}, {\"keyspace_name\": \"example_keyspace\", \"table_name\": \"all_data_types\", \"column_name\": \"varint_column\", \"type\": \"varint\", \"clustering_order\": \"none\", \"kind\": \"regular\", \"position\": -1}]" - } + "com.linkedin.schema.Schemaless": {} }, "fields": [ { @@ -582,100 +448,14 @@ } }, "systemMetadata": { - "lastObserved": 1731591019435, - "runId": "cassandra-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.all_data_types,PROD)", - 
"changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false - } - }, - "systemMetadata": { - "lastObserved": 1731310097192, - "runId": "cassandra-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,cass_test_1.people,PROD)", - "changeType": "UPSERT", - "aspectName": "schemaMetadata", - "aspect": { - "json": { - "schemaName": "people", - "platform": "urn:li:dataPlatform:cassandra", - "version": 0, - "created": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "hash": "", - "platformSchema": { - "com.linkedin.schema.OtherSchema": { - "rawSchema": "[{\"keyspace_name\": \"cass_test_1\", \"table_name\": \"people\", \"column_name\": \"email\", \"type\": \"text\", \"clustering_order\": \"none\", \"kind\": \"regular\", \"position\": -1}, {\"keyspace_name\": \"cass_test_1\", \"table_name\": \"people\", \"column_name\": \"name\", \"type\": \"text\", \"clustering_order\": \"none\", \"kind\": \"regular\", \"position\": -1}, {\"keyspace_name\": \"cass_test_1\", \"table_name\": \"people\", \"column_name\": \"person_id\", \"type\": \"int\", \"clustering_order\": \"none\", \"kind\": \"partition_key\", \"position\": 0}]" - } - }, - "fields": [ - { - "fieldPath": "email", - "nullable": true, - "type": { - "type": { - "com.linkedin.schema.StringType": {} - } - }, - "nativeDataType": "text", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "name", - "nullable": true, - "type": { - "type": { - "com.linkedin.schema.StringType": {} - } - }, - "nativeDataType": "text", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "person_id", - "nullable": true, - "type": { - "type": { - "com.linkedin.schema.NumberType": {} - } - }, - "nativeDataType": "int", - "recursive": false, - "isPartOfKey": false - } - ] - } - }, - "systemMetadata": { - 
"lastObserved": 1731591019563, + "lastObserved": 1739997601555, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,cass_test_1.people,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.all_data_types,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -684,14 +464,14 @@ } }, "systemMetadata": { - "lastObserved": 1731309924412, + "lastObserved": 1739924675311, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,cass_test_1.people,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.all_data_types,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -702,14 +482,14 @@ } }, "systemMetadata": { - "lastObserved": 1731309924412, + "lastObserved": 1739924675312, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,cass_test_1.people,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.all_data_types,PROD)", "changeType": "UPSERT", "aspectName": "datasetProperties", "aspect": { @@ -730,104 +510,135 @@ "read_repair_chance": "0.0", "speculative_retry": "99p" }, - "name": "people", - "qualifiedName": "cass_test_1.people", - "description": "", + "name": "all_data_types", + "qualifiedName": "example_keyspace.all_data_types", + "description": "Table containing all supported Cassandra data types, excluding counters", "tags": [] } }, "systemMetadata": { - "lastObserved": 1731591019564, + "lastObserved": 1739924675313, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:cassandra,cass_test_1.information,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.all_data_types,PROD)", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "container", "aspect": { "json": { - "path": [ - { - "id": "urn:li:container:e88cdfeb0f0ec790300527f9ea34ee05", - "urn": "urn:li:container:e88cdfeb0f0ec790300527f9ea34ee05" - } - ] + "container": "urn:li:container:9debadbaaa1a46f8ff193a388a363cfd" } }, "systemMetadata": { - "lastObserved": 1731309924406, + "lastObserved": 1739924675314, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,cass_test_1.information,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.all_data_types,PROD)", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "dataPlatformInstance", "aspect": { "json": { - "container": "urn:li:container:e88cdfeb0f0ec790300527f9ea34ee05" + "platform": "urn:li:dataPlatform:cassandra", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:cassandra,dev_instance)" } }, "systemMetadata": { - "lastObserved": 1731309924406, + "lastObserved": 1739924675315, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { - "entityType": "container", - "entityUrn": "urn:li:container:0381892d0717b54887d087eaafd95d2b", + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.all_data_types,PROD)", "changeType": "UPSERT", - "aspectName": "containerProperties", + "aspectName": "browsePathsV2", "aspect": { "json": { - "customProperties": { - "platform": "cassandra", - "env": "PROD", - "keyspace": "cass_test_2", - "durable_writes": "True", - "replication": "{\"class\": \"org.apache.cassandra.locator.SimpleStrategy\", \"replication_factor\": \"1\"}" - }, - "name": 
"cass_test_2", - "qualifiedName": "cass_test_2", - "env": "PROD" + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:cassandra,dev_instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:cassandra,dev_instance)" + }, + { + "id": "urn:li:container:9debadbaaa1a46f8ff193a388a363cfd", + "urn": "urn:li:container:9debadbaaa1a46f8ff193a388a363cfd" + } + ] } }, "systemMetadata": { - "lastObserved": 1731579516849, + "lastObserved": 1739924675316, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,cass_test_1.people,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.counter_table,PROD)", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "schemaMetadata", "aspect": { "json": { - "path": [ + "schemaName": "", + "platform": "urn:li:dataPlatform:cassandra", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.schema.Schemaless": {} + }, + "fields": [ + { + "fieldPath": "counter_column", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.NumberType": {} + } + }, + "nativeDataType": "counter", + "recursive": false, + "isPartOfKey": false + }, { - "id": "urn:li:container:e88cdfeb0f0ec790300527f9ea34ee05", - "urn": "urn:li:container:e88cdfeb0f0ec790300527f9ea34ee05" + "fieldPath": "id", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "uuid", + "recursive": false, + "isPartOfKey": false } ] } }, "systemMetadata": { - "lastObserved": 1731309924413, + "lastObserved": 1739997601577, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { - "entityType": "container", - "entityUrn": 
"urn:li:container:0381892d0717b54887d087eaafd95d2b", + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.counter_table,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -836,51 +647,130 @@ } }, "systemMetadata": { - "lastObserved": 1731309924420, + "lastObserved": 1739924675326, + "runId": "cassandra-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.counter_table,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } + }, + "systemMetadata": { + "lastObserved": 1739924675327, + "runId": "cassandra-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.counter_table,PROD)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "json": { + "customProperties": { + "bloom_filter_fp_chance": "0.01", + "caching": "{\"keys\": \"ALL\", \"rows_per_partition\": \"NONE\"}", + "compaction": "{\"class\": \"org.apache.cassandra.db.compaction.SizeTieredCompactionStrategy\", \"max_threshold\": \"32\", \"min_threshold\": \"4\"}", + "compression": "{\"chunk_length_in_kb\": \"16\", \"class\": \"org.apache.cassandra.io.compress.LZ4Compressor\"}", + "crc_check_chance": "1.0", + "dclocal_read_repair_chance": "0.0", + "default_time_to_live": "0", + "extensions": "{}", + "gc_grace_seconds": "864000", + "max_index_interval": "2048", + "min_index_interval": "128", + "memtable_flush_period_in_ms": "0", + "read_repair_chance": "0.0", + "speculative_retry": "99p" + }, + "name": "counter_table", + "qualifiedName": "example_keyspace.counter_table", + "description": "Separate table containing only counter column", + "tags": [] + } + }, + "systemMetadata": { + "lastObserved": 
1739924675328, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,cass_test_1.people,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.counter_table,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:e88cdfeb0f0ec790300527f9ea34ee05" + "container": "urn:li:container:9debadbaaa1a46f8ff193a388a363cfd" } }, "systemMetadata": { - "lastObserved": 1731309924413, + "lastObserved": 1739924675330, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { - "entityType": "container", - "entityUrn": "urn:li:container:0381892d0717b54887d087eaafd95d2b", + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.counter_table,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:cassandra", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:cassandra,dev_instance)" + } + }, + "systemMetadata": { + "lastObserved": 1739924675331, + "runId": "cassandra-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.counter_table,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { - "path": [] + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:cassandra,dev_instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:cassandra,dev_instance)" + }, + { + "id": "urn:li:container:9debadbaaa1a46f8ff193a388a363cfd", + "urn": "urn:li:container:9debadbaaa1a46f8ff193a388a363cfd" + } + ] } }, "systemMetadata": { - "lastObserved": 1731309924421, + "lastObserved": 1739924675332, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { 
"entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,cass_test_2.tasks,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.shopping_cart,PROD)", "changeType": "UPSERT", "aspectName": "schemaMetadata", "aspect": { "json": { - "schemaName": "tasks", + "schemaName": "", "platform": "urn:li:dataPlatform:cassandra", "version": 0, "created": { @@ -893,25 +783,23 @@ }, "hash": "", "platformSchema": { - "com.linkedin.schema.OtherSchema": { - "rawSchema": "[{\"keyspace_name\": \"cass_test_2\", \"table_name\": \"tasks\", \"column_name\": \"details\", \"type\": \"text\", \"clustering_order\": \"none\", \"kind\": \"regular\", \"position\": -1}, {\"keyspace_name\": \"cass_test_2\", \"table_name\": \"tasks\", \"column_name\": \"last_updated\", \"type\": \"timestamp\", \"clustering_order\": \"none\", \"kind\": \"regular\", \"position\": -1}, {\"keyspace_name\": \"cass_test_2\", \"table_name\": \"tasks\", \"column_name\": \"status\", \"type\": \"text\", \"clustering_order\": \"none\", \"kind\": \"regular\", \"position\": -1}, {\"keyspace_name\": \"cass_test_2\", \"table_name\": \"tasks\", \"column_name\": \"task_id\", \"type\": \"int\", \"clustering_order\": \"none\", \"kind\": \"partition_key\", \"position\": 0}]" - } + "com.linkedin.schema.Schemaless": {} }, "fields": [ { - "fieldPath": "details", + "fieldPath": "item_count", "nullable": true, "type": { "type": { - "com.linkedin.schema.StringType": {} + "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "text", + "nativeDataType": "int", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "last_updated", + "fieldPath": "last_update_timestamp", "nullable": true, "type": { "type": { @@ -923,7 +811,7 @@ "isPartOfKey": false }, { - "fieldPath": "status", + "fieldPath": "userid", "nullable": true, "type": { "type": { @@ -933,31 +821,19 @@ "nativeDataType": "text", "recursive": false, "isPartOfKey": false - }, - { - "fieldPath": 
"task_id", - "nullable": true, - "type": { - "type": { - "com.linkedin.schema.NumberType": {} - } - }, - "nativeDataType": "int", - "recursive": false, - "isPartOfKey": false } ] } }, "systemMetadata": { - "lastObserved": 1731591019516, + "lastObserved": 1739997601596, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,cass_test_2.tasks,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.shopping_cart,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -966,14 +842,14 @@ } }, "systemMetadata": { - "lastObserved": 1731309924426, + "lastObserved": 1739924675342, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,cass_test_2.tasks,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.shopping_cart,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -984,14 +860,14 @@ } }, "systemMetadata": { - "lastObserved": 1731309924426, + "lastObserved": 1739924675343, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,cass_test_2.tasks,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.shopping_cart,PROD)", "changeType": "UPSERT", "aspectName": "datasetProperties", "aspect": { @@ -1012,131 +888,172 @@ "read_repair_chance": "0.0", "speculative_retry": "99p" }, - "name": "tasks", - "qualifiedName": "cass_test_2.tasks", + "name": "shopping_cart", + "qualifiedName": "example_keyspace.shopping_cart", "description": "", "tags": [] } }, "systemMetadata": { - "lastObserved": 1731591019518, + "lastObserved": 1739924675345, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { - 
"entityType": "container", - "entityUrn": "urn:li:container:0381892d0717b54887d087eaafd95d2b", + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.shopping_cart,PROD)", "changeType": "UPSERT", - "aspectName": "subTypes", + "aspectName": "container", "aspect": { "json": { - "typeNames": [ - "Keyspace" - ] + "container": "urn:li:container:9debadbaaa1a46f8ff193a388a363cfd" } }, "systemMetadata": { - "lastObserved": 1731309924421, + "lastObserved": 1739924675346, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { - "entityType": "container", - "entityUrn": "urn:li:container:0381892d0717b54887d087eaafd95d2b", + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.shopping_cart,PROD)", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { "json": { - "platform": "urn:li:dataPlatform:cassandra" + "platform": "urn:li:dataPlatform:cassandra", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:cassandra,dev_instance)" } }, "systemMetadata": { - "lastObserved": 1731309924420, + "lastObserved": 1739924675347, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,cass_test_2.tasks,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.shopping_cart,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:0381892d0717b54887d087eaafd95d2b", - "urn": "urn:li:container:0381892d0717b54887d087eaafd95d2b" + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:cassandra,dev_instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:cassandra,dev_instance)" + }, + { + "id": "urn:li:container:9debadbaaa1a46f8ff193a388a363cfd", + "urn": "urn:li:container:9debadbaaa1a46f8ff193a388a363cfd" 
} ] } }, "systemMetadata": { - "lastObserved": 1731309924427, + "lastObserved": 1739924675348, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,cass_test_2.tasks,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.example_view_1,PROD)", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "status", "aspect": { "json": { - "container": "urn:li:container:0381892d0717b54887d087eaafd95d2b" + "removed": false } }, "systemMetadata": { - "lastObserved": 1731309924427, + "lastObserved": 1739924675355, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.all_data_types,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.example_view_1,PROD)", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "subTypes", "aspect": { "json": { - "container": "urn:li:container:305f73c676989511c67d97ace119138c" + "typeNames": [ + "View" + ] } }, "systemMetadata": { - "lastObserved": 1731310097193, + "lastObserved": 1739924675356, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.all_data_types,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.example_view_1,PROD)", "changeType": "UPSERT", - "aspectName": "subTypes", + "aspectName": "viewProperties", "aspect": { "json": { - "typeNames": [ - "Table" - ] + "materialized": true, + "viewLogic": "id IS NOT NULL AND ascii_column IS NOT NULL", + "viewLanguage": "CQL" + } + }, + "systemMetadata": { + "lastObserved": 1739924675357, + "runId": "cassandra-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + 
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.example_view_1,PROD)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "json": { + "customProperties": { + "bloom_filter_fp_chance": "0.01", + "caching": "{\"keys\": \"ALL\", \"rows_per_partition\": \"NONE\"}", + "compaction": "{\"class\": \"org.apache.cassandra.db.compaction.SizeTieredCompactionStrategy\", \"max_threshold\": \"32\", \"min_threshold\": \"4\"}", + "compression": "{\"chunk_length_in_kb\": \"16\", \"class\": \"org.apache.cassandra.io.compress.LZ4Compressor\"}", + "crc_check_chance": "1.0", + "include_all_columns": "False", + "dclocal_read_repair_chance": "0.0", + "default_time_to_live": "0", + "extensions": "{}", + "gc_grace_seconds": "864000", + "max_index_interval": "2048", + "min_index_interval": "128", + "memtable_flush_period_in_ms": "0", + "read_repair_chance": "0.0", + "speculative_retry": "99p" + }, + "name": "example_view_1", + "qualifiedName": "example_keyspace.example_view_1", + "description": "Example view definition with id and ascii_column", + "tags": [] } }, "systemMetadata": { - "lastObserved": 1731310097193, + "lastObserved": 1739924675358, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,cass_test_2.task_status,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.example_view_1,PROD)", "changeType": "UPSERT", "aspectName": "schemaMetadata", "aspect": { "json": { - "schemaName": "task_status", + "schemaName": "", "platform": "urn:li:dataPlatform:cassandra", "version": 0, "created": { @@ -1149,32 +1066,42 @@ }, "hash": "", "platformSchema": { - "com.linkedin.schema.OtherSchema": { - "rawSchema": "[{\"keyspace_name\": \"cass_test_2\", \"table_name\": \"task_status\", \"column_name\": \"status\", \"type\": \"text\", \"clustering_order\": \"asc\", \"kind\": 
\"clustering\", \"position\": 0}, {\"keyspace_name\": \"cass_test_2\", \"table_name\": \"task_status\", \"column_name\": \"task_id\", \"type\": \"int\", \"clustering_order\": \"none\", \"kind\": \"partition_key\", \"position\": 0}]" - } + "com.linkedin.schema.Schemaless": {} }, "fields": [ { - "fieldPath": "status", + "fieldPath": "ascii_column", "nullable": true, "type": { "type": { "com.linkedin.schema.StringType": {} } }, - "nativeDataType": "text", + "nativeDataType": "ascii", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "task_id", + "fieldPath": "bigint_column", "nullable": true, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "int", + "nativeDataType": "bigint", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "id", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "uuid", "recursive": false, "isPartOfKey": false } @@ -1182,70 +1109,432 @@ } }, "systemMetadata": { - "lastObserved": 1731591019525, + "lastObserved": 1739997601613, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.all_data_types,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.example_view_1,PROD)", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "upstreamLineage", "aspect": { "json": { - "path": [ + "upstreams": [ { - "id": "urn:li:container:305f73c676989511c67d97ace119138c", - "urn": "urn:li:container:305f73c676989511c67d97ace119138c" + "auditStamp": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.all_data_types,PROD)", + "type": "VIEW" } - ] - } - }, - "systemMetadata": { - "lastObserved": 1731310097193, - "runId": "cassandra-test", - "lastRunId": "no-run-id-provided" - } 
-}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.all_data_types,PROD)", - "changeType": "UPSERT", - "aspectName": "datasetProperties", - "aspect": { - "json": { - "customProperties": { - "bloom_filter_fp_chance": "0.01", - "caching": "{\"keys\": \"ALL\", \"rows_per_partition\": \"NONE\"}", - "compaction": "{\"class\": \"org.apache.cassandra.db.compaction.SizeTieredCompactionStrategy\", \"max_threshold\": \"32\", \"min_threshold\": \"4\"}", - "compression": "{\"chunk_length_in_kb\": \"16\", \"class\": \"org.apache.cassandra.io.compress.LZ4Compressor\"}", - "crc_check_chance": "1.0", - "dclocal_read_repair_chance": "0.0", - "default_time_to_live": "0", - "extensions": "{}", - "gc_grace_seconds": "864000", - "max_index_interval": "2048", - "min_index_interval": "128", - "memtable_flush_period_in_ms": "0", + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.all_data_types,PROD),ascii_column)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.example_view_1,PROD),ascii_column)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.all_data_types,PROD),bigint_column)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.example_view_1,PROD),bigint_column)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.all_data_types,PROD),id)" + ], + "downstreamType": "FIELD", + "downstreams": [ + 
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.example_view_1,PROD),id)" + ], + "confidenceScore": 1.0 + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1739924675364, + "runId": "cassandra-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.example_view_1,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:9debadbaaa1a46f8ff193a388a363cfd" + } + }, + "systemMetadata": { + "lastObserved": 1739924675367, + "runId": "cassandra-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.example_view_1,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:cassandra", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:cassandra,dev_instance)" + } + }, + "systemMetadata": { + "lastObserved": 1739924675367, + "runId": "cassandra-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.example_view_1,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:cassandra,dev_instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:cassandra,dev_instance)" + }, + { + "id": "urn:li:container:9debadbaaa1a46f8ff193a388a363cfd", + "urn": "urn:li:container:9debadbaaa1a46f8ff193a388a363cfd" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1739924675368, + "runId": "cassandra-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.example_view_2,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1739924675376, + "runId": "cassandra-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.example_view_2,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "View" + ] + } + }, + "systemMetadata": { + "lastObserved": 1739924675377, + "runId": "cassandra-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.example_view_2,PROD)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": true, + "viewLogic": "id IS NOT NULL AND ascii_column IS NOT NULL", + "viewLanguage": "CQL" + } + }, + "systemMetadata": { + "lastObserved": 1739924675378, + "runId": "cassandra-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.example_view_2,PROD)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "json": { + "customProperties": { + "bloom_filter_fp_chance": "0.01", + "caching": "{\"keys\": \"ALL\", \"rows_per_partition\": \"NONE\"}", + "compaction": "{\"class\": \"org.apache.cassandra.db.compaction.SizeTieredCompactionStrategy\", \"max_threshold\": \"32\", \"min_threshold\": \"4\"}", + "compression": "{\"chunk_length_in_kb\": \"16\", \"class\": \"org.apache.cassandra.io.compress.LZ4Compressor\"}", + "crc_check_chance": "1.0", + "include_all_columns": "False", + "dclocal_read_repair_chance": "0.0", + "default_time_to_live": "0", + "extensions": "{}", + 
"gc_grace_seconds": "864000", + "max_index_interval": "2048", + "min_index_interval": "128", + "memtable_flush_period_in_ms": "0", "read_repair_chance": "0.0", "speculative_retry": "99p" }, - "name": "all_data_types", - "qualifiedName": "example_keyspace.all_data_types", - "description": "Table containing all supported Cassandra data types, excluding counters", + "name": "example_view_2", + "qualifiedName": "example_keyspace.example_view_2", + "description": "Example view definition with id and ascii_column", "tags": [] } }, "systemMetadata": { - "lastObserved": 1731591019440, + "lastObserved": 1739924675380, + "runId": "cassandra-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.example_view_2,PROD)", + "changeType": "UPSERT", + "aspectName": "schemaMetadata", + "aspect": { + "json": { + "schemaName": "", + "platform": "urn:li:dataPlatform:cassandra", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.schema.Schemaless": {} + }, + "fields": [ + { + "fieldPath": "ascii_column", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "ascii", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "float_column", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.NumberType": {} + } + }, + "nativeDataType": "float", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "id", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "uuid", + "recursive": false, + "isPartOfKey": false + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1739997601626, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": 
"dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,cass_test_2.task_status,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.example_view_2,PROD)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "json": { + "upstreams": [ + { + "auditStamp": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.all_data_types,PROD)", + "type": "VIEW" + } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.all_data_types,PROD),ascii_column)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.example_view_2,PROD),ascii_column)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.all_data_types,PROD),float_column)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.example_view_2,PROD),float_column)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.all_data_types,PROD),id)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.example_view_2,PROD),id)" + ], + "confidenceScore": 1.0 + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1739924675385, + "runId": "cassandra-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.example_view_2,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:9debadbaaa1a46f8ff193a388a363cfd" + } + }, + "systemMetadata": { + "lastObserved": 1739924675387, + "runId": "cassandra-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.example_view_2,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:cassandra", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:cassandra,dev_instance)" + } + }, + "systemMetadata": { + "lastObserved": 1739924675388, + "runId": "cassandra-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.example_view_2,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:cassandra,dev_instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:cassandra,dev_instance)" + }, + { + "id": "urn:li:container:9debadbaaa1a46f8ff193a388a363cfd", + "urn": "urn:li:container:9debadbaaa1a46f8ff193a388a363cfd" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1739924675389, + "runId": "cassandra-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:5cbe874ca6cbc4b51dc2313034798ba5", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "cassandra", + "instance": "dev_instance", + "env": "PROD", + "keyspace": "cass_test_2", + "durable_writes": "True", + "replication": "{\"class\": 
\"org.apache.cassandra.locator.SimpleStrategy\", \"replication_factor\": \"1\"}" + }, + "name": "cass_test_2", + "qualifiedName": "cass_test_2", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1739924675400, + "runId": "cassandra-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:5cbe874ca6cbc4b51dc2313034798ba5", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -1254,37 +1543,75 @@ } }, "systemMetadata": { - "lastObserved": 1731310097158, + "lastObserved": 1739924675402, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,cass_test_2.task_status,PROD)", + "entityType": "container", + "entityUrn": "urn:li:container:5cbe874ca6cbc4b51dc2313034798ba5", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:cassandra", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:cassandra,dev_instance)" + } + }, + "systemMetadata": { + "lastObserved": 1739924675402, + "runId": "cassandra-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:5cbe874ca6cbc4b51dc2313034798ba5", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Keyspace" + ] + } + }, + "systemMetadata": { + "lastObserved": 1739924675403, + "runId": "cassandra-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:5cbe874ca6cbc4b51dc2313034798ba5", "changeType": "UPSERT", - "aspectName": "subTypes", + "aspectName": "browsePathsV2", "aspect": { "json": { - "typeNames": [ - "View" + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:cassandra,dev_instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:cassandra,dev_instance)" + } ] } 
}, "systemMetadata": { - "lastObserved": 1731310097161, + "lastObserved": 1739924675404, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.counter_table,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.cass_test_2.tasks,PROD)", "changeType": "UPSERT", "aspectName": "schemaMetadata", "aspect": { "json": { - "schemaName": "counter_table", + "schemaName": "", "platform": "urn:li:dataPlatform:cassandra", "version": 0, "created": { @@ -1297,32 +1624,54 @@ }, "hash": "", "platformSchema": { - "com.linkedin.schema.OtherSchema": { - "rawSchema": "[{\"keyspace_name\": \"example_keyspace\", \"table_name\": \"counter_table\", \"column_name\": \"counter_column\", \"type\": \"counter\", \"clustering_order\": \"none\", \"kind\": \"regular\", \"position\": -1}, {\"keyspace_name\": \"example_keyspace\", \"table_name\": \"counter_table\", \"column_name\": \"id\", \"type\": \"uuid\", \"clustering_order\": \"none\", \"kind\": \"partition_key\", \"position\": 0}]" - } + "com.linkedin.schema.Schemaless": {} }, "fields": [ { - "fieldPath": "counter_column", + "fieldPath": "details", "nullable": true, "type": { "type": { - "com.linkedin.schema.NumberType": {} + "com.linkedin.schema.StringType": {} } }, - "nativeDataType": "counter", + "nativeDataType": "text", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "id", + "fieldPath": "last_updated", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.TimeType": {} + } + }, + "nativeDataType": "timestamp", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "status", "nullable": true, "type": { "type": { "com.linkedin.schema.StringType": {} } }, - "nativeDataType": "uuid", + "nativeDataType": "text", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "task_id", + "nullable": true, + "type": { + "type": { + 
"com.linkedin.schema.NumberType": {} + } + }, + "nativeDataType": "int", "recursive": false, "isPartOfKey": false } @@ -1330,14 +1679,48 @@ } }, "systemMetadata": { - "lastObserved": 1731591019446, + "lastObserved": 1739997601663, + "runId": "cassandra-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.cass_test_2.tasks,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1739924675415, + "runId": "cassandra-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.cass_test_2.tasks,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } + }, + "systemMetadata": { + "lastObserved": 1739924675416, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,cass_test_2.task_status,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.cass_test_2.tasks,PROD)", "changeType": "UPSERT", "aspectName": "datasetProperties", "aspect": { @@ -1348,7 +1731,6 @@ "compaction": "{\"class\": \"org.apache.cassandra.db.compaction.SizeTieredCompactionStrategy\", \"max_threshold\": \"32\", \"min_threshold\": \"4\"}", "compression": "{\"chunk_length_in_kb\": \"16\", \"class\": \"org.apache.cassandra.io.compress.LZ4Compressor\"}", "crc_check_chance": "1.0", - "include_all_columns": "False", "dclocal_read_repair_chance": "0.0", "default_time_to_live": "0", "extensions": "{}", @@ -1359,126 +1741,223 @@ "read_repair_chance": "0.0", "speculative_retry": "99p" }, - "name": "task_status", - "qualifiedName": "cass_test_2.task_status", + "name": "tasks", + "qualifiedName": "cass_test_2.tasks", "description": 
"", "tags": [] } }, "systemMetadata": { - "lastObserved": 1731591019524, + "lastObserved": 1739924675417, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.counter_table,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.cass_test_2.tasks,PROD)", "changeType": "UPSERT", - "aspectName": "status", + "aspectName": "container", "aspect": { "json": { - "removed": false + "container": "urn:li:container:5cbe874ca6cbc4b51dc2313034798ba5" } }, "systemMetadata": { - "lastObserved": 1731310097198, + "lastObserved": 1739924675419, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,cass_test_2.task_status,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.cass_test_2.tasks,PROD)", "changeType": "UPSERT", - "aspectName": "viewProperties", + "aspectName": "dataPlatformInstance", "aspect": { "json": { - "materialized": true, - "viewLogic": "status IS NOT NULL AND task_id IS NOT NULL", - "viewLanguage": "CQL" + "platform": "urn:li:dataPlatform:cassandra", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:cassandra,dev_instance)" } }, "systemMetadata": { - "lastObserved": 1731310097161, + "lastObserved": 1739924675420, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,cass_test_2.task_status,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.cass_test_2.tasks,PROD)", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "browsePathsV2", "aspect": { "json": { - "container": "urn:li:container:0381892d0717b54887d087eaafd95d2b" + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:cassandra,dev_instance)", + "urn": 
"urn:li:dataPlatformInstance:(urn:li:dataPlatform:cassandra,dev_instance)" + }, + { + "id": "urn:li:container:5cbe874ca6cbc4b51dc2313034798ba5", + "urn": "urn:li:container:5cbe874ca6cbc4b51dc2313034798ba5" + } + ] } }, "systemMetadata": { - "lastObserved": 1731310097163, + "lastObserved": 1739924675421, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,cass_test_2.task_status,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.cass_test_2.task_status,PROD)", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "status", "aspect": { "json": { - "path": [ - { - "id": "urn:li:container:0381892d0717b54887d087eaafd95d2b", - "urn": "urn:li:container:0381892d0717b54887d087eaafd95d2b" - } + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1739924675428, + "runId": "cassandra-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.cass_test_2.task_status,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "View" ] } }, "systemMetadata": { - "lastObserved": 1731310097163, + "lastObserved": 1739924675429, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.counter_table,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.cass_test_2.task_status,PROD)", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "viewProperties", "aspect": { "json": { - "container": "urn:li:container:305f73c676989511c67d97ace119138c" + "materialized": true, + "viewLogic": "status IS NOT NULL AND task_id IS NOT NULL", + "viewLanguage": "CQL" } }, "systemMetadata": { - "lastObserved": 1731310097199, + "lastObserved": 
1739924675430, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.counter_table,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.cass_test_2.task_status,PROD)", "changeType": "UPSERT", - "aspectName": "subTypes", + "aspectName": "datasetProperties", "aspect": { "json": { - "typeNames": [ - "Table" + "customProperties": { + "bloom_filter_fp_chance": "0.01", + "caching": "{\"keys\": \"ALL\", \"rows_per_partition\": \"NONE\"}", + "compaction": "{\"class\": \"org.apache.cassandra.db.compaction.SizeTieredCompactionStrategy\", \"max_threshold\": \"32\", \"min_threshold\": \"4\"}", + "compression": "{\"chunk_length_in_kb\": \"16\", \"class\": \"org.apache.cassandra.io.compress.LZ4Compressor\"}", + "crc_check_chance": "1.0", + "include_all_columns": "False", + "dclocal_read_repair_chance": "0.0", + "default_time_to_live": "0", + "extensions": "{}", + "gc_grace_seconds": "864000", + "max_index_interval": "2048", + "min_index_interval": "128", + "memtable_flush_period_in_ms": "0", + "read_repair_chance": "0.0", + "speculative_retry": "99p" + }, + "name": "task_status", + "qualifiedName": "cass_test_2.task_status", + "description": "", + "tags": [] + } + }, + "systemMetadata": { + "lastObserved": 1739924675431, + "runId": "cassandra-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.cass_test_2.task_status,PROD)", + "changeType": "UPSERT", + "aspectName": "schemaMetadata", + "aspect": { + "json": { + "schemaName": "", + "platform": "urn:li:dataPlatform:cassandra", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.schema.Schemaless": {} + }, + "fields": [ + { + 
"fieldPath": "status", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "text", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "task_id", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.NumberType": {} + } + }, + "nativeDataType": "int", + "recursive": false, + "isPartOfKey": false + } ] } }, "systemMetadata": { - "lastObserved": 1731310097198, + "lastObserved": 1739997601674, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,cass_test_2.task_status,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.cass_test_2.task_status,PROD)", "changeType": "UPSERT", "aspectName": "upstreamLineage", "aspect": { @@ -1489,7 +1968,7 @@ "time": 0, "actor": "urn:li:corpuser:unknown" }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:cassandra,cass_test_2.tasks,PROD)", + "dataset": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.cass_test_2.tasks,PROD)", "type": "VIEW" } ], @@ -1497,22 +1976,22 @@ { "upstreamType": "FIELD_SET", "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:cassandra,cass_test_2.tasks,PROD),status)" + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.cass_test_2.tasks,PROD),status)" ], "downstreamType": "FIELD", "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:cassandra,cass_test_2.task_status,PROD),status)" + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.cass_test_2.task_status,PROD),status)" ], "confidenceScore": 1.0 }, { "upstreamType": "FIELD_SET", "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:cassandra,cass_test_2.tasks,PROD),task_id)" + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.cass_test_2.tasks,PROD),task_id)" ], 
"downstreamType": "FIELD", "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:cassandra,cass_test_2.task_status,PROD),task_id)" + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.cass_test_2.task_status,PROD),task_id)" ], "confidenceScore": 1.0 } @@ -1520,111 +1999,98 @@ } }, "systemMetadata": { - "lastObserved": 1731447296444, + "lastObserved": 1739924675436, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.counter_table,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.cass_test_2.task_status,PROD)", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "container", "aspect": { "json": { - "path": [ - { - "id": "urn:li:container:305f73c676989511c67d97ace119138c", - "urn": "urn:li:container:305f73c676989511c67d97ace119138c" - } - ] + "container": "urn:li:container:5cbe874ca6cbc4b51dc2313034798ba5" } }, "systemMetadata": { - "lastObserved": 1731310097199, + "lastObserved": 1739924675440, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.counter_table,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.cass_test_2.task_status,PROD)", "changeType": "UPSERT", - "aspectName": "datasetProperties", + "aspectName": "dataPlatformInstance", "aspect": { "json": { - "customProperties": { - "bloom_filter_fp_chance": "0.01", - "caching": "{\"keys\": \"ALL\", \"rows_per_partition\": \"NONE\"}", - "compaction": "{\"class\": \"org.apache.cassandra.db.compaction.SizeTieredCompactionStrategy\", \"max_threshold\": \"32\", \"min_threshold\": \"4\"}", - "compression": "{\"chunk_length_in_kb\": \"16\", \"class\": \"org.apache.cassandra.io.compress.LZ4Compressor\"}", - "crc_check_chance": "1.0", - 
"dclocal_read_repair_chance": "0.0", - "default_time_to_live": "0", - "extensions": "{}", - "gc_grace_seconds": "864000", - "max_index_interval": "2048", - "min_index_interval": "128", - "memtable_flush_period_in_ms": "0", - "read_repair_chance": "0.0", - "speculative_retry": "99p" - }, - "name": "counter_table", - "qualifiedName": "example_keyspace.counter_table", - "description": "Separate table containing only counter column", - "tags": [] + "platform": "urn:li:dataPlatform:cassandra", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:cassandra,dev_instance)" } }, "systemMetadata": { - "lastObserved": 1731591019447, + "lastObserved": 1739924675441, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { - "entityType": "container", - "entityUrn": "urn:li:container:305f73c676989511c67d97ace119138c", + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.cass_test_2.task_status,PROD)", "changeType": "UPSERT", - "aspectName": "dataPlatformInstance", + "aspectName": "browsePathsV2", "aspect": { "json": { - "platform": "urn:li:dataPlatform:cassandra" + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:cassandra,dev_instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:cassandra,dev_instance)" + }, + { + "id": "urn:li:container:5cbe874ca6cbc4b51dc2313034798ba5", + "urn": "urn:li:container:5cbe874ca6cbc4b51dc2313034798ba5" + } + ] } }, "systemMetadata": { - "lastObserved": 1731310097186, + "lastObserved": 1739924675442, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "container", - "entityUrn": "urn:li:container:305f73c676989511c67d97ace119138c", + "entityUrn": "urn:li:container:b89ce3e714c980422ca601f9be0f54af", "changeType": "UPSERT", "aspectName": "containerProperties", "aspect": { "json": { "customProperties": { "platform": "cassandra", + "instance": "dev_instance", "env": "PROD", - "keyspace": "example_keyspace", + 
"keyspace": "cass_test_1", "durable_writes": "True", "replication": "{\"class\": \"org.apache.cassandra.locator.SimpleStrategy\", \"replication_factor\": \"1\"}" }, - "name": "example_keyspace", - "qualifiedName": "example_keyspace", + "name": "cass_test_1", + "qualifiedName": "cass_test_1", "env": "PROD" } }, "systemMetadata": { - "lastObserved": 1731579516801, + "lastObserved": 1739924675452, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "container", - "entityUrn": "urn:li:container:305f73c676989511c67d97ace119138c", + "entityUrn": "urn:li:container:b89ce3e714c980422ca601f9be0f54af", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -1633,14 +2099,31 @@ } }, "systemMetadata": { - "lastObserved": 1731310097185, + "lastObserved": 1739924675453, + "runId": "cassandra-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:b89ce3e714c980422ca601f9be0f54af", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:cassandra", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:cassandra,dev_instance)" + } + }, + "systemMetadata": { + "lastObserved": 1739924675454, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "container", - "entityUrn": "urn:li:container:305f73c676989511c67d97ace119138c", + "entityUrn": "urn:li:container:b89ce3e714c980422ca601f9be0f54af", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -1651,35 +2134,40 @@ } }, "systemMetadata": { - "lastObserved": 1731310097186, + "lastObserved": 1739924675455, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "container", - "entityUrn": "urn:li:container:305f73c676989511c67d97ace119138c", + "entityUrn": "urn:li:container:b89ce3e714c980422ca601f9be0f54af", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { - "path": [] + 
"path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:cassandra,dev_instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:cassandra,dev_instance)" + } + ] } }, "systemMetadata": { - "lastObserved": 1731310097186, + "lastObserved": 1739924675456, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.shopping_cart,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.cass_test_1.information,PROD)", "changeType": "UPSERT", "aspectName": "schemaMetadata", "aspect": { "json": { - "schemaName": "shopping_cart", + "schemaName": "", "platform": "urn:li:dataPlatform:cassandra", "version": 0, "created": { @@ -1692,25 +2180,23 @@ }, "hash": "", "platformSchema": { - "com.linkedin.schema.OtherSchema": { - "rawSchema": "[{\"keyspace_name\": \"example_keyspace\", \"table_name\": \"shopping_cart\", \"column_name\": \"item_count\", \"type\": \"int\", \"clustering_order\": \"none\", \"kind\": \"regular\", \"position\": -1}, {\"keyspace_name\": \"example_keyspace\", \"table_name\": \"shopping_cart\", \"column_name\": \"last_update_timestamp\", \"type\": \"timestamp\", \"clustering_order\": \"none\", \"kind\": \"regular\", \"position\": -1}, {\"keyspace_name\": \"example_keyspace\", \"table_name\": \"shopping_cart\", \"column_name\": \"userid\", \"type\": \"text\", \"clustering_order\": \"none\", \"kind\": \"partition_key\", \"position\": 0}]" - } + "com.linkedin.schema.Schemaless": {} }, "fields": [ { - "fieldPath": "item_count", + "fieldPath": "details", "nullable": true, "type": { "type": { - "com.linkedin.schema.NumberType": {} + "com.linkedin.schema.StringType": {} } }, - "nativeDataType": "int", + "nativeDataType": "text", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "last_update_timestamp", + "fieldPath": "last_updated", "nullable": true, "type": { "type": { @@ -1722,14 +2208,14 @@ 
"isPartOfKey": false }, { - "fieldPath": "userid", + "fieldPath": "person_id", "nullable": true, "type": { "type": { - "com.linkedin.schema.StringType": {} + "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "text", + "nativeDataType": "int", "recursive": false, "isPartOfKey": false } @@ -1737,74 +2223,30 @@ } }, "systemMetadata": { - "lastObserved": 1731591019453, + "lastObserved": 1739997601705, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.example_view_1,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.cass_test_1.information,PROD)", "changeType": "UPSERT", - "aspectName": "upstreamLineage", + "aspectName": "status", "aspect": { "json": { - "upstreams": [ - { - "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.all_data_types,PROD)", - "type": "VIEW" - } - ], - "fineGrainedLineages": [ - { - "upstreamType": "FIELD_SET", - "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.all_data_types,PROD),ascii_column)" - ], - "downstreamType": "FIELD", - "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.example_view_1,PROD),ascii_column)" - ], - "confidenceScore": 1.0 - }, - { - "upstreamType": "FIELD_SET", - "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.all_data_types,PROD),bigint_column)" - ], - "downstreamType": "FIELD", - "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.example_view_1,PROD),bigint_column)" - ], - "confidenceScore": 1.0 - }, - { - "upstreamType": "FIELD_SET", - "upstreams": [ - 
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.all_data_types,PROD),id)" - ], - "downstreamType": "FIELD", - "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.example_view_1,PROD),id)" - ], - "confidenceScore": 1.0 - } - ] + "removed": false } }, "systemMetadata": { - "lastObserved": 1731447296557, + "lastObserved": 1739924675467, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.shopping_cart,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.cass_test_1.information,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -1815,48 +2257,14 @@ } }, "systemMetadata": { - "lastObserved": 1731410842611, - "runId": "cassandra-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.shopping_cart,PROD)", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false - } - }, - "systemMetadata": { - "lastObserved": 1731410842610, - "runId": "cassandra-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.example_view_1,PROD)", - "changeType": "UPSERT", - "aspectName": "viewProperties", - "aspect": { - "json": { - "materialized": true, - "viewLogic": "id IS NOT NULL AND ascii_column IS NOT NULL", - "viewLanguage": "CQL" - } - }, - "systemMetadata": { - "lastObserved": 1731310103458, + "lastObserved": 1739924675468, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.shopping_cart,PROD)", + "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.cass_test_1.information,PROD)", "changeType": "UPSERT", "aspectName": "datasetProperties", "aspect": { @@ -1877,97 +2285,84 @@ "read_repair_chance": "0.0", "speculative_retry": "99p" }, - "name": "shopping_cart", - "qualifiedName": "example_keyspace.shopping_cart", + "name": "information", + "qualifiedName": "cass_test_1.information", "description": "", "tags": [] } }, "systemMetadata": { - "lastObserved": 1731591019455, - "runId": "cassandra-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.example_view_1,PROD)", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false - } - }, - "systemMetadata": { - "lastObserved": 1731310103456, + "lastObserved": 1739924675469, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.shopping_cart,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.cass_test_1.information,PROD)", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "container", "aspect": { "json": { - "path": [ - { - "id": "urn:li:container:305f73c676989511c67d97ace119138c", - "urn": "urn:li:container:305f73c676989511c67d97ace119138c" - } - ] + "container": "urn:li:container:b89ce3e714c980422ca601f9be0f54af" } }, "systemMetadata": { - "lastObserved": 1731410842612, + "lastObserved": 1739924675470, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.shopping_cart,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.cass_test_1.information,PROD)", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "dataPlatformInstance", 
"aspect": { "json": { - "container": "urn:li:container:305f73c676989511c67d97ace119138c" + "platform": "urn:li:dataPlatform:cassandra", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:cassandra,dev_instance)" } }, "systemMetadata": { - "lastObserved": 1731410842611, + "lastObserved": 1739924675471, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.example_view_1,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.cass_test_1.information,PROD)", "changeType": "UPSERT", - "aspectName": "subTypes", + "aspectName": "browsePathsV2", "aspect": { "json": { - "typeNames": [ - "View" + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:cassandra,dev_instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:cassandra,dev_instance)" + }, + { + "id": "urn:li:container:b89ce3e714c980422ca601f9be0f54af", + "urn": "urn:li:container:b89ce3e714c980422ca601f9be0f54af" + } ] } }, "systemMetadata": { - "lastObserved": 1731310103457, + "lastObserved": 1739924675472, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.example_view_1,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.cass_test_1.people,PROD)", "changeType": "UPSERT", "aspectName": "schemaMetadata", "aspect": { "json": { - "schemaName": "example_view_1", + "schemaName": "", "platform": "urn:li:dataPlatform:cassandra", "version": 0, "created": { @@ -1980,44 +2375,42 @@ }, "hash": "", "platformSchema": { - "com.linkedin.schema.OtherSchema": { - "rawSchema": "[{\"keyspace_name\": \"example_keyspace\", \"table_name\": \"example_view_1\", \"column_name\": \"ascii_column\", \"type\": \"ascii\", \"clustering_order\": \"asc\", \"kind\": \"clustering\", \"position\": 0}, 
{\"keyspace_name\": \"example_keyspace\", \"table_name\": \"example_view_1\", \"column_name\": \"bigint_column\", \"type\": \"bigint\", \"clustering_order\": \"none\", \"kind\": \"regular\", \"position\": -1}, {\"keyspace_name\": \"example_keyspace\", \"table_name\": \"example_view_1\", \"column_name\": \"id\", \"type\": \"uuid\", \"clustering_order\": \"none\", \"kind\": \"partition_key\", \"position\": 0}]" - } + "com.linkedin.schema.Schemaless": {} }, "fields": [ { - "fieldPath": "ascii_column", + "fieldPath": "email", "nullable": true, "type": { "type": { "com.linkedin.schema.StringType": {} } }, - "nativeDataType": "ascii", + "nativeDataType": "text", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "bigint_column", + "fieldPath": "name", "nullable": true, "type": { "type": { - "com.linkedin.schema.NumberType": {} + "com.linkedin.schema.StringType": {} } }, - "nativeDataType": "bigint", + "nativeDataType": "text", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "id", + "fieldPath": "person_id", "nullable": true, "type": { "type": { - "com.linkedin.schema.StringType": {} + "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "uuid", + "nativeDataType": "int", "recursive": false, "isPartOfKey": false } @@ -2025,51 +2418,48 @@ } }, "systemMetadata": { - "lastObserved": 1731591019464, + "lastObserved": 1739997601745, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.example_view_1,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.cass_test_1.people,PROD)", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "status", "aspect": { "json": { - "container": "urn:li:container:305f73c676989511c67d97ace119138c" + "removed": false } }, "systemMetadata": { - "lastObserved": 1731310103460, + "lastObserved": 1739924675511, "runId": "cassandra-test", "lastRunId": 
"no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.example_view_1,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.cass_test_1.people,PROD)", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "subTypes", "aspect": { "json": { - "path": [ - { - "id": "urn:li:container:305f73c676989511c67d97ace119138c", - "urn": "urn:li:container:305f73c676989511c67d97ace119138c" - } + "typeNames": [ + "Table" ] } }, "systemMetadata": { - "lastObserved": 1731310103461, + "lastObserved": 1739924675512, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.example_view_1,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.cass_test_1.people,PROD)", "changeType": "UPSERT", "aspectName": "datasetProperties", "aspect": { @@ -2080,7 +2470,6 @@ "compaction": "{\"class\": \"org.apache.cassandra.db.compaction.SizeTieredCompactionStrategy\", \"max_threshold\": \"32\", \"min_threshold\": \"4\"}", "compression": "{\"chunk_length_in_kb\": \"16\", \"class\": \"org.apache.cassandra.io.compress.LZ4Compressor\"}", "crc_check_chance": "1.0", - "include_all_columns": "False", "dclocal_read_repair_chance": "0.0", "default_time_to_live": "0", "extensions": "{}", @@ -2091,281 +2480,185 @@ "read_repair_chance": "0.0", "speculative_retry": "99p" }, - "name": "example_view_1", - "qualifiedName": "example_keyspace.example_view_1", - "description": "Example view definition with id and ascii_column", + "name": "people", + "qualifiedName": "cass_test_1.people", + "description": "", "tags": [] } }, "systemMetadata": { - "lastObserved": 1731591019464, - "runId": "cassandra-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.example_view_2,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:305f73c676989511c67d97ace119138c", - "urn": "urn:li:container:305f73c676989511c67d97ace119138c" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1731310942175, - "runId": "cassandra-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.example_view_2,PROD)", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false - } - }, - "systemMetadata": { - "lastObserved": 1731310942171, - "runId": "cassandra-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.example_view_2,PROD)", - "changeType": "UPSERT", - "aspectName": "subTypes", - "aspect": { - "json": { - "typeNames": [ - "View" - ] - } - }, - "systemMetadata": { - "lastObserved": 1731310942172, + "lastObserved": 1739924675513, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.example_view_2,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.cass_test_1.people,PROD)", "changeType": "UPSERT", - "aspectName": "viewProperties", + "aspectName": "container", "aspect": { "json": { - "materialized": true, - "viewLogic": "id IS NOT NULL AND ascii_column IS NOT NULL", - "viewLanguage": "CQL" + "container": "urn:li:container:b89ce3e714c980422ca601f9be0f54af" } }, "systemMetadata": { - "lastObserved": 1731310942172, + "lastObserved": 1739924675515, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataset", - "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.example_view_2,PROD)", - "changeType": "UPSERT", - "aspectName": "upstreamLineage", - "aspect": { - "json": { - "upstreams": [ - { - "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.all_data_types,PROD)", - "type": "VIEW" - } - ], - "fineGrainedLineages": [ - { - "upstreamType": "FIELD_SET", - "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.all_data_types,PROD),ascii_column)" - ], - "downstreamType": "FIELD", - "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.example_view_2,PROD),ascii_column)" - ], - "confidenceScore": 1.0 - }, - { - "upstreamType": "FIELD_SET", - "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.all_data_types,PROD),float_column)" - ], - "downstreamType": "FIELD", - "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.example_view_2,PROD),float_column)" - ], - "confidenceScore": 1.0 - }, - { - "upstreamType": "FIELD_SET", - "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.all_data_types,PROD),id)" - ], - "downstreamType": "FIELD", - "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.example_view_2,PROD),id)" - ], - "confidenceScore": 1.0 - } - ] +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.cass_test_1.people,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:cassandra", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:cassandra,dev_instance)" } }, "systemMetadata": { - "lastObserved": 1731447296594, + "lastObserved": 
1739924675516, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.example_view_2,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.cass_test_1.people,PROD)", "changeType": "UPSERT", - "aspectName": "datasetProperties", + "aspectName": "browsePathsV2", "aspect": { "json": { - "customProperties": { - "bloom_filter_fp_chance": "0.01", - "caching": "{\"keys\": \"ALL\", \"rows_per_partition\": \"NONE\"}", - "compaction": "{\"class\": \"org.apache.cassandra.db.compaction.SizeTieredCompactionStrategy\", \"max_threshold\": \"32\", \"min_threshold\": \"4\"}", - "compression": "{\"chunk_length_in_kb\": \"16\", \"class\": \"org.apache.cassandra.io.compress.LZ4Compressor\"}", - "crc_check_chance": "1.0", - "include_all_columns": "False", - "dclocal_read_repair_chance": "0.0", - "default_time_to_live": "0", - "extensions": "{}", - "gc_grace_seconds": "864000", - "max_index_interval": "2048", - "min_index_interval": "128", - "memtable_flush_period_in_ms": "0", - "read_repair_chance": "0.0", - "speculative_retry": "99p" - }, - "name": "example_view_2", - "qualifiedName": "example_keyspace.example_view_2", - "description": "Example view definition with id and ascii_column", - "tags": [] + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:cassandra,dev_instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:cassandra,dev_instance)" + }, + { + "id": "urn:li:container:b89ce3e714c980422ca601f9be0f54af", + "urn": "urn:li:container:b89ce3e714c980422ca601f9be0f54af" + } + ] } }, "systemMetadata": { - "lastObserved": 1731591019474, + "lastObserved": 1739924675517, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.example_view_2,PROD)", + "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.counter_table,PROD)", "changeType": "UPSERT", - "aspectName": "schemaMetadata", + "aspectName": "datasetProfile", "aspect": { "json": { - "schemaName": "example_view_2", - "platform": "urn:li:dataPlatform:cassandra", - "version": 0, - "created": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "hash": "", - "platformSchema": { - "com.linkedin.schema.OtherSchema": { - "rawSchema": "[{\"keyspace_name\": \"example_keyspace\", \"table_name\": \"example_view_2\", \"column_name\": \"ascii_column\", \"type\": \"ascii\", \"clustering_order\": \"asc\", \"kind\": \"clustering\", \"position\": 0}, {\"keyspace_name\": \"example_keyspace\", \"table_name\": \"example_view_2\", \"column_name\": \"float_column\", \"type\": \"float\", \"clustering_order\": \"none\", \"kind\": \"regular\", \"position\": -1}, {\"keyspace_name\": \"example_keyspace\", \"table_name\": \"example_view_2\", \"column_name\": \"id\", \"type\": \"uuid\", \"clustering_order\": \"none\", \"kind\": \"partition_key\", \"position\": 0}]" - } + "timestampMillis": 1739924675506, + "partitionSpec": { + "partition": "FULL_TABLE_SNAPSHOT", + "type": "FULL_TABLE" }, - "fields": [ - { - "fieldPath": "ascii_column", - "nullable": true, - "type": { - "type": { - "com.linkedin.schema.StringType": {} - } - }, - "nativeDataType": "ascii", - "recursive": false, - "isPartOfKey": false - }, + "rowCount": 0, + "columnCount": 2, + "fieldProfiles": [ { - "fieldPath": "float_column", - "nullable": true, - "type": { - "type": { - "com.linkedin.schema.NumberType": {} - } - }, - "nativeDataType": "float", - "recursive": false, - "isPartOfKey": false + "fieldPath": "counter_column", + "nullCount": 0 }, { "fieldPath": "id", - "nullable": true, - "type": { - "type": { - "com.linkedin.schema.StringType": {} - } - }, - "nativeDataType": "uuid", - "recursive": false, - 
"isPartOfKey": false + "nullCount": 0 } ] } }, "systemMetadata": { - "lastObserved": 1731591019474, + "lastObserved": 1739924675535, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.example_view_2,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.shopping_cart,PROD)", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "datasetProfile", "aspect": { "json": { - "container": "urn:li:container:305f73c676989511c67d97ace119138c" + "timestampMillis": 1739924675535, + "partitionSpec": { + "partition": "FULL_TABLE_SNAPSHOT", + "type": "FULL_TABLE" + }, + "rowCount": 9, + "columnCount": 3, + "fieldProfiles": [ + { + "fieldPath": "item_count", + "uniqueCount": 5, + "nullCount": 4, + "min": "2", + "max": "100", + "mean": "46.4", + "median": "50.0", + "stdev": "38.44", + "sampleValues": [ + "5", + "100", + "75", + "2", + "50" + ] + }, + { + "fieldPath": "last_update_timestamp", + "uniqueCount": 9, + "nullCount": 0, + "min": "2024-11-01 00:00:00", + "max": "2024-11-09 00:00:00", + "sampleValues": [ + "2024-11-08 00:00:00", + "2024-11-06 00:00:00", + "2024-11-02 00:00:00", + "2024-11-03 00:00:00", + "2024-11-05 00:00:00" + ] + }, + { + "fieldPath": "userid", + "uniqueCount": 9, + "nullCount": 0, + "min": "1234", + "max": "9876", + "sampleValues": [ + "1240", + "1238", + "1234", + "1235", + "1237" + ] + } + ] } }, "systemMetadata": { - "lastObserved": 1731310942175, + "lastObserved": 1739924675551, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.all_data_types,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.example_keyspace.all_data_types,PROD)", "changeType": "UPSERT", "aspectName": "datasetProfile", "aspect": { "json": { - 
"timestampMillis": 1731579516915, + "timestampMillis": 1739924675549, "partitionSpec": { "partition": "FULL_TABLE_SNAPSHOT", "type": "FULL_TABLE" @@ -2485,87 +2778,19 @@ } }, "systemMetadata": { - "lastObserved": 1731579516925, - "runId": "cassandra-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,cass_test_1.people,PROD)", - "changeType": "UPSERT", - "aspectName": "datasetProfile", - "aspect": { - "json": { - "timestampMillis": 1731579516959, - "partitionSpec": { - "partition": "FULL_TABLE_SNAPSHOT", - "type": "FULL_TABLE" - }, - "rowCount": 0, - "columnCount": 3, - "fieldProfiles": [ - { - "fieldPath": "email", - "nullCount": 0 - }, - { - "fieldPath": "name", - "nullCount": 0 - }, - { - "fieldPath": "person_id", - "nullCount": 0 - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1731579516960, - "runId": "cassandra-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.counter_table,PROD)", - "changeType": "UPSERT", - "aspectName": "datasetProfile", - "aspect": { - "json": { - "timestampMillis": 1731579516904, - "partitionSpec": { - "partition": "FULL_TABLE_SNAPSHOT", - "type": "FULL_TABLE" - }, - "rowCount": 0, - "columnCount": 2, - "fieldProfiles": [ - { - "fieldPath": "counter_column", - "nullCount": 0 - }, - { - "fieldPath": "id", - "nullCount": 0 - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1731579516915, + "lastObserved": 1739924675568, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,cass_test_2.tasks,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.cass_test_2.tasks,PROD)", "changeType": "UPSERT", "aspectName": "datasetProfile", "aspect": { "json": { - "timestampMillis": 1731579516939, + 
"timestampMillis": 1739924675564, "partitionSpec": { "partition": "FULL_TABLE_SNAPSHOT", "type": "FULL_TABLE" @@ -2593,19 +2818,19 @@ } }, "systemMetadata": { - "lastObserved": 1731579516950, + "lastObserved": 1739924675587, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,cass_test_1.information,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.cass_test_1.information,PROD)", "changeType": "UPSERT", "aspectName": "datasetProfile", "aspect": { "json": { - "timestampMillis": 1731579516950, + "timestampMillis": 1739924675586, "partitionSpec": { "partition": "FULL_TABLE_SNAPSHOT", "type": "FULL_TABLE" @@ -2629,76 +2854,43 @@ } }, "systemMetadata": { - "lastObserved": 1731579516959, + "lastObserved": 1739924675599, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,example_keyspace.shopping_cart,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:cassandra,dev_instance.cass_test_1.people,PROD)", "changeType": "UPSERT", "aspectName": "datasetProfile", "aspect": { "json": { - "timestampMillis": 1731579516925, + "timestampMillis": 1739924675598, "partitionSpec": { "partition": "FULL_TABLE_SNAPSHOT", "type": "FULL_TABLE" }, - "rowCount": 9, + "rowCount": 0, "columnCount": 3, "fieldProfiles": [ { - "fieldPath": "item_count", - "uniqueCount": 5, - "nullCount": 4, - "min": "2", - "max": "100", - "mean": "46.4", - "median": "50.0", - "stdev": "38.44", - "sampleValues": [ - "5", - "100", - "75", - "2", - "50" - ] + "fieldPath": "email", + "nullCount": 0 }, { - "fieldPath": "last_update_timestamp", - "uniqueCount": 9, - "nullCount": 0, - "min": "2024-11-01 00:00:00", - "max": "2024-11-09 00:00:00", - "sampleValues": [ - "2024-11-08 00:00:00", - "2024-11-06 00:00:00", - "2024-11-02 00:00:00", - "2024-11-03 00:00:00", - "2024-11-05 
00:00:00" - ] + "fieldPath": "name", + "nullCount": 0 }, { - "fieldPath": "userid", - "uniqueCount": 9, - "nullCount": 0, - "min": "1234", - "max": "9876", - "sampleValues": [ - "1240", - "1238", - "1234", - "1235", - "1237" - ] + "fieldPath": "person_id", + "nullCount": 0 } ] } }, "systemMetadata": { - "lastObserved": 1731579516939, + "lastObserved": 1739924675602, "runId": "cassandra-test", "lastRunId": "no-run-id-provided" } diff --git a/metadata-ingestion/tests/integration/cassandra/docker-compose.yml b/metadata-ingestion/tests/integration/cassandra/docker-compose.yml index a1a2a3b97d134..f7509155597c7 100644 --- a/metadata-ingestion/tests/integration/cassandra/docker-compose.yml +++ b/metadata-ingestion/tests/integration/cassandra/docker-compose.yml @@ -1,4 +1,3 @@ -version: "1" services: test-cassandra: image: cassandra:latest @@ -6,7 +5,7 @@ services: ports: - 9042:9042 volumes: - - ./setup/cassandra.yaml:/etc/cassandra/cassandra.yaml + - ${CASSANDRA_CONFIG_DIR:-./setup}/cassandra.yaml:/etc/cassandra/cassandra.yaml - ./setup/init_keyspaces.cql:/docker-entrypoint-initdb.d/init_keyspaces.cql networks: - testnet diff --git a/metadata-ingestion/tests/integration/cassandra/test_cassandra.py b/metadata-ingestion/tests/integration/cassandra/test_cassandra.py index d561308aaad20..822099903cabc 100644 --- a/metadata-ingestion/tests/integration/cassandra/test_cassandra.py +++ b/metadata-ingestion/tests/integration/cassandra/test_cassandra.py @@ -1,4 +1,6 @@ import logging +import pathlib +import shutil import time import pytest @@ -9,25 +11,37 @@ logger = logging.getLogger(__name__) +_resources_dir = pathlib.Path(__file__).parent + @pytest.mark.integration -def test_cassandra_ingest(docker_compose_runner, pytestconfig, tmp_path): - test_resources_dir = pytestconfig.rootpath / "tests/integration/cassandra" +def test_cassandra_ingest(docker_compose_runner, pytestconfig, tmp_path, monkeypatch): + # Tricky: The cassandra container makes modifications directly to the 
cassandra.yaml + # config file. + # See https://github.com/docker-library/cassandra/issues/165 + # To avoid spurious diffs, we copy the config file to a temporary location + # and depend on that instead. The docker-compose file has the corresponding + # env variable usage to pick up the config file. + cassandra_config_file = _resources_dir / "setup/cassandra.yaml" + shutil.copy(cassandra_config_file, tmp_path / "cassandra.yaml") + monkeypatch.setenv("CASSANDRA_CONFIG_DIR", str(tmp_path)) with docker_compose_runner( - test_resources_dir / "docker-compose.yml", "cassandra" + _resources_dir / "docker-compose.yml", "cassandra" ) as docker_services: wait_for_port(docker_services, "test-cassandra", 9042) time.sleep(5) + # Run the metadata ingestion pipeline. logger.info("Starting the ingestion test...") - pipeline_default_platform_instance = Pipeline.create( + pipeline = Pipeline.create( { "run_id": "cassandra-test", "source": { "type": "cassandra", "config": { + "platform_instance": "dev_instance", "contact_point": "localhost", "port": 9042, "profiling": {"enabled": True}, @@ -41,13 +55,13 @@ def test_cassandra_ingest(docker_compose_runner, pytestconfig, tmp_path): }, } ) - pipeline_default_platform_instance.run() - pipeline_default_platform_instance.raise_from_status() + pipeline.run() + pipeline.raise_from_status() # Verify the output. 
logger.info("Verifying output.") mce_helpers.check_golden_file( pytestconfig, output_path=f"{tmp_path}/cassandra_mcps.json", - golden_path=test_resources_dir / "cassandra_mcps_golden.json", + golden_path=_resources_dir / "cassandra_mcps_golden.json", ) From 610e5a899e313e94f66a87e116abf6d4671d51c9 Mon Sep 17 00:00:00 2001 From: Chris Collins Date: Mon, 24 Feb 2025 17:29:29 -0500 Subject: [PATCH 08/45] fix(filters) Fix autocomplete for platforms and improve advanced search builder (#12560) --- .../types/dataplatform/DataPlatformType.java | 51 ++++++++++++++++++- 1 file changed, 50 insertions(+), 1 deletion(-) diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/dataplatform/DataPlatformType.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/dataplatform/DataPlatformType.java index 921b1ab3b5edd..c0c57086a9f2a 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/dataplatform/DataPlatformType.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/dataplatform/DataPlatformType.java @@ -2,15 +2,26 @@ import static com.linkedin.metadata.Constants.*; +import com.google.common.collect.ImmutableSet; import com.linkedin.common.urn.Urn; import com.linkedin.common.urn.UrnUtils; import com.linkedin.datahub.graphql.QueryContext; +import com.linkedin.datahub.graphql.generated.AutoCompleteResults; import com.linkedin.datahub.graphql.generated.DataPlatform; import com.linkedin.datahub.graphql.generated.Entity; +import com.linkedin.datahub.graphql.generated.FacetFilterInput; +import com.linkedin.datahub.graphql.generated.SearchResults; +import com.linkedin.datahub.graphql.resolvers.ResolverUtils; import com.linkedin.datahub.graphql.types.EntityType; +import com.linkedin.datahub.graphql.types.SearchableEntityType; import com.linkedin.datahub.graphql.types.dataplatform.mappers.DataPlatformMapper; +import com.linkedin.datahub.graphql.types.mappers.AutoCompleteResultsMapper; 
+import com.linkedin.datahub.graphql.types.mappers.UrnSearchResultsMapper; import com.linkedin.entity.EntityResponse; import com.linkedin.entity.client.EntityClient; +import com.linkedin.metadata.query.AutoCompleteResult; +import com.linkedin.metadata.query.filter.Filter; +import com.linkedin.metadata.search.SearchResult; import graphql.execution.DataFetcherResult; import java.util.ArrayList; import java.util.HashSet; @@ -18,8 +29,11 @@ import java.util.Map; import java.util.function.Function; import java.util.stream.Collectors; +import javax.annotation.Nonnull; +import javax.annotation.Nullable; -public class DataPlatformType implements EntityType { +public class DataPlatformType + implements SearchableEntityType, EntityType { private final EntityClient _entityClient; @@ -75,4 +89,39 @@ public com.linkedin.datahub.graphql.generated.EntityType type() { public Function getKeyProvider() { return Entity::getUrn; } + + @Override + public SearchResults search( + @Nonnull String query, + @Nullable List filters, + int start, + int count, + @Nonnull final QueryContext context) + throws Exception { + final Map facetFilters = + ResolverUtils.buildFacetFilters(filters, ImmutableSet.of()); + final SearchResult searchResult = + _entityClient.search( + context.getOperationContext().withSearchFlags(flags -> flags.setFulltext(true)), + DATA_PLATFORM_ENTITY_NAME, + query, + facetFilters, + start, + count); + return UrnSearchResultsMapper.map(context, searchResult); + } + + @Override + public AutoCompleteResults autoComplete( + @Nonnull String query, + @Nullable String field, + @Nullable Filter filters, + int limit, + @Nonnull final QueryContext context) + throws Exception { + final AutoCompleteResult result = + _entityClient.autoComplete( + context.getOperationContext(), DATA_PLATFORM_ENTITY_NAME, query, filters, limit); + return AutoCompleteResultsMapper.map(context, result); + } } From cb67726ab4c5334e92b760de183ac3e38e60883d Mon Sep 17 00:00:00 2001 From: cccs-cat001 
<56204545+cccs-cat001@users.noreply.github.com> Date: Mon, 24 Feb 2025 18:47:46 -0400 Subject: [PATCH 09/45] fix(ingest): handle groups in pattern_cleanup_ownership transformer (#12536) --- .../transformer/pattern_cleanup_ownership.py | 32 +- .../tests/unit/test_transform_dataset.py | 317 ++++++++++++++++++ 2 files changed, 342 insertions(+), 7 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/transformer/pattern_cleanup_ownership.py b/metadata-ingestion/src/datahub/ingestion/transformer/pattern_cleanup_ownership.py index f17546d6f7299..3b4491290516c 100644 --- a/metadata-ingestion/src/datahub/ingestion/transformer/pattern_cleanup_ownership.py +++ b/metadata-ingestion/src/datahub/ingestion/transformer/pattern_cleanup_ownership.py @@ -1,3 +1,4 @@ +import logging import re from typing import List, Optional, Set, cast @@ -10,8 +11,11 @@ OwnershipClass, OwnershipTypeClass, ) +from datahub.metadata.urns import CorpGroupUrn, CorpUserUrn +from datahub.utilities.urns._urn_base import Urn +from datahub.utilities.urns.error import InvalidUrnError -_USER_URN_PREFIX: str = "urn:li:corpuser:" +logger = logging.getLogger(__name__) class PatternCleanUpOwnershipConfig(ConfigModel): @@ -49,6 +53,11 @@ def _get_current_owner_urns(self, entity_urn: str) -> Set[str]: else: return set() + def _process_owner(self, name: str) -> str: + for value in self.config.pattern_for_cleanup: + name = re.sub(value, "", name) + return name + def transform_aspect( self, entity_urn: str, aspect_name: str, aspect: Optional[builder.Aspect] ) -> Optional[builder.Aspect]: @@ -58,14 +67,23 @@ def transform_aspect( # clean all the owners based on the parameters received from config cleaned_owner_urns: List[str] = [] for owner_urn in current_owner_urns: - user_id: str = owner_urn.split(_USER_URN_PREFIX)[1] - for value in self.config.pattern_for_cleanup: - user_id = re.sub(value, "", user_id) - - cleaned_owner_urns.append(_USER_URN_PREFIX + user_id) + username = "" + try: + owner: Urn = 
Urn.from_string(owner_urn) + if isinstance(owner, CorpUserUrn): + username = str(CorpUserUrn(self._process_owner(owner.username))) + elif isinstance(owner, CorpGroupUrn): + username = str(CorpGroupUrn(self._process_owner(owner.name))) + else: + logger.warning(f"{owner_urn} is not a supported owner type.") + username = owner_urn + except InvalidUrnError: + logger.warning(f"Could not parse {owner_urn} from {entity_urn}") + username = owner_urn + cleaned_owner_urns.append(username) ownership_type, ownership_type_urn = builder.validate_ownership_type( - OwnershipTypeClass.DATAOWNER + OwnershipTypeClass.TECHNICAL_OWNER ) owners = [ OwnerClass( diff --git a/metadata-ingestion/tests/unit/test_transform_dataset.py b/metadata-ingestion/tests/unit/test_transform_dataset.py index 5151be9c8b199..290e57b46f504 100644 --- a/metadata-ingestion/tests/unit/test_transform_dataset.py +++ b/metadata-ingestion/tests/unit/test_transform_dataset.py @@ -3954,6 +3954,323 @@ def test_clean_owner_urn_transformation_should_not_remove_system_identifier( _test_clean_owner_urns(pipeline_context, in_owner_urns, config, in_owner_urns) +def test_clean_owner_group_urn_transformation_remove_fixed_string( + mock_datahub_graph_instance, +): + pipeline_context = PipelineContext(run_id="transformer_pipe_line") + pipeline_context.graph = mock_datahub_graph_instance + + group_ids = [ + "ABCDEF:email_id@example.com", + "ABCDEF:123email_id@example.com", + "email_id@example.co.in", + "email_id@example.co.uk", + "email_test:XYZ@example.com", + "email_id:id1@example.com", + "email_id:id2@example.com", + ] + + in_owner_urns: List[str] = [] + for group in group_ids: + in_owner_urns.append( + builder.make_owner_urn(group, owner_type=builder.OwnerType.GROUP) + ) + + # remove 'ABCDEF:' + config: List[Union[re.Pattern, str]] = ["ABCDEF:"] + expected_group_ids: List[str] = [ + "email_id@example.com", + "123email_id@example.com", + "email_id@example.co.in", + "email_id@example.co.uk", + "email_test:XYZ@example.com", + 
"email_id:id1@example.com", + "email_id:id2@example.com", + ] + expected_owner_urns: List[str] = [] + for group in expected_group_ids: + expected_owner_urns.append( + builder.make_owner_urn(group, owner_type=builder.OwnerType.GROUP) + ) + _test_clean_owner_urns(pipeline_context, in_owner_urns, config, expected_owner_urns) + + +def test_clean_owner_group_urn_transformation_remove_multiple_values( + mock_datahub_graph_instance, +): + pipeline_context = PipelineContext(run_id="transformer_pipe_line") + pipeline_context.graph = mock_datahub_graph_instance + + group_ids = [ + "ABCDEF:email_id@example.com", + "ABCDEF:123email_id@example.com", + "email_id@example.co.in", + "email_id@example.co.uk", + "email_test:XYZ@example.com", + "email_id:id1@example.com", + "email_id:id2@example.com", + ] + + in_owner_urns: List[str] = [] + for group in group_ids: + in_owner_urns.append( + builder.make_owner_urn(group, owner_type=builder.OwnerType.GROUP) + ) + + # remove multiple values + config: List[Union[re.Pattern, str]] = ["ABCDEF:", "email"] + expected_group_ids: List[str] = [ + "_id@example.com", + "123_id@example.com", + "_id@example.co.in", + "_id@example.co.uk", + "_test:XYZ@example.com", + "_id:id1@example.com", + "_id:id2@example.com", + ] + expected_owner_urns: List[str] = [] + for group in expected_group_ids: + expected_owner_urns.append( + builder.make_owner_urn(group, owner_type=builder.OwnerType.GROUP) + ) + _test_clean_owner_urns(pipeline_context, in_owner_urns, config, expected_owner_urns) + + +def test_clean_owner_group_urn_transformation_remove_values_using_regex( + mock_datahub_graph_instance, +): + pipeline_context = PipelineContext(run_id="transformer_pipe_line") + pipeline_context.graph = mock_datahub_graph_instance + + group_ids = [ + "ABCDEF:email_id@example.com", + "ABCDEF:123email_id@example.com", + "email_id@example.co.in", + "email_id@example.co.uk", + "email_test:XYZ@example.com", + "email_id:id1@example.com", + "email_id:id2@example.com", + ] + + 
in_owner_urns: List[str] = [] + for group in group_ids: + in_owner_urns.append( + builder.make_owner_urn(group, owner_type=builder.OwnerType.GROUP) + ) + + # remove words after `_` using RegEx i.e. `id`, `test` + config: List[Union[re.Pattern, str]] = [r"(?<=_)(\w+)"] + expected_group_ids: List[str] = [ + "ABCDEF:email_@example.com", + "ABCDEF:123email_@example.com", + "email_@example.co.in", + "email_@example.co.uk", + "email_:XYZ@example.com", + "email_:id1@example.com", + "email_:id2@example.com", + ] + expected_owner_urns: List[str] = [] + for group in expected_group_ids: + expected_owner_urns.append( + builder.make_owner_urn(group, owner_type=builder.OwnerType.GROUP) + ) + _test_clean_owner_urns(pipeline_context, in_owner_urns, config, expected_owner_urns) + + +def test_clean_owner_group_urn_transformation_remove_digits( + mock_datahub_graph_instance, +): + pipeline_context = PipelineContext(run_id="transformer_pipe_line") + pipeline_context.graph = mock_datahub_graph_instance + + group_ids = [ + "ABCDEF:email_id@example.com", + "ABCDEF:123email_id@example.com", + "email_id@example.co.in", + "email_id@example.co.uk", + "email_test:XYZ@example.com", + "email_id:id1@example.com", + "email_id:id2@example.com", + ] + + in_owner_urns: List[str] = [] + for group in group_ids: + in_owner_urns.append( + builder.make_owner_urn(group, owner_type=builder.OwnerType.GROUP) + ) + + # remove digits + config: List[Union[re.Pattern, str]] = [r"\d+"] + expected_group_ids: List[str] = [ + "ABCDEF:email_id@example.com", + "ABCDEF:email_id@example.com", + "email_id@example.co.in", + "email_id@example.co.uk", + "email_test:XYZ@example.com", + "email_id:id@example.com", + "email_id:id@example.com", + ] + expected_owner_urns: List[str] = [] + for group in expected_group_ids: + expected_owner_urns.append( + builder.make_owner_urn(group, owner_type=builder.OwnerType.GROUP) + ) + _test_clean_owner_urns(pipeline_context, in_owner_urns, config, expected_owner_urns) + + +def 
test_clean_owner_group_urn_transformation_remove_pattern( + mock_datahub_graph_instance, +): + pipeline_context = PipelineContext(run_id="transformer_pipe_line") + pipeline_context.graph = mock_datahub_graph_instance + + group_ids = [ + "ABCDEF:email_id@example.com", + "ABCDEF:123email_id@example.com", + "email_id@example.co.in", + "email_id@example.co.uk", + "email_test:XYZ@example.com", + "email_id:id1@example.com", + "email_id:id2@example.com", + ] + + in_owner_urns: List[str] = [] + for group in group_ids: + in_owner_urns.append( + builder.make_owner_urn(group, owner_type=builder.OwnerType.GROUP) + ) + + # remove `example.*` + config: List[Union[re.Pattern, str]] = [r"@example\.\S*"] + expected_group_ids: List[str] = [ + "ABCDEF:email_id", + "ABCDEF:123email_id", + "email_id", + "email_id", + "email_test:XYZ", + "email_id:id1", + "email_id:id2", + ] + expected_owner_urns: List[str] = [] + for group in expected_group_ids: + expected_owner_urns.append( + builder.make_owner_urn(group, owner_type=builder.OwnerType.GROUP) + ) + _test_clean_owner_urns(pipeline_context, in_owner_urns, config, expected_owner_urns) + + +def test_clean_owner_group_urn_transformation_remove_word_in_capital_letters( + mock_datahub_graph_instance, +): + pipeline_context = PipelineContext(run_id="transformer_pipe_line") + pipeline_context.graph = mock_datahub_graph_instance + + group_ids = [ + "ABCDEF:email_id@example.com", + "ABCDEF:123email_id@example.com", + "email_id@example.co.in", + "email_id@example.co.uk", + "email_test:XYZ@example.com", + "email_id:id1@example.com", + "email_id:id2@example.com", + "email_test:XYabZ@example.com", + ] + + in_owner_urns: List[str] = [] + for group in group_ids: + in_owner_urns.append( + builder.make_owner_urn(group, owner_type=builder.OwnerType.GROUP) + ) + + # if string between `:` and `@` is in CAPITAL then remove it + config: List[Union[re.Pattern, str]] = ["(?<=:)[A-Z]+(?=@)"] + expected_group_ids: List[str] = [ + "ABCDEF:email_id@example.com", + 
"ABCDEF:123email_id@example.com", + "email_id@example.co.in", + "email_id@example.co.uk", + "email_test:@example.com", + "email_id:id1@example.com", + "email_id:id2@example.com", + "email_test:XYabZ@example.com", + ] + expected_owner_urns: List[str] = [] + for group in expected_group_ids: + expected_owner_urns.append( + builder.make_owner_urn(group, owner_type=builder.OwnerType.GROUP) + ) + _test_clean_owner_urns(pipeline_context, in_owner_urns, config, expected_owner_urns) + + +def test_clean_owner_group_urn_transformation_remove_pattern_with_alphanumeric_value( + mock_datahub_graph_instance, +): + pipeline_context = PipelineContext(run_id="transformer_pipe_line") + pipeline_context.graph = mock_datahub_graph_instance + + group_ids = [ + "ABCDEF:email_id@example.com", + "ABCDEF:123email_id@example.com", + "email_id@example.co.in", + "email_id@example.co.uk", + "email_test:XYZ@example.com", + "email_id:id1@example.com", + "email_id:id2@example.com", + ] + + in_owner_urns: List[str] = [] + for group in group_ids: + in_owner_urns.append( + builder.make_owner_urn(group, owner_type=builder.OwnerType.GROUP) + ) + + # remove any pattern having `id` followed by any digits + config: List[Union[re.Pattern, str]] = [r"id\d+"] + expected_group_ids: List[str] = [ + "ABCDEF:email_id@example.com", + "ABCDEF:123email_id@example.com", + "email_id@example.co.in", + "email_id@example.co.uk", + "email_test:XYZ@example.com", + "email_id:@example.com", + "email_id:@example.com", + ] + expected_owner_urns: List[str] = [] + for group in expected_group_ids: + expected_owner_urns.append( + builder.make_owner_urn(group, owner_type=builder.OwnerType.GROUP) + ) + _test_clean_owner_urns(pipeline_context, in_owner_urns, config, expected_owner_urns) + + +def test_clean_owner_group_urn_transformation_should_not_remove_system_identifier( + mock_datahub_graph_instance, +): + pipeline_context = PipelineContext(run_id="transformer_pipe_line") + pipeline_context.graph = mock_datahub_graph_instance + + 
group_ids = [ + "ABCDEF:email_id@example.com", + "ABCDEF:123email_id@example.com", + "email_id@example.co.in", + "email_id@example.co.uk", + "email_test:XYZ@example.com", + "email_id:id1@example.com", + "email_id:id2@example.com", + ] + + in_owner_urns: List[str] = [] + for group in group_ids: + in_owner_urns.append( + builder.make_owner_urn(group, owner_type=builder.OwnerType.GROUP) + ) + + # should not remove system identifier + config: List[Union[re.Pattern, str]] = ["urn:li:corpGroup:"] + + _test_clean_owner_urns(pipeline_context, in_owner_urns, config, in_owner_urns) + + def test_replace_external_url_word_replace( mock_datahub_graph_instance, ): From c980cbd8f2d10d2ece1aa7796449744d958f6300 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergio=20G=C3=B3mez=20Villamor?= Date: Tue, 25 Feb 2025 08:08:45 +0100 Subject: [PATCH 10/45] tests(druid): integration tests for druid ingestion (#12717) Co-authored-by: Harshal Sheth --- .../tests/integration/druid/docker/README.md | 1 + .../druid/docker/docker-compose.yml | 134 ++ .../integration/druid/docker/environment | 51 + .../integration/druid/golden/druid_mces.json | 1615 +++++++++++++++++ .../druid/recipes/druid_to_file.yml | 11 + .../tests/integration/druid/test_druid.py | 45 + 6 files changed, 1857 insertions(+) create mode 100644 metadata-ingestion/tests/integration/druid/docker/README.md create mode 100644 metadata-ingestion/tests/integration/druid/docker/docker-compose.yml create mode 100644 metadata-ingestion/tests/integration/druid/docker/environment create mode 100644 metadata-ingestion/tests/integration/druid/golden/druid_mces.json create mode 100644 metadata-ingestion/tests/integration/druid/recipes/druid_to_file.yml create mode 100644 metadata-ingestion/tests/integration/druid/test_druid.py diff --git a/metadata-ingestion/tests/integration/druid/docker/README.md b/metadata-ingestion/tests/integration/druid/docker/README.md new file mode 100644 index 0000000000000..da5c0e42dfbae --- /dev/null +++ 
b/metadata-ingestion/tests/integration/druid/docker/README.md @@ -0,0 +1 @@ +Copied Docker setup from https://github.com/apache/druid/blob/32.0.0/distribution/docker/docker-compose.yml \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/druid/docker/docker-compose.yml b/metadata-ingestion/tests/integration/druid/docker/docker-compose.yml new file mode 100644 index 0000000000000..b088720e9eb6f --- /dev/null +++ b/metadata-ingestion/tests/integration/druid/docker/docker-compose.yml @@ -0,0 +1,134 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +version: "2.2" + +volumes: + metadata_data: {} + middle_var: {} + historical_var: {} + broker_var: {} + coordinator_var: {} + router_var: {} + druid_shared: {} + + +services: + postgres: + container_name: postgres + image: postgres:latest + ports: + - "5432:5432" + volumes: + - metadata_data:/var/lib/postgresql/data + environment: + - POSTGRES_PASSWORD=FoolishPassword + - POSTGRES_USER=druid + - POSTGRES_DB=druid + + # Need 3.5 or later for container nodes + zookeeper: + container_name: zookeeper + image: zookeeper:3.5.10 + ports: + - "2181:2181" + environment: + - ZOO_MY_ID=1 + + coordinator: + image: apache/druid:32.0.0 + container_name: coordinator + volumes: + - druid_shared:/opt/shared + - coordinator_var:/opt/druid/var + depends_on: + - zookeeper + - postgres + ports: + - "8081:8081" + command: + - coordinator + env_file: + - environment + + broker: + image: apache/druid:32.0.0 + container_name: broker + volumes: + - broker_var:/opt/druid/var + depends_on: + - zookeeper + - postgres + - coordinator + ports: + - "8082:8082" + command: + - broker + env_file: + - environment + + historical: + image: apache/druid:32.0.0 + container_name: historical + volumes: + - druid_shared:/opt/shared + - historical_var:/opt/druid/var + depends_on: + - zookeeper + - postgres + - coordinator + ports: + - "8083:8083" + command: + - historical + env_file: + - environment + + middlemanager: + image: apache/druid:32.0.0 + container_name: middlemanager + volumes: + - druid_shared:/opt/shared + - middle_var:/opt/druid/var + depends_on: + - zookeeper + - postgres + - coordinator + ports: + - "8091:8091" + - "8100-8105:8100-8105" + command: + - middleManager + env_file: + - environment + + router: + image: apache/druid:32.0.0 + container_name: router + volumes: + - router_var:/opt/druid/var + depends_on: + - zookeeper + - postgres + - coordinator + ports: + - "8888:8888" + command: + - router + env_file: + - environment \ No newline at end of file diff --git 
a/metadata-ingestion/tests/integration/druid/docker/environment b/metadata-ingestion/tests/integration/druid/docker/environment new file mode 100644 index 0000000000000..20de1c94fb373 --- /dev/null +++ b/metadata-ingestion/tests/integration/druid/docker/environment @@ -0,0 +1,51 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# + +# Java tuning +#DRUID_XMX=1g +#DRUID_XMS=1g +#DRUID_MAXNEWSIZE=250m +#DRUID_NEWSIZE=250m +#DRUID_MAXDIRECTMEMORYSIZE=6172m +DRUID_SINGLE_NODE_CONF=micro-quickstart + +druid_emitter_logging_logLevel=debug + +druid_extensions_loadList=["druid-histogram", "druid-datasketches", "druid-lookups-cached-global", "postgresql-metadata-storage", "druid-multi-stage-query"] + +druid_zk_service_host=zookeeper + +druid_metadata_storage_host= +druid_metadata_storage_type=postgresql +druid_metadata_storage_connector_connectURI=jdbc:postgresql://postgres:5432/druid +druid_metadata_storage_connector_user=druid +druid_metadata_storage_connector_password=FoolishPassword + +druid_indexer_runner_javaOptsArray=["-server", "-Xmx1g", "-Xms1g", "-XX:MaxDirectMemorySize=3g", "-Duser.timezone=UTC", "-Dfile.encoding=UTF-8", "-Djava.util.logging.manager=org.apache.logging.log4j.jul.LogManager"] +druid_indexer_fork_property_druid_processing_buffer_sizeBytes=256MiB + +druid_storage_type=local +druid_storage_storageDirectory=/opt/shared/segments +druid_indexer_logs_type=file +druid_indexer_logs_directory=/opt/shared/indexing-logs + +druid_processing_numThreads=2 +druid_processing_numMergeBuffers=2 + +DRUID_LOG4J= \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/druid/golden/druid_mces.json b/metadata-ingestion/tests/integration/druid/golden/druid_mces.json new file mode 100644 index 0000000000000..e4a276a0e29b8 --- /dev/null +++ b/metadata-ingestion/tests/integration/druid/golden/druid_mces.json @@ -0,0 +1,1615 @@ +[ +{ + "entityType": "container", + "entityUrn": "urn:li:container:60894d7dfe5dd8cc58b7fef9e08a360b", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "druid", + "instance": "my_druid", + "env": "PROD", + "database": "druid/v2/sql/" + }, + "name": "druid/v2/sql/", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1740387600000, + "runId": 
"druid-2025_02_24-09_00_00-p8a5zq", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:60894d7dfe5dd8cc58b7fef9e08a360b", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1740387600000, + "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:60894d7dfe5dd8cc58b7fef9e08a360b", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:druid", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:druid,my_druid)" + } + }, + "systemMetadata": { + "lastObserved": 1740387600000, + "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:60894d7dfe5dd8cc58b7fef9e08a360b", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Database" + ] + } + }, + "systemMetadata": { + "lastObserved": 1740387600000, + "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:60894d7dfe5dd8cc58b7fef9e08a360b", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:druid,my_druid)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:druid,my_druid)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1740387600000, + "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:21c5c6d3f7ea8f3719ba9a5b54dec295", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": 
"urn:li:container:60894d7dfe5dd8cc58b7fef9e08a360b" + } + }, + "systemMetadata": { + "lastObserved": 1740387600000, + "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:21c5c6d3f7ea8f3719ba9a5b54dec295", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "druid", + "instance": "my_druid", + "env": "PROD", + "database": "druid/v2/sql/", + "schema": "druid" + }, + "name": "druid", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1740387600000, + "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:21c5c6d3f7ea8f3719ba9a5b54dec295", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1740387600000, + "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:21c5c6d3f7ea8f3719ba9a5b54dec295", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:druid", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:druid,my_druid)" + } + }, + "systemMetadata": { + "lastObserved": 1740387600000, + "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:21c5c6d3f7ea8f3719ba9a5b54dec295", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Schema" + ] + } + }, + "systemMetadata": { + "lastObserved": 1740387600000, + "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": 
"urn:li:container:21c5c6d3f7ea8f3719ba9a5b54dec295", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:druid,my_druid)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:druid,my_druid)" + }, + { + "id": "urn:li:container:60894d7dfe5dd8cc58b7fef9e08a360b", + "urn": "urn:li:container:60894d7dfe5dd8cc58b7fef9e08a360b" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1740387600000, + "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:ef71eb2867381f1f215505577b90a0b9", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:60894d7dfe5dd8cc58b7fef9e08a360b" + } + }, + "systemMetadata": { + "lastObserved": 1740387600000, + "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:ef71eb2867381f1f215505577b90a0b9", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "druid", + "instance": "my_druid", + "env": "PROD", + "database": "druid/v2/sql/", + "schema": "sys" + }, + "name": "sys", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1740387600000, + "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:ef71eb2867381f1f215505577b90a0b9", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1740387600000, + "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:ef71eb2867381f1f215505577b90a0b9", + "changeType": 
"UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:druid", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:druid,my_druid)" + } + }, + "systemMetadata": { + "lastObserved": 1740387600000, + "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:ef71eb2867381f1f215505577b90a0b9", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Schema" + ] + } + }, + "systemMetadata": { + "lastObserved": 1740387600000, + "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:ef71eb2867381f1f215505577b90a0b9", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:druid,my_druid)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:druid,my_druid)" + }, + { + "id": "urn:li:container:60894d7dfe5dd8cc58b7fef9e08a360b", + "urn": "urn:li:container:60894d7dfe5dd8cc58b7fef9e08a360b" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1740387600000, + "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:druid,my_druid.my_druid.segments,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:ef71eb2867381f1f215505577b90a0b9" + } + }, + "systemMetadata": { + "lastObserved": 1740387600000, + "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:druid,my_druid.my_druid.segments,PROD)", + "aspects": [ + { + 
"com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": {}, + "name": "segments", + "description": "", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "my_druid.segments", + "platform": "urn:li:dataPlatform:druid", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "segment_id", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "STRING", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "datasource", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "STRING", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "start", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "STRING", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "end", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "STRING", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "size", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "LONG", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "version", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "STRING", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "partition_num", + "nullable": true, + 
"type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "LONG", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "num_replicas", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "LONG", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "num_rows", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "LONG", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "is_active", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "LONG", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "is_published", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "LONG", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "is_available", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "LONG", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "is_realtime", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "LONG", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "is_overshadowed", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "LONG", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "shard_spec", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "STRING", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "dimensions", + "nullable": true, + "type": { + "type": { + 
"com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "STRING", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "metrics", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "STRING", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "last_compaction_state", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "STRING", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "replication_factor", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "LONG", + "recursive": false, + "isPartOfKey": false + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1740387600000, + "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:druid,my_druid.my_druid.segments,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:druid", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:druid,my_druid)" + } + }, + "systemMetadata": { + "lastObserved": 1740387600000, + "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:druid,my_druid.my_druid.segments,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } + }, + "systemMetadata": { + "lastObserved": 1740387600000, + "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:druid,my_druid.my_druid.segments,PROD)", + "changeType": 
"UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:druid,my_druid)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:druid,my_druid)" + }, + { + "id": "urn:li:container:60894d7dfe5dd8cc58b7fef9e08a360b", + "urn": "urn:li:container:60894d7dfe5dd8cc58b7fef9e08a360b" + }, + { + "id": "urn:li:container:ef71eb2867381f1f215505577b90a0b9", + "urn": "urn:li:container:ef71eb2867381f1f215505577b90a0b9" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1740387600000, + "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:druid,my_druid.my_druid.server_segments,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:ef71eb2867381f1f215505577b90a0b9" + } + }, + "systemMetadata": { + "lastObserved": 1740387600000, + "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:druid,my_druid.my_druid.server_segments,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": {}, + "name": "server_segments", + "description": "", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "my_druid.server_segments", + "platform": "urn:li:dataPlatform:druid", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": 
"server", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "STRING", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "segment_id", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "STRING", + "recursive": false, + "isPartOfKey": false + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1740387600000, + "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:druid,my_druid.my_druid.server_segments,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:druid", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:druid,my_druid)" + } + }, + "systemMetadata": { + "lastObserved": 1740387600000, + "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:druid,my_druid.my_druid.server_segments,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } + }, + "systemMetadata": { + "lastObserved": 1740387600000, + "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:druid,my_druid.my_druid.server_segments,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:druid,my_druid)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:druid,my_druid)" + }, + { + "id": "urn:li:container:60894d7dfe5dd8cc58b7fef9e08a360b", + "urn": "urn:li:container:60894d7dfe5dd8cc58b7fef9e08a360b" + }, 
+ { + "id": "urn:li:container:ef71eb2867381f1f215505577b90a0b9", + "urn": "urn:li:container:ef71eb2867381f1f215505577b90a0b9" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1740387600000, + "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:druid,my_druid.my_druid.servers,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:ef71eb2867381f1f215505577b90a0b9" + } + }, + "systemMetadata": { + "lastObserved": 1740387600000, + "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:druid,my_druid.my_druid.servers,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": {}, + "name": "servers", + "description": "", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "my_druid.servers", + "platform": "urn:li:dataPlatform:druid", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "server", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "STRING", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "host", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "STRING", + "recursive": false, + "isPartOfKey": false + }, + { + 
"fieldPath": "plaintext_port", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "LONG", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "tls_port", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "LONG", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "server_type", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "STRING", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "tier", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "STRING", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "curr_size", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "LONG", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "max_size", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "LONG", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "is_leader", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "LONG", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "start_time", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "STRING", + "recursive": false, + "isPartOfKey": false + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1740387600000, + "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:druid,my_druid.my_druid.servers,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:druid", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:druid,my_druid)" + } + }, + "systemMetadata": { + "lastObserved": 1740387600000, + "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:druid,my_druid.my_druid.servers,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } + }, + "systemMetadata": { + "lastObserved": 1740387600000, + "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:druid,my_druid.my_druid.servers,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:druid,my_druid)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:druid,my_druid)" + }, + { + "id": "urn:li:container:60894d7dfe5dd8cc58b7fef9e08a360b", + "urn": "urn:li:container:60894d7dfe5dd8cc58b7fef9e08a360b" + }, + { + "id": "urn:li:container:ef71eb2867381f1f215505577b90a0b9", + "urn": "urn:li:container:ef71eb2867381f1f215505577b90a0b9" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1740387600000, + "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:druid,my_druid.my_druid.supervisors,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:ef71eb2867381f1f215505577b90a0b9" + } + }, + "systemMetadata": { + "lastObserved": 1740387600000, + "runId": 
"druid-2025_02_24-09_00_00-p8a5zq", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:druid,my_druid.my_druid.supervisors,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": {}, + "name": "supervisors", + "description": "", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "my_druid.supervisors", + "platform": "urn:li:dataPlatform:druid", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "supervisor_id", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "STRING", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "state", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "STRING", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "detailed_state", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "STRING", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "healthy", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "LONG", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "type", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "STRING", + "recursive": false, 
+ "isPartOfKey": false + }, + { + "fieldPath": "source", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "STRING", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "suspended", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "LONG", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "spec", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "STRING", + "recursive": false, + "isPartOfKey": false + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1740387600000, + "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:druid,my_druid.my_druid.supervisors,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:druid", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:druid,my_druid)" + } + }, + "systemMetadata": { + "lastObserved": 1740387600000, + "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:druid,my_druid.my_druid.supervisors,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } + }, + "systemMetadata": { + "lastObserved": 1740387600000, + "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:druid,my_druid.my_druid.supervisors,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": 
"urn:li:dataPlatformInstance:(urn:li:dataPlatform:druid,my_druid)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:druid,my_druid)" + }, + { + "id": "urn:li:container:60894d7dfe5dd8cc58b7fef9e08a360b", + "urn": "urn:li:container:60894d7dfe5dd8cc58b7fef9e08a360b" + }, + { + "id": "urn:li:container:ef71eb2867381f1f215505577b90a0b9", + "urn": "urn:li:container:ef71eb2867381f1f215505577b90a0b9" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1740387600000, + "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:druid,my_druid.my_druid.tasks,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:ef71eb2867381f1f215505577b90a0b9" + } + }, + "systemMetadata": { + "lastObserved": 1740387600000, + "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:druid,my_druid.my_druid.tasks,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": {}, + "name": "tasks", + "description": "", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "my_druid.tasks", + "platform": "urn:li:dataPlatform:druid", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "task_id", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": 
"STRING", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "group_id", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "STRING", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "type", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "STRING", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "datasource", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "STRING", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "created_time", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "STRING", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "queue_insertion_time", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "STRING", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "status", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "STRING", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "runner_status", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "STRING", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "duration", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "LONG", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "location", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "STRING", + "recursive": false, + 
"isPartOfKey": false + }, + { + "fieldPath": "host", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "STRING", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "plaintext_port", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "LONG", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "tls_port", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "LONG", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "error_msg", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "STRING", + "recursive": false, + "isPartOfKey": false + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1740387600000, + "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:druid,my_druid.my_druid.tasks,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:druid", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:druid,my_druid)" + } + }, + "systemMetadata": { + "lastObserved": 1740387600000, + "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:druid,my_druid.my_druid.tasks,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } + }, + "systemMetadata": { + "lastObserved": 1740387600000, + "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:druid,my_druid.my_druid.tasks,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:druid,my_druid)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:druid,my_druid)" + }, + { + "id": "urn:li:container:60894d7dfe5dd8cc58b7fef9e08a360b", + "urn": "urn:li:container:60894d7dfe5dd8cc58b7fef9e08a360b" + }, + { + "id": "urn:li:container:ef71eb2867381f1f215505577b90a0b9", + "urn": "urn:li:container:ef71eb2867381f1f215505577b90a0b9" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1740387600000, + "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "lastRunId": "no-run-id-provided" + } +} +] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/druid/recipes/druid_to_file.yml b/metadata-ingestion/tests/integration/druid/recipes/druid_to_file.yml new file mode 100644 index 0000000000000..bfb5577093596 --- /dev/null +++ b/metadata-ingestion/tests/integration/druid/recipes/druid_to_file.yml @@ -0,0 +1,11 @@ +source: + type: druid + config: + host_port: "localhost:8082" + platform_instance: "my_druid" + username: druid + password: FoolishPassword +sink: + type: file + config: + filename: "./druid_mces.json" diff --git a/metadata-ingestion/tests/integration/druid/test_druid.py b/metadata-ingestion/tests/integration/druid/test_druid.py new file mode 100644 index 0000000000000..909aa11395b70 --- /dev/null +++ b/metadata-ingestion/tests/integration/druid/test_druid.py @@ -0,0 +1,45 @@ +import pathlib + +import pytest +from freezegun import freeze_time + +from tests.test_helpers import mce_helpers +from tests.test_helpers.click_helpers import run_datahub_cmd +from tests.test_helpers.docker_helpers import wait_for_port + +FROZEN_TIME = "2025-02-24 09:00:00" +TESTS_DIR = pathlib.Path(__file__).parent +GOLDEN_FILES_DIR = TESTS_DIR / "golden" +DOCKER_DIR = TESTS_DIR / "docker" +RECIPES_DIR = TESTS_DIR 
/ "recipes" + + +@pytest.fixture(scope="module") +def druid_up(docker_compose_runner): + with docker_compose_runner( + DOCKER_DIR / "docker-compose.yml", "druid" + ) as docker_services: + wait_for_port(docker_services, "coordinator", 8081, timeout=120) + wait_for_port(docker_services, "broker", 8082, timeout=120) + yield + + +@freeze_time(FROZEN_TIME) +@pytest.mark.integration +def test_druid_ingest( + pytestconfig, + druid_up, + tmp_path, +): + config_file = (RECIPES_DIR / "druid_to_file.yml").resolve() + output_path = tmp_path / "druid_mces.json" + + run_datahub_cmd(["ingest", "-c", f"{config_file}"], tmp_path=tmp_path) + + # Verify the output + mce_helpers.check_golden_file( + pytestconfig, + output_path=output_path, + golden_path=GOLDEN_FILES_DIR / "druid_mces.json", + ignore_paths=[], + ) From 9ca9794af011eeddcc5366bf69e7651674e3c151 Mon Sep 17 00:00:00 2001 From: Aseem Bansal Date: Tue, 25 Feb 2025 14:05:32 +0530 Subject: [PATCH 11/45] feat(api): let admins use granted privileges for actors (#12718) --- .../graphql/resolvers/policy/GetGrantedPrivilegesResolver.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/policy/GetGrantedPrivilegesResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/policy/GetGrantedPrivilegesResolver.java index a9097fa68a07d..0f051b93d9af1 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/policy/GetGrantedPrivilegesResolver.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/policy/GetGrantedPrivilegesResolver.java @@ -57,6 +57,6 @@ public CompletableFuture get(final DataFetchingEnvironment environme } private boolean isAuthorized(final QueryContext context, final String actor) { - return actor.equals(context.getActorUrn()); + return PolicyAuthUtils.canManagePolicies(context) || actor.equals(context.getActorUrn()); } } From 
f18da511c1f016c3163900127a9f36bd39ad9544 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Tue, 25 Feb 2025 00:52:17 -0800 Subject: [PATCH 12/45] feat(build): use `pull_request_target` for datahub-wheels (#12722) --- .github/workflows/python-build-pages.yml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/workflows/python-build-pages.yml b/.github/workflows/python-build-pages.yml index 9e1a256347803..7d31f2b063421 100644 --- a/.github/workflows/python-build-pages.yml +++ b/.github/workflows/python-build-pages.yml @@ -8,7 +8,7 @@ on: - "metadata-ingestion/**" - "metadata-ingestion-modules/**" - "metadata-models/**" - pull_request: + pull_request_target: branches: - "**" paths: @@ -38,7 +38,11 @@ jobs: distribution: "zulu" java-version: 17 - uses: gradle/actions/setup-gradle@v4 - - uses: acryldata/sane-checkout-action@v3 + - uses: actions/checkout@v4 + # Note: not using acryldata/sane-checkout-action because this is a + # pull_request_target event, and hence requires `ref`. 
+ with: + ref: ${{ github.event.pull_request.base.sha }} - uses: actions/setup-python@v5 with: python-version: "3.10" From c3ca73e30898cfe7e5333527d3f1d268ed622263 Mon Sep 17 00:00:00 2001 From: Kevin Karch Date: Tue, 25 Feb 2025 08:22:31 -0500 Subject: [PATCH 13/45] feat(ui): access management docs (#12719) Co-authored-by: Shirshanka Das --- docs-website/sidebars.js | 5 + .../feature-guides/access-management.md | 223 ++++++++++++++++++ 2 files changed, 228 insertions(+) create mode 100644 docs/features/feature-guides/access-management.md diff --git a/docs-website/sidebars.js b/docs-website/sidebars.js index dd81f114725b8..b0289ec56b090 100644 --- a/docs-website/sidebars.js +++ b/docs-website/sidebars.js @@ -92,6 +92,11 @@ module.exports = { }, ], }, + { + label: "Access Management", + type: "doc", + id: "docs/features/feature-guides/access-management", + }, { label: "Automations", type: "category", diff --git a/docs/features/feature-guides/access-management.md b/docs/features/feature-guides/access-management.md new file mode 100644 index 0000000000000..e981fae6f019e --- /dev/null +++ b/docs/features/feature-guides/access-management.md @@ -0,0 +1,223 @@ +import FeatureAvailability from '@site/src/components/FeatureAvailability'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# Access Management + + + +## Introduction + +DataHub's Access Management feature allows you to associate external roles from your source systems with your data assets in DataHub. This creates a unified view of access control across your data ecosystem, helping data consumers: + +1. **Discover available access** - Find what roles are already provisioned for them across different data platforms +2. **Request appropriate access** - Easily identify and request to join the appropriate role for the access they need +3. 
**Simplify governance** - Streamline the access management process by centralizing role information in DataHub + +By integrating your external roles into DataHub, teams can reduce access request friction and ensure users have the right level of access to the data they need. + +## Configuration + +### Self-hosted DataHub + +For self-hosted DataHub deployments, the Access Management feature is *disabled* by default. To enable it, +simply set the `SHOW_ACCESS_MANAGEMENT` environment variable for the `datahub-gms` service container +to `true`. For example in your `docker/datahub-gms/docker.env`, you'd configure: + +``` +SHOW_ACCESS_MANAGEMENT=true +``` + +### DataHub Cloud + +If you're using DataHub Cloud (managed by Acryl), enabling the Access Management feature just requires contacting your Acryl Customer Success representative. They can enable this feature for your environment without any configuration changes on your part. + +## UI Location +Under a dataset, the new tab "Access Management" should appear if configured correctly. + +

+ +

+ +## Data Model +Access management introduces a new entity in DataHub's metadata model called a Role. +A Role is comprised of: + +* A unique key (URN) +* Properties of the role (name, description, type, request URL) +* A list of users that have been provisioned the role + +This role must then be associated with datasets through a new aspect called access. + +:::note Important Note +Currently, only Dataset entities support Access Management. +::: + +:::caution Do not confuse role with datahubrole +The "role" entity refers to an external role definition that exists in your source systems (like Snowflake or BigQuery), while "datahubrole" is for the management of privileges within DataHub itself (i.e., the admin role can accept proposed metadata changes). +::: + +## Managing Access Through DataHub + +You can set up Access Management through either the CLI or Python API. Here's how to complete the three main steps: + +### Creating External Roles + + + + +```bash +datahub put --urn "urn:li:role:reader" --aspect roleProperties -d - <<-EOF +{ + "name": "Snowflake Reader Role", + "description": "Description for Snowflake Reader Role", + "type": "READ", + "requestUrl": "http://custom-url-for-redirection.com" +} +EOF +``` + + + + +```python +import datahub.emitter.mce_builder as builder +from datahub.emitter.rest_emitter import DatahubRestEmitter +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.metadata.schema_classes import RolePropertiesClass, ChangeTypeClass + +# Create a role properties aspect +role_properties = RolePropertiesClass( + name="Snowflake Reader Role", + description="Description for Snowflake Reader Role", + type="READ", + requestUrl="http://custom-url-for-redirection.com" +) + +# Create a metadata change proposal +mcp = MetadataChangeProposalWrapper( + changeType=ChangeTypeClass.UPSERT, + entityUrn="urn:li:role:reader", + aspectName="roleProperties", + aspect=role_properties +) + +# Emit the metadata +emitter = 
DatahubRestEmitter(gms_server="http://localhost:8080") +emitter.emit(mcp) +``` + + + + +### Assigning Users to Roles (Optional) + + + + +```bash +datahub put --urn "urn:li:role:reader" --aspect actors -d - <<-EOF +{ + "users": [ + {"user": "urn:li:corpuser:datahubuser"} + ] +} +EOF +``` + + + + +```python +from datahub.metadata.schema_classes import ActorsClass, ActorClass + +# Create an actors aspect +actors = ActorsClass( + users=[ + ActorClass(user="urn:li:corpuser:datahubuser") + ] +) + +# Create a metadata change proposal +mcp = MetadataChangeProposalWrapper( + changeType=ChangeTypeClass.UPSERT, + entityUrn="urn:li:role:reader", + aspectName="actors", + aspect=actors +) + +# Emit the metadata +emitter.emit(mcp) +``` + + + + +### Assigning Roles to Datasets + + + + +```bash +datahub put --urn "urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_created,PROD)" --aspect access -d - <<-EOF +{ + "roles": [ + {"urn": "urn:li:role:reader"}, + {"urn": "urn:li:role:writer"} + ] +} +EOF +``` + + + + +```python +from datahub.metadata.schema_classes import AccessClass, RoleAssociationClass + +dataset_urn = "urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_created,PROD)" + +# Create an access aspect with multiple roles +access_aspect = AccessClass( + roles=[ + RoleAssociationClass(urn="urn:li:role:reader"), + RoleAssociationClass(urn="urn:li:role:writer") + ] +) + +# Create a metadata change proposal +mcp = MetadataChangeProposalWrapper( + changeType=ChangeTypeClass.UPSERT, + entityUrn=dataset_urn, + aspectName="access", + aspect=access_aspect +) + +# Emit the metadata +emitter.emit(mcp) +``` + + + + +## Use Cases + +Here are some common scenarios where integrating external roles into DataHub is valuable: + +1. **Unified Access View** - Data engineers can see all users with access to sensitive data across multiple platforms from a single interface +2. 
**Self-Service Access Requests** - Analysts can discover what roles they need to access specific datasets and request them directly from DataHub +3. **Access Auditing** - Compliance teams can review who has access to which datasets through which roles +4. **Onboarding Acceleration** - New team members can quickly discover what access they need for their role + +## Demo and Examples + +To see Access Management in action, check out our [DataHub Townhall demo](https://youtu.be/mXsn33tALCA?t=1333) where we showcase how to use this feature in a real-world scenario. + +## What's Next for Access Management + +Future enhancements planned for Access Management include: + +* Modeling external policies in addition to just roles +* Automatically extracting roles/policies from sources like BigQuery, Snowflake, etc. +* Extending support to more entity types beyond datasets +* Advanced access request workflows with approvals \ No newline at end of file From 653f9ef21fb0411e914e3451b1ad0a8a7777f994 Mon Sep 17 00:00:00 2001 From: Aseem Bansal Date: Tue, 25 Feb 2025 19:24:36 +0530 Subject: [PATCH 14/45] fix(lineage): error message for edit lineage (#12724) --- .../graphql/resolvers/lineage/UpdateLineageResolver.java | 7 +++---- .../src/app/lineage/manage/ManageLineageModal.tsx | 4 ++-- .../src/app/lineageV2/manualLineage/ManageLineageModal.tsx | 5 ++--- docker/datahub-gms/Dockerfile | 2 +- docs/features/feature-guides/ui-lineage.md | 2 ++ .../java/com/datahub/authorization/DataHubAuthorizer.java | 2 +- .../main/java/com/datahub/authorization/PolicyEngine.java | 3 +-- .../main/java/com/datahub/authorization/PolicyFetcher.java | 2 +- 8 files changed, 13 insertions(+), 14 deletions(-) diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/lineage/UpdateLineageResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/lineage/UpdateLineageResolver.java index 928e33d44c84e..e8b227f1327ab 100644 --- 
a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/lineage/UpdateLineageResolver.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/lineage/UpdateLineageResolver.java @@ -222,16 +222,15 @@ private void checkLineageEdgePrivileges( if (!isAuthorized(context, upstreamUrn, editLineagePrivileges)) { throw new AuthorizationException( String.format( - "Unauthorized to edit %s lineage. Please contact your DataHub administrator.", - upstreamUrn.getEntityType())); + "Unauthorized to edit %s lineage for %s", upstreamUrn, upstreamUrn.getEntityType())); } Urn downstreamUrn = UrnUtils.getUrn(lineageEdge.getDownstreamUrn()); if (!isAuthorized(context, downstreamUrn, editLineagePrivileges)) { throw new AuthorizationException( String.format( - "Unauthorized to edit %s lineage. Please contact your DataHub administrator.", - downstreamUrn.getEntityType())); + "Unauthorized to edit %s lineage for %s", + downstreamUrn, downstreamUrn.getEntityType())); } } diff --git a/datahub-web-react/src/app/lineage/manage/ManageLineageModal.tsx b/datahub-web-react/src/app/lineage/manage/ManageLineageModal.tsx index ed792724ebedf..5648838bb5e0d 100644 --- a/datahub-web-react/src/app/lineage/manage/ManageLineageModal.tsx +++ b/datahub-web-react/src/app/lineage/manage/ManageLineageModal.tsx @@ -107,8 +107,8 @@ export default function ManageLineageModal({ }); } }) - .catch(() => { - message.error('Error updating lineage'); + .catch((error) => { + message.error(error.message || 'Error updating lineage'); }); } diff --git a/datahub-web-react/src/app/lineageV2/manualLineage/ManageLineageModal.tsx b/datahub-web-react/src/app/lineageV2/manualLineage/ManageLineageModal.tsx index 9f70de2b6f675..8cd8b7cb7a65e 100644 --- a/datahub-web-react/src/app/lineageV2/manualLineage/ManageLineageModal.tsx +++ b/datahub-web-react/src/app/lineageV2/manualLineage/ManageLineageModal.tsx @@ -85,9 +85,8 @@ export default function ManageLineageModal({ node, direction, 
closeModal, refetc }); } }) - .catch((e) => { - message.error('Error updating lineage'); - console.warn(e); + .catch((error) => { + message.error(error.message || 'Error updating lineage'); }); } diff --git a/docker/datahub-gms/Dockerfile b/docker/datahub-gms/Dockerfile index 232802a6bad8b..c5e00aa40e471 100644 --- a/docker/datahub-gms/Dockerfile +++ b/docker/datahub-gms/Dockerfile @@ -49,11 +49,11 @@ COPY --from=binary /go/bin/dockerize /usr/local/bin ENV LD_LIBRARY_PATH="/lib:/lib64" FROM base AS prod-install -COPY war.war /datahub/datahub-gms/bin/war.war COPY metadata-models/src/main/resources/entity-registry.yml /datahub/datahub-gms/resources/entity-registry.yml COPY docker/datahub-gms/start.sh /datahub/datahub-gms/scripts/start.sh COPY docker/monitoring/client-prometheus-config.yaml /datahub/datahub-gms/scripts/prometheus-config.yaml RUN chmod +x /datahub/datahub-gms/scripts/start.sh +COPY war.war /datahub/datahub-gms/bin/war.war FROM base AS dev-install # Dummy stage for development. Assumes code is built on your machine and mounted to this image. diff --git a/docs/features/feature-guides/ui-lineage.md b/docs/features/feature-guides/ui-lineage.md index a91046e7ec5e7..d137424b3896b 100644 --- a/docs/features/feature-guides/ui-lineage.md +++ b/docs/features/feature-guides/ui-lineage.md @@ -5,6 +5,8 @@ The UI shows the latest version of the data lineage. The time picker can be used ## Editing from Lineage Graph View +Ensure that you have the `Edit lineage` privilege on both the upstream and downstream entities before you try to add upstream or downstream lineage. + The first place that you can edit data lineage for entities is from the Lineage Visualization screen. Click on the "Lineage" button on the top right of an entity's profile to get to this view.

diff --git a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/DataHubAuthorizer.java b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/DataHubAuthorizer.java index 54f5c6de37d9f..29ec33ff646c4 100644 --- a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/DataHubAuthorizer.java +++ b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/DataHubAuthorizer.java @@ -367,7 +367,7 @@ public void run() { writeLock.unlock(); } - log.debug(String.format("Successfully fetched %s policies.", total)); + log.debug("Successfully fetched {} policies.", total); } catch (Exception e) { log.error( "Caught exception while loading Policy cache. Will retry on next scheduled attempt.", diff --git a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/PolicyEngine.java b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/PolicyEngine.java index e1d2e20de2157..72c33601132eb 100644 --- a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/PolicyEngine.java +++ b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/PolicyEngine.java @@ -191,11 +191,10 @@ private boolean isResourceMatch( return true; } if (policyResourceFilter == null) { - // No resource defined on the policy. + log.debug("No resource defined on the policy."); return true; } if (requestResource.isEmpty()) { - // Resource filter present in policy, but no resource spec provided. 
log.debug("Resource filter present in policy, but no resource spec provided."); return false; } diff --git a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/PolicyFetcher.java b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/PolicyFetcher.java index a2b464f819419..6c281959feb70 100644 --- a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/PolicyFetcher.java +++ b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/PolicyFetcher.java @@ -81,7 +81,7 @@ public PolicyFetchResult fetchPolicies( public PolicyFetchResult fetchPolicies( OperationContext opContext, String query, int count, @Nullable String scrollId, Filter filter) throws RemoteInvocationException, URISyntaxException { - log.debug(String.format("Batch fetching policies. count: %s, scroll: %s", count, scrollId)); + log.debug("Batch fetching policies. count: {}, scroll: {}", count, scrollId); // First fetch all policy urns ScrollResult result = From ffe2278d4d0dee694bab9950d84b371dd18afdbe Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Tue, 25 Feb 2025 10:44:54 -0800 Subject: [PATCH 15/45] docs: clarify limits on AI docs (#12728) --- docs/automations/ai-docs.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/automations/ai-docs.md b/docs/automations/ai-docs.md index bbec33f3bcae6..decdca9561c55 100644 --- a/docs/automations/ai-docs.md +++ b/docs/automations/ai-docs.md @@ -33,4 +33,5 @@ Data privacy: Your metadata is not sent to any third-party LLMs. We use AWS Bedr ## Limitations +- This feature is not available on tables with more than 100 columns. We are working on expanding this limit. - This feature is powered by an LLM, which can produce inaccurate results. While we've taken steps to reduce the likelihood of hallucinations, they can still occur. 
From 2396f0e23d1e79704291c84b602a802b02d69db0 Mon Sep 17 00:00:00 2001 From: david-leifker <114954101+david-leifker@users.noreply.github.com> Date: Tue, 25 Feb 2025 15:57:17 -0600 Subject: [PATCH 16/45] fix(urn-validation): additional test cases for urn validation (#12727) --- .../tests/unit/urns/invalid_urns.txt | 8 + .../unit/urns/invalid_urns_java_only.txt | 4 + .../tests/unit/urns/valid_urns.txt | 17 +- .../metadata/utils/UrnValidationUtil.java | 122 ++++---- .../metadata/utils/UrnValidationUtilTest.java | 271 ++++++++++-------- 5 files changed, 233 insertions(+), 189 deletions(-) create mode 100644 metadata-ingestion/tests/unit/urns/invalid_urns_java_only.txt diff --git a/metadata-ingestion/tests/unit/urns/invalid_urns.txt b/metadata-ingestion/tests/unit/urns/invalid_urns.txt index 9ce2c99a1a4ee..2003fabd652ca 100644 --- a/metadata-ingestion/tests/unit/urns/invalid_urns.txt +++ b/metadata-ingestion/tests/unit/urns/invalid_urns.txt @@ -8,6 +8,14 @@ urn:li:corpuser:abc) # Reserved characters urn:li:corpuser:foo␟bar urn:li:tag:a,b,c +urn:li:corpuser:, +urn:li:dataset:(urn:li:dataPlatform:hdfs␟path,PROD) +urn:li:dashboard:(looker,dashboards,thelook) +urn:li:dataset:(urn:li:dataPlatform:hdfs,/path/to/data,()) +urn:li:dataset:(urn:li:dataPlatform:hdfs,(illegal),PROD) +urn:li:corpuser:(foo)123 +urn:li:dataJob:(urn:li:dataFlow:(mssql,1/2/3/4.c_n on %28LOCAL%29,PROD),1/2/3/4.c_n on (LOCAL)) +urn:li:dataJob:(urn:li:dataFlow:(mssql,123.%28TEST%29,PROD),(TEST)) # CorpUser URN tests urn:li:corpuser:(part1,part2) diff --git a/metadata-ingestion/tests/unit/urns/invalid_urns_java_only.txt b/metadata-ingestion/tests/unit/urns/invalid_urns_java_only.txt new file mode 100644 index 0000000000000..46a01c4d08679 --- /dev/null +++ b/metadata-ingestion/tests/unit/urns/invalid_urns_java_only.txt @@ -0,0 +1,4 @@ +# Not yet handled by python + +# Invalid use of colon +urn:li:dataset:(urn:li:dataPlatform:abc:def,table_name,prod) \ No newline at end of file diff --git 
a/metadata-ingestion/tests/unit/urns/valid_urns.txt b/metadata-ingestion/tests/unit/urns/valid_urns.txt index 23205ec9a7235..9b44c0d5e1bc4 100644 --- a/metadata-ingestion/tests/unit/urns/valid_urns.txt +++ b/metadata-ingestion/tests/unit/urns/valid_urns.txt @@ -1,4 +1,4 @@ -# Unknown entity types become generic urns +# Unknown entity types become generic urns (does not apply to Java) urn:li:abc:foo urn:li:abc:(foo,bar) urn:li:abc:(urn:li:dataPlatform:abc,def,prod) @@ -6,8 +6,8 @@ urn:li:abc:(urn:li:dataPlatform:abc,def,prod) # A bunch of pretty normal urns urn:li:corpuser:foo urn:li:corpGroup:bar -urn:li:dataset:(urn:li:dataPlatform:abc,def/ghi,prod) -urn:li:dataFlow:(airflow,def,prod) +urn:li:dataset:(urn:li:dataPlatform:abc,def/ghi,PROD) +urn:li:dataFlow:(airflow,def,PROD) urn:li:dataJob:(urn:li:dataFlow:(airflow,flow_id,prod),job_id) urn:li:tag:abc urn:li:chart:(looker,chart_name) @@ -22,3 +22,14 @@ urn:li:tag:: urn:li:dashboard:(looker,dashboards.thelook::customer_lookup) urn:li:dataPlatform:abc:def urn:li:corpuser:foo:bar@example.com + +# From java test cases +urn:li:corpuser:foo:bar +urn:li:dataset:(urn:li:dataPlatform:hdfs,/path/to/data,PROD) +urn:li:dataPlatform:abc:def +urn:li:dataset:(urn:li:dataPlatform:s3,urn:li:dataset:%28urn:li:dataPlatform:s3%2Ctest-datalake-concepts/prog_maintenance%2CPROD%29,PROD) +urn:li:dataset:(urn:li:dataPlatform:bigquery,myproject.dataset.table,PROD) +urn:li:assertion:123=-%28__% weekly__%29 +urn:li:dataset:(urn:li:dataPlatform:s3,urn:li:dataset:%28urn:li:dataPlatform:s3%2Ctest-datalake-concepts%prog_maintenance%2CPROD%29,PROD) +urn:li:dataJob:(urn:li:dataFlow:(mssql,123.%28TEST%29,PROD),%28TEST%29) +urn:li:dashboard:(looker,dashboards.thelook::cohort_data_tool) diff --git a/metadata-utils/src/main/java/com/linkedin/metadata/utils/UrnValidationUtil.java b/metadata-utils/src/main/java/com/linkedin/metadata/utils/UrnValidationUtil.java index f5bbf3e093952..5f7503977c9c4 100644 --- 
a/metadata-utils/src/main/java/com/linkedin/metadata/utils/UrnValidationUtil.java +++ b/metadata-utils/src/main/java/com/linkedin/metadata/utils/UrnValidationUtil.java @@ -1,7 +1,6 @@ package com.linkedin.metadata.utils; import com.linkedin.common.urn.Urn; -import com.linkedin.common.urn.UrnUtils; import com.linkedin.data.DataList; import com.linkedin.data.DataMap; import com.linkedin.metadata.Constants; @@ -11,9 +10,7 @@ import com.linkedin.metadata.models.annotation.UrnValidationAnnotation; import com.linkedin.metadata.models.registry.EntityRegistry; import java.net.URISyntaxException; -import java.net.URLDecoder; import java.net.URLEncoder; -import java.nio.charset.StandardCharsets; import java.util.Collection; import java.util.HashMap; import java.util.HashSet; @@ -32,8 +29,9 @@ public class UrnValidationUtil { // Related to BrowsePathv2 public static final String URN_DELIMITER_SEPARATOR = "␟"; // https://datahubproject.io/docs/what/urn/#restrictions - public static final Set ILLEGAL_URN_COMPONENT_CHARACTERS = Set.of("(", ")"); - public static final Set ILLEGAL_URN_TUPLE_CHARACTERS = Set.of(","); + public static final Set ILLEGAL_URN_CHARACTERS_PARENTHESES = Set.of("(", ")"); + // Commas are used as delimiters in tuple URNs, but not allowed in URN components + public static final Set ILLEGAL_URN_COMPONENT_DELIMITER = Set.of(","); private UrnValidationUtil() {} @@ -66,17 +64,29 @@ public static void validateUrn( "Error: URN cannot contain " + URN_DELIMITER_SEPARATOR + " character"); } - int totalParts = urn.getEntityKey().getParts().size(); - List illegalComponents = - urn.getEntityKey().getParts().stream() - .flatMap(part -> processUrnPartRecursively(part, totalParts)) - .collect(Collectors.toList()); + // Check if this is a simple (non-tuple) URN containing commas + if (urn.getEntityKey().getParts().size() == 1) { + String part = urn.getEntityKey().getParts().get(0); + if (part.contains(",")) { + if (strict) { + throw new IllegalArgumentException( + 
String.format( + "Simple URN %s contains comma character which is not allowed in non-tuple URNs", + urn)); + } else { + log.error( + "Simple URN {} contains comma character which is not allowed in non-tuple URNs", urn); + } + } + } + + // Validate all the URN parts + List illegalComponents = validateUrnComponents(urn); if (!illegalComponents.isEmpty()) { String message = String.format( - "Illegal `%s` characters detected in URN %s component(s): %s", - ILLEGAL_URN_COMPONENT_CHARACTERS, urn, illegalComponents); + "Illegal characters detected in URN %s component(s): %s", urn, illegalComponents); if (strict) { throw new IllegalArgumentException(message); @@ -92,25 +102,40 @@ public static void validateUrn( } } - /** Recursively process URN parts with URL decoding */ + /** Validates all components of a URN and returns a list of any illegal components. */ + private static List validateUrnComponents(Urn urn) { + int totalParts = urn.getEntityKey().getParts().size(); + + return urn.getEntityKey().getParts().stream() + .flatMap(part -> processUrnPartRecursively(part, totalParts)) + .collect(Collectors.toList()); + } + + /** Recursively process URN parts with URL decoding and check for illegal characters */ private static Stream processUrnPartRecursively(String urnPart, int totalParts) { - String decodedPart = - URLDecoder.decode(URLEncodingFixer.fixURLEncoding(urnPart), StandardCharsets.UTF_8); - if (decodedPart.startsWith("urn:li:")) { - // Recursively process nested URN after decoding - int nestedParts = UrnUtils.getUrn(decodedPart).getEntityKey().getParts().size(); - return UrnUtils.getUrn(decodedPart).getEntityKey().getParts().stream() - .flatMap(part -> processUrnPartRecursively(part, nestedParts)); + // If this part is a nested URN, don't check it directly for illegal characters + if (urnPart.startsWith("urn:li:")) { + return Stream.empty(); } + + // If this part has encoded parentheses, consider it valid + if (urnPart.contains("%28") || urnPart.contains("%29")) { + 
return Stream.empty(); + } + + // Check for unencoded parentheses in any part + if (ILLEGAL_URN_CHARACTERS_PARENTHESES.stream().anyMatch(c -> urnPart.contains(c))) { + return Stream.of(urnPart); + } + + // For tuple parts (URNs with multiple components), check for illegal commas within components if (totalParts > 1) { - if (ILLEGAL_URN_TUPLE_CHARACTERS.stream().anyMatch(c -> urnPart.contains(c))) { + if (ILLEGAL_URN_COMPONENT_DELIMITER.stream().anyMatch(c -> urnPart.contains(c))) { return Stream.of(urnPart); } } - if (ILLEGAL_URN_COMPONENT_CHARACTERS.stream().anyMatch(c -> urnPart.contains(c))) { - return Stream.of(urnPart); - } + // If we reach here, the part is valid return Stream.empty(); } @@ -207,53 +232,4 @@ public static class UrnValidationEntry { String urn; UrnValidationAnnotation annotation; } - - /** - * Fixes malformed URL encoding by escaping unescaped % characters while preserving valid - * percent-encoded sequences. - */ - private static class URLEncodingFixer { - /** - * @param input The potentially malformed URL-encoded string - * @return A string with proper URL encoding that can be safely decoded - */ - public static String fixURLEncoding(String input) { - if (input == null) { - return null; - } - - StringBuilder result = new StringBuilder(input.length() * 2); - int i = 0; - - while (i < input.length()) { - char currentChar = input.charAt(i); - - if (currentChar == '%') { - if (i + 2 < input.length()) { - // Check if the next two characters form a valid hex pair - String hexPair = input.substring(i + 1, i + 3); - if (isValidHexPair(hexPair)) { - // This is a valid percent-encoded sequence, keep it as is - result.append(currentChar); - } else { - // Invalid sequence, escape the % character - result.append("%25"); - } - } else { - // % at the end of string, escape it - result.append("%25"); - } - } else { - result.append(currentChar); - } - i++; - } - - return result.toString(); - } - - private static boolean isValidHexPair(String pair) { - return 
pair.matches("[0-9A-Fa-f]{2}"); - } - } } diff --git a/metadata-utils/src/test/java/com/linkedin/metadata/utils/UrnValidationUtilTest.java b/metadata-utils/src/test/java/com/linkedin/metadata/utils/UrnValidationUtilTest.java index da3832b8580ff..65dd7c2ae00ac 100644 --- a/metadata-utils/src/test/java/com/linkedin/metadata/utils/UrnValidationUtilTest.java +++ b/metadata-utils/src/test/java/com/linkedin/metadata/utils/UrnValidationUtilTest.java @@ -4,66 +4,23 @@ import com.linkedin.common.urn.UrnUtils; import com.linkedin.metadata.models.registry.EntityRegistry; import io.datahubproject.test.metadata.context.TestOperationContexts; +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; import java.net.URISyntaxException; +import java.util.ArrayList; +import java.util.List; +import java.util.Set; +import javax.annotation.Nonnull; +import org.testng.Assert; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; public class UrnValidationUtilTest { private static final EntityRegistry entityRegistry = TestOperationContexts.defaultEntityRegistry(); - @Test - public void testValidateDatasetUrn() { - Urn validUrn = UrnUtils.getUrn("urn:li:dataset:(urn:li:dataPlatform:hdfs,/path/to/data,PROD)"); - UrnValidationUtil.validateUrn(entityRegistry, validUrn, true); - // If no exception is thrown, test passes - } - - @Test - public void testSimpleUrnColon() { - UrnValidationUtil.validateUrn(entityRegistry, UrnUtils.getUrn("urn:li:corpuser:foo:bar"), true); - UrnValidationUtil.validateUrn( - entityRegistry, UrnUtils.getUrn("urn:li:dataPlatform:abc:def"), true); - UrnValidationUtil.validateUrn( - entityRegistry, UrnUtils.getUrn("urn:li:corpuser:foo:bar@example.com"), true); - // If no exception is thrown, test passes - } - - @Test - public void testSimpleUrnComma() { - UrnValidationUtil.validateUrn(entityRegistry, UrnUtils.getUrn("urn:li:corpuser:,"), true); - // If no exception is thrown, test passes - 
} - - @Test(expectedExceptions = IllegalArgumentException.class) - public void testTupleUrnComma() { - UrnValidationUtil.validateUrn( - entityRegistry, UrnUtils.getUrn("urn:li:dashboard:(looker,dashboards,thelook)"), true); - } - - @Test(expectedExceptions = IllegalArgumentException.class) - public void testFabricTypeCasing() { - // prod != PROD - UrnValidationUtil.validateUrn( - entityRegistry, - UrnUtils.getUrn("urn:li:dataset:(urn:li:dataPlatform:abc:def,table_name,prod)"), - true); - } - - @Test - public void testComplexUrnColon() throws URISyntaxException { - Urn validUrn = - Urn.createFromString( - "urn:li:dataset:(urn:li:dataPlatform:s3,urn:li:dataset:%28urn:li:dataPlatform:s3%2Ctest-datalake-concepts/prog_maintenance%2CPROD%29,PROD)"); - UrnValidationUtil.validateUrn(entityRegistry, validUrn, true); - // If no exception is thrown, test passes - } - - @Test(expectedExceptions = IllegalArgumentException.class) - public void testFabricTypeParen() { - Urn invalidUrn = UrnUtils.getUrn("urn:li:dataset:(urn:li:dataPlatform:hdfs,/path/to/data,())"); - UrnValidationUtil.validateUrn(entityRegistry, invalidUrn, true); - } - @Test(expectedExceptions = IllegalArgumentException.class) public void testUrnWithTrailingWhitespace() { Urn invalidUrn = @@ -71,32 +28,6 @@ public void testUrnWithTrailingWhitespace() { UrnValidationUtil.validateUrn(entityRegistry, invalidUrn, true); } - @Test(expectedExceptions = IllegalArgumentException.class) - public void testUrnWithIllegalDelimiter() { - Urn invalidUrn = UrnUtils.getUrn("urn:li:dataset:(urn:li:dataPlatform:hdfs␟path,PROD)"); - UrnValidationUtil.validateUrn(entityRegistry, invalidUrn, true); - } - - @Test(expectedExceptions = IllegalArgumentException.class) - public void testComplexUrnWithParens1() { - Urn invalidUrn = UrnUtils.getUrn("urn:li:dataset:(urn:li:dataPlatform:hdfs,(illegal),PROD)"); - UrnValidationUtil.validateUrn(entityRegistry, invalidUrn, true); - } - - @Test(expectedExceptions = IllegalArgumentException.class) 
- public void testComplexUrnWithParens2() { - Urn invalidUrn = - UrnUtils.getUrn( - "urn:li:dataJob:(urn:li:dataFlow:(mssql,1/2/3/4.c_n on %28LOCAL%29,PROD),1/2/3/4.c_n on (LOCAL))"); - UrnValidationUtil.validateUrn(entityRegistry, invalidUrn, true); - } - - @Test(expectedExceptions = IllegalArgumentException.class) - public void testSimpleUrnWithParens() { - Urn invalidUrn = UrnUtils.getUrn("urn:li:corpuser:(foo)123"); - UrnValidationUtil.validateUrn(entityRegistry, invalidUrn, true); - } - @Test(expectedExceptions = IllegalArgumentException.class) public void testExcessiveLength() { StringBuilder longPath = new StringBuilder("urn:li:dataset:(urn:li:dataPlatform:hdfs,"); @@ -110,52 +41,166 @@ public void testExcessiveLength() { UrnValidationUtil.validateUrn(entityRegistry, invalidUrn, true); } - @Test - public void testValidComplexUrn() { - Urn validUrn = - UrnUtils.getUrn( - "urn:li:dataset:(urn:li:dataPlatform:bigquery,myproject.dataset.table,PROD)"); - - UrnValidationUtil.validateUrn(entityRegistry, validUrn, true); - // If no exception is thrown, test passes - } - @Test(expectedExceptions = NullPointerException.class) public void testUrnNull() { UrnValidationUtil.validateUrn(entityRegistry, null, true); } - @Test - public void testValidPartialUrlEncode() { - Urn validUrn = UrnUtils.getUrn("urn:li:assertion:123=-%28__% weekly__%29"); + /** + * Common method to validate URNs from a file and return the validation results. + * + * @param filePath Path to the file containing URNs. + * @return ValidationResult containing lists of valid and invalid URNs. + * @throws IOException If there is an error reading the file. 
+ */ + private ValidationResult validateUrnsFromFile( + @Nonnull String filePath, @Nonnull Set excludePrefix) + throws IOException, URISyntaxException { + List invalidUrns = new ArrayList<>(); + List validUrns = new ArrayList<>(); + int totalUrns = 0; + + File file = new File(filePath); + BufferedReader reader = new BufferedReader(new FileReader(file)); + String line; + + while ((line = reader.readLine()) != null) { + // Skip empty lines and comment lines (starting with #) + line = line.trim(); + final String excludeCheck = line; + if (line.isEmpty() + || line.startsWith("#") + || excludePrefix.stream().anyMatch(excludeCheck::startsWith)) { + continue; + } + + totalUrns++; + + try { + Urn urn = UrnUtils.getUrn(line); + UrnValidationUtil.validateUrn(entityRegistry, urn, true); + validUrns.add(line); + } catch (Exception e) { + invalidUrns.add(line + " - Error: " + e.getMessage()); + } + + if (validUrns.contains(line)) { + // If valid should also parse correctly + Urn.createFromString(line); + } + } - UrnValidationUtil.validateUrn(entityRegistry, validUrn, true); - // If no exception is thrown, test passes - } + reader.close(); - @Test - public void testValidPartialUrlEncode2() { - Urn validUrn = - UrnUtils.getUrn( - "urn:li:dataset:(urn:li:dataPlatform:s3,urn:li:dataset:%28urn:li:dataPlatform:s3%2Ctest-datalake-concepts%prog_maintenance%2CPROD%29,PROD)"); + // Print summary + System.out.println("File: " + filePath); + System.out.println("Total URNs processed: " + totalUrns); + System.out.println("Valid URNs: " + validUrns.size()); + System.out.println("Invalid URNs: " + invalidUrns.size()); - UrnValidationUtil.validateUrn(entityRegistry, validUrn, true); - // If no exception is thrown, test passes + return new ValidationResult(validUrns, invalidUrns, totalUrns); } - @Test - public void testValidColon() { - Urn validUrn = - UrnUtils.getUrn("urn:li:dashboard:(looker,dashboards.thelook::cohort_data_tool)"); + /** + * Test method to validate URNs from a file containing 
valid URNs. Expects all URNs in the file to + * be valid. + * + * @param filePath Path to the file containing valid URNs. + */ + @Test(dataProvider = "validUrnFilePathProvider") + public void testValidateValidUrnsFromFile(String filePath) throws URISyntaxException { + try { + ValidationResult result = validateUrnsFromFile(filePath, Set.of("urn:li:abc:")); + + // Print invalid URNs if any exist + if (!result.getInvalidUrns().isEmpty()) { + System.out.println("Invalid URNs found in valid URN file:"); + result.getInvalidUrns().forEach(System.out::println); + Assert.fail("Found " + result.getInvalidUrns().size() + " invalid URNs in valid URN file"); + } - UrnValidationUtil.validateUrn(entityRegistry, validUrn, true); - // If no exception is thrown, test passes + // Assert that we have at least one test case + Assert.assertTrue(result.getValidUrns().size() > 0, "No valid URNs found in the file"); + + } catch (IOException e) { + Assert.fail("Failed to read the file: " + e.getMessage()); + } } - @Test - public void testNoTupleComma() { - Urn invalidUrn = UrnUtils.getUrn("urn:li:corpuser:,"); - UrnValidationUtil.validateUrn(entityRegistry, invalidUrn, true); - // If no exception is thrown, test passes + /** + * Test method to validate URNs from a file containing invalid URNs. Expects all URNs in the file + * to be invalid. + * + * @param filePath Path to the file containing invalid URNs. 
+ */ + @Test(dataProvider = "invalidUrnFilePathProvider") + public void testValidateInvalidUrnsFromFile(String filePath) throws URISyntaxException { + try { + ValidationResult result = validateUrnsFromFile(filePath, Set.of()); + + // Print valid URNs if any exist + if (!result.getValidUrns().isEmpty()) { + System.out.println("Valid URNs found in invalid URN file:"); + result.getValidUrns().forEach(System.out::println); + Assert.fail("Found " + result.getValidUrns().size() + " valid URNs in invalid URN file"); + } + + // Assert that we have at least one test case + Assert.assertTrue(result.getInvalidUrns().size() > 0, "No invalid URNs found in the file"); + + } catch (IOException e) { + Assert.fail("Failed to read the file: " + e.getMessage()); + } + } + + /** + * Data provider for the valid URN file paths. + * + * @return Array of test data + */ + @DataProvider(name = "validUrnFilePathProvider") + public Object[][] validUrnFilePathProvider() { + return new Object[][] {{"../metadata-ingestion/tests/unit/urns/valid_urns.txt"} + // Add more test files as needed + }; + } + + /** + * Data provider for the invalid URN file paths. + * + * @return Array of test data + */ + @DataProvider(name = "invalidUrnFilePathProvider") + public Object[][] invalidUrnFilePathProvider() { + return new Object[][] { + {"../metadata-ingestion/tests/unit/urns/invalid_urns.txt"}, + {"../metadata-ingestion/tests/unit/urns/invalid_urns_java_only.txt"} + // Add more test files as needed + }; + } + + /** Class to hold validation results. 
*/ + private static class ValidationResult { + private final List validUrns; + private final List invalidUrns; + private final int totalUrns; + + public ValidationResult(List validUrns, List invalidUrns, int totalUrns) { + this.validUrns = validUrns; + this.invalidUrns = invalidUrns; + this.totalUrns = totalUrns; + } + + public List getValidUrns() { + return validUrns; + } + + public List getInvalidUrns() { + return invalidUrns; + } + + public int getTotalUrns() { + return totalUrns; + } } } From f14c42d2ef7754819ce9d5584a9407a8c8df5dea Mon Sep 17 00:00:00 2001 From: Chris Collins Date: Tue, 25 Feb 2025 17:05:23 -0500 Subject: [PATCH 17/45] fix(ui) Fix NPE in pluralize function (#12629) --- metadata-service/war/src/main/resources/boot/policies.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metadata-service/war/src/main/resources/boot/policies.json b/metadata-service/war/src/main/resources/boot/policies.json index 5d4d0730f96a6..2026162d7fbc7 100644 --- a/metadata-service/war/src/main/resources/boot/policies.json +++ b/metadata-service/war/src/main/resources/boot/policies.json @@ -142,7 +142,7 @@ "privileges": [ "VIEW_DATASET_USAGE", "VIEW_DATASET_PROFILE", - "VIEW_DATASET_OPERATIONS_PRIVILEGE" + "VIEW_DATASET_OPERATIONS" ], "displayName": "All Users - View Dataset Sensitive Information", "description": "Grants viewing privileges of usage and profile information of all datasets for all users", From 5f5e395c90df6ca063c3edcd2432ba12b4406564 Mon Sep 17 00:00:00 2001 From: Rasnar Date: Wed, 26 Feb 2025 08:36:06 +0100 Subject: [PATCH 18/45] Fix platform instance support on Druid ingestion (#12716) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: rasnar <11248833+Rasnar@users.noreply.github.com> Co-authored-by: Sergio Gómez Villamor --- docs/how/updating-datahub.md | 2 + .../src/datahub/ingestion/source/sql/druid.py | 6 +- .../integration/druid/golden/druid_mces.json | 144 +++++++++--------- 
.../tests/unit/test_druid_source.py | 6 + 4 files changed, 81 insertions(+), 77 deletions(-) diff --git a/docs/how/updating-datahub.md b/docs/how/updating-datahub.md index c8fcb0021414f..adb86c1bce1b3 100644 --- a/docs/how/updating-datahub.md +++ b/docs/how/updating-datahub.md @@ -26,6 +26,8 @@ This file documents any backwards-incompatible changes in DataHub and assists pe - #12671: The `priority` field of the Incident entity is changed from an integer to an enum. This field was previously completely unused in UI and API, so this change should not affect existing deployments. +- #12716: Fix the `platform_instance` being added twice to the URN in the Druid ingestion source. If you want to have the previous behavior back, you need to add your platform_instance twice (e.g. `plat.plat`). + ### Known Issues diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/druid.py b/metadata-ingestion/src/datahub/ingestion/source/sql/druid.py index 3f20e0a0f18b6..7a15c766cba66 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/druid.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/druid.py @@ -50,11 +50,7 @@ def get_sql_alchemy_url(self): """ def get_identifier(self, schema: str, table: str) -> str: - return ( - f"{self.platform_instance}.{table}" - if self.platform_instance - else f"{table}" - ) + return f"{table}" @platform_name("Druid") diff --git a/metadata-ingestion/tests/integration/druid/golden/druid_mces.json b/metadata-ingestion/tests/integration/druid/golden/druid_mces.json index e4a276a0e29b8..5b9e156bebf82 100644 --- a/metadata-ingestion/tests/integration/druid/golden/druid_mces.json +++ b/metadata-ingestion/tests/integration/druid/golden/druid_mces.json @@ -18,7 +18,7 @@ }, "systemMetadata": { "lastObserved": 1740387600000, - "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "runId": "druid-2025_02_24-09_00_00-er4usx", "lastRunId": "no-run-id-provided" } }, @@ -34,7 +34,7 @@ }, "systemMetadata": { "lastObserved": 1740387600000, - "runId":
"druid-2025_02_24-09_00_00-p8a5zq", + "runId": "druid-2025_02_24-09_00_00-er4usx", "lastRunId": "no-run-id-provided" } }, @@ -51,7 +51,7 @@ }, "systemMetadata": { "lastObserved": 1740387600000, - "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "runId": "druid-2025_02_24-09_00_00-er4usx", "lastRunId": "no-run-id-provided" } }, @@ -69,7 +69,7 @@ }, "systemMetadata": { "lastObserved": 1740387600000, - "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "runId": "druid-2025_02_24-09_00_00-er4usx", "lastRunId": "no-run-id-provided" } }, @@ -90,7 +90,7 @@ }, "systemMetadata": { "lastObserved": 1740387600000, - "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "runId": "druid-2025_02_24-09_00_00-er4usx", "lastRunId": "no-run-id-provided" } }, @@ -106,7 +106,7 @@ }, "systemMetadata": { "lastObserved": 1740387600000, - "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "runId": "druid-2025_02_24-09_00_00-er4usx", "lastRunId": "no-run-id-provided" } }, @@ -130,7 +130,7 @@ }, "systemMetadata": { "lastObserved": 1740387600000, - "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "runId": "druid-2025_02_24-09_00_00-er4usx", "lastRunId": "no-run-id-provided" } }, @@ -146,7 +146,7 @@ }, "systemMetadata": { "lastObserved": 1740387600000, - "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "runId": "druid-2025_02_24-09_00_00-er4usx", "lastRunId": "no-run-id-provided" } }, @@ -163,7 +163,7 @@ }, "systemMetadata": { "lastObserved": 1740387600000, - "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "runId": "druid-2025_02_24-09_00_00-er4usx", "lastRunId": "no-run-id-provided" } }, @@ -181,7 +181,7 @@ }, "systemMetadata": { "lastObserved": 1740387600000, - "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "runId": "druid-2025_02_24-09_00_00-er4usx", "lastRunId": "no-run-id-provided" } }, @@ -206,7 +206,7 @@ }, "systemMetadata": { "lastObserved": 1740387600000, - "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "runId": "druid-2025_02_24-09_00_00-er4usx", "lastRunId": "no-run-id-provided" } }, @@ -222,7 +222,7 @@ 
}, "systemMetadata": { "lastObserved": 1740387600000, - "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "runId": "druid-2025_02_24-09_00_00-er4usx", "lastRunId": "no-run-id-provided" } }, @@ -246,7 +246,7 @@ }, "systemMetadata": { "lastObserved": 1740387600000, - "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "runId": "druid-2025_02_24-09_00_00-er4usx", "lastRunId": "no-run-id-provided" } }, @@ -262,7 +262,7 @@ }, "systemMetadata": { "lastObserved": 1740387600000, - "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "runId": "druid-2025_02_24-09_00_00-er4usx", "lastRunId": "no-run-id-provided" } }, @@ -279,7 +279,7 @@ }, "systemMetadata": { "lastObserved": 1740387600000, - "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "runId": "druid-2025_02_24-09_00_00-er4usx", "lastRunId": "no-run-id-provided" } }, @@ -297,7 +297,7 @@ }, "systemMetadata": { "lastObserved": 1740387600000, - "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "runId": "druid-2025_02_24-09_00_00-er4usx", "lastRunId": "no-run-id-provided" } }, @@ -322,13 +322,13 @@ }, "systemMetadata": { "lastObserved": 1740387600000, - "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "runId": "druid-2025_02_24-09_00_00-er4usx", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:druid,my_druid.my_druid.segments,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:druid,my_druid.segments,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { @@ -338,14 +338,14 @@ }, "systemMetadata": { "lastObserved": 1740387600000, - "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "runId": "druid-2025_02_24-09_00_00-er4usx", "lastRunId": "no-run-id-provided" } }, { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:druid,my_druid.my_druid.segments,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:druid,my_druid.segments,PROD)", "aspects": [ { 
"com.linkedin.pegasus2avro.common.Status": { @@ -362,7 +362,7 @@ }, { "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "my_druid.segments", + "schemaName": "segments", "platform": "urn:li:dataPlatform:druid", "version": 0, "created": { @@ -616,13 +616,13 @@ }, "systemMetadata": { "lastObserved": 1740387600000, - "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "runId": "druid-2025_02_24-09_00_00-er4usx", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:druid,my_druid.my_druid.segments,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:druid,my_druid.segments,PROD)", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { @@ -633,13 +633,13 @@ }, "systemMetadata": { "lastObserved": 1740387600000, - "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "runId": "druid-2025_02_24-09_00_00-er4usx", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:druid,my_druid.my_druid.segments,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:druid,my_druid.segments,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -651,13 +651,13 @@ }, "systemMetadata": { "lastObserved": 1740387600000, - "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "runId": "druid-2025_02_24-09_00_00-er4usx", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:druid,my_druid.my_druid.segments,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:druid,my_druid.segments,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -680,13 +680,13 @@ }, "systemMetadata": { "lastObserved": 1740387600000, - "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "runId": "druid-2025_02_24-09_00_00-er4usx", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:druid,my_druid.my_druid.server_segments,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:druid,my_druid.server_segments,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { @@ -696,14 +696,14 @@ }, "systemMetadata": { "lastObserved": 1740387600000, - "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "runId": "druid-2025_02_24-09_00_00-er4usx", "lastRunId": "no-run-id-provided" } }, { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:druid,my_druid.my_druid.server_segments,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:druid,my_druid.server_segments,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.Status": { @@ -720,7 +720,7 @@ }, { "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "my_druid.server_segments", + "schemaName": "server_segments", "platform": "urn:li:dataPlatform:druid", "version": 0, "created": { @@ -770,13 +770,13 @@ }, "systemMetadata": { "lastObserved": 1740387600000, - "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "runId": "druid-2025_02_24-09_00_00-er4usx", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:druid,my_druid.my_druid.server_segments,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:druid,my_druid.server_segments,PROD)", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { @@ -787,13 +787,13 @@ }, "systemMetadata": { "lastObserved": 1740387600000, - "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "runId": "druid-2025_02_24-09_00_00-er4usx", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:druid,my_druid.my_druid.server_segments,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:druid,my_druid.server_segments,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ 
-805,13 +805,13 @@ }, "systemMetadata": { "lastObserved": 1740387600000, - "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "runId": "druid-2025_02_24-09_00_00-er4usx", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:druid,my_druid.my_druid.server_segments,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:druid,my_druid.server_segments,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -834,13 +834,13 @@ }, "systemMetadata": { "lastObserved": 1740387600000, - "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "runId": "druid-2025_02_24-09_00_00-er4usx", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:druid,my_druid.my_druid.servers,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:druid,my_druid.servers,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { @@ -850,14 +850,14 @@ }, "systemMetadata": { "lastObserved": 1740387600000, - "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "runId": "druid-2025_02_24-09_00_00-er4usx", "lastRunId": "no-run-id-provided" } }, { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:druid,my_druid.my_druid.servers,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:druid,my_druid.servers,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.Status": { @@ -874,7 +874,7 @@ }, { "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "my_druid.servers", + "schemaName": "servers", "platform": "urn:li:dataPlatform:druid", "version": 0, "created": { @@ -1020,13 +1020,13 @@ }, "systemMetadata": { "lastObserved": 1740387600000, - "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "runId": "druid-2025_02_24-09_00_00-er4usx", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:druid,my_druid.my_druid.servers,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:druid,my_druid.servers,PROD)", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { @@ -1037,13 +1037,13 @@ }, "systemMetadata": { "lastObserved": 1740387600000, - "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "runId": "druid-2025_02_24-09_00_00-er4usx", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:druid,my_druid.my_druid.servers,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:druid,my_druid.servers,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -1055,13 +1055,13 @@ }, "systemMetadata": { "lastObserved": 1740387600000, - "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "runId": "druid-2025_02_24-09_00_00-er4usx", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:druid,my_druid.my_druid.servers,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:druid,my_druid.servers,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -1084,13 +1084,13 @@ }, "systemMetadata": { "lastObserved": 1740387600000, - "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "runId": "druid-2025_02_24-09_00_00-er4usx", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:druid,my_druid.my_druid.supervisors,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:druid,my_druid.supervisors,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { @@ -1100,14 +1100,14 @@ }, "systemMetadata": { "lastObserved": 1740387600000, - "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "runId": "druid-2025_02_24-09_00_00-er4usx", "lastRunId": "no-run-id-provided" } }, { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": 
"urn:li:dataset:(urn:li:dataPlatform:druid,my_druid.my_druid.supervisors,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:druid,my_druid.supervisors,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.Status": { @@ -1124,7 +1124,7 @@ }, { "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "my_druid.supervisors", + "schemaName": "supervisors", "platform": "urn:li:dataPlatform:druid", "version": 0, "created": { @@ -1246,13 +1246,13 @@ }, "systemMetadata": { "lastObserved": 1740387600000, - "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "runId": "druid-2025_02_24-09_00_00-er4usx", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:druid,my_druid.my_druid.supervisors,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:druid,my_druid.supervisors,PROD)", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { @@ -1263,13 +1263,13 @@ }, "systemMetadata": { "lastObserved": 1740387600000, - "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "runId": "druid-2025_02_24-09_00_00-er4usx", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:druid,my_druid.my_druid.supervisors,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:druid,my_druid.supervisors,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -1281,13 +1281,13 @@ }, "systemMetadata": { "lastObserved": 1740387600000, - "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "runId": "druid-2025_02_24-09_00_00-er4usx", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:druid,my_druid.my_druid.supervisors,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:druid,my_druid.supervisors,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -1310,13 +1310,13 @@ }, "systemMetadata": { "lastObserved": 1740387600000, - "runId": 
"druid-2025_02_24-09_00_00-p8a5zq", + "runId": "druid-2025_02_24-09_00_00-er4usx", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:druid,my_druid.my_druid.tasks,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:druid,my_druid.tasks,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { @@ -1326,14 +1326,14 @@ }, "systemMetadata": { "lastObserved": 1740387600000, - "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "runId": "druid-2025_02_24-09_00_00-er4usx", "lastRunId": "no-run-id-provided" } }, { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:druid,my_druid.my_druid.tasks,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:druid,my_druid.tasks,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.Status": { @@ -1350,7 +1350,7 @@ }, { "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "my_druid.tasks", + "schemaName": "tasks", "platform": "urn:li:dataPlatform:druid", "version": 0, "created": { @@ -1544,13 +1544,13 @@ }, "systemMetadata": { "lastObserved": 1740387600000, - "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "runId": "druid-2025_02_24-09_00_00-er4usx", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:druid,my_druid.my_druid.tasks,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:druid,my_druid.tasks,PROD)", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { @@ -1561,13 +1561,13 @@ }, "systemMetadata": { "lastObserved": 1740387600000, - "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "runId": "druid-2025_02_24-09_00_00-er4usx", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:druid,my_druid.my_druid.tasks,PROD)", + "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:druid,my_druid.tasks,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -1579,13 +1579,13 @@ }, "systemMetadata": { "lastObserved": 1740387600000, - "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "runId": "druid-2025_02_24-09_00_00-er4usx", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:druid,my_druid.my_druid.tasks,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:druid,my_druid.tasks,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -1608,7 +1608,7 @@ }, "systemMetadata": { "lastObserved": 1740387600000, - "runId": "druid-2025_02_24-09_00_00-p8a5zq", + "runId": "druid-2025_02_24-09_00_00-er4usx", "lastRunId": "no-run-id-provided" } } diff --git a/metadata-ingestion/tests/unit/test_druid_source.py b/metadata-ingestion/tests/unit/test_druid_source.py index 4b21ec9633dc4..504fb13700a78 100644 --- a/metadata-ingestion/tests/unit/test_druid_source.py +++ b/metadata-ingestion/tests/unit/test_druid_source.py @@ -5,3 +5,9 @@ def test_druid_uri(): config = DruidConfig.parse_obj({"host_port": "localhost:8082"}) assert config.get_sql_alchemy_url() == "druid://localhost:8082/druid/v2/sql/" + + +def test_druid_get_identifier(): + config = DruidConfig.parse_obj({"host_port": "localhost:8082"}) + + assert config.get_identifier("schema", "table") == "table" From 7cfee8bc040f5a49963d86379f6087522f2758af Mon Sep 17 00:00:00 2001 From: Chakru <161002324+chakru-r@users.noreply.github.com> Date: Wed, 26 Feb 2025 17:24:53 +0530 Subject: [PATCH 19/45] ci(coverage): update patch coverage threshold (#12733) --- .github/.codecov.yml | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/.github/.codecov.yml b/.github/.codecov.yml index 1faf5a6bab464..4a459d13b60a2 100644 --- a/.github/.codecov.yml +++ b/.github/.codecov.yml @@ -1,21 +1,19 @@ comment: - layout: "header, files, footer" # 
remove "new" from "header" and "footer" - hide_project_coverage: true # set to false + layout: "condensed_header, condensed_files, condensed_footer" + hide_project_coverage: true require_changes: false # if true: only post the comment if coverage changes codecov: #due to ci-optimization, reports for modules that have not changed may be quite old max_report_age: off +github_checks: + #Hide annotations that show up in github PR reviews. There still is a red bar next to lines not covered + annotations: false + flag_management: default_rules: # the rules that will be followed for any flag added, generally carryforward: true - statuses: - - type: project - target: auto - threshold: 0% #Not enforcing project coverage yet. - - type: patch - target: 90% individual_flags: # exceptions to the default rules above, stated flag by flag - name: frontend paths: @@ -55,11 +53,8 @@ flag_management: - "metadata-ingestion-modules/prefect-plugin/**" coverage: status: - project: - default: - target: 0% # no threshold enforcement yet - only_pulls: true + project: false patch: default: - target: 90% # for new code added in the patch - only_pulls: true + target: 75% # for new code added in the patch + only_pulls: true \ No newline at end of file From eaa17a073dfc4b99a20dc5f0dedbf168a5d33db1 Mon Sep 17 00:00:00 2001 From: Chris Collins Date: Wed, 26 Feb 2025 09:53:33 -0500 Subject: [PATCH 20/45] fix(ui) Fix bug with date dropdown in deprecation modal (#12633) --- .../EntityDropdown/EntityMenuActions.tsx | 5 ++++- datahub-web-react/src/conf/index.ts | 20 +++++++++++++++++-- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/datahub-web-react/src/app/entityV2/shared/EntityDropdown/EntityMenuActions.tsx b/datahub-web-react/src/app/entityV2/shared/EntityDropdown/EntityMenuActions.tsx index 0a3d8a4ff4e95..ec6a3186e0621 100644 --- a/datahub-web-react/src/app/entityV2/shared/EntityDropdown/EntityMenuActions.tsx +++ 
b/datahub-web-react/src/app/entityV2/shared/EntityDropdown/EntityMenuActions.tsx @@ -1,6 +1,7 @@ import { MoreOutlined } from '@ant-design/icons'; import React, { useContext } from 'react'; import styled from 'styled-components'; +import { useAppConfig } from '@src/app/useAppConfig'; import { useEntityData, useRefetch } from '../../../entity/shared/EntityContext'; import ShareMenuAction from '../../../shared/share/v2/ShareMenuAction'; import EntitySidebarContext from '../../../sharedV2/EntitySidebarContext'; @@ -63,6 +64,8 @@ function EntityMenuActions(props: Props) { const refetch = useRefetch(); + const { entityVersioningEnabled } = useAppConfig().config.featureFlags; + const hasVersioningActions = !!(menuItems.has(EntityMenuItems.LINK_VERSION) || entityData?.versionProperties); return ( <> @@ -76,7 +79,7 @@ function EntityMenuActions(props: Props) { )} {menuItems.has(EntityMenuItems.RAISE_INCIDENT) && } - {hasVersioningActions && ( + {entityVersioningEnabled && hasVersioningActions && ( Date: Wed, 26 Feb 2025 11:11:18 -0500 Subject: [PATCH 21/45] fix(ui) Fix group membership inconsistencies on group page (#12704) --- .../entityV2/group/GroupInfoHeaderSection.tsx | 4 +-- .../GroupMembersSidebarSectionContent.tsx | 33 ++++++++++++------- .../src/app/entityV2/group/GroupProfile.tsx | 8 ++--- .../group/GroupSidebarMembersSection.tsx | 5 ++- .../src/app/entityV2/group/types.ts | 4 +++ .../profile/sidebar/SidebarSection.tsx | 8 ++++- 6 files changed, 39 insertions(+), 23 deletions(-) create mode 100644 datahub-web-react/src/app/entityV2/group/types.ts diff --git a/datahub-web-react/src/app/entityV2/group/GroupInfoHeaderSection.tsx b/datahub-web-react/src/app/entityV2/group/GroupInfoHeaderSection.tsx index 0dd47fcca1132..2c7f779f13dff 100644 --- a/datahub-web-react/src/app/entityV2/group/GroupInfoHeaderSection.tsx +++ b/datahub-web-react/src/app/entityV2/group/GroupInfoHeaderSection.tsx @@ -43,13 +43,13 @@ export const GroupInfoHeaderSection = ({ isExternalGroup, 
groupName, }: Props) => { - const groupMemberRelationshipsCount = groupMemberRelationships?.count || 0; + const groupMemberRelationshipsTotal = groupMemberRelationships?.total || 0; return ( {groupName} - {groupMemberRelationshipsCount > 0 && {groupMemberRelationships?.count} members} + {groupMemberRelationshipsTotal > 0 && {groupMemberRelationshipsTotal} members} {isExternalGroup && ( ; + groupMemberRelationships: EntityRelationshipsResult; }; -const DEFAULT_MAX_ENTITIES_TO_SHOW = 4; +const DEFAULT_MAX_ENTITIES_TO_SHOW = 5; -export default function GroupMembersSidebarSectionContent({ relationships }: Props) { +export default function GroupMembersSidebarSectionContent({ groupMemberRelationships }: Props) { + const history = useHistory(); + const { url } = useRouteMatch(); const [entityCount, setEntityCount] = useState(DEFAULT_MAX_ENTITIES_TO_SHOW); const entityRegistry = useEntityRegistry(); - const relationshipsCount = relationships?.length || 0; + const relationshipsTotal = groupMemberRelationships?.total || 0; + const relationshipsAvailableCount = groupMemberRelationships.relationships?.length || 0; return ( <> - {relationships.length === 0 && ( + {relationshipsTotal === 0 && ( No members yet. 
)} - {relationships.length > 0 && - relationships.map((item, index) => { + {relationshipsTotal > 0 && + groupMemberRelationships.relationships.map((item, index) => { const user = item.entity as CorpUser; return index < entityCount && ; })} - {relationshipsCount > entityCount && ( + {relationshipsAvailableCount > entityCount && ( )} + {relationshipsTotal > relationshipsAvailableCount && entityCount >= relationshipsAvailableCount && ( + history.replace(`${url}/${TabType.Members.toLocaleLowerCase()}`)}> + View all members + + )} ); } diff --git a/datahub-web-react/src/app/entityV2/group/GroupProfile.tsx b/datahub-web-react/src/app/entityV2/group/GroupProfile.tsx index 8056d72535a6d..6659178d12be4 100644 --- a/datahub-web-react/src/app/entityV2/group/GroupProfile.tsx +++ b/datahub-web-react/src/app/entityV2/group/GroupProfile.tsx @@ -26,14 +26,10 @@ import EntitySidebarContext from '../../sharedV2/EntitySidebarContext'; import SidebarCollapsibleHeader from '../shared/containers/profile/sidebar/SidebarCollapsibleHeader'; import { EntitySidebarTabs } from '../shared/containers/profile/sidebar/EntitySidebarTabs'; import { REDESIGN_COLORS } from '../shared/constants'; +import { TabType } from './types'; const messageStyle = { marginTop: '10%' }; -export enum TabType { - Assets = 'Owner Of', - Members = 'Members', -} - const ENABLED_TAB_TYPES = [TabType.Assets, TabType.Members]; const MEMBER_PAGE_SIZE = 15; @@ -119,7 +115,7 @@ export default function GroupProfile({ urn }: Props) { }, { name: TabType.Members, - path: TabType.Members.toLocaleLowerCase(), + path: TabType.Members.toLocaleLowerCase(), // do not remove toLocaleLowerCase as we link to this tab elsewhere content: ( - } + showFullCount + content={} /> ); }; diff --git a/datahub-web-react/src/app/entityV2/group/types.ts b/datahub-web-react/src/app/entityV2/group/types.ts new file mode 100644 index 0000000000000..ece22d739ccb7 --- /dev/null +++ b/datahub-web-react/src/app/entityV2/group/types.ts @@ -0,0 +1,4 @@ 
+export enum TabType { + Assets = 'Owner Of', + Members = 'Members', +} diff --git a/datahub-web-react/src/app/entityV2/shared/containers/profile/sidebar/SidebarSection.tsx b/datahub-web-react/src/app/entityV2/shared/containers/profile/sidebar/SidebarSection.tsx index d4c93b34b1269..202312154346d 100644 --- a/datahub-web-react/src/app/entityV2/shared/containers/profile/sidebar/SidebarSection.tsx +++ b/datahub-web-react/src/app/entityV2/shared/containers/profile/sidebar/SidebarSection.tsx @@ -78,6 +78,7 @@ type Props = { collapsedContent?: React.ReactNode; collapsible?: boolean; expandedByDefault?: boolean; + showFullCount?: boolean; }; export const SidebarSection = ({ @@ -88,6 +89,7 @@ export const SidebarSection = ({ collapsedContent, collapsible = true, expandedByDefault = true, + showFullCount, }: Props) => { return ( {title} - {count > 0 && {count > 10 ? '10+' : count}} + {count > 0 && ( + + {showFullCount ? <>{count} : <>{count > 10 ? '10+' : count}} + + )} {collapsedContent} From 815e688b76b1deee1cd9f202345161d151b56c16 Mon Sep 17 00:00:00 2001 From: Chris Collins Date: Wed, 26 Feb 2025 11:16:44 -0500 Subject: [PATCH 22/45] fix(ui) Properly get display name when downloading search results (#12720) --- .../shared/components/styled/search/downloadAsCsvUtil.ts | 7 ++++--- .../shared/components/styled/search/downloadAsCsvUtil.ts | 5 +++-- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/datahub-web-react/src/app/entity/shared/components/styled/search/downloadAsCsvUtil.ts b/datahub-web-react/src/app/entity/shared/components/styled/search/downloadAsCsvUtil.ts index 9b3f4671abc79..d000359499ce6 100644 --- a/datahub-web-react/src/app/entity/shared/components/styled/search/downloadAsCsvUtil.ts +++ b/datahub-web-react/src/app/entity/shared/components/styled/search/downloadAsCsvUtil.ts @@ -33,6 +33,7 @@ export const getSearchCsvDownloadHeader = (sampleResult?: SearchResultInterface) }; export const transformGenericEntityPropertiesToCsvRow = ( + 
entityRegistry: EntityRegistry, properties: GenericEntityProperties | null, entityUrl: string, result: SearchResultInterface, @@ -41,11 +42,11 @@ export const transformGenericEntityPropertiesToCsvRow = ( // urn properties?.urn || '', // name - properties?.name || '', + entityRegistry.getDisplayName(result.entity.type, result.entity) || properties?.name || '', // type result.entity.type || '', // description - properties?.properties?.description || '', + properties?.properties?.description || properties?.editableProperties?.description || '', // user owners properties?.ownership?.owners ?.filter((owner) => owner.owner.type === EntityType.CorpUser) @@ -98,6 +99,6 @@ export const transformResultsToCsvRow = (results: SearchResultInterface[], entit return results.map((result) => { const genericEntityProperties = entityRegistry.getGenericEntityProperties(result.entity.type, result.entity); const entityUrl = entityRegistry.getEntityUrl(result.entity.type, result.entity.urn); - return transformGenericEntityPropertiesToCsvRow(genericEntityProperties, entityUrl, result); + return transformGenericEntityPropertiesToCsvRow(entityRegistry, genericEntityProperties, entityUrl, result); }); }; diff --git a/datahub-web-react/src/app/entityV2/shared/components/styled/search/downloadAsCsvUtil.ts b/datahub-web-react/src/app/entityV2/shared/components/styled/search/downloadAsCsvUtil.ts index 99ae03ab9c385..8d0dc5d5de902 100644 --- a/datahub-web-react/src/app/entityV2/shared/components/styled/search/downloadAsCsvUtil.ts +++ b/datahub-web-react/src/app/entityV2/shared/components/styled/search/downloadAsCsvUtil.ts @@ -33,6 +33,7 @@ export const getSearchCsvDownloadHeader = (sampleResult?: SearchResultInterface) }; export const transformGenericEntityPropertiesToCsvRow = ( + entityRegistry: EntityRegistry, properties: GenericEntityProperties | null, entityUrl: string, result: SearchResultInterface, @@ -41,7 +42,7 @@ export const transformGenericEntityPropertiesToCsvRow = ( // urn 
properties?.urn || '', // name - properties?.name || '', + entityRegistry.getDisplayName(result.entity.type, result.entity) || properties?.name || '', // type result.entity.type || '', // description @@ -98,6 +99,6 @@ export const transformResultsToCsvRow = (results: SearchResultInterface[], entit return results.map((result) => { const genericEntityProperties = entityRegistry.getGenericEntityProperties(result.entity.type, result.entity); const entityUrl = entityRegistry.getEntityUrl(result.entity.type, result.entity.urn); - return transformGenericEntityPropertiesToCsvRow(genericEntityProperties, entityUrl, result); + return transformGenericEntityPropertiesToCsvRow(entityRegistry, genericEntityProperties, entityUrl, result); }); }; From a5af1a8b7e1e407909dc622c3f2e5c6f6ea5020b Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Wed, 26 Feb 2025 09:24:38 -0800 Subject: [PATCH 23/45] fix(ingest): bump avro dep (#12729) --- metadata-ingestion/setup.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index 3bf8917f23f89..d653f4da4f387 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -38,7 +38,8 @@ "expandvars>=0.6.5", "avro-gen3==0.7.16", # "avro-gen3 @ git+https://github.com/acryldata/avro_gen@master#egg=avro-gen3", - "avro>=1.11.3,<1.12", + # avro has historically made breaking changes, so we have a cautious upper bound. + "avro>=1.11.3,<1.13", "python-dateutil>=2.8.0", "tabulate", "progressbar2", @@ -76,7 +77,7 @@ # now provide prebuilt wheels for most platforms, including M1 Macs and # Linux aarch64 (e.g. Docker's linux/arm64). Installing confluent_kafka # from source remains a pain. - "confluent_kafka[schemaregistry]>=1.9.0", + "confluent_kafka[schemaregistry,avro]>=1.9.0", # We currently require both Avro libraries. The codegen uses avro-python3 (above) # schema parsers at runtime for generating and reading JSON into Python objects. 
# At the same time, we use Kafka's AvroSerializer, which internally relies on From 45c318788450d4abd63bc0980ce911b361aaed57 Mon Sep 17 00:00:00 2001 From: Chris Collins Date: Wed, 26 Feb 2025 13:04:58 -0500 Subject: [PATCH 24/45] fix(ui) Filter healthy assets out of unhealthy upstreams component (#12705) --- .../shared/embed/UpstreamHealth/UpstreamHealth.tsx | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/datahub-web-react/src/app/entityV2/shared/embed/UpstreamHealth/UpstreamHealth.tsx b/datahub-web-react/src/app/entityV2/shared/embed/UpstreamHealth/UpstreamHealth.tsx index 7095746e5fc8a..928f7c83e8ea4 100644 --- a/datahub-web-react/src/app/entityV2/shared/embed/UpstreamHealth/UpstreamHealth.tsx +++ b/datahub-web-react/src/app/entityV2/shared/embed/UpstreamHealth/UpstreamHealth.tsx @@ -3,6 +3,7 @@ import { Divider } from 'antd'; import React, { useEffect, useState } from 'react'; import styled from 'styled-components'; import { ErrorRounded } from '@mui/icons-material'; +import { isUnhealthy } from '@src/app/shared/health/healthUtils'; import { useSearchAcrossLineageQuery } from '../../../../../graphql/search.generated'; import { Dataset, EntityType, FilterOperator, LineageDirection } from '../../../../../types.generated'; import { @@ -184,7 +185,10 @@ export default function UpstreamHealth() { setIndirectUpstreamsDataStart(newStart); } - const hasUnhealthyUpstreams = directUpstreamEntities.length || indirectUpstreamEntities.length; + const unhealthyDirectUpstreams = directUpstreamEntities.filter((e) => e.health && isUnhealthy(e.health)); + const unhealthyIndirectUpstreams = indirectUpstreamEntities.filter((e) => e.health && isUnhealthy(e.health)); + + const hasUnhealthyUpstreams = unhealthyDirectUpstreams.length || unhealthyIndirectUpstreams.length; if (!hasUnhealthyUpstreams) return null; @@ -200,13 +204,13 @@ export default function UpstreamHealth() { {isOpen && ( Date: Wed, 26 Feb 2025 11:17:31 -0800 Subject: [PATCH 25/45] docs: 
update slack link (#12731) --- README.md | 46 +++++++++++-------- docs-website/docusaurus.config.js | 4 +- docs-website/src/components/SlackUtm/index.js | 2 +- docs-website/src/pages/slack/index.js | 27 ++--------- .../Navbar/communityCardDropdownContent.js | 2 +- docs/slack.md | 4 +- .../examples/mce_files/bootstrap_mce.json | 2 +- 7 files changed, 39 insertions(+), 48 deletions(-) diff --git a/README.md b/README.md index 3e85f68142d5f..a6faa03657126 100644 --- a/README.md +++ b/README.md @@ -18,27 +18,38 @@ export const Logo = (props) => {

+ DataHub +

# DataHub: The Data Discovery Platform for the Modern Data Stack -## Built with ❤️ by [Acryl Data](https://acryldata.io) and [LinkedIn](https://engineering.linkedin.com) -[![Version](https://img.shields.io/github/v/release/datahub-project/datahub?include_prereleases)](https://github.com/datahub-project/datahub/releases/latest) -[![PyPI version](https://badge.fury.io/py/acryl-datahub.svg)](https://badge.fury.io/py/acryl-datahub) -[![build & test](https://github.com/datahub-project/datahub/workflows/build%20&%20test/badge.svg?branch=master&event=push)](https://github.com/datahub-project/datahub/actions?query=workflow%3A%22build+%26+test%22+branch%3Amaster+event%3Apush) -[![Docker Pulls](https://img.shields.io/docker/pulls/acryldata/datahub-gms.svg)](https://hub.docker.com/r/acryldata/datahub-gms) -[![Slack](https://img.shields.io/badge/slack-join_chat-white.svg?logo=slack&style=social)](https://datahubproject.io/slack?utm_source=github&utm_medium=readme&utm_campaign=github_readme) -[![PRs Welcome](https://img.shields.io/badge/PRs-welcome-brightgreen.svg)](https://github.com/datahub-project/datahub/blob/master/docs/CONTRIBUTING.md) -[![GitHub commit activity](https://img.shields.io/github/commit-activity/m/datahub-project/datahub)](https://github.com/datahub-project/datahub/pulls?q=is%3Apr) -[![License](https://img.shields.io/github/license/datahub-project/datahub)](https://github.com/datahub-project/datahub/blob/master/LICENSE) -[![YouTube](https://img.shields.io/youtube/channel/subscribers/UC3qFQC5IiwR5fvWEqi_tJ5w?style=social)](https://www.youtube.com/channel/UC3qFQC5IiwR5fvWEqi_tJ5w) -[![Medium](https://img.shields.io/badge/Medium-12100E?style=for-the-badge&logo=medium&logoColor=white)](https://medium.com/datahub-project) -[![Follow](https://img.shields.io/twitter/follow/datahubproject?label=Follow&style=social)](https://twitter.com/datahubproject) -### 🏠 Hosted DataHub Docs (Courtesy of Acryl Data): [datahubproject.io](https://datahubproject.io/docs) + +### Built 
with ❤️ by [Acryl Data](https://acryldata.io) and [LinkedIn](https://engineering.linkedin.com) + + --- +### 🏠 Docs: [datahubproject.io](https://datahubproject.io/docs) + [Quickstart](https://datahubproject.io/docs/quickstart) | [Features](https://datahubproject.io/docs/) | [Roadmap](https://feature-requests.datahubproject.io/roadmap) | @@ -47,6 +58,7 @@ HOSTED_DOCS_ONLY--> [Town Hall](https://datahubproject.io/docs/townhalls) --- + > 📣 DataHub Town Hall is the 4th Thursday at 9am US PT of every month - [add it to your calendar!](https://rsvp.datahubproject.io/) > > - Town-hall Zoom link: [zoom.datahubproject.io](https://zoom.datahubproject.io) @@ -70,11 +82,11 @@ Check out DataHub's [Features](docs/features.md) & [Roadmap](https://feature-req ## Demo and Screenshots -There's a [hosted demo environment](https://demo.datahubproject.io/) courtesy of [Acryl Data](https://acryldata.io) where you can explore DataHub without installing it locally +There's a [hosted demo environment](https://demo.datahubproject.io/) courtesy of [Acryl Data](https://acryldata.io) where you can explore DataHub without installing it locally. ## Quickstart -Please follow the [DataHub Quickstart Guide](https://datahubproject.io/docs/quickstart) to get a copy of DataHub up & running locally using [Docker](https://docker.com). As the guide assumes some basic knowledge of Docker, we'd recommend you to go through the "Hello World" example of [A Docker Tutorial for Beginners](https://docker-curriculum.com) if Docker is completely foreign to you. +Please follow the [DataHub Quickstart Guide](https://datahubproject.io/docs/quickstart) to run DataHub locally using [Docker](https://docker.com). ## Development @@ -106,7 +118,7 @@ We welcome contributions from the community. Please refer to our [Contributing G ## Community -Join our [Slack workspace](https://datahubproject.io/slack?utm_source=github&utm_medium=readme&utm_campaign=github_readme) for discussions and important announcements. 
You can also find out more about our upcoming [town hall meetings](docs/townhalls.md) and view past recordings. +Join our [Slack workspace](https://pages.acryl.io/slack?utm_source=github&utm_medium=readme&utm_campaign=github_readme) for discussions and important announcements. You can also find out more about our upcoming [town hall meetings](docs/townhalls.md) and view past recordings. ## Security @@ -159,8 +171,6 @@ Here are the companies that have officially adopted DataHub. Please feel free to - [Wolt](https://wolt.com) - [Zynga](https://www.zynga.com) - - ## Select Articles & Talks - [DataHub Blog](https://blog.datahubproject.io/) diff --git a/docs-website/docusaurus.config.js b/docs-website/docusaurus.config.js index a5f794065e698..9c3a3fa338dce 100644 --- a/docs-website/docusaurus.config.js +++ b/docs-website/docusaurus.config.js @@ -167,7 +167,7 @@ module.exports = { position: "right", items: [ { - to: "/slack", + href: "https://pages.acryl.io/slack?utm_source=docs&utm_medium=header&utm_campaign=docs_header", label: "Join Slack", }, { @@ -189,7 +189,7 @@ module.exports = { ], }, { - href: "/slack", + href: "https://pages.acryl.io/slack?utm_source=docs&utm_medium=header&utm_campaign=docs_header", html: `