From 2396f0e23d1e79704291c84b602a802b02d69db0 Mon Sep 17 00:00:00 2001 From: david-leifker <114954101+david-leifker@users.noreply.github.com> Date: Tue, 25 Feb 2025 15:57:17 -0600 Subject: [PATCH] fix(urn-validation): additional test cases for urn validation (#12727) --- .../tests/unit/urns/invalid_urns.txt | 8 + .../unit/urns/invalid_urns_java_only.txt | 4 + .../tests/unit/urns/valid_urns.txt | 17 +- .../metadata/utils/UrnValidationUtil.java | 122 ++++---- .../metadata/utils/UrnValidationUtilTest.java | 271 ++++++++++-------- 5 files changed, 233 insertions(+), 189 deletions(-) create mode 100644 metadata-ingestion/tests/unit/urns/invalid_urns_java_only.txt diff --git a/metadata-ingestion/tests/unit/urns/invalid_urns.txt b/metadata-ingestion/tests/unit/urns/invalid_urns.txt index 9ce2c99a1a4ee8..2003fabd652ca6 100644 --- a/metadata-ingestion/tests/unit/urns/invalid_urns.txt +++ b/metadata-ingestion/tests/unit/urns/invalid_urns.txt @@ -8,6 +8,14 @@ urn:li:corpuser:abc) # Reserved characters urn:li:corpuser:foo␟bar urn:li:tag:a,b,c +urn:li:corpuser:, +urn:li:dataset:(urn:li:dataPlatform:hdfs␟path,PROD) +urn:li:dashboard:(looker,dashboards,thelook) +urn:li:dataset:(urn:li:dataPlatform:hdfs,/path/to/data,()) +urn:li:dataset:(urn:li:dataPlatform:hdfs,(illegal),PROD) +urn:li:corpuser:(foo)123 +urn:li:dataJob:(urn:li:dataFlow:(mssql,1/2/3/4.c_n on %28LOCAL%29,PROD),1/2/3/4.c_n on (LOCAL)) +urn:li:dataJob:(urn:li:dataFlow:(mssql,123.%28TEST%29,PROD),(TEST)) # CorpUser URN tests urn:li:corpuser:(part1,part2) diff --git a/metadata-ingestion/tests/unit/urns/invalid_urns_java_only.txt b/metadata-ingestion/tests/unit/urns/invalid_urns_java_only.txt new file mode 100644 index 00000000000000..46a01c4d08679d --- /dev/null +++ b/metadata-ingestion/tests/unit/urns/invalid_urns_java_only.txt @@ -0,0 +1,4 @@ +# Not yet handled by python + +# Invalid use of colon +urn:li:dataset:(urn:li:dataPlatform:abc:def,table_name,prod) \ No newline at end of file diff --git a/metadata-ingestion/tests/unit/urns/valid_urns.txt b/metadata-ingestion/tests/unit/urns/valid_urns.txt index 23205ec9a7235b..9b44c0d5e1bc4f 100644 --- a/metadata-ingestion/tests/unit/urns/valid_urns.txt +++ b/metadata-ingestion/tests/unit/urns/valid_urns.txt @@ -1,4 +1,4 @@ -# Unknown entity types become generic urns +# Unknown entity types become generic urns (does not apply to Java) urn:li:abc:foo urn:li:abc:(foo,bar) urn:li:abc:(urn:li:dataPlatform:abc,def,prod) @@ -6,8 +6,8 @@ urn:li:abc:(urn:li:dataPlatform:abc,def,prod) # A bunch of pretty normal urns urn:li:corpuser:foo urn:li:corpGroup:bar -urn:li:dataset:(urn:li:dataPlatform:abc,def/ghi,prod) -urn:li:dataFlow:(airflow,def,prod) +urn:li:dataset:(urn:li:dataPlatform:abc,def/ghi,PROD) +urn:li:dataFlow:(airflow,def,PROD) urn:li:dataJob:(urn:li:dataFlow:(airflow,flow_id,prod),job_id) urn:li:tag:abc urn:li:chart:(looker,chart_name) @@ -22,3 +22,14 @@ urn:li:tag:: urn:li:dashboard:(looker,dashboards.thelook::customer_lookup) urn:li:dataPlatform:abc:def urn:li:corpuser:foo:bar@example.com + +# From java test cases +urn:li:corpuser:foo:bar +urn:li:dataset:(urn:li:dataPlatform:hdfs,/path/to/data,PROD) +urn:li:dataPlatform:abc:def +urn:li:dataset:(urn:li:dataPlatform:s3,urn:li:dataset:%28urn:li:dataPlatform:s3%2Ctest-datalake-concepts/prog_maintenance%2CPROD%29,PROD) +urn:li:dataset:(urn:li:dataPlatform:bigquery,myproject.dataset.table,PROD) +urn:li:assertion:123=-%28__% weekly__%29 +urn:li:dataset:(urn:li:dataPlatform:s3,urn:li:dataset:%28urn:li:dataPlatform:s3%2Ctest-datalake-concepts%prog_maintenance%2CPROD%29,PROD) +urn:li:dataJob:(urn:li:dataFlow:(mssql,123.%28TEST%29,PROD),%28TEST%29) +urn:li:dashboard:(looker,dashboards.thelook::cohort_data_tool) diff --git a/metadata-utils/src/main/java/com/linkedin/metadata/utils/UrnValidationUtil.java b/metadata-utils/src/main/java/com/linkedin/metadata/utils/UrnValidationUtil.java index f5bbf3e093952d..5f7503977c9c46 100644 --- a/metadata-utils/src/main/java/com/linkedin/metadata/utils/UrnValidationUtil.java +++ b/metadata-utils/src/main/java/com/linkedin/metadata/utils/UrnValidationUtil.java @@ -1,7 +1,6 @@ package com.linkedin.metadata.utils; import com.linkedin.common.urn.Urn; -import com.linkedin.common.urn.UrnUtils; import com.linkedin.data.DataList; import com.linkedin.data.DataMap; import com.linkedin.metadata.Constants; @@ -11,9 +10,7 @@ import com.linkedin.metadata.models.annotation.UrnValidationAnnotation; import com.linkedin.metadata.models.registry.EntityRegistry; import java.net.URISyntaxException; -import java.net.URLDecoder; import java.net.URLEncoder; -import java.nio.charset.StandardCharsets; import java.util.Collection; import java.util.HashMap; import java.util.HashSet; @@ -32,8 +29,9 @@ public class UrnValidationUtil { // Related to BrowsePathv2 public static final String URN_DELIMITER_SEPARATOR = "␟"; // https://datahubproject.io/docs/what/urn/#restrictions - public static final Set ILLEGAL_URN_COMPONENT_CHARACTERS = Set.of("(", ")"); - public static final Set ILLEGAL_URN_TUPLE_CHARACTERS = Set.of(","); + public static final Set ILLEGAL_URN_CHARACTERS_PARENTHESES = Set.of("(", ")"); + // Commas are used as delimiters in tuple URNs, but not allowed in URN components + public static final Set ILLEGAL_URN_COMPONENT_DELIMITER = Set.of(","); private UrnValidationUtil() {} @@ -66,17 +64,29 @@ public static void validateUrn( "Error: URN cannot contain " + URN_DELIMITER_SEPARATOR + " character"); } - int totalParts = urn.getEntityKey().getParts().size(); - List illegalComponents = - urn.getEntityKey().getParts().stream() - .flatMap(part -> processUrnPartRecursively(part, totalParts)) - .collect(Collectors.toList()); + // Check if this is a simple (non-tuple) URN containing commas + if (urn.getEntityKey().getParts().size() == 1) { + String part = urn.getEntityKey().getParts().get(0); + if (part.contains(",")) { + if (strict) { + throw new IllegalArgumentException( + String.format( + "Simple URN %s contains comma character which is not allowed in non-tuple URNs", + urn)); + } else { + log.error( + "Simple URN {} contains comma character which is not allowed in non-tuple URNs", urn); + } + } + } + + // Validate all the URN parts + List illegalComponents = validateUrnComponents(urn); if (!illegalComponents.isEmpty()) { String message = String.format( - "Illegal `%s` characters detected in URN %s component(s): %s", - ILLEGAL_URN_COMPONENT_CHARACTERS, urn, illegalComponents); + "Illegal characters detected in URN %s component(s): %s", urn, illegalComponents); if (strict) { throw new IllegalArgumentException(message); @@ -92,25 +102,40 @@ public static void validateUrn( } } - /** Recursively process URN parts with URL decoding */ + /** Validates all components of a URN and returns a list of any illegal components. */ + private static List validateUrnComponents(Urn urn) { + int totalParts = urn.getEntityKey().getParts().size(); + + return urn.getEntityKey().getParts().stream() + .flatMap(part -> processUrnPartRecursively(part, totalParts)) + .collect(Collectors.toList()); + } + + /** Recursively process URN parts with URL decoding and check for illegal characters */ private static Stream processUrnPartRecursively(String urnPart, int totalParts) { - String decodedPart = - URLDecoder.decode(URLEncodingFixer.fixURLEncoding(urnPart), StandardCharsets.UTF_8); - if (decodedPart.startsWith("urn:li:")) { - // Recursively process nested URN after decoding - int nestedParts = UrnUtils.getUrn(decodedPart).getEntityKey().getParts().size(); - return UrnUtils.getUrn(decodedPart).getEntityKey().getParts().stream() - .flatMap(part -> processUrnPartRecursively(part, nestedParts)); + // If this part is a nested URN, don't check it directly for illegal characters + if (urnPart.startsWith("urn:li:")) { + return Stream.empty(); } + + // If this part has encoded parentheses, consider it valid + if (urnPart.contains("%28") || urnPart.contains("%29")) { + return Stream.empty(); + } + + // Check for unencoded parentheses in any part + if (ILLEGAL_URN_CHARACTERS_PARENTHESES.stream().anyMatch(c -> urnPart.contains(c))) { + return Stream.of(urnPart); + } + + // For tuple parts (URNs with multiple components), check for illegal commas within components if (totalParts > 1) { - if (ILLEGAL_URN_TUPLE_CHARACTERS.stream().anyMatch(c -> urnPart.contains(c))) { + if (ILLEGAL_URN_COMPONENT_DELIMITER.stream().anyMatch(c -> urnPart.contains(c))) { return Stream.of(urnPart); } } - if (ILLEGAL_URN_COMPONENT_CHARACTERS.stream().anyMatch(c -> urnPart.contains(c))) { - return Stream.of(urnPart); - } + // If we reach here, the part is valid return Stream.empty(); } @@ -207,53 +232,4 @@ public static class UrnValidationEntry { String urn; UrnValidationAnnotation annotation; } - - /** - * Fixes malformed URL encoding by escaping unescaped % characters while preserving valid - * percent-encoded sequences. - */ - private static class URLEncodingFixer { - /** - * @param input The potentially malformed URL-encoded string - * @return A string with proper URL encoding that can be safely decoded - */ - public static String fixURLEncoding(String input) { - if (input == null) { - return null; - } - - StringBuilder result = new StringBuilder(input.length() * 2); - int i = 0; - - while (i < input.length()) { - char currentChar = input.charAt(i); - - if (currentChar == '%') { - if (i + 2 < input.length()) { - // Check if the next two characters form a valid hex pair - String hexPair = input.substring(i + 1, i + 3); - if (isValidHexPair(hexPair)) { - // This is a valid percent-encoded sequence, keep it as is - result.append(currentChar); - } else { - // Invalid sequence, escape the % character - result.append("%25"); - } - } else { - // % at the end of string, escape it - result.append("%25"); - } - } else { - result.append(currentChar); - } - i++; - } - - return result.toString(); - } - - private static boolean isValidHexPair(String pair) { - return pair.matches("[0-9A-Fa-f]{2}"); - } - } } diff --git a/metadata-utils/src/test/java/com/linkedin/metadata/utils/UrnValidationUtilTest.java b/metadata-utils/src/test/java/com/linkedin/metadata/utils/UrnValidationUtilTest.java index da3832b8580ff9..65dd7c2ae00aca 100644 --- a/metadata-utils/src/test/java/com/linkedin/metadata/utils/UrnValidationUtilTest.java +++ b/metadata-utils/src/test/java/com/linkedin/metadata/utils/UrnValidationUtilTest.java @@ -4,66 +4,23 @@ import com.linkedin.common.urn.UrnUtils; import com.linkedin.metadata.models.registry.EntityRegistry; import io.datahubproject.test.metadata.context.TestOperationContexts; +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; import java.net.URISyntaxException; +import java.util.ArrayList; +import java.util.List; +import java.util.Set; +import javax.annotation.Nonnull; +import org.testng.Assert; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; public class UrnValidationUtilTest { private static final EntityRegistry entityRegistry = TestOperationContexts.defaultEntityRegistry(); - @Test - public void testValidateDatasetUrn() { - Urn validUrn = UrnUtils.getUrn("urn:li:dataset:(urn:li:dataPlatform:hdfs,/path/to/data,PROD)"); - UrnValidationUtil.validateUrn(entityRegistry, validUrn, true); - // If no exception is thrown, test passes - } - - @Test - public void testSimpleUrnColon() { - UrnValidationUtil.validateUrn(entityRegistry, UrnUtils.getUrn("urn:li:corpuser:foo:bar"), true); - UrnValidationUtil.validateUrn( - entityRegistry, UrnUtils.getUrn("urn:li:dataPlatform:abc:def"), true); - UrnValidationUtil.validateUrn( - entityRegistry, UrnUtils.getUrn("urn:li:corpuser:foo:bar@example.com"), true); - // If no exception is thrown, test passes - } - - @Test - public void testSimpleUrnComma() { - UrnValidationUtil.validateUrn(entityRegistry, UrnUtils.getUrn("urn:li:corpuser:,"), true); - // If no exception is thrown, test passes - } - - @Test(expectedExceptions = IllegalArgumentException.class) - public void testTupleUrnComma() { - UrnValidationUtil.validateUrn( - entityRegistry, UrnUtils.getUrn("urn:li:dashboard:(looker,dashboards,thelook)"), true); - } - - @Test(expectedExceptions = IllegalArgumentException.class) - public void testFabricTypeCasing() { - // prod != PROD - UrnValidationUtil.validateUrn( - entityRegistry, - UrnUtils.getUrn("urn:li:dataset:(urn:li:dataPlatform:abc:def,table_name,prod)"), - true); - } - - @Test - public void testComplexUrnColon() throws URISyntaxException { - Urn validUrn = - Urn.createFromString( - "urn:li:dataset:(urn:li:dataPlatform:s3,urn:li:dataset:%28urn:li:dataPlatform:s3%2Ctest-datalake-concepts/prog_maintenance%2CPROD%29,PROD)"); - UrnValidationUtil.validateUrn(entityRegistry, validUrn, true); - // If no exception is thrown, test passes - } - - @Test(expectedExceptions = IllegalArgumentException.class) - public void testFabricTypeParen() { - Urn invalidUrn = UrnUtils.getUrn("urn:li:dataset:(urn:li:dataPlatform:hdfs,/path/to/data,())"); - UrnValidationUtil.validateUrn(entityRegistry, invalidUrn, true); - } - @Test(expectedExceptions = IllegalArgumentException.class) public void testUrnWithTrailingWhitespace() { Urn invalidUrn = @@ -71,32 +28,6 @@ public void testUrnWithTrailingWhitespace() { UrnValidationUtil.validateUrn(entityRegistry, invalidUrn, true); } - @Test(expectedExceptions = IllegalArgumentException.class) - public void testUrnWithIllegalDelimiter() { - Urn invalidUrn = UrnUtils.getUrn("urn:li:dataset:(urn:li:dataPlatform:hdfs␟path,PROD)"); - UrnValidationUtil.validateUrn(entityRegistry, invalidUrn, true); - } - - @Test(expectedExceptions = IllegalArgumentException.class) - public void testComplexUrnWithParens1() { - Urn invalidUrn = UrnUtils.getUrn("urn:li:dataset:(urn:li:dataPlatform:hdfs,(illegal),PROD)"); - UrnValidationUtil.validateUrn(entityRegistry, invalidUrn, true); - } - - @Test(expectedExceptions = IllegalArgumentException.class) - public void testComplexUrnWithParens2() { - Urn invalidUrn = - UrnUtils.getUrn( - "urn:li:dataJob:(urn:li:dataFlow:(mssql,1/2/3/4.c_n on %28LOCAL%29,PROD),1/2/3/4.c_n on (LOCAL))"); - UrnValidationUtil.validateUrn(entityRegistry, invalidUrn, true); - } - - @Test(expectedExceptions = IllegalArgumentException.class) - public void testSimpleUrnWithParens() { - Urn invalidUrn = UrnUtils.getUrn("urn:li:corpuser:(foo)123"); - UrnValidationUtil.validateUrn(entityRegistry, invalidUrn, true); - } - @Test(expectedExceptions = IllegalArgumentException.class) public void testExcessiveLength() { StringBuilder longPath = new StringBuilder("urn:li:dataset:(urn:li:dataPlatform:hdfs,"); @@ -110,52 +41,166 @@ public void testExcessiveLength() { UrnValidationUtil.validateUrn(entityRegistry, invalidUrn, true); } - @Test - public void testValidComplexUrn() { - Urn validUrn = - UrnUtils.getUrn( - "urn:li:dataset:(urn:li:dataPlatform:bigquery,myproject.dataset.table,PROD)"); - - UrnValidationUtil.validateUrn(entityRegistry, validUrn, true); - // If no exception is thrown, test passes - } - @Test(expectedExceptions = NullPointerException.class) public void testUrnNull() { UrnValidationUtil.validateUrn(entityRegistry, null, true); } - @Test - public void testValidPartialUrlEncode() { - Urn validUrn = UrnUtils.getUrn("urn:li:assertion:123=-%28__% weekly__%29"); + /** + * Common method to validate URNs from a file and return the validation results. + * + * @param filePath Path to the file containing URNs. + * @return ValidationResult containing lists of valid and invalid URNs. + * @throws IOException If there is an error reading the file. + */ + private ValidationResult validateUrnsFromFile( + @Nonnull String filePath, @Nonnull Set excludePrefix) + throws IOException, URISyntaxException { + List invalidUrns = new ArrayList<>(); + List validUrns = new ArrayList<>(); + int totalUrns = 0; + + File file = new File(filePath); + BufferedReader reader = new BufferedReader(new FileReader(file)); + String line; + + while ((line = reader.readLine()) != null) { + // Skip empty lines and comment lines (starting with #) + line = line.trim(); + final String excludeCheck = line; + if (line.isEmpty() + || line.startsWith("#") + || excludePrefix.stream().anyMatch(excludeCheck::startsWith)) { + continue; + } + + totalUrns++; + + try { + Urn urn = UrnUtils.getUrn(line); + UrnValidationUtil.validateUrn(entityRegistry, urn, true); + validUrns.add(line); + } catch (Exception e) { + invalidUrns.add(line + " - Error: " + e.getMessage()); + } + + if (validUrns.contains(line)) { + // If valid should also parse correctly + Urn.createFromString(line); + } + } - UrnValidationUtil.validateUrn(entityRegistry, validUrn, true); - // If no exception is thrown, test passes - } + reader.close(); - @Test - public void testValidPartialUrlEncode2() { - Urn validUrn = - UrnUtils.getUrn( - "urn:li:dataset:(urn:li:dataPlatform:s3,urn:li:dataset:%28urn:li:dataPlatform:s3%2Ctest-datalake-concepts%prog_maintenance%2CPROD%29,PROD)"); + // Print summary + System.out.println("File: " + filePath); + System.out.println("Total URNs processed: " + totalUrns); + System.out.println("Valid URNs: " + validUrns.size()); + System.out.println("Invalid URNs: " + invalidUrns.size()); - UrnValidationUtil.validateUrn(entityRegistry, validUrn, true); - // If no exception is thrown, test passes + return new ValidationResult(validUrns, invalidUrns, totalUrns); } - @Test - public void testValidColon() { - Urn validUrn = - UrnUtils.getUrn("urn:li:dashboard:(looker,dashboards.thelook::cohort_data_tool)"); + /** + * Test method to validate URNs from a file containing valid URNs. Expects all URNs in the file to + * be valid. + * + * @param filePath Path to the file containing valid URNs. + */ + @Test(dataProvider = "validUrnFilePathProvider") + public void testValidateValidUrnsFromFile(String filePath) throws URISyntaxException { + try { + ValidationResult result = validateUrnsFromFile(filePath, Set.of("urn:li:abc:")); + + // Print invalid URNs if any exist + if (!result.getInvalidUrns().isEmpty()) { + System.out.println("Invalid URNs found in valid URN file:"); + result.getInvalidUrns().forEach(System.out::println); + Assert.fail("Found " + result.getInvalidUrns().size() + " invalid URNs in valid URN file"); + } - UrnValidationUtil.validateUrn(entityRegistry, validUrn, true); - // If no exception is thrown, test passes + // Assert that we have at least one test case + Assert.assertTrue(result.getValidUrns().size() > 0, "No valid URNs found in the file"); + + } catch (IOException e) { + Assert.fail("Failed to read the file: " + e.getMessage()); + } } - @Test - public void testNoTupleComma() { - Urn invalidUrn = UrnUtils.getUrn("urn:li:corpuser:,"); - UrnValidationUtil.validateUrn(entityRegistry, invalidUrn, true); - // If no exception is thrown, test passes + /** + * Test method to validate URNs from a file containing invalid URNs. Expects all URNs in the file + * to be invalid. + * + * @param filePath Path to the file containing invalid URNs. + */ + @Test(dataProvider = "invalidUrnFilePathProvider") + public void testValidateInvalidUrnsFromFile(String filePath) throws URISyntaxException { + try { + ValidationResult result = validateUrnsFromFile(filePath, Set.of()); + + // Print valid URNs if any exist + if (!result.getValidUrns().isEmpty()) { + System.out.println("Valid URNs found in invalid URN file:"); + result.getValidUrns().forEach(System.out::println); + Assert.fail("Found " + result.getValidUrns().size() + " valid URNs in invalid URN file"); + } + + // Assert that we have at least one test case + Assert.assertTrue(result.getInvalidUrns().size() > 0, "No invalid URNs found in the file"); + + } catch (IOException e) { + Assert.fail("Failed to read the file: " + e.getMessage()); + } + } + + /** + * Data provider for the valid URN file paths. + * + * @return Array of test data + */ + @DataProvider(name = "validUrnFilePathProvider") + public Object[][] validUrnFilePathProvider() { + return new Object[][] {{"../metadata-ingestion/tests/unit/urns/valid_urns.txt"} + // Add more test files as needed + }; + } + + /** + * Data provider for the invalid URN file paths. + * + * @return Array of test data + */ + @DataProvider(name = "invalidUrnFilePathProvider") + public Object[][] invalidUrnFilePathProvider() { + return new Object[][] { + {"../metadata-ingestion/tests/unit/urns/invalid_urns.txt"}, + {"../metadata-ingestion/tests/unit/urns/invalid_urns_java_only.txt"} + // Add more test files as needed + }; + } + + /** Class to hold validation results. */ + private static class ValidationResult { + private final List validUrns; + private final List invalidUrns; + private final int totalUrns; + + public ValidationResult(List validUrns, List invalidUrns, int totalUrns) { + this.validUrns = validUrns; + this.invalidUrns = invalidUrns; + this.totalUrns = totalUrns; + } + + public List getValidUrns() { + return validUrns; + } + + public List getInvalidUrns() { + return invalidUrns; + } + + public int getTotalUrns() { + return totalUrns; + } } }