fix(urn-validation): additional test cases for urn validation (#12727)

datahub-project · Feb 25, 2025 · 2396f0e · 2396f0e
1 parent ffe2278
commit 2396f0e
Show file tree

Hide file tree

Showing 5 changed files with 233 additions and 189 deletions.
diff --git a/metadata-ingestion/tests/unit/urns/invalid_urns.txt b/metadata-ingestion/tests/unit/urns/invalid_urns.txt
@@ -8,6 +8,14 @@ urn:li:corpuser:abc)
 # Reserved characters
 urn:li:corpuser:foo␟bar
 urn:li:tag:a,b,c
+urn:li:corpuser:,
+urn:li:dataset:(urn:li:dataPlatform:hdfs␟path,PROD)
+urn:li:dashboard:(looker,dashboards,thelook)
+urn:li:dataset:(urn:li:dataPlatform:hdfs,/path/to/data,())
+urn:li:dataset:(urn:li:dataPlatform:hdfs,(illegal),PROD)
+urn:li:corpuser:(foo)123
+urn:li:dataJob:(urn:li:dataFlow:(mssql,1/2/3/4.c_n on %28LOCAL%29,PROD),1/2/3/4.c_n on (LOCAL))
+urn:li:dataJob:(urn:li:dataFlow:(mssql,123.%28TEST%29,PROD),(TEST))
 
 # CorpUser URN tests
 urn:li:corpuser:(part1,part2)

diff --git a/metadata-ingestion/tests/unit/urns/invalid_urns_java_only.txt b/metadata-ingestion/tests/unit/urns/invalid_urns_java_only.txt
@@ -0,0 +1,4 @@
+# Not yet handled by python
+
+# Invalid use of colon
+urn:li:dataset:(urn:li:dataPlatform:abc:def,table_name,prod)
diff --git a/metadata-ingestion/tests/unit/urns/valid_urns.txt b/metadata-ingestion/tests/unit/urns/valid_urns.txt
@@ -1,13 +1,13 @@
-# Unknown entity types become generic urns
+# Unknown entity types become generic urns (does not apply to Java)
 urn:li:abc:foo
 urn:li:abc:(foo,bar)
 urn:li:abc:(urn:li:dataPlatform:abc,def,prod)
 
 # A bunch of pretty normal urns
 urn:li:corpuser:foo
 urn:li:corpGroup:bar
-urn:li:dataset:(urn:li:dataPlatform:abc,def/ghi,prod)
-urn:li:dataFlow:(airflow,def,prod)
+urn:li:dataset:(urn:li:dataPlatform:abc,def/ghi,PROD)
+urn:li:dataFlow:(airflow,def,PROD)
 urn:li:dataJob:(urn:li:dataFlow:(airflow,flow_id,prod),job_id)
 urn:li:tag:abc
 urn:li:chart:(looker,chart_name)
@@ -22,3 +22,14 @@ urn:li:tag::
 urn:li:dashboard:(looker,dashboards.thelook::customer_lookup)
 urn:li:dataPlatform:abc:def
 urn:li:corpuser:foo:bar@example.com
+
+# From java test cases
+urn:li:corpuser:foo:bar
+urn:li:dataset:(urn:li:dataPlatform:hdfs,/path/to/data,PROD)
+urn:li:dataPlatform:abc:def
+urn:li:dataset:(urn:li:dataPlatform:s3,urn:li:dataset:%28urn:li:dataPlatform:s3%2Ctest-datalake-concepts/prog_maintenance%2CPROD%29,PROD)
+urn:li:dataset:(urn:li:dataPlatform:bigquery,myproject.dataset.table,PROD)
+urn:li:assertion:123=-%28__% weekly__%29
+urn:li:dataset:(urn:li:dataPlatform:s3,urn:li:dataset:%28urn:li:dataPlatform:s3%2Ctest-datalake-concepts%prog_maintenance%2CPROD%29,PROD)
+urn:li:dataJob:(urn:li:dataFlow:(mssql,123.%28TEST%29,PROD),%28TEST%29)
+urn:li:dashboard:(looker,dashboards.thelook::cohort_data_tool)
diff --git a/metadata-utils/src/main/java/com/linkedin/metadata/utils/UrnValidationUtil.java b/metadata-utils/src/main/java/com/linkedin/metadata/utils/UrnValidationUtil.java
@@ -1,7 +1,6 @@
 package com.linkedin.metadata.utils;
 
 import com.linkedin.common.urn.Urn;
-import com.linkedin.common.urn.UrnUtils;
 import com.linkedin.data.DataList;
 import com.linkedin.data.DataMap;
 import com.linkedin.metadata.Constants;
@@ -11,9 +10,7 @@
 import com.linkedin.metadata.models.annotation.UrnValidationAnnotation;
 import com.linkedin.metadata.models.registry.EntityRegistry;
 import java.net.URISyntaxException;
-import java.net.URLDecoder;
 import java.net.URLEncoder;
-import java.nio.charset.StandardCharsets;
 import java.util.Collection;
 import java.util.HashMap;
 import java.util.HashSet;
@@ -32,8 +29,9 @@ public class UrnValidationUtil {
   // Related to BrowsePathv2
   public static final String URN_DELIMITER_SEPARATOR = "␟";
   // https://datahubproject.io/docs/what/urn/#restrictions
-  public static final Set<String> ILLEGAL_URN_COMPONENT_CHARACTERS = Set.of("(", ")");
-  public static final Set<String> ILLEGAL_URN_TUPLE_CHARACTERS = Set.of(",");
+  public static final Set<String> ILLEGAL_URN_CHARACTERS_PARENTHESES = Set.of("(", ")");
+  // Commas are used as delimiters in tuple URNs, but not allowed in URN components
+  public static final Set<String> ILLEGAL_URN_COMPONENT_DELIMITER = Set.of(",");
 
   private UrnValidationUtil() {}
 
@@ -66,17 +64,29 @@ public static void validateUrn(
           "Error: URN cannot contain " + URN_DELIMITER_SEPARATOR + " character");
     }
 
-    int totalParts = urn.getEntityKey().getParts().size();
-    List<String> illegalComponents =
-        urn.getEntityKey().getParts().stream()
-            .flatMap(part -> processUrnPartRecursively(part, totalParts))
-            .collect(Collectors.toList());
+    // Check if this is a simple (non-tuple) URN containing commas
+    if (urn.getEntityKey().getParts().size() == 1) {
+      String part = urn.getEntityKey().getParts().get(0);
+      if (part.contains(",")) {
+        if (strict) {
+          throw new IllegalArgumentException(
+              String.format(
+                  "Simple URN %s contains comma character which is not allowed in non-tuple URNs",
+                  urn));
+        } else {
+          log.error(
+              "Simple URN {} contains comma character which is not allowed in non-tuple URNs", urn);
+        }
+      }
+    }
+
+    // Validate all the URN parts
+    List<String> illegalComponents = validateUrnComponents(urn);
 
     if (!illegalComponents.isEmpty()) {
       String message =
           String.format(
-              "Illegal `%s` characters detected in URN %s component(s): %s",
-              ILLEGAL_URN_COMPONENT_CHARACTERS, urn, illegalComponents);
+              "Illegal characters detected in URN %s component(s): %s", urn, illegalComponents);
 
       if (strict) {
         throw new IllegalArgumentException(message);
@@ -92,25 +102,40 @@ public static void validateUrn(
     }
   }
 
-  /** Recursively process URN parts with URL decoding */
+  /** Validates all components of a URN and returns a list of any illegal components. */
+  private static List<String> validateUrnComponents(Urn urn) {
+    int totalParts = urn.getEntityKey().getParts().size();
+
+    return urn.getEntityKey().getParts().stream()
+        .flatMap(part -> processUrnPartRecursively(part, totalParts))
+        .collect(Collectors.toList());
+  }
+
+  /** Recursively process URN parts with URL decoding and check for illegal characters */
   private static Stream<String> processUrnPartRecursively(String urnPart, int totalParts) {
-    String decodedPart =
-        URLDecoder.decode(URLEncodingFixer.fixURLEncoding(urnPart), StandardCharsets.UTF_8);
-    if (decodedPart.startsWith("urn:li:")) {
-      // Recursively process nested URN after decoding
-      int nestedParts = UrnUtils.getUrn(decodedPart).getEntityKey().getParts().size();
-      return UrnUtils.getUrn(decodedPart).getEntityKey().getParts().stream()
-          .flatMap(part -> processUrnPartRecursively(part, nestedParts));
+    // If this part is a nested URN, don't check it directly for illegal characters
+    if (urnPart.startsWith("urn:li:")) {
+      return Stream.empty();
     }
+
+    // If this part has encoded parentheses, consider it valid
+    if (urnPart.contains("%28") || urnPart.contains("%29")) {
+      return Stream.empty();
+    }
+
+    // Check for unencoded parentheses in any part
+    if (ILLEGAL_URN_CHARACTERS_PARENTHESES.stream().anyMatch(c -> urnPart.contains(c))) {
+      return Stream.of(urnPart);
+    }
+
+    // For tuple parts (URNs with multiple components), check for illegal commas within components
     if (totalParts > 1) {
-      if (ILLEGAL_URN_TUPLE_CHARACTERS.stream().anyMatch(c -> urnPart.contains(c))) {
+      if (ILLEGAL_URN_COMPONENT_DELIMITER.stream().anyMatch(c -> urnPart.contains(c))) {
         return Stream.of(urnPart);
       }
     }
-    if (ILLEGAL_URN_COMPONENT_CHARACTERS.stream().anyMatch(c -> urnPart.contains(c))) {
-      return Stream.of(urnPart);
-    }
 
+    // If we reach here, the part is valid
     return Stream.empty();
   }
 
@@ -207,53 +232,4 @@ public static class UrnValidationEntry {
     String urn;
     UrnValidationAnnotation annotation;
   }
-
-  /**
-   * Fixes malformed URL encoding by escaping unescaped % characters while preserving valid
-   * percent-encoded sequences.
-   */
-  private static class URLEncodingFixer {
-    /**
-     * @param input The potentially malformed URL-encoded string
-     * @return A string with proper URL encoding that can be safely decoded
-     */
-    public static String fixURLEncoding(String input) {
-      if (input == null) {
-        return null;
-      }
-
-      StringBuilder result = new StringBuilder(input.length() * 2);
-      int i = 0;
-
-      while (i < input.length()) {
-        char currentChar = input.charAt(i);
-
-        if (currentChar == '%') {
-          if (i + 2 < input.length()) {
-            // Check if the next two characters form a valid hex pair
-            String hexPair = input.substring(i + 1, i + 3);
-            if (isValidHexPair(hexPair)) {
-              // This is a valid percent-encoded sequence, keep it as is
-              result.append(currentChar);
-            } else {
-              // Invalid sequence, escape the % character
-              result.append("%25");
-            }
-          } else {
-            // % at the end of string, escape it
-            result.append("%25");
-          }
-        } else {
-          result.append(currentChar);
-        }
-        i++;
-      }
-
-      return result.toString();
-    }
-
-    private static boolean isValidHexPair(String pair) {
-      return pair.matches("[0-9A-Fa-f]{2}");
-    }
-  }
 }