Skip to content

Commit

Permalink
fix(urn-validation): additional test cases for urn validation (#12727)
Browse files Browse the repository at this point in the history
  • Loading branch information
david-leifker authored Feb 25, 2025
1 parent ffe2278 commit 2396f0e
Show file tree
Hide file tree
Showing 5 changed files with 233 additions and 189 deletions.
8 changes: 8 additions & 0 deletions metadata-ingestion/tests/unit/urns/invalid_urns.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,14 @@ urn:li:corpuser:abc)
# Reserved characters
urn:li:corpuser:foo␟bar
urn:li:tag:a,b,c
urn:li:corpuser:,
urn:li:dataset:(urn:li:dataPlatform:hdfs␟path,PROD)
urn:li:dashboard:(looker,dashboards,thelook)
urn:li:dataset:(urn:li:dataPlatform:hdfs,/path/to/data,())
urn:li:dataset:(urn:li:dataPlatform:hdfs,(illegal),PROD)
urn:li:corpuser:(foo)123
urn:li:dataJob:(urn:li:dataFlow:(mssql,1/2/3/4.c_n on %28LOCAL%29,PROD),1/2/3/4.c_n on (LOCAL))
urn:li:dataJob:(urn:li:dataFlow:(mssql,123.%28TEST%29,PROD),(TEST))

# CorpUser URN tests
urn:li:corpuser:(part1,part2)
Expand Down
4 changes: 4 additions & 0 deletions metadata-ingestion/tests/unit/urns/invalid_urns_java_only.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Not yet handled by python

# Invalid use of colon
urn:li:dataset:(urn:li:dataPlatform:abc:def,table_name,prod)
17 changes: 14 additions & 3 deletions metadata-ingestion/tests/unit/urns/valid_urns.txt
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
# Unknown entity types become generic urns
# Unknown entity types become generic urns (does not apply to Java)
urn:li:abc:foo
urn:li:abc:(foo,bar)
urn:li:abc:(urn:li:dataPlatform:abc,def,prod)

# A bunch of pretty normal urns
urn:li:corpuser:foo
urn:li:corpGroup:bar
urn:li:dataset:(urn:li:dataPlatform:abc,def/ghi,prod)
urn:li:dataFlow:(airflow,def,prod)
urn:li:dataset:(urn:li:dataPlatform:abc,def/ghi,PROD)
urn:li:dataFlow:(airflow,def,PROD)
urn:li:dataJob:(urn:li:dataFlow:(airflow,flow_id,prod),job_id)
urn:li:tag:abc
urn:li:chart:(looker,chart_name)
Expand All @@ -22,3 +22,14 @@ urn:li:tag::
urn:li:dashboard:(looker,dashboards.thelook::customer_lookup)
urn:li:dataPlatform:abc:def
urn:li:corpuser:foo:bar@example.com

# From java test cases
urn:li:corpuser:foo:bar
urn:li:dataset:(urn:li:dataPlatform:hdfs,/path/to/data,PROD)
urn:li:dataPlatform:abc:def
urn:li:dataset:(urn:li:dataPlatform:s3,urn:li:dataset:%28urn:li:dataPlatform:s3%2Ctest-datalake-concepts/prog_maintenance%2CPROD%29,PROD)
urn:li:dataset:(urn:li:dataPlatform:bigquery,myproject.dataset.table,PROD)
urn:li:assertion:123=-%28__% weekly__%29
urn:li:dataset:(urn:li:dataPlatform:s3,urn:li:dataset:%28urn:li:dataPlatform:s3%2Ctest-datalake-concepts%prog_maintenance%2CPROD%29,PROD)
urn:li:dataJob:(urn:li:dataFlow:(mssql,123.%28TEST%29,PROD),%28TEST%29)
urn:li:dashboard:(looker,dashboards.thelook::cohort_data_tool)
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
package com.linkedin.metadata.utils;

import com.linkedin.common.urn.Urn;
import com.linkedin.common.urn.UrnUtils;
import com.linkedin.data.DataList;
import com.linkedin.data.DataMap;
import com.linkedin.metadata.Constants;
Expand All @@ -11,9 +10,7 @@
import com.linkedin.metadata.models.annotation.UrnValidationAnnotation;
import com.linkedin.metadata.models.registry.EntityRegistry;
import java.net.URISyntaxException;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
Expand All @@ -32,8 +29,9 @@ public class UrnValidationUtil {
// Related to BrowsePathv2
public static final String URN_DELIMITER_SEPARATOR = "␟";
// https://datahubproject.io/docs/what/urn/#restrictions
public static final Set<String> ILLEGAL_URN_COMPONENT_CHARACTERS = Set.of("(", ")");
public static final Set<String> ILLEGAL_URN_TUPLE_CHARACTERS = Set.of(",");
public static final Set<String> ILLEGAL_URN_CHARACTERS_PARENTHESES = Set.of("(", ")");
// Commas are used as delimiters in tuple URNs, but not allowed in URN components
public static final Set<String> ILLEGAL_URN_COMPONENT_DELIMITER = Set.of(",");

private UrnValidationUtil() {}

Expand Down Expand Up @@ -66,17 +64,29 @@ public static void validateUrn(
"Error: URN cannot contain " + URN_DELIMITER_SEPARATOR + " character");
}

int totalParts = urn.getEntityKey().getParts().size();
List<String> illegalComponents =
urn.getEntityKey().getParts().stream()
.flatMap(part -> processUrnPartRecursively(part, totalParts))
.collect(Collectors.toList());
// Check if this is a simple (non-tuple) URN containing commas
if (urn.getEntityKey().getParts().size() == 1) {
String part = urn.getEntityKey().getParts().get(0);
if (part.contains(",")) {
if (strict) {
throw new IllegalArgumentException(
String.format(
"Simple URN %s contains comma character which is not allowed in non-tuple URNs",
urn));
} else {
log.error(
"Simple URN {} contains comma character which is not allowed in non-tuple URNs", urn);
}
}
}

// Validate all the URN parts
List<String> illegalComponents = validateUrnComponents(urn);

if (!illegalComponents.isEmpty()) {
String message =
String.format(
"Illegal `%s` characters detected in URN %s component(s): %s",
ILLEGAL_URN_COMPONENT_CHARACTERS, urn, illegalComponents);
"Illegal characters detected in URN %s component(s): %s", urn, illegalComponents);

if (strict) {
throw new IllegalArgumentException(message);
Expand All @@ -92,25 +102,40 @@ public static void validateUrn(
}
}

/** Recursively process URN parts with URL decoding */
/** Validates all components of a URN and returns a list of any illegal components. */
private static List<String> validateUrnComponents(Urn urn) {
int totalParts = urn.getEntityKey().getParts().size();

return urn.getEntityKey().getParts().stream()
.flatMap(part -> processUrnPartRecursively(part, totalParts))
.collect(Collectors.toList());
}

/** Recursively process URN parts with URL decoding and check for illegal characters */
private static Stream<String> processUrnPartRecursively(String urnPart, int totalParts) {
String decodedPart =
URLDecoder.decode(URLEncodingFixer.fixURLEncoding(urnPart), StandardCharsets.UTF_8);
if (decodedPart.startsWith("urn:li:")) {
// Recursively process nested URN after decoding
int nestedParts = UrnUtils.getUrn(decodedPart).getEntityKey().getParts().size();
return UrnUtils.getUrn(decodedPart).getEntityKey().getParts().stream()
.flatMap(part -> processUrnPartRecursively(part, nestedParts));
// If this part is a nested URN, don't check it directly for illegal characters
if (urnPart.startsWith("urn:li:")) {
return Stream.empty();
}

// If this part has encoded parentheses, consider it valid
if (urnPart.contains("%28") || urnPart.contains("%29")) {
return Stream.empty();
}

// Check for unencoded parentheses in any part
if (ILLEGAL_URN_CHARACTERS_PARENTHESES.stream().anyMatch(c -> urnPart.contains(c))) {
return Stream.of(urnPart);
}

// For tuple parts (URNs with multiple components), check for illegal commas within components
if (totalParts > 1) {
if (ILLEGAL_URN_TUPLE_CHARACTERS.stream().anyMatch(c -> urnPart.contains(c))) {
if (ILLEGAL_URN_COMPONENT_DELIMITER.stream().anyMatch(c -> urnPart.contains(c))) {
return Stream.of(urnPart);
}
}
if (ILLEGAL_URN_COMPONENT_CHARACTERS.stream().anyMatch(c -> urnPart.contains(c))) {
return Stream.of(urnPart);
}

// If we reach here, the part is valid
return Stream.empty();
}

Expand Down Expand Up @@ -207,53 +232,4 @@ public static class UrnValidationEntry {
String urn;
UrnValidationAnnotation annotation;
}

/**
* Fixes malformed URL encoding by escaping unescaped % characters while preserving valid
* percent-encoded sequences.
*/
private static class URLEncodingFixer {
/**
* @param input The potentially malformed URL-encoded string
* @return A string with proper URL encoding that can be safely decoded
*/
public static String fixURLEncoding(String input) {
if (input == null) {
return null;
}

StringBuilder result = new StringBuilder(input.length() * 2);
int i = 0;

while (i < input.length()) {
char currentChar = input.charAt(i);

if (currentChar == '%') {
if (i + 2 < input.length()) {
// Check if the next two characters form a valid hex pair
String hexPair = input.substring(i + 1, i + 3);
if (isValidHexPair(hexPair)) {
// This is a valid percent-encoded sequence, keep it as is
result.append(currentChar);
} else {
// Invalid sequence, escape the % character
result.append("%25");
}
} else {
// % at the end of string, escape it
result.append("%25");
}
} else {
result.append(currentChar);
}
i++;
}

return result.toString();
}

private static boolean isValidHexPair(String pair) {
return pair.matches("[0-9A-Fa-f]{2}");
}
}
}
Loading

0 comments on commit 2396f0e

Please sign in to comment.