Skip to content

Commit

Permalink
feat(ingestion-tracing): implement ingestion integration with tracing…
Browse files Browse the repository at this point in the history
… api
  • Loading branch information
david-leifker committed Feb 23, 2025
1 parent 7bee19c commit b8f315b
Show file tree
Hide file tree
Showing 8 changed files with 1,502 additions and 16 deletions.
188 changes: 188 additions & 0 deletions metadata-ingestion/src/datahub/emitter/openapi_emitter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
import json
import logging
from collections import defaultdict
from dataclasses import dataclass
from datetime import timedelta
from typing import Dict, List, Optional, Sequence, Union

from requests import Response

from datahub.cli.cli_utils import ensure_has_system_metadata
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.emitter.openapi_tracer import OpenAPITrace
from datahub.emitter.response_helper import extract_trace_data
from datahub.emitter.rest_emitter import (
_DATAHUB_EMITTER_TRACE,
BATCH_INGEST_MAX_PAYLOAD_LENGTH,
INGEST_MAX_PAYLOAD_BYTES,
DataHubRestEmitter,
)
from datahub.emitter.serialization_helper import pre_json_transform
from datahub.metadata.com.linkedin.pegasus2avro.mxe import (
MetadataChangeProposal,
)

# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)


@dataclass
class Chunk:
    """Accumulate pre-serialized JSON items and their total encoded size.

    The chunk itself enforces no size or count limit; the caller inspects
    ``items`` and ``total_bytes`` and decides when to start a new chunk.
    """

    items: List[str]  # serialized JSON items, in insertion order
    total_bytes: int = 0  # sum of the encoded byte lengths of ``items``

    def add_item(self, item: str) -> bool:
        """Append ``item`` and add its encoded byte length to ``total_bytes``.

        Args:
            item: An already-serialized JSON value.

        Returns:
            Always ``True``; the item is accepted unconditionally.
        """
        self.items.append(item)
        self.total_bytes += len(item.encode())
        return True

    @staticmethod
    def join(chunk: "Chunk") -> str:
        """Return the chunk's items rendered as a JSON array string."""
        return "[" + ",".join(chunk.items) + "]"

Check warning on line 45 in metadata-ingestion/src/datahub/emitter/openapi_emitter.py

View check run for this annotation

Codecov / codecov/patch

metadata-ingestion/src/datahub/emitter/openapi_emitter.py#L45

Added line #L45 was not covered by tests


class DataHubOpenApiEmitter(DataHubRestEmitter, OpenAPITrace):
    """Emitter that writes metadata through the ``/openapi/v3/entity`` endpoints.

    Request sending (``_emit_generic``) is inherited from ``DataHubRestEmitter``
    and write-status polling (``await_status``) from ``OpenAPITrace``.
    """

    def __init__(
        self,
        gms_server: str,
        token: Optional[str] = None,
        timeout_sec: Optional[float] = None,
        connect_timeout_sec: Optional[float] = None,
        read_timeout_sec: Optional[float] = None,
        retry_status_codes: Optional[List[int]] = None,
        retry_methods: Optional[List[str]] = None,
        retry_max_times: Optional[int] = None,
        extra_headers: Optional[Dict[str, str]] = None,
        ca_certificate_path: Optional[str] = None,
        client_certificate_path: Optional[str] = None,
        disable_ssl_verification: bool = False,
        default_trace_mode: bool = False,
    ):
        """Forward every argument, unchanged and in order, to the parent emitter."""
        super().__init__(
            gms_server,
            token,
            timeout_sec,
            connect_timeout_sec,
            read_timeout_sec,
            retry_status_codes,
            retry_methods,
            retry_max_times,
            extra_headers,
            ca_certificate_path,
            client_certificate_path,
            disable_ssl_verification,
            default_trace_mode,
        )

    def _emit_generic(self, url: str, payload: Union[dict, list]) -> Response:
        """JSON-encode ``payload`` and send it with the parent implementation.

        Args:
            url: Target endpoint.
            payload: JSON-serializable body (``emit_mcp`` passes a list).

        Returns:
            The response returned by the parent ``_emit_generic``.
        """
        return super()._emit_generic(url, payload=json.dumps(payload))

    def _to_request(
        self,
        mcp: Union[MetadataChangeProposal, MetadataChangeProposalWrapper],
        async_flag: Optional[bool] = None,
        async_default: bool = False,
    ) -> tuple:
        """Build the ``(url, payload)`` pair for a single MCP.

        Args:
            mcp: The proposal to convert. System metadata is ensured on it via
                ``ensure_has_system_metadata`` before serialization.
            async_flag: Explicit async mode; ``None`` defers to ``async_default``.
            async_default: Mode used when ``async_flag`` is ``None``.

        Returns:
            A 2-tuple of the entity endpoint URL (including the ``async`` query
            parameter) and a one-element list holding the request item.
        """
        use_async = async_default if async_flag is None else async_flag
        async_param = "true" if use_async else "false"
        url = (
            f"{self._gms_server}/openapi/v3/entity/{mcp.entityType}"
            f"?async={async_param}"
        )

        ensure_has_system_metadata(mcp)
        aspect_value = pre_json_transform(mcp.aspect.to_obj())
        item = {
            "urn": mcp.entityUrn,
            mcp.aspectName: {
                "value": aspect_value,
                "systemMetadata": mcp.systemMetadata.to_obj(),
            },
        }
        return (url, [item])

    def emit_mcp(
        self,
        mcp: Union[MetadataChangeProposal, MetadataChangeProposalWrapper],
        async_flag: Optional[bool] = None,
        trace_flag: Optional[bool] = None,
        trace_timeout: Optional[timedelta] = timedelta(seconds=3600),
    ) -> None:
        """Emit a single MCP and optionally wait for its write status.

        Args:
            mcp: The proposal to transmit.
            async_flag: Explicit async mode; when ``None`` the request is built
                with ``async=false``.
            trace_flag: Passed to ``self._should_trace`` to decide whether to
                wait for the write to be verified.
            trace_timeout: Passed through to ``await_status`` when tracing.
        """
        url, payload = self._to_request(mcp, async_flag)
        response = self._emit_generic(url, payload=payload)

        if self._should_trace(async_flag, trace_flag):
            trace_data = extract_trace_data(response) if response else None
            if trace_data:
                self.await_status([trace_data], trace_timeout)

    def emit_mcps(
        self,
        mcps: Sequence[Union[MetadataChangeProposal, MetadataChangeProposalWrapper]],
        async_flag: Optional[bool] = None,
        trace_flag: Optional[bool] = None,
        trace_timeout: Optional[timedelta] = timedelta(seconds=3600),
    ) -> int:
        """Emit a batch of MCPs, grouped by endpoint URL and split into chunks.

        Processing steps:

        1. Each MCP is converted with ``_to_request`` (async by default) and
           grouped by its endpoint URL.
        2. Within a group, items are packed into ``Chunk`` objects. A new chunk
           is started when adding the item would push the joined JSON array
           past ``INGEST_MAX_PAYLOAD_BYTES`` or the chunk already holds
           ``BATCH_INGEST_MAX_PAYLOAD_LENGTH`` items. An empty chunk always
           accepts one item, even if that item alone exceeds the byte limit.
        3. Each item is serialized once; chunks are sent as a JSON array built
           by string concatenation of the already-serialized items.

        Args:
            mcps: Metadata change proposals to transmit.
            async_flag: Explicit async mode; ``None`` means async.
            trace_flag: Passed to ``self._should_trace`` to decide whether to
                wait for the writes to be verified.
            trace_timeout: Passed through to ``await_status`` when tracing.

        Returns:
            The number of HTTP requests (chunks) sent, not the number of MCPs.
        """
        if _DATAHUB_EMITTER_TRACE:
            logger.debug("Attempting to emit MCP batch of size %d", len(mcps))

        # URL -> ordered chunks; every URL starts out with one empty chunk.
        batches: Dict[str, List[Chunk]] = defaultdict(lambda: [Chunk(items=[])])

        for mcp in mcps:
            url, payload = self._to_request(mcp, async_flag, async_default=True)
            current_chunk = batches[url][-1]

            # Serialize once; the same string is measured and later sent.
            serialized_item = json.dumps(payload[0])
            item_bytes = len(serialized_item.encode())

            # Size of the joined array if this item were appended: existing
            # items + new item + one comma per existing item + two brackets.
            projected_bytes = (
                current_chunk.total_bytes
                + item_bytes
                + len(current_chunk.items)
                + 2
            )
            chunk_is_full = (
                projected_bytes > INGEST_MAX_PAYLOAD_BYTES
                or len(current_chunk.items) >= BATCH_INGEST_MAX_PAYLOAD_LENGTH
            )

            # Never roll over from an empty chunk: an oversized single item
            # must still be sent on its own.
            if current_chunk.items and chunk_is_full:
                current_chunk = Chunk(items=[])
                batches[url].append(current_chunk)

            current_chunk.add_item(serialized_item)

        responses = []
        for url, chunks in batches.items():
            for chunk in chunks:
                # Items are already JSON strings, so bypass this class's
                # json.dumps override and send the joined array directly.
                response = super()._emit_generic(url, payload=Chunk.join(chunk))
                responses.append(response)

        if self._should_trace(async_flag, trace_flag, async_default=True):
            trace_data = []
            for response in responses:
                data = extract_trace_data(response) if response else None
                if data is not None:
                    trace_data.append(data)

            if trace_data:
                self.await_status(trace_data, trace_timeout)

        return len(responses)

Check warning on line 188 in metadata-ingestion/src/datahub/emitter/openapi_emitter.py

View check run for this annotation

Codecov / codecov/patch

metadata-ingestion/src/datahub/emitter/openapi_emitter.py#L188

Added line #L188 was not covered by tests
97 changes: 97 additions & 0 deletions metadata-ingestion/src/datahub/emitter/openapi_tracer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
import logging
import time
from datetime import datetime, timedelta
from typing import List, Optional

from datahub.configuration.common import (
    OperationalError,
)
from datahub.emitter.response_helper import TraceData

# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)

# ``writeStatus`` value that await_status treats as "not yet resolved".
PENDING_STATUS = "PENDING"
# Backoff parameters for status polling; values are passed to time.sleep (seconds).
INITIAL_BACKOFF = 1.0 # Start with 1 second
MAX_BACKOFF = 300.0 # Cap at 5 minutes
BACKOFF_FACTOR = 2.0 # Double the wait time each attempt


class OpenAPITrace:
    """Mixin that polls the trace API until asynchronous writes are resolved.

    The host class must provide ``self._gms_server`` and
    ``self._emit_generic(url, payload=...)``.
    """

    def await_status(
        self,
        trace_data: List[TraceData],
        trace_timeout: Optional[timedelta],
    ) -> None:
        """Verify the status of asynchronous write operations.

        Each trace is polled until no aspect is left pending in either primary
        or search storage. Between polls the wait starts at
        ``INITIAL_BACKOFF`` and is multiplied by ``BACKOFF_FACTOR`` up to
        ``MAX_BACKOFF``; it is never longer than the time left before the
        timeout.

        Args:
            trace_data: List of trace data to verify. Resolved entries are
                removed from each trace's ``data`` in place.
            trace_timeout: Maximum total time to wait across all traces.
                ``None`` waits without a deadline.

        Raises:
            OperationalError: If verification fails or times out.
            Exception: Any other error (e.g. a malformed response) is logged
                and re-raised unchanged.
        """
        try:
            if not trace_data:
                logger.debug("No trace data to verify")
                return

            # Monotonic clock: unaffected by wall-clock adjustments.
            deadline = (
                None
                if trace_timeout is None
                else time.monotonic() + trace_timeout.total_seconds()
            )

            for trace in trace_data:
                current_backoff = INITIAL_BACKOFF

                while trace.data:
                    if deadline is not None and time.monotonic() >= deadline:
                        raise OperationalError(
                            f"Timeout waiting for async write completion after {trace_timeout.total_seconds()} seconds"
                        )

                    base_url = f"{self._gms_server}/openapi/v1/trace/write"
                    url = f"{base_url}/{trace.trace_id}?onlyIncludeErrors=false&detailed=true"

                    # Only the still-unresolved entries are sent on each poll.
                    response = self._emit_generic(url, payload=trace.data)
                    self._drop_resolved_aspects(trace, response.json())

                    if trace.data:
                        # Still pending: wait, then lengthen the next wait.
                        wait_seconds = current_backoff
                        if deadline is not None:
                            remaining = max(deadline - time.monotonic(), 0.0)
                            wait_seconds = min(wait_seconds, remaining)
                        logger.debug(
                            "Waiting %s seconds before next check", wait_seconds
                        )
                        time.sleep(wait_seconds)
                        current_backoff = min(
                            current_backoff * BACKOFF_FACTOR, MAX_BACKOFF
                        )

        except Exception as e:
            logger.error("Error during status verification: %s", e)
            raise

    @staticmethod
    def _drop_resolved_aspects(trace: TraceData, status_by_urn: dict) -> None:
        """Remove from ``trace.data`` every aspect reported as no longer pending.

        Assumes ``trace.data`` maps each URN to a list of aspect names
        (inferred from how it is mutated here) -- TODO confirm against TraceData.

        Args:
            trace: Trace whose ``data`` is pruned in place.
            status_by_urn: Decoded trace response: URN -> aspect name -> status.

        Raises:
            OperationalError: If any aspect status has a falsy ``success``.
        """
        for urn, aspects in status_by_urn.items():
            for aspect_name, aspect_status in aspects.items():
                if not aspect_status["success"]:
                    error_msg = (
                        f"Unable to validate async write to DataHub GMS: "
                        f"Persistence failure for URN '{urn}' aspect '{aspect_name}'. "
                        f"Status: {aspect_status}"
                    )
                    raise OperationalError(error_msg, aspect_status)

                primary_storage = aspect_status["primaryStorage"]["writeStatus"]
                search_storage = aspect_status["searchStorage"]["writeStatus"]

                # Resolved only when neither store is still pending.
                if (
                    primary_storage != PENDING_STATUS
                    and search_storage != PENDING_STATUS
                ):
                    trace.data[urn].remove(aspect_name)

            # Drop URNs whose aspects have all been resolved.
            if not trace.data[urn]:
                trace.data.pop(urn)

Check warning on line 97 in metadata-ingestion/src/datahub/emitter/openapi_tracer.py

View check run for this annotation

Codecov / codecov/patch

metadata-ingestion/src/datahub/emitter/openapi_tracer.py#L95-L97

Added lines #L95 - L97 were not covered by tests
Loading

0 comments on commit b8f315b

Please sign in to comment.