feat(replays): Add buffered consumer implementation #85356

Draft
wants to merge 61 commits into base: master
Changes from 46 of 61 commits

Commits
b8cb413
Add buffered consumer runtime implementation
cmanallen Feb 18, 2025
d82c589
Begin adding process logic
cmanallen Feb 19, 2025
e82ce0f
Add tracing to each component of recording processing
cmanallen Feb 19, 2025
55cc3af
Delete unused function
cmanallen Feb 19, 2025
a243bdb
Report size metrics
cmanallen Feb 19, 2025
3d728b5
Merge branch 'cmanallen/replays-improve-tracing' into cmanallen/repla…
cmanallen Feb 19, 2025
890f170
Separate IO from processing
cmanallen Feb 19, 2025
2dcfaec
Add explicit return
cmanallen Feb 19, 2025
9e4e227
Merge branch 'cmanallen/replays-consumer-separate-processing-from-io'…
cmanallen Feb 19, 2025
db5fa11
Add buffer managers
cmanallen Feb 19, 2025
7a5a98c
Write FilePart rows and adopt a new subscription model
cmanallen Feb 20, 2025
44c899b
Add unit tests
cmanallen Feb 20, 2025
de76ad0
Add contextual errors
cmanallen Feb 20, 2025
abf22cf
Misc test updates
cmanallen Feb 21, 2025
2e16a01
Fully separate compute and io within the recording consumer
cmanallen Feb 24, 2025
208360f
Configure max workers
cmanallen Feb 25, 2025
13ea6ac
Merge branch 'master' into cmanallen/replays-add-separated-compute-an…
cmanallen Feb 25, 2025
fc8df9c
Remove conditional branch as its moved further up the hierarchy
cmanallen Feb 25, 2025
b0f518c
Merge branch 'cmanallen/replays-add-separated-compute-and-io' into cm…
cmanallen Feb 25, 2025
536c250
Use context manager
cmanallen Feb 25, 2025
ea571fa
Simplify buffer flushing (for now)
cmanallen Feb 25, 2025
1826ccb
Merge branch 'master' into cmanallen/replays-optimize-consumer
cmanallen Feb 27, 2025
3733886
Update tracing logic
cmanallen Feb 27, 2025
88747c1
Soften flag requirements and minor fixes
cmanallen Feb 27, 2025
74e811f
Remove buffer managers module
cmanallen Feb 27, 2025
8f8e5ce
Test clean up
cmanallen Feb 27, 2025
fac3204
Fix unit test
cmanallen Feb 27, 2025
ae59eb9
Add explicit return
cmanallen Feb 27, 2025
a6a32e5
Fix typing
cmanallen Feb 27, 2025
f1026ca
Remove unused option
cmanallen Feb 27, 2025
b363596
Reset dom_index module to align with master
cmanallen Feb 27, 2025
c656420
Update buffering run-time coverage
cmanallen Feb 27, 2025
29bb826
Update test ordering
cmanallen Feb 27, 2025
b4c6477
Add offset committing test
cmanallen Feb 27, 2025
2572955
Add docs
cmanallen Feb 27, 2025
7ca7462
More docstring fixes
cmanallen Feb 27, 2025
fb5731e
Add typing to flags and factory module
cmanallen Feb 27, 2025
17eb011
Adopt buffered strategy in callsite
cmanallen Feb 28, 2025
0c79eac
Add script for mocking recordings
cmanallen Feb 28, 2025
30ed5d6
Add handling for appending offsets when the message is not buffered
cmanallen Feb 28, 2025
52d8616
Add commit coverage
cmanallen Feb 28, 2025
b334400
Assert messages are committed regardless of if they're appended to th…
cmanallen Feb 28, 2025
8618e29
Fix typing
cmanallen Feb 28, 2025
0346a3c
Docstrings
cmanallen Feb 28, 2025
4ab0ff2
More docstrings
cmanallen Feb 28, 2025
77c2971
Yet more docstrings
cmanallen Feb 28, 2025
b956ad9
Implement declarative effect management
cmanallen Mar 3, 2025
5e2ce4f
Merge branch 'master' into cmanallen/replays-optimize-consumer
cmanallen Mar 3, 2025
fa42004
Move offset management into the runtime
cmanallen Mar 3, 2025
2215007
Fix typing
cmanallen Mar 3, 2025
4cc15d7
Remove comments on offsets
cmanallen Mar 3, 2025
4063f5d
Remove none-type messages
cmanallen Mar 3, 2025
ea64a50
Move offsets out of runtime and into platform strategy. Add support f…
cmanallen Mar 3, 2025
62d7433
Fix typing
cmanallen Mar 3, 2025
895185b
Docstrings and renames
cmanallen Mar 3, 2025
a0ea819
Update error handling and documentation
cmanallen Mar 4, 2025
ab3918c
Rename to sandbox
cmanallen Mar 4, 2025
369acde
Update test coverage
cmanallen Mar 4, 2025
85e0fcf
Remove unnecessary lambda
cmanallen Mar 4, 2025
c8d5745
Improve coveragE
cmanallen Mar 4, 2025
e349232
Sketch selective retry
cmanallen Mar 5, 2025
76 changes: 76 additions & 0 deletions bin/mock-replay-recording
@@ -0,0 +1,76 @@
#!/usr/bin/env python
"""Produce a mock replay recording message.

Helpful commands:

- Run the consumer.
  - `sentry run consumer ingest-replay-recordings --consumer-group 0`
- Check if offsets are committed correctly.
  - `docker exec -it kafka-kafka-1 kafka-consumer-groups --bootstrap-server localhost:9092 --describe --group 0`
"""
from sentry.runner import configure

configure()
import logging
import os
import time
import uuid

import click
from arroyo import Topic as ArroyoTopic
from arroyo.backends.kafka import KafkaPayload, KafkaProducer, build_kafka_configuration
from sentry_kafka_schemas.codecs import Codec
from sentry_kafka_schemas.schema_types.ingest_replay_recordings_v1 import ReplayRecording

os.environ.setdefault("DJANGO_SETTINGS_MODULE", "sentry.conf.server")

import django

django.setup()

from sentry.conf.types.kafka_definition import Topic, get_topic_codec
from sentry.utils.kafka_config import get_kafka_producer_cluster_options, get_topic_definition

logger = logging.getLogger(__name__)


def get_producer() -> KafkaProducer:
    cluster_name = get_topic_definition(Topic.INGEST_REPLAYS_RECORDINGS)["cluster"]
    producer_config = get_kafka_producer_cluster_options(cluster_name)
    return KafkaProducer(build_kafka_configuration(default_config=producer_config))


RECORDING_CODEC: Codec[ReplayRecording] = get_topic_codec(Topic.INGEST_REPLAYS_RECORDINGS)


@click.command()
@click.option("--organization-id", type=int, required=True, help="Organization ID")
@click.option("--project-id", type=int, required=True, help="Project ID")
def main(organization_id: int, project_id: int) -> None:
    """Produce a mock replay recording message to the INGEST_REPLAYS_RECORDINGS topic."""
    message: ReplayRecording = {
        "key_id": None,
        "org_id": organization_id,
        "payload": b'{"segment_id"',
        "project_id": project_id,
        "received": int(time.time()),
        "replay_event": None,
        "replay_id": uuid.uuid4().hex,
        "replay_video": None,
        "retention_days": 30,
        "type": "replay_recording_not_chunked",
        "version": 1,
    }

    producer = get_producer()
    topic = get_topic_definition(Topic.INGEST_REPLAYS_RECORDINGS)["real_topic_name"]
    payload = KafkaPayload(None, RECORDING_CODEC.encode(message), [])

    producer.produce(ArroyoTopic(topic), payload)
    producer.close()

    logger.info("Successfully produced message to %s", topic)


if __name__ == "__main__":
    main()
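
For a quick smoke test, a hypothetical invocation of the script above (the IDs are placeholders; use an organization and project that exist in your local install): `python bin/mock-replay-recording --organization-id 1 --project-id 1`.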
25 changes: 6 additions & 19 deletions src/sentry/consumers/__init__.py
@@ -84,24 +84,11 @@ def ingest_replay_recordings_options() -> list[click.Option]:

 def ingest_replay_recordings_buffered_options() -> list[click.Option]:
     """Return a list of ingest-replay-recordings-buffered options."""
-    options = [
-        click.Option(
-            ["--max-buffer-message-count", "max_buffer_message_count"],
-            type=int,
-            default=100,
-        ),
-        click.Option(
-            ["--max-buffer-size-in-bytes", "max_buffer_size_in_bytes"],
-            type=int,
-            default=2_500_000,
-        ),
-        click.Option(
-            ["--max-buffer-time-in-seconds", "max_buffer_time_in_seconds"],
-            type=int,
-            default=1,
-        ),
+    return [
+        click.Option(["--max-buffer-length", "max_buffer_length"], type=int, default=8),
+        click.Option(["--max-buffer-wait", "max_buffer_wait"], type=int, default=1),
+        click.Option(["--max-workers", "max_workers"], type=int, default=8),
     ]
-    return options


 def ingest_monitors_options() -> list[click.Option]:
@@ -269,8 +256,8 @@ def ingest_transactions_options() -> list[click.Option]:
     },
     "ingest-replay-recordings": {
         "topic": Topic.INGEST_REPLAYS_RECORDINGS,
-        "strategy_factory": "sentry.replays.consumers.recording.ProcessReplayRecordingStrategyFactory",
-        "click_options": ingest_replay_recordings_options(),
+        "strategy_factory": "sentry.replays.consumers.buffered.factory.PlatformStrategyFactory",
+        "click_options": ingest_replay_recordings_buffered_options(),
     },
     "ingest-replay-recordings-buffered": {
         "topic": Topic.INGEST_REPLAYS_RECORDINGS,
Empty file.
128 changes: 128 additions & 0 deletions src/sentry/replays/consumers/buffered/consumer.py
@@ -0,0 +1,128 @@
"""Session Replay recording consumer implementation.

To understand how the buffering works visit the `lib.py` module and inspect the source of the
buffering runtime.

This module has two parts. A processing component and a buffer flushing component. The processing
component is straight-forward. It accepts a message and performs some work on it. After it
completes it instructs the runtime to append the message to the buffer. This is abstracted by the
buffering runtime library so we just return the transformed data in this module.

The second part is the flushing of the buffer. The buffering runtime library has no idea when to
flush this buffer so it constantly asks us if it can flush. We control flushing behavior through a
stateful "BufferManager" class. If we can_flush then we do_flush. After the flush completes the
RunTime will commit the offsets.
"""

import contextlib
import time
from concurrent.futures import FIRST_EXCEPTION, ThreadPoolExecutor, wait
from typing import TypedDict

import sentry_sdk

from sentry.replays.consumers.buffered.lib import Model, buffering_runtime
from sentry.replays.usecases.ingest import (
    DropSilently,
    ProcessedRecordingMessage,
    commit_recording_message,
    parse_recording_message,
    process_recording_message,
    sentry_tracing,
    track_recording_metadata,
)


class Flags(TypedDict):
    max_buffer_length: int
    max_buffer_wait: int
    max_workers: int


class BufferManager:
    """Buffer manager.

    The buffer manager is a class instance whose lifetime is as long as the RunTime's. We pass
    its methods as callbacks to the Model. The state contained within the methods' instance is
    implicit and unknown to the RunTime.
    """

    def __init__(self, flags: Flags) -> None:
        self.__max_buffer_length = flags["max_buffer_length"]
        self.__max_buffer_wait = flags["max_buffer_wait"]
        self.__max_workers = flags["max_workers"]

        self.__last_flushed_at = time.time()

    def can_flush(self, model: Model[ProcessedRecordingMessage]) -> bool:
        # TODO: time.time is stateful and hard to test. We should enable the RunTime to perform
        # managed effects so we can properly test this behavior.
        return (
            len(model.buffer) >= self.__max_buffer_length
            or (time.time() - self.__max_buffer_wait) >= self.__last_flushed_at
        )

Contributor comment:
Arroyo primitives manage these kinds of concerns for you (when to flush a batch, for example). Are you sure about the idea of pushing them into the product code instead?

    def do_flush(self, model: Model[ProcessedRecordingMessage]) -> None:
        with sentry_tracing("replays.consumers.buffered.flush_buffer"):
            flush_buffer(model, max_workers=self.__max_workers)
            # TODO: time.time again. Should be declarative for testing purposes.
            self.__last_flushed_at = time.time()


@sentry_sdk.trace
def flush_buffer(model: Model[ProcessedRecordingMessage], max_workers: int) -> None:
    if len(model.buffer) == 0:
        return None

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = [pool.submit(flush_message, message) for message in model.buffer]

        # Tasks can fail. We check the done set for any failures. We will wait for all the
        # futures to complete before running this step or eagerly run this step if any task
        # errors.
        done, _ = wait(futures, return_when=FIRST_EXCEPTION)
        for future in done:
            exc = future.exception()
            if exc is not None:
                # TODO: Why raise? Can I do something more meaningful here than reject the whole
                # batch? Raising is certainly the easiest way of handling failures...
                raise exc

    # Recording metadata is not tracked in the threadpool. This is because this function will
    # log. Logging will acquire a lock and make our threading less useful due to the speed of
    # the I/O we do in this step.
    for message in model.buffer:
        track_recording_metadata(message)

    return None


@sentry_sdk.trace
def flush_message(message: ProcessedRecordingMessage) -> None:
    with contextlib.suppress(DropSilently):
        commit_recording_message(message)


def process_message(message_bytes: bytes) -> ProcessedRecordingMessage | None:
    """Message processing function.

    Accepts an unstructured type and returns a structured one. Other than tracing the goal is to
    have no I/O here. We'll commit the I/O on flush.
    """
    with sentry_tracing("replays.consumers.buffered.process_message"):
        with contextlib.suppress(DropSilently):
            message = parse_recording_message(message_bytes)
            return process_recording_message(message)
        return None


def init(flags: Flags) -> Model[ProcessedRecordingMessage]:
    """Return the initial state of the application."""
    buffer = BufferManager(flags)
    return Model(buffer=[], can_flush=buffer.can_flush, do_flush=buffer.do_flush, offsets={})


recording_runtime = buffering_runtime(
    init_fn=init,
    process_fn=process_message,
)
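
Since `lib.py` is not part of this diff, the following is a minimal, hypothetical sketch of the control flow the module docstring above describes. `SketchModel` and `handle` are illustrative names only, not the real runtime API:

# Hypothetical sketch of the buffering runtime's control flow; not the lib.py code.
from dataclasses import dataclass, field
from typing import Callable, Generic, TypeVar

T = TypeVar("T")


@dataclass
class SketchModel(Generic[T]):
    buffer: list[T]
    can_flush: Callable[["SketchModel[T]"], bool]
    do_flush: Callable[["SketchModel[T]"], None]
    offsets: dict[int, int] = field(default_factory=dict)


def handle(model: SketchModel[T], process: Callable[[bytes], T | None], raw: bytes) -> None:
    item = process(raw)  # pure transformation, no I/O (process_message above)
    if item is not None:
        model.buffer.append(item)  # the runtime buffers the processed message
    if model.can_flush(model):  # the runtime polls the BufferManager
        model.do_flush(model)  # I/O happens here; offsets are committed afterwards
        model.buffer.clear()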
32 changes: 32 additions & 0 deletions src/sentry/replays/consumers/buffered/factory.py
@@ -0,0 +1,32 @@
"""Session Replay recording consumer strategy factory.

This module exists solely to abstract the bootstrapping process of the application and runtime in
`sentry/consumers/__init__.py`.
"""

from collections.abc import Mapping

from arroyo.backends.kafka.consumer import KafkaPayload
from arroyo.processing.strategies import ProcessingStrategy, ProcessingStrategyFactory
from arroyo.types import Commit as ArroyoCommit
from arroyo.types import Partition

from sentry.replays.consumers.buffered.consumer import Flags, recording_runtime
from sentry.replays.consumers.buffered.platform import PlatformStrategy


class PlatformStrategyFactory(ProcessingStrategyFactory[KafkaPayload]):

    def __init__(self, max_buffer_length: int, max_buffer_wait: int, max_workers: int) -> None:
        self.flags: Flags = {
            "max_buffer_length": max_buffer_length,
            "max_buffer_wait": max_buffer_wait,
            "max_workers": max_workers,
        }

    def create_with_partitions(
        self,
        commit: ArroyoCommit,
        partitions: Mapping[Partition, int],
    ) -> ProcessingStrategy[KafkaPayload]:
        return PlatformStrategy(commit=commit, flags=self.flags, runtime=recording_runtime)

Contributor comment:
Wouldn't it be considerably simpler to model this consumer as a sequence of Arroyo operators (sketched below)?

Modeling the system this way would:

  • allow parallelism via either processes or threads without application-logic changes
  • guarantee a pipeline approach that lets the batching step keep batching new messages while the worker thread performs its work
  • hide offset management entirely from the application code
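
A rough sketch of the kind of pipeline this comment describes, assuming Arroyo's stock strategies (RunTask, BatchStep, RunTaskInThreads, CommitOffsets) keep their current names and signatures; the numeric values simply mirror the PR's CLI defaults, and the process/flush bodies are placeholders rather than the PR's code:

# Sketch only: an Arroyo-operator pipeline analogous to the buffered consumer.
from arroyo.backends.kafka import KafkaPayload
from arroyo.processing.strategies import CommitOffsets, RunTask, RunTaskInThreads
from arroyo.processing.strategies.batching import BatchStep
from arroyo.types import Commit, Message


def build_pipeline(commit: Commit):
    def process(message: Message[KafkaPayload]):
        # CPU-only work: parse and transform the recording (no I/O).
        ...

    def flush(batch):
        # I/O: store every processed recording contained in the batch.
        ...

    return RunTask(
        function=process,
        next_step=BatchStep(
            max_batch_size=8,  # plays the role of --max-buffer-length
            max_batch_time=1.0,  # plays the role of --max-buffer-wait
            next_step=RunTaskInThreads(
                processing_function=flush,
                concurrency=8,  # plays the role of --max-workers
                max_pending_futures=16,
                next_step=CommitOffsets(commit),
            ),
        ),
    )

In this shape, offsets would advance only after the flush step forwards the batch, which roughly matches the behavior the PR implements by hand through the runtime and platform strategy.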
