From 707a6c66fb6dea5c6b9be0f0c0c8281e1d1b2154 Mon Sep 17 00:00:00 2001 From: Anatolii Yatsuk Date: Wed, 5 Feb 2025 19:06:57 +0200 Subject: [PATCH 01/26] Add API Budget --- .../declarative_component_schema.yaml | 166 ++++++++++++++++++ .../models/declarative_component_schema.py | 130 ++++++++++++++ .../parsers/model_to_component_factory.py | 130 +++++++++++++- .../declarative/requesters/http_requester.py | 3 + 4 files changed, 423 insertions(+), 6 deletions(-) diff --git a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml index d51d4c922..ea044f816 100644 --- a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml +++ b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml @@ -1365,6 +1365,168 @@ definitions: $parameters: type: object additional_properties: true + APIBudget: + title: API Budget + description: Component that defines how many requests can be made to the API in a given time frame. + type: object + required: + - type + properties: + type: + type: string + enum: [APIBudget] + policies: + title: Policies + description: List of policies that define the rate limits for different types of requests. + type: array + items: + anyOf: + - "$ref": "#/definitions/FixedWindowCallRatePolicy" + - "$ref": "#/definitions/MovingWindowCallRatePolicy" + - "$ref": "#/definitions/UnlimitedCallRatePolicy" + ratelimit_reset_header: + title: Rate Limit Reset Header + description: The name of the header that contains the timestamp for when the rate limit will reset. + type: string + default: "ratelimit-reset" + ratelimit_remaining_header: + title: Rate Limit Remaining Header + description: The name of the header that contains the number of remaining requests. + type: string + default: "ratelimit-remaining" + status_codes_for_ratelimit_hit: + title: Status Codes for Rate Limit Hit + description: List of HTTP status codes that indicate a rate limit has been hit. + type: array + items: + type: integer + default: [429] + maximum_attempts_to_acquire: + title: Maximum Attempts to Acquire + description: The maximum number of attempts to acquire a call before giving up. + type: integer + default: 100000 + additionalProperties: true + FixedWindowCallRatePolicy: + title: Fixed Window Call Rate Policy + description: A policy that allows a fixed number of calls within a specific time window. + type: object + required: + - type + - next_reset_ts + - period + - call_limit + - matchers + properties: + type: + type: string + enum: [FixedWindowCallRatePolicy] + next_reset_ts: + title: Next Reset Timestamp + description: The timestamp when the rate limit will reset. + type: string + format: date-time + period: + title: Period + description: The time interval for the rate limit window. + type: string + format: duration + call_limit: + title: Call Limit + description: The maximum number of calls allowed within the period. + type: integer + matchers: + title: Matchers + description: List of matchers that define which requests this policy applies to. + type: array + items: + "$ref": "#/definitions/HttpRequestMatcher" + additionalProperties: true + MovingWindowCallRatePolicy: + title: Moving Window Call Rate Policy + description: A policy that allows a fixed number of calls within a moving time window. 
+ type: object + required: + - type + - rates + - matchers + properties: + type: + type: string + enum: [MovingWindowCallRatePolicy] + rates: + title: Rates + description: List of rates that define the call limits for different time intervals. + type: array + items: + "$ref": "#/definitions/Rate" + matchers: + title: Matchers + description: List of matchers that define which requests this policy applies to. + type: array + items: + "$ref": "#/definitions/HttpRequestMatcher" + additionalProperties: true + UnlimitedCallRatePolicy: + title: Unlimited Call Rate Policy + description: A policy that allows unlimited calls for specific requests. + type: object + required: + - type + - matchers + properties: + type: + type: string + enum: [UnlimitedCallRatePolicy] + matchers: + title: Matchers + description: List of matchers that define which requests this policy applies to. + type: array + items: + "$ref": "#/definitions/HttpRequestMatcher" + additionalProperties: true + Rate: + title: Rate + description: Defines a rate limit with a specific number of calls allowed within a time interval. + type: object + required: + - limit + - interval + properties: + limit: + title: Limit + description: The maximum number of calls allowed within the interval. + type: integer + interval: + title: Interval + description: The time interval for the rate limit. + type: string + format: duration + additionalProperties: true + HttpRequestMatcher: + title: HTTP Request Matcher + description: Matches HTTP requests based on method, URL, parameters, and headers. + type: object + properties: + method: + title: Method + description: The HTTP method to match (e.g., GET, POST). + type: string + url: + title: URL + description: The URL to match. + type: string + params: + title: Parameters + description: The query parameters to match. + type: object + additionalProperties: true + headers: + title: Headers + description: The headers to match. + type: object + additionalProperties: true + additionalProperties: true DefaultErrorHandler: title: Default Error Handler description: Component defining how to handle errors. Default behavior includes only retrying server errors (HTTP 5XX) and too many requests (HTTP 429) with an exponential backoff. @@ -1637,6 +1799,10 @@ definitions: - "$ref": "#/definitions/DefaultErrorHandler" - "$ref": "#/definitions/CustomErrorHandler" - "$ref": "#/definitions/CompositeErrorHandler" + api_budget: + title: API Budget + description: Component that defines how many requests can be made to the API in a given time frame. + "$ref": "#/definitions/APIBudget" http_method: title: HTTP Method description: The HTTP method used to fetch data from the source (can be GET or POST). 
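A minimal sketch of how a manifest could wire these new definitions into a requester, assuming the host, endpoint, and limits are illustrative placeholders rather than values taken from this patch:

    requester:
      type: HttpRequester
      url_base: "https://api.example.com"
      path: "/v1/items"
      http_method: GET
      api_budget:
        type: APIBudget
        policies:
          - type: MovingWindowCallRatePolicy
            rates:
              - limit: 100
                interval: PT1M
            matchers:
              - method: GET
                url: "https://api.example.com/v1/items"
          - type: UnlimitedCallRatePolicy
            matchers:
              - url: "https://api.example.com/v1/health"
        status_codes_for_ratelimit_hit: [429]

Under this sketch, requests matching the first matcher would be limited to 100 calls per sliding one-minute window, requests matching the second matcher would never be throttled, and requests that match no policy are allowed by default (the budget logs that no policy matched and lets the call through).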
diff --git a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py index 6aa1d35a7..bd5a69f6c 100644 --- a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py +++ b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py @@ -3,6 +3,7 @@ from __future__ import annotations +from datetime import datetime, timedelta from enum import Enum from typing import Any, Dict, List, Literal, Optional, Union @@ -642,6 +643,36 @@ class OAuthAuthenticator(BaseModel): parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters") +class Rate(BaseModel): + class Config: + extra = Extra.allow + + limit: int = Field( + ..., + description="The maximum number of calls allowed within the interval.", + title="Limit", + ) + interval: timedelta = Field( + ..., description="The time interval for the rate limit.", title="Interval" + ) + + +class HttpRequestMatcher(BaseModel): + class Config: + extra = Extra.allow + + method: Optional[str] = Field( + None, description="The HTTP method to match (e.g., GET, POST).", title="Method" + ) + url: Optional[str] = Field(None, description="The URL to match.", title="URL") + params: Optional[Dict[str, Any]] = Field( + None, description="The query parameters to match.", title="Parameters" + ) + headers: Optional[Dict[str, Any]] = Field( + None, description="The headers to match.", title="Headers" + ) + + class DpathExtractor(BaseModel): type: Literal["DpathExtractor"] field_path: List[str] = Field( @@ -1578,6 +1609,60 @@ class DatetimeBasedCursor(BaseModel): parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters") +class FixedWindowCallRatePolicy(BaseModel): + class Config: + extra = Extra.allow + + type: Literal["FixedWindowCallRatePolicy"] + next_reset_ts: datetime = Field( + ..., + description="The timestamp when the rate limit will reset.", + title="Next Reset Timestamp", + ) + period: timedelta = Field( + ..., description="The time interval for the rate limit window.", title="Period" + ) + call_limit: int = Field( + ..., + description="The maximum number of calls allowed within the period.", + title="Call Limit", + ) + matchers: List[HttpRequestMatcher] = Field( + ..., + description="List of matchers that define which requests this policy applies to.", + title="Matchers", + ) + + +class MovingWindowCallRatePolicy(BaseModel): + class Config: + extra = Extra.allow + + type: Literal["MovingWindowCallRatePolicy"] + rates: List[Rate] = Field( + ..., + description="List of rates that define the call limits for different time intervals.", + title="Rates", + ) + matchers: List[HttpRequestMatcher] = Field( + ..., + description="List of matchers that define which requests this policy applies to.", + title="Matchers", + ) + + +class UnlimitedCallRatePolicy(BaseModel): + class Config: + extra = Extra.allow + + type: Literal["UnlimitedCallRatePolicy"] + matchers: List[HttpRequestMatcher] = Field( + ..., + description="List of matchers that define which requests this policy applies to.", + title="Matchers", + ) + + class DefaultErrorHandler(BaseModel): type: Literal["DefaultErrorHandler"] backoff_strategies: Optional[ @@ -1709,6 +1794,46 @@ class CompositeErrorHandler(BaseModel): parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters") +class APIBudget(BaseModel): + class Config: + extra = Extra.allow + + type: Literal["APIBudget"] + policies: Optional[ + List[ + Union[ + FixedWindowCallRatePolicy, + MovingWindowCallRatePolicy, + 
UnlimitedCallRatePolicy, + ] + ] + ] = Field( + None, + description="List of policies that define the rate limits for different types of requests.", + title="Policies", + ) + ratelimit_reset_header: Optional[str] = Field( + "ratelimit-reset", + description="The name of the header that contains the timestamp for when the rate limit will reset.", + title="Rate Limit Reset Header", + ) + ratelimit_remaining_header: Optional[str] = Field( + "ratelimit-remaining", + description="The name of the header that contains the number of remaining requests.", + title="Rate Limit Remaining Header", + ) + status_codes_for_ratelimit_hit: Optional[List[int]] = Field( + [429], + description="List of HTTP status codes that indicate a rate limit has been hit.", + title="Status Codes for Rate Limit Hit", + ) + maximum_attempts_to_acquire: Optional[int] = Field( + 100000, + description="The maximum number of attempts to acquire a call before giving up.", + title="Maximum Attempts to Acquire", + ) + + class ZipfileDecoder(BaseModel): class Config: extra = Extra.allow @@ -1979,6 +2104,11 @@ class HttpRequester(BaseModel): description="Error handler component that defines how to handle errors.", title="Error Handler", ) + api_budget: Optional[APIBudget] = Field( + None, + description="Component that defines how many requests can be made to the API in a given time frame.", + title="API Budget", + ) http_method: Optional[HttpMethod] = Field( HttpMethod.GET, description="The HTTP method used to fetch data from the source (can be GET or POST).", diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index b8eeca1ec..cec9aff25 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -112,6 +112,9 @@ from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( AddFields as AddFieldsModel, ) +from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( + APIBudget as APIBudgetModel, +) from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( ApiKeyAuthenticator as ApiKeyAuthenticatorModel, ) @@ -226,6 +229,9 @@ from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( ExponentialBackoffStrategy as ExponentialBackoffStrategyModel, ) +from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( + FixedWindowCallRatePolicy as FixedWindowCallRatePolicyModel, +) from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( FlattenFields as FlattenFieldsModel, ) @@ -241,6 +247,9 @@ from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( HttpRequester as HttpRequesterModel, ) +from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( + HttpRequestMatcher as HttpRequestMatcherModel, +) from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( HttpResponseFilter as HttpResponseFilterModel, ) @@ -295,6 +304,9 @@ from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( MinMaxDatetime as MinMaxDatetimeModel, ) +from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( + MovingWindowCallRatePolicy as MovingWindowCallRatePolicyModel, +) from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( NoAuth as NoAuthModel, ) @@ -313,6 +325,9 @@ from 
airbyte_cdk.sources.declarative.models.declarative_component_schema import ( ParentStreamConfig as ParentStreamConfigModel, ) +from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( + Rate as RateModel, +) from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( RecordFilter as RecordFilterModel, ) @@ -356,6 +371,9 @@ from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( TypesMap as TypesMapModel, ) +from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( + UnlimitedCallRatePolicy as UnlimitedCallRatePolicyModel, +) from airbyte_cdk.sources.declarative.models.declarative_component_schema import ValueType from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( WaitTimeFromHeader as WaitTimeFromHeaderModel, @@ -469,6 +487,14 @@ MessageRepository, NoopMessageRepository, ) +from airbyte_cdk.sources.streams.call_rate import ( + FixedWindowCallRatePolicy, + HttpAPIBudget, + HttpRequestMatcher, + MovingWindowCallRatePolicy, + Rate, + UnlimitedCallRatePolicy, +) from airbyte_cdk.sources.streams.concurrent.clamping import ( ClampingEndProvider, ClampingStrategy, @@ -607,6 +633,12 @@ def _init_mappings(self) -> None: StreamConfigModel: self.create_stream_config, ComponentMappingDefinitionModel: self.create_components_mapping_definition, ZipfileDecoderModel: self.create_zipfile_decoder, + APIBudgetModel: self.create_api_budget, + FixedWindowCallRatePolicyModel: self.create_fixed_window_call_rate_policy, + MovingWindowCallRatePolicyModel: self.create_moving_window_call_rate_policy, + UnlimitedCallRatePolicyModel: self.create_unlimited_call_rate_policy, + RateModel: self.create_rate, + HttpRequestMatcherModel: self.create_http_request_matcher, } # Needed for the case where we need to perform a second parse on the fields of a custom component @@ -813,7 +845,8 @@ def create_legacy_to_per_partition_state_migration( return LegacyToPerPartitionStateMigration( partition_router, # type: ignore # was already checked above - declarative_stream.incremental_sync, # type: ignore # was already checked. Migration can be applied only to incremental streams. + declarative_stream.incremental_sync, + # type: ignore # was already checked. Migration can be applied only to incremental streams. config, declarative_stream.parameters, # type: ignore # different type is expected here Mapping[str, Any], got Dict[str, Any] ) @@ -1111,7 +1144,8 @@ def create_concurrent_cursor_from_datetime_based_cursor( clamping_strategy = DayClampingStrategy() end_date_provider = ClampingEndProvider( DayClampingStrategy(is_ceiling=False), - end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice + end_date_provider, + # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice granularity=cursor_granularity or datetime.timedelta(seconds=1), ) case "WEEK": @@ -1128,14 +1162,16 @@ def create_concurrent_cursor_from_datetime_based_cursor( clamping_strategy = WeekClampingStrategy(weekday) end_date_provider = ClampingEndProvider( WeekClampingStrategy(weekday, is_ceiling=False), - end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. 
Confirmed functionality is working in practice + end_date_provider, + # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice granularity=cursor_granularity or datetime.timedelta(days=1), ) case "MONTH": clamping_strategy = MonthClampingStrategy() end_date_provider = ClampingEndProvider( MonthClampingStrategy(is_ceiling=False), - end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice + end_date_provider, + # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice granularity=cursor_granularity or datetime.timedelta(days=1), ) case _: @@ -1152,8 +1188,10 @@ def create_concurrent_cursor_from_datetime_based_cursor( connector_state_converter=connector_state_converter, cursor_field=cursor_field, slice_boundary_fields=slice_boundary_fields, - start=start_date, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice - end_provider=end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice + start=start_date, + # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice + end_provider=end_date_provider, + # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice lookback_window=lookback_window, slice_range=step_length, cursor_granularity=cursor_granularity, @@ -1911,6 +1949,12 @@ def create_http_requester( ) ) + api_budget = ( + self._create_component_from_model(model=model.api_budget, config=config) + if model.api_budget + else None + ) + request_options_provider = InterpolatedRequestOptionsProvider( request_body_data=model.request_body_data, request_body_json=model.request_body_json, @@ -1931,6 +1975,7 @@ def create_http_requester( path=model.path, authenticator=authenticator, error_handler=error_handler, + api_budget=api_budget, http_method=HttpMethod[model.http_method.value], request_options_provider=request_options_provider, config=config, @@ -2919,3 +2964,76 @@ def _is_supported_parser_for_pagination(self, parser: Parser) -> bool: return isinstance(parser.inner_parser, JsonParser) else: return False + + def create_api_budget( + self, model: APIBudgetModel, config: Config, **kwargs: Any + ) -> HttpAPIBudget: + policies = [ + self._create_component_from_model(model=policy, config=config) + for policy in model.policies + ] + + return HttpAPIBudget( + policies=policies, + ratelimit_reset_header=model.ratelimit_reset_header, + ratelimit_remaining_header=model.ratelimit_remaining_header, + status_codes_for_ratelimit_hit=model.status_codes_for_ratelimit_hit, + maximum_attempts_to_acquire=model.maximum_attempts_to_acquire, + ) + + def create_fixed_window_call_rate_policy( + self, model: FixedWindowCallRatePolicyModel, config: Config, **kwargs: Any + ) -> FixedWindowCallRatePolicy: + matchers = [ + self._create_component_from_model(model=matcher, config=config) + for matcher in model.matchers + ] + return FixedWindowCallRatePolicy( + next_reset_ts=model.next_reset_ts, + period=parse_duration(model.period), + 
call_limit=model.call_limit, + matchers=matchers, + ) + + def create_moving_window_call_rate_policy( + self, model: MovingWindowCallRatePolicyModel, config: Config, **kwargs: Any + ) -> MovingWindowCallRatePolicy: + rates = [ + self._create_component_from_model(model=rate, config=config) for rate in model.rates + ] + matchers = [ + self._create_component_from_model(model=matcher, config=config) + for matcher in model.matchers + ] + return MovingWindowCallRatePolicy( + rates=rates, + matchers=matchers, + ) + + def create_unlimited_call_rate_policy( + self, model: UnlimitedCallRatePolicyModel, config: Config, **kwargs: Any + ) -> UnlimitedCallRatePolicy: + matchers = [ + self._create_component_from_model(model=matcher, config=config) + for matcher in model.matchers + ] + + return UnlimitedCallRatePolicy( + matchers=matchers, + ) + + def create_rate(self, model: RateModel, config: Config, **kwargs: Any) -> Rate: + return Rate( + limit=model.limit, + interval=model.interval, + ) + + def create_http_request_matcher( + self, model: HttpRequestMatcherModel, config: Config, **kwargs: Any + ) -> HttpRequestMatcher: + return HttpRequestMatcher( + method=model.method, + url=model.url, + params=model.params, + headers=model.headers, + ) diff --git a/airbyte_cdk/sources/declarative/requesters/http_requester.py b/airbyte_cdk/sources/declarative/requesters/http_requester.py index 35d4b0f11..96b6a4365 100644 --- a/airbyte_cdk/sources/declarative/requesters/http_requester.py +++ b/airbyte_cdk/sources/declarative/requesters/http_requester.py @@ -22,6 +22,7 @@ ) from airbyte_cdk.sources.declarative.requesters.requester import HttpMethod, Requester from airbyte_cdk.sources.message import MessageRepository, NoopMessageRepository +from airbyte_cdk.sources.streams.call_rate import APIBudget from airbyte_cdk.sources.streams.http import HttpClient from airbyte_cdk.sources.streams.http.error_handlers import ErrorHandler from airbyte_cdk.sources.types import Config, StreamSlice, StreamState @@ -55,6 +56,7 @@ class HttpRequester(Requester): http_method: Union[str, HttpMethod] = HttpMethod.GET request_options_provider: Optional[InterpolatedRequestOptionsProvider] = None error_handler: Optional[ErrorHandler] = None + api_budget: Optional[APIBudget] = None disable_retries: bool = False message_repository: MessageRepository = NoopMessageRepository() use_cache: bool = False @@ -91,6 +93,7 @@ def __post_init__(self, parameters: Mapping[str, Any]) -> None: name=self.name, logger=self.logger, error_handler=self.error_handler, + api_budget=self.api_budget, authenticator=self._authenticator, use_cache=self.use_cache, backoff_strategy=backoff_strategies, From b6bcdd7aa93e04fb3a81824c99d7b5821dbeffc7 Mon Sep 17 00:00:00 2001 From: Anatolii Yatsuk Date: Thu, 6 Feb 2025 20:40:54 +0200 Subject: [PATCH 02/26] Refactor to move api_budget to root level --- .../declarative_component_schema.yaml | 67 ++++++-- .../manifest_declarative_source.py | 4 + .../models/declarative_component_schema.py | 69 ++++++-- .../parsers/model_to_component_factory.py | 81 ++++++--- airbyte_cdk/sources/streams/call_rate.py | 155 ++++++++++-------- 5 files changed, 251 insertions(+), 125 deletions(-) diff --git a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml index ea044f816..aa4e2b4df 100644 --- a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml +++ b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml @@ -40,6 +40,12 @@ properties: "$ref": 
"#/definitions/Spec" concurrency_level: "$ref": "#/definitions/ConcurrencyLevel" + api_budget: + title: API Budget + description: Defines how many requests can be made to the API in a given time frame. This field accepts either a generic APIBudget or an HTTP-specific configuration (HTTPAPIBudget) to be applied across all streams. + anyOf: + - "$ref": "#/definitions/APIBudget" + - "$ref": "#/definitions/HTTPAPIBudget" metadata: type: object description: For internal Airbyte use only - DO NOT modify manually. Used by consumers of declarative manifests for storing related metadata. @@ -794,7 +800,7 @@ definitions: description: This option is used to adjust the upper and lower boundaries of each datetime window to beginning and end of the provided target period (day, week, month) type: object required: - - target + - target properties: target: title: Target @@ -1367,17 +1373,49 @@ definitions: additional_properties: true APIBudget: title: API Budget - description: Component that defines how many requests can be made to the API in a given time frame. + description: > + A generic API budget configuration that defines the policies (rate limiting rules) + and the maximum number of attempts to acquire a call credit. This budget does not automatically + update itself based on HTTP response headers. type: object required: - type + - policies properties: type: type: string enum: [APIBudget] policies: title: Policies - description: List of policies that define the rate limits for different types of requests. + description: List of call rate policies that define how many calls are allowed. + type: array + items: + anyOf: + - "$ref": "#/definitions/FixedWindowCallRatePolicy" + - "$ref": "#/definitions/MovingWindowCallRatePolicy" + - "$ref": "#/definitions/UnlimitedCallRatePolicy" + maximum_attempts_to_acquire: + title: Maximum Attempts to Acquire + description: The maximum number of attempts to acquire a call before giving up. + type: integer + default: 100000 + additionalProperties: true + HTTPAPIBudget: + title: HTTP API Budget + description: > + An HTTP-specific API budget that extends APIBudget by updating rate limiting information based + on HTTP response headers. It extracts available calls and the next reset timestamp from the HTTP responses. + type: object + required: + - type + - policies + properties: + type: + type: string + enum: [HTTPAPIBudget] + policies: + title: Policies + description: List of call rate policies that define how many calls are allowed. type: array items: anyOf: @@ -1386,12 +1424,12 @@ definitions: - "$ref": "#/definitions/UnlimitedCallRatePolicy" ratelimit_reset_header: title: Rate Limit Reset Header - description: The name of the header that contains the timestamp for when the rate limit will reset. + description: The HTTP response header name that indicates when the rate limit resets. type: string default: "ratelimit-reset" ratelimit_remaining_header: title: Rate Limit Remaining Header - description: The name of the header that contains the number of remaining requests. + description: The HTTP response header name that indicates the number of remaining allowed calls. type: string default: "ratelimit-remaining" status_codes_for_ratelimit_hit: @@ -1505,16 +1543,23 @@ definitions: additionalProperties: true HttpRequestMatcher: title: HTTP Request Matcher - description: Matches HTTP requests based on method, URL, parameters, and headers. + description: > + Matches HTTP requests based on method, base URL, URL path pattern, query parameters, and headers. 
+ Use `url_base` to specify the scheme and host (without trailing slash) and + `url_path_pattern` to apply a regex to the request path. type: object properties: method: title: Method description: The HTTP method to match (e.g., GET, POST). type: string - url: - title: URL - description: The URL to match. + url_base: + title: URL Base + description: The base URL (scheme and host, e.g. "https://api.example.com") to match. + type: string + url_path_pattern: + title: URL Path Pattern + description: A regular expression pattern to match the URL path. type: string params: title: Parameters @@ -1799,10 +1844,6 @@ definitions: - "$ref": "#/definitions/DefaultErrorHandler" - "$ref": "#/definitions/CustomErrorHandler" - "$ref": "#/definitions/CompositeErrorHandler" - api_budget: - title: API Budget - description: Component that defines how many requests can be made to the API in a given time frame. - "$ref": "#/definitions/APIBudget" http_method: title: HTTP Method description: The HTTP method used to fetch data from the source (can be GET or POST). diff --git a/airbyte_cdk/sources/declarative/manifest_declarative_source.py b/airbyte_cdk/sources/declarative/manifest_declarative_source.py index efc779464..d3afb1396 100644 --- a/airbyte_cdk/sources/declarative/manifest_declarative_source.py +++ b/airbyte_cdk/sources/declarative/manifest_declarative_source.py @@ -137,6 +137,10 @@ def streams(self, config: Mapping[str, Any]) -> List[Stream]: self._source_config, config ) + api_budget_model = self._source_config.get("api_budget") + if api_budget_model: + self._constructor.set_api_budget(api_budget_model, config) + source_streams = [ self._constructor.create_component( DeclarativeStreamModel, diff --git a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py index bd5a69f6c..c00e46831 100644 --- a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py +++ b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py @@ -664,7 +664,16 @@ class Config: method: Optional[str] = Field( None, description="The HTTP method to match (e.g., GET, POST).", title="Method" ) - url: Optional[str] = Field(None, description="The URL to match.", title="URL") + url_base: Optional[str] = Field( + None, + description='The base URL (scheme and host, e.g. 
"https://api.example.com") to match.', + title="URL Base", + ) + url_path_pattern: Optional[str] = Field( + None, + description="A regular expression pattern to match the URL path.", + title="URL Path Pattern", + ) params: Optional[Dict[str, Any]] = Field( None, description="The query parameters to match.", title="Parameters" ) @@ -1799,27 +1808,48 @@ class Config: extra = Extra.allow type: Literal["APIBudget"] - policies: Optional[ - List[ - Union[ - FixedWindowCallRatePolicy, - MovingWindowCallRatePolicy, - UnlimitedCallRatePolicy, - ] + policies: List[ + Union[ + FixedWindowCallRatePolicy, + MovingWindowCallRatePolicy, + UnlimitedCallRatePolicy, ] ] = Field( - None, - description="List of policies that define the rate limits for different types of requests.", + ..., + description="List of call rate policies that define how many calls are allowed.", + title="Policies", + ) + maximum_attempts_to_acquire: Optional[int] = Field( + 100000, + description="The maximum number of attempts to acquire a call before giving up.", + title="Maximum Attempts to Acquire", + ) + + +class HTTPAPIBudget(BaseModel): + class Config: + extra = Extra.allow + + type: Literal["HTTPAPIBudget"] + policies: List[ + Union[ + FixedWindowCallRatePolicy, + MovingWindowCallRatePolicy, + UnlimitedCallRatePolicy, + ] + ] = Field( + ..., + description="List of call rate policies that define how many calls are allowed.", title="Policies", ) ratelimit_reset_header: Optional[str] = Field( "ratelimit-reset", - description="The name of the header that contains the timestamp for when the rate limit will reset.", + description="The HTTP response header name that indicates when the rate limit resets.", title="Rate Limit Reset Header", ) ratelimit_remaining_header: Optional[str] = Field( "ratelimit-remaining", - description="The name of the header that contains the number of remaining requests.", + description="The HTTP response header name that indicates the number of remaining allowed calls.", title="Rate Limit Remaining Header", ) status_codes_for_ratelimit_hit: Optional[List[int]] = Field( @@ -1867,6 +1897,11 @@ class Config: definitions: Optional[Dict[str, Any]] = None spec: Optional[Spec] = None concurrency_level: Optional[ConcurrencyLevel] = None + api_budget: Optional[Union[APIBudget, HTTPAPIBudget]] = Field( + None, + description="Defines how many requests can be made to the API in a given time frame. This field accepts either a generic APIBudget or an HTTP-specific configuration (HTTPAPIBudget) to be applied across all streams.", + title="API Budget", + ) metadata: Optional[Dict[str, Any]] = Field( None, description="For internal Airbyte use only - DO NOT modify manually. Used by consumers of declarative manifests for storing related metadata.", @@ -1893,6 +1928,11 @@ class Config: definitions: Optional[Dict[str, Any]] = None spec: Optional[Spec] = None concurrency_level: Optional[ConcurrencyLevel] = None + api_budget: Optional[Union[APIBudget, HTTPAPIBudget]] = Field( + None, + description="Defines how many requests can be made to the API in a given time frame. This field accepts either a generic APIBudget or an HTTP-specific configuration (HTTPAPIBudget) to be applied across all streams.", + title="API Budget", + ) metadata: Optional[Dict[str, Any]] = Field( None, description="For internal Airbyte use only - DO NOT modify manually. 
Used by consumers of declarative manifests for storing related metadata.", @@ -2104,11 +2144,6 @@ class HttpRequester(BaseModel): description="Error handler component that defines how to handle errors.", title="Error Handler", ) - api_budget: Optional[APIBudget] = Field( - None, - description="Component that defines how many requests can be made to the API in a given time frame.", - title="API Budget", - ) http_method: Optional[HttpMethod] = Field( HttpMethod.GET, description="The HTTP method used to fetch data from the source (can be GET or POST).", diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index cec9aff25..87048a005 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -241,6 +241,9 @@ from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( GzipParser as GzipParserModel, ) +from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( + HTTPAPIBudget as HTTPAPIBudgetModel, +) from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( HttpComponentsResolver as HttpComponentsResolverModel, ) @@ -488,6 +491,7 @@ NoopMessageRepository, ) from airbyte_cdk.sources.streams.call_rate import ( + APIBudget, FixedWindowCallRatePolicy, HttpAPIBudget, HttpRequestMatcher, @@ -546,6 +550,7 @@ def __init__( self._evaluate_log_level(emit_connector_builder_messages) ) self._connector_state_manager = connector_state_manager or ConnectorStateManager() + self._api_budget: Optional[Union[APIBudget, HttpAPIBudget]] = None def _init_mappings(self) -> None: self.PYDANTIC_MODEL_TO_CONSTRUCTOR: Mapping[Type[BaseModel], Callable[..., Any]] = { @@ -634,6 +639,7 @@ def _init_mappings(self) -> None: ComponentMappingDefinitionModel: self.create_components_mapping_definition, ZipfileDecoderModel: self.create_zipfile_decoder, APIBudgetModel: self.create_api_budget, + HTTPAPIBudgetModel: self.create_http_api_budget, FixedWindowCallRatePolicyModel: self.create_fixed_window_call_rate_policy, MovingWindowCallRatePolicyModel: self.create_moving_window_call_rate_policy, UnlimitedCallRatePolicyModel: self.create_unlimited_call_rate_policy, @@ -845,8 +851,7 @@ def create_legacy_to_per_partition_state_migration( return LegacyToPerPartitionStateMigration( partition_router, # type: ignore # was already checked above - declarative_stream.incremental_sync, - # type: ignore # was already checked. Migration can be applied only to incremental streams. + declarative_stream.incremental_sync, # type: ignore # was already checked. Migration can be applied only to incremental streams. config, declarative_stream.parameters, # type: ignore # different type is expected here Mapping[str, Any], got Dict[str, Any] ) @@ -1144,8 +1149,7 @@ def create_concurrent_cursor_from_datetime_based_cursor( clamping_strategy = DayClampingStrategy() end_date_provider = ClampingEndProvider( DayClampingStrategy(is_ceiling=False), - end_date_provider, - # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice + end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. 
Confirmed functionality is working in practice granularity=cursor_granularity or datetime.timedelta(seconds=1), ) case "WEEK": @@ -1162,16 +1166,14 @@ def create_concurrent_cursor_from_datetime_based_cursor( clamping_strategy = WeekClampingStrategy(weekday) end_date_provider = ClampingEndProvider( WeekClampingStrategy(weekday, is_ceiling=False), - end_date_provider, - # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice + end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice granularity=cursor_granularity or datetime.timedelta(days=1), ) case "MONTH": clamping_strategy = MonthClampingStrategy() end_date_provider = ClampingEndProvider( MonthClampingStrategy(is_ceiling=False), - end_date_provider, - # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice + end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice granularity=cursor_granularity or datetime.timedelta(days=1), ) case _: @@ -1188,10 +1190,8 @@ def create_concurrent_cursor_from_datetime_based_cursor( connector_state_converter=connector_state_converter, cursor_field=cursor_field, slice_boundary_fields=slice_boundary_fields, - start=start_date, - # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice - end_provider=end_date_provider, - # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice + start=start_date, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice + end_provider=end_date_provider, # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. 
Confirmed functionality is working in practice lookback_window=lookback_window, slice_range=step_length, cursor_granularity=cursor_granularity, @@ -1949,11 +1949,7 @@ def create_http_requester( ) ) - api_budget = ( - self._create_component_from_model(model=model.api_budget, config=config) - if model.api_budget - else None - ) + api_budget = self._api_budget request_options_provider = InterpolatedRequestOptionsProvider( request_body_data=model.request_body_data, @@ -2965,8 +2961,21 @@ def _is_supported_parser_for_pagination(self, parser: Parser) -> bool: else: return False - def create_api_budget( - self, model: APIBudgetModel, config: Config, **kwargs: Any + def create_api_budget(self, model: APIBudgetModel, config: Config, **kwargs: Any) -> APIBudget: + policies = [ + self._create_component_from_model(model=policy, config=config) + for policy in model.policies + ] + + return APIBudget( + policies=policies, + maximum_attempts_to_acquire=model.maximum_attempts_to_acquire + if model.maximum_attempts_to_acquire + else 100000, + ) + + def create_http_api_budget( + self, model: HTTPAPIBudgetModel, config: Config, **kwargs: Any ) -> HttpAPIBudget: policies = [ self._create_component_from_model(model=policy, config=config) @@ -2975,10 +2984,18 @@ def create_api_budget( return HttpAPIBudget( policies=policies, - ratelimit_reset_header=model.ratelimit_reset_header, - ratelimit_remaining_header=model.ratelimit_remaining_header, - status_codes_for_ratelimit_hit=model.status_codes_for_ratelimit_hit, - maximum_attempts_to_acquire=model.maximum_attempts_to_acquire, + maximum_attempts_to_acquire=model.maximum_attempts_to_acquire + if model.maximum_attempts_to_acquire + else 100000, + ratelimit_reset_header=model.ratelimit_reset_header + if model.ratelimit_reset_header + else "ratelimit-reset", + ratelimit_remaining_header=model.ratelimit_remaining_header + if model.ratelimit_remaining_header + else "ratelimit-remaining", + status_codes_for_ratelimit_hit=model.status_codes_for_ratelimit_hit + if model.status_codes_for_ratelimit_hit + else (429,), ) def create_fixed_window_call_rate_policy( @@ -3033,7 +3050,23 @@ def create_http_request_matcher( ) -> HttpRequestMatcher: return HttpRequestMatcher( method=model.method, - url=model.url, + url_base=model.url_base, + url_path_pattern=model.url_path_pattern, params=model.params, headers=model.headers, ) + + def set_api_budget(self, component_definition: ComponentDefinition, config: Config) -> None: + model_str = component_definition.get("type") + if model_str == "APIBudget": + # Annotate model_type as a type that is a subclass of BaseModel + model_type: Union[Type[APIBudgetModel], Type[HTTPAPIBudgetModel]] = APIBudgetModel + elif model_str == "HTTPAPIBudget": + model_type = HTTPAPIBudgetModel + else: + raise ValueError(f"Unknown API Budget type: {model_str}") + + # create_component expects a type[BaseModel] and returns an instance of that model. 
+ self._api_budget = self.create_component( + model_type=model_type, component_definition=component_definition, config=config + ) diff --git a/airbyte_cdk/sources/streams/call_rate.py b/airbyte_cdk/sources/streams/call_rate.py index 81ebac78e..d25fb9c2b 100644 --- a/airbyte_cdk/sources/streams/call_rate.py +++ b/airbyte_cdk/sources/streams/call_rate.py @@ -6,10 +6,12 @@ import dataclasses import datetime import logging +import re import time +from dataclasses import InitVar, dataclass, field from datetime import timedelta from threading import RLock -from typing import TYPE_CHECKING, Any, Mapping, Optional +from typing import TYPE_CHECKING, Any, Mapping, Optional, Union from urllib import parse import requests @@ -98,43 +100,55 @@ def __call__(self, request: Any) -> bool: class HttpRequestMatcher(RequestMatcher): - """Simple implementation of RequestMatcher for http requests case""" + """ + Extended RequestMatcher for HTTP requests that supports matching on: + - HTTP method (case-insensitive) + - URL base (scheme + netloc) optionally + - URL path pattern (a regex applied to the path portion of the URL) + - Query parameters (must be present) + - Headers (header names compared case-insensitively) + """ def __init__( self, method: Optional[str] = None, - url: Optional[str] = None, + url_base: Optional[str] = None, + url_path_pattern: Optional[str] = None, params: Optional[Mapping[str, Any]] = None, headers: Optional[Mapping[str, Any]] = None, ): - """Constructor - - :param method: - :param url: - :param params: - :param headers: """ - self._method = method - self._url = url + :param method: HTTP method (e.g. "GET", "POST"); compared case-insensitively. + :param url_base: Base URL (scheme://host) that must match. + :param url_path_pattern: A regex pattern that will be applied to the path portion of the URL. + :param params: Dictionary of query parameters that must be present in the request. + :param headers: Dictionary of headers that must be present (header keys are compared case-insensitively). + """ + self._method = method.upper() if method else None + + # Normalize the url_base if provided: remove trailing slash. + self._url_base = url_base.rstrip("/") if url_base else None + + # Compile the URL path pattern if provided. + self._url_path_pattern = re.compile(url_path_pattern) if url_path_pattern else None + + # Normalize query parameters to strings. self._params = {str(k): str(v) for k, v in (params or {}).items()} - self._headers = {str(k): str(v) for k, v in (headers or {}).items()} + + # Normalize header keys to lowercase. + self._headers = {str(k).lower(): str(v) for k, v in (headers or {}).items()} @staticmethod def _match_dict(obj: Mapping[str, Any], pattern: Mapping[str, Any]) -> bool: - """Check that all elements from pattern dict present and have the same values in obj dict - - :param obj: - :param pattern: - :return: - """ + """Check that every key/value in the pattern exists in the object.""" return pattern.items() <= obj.items() def __call__(self, request: Any) -> bool: """ - - :param request: - :return: True if matches the provided request object, False - otherwise + :param request: A requests.Request or requests.PreparedRequest instance. + :return: True if the request matches all provided criteria; False otherwise. """ + # Prepare the request (if needed) and extract the URL details. 
if isinstance(request, requests.Request): prepared_request = request.prepare() elif isinstance(request, requests.PreparedRequest): @@ -142,21 +156,40 @@ def __call__(self, request: Any) -> bool: else: return False - if self._method is not None: - if prepared_request.method != self._method: + # Check HTTP method. + if self._method is not None and prepared_request.method is not None: + if prepared_request.method.upper() != self._method: return False - if self._url is not None and prepared_request.url is not None: - url_without_params = prepared_request.url.split("?")[0] - if url_without_params != self._url: + + # Parse the URL. + parsed_url = parse.urlsplit(prepared_request.url) + # Reconstruct the base: scheme://netloc + request_url_base = f"{str(parsed_url.scheme)}://{str(parsed_url.netloc)}" + # The path (without query parameters) + request_path = str(parsed_url.path).rstrip("/") + + # If a base URL is provided, check that it matches. + if self._url_base is not None: + if request_url_base != self._url_base: return False - if self._params is not None: - parsed_url = parse.urlsplit(prepared_request.url) - params = dict(parse.parse_qsl(str(parsed_url.query))) - if not self._match_dict(params, self._params): + + # If a URL path pattern is provided, ensure the path matches the regex. + if self._url_path_pattern is not None: + if not self._url_path_pattern.search(request_path): return False - if self._headers is not None: - if not self._match_dict(prepared_request.headers, self._headers): + + # Check query parameters. + if self._params: + query_params = dict(parse.parse_qsl(str(parsed_url.query))) + if not self._match_dict(query_params, self._params): return False + + # Check headers (normalize keys to lower-case). + if self._headers: + req_headers = {k.lower(): v for k, v in prepared_request.headers.items()} + if not self._match_dict(req_headers, self._headers): + return False + return True @@ -399,24 +432,17 @@ def update_from_response(self, request: Any, response: Any) -> None: """ +@dataclass class APIBudget(AbstractAPIBudget): - """Default APIBudget implementation""" - - def __init__( - self, policies: list[AbstractCallRatePolicy], maximum_attempts_to_acquire: int = 100000 - ) -> None: - """Constructor - - :param policies: list of policies in this budget - :param maximum_attempts_to_acquire: number of attempts before throwing hit ratelimit exception, we put some big number here - to avoid situations when many threads compete with each other for a few lots over a significant amount of time - """ + """ + Default APIBudget implementation. 
+ """ - self._policies = policies - self._maximum_attempts_to_acquire = maximum_attempts_to_acquire + policies: list[AbstractCallRatePolicy] + maximum_attempts_to_acquire: int = 100000 def get_matching_policy(self, request: Any) -> Optional[AbstractCallRatePolicy]: - for policy in self._policies: + for policy in self.policies: if policy.matches(request): return policy return None @@ -437,7 +463,7 @@ def acquire_call( policy = self.get_matching_policy(request) if policy: self._do_acquire(request=request, policy=policy, block=block, timeout=timeout) - elif self._policies: + elif self.policies: logger.info("no policies matched with requests, allow call by default") def update_from_response(self, request: Any, response: Any) -> None: @@ -460,7 +486,7 @@ def _do_acquire( """ last_exception = None # sometimes we spend all budget before a second attempt, so we have few more here - for attempt in range(1, self._maximum_attempts_to_acquire): + for attempt in range(1, self.maximum_attempts_to_acquire): try: policy.try_acquire(request, weight=1) return @@ -484,31 +510,18 @@ def _do_acquire( if last_exception: logger.info( - "we used all %s attempts to acquire and failed", self._maximum_attempts_to_acquire + "we used all %s attempts to acquire and failed", self.maximum_attempts_to_acquire ) raise last_exception +@dataclass class HttpAPIBudget(APIBudget): """Implementation of AbstractAPIBudget for HTTP""" - def __init__( - self, - ratelimit_reset_header: str = "ratelimit-reset", - ratelimit_remaining_header: str = "ratelimit-remaining", - status_codes_for_ratelimit_hit: tuple[int] = (429,), - **kwargs: Any, - ): - """Constructor - - :param ratelimit_reset_header: name of the header that has a timestamp of the next reset of call budget - :param ratelimit_remaining_header: name of the header that has the number of calls left - :param status_codes_for_ratelimit_hit: list of HTTP status codes that signal about rate limit being hit - """ - self._ratelimit_reset_header = ratelimit_reset_header - self._ratelimit_remaining_header = ratelimit_remaining_header - self._status_codes_for_ratelimit_hit = status_codes_for_ratelimit_hit - super().__init__(**kwargs) + ratelimit_reset_header: str = "ratelimit-reset" + ratelimit_remaining_header: str = "ratelimit-remaining" + status_codes_for_ratelimit_hit: Union[tuple[int], list[int]] = (429,) def update_from_response(self, request: Any, response: Any) -> None: policy = self.get_matching_policy(request) @@ -523,17 +536,17 @@ def update_from_response(self, request: Any, response: Any) -> None: def get_reset_ts_from_response( self, response: requests.Response ) -> Optional[datetime.datetime]: - if response.headers.get(self._ratelimit_reset_header): + if response.headers.get(self.ratelimit_reset_header): return datetime.datetime.fromtimestamp( - int(response.headers[self._ratelimit_reset_header]) + int(response.headers[self.ratelimit_reset_header]) ) return None def get_calls_left_from_response(self, response: requests.Response) -> Optional[int]: - if response.headers.get(self._ratelimit_remaining_header): - return int(response.headers[self._ratelimit_remaining_header]) + if response.headers.get(self.ratelimit_remaining_header): + return int(response.headers[self.ratelimit_remaining_header]) - if response.status_code in self._status_codes_for_ratelimit_hit: + if response.status_code in self.status_codes_for_ratelimit_hit: return 0 return None From 040ff9e5ec97af3fd7e56bf18fb46a5e70273153 Mon Sep 17 00:00:00 2001 From: Anatolii Yatsuk Date: Thu, 6 Feb 2025 20:46:27 +0200 
Subject: [PATCH 03/26] Format --- .../parsers/model_to_component_factory.py | 20 +++++-------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 87048a005..0ae7e9572 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -2969,9 +2969,7 @@ def create_api_budget(self, model: APIBudgetModel, config: Config, **kwargs: Any return APIBudget( policies=policies, - maximum_attempts_to_acquire=model.maximum_attempts_to_acquire - if model.maximum_attempts_to_acquire - else 100000, + maximum_attempts_to_acquire=model.maximum_attempts_to_acquire or 100000, ) def create_http_api_budget( @@ -2984,18 +2982,10 @@ def create_http_api_budget( return HttpAPIBudget( policies=policies, - maximum_attempts_to_acquire=model.maximum_attempts_to_acquire - if model.maximum_attempts_to_acquire - else 100000, - ratelimit_reset_header=model.ratelimit_reset_header - if model.ratelimit_reset_header - else "ratelimit-reset", - ratelimit_remaining_header=model.ratelimit_remaining_header - if model.ratelimit_remaining_header - else "ratelimit-remaining", - status_codes_for_ratelimit_hit=model.status_codes_for_ratelimit_hit - if model.status_codes_for_ratelimit_hit - else (429,), + maximum_attempts_to_acquire=model.maximum_attempts_to_acquire or 100000, + ratelimit_reset_header=model.ratelimit_reset_header or "ratelimit-reset", + ratelimit_remaining_header=model.ratelimit_remaining_header or "ratelimit-remaining", + status_codes_for_ratelimit_hit=model.status_codes_for_ratelimit_hit or (429,), ) def create_fixed_window_call_rate_policy( From 15f830ca5be3ad69cc8065a5de43098d0a1ab110 Mon Sep 17 00:00:00 2001 From: Anatolii Yatsuk Date: Fri, 7 Feb 2025 17:43:53 +0200 Subject: [PATCH 04/26] Update for backward compatibility --- .../declarative_component_schema.yaml | 8 +- .../models/declarative_component_schema.py | 8 +- .../parsers/model_to_component_factory.py | 12 +-- airbyte_cdk/sources/streams/call_rate.py | 63 +++++++++++++ unit_tests/sources/streams/test_call_rate.py | 88 +++++++++++++++++++ 5 files changed, 165 insertions(+), 14 deletions(-) diff --git a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml index abcddf514..25c9492fb 100644 --- a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml +++ b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml @@ -1478,7 +1478,7 @@ definitions: description: List of matchers that define which requests this policy applies to. type: array items: - "$ref": "#/definitions/HttpRequestMatcher" + "$ref": "#/definitions/HttpRequestRegexMatcher" additionalProperties: true MovingWindowCallRatePolicy: title: Moving Window Call Rate Policy @@ -1503,7 +1503,7 @@ definitions: description: List of matchers that define which requests this policy applies to. type: array items: - "$ref": "#/definitions/HttpRequestMatcher" + "$ref": "#/definitions/HttpRequestRegexMatcher" additionalProperties: true UnlimitedCallRatePolicy: title: Unlimited Call Rate Policy @@ -1521,7 +1521,7 @@ definitions: description: List of matchers that define which requests this policy applies to. 
type: array items: - "$ref": "#/definitions/HttpRequestMatcher" + "$ref": "#/definitions/HttpRequestRegexMatcher" additionalProperties: true Rate: title: Rate @@ -1541,7 +1541,7 @@ definitions: type: string format: duration additionalProperties: true - HttpRequestMatcher: + HttpRequestRegexMatcher: title: HTTP Request Matcher description: > Matches HTTP requests based on method, base URL, URL path pattern, query parameters, and headers. diff --git a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py index 5bd0aa80d..aaff67548 100644 --- a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py +++ b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py @@ -657,7 +657,7 @@ class Config: ) -class HttpRequestMatcher(BaseModel): +class HttpRequestRegexMatcher(BaseModel): class Config: extra = Extra.allow @@ -1642,7 +1642,7 @@ class Config: description="The maximum number of calls allowed within the period.", title="Call Limit", ) - matchers: List[HttpRequestMatcher] = Field( + matchers: List[HttpRequestRegexMatcher] = Field( ..., description="List of matchers that define which requests this policy applies to.", title="Matchers", @@ -1659,7 +1659,7 @@ class Config: description="List of rates that define the call limits for different time intervals.", title="Rates", ) - matchers: List[HttpRequestMatcher] = Field( + matchers: List[HttpRequestRegexMatcher] = Field( ..., description="List of matchers that define which requests this policy applies to.", title="Matchers", @@ -1671,7 +1671,7 @@ class Config: extra = Extra.allow type: Literal["UnlimitedCallRatePolicy"] - matchers: List[HttpRequestMatcher] = Field( + matchers: List[HttpRequestRegexMatcher] = Field( ..., description="List of matchers that define which requests this policy applies to.", title="Matchers", diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 6f3f39604..9bd775a4a 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -251,7 +251,7 @@ HttpRequester as HttpRequesterModel, ) from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( - HttpRequestMatcher as HttpRequestMatcherModel, + HttpRequestRegexMatcher as HttpRequestRegexMatcherModel, ) from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( HttpResponseFilter as HttpResponseFilterModel, @@ -494,7 +494,7 @@ APIBudget, FixedWindowCallRatePolicy, HttpAPIBudget, - HttpRequestMatcher, + HttpRequestRegexMatcher, MovingWindowCallRatePolicy, Rate, UnlimitedCallRatePolicy, @@ -644,7 +644,7 @@ def _init_mappings(self) -> None: MovingWindowCallRatePolicyModel: self.create_moving_window_call_rate_policy, UnlimitedCallRatePolicyModel: self.create_unlimited_call_rate_policy, RateModel: self.create_rate, - HttpRequestMatcherModel: self.create_http_request_matcher, + HttpRequestRegexMatcherModel: self.create_http_request_matcher, } # Needed for the case where we need to perform a second parse on the fields of a custom component @@ -3040,9 +3040,9 @@ def create_rate(self, model: RateModel, config: Config, **kwargs: Any) -> Rate: ) def create_http_request_matcher( - self, model: HttpRequestMatcherModel, config: Config, **kwargs: Any - ) -> HttpRequestMatcher: - return HttpRequestMatcher( + self, model: 
HttpRequestRegexMatcherModel, config: Config, **kwargs: Any + ) -> HttpRequestRegexMatcher: + return HttpRequestRegexMatcher( method=model.method, url_base=model.url_base, url_path_pattern=model.url_path_pattern, diff --git a/airbyte_cdk/sources/streams/call_rate.py b/airbyte_cdk/sources/streams/call_rate.py index d25fb9c2b..21fec881f 100644 --- a/airbyte_cdk/sources/streams/call_rate.py +++ b/airbyte_cdk/sources/streams/call_rate.py @@ -100,6 +100,69 @@ def __call__(self, request: Any) -> bool: class HttpRequestMatcher(RequestMatcher): + """Simple implementation of RequestMatcher for http requests case""" + + def __init__( + self, + method: Optional[str] = None, + url: Optional[str] = None, + params: Optional[Mapping[str, Any]] = None, + headers: Optional[Mapping[str, Any]] = None, + ): + """Constructor + + :param method: + :param url: + :param params: + :param headers: + """ + self._method = method + self._url = url + self._params = {str(k): str(v) for k, v in (params or {}).items()} + self._headers = {str(k): str(v) for k, v in (headers or {}).items()} + + @staticmethod + def _match_dict(obj: Mapping[str, Any], pattern: Mapping[str, Any]) -> bool: + """Check that all elements from pattern dict present and have the same values in obj dict + + :param obj: + :param pattern: + :return: + """ + return pattern.items() <= obj.items() + + def __call__(self, request: Any) -> bool: + """ + + :param request: + :return: True if matches the provided request object, False - otherwise + """ + if isinstance(request, requests.Request): + prepared_request = request.prepare() + elif isinstance(request, requests.PreparedRequest): + prepared_request = request + else: + return False + + if self._method is not None: + if prepared_request.method != self._method: + return False + if self._url is not None and prepared_request.url is not None: + url_without_params = prepared_request.url.split("?")[0] + if url_without_params != self._url: + return False + if self._params is not None: + parsed_url = parse.urlsplit(prepared_request.url) + params = dict(parse.parse_qsl(str(parsed_url.query))) + if not self._match_dict(params, self._params): + return False + if self._headers is not None: + if not self._match_dict(prepared_request.headers, self._headers): + return False + return True + + +class HttpRequestRegexMatcher(RequestMatcher): """ Extended RequestMatcher for HTTP requests that supports matching on: - HTTP method (case-insensitive) diff --git a/unit_tests/sources/streams/test_call_rate.py b/unit_tests/sources/streams/test_call_rate.py index 16bce68e3..853e2997e 100644 --- a/unit_tests/sources/streams/test_call_rate.py +++ b/unit_tests/sources/streams/test_call_rate.py @@ -17,6 +17,7 @@ CallRateLimitHit, FixedWindowCallRatePolicy, HttpRequestMatcher, + HttpRequestRegexMatcher, MovingWindowCallRatePolicy, Rate, UnlimitedCallRatePolicy, @@ -357,3 +358,90 @@ def test_with_cache(self, mocker, requests_mock): assert next(records) == {"data": "some_data"} assert MovingWindowCallRatePolicy.try_acquire.call_count == 1 + + +class TestHttpRequestRegexMatcher: + """ + Tests for the new regex-based logic: + - Case-insensitive HTTP method matching + - Optional url_base (scheme://netloc) + - Regex-based path matching + - Query params (must be present) + - Headers (case-insensitive keys) + """ + + def test_case_insensitive_method(self): + matcher = HttpRequestRegexMatcher(method="GET") + + req_ok = Request("get", "https://example.com/test/path") + req_wrong = Request("POST", "https://example.com/test/path") + + assert 
matcher(req_ok) + assert not matcher(req_wrong) + + def test_url_base(self): + matcher = HttpRequestRegexMatcher(url_base="https://example.com") + + req_ok = Request("GET", "https://example.com/test/path?foo=bar") + req_wrong = Request("GET", "https://another.com/test/path?foo=bar") + + assert matcher(req_ok) + assert not matcher(req_wrong) + + def test_url_path_pattern(self): + matcher = HttpRequestRegexMatcher(url_path_pattern=r"/test/") + + req_ok = Request("GET", "https://example.com/test/something") + req_wrong = Request("GET", "https://example.com/other/something") + + assert matcher(req_ok) + assert not matcher(req_wrong) + + def test_query_params(self): + matcher = HttpRequestRegexMatcher(params={"foo": "bar"}) + + req_ok = Request("GET", "https://example.com/api?foo=bar&extra=123") + req_missing = Request("GET", "https://example.com/api?not_foo=bar") + + assert matcher(req_ok) + assert not matcher(req_missing) + + def test_headers_case_insensitive(self): + matcher = HttpRequestRegexMatcher(headers={"X-Custom-Header": "abc"}) + + req_ok = Request( + "GET", + "https://example.com/api?foo=bar", + headers={"x-custom-header": "abc", "other": "123"}, + ) + req_wrong = Request("GET", "https://example.com/api", headers={"x-custom-header": "wrong"}) + + assert matcher(req_ok) + assert not matcher(req_wrong) + + def test_combined_criteria(self): + matcher = HttpRequestRegexMatcher( + method="GET", + url_base="https://example.com", + url_path_pattern=r"/test/", + params={"foo": "bar"}, + headers={"X-Test": "123"}, + ) + + req_ok = Request("GET", "https://example.com/test/me?foo=bar", headers={"x-test": "123"}) + req_bad_base = Request( + "GET", "https://other.com/test/me?foo=bar", headers={"x-test": "123"} + ) + req_bad_path = Request("GET", "https://example.com/nope?foo=bar", headers={"x-test": "123"}) + req_bad_param = Request( + "GET", "https://example.com/test/me?extra=xyz", headers={"x-test": "123"} + ) + req_bad_header = Request( + "GET", "https://example.com/test/me?foo=bar", headers={"some-other-header": "xyz"} + ) + + assert matcher(req_ok) + assert not matcher(req_bad_base) + assert not matcher(req_bad_path) + assert not matcher(req_bad_param) + assert not matcher(req_bad_header) From 1285668eecf394e90d373490c561d506f808d73d Mon Sep 17 00:00:00 2001 From: Anatolii Yatsuk Date: Sun, 9 Feb 2025 22:26:53 +0200 Subject: [PATCH 05/26] Add unit tests --- .../test_model_to_component_factory.py | 80 +++++++++++++++++++ .../requesters/test_http_requester.py | 32 ++++++++ 2 files changed, 112 insertions(+) diff --git a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py index 43564a5c8..769bc52a0 100644 --- a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py +++ b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py @@ -142,6 +142,7 @@ from airbyte_cdk.sources.declarative.transformations import AddFields, RemoveFields from airbyte_cdk.sources.declarative.transformations.add_fields import AddedFieldDefinition from airbyte_cdk.sources.declarative.yaml_declarative_source import YamlDeclarativeSource +from airbyte_cdk.sources.streams.call_rate import MovingWindowCallRatePolicy from airbyte_cdk.sources.streams.concurrent.clamping import ( ClampingEndProvider, DayClampingStrategy, @@ -3564,3 +3565,82 @@ def test_create_async_retriever(): assert isinstance(selector, RecordSelector) assert isinstance(extractor, DpathExtractor) assert extractor.field_path == 
["data"] + + +def test_api_budget(): + manifest = { + "type": "DeclarativeSource", + "api_budget": { + "type": "HTTPAPIBudget", + "ratelimit_reset_header": "X-RateLimit-Reset", + "ratelimit_remaining_header": "X-RateLimit-Remaining", + "status_codes_for_ratelimit_hit": [429, 503], + "policies": [ + { + "type": "MovingWindowCallRatePolicy", + "rates": [ + { + "type": "Rate", + "limit": 3, + "interval": "PT0.1S", # 0.1 seconds + } + ], + "matchers": [ + { + "type": "HttpRequestRegexMatcher", + "method": "GET", + "url_base": "https://api.sendgrid.com", + "url_path_pattern": "/v3/marketing/lists", + } + ], + } + ], + }, + "my_requester": { + "type": "HttpRequester", + "path": "/v3/marketing/lists", + "url_base": "https://api.sendgrid.com", + "http_method": "GET", + "authenticator": { + "type": "BasicHttpAuthenticator", + "username": "admin", + "password": "{{ config['password'] }}", + }, + }, + } + + config = { + "password": "verysecrettoken", + } + + factory = ModelToComponentFactory() + if "api_budget" in manifest: + factory.set_api_budget(manifest["api_budget"], config) + + from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( + HttpRequester as HttpRequesterModel, + ) + + requester_definition = manifest["my_requester"] + assert requester_definition["type"] == "HttpRequester" + + http_requester = factory.create_component( + model_type=HttpRequesterModel, + component_definition=requester_definition, + config=config, + name="lists_stream", + decoder=None, + ) + + assert http_requester.api_budget is not None + assert http_requester.api_budget.ratelimit_reset_header == "X-RateLimit-Reset" + assert http_requester.api_budget.status_codes_for_ratelimit_hit == [429, 503] + assert len(http_requester.api_budget.policies) == 1 + + # The single policy is a MovingWindowCallRatePolicy + policy = http_requester.api_budget.policies[0] + assert isinstance(policy, MovingWindowCallRatePolicy) + assert policy._bucket.rates[0].limit == 3 + # The 0.1s from 'PT0.1S' is stored in ms by PyRateLimiter internally + # but here just check that the limit and interval exist + assert policy._bucket.rates[0].interval == 100 # 100 ms diff --git a/unit_tests/sources/declarative/requesters/test_http_requester.py b/unit_tests/sources/declarative/requesters/test_http_requester.py index f02ec206b..c5d5c218d 100644 --- a/unit_tests/sources/declarative/requesters/test_http_requester.py +++ b/unit_tests/sources/declarative/requesters/test_http_requester.py @@ -2,6 +2,7 @@ # Copyright (c) 2023 Airbyte, Inc., all rights reserved. 
# +from datetime import timedelta from typing import Any, Mapping, Optional from unittest import mock from unittest.mock import MagicMock @@ -9,6 +10,7 @@ import pytest as pytest import requests +import requests.sessions from requests import PreparedRequest from airbyte_cdk.sources.declarative.auth.declarative_authenticator import DeclarativeAuthenticator @@ -27,6 +29,12 @@ InterpolatedRequestOptionsProvider, ) from airbyte_cdk.sources.message import MessageRepository +from airbyte_cdk.sources.streams.call_rate import ( + AbstractAPIBudget, + HttpAPIBudget, + MovingWindowCallRatePolicy, + Rate, +) from airbyte_cdk.sources.streams.http.error_handlers.response_models import ResponseAction from airbyte_cdk.sources.streams.http.exceptions import ( RequestBodyException, @@ -45,6 +53,7 @@ def factory( request_options_provider: Optional[InterpolatedRequestOptionsProvider] = None, authenticator: Optional[DeclarativeAuthenticator] = None, error_handler: Optional[ErrorHandler] = None, + api_budget: Optional[HttpAPIBudget] = None, config: Optional[Config] = None, parameters: Mapping[str, Any] = None, disable_retries: bool = False, @@ -61,6 +70,7 @@ def factory( http_method=http_method, request_options_provider=request_options_provider, error_handler=error_handler, + api_budget=api_budget, disable_retries=disable_retries, message_repository=message_repository or MagicMock(), use_cache=use_cache, @@ -934,3 +944,25 @@ def test_backoff_strategy_from_manifest_is_respected(http_requester_factory: Any http_requester._http_client._request_attempt_count.get(request_mock) == http_requester._http_client._max_retries + 1 ) + + +def test_http_requester_with_mock_apibudget(http_requester_factory, monkeypatch): + mock_budget = MagicMock(spec=HttpAPIBudget) + + requester = http_requester_factory( + url_base="https://example.com", + path="test", + api_budget=mock_budget, + ) + + dummy_response = requests.Response() + dummy_response.status_code = 200 + send_mock = MagicMock(return_value=dummy_response) + monkeypatch.setattr(requests.Session, "send", send_mock) + + response = requester.send_request() + + assert send_mock.call_count == 1 + assert response.status_code == 200 + + assert mock_budget.acquire_call.call_count == 1 From 7be98423518c975e672629abbd4cb063048e55d2 Mon Sep 17 00:00:00 2001 From: Anatolii Yatsuk Date: Sun, 9 Feb 2025 22:38:57 +0200 Subject: [PATCH 06/26] Add FixedWindowCallRatePolicy unit test --- .../parsers/model_to_component_factory.py | 2 +- .../test_model_to_component_factory.py | 79 +++++++++++++++++++ 2 files changed, 80 insertions(+), 1 deletion(-) diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 9bd775a4a..b55d40fcd 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -3001,7 +3001,7 @@ def create_fixed_window_call_rate_policy( ] return FixedWindowCallRatePolicy( next_reset_ts=model.next_reset_ts, - period=parse_duration(model.period), + period=model.period, call_limit=model.call_limit, matchers=matchers, ) diff --git a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py index 769bc52a0..bc72ea36b 100644 --- a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py +++ b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py @@ -3644,3 
+3644,82 @@ def test_api_budget(): # The 0.1s from 'PT0.1S' is stored in ms by PyRateLimiter internally # but here just check that the limit and interval exist assert policy._bucket.rates[0].interval == 100 # 100 ms + + +def test_api_budget_fixed_window_policy(): + manifest = { + "type": "DeclarativeSource", + # Root-level api_budget referencing a FixedWindowCallRatePolicy + "api_budget": { + "type": "APIBudget", + "maximum_attempts_to_acquire": 9999, + "policies": [ + { + "type": "FixedWindowCallRatePolicy", + "next_reset_ts": "2025-01-01T00:00:00Z", + "period": "PT1M", # 1 minute + "call_limit": 10, + "matchers": [ + { + "type": "HttpRequestRegexMatcher", + "method": "GET", + "url_base": "https://example.org", + "url_path_pattern": "/v2/data", + } + ], + } + ], + }, + # We'll define a single HttpRequester that references that base + "my_requester": { + "type": "HttpRequester", + "path": "/v2/data", + "url_base": "https://example.org", + "http_method": "GET", + "authenticator": {"type": "NoAuth"}, + }, + } + + config = {} + + factory = ModelToComponentFactory() + if "api_budget" in manifest: + factory.set_api_budget(manifest["api_budget"], config) + + from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( + HttpRequester as HttpRequesterModel, + ) + + requester_definition = manifest["my_requester"] + assert requester_definition["type"] == "HttpRequester" + http_requester = factory.create_component( + model_type=HttpRequesterModel, + component_definition=requester_definition, + config=config, + name="my_stream", + decoder=None, + ) + + assert http_requester.api_budget is not None + assert http_requester.api_budget.maximum_attempts_to_acquire == 9999 + assert len(http_requester.api_budget.policies) == 1 + + from airbyte_cdk.sources.streams.call_rate import FixedWindowCallRatePolicy + + policy = http_requester.api_budget.policies[0] + assert isinstance(policy, FixedWindowCallRatePolicy) + assert policy._call_limit == 10 + # The period is "PT1M" => 60 seconds + assert policy._offset.total_seconds() == 60 + + expected_reset_dt = datetime(2025, 1, 1, 0, 0, 0, tzinfo=timezone.utc) + assert policy._next_reset_ts == expected_reset_dt + + assert len(policy._matchers) == 1 + matcher = policy._matchers[0] + from airbyte_cdk.sources.streams.call_rate import HttpRequestRegexMatcher + + assert isinstance(matcher, HttpRequestRegexMatcher) + assert matcher._method == "GET" + assert matcher._url_base == "https://example.org" + assert matcher._url_path_pattern.pattern == "/v2/data" From 8d3bfce9fef2442d46eab37cbe6e5d5f275c7ec5 Mon Sep 17 00:00:00 2001 From: Anatolii Yatsuk Date: Mon, 10 Feb 2025 11:24:38 +0200 Subject: [PATCH 07/26] Change the partitions limit to 1000 --- .../declarative/incremental/concurrent_partition_cursor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py b/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py index ab667c655..fd803df49 100644 --- a/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +++ b/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py @@ -58,7 +58,7 @@ class ConcurrentPerPartitionCursor(Cursor): CurrentPerPartitionCursor expects the state of the ConcurrentCursor to follow the format {cursor_field: cursor_value}. 
""" - DEFAULT_MAX_PARTITIONS_NUMBER = 10000 + DEFAULT_MAX_PARTITIONS_NUMBER = 1000 _NO_STATE: Mapping[str, Any] = {} _NO_CURSOR_STATE: Mapping[str, Any] = {} _GLOBAL_STATE_KEY = "state" From 509ea05575c146587d2d0c0970e09a886fee3a35 Mon Sep 17 00:00:00 2001 From: Anatolii Yatsuk Date: Mon, 10 Feb 2025 17:31:53 +0200 Subject: [PATCH 08/26] Refactored switching logic --- .../incremental/concurrent_partition_cursor.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py b/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py index fd803df49..f54a0297f 100644 --- a/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +++ b/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py @@ -58,7 +58,8 @@ class ConcurrentPerPartitionCursor(Cursor): CurrentPerPartitionCursor expects the state of the ConcurrentCursor to follow the format {cursor_field: cursor_value}. """ - DEFAULT_MAX_PARTITIONS_NUMBER = 1000 + DEFAULT_MAX_PARTITIONS_NUMBER = 10_000 + SWITCH_TO_GLOBAL_LIMIT = 1000 _NO_STATE: Mapping[str, Any] = {} _NO_CURSOR_STATE: Mapping[str, Any] = {} _GLOBAL_STATE_KEY = "state" @@ -99,7 +100,7 @@ def __init__( self._new_global_cursor: Optional[StreamState] = None self._lookback_window: int = 0 self._parent_state: Optional[StreamState] = None - self._over_limit: int = 0 + self._number_of_partitions: int = 0 self._use_global_cursor: bool = False self._partition_serializer = PerPartitionKeySerializer() @@ -233,8 +234,8 @@ def _ensure_partition_limit(self) -> None: or removed due to being the oldest. """ with self._lock: + self._number_of_partitions += 1 while len(self._cursor_per_partition) > self.DEFAULT_MAX_PARTITIONS_NUMBER - 1: - self._over_limit += 1 # Try removing finished partitions first for partition_key in list(self._cursor_per_partition.keys()): if ( @@ -245,7 +246,7 @@ def _ensure_partition_limit(self) -> None: partition_key ) # Remove the oldest partition logger.warning( - f"The maximum number of partitions has been reached. Dropping the oldest finished partition: {oldest_partition}. Over limit: {self._over_limit}." + f"The maximum number of partitions has been reached. Dropping the oldest finished partition: {oldest_partition}. Over limit: {self._number_of_partitions}." ) break else: @@ -254,7 +255,7 @@ def _ensure_partition_limit(self) -> None: 1 ] # Remove the oldest partition logger.warning( - f"The maximum number of partitions has been reached. Dropping the oldest partition: {oldest_partition}. Over limit: {self._over_limit}." + f"The maximum number of partitions has been reached. Dropping the oldest partition: {oldest_partition}. Over limit: {self._number_of_partitions}." ) def _set_initial_state(self, stream_state: StreamState) -> None: @@ -355,6 +356,10 @@ def _set_global_state(self, stream_state: Mapping[str, Any]) -> None: def observe(self, record: Record) -> None: if not self._use_global_cursor and self.limit_reached(): + logger.info( + f"Exceeded the 'SWITCH_TO_GLOBAL_LIMIT' of {self.SWITCH_TO_GLOBAL_LIMIT}. " + f"Switching to global cursor for {self._stream_name}." 
+ ) self._use_global_cursor = True if not record.associated_slice: @@ -397,4 +402,4 @@ def _get_cursor(self, record: Record) -> ConcurrentCursor: return cursor def limit_reached(self) -> bool: - return self._over_limit > self.DEFAULT_MAX_PARTITIONS_NUMBER + return self._number_of_partitions > self.SWITCH_TO_GLOBAL_LIMIT From 8d44150ce61cb38aaf4e9ce30183ef43f3a7a0fd Mon Sep 17 00:00:00 2001 From: Anatolii Yatsuk Date: Mon, 10 Feb 2025 20:53:06 +0200 Subject: [PATCH 09/26] Increase the limit for number of partitions in memory --- .../declarative/incremental/concurrent_partition_cursor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py b/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py index f54a0297f..d69b61bfd 100644 --- a/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +++ b/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py @@ -58,7 +58,7 @@ class ConcurrentPerPartitionCursor(Cursor): CurrentPerPartitionCursor expects the state of the ConcurrentCursor to follow the format {cursor_field: cursor_value}. """ - DEFAULT_MAX_PARTITIONS_NUMBER = 10_000 + DEFAULT_MAX_PARTITIONS_NUMBER = 25_000 SWITCH_TO_GLOBAL_LIMIT = 1000 _NO_STATE: Mapping[str, Any] = {} _NO_CURSOR_STATE: Mapping[str, Any] = {} From 342375c5fc1017a8738fbc0a7166695f24388801 Mon Sep 17 00:00:00 2001 From: Anatolii Yatsuk Date: Wed, 12 Feb 2025 15:42:21 +0200 Subject: [PATCH 10/26] Refactor ConcurrentPerPartitionCursor to not use ConcurrentCursor with `_use_global_cursor` --- .../declarative_component_schema.yaml | 207 ------------------ .../concurrent_partition_cursor.py | 29 ++- .../manifest_declarative_source.py | 4 - .../models/declarative_component_schema.py | 165 -------------- .../parsers/model_to_component_factory.py | 141 ------------ .../declarative/requesters/http_requester.py | 3 - airbyte_cdk/sources/streams/call_rate.py | 156 ++++--------- 7 files changed, 59 insertions(+), 646 deletions(-) diff --git a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml index 25c9492fb..b0242c94f 100644 --- a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml +++ b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml @@ -40,12 +40,6 @@ properties: "$ref": "#/definitions/Spec" concurrency_level: "$ref": "#/definitions/ConcurrencyLevel" - api_budget: - title: API Budget - description: Defines how many requests can be made to the API in a given time frame. This field accepts either a generic APIBudget or an HTTP-specific configuration (HTTPAPIBudget) to be applied across all streams. - anyOf: - - "$ref": "#/definitions/APIBudget" - - "$ref": "#/definitions/HTTPAPIBudget" metadata: type: object description: For internal Airbyte use only - DO NOT modify manually. Used by consumers of declarative manifests for storing related metadata. @@ -1371,207 +1365,6 @@ definitions: $parameters: type: object additional_properties: true - APIBudget: - title: API Budget - description: > - A generic API budget configuration that defines the policies (rate limiting rules) - and the maximum number of attempts to acquire a call credit. This budget does not automatically - update itself based on HTTP response headers. 
- type: object - required: - - type - - policies - properties: - type: - type: string - enum: [APIBudget] - policies: - title: Policies - description: List of call rate policies that define how many calls are allowed. - type: array - items: - anyOf: - - "$ref": "#/definitions/FixedWindowCallRatePolicy" - - "$ref": "#/definitions/MovingWindowCallRatePolicy" - - "$ref": "#/definitions/UnlimitedCallRatePolicy" - maximum_attempts_to_acquire: - title: Maximum Attempts to Acquire - description: The maximum number of attempts to acquire a call before giving up. - type: integer - default: 100000 - additionalProperties: true - HTTPAPIBudget: - title: HTTP API Budget - description: > - An HTTP-specific API budget that extends APIBudget by updating rate limiting information based - on HTTP response headers. It extracts available calls and the next reset timestamp from the HTTP responses. - type: object - required: - - type - - policies - properties: - type: - type: string - enum: [HTTPAPIBudget] - policies: - title: Policies - description: List of call rate policies that define how many calls are allowed. - type: array - items: - anyOf: - - "$ref": "#/definitions/FixedWindowCallRatePolicy" - - "$ref": "#/definitions/MovingWindowCallRatePolicy" - - "$ref": "#/definitions/UnlimitedCallRatePolicy" - ratelimit_reset_header: - title: Rate Limit Reset Header - description: The HTTP response header name that indicates when the rate limit resets. - type: string - default: "ratelimit-reset" - ratelimit_remaining_header: - title: Rate Limit Remaining Header - description: The HTTP response header name that indicates the number of remaining allowed calls. - type: string - default: "ratelimit-remaining" - status_codes_for_ratelimit_hit: - title: Status Codes for Rate Limit Hit - description: List of HTTP status codes that indicate a rate limit has been hit. - type: array - items: - type: integer - default: [429] - maximum_attempts_to_acquire: - title: Maximum Attempts to Acquire - description: The maximum number of attempts to acquire a call before giving up. - type: integer - default: 100000 - additionalProperties: true - FixedWindowCallRatePolicy: - title: Fixed Window Call Rate Policy - description: A policy that allows a fixed number of calls within a specific time window. - type: object - required: - - type - - next_reset_ts - - period - - call_limit - - matchers - properties: - type: - type: string - enum: [FixedWindowCallRatePolicy] - next_reset_ts: - title: Next Reset Timestamp - description: The timestamp when the rate limit will reset. - type: string - format: date-time - period: - title: Period - description: The time interval for the rate limit window. - type: string - format: duration - call_limit: - title: Call Limit - description: The maximum number of calls allowed within the period. - type: integer - matchers: - title: Matchers - description: List of matchers that define which requests this policy applies to. - type: array - items: - "$ref": "#/definitions/HttpRequestRegexMatcher" - additionalProperties: true - MovingWindowCallRatePolicy: - title: Moving Window Call Rate Policy - description: A policy that allows a fixed number of calls within a moving time window. - type: object - required: - - type - - rates - - matchers - properties: - type: - type: string - enum: [MovingWindowCallRatePolicy] - rates: - title: Rates - description: List of rates that define the call limits for different time intervals. 
- type: array - items: - "$ref": "#/definitions/Rate" - matchers: - title: Matchers - description: List of matchers that define which requests this policy applies to. - type: array - items: - "$ref": "#/definitions/HttpRequestRegexMatcher" - additionalProperties: true - UnlimitedCallRatePolicy: - title: Unlimited Call Rate Policy - description: A policy that allows unlimited calls for specific requests. - type: object - required: - - type - - matchers - properties: - type: - type: string - enum: [UnlimitedCallRatePolicy] - matchers: - title: Matchers - description: List of matchers that define which requests this policy applies to. - type: array - items: - "$ref": "#/definitions/HttpRequestRegexMatcher" - additionalProperties: true - Rate: - title: Rate - description: Defines a rate limit with a specific number of calls allowed within a time interval. - type: object - required: - - limit - - interval - properties: - limit: - title: Limit - description: The maximum number of calls allowed within the interval. - type: integer - interval: - title: Interval - description: The time interval for the rate limit. - type: string - format: duration - additionalProperties: true - HttpRequestRegexMatcher: - title: HTTP Request Matcher - description: > - Matches HTTP requests based on method, base URL, URL path pattern, query parameters, and headers. - Use `url_base` to specify the scheme and host (without trailing slash) and - `url_path_pattern` to apply a regex to the request path. - type: object - properties: - method: - title: Method - description: The HTTP method to match (e.g., GET, POST). - type: string - url_base: - title: URL Base - description: The base URL (scheme and host, e.g. "https://api.example.com") to match. - type: string - url_path_pattern: - title: URL Path Pattern - description: A regular expression pattern to match the URL path. - type: string - params: - title: Parameters - description: The query parameters to match. - type: object - additionalProperties: true - headers: - title: Headers - description: The headers to match. - type: object - additionalProperties: true - additionalProperties: true DefaultErrorHandler: title: Default Error Handler description: Component defining how to handle errors. Default behavior includes only retrying server errors (HTTP 5XX) and too many requests (HTTP 429) with an exponential backoff. 
diff --git a/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py b/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py index d69b61bfd..fc75ecd90 100644 --- a/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +++ b/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py @@ -142,7 +142,8 @@ def close_partition(self, partition: Partition) -> None: raise ValueError("stream_slice cannot be None") partition_key = self._to_partition_key(stream_slice.partition) - self._cursor_per_partition[partition_key].close_partition(partition=partition) + if not self._use_global_cursor: + self._cursor_per_partition[partition_key].close_partition(partition=partition) with self._lock: self._semaphore_per_partition[partition_key].acquire() cursor = self._cursor_per_partition[partition_key] @@ -150,12 +151,7 @@ def close_partition(self, partition: Partition) -> None: partition_key in self._finished_partitions and self._semaphore_per_partition[partition_key]._value == 0 ): - if ( - self._new_global_cursor is None - or self._new_global_cursor[self.cursor_field.cursor_field_key] - < cursor.state[self.cursor_field.cursor_field_key] - ): - self._new_global_cursor = copy.deepcopy(cursor.state) + self._update_global_cursor(cursor.state[self.cursor_field.cursor_field_key]) if not self._use_global_cursor: self._emit_state_message() @@ -366,9 +362,22 @@ def observe(self, record: Record) -> None: raise ValueError( "Invalid state as stream slices that are emitted should refer to an existing cursor" ) - self._cursor_per_partition[ - self._to_partition_key(record.associated_slice.partition) - ].observe(record) + + record_cursor = self._connector_state_converter.parse_value( + self._cursor_field.extract_value(record) + ) + self._update_global_cursor(record_cursor) + if not self._use_global_cursor: + self._cursor_per_partition[ + self._to_partition_key(record.associated_slice.partition) + ].observe(record) + + def _update_global_cursor(self, value: Mapping[str, Any]) -> None: + if ( + self._new_global_cursor is None + or self._new_global_cursor[self.cursor_field.cursor_field_key] < value + ): + self._new_global_cursor = {self.cursor_field.cursor_field_key: copy.deepcopy(value)} def _to_partition_key(self, partition: Mapping[str, Any]) -> str: return self._partition_serializer.to_partition_key(partition) diff --git a/airbyte_cdk/sources/declarative/manifest_declarative_source.py b/airbyte_cdk/sources/declarative/manifest_declarative_source.py index d3afb1396..efc779464 100644 --- a/airbyte_cdk/sources/declarative/manifest_declarative_source.py +++ b/airbyte_cdk/sources/declarative/manifest_declarative_source.py @@ -137,10 +137,6 @@ def streams(self, config: Mapping[str, Any]) -> List[Stream]: self._source_config, config ) - api_budget_model = self._source_config.get("api_budget") - if api_budget_model: - self._constructor.set_api_budget(api_budget_model, config) - source_streams = [ self._constructor.create_component( DeclarativeStreamModel, diff --git a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py index aaff67548..fe29cee2c 100644 --- a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py +++ b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py @@ -3,7 +3,6 @@ from __future__ import annotations -from datetime import datetime, timedelta from enum import Enum from typing import Any, Dict, List, Literal, 
Optional, Union @@ -643,45 +642,6 @@ class OAuthAuthenticator(BaseModel): parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters") -class Rate(BaseModel): - class Config: - extra = Extra.allow - - limit: int = Field( - ..., - description="The maximum number of calls allowed within the interval.", - title="Limit", - ) - interval: timedelta = Field( - ..., description="The time interval for the rate limit.", title="Interval" - ) - - -class HttpRequestRegexMatcher(BaseModel): - class Config: - extra = Extra.allow - - method: Optional[str] = Field( - None, description="The HTTP method to match (e.g., GET, POST).", title="Method" - ) - url_base: Optional[str] = Field( - None, - description='The base URL (scheme and host, e.g. "https://api.example.com") to match.', - title="URL Base", - ) - url_path_pattern: Optional[str] = Field( - None, - description="A regular expression pattern to match the URL path.", - title="URL Path Pattern", - ) - params: Optional[Dict[str, Any]] = Field( - None, description="The query parameters to match.", title="Parameters" - ) - headers: Optional[Dict[str, Any]] = Field( - None, description="The headers to match.", title="Headers" - ) - - class DpathExtractor(BaseModel): type: Literal["DpathExtractor"] field_path: List[str] = Field( @@ -1624,60 +1584,6 @@ class DatetimeBasedCursor(BaseModel): parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters") -class FixedWindowCallRatePolicy(BaseModel): - class Config: - extra = Extra.allow - - type: Literal["FixedWindowCallRatePolicy"] - next_reset_ts: datetime = Field( - ..., - description="The timestamp when the rate limit will reset.", - title="Next Reset Timestamp", - ) - period: timedelta = Field( - ..., description="The time interval for the rate limit window.", title="Period" - ) - call_limit: int = Field( - ..., - description="The maximum number of calls allowed within the period.", - title="Call Limit", - ) - matchers: List[HttpRequestRegexMatcher] = Field( - ..., - description="List of matchers that define which requests this policy applies to.", - title="Matchers", - ) - - -class MovingWindowCallRatePolicy(BaseModel): - class Config: - extra = Extra.allow - - type: Literal["MovingWindowCallRatePolicy"] - rates: List[Rate] = Field( - ..., - description="List of rates that define the call limits for different time intervals.", - title="Rates", - ) - matchers: List[HttpRequestRegexMatcher] = Field( - ..., - description="List of matchers that define which requests this policy applies to.", - title="Matchers", - ) - - -class UnlimitedCallRatePolicy(BaseModel): - class Config: - extra = Extra.allow - - type: Literal["UnlimitedCallRatePolicy"] - matchers: List[HttpRequestRegexMatcher] = Field( - ..., - description="List of matchers that define which requests this policy applies to.", - title="Matchers", - ) - - class DefaultErrorHandler(BaseModel): type: Literal["DefaultErrorHandler"] backoff_strategies: Optional[ @@ -1809,67 +1715,6 @@ class CompositeErrorHandler(BaseModel): parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters") -class APIBudget(BaseModel): - class Config: - extra = Extra.allow - - type: Literal["APIBudget"] - policies: List[ - Union[ - FixedWindowCallRatePolicy, - MovingWindowCallRatePolicy, - UnlimitedCallRatePolicy, - ] - ] = Field( - ..., - description="List of call rate policies that define how many calls are allowed.", - title="Policies", - ) - maximum_attempts_to_acquire: Optional[int] = Field( - 100000, - description="The maximum number of attempts 
to acquire a call before giving up.", - title="Maximum Attempts to Acquire", - ) - - -class HTTPAPIBudget(BaseModel): - class Config: - extra = Extra.allow - - type: Literal["HTTPAPIBudget"] - policies: List[ - Union[ - FixedWindowCallRatePolicy, - MovingWindowCallRatePolicy, - UnlimitedCallRatePolicy, - ] - ] = Field( - ..., - description="List of call rate policies that define how many calls are allowed.", - title="Policies", - ) - ratelimit_reset_header: Optional[str] = Field( - "ratelimit-reset", - description="The HTTP response header name that indicates when the rate limit resets.", - title="Rate Limit Reset Header", - ) - ratelimit_remaining_header: Optional[str] = Field( - "ratelimit-remaining", - description="The HTTP response header name that indicates the number of remaining allowed calls.", - title="Rate Limit Remaining Header", - ) - status_codes_for_ratelimit_hit: Optional[List[int]] = Field( - [429], - description="List of HTTP status codes that indicate a rate limit has been hit.", - title="Status Codes for Rate Limit Hit", - ) - maximum_attempts_to_acquire: Optional[int] = Field( - 100000, - description="The maximum number of attempts to acquire a call before giving up.", - title="Maximum Attempts to Acquire", - ) - - class ZipfileDecoder(BaseModel): class Config: extra = Extra.allow @@ -1903,11 +1748,6 @@ class Config: definitions: Optional[Dict[str, Any]] = None spec: Optional[Spec] = None concurrency_level: Optional[ConcurrencyLevel] = None - api_budget: Optional[Union[APIBudget, HTTPAPIBudget]] = Field( - None, - description="Defines how many requests can be made to the API in a given time frame. This field accepts either a generic APIBudget or an HTTP-specific configuration (HTTPAPIBudget) to be applied across all streams.", - title="API Budget", - ) metadata: Optional[Dict[str, Any]] = Field( None, description="For internal Airbyte use only - DO NOT modify manually. Used by consumers of declarative manifests for storing related metadata.", @@ -1934,11 +1774,6 @@ class Config: definitions: Optional[Dict[str, Any]] = None spec: Optional[Spec] = None concurrency_level: Optional[ConcurrencyLevel] = None - api_budget: Optional[Union[APIBudget, HTTPAPIBudget]] = Field( - None, - description="Defines how many requests can be made to the API in a given time frame. This field accepts either a generic APIBudget or an HTTP-specific configuration (HTTPAPIBudget) to be applied across all streams.", - title="API Budget", - ) metadata: Optional[Dict[str, Any]] = Field( None, description="For internal Airbyte use only - DO NOT modify manually. 
Used by consumers of declarative manifests for storing related metadata.", diff --git a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py index 4b80e851b..c6d69623d 100644 --- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py @@ -112,9 +112,6 @@ from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( AddFields as AddFieldsModel, ) -from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( - APIBudget as APIBudgetModel, -) from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( ApiKeyAuthenticator as ApiKeyAuthenticatorModel, ) @@ -229,9 +226,6 @@ from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( ExponentialBackoffStrategy as ExponentialBackoffStrategyModel, ) -from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( - FixedWindowCallRatePolicy as FixedWindowCallRatePolicyModel, -) from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( FlattenFields as FlattenFieldsModel, ) @@ -241,18 +235,12 @@ from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( GzipParser as GzipParserModel, ) -from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( - HTTPAPIBudget as HTTPAPIBudgetModel, -) from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( HttpComponentsResolver as HttpComponentsResolverModel, ) from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( HttpRequester as HttpRequesterModel, ) -from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( - HttpRequestRegexMatcher as HttpRequestRegexMatcherModel, -) from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( HttpResponseFilter as HttpResponseFilterModel, ) @@ -307,9 +295,6 @@ from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( MinMaxDatetime as MinMaxDatetimeModel, ) -from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( - MovingWindowCallRatePolicy as MovingWindowCallRatePolicyModel, -) from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( NoAuth as NoAuthModel, ) @@ -328,9 +313,6 @@ from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( ParentStreamConfig as ParentStreamConfigModel, ) -from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( - Rate as RateModel, -) from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( RecordFilter as RecordFilterModel, ) @@ -374,9 +356,6 @@ from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( TypesMap as TypesMapModel, ) -from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( - UnlimitedCallRatePolicy as UnlimitedCallRatePolicyModel, -) from airbyte_cdk.sources.declarative.models.declarative_component_schema import ValueType from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( WaitTimeFromHeader as WaitTimeFromHeaderModel, @@ -490,15 +469,6 @@ MessageRepository, NoopMessageRepository, ) -from airbyte_cdk.sources.streams.call_rate import ( - APIBudget, - FixedWindowCallRatePolicy, - HttpAPIBudget, - HttpRequestRegexMatcher, - MovingWindowCallRatePolicy, - Rate, - 
UnlimitedCallRatePolicy, -) from airbyte_cdk.sources.streams.concurrent.clamping import ( ClampingEndProvider, ClampingStrategy, @@ -550,7 +520,6 @@ def __init__( self._evaluate_log_level(emit_connector_builder_messages) ) self._connector_state_manager = connector_state_manager or ConnectorStateManager() - self._api_budget: Optional[Union[APIBudget, HttpAPIBudget]] = None def _init_mappings(self) -> None: self.PYDANTIC_MODEL_TO_CONSTRUCTOR: Mapping[Type[BaseModel], Callable[..., Any]] = { @@ -638,13 +607,6 @@ def _init_mappings(self) -> None: StreamConfigModel: self.create_stream_config, ComponentMappingDefinitionModel: self.create_components_mapping_definition, ZipfileDecoderModel: self.create_zipfile_decoder, - APIBudgetModel: self.create_api_budget, - HTTPAPIBudgetModel: self.create_http_api_budget, - FixedWindowCallRatePolicyModel: self.create_fixed_window_call_rate_policy, - MovingWindowCallRatePolicyModel: self.create_moving_window_call_rate_policy, - UnlimitedCallRatePolicyModel: self.create_unlimited_call_rate_policy, - RateModel: self.create_rate, - HttpRequestRegexMatcherModel: self.create_http_request_matcher, } # Needed for the case where we need to perform a second parse on the fields of a custom component @@ -1957,8 +1919,6 @@ def create_http_requester( ) ) - api_budget = self._api_budget - request_options_provider = InterpolatedRequestOptionsProvider( request_body_data=model.request_body_data, request_body_json=model.request_body_json, @@ -1979,7 +1939,6 @@ def create_http_requester( path=model.path, authenticator=authenticator, error_handler=error_handler, - api_budget=api_budget, http_method=HttpMethod[model.http_method.value], request_options_provider=request_options_provider, config=config, @@ -2981,103 +2940,3 @@ def _is_supported_parser_for_pagination(self, parser: Parser) -> bool: return isinstance(parser.inner_parser, JsonParser) else: return False - - def create_api_budget(self, model: APIBudgetModel, config: Config, **kwargs: Any) -> APIBudget: - policies = [ - self._create_component_from_model(model=policy, config=config) - for policy in model.policies - ] - - return APIBudget( - policies=policies, - maximum_attempts_to_acquire=model.maximum_attempts_to_acquire or 100000, - ) - - def create_http_api_budget( - self, model: HTTPAPIBudgetModel, config: Config, **kwargs: Any - ) -> HttpAPIBudget: - policies = [ - self._create_component_from_model(model=policy, config=config) - for policy in model.policies - ] - - return HttpAPIBudget( - policies=policies, - maximum_attempts_to_acquire=model.maximum_attempts_to_acquire or 100000, - ratelimit_reset_header=model.ratelimit_reset_header or "ratelimit-reset", - ratelimit_remaining_header=model.ratelimit_remaining_header or "ratelimit-remaining", - status_codes_for_ratelimit_hit=model.status_codes_for_ratelimit_hit or (429,), - ) - - def create_fixed_window_call_rate_policy( - self, model: FixedWindowCallRatePolicyModel, config: Config, **kwargs: Any - ) -> FixedWindowCallRatePolicy: - matchers = [ - self._create_component_from_model(model=matcher, config=config) - for matcher in model.matchers - ] - return FixedWindowCallRatePolicy( - next_reset_ts=model.next_reset_ts, - period=model.period, - call_limit=model.call_limit, - matchers=matchers, - ) - - def create_moving_window_call_rate_policy( - self, model: MovingWindowCallRatePolicyModel, config: Config, **kwargs: Any - ) -> MovingWindowCallRatePolicy: - rates = [ - self._create_component_from_model(model=rate, config=config) for rate in model.rates - ] - matchers = [ - 
self._create_component_from_model(model=matcher, config=config) - for matcher in model.matchers - ] - return MovingWindowCallRatePolicy( - rates=rates, - matchers=matchers, - ) - - def create_unlimited_call_rate_policy( - self, model: UnlimitedCallRatePolicyModel, config: Config, **kwargs: Any - ) -> UnlimitedCallRatePolicy: - matchers = [ - self._create_component_from_model(model=matcher, config=config) - for matcher in model.matchers - ] - - return UnlimitedCallRatePolicy( - matchers=matchers, - ) - - def create_rate(self, model: RateModel, config: Config, **kwargs: Any) -> Rate: - return Rate( - limit=model.limit, - interval=model.interval, - ) - - def create_http_request_matcher( - self, model: HttpRequestRegexMatcherModel, config: Config, **kwargs: Any - ) -> HttpRequestRegexMatcher: - return HttpRequestRegexMatcher( - method=model.method, - url_base=model.url_base, - url_path_pattern=model.url_path_pattern, - params=model.params, - headers=model.headers, - ) - - def set_api_budget(self, component_definition: ComponentDefinition, config: Config) -> None: - model_str = component_definition.get("type") - if model_str == "APIBudget": - # Annotate model_type as a type that is a subclass of BaseModel - model_type: Union[Type[APIBudgetModel], Type[HTTPAPIBudgetModel]] = APIBudgetModel - elif model_str == "HTTPAPIBudget": - model_type = HTTPAPIBudgetModel - else: - raise ValueError(f"Unknown API Budget type: {model_str}") - - # create_component expects a type[BaseModel] and returns an instance of that model. - self._api_budget = self.create_component( - model_type=model_type, component_definition=component_definition, config=config - ) diff --git a/airbyte_cdk/sources/declarative/requesters/http_requester.py b/airbyte_cdk/sources/declarative/requesters/http_requester.py index b206bd688..ad23f4d06 100644 --- a/airbyte_cdk/sources/declarative/requesters/http_requester.py +++ b/airbyte_cdk/sources/declarative/requesters/http_requester.py @@ -22,7 +22,6 @@ ) from airbyte_cdk.sources.declarative.requesters.requester import HttpMethod, Requester from airbyte_cdk.sources.message import MessageRepository, NoopMessageRepository -from airbyte_cdk.sources.streams.call_rate import APIBudget from airbyte_cdk.sources.streams.http import HttpClient from airbyte_cdk.sources.streams.http.error_handlers import ErrorHandler from airbyte_cdk.sources.types import Config, StreamSlice, StreamState @@ -56,7 +55,6 @@ class HttpRequester(Requester): http_method: Union[str, HttpMethod] = HttpMethod.GET request_options_provider: Optional[InterpolatedRequestOptionsProvider] = None error_handler: Optional[ErrorHandler] = None - api_budget: Optional[APIBudget] = None disable_retries: bool = False message_repository: MessageRepository = NoopMessageRepository() use_cache: bool = False @@ -93,7 +91,6 @@ def __post_init__(self, parameters: Mapping[str, Any]) -> None: name=self.name, logger=self.logger, error_handler=self.error_handler, - api_budget=self.api_budget, authenticator=self._authenticator, use_cache=self.use_cache, backoff_strategy=backoff_strategies, diff --git a/airbyte_cdk/sources/streams/call_rate.py b/airbyte_cdk/sources/streams/call_rate.py index 21fec881f..81ebac78e 100644 --- a/airbyte_cdk/sources/streams/call_rate.py +++ b/airbyte_cdk/sources/streams/call_rate.py @@ -6,12 +6,10 @@ import dataclasses import datetime import logging -import re import time -from dataclasses import InitVar, dataclass, field from datetime import timedelta from threading import RLock -from typing import TYPE_CHECKING, Any, 
Mapping, Optional, Union +from typing import TYPE_CHECKING, Any, Mapping, Optional from urllib import parse import requests @@ -162,100 +160,6 @@ def __call__(self, request: Any) -> bool: return True -class HttpRequestRegexMatcher(RequestMatcher): - """ - Extended RequestMatcher for HTTP requests that supports matching on: - - HTTP method (case-insensitive) - - URL base (scheme + netloc) optionally - - URL path pattern (a regex applied to the path portion of the URL) - - Query parameters (must be present) - - Headers (header names compared case-insensitively) - """ - - def __init__( - self, - method: Optional[str] = None, - url_base: Optional[str] = None, - url_path_pattern: Optional[str] = None, - params: Optional[Mapping[str, Any]] = None, - headers: Optional[Mapping[str, Any]] = None, - ): - """ - :param method: HTTP method (e.g. "GET", "POST"); compared case-insensitively. - :param url_base: Base URL (scheme://host) that must match. - :param url_path_pattern: A regex pattern that will be applied to the path portion of the URL. - :param params: Dictionary of query parameters that must be present in the request. - :param headers: Dictionary of headers that must be present (header keys are compared case-insensitively). - """ - self._method = method.upper() if method else None - - # Normalize the url_base if provided: remove trailing slash. - self._url_base = url_base.rstrip("/") if url_base else None - - # Compile the URL path pattern if provided. - self._url_path_pattern = re.compile(url_path_pattern) if url_path_pattern else None - - # Normalize query parameters to strings. - self._params = {str(k): str(v) for k, v in (params or {}).items()} - - # Normalize header keys to lowercase. - self._headers = {str(k).lower(): str(v) for k, v in (headers or {}).items()} - - @staticmethod - def _match_dict(obj: Mapping[str, Any], pattern: Mapping[str, Any]) -> bool: - """Check that every key/value in the pattern exists in the object.""" - return pattern.items() <= obj.items() - - def __call__(self, request: Any) -> bool: - """ - :param request: A requests.Request or requests.PreparedRequest instance. - :return: True if the request matches all provided criteria; False otherwise. - """ - # Prepare the request (if needed) and extract the URL details. - if isinstance(request, requests.Request): - prepared_request = request.prepare() - elif isinstance(request, requests.PreparedRequest): - prepared_request = request - else: - return False - - # Check HTTP method. - if self._method is not None and prepared_request.method is not None: - if prepared_request.method.upper() != self._method: - return False - - # Parse the URL. - parsed_url = parse.urlsplit(prepared_request.url) - # Reconstruct the base: scheme://netloc - request_url_base = f"{str(parsed_url.scheme)}://{str(parsed_url.netloc)}" - # The path (without query parameters) - request_path = str(parsed_url.path).rstrip("/") - - # If a base URL is provided, check that it matches. - if self._url_base is not None: - if request_url_base != self._url_base: - return False - - # If a URL path pattern is provided, ensure the path matches the regex. - if self._url_path_pattern is not None: - if not self._url_path_pattern.search(request_path): - return False - - # Check query parameters. - if self._params: - query_params = dict(parse.parse_qsl(str(parsed_url.query))) - if not self._match_dict(query_params, self._params): - return False - - # Check headers (normalize keys to lower-case). 
- if self._headers: - req_headers = {k.lower(): v for k, v in prepared_request.headers.items()} - if not self._match_dict(req_headers, self._headers): - return False - - return True - - class BaseCallRatePolicy(AbstractCallRatePolicy, abc.ABC): def __init__(self, matchers: list[RequestMatcher]): self._matchers = matchers @@ -495,17 +399,24 @@ def update_from_response(self, request: Any, response: Any) -> None: """ -@dataclass class APIBudget(AbstractAPIBudget): - """ - Default APIBudget implementation. - """ + """Default APIBudget implementation""" + + def __init__( + self, policies: list[AbstractCallRatePolicy], maximum_attempts_to_acquire: int = 100000 + ) -> None: + """Constructor + + :param policies: list of policies in this budget + :param maximum_attempts_to_acquire: number of attempts before throwing hit ratelimit exception, we put some big number here + to avoid situations when many threads compete with each other for a few lots over a significant amount of time + """ - policies: list[AbstractCallRatePolicy] - maximum_attempts_to_acquire: int = 100000 + self._policies = policies + self._maximum_attempts_to_acquire = maximum_attempts_to_acquire def get_matching_policy(self, request: Any) -> Optional[AbstractCallRatePolicy]: - for policy in self.policies: + for policy in self._policies: if policy.matches(request): return policy return None @@ -526,7 +437,7 @@ def acquire_call( policy = self.get_matching_policy(request) if policy: self._do_acquire(request=request, policy=policy, block=block, timeout=timeout) - elif self.policies: + elif self._policies: logger.info("no policies matched with requests, allow call by default") def update_from_response(self, request: Any, response: Any) -> None: @@ -549,7 +460,7 @@ def _do_acquire( """ last_exception = None # sometimes we spend all budget before a second attempt, so we have few more here - for attempt in range(1, self.maximum_attempts_to_acquire): + for attempt in range(1, self._maximum_attempts_to_acquire): try: policy.try_acquire(request, weight=1) return @@ -573,18 +484,31 @@ def _do_acquire( if last_exception: logger.info( - "we used all %s attempts to acquire and failed", self.maximum_attempts_to_acquire + "we used all %s attempts to acquire and failed", self._maximum_attempts_to_acquire ) raise last_exception -@dataclass class HttpAPIBudget(APIBudget): """Implementation of AbstractAPIBudget for HTTP""" - ratelimit_reset_header: str = "ratelimit-reset" - ratelimit_remaining_header: str = "ratelimit-remaining" - status_codes_for_ratelimit_hit: Union[tuple[int], list[int]] = (429,) + def __init__( + self, + ratelimit_reset_header: str = "ratelimit-reset", + ratelimit_remaining_header: str = "ratelimit-remaining", + status_codes_for_ratelimit_hit: tuple[int] = (429,), + **kwargs: Any, + ): + """Constructor + + :param ratelimit_reset_header: name of the header that has a timestamp of the next reset of call budget + :param ratelimit_remaining_header: name of the header that has the number of calls left + :param status_codes_for_ratelimit_hit: list of HTTP status codes that signal about rate limit being hit + """ + self._ratelimit_reset_header = ratelimit_reset_header + self._ratelimit_remaining_header = ratelimit_remaining_header + self._status_codes_for_ratelimit_hit = status_codes_for_ratelimit_hit + super().__init__(**kwargs) def update_from_response(self, request: Any, response: Any) -> None: policy = self.get_matching_policy(request) @@ -599,17 +523,17 @@ def update_from_response(self, request: Any, response: Any) -> None: def 
get_reset_ts_from_response( self, response: requests.Response ) -> Optional[datetime.datetime]: - if response.headers.get(self.ratelimit_reset_header): + if response.headers.get(self._ratelimit_reset_header): return datetime.datetime.fromtimestamp( - int(response.headers[self.ratelimit_reset_header]) + int(response.headers[self._ratelimit_reset_header]) ) return None def get_calls_left_from_response(self, response: requests.Response) -> Optional[int]: - if response.headers.get(self.ratelimit_remaining_header): - return int(response.headers[self.ratelimit_remaining_header]) + if response.headers.get(self._ratelimit_remaining_header): + return int(response.headers[self._ratelimit_remaining_header]) - if response.status_code in self.status_codes_for_ratelimit_hit: + if response.status_code in self._status_codes_for_ratelimit_hit: return 0 return None From 05f4db7b6a3222af20d624439882a080c3014642 Mon Sep 17 00:00:00 2001 From: Anatolii Yatsuk Date: Wed, 12 Feb 2025 15:44:45 +0200 Subject: [PATCH 11/26] Delete code from another branch --- .../test_model_to_component_factory.py | 159 ------------------ .../requesters/test_http_requester.py | 32 ---- unit_tests/sources/streams/test_call_rate.py | 88 ---------- 3 files changed, 279 deletions(-) diff --git a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py index 14e3460e0..32a73f364 100644 --- a/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py +++ b/unit_tests/sources/declarative/parsers/test_model_to_component_factory.py @@ -142,7 +142,6 @@ from airbyte_cdk.sources.declarative.transformations import AddFields, RemoveFields from airbyte_cdk.sources.declarative.transformations.add_fields import AddedFieldDefinition from airbyte_cdk.sources.declarative.yaml_declarative_source import YamlDeclarativeSource -from airbyte_cdk.sources.streams.call_rate import MovingWindowCallRatePolicy from airbyte_cdk.sources.streams.concurrent.clamping import ( ClampingEndProvider, DayClampingStrategy, @@ -3685,161 +3684,3 @@ def test_create_async_retriever(): assert isinstance(selector, RecordSelector) assert isinstance(extractor, DpathExtractor) assert extractor.field_path == ["data"] - - -def test_api_budget(): - manifest = { - "type": "DeclarativeSource", - "api_budget": { - "type": "HTTPAPIBudget", - "ratelimit_reset_header": "X-RateLimit-Reset", - "ratelimit_remaining_header": "X-RateLimit-Remaining", - "status_codes_for_ratelimit_hit": [429, 503], - "policies": [ - { - "type": "MovingWindowCallRatePolicy", - "rates": [ - { - "type": "Rate", - "limit": 3, - "interval": "PT0.1S", # 0.1 seconds - } - ], - "matchers": [ - { - "type": "HttpRequestRegexMatcher", - "method": "GET", - "url_base": "https://api.sendgrid.com", - "url_path_pattern": "/v3/marketing/lists", - } - ], - } - ], - }, - "my_requester": { - "type": "HttpRequester", - "path": "/v3/marketing/lists", - "url_base": "https://api.sendgrid.com", - "http_method": "GET", - "authenticator": { - "type": "BasicHttpAuthenticator", - "username": "admin", - "password": "{{ config['password'] }}", - }, - }, - } - - config = { - "password": "verysecrettoken", - } - - factory = ModelToComponentFactory() - if "api_budget" in manifest: - factory.set_api_budget(manifest["api_budget"], config) - - from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( - HttpRequester as HttpRequesterModel, - ) - - requester_definition = manifest["my_requester"] - assert 
requester_definition["type"] == "HttpRequester" - - http_requester = factory.create_component( - model_type=HttpRequesterModel, - component_definition=requester_definition, - config=config, - name="lists_stream", - decoder=None, - ) - - assert http_requester.api_budget is not None - assert http_requester.api_budget.ratelimit_reset_header == "X-RateLimit-Reset" - assert http_requester.api_budget.status_codes_for_ratelimit_hit == [429, 503] - assert len(http_requester.api_budget.policies) == 1 - - # The single policy is a MovingWindowCallRatePolicy - policy = http_requester.api_budget.policies[0] - assert isinstance(policy, MovingWindowCallRatePolicy) - assert policy._bucket.rates[0].limit == 3 - # The 0.1s from 'PT0.1S' is stored in ms by PyRateLimiter internally - # but here just check that the limit and interval exist - assert policy._bucket.rates[0].interval == 100 # 100 ms - - -def test_api_budget_fixed_window_policy(): - manifest = { - "type": "DeclarativeSource", - # Root-level api_budget referencing a FixedWindowCallRatePolicy - "api_budget": { - "type": "APIBudget", - "maximum_attempts_to_acquire": 9999, - "policies": [ - { - "type": "FixedWindowCallRatePolicy", - "next_reset_ts": "2025-01-01T00:00:00Z", - "period": "PT1M", # 1 minute - "call_limit": 10, - "matchers": [ - { - "type": "HttpRequestRegexMatcher", - "method": "GET", - "url_base": "https://example.org", - "url_path_pattern": "/v2/data", - } - ], - } - ], - }, - # We'll define a single HttpRequester that references that base - "my_requester": { - "type": "HttpRequester", - "path": "/v2/data", - "url_base": "https://example.org", - "http_method": "GET", - "authenticator": {"type": "NoAuth"}, - }, - } - - config = {} - - factory = ModelToComponentFactory() - if "api_budget" in manifest: - factory.set_api_budget(manifest["api_budget"], config) - - from airbyte_cdk.sources.declarative.models.declarative_component_schema import ( - HttpRequester as HttpRequesterModel, - ) - - requester_definition = manifest["my_requester"] - assert requester_definition["type"] == "HttpRequester" - http_requester = factory.create_component( - model_type=HttpRequesterModel, - component_definition=requester_definition, - config=config, - name="my_stream", - decoder=None, - ) - - assert http_requester.api_budget is not None - assert http_requester.api_budget.maximum_attempts_to_acquire == 9999 - assert len(http_requester.api_budget.policies) == 1 - - from airbyte_cdk.sources.streams.call_rate import FixedWindowCallRatePolicy - - policy = http_requester.api_budget.policies[0] - assert isinstance(policy, FixedWindowCallRatePolicy) - assert policy._call_limit == 10 - # The period is "PT1M" => 60 seconds - assert policy._offset.total_seconds() == 60 - - expected_reset_dt = datetime(2025, 1, 1, 0, 0, 0, tzinfo=timezone.utc) - assert policy._next_reset_ts == expected_reset_dt - - assert len(policy._matchers) == 1 - matcher = policy._matchers[0] - from airbyte_cdk.sources.streams.call_rate import HttpRequestRegexMatcher - - assert isinstance(matcher, HttpRequestRegexMatcher) - assert matcher._method == "GET" - assert matcher._url_base == "https://example.org" - assert matcher._url_path_pattern.pattern == "/v2/data" diff --git a/unit_tests/sources/declarative/requesters/test_http_requester.py b/unit_tests/sources/declarative/requesters/test_http_requester.py index c5d5c218d..f02ec206b 100644 --- a/unit_tests/sources/declarative/requesters/test_http_requester.py +++ b/unit_tests/sources/declarative/requesters/test_http_requester.py @@ -2,7 +2,6 @@ # 
Copyright (c) 2023 Airbyte, Inc., all rights reserved. # -from datetime import timedelta from typing import Any, Mapping, Optional from unittest import mock from unittest.mock import MagicMock @@ -10,7 +9,6 @@ import pytest as pytest import requests -import requests.sessions from requests import PreparedRequest from airbyte_cdk.sources.declarative.auth.declarative_authenticator import DeclarativeAuthenticator @@ -29,12 +27,6 @@ InterpolatedRequestOptionsProvider, ) from airbyte_cdk.sources.message import MessageRepository -from airbyte_cdk.sources.streams.call_rate import ( - AbstractAPIBudget, - HttpAPIBudget, - MovingWindowCallRatePolicy, - Rate, -) from airbyte_cdk.sources.streams.http.error_handlers.response_models import ResponseAction from airbyte_cdk.sources.streams.http.exceptions import ( RequestBodyException, @@ -53,7 +45,6 @@ def factory( request_options_provider: Optional[InterpolatedRequestOptionsProvider] = None, authenticator: Optional[DeclarativeAuthenticator] = None, error_handler: Optional[ErrorHandler] = None, - api_budget: Optional[HttpAPIBudget] = None, config: Optional[Config] = None, parameters: Mapping[str, Any] = None, disable_retries: bool = False, @@ -70,7 +61,6 @@ def factory( http_method=http_method, request_options_provider=request_options_provider, error_handler=error_handler, - api_budget=api_budget, disable_retries=disable_retries, message_repository=message_repository or MagicMock(), use_cache=use_cache, @@ -944,25 +934,3 @@ def test_backoff_strategy_from_manifest_is_respected(http_requester_factory: Any http_requester._http_client._request_attempt_count.get(request_mock) == http_requester._http_client._max_retries + 1 ) - - -def test_http_requester_with_mock_apibudget(http_requester_factory, monkeypatch): - mock_budget = MagicMock(spec=HttpAPIBudget) - - requester = http_requester_factory( - url_base="https://example.com", - path="test", - api_budget=mock_budget, - ) - - dummy_response = requests.Response() - dummy_response.status_code = 200 - send_mock = MagicMock(return_value=dummy_response) - monkeypatch.setattr(requests.Session, "send", send_mock) - - response = requester.send_request() - - assert send_mock.call_count == 1 - assert response.status_code == 200 - - assert mock_budget.acquire_call.call_count == 1 diff --git a/unit_tests/sources/streams/test_call_rate.py b/unit_tests/sources/streams/test_call_rate.py index 853e2997e..16bce68e3 100644 --- a/unit_tests/sources/streams/test_call_rate.py +++ b/unit_tests/sources/streams/test_call_rate.py @@ -17,7 +17,6 @@ CallRateLimitHit, FixedWindowCallRatePolicy, HttpRequestMatcher, - HttpRequestRegexMatcher, MovingWindowCallRatePolicy, Rate, UnlimitedCallRatePolicy, @@ -358,90 +357,3 @@ def test_with_cache(self, mocker, requests_mock): assert next(records) == {"data": "some_data"} assert MovingWindowCallRatePolicy.try_acquire.call_count == 1 - - -class TestHttpRequestRegexMatcher: - """ - Tests for the new regex-based logic: - - Case-insensitive HTTP method matching - - Optional url_base (scheme://netloc) - - Regex-based path matching - - Query params (must be present) - - Headers (case-insensitive keys) - """ - - def test_case_insensitive_method(self): - matcher = HttpRequestRegexMatcher(method="GET") - - req_ok = Request("get", "https://example.com/test/path") - req_wrong = Request("POST", "https://example.com/test/path") - - assert matcher(req_ok) - assert not matcher(req_wrong) - - def test_url_base(self): - matcher = HttpRequestRegexMatcher(url_base="https://example.com") - - req_ok = Request("GET", 
"https://example.com/test/path?foo=bar") - req_wrong = Request("GET", "https://another.com/test/path?foo=bar") - - assert matcher(req_ok) - assert not matcher(req_wrong) - - def test_url_path_pattern(self): - matcher = HttpRequestRegexMatcher(url_path_pattern=r"/test/") - - req_ok = Request("GET", "https://example.com/test/something") - req_wrong = Request("GET", "https://example.com/other/something") - - assert matcher(req_ok) - assert not matcher(req_wrong) - - def test_query_params(self): - matcher = HttpRequestRegexMatcher(params={"foo": "bar"}) - - req_ok = Request("GET", "https://example.com/api?foo=bar&extra=123") - req_missing = Request("GET", "https://example.com/api?not_foo=bar") - - assert matcher(req_ok) - assert not matcher(req_missing) - - def test_headers_case_insensitive(self): - matcher = HttpRequestRegexMatcher(headers={"X-Custom-Header": "abc"}) - - req_ok = Request( - "GET", - "https://example.com/api?foo=bar", - headers={"x-custom-header": "abc", "other": "123"}, - ) - req_wrong = Request("GET", "https://example.com/api", headers={"x-custom-header": "wrong"}) - - assert matcher(req_ok) - assert not matcher(req_wrong) - - def test_combined_criteria(self): - matcher = HttpRequestRegexMatcher( - method="GET", - url_base="https://example.com", - url_path_pattern=r"/test/", - params={"foo": "bar"}, - headers={"X-Test": "123"}, - ) - - req_ok = Request("GET", "https://example.com/test/me?foo=bar", headers={"x-test": "123"}) - req_bad_base = Request( - "GET", "https://other.com/test/me?foo=bar", headers={"x-test": "123"} - ) - req_bad_path = Request("GET", "https://example.com/nope?foo=bar", headers={"x-test": "123"}) - req_bad_param = Request( - "GET", "https://example.com/test/me?extra=xyz", headers={"x-test": "123"} - ) - req_bad_header = Request( - "GET", "https://example.com/test/me?foo=bar", headers={"some-other-header": "xyz"} - ) - - assert matcher(req_ok) - assert not matcher(req_bad_base) - assert not matcher(req_bad_path) - assert not matcher(req_bad_param) - assert not matcher(req_bad_header) From c0bc64538acfbbcf06f7df8908a3ff248f061089 Mon Sep 17 00:00:00 2001 From: Anatolii Yatsuk Date: Wed, 12 Feb 2025 17:44:45 +0200 Subject: [PATCH 12/26] Fix cursor value from record --- .../declarative/incremental/concurrent_partition_cursor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py b/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py index fc75ecd90..4dc3a6341 100644 --- a/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +++ b/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py @@ -363,8 +363,8 @@ def observe(self, record: Record) -> None: "Invalid state as stream slices that are emitted should refer to an existing cursor" ) - record_cursor = self._connector_state_converter.parse_value( - self._cursor_field.extract_value(record) + record_cursor = self._connector_state_converter.output_format( + self._connector_state_converter.parse_value(self._cursor_field.extract_value(record)) ) self._update_global_cursor(record_cursor) if not self._use_global_cursor: From 52b95e33d7782c163447a966754dc5156d7b555c Mon Sep 17 00:00:00 2001 From: Anatolii Yatsuk Date: Thu, 13 Feb 2025 12:02:37 +0200 Subject: [PATCH 13/26] Add throttling for state emitting in ConcurrentPerPartitionCursor --- .../concurrent_partition_cursor.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git 
a/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py b/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py index 4dc3a6341..2780218dc 100644 --- a/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +++ b/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py @@ -5,6 +5,7 @@ import copy import logging import threading +import time from collections import OrderedDict from copy import deepcopy from datetime import timedelta @@ -59,7 +60,7 @@ class ConcurrentPerPartitionCursor(Cursor): """ DEFAULT_MAX_PARTITIONS_NUMBER = 25_000 - SWITCH_TO_GLOBAL_LIMIT = 1000 + SWITCH_TO_GLOBAL_LIMIT = 10_000 _NO_STATE: Mapping[str, Any] = {} _NO_CURSOR_STATE: Mapping[str, Any] = {} _GLOBAL_STATE_KEY = "state" @@ -103,6 +104,8 @@ def __init__( self._number_of_partitions: int = 0 self._use_global_cursor: bool = False self._partition_serializer = PerPartitionKeySerializer() + # Track the last time a state message was emitted + self._last_emission_time: float = 0.0 self._set_initial_state(stream_state) @@ -166,9 +169,12 @@ def ensure_at_least_one_state_emitted(self) -> None: self._global_cursor = self._new_global_cursor self._lookback_window = self._timer.finish() self._parent_state = self._partition_router.get_stream_state() - self._emit_state_message() + self._emit_state_message(throttle=False) - def _emit_state_message(self) -> None: + def _emit_state_message(self, throttle: bool = True) -> None: + current_time = time.time() + if throttle and current_time - self._last_emission_time <= 60: + return self._connector_state_manager.update_state_for_stream( self._stream_name, self._stream_namespace, @@ -178,6 +184,7 @@ def _emit_state_message(self) -> None: self._stream_name, self._stream_namespace ) self._message_repository.emit_message(state_message) + self._last_emission_time = current_time def stream_slices(self) -> Iterable[StreamSlice]: if self._timer.is_running(): @@ -242,7 +249,7 @@ def _ensure_partition_limit(self) -> None: partition_key ) # Remove the oldest partition logger.warning( - f"The maximum number of partitions has been reached. Dropping the oldest finished partition: {oldest_partition}. Over limit: {self._number_of_partitions}." + f"The maximum number of partitions has been reached. Dropping the oldest finished partition: {oldest_partition}. Over limit: {self._number_of_partitions - self.DEFAULT_MAX_PARTITIONS_NUMBER}." ) break else: @@ -251,7 +258,7 @@ def _ensure_partition_limit(self) -> None: 1 ] # Remove the oldest partition logger.warning( - f"The maximum number of partitions has been reached. Dropping the oldest partition: {oldest_partition}. Over limit: {self._number_of_partitions}." + f"The maximum number of partitions has been reached. Dropping the oldest partition: {oldest_partition}. Over limit: {self._number_of_partitions - self.DEFAULT_MAX_PARTITIONS_NUMBER}." 
) def _set_initial_state(self, stream_state: StreamState) -> None: @@ -372,7 +379,7 @@ def observe(self, record: Record) -> None: self._to_partition_key(record.associated_slice.partition) ].observe(record) - def _update_global_cursor(self, value: Mapping[str, Any]) -> None: + def _update_global_cursor(self, value: Any) -> None: if ( self._new_global_cursor is None or self._new_global_cursor[self.cursor_field.cursor_field_key] < value From 1166a7a2e68e0713fa191804a4cab844cfdf8c95 Mon Sep 17 00:00:00 2001 From: Anatolii Yatsuk Date: Mon, 17 Feb 2025 13:41:05 +0200 Subject: [PATCH 14/26] Fix unit tests --- .../concurrent_partition_cursor.py | 19 ++++-- .../test_concurrent_perpartitioncursor.py | 66 ++++++++++++++++--- 2 files changed, 73 insertions(+), 12 deletions(-) diff --git a/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py b/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py index 2780218dc..da12cc05d 100644 --- a/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +++ b/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py @@ -171,10 +171,21 @@ def ensure_at_least_one_state_emitted(self) -> None: self._parent_state = self._partition_router.get_stream_state() self._emit_state_message(throttle=False) - def _emit_state_message(self, throttle: bool = True) -> None: + def _throttle_state_message(self) -> Optional[float]: + """ + Throttles the state message emission to once every 60 seconds. + """ current_time = time.time() - if throttle and current_time - self._last_emission_time <= 60: - return + if current_time - self._last_emission_time <= 60: + return None + return current_time + + def _emit_state_message(self, throttle: bool = True) -> None: + if throttle: + current_time = self._throttle_state_message() + if current_time is None: + return + self._last_emission_time = current_time self._connector_state_manager.update_state_for_stream( self._stream_name, self._stream_namespace, @@ -184,7 +195,6 @@ def _emit_state_message(self, throttle: bool = True) -> None: self._stream_name, self._stream_namespace ) self._message_repository.emit_message(state_message) - self._last_emission_time = current_time def stream_slices(self) -> Iterable[StreamSlice]: if self._timer.is_running(): @@ -358,6 +368,7 @@ def _set_global_state(self, stream_state: Mapping[str, Any]) -> None: self._new_global_cursor = deepcopy(fixed_global_state) def observe(self, record: Record) -> None: + # ToDo: check number of partitions if not self._use_global_cursor and self.limit_reached(): logger.info( f"Exceeded the 'SWITCH_TO_GLOBAL_LIMIT' of {self.SWITCH_TO_GLOBAL_LIMIT}. 
" diff --git a/unit_tests/sources/declarative/incremental/test_concurrent_perpartitioncursor.py b/unit_tests/sources/declarative/incremental/test_concurrent_perpartitioncursor.py index ef06676f5..767d24874 100644 --- a/unit_tests/sources/declarative/incremental/test_concurrent_perpartitioncursor.py +++ b/unit_tests/sources/declarative/incremental/test_concurrent_perpartitioncursor.py @@ -3,6 +3,7 @@ from copy import deepcopy from datetime import datetime, timedelta from typing import Any, List, Mapping, MutableMapping, Optional, Union +from unittest.mock import MagicMock, patch from urllib.parse import unquote import pytest @@ -18,6 +19,7 @@ from airbyte_cdk.sources.declarative.concurrent_declarative_source import ( ConcurrentDeclarativeSource, ) +from airbyte_cdk.sources.declarative.incremental import ConcurrentPerPartitionCursor from airbyte_cdk.test.catalog_builder import CatalogBuilder, ConfiguredAirbyteStreamBuilder from airbyte_cdk.test.entrypoint_wrapper import EntrypointOutput, read @@ -1181,14 +1183,18 @@ def test_incremental_parent_state( initial_state, expected_state, ): - run_incremental_parent_state_test( - manifest, - mock_requests, - expected_records, - num_intermediate_states, - initial_state, - [expected_state], - ) + # Patch `_throttle_state_message` so it always returns a float (indicating "no throttle") + with patch.object( + ConcurrentPerPartitionCursor, "_throttle_state_message", return_value=9999999.0 + ): + run_incremental_parent_state_test( + manifest, + mock_requests, + expected_records, + num_intermediate_states, + initial_state, + [expected_state], + ) STATE_MIGRATION_EXPECTED_STATE = { @@ -2967,3 +2973,47 @@ def test_incremental_substream_request_options_provider( expected_records, expected_state, ) + + +def test_state_throttling(mocker): + """ + Verifies that _emit_state_message does not emit a new state if less than 60s + have passed since last emission, and does emit once 60s or more have passed. 
+ """ + cursor = ConcurrentPerPartitionCursor( + cursor_factory=MagicMock(), + partition_router=MagicMock(), + stream_name="test_stream", + stream_namespace=None, + stream_state={}, + message_repository=MagicMock(), + connector_state_manager=MagicMock(), + connector_state_converter=MagicMock(), + cursor_field=MagicMock(), + ) + + mock_connector_manager = cursor._connector_state_manager + mock_repo = cursor._message_repository + + # Set the last emission time to "0" so we can control offset from that + cursor._last_emission_time = 0 + + mock_time = mocker.patch("time.time") + + # First attempt: only 10 seconds passed => NO emission + mock_time.return_value = 10 + cursor._emit_state_message() + mock_connector_manager.update_state_for_stream.assert_not_called() + mock_repo.emit_message.assert_not_called() + + # Second attempt: 30 seconds passed => still NO emission + mock_time.return_value = 30 + cursor._emit_state_message() + mock_connector_manager.update_state_for_stream.assert_not_called() + mock_repo.emit_message.assert_not_called() + + # Advance time: 70 seconds => exceed 60s => MUST emit + mock_time.return_value = 70 + cursor._emit_state_message() + mock_connector_manager.update_state_for_stream.assert_called_once() + mock_repo.emit_message.assert_called_once() From 4a7d9eccb4421c6591835a4ab99e9e4cc22a276c Mon Sep 17 00:00:00 2001 From: Anatolii Yatsuk Date: Mon, 17 Feb 2025 14:05:10 +0200 Subject: [PATCH 15/26] Move switching to global logic --- .../concurrent_partition_cursor.py | 37 +++++++++---------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py b/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py index da12cc05d..74d7f8893 100644 --- a/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +++ b/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py @@ -59,8 +59,8 @@ class ConcurrentPerPartitionCursor(Cursor): CurrentPerPartitionCursor expects the state of the ConcurrentCursor to follow the format {cursor_field: cursor_value}. 
""" - DEFAULT_MAX_PARTITIONS_NUMBER = 25_000 - SWITCH_TO_GLOBAL_LIMIT = 10_000 + DEFAULT_MAX_PARTITIONS_NUMBER = 200 + SWITCH_TO_GLOBAL_LIMIT = 100 _NO_STATE: Mapping[str, Any] = {} _NO_CURSOR_STATE: Mapping[str, Any] = {} _GLOBAL_STATE_KEY = "state" @@ -145,19 +145,19 @@ def close_partition(self, partition: Partition) -> None: raise ValueError("stream_slice cannot be None") partition_key = self._to_partition_key(stream_slice.partition) - if not self._use_global_cursor: - self._cursor_per_partition[partition_key].close_partition(partition=partition) with self._lock: - self._semaphore_per_partition[partition_key].acquire() - cursor = self._cursor_per_partition[partition_key] - if ( - partition_key in self._finished_partitions - and self._semaphore_per_partition[partition_key]._value == 0 - ): - self._update_global_cursor(cursor.state[self.cursor_field.cursor_field_key]) if not self._use_global_cursor: + self._cursor_per_partition[partition_key].close_partition(partition=partition) + cursor = self._cursor_per_partition[partition_key] + if ( + partition_key in self._finished_partitions + and self._semaphore_per_partition[partition_key]._value == 0 + ): + self._update_global_cursor(cursor.state[self.cursor_field.cursor_field_key]) self._emit_state_message() + self._semaphore_per_partition[partition_key].acquire() + def ensure_at_least_one_state_emitted(self) -> None: """ The platform expect to have at least one state message on successful syncs. Hence, whatever happens, we expect this method to be @@ -246,6 +246,13 @@ def _ensure_partition_limit(self) -> None: - Logs a warning each time a partition is removed, indicating whether it was finished or removed due to being the oldest. """ + if not self._use_global_cursor and self.limit_reached(): + logger.info( + f"Exceeded the 'SWITCH_TO_GLOBAL_LIMIT' of {self.SWITCH_TO_GLOBAL_LIMIT}. " + f"Switching to global cursor for {self._stream_name}." + ) + self._use_global_cursor = True + with self._lock: self._number_of_partitions += 1 while len(self._cursor_per_partition) > self.DEFAULT_MAX_PARTITIONS_NUMBER - 1: @@ -368,14 +375,6 @@ def _set_global_state(self, stream_state: Mapping[str, Any]) -> None: self._new_global_cursor = deepcopy(fixed_global_state) def observe(self, record: Record) -> None: - # ToDo: check number of partitions - if not self._use_global_cursor and self.limit_reached(): - logger.info( - f"Exceeded the 'SWITCH_TO_GLOBAL_LIMIT' of {self.SWITCH_TO_GLOBAL_LIMIT}. " - f"Switching to global cursor for {self._stream_name}." - ) - self._use_global_cursor = True - if not record.associated_slice: raise ValueError( "Invalid state as stream slices that are emitted should refer to an existing cursor" From 19ad269f8f802f4701cc5b680eec282fd2183a99 Mon Sep 17 00:00:00 2001 From: Anatolii Yatsuk Date: Mon, 17 Feb 2025 14:45:25 +0200 Subject: [PATCH 16/26] Revert test limits --- .../declarative/incremental/concurrent_partition_cursor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py b/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py index 74d7f8893..ed67e8166 100644 --- a/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +++ b/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py @@ -59,8 +59,8 @@ class ConcurrentPerPartitionCursor(Cursor): CurrentPerPartitionCursor expects the state of the ConcurrentCursor to follow the format {cursor_field: cursor_value}. 
""" - DEFAULT_MAX_PARTITIONS_NUMBER = 200 - SWITCH_TO_GLOBAL_LIMIT = 100 + DEFAULT_MAX_PARTITIONS_NUMBER = 25_000 + SWITCH_TO_GLOBAL_LIMIT = 10_000 _NO_STATE: Mapping[str, Any] = {} _NO_CURSOR_STATE: Mapping[str, Any] = {} _GLOBAL_STATE_KEY = "state" From 6498528eab15f1bf261df588201a72922d895634 Mon Sep 17 00:00:00 2001 From: Anatolii Yatsuk Date: Mon, 17 Feb 2025 15:36:26 +0200 Subject: [PATCH 17/26] Fix format --- .../declarative/incremental/concurrent_partition_cursor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py b/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py index ed67e8166..84d3cb6e2 100644 --- a/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +++ b/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py @@ -150,8 +150,8 @@ def close_partition(self, partition: Partition) -> None: self._cursor_per_partition[partition_key].close_partition(partition=partition) cursor = self._cursor_per_partition[partition_key] if ( - partition_key in self._finished_partitions - and self._semaphore_per_partition[partition_key]._value == 0 + partition_key in self._finished_partitions + and self._semaphore_per_partition[partition_key]._value == 0 ): self._update_global_cursor(cursor.state[self.cursor_field.cursor_field_key]) self._emit_state_message() From d3e7fe220eda8f3320c013a5b1cb7439b40db6f3 Mon Sep 17 00:00:00 2001 From: Anatolii Yatsuk Date: Mon, 17 Feb 2025 18:03:38 +0200 Subject: [PATCH 18/26] Add parent state updates --- .../concurrent_partition_cursor.py | 49 ++++++++++++++++--- 1 file changed, 41 insertions(+), 8 deletions(-) diff --git a/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py b/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py index ed67e8166..aa3eef02a 100644 --- a/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +++ b/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py @@ -95,6 +95,10 @@ def __init__( # the oldest partitions can be efficiently removed, maintaining the most recent partitions. self._cursor_per_partition: OrderedDict[str, ConcurrentCursor] = OrderedDict() self._semaphore_per_partition: OrderedDict[str, threading.Semaphore] = OrderedDict() + + # Parent-state tracking: store each partition’s parent state in creation order + self._partition_parent_state_map: OrderedDict[str, Mapping[str, Any]] = OrderedDict() + self._finished_partitions: set[str] = set() self._lock = threading.Lock() self._timer = Timer() @@ -154,10 +158,32 @@ def close_partition(self, partition: Partition) -> None: and self._semaphore_per_partition[partition_key]._value == 0 ): self._update_global_cursor(cursor.state[self.cursor_field.cursor_field_key]) - self._emit_state_message() + + self._check_and_update_parent_state() + + self._emit_state_message() self._semaphore_per_partition[partition_key].acquire() + def _check_and_update_parent_state(self) -> None: + """ + If all slices for the earliest partitions are closed, pop them from the left + of _partition_parent_state_map and update _parent_state to the most recent popped. 
+ """ + last_closed_state = None + # We iterate in creation order (left to right) in the OrderedDict + for p_key in list(self._partition_parent_state_map.keys()): + # If this partition is not fully closed, stop + if p_key not in self._finished_partitions or self._semaphore_per_partition[p_key]._value != 0: + break + # Otherwise, we pop from the left + _, closed_parent_state = self._partition_parent_state_map.popitem(last=False) + last_closed_state = closed_parent_state + + # If we popped at least one partition, update the parent_state to that partition's parent state + if last_closed_state is not None: + self._parent_state = last_closed_state + def ensure_at_least_one_state_emitted(self) -> None: """ The platform expect to have at least one state message on successful syncs. Hence, whatever happens, we expect this method to be @@ -202,13 +228,17 @@ def stream_slices(self) -> Iterable[StreamSlice]: slices = self._partition_router.stream_slices() self._timer.start() - for partition in slices: - yield from self._generate_slices_from_partition(partition) + for partition, last, parent_state in iterate_with_last_flag_and_state( + slices, self._partition_router.get_stream_state + ): + yield from self._generate_slices_from_partition(partition, parent_state) - def _generate_slices_from_partition(self, partition: StreamSlice) -> Iterable[StreamSlice]: + def _generate_slices_from_partition(self, partition: StreamSlice, parent_state: Mapping[str, Any]) -> Iterable[StreamSlice]: # Ensure the maximum number of partitions is not exceeded self._ensure_partition_limit() + partition_key = self._to_partition_key(partition.partition) + cursor = self._cursor_per_partition.get(self._to_partition_key(partition.partition)) if not cursor: cursor = self._create_cursor( @@ -216,18 +246,21 @@ def _generate_slices_from_partition(self, partition: StreamSlice) -> Iterable[St self._lookback_window if self._global_cursor else 0, ) with self._lock: - self._cursor_per_partition[self._to_partition_key(partition.partition)] = cursor - self._semaphore_per_partition[self._to_partition_key(partition.partition)] = ( + self._cursor_per_partition[partition_key] = cursor + self._semaphore_per_partition[partition_key] = ( threading.Semaphore(0) ) + with self._lock: + self._partition_parent_state_map[partition_key] = deepcopy(parent_state) + for cursor_slice, is_last_slice, _ in iterate_with_last_flag_and_state( cursor.stream_slices(), lambda: None, ): - self._semaphore_per_partition[self._to_partition_key(partition.partition)].release() + self._semaphore_per_partition[partition_key].release() if is_last_slice: - self._finished_partitions.add(self._to_partition_key(partition.partition)) + self._finished_partitions.add(partition_key) yield StreamSlice( partition=partition, cursor_slice=cursor_slice, extra_fields=partition.extra_fields ) From 7b4964edb72b800855114215d710387a529e232c Mon Sep 17 00:00:00 2001 From: Anatolii Yatsuk Date: Mon, 17 Feb 2025 18:14:19 +0200 Subject: [PATCH 19/26] Move acquiring the semaphore --- .../declarative/incremental/concurrent_partition_cursor.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py b/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py index 84d3cb6e2..efa5996b3 100644 --- a/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +++ b/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py @@ -146,6 +146,7 @@ def close_partition(self, 
partition: Partition) -> None: partition_key = self._to_partition_key(stream_slice.partition) with self._lock: + self._semaphore_per_partition[partition_key].acquire() if not self._use_global_cursor: self._cursor_per_partition[partition_key].close_partition(partition=partition) cursor = self._cursor_per_partition[partition_key] @@ -156,8 +157,6 @@ def close_partition(self, partition: Partition) -> None: self._update_global_cursor(cursor.state[self.cursor_field.cursor_field_key]) self._emit_state_message() - self._semaphore_per_partition[partition_key].acquire() - def ensure_at_least_one_state_emitted(self) -> None: """ The platform expect to have at least one state message on successful syncs. Hence, whatever happens, we expect this method to be From 203c1312ab97d1c2eec3a349f45edef69a307806 Mon Sep 17 00:00:00 2001 From: Anatolii Yatsuk Date: Tue, 18 Feb 2025 13:39:05 +0200 Subject: [PATCH 20/26] Refactor to store only unique states --- .../concurrent_partition_cursor.py | 55 +++++++++++++------ .../test_concurrent_perpartitioncursor.py | 2 + 2 files changed, 39 insertions(+), 18 deletions(-) diff --git a/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py b/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py index ab9c5258c..1ece3c579 100644 --- a/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +++ b/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py @@ -166,20 +166,36 @@ def close_partition(self, partition: Partition) -> None: def _check_and_update_parent_state(self) -> None: """ - If all slices for the earliest partitions are closed, pop them from the left - of _partition_parent_state_map and update _parent_state to the most recent popped. + Pop the leftmost partition state from _partition_parent_state_map only if + *all partitions* up to (and including) that partition key in _semaphore_per_partition + are fully finished (i.e. in _finished_partitions and semaphore._value == 0). 
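+        The check is repeated until the first unfinished partition is reached; _parent_state is
+        then set to the parent state of the last entry that was popped, if any.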
""" last_closed_state = None - # We iterate in creation order (left to right) in the OrderedDict - for p_key in list(self._partition_parent_state_map.keys()): - # If this partition is not fully closed, stop - if p_key not in self._finished_partitions or self._semaphore_per_partition[p_key]._value != 0: + + while self._partition_parent_state_map: + # Look at the earliest partition key in creation order + earliest_key = next(iter(self._partition_parent_state_map)) + + # Verify ALL partitions from the left up to earliest_key are finished + all_left_finished = True + for p_key, sem in self._semaphore_per_partition.items(): + # If any earlier partition is still not finished, we must stop + if p_key not in self._finished_partitions or sem._value != 0: + all_left_finished = False + break + # Once we've reached earliest_key in the semaphore order, we can stop checking + if p_key == earliest_key: + break + + # If the partitions up to earliest_key are not all finished, break the while-loop + if not all_left_finished: break - # Otherwise, we pop from the left + + # Otherwise, pop the leftmost entry from parent-state map _, closed_parent_state = self._partition_parent_state_map.popitem(last=False) last_closed_state = closed_parent_state - # If we popped at least one partition, update the parent_state to that partition's parent state + # Update _parent_state if we actually popped at least one partition if last_closed_state is not None: self._parent_state = last_closed_state @@ -228,11 +244,13 @@ def stream_slices(self) -> Iterable[StreamSlice]: slices = self._partition_router.stream_slices() self._timer.start() for partition, last, parent_state in iterate_with_last_flag_and_state( - slices, self._partition_router.get_stream_state + slices, self._partition_router.get_stream_state ): yield from self._generate_slices_from_partition(partition, parent_state) - def _generate_slices_from_partition(self, partition: StreamSlice, parent_state: Mapping[str, Any]) -> Iterable[StreamSlice]: + def _generate_slices_from_partition( + self, partition: StreamSlice, parent_state: Mapping[str, Any] + ) -> Iterable[StreamSlice]: # Ensure the maximum number of partitions is not exceeded self._ensure_partition_limit() @@ -247,12 +265,17 @@ def _generate_slices_from_partition(self, partition: StreamSlice, parent_state: with self._lock: self._number_of_partitions += 1 self._cursor_per_partition[partition_key] = cursor - self._semaphore_per_partition[partition_key] = ( - threading.Semaphore(0) - ) + self._semaphore_per_partition[partition_key] = threading.Semaphore(0) with self._lock: - self._partition_parent_state_map[partition_key] = deepcopy(parent_state) + if ( + len(self._partition_parent_state_map) == 0 + or self._partition_parent_state_map[ + next(reversed(self._partition_parent_state_map)) + ] + != parent_state + ): + self._partition_parent_state_map[partition_key] = deepcopy(parent_state) for cursor_slice, is_last_slice, _ in iterate_with_last_flag_and_state( cursor.stream_slices(), @@ -287,7 +310,6 @@ def _ensure_partition_limit(self) -> None: self._use_global_cursor = True with self._lock: - self._number_of_partitions += 1 while len(self._cursor_per_partition) > self.DEFAULT_MAX_PARTITIONS_NUMBER - 1: # Try removing finished partitions first for partition_key in list(self._cursor_per_partition.keys()): @@ -372,9 +394,6 @@ def _set_initial_state(self, stream_state: StreamState) -> None: self._cursor_per_partition[self._to_partition_key(state["partition"])] = ( self._create_cursor(state["cursor"]) ) - 
self._semaphore_per_partition[self._to_partition_key(state["partition"])] = ( - threading.Semaphore(0) - ) # set default state for missing partitions if it is per partition with fallback to global if self._GLOBAL_STATE_KEY in stream_state: diff --git a/unit_tests/sources/declarative/incremental/test_concurrent_perpartitioncursor.py b/unit_tests/sources/declarative/incremental/test_concurrent_perpartitioncursor.py index 767d24874..c40222291 100644 --- a/unit_tests/sources/declarative/incremental/test_concurrent_perpartitioncursor.py +++ b/unit_tests/sources/declarative/incremental/test_concurrent_perpartitioncursor.py @@ -2027,6 +2027,8 @@ def test_incremental_parent_state_no_records( "cursor": {"updated_at": PARENT_COMMENT_CURSOR_PARTITION_1}, } ], + "state": {}, + "use_global_cursor": False, "parent_state": {"posts": {"updated_at": PARENT_POSTS_CURSOR}}, } }, From 671fab452c10602b4a7ae5c99797ccfe173d3110 Mon Sep 17 00:00:00 2001 From: Anatolii Yatsuk Date: Tue, 18 Feb 2025 18:43:06 +0200 Subject: [PATCH 21/26] Add intermediate states validation to unit tests --- .../test_concurrent_perpartitioncursor.py | 3521 +++++++++-------- 1 file changed, 1904 insertions(+), 1617 deletions(-) diff --git a/unit_tests/sources/declarative/incremental/test_concurrent_perpartitioncursor.py b/unit_tests/sources/declarative/incremental/test_concurrent_perpartitioncursor.py index c40222291..23459366d 100644 --- a/unit_tests/sources/declarative/incremental/test_concurrent_perpartitioncursor.py +++ b/unit_tests/sources/declarative/incremental/test_concurrent_perpartitioncursor.py @@ -306,7 +306,7 @@ def run_mocked_test( - mock_requests, manifest, config, stream_name, initial_state, expected_records, expected_state + mock_requests, manifest, config, stream_name, initial_state, expected_records, expected_state ): """ Helper function to mock requests, run the test, and verify the results. @@ -356,15 +356,15 @@ def run_mocked_test( [req for req in m.request_history if unquote(req.url) == unquote(url)] ) assert ( - request_count == 1 + request_count == 1 ), f"URL {url} was called {request_count} times, expected exactly once." 
def _run_read( - manifest: Mapping[str, Any], - config: Mapping[str, Any], - stream_name: str, - state: Optional[Union[List[AirbyteStateMessage], MutableMapping[str, Any]]] = None, + manifest: Mapping[str, Any], + config: Mapping[str, Any], + stream_name: str, + state: Optional[Union[List[AirbyteStateMessage], MutableMapping[str, Any]]] = None, ) -> EntrypointOutput: source = ConcurrentDeclarativeSource( source_config=manifest, config=config, catalog=None, state=state @@ -416,8 +416,8 @@ def _run_read( INITIAL_STATE_PARTITION_11_CURSOR.replace("Z", "") ) LOOKBACK_DATE = ( - INITIAL_GLOBAL_CURSOR_DATE - timedelta(days=LOOKBACK_WINDOW_DAYS) -).isoformat() + "Z" + INITIAL_GLOBAL_CURSOR_DATE - timedelta(days=LOOKBACK_WINDOW_DAYS) + ).isoformat() + "Z" PARTITION_SYNC_START_TIME = "2024-01-02T00:00:00Z" @@ -426,316 +426,316 @@ def _run_read( "test_name, manifest, mock_requests, expected_records, initial_state, expected_state", [ ( - "test_incremental_parent_state", - SUBSTREAM_MANIFEST_NO_DEPENDENCY, - [ - # Fetch the first page of posts - ( - f"https://api.example.com/community/posts?per_page=100&start_time={START_DATE}", - { - "posts": [ - {"id": 1, "updated_at": POST_1_UPDATED_AT}, - {"id": 2, "updated_at": POST_2_UPDATED_AT}, - ], - "next_page": f"https://api.example.com/community/posts?per_page=100&start_time={START_DATE}&page=2", - }, - ), - # Fetch the second page of posts - ( - f"https://api.example.com/community/posts?per_page=100&start_time={START_DATE}&page=2", - {"posts": [{"id": 3, "updated_at": POST_3_UPDATED_AT}]}, - ), - # Fetch the first page of comments for post 1 - ( - "https://api.example.com/community/posts/1/comments?per_page=100", - { - "comments": [ + "test_incremental_parent_state", + SUBSTREAM_MANIFEST_NO_DEPENDENCY, + [ + # Fetch the first page of posts + ( + f"https://api.example.com/community/posts?per_page=100&start_time={START_DATE}", { - "id": 9, - "post_id": 1, - "updated_at": COMMENT_9_OLDEST, # No requests for comment 9, filtered out due to the date + "posts": [ + {"id": 1, "updated_at": POST_1_UPDATED_AT}, + {"id": 2, "updated_at": POST_2_UPDATED_AT}, + ], + "next_page": f"https://api.example.com/community/posts?per_page=100&start_time={START_DATE}&page=2", }, + ), + # Fetch the second page of posts + ( + f"https://api.example.com/community/posts?per_page=100&start_time={START_DATE}&page=2", + {"posts": [{"id": 3, "updated_at": POST_3_UPDATED_AT}]}, + ), + # Fetch the first page of comments for post 1 + ( + "https://api.example.com/community/posts/1/comments?per_page=100", { - "id": 10, - "post_id": 1, - "updated_at": COMMENT_10_UPDATED_AT, + "comments": [ + { + "id": 9, + "post_id": 1, + "updated_at": COMMENT_9_OLDEST, # No requests for comment 9, filtered out due to the date + }, + { + "id": 10, + "post_id": 1, + "updated_at": COMMENT_10_UPDATED_AT, + }, + { + "id": 11, + "post_id": 1, + "updated_at": COMMENT_11_UPDATED_AT, + }, + ], + "next_page": "https://api.example.com/community/posts/1/comments?per_page=100&page=2", }, + ), + # Fetch the second page of comments for post 1 + ( + "https://api.example.com/community/posts/1/comments?per_page=100&page=2", { - "id": 11, - "post_id": 1, - "updated_at": COMMENT_11_UPDATED_AT, + "comments": [ + { + "id": 12, + "post_id": 1, + "updated_at": COMMENT_12_UPDATED_AT, + } + ] }, - ], - "next_page": "https://api.example.com/community/posts/1/comments?per_page=100&page=2", - }, - ), - # Fetch the second page of comments for post 1 - ( - "https://api.example.com/community/posts/1/comments?per_page=100&page=2", - { - 
"comments": [ + ), + # Fetch the first page of votes for comment 10 of post 1 + ( + f"https://api.example.com/community/posts/1/comments/10/votes?per_page=100&start_time={INITIAL_STATE_PARTITION_10_CURSOR}", { - "id": 12, - "post_id": 1, - "updated_at": COMMENT_12_UPDATED_AT, - } - ] - }, - ), - # Fetch the first page of votes for comment 10 of post 1 - ( - f"https://api.example.com/community/posts/1/comments/10/votes?per_page=100&start_time={INITIAL_STATE_PARTITION_10_CURSOR}", - { - "votes": [ + "votes": [ + { + "id": 100, + "comment_id": 10, + "created_at": VOTE_100_CREATED_AT, + } + ], + "next_page": f"https://api.example.com/community/posts/1/comments/10/votes?per_page=100&page=2&start_time={INITIAL_STATE_PARTITION_10_CURSOR}", + }, + ), + # Fetch the second page of votes for comment 10 of post 1 + ( + f"https://api.example.com/community/posts/1/comments/10/votes?per_page=100&page=2&start_time={INITIAL_STATE_PARTITION_10_CURSOR}", { - "id": 100, - "comment_id": 10, - "created_at": VOTE_100_CREATED_AT, - } - ], - "next_page": f"https://api.example.com/community/posts/1/comments/10/votes?per_page=100&page=2&start_time={INITIAL_STATE_PARTITION_10_CURSOR}", - }, - ), - # Fetch the second page of votes for comment 10 of post 1 - ( - f"https://api.example.com/community/posts/1/comments/10/votes?per_page=100&page=2&start_time={INITIAL_STATE_PARTITION_10_CURSOR}", - { - "votes": [ + "votes": [ + { + "id": 101, + "comment_id": 10, + "created_at": VOTE_101_CREATED_AT, + } + ] + }, + ), + # Fetch the first page of votes for comment 11 of post 1 + ( + f"https://api.example.com/community/posts/1/comments/11/votes?per_page=100&start_time={INITIAL_STATE_PARTITION_11_CURSOR}", { - "id": 101, - "comment_id": 10, - "created_at": VOTE_101_CREATED_AT, - } - ] - }, - ), - # Fetch the first page of votes for comment 11 of post 1 - ( - f"https://api.example.com/community/posts/1/comments/11/votes?per_page=100&start_time={INITIAL_STATE_PARTITION_11_CURSOR}", - { - "votes": [ + "votes": [ + { + "id": 111, + "comment_id": 11, + "created_at": VOTE_111_CREATED_AT, + } + ] + }, + ), + # Fetch the first page of votes for comment 12 of post 1 + ( + f"https://api.example.com/community/posts/1/comments/12/votes?per_page=100&start_time={LOOKBACK_DATE}", + {"votes": []}, + ), + # Fetch the first page of comments for post 2 + ( + "https://api.example.com/community/posts/2/comments?per_page=100", { - "id": 111, - "comment_id": 11, - "created_at": VOTE_111_CREATED_AT, - } - ] - }, - ), - # Fetch the first page of votes for comment 12 of post 1 - ( - f"https://api.example.com/community/posts/1/comments/12/votes?per_page=100&start_time={LOOKBACK_DATE}", - {"votes": []}, - ), - # Fetch the first page of comments for post 2 - ( - "https://api.example.com/community/posts/2/comments?per_page=100", - { - "comments": [ + "comments": [ + { + "id": 20, + "post_id": 2, + "updated_at": COMMENT_20_UPDATED_AT, + } + ], + "next_page": "https://api.example.com/community/posts/2/comments?per_page=100&page=2", + }, + ), + # Fetch the second page of comments for post 2 + ( + "https://api.example.com/community/posts/2/comments?per_page=100&page=2", { - "id": 20, - "post_id": 2, - "updated_at": COMMENT_20_UPDATED_AT, - } - ], - "next_page": "https://api.example.com/community/posts/2/comments?per_page=100&page=2", - }, - ), - # Fetch the second page of comments for post 2 - ( - "https://api.example.com/community/posts/2/comments?per_page=100&page=2", - { - "comments": [ + "comments": [ + { + "id": 21, + "post_id": 2, + "updated_at": 
COMMENT_21_UPDATED_AT, + } + ] + }, + ), + # Fetch the first page of votes for comment 20 of post 2 + ( + f"https://api.example.com/community/posts/2/comments/20/votes?per_page=100&start_time={LOOKBACK_DATE}", { - "id": 21, - "post_id": 2, - "updated_at": COMMENT_21_UPDATED_AT, - } - ] - }, - ), - # Fetch the first page of votes for comment 20 of post 2 - ( - f"https://api.example.com/community/posts/2/comments/20/votes?per_page=100&start_time={LOOKBACK_DATE}", - { - "votes": [ + "votes": [ + { + "id": 200, + "comment_id": 20, + "created_at": VOTE_200_CREATED_AT, + } + ] + }, + ), + # Fetch the first page of votes for comment 21 of post 2 + ( + f"https://api.example.com/community/posts/2/comments/21/votes?per_page=100&start_time={LOOKBACK_DATE}", { - "id": 200, - "comment_id": 20, - "created_at": VOTE_200_CREATED_AT, - } - ] + "votes": [ + { + "id": 210, + "comment_id": 21, + "created_at": VOTE_210_CREATED_AT, + } + ] + }, + ), + # Fetch the first page of comments for post 3 + ( + "https://api.example.com/community/posts/3/comments?per_page=100", + { + "comments": [ + { + "id": 30, + "post_id": 3, + "updated_at": COMMENT_30_UPDATED_AT, + } + ] + }, + ), + # Fetch the first page of votes for comment 30 of post 3 + ( + f"https://api.example.com/community/posts/3/comments/30/votes?per_page=100&start_time={LOOKBACK_DATE}", + { + "votes": [ + { + "id": 300, + "comment_id": 30, + "created_at": VOTE_300_CREATED_AT_TIMESTAMP, + } + ] + }, + ), + ], + # Expected records + [ + { + "comment_id": 10, + "comment_updated_at": COMMENT_10_UPDATED_AT, + "created_at": VOTE_100_CREATED_AT, + "id": 100, }, - ), - # Fetch the first page of votes for comment 21 of post 2 - ( - f"https://api.example.com/community/posts/2/comments/21/votes?per_page=100&start_time={LOOKBACK_DATE}", { - "votes": [ - { - "id": 210, - "comment_id": 21, - "created_at": VOTE_210_CREATED_AT, - } - ] + "comment_id": 10, + "comment_updated_at": COMMENT_10_UPDATED_AT, + "created_at": VOTE_101_CREATED_AT, + "id": 101, }, - ), - # Fetch the first page of comments for post 3 - ( - "https://api.example.com/community/posts/3/comments?per_page=100", { - "comments": [ - { - "id": 30, - "post_id": 3, - "updated_at": COMMENT_30_UPDATED_AT, - } - ] + "comment_id": 11, + "comment_updated_at": COMMENT_11_UPDATED_AT, + "created_at": VOTE_111_CREATED_AT, + "id": 111, }, - ), - # Fetch the first page of votes for comment 30 of post 3 - ( - f"https://api.example.com/community/posts/3/comments/30/votes?per_page=100&start_time={LOOKBACK_DATE}", { - "votes": [ - { - "id": 300, - "comment_id": 30, - "created_at": VOTE_300_CREATED_AT_TIMESTAMP, - } - ] + "comment_id": 20, + "comment_updated_at": COMMENT_20_UPDATED_AT, + "created_at": VOTE_200_CREATED_AT, + "id": 200, }, - ), - ], - # Expected records - [ - { - "comment_id": 10, - "comment_updated_at": COMMENT_10_UPDATED_AT, - "created_at": VOTE_100_CREATED_AT, - "id": 100, - }, - { - "comment_id": 10, - "comment_updated_at": COMMENT_10_UPDATED_AT, - "created_at": VOTE_101_CREATED_AT, - "id": 101, - }, - { - "comment_id": 11, - "comment_updated_at": COMMENT_11_UPDATED_AT, - "created_at": VOTE_111_CREATED_AT, - "id": 111, - }, - { - "comment_id": 20, - "comment_updated_at": COMMENT_20_UPDATED_AT, - "created_at": VOTE_200_CREATED_AT, - "id": 200, - }, - { - "comment_id": 21, - "comment_updated_at": COMMENT_21_UPDATED_AT, - "created_at": VOTE_210_CREATED_AT, - "id": 210, - }, - { - "comment_id": 30, - "comment_updated_at": COMMENT_30_UPDATED_AT, - "created_at": str(VOTE_300_CREATED_AT_TIMESTAMP), - "id": 300, - }, 
- ], - # Initial state - { - # This should not happen since parent state is disabled, but I've added this to validate that and - # incoming parent_state is ignored when the parent stream's incremental_dependency is disabled - "parent_state": { - "post_comments": { - "states": [ - { - "partition": {"id": 1, "parent_slice": {}}, - "cursor": {"updated_at": PARENT_COMMENT_CURSOR_PARTITION_1}, - } - ], - "parent_state": {"posts": {"updated_at": PARENT_POSTS_CURSOR}}, - } - }, - "states": [ { - "partition": { - "id": 10, - "parent_slice": {"id": 1, "parent_slice": {}}, - }, - "cursor": {"created_at": INITIAL_STATE_PARTITION_10_CURSOR_TIMESTAMP}, + "comment_id": 21, + "comment_updated_at": COMMENT_21_UPDATED_AT, + "created_at": VOTE_210_CREATED_AT, + "id": 210, }, { - "partition": { - "id": 11, - "parent_slice": {"id": 1, "parent_slice": {}}, - }, - "cursor": {"created_at": INITIAL_STATE_PARTITION_11_CURSOR}, + "comment_id": 30, + "comment_updated_at": COMMENT_30_UPDATED_AT, + "created_at": str(VOTE_300_CREATED_AT_TIMESTAMP), + "id": 300, }, ], - "state": {"created_at": INITIAL_STATE_PARTITION_11_CURSOR_TIMESTAMP}, - "lookback_window": 86400, - }, - # Expected state - { - "states": [ - { - "partition": { - "id": 10, - "parent_slice": {"id": 1, "parent_slice": {}}, - }, - "cursor": {"created_at": VOTE_100_CREATED_AT}, + # Initial state + { + # This should not happen since parent state is disabled, but I've added this to validate that and + # incoming parent_state is ignored when the parent stream's incremental_dependency is disabled + "parent_state": { + "post_comments": { + "states": [ + { + "partition": {"id": 1, "parent_slice": {}}, + "cursor": {"updated_at": PARENT_COMMENT_CURSOR_PARTITION_1}, + } + ], + "parent_state": {"posts": {"updated_at": PARENT_POSTS_CURSOR}}, + } }, - { - "partition": { - "id": 11, - "parent_slice": {"id": 1, "parent_slice": {}}, + "states": [ + { + "partition": { + "id": 10, + "parent_slice": {"id": 1, "parent_slice": {}}, + }, + "cursor": {"created_at": INITIAL_STATE_PARTITION_10_CURSOR_TIMESTAMP}, }, - "cursor": {"created_at": VOTE_111_CREATED_AT}, - }, - { - "partition": { - "id": 12, - "parent_slice": {"id": 1, "parent_slice": {}}, + { + "partition": { + "id": 11, + "parent_slice": {"id": 1, "parent_slice": {}}, + }, + "cursor": {"created_at": INITIAL_STATE_PARTITION_11_CURSOR}, }, - "cursor": {"created_at": LOOKBACK_DATE}, - }, - { - "partition": { - "id": 20, - "parent_slice": {"id": 2, "parent_slice": {}}, + ], + "state": {"created_at": INITIAL_STATE_PARTITION_11_CURSOR_TIMESTAMP}, + "lookback_window": 86400, + }, + # Expected state + { + "states": [ + { + "partition": { + "id": 10, + "parent_slice": {"id": 1, "parent_slice": {}}, + }, + "cursor": {"created_at": VOTE_100_CREATED_AT}, }, - "cursor": {"created_at": VOTE_200_CREATED_AT}, - }, - { - "partition": { - "id": 21, - "parent_slice": {"id": 2, "parent_slice": {}}, + { + "partition": { + "id": 11, + "parent_slice": {"id": 1, "parent_slice": {}}, + }, + "cursor": {"created_at": VOTE_111_CREATED_AT}, }, - "cursor": {"created_at": VOTE_210_CREATED_AT}, - }, - { - "partition": { - "id": 30, - "parent_slice": {"id": 3, "parent_slice": {}}, + { + "partition": { + "id": 12, + "parent_slice": {"id": 1, "parent_slice": {}}, + }, + "cursor": {"created_at": LOOKBACK_DATE}, }, - "cursor": {"created_at": VOTE_300_CREATED_AT}, - }, - ], - "use_global_cursor": False, - "lookback_window": 1, - "parent_state": {}, - "state": {"created_at": VOTE_100_CREATED_AT}, - }, + { + "partition": { + "id": 20, + "parent_slice": {"id": 
2, "parent_slice": {}}, + }, + "cursor": {"created_at": VOTE_200_CREATED_AT}, + }, + { + "partition": { + "id": 21, + "parent_slice": {"id": 2, "parent_slice": {}}, + }, + "cursor": {"created_at": VOTE_210_CREATED_AT}, + }, + { + "partition": { + "id": 30, + "parent_slice": {"id": 3, "parent_slice": {}}, + }, + "cursor": {"created_at": VOTE_300_CREATED_AT}, + }, + ], + "use_global_cursor": False, + "lookback_window": 1, + "parent_state": {}, + "state": {"created_at": VOTE_100_CREATED_AT}, + }, ), ], ) def test_incremental_parent_state_no_incremental_dependency( - test_name, manifest, mock_requests, expected_records, initial_state, expected_state + test_name, manifest, mock_requests, expected_records, initial_state, expected_state ): """ This is a pretty complicated test that syncs a low-code connector stream with three levels of substreams @@ -761,12 +761,13 @@ def test_incremental_parent_state_no_incremental_dependency( def run_incremental_parent_state_test( - manifest, - mock_requests, - expected_records, - num_intermediate_states, - initial_state, - expected_states, + manifest, + mock_requests, + expected_records, + num_intermediate_states, + intermidiate_states, + initial_state, + expected_states, ): """ Run an incremental parent state test for the specified stream. @@ -784,6 +785,7 @@ def run_incremental_parent_state_test( mock_requests (list): A list of tuples containing URL and response data for mocking API requests. expected_records (list): The expected records to compare against the output. num_intermediate_states (int): The number of intermediate states to expect. + intermidiate_states (list): A list of intermediate states to assert initial_state (list): The initial state to start the read operation. expected_states (list): A list of expected final states after the read operation. """ @@ -830,6 +832,12 @@ def run_incremental_parent_state_test( # Assert that the number of intermediate states is as expected assert len(intermediate_states) - 1 == num_intermediate_states + # Extract just the Python dict from each state message + all_state_dicts = [st[0].stream.stream_state.__dict__ for st in intermediate_states] + + for idx, itermidiate_state in enumerate(all_state_dicts): + assert itermidiate_state == intermidiate_states[idx], idx + # For each intermediate state, perform another read starting from that state for state, records_before_state in intermediate_states[:-1]: output_intermediate = _run_read(manifest, CONFIG, STREAM_NAME, [state]) @@ -848,8 +856,8 @@ def run_incremental_parent_state_test( {orjson.dumps(record): record for record in expected_records}.values() ) assert ( - sorted(cumulative_records_state_deduped, key=lambda x: x["id"]) - == sorted(expected_records_set, key=lambda x: x["id"]) + sorted(cumulative_records_state_deduped, key=lambda x: x["id"]) + == sorted(expected_records_set, key=lambda x: x["id"]) ), f"Records mismatch with intermediate state {state}. Expected {expected_records}, got {cumulative_records_state_deduped}" # Store the final state after each intermediate read @@ -862,336 +870,615 @@ def run_incremental_parent_state_test( # Assert that the final state matches the expected state for all runs for i, final_state in enumerate(final_states): assert ( - final_state in expected_states + final_state in expected_states ), f"Final state mismatch at run {i + 1}. 
Expected {expected_states}, got {final_state}" -@pytest.mark.parametrize( - "test_name, manifest, mock_requests, expected_records, num_intermediate_states, initial_state, expected_state", - [ - ( - "test_incremental_parent_state", - SUBSTREAM_MANIFEST, - [ - # Fetch the first page of posts - ( - f"https://api.example.com/community/posts?per_page=100&start_time={PARENT_POSTS_CURSOR}", - { - "posts": [ - {"id": 1, "updated_at": POST_1_UPDATED_AT}, - {"id": 2, "updated_at": POST_2_UPDATED_AT}, - ], - "next_page": ( - f"https://api.example.com/community/posts" - f"?per_page=100&start_time={PARENT_POSTS_CURSOR}&page=2" - ), - }, - ), - # Fetch the second page of posts - ( - f"https://api.example.com/community/posts?per_page=100&start_time={PARENT_POSTS_CURSOR}&page=2", - {"posts": [{"id": 3, "updated_at": POST_3_UPDATED_AT}]}, - ), - # Fetch the first page of comments for post 1 - ( - "https://api.example.com/community/posts/1/comments?per_page=100", - { - "comments": [ - { - "id": 9, - "post_id": 1, - "updated_at": COMMENT_9_OLDEST, - }, - { - "id": 10, - "post_id": 1, - "updated_at": COMMENT_10_UPDATED_AT, - }, - { - "id": 11, - "post_id": 1, - "updated_at": COMMENT_11_UPDATED_AT, - }, - ], - "next_page": "https://api.example.com/community/posts/1/comments?per_page=100&page=2", - }, - ), - # Fetch the second page of comments for post 1 - ( - "https://api.example.com/community/posts/1/comments?per_page=100&page=2", - {"comments": [{"id": 12, "post_id": 1, "updated_at": COMMENT_12_UPDATED_AT}]}, - ), - # Fetch the first page of votes for comment 10 of post 1 - ( - f"https://api.example.com/community/posts/1/comments/10/votes?per_page=100&start_time={INITIAL_STATE_PARTITION_10_CURSOR}", +INITIAL_STATE = { + "parent_state": { + "post_comments": { + "states": [ + { + "partition": {"id": 1, "parent_slice": {}}, + "cursor": {"updated_at": PARENT_COMMENT_CURSOR_PARTITION_1}, + } + ], + "parent_state": {"posts": {"updated_at": PARENT_POSTS_CURSOR}}, + } + }, + "state": {"created_at": INITIAL_GLOBAL_CURSOR}, + "states": [ + { + "partition": { + "id": 10, + "parent_slice": {"id": 1, "parent_slice": {}}, + }, + "cursor": {"created_at": INITIAL_STATE_PARTITION_10_CURSOR}, + }, + { + "partition": { + "id": 11, + "parent_slice": {"id": 1, "parent_slice": {}}, + }, + "cursor": {"created_at": INITIAL_STATE_PARTITION_11_CURSOR}, + }, + ], + "lookback_window": 86400, +} + +INTERMEDIATE_STATES = [ + { + "use_global_cursor": False, + "states": [ + { + "partition": {"id": 10, "parent_slice": {"id": 1, "parent_slice": {}}}, + "cursor": {"created_at": "2024-01-15T00:00:00Z"}, + }, + { + "partition": {"id": 11, "parent_slice": {"id": 1, "parent_slice": {}}}, + "cursor": {"created_at": "2024-01-03T00:00:02Z"}, + }, + ], + "state": {"created_at": "2024-01-03T00:00:02Z"}, + "lookback_window": 86400, + "parent_state": { + "post_comments": { + "use_global_cursor": False, + "state": {}, + "states": [ { - "votes": [ - { - "id": 100, - "comment_id": 10, - "created_at": VOTE_100_CREATED_AT, - } - ], - "next_page": ( - f"https://api.example.com/community/posts/1/comments/10/votes" - f"?per_page=100&page=2&start_time={INITIAL_STATE_PARTITION_10_CURSOR}" - ), - }, - ), - # Fetch the second page of votes for comment 10 of post 1 - ( - f"https://api.example.com/community/posts/1/comments/10/votes" - f"?per_page=100&page=2&start_time={INITIAL_STATE_PARTITION_10_CURSOR}", - {"votes": [{"id": 101, "comment_id": 10, "created_at": VOTE_101_CREATED_AT}]}, - ), - # Fetch the first page of votes for comment 11 of post 1 - ( - 
f"https://api.example.com/community/posts/1/comments/11/votes" - f"?per_page=100&start_time={INITIAL_STATE_PARTITION_11_CURSOR}", - {"votes": [{"id": 111, "comment_id": 11, "created_at": VOTE_111_CREATED_AT}]}, - ), - # Fetch the first page of votes for comment 12 of post 1 - ( - f"https://api.example.com/community/posts/1/comments/12/votes?per_page=100&start_time={LOOKBACK_DATE}", - {"votes": []}, - ), - # Fetch the first page of comments for post 2 - ( - "https://api.example.com/community/posts/2/comments?per_page=100", + "partition": {"id": 1, "parent_slice": {}}, + "cursor": {"updated_at": "2023-01-04T00:00:00Z"}, + } + ], + "parent_state": {"posts": {"updated_at": "2024-01-05T00:00:00Z"}}, + } + }, + }, + { + "use_global_cursor": False, + "states": [ + { + "partition": {"id": 10, "parent_slice": {"id": 1, "parent_slice": {}}}, + "cursor": {"created_at": "2024-01-15T00:00:00Z"}, + }, + { + "partition": {"id": 11, "parent_slice": {"id": 1, "parent_slice": {}}}, + "cursor": {"created_at": "2024-01-13T00:00:00Z"}, + }, + ], + "state": {"created_at": "2024-01-03T00:00:02Z"}, + "lookback_window": 86400, + "parent_state": { + "post_comments": { + "use_global_cursor": False, + "state": {}, + "states": [ { - "comments": [{"id": 20, "post_id": 2, "updated_at": COMMENT_20_UPDATED_AT}], - "next_page": "https://api.example.com/community/posts/2/comments?per_page=100&page=2", - }, - ), - # Fetch the second page of comments for post 2 - ( - "https://api.example.com/community/posts/2/comments?per_page=100&page=2", - {"comments": [{"id": 21, "post_id": 2, "updated_at": COMMENT_21_UPDATED_AT}]}, - ), - # Fetch the first page of votes for comment 20 of post 2 - ( - f"https://api.example.com/community/posts/2/comments/20/votes?per_page=100&start_time={LOOKBACK_DATE}", - {"votes": [{"id": 200, "comment_id": 20, "created_at": VOTE_200_CREATED_AT}]}, - ), - # Fetch the first page of votes for comment 21 of post 2 - ( - f"https://api.example.com/community/posts/2/comments/21/votes?per_page=100&start_time={LOOKBACK_DATE}", - {"votes": [{"id": 210, "comment_id": 21, "created_at": VOTE_210_CREATED_AT}]}, - ), - # Fetch the first page of comments for post 3 - ( - "https://api.example.com/community/posts/3/comments?per_page=100", - {"comments": [{"id": 30, "post_id": 3, "updated_at": COMMENT_30_UPDATED_AT}]}, - ), - # Fetch the first page of votes for comment 30 of post 3 - ( - f"https://api.example.com/community/posts/3/comments/30/votes?per_page=100&start_time={LOOKBACK_DATE}", + "partition": {"id": 1, "parent_slice": {}}, + "cursor": {"updated_at": "2023-01-04T00:00:00Z"}, + } + ], + "parent_state": {"posts": {"updated_at": "2024-01-05T00:00:00Z"}}, + } + }, + }, + { + "use_global_cursor": False, + "states": [ + { + "partition": {"id": 10, "parent_slice": {"id": 1, "parent_slice": {}}}, + "cursor": {"created_at": "2024-01-15T00:00:00Z"}, + }, + { + "partition": {"id": 11, "parent_slice": {"id": 1, "parent_slice": {}}}, + "cursor": {"created_at": "2024-01-13T00:00:00Z"}, + }, + { + "partition": {"id": 12, "parent_slice": {"id": 1, "parent_slice": {}}}, + "cursor": {"created_at": "2024-01-02T00:00:02Z"}, + }, + ], + "state": {"created_at": "2024-01-03T00:00:02Z"}, + "lookback_window": 86400, + "parent_state": { + "post_comments": { + "use_global_cursor": False, + "state": {}, + "states": [ { - "votes": [ - { - "id": 300, - "comment_id": 30, - "created_at": VOTE_300_CREATED_AT_TIMESTAMP, - } - ] - }, - ), - # Requests with intermediate states - # Fetch votes for comment 10 of post 1 - ( - 
f"https://api.example.com/community/posts/1/comments/10/votes?per_page=100&start_time={VOTE_100_CREATED_AT}", + "partition": {"id": 1, "parent_slice": {}}, + "cursor": {"updated_at": "2023-01-04T00:00:00Z"}, + } + ], + "parent_state": {"posts": {"updated_at": "2024-01-05T00:00:00Z"}}, + } + }, + }, + { + "use_global_cursor": False, + "states": [ + { + "partition": {"id": 10, "parent_slice": {"id": 1, "parent_slice": {}}}, + "cursor": {"created_at": "2024-01-15T00:00:00Z"}, + }, + { + "partition": {"id": 11, "parent_slice": {"id": 1, "parent_slice": {}}}, + "cursor": {"created_at": "2024-01-13T00:00:00Z"}, + }, + { + "partition": {"id": 12, "parent_slice": {"id": 1, "parent_slice": {}}}, + "cursor": {"created_at": "2024-01-02T00:00:02Z"}, + }, + { + "partition": {"id": 20, "parent_slice": {"id": 2, "parent_slice": {}}}, + "cursor": {"created_at": "2024-01-12T00:00:00Z"}, + }, + ], + "state": {"created_at": "2024-01-03T00:00:02Z"}, + "lookback_window": 86400, + "parent_state": { + "post_comments": { + "use_global_cursor": False, + "state": {}, + "states": [ { - "votes": [{"id": 100, "comment_id": 10, "created_at": VOTE_100_CREATED_AT}], - }, - ), - # Fetch votes for comment 11 of post 1 - ( - f"https://api.example.com/community/posts/1/comments/11/votes?per_page=100&start_time={VOTE_111_CREATED_AT}", + "partition": {"id": 1, "parent_slice": {}}, + "cursor": {"updated_at": "2024-01-25T00:00:00Z"}, + } + ], + "parent_state": {"posts": {"updated_at": "2024-01-05T00:00:00Z"}}, + } + }, + }, + { + "use_global_cursor": False, + "states": [ + { + "partition": {"id": 10, "parent_slice": {"id": 1, "parent_slice": {}}}, + "cursor": {"created_at": "2024-01-15T00:00:00Z"}, + }, + { + "partition": {"id": 11, "parent_slice": {"id": 1, "parent_slice": {}}}, + "cursor": {"created_at": "2024-01-13T00:00:00Z"}, + }, + { + "partition": {"id": 12, "parent_slice": {"id": 1, "parent_slice": {}}}, + "cursor": {"created_at": "2024-01-02T00:00:02Z"}, + }, + { + "partition": {"id": 20, "parent_slice": {"id": 2, "parent_slice": {}}}, + "cursor": {"created_at": "2024-01-12T00:00:00Z"}, + }, + { + "partition": {"id": 21, "parent_slice": {"id": 2, "parent_slice": {}}}, + "cursor": {"created_at": "2024-01-12T00:00:15Z"}, + }, + { + "partition": {"id": 30, "parent_slice": {"id": 3, "parent_slice": {}}}, + "cursor": {"created_at": "2024-01-02T00:00:02Z"}, + }, + ], + "state": {"created_at": "2024-01-03T00:00:02Z"}, + "lookback_window": 86400, + "parent_state": { + "post_comments": { + "use_global_cursor": False, + "state": {}, + "states": [ { - "votes": [{"id": 111, "comment_id": 11, "created_at": VOTE_111_CREATED_AT}], - }, - ), - # Fetch votes for comment 12 of post 1 - ( - f"https://api.example.com/community/posts/1/comments/12/votes?per_page=100&start_time={VOTE_111_CREATED_AT}", + "partition": {"id": 1, "parent_slice": {}}, + "cursor": {"updated_at": "2024-01-25T00:00:00Z"}, + } + ], + "parent_state": {"posts": {"updated_at": "2024-01-05T00:00:00Z"}}, + } + }, + }, + { + "use_global_cursor": False, + "states": [ + { + "partition": {"id": 10, "parent_slice": {"id": 1, "parent_slice": {}}}, + "cursor": {"created_at": "2024-01-15T00:00:00Z"}, + }, + { + "partition": {"id": 11, "parent_slice": {"id": 1, "parent_slice": {}}}, + "cursor": {"created_at": "2024-01-13T00:00:00Z"}, + }, + { + "partition": {"id": 12, "parent_slice": {"id": 1, "parent_slice": {}}}, + "cursor": {"created_at": "2024-01-02T00:00:02Z"}, + }, + { + "partition": {"id": 20, "parent_slice": {"id": 2, "parent_slice": {}}}, + "cursor": {"created_at": 
"2024-01-12T00:00:00Z"}, + }, + { + "partition": {"id": 21, "parent_slice": {"id": 2, "parent_slice": {}}}, + "cursor": {"created_at": "2024-01-12T00:00:15Z"}, + }, + { + "partition": {"id": 30, "parent_slice": {"id": 3, "parent_slice": {}}}, + "cursor": {"created_at": "2024-01-10T00:00:00Z"}, + }, + ], + "state": {"created_at": "2024-01-03T00:00:02Z"}, + "lookback_window": 86400, + "parent_state": { + "post_comments": { + "use_global_cursor": False, + "state": {}, + "states": [ { - "votes": [], + "partition": {"id": 1, "parent_slice": {}}, + "cursor": {"updated_at": "2024-01-25T00:00:00Z"}, }, - ), - # Fetch votes for comment 20 of post 2 - ( - f"https://api.example.com/community/posts/2/comments/20/votes?per_page=100&start_time={VOTE_200_CREATED_AT}", - {"votes": [{"id": 200, "comment_id": 20, "created_at": VOTE_200_CREATED_AT}]}, - ), - # Fetch votes for comment 21 of post 2 - ( - f"https://api.example.com/community/posts/2/comments/21/votes?per_page=100&start_time={VOTE_210_CREATED_AT}", - {"votes": [{"id": 210, "comment_id": 21, "created_at": VOTE_210_CREATED_AT}]}, - ), - # Fetch votes for comment 30 of post 3 - ( - f"https://api.example.com/community/posts/3/comments/30/votes?per_page=100&start_time={VOTE_300_CREATED_AT}", { - "votes": [ - { - "id": 300, - "comment_id": 30, - "created_at": VOTE_300_CREATED_AT_TIMESTAMP, - } - ] + "partition": {"id": 2, "parent_slice": {}}, + "cursor": {"updated_at": "2024-01-22T00:00:00Z"}, }, - ), - ], - # Expected records - [ - { - "comment_id": 10, - "comment_updated_at": COMMENT_10_UPDATED_AT, - "created_at": VOTE_100_CREATED_AT, - "id": 100, - }, - { - "comment_id": 10, - "comment_updated_at": COMMENT_10_UPDATED_AT, - "created_at": VOTE_101_CREATED_AT, - "id": 101, - }, - { - "comment_id": 11, - "comment_updated_at": COMMENT_11_UPDATED_AT, - "created_at": VOTE_111_CREATED_AT, - "id": 111, - }, - { - "comment_id": 20, - "comment_updated_at": COMMENT_20_UPDATED_AT, - "created_at": VOTE_200_CREATED_AT, - "id": 200, - }, - { - "comment_id": 21, - "comment_updated_at": COMMENT_21_UPDATED_AT, - "created_at": VOTE_210_CREATED_AT, - "id": 210, - }, - { - "comment_id": 30, - "comment_updated_at": COMMENT_30_UPDATED_AT, - "created_at": str(VOTE_300_CREATED_AT_TIMESTAMP), - "id": 300, - }, - ], - # Number of intermediate states - 6 as number of parent partitions - 6, - # Initial state + ], + "parent_state": {"posts": {"updated_at": "2024-01-05T00:00:00Z"}}, + } + }, + }, + { + "use_global_cursor": False, + "states": [ { - "parent_state": { - "post_comments": { - "states": [ - { - "partition": {"id": 1, "parent_slice": {}}, - "cursor": {"updated_at": PARENT_COMMENT_CURSOR_PARTITION_1}, - } - ], - "parent_state": {"posts": {"updated_at": PARENT_POSTS_CURSOR}}, - } - }, - "state": {"created_at": INITIAL_GLOBAL_CURSOR}, + "partition": {"id": 10, "parent_slice": {"id": 1, "parent_slice": {}}}, + "cursor": {"created_at": "2024-01-15T00:00:00Z"}, + }, + { + "partition": {"id": 11, "parent_slice": {"id": 1, "parent_slice": {}}}, + "cursor": {"created_at": "2024-01-13T00:00:00Z"}, + }, + { + "partition": {"id": 12, "parent_slice": {"id": 1, "parent_slice": {}}}, + "cursor": {"created_at": "2024-01-02T00:00:02Z"}, + }, + { + "partition": {"id": 20, "parent_slice": {"id": 2, "parent_slice": {}}}, + "cursor": {"created_at": "2024-01-12T00:00:00Z"}, + }, + { + "partition": {"id": 21, "parent_slice": {"id": 2, "parent_slice": {}}}, + "cursor": {"created_at": "2024-01-12T00:00:15Z"}, + }, + { + "partition": {"id": 30, "parent_slice": {"id": 3, "parent_slice": {}}}, + 
"cursor": {"created_at": "2024-01-10T00:00:00Z"}, + }, + ], + "state": {"created_at": "2024-01-15T00:00:00Z"}, + "lookback_window": 1, + "parent_state": { + "post_comments": { + "use_global_cursor": False, + "state": {"updated_at": "2024-01-25T00:00:00Z"}, + "lookback_window": 1, "states": [ { - "partition": { - "id": 10, - "parent_slice": {"id": 1, "parent_slice": {}}, - }, - "cursor": {"created_at": INITIAL_STATE_PARTITION_10_CURSOR}, + "partition": {"id": 1, "parent_slice": {}}, + "cursor": {"updated_at": "2024-01-25T00:00:00Z"}, }, { - "partition": { - "id": 11, - "parent_slice": {"id": 1, "parent_slice": {}}, - }, - "cursor": {"created_at": INITIAL_STATE_PARTITION_11_CURSOR}, + "partition": {"id": 2, "parent_slice": {}}, + "cursor": {"updated_at": "2024-01-22T00:00:00Z"}, + }, + { + "partition": {"id": 3, "parent_slice": {}}, + "cursor": {"updated_at": "2024-01-09T00:00:00Z"}, }, ], - "lookback_window": 86400, - }, - # Expected state - { - "state": {"created_at": VOTE_100_CREATED_AT}, - "parent_state": { - "post_comments": { - "use_global_cursor": False, - "state": {"updated_at": COMMENT_10_UPDATED_AT}, # 10 is the "latest" - "parent_state": { - "posts": {"updated_at": POST_1_UPDATED_AT} - }, # post 1 is the latest - "lookback_window": 1, - "states": [ + "parent_state": {"posts": {"updated_at": "2024-01-30T00:00:00Z"}}, + } + }, + }, +] + + +@pytest.mark.parametrize( + "test_name, manifest, mock_requests, expected_records, num_intermediate_states, intermidiate_states, initial_state, expected_state", + [ + ( + "test_incremental_parent_state", + SUBSTREAM_MANIFEST, + [ + # Fetch the first page of posts + ( + f"https://api.example.com/community/posts?per_page=100&start_time={PARENT_POSTS_CURSOR}", + { + "posts": [ + {"id": 1, "updated_at": POST_1_UPDATED_AT}, + {"id": 2, "updated_at": POST_2_UPDATED_AT}, + ], + "next_page": ( + f"https://api.example.com/community/posts" + f"?per_page=100&start_time={PARENT_POSTS_CURSOR}&page=2" + ), + }, + ), + # Fetch the second page of posts + ( + f"https://api.example.com/community/posts?per_page=100&start_time={PARENT_POSTS_CURSOR}&page=2", + {"posts": [{"id": 3, "updated_at": POST_3_UPDATED_AT}]}, + ), + # Fetch the first page of comments for post 1 + ( + "https://api.example.com/community/posts/1/comments?per_page=100", + { + "comments": [ + { + "id": 9, + "post_id": 1, + "updated_at": COMMENT_9_OLDEST, + }, + { + "id": 10, + "post_id": 1, + "updated_at": COMMENT_10_UPDATED_AT, + }, + { + "id": 11, + "post_id": 1, + "updated_at": COMMENT_11_UPDATED_AT, + }, + ], + "next_page": "https://api.example.com/community/posts/1/comments?per_page=100&page=2", + }, + ), + # Fetch the second page of comments for post 1 + ( + "https://api.example.com/community/posts/1/comments?per_page=100&page=2", + {"comments": [{"id": 12, "post_id": 1, "updated_at": COMMENT_12_UPDATED_AT}]}, + ), + # Fetch the first page of votes for comment 10 of post 1 + ( + f"https://api.example.com/community/posts/1/comments/10/votes?per_page=100&start_time={INITIAL_STATE_PARTITION_10_CURSOR}", { - "partition": {"id": 1, "parent_slice": {}}, - "cursor": {"updated_at": COMMENT_10_UPDATED_AT}, + "votes": [ + { + "id": 100, + "comment_id": 10, + "created_at": VOTE_100_CREATED_AT, + } + ], + "next_page": ( + f"https://api.example.com/community/posts/1/comments/10/votes" + f"?per_page=100&page=2&start_time={INITIAL_STATE_PARTITION_10_CURSOR}" + ), }, + ), + # Fetch the second page of votes for comment 10 of post 1 + ( + f"https://api.example.com/community/posts/1/comments/10/votes" + 
f"?per_page=100&page=2&start_time={INITIAL_STATE_PARTITION_10_CURSOR}", + {"votes": [{"id": 101, "comment_id": 10, "created_at": VOTE_101_CREATED_AT}]}, + ), + # Fetch the first page of votes for comment 11 of post 1 + ( + f"https://api.example.com/community/posts/1/comments/11/votes" + f"?per_page=100&start_time={INITIAL_STATE_PARTITION_11_CURSOR}", + {"votes": [{"id": 111, "comment_id": 11, "created_at": VOTE_111_CREATED_AT}]}, + ), + # Fetch the first page of votes for comment 12 of post 1 + ( + f"https://api.example.com/community/posts/1/comments/12/votes?per_page=100&start_time={LOOKBACK_DATE}", + {"votes": []}, + ), + # Fetch the first page of comments for post 2 + ( + "https://api.example.com/community/posts/2/comments?per_page=100", { - "partition": {"id": 2, "parent_slice": {}}, - "cursor": {"updated_at": COMMENT_20_UPDATED_AT}, + "comments": [{"id": 20, "post_id": 2, "updated_at": COMMENT_20_UPDATED_AT}], + "next_page": "https://api.example.com/community/posts/2/comments?per_page=100&page=2", }, + ), + # Fetch the second page of comments for post 2 + ( + "https://api.example.com/community/posts/2/comments?per_page=100&page=2", + {"comments": [{"id": 21, "post_id": 2, "updated_at": COMMENT_21_UPDATED_AT}]}, + ), + # Fetch the first page of votes for comment 20 of post 2 + ( + f"https://api.example.com/community/posts/2/comments/20/votes?per_page=100&start_time={LOOKBACK_DATE}", + {"votes": [{"id": 200, "comment_id": 20, "created_at": VOTE_200_CREATED_AT}]}, + ), + # Fetch the first page of votes for comment 21 of post 2 + ( + f"https://api.example.com/community/posts/2/comments/21/votes?per_page=100&start_time={LOOKBACK_DATE}", + {"votes": [{"id": 210, "comment_id": 21, "created_at": VOTE_210_CREATED_AT}]}, + ), + # Fetch the first page of comments for post 3 + ( + "https://api.example.com/community/posts/3/comments?per_page=100", + {"comments": [{"id": 30, "post_id": 3, "updated_at": COMMENT_30_UPDATED_AT}]}, + ), + # Fetch the first page of votes for comment 30 of post 3 + ( + f"https://api.example.com/community/posts/3/comments/30/votes?per_page=100&start_time={LOOKBACK_DATE}", { - "partition": {"id": 3, "parent_slice": {}}, - "cursor": {"updated_at": COMMENT_30_UPDATED_AT}, + "votes": [ + { + "id": 300, + "comment_id": 30, + "created_at": VOTE_300_CREATED_AT_TIMESTAMP, + } + ] }, - ], - } - }, - "lookback_window": 1, - "use_global_cursor": False, - "states": [ + ), + # Requests with intermediate states + # Fetch votes for comment 10 of post 1 + ( + f"https://api.example.com/community/posts/1/comments/10/votes?per_page=100&start_time={VOTE_100_CREATED_AT}", + { + "votes": [{"id": 100, "comment_id": 10, "created_at": VOTE_100_CREATED_AT}], + }, + ), + # Fetch votes for comment 11 of post 1 + ( + f"https://api.example.com/community/posts/1/comments/11/votes?per_page=100&start_time={VOTE_111_CREATED_AT}", + { + "votes": [{"id": 111, "comment_id": 11, "created_at": VOTE_111_CREATED_AT}], + }, + ), + # Fetch votes for comment 12 of post 1 + ( + f"https://api.example.com/community/posts/1/comments/12/votes?per_page=100&start_time={VOTE_111_CREATED_AT}", + { + "votes": [], + }, + ), + # Fetch votes for comment 20 of post 2 + ( + f"https://api.example.com/community/posts/2/comments/20/votes?per_page=100&start_time={VOTE_200_CREATED_AT}", + {"votes": [{"id": 200, "comment_id": 20, "created_at": VOTE_200_CREATED_AT}]}, + ), + # Fetch votes for comment 21 of post 2 + ( + f"https://api.example.com/community/posts/2/comments/21/votes?per_page=100&start_time={VOTE_210_CREATED_AT}", + 
{"votes": [{"id": 210, "comment_id": 21, "created_at": VOTE_210_CREATED_AT}]}, + ), + # Fetch votes for comment 30 of post 3 + ( + f"https://api.example.com/community/posts/3/comments/30/votes?per_page=100&start_time={VOTE_300_CREATED_AT}", + { + "votes": [ + { + "id": 300, + "comment_id": 30, + "created_at": VOTE_300_CREATED_AT_TIMESTAMP, + } + ] + }, + ), + ], + # Expected records + [ { - "partition": {"id": 10, "parent_slice": {"id": 1, "parent_slice": {}}}, - "cursor": {"created_at": VOTE_100_CREATED_AT}, + "comment_id": 10, + "comment_updated_at": COMMENT_10_UPDATED_AT, + "created_at": VOTE_100_CREATED_AT, + "id": 100, }, { - "partition": {"id": 11, "parent_slice": {"id": 1, "parent_slice": {}}}, - "cursor": {"created_at": VOTE_111_CREATED_AT}, + "comment_id": 10, + "comment_updated_at": COMMENT_10_UPDATED_AT, + "created_at": VOTE_101_CREATED_AT, + "id": 101, }, { - "partition": {"id": 12, "parent_slice": {"id": 1, "parent_slice": {}}}, - "cursor": {"created_at": LOOKBACK_DATE}, + "comment_id": 11, + "comment_updated_at": COMMENT_11_UPDATED_AT, + "created_at": VOTE_111_CREATED_AT, + "id": 111, }, { - "partition": {"id": 20, "parent_slice": {"id": 2, "parent_slice": {}}}, - "cursor": {"created_at": VOTE_200_CREATED_AT}, + "comment_id": 20, + "comment_updated_at": COMMENT_20_UPDATED_AT, + "created_at": VOTE_200_CREATED_AT, + "id": 200, }, { - "partition": {"id": 21, "parent_slice": {"id": 2, "parent_slice": {}}}, - "cursor": {"created_at": VOTE_210_CREATED_AT}, + "comment_id": 21, + "comment_updated_at": COMMENT_21_UPDATED_AT, + "created_at": VOTE_210_CREATED_AT, + "id": 210, }, { - "partition": {"id": 30, "parent_slice": {"id": 3, "parent_slice": {}}}, - "cursor": {"created_at": VOTE_300_CREATED_AT}, + "comment_id": 30, + "comment_updated_at": COMMENT_30_UPDATED_AT, + "created_at": str(VOTE_300_CREATED_AT_TIMESTAMP), + "id": 300, }, ], - }, + # Number of intermediate states - 6 as number of parent partitions + 6, + # Intermediate states + INTERMEDIATE_STATES, + # Initial state + INITIAL_STATE, + # Expected state + { + "state": {"created_at": VOTE_100_CREATED_AT}, + "parent_state": { + "post_comments": { + "use_global_cursor": False, + "state": {"updated_at": COMMENT_10_UPDATED_AT}, # 10 is the "latest" + "parent_state": { + "posts": {"updated_at": POST_1_UPDATED_AT} + }, # post 1 is the latest + "lookback_window": 1, + "states": [ + { + "partition": {"id": 1, "parent_slice": {}}, + "cursor": {"updated_at": COMMENT_10_UPDATED_AT}, + }, + { + "partition": {"id": 2, "parent_slice": {}}, + "cursor": {"updated_at": COMMENT_20_UPDATED_AT}, + }, + { + "partition": {"id": 3, "parent_slice": {}}, + "cursor": {"updated_at": COMMENT_30_UPDATED_AT}, + }, + ], + } + }, + "lookback_window": 1, + "use_global_cursor": False, + "states": [ + { + "partition": {"id": 10, "parent_slice": {"id": 1, "parent_slice": {}}}, + "cursor": {"created_at": VOTE_100_CREATED_AT}, + }, + { + "partition": {"id": 11, "parent_slice": {"id": 1, "parent_slice": {}}}, + "cursor": {"created_at": VOTE_111_CREATED_AT}, + }, + { + "partition": {"id": 12, "parent_slice": {"id": 1, "parent_slice": {}}}, + "cursor": {"created_at": LOOKBACK_DATE}, + }, + { + "partition": {"id": 20, "parent_slice": {"id": 2, "parent_slice": {}}}, + "cursor": {"created_at": VOTE_200_CREATED_AT}, + }, + { + "partition": {"id": 21, "parent_slice": {"id": 2, "parent_slice": {}}}, + "cursor": {"created_at": VOTE_210_CREATED_AT}, + }, + { + "partition": {"id": 30, "parent_slice": {"id": 3, "parent_slice": {}}}, + "cursor": {"created_at": 
VOTE_300_CREATED_AT}, + }, + ], + }, ), ], ) def test_incremental_parent_state( - test_name, - manifest, - mock_requests, - expected_records, - num_intermediate_states, - initial_state, - expected_state, + test_name, + manifest, + mock_requests, + expected_records, + num_intermediate_states, + intermidiate_states, + initial_state, + expected_state, ): # Patch `_throttle_state_message` so it always returns a float (indicating "no throttle") with patch.object( - ConcurrentPerPartitionCursor, "_throttle_state_message", return_value=9999999.0 + ConcurrentPerPartitionCursor, "_throttle_state_message", return_value=9999999.0 ): run_incremental_parent_state_test( manifest, mock_requests, expected_records, num_intermediate_states, + intermidiate_states, initial_state, [expected_state], ) @@ -1259,166 +1546,166 @@ def test_incremental_parent_state( "test_name, manifest, mock_requests, expected_records", [ ( - "test_incremental_parent_state", - SUBSTREAM_MANIFEST, - [ - # Fetch the first page of posts - ( - f"https://api.example.com/community/posts?per_page=100&start_time={PARTITION_SYNC_START_TIME}", - { - "posts": [ - {"id": 1, "updated_at": POST_1_UPDATED_AT}, - {"id": 2, "updated_at": POST_2_UPDATED_AT}, - ], - "next_page": ( + "test_incremental_parent_state", + SUBSTREAM_MANIFEST, + [ + # Fetch the first page of posts + ( + f"https://api.example.com/community/posts?per_page=100&start_time={PARTITION_SYNC_START_TIME}", + { + "posts": [ + {"id": 1, "updated_at": POST_1_UPDATED_AT}, + {"id": 2, "updated_at": POST_2_UPDATED_AT}, + ], + "next_page": ( + f"https://api.example.com/community/posts?per_page=100" + f"&start_time={PARTITION_SYNC_START_TIME}&page=2" + ), + }, + ), + # Fetch the second page of posts + ( f"https://api.example.com/community/posts?per_page=100" - f"&start_time={PARTITION_SYNC_START_TIME}&page=2" - ), + f"&start_time={PARTITION_SYNC_START_TIME}&page=2", + {"posts": [{"id": 3, "updated_at": POST_3_UPDATED_AT}]}, + ), + # Fetch the first page of comments for post 1 + ( + "https://api.example.com/community/posts/1/comments?per_page=100", + { + "comments": [ + {"id": 9, "post_id": 1, "updated_at": COMMENT_9_OLDEST}, + {"id": 10, "post_id": 1, "updated_at": COMMENT_10_UPDATED_AT}, + {"id": 11, "post_id": 1, "updated_at": COMMENT_11_UPDATED_AT}, + ], + "next_page": ( + "https://api.example.com/community/posts/1/comments" + "?per_page=100&page=2" + ), + }, + ), + # Fetch the second page of comments for post 1 + ( + "https://api.example.com/community/posts/1/comments?per_page=100&page=2", + {"comments": [{"id": 12, "post_id": 1, "updated_at": COMMENT_12_UPDATED_AT}]}, + ), + # Fetch the first page of votes for comment 10 of post 1 + ( + f"https://api.example.com/community/posts/1/comments/10/votes" + f"?per_page=100&start_time={PARTITION_SYNC_START_TIME}", + { + "votes": [{"id": 100, "comment_id": 10, "created_at": VOTE_100_CREATED_AT}], + "next_page": ( + f"https://api.example.com/community/posts/1/comments/10/votes" + f"?per_page=100&page=2&start_time={PARTITION_SYNC_START_TIME}" + ), + }, + ), + # Fetch the second page of votes for comment 10 of post 1 + ( + f"https://api.example.com/community/posts/1/comments/10/votes" + f"?per_page=100&page=2&start_time={PARTITION_SYNC_START_TIME}", + {"votes": [{"id": 101, "comment_id": 10, "created_at": VOTE_101_CREATED_AT}]}, + ), + # Fetch the first page of votes for comment 11 of post 1 + ( + f"https://api.example.com/community/posts/1/comments/11/votes" + f"?per_page=100&start_time={PARTITION_SYNC_START_TIME}", + {"votes": [{"id": 111, 
"comment_id": 11, "created_at": VOTE_111_CREATED_AT}]}, + ), + # Fetch the first page of votes for comment 12 of post 1 + ( + f"https://api.example.com/community/posts/1/comments/12/votes" + f"?per_page=100&start_time={PARTITION_SYNC_START_TIME}", + {"votes": []}, + ), + # Fetch the first page of comments for post 2 + ( + "https://api.example.com/community/posts/2/comments?per_page=100", + { + "comments": [{"id": 20, "post_id": 2, "updated_at": COMMENT_20_UPDATED_AT}], + "next_page": ( + "https://api.example.com/community/posts/2/comments" + "?per_page=100&page=2" + ), + }, + ), + # Fetch the second page of comments for post 2 + ( + "https://api.example.com/community/posts/2/comments?per_page=100&page=2", + {"comments": [{"id": 21, "post_id": 2, "updated_at": COMMENT_21_UPDATED_AT}]}, + ), + # Fetch the first page of votes for comment 20 of post 2 + ( + f"https://api.example.com/community/posts/2/comments/20/votes" + f"?per_page=100&start_time={PARTITION_SYNC_START_TIME}", + {"votes": [{"id": 200, "comment_id": 20, "created_at": VOTE_200_CREATED_AT}]}, + ), + # Fetch the first page of votes for comment 21 of post 2 + ( + f"https://api.example.com/community/posts/2/comments/21/votes" + f"?per_page=100&start_time={PARTITION_SYNC_START_TIME}", + {"votes": [{"id": 210, "comment_id": 21, "created_at": VOTE_210_CREATED_AT}]}, + ), + # Fetch the first page of comments for post 3 + ( + "https://api.example.com/community/posts/3/comments?per_page=100", + {"comments": [{"id": 30, "post_id": 3, "updated_at": COMMENT_30_UPDATED_AT}]}, + ), + # Fetch the first page of votes for comment 30 of post 3 + ( + f"https://api.example.com/community/posts/3/comments/30/votes" + f"?per_page=100&start_time={PARTITION_SYNC_START_TIME}", + { + "votes": [ + { + "id": 300, + "comment_id": 30, + "created_at": VOTE_300_CREATED_AT_TIMESTAMP, + } + ] + }, + ), + ], + # Expected records + [ + { + "comment_id": 10, + "comment_updated_at": COMMENT_10_UPDATED_AT, + "created_at": VOTE_100_CREATED_AT, + "id": 100, }, - ), - # Fetch the second page of posts - ( - f"https://api.example.com/community/posts?per_page=100" - f"&start_time={PARTITION_SYNC_START_TIME}&page=2", - {"posts": [{"id": 3, "updated_at": POST_3_UPDATED_AT}]}, - ), - # Fetch the first page of comments for post 1 - ( - "https://api.example.com/community/posts/1/comments?per_page=100", { - "comments": [ - {"id": 9, "post_id": 1, "updated_at": COMMENT_9_OLDEST}, - {"id": 10, "post_id": 1, "updated_at": COMMENT_10_UPDATED_AT}, - {"id": 11, "post_id": 1, "updated_at": COMMENT_11_UPDATED_AT}, - ], - "next_page": ( - "https://api.example.com/community/posts/1/comments" - "?per_page=100&page=2" - ), + "comment_id": 10, + "comment_updated_at": COMMENT_10_UPDATED_AT, + "created_at": VOTE_101_CREATED_AT, + "id": 101, }, - ), - # Fetch the second page of comments for post 1 - ( - "https://api.example.com/community/posts/1/comments?per_page=100&page=2", - {"comments": [{"id": 12, "post_id": 1, "updated_at": COMMENT_12_UPDATED_AT}]}, - ), - # Fetch the first page of votes for comment 10 of post 1 - ( - f"https://api.example.com/community/posts/1/comments/10/votes" - f"?per_page=100&start_time={PARTITION_SYNC_START_TIME}", { - "votes": [{"id": 100, "comment_id": 10, "created_at": VOTE_100_CREATED_AT}], - "next_page": ( - f"https://api.example.com/community/posts/1/comments/10/votes" - f"?per_page=100&page=2&start_time={PARTITION_SYNC_START_TIME}" - ), + "comment_id": 11, + "comment_updated_at": COMMENT_11_UPDATED_AT, + "created_at": VOTE_111_CREATED_AT, + "id": 111, }, - ), - 
# Fetch the second page of votes for comment 10 of post 1 - ( - f"https://api.example.com/community/posts/1/comments/10/votes" - f"?per_page=100&page=2&start_time={PARTITION_SYNC_START_TIME}", - {"votes": [{"id": 101, "comment_id": 10, "created_at": VOTE_101_CREATED_AT}]}, - ), - # Fetch the first page of votes for comment 11 of post 1 - ( - f"https://api.example.com/community/posts/1/comments/11/votes" - f"?per_page=100&start_time={PARTITION_SYNC_START_TIME}", - {"votes": [{"id": 111, "comment_id": 11, "created_at": VOTE_111_CREATED_AT}]}, - ), - # Fetch the first page of votes for comment 12 of post 1 - ( - f"https://api.example.com/community/posts/1/comments/12/votes" - f"?per_page=100&start_time={PARTITION_SYNC_START_TIME}", - {"votes": []}, - ), - # Fetch the first page of comments for post 2 - ( - "https://api.example.com/community/posts/2/comments?per_page=100", { - "comments": [{"id": 20, "post_id": 2, "updated_at": COMMENT_20_UPDATED_AT}], - "next_page": ( - "https://api.example.com/community/posts/2/comments" - "?per_page=100&page=2" - ), + "comment_id": 20, + "comment_updated_at": COMMENT_20_UPDATED_AT, + "created_at": VOTE_200_CREATED_AT, + "id": 200, }, - ), - # Fetch the second page of comments for post 2 - ( - "https://api.example.com/community/posts/2/comments?per_page=100&page=2", - {"comments": [{"id": 21, "post_id": 2, "updated_at": COMMENT_21_UPDATED_AT}]}, - ), - # Fetch the first page of votes for comment 20 of post 2 - ( - f"https://api.example.com/community/posts/2/comments/20/votes" - f"?per_page=100&start_time={PARTITION_SYNC_START_TIME}", - {"votes": [{"id": 200, "comment_id": 20, "created_at": VOTE_200_CREATED_AT}]}, - ), - # Fetch the first page of votes for comment 21 of post 2 - ( - f"https://api.example.com/community/posts/2/comments/21/votes" - f"?per_page=100&start_time={PARTITION_SYNC_START_TIME}", - {"votes": [{"id": 210, "comment_id": 21, "created_at": VOTE_210_CREATED_AT}]}, - ), - # Fetch the first page of comments for post 3 - ( - "https://api.example.com/community/posts/3/comments?per_page=100", - {"comments": [{"id": 30, "post_id": 3, "updated_at": COMMENT_30_UPDATED_AT}]}, - ), - # Fetch the first page of votes for comment 30 of post 3 - ( - f"https://api.example.com/community/posts/3/comments/30/votes" - f"?per_page=100&start_time={PARTITION_SYNC_START_TIME}", { - "votes": [ - { - "id": 300, - "comment_id": 30, - "created_at": VOTE_300_CREATED_AT_TIMESTAMP, - } - ] + "comment_id": 21, + "comment_updated_at": COMMENT_21_UPDATED_AT, + "created_at": VOTE_210_CREATED_AT, + "id": 210, }, - ), - ], - # Expected records - [ - { - "comment_id": 10, - "comment_updated_at": COMMENT_10_UPDATED_AT, - "created_at": VOTE_100_CREATED_AT, - "id": 100, - }, - { - "comment_id": 10, - "comment_updated_at": COMMENT_10_UPDATED_AT, - "created_at": VOTE_101_CREATED_AT, - "id": 101, - }, - { - "comment_id": 11, - "comment_updated_at": COMMENT_11_UPDATED_AT, - "created_at": VOTE_111_CREATED_AT, - "id": 111, - }, - { - "comment_id": 20, - "comment_updated_at": COMMENT_20_UPDATED_AT, - "created_at": VOTE_200_CREATED_AT, - "id": 200, - }, - { - "comment_id": 21, - "comment_updated_at": COMMENT_21_UPDATED_AT, - "created_at": VOTE_210_CREATED_AT, - "id": 210, - }, - { - "comment_id": 30, - "comment_updated_at": COMMENT_30_UPDATED_AT, - "created_at": str(VOTE_300_CREATED_AT_TIMESTAMP), - "id": 300, - }, - ], + { + "comment_id": 30, + "comment_updated_at": COMMENT_30_UPDATED_AT, + "created_at": str(VOTE_300_CREATED_AT_TIMESTAMP), + "id": 300, + }, + ], ), ], ) @@ -1427,40 
+1714,40 @@ def test_incremental_parent_state( [ ({"created_at": PARTITION_SYNC_START_TIME}, STATE_MIGRATION_EXPECTED_STATE), ( - { - "state": {"created_at": PARTITION_SYNC_START_TIME}, - "lookback_window": 0, - "use_global_cursor": False, - "parent_state": { - "post_comments": { - "state": {"updated_at": PARTITION_SYNC_START_TIME}, - "parent_state": {"posts": {"updated_at": PARTITION_SYNC_START_TIME}}, - "lookback_window": 0, - } + { + "state": {"created_at": PARTITION_SYNC_START_TIME}, + "lookback_window": 0, + "use_global_cursor": False, + "parent_state": { + "post_comments": { + "state": {"updated_at": PARTITION_SYNC_START_TIME}, + "parent_state": {"posts": {"updated_at": PARTITION_SYNC_START_TIME}}, + "lookback_window": 0, + } + }, }, - }, - STATE_MIGRATION_EXPECTED_STATE, + STATE_MIGRATION_EXPECTED_STATE, ), ( - { - "state": {"created_at": PARTITION_SYNC_START_TIME}, - "lookback_window": 0, - "use_global_cursor": True, - "parent_state": { - "post_comments": { - "state": {"updated_at": PARTITION_SYNC_START_TIME}, - "parent_state": {"posts": {"updated_at": PARTITION_SYNC_START_TIME}}, - "lookback_window": 0, - } + { + "state": {"created_at": PARTITION_SYNC_START_TIME}, + "lookback_window": 0, + "use_global_cursor": True, + "parent_state": { + "post_comments": { + "state": {"updated_at": PARTITION_SYNC_START_TIME}, + "parent_state": {"posts": {"updated_at": PARTITION_SYNC_START_TIME}}, + "lookback_window": 0, + } + }, }, - }, - STATE_MIGRATION_GLOBAL_EXPECTED_STATE, + STATE_MIGRATION_GLOBAL_EXPECTED_STATE, ), ( - { - "state": {"created_at": PARTITION_SYNC_START_TIME}, - }, - STATE_MIGRATION_EXPECTED_STATE, + { + "state": {"created_at": PARTITION_SYNC_START_TIME}, + }, + STATE_MIGRATION_EXPECTED_STATE, ), ], ids=[ @@ -1471,7 +1758,7 @@ def test_incremental_parent_state( ], ) def test_incremental_parent_state_migration( - test_name, manifest, mock_requests, expected_records, initial_state, expected_state + test_name, manifest, mock_requests, expected_records, initial_state, expected_state ): """ Test incremental partition router with parent state migration @@ -1491,101 +1778,101 @@ def test_incremental_parent_state_migration( "test_name, manifest, mock_requests, expected_records, initial_state, expected_state", [ ( - "test_incremental_parent_state", - SUBSTREAM_MANIFEST, - [ - # Fetch the first page of posts - ( - f"https://api.example.com/community/posts?per_page=100&start_time={PARENT_POSTS_CURSOR}", - { - "posts": [], - "next_page": ( + "test_incremental_parent_state", + SUBSTREAM_MANIFEST, + [ + # Fetch the first page of posts + ( + f"https://api.example.com/community/posts?per_page=100&start_time={PARENT_POSTS_CURSOR}", + { + "posts": [], + "next_page": ( + f"https://api.example.com/community/posts?per_page=100" + f"&start_time={PARENT_POSTS_CURSOR}&page=2" + ), + }, + ), + # Fetch the second page of posts + ( f"https://api.example.com/community/posts?per_page=100" - f"&start_time={PARENT_POSTS_CURSOR}&page=2" - ), + f"&start_time={PARENT_POSTS_CURSOR}&page=2", + {"posts": []}, + ), + ], + # Expected records (empty) + [], + # Initial state + { + "parent_state": { + "post_comments": { + "states": [ + { + "partition": {"id": 1, "parent_slice": {}}, + "cursor": {"updated_at": PARENT_COMMENT_CURSOR_PARTITION_1}, + } + ], + "parent_state": {"posts": {"updated_at": PARENT_POSTS_CURSOR}}, + } }, - ), - # Fetch the second page of posts - ( - f"https://api.example.com/community/posts?per_page=100" - f"&start_time={PARENT_POSTS_CURSOR}&page=2", - {"posts": []}, - ), - ], - # Expected records 
(empty) - [], - # Initial state - { - "parent_state": { - "post_comments": { - "states": [ - { - "partition": {"id": 1, "parent_slice": {}}, - "cursor": {"updated_at": PARENT_COMMENT_CURSOR_PARTITION_1}, - } - ], - "parent_state": {"posts": {"updated_at": PARENT_POSTS_CURSOR}}, - } - }, - "states": [ - { - "partition": { - "id": 10, - "parent_slice": {"id": 1, "parent_slice": {}}, + "states": [ + { + "partition": { + "id": 10, + "parent_slice": {"id": 1, "parent_slice": {}}, + }, + "cursor": {"created_at": INITIAL_STATE_PARTITION_10_CURSOR}, }, - "cursor": {"created_at": INITIAL_STATE_PARTITION_10_CURSOR}, - }, - { - "partition": { - "id": 11, - "parent_slice": {"id": 1, "parent_slice": {}}, + { + "partition": { + "id": 11, + "parent_slice": {"id": 1, "parent_slice": {}}, + }, + "cursor": {"created_at": INITIAL_STATE_PARTITION_11_CURSOR}, }, - "cursor": {"created_at": INITIAL_STATE_PARTITION_11_CURSOR}, - }, - ], - "state": {"created_at": INITIAL_GLOBAL_CURSOR}, - "lookback_window": 1, - }, - # Expected state - { - "parent_state": { - "post_comments": { - "states": [ - { - "partition": {"id": 1, "parent_slice": {}}, - "cursor": {"updated_at": PARENT_COMMENT_CURSOR_PARTITION_1}, - } - ], - "state": {}, - "use_global_cursor": False, - "parent_state": {"posts": {"updated_at": PARENT_POSTS_CURSOR}}, - } + ], + "state": {"created_at": INITIAL_GLOBAL_CURSOR}, + "lookback_window": 1, }, - "states": [ - { - "partition": { - "id": 10, - "parent_slice": {"id": 1, "parent_slice": {}}, - }, - "cursor": {"created_at": INITIAL_STATE_PARTITION_10_CURSOR}, + # Expected state + { + "parent_state": { + "post_comments": { + "states": [ + { + "partition": {"id": 1, "parent_slice": {}}, + "cursor": {"updated_at": PARENT_COMMENT_CURSOR_PARTITION_1}, + } + ], + "state": {}, + "use_global_cursor": False, + "parent_state": {"posts": {"updated_at": PARENT_POSTS_CURSOR}}, + } }, - { - "partition": { - "id": 11, - "parent_slice": {"id": 1, "parent_slice": {}}, + "states": [ + { + "partition": { + "id": 10, + "parent_slice": {"id": 1, "parent_slice": {}}, + }, + "cursor": {"created_at": INITIAL_STATE_PARTITION_10_CURSOR}, }, - "cursor": {"created_at": INITIAL_STATE_PARTITION_11_CURSOR}, - }, - ], - "state": {"created_at": INITIAL_GLOBAL_CURSOR}, - "lookback_window": 1, - "use_global_cursor": False, - }, + { + "partition": { + "id": 11, + "parent_slice": {"id": 1, "parent_slice": {}}, + }, + "cursor": {"created_at": INITIAL_STATE_PARTITION_11_CURSOR}, + }, + ], + "state": {"created_at": INITIAL_GLOBAL_CURSOR}, + "lookback_window": 1, + "use_global_cursor": False, + }, ), ], ) def test_incremental_parent_state_no_slices( - test_name, manifest, mock_requests, expected_records, initial_state, expected_state + test_name, manifest, mock_requests, expected_records, initial_state, expected_state ): """ Test incremental partition router with no parent records @@ -1605,217 +1892,217 @@ def test_incremental_parent_state_no_slices( "test_name, manifest, mock_requests, expected_records, initial_state, expected_state", [ ( - "test_incremental_parent_state", - SUBSTREAM_MANIFEST, - [ - # Fetch the first page of posts - ( - f"https://api.example.com/community/posts?per_page=100&start_time={PARENT_POSTS_CURSOR}", - { - "posts": [ - {"id": 1, "updated_at": POST_1_UPDATED_AT}, - {"id": 2, "updated_at": POST_2_UPDATED_AT}, - ], - "next_page": ( + "test_incremental_parent_state", + SUBSTREAM_MANIFEST, + [ + # Fetch the first page of posts + ( + f"https://api.example.com/community/posts?per_page=100&start_time={PARENT_POSTS_CURSOR}", + { + 
"posts": [ + {"id": 1, "updated_at": POST_1_UPDATED_AT}, + {"id": 2, "updated_at": POST_2_UPDATED_AT}, + ], + "next_page": ( + f"https://api.example.com/community/posts?per_page=100" + f"&start_time={PARENT_POSTS_CURSOR}&page=2" + ), + }, + ), + # Fetch the second page of posts + ( f"https://api.example.com/community/posts?per_page=100" - f"&start_time={PARENT_POSTS_CURSOR}&page=2" - ), - }, - ), - # Fetch the second page of posts - ( - f"https://api.example.com/community/posts?per_page=100" - f"&start_time={PARENT_POSTS_CURSOR}&page=2", - {"posts": [{"id": 3, "updated_at": POST_3_UPDATED_AT}]}, - ), - # Fetch the first page of comments for post 1 - ( - "https://api.example.com/community/posts/1/comments?per_page=100", - { - "comments": [ - {"id": 9, "post_id": 1, "updated_at": COMMENT_9_OLDEST}, - {"id": 10, "post_id": 1, "updated_at": COMMENT_10_UPDATED_AT}, - {"id": 11, "post_id": 1, "updated_at": COMMENT_11_UPDATED_AT}, - ], - "next_page": ( - "https://api.example.com/community/posts/1/comments?per_page=100&page=2" - ), - }, - ), - # Fetch the second page of comments for post 1 - ( - "https://api.example.com/community/posts/1/comments?per_page=100&page=2", - {"comments": [{"id": 12, "post_id": 1, "updated_at": COMMENT_12_UPDATED_AT}]}, - ), - # Fetch the first page of votes for comment 10 of post 1 - ( - f"https://api.example.com/community/posts/1/comments/10/votes" - f"?per_page=100&start_time={INITIAL_STATE_PARTITION_10_CURSOR}", - { - "votes": [], - "next_page": ( + f"&start_time={PARENT_POSTS_CURSOR}&page=2", + {"posts": [{"id": 3, "updated_at": POST_3_UPDATED_AT}]}, + ), + # Fetch the first page of comments for post 1 + ( + "https://api.example.com/community/posts/1/comments?per_page=100", + { + "comments": [ + {"id": 9, "post_id": 1, "updated_at": COMMENT_9_OLDEST}, + {"id": 10, "post_id": 1, "updated_at": COMMENT_10_UPDATED_AT}, + {"id": 11, "post_id": 1, "updated_at": COMMENT_11_UPDATED_AT}, + ], + "next_page": ( + "https://api.example.com/community/posts/1/comments?per_page=100&page=2" + ), + }, + ), + # Fetch the second page of comments for post 1 + ( + "https://api.example.com/community/posts/1/comments?per_page=100&page=2", + {"comments": [{"id": 12, "post_id": 1, "updated_at": COMMENT_12_UPDATED_AT}]}, + ), + # Fetch the first page of votes for comment 10 of post 1 + ( f"https://api.example.com/community/posts/1/comments/10/votes" - f"?per_page=100&page=2&start_time={INITIAL_STATE_PARTITION_10_CURSOR}" - ), - }, - ), - # Fetch the second page of votes for comment 10 of post 1 - ( - f"https://api.example.com/community/posts/1/comments/10/votes" - f"?per_page=100&page=2&start_time={INITIAL_STATE_PARTITION_10_CURSOR}", - {"votes": []}, - ), - # Fetch the first page of votes for comment 11 of post 1 - ( - f"https://api.example.com/community/posts/1/comments/11/votes" - f"?per_page=100&start_time={INITIAL_STATE_PARTITION_11_CURSOR}", - {"votes": []}, - ), - # Fetch the first page of votes for comment 12 of post 1 - ( - f"https://api.example.com/community/posts/1/comments/12/votes" - f"?per_page=100&start_time={INITIAL_STATE_PARTITION_11_CURSOR}", - {"votes": []}, - ), - # Fetch the first page of comments for post 2 - ( - "https://api.example.com/community/posts/2/comments?per_page=100", - { - "comments": [{"id": 20, "post_id": 2, "updated_at": COMMENT_20_UPDATED_AT}], - "next_page": ( - "https://api.example.com/community/posts/2/comments?per_page=100&page=2" - ), - }, - ), - # Fetch the second page of comments for post 2 - ( - 
"https://api.example.com/community/posts/2/comments?per_page=100&page=2", - {"comments": [{"id": 21, "post_id": 2, "updated_at": COMMENT_21_UPDATED_AT}]}, - ), - # Fetch the first page of votes for comment 20 of post 2 - ( - f"https://api.example.com/community/posts/2/comments/20/votes" - f"?per_page=100&start_time={INITIAL_STATE_PARTITION_11_CURSOR}", - {"votes": []}, - ), - # Fetch the first page of votes for comment 21 of post 2 - ( - f"https://api.example.com/community/posts/2/comments/21/votes" - f"?per_page=100&start_time={INITIAL_STATE_PARTITION_11_CURSOR}", - {"votes": []}, - ), - # Fetch the first page of comments for post 3 - ( - "https://api.example.com/community/posts/3/comments?per_page=100", - {"comments": [{"id": 30, "post_id": 3, "updated_at": COMMENT_30_UPDATED_AT}]}, - ), - # Fetch the first page of votes for comment 30 of post 3 - ( - f"https://api.example.com/community/posts/3/comments/30/votes" - f"?per_page=100&start_time={INITIAL_STATE_PARTITION_11_CURSOR}", - {"votes": []}, - ), - ], - # Expected records - [], - # Initial state - { - "parent_state": { - "post_comments": { - "states": [ + f"?per_page=100&start_time={INITIAL_STATE_PARTITION_10_CURSOR}", { - "partition": {"id": 1, "parent_slice": {}}, - "cursor": {"updated_at": PARENT_COMMENT_CURSOR_PARTITION_1}, - } - ], - "parent_state": {"posts": {"updated_at": PARENT_POSTS_CURSOR}}, - } - }, - "states": [ - { - "partition": { - "id": 10, - "parent_slice": {"id": 1, "parent_slice": {}}, - }, - "cursor": {"created_at": INITIAL_STATE_PARTITION_10_CURSOR}, - }, - { - "partition": { - "id": 11, - "parent_slice": {"id": 1, "parent_slice": {}}, - }, - "cursor": {"created_at": INITIAL_STATE_PARTITION_11_CURSOR}, - }, + "votes": [], + "next_page": ( + f"https://api.example.com/community/posts/1/comments/10/votes" + f"?per_page=100&page=2&start_time={INITIAL_STATE_PARTITION_10_CURSOR}" + ), + }, + ), + # Fetch the second page of votes for comment 10 of post 1 + ( + f"https://api.example.com/community/posts/1/comments/10/votes" + f"?per_page=100&page=2&start_time={INITIAL_STATE_PARTITION_10_CURSOR}", + {"votes": []}, + ), + # Fetch the first page of votes for comment 11 of post 1 + ( + f"https://api.example.com/community/posts/1/comments/11/votes" + f"?per_page=100&start_time={INITIAL_STATE_PARTITION_11_CURSOR}", + {"votes": []}, + ), + # Fetch the first page of votes for comment 12 of post 1 + ( + f"https://api.example.com/community/posts/1/comments/12/votes" + f"?per_page=100&start_time={INITIAL_STATE_PARTITION_11_CURSOR}", + {"votes": []}, + ), + # Fetch the first page of comments for post 2 + ( + "https://api.example.com/community/posts/2/comments?per_page=100", + { + "comments": [{"id": 20, "post_id": 2, "updated_at": COMMENT_20_UPDATED_AT}], + "next_page": ( + "https://api.example.com/community/posts/2/comments?per_page=100&page=2" + ), + }, + ), + # Fetch the second page of comments for post 2 + ( + "https://api.example.com/community/posts/2/comments?per_page=100&page=2", + {"comments": [{"id": 21, "post_id": 2, "updated_at": COMMENT_21_UPDATED_AT}]}, + ), + # Fetch the first page of votes for comment 20 of post 2 + ( + f"https://api.example.com/community/posts/2/comments/20/votes" + f"?per_page=100&start_time={INITIAL_STATE_PARTITION_11_CURSOR}", + {"votes": []}, + ), + # Fetch the first page of votes for comment 21 of post 2 + ( + f"https://api.example.com/community/posts/2/comments/21/votes" + f"?per_page=100&start_time={INITIAL_STATE_PARTITION_11_CURSOR}", + {"votes": []}, + ), + # Fetch the first page of comments 
for post 3 + ( + "https://api.example.com/community/posts/3/comments?per_page=100", + {"comments": [{"id": 30, "post_id": 3, "updated_at": COMMENT_30_UPDATED_AT}]}, + ), + # Fetch the first page of votes for comment 30 of post 3 + ( + f"https://api.example.com/community/posts/3/comments/30/votes" + f"?per_page=100&start_time={INITIAL_STATE_PARTITION_11_CURSOR}", + {"votes": []}, + ), ], - "use_global_cursor": False, - "state": {"created_at": INITIAL_STATE_PARTITION_11_CURSOR}, - "lookback_window": 0, - }, - # Expected state - { - "lookback_window": 1, - "use_global_cursor": False, - "state": {"created_at": INITIAL_STATE_PARTITION_11_CURSOR}, - "states": [ - { - "partition": { - "id": 10, - "parent_slice": {"id": 1, "parent_slice": {}}, - }, - "cursor": {"created_at": INITIAL_STATE_PARTITION_10_CURSOR}, + # Expected records + [], + # Initial state + { + "parent_state": { + "post_comments": { + "states": [ + { + "partition": {"id": 1, "parent_slice": {}}, + "cursor": {"updated_at": PARENT_COMMENT_CURSOR_PARTITION_1}, + } + ], + "parent_state": {"posts": {"updated_at": PARENT_POSTS_CURSOR}}, + } }, - { - "partition": { - "id": 11, - "parent_slice": {"id": 1, "parent_slice": {}}, + "states": [ + { + "partition": { + "id": 10, + "parent_slice": {"id": 1, "parent_slice": {}}, + }, + "cursor": {"created_at": INITIAL_STATE_PARTITION_10_CURSOR}, }, - "cursor": {"created_at": INITIAL_STATE_PARTITION_11_CURSOR}, - }, - { - "partition": {"id": 12, "parent_slice": {"id": 1, "parent_slice": {}}}, - "cursor": {"created_at": INITIAL_STATE_PARTITION_11_CURSOR}, - }, - { - "partition": {"id": 20, "parent_slice": {"id": 2, "parent_slice": {}}}, - "cursor": {"created_at": INITIAL_STATE_PARTITION_11_CURSOR}, - }, - { - "partition": {"id": 21, "parent_slice": {"id": 2, "parent_slice": {}}}, - "cursor": {"created_at": INITIAL_STATE_PARTITION_11_CURSOR}, - }, - { - "partition": {"id": 30, "parent_slice": {"id": 3, "parent_slice": {}}}, - "cursor": {"created_at": INITIAL_STATE_PARTITION_11_CURSOR}, - }, - ], - "parent_state": { - "post_comments": { - "use_global_cursor": False, - "state": {"updated_at": COMMENT_10_UPDATED_AT}, - "parent_state": {"posts": {"updated_at": POST_1_UPDATED_AT}}, - "lookback_window": 1, - "states": [ - { - "partition": {"id": 1, "parent_slice": {}}, - "cursor": {"updated_at": COMMENT_10_UPDATED_AT}, + { + "partition": { + "id": 11, + "parent_slice": {"id": 1, "parent_slice": {}}, }, - { - "partition": {"id": 2, "parent_slice": {}}, - "cursor": {"updated_at": COMMENT_20_UPDATED_AT}, + "cursor": {"created_at": INITIAL_STATE_PARTITION_11_CURSOR}, + }, + ], + "use_global_cursor": False, + "state": {"created_at": INITIAL_STATE_PARTITION_11_CURSOR}, + "lookback_window": 0, + }, + # Expected state + { + "lookback_window": 1, + "use_global_cursor": False, + "state": {"created_at": INITIAL_STATE_PARTITION_11_CURSOR}, + "states": [ + { + "partition": { + "id": 10, + "parent_slice": {"id": 1, "parent_slice": {}}, }, - { - "partition": {"id": 3, "parent_slice": {}}, - "cursor": {"updated_at": COMMENT_30_UPDATED_AT}, + "cursor": {"created_at": INITIAL_STATE_PARTITION_10_CURSOR}, + }, + { + "partition": { + "id": 11, + "parent_slice": {"id": 1, "parent_slice": {}}, }, - ], - } + "cursor": {"created_at": INITIAL_STATE_PARTITION_11_CURSOR}, + }, + { + "partition": {"id": 12, "parent_slice": {"id": 1, "parent_slice": {}}}, + "cursor": {"created_at": INITIAL_STATE_PARTITION_11_CURSOR}, + }, + { + "partition": {"id": 20, "parent_slice": {"id": 2, "parent_slice": {}}}, + "cursor": {"created_at": 
INITIAL_STATE_PARTITION_11_CURSOR}, + }, + { + "partition": {"id": 21, "parent_slice": {"id": 2, "parent_slice": {}}}, + "cursor": {"created_at": INITIAL_STATE_PARTITION_11_CURSOR}, + }, + { + "partition": {"id": 30, "parent_slice": {"id": 3, "parent_slice": {}}}, + "cursor": {"created_at": INITIAL_STATE_PARTITION_11_CURSOR}, + }, + ], + "parent_state": { + "post_comments": { + "use_global_cursor": False, + "state": {"updated_at": COMMENT_10_UPDATED_AT}, + "parent_state": {"posts": {"updated_at": POST_1_UPDATED_AT}}, + "lookback_window": 1, + "states": [ + { + "partition": {"id": 1, "parent_slice": {}}, + "cursor": {"updated_at": COMMENT_10_UPDATED_AT}, + }, + { + "partition": {"id": 2, "parent_slice": {}}, + "cursor": {"updated_at": COMMENT_20_UPDATED_AT}, + }, + { + "partition": {"id": 3, "parent_slice": {}}, + "cursor": {"updated_at": COMMENT_30_UPDATED_AT}, + }, + ], + } + }, }, - }, ), ], ) def test_incremental_parent_state_no_records( - test_name, manifest, mock_requests, expected_records, initial_state, expected_state + test_name, manifest, mock_requests, expected_records, initial_state, expected_state ): """ Test incremental partition router with no child records @@ -1835,238 +2122,238 @@ def test_incremental_parent_state_no_records( "test_name, manifest, mock_requests, expected_records, initial_state, expected_state", [ ( - "test_incremental_parent_state", - SUBSTREAM_MANIFEST, - [ - # Fetch the first page of posts - ( - f"https://api.example.com/community/posts?per_page=100&start_time={PARENT_POSTS_CURSOR}", + "test_incremental_parent_state", + SUBSTREAM_MANIFEST, + [ + # Fetch the first page of posts + ( + f"https://api.example.com/community/posts?per_page=100&start_time={PARENT_POSTS_CURSOR}", + { + "posts": [ + {"id": 1, "updated_at": POST_1_UPDATED_AT}, + {"id": 2, "updated_at": POST_2_UPDATED_AT}, + ], + "next_page": ( + f"https://api.example.com/community/posts?per_page=100&start_time={PARENT_POSTS_CURSOR}&page=2" + ), + }, + ), + # Fetch the second page of posts + ( + f"https://api.example.com/community/posts?per_page=100&start_time={PARENT_POSTS_CURSOR}&page=2", + {"posts": [{"id": 3, "updated_at": POST_3_UPDATED_AT}]}, + ), + # Fetch the first page of comments for post 1 + ( + "https://api.example.com/community/posts/1/comments?per_page=100", + { + "comments": [ + {"id": 9, "post_id": 1, "updated_at": COMMENT_9_OLDEST}, + {"id": 10, "post_id": 1, "updated_at": COMMENT_10_UPDATED_AT}, + {"id": 11, "post_id": 1, "updated_at": COMMENT_11_UPDATED_AT}, + ], + "next_page": ( + "https://api.example.com/community/posts/1/comments?per_page=100&page=2" + ), + }, + ), + # Fetch the second page of comments for post 1 + ( + "https://api.example.com/community/posts/1/comments?per_page=100&page=2", + {"comments": [{"id": 12, "post_id": 1, "updated_at": COMMENT_12_UPDATED_AT}]}, + ), + # Fetch the first page of votes for comment 10 of post 1 + ( + f"https://api.example.com/community/posts/1/comments/10/votes" + f"?per_page=100&start_time={INITIAL_STATE_PARTITION_10_CURSOR}", + { + "votes": [{"id": 100, "comment_id": 10, "created_at": VOTE_100_CREATED_AT}], + "next_page": ( + f"https://api.example.com/community/posts/1/comments/10/votes" + f"?per_page=100&page=2&start_time={INITIAL_STATE_PARTITION_10_CURSOR}" + ), + }, + ), + # Fetch the second page of votes for comment 10 of post 1 + ( + f"https://api.example.com/community/posts/1/comments/10/votes" + f"?per_page=100&page=2&start_time={INITIAL_STATE_PARTITION_10_CURSOR}", + {"votes": [{"id": 101, "comment_id": 10, "created_at": 
VOTE_101_CREATED_AT}]}, + ), + # Fetch the first page of votes for comment 11 of post 1 + ( + f"https://api.example.com/community/posts/1/comments/11/votes" + f"?per_page=100&start_time={INITIAL_STATE_PARTITION_11_CURSOR}", + {"votes": [{"id": 111, "comment_id": 11, "created_at": VOTE_111_CREATED_AT}]}, + ), + # Fetch the first page of votes for comment 12 of post 1 + ( + f"https://api.example.com/community/posts/1/comments/12/votes?per_page=100&start_time={LOOKBACK_DATE}", + {"votes": []}, + ), + # Fetch the first page of comments for post 2 + ( + "https://api.example.com/community/posts/2/comments?per_page=100", + { + "comments": [{"id": 20, "post_id": 2, "updated_at": COMMENT_20_UPDATED_AT}], + "next_page": ( + "https://api.example.com/community/posts/2/comments?per_page=100&page=2" + ), + }, + ), + # Fetch the second page of comments for post 2 + ( + "https://api.example.com/community/posts/2/comments?per_page=100&page=2", + {"comments": [{"id": 21, "post_id": 2, "updated_at": COMMENT_21_UPDATED_AT}]}, + ), + # Fetch the first page of votes for comment 20 of post 2 - 404 error + ( + f"https://api.example.com/community/posts/2/comments/20/votes" + f"?per_page=100&start_time={LOOKBACK_DATE}", + None, + ), + # Fetch the first page of votes for comment 21 of post 2 + ( + f"https://api.example.com/community/posts/2/comments/21/votes" + f"?per_page=100&start_time={LOOKBACK_DATE}", + {"votes": [{"id": 210, "comment_id": 21, "created_at": VOTE_210_CREATED_AT}]}, + ), + # Fetch the first page of comments for post 3 + ( + "https://api.example.com/community/posts/3/comments?per_page=100", + {"comments": [{"id": 30, "post_id": 3, "updated_at": COMMENT_30_UPDATED_AT}]}, + ), + # Fetch the first page of votes for comment 30 of post 3 + ( + f"https://api.example.com/community/posts/3/comments/30/votes" + f"?per_page=100&start_time={LOOKBACK_DATE}", + { + "votes": [ + { + "id": 300, + "comment_id": 30, + "created_at": VOTE_300_CREATED_AT_TIMESTAMP, + } + ] + }, + ), + ], + # Expected records + [ { - "posts": [ - {"id": 1, "updated_at": POST_1_UPDATED_AT}, - {"id": 2, "updated_at": POST_2_UPDATED_AT}, - ], - "next_page": ( - f"https://api.example.com/community/posts?per_page=100&start_time={PARENT_POSTS_CURSOR}&page=2" - ), + "comment_id": 10, + "comment_updated_at": COMMENT_10_UPDATED_AT, + "created_at": VOTE_100_CREATED_AT, + "id": 100, }, - ), - # Fetch the second page of posts - ( - f"https://api.example.com/community/posts?per_page=100&start_time={PARENT_POSTS_CURSOR}&page=2", - {"posts": [{"id": 3, "updated_at": POST_3_UPDATED_AT}]}, - ), - # Fetch the first page of comments for post 1 - ( - "https://api.example.com/community/posts/1/comments?per_page=100", { - "comments": [ - {"id": 9, "post_id": 1, "updated_at": COMMENT_9_OLDEST}, - {"id": 10, "post_id": 1, "updated_at": COMMENT_10_UPDATED_AT}, - {"id": 11, "post_id": 1, "updated_at": COMMENT_11_UPDATED_AT}, - ], - "next_page": ( - "https://api.example.com/community/posts/1/comments?per_page=100&page=2" - ), + "comment_id": 10, + "comment_updated_at": COMMENT_10_UPDATED_AT, + "created_at": VOTE_101_CREATED_AT, + "id": 101, }, - ), - # Fetch the second page of comments for post 1 - ( - "https://api.example.com/community/posts/1/comments?per_page=100&page=2", - {"comments": [{"id": 12, "post_id": 1, "updated_at": COMMENT_12_UPDATED_AT}]}, - ), - # Fetch the first page of votes for comment 10 of post 1 - ( - f"https://api.example.com/community/posts/1/comments/10/votes" - f"?per_page=100&start_time={INITIAL_STATE_PARTITION_10_CURSOR}", { - 
"votes": [{"id": 100, "comment_id": 10, "created_at": VOTE_100_CREATED_AT}], - "next_page": ( - f"https://api.example.com/community/posts/1/comments/10/votes" - f"?per_page=100&page=2&start_time={INITIAL_STATE_PARTITION_10_CURSOR}" - ), + "comment_id": 11, + "comment_updated_at": COMMENT_11_UPDATED_AT, + "created_at": VOTE_111_CREATED_AT, + "id": 111, }, - ), - # Fetch the second page of votes for comment 10 of post 1 - ( - f"https://api.example.com/community/posts/1/comments/10/votes" - f"?per_page=100&page=2&start_time={INITIAL_STATE_PARTITION_10_CURSOR}", - {"votes": [{"id": 101, "comment_id": 10, "created_at": VOTE_101_CREATED_AT}]}, - ), - # Fetch the first page of votes for comment 11 of post 1 - ( - f"https://api.example.com/community/posts/1/comments/11/votes" - f"?per_page=100&start_time={INITIAL_STATE_PARTITION_11_CURSOR}", - {"votes": [{"id": 111, "comment_id": 11, "created_at": VOTE_111_CREATED_AT}]}, - ), - # Fetch the first page of votes for comment 12 of post 1 - ( - f"https://api.example.com/community/posts/1/comments/12/votes?per_page=100&start_time={LOOKBACK_DATE}", - {"votes": []}, - ), - # Fetch the first page of comments for post 2 - ( - "https://api.example.com/community/posts/2/comments?per_page=100", { - "comments": [{"id": 20, "post_id": 2, "updated_at": COMMENT_20_UPDATED_AT}], - "next_page": ( - "https://api.example.com/community/posts/2/comments?per_page=100&page=2" - ), + "comment_id": 21, + "comment_updated_at": COMMENT_21_UPDATED_AT, + "created_at": VOTE_210_CREATED_AT, + "id": 210, }, - ), - # Fetch the second page of comments for post 2 - ( - "https://api.example.com/community/posts/2/comments?per_page=100&page=2", - {"comments": [{"id": 21, "post_id": 2, "updated_at": COMMENT_21_UPDATED_AT}]}, - ), - # Fetch the first page of votes for comment 20 of post 2 - 404 error - ( - f"https://api.example.com/community/posts/2/comments/20/votes" - f"?per_page=100&start_time={LOOKBACK_DATE}", - None, - ), - # Fetch the first page of votes for comment 21 of post 2 - ( - f"https://api.example.com/community/posts/2/comments/21/votes" - f"?per_page=100&start_time={LOOKBACK_DATE}", - {"votes": [{"id": 210, "comment_id": 21, "created_at": VOTE_210_CREATED_AT}]}, - ), - # Fetch the first page of comments for post 3 - ( - "https://api.example.com/community/posts/3/comments?per_page=100", - {"comments": [{"id": 30, "post_id": 3, "updated_at": COMMENT_30_UPDATED_AT}]}, - ), - # Fetch the first page of votes for comment 30 of post 3 - ( - f"https://api.example.com/community/posts/3/comments/30/votes" - f"?per_page=100&start_time={LOOKBACK_DATE}", { - "votes": [ - { - "id": 300, - "comment_id": 30, - "created_at": VOTE_300_CREATED_AT_TIMESTAMP, - } - ] + "comment_id": 30, + "comment_updated_at": COMMENT_30_UPDATED_AT, + "created_at": str(VOTE_300_CREATED_AT_TIMESTAMP), + "id": 300, }, - ), - ], - # Expected records - [ - { - "comment_id": 10, - "comment_updated_at": COMMENT_10_UPDATED_AT, - "created_at": VOTE_100_CREATED_AT, - "id": 100, - }, - { - "comment_id": 10, - "comment_updated_at": COMMENT_10_UPDATED_AT, - "created_at": VOTE_101_CREATED_AT, - "id": 101, - }, - { - "comment_id": 11, - "comment_updated_at": COMMENT_11_UPDATED_AT, - "created_at": VOTE_111_CREATED_AT, - "id": 111, - }, + ], + # Initial state { - "comment_id": 21, - "comment_updated_at": COMMENT_21_UPDATED_AT, - "created_at": VOTE_210_CREATED_AT, - "id": 210, + "parent_state": { + "post_comments": { + "states": [ + { + "partition": {"id": 1, "parent_slice": {}}, + "cursor": {"updated_at": 
PARENT_COMMENT_CURSOR_PARTITION_1}, + } + ], + "parent_state": {"posts": {"updated_at": PARENT_POSTS_CURSOR}}, + } + }, + "state": {"created_at": INITIAL_STATE_PARTITION_11_CURSOR}, + "lookback_window": 86400, + "states": [ + { + "partition": { + "id": 10, + "parent_slice": {"id": 1, "parent_slice": {}}, + }, + "cursor": {"created_at": INITIAL_STATE_PARTITION_10_CURSOR}, + }, + { + "partition": { + "id": 11, + "parent_slice": {"id": 1, "parent_slice": {}}, + }, + "cursor": {"created_at": INITIAL_STATE_PARTITION_11_CURSOR}, + }, + ], }, + # Expected state { - "comment_id": 30, - "comment_updated_at": COMMENT_30_UPDATED_AT, - "created_at": str(VOTE_300_CREATED_AT_TIMESTAMP), - "id": 300, - }, - ], - # Initial state - { - "parent_state": { - "post_comments": { - "states": [ - { - "partition": {"id": 1, "parent_slice": {}}, - "cursor": {"updated_at": PARENT_COMMENT_CURSOR_PARTITION_1}, - } - ], - "parent_state": {"posts": {"updated_at": PARENT_POSTS_CURSOR}}, - } - }, - "state": {"created_at": INITIAL_STATE_PARTITION_11_CURSOR}, - "lookback_window": 86400, - "states": [ - { - "partition": { - "id": 10, - "parent_slice": {"id": 1, "parent_slice": {}}, - }, - "cursor": {"created_at": INITIAL_STATE_PARTITION_10_CURSOR}, + # The global state, lookback window and the parent state are the same because sync failed for comment 20 + "parent_state": { + "post_comments": { + "states": [ + { + "partition": {"id": 1, "parent_slice": {}}, + "cursor": {"updated_at": PARENT_COMMENT_CURSOR_PARTITION_1}, + } + ], + "state": {}, + "use_global_cursor": False, + "parent_state": {"posts": {"updated_at": PARENT_POSTS_CURSOR}}, + } }, - { - "partition": { - "id": 11, - "parent_slice": {"id": 1, "parent_slice": {}}, + "state": {"created_at": INITIAL_STATE_PARTITION_11_CURSOR}, + "lookback_window": 86400, + "use_global_cursor": False, + "states": [ + { + "partition": {"id": 10, "parent_slice": {"id": 1, "parent_slice": {}}}, + "cursor": {"created_at": VOTE_100_CREATED_AT}, }, - "cursor": {"created_at": INITIAL_STATE_PARTITION_11_CURSOR}, - }, - ], - }, - # Expected state - { - # The global state, lookback window and the parent state are the same because sync failed for comment 20 - "parent_state": { - "post_comments": { - "states": [ - { - "partition": {"id": 1, "parent_slice": {}}, - "cursor": {"updated_at": PARENT_COMMENT_CURSOR_PARTITION_1}, - } - ], - "state": {}, - "use_global_cursor": False, - "parent_state": {"posts": {"updated_at": PARENT_POSTS_CURSOR}}, - } + { + "partition": {"id": 11, "parent_slice": {"id": 1, "parent_slice": {}}}, + "cursor": {"created_at": VOTE_111_CREATED_AT}, + }, + { + "partition": {"id": 12, "parent_slice": {"id": 1, "parent_slice": {}}}, + "cursor": {"created_at": LOOKBACK_DATE}, + }, + { + "partition": {"id": 20, "parent_slice": {"id": 2, "parent_slice": {}}}, + "cursor": {"created_at": LOOKBACK_DATE}, + }, + { + "partition": {"id": 21, "parent_slice": {"id": 2, "parent_slice": {}}}, + "cursor": {"created_at": VOTE_210_CREATED_AT}, + }, + { + "partition": {"id": 30, "parent_slice": {"id": 3, "parent_slice": {}}}, + "cursor": {"created_at": VOTE_300_CREATED_AT}, + }, + ], }, - "state": {"created_at": INITIAL_STATE_PARTITION_11_CURSOR}, - "lookback_window": 86400, - "use_global_cursor": False, - "states": [ - { - "partition": {"id": 10, "parent_slice": {"id": 1, "parent_slice": {}}}, - "cursor": {"created_at": VOTE_100_CREATED_AT}, - }, - { - "partition": {"id": 11, "parent_slice": {"id": 1, "parent_slice": {}}}, - "cursor": {"created_at": VOTE_111_CREATED_AT}, - }, - { - "partition": 
{"id": 12, "parent_slice": {"id": 1, "parent_slice": {}}}, - "cursor": {"created_at": LOOKBACK_DATE}, - }, - { - "partition": {"id": 20, "parent_slice": {"id": 2, "parent_slice": {}}}, - "cursor": {"created_at": LOOKBACK_DATE}, - }, - { - "partition": {"id": 21, "parent_slice": {"id": 2, "parent_slice": {}}}, - "cursor": {"created_at": VOTE_210_CREATED_AT}, - }, - { - "partition": {"id": 30, "parent_slice": {"id": 3, "parent_slice": {}}}, - "cursor": {"created_at": VOTE_300_CREATED_AT}, - }, - ], - }, ), ], ) def test_incremental_substream_error( - test_name, manifest, mock_requests, expected_records, initial_state, expected_state + test_name, manifest, mock_requests, expected_records, initial_state, expected_state ): run_mocked_test( mock_requests, @@ -2237,85 +2524,85 @@ def test_incremental_substream_error( "test_name, manifest, mock_requests, expected_records, initial_state, expected_state", [ ( - "test_incremental_parent_state", - LISTPARTITION_MANIFEST, - [ - # Fetch the first page of comments for post 1 - ( - "https://api.example.com/community/posts/1/comments?per_page=100&start_time=2024-01-24T00:00:00Z", - { - "comments": [ - {"id": 9, "post_id": 1, "updated_at": "2023-01-01T00:00:00Z"}, - {"id": 10, "post_id": 1, "updated_at": "2024-01-25T00:00:00Z"}, - {"id": 11, "post_id": 1, "updated_at": "2024-01-24T00:00:00Z"}, - ], - "next_page": "https://api.example.com/community/posts/1/comments?per_page=100&page=2&start_time=2024-01-24T00:00:00Z", - }, - ), - # Fetch the second page of comments for post 1 - ( - "https://api.example.com/community/posts/1/comments?per_page=100&page=2&start_time=2024-01-24T00:00:00Z", - {"comments": [{"id": 12, "post_id": 1, "updated_at": "2024-01-23T00:00:00Z"}]}, - ), - # Fetch the first page of comments for post 2 - ( - "https://api.example.com/community/posts/2/comments?per_page=100&start_time=2024-01-21T05:00:00Z", - { - "comments": [ - {"id": 20, "post_id": 2, "updated_at": "2024-01-22T00:00:00Z"} - ], - "next_page": "https://api.example.com/community/posts/2/comments?per_page=100&page=2&start_time=2024-01-21T05:00:00Z", - }, - ), - # Fetch the second page of comments for post 2 - ( - "https://api.example.com/community/posts/2/comments?per_page=100&page=2&start_time=2024-01-21T05:00:00Z", - {"comments": [{"id": 21, "post_id": 2, "updated_at": "2024-01-21T00:00:00Z"}]}, - ), - # Fetch the first page of comments for post 3 - ( - "https://api.example.com/community/posts/3/comments?per_page=100&start_time=2024-01-08T00:00:00Z", - {"comments": [{"id": 30, "post_id": 3, "updated_at": "2024-01-09T00:00:00Z"}]}, - ), - ], - # Expected records - [ - {"id": 10, "post_id": 1, "updated_at": "2024-01-25T00:00:00Z"}, - {"id": 11, "post_id": 1, "updated_at": "2024-01-24T00:00:00Z"}, - {"id": 20, "post_id": 2, "updated_at": "2024-01-22T00:00:00Z"}, - {"id": 30, "post_id": 3, "updated_at": "2024-01-09T00:00:00Z"}, - ], - # Initial state - { - "state": {"updated_at": "2024-01-08T00:00:00Z"}, - "states": [ - { - "cursor": {"updated_at": "2024-01-24T00:00:00Z"}, - "partition": {"id": "1"}, - }, - { - "cursor": {"updated_at": "2024-01-21T05:00:00Z"}, - "partition": {"id": "2"}, - }, + "test_incremental_parent_state", + LISTPARTITION_MANIFEST, + [ + # Fetch the first page of comments for post 1 + ( + "https://api.example.com/community/posts/1/comments?per_page=100&start_time=2024-01-24T00:00:00Z", + { + "comments": [ + {"id": 9, "post_id": 1, "updated_at": "2023-01-01T00:00:00Z"}, + {"id": 10, "post_id": 1, "updated_at": "2024-01-25T00:00:00Z"}, + {"id": 11, "post_id": 1, 
"updated_at": "2024-01-24T00:00:00Z"}, + ], + "next_page": "https://api.example.com/community/posts/1/comments?per_page=100&page=2&start_time=2024-01-24T00:00:00Z", + }, + ), + # Fetch the second page of comments for post 1 + ( + "https://api.example.com/community/posts/1/comments?per_page=100&page=2&start_time=2024-01-24T00:00:00Z", + {"comments": [{"id": 12, "post_id": 1, "updated_at": "2024-01-23T00:00:00Z"}]}, + ), + # Fetch the first page of comments for post 2 + ( + "https://api.example.com/community/posts/2/comments?per_page=100&start_time=2024-01-21T05:00:00Z", + { + "comments": [ + {"id": 20, "post_id": 2, "updated_at": "2024-01-22T00:00:00Z"} + ], + "next_page": "https://api.example.com/community/posts/2/comments?per_page=100&page=2&start_time=2024-01-21T05:00:00Z", + }, + ), + # Fetch the second page of comments for post 2 + ( + "https://api.example.com/community/posts/2/comments?per_page=100&page=2&start_time=2024-01-21T05:00:00Z", + {"comments": [{"id": 21, "post_id": 2, "updated_at": "2024-01-21T00:00:00Z"}]}, + ), + # Fetch the first page of comments for post 3 + ( + "https://api.example.com/community/posts/3/comments?per_page=100&start_time=2024-01-08T00:00:00Z", + {"comments": [{"id": 30, "post_id": 3, "updated_at": "2024-01-09T00:00:00Z"}]}, + ), ], - "use_global_cursor": False, - }, - # Expected state - { - "use_global_cursor": False, - "lookback_window": 1, - "state": {"updated_at": "2024-01-25T00:00:00Z"}, - "states": [ - {"cursor": {"updated_at": "2024-01-25T00:00:00Z"}, "partition": {"id": "1"}}, - {"cursor": {"updated_at": "2024-01-22T00:00:00Z"}, "partition": {"id": "2"}}, - {"cursor": {"updated_at": "2024-01-09T00:00:00Z"}, "partition": {"id": "3"}}, + # Expected records + [ + {"id": 10, "post_id": 1, "updated_at": "2024-01-25T00:00:00Z"}, + {"id": 11, "post_id": 1, "updated_at": "2024-01-24T00:00:00Z"}, + {"id": 20, "post_id": 2, "updated_at": "2024-01-22T00:00:00Z"}, + {"id": 30, "post_id": 3, "updated_at": "2024-01-09T00:00:00Z"}, ], - }, + # Initial state + { + "state": {"updated_at": "2024-01-08T00:00:00Z"}, + "states": [ + { + "cursor": {"updated_at": "2024-01-24T00:00:00Z"}, + "partition": {"id": "1"}, + }, + { + "cursor": {"updated_at": "2024-01-21T05:00:00Z"}, + "partition": {"id": "2"}, + }, + ], + "use_global_cursor": False, + }, + # Expected state + { + "use_global_cursor": False, + "lookback_window": 1, + "state": {"updated_at": "2024-01-25T00:00:00Z"}, + "states": [ + {"cursor": {"updated_at": "2024-01-25T00:00:00Z"}, "partition": {"id": "1"}}, + {"cursor": {"updated_at": "2024-01-22T00:00:00Z"}, "partition": {"id": "2"}}, + {"cursor": {"updated_at": "2024-01-09T00:00:00Z"}, "partition": {"id": "3"}}, + ], + }, ), ], ) def test_incremental_list_partition_router( - test_name, manifest, mock_requests, expected_records, initial_state, expected_state + test_name, manifest, mock_requests, expected_records, initial_state, expected_state ): """ Test ConcurrentPerPartitionCursor with ListPartitionRouter @@ -2335,85 +2622,85 @@ def test_incremental_list_partition_router( "test_name, manifest, mock_requests, expected_records, initial_state, expected_state", [ ( - "test_incremental_error_handling", - LISTPARTITION_MANIFEST, - [ - # Fetch the first page of comments for post 1 - ( - "https://api.example.com/community/posts/1/comments?per_page=100&start_time=2024-01-20T00:00:00Z", - { - "comments": [ - {"id": 9, "post_id": 1, "updated_at": "2023-01-01T00:00:00Z"}, - {"id": 10, "post_id": 1, "updated_at": "2024-01-25T00:00:00Z"}, - {"id": 11, "post_id": 1, 
"updated_at": "2024-01-24T00:00:00Z"}, - ], - "next_page": "https://api.example.com/community/posts/1/comments?per_page=100&page=2&start_time=2024-01-20T00:00:00Z", - }, - ), - # Error response for the second page of comments for post 1 - ( - "https://api.example.com/community/posts/1/comments?per_page=100&page=2&start_time=2024-01-20T00:00:00Z", - None, # Simulate a network error or an empty response - ), - # Fetch the first page of comments for post 2 - ( - "https://api.example.com/community/posts/2/comments?per_page=100&start_time=2024-01-21T05:00:00Z", - { - "comments": [ - {"id": 20, "post_id": 2, "updated_at": "2024-01-22T00:00:00Z"} - ], - "next_page": "https://api.example.com/community/posts/2/comments?per_page=100&page=2&start_time=2024-01-21T05:00:00Z", - }, - ), - # Fetch the second page of comments for post 2 - ( - "https://api.example.com/community/posts/2/comments?per_page=100&page=2&start_time=2024-01-21T05:00:00Z", - {"comments": [{"id": 21, "post_id": 2, "updated_at": "2024-01-21T00:00:00Z"}]}, - ), - # Fetch the first page of comments for post 3 - ( - "https://api.example.com/community/posts/3/comments?per_page=100&start_time=2024-01-08T00:00:00Z", - {"comments": [{"id": 30, "post_id": 3, "updated_at": "2024-01-09T00:00:00Z"}]}, - ), - ], - # Expected records - [ - {"id": 10, "post_id": 1, "updated_at": "2024-01-25T00:00:00Z"}, - {"id": 11, "post_id": 1, "updated_at": "2024-01-24T00:00:00Z"}, - {"id": 20, "post_id": 2, "updated_at": "2024-01-22T00:00:00Z"}, - {"id": 30, "post_id": 3, "updated_at": "2024-01-09T00:00:00Z"}, - ], - # Initial state - { - "state": {"updated_at": "2024-01-08T00:00:00Z"}, - "states": [ - { - "cursor": {"updated_at": "2024-01-20T00:00:00Z"}, - "partition": {"id": "1"}, - }, - { - "cursor": {"updated_at": "2024-01-21T05:00:00Z"}, - "partition": {"id": "2"}, - }, + "test_incremental_error_handling", + LISTPARTITION_MANIFEST, + [ + # Fetch the first page of comments for post 1 + ( + "https://api.example.com/community/posts/1/comments?per_page=100&start_time=2024-01-20T00:00:00Z", + { + "comments": [ + {"id": 9, "post_id": 1, "updated_at": "2023-01-01T00:00:00Z"}, + {"id": 10, "post_id": 1, "updated_at": "2024-01-25T00:00:00Z"}, + {"id": 11, "post_id": 1, "updated_at": "2024-01-24T00:00:00Z"}, + ], + "next_page": "https://api.example.com/community/posts/1/comments?per_page=100&page=2&start_time=2024-01-20T00:00:00Z", + }, + ), + # Error response for the second page of comments for post 1 + ( + "https://api.example.com/community/posts/1/comments?per_page=100&page=2&start_time=2024-01-20T00:00:00Z", + None, # Simulate a network error or an empty response + ), + # Fetch the first page of comments for post 2 + ( + "https://api.example.com/community/posts/2/comments?per_page=100&start_time=2024-01-21T05:00:00Z", + { + "comments": [ + {"id": 20, "post_id": 2, "updated_at": "2024-01-22T00:00:00Z"} + ], + "next_page": "https://api.example.com/community/posts/2/comments?per_page=100&page=2&start_time=2024-01-21T05:00:00Z", + }, + ), + # Fetch the second page of comments for post 2 + ( + "https://api.example.com/community/posts/2/comments?per_page=100&page=2&start_time=2024-01-21T05:00:00Z", + {"comments": [{"id": 21, "post_id": 2, "updated_at": "2024-01-21T00:00:00Z"}]}, + ), + # Fetch the first page of comments for post 3 + ( + "https://api.example.com/community/posts/3/comments?per_page=100&start_time=2024-01-08T00:00:00Z", + {"comments": [{"id": 30, "post_id": 3, "updated_at": "2024-01-09T00:00:00Z"}]}, + ), ], - "use_global_cursor": False, - }, - # 
Expected state - { - "lookback_window": 0, - "use_global_cursor": False, - "state": {"updated_at": "2024-01-08T00:00:00Z"}, - "states": [ - {"cursor": {"updated_at": "2024-01-20T00:00:00Z"}, "partition": {"id": "1"}}, - {"cursor": {"updated_at": "2024-01-22T00:00:00Z"}, "partition": {"id": "2"}}, - {"cursor": {"updated_at": "2024-01-09T00:00:00Z"}, "partition": {"id": "3"}}, + # Expected records + [ + {"id": 10, "post_id": 1, "updated_at": "2024-01-25T00:00:00Z"}, + {"id": 11, "post_id": 1, "updated_at": "2024-01-24T00:00:00Z"}, + {"id": 20, "post_id": 2, "updated_at": "2024-01-22T00:00:00Z"}, + {"id": 30, "post_id": 3, "updated_at": "2024-01-09T00:00:00Z"}, ], - }, + # Initial state + { + "state": {"updated_at": "2024-01-08T00:00:00Z"}, + "states": [ + { + "cursor": {"updated_at": "2024-01-20T00:00:00Z"}, + "partition": {"id": "1"}, + }, + { + "cursor": {"updated_at": "2024-01-21T05:00:00Z"}, + "partition": {"id": "2"}, + }, + ], + "use_global_cursor": False, + }, + # Expected state + { + "lookback_window": 0, + "use_global_cursor": False, + "state": {"updated_at": "2024-01-08T00:00:00Z"}, + "states": [ + {"cursor": {"updated_at": "2024-01-20T00:00:00Z"}, "partition": {"id": "1"}}, + {"cursor": {"updated_at": "2024-01-22T00:00:00Z"}, "partition": {"id": "2"}}, + {"cursor": {"updated_at": "2024-01-09T00:00:00Z"}, "partition": {"id": "3"}}, + ], + }, ), ], ) def test_incremental_error( - test_name, manifest, mock_requests, expected_records, initial_state, expected_state + test_name, manifest, mock_requests, expected_records, initial_state, expected_state ): """ Test with failed request. @@ -2707,261 +2994,261 @@ def test_incremental_error( "test_name, manifest, mock_requests, expected_records, initial_state, expected_state", [ ( - "test_incremental_parent_state", - SUBSTREAM_REQUEST_OPTIONS_MANIFEST, - [ - # Fetch the first page of posts - ( - f"https://api.example.com/community/posts?per_page=100&start_time={PARENT_POSTS_CURSOR}", - { - "posts": [ - {"id": 1, "updated_at": POST_1_UPDATED_AT}, - {"id": 2, "updated_at": POST_2_UPDATED_AT}, - ], - "next_page": ( - f"https://api.example.com/community/posts" - f"?per_page=100&start_time={PARENT_POSTS_CURSOR}&page=2" - ), - }, - ), - # Fetch the second page of posts - ( - f"https://api.example.com/community/posts?per_page=100&start_time={PARENT_POSTS_CURSOR}&page=2", - {"posts": [{"id": 3, "updated_at": POST_3_UPDATED_AT}]}, - ), - # Fetch the first page of comments for post 1 - ( - "https://api.example.com/community/posts_comments?per_page=100&post_id=1", - { - "comments": [ + "test_incremental_parent_state", + SUBSTREAM_REQUEST_OPTIONS_MANIFEST, + [ + # Fetch the first page of posts + ( + f"https://api.example.com/community/posts?per_page=100&start_time={PARENT_POSTS_CURSOR}", { - "id": 9, - "post_id": 1, - "updated_at": COMMENT_9_OLDEST, + "posts": [ + {"id": 1, "updated_at": POST_1_UPDATED_AT}, + {"id": 2, "updated_at": POST_2_UPDATED_AT}, + ], + "next_page": ( + f"https://api.example.com/community/posts" + f"?per_page=100&start_time={PARENT_POSTS_CURSOR}&page=2" + ), }, + ), + # Fetch the second page of posts + ( + f"https://api.example.com/community/posts?per_page=100&start_time={PARENT_POSTS_CURSOR}&page=2", + {"posts": [{"id": 3, "updated_at": POST_3_UPDATED_AT}]}, + ), + # Fetch the first page of comments for post 1 + ( + "https://api.example.com/community/posts_comments?per_page=100&post_id=1", { - "id": 10, - "post_id": 1, - "updated_at": COMMENT_10_UPDATED_AT, + "comments": [ + { + "id": 9, + "post_id": 1, + "updated_at": 
COMMENT_9_OLDEST, + }, + { + "id": 10, + "post_id": 1, + "updated_at": COMMENT_10_UPDATED_AT, + }, + { + "id": 11, + "post_id": 1, + "updated_at": COMMENT_11_UPDATED_AT, + }, + ], + "next_page": "https://api.example.com/community/posts_comments?per_page=100&post_id=1&page=2", }, + ), + # Fetch the second page of comments for post 1 + ( + "https://api.example.com/community/posts_comments?per_page=100&post_id=1&page=2", + {"comments": [{"id": 12, "post_id": 1, "updated_at": COMMENT_12_UPDATED_AT}]}, + ), + # Fetch the first page of votes for comment 10 of post 1 + ( + f"https://api.example.com/community/posts_comments_votes?per_page=100&comment_id=10&start_time={INITIAL_STATE_PARTITION_10_CURSOR}", { - "id": 11, - "post_id": 1, - "updated_at": COMMENT_11_UPDATED_AT, + "votes": [ + { + "id": 100, + "comment_id": 10, + "created_at": VOTE_100_CREATED_AT, + } + ], + "next_page": ( + f"https://api.example.com/community/posts_comments_votes" + f"?per_page=100&page=2&comment_id=10&start_time={INITIAL_STATE_PARTITION_10_CURSOR}" + ), }, - ], - "next_page": "https://api.example.com/community/posts_comments?per_page=100&post_id=1&page=2", - }, - ), - # Fetch the second page of comments for post 1 - ( - "https://api.example.com/community/posts_comments?per_page=100&post_id=1&page=2", - {"comments": [{"id": 12, "post_id": 1, "updated_at": COMMENT_12_UPDATED_AT}]}, - ), - # Fetch the first page of votes for comment 10 of post 1 - ( - f"https://api.example.com/community/posts_comments_votes?per_page=100&comment_id=10&start_time={INITIAL_STATE_PARTITION_10_CURSOR}", - { - "votes": [ - { - "id": 100, - "comment_id": 10, - "created_at": VOTE_100_CREATED_AT, - } - ], - "next_page": ( + ), + # Fetch the second page of votes for comment 10 of post 1 + ( f"https://api.example.com/community/posts_comments_votes" - f"?per_page=100&page=2&comment_id=10&start_time={INITIAL_STATE_PARTITION_10_CURSOR}" - ), - }, - ), - # Fetch the second page of votes for comment 10 of post 1 - ( - f"https://api.example.com/community/posts_comments_votes" - f"?per_page=100&page=2&comment_id=10&start_time={INITIAL_STATE_PARTITION_10_CURSOR}", - {"votes": [{"id": 101, "comment_id": 10, "created_at": VOTE_101_CREATED_AT}]}, - ), - # Fetch the first page of votes for comment 11 of post 1 - ( - f"https://api.example.com/community/posts_comments_votes" - f"?per_page=100&comment_id=11&start_time={INITIAL_STATE_PARTITION_11_CURSOR}", - {"votes": [{"id": 111, "comment_id": 11, "created_at": VOTE_111_CREATED_AT}]}, - ), - # Fetch the first page of votes for comment 12 of post 1 - ( - f"https://api.example.com/community/posts_comments_votes?" 
- f"per_page=100&comment_id=12&start_time={LOOKBACK_DATE}", - {"votes": []}, - ), - # Fetch the first page of comments for post 2 - ( - "https://api.example.com/community/posts_comments?per_page=100&post_id=2", - { - "comments": [{"id": 20, "post_id": 2, "updated_at": COMMENT_20_UPDATED_AT}], - "next_page": "https://api.example.com/community/posts_comments?per_page=100&post_id=2&page=2", - }, - ), - # Fetch the second page of comments for post 2 - ( - "https://api.example.com/community/posts_comments?per_page=100&post_id=2&page=2", - {"comments": [{"id": 21, "post_id": 2, "updated_at": COMMENT_21_UPDATED_AT}]}, - ), - # Fetch the first page of votes for comment 20 of post 2 - ( - f"https://api.example.com/community/posts_comments_votes" - f"?per_page=100&comment_id=20&start_time={LOOKBACK_DATE}", - {"votes": [{"id": 200, "comment_id": 20, "created_at": VOTE_200_CREATED_AT}]}, - ), - # Fetch the first page of votes for comment 21 of post 2 - ( - f"https://api.example.com/community/posts_comments_votes?" - f"per_page=100&comment_id=21&start_time={LOOKBACK_DATE}", - {"votes": [{"id": 210, "comment_id": 21, "created_at": VOTE_210_CREATED_AT}]}, - ), - # Fetch the first page of comments for post 3 - ( - "https://api.example.com/community/posts_comments?per_page=100&post_id=3", - {"comments": [{"id": 30, "post_id": 3, "updated_at": COMMENT_30_UPDATED_AT}]}, - ), - # Fetch the first page of votes for comment 30 of post 3 - ( - f"https://api.example.com/community/posts_comments_votes?" - f"per_page=100&comment_id=30&start_time={LOOKBACK_DATE}", - {"votes": [{"id": 300, "comment_id": 30, "created_at": VOTE_300_CREATED_AT}]}, - ), - ], - # Expected records - [ - { - "comment_id": 10, - "comment_updated_at": COMMENT_10_UPDATED_AT, - "created_at": VOTE_100_CREATED_AT, - "id": 100, - }, - { - "comment_id": 10, - "comment_updated_at": COMMENT_10_UPDATED_AT, - "created_at": VOTE_101_CREATED_AT, - "id": 101, - }, - { - "comment_id": 11, - "comment_updated_at": COMMENT_11_UPDATED_AT, - "created_at": VOTE_111_CREATED_AT, - "id": 111, - }, - { - "comment_id": 20, - "comment_updated_at": COMMENT_20_UPDATED_AT, - "created_at": VOTE_200_CREATED_AT, - "id": 200, - }, - { - "comment_id": 21, - "comment_updated_at": COMMENT_21_UPDATED_AT, - "created_at": VOTE_210_CREATED_AT, - "id": 210, - }, - { - "comment_id": 30, - "comment_updated_at": COMMENT_30_UPDATED_AT, - "created_at": VOTE_300_CREATED_AT, - "id": 300, - }, - ], - # Initial state - { - "parent_state": { - "post_comments": { - "states": [ - { - "partition": {"id": 1, "parent_slice": {}}, - "cursor": {"updated_at": PARENT_COMMENT_CURSOR_PARTITION_1}, - } - ], - "parent_state": {"posts": {"updated_at": PARENT_POSTS_CURSOR}}, - } - }, - "state": {"created_at": INITIAL_GLOBAL_CURSOR}, - "states": [ - { - "partition": { - "id": 10, - "parent_slice": {"id": 1, "parent_slice": {}}, - }, - "cursor": {"created_at": INITIAL_STATE_PARTITION_10_CURSOR}, - }, - { - "partition": { - "id": 11, - "parent_slice": {"id": 1, "parent_slice": {}}, - }, - "cursor": {"created_at": INITIAL_STATE_PARTITION_11_CURSOR}, - }, - ], - "lookback_window": 86400, - }, - # Expected state - { - "state": {"created_at": VOTE_100_CREATED_AT}, - "parent_state": { - "post_comments": { - "use_global_cursor": False, - "state": {"updated_at": COMMENT_10_UPDATED_AT}, # 10 is the "latest" - "parent_state": { - "posts": {"updated_at": POST_1_UPDATED_AT} - }, # post 1 is the latest - "lookback_window": 1, - "states": [ - { - "partition": {"id": 1, "parent_slice": {}}, - "cursor": {"updated_at": 
COMMENT_10_UPDATED_AT}, - }, - { - "partition": {"id": 2, "parent_slice": {}}, - "cursor": {"updated_at": COMMENT_20_UPDATED_AT}, - }, + f"?per_page=100&page=2&comment_id=10&start_time={INITIAL_STATE_PARTITION_10_CURSOR}", + {"votes": [{"id": 101, "comment_id": 10, "created_at": VOTE_101_CREATED_AT}]}, + ), + # Fetch the first page of votes for comment 11 of post 1 + ( + f"https://api.example.com/community/posts_comments_votes" + f"?per_page=100&comment_id=11&start_time={INITIAL_STATE_PARTITION_11_CURSOR}", + {"votes": [{"id": 111, "comment_id": 11, "created_at": VOTE_111_CREATED_AT}]}, + ), + # Fetch the first page of votes for comment 12 of post 1 + ( + f"https://api.example.com/community/posts_comments_votes?" + f"per_page=100&comment_id=12&start_time={LOOKBACK_DATE}", + {"votes": []}, + ), + # Fetch the first page of comments for post 2 + ( + "https://api.example.com/community/posts_comments?per_page=100&post_id=2", { - "partition": {"id": 3, "parent_slice": {}}, - "cursor": {"updated_at": COMMENT_30_UPDATED_AT}, + "comments": [{"id": 20, "post_id": 2, "updated_at": COMMENT_20_UPDATED_AT}], + "next_page": "https://api.example.com/community/posts_comments?per_page=100&post_id=2&page=2", }, - ], - } - }, - "lookback_window": 1, - "use_global_cursor": False, - "states": [ + ), + # Fetch the second page of comments for post 2 + ( + "https://api.example.com/community/posts_comments?per_page=100&post_id=2&page=2", + {"comments": [{"id": 21, "post_id": 2, "updated_at": COMMENT_21_UPDATED_AT}]}, + ), + # Fetch the first page of votes for comment 20 of post 2 + ( + f"https://api.example.com/community/posts_comments_votes" + f"?per_page=100&comment_id=20&start_time={LOOKBACK_DATE}", + {"votes": [{"id": 200, "comment_id": 20, "created_at": VOTE_200_CREATED_AT}]}, + ), + # Fetch the first page of votes for comment 21 of post 2 + ( + f"https://api.example.com/community/posts_comments_votes?" + f"per_page=100&comment_id=21&start_time={LOOKBACK_DATE}", + {"votes": [{"id": 210, "comment_id": 21, "created_at": VOTE_210_CREATED_AT}]}, + ), + # Fetch the first page of comments for post 3 + ( + "https://api.example.com/community/posts_comments?per_page=100&post_id=3", + {"comments": [{"id": 30, "post_id": 3, "updated_at": COMMENT_30_UPDATED_AT}]}, + ), + # Fetch the first page of votes for comment 30 of post 3 + ( + f"https://api.example.com/community/posts_comments_votes?" 
+ f"per_page=100&comment_id=30&start_time={LOOKBACK_DATE}", + {"votes": [{"id": 300, "comment_id": 30, "created_at": VOTE_300_CREATED_AT}]}, + ), + ], + # Expected records + [ { - "partition": {"id": 10, "parent_slice": {"id": 1, "parent_slice": {}}}, - "cursor": {"created_at": VOTE_100_CREATED_AT}, + "comment_id": 10, + "comment_updated_at": COMMENT_10_UPDATED_AT, + "created_at": VOTE_100_CREATED_AT, + "id": 100, }, { - "partition": {"id": 11, "parent_slice": {"id": 1, "parent_slice": {}}}, - "cursor": {"created_at": VOTE_111_CREATED_AT}, + "comment_id": 10, + "comment_updated_at": COMMENT_10_UPDATED_AT, + "created_at": VOTE_101_CREATED_AT, + "id": 101, }, { - "partition": {"id": 12, "parent_slice": {"id": 1, "parent_slice": {}}}, - "cursor": {"created_at": LOOKBACK_DATE}, + "comment_id": 11, + "comment_updated_at": COMMENT_11_UPDATED_AT, + "created_at": VOTE_111_CREATED_AT, + "id": 111, }, { - "partition": {"id": 20, "parent_slice": {"id": 2, "parent_slice": {}}}, - "cursor": {"created_at": VOTE_200_CREATED_AT}, + "comment_id": 20, + "comment_updated_at": COMMENT_20_UPDATED_AT, + "created_at": VOTE_200_CREATED_AT, + "id": 200, }, { - "partition": {"id": 21, "parent_slice": {"id": 2, "parent_slice": {}}}, - "cursor": {"created_at": VOTE_210_CREATED_AT}, + "comment_id": 21, + "comment_updated_at": COMMENT_21_UPDATED_AT, + "created_at": VOTE_210_CREATED_AT, + "id": 210, }, { - "partition": {"id": 30, "parent_slice": {"id": 3, "parent_slice": {}}}, - "cursor": {"created_at": VOTE_300_CREATED_AT}, + "comment_id": 30, + "comment_updated_at": COMMENT_30_UPDATED_AT, + "created_at": VOTE_300_CREATED_AT, + "id": 300, }, ], - }, + # Initial state + { + "parent_state": { + "post_comments": { + "states": [ + { + "partition": {"id": 1, "parent_slice": {}}, + "cursor": {"updated_at": PARENT_COMMENT_CURSOR_PARTITION_1}, + } + ], + "parent_state": {"posts": {"updated_at": PARENT_POSTS_CURSOR}}, + } + }, + "state": {"created_at": INITIAL_GLOBAL_CURSOR}, + "states": [ + { + "partition": { + "id": 10, + "parent_slice": {"id": 1, "parent_slice": {}}, + }, + "cursor": {"created_at": INITIAL_STATE_PARTITION_10_CURSOR}, + }, + { + "partition": { + "id": 11, + "parent_slice": {"id": 1, "parent_slice": {}}, + }, + "cursor": {"created_at": INITIAL_STATE_PARTITION_11_CURSOR}, + }, + ], + "lookback_window": 86400, + }, + # Expected state + { + "state": {"created_at": VOTE_100_CREATED_AT}, + "parent_state": { + "post_comments": { + "use_global_cursor": False, + "state": {"updated_at": COMMENT_10_UPDATED_AT}, # 10 is the "latest" + "parent_state": { + "posts": {"updated_at": POST_1_UPDATED_AT} + }, # post 1 is the latest + "lookback_window": 1, + "states": [ + { + "partition": {"id": 1, "parent_slice": {}}, + "cursor": {"updated_at": COMMENT_10_UPDATED_AT}, + }, + { + "partition": {"id": 2, "parent_slice": {}}, + "cursor": {"updated_at": COMMENT_20_UPDATED_AT}, + }, + { + "partition": {"id": 3, "parent_slice": {}}, + "cursor": {"updated_at": COMMENT_30_UPDATED_AT}, + }, + ], + } + }, + "lookback_window": 1, + "use_global_cursor": False, + "states": [ + { + "partition": {"id": 10, "parent_slice": {"id": 1, "parent_slice": {}}}, + "cursor": {"created_at": VOTE_100_CREATED_AT}, + }, + { + "partition": {"id": 11, "parent_slice": {"id": 1, "parent_slice": {}}}, + "cursor": {"created_at": VOTE_111_CREATED_AT}, + }, + { + "partition": {"id": 12, "parent_slice": {"id": 1, "parent_slice": {}}}, + "cursor": {"created_at": LOOKBACK_DATE}, + }, + { + "partition": {"id": 20, "parent_slice": {"id": 2, "parent_slice": {}}}, + 
"cursor": {"created_at": VOTE_200_CREATED_AT}, + }, + { + "partition": {"id": 21, "parent_slice": {"id": 2, "parent_slice": {}}}, + "cursor": {"created_at": VOTE_210_CREATED_AT}, + }, + { + "partition": {"id": 30, "parent_slice": {"id": 3, "parent_slice": {}}}, + "cursor": {"created_at": VOTE_300_CREATED_AT}, + }, + ], + }, ), ], ) def test_incremental_substream_request_options_provider( - test_name, manifest, mock_requests, expected_records, initial_state, expected_state + test_name, manifest, mock_requests, expected_records, initial_state, expected_state ): """ Test incremental syncing for a stream that uses request options provider from parent stream config. From a1d98fbc41f1d387b5e97197b81641adc66ed1ee Mon Sep 17 00:00:00 2001 From: Anatolii Yatsuk Date: Tue, 18 Feb 2025 18:44:40 +0200 Subject: [PATCH 22/26] Fix format --- .../test_concurrent_perpartitioncursor.py | 3194 ++++++++--------- 1 file changed, 1597 insertions(+), 1597 deletions(-) diff --git a/unit_tests/sources/declarative/incremental/test_concurrent_perpartitioncursor.py b/unit_tests/sources/declarative/incremental/test_concurrent_perpartitioncursor.py index 23459366d..f650847a6 100644 --- a/unit_tests/sources/declarative/incremental/test_concurrent_perpartitioncursor.py +++ b/unit_tests/sources/declarative/incremental/test_concurrent_perpartitioncursor.py @@ -306,7 +306,7 @@ def run_mocked_test( - mock_requests, manifest, config, stream_name, initial_state, expected_records, expected_state + mock_requests, manifest, config, stream_name, initial_state, expected_records, expected_state ): """ Helper function to mock requests, run the test, and verify the results. @@ -356,15 +356,15 @@ def run_mocked_test( [req for req in m.request_history if unquote(req.url) == unquote(url)] ) assert ( - request_count == 1 + request_count == 1 ), f"URL {url} was called {request_count} times, expected exactly once." 
def _run_read( - manifest: Mapping[str, Any], - config: Mapping[str, Any], - stream_name: str, - state: Optional[Union[List[AirbyteStateMessage], MutableMapping[str, Any]]] = None, + manifest: Mapping[str, Any], + config: Mapping[str, Any], + stream_name: str, + state: Optional[Union[List[AirbyteStateMessage], MutableMapping[str, Any]]] = None, ) -> EntrypointOutput: source = ConcurrentDeclarativeSource( source_config=manifest, config=config, catalog=None, state=state @@ -416,8 +416,8 @@ def _run_read( INITIAL_STATE_PARTITION_11_CURSOR.replace("Z", "") ) LOOKBACK_DATE = ( - INITIAL_GLOBAL_CURSOR_DATE - timedelta(days=LOOKBACK_WINDOW_DAYS) - ).isoformat() + "Z" + INITIAL_GLOBAL_CURSOR_DATE - timedelta(days=LOOKBACK_WINDOW_DAYS) +).isoformat() + "Z" PARTITION_SYNC_START_TIME = "2024-01-02T00:00:00Z" @@ -426,316 +426,316 @@ def _run_read( "test_name, manifest, mock_requests, expected_records, initial_state, expected_state", [ ( - "test_incremental_parent_state", - SUBSTREAM_MANIFEST_NO_DEPENDENCY, - [ - # Fetch the first page of posts - ( - f"https://api.example.com/community/posts?per_page=100&start_time={START_DATE}", - { - "posts": [ - {"id": 1, "updated_at": POST_1_UPDATED_AT}, - {"id": 2, "updated_at": POST_2_UPDATED_AT}, - ], - "next_page": f"https://api.example.com/community/posts?per_page=100&start_time={START_DATE}&page=2", - }, - ), - # Fetch the second page of posts - ( - f"https://api.example.com/community/posts?per_page=100&start_time={START_DATE}&page=2", - {"posts": [{"id": 3, "updated_at": POST_3_UPDATED_AT}]}, - ), - # Fetch the first page of comments for post 1 - ( - "https://api.example.com/community/posts/1/comments?per_page=100", - { - "comments": [ - { - "id": 9, - "post_id": 1, - "updated_at": COMMENT_9_OLDEST, # No requests for comment 9, filtered out due to the date - }, - { - "id": 10, - "post_id": 1, - "updated_at": COMMENT_10_UPDATED_AT, - }, - { - "id": 11, - "post_id": 1, - "updated_at": COMMENT_11_UPDATED_AT, - }, - ], - "next_page": "https://api.example.com/community/posts/1/comments?per_page=100&page=2", - }, - ), - # Fetch the second page of comments for post 1 - ( - "https://api.example.com/community/posts/1/comments?per_page=100&page=2", - { - "comments": [ - { - "id": 12, - "post_id": 1, - "updated_at": COMMENT_12_UPDATED_AT, - } - ] - }, - ), - # Fetch the first page of votes for comment 10 of post 1 - ( - f"https://api.example.com/community/posts/1/comments/10/votes?per_page=100&start_time={INITIAL_STATE_PARTITION_10_CURSOR}", - { - "votes": [ - { - "id": 100, - "comment_id": 10, - "created_at": VOTE_100_CREATED_AT, - } - ], - "next_page": f"https://api.example.com/community/posts/1/comments/10/votes?per_page=100&page=2&start_time={INITIAL_STATE_PARTITION_10_CURSOR}", - }, - ), - # Fetch the second page of votes for comment 10 of post 1 - ( - f"https://api.example.com/community/posts/1/comments/10/votes?per_page=100&page=2&start_time={INITIAL_STATE_PARTITION_10_CURSOR}", - { - "votes": [ - { - "id": 101, - "comment_id": 10, - "created_at": VOTE_101_CREATED_AT, - } - ] - }, - ), - # Fetch the first page of votes for comment 11 of post 1 - ( - f"https://api.example.com/community/posts/1/comments/11/votes?per_page=100&start_time={INITIAL_STATE_PARTITION_11_CURSOR}", + "test_incremental_parent_state", + SUBSTREAM_MANIFEST_NO_DEPENDENCY, + [ + # Fetch the first page of posts + ( + f"https://api.example.com/community/posts?per_page=100&start_time={START_DATE}", + { + "posts": [ + {"id": 1, "updated_at": POST_1_UPDATED_AT}, + {"id": 2, "updated_at": 
POST_2_UPDATED_AT}, + ], + "next_page": f"https://api.example.com/community/posts?per_page=100&start_time={START_DATE}&page=2", + }, + ), + # Fetch the second page of posts + ( + f"https://api.example.com/community/posts?per_page=100&start_time={START_DATE}&page=2", + {"posts": [{"id": 3, "updated_at": POST_3_UPDATED_AT}]}, + ), + # Fetch the first page of comments for post 1 + ( + "https://api.example.com/community/posts/1/comments?per_page=100", + { + "comments": [ { - "votes": [ - { - "id": 111, - "comment_id": 11, - "created_at": VOTE_111_CREATED_AT, - } - ] + "id": 9, + "post_id": 1, + "updated_at": COMMENT_9_OLDEST, # No requests for comment 9, filtered out due to the date }, - ), - # Fetch the first page of votes for comment 12 of post 1 - ( - f"https://api.example.com/community/posts/1/comments/12/votes?per_page=100&start_time={LOOKBACK_DATE}", - {"votes": []}, - ), - # Fetch the first page of comments for post 2 - ( - "https://api.example.com/community/posts/2/comments?per_page=100", { - "comments": [ - { - "id": 20, - "post_id": 2, - "updated_at": COMMENT_20_UPDATED_AT, - } - ], - "next_page": "https://api.example.com/community/posts/2/comments?per_page=100&page=2", + "id": 10, + "post_id": 1, + "updated_at": COMMENT_10_UPDATED_AT, }, - ), - # Fetch the second page of comments for post 2 - ( - "https://api.example.com/community/posts/2/comments?per_page=100&page=2", { - "comments": [ - { - "id": 21, - "post_id": 2, - "updated_at": COMMENT_21_UPDATED_AT, - } - ] + "id": 11, + "post_id": 1, + "updated_at": COMMENT_11_UPDATED_AT, }, - ), - # Fetch the first page of votes for comment 20 of post 2 - ( - f"https://api.example.com/community/posts/2/comments/20/votes?per_page=100&start_time={LOOKBACK_DATE}", + ], + "next_page": "https://api.example.com/community/posts/1/comments?per_page=100&page=2", + }, + ), + # Fetch the second page of comments for post 1 + ( + "https://api.example.com/community/posts/1/comments?per_page=100&page=2", + { + "comments": [ { - "votes": [ - { - "id": 200, - "comment_id": 20, - "created_at": VOTE_200_CREATED_AT, - } - ] - }, - ), - # Fetch the first page of votes for comment 21 of post 2 - ( - f"https://api.example.com/community/posts/2/comments/21/votes?per_page=100&start_time={LOOKBACK_DATE}", + "id": 12, + "post_id": 1, + "updated_at": COMMENT_12_UPDATED_AT, + } + ] + }, + ), + # Fetch the first page of votes for comment 10 of post 1 + ( + f"https://api.example.com/community/posts/1/comments/10/votes?per_page=100&start_time={INITIAL_STATE_PARTITION_10_CURSOR}", + { + "votes": [ { - "votes": [ - { - "id": 210, - "comment_id": 21, - "created_at": VOTE_210_CREATED_AT, - } - ] - }, - ), - # Fetch the first page of comments for post 3 - ( - "https://api.example.com/community/posts/3/comments?per_page=100", + "id": 100, + "comment_id": 10, + "created_at": VOTE_100_CREATED_AT, + } + ], + "next_page": f"https://api.example.com/community/posts/1/comments/10/votes?per_page=100&page=2&start_time={INITIAL_STATE_PARTITION_10_CURSOR}", + }, + ), + # Fetch the second page of votes for comment 10 of post 1 + ( + f"https://api.example.com/community/posts/1/comments/10/votes?per_page=100&page=2&start_time={INITIAL_STATE_PARTITION_10_CURSOR}", + { + "votes": [ { - "comments": [ - { - "id": 30, - "post_id": 3, - "updated_at": COMMENT_30_UPDATED_AT, - } - ] - }, - ), - # Fetch the first page of votes for comment 30 of post 3 - ( - f"https://api.example.com/community/posts/3/comments/30/votes?per_page=100&start_time={LOOKBACK_DATE}", + "id": 101, + "comment_id": 10, + 
"created_at": VOTE_101_CREATED_AT, + } + ] + }, + ), + # Fetch the first page of votes for comment 11 of post 1 + ( + f"https://api.example.com/community/posts/1/comments/11/votes?per_page=100&start_time={INITIAL_STATE_PARTITION_11_CURSOR}", + { + "votes": [ { - "votes": [ - { - "id": 300, - "comment_id": 30, - "created_at": VOTE_300_CREATED_AT_TIMESTAMP, - } - ] - }, - ), - ], - # Expected records - [ + "id": 111, + "comment_id": 11, + "created_at": VOTE_111_CREATED_AT, + } + ] + }, + ), + # Fetch the first page of votes for comment 12 of post 1 + ( + f"https://api.example.com/community/posts/1/comments/12/votes?per_page=100&start_time={LOOKBACK_DATE}", + {"votes": []}, + ), + # Fetch the first page of comments for post 2 + ( + "https://api.example.com/community/posts/2/comments?per_page=100", { - "comment_id": 10, - "comment_updated_at": COMMENT_10_UPDATED_AT, - "created_at": VOTE_100_CREATED_AT, - "id": 100, + "comments": [ + { + "id": 20, + "post_id": 2, + "updated_at": COMMENT_20_UPDATED_AT, + } + ], + "next_page": "https://api.example.com/community/posts/2/comments?per_page=100&page=2", }, + ), + # Fetch the second page of comments for post 2 + ( + "https://api.example.com/community/posts/2/comments?per_page=100&page=2", { - "comment_id": 10, - "comment_updated_at": COMMENT_10_UPDATED_AT, - "created_at": VOTE_101_CREATED_AT, - "id": 101, + "comments": [ + { + "id": 21, + "post_id": 2, + "updated_at": COMMENT_21_UPDATED_AT, + } + ] }, + ), + # Fetch the first page of votes for comment 20 of post 2 + ( + f"https://api.example.com/community/posts/2/comments/20/votes?per_page=100&start_time={LOOKBACK_DATE}", { - "comment_id": 11, - "comment_updated_at": COMMENT_11_UPDATED_AT, - "created_at": VOTE_111_CREATED_AT, - "id": 111, + "votes": [ + { + "id": 200, + "comment_id": 20, + "created_at": VOTE_200_CREATED_AT, + } + ] }, + ), + # Fetch the first page of votes for comment 21 of post 2 + ( + f"https://api.example.com/community/posts/2/comments/21/votes?per_page=100&start_time={LOOKBACK_DATE}", { - "comment_id": 20, - "comment_updated_at": COMMENT_20_UPDATED_AT, - "created_at": VOTE_200_CREATED_AT, - "id": 200, + "votes": [ + { + "id": 210, + "comment_id": 21, + "created_at": VOTE_210_CREATED_AT, + } + ] }, + ), + # Fetch the first page of comments for post 3 + ( + "https://api.example.com/community/posts/3/comments?per_page=100", { - "comment_id": 21, - "comment_updated_at": COMMENT_21_UPDATED_AT, - "created_at": VOTE_210_CREATED_AT, - "id": 210, + "comments": [ + { + "id": 30, + "post_id": 3, + "updated_at": COMMENT_30_UPDATED_AT, + } + ] }, + ), + # Fetch the first page of votes for comment 30 of post 3 + ( + f"https://api.example.com/community/posts/3/comments/30/votes?per_page=100&start_time={LOOKBACK_DATE}", { - "comment_id": 30, - "comment_updated_at": COMMENT_30_UPDATED_AT, - "created_at": str(VOTE_300_CREATED_AT_TIMESTAMP), - "id": 300, + "votes": [ + { + "id": 300, + "comment_id": 30, + "created_at": VOTE_300_CREATED_AT_TIMESTAMP, + } + ] }, - ], - # Initial state + ), + ], + # Expected records + [ { - # This should not happen since parent state is disabled, but I've added this to validate that and - # incoming parent_state is ignored when the parent stream's incremental_dependency is disabled - "parent_state": { - "post_comments": { - "states": [ - { - "partition": {"id": 1, "parent_slice": {}}, - "cursor": {"updated_at": PARENT_COMMENT_CURSOR_PARTITION_1}, - } - ], - "parent_state": {"posts": {"updated_at": PARENT_POSTS_CURSOR}}, - } - }, - "states": [ - { - "partition": { - 
"id": 10, - "parent_slice": {"id": 1, "parent_slice": {}}, - }, - "cursor": {"created_at": INITIAL_STATE_PARTITION_10_CURSOR_TIMESTAMP}, - }, - { - "partition": { - "id": 11, - "parent_slice": {"id": 1, "parent_slice": {}}, - }, - "cursor": {"created_at": INITIAL_STATE_PARTITION_11_CURSOR}, - }, - ], - "state": {"created_at": INITIAL_STATE_PARTITION_11_CURSOR_TIMESTAMP}, - "lookback_window": 86400, + "comment_id": 10, + "comment_updated_at": COMMENT_10_UPDATED_AT, + "created_at": VOTE_100_CREATED_AT, + "id": 100, }, - # Expected state { - "states": [ - { - "partition": { - "id": 10, - "parent_slice": {"id": 1, "parent_slice": {}}, - }, - "cursor": {"created_at": VOTE_100_CREATED_AT}, + "comment_id": 10, + "comment_updated_at": COMMENT_10_UPDATED_AT, + "created_at": VOTE_101_CREATED_AT, + "id": 101, + }, + { + "comment_id": 11, + "comment_updated_at": COMMENT_11_UPDATED_AT, + "created_at": VOTE_111_CREATED_AT, + "id": 111, + }, + { + "comment_id": 20, + "comment_updated_at": COMMENT_20_UPDATED_AT, + "created_at": VOTE_200_CREATED_AT, + "id": 200, + }, + { + "comment_id": 21, + "comment_updated_at": COMMENT_21_UPDATED_AT, + "created_at": VOTE_210_CREATED_AT, + "id": 210, + }, + { + "comment_id": 30, + "comment_updated_at": COMMENT_30_UPDATED_AT, + "created_at": str(VOTE_300_CREATED_AT_TIMESTAMP), + "id": 300, + }, + ], + # Initial state + { + # This should not happen since parent state is disabled, but I've added this to validate that and + # incoming parent_state is ignored when the parent stream's incremental_dependency is disabled + "parent_state": { + "post_comments": { + "states": [ + { + "partition": {"id": 1, "parent_slice": {}}, + "cursor": {"updated_at": PARENT_COMMENT_CURSOR_PARTITION_1}, + } + ], + "parent_state": {"posts": {"updated_at": PARENT_POSTS_CURSOR}}, + } + }, + "states": [ + { + "partition": { + "id": 10, + "parent_slice": {"id": 1, "parent_slice": {}}, }, - { - "partition": { - "id": 11, - "parent_slice": {"id": 1, "parent_slice": {}}, - }, - "cursor": {"created_at": VOTE_111_CREATED_AT}, + "cursor": {"created_at": INITIAL_STATE_PARTITION_10_CURSOR_TIMESTAMP}, + }, + { + "partition": { + "id": 11, + "parent_slice": {"id": 1, "parent_slice": {}}, }, - { - "partition": { - "id": 12, - "parent_slice": {"id": 1, "parent_slice": {}}, - }, - "cursor": {"created_at": LOOKBACK_DATE}, + "cursor": {"created_at": INITIAL_STATE_PARTITION_11_CURSOR}, + }, + ], + "state": {"created_at": INITIAL_STATE_PARTITION_11_CURSOR_TIMESTAMP}, + "lookback_window": 86400, + }, + # Expected state + { + "states": [ + { + "partition": { + "id": 10, + "parent_slice": {"id": 1, "parent_slice": {}}, }, - { - "partition": { - "id": 20, - "parent_slice": {"id": 2, "parent_slice": {}}, - }, - "cursor": {"created_at": VOTE_200_CREATED_AT}, + "cursor": {"created_at": VOTE_100_CREATED_AT}, + }, + { + "partition": { + "id": 11, + "parent_slice": {"id": 1, "parent_slice": {}}, }, - { - "partition": { - "id": 21, - "parent_slice": {"id": 2, "parent_slice": {}}, - }, - "cursor": {"created_at": VOTE_210_CREATED_AT}, + "cursor": {"created_at": VOTE_111_CREATED_AT}, + }, + { + "partition": { + "id": 12, + "parent_slice": {"id": 1, "parent_slice": {}}, }, - { - "partition": { - "id": 30, - "parent_slice": {"id": 3, "parent_slice": {}}, - }, - "cursor": {"created_at": VOTE_300_CREATED_AT}, + "cursor": {"created_at": LOOKBACK_DATE}, + }, + { + "partition": { + "id": 20, + "parent_slice": {"id": 2, "parent_slice": {}}, }, - ], - "use_global_cursor": False, - "lookback_window": 1, - "parent_state": {}, - "state": 
{"created_at": VOTE_100_CREATED_AT}, - }, + "cursor": {"created_at": VOTE_200_CREATED_AT}, + }, + { + "partition": { + "id": 21, + "parent_slice": {"id": 2, "parent_slice": {}}, + }, + "cursor": {"created_at": VOTE_210_CREATED_AT}, + }, + { + "partition": { + "id": 30, + "parent_slice": {"id": 3, "parent_slice": {}}, + }, + "cursor": {"created_at": VOTE_300_CREATED_AT}, + }, + ], + "use_global_cursor": False, + "lookback_window": 1, + "parent_state": {}, + "state": {"created_at": VOTE_100_CREATED_AT}, + }, ), ], ) def test_incremental_parent_state_no_incremental_dependency( - test_name, manifest, mock_requests, expected_records, initial_state, expected_state + test_name, manifest, mock_requests, expected_records, initial_state, expected_state ): """ This is a pretty complicated test that syncs a low-code connector stream with three levels of substreams @@ -761,13 +761,13 @@ def test_incremental_parent_state_no_incremental_dependency( def run_incremental_parent_state_test( - manifest, - mock_requests, - expected_records, - num_intermediate_states, - intermidiate_states, - initial_state, - expected_states, + manifest, + mock_requests, + expected_records, + num_intermediate_states, + intermidiate_states, + initial_state, + expected_states, ): """ Run an incremental parent state test for the specified stream. @@ -856,8 +856,8 @@ def run_incremental_parent_state_test( {orjson.dumps(record): record for record in expected_records}.values() ) assert ( - sorted(cumulative_records_state_deduped, key=lambda x: x["id"]) - == sorted(expected_records_set, key=lambda x: x["id"]) + sorted(cumulative_records_state_deduped, key=lambda x: x["id"]) + == sorted(expected_records_set, key=lambda x: x["id"]) ), f"Records mismatch with intermediate state {state}. Expected {expected_records}, got {cumulative_records_state_deduped}" # Store the final state after each intermediate read @@ -870,7 +870,7 @@ def run_incremental_parent_state_test( # Assert that the final state matches the expected state for all runs for i, final_state in enumerate(final_states): assert ( - final_state in expected_states + final_state in expected_states ), f"Final state mismatch at run {i + 1}. 
Expected {expected_states}, got {final_state}" @@ -1183,295 +1183,295 @@ def run_incremental_parent_state_test( "test_name, manifest, mock_requests, expected_records, num_intermediate_states, intermidiate_states, initial_state, expected_state", [ ( - "test_incremental_parent_state", - SUBSTREAM_MANIFEST, - [ - # Fetch the first page of posts - ( - f"https://api.example.com/community/posts?per_page=100&start_time={PARENT_POSTS_CURSOR}", + "test_incremental_parent_state", + SUBSTREAM_MANIFEST, + [ + # Fetch the first page of posts + ( + f"https://api.example.com/community/posts?per_page=100&start_time={PARENT_POSTS_CURSOR}", + { + "posts": [ + {"id": 1, "updated_at": POST_1_UPDATED_AT}, + {"id": 2, "updated_at": POST_2_UPDATED_AT}, + ], + "next_page": ( + f"https://api.example.com/community/posts" + f"?per_page=100&start_time={PARENT_POSTS_CURSOR}&page=2" + ), + }, + ), + # Fetch the second page of posts + ( + f"https://api.example.com/community/posts?per_page=100&start_time={PARENT_POSTS_CURSOR}&page=2", + {"posts": [{"id": 3, "updated_at": POST_3_UPDATED_AT}]}, + ), + # Fetch the first page of comments for post 1 + ( + "https://api.example.com/community/posts/1/comments?per_page=100", + { + "comments": [ { - "posts": [ - {"id": 1, "updated_at": POST_1_UPDATED_AT}, - {"id": 2, "updated_at": POST_2_UPDATED_AT}, - ], - "next_page": ( - f"https://api.example.com/community/posts" - f"?per_page=100&start_time={PARENT_POSTS_CURSOR}&page=2" - ), + "id": 9, + "post_id": 1, + "updated_at": COMMENT_9_OLDEST, }, - ), - # Fetch the second page of posts - ( - f"https://api.example.com/community/posts?per_page=100&start_time={PARENT_POSTS_CURSOR}&page=2", - {"posts": [{"id": 3, "updated_at": POST_3_UPDATED_AT}]}, - ), - # Fetch the first page of comments for post 1 - ( - "https://api.example.com/community/posts/1/comments?per_page=100", { - "comments": [ - { - "id": 9, - "post_id": 1, - "updated_at": COMMENT_9_OLDEST, - }, - { - "id": 10, - "post_id": 1, - "updated_at": COMMENT_10_UPDATED_AT, - }, - { - "id": 11, - "post_id": 1, - "updated_at": COMMENT_11_UPDATED_AT, - }, - ], - "next_page": "https://api.example.com/community/posts/1/comments?per_page=100&page=2", + "id": 10, + "post_id": 1, + "updated_at": COMMENT_10_UPDATED_AT, }, - ), - # Fetch the second page of comments for post 1 - ( - "https://api.example.com/community/posts/1/comments?per_page=100&page=2", - {"comments": [{"id": 12, "post_id": 1, "updated_at": COMMENT_12_UPDATED_AT}]}, - ), - # Fetch the first page of votes for comment 10 of post 1 - ( - f"https://api.example.com/community/posts/1/comments/10/votes?per_page=100&start_time={INITIAL_STATE_PARTITION_10_CURSOR}", { - "votes": [ - { - "id": 100, - "comment_id": 10, - "created_at": VOTE_100_CREATED_AT, - } - ], - "next_page": ( - f"https://api.example.com/community/posts/1/comments/10/votes" - f"?per_page=100&page=2&start_time={INITIAL_STATE_PARTITION_10_CURSOR}" - ), + "id": 11, + "post_id": 1, + "updated_at": COMMENT_11_UPDATED_AT, }, - ), - # Fetch the second page of votes for comment 10 of post 1 - ( - f"https://api.example.com/community/posts/1/comments/10/votes" - f"?per_page=100&page=2&start_time={INITIAL_STATE_PARTITION_10_CURSOR}", - {"votes": [{"id": 101, "comment_id": 10, "created_at": VOTE_101_CREATED_AT}]}, - ), - # Fetch the first page of votes for comment 11 of post 1 - ( - f"https://api.example.com/community/posts/1/comments/11/votes" - f"?per_page=100&start_time={INITIAL_STATE_PARTITION_11_CURSOR}", - {"votes": [{"id": 111, "comment_id": 11, "created_at": 
VOTE_111_CREATED_AT}]}, - ), - # Fetch the first page of votes for comment 12 of post 1 - ( - f"https://api.example.com/community/posts/1/comments/12/votes?per_page=100&start_time={LOOKBACK_DATE}", - {"votes": []}, - ), - # Fetch the first page of comments for post 2 - ( - "https://api.example.com/community/posts/2/comments?per_page=100", + ], + "next_page": "https://api.example.com/community/posts/1/comments?per_page=100&page=2", + }, + ), + # Fetch the second page of comments for post 1 + ( + "https://api.example.com/community/posts/1/comments?per_page=100&page=2", + {"comments": [{"id": 12, "post_id": 1, "updated_at": COMMENT_12_UPDATED_AT}]}, + ), + # Fetch the first page of votes for comment 10 of post 1 + ( + f"https://api.example.com/community/posts/1/comments/10/votes?per_page=100&start_time={INITIAL_STATE_PARTITION_10_CURSOR}", + { + "votes": [ { - "comments": [{"id": 20, "post_id": 2, "updated_at": COMMENT_20_UPDATED_AT}], - "next_page": "https://api.example.com/community/posts/2/comments?per_page=100&page=2", - }, - ), - # Fetch the second page of comments for post 2 - ( - "https://api.example.com/community/posts/2/comments?per_page=100&page=2", - {"comments": [{"id": 21, "post_id": 2, "updated_at": COMMENT_21_UPDATED_AT}]}, - ), - # Fetch the first page of votes for comment 20 of post 2 - ( - f"https://api.example.com/community/posts/2/comments/20/votes?per_page=100&start_time={LOOKBACK_DATE}", - {"votes": [{"id": 200, "comment_id": 20, "created_at": VOTE_200_CREATED_AT}]}, - ), - # Fetch the first page of votes for comment 21 of post 2 - ( - f"https://api.example.com/community/posts/2/comments/21/votes?per_page=100&start_time={LOOKBACK_DATE}", - {"votes": [{"id": 210, "comment_id": 21, "created_at": VOTE_210_CREATED_AT}]}, - ), - # Fetch the first page of comments for post 3 - ( - "https://api.example.com/community/posts/3/comments?per_page=100", - {"comments": [{"id": 30, "post_id": 3, "updated_at": COMMENT_30_UPDATED_AT}]}, - ), - # Fetch the first page of votes for comment 30 of post 3 - ( - f"https://api.example.com/community/posts/3/comments/30/votes?per_page=100&start_time={LOOKBACK_DATE}", + "id": 100, + "comment_id": 10, + "created_at": VOTE_100_CREATED_AT, + } + ], + "next_page": ( + f"https://api.example.com/community/posts/1/comments/10/votes" + f"?per_page=100&page=2&start_time={INITIAL_STATE_PARTITION_10_CURSOR}" + ), + }, + ), + # Fetch the second page of votes for comment 10 of post 1 + ( + f"https://api.example.com/community/posts/1/comments/10/votes" + f"?per_page=100&page=2&start_time={INITIAL_STATE_PARTITION_10_CURSOR}", + {"votes": [{"id": 101, "comment_id": 10, "created_at": VOTE_101_CREATED_AT}]}, + ), + # Fetch the first page of votes for comment 11 of post 1 + ( + f"https://api.example.com/community/posts/1/comments/11/votes" + f"?per_page=100&start_time={INITIAL_STATE_PARTITION_11_CURSOR}", + {"votes": [{"id": 111, "comment_id": 11, "created_at": VOTE_111_CREATED_AT}]}, + ), + # Fetch the first page of votes for comment 12 of post 1 + ( + f"https://api.example.com/community/posts/1/comments/12/votes?per_page=100&start_time={LOOKBACK_DATE}", + {"votes": []}, + ), + # Fetch the first page of comments for post 2 + ( + "https://api.example.com/community/posts/2/comments?per_page=100", + { + "comments": [{"id": 20, "post_id": 2, "updated_at": COMMENT_20_UPDATED_AT}], + "next_page": "https://api.example.com/community/posts/2/comments?per_page=100&page=2", + }, + ), + # Fetch the second page of comments for post 2 + ( + 
"https://api.example.com/community/posts/2/comments?per_page=100&page=2", + {"comments": [{"id": 21, "post_id": 2, "updated_at": COMMENT_21_UPDATED_AT}]}, + ), + # Fetch the first page of votes for comment 20 of post 2 + ( + f"https://api.example.com/community/posts/2/comments/20/votes?per_page=100&start_time={LOOKBACK_DATE}", + {"votes": [{"id": 200, "comment_id": 20, "created_at": VOTE_200_CREATED_AT}]}, + ), + # Fetch the first page of votes for comment 21 of post 2 + ( + f"https://api.example.com/community/posts/2/comments/21/votes?per_page=100&start_time={LOOKBACK_DATE}", + {"votes": [{"id": 210, "comment_id": 21, "created_at": VOTE_210_CREATED_AT}]}, + ), + # Fetch the first page of comments for post 3 + ( + "https://api.example.com/community/posts/3/comments?per_page=100", + {"comments": [{"id": 30, "post_id": 3, "updated_at": COMMENT_30_UPDATED_AT}]}, + ), + # Fetch the first page of votes for comment 30 of post 3 + ( + f"https://api.example.com/community/posts/3/comments/30/votes?per_page=100&start_time={LOOKBACK_DATE}", + { + "votes": [ { - "votes": [ - { - "id": 300, - "comment_id": 30, - "created_at": VOTE_300_CREATED_AT_TIMESTAMP, - } - ] - }, - ), - # Requests with intermediate states - # Fetch votes for comment 10 of post 1 - ( - f"https://api.example.com/community/posts/1/comments/10/votes?per_page=100&start_time={VOTE_100_CREATED_AT}", + "id": 300, + "comment_id": 30, + "created_at": VOTE_300_CREATED_AT_TIMESTAMP, + } + ] + }, + ), + # Requests with intermediate states + # Fetch votes for comment 10 of post 1 + ( + f"https://api.example.com/community/posts/1/comments/10/votes?per_page=100&start_time={VOTE_100_CREATED_AT}", + { + "votes": [{"id": 100, "comment_id": 10, "created_at": VOTE_100_CREATED_AT}], + }, + ), + # Fetch votes for comment 11 of post 1 + ( + f"https://api.example.com/community/posts/1/comments/11/votes?per_page=100&start_time={VOTE_111_CREATED_AT}", + { + "votes": [{"id": 111, "comment_id": 11, "created_at": VOTE_111_CREATED_AT}], + }, + ), + # Fetch votes for comment 12 of post 1 + ( + f"https://api.example.com/community/posts/1/comments/12/votes?per_page=100&start_time={VOTE_111_CREATED_AT}", + { + "votes": [], + }, + ), + # Fetch votes for comment 20 of post 2 + ( + f"https://api.example.com/community/posts/2/comments/20/votes?per_page=100&start_time={VOTE_200_CREATED_AT}", + {"votes": [{"id": 200, "comment_id": 20, "created_at": VOTE_200_CREATED_AT}]}, + ), + # Fetch votes for comment 21 of post 2 + ( + f"https://api.example.com/community/posts/2/comments/21/votes?per_page=100&start_time={VOTE_210_CREATED_AT}", + {"votes": [{"id": 210, "comment_id": 21, "created_at": VOTE_210_CREATED_AT}]}, + ), + # Fetch votes for comment 30 of post 3 + ( + f"https://api.example.com/community/posts/3/comments/30/votes?per_page=100&start_time={VOTE_300_CREATED_AT}", + { + "votes": [ { - "votes": [{"id": 100, "comment_id": 10, "created_at": VOTE_100_CREATED_AT}], - }, - ), - # Fetch votes for comment 11 of post 1 - ( - f"https://api.example.com/community/posts/1/comments/11/votes?per_page=100&start_time={VOTE_111_CREATED_AT}", + "id": 300, + "comment_id": 30, + "created_at": VOTE_300_CREATED_AT_TIMESTAMP, + } + ] + }, + ), + ], + # Expected records + [ + { + "comment_id": 10, + "comment_updated_at": COMMENT_10_UPDATED_AT, + "created_at": VOTE_100_CREATED_AT, + "id": 100, + }, + { + "comment_id": 10, + "comment_updated_at": COMMENT_10_UPDATED_AT, + "created_at": VOTE_101_CREATED_AT, + "id": 101, + }, + { + "comment_id": 11, + "comment_updated_at": 
COMMENT_11_UPDATED_AT, + "created_at": VOTE_111_CREATED_AT, + "id": 111, + }, + { + "comment_id": 20, + "comment_updated_at": COMMENT_20_UPDATED_AT, + "created_at": VOTE_200_CREATED_AT, + "id": 200, + }, + { + "comment_id": 21, + "comment_updated_at": COMMENT_21_UPDATED_AT, + "created_at": VOTE_210_CREATED_AT, + "id": 210, + }, + { + "comment_id": 30, + "comment_updated_at": COMMENT_30_UPDATED_AT, + "created_at": str(VOTE_300_CREATED_AT_TIMESTAMP), + "id": 300, + }, + ], + # Number of intermediate states - 6 as number of parent partitions + 6, + # Intermediate states + INTERMEDIATE_STATES, + # Initial state + INITIAL_STATE, + # Expected state + { + "state": {"created_at": VOTE_100_CREATED_AT}, + "parent_state": { + "post_comments": { + "use_global_cursor": False, + "state": {"updated_at": COMMENT_10_UPDATED_AT}, # 10 is the "latest" + "parent_state": { + "posts": {"updated_at": POST_1_UPDATED_AT} + }, # post 1 is the latest + "lookback_window": 1, + "states": [ { - "votes": [{"id": 111, "comment_id": 11, "created_at": VOTE_111_CREATED_AT}], + "partition": {"id": 1, "parent_slice": {}}, + "cursor": {"updated_at": COMMENT_10_UPDATED_AT}, }, - ), - # Fetch votes for comment 12 of post 1 - ( - f"https://api.example.com/community/posts/1/comments/12/votes?per_page=100&start_time={VOTE_111_CREATED_AT}", { - "votes": [], + "partition": {"id": 2, "parent_slice": {}}, + "cursor": {"updated_at": COMMENT_20_UPDATED_AT}, }, - ), - # Fetch votes for comment 20 of post 2 - ( - f"https://api.example.com/community/posts/2/comments/20/votes?per_page=100&start_time={VOTE_200_CREATED_AT}", - {"votes": [{"id": 200, "comment_id": 20, "created_at": VOTE_200_CREATED_AT}]}, - ), - # Fetch votes for comment 21 of post 2 - ( - f"https://api.example.com/community/posts/2/comments/21/votes?per_page=100&start_time={VOTE_210_CREATED_AT}", - {"votes": [{"id": 210, "comment_id": 21, "created_at": VOTE_210_CREATED_AT}]}, - ), - # Fetch votes for comment 30 of post 3 - ( - f"https://api.example.com/community/posts/3/comments/30/votes?per_page=100&start_time={VOTE_300_CREATED_AT}", { - "votes": [ - { - "id": 300, - "comment_id": 30, - "created_at": VOTE_300_CREATED_AT_TIMESTAMP, - } - ] + "partition": {"id": 3, "parent_slice": {}}, + "cursor": {"updated_at": COMMENT_30_UPDATED_AT}, }, - ), - ], - # Expected records - [ + ], + } + }, + "lookback_window": 1, + "use_global_cursor": False, + "states": [ { - "comment_id": 10, - "comment_updated_at": COMMENT_10_UPDATED_AT, - "created_at": VOTE_100_CREATED_AT, - "id": 100, + "partition": {"id": 10, "parent_slice": {"id": 1, "parent_slice": {}}}, + "cursor": {"created_at": VOTE_100_CREATED_AT}, }, { - "comment_id": 10, - "comment_updated_at": COMMENT_10_UPDATED_AT, - "created_at": VOTE_101_CREATED_AT, - "id": 101, + "partition": {"id": 11, "parent_slice": {"id": 1, "parent_slice": {}}}, + "cursor": {"created_at": VOTE_111_CREATED_AT}, }, { - "comment_id": 11, - "comment_updated_at": COMMENT_11_UPDATED_AT, - "created_at": VOTE_111_CREATED_AT, - "id": 111, + "partition": {"id": 12, "parent_slice": {"id": 1, "parent_slice": {}}}, + "cursor": {"created_at": LOOKBACK_DATE}, }, { - "comment_id": 20, - "comment_updated_at": COMMENT_20_UPDATED_AT, - "created_at": VOTE_200_CREATED_AT, - "id": 200, + "partition": {"id": 20, "parent_slice": {"id": 2, "parent_slice": {}}}, + "cursor": {"created_at": VOTE_200_CREATED_AT}, }, { - "comment_id": 21, - "comment_updated_at": COMMENT_21_UPDATED_AT, - "created_at": VOTE_210_CREATED_AT, - "id": 210, + "partition": {"id": 21, "parent_slice": {"id": 2, 
"parent_slice": {}}}, + "cursor": {"created_at": VOTE_210_CREATED_AT}, }, { - "comment_id": 30, - "comment_updated_at": COMMENT_30_UPDATED_AT, - "created_at": str(VOTE_300_CREATED_AT_TIMESTAMP), - "id": 300, + "partition": {"id": 30, "parent_slice": {"id": 3, "parent_slice": {}}}, + "cursor": {"created_at": VOTE_300_CREATED_AT}, }, ], - # Number of intermediate states - 6 as number of parent partitions - 6, - # Intermediate states - INTERMEDIATE_STATES, - # Initial state - INITIAL_STATE, - # Expected state - { - "state": {"created_at": VOTE_100_CREATED_AT}, - "parent_state": { - "post_comments": { - "use_global_cursor": False, - "state": {"updated_at": COMMENT_10_UPDATED_AT}, # 10 is the "latest" - "parent_state": { - "posts": {"updated_at": POST_1_UPDATED_AT} - }, # post 1 is the latest - "lookback_window": 1, - "states": [ - { - "partition": {"id": 1, "parent_slice": {}}, - "cursor": {"updated_at": COMMENT_10_UPDATED_AT}, - }, - { - "partition": {"id": 2, "parent_slice": {}}, - "cursor": {"updated_at": COMMENT_20_UPDATED_AT}, - }, - { - "partition": {"id": 3, "parent_slice": {}}, - "cursor": {"updated_at": COMMENT_30_UPDATED_AT}, - }, - ], - } - }, - "lookback_window": 1, - "use_global_cursor": False, - "states": [ - { - "partition": {"id": 10, "parent_slice": {"id": 1, "parent_slice": {}}}, - "cursor": {"created_at": VOTE_100_CREATED_AT}, - }, - { - "partition": {"id": 11, "parent_slice": {"id": 1, "parent_slice": {}}}, - "cursor": {"created_at": VOTE_111_CREATED_AT}, - }, - { - "partition": {"id": 12, "parent_slice": {"id": 1, "parent_slice": {}}}, - "cursor": {"created_at": LOOKBACK_DATE}, - }, - { - "partition": {"id": 20, "parent_slice": {"id": 2, "parent_slice": {}}}, - "cursor": {"created_at": VOTE_200_CREATED_AT}, - }, - { - "partition": {"id": 21, "parent_slice": {"id": 2, "parent_slice": {}}}, - "cursor": {"created_at": VOTE_210_CREATED_AT}, - }, - { - "partition": {"id": 30, "parent_slice": {"id": 3, "parent_slice": {}}}, - "cursor": {"created_at": VOTE_300_CREATED_AT}, - }, - ], - }, + }, ), ], ) def test_incremental_parent_state( - test_name, - manifest, - mock_requests, - expected_records, - num_intermediate_states, - intermidiate_states, - initial_state, - expected_state, + test_name, + manifest, + mock_requests, + expected_records, + num_intermediate_states, + intermidiate_states, + initial_state, + expected_state, ): # Patch `_throttle_state_message` so it always returns a float (indicating "no throttle") with patch.object( - ConcurrentPerPartitionCursor, "_throttle_state_message", return_value=9999999.0 + ConcurrentPerPartitionCursor, "_throttle_state_message", return_value=9999999.0 ): run_incremental_parent_state_test( manifest, @@ -1546,208 +1546,208 @@ def test_incremental_parent_state( "test_name, manifest, mock_requests, expected_records", [ ( - "test_incremental_parent_state", - SUBSTREAM_MANIFEST, - [ - # Fetch the first page of posts - ( - f"https://api.example.com/community/posts?per_page=100&start_time={PARTITION_SYNC_START_TIME}", - { - "posts": [ - {"id": 1, "updated_at": POST_1_UPDATED_AT}, - {"id": 2, "updated_at": POST_2_UPDATED_AT}, - ], - "next_page": ( - f"https://api.example.com/community/posts?per_page=100" - f"&start_time={PARTITION_SYNC_START_TIME}&page=2" - ), - }, - ), - # Fetch the second page of posts - ( - f"https://api.example.com/community/posts?per_page=100" - f"&start_time={PARTITION_SYNC_START_TIME}&page=2", - {"posts": [{"id": 3, "updated_at": POST_3_UPDATED_AT}]}, - ), - # Fetch the first page of comments for post 1 - ( - 
"https://api.example.com/community/posts/1/comments?per_page=100", - { - "comments": [ - {"id": 9, "post_id": 1, "updated_at": COMMENT_9_OLDEST}, - {"id": 10, "post_id": 1, "updated_at": COMMENT_10_UPDATED_AT}, - {"id": 11, "post_id": 1, "updated_at": COMMENT_11_UPDATED_AT}, - ], - "next_page": ( - "https://api.example.com/community/posts/1/comments" - "?per_page=100&page=2" - ), - }, - ), - # Fetch the second page of comments for post 1 - ( - "https://api.example.com/community/posts/1/comments?per_page=100&page=2", - {"comments": [{"id": 12, "post_id": 1, "updated_at": COMMENT_12_UPDATED_AT}]}, - ), - # Fetch the first page of votes for comment 10 of post 1 - ( - f"https://api.example.com/community/posts/1/comments/10/votes" - f"?per_page=100&start_time={PARTITION_SYNC_START_TIME}", - { - "votes": [{"id": 100, "comment_id": 10, "created_at": VOTE_100_CREATED_AT}], - "next_page": ( - f"https://api.example.com/community/posts/1/comments/10/votes" - f"?per_page=100&page=2&start_time={PARTITION_SYNC_START_TIME}" - ), - }, - ), - # Fetch the second page of votes for comment 10 of post 1 - ( - f"https://api.example.com/community/posts/1/comments/10/votes" - f"?per_page=100&page=2&start_time={PARTITION_SYNC_START_TIME}", - {"votes": [{"id": 101, "comment_id": 10, "created_at": VOTE_101_CREATED_AT}]}, - ), - # Fetch the first page of votes for comment 11 of post 1 - ( - f"https://api.example.com/community/posts/1/comments/11/votes" - f"?per_page=100&start_time={PARTITION_SYNC_START_TIME}", - {"votes": [{"id": 111, "comment_id": 11, "created_at": VOTE_111_CREATED_AT}]}, - ), - # Fetch the first page of votes for comment 12 of post 1 - ( - f"https://api.example.com/community/posts/1/comments/12/votes" - f"?per_page=100&start_time={PARTITION_SYNC_START_TIME}", - {"votes": []}, - ), - # Fetch the first page of comments for post 2 - ( - "https://api.example.com/community/posts/2/comments?per_page=100", - { - "comments": [{"id": 20, "post_id": 2, "updated_at": COMMENT_20_UPDATED_AT}], - "next_page": ( - "https://api.example.com/community/posts/2/comments" - "?per_page=100&page=2" - ), - }, - ), - # Fetch the second page of comments for post 2 - ( - "https://api.example.com/community/posts/2/comments?per_page=100&page=2", - {"comments": [{"id": 21, "post_id": 2, "updated_at": COMMENT_21_UPDATED_AT}]}, - ), - # Fetch the first page of votes for comment 20 of post 2 - ( - f"https://api.example.com/community/posts/2/comments/20/votes" - f"?per_page=100&start_time={PARTITION_SYNC_START_TIME}", - {"votes": [{"id": 200, "comment_id": 20, "created_at": VOTE_200_CREATED_AT}]}, - ), - # Fetch the first page of votes for comment 21 of post 2 - ( - f"https://api.example.com/community/posts/2/comments/21/votes" - f"?per_page=100&start_time={PARTITION_SYNC_START_TIME}", - {"votes": [{"id": 210, "comment_id": 21, "created_at": VOTE_210_CREATED_AT}]}, - ), - # Fetch the first page of comments for post 3 - ( - "https://api.example.com/community/posts/3/comments?per_page=100", - {"comments": [{"id": 30, "post_id": 3, "updated_at": COMMENT_30_UPDATED_AT}]}, - ), - # Fetch the first page of votes for comment 30 of post 3 - ( - f"https://api.example.com/community/posts/3/comments/30/votes" - f"?per_page=100&start_time={PARTITION_SYNC_START_TIME}", - { - "votes": [ - { - "id": 300, - "comment_id": 30, - "created_at": VOTE_300_CREATED_AT_TIMESTAMP, - } - ] - }, - ), - ], - # Expected records - [ - { - "comment_id": 10, - "comment_updated_at": COMMENT_10_UPDATED_AT, - "created_at": VOTE_100_CREATED_AT, - "id": 100, - }, + 
"test_incremental_parent_state", + SUBSTREAM_MANIFEST, + [ + # Fetch the first page of posts + ( + f"https://api.example.com/community/posts?per_page=100&start_time={PARTITION_SYNC_START_TIME}", { - "comment_id": 10, - "comment_updated_at": COMMENT_10_UPDATED_AT, - "created_at": VOTE_101_CREATED_AT, - "id": 101, + "posts": [ + {"id": 1, "updated_at": POST_1_UPDATED_AT}, + {"id": 2, "updated_at": POST_2_UPDATED_AT}, + ], + "next_page": ( + f"https://api.example.com/community/posts?per_page=100" + f"&start_time={PARTITION_SYNC_START_TIME}&page=2" + ), }, + ), + # Fetch the second page of posts + ( + f"https://api.example.com/community/posts?per_page=100" + f"&start_time={PARTITION_SYNC_START_TIME}&page=2", + {"posts": [{"id": 3, "updated_at": POST_3_UPDATED_AT}]}, + ), + # Fetch the first page of comments for post 1 + ( + "https://api.example.com/community/posts/1/comments?per_page=100", { - "comment_id": 11, - "comment_updated_at": COMMENT_11_UPDATED_AT, - "created_at": VOTE_111_CREATED_AT, - "id": 111, + "comments": [ + {"id": 9, "post_id": 1, "updated_at": COMMENT_9_OLDEST}, + {"id": 10, "post_id": 1, "updated_at": COMMENT_10_UPDATED_AT}, + {"id": 11, "post_id": 1, "updated_at": COMMENT_11_UPDATED_AT}, + ], + "next_page": ( + "https://api.example.com/community/posts/1/comments" + "?per_page=100&page=2" + ), }, + ), + # Fetch the second page of comments for post 1 + ( + "https://api.example.com/community/posts/1/comments?per_page=100&page=2", + {"comments": [{"id": 12, "post_id": 1, "updated_at": COMMENT_12_UPDATED_AT}]}, + ), + # Fetch the first page of votes for comment 10 of post 1 + ( + f"https://api.example.com/community/posts/1/comments/10/votes" + f"?per_page=100&start_time={PARTITION_SYNC_START_TIME}", { - "comment_id": 20, - "comment_updated_at": COMMENT_20_UPDATED_AT, - "created_at": VOTE_200_CREATED_AT, - "id": 200, + "votes": [{"id": 100, "comment_id": 10, "created_at": VOTE_100_CREATED_AT}], + "next_page": ( + f"https://api.example.com/community/posts/1/comments/10/votes" + f"?per_page=100&page=2&start_time={PARTITION_SYNC_START_TIME}" + ), }, + ), + # Fetch the second page of votes for comment 10 of post 1 + ( + f"https://api.example.com/community/posts/1/comments/10/votes" + f"?per_page=100&page=2&start_time={PARTITION_SYNC_START_TIME}", + {"votes": [{"id": 101, "comment_id": 10, "created_at": VOTE_101_CREATED_AT}]}, + ), + # Fetch the first page of votes for comment 11 of post 1 + ( + f"https://api.example.com/community/posts/1/comments/11/votes" + f"?per_page=100&start_time={PARTITION_SYNC_START_TIME}", + {"votes": [{"id": 111, "comment_id": 11, "created_at": VOTE_111_CREATED_AT}]}, + ), + # Fetch the first page of votes for comment 12 of post 1 + ( + f"https://api.example.com/community/posts/1/comments/12/votes" + f"?per_page=100&start_time={PARTITION_SYNC_START_TIME}", + {"votes": []}, + ), + # Fetch the first page of comments for post 2 + ( + "https://api.example.com/community/posts/2/comments?per_page=100", { - "comment_id": 21, - "comment_updated_at": COMMENT_21_UPDATED_AT, - "created_at": VOTE_210_CREATED_AT, - "id": 210, + "comments": [{"id": 20, "post_id": 2, "updated_at": COMMENT_20_UPDATED_AT}], + "next_page": ( + "https://api.example.com/community/posts/2/comments" + "?per_page=100&page=2" + ), }, + ), + # Fetch the second page of comments for post 2 + ( + "https://api.example.com/community/posts/2/comments?per_page=100&page=2", + {"comments": [{"id": 21, "post_id": 2, "updated_at": COMMENT_21_UPDATED_AT}]}, + ), + # Fetch the first page of votes for comment 20 
of post 2 + ( + f"https://api.example.com/community/posts/2/comments/20/votes" + f"?per_page=100&start_time={PARTITION_SYNC_START_TIME}", + {"votes": [{"id": 200, "comment_id": 20, "created_at": VOTE_200_CREATED_AT}]}, + ), + # Fetch the first page of votes for comment 21 of post 2 + ( + f"https://api.example.com/community/posts/2/comments/21/votes" + f"?per_page=100&start_time={PARTITION_SYNC_START_TIME}", + {"votes": [{"id": 210, "comment_id": 21, "created_at": VOTE_210_CREATED_AT}]}, + ), + # Fetch the first page of comments for post 3 + ( + "https://api.example.com/community/posts/3/comments?per_page=100", + {"comments": [{"id": 30, "post_id": 3, "updated_at": COMMENT_30_UPDATED_AT}]}, + ), + # Fetch the first page of votes for comment 30 of post 3 + ( + f"https://api.example.com/community/posts/3/comments/30/votes" + f"?per_page=100&start_time={PARTITION_SYNC_START_TIME}", { - "comment_id": 30, - "comment_updated_at": COMMENT_30_UPDATED_AT, - "created_at": str(VOTE_300_CREATED_AT_TIMESTAMP), - "id": 300, + "votes": [ + { + "id": 300, + "comment_id": 30, + "created_at": VOTE_300_CREATED_AT_TIMESTAMP, + } + ] }, - ], - ), - ], -) + ), + ], + # Expected records + [ + { + "comment_id": 10, + "comment_updated_at": COMMENT_10_UPDATED_AT, + "created_at": VOTE_100_CREATED_AT, + "id": 100, + }, + { + "comment_id": 10, + "comment_updated_at": COMMENT_10_UPDATED_AT, + "created_at": VOTE_101_CREATED_AT, + "id": 101, + }, + { + "comment_id": 11, + "comment_updated_at": COMMENT_11_UPDATED_AT, + "created_at": VOTE_111_CREATED_AT, + "id": 111, + }, + { + "comment_id": 20, + "comment_updated_at": COMMENT_20_UPDATED_AT, + "created_at": VOTE_200_CREATED_AT, + "id": 200, + }, + { + "comment_id": 21, + "comment_updated_at": COMMENT_21_UPDATED_AT, + "created_at": VOTE_210_CREATED_AT, + "id": 210, + }, + { + "comment_id": 30, + "comment_updated_at": COMMENT_30_UPDATED_AT, + "created_at": str(VOTE_300_CREATED_AT_TIMESTAMP), + "id": 300, + }, + ], + ), + ], +) @pytest.mark.parametrize( "initial_state, expected_state", [ ({"created_at": PARTITION_SYNC_START_TIME}, STATE_MIGRATION_EXPECTED_STATE), ( - { - "state": {"created_at": PARTITION_SYNC_START_TIME}, - "lookback_window": 0, - "use_global_cursor": False, - "parent_state": { - "post_comments": { - "state": {"updated_at": PARTITION_SYNC_START_TIME}, - "parent_state": {"posts": {"updated_at": PARTITION_SYNC_START_TIME}}, - "lookback_window": 0, - } - }, + { + "state": {"created_at": PARTITION_SYNC_START_TIME}, + "lookback_window": 0, + "use_global_cursor": False, + "parent_state": { + "post_comments": { + "state": {"updated_at": PARTITION_SYNC_START_TIME}, + "parent_state": {"posts": {"updated_at": PARTITION_SYNC_START_TIME}}, + "lookback_window": 0, + } }, - STATE_MIGRATION_EXPECTED_STATE, + }, + STATE_MIGRATION_EXPECTED_STATE, ), ( - { - "state": {"created_at": PARTITION_SYNC_START_TIME}, - "lookback_window": 0, - "use_global_cursor": True, - "parent_state": { - "post_comments": { - "state": {"updated_at": PARTITION_SYNC_START_TIME}, - "parent_state": {"posts": {"updated_at": PARTITION_SYNC_START_TIME}}, - "lookback_window": 0, - } - }, + { + "state": {"created_at": PARTITION_SYNC_START_TIME}, + "lookback_window": 0, + "use_global_cursor": True, + "parent_state": { + "post_comments": { + "state": {"updated_at": PARTITION_SYNC_START_TIME}, + "parent_state": {"posts": {"updated_at": PARTITION_SYNC_START_TIME}}, + "lookback_window": 0, + } }, - STATE_MIGRATION_GLOBAL_EXPECTED_STATE, + }, + STATE_MIGRATION_GLOBAL_EXPECTED_STATE, ), ( - { - "state": 
{"created_at": PARTITION_SYNC_START_TIME}, - }, - STATE_MIGRATION_EXPECTED_STATE, + { + "state": {"created_at": PARTITION_SYNC_START_TIME}, + }, + STATE_MIGRATION_EXPECTED_STATE, ), ], ids=[ @@ -1758,7 +1758,7 @@ def test_incremental_parent_state( ], ) def test_incremental_parent_state_migration( - test_name, manifest, mock_requests, expected_records, initial_state, expected_state + test_name, manifest, mock_requests, expected_records, initial_state, expected_state ): """ Test incremental partition router with parent state migration @@ -1778,101 +1778,101 @@ def test_incremental_parent_state_migration( "test_name, manifest, mock_requests, expected_records, initial_state, expected_state", [ ( - "test_incremental_parent_state", - SUBSTREAM_MANIFEST, - [ - # Fetch the first page of posts - ( - f"https://api.example.com/community/posts?per_page=100&start_time={PARENT_POSTS_CURSOR}", - { - "posts": [], - "next_page": ( - f"https://api.example.com/community/posts?per_page=100" - f"&start_time={PARENT_POSTS_CURSOR}&page=2" - ), - }, - ), - # Fetch the second page of posts - ( + "test_incremental_parent_state", + SUBSTREAM_MANIFEST, + [ + # Fetch the first page of posts + ( + f"https://api.example.com/community/posts?per_page=100&start_time={PARENT_POSTS_CURSOR}", + { + "posts": [], + "next_page": ( f"https://api.example.com/community/posts?per_page=100" - f"&start_time={PARENT_POSTS_CURSOR}&page=2", - {"posts": []}, - ), - ], - # Expected records (empty) - [], - # Initial state - { - "parent_state": { - "post_comments": { - "states": [ - { - "partition": {"id": 1, "parent_slice": {}}, - "cursor": {"updated_at": PARENT_COMMENT_CURSOR_PARTITION_1}, - } - ], - "parent_state": {"posts": {"updated_at": PARENT_POSTS_CURSOR}}, - } + f"&start_time={PARENT_POSTS_CURSOR}&page=2" + ), }, - "states": [ - { - "partition": { - "id": 10, - "parent_slice": {"id": 1, "parent_slice": {}}, - }, - "cursor": {"created_at": INITIAL_STATE_PARTITION_10_CURSOR}, + ), + # Fetch the second page of posts + ( + f"https://api.example.com/community/posts?per_page=100" + f"&start_time={PARENT_POSTS_CURSOR}&page=2", + {"posts": []}, + ), + ], + # Expected records (empty) + [], + # Initial state + { + "parent_state": { + "post_comments": { + "states": [ + { + "partition": {"id": 1, "parent_slice": {}}, + "cursor": {"updated_at": PARENT_COMMENT_CURSOR_PARTITION_1}, + } + ], + "parent_state": {"posts": {"updated_at": PARENT_POSTS_CURSOR}}, + } + }, + "states": [ + { + "partition": { + "id": 10, + "parent_slice": {"id": 1, "parent_slice": {}}, }, - { - "partition": { - "id": 11, - "parent_slice": {"id": 1, "parent_slice": {}}, - }, - "cursor": {"created_at": INITIAL_STATE_PARTITION_11_CURSOR}, + "cursor": {"created_at": INITIAL_STATE_PARTITION_10_CURSOR}, + }, + { + "partition": { + "id": 11, + "parent_slice": {"id": 1, "parent_slice": {}}, }, - ], - "state": {"created_at": INITIAL_GLOBAL_CURSOR}, - "lookback_window": 1, - }, - # Expected state - { - "parent_state": { - "post_comments": { - "states": [ - { - "partition": {"id": 1, "parent_slice": {}}, - "cursor": {"updated_at": PARENT_COMMENT_CURSOR_PARTITION_1}, - } - ], - "state": {}, - "use_global_cursor": False, - "parent_state": {"posts": {"updated_at": PARENT_POSTS_CURSOR}}, - } + "cursor": {"created_at": INITIAL_STATE_PARTITION_11_CURSOR}, }, - "states": [ - { - "partition": { - "id": 10, - "parent_slice": {"id": 1, "parent_slice": {}}, - }, - "cursor": {"created_at": INITIAL_STATE_PARTITION_10_CURSOR}, + ], + "state": {"created_at": INITIAL_GLOBAL_CURSOR}, + 
"lookback_window": 1, + }, + # Expected state + { + "parent_state": { + "post_comments": { + "states": [ + { + "partition": {"id": 1, "parent_slice": {}}, + "cursor": {"updated_at": PARENT_COMMENT_CURSOR_PARTITION_1}, + } + ], + "state": {}, + "use_global_cursor": False, + "parent_state": {"posts": {"updated_at": PARENT_POSTS_CURSOR}}, + } + }, + "states": [ + { + "partition": { + "id": 10, + "parent_slice": {"id": 1, "parent_slice": {}}, }, - { - "partition": { - "id": 11, - "parent_slice": {"id": 1, "parent_slice": {}}, - }, - "cursor": {"created_at": INITIAL_STATE_PARTITION_11_CURSOR}, + "cursor": {"created_at": INITIAL_STATE_PARTITION_10_CURSOR}, + }, + { + "partition": { + "id": 11, + "parent_slice": {"id": 1, "parent_slice": {}}, }, - ], - "state": {"created_at": INITIAL_GLOBAL_CURSOR}, - "lookback_window": 1, - "use_global_cursor": False, - }, + "cursor": {"created_at": INITIAL_STATE_PARTITION_11_CURSOR}, + }, + ], + "state": {"created_at": INITIAL_GLOBAL_CURSOR}, + "lookback_window": 1, + "use_global_cursor": False, + }, ), ], ) def test_incremental_parent_state_no_slices( - test_name, manifest, mock_requests, expected_records, initial_state, expected_state + test_name, manifest, mock_requests, expected_records, initial_state, expected_state ): """ Test incremental partition router with no parent records @@ -1892,217 +1892,217 @@ def test_incremental_parent_state_no_slices( "test_name, manifest, mock_requests, expected_records, initial_state, expected_state", [ ( - "test_incremental_parent_state", - SUBSTREAM_MANIFEST, - [ - # Fetch the first page of posts - ( - f"https://api.example.com/community/posts?per_page=100&start_time={PARENT_POSTS_CURSOR}", - { - "posts": [ - {"id": 1, "updated_at": POST_1_UPDATED_AT}, - {"id": 2, "updated_at": POST_2_UPDATED_AT}, - ], - "next_page": ( - f"https://api.example.com/community/posts?per_page=100" - f"&start_time={PARENT_POSTS_CURSOR}&page=2" - ), - }, - ), - # Fetch the second page of posts - ( + "test_incremental_parent_state", + SUBSTREAM_MANIFEST, + [ + # Fetch the first page of posts + ( + f"https://api.example.com/community/posts?per_page=100&start_time={PARENT_POSTS_CURSOR}", + { + "posts": [ + {"id": 1, "updated_at": POST_1_UPDATED_AT}, + {"id": 2, "updated_at": POST_2_UPDATED_AT}, + ], + "next_page": ( f"https://api.example.com/community/posts?per_page=100" - f"&start_time={PARENT_POSTS_CURSOR}&page=2", - {"posts": [{"id": 3, "updated_at": POST_3_UPDATED_AT}]}, - ), - # Fetch the first page of comments for post 1 - ( - "https://api.example.com/community/posts/1/comments?per_page=100", - { - "comments": [ - {"id": 9, "post_id": 1, "updated_at": COMMENT_9_OLDEST}, - {"id": 10, "post_id": 1, "updated_at": COMMENT_10_UPDATED_AT}, - {"id": 11, "post_id": 1, "updated_at": COMMENT_11_UPDATED_AT}, - ], - "next_page": ( - "https://api.example.com/community/posts/1/comments?per_page=100&page=2" - ), - }, - ), - # Fetch the second page of comments for post 1 - ( - "https://api.example.com/community/posts/1/comments?per_page=100&page=2", - {"comments": [{"id": 12, "post_id": 1, "updated_at": COMMENT_12_UPDATED_AT}]}, - ), - # Fetch the first page of votes for comment 10 of post 1 - ( - f"https://api.example.com/community/posts/1/comments/10/votes" - f"?per_page=100&start_time={INITIAL_STATE_PARTITION_10_CURSOR}", - { - "votes": [], - "next_page": ( - f"https://api.example.com/community/posts/1/comments/10/votes" - f"?per_page=100&page=2&start_time={INITIAL_STATE_PARTITION_10_CURSOR}" - ), - }, - ), - # Fetch the second page of votes for comment 
10 of post 1 - ( + f"&start_time={PARENT_POSTS_CURSOR}&page=2" + ), + }, + ), + # Fetch the second page of posts + ( + f"https://api.example.com/community/posts?per_page=100" + f"&start_time={PARENT_POSTS_CURSOR}&page=2", + {"posts": [{"id": 3, "updated_at": POST_3_UPDATED_AT}]}, + ), + # Fetch the first page of comments for post 1 + ( + "https://api.example.com/community/posts/1/comments?per_page=100", + { + "comments": [ + {"id": 9, "post_id": 1, "updated_at": COMMENT_9_OLDEST}, + {"id": 10, "post_id": 1, "updated_at": COMMENT_10_UPDATED_AT}, + {"id": 11, "post_id": 1, "updated_at": COMMENT_11_UPDATED_AT}, + ], + "next_page": ( + "https://api.example.com/community/posts/1/comments?per_page=100&page=2" + ), + }, + ), + # Fetch the second page of comments for post 1 + ( + "https://api.example.com/community/posts/1/comments?per_page=100&page=2", + {"comments": [{"id": 12, "post_id": 1, "updated_at": COMMENT_12_UPDATED_AT}]}, + ), + # Fetch the first page of votes for comment 10 of post 1 + ( + f"https://api.example.com/community/posts/1/comments/10/votes" + f"?per_page=100&start_time={INITIAL_STATE_PARTITION_10_CURSOR}", + { + "votes": [], + "next_page": ( f"https://api.example.com/community/posts/1/comments/10/votes" - f"?per_page=100&page=2&start_time={INITIAL_STATE_PARTITION_10_CURSOR}", - {"votes": []}, - ), - # Fetch the first page of votes for comment 11 of post 1 - ( - f"https://api.example.com/community/posts/1/comments/11/votes" - f"?per_page=100&start_time={INITIAL_STATE_PARTITION_11_CURSOR}", - {"votes": []}, - ), - # Fetch the first page of votes for comment 12 of post 1 - ( - f"https://api.example.com/community/posts/1/comments/12/votes" - f"?per_page=100&start_time={INITIAL_STATE_PARTITION_11_CURSOR}", - {"votes": []}, - ), - # Fetch the first page of comments for post 2 - ( - "https://api.example.com/community/posts/2/comments?per_page=100", - { - "comments": [{"id": 20, "post_id": 2, "updated_at": COMMENT_20_UPDATED_AT}], - "next_page": ( - "https://api.example.com/community/posts/2/comments?per_page=100&page=2" - ), - }, - ), - # Fetch the second page of comments for post 2 - ( - "https://api.example.com/community/posts/2/comments?per_page=100&page=2", - {"comments": [{"id": 21, "post_id": 2, "updated_at": COMMENT_21_UPDATED_AT}]}, - ), - # Fetch the first page of votes for comment 20 of post 2 - ( - f"https://api.example.com/community/posts/2/comments/20/votes" - f"?per_page=100&start_time={INITIAL_STATE_PARTITION_11_CURSOR}", - {"votes": []}, - ), - # Fetch the first page of votes for comment 21 of post 2 - ( - f"https://api.example.com/community/posts/2/comments/21/votes" - f"?per_page=100&start_time={INITIAL_STATE_PARTITION_11_CURSOR}", - {"votes": []}, - ), - # Fetch the first page of comments for post 3 - ( - "https://api.example.com/community/posts/3/comments?per_page=100", - {"comments": [{"id": 30, "post_id": 3, "updated_at": COMMENT_30_UPDATED_AT}]}, - ), - # Fetch the first page of votes for comment 30 of post 3 - ( - f"https://api.example.com/community/posts/3/comments/30/votes" - f"?per_page=100&start_time={INITIAL_STATE_PARTITION_11_CURSOR}", - {"votes": []}, - ), - ], - # Expected records - [], - # Initial state - { - "parent_state": { - "post_comments": { - "states": [ - { - "partition": {"id": 1, "parent_slice": {}}, - "cursor": {"updated_at": PARENT_COMMENT_CURSOR_PARTITION_1}, - } - ], - "parent_state": {"posts": {"updated_at": PARENT_POSTS_CURSOR}}, - } + f"?per_page=100&page=2&start_time={INITIAL_STATE_PARTITION_10_CURSOR}" + ), }, - "states": [ - { - 
"partition": { - "id": 10, - "parent_slice": {"id": 1, "parent_slice": {}}, - }, - "cursor": {"created_at": INITIAL_STATE_PARTITION_10_CURSOR}, - }, - { - "partition": { - "id": 11, - "parent_slice": {"id": 1, "parent_slice": {}}, - }, - "cursor": {"created_at": INITIAL_STATE_PARTITION_11_CURSOR}, - }, - ], - "use_global_cursor": False, - "state": {"created_at": INITIAL_STATE_PARTITION_11_CURSOR}, - "lookback_window": 0, + ), + # Fetch the second page of votes for comment 10 of post 1 + ( + f"https://api.example.com/community/posts/1/comments/10/votes" + f"?per_page=100&page=2&start_time={INITIAL_STATE_PARTITION_10_CURSOR}", + {"votes": []}, + ), + # Fetch the first page of votes for comment 11 of post 1 + ( + f"https://api.example.com/community/posts/1/comments/11/votes" + f"?per_page=100&start_time={INITIAL_STATE_PARTITION_11_CURSOR}", + {"votes": []}, + ), + # Fetch the first page of votes for comment 12 of post 1 + ( + f"https://api.example.com/community/posts/1/comments/12/votes" + f"?per_page=100&start_time={INITIAL_STATE_PARTITION_11_CURSOR}", + {"votes": []}, + ), + # Fetch the first page of comments for post 2 + ( + "https://api.example.com/community/posts/2/comments?per_page=100", + { + "comments": [{"id": 20, "post_id": 2, "updated_at": COMMENT_20_UPDATED_AT}], + "next_page": ( + "https://api.example.com/community/posts/2/comments?per_page=100&page=2" + ), + }, + ), + # Fetch the second page of comments for post 2 + ( + "https://api.example.com/community/posts/2/comments?per_page=100&page=2", + {"comments": [{"id": 21, "post_id": 2, "updated_at": COMMENT_21_UPDATED_AT}]}, + ), + # Fetch the first page of votes for comment 20 of post 2 + ( + f"https://api.example.com/community/posts/2/comments/20/votes" + f"?per_page=100&start_time={INITIAL_STATE_PARTITION_11_CURSOR}", + {"votes": []}, + ), + # Fetch the first page of votes for comment 21 of post 2 + ( + f"https://api.example.com/community/posts/2/comments/21/votes" + f"?per_page=100&start_time={INITIAL_STATE_PARTITION_11_CURSOR}", + {"votes": []}, + ), + # Fetch the first page of comments for post 3 + ( + "https://api.example.com/community/posts/3/comments?per_page=100", + {"comments": [{"id": 30, "post_id": 3, "updated_at": COMMENT_30_UPDATED_AT}]}, + ), + # Fetch the first page of votes for comment 30 of post 3 + ( + f"https://api.example.com/community/posts/3/comments/30/votes" + f"?per_page=100&start_time={INITIAL_STATE_PARTITION_11_CURSOR}", + {"votes": []}, + ), + ], + # Expected records + [], + # Initial state + { + "parent_state": { + "post_comments": { + "states": [ + { + "partition": {"id": 1, "parent_slice": {}}, + "cursor": {"updated_at": PARENT_COMMENT_CURSOR_PARTITION_1}, + } + ], + "parent_state": {"posts": {"updated_at": PARENT_POSTS_CURSOR}}, + } }, - # Expected state - { - "lookback_window": 1, - "use_global_cursor": False, - "state": {"created_at": INITIAL_STATE_PARTITION_11_CURSOR}, - "states": [ - { - "partition": { - "id": 10, - "parent_slice": {"id": 1, "parent_slice": {}}, - }, - "cursor": {"created_at": INITIAL_STATE_PARTITION_10_CURSOR}, - }, - { - "partition": { - "id": 11, - "parent_slice": {"id": 1, "parent_slice": {}}, - }, - "cursor": {"created_at": INITIAL_STATE_PARTITION_11_CURSOR}, - }, - { - "partition": {"id": 12, "parent_slice": {"id": 1, "parent_slice": {}}}, - "cursor": {"created_at": INITIAL_STATE_PARTITION_11_CURSOR}, + "states": [ + { + "partition": { + "id": 10, + "parent_slice": {"id": 1, "parent_slice": {}}, }, - { - "partition": {"id": 20, "parent_slice": {"id": 2, "parent_slice": 
{}}}, - "cursor": {"created_at": INITIAL_STATE_PARTITION_11_CURSOR}, + "cursor": {"created_at": INITIAL_STATE_PARTITION_10_CURSOR}, + }, + { + "partition": { + "id": 11, + "parent_slice": {"id": 1, "parent_slice": {}}, }, - { - "partition": {"id": 21, "parent_slice": {"id": 2, "parent_slice": {}}}, - "cursor": {"created_at": INITIAL_STATE_PARTITION_11_CURSOR}, + "cursor": {"created_at": INITIAL_STATE_PARTITION_11_CURSOR}, + }, + ], + "use_global_cursor": False, + "state": {"created_at": INITIAL_STATE_PARTITION_11_CURSOR}, + "lookback_window": 0, + }, + # Expected state + { + "lookback_window": 1, + "use_global_cursor": False, + "state": {"created_at": INITIAL_STATE_PARTITION_11_CURSOR}, + "states": [ + { + "partition": { + "id": 10, + "parent_slice": {"id": 1, "parent_slice": {}}, }, - { - "partition": {"id": 30, "parent_slice": {"id": 3, "parent_slice": {}}}, - "cursor": {"created_at": INITIAL_STATE_PARTITION_11_CURSOR}, + "cursor": {"created_at": INITIAL_STATE_PARTITION_10_CURSOR}, + }, + { + "partition": { + "id": 11, + "parent_slice": {"id": 1, "parent_slice": {}}, }, - ], - "parent_state": { - "post_comments": { - "use_global_cursor": False, - "state": {"updated_at": COMMENT_10_UPDATED_AT}, - "parent_state": {"posts": {"updated_at": POST_1_UPDATED_AT}}, - "lookback_window": 1, - "states": [ - { - "partition": {"id": 1, "parent_slice": {}}, - "cursor": {"updated_at": COMMENT_10_UPDATED_AT}, - }, - { - "partition": {"id": 2, "parent_slice": {}}, - "cursor": {"updated_at": COMMENT_20_UPDATED_AT}, - }, - { - "partition": {"id": 3, "parent_slice": {}}, - "cursor": {"updated_at": COMMENT_30_UPDATED_AT}, - }, - ], - } + "cursor": {"created_at": INITIAL_STATE_PARTITION_11_CURSOR}, + }, + { + "partition": {"id": 12, "parent_slice": {"id": 1, "parent_slice": {}}}, + "cursor": {"created_at": INITIAL_STATE_PARTITION_11_CURSOR}, }, + { + "partition": {"id": 20, "parent_slice": {"id": 2, "parent_slice": {}}}, + "cursor": {"created_at": INITIAL_STATE_PARTITION_11_CURSOR}, + }, + { + "partition": {"id": 21, "parent_slice": {"id": 2, "parent_slice": {}}}, + "cursor": {"created_at": INITIAL_STATE_PARTITION_11_CURSOR}, + }, + { + "partition": {"id": 30, "parent_slice": {"id": 3, "parent_slice": {}}}, + "cursor": {"created_at": INITIAL_STATE_PARTITION_11_CURSOR}, + }, + ], + "parent_state": { + "post_comments": { + "use_global_cursor": False, + "state": {"updated_at": COMMENT_10_UPDATED_AT}, + "parent_state": {"posts": {"updated_at": POST_1_UPDATED_AT}}, + "lookback_window": 1, + "states": [ + { + "partition": {"id": 1, "parent_slice": {}}, + "cursor": {"updated_at": COMMENT_10_UPDATED_AT}, + }, + { + "partition": {"id": 2, "parent_slice": {}}, + "cursor": {"updated_at": COMMENT_20_UPDATED_AT}, + }, + { + "partition": {"id": 3, "parent_slice": {}}, + "cursor": {"updated_at": COMMENT_30_UPDATED_AT}, + }, + ], + } }, + }, ), ], ) def test_incremental_parent_state_no_records( - test_name, manifest, mock_requests, expected_records, initial_state, expected_state + test_name, manifest, mock_requests, expected_records, initial_state, expected_state ): """ Test incremental partition router with no child records @@ -2122,238 +2122,238 @@ def test_incremental_parent_state_no_records( "test_name, manifest, mock_requests, expected_records, initial_state, expected_state", [ ( - "test_incremental_parent_state", - SUBSTREAM_MANIFEST, - [ - # Fetch the first page of posts - ( - f"https://api.example.com/community/posts?per_page=100&start_time={PARENT_POSTS_CURSOR}", - { - "posts": [ - {"id": 1, "updated_at": 
POST_1_UPDATED_AT}, - {"id": 2, "updated_at": POST_2_UPDATED_AT}, - ], - "next_page": ( - f"https://api.example.com/community/posts?per_page=100&start_time={PARENT_POSTS_CURSOR}&page=2" - ), - }, - ), - # Fetch the second page of posts - ( - f"https://api.example.com/community/posts?per_page=100&start_time={PARENT_POSTS_CURSOR}&page=2", - {"posts": [{"id": 3, "updated_at": POST_3_UPDATED_AT}]}, - ), - # Fetch the first page of comments for post 1 - ( - "https://api.example.com/community/posts/1/comments?per_page=100", - { - "comments": [ - {"id": 9, "post_id": 1, "updated_at": COMMENT_9_OLDEST}, - {"id": 10, "post_id": 1, "updated_at": COMMENT_10_UPDATED_AT}, - {"id": 11, "post_id": 1, "updated_at": COMMENT_11_UPDATED_AT}, - ], - "next_page": ( - "https://api.example.com/community/posts/1/comments?per_page=100&page=2" - ), - }, - ), - # Fetch the second page of comments for post 1 - ( - "https://api.example.com/community/posts/1/comments?per_page=100&page=2", - {"comments": [{"id": 12, "post_id": 1, "updated_at": COMMENT_12_UPDATED_AT}]}, - ), - # Fetch the first page of votes for comment 10 of post 1 - ( - f"https://api.example.com/community/posts/1/comments/10/votes" - f"?per_page=100&start_time={INITIAL_STATE_PARTITION_10_CURSOR}", - { - "votes": [{"id": 100, "comment_id": 10, "created_at": VOTE_100_CREATED_AT}], - "next_page": ( - f"https://api.example.com/community/posts/1/comments/10/votes" - f"?per_page=100&page=2&start_time={INITIAL_STATE_PARTITION_10_CURSOR}" - ), - }, - ), - # Fetch the second page of votes for comment 10 of post 1 - ( - f"https://api.example.com/community/posts/1/comments/10/votes" - f"?per_page=100&page=2&start_time={INITIAL_STATE_PARTITION_10_CURSOR}", - {"votes": [{"id": 101, "comment_id": 10, "created_at": VOTE_101_CREATED_AT}]}, - ), - # Fetch the first page of votes for comment 11 of post 1 - ( - f"https://api.example.com/community/posts/1/comments/11/votes" - f"?per_page=100&start_time={INITIAL_STATE_PARTITION_11_CURSOR}", - {"votes": [{"id": 111, "comment_id": 11, "created_at": VOTE_111_CREATED_AT}]}, - ), - # Fetch the first page of votes for comment 12 of post 1 - ( - f"https://api.example.com/community/posts/1/comments/12/votes?per_page=100&start_time={LOOKBACK_DATE}", - {"votes": []}, - ), - # Fetch the first page of comments for post 2 - ( - "https://api.example.com/community/posts/2/comments?per_page=100", - { - "comments": [{"id": 20, "post_id": 2, "updated_at": COMMENT_20_UPDATED_AT}], - "next_page": ( - "https://api.example.com/community/posts/2/comments?per_page=100&page=2" - ), - }, - ), - # Fetch the second page of comments for post 2 - ( - "https://api.example.com/community/posts/2/comments?per_page=100&page=2", - {"comments": [{"id": 21, "post_id": 2, "updated_at": COMMENT_21_UPDATED_AT}]}, - ), - # Fetch the first page of votes for comment 20 of post 2 - 404 error - ( - f"https://api.example.com/community/posts/2/comments/20/votes" - f"?per_page=100&start_time={LOOKBACK_DATE}", - None, - ), - # Fetch the first page of votes for comment 21 of post 2 - ( - f"https://api.example.com/community/posts/2/comments/21/votes" - f"?per_page=100&start_time={LOOKBACK_DATE}", - {"votes": [{"id": 210, "comment_id": 21, "created_at": VOTE_210_CREATED_AT}]}, - ), - # Fetch the first page of comments for post 3 - ( - "https://api.example.com/community/posts/3/comments?per_page=100", - {"comments": [{"id": 30, "post_id": 3, "updated_at": COMMENT_30_UPDATED_AT}]}, - ), - # Fetch the first page of votes for comment 30 of post 3 - ( - 
f"https://api.example.com/community/posts/3/comments/30/votes" - f"?per_page=100&start_time={LOOKBACK_DATE}", - { - "votes": [ - { - "id": 300, - "comment_id": 30, - "created_at": VOTE_300_CREATED_AT_TIMESTAMP, - } - ] - }, - ), - ], - # Expected records - [ + "test_incremental_parent_state", + SUBSTREAM_MANIFEST, + [ + # Fetch the first page of posts + ( + f"https://api.example.com/community/posts?per_page=100&start_time={PARENT_POSTS_CURSOR}", { - "comment_id": 10, - "comment_updated_at": COMMENT_10_UPDATED_AT, - "created_at": VOTE_100_CREATED_AT, - "id": 100, + "posts": [ + {"id": 1, "updated_at": POST_1_UPDATED_AT}, + {"id": 2, "updated_at": POST_2_UPDATED_AT}, + ], + "next_page": ( + f"https://api.example.com/community/posts?per_page=100&start_time={PARENT_POSTS_CURSOR}&page=2" + ), }, + ), + # Fetch the second page of posts + ( + f"https://api.example.com/community/posts?per_page=100&start_time={PARENT_POSTS_CURSOR}&page=2", + {"posts": [{"id": 3, "updated_at": POST_3_UPDATED_AT}]}, + ), + # Fetch the first page of comments for post 1 + ( + "https://api.example.com/community/posts/1/comments?per_page=100", { - "comment_id": 10, - "comment_updated_at": COMMENT_10_UPDATED_AT, - "created_at": VOTE_101_CREATED_AT, - "id": 101, + "comments": [ + {"id": 9, "post_id": 1, "updated_at": COMMENT_9_OLDEST}, + {"id": 10, "post_id": 1, "updated_at": COMMENT_10_UPDATED_AT}, + {"id": 11, "post_id": 1, "updated_at": COMMENT_11_UPDATED_AT}, + ], + "next_page": ( + "https://api.example.com/community/posts/1/comments?per_page=100&page=2" + ), }, + ), + # Fetch the second page of comments for post 1 + ( + "https://api.example.com/community/posts/1/comments?per_page=100&page=2", + {"comments": [{"id": 12, "post_id": 1, "updated_at": COMMENT_12_UPDATED_AT}]}, + ), + # Fetch the first page of votes for comment 10 of post 1 + ( + f"https://api.example.com/community/posts/1/comments/10/votes" + f"?per_page=100&start_time={INITIAL_STATE_PARTITION_10_CURSOR}", { - "comment_id": 11, - "comment_updated_at": COMMENT_11_UPDATED_AT, - "created_at": VOTE_111_CREATED_AT, - "id": 111, + "votes": [{"id": 100, "comment_id": 10, "created_at": VOTE_100_CREATED_AT}], + "next_page": ( + f"https://api.example.com/community/posts/1/comments/10/votes" + f"?per_page=100&page=2&start_time={INITIAL_STATE_PARTITION_10_CURSOR}" + ), }, + ), + # Fetch the second page of votes for comment 10 of post 1 + ( + f"https://api.example.com/community/posts/1/comments/10/votes" + f"?per_page=100&page=2&start_time={INITIAL_STATE_PARTITION_10_CURSOR}", + {"votes": [{"id": 101, "comment_id": 10, "created_at": VOTE_101_CREATED_AT}]}, + ), + # Fetch the first page of votes for comment 11 of post 1 + ( + f"https://api.example.com/community/posts/1/comments/11/votes" + f"?per_page=100&start_time={INITIAL_STATE_PARTITION_11_CURSOR}", + {"votes": [{"id": 111, "comment_id": 11, "created_at": VOTE_111_CREATED_AT}]}, + ), + # Fetch the first page of votes for comment 12 of post 1 + ( + f"https://api.example.com/community/posts/1/comments/12/votes?per_page=100&start_time={LOOKBACK_DATE}", + {"votes": []}, + ), + # Fetch the first page of comments for post 2 + ( + "https://api.example.com/community/posts/2/comments?per_page=100", { - "comment_id": 21, - "comment_updated_at": COMMENT_21_UPDATED_AT, - "created_at": VOTE_210_CREATED_AT, - "id": 210, + "comments": [{"id": 20, "post_id": 2, "updated_at": COMMENT_20_UPDATED_AT}], + "next_page": ( + "https://api.example.com/community/posts/2/comments?per_page=100&page=2" + ), }, + ), + # Fetch the second page of 
comments for post 2 + ( + "https://api.example.com/community/posts/2/comments?per_page=100&page=2", + {"comments": [{"id": 21, "post_id": 2, "updated_at": COMMENT_21_UPDATED_AT}]}, + ), + # Fetch the first page of votes for comment 20 of post 2 - 404 error + ( + f"https://api.example.com/community/posts/2/comments/20/votes" + f"?per_page=100&start_time={LOOKBACK_DATE}", + None, + ), + # Fetch the first page of votes for comment 21 of post 2 + ( + f"https://api.example.com/community/posts/2/comments/21/votes" + f"?per_page=100&start_time={LOOKBACK_DATE}", + {"votes": [{"id": 210, "comment_id": 21, "created_at": VOTE_210_CREATED_AT}]}, + ), + # Fetch the first page of comments for post 3 + ( + "https://api.example.com/community/posts/3/comments?per_page=100", + {"comments": [{"id": 30, "post_id": 3, "updated_at": COMMENT_30_UPDATED_AT}]}, + ), + # Fetch the first page of votes for comment 30 of post 3 + ( + f"https://api.example.com/community/posts/3/comments/30/votes" + f"?per_page=100&start_time={LOOKBACK_DATE}", { - "comment_id": 30, - "comment_updated_at": COMMENT_30_UPDATED_AT, - "created_at": str(VOTE_300_CREATED_AT_TIMESTAMP), - "id": 300, + "votes": [ + { + "id": 300, + "comment_id": 30, + "created_at": VOTE_300_CREATED_AT_TIMESTAMP, + } + ] }, - ], - # Initial state + ), + ], + # Expected records + [ { - "parent_state": { - "post_comments": { - "states": [ - { - "partition": {"id": 1, "parent_slice": {}}, - "cursor": {"updated_at": PARENT_COMMENT_CURSOR_PARTITION_1}, - } - ], - "parent_state": {"posts": {"updated_at": PARENT_POSTS_CURSOR}}, - } - }, - "state": {"created_at": INITIAL_STATE_PARTITION_11_CURSOR}, - "lookback_window": 86400, - "states": [ - { - "partition": { - "id": 10, - "parent_slice": {"id": 1, "parent_slice": {}}, - }, - "cursor": {"created_at": INITIAL_STATE_PARTITION_10_CURSOR}, - }, - { - "partition": { - "id": 11, - "parent_slice": {"id": 1, "parent_slice": {}}, - }, - "cursor": {"created_at": INITIAL_STATE_PARTITION_11_CURSOR}, - }, - ], + "comment_id": 10, + "comment_updated_at": COMMENT_10_UPDATED_AT, + "created_at": VOTE_100_CREATED_AT, + "id": 100, }, - # Expected state { - # The global state, lookback window and the parent state are the same because sync failed for comment 20 - "parent_state": { - "post_comments": { - "states": [ - { - "partition": {"id": 1, "parent_slice": {}}, - "cursor": {"updated_at": PARENT_COMMENT_CURSOR_PARTITION_1}, - } - ], - "state": {}, - "use_global_cursor": False, - "parent_state": {"posts": {"updated_at": PARENT_POSTS_CURSOR}}, - } - }, - "state": {"created_at": INITIAL_STATE_PARTITION_11_CURSOR}, - "lookback_window": 86400, - "use_global_cursor": False, - "states": [ - { - "partition": {"id": 10, "parent_slice": {"id": 1, "parent_slice": {}}}, - "cursor": {"created_at": VOTE_100_CREATED_AT}, - }, - { - "partition": {"id": 11, "parent_slice": {"id": 1, "parent_slice": {}}}, - "cursor": {"created_at": VOTE_111_CREATED_AT}, - }, - { - "partition": {"id": 12, "parent_slice": {"id": 1, "parent_slice": {}}}, - "cursor": {"created_at": LOOKBACK_DATE}, - }, - { - "partition": {"id": 20, "parent_slice": {"id": 2, "parent_slice": {}}}, - "cursor": {"created_at": LOOKBACK_DATE}, - }, - { - "partition": {"id": 21, "parent_slice": {"id": 2, "parent_slice": {}}}, - "cursor": {"created_at": VOTE_210_CREATED_AT}, + "comment_id": 10, + "comment_updated_at": COMMENT_10_UPDATED_AT, + "created_at": VOTE_101_CREATED_AT, + "id": 101, + }, + { + "comment_id": 11, + "comment_updated_at": COMMENT_11_UPDATED_AT, + "created_at": VOTE_111_CREATED_AT, 
+ "id": 111, + }, + { + "comment_id": 21, + "comment_updated_at": COMMENT_21_UPDATED_AT, + "created_at": VOTE_210_CREATED_AT, + "id": 210, + }, + { + "comment_id": 30, + "comment_updated_at": COMMENT_30_UPDATED_AT, + "created_at": str(VOTE_300_CREATED_AT_TIMESTAMP), + "id": 300, + }, + ], + # Initial state + { + "parent_state": { + "post_comments": { + "states": [ + { + "partition": {"id": 1, "parent_slice": {}}, + "cursor": {"updated_at": PARENT_COMMENT_CURSOR_PARTITION_1}, + } + ], + "parent_state": {"posts": {"updated_at": PARENT_POSTS_CURSOR}}, + } + }, + "state": {"created_at": INITIAL_STATE_PARTITION_11_CURSOR}, + "lookback_window": 86400, + "states": [ + { + "partition": { + "id": 10, + "parent_slice": {"id": 1, "parent_slice": {}}, }, - { - "partition": {"id": 30, "parent_slice": {"id": 3, "parent_slice": {}}}, - "cursor": {"created_at": VOTE_300_CREATED_AT}, + "cursor": {"created_at": INITIAL_STATE_PARTITION_10_CURSOR}, + }, + { + "partition": { + "id": 11, + "parent_slice": {"id": 1, "parent_slice": {}}, }, - ], + "cursor": {"created_at": INITIAL_STATE_PARTITION_11_CURSOR}, + }, + ], + }, + # Expected state + { + # The global state, lookback window and the parent state are the same because sync failed for comment 20 + "parent_state": { + "post_comments": { + "states": [ + { + "partition": {"id": 1, "parent_slice": {}}, + "cursor": {"updated_at": PARENT_COMMENT_CURSOR_PARTITION_1}, + } + ], + "state": {}, + "use_global_cursor": False, + "parent_state": {"posts": {"updated_at": PARENT_POSTS_CURSOR}}, + } }, + "state": {"created_at": INITIAL_STATE_PARTITION_11_CURSOR}, + "lookback_window": 86400, + "use_global_cursor": False, + "states": [ + { + "partition": {"id": 10, "parent_slice": {"id": 1, "parent_slice": {}}}, + "cursor": {"created_at": VOTE_100_CREATED_AT}, + }, + { + "partition": {"id": 11, "parent_slice": {"id": 1, "parent_slice": {}}}, + "cursor": {"created_at": VOTE_111_CREATED_AT}, + }, + { + "partition": {"id": 12, "parent_slice": {"id": 1, "parent_slice": {}}}, + "cursor": {"created_at": LOOKBACK_DATE}, + }, + { + "partition": {"id": 20, "parent_slice": {"id": 2, "parent_slice": {}}}, + "cursor": {"created_at": LOOKBACK_DATE}, + }, + { + "partition": {"id": 21, "parent_slice": {"id": 2, "parent_slice": {}}}, + "cursor": {"created_at": VOTE_210_CREATED_AT}, + }, + { + "partition": {"id": 30, "parent_slice": {"id": 3, "parent_slice": {}}}, + "cursor": {"created_at": VOTE_300_CREATED_AT}, + }, + ], + }, ), ], ) def test_incremental_substream_error( - test_name, manifest, mock_requests, expected_records, initial_state, expected_state + test_name, manifest, mock_requests, expected_records, initial_state, expected_state ): run_mocked_test( mock_requests, @@ -2524,85 +2524,85 @@ def test_incremental_substream_error( "test_name, manifest, mock_requests, expected_records, initial_state, expected_state", [ ( - "test_incremental_parent_state", - LISTPARTITION_MANIFEST, - [ - # Fetch the first page of comments for post 1 - ( - "https://api.example.com/community/posts/1/comments?per_page=100&start_time=2024-01-24T00:00:00Z", - { - "comments": [ - {"id": 9, "post_id": 1, "updated_at": "2023-01-01T00:00:00Z"}, - {"id": 10, "post_id": 1, "updated_at": "2024-01-25T00:00:00Z"}, - {"id": 11, "post_id": 1, "updated_at": "2024-01-24T00:00:00Z"}, - ], - "next_page": "https://api.example.com/community/posts/1/comments?per_page=100&page=2&start_time=2024-01-24T00:00:00Z", - }, - ), - # Fetch the second page of comments for post 1 - ( - 
"https://api.example.com/community/posts/1/comments?per_page=100&page=2&start_time=2024-01-24T00:00:00Z", - {"comments": [{"id": 12, "post_id": 1, "updated_at": "2024-01-23T00:00:00Z"}]}, - ), - # Fetch the first page of comments for post 2 - ( - "https://api.example.com/community/posts/2/comments?per_page=100&start_time=2024-01-21T05:00:00Z", - { - "comments": [ - {"id": 20, "post_id": 2, "updated_at": "2024-01-22T00:00:00Z"} - ], - "next_page": "https://api.example.com/community/posts/2/comments?per_page=100&page=2&start_time=2024-01-21T05:00:00Z", - }, - ), - # Fetch the second page of comments for post 2 - ( - "https://api.example.com/community/posts/2/comments?per_page=100&page=2&start_time=2024-01-21T05:00:00Z", - {"comments": [{"id": 21, "post_id": 2, "updated_at": "2024-01-21T00:00:00Z"}]}, - ), - # Fetch the first page of comments for post 3 - ( - "https://api.example.com/community/posts/3/comments?per_page=100&start_time=2024-01-08T00:00:00Z", - {"comments": [{"id": 30, "post_id": 3, "updated_at": "2024-01-09T00:00:00Z"}]}, - ), + "test_incremental_parent_state", + LISTPARTITION_MANIFEST, + [ + # Fetch the first page of comments for post 1 + ( + "https://api.example.com/community/posts/1/comments?per_page=100&start_time=2024-01-24T00:00:00Z", + { + "comments": [ + {"id": 9, "post_id": 1, "updated_at": "2023-01-01T00:00:00Z"}, + {"id": 10, "post_id": 1, "updated_at": "2024-01-25T00:00:00Z"}, + {"id": 11, "post_id": 1, "updated_at": "2024-01-24T00:00:00Z"}, + ], + "next_page": "https://api.example.com/community/posts/1/comments?per_page=100&page=2&start_time=2024-01-24T00:00:00Z", + }, + ), + # Fetch the second page of comments for post 1 + ( + "https://api.example.com/community/posts/1/comments?per_page=100&page=2&start_time=2024-01-24T00:00:00Z", + {"comments": [{"id": 12, "post_id": 1, "updated_at": "2024-01-23T00:00:00Z"}]}, + ), + # Fetch the first page of comments for post 2 + ( + "https://api.example.com/community/posts/2/comments?per_page=100&start_time=2024-01-21T05:00:00Z", + { + "comments": [ + {"id": 20, "post_id": 2, "updated_at": "2024-01-22T00:00:00Z"} + ], + "next_page": "https://api.example.com/community/posts/2/comments?per_page=100&page=2&start_time=2024-01-21T05:00:00Z", + }, + ), + # Fetch the second page of comments for post 2 + ( + "https://api.example.com/community/posts/2/comments?per_page=100&page=2&start_time=2024-01-21T05:00:00Z", + {"comments": [{"id": 21, "post_id": 2, "updated_at": "2024-01-21T00:00:00Z"}]}, + ), + # Fetch the first page of comments for post 3 + ( + "https://api.example.com/community/posts/3/comments?per_page=100&start_time=2024-01-08T00:00:00Z", + {"comments": [{"id": 30, "post_id": 3, "updated_at": "2024-01-09T00:00:00Z"}]}, + ), + ], + # Expected records + [ + {"id": 10, "post_id": 1, "updated_at": "2024-01-25T00:00:00Z"}, + {"id": 11, "post_id": 1, "updated_at": "2024-01-24T00:00:00Z"}, + {"id": 20, "post_id": 2, "updated_at": "2024-01-22T00:00:00Z"}, + {"id": 30, "post_id": 3, "updated_at": "2024-01-09T00:00:00Z"}, + ], + # Initial state + { + "state": {"updated_at": "2024-01-08T00:00:00Z"}, + "states": [ + { + "cursor": {"updated_at": "2024-01-24T00:00:00Z"}, + "partition": {"id": "1"}, + }, + { + "cursor": {"updated_at": "2024-01-21T05:00:00Z"}, + "partition": {"id": "2"}, + }, ], - # Expected records - [ - {"id": 10, "post_id": 1, "updated_at": "2024-01-25T00:00:00Z"}, - {"id": 11, "post_id": 1, "updated_at": "2024-01-24T00:00:00Z"}, - {"id": 20, "post_id": 2, "updated_at": "2024-01-22T00:00:00Z"}, - {"id": 30, "post_id": 3, 
"updated_at": "2024-01-09T00:00:00Z"}, + "use_global_cursor": False, + }, + # Expected state + { + "use_global_cursor": False, + "lookback_window": 1, + "state": {"updated_at": "2024-01-25T00:00:00Z"}, + "states": [ + {"cursor": {"updated_at": "2024-01-25T00:00:00Z"}, "partition": {"id": "1"}}, + {"cursor": {"updated_at": "2024-01-22T00:00:00Z"}, "partition": {"id": "2"}}, + {"cursor": {"updated_at": "2024-01-09T00:00:00Z"}, "partition": {"id": "3"}}, ], - # Initial state - { - "state": {"updated_at": "2024-01-08T00:00:00Z"}, - "states": [ - { - "cursor": {"updated_at": "2024-01-24T00:00:00Z"}, - "partition": {"id": "1"}, - }, - { - "cursor": {"updated_at": "2024-01-21T05:00:00Z"}, - "partition": {"id": "2"}, - }, - ], - "use_global_cursor": False, - }, - # Expected state - { - "use_global_cursor": False, - "lookback_window": 1, - "state": {"updated_at": "2024-01-25T00:00:00Z"}, - "states": [ - {"cursor": {"updated_at": "2024-01-25T00:00:00Z"}, "partition": {"id": "1"}}, - {"cursor": {"updated_at": "2024-01-22T00:00:00Z"}, "partition": {"id": "2"}}, - {"cursor": {"updated_at": "2024-01-09T00:00:00Z"}, "partition": {"id": "3"}}, - ], - }, + }, ), ], ) def test_incremental_list_partition_router( - test_name, manifest, mock_requests, expected_records, initial_state, expected_state + test_name, manifest, mock_requests, expected_records, initial_state, expected_state ): """ Test ConcurrentPerPartitionCursor with ListPartitionRouter @@ -2622,85 +2622,85 @@ def test_incremental_list_partition_router( "test_name, manifest, mock_requests, expected_records, initial_state, expected_state", [ ( - "test_incremental_error_handling", - LISTPARTITION_MANIFEST, - [ - # Fetch the first page of comments for post 1 - ( - "https://api.example.com/community/posts/1/comments?per_page=100&start_time=2024-01-20T00:00:00Z", - { - "comments": [ - {"id": 9, "post_id": 1, "updated_at": "2023-01-01T00:00:00Z"}, - {"id": 10, "post_id": 1, "updated_at": "2024-01-25T00:00:00Z"}, - {"id": 11, "post_id": 1, "updated_at": "2024-01-24T00:00:00Z"}, - ], - "next_page": "https://api.example.com/community/posts/1/comments?per_page=100&page=2&start_time=2024-01-20T00:00:00Z", - }, - ), - # Error response for the second page of comments for post 1 - ( - "https://api.example.com/community/posts/1/comments?per_page=100&page=2&start_time=2024-01-20T00:00:00Z", - None, # Simulate a network error or an empty response - ), - # Fetch the first page of comments for post 2 - ( - "https://api.example.com/community/posts/2/comments?per_page=100&start_time=2024-01-21T05:00:00Z", - { - "comments": [ - {"id": 20, "post_id": 2, "updated_at": "2024-01-22T00:00:00Z"} - ], - "next_page": "https://api.example.com/community/posts/2/comments?per_page=100&page=2&start_time=2024-01-21T05:00:00Z", - }, - ), - # Fetch the second page of comments for post 2 - ( - "https://api.example.com/community/posts/2/comments?per_page=100&page=2&start_time=2024-01-21T05:00:00Z", - {"comments": [{"id": 21, "post_id": 2, "updated_at": "2024-01-21T00:00:00Z"}]}, - ), - # Fetch the first page of comments for post 3 - ( - "https://api.example.com/community/posts/3/comments?per_page=100&start_time=2024-01-08T00:00:00Z", - {"comments": [{"id": 30, "post_id": 3, "updated_at": "2024-01-09T00:00:00Z"}]}, - ), + "test_incremental_error_handling", + LISTPARTITION_MANIFEST, + [ + # Fetch the first page of comments for post 1 + ( + "https://api.example.com/community/posts/1/comments?per_page=100&start_time=2024-01-20T00:00:00Z", + { + "comments": [ + {"id": 9, "post_id": 1, 
"updated_at": "2023-01-01T00:00:00Z"}, + {"id": 10, "post_id": 1, "updated_at": "2024-01-25T00:00:00Z"}, + {"id": 11, "post_id": 1, "updated_at": "2024-01-24T00:00:00Z"}, + ], + "next_page": "https://api.example.com/community/posts/1/comments?per_page=100&page=2&start_time=2024-01-20T00:00:00Z", + }, + ), + # Error response for the second page of comments for post 1 + ( + "https://api.example.com/community/posts/1/comments?per_page=100&page=2&start_time=2024-01-20T00:00:00Z", + None, # Simulate a network error or an empty response + ), + # Fetch the first page of comments for post 2 + ( + "https://api.example.com/community/posts/2/comments?per_page=100&start_time=2024-01-21T05:00:00Z", + { + "comments": [ + {"id": 20, "post_id": 2, "updated_at": "2024-01-22T00:00:00Z"} + ], + "next_page": "https://api.example.com/community/posts/2/comments?per_page=100&page=2&start_time=2024-01-21T05:00:00Z", + }, + ), + # Fetch the second page of comments for post 2 + ( + "https://api.example.com/community/posts/2/comments?per_page=100&page=2&start_time=2024-01-21T05:00:00Z", + {"comments": [{"id": 21, "post_id": 2, "updated_at": "2024-01-21T00:00:00Z"}]}, + ), + # Fetch the first page of comments for post 3 + ( + "https://api.example.com/community/posts/3/comments?per_page=100&start_time=2024-01-08T00:00:00Z", + {"comments": [{"id": 30, "post_id": 3, "updated_at": "2024-01-09T00:00:00Z"}]}, + ), + ], + # Expected records + [ + {"id": 10, "post_id": 1, "updated_at": "2024-01-25T00:00:00Z"}, + {"id": 11, "post_id": 1, "updated_at": "2024-01-24T00:00:00Z"}, + {"id": 20, "post_id": 2, "updated_at": "2024-01-22T00:00:00Z"}, + {"id": 30, "post_id": 3, "updated_at": "2024-01-09T00:00:00Z"}, + ], + # Initial state + { + "state": {"updated_at": "2024-01-08T00:00:00Z"}, + "states": [ + { + "cursor": {"updated_at": "2024-01-20T00:00:00Z"}, + "partition": {"id": "1"}, + }, + { + "cursor": {"updated_at": "2024-01-21T05:00:00Z"}, + "partition": {"id": "2"}, + }, ], - # Expected records - [ - {"id": 10, "post_id": 1, "updated_at": "2024-01-25T00:00:00Z"}, - {"id": 11, "post_id": 1, "updated_at": "2024-01-24T00:00:00Z"}, - {"id": 20, "post_id": 2, "updated_at": "2024-01-22T00:00:00Z"}, - {"id": 30, "post_id": 3, "updated_at": "2024-01-09T00:00:00Z"}, + "use_global_cursor": False, + }, + # Expected state + { + "lookback_window": 0, + "use_global_cursor": False, + "state": {"updated_at": "2024-01-08T00:00:00Z"}, + "states": [ + {"cursor": {"updated_at": "2024-01-20T00:00:00Z"}, "partition": {"id": "1"}}, + {"cursor": {"updated_at": "2024-01-22T00:00:00Z"}, "partition": {"id": "2"}}, + {"cursor": {"updated_at": "2024-01-09T00:00:00Z"}, "partition": {"id": "3"}}, ], - # Initial state - { - "state": {"updated_at": "2024-01-08T00:00:00Z"}, - "states": [ - { - "cursor": {"updated_at": "2024-01-20T00:00:00Z"}, - "partition": {"id": "1"}, - }, - { - "cursor": {"updated_at": "2024-01-21T05:00:00Z"}, - "partition": {"id": "2"}, - }, - ], - "use_global_cursor": False, - }, - # Expected state - { - "lookback_window": 0, - "use_global_cursor": False, - "state": {"updated_at": "2024-01-08T00:00:00Z"}, - "states": [ - {"cursor": {"updated_at": "2024-01-20T00:00:00Z"}, "partition": {"id": "1"}}, - {"cursor": {"updated_at": "2024-01-22T00:00:00Z"}, "partition": {"id": "2"}}, - {"cursor": {"updated_at": "2024-01-09T00:00:00Z"}, "partition": {"id": "3"}}, - ], - }, + }, ), ], ) def test_incremental_error( - test_name, manifest, mock_requests, expected_records, initial_state, expected_state + test_name, manifest, mock_requests, 
expected_records, initial_state, expected_state ): """ Test with failed request. @@ -2994,261 +2994,261 @@ def test_incremental_error( "test_name, manifest, mock_requests, expected_records, initial_state, expected_state", [ ( - "test_incremental_parent_state", - SUBSTREAM_REQUEST_OPTIONS_MANIFEST, - [ - # Fetch the first page of posts - ( - f"https://api.example.com/community/posts?per_page=100&start_time={PARENT_POSTS_CURSOR}", + "test_incremental_parent_state", + SUBSTREAM_REQUEST_OPTIONS_MANIFEST, + [ + # Fetch the first page of posts + ( + f"https://api.example.com/community/posts?per_page=100&start_time={PARENT_POSTS_CURSOR}", + { + "posts": [ + {"id": 1, "updated_at": POST_1_UPDATED_AT}, + {"id": 2, "updated_at": POST_2_UPDATED_AT}, + ], + "next_page": ( + f"https://api.example.com/community/posts" + f"?per_page=100&start_time={PARENT_POSTS_CURSOR}&page=2" + ), + }, + ), + # Fetch the second page of posts + ( + f"https://api.example.com/community/posts?per_page=100&start_time={PARENT_POSTS_CURSOR}&page=2", + {"posts": [{"id": 3, "updated_at": POST_3_UPDATED_AT}]}, + ), + # Fetch the first page of comments for post 1 + ( + "https://api.example.com/community/posts_comments?per_page=100&post_id=1", + { + "comments": [ { - "posts": [ - {"id": 1, "updated_at": POST_1_UPDATED_AT}, - {"id": 2, "updated_at": POST_2_UPDATED_AT}, - ], - "next_page": ( - f"https://api.example.com/community/posts" - f"?per_page=100&start_time={PARENT_POSTS_CURSOR}&page=2" - ), + "id": 9, + "post_id": 1, + "updated_at": COMMENT_9_OLDEST, }, - ), - # Fetch the second page of posts - ( - f"https://api.example.com/community/posts?per_page=100&start_time={PARENT_POSTS_CURSOR}&page=2", - {"posts": [{"id": 3, "updated_at": POST_3_UPDATED_AT}]}, - ), - # Fetch the first page of comments for post 1 - ( - "https://api.example.com/community/posts_comments?per_page=100&post_id=1", { - "comments": [ - { - "id": 9, - "post_id": 1, - "updated_at": COMMENT_9_OLDEST, - }, - { - "id": 10, - "post_id": 1, - "updated_at": COMMENT_10_UPDATED_AT, - }, - { - "id": 11, - "post_id": 1, - "updated_at": COMMENT_11_UPDATED_AT, - }, - ], - "next_page": "https://api.example.com/community/posts_comments?per_page=100&post_id=1&page=2", + "id": 10, + "post_id": 1, + "updated_at": COMMENT_10_UPDATED_AT, }, - ), - # Fetch the second page of comments for post 1 - ( - "https://api.example.com/community/posts_comments?per_page=100&post_id=1&page=2", - {"comments": [{"id": 12, "post_id": 1, "updated_at": COMMENT_12_UPDATED_AT}]}, - ), - # Fetch the first page of votes for comment 10 of post 1 - ( - f"https://api.example.com/community/posts_comments_votes?per_page=100&comment_id=10&start_time={INITIAL_STATE_PARTITION_10_CURSOR}", { - "votes": [ - { - "id": 100, - "comment_id": 10, - "created_at": VOTE_100_CREATED_AT, - } - ], - "next_page": ( - f"https://api.example.com/community/posts_comments_votes" - f"?per_page=100&page=2&comment_id=10&start_time={INITIAL_STATE_PARTITION_10_CURSOR}" - ), + "id": 11, + "post_id": 1, + "updated_at": COMMENT_11_UPDATED_AT, }, - ), - # Fetch the second page of votes for comment 10 of post 1 - ( - f"https://api.example.com/community/posts_comments_votes" - f"?per_page=100&page=2&comment_id=10&start_time={INITIAL_STATE_PARTITION_10_CURSOR}", - {"votes": [{"id": 101, "comment_id": 10, "created_at": VOTE_101_CREATED_AT}]}, - ), - # Fetch the first page of votes for comment 11 of post 1 - ( - f"https://api.example.com/community/posts_comments_votes" - 
f"?per_page=100&comment_id=11&start_time={INITIAL_STATE_PARTITION_11_CURSOR}", - {"votes": [{"id": 111, "comment_id": 11, "created_at": VOTE_111_CREATED_AT}]}, - ), - # Fetch the first page of votes for comment 12 of post 1 - ( - f"https://api.example.com/community/posts_comments_votes?" - f"per_page=100&comment_id=12&start_time={LOOKBACK_DATE}", - {"votes": []}, - ), - # Fetch the first page of comments for post 2 - ( - "https://api.example.com/community/posts_comments?per_page=100&post_id=2", + ], + "next_page": "https://api.example.com/community/posts_comments?per_page=100&post_id=1&page=2", + }, + ), + # Fetch the second page of comments for post 1 + ( + "https://api.example.com/community/posts_comments?per_page=100&post_id=1&page=2", + {"comments": [{"id": 12, "post_id": 1, "updated_at": COMMENT_12_UPDATED_AT}]}, + ), + # Fetch the first page of votes for comment 10 of post 1 + ( + f"https://api.example.com/community/posts_comments_votes?per_page=100&comment_id=10&start_time={INITIAL_STATE_PARTITION_10_CURSOR}", + { + "votes": [ { - "comments": [{"id": 20, "post_id": 2, "updated_at": COMMENT_20_UPDATED_AT}], - "next_page": "https://api.example.com/community/posts_comments?per_page=100&post_id=2&page=2", - }, - ), - # Fetch the second page of comments for post 2 - ( - "https://api.example.com/community/posts_comments?per_page=100&post_id=2&page=2", - {"comments": [{"id": 21, "post_id": 2, "updated_at": COMMENT_21_UPDATED_AT}]}, - ), - # Fetch the first page of votes for comment 20 of post 2 - ( + "id": 100, + "comment_id": 10, + "created_at": VOTE_100_CREATED_AT, + } + ], + "next_page": ( f"https://api.example.com/community/posts_comments_votes" - f"?per_page=100&comment_id=20&start_time={LOOKBACK_DATE}", - {"votes": [{"id": 200, "comment_id": 20, "created_at": VOTE_200_CREATED_AT}]}, - ), - # Fetch the first page of votes for comment 21 of post 2 - ( - f"https://api.example.com/community/posts_comments_votes?" - f"per_page=100&comment_id=21&start_time={LOOKBACK_DATE}", - {"votes": [{"id": 210, "comment_id": 21, "created_at": VOTE_210_CREATED_AT}]}, - ), - # Fetch the first page of comments for post 3 - ( - "https://api.example.com/community/posts_comments?per_page=100&post_id=3", - {"comments": [{"id": 30, "post_id": 3, "updated_at": COMMENT_30_UPDATED_AT}]}, - ), - # Fetch the first page of votes for comment 30 of post 3 - ( - f"https://api.example.com/community/posts_comments_votes?" - f"per_page=100&comment_id=30&start_time={LOOKBACK_DATE}", - {"votes": [{"id": 300, "comment_id": 30, "created_at": VOTE_300_CREATED_AT}]}, - ), - ], - # Expected records - [ + f"?per_page=100&page=2&comment_id=10&start_time={INITIAL_STATE_PARTITION_10_CURSOR}" + ), + }, + ), + # Fetch the second page of votes for comment 10 of post 1 + ( + f"https://api.example.com/community/posts_comments_votes" + f"?per_page=100&page=2&comment_id=10&start_time={INITIAL_STATE_PARTITION_10_CURSOR}", + {"votes": [{"id": 101, "comment_id": 10, "created_at": VOTE_101_CREATED_AT}]}, + ), + # Fetch the first page of votes for comment 11 of post 1 + ( + f"https://api.example.com/community/posts_comments_votes" + f"?per_page=100&comment_id=11&start_time={INITIAL_STATE_PARTITION_11_CURSOR}", + {"votes": [{"id": 111, "comment_id": 11, "created_at": VOTE_111_CREATED_AT}]}, + ), + # Fetch the first page of votes for comment 12 of post 1 + ( + f"https://api.example.com/community/posts_comments_votes?" 
+ f"per_page=100&comment_id=12&start_time={LOOKBACK_DATE}", + {"votes": []}, + ), + # Fetch the first page of comments for post 2 + ( + "https://api.example.com/community/posts_comments?per_page=100&post_id=2", { - "comment_id": 10, - "comment_updated_at": COMMENT_10_UPDATED_AT, - "created_at": VOTE_100_CREATED_AT, - "id": 100, + "comments": [{"id": 20, "post_id": 2, "updated_at": COMMENT_20_UPDATED_AT}], + "next_page": "https://api.example.com/community/posts_comments?per_page=100&post_id=2&page=2", }, + ), + # Fetch the second page of comments for post 2 + ( + "https://api.example.com/community/posts_comments?per_page=100&post_id=2&page=2", + {"comments": [{"id": 21, "post_id": 2, "updated_at": COMMENT_21_UPDATED_AT}]}, + ), + # Fetch the first page of votes for comment 20 of post 2 + ( + f"https://api.example.com/community/posts_comments_votes" + f"?per_page=100&comment_id=20&start_time={LOOKBACK_DATE}", + {"votes": [{"id": 200, "comment_id": 20, "created_at": VOTE_200_CREATED_AT}]}, + ), + # Fetch the first page of votes for comment 21 of post 2 + ( + f"https://api.example.com/community/posts_comments_votes?" + f"per_page=100&comment_id=21&start_time={LOOKBACK_DATE}", + {"votes": [{"id": 210, "comment_id": 21, "created_at": VOTE_210_CREATED_AT}]}, + ), + # Fetch the first page of comments for post 3 + ( + "https://api.example.com/community/posts_comments?per_page=100&post_id=3", + {"comments": [{"id": 30, "post_id": 3, "updated_at": COMMENT_30_UPDATED_AT}]}, + ), + # Fetch the first page of votes for comment 30 of post 3 + ( + f"https://api.example.com/community/posts_comments_votes?" + f"per_page=100&comment_id=30&start_time={LOOKBACK_DATE}", + {"votes": [{"id": 300, "comment_id": 30, "created_at": VOTE_300_CREATED_AT}]}, + ), + ], + # Expected records + [ + { + "comment_id": 10, + "comment_updated_at": COMMENT_10_UPDATED_AT, + "created_at": VOTE_100_CREATED_AT, + "id": 100, + }, + { + "comment_id": 10, + "comment_updated_at": COMMENT_10_UPDATED_AT, + "created_at": VOTE_101_CREATED_AT, + "id": 101, + }, + { + "comment_id": 11, + "comment_updated_at": COMMENT_11_UPDATED_AT, + "created_at": VOTE_111_CREATED_AT, + "id": 111, + }, + { + "comment_id": 20, + "comment_updated_at": COMMENT_20_UPDATED_AT, + "created_at": VOTE_200_CREATED_AT, + "id": 200, + }, + { + "comment_id": 21, + "comment_updated_at": COMMENT_21_UPDATED_AT, + "created_at": VOTE_210_CREATED_AT, + "id": 210, + }, + { + "comment_id": 30, + "comment_updated_at": COMMENT_30_UPDATED_AT, + "created_at": VOTE_300_CREATED_AT, + "id": 300, + }, + ], + # Initial state + { + "parent_state": { + "post_comments": { + "states": [ + { + "partition": {"id": 1, "parent_slice": {}}, + "cursor": {"updated_at": PARENT_COMMENT_CURSOR_PARTITION_1}, + } + ], + "parent_state": {"posts": {"updated_at": PARENT_POSTS_CURSOR}}, + } + }, + "state": {"created_at": INITIAL_GLOBAL_CURSOR}, + "states": [ { - "comment_id": 10, - "comment_updated_at": COMMENT_10_UPDATED_AT, - "created_at": VOTE_101_CREATED_AT, - "id": 101, + "partition": { + "id": 10, + "parent_slice": {"id": 1, "parent_slice": {}}, + }, + "cursor": {"created_at": INITIAL_STATE_PARTITION_10_CURSOR}, }, { - "comment_id": 11, - "comment_updated_at": COMMENT_11_UPDATED_AT, - "created_at": VOTE_111_CREATED_AT, - "id": 111, + "partition": { + "id": 11, + "parent_slice": {"id": 1, "parent_slice": {}}, + }, + "cursor": {"created_at": INITIAL_STATE_PARTITION_11_CURSOR}, }, + ], + "lookback_window": 86400, + }, + # Expected state + { + "state": {"created_at": VOTE_100_CREATED_AT}, + "parent_state": 
{ + "post_comments": { + "use_global_cursor": False, + "state": {"updated_at": COMMENT_10_UPDATED_AT}, # 10 is the "latest" + "parent_state": { + "posts": {"updated_at": POST_1_UPDATED_AT} + }, # post 1 is the latest + "lookback_window": 1, + "states": [ + { + "partition": {"id": 1, "parent_slice": {}}, + "cursor": {"updated_at": COMMENT_10_UPDATED_AT}, + }, + { + "partition": {"id": 2, "parent_slice": {}}, + "cursor": {"updated_at": COMMENT_20_UPDATED_AT}, + }, + { + "partition": {"id": 3, "parent_slice": {}}, + "cursor": {"updated_at": COMMENT_30_UPDATED_AT}, + }, + ], + } + }, + "lookback_window": 1, + "use_global_cursor": False, + "states": [ { - "comment_id": 20, - "comment_updated_at": COMMENT_20_UPDATED_AT, - "created_at": VOTE_200_CREATED_AT, - "id": 200, + "partition": {"id": 10, "parent_slice": {"id": 1, "parent_slice": {}}}, + "cursor": {"created_at": VOTE_100_CREATED_AT}, }, { - "comment_id": 21, - "comment_updated_at": COMMENT_21_UPDATED_AT, - "created_at": VOTE_210_CREATED_AT, - "id": 210, + "partition": {"id": 11, "parent_slice": {"id": 1, "parent_slice": {}}}, + "cursor": {"created_at": VOTE_111_CREATED_AT}, }, { - "comment_id": 30, - "comment_updated_at": COMMENT_30_UPDATED_AT, - "created_at": VOTE_300_CREATED_AT, - "id": 300, + "partition": {"id": 12, "parent_slice": {"id": 1, "parent_slice": {}}}, + "cursor": {"created_at": LOOKBACK_DATE}, }, - ], - # Initial state - { - "parent_state": { - "post_comments": { - "states": [ - { - "partition": {"id": 1, "parent_slice": {}}, - "cursor": {"updated_at": PARENT_COMMENT_CURSOR_PARTITION_1}, - } - ], - "parent_state": {"posts": {"updated_at": PARENT_POSTS_CURSOR}}, - } + { + "partition": {"id": 20, "parent_slice": {"id": 2, "parent_slice": {}}}, + "cursor": {"created_at": VOTE_200_CREATED_AT}, }, - "state": {"created_at": INITIAL_GLOBAL_CURSOR}, - "states": [ - { - "partition": { - "id": 10, - "parent_slice": {"id": 1, "parent_slice": {}}, - }, - "cursor": {"created_at": INITIAL_STATE_PARTITION_10_CURSOR}, - }, - { - "partition": { - "id": 11, - "parent_slice": {"id": 1, "parent_slice": {}}, - }, - "cursor": {"created_at": INITIAL_STATE_PARTITION_11_CURSOR}, - }, - ], - "lookback_window": 86400, - }, - # Expected state - { - "state": {"created_at": VOTE_100_CREATED_AT}, - "parent_state": { - "post_comments": { - "use_global_cursor": False, - "state": {"updated_at": COMMENT_10_UPDATED_AT}, # 10 is the "latest" - "parent_state": { - "posts": {"updated_at": POST_1_UPDATED_AT} - }, # post 1 is the latest - "lookback_window": 1, - "states": [ - { - "partition": {"id": 1, "parent_slice": {}}, - "cursor": {"updated_at": COMMENT_10_UPDATED_AT}, - }, - { - "partition": {"id": 2, "parent_slice": {}}, - "cursor": {"updated_at": COMMENT_20_UPDATED_AT}, - }, - { - "partition": {"id": 3, "parent_slice": {}}, - "cursor": {"updated_at": COMMENT_30_UPDATED_AT}, - }, - ], - } + { + "partition": {"id": 21, "parent_slice": {"id": 2, "parent_slice": {}}}, + "cursor": {"created_at": VOTE_210_CREATED_AT}, }, - "lookback_window": 1, - "use_global_cursor": False, - "states": [ - { - "partition": {"id": 10, "parent_slice": {"id": 1, "parent_slice": {}}}, - "cursor": {"created_at": VOTE_100_CREATED_AT}, - }, - { - "partition": {"id": 11, "parent_slice": {"id": 1, "parent_slice": {}}}, - "cursor": {"created_at": VOTE_111_CREATED_AT}, - }, - { - "partition": {"id": 12, "parent_slice": {"id": 1, "parent_slice": {}}}, - "cursor": {"created_at": LOOKBACK_DATE}, - }, - { - "partition": {"id": 20, "parent_slice": {"id": 2, "parent_slice": {}}}, - "cursor": 
{"created_at": VOTE_200_CREATED_AT}, - }, - { - "partition": {"id": 21, "parent_slice": {"id": 2, "parent_slice": {}}}, - "cursor": {"created_at": VOTE_210_CREATED_AT}, - }, - { - "partition": {"id": 30, "parent_slice": {"id": 3, "parent_slice": {}}}, - "cursor": {"created_at": VOTE_300_CREATED_AT}, - }, - ], - }, + { + "partition": {"id": 30, "parent_slice": {"id": 3, "parent_slice": {}}}, + "cursor": {"created_at": VOTE_300_CREATED_AT}, + }, + ], + }, ), ], ) def test_incremental_substream_request_options_provider( - test_name, manifest, mock_requests, expected_records, initial_state, expected_state + test_name, manifest, mock_requests, expected_records, initial_state, expected_state ): """ Test incremental syncing for a stream that uses request options provider from parent stream config. From eff25eccca6826168979384174f38a7be92f3f64 Mon Sep 17 00:00:00 2001 From: Anatolii Yatsuk Date: Wed, 19 Feb 2025 18:49:08 +0200 Subject: [PATCH 23/26] Add unit tests --- .../test_concurrent_perpartitioncursor.py | 616 +++++++++--------- 1 file changed, 297 insertions(+), 319 deletions(-) diff --git a/unit_tests/sources/declarative/incremental/test_concurrent_perpartitioncursor.py b/unit_tests/sources/declarative/incremental/test_concurrent_perpartitioncursor.py index f650847a6..084c31142 100644 --- a/unit_tests/sources/declarative/incremental/test_concurrent_perpartitioncursor.py +++ b/unit_tests/sources/declarative/incremental/test_concurrent_perpartitioncursor.py @@ -20,6 +20,14 @@ ConcurrentDeclarativeSource, ) from airbyte_cdk.sources.declarative.incremental import ConcurrentPerPartitionCursor +from airbyte_cdk.sources.declarative.stream_slicers.declarative_partition_generator import ( + DeclarativePartition, +) +from airbyte_cdk.sources.streams.concurrent.cursor import CursorField +from airbyte_cdk.sources.streams.concurrent.state_converters.datetime_stream_state_converter import ( + CustomFormatConcurrentStreamStateConverter, +) +from airbyte_cdk.sources.types import StreamSlice from airbyte_cdk.test.catalog_builder import CatalogBuilder, ConfiguredAirbyteStreamBuilder from airbyte_cdk.test.entrypoint_wrapper import EntrypointOutput, read @@ -765,7 +773,6 @@ def run_incremental_parent_state_test( mock_requests, expected_records, num_intermediate_states, - intermidiate_states, initial_state, expected_states, ): @@ -785,7 +792,6 @@ def run_incremental_parent_state_test( mock_requests (list): A list of tuples containing URL and response data for mocking API requests. expected_records (list): The expected records to compare against the output. num_intermediate_states (int): The number of intermediate states to expect. - intermidiate_states (list): A list of intermediate states to assert initial_state (list): The initial state to start the read operation. expected_states (list): A list of expected final states after the read operation. 
""" @@ -832,12 +838,6 @@ def run_incremental_parent_state_test( # Assert that the number of intermediate states is as expected assert len(intermediate_states) - 1 == num_intermediate_states - # Extract just the Python dict from each state message - all_state_dicts = [st[0].stream.stream_state.__dict__ for st in intermediate_states] - - for idx, itermidiate_state in enumerate(all_state_dicts): - assert itermidiate_state == intermidiate_states[idx], idx - # For each intermediate state, perform another read starting from that state for state, records_before_state in intermediate_states[:-1]: output_intermediate = _run_read(manifest, CONFIG, STREAM_NAME, [state]) @@ -874,313 +874,8 @@ def run_incremental_parent_state_test( ), f"Final state mismatch at run {i + 1}. Expected {expected_states}, got {final_state}" -INITIAL_STATE = { - "parent_state": { - "post_comments": { - "states": [ - { - "partition": {"id": 1, "parent_slice": {}}, - "cursor": {"updated_at": PARENT_COMMENT_CURSOR_PARTITION_1}, - } - ], - "parent_state": {"posts": {"updated_at": PARENT_POSTS_CURSOR}}, - } - }, - "state": {"created_at": INITIAL_GLOBAL_CURSOR}, - "states": [ - { - "partition": { - "id": 10, - "parent_slice": {"id": 1, "parent_slice": {}}, - }, - "cursor": {"created_at": INITIAL_STATE_PARTITION_10_CURSOR}, - }, - { - "partition": { - "id": 11, - "parent_slice": {"id": 1, "parent_slice": {}}, - }, - "cursor": {"created_at": INITIAL_STATE_PARTITION_11_CURSOR}, - }, - ], - "lookback_window": 86400, -} - -INTERMEDIATE_STATES = [ - { - "use_global_cursor": False, - "states": [ - { - "partition": {"id": 10, "parent_slice": {"id": 1, "parent_slice": {}}}, - "cursor": {"created_at": "2024-01-15T00:00:00Z"}, - }, - { - "partition": {"id": 11, "parent_slice": {"id": 1, "parent_slice": {}}}, - "cursor": {"created_at": "2024-01-03T00:00:02Z"}, - }, - ], - "state": {"created_at": "2024-01-03T00:00:02Z"}, - "lookback_window": 86400, - "parent_state": { - "post_comments": { - "use_global_cursor": False, - "state": {}, - "states": [ - { - "partition": {"id": 1, "parent_slice": {}}, - "cursor": {"updated_at": "2023-01-04T00:00:00Z"}, - } - ], - "parent_state": {"posts": {"updated_at": "2024-01-05T00:00:00Z"}}, - } - }, - }, - { - "use_global_cursor": False, - "states": [ - { - "partition": {"id": 10, "parent_slice": {"id": 1, "parent_slice": {}}}, - "cursor": {"created_at": "2024-01-15T00:00:00Z"}, - }, - { - "partition": {"id": 11, "parent_slice": {"id": 1, "parent_slice": {}}}, - "cursor": {"created_at": "2024-01-13T00:00:00Z"}, - }, - ], - "state": {"created_at": "2024-01-03T00:00:02Z"}, - "lookback_window": 86400, - "parent_state": { - "post_comments": { - "use_global_cursor": False, - "state": {}, - "states": [ - { - "partition": {"id": 1, "parent_slice": {}}, - "cursor": {"updated_at": "2023-01-04T00:00:00Z"}, - } - ], - "parent_state": {"posts": {"updated_at": "2024-01-05T00:00:00Z"}}, - } - }, - }, - { - "use_global_cursor": False, - "states": [ - { - "partition": {"id": 10, "parent_slice": {"id": 1, "parent_slice": {}}}, - "cursor": {"created_at": "2024-01-15T00:00:00Z"}, - }, - { - "partition": {"id": 11, "parent_slice": {"id": 1, "parent_slice": {}}}, - "cursor": {"created_at": "2024-01-13T00:00:00Z"}, - }, - { - "partition": {"id": 12, "parent_slice": {"id": 1, "parent_slice": {}}}, - "cursor": {"created_at": "2024-01-02T00:00:02Z"}, - }, - ], - "state": {"created_at": "2024-01-03T00:00:02Z"}, - "lookback_window": 86400, - "parent_state": { - "post_comments": { - "use_global_cursor": False, - "state": {}, - "states": [ 
- { - "partition": {"id": 1, "parent_slice": {}}, - "cursor": {"updated_at": "2023-01-04T00:00:00Z"}, - } - ], - "parent_state": {"posts": {"updated_at": "2024-01-05T00:00:00Z"}}, - } - }, - }, - { - "use_global_cursor": False, - "states": [ - { - "partition": {"id": 10, "parent_slice": {"id": 1, "parent_slice": {}}}, - "cursor": {"created_at": "2024-01-15T00:00:00Z"}, - }, - { - "partition": {"id": 11, "parent_slice": {"id": 1, "parent_slice": {}}}, - "cursor": {"created_at": "2024-01-13T00:00:00Z"}, - }, - { - "partition": {"id": 12, "parent_slice": {"id": 1, "parent_slice": {}}}, - "cursor": {"created_at": "2024-01-02T00:00:02Z"}, - }, - { - "partition": {"id": 20, "parent_slice": {"id": 2, "parent_slice": {}}}, - "cursor": {"created_at": "2024-01-12T00:00:00Z"}, - }, - ], - "state": {"created_at": "2024-01-03T00:00:02Z"}, - "lookback_window": 86400, - "parent_state": { - "post_comments": { - "use_global_cursor": False, - "state": {}, - "states": [ - { - "partition": {"id": 1, "parent_slice": {}}, - "cursor": {"updated_at": "2024-01-25T00:00:00Z"}, - } - ], - "parent_state": {"posts": {"updated_at": "2024-01-05T00:00:00Z"}}, - } - }, - }, - { - "use_global_cursor": False, - "states": [ - { - "partition": {"id": 10, "parent_slice": {"id": 1, "parent_slice": {}}}, - "cursor": {"created_at": "2024-01-15T00:00:00Z"}, - }, - { - "partition": {"id": 11, "parent_slice": {"id": 1, "parent_slice": {}}}, - "cursor": {"created_at": "2024-01-13T00:00:00Z"}, - }, - { - "partition": {"id": 12, "parent_slice": {"id": 1, "parent_slice": {}}}, - "cursor": {"created_at": "2024-01-02T00:00:02Z"}, - }, - { - "partition": {"id": 20, "parent_slice": {"id": 2, "parent_slice": {}}}, - "cursor": {"created_at": "2024-01-12T00:00:00Z"}, - }, - { - "partition": {"id": 21, "parent_slice": {"id": 2, "parent_slice": {}}}, - "cursor": {"created_at": "2024-01-12T00:00:15Z"}, - }, - { - "partition": {"id": 30, "parent_slice": {"id": 3, "parent_slice": {}}}, - "cursor": {"created_at": "2024-01-02T00:00:02Z"}, - }, - ], - "state": {"created_at": "2024-01-03T00:00:02Z"}, - "lookback_window": 86400, - "parent_state": { - "post_comments": { - "use_global_cursor": False, - "state": {}, - "states": [ - { - "partition": {"id": 1, "parent_slice": {}}, - "cursor": {"updated_at": "2024-01-25T00:00:00Z"}, - } - ], - "parent_state": {"posts": {"updated_at": "2024-01-05T00:00:00Z"}}, - } - }, - }, - { - "use_global_cursor": False, - "states": [ - { - "partition": {"id": 10, "parent_slice": {"id": 1, "parent_slice": {}}}, - "cursor": {"created_at": "2024-01-15T00:00:00Z"}, - }, - { - "partition": {"id": 11, "parent_slice": {"id": 1, "parent_slice": {}}}, - "cursor": {"created_at": "2024-01-13T00:00:00Z"}, - }, - { - "partition": {"id": 12, "parent_slice": {"id": 1, "parent_slice": {}}}, - "cursor": {"created_at": "2024-01-02T00:00:02Z"}, - }, - { - "partition": {"id": 20, "parent_slice": {"id": 2, "parent_slice": {}}}, - "cursor": {"created_at": "2024-01-12T00:00:00Z"}, - }, - { - "partition": {"id": 21, "parent_slice": {"id": 2, "parent_slice": {}}}, - "cursor": {"created_at": "2024-01-12T00:00:15Z"}, - }, - { - "partition": {"id": 30, "parent_slice": {"id": 3, "parent_slice": {}}}, - "cursor": {"created_at": "2024-01-10T00:00:00Z"}, - }, - ], - "state": {"created_at": "2024-01-03T00:00:02Z"}, - "lookback_window": 86400, - "parent_state": { - "post_comments": { - "use_global_cursor": False, - "state": {}, - "states": [ - { - "partition": {"id": 1, "parent_slice": {}}, - "cursor": {"updated_at": "2024-01-25T00:00:00Z"}, - }, - { - 
"partition": {"id": 2, "parent_slice": {}}, - "cursor": {"updated_at": "2024-01-22T00:00:00Z"}, - }, - ], - "parent_state": {"posts": {"updated_at": "2024-01-05T00:00:00Z"}}, - } - }, - }, - { - "use_global_cursor": False, - "states": [ - { - "partition": {"id": 10, "parent_slice": {"id": 1, "parent_slice": {}}}, - "cursor": {"created_at": "2024-01-15T00:00:00Z"}, - }, - { - "partition": {"id": 11, "parent_slice": {"id": 1, "parent_slice": {}}}, - "cursor": {"created_at": "2024-01-13T00:00:00Z"}, - }, - { - "partition": {"id": 12, "parent_slice": {"id": 1, "parent_slice": {}}}, - "cursor": {"created_at": "2024-01-02T00:00:02Z"}, - }, - { - "partition": {"id": 20, "parent_slice": {"id": 2, "parent_slice": {}}}, - "cursor": {"created_at": "2024-01-12T00:00:00Z"}, - }, - { - "partition": {"id": 21, "parent_slice": {"id": 2, "parent_slice": {}}}, - "cursor": {"created_at": "2024-01-12T00:00:15Z"}, - }, - { - "partition": {"id": 30, "parent_slice": {"id": 3, "parent_slice": {}}}, - "cursor": {"created_at": "2024-01-10T00:00:00Z"}, - }, - ], - "state": {"created_at": "2024-01-15T00:00:00Z"}, - "lookback_window": 1, - "parent_state": { - "post_comments": { - "use_global_cursor": False, - "state": {"updated_at": "2024-01-25T00:00:00Z"}, - "lookback_window": 1, - "states": [ - { - "partition": {"id": 1, "parent_slice": {}}, - "cursor": {"updated_at": "2024-01-25T00:00:00Z"}, - }, - { - "partition": {"id": 2, "parent_slice": {}}, - "cursor": {"updated_at": "2024-01-22T00:00:00Z"}, - }, - { - "partition": {"id": 3, "parent_slice": {}}, - "cursor": {"updated_at": "2024-01-09T00:00:00Z"}, - }, - ], - "parent_state": {"posts": {"updated_at": "2024-01-30T00:00:00Z"}}, - } - }, - }, -] - - @pytest.mark.parametrize( - "test_name, manifest, mock_requests, expected_records, num_intermediate_states, intermidiate_states, initial_state, expected_state", + "test_name, manifest, mock_requests, expected_records, num_intermediate_states, initial_state, expected_state", [ ( "test_incremental_parent_state", @@ -1396,10 +1091,38 @@ def run_incremental_parent_state_test( ], # Number of intermediate states - 6 as number of parent partitions 6, - # Intermediate states - INTERMEDIATE_STATES, # Initial state - INITIAL_STATE, + { + "parent_state": { + "post_comments": { + "states": [ + { + "partition": {"id": 1, "parent_slice": {}}, + "cursor": {"updated_at": PARENT_COMMENT_CURSOR_PARTITION_1}, + } + ], + "parent_state": {"posts": {"updated_at": PARENT_POSTS_CURSOR}}, + } + }, + "state": {"created_at": INITIAL_GLOBAL_CURSOR}, + "states": [ + { + "partition": { + "id": 10, + "parent_slice": {"id": 1, "parent_slice": {}}, + }, + "cursor": {"created_at": INITIAL_STATE_PARTITION_10_CURSOR}, + }, + { + "partition": { + "id": 11, + "parent_slice": {"id": 1, "parent_slice": {}}, + }, + "cursor": {"created_at": INITIAL_STATE_PARTITION_11_CURSOR}, + }, + ], + "lookback_window": 86400, + }, # Expected state { "state": {"created_at": VOTE_100_CREATED_AT}, @@ -1465,7 +1188,6 @@ def test_incremental_parent_state( mock_requests, expected_records, num_intermediate_states, - intermidiate_states, initial_state, expected_state, ): @@ -1478,7 +1200,6 @@ def test_incremental_parent_state( mock_requests, expected_records, num_intermediate_states, - intermidiate_states, initial_state, [expected_state], ) @@ -3306,3 +3027,260 @@ def test_state_throttling(mocker): cursor._emit_state_message() mock_connector_manager.update_state_for_stream.assert_called_once() mock_repo.emit_message.assert_called_once() + + +def 
test_given_no_partitions_processed_when_close_partition_then_no_state_update(): + mock_cursor = MagicMock() + # No slices for no partitions + mock_cursor.stream_slices.side_effect = [iter([])] + mock_cursor.state = {} # Empty state for no partitions + + cursor_factory_mock = MagicMock() + cursor_factory_mock.create.return_value = mock_cursor + + connector_state_converter = CustomFormatConcurrentStreamStateConverter( + datetime_format="%Y-%m-%dT%H:%M:%SZ", + input_datetime_formats=["%Y-%m-%dT%H:%M:%SZ"], + is_sequential_state=True, + cursor_granularity=timedelta(0), + ) + + cursor = ConcurrentPerPartitionCursor( + cursor_factory=cursor_factory_mock, + partition_router=MagicMock(), + stream_name="test_stream", + stream_namespace=None, + stream_state={}, + message_repository=MagicMock(), + connector_state_manager=MagicMock(), + connector_state_converter=connector_state_converter, + cursor_field=CursorField(cursor_field_key="updated_at"), + ) + partition_router = cursor._partition_router + partition_router.stream_slices.return_value = iter([]) + partition_router.get_stream_state.return_value = {} + + slices = list(cursor.stream_slices()) # Call once + for slice in slices: + cursor.close_partition( + DeclarativePartition("test_stream", {}, MagicMock(), MagicMock(), slice) + ) + + assert cursor.state == { + "use_global_cursor": False, + "lookback_window": 0, + "states": [], + } + assert len(cursor._cursor_per_partition) == 0 + assert len(cursor._semaphore_per_partition) == 0 + assert len(cursor._partition_parent_state_map) == 0 + assert mock_cursor.stream_slices.call_count == 0 # No calls since no partitions + + +def test_given_new_partition_mid_sync_when_close_partition_then_update_state(): + mock_cursor = MagicMock() + # Simulate one slice per cursor + mock_cursor.stream_slices.side_effect = [ + iter( + [ + {"slice1": "data1"}, + {"slice2": "data1"}, # First slice + ] + ), + iter( + [ + {"slice2": "data2"}, + {"slice2": "data2"}, # First slice for new partition + ] + ), + ] + mock_cursor.state = {"updated_at": "2024-01-03T00:00:00Z"} # Set cursor state + + connector_state_converter = CustomFormatConcurrentStreamStateConverter( + datetime_format="%Y-%m-%dT%H:%M:%SZ", + input_datetime_formats=["%Y-%m-%dT%H:%M:%SZ"], + is_sequential_state=True, + cursor_granularity=timedelta(0), + ) + + cursor_factory_mock = MagicMock() + cursor_factory_mock.create.return_value = mock_cursor + + cursor = ConcurrentPerPartitionCursor( + cursor_factory=cursor_factory_mock, + partition_router=MagicMock(), + stream_name="test_stream", + stream_namespace=None, + stream_state={ + "states": [ + {"partition": {"id": "1"}, "cursor": {"updated_at": "2024-01-01T00:00:00Z"}} + ], + "state": {"updated_at": "2024-01-01T00:00:00Z"}, + "lookback_window": 86400, + "parent_state": {"posts": {"updated_at": "2024-01-01T00:00:00Z"}}, + }, + message_repository=MagicMock(), + connector_state_manager=MagicMock(), + connector_state_converter=connector_state_converter, + cursor_field=CursorField(cursor_field_key="updated_at"), + ) + partition_router = cursor._partition_router + all_partitions = [ + StreamSlice(partition={"id": "1"}, cursor_slice={}, extra_fields={}), + StreamSlice(partition={"id": "2"}, cursor_slice={}, extra_fields={}), # New partition + ] + partition_router.stream_slices.return_value = iter(all_partitions) + partition_router.get_stream_state.side_effect = [ + {"posts": {"updated_at": "2024-01-04T00:00:00Z"}}, # Initial parent state + {"posts": {"updated_at": "2024-01-05T00:00:00Z"}}, # Updated parent state for new 
partition + ] + + slices = list(cursor.stream_slices()) + # Close all partitions except from the first one + for slice in slices: + cursor.close_partition( + DeclarativePartition("test_stream", {}, MagicMock(), MagicMock(), slice) + ) + + state = cursor.state + assert state["use_global_cursor"] is False + assert len(state["states"]) == 2 # Should now have two partitions + assert any(p["partition"]["id"] == "1" for p in state["states"]) + assert any(p["partition"]["id"] == "2" for p in state["states"]) + assert state["parent_state"] == {"posts": {"updated_at": "2024-01-05T00:00:00Z"}} + assert state["lookback_window"] == 86400 + assert mock_cursor.stream_slices.call_count == 2 # Called once for each partition + + +def test_given_all_partitions_finished_when_close_partition_then_final_state_emitted(): + mock_cursor = MagicMock() + # Simulate one slice per cursor + mock_cursor.stream_slices.side_effect = [ + iter( + [ + {"slice1": "data"}, # First slice for partition 1 + ] + ), + iter( + [ + {"slice2": "data"}, # First slice for partition 2 + ] + ), + ] + mock_cursor.state = {"updated_at": "2024-01-02T00:00:00Z"} # Set cursor state (latest) + + cursor_factory_mock = MagicMock() + cursor_factory_mock.create.return_value = mock_cursor + + connector_state_converter = CustomFormatConcurrentStreamStateConverter( + datetime_format="%Y-%m-%dT%H:%M:%SZ", + input_datetime_formats=["%Y-%m-%dT%H:%M:%SZ"], + is_sequential_state=True, + cursor_granularity=timedelta(0), + ) + + cursor = ConcurrentPerPartitionCursor( + cursor_factory=cursor_factory_mock, + partition_router=MagicMock(), + stream_name="test_stream", + stream_namespace=None, + stream_state={ + "states": [ + {"partition": {"id": "1"}, "cursor": {"updated_at": "2024-01-01T00:00:00Z"}}, + {"partition": {"id": "2"}, "cursor": {"updated_at": "2024-01-02T00:00:00Z"}}, + ], + "state": {"updated_at": "2024-01-02T00:00:00Z"}, + "lookback_window": 86400, + "parent_state": {"posts": {"updated_at": "2024-01-03T00:00:00Z"}}, + }, + message_repository=MagicMock(), + connector_state_manager=MagicMock(), + connector_state_converter=connector_state_converter, + cursor_field=CursorField(cursor_field_key="updated_at"), + ) + partition_router = cursor._partition_router + partitions = [ + StreamSlice(partition={"id": "1"}, cursor_slice={}, extra_fields={}), + StreamSlice(partition={"id": "2"}, cursor_slice={}, extra_fields={}), + ] + partition_router.stream_slices.return_value = iter(partitions) + partition_router.get_stream_state.return_value = { + "posts": {"updated_at": "2024-01-06T00:00:00Z"} + } + + slices = list(cursor.stream_slices()) + for slice in slices: + cursor.close_partition( + DeclarativePartition("test_stream", {}, MagicMock(), MagicMock(), slice) + ) + + cursor.ensure_at_least_one_state_emitted() + + final_state = cursor.state + assert final_state["use_global_cursor"] is False + assert len(final_state["states"]) == 2 + assert final_state["state"]["updated_at"] == "2024-01-02T00:00:00Z" + assert final_state["parent_state"] == {"posts": {"updated_at": "2024-01-06T00:00:00Z"}} + assert final_state["lookback_window"] == 1 + assert cursor._message_repository.emit_message.call_count == 2 + assert mock_cursor.stream_slices.call_count == 2 # Called once for each partition + + +def test_given_partition_limit_exceeded_when_close_partition_then_switch_to_global_cursor(): + mock_cursor = MagicMock() + # Simulate one slice per cursor + mock_cursor.stream_slices.side_effect = [iter([{"slice" + str(i): "data"}]) for i in range(3)] + mock_cursor.state = 
{"updated_at": "2024-01-01T00:00:00Z"} # Set cursor state + + cursor_factory_mock = MagicMock() + cursor_factory_mock.create.return_value = mock_cursor + + connector_state_converter = CustomFormatConcurrentStreamStateConverter( + datetime_format="%Y-%m-%dT%H:%M:%SZ", + input_datetime_formats=["%Y-%m-%dT%H:%M:%SZ"], + is_sequential_state=True, + cursor_granularity=timedelta(0), + ) + + cursor = ConcurrentPerPartitionCursor( + cursor_factory=cursor_factory_mock, + partition_router=MagicMock(), + stream_name="test_stream", + stream_namespace=None, + stream_state={}, + message_repository=MagicMock(), + connector_state_manager=MagicMock(), + connector_state_converter=connector_state_converter, + cursor_field=CursorField(cursor_field_key="updated_at"), + ) + # Override default limit for testing + cursor.DEFAULT_MAX_PARTITIONS_NUMBER = 2 + cursor.SWITCH_TO_GLOBAL_LIMIT = 1 + + partition_router = cursor._partition_router + partitions = [ + StreamSlice(partition={"id": str(i)}, cursor_slice={}, extra_fields={}) for i in range(3) + ] # 3 partitions + partition_router.stream_slices.return_value = iter(partitions) + partition_router.get_stream_state.side_effect = [ + {"updated_at": "2024-01-02T00:00:00Z"}, + {"updated_at": "2024-01-03T00:00:00Z"}, + {"updated_at": "2024-01-04T00:00:00Z"}, + {"updated_at": "2024-01-04T00:00:00Z"}, + ] + + slices = list(cursor.stream_slices()) + for slice in slices: + cursor.close_partition( + DeclarativePartition("test_stream", {}, MagicMock(), MagicMock(), slice) + ) + cursor.ensure_at_least_one_state_emitted() + + final_state = cursor.state + assert len(slices) == 3 + assert final_state["use_global_cursor"] is True + assert len(final_state.get("states", [])) == 0 # No per-partition states + assert final_state["parent_state"] == {"updated_at": "2024-01-04T00:00:00Z"} + assert "lookback_window" in final_state + assert len(cursor._cursor_per_partition) <= cursor.DEFAULT_MAX_PARTITIONS_NUMBER + assert mock_cursor.stream_slices.call_count == 3 # Called once for each partition From c51f8406460cf618fcfad2cda4cb87178b2ea247 Mon Sep 17 00:00:00 2001 From: Anatolii Yatsuk Date: Fri, 21 Feb 2025 15:03:07 +0200 Subject: [PATCH 24/26] Update unit tests --- .../test_concurrent_perpartitioncursor.py | 147 ++++++++++++++---- 1 file changed, 120 insertions(+), 27 deletions(-) diff --git a/unit_tests/sources/declarative/incremental/test_concurrent_perpartitioncursor.py b/unit_tests/sources/declarative/incremental/test_concurrent_perpartitioncursor.py index 084c31142..bbed04a81 100644 --- a/unit_tests/sources/declarative/incremental/test_concurrent_perpartitioncursor.py +++ b/unit_tests/sources/declarative/incremental/test_concurrent_perpartitioncursor.py @@ -3077,24 +3077,29 @@ def test_given_no_partitions_processed_when_close_partition_then_no_state_update assert mock_cursor.stream_slices.call_count == 0 # No calls since no partitions -def test_given_new_partition_mid_sync_when_close_partition_then_update_state(): - mock_cursor = MagicMock() - # Simulate one slice per cursor - mock_cursor.stream_slices.side_effect = [ - iter( - [ - {"slice1": "data1"}, - {"slice2": "data1"}, # First slice - ] - ), - iter( - [ - {"slice2": "data2"}, - {"slice2": "data2"}, # First slice for new partition - ] - ), - ] - mock_cursor.state = {"updated_at": "2024-01-03T00:00:00Z"} # Set cursor state +def test_given_unfinished_first_parent_partition_no_parent_state_update(): + # Create two mock cursors with different states for each partition + mock_cursor_1 = MagicMock() + 
mock_cursor_1.stream_slices.return_value = iter( + [ + {"slice1": "data1"}, + {"slice2": "data1"}, # First partition slices + ] + ) + mock_cursor_1.state = {"updated_at": "2024-01-01T00:00:00Z"} # State for partition "1" + + mock_cursor_2 = MagicMock() + mock_cursor_2.stream_slices.return_value = iter( + [ + {"slice2": "data2"}, + {"slice2": "data2"}, # Second partition slices + ] + ) + mock_cursor_2.state = {"updated_at": "2024-01-02T00:00:00Z"} # State for partition "2" + + # Configure cursor factory to return different mock cursors based on partition + cursor_factory_mock = MagicMock() + cursor_factory_mock.create.side_effect = [mock_cursor_1, mock_cursor_2] connector_state_converter = CustomFormatConcurrentStreamStateConverter( datetime_format="%Y-%m-%dT%H:%M:%SZ", @@ -3103,8 +3108,89 @@ def test_given_new_partition_mid_sync_when_close_partition_then_update_state(): cursor_granularity=timedelta(0), ) + cursor = ConcurrentPerPartitionCursor( + cursor_factory=cursor_factory_mock, + partition_router=MagicMock(), + stream_name="test_stream", + stream_namespace=None, + stream_state={ + "states": [ + {"partition": {"id": "1"}, "cursor": {"updated_at": "2024-01-01T00:00:00Z"}} + ], + "state": {"updated_at": "2024-01-01T00:00:00Z"}, + "lookback_window": 86400, + "parent_state": {"posts": {"updated_at": "2024-01-01T00:00:00Z"}}, + }, + message_repository=MagicMock(), + connector_state_manager=MagicMock(), + connector_state_converter=connector_state_converter, + cursor_field=CursorField(cursor_field_key="updated_at"), + ) + partition_router = cursor._partition_router + all_partitions = [ + StreamSlice(partition={"id": "1"}, cursor_slice={}, extra_fields={}), + StreamSlice(partition={"id": "2"}, cursor_slice={}, extra_fields={}), # New partition + ] + partition_router.stream_slices.return_value = iter(all_partitions) + partition_router.get_stream_state.side_effect = [ + {"posts": {"updated_at": "2024-01-04T00:00:00Z"}}, # Initial parent state + {"posts": {"updated_at": "2024-01-05T00:00:00Z"}}, # Updated parent state for new partition + ] + + slices = list(cursor.stream_slices()) + # Close all partitions except from the first one + for slice in slices[1:]: + cursor.close_partition( + DeclarativePartition("test_stream", {}, MagicMock(), MagicMock(), slice) + ) + cursor.ensure_at_least_one_state_emitted() + print(cursor.state) + + state = cursor.state + assert state == { + "use_global_cursor": False, + "states": [ + {"partition": {"id": "1"}, "cursor": {"updated_at": "2024-01-01T00:00:00Z"}}, + {"partition": {"id": "2"}, "cursor": {"updated_at": "2024-01-02T00:00:00Z"}}, + ], + "state": {"updated_at": "2024-01-01T00:00:00Z"}, + "lookback_window": 86400, + "parent_state": {"posts": {"updated_at": "2024-01-01T00:00:00Z"}}, + } + assert mock_cursor_1.stream_slices.call_count == 1 # Called once for each partition + assert mock_cursor_2.stream_slices.call_count == 1 # Called once for each partition + + +def test_given_unfinished_last_parent_partition_with_partial_parent_state_update(): + # Create two mock cursors with different states for each partition + mock_cursor_1 = MagicMock() + mock_cursor_1.stream_slices.return_value = iter( + [ + {"slice1": "data1"}, + {"slice2": "data1"}, # First partition slices + ] + ) + mock_cursor_1.state = {"updated_at": "2024-01-02T00:00:00Z"} # State for partition "1" + + mock_cursor_2 = MagicMock() + mock_cursor_2.stream_slices.return_value = iter( + [ + {"slice2": "data2"}, + {"slice2": "data2"}, # Second partition slices + ] + ) + mock_cursor_2.state = {"updated_at": 
"2024-01-01T00:00:00Z"} # State for partition "2" + + # Configure cursor factory to return different mock cursors based on partition cursor_factory_mock = MagicMock() - cursor_factory_mock.create.return_value = mock_cursor + cursor_factory_mock.create.side_effect = [mock_cursor_1, mock_cursor_2] + + connector_state_converter = CustomFormatConcurrentStreamStateConverter( + datetime_format="%Y-%m-%dT%H:%M:%SZ", + input_datetime_formats=["%Y-%m-%dT%H:%M:%SZ"], + is_sequential_state=True, + cursor_granularity=timedelta(0), + ) cursor = ConcurrentPerPartitionCursor( cursor_factory=cursor_factory_mock, @@ -3137,19 +3223,26 @@ def test_given_new_partition_mid_sync_when_close_partition_then_update_state(): slices = list(cursor.stream_slices()) # Close all partitions except from the first one - for slice in slices: + for slice in slices[:-1]: cursor.close_partition( DeclarativePartition("test_stream", {}, MagicMock(), MagicMock(), slice) ) + cursor.ensure_at_least_one_state_emitted() + print(cursor.state) state = cursor.state - assert state["use_global_cursor"] is False - assert len(state["states"]) == 2 # Should now have two partitions - assert any(p["partition"]["id"] == "1" for p in state["states"]) - assert any(p["partition"]["id"] == "2" for p in state["states"]) - assert state["parent_state"] == {"posts": {"updated_at": "2024-01-05T00:00:00Z"}} - assert state["lookback_window"] == 86400 - assert mock_cursor.stream_slices.call_count == 2 # Called once for each partition + assert state == { + "use_global_cursor": False, + "states": [ + {"partition": {"id": "1"}, "cursor": {"updated_at": "2024-01-02T00:00:00Z"}}, + {"partition": {"id": "2"}, "cursor": {"updated_at": "2024-01-01T00:00:00Z"}}, + ], + "state": {"updated_at": "2024-01-01T00:00:00Z"}, + "lookback_window": 86400, + "parent_state": {"posts": {"updated_at": "2024-01-04T00:00:00Z"}}, + } + assert mock_cursor_1.stream_slices.call_count == 1 # Called once for each partition + assert mock_cursor_2.stream_slices.call_count == 1 # Called once for each partition def test_given_all_partitions_finished_when_close_partition_then_final_state_emitted(): From 4a18954a55372a0fab521065d2ecbc29d0a12104 Mon Sep 17 00:00:00 2001 From: Anatolii Yatsuk Date: Fri, 21 Feb 2025 15:55:58 +0200 Subject: [PATCH 25/26] Add deleting finished semaphores --- .../concurrent_partition_cursor.py | 28 +++++-- .../test_concurrent_perpartitioncursor.py | 75 +++++++++++++++++++ 2 files changed, 96 insertions(+), 7 deletions(-) diff --git a/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py b/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py index 1ece3c579..3532b4e67 100644 --- a/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +++ b/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py @@ -169,6 +169,8 @@ def _check_and_update_parent_state(self) -> None: Pop the leftmost partition state from _partition_parent_state_map only if *all partitions* up to (and including) that partition key in _semaphore_per_partition are fully finished (i.e. in _finished_partitions and semaphore._value == 0). + Additionally, delete finished semaphores with a value of 0 to free up memory, + as they are only needed to track errors and completion status. 
""" last_closed_state = None @@ -178,7 +180,9 @@ def _check_and_update_parent_state(self) -> None: # Verify ALL partitions from the left up to earliest_key are finished all_left_finished = True - for p_key, sem in self._semaphore_per_partition.items(): + for p_key, sem in list( + self._semaphore_per_partition.items() + ): # Use list to allow modification during iteration # If any earlier partition is still not finished, we must stop if p_key not in self._finished_partitions or sem._value != 0: all_left_finished = False @@ -191,17 +195,26 @@ def _check_and_update_parent_state(self) -> None: if not all_left_finished: break - # Otherwise, pop the leftmost entry from parent-state map + # Pop the leftmost entry from parent-state map _, closed_parent_state = self._partition_parent_state_map.popitem(last=False) last_closed_state = closed_parent_state - # Update _parent_state if we actually popped at least one partition + # Clean up finished semaphores with value 0 up to and including earliest_key + for p_key in list(self._semaphore_per_partition.keys()): + sem = self._semaphore_per_partition[p_key] + if p_key in self._finished_partitions and sem._value == 0: + del self._semaphore_per_partition[p_key] + logger.debug(f"Deleted finished semaphore for partition {p_key} with value 0") + if p_key == earliest_key: + break + + # Update _parent_state if we popped at least one partition if last_closed_state is not None: self._parent_state = last_closed_state def ensure_at_least_one_state_emitted(self) -> None: """ - The platform expect to have at least one state message on successful syncs. Hence, whatever happens, we expect this method to be + The platform expects at least one state message on successful syncs. Hence, whatever happens, we expect this method to be called. 
""" if not any( @@ -238,6 +251,7 @@ def _emit_state_message(self, throttle: bool = True) -> None: self._message_repository.emit_message(state_message) def stream_slices(self) -> Iterable[StreamSlice]: + print("stream_slices") if self._timer.is_running(): raise RuntimeError("stream_slices has been executed more than once.") @@ -313,9 +327,9 @@ def _ensure_partition_limit(self) -> None: while len(self._cursor_per_partition) > self.DEFAULT_MAX_PARTITIONS_NUMBER - 1: # Try removing finished partitions first for partition_key in list(self._cursor_per_partition.keys()): - if ( - partition_key in self._finished_partitions - and self._semaphore_per_partition[partition_key]._value == 0 + if partition_key in self._finished_partitions and ( + partition_key not in self._semaphore_per_partition + or self._semaphore_per_partition[partition_key]._value == 0 ): oldest_partition = self._cursor_per_partition.pop( partition_key diff --git a/unit_tests/sources/declarative/incremental/test_concurrent_perpartitioncursor.py b/unit_tests/sources/declarative/incremental/test_concurrent_perpartitioncursor.py index bbed04a81..9e15df5b2 100644 --- a/unit_tests/sources/declarative/incremental/test_concurrent_perpartitioncursor.py +++ b/unit_tests/sources/declarative/incremental/test_concurrent_perpartitioncursor.py @@ -3159,6 +3159,7 @@ def test_given_unfinished_first_parent_partition_no_parent_state_update(): } assert mock_cursor_1.stream_slices.call_count == 1 # Called once for each partition assert mock_cursor_2.stream_slices.call_count == 1 # Called once for each partition + assert len(cursor._semaphore_per_partition) == 2 def test_given_unfinished_last_parent_partition_with_partial_parent_state_update(): @@ -3243,6 +3244,7 @@ def test_given_unfinished_last_parent_partition_with_partial_parent_state_update } assert mock_cursor_1.stream_slices.call_count == 1 # Called once for each partition assert mock_cursor_2.stream_slices.call_count == 1 # Called once for each partition + assert len(cursor._semaphore_per_partition) == 1 def test_given_all_partitions_finished_when_close_partition_then_final_state_emitted(): @@ -3317,6 +3319,7 @@ def test_given_all_partitions_finished_when_close_partition_then_final_state_emi assert final_state["lookback_window"] == 1 assert cursor._message_repository.emit_message.call_count == 2 assert mock_cursor.stream_slices.call_count == 2 # Called once for each partition + assert len(cursor._semaphore_per_partition) == 1 def test_given_partition_limit_exceeded_when_close_partition_then_switch_to_global_cursor(): @@ -3377,3 +3380,75 @@ def test_given_partition_limit_exceeded_when_close_partition_then_switch_to_glob assert "lookback_window" in final_state assert len(cursor._cursor_per_partition) <= cursor.DEFAULT_MAX_PARTITIONS_NUMBER assert mock_cursor.stream_slices.call_count == 3 # Called once for each partition + + +def test_semaphore_cleanup(): + # Create two mock cursors with different states for each partition + mock_cursor_1 = MagicMock() + mock_cursor_1.stream_slices.return_value = iter( + [ + {"slice1": "data1"}, + {"slice2": "data1"}, # First partition slices + ] + ) + mock_cursor_1.state = {"updated_at": "2024-01-02T00:00:00Z"} # State for partition "1" + + mock_cursor_2 = MagicMock() + mock_cursor_2.stream_slices.return_value = iter( + [ + {"slice2": "data2"}, + {"slice2": "data2"}, # Second partition slices + ] + ) + mock_cursor_2.state = {"updated_at": "2024-01-03T00:00:00Z"} # State for partition "2" + + # Configure cursor factory to return different mock cursors based on 
partition + cursor_factory_mock = MagicMock() + cursor_factory_mock.create.side_effect = [mock_cursor_1, mock_cursor_2] + + cursor = ConcurrentPerPartitionCursor( + cursor_factory=cursor_factory_mock, + partition_router=MagicMock(), + stream_name="test_stream", + stream_namespace=None, + stream_state={}, + message_repository=MagicMock(), + connector_state_manager=MagicMock(), + connector_state_converter=MagicMock(), + cursor_field=CursorField(cursor_field_key="updated_at"), + ) + + # Simulate partitions with unique parent states + slices = [ + StreamSlice(partition={"id": "1"}, cursor_slice={}), + StreamSlice(partition={"id": "2"}, cursor_slice={}), + ] + cursor._partition_router.stream_slices.return_value = iter(slices) + # Simulate unique parent states for each partition + cursor._partition_router.get_stream_state.side_effect = [ + {"parent": {"state": "state1"}}, # Parent state for partition "1" + {"parent": {"state": "state2"}}, # Parent state for partition "2" + ] + + # Generate slices to populate semaphores and parent states + generated_slices = list( + cursor.stream_slices() + ) # Populate _semaphore_per_partition and _partition_parent_state_map + + # Verify initial state + assert len(cursor._semaphore_per_partition) == 2 + assert len(cursor._partition_parent_state_map) == 2 + assert cursor._partition_parent_state_map['{"id":"1"}'] == {"parent": {"state": "state1"}} + assert cursor._partition_parent_state_map['{"id":"2"}'] == {"parent": {"state": "state2"}} + + # Close partitions to acquire semaphores (value back to 0) + for s in generated_slices: + cursor.close_partition(DeclarativePartition("test_stream", {}, MagicMock(), MagicMock(), s)) + + # Check state after closing partitions + assert len(cursor._finished_partitions) == 2 + assert len(cursor._semaphore_per_partition) == 0 + assert '{"id":"1"}' not in cursor._semaphore_per_partition + assert '{"id":"2"}' not in cursor._semaphore_per_partition + assert len(cursor._partition_parent_state_map) == 0 # All parent states should be popped + assert cursor._parent_state == {"parent": {"state": "state2"}} # Last parent state From a7ece97d56ac5911ebd48b2fb582dd8de3e253c0 Mon Sep 17 00:00:00 2001 From: Anatolii Yatsuk Date: Fri, 21 Feb 2025 17:53:44 +0200 Subject: [PATCH 26/26] Delete testing prints --- .../declarative/incremental/concurrent_partition_cursor.py | 1 - .../incremental/test_concurrent_perpartitioncursor.py | 2 -- 2 files changed, 3 deletions(-) diff --git a/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py b/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py index 3532b4e67..715589026 100644 --- a/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +++ b/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py @@ -251,7 +251,6 @@ def _emit_state_message(self, throttle: bool = True) -> None: self._message_repository.emit_message(state_message) def stream_slices(self) -> Iterable[StreamSlice]: - print("stream_slices") if self._timer.is_running(): raise RuntimeError("stream_slices has been executed more than once.") diff --git a/unit_tests/sources/declarative/incremental/test_concurrent_perpartitioncursor.py b/unit_tests/sources/declarative/incremental/test_concurrent_perpartitioncursor.py index 9e15df5b2..3b4b4fe24 100644 --- a/unit_tests/sources/declarative/incremental/test_concurrent_perpartitioncursor.py +++ b/unit_tests/sources/declarative/incremental/test_concurrent_perpartitioncursor.py @@ -3144,7 +3144,6 @@ def 
test_given_unfinished_first_parent_partition_no_parent_state_update(): DeclarativePartition("test_stream", {}, MagicMock(), MagicMock(), slice) ) cursor.ensure_at_least_one_state_emitted() - print(cursor.state) state = cursor.state assert state == { @@ -3229,7 +3228,6 @@ def test_given_unfinished_last_parent_partition_with_partial_parent_state_update DeclarativePartition("test_stream", {}, MagicMock(), MagicMock(), slice) ) cursor.ensure_at_least_one_state_emitted() - print(cursor.state) state = cursor.state assert state == {