-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathstrava.py
268 lines (241 loc) · 11.2 KB
/
strava.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
# Standard Library Imports
import argparse
import logging
import time
from datetime import datetime, timedelta
from typing import Any
# Third-Party Imports
import dlt
from dlt.sources.helpers.rest_client.auth import OAuth2ClientCredentials
from dlt.sources.helpers.rest_client.paginators import PageNumberPaginator
from dlt.sources.helpers.requests import Request
from dlt.common.pendulum import pendulum
from dlt.sources.rest_api import RESTAPIConfig, rest_api_resources
from tqdm import tqdm
# Root logger setup: records at DEBUG level and above go to `.dlt/strava.log`.
# File mode "w" truncates the log on every run ("a" would append instead).
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.DEBUG,
    filename=".dlt/strava.log",
    filemode="w",
)
# Explicitly put the urllib3 logger at DEBUG as well.
logging.getLogger("urllib3").setLevel(logging.DEBUG)
class OAuth2ClientCredentialsHTTPRefresh(OAuth2ClientCredentials):
    """OAuth2 client-credentials auth with a plain form-encoded token request.

    Overrides the token-request builder so that the request consists only of
    a form content-type header and ``self.access_token_request_data``.
    """

    def build_access_token_request(self) -> dict[str, Any]:
        """Return the ``headers`` and ``data`` used for the access-token request."""
        # The refresh-token request doesn't require authentication (or base64
        # encoding), so the only header sent is the form content type.
        form_headers = {"Content-Type": "application/x-www-form-urlencoded"}
        return {"headers": form_headers, "data": self.access_token_request_data}
def auth_strava():
    """Build the OAuth2 auth object for Strava from values in ``dlt.secrets``.

    Every credential is read from the ``sources.strava.credentials`` section.
    The token request uses the ``refresh_token`` grant type.

    Returns:
        An ``OAuth2ClientCredentialsHTTPRefresh`` instance.
    """
    token_url = dlt.secrets["sources.strava.credentials.access_token_url"]

    # Form body of the refresh-token request.
    refresh_request_body = {
        "grant_type": "refresh_token",
        # Per the original author's note, this refresh token is scoped with
        # read_all, activity:read_all and profile:read_all.
        "refresh_token": dlt.secrets["sources.strava.credentials.refresh_token"],
        "client_id": dlt.secrets["sources.strava.credentials.client_id"],
        "client_secret": dlt.secrets["sources.strava.credentials.client_secret"],
        # A "scope" entry could be added here to narrow the scope of the
        # access token that is returned.
    }

    return OAuth2ClientCredentialsHTTPRefresh(
        access_token_url=token_url,
        access_token_request_data=refresh_request_body,
        default_token_expiration=21600,  # 6 hours, in seconds
    )
def parse_arguments():
parser = argparse.ArgumentParser(description="Run the DLT pipeline.")
parser.add_argument("--start-date", type=str, default=None, help="Specify the start date >= for loading data (e.g. '2024-01-01')")
parser.add_argument("--end-date", type=str, default=None, help="Specify the end date < for loading data (e.g. '2024-07-01')")
return parser.parse_args()
class SharedRateLimiter:
    """Request budget that several paginators can share.

    At most ``max_requests`` calls to :meth:`request` are let through per
    ``period``-long window. When the budget of the current window is spent,
    the next call blocks (showing a progress bar) until that window ends.

    NOTE(review): windows are measured from this object's own wall clock
    (``datetime.now()``), not from any server-side boundary -- confirm that
    this matches the rate-limit policy of the API being called.
    """

    def __init__(self, max_requests, period):
        """Create a limiter whose first window starts now.

        Args:
            max_requests: Number of requests allowed per window.
            period: Window length as a ``datetime.timedelta``.
        """
        self.max_requests = max_requests
        self.period = period
        self.session_requests = 0  # requests counted in the current window
        self.total_requests = 0  # requests counted since construction
        self.start_time = datetime.now()  # start of the current window

    def request(self):
        """Count one request, waiting first if the window's budget is spent."""
        now = datetime.now()
        if now - self.start_time >= self.period:
            # The window has already run out on its own. Start a fresh one
            # right away so that requests counted in the stale window do not
            # eat into (or delay the start of) the new window's budget.
            self.start_time = now
            self.session_requests = 0
        elif self.session_requests >= self.max_requests:
            # Budget spent inside a live window: wait for the window to end.
            self.sleep()
            self.session_requests = 0
        self.session_requests += 1
        self.total_requests += 1

    def sleep(self):
        """Block until the current window ends, then start a new window."""
        window_end = self.start_time + self.period
        sleep_time = (window_end - datetime.now()).total_seconds()
        if sleep_time > 0:
            self.loading_bar(sleep_time)
        self.start_time = datetime.now()

    def loading_bar(self, sleep_time):
        """
        Loading bar to track and provide a visual of sleep time.
        When the "loading" completes, the next batch of requests will be made.

        Args:
            sleep_time: Seconds to wait; may be fractional.
        """
        print("Rate limit reached. Waiting...")
        self.print_request_count()
        whole_seconds = int(sleep_time)
        with tqdm(total=whole_seconds, desc="Sleeping", unit="s", ncols=80, colour='blue', bar_format="{l_bar}{bar}| {remaining} seconds remaining") as pbar:
            for _ in range(whole_seconds):
                time.sleep(1)
                pbar.update(1)
            # The bar ticks in whole seconds; also wait out the fractional
            # remainder so we never resume before the window has ended.
            time.sleep(max(0.0, sleep_time - whole_seconds))

    def print_request_count(self):
        """Print the number of requests counted since construction."""
        print(f"Total request count: {self.total_requests}")
class RateLimitedPageNumberPaginator(PageNumberPaginator):
    """Page-number paginator that reports every request update to a rate limiter.

    Each call to ``update_request`` registers one request with the given
    limiter, bumps this paginator's own counter, and prints that counter.
    """

    def __init__(self, rate_limiter, resource_name, **kwargs):
        """Set up the paginator.

        Args:
            rate_limiter: Object whose ``request()`` method is called once per
                ``update_request``.
            resource_name: Label used in the printed request count.
            **kwargs: Forwarded unchanged to ``PageNumberPaginator``.
        """
        super().__init__(**kwargs)
        self.resource_name = resource_name
        self.rate_limiter = rate_limiter
        self.resource_requests = 0  # update_request calls seen by this paginator

    def update_request(self, request: Request) -> None:
        """Apply the parent's request update, then account for the request."""
        super().update_request(request)
        # May block inside the limiter when the shared budget is used up.
        self.rate_limiter.request()
        self.resource_requests = self.resource_requests + 1
        self.print_request_count()

    def print_request_count(self):
        """Print how many requests this paginator has counted so far."""
        message = f"Resource {self.resource_name} request count: {self.resource_requests}"
        print(message)
# Single limiter instance handed to every paginator in this module:
# a budget of 95 requests per 15-minute window.
shared_rate_limiter = SharedRateLimiter(period=timedelta(minutes=15), max_requests=95)
@dlt.source(name="strava")
def strava_source(start_date: str | None = None, end_date: str | None = None):
    """dlt source that yields the Strava REST API resources.

    Three resources are declared: ``activities`` (loaded incrementally on its
    ``start_date`` field) and two child resources fetched once per activity
    id, ``activity_streams`` and ``activity_zones``.

    Args:
        start_date: Date string for the lower bound of the load. When falsy,
            the ``last_value`` entry of the dlt source state is used instead;
            if that entry does not exist it is seeded with 30 days before
            today.
        end_date: Date string for the upper bound of the load. When falsy, no
            end value is set.

    Yields:
        Whatever ``rest_api_resources`` produces for the configuration.
    """
    # Lower bound: the explicit --start-date wins, otherwise fall back to the
    # value remembered in the source state (defaulting to 30 days ago).
    if start_date:
        load_from_date = pendulum.parse(start_date).to_iso8601_string()
    else:
        source_state = dlt.current.source_state()
        load_from_date = source_state.setdefault(
            "last_value",
            pendulum.today().subtract(days=30).to_iso8601_string(),
        )

    # Upper bound: only set when --end-date was given.
    load_until_date = None
    if end_date:
        load_until_date = pendulum.parse(end_date).to_iso8601_string()

    def _to_epoch(timestamp_str):
        # Converts a date string to epoch seconds; None is passed through.
        if timestamp_str is None:
            return None
        return int(pendulum.parse(timestamp_str).timestamp())

    def _parent_activity_id():
        # Resolves the {activity_id} placeholder from the parent `activities` resource.
        return {
            "type": "resolve",
            "resource": "activities",
            "field": "id",
        }

    def _ignore_not_found():
        # Per the original author's note: a "manual entry" activity has no
        # child data and its URL answers 404; this skips those responses
        # instead of failing.
        return [
            {"status_code": 404, "content": "Not Found", "action": "ignore"},
        ]

    def _single_page_paginator(resource_name):
        # The child endpoints return a single page. Without maximum_page=1,
        # dlt would keep incrementing the page number for the same activity.
        return RateLimitedPageNumberPaginator(
            rate_limiter=shared_rate_limiter,
            base_page=1,
            total_path=None,
            maximum_page=1,
            resource_name=resource_name,
        )

    client_config = {
        "base_url": "https://www.strava.com/api/v3/",
        "auth": auth_strava(),
    }

    # Defaults applied to every resource and its endpoint.
    resource_defaults = {
        "primary_key": "id",
        # Strava activities have no updated_at (or similar) field, so the most
        # recently extracted record simply replaces the stored one.
        "write_disposition": "merge",
        "endpoint": {
            "params": {
                # Original author's note: max is 200.
                # NOTE(review): that note also called 200 Strava's default -- confirm.
                "per_page": 200,
            },
        },
    }

    activities_resource = {
        "name": "activities",
        "primary_key": "id",
        "endpoint": {
            "path": "activities",
            "paginator": RateLimitedPageNumberPaginator(
                rate_limiter=shared_rate_limiter,
                base_page=1,
                total_path=None,
                resource_name="activities",
            ),
            "incremental": {
                # Strava API param: only records after (>) the given epoch time.
                "start_param": "after",
                # Strava API param: only records before (<) the given epoch time.
                "end_param": "before",
                "cursor_path": "start_date",
                "initial_value": load_from_date,
                "end_value": load_until_date,
                "convert": _to_epoch,
            },
        },
    }

    # Child resource: uses the parent resource (`activities`) to obtain the
    # `activity_id` that is placed into the endpoint path.
    activity_streams_resource = {
        "name": "activity_streams",
        # _activities_id comes from the parent activity; see include_from_parent.
        "primary_key": ["_activities_id", "type"],
        # The data field is an array; 0 tells dlt not to flatten it out into
        # another table (e.g. activity_streams__data). Default is 1000.
        "max_table_nesting": 0,
        "endpoint": {
            "path": "activities/{activity_id}/streams",
            "response_actions": _ignore_not_found(),
            "paginator": _single_page_paginator("activity_streams"),
            "params": {
                # Desired stream types (Strava param); latlng is not requested.
                "keys": "time,distance,altitude,velocity_smooth,heartrate,cadence,watts,temp,moving,grade_smooth",
                "activity_id": _parent_activity_id(),
            },
        },
        # Copy the parent's `id` into the child rows; the column is named
        # `_activities_id` (_{resource_name}_{field_name}).
        "include_from_parent": ["id"],
    }

    activity_zones_resource = {
        "name": "activity_zones",
        # _activities_id comes from the parent activity; see include_from_parent.
        "primary_key": ["_activities_id", "type"],
        "endpoint": {
            "path": "activities/{activity_id}/zones",
            "response_actions": _ignore_not_found(),
            "paginator": _single_page_paginator("activity_zones"),
            "params": {
                "activity_id": _parent_activity_id(),
            },
        },
        "include_from_parent": ["id"],
    }

    # REST API configuration for the Strava API; the RESTAPIConfig annotation
    # gives autocompletion and type checking.
    config: RESTAPIConfig = {
        "client": client_config,
        "resource_defaults": resource_defaults,
        "resources": [
            activities_resource,
            activity_streams_resource,
            activity_zones_resource,
        ],
    }

    yield from rest_api_resources(config)
def load_strava() -> None:
    """Run the Strava source into a DuckDB destination and print the load info.

    The optional start/end dates are taken from the command line.
    """
    cli_args = parse_arguments()

    # The pipeline is created before the source is instantiated.
    pipeline = dlt.pipeline(
        pipeline_name="strava_datastack",
        dataset_name="strava",
        destination='duckdb',
        progress="log",
    )

    source = strava_source(
        start_date=cli_args.start_date,
        end_date=cli_args.end_date,
    )
    load_info = pipeline.run(source)
    print(load_info)
# Script entry point: run the pipeline only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    load_strava()