@@ -41,12 +41,18 @@ def add_arguments(self, parser):
41
41
default = None ,
42
42
help = "Comma-separated list of repositories to process (e.g., 'OWASP-BLT/BLT,OWASP-BLT/BLT-Flutter')" ,
43
43
)
44
+ parser .add_argument (
45
+ "--reset" ,
46
+ action = "store_true" ,
47
+ help = "Reset the last_pr_page_processed counter and start from the beginning" ,
48
+ )
44
49
45
50
def handle (self , * args , ** options ):
46
51
days = options ["days" ]
47
52
limit = options ["limit" ]
48
53
verbose = True # Always use verbose mode for debugging
49
54
repos_arg = options ["repos" ]
55
+ reset = options ["reset" ]
50
56
51
57
self .stdout .write (f"Fetching closed PRs from the past { days } days for GSoC repositories" )
52
58
@@ -72,6 +78,7 @@ def handle(self, *args, **options):
72
78
73
79
total_prs_fetched = 0
74
80
total_prs_added = 0
81
+ total_prs_updated = 0
75
82
76
83
for repo_full_name in all_repos :
77
84
try :
@@ -80,16 +87,23 @@ def handle(self, *args, **options):
80
87
# Check if the repository exists in our database
81
88
repo = self .get_or_create_repo (owner , repo_name )
82
89
83
- # Fetch closed PRs from the past year
84
- prs = self .fetch_closed_prs (owner , repo_name , days , verbose )
90
+ # Reset the last_pr_page_processed if requested
91
+ if reset :
92
+ repo .last_pr_page_processed = 0
93
+ repo .save ()
94
+ self .stdout .write (f"Reset last_pr_page_processed for { repo_full_name } " )
85
95
86
- # Save PRs to the database
87
- added_count = self .save_prs_to_db (repo , prs , verbose )
96
+ # Fetch closed PRs updated within the past `days` days and save them page by page
97
+ prs_fetched , prs_added , prs_updated = self .fetch_and_save_prs (repo , owner , repo_name , days , verbose )
88
98
89
- total_prs_fetched += len (prs )
90
- total_prs_added += added_count
99
+ total_prs_fetched += prs_fetched
100
+ total_prs_added += prs_added
101
+ total_prs_updated += prs_updated
91
102
92
- self .stdout .write (f"Processed { repo_full_name } : Fetched { len (prs )} PRs, Added { added_count } new PRs" )
103
+ self .stdout .write (
104
+ f"Processed { repo_full_name } : Fetched { prs_fetched } PRs, "
105
+ f"Added { prs_added } new PRs, Updated { prs_updated } existing PRs"
106
+ )
93
107
94
108
except Exception as e :
95
109
logger .error (f"Error processing repository { repo_full_name } : { str (e )} " , exc_info = True )
@@ -98,7 +112,9 @@ def handle(self, *args, **options):
98
112
self .stdout .write (
99
113
self .style .SUCCESS (
100
114
f"Completed fetching PRs for GSoC repositories. "
101
- f"Total fetched: { total_prs_fetched } , Total added: { total_prs_added } "
115
+ f"Total fetched: { total_prs_fetched } , "
116
+ f"Total added: { total_prs_added } , "
117
+ f"Total updated: { total_prs_updated } "
102
118
)
103
119
)
104
120
@@ -140,35 +156,43 @@ def get_or_create_repo(self, owner, repo_name):
140
156
watchers = repo_data .get ("watchers_count" , 0 ),
141
157
primary_language = repo_data .get ("language" ),
142
158
is_owasp_repo = owner .upper () == "OWASP" ,
159
+ last_pr_page_processed = 0 ,
143
160
)
144
161
repo .save ()
145
162
146
163
return repo
147
164
148
- def fetch_closed_prs (self , owner , repo_name , days , verbose = False ):
165
+ def fetch_and_save_prs (self , repo , owner , repo_name , days , verbose = False ):
149
166
"""
150
- Fetch closed pull requests from GitHub API.
151
- Only fetches PRs from the past specified number of days .
167
+ Fetch closed pull requests from GitHub API and save them to the database .
168
+ Returns a tuple of (total_prs_fetched, total_prs_added, total_prs_updated) .
152
169
"""
153
- prs = []
154
- page = 1
155
- per_page = 100
170
+ total_prs_fetched = 0
171
+ total_prs_added = 0
172
+ total_prs_updated = 0
156
173
157
174
# Calculate date for filtering
158
175
since_date = timezone .now () - timedelta (days = days )
159
176
since_date_str = since_date .strftime ("%Y-%m-%dT%H:%M:%SZ" )
160
177
161
178
self .stdout .write (f"Fetching PRs since { since_date_str } for { owner } /{ repo_name } " )
162
179
self .stdout .write (f"Current date: { timezone .now ().strftime ('%Y-%m-%dT%H:%M:%SZ' )} , Looking back { days } days" )
180
+ self .stdout .write (f"Starting from page { repo .last_pr_page_processed + 1 } " )
163
181
182
+ # Set up headers for GitHub API
164
183
headers = {"Accept" : "application/vnd.github.v3+json" }
165
184
if settings .GITHUB_TOKEN :
166
185
headers ["Authorization" ] = f"token { settings .GITHUB_TOKEN } "
167
186
self .stdout .write ("Using GitHub token for authentication" )
168
187
else :
169
188
self .stdout .write ("No GitHub token found, using unauthenticated requests (rate limits may apply)" )
170
189
171
- while True :
190
+ # Start from the last processed page + 1
191
+ page = repo .last_pr_page_processed + 1
192
+ per_page = 100
193
+ reached_end = False
194
+
195
+ while not reached_end :
172
196
url = (
173
197
f"https://api.github.com/repos/{ owner } /{ repo_name } /pulls"
174
198
f"?state=closed&per_page={ per_page } &page={ page } &sort=updated&direction=desc"
@@ -185,6 +209,7 @@ def fetch_closed_prs(self, owner, repo_name, days, verbose=False):
185
209
data = response .json ()
186
210
if not data :
187
211
self .stdout .write (f"No more PRs found for { owner } /{ repo_name } on page { page } " )
212
+ reached_end = True
188
213
break
189
214
190
215
self .stdout .write (f"Fetched { len (data )} PRs from page { page } " )
@@ -193,11 +218,21 @@ def fetch_closed_prs(self, owner, repo_name, days, verbose=False):
193
218
merged_count = sum (1 for pr in data if pr .get ("merged_at" ) is not None )
194
219
self .stdout .write (f"Found { merged_count } merged PRs on page { page } " )
195
220
196
- prs .extend (data )
221
+ # Process this page of PRs
222
+ prs_added , prs_updated = self .save_prs_to_db (repo , data , verbose )
223
+ total_prs_fetched += len (data )
224
+ total_prs_added += prs_added
225
+ total_prs_updated += prs_updated
226
+
227
+ # Update the repository's last processed page
228
+ repo .last_pr_page_processed = page
229
+ repo .last_pr_fetch_date = timezone .now ()
230
+ repo .save ()
197
231
198
232
# Check if we've reached the last page
199
233
if len (data ) < per_page :
200
234
self .stdout .write (f"Reached last page ({ page } ) for { owner } /{ repo_name } " )
235
+ reached_end = True
201
236
break
202
237
203
238
page += 1
@@ -208,27 +243,36 @@ def fetch_closed_prs(self, owner, repo_name, days, verbose=False):
208
243
break
209
244
210
245
if verbose :
211
- self .stdout .write (f"Fetched { len (prs )} PRs for { owner } /{ repo_name } " )
212
- merged_prs = sum (1 for pr in prs if pr .get ("merged_at" ) is not None )
213
- self .stdout .write (f"Of which { merged_prs } are merged PRs" )
246
+ self .stdout .write (f"Fetched { total_prs_fetched } PRs for { owner } /{ repo_name } " )
247
+ merged_prs = sum (
248
+ 1
249
+ for pr in GitHubIssue .objects .filter (
250
+ repo = repo , type = "pull_request" , is_merged = True , created_at__gte = since_date
251
+ )
252
+ )
253
+ self .stdout .write (f"Total merged PRs in database: { merged_prs } " )
214
254
215
- return prs
255
+ return total_prs_fetched , total_prs_added , total_prs_updated
216
256
217
257
@transaction .atomic
218
258
def save_prs_to_db (self , repo , prs , verbose = False ):
219
259
"""
220
260
Save pull requests to the database.
221
- Returns the number of new PRs added.
261
+ Returns a tuple of (added_count, updated_count).
222
262
"""
223
263
added_count = 0
264
+ updated_count = 0
224
265
skipped_count = 0
266
+ skipped_not_merged = 0
267
+
268
+ self .stdout .write (f"Processing { len (prs )} PRs for { repo .name } " )
225
269
226
270
for pr in prs :
227
- # Check if PR already exists in the database
228
- if GitHubIssue . objects . filter ( issue_id = pr [ "id" ]). exists ( ):
229
- skipped_count += 1
271
+ # Skip PRs that aren't merged
272
+ if not pr . get ( "merged_at" ):
273
+ skipped_not_merged += 1
230
274
if verbose :
231
- self .stdout .write (f"PR { pr ['number' ]} already exists in the database " )
275
+ self .stdout .write (f"PR { pr ['number' ]} is not merged, skipping " )
232
276
continue
233
277
234
278
# Parse dates
@@ -249,29 +293,44 @@ def save_prs_to_db(self, repo, prs, verbose=False):
249
293
user_profile = None
250
294
if pr ["user" ] and pr ["user" ]["html_url" ]:
251
295
user_profile = UserProfile .objects .filter (github_url = pr ["user" ]["html_url" ]).first ()
296
+ if not user_profile and verbose :
297
+ self .stdout .write (f"No user profile found for { pr ['user' ]['html_url' ]} " )
298
+
299
+ # Prepare the data for the GitHubIssue
300
+ issue_data = {
301
+ "title" : pr ["title" ],
302
+ "body" : pr ["body" ] or "" ,
303
+ "state" : pr ["state" ],
304
+ "type" : "pull_request" ,
305
+ "created_at" : created_at ,
306
+ "updated_at" : updated_at ,
307
+ "closed_at" : closed_at ,
308
+ "merged_at" : merged_at ,
309
+ "is_merged" : is_merged ,
310
+ "url" : pr ["html_url" ],
311
+ "repo" : repo ,
312
+ "user_profile" : user_profile ,
313
+ }
314
+
315
+ # Try to get the existing issue or create a new one
316
+ try :
317
+ github_issue , created = GitHubIssue .objects .update_or_create (issue_id = pr ["id" ], defaults = issue_data )
318
+
319
+ if created :
320
+ added_count += 1
321
+ if verbose :
322
+ self .stdout .write (f"Added PR #{ pr ['number' ]} : { pr ['title' ]} " )
323
+ else :
324
+ updated_count += 1
325
+ if verbose :
326
+ self .stdout .write (f"Updated PR #{ pr ['number' ]} : { pr ['title' ]} " )
327
+ except Exception as e :
328
+ self .stdout .write (self .style .ERROR (f"Error saving PR #{ pr ['number' ]} : { str (e )} " ))
329
+ skipped_count += 1
252
330
253
- # Create the GitHubIssue
254
- github_issue = GitHubIssue (
255
- issue_id = pr ["id" ],
256
- title = pr ["title" ],
257
- body = pr ["body" ] or "" ,
258
- state = pr ["state" ],
259
- type = "pull_request" ,
260
- created_at = created_at ,
261
- updated_at = updated_at ,
262
- closed_at = closed_at ,
263
- merged_at = merged_at ,
264
- is_merged = is_merged ,
265
- url = pr ["html_url" ],
266
- repo = repo ,
267
- user_profile = user_profile ,
268
- )
269
- github_issue .save ()
270
-
271
- added_count += 1
272
-
273
- if verbose :
274
- self .stdout .write (f"Added PR #{ pr ['number' ]} : { pr ['title' ]} " )
331
+ self .stdout .write (f"Skipped { skipped_count } PRs due to errors" )
332
+ self .stdout .write (f"Skipped { skipped_not_merged } PRs that are not merged" )
333
+ self .stdout .write (f"Added { added_count } new PRs to the database" )
334
+ self .stdout .write (f"Updated { updated_count } existing PRs in the database" )
275
335
276
- self .stdout .write (f"Skipped { skipped_count } PRs that already exist in the database" )
277
- return added_count
336
+ return added_count , updated_count
0 commit comments