consume_logs.py
"""
This script compares read/write line counts of files processed by the
S3 to JSON job by querying the CloudWatch log group where that job writes
its logs.
A message is printed if a mismatch between read and write counts is discovered,
indicating that data went missing or, potentially, that additional data not present
in the original JSON file was written to the JSON dataset.
A CSV file (consume_logs_comparison_report.csv) is written to the current directory
containing the read/write counts for every file matched by the query. This CSV file
contains the fields:
* workflow_run_id : the ID of the workflow run
* cohort : the cohort this data belongs to (adults_v1, pediatric_v1, etc.)
* file_name : the file name
* line_count_access : the number of lines counted during read of the original JSON file
* line_count_creation : the number of lines counted during write to the JSON dataset
* line_count_difference : line_count_access - line_count_creation
If missing data is discovered, another CSV file (consume_logs_missing_data_report.csv)
is written containing only the subset of files with a line_count_difference != 0.
The schema is the same as above.
"""
import argparse
import datetime
import json
import time
from collections import defaultdict
from typing import Dict, List
import boto3
import pandas
def read_args():
parser = argparse.ArgumentParser(
description="Query S3 to JSON CloudWatch logs and compare line counts.",
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
parser.add_argument(
"--log-group-name",
help="The name of the log group to query.",
default="/aws-glue/python-jobs/error",
)
parser.add_argument(
"--query",
help="The query to run against the log group.",
default='fields @message | filter event.action = "list-file-properties"',
)
parser.add_argument(
"--start-datetime",
        help=(
            "Query start time (local time) expressed in a format parseable by "
            "datetime.datetime.strptime. This argument should be "
            "formatted according to `--datetime-format`."
),
required=True,
)
parser.add_argument(
"--end-datetime",
        help=(
            'Default is "now". Query end time (local time) expressed '
            "in a format parseable by datetime.datetime.strptime. This "
            "argument should be formatted according to `--datetime-format`."
),
)
parser.add_argument(
"--datetime-format",
help="The time format to use with datetime.datetime.strptime",
default="%Y-%m-%d %H:%M:%S",
)
args = parser.parse_args()
return args
def get_seconds_since_epoch(datetime_str: str, datetime_format: str) -> int:
"""
Returns the seconds since epoch for a specific datetime (local time)
Args:
        datetime_str (str): A datetime represented by a format parseable by
            datetime.datetime.strptime. If None, the current time is used.
        datetime_format (str): The datetime.datetime.strptime format used
            to parse `datetime_str`.
Returns:
(int) The number of seconds since January 1, 1970 (local time)
"""
if datetime_str is None:
parsed_datetime = datetime.datetime.now()
else:
parsed_datetime = datetime.datetime.strptime(datetime_str, datetime_format)
seconds_since_epoch = int(parsed_datetime.timestamp())
return seconds_since_epoch
def query_logs(
log_group_name: str,
query_string: str,
start_unix_time: int,
end_unix_time: int,
**kwargs: dict,
) -> list:
"""
Query a CloudWatch log group.
Args:
log_group_name (str): The log group on which to perform the query.
query_string (str): The query string to use.
start_unix_time (int): The beginning of the time range to query. The range
is inclusive, so the specified start time is included in the query.
Specified as epoch (Unix) time.
end_unix_time (int): The end of the time range to query. The range
is inclusive, so the specified end time is included in the query.
Specified as epoch (Unix) time.
kwargs (dict): Used during tests to include boto client stubs
Returns:
(list) The results of the query. Empty if no matches found.
Raises:
        UserWarning: If the query finishes with a non-successful status
            (Failed, Cancelled, Timeout, or Unknown).
"""
logs_client = kwargs.get("logs_client", boto3.client("logs"))
start_query_response = logs_client.start_query(
logGroupName=log_group_name,
startTime=start_unix_time,
endTime=end_unix_time,
queryString=query_string,
)
query_response = logs_client.get_query_results(
queryId=start_query_response["queryId"]
)
_check_for_failed_query(query_response)
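    # CloudWatch Logs Insights queries run asynchronously, so poll until the
    # query reports status "Complete", failing fast on any unsuccessful status.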
while query_response["status"] != "Complete":
time.sleep(1)
query_response = logs_client.get_query_results(
queryId=start_query_response["queryId"]
)
_check_for_failed_query(query_response)
return query_response["results"]
def _check_for_failed_query(query_response) -> None:
    """
    Helper function that raises a UserWarning if the query response indicates
    the query did not succeed.
    """
if query_response["status"] in ["Failed", "Cancelled", "Timeout", "Unknown"]:
raise UserWarning(f"Query failed with status \"{query_response['status']}\"")
def group_query_result_by_workflow_run(
query_results: List[List[dict]],
) -> Dict[str, List[dict]]:
"""
Associates log records with their workflow run ID.
Since our log query may have picked up logs from multiple workflow runs,
we organize our log records by mapping the workflow run ID to its respective
log records.
Args:
        query_results (list[list[dict]]): The query results, as returned by `query_logs`.
Returns:
(dict[str, list[dict]]): A dict of workflow run ID mapped to its respective logs.
"""
# e.g., [{'@message': '{...}', '@ptr': '...'}, ...]
log_records = [
{field["field"]: field["value"] for field in log_message}
for log_message in query_results
]
workflow_run_logs = defaultdict(list)
for log_record in log_records:
log_message = json.loads(log_record["@message"])
workflow_run_logs[log_message["process"]["parent"]["pid"]].append(log_message)
return workflow_run_logs
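# Illustrative shape of a single decoded "@message" payload, inferred from the
# fields this script reads (the values shown are hypothetical):
#
# {
#     "event": {"action": "list-file-properties", "type": ["access"]},
#     "process": {"parent": {"pid": "<workflow run ID>"}},
#     "labels": {"cohort": "adults_v1"},
#     "file": {"name": "example.ndjson", "LineCount": 100, "labels": {}}
# }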
def transform_logs_to_dataframe(log_messages: List[dict]) -> pandas.DataFrame:
"""
    Construct a pandas DataFrame from log records.
    The DataFrame contains the information necessary for comparing read/write
    line counts of individual files from a workflow run.
Args:
log_messages (list[dict]): A list of log messages
Returns:
pandas.DataFrame: A pandas DataFrame with columns:
* cohort (str)
* file_name (str)
* event_type (str, either 'access' or 'creation')
* line_count (int)
"""
dataframe_records = []
for log_message in log_messages:
if (
"event" not in log_message
or "type" not in log_message["event"]
or not any(
[k in log_message["event"]["type"] for k in ["access", "creation"]]
)
or all([k in log_message["event"]["type"] for k in ["access", "creation"]])
):
            raise KeyError(
                "Did not find event.type in log message or "
                "event.type contained unexpected values "
                f"for workflow run ID {log_message['process']['parent']['pid']} and "
                f"file {json.dumps(log_message['file']['labels'])}"
)
if "access" in log_message["event"]["type"]:
event_type = "access"
else:
event_type = "creation"
dataframe_record = {
"cohort": log_message["labels"]["cohort"],
"file_name": log_message["file"]["name"],
"event_type": event_type,
"line_count": log_message["file"]["LineCount"],
}
dataframe_records.append(dataframe_record)
log_dataframe = pandas.DataFrame.from_records(dataframe_records)
return log_dataframe
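# Illustrative output of transform_logs_to_dataframe (values are hypothetical):
#
#       cohort       file_name event_type  line_count
# 0  adults_v1  example.ndjson     access         100
# 1  adults_v1  example.ndjson   creation         100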
def report_results(
workflow_run_event_comparison: dict,
comparison_report_path: str = "consume_logs_comparison_report.csv",
missing_data_report_path: str = "consume_logs_missing_data_report.csv",
testing: bool = False,
) -> pandas.DataFrame:
"""
Report any missing data and save the results.
This function reports any differences in the 'line_count_difference' column
for each workflow run and saves the results to a CSV.
Args:
workflow_run_event_comparison (dict): A dictionary mapping workflow runs to
a pandas DataFrame comparing read/write record counts.
comparison_report_path (str, optional): The file path to save the comparison report CSV.
This file contains every read/write comparison that matched in the logs query.
Defaults to "consume_logs_comparison_report.csv".
missing_data_report_path (str, optional): The file path to save the missing data report CSV.
Contains the same information as the comparison report, but subset to comparisons
where a difference in the read/write record count was discovered.
Defaults to "consume_logs_missing_data_report.csv".
testing (bool): Set to True to avoid writing any files. Defaults to False.
    Returns:
        pandas.DataFrame: The missing data report (empty if no differences were found).
"""
all_comparisons = pandas.concat(
workflow_run_event_comparison, names=["workflow_run_id", "index"]
)
all_missing_data = pandas.DataFrame()
for workflow_run in workflow_run_event_comparison:
missing_data = workflow_run_event_comparison[workflow_run].query(
"line_count_difference != 0"
)
if len(missing_data) != 0:
print(
"Discovered differences between records read/write "
f"in workflow run {workflow_run}"
)
missing_data = missing_data.assign(workflow_run_id=workflow_run)
all_missing_data = pandas.concat([all_missing_data, missing_data])
if len(all_missing_data) > 0:
print(f"Writing missing data information to {missing_data_report_path}")
all_missing_data = all_missing_data.set_index("workflow_run_id")
if not testing:
all_missing_data.to_csv(missing_data_report_path)
else:
print("Did not find any differences between records read/write")
print(f"Writing read/write comparison information to {comparison_report_path}")
all_comparisons = all_comparisons.droplevel("index")
if not testing:
all_comparisons.to_csv(comparison_report_path)
return all_missing_data
def main() -> None:
args = read_args()
start_unix_time = get_seconds_since_epoch(
datetime_str=args.start_datetime,
datetime_format=args.datetime_format,
)
end_unix_time = get_seconds_since_epoch(
datetime_str=args.end_datetime,
datetime_format=args.datetime_format,
)
file_property_logs = query_logs(
log_group_name=args.log_group_name,
query_string=args.query,
start_unix_time=start_unix_time,
end_unix_time=end_unix_time,
)
if len(file_property_logs) == 0:
print(
f"The query '{args.query}' did not return any results "
f"in the time range {start_unix_time}-{end_unix_time}"
)
return
workflow_run_logs = group_query_result_by_workflow_run(
query_results=file_property_logs
)
workflow_run_event_comparison = {}
for workflow_run in workflow_run_logs:
workflow_run_dataframe = transform_logs_to_dataframe(
log_messages=workflow_run_logs[workflow_run]
)
access_events = workflow_run_dataframe.query("event_type == 'access'").drop(
"event_type", axis=1
)
creation_events = workflow_run_dataframe.query("event_type == 'creation'").drop(
"event_type", axis=1
)
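        # Left-join so that every file with an access (read) event appears in the
        # comparison, even when no matching creation (write) event was logged.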
event_comparison = access_events.merge(
creation_events,
how="left",
on=["cohort", "file_name"],
suffixes=("_access", "_creation"),
)
event_comparison["line_count_difference"] = (
event_comparison["line_count_access"]
- event_comparison["line_count_creation"]
)
workflow_run_event_comparison[workflow_run] = event_comparison
report_results(workflow_run_event_comparison=workflow_run_event_comparison)
if __name__ == "__main__":
main()