-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathdataset_stats.py
323 lines (249 loc) · 14.4 KB
/
dataset_stats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
#!/usr/bin/env python
""" Compute and save statistics that demonstrate various characteristics of the frames and bounding box annotations in
the REAL-Colon dataset.
Usage:
- Update base_dataset_path = "/path/to/dataset/folder" with path to the folder containing the REAL-colon dataset
- python3 dataset_stats.py
Copyright 2023-, Cosmo Intelligent Medical Devices
"""
import os
import pandas as pd
import concurrent.futures
# Import repo scripts
from polyp_detection import export_coco_format
def get_annotation_data(annotation_folder):
    """
    Collect the bounding box ids annotated on every frame of one video.

    Args:
        annotation_folder (str): Folder holding the annotation xml files of a single video. Every file name
            must end in "_<number>.<extension>"; that number defines the processing order.

    Returns:
        dict: Maps each frame id (the annotated image name up to its first '.') to the list of 'unique_id'
            values of the boxes on that frame. Frames without boxes map to an empty list.
    """
    print(f"Processing annotations in folder {annotation_folder}")

    def frame_number(file_name):
        # "<anything>_<number>.<extension>" -> <number>
        trailing_part = file_name.rsplit('_', 1)[-1]
        return int(trailing_part.split('.')[0])

    boxes_per_frame = {}
    # Visit the annotation files in chronological (frame number) order
    for file_name in sorted(os.listdir(annotation_folder), key=frame_number):
        parsed = export_coco_format.parsevocfile(os.path.join(annotation_folder, file_name))
        # The frame id is the image name without its extension
        frame_key = parsed["img_name"].split('.')[0]
        boxes = parsed["boxes"]
        # Every frame gets an entry, even when it carries no boxes
        boxes_per_frame[frame_key] = [box['unique_id'] for box in boxes] if boxes else []
    return boxes_per_frame
def _write_histology_table(ann_data, lesion_info_csv, label_column, count_column, extra_labels, csv_path):
    """Count the labels found in ann_data and save their distribution as one csv table."""
    # One row per label known to the lesion table, followed by the synthetic rows (always ending with 'TOTAL')
    table = lesion_info_csv[[label_column]].drop_duplicates()
    table = pd.concat([table, pd.DataFrame({label_column: extra_labels})], ignore_index=True)
    # dtype=object keeps an empty mapping from turning into a float Series whose keys cannot be merged
    # with the string labels of the table
    counts = pd.Series(ann_data, dtype=object).value_counts()
    counts = counts.rename_axis(label_column).reset_index(name=count_column)
    # Left merge: labels that never occur keep a row (count 0); labels absent from the table are ignored
    table = table.merge(counts, how='left', on=label_column)
    table[count_column] = table[count_column].fillna(0)
    # The 'TOTAL' row is still 0 at this point, so the column sum is the sum of all the other rows
    total_count = table[count_column].sum()
    table.loc[table[label_column] == 'TOTAL', count_column] = total_count
    table['percentage'] = (table[count_column] / total_count * 100).round(2)
    table[count_column] = table[count_column].astype(int)
    table.to_csv(csv_path, index=False)


def create_frame_tables(class_ann_data, ext_ann_data, lesion_info_csv, ext, is_frames):
    """
    Create the csv tables from the data.

    Args:
        class_ann_data (dict): Maps each frame id (or box id) to its histology class label.
        ext_ann_data (dict): Maps each frame id (or box id) to its extended histology label.
        lesion_info_csv (DataFrame): The lesion table; its 'histology_class' and 'histology_extended' columns
            define the labels listed in the output.
        ext (str): Prefix put in front of the csv file names (e.g. "001_", or "" for no prefix).
        is_frames (bool): True to write the frame tables (count column 'frames_count', with the additional
            'MULTIPLE' and 'NEGATIVE FRAME' rows), False to write the bounding box tables (count column
            'bounding_box_count').

    Generates:
        ./stats/{ext}frames_histology_class.csv and ./stats/{ext}frames_histology_extended.csv when is_frames
        is True, otherwise ./stats/{ext}boxes_histology_class.csv and ./stats/{ext}boxes_histology_extended.csv.
        Each table has one row per label plus a closing 'TOTAL' row, with the columns
        [label, count, percentage]. Percentages are rounded to 2 decimals and are left empty (NaN) when the
        total count is 0. The ./stats folder is created if it does not exist.
    """
    if is_frames:
        table_kind = 'frames'
        count_column = 'frames_count'
        extra_labels = ['MULTIPLE', 'NEGATIVE FRAME', 'TOTAL']
    else:
        table_kind = 'boxes'
        count_column = 'bounding_box_count'
        extra_labels = ['TOTAL']

    # Do not depend on the caller having prepared the output folder
    os.makedirs("./stats", exist_ok=True)

    # The class table and the extended table are built the same way, only the label column differs
    for label_column, ann_data in (('histology_class', class_ann_data), ('histology_extended', ext_ann_data)):
        _write_histology_table(ann_data, lesion_info_csv, label_column, count_column, extra_labels,
                               f"./stats/{ext}{table_kind}_{label_column}.csv")
def _frame_label(box_ids, histology_by_box):
    """Return the histology label of a frame given the ids of the boxes annotated on it."""
    if not box_ids:
        return 'NEGATIVE FRAME'
    # Box ids that are missing from the lesion table get the label None
    labels = [histology_by_box.get(box_id) for box_id in box_ids]
    first_label = labels[0]
    # Boxes that disagree with the first one make the frame a 'MULTIPLE' frame
    if any(label != first_label for label in labels[1:]):
        return 'MULTIPLE'
    return first_label


def _study_subset(data, study_prefix):
    """Return the entries of data whose key starts with study_prefix."""
    return {key: value for key, value in data.items() if key.startswith(study_prefix)}


def main():
    """
    Compute the histology distribution of the frames and of the bounding boxes and save it under ./stats.

    Reads lesion_info.csv and the "<study>-<video>_annotations" folders (studies 001-004, videos 001-015)
    found under base_dataset_path, labels every frame with the histology of its boxes and writes, through
    create_frame_tables, one set of csv tables per study plus one for all the studies together, first for
    the frames and then for the bounding boxes.
    """
    # Specify here dataset base path
    base_dataset_path = "/path/to/dataset/folder"
    lesion_info_csv = pd.read_csv(os.path.join(base_dataset_path, "lesion_info.csv"))

    # Retrieve annotations folder
    annotation_folders = [
        os.path.join(base_dataset_path, f"{dataset:03d}-{vv:03d}" + "_annotations")
        for dataset in range(1, 5)
        for vv in range(1, 16)
    ]

    # Histology of every box, keyed by the unique box id. Built once so that labelling a frame is a
    # dictionary lookup instead of a scan of the whole lesion table.
    class_box_data = lesion_info_csv.set_index('unique_object_id')['histology_class'].to_dict()
    ext_box_data = lesion_info_csv.set_index('unique_object_id')['histology_extended'].to_dict()

    # Run concurrently the get_annotation_data over video folders to load video annotation data
    class_ann_data = {}  # frame id -> histology class label, over all the videos
    ext_ann_data = {}  # frame id -> extended histology label, over all the videos
    # More workers than folders are of no use, and ProcessPoolExecutor rejects more than 61 workers on Windows
    max_workers = min(70, len(annotation_folders))
    with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor:
        for frame_boxes in executor.map(get_annotation_data, annotation_folders):
            for frame_id, box_ids in frame_boxes.items():
                class_ann_data[frame_id] = _frame_label(box_ids, class_box_data)
                ext_ann_data[frame_id] = _frame_label(box_ids, ext_box_data)

    # Create the stats folder if it doesn't exist
    os.makedirs("./stats", exist_ok=True)

    studies = [f"00{i}" for i in range(1, 5)]

    # Create frame csv for the four studies
    for study in studies:
        print(f"Creating the frames csv for the study {study}")
        create_frame_tables(_study_subset(class_ann_data, study), _study_subset(ext_ann_data, study),
                            lesion_info_csv, f"{study}_", True)
    # Create frame csv for a collective of the studies
    print("Creating the frames csv for the collective studies")
    create_frame_tables(class_ann_data, ext_ann_data, lesion_info_csv, "", True)

    # Create the bounding box csv for the four studies
    for study in studies:
        print(f"Creating the bounding box csv for the study {study}")
        create_frame_tables(_study_subset(class_box_data, study), _study_subset(ext_box_data, study),
                            lesion_info_csv, f"{study}_", False)
    # Create bounding box csv for a collective of the studies
    print("Creating the bounding box csv for the collective studies")
    create_frame_tables(class_box_data, ext_box_data, lesion_info_csv, "", False)
    print("Script execution completed.")
# Run the statistics only when this file is executed as a script. The guard also matters for the
# ProcessPoolExecutor used in main(): worker processes that import this module must not start main() again.
if __name__ == '__main__':
    main()