fixation_mapping.py
import argparse
import json
import os
import time
import urllib.request

import cv2
import numpy as np
import pandas as pd
from numpy.linalg import norm

from detectron2.config import get_cfg
from detectron2.engine import DefaultPredictor
from detectron2.utils.logger import setup_logger
parser = argparse.ArgumentParser(description='Fixation mapping for Tobii output with FAIR Detectron2')
parser.add_argument('Detectron', metavar='dp', type=str, help='Path to your Detectron2 package folder')
parser.add_argument('VideoFile', metavar='vf', type=str, help='Path to the recording video')
parser.add_argument('DataFile', metavar='ff', type=str, help='Path to the Tobii fixation data export (tab-separated)')
parser.add_argument('OutFolder', metavar='of', type=str, help='Path to the output folder')
args = parser.parse_args()
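# A hypothetical invocation (all paths below are illustrative, not part of the
# script), assuming a tab-separated Tobii export:
#   python fixation_mapping.py <detectron2 folder> <recording.mp4> <fixations.tsv> <output folder>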
detectron_path = args.Detectron
video_path = args.VideoFile
fixation_file_path = args.DataFile
output_folder = args.OutFolder
# example inputs:
# detectron_path = r'C:\Users\marki\detectron2'
# video_path = r'F:\Play\synch_video_data\recording30_full.mp4'
# fixation_file_path = r'F:\Play\synch_video_data\example.xlsx'
# output_folder = os.getcwd()
# get path to model
cwd = os.getcwd()
model_path = os.path.join(cwd, 'model')
# setup detectron logger
setup_logger()
# get config and model
cfg = get_cfg()
cfg.merge_from_file(os.path.join(detectron_path, 'configs', 'COCO-PanopticSegmentation', 'panoptic_fpn_R_50_1x.yaml'))
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.5 # set threshold for this model
weights_file = os.path.join(model_path, 'model_final_dbfeb4.pkl')
if not os.path.isfile(weights_file):
    print('downloading model weights... ')
    if not os.path.exists(model_path):
        os.makedirs(model_path)
    urllib.request.urlretrieve(
        'https://dl.fbaipublicfiles.com/detectron2/COCO-PanopticSegmentation/panoptic_fpn_R_50_1x/139514544/model_final_dbfeb4.pkl',
        weights_file)
    print('weights downloaded. ')
else:
    print('using local weight file... ')
cfg.MODEL.WEIGHTS = weights_file
predictor = DefaultPredictor(cfg)
# read label json
with open('labels/coco_stuff_rev.json') as json_file:
stuff_rev = json.load(json_file)
stuff_rev = {int(k): v for k, v in stuff_rev.items()}
with open('labels/coco_thing_rev.json') as json_file:
thing_rev = json.load(json_file)
thing_rev = {int(k): v for k, v in thing_rev.items()}
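# Both label files are expected to map COCO category ids to names, e.g.
# (ids illustrative): {"0": "person", ..., "67": "cell phone"}.
# JSON keys are strings, hence the int() conversion above.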
# process video and map fixation
def map_fixation(video_path, fixation_path):
    """Process the video and map each fixation to a semantic target."""
    df = pd.read_csv(fixation_path, delimiter='\t')
    fixation_df = df[df['Eye movement type'] == 'Fixation']
    # mid-frame timestamp of each fixation, as in manual mapping
    # (numeric_only avoids errors on non-numeric columns in newer pandas)
    mid_fixation = fixation_df.groupby(['Eye movement type index']).median(numeric_only=True).reset_index()
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS)
    print('start to map fixations, total fixations to map:', len(mid_fixation))
    # loop through fixations and do the mapping
    time_offset = df.iloc[0]['Recording timestamp']  # segment frame offset
    # todo: where to start time, full video or segmented video?
    mid_fixation['target'] = ""
    mid_fixation['phone_x'] = ""
    mid_fixation['phone_y'] = ""
    for i in mid_fixation.index:
        start_time = time.time()
        timestamp = mid_fixation.at[i, 'Recording timestamp']
        fix_x = mid_fixation.at[i, 'Fixation point X']
        fix_y = mid_fixation.at[i, 'Fixation point Y']
        # frame_no = round(((timestamp - time_offset) / 1000) * fps)  # if the video is also segmented
        frame_no = round((timestamp / 1000) * fps)  # for the full video, assuming timestamps in ms
        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_no)
        ret, frame = cap.read()  # seeking takes between 300 ms and 1300 ms...?!
        if not ret:
            print(frame_no, 'frame could not be read')
            continue
        end_1 = time.time()
        # do segmentation
        outputs = predictor(frame)
        segments, categories = outputs['panoptic_seg']
        end_2 = time.time()
        segmentation_time = end_2 - end_1  # about 200 ms, up to 600
        # locate the fixation in the panoptic mask
        masks = segments.cpu().numpy()  # shape (1080, 1920)
        segment_id = masks[int(fix_y), int(fix_x)]  # note: row = y, column = x!
        if segment_id == 0:  # 0 marks void pixels that belong to no segment
            mid_fixation.at[i, 'target'] = 'unlabeled'
            continue
        category_id = categories[segment_id - 1]['category_id']
        is_thing = categories[segment_id - 1]['isthing']
        # category id to name
        if is_thing:
            try:
                category = thing_rev[category_id]
                # if it's a phone, also map to phone coordinates
                if category == 'cell phone':
                    mask = (masks == segment_id)
                    fixation = np.array([int(fix_x), int(fix_y)])  # note the (x, y) order!
                    phone_x, phone_y = fixation_to_phone_coords(mask, fixation)
                    mid_fixation.at[i, 'phone_x'] = phone_x
                    mid_fixation.at[i, 'phone_y'] = phone_y
            except KeyError:
                print(frame_no, category_id, 'not labelled')
                mid_fixation.at[i, 'target'] = 'unlabeled'
                continue
        else:
            try:
                category = stuff_rev[category_id]
            except KeyError:
                print(frame_no, category_id, 'not labelled')
                mid_fixation.at[i, 'target'] = 'unlabeled'
                continue
        end_3 = time.time()
        print('fixation mapped', i + 1, ', time cost:', round(end_3 - start_time, 2), 'seconds')
        # add to df
        mid_fixation.at[i, 'target'] = category
    # only keep the relevant columns
    mid_fixation = mid_fixation[['Recording timestamp', 'Eye movement type index', 'Gaze event duration',
                                 'Fixation point X', 'Fixation point Y', 'target', 'phone_x', 'phone_y']]
    return mid_fixation
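# The returned frame has one row per fixation; 'target' holds the COCO label at
# the fixation point, and 'phone_x'/'phone_y' are filled only when the target
# is a cell phone (fractions of the phone's short and long edge, see below).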
# map to phone coordinates
def fixation_to_phone_coords(mask, fixation):
    """Map a fixation to approximate phone-screen coordinates.

    An affine transformation is not used because:
    a) the size of each phone is different, and
    b) an affine transform based on the detected corners introduces extra
       error when mapping the fixation.
    """
    mask_im = mask.astype(np.uint8) * 255
    try:
        contours, _ = cv2.findContours(mask_im, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
    except ValueError:
        # older OpenCV versions return three values
        _, contours, _ = cv2.findContours(mask_im, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
    # deal with multiple masks: keep the largest contour
    if len(contours) > 1:
        contour = sorted(contours, key=cv2.contourArea)[-1]
    else:
        contour = contours[0]
    rect = cv2.minAreaRect(contour)
    box = cv2.boxPoints(rect)
    box = box.astype(int)  # np.int0 was removed in NumPy 2.0
    bottom_point = np.array(box[0])
    left_point = np.array(box[1])
    top_point = np.array(box[2])
    right_point = np.array(box[3])
    left_edge = norm(left_point - bottom_point)
    right_edge = norm(right_point - bottom_point)
    if left_edge > right_edge:  # assume all phones are rectangular
        # the left edge is the long edge: x is measured across the short edge,
        # y along the long edge
        p3 = bottom_point
        p2 = left_point
        p1 = top_point
        distance_12 = norm(np.cross(p2 - p1, p1 - fixation)) / norm(p2 - p1)  # distance to the shorter edge
        distance_23 = norm(np.cross(p2 - p3, p3 - fixation)) / norm(p2 - p3)  # distance to the longer edge
        phone_x = distance_23 / right_edge
        phone_y = distance_12 / left_edge
    elif left_edge < right_edge:
        p3 = top_point
        p2 = left_point
        p1 = bottom_point
        distance_12 = norm(np.cross(p2 - p1, p1 - fixation)) / norm(p2 - p1)
        distance_23 = norm(np.cross(p2 - p3, p3 - fixation)) / norm(p2 - p3)
        phone_x = distance_23 / left_edge
        phone_y = (right_edge - distance_12) / right_edge
    else:
        print('approximate screen coordinates not calculated.')
        phone_x = -1
        phone_y = -1
    return phone_x, phone_y
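# A worked micro-example of the normalization above (numbers illustrative):
# for a portrait phone whose bounding box has a long edge of 200 px and a
# short edge of 100 px, a fixation 50 px from the long edge and 120 px from
# the short edge maps to phone_x = 50 / 100 = 0.5 and phone_y = 120 / 200 = 0.6,
# i.e. coordinates expressed as fractions of the screen size.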
# run mapping
start_time = time.time()
mapped_fixation = map_fixation(video_path, fixation_file_path)
output_name = 'mapped_' + os.path.basename(fixation_file_path)
mapped_fixation.to_csv(os.path.join(output_folder, output_name), index=False, sep='\t')
end_time = time.time()
print(f"Fixation mapping finished. {len(mapped_fixation)} fixations mapped, "
f"time cost: {round(end_time-start_time)} seconds")
# todo: change the print time thing in mapping function
# todo: one at a time or batch?