import os
import sys
import warnings

import numpy as np
import pandas as pd
import torch
import jiwer
import librosa
from pystoi import stoi
import transformers
from transformers import Wav2Vec2ForMaskedLM, Wav2Vec2Tokenizer
'''
Functions to compute the metrics for the two tasks of the L3DAS21 challenge.
- task1_metric returns the metric for task 1.
- location_sensitive_detection returns the metric for task 2.
Both functions require numpy matrices as input and compute only one data point
at a time. Please have a look at the "evaluation_baseline_taskX.py" scripts
for detailed examples on the use of these functions.
'''
#TASK 1 METRICS
warnings.filterwarnings("ignore", category=FutureWarning)
transformers.logging.set_verbosity_error()
#ASR model used to transcribe clean and denoised speech for the WER
#computation; loaded once at import time because wer() relies on it
wer_tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")
wer_model = Wav2Vec2ForMaskedLM.from_pretrained("facebook/wav2vec2-base-960h")
def wer(clean_speech, denoised_speech):
    """
    Computes the word error rate (WER) between the ASR transcriptions of the
    clean and the denoised speech, for a single data point.
    """
    def _transcription(clean_speech, denoised_speech):
        #transcribe clean audio
        input_values = wer_tokenizer(clean_speech, return_tensors="pt").input_values
        logits = wer_model(input_values).logits
        predicted_ids = torch.argmax(logits, dim=-1)
        transcript_clean = wer_tokenizer.batch_decode(predicted_ids)[0]
        #transcribe denoised audio
        input_values = wer_tokenizer(denoised_speech, return_tensors="pt").input_values
        logits = wer_model(input_values).logits
        predicted_ids = torch.argmax(logits, dim=-1)
        transcript_estimate = wer_tokenizer.batch_decode(predicted_ids)[0]
        return [transcript_clean, transcript_estimate]

    transcript = _transcription(clean_speech, denoised_speech)
    try:
        wer_val = jiwer.wer(transcript[0], transcript[1])
    except ValueError:  #raised when no words are predicted (empty transcript)
        wer_val = None
    return wer_val

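#Illustrative sketch (not part of the challenge pipeline): jiwer.wer compares
#two strings at the word level, so one substitution in a four-word reference
#yields a WER of 0.25.
def _wer_example():
    reference = 'the cat sat down'
    hypothesis = 'the cat sat up'  #one substituted word out of four
    return jiwer.wer(reference, hypothesis)  #0.25
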
def task1_metric(clean_speech, denoised_speech, sr=16000):
    '''
    Compute the evaluation metric for task 1 as (STOI + (1 - WER)) / 2.
    This function computes the measure for a single data point.
    '''
    WER = wer(clean_speech, denoised_speech)
    if WER is not None:
        STOI = stoi(clean_speech, denoised_speech, sr, extended=False)
        WER = np.clip(WER, 0., 1.)
        STOI = np.clip(STOI, 0., 1.)
        metric = (STOI + (1. - WER)) / 2.
    else:  #no speech in the segment: the metric is undefined
        metric = None
        STOI = None
    return metric, WER, STOI

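#Hedged usage sketch: the task 1 metric averages STOI and (1 - WER), both
#clipped to [0, 1], so it ranges from 0 to 1 with 1 meaning perfect denoising.
#The signals below are synthetic and only illustrate the call signature.
def _task1_metric_example(sr=16000):
    rng = np.random.default_rng(0)
    clean = rng.standard_normal(sr).astype(np.float32)  #1 s of noise standing in for clean speech
    denoised = clean + 0.1 * rng.standard_normal(sr).astype(np.float32)
    #note: pure noise may transcribe to an empty string, in which case the
    #metric is None (the no-speech branch of task1_metric)
    return task1_metric(clean, denoised, sr=sr)
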
def compute_se_metrics(predicted_folder, truth_folder, fs=16000):
    '''
    Load all submitted sounds for task 1 and compute the average metric.
    '''
    METRIC = []
    WER = []
    STOI = []
    predicted_list = [s for s in os.listdir(predicted_folder) if '.wav' in s]
    truth_list = [s for s in os.listdir(truth_folder) if '.wav' in s]
    n_sounds = len(predicted_list)
    for i in range(n_sounds):
        name = str(i) + '.wav'
        predicted_temp_path = os.path.join(predicted_folder, name)
        truth_temp_path = os.path.join(truth_folder, name)
        predicted, _ = librosa.load(predicted_temp_path, sr=fs)  #librosa.load returns (audio, sr)
        truth, _ = librosa.load(truth_temp_path, sr=fs)
        metric, wer_val, stoi_val = task1_metric(truth, predicted)
        if metric is not None:  #skip segments without speech, where the metric is undefined
            METRIC.append(metric)
            WER.append(wer_val)
            STOI.append(stoi_val)
    average_metric = np.mean(METRIC)
    average_wer = np.mean(WER)
    average_stoi = np.mean(STOI)
    print('*******************************')
    print('Task 1 metric: ', average_metric)
    print('Word error rate: ', average_wer)
    print('Stoi: ', average_stoi)
    return average_metric

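#Assumed usage (the paths below are hypothetical): both folders must contain
#files named 0.wav, 1.wav, ..., with index i pairing each prediction with its
#ground truth.
#compute_se_metrics('./results/task1/pred', './results/task1/truth')
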
#TASK 2 METRICS
sound_classes_dict_task2 = {'Chink_and_clink': 0,
                            'Computer_keyboard': 1,
                            'Cupboard_open_or_close': 2,
                            'Drawer_open_or_close': 3,
                            'Female_speech_and_woman_speaking': 4,
                            'Finger_snapping': 5,
                            'Keys_jangling': 6,
                            'Knock': 7,
                            'Laughter': 8,
                            'Male_speech_and_man_speaking': 9,
                            'Printer': 10,
                            'Scissors': 11,
                            'Telephone': 12,
                            'Writing': 13}
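#Convenience addition (not part of the original metric definitions): inverse
#mapping, so numeric class ids can be reported by name.
sound_classes_inverse_task2 = {v: k for k, v in sound_classes_dict_task2.items()}
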
def location_sensitive_detection(pred, true, n_frames=100, spatial_threshold=2.,
                                 from_csv=False, verbose=False):
    '''
    Compute TP, FP, FN and F score of a single data point using location
    sensitive detection: a prediction counts as correct only if the class
    label matches and the spatial error is below spatial_threshold.
    '''
    TP = 0  #true positives
    FP = 0  #false positives
    FN = 0  #false negatives
    #read csv files into numpy matrices if required
    if from_csv:
        pred = pd.read_csv(pred, sep=',', header=None)
        true = pd.read_csv(true, sep=',', header=None)
        pred = pred.values
        true = true.values
    #build empty dict with a key for each time frame
    frames = {}
    for i in range(n_frames):
        frames[i] = {'p': [], 't': []}
    #fill each time frame key with predicted and true entries for that frame
    for i in pred:
        frames[i[0]]['p'].append(i)
    for i in true:
        frames[i[0]]['t'].append(i)
    #iterate each time frame:
    for frame in range(n_frames):
        t = frames[frame]['t']  #all true events for this frame
        p = frames[frame]['p']  #all predicted events for this frame
        matched = 0  #counts the matching events
        if len(t) == 0:    #there are PREDICTED but no TRUE events
            FP += len(p)   #all predicted events are false positives
        elif len(p) == 0:  #there are TRUE but no PREDICTED events
            FN += len(t)   #all true events are false negatives
        else:
            for i_t in range(len(t)):  #iterate all true events
                match = False  #flag for matching events
                #check whether this true event has a matching predicted event
                true_class = t[i_t][1]    #true class
                true_coord = t[i_t][-3:]  #true coordinates
                for i_p in range(len(p)):  #compare each true event with all predicted events
                    pred_class = p[i_p][1]    #predicted class
                    pred_coord = p[i_p][-3:]  #predicted coordinates
                    spat_error = np.linalg.norm(true_coord - pred_coord)  #euclidean distance between spatial coords
                    if true_class == pred_class and spat_error < spatial_threshold:
                        #prediction is correct: same label and within the spatial error threshold
                        match = True
                if match:
                    matched += 1  #each true event is matched at most once
            num_true_items = len(t)
            num_pred_items = len(p)
            fn = num_true_items - matched
            fp = num_pred_items - matched
            #add to counts
            TP += matched  #matches are directly true positives
            FN += fn
            FP += fp
    precision = TP / (TP + FP + sys.float_info.epsilon)
    recall = TP / (TP + FN + sys.float_info.epsilon)
    F_score = 2 * ((precision * recall) / (precision + recall + sys.float_info.epsilon))
    if verbose:
        print('*******************************')
        print('F score: ', F_score)
        print('Precision: ', precision)
        print('Recall: ', recall)
        print('TP: ', TP)
        print('FP: ', FP)
        print('FN: ', FN)
    return TP, FP, FN, F_score

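#Minimal worked example with synthetic rows, assuming the column layout
#[frame, class, ..., x, y, z] used above: one spatial match and one wrong-class
#prediction give TP=1, FP=1, FN=0.
def _lsd_example():
    true = np.array([[0, 7, 1.0, 2.0, 0.5]])
    pred = np.array([[0, 7, 1.5, 2.0, 0.5],    #same class, 0.5 m away -> true positive
                     [0, 3, 0.0, 0.0, 0.0]])   #wrong class -> false positive
    return location_sensitive_detection(pred, true, n_frames=1)
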
def sed_score_computation(pred, true, n_frames=100, spatial_threshold=2.,
                          from_csv=False, verbose=False):
    '''
    Compute TP, FP, FN and the SED score of a single data point using
    class-only matching: unlike location_sensitive_detection, a prediction
    counts as correct if the class label matches, regardless of the spatial
    error (spatial_threshold is kept only for signature compatibility).
    The SED score is the mean of (1 - F score) and the error rate, so
    lower is better.
    '''
    TP = 0  #true positives
    FP = 0  #false positives
    FN = 0  #false negatives
    #read csv files into numpy matrices if required
    if from_csv:
        pred = pd.read_csv(pred, sep=',', header=None)
        true = pd.read_csv(true, sep=',', header=None)
        pred = pred.values
        true = true.values
    #build empty dict with a key for each time frame
    frames = {}
    for i in range(n_frames):
        frames[i] = {'p': [], 't': []}
    #fill each time frame key with predicted and true entries for that frame
    for i in pred:
        frames[i[0]]['p'].append(i)
    for i in true:
        frames[i[0]]['t'].append(i)
    #iterate each time frame:
    for frame in range(n_frames):
        t = frames[frame]['t']  #all true events for this frame
        p = frames[frame]['p']  #all predicted events for this frame
        matched = 0  #counts the matching events
        if len(t) == 0:    #there are PREDICTED but no TRUE events
            FP += len(p)   #all predicted events are false positives
        elif len(p) == 0:  #there are TRUE but no PREDICTED events
            FN += len(t)   #all true events are false negatives
        else:
            for i_t in range(len(t)):  #iterate all true events
                match = False  #flag for matching events
                true_class = t[i_t][1]  #true class
                for i_p in range(len(p)):  #compare each true event with all predicted events
                    pred_class = p[i_p][1]  #predicted class
                    if true_class == pred_class:  #correct if the labels match (no spatial check here)
                        match = True
                if match:
                    matched += 1  #each true event is matched at most once
            num_true_items = len(t)
            num_pred_items = len(p)
            fn = num_true_items - matched
            fp = num_pred_items - matched
            #add to counts
            TP += matched  #matches are directly true positives
            FN += fn
            FP += fp
    precision = TP / (TP + FP + sys.float_info.epsilon)
    recall = TP / (TP + FN + sys.float_info.epsilon)
    F_score = 2 * ((precision * recall) / (precision + recall + sys.float_info.epsilon))
    Nref = TP + FN  #number of reference (true) events
    Nsys = TP + FP  #number of system (predicted) events
    ER_score = (max(Nref, Nsys) - TP) / (Nref + 0.0)  #error rate, as in the SELD evaluation_metrics.py
    sed_score = np.mean([1 - F_score, ER_score])
    if verbose:
        print('SED score: ', sed_score)
    return TP, FP, FN, sed_score

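#Worked sketch showing the difference from location_sensitive_detection: here
#the coordinates are ignored, so a prediction with the right class but a large
#spatial error still counts as a true positive (TP=1, FP=0, FN=0, score ~0).
def _sed_score_example():
    true = np.array([[0, 4, 0.0, 0.0, 0.0]])
    pred = np.array([[0, 4, 9.9, 9.9, 9.9]])  #same class, far away: still a match
    return sed_score_computation(pred, true, n_frames=1)
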
def compute_seld_metrics(predicted_folder, truth_folder, n_frames=100, spatial_threshold=0.3):
    '''
    Compute the overall F score from the folder of submitted results, based
    on the location sensitive detection metric.
    '''
    TP = 0
    FP = 0
    FN = 0
    predicted_list = [s for s in os.listdir(predicted_folder) if '.csv' in s]
    truth_list = [s for s in os.listdir(truth_folder) if '.csv' in s]
    n_files = len(predicted_list)
    #iterate each submitted file
    for i in range(n_files):
        name = predicted_list[i]
        predicted_temp_path = os.path.join(predicted_folder, name)
        truth_temp_path = os.path.join(truth_folder, name)
        #compute tp, fp, fn for each file (inputs here are csv paths)
        tp, fp, fn, _ = location_sensitive_detection(predicted_temp_path,
                                                     truth_temp_path,
                                                     n_frames,
                                                     spatial_threshold,
                                                     from_csv=True)
        TP += tp
        FP += fp
        FN += fn
    #compute total F score
    precision = TP / (TP + FP + sys.float_info.epsilon)
    recall = TP / (TP + FN + sys.float_info.epsilon)
    F_score = (2 * precision * recall) / (precision + recall + sys.float_info.epsilon)
    print('*******************************')
    print('F score: ', F_score)
    print('Precision: ', precision)
    print('Recall: ', recall)
    return F_score

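#Note: compute_seld_metrics micro-averages by summing TP/FP/FN over all files
#before computing the final precision and recall, rather than averaging
#per-file F scores; files with many events therefore weigh more.
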
#example calls (hypothetical paths):
#gen_dummy_seld_results('./prova')
#compute_seld_metrics('./prova/pred', './prova/truth')