forked from justin-dannemiller/ASR_LLM_Rescoring
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcreate_ref_dict.py
48 lines (39 loc) · 1.8 KB
/
create_ref_dict.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
###############################################################################
## Description: Creates a reference dictionary storing the correct ASR ##
## reference sentence for each ASR utterance ##
## Written by: Justin Dannemiller ##
## Last Modified: 04 December 2023 ##
###############################################################################
import os
import argparse
import json
from preprocess_data import extract_sentence, extract_uttID
""""
@brief: Creates a dictionary of reference sentences against which the ASR
hypotheses should be compared.
@param[in] path_to_refs - Path to the reference sentences
@return ref_dict: Dictionary mapping utterance ID to the corresponding
transcribed sentence
"""
def create_reference_dict(path_to_refs):
    """
    @brief: Builds a dictionary of reference sentences against which the ASR
            hypotheses can be compared.
    @param[in] path_to_refs - Path to the reference file. Each non-blank line
            is handed unchanged to extract_uttID and extract_sentence (from
            preprocess_data), which define the expected line format
            (presumably "<uttID> <sentence>" -- confirm against those helpers).
    @return ref_dict: Dictionary mapping utterance ID to the corresponding
            transcribed sentence. If an utterance ID occurs more than once,
            the last occurrence in the file wins.
    """
    ref_dict = {}
    ## Decode explicitly as UTF-8 so the result does not depend on the
    ## platform's locale default encoding
    with open(path_to_refs, "r", encoding="utf-8") as ref_file:
        ## Stream the file line by line instead of loading it all at once
        for ref_line in ref_file:
            ## Blank lines (e.g. a trailing newline at end of file) carry no
            ## utterance, so they are skipped rather than parsed
            if not ref_line.strip():
                continue
            uttID = extract_uttID(ref_line)
            reference = extract_sentence(ref_line)
            ref_dict[uttID] = reference
    return ref_dict
if __name__ == "__main__":
    ## Parse the name of the test set whose references should be indexed
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--test_set", type=str, default="test_other")
    test_set = arg_parser.parse_args().test_set

    ## References are read from <cwd>/ground_truth/<test_set>/text
    reference_path = os.path.join(os.getcwd(), "ground_truth", test_set, "text")
    references = create_reference_dict(reference_path)

    ## Write the dictionary as indented JSON into the current directory
    output_name = "ref_dict_" + test_set + ".json"
    with open(output_name, "w") as out_file:
        out_file.write(json.dumps(references, indent=4))