-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathCloudFunction_Speech2text.py
138 lines (110 loc) · 4.69 KB
/
CloudFunction_Speech2text.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import os
import subprocess
import tempfile
from datetime import timedelta

import srt
from google.cloud import storage
from google.cloud import videointelligence_v1 as vi
def transcribe_speech(video_uri, language_code, segments=None):
    """Request speech transcription for a video and return the finished result.

    Args:
        video_uri: URI of the video, passed as ``input_uri`` of the request.
        language_code: Language of the speech, passed to the transcription
            config.
        segments: Optional video segments forwarded to the video context.

    Returns:
        Whatever ``result()`` of the operation started by ``annotate_video``
        returns; the call waits for that result before returning.
    """
    client = vi.VideoIntelligenceServiceClient()

    # Build the whole request in one expression; automatic punctuation is
    # always switched on.
    annotate_request = vi.AnnotateVideoRequest(
        input_uri=video_uri,
        features=[vi.Feature.SPEECH_TRANSCRIPTION],
        video_context=vi.VideoContext(
            segments=segments,
            speech_transcription_config=vi.SpeechTranscriptionConfig(
                language_code=language_code,
                enable_automatic_punctuation=True,
            ),
        ),
    )

    print(f"Processing video: {video_uri}...")
    return client.annotate_video(annotate_request).result()
def cors_enabled_function(request):
    """Return the CORS data appropriate for *request*.

    The return shape depends on the HTTP method:

    * ``OPTIONS`` (CORS preflight): a complete ``(body, status, headers)``
      tuple with an empty body, status 204 and headers that allow GET from
      any origin with the Content-Type header, cacheable for 3600 seconds.
    * any other method: only a headers dict allowing any origin, for the
      caller to attach to its own response.

    Background on preflight requests:
    https://developer.mozilla.org/en-US/docs/Glossary/Preflight_request
    """
    if request.method != 'OPTIONS':
        # Main request: the caller builds the response, we only supply headers.
        return {'Access-Control-Allow-Origin': '*'}

    # Preflight request: answer it completely here.
    preflight_headers = {
        'Access-Control-Allow-Origin': '*',
        'Access-Control-Allow-Methods': 'GET',
        'Access-Control-Allow-Headers': 'Content-Type',
        'Access-Control-Max-Age': '3600',
    }
    return ('', 204, preflight_headers)
def _group_words_into_cues(words, max_words=18):
    """Yield ``(start_seconds, end_seconds, text)`` for each subtitle cue in *words*.

    A cue is closed once it holds *max_words* words or when a word ends with
    a period. Words left over at the end are emitted as a final cue instead of
    being dropped.
    """
    # None (not 0) marks "no cue open": a word may legitimately start at 0.0 s.
    start = None
    end = None
    pieces = []
    for word in words:
        if start is None:
            start = word.start_time.total_seconds()
        pieces.append(word.word)
        end = word.end_time.total_seconds()
        if len(pieces) == max_words or word.word.endswith("."):
            yield start, end, " ".join(pieces)
            start = None
            pieces = []
    if pieces:
        yield start, end, " ".join(pieces)


def print_word_timestamps(response, file_name, min_confidence=0.8):
    """Convert a transcription response to SRT, upload it, and return the SRT text.

    Only the first annotation result is used (a single video is processed).
    Transcriptions whose best alternative has a confidence below
    *min_confidence* are discarded. The words of each remaining transcription
    are grouped into cues of at most 18 words, split early at words ending in
    a period. Every cue is also printed together with its confidence.

    The composed SRT is uploaded to the ``sm5-hackaton.appspot.com`` bucket as
    ``subtitles/<name>.srt``, where ``<name>`` is *file_name* up to its first
    dot (or all of it when it has no dot), and the object is made public.

    Args:
        response: Result of a speech-transcription annotate request.
        file_name: Name of the source video; determines the subtitle name.
        min_confidence: Minimum confidence of the best alternative for a
            transcription to be kept.

    Returns:
        The composed SRT document as a string.
    """
    def keep_transcription(transcription):
        # A transcription without alternatives has nothing to subtitle.
        alternatives = transcription.alternatives
        return bool(alternatives) and min_confidence <= alternatives[0].confidence

    # First result only, as a single video is processed.
    results = response.annotation_results
    transcriptions = results[0].speech_transcriptions if results else []
    transcriptions = [t for t in transcriptions if keep_transcription(t)]

    print(f" Word Timestamps ".center(80, "-"))

    subs = []
    for transcription in transcriptions:
        best_alternative = transcription.alternatives[0]
        confidence = best_alternative.confidence
        for start, end, text in _group_words_into_cues(best_alternative.words):
            # One running index across all transcriptions keeps indices unique.
            subs.append(
                srt.Subtitle(
                    len(subs) + 1,
                    timedelta(seconds=start),
                    timedelta(seconds=end),
                    text,
                )
            )
            print(f"{confidence:4.0%} | {start:7.3f} | {end:7.3f} | {text}")

    composed = srt.compose(subs)
    print(composed)

    # Same result as slicing at the first '.', but tolerates names without one.
    stem = file_name.partition('.')[0]

    storage_client = storage.Client()
    bucket = storage_client.bucket("sm5-hackaton.appspot.com")
    blob = bucket.blob("subtitles/" + stem + ".srt")
    blob.upload_from_string(composed, content_type="text/srt")
    blob.make_public()
    return composed
def main(request):
    """HTTP entry point: transcribe a stored video and return its subtitles.

    Reads the query arguments ``language`` (speech language code) and
    ``video`` (object name under ``video/`` in the
    ``sm5-hackaton.appspot.com`` bucket). The video is downloaded to a
    temporary file so ``ffprobe`` can measure its duration, the whole duration
    is transcribed, and the resulting SRT text is returned.

    Args:
        request: The incoming HTTP request; must expose ``method`` and
            ``args`` (presumably a Flask request - confirm against the
            deployment).

    Returns:
        A ``(body, status, headers)`` tuple: the CORS preflight answer for
        ``OPTIONS``, a 400 message when a required argument is missing,
        otherwise the SRT text with status 200 and CORS headers.

    Raises:
        subprocess.CalledProcessError: If ``ffprobe`` exits with an error.
    """
    # For OPTIONS the helper returns a complete response tuple rather than a
    # headers dict, so answer the preflight before doing any work.
    if request.method == 'OPTIONS':
        return cors_enabled_function(request)
    headers = cors_enabled_function(request)

    language_code = request.args.get('language')
    video_name = request.args.get('video')
    if not language_code or not video_name:
        return ("Missing required query parameter 'language' or 'video'", 400, headers)

    storage_client = storage.Client()
    bucket = storage_client.bucket("sm5-hackaton.appspot.com")
    blob = bucket.blob("video/" + video_name)

    # A unique temporary file avoids clashes between invocations and is always
    # removed, so the downloaded video does not linger after the request.
    fd, local_path = tempfile.mkstemp(suffix=".mp4")
    os.close(fd)
    try:
        blob.download_to_filename(local_path)
        # stderr is kept separate so diagnostics never end up in the text
        # parsed as the duration; check=True surfaces probe failures directly.
        probe = subprocess.run(
            ["ffprobe", "-v", "error", "-show_entries",
             "format=duration", "-of",
             "default=noprint_wrappers=1:nokey=1", local_path],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            check=True,
        )
        sec = float(probe.stdout)
    finally:
        os.remove(local_path)

    # NOTE(review): this segment spans the entire video; confirm whether the
    # download and probe are needed at all versus passing no segments.
    segment = vi.VideoSegment(
        start_time_offset=timedelta(seconds=0),
        end_time_offset=timedelta(seconds=sec),
    )
    video_uri = "gs://sm5-hackaton.appspot.com/video/" + video_name
    response = transcribe_speech(video_uri, language_code, [segment])
    return (print_word_timestamps(response, video_name), 200, headers)