-
Notifications
You must be signed in to change notification settings - Fork 1
/
app.py
64 lines (47 loc) · 1.98 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import base64
import wave
import urllib.request
from pyannote.audio import Pipeline
import torch
from pydub import AudioSegment
import os
class InferlessPythonModel:
    """Inferless serving wrapper: runs pyannote speaker diarization on an
    audio URL and returns one base64-encoded WAV per detected speaker."""

    def download_file(self, url):
        """Download *url* to a local file named 'file.mp3' and return the path.

        NOTE(review): the fixed filename means concurrent calls in the same
        working directory would overwrite each other — presumably fine for a
        single-request-at-a-time worker; confirm against the deployment model.
        """
        filename = "file.mp3"
        urllib.request.urlretrieve(url, filename)
        return filename

    def wav_to_base64(self, file_path):
        """Read the file at *file_path* and return its bytes as a
        base64-encoded UTF-8 string."""
        with open(file_path, 'rb') as wav_file:
            wav_content = wav_file.read()
        # Encode after the file is closed — no need to hold the handle open.
        return base64.b64encode(wav_content).decode('utf-8')

    def initialize(self):
        """Load the pyannote diarization pipeline onto the GPU.

        Reads the Hugging Face access token from the HF_TOKEN environment
        variable; requires a CUDA device.
        """
        hf_token = os.getenv("HF_TOKEN")
        self.pipeline = Pipeline.from_pretrained(
            "pyannote/speaker-diarization-3.1",
            use_auth_token=hf_token)
        self.pipeline.to(torch.device("cuda"))

    def infer(self, inputs):
        """Diarize the audio at inputs['audio_url'].

        Returns {"generated_data": [...]} with one base64 WAV string per
        detected speaker.

        Fix over the original: the downloaded MP3 and the exported
        per-speaker WAV files were left on disk after every request,
        accumulating over time — they are now removed in a finally block.
        """
        audio_url = inputs["audio_url"]
        file_name = self.download_file(audio_url)
        temp_files = [file_name]
        try:
            diarization = self.pipeline(file_name)
            audio = AudioSegment.from_file(file_name, format="mp3")

            # Concatenate every turn belonging to the same speaker into one
            # continuous AudioSegment.
            speaker_segments_audio = {}
            for turn, _, speaker in diarization.itertracks(yield_label=True):
                start_ms = int(turn.start * 1000)
                end_ms = int(turn.end * 1000)
                segment = audio[start_ms:end_ms]
                if speaker in speaker_segments_audio:
                    speaker_segments_audio[speaker] += segment
                else:
                    speaker_segments_audio[speaker] = segment

            audio_data = []
            for speaker, segment in speaker_segments_audio.items():
                wav_path = f"{speaker}.wav"
                segment.export(wav_path, format="wav")
                temp_files.append(wav_path)
                audio_data.append(self.wav_to_base64(wav_path))
            return {"generated_data": audio_data}
        finally:
            # Clean up all temporary files even if diarization/export failed.
            for path in temp_files:
                try:
                    os.remove(path)
                except OSError:
                    pass  # best-effort cleanup; never mask the real result

    def finalize(self):
        """Lifecycle hook called by Inferless on shutdown; nothing to release."""
        pass