-
Notifications
You must be signed in to change notification settings - Fork 15
/
Copy pathgenerate_audio.py
123 lines (92 loc) · 4.35 KB
/
generate_audio.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
# generate_audio.py
import tempfile
import asyncio
import os
from audio_processor import AudioGenerator
from dotenv import load_dotenv
from pydub import AudioSegment
# Run python-dotenv's loader before the voice settings are read, so values
# it places in the process environment are visible to the lookups below.
load_dotenv()

# Voice name used for each speaker; each falls back to a built-in default
# when the corresponding environment variable is not set.
VOICE_A = os.environ.get("VOICE_A", "Puck")
VOICE_B = os.environ.get("VOICE_B", "Kore")
def parse_conversation(file_path):
    """Split a podcast script into the lines spoken by each speaker.

    The file is read as UTF-8. A line is attributed to a speaker when, after
    surrounding whitespace is removed, it starts with ``Speaker A:`` or
    ``Speaker B:``. Only that leading label is removed, so a label that is
    mentioned later inside the spoken text is kept intact. Blank lines and
    lines without a recognised label are ignored.

    Args:
        file_path: Path of the script file to read.

    Returns:
        A ``(speaker_a_lines, speaker_b_lines)`` tuple of lists containing
        each speaker's text in the order it appears in the file.
    """
    prefix_a = "Speaker A:"
    prefix_b = "Speaker B:"
    speaker_a_lines = []
    speaker_b_lines = []

    with open(file_path, 'r', encoding='utf-8') as file:
        for raw_line in file:
            # Match on the stripped line so an indented label is not
            # silently dropped.
            line = raw_line.strip()
            if not line:
                continue
            # Slice off the leading label only; str.replace would also
            # delete the label wherever it occurs in the spoken text.
            if line.startswith(prefix_a):
                speaker_a_lines.append(line[len(prefix_a):].strip())
            elif line.startswith(prefix_b):
                speaker_b_lines.append(line[len(prefix_b):].strip())

    return speaker_a_lines, speaker_b_lines
def read_file_content(file_path):
    """Return the entire contents of a UTF-8 text file as one string.

    Args:
        file_path: Path of the file to read.

    Returns:
        The file's full text.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        text = handle.read()
    return text
async def setup_environment():
    """Return the absolute path of the directory that contains this file.

    The coroutine performs no awaits of its own; it only resolves the
    location of the module it is defined in.
    """
    absolute_script_path = os.path.abspath(__file__)
    containing_dir, _script_name = os.path.split(absolute_script_path)
    return containing_dir
def read_and_parse_inputs():
    """Load the system instructions and the podcast script from disk.

    Both files are opened by bare file name, so they are resolved against
    the current working directory.

    Returns:
        A 4-tuple ``(system_instructions, full_script, speaker_a_lines,
        speaker_b_lines)``: the raw text of both files followed by the
        per-speaker lines produced by ``parse_conversation``.
    """
    instructions_path = 'system_instructions_audio.txt'
    script_path = 'podcast_script.txt'

    instructions_text = read_file_content(instructions_path)
    script_text = read_file_content(script_path)
    # The script is read a second time here to split it by speaker.
    lines_a, lines_b = parse_conversation(script_path)

    return instructions_text, script_text, lines_a, lines_b
def prepare_speaker_dialogues(system_instructions, full_script, speaker_lines, voice, temp_dir):
    """Build the dialogue texts and matching output paths for one speaker.

    The first dialogue is the system instructions followed by the complete
    script, separated by a blank line; its output file is named
    ``speaker_<voice>_initial.wav``. Every entry of ``speaker_lines`` then
    becomes its own dialogue with an output file ``speaker_<voice>_<i>.wav``,
    where ``i`` counts from zero.

    Args:
        system_instructions: Instruction text placed before the script.
        full_script: Complete script text.
        speaker_lines: The lines spoken by this speaker, in order.
        voice: Voice name embedded in the output file names.
        temp_dir: Directory in which the output paths are placed.

    Returns:
        A ``(dialogues, output_files)`` tuple of equally long lists.
    """
    utterances = list(speaker_lines)

    opening_prompt = "\n\n".join((system_instructions, full_script))
    opening_path = os.path.join(temp_dir, f"speaker_{voice}_initial.wav")

    dialogues = [opening_prompt, *utterances]
    output_files = [
        opening_path,
        *(
            os.path.join(temp_dir, f"speaker_{voice}_{index}.wav")
            for index in range(len(utterances))
        ),
    ]
    return dialogues, output_files
async def process_speaker(voice, dialogues, output_files):
    """Generate audio for all of one speaker's dialogues with one generator.

    A single ``AudioGenerator`` is created for ``voice`` and the whole batch
    is handed to its ``process_batch`` coroutine. The generator's websocket
    is closed afterwards whether or not the batch succeeded.

    Args:
        voice: Voice name passed to ``AudioGenerator``.
        dialogues: Texts to process, in order.
        output_files: Output paths passed alongside ``dialogues``.

    Raises:
        Whatever ``process_batch`` raises; the connection is still closed
        before the exception propagates.
    """
    # Create a single generator for all dialogues
    generator = AudioGenerator(voice)
    try:
        # Process the entire batch of dialogues at once
        await generator.process_batch(dialogues, output_files)
    finally:
        # Close in ``finally`` so a failing batch does not leak the
        # connection. ``getattr`` guards against a failure that happened
        # before the generator ever set its ``ws`` attribute.
        ws = getattr(generator, "ws", None)
        if ws:
            await ws.close()
def interleave_output_files(speaker_a_files, speaker_b_files):
    """Merge two lists of audio files by alternating A, B, A, B, ...

    Files are paired by position for as long as both lists have entries.
    Once the shorter list is exhausted, the leftover files of speaker A are
    appended, followed by the leftover files of speaker B.

    Args:
        speaker_a_files: Speaker A's files, in order.
        speaker_b_files: Speaker B's files, in order.

    Returns:
        A new list containing every file from both inputs.
    """
    paired_count = min(len(speaker_a_files), len(speaker_b_files))

    # zip stops at the shorter list, yielding exactly the positional pairs.
    merged = [
        path
        for pair in zip(speaker_a_files, speaker_b_files)
        for path in pair
    ]

    # Whatever is left over goes at the end, A's remainder first.
    merged += speaker_a_files[paired_count:]
    merged += speaker_b_files[paired_count:]
    return merged
def combine_audio_files(file_list, output_file, silence_duration_ms=50):
    """Concatenate WAV files into a single WAV file.

    Each input is loaded in list order and followed by a stretch of silence
    (including after the last one). Clips reporting exactly one channel are
    converted to two channels before being appended.

    Args:
        file_list: Paths of the WAV files to join, in playback order.
        output_file: Destination path of the combined WAV file.
        silence_duration_ms: Length of the silence placed after every clip,
            in milliseconds.
    """
    pause = AudioSegment.silent(duration=silence_duration_ms)
    mix = AudioSegment.empty()

    for wav_path in file_list:
        clip = AudioSegment.from_wav(wav_path)
        # Only single-channel clips are touched; anything else is used as-is.
        stereo_clip = clip.set_channels(2) if clip.channels == 1 else clip
        mix += stereo_clip + pause

    mix.export(output_file, format="wav")
async def main():
    """Generate audio for both speakers and write the combined podcast.

    Per-line audio files are written to a temporary directory created next
    to this script. Speaker A is processed to completion before speaker B.
    The per-line files of both speakers are then interleaved and joined into
    ``final_podcast.wav``, after which the temporary directory is removed.
    """
    base_dir = await setup_environment()
    final_output = "final_podcast.wav"

    with tempfile.TemporaryDirectory(dir=base_dir) as temp_dir:
        (system_instructions, full_script,
         speaker_a_lines, speaker_b_lines) = read_and_parse_inputs()

        # Prepare the dialogues and target paths for both speakers up front.
        prompts_a, wav_paths_a = prepare_speaker_dialogues(
            system_instructions, full_script, speaker_a_lines, VOICE_A, temp_dir)
        prompts_b, wav_paths_b = prepare_speaker_dialogues(
            system_instructions, full_script, speaker_b_lines, VOICE_B, temp_dir)

        # The speakers are handled strictly one after the other: A, then B.
        speaker_jobs = (
            ("Processing Speaker A...", VOICE_A, prompts_a, wav_paths_a),
            ("Processing Speaker B...", VOICE_B, prompts_b, wav_paths_b),
        )
        for banner, voice, prompts, wav_paths in speaker_jobs:
            print(banner)
            await process_speaker(voice, prompts, wav_paths)

        # Entry 0 of each list belongs to the initial instructions-plus-script
        # dialogue, so it is left out of the final mix.
        ordered_clips = interleave_output_files(wav_paths_a[1:], wav_paths_b[1:])
        combine_audio_files(ordered_clips, final_output, silence_duration_ms=50)
        print(f"\nFinal podcast audio created: {final_output}")

    # Leaving the ``with`` block has removed the temporary directory.
    print("Temporary files cleaned up")
# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    asyncio.run(main())