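"""ppt2video.py: Convert a PowerPoint presentation into a video with synthesized
audio narration, using each slide's notes text as the narration script.
Requires Windows (PowerPoint automated via COM) and ffmpeg on the PATH."""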
import argparse
import os
import re
import subprocess
import sys
from tempfile import mkdtemp
import win32com.client as win32
# Make full path from relative path for command line arguments
def ensure_full_path(s):
    return os.path.abspath(s) if not os.path.isabs(s) else s
# Parse command line arguments
parser = argparse.ArgumentParser(description='Convert a PowerPoint presentation into a video with synthesized audio narration.')
parser.add_argument('pptfile', help='PowerPoint file to create presentation video for', type=ensure_full_path)
parser.add_argument('output', help='Output video file')
parser.add_argument('--slides', help='Slide numbers or ranges to extract (comma-separated list)', type=str)
parser.add_argument('--silence', default=1.5, help='Seconds of silence to pad each slide audio; default: 1.5', type=float)
parser.add_argument('--voice', default='en-GB-SoniaNeural', help='Voice identifier. When using Azure see https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support?tabs=tts#prebuilt-neural-voices (default: en-GB-SoniaNeural); when using SAPI, use index number of voice installed on your system')
parser.add_argument('--pronunciation_mapping', help="File that maps the spelling of words and acronyms to their pronunciation", type=ensure_full_path)
parser.add_argument('--video_width', default=1920, help='Width of the output video in pixels; default: 1920', type=int)
parser.add_argument('--video_height', default=1080, help='Height of the output video in pixels; default: 1080', type=int)
parser.add_argument('--api', default='Azure', choices=['SAPI', 'Azure'], help='API to use for speech synthesis: Azure (default; Microsoft Azure AI Speech SDK, requires API key set as environment variable SPEECH_KEY and region in SPEECH_REGION) or SAPI (Microsoft Speech API, part of Windows)', type=str)
parser.add_argument('--update', type=ensure_full_path, help='Folder with temporary files from previous conversion to reuse when only a subset of the slides were updated; should be used together with --slides, must use the same file extension for the output file (i.e., the same video container format) as the previous conversion, and the slide order and count must be the same as in the previous conversion')
parser.add_argument('--quit_ppt', action='store_true', help='Quit PowerPoint after processing the presentation, if no other presentations are open')
parser.add_argument('--skip_image', action='store_true', help='Skip image extraction and use existing image files in the temporary folder; must be combined with --update')
parser.add_argument('--skip_audio', action='store_true', help='Skip audio synthesis and use existing audio files in the temporary folder; must be combined with --update')
parser.add_argument('--poster_slide', type=int, default=1, help='Slide number to use as poster slide for the video (first slide by default)')
args = parser.parse_args()
# Import the Azure Speech SDK only when it will actually be used
if args.api == 'Azure' and not args.skip_audio:
    import azure.cognitiveservices.speech as speechsdk
# Pronunciation mapping file contains lines matching the pattern word=pronunciation, e.g.:
# FDAT=effdutt
# Load the file and store in dictionary
pronunciation_mapping = {}
if args.pronunciation_mapping:
    with open(args.pronunciation_mapping, 'r', encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line or '=' not in line:
                continue  # skip blank or malformed lines
            word, pronunciation = line.split('=', 1)
            pronunciation_mapping[word.lower().strip()] = pronunciation.lower().strip()
# Init Azure Speech SDK
if args.api == 'Azure' and not args.skip_audio:
    speech_config = speechsdk.SpeechConfig(
        subscription=os.environ.get('SPEECH_KEY'),
        region=os.environ.get('SPEECH_REGION'))
    speech_config.speech_synthesis_voice_name = args.voice
# Determine video container format from output file extension
container_format = os.path.splitext(args.output)[1][1:]
# Create temp directories
temp_dir = args.update if args.update else mkdtemp()
slide_folder = os.path.join(temp_dir, "slides")
audio_folder = os.path.join(temp_dir, "audio")
video_folder = os.path.join(temp_dir, "video")
if not args.update:
    for folder in [slide_folder, audio_folder, video_folder]:
        os.mkdir(folder)
# Open the PowerPoint file; it is needed to extract slide images and notes
ppt = win32.Dispatch("PowerPoint.Application")
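# Presentations.Open(FileName, ReadOnly, Untitled, WithWindow): open read-only without a visible window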
presentation = ppt.Presentations.Open(args.pptfile, True, False, False)
# Extract desired slide list
slide_list = []
if args.slides is None:
    slide_list = range(1, presentation.Slides.Count + 1)
else:
    slides = [s.strip() for s in args.slides.split(',')]
    for i in slides:
        if '-' in i:
            start, end = i.split('-')
            slide_list.extend(range(int(start), int(end) + 1))
        else:
            slide_list.append(int(i))
    slide_list = sorted(slide_list)
# Generate silence file
print(f"Generating {args.silence} seconds silence audio file")
silence_file = os.path.join(temp_dir, "silence.wav")
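# anullsrc generates silent audio (11025 Hz, mono) for the requested duration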
subprocess.run(f'ffmpeg -y -hide_banner -loglevel error -f lavfi -i anullsrc=r=11025:cl=mono -t {args.silence} -c:a pcm_s16le "{silence_file}"', shell=True)
# Remember created slide videos for the ffmpeg concat step later
slide_videos = []
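# Track total characters sent to speech synthesis (Azure TTS is billed per character)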
total_chars = 0
# Loop over slides in presentation
for slide_number in slide_list:
    print(f"Processing slide {slide_number}")
    slide = presentation.Slides(slide_number)
    audio_file = os.path.join(audio_folder, f"audio_{slide_number}.wav")
    # The silence-padded version of the audio is stored as AAC in an .m4a file
    audio_file_padded = audio_file.replace('.wav', '.m4a')
    # Export slide as image (path is needed later for video creation even with --skip_image)
    slide_image_file = os.path.join(slide_folder, f"slide_{slide_number}.png")
    if not args.skip_image:
        print(f" Exporting slide {slide_number} as image")
        if os.path.exists(slide_image_file):
            os.remove(slide_image_file)
        slide.Export(slide_image_file, "PNG", ScaleWidth=args.video_width, ScaleHeight=args.video_height)
    # Read slide notes text (Placeholders(2) on the notes page is the notes body)
    if not args.skip_audio:
        slide_text = slide.NotesPage.Shapes.Placeholders(2).TextFrame.TextRange.Text
        if not slide_text or len(slide_text.strip()) == 0:
            print(f" Skipping slide {slide_number} because no note text found")
            continue
        # Remove newlines and carriage returns
        slide_text = slide_text.replace('\n', ' ').replace('\r', ' ').strip()
        # Replace words with pronunciation from mapping file
        for word, pronunciation in pronunciation_mapping.items():
            slide_text = re.sub(rf'\b{re.escape(word)}\b', pronunciation, slide_text, flags=re.IGNORECASE)
        total_chars += len(slide_text)
        # Synthesize audio for slide text
        print(f" Synthesizing audio of slide {slide_number}")
        if args.api == 'Azure':
            audio_config = speechsdk.audio.AudioOutputConfig(filename=audio_file)
            speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=audio_config)
            # speak_text_async returns a future; get() blocks until synthesis is complete
            speech_synthesis_result = speech_synthesizer.speak_text_async(slide_text).get()
            if speech_synthesis_result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
                pass
            elif speech_synthesis_result.reason == speechsdk.ResultReason.Canceled:
                cancellation_details = speech_synthesis_result.cancellation_details
                print(f" Speech synthesis canceled: {cancellation_details.reason}")
                if cancellation_details.reason == speechsdk.CancellationReason.Error:
                    if cancellation_details.error_details:
                        print(f" Error details: {cancellation_details.error_details}. Did you set the speech resource key and region values?")
                break
        else:  # SAPI
            sapi = win32.Dispatch("SAPI.SpVoice")
            sapi.Voice = sapi.GetVoices().Item(int(args.voice))
            outfile = win32.Dispatch("SAPI.SpFileStream")
            # Open mode 3 = SSFMCreateForWrite (create or overwrite the file for writing)
            outfile.Open(audio_file, 3, False)
            sapi.AudioOutputStream = outfile
            sapi.Speak(slide_text)
            outfile.Close()
        # Extend audio with silence in front and back and encode using AAC codec
        subprocess.run('ffmpeg -y -hide_banner -loglevel error -i "{silence}" -i "{audio_in}" -i "{silence}" -filter_complex "[0:0][1:0][2:0]concat=n=3:v=0:a=1[a]" -map "[a]" -c:a aac -strict experimental "{audio_out}"'.format(silence=silence_file, audio_in=audio_file, audio_out=audio_file_padded), shell=True)
    # Create video from slide image and synthesized audio
    video_file = os.path.join(video_folder, f"video_{slide_number}.{container_format}")
    slide_videos.append(video_file)
    print(f" Creating video of slide {slide_number} with synthesized audio and slide image")
    subprocess.run('ffmpeg -y -hide_banner -loglevel error -loop 1 -framerate 5 -i "{slide}" -i "{audio}" -c:v libx264 -tune stillimage -c:a copy -shortest "{video}"'.format(slide=slide_image_file, audio=audio_file_padded, video=video_file), shell=True)
# Close PowerPoint
presentation.Close()
# Quit PowerPoint if no other presentations remain open
if args.quit_ppt and ppt.Presentations.Count == 0:
    ppt.Quit()
if len(slide_videos) == 0:
    print(f"No slide videos {'updated' if args.update else 'created'}, exiting")
    sys.exit(0)
# Create final video by concatenating all slide videos
concat_file = os.path.join(temp_dir, "concat.txt")
# Create list of slide videos to concatenate only if not updating previous conversion
if not args.update:
    with open(concat_file, "w") as video_list_file:
        for slide_video in slide_videos:
            video_list_file.write(f"file '{slide_video}'\n")
print("Creating full video by concatenating all slide videos")
# Create temporary full video file
temp_output_file = os.path.join(temp_dir, "__temp_video__." + container_format)
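# Concatenate with the concat demuxer; -safe 0 permits absolute paths in the list file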
subprocess.run(f'ffmpeg -y -hide_banner -loglevel error -f concat -safe 0 -i "{concat_file}" -c copy "{temp_output_file}"', shell=True)
# Add poster image for final video
poster_slide_file = os.path.join(slide_folder, f"slide_{args.poster_slide}.png")
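# Mux the poster slide as a second video stream marked attached_pic so players show it as cover art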
subprocess.run(f'ffmpeg -y -hide_banner -loglevel error -i "{temp_output_file}" -i "{poster_slide_file}" -map 0 -map 1 -c copy -c:v:1 png -disposition:v:1 attached_pic "{args.output}"', shell=True)
# Clean up
os.remove(temp_output_file)
print(f"Total characters synthesized: {total_chars}")
print(f"Temporary files kept in {temp_dir}")
print("Done.")