# avatar.py

from config import *
from image import generate_image
import humanize
import datetime as dt
from argparse import ArgumentParser
import shutil

import os
from animate_face import animate_face
import subprocess, platform

# Appearance of the avatar; main() interpolates this text into the
# image-generation prompt.
avatar_description = (
	"Young asian man, "
	"with short brunette hair, "
	"slightly smiling"
)

def _humanize_seconds(seconds):
	"""Render a duration given in seconds as human-readable text.

	The value is truncated to whole seconds before it is handed to
	humanize.naturaldelta, so every pipeline stage is reported the same way.
	"""
	return humanize.naturaldelta(dt.timedelta(seconds=int(seconds)))


def _run_ffmpeg(arguments):
	"""Run ffmpeg with the given argument list and return its exit status.

	The command is passed as a list and without a shell, so file names that
	contain spaces or shell metacharacters reach ffmpeg unmodified. Failures
	are reported on stdout instead of being silently ignored; the pipeline is
	not aborted, the caller simply continues with the next stage.

	Returns the ffmpeg exit status, or -1 when ffmpeg could not be started.
	"""
	try:
		status = subprocess.call(["ffmpeg"] + list(arguments))
	except OSError as err:
		# e.g. ffmpeg is not installed or not on PATH
		print("warning: could not run ffmpeg:", err)
		return -1
	if status != 0:
		print("warning: ffmpeg exited with status", status)
	return status


def main():
	"""Run the avatar pipeline from the command line.

	Stages, in order:
	  1. generate the avatar image (or copy the one given with --image),
	  2. extract the speech track of the driver video to a mono 44.1 kHz wav,
	  3. animate the avatar image with the driver video,
	  4. change the pitch of the extracted speech,
	  5. mux the animated video with the pitched speech into results/.

	Intermediate files are written to temp/<path_id>/. The names imgfile,
	audiofile, driverfile and animatedfile are not defined in this file;
	presumably they come from the star import of config (verify there).

	Command line options:
	  --image    path to an existing avatar image; when left at its default
	             (imgfile) a new image is generated instead.
	  --path_id  name of the working sub-directory; defaults to the current
	             Unix timestamp.
	  --pitch    pitch factor for the voice, 1.0 keeps the original pitch.
	"""
	# Imported locally so the timing code does not rely on `time` leaking in
	# through `from config import *`.
	import time

	parser = ArgumentParser()
	parser.add_argument("--image", default=imgfile, help="path to avatar file")
	parser.add_argument("--path_id", default=str(int(time.time())), help="set the path id to use")
	parser.add_argument("--pitch", default=1.0, type=float, help="change pitch of voice, 1.0 is original, higher number is higher pitch")
	args = parser.parse_args()
	if args.pitch <= 0:
		# the pitch is used as a divisor in the ffmpeg atempo expression below
		parser.error("--pitch must be greater than 0")
	tstart = time.time()

	## SET PATH
	path_id = args.path_id
	path = os.path.join("temp", path_id)
	os.makedirs(path, exist_ok=True)
	# the final video is written here; ffmpeg does not create missing directories
	os.makedirs("results", exist_ok=True)

	## GENERATE AVATAR IMAGE
	timage = "None"
	if args.image == imgfile:
		print("-----------------------------------------")
		print("generating avatar image")
		t1 = time.time()
		# Built from adjacent literals so no indentation whitespace ends up
		# inside the prompt text.
		prompt = (
			"hyperrealistic digital avatar, centered, "
			f"{avatar_description}, rim lighting, studio lighting, looking at the camera"
		)
		generate_image(path_id, imgfile, prompt)
		timage = _humanize_seconds(time.time() - t1)
		print("\ngenerating avatar:", timage)
	else:
		# a user-supplied image is copied into the working directory under
		# the configured image file name
		shutil.copyfile(args.image, os.path.join(path, imgfile))

	## EXTRACT SPEECH FROM MP4
	print("-----------------------------------------")
	print("extracting speech from mp4")
	t2 = time.time()
	wavoutfile = os.path.join(path, audiofile)
	# 16-bit PCM, 44.1 kHz, mono
	_run_ffmpeg(["-i", driverfile, "-acodec", "pcm_s16le", "-ar", "44100", "-ac", "1", wavoutfile])
	tspeech = _humanize_seconds(time.time() - t2)
	print("\nextracting speech:", tspeech)

	## ANIMATE AVATAR IMAGE
	print("-----------------------------------------")
	print("animating face with driver")
	t3 = time.time()
	# audiofile determines the length of the driver movie to trim
	# driver movie is imposed on the image file to produce the animated file
	animate_face(path_id, audiofile, driverfile, imgfile, animatedfile)
	tanimate = _humanize_seconds(time.time() - t3)
	print("\nanimating face:", tanimate)

	## CHANGING THE PITCH OF THE VOICE
	print("-----------------------------------------")
	print("changing pitch of voice")
	t4 = time.time()
	wavpitchedfile = os.path.join(path, "pitched.wav")
	# Raising the sample rate shifts the pitch (and shortens the audio);
	# aresample brings the rate back to 44.1 kHz and atempo=1/pitch restores
	# the original duration.
	pitch_filter = "asetrate=44100*{0},aresample=44100,atempo=1/{0}".format(args.pitch)
	_run_ffmpeg(["-i", wavoutfile, "-af", pitch_filter, wavpitchedfile])
	tpitch = _humanize_seconds(time.time() - t4)
	print("\nchanging pitch:", tpitch)

	## COMBINING ANIMATION WITH SPEECH
	print("-----------------------------------------")
	print("combining animation with speech")
	t5 = time.time()
	animatedoutfile = os.path.join(path, animatedfile)
	finaloutfile = os.path.join("results", path_id + "_animated.mp4")
	# video stream is copied from the animation, audio is taken from the
	# pitched wav; -shortest stops at the end of the shorter input
	_run_ffmpeg([
		"-i", animatedoutfile,
		"-i", wavpitchedfile,
		"-c:v", "copy",
		"-map", "0:v:0",
		"-map", "1:a:0",
		"-shortest",
		finaloutfile,
	])
	tcombi = _humanize_seconds(time.time() - t5)
	print("\ncombining animation with speech:", tcombi)


	print("done")
	print("Overall timing")
	print("--------------")
	print("generating avatar image:", timage)
	print("extracting speech from mp4:", tspeech)
	print("animating face:", tanimate)
	print("changing pitch of voice:", tpitch)
	print("combining animation with speech:", tcombi)
	print("total time:", humanize.naturaldelta(minimum_unit="microseconds", value=dt.timedelta(seconds=int(time.time() - tstart))))

# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
	main()