-
Notifications
You must be signed in to change notification settings - Fork 31
/
Copy pathwav2tfr.py
88 lines (66 loc) · 2.3 KB
/
wav2tfr.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
from os.path import join, split, splitext
import tensorflow as tf
from util.audio import mu_law_encode
from util.wrapper import txt2list
from dataloader.vctk import make_mu_law_speaker_length
# Command-line flags. `args` is the shared flag container read inside main().
args = tf.app.flags.FLAGS
# Sampling rate handed to the audio decoder in main().
tf.app.flags.DEFINE_integer('fs', 16000, 'sampling freq')
# Glob pattern selecting the input audio files (expanded by tf.gfile.Glob).
tf.app.flags.DEFINE_string('file_pattern', None, '')
# Root directory for the generated files; main() refuses to run without it.
tf.app.flags.DEFINE_string('output_dir', None, '')
# Path passed to txt2list(); main() creates one sub-directory of `output_dir`
# per returned entry.
tf.app.flags.DEFINE_string('speaker_list', None, '')
# Audio file format handed to the audio decoder in main().
tf.app.flags.DEFINE_string('ext', 'wav',
    'file extension: wav, mp3, mp4, ogg are supported.')
def read_text(filename):
    '''Return the transcript that accompanies an audio file (VCTK layout).

    The transcript path is derived from the audio path by replacing every
    `wav48` with `txt` and swapping the file extension for `.txt`, e.g.
    `root/wav48/p225/p225_001.wav` -> `root/txt/p225/p225_001.txt`.

    Args:
        filename: path to the audio file.

    Returns:
        The whole content of the transcript file as a single string, or `''`
        (after printing a warning) when the transcript file does not exist.
    '''
    # Swap the actual extension rather than the literal '.wav': this also
    # covers the other audio formats (mp3, ogg, ...) and leaves a '.wav'
    # that happens to appear inside a directory name untouched.
    stem, _ = splitext(filename.replace('wav48', 'txt'))
    filename = stem + '.txt'
    try:
        with open(filename, 'r', encoding='utf8') as fp:
            return fp.read()
    except FileNotFoundError:
        # A missing transcript is tolerated: warn and use an empty text.
        print('[WARNING] text not found: {}'.format(filename))
        return ''
def main(unused_args):
    '''Convert every audio file matching `file_pattern` into a TFRecord file.

    Each file is decoded to a single channel at `fs` Hz, flattened, mu-law
    encoded and combined with its transcript (see `read_text`) into one
    example, which is written to `[output_dir]/[speaker]/[basename].tfr`.
    The speaker is the part of the file's basename before the first `_`.

    NOTE (kept from the original author): the directory structure must be
    [dir_to_wav]/[Set]/[speakers]. None of the flags defined next to this
    function is called `dir_to_wav`; the input files are selected through
    `file_pattern`.

    Args:
        unused_args: positional arguments passed by the app runner; ignored.

    Raises:
        ValueError: if `output_dir` or `file_pattern` is not specified.
    '''
    if not args.output_dir:
        raise ValueError('`output_dir` (output dir) should be specified')
    if not args.file_pattern:
        # Fail early with a clear message instead of inside the glob call.
        raise ValueError('`file_pattern` should be specified')

    print('[WARNING] Protobuf is super slow (~7 examples per sec). \n'
          'This could take 2 hours or more.')

    # Input pipeline: every matched file is read whole, exactly once and in
    # glob order.
    reader = tf.WholeFileReader()
    files = tf.gfile.Glob(args.file_pattern)
    filename_queue = tf.train.string_input_producer(
        files,
        num_epochs=1,
        shuffle=False)
    key, val = reader.read(filename_queue)
    wav = tf.contrib.ffmpeg.decode_audio(val, args.ext, args.fs, 1)
    wav = tf.reshape(wav, [-1,])
    mulaw = mu_law_encode(wav)

    # Pre-create one output directory per listed speaker.
    ready_dirs = set()
    if args.speaker_list:
        for listed in txt2list(args.speaker_list):
            listed_dir = join(args.output_dir, listed)
            tf.gfile.MakeDirs(listed_dir)
            ready_dirs.add(listed_dir)

    counter = 1
    total = len(files)
    with tf.train.MonitoredSession() as sess:
        while not sess.should_stop():
            filename, x_int = sess.run([key, mulaw])
            filename = filename.decode('utf8')
            text = read_text(filename)

            # basename without directory and extension, e.g. `p225_001`
            stem, _ = splitext(filename)
            _, b = split(stem)
            speaker = b.split('_')[0]

            # A speaker missing from the list would otherwise make the
            # writer fail on a non-existent directory.
            speaker_dir = join(args.output_dir, speaker)
            if speaker_dir not in ready_dirs:
                tf.gfile.MakeDirs(speaker_dir)
                ready_dirs.add(speaker_dir)

            ex = make_mu_law_speaker_length(x_int, speaker, text, b)
            # The context manager closes the writer even if writing fails.
            with tf.python_io.TFRecordWriter(
                    join(speaker_dir, '{}.tfr'.format(b))) as writer:
                writer.write(ex.SerializeToString())

            print('\rFile {:5d}/{:5d}: {}'.format(counter, total, b), end='')
            counter += 1
    # Terminate the `\r`-updated progress line.
    print()
# Script entry point: hand `main` explicitly to the TensorFlow app runner.
if __name__ == '__main__':
    tf.app.run(main=main)