Commit 2665f4a: fixed

nkari82 committed Oct 8, 2020
1 parent eb56782
Showing 5 changed files with 67 additions and 55 deletions.
12 changes: 6 additions & 6 deletions bin/Processor/js.py
@@ -21,7 +21,7 @@

class JSpeechProcessor(object):

-class Generater(object):
+class Generator(object):
def __init__(self):
self._max_seq_length = 0
self._max_feat_size = 0
@@ -49,7 +49,7 @@ def __init__(self, rootdir, **kwargs):
self._rootdir = rootdir
self._speaker = "tsuchiya"
self._metadata = kwargs.get('metadata',"metadata.csv")
-self._generater = kwargs.get('generater', self.Generater())
+self._generator = kwargs.get('generator', self.Generator())

self.items = []
if rootdir:
@@ -58,14 +58,14 @@ def __init__(self, rootdir, **kwargs):
item = self._parse(line, "|")
item if item is None else self.items.append(item)

-self._generater.complete()
+self._generator.complete()

def _parse(self, line, split):
tid, text = line.strip().split(split)
item = None
try:
seq = np.asarray(self.text_to_sequence(text), np.int32)
-item = self._generater(self._rootdir, tid, seq, self._speaker)
+item = self._generator(self._rootdir, tid, seq, self._speaker)
except Exception as ex:
print("tid: {}, err: {}, text: {}".format(tid, ex, text))
return item
@@ -95,10 +95,10 @@ def _normalize(self, text):
return unicodedata.normalize('NFKC', text)

def max_seq_length(self):
-return self._generater.max_seq_length();
+return self._generator.max_seq_length();

def max_feat_length(self):
-return self._generater.max_feat_length();
+return self._generator.max_feat_length();

def vocab_size(self):
return len(symbols)
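
The parse loop above keeps only lines that parse, via the side-effecting expression `item if item is None else self.items.append(item)`. A minimal sketch of the same logic in plain form (the function and parameter names here are illustrative, not from the repo):

import os

def load_items(rootdir, metadata="metadata.csv", parse=lambda line, sep: None):
    # Collect parsed items; parse() returns None for lines that fail.
    items = []
    with open(os.path.join(rootdir, metadata), encoding="utf-8") as f:
        for line in f:
            item = parse(line, "|")
            if item is not None:  # clearer than the inline conditional expression
                items.append(item)
    return items
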
4 changes: 2 additions & 2 deletions bin/dump_fastspeech2.py
@@ -74,7 +74,7 @@ def __init__(self,outdir,vocab_size=150,n_speakers=1):
self.postnet_dropout_rate = 0.1

# encoder params
-self.encoder_self_attention_params = SelfAttentionParams(
+self.encoder_self_attention_params = self.SelfAttentionParams(
n_speakers=self.n_speakers,
hidden_size=self.encoder_hidden_size,
num_hidden_layers=self.encoder_num_hidden_layers,
@@ -93,7 +93,7 @@ def __init__(self,outdir,vocab_size=150,n_speakers=1):
)

# decoder params
-self.decoder_self_attention_params = SelfAttentionParams(
+self.decoder_self_attention_params = self.SelfAttentionParams(
n_speakers=self.n_speakers,
hidden_size=self.decoder_hidden_size,
num_hidden_layers=self.decoder_num_hidden_layers,
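
The `self.` qualifier matters here because `SelfAttentionParams` appears to be a namedtuple defined as a class attribute of `Config` (its truncated definition is visible at the top of the train_fastspeech2.py diff below); a bare name inside `__init__` is not in scope and raises NameError. A small sketch of the pattern, with illustrative fields:

import collections

class Config(object):
    # Class attribute: reachable as self.SelfAttentionParams or
    # Config.SelfAttentionParams, but not as a bare name inside methods.
    SelfAttentionParams = collections.namedtuple(
        "SelfAttentionParams", ["n_speakers", "hidden_size", "num_hidden_layers"]
    )

    def __init__(self, n_speakers=1, hidden_size=384, num_hidden_layers=4):
        self.encoder_self_attention_params = self.SelfAttentionParams(
            n_speakers=n_speakers,
            hidden_size=hidden_size,
            num_hidden_layers=num_hidden_layers,
        )

cfg = Config()
print(cfg.encoder_self_attention_params.hidden_size)  # 384
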
4 changes: 2 additions & 2 deletions bin/extract_duration.py
@@ -36,7 +36,7 @@
STRATEGY = return_strategy()

class Config(object):
-def __init__(self,outdir,vocab_size=149,n_speakers=1,batch_size=8):
+def __init__(self,outdir,batch_size=8,vocab_size=149,n_speakers=1):
# tacotron2 params
self.vocab_size = vocab_size # default
self.embedding_hidden_size = 512 # 'embedding_hidden_size': 512
@@ -245,7 +245,7 @@ def main():
Processor = JSpeechProcessor

processor = Processor(args.rootdir) # for test
-config = Config(args.outdir, processor.vocab_size(),1, args.batch_size)
+config = Config(args.outdir, args.batch_size, processor.vocab_size(),1)

max_seq_length = processor.max_seq_length()
max_mel_length = processor.max_feat_length() // config.n_mels
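
The Config signature and its call site are reordered in lockstep here; with positional arguments, a mismatch silently binds values to the wrong parameters (for example, a vocab size landing in the batch-size slot). Calling with keywords removes that failure mode entirely; a sketch using the same parameters as this diff's Config:

class Config(object):
    def __init__(self, outdir, batch_size=8, vocab_size=149, n_speakers=1):
        self.outdir = outdir
        self.batch_size = batch_size
        self.vocab_size = vocab_size
        self.n_speakers = n_speakers

# Keyword arguments stay correct even if the parameter order changes again:
config = Config("./out", batch_size=8, vocab_size=149, n_speakers=1)
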
70 changes: 37 additions & 33 deletions bin/train_fastspeech2.py
@@ -70,7 +70,7 @@ class Config(object):
],
)

-def __init__(self,outdir,vocab_size=150,n_speakers=1):
+def __init__(self,outdir,batch_size=32,vocab_size=150,n_speakers=1):
# fastspeech2 params
self.vocab_size = vocab_size
self.n_speakers = n_speakers
@@ -113,7 +113,7 @@ def __init__(self,outdir,vocab_size=150,n_speakers=1):
self.postnet_dropout_rate = 0.1

# encoder params
-self.encoder_self_attention_params = SelfAttentionParams(
+self.encoder_self_attention_params = self.SelfAttentionParams(
n_speakers=self.n_speakers,
hidden_size=self.encoder_hidden_size,
num_hidden_layers=self.encoder_num_hidden_layers,
@@ -132,7 +132,7 @@ def __init__(self,outdir,vocab_size=150,n_speakers=1):
)

# decoder params
-self.decoder_self_attention_params = SelfAttentionParams(
+self.decoder_self_attention_params = self.SelfAttentionParams(
n_speakers=self.n_speakers,
hidden_size=self.decoder_hidden_size,
num_hidden_layers=self.decoder_num_hidden_layers,
@@ -151,7 +151,7 @@ def __init__(self,outdir,vocab_size=150,n_speakers=1):
)

# data
-self.batch_size = 32
+self.batch_size = batch_size
self.test_size = 0.05
self.mel_length_threshold = 0
self.guided_attention = 0.2
@@ -200,13 +200,13 @@ def _average_by_duration(x, durs):
values = x[start:end][np.where(x[start:end] != 0.0)[0]]
x_char[idx] = np.mean(values) if len(values) > 0 else 0.0 # np.mean([]) = nan.

-    return x_char.astype(np.float32)
+return x_char.astype(np.float32)

@tf.function(input_signature=[tf.TensorSpec(None, tf.float32), tf.TensorSpec(None, tf.int32)])
def _tf_average_by_duration(x, durs):
return tf.numpy_function(_average_by_duration, [x, durs], tf.float32)

-def _norm_mean_std(self, x, mean, std):
+def _norm_mean_std(x, mean, std):
x = remove_outlier(x)
zero_idxs = np.where(x == 0.0)[0]
x = (x - mean) / std
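
`_average_by_duration` above reduces frame-level f0/energy to one value per input token: each token's duration selects a span of frames, and the mean is taken over the nonzero frames in that span. A self-contained illustration of that behavior (the input values are invented for the example):

import numpy as np

def average_by_duration(x, durs):
    # Span boundaries from cumulative durations: [0, d0, d0+d1, ...]
    durs_cum = np.cumsum(np.pad(durs, (1, 0)))
    x_char = np.zeros((durs.shape[0],), dtype=np.float32)
    for idx, (start, end) in enumerate(zip(durs_cum[:-1], durs_cum[1:])):
        values = x[start:end][np.where(x[start:end] != 0.0)[0]]
        x_char[idx] = np.mean(values) if len(values) > 0 else 0.0  # np.mean([]) = nan
    return x_char

f0 = np.array([0.0, 110.0, 112.0, 0.0, 220.0], dtype=np.float32)
durs = np.array([3, 2], dtype=np.int32)
print(average_by_duration(f0, durs))  # [111. 220.]
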
@@ -219,8 +219,9 @@ def _generator():

with open(feat_path, 'rb') as f:
mel = np.fromfile(f, dtype='float32')
-mel = np.resize(mel, (-1, config.n_mels))
-mel_length = mel.shape[0]
+mel = np.resize(mel, (-1, config.num_mels))
+
+mel_length = mel.shape[0]

if f is None or mel_length < config.mel_length_threshold:
continue
@@ -234,12 +234,12 @@ def _generator():
with open(duration_path, 'rb') as f:
duration = np.fromfile(f, dtype='int32')

-#f0 = self._norm_mean_std(f0, self.f0_stat[0], self.f0_stat[1])
-#energy = self._norm_mean_std(energy, self.energy_stat[0], self.energy_stat[1])
+f0 = _norm_mean_std(f0, f0_stat[0], f0_stat[1])
+energy = _norm_mean_std(energy, energy_stat[0], energy_stat[1])

# calculate character f0/energy
-f0 = tf_average_by_duration(f0, duration)
-energy = tf_average_by_duration(energy, duration)
+f0 = _tf_average_by_duration(f0, duration)
+energy = _tf_average_by_duration(energy, duration)

data = {
"input_ids": text_seq,
@@ -273,7 +273,7 @@ def _generator():
"duration_gts": [None],
"f0_gts": [None],
"energy_gts": [None],
"mel_gts": [None, config.n_mels],
"mel_gts": [None, config.num_mels],
"mel_lengths": []
}

@@ -352,19 +353,17 @@ def generate_and_save_intermediate_result(self, batch):
mel_gts = mel_gts.numpy()

# check directory
-utt_ids = batch["utt_ids"].numpy()
-dirname = os.path.join(self.config["outdir"], f"predictions/{self.steps}steps")
+dirname = os.path.join(self.config.outdir, f"predictions/{self.steps}steps")
if not os.path.exists(dirname):
os.makedirs(dirname)

for idx, (mel_gt, mel_before, mel_after) in enumerate(zip(mel_gts, mels_before, mels_after), 0):
-mel_gt = tf.reshape(mel_gt, (-1, self.config.n_mels)).numpy()
-mel_before = tf.reshape(mel_before, (-1, self.config.n_mels)).numpy()
-mel_after = tf.reshape(mel_after, (-1, self.config.n_mels)).numpy()
+mel_gt = tf.reshape(mel_gt, (-1, self.config.num_mels)).numpy()
+mel_before = tf.reshape(mel_before, (-1, self.config.num_mels)).numpy()
+mel_after = tf.reshape(mel_after, (-1, self.config.num_mels)).numpy()

# plot figure and save it
-utt_id = utt_ids[idx]
-figname = os.path.join(dirname, f"{utt_id}.png")
+figname = os.path.join(dirname, f"{idx}.png")
fig = plt.figure(figsize=(10, 8))
ax1 = fig.add_subplot(311)
ax2 = fig.add_subplot(312)
@@ -382,13 +381,15 @@ def generate_and_save_intermediate_result(self, batch):
plt.savefig(figname)
plt.close()

+#python train_fastspeech2.py --outdir ./fit_fastspeech2 --rootdir ./datasets/jsut/basic --batch-size 1
def main():
"""Run training process."""
parser = argparse.ArgumentParser(description="Train Tacotron2")
parser.add_argument("--outdir", type=str, required=True, help="directory to save checkpoints.")
parser.add_argument("--rootdir", type=str, required=True, help="dataset directory root")
parser.add_argument("--resume",default="",type=str,nargs="?",help='checkpoint file path to resume training. (default="")')
parser.add_argument("--verbose",type=int,default=1,help="logging level. higher is more logging. (default=1)")
parser.add_argument("--batch-size", default=8, type=int, help="batch size.")
parser.add_argument("--mixed_precision",default=0,type=int,help="using mixed precision for generator or not.")
parser.add_argument("--pretrained",default="",type=str,nargs="?",help='pretrained weights .h5 file to load weights from. Auto-skips non-matching layers',)
args = parser.parse_args()
@@ -421,18 +422,18 @@ def main():

class Generator(Processor.Generator):
def __init__(self):
-super.__init__()
+super().__init__()
self._scaler_energy = StandardScaler(copy=False)
self._scaler_f0 = StandardScaler(copy=False)
-self._energy_stat = np.stack()
-self._f0_stat = np.stack()
+self._energy_stat = np.stack((0,0))
+self._f0_stat = np.stack((0,0))

def __call__(self, rootdir, tid, seq, speaker):
-tid, seq, feat_path, speaker = super.__call__()
+tid, seq, feat_path, speaker = super().__call__(rootdir, tid, seq, speaker)

-f0_path = os.path.join(self._rootdir, "f0", f"{tid}.f0")
-energy_path = os.path.join(self._rootdir, "energies", f"{tid}.e")
-duration_path = os.path.join(self._rootdir, "durations", f"{tid}.dur")
+f0_path = os.path.join(rootdir, "f0", f"{tid}.f0")
+energy_path = os.path.join(rootdir, "energies", f"{tid}.e")
+duration_path = os.path.join(rootdir, "durations", f"{tid}.dur")

with open(f0_path) as f:
f0 = np.fromfile(f, dtype='float32')
Expand All @@ -444,20 +445,23 @@ def __call__(self, rootdir, tid, seq, speaker):

return tid, seq, feat_path, f0_path, energy_path, duration_path, speaker

-def complete():
-self._energy_stat = np.stack((self._scaler_energy.mean_, self._scaler_energy.scale_))
+def complete(self):
self._f0_stat = np.stack((self._scaler_f0.mean_, self._scaler_f0.scale_))
+self._energy_stat = np.stack((self._scaler_energy.mean_, self._scaler_energy.scale_))

print("energy stat: {}".format(self._energy_stat))
print("f0 stat: {}".format(self._f0_stat))

-def energy_stat():
+def energy_stat(self):
return self._energy_stat

-def f0_stat():
+def f0_stat(self):
return self._f0_stat

generator = Generator()
-processor = Processor(args.rootdir, generator=generator)
+processor = Processor(rootdir=args.rootdir, generator=generator)

-config = Config(args.outdir, processor.vocab_size())
+config = Config(args.outdir, args.batch_size, processor.vocab_size())

# split train and test
train_split, valid_split = train_test_split(processor.items, test_size=config.test_size,random_state=42,shuffle=True)
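
Several fixes in this file come down to `super` versus `super()`: `super` is the built-in type itself, so `super.__init__()` and `super.__call__()` raise TypeError at runtime instead of invoking the parent class. Likewise, methods declared without `self` (the old `complete()`, `energy_stat()`, `f0_stat()`) fail with TypeError as soon as they are called on an instance. A minimal reproduction of the super fix:

class Base(object):
    def __init__(self):
        self.ready = True

class Child(Base):
    def __init__(self):
        # super.__init__()   # TypeError: unbound descriptor, Base.__init__ never runs
        super().__init__()   # Python 3 zero-argument form: bound proxy to Base
        assert self.ready

Child()
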
32 changes: 20 additions & 12 deletions src/dump_data.cpp
@@ -58,6 +58,8 @@ static_assert(false, "must include filesystem!");
#include <world/dio.h>
#include <world/stonemask.h>

+#define SAMPLE_RATE 16000

extern "C"
{
#include "kiss_fft.h"
@@ -162,7 +164,7 @@ static void calc_norm_gain(float& norm_gain, const std::list<fs::path>& input_fi
sox_false
};

-sox_signalinfo_t in_signal = { 16000, 1, 16, 0, NULL };
+sox_signalinfo_t in_signal = { SAMPLE_RATE, 1, 16, 0, NULL };
sox_signalinfo_t interm_signal;
double rms_pk_lev_dB = 0.0;

@@ -241,8 +243,8 @@ static void convert_to(const fs::path& in_path, const fs::path& out_path, const
sox_false
};

-sox_signalinfo_t out_signal = { 16000, 1, 16, 0, NULL };
-sox_signalinfo_t in_signal = { 16000, 1, 16, 0, NULL };
+sox_signalinfo_t out_signal = { SAMPLE_RATE, 1, 16, 0, NULL };
+sox_signalinfo_t in_signal = { SAMPLE_RATE, 1, 16, 0, NULL };
sox_signalinfo_t interm_signal;

char* args[10];
@@ -383,6 +385,7 @@ int main(int argc, const char** argv) {
int format = 0;
int silence = 0;
int norm = 0;
+int pcount = 0;
std::string input;
std::string output;
std::string mode;
@@ -537,7 +540,6 @@

fprintf(stdout, "Convert: %s\r", file.string().c_str());
fflush(stdout);

convert_to(file, out, "sw", silence, gain);
}

@@ -560,6 +562,7 @@
{
lpcnet_encoder_init(st);
count = 0;
+pcount = 0;

fprintf(stdout, "Process file: %ws\r", input_file.c_str());
fflush(stdout);
@@ -570,6 +573,9 @@
fs::path pcm_path = output_path_pcm;
pcm_path.append(input_file.filename().string());
pcm_path.replace_extension(".s16");

fprintf(stdout, "\nConvert: %s\r\b\r", input_file.string().c_str());
fflush(stdout);
convert_to(input_file, pcm_path, "sw", silence, gain);
input_file = pcm_path;
}
@@ -704,6 +710,7 @@ int main(int argc, const char** argv) {
unsigned char buf[8];
process_superframe(st, buf, ffeat, fe, encode, quantize, format);
if (fpcm) write_audio(st, pcmbuf, noisebuf, fpcm);
+pcount += st->pcount;
st->pcount = 0;
}

@@ -713,19 +720,20 @@
count++;
}

-if(!training && ff0)
+if(!training && ff0 && f1)
{
-int length = count * FRAME_SIZE;
-assert(length <= (ftell(f1) / sizeof(short)));
std::vector<short> data;
std::vector<double> norm;
std::vector<double> f0;
std::vector<double> t;

+int length = ftell(f1) / sizeof(short);
+int samples = GetSamplesForDIO(SAMPLE_RATE, length, 10.0);

data.resize(length);
norm.resize(length);
-f0.resize(size_t(count + 1));
-t.resize(size_t(count + 1));
+f0.resize(std::max<size_t>(pcount, samples));
+t.resize(size_t(samples));

fseek(f1, 0, SEEK_SET);
fread(data.data(), sizeof(short), length, f1);
@@ -742,9 +750,9 @@
option.frame_period = 10.0;
option.speed = 1;
option.allowed_range = 0.1;
-Dio(norm.data(), length, 16000, &option, t.data(), f0.data());
-StoneMask(norm.data(), length, 16000, t.data(), f0.data(), f0.size(), f0.data());
-for(i = 0; i < count; ++i)
+Dio(norm.data(), length, SAMPLE_RATE, &option, t.data(), f0.data());
+StoneMask(norm.data(), length, SAMPLE_RATE, t.data(), f0.data(), f0.size(), f0.data());
+for(i = 0; i < pcount; ++i)
{
float val = (float)f0[i];
fwrite(&val, sizeof(float), 1, ff0);
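
The C++ changes route every hard-coded 16000 through the single SAMPLE_RATE define and size the f0/time buffers from WORLD's own frame count (GetSamplesForDIO) and the accumulated pcount rather than the superframe counter, so the buffers match what DIO and StoneMask actually write. For reference, an equivalent f0-extraction sketch in Python using the pyworld bindings for the same WORLD functions (assuming pyworld is installed; the .s16 path is illustrative):

import numpy as np
import pyworld

SAMPLE_RATE = 16000  # mirrors the new #define

def extract_f0(samples):
    x = samples.astype(np.float64)
    # dio() sizes its own output arrays, the step the C++ side now
    # reproduces via GetSamplesForDIO(); 10 ms frames as in the diff.
    f0, t = pyworld.dio(x, SAMPLE_RATE, frame_period=10.0)
    return pyworld.stonemask(x, f0, t, SAMPLE_RATE)  # refine the coarse estimate

pcm = np.fromfile("utt0001.s16", dtype=np.int16) / 32768.0
print(extract_f0(pcm).shape)
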
