From 25097bd7f6b045b008a505d0726231384be29b72 Mon Sep 17 00:00:00 2001
From: andabi
Date: Wed, 14 Mar 2018 21:13:52 +0900
Subject: [PATCH] normalize before softmax

---
 hparams/default.yaml |  2 +-
 models.py            | 28 +++++++++++++++-------------
 2 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/hparams/default.yaml b/hparams/default.yaml
index 08039229..357927a3 100644
--- a/hparams/default.yaml
+++ b/hparams/default.yaml
@@ -68,7 +68,7 @@ train2:
     clip_value_max: 3.
     clip_value_min: -3.
     clip_norm: 10
-    mol_step: 0.001
+    mol_step: 0.002
     num_epochs: 10000
     steps_per_epoch: 100
     save_per_epoch: 50
diff --git a/models.py b/models.py
index bbe2d126..1708da7d 100644
--- a/models.py
+++ b/models.py
@@ -4,17 +4,14 @@
 import tensorflow as tf
 from tensorflow.contrib import distributions
 from tensorpack.graph_builder.model_desc import ModelDesc, InputDesc
-from tensorpack.train.tower import get_current_tower_context
-
-from data_load import phns
-from hparam import hparam as hp
-from modules import prenet, cbhg
-from tensorpack.tfutils.scope_utils import auto_reuse_variable_scope
 from tensorpack.tfutils import (
-    summary, get_current_tower_context, optimizer, gradproc)
-import re
+    get_current_tower_context, optimizer, gradproc)
+from tensorpack.tfutils.scope_utils import auto_reuse_variable_scope
 
 import tensorpack_extension
+from data_load import phns
+from hparam import hparam as hp
+from modules import prenet, cbhg, normalize
 
 
 class Net1(ModelDesc):
@@ -108,13 +105,15 @@ def _build_graph(self, inputs):
         tf.summary.scalar('net2/prob_min', self.prob_min)
 
     def _get_optimizer(self):
-        lr = tf.get_variable('learning_rate', initializer=hp.train2.lr, trainable=False)
-        opt = tf.train.AdamOptimizer(learning_rate=lr)
         gradprocs = [
             tensorpack_extension.FilterGradientVariables('.*net2.*', verbose=False),
+            gradproc.MapGradient(lambda grad: tf.clip_by_value(grad, hp.train2.clip_value_min, hp.train2.clip_value_max)),
             gradproc.GlobalNormClip(hp.train2.clip_norm),
-            gradproc.PrintGradient(),
+            # gradproc.PrintGradient(),
+            # gradproc.CheckGradient(),
         ]
+        lr = tf.get_variable('learning_rate', initializer=hp.train2.lr, trainable=False)
+        opt = tf.train.AdamOptimizer(learning_rate=lr)
         return optimizer.apply_grad_processors(opt, gradprocs)
 
     @auto_reuse_variable_scope
@@ -140,8 +139,11 @@ def network(self, ppgs, is_training):
         pred_spec = tf.layers.dense(pred_spec, self.y_spec.shape[-1])  # (N, T, 1+hp.n_fft//2)
         pred_spec = tf.expand_dims(pred_spec, axis=-1)
         pred_spec_mu = tf.layers.dense(pred_spec, hp.train2.n_mixtures)  # (N, T, 1+hp.n_fft//2, n_mixtures)
-        pred_spec_phi = tf.nn.softmax(
-            tf.layers.dense(pred_spec, hp.train2.n_mixtures))  # (N, T, 1+hp.n_fft//2, n_mixtures)
+
+        # normalize to prevent softmax output to be NaN.
+        logits = tf.layers.dense(pred_spec, hp.train2.n_mixtures)
+        normalized_logits = normalize(logits, type='ins', is_training=get_current_tower_context().is_training, scope='normalize_logits')
+        pred_spec_phi = tf.nn.softmax(normalized_logits)  # (N, T, 1+hp.n_fft//2, n_mixtures)
 
         return pred_spec_mu, pred_spec_phi
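
The substantive change is in Net2.network: the mixture-weight logits are passed through modules.normalize (instance normalization, scope 'normalize_logits') before the softmax, so the dense layer's raw outputs reach the softmax in a bounded range instead of drifting to extreme magnitudes. A minimal standalone sketch of the same idea in plain TF 1.x follows; standardize_logits is a hypothetical stand-in for the repo's normalize(..., type='ins') helper, not its actual implementation:

    import tensorflow as tf

    def standardize_logits(logits, eps=1e-8):
        # Zero-mean / unit-variance over the mixture axis, so the values
        # handed to the softmax stay in a small, well-behaved range.
        mean, var = tf.nn.moments(logits, axes=[-1], keep_dims=True)
        return (logits - mean) * tf.rsqrt(var + eps)

    # Hypothetical shapes: (N, T, 1 + n_fft//2, n_mixtures), e.g. n_fft=512, 4 mixtures.
    logits = tf.random_normal([2, 100, 257, 4])
    pred_spec_phi = tf.nn.softmax(standardize_logits(logits))  # mixture weights, sum to 1

The repo's normalize helper presumably also carries learnable scale/offset parameters, as instance norm usually does; the sketch only captures the standardization step that keeps the softmax input bounded.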
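The _get_optimizer hunk also tightens gradient handling for net2: an element-wise clip (MapGradient with tf.clip_by_value into [clip_value_min, clip_value_max]) now runs before the existing GlobalNormClip, and PrintGradient/CheckGradient are left commented out. Below is a self-contained sketch of that processor chain, with the hp.train2.* values written out literally from hparams/default.yaml and a placeholder initial learning rate (hp.train2.lr is not shown in this patch); the project-local FilterGradientVariables processor is omitted:

    import tensorflow as tf
    from tensorpack.tfutils import gradproc, optimizer

    # Literal values from hparams/default.yaml (train2); lr_init is a placeholder.
    clip_value_min, clip_value_max, clip_norm, lr_init = -3.0, 3.0, 10, 3e-4

    def get_optimizer():
        gradprocs = [
            # Clip each gradient element into [-3, 3] first...
            gradproc.MapGradient(
                lambda grad: tf.clip_by_value(grad, clip_value_min, clip_value_max)),
            # ...then rescale the whole gradient if its global norm exceeds 10.
            gradproc.GlobalNormClip(clip_norm),
        ]
        lr = tf.get_variable('learning_rate', initializer=lr_init, trainable=False)
        opt = tf.train.AdamOptimizer(learning_rate=lr)
        return optimizer.apply_grad_processors(opt, gradprocs)

Ordering matters here: the value clip bounds individual outlier entries, and the global-norm clip then caps the magnitude of the whole update.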