From 61e3b847f332a662ee979098a2952c97fbc70aff Mon Sep 17 00:00:00 2001
From: andabi
Date: Wed, 14 Mar 2018 15:54:41 +0900
Subject: [PATCH] apply tensorpack gradproc in train2

---
 hparams/default.yaml |   2 +-
 models.py            | 247 +++----------------------------------------
 modules.py           |   3 +
 train1.py            |   2 +-
 train2.py            |  78 ++++----------
 5 files changed, 40 insertions(+), 292 deletions(-)

diff --git a/hparams/default.yaml b/hparams/default.yaml
index 3d5bb355..6f9360f4 100644
--- a/hparams/default.yaml
+++ b/hparams/default.yaml
@@ -63,7 +63,7 @@ train2:
   # train
   batch_size: 32
   lr: 0.0003
-  lr_cyclic_margin: 0.0002
+  lr_cyclic_margin: 0.
   lr_cyclic_steps: 5000
   clip_value_max: 3.
   clip_value_min: -3.
diff --git a/models.py b/models.py
index 1850b502..ebcfcde0 100644
--- a/models.py
+++ b/models.py
@@ -10,6 +10,8 @@
 from hparam import hparam as hp
 from modules import prenet, cbhg
 from tensorpack.tfutils.scope_utils import auto_reuse_variable_scope
+from tensorpack.tfutils import (
+    summary, get_current_tower_context, optimizer, gradproc)
 
 
 class Net1(ModelDesc):
@@ -18,7 +20,7 @@ def __init__(self):
 
     def _get_inputs(self):
         return [InputDesc(tf.float32, (hp.train1.batch_size, None, hp.default.n_mfcc), 'x_mfccs'),
-                InputDesc(tf.int32, (hp.train1.batch_size, None, ), 'y_ppgs')]
+                InputDesc(tf.int32, (hp.train1.batch_size, None,), 'y_ppgs')]
 
     def _build_graph(self, inputs):
         self.x_mfccs, self.y_ppgs = inputs
@@ -70,14 +72,13 @@ def acc(self):
 
 
 class Net2(ModelDesc):
-
     def __init__(self):
         self.net1 = Net1()
 
     def _get_inputs(self):
         return [InputDesc(tf.float32, (hp.train2.batch_size, None, hp.default.n_mfcc), 'x_mfccs'),
                 InputDesc(tf.float32, (hp.train2.batch_size, None, hp.default.n_fft // 2 + 1), 'y_spec'),
-                InputDesc(tf.float32, (hp.train2.batch_size, None, hp.default.n_mels), 'y_mel'),]
+                InputDesc(tf.float32, (hp.train2.batch_size, None, hp.default.n_mels), 'y_mel'), ]
 
     def _build_graph(self, inputs):
         self.x_mfcc, self.y_spec, self.y_mel = inputs
@@ -99,22 +100,19 @@ def _build_graph(self, inputs):
             # tf.summary.scalar('net2/train/lr', lr)
             tf.summary.histogram('net2/train/mu', self.pred_spec_mu)
             tf.summary.histogram('net2/train/phi', self.pred_spec_phi)
+            # TODO remove
             tf.summary.scalar('net2/prob_min', self.prob_min)
 
 
-    # def _get_train_op(self):
-    #     lr = tf.get_variable('learning_rate', initializer=hp.train2.lr, trainable=False)
-    #     optimizer = tf.train.AdamOptimizer(learning_rate=lr)
-    #     with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
-    #         var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'net2')
-    #
-    #         # Gradient clipping to prevent loss explosion
-    #         gvs = optimizer.compute_gradients(loss_op, var_list=var_list)
-    #         gvs = [(tf.clip_by_value(grad, hp.train2.clip_value_min, hp.train2.clip_value_max), var) for grad, var in
-    #                gvs]
-    #         gvs = [(tf.clip_by_norm(grad, hp.train2.clip_norm), var) for grad, var in gvs]
-    #
-    #         return optimizer.apply_gradients(gvs)
+    def _get_optimizer(self):
+        lr = tf.get_variable('learning_rate', initializer=hp.train2.lr, trainable=False)
+        opt = tf.train.AdamOptimizer(learning_rate=lr)
+        gradprocs = [gradproc.MapGradient(lambda grad: grad, regex='.*net2.*'),  # apply only gradients of net2
+                     gradproc.GlobalNormClip(hp.train2.clip_norm),
+                     # gradproc.PrintGradient()]
+                     ]
+
+        return optimizer.apply_grad_processors(opt, gradprocs)
 
     @auto_reuse_variable_scope
     def network(self, ppgs, is_training):
@@ -139,7 +137,8 @@ def network(self, ppgs, is_training):
             pred_spec = tf.layers.dense(pred_spec, self.y_spec.shape[-1])  # (N, T, 1+hp.n_fft//2)
             pred_spec = tf.expand_dims(pred_spec, axis=-1)
             pred_spec_mu = tf.layers.dense(pred_spec, hp.train2.n_mixtures)  # (N, T, 1+hp.n_fft//2, n_mixtures)
-            pred_spec_phi = tf.nn.softmax(tf.layers.dense(pred_spec, hp.train2.n_mixtures))  # (N, T, 1+hp.n_fft//2, n_mixtures)
+            pred_spec_phi = tf.nn.softmax(
+                tf.layers.dense(pred_spec, hp.train2.n_mixtures))  # (N, T, 1+hp.n_fft//2, n_mixtures)
 
         return pred_spec_mu, pred_spec_phi
 
@@ -158,217 +157,3 @@ def loss(self):
         self.prob_min = tf.reduce_min(prob)
         loss = -tf.reduce_mean(tf.log(prob + 1e-8))
         return loss
-
-
-# class Model:
-#     def __init__(self, mode, batch_size):
-#         self.mode = mode
-#         self.batch_size = batch_size
-#         self.is_training = self.get_is_training(mode)
-#
-#         # Networks
-#         self.net_template = tf.make_template('net', self._net2)
-#         self.ppgs, self.pred_ppg, self.logits_ppg, self.pred_spec_mu, self.pred_spec_phi = self.net_template()
-#
-#     def __call__(self):
-#         return self.pred_spec_mu
-#
-#     def _get_inputs(self):
-#         length = hp.signal.duration * hp.signal.sr
-#         length_spec = length // hp.signal.hop_length + 1
-#         return [InputDesc(tf.float32, (None, length), 'wav'),
-#                 InputDesc(tf.float32, (None, length_spec, hp.signal.n_mels), 'x'),
-#                 InputDesc(tf.int32, (None,), 'speaker_id')]
-#
-#     def get_input(self, mode, batch_size, queue):
-#         '''
-#         mode: A string. One of the phases below:
-#           `train1`: TIMIT TRAIN waveform -> mfccs (inputs) -> PGGs -> phones (target) (ce loss)
-#           `test1`: TIMIT TEST waveform -> mfccs (inputs) -> PGGs -> phones (target) (accuracy)
-#           `train2`: ARCTIC SLT waveform -> mfccs -> PGGs (inputs) -> spectrogram (target)(l2 loss)
-#           `test2`: ARCTIC SLT waveform -> mfccs -> PGGs (inputs) -> spectrogram (target)(accuracy)
-#           `convert`: ARCTIC BDL waveform -> mfccs (inputs) -> PGGs -> spectrogram -> waveform (output)
-#         '''
-#         if mode not in ('train1', 'test1', 'train2', 'test2', 'convert'):
-#             raise Exception("invalid mode={}".format(mode))
-#
-#         x_mfcc = tf.placeholder(tf.float32, shape=(batch_size, None, hp.default.n_mfcc))
-#         y_ppgs = tf.placeholder(tf.int32, shape=(batch_size, None,))
-#         y_spec = tf.placeholder(tf.float32, shape=(batch_size, None, 1 + hp.default.n_fft // 2))
-#         y_mel = tf.placeholder(tf.float32, shape=(batch_size, None, hp.default.n_mels))
-#         num_batch = 1
-#
-#         if queue:
-#             if mode in ("train1", "test1"):  # x: mfccs (N, T, n_mfccs), y: Phones (N, T)
-#                 x_mfcc, y_ppgs, num_batch = get_batch_queue(mode=mode, batch_size=batch_size)
-#             elif mode in ("train2", "test2", "convert"):  # x: mfccs (N, T, n_mfccs), y: spectrogram (N, T, 1+n_fft//2)
-#                 x_mfcc, y_spec, y_mel, num_batch = get_batch_queue(mode=mode, batch_size=batch_size)
-#         return x_mfcc, y_ppgs, y_spec, y_mel, num_batch
-#
-#     def get_is_training(self, mode):
-#         if mode in ('train1', 'train2'):
-#             is_training = True
-#         else:
-#             is_training = False
-#         return is_training
-#
-#     def _net1(self):
-#         with tf.variable_scope('net1'):
-#             # Pre-net
-#             prenet_out = prenet(self.x_mfcc,
-#                                 num_units=[hp.train1.hidden_units, hp.train1.hidden_units // 2],
-#                                 dropout_rate=hp.train1.dropout_rate,
-#                                 is_training=self.is_training)  # (N, T, E/2)
-#
-#             # CBHG
-#             out = cbhg(prenet_out, hp.train1.num_banks, hp.train1.hidden_units // 2,
-#                        hp.train1.num_highway_blocks, hp.train1.norm_type, self.is_training)
-#
-#             # Final linear projection
-#             logits = tf.layers.dense(out, len(phns))  # (N, T, V)
-#             ppgs = tf.nn.softmax(logits / hp.train1.t)  # (N, T, V)
-#             preds = tf.to_int32(tf.arg_max(logits, dimension=-1))  # (N, T)
-#
-#         return ppgs, preds, logits
-#
-#     def loss_net1(self):
-#         istarget = tf.sign(tf.abs(tf.reduce_sum(self.x_mfcc, -1)))  # indicator: (N, T)
-#         loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits_ppg / hp.train1.t,
-#                                                               labels=self.y_ppg)
-#         loss *= istarget
-#         loss = tf.reduce_mean(loss)
-#         return loss
-#
-#     def acc_net1(self):
-#         istarget = tf.sign(tf.abs(tf.reduce_sum(self.x_mfcc, -1)))  # indicator: (N, T)
-#         num_hits = tf.reduce_sum(tf.to_float(tf.equal(self.pred_ppg, self.y_ppg)) * istarget)
-#         num_targets = tf.reduce_sum(istarget)
-#         acc = num_hits / num_targets
-#         return acc
-#
-#     def _net2(self):
-#         # PPGs from net1
-#         ppgs, preds_ppg, logits_ppg = self._net1()
-#
-#         with tf.variable_scope('net2'):
-#             # Pre-net
-#             prenet_out = prenet(ppgs,
-#                                 num_units=[hp.train2.hidden_units, hp.train2.hidden_units // 2],
-#                                 dropout_rate=hp.train2.dropout_rate,
-#                                 is_training=self.is_training)  # (N, T, E/2)
-#
-#             # CBHG1: mel-scale
-#             # pred_mel = cbhg(prenet_out, hp.train2.num_banks, hp.train2.hidden_units // 2,
-#             #                 hp.train2.num_highway_blocks, hp.train2.norm_type, self.is_training,
-#             #                 scope="cbhg_mel")
-#             # pred_mel = tf.layers.dense(pred_mel, self.y_mel.shape[-1])  # (N, T, n_mels)
-#             pred_mel = prenet_out
-#
-#             # CBHG2: linear-scale
-#             pred_spec = tf.layers.dense(pred_mel, hp.train2.hidden_units // 2)  # (N, T, n_mels)
-#             pred_spec = cbhg(pred_spec, hp.train2.num_banks, hp.train2.hidden_units // 2,
-#                              hp.train2.num_highway_blocks, hp.train2.norm_type, self.is_training,
-#                              scope="cbhg_linear")
-#             pred_spec = tf.layers.dense(pred_spec, self.y_spec.shape[-1])  # (N, T, 1+hp.n_fft//2)
-#             pred_spec = tf.expand_dims(pred_spec, axis=-1)
-#             pred_spec_mu = tf.layers.dense(pred_spec, hp.train2.n_mixtures)  # (N, T, 1+hp.n_fft//2, n_mixtures)
-#             pred_spec_phi = tf.nn.softmax(tf.layers.dense(pred_spec, hp.train2.n_mixtures))  # (N, T, 1+hp.n_fft//2, n_mixtures)
-#
-#         return ppgs, preds_ppg, logits_ppg, pred_spec_mu, pred_spec_phi
-#
-#     def loss_net2(self):
-#         # negative log likelihood
-#         normal_dists = []
-#         for i in range(hp.train2.n_mixtures):
-#             mu = self.pred_spec_mu[..., i]
-#             normal_dist = distributions.Logistic(mu, tf.ones_like(mu))
-#             normal_dists.append(normal_dist)
-#         cat = distributions.Categorical(probs=self.pred_spec_phi)
-#         mixture_dist = distributions.Mixture(cat=cat, components=normal_dists)
-#         prob = mixture_dist.cdf(value=self.y_spec + hp.train2.mol_step) - \
-#                mixture_dist.cdf(value=self.y_spec - hp.train2.mol_step)
-#         prob /= hp.train2.mol_step * 2
-#         self.prob_min = tf.reduce_min(prob)
-#         loss = -tf.reduce_mean(tf.log(prob + 1e-8))
-#         return loss
-#
-#     # def loss_net2(self):
-#     #     mol_step = 1e-3
-#     #     # negative log likelihood
-#     #     # normal_dists = []
-#     #     # for i in range(hp.train2.n_mixtures):
-#     #     #     mu = self.pred_spec_mu[..., i]
-#     #     mu = self.pred_spec_mu
-#     #     normal_dist = distributions.Logistic(mu, tf.ones_like(mu))
-#     #     # normal_dists.append(normal_dist)
-#     #     prob = normal_dist.cdf(value=self.y_spec + mol_step) - normal_dist.cdf(value=self.y_spec - mol_step)
-#     #     prob /= mol_step * 2
-#     #     prob = tf.reduce_sum(prob * self.pred_spec_phi, axis=-1)
-#     #     # cat = distributions.Categorical(probs=self.pred_spec_phi)
-#     #     # mixture_dist = distributions.Mixture(cat=cat, components=normal_dists)
-#     #     # log_likelihood = tf.reduce_sum(mixture_dist.log_prob(value=self.y_spec))
-#     #     loss = -tf.reduce_mean(tf.log(prob))
-#     #     return loss
-#
-#     @staticmethod
-#     def load(sess, mode, logdir, logdir2=None, step=None):
-#         def print_model_loaded(mode, logdir, step):
-#             model_name = Model.get_model_name(logdir, step=step)
-#             print('Model loaded. mode: {}, model_name: {}'.format(mode, model_name))
-#
-#         if mode in ['train1', 'test1']:
-#             var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'net/net1')
-#             if Model._load_variables(sess, logdir, var_list=var_list, step=step):
-#                 print_model_loaded(mode, logdir, step)
-#
-#         elif mode == 'train2':
-#             var_list1 = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'net/net1')
-#             if Model._load_variables(sess, logdir, var_list=var_list1, step=step):
-#                 print_model_loaded(mode, logdir, step)
-#
-#             var_list2 = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'net/net2')
-#             if Model._load_variables(sess, logdir2, var_list=var_list2, step=step):
-#                 print_model_loaded(mode, logdir2, step)
-#
-#         elif mode in ['test2', 'convert']:
-#             if Model._load_variables(sess, logdir, var_list=None, step=step):  # Load all variables
-#                 print_model_loaded(mode, logdir, step)
-#
-#     @staticmethod
-#     def _load_variables(sess, logdir, var_list, step=None):
-#         model_name = Model.get_model_name(logdir, step)
-#         if model_name:
-#             ckpt = os.path.join(logdir, model_name)
-#             tf.train.Saver(var_list=var_list).restore(sess, ckpt)
-#             return True
-#         else:
-#             return False
-#
-#     @staticmethod
-#     def get_model_name(logdir, step=None):
-#         model_name = None
-#         if step:
-#             paths = glob.glob('{}/*step_{}.index*'.format(logdir, step))
-#             if paths:
-#                 _, model_name, _ = split_path(paths[0])
-#         else:
-#             ckpt = tf.train.latest_checkpoint(logdir)
-#             if ckpt:
-#                 _, model_name = os.path.split(ckpt)
-#         return model_name
-#
-#     @staticmethod
-#     def get_epoch_and_global_step(logdir, step=None):
-#         model_name = Model.get_model_name(logdir, step)
-#         if model_name:
-#             tokens = model_name.split('_')
-#             epoch, gs = int(tokens[1]), int(tokens[3])
-#         else:
-#             epoch = gs = 0
-#         return epoch, gs
-#
-#     @staticmethod
-#     def all_model_names(logdir):
-#         path = '{}/*.meta'.format(logdir)
-#         model_names = map(lambda f: os.path.basename(f).replace('.meta', ''), glob.glob(path))
-#         return model_names
diff --git a/modules.py b/modules.py
index d882ab90..f0340edb 100644
--- a/modules.py
+++ b/modules.py
@@ -169,6 +169,7 @@ def conv1d(inputs,
         outputs = tf.layers.conv1d(**params)
     return outputs
 
+
 def conv1d_banks(inputs, K=16, num_units=None, norm_type=None, is_training=True, scope="conv1d_banks", reuse=None):
     '''Applies a series of conv1d separately.
 
@@ -191,6 +192,7 @@ def conv1d_banks(inputs, K=16, num_units=None, norm_type=None, is_training=True,
         outputs = tf.concat(outputs, -1)
     return outputs  # (N, T, Hp.embed_size//2*K)
 
+
 def gru(inputs, num_units=None, bidirection=False, seqlens=None, scope="gru", reuse=None):
     '''Applies a GRU.
 
@@ -224,6 +226,7 @@ def gru(inputs, num_units=None, bidirection=False, seqlens=None, scope="gru", re
                                                      dtype=tf.float32)
         return outputs
 
+
 def attention_decoder(inputs, memory, seqlens=None, num_units=None, scope="attention_decoder", reuse=None):
     '''Applies a GRU to `inputs`, while attending `memory`.
     Args:
diff --git a/train1.py b/train1.py
index c1258db6..32f7c274 100644
--- a/train1.py
+++ b/train1.py
@@ -48,7 +48,7 @@ def train(args, logdir):
         # session_config=session_conf
     )
     ckpt = args.ckpt if args.ckpt else tf.train.latest_checkpoint(logdir)
-    if ckpt and not args.r:
+    if ckpt:
         train_conf.session_init = SaverRestore(ckpt)
 
     if args.gpu:
diff --git a/train2.py b/train2.py
index 31af8047..63a36b7a 100644
--- a/train2.py
+++ b/train2.py
@@ -5,14 +5,14 @@
 import argparse
 import math
-import os
 
 import tensorflow as tf
 
 from tensorpack.callbacks.saver import ModelSaver
-from tensorpack.graph_builder.utils import LeastLoadedDeviceSetter
 from tensorpack.graph_builder.distributed import DataParallelBuilder
+from tensorpack.graph_builder.utils import LeastLoadedDeviceSetter
 from tensorpack.input_source.input_source import QueueInput
 from tensorpack.input_source.input_source import StagingInput
+from tensorpack.tfutils.sessinit import ChainInit
 from tensorpack.tfutils.sessinit import SaverRestore
 from tensorpack.tfutils.tower import TowerFuncWrapper
 from tensorpack.train.interface import TrainConfig
@@ -24,7 +24,8 @@
 from data_load import Net2DataFlow
 from hparam import hparam as hp
 from models import Net2
-from tensorpack.tfutils.sessinit import ChainInit
+import os
+
 
 def train(args, logdir1, logdir2):
     # model
@@ -43,23 +44,16 @@ def train(args, logdir1, logdir2):
     #     ),
     # )
 
-    input = QueueInput(df(n_prefetch=1000, n_thread=4))
-
     session_inits = []
     ckpt2 = args.ckpt if args.ckpt else tf.train.latest_checkpoint(logdir2)
     if ckpt2:
         session_inits.append(SaverRestore(ckpt2))
     ckpt1 = tf.train.latest_checkpoint(logdir1)
     if ckpt1:
-        session_inits.append(SaverRestore(ckpt1))
-
-    # if args.gpu:
-    #     os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
-    #     train_conf.nr_tower = len(args.gpu.split(','))
-
-    trainer = MultiGPUNet2Trainer(hp.train2.num_gpu, model=model, input=input)
-
-    trainer.train_with_defaults(
+        session_inits.append(SaverRestore(ckpt1, ignore=['global_step']))
+    train_conf = TrainConfig(
+        model=model,
+        data=QueueInput(df(n_prefetch=1000, n_thread=4)),
         callbacks=[
             # TODO save on prefix net2
             ModelSaver(checkpoint_dir=logdir2),
@@ -69,53 +63,19 @@ def train(args, logdir1, logdir2):
         steps_per_epoch=hp.train2.steps_per_epoch,
         session_init=ChainInit(session_inits)
     )
+    if args.gpu:
+        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
+        train_conf.nr_tower = len(args.gpu.split(','))
+
+    trainer = SyncMultiGPUTrainerReplicated(hp.train2.num_gpu)
+
+    launch_train_with_config(train_conf, trainer=trainer)
 
 
-class MultiGPUNet2Trainer(TowerTrainer):
-    def __init__(self, nr_gpu, input, model):
-        super(MultiGPUNet2Trainer, self).__init__()
-        assert nr_gpu > 0
-        raw_devices = ['/gpu:{}'.format(k) for k in range(nr_gpu)]
-
-        # Setup input
-        input = StagingInput(input, raw_devices)
-        cbs = input.setup(model.get_inputs_desc())
-        for cb in cbs:
-            self.register_callback(cb)
-
-        # Build the graph with multi-gpu replication
-        def get_cost(*inputs):
-            model.build_graph(*inputs)
-            return model.cost
-
-        self.tower_func = TowerFuncWrapper(get_cost, model.get_inputs_desc())
-        devices = [LeastLoadedDeviceSetter(d, raw_devices) for d in raw_devices]
-        cost_list = DataParallelBuilder.build_on_towers(
-            list(range(nr_gpu)),
-            lambda: self.tower_func(*input.get_input_tensors()),
-            devices)
-        # Simply average the cost here. It might be faster to average the gradients
-        loss_op = tf.add_n([x for x in cost_list]) * (1.0 / nr_gpu)
-
-        # Define optimizer
-        lr = tf.get_variable('learning_rate', initializer=hp.train2.lr, trainable=False)
-        optimizer = tf.train.AdamOptimizer(learning_rate=lr)
-        with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
-            var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'net2')
-
-            # Gradient clipping to prevent loss explosion
-            gvs = optimizer.compute_gradients(loss_op, var_list=var_list)
-            gvs = [(tf.clip_by_value(grad, hp.train2.clip_value_min, hp.train2.clip_value_max), var) for grad, var in
-                   gvs]
-            gvs = [(tf.clip_by_norm(grad, hp.train2.clip_norm), var) for grad, var in gvs]
-
-            self.train_op = optimizer.apply_gradients(gvs)
-
-
-def get_cyclic_lr(step):
-    lr_margin = hp.train2.lr_cyclic_margin * math.sin(2. * math.pi / hp.train2.lr_cyclic_steps * step)
-    lr = hp.train2.lr + lr_margin
-    return lr
+# def get_cyclic_lr(step):
+#     lr_margin = hp.train2.lr_cyclic_margin * math.sin(2. * math.pi / hp.train2.lr_cyclic_steps * step)
+#     lr = hp.train2.lr + lr_margin
+#     return lr
 
 
 def get_arguments():
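For reference, a minimal standalone sketch of how the gradient processors used in the new Net2._get_optimizer compose in tensorpack; the helper name and the literal values below are illustrative stand-ins for hp.train2.lr and hp.train2.clip_norm from the hparams above, not part of the patch:

    import tensorflow as tf
    from tensorpack.tfutils import optimizer, gradproc

    def build_net2_optimizer(lr=0.0003, clip_norm=3.0):
        opt = tf.train.AdamOptimizer(learning_rate=lr)
        gradprocs = [
            # MapGradient applies the (here: identity) function to gradients of
            # variables whose names match the regex; gradients of non-matching
            # variables pass through unchanged
            gradproc.MapGradient(lambda grad: grad, regex='.*net2.*'),
            # rescale all gradients so that their global norm stays within clip_norm
            gradproc.GlobalNormClip(clip_norm),
        ]
        # the returned optimizer runs the processors inside apply_gradients(),
        # so tensorpack trainers can use it as a drop-in replacement
        return optimizer.apply_grad_processors(opt, gradprocs)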