diff --git a/data_genration.py b/data_genration.py
new file mode 100644
index 0000000..be8a4d1
--- /dev/null
+++ b/data_genration.py
@@ -0,0 +1,359 @@
+"""
+Defines a class that is used to featurize audio clips and provide
+them to the network for training or testing.
+"""
+
+from __future__ import absolute_import, division, print_function
+
+import os
+import random
+import wave
+from concurrent.futures import ThreadPoolExecutor, wait
+from functools import reduce
+
+import numpy as np
+import soundfile
+from numpy.lib.stride_tricks import as_strided
+
+RNG_SEED = 123
+char_map_str = """
+' 1
+ 2
+a 3
+b 4
+c 5
+d 6
+e 7
+f 8
+g 9
+h 10
+i 11
+j 12
+k 13
+l 14
+m 15
+n 16
+o 17
+p 18
+q 19
+r 20
+s 21
+t 22
+u 23
+v 24
+w 25
+x 26
+y 27
+z 28
+"""
+char_map = {}
+index_map = {}
+for line in char_map_str.strip().split('\n'):
+    # Split on a single space so the space entry (" 2") parses as
+    # ch == '' instead of raising ValueError from plain split().
+    ch, index = line.split(' ')
+    char_map[ch] = int(index)
+    index_map[int(index)] = ch
+index_map[2] = ' '
+
+
+def calc_feat_dim(window, max_freq):
+    return int(0.001 * window * max_freq) + 1
+
+
+def text_to_int_sequence(text):
+    """ Use a character map and convert text to an integer sequence """
+    int_sequence = []
+    for c in text:
+        if c == ' ':
+            ch = char_map['']
+        else:
+            ch = char_map[c]
+        int_sequence.append(ch)
+    return int_sequence
+
+
+def int_to_text_sequence(seq):
+    """ Map an integer sequence back to characters. Note: index 28
+    (assigned to 'z' in char_map above) is decoded as the CTC blank
+    here, i.e. as an empty string. """
+    text_sequence = []
+    for c in seq:
+        if c == 28:
+            ch = ''
+        else:
+            ch = index_map[c]
+        text_sequence.append(ch)
+    return text_sequence
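+
+
+# Round-trip sketch (illustrative): spaces encode through char_map[''] == 2
+# and decode back through index_map[2] == ' ':
+#
+#     >>> text_to_int_sequence('hi there')
+#     [10, 11, 2, 22, 10, 7, 20, 7]
+#     >>> ''.join(int_to_text_sequence([10, 11, 2, 22, 10, 7, 20, 7]))
+#     'hi there'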
+
+
+def spectrogram(samples, fft_length=256, sample_rate=2, hop_length=128):
+    """
+    Compute the spectrogram for a real signal.
+    The parameters follow the naming convention of
+    matplotlib.mlab.specgram
+    Args:
+        samples (1D array): input audio signal
+        fft_length (int): number of elements in fft window
+        sample_rate (scalar): sample rate
+        hop_length (int): hop length (relative offset between neighboring
+            fft windows).
+    Returns:
+        x (2D array): spectrogram [frequency x time]
+        freq (1D array): frequency of each row in x
+    Note:
+        This is a truncating computation, e.g. if fft_length=10,
+        hop_length=5 and the signal has 23 elements, then the
+        last 3 elements will be truncated.
+    """
+    assert not np.iscomplexobj(samples), "Must not pass in complex numbers"
+
+    window = np.hanning(fft_length)[:, None]
+    window_norm = np.sum(window ** 2)
+
+    # The scaling below follows the convention of
+    # matplotlib.mlab.specgram, which is the same as
+    # MATLAB's specgram.
+    scale = window_norm * sample_rate
+
+    trunc = (len(samples) - fft_length) % hop_length
+    x = samples[:len(samples) - trunc]
+
+    # "stride trick" reshape to include overlap
+    nshape = (fft_length, (len(x) - fft_length) // hop_length + 1)
+    nstrides = (x.strides[0], x.strides[0] * hop_length)
+    x = as_strided(x, shape=nshape, strides=nstrides)
+
+    # window stride sanity check
+    assert np.all(x[:, 1] == samples[hop_length:(hop_length + fft_length)])
+
+    # broadcast window, compute fft over columns and square mod
+    x = np.fft.rfft(x * window, axis=0)
+    x = np.absolute(x) ** 2
+
+    # scale, 2.0 for everything except dc and fft_length/2
+    x[1:-1, :] *= (2.0 / scale)
+    x[(0, -1), :] /= scale
+
+    freqs = float(sample_rate) / fft_length * np.arange(x.shape[0])
+
+    return x, freqs
+
+
+def spectrogram_from_file(filename, step=10, window=20, max_freq=None,
+                          eps=1e-14):
+    """ Calculate the log of linear spectrogram from FFT energy
+    Params:
+        filename (str): Path to the audio file
+        step (int): Step size in milliseconds between windows
+        window (int): FFT window size in milliseconds
+        max_freq (int): Only FFT bins corresponding to frequencies between
+            [0, max_freq] are returned
+        eps (float): Small value to ensure numerical stability (for ln(x))
+    """
+    with soundfile.SoundFile(filename) as sound_file:
+        audio = sound_file.read(dtype='float32')
+        sample_rate = sound_file.samplerate
+        if audio.ndim >= 2:
+            audio = np.mean(audio, 1)
+        if max_freq is None:
+            max_freq = sample_rate / 2
+        if max_freq > sample_rate / 2:
+            raise ValueError("max_freq must not be greater than half of "
+                             "sample rate")
+        if step > window:
+            raise ValueError("step size must not be greater than window size")
+        hop_length = int(0.001 * step * sample_rate)
+        fft_length = int(0.001 * window * sample_rate)
+        pxx, freqs = spectrogram(
+            audio, fft_length=fft_length, sample_rate=sample_rate,
+            hop_length=hop_length)
+        ind = np.where(freqs <= max_freq)[0][-1] + 1
+    return np.transpose(np.log(pxx[:ind, :] + eps))
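+
+
+# Shape sketch (assumes a 16 kHz mono file named "sample.wav" exists):
+# window=20 ms -> fft_length=320, step=10 ms -> hop_length=160, and with
+# max_freq=8000 all 161 rfft bins are kept, matching calc_feat_dim(20, 8000)
+# and the (None, 161) input shape of the model in model.ipynb:
+#
+#     feats = spectrogram_from_file('sample.wav', step=10, window=20,
+#                                   max_freq=8000)
+#     feats.shape  # -> (timesteps, 161)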
+
+
+class DataGenerator(object):
+    def __init__(self, step=10, window=20, max_freq=8000, desc_file=None):
+        """
+        Params:
+            step (int): Step size in milliseconds between windows
+            window (int): FFT window size in milliseconds
+            max_freq (int): Only FFT bins corresponding to frequencies between
+                [0, max_freq] are returned
+            desc_file (str, optional): Path to a JSON-line file that contains
+                labels and paths to the audio files. If this is not None,
+                the metadata is loaded from it right away
+        """
+        self.feat_dim = calc_feat_dim(window, max_freq)
+        self.feats_mean = np.zeros((self.feat_dim,))
+        self.feats_std = np.ones((self.feat_dim,))
+        self.rng = random.Random(RNG_SEED)
+        self.step = step
+        self.window = window
+        self.max_freq = max_freq
+        if desc_file is not None:
+            # NOTE: no desc-file loader is defined in this file; only
+            # directory-based loading (read_data/load_data below) is
+            # implemented.
+            self.load_metadata_from_desc_file(desc_file)
+
+    def read_data(self, data_directory, max_duration=10.0):
+        labels = []
+        durations = []
+        keys = []
+        for group in os.listdir(data_directory):
+            group_path = os.path.join(data_directory, group)
+            if not os.path.isdir(group_path):
+                continue
+            for speaker in os.listdir(group_path):
+                speaker_path = os.path.join(group_path, speaker)
+                if not os.path.isdir(speaker_path):
+                    continue
+                for chapter in os.listdir(speaker_path):
+                    chapter_path = os.path.join(speaker_path, chapter)
+                    labels_file = os.path.join(chapter_path,
+                                               '{}-{}.trans.txt'
+                                               .format(speaker, chapter))
+                    with open(labels_file) as f:
+                        for line in f:
+                            split = line.strip().split()
+                            file_id = split[0]
+                            label = ' '.join(split[1:]).lower()
+                            audio_file = os.path.join(chapter_path,
+                                                      file_id) + '.wav'
+                            audio = wave.open(audio_file)
+                            duration = (float(audio.getnframes()) /
+                                        audio.getframerate())
+                            audio.close()
+                            if duration > max_duration:
+                                continue
+                            keys.append(audio_file)
+                            durations.append(duration)
+                            labels.append(label)
+
+        return keys, durations, labels
+
+    def featurize(self, audio_clip):
+        """ For a given audio clip, calculate the log of its Fourier Transform
+        Params:
+            audio_clip(str): Path to the audio clip
+        """
+        return spectrogram_from_file(
+            audio_clip, step=self.step, window=self.window,
+            max_freq=self.max_freq)
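+
+    # Directory layout assumed by read_data above (LibriSpeech-style, with
+    # audio converted to .wav); the names here are illustrative:
+    #
+    #     data_directory/<group>/<speaker>/<chapter>/
+    #         <speaker>-<chapter>.trans.txt   # one "<file_id> TRANSCRIPT" per line
+    #         <file_id>.wav                   # skipped if longer than max_duration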
" + "Must be train/validation/test") + + def load_train_data(self, data_directory): + self.load_data(data_directory, 'train') + + def load_test_data(self, data_directory): + self.load_data(data_directory, 'test') + + def load_validation_data(self, data_directory): + self.load_data(data_directory, 'validation') + + @staticmethod + def sort_by_duration(durations, audio_paths, texts): + return zip(*sorted(zip(durations, audio_paths, texts))) + + def normalize(self, feature, eps=1e-14): + return (feature - self.feats_mean) / (self.feats_std + eps) + + def prepare_batch(self, audio_paths, texts): + """ Featurize a batch of audio, zero pad them and return a dictionary + Params: + audio_paths (list(str)): List of paths to audio files + texts (list(str)): List of texts corresponding to the audio files + Returns: + dict: See below for contents + """ + assert len(audio_paths) == len(texts), \ + "Inputs and outputs to the network must be of the same number" + # Features is a list of (timesteps, feature_dim) arrays + # Calculate the features for each audio clip, as the log of the + # Fourier Transform of the audio + features = [self.featurize(a) for a in audio_paths] + input_lengths = [f.shape[0] for f in features] + max_length = max(input_lengths) + feature_dim = features[0].shape[1] + mb_size = len(features) + # Pad all the inputs so that they are all the same length + x = np.zeros((mb_size, max_length, feature_dim)) + y = [] + label_lengths = [] + for i in range(mb_size): + feat = features[i] + feat = self.normalize(feat) # Center using means and std + x[i, :feat.shape[0], :] = feat + label = text_to_int_sequence(texts[i]) + y.append(label) + label_lengths.append(len(label)) + # Flatten labels to comply with warp-CTC signature + y = reduce(lambda i, j: i + j, y) + return { + 'x': x, # (0-padded features of shape(batch_size, timesteps, feat_dim) + 'y': y, # list(int) Flattened labels (integer sequences) + 'texts': texts, # list(str) Original texts + 'input_lengths': input_lengths, # list(int) Length of each input + 'label_lengths': label_lengths # list(int) Length of each label + } + + def get_generator(self, audio_paths, texts, batch_size, shuffle=True, sort_by_duration=False): + def generator(): + num_samples = len(audio_paths) + while True: + if shuffle: + temp = list(zip(audio_paths, texts)) + self.rng.shuffle(temp) + x, y = list(zip(*temp)) + + pool = ThreadPoolExecutor(1) # Run a single I/O thread in parallel + future = pool.submit(self.prepare_batch, + x[:batch_size], + y[:batch_size]) + for offset in range(batch_size, num_samples, batch_size): + wait([future]) + batch = future.result() + future = pool.submit(self.prepare_batch, + x[offset: offset + batch_size], + y[offset: offset + batch_size]) + yield batch + + return generator() + + def get_train_generator(self, batch_size=16, shuffle=True): + return self.get_generator(self.train_audio_paths, self.train_texts, batch_size, shuffle) + + def get_test_generator(self, batch_size=16, shuffle=True): + return self.get_generator(self.test_audio_paths, self.test_texts, batch_size, shuffle) + + def get_validation_generator(self, batch_size=16, shuffle=True): + return self.get_generator(self.val_audio_paths, self.val_texts, batch_size, shuffle) + + def fit_train(self, k_samples=100): + """ Estimate the mean and std of the features from the training set + Params: + k_samples (int): Use this number of samples for estimation + """ + k_samples = min(k_samples, len(self.train_audio_paths)) + samples = self.rng.sample(self.train_audio_paths, k_samples) + 
+
+    def fit_train(self, k_samples=100):
+        """ Estimate the mean and std of the features from the training set
+        Params:
+            k_samples (int): Use this number of samples for estimation
+        """
+        k_samples = min(k_samples, len(self.train_audio_paths))
+        samples = self.rng.sample(self.train_audio_paths, k_samples)
+        feats = [self.featurize(s) for s in samples]
+        feats = np.vstack(feats)
+        self.feats_mean = np.mean(feats, axis=0)
+        self.feats_std = np.std(feats, axis=0)
\ No newline at end of file
diff --git a/model.ipynb b/model.ipynb
new file mode 100644
index 0000000..3a8048a
--- /dev/null
+++ b/model.ipynb
@@ -0,0 +1,140 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import tensorflow as tf\n",
+    "from tensorflow.python.keras.models import Sequential, Model\n",
+    "from tensorflow.python.keras.callbacks import Callback\n",
+    "from tensorflow.python.keras.layers import *\n",
+    "from tensorflow.python.keras.optimizers import Adam, SGD\n",
+    "from tensorflow.python.keras.activations import relu\n",
+    "from tensorflow.python.keras.metrics import categorical_accuracy, mean_squared_error\n",
+    "from tensorflow.python.keras.callbacks import BaseLogger, ModelCheckpoint, EarlyStopping, TensorBoard, ReduceLROnPlateau\n",
+    "from tensorflow.python.keras import backend as K\n",
+    "from tensorflow.python.keras.initializers import Ones, Zeros, glorot_normal\n",
+    "from tensorflow.python.framework import tensor_shape\n",
+    "from data_genration import DataGenerator\n",
+    "\n",
+    "import numpy as np"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def clipped_relu(x):\n",
+    "    return relu(x, max_value=20)\n",
+    "\n",
+    "def ctc_lambda_func(args):\n",
+    "    labels, y_pred, input_length, label_length = args\n",
+    "    return K.ctc_batch_cost(labels, y_pred, input_length, label_length)\n",
+    "\n",
+    "def ctc(y_true, y_pred):\n",
+    "    return y_pred"
+   ]
+  },
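+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Note on the pass-through `ctc` loss above: ctc_lambda_func computes the\n",
+    "# per-sample CTC loss *inside* the graph, so the trainable model's output\n",
+    "# already is the loss; compiling with loss=ctc just tells Keras to\n",
+    "# minimize that output directly. When fitting, Keras still needs dummy\n",
+    "# targets of matching batch size, e.g. np.zeros((batch_size,)).\n",
+    "# (This cell is explanatory only; nothing is executed here.)"
+   ]
+  },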
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_speech_model():\n",
+    "    model = Sequential()\n",
+    "    \n",
+    "    # Batch normalize the input\n",
+    "    model.add(BatchNormalization(axis=-1, input_shape=(None, 161), name='BN_1'))\n",
+    "    \n",
+    "    # 1D Convs\n",
+    "    model.add(Conv1D(512, 5, strides=1, activation=clipped_relu, name='Conv1D_1'))\n",
+    "    model.add(Conv1D(512, 5, strides=1, activation=clipped_relu, name='Conv1D_2'))\n",
+    "    model.add(Conv1D(512, 5, strides=2, activation=clipped_relu, name='Conv1D_3'))\n",
+    "    \n",
+    "    # Batch Normalization\n",
+    "    model.add(BatchNormalization(axis=-1, name='BN_2'))\n",
+    "    \n",
+    "    # BiRNNs\n",
+    "    model.add(Bidirectional(SimpleRNN(1280, return_sequences=True, name='BiRNN_1'), merge_mode='sum'))\n",
+    "    model.add(Bidirectional(SimpleRNN(1280, return_sequences=True, name='BiRNN_2'), merge_mode='sum'))\n",
+    "    model.add(Bidirectional(SimpleRNN(1280, return_sequences=True, name='BiRNN_3'), merge_mode='sum'))\n",
+    "    model.add(Bidirectional(SimpleRNN(1280, return_sequences=True, name='BiRNN_4'), merge_mode='sum'))\n",
+    "    model.add(Bidirectional(SimpleRNN(1280, return_sequences=True, name='BiRNN_5'), merge_mode='sum'))\n",
+    "    model.add(Bidirectional(SimpleRNN(1280, return_sequences=True, name='BiRNN_6'), merge_mode='sum'))\n",
+    "    model.add(Bidirectional(SimpleRNN(1280, return_sequences=True, name='BiRNN_7'), merge_mode='sum'))\n",
+    "    \n",
+    "    # Batch Normalization\n",
+    "    model.add(BatchNormalization(axis=-1, name='BN_3'))\n",
+    "    \n",
+    "    # FC\n",
+    "    model.add(TimeDistributed(Dense(1024, activation=clipped_relu, name='FC1')))\n",
+    "    model.add(TimeDistributed(Dense(29, activation='softmax', name='y_pred')))\n",
+    "    return model\n",
+    "\n",
+    "def get_trainable_speech_model():\n",
+    "    model = get_speech_model()\n",
+    "    y_pred = model.outputs[0]\n",
+    "    model_input = model.inputs[0]\n",
+    "    \n",
+    "    model.summary()\n",
+    "    \n",
+    "    labels = Input(name='the_labels', shape=[None,], dtype='int32')\n",
+    "    input_length = Input(name='input_length', shape=[1], dtype='int32')\n",
+    "    label_length = Input(name='label_length', shape=[1], dtype='int32')\n",
+    "\n",
+    "    loss_out = Lambda(ctc_lambda_func, name='ctc')([labels, y_pred, input_length, label_length])\n",
+    "    trainable_model = Model(inputs=[model_input, labels, input_length, label_length], outputs=loss_out)\n",
+    "    return trainable_model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": "Model: \"sequential\"\n_________________________________________________________________\nLayer (type) Output Shape Param # \n=================================================================\nBN_1 (BatchNormalization) (None, None, 161) 644 \n_________________________________________________________________\nConv1D_1 (Conv1D) (None, None, 512) 412672 \n_________________________________________________________________\nConv1D_2 (Conv1D) (None, None, 512) 1311232 \n_________________________________________________________________\nConv1D_3 (Conv1D) (None, None, 512) 1311232 \n_________________________________________________________________\nBN_2 (BatchNormalization) (None, None, 512) 2048 \n_________________________________________________________________\nbidirectional (Bidirectional (None, None, 1280) 4590080 \n_________________________________________________________________\nbidirectional_1 (Bidirection (None, None, 1280) 6556160 \n_________________________________________________________________\nbidirectional_2 (Bidirection (None, None, 1280) 6556160 \n_________________________________________________________________\nbidirectional_3 (Bidirection (None, None, 1280) 6556160 \n_________________________________________________________________\nbidirectional_4 (Bidirection (None, None, 1280) 6556160 \n_________________________________________________________________\nbidirectional_5 (Bidirection (None, None, 1280) 6556160 \n_________________________________________________________________\nbidirectional_6 (Bidirection (None, None, 1280) 6556160 \n_________________________________________________________________\nBN_3 (BatchNormalization) (None, None, 1280) 5120 \n_________________________________________________________________\ntime_distributed (TimeDistri (None, None, 1024) 1311744 \n_________________________________________________________________\ntime_distributed_1 (TimeDist (None, None, 29) 29725 \n=================================================================\nTotal params: 48,311,457\nTrainable params: 48,307,551\nNon-trainable params: 3,906\n_________________________________________________________________\nModel: \"model\"\n__________________________________________________________________________________________________\nLayer (type) Output Shape Param # Connected to \n==================================================================================================\nBN_1_input (InputLayer) [(None, None, 161)] 0 \n__________________________________________________________________________________________________\nBN_1 (BatchNormalization) (None, None, 161) 644 BN_1_input[0][0] \n__________________________________________________________________________________________________\nConv1D_1 (Conv1D) (None, None, 512) 412672 BN_1[0][0] \n__________________________________________________________________________________________________\nConv1D_2 (Conv1D) (None, None, 512) 1311232 Conv1D_1[0][0] \n__________________________________________________________________________________________________\nConv1D_3 (Conv1D) (None, None, 512) 1311232 Conv1D_2[0][0] \n__________________________________________________________________________________________________\nBN_2 (BatchNormalization) (None, None, 512) 2048 Conv1D_3[0][0] \n__________________________________________________________________________________________________\nbidirectional (Bidirectional) (None, None, 1280) 4590080 BN_2[0][0] \n__________________________________________________________________________________________________\nbidirectional_1 (Bidirectional) (None, None, 1280) 6556160 bidirectional[0][0] \n__________________________________________________________________________________________________\nbidirectional_2 (Bidirectional) (None, None, 1280) 6556160 bidirectional_1[0][0] \n__________________________________________________________________________________________________\nbidirectional_3 (Bidirectional) (None, None, 1280) 6556160 bidirectional_2[0][0] \n__________________________________________________________________________________________________\nbidirectional_4 (Bidirectional) (None, None, 1280) 6556160 bidirectional_3[0][0] \n__________________________________________________________________________________________________\nbidirectional_5 (Bidirectional) (None, None, 1280) 6556160 bidirectional_4[0][0] \n__________________________________________________________________________________________________\nbidirectional_6 (Bidirectional) (None, None, 1280) 6556160 bidirectional_5[0][0] \n__________________________________________________________________________________________________\nBN_3 (BatchNormalization) (None, None, 1280) 5120 bidirectional_6[0][0] \n__________________________________________________________________________________________________\ntime_distributed (TimeDistribut (None, None, 1024) 1311744 BN_3[0][0] \n__________________________________________________________________________________________________\nthe_labels (InputLayer) [(None, None)] 0 \n__________________________________________________________________________________________________\ntime_distributed_1 (TimeDistrib (None, None, 29) 29725 time_distributed[0][0] \n__________________________________________________________________________________________________\ninput_length (InputLayer) [(None, 1)] 0 \n__________________________________________________________________________________________________\nlabel_length (InputLayer) [(None, 1)] 0 \n__________________________________________________________________________________________________\nctc (Lambda) (None, 1) 0 the_labels[0][0] \n time_distributed_1[0][0] \n input_length[0][0] \n label_length[0][0] \n==================================================================================================\nTotal params: 48,311,457\nTrainable params: 48,307,551\nNon-trainable params: 3,906\n__________________________________________________________________________________________________\n"
+    }
+   ],
+   "source": [
+    "model = get_trainable_speech_model()\n",
+    "model.summary()\n"
+   ]
+  },
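+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Hedged wiring sketch (assumptions flagged; not from this repo). Two\n",
+    "# caveats when feeding DataGenerator batches into `model`:\n",
+    "#  1. K.ctc_batch_cost expects dense labels of shape (batch, max_label_len),\n",
+    "#     while data_genration.prepare_batch returns `y` flattened warp-CTC\n",
+    "#     style, so labels must be re-padded per sample before use.\n",
+    "#  2. `input_length` must count post-conv timesteps: the three 'valid'\n",
+    "#     Conv1D layers above (kernel 5, strides 1/1/2) map T input frames to\n",
+    "#     (T - 13) // 2 + 1.\n",
+    "\n",
+    "def conv_output_length(t):\n",
+    "    # Time frames remaining after Conv1D_1..Conv1D_3\n",
+    "    return (t - 13) // 2 + 1\n",
+    "\n",
+    "model.compile(optimizer=Adam(), loss=ctc)"
+   ]
+  },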
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
"mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8-final" + }, + "orig_nbformat": 2, + "kernelspec": { + "name": "python36864bit1f6584a510a843b7888176314c46071f", + "display_name": "Python 3.6.8 64-bit" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file