diff --git a/data_genration.py b/data_genration.py
new file mode 100644
index 0000000..be8a4d1
--- /dev/null
+++ b/data_genration.py
@@ -0,0 +1,359 @@
+"""
+Defines a class that is used to featurize audio clips and provide
+them to the network for training or testing.
+"""
+
+from __future__ import absolute_import, division, print_function
+
+import os
+import random
+import wave
+from concurrent.futures import ThreadPoolExecutor, wait
+from functools import reduce
+
+import numpy as np
+import soundfile
+from numpy.lib.stride_tricks import as_strided
+
+RNG_SEED = 123
+char_map_str = """
+' 1
+ 2
+a 3
+b 4
+c 5
+d 6
+e 7
+f 8
+g 9
+h 10
+i 11
+j 12
+k 13
+l 14
+m 15
+n 16
+o 17
+p 18
+q 19
+r 20
+s 21
+t 22
+u 23
+v 24
+w 25
+x 26
+y 27
+z 28
+"""
+char_map = {}
+index_map = {}
+for line in char_map_str.strip().split('\n'):
+    # Split on a single space so the space entry (" 2") parses as
+    # ch == '' instead of raising ValueError from plain split().
+    ch, index = line.split(' ')
+    char_map[ch] = int(index)
+    index_map[int(index)] = ch
+index_map[2] = ' '
+
+
+def calc_feat_dim(window, max_freq):
+    return int(0.001 * window * max_freq) + 1
+
+
+def text_to_int_sequence(text):
+    """ Use a character map and convert text to an integer sequence """
+    int_sequence = []
+    for c in text:
+        if c == ' ':
+            ch = char_map['']
+        else:
+            ch = char_map[c]
+        int_sequence.append(ch)
+    return int_sequence
+
+
+def int_to_text_sequence(seq):
+    """ Map an integer sequence back to characters. Note: index 28
+    (assigned to 'z' in char_map above) is decoded as the CTC blank
+    here, i.e. as an empty string. """
+    text_sequence = []
+    for c in seq:
+        if c == 28:
+            ch = ''
+        else:
+            ch = index_map[c]
+        text_sequence.append(ch)
+    return text_sequence
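+
+
+# Round-trip sketch (illustrative): spaces encode through char_map[''] == 2
+# and decode back through index_map[2] == ' ':
+#
+#     >>> text_to_int_sequence('hi there')
+#     [10, 11, 2, 22, 10, 7, 20, 7]
+#     >>> ''.join(int_to_text_sequence([10, 11, 2, 22, 10, 7, 20, 7]))
+#     'hi there'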
+
+
+def spectrogram(samples, fft_length=256, sample_rate=2, hop_length=128):
+    """
+    Compute the spectrogram for a real signal.
+    The parameters follow the naming convention of
+    matplotlib.mlab.specgram
+    Args:
+        samples (1D array): input audio signal
+        fft_length (int): number of elements in fft window
+        sample_rate (scalar): sample rate
+        hop_length (int): hop length (relative offset between neighboring
+            fft windows).
+    Returns:
+        x (2D array): spectrogram [frequency x time]
+        freq (1D array): frequency of each row in x
+    Note:
+        This is a truncating computation, e.g. if fft_length=10,
+        hop_length=5 and the signal has 23 elements, then the
+        last 3 elements will be truncated.
+    """
+    assert not np.iscomplexobj(samples), "Must not pass in complex numbers"
+
+    window = np.hanning(fft_length)[:, None]
+    window_norm = np.sum(window ** 2)
+
+    # The scaling below follows the convention of
+    # matplotlib.mlab.specgram, which is the same as
+    # MATLAB's specgram.
+    scale = window_norm * sample_rate
+
+    trunc = (len(samples) - fft_length) % hop_length
+    x = samples[:len(samples) - trunc]
+
+    # "stride trick" reshape to include overlap
+    nshape = (fft_length, (len(x) - fft_length) // hop_length + 1)
+    nstrides = (x.strides[0], x.strides[0] * hop_length)
+    x = as_strided(x, shape=nshape, strides=nstrides)
+
+    # window stride sanity check
+    assert np.all(x[:, 1] == samples[hop_length:(hop_length + fft_length)])
+
+    # broadcast window, compute fft over columns and square mod
+    x = np.fft.rfft(x * window, axis=0)
+    x = np.absolute(x) ** 2
+
+    # scale, 2.0 for everything except dc and fft_length/2
+    x[1:-1, :] *= (2.0 / scale)
+    x[(0, -1), :] /= scale
+
+    freqs = float(sample_rate) / fft_length * np.arange(x.shape[0])
+
+    return x, freqs
+
+
+def spectrogram_from_file(filename, step=10, window=20, max_freq=None,
+                          eps=1e-14):
+    """ Calculate the log of linear spectrogram from FFT energy
+    Params:
+        filename (str): Path to the audio file
+        step (int): Step size in milliseconds between windows
+        window (int): FFT window size in milliseconds
+        max_freq (int): Only FFT bins corresponding to frequencies between
+            [0, max_freq] are returned
+        eps (float): Small value to ensure numerical stability (for ln(x))
+    """
+    with soundfile.SoundFile(filename) as sound_file:
+        audio = sound_file.read(dtype='float32')
+        sample_rate = sound_file.samplerate
+        if audio.ndim >= 2:
+            audio = np.mean(audio, 1)
+        if max_freq is None:
+            max_freq = sample_rate / 2
+        if max_freq > sample_rate / 2:
+            raise ValueError("max_freq must not be greater than half of "
+                             "sample rate")
+        if step > window:
+            raise ValueError("step size must not be greater than window size")
+        hop_length = int(0.001 * step * sample_rate)
+        fft_length = int(0.001 * window * sample_rate)
+        pxx, freqs = spectrogram(
+            audio, fft_length=fft_length, sample_rate=sample_rate,
+            hop_length=hop_length)
+        ind = np.where(freqs <= max_freq)[0][-1] + 1
+    return np.transpose(np.log(pxx[:ind, :] + eps))
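+
+
+# Shape sketch (assumes a 16 kHz mono file named "sample.wav" exists):
+# window=20 ms -> fft_length=320, step=10 ms -> hop_length=160, and with
+# max_freq=8000 all 161 rfft bins are kept, matching calc_feat_dim(20, 8000)
+# and the (None, 161) input shape of the model in model.ipynb:
+#
+#     feats = spectrogram_from_file('sample.wav', step=10, window=20,
+#                                   max_freq=8000)
+#     feats.shape  # -> (timesteps, 161)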
+
+
+class DataGenerator(object):
+    def __init__(self, step=10, window=20, max_freq=8000, desc_file=None):
+        """
+        Params:
+            step (int): Step size in milliseconds between windows
+            window (int): FFT window size in milliseconds
+            max_freq (int): Only FFT bins corresponding to frequencies between
+                [0, max_freq] are returned
+            desc_file (str, optional): Path to a JSON-line file that contains
+                labels and paths to the audio files. If this is not None,
+                the metadata is loaded from it right away
+        """
+        self.feat_dim = calc_feat_dim(window, max_freq)
+        self.feats_mean = np.zeros((self.feat_dim,))
+        self.feats_std = np.ones((self.feat_dim,))
+        self.rng = random.Random(RNG_SEED)
+        self.step = step
+        self.window = window
+        self.max_freq = max_freq
+        if desc_file is not None:
+            # NOTE: no desc-file loader is defined in this file; only
+            # directory-based loading (read_data/load_data below) is
+            # implemented.
+            self.load_metadata_from_desc_file(desc_file)
+
+    def read_data(self, data_directory, max_duration=10.0):
+        labels = []
+        durations = []
+        keys = []
+        for group in os.listdir(data_directory):
+            group_path = os.path.join(data_directory, group)
+            if not os.path.isdir(group_path):
+                continue
+            for speaker in os.listdir(group_path):
+                speaker_path = os.path.join(group_path, speaker)
+                if not os.path.isdir(speaker_path):
+                    continue
+                for chapter in os.listdir(speaker_path):
+                    chapter_path = os.path.join(speaker_path, chapter)
+                    labels_file = os.path.join(chapter_path,
+                                               '{}-{}.trans.txt'
+                                               .format(speaker, chapter))
+                    with open(labels_file) as f:
+                        for line in f:
+                            split = line.strip().split()
+                            file_id = split[0]
+                            label = ' '.join(split[1:]).lower()
+                            audio_file = os.path.join(chapter_path,
+                                                      file_id) + '.wav'
+                            audio = wave.open(audio_file)
+                            duration = (float(audio.getnframes()) /
+                                        audio.getframerate())
+                            audio.close()
+                            if duration > max_duration:
+                                continue
+                            keys.append(audio_file)
+                            durations.append(duration)
+                            labels.append(label)
+
+        return keys, durations, labels
+
+    def featurize(self, audio_clip):
+        """ For a given audio clip, calculate the log of its Fourier Transform
+        Params:
+            audio_clip(str): Path to the audio clip
+        """
+        return spectrogram_from_file(
+            audio_clip, step=self.step, window=self.window,
+            max_freq=self.max_freq)
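+
+    # Directory layout assumed by read_data above (LibriSpeech-style, with
+    # audio converted to .wav); the names here are illustrative:
+    #
+    #     data_directory/<group>/<speaker>/<chapter>/
+    #         <speaker>-<chapter>.trans.txt   # one "<file_id> TRANSCRIPT" per line
+    #         <file_id>.wav                   # skipped if longer than max_duration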
" + "Must be train/validation/test") + + def load_train_data(self, data_directory): + self.load_data(data_directory, 'train') + + def load_test_data(self, data_directory): + self.load_data(data_directory, 'test') + + def load_validation_data(self, data_directory): + self.load_data(data_directory, 'validation') + + @staticmethod + def sort_by_duration(durations, audio_paths, texts): + return zip(*sorted(zip(durations, audio_paths, texts))) + + def normalize(self, feature, eps=1e-14): + return (feature - self.feats_mean) / (self.feats_std + eps) + + def prepare_batch(self, audio_paths, texts): + """ Featurize a batch of audio, zero pad them and return a dictionary + Params: + audio_paths (list(str)): List of paths to audio files + texts (list(str)): List of texts corresponding to the audio files + Returns: + dict: See below for contents + """ + assert len(audio_paths) == len(texts), \ + "Inputs and outputs to the network must be of the same number" + # Features is a list of (timesteps, feature_dim) arrays + # Calculate the features for each audio clip, as the log of the + # Fourier Transform of the audio + features = [self.featurize(a) for a in audio_paths] + input_lengths = [f.shape[0] for f in features] + max_length = max(input_lengths) + feature_dim = features[0].shape[1] + mb_size = len(features) + # Pad all the inputs so that they are all the same length + x = np.zeros((mb_size, max_length, feature_dim)) + y = [] + label_lengths = [] + for i in range(mb_size): + feat = features[i] + feat = self.normalize(feat) # Center using means and std + x[i, :feat.shape[0], :] = feat + label = text_to_int_sequence(texts[i]) + y.append(label) + label_lengths.append(len(label)) + # Flatten labels to comply with warp-CTC signature + y = reduce(lambda i, j: i + j, y) + return { + 'x': x, # (0-padded features of shape(batch_size, timesteps, feat_dim) + 'y': y, # list(int) Flattened labels (integer sequences) + 'texts': texts, # list(str) Original texts + 'input_lengths': input_lengths, # list(int) Length of each input + 'label_lengths': label_lengths # list(int) Length of each label + } + + def get_generator(self, audio_paths, texts, batch_size, shuffle=True, sort_by_duration=False): + def generator(): + num_samples = len(audio_paths) + while True: + if shuffle: + temp = list(zip(audio_paths, texts)) + self.rng.shuffle(temp) + x, y = list(zip(*temp)) + + pool = ThreadPoolExecutor(1) # Run a single I/O thread in parallel + future = pool.submit(self.prepare_batch, + x[:batch_size], + y[:batch_size]) + for offset in range(batch_size, num_samples, batch_size): + wait([future]) + batch = future.result() + future = pool.submit(self.prepare_batch, + x[offset: offset + batch_size], + y[offset: offset + batch_size]) + yield batch + + return generator() + + def get_train_generator(self, batch_size=16, shuffle=True): + return self.get_generator(self.train_audio_paths, self.train_texts, batch_size, shuffle) + + def get_test_generator(self, batch_size=16, shuffle=True): + return self.get_generator(self.test_audio_paths, self.test_texts, batch_size, shuffle) + + def get_validation_generator(self, batch_size=16, shuffle=True): + return self.get_generator(self.val_audio_paths, self.val_texts, batch_size, shuffle) + + def fit_train(self, k_samples=100): + """ Estimate the mean and std of the features from the training set + Params: + k_samples (int): Use this number of samples for estimation + """ + k_samples = min(k_samples, len(self.train_audio_paths)) + samples = self.rng.sample(self.train_audio_paths, k_samples) + 
+
+    def fit_train(self, k_samples=100):
+        """ Estimate the mean and std of the features from the training set
+        Params:
+            k_samples (int): Use this number of samples for estimation
+        """
+        k_samples = min(k_samples, len(self.train_audio_paths))
+        samples = self.rng.sample(self.train_audio_paths, k_samples)
+        feats = [self.featurize(s) for s in samples]
+        feats = np.vstack(feats)
+        self.feats_mean = np.mean(feats, axis=0)
+        self.feats_std = np.std(feats, axis=0)
\ No newline at end of file
diff --git a/model.ipynb b/model.ipynb
new file mode 100644
index 0000000..3a8048a
--- /dev/null
+++ b/model.ipynb
@@ -0,0 +1,140 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import tensorflow as tf\n",
+    "from tensorflow.python.keras.models import Sequential, Model\n",
+    "from tensorflow.python.keras.callbacks import Callback\n",
+    "from tensorflow.python.keras.layers import *\n",
+    "from tensorflow.python.keras.optimizers import Adam, SGD\n",
+    "from tensorflow.python.keras.activations import relu\n",
+    "from tensorflow.python.keras.metrics import categorical_accuracy, mean_squared_error\n",
+    "from tensorflow.python.keras.callbacks import BaseLogger, ModelCheckpoint, EarlyStopping, TensorBoard, ReduceLROnPlateau\n",
+    "from tensorflow.python.keras import backend as K\n",
+    "from tensorflow.python.keras.initializers import Ones, Zeros, glorot_normal\n",
+    "from tensorflow.python.framework import tensor_shape\n",
+    "from data_genration import DataGenerator\n",
+    "\n",
+    "import numpy as np"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def clipped_relu(x):\n",
+    "    return relu(x, max_value=20)\n",
+    "\n",
+    "def ctc_lambda_func(args):\n",
+    "    labels, y_pred, input_length, label_length = args\n",
+    "    return K.ctc_batch_cost(labels, y_pred, input_length, label_length)\n",
+    "\n",
+    "def ctc(y_true, y_pred):\n",
+    "    return y_pred"
+   ]
+  },
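+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Note on the pass-through `ctc` loss above: ctc_lambda_func computes the\n",
+    "# per-sample CTC loss *inside* the graph, so the trainable model's output\n",
+    "# already is the loss; compiling with loss=ctc just tells Keras to\n",
+    "# minimize that output directly. When fitting, Keras still needs dummy\n",
+    "# targets of matching batch size, e.g. np.zeros((batch_size,)).\n",
+    "# (This cell is explanatory only; nothing is executed here.)"
+   ]
+  },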
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_speech_model():\n",
+    "    model = Sequential()\n",
+    "    \n",
+    "    # Batch normalize the input\n",
+    "    model.add(BatchNormalization(axis=-1, input_shape=(None, 161), name='BN_1'))\n",
+    "    \n",
+    "    # 1D Convs\n",
+    "    model.add(Conv1D(512, 5, strides=1, activation=clipped_relu, name='Conv1D_1'))\n",
+    "    model.add(Conv1D(512, 5, strides=1, activation=clipped_relu, name='Conv1D_2'))\n",
+    "    model.add(Conv1D(512, 5, strides=2, activation=clipped_relu, name='Conv1D_3'))\n",
+    "    \n",
+    "    # Batch Normalization\n",
+    "    model.add(BatchNormalization(axis=-1, name='BN_2'))\n",
+    "    \n",
+    "    # BiRNNs\n",
+    "    model.add(Bidirectional(SimpleRNN(1280, return_sequences=True, name='BiRNN_1'), merge_mode='sum'))\n",
+    "    model.add(Bidirectional(SimpleRNN(1280, return_sequences=True, name='BiRNN_2'), merge_mode='sum'))\n",
+    "    model.add(Bidirectional(SimpleRNN(1280, return_sequences=True, name='BiRNN_3'), merge_mode='sum'))\n",
+    "    model.add(Bidirectional(SimpleRNN(1280, return_sequences=True, name='BiRNN_4'), merge_mode='sum'))\n",
+    "    model.add(Bidirectional(SimpleRNN(1280, return_sequences=True, name='BiRNN_5'), merge_mode='sum'))\n",
+    "    model.add(Bidirectional(SimpleRNN(1280, return_sequences=True, name='BiRNN_6'), merge_mode='sum'))\n",
+    "    model.add(Bidirectional(SimpleRNN(1280, return_sequences=True, name='BiRNN_7'), merge_mode='sum'))\n",
+    "    \n",
+    "    # Batch Normalization\n",
+    "    model.add(BatchNormalization(axis=-1, name='BN_3'))\n",
+    "    \n",
+    "    # FC\n",
+    "    model.add(TimeDistributed(Dense(1024, activation=clipped_relu, name='FC1')))\n",
+    "    model.add(TimeDistributed(Dense(29, activation='softmax', name='y_pred')))\n",
+    "    return model\n",
+    "\n",
+    "def get_trainable_speech_model():\n",
+    "    model = get_speech_model()\n",
+    "    y_pred = model.outputs[0]\n",
+    "    model_input = model.inputs[0]\n",
+    "    \n",
+    "    model.summary()\n",
+    "    \n",
+    "    labels = Input(name='the_labels', shape=[None,], dtype='int32')\n",
+    "    input_length = Input(name='input_length', shape=[1], dtype='int32')\n",
+    "    label_length = Input(name='label_length', shape=[1], dtype='int32')\n",
+    "\n",
+    "    loss_out = Lambda(ctc_lambda_func, name='ctc')([labels, y_pred, input_length, label_length])\n",
+    "    trainable_model = Model(inputs=[model_input, labels, input_length, label_length], outputs=loss_out)\n",
+    "    return trainable_model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": "Model: \"sequential\"\n_________________________________________________________________\nLayer (type) Output Shape Param # \n=================================================================\nBN_1 (BatchNormalization) (None, None, 161) 644 \n_________________________________________________________________\nConv1D_1 (Conv1D) (None, None, 512) 412672 \n_________________________________________________________________\nConv1D_2 (Conv1D) (None, None, 512) 1311232 \n_________________________________________________________________\nConv1D_3 (Conv1D) (None, None, 512) 1311232 \n_________________________________________________________________\nBN_2 (BatchNormalization) (None, None, 512) 2048 \n_________________________________________________________________\nbidirectional (Bidirectional (None, None, 1280) 4590080 \n_________________________________________________________________\nbidirectional_1 (Bidirection (None, None, 1280) 6556160 \n_________________________________________________________________\nbidirectional_2 (Bidirection (None, None, 1280) 6556160 \n_________________________________________________________________\nbidirectional_3 (Bidirection (None, None, 1280) 6556160 \n_________________________________________________________________\nbidirectional_4 (Bidirection (None, None, 1280) 6556160 \n_________________________________________________________________\nbidirectional_5 (Bidirection (None, None, 1280) 6556160 \n_________________________________________________________________\nbidirectional_6 (Bidirection (None, None, 1280) 6556160 \n_________________________________________________________________\nBN_3 (BatchNormalization) (None, None, 1280) 5120 \n_________________________________________________________________\ntime_distributed (TimeDistri (None, None, 1024) 1311744 \n_________________________________________________________________\ntime_distributed_1 (TimeDist (None, None, 29) 29725 \n=================================================================\nTotal params: 48,311,457\nTrainable params: 48,307,551\nNon-trainable params: 3,906\n_________________________________________________________________\nModel: \"model\"\n__________________________________________________________________________________________________\nLayer (type) Output Shape Param # Connected to \n==================================================================================================\nBN_1_input (InputLayer) [(None, None, 161)] 0 \n__________________________________________________________________________________________________\nBN_1 (BatchNormalization) (None, None, 161) 644 BN_1_input[0][0] \n__________________________________________________________________________________________________\nConv1D_1 (Conv1D) (None, None, 512) 412672 BN_1[0][0] \n__________________________________________________________________________________________________\nConv1D_2 (Conv1D) (None, None, 512) 1311232 Conv1D_1[0][0] \n__________________________________________________________________________________________________\nConv1D_3 (Conv1D) (None, None, 512) 1311232 Conv1D_2[0][0] \n__________________________________________________________________________________________________\nBN_2 (BatchNormalization) (None, None, 512) 2048 Conv1D_3[0][0] \n__________________________________________________________________________________________________\nbidirectional (Bidirectional) (None, None, 1280) 4590080 BN_2[0][0] \n__________________________________________________________________________________________________\nbidirectional_1 (Bidirectional) (None, None, 1280) 6556160 bidirectional[0][0] \n__________________________________________________________________________________________________\nbidirectional_2 (Bidirectional) (None, None, 1280) 6556160 bidirectional_1[0][0] \n__________________________________________________________________________________________________\nbidirectional_3 (Bidirectional) (None, None, 1280) 6556160 bidirectional_2[0][0] \n__________________________________________________________________________________________________\nbidirectional_4 (Bidirectional) (None, None, 1280) 6556160 bidirectional_3[0][0] \n__________________________________________________________________________________________________\nbidirectional_5 (Bidirectional) (None, None, 1280) 6556160 bidirectional_4[0][0] \n__________________________________________________________________________________________________\nbidirectional_6 (Bidirectional) (None, None, 1280) 6556160 bidirectional_5[0][0] \n__________________________________________________________________________________________________\nBN_3 (BatchNormalization) (None, None, 1280) 5120 bidirectional_6[0][0] \n__________________________________________________________________________________________________\ntime_distributed (TimeDistribut (None, None, 1024) 1311744 BN_3[0][0] \n__________________________________________________________________________________________________\nthe_labels (InputLayer) [(None, None)] 0 \n__________________________________________________________________________________________________\ntime_distributed_1 (TimeDistrib (None, None, 29) 29725 time_distributed[0][0] \n__________________________________________________________________________________________________\ninput_length (InputLayer) [(None, 1)] 0 \n__________________________________________________________________________________________________\nlabel_length (InputLayer) [(None, 1)] 0 \n__________________________________________________________________________________________________\nctc (Lambda) (None, 1) 0 the_labels[0][0] \n time_distributed_1[0][0] \n input_length[0][0] \n label_length[0][0] \n==================================================================================================\nTotal params: 48,311,457\nTrainable params: 48,307,551\nNon-trainable params: 3,906\n__________________________________________________________________________________________________\n"
+    }
+   ],
+   "source": [
+    "model = get_trainable_speech_model()\n",
+    "model.summary()\n"
+   ]
+  },
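+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Hedged wiring sketch (assumptions flagged; not from this repo). Two\n",
+    "# caveats when feeding DataGenerator batches into `model`:\n",
+    "#  1. K.ctc_batch_cost expects dense labels of shape (batch, max_label_len),\n",
+    "#     while data_genration.prepare_batch returns `y` flattened warp-CTC\n",
+    "#     style, so labels must be re-padded per sample before use.\n",
+    "#  2. `input_length` must count post-conv timesteps: the three 'valid'\n",
+    "#     Conv1D layers above (kernel 5, strides 1/1/2) map T input frames to\n",
+    "#     (T - 13) // 2 + 1.\n",
+    "\n",
+    "def conv_output_length(t):\n",
+    "    # Time frames remaining after Conv1D_1..Conv1D_3\n",
+    "    return (t - 13) // 2 + 1\n",
+    "\n",
+    "model.compile(optimizer=Adam(), loss=ctc)"
+   ]
+  },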
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
"mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8-final" + }, + "orig_nbformat": 2, + "kernelspec": { + "name": "python36864bit1f6584a510a843b7888176314c46071f", + "display_name": "Python 3.6.8 64-bit" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file