Optimizers
eriklindernoren committed Sep 20, 2017
1 parent e3a4d62 commit 5f7d7f4
Showing 9 changed files with 51 additions and 84 deletions.
11 changes: 10 additions & 1 deletion mlfromscratch/deep_learning/layers.py
@@ -11,21 +11,31 @@
class Layer(object):

def set_input_shape(self, shape):
""" Sets the shape that the layer expects of the input in the forward
pass method """
self.input_shape = shape

def layer_name(self):
""" The name of the layer. Used in model summary. """
return self.__class__.__name__

def parameters(self):
""" The number of trainable parameters used by the layer """
return 0

def forward_pass(self, X, training):
""" Propogates the signal forward in the network """
raise NotImplementedError()

def backward_pass(self, acc_grad):
""" Propogates the accumulated gradient backwards in the network.
If the has trainable weights then these weights are also tuned in this method.
As input (acc_grad) it receives the gradient with respect to the output of the layer and
returns the gradient with respect to the output of the previous layer. """
raise NotImplementedError()

def output_shape(self):
""" The shape of the output produced by forward_pass """
raise NotImplementedError()


@@ -65,7 +75,6 @@ def forward_pass(self, X, training=True):
return X.dot(self.W) + self.w0

def backward_pass(self, acc_grad):

# Save weights used during forward pass
W = self.W

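The docstrings added above spell out the Layer contract: forward_pass produces the layer's output, and backward_pass receives the gradient with respect to that output and returns the gradient to hand to the previous layer. As a minimal sketch of a layer that satisfies this contract (illustrative only, not part of this commit; it assumes the Layer base class shown above):

import numpy as np

class ReLULayer(Layer):
    """ Hypothetical parameter-free activation layer, for illustration. """
    def forward_pass(self, X, training=True):
        # Cache the input so the backward pass can mask the gradient
        self.layer_input = X
        return np.maximum(0, X)

    def backward_pass(self, acc_grad):
        # No trainable weights to tune; pass the gradient through
        # wherever the forward input was positive
        return acc_grad * (self.layer_input > 0)

    def output_shape(self):
        # An element-wise activation leaves the shape unchanged
        return self.input_shape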
8 changes: 1 addition & 7 deletions mlfromscratch/deep_learning/neural_network.py
@@ -1,12 +1,6 @@
from __future__ import print_function
from sklearn import datasets
from terminaltables import AsciiTable
import sys
import os
import math
import copy
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import progressbar

@@ -134,7 +128,7 @@ def summary(self, name="Model Summary"):

# Print network configuration table
print (AsciiTable(table_data).table)

print ("Total Parameters: %d\n" % tot_params)

def predict(self, X):
67 changes: 27 additions & 40 deletions mlfromscratch/deep_learning/optimizers.py
@@ -1,22 +1,23 @@
import numpy as np
from mlfromscratch.utils.data_manipulation import make_diagonal, normalize

# Optimizers for models that use gradient methods for finding the
# Optimizers for models that use gradient-based methods for finding the
# weights that minimize the loss.
# A good resource:
# A great resource for understanding these methods:
# http://sebastianruder.com/optimizing-gradient-descent/index.html

class GradientDescent():
class StochasticGradientDescent():
def __init__(self, learning_rate=0.01, momentum=0):
self.learning_rate = learning_rate
self.momentum = momentum
self.w_updt = np.array([])
self.w_updt = None

def update(self, w, grad_wrt_w):
if not self.w_updt.any():
# If not initialized
if self.w_updt is None:
self.w_updt = np.zeros(np.shape(w))
# Use momentum if set
self.w_updt = self.momentum * self.w_updt + grad_wrt_w
self.w_updt = self.momentum * self.w_updt + (1 - self.momentum) * grad_wrt_w
# Move against the gradient to minimize loss
return w - self.learning_rate * self.w_updt

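Two changes recur in this file: the running state is now flagged as uninitialized with None rather than an empty array, and the momentum term is an exponential moving average of the gradient instead of a raw accumulation. A small standalone illustration of both points (names and numbers are examples, not repository code):

import numpy as np

state = np.array([])
print(not state.any())   # True, so initialization is triggered on the first call
state = np.zeros(3)
print(not state.any())   # also True: an all-zero running state looks uninitialized,
                         # which is why the None sentinel is the safer check

def momentum_step(w, grad_wrt_w, w_updt, learning_rate=0.01, momentum=0.9):
    # Exponential moving average of the gradient ...
    w_updt = momentum * w_updt + (1 - momentum) * grad_wrt_w
    # ... followed by a step against the gradient
    return w - learning_rate * w_updt, w_updt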
@@ -40,43 +41,35 @@ def update(self, w, grad_func):
class Adagrad():
def __init__(self, learning_rate=0.01):
self.learning_rate = learning_rate
self.G = np.array([]) # Sum of squares of the gradients
self.G = None # Sum of squares of the gradients
self.eps = 1e-8

def update(self, w, grad_wrt_w):
# Gradient clipping to avoid exploding grads
grad_at_w = np.clip(grad_wrt_w, -1, 1)
# If not initialized
if not self.G.any():
if self.G is None:
self.G = np.zeros(np.shape(w))
# Add the square of the gradient of the loss function at w
self.G += np.power(grad_at_w, 2)
self.G += np.power(grad_wrt_w, 2)
# Adaptive gradient with higher learning rate for sparse data
w_updt = self.learning_rate / np.sqrt(self.G + self.eps).T * grad_at_w

return w - w_updt

return w - self.learning_rate * grad_wrt_w / np.sqrt(self.G + self.eps)

class Adadelta():
def __init__(self, rho=0.95, eps=1e-6):
self.E_w_updt = np.array([]) # Running average of squared parameter updates
self.E_grad = np.array([]) # Running average of the squared gradient of w
self.w_updt = np.array([]) # Parameter update
self.E_w_updt = None # Running average of squared parameter updates
self.E_grad = None # Running average of the squared gradient of w
self.w_updt = None # Parameter update
self.eps = eps
self.rho = rho

def update(self, w, grad_wrt_w):
# Gradient clipping to avoid exploding grads
grad_at_w = np.clip(grad_wrt_w, -1, 1)

# If not initialized
if not self.w_updt.any():
if self.w_updt is None:
self.w_updt = np.zeros(np.shape(w))
self.E_w_updt = np.zeros(np.shape(w))
self.E_grad = np.zeros(np.shape(grad_at_w))
self.E_grad = np.zeros(np.shape(grad_wrt_w))

# Update average of gradients at w
self.E_grad = self.rho * self.E_grad + (1 - self.rho) * np.power(grad_at_w, 2)
self.E_grad = self.rho * self.E_grad + (1 - self.rho) * np.power(grad_wrt_w, 2)

RMS_delta_w = np.sqrt(self.E_w_updt + self.eps)
RMS_grad = np.sqrt(self.E_grad + self.eps)
@@ -85,7 +78,7 @@ def update(self, w, grad_wrt_w):
adaptive_lr = RMS_delta_w / RMS_grad

# Calculate the update
self.w_updt = adaptive_lr * grad_at_w
self.w_updt = adaptive_lr * grad_wrt_w

# Update the running average of w updates
self.E_w_updt = self.rho * self.E_w_updt + (1 - self.rho) * np.power(self.w_updt, 2)
@@ -95,40 +88,34 @@ def update(self, w, grad_wrt_w):
class RMSprop():
def __init__(self, learning_rate=0.01, rho=0.9):
self.learning_rate = learning_rate
self.Eg = np.array([]) # Running average of the square gradients at w
self.Eg = None # Running average of the square gradients at w
self.eps = 1e-8
self.rho = rho

def update(self, w, grad_wrt_w):
# Gradient clipping to avoid exploding grads
grad_at_w = np.clip(grad_wrt_w, -1, 1)

# If not initialized
if not self.Eg.any():
self.Eg = np.zeros(np.shape(grad_at_w))
if self.Eg is None:
self.Eg = np.zeros(np.shape(grad_wrt_w))

self.Eg = self.rho * self.Eg + (1 - self.rho) * np.power(grad_at_w, 2)
self.Eg = self.rho * self.Eg + (1 - self.rho) * np.power(grad_wrt_w, 2)

# Divide the learning rate for a weight by a running average of the magnitudes of recent
# gradients for that weight
self.w_updt = self.learning_rate * np.linalg.pinv(np.sqrt(self.Eg + self.eps)).T * grad_at_w

return w - self.w_updt
return w - self.learning_rate * grad_wrt_w / np.sqrt(self.Eg + self.eps)

class Adam():
def __init__(self, learning_rate=0.001, b1=0.9, b2=0.999):
self.learning_rate = learning_rate
self.eps = 1e-8
self.m = np.array([])
self.v = np.array([])
self.m = None
self.v = None
# Decay rates
self.b1 = b1
self.b2 = b2

def update(self, w, grad_wrt_w):

# If not initialized
if not self.m.any():
if self.m is None:
self.m = np.zeros(np.shape(grad_wrt_w))
self.v = np.zeros(np.shape(grad_wrt_w))

@@ -138,7 +125,7 @@ def update(self, w, grad_wrt_w):
m_hat = self.m / (1 - self.b1)
v_hat = self.v / (1 - self.b2)

self.w_updt = self.learning_rate / (np.sqrt(v_hat) + self.eps) * m_hat
self.w_updt = self.learning_rate * m_hat / (np.sqrt(v_hat) + self.eps)

return w - self.w_updt

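Every optimizer above exposes the same update(w, grad_wrt_w) method, so the classes can be exercised on their own. A minimal sketch that minimizes a one-dimensional quadratic (import path taken from the example scripts below; the loop itself is illustrative, not part of the commit):

import numpy as np
from mlfromscratch.deep_learning.optimizers import StochasticGradientDescent, Adam

# Minimize f(w) = (w - 3)^2, whose gradient is 2 * (w - 3)
for optimizer in [StochasticGradientDescent(learning_rate=0.1, momentum=0.9),
                  Adam(learning_rate=0.1)]:
    w = np.array([0.0])
    for _ in range(200):
        grad_wrt_w = 2 * (w - 3)
        w = optimizer.update(w, grad_wrt_w)
    print(type(optimizer).__name__, w)   # both should end up near w = 3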
4 changes: 2 additions & 2 deletions mlfromscratch/examples/convolutional_neural_network.py
@@ -10,7 +10,7 @@
from mlfromscratch.utils.data_manipulation import train_test_split, to_categorical, normalize
from mlfromscratch.utils.data_manipulation import get_random_subsets, shuffle_data
from mlfromscratch.utils.data_operation import accuracy_score
from mlfromscratch.deep_learning.optimizers import GradientDescent, Adam, RMSprop, Adagrad, Adadelta
from mlfromscratch.deep_learning.optimizers import StochasticGradientDescent, Adam, RMSprop, Adagrad, Adadelta
from mlfromscratch.deep_learning.loss_functions import CrossEntropy
from mlfromscratch.utils.misc import bar_widgets
from mlfromscratch.utils import Plot
@@ -25,7 +25,7 @@ def main():
# Conv Net
#----------

optimizer = Adam()
optimizer = Adadelta()

data = datasets.load_digits()
X = data.data
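Since the examples now pick the optimizer in one place, swapping methods is a one-line change. The keyword arguments below match the constructors defined in optimizers.py above, with illustrative values; the snippet itself is not part of the commit:

from mlfromscratch.deep_learning.optimizers import StochasticGradientDescent, Adam, RMSprop, Adagrad, Adadelta

optimizer = Adadelta(rho=0.95, eps=1e-6)
# or any of the other imported optimizers, e.g.:
# optimizer = Adam(learning_rate=0.001)
# optimizer = RMSprop(learning_rate=0.01, rho=0.9)
# optimizer = Adagrad(learning_rate=0.01)
# optimizer = StochasticGradientDescent(learning_rate=0.01, momentum=0.9)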
4 changes: 2 additions & 2 deletions mlfromscratch/examples/multilayer_perceptron.py
@@ -10,7 +10,7 @@
from mlfromscratch.utils.data_manipulation import train_test_split, to_categorical, normalize
from mlfromscratch.utils.data_manipulation import get_random_subsets, shuffle_data
from mlfromscratch.utils.data_operation import accuracy_score
from mlfromscratch.deep_learning.optimizers import GradientDescent, Adam, RMSprop, Adagrad, Adadelta
from mlfromscratch.deep_learning.optimizers import StochasticGradientDescent, Adam, RMSprop, Adagrad, Adadelta
from mlfromscratch.deep_learning.loss_functions import CrossEntropy
from mlfromscratch.utils.misc import bar_widgets
from mlfromscratch.utils import Plot
@@ -19,7 +19,7 @@

def main():

optimizer = Adam()
optimizer = Adadelta()

#-----
# MLP
13 changes: 2 additions & 11 deletions mlfromscratch/supervised_learning/logistic_regression.py
@@ -1,18 +1,9 @@
from __future__ import print_function
import sys
import os
import math
from sklearn import datasets
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Import helper functions
from mlfromscratch.utils.data_manipulation import make_diagonal, normalize, train_test_split
from mlfromscratch.utils.data_operation import accuracy_score
from mlfromscratch.utils.data_manipulation import make_diagonal
from mlfromscratch.utils.activation_functions import Sigmoid
from mlfromscratch.utils.optimizers import GradientDescent
from mlfromscratch.unsupervised_learning import PCA
from mlfromscratch.utils import Plot


@@ -42,7 +33,7 @@ def fit(self, X, y, n_iterations=4000):
# Initialize parameters between [-1/sqrt(N), 1/sqrt(N)]
limit = 1 / math.sqrt(n_features)
self.param = np.random.uniform(-limit, limit, (n_features,))

# Tune parameters for n iterations
for i in range(n_iterations):
# Make a new prediction
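The initialization kept in this hunk draws each parameter uniformly from [-1/sqrt(N), 1/sqrt(N)], where N is the number of features. A quick standalone check of what that produces (toy numbers, not from the repository):

import math
import numpy as np

n_features = 64
limit = 1 / math.sqrt(n_features)                    # 0.125 for 64 features
param = np.random.uniform(-limit, limit, (n_features,))
print(param.min() >= -limit, param.max() <= limit)   # True True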
18 changes: 7 additions & 11 deletions mlfromscratch/supervised_learning/naive_bayes.py
@@ -37,7 +37,7 @@ def fit(self, X, y):
parameters = {"mean": col.mean(), "var": col.var()}
self.parameters[i].append(parameters)

def _calculate_probability(self, mean, var, x):
def _calculate_likelihood(self, mean, var, x):
""" Gaussian likelihood of the data x given mean and var """
coeff = (1.0 / (math.sqrt((2.0 * math.pi) * var)))
exponent = math.exp(-(math.pow(x - mean, 2) / (2 * var)))
@@ -54,7 +54,7 @@ def _calculate_prior(self, c):

def _classify(self, sample):
""" Classify using Bayes Rule, P(Y|X) = P(X|Y)*P(Y)/P(X)
P(X|Y) - Probability. Gaussian distribution (given by _calculate_probability)
P(X|Y) - Likelihood. Gaussian distribution (given by _calculate_likelihood)
P(Y) - Prior (given by _calculate_prior)
P(X) - Scales the posterior to make it a proper probability distribution.
This term is ignored in this implementation since it doesn't affect
@@ -65,24 +65,20 @@ def _classify(self, sample):
# Go through list of classes
for i in range(len(self.classes)):
c = self.classes[i]
prior = self._calculate_prior(c)
posterior = prior
# multiply with the additional probabilities
posterior = self._calculate_prior(c)
# Naive assumption (independence):
# P(x1,x2,x3|Y) = P(x1|Y)*P(x2|Y)*P(x3|Y)
# Multiply with the class likelihoods
for j, params in enumerate(self.parameters[i]):
sample_feature = sample[j]
# Determine P(x|Y)
likelihood = self._calculate_probability(params["mean"], params["var"], sample_feature)
# Multiply with the rest
likelihood = self._calculate_likelihood(params["mean"], params["var"], sample_feature)
# Multiply with the accumulated probability
posterior *= likelihood
# Total posterior = P(Y)*P(x1|Y)*P(x2|Y)*...*P(xN|Y)
posteriors.append(posterior)
# Get the largest probability and return the class corresponding
# to that probability
# Return the class with the largest posterior probability
index_of_max = np.argmax(posteriors)
max_value = posteriors[index_of_max]

return self.classes[index_of_max]

def predict(self, X):
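The renamed _calculate_likelihood evaluates a Gaussian density from the per-class mean and variance of each feature, and _classify multiplies those likelihoods into the prior before taking the argmax. The same arithmetic for a single feature and two classes, as a self-contained sketch (toy numbers, not from the repository):

import math

def gaussian_likelihood(mean, var, x):
    # Same formula as _calculate_likelihood above
    coeff = 1.0 / math.sqrt(2.0 * math.pi * var)
    exponent = math.exp(-((x - mean) ** 2) / (2 * var))
    return coeff * exponent

x = 1.8
# Class 0: mean 1.0, var 0.25, prior 0.5;  Class 1: mean 2.0, var 0.25, prior 0.5
posteriors = [0.5 * gaussian_likelihood(1.0, 0.25, x),
              0.5 * gaussian_likelihood(2.0, 0.25, x)]
print(posteriors.index(max(posteriors)))   # 1, the class whose mean is closer to x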
4 changes: 0 additions & 4 deletions mlfromscratch/supervised_learning/perceptron.py
@@ -10,7 +10,6 @@
from mlfromscratch.utils.data_manipulation import train_test_split, to_categorical, normalize
from mlfromscratch.utils.data_operation import accuracy_score
from mlfromscratch.deep_learning.activation_functions import Sigmoid, ReLU, SoftPlus, LeakyReLU, TanH, ELU
from mlfromscratch.deep_learning.optimizers import GradientDescent
from mlfromscratch.deep_learning.loss_functions import CrossEntropy, SquareLoss
from mlfromscratch.utils import Plot

@@ -52,14 +51,11 @@ def fit(self, X, y):
# Calculate outputs
linear_output = X.dot(self.W) + self.w0
y_pred = self.activation.function(linear_output)

# Calculate the loss gradient w.r.t the input of the activation function
error_gradient = self.loss.gradient(y, y_pred) * self.activation.gradient(linear_output)

# Calculate the gradient of the loss with respect to each weight
grad_wrt_w = X.T.dot(error_gradient)
grad_wrt_w0 = np.sum(error_gradient, axis=0, keepdims=True)

# Update weights
self.W -= self.learning_rate * grad_wrt_w
self.w0 -= self.learning_rate * grad_wrt_w0
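The fit loop above computes the error gradient at the input of the activation and backs it out to the weights through X.T. One such step, written out with an explicit sigmoid and square loss so it runs on its own (the gradient conventions are assumptions for illustration, not the repository's Sigmoid and SquareLoss classes):

import numpy as np

X = np.array([[0.0, 1.0], [1.0, 0.0], [1.0, 1.0]])   # 3 samples, 2 features
y = np.array([[1.0], [0.0], [1.0]])                  # one output unit
W = np.zeros((2, 1))
w0 = np.zeros((1, 1))
learning_rate = 0.1

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

linear_output = X.dot(W) + w0
y_pred = sigmoid(linear_output)

# Square-loss gradient w.r.t. the prediction, times the sigmoid derivative
error_gradient = -(y - y_pred) * y_pred * (1 - y_pred)

# Gradient of the loss w.r.t. each weight and the bias
grad_wrt_w = X.T.dot(error_gradient)
grad_wrt_w0 = np.sum(error_gradient, axis=0, keepdims=True)

W -= learning_rate * grad_wrt_w
w0 -= learning_rate * grad_wrt_w0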
6 changes: 0 additions & 6 deletions mlfromscratch/supervised_learning/regression.py
@@ -1,11 +1,5 @@
from __future__ import print_function
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets
import sys
import os
import math
# Import helper functions
from mlfromscratch.utils.data_manipulation import normalize
from mlfromscratch.utils.data_manipulation import polynomial_features
