Musings over Backpropagation
And a Neural Network from Scratch in Python.
%matplotlib inline
import numpy as np
import random
import matplotlib.pyplot as plt
import json
import sys
# helper functions (here, activation functions)
def sigmoid(z):
"""the sigmoid activation function"""
    return 1/(1+np.exp(-z))
def sigmoid_prime(z):
"""the derivative of the sigmoid function"""
return sigmoid(z)*(1-sigmoid(z))
def relu(z):
"""the ReLU activation function"""
return np.maximum(0,z)
def one_hot_encoding(j):
"""One hot encode to a 10-dimensional unit vector with prediction"""
encoded_vec = np.zeroes((10, 1))
encoded_vec[j] = 1.0
return ohe
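# A quick, illustrative sanity check of the helpers above (values are easy to verify by hand):
print(sigmoid(np.array([0.0])))        # [0.5]
print(sigmoid_prime(np.array([0.0])))  # [0.25], since 0.5 * (1 - 0.5) = 0.25
print(relu(np.array([-2.0, 3.0])))     # [0. 3.]
print(one_hot_encoding(3).T)           # [[0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]]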
# loss functions : here I'll try using both CrossEntropyLoss (LogLoss) and
# QuadraticLoss (MSE) functions to compare their performance.
# https://ml-cheatsheet.readthedocs.io/en/latest/loss_functions.html : Formulas
# employed decent OOP practices
class CrossEntropyCost(object):
@staticmethod
def func(a, y):
"""
return : cost associated with input a and desired output y
        sometimes, when a = y = 1 (or a = y = 0), the term
        (1-y)*np.log(1-a) (or y*np.log(a)) evaluates to 0 * -inf, which is NaN
        hence, np.nan_to_num is used to convert those NaN's to 0.0
        """
        return np.sum(np.nan_to_num(-y * np.log(a) - (1-y) * np.log(1-a)))
@staticmethod
def delta(a, y, z):
return (a - y)
class MSE_cost(object):
@staticmethod
def func(a, y):
"""return : cost associated with input a and desired output y"""
        return 0.5 * np.linalg.norm(a - y) ** 2
@staticmethod
def delta(a, y, z):
"""params a, y follow suite
z is the value of the neuron. from our derivation
"""
return (a - y) * sigmoid_prime(z)
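# A small illustrative comparison (not part of the network itself): for a confident but wrong
# prediction, the cross-entropy cost is much larger and its delta does not shrink with sigmoid'(z),
# which is why it tends to learn faster when neurons saturate.
a_ex = np.array([[0.98]])   # network output : very confident
y_ex = np.array([[0.0]])    # desired output
z_ex = np.array([[4.0]])    # a weighted input for which sigmoid(z) is roughly 0.98
print(CrossEntropyCost.func(a_ex, y_ex), CrossEntropyCost.delta(a_ex, y_ex, z_ex))  # ~3.9 and 0.98
print(MSE_cost.func(a_ex, y_ex), MSE_cost.delta(a_ex, y_ex, z_ex))                  # ~0.48 and ~0.017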
class NN(object):
def __init__(self, size, cost = CrossEntropyCost):
"""
list::size : number of neurons in respective layers of the network
weights and biases are generated randomly through Gaussian Distribution with zero mean and variance of 1.
"""
self.n_layers = len(size)
self.size = size
        # biases and weights exist only for the 2nd layer onwards, since the 1st (input) layer has neither
self.biases = [np.random.randn(y, 1) for y in self.size[1:]]
self.weights = [np.random.randn(y, x)/np.sqrt(x)
for x, y in zip(self.size[:-1], self.size[1:])]
self.cost = cost
def forward_propagation(self, a):
"""The neuron calculation formula : Wa+b"""
for w, b in zip(self.weights, self.biases):
a = sigmoid(np.dot(w, a) + b)
return a
def back_propagation(self, x, y):
"""
return : (del_w, del_b) , the gradient for the cost function
del_w and del_b are layer-by-layer lists of numpy arrays.
        Warning : negative indices will be heavily utilized
"""
del_w = [np.zeros(w.shape) for w in self.weights]
        del_b = [np.zeros(b.shape) for b in self.biases]
# forward prop
curr_activation = x
activations = [x] # store all activations by layer, remember chain-rule
z_lis = [] # store all z values for layers, remember tree structure from notes
for w, b in zip(self.weights, self.biases):
z = np.dot(w, curr_activation) + b
curr_activation = sigmoid(z)
z_lis.append(z)
activations.append(curr_activation)
        # backward pass : the output-layer error, using the last elements of the activation and z lists
delta = (self.cost).delta(activations[-1], y, z_lis[-1])
del_w[-1] = np.dot(delta, activations[-2].transpose())
del_b[-1] = delta
# going back all layers
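        # Sketch of the recurrence used below (the standard backprop equations, in this notation):
        #   delta_l   = (W_{l+1})^T . delta_{l+1} * sigmoid'(z_l)   (elementwise product)
        #   dC/dW_l   = delta_l . (a_{l-1})^T
        #   dC/db_l   = delta_l
        # so each pass through the loop reuses the delta already computed for the layer after it.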
for l in range(2, self.n_layers):
z = z_lis[-l]
del_sigmoid = sigmoid_prime(z)
delta = np.dot(self.weights[-l+1].transpose(), delta) * del_sigmoid
del_w[-l] = np.dot(delta, activations[-l-1].transpose())
del_b[-l] = delta
return (del_w, del_b)
def initialize_weight(self):
"""
        initialize weights using a Gaussian distribution with mean 0 and SD 1
        divided by the square root of the number of weights connecting into the same neuron
initialize biases using Gaussian Distribution with mean 0 and SD 1
"""
self.weights = [np.random.randn(y, x)/np.sqrt(x) for x, y in
                        zip(self.size[:-1], self.size[1:])]
self.biases = [np.random.randn(y, 1) for y in self.size[1:]]
def trainer(self, training_data, epochs, m_bs, eta, lmbda, eval_data=None,
print_eval_cost=False, print_eval_acc=False, print_train_cost=False,
print_train_acc=False):
"""
Train the neural network using mini-batch stochastic gradient
descent.
"""
        if eval_data:
            eval_data = list(eval_data)
            n_data = len(eval_data)
        training_data = list(training_data)
        n = len(training_data)
eval_cost, eval_acc = [], []
train_cost, train_acc = [], []
for c in range(epochs):
random.shuffle(training_data)
mini_batches = [training_data[k:k+m_bs] for k in
range(0, n, m_bs)]
for mini_batch in mini_batches:
self.update_mini_batch(mini_batch, eta, lmbda, len(training_data))
            print('Training : Epoch {} complete.'.format(c))
            if print_train_acc:
acc = self.accuracy(training_data, convert=True)
train_acc.append(acc)
print('Accuracy on training data : {} / {}'.format(acc, n))
if print_train_cost:
cost = self.total_cost(training_data, lmbda)
train_cost.append(cost)
print('Cost on training data : {}'.format(cost))
if print_eval_acc:
acc = self.accuracy(eval_data)
eval_acc.append(acc)
                print('Accuracy on evaluation data : {} / {}'.format(acc, n_data))
if print_eval_cost:
cost = self.total_cost(eval_data, lmbda, convert=True)
                eval_cost.append(cost)
                print('Cost on evaluation data : {}'.format(cost))
print()
return (eval_cost, eval_acc, train_cost, train_acc)
def update_mini_batch(self, mini_batch, eta, lmbda, n):
"""Update the network's weights and biases by applying gradient
        descent using backpropagation to a single mini batch. ``mini_batch`` is a list of tuples ``(x, y)``, ``eta`` is the
learning rate, lmbda is the regularization parameter, and
n is the total size of the training data set.
"""
nabla_b = [np.zeros(b.shape) for b in self.biases]
nabla_w = [np.zeros(w.shape) for w in self.weights]
for x, y in mini_batch:
            delta_nabla_w, delta_nabla_b = self.back_propagation(x, y)
nabla_b = [nb+dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]
nabla_w = [nw+dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]
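        # L2-regularized gradient-descent step, averaged over the mini-batch:
        #   w -> (1 - eta*lmbda/n) * w - (eta/len(mini_batch)) * nabla_w
        #   b -> b - (eta/len(mini_batch)) * nabla_b
        # where n is the size of the full training set, so the weight-decay factor stays consistent.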
self.weights = [(1-eta*(lmbda/n))*w-(eta/len(mini_batch))*nw
for w, nw in zip(self.weights, nabla_w)]
self.biases = [b-(eta/len(mini_batch))*nb
for b, nb in zip(self.biases, nabla_b)]
def accuracy(self, data, convert=False):
"""Return the number of inputs in ``data`` for which the neural
network outputs the correct result. The neural network's
output is assumed to be the index of whichever neuron in the
final layer has the highest activation.
The flag ``convert`` should be set to False if the data set is
validation or test data (the usual case), and to True if the
data set is the training data.
"""
if convert:
            results = [(np.argmax(self.forward_propagation(x)), np.argmax(y))
for (x, y) in data]
else:
            results = [(np.argmax(self.forward_propagation(x)), y)
for (x, y) in data]
return sum(int(x == y) for (x, y) in results)
def total_cost(self, data, lmbda, convert=False):
"""Return the total cost for the data set ``data``.
"""
cost = 0.0
for x, y in data:
            a = self.forward_propagation(x)
            if convert: y = vectorized_result(y)
            cost += self.cost.func(a, y)/len(data)
cost += 0.5*(lmbda/len(data))*sum(
np.linalg.norm(w)**2 for w in self.weights)
return cost
def save(self, filename):
"""Save the neural network to the file ``filename``."""
data = {"size": self.size,
"weights": [w.tolist() for w in self.weights],
"biases": [b.tolist() for b in self.biases],
"cost": str(self.cost.__name__)}
        with open(filename, "w") as f:
            json.dump(data, f)
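# A minimal smoke test of the class above (illustrative only : untrained weights, random input):
net_check = NN([784, 30, 10])
dummy_x = np.random.randn(784, 1)
print(net_check.forward_propagation(dummy_x).shape)        # expected : (10, 1)
dw, db = net_check.back_propagation(dummy_x, one_hot_encoding(7))
print([w.shape for w in dw], [b.shape for b in db])        # gradients mirror the layer shapes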
# The Python code below for loading the MNIST data is by Michael Nielsen.
import _pickle as cPickle
import gzip
# Third-party libraries
import numpy as np
def load_data():
"""Return the MNIST data as a tuple containing the training data,
the validation data, and the test data.
The ``training_data`` is returned as a tuple with two entries.
The first entry contains the actual training images. This is a
numpy ndarray with 50,000 entries. Each entry is, in turn, a
numpy ndarray with 784 values, representing the 28 * 28 = 784
pixels in a single MNIST image.
The second entry in the ``training_data`` tuple is a numpy ndarray
containing 50,000 entries. Those entries are just the digit
values (0...9) for the corresponding images contained in the first
entry of the tuple.
The ``validation_data`` and ``test_data`` are similar, except
each contains only 10,000 images.
This is a nice data format, but for use in neural networks it's
helpful to modify the format of the ``training_data`` a little.
That's done in the wrapper function ``load_data_wrapper()``, see
below.
"""
f = gzip.open('mnist.pkl.gz', 'rb')
training_data, validation_data, test_data = cPickle.load(f, encoding='latin1')
f.close()
return (training_data, validation_data, test_data)
def load_data_wrapper():
"""Return a tuple containing ``(training_data, validation_data,
test_data)``. Based on ``load_data``, but the format is more
convenient for use in our implementation of neural networks.
In particular, ``training_data`` is a list containing 50,000
2-tuples ``(x, y)``. ``x`` is a 784-dimensional numpy.ndarray
containing the input image. ``y`` is a 10-dimensional
numpy.ndarray representing the unit vector corresponding to the
correct digit for ``x``.
``validation_data`` and ``test_data`` are lists containing 10,000
2-tuples ``(x, y)``. In each case, ``x`` is a 784-dimensional
    numpy.ndarray containing the input image, and ``y`` is the
corresponding classification, i.e., the digit values (integers)
corresponding to ``x``.
Obviously, this means we're using slightly different formats for
the training data and the validation / test data. These formats
turn out to be the most convenient for use in our neural network
code."""
tr_d, va_d, te_d = load_data()
training_inputs = [np.reshape(x, (784, 1)) for x in tr_d[0]]
training_results = [vectorized_result(y) for y in tr_d[1]]
    training_data = list(zip(training_inputs, training_results))
validation_inputs = [np.reshape(x, (784, 1)) for x in va_d[0]]
    validation_data = list(zip(validation_inputs, va_d[1]))
test_inputs = [np.reshape(x, (784, 1)) for x in te_d[0]]
    test_data = list(zip(test_inputs, te_d[1]))
return (training_data, validation_data, test_data)
def vectorized_result(j):
"""Return a 10-dimensional unit vector with a 1.0 in the jth
position and zeroes elsewhere. This is used to convert a digit
(0...9) into a corresponding desired output from the neural
network."""
e = np.zeros((10, 1))
e[j] = 1.0
return e
training_data, validation_data, test_data = load_data_wrapper()
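# Quick sanity check on the loaded data (shapes as described in the docstrings above):
sample_x, sample_y = training_data[0]
print(sample_x.shape, sample_y.shape)                              # (784, 1) (10, 1)
print(len(training_data), len(validation_data), len(test_data))    # 50000 10000 10000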
learner = NN([784, 30, 10])
learner.trainer(training_data, 30, 10, 3.0, 0.1, eval_data=test_data, print_eval_acc=True)
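# After training, the network can be saved and re-scored using the methods defined above
# (the filename here is arbitrary):
learner.save('trained_network.json')
print('Final test accuracy : {} / {}'.format(learner.accuracy(test_data), len(test_data)))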
Other Resources to Dive Deeper into Backpropagation Calculus
Here's a good paper about matrix calculus for ML/DL.
Here's a presentation from Cornell that covers the derivation of the backpropagation formulas.
Here is a more intuitive explanation of backpropagation, with examples and some math, from Stanford CS231n.
Here's a deeper look at the underlying math also from Stanford CS231n.
And if you ever did any graduate work in math or physics or related fields, you might find this treatment worth a look, but "there be dragons!" ;^) ...
With this, I end this post. It was an intense exercise to work through the math behind backpropagation for a single layer and then extend that understanding to deep neural networks. 3B1B's video on backpropagation was a great help in visualising what is going on before writing this code.