Notes and Implementation of Backpropagation Algorithm

A third attempt (finally a successful one) to understand the mechanics behind a neural network FT. Calculus

Explaination and Math

1,3 2,4

Important: If my notes aren’t visible, please go the the individual page link at imgur:





Notation used

Weights Biases (star)

read notes to better understand

Formulas to be further used in code :


%matplotlib inline
import numpy as np
import random
import matplotlib.pyplot as plt
import json
import sys
# helper functions (here, activation functions)
def sigmoid(z):
    """the sigmoid activation function"""
    return 1/(1+np.exp(-1*x))

def sigmoid_prime(x):
    """the derivative of the sigmoid function"""
    return sigmoid(z)*(1-sigmoid(z))

def relu(z):
    """the ReLU activation function"""
    return np.maximum(0,z)

def one_hot_encoding(j):
    """One hot encode to a 10-dimensional unit vector with prediction"""
    encoded_vec = np.zeroes((10, 1))
    encoded_vec[j] = 1.0
    return ohe
# loss functions : here I'll try using both CrossEntropyLoss (losLoss) and 
# QuadraticLoss functions to compare their performance.
# : Formulas
# employed decent OOP practices

class CrossEntropyCost(object):
    def func(a, y):
        return : cost associated with input a and desired output y
        sometime, when a = y, the formula for CrossEntropy returns NaN
        Formula : (1-y)*np.log(1-a) 
        hence, np.nan_to_num is used to convert NaN's to (0.0)
        return np.sum(np.nan_to_numn(-y * np.log(a) - (1-y) * np.log(1-a)))
    def delta(a, y, z):
        return (a - y)
class MSE_cost(object):
    def func(a, y):
        """return : cost associated with input a and desired output y"""
        return np.linalg.norm(a - y) * 0.5 ** 2

    def delta(a, y, z):
        """params a, y follow suite
            z is the value of the neuron. from our derivation
        return (a - y) * sigmoid_prime(z)
class NN(object):
    def __init__(self, size, cost = CrossEntropyCost):
        list::size : number of neurons in respective layers of the network
        weights and biases are generated randomly through Gaussian Distribution with zero mean and variance of 1.
        self.n_layers = len(size)
        self.size = size
        # initializing weights only for 2nd to last layer since 1st layer is input layer (lacks weights)
        self.biases = [np.random.randn(y, 1) for y in self.size[1:]]
        self.weights = [np.random.randn(y, x)/np.sqrt(x)
                        for x, y in zip(self.size[:-1], self.size[1:])]
        self.cost = cost
    def forward_propagation(self, a):
        """The neuron calculation formula : Wa+b"""
        for w, b in zip(self.weights, self.biases):
            a = sigmoid(, a) + b)
        return a

    def back_propagation(self, x, y):
        return : (del_w, del_b) , the gradient for the cost function
        del_w and del_b are layer-by-layer lists of numpy arrays.
        Warning : negative indices would be heavily utilized
        del_w = [np.zeros(w.shape) for w in self.weights]
        del_b = [np.zeroes(b.shape) for b in self.biases]
        # forward prop
        curr_activation = x
        activations = [x] # store all activations by layer, remember chain-rule 
        z_lis = [] # store all z values for layers, remember tree structure from notes
        for w, b in zip(self.weights, self.biases):
            z =, curr_activation) + b
            curr_activation = sigmoid(z)
        # backward pass : calculating cost by taking last elems of a, z lists
        delta = (self.cost).delta(activations[-1], y, z_lis[-1])
        del_w[-1] =, activations[-2].transpose())
        del_b[-1] = delta
        # going back all layers
        for l in range(2, self.n_layers):
            z = z_lis[-l]
            del_sigmoid = sigmoid_prime(z)
            delta =[-l+1].transpose(), delta) * del_sigmoid
            del_w[-l] =, activations[-l-1].transpose())
            del_b[-l] = delta
        return (del_w, del_b)
    def initialize_weight(self):
        initialize weights using Gaussian Distribution with mean 0 and SD  1
        over sqrt of number of weights connecting same neuron
        initialize biases using Gaussian Distribution with mean 0 and SD 1
        self.weights = [np.random.randn(y, x)/np.sqrt(x) for x, y in
                        zip(self.size[:-1], self.size[:-1])]
        self.biases = [np.random.randn(y, 1) for y in self.size[1:]]
    def trainer(self, training_data, epochs, m_bs, eta, lmbda, eval_data=None, 
               print_eval_cost=False, print_eval_acc=False, print_train_cost=False, 
        Train the neural network using mini-batch stochastic gradient
        if eval_data:
            n_data = sum(1 for _ in eval_data)
        n = sum(1 for _ in training_data)
        eval_cost, eval_acc = [], []
        train_cost, train_acc = [], []
        for c in range(epochs):
            mini_batches = [training_data[k:k+m_bs] for k in
                           range(0, n, m_bs)]
            for mini_batch in mini_batches:
                self.update_mini_batch(mini_batch, eta, lmbda, len(training_data))
            print('Training : Epoch % complete.' % c)
            if print_train_cost:
                acc = self.accuracy(training_data, convert=True)
                print('Accuracy on training data : {} / {}'.format(acc, n))
            if print_train_cost:
                cost = self.total_cost(training_data, lmbda)
                print('Cost on training data : {}'.format(cost))
            if print_eval_acc:
                acc = self.accuracy(eval_data)
                print('Accuracy on training data : {} / {}'.format(acc, n))
            if print_eval_cost:
                cost = self.total_cost(eval_data, lmbda, convert=True)
                print('Cost on training data : {}'.format(cost))
        return (eval_cost, eval_acc, train_cost, train_acc)
    def update_mini_batch(self, mini_batch, eta, lmbda, n):
        """Update the network's weights and biases by applying gradient
        descent using backpropagation to a single mini batch.mini_batch is a list of tuples ``(x, y)``, ``eta`` is the
        learning rate, lmbda is the regularization parameter, and
        n is the total size of the training data set.
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]
        for x, y in mini_batch:
            delta_nabla_b, delta_nabla_w = self.backprop(x, y)
            nabla_b = [nb+dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]
            nabla_w = [nw+dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]
        self.weights = [(1-eta*(lmbda/n))*w-(eta/len(mini_batch))*nw
                        for w, nw in zip(self.weights, nabla_w)]
        self.biases = [b-(eta/len(mini_batch))*nb
                       for b, nb in zip(self.biases, nabla_b)]
    def accuracy(self, data, convert=False):
        """Return the number of inputs in ``data`` for which the neural
        network outputs the correct result. The neural network's
        output is assumed to be the index of whichever neuron in the
        final layer has the highest activation.
        The flag ``convert`` should be set to False if the data set is
        validation or test data (the usual case), and to True if the
        data set is the training data. 
        if convert:
            results = [(np.argmax(self.feedforward(x)), np.argmax(y))
                       for (x, y) in data]
            results = [(np.argmax(self.feedforward(x)), y)
                        for (x, y) in data]
        return sum(int(x == y) for (x, y) in results)

    def total_cost(self, data, lmbda, convert=False):
        """Return the total cost for the data set ``data``. 
        cost = 0.0
        for x, y in data:
            a = self.feedforward(x)
            if convert: y = vectorized_result(y)
            cost += self.cost.fn(a, y)/len(data)
        cost += 0.5*(lmbda/len(data))*sum(
            np.linalg.norm(w)**2 for w in self.weights)
        return cost

    def save(self, filename):
        """Save the neural network to the file ``filename``."""
        data = {"size": self.size,
                "weights": [w.tolist() for w in self.weights],
                "biases": [b.tolist() for b in self.biases],
                "cost": str(self.cost.__name__)}
        f = open(filename, "w")
        json.dump(data, f)

Loading Data

# this python code to load MNIST Data is by Michael Nielsen.
import _pickle as cPickle
import gzip

# Third-party libraries
import numpy as np

def load_data():
    """Return the MNIST data as a tuple containing the training data,
    the validation data, and the test data.
    The ``training_data`` is returned as a tuple with two entries.
    The first entry contains the actual training images.  This is a
    numpy ndarray with 50,000 entries.  Each entry is, in turn, a
    numpy ndarray with 784 values, representing the 28 * 28 = 784
    pixels in a single MNIST image.
    The second entry in the ``training_data`` tuple is a numpy ndarray
    containing 50,000 entries.  Those entries are just the digit
    values (0...9) for the corresponding images contained in the first
    entry of the tuple.
    The ``validation_data`` and ``test_data`` are similar, except
    each contains only 10,000 images.
    This is a nice data format, but for use in neural networks it's
    helpful to modify the format of the ``training_data`` a little.
    That's done in the wrapper function ``load_data_wrapper()``, see
    f ='mnist.pkl.gz', 'rb')
    training_data, validation_data, test_data = cPickle.load(f, encoding='latin1')
    return (training_data, validation_data, test_data)

def load_data_wrapper():
    """Return a tuple containing ``(training_data, validation_data,
    test_data)``. Based on ``load_data``, but the format is more
    convenient for use in our implementation of neural networks.
    In particular, ``training_data`` is a list containing 50,000
    2-tuples ``(x, y)``.  ``x`` is a 784-dimensional numpy.ndarray
    containing the input image.  ``y`` is a 10-dimensional
    numpy.ndarray representing the unit vector corresponding to the
    correct digit for ``x``.
    ``validation_data`` and ``test_data`` are lists containing 10,000
    2-tuples ``(x, y)``.  In each case, ``x`` is a 784-dimensional
    numpy.ndarry containing the input image, and ``y`` is the
    corresponding classification, i.e., the digit values (integers)
    corresponding to ``x``.
    Obviously, this means we're using slightly different formats for
    the training data and the validation / test data.  These formats
    turn out to be the most convenient for use in our neural network
    tr_d, va_d, te_d = load_data()
    training_inputs = [np.reshape(x, (784, 1)) for x in tr_d[0]]
    training_results = [vectorized_result(y) for y in tr_d[1]]
    training_data = zip(training_inputs, training_results)
    validation_inputs = [np.reshape(x, (784, 1)) for x in va_d[0]]
    validation_data = zip(validation_inputs, va_d[1])
    test_inputs = [np.reshape(x, (784, 1)) for x in te_d[0]]
    test_data = zip(test_inputs, te_d[1])
    return (training_data, validation_data, test_data)

def vectorized_result(j):
    """Return a 10-dimensional unit vector with a 1.0 in the jth
    position and zeroes elsewhere.  This is used to convert a digit
    (0...9) into a corresponding desired output from the neural
    e = np.zeros((10, 1))
    e[j] = 1.0
    return e
training_data, validation_data, test_data = load_data_wrapper()
learner = NN([784, 30, 10])
learner.trainer(training_data, 30, 10, 3.0, 0.1, eval_data=test_data)
TypeError                                 Traceback (most recent call last)
<ipython-input-44-742e24694131> in <module>
----> 1 learner.trainer(training_data, 30, 10, 3.0, 0.1, eval_data=test_data)

<ipython-input-40-6f3c6ca52295> in trainer(self, training_data, epochs, m_bs, eta, lmbda, eval_data, print_eval_cost, print_eval_acc, print_train_cost, print_train_acc)
     79         train_cost, train_acc = [], []
     80         for c in range(epochs):
---> 81             random.shuffle(training_data)
     82             mini_batches = [training_data[k:k+m_bs] for k in
     83                            range(0, n, m_bs)]

~/anaconda3/envs/fastai/lib/python3.7/ in shuffle(self, x, random)
    273         if random is None:
    274             randbelow = self._randbelow
--> 275             for i in reversed(range(1, len(x))):
    276                 # pick an element in x[:i+1] with which to exchange x[i]
    277                 j = randbelow(i+1)

TypeError: object of type 'zip' has no len()

Other Resources to dive more into Backpropagation Calculus

With this, I end this post. It was really intense, trying to comprehend the math behind Backpropagation in a single layer and then expanding the knowledge over to deep learning neural networks. 3B1B's video on Backpropagation was really intuitive to help me visualise the working in order to write this code.