The architecture of this code is freely inspired by the Torch and PyTorch architectures, and some samples of code are re-used. It is, however, designed to be as concise as possible, not for efficiency or flexibility (i.e. if you ever want to do deep learning outside of this class, don't try to adapt this code; use an existing framework).
As always, some useful imports:
# /!\ I'm using python 3
import numpy as np
# this is the key library for manipulating arrays. Use the online resources! http://www.numpy.org/
import matplotlib.pyplot as plt
# used to read images, display and plot http://matplotlib.org/api/pyplot_api.html .
#You can also check this simple intro to using ipython notebook with images https://matplotlib.org/users/image_tutorial.html
%matplotlib inline
# to display directly in the notebook
import scipy.ndimage as ndimage
# one of several python libraries for image processing
import scipy.io as sio
# useful for loading .mat files
import scipy.optimize as optimize
# useful for optimization
plt.rcParams['image.cmap'] = 'gray'
# by default, the grayscale images are displayed with the jet colormap: use grayscale instead
import gzip
import _pickle as cPickle
import os
import os.path
from six.moves import urllib
import plotly
import plotly.plotly as py
import plotly.graph_objs as go
plotly.offline.init_notebook_mode(connected=True)
from IPython.core.display import display, HTML, Markdown
# The polling here is to ensure that plotly.js has already been loaded before
# setting display alignment in order to avoid a race condition.
display(HTML(
'<script>'
'var waitForPlotly = setInterval( function() {'
'if( typeof(window.Plotly) !== "undefined" ){'
'MathJax.Hub.Config({ SVG: { font: "STIX-Web" }, displayAlign: "center" });'
'MathJax.Hub.Queue(["setRenderer", MathJax.Hub, "SVG"]);'
'clearInterval(waitForPlotly);'
'}}, 250 );'
'</script>'
))
We will consider a neural network as a modular architecture, where every layer and every loss function is a Module. We define the Linear and ReLU layers, which are needed to implement an MLP, as well as a simple 2-layer MLP and the least-squares loss function, LeastSquareCriterion.
import math
import numpy as np
class Module(object):
def __init__(self):
self.gradInput=None
self.output=None
def forward(self, *input):
"""Defines the computation performed at every call.
        Should be overridden by all subclasses.
"""
raise NotImplementedError
def backward(self, *input):
"""Defines the computation performed at every call.
Should be overriden by all subclasses.
"""
raise NotImplementedError
class LeastSquareCriterion(Module):
"""
    This implementation of the least-squares loss assumes that the data comes as a 2-dimensional array
    of size (batch_size, num_classes) and the labels as a vector of size (batch_size)
"""
def __init__(self, num_classes=10):
super(LeastSquareCriterion, self).__init__()
self.num_classes=num_classes
def forward(self, x,labels):
target=np.zeros([x.shape[0],self.num_classes])
for i in range(x.shape[0]):
target[i,labels[i]]=1
self.output = np.sum((target-x)**2,axis=0)
return np.sum(self.output)
def backward(self, x, labels):
        self.gradInput=np.copy(x)  # copy so that the network output is not modified in place
for i in range(x.shape[0]):
self.gradInput[i,labels[i]]=x[i,labels[i]]-1
return self.gradInput
class Linear(Module):
"""
    The input is supposed to have two dimensions (batch_size, in_features)
"""
def __init__(self, in_features, out_features, bias=True):
super(Linear, self).__init__()
self.in_features = in_features
self.out_features = out_features
self.weight = math.sqrt(1. / (out_features* in_features))*np.random.randn(out_features, in_features)
self.bias = np.zeros(out_features)
self.gradWeight=None
self.gradBias=None
self.deltaWeight=0
self.deltaBias=0
def forward(self, x):
self.output= np.dot(x,self.weight.transpose())+np.repeat(self.bias.reshape([1,-1]),x.shape[0], axis=0)
return self.output
def backward(self, x, gradOutput):
self.gradInput=np.dot(gradOutput,self.weight)
self.gradWeight=np.dot(gradOutput.transpose(),x)
self.gradBias=np.sum(gradOutput, axis=0)
return self.gradInput
def gradientStep(self,lr, weight_decay=0, momentum=0):
self.deltaWeight=momentum*self.deltaWeight+ self.gradWeight + weight_decay*self.weight
self.deltaBias=momentum*self.deltaBias + self.gradBias
self.weight=self.weight-lr*self.deltaWeight
self.bias=self.bias-lr*self.deltaBias
class ReLU(Module):
def __init__(self, bias=True):
super(ReLU, self).__init__()
def forward(self, x):
self.output=x.clip(0)
return self.output
def backward(self, x, gradOutput):
self.gradInput=(x>0)*gradOutput
return self.gradInput
class MLP(Module):
def __init__(self, num_classes=10, hidden_layer_size = 64):
super(MLP, self).__init__()
self.fc1 = Linear(784, hidden_layer_size)
self.relu1 = ReLU()
self.fc2 = Linear(hidden_layer_size, 10)
def forward(self, x):
x = self.fc1.forward(x)
x = self.relu1.forward(x)
x = self.fc2.forward(x)
return x
def backward(self, x, gradient):
gradient = self.fc2.backward(self.relu1.output,gradient)
gradient = self.relu1.backward(self.fc1.output,gradient)
gradient = self.fc1.backward(x,gradient)
return gradient
def gradientStep(self,lr):
self.fc2.gradientStep(lr)
self.fc1.gradientStep(lr)
return True
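Before training, it is worth sanity-checking the backward pass. The sketch below compares the analytical gradient computed by Linear.backward with a finite-difference estimate; the toy layer sizes, the step eps and the ad-hoc loss (the sum of all outputs) are arbitrary choices made only for this check.
# Finite-difference sanity check of Linear.backward (minimal sketch).
# The layer sizes, the step eps and the toy loss sum(output) are arbitrary choices.
def check_linear_gradient(eps=1e-6):
    np.random.seed(0)
    lin = Linear(5, 3)
    x = np.random.randn(4, 5)
    grad_output = np.ones((4, 3))   # gradient of the toy loss sum(output) w.r.t. the output
    lin.forward(x)
    lin.backward(x, grad_output)    # fills lin.gradWeight analytically
    analytical = lin.gradWeight[0, 0]
    # central finite difference on weight[0, 0]
    lin.weight[0, 0] += eps
    loss_plus = np.sum(lin.forward(x))
    lin.weight[0, 0] -= 2 * eps
    loss_minus = np.sum(lin.forward(x))
    lin.weight[0, 0] += eps         # restore the weight
    numerical = (loss_plus - loss_minus) / (2 * eps)
    print('analytical:', analytical, 'numerical:', numerical)  # the two values should match closely
check_linear_gradient()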
To train a network, we will need data. Download the MNIST data (~15 MB). It consists of 28x28 images (each loaded as a 784-dimensional vector) and the associated labels, split into training, validation and test sets. For this practical session, you can focus on the training and validation sets.
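If mnist.pkl.gz is not already in the working directory, the already-imported urllib can fetch it. The URL below is the classic deeplearning.net mirror, which may no longer be online; treat this cell as a convenience sketch and adapt the URL if needed.
# Convenience sketch: fetch mnist.pkl.gz if it is missing.
# The URL is the historical deeplearning.net mirror and may be offline; replace it if needed.
mnist_url = 'http://deeplearning.net/data/mnist/mnist.pkl.gz'
if not os.path.isfile('mnist.pkl.gz'):
    urllib.request.urlretrieve(mnist_url, 'mnist.pkl.gz')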
# Load the dataset
f = gzip.open('mnist.pkl.gz', 'rb')
train_set, val_set, test_set = cPickle.load(f, encoding='latin1')
f.close()
train_data=train_set[0]
train_labels=train_set[1]
val_data=val_set[0]
val_labels=val_set[1]
N_train=train_data.shape[0]
N_val=val_data.shape[0]
# check data makes sense
plt.imshow(train_data[0,:].reshape(28,28))
print(train_labels[0])
learning_rate = 0.01
def iteration_step(mlp, lsc, X, Y, learning_rate=learning_rate, training=True):
X_forward = mlp.forward(X)
if training:
grad_loss = lsc.backward(X_forward, Y)
mlp.backward(X, grad_loss)
mlp.gradientStep(learning_rate)
if not training:
        # return the loss and an array indicating whether the predicted labels are correct
return lsc.forward(X_forward, Y), np.argmax(X_forward, axis=1) == Y
batch_size = 16
learning_rate = 0.01
mlp = MLP()
lsc = LeastSquareCriterion()
for _ in range(50):
ind_rand = np.random.randint(len(train_data), size=16)
X, Y = train_data[ind_rand, :], train_labels[ind_rand] # images, labels
iteration_step(mlp, lsc, X, Y, learning_rate=learning_rate)
test_ind = np.random.randint(len(train_data), size=16)
display(Markdown("### Images and their predicted labels"))
fig = plt.figure(figsize = (20., 20.))
for index, n in enumerate(test_ind):
x_test = train_data[n, :].reshape(28, 28)
y_test = train_labels[n]
plt.subplot(16, 4, index+1)\
        .set_title(str(np.argmax(mlp.forward(train_data[n, :].reshape(1, -1)))))
plt.imshow(x_test)
plt.axis('off')
plt.tight_layout(w_pad=0, h_pad=0)
plt.show()
batch_size = 16
learning_rate = 0.01
total_nb_batches = 1000
every_n_iterations = 10
def training_procedure(mlp, lsc, indices, batch_size=batch_size, total_nb_batches=total_nb_batches,\
learning_rate=learning_rate, every_n_iterations=every_n_iterations):
loss = {}
loss['train'] = []
loss['validation'] = []
accuracy = {}
accuracy['train'] = []
accuracy['validation'] = []
nb_iteration = 1
for inds in zip(indices['train'], indices['validation']):
for ind, train_valid in zip(inds, ['train', 'validation']):
X, Y = train_data[ind, :], train_labels[ind] # images, labels
if train_valid == 'train':
iteration_step(mlp, lsc, X, Y, learning_rate=learning_rate)
if nb_iteration%every_n_iterations == 0:
current_loss, correctness = iteration_step(mlp, lsc, X, Y, learning_rate=learning_rate, training=False)
loss[train_valid].append(current_loss)
accuracy[train_valid].append((accuracy[train_valid][-1] if accuracy[train_valid] else 0)\
+ np.sum(correctness))
nb_iteration += 1
return loss, accuracy
batch_size = 16
learning_rate = 0.01
total_nb_batches = 1000
every_n_iterations = 10
mlp = MLP()
lsc = LeastSquareCriterion()
indices = {}
all_indices = np.random.randint(len(train_data), size=(total_nb_batches, 16))
indices['validation'], indices['train'] = all_indices[:total_nb_batches//2, :], all_indices[total_nb_batches//2:,:]
loss, accuracy = training_procedure(mlp, lsc, indices, batch_size=batch_size, total_nb_batches=total_nb_batches,\
learning_rate=learning_rate, every_n_iterations=every_n_iterations)
# Plotting loss and accuracy
traces = {}
traces['loss'] = []
traces['accuracy'] = []
for train_valid in ['train', 'validation']:
traces['loss'].append(
go.Scatter(x = [i*batch_size*every_n_iterations for i in range(1, len(loss[train_valid])+1)],
y = np.array(loss[train_valid]),
mode = 'lines',
name = '{} loss'.format(train_valid.capitalize())
)
)
traces['accuracy'].append(
go.Scatter(x = [i*batch_size*every_n_iterations for i in range(1, len(accuracy[train_valid])+1)],
y = np.array([acc/((i+1)*batch_size) for i, acc in enumerate(accuracy[train_valid])]),
mode = 'lines',
name = '{} accuracy'.format(train_valid.capitalize())
)
)
def layout(loss_acc, end_title=''):
if loss_acc == 'loss':
return go.Layout(
title= 'Losses{}'.format(end_title),
hovermode= 'closest',
xaxis= dict(
title= 'Iterations',
ticklen= 5,
zeroline= False,
gridwidth= 2,
),
yaxis=dict(
title= 'Loss',
ticklen= 5,
gridwidth= 2,
),
showlegend= True
)
elif loss_acc == 'accuracy':
return go.Layout(
title= 'Accuracy{}'.format(end_title),
hovermode= 'closest',
xaxis= dict(
title= 'Iterations',
ticklen= 5,
zeroline= False,
gridwidth= 2,
),
yaxis=dict(
title= 'Accuracy',
ticklen= 5,
gridwidth= 2,
),
showlegend= True
)
for loss_acc in ['loss', 'accuracy']:
plotly.offline.iplot(go.Figure(data=traces[loss_acc], layout=layout(loss_acc)))
One introduces a number of epochs, that is, the number of times one iterates over a given training set. Why iterate multiple times over the same training set? Because the backpropagation procedure amounts to a gradient descent, which is an iterative approximation algorithm: it therefore makes sense to repeat the previous training procedure several times, so as to converge toward a local minimum of the loss function (over the given training set).
batch_size = 16
learning_rate = 0.01
total_nb_batches = 1000
every_n_iterations = 10
nb_epochs = 10
def optimized_training_procedure(mlp, lsc, indices, nb_epochs=nb_epochs, batch_size=batch_size, total_nb_batches=total_nb_batches,\
learning_rate=learning_rate, every_n_iterations=every_n_iterations):
Loss = {}
Loss['train'] = []
Loss['validation'] = []
Accuracy = {}
Accuracy['train'] = []
Accuracy['validation'] = []
for _ in range(nb_epochs):
loss, accuracy = training_procedure(mlp, lsc, indices, batch_size=batch_size, total_nb_batches=total_nb_batches,\
learning_rate=learning_rate, every_n_iterations=every_n_iterations)
for train_valid in ['train', 'validation']:
Loss[train_valid].append(loss[train_valid])
Accuracy[train_valid].append(accuracy[train_valid])
return Loss, Accuracy
# Colorscales
def colorscale_list(cmap, number_colors, return_rgb_only=False):
cm = plt.get_cmap(cmap)
colors = [np.array(cm(i/number_colors)) for i in range(1, number_colors+1)]
rgb_colors_plotly = []
rgb_colors_only = []
for i, c in enumerate(colors):
col = 'rgb{}'.format(tuple(255*c[:-1]))
rgb_colors_only.append(col)
rgb_colors_plotly.append([i/number_colors, col])
rgb_colors_plotly.append([(i+1)/number_colors, col])
return rgb_colors_only if return_rgb_only else rgb_colors_plotly
batch_size = 16
learning_rate = 0.01
total_nb_batches = 1000
every_n_iterations = 10
nb_epochs = 10
mlp = MLP()
lsc = LeastSquareCriterion()
indices = {}
all_indices = np.random.randint(len(train_data), size=(total_nb_batches, 16))
indices['validation'], indices['train'] = all_indices[:total_nb_batches//2, :], all_indices[total_nb_batches//2:,:]
Loss, Accuracy = optimized_training_procedure(mlp, lsc, indices, nb_epochs=nb_epochs, batch_size=batch_size,\
total_nb_batches=total_nb_batches, learning_rate=learning_rate, every_n_iterations=every_n_iterations)
colors = {}
colors['loss'], colors['accuracy'] = {}, {}
for loss_acc, Loss_Acc in zip(['loss', 'accuracy'], [Loss, Accuracy]):
colors[loss_acc]['train'] = colorscale_list('Reds', len(Loss_Acc['train'])+3, return_rgb_only=True)
colors[loss_acc]['validation'] = colorscale_list('Greens', len(Loss_Acc['train'])+3, return_rgb_only=True)
# Plotting losses and accuracies
traces = {}
traces['loss'] = []
traces['accuracy'] = []
for train_valid in ['train', 'validation']:
for i, loss in enumerate(Loss[train_valid]):
traces['loss'].append(
go.Scatter(
x = [i*batch_size*every_n_iterations for i in range(1, len(loss)+1)],
y = np.array(loss),
mode = 'lines',
name = '{} loss: epoch {}'.format(train_valid.capitalize(), i+1),
line = dict(
width = 3,
color = colors['loss'][train_valid][i+2],
shape = 'spline',
dash = 'solid'
),
hoverlabel = dict(
namelength = -1
)
)
)
for i, accuracy in enumerate(Accuracy[train_valid]):
traces['accuracy'].append(
go.Scatter(
x = [i*batch_size*every_n_iterations for i in range(1, len(accuracy)+1)],
y = np.array([acc/((i+1)*batch_size) for i, acc in enumerate(accuracy)]),
mode = 'lines',
name = '{} accuracy: epoch {}'.format(train_valid.capitalize(), i+1),
line = dict(
width = 3,
color = colors['accuracy'][train_valid][i+2],
shape = 'spline',
dash = 'solid'
),
hoverlabel = dict(
namelength = -1
)
)
)
for loss_acc in ['loss', 'accuracy']:
plotly.offline.iplot(go.Figure(data=traces[loss_acc], layout=layout(loss_acc)))
There are 3 layers,
which amounts to two weight matrices of size $64 \times 784$ (between the 1st and the 2nd layer) and $10 \times 64$ (between the 2nd and the 3rd layer).
On top of that, there are two bias vectors of size $64$ (between the 1st and the 2nd layer) and $10$ (between the 2nd and the 3rd layer).
In total, there are $50890$ weights and biases.
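This count can be verified directly from the weight and bias arrays of an MLP instance (a quick check using the attributes defined above):
# Quick check of the parameter count of the default MLP (784 -> 64 -> 10)
mlp_check = MLP()
n_params = sum(layer.weight.size + layer.bias.size for layer in [mlp_check.fc1, mlp_check.fc2])
print(n_params)  # 784*64 + 64 + 64*10 + 10 = 50890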
To add a parameter determining the size of the intermediate layer, one modifies the __init__ method of the MLP class as follows:
def __init__(self, num_classes=10, hidden_layer_size = 64):
super(MLP, self).__init__()
self.fc1 = Linear(784, hidden_layer_size)
self.relu1 = ReLU()
self.fc2 = Linear(hidden_layer_size, 10)
class MLP_3linear(Module):
def __init__(self, num_classes=10, hidden_layers_sizes = (64, 64)):
super(MLP_3linear, self).__init__()
size1, size2 = hidden_layers_sizes
self.fc1 = Linear(784, size1)
self.relu1 = ReLU()
self.fc2 = Linear(size1, size2)
self.fc3 = Linear(size2, 10)
def forward(self, x):
x = self.fc1.forward(x)
x = self.relu1.forward(x)
x = self.fc2.forward(x)
x = self.fc3.forward(x)
return x
def backward(self, x, gradient):
gradient = self.fc3.backward(self.fc2.output,gradient)
gradient = self.fc2.backward(self.relu1.output,gradient)
gradient = self.relu1.backward(self.fc1.output,gradient)
gradient = self.fc1.backward(x,gradient)
return gradient
def gradientStep(self,lr):
self.fc3.gradientStep(lr)
self.fc2.gradientStep(lr)
self.fc1.gradientStep(lr)
return True
batch_size = 16
learning_rate = 0.01
total_nb_batches = 1000
every_n_iterations = 20
nb_epochs = 21
epoch_steps = 10
indices = {}
all_indices = np.random.randint(len(train_data), size=(total_nb_batches, 16))
indices['validation'], indices['train'] = all_indices[:total_nb_batches//2, :], all_indices[total_nb_batches//2:,:]
MLPs = [MLP(hidden_layer_size=10), MLP(hidden_layer_size=100), MLP_3linear(),\
MLP_3linear(hidden_layers_sizes=(10, 10)), MLP_3linear(hidden_layers_sizes=(100, 100))]
MLP_names = ['hidden layer: 10', 'hidden layer: 100', 'hidden layers: 64,64', \
'hidden layers: 10, 10', 'hidden layers: 100, 100']
MLP_colors = [('Greens', 'Reds'), ('Blues', 'Purples'), ('Greens', 'Reds'),\
('Blues', 'Purples'), ('Greens', 'Reds')]
# Plotting losses and accuracies
traces = {}
for mlp_name in MLP_names:
traces[mlp_name] = {}
for mlp, mlp_name, mlp_colors in zip(MLPs, MLP_names, MLP_colors):
lsc = LeastSquareCriterion()
Loss, Accuracy = optimized_training_procedure(mlp, lsc, indices, nb_epochs=nb_epochs, batch_size=batch_size,\
total_nb_batches=total_nb_batches, learning_rate=learning_rate,\
every_n_iterations=every_n_iterations)
colors = {}
colors['loss'], colors['accuracy'] = {}, {}
for loss_acc, Loss_Acc in zip(['loss', 'accuracy'], [Loss, Accuracy]):
colors[loss_acc]['train'] = colorscale_list(mlp_colors[1], len(Loss_Acc['train'])//epoch_steps+3, return_rgb_only=True)
colors[loss_acc]['validation'] = colorscale_list(mlp_colors[0], len(Loss_Acc['train'])//epoch_steps+3, return_rgb_only=True)
traces[mlp_name]['loss'] = []
traces[mlp_name]['accuracy'] = []
for train_valid in ['train', 'validation']:
for i, loss in enumerate(Loss[train_valid]):
if i%epoch_steps==0:
traces[mlp_name]['loss'].append(
go.Scatter(
x = [i*batch_size*every_n_iterations for i in range(1, len(loss)+1)],
y = np.array(loss),
mode = 'lines',
name = '[{}] {} loss: epoch {}'.format(mlp_name, train_valid.capitalize(), i+1),
line = dict(
width = 3,
color = colors['loss'][train_valid][i//epoch_steps+2],
shape = 'spline',
dash = 'solid' if train_valid=='train' else 'longdashdot'
),
hoverlabel = dict(
namelength = -1
)
)
)
for i, accuracy in enumerate(Accuracy[train_valid]):
if i%epoch_steps==0:
traces[mlp_name]['accuracy'].append(
go.Scatter(
x = [i*batch_size*every_n_iterations for i in range(1, len(accuracy)+1)],
y = np.array([acc/((i+1)*batch_size) for i, acc in enumerate(accuracy)]),
mode = 'lines',
name = '[{}] {} accuracy: epoch {}'.format(mlp_name, train_valid.capitalize(), i+1),
line = dict(
width = 3,
color = colors['accuracy'][train_valid][i//epoch_steps+2],
shape = 'spline',
dash = 'solid' if train_valid=='train' else 'longdashdot'
),
hoverlabel = dict(
namelength = -1
)
)
)
names_1 = ['hidden layer: 10', 'hidden layer: 100']
names_2 = ['hidden layer: 100', 'hidden layers: 64,64']
names_3 = ['hidden layers: 64,64', 'hidden layers: 10, 10']
names_4 = ['hidden layers: 10, 10', 'hidden layers: 100, 100']
for names in [names_1, names_2, names_3, names_4]:
for loss_acc in ['loss', 'accuracy']:
plotly.offline.iplot(go.Figure(data=[t for name in names for t in traces[name][loss_acc]],\
layout=layout(loss_acc, end_title=': {} vs {}'\
.format(names[0].capitalize(), names[1].capitalize()))))
display(Markdown("### Overfitting"))
batch_size = 16
learning_rate = 0.01
total_nb_batches = 260
every_n_iterations = 10
nb_epochs = 101
epoch_steps = 20
mlp = MLP_3linear(hidden_layers_sizes=(200, 200))
lsc = LeastSquareCriterion()
indices = {}
all_indices = np.arange(total_nb_batches*batch_size).reshape([-1, 16])
indices['validation'], indices['train'] = all_indices[:total_nb_batches//2, :], all_indices[total_nb_batches//2:,:]
Loss, _ = optimized_training_procedure(mlp, lsc, indices, nb_epochs=nb_epochs, batch_size=batch_size,\
total_nb_batches=total_nb_batches, learning_rate=learning_rate,\
every_n_iterations=every_n_iterations)
colors = {}
colors['loss'] = {}
colors['loss']['train'] = colorscale_list('Reds', len(Loss['train'])+3, return_rgb_only=True)
colors['loss']['validation'] = colorscale_list('Greens', len(Loss['train'])+3, return_rgb_only=True)
# Plotting losses and accuracies
traces_overfitting = {}
traces_overfitting['loss'] = []
for train_valid in ['train', 'validation']:
for i, loss in enumerate(Loss[train_valid]):
traces_overfitting['loss'].append(
go.Scatter(
x = [i*batch_size*every_n_iterations for i in range(1, len(loss)+1)],
y = np.array(loss),
mode = 'lines',
name = '{} loss: epoch {}'.format(train_valid.capitalize(), i+1),
line = dict(
width = 3,
color = colors['loss'][train_valid][i+2],
shape = 'spline',
dash = 'solid'
),
hoverlabel = dict(
namelength = -1
),
showlegend = (i % epoch_steps == 0)
)
)
plotly.offline.iplot(go.Figure(data=traces_overfitting['loss'], layout=layout('loss')))
One sees an instance of overfitting above, for MLP_3linear(hidden_layers_sizes=(200, 200)) trained for 101 epochs: the training loss keeps decreasing while the validation loss stops improving and eventually increases, which means that the MLP is overfitting the training data.
We modify the gradientStep method of the Linear class as follows:
def gradientStep(self,lr, weight_decay=0):
self.weight=self.weight-lr*(self.gradWeight+weight_decay*self.weight)
self.bias=self.bias-lr*self.gradBias
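Writing $\eta$ for the learning rate lr and $\lambda$ for weight_decay, this update reads
$$w \leftarrow w - \eta\,(\nabla_w L + \lambda\, w) = (1 - \eta\lambda)\, w - \eta\, \nabla_w L,$$
i.e. the weights are shrunk by a factor $(1 - \eta\lambda)$ at every step, which is equivalent to adding the penalty $\frac{\lambda}{2}\lVert w \rVert^2$ to the loss and thus discourages large weights.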
class MLP_3linear_weightDecays(Module):
def __init__(self, num_classes=10, hidden_layers_sizes = (64, 64), weight_decays = (0, 0, 0)):
super(MLP_3linear_weightDecays, self).__init__()
size1, size2 = hidden_layers_sizes
self.fc1 = Linear(784, size1)
self.relu1 = ReLU()
self.fc2 = Linear(size1, size2)
self.fc3 = Linear(size2, 10)
self.weight_decays = weight_decays
def forward(self, x):
x = self.fc1.forward(x)
x = self.relu1.forward(x)
x = self.fc2.forward(x)
x = self.fc3.forward(x)
return x
def backward(self, x, gradient):
gradient = self.fc3.backward(self.fc2.output,gradient)
gradient = self.fc2.backward(self.relu1.output,gradient)
gradient = self.relu1.backward(self.fc1.output,gradient)
gradient = self.fc1.backward(x,gradient)
return gradient
def gradientStep(self,lr):
wd1, wd2, wd3 = self.weight_decays
self.fc3.gradientStep(lr, weight_decay=wd1)
self.fc2.gradientStep(lr, weight_decay=wd2)
self.fc1.gradientStep(lr, weight_decay=wd3)
return True
display(Markdown("### Weight Decay: Overfitting"))
batch_size = 16
learning_rate = 0.01
total_nb_batches = 260
every_n_iterations = 10
nb_epochs = 21
indices = {}
all_indices = np.arange(total_nb_batches*batch_size).reshape([-1, 16])
indices['validation'], indices['train'] = all_indices[:total_nb_batches//2, :], all_indices[total_nb_batches//2:,:]
weight_decays = [(0, 0, 0), (0.1, 0.01, 0.01), (0.1, 0.1, 0.01), (0.1, 0.1, 0.1),\
(1, 0.1, 0.1), (1, 1, 0.1), (1, 1, 1)]
colors = {}
colors['loss'] = {}
colors['loss']['train'] = colorscale_list('Reds', len(weight_decays)+3, return_rgb_only=True)
colors['loss']['validation'] = colorscale_list('Greens', len(weight_decays)+3, return_rgb_only=True)
# Plotting losses
traces_wd = []
for i, wd in enumerate(weight_decays):
mlp = MLP_3linear_weightDecays(hidden_layers_sizes=(200, 200), weight_decays = wd)
lsc = LeastSquareCriterion()
Loss, _ = optimized_training_procedure(mlp, lsc, indices, nb_epochs=nb_epochs, batch_size=batch_size,\
total_nb_batches=total_nb_batches, learning_rate=learning_rate,\
every_n_iterations=every_n_iterations)
for train_valid in ['train', 'validation']:
loss = Loss[train_valid][-1]
traces_wd.append(
go.Scatter(
x = [i*batch_size*every_n_iterations for i in range(1, len(loss)+1)],
y = np.array(loss),
mode = 'lines',
name = '{} loss (epoch {}): weight decay = {}'.format(train_valid.capitalize(), nb_epochs, wd),
line = dict(
width = 3,
color = colors['loss'][train_valid][i+2],
shape = 'spline',
dash = 'solid'
),
hoverlabel = dict(
namelength = -1
)
)
)
plotly.offline.iplot(go.Figure(data=traces_wd,\
layout=layout('loss', end_title=' for various weight decays')))
Sure enough, as the weight decays increase, the gap between the training and validation losses shrinks, but the overall losses get larger.
So there is a compromise to be reached between low weight decays (which result in overfitting) and high ones (which result in greater overall losses).
The following questions are completely optional!
To take into account a momentum parameter momentum, we add two attributes to the Linear class, self.deltaWeight and self.deltaBias (both initialized to zero), and we modify the gradientStep method of that class as follows:
def gradientStep(self,lr, weight_decay=0, momentum=0):
self.deltaWeight=momentum*self.deltaWeight+ self.gradWeight + weight_decay*self.weight
self.deltaBias=momentum*self.deltaBias + self.gradBias
self.weight=self.weight-lr*self.deltaWeight
self.bias=self.bias-lr*self.deltaBias
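Writing $\mu$ for the momentum, $\eta$ for the learning rate and $\lambda$ for weight_decay, this amounts to maintaining a velocity
$$v \leftarrow \mu\, v + \nabla_w L + \lambda\, w, \qquad w \leftarrow w - \eta\, v,$$
so that gradients pointing in a consistent direction accumulate over successive steps, which accelerates the descent along that direction and damps oscillations.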
class MLP_3linear_wD_momentum(Module):
def __init__(self, num_classes=10, hidden_layers_sizes = (64, 64),\
weight_decays = (0, 0, 0), momentum = 0):
super(MLP_3linear_wD_momentum, self).__init__()
size1, size2 = hidden_layers_sizes
self.fc1 = Linear(784, size1)
self.relu1 = ReLU()
self.fc2 = Linear(size1, size2)
self.fc3 = Linear(size2, 10)
self.weight_decays = weight_decays
self.momentum = momentum
def forward(self, x):
x = self.fc1.forward(x)
x = self.relu1.forward(x)
x = self.fc2.forward(x)
x = self.fc3.forward(x)
return x
def backward(self, x, gradient):
gradient = self.fc3.backward(self.fc2.output,gradient)
gradient = self.fc2.backward(self.relu1.output,gradient)
gradient = self.relu1.backward(self.fc1.output,gradient)
gradient = self.fc1.backward(x,gradient)
return gradient
def gradientStep(self,lr):
wd1, wd2, wd3 = self.weight_decays
mom = self.momentum
self.fc3.gradientStep(lr, weight_decay=wd1, momentum=mom)
self.fc2.gradientStep(lr, weight_decay=wd2, momentum=mom)
self.fc1.gradientStep(lr, weight_decay=wd3, momentum=mom)
return True
display(Markdown("### Momentum"))
batch_size = 16
learning_rate = 0.01
total_nb_batches = 260
every_n_iterations = 10
nb_epochs = 1
indices = {}
all_indices = np.arange(total_nb_batches*batch_size).reshape([-1, 16])
indices['validation'], indices['train'] = all_indices[:total_nb_batches//2, :], all_indices[total_nb_batches//2:,:]
momenta = [0, 0.2, 0.4, 0.6, 0.7, 0.8, 0.85, 0.9, 0.95]
colors = {}
colors['train'] = colorscale_list('Reds', len(momenta)+3, return_rgb_only=True)
colors['validation'] = colorscale_list('Greens', len(momenta)+3, return_rgb_only=True)
# Plotting losses
traces_mom = {}
traces_mom['loss'] = []
traces_mom['accuracy'] = []
for i, mom in enumerate(momenta):
mlp = MLP_3linear_wD_momentum(momentum=mom)
lsc = LeastSquareCriterion()
Loss, Accuracy = optimized_training_procedure(mlp, lsc, indices, nb_epochs=nb_epochs, batch_size=batch_size,\
total_nb_batches=total_nb_batches, learning_rate=learning_rate,\
every_n_iterations=every_n_iterations)
for train_valid in ['train', 'validation']:
loss = Loss[train_valid][-1]
traces_mom['loss'].append(
go.Scatter(
x = [i*batch_size*every_n_iterations for i in range(1, len(loss)+1)],
y = np.array(loss),
mode = 'lines',
name = '{} loss (epoch {}): momentum = {}'.format(train_valid.capitalize(), nb_epochs, mom),
line = dict(
width = 3,
color = colors[train_valid][i+2],
shape = 'spline',
dash = 'solid'
),
hoverlabel = dict(
namelength = -1
)
)
)
accuracy = Accuracy[train_valid][-1]
traces_mom['accuracy'].append(
go.Scatter(
x = [i*batch_size*every_n_iterations for i in range(1, len(accuracy)+1)],
y = np.array([acc/((i+1)*batch_size) for i, acc in enumerate(accuracy)]),
mode = 'lines',
name = '{} accuracy (epoch {}): momentum = {}'.format(train_valid.capitalize(), nb_epochs, mom),
line = dict(
width = 3,
color = colors[train_valid][i+2],
shape = 'spline',
dash = 'solid'
),
hoverlabel = dict(
namelength = -1
)
)
)
plotly.offline.iplot(go.Figure(data=traces_mom['loss'], layout=layout('loss', end_title=' for various momenta')))
plotly.offline.iplot(go.Figure(data=traces_mom['accuracy'], layout=layout('accuracy', end_title=' for various momenta')))
Up to a momentum of about 0.8: the bigger the momentum, the more accelerated the training phase (the loss decreases and the accuracy increases faster).
Beyond 0.8: it backfires; the loss and the accuracy are not as good (the bigger the momentum, the worse they are).
class L1Criterion(Module):
"""
    This implementation of the L1 loss assumes that the data comes as a 2-dimensional array
    of size (batch_size, num_classes) and the labels as a vector of size (batch_size)
"""
def __init__(self, num_classes=10):
super(L1Criterion, self).__init__()
self.num_classes=num_classes
def forward(self, x,labels):
target=np.zeros([x.shape[0],self.num_classes])
for i in range(x.shape[0]):
target[i,labels[i]]=1
self.output = np.sum(np.abs(target-x),axis=0)
return np.sum(self.output)
def deriv(x):
if x > 0:
return 1.
elif x < 0:
return -1.
else:
return 0.
deriv = np.vectorize(deriv)
def backward(self, x, labels):
target=np.zeros([x.shape[0],self.num_classes])
for i in range(x.shape[0]):
target[i,labels[i]]=1
self.gradInput=self.deriv(x-target)
return self.gradInput
class CrossEntropyCriterion(Module):
"""
    This implementation of the Cross-Entropy loss assumes that the data comes as a 2-dimensional array
    of size (batch_size, num_classes) and the labels as a vector of size (batch_size)
"""
def __init__(self, num_classes=10):
super(CrossEntropyCriterion, self).__init__()
self.num_classes=num_classes
def sigmoid(x):
return 1./(1+np.exp(-x))
sigmoid = np.vectorize(sigmoid)
def forward(self, x,labels):
target=np.zeros([x.shape[0],self.num_classes])
for i in range(x.shape[0]):
target[i,labels[i]]=1
        # Negative log-likelihood, so that the reported loss decreases during training
        # (consistent with the gradient sigmoid(x) - target returned by backward)
        self.output = -np.sum(target*np.log(self.sigmoid(x)) \
                              + (1-target)*np.log(1-self.sigmoid(x)), axis=0)
        return np.sum(self.output)
def backward(self, x, labels):
target=np.zeros([x.shape[0],self.num_classes])
for i in range(x.shape[0]):
target[i,labels[i]]=1
self.gradInput=self.sigmoid(x)-target
return self.gradInput
batch_size = 16
learning_rate = 0.01
total_nb_batches = 1000
every_n_iterations = 20
nb_epochs = 21
epoch_steps = 10
indices = {}
all_indices = np.random.randint(len(train_data), size=(total_nb_batches, 16))
indices['validation'], indices['train'] = all_indices[:total_nb_batches//2, :], all_indices[total_nb_batches//2:,:]
Losses = [LeastSquareCriterion(), L1Criterion(), CrossEntropyCriterion()]
Losses_names = ['Least Square', 'L1', 'Cross-Entropy']
Losses_colors = [('Greens', 'Reds'), ('Blues', 'Purples'), ('spring', 'Oranges')]
# Plotting losses and accuracies
traces_losses = {}
for loss_name in Losses_names:
traces_losses[loss_name] = {}
for loss, loss_name, loss_colors in zip(Losses, Losses_names, Losses_colors):
mlp = MLP()
Loss, Accuracy = optimized_training_procedure(mlp, loss, indices, nb_epochs=nb_epochs, batch_size=batch_size,\
total_nb_batches=total_nb_batches, learning_rate=learning_rate,\
every_n_iterations=every_n_iterations)
colors = {}
colors['loss'], colors['accuracy'] = {}, {}
for loss_acc, Loss_Acc in zip(['loss', 'accuracy'], [Loss, Accuracy]):
colors[loss_acc]['train'] = colorscale_list(loss_colors[1], len(Loss_Acc['train'])//epoch_steps+3, return_rgb_only=True)
colors[loss_acc]['validation'] = colorscale_list(loss_colors[0], len(Loss_Acc['train'])//epoch_steps+3, return_rgb_only=True)
traces_losses[loss_name]['loss'] = []
traces_losses[loss_name]['accuracy'] = []
for train_valid in ['train', 'validation']:
for i, loss in enumerate(Loss[train_valid]):
if i%epoch_steps==0:
traces_losses[loss_name]['loss'].append(
go.Scatter(
x = [i*batch_size*every_n_iterations for i in range(1, len(loss)+1)],
y = np.array(loss),
mode = 'lines',
name = '[{}] {} loss: epoch {}'.format(loss_name, train_valid.capitalize(), i+1),
line = dict(
width = 3,
color = colors['loss'][train_valid][i//epoch_steps+2],
shape = 'spline',
dash = 'solid' if train_valid=='train' else 'longdashdot'
),
hoverlabel = dict(
namelength = -1
)
)
)
for i, accuracy in enumerate(Accuracy[train_valid]):
if i%epoch_steps==0:
traces_losses[loss_name]['accuracy'].append(
go.Scatter(
x = [i*batch_size*every_n_iterations for i in range(1, len(accuracy)+1)],
y = np.array([acc/((i+1)*batch_size) for i, acc in enumerate(accuracy)]),
mode = 'lines',
name = '[{}] {} accuracy: epoch {}'.format(loss_name, train_valid.capitalize(), i+1),
line = dict(
width = 3,
color = colors['accuracy'][train_valid][i//epoch_steps+2],
shape = 'spline',
dash = 'solid' if train_valid=='train' else 'longdashdot'
),
hoverlabel = dict(
namelength = -1
)
)
)
names_1 = ['Least Square', 'L1']
names_2 = ['L1', 'Cross-Entropy']
names_3 = ['Least Square', 'Cross-Entropy']
for names in [names_1, names_2, names_3]:
for loss_acc in ['loss', 'accuracy']:
plotly.offline.iplot(go.Figure(data=[t for name in names for t in traces_losses[name][loss_acc]],\
layout=layout(loss_acc, end_title=': {} vs {}'\
.format(names[0].capitalize(), names[1].capitalize()))))
L1 ends up with a very bad accuracy of $\sim 0.2$, owing to its derivatives being almost always either $-1$ or $1$.
Therefore, even when a digit is classified almost exactly, the penalty (and the gradient magnitude) remains the same, whereas with losses like least squares or cross-entropy, the closer the prediction, the smaller the penalty.
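Concretely, for a single output $x$ and target $t$, $\frac{\partial}{\partial x}\,\lvert x - t\rvert = \mathrm{sign}(x - t)$ stays at $\pm 1$ however close $x$ is to $t$, whereas $\frac{\partial}{\partial x}\,(x - t)^2 = 2\,(x - t)$ vanishes as the prediction approaches the target.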
class Tanh(Module):
def __init__(self, bias=True):
super(Tanh, self).__init__()
def forward(self, x):
self.output = np.tanh(x)
return self.output
def backward(self, x, gradOutput):
self.gradInput=(1 - np.tanh(x)**2)*gradOutput
return self.gradInput
class Sigmoid(Module):
def __init__(self, bias=True):
super(Sigmoid, self).__init__()
def sigmoid(x):
return 1./(1+np.exp(-x))
sigmoid = np.vectorize(sigmoid)
def forward(self, x):
self.output = self.sigmoid(x)
return self.output
def backward(self, x, gradOutput):
self.gradInput = self.sigmoid(x)*(1 - self.sigmoid(x))*gradOutput
return self.gradInput
class MLP_tanh(Module):
def __init__(self, num_classes=10, hidden_layer_size = 64):
super(MLP_tanh, self).__init__()
self.fc1 = Linear(784, hidden_layer_size)
self.tanh1 = Tanh()
self.fc2 = Linear(hidden_layer_size, 10)
def forward(self, x):
x = self.fc1.forward(x)
x = self.tanh1.forward(x)
x = self.fc2.forward(x)
return x
def backward(self, x, gradient):
gradient = self.fc2.backward(self.tanh1.output,gradient)
gradient = self.tanh1.backward(self.fc1.output,gradient)
gradient = self.fc1.backward(x,gradient)
return gradient
def gradientStep(self,lr):
self.fc2.gradientStep(lr)
self.fc1.gradientStep(lr)
return True
class MLP_sigmoid(Module):
def __init__(self, num_classes=10, hidden_layer_size = 64):
super(MLP_sigmoid, self).__init__()
self.fc1 = Linear(784, hidden_layer_size)
self.sig1 = Sigmoid()
self.fc2 = Linear(hidden_layer_size, 10)
def forward(self, x):
x = self.fc1.forward(x)
x = self.sig1.forward(x)
x = self.fc2.forward(x)
return x
def backward(self, x, gradient):
gradient = self.fc2.backward(self.sig1.output,gradient)
gradient = self.sig1.backward(self.fc1.output,gradient)
gradient = self.fc1.backward(x,gradient)
return gradient
def gradientStep(self,lr):
self.fc2.gradientStep(lr)
self.fc1.gradientStep(lr)
return True
batch_size = 16
learning_rate = 0.01
total_nb_batches = 1000
every_n_iterations = 20
nb_epochs = 21
epoch_steps = 10
indices = {}
all_indices = np.random.randint(len(train_data), size=(total_nb_batches, 16))
indices['validation'], indices['train'] = all_indices[:total_nb_batches//2, :], all_indices[total_nb_batches//2:,:]
MLPs = [MLP(), MLP_tanh(), MLP_sigmoid()]
MLP_names = ['ReLU', 'Hyperbolic Tangent', 'Sigmoid']
MLP_colors = [('Greens', 'Reds'), ('Blues', 'Purples'), ('spring', 'Oranges')]
# Plotting losses and accuracies
traces_nl = {}
for mlp_name in MLP_names:
traces_nl[mlp_name] = {}
for mlp, mlp_name, mlp_colors in zip(MLPs, MLP_names, MLP_colors):
lsc = LeastSquareCriterion()
Loss, Accuracy = optimized_training_procedure(mlp, lsc, indices, nb_epochs=nb_epochs, batch_size=batch_size,\
total_nb_batches=total_nb_batches, learning_rate=learning_rate,\
every_n_iterations=every_n_iterations)
colors = {}
colors['loss'], colors['accuracy'] = {}, {}
for loss_acc, Loss_Acc in zip(['loss', 'accuracy'], [Loss, Accuracy]):
colors[loss_acc]['train'] = colorscale_list(mlp_colors[1], len(Loss_Acc['train'])//epoch_steps+3, return_rgb_only=True)
colors[loss_acc]['validation'] = colorscale_list(mlp_colors[0], len(Loss_Acc['train'])//epoch_steps+3, return_rgb_only=True)
traces_nl[mlp_name]['loss'] = []
traces_nl[mlp_name]['accuracy'] = []
for train_valid in ['train', 'validation']:
for i, loss in enumerate(Loss[train_valid]):
if i%epoch_steps==0:
traces_nl[mlp_name]['loss'].append(
go.Scatter(
x = [i*batch_size*every_n_iterations for i in range(1, len(loss)+1)],
y = np.array(loss),
mode = 'lines',
name = '[{}] {} loss: epoch {}'.format(mlp_name, train_valid.capitalize(), i+1),
line = dict(
width = 3,
color = colors['loss'][train_valid][i//epoch_steps+2],
shape = 'spline',
dash = 'solid' if train_valid=='train' else 'longdashdot'
),
hoverlabel = dict(
namelength = -1
)
)
)
for i, accuracy in enumerate(Accuracy[train_valid]):
if i%epoch_steps==0:
traces_nl[mlp_name]['accuracy'].append(
go.Scatter(
x = [i*batch_size*every_n_iterations for i in range(1, len(accuracy)+1)],
y = np.array([acc/((i+1)*batch_size) for i, acc in enumerate(accuracy)]),
mode = 'lines',
name = '[{}] {} accuracy: epoch {}'.format(mlp_name, train_valid.capitalize(), i+1),
line = dict(
width = 3,
color = colors['accuracy'][train_valid][i//epoch_steps+2],
shape = 'spline',
dash = 'solid' if train_valid=='train' else 'longdashdot'
),
hoverlabel = dict(
namelength = -1
)
)
)
names_1 = ['ReLU', 'Hyperbolic Tangent']
names_2 = ['Hyperbolic Tangent', 'Sigmoid']
names_3 = ['ReLU', 'Sigmoid']
for names in [names_1, names_2, names_3]:
for loss_acc in ['loss', 'accuracy']:
plotly.offline.iplot(go.Figure(data=[t for name in names for t in traces_nl[name][loss_acc]],\
layout=layout(loss_acc, end_title=': {} vs {}'\
.format(names[0].capitalize(), names[1].capitalize()))))