The architecture of this code is freely inspired by the Torch and PyTorch architectures, and some samples of code are re-used. It is, however, designed to be as concise as possible, not for efficiency or flexibility (i.e. if you ever want to do deep learning outside of this class, don't try to adapt this code; use an existing framework).
As always, some useful imports:
# /!\ I'm using python 3
import numpy as np
# this is the key library for manipulating arrays. Use the online resources! http://www.numpy.org/
import matplotlib.pyplot as plt
# used to read images, display and plot http://matplotlib.org/api/pyplot_api.html .
#You can also check this simple intro to using ipython notebook with images https://matplotlib.org/users/image_tutorial.html
%matplotlib inline
# to display directly in the notebook
import scipy.ndimage as ndimage
# one of several python libraries for image processing
import scipy.io as sio
# useful for loading .mat files
import scipy.optimize as optimize
# useful for optimization
plt.rcParams['image.cmap'] = 'gray'
# by default, the grayscale images are displayed with the jet colormap: use grayscale instead
import gzip
import _pickle as cPickle
import os
import os.path
from six.moves import urllib
import plotly
import plotly.plotly as py
import plotly.graph_objs as go
plotly.offline.init_notebook_mode(connected=True)
from IPython.core.display import display, HTML, Markdown
# The polling here is to ensure that plotly.js has already been loaded before
# setting display alignment in order to avoid a race condition.
display(HTML(
'<script>'
'var waitForPlotly = setInterval( function() {'
'if( typeof(window.Plotly) !== "undefined" ){'
'MathJax.Hub.Config({ SVG: { font: "STIX-Web" }, displayAlign: "center" });'
'MathJax.Hub.Queue(["setRenderer", MathJax.Hub, "SVG"]);'
'clearInterval(waitForPlotly);'
'}}, 250 );'
'</script>'
))
We will consider a neural network as a modular architecture, where every layer and every loss function is a Module. We define the Linear and ReLU layers, which are needed to implement an MLP, as well as a simple 2-layer MLP and the least-squares loss function, LeastSquareCriterion.
import math
import numpy as np
class Module(object):
def __init__(self):
self.gradInput=None
self.output=None
def forward(self, *input):
"""Defines the computation performed at every call.
        Should be overridden by all subclasses.
"""
raise NotImplementedError
def backward(self, *input):
"""Defines the computation performed at every call.
Should be overriden by all subclasses.
"""
raise NotImplementedError
class LeastSquareCriterion(Module):
"""
    This implementation of the least-squares loss assumes that the data comes as a 2-dimensional array
    of size (batch_size, num_classes) and the labels as a vector of size (batch_size)
"""
def __init__(self, num_classes=10):
super(LeastSquareCriterion, self).__init__()
self.num_classes=num_classes
def forward(self, x,labels):
target=np.zeros([x.shape[0],self.num_classes])
for i in range(x.shape[0]):
target[i,labels[i]]=1
self.output = np.sum((target-x)**2,axis=0)
return np.sum(self.output)
def backward(self, x, labels):
        self.gradInput=np.copy(x)  # copy so that the network output is not modified in place
for i in range(x.shape[0]):
self.gradInput[i,labels[i]]=x[i,labels[i]]-1
return self.gradInput
class Linear(Module):
"""
    The input is supposed to have two dimensions (batch_size, in_features)
"""
def __init__(self, in_features, out_features, bias=True):
super(Linear, self).__init__()
self.in_features = in_features
self.out_features = out_features
self.weight = math.sqrt(1. / (out_features* in_features))*np.random.randn(out_features, in_features)
self.bias = np.zeros(out_features)
self.gradWeight=None
self.gradBias=None
self.deltaWeight=0
self.deltaBias=0
def forward(self, x):
self.output= np.dot(x,self.weight.transpose())+np.repeat(self.bias.reshape([1,-1]),x.shape[0], axis=0)
return self.output
def backward(self, x, gradOutput):
self.gradInput=np.dot(gradOutput,self.weight)
self.gradWeight=np.dot(gradOutput.transpose(),x)
self.gradBias=np.sum(gradOutput, axis=0)
return self.gradInput
def gradientStep(self,lr, weight_decay=0, momentum=0):
self.deltaWeight=momentum*self.deltaWeight+ self.gradWeight + weight_decay*self.weight
self.deltaBias=momentum*self.deltaBias + self.gradBias
self.weight=self.weight-lr*self.deltaWeight
self.bias=self.bias-lr*self.deltaBias
class ReLU(Module):
def __init__(self, bias=True):
super(ReLU, self).__init__()
def forward(self, x):
self.output=x.clip(0)
return self.output
def backward(self, x, gradOutput):
self.gradInput=(x>0)*gradOutput
return self.gradInput
class MLP(Module):
def __init__(self, num_classes=10, hidden_layer_size = 64):
super(MLP, self).__init__()
self.fc1 = Linear(784, hidden_layer_size)
self.relu1 = ReLU()
self.fc2 = Linear(hidden_layer_size, 10)
def forward(self, x):
x = self.fc1.forward(x)
x = self.relu1.forward(x)
x = self.fc2.forward(x)
return x
def backward(self, x, gradient):
gradient = self.fc2.backward(self.relu1.output,gradient)
gradient = self.relu1.backward(self.fc1.output,gradient)
gradient = self.fc1.backward(x,gradient)
return gradient
def gradientStep(self,lr):
self.fc2.gradientStep(lr)
self.fc1.gradientStep(lr)
return True
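Before training, it is worth sanity-checking the backward pass. The sketch below compares the analytical gradient computed by Linear.backward with a finite-difference estimate; the toy layer sizes, the step eps and the ad-hoc loss (the sum of all outputs) are arbitrary choices made only for this check.
# Finite-difference sanity check of Linear.backward (minimal sketch).
# The layer sizes, the step eps and the toy loss sum(output) are arbitrary choices.
def check_linear_gradient(eps=1e-6):
    np.random.seed(0)
    lin = Linear(5, 3)
    x = np.random.randn(4, 5)
    grad_output = np.ones((4, 3))   # gradient of the toy loss sum(output) w.r.t. the output
    lin.forward(x)
    lin.backward(x, grad_output)    # fills lin.gradWeight analytically
    analytical = lin.gradWeight[0, 0]
    # central finite difference on weight[0, 0]
    lin.weight[0, 0] += eps
    loss_plus = np.sum(lin.forward(x))
    lin.weight[0, 0] -= 2 * eps
    loss_minus = np.sum(lin.forward(x))
    lin.weight[0, 0] += eps         # restore the weight
    numerical = (loss_plus - loss_minus) / (2 * eps)
    print('analytical:', analytical, 'numerical:', numerical)  # the two values should match closely
check_linear_gradient()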
To train a network, we will need data. Download the MNIST data (~15 MB). It consists of 28x28 images (each loaded as a 784-dimensional vector) and the associated labels, split into training, validation and test sets. For this practical session, you can focus on the training and validation sets.
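If mnist.pkl.gz is not already in the working directory, the already-imported urllib can fetch it. The URL below is the classic deeplearning.net mirror, which may no longer be online; treat this cell as a convenience sketch and adapt the URL if needed.
# Convenience sketch: fetch mnist.pkl.gz if it is missing.
# The URL is the historical deeplearning.net mirror and may be offline; replace it if needed.
mnist_url = 'http://deeplearning.net/data/mnist/mnist.pkl.gz'
if not os.path.isfile('mnist.pkl.gz'):
    urllib.request.urlretrieve(mnist_url, 'mnist.pkl.gz')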
# Load the dataset
f = gzip.open('mnist.pkl.gz', 'rb')
train_set, val_set, test_set = cPickle.load(f, encoding='latin1')
f.close()
train_data=train_set[0]
train_labels=train_set[1]
val_data=val_set[0]
val_labels=val_set[1]
N_train=train_data.shape[0]
N_val=val_data.shape[0]
# check data makes sense
plt.imshow(train_data[0,:].reshape(28,28))
print(train_labels[0])
learning_rate = 0.01
def iteration_step(mlp, lsc, X, Y, learning_rate=learning_rate, training=True):
X_forward = mlp.forward(X)
if training:
grad_loss = lsc.backward(X_forward, Y)
mlp.backward(X, grad_loss)
mlp.gradientStep(learning_rate)
if not training:
        # return the loss and an array indicating whether the predicted labels are correct
return lsc.forward(X_forward, Y), np.argmax(X_forward, axis=1) == Y
batch_size = 16
learning_rate = 0.01
mlp = MLP()
lsc = LeastSquareCriterion()
for _ in range(50):
ind_rand = np.random.randint(len(train_data), size=16)
X, Y = train_data[ind_rand, :], train_labels[ind_rand] # images, labels
iteration_step(mlp, lsc, X, Y, learning_rate=learning_rate)
test_ind = np.random.randint(len(train_data), size=16)
display(Markdown("### Images and their predicted labels"))
fig = plt.figure(figsize = (20., 20.))
for index, n in enumerate(test_ind):
x_test = train_data[n, :].reshape(28, 28)
y_test = train_labels[n]
plt.subplot(16, 4, index+1)\
        .set_title(str(np.argmax(mlp.forward(train_data[n, :].reshape(1, -1)))))
plt.imshow(x_test)
plt.axis('off')
plt.tight_layout(w_pad=0, h_pad=0)
plt.show()
batch_size = 16
learning_rate = 0.01
total_nb_batches = 1000
every_n_iterations = 10
def training_procedure(mlp, lsc, indices, batch_size=batch_size, total_nb_batches=total_nb_batches,\
learning_rate=learning_rate, every_n_iterations=every_n_iterations):
loss = {}
loss['train'] = []
loss['validation'] = []
accuracy = {}
accuracy['train'] = []
accuracy['validation'] = []
nb_iteration = 1
for inds in zip(indices['train'], indices['validation']):
for ind, train_valid in zip(inds, ['train', 'validation']):
X, Y = train_data[ind, :], train_labels[ind] # images, labels
if train_valid == 'train':
iteration_step(mlp, lsc, X, Y, learning_rate=learning_rate)
if nb_iteration%every_n_iterations == 0:
current_loss, correctness = iteration_step(mlp, lsc, X, Y, learning_rate=learning_rate, training=False)
loss[train_valid].append(current_loss)
accuracy[train_valid].append((accuracy[train_valid][-1] if accuracy[train_valid] else 0)\
+ np.sum(correctness))
nb_iteration += 1
return loss, accuracy
batch_size = 16
learning_rate = 0.01
total_nb_batches = 1000
every_n_iterations = 10
mlp = MLP()
lsc = LeastSquareCriterion()
indices = {}
all_indices = np.random.randint(len(train_data), size=(total_nb_batches, 16))
indices['validation'], indices['train'] = all_indices[:total_nb_batches//2, :], all_indices[total_nb_batches//2:,:]
loss, accuracy = training_procedure(mlp, lsc, indices, batch_size=batch_size, total_nb_batches=total_nb_batches,\
learning_rate=learning_rate, every_n_iterations=every_n_iterations)
# Plotting loss and accuracy
traces = {}
traces['loss'] = []
traces['accuracy'] = []
for train_valid in ['train', 'validation']:
traces['loss'].append(
go.Scatter(x = [i*batch_size*every_n_iterations for i in range(1, len(loss[train_valid])+1)],
y = np.array(loss[train_valid]),
mode = 'lines',
name = '{} loss'.format(train_valid.capitalize())
)
)
traces['accuracy'].append(
go.Scatter(x = [i*batch_size*every_n_iterations for i in range(1, len(accuracy[train_valid])+1)],
y = np.array([acc/((i+1)*batch_size) for i, acc in enumerate(accuracy[train_valid])]),
mode = 'lines',
name = '{} accuracy'.format(train_valid.capitalize())
)
)
def layout(loss_acc, end_title=''):
if loss_acc == 'loss':
return go.Layout(
title= 'Losses{}'.format(end_title),
hovermode= 'closest',
xaxis= dict(
title= 'Iterations',
ticklen= 5,
zeroline= False,
gridwidth= 2,
),
yaxis=dict(
title= 'Loss',
ticklen= 5,
gridwidth= 2,
),
showlegend= True
)
elif loss_acc == 'accuracy':
return go.Layout(
title= 'Accuracy{}'.format(end_title),
hovermode= 'closest',
xaxis= dict(
title= 'Iterations',
ticklen= 5,
zeroline= False,
gridwidth= 2,
),
yaxis=dict(
title= 'Accuracy',
ticklen= 5,
gridwidth= 2,
),
showlegend= True
)
for loss_acc in ['loss', 'accuracy']:
plotly.offline.iplot(go.Figure(data=traces[loss_acc], layout=layout(loss_acc)))
One introduces a number of epochs, that is, the number of times one iterates over a given training set. Why iterate multiple times over the same training set? Because the backpropagation procedure amounts to a gradient descent, which is an iterative approximation algorithm: it therefore makes sense to repeat the previous training procedure several times, so as to converge toward a local minimum of the loss function (over the given training set).
batch_size = 16
learning_rate = 0.01
total_nb_batches = 1000
every_n_iterations = 10
nb_epochs = 10
def optimized_training_procedure(mlp, lsc, indices, nb_epochs=nb_epochs, batch_size=batch_size, total_nb_batches=total_nb_batches,\
learning_rate=learning_rate, every_n_iterations=every_n_iterations):
Loss = {}
Loss['train'] = []
Loss['validation'] = []
Accuracy = {}
Accuracy['train'] = []
Accuracy['validation'] = []
for _ in range(nb_epochs):
loss, accuracy = training_procedure(mlp, lsc, indices, batch_size=batch_size, total_nb_batches=total_nb_batches,\
learning_rate=learning_rate, every_n_iterations=every_n_iterations)
for train_valid in ['train', 'validation']:
Loss[train_valid].append(loss[train_valid])
Accuracy[train_valid].append(accuracy[train_valid])
return Loss, Accuracy
# Colorscales
def colorscale_list(cmap, number_colors, return_rgb_only=False):
cm = plt.get_cmap(cmap)
colors = [np.array(cm(i/number_colors)) for i in range(1, number_colors+1)]
rgb_colors_plotly = []
rgb_colors_only = []
for i, c in enumerate(colors):
col = 'rgb{}'.format(tuple(255*c[:-1]))
rgb_colors_only.append(col)
rgb_colors_plotly.append([i/number_colors, col])
rgb_colors_plotly.append([(i+1)/number_colors, col])
return rgb_colors_only if return_rgb_only else rgb_colors_plotly
batch_size = 16
learning_rate = 0.01
total_nb_batches = 1000
every_n_iterations = 10
nb_epochs = 10
mlp = MLP()
lsc = LeastSquareCriterion()
indices = {}
all_indices = np.random.randint(len(train_data), size=(total_nb_batches, 16))
indices['validation'], indices['train'] = all_indices[:total_nb_batches//2, :], all_indices[total_nb_batches//2:,:]
Loss, Accuracy = optimized_training_procedure(mlp, lsc, indices, nb_epochs=nb_epochs, batch_size=batch_size,\
total_nb_batches=total_nb_batches, learning_rate=learning_rate, every_n_iterations=every_n_iterations)
colors = {}
colors['loss'], colors['accuracy'] = {}, {}
for loss_acc, Loss_Acc in zip(['loss', 'accuracy'], [Loss, Accuracy]):
colors[loss_acc]['train'] = colorscale_list('Reds', len(Loss_Acc['train'])+3, return_rgb_only=True)
colors[loss_acc]['validation'] = colorscale_list('Greens', len(Loss_Acc['train'])+3, return_rgb_only=True)
# Plotting losses and accuracies
traces = {}
traces['loss'] = []
traces['accuracy'] = []
for train_valid in ['train', 'validation']:
for i, loss in enumerate(Loss[train_valid]):
traces['loss'].append(
go.Scatter(
x = [i*batch_size*every_n_iterations for i in range(1, len(loss)+1)],
y = np.array(loss),
mode = 'lines',
name = '{} loss: epoch {}'.format(train_valid.capitalize(), i+1),
line = dict(
width = 3,
color = colors['loss'][train_valid][i+2],
shape = 'spline',
dash = 'solid'
),
hoverlabel = dict(
namelength = -1
)
)
)
for i, accuracy in enumerate(Accuracy[train_valid]):
traces['accuracy'].append(
go.Scatter(
x = [i*batch_size*every_n_iterations for i in range(1, len(accuracy)+1)],
y = np.array([acc/((i+1)*batch_size) for i, acc in enumerate(accuracy)]),
mode = 'lines',
name = '{} accuracy: epoch {}'.format(train_valid.capitalize(), i+1),
line = dict(
width = 3,
color = colors['accuracy'][train_valid][i+2],
shape = 'spline',
dash = 'solid'
),
hoverlabel = dict(
namelength = -1
)
)
)
for loss_acc in ['loss', 'accuracy']:
plotly.offline.iplot(go.Figure(data=traces[loss_acc], layout=layout(loss_acc)))
There are 3 layers,
which amounts to two weight matrices of size $64 \times 784$ (between the 1st and the 2nd layer) and $10 \times 64$ (between the 2nd and the 3rd layer).
On top of that, there are two bias vectors of size $64$ (between the 1st and the 2nd layer) and $10$ (between the 2nd and the 3rd layer).
In total, there are $50890$ weights and biases.
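This count can be verified directly from the weight and bias arrays of an MLP instance (a quick check using the attributes defined above):
# Quick check of the parameter count of the default MLP (784 -> 64 -> 10)
mlp_check = MLP()
n_params = sum(layer.weight.size + layer.bias.size for layer in [mlp_check.fc1, mlp_check.fc2])
print(n_params)  # 784*64 + 64 + 64*10 + 10 = 50890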
To add a parameter determining the size of the intermediate layer, one modifies the __init__ method of the MLP class as follows:
def __init__(self, num_classes=10, hidden_layer_size = 64):
super(MLP, self).__init__()
self.fc1 = Linear(784, hidden_layer_size)
self.relu1 = ReLU()
self.fc2 = Linear(hidden_layer_size, 10)
class MLP_3linear(Module):
def __init__(self, num_classes=10, hidden_layers_sizes = (64, 64)):
super(MLP_3linear, self).__init__()
size1, size2 = hidden_layers_sizes
self.fc1 = Linear(784, size1)
self.relu1 = ReLU()
self.fc2 = Linear(size1, size2)
self.fc3 = Linear(size2, 10)
def forward(self, x):
x = self.fc1.forward(x)
x = self.relu1.forward(x)
x = self.fc2.forward(x)
x = self.fc3.forward(x)
return x
def backward(self, x, gradient):
gradient = self.fc3.backward(self.fc2.output,gradient)
gradient = self.fc2.backward(self.relu1.output,gradient)
gradient = self.relu1.backward(self.fc1.output,gradient)
gradient = self.fc1.backward(x,gradient)
return gradient
def gradientStep(self,lr):
self.fc3.gradientStep(lr)
self.fc2.gradientStep(lr)
self.fc1.gradientStep(lr)
return True
batch_size = 16
learning_rate = 0.01
total_nb_batches = 1000
every_n_iterations = 20
nb_epochs = 21
epoch_steps = 10
indices = {}
all_indices = np.random.randint(len(train_data), size=(total_nb_batches, 16))
indices['validation'], indices['train'] = all_indices[:total_nb_batches//2, :], all_indices[total_nb_batches//2:,:]
MLPs = [MLP(hidden_layer_size=10), MLP(hidden_layer_size=100), MLP_3linear(),\
MLP_3linear(hidden_layers_sizes=(10, 10)), MLP_3linear(hidden_layers_sizes=(100, 100))]
MLP_names = ['hidden layer: 10', 'hidden layer: 100', 'hidden layers: 64,64', \
'hidden layers: 10, 10', 'hidden layers: 100, 100']
MLP_colors = [('Greens', 'Reds'), ('Blues', 'Purples'), ('Greens', 'Reds'),\
('Blues', 'Purples'), ('Greens', 'Reds')]
# Plotting losses and accuracies
traces = {}
for mlp_name in MLP_names:
traces[mlp_name] = {}
for mlp, mlp_name, mlp_colors in zip(MLPs, MLP_names, MLP_colors):
lsc = LeastSquareCriterion()
Loss, Accuracy = optimized_training_procedure(mlp, lsc, indices, nb_epochs=nb_epochs, batch_size=batch_size,\
total_nb_batches=total_nb_batches, learning_rate=learning_rate,\
every_n_iterations=every_n_iterations)
colors = {}
colors['loss'], colors['accuracy'] = {}, {}
for loss_acc, Loss_Acc in zip(['loss', 'accuracy'], [Loss, Accuracy]):
colors[loss_acc]['train'] = colorscale_list(mlp_colors[1], len(Loss_Acc['train'])//epoch_steps+3, return_rgb_only=True)
colors[loss_acc]['validation'] = colorscale_list(mlp_colors[0], len(Loss_Acc['train'])//epoch_steps+3, return_rgb_only=True)
traces[mlp_name]['loss'] = []
traces[mlp_name]['accuracy'] = []
for train_valid in ['train', 'validation']:
for i, loss in enumerate(Loss[train_valid]):
if i%epoch_steps==0:
traces[mlp_name]['loss'].append(
go.Scatter(
x = [i*batch_size*every_n_iterations for i in range(1, len(loss)+1)],
y = np.array(loss),
mode = 'lines',
name = '[{}] {} loss: epoch {}'.format(mlp_name, train_valid.capitalize(), i+1),
line = dict(
width = 3,
color = colors['loss'][train_valid][i//epoch_steps+2],
shape = 'spline',
dash = 'solid' if train_valid=='train' else 'longdashdot'
),
hoverlabel = dict(
namelength = -1
)
)
)
for i, accuracy in enumerate(Accuracy[train_valid]):
if i%epoch_steps==0:
traces[mlp_name]['accuracy'].append(
go.Scatter(
x = [i*batch_size*every_n_iterations for i in range(1, len(accuracy)+1)],
y = np.array([acc/((i+1)*batch_size) for i, acc in enumerate(accuracy)]),
mode = 'lines',
name = '[{}] {} accuracy: epoch {}'.format(mlp_name, train_valid.capitalize(), i+1),
line = dict(
width = 3,
color = colors['accuracy'][train_valid][i//epoch_steps+2],
shape = 'spline',
dash = 'solid' if train_valid=='train' else 'longdashdot'
),
hoverlabel = dict(
namelength = -1
)
)
)
names_1 = ['hidden layer: 10', 'hidden layer: 100']
names_2 = ['hidden layer: 100', 'hidden layers: 64,64']
names_3 = ['hidden layers: 64,64', 'hidden layers: 10, 10']
names_4 = ['hidden layers: 10, 10', 'hidden layers: 100, 100']
for names in [names_1, names_2, names_3, names_4]:
for loss_acc in ['loss', 'accuracy']:
plotly.offline.iplot(go.Figure(data=[t for name in names for t in traces[name][loss_acc]],\
layout=layout(loss_acc, end_title=': {} vs {}'\
.format(names[0].capitalize(), names[1].capitalize()))))
display(Markdown("### Overfitting"))
batch_size = 16
learning_rate = 0.01
total_nb_batches = 260
every_n_iterations = 10
nb_epochs = 101
epoch_steps = 20
mlp = MLP_3linear(hidden_layers_sizes=(200, 200))
lsc = LeastSquareCriterion()
indices = {}
all_indices = np.arange(total_nb_batches*batch_size).reshape([-1, 16])
indices['validation'], indices['train'] = all_indices[:total_nb_batches//2, :], all_indices[total_nb_batches//2:,:]
Loss, _ = optimized_training_procedure(mlp, lsc, indices, nb_epochs=nb_epochs, batch_size=batch_size,\
total_nb_batches=total_nb_batches, learning_rate=learning_rate,\
every_n_iterations=every_n_iterations)
colors = {}
colors['loss'] = {}
colors['loss']['train'] = colorscale_list('Reds', len(Loss['train'])+3, return_rgb_only=True)
colors['loss']['validation'] = colorscale_list('Greens', len(Loss['train'])+3, return_rgb_only=True)
# Plotting losses and accuracies
traces_overfitting = {}
traces_overfitting['loss'] = []
for train_valid in ['train', 'validation']:
for i, loss in enumerate(Loss[train_valid]):
traces_overfitting['loss'].append(
go.Scatter(
x = [i*batch_size*every_n_iterations for i in range(1, len(loss)+1)],
y = np.array(loss),
mode = 'lines',
name = '{} loss: epoch {}'.format(train_valid.capitalize(), i+1),
line = dict(
width = 3,
color = colors['loss'][train_valid][i+2],
shape = 'spline',
dash = 'solid'
),
hoverlabel = dict(
namelength = -1
),
showlegend = (i % epoch_steps == 0)
)
)
plotly.offline.iplot(go.Figure(data=traces_overfitting['loss'], layout=layout('loss')))
One sees an instance of overfitting above, for MLP_3linear(hidden_layers_sizes=(200, 200)) trained for 101 epochs: the training loss keeps decreasing while the validation loss stops improving and eventually increases, which means that the MLP is overfitting the training data.
We modify the gradientStep method of the Linear class as follows:
def gradientStep(self,lr, weight_decay=0):
self.weight=self.weight-lr*(self.gradWeight+weight_decay*self.weight)
self.bias=self.bias-lr*self.gradBias
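Writing $\eta$ for the learning rate lr and $\lambda$ for weight_decay, this update reads
$$w \leftarrow w - \eta\,(\nabla_w L + \lambda\, w) = (1 - \eta\lambda)\, w - \eta\, \nabla_w L,$$
i.e. the weights are shrunk by a factor $(1 - \eta\lambda)$ at every step, which is equivalent to adding the penalty $\frac{\lambda}{2}\lVert w \rVert^2$ to the loss and thus discourages large weights.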
class MLP_3linear_weightDecays(Module):
def __init__(self, num_classes=10, hidden_layers_sizes = (64, 64), weight_decays = (0, 0, 0)):
super(MLP_3linear_weightDecays, self).__init__()
size1, size2 = hidden_layers_sizes
self.fc1 = Linear(784, size1)
self.relu1 = ReLU()
self.fc2 = Linear(size1, size2)
self.fc3 = Linear(size2, 10)
self.weight_decays = weight_decays
def forward(self, x):
x = self.fc1.forward(x)
x = self.relu1.forward(x)
x = self.fc2.forward(x)
x = self.fc3.forward(x)
return x
def backward(self, x, gradient):
gradient = self.fc3.backward(self.fc2.output,gradient)
gradient = self.fc2.backward(self.relu1.output,gradient)
gradient = self.relu1.backward(self.fc1.output,gradient)
gradient = self.fc1.backward(x,gradient)
return gradient
def gradientStep(self,lr):
wd1, wd2, wd3 = self.weight_decays
self.fc3.gradientStep(lr, weight_decay=wd1)
self.fc2.gradientStep(lr, weight_decay=wd2)
self.fc1.gradientStep(lr, weight_decay=wd3)
return True
display(Markdown("### Weight Decay: Overfitting"))
batch_size = 16
learning_rate = 0.01
total_nb_batches = 260
every_n_iterations = 10
nb_epochs = 21
indices = {}
all_indices = np.arange(total_nb_batches*batch_size).reshape([-1, 16])
indices['validation'], indices['train'] = all_indices[:total_nb_batches//2, :], all_indices[total_nb_batches//2:,:]
weight_decays = [(0, 0, 0), (0.1, 0.01, 0.01), (0.1, 0.1, 0.01), (0.1, 0.1, 0.1),\
(1, 0.1, 0.1), (1, 1, 0.1), (1, 1, 1)]
colors = {}
colors['loss'] = {}
colors['loss']['train'] = colorscale_list('Reds', len(weight_decays)+3, return_rgb_only=True)
colors['loss']['validation'] = colorscale_list('Greens', len(weight_decays)+3, return_rgb_only=True)
# Plotting losses
traces_wd = []
for i, wd in enumerate(weight_decays):
mlp = MLP_3linear_weightDecays(hidden_layers_sizes=(200, 200), weight_decays = wd)
lsc = LeastSquareCriterion()
Loss, _ = optimized_training_procedure(mlp, lsc, indices, nb_epochs=nb_epochs, batch_size=batch_size,\
total_nb_batches=total_nb_batches, learning_rate=learning_rate,\
every_n_iterations=every_n_iterations)
for train_valid in ['train', 'validation']:
loss = Loss[train_valid][-1]
traces_wd.append(
go.Scatter(
x = [i*batch_size*every_n_iterations for i in range(1, len(loss)+1)],
y = np.array(loss),
mode = 'lines',
name = '{} loss (epoch {}): weight decay = {}'.format(train_valid.capitalize(), nb_epochs, wd),
line = dict(
width = 3,
color = colors['loss'][train_valid][i+2],
shape = 'spline',
dash = 'solid'
),
hoverlabel = dict(
namelength = -1
)
)
)
plotly.offline.iplot(go.Figure(data=traces_wd,\
layout=layout('loss', end_title=' for various weight decays')))
Sure enough, as the weight decays increase, the gap between the training and validation losses shrinks, but the overall losses get larger.
So there is a compromise to be reached between low weight decays (which result in overfitting) and high ones (which result in greater overall losses).
The following questions are completely optional!
To take into account a momentum parameter momentum, we add two attributes to the Linear class, self.deltaWeight and self.deltaBias (both initialized to zero), and we modify the gradientStep method of that class as follows:
def gradientStep(self,lr, weight_decay=0, momentum=0):
self.deltaWeight=momentum*self.deltaWeight+ self.gradWeight + weight_decay*self.weight
self.deltaBias=momentum*self.deltaBias + self.gradBias
self.weight=self.weight-lr*self.deltaWeight
self.bias=self.bias-lr*self.deltaBias
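Writing $\mu$ for the momentum, $\eta$ for the learning rate and $\lambda$ for weight_decay, this amounts to maintaining a velocity
$$v \leftarrow \mu\, v + \nabla_w L + \lambda\, w, \qquad w \leftarrow w - \eta\, v,$$
so that gradients pointing in a consistent direction accumulate over successive steps, which accelerates the descent along that direction and damps oscillations.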
class MLP_3linear_wD_momentum(Module):
def __init__(self, num_classes=10, hidden_layers_sizes = (64, 64),\
weight_decays = (0, 0, 0), momentum = 0):
super(MLP_3linear_wD_momentum, self).__init__()
size1, size2 = hidden_layers_sizes
self.fc1 = Linear(784, size1)
self.relu1 = ReLU()
self.fc2 = Linear(size1, size2)
self.fc3 = Linear(size2, 10)
self.weight_decays = weight_decays
self.momentum = momentum
def forward(self, x):
x = self.fc1.forward(x)
x = self.relu1.forward(x)
x = self.fc2.forward(x)
x = self.fc3.forward(x)
return x
def backward(self, x, gradient):
gradient = self.fc3.backward(self.fc2.output,gradient)
gradient = self.fc2.backward(self.relu1.output,gradient)
gradient = self.relu1.backward(self.fc1.output,gradient)
gradient = self.fc1.backward(x,gradient)
return gradient
def gradientStep(self,lr):
wd1, wd2, wd3 = self.weight_decays
mom = self.momentum
self.fc3.gradientStep(lr, weight_decay=wd1, momentum=mom)
self.fc2.gradientStep(lr, weight_decay=wd2, momentum=mom)
self.fc1.gradientStep(lr, weight_decay=wd3, momentum=mom)
return True
display(Markdown("### Momentum"))
batch_size = 16
learning_rate = 0.01
total_nb_batches = 260
every_n_iterations = 10
nb_epochs = 1
indices = {}
all_indices = np.arange(total_nb_batches*batch_size).reshape([-1, 16])
indices['validation'], indices['train'] = all_indices[:total_nb_batches//2, :], all_indices[total_nb_batches//2:,:]
momenta = [0, 0.2, 0.4, 0.6, 0.7, 0.8, 0.85, 0.9, 0.95]
colors = {}
colors['train'] = colorscale_list('Reds', len(momenta)+3, return_rgb_only=True)
colors['validation'] = colorscale_list('Greens', len(momenta)+3, return_rgb_only=True)
# Plotting losses
traces_mom = {}
traces_mom['loss'] = []
traces_mom['accuracy'] = []
for i, mom in enumerate(momenta):
mlp = MLP_3linear_wD_momentum(momentum=mom)
lsc = LeastSquareCriterion()
Loss, Accuracy = optimized_training_procedure(mlp, lsc, indices, nb_epochs=nb_epochs, batch_size=batch_size,\
total_nb_batches=total_nb_batches, learning_rate=learning_rate,\
every_n_iterations=every_n_iterations)
for train_valid in ['train', 'validation']:
loss = Loss[train_valid][-1]
traces_mom['loss'].append(
go.Scatter(
x = [i*batch_size*every_n_iterations for i in range(1, len(loss)+1)],
y = np.array(loss),
mode = 'lines',
name = '{} loss (epoch {}): momentum = {}'.format(train_valid.capitalize(), nb_epochs, mom),
line = dict(
width = 3,
color = colors[train_valid][i+2],
shape = 'spline',
dash = 'solid'
),
hoverlabel = dict(
namelength = -1
)
)
)
accuracy = Accuracy[train_valid][-1]
traces_mom['accuracy'].append(
go.Scatter(
x = [i*batch_size*every_n_iterations for i in range(1, len(accuracy)+1)],
y = np.array([acc/((i+1)*batch_size) for i, acc in enumerate(accuracy)]),
mode = 'lines',
name = '{} accuracy (epoch {}): momentum = {}'.format(train_valid.capitalize(), nb_epochs, mom),
line = dict(
width = 3,
color = colors[train_valid][i+2],
shape = 'spline',
dash = 'solid'
),
hoverlabel = dict(
namelength = -1
)
)
)
plotly.offline.iplot(go.Figure(data=traces_mom['loss'], layout=layout('loss', end_title=' for various momenta')))
plotly.offline.iplot(go.Figure(data=traces_mom['accuracy'], layout=layout('accuracy', end_title=' for various momenta')))
Up to a momentum of about 0.8: the bigger the momentum, the more accelerated the training phase (the loss decreases and the accuracy increases faster).
Beyond 0.8: it backfires; the loss and the accuracy are not as good (the bigger the momentum, the worse they are).
class L1Criterion(Module):
"""
    This implementation of the L1 loss assumes that the data comes as a 2-dimensional array
    of size (batch_size, num_classes) and the labels as a vector of size (batch_size)
"""
def __init__(self, num_classes=10):
super(L1Criterion, self).__init__()
self.num_classes=num_classes
def forward(self, x,labels):
target=np.zeros([x.shape[0],self.num_classes])
for i in range(x.shape[0]):
target[i,labels[i]]=1
self.output = np.sum(np.abs(target-x),axis=0)
return np.sum(self.output)
def deriv(x):
if x > 0:
return 1.
elif x < 0:
return -1.
else:
return 0.
deriv = np.vectorize(deriv)
def backward(self, x, labels):
target=np.zeros([x.shape[0],self.num_classes])
for i in range(x.shape[0]):
target[i,labels[i]]=1
self.gradInput=self.deriv(x-target)
return self.gradInput
class CrossEntropyCriterion(Module):
"""
    This implementation of the Cross-Entropy loss assumes that the data comes as a 2-dimensional array
    of size (batch_size, num_classes) and the labels as a vector of size (batch_size)
"""
def __init__(self, num_classes=10):
super(CrossEntropyCriterion, self).__init__()
self.num_classes=num_classes
def sigmoid(x):
return 1./(1+np.exp(-x))
sigmoid = np.vectorize(sigmoid)
def forward(self, x,labels):
target=np.zeros([x.shape[0],self.num_classes])
for i in range(x.shape[0]):
target[i,labels[i]]=1
        # Negative log-likelihood, so that the reported loss decreases during training
        # (consistent with the gradient sigmoid(x) - target returned by backward)
        self.output = -np.sum(target*np.log(self.sigmoid(x)) \
                              + (1-target)*np.log(1-self.sigmoid(x)), axis=0)
        return np.sum(self.output)
def backward(self, x, labels):
target=np.zeros([x.shape[0],self.num_classes])
for i in range(x.shape[0]):
target[i,labels[i]]=1
self.gradInput=self.sigmoid(x)-target
return self.gradInput
batch_size = 16
learning_rate = 0.01
total_nb_batches = 1000
every_n_iterations = 20
nb_epochs = 21
epoch_steps = 10
indices = {}
all_indices = np.random.randint(len(train_data), size=(total_nb_batches, 16))
indices['validation'], indices['train'] = all_indices[:total_nb_batches//2, :], all_indices[total_nb_batches//2:,:]
Losses = [LeastSquareCriterion(), L1Criterion(), CrossEntropyCriterion()]
Losses_names = ['Least Square', 'L1', 'Cross-Entropy']
Losses_colors = [('Greens', 'Reds'), ('Blues', 'Purples'), ('spring', 'Oranges')]
# Plotting losses and accuracies
traces_losses = {}
for loss_name in Losses_names:
traces_losses[loss_name] = {}
for loss, loss_name, loss_colors in zip(Losses, Losses_names, Losses_colors):
mlp = MLP()
Loss, Accuracy = optimized_training_procedure(mlp, loss, indices, nb_epochs=nb_epochs, batch_size=batch_size,\
total_nb_batches=total_nb_batches, learning_rate=learning_rate,\
every_n_iterations=every_n_iterations)
colors = {}
colors['loss'], colors['accuracy'] = {}, {}
for loss_acc, Loss_Acc in zip(['loss', 'accuracy'], [Loss, Accuracy]):
colors[loss_acc]['train'] = colorscale_list(loss_colors[1], len(Loss_Acc['train'])//epoch_steps+3, return_rgb_only=True)
colors[loss_acc]['validation'] = colorscale_list(loss_colors[0], len(Loss_Acc['train'])//epoch_steps+3, return_rgb_only=True)
traces_losses[loss_name]['loss'] = []
traces_losses[loss_name]['accuracy'] = []
for train_valid in ['train', 'validation']:
for i, loss in enumerate(Loss[train_valid]):
if i%epoch_steps==0:
traces_losses[loss_name]['loss'].append(
go.Scatter(
x = [i*batch_size*every_n_iterations for i in range(1, len(loss)+1)],
y = np.array(loss),
mode = 'lines',
name = '[{}] {} loss: epoch {}'.format(loss_name, train_valid.capitalize(), i+1),
line = dict(
width = 3,
color = colors['loss'][train_valid][i//epoch_steps+2],
shape = 'spline',
dash = 'solid' if train_valid=='train' else 'longdashdot'
),
hoverlabel = dict(
namelength = -1
)
)
)
for i, accuracy in enumerate(Accuracy[train_valid]):
if i%epoch_steps==0:
traces_losses[loss_name]['accuracy'].append(
go.Scatter(
x = [i*batch_size*every_n_iterations for i in range(1, len(accuracy)+1)],
y = np.array([acc/((i+1)*batch_size) for i, acc in enumerate(accuracy)]),
mode = 'lines',
name = '[{}] {} accuracy: epoch {}'.format(loss_name, train_valid.capitalize(), i+1),
line = dict(
width = 3,
color = colors['accuracy'][train_valid][i//epoch_steps+2],
shape = 'spline',
dash = 'solid' if train_valid=='train' else 'longdashdot'
),
hoverlabel = dict(
namelength = -1
)
)
)
names_1 = ['Least Square', 'L1']
names_2 = ['L1', 'Cross-Entropy']
names_3 = ['Least Square', 'Cross-Entropy']
for names in [names_1, names_2, names_3]:
for loss_acc in ['loss', 'accuracy']:
plotly.offline.iplot(go.Figure(data=[t for name in names for t in traces_losses[name][loss_acc]],\
layout=layout(loss_acc, end_title=': {} vs {}'\
.format(names[0].capitalize(), names[1].capitalize()))))
L1 ends up with a very bad accuracy of $\sim 0.2$, owing to its derivatives being almost always either $-1$ or $1$.
Therefore, even when a digit is classified almost exactly, the penalty (and the gradient magnitude) remains the same, whereas with losses like least squares or cross-entropy, the closer the prediction, the smaller the penalty.
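Concretely, for a single output $x$ and target $t$, $\frac{\partial}{\partial x}\,\lvert x - t\rvert = \mathrm{sign}(x - t)$ stays at $\pm 1$ however close $x$ is to $t$, whereas $\frac{\partial}{\partial x}\,(x - t)^2 = 2\,(x - t)$ vanishes as the prediction approaches the target.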
class Tanh(Module):
def __init__(self, bias=True):
super(Tanh, self).__init__()
def forward(self, x):
self.output = np.tanh(x)
return self.output
def backward(self, x, gradOutput):
self.gradInput=(1 - np.tanh(x)**2)*gradOutput
return self.gradInput
class Sigmoid(Module):
def __init__(self, bias=True):
super(Sigmoid, self).__init__()
def sigmoid(x):
return 1./(1+np.exp(-x))
sigmoid = np.vectorize(sigmoid)
def forward(self, x):
self.output = self.sigmoid(x)
return self.output
def backward(self, x, gradOutput):
self.gradInput = self.sigmoid(x)*(1 - self.sigmoid(x))*gradOutput
return self.gradInput
class MLP_tanh(Module):
def __init__(self, num_classes=10, hidden_layer_size = 64):
super(MLP_tanh, self).__init__()
self.fc1 = Linear(784, hidden_layer_size)
self.tanh1 = Tanh()
self.fc2 = Linear(hidden_layer_size, 10)
def forward(self, x):
x = self.fc1.forward(x)
x = self.tanh1.forward(x)
x = self.fc2.forward(x)
return x
def backward(self, x, gradient):
gradient = self.fc2.backward(self.tanh1.output,gradient)
gradient = self.tanh1.backward(self.fc1.output,gradient)
gradient = self.fc1.backward(x,gradient)
return gradient
def gradientStep(self,lr):
self.fc2.gradientStep(lr)
self.fc1.gradientStep(lr)
return True
class MLP_sigmoid(Module):
def __init__(self, num_classes=10, hidden_layer_size = 64):
super(MLP_sigmoid, self).__init__()
self.fc1 = Linear(784, hidden_layer_size)
self.sig1 = Sigmoid()
self.fc2 = Linear(hidden_layer_size, 10)
def forward(self, x):
x = self.fc1.forward(x)
x = self.sig1.forward(x)
x = self.fc2.forward(x)
return x
def backward(self, x, gradient):
gradient = self.fc2.backward(self.sig1.output,gradient)
gradient = self.sig1.backward(self.fc1.output,gradient)
gradient = self.fc1.backward(x,gradient)
return gradient
def gradientStep(self,lr):
self.fc2.gradientStep(lr)
self.fc1.gradientStep(lr)
return True
batch_size = 16
learning_rate = 0.01
total_nb_batches = 1000
every_n_iterations = 20
nb_epochs = 21
epoch_steps = 10
indices = {}
all_indices = np.random.randint(len(train_data), size=(total_nb_batches, 16))
indices['validation'], indices['train'] = all_indices[:total_nb_batches//2, :], all_indices[total_nb_batches//2:,:]
MLPs = [MLP(), MLP_tanh(), MLP_sigmoid()]
MLP_names = ['ReLU', 'Hyperbolic Tangent', 'Sigmoid']
MLP_colors = [('Greens', 'Reds'), ('Blues', 'Purples'), ('spring', 'Oranges')]
# Plotting losses and accuracies
traces_nl = {}
for mlp_name in MLP_names:
traces_nl[mlp_name] = {}
for mlp, mlp_name, mlp_colors in zip(MLPs, MLP_names, MLP_colors):
lsc = LeastSquareCriterion()
Loss, Accuracy = optimized_training_procedure(mlp, lsc, indices, nb_epochs=nb_epochs, batch_size=batch_size,\
total_nb_batches=total_nb_batches, learning_rate=learning_rate,\
every_n_iterations=every_n_iterations)
colors = {}
colors['loss'], colors['accuracy'] = {}, {}
for loss_acc, Loss_Acc in zip(['loss', 'accuracy'], [Loss, Accuracy]):
colors[loss_acc]['train'] = colorscale_list(mlp_colors[1], len(Loss_Acc['train'])//epoch_steps+3, return_rgb_only=True)
colors[loss_acc]['validation'] = colorscale_list(mlp_colors[0], len(Loss_Acc['train'])//epoch_steps+3, return_rgb_only=True)
traces_nl[mlp_name]['loss'] = []
traces_nl[mlp_name]['accuracy'] = []
for train_valid in ['train', 'validation']:
for i, loss in enumerate(Loss[train_valid]):
if i%epoch_steps==0:
traces_nl[mlp_name]['loss'].append(
go.Scatter(
x = [i*batch_size*every_n_iterations for i in range(1, len(loss)+1)],
y = np.array(loss),
mode = 'lines',
name = '[{}] {} loss: epoch {}'.format(mlp_name, train_valid.capitalize(), i+1),
line = dict(
width = 3,
color = colors['loss'][train_valid][i//epoch_steps+2],
shape = 'spline',
dash = 'solid' if train_valid=='train' else 'longdashdot'
),
hoverlabel = dict(
namelength = -1
)
)
)
for i, accuracy in enumerate(Accuracy[train_valid]):
if i%epoch_steps==0:
traces_nl[mlp_name]['accuracy'].append(
go.Scatter(
x = [i*batch_size*every_n_iterations for i in range(1, len(accuracy)+1)],
y = np.array([acc/((i+1)*batch_size) for i, acc in enumerate(accuracy)]),
mode = 'lines',
name = '[{}] {} accuracy: epoch {}'.format(mlp_name, train_valid.capitalize(), i+1),
line = dict(
width = 3,
color = colors['accuracy'][train_valid][i//epoch_steps+2],
shape = 'spline',
dash = 'solid' if train_valid=='train' else 'longdashdot'
),
hoverlabel = dict(
namelength = -1
)
)
)
names_1 = ['ReLU', 'Hyperbolic Tangent']
names_2 = ['Hyperbolic Tangent', 'Sigmoid']
names_3 = ['ReLU', 'Sigmoid']
for names in [names_1, names_2, names_3]:
for loss_acc in ['loss', 'accuracy']:
plotly.offline.iplot(go.Figure(data=[t for name in names for t in traces_nl[name][loss_acc]],\
layout=layout(loss_acc, end_title=': {} vs {}'\
.format(names[0].capitalize(), names[1].capitalize()))))