import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import time
%matplotlib inline
plt.rcParams['figure.figsize'] = (5.0, 4.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'
We use data from the Kaggle competition for digit recognition taken from MNIST. https://www.kaggle.com/c/digit-recognizer
Each image is 28 pixels in height and 28 pixels in width, for a total of 784 pixels. Each pixel has a single pixel value indicating its lightness or darkness, with higher numbers meaning darker. This pixel value is an integer between 0 and 255, inclusive.
train = pd.read_csv("./mnist_kaggle_data/train.csv")
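As a quick sanity check (assuming the standard Kaggle format, where the first column is the label and the remaining 784 columns are pixel values), we can confirm the shape of the data and the pixel range:
print(train.shape)  # expect (42000, 785): one label column plus 784 pixel columns
print(train.iloc[:, 1:].values.min(), train.iloc[:, 1:].values.max())  # pixel values should lie between 0 and 255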
First, we view some digits to understand the data.
def plot_digit(index):
    im = train.iloc[index, 1:]
    digit = train.iloc[index, 0]
    im_reshape = im.values.reshape(28, 28)
    plt.imshow(im_reshape, cmap='Greys')
    plt.title("The label is: " + str(digit))
plot_digit(20)
Next, we reshape the data such that X is a matrix with dimensions (# of features, # of training examples) and Y is a matrix with dimensions (# of classes, # of training examples), following the setup of Andrew Ng in his deep learning course. This involves transposing X and one-hot encoding Y.
# Take in an array with classes 0, 1, ..., n and m elements
# Returns a one-hot encoded matrix of shape (# of classes, m)
def one_hot_array(Y):
    b = np.zeros((Y.size, Y.max() + 1))
    b[np.arange(Y.size), Y] = 1
    return b.T
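As a small illustration, a toy label array with three classes gives a (3, 4) one-hot matrix, with one column per example:
one_hot_array(np.array([0, 2, 1, 2]))
# array([[1., 0., 0., 0.],
#        [0., 0., 1., 0.],
#        [0., 1., 0., 1.]])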
X = train.iloc[:, 1:].values.T
Y = train.iloc[:, 0]
Y_onehot = one_hot_array(Y.values)
print("Shape of X is: ", str(X.shape))
print("Shape of Y is: ", str(Y_onehot.shape))
Next, we take a subset of the data for training and testing. We use the first 5,000 examples for training and the next 5,000 examples for testing, both to reduce training time and to show that we don't need gigabytes of data for a neural network to work.
X_train = X[:, 0:5000]
X_test = X[:, 5000:10000]
Y_train = Y_onehot[:, 0:5000]
Y_test = Y_onehot[:, 5000:10000]
print("Shape of X_train is: " + str(X_train.shape))
print("Shape of X_test is: " + str(X_test.shape))
print("Shape of Y_train is: " + str(Y_train.shape))
print("Shape of Y_test is: " + str(Y_test.shape))
These are functions we will use to construct the deep learning model.
def initialize_parameters_deep(layers_dims):
    return parameters

def L_model_forward(X, parameters):
    return AL, caches

def compute_cost(AL, Y):
    return cost

def L_model_backward(AL, Y, caches):
    return grads

def update_parameters(parameters, grads, learning_rate):
    return parameters
initialize_parameters_deep sets up $W^{[l]}$ and $b^{[l]}$ for each layer of the neural network (where $l$ denotes the $l^{th}$ layer). These are the parameters the neural network will optimise. Note that the dimensions of the first (input) layer must match the number of features in the input data, and the last (output) layer must match the number of classes of Y, in this case 10.
def initialize_parameters_deep(layer_dims):
    np.random.seed(3)
    parameters = {}
    L = len(layer_dims)  # number of layers in the network (including the input layer)
    for l in range(1, L):
        parameters['W' + str(l)] = np.random.randn(layer_dims[l], layer_dims[l-1]) * 0.01
        assert(parameters['W' + str(l)].shape == (layer_dims[l], layer_dims[l-1]))
        parameters['b' + str(l)] = np.zeros((layer_dims[l], 1))
        assert(parameters['b' + str(l)].shape == (layer_dims[l], 1))
    return parameters
parameters = initialize_parameters_deep([784, 4, 3, 2, 10])
parameters.keys()
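We can also inspect the shapes to confirm that each $W^{[l]}$ has shape (layer_dims[l], layer_dims[l-1]) and each $b^{[l]}$ has shape (layer_dims[l], 1):
{key: value.shape for key, value in parameters.items()}
# {'W1': (4, 784), 'b1': (4, 1), 'W2': (3, 4), 'b2': (3, 1), 'W3': (2, 3), 'b3': (2, 1), 'W4': (10, 2), 'b4': (10, 1)}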
softmax, sigmoid, and relu are common activation functions; we implement simple versions of each below.
# For softmax, Z should be a (# of classes, m) matrix
# Recall that axis=0 is column sum, while axis=1 is row sum
def softmax(Z):
    t = np.exp(Z)
    t = t / t.sum(axis=0, keepdims=True)
    return t

def sigmoid(Z):
    A = 1 / (1 + np.exp(-Z))
    return A

def relu(Z):
    A = np.maximum(0, Z)
    assert(A.shape == Z.shape)
    return A
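A quick check that softmax outputs valid probability distributions: each column should sum to 1.
softmax(np.array([[1., 2.], [3., 4.]])).sum(axis=0)  # array([1., 1.])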
linear_activation_forward implements forward propagation for one layer of the neural network. It first calculates the linear step $Z = W \cdot A\_prev + b$, then the activation $A = g(Z)$, where $g$ is the non-linear activation function for that layer.
def linear_activation_forward(A_prev, W, b, activation):
    if activation == "sigmoid":
        Z = np.dot(W, A_prev) + b
        A = sigmoid(Z)
    elif activation == "relu":
        Z = np.dot(W, A_prev) + b
        A = relu(Z)
    elif activation == "softmax":
        Z = np.dot(W, A_prev) + b
        A = softmax(Z)
    # Some assertions to check that shapes are right
    assert(Z.shape == (W.shape[0], A.shape[1]))
    assert(A.shape == (W.shape[0], A_prev.shape[1]))
    # Cache the necessary values for back propagation later
    cache = (A_prev, W, b, Z)
    return A, cache
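As a minimal sketch with small random matrices (shapes chosen arbitrarily here for illustration), we can check that the output A has shape (# of units in this layer, # of examples):
A_tmp, cache_tmp = linear_activation_forward(np.random.randn(5, 3), np.random.randn(4, 5), np.zeros((4, 1)), activation="relu")
A_tmp.shape  # (4, 3): 4 units in this layer, 3 examples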
Having defined forward propagation for one layer, we now loop through all the layers with L_model_forward. The first L-1 layers use the relu activation function, while the final layer uses softmax, since this is a classification problem.
def L_model_forward(X, parameters):
    caches = []
    A = X
    L = len(parameters) // 2  # number of layers in the network (excluding the input layer)
    # Layers 1 to L-1 use relu
    for l in range(1, L):
        A_prev = A
        A, cache = linear_activation_forward(A_prev, parameters["W" + str(l)], parameters["b" + str(l)], activation="relu")
        caches.append(cache)
    # Output layer L uses softmax
    AL, cache = linear_activation_forward(A, parameters["W" + str(L)], parameters["b" + str(L)], activation="softmax")
    caches.append(cache)
    assert(AL.shape == (10, X.shape[1]))
    return AL, caches
L_model_forward completes a single pass of forward propagation from the input layer through the network of weights, computing the prediction $\hat{Y}$, a matrix of dimensions (# of classes, # of training examples), here (10, 5000). This is all we need for forward propagation.
Looking at the first 5 examples from our forward propagation, we can see that it outputs around 10% probability for each class. This is because we haven't started training the model yet, so random initialization gives each class roughly equal probability.
AL, caches = L_model_forward(X_train, parameters)
pd.DataFrame(AL[:, 0:5])
We can now compute the cost $J$ after a pass of forward propagation. Tracking the cost at each iteration (here, one full pass of forward/backward propagation over the training set) shows whether the model is making progress, i.e. whether the cost is decreasing. The cost for softmax activation is as follows (summing over all the n classes of Y and all the m examples): $$ J = -\frac{1}{m} \sum_{i=1}^{m} \sum_{j=1}^{n} y_j^{(i)} \log a_j^{[L](i)} $$
def compute_cost(AL, Y):
    m = Y.shape[1]
    cost = -1/m * np.sum(np.multiply(Y, np.log(AL)))
    cost = np.squeeze(cost)  # To coerce data from [[17]] into 17
    assert(cost.shape == ())
    return cost
compute_cost(AL, Y_train) # initial cost of 2.3
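This matches what we expect before training: with all ten classes at roughly 10% probability, the cross-entropy cost is about $-\log(0.1) = \log(10) \approx 2.30$.
np.log(10)  # approximately 2.3026, the cost of a uniform 10-class prediction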
Assuming that we have $dZ$, linear_backward calculates $dW$, $db$ and $dA\_prev$ for the backward pass.
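These are the standard expressions implemented below, following directly from $Z^{[l]} = W^{[l]} A^{[l-1]} + b^{[l]}$:
$$ dW^{[l]} = \frac{1}{m} \, dZ^{[l]} A^{[l-1]T}, \qquad db^{[l]} = \frac{1}{m} \sum_{i=1}^{m} dZ^{[l](i)}, \qquad dA^{[l-1]} = W^{[l]T} dZ^{[l]} $$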
def linear_backward(dZ, A_prev, W, b):
    m = A_prev.shape[1]
    dW = 1/m * np.dot(dZ, A_prev.T)
    db = 1/m * np.sum(dZ, axis=1, keepdims=True)
    dA_prev = np.dot(W.T, dZ)
    assert(dA_prev.shape == A_prev.shape)
    assert(dW.shape == W.shape)
    assert(db.shape == b.shape)
    return dA_prev, dW, db
relu_backward and softmax_backward compute the backward propagation through a single layer. relu_backward takes dA and the layer's cache, while softmax_backward takes AL, Y and the cache; both output dA_prev, dW and db:
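In both cases the key step is computing $dZ$. For relu, the derivative of the activation is 1 where $Z > 0$ and 0 otherwise; for the softmax output layer combined with the cross-entropy cost, the expression simplifies neatly:
$$ dZ^{[l]} = dA^{[l]} * \mathbb{1}(Z^{[l]} > 0) \quad \text{(relu)}, \qquad dZ^{[L]} = A^{[L]} - Y \quad \text{(softmax + cross-entropy)} $$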
def relu_backward(dA, cache):
    A_prev, W, b, Z = cache
    # Compute dZ: the relu derivative is 1 where Z > 0 and 0 otherwise
    dZ = np.array(dA, copy=True)  # copy dA so we don't modify it in place
    dZ[Z <= 0] = 0
    assert(dZ.shape == Z.shape)
    # Compute dA_prev, dW, db
    dA_prev, dW, db = linear_backward(dZ, A_prev, W, b)
    return dA_prev, dW, db

def softmax_backward(AL, Y, cache):
    A_prev, W, b, Z = cache
    # Compute dZ: for softmax with cross-entropy cost, dZ simplifies to AL - Y
    dZ = AL - Y
    # Compute dA_prev, dW, db
    dA_prev, dW, db = linear_backward(dZ, A_prev, W, b)
    return dA_prev, dW, db
L_model_backward iterates backward from the output layer L through layer 1 for the full backward propagation. It returns the gradients (e.g. $dW$) for each layer, which will be used to update the parameters.
def L_model_backward(AL, Y, caches):
    grads = {}
    L = len(caches)  # the number of layers
    m = AL.shape[1]
    Y = Y.reshape(AL.shape)  # after this line, Y is the same shape as AL
    # Backpropagation at the output layer L (softmax activation)
    # Its cache is at index L-1 since caches is 0-indexed
    current_cache = caches[L-1]
    grads["dA" + str(L-1)], grads["dW" + str(L)], grads["db" + str(L)] = softmax_backward(AL, Y, current_cache)
    # Backpropagation through layers L-1 down to 1 (relu activation)
    for l in reversed(range(L-1)):
        current_cache = caches[l]
        dA_prev_temp, dW_temp, db_temp = relu_backward(grads["dA" + str(l+1)], current_cache)
        grads["dA" + str(l)] = dA_prev_temp
        grads["dW" + str(l + 1)] = dW_temp
        grads["db" + str(l + 1)] = db_temp
    return grads
grads = L_model_backward(AL, Y_train, caches)
grads.keys()
With the gradients computed using backpropagation, we can now update the parameters of the model, using gradient descent:
$$ W^{[l]} = W^{[l]} - \alpha \text{ } dW^{[l]}$$ $$ b^{[l]} = b^{[l]} - \alpha \text{ } db^{[l]}$$
where $\alpha$ is the learning rate.
def update_parameters(parameters, grads, learning_rate):
    L = len(parameters) // 2  # number of layers in the neural network
    for l in range(L):
        parameters["W" + str(l+1)] = parameters["W" + str(l+1)] - learning_rate * grads["dW" + str(l+1)]
        parameters["b" + str(l+1)] = parameters["b" + str(l+1)] - learning_rate * grads["db" + str(l+1)]
    return parameters
Inspecting the predicted probabilities after one parameter update, we can see that the parameters have changed slightly. We can keep running this repeatedly to see the parameters adjust.
parameters = update_parameters(parameters, grads, learning_rate=0.0075)
AL, caches = L_model_forward(X_train, parameters)
pd.DataFrame(AL[:, 0:5])
We can now combine the forward propagation, cost computation, backward propagation, and finally update parameters into a single function L_layer_model that will repeat these steps over and over again to update the weights using gradient descent.
def L_layer_model(X, Y, layers_dims, learning_rate=0.0075, num_iterations=3000, print_cost=False):
    np.random.seed(1)
    costs = []
    # Step a: Initialise parameters
    parameters = initialize_parameters_deep(layers_dims)
    # Iterative loops of gradient descent
    for i in range(0, num_iterations):
        # Step b: Forward propagation
        AL, caches = L_model_forward(X, parameters)
        # Step c: Compute cost
        cost = compute_cost(AL, Y)
        # Step d: Backward propagation
        grads = L_model_backward(AL, Y, caches)
        # Step e: Update parameters
        parameters = update_parameters(parameters, grads, learning_rate)
        # Print the cost every 100 iterations, and record it every 10
        if print_cost and i % 100 == 0:
            print("Cost after iteration %i: %f" % (i, cost))
        if print_cost and i % 10 == 0:
            costs.append(cost)
    # Plot the cost
    plt.plot(np.squeeze(costs))
    plt.ylabel('cost')
    plt.xlabel('iterations (per tens)')
    plt.title("Learning rate = " + str(learning_rate))
    plt.show()
    return parameters
We also implement a predict function, which takes in the parameters and predicts the labels for a test set, returning the predictions (one-hot coded as 1s and 0s) and the probabilities (real values between 0 and 1). We then implement the evaluate_prediction function, which takes our predictions and the actual labels Y and returns the classification accuracy.
def predict(X, parameters):
    # Forward propagation
    probabilities, caches = L_model_forward(X, parameters)
    # Calculate predictions (the highest probability for a given example is coded as 1, otherwise 0)
    predictions = (probabilities == np.amax(probabilities, axis=0, keepdims=True))
    predictions = predictions.astype(float)
    return predictions, probabilities

def evaluate_prediction(predictions, Y):
    m = Y.shape[1]
    predictions_class = predictions.argmax(axis=0).reshape(1, m)
    Y_class = Y.argmax(axis=0).reshape(1, m)
    return np.sum((predictions_class == Y_class) / m)
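As a small illustration with toy one-hot matrices (2 classes, 3 examples), two of the three predictions match the labels, giving an accuracy of about 0.67:
toy_pred = np.array([[1., 0., 1.], [0., 1., 0.]])
toy_Y = np.array([[1., 0., 0.], [0., 1., 1.]])
evaluate_prediction(toy_pred, toy_Y)  # 0.666...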
We can now test it out, training our classifier and then predicting it on an out of sample test set.
layers_dims = [784, 10, 10]
parameters = L_layer_model(X_train, Y_train, layers_dims, learning_rate = 0.001, num_iterations = 500, print_cost=True)
pred_train, probs_train = predict(X_train, parameters)
print("Train set error is: " + str(evaluate_prediction(pred_train, Y_train)))
pred_test, probs_test = predict(X_test, parameters)
print("Test set error is: " + str(evaluate_prediction(pred_test, Y_test)))
We see that we can achieve 89% accuracy with a fairly simple neural network! The test accuracy is lower than the train accuracy, suggesting that we are overfitting a little.
Next, we can test our model out by submitting to Kaggle and seeing how we perform on the digit-recognizer task.
test = pd.read_csv("./mnist_kaggle_data/test.csv")
X_kaggle = test.values.T
predictions, probabilities = predict(X_kaggle, parameters)
kaggle_submission = pd.DataFrame({"Label" : predictions.argmax(axis=0)})
kaggle_submission['ImageId'] = kaggle_submission.index + 1
kaggle_submission.to_csv("./mnist_kaggle_data/submission.csv", index=False)
As expected, Kaggle test accuracy is 88.9% with this simple neural network (very similar to the accuracy on our held-out set above). We'll try to improve our accuracy in the next section.
Let's make some minor adjustments: train on the full dataset with a random train/dev split, widen the hidden layer, and track both training and dev accuracy as training progresses.
dev_index = np.random.choice(42000, size=2000, replace=False)
train_index = np.setdiff1d(np.arange(42000), dev_index)
X_train = X[:, train_index]
Y_train = Y_onehot[:, train_index]
X_dev = X[:, dev_index]
Y_dev = Y_onehot[:, dev_index]
def L_layer_model_cv(X_train, Y_train, X_dev, Y_dev,
                     layers_dims, learning_rate=0.0075, num_iterations=3000, print_cost=False):
    np.random.seed(1)
    costs = {"cost_train": [], "cost_dev": []}
    accuracies = {"acc_train": [], "acc_dev": []}
    t0 = time.time()
    # Step a: Initialise parameters
    parameters = initialize_parameters_deep(layers_dims)
    # Iterative loops of gradient descent
    for i in range(0, num_iterations):
        # Step b: Forward propagation
        AL, caches = L_model_forward(X_train, parameters)
        AL_dev, caches_dev = L_model_forward(X_dev, parameters)
        # Step c: Compute costs and accuracies every 10 iterations
        if print_cost and i % 10 == 0:
            cost = compute_cost(AL, Y_train)
            cost_dev = compute_cost(AL_dev, Y_dev)
            costs["cost_train"].append(cost)
            costs["cost_dev"].append(cost_dev)
            pred_train, prob_train = predict(X_train, parameters)
            acc_train = evaluate_prediction(pred_train, Y_train)
            pred_dev, prob_dev = predict(X_dev, parameters)
            acc_dev = evaluate_prediction(pred_dev, Y_dev)
            accuracies["acc_train"].append(acc_train)
            accuracies["acc_dev"].append(acc_dev)
        # Step d: Backward propagation
        grads = L_model_backward(AL, Y_train, caches)
        # Step e: Update parameters
        parameters = update_parameters(parameters, grads, learning_rate)
        # Print the cost every 100 iterations
        if print_cost and i % 100 == 0:
            print("Training cost after iteration %i: %f" % (i, cost))
    # Plot the train and dev accuracies
    plt.plot(np.squeeze(accuracies["acc_train"]))
    plt.plot(np.squeeze(accuracies["acc_dev"]))
    plt.ylabel('Accuracy')
    plt.xlabel('iterations (per tens)')
    plt.title("Learning rate = " + str(learning_rate))
    plt.show()
    # Print final accuracies
    pred_train, probs_train = predict(X_train, parameters)
    print("Train set accuracy is: " + str(evaluate_prediction(pred_train, Y_train)))
    pred_dev, probs_dev = predict(X_dev, parameters)
    print("Dev set accuracy is: " + str(evaluate_prediction(pred_dev, Y_dev)))
    t1 = time.time()
    print("Elapsed time of: " + str((t1 - t0) / 60) + " minutes.")
    return parameters
layers_dims = [784, 785, 10]
parameters = L_layer_model_cv(X_train, Y_train, X_dev, Y_dev,
layers_dims, learning_rate = 0.002, num_iterations = 2000, print_cost=True)
From the plot of training set accuracy vs development set accuracy, we do see a gradual divergence as we run more iterations. The interesting thing is that dev set accuracy does not drop, but seems to taper off. This suggests that we are not overfitting too much and do not need to do early stopping.
test = pd.read_csv("./mnist_kaggle_data/test.csv")
X_kaggle = test.values.T
predictions, probabilities = predict(X_kaggle, parameters)
kaggle_submission = pd.DataFrame({"Label" : predictions.argmax(axis=0)})
kaggle_submission['ImageId'] = kaggle_submission.index + 1
kaggle_submission.to_csv("./mnist_kaggle_data/submission.csv", index=False)
After submitting this to Kaggle, we get 96.7% accuracy. That's pretty good!