\(\newcommand{\vect}[1]{\boldsymbol{#1}}\) \(\newcommand{\transp}{^{\text{T}}}\) \(\newcommand{\mat}[1]{\boldsymbol{\mathcal{#1}}}\) \(\newcommand{\sign}{\text{sign}}\)
In this practical we will carry out some basic EDA and analysis of some popular ML Datasets that will be used in the course.
install.packages("keras")
library(keras)
fashion <- dataset_fashion_mnist()
class(fashion$train)
[1] "list"
names(fashion$train)
[1] "x" "y"
class(fashion$train$x)
[1] "array"
class(fashion$train$y)
[1] "array"
dim(fashion$train$x)
[1] 60000 28 28
# Rotate a matrix 90 degrees clockwise so image() draws it in reading
# orientation: reverse the row order, then transpose.
rotate <- function(x) {
  flipped <- x[rev(seq_len(nrow(x))), , drop = FALSE]
  t(flipped)
}
# Function to plot image from a matrix x
# Plot a greyscale image from a pixel matrix.
#
# x           matrix of pixel intensities (e.g. a 28x28 MNIST image)
# title       plot title text
# title.color colour used for the title
#
# Called for its side effect (draws a plot on the current device).
plot_image <- function(x, title = "", title.color = "black") {
  # rotate() flips the matrix so image() renders it upright
  image(rotate(x), axes = FALSE,
        # spell out length.out (was `length =`, which relies on
        # partial argument matching)
        col = grey(seq(0, 1, length.out = 255)),
        main = list(title, col = title.color))
}
plot_image(fashion$train$x[1,,])
- Is it a shoe?
Class labels are stored in the \(y\) outcomes but coded from 0 to 9
clothes.labels <-c( "T-shirt/top", "Trouser", "Pullover", "Dress", "Coat",
"Sandal", "Shirt", "Sneaker", "Bag", "Ankle boot")
clothes.labels[as.numeric(fashion$train$y[1])+1]
[1] "Ankle boot"
plot_image(fashion$train$x[1,,],clothes.labels[as.numeric(fashion$train$y[1])+1])
table(fashion$train$y)
0 1 2 3 4 5 6 7 8 9
6000 6000 6000 6000 6000 6000 6000 6000 6000 6000
0 1 2 3 4 5 6 7 8 9
1000 1000 1000 1000 1000 1000 1000 1000 1000 1000
MNIST <- dataset_mnist()
dim(MNIST$train$x)
[1] 60000 28 28
dim(MNIST$train$y)
[1] 60000
dim(MNIST$test$x)
[1] 10000 28 28
dim(MNIST$test$y)
[1] 10000
# Scale pixel intensities from integer [0, 255] down to [0, 1].
MNIST$train$x <- MNIST$train$x/255
MNIST$test$x <- MNIST$test$x/255
table(MNIST$train$y)
0 1 2 3 4 5 6 7 8 9
5923 6742 5958 6131 5842 5421 5918 6265 5851 5949
table(MNIST$test$y)
0 1 2 3 4 5 6 7 8 9
980 1135 1032 1010 982 892 958 1028 974 1009
The class counts are close to balanced — for practical purposes we can treat this dataset as balanced.
# Mean pixel intensity of the training images for each digit class 0..9.
# vapply() preallocates and type-checks the result, replacing the
# grow-with-c() loop (which copies the vector on every iteration).
res.mean <- vapply(
  0:9,
  function(i) mean(MNIST$train$x[which(MNIST$train$y == i), , ]),
  numeric(1)
)
res.mean.train <- res.mean
res.mean.train
[1] 0.17339933 0.07599864 0.14897513 0.14153014 0.12136559 0.12874939
[7] 0.13730178 0.11452770 0.15015598 0.12258994
# Mean pixel intensity of the test images for each digit class 0..9
# (same vapply() pattern as for the training set).
res.mean <- vapply(
  0:9,
  function(i) mean(MNIST$test$x[which(MNIST$test$y == i), , ]),
  numeric(1)
)
res.mean.test <- res.mean
res.mean.test
[1] 0.1723103 0.0767375 0.1501850 0.1433053 0.1226675 0.1320539 0.1435761
[8] 0.1149012 0.1531272 0.1252664
library(ggplot2)
data.res <- data.frame(meanPixel=c(res.mean.train,res.mean.test),set=rep(c("train","test"),each=10),digit=rep(0:9,2))
data.res$digit<-as.factor(data.res$digit)
ggplot(data=data.res,aes(x=digit,y=meanPixel,fill=set))+
geom_bar(stat="identity", position=position_dodge())
[1] 0.3256078 0.2229053 0.3767010 0.2588977 0.3853255 0.1367355 0.3317848
[8] 0.1676944 0.3535575 0.3011965
[1] 0.3279359 0.2234565 0.3739321 0.2603726 0.3911572 0.1363033 0.3327781
[8] 0.1687051 0.3534861 0.3003660
Start with digit ‘3’ being ‘positive’ and digit ‘8’ being ‘negative’.
postiveTrain <- MNIST$train$x[which(MNIST$train$y==3),,]
dim(postiveTrain)[1]
[1] 6131
negativeTrain <- MNIST$train$x[which(MNIST$train$y==8),,]
dim(negativeTrain)[1]
[1] 5851
vec.image1 <- c(postiveTrain[1,,])
vec.image1[1:20]
[1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
length(vec.image1)
[1] 784
dim(postiveTrain)
[1] 6131 28 28
# Flatten each 28x28 positive image into one 784-long row (one image per row).
# R arrays are stored first-index-fastest, so a plain matrix() reshape gives
# row i identical to c(postiveTrain[i, , ]) -- no explicit loop needed.
matPosTrain <- matrix(postiveTrain, nrow = dim(postiveTrain)[1])
dim(matPosTrain)
[1] 6131 784
image(rotate(matPosTrain))
dim(negativeTrain)
[1] 5851 28 28
# Flatten each 28x28 negative image into one 784-long row, using the same
# matrix() reshape trick as for matPosTrain (row i == c(negativeTrain[i, , ])).
matNegTrain <- matrix(negativeTrain, nrow = dim(negativeTrain)[1])
dim(matNegTrain)
[1] 5851 784
A <- rbind(matPosTrain,matNegTrain)
dim(A)
[1] 11982 784
We add a column of ones for the bias (intercept)
A <- cbind(rep(1,dim(A)[1]),A)
dim(A)
[1] 11982 785
We now minimize the least-squares criterion
\[\|y-A\beta\|^2,\] whose solution is \[\hat{\beta} = A^\dagger y,\]
where \(A^\dagger\) is the Moore–Penrose pseudoinverse of \(A\). When \(A\) has full column rank this reduces to \[A^\dagger=(A^TA)^{-1}A^T, \qquad \hat{\beta}=(A^TA)^{-1}A^Ty.\]
library(MASS)
# Targets: +1 for every positive ('3') row of A, -1 for every negative ('8') row.
y <- matrix(rep(c(1,-1),times=c(dim(matPosTrain)[1],dim(matNegTrain)[1])),ncol=1)
# Moore-Penrose pseudoinverse of the design matrix (MASS::ginv uses an SVD,
# so this also works when A is rank-deficient); least-squares fit beta = A^+ y.
A.inv <- ginv(A)
beta <- A.inv%*%y
beta[1:20]
[1] 2.987433e-01 1.649403e-10 -2.743291e-11 7.066158e-11 -6.348332e-11
[6] -5.295066e-11 1.643006e-10 -2.041703e-11 2.634341e-10 5.118827e-10
[11] 5.996431e-10 -1.034161e-10 -5.919315e-11 2.355582e-11 4.622277e-11
[16] 6.345416e-11 -2.690877e-11 -3.176165e-12 1.572036e-12 -5.945032e-11
plot(beta,xlab="Index",ylab="value",type="h")
# Classify one image with the fitted linear model.
#
# x    image matrix (or vector) of pixel intensities; flattened via c(),
#      matching the column-major flattening used to build A
# beta coefficient vector whose first entry is the bias (intercept)
#
# Returns +1 when the linear score is strictly positive, -1 otherwise.
# (Function name kept as `classsify` so existing calls keep working.)
classsify <- function(x, beta) {
  score <- sum(c(1, x) * beta)  # prepend 1 to match the bias column of A
  if (score > 0) 1 else -1      # scalar decision: if/else, not ifelse()
}
classsify(postiveTrain[1,,],beta)
[1] 1
classsify(negativeTrain[1,,],beta)
[1] -1
positiveTest <- MNIST$test$x[which(MNIST$test$y==3),,]
(nPosTest <- dim(positiveTest)[1])
[1] 1010
negativeTest <- MNIST$test$x[which(MNIST$test$y==8),,]
(nNegTest <- dim(negativeTest)[1])
[1] 974
# Predicted class (+1/-1) for every positive ('3') test image;
# vapply() replaces the grow-with-c() accumulation.
res.pos <- vapply(
  seq_len(dim(positiveTest)[1]),
  function(i) classsify(positiveTest[i, , ], beta),
  numeric(1)
)
table(res.pos)
res.pos
-1 1
42 968
truePositives <- table(res.pos)[2]
truePositives
1
968
# Predicted class (+1/-1) for every negative ('8') test image,
# mirroring the res.pos computation.
res.neg <- vapply(
  seq_len(dim(negativeTest)[1]),
  function(i) classsify(negativeTest[i, , ], beta),
  numeric(1)
)
table(res.neg)
res.neg
-1 1
934 40
trueNegatives <- table(res.neg)[1]
trueNegatives
-1
934
Reminder:
\[\text{Precision} = \frac{\big|\text{true positive}\big|}{\big|\text{true positive}\big| + \big|\text{false positive}\big|}, \qquad \text{Recall} = \frac{\big|\text{true positive}\big|}{\big|\text{true positive}\big| + \big|\text{false negative}\big|}.\]
-1
40
1
42
1
0.9603175
1
0.9584158
[1] 0.9593657
Task5 solution
Accuracy, precision, recall, and F1 score will be very high
# Load CIFAR-10 and scale pixel intensities from [0, 255] to [0, 1].
CIFAR10 <- dataset_cifar10()
x_train <- CIFAR10$train$x/255
x_test <- CIFAR10$test$x/255
# One-hot encode the integer labels (0..9) for use with keras models.
y_train <- keras::to_categorical(CIFAR10$train$y, num_classes = 10)
y_test <- keras::to_categorical(CIFAR10$test$y, num_classes = 10)
# Canonical CIFAR-10 class names, indexed by label code + 1.
# Fixes the non-standard "flyer"/"car" names and the trailing space in "frog ".
label_name <- c("airplane", "automobile", "bird", "cat", "deer",
                "dog", "frog", "horse", "ship", "truck")
label_name
[1] "flyer" "car" "bird" "cat" "deer" "dog" "frog " "horse" "ship"
[10] "truck"
dim(x_train)
[1] 50000 32 32 3
dim(x_test)
[1] 10000 32 32 3
table(CIFAR10$train$y)
0 1 2 3 4 5 6 7 8 9
5000 5000 5000 5000 5000 5000 5000 5000 5000 5000
It is a 4-tensor: images × height × width × colour channels (50000 × 32 × 32 × 3 for the training set).
library(EBImage)
Attaching package: 'EBImage'
The following object is masked _by_ '.GlobalEnv':
rotate
# Indices of ten example training images to show in the montage.
pictures = c(9802, 5, 7, 10, 4, 28,1, 8, 9, 2)
fig_img = list()
for (i in 1:10 ) {
# Extract one 32x32x3 image; transpose() swaps the spatial axes so the
# picture is drawn upright (NOTE(review): assumes EBImage's width-first
# layout -- confirm against the EBImage docs).
fig_mat = CIFAR10$train$x[pictures[i], , , ]
fig_img[[i]] = normalize(Image(transpose(fig_mat), dim=c(32,32,3), colormode='Color'))
}
# Stack the ten images, tile them into a 5-column montage, and draw it.
fig_img_comb = combine(fig_img[1:10])
fig_img_obj = tile(fig_img_comb,5)
plot(fig_img_obj, all=T)
## First image from CIFAR10
CIFAR10$train$y[1]
[1] 6
label_name[CIFAR10$train$y[1]+1]
[1] "frog "
plot(normalize(Image(transpose(x_train[1,,,]), dim=c(32,32,3), colormode='Color')))
# x_train was already scaled to [0, 1] above (divided by 255 when loaded),
# so dividing by 255 again here was a double-normalization bug.
temp <- x_train[1, , , ]
# Show each colour channel separately with a matching single-hue ramp.
image(rotate(temp[, , 1]), axes = FALSE, col = rgb((0:10)/10, 0, 0))
image(rotate(temp[, , 2]), axes = FALSE, col = rgb(0, (0:10)/10, 0))
image(rotate(temp[, , 3]), axes = FALSE, col = rgb(0, 0, (0:10)/10))
ind.frog <- which(CIFAR10$train$y==6)
length(ind.frog)
[1] 5000
# Plot the first five frog training images, each overlaid with its class
# label in large red text.
for (i in ind.frog[1:5]){
plot(normalize(Image(transpose(x_train[i,,,]), dim=c(32,32,3), colormode='Color')))
# lty/lwd are not text() parameters and have no visible effect here;
# kept byte-identical to the original call.
text(20,20,label_name[CIFAR10$train$y[i]+1],col="red",lty=2,lwd=10,cex=5)
}