---
title: "assignment"
output: html_document
---
```{r setup, include=FALSE}
# Show the R code of every chunk in the rendered document.
knitr::opts_chunk$set(echo = TRUE)
```
## Question 1
### Load Data
```{r}
# Load the MNIST helper script and populate the data sets it defines
# (presumably `train` and `test`, used below — defined in loader.R).
# Note: the original used curly quotes ("loader.R" mangled to “loader.R”),
# which is invalid R syntax.
source("loader.R")
load_mnist()
```
### 1.a
```{r}
library(RANN)
# For every test point, find the 3 nearest training points
# (res$nn.idx = row indices into train$x, res$nn.dists = distances).
res <- nn2(train$x, query = test$x, k = 3)
```
```{r}
# Majority vote over the 3 nearest neighbours, vectorized (the original
# row-by-row loop with `=`/semicolons does the same thing in O(n) R-level
# iterations).  With k = 3 the majority label can be read off directly:
# if neighbours 2 and 3 agree, they form at least a 2-of-3 majority;
# otherwise the nearest neighbour's label wins — which also covers the
# labels[1] == labels[3] majority case and breaks 3-way ties in favour of
# the closest point.
neighLabels <- matrix(train$y[res$nn.idx], nrow = nrow(res$nn.idx))
predLabels <- ifelse(neighLabels[, 2] == neighLabels[, 3],
                     neighLabels[, 2],
                     neighLabels[, 1])
```
```{r}
# Fraction of test points whose predicted label matches the true label.
accuracy <- sum(predLabels == test$y) / test$n
accuracy
```
The achieved accuracy on the test set is 0.8556.
### 1.b
```{r}
# Distance-weighted k-NN classifier for digit labels 0-9.
#
# Args:
#   idx:   n x (>= k) matrix of neighbour row indices into `y`
#          (as returned by RANN::nn2 in nn.idx).
#   dists: n x (>= k) matrix of the matching neighbour distances (nn.dists).
#   y:     training labels; integers in 0..9 (the `+ 1` slot arithmetic
#          below assumes this range).
#   k:     number of neighbours that vote.
#
# Returns: numeric vector of n predicted labels.
#
# Each neighbour votes for its label with weight 1/distance.  Ties go to
# the label appearing earliest in the neighbour list, i.e. the closest
# neighbour.  When the nearest neighbour is (numerically) at distance
# zero, its label is returned directly to avoid dividing by zero.
#
# Fixes vs. the original: line `predLabelsb[i] = l – 1;` used an en dash
# (a syntax error from smart punctuation), and the inner loops reused `k`
# as their index, clobbering the parameter — it only worked by accident
# because both loops happened to end at length(labels) == k.
getPredLabel <- function(idx, dists, y, k)
{
  pred <- rep(0, nrow(idx))
  for (i in seq_len(nrow(idx)))
  {
    labels <- y[idx[i, 1:k]]
    d <- dists[i, 1:k]
    if (k == 1 || d[1] < 1e-8)
    {
      # Degenerate cases: a single neighbour, or an (almost) exact match.
      pred[i] <- labels[1]
    } else {
      # Accumulate inverse-distance votes per class (class c -> slot c + 1).
      votes <- rep(0, 10)
      for (j in seq_len(k))
      {
        slot <- labels[j] + 1
        votes[slot] <- votes[slot] + 1 / d[j]
      }
      # which.max returns the FIRST maximum, reproducing the original
      # nearest-neighbour-wins tie-breaking exactly.
      pred[i] <- labels[which.max(votes[labels + 1])]
    }
  }
  pred
}
# Weighted k-NN predictions for the test set using the precomputed k = 3 query.
predLabelsb <- getPredLabel(res$nn.idx, res$nn.dists, train$y, 3)
```
```{r}
# Test-set accuracy of the distance-weighted classifier.
accuracb <- sum(predLabelsb == test$y) / test$n
accuracb
```
The achieved accuracy of the weighted version on the test set is 0.8561,
which is an improvement over the unweighted version's 0.8556.
## Question 2
“`{r}
# Build one cross-validation fold from the global `train` set.
#
# Args:
#   foldNum:  zero-based fold index; rows foldSize*foldNum + 1 ..
#             foldSize*(foldNum + 1) become this fold's training set,
#             all remaining rows its test set.
#   foldSize: rows per fold (default 20000, i.e. 3 folds of the 60000
#             MNIST training rows — generalized from the hard-coded 20000).
#   k:        number of neighbours to precompute with RANN::nn2 (default 10,
#             enough for every k tried below).
#
# Returns: list(trainx, trainy, testx, testy, res) where res carries
# nn.idx/nn.dists for the fold's test points.
getFold <- function(foldNum, foldSize = 20000, k = 10)
{
  trainSeq <- foldSize * foldNum + seq_len(foldSize)
  trainx <- train$x[trainSeq, ]
  trainy <- train$y[trainSeq]
  testx <- train$x[-trainSeq, ]
  testy <- train$y[-trainSeq]
  res <- nn2(trainx, query = testx, k = k)
  list(trainx = trainx, trainy = trainy, testx = testx, testy = testy, res = res)
}
# Precompute the three folds once; the k sweep below reuses their nn2 results.
f0 <- getFold(0)
f1 <- getFold(1)
f2 <- getFold(2)
```
```{r}
# Accuracy of the weighted k-NN classifier on one fold produced by getFold().
#
# Args:
#   f: fold list with trainy, testy and the precomputed nn2 result `res`.
#   k: number of neighbours to vote with (must be <= ncol(f$res$nn.idx)).
#
# Returns: scalar accuracy on the fold's held-out points.
getFoldAccuracy <- function(f, k)
{
  predLabels <- getPredLabel(f$res$nn.idx, f$res$nn.dists, f$trainy, k)
  sum(predLabels == f$testy) / length(f$testy)
}
# Mean accuracy of a given k across the three precomputed folds
# (reads the globals f0, f1, f2 built above).
getAverageAccuracy <- function(k)
{
  mean(c(getFoldAccuracy(f0, k), getFoldAccuracy(f1, k), getFoldAccuracy(f2, k)))
}
# Cross-validated accuracy for each candidate k (vapply replaces the
# index loop and guarantees a numeric result per k).
ks <- 1:10
accus <- vapply(ks, getAverageAccuracy, numeric(1))
accus
max(accus)
which(accus == max(accus))  # every k achieving the best average accuracy
```
```{r}
# Cross-validated accuracy as a function of the neighbourhood size k.
plot(x = ks, y = accus, type = "l", xlab = "k", ylab = "accuracy")
```
I use 3-fold cross-validation to compute the average accuracy for k = 1, ..., 10. At k = 6 it achieves the highest average accuracy, 0.8414083.
```{r}
# Re-query the full training set with the cross-validated best k = 6.
res2 <- nn2(train$x, query = test$x, k = 6)
```
```{r}
# Final test-set accuracy with the selected k = 6.
predk6 <- getPredLabel(res2$nn.idx, res2$nn.dists, train$y, 6)
accuracyK6 <- sum(predk6 == test$y) / test$n
accuracyK6
```
With k = 6, the accuracy on the test data is 0.8575, which is a further improvement over the 0.8561 achieved in 1.b.