## ——————————————
##
## ST4060 / ST6015 / ST6040
## R script – Fri 26 Nov 2021 lecture
## Eric Wolsztynski
##
## Classical classifiers…
##
## ——————————————
# ————————————————
# Logistic regression based classification
# Up or Down?
library(ISLR)
head(Smarket)
?Smarket
glmo = glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume,
data=Smarket, family=binomial)
summary(glmo)
n = nrow(Smarket)
n
itrain = which(Smarket$Year<2005)
# glmo.cv = glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume,
# data=Smarket,
# family=binomial,
# subset=itrain)
glmo.cv = glm(Direction~Lag1+Lag2,
data=Smarket,
family=binomial,
subset=itrain)
summary(glmo.cv)
po = predict(glmo.cv, type="response")
summary(po)
glm.pred = rep("Down", length(po))
glm.pred[po>.5] = “Up”
tb = table(glm.pred,Smarket$Direction[itrain])
# training set prediction accuracy:
1-sum(diag(tb))/sum(tb)
Direction.2005 = Smarket$Direction[!itrain]
po.2005 = predict(glmo.cv, type=”response”,
Smarket[-itrain,])
length(po.2005)
glm.pred.2005 = rep(“Down”, length(po.2005))
glm.pred.2005[po.2005>.5] = “Up”
# test set prediction accuracy:
tb.test = table(glm.pred.2005,
Smarket$Direction[-itrain])
1-sum(diag(tb.test))/sum(tb.test)
# ————————————————
# Logistic regression based classification
# which iris?
head(iris)
n = nrow(iris)
x = iris[sample(1:n),]
head(x)
# reduce problem to identifying virginica
is.vir <- (x$Species=='virginica')
x$Species <- NULL
x$Species <- is.vir
glmo <- glm(Species~., x, family=binomial)
glmo$fitted.values - predict(glmo, type='response')
# Note: how to select a subset...
# iris.vir = which(iris$Species == "virginica")
# iris.vir = subset(iris, Species=='virginica')
# dim(iris.vir)
n = nrow(x)
K = 10
train.acc = test.acc = numeric(K)
folds = cut(1:n, K, labels=FALSE)
k = 1
thr = .8
for(k in 1:K){
itrain = which(folds!=k)
glmo = glm(Species~., data=x,
family=binomial,
subset=itrain)
tb = table(glmo$fitted.values>thr, x$Species[itrain])
train.acc[k] = sum(diag(tb))/sum(tb)
# prediction
p.test = predict(glmo, x[-itrain,], type=’response’)
tb = table(p.test>thr, x$Species[-itrain])
test.acc[k] = sum(diag(tb))/sum(tb)
}
# warnings()
mean(train.acc)
mean(test.acc)
tb
# ————————————————
# Illustration of hierarchical clustering
data(eurodist)
hc = hclust(eurodist, method=”ward”)
hc = hclust(eurodist, method=”single”)
hc = hclust(eurodist, method=”complete”)
plot(hc)