STUDENTID_assessment_2_q3-checkpoint
In [8]:
####################### GENERAL AUXILIARY FUNCTIONS #######################
## The following structure helps us to have functions with multiple outputs
### credit: https://stat.ethz.ch/pipermail/r-help/2004-June/053343.html
# Compute the misclassification rate between predicted labels Y1 and true
# labels T1. Both arguments are single-column (H2O) frames of equal length.
# Returns the fraction of rows where the two disagree.
error.rate <- function(Y1, T1) {
  if (nrow(Y1) != nrow(T1)) {
    # Fixed typo in the original message ("lables" -> "labels").
    stop("error.rate: size of true labels and predicted labels mismatch")
  }
  sum(T1 != Y1) / nrow(T1)
}

##########################

options(warn = -1)
library(h2o)

# If there is a proxy, disable it so R can reach the local H2O instance.
proxy.old <- Sys.getenv("http_proxy")
Sys.setenv("http_proxy" = "")

localH2O <- h2o.init(nthreads = -1, port = 54321, max_mem_size = "6G", startH2O = TRUE)

# Students: use the "absolute" path to the datasets on your machine (important).
labeled.frame   <- h2o.importFile(path = "/Users/vi/Documents/assessments_datasets/Task3C_labeled.csv", sep = ",")
unlabeled.frame <- h2o.importFile(path = "/Users/vi/Documents/assessments_datasets/Task3C_unlabeled.csv", sep = ",")
test.frame      <- h2o.importFile(path = "/Users/vi/Documents/assessments_datasets/Task3C_test.csv", sep = ",")

# Column 1 is the class label: categorical where known, NA for unlabeled rows.
labeled.frame[, 1]   <- as.factor(labeled.frame$label)
unlabeled.frame[, 1] <- NA
# Labeled + unlabeled feature rows (label column dropped) — later used to
# train the unsupervised autoencoder.
train.frame <- h2o.rbind(labeled.frame[, -1], unlabeled.frame[, -1])
test.frame[, 1] <- as.factor(test.frame$label)

# Baseline: a supervised neural-network classifier on the labeled data only.
NN.model <- h2o.deeplearning(
  x = 2:ncol(labeled.frame),      # all pixel/feature columns
  y = 1,                          # the label column
  training_frame = labeled.frame, # specify the frame (imported file)
  hidden = c(100),                # one hidden layer with 100 units
  epochs = 50,                    # maximum number of epochs
  activation = "Tanh",            # activation function
  autoencoder = FALSE,            # supervised classifier, NOT an autoencoder
  l2 = 0.1                        # L2 regularization strength
)

# Misclassification rate on the labeled training data and on the test data.
labeled.predict <- h2o.predict(NN.model, labeled.frame)$predict
error.rate(labeled.frame$label, labeled.predict)

test.predict <- h2o.predict(NN.model, test.frame)$predict
error.rate(test.frame$label, test.predict)
# Sweep the autoencoder bottleneck size k. For each k, record:
#   - the reconstruction error on the (labeled + unlabeled) training data
#     and on the test data;
#   - the misclassification error of a classifier trained on the raw
#     features augmented with the autoencoder's hidden-layer activations.
k.range <- seq(20, 500, 20)

n.k <- length(k.range)
reconstruction.train.error   <- matrix(NA, nrow = n.k, ncol = 1)
classification.labeled.error <- matrix(NA, nrow = n.k, ncol = 1)
reconstruction.test.error    <- matrix(NA, nrow = n.k, ncol = 1)
classification.test.error    <- matrix(NA, nrow = n.k, ncol = 1)

for (i in seq_along(k.range)) {
  k <- k.range[i]

  # Unsupervised autoencoder with a single bottleneck layer of k units.
  autoEncoder.model <- h2o.deeplearning(
    x = 1:ncol(train.frame),
    training_frame = train.frame,
    hidden = c(k),
    epochs = 50,
    activation = 'Tanh',
    autoencoder = TRUE
  )

  # h2o.anomaly yields the per-row reconstruction MSE (a one-column frame).
  t <- h2o.anomaly(autoEncoder.model, train.frame)
  reconstruction.train.error[i] <- mean(sqrt(t * ncol(t)))
  t <- h2o.anomaly(autoEncoder.model, test.frame[, -1])
  reconstruction.test.error[i] <- mean(sqrt(t * ncol(t)))

  # Augment the original features with layer-1 deep features.
  labeled.frame.add <- h2o.cbind(
    labeled.frame,
    h2o.deepfeatures(autoEncoder.model, labeled.frame[, -1], layer = 1)
  )
  # NOTE(review): test.frame.add omits the label column; h2o.predict matches
  # predictors by name, so the label is simply unused at prediction time.
  test.frame.add <- h2o.cbind(
    test.frame[, -1],
    h2o.deepfeatures(autoEncoder.model, test.frame[, -1], layer = 1)
  )

  # Supervised classifier on the augmented labeled data.
  NN.model <- h2o.deeplearning(
    x = 2:ncol(labeled.frame.add),
    y = 1,
    training_frame = labeled.frame.add,
    hidden = c(100),
    epochs = 50,
    activation = 'Tanh',
    autoencoder = FALSE,
    l2 = 0.1
  )

  pre.label <- h2o.predict(NN.model, labeled.frame.add)$predict
  classification.labeled.error[i] <- error.rate(labeled.frame$label, pre.label)

  pre.test <- h2o.predict(NN.model, test.frame.add)$predict
  classification.test.error[i] <- error.rate(test.frame$label, pre.test)
}
|======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% 
|======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% 
|======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% 
|======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% 
|======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% |======================================================================| 100% 
# Produce the needed plots.
library(ggplot2)

# Reconstruction error (train and test) versus bottleneck size.
reconstruction.df <- data.frame(k.range, reconstruction.train.error, reconstruction.test.error)
ggplot(reconstruction.df, aes(k.range)) +
  geom_line(aes(y = reconstruction.train.error, colour = "reconstruction train error")) +
  geom_point(aes(y = reconstruction.train.error, colour = "reconstruction train error")) +
  geom_line(aes(y = reconstruction.test.error, colour = "reconstruction test error")) +
  geom_point(aes(y = reconstruction.test.error, colour = "reconstruction test error")) +
  labs(title = "Plot of reconstruction error", x = "number of units", y = "reconstruction error")

# Classification error (labeled-train and test) versus bottleneck size.
classification.df <- data.frame(k.range, classification.labeled.error, classification.test.error)
ggplot(classification.df, aes(k.range)) +
  geom_line(aes(y = classification.labeled.error, colour = "classification labeled error")) +
  geom_point(aes(y = classification.labeled.error, colour = "classification labeled error")) +
  geom_line(aes(y = classification.test.error, colour = "classification test error")) +
  geom_point(aes(y = classification.test.error, colour = "classification test error")) +
  labs(title = "Plot of classification error", x = "number of units", y = "classification error")

# Optimum number(s) of units in the middle layer of the autoencoder in terms
# of the reconstruction errors (all ties for the minimum are kept, hence
# which(...) rather than which.min).
best.reconstruction.k <- k.range[which(min(reconstruction.test.error) == reconstruction.test.error)]
best.reconstruction.k  # 180 on the run recorded in this notebook

# Optimum number(s) of units in the middle layer of the autoencoder in terms
# of the misclassification errors (all ties for the minimum are kept).
best.classification.k <- k.range[which(min(classification.test.error) == classification.test.error)]
best.classification.k  # 280 on the run recorded in this notebook

# Conclusion: there is no obvious correlation between reconstruction error and
# misclassification error. Sometimes the reconstruction error is low while the
# misclassification error is high.