---
title: "Student Performance Analysis"
date: "October 16, 2018"
output: html_document
---
```{r setup, include=FALSE}
# Show the source code of every chunk in the rendered report.
knitr::opts_chunk$set(echo = TRUE)

library(ggplot2)       # ggplot(), geom_col(), geom_line(), geom_point()
library(randomForest)  # randomForest(), importance(), varImpPlot()
library(car)           # leveneTest()

# stringsAsFactors = TRUE is set explicitly: since R 4.0.0 read.csv() keeps
# text columns as character by default, but randomForest() only performs
# classification when the response is a factor.
standard <- read.csv(
  "Student_data_standard.csv",
  header = TRUE,
  sep = ",",
  stringsAsFactors = TRUE
)
# Force the name of the first column to "Year".
names(standard)[1] <- "Year"

advanced <- read.csv(
  "Student_data_advanced.csv",
  header = TRUE,
  sep = ",",
  stringsAsFactors = TRUE
)
names(advanced)[1] <- "Year"
```
## Distribution of grades by year
The charts below show the distribution of grades by year.
```{r}
# Cross-tabulate year against grade and flatten the table to long format:
# one row per (year, grade) combination together with its frequency.
d <- as.data.frame(
  table(
    standard$Year,
    standard$Unit.of.Study.Grade,
    dnn = c("year", "Unit.of.Study.Grade")
  ),
  responseName = "Freq"
)

# Dodged bar chart: within each year, one bar per grade.
ggplot(d, aes(x = year, y = Freq, fill = Unit.of.Study.Grade)) +
  geom_col(position = "dodge")

# The same counts as a line chart: one coloured line (with points) per grade.
ggplot(
  d,
  aes(
    x = year,
    y = Freq,
    group = Unit.of.Study.Grade,
    colour = Unit.of.Study.Grade
  )
) +
  geom_line() +
  geom_point()
```
## Classification
```{r}
# Split the data into a training set and a validation set.
set.seed(100)  # fixed seed so the split and the forest are reproducible

# Drop the first column (renamed to "Year" in the setup chunk).
dat <- advanced[, -1]

# randomForest() treats a non-factor response as a regression target, so make
# sure text columns are factors. Converting before the split also gives the
# training and validation subsets identical factor levels.
dat[] <- lapply(dat, function(column) {
  if (is.character(column)) factor(column) else column
})

# Assign each row to group 1 (probability 0.7) or group 2 (probability 0.3).
ind <- sample(2, nrow(dat), replace = TRUE, prob = c(0.7, 0.3))
training <- dat[ind == 1, ]
testing <- dat[ind == 2, ]

# Fit a 10-tree random forest predicting the grade from all other columns;
# importance = TRUE stores the variable-importance measures as well.
classi <- randomForest(
  Unit.of.Study.Grade ~ .,
  data = training,
  importance = TRUE,
  ntree = 10
)
classi
```
## Prediction
```{r}
# Predicted grades for the rows the forest was trained on.
predictionTrain <- predict(classi, newdata = training, type = "class")
# Confusion matrix on the training set (rows: predicted, columns: observed).
table(predictionTrain, training[["Unit.of.Study.Grade"]])

# Predicted grades for the held-out validation rows.
predictionTest <- predict(classi, newdata = testing, type = "class")
# Share of validation rows whose predicted grade equals the observed grade.
mean(predictionTest == testing[["Unit.of.Study.Grade"]])
# Confusion matrix on the validation set (rows: predicted, columns: observed).
table(predictionTest, testing[["Unit.of.Study.Grade"]])
```
## Checking for important variables
```{r}
# Print the variable-importance measures stored in the fitted forest.
randomForest::importance(classi)
# Draw the same measures as a dot chart.
randomForest::varImpPlot(classi)
```
## Homogeneity of variance (Levene's test)
```{r}
# Levene's test of Count across the groups formed by crossing all four factors.
car::leveneTest(
  Count ~ Gender * Mode * Unit.of.Study.Level * Domestic.Intl,
  data = standard
)
```
## Linear model
A main-effects ANOVA to check whether the number of grades (Count) differs across the variables Gender, Mode, Unit.of.Study.Level and Domestic.Intl. The model is as below.
```{r}
# Additive (main-effects only) ANOVA of Count on the four factors.
# The columns are looked up through `data = standard` instead of being passed
# as `standard$...` vectors: the term labels in the summary become the plain
# column names, and the call matches the other tests run on this data.
fit <- aov(
  Count ~ Gender + Mode + Unit.of.Study.Level + Domestic.Intl,
  data = standard
)
summary(fit)
```
```{r}
# Standard diagnostic plots for the fitted ANOVA model.
plot(fit)

# Histogram of the model residuals. The argument expression is kept verbatim
# because hist() derives its default title from it.
hist(residuals(fit), col = "cyan")
```
```{r}
# Kruskal-Wallis rank-sum test of Count against each factor, one at a time.
kruskal.test(Count ~ Gender, data = standard)
kruskal.test(Count ~ Unit.of.Study.Level, data = standard)
kruskal.test(Count ~ Domestic.Intl, data = standard)
kruskal.test(Count ~ Mode, data = standard)
```