MAST90083 Computational Statistics & Data Mining KR and GAM
Figure 1: Solution of Question 1
# Clear all variables in the workspace so the script starts from a clean state
rm(list = ls())

# Required packages
library(locfit)  # local polynomial / kernel regression (locfit, lp)
library(ISLR)    # provides the Wage data set used in Question 3
library(gam)     # generalized additive models: gam(), s(), ns()
library(pracma)  # pinv() Moore-Penrose pseudo-inverse used in Question 4
################################################################################
# Question 1:
# Simulate noisy observations of f(x) = cos(2*pi*x) - 0.2*x on [0, R] and fit
# local (loess) regressions, then select the span by minimizing RSS to truth.
n <- 250
R <- 5
set.seed(25)  # fix the RNG seed so the simulation is reproducible
e <- rnorm(n, 0, 0.2)            # Gaussian noise, sd = 0.2
x <- sort(runif(n, 0, R))        # random design points, sorted for plotting
a <- seq(0, R, length = n)       # regular grid for the true curve
y <- cos(2 * pi * x) - 0.2 * x + e   # noisy responses
b <- cos(2 * pi * a) - 0.2 * a       # true (noise-free) curve on the grid

plot(x, y)
lines(a, b)

# Default loess fit (span = 0.75, degree = 2)
y.fit <- loess(y ~ x)  # , degree = 1
plot(x, y, col = "white")
points(x, y, col = "firebrick")
lines(a, b, col = "dodgerblue2", lwd = 2, lty = 2)
lines(x, fitted(y.fit), col = "forestgreen", lwd = 2, lty = 1)

# More localized loess fit: smaller span, local quadratic
y.fit <- loess(y ~ x, span = 0.3, degree = 2)  # , degree = 1
plot(x, y, col = "white")
points(x, y, col = "firebrick")
lines(a, b, col = "dodgerblue2", lwd = 2, lty = 2)
lines(x, fitted(y.fit), col = "forestgreen", lwd = 2, lty = 1)

# The span parameter controls the fraction of data points used within each
# local window: the lower the span, the more localized the fit and the better
# the overall estimate here. The degree parameter chooses the local fit
# (constant, linear or quadratic); the quadratic clearly performs better.

# Grid search over span = 1/20, 2/20, ..., 1 minimizing RSS against the truth
results <- numeric(20)
for (i in 1:20) {
  y.fit <- loess(y ~ x, span = i / 20)
  results[i] <- sum((fitted(y.fit) - b)^2)
}
plot(1:20, results[1:20], type = "b", col = "dodgerblue2", xlab = "span", ylab = "RSS", pch = 19, lwd = 3)
df <- which.min(results)  # index of the best span (span = df/20)
1
MAST90083 Computational Statistics & Data Mining KR and GAM
Figure 2: Solution of Question 1.1
# Question 1.1: repeat the span selection with a much smaller sample (n = 50)
# on the shorter interval [0, 1].
n <- 50
R <- 1
set.seed(25)  # fix the RNG seed so the simulation is reproducible
e <- rnorm(n, 0, 0.2)
x2 <- sort(runif(n, 0, R))
a2 <- seq(0, R, length = n)
y2 <- cos(2 * pi * x2) - 0.2 * x2 + e
b2 <- cos(2 * pi * a2) - 0.2 * a2

results <- numeric(20)
for (i in 1:20) {
  y2.fit <- loess(y2 ~ x2, span = i / 20)
  results[i] <- sum((fitted(y2.fit) - b2)^2)
}
plot(1:20, results[1:20], type = "b", col = "dodgerblue2", xlab = "span", ylab = "RSS", pch = 19, lwd = 3)
df <- which.min(results)
# With the reduced data size the optimal span value rises, because relatively
# more data points must be included per window this time.
################################################################################
# Question 2:
# Kernel regression with locfit on the Question 1 data (x, y, a, b).

# Nearest-neighbour bandwidth covering all data (nn = 1), local linear fit
y.fit <- locfit(y ~ lp(x, nn = 1, deg = 1))
plot(x, y, col = "white")
points(x, y, col = "firebrick")
lines(a, b, col = "dodgerblue2", lwd = 2, lty = 2)
lines(x, fitted(y.fit), col = "forestgreen", lwd = 2, lty = 1)

# Fixed bandwidth h = 1 with a rectangular (box) kernel
y.fit <- locfit(y ~ lp(x, h = 1, deg = 1), kern = "rect")
plot(x, y, col = "white")
points(x, y, col = "firebrick")
lines(a, b, col = "dodgerblue2", lwd = 2, lty = 2)
lines(x, fitted(y.fit), col = "forestgreen", lwd = 2, lty = 1)

# Grid search over bandwidth h = i/20 for each available kernel,
# scoring each fit by RSS against the true curve b
kern_name <- c("rect", "trwt", "gauss", "bisq", "expl")
results <- matrix(0, 20, length(kern_name))
for (i in 1:20) {
  for (j in 1:length(kern_name)) {
    y.fit <- locfit(y ~ lp(x, h = i / 20, deg = 1), kern = kern_name[j])
    results[i, j] <- sum((fitted(y.fit) - b)^2)
  }
}
2
MAST90083 Computational Statistics & Data Mining KR and GAM
Figure 3: Solution of Question 1.2
# RSS-vs-bandwidth curves, one colour per kernel
plot(1:20, results[1:20, 1], type = "b", col = "dodgerblue2", xlab = "BW", ylab = "RSS", pch = 19, lwd = 3)
lines(1:20, results[1:20, 2], type = "b", col = "green", lwd = 3)
lines(1:20, results[1:20, 3], type = "b", col = "black", lwd = 3)
lines(1:20, results[1:20, 4], type = "b", col = "yellow", lwd = 3)
lines(1:20, results[1:20, 5], type = "b", col = "cyan", lwd = 3)
# (row, column) index of the best bandwidth/kernel combination
df <- which(results == min(results), arr.ind = TRUE)

# Refit with the selected combination (h = 0.1, rectangular kernel) and plot
y.fit <- locfit(y ~ lp(x, h = 0.1, deg = 1), kern = "rect")
plot(x, y, col = "white")
points(x, y, col = "firebrick")
lines(a, b, col = "dodgerblue2", lwd = 2, lty = 2)
lines(x, fitted(y.fit), col = "forestgreen", lwd = 2, lty = 1)
################################################################################
# Question 3
# GAMs on the ISLR Wage data, compared first by F-test, then by Bayes factors.
M1 <- gam(wage ~ s(age, df = 5) + education, data = Wage)
# There are 6 coefficients in total: 1 for the intercept, 1 for the smooth
# term, 4 for the education dummy variables.
M2 <- gam(wage ~ s(age, df = 5) + education + year, data = Wage)
M3 <- gam(wage ~ s(age, df = 5) + education + ns(year, df = 5), data = Wage)
anova(M1, M2, M3, test = "F")
# M2 seems to be more significant because it has the lowest p-value and the
# highest F-value.

# Alternative approach to model selection: Bayes factors against a null model
library(bayestestR)
M0 <- gam(wage ~ 1, data = Wage)
BMSelection <- bayesfactor_models(M1, M2, M3, denominator = M0)
BMSelection
# Here the intercept-only model is compared against the rest, and the model
# with the highest BF value is more significant.
################################################################################
# Question 4
# Forward selection of natural-cubic-spline terms for the heart-disease data,
# scored by AIC. V holds the column names; indices below pick predictors.
load(file = "D:/Code files/heartdis.RData")  # NOTE(review): local path — adjust to your machine
V <- c(names(heartdis))
ncs1 <- gam(chd ~ ns(get(V[9]), df = 3), data = heartdis)$aic  # AIC: age
ncs1
ncs2 <- gam(chd ~ ns(get(V[9]), df = 3) + ns(get(V[2]), df = 3), data = heartdis)$aic  # AIC: age + tob
ncs2
(ncs1 - ncs2) / ncs1 * 100  # % improvement
3
MAST90083 Computational Statistics & Data Mining KR and GAM
Figure 4: Solution of Question 1.3
# Continue adding one spline term at a time, tracking the AIC improvement.
ncs3 <- gam(chd ~ ns(get(V[9]), df = 3) + ns(get(V[2]), df = 3) + get(V[5]), data = heartdis)$aic
# AIC: age + tob + his
ncs3
(ncs2 - ncs3) / ncs2 * 100  # % improvement
ncs4 <- gam(chd ~ ns(get(V[9]), df = 3) + ns(get(V[2]), df = 3) + get(V[5]) + ns(get(V[3]), df = 3),
            data = heartdis)$aic  # AIC: age + tob + his + ldl
ncs4
(ncs3 - ncs4) / ncs3 * 100  # % improvement
ncs5 <- gam(chd ~ ns(get(V[9]), df = 3) + ns(get(V[2]), df = 3) + get(V[5]) + ns(get(V[3]), df = 3)
            + ns(get(V[6]), df = 3), data = heartdis)$aic  # AIC: age + tob + his + ldl + typea
ncs5
(ncs4 - ncs5) / ncs4 * 100  # % improvement
ncs6 <- gam(chd ~ ns(get(V[9]), df = 3) + ns(get(V[2]), df = 3) + get(V[5]) + ns(get(V[3]), df = 3)
            + ns(get(V[6]), df = 3) + ns(get(V[7]), df = 3), data = heartdis)$aic
# AIC: age + tob + his + ldl + typea + obes
ncs6
(ncs5 - ncs6) / ncs5 * 100  # % improvement
################################################################################
# Full model with spline terms for all continuous predictors
ncs6 <- gam(chd ~ ns(get(V[9]), df = 3) + ns(get(V[2]), df = 3) + get(V[5]) + ns(get(V[3]), df = 3)
            + ns(get(V[6]), df = 3) + ns(get(V[7]), df = 3) + ns(get(V[1]), df = 3) + ns(get(V[2]), df = 3)
            + ns(get(V[4]), df = 3), data = heartdis)
x <- sort(heartdis$tobacco)
# Partial effect of tobacco: its spline basis times its fitted coefficients
# (coefficients 5:7 correspond to the ns(tobacco) term — TODO confirm ordering)
y <- ns(x, df = 3) %*% coef(ncs6)[5:7]
plot(x, y, type = "l", lwd = 2, col = "forestgreen", xlab = "Tobacco", ylab = "Heart Disease")
# It is non-linear for some samples but mostly linear.

# Project the response onto the tobacco spline basis via the pseudo-inverse
y <- ns(x, df = 3) %*% pinv(ns(x, df = 3)) %*% heartdis$chd
plot(heartdis$tobacco, heartdis$chd)
lines(x, y, type = "l", lwd = 2, col = "forestgreen")  # xlab/ylab dropped: not graphical parameters of lines()
# It seems like the tobacco variable has a very major effect on heart disease,
# because the tobacco variable overfits the response.
MAST90083 Computational Statistics & Data Mining KR and GAM
Figure 5: Solution of Question 1.4
Figure 6: Solution of Question 2.1
5
MAST90083 Computational Statistics & Data Mining KR and GAM
Figure 7: Solution of Question 2.2
Figure 8: Solution of Question 2.3
6
MAST90083 Computational Statistics & Data Mining KR and GAM
Figure 9: Solution of Question 2.4
Figure 10: Solution of Question 4.2
7
MAST90083 Computational Statistics & Data Mining KR and GAM
Figure 11: Solution of Question 4.3
8