MAST90083 Computational Statistics & Data Mining — KR and GAM

MAST90083 Computational Statistics & Data Mining KR and GAM

Figure 1: Solution of Question 1

rm(list = ls())  # clear all variables from the console workspace

library(locfit)  # local polynomial regression: locfit(), lp() — used in Question 2
library(ISLR)    # provides the Wage data set — used in Question 3
library(gam)     # generalized additive models: gam(), s(), ns() — Questions 3 and 4
library(pracma)  # pinv() Moore–Penrose pseudo-inverse — used in Question 4

################################################################################
# Question 1:
n <- 250
R <- 5
set.seed(25)  # fix the RNG seed so the simulation is reproducible
e <- rnorm(n, 0, 0.2)                 # Gaussian noise
x <- sort(runif(n, 0, R))             # observation locations
a <- seq(0, R, length = n)            # fine grid for the true curve
y <- cos(2 * pi * x) - 0.2 * x + e    # noisy observations of the target function
b <- cos(2 * pi * a) - 0.2 * a        # true (noise-free) function on the grid
plot(x, y)
lines(a, b)

# Default loess fit (span = 0.75, degree = 2)
y.fit <- loess(y ~ x)  # , degree=1
plot(x, y, col = "white")
points(x, y, col = "firebrick")
lines(a, b, col = "dodgerblue2", lwd = 2, lty = 2)
lines(x, fitted(y.fit), col = "forestgreen", lwd = 2, lty = 1)

# Tighter span, quadratic local fit
y.fit <- loess(y ~ x, span = 0.3, degree = 2)  # , degree=1
plot(x, y, col = "white")
points(x, y, col = "firebrick")
lines(a, b, col = "dodgerblue2", lwd = 2, lty = 2)
lines(x, fitted(y.fit), col = "forestgreen", lwd = 2, lty = 1)
# The span parameter controls the fraction of data points used within each
# window; the lower its percentage the more localized the fit and the better
# the overall estimate here. degree controls the local fit (intercept, linear
# or polynomial); in this case the polynomial performs better, obviously.

# Grid search over span = i/20, scoring each fit by RSS against the true curve b
results <- numeric(20)
for (i in 1:20) {
  y.fit <- loess(y ~ x, span = i / 20)
  results[i] <- sum((fitted(y.fit) - b)^2)
}
plot(1:20, results[1:20], type = "b", col = "dodgerblue2",
     xlab = "span", ylab = "RSS", pch = 19, lwd = 3)
df <- which.min(results)

# Question 1.1: repeat with a smaller sample (n = 50) on [0, 1]
n <- 50
R <- 1
set.seed(25)  # reset the seed so this run is reproducible too
e <- rnorm(n, 0, 0.2)
x2 <- sort(runif(n, 0, R))
a2 <- seq(0, R, length = n)
y2 <- cos(2 * pi * x2) - 0.2 * x2 + e
b2 <- cos(2 * pi * a2) - 0.2 * a2
results <- numeric(20)
for (i in 1:20) {
  y2.fit <- loess(y2 ~ x2, span = i / 20)
  results[i] <- sum((fitted(y2.fit) - b2)^2)
}
plot(1:20, results[1:20], type = "b", col = "dodgerblue2",
     xlab = "span", ylab = "RSS", pch = 19, lwd = 3)
df <- which.min(results)
# By reducing the data size, the span parameter value can be raised, because
# relatively more data points can be used per window this time.

################################################################################
# Question 2:
# Nearest-neighbour local linear fit (nn = 1 uses all points)
y.fit <- locfit(y ~ lp(x, nn = 1, deg = 1))
plot(x, y, col = "white")
points(x, y, col = "firebrick")
lines(a, b, col = "dodgerblue2", lwd = 2, lty = 2)
lines(x, fitted(y.fit), col = "forestgreen", lwd = 2, lty = 1)

# Fixed-bandwidth (h = 1) local linear fit with a rectangular kernel
y.fit <- locfit(y ~ lp(x, h = 1, deg = 1), kern = "rect")
plot(x, y, col = "white")
points(x, y, col = "firebrick")
lines(a, b, col = "dodgerblue2", lwd = 2, lty = 2)
lines(x, fitted(y.fit), col = "forestgreen", lwd = 2, lty = 1)

# Compare five kernels over a grid of bandwidths h = i/20, scoring by RSS vs b
kern_name <- c("rect", "trwt", "gauss", "bisq", "expl")
results <- matrix(0, 20, length(kern_name))
for (i in 1:20) {
  for (j in 1:length(kern_name)) {
    y.fit <- locfit(y ~ lp(x, h = i / 20, deg = 1), kern = kern_name[j])
    results[i, j] <- sum((fitted(y.fit) - b)^2)
  }
}
plot(1:20, results[1:20, 1], type = "b", col = "dodgerblue2",
     xlab = "BW", ylab = "RSS", pch = 19, lwd = 3)
lines(1:20, results[1:20, 2], type = "b", col = "green", lwd = 3)
lines(1:20, results[1:20, 3], type = "b", col = "black", lwd = 3)
lines(1:20, results[1:20, 4], type = "b", col = "yellow", lwd = 3)
lines(1:20, results[1:20, 5], type = "b", col = "cyan", lwd = 3)
df <- which(results == min(results), arr.ind = TRUE)  # best (bandwidth, kernel) pair

# Refit at the best bandwidth found above
y.fit <- locfit(y ~ lp(x, h = 0.1, deg = 1), kern = "rect")
plot(x, y, col = "white")
points(x, y, col = "firebrick")
lines(a, b, col = "dodgerblue2", lwd = 2, lty = 2)
lines(x, fitted(y.fit), col = "forestgreen", lwd = 2, lty = 1)

################################################################################
# Question 3
M1 <- gam(wage ~ s(age, df = 5) + education, data = Wage)
# there are 6 coefficients in total: 1 for intercept, 1 for knots, 4 for the
# dummy variable
M2 <- gam(wage ~ s(age, df = 5) + education + year, data = Wage)
M3 <- gam(wage ~ s(age, df = 5) + education + ns(year, df = 5), data = Wage)
anova(M1, M2, M3, test = "F")
# M2 seems to be more significant because it has the lowest p-value and
# highest F-value

# alternative approach to model selection
library(bayestestR)
M0 <- gam(wage ~ 1, data = Wage)
BMSelection <- bayesfactor_models(M1, M2, M3, denominator = M0)
BMSelection
# here the intercept-only model is compared against the rest, and the model
# with the highest BF value is more significant
################################################################################
# Question 4
# NOTE(review): absolute Windows path — adjust to your machine before running
load(file = "D:/Code files/heartdis.RData")
V <- c(names(heartdis))  # column names, indexed below by position
# presumably V[9]=age, V[2]=tobacco, V[5]=famhist, V[3]=ldl, V[6]=typea,
# V[7]=obesity — TODO confirm against names(heartdis)

# Forward selection of natural-cubic-spline terms, scored by AIC; each step
# reports the percentage improvement over the previous model.
ncs1 <- gam(chd ~ ns(get(V[9]), df = 3), data = heartdis)$aic
# AIC: age
ncs1
ncs2 <- gam(chd ~ ns(get(V[9]), df = 3) + ns(get(V[2]), df = 3),
            data = heartdis)$aic
# AIC: age + tob
ncs2
(ncs1 - ncs2) / ncs1 * 100  # % improvement
ncs3 <- gam(chd ~ ns(get(V[9]), df = 3) + ns(get(V[2]), df = 3) + get(V[5]),
            data = heartdis)$aic
# AIC: age + tob + his
ncs3
(ncs2 - ncs3) / ncs2 * 100  # % improvement
ncs4 <- gam(chd ~ ns(get(V[9]), df = 3) + ns(get(V[2]), df = 3) + get(V[5]) +
              ns(get(V[3]), df = 3), data = heartdis)$aic
# AIC: age + tob + his + ldl
ncs4
(ncs3 - ncs4) / ncs3 * 100  # % improvement
ncs5 <- gam(chd ~ ns(get(V[9]), df = 3) + ns(get(V[2]), df = 3) + get(V[5]) +
              ns(get(V[3]), df = 3) + ns(get(V[6]), df = 3),
            data = heartdis)$aic
# AIC: age + tob + his + ldl + typea
ncs5
(ncs4 - ncs5) / ncs4 * 100  # % improvement
ncs6 <- gam(chd ~ ns(get(V[9]), df = 3) + ns(get(V[2]), df = 3) + get(V[5]) +
              ns(get(V[3]), df = 3) + ns(get(V[6]), df = 3) +
              ns(get(V[7]), df = 3), data = heartdis)$aic
# AIC: age + tob + his + ldl + typea + obes
ncs6
(ncs5 - ncs6) / ncs5 * 100  # % improvement

################################################################################
# Refit the full model (all spline terms) — ncs6 is now a gam object, not an AIC
ncs6 <- gam(chd ~ ns(get(V[9]), df = 3) + ns(get(V[2]), df = 3) + get(V[5]) +
              ns(get(V[3]), df = 3) + ns(get(V[6]), df = 3) +
              ns(get(V[7]), df = 3) + ns(get(V[1]), df = 3) +
              ns(get(V[2]), df = 3) + ns(get(V[4]), df = 3),
            data = heartdis)

# Partial effect of tobacco: spline basis times its fitted coefficients
# (coef indices 5:7 — presumably the tobacco spline block; TODO confirm
# against names(coef(ncs6)))
x <- sort(heartdis$tobacco)
y <- ns(x, df = 3) %*% coef(ncs6)[5:7]
plot(x, y, type = "l", lwd = 2, col = "forestgreen",
     xlab = "Tobacco", ylab = "Heart Disease")
# It is non-linear for some samples but mostly linear

# Least-squares projection of the response onto the tobacco spline basis
y <- ns(x, df = 3) %*% pinv(ns(x, df = 3)) %*% heartdis$chd
plot(heartdis$tobacco, heartdis$chd)
lines(x, y, type = "l", lwd = 2, col = "forestgreen",
      xlab = "Tobacco", ylab = "Heart Disease")
# It seems like the tobacco variable has a very major effect on the heart
# disease, because the tobacco variable overfits the response.

# (Original document continues with figure pages:
#  Figure 5: Solution of Question 1.4;  Figure 6: Solution of Question 2.1;
#  Figure 7: Solution of Question 2.2;  Figure 8: Solution of Question 2.3;
#  Figure 9: Solution of Question 2.4;  Figure 10: Solution of Question 4.2;
#  Figure 11: Solution of Question 4.3)