R, Factorial & Regression program example (15' SCY)
코딩 공부/R-전산 통계학 2020. 2. 23. 16:18# Factorial - Two Factor
수리통계학을 공부할 때는 "어떤게 treatment, block 인거지?" 에 따라서 분석을 하는 것이 귀찮았지만, 전산프로그램 R을 다룰 때는 분석 결과에 전부 다 나오기 때문에 따로 어려운 점이 없었을 것입니다
# DATA
A = gl(4,2*3)
B = gl(2,3,4*2*3)
y <- c(40,36,43, 36,34,29,
39,36,33, 32,26,25,
32,34,29, 26,23,24,
33,27,25, 20,22,18)
sm.fac2 <- function(A,B,y) {
a=nlevels(A) ; b=nlevels(B) ; n=match(unique(B), B)[2]-1 ; N=a*b*n
Ti=tapply(y,A,sum) ; Tj=tapply(y,B,sum)
SS.Tot=sum(y^2)-sum(sum(Ti)^2)/N
k = rep(1:(a*b),each=n)
SS.trt = 0 ; for (i in 1:(a*b)) { SS.trt = SS.trt + ((sum(y[k==i]))^2/n) }
SS.trt = SS.trt-sum(sum(Ti)^2)/N
SS.A=sum(Ti^2)/(b*n)-sum(sum(Ti)^2)/N ; SS.B=sum(Tj^2)/(a*n)-sum(sum(Ti)^2)/N
SS.AB=SS.trt-SS.A-SS.B ; SS.E=SS.Tot-SS.trt
df.A=a-1 ; df.B=b-1 ; df.AB=(a-1)*(b-1) ; df.E=a*b*(n-1)
MS.A=SS.A/df.A ; MS.B=SS.B/df.B ; MS.AB=SS.AB/df.AB ; MS.E=SS.E/df.E
F.valuea=MS.A/MS.E ; pvaluea=pf(abs(F.valuea),df.A,df.E,lower.tail = F)
F.valueb=MS.B/MS.E ; pvalueb=pf(abs(F.valueb),df.B,df.E,lower.tail = F)
F.valueab=MS.AB/MS.E ; pvalueab=pf(abs(F.valueab),df.AB,df.E,lower.tail = F)
A.data=c(df.A,SS.A,MS.A,F.valuea,pvaluea) ; B.data=c(df.B,SS.B,MS.B,F.valueb,pvalueb)
AB.data=c(df.AB,SS.AB,MS.AB,F.valueab,pvalueab) ; error.data=c(df.E,SS.E,MS.E,NA,NA)
data=rbind(A.data, B.data, AB.data, error.data)
colnames(data)=c("Df","Sum Sq","Mean Sq","F value","Pr(>F))") ; rownames(data)=c("A","B","A:B","Residuals")
print(data)
}
sm.fac2(A,B,y)
# Original (compare with it)
anova(lm(y~A*B))
# Factorial - Three Factor
본인이 식을 만들면서, 꼬이지만 않으면 역시 어렵지 않은 프로그램이라고 생각됩니다
# DATA
A<-gl(3,24)
B<-gl(4,3,72)
C<-gl(2,12,72)
y=c(42,34,33, 47,59,43, 55,66,69, 64,60,94,
38,34,41, 38,41,32, 48,26,63, 58,51,67,
28,28,11, 52,33,29, 54,36,53, 63,55,40,
22,8,3, 33,15,24, 27,41,38, 41,40,64,
34,19,17, 22,3,17, 17,40,40, 33,34,40,
10,8,0, 16,10,1, 22,6,38, 29,24,20)
sm.fac3=function(A,B,C,y){
a=length(unique(A)); b=length(unique(B)); c=length(unique(C)); n=length(y)/(a*b*c)
df.a=a-1; df.b=b-1; df.c=c-1
df.ab=(a-1)*(b-1); df.ac=(a-1)*(c-1); df.bc=(b-1)*(c-1)
df.abc=(a-1)*(b-1)*(c-1) ; df.e=a*b*c*(n-1); df.tot=length(y)-1
at=abt2=act2=abct2=ct=cbt2=bt2=0
for(i in 1:b){
for(j in 1:a){
at=at+sum(y[A==unique(A)[j]])^2
abt2=abt2+sum(y[A==unique(A)[j]&B==unique(B)[i]])^2
for(k in 1:c){
act2=act2+sum(y[A==unique(A)[j]&C==unique(C)[k]])^2
abct2=abct2+sum(y[A==unique(A)[j]&B==unique(B)[i]&C==unique(C)[k]])^2 #
ct=ct+sum(y[C==unique(C)[k]])^2
cbt2=cbt2+sum(y[C==unique(C)[k]&B==unique(B)[i]])^2
}
}
abct=sum(y)
abc2t=sum(y^2)
bt2=bt2+sum(y[B==unique(B)[i]])^2
}
at=at/b
ct=ct/(a*b)
act2=act2/b
cbt2=cbt2/a
ssa=at/(b*c*n) - abct^2/length(y)
ssb=bt2/(a*c*n) - abct^2/length(y)
ssc=ct/(a*b*n) - abct^2/length(y)
ssab=abt2/(c*n) - ssa-ssb -abct^2/length(y)
ssbc=cbt2/(a*n) - ssb-ssc -abct^2/length(y)
ssac=act2/(b*n) - ssc-ssa -abct^2/length(y)
sstr=abct2/n - abct^2/length(y)
ssabc=sstr- ssa-ssb-ssc-ssab-ssbc-ssac
sstot=abc2t - abct^2/length(y)
sse=sstot-ssa-ssb-ssc-ssab-ssbc-ssac-ssabc
msa=ssa/df.a
msb=ssb/df.b
msc=ssc/df.c
msab=ssab/df.ab
msac=ssac/df.ac
msbc=ssbc/df.bc
msabc=ssabc/df.abc
mse=sse/df.e
f.a=msa/mse ; pval.a=pf(abs(f.a),df.a,df.e,lower.tail = F)
f.b=msb/mse ; pval.b=pf(abs(f.b),df.b,df.e,lower.tail = F)
f.c=msc/mse ; pval.c=pf(abs(f.c),df.c,df.e,lower.tail = F)
f.ab=msab/mse ; pval.ab=pf(abs(f.ab),df.ab,df.e,lower.tail = F)
f.ac=msac/mse ; pval.ac=pf(abs(f.ac),df.ac,df.e,lower.tail = F)
f.bc=msbc/mse ; pval.bc=pf(abs(f.bc),df.bc,df.e,lower.tail = F)
f.abc=msabc/mse ; pval.abc=pf(abs(f.abc),df.abc,df.e,lower.tail = F)
AA=round(c(df.a,ssa,msa),0)
BB=round(c(df.b,ssb,msb),0)
CC=round(c(df.c,ssc,msc),0)
AB=round(c(df.ab,ssab,msab),0)
AC=round(c(df.ac,ssac,msac),0)
BC=round(c(df.bc,ssbc,msbc),0)
ABC=round(c(df.abc,ssabc,msabc),0)
error=round(c(df.e,sse,mse,NA,NA),0)
total=round(c(df.tot,sstot,NA,NA,NA),0)
anov=matrix(c(AA,round(f.a,3),pval.a,
BB,round(f.b,3),pval.b,
CC,round(f.c,3),pval.c,
AB,round(f.ab,3),pval.ab,
AC,round(f.ac,3),pval.ac,
BC,round(f.bc,3),pval.bc,
ABC,round(f.abc,3),pval.abc,error,total),
ncol=5,byrow=T,
dimnames = list(c("a","b","c","a:b","a:c","b:c","a:b:c","Residuals","Total"),
c("DF","Sum Sq","Mean Sq","F value","Pr(>F)")))
anov
}
sm.fac3(A,B,C,y)
# Original (compare with it)
summary(aov(y~A*B*C))
# Regression - yhat=b0+b1*x , C.I
회귀분석 시간에 배운 Sxx, Sxy, Syy 식만 제대로 입력해주면 어렵지는 않습니다
# DATA
x <- c(1.35, 1.9, 1.7, 1.8, 1.3,
2.05, 1.6, 1.8, 1.85, 1.4)
y <- c(17.9, 16.5, 16.4, 16.8, 18.8,
15.5, 17.5, 16.4, 15.9, 18.3)
# f.t
sm.yhat <- function(x,y,x0,U,level){
n <- length(y)
model <- lm(y~x)
df <- (n-2)
sxx=sum((x-mean(x))^2) ; syy=sum((y-mean(y))^2) ; sxy=sum((x-mean(x))*(y-mean(y)))
b1 <- sxy/sxx ; b0 <- mean(y)-b1*mean(x)
sse <- syy-b1*sxy
s <- sqrt(sse/(n-2))
lowerbound <- b0+b1*x0+qt((1-level)/2,df)*s*sqrt(1/n+(x0-mean(x))^2/sxx)
upperbound <- b0+b1*x0-qt((1-level)/2,df)*s*sqrt(1/n+(x0-mean(x))^2/sxx)
CI <- c()
CI <- c(lowerbound," , ",upperbound)
tstar <- abs((b0+b1*x0-U)/( s*sqrt(1/n+(x0-mean(x))^2/sxx) ))
pv <- pt(tstar, (n-2), lower.tail = F)
cat (100*level,"% C.I for b0+b1*",x0," = (",CI,")",
"\n t-star = ",tstar, ", p-value = ",pv,sep = "")
}
sm.yhat(x,y,x0=1.7,U=0,level=0.9)
# Regression - Y|x=x0 , C.I
바로 위의 회귀분석과 식 구성이 매우 동일해보이지만, 'Y|x=x0 일 때는 lower, upper bound 에 +1 이 붙음' 을 꼭 아셔야합니다
# DATA
x <- c(1.35, 1.9, 1.7, 1.8, 1.3,
2.05, 1.6, 1.8, 1.85, 1.4)
y <- c(17.9, 16.5, 16.4, 16.8, 18.8,
15.5, 17.5, 16.4, 15.9, 18.3)
# f.t
sm.xgiven <- function (x,y,x0,U,level) {
n <- length(y)
model <- lm(y~x)
df <- (n-2)
sxx=sum((x-mean(x))^2) ; syy=sum((y-mean(y))^2) ; sxy=sum((x-mean(x))*(y-mean(y)))
b1 <- sxy/sxx ; b0 <- mean(y)-b1*mean(x)
sse <- syy-b1*sxy
s <- sqrt(sse/(n-2))
lowerbound <- b0+b1*x0+qt((1-level)/2,df)*s*sqrt(1+1/n+(x0-mean(x))^2/sxx)
upperbound <- b0+b1*x0-qt((1-level)/2,df)*s*sqrt(1+1/n+(x0-mean(x))^2/sxx)
CI <- c()
CI <- c(lowerbound,upperbound)
tstar <- abs((b0+b1*x0-U)/( s*sqrt(1+1/n+(x0-mean(x))^2/sxx) ))
pv <- pt(tstar, (n-2), lower.tail = F)
cat (100*level,"% C.I for Y|x, x=",x0,"= (",CI,")",
"\n t-star =",tstar, ", p-value =",pv)
}
sm.xgiven(x,y,x0=1.7,U=0,level=0.90)