#L1 probabilities pnorm(1.96) qnorm(0.95) pt(2,3) pchisq(2,3) qchisq(0.1,5) #L2 One sample t-test - beer content x<- c(374.8, 375.0, 375.3, 374.8, 374.4, 374.9) t.test(x, alternative="less", mu=375) n=length(x) mu0=375 xbar=mean(x) s=sd(x) t0=(xbar-mu0)/(s/sqrt(n)) p.value=pt(t0,n-1) c(n,xbar,s,t0,p.value) #L2 One sample t-test - Sales contacts x<-c(18,17,18,13,15,16,21,14,24,12,19,18,17,16,15,14,17,18,19,20,13,14,12,15) n=length(x) mu0=15 qqnorm(x) qqline(x) t.test(x, alternative="greater",mu=15) barx=mean(x) s=sqrt(var(x)) t0=(barx - mu0)/(s/sqrt(n)) p=pt(t0,n-1,lower.tail=F) c(barx,s,t0,p) #L2 Paired sample t-test - Smoking x<- c(25, 25, 27, 44, 30, 67, 53, 53, 52, 60, 28) y<- c(27, 29, 37, 36, 46, 82, 57, 80, 61, 59, 43) d=y-x d qqnorm(d) qqline(d) t.test(x,y, alternative="two.sided", mu=0, paired=T) #L3 One sample z-test - Birthweights x=c(79,92,88,98,109,109,112,88,105,89,121,71,110,96) qqnorm(x) qqline(x) n=length(x) mu0=109 xbar=mean(x) sd0=15 t0=(xbar-mu0)/(sd0/sqrt(n)) p.value=pnorm(t0) c(n,xbar,sd0,t0,p.value) #L4 Rejection region - height comparison x<- c(25, 25, 27, 44, 30, 67, 53, 53, 52, 60, 28) y<- c(27, 29, 37, 36, 46, 82, 57, 80, 61, 59, 43) d=y-x d sdd = sd(d) sdd n=11 cv1=qt(0.975,10) cv1 cvlower=-qt(0.975,10)*sdd/sqrt(n) cvupper=qt(0.975,10)*sdd/sqrt(n) cvlower cvupper #L7 Binomial test - Flu Vaccine n=12 p0=0.2 x=5 binom.test(x,n,p0,alt="greater",0.95) p.value=pbinom(x-1,n,p0,lower.tail=F) # exact p-value p.value #L7 Binomial test - Washers n=100 p0=1/3 x=40 binom.test(x,n,p0,alt="greater",0.95) z10=(x-0.5-n*p0)/sqrt(n*p0*(1-p0)) sp=x/n z20=(sp-p0)/sqrt(p0*(1-p0)/n) p.value.exact=pbinom(x-1,n,p0,lower.tail=F) # exact p-value p.value.norm=pnorm(z10,lower.tail=F) #normal approx. p.value.ztest=pnorm(z20,lower.tail=F) #z.test c(sp,z10,z20) c(p.value.exact,p.value.norm,p.value.ztest) #L8 Sign test Moisture retention y=c(97.5,95.2,97.3,96.0,96.8,99.8,97.4,95.3,98.2,99.1,96.1,97.6,98.2,98.5,99.4) mu0=96 diff=y-mu0 diff n=length(diff[diff!=0]) x=length(diff[diff>0]) ps=x/n p0=0.5 boxplot(y) title("Boxplot of Moisture retention") binom.test(x,n,0.5,alt="greater",0.95) p.value=pbinom(x-1,n,0.5,lower.tail=FALSE) p.value CI.lower=ps-qnorm(0.95)*sqrt(ps*(1-ps)/n) c(n,x,ps,p.value,CI.lower) #L8 Sign test - Smoking before=c(25,25,27,44,30,67,53,53,52,60,28) after=c(27,29,37,36,46,82,57,80,61,59,43) diff=before-after diff n=length(diff[diff!=0]) x=length(diff[diff>0]) c(n,x) binom.test(x,n,0.5,alt="two.sided",0.95) p.value=2*pbinom(x,n,0.5) p.value #L9 Diet X and Y y=c(85,69,81,112,77,86) x=c(83,78,70,72,67,68) wilcox.test(y,x,alternative="greater",mu=0,paired=T,exact=T,correct=F) d=y-x #checking only d n=length(d) r = rank(abs(d)) r sign.r=r*sign(d) sign.r w.plus = sum(r[d>0]) w.minus = sum(r[d<0]) w = min(w.plus, w.minus) p.value=psignrank(w,n) c(n,w.plus,w.minus,w,p.value) #Smoking x=c(25,25,27,44,30,67,53,53,52,60,28) y=c(27,29,37,36,46,82,57,80,61,59,43) wilcox.test(x,y,alternative="two.sided",mu=0,paired=T,exact=F,correct=F) d=x-y #checking only d r = rank(abs(d)) r sign.r=r*sign(d) sign.r w.plus = sum(r[d>0]) w.minus = sum(r[d<0]) w = min(w.plus, w.minus) ew.plus = sum(r[d!=0])/2 varw.plus = sum((r[d!=0])^2)/4 z0=(w.plus-ew.plus)/sqrt(varw.plus) p.value=2*pnorm(z0) c(w.plus,w.minus,w,ew.plus) c(varw.plus,z0,p.value) boxplot(d) #check symmetric data distribution #L10 Transformation of data dat=read.table("SurveyAge.txt") x=dat[,1] par(mfrow=c(2,2)) hist(x,nclass=25) hist(log(x-16),nclass=16) hist(log(x-16.7),nclass=20) #L11 2 sample t-test height comparison x=c(135.3,137.0,136.0,139.7,136.5,137.2,138.8,139.6,140.0,137.7,135.5,134.9,139.5) y=c(140.3,139.8,138.6,137.1,140.0,136.2,138.7,138.5,134.9,141.0) mx=mean(x) my=mean(y) nx=length(x) ny=length(y) mxy=c(rep(mx,nx),rep(my,ny)) #create a vector of mean par(mfrow=c(1,2)) boxplot(x,y) #joint boxplots of x and y qqnorm(c(x,y)-mxy) qqline(c(x,y)-mxy) #qqplot of the combined residuals t.test(x,y,alternative="less",mu=0,var.equal=T) t.test(x,y,alternative="less",mu=0,var.equal=F,conf.level=.90) #L11 2 sample z-test High fiber cereal m=43 n=107 meanx=604.01 meany=633.23 sdx=64.05 sdy=103.29 z=(meanx-meany)/sqrt(sdx^2/m+sdy^2/n) z p.value=pnorm(z) p.value #L12 Wilcoxon rank sum test compare methods A & B A=c(32,29,35,28) B=c(27,31,26,25,30) wilcox.test(A,B,alternative="two.sided",mu=0,exact=T,correct=F) nx=length(A) ny=length(B) N=nx+ny rankA=rank(c(A,B))[1:nx] rankA rankB=rank(c(A,B))[(nx+1):N] rankB w=sum(rankA) Ew=nx*(N+1)/2 wt=2*Ew-w min=nx*(nx+1)/2 max=nx*(N+ny+1)/2 p.value=2*pwilcox(wt-min,nx,ny) p.value c(w,wt,Ew,min,max) #L12 Wilcoxon rank sum test - latent heat of fusion icea=c(79.98,80.04,80.02,80.04,80.03,80.03,80.04,79.97,80.05,80.03,80.02,80.00,80.02) iceb=c(80.02,79.94,79.98,79.97,79.97,80.03,79.95,79.97) wilcox.test(icea,iceb,alternative="greater",mu=0,exact=F,correct=F) rank=rank(c(icea,iceb)) #checking only nx=length(icea) ny=length(iceb) N=nx+ny rankA=rank(c(icea,iceb))[1:nx] rankA rankB=rank(c(icea,iceb))[(nx+1):N] rankB sum(rankB) w=sum(rankA) EW=nx*(nx+ny+1)/2 sumsqrank=sum(rank^2) g=N*(N+1)^2/4 varW=nx*ny*(sumsqrank-g)/(N*(N-1)) z0=(w-EW)/sqrt(varW) p.value=1- pnorm(z0) min=nx*(nx+1)/2 max=nx*(N+ny+1)/2 wr=w-min c(w,min,max,wr,EW) round(c(sumsqrank,g,varW,z0,p.value),digit=5) #L13 One sample test on variance - variance of diameter n=10 sigma20=0.0002 s2=0.00018 chi20=(n-1)*s2/sigma20 chi20 p.value=pchisq(chi20,n-1) p.value alp=0.05 infty=9999999 neginfty=-1*infty CI.upper=c((n-1)*s2/qchisq(1-alp,n-1),infty) #H1: sigma^2>sigma0^2 CI.upper CI.lower=c(neginfty,(n-1)*s2/qchisq(alp,n-1)) #H1: sigma^2sigmax^2 CI.upper CI.2sided=c(var(y)/(var(x)*qf(1-alp/2,n1-1,n2-1)),var(y)/(var(x)*qf(alp/2,n1-1,n2-1))) CI.2sided #L16 1-way ANOVA test Teaching technques mark=c(65,87,73,79,81,69,75,69,83,81,72,79,90,59,78,67,62,83,76,94,89,80,88,85) mark.f=factor(rep(letters[1:4],c(6,7,6,5))) mark.f aov.mark=aov(mark~mark.f) summary(aov.mark) mark1=mark[1:6] mark2=mark[7:13] mark3=mark[14:19] mark4=mark[20:24] par(mfrow=c(2,2)) boxplot(mark1,mark2,mark3,mark4) title("boxplot for mean mark") plot(aov.mark$fitted,aov.mark$resid) abline(h=0) title("Residual plot") qqnorm(resid(aov.mark)) qqline(resid(aov.mark)) m=mean(mark) m N=length(mark) N CM=N*m^2 CM m1=mean(mark1) m2=mean(mark2) m3=mean(mark3) m4=mean(mark4) meani=c(m1,m2,m3,m4) meani n1=length(mark1) n2=length(mark2) n3=length(mark3) n4=length(mark4) ni=c(n1,n2,n3,n4) ni SST=sum(ni*meani^2)-CM SST sumsq=sum(mark^2) sumsq SSTo=sumsq-CM SSTo v1=var(mark1) v2=var(mark2) v3=var(mark3) v4=var(mark4) vi=c(v1,v2,v3,v4) vi SSR=sum((ni-1)*vi) SSR g=4 f0=(SST/(g-1))/(SSR/(N-g)) f0 p.value=1-pf(f0,g-1,N-g) p.value #show that F test is a generalization of t test N.12=n1+n2 SSR.12=v1*(n1-1)+v2*(n2-1) MSR.12=SSR.12/(N.12-2) t0.12=(m1-m2)/sqrt(MSR.12*(1/n1+1/n2)) #t-test pvt.12=2*(1-pt(abs(t0.12),N.12-2)) #two-sided y.12=c(mark1,mark2) f.12=factor(rep(letters[1:2],c(n1,n2))) aov.12=aov(y.12~f.12) summary(aov(y.12~f.12)) m.12=mean(c(mark1,mark2)) SST.12=(n1*m1^2+n2*m2^2)-N.12*m.12^2 f0.12=(SST.12/(2-1))/(SSR.12/(N.12-2)) #one-way ANOVA test pvf.12=1-pf(f0.12,2-1,N.12-2) #upper-sided c(t0.12^2,f0.12,pvt.12,pvf.12) #pairwise test ni.m=matrix(rep(ni,g),nr=g,byrow=T) ni.m nj.m=t(ni.m) nj.m mi.m=matrix(rep(mi,g),nr=g,byrow=T) mi.m mj.m=t(mi.m) mj.m diff=mj.m-mi.m diff RMSR=sqrt(sum((ni-1)*vi)/(N-g)) RMSR t0=diff/(RMSR*sqrt(1/ni.m+1/nj.m)) round(t0,3) p.value=2*(1-pt(abs(t0),N-g)) round(p.value,4) r=g*(g-1)/2 c(r,0.05/r) p.value<0.05/r #L16 ANOVA test Iron iron=read.csv("iron.csv") attach(iron) summary=apply(iron,2,summary) summary par(mfrow=c(1,2)) boxplot(iron) title("boxplot for iron") boxplot(log(iron)) title("boxplot for log(iron)") ironv=as.matrix(iron) #not for iron_v.csv ironv=as.vector(ironv) #not for iron_v.csv ironv factor=factor(rep(letters[1:6],c(18,18,18,18,18,18))) #not for iron_v.csv aov.iron=aov(ironv~factor) summary(aov.iron) ironlog=log(ironv) aov.ironlog=aov(ironlog~factor) summary(aov.ironlog) centre=matrix(summary[3,],ncol=6,nrow=18,byrow=T) centre residual=iron-centre residual boxplot(data.frame(residual)) #L17 Individual comparisons Chemical tested lab=read.csv("lab.csv") attach(lab) labv=as.matrix(lab) #not for lab_v.csv labv=as.vector(labv) #not for lab_v.csv labv N=length(labv) n=length(lab[,1]) g=length(lab[1,]) c(N,n,g) lab.f=factor(rep(letters[1:7],c(10,10,10,10,10,10,10))) #not for lab_v.csv lab.f aov.lab=aov(labv~lab.f) summary(aov.lab) par(mfrow=c(1,2)) boxplot(data.frame(lab)) title("boxplot of 7 labs") qqnorm(resid(aov.lab)) qqline(resid(aov.lab)) lab=matrix(labv,n,g,byrow=F) #create matrix, for lab_v.csv only g.mean=apply(lab,2,mean) g.mean yi.bar=matrix(rep(g.mean,g),nr=g,byrow=T) yj.bar=t(yi.bar) # transpose of xi.bar diff=yi.bar-yj.bar g.var=apply(lab,2,var) round(g.var,5) RMSR=sqrt((n-1)*sum(g.var)/(N-g)) #n-1=10-1=9, N-g=70-7=63 RMSR t0=diff/(RMSR*sqrt(2/n)) round(t0,3) #to 3 dec. place p.value=2*(1-pt(abs(t0),(N-g))) #to 4 dec. place p.value=matrix(p.value,nr=g,byrow=T) round(p.value,4) r=g*(g-1)/2 #no. of pairs r=7(7-1)/2=21 round(c(r,0.1/r,0.01/r ),6) p.value<0.1/r #al=0.1 p.value<0.01/r #al=0.01 #L18 KW test Chemical tested rm(list=ls()) lab=read.csv("lab.csv") attach(lab) labv=as.matrix(lab) #change matrix to vector labv=as.vector(labv) labv lab.f=factor(rep(letters[1:7],c(10,10,10,10,10,10,10))) lab.f kruskal.test(labv,lab.f) rankv=rank(labv) #checking only rankv rankm=matrix(rankv, nc=7) dimnames(rankm)=list(NULL,paste("Lab", 1:7)) rankm mean.rank =apply(rankm, 2, mean) #group mean by col mean.rank N=length(rankv) g=length(rankm[1,]) ni=length(rankm[,1]) c(N,g,ni) sumsq.mrank=sum(mean.rank^2) sumsq.rank=sum(rankv^2) barmean=mean(rankv) c(sumsq.mrank,sumsq.rank,barmean) k0=(N-1)*(ni*sumsq.mrank-N*barmean^2)/(sumsq.rank-N*barmean^2) p.value=1-pchisq(k0,g-1) c(k0,p.value) k0r=12/(N*(N+1))*ni*sum(mean.rank^2)-3*(N+1) #just when no ties k0r #to show that KW test is a generalization of WRS test lab=read.csv("lab.csv") lab1=lab[,1] lab2=lab[,2] n=length(lab1) wilcox.test(lab1,lab2,alternative="two.sided",mu=0,exact=F,correct=F) rank=rank(c(lab1,lab2)) #WRS test N=2*n rankA=rank(c(lab1,lab2))[1:n] rankA rankB=rank(c(lab1,lab2))[(n+1):N] rankB w=sum(rankA) EW=n*(N+1)/2 sumsqrank=sum(rank^2) g=N*(N+1)^2/4 varW=n*n*(sumsqrank-g)/(N*(N-1)) z0=(w-EW)/sqrt(varW) p.value=2*(1-pnorm(abs(z0))) #2-sided c(z0^2,p.value) lab12v=c(lab1,lab2) lab12.f=factor(rep(letters[1:2],c(10,10))) kruskal.test(lab12v,lab12.f) #KW test #L19 2 way ANOVA model Penicillin pen=read.csv("penicillin.csv") attach(pen) pen r=5 c=4 penv=as.matrix(pen) #for penicillin_v.csv penv=as.vector(penv) #not for penicillin_v.csv penv #pen=matrix(penv,r,c,byrow=F) #create matrix, for pencicillin_v.csv only mean=mean(penv) mean mean.tr=apply(pen,2,mean) mean.tr mean.bl=apply(pen,1,mean) mean.bl mean.m=matrix(mean,r,c) mean.m mean.trm=matrix(mean.tr,r,c,byrow=T) mean.trm mean.blm=matrix(mean.bl,r,c,byrow=F) mean.blm tr.effect=mean.trm-mean.m tr.effect bl.effect=mean.blm-mean.m bl.effect fitted=mean.m+tr.effect+bl.effect fitted resid=pen-fitted resid #L19 2 way ANOVA test Penicillin r=length(pen[,1]) c=length(pen[1,]) factor.bl=factor(rep(letters[1:5],4)) #not for penicillin_v.csv factor.bl factor.tr=factor(rep(letters[1:4],c(5,5,5,5))) #not for penicillin_v.csv factor.tr pen.aov=aov(penv~factor.bl+factor.tr) summary(pen.aov) par(mfrow=c(2,2)) plot(pen.aov$fitted,pen.aov$resid) abline(h=0) title("residual plot") qqnorm(pen.aov$resid) qqline(pen.aov$resid) cm=(sum(penv))^2/(r*c) #check p-values SSTo=sum(penv^2)-cm SSB=c*sum(mean.bl^2)-cm SST=r*sum(mean.tr^2)-cm SSR=SSTo-SSB-SST c(cm,SSTo,SSB,SST,SSR) MST=SST/(c-1) MSB=SSB/(r-1) MSR=SSR/((r-1)*(c-1)) F.tr=MST/MSR F.bl=MSB/MSR p.tr=1-pf(F.tr,c-1,(c-1)*(r-1)) p.bl=1-pf(F.bl,r-1,(c-1)*(r-1)) c(MST,MSB,MSR,F.tr,F.bl,p.tr,p.bl) #to show that 2-way ANOVA test is a generalization of match pair t test pen1=pen[,1] pen2=pen[,2] d=pen1-pen2 dm=mean(d) dv=var(d) n=length(pen1) Nr=2*n t0=dm/sqrt(dv/n) #t-test pvt=2*(1-pt(abs(t0),n-1)) #two-sided c(t0^2,pvt) pen.12=c(pen1,pen2) fb.12=factor(rep(letters[1:5],2)) ft.12=factor(rep(letters[1:2],c(5,5))) aov.12=aov(pen.12~fb.12+ft.12) summary(aov.12) #L21 Friedman test Penicillin pen=read.csv("penicillin.csv") attach(pen) pen penv=as.matrix(pen) #for matrix format only; create vector penv=as.vector(penv) penv factor.bl=factor(rep(letters[1:5],4)) #for matrix format factor.bl factor.tr=factor(rep(letters[1:4],c(5,5,5,5))) #for matrix format factor.tr friedman.test(penv,factor.tr,factor.bl) rank.pen=apply(pen, 1, rank) rank.pen rank.pen=t(rank.pen) #transpose dimnames(rank.pen)=list(paste("Blend", 1:5), paste("Method", 1:4)) rank.pen r=length(rank.pen[,1]) #for checking c=length(rank.pen[1,]) c(r,c) bar.r=mean(rank.pen) bar.r bar.rj=apply(rank.pen,2,mean) bar.rj SST=r*sum(bar.rj^2)-r*c*(bar.r^2) MSR=(sum(rank.pen^2)-r*c*(bar.r^2))/(r*(c-1)) q0=SST/MSR p.value=1-pchisq(q0,c-1) c(SST,MSR,q0,p.value) #to show that Friedman test is a generalization of sign test x=length(d[d>0]) n=length(d[d!=0]) z0=(x-n/2)/sqrt(n/4) #z-test pvz=2*(1-pnorm(abs(z0))) #two-sided friedman.test(pen.12,ft.12,fb.12) c(z0^2,pvz) #L22 two-way ANOVA with replicates Survival poison=read.csv("poison.csv") attach(poison) poison c=4 r=3 m=length(poison[,1])/3 m N=r*c*m poisonv=as.matrix(poison) #for matrix format only; create vector poisonv=as.vector(poisonv) poisonv poisonv=as.numeric(poisonv[13:(N+12)]) poisonv=c(poison[,2],poison[,3],poison[,4],poison[,5]) poisonv factor.bl=factor(rep(rep(letters[1:3],c(m,m,m)),c)) factor.bl factor.tr=factor(rep(letters[1:4],c(r*m,r*m,r*m,r*m))) factor.tr poison.aov=aov(poisonv~factor.tr*factor.bl) summary(poison.aov) poisonv=read.csv("poisonv.csv") attach(poisonv) poisonv treatment=rep(1:c,c(r*m,r*m,r*m,r*m)) type=rep(rep(1:r,c(m,m,m)),c) vector=cbind(poisonv,type,treatment) vector poison1.aov=aov(poisonv~type*treatment) summary(poison1.aov) treatment=as.factor(treatment) type=as.factor(type) poison2.aov=aov(poisonv~treatment*type) summary(poison2.aov) poison3.aov=aov(poisonv~treatment+type) summary(poison3.aov) poison4.aov=aov(poisonv~treatment) summary(poison4.aov) par(mfrow=c(2,2)) interaction.plot(factor.bl,factor.tr,poisonv) title("Interaction plot") interaction.plot(factor.tr,factor.bl,poisonv) title("Interaction plot") plot(poison.aov$fitted,poison.aov$resid) abline(h=0) title("Residual plot") qqnorm(poison.aov$resid) qqline(poison.aov$resid) mean=mean(poisonv) mean mean.g=tapply(poisonv,list(factor.bl,factor.tr),mean) mean.g mean.bl=apply(mean.g,1,mean) mean.bl alpha=mean.bl-mean alpha mean.tr=apply(mean.g,2,mean) mean.tr beta=mean.tr-mean beta delta=mean.g-matrix(mean.tr,r,c,byrow=T)- matrix(mean.bl,r,c,byrow=F)+mean delta var.g=tapply(poisonv,list(factor.bl,factor.tr),var) var.g sumsq=sum(poisonv^2) sumsq.bl=sum(mean.bl^2) sumsq.tr=sum(mean.tr^2) c(sumsq,sumsq.bl,sumsq.tr) cm=m*c*r*mean^2 SSTo=sumsq-cm SST=m*r*sumsq.tr-cm SSB=m*c*sumsq.bl-cm SSR=(m-1)*sum(var.g) SSI=SSTo-SST-SSB-SSR c(cm,SSTo,SST,SSB,SSR,SSI) MST=SST/(c-1) MSB=SSB/(r-1) MSR=SSR/(r*c*(m-1)) MSI=SSI/((r-1)*(c-1)) c(MST,MSB,MSR,MSI) Ft=MST/MSR Fb=MSB/MSR Fi=MSI/MSR pt=1-pf(Ft,r-1,r*c*(m-1)) pb=1-pf(Fb,c-1,r*c*(m-1)) pi=1-pf(Fi,(r-1)*(c-1),r*c*(m-1)) c(Ft,Fb,Fi,pt,pb,pi) #L23 Regression Predicting height x=c(39,30,32,34,35,36,36,30,33,37,33,38,32,35) #height at 2 y=c(71,63,63,67,68,68,70,64,65,68,66,70,64,69) #height as adult #y=sqrt(y) ls.height=lsfit(x,y) ls.height$coef ls.height$residuals par(mfrow=c(2,2)) plot(x,y) abline(ls.height) title("Fitted line plot") plot(x,ls.height$residuals) abline(h=0) title("Residual plot") qqnorm(ls.height$resid) qqline(ls.height$resid) n=length(x) sx=sum(x) sy=sum(y) sx2=sum(x^2) sy2=sum(y^2) sxy=sum(x*y) c(sx,sy,sx2,sy2,sxy) ym=mean(y) xm=mean(x) Sxx=sum(x^2)-n*xm^2 Sxy=sum(x*y)-n*xm*ym Syy=sum(y^2)-n*ym^2 c(xm,ym,Sxx,Syy,Sxy) beta=Sxy/Sxx alpha=ym-beta*xm r=Sxy/sqrt(Sxx*Syy) r2=r^2 c(alpha,beta,r,r2) SST=beta*Sxy SST SSR=Syy-SST SSR s2=SSR/(n-2) s2 f0=SST/s2 f0 t0=sqrt(f0) t0 p.value=1-pf(f0,1,n-2) p.value c(SST,SSR,s2,f0,t0,p.value) alp=0.05 t=qt(1-alp/2,n-2) t CIbeta.low=beta-t*sqrt(s2/Sxx) CIbeta.up=beta+t*sqrt(s2/Sxx) c(CIbeta.low,CIbeta.up) CIalpha.low=alpha-t*sqrt(s2/n+s2*xm^2/Sxx) CIalpha.up=alpha+t*sqrt(s2/n+s2*xm^2/Sxx) c(CIalpha.low,CIalpha.up) x0=40 y0=alpha+beta*x0 y0 alp=0.1 t=qt(1-alp/2,n-2) t se.est=sqrt(s2*(1/n+(x0-xm)^2/Sxx)) se.est se.pre=sqrt(s2*(1+1/n+(x0-xm)^2/Sxx)) se.pre CIest.low=y0-t*se.est CIest.up=y0+t*se.est c(CIest.low,CIest.up) CIpre.low=y0-t*se.pre CIpre.up=y0+t*se.pre c(CIpre.low,CIpre.up) r2=SST/Syy r2 r=sqrt(r2) r t0=r*sqrt((n-2)/(1-r^2)) t0 p.value=2*(1-pt(abs(t0),n-2)) p.value cor.test(x,y,alt="two.sided") #prediction and estimation interval lm1 = lm(y~x) py0=predict(lm1,newdata = data.frame(x=40),interval="predict") #(e) prediction interval ey0=predict(lm1,newdata = data.frame(x=40),interval="confidence") #(f) estimation interval x0=c(40,40,40) # Comparison between prediction and confidence intervals xx = seq(30,40,by=0.1) newdata = data.frame(x=xx) PI = predict(lm1,newdata,interval="predict") CI = predict(lm1,newdata,interval="confidence") Height_two=x Height_adult=y plot(Height_adult~Height_two,xlim = c(30,40), ylim = c(60,75), pch=20,col="red") #both prediction & est interval abline(lm1,col = "red") lines(PI[,2]~xx,type="l",lwd=2) lines(PI[,3]~xx,type="l",lwd=2) lines(CI[,2]~xx,type="l",lty = 2,lwd=2) lines(CI[,3]~xx,type="l",lty = 2,lwd=2) points(x0,py0,pch=1) points(x0,ey0,pch=19) legend("topleft",legend=c("LS Regression Line","Prediction Interval","Estimation Interval"), lwd=c(1,2,2),lty = c(1,1,2),col = c("red","black","black")) plot(Height_adult~Height_two,xlim = c(30,40), ylim = c(60,75), pch=20,col="red") #est interval abline(lm1,col = "red") lines(CI[,2]~xx,type="l",lwd=2) lines(CI[,3]~xx,type="l",lwd=2) #legend("topleft",legend=c("LS Regression Line","Estimation Interval"), lwd=c(1,2,2),lty = c(1,1,2),col = c("red","black","black")) title("Estimation interval") plot(Height_adult~Height_two,xlim = c(30,40), ylim = c(60,75), pch=20,col="red") abline(lm1,col = "red") lines(PI[,2]~xx,type="l",lwd=2) lines(PI[,3]~xx,type="l",lwd=2) #legend("topleft",legend=c("LS Regression Line","Prediction Interval"), lwd=c(1,2,2),lty = c(1,1,2),col = c("red","black","black")) title("Prediction interval") #L24 Robust regression fuel data fuel=read.csv("data/fuel.csv") attach(fuel) fuel x=fuel[,2] # Weight of cars y=fuel[,4] # Mileage per gallon ls.fuel=lsfit(x, y) # y=a+bx beta=ls.fuel$coef[2] names(beta)=NULL alpha=ls.fuel$coef[1] names(alpha)=NULL c(alpha,beta) par(mfrow=c(2,1)) plot(x, y) abline(ls.fuel) plot(x,ls.fuel$resid) abline(h=0) ls.fuel=lsfit(x, 1/y) # 1/y=a+bx ls.fuel$coef par(mfrow=c(2,1)) plot(x, 1/y) abline(ls.fuel) plot(x, ls.fuel$resid) abline(h=0) x=fuel[,2] # Weight of cars y=fuel[,4] # Mileage per gallon y1=y y1[3]=370 # y[3]=37, changed to give an outlier ls.fuel=lsfit(x, y) # y=a+bx ls.fuel$coef ls.fuel1=lsfit(x, y1) # y1=a+bx ls.fuel1$coef par(mfrow=c(2,2)) plot(x, y) abline(ls.fuel) plot(x,ls.fuel$resid) abline(h=0) plot(x, y1) abline(ls.fuel1) plot(x,ls.fuel1$resid) abline(h=0) #L24 Regression correlation Automobile data x=c(89, 93, 87, 90, 89, 95, 100, 98) y=c(13, 13.2, 13, 13.6, 13.3, 13.8, 14.1, 14) cor.test(x,y,alt="two.sided") plot(x,y,pch=19) title("Scatter plot of Miles per gal. vs Octane") n=length(x) #checking Sxx=sum(x^2)-sum(x)^2/n Sxy=sum(x*y)-sum(x)*sum(y)/n Syy=sum(y^2)-sum(y)^2/n c(Sxx,Sxy,Syy) beta=Sxy/Sxx alpha=mean(y)-beta*mean(x) c(alpha,beta) SST=beta*Sxy r2=SST/Syy r=sqrt(r2) t0=r*sqrt((n-2)/(1-r^2)) p.value=2*(1-pt(abs(t0),n-2)) c(SST,r2,r,t0,p.value) x=c(89, 93, 87, 90, 89, 95, 100, 98) y=c(13, 13.2, 13, 13.6, 13.3, 13.8, 14.1, 14) x=c(46,53,37,42,34,29,60,44,41,48,33,40) y=c(12,14,11,13,10,8,17,12,10,15,9,13) #L28 Chi-square test Computer sold y=c(110,57,53,80) p=c(2/5,1/5,1/5,1/5) n=sum(y) k=length(y) ey=n*p ey ey<5 chi2=sum((y-ey)^2/ey) chi2 p.value=1-pchisq(chi2,k-1) p.value chisq.test(y,p=p) #L28 Chi-square test Genetic linkage y=c(128,86,74,112) p=c(1/4,1/4,1/4,1/4) k=length(y) ey=n*p ey ey<5 chi2=(y-ey)^2/ey chi2 chi2=sum(chi2) chi2 p.value=1-pchisq(chi2,k-1) p.value chisq.test(y,p=p) #L28 Chi-square test Genetic linkage y=c(128,86,74,112) n=sum(y) par=(y[2]+y[3])/n p=c((1-par)/2,par/2,par/2,(1-par)/2) p ey=n*p ey ey<5 chi2=(y-ey)^2/ey chi2 chi2=sum(chi2) chi2 p.value=1-pchisq(chi2,k-1-1) p.value chisq.test(y,p=p) #L29 Chi-square test for distribution Poisson with unknown lambda, Suicides y=c(33,17,7,3) x=c(0,1,2,3) n=sum(y) k=length(y) lam=sum(y*x)/n lam p=lam^x*exp(-1*lam)/factorial(x) p[4]=1-sum(p[1:3]) p ey=n*p ey ey<5 yr=c(y[1:2],y[3]+y[4]) yr eyr=c(ey[1:2],ey[3]+ey[4]) eyr pr=c(p[1:2],p[3]+p[4]) pr kr=length(yr) xr=x[1:kr] xr chi2=(yr-eyr)^2/eyr chi2 chi2=sum(chi2) chi2 p.value=1-pchisq(chi2,kr-1-1) p.value chisq.test(yr,p=pr) par(mfrow=c(2,2)) barplot(yr,names.arg=xr,col="lightgray",main="Observed frequency",cex.names=0.7) barplot(eyr,names.arg=xr,col="lightgray",main="Expected frequency",cex.names=0.7) #L29 Chi-square test for distribution Binomial with unknown p, Sales y=c(10,38,69,63,18,2) x=c(0,1,2,3,4,5) n=sum(y) m=max(x) k=length(y) prob=sum(y*x)/(m*n) prob p=dbinom(x,m,prob) p ey=n*p ey ey<5 yr=c(y[1:4],y[5]+y[6]) yr eyr=c(ey[1:4],ey[5]+ey[6]) eyr pr=c(p[1:4],p[5]+p[6]) pr kr=length(yr) xr=x[1:kr] xr chi2=(yr-eyr)^2/eyr chi2 chi2=sum(chi2) chi2 p.value=1-pchisq(chi2,kr-1-1) p.value chisq.test(yr,p=pr) par(mfrow=c(2,2)) barplot(yr,names.arg=xr,col="lightgray",main="Observed frequency",cex.names=0.7) barplot(eyr,names.arg=xr,col="lightgray",main="Expected frequency",cex.names=0.7) #L29 Chi-square test for distribution: normal with unknown mu and sigma^2, beav1 #10 intervals beav1=read.csv("data/beav1.csv") attach(beav1) beav1 x=beav1$temp hist(x, nclass = 10,col="lightgray") summary(x) n=length(x) nclass = 10 ma=max(x) mi=min(x) c(mi,ma) width = (ma - mi)/nclass width Inter= seq(mi,ma,width) Inter freq=c() freq[1]=length(x[x<=mi+width]) for(t in 2:10) {freq[t]=length(x[x<=mi+t*width])-length(x[x<=mi+(t-1)*width])} freq sum(freq) I= Inter[-10] I I= I[-9] I I= I[-2] I y=c() y[1]=freq[1]+freq[2] for(i in 2:6) {y[i]=freq[i+1]} y[7]=freq[8]+freq[9]+freq[10] y k=length(y) mu=mean(x) mu sigma=sqrt(var(x)) sigma p=c() for(i in 1:7) {p[i]=pnorm((I[i+1]-mu)/sigma)-pnorm((I[i]-mu)/sigma)} p ey=n*p ey chi=(y-ey)^2/(ey) chi chi=sum(chi) chi p.value=1-pchisq(chi,k-1-2) p.value chisq.test(y,p=p) mid=c() for(t in 1:6) {mid[t]=(I[t]+I[t+1])/2} mid[7]=I[8]+width par(mfrow=c(2,2)) barplot(y,col="lightgray",main="Observed frequency") barplot(ey,col="lightgray",main="Expected frequency",cex.names=0.7) #5 intervals, no need to combine classes beav1=read.csv("data/beav1.csv") attach(beav1) beav1 x=sort(beav1$temp) x n=length(x) k=6 if (n<=80) k=4 else if (n<=220) k=5 k xm=mean(x) xsd=sd(x) c(xm,xsd) zlow=-2 #set the lowest class marks for z if (n<=80) zlow=-1 else if (n<=220) zlow=-1.5 zlow int=c() #set class marks for x int=xm+(zlow+0:3)*xsd int y=c() #count class freq. y[1]=length(x[x<=int[1]]) y[2]=length(x[x<=int[2]])-length(x[x<=int[1]]) y[3]=length(x[x<=int[3]])-length(x[x<=int[2]]) y[4]=length(x[x<=int[4]])-length(x[x<=int[3]]) y[5]=length(x[x>int[4]]) y sum(y) #check if sum to n p=c() #calculate expected prob. p[1]=pnorm(zlow) p[2:4]=pnorm(zlow+1:3)-pnorm(zlow+0:2) p[5]=1-pnorm(zlow+3) p sum(p) #check if sum to 1 ey=n*p ey chi=(y-ey)^2/(ey) chi chi=sum(chi) chi p.value=1-pchisq(chi,k-1-2) p.value par(mfrow=c(2,2)) barplot(y,col="lightgray",main="Observed frequency") barplot(ey,col="lightgray",main="Expected frequency") hist(x,col="lightgray") #T13 intervals, no need to combine classes x=c(26,21,25,20,21,29,26,23,22,24,24,30,23,32,26,24,32,16,36,26,21,31,26,23,32,35,40,30,14,26,46,27,33,25,27,21,26,18,29,36) x=sort(x) x n=length(x) k=6 if (n<=80) k=4 else if (n<=220) k=5 k xm=mean(x) xsd=sd(x) c(xm,xsd) zlow=-2 #set the lowest class marks for z if (n<=80) zlow=-1 else if (n<=220) zlow=-1.5 zlow int=c() #set class marks for x for (i in 1:(k-1)) {int[i]=xm+(zlow+i-1)*xsd} int y=c() #count class freq. y[1]=length(x[x<=int[1]]) y[k]=length(x[x>int[k-1]]) for(i in 2:(k-1)) {y[i]=length(x[x<=int[i]])-length(x[x<=int[i-1]])} y sum(y) p=c() #calculate expected prob. for(i in 2:(k-1)) {p[i]=pnorm(zlow+i-1)-pnorm(zlow+i-2)} p[1]=pnorm(zlow) p[k]=1-pnorm(zlow+k-2) p sum(p) ey=n*p ey chi=(y-ey)^2/(ey) chi chi=sum(chi) chi p.value=1-pchisq(chi,k-1-2) p.value par(mfrow=c(2,2)) barplot(y,col="lightgray",main="Observed frequency") barplot(ey,col="lightgray",main="Expected frequency") hist(x,col="lightgray") #L30 Chi-square test for independence Advertisement y=c(62,47,29,46,9,7) n=sum(y) n c=3 r=2 y.mat=matrix(y,r,c) y.mat yr=apply(y.mat,1,sum) yr yc=apply(y.mat,2,sum) yc yr.mat=matrix(yr,r,c,byrow=F) yr.mat yc.mat=matrix(yc,r,c,byrow=T) yc.mat ey.mat=yr.mat*yc.mat/n ey.mat ey.mat<5 chi=(y.mat-ey.mat)^2/ey.mat chi chi=sum(chi) chi p.value=1-pchisq(chi,(r-1)*(c-1)) p.value chisq.test(y.mat) #L30 Chi-square test for independence Advertisement y=c(24,32,46,22,38,38) n=sum(y) n c=3 r=2 y.mat=matrix(y,r,c) y.mat pr=apply(y.mat,1,sum)/n pr pc=apply(y.mat,2,sum)/n pc pr.mat=matrix(pr,r,c,byrow=F) pr.mat pc.mat=matrix(pc,r,c,byrow=T) pc.mat ey.mat=n*pr.mat*pc.mat ey.mat ey.mat<5 chi=(y.mat-ey.mat)^2/ey.mat chi chi=sum(chi) chi p.value=1-pchisq(chi,(r-1)*(c-1)) p.value chisq.test(y.mat) #Revision Exam.2005 x=c(172,165,206,184,174,142,190,169,161,200) y=c(201,179,159,192,177,170,182,179,169,210) d=x-y d xc = length(d[d>0]) xc n=length(x) p=0.5 binom.test(xc,n,p,alt="less",0.95) pvalue=pbinom(xc,n,p) # exact p-value pvalue wilcox.test(x,y,alternative="less",mu=0,paired=T,exact=T,correct=F) r = rank(abs(d)) r sign.r=r*sign(d) sign.r w.plus = sum(r[d>0]) w.plus w.minus = sum(r[d<0]) w.minus w = min(w.plus, w.minus) w ew.plus = sum(r[d!=0])/2 ew.plus varw.plus = sum((r[d!=0])^2)/4 varw.plus z0=(w.plus-ew.plus)/sqrt(varw.plus) z0 p.value=pnorm(z0) p.value 1 - exp((?8)/10) + exp((?12)/10) exp(?8/2) - exp(?12/2) exp(?8/12) - exp(?12/12) y=c(16,18,22,15,16,25,26,14,23,28,13,18,20,21) groups=factor(rep(letters[1:4],c(3,4,3,4))) kruskal.test(y,groups) rank=rank(y) rank k0=12/(14*(14+1))*(21^2/3+32.5^2/4+27^2/3+24.5^2/4)-3*(14+1) p.value=1-pchisq(k0, 3) p.value y=c(42,26,29,23,18,18,12,20,25,48,32,27,27,51,19,22,32,46,30,35,32,30,29,17,31,27,25) groups=factor(rep(letters[1:4],c(5,7,9,6))) y.df=data.frame(groups,y) y.df aov.y=aov(y~groups,y.df) summary(aov.y) n=length(y) s1=sum(y[groups=="a"]) s2=sum(y[groups=="b"]) s3=sum(y[groups=="c"]) s4=sum(y[groups=="d"]) s=c(s1,s2,s3,s4) s ni=c(5,7,9,6) cm=n*mean(y)^2 cm sy2=sum(y^2) sy2 SST0=sum(y^2)-cm SST=sum(s^2/ni)-cm SSR=SST0-SST c(SST0,SST,SSR) s2=92.81 n=27 g=4 low=(n-g)*s2/qchisq(0.05,n-g) up=(n-g)*s2/qchisq(0.95,n-g) c(low,up) #Exam. 2004 #Q1 One sample t-test and Wilcoxon signed rank test x=c(3,6,5,7,6,4,2,5,7,5) y=c(3,7,5,6,8,4,4,9,4,6) d=y-x n=length(d) t0=mean(d)/sqrt(var(d)/n) t0 t.test(d,alternative="greater", mu=0) wilcox.test(d,alternative="greater",mu=0,paired=F,exact=F,correct=F) #Q2 One-way ANOVA test and Bonferoni pairwise comparison y=c(64,62,66,65,60,61,65,66,65,63,65,62,68,67,67,63,67,66,63,72,69,72,71,66,76,74,71,66,68,67,72,70,72,69,73,71,72,68,68,71) r=10 c=4 N=length(y) ymat=matrix(y,r,c) ymat factor.tr=factor(rep(letters[1:c],c(r,r,r,r))) factor.tr summary(aov(y~factor.tr)) ym=apply(ymat,2,mean) ys2=apply(ymat,2,var) SSTo=var(y)*(N-1) SST=r*sum(ym^2)-N*mean(y)^2 SSR=(r-1)*sum(ys2) c(SSTo,SST,SSR) MSR=SSR/(N-c) MSR t0=(ym[2]-ym[4])/sqrt(MSR*2/r) #compare B and D t0 p.value=1-pt(abs(t0),N-c) npair=c*(c-1)/2 p.value<0.05/npair #3 two-way ANOVA without replicate y=c(15.2,14.3,14.7,15.1,19,16.9,16.4,15.9,16.7,15.6,17.1,16.1,15.7,17,15.5) r=5 c=3 N=length(y) ymat=matrix(y,r,c) ymat yrm=apply(ymat,1,mean) yrm ycm=apply(ymat,2,mean) ycm factor.bl=factor(rep(letters[1:r],c)) factor.bl factor.tr=factor(rep(letters[1:c],c(r,r,r))) factor.tr summary(aov(y~factor.tr+factor.bl)) SSTo=var(y)*(N-1) SST=r*sum(ycm^2)-N*mean(y)^2 SSB=c*sum(yrm^2)-N*mean(y)^2 SSR=SSTo-SST-SSB c(SSTo,SST,SSB,SSR) friedman.test(y,factor.tr,factor.bl) yrank=apply(ymat, 1, rank) yrank yrank=t(yrank) # transpose dimnames(yrank)=list(paste("Shipment", 1:5), paste("Carrier", 1:3)) yrank bar.r=mean(yrank) bar.r bar.rj=apply(yrank,2,mean) bar.rj SSt=r*sum(bar.rj^2)-r*c*(bar.r^2) SSt SSe=1/(r*(c-1))*(sum(yrank^2)-r*c*(bar.r^2)) SSe q0=SSt/SSe q0 p.value=1-pchisq(q0,c-1) p.value #4 significance level and power cv=5.8 #critical value mu0=5 n=100 sigma=5 cv.z0=(cv-mu0)/(sigma/sqrt(n)) #standardize critical.value, mu=mu0 alpha=1-pnorm(cv.z0) #type1 error=P(rej H0|mu=5)=P(xbar>cv)=P(z>cv.z0) c(cv.z0,alpha) mu1=7 #mu in H1 cv.z1=(cv-mu1)/(sigma/sqrt(n)) #standardize critical.value, mu=mu1 beta=pnorm(cv.z1) #type2 error=P(acc H0|mu=7)=P(xbar