Survey							
                            
		                
* Your assessment is very important for improving the work of artificial intelligence, which forms the content of this project.
#
# sta-671-ch8-10-11-notes-R-18nov06.doc
#
# --------------------------------------------------------#
# Ex 8.1 - phosphorous in leaves from 3 varieties of trees
#
# (p. 389)
#
# ANOVA TEST
#
#H0: mu1 = mu2 = mu3
#Ha: at least two means differ
#
#TS Fobs = MS(Between)/MS(Within)
#
#P-value = Pr[F(g-1, n-g) > Fobs]
# Entering the data: phosphorous content of 15 leaves, 5 per tree variety.
# c() is used instead of interactive scan() so the script also runs
# non-interactively (scan() with no file only reads from the console).
phosphor <- c(0.35, 0.40, 0.58, 0.50, 0.47,   # variety 1
              0.65, 0.70, 0.90, 0.84, 0.79,   # variety 2
              0.60, 0.80, 0.75, 0.73, 0.66)   # variety 3
# Variety labels 1, 2, 3 repeated 5 times each, matching the data order.
variety <- rep(c(1, 2, 3), c(5, 5, 5))
ex8p1.df <- data.frame(P = phosphor, Variety = factor(variety))
# NOTE(review): attach() is generally discouraged (search-path masking),
# but it is kept because later commands in these notes use P and Variety
# as free names.
attach(ex8p1.df)
# generating numeric summary statistics for the three varieties
# (mean, variance, and SD of P within each Variety level; P and Variety
# come from attach(ex8p1.df) above — output shown below)
by(P,Variety,function(x) c(mean(x),var(x),sd(x)))
INDICES: 1
[1] 0.46000000 0.00795000 0.08916277
---------------------------------------------
INDICES: 2
[1] 0.7760000 0.0103300 0.1016366
---------------------------------------------
INDICES: 3
[1] 0.70800000 0.00617000 0.07854935
# fitting an anova model …
# lm() fits P ~ Variety by least squares; anova() on the fit prints the
# one-way ANOVA F-table reproduced below.
aov.8p1<- lm(P~Variety)
anova(aov.8p1)
Analysis of Variance Table

Response: P
          Df  Sum Sq Mean Sq F value    Pr(>F)
Variety    2 0.27664 0.13832  16.972 0.0003175 ***
Residuals 12 0.09780 0.00815
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#
# plots
#
# Variety is a factor, so plot() with a formula produces side-by-side
# boxplots of P for the three varieties.
plot(P ~ Variety,ylim=c(0,1),
xlab="Tree Variety",ylab="Phosphorous Content")
[Figure: plot output — image placeholder not rendered in this text export.]
# Scatterplot of P against variety (as a numeric axis), with each group's
# mean drawn as a short horizontal segment.
plot(P ~ as.numeric(Variety), ylim = c(0, 1),
     xlab = "Tree Variety", ylab = "Phosphorous Content",
     xlim = c(0, 4), xaxt = "n")
axis(1, at = c(1, 2, 3), labels = c("1", "2", "3"))
# Group means computed from the data via tapply rather than hard-coded
# (.46, .776, .708) — same values, but stays correct if the data change.
xbars <- tapply(P, Variety, mean)
# One segment of half-width 0.25 centered at each variety position.
for (i in 1:3) {
  segments(i - 0.25, xbars[i], i + 0.25, xbars[i], lwd = 2)
}
[Figure: plot output — image placeholder not rendered in this text export.]
# --------------------------------------------------------#
# Z-test for testing proportions
#
# H0: pi = pi_0
#
# TEST STATISTIC
# zstat = [ pihat – pi_0 ] / [sqrt( pi_0*(1-pi_0)/n )]
#
#
#
# Example 10.4 (p. 475) - n = 150 sports cars
#
# 60 of 150 sports cars failed inspection
#
# Research hypothesis: more than 30% fail
#
# Ha: pi > .30
#
# One-sample proportion test (continuity correction on by default);
# the one-sided alternative matches Ha: pi > 0.30.
prop.test(x=60,n=150,p=.30,alternative="greater")
1-sample proportions test with continuity correction
data: 60 out of 150, null probability 0.3
X-squared = 6.6746, df = 1, p-value = 0.00489
alternative hypothesis: true p is greater than 0.3
95 percent confidence interval:
0.3333552 1.0000000
sample estimates:
p
0.4
#
# CI for pi
#
# Default arguments (p = 0.5, two-sided); this call is shown for the
# 95 percent confidence interval in the printout below.
prop.test(x=60, n=150)
1-sample proportions test with continuity correction
data: 60 out of 150, null probability 0.5
X-squared = 5.6067, df = 1, p-value = 0.01789
alternative hypothesis: true p is not equal to 0.5
95 percent confidence interval:
0.3218997 0.4833039
sample estimates:
p
0.4
#
#
#
#
#
#
# --------------------------------------------------------
# H0: pi1 = pi2
#
# TEST STATISTIC
# zstat = [ pi1hat - pi2hat ] /
#         sqrt[ pi1hat*(1-pi1hat)/n1 + pi2hat*(1-pi2hat)/n2 ]
# Example 10.6 – p. 485
# number pass out of number in study
# (npass/ntotal are reused by the later prop.test calls in these notes)
npass <- c(94, 113)
ntotal <- c(125, 175)
# One-sided two-sample test of H0: pi1 = pi2 vs Ha: pi1 > pi2
# (continuity correction applied by default).
prop.test(x=npass, n=ntotal, alternative="greater")
2-sample test for equality of proportions with
continuity correction
data: npass out of ntotal
X-squared = 3.3701, df = 1, p-value = 0.03320
alternative hypothesis: greater
95 percent confidence interval:
 0.01240338 1.00000000
sample estimates:
   prop 1    prop 2
0.7520000 0.6457143
#
# CI for pi1-pi2
#
# Two-sided call used here for the 95 percent CI on the difference
# in proportions shown below.
prop.test(x=npass, n=ntotal)
2-sample test for equality of proportions with
continuity correction
data: npass out of ntotal
X-squared = 3.3701, df = 1, p-value = 0.06639
alternative hypothesis: two.sided
95 percent confidence interval:
-0.004268342 0.216839771
sample estimates:
   prop 1    prop 2
0.7520000 0.6457143
#
# OL text presents the NON-continuity-corrected version
# of the proportion test and confidence interval
# (the "#" on the wrapped comment line was lost in the text extraction)
#
prop.test(x=npass, n=ntotal, correct=FALSE)
2-sample test for equality of proportions without
continuity correction
data: npass out of ntotal
X-squared = 3.8509, df = 1, p-value = 0.04972
alternative hypothesis: two.sided
95 percent confidence interval:
0.002588801 0.209982628
sample estimates:
   prop 1    prop 2
0.7520000 0.6457143
# ---------------------------------------------------------
5
#
#
#
# Testing for independence between two classification factors
# H0: two factors are independent
# Ha: two factors are dependent
# X2 = sum( (O-E)^2 / E )
# Example 10.14: counts by policy group (rows 1-5) and preferred energy
# source (oil/gas, coal, other). Built with list()/c() instead of
# interactive scan(what = list(...)) so the script runs non-interactively;
# the original transcript also wrapped two counts onto their own lines.
ex10p14 <- list(
  policy = as.numeric(rep(1:5, each = 3)),
  states = rep(c("oil/gas", "coal", "other"), times = 5),
  count  = c(50, 59, 161,
             88, 20, 40,
             56, 52, 188,
              4,  3,   5,
              2, 66,   6)
)
ex10p14
attach(ex10p14)
# 5 x 3 contingency table: rows = policy group, cols = oil/gas, coal, other.
matrix(count, nrow = 5, byrow = TRUE)
     [,1] [,2] [,3]
[1,]   50   59  161
[2,]   88   20   40
[3,]   56   52  188
[4,]    4    3    5
[5,]    2   66    6
chisq.test(matrix(count,nrow=5,byrow=T))
Pearson's Chi-squared test
data: matrix(count, nrow = 5, byrow = T)
X-squared = 289.2229, df = 8, p-value < 2.2e-16
Warning message:
Chi-squared approximation may be incorrect in:
chisq.test(matrix(count, nrow = 5, byrow = T))
#
# as an alternative to X2: Monte Carlo p-value (2000 replicates by default),
# useful here because the warning above flags small expected cell counts
#
chisq.test(matrix(count, nrow = 5, byrow = TRUE), simulate.p.value = TRUE)
Pearson's Chi-squared test with simulated p-value
(based on 2000 replicates)
data: matrix(count, nrow = 5, byrow = T)
X-squared = 289.2229, df = NA, p-value = 0.0004998
6
#
# Example 10.15 (p. 513)
#
# Display the 4x3 contingency table (byrow = TRUE written out, not T).
matrix(c(5, 8, 9, 8, 10, 5, 14, 7, 4, 13, 5, 2), byrow = TRUE, nrow = 4)
     [,1] [,2] [,3]
[1,]    5    8    9
[2,]    8   10    5
[3,]   14    7    4
[4,]   13    5    2
# Save the 4x3 table for reuse. The original transcript dropped the "-"
# from "<-" during text extraction, leaving a broken "<" comparison.
ex10p15.mat <- matrix(c(5, 8, 9,
                        8, 10, 5,
                        14, 7, 4,
                        13, 5, 2),
                      byrow = TRUE, nrow = 4)
ex10p15.mat
     [,1] [,2] [,3]
[1,]    5    8    9
[2,]    8   10    5
[3,]   14    7    4
[4,]   13    5    2
chisq.test(ex10p15.mat)
Pearson's Chi-squared test
data: ex10p15.mat
X-squared = 12.1104, df = 6, p-value = 0.05955
Warning message:
Chi-squared approximation may be incorrect in:
chisq.test(ex10p15.mat)
# --------------------------------------------------------
# Linear Regression
#
# Example 11.4: absences per 100 workers (y) vs. experience in months (x).
# c() is used instead of interactive scan() so the script runs
# non-interactively.
y.absences.per.100workers <- c(31.5, 33.1, 27.4, 24.5, 27.0,
                               27.8, 23.3, 24.7, 16.9, 18.1)
x.experience.mos <- c(18.1, 20.0, 20.8, 21.5, 22.0,
                      22.4, 22.9, 24.0, 25.4, 27.3)
# fit least squares regression line
lmfit <- lm(y.absences.per.100workers ~ x.experience.mos)
# common hypothesis test of interest here …
# Model Y = beta0 + beta1*x + epsilon
# H0: beta1 = 0
# Ha: beta1 NE 0
# TS: tstat = (b1)/SE(b1)
#     compare to t distribution with appropriate DF
# Full regression summary: residual quartiles, coefficient table with
# t-tests, residual standard error, and R-squared (printed below).
summary(lmfit)
# compare to p. 548
Call:
lm(formula = y.absences.per.100workers ~ x.experience.mos)
Residuals:
     Min       1Q   Median       3Q      Max
-3.35372 -1.47105 -0.04869  1.79075  3.40307
Coefficients:
                 Estimate Std. Error t value Pr(>|t|)
(Intercept)       64.6718     6.7621   9.564 1.18e-05 ***
x.experience.mos  -1.7487     0.2995  -5.840 0.000387 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2.388 on 8 degrees of freedom
Multiple R-Squared: 0.81,
Adjusted R-squared: 0.7862
F-statistic: 34.1 on 1 and 8 DF, p-value: 0.0003873
#
# scatterplot of the data along with the fitted regression line
# superimposed
#
plot(x.experience.mos, y.absences.per.100workers,
xlab= " Experience (months)", ylab= "Absences (per 100 workers)")
# abline() on an lm object draws the line from lmfit's coefficients.
abline(lmfit)
8
[Figure: plot output — image placeholder not rendered in this text export.]
9