#In-class exercise solutions for class 4, classical tests, BS 720 #Exercise 1 library(MASS) attach(Boston) t.test(dis, mu=3.5) (mean(dis)-3.5)/(sd(dis)/sqrt(length(dis))) #t = (x-bar - 3.5)/(sd/sqrt(n)) summary(dis) hist(dis) #does this look 'normal,' or Gaussian? qqnorm(dis, main="QQ Plot of weighted distance to employment center") qqline(dis) #make sure to add this to get a straight line shapiro.test(dis) #formal test of normality of dis variable #for those interested in the actual formula for the statsitic, see: # http://en.wikipedia.org/wiki/Shapiro%E2%80%93Wilk_test #Exercise 2 wilcox.test(rm, mu=6) t.test(rm, mu=6) #compare these two results. Are they different? #Exercise 3 detach(Boston)# it's important to detach old datasets attach(anorexia) ?anorexia #take a look at the dataset dif <- Postwt - Prewt #each person gets a post-study weight minus pre-study weight measurement #now select only those subjects who were controls dif.Cont <- dif[which(Treat=="Cont")] summary(dif.Cont) length(dif.Cont) #only 26 observations, not very big sample hist(dif.Cont) #does this look normal? tough to say... qqnorm(dif.Cont, main="QQ Plot of weight change in placebo group") qqline(dif.Cont) #formal hypothesis test of normality of dif.Cont shapiro.test(dif.Cont) #p-value is greater than our prespecified rejection threshold (alpha) of 0.05, so we fail to reject H0. We cannot say that the data definitely are not normally distributed #Exercise 4 is.na(anorexia\$Treat) #any missing values? sum(is.na(anorexia\$Treat)) #if so, this sum will be >0 (why?) #the summary() function also gives number of NAs #there are several ways to do this #first, The Hard Way: trt<-numeric(length(anorexia\$Treat)) for (i in 1:length(anorexia\$Treat)){ ifelse(anorexia\$Treat[i] == "CBT" || anorexia\$Treat[i] == "FT", trt[i]<-1, trt[i]<-0) } #that's a lot of busy work. loops aren't necessary here #the easy way: trt<-ifelse(anorexia\$Treat== "CBT" | anorexia\$Treat == "FT",1, 0) trt<-ifelse(anorexia\$Treat== "Cont" 0, 1) #now perform statistical test. Method 1: t.test(Postwt[which(trt==1)], Prewt[which(trt==1)], paired=TRUE) #equivalent Method 2 (create a derived variable): dif2<-Postwt-Prewt dif.trt <- dif2[which(trt==1)] t.test(dif.trt) #Exercise 5 detach(anorexia) attach(Pima.tr) par(mfrow=c(1,2)) hist(glu[which(type=="Yes")],main="diabetics") hist(glu[which(type=="No")],main="non-diabetics") #does spread look the same? tough to say... tapply(glu,type,var) tapply(glu,type,sd) #these commands give variance and it's square root, the standard deviation, for glucose reading in each diabetes class (Yes/No) #boxplots are a nice way to look at the spread of data: par(mfrow=c(1,2)) boxplot(glu[which(type=="Yes")],main="diabetics") boxplot(glu[which(type=="No")],main="non-diabetics") #but a more succinct code for boxplots is this: boxplot(glu~type) #Exercise 6 #method number 1: bmi.new<-rep(NA,length(glu)) sum(is.na(bmi)) #any missing values? for(i in 1:length(glu)){ if(bmi[i]<25) bmi.new[i]<-"Normal/underweight" else if (25<=bmi[i] & bmi[i]<30) bmi.new[i]<-"Overweight" else bmi.new[i]<-"Obesity"} cbind(bmi,bmi.new) bmi.new<-factor(bmi.new) data.frame(bmi,bmi.new) table(bmi.new) #method number 2 (the better method, in my opinion): BMIcategory<-ifelse(bmi<25, "Normal/underweight", ifelse(25<=bmi & bmi<30, "Overweight", "Obesity")) cbind(bmi,BMIcategory) data.frame(bmi, BMIcategory) #Now look at the mean glucose reading in each BMI category. What do you see? tapply(glu,BMIcategory, mean) #Exercise 7 head(Pima.tr) #what does this command do? #This categorization of age is done with the 'cut' command, described on page 14 of the class notes age.label<- c("group1", "group2", "group3") summary(age) age.break <- c(21, 30, 39, 63) age.cat <- cut(age, breaks=age.break, labels = age.label) #this shows how many subjects fall in each category table(age.cat) tapply(glu[which(type=="No")], age.cat[which(type=="No")], mean) #this is the mean of glu by each age category, only for non-diabetic people tapply(glu[which(type=="No")], age.cat[which(type=="No")], summary) #this is the summary statistics of glu by each age category, only for non-diabetic people #now look at the spread of the data across age groups (for all classes of diabetes) boxplot(glu~age.cat) #now look at the spread of the data across age groups (for only the non-diabetics) boxplot(glu[which(type=="No")]~age.cat[which(type=="No")]) #formal test for whether glu is normal for nondiabetics within each age category shapiro.test(glu[which(type=="No"& age.cat=="group1")]) shapiro.test(glu[which(type=="No"& age.cat=="group2")]) shapiro.test(glu[which(type=="No"& age.cat=="group3")]) #perhaps group 1 is not normal. What does the histogram / QQ plot look like? #compact presentation: run the next block all together par(mfrow=c(2,2)) boxplot(glu[which(type=="No")]~age.cat[which(type=="No")]) qqnorm(glu[which(type=="No"& age.cat=="group1")], main="Non-diabetic, age group 1") qqline(glu[which(type=="No"& age.cat=="group2")]) qqnorm(glu[which(type=="No"& age.cat=="group2")], main="Non-diabetic, age group 2") qqline(glu[which(type=="No"& age.cat=="group2")]) qqnorm(glu[which(type=="No"& age.cat=="group3")], main="Non-diabetic, age group 3") qqline(glu[which(type=="No"& age.cat=="group3")])