# Linear Regression # April 29, 2008 # Genome 560 ################################ # Load some packages ################################ library(faraway) library(cars) ################################ # open a dataset up on sexual activity and the # life span of male fruitflies from Partridge and Farquhar (1981) # 125 fertile males divided randomly into five groups of 25 # Two independent variables were measured: # 1.) Activity: The group: isolated = fly kept solitary, one = fly kept with one pregnant fruitfly, many = fly kept with eight pregnant fruitflies, low= fly kept with one virgin fruitfly, high = fly kept with eight virgin fruitflies. # 2.) Thorax: The thorax length of each male was measured as this was known to affect longevity. ################################ library(faraway) data(fruitfly) attach(fruitfly) fruitfly fruitfly$activity # to see the levels associated with this variable ################################ # Take a look at the data ################################ plot(longevity ~ thorax, fruitfly, pch = unclass(activity)) legend(0.63, 100, levels(fruitfly$activity), pch = 1:5) ################################ # Let's start by fitting an additive model (i.e., no interaction) ################################ g <- lm(longevity ~ thorax + activity, data = fruitfly) ################################ # take a look and see all the things R has done for us # that are stored in this object ################################ names(g) ################################ # for now we can get all we need by # using the summary command ################################ summary(g) ################################ # Note that since activity is a categorical variable with 5 categories it is # automatically treated as a factor in R, indicated by the names "activityisolated", etc. # Thus, R converts this factor into a series of k-1 = 4 dummy variables # Note "isolated" was used as the reference level # To see how the coding was done examine the model matrix # Useful function to see how dummy coding works is contr.treatment(n), where n is number of levels ################################ model.matrix(g) ##################################### # What is the fitted regression line for the "isolated" group? # longevity = -48.7 + 134.3 * thorax # What about for "many"? # longevity = (-48.7 + 4.1) + 134.3 * thorax ##################################### ########## one.index <- which(fruitfly$activity=="one") isolated.index <- which(fruitfly$activity=="isolated") mean(fruitfly$longevity[one.index]) mean(fruitfly$longevity[isolated.index]) g <- lm(longevity ~ activity, data = fruitfly) summary(g) ######### # We can get the anova table by: anova(g) # Note this is a sequential anova table that starts from a null model and added terms are sequentially tested # and is equivalent to: m1 <- lm(longevity ~ thorax, fruitfly) m2 <- lm(longevity ~ thorax + activity, fruitfly) anova(m1,m2) # Do we really need both thorax and activity in the model? We could use the anova output, which suggests # both terms are significant. However, thorax is tested by itself and then activity is tested once thorax is # entered into the model. Instead, we might prefer to check whether each predictor is significant once the other # has been taken into account. This is really useful when we have a large number of predictors drop1(g, test="F") # Let's return to interpreting the model summary(g) # Note that the intercepts of "one" and "many" are not significantly different from the reference group "isolated" # However, "low" survives about 7 days less and the high sexual activity flies "high" survive about 20 days less than the reference group. ############### #Diagnostics ############## par(mfrow=c(2,2)) plot(g) # Plot 1: Fitted vs Residuals: Want to see points stochastically bouncing above and below the line y = 0 # Plot 2: Normal QQ: Like to see straight line # Plot 3: Scale-Location: If the variance were independent of the mean one would expect the points to stochastically wiggle around the line y = 0.8 # Plot 4 Residuals vs Leverage: Points with high leverage may change parameter estimates dramatically if omitted from the data. Especially ones with high leverages and residuals # In this example it looks like some non-constant variance. Let's get a better look: plot(residuals(g) ~ fitted(g), pch = unclass(fruitfly$activity)) # A log transformation can remove heteroscedasctity g.log <- lm(log(longevity) ~ thorax+ activity, fruitfly) summary(g.log) # note, because of the log transformation we can interpret the coefficients as having a multiplicative effect: exp(coef(g.log)[3:6]) # compared to the reference group, the high activity group has about 0.66 times the life span (i.e., ~ 33% less) # Other diagnostic options in library(car) package outlier.test(g) # Cook's D plot; Cook's D tests to see whether the error in the model changes when a specific data value is either included, or excluded from the model. # identify D values > 4/(n-k-1) cutoff <- 4/((nrow(fruitfly)-length(g$coefficients)-2)) plot(g, which=4, cook.levels=cutoff)