# Linear Regression
# April 29, 2008
# Genome 560

################################
# Load some packages
################################
library(faraway)
library(cars)

################################
# open a dataset up on sexual activity and the 
# life span of male fruitflies from Partridge and Farquhar (1981)
# 125 fertile males divided randomly into five groups of 25
# Two independent variables were measured:
# 1.) Activity: The group: isolated = fly kept solitary, one = fly kept with one pregnant fruitfly, many = fly kept with eight pregnant fruitflies, low= fly kept with one virgin fruitfly, high = fly kept with eight virgin fruitflies.
# 2.) Thorax: The thorax length of each male was measured as this was known to affect longevity.
################################
library(faraway)
data(fruitfly)
attach(fruitfly)
fruitfly
fruitfly$activity  # to see the levels associated with this variable

################################
# Take a look at the data
################################
plot(longevity ~ thorax, fruitfly, pch = unclass(activity))
legend(0.63, 100, levels(fruitfly$activity), pch = 1:5)

################################
# Let's start by fitting an additive model (i.e., no interaction)
################################
g <- lm(longevity ~ thorax + activity, data = fruitfly)

################################
# take a look and see all the things R has done for us
# that are stored in this object
################################
names(g)

################################
# for now we can get all we need by 
# using the summary command
################################
summary(g)

################################
# Note that since activity is a categorical variable with 5 categories it is
# automatically treated as a factor in R, indicated by the names "activityisolated", etc.
# Thus, R converts this factor into a series of k-1 = 4 dummy variables
# Note "isolated" was used as the reference level
# To see how the coding was done examine the model matrix
# Useful function to see how dummy coding works is contr.treatment(n), where n is number of levels
################################
model.matrix(g)

#####################################
# What is the fitted regression line for the "isolated" group?
# longevity = -48.7 + 134.3 * thorax
# What about for "many"?
# longevity = (-48.7 + 4.1) + 134.3 * thorax
#####################################

##########
one.index <- which(fruitfly$activity=="one")
isolated.index <- which(fruitfly$activity=="isolated")
mean(fruitfly$longevity[one.index])
mean(fruitfly$longevity[isolated.index])
g <- lm(longevity ~ activity, data = fruitfly)
summary(g)
#########


# We can get the anova table by:
anova(g)

# Note this is a sequential anova table that starts from a null model and added terms are sequentially tested
# and is equivalent to:
m1 <- lm(longevity ~ thorax, fruitfly)
m2 <- lm(longevity ~ thorax + activity, fruitfly)
anova(m1,m2)

# Do we really need both thorax and activity in the model? We could use the anova output, which suggests
# both terms are significant. However, thorax is tested by itself and then activity is tested once thorax is 
# entered into the model. Instead, we might prefer to check whether each predictor is significant once the other
# has been taken into account. This is really useful when we have a large number of predictors
drop1(g, test="F")

# Let's return to interpreting the model 
summary(g)

# Note that the intercepts of "one" and "many" are not significantly different from the reference group "isolated"
# However, "low" survives about 7 days less and the high sexual activity flies "high" survive about 20 days less than the reference group.


###############
#Diagnostics
##############
par(mfrow=c(2,2))
plot(g)

# Plot 1: Fitted vs Residuals: Want to see points stochastically bouncing above and below the line y = 0
# Plot 2: Normal QQ: Like to see straight line
# Plot 3: Scale-Location: If the variance were independent of the mean one would expect the points to stochastically wiggle around the line y = 0.8
# Plot 4 Residuals vs Leverage: Points with high leverage may change parameter estimates dramatically if omitted from the data. Especially ones with high leverages and residuals

# In this example it looks like some non-constant variance. Let's get a better look:
plot(residuals(g) ~ fitted(g), pch = unclass(fruitfly$activity))

# A log transformation can remove heteroscedasctity
g.log <- lm(log(longevity) ~ thorax+ activity, fruitfly)
summary(g.log)
# note, because of the log transformation we can interpret the coefficients as having a multiplicative effect:
exp(coef(g.log)[3:6])
# compared to the reference group, the high activity group has about 0.66 times the life span (i.e., ~ 33% less)

# Other diagnostic options in library(car) package
outlier.test(g)
# Cook's D plot; Cook's D tests to see whether the error in the model changes when a specific data value is either included, or excluded from the model.
# identify D values > 4/(n-k-1) 
cutoff <- 4/((nrow(fruitfly)-length(g$coefficients)-2)) 
plot(g, which=4, cook.levels=cutoff)