# Descriptive Statistics and Graphs
# April 3, 2008
# Stat560

# Read some expression data from a file into a data frame called exp
exp <- read.table(file="RMA_Filtered.txt", header=T)

# Get some information about the data frame
attributes(exp)
dim(exp)

# Let's average the replicates together
# Lots of ways to do this - here I'll do it in a somewhat inefficient way to make it easier to follow
array1 <- seq(2,32,2)
array2 <- seq(3,33,2)
all.average <- (exp[array1] + exp[array2])/2

# to see what is going on let's do this again on a subset of the expression data (rows  1 through 10)
exp.subset <- exp[2:11,]
exp.subset
exp.subset.avg <- (exp.subset[array1] + exp.subset[array2])/2
exp.subset.avg 

# lets rename the columns
labels <- c(rep("CEU", 8), rep("YRI", 8))
labels <- paste(labels, c(seq(1:8), seq(1:8)), sep="")
names(exp.subset.avg) <- labels 
names(all.average) <- labels

##################
# Descriptive Statistics
##################

# For a broad overview of expression for each individual we can use the summary command
summary(all.average)

# lets extract the data for the first probeset
ps1 <- as.vector(all.average[1,1:16], mode="numeric")

# average, range, sd of ps1
mean.ps1 <- mean(ps1)
range.ps1 <- range(ps1)
sd.ps1 <- sd(ps1)

# what about the european and yoruba means for ps1?
mean.ceu <- mean(ps1[1:8])
mean.yri <- mean(ps1[9:16])


####################
# Making Simple Plots
####################

# plots can be controlled at exquisite detail; for a list of all available options type par()
par()

## HISTOGRAMS
# lets make a histogram of all average log2 expression values for each probeset in the CEU sample
hist(ps1)
?hist
hist(ps1, breaks = 20)

# let's change this to a density histogram
hist(ps1, freq=F)
lines((density(ps1)), col = "red")

# let's make it prettier
hist(ps1, freq=F, main ="Probeset 1", col = "blue")

# let's make a plot with two histograms next to each other
par(mfrow=c(1,2))
hist(ps1, freq=F, main ="Bin = Standard", col = "blue")
hist(ps1, freq=F, main ="Breaks = 25", col = "blue", breaks = 25)

## BoxPlots
?boxplot

# let's make a boxplot for ps1 for the CEU and YRI samples
boxplot(ps1[1:8], ps1[9:17], names = c("CEU", "YRI"))

# boxplots with the formula interface; first make a dataframe
lab <- c(rep("CEU", 8), rep("YRI", 8))
data <- data.frame(ps1,lab)
data
boxplot(ps1 ~ lab, data=data)

# For a small number of points, I think dot charts are nice
x.coord <- c(rep(1.5, 8), rep(3,8))
plot(x.coord, data$ps1, pch = 19, col = "red", xlim = c(0,5), ylim = c(5.5,6.5), axes=F, xlab = "Sample", ylab = "Expression")
axis(1, at = c(1.5, 3), labels = c("CEU", "YRI"))
axis(2)
box()

## Scatterplots
# Let's look at a scatter plot of replicate 1 and 2 for the first CEU sample
plot(exp[,2], exp[,3])

# a little prettier please
plot(exp[,2], exp[,3], pch = 19, xlab = "Rep1", ylab = "Rep2")
abline(0,1, col = "red")

# Now let's make pair-wise scatterplots for the first 4 CEU arrays
# first make ceu specific data frame
ceu.arrays <- exp[,2:5]
plot(ceu.arrays, pch = 19)

# This is cool - let's write it out to a file
postscript(file="PairWisePlot.ps")
plot(ceu.arrays, pch = 19)
dev.off()

## Hierarchical Clustering
?hclust
hc <- hclust(dist(trans.all.avg), "ave")
plot(hc)

# centroid clustering and squared Euclidean distance
hc1 <- hclust(dist(trans.all.avg)^2, "cen")
plot(hc1)

## PCA
pcdat = princomp(all.average)
summary(pcdat)
plot(pcdat, ylim = c(0,100))
plot(loadings(pcdat), col = c(rep("red",8), rep("blue",8)), pch = 19)
library(scatterplot3d)
scatterplot3d(loadings(pcdat), pch = 19, highlight.3d = T)

## Heatmap
library(gplots)
col.pan <- colorpanel(100, "magenta", "black", "green")
heatmap.2(as.matrix(exp.subset.avg), col = col.pan, symkey=F, density.info="none", trace="none", keysize = 0.8)