# Descriptive Statistics and Graphs # April 3, 2008 # Stat560 # Read some expression data from a file into a data frame called exp exp <- read.table(file="RMA_Filtered.txt", header=T) # Get some information about the data frame attributes(exp) dim(exp) # Let's average the replicates together # Lots of ways to do this - here I'll do it in a somewhat inefficient way to make it easier to follow array1 <- seq(2,32,2) array2 <- seq(3,33,2) all.average <- (exp[array1] + exp[array2])/2 # to see what is going on let's do this again on a subset of the expression data (rows 1 through 10) exp.subset <- exp[2:11,] exp.subset exp.subset.avg <- (exp.subset[array1] + exp.subset[array2])/2 exp.subset.avg # lets rename the columns labels <- c(rep("CEU", 8), rep("YRI", 8)) labels <- paste(labels, c(seq(1:8), seq(1:8)), sep="") names(exp.subset.avg) <- labels names(all.average) <- labels ################## # Descriptive Statistics ################## # For a broad overview of expression for each individual we can use the summary command summary(all.average) # lets extract the data for the first probeset ps1 <- as.vector(all.average[1,1:16], mode="numeric") # average, range, sd of ps1 mean.ps1 <- mean(ps1) range.ps1 <- range(ps1) sd.ps1 <- sd(ps1) # what about the european and yoruba means for ps1? mean.ceu <- mean(ps1[1:8]) mean.yri <- mean(ps1[9:16]) #################### # Making Simple Plots #################### # plots can be controlled at exquisite detail; for a list of all available options type par() par() ## HISTOGRAMS # lets make a histogram of all average log2 expression values for each probeset in the CEU sample hist(ps1) ?hist hist(ps1, breaks = 20) # let's change this to a density histogram hist(ps1, freq=F) lines((density(ps1)), col = "red") # let's make it prettier hist(ps1, freq=F, main ="Probeset 1", col = "blue") # let's make a plot with two histograms next to each other par(mfrow=c(1,2)) hist(ps1, freq=F, main ="Bin = Standard", col = "blue") hist(ps1, freq=F, main ="Breaks = 25", col = "blue", breaks = 25) ## BoxPlots ?boxplot # let's make a boxplot for ps1 for the CEU and YRI samples boxplot(ps1[1:8], ps1[9:17], names = c("CEU", "YRI")) # boxplots with the formula interface; first make a dataframe lab <- c(rep("CEU", 8), rep("YRI", 8)) data <- data.frame(ps1,lab) data boxplot(ps1 ~ lab, data=data) # For a small number of points, I think dot charts are nice x.coord <- c(rep(1.5, 8), rep(3,8)) plot(x.coord, data$ps1, pch = 19, col = "red", xlim = c(0,5), ylim = c(5.5,6.5), axes=F, xlab = "Sample", ylab = "Expression") axis(1, at = c(1.5, 3), labels = c("CEU", "YRI")) axis(2) box() ## Scatterplots # Let's look at a scatter plot of replicate 1 and 2 for the first CEU sample plot(exp[,2], exp[,3]) # a little prettier please plot(exp[,2], exp[,3], pch = 19, xlab = "Rep1", ylab = "Rep2") abline(0,1, col = "red") # Now let's make pair-wise scatterplots for the first 4 CEU arrays # first make ceu specific data frame ceu.arrays <- exp[,2:5] plot(ceu.arrays, pch = 19) # This is cool - let's write it out to a file postscript(file="PairWisePlot.ps") plot(ceu.arrays, pch = 19) dev.off() ## Hierarchical Clustering ?hclust hc <- hclust(dist(trans.all.avg), "ave") plot(hc) # centroid clustering and squared Euclidean distance hc1 <- hclust(dist(trans.all.avg)^2, "cen") plot(hc1) ## PCA pcdat = princomp(all.average) summary(pcdat) plot(pcdat, ylim = c(0,100)) plot(loadings(pcdat), col = c(rep("red",8), rep("blue",8)), pch = 19) library(scatterplot3d) scatterplot3d(loadings(pcdat), pch = 19, highlight.3d = T) ## Heatmap library(gplots) col.pan <- colorpanel(100, "magenta", "black", "green") heatmap.2(as.matrix(exp.subset.avg), col = col.pan, symkey=F, density.info="none", trace="none", keysize = 0.8)