StatQuest: PCA in R, Clearly Explained!!!

Here’s the code:

## In this example, the data is in a matrix called
## data.matrix
## columns are individual samples (i.e. cells)
## rows are measurements taken for all the samples (i.e. genes)
## Just for the sake of the example, here's some made up data...
data.matrix <- matrix(nrow=100, ncol=10)
colnames(data.matrix) <- c(
  paste("wt", 1:5, sep=""),
  paste("ko", 1:5, sep=""))
rownames(data.matrix) <- paste("gene", 1:100, sep="")
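## NOTE (my addition, not in the original code): the data below is random,
## so your results will differ from run to run; uncomment the next line
## if you want reproducible output.
## set.seed(42)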
for (i in 1:100) {
  wt.values <- rpois(5, lambda=sample(x=10:1000, size=1))
  ko.values <- rpois(5, lambda=sample(x=10:1000, size=1))

  data.matrix[i,] <- c(wt.values, ko.values)
}
head(data.matrix)
dim(data.matrix)

pca <- prcomp(t(data.matrix), scale=TRUE) 

## plot pc1 and pc2
plot(pca$x[,1], pca$x[,2])

## make a scree plot
pca.var <- pca$sdev^2
pca.var.per <- round(pca.var/sum(pca.var)*100, 1)

barplot(pca.var.per, main="Scree Plot", xlab="Principal Component", ylab="Percent Variation")
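## As a quick extra check (my addition, not in the original code), the
## cumulative percentages should reach roughly 100% and show how much of
## the variation the first few PCs account for.
cumsum(pca.var.per)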

## now make a fancy looking plot that shows the PCs and the variation:
library(ggplot2)

pca.data <- data.frame(Sample=rownames(pca$x),
  X=pca$x[,1],
  Y=pca$x[,2])
pca.data

ggplot(data=pca.data, aes(x=X, y=Y, label=Sample)) +
  geom_text() +
  xlab(paste("PC1 - ", pca.var.per[1], "%", sep="")) +
  ylab(paste("PC2 - ", pca.var.per[2], "%", sep="")) +
  theme_bw() +
  ggtitle("My PCA Graph")

## get the names of the top 10 measurements (genes) that contribute
## most to pc1.
loading_scores <- pca$rotation[,1]
gene_scores <- abs(loading_scores) ## get the magnitudes
gene_score_ranked <- sort(gene_scores, decreasing=TRUE)
top_10_genes <- names(gene_score_ranked[1:10])

top_10_genes ## show the names of the top 10 genes

pca$rotation[top_10_genes,1] ## show the scores (and +/- sign)
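## One more relationship worth knowing (my addition, not in the original
## code): the PC coordinates in pca$x are just the centered and scaled
## data multiplied by the loading scores in pca$rotation.
all.equal(pca$x, scale(t(data.matrix)) %*% pca$rotation)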

#######
##
## NOTE: Everything that follows is just bonus stuff.
## It simply demonstrates how to get the same 
## results using "svd()" (Singular Value Decomposition) or using "eigen()"
## (Eigen Decomposition).
##
##
#######

############################################
##
## Now let's do the same thing with svd()
##
## svd() returns three things
## v = the "rotation" that prcomp() returns, this is a matrix of eigenvectors
##     in other words, a matrix of loading scores
## u = this is similar to the "x" that prcomp() returns. In other words,
##     sum(the rotation * the original data), but compressed to unit vectors.
##     You can spread it back out by multiplying by "d"
## d = this is similar to the "sdev" values that prcomp() returns (and thus
##     related to the eigen values), but not
##     scaled by sample size in an unbiased way (i.e. by 1/(n-1)).
##     For prcomp(), sdev = sqrt(var) = sqrt(ss(fit)/(n-1))
##     For svd(), d = sqrt(ss(fit))
############################################

svd.stuff <- svd(scale(t(data.matrix), center=TRUE))

## calculate the PCs
svd.data <- data.frame(Sample=colnames(data.matrix),
  X=(svd.stuff$u[,1] * svd.stuff$d[1]),
  Y=(svd.stuff$u[,2] * svd.stuff$d[2]))
svd.data
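## Extra checks (my addition, not in the original code): up to an
## arbitrary sign flip per component, these coordinates match prcomp()'s
## pca$x, and d / sqrt(n-1) matches prcomp()'s sdev.
all.equal(abs(svd.data$X), abs(unname(pca$x[,1])))
all.equal(abs(svd.data$Y), abs(unname(pca$x[,2])))
all.equal(svd.stuff$d / sqrt(ncol(data.matrix) - 1), pca$sdev)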

## alternatively, we could compute the PCs with the eigen vectors and the
## original data
svd.pcs <- t(t(svd.stuff$v) %*% t(scale(t(data.matrix), center=TRUE)))
svd.pcs[,1:2] ## the first two principal components

svd.df <- ncol(data.matrix) - 1
svd.var <- svd.stuff$d^2 / svd.df
svd.var.per <- round(svd.var/sum(svd.var)*100, 1)
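## Quick extra check (my addition): these percentages match the ones we
## computed from prcomp()'s sdev values.
all.equal(svd.var.per, pca.var.per)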

ggplot(data=svd.data, aes(x=X, y=Y, label=Sample)) + 
  geom_text() +
  xlab(paste("PC1 - ", svd.var.per[1], "%", sep="")) +
  ylab(paste("PC2 - ", svd.var.per[2], "%", sep="")) +
  theme_bw() +
  ggtitle("svd(scale(t(data.matrix), center=TRUE)")


############################################
##
## Now let's do the same thing with eigen()
##
## eigen() returns two things...
## vectors = eigen vectors (vectors of loading scores)
##           NOTE: pcs = sum(loading scores * values for sample)
## values = eigen values
############################################
cov.mat <- cov(scale(t(data.matrix), center=TRUE))
dim(cov.mat)

## since the covariance matrix is symmetric, we can tell eigen() to just
## work on the lower triangle with "symmetric=TRUE"
eigen.stuff <- eigen(cov.mat, symmetric=TRUE)
dim(eigen.stuff$vectors)
head(eigen.stuff$vectors[,1:2])

eigen.pcs <- t(t(eigen.stuff$vectors) %*% t(scale(t(data.matrix), center=TRUE)))
eigen.pcs[,1:2]
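## Extra check (my addition, not in the original code): up to sign flips,
## these PCs match the ones prcomp() gave us.
all.equal(abs(eigen.pcs[,1]), abs(pca$x[,1]))
all.equal(abs(eigen.pcs[,2]), abs(pca$x[,2]))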

eigen.data <- data.frame(Sample=rownames(eigen.pcs),
  X=(-1 * eigen.pcs[,1]), ## eigen() flips the X-axis in this case, so we flip it back
  Y=eigen.pcs[,2]) ## X axis will be PC1, Y axis will be PC2
eigen.data

eigen.var.per <- round(eigen.stuff$values/sum(eigen.stuff$values)*100, 1)
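## Extra check (my addition): the first 10 eigen values match the per-PC
## variances (pca$sdev^2) we computed earlier with prcomp(); the remaining
## eigen values are essentially zero because only 10 samples went into
## the 100x100 covariance matrix.
all.equal(eigen.stuff$values[1:length(pca.var)], pca.var)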

ggplot(data=eigen.data, aes(x=X, y=Y, label=Sample)) + 
  geom_text() +
  xlab(paste("PC1 - ", eigen.var.per[1], "%", sep="")) +
  ylab(paste("PC2 - ", eigen.var.per[2], "%", sep="")) +
  theme_bw() +
  ggtitle("eigen on cov(t(data.matrix))")

StatQuest: Multiple Regression in R

## Here's the data
mouse.data <- data.frame(
  size = c(1.4, 2.6, 1.0, 3.7, 5.5, 3.2, 3.0, 4.9, 6.3),
  weight = c(0.9, 1.8, 2.4, 3.5, 3.9, 4.4, 5.1, 5.6, 6.3),
  tail = c(0.7, 1.3, 0.7, 2.0, 3.6, 3.0, 2.9, 3.9, 4.0))

mouse.data

#######################################################
##
## Let's start by reviewing simple regression by 
## modeling mouse size with mouse weight.
##
#######################################################

## STEP 1: Draw a graph of the data to make sure the relationship makes sense
plot(mouse.data$weight, mouse.data$size, pch=16, cex=2)

## STEP 2: Do the regression
simple.regression <- lm(size ~ weight, data=mouse.data)

## STEP 3: Look at the R^2, F-value and p-value
summary(simple.regression)

abline(simple.regression, lwd=5, col="red")

## now let's verify that our formula for R^2 is correct...
ss.mean <- sum((mouse.data$size - mean(mouse.data$size))^2)
ss.simple <- sum(simple.regression$residuals^2)

(ss.mean - ss.simple) / ss.mean # this is the R^2 value
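## As an extra check (my addition, not in the original code), the same
## R^2 can be pulled straight out of the summary object.
summary(simple.regression)$r.squared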

## now let's verify that our formula for F is correct...
f.simple <- ((ss.mean - ss.simple) / (2 - 1)) / 
  (ss.simple / (nrow(mouse.data) - 2))

f.simple # this is the F-value
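## Extra check (my addition): the summary object stores the same
## F-statistic, along with its degrees of freedom.
summary(simple.regression)$fstatistic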

## Now let's draw a figure that shows how to calculate the p-value from the
## F-value
##
## First, draw the correct f-distribution curve with df1=1 and df2=7
x <- seq(from=0, to=20, by=0.1)
y <- df(x, df1=1, df2=7)
plot(x, y, type="l")

## now draw a vertical line where our F-value, f.simple, is.
abline(v=f.simple, col="red")

## color the graph on the left side of the line blue
x.zero.to.line <- seq(from=0, to=f.simple, by=0.1)
y.zero.to.line <- df(x.zero.to.line, df1=1, df2=7)
polygon(x=c(x.zero.to.line, 0), y=c(y.zero.to.line, 0), col="blue")

## color the graph on the right side of the line red
x.line.to.20 <- seq(from=f.simple, to=20, by=0.1)
y.line.to.20 <- df(x.line.to.20, df1=1, df2=7)
polygon(x=c(x.line.to.20, f.simple), y=c(y.line.to.20, 0), col="red")

pf(f.simple, df1=1, df2=7) ## the area under the curve that is blue

1-pf(f.simple, df1=1, df2=7) ## the area under the curve that is red
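## Extra check (my addition, not in the original code): the same p-value,
## computed from the F-statistic stored in the summary object;
## lower.tail=FALSE avoids the 1 - pf() subtraction.
f.stats <- summary(simple.regression)$fstatistic
pf(f.stats["value"], df1=f.stats["numdf"], df2=f.stats["dendf"],
   lower.tail=FALSE)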

## lastly, let's compare this p-value to the one from the 
## original regression
summary(simple.regression)


#######################################################
##
## Now let's do multiple regression by adding an extra term, tail length
##
#######################################################

## STEP 1: Draw a graph of the data to make sure the relationships make sense
## This graph is more complex because it shows the relationships between all
## of the columns in "mouse.data".
plot(mouse.data)

## STEP 2: Do the regression
multiple.regression <- lm(size ~ weight + tail, data=mouse.data)

## STEP 3: Look at the R^2, F-value and p-value
summary(multiple.regression)


## again, we can verify that our R^2 value is what we think it is
ss.multiple <- sum(multiple.regression$residuals^2)

(ss.mean - ss.multiple) / ss.mean

## we can also verify that the F-value is what we think it is
f.multiple <- ((ss.mean - ss.multiple) / (3 - 1)) / 
  (ss.multiple / (nrow(mouse.data) - 3))

f.multiple  
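## As before, both values can be pulled out of the summary object as an
## extra check (my addition, not in the original code).
summary(multiple.regression)$r.squared
summary(multiple.regression)$fstatistic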

## Again let's draw a figure that shows how to calculate the p-value from the
## F-value
##
## First, draw the correct f-distribution curve with df1=2 and df2=6
x <- seq(from=0, to=20, by=0.1)
y <- df(x, df1=2, df2=6)
plot(x, y, type="l")

## now draw a vertical line where our F-value, f.multiple, is for this test
abline(v=f.multiple, col="red")

## color the graph on the left side of the line blue
x.zero.to.line <- seq(from=0, to=f.multiple, by=0.1)
y.zero.to.line <- df(x.zero.to.line, df1=2, df2=6)
polygon(x=c(x.zero.to.line, 0), y=c(y.zero.to.line, 0), col="blue")

## color the graph on the right side of the line red
x.line.to.20 <- seq(from=f.multiple, to=20, by=0.1)
y.line.to.20 <- df(x.line.to.20, df1=2, df2=6)
polygon(x=c(x.line.to.20, f.multiple), y=c(y.line.to.20, 0), col="red")


pf(f.multiple, df1=2, df2=6) ## the area under the curve that is blue

1-pf(f.multiple, df1=2, df2=6) ## the area under the curve that is red

## lastly, let's compare this p-value to the one from the 
## original regression
summary(multiple.regression)



#######################################################
##
## Now, let's see if "tail" makes a significant contribution by
## comparing the "simple" fit (which does not include the tail data) 
## to the "multiple" fit (which has the extra term for the tail data)
##
#######################################################

f.simple.v.multiple <- ((ss.simple - ss.multiple) / (3-2)) / 
  (ss.multiple / (nrow(mouse.data) - 3))

1-pf(f.simple.v.multiple, df1=1, df2=6)
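## R has this nested-model F-test built in (my addition, not in the
## original code): anova() on the two fits reports the same F-value
## and p-value.
anova(simple.regression, multiple.regression)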

## Notice that this value is the same as the p-value next to the term
## for "tail" in the summary of the multiple regression:
summary(multiple.regression)

## Thus, the summary already calculated this F-value and p-value for us.
## This line tells us that including the "tail" term makes a statistically
## significant difference. The magnitude of that difference can be judged by
## looking at the change in R^2 between the simple and multiple regressions.