Homework #2

library(marray)
library(limma)
# Read in the .gpr files (change the R files to .gpr)
dir.path <-"C:\\Users\\amand\\Documents\\Gene Expression Data Analysis and Visualization JHU 2022\\GSE12050_GenePix"
GSE12050 <- read.GenePix(path = dir.path, name.Gf = "F532 Median", name.Gb = "B532 Median", name.Rf = "F635 Median", name.Rb = "B635 Median", name.W = "Flags")

# https://www.computerworld.com/article/2497319/business-intelligence-beginner-s-guide-to-r-syntax-quirks-you-ll-want-to-know.html
# R basics, when using the apply() function, 1=row, 2=column

# Normalize each array using median global, loess, and print-tip-group loess methods.  Then plot MvA plots of all 4 arrays comparing no normalization to the other 3 normalization approaches. 

# Subject 1

subject.one        <- GSE12050[ , 1] 
subject.one.Median <- maNorm(subject.one, norm = c("median"))
subject.one.Loess  <- maNorm(subject.one, norm = c("loess"))
subject.one.PTL    <- maNorm(subject.one, norm = c("printTipLoess")) 

# MvA plots for all 3 normalizations and no normalization
par(mfrow=c(4,1)) 
maPlot(subject.one, main = "Subject 1 - No Normalization", lines.func = NULL, legend.func = NULL) 
maPlot(subject.one.Median, main = "Subject 1 - Global Median Normalization", lines.func = NULL, legend.func = NULL) 
maPlot(subject.one.Loess, main = "Subject 1 - Loess Normalization", lines.func = NULL, legend.func = NULL) 
maPlot(subject.one.PTL, main = "Subject 1 - Print-tip-group Loess Normalization", lines.func = NULL, legend.func = NULL) 


# Subject 2

subject.two        <- GSE12050[ , 2] 
subject.two.Median <- maNorm(subject.two, norm = c("median"))
subject.two.Loess  <- maNorm(subject.two, norm = c("loess"))
subject.two.PTL    <- maNorm(subject.two, norm = c("printTipLoess")) 

# MvA plots for all 3 normalizations and no normalization
par(mfrow=c(4,1)) 
maPlot(subject.two, main = "Subject 2 - No Normalization", lines.func = NULL, legend.func = NULL) 
maPlot(subject.two.Median, main = "Subject 2 - Global Median Normalization", lines.func = NULL, legend.func = NULL) 
maPlot(subject.two.Loess, main = "Subject 2 - Loess Normalization", lines.func = NULL, legend.func = NULL) 
maPlot(subject.two.PTL, main = "Subject 2 - Print-tip-group Loess Normalization", lines.func = NULL, legend.func = NULL) 


# Subject 3

subject.three       <- GSE12050[ , 3] 
subject.three.Median <- maNorm(subject.three, norm = c("median"))
subject.three.Loess  <- maNorm(subject.three, norm = c("loess"))
subject.three.PTL    <- maNorm(subject.three, norm = c("printTipLoess")) 

# MvA plots for all 3 normalizations and no normalization
par(mfrow=c(4,1)) 
maPlot(subject.three, main = "Subject 3 - No Normalization", lines.func = NULL, legend.func = NULL) 
maPlot(subject.three.Median, main = "Subject 3 - Global Median Normalization", lines.func = NULL, legend.func = NULL) 
maPlot(subject.three.Loess, main = "Subject 3 - Loess Normalization", lines.func = NULL, legend.func = NULL) 
maPlot(subject.three.PTL, main = "Subject 3 - Print-tip-group Loess Normalization", lines.func = NULL, legend.func = NULL) 


# Subject 4

subject.four      <- GSE12050[ , 4] 
subject.four.Median <- maNorm(subject.four, norm = c("median"))
subject.four.Loess  <- maNorm(subject.four, norm = c("loess"))
subject.four.PTL    <- maNorm(subject.four, norm = c("printTipLoess")) 

# MvA plots for all 3 normalizations and no normalization
par(mfrow=c(4,1)) 
maPlot(subject.four, main = "Subject 4 - No Normalization", lines.func = NULL, legend.func = NULL) 
maPlot(subject.four.Median, main = "Subject 4 - Global Median Normalization", lines.func = NULL, legend.func = NULL) 
maPlot(subject.four.Loess, main = "Subject 4 - Loess Normalization", lines.func = NULL, legend.func = NULL) 
maPlot(subject.four.PTL, main = "Subject 4 - Print-tip-group Loess Normalization", lines.func = NULL, legend.func = NULL) 

# Plot density plots of the log ratio values for each normalization (and pre normalization) for only array #4.  Put them all on the same plot.  Make sure to label the axes and provide a legend.

# how to use na.omit https://statisticsglobe.com/na-omit-r-example/

# maM - Object of class "matrix", intensity log-ratios (base 2) M, rows correspond to spotted probe sequences, columns to arrays in the batch

plot(density(na.omit(maM(subject.four))), main = "Log Ratio Values for Each Normalization and Pre-Normalization for Array #4",
     ylim=c(0, 1), xlim=c(-7, 9),col="purple")
lines(density(na.omit(maM(subject.four.Loess))), col="red")
lines(density(na.omit(maM(subject.four.Median))), col="blue")
lines(density(na.omit(maM(subject.four.PTL))), col="orange")
leg <- c("No Normalization", "Loess", "Global Median", "Print Tip Group Loess")

legend("topright", legend = leg,
       col = c("purple", "red", "blue", "orange"), 
       lty = 1)


# So, first extract the Cy5 foreground and background values for each of the 4 arrays and subtract the background from the foreground values, then log2 transform these values. Then calculate global median normalization on these 4 arrays using these background subtracted Cy5 values. Hint, you need to use the median of each array to scale, such that after normalization, all arrays will have a median of 1.

# maRf = Red foreground intensities (Cy5), maRb = Red background intensities (Cy5), subtract the 2 intensities and log2 data. apply by column(2)
log2.Cy5 <- log2(GSE12050@maRf - GSE12050@maRb)
apply(GSE12050@maRf - GSE12050@maRb, 2, function(x) sum(x < 0))

# http://web.mit.edu/~r/current/arch/i386_linux26/lib/R/library/limma/html/normalizeMedianAbsValues.html
# normalize with global median normalization
norm.log2.Cy5 <- normalizeMedianValues(log2.Cy5)
norm.log2.Cy5

# now all arrays have same median and so to scale just divide by the median, and ignore NA values
scaled.norm.log2.Cy5 <- norm.log2.Cy5/
  median(norm.log2.Cy5, na.rm = T)


# check median of each array is 1, apply by column(2)
apply(scaled.norm.log2.Cy5, 2, median, na.rm = T)


# Next calculate a Spearman's rank correlation between all 4 arrays that you normalized in #5 and do the same with the M values from loess normalized data that you generated in #2.  Plot a scatter plot matrix for each of the two normalizations (pairs() function), and be sure to label the arrays and title the plot. Print the correlation coefficients to the screen. 
library(limma)

colnames(scaled.norm.log2.Cy5) <- make.names(array.names, unique = TRUE)
cor.Cy5 <- cor(scaled.norm.log2.Cy5, method="spearman", use="complete.obs")
cor.Cy5

s.1 <- maM(subject.one.Loess)
s.2 <- maM(subject.two.Loess)
s.3 <- maM(subject.three.Loess)
s.4 <- maM(subject.four.Loess)

loess.data <- cbind(s.1, s.2, s.3, s.4)
array.names <- c("GSM304445", "GSM304446", "GSM304447", "GSM304448")
colnames(loess.data) <- make.names(array.names, unique = TRUE)  
cor.loess <- cor(loess.data, method = "spearman", use = "complete.obs")
cor.loess

pairs(cor.Cy5, main = "Scatter Plot of Normalized Data of Cy5", pch=21, col=1, 
      bg='pink')

print('Spearman Correlation Global Median Scaled log2 Cy5')
cor.Cy5

pairs(cor.loess, main = "Scatter Plot of Normalized Loess Arrays", pch=21, col=1, 
      bg='purple')

print('Spearman Correlation Normalized Loess Arrays')
cor.loess

# 1. Subtract the foreground - background for each of the 4 chips for only the Cy5 channel.  This should all be on the linear or raw scale (no logging yet).
Cy5 <- (GSE12050@maRf - GSE12050@maRb)
Cy5
# 2. Sort each column independently in this new matrix, apply by column(2)
Sort.Cy5 <- apply(Cy5, 2, sort)
Sort.Cy5
# 3. Calculate row means for the sorted matrix
Cy5.mean <- rowMeans(Sort.Cy5, na.rm = T)
Cy5.mean
# 4. Create a new matrix with each row having the same values as the sorted row mean vectors from step #3 (you should have a new R matrix)
Cy5.matrix <- matrix(rep(Cy5.mean, 4), ncol = 4, byrow = F)
Cy5.matrix
# 5. Rank the columns independently on the original background subtracted matrix (from step #1)  Hint: use the rank() function with the argument ties="first" or order(), apply by column(2)
Cy5.ranked <- apply(Cy5, 2, function(x) rank(x, ties="first"))
Cy5.ranked
# 6. Reorder the columns in the new matrix from step #4 using the ranks from step #5, apply by column(2)
Cy5.matrix.ranked <- apply(Cy5.matrix, 2, function(x) rank(x, ties = "first"))
Cy5.matrix.ranked

# To verify that each array has the same distribution, use the hist() function to look at various arrays
par(mfrow=c(4,1)) 
hist(Cy5.matrix[,1])
hist(Cy5.matrix[,2])
hist(Cy5.matrix[,3])
hist(Cy5.matrix[,4])

# Now log (base 2) the new R matrix you created from step 6 (question #7) and calculate a Spearman's rank correlation between the 4 arrays and plot a scatter plot matrix as you did before. Print the correlation coefficients to the screen

log.Cy5.matrix <- log2(Cy5.matrix)
log.Cy5.matrix

cor.Cy5.matrix <- cor(log.Cy5.matrix, method="spearman", use="complete.obs")
cor.Cy5.matrix

pairs(log.Cy5.matrix, 
           pch=21, col=1, bg='darkgreen',
           main='Spearman Correlation Normalized New Matrix Arrays', cex=0.4,
           method="spearman") 
print('Spearman Correlation Normalized New Matrix Arrays')
cor.Cy5.matrix

# Then change the normalization script from the lecture notes to include the housekeeping genes beta actin, GAPDH, and 18S. Look at the file to make sure the housekeepers are spelled correctly.

# qRT-PCR file formatting and calculation of fold changes

f.parse <- function(path=pa,file=fi,out=out.fi) {
  d <- read.table(paste(path,file,sep=""),skip=11,sep=",",header=T)
  u <- as.character(unique(d$Name))
  u <- u[u!=""]; u <- u[!is.na(u)];
  ref <- unique(as.character(d$Name[d$Type=="Reference"]))
  u <- unique(c(ref,u))
  hg <- c("B-ACTIN","GAPDH","18S")
  hg <- toupper(hg)
  p <- unique(toupper(as.character(d$Name.1)))
  p <- sort(setdiff(p,c("",hg)))
  
  mat <- matrix(0,nrow=length(u),ncol=length(p))
  dimnames(mat) <- list(u,p)
  for (i in 1:length(u)) {
    print(paste(i,": ",u[i],sep=""))
    tmp <- d[d$Name %in% u[i],c(1:3,6,9)]
    g <- toupper(unique(as.character(tmp$Name.1)))
    g <- sort(setdiff(g,c("",hg)))
    
    for (j in 1:length(g)) {
      v <- tmp[toupper(as.character(tmp$Name.1)) %in% g[j],5]
      v <- v[v!=999]
      v <- v[((v/mean(v))<1.5) & ((v/mean(v))>0.67)]	#gene j vector
      
      hv3 <- NULL
      for (k in 1:length(hg)) {	#housekeeping gene vector (each filtered by reps)
        hv <- tmp[toupper(as.character(tmp$Name.1)) %in% hg[k],5]
        hv <- hv[hv!=999]
        hv3 <- c(hv3,hv[((hv/mean(hv))<1.5) & ((hv/mean(hv))>0.67)]) 	
      }
      
      sv <- mean(as.numeric(v)) - mean(as.numeric(hv3))	#scaled value for gene j
      
      if(i==1) { #reference sample only
        mat[u[i],g[j]] <- sv
        next
      }
      
      mat[u[i],g[j]] <- sv - mat[u[1],g[j]]
    }
  }
  
  mat[1,][!is.na(mat[1,])] <- 0
  fc <- 2^(-1 * mat)
  write.table(t(c("Subject",dimnames(mat)[[2]])),paste(path,out,sep=""),quote=F,sep="\t",col.names=F,row.names=F)
  write.table(round(fc,3),paste(path,out,sep=""),quote=F,sep="\t",append=T,col.names=F)
}


# run function
pa <- "C:\\Users\\amand\\Documents\\Gene Expression Data Analysis and Visualization JHU 2022\\"
fi <- "Inflammation_qRT-PCR.csv"
out.fi <- "fold_chg_matrix.txt"

f.parse(pa,fi,out.fi)

# Read the normalized qRT-PCR data matrix into R, using a Spearman's rank correlation, which two patients are most correlated? Plot these two patients against each other in a scatter plot.


pcr <- t(read.table("C:\\Users\\amand\\Documents\\Gene Expression Data Analysis and Visualization JHU 2022\\fold_chg_matrix.txt",
                    sep = "\t", header = T, row.names = 1))

cor.pcr <- cor(pcr, method = "spearman", use = "complete.obs")

## remove diag of 1 to find index of highest match
## https://stackoverflow.com/questions/12471780/set-diagonal-of-a-matrix-to-zero-in-r
diag(cor.pcr) = NA

# find index
# https://stackoverflow.com/questions/17606906/find-row-and-column-index-of-maximum-value-in-a-matrix
which(cor.pcr == max(cor.pcr, na.rm=T), arr.ind = T)
correlation <- round(max(cor.pcr, na.rm=T), 4)

## get the sample names
highest.correlated <- row.names(which(cor.pcr == max(cor.pcr, na.rm=T), 
                                      arr.ind = T))
highest.correlated
## new data x and y
pcr.dat.plot <- pcr[, highest.correlated]
plot(pcr.dat.plot, pch=23, main="Two most correlated samples")
text(3, 55, paste("Spearman correlation = ", correlation), pos=4)
abline(0, 1, col="red")