AnalysisFunctions.R

##helper function to calculate two-tailed T-
calculatePvaluefromTvalue <- function(t.value, df){
  return(2*pt(-abs(t.value), df))
}

customWrite <- function(DF, filename, ...){
  write.table(DF, filename, sep = "\t", row.names = FALSE, ...)
}

customRead <- function(filename, ...){
  read.table(filename, sep = '\t', header = TRUE, stringsAsFactors = FALSE, ...)
}
##function for cleaning hsbc(after getting the correct subjects in the hsbc)

cleanHsbc <- function(hsbcDF){
  ##get the hsbc right
  hsbcDF$vhash <- NULL
  hsbcDF$jhash <- NULL
  hsbcDF$chain <- NULL
  hsbcDF <- as.data.frame(t(hsbcDF))
  names(hsbcDF) <- c("age", "dbid", "gender", "positive_602", "positive_0301", "beforeafter", "casecontrol", "cell_type", "rs1483979")
  hsbcDF <- hsbcDF[-1,]
  hsbcDF$age <- as.numeric(as.character(hsbcDF$age))
  hsbcDF$gender <- factor(as.character(hsbcDF$gender))
  hsbcDF$positive_602 <- factor(as.character(hsbcDF$positive_602))
  hsbcDF$positive_0301 <- factor(as.character(hsbcDF$positive_0301))
  hsbcDF$casecontrol <- factor(as.character(hsbcDF$casecontrol))
  hsbcDF$cell_type <- factor(as.character(hsbcDF$cell_type))
  hsbcDF$rs1483979 <- factor(as.character(hsbcDF$rs1483979))
  return(hsbcDF)
}

##function for identifying two types of alpha-J24s
identifyJ24 <- function(DF = data.frame()){
  J24.df <- subset(DF,jhash=="TRAJ24*01")
  not.J24.df <- subset(DF, jhash!="TRAJ24*01")
  #get ones that match the "SWGK[FL][RQ]$" pattern
  J24.df <- J24.df[grepl("SWGK[FL][RQ]$", J24.df$covariate),]
  J24.df$J24type <- sapply(J24.df$covariate, function(x){ifelse(grepl("L[RQ]$", x), "01", "02")}, USE.NAMES = FALSE, simplify = TRUE)
  J24.df$jhash <- paste(J24.df$jhash, J24.df$J24type, sep = ":")
  J24.df$J24type <- NULL
  return(rbind(J24.df, not.J24.df))
}

##analysis functions
##function for building dataframes for usage analyses
##data:the dataframe of raw counts
##columns.to.aggregate.by: named list 
##for example: alpha.V.cd4  <- aggregateForUsage(alpha.cd4, list(vhash = alpha.cd4$vhash), 1:3)
aggregateForUsage <- function(data, columns.to.aggregate.by, columns.to.omit = 1:4, agg.function = sum){
  return(aggregate(data[, -columns.to.omit], by = columns.to.aggregate.by, FUN = agg.function ))
}

#scale to smallest library size
normalize<-function(mat=matrix()){
  col.sums<-colSums(mat)
  if(prod(col.sums)){
    t(t(mat)/col.sums)*min(col.sums)
  }
  else{
    stop("some columns sum to zero")
  }
}

#scale to frequencies
normalize2 <- function(mat=matrix()){
  col.sums<-colSums(mat)
  if(prod(col.sums)){
    t(t(mat)/col.sums)
  }
  else{
    stop("some columns sum to zero")
  }
}

##This is according to 
##http://grokbase.com/t/r/bioconductor/127r11kp23/bioc-edger-and-libsize-normalization
##answer by Mark Robinson
getTMMNormalizedMatrix <- function(M){
  require(edgeR)
  norm.factors <- calcNormFactors(M, method = "TMM")
  lib.sizes    <- colSums(M)
  t(t(M)/(norm.factors*lib.sizes))
}


##removeRareClones assumes first column contains clone names, and the rest of the columns contain actual numeric data
removeRareClones<-function(m,N){
  m[rowSums(m[,-1]>0)>N,]
}

wrapNormalize <- function(DF){
  mat <- as.matrix(sapply(DF[, -1], as.numeric))
  mat <- normalize(mat)
  return(cbind(name = DF[, 1], as.data.frame(mat)))
}

##does not order the result 
doLogmodel <- function(DF, hsbc = NULL){  
  name <- DF$name
  DF$name <- NULL
  bigmodel <- lm(log(t(DF)+1) ~ hsbc$age + hsbc$gender + hsbc$positive_0301 + hsbc$casecontrol + hsbc$rs1483979)
  bigsummary <- summary(bigmodel)
  coef     <- lapply(bigsummary, function(x)x$coefficients[,4])
  eff      <- lapply(bigsummary,function(x)x$coefficients[,1])
  coefDF   <- as.data.frame(coef)
  effDF    <- as.data.frame(eff)
  minpvals <- apply(coefDF,2,min)
  coefDF   <- rbind(coefDF,minpvals)
  names(coefDF) <- name 
  names(effDF) <- name 
  coefDF <- rbind(coefDF,effDF)
  return(coefDF)
}

##simple log(x+1) transformed model with just case control status as covariate

doSimpleLogmodel <- function(DF, hsbc = NULL){  
  name <- DF$name
  DF$name <- NULL
  bigmodel <- lm(log(t(DF)+1) ~ hsbc$casecontrol)
  bigsummary <- summary(bigmodel)
  coef     <- lapply(bigsummary, function(x)x$coefficients[,4])
  eff      <- lapply(bigsummary,function(x)x$coefficients[,1])
  coefDF   <- as.data.frame(coef)
  effDF    <- as.data.frame(eff)
  minpvals <- apply(coefDF,2,min)
  coefDF   <- rbind(coefDF,minpvals)
  names(coefDF) <- name 
  names(effDF) <- name 
  coefDF <- rbind(coefDF,effDF)
  return(coefDF)
}

##same function with casecontrol and celltype as covariates
doSimpleLogmodelwithcelltype <- function(DF, hsbc = NULL){  
  name <- as.character(DF[, 1])
  DF[, 1] <- NULL
  bigmodel <- lm(log(t(DF)+1) ~ hsbc$casecontrol + hsbc$cell_type)
  bigsummary <- summary(bigmodel)
  coef     <- lapply(bigsummary, function(x)x$coefficients[,4])
  eff      <- lapply(bigsummary,function(x)x$coefficients[,1])
  coefDF   <- as.data.frame(coef)
  effDF    <- as.data.frame(eff)
  minpvals <- apply(coefDF,2,min)
  coefDF   <- rbind(coefDF,minpvals)
  names(coefDF) <- name 
  names(effDF) <- name 
  coefDF <- data.frame(name = name, t(rbind(coefDF,effDF)))
  names(coefDF)[5] <- "MinP"
  return(coefDF)
}

##same function with casecontrol and 301-status as covariates
doSimpleLogmodelwith301status <- function(DF, hsbc = NULL){  
  name <- as.character(DF[, 1])
  DF[, 1] <- NULL
  bigmodel <- lm(log(t(DF)+1) ~ hsbc$casecontrol + hsbc$positive0301)
  bigsummary <- summary(bigmodel)
  coef     <- lapply(bigsummary, function(x)x$coefficients[,4])
  eff      <- lapply(bigsummary,function(x)x$coefficients[,1])
  coefDF   <- as.data.frame(coef)
  effDF    <- as.data.frame(eff)
  minpvals <- apply(coefDF,2,min)
  coefDF   <- rbind(coefDF,minpvals)
  names(coefDF) <- name 
  names(effDF) <- name 
  coefDF <- data.frame(name = name, t(rbind(coefDF,effDF)))
  names(coefDF)[5] <- "MinP"
  return(coefDF)
}


##calculate standard error of a vector
std <- function(x) sd(x, na.rm =TRUE)/sqrt(length(x[!is.na(x)]))

##function to perform wilcox test for case/control groups according to a character vector. By default perform an unpaired wilcoxon test.
doWilcoxtest<-function(DF = NULL, casecontrol = NULL, paired = FALSE  ){
  name       <- NA
  testp      <- NA
  meancase <- NA
  SEMcase  <- NA
  SEMctrl   <- NA
  meanctrl  <- NA
  
  tDF   <- as.data.frame(t(DF[, -1]))
  nimet <- as.character(DF[, 1])
  
  for (i in 1:ncol(tDF)) {
    testp[i]    <- wilcox.test(tDF[, i] ~ casecontrol, paired = paired)$p.value
    name[i]     <-nimet[i]
    meancase[i] <-mean(tDF[which(casecontrol == "case") ,i])
    meanctrl[i] <-mean(tDF[which(casecontrol == "control") ,i])
    SEMcase[i]  <-std(tDF[which(casecontrol == "case") ,i])
    SEMctrl[i]  <-std(tDF[which(casecontrol == "control") ,i])
    
  }
  
  bound <- as.data.frame(cbind(name, testp, meancase, SEMcase, meanctrl, SEMctrl))
  
  return(bound)
}

##countNonZeroClones function:
##count the number of cases and controls for which the clone count is non-zero
##input: dataframe with names of clones on the first column, and numeric data in the rest of the columns
##       character vector indicating cases ("case") and controls ("control") 
countNonZeroClones <- function(DF, casecontrol = character()){
  case    <- which(casecontrol == "case")
  control <- which(casecontrol == "control")
  case.nonzero    <- rowSums(apply(DF[, -1][, case], c(1,2), function(x){x != 0}))
  control.nonzero <- rowSums(apply(DF[, -1][, control], c(1,2), function(x){x != 0}))
  names(case.nonzero)    <- NULL
  names(control.nonzero) <- NULL
  return(data.frame(casenonzeroN = case.nonzero, ctrlnonzeroN = control.nonzero))
}

##calculateCaseControlMeans function:
##calculate mean counts of cases and controls indicated by character vector in dataframe with names of clones in the first column and numeric
##data in the rest of the columns.
##outputs a two column dataframe with columns case.mean and column.mean containing the numerical mean count values
calculateCaseControlMeans <- function(DF, casecontrol = character()){
  case.indices    <- which(casecontrol == "case")
  control.indices <- which(casecontrol == "control")
  case.n    <- length(case.indices)
  control.n <- length(control.indices)
  case.mean    <- rowSums(DF[, -1][, case.indices]) / case.n
  control.mean <- rowSums(DF[, -1][, control.indices]) / control.n
  return(data.frame(case.mean = case.mean, control.mean = control.mean))
}


##buildContingencyTables
##input: numerical vectors of same length x, and y, indicating the number of present condition, 
##numbers totalX and totalY indicating the total amounts of subjects
##(cases vs controls for example).
##builds a list (of length length(x)=length(y)) of tables ready-for-use for chisq.test function in {base}

buildContingencyTables <- function(x, y, totalX, totalY){
  if ((length(x) != length(y)) | (length(x)*length(y) == 0)){
    stop("the input vectors have to be of equal, nonzero, length")
  }
  result = list()
  for (i in 1:length(x)){
    xpresent <- x[[i]]
    xabsent  <- totalX - x[[i]]
    ypresent <- y[[i]]
    yabsent  <- totalY - y[[i]]
    t <- as.table(rbind(c(xpresent, xabsent), c(ypresent, yabsent)))
    dimnames(t) <- list(cactrl = c("case", "control"), presence = c("present", "absent"))
    result[[i]] <- t
  }
  return(result)
}

##wrapper for analysis functions requires dplyr
buildWilcoxDataFrameWithExtras <- function(DF = data.frame(), casecontrol = character()){
  require(dplyr)
  result <- data.frame(doWilcoxtest(DF = DF, casecontrol = casecontrol), countNonZeroClones(DF = DF, casecontrol = casecontrol))
  n.case <- length(which(casecontrol == "case"))
  n.ctrl <- length(which(casecontrol == "control"))
  result <- mutate(result, caseminusctrl = casenonzeroN - ctrlnonzeroN)
  result <- mutate(result, caseminusctrlpercentage = (casenonzeroN/n.case - ctrlnonzeroN/n.ctrl)*100)
  return(result)
}

combineNames <- function(DF = data.frame(), sep = "_"){
  DF$chain <- NULL
  DF <- data.frame(name = paste(DF[, 1], DF[, 2], DF[, 3], sep = sep), DF[, -c(1:3)], stringsAsFactors = FALSE)
  return(DF)
}