-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #56 from VEuPathDB/feature-54-add-PCA
Feature 54 add pca
- Loading branch information
Showing
6 changed files
with
206 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,79 @@ | ||
|
||
#' PCA | ||
#' | ||
#' @param collection A Collection object | ||
#' @param nPCs Number of principal components to return. Default 10 | ||
#' @param ntop Use the top ntop genes with the highest variance for the pca computation. Mirrors the deseq2 plotPCA argument. Default 500. | ||
#' @param verbose Boolean indicating if extra messaging should be printed. | ||
#' @return A ComputeResult object. The data slot contains a data.table with the id columns and the first nPCs principal components. | ||
#' @export | ||
setGeneric("pca", | ||
function(collection, nPCs = 10, ntop = 500, verbose = c(TRUE, FALSE)) standardGeneric("pca"), | ||
signature = c("collection") | ||
) | ||
|
||
#' @export | ||
setMethod(pca, "Collection", | ||
function(collection, nPCs = 10, ntop = 500, verbose = c(TRUE, FALSE)) { | ||
|
||
verbose <- veupathUtils::matchArg(verbose) | ||
assay <- getCollectionData(collection) | ||
recordIdColumn <- collection@recordIdColumn | ||
ancestorIdColumns <- collection@ancestorIdColumns | ||
allIdColumns <- c(recordIdColumn, ancestorIdColumns) | ||
entity <- veupathUtils::strSplit(recordIdColumn,".", 4, 1) | ||
|
||
# Remove id columns from the assay to get only the features. | ||
features <- assay[, -..allIdColumns] # features has samples as rows. | ||
|
||
# Update ntop if it's too large. | ||
if (ntop > ncol(features)) { | ||
if (verbose) { | ||
message("ntop is larger than the number of features. Using all features.") | ||
} | ||
ntop <- min(ntop, ncol(features)) | ||
} | ||
|
||
# Ensure ntop is at least 1. | ||
if (ntop <= 1) { | ||
stop("ntop must be at least 2.") | ||
} | ||
|
||
# Use prcomp to perform PCA. | ||
# The following is heavily borrowed from the deseq2 plotPCA function. | ||
rowVariances <- matrixStats::rowVars(t(as.matrix(features))) | ||
keepFeatures <- order(rowVariances, decreasing=TRUE)[seq_len(ntop)] | ||
pcaResult <- prcomp(features[, ..keepFeatures]) | ||
|
||
|
||
# Assemble the output ComputeResult data and variable metadata. | ||
dt <- assay[, ..allIdColumns] | ||
# The PCA results are in pcaResult$x. Keep the first nPCs PCS. | ||
dt <- cbind(dt, pcaResult$x[, 1:nPCs]) # this works fine even with one id column | ||
|
||
variableMetadataList <- lapply(1:nPCs, function(i) { | ||
veupathUtils::VariableMetadata( | ||
variableClass = veupathUtils::VariableClass(value = "computed"), | ||
variableSpec = veupathUtils::VariableSpec(variableId = paste0("PC",i), entityId = entity), | ||
displayName = paste0("PC ",i), | ||
displayRangeMin = min(pcaResult$x[,i]), | ||
displayRangeMax = max(pcaResult$x[,i]), | ||
dataType = veupathUtils::DataType(value = "NUMBER"), | ||
dataShape = veupathUtils::DataShape(value = "CONTINUOUS"), | ||
plotReference = veupathUtils::PlotReference(value = "undefined") | ||
) | ||
}) | ||
|
||
result <- new("ComputeResult") | ||
result@name <- 'pca' | ||
result@recordIdColumn <- recordIdColumn | ||
result@ancestorIdColumns <- ancestorIdColumns | ||
result@data <- dt | ||
result@parameters <- paste0('recordIdColumn = ', recordIdColumn,", nPCs = ", nPCs, ', ntop = ', ntop) | ||
result@computedVariableMetadata <- veupathUtils::VariableMetadataList( | ||
S4Vectors::SimpleList(variableMetadataList) | ||
) | ||
|
||
return(result) | ||
} | ||
) |
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,100 @@ | ||
# Test PCA | ||
|
||
test_that("The pca function produces the expected output", { | ||
|
||
testData <- testCountDataCollection | ||
|
||
# Run pca | ||
load_all() | ||
output <- veupathUtils::pca(testData, nPCs = 2) | ||
expect_s4_class(output, "ComputeResult") | ||
outputData <- output@data | ||
expect_s3_class(outputData, "data.table") | ||
expect_equal(nrow(outputData), nrow(getCollectionData(testData))) | ||
expect_equal(ncol(outputData), 3) | ||
expect_equal(names(outputData), c("entity.SampleID", "PC1", "PC2")) | ||
computedVariableMetadata <- output@computedVariableMetadata | ||
expect_s4_class(computedVariableMetadata, "VariableMetadataList") | ||
expect_equal(length(computedVariableMetadata), 2) | ||
expect_s4_class(computedVariableMetadata[[1]], "VariableMetadata") | ||
expect_s4_class(computedVariableMetadata[[2]], "VariableMetadata") | ||
expect_equal(computedVariableMetadata[[1]]@variableSpec@variableId, "PC1") | ||
expect_equal(computedVariableMetadata[[2]]@variableSpec@variableId, "PC2") | ||
|
||
|
||
# Generate some fake data | ||
nSamples <- 500 | ||
fakeData <- data.table( | ||
entity.SampleID = paste0("Sample", 1:nSamples), | ||
feat1 = rnorm(nSamples), | ||
feat2 = rnorm(nSamples), | ||
feat3 = rnorm(nSamples) | ||
) | ||
|
||
fakeCollection <- new("Collection", data = fakeData, recordIdColumn = "entity.SampleID", name="test") | ||
|
||
output <- veupathUtils::pca(fakeCollection, nPCs = 2, ntop=20) | ||
expect_s4_class(output, "ComputeResult") | ||
outputData <- output@data | ||
expect_s3_class(outputData, "data.table") | ||
expect_equal(nrow(outputData), nSamples) | ||
expect_equal(ncol(outputData), 3) | ||
expect_equal(names(outputData), c("entity.SampleID", "PC1", "PC2")) | ||
computedVariableMetadata <- output@computedVariableMetadata | ||
expect_s4_class(computedVariableMetadata, "VariableMetadataList") | ||
expect_equal(length(computedVariableMetadata), 2) | ||
expect_s4_class(computedVariableMetadata[[1]], "VariableMetadata") | ||
expect_s4_class(computedVariableMetadata[[2]], "VariableMetadata") | ||
expect_equal(computedVariableMetadata[[1]]@variableSpec@variableId, "PC1") | ||
expect_equal(computedVariableMetadata[[2]]@variableSpec@variableId, "PC2") | ||
}) | ||
|
||
test_that("The pca function can handle messy data", { | ||
|
||
# Generate some fake data | ||
nSamples <- 500 | ||
fakeData <- data.table( | ||
entity.SampleID = paste0("Sample", 1:nSamples), | ||
entity.feat1 = rnorm(nSamples), | ||
entity.feat2 = rnorm(nSamples), | ||
entity.feat3 = rnorm(nSamples) | ||
) | ||
|
||
fakeCollection <- new("Collection", | ||
data = fakeData, | ||
recordIdColumn = "entity.SampleID", | ||
name="test", | ||
imputeZero=TRUE | ||
) | ||
|
||
# Add some missing values and ensure one sample gets dropped | ||
fakeCollection@data[1:50, "entity.feat1"] <- NA | ||
fakeCollection@data[25:100, "entity.feat2"] <- 0 | ||
fakeCollection@data[(nSamples-50):nSamples, "entity.feat3"] <- NA | ||
fakeCollection@data[26, "entity.feat3"] <- NA | ||
|
||
output <- pca(fakeCollection, nPCs = 3, ntop=5, verbose=T) | ||
expect_s4_class(output, "ComputeResult") | ||
outputData <- output@data | ||
expect_s3_class(outputData, "data.table") | ||
expect_equal(nrow(outputData), nSamples-1) | ||
expect_equal(ncol(outputData), 4) | ||
expect_equal(names(outputData), c("entity.SampleID", "PC1", "PC2", "PC3")) | ||
computedVariableMetadata <- output@computedVariableMetadata | ||
expect_s4_class(computedVariableMetadata, "VariableMetadataList") | ||
expect_equal(length(computedVariableMetadata), 3) | ||
expect_s4_class(computedVariableMetadata[[1]], "VariableMetadata") | ||
expect_s4_class(computedVariableMetadata[[2]], "VariableMetadata") | ||
expect_s4_class(computedVariableMetadata[[3]], "VariableMetadata") | ||
expect_equal(computedVariableMetadata[[1]]@variableSpec@variableId, "PC1") | ||
expect_equal(computedVariableMetadata[[1]]@variableSpec@entityId, "entity") | ||
expect_equal(computedVariableMetadata[[2]]@variableSpec@variableId, "PC2") | ||
expect_equal(computedVariableMetadata[[2]]@variableSpec@entityId, "entity") | ||
expect_equal(computedVariableMetadata[[3]]@variableSpec@variableId, "PC3") | ||
expect_equal(computedVariableMetadata[[3]]@variableSpec@entityId, "entity") | ||
|
||
|
||
# Test with not enough features | ||
expect_error(veupathUtils::pca(fakeCollection, nPCs = 2, ntop=1, verbose=T), "ntop must be at least 2.") | ||
|
||
}) |