Skip to content

Commit

Permalink
Merge pull request #56 from VEuPathDB/feature-54-add-PCA
Browse files Browse the repository at this point in the history
Feature 54 add pca
  • Loading branch information
asizemore authored Feb 6, 2025
2 parents 8e94c92 + adc86b8 commit 7239006
Show file tree
Hide file tree
Showing 6 changed files with 206 additions and 1 deletion.
1 change: 1 addition & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ Collate:
'data.R'
'method-correlation.R'
'method-differentialExpression.R'
'method-pca.R'
'methods-Bin.R'
'methods-CollectionWithMetadata.R'
'methods-Collections.R'
Expand Down
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ export(naToZero)
export(name)
export(new_data_frame)
export(nonZeroRound)
export(pca)
export(predicateFactory)
export(pruneFeatures)
export(removeAttr)
Expand Down Expand Up @@ -170,6 +171,7 @@ exportMethods(getStudyIdColumnName)
exportMethods(getVariableSpec)
exportMethods(getVariableSpecColumnName)
exportMethods(merge)
exportMethods(pca)
exportMethods(predicateFactory)
exportMethods(toJSON)
exportMethods(whichValuesInBin)
Expand Down
2 changes: 1 addition & 1 deletion R/class-VariableMetadata.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

variable_classes <- c('native', 'derived', 'computed')
#the other option is to just let this be any character vector so long as it has only a single value..
plot_references <- c('xAxis', 'yAxis', 'zAxis', 'overlay', 'facet1', 'facet2', 'geo', 'latitude', 'longitude')
plot_references <- c('xAxis', 'yAxis', 'zAxis', 'overlay', 'facet1', 'facet2', 'geo', 'latitude', 'longitude', 'undefined')
data_types <- c('NUMBER', 'STRING', 'INTEGER', 'DATE', 'LONGITUDE')
data_shapes <- c('CONTINUOUS', 'CATEGORICAL', 'ORDINAL', 'BINARY')

Expand Down
79 changes: 79 additions & 0 deletions R/method-pca.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@

#' PCA
#'
#' @param collection A Collection object
#' @param nPCs Number of principal components to return. Default 10
#' @param ntop Use the top ntop genes with the highest variance for the pca computation. Mirrors the deseq2 plotPCA argument. Default 500.
#' @param verbose Boolean indicating if extra messaging should be printed.
#' @return A ComputeResult object. The data slot contains a data.table with the id columns and the first nPCs principal components.
#' @export
setGeneric("pca",
function(collection, nPCs = 10, ntop = 500, verbose = c(TRUE, FALSE)) standardGeneric("pca"),
signature = c("collection")
)

#' @export
setMethod(pca, "Collection",
function(collection, nPCs = 10, ntop = 500, verbose = c(TRUE, FALSE)) {

verbose <- veupathUtils::matchArg(verbose)
assay <- getCollectionData(collection)
recordIdColumn <- collection@recordIdColumn
ancestorIdColumns <- collection@ancestorIdColumns
allIdColumns <- c(recordIdColumn, ancestorIdColumns)
entity <- veupathUtils::strSplit(recordIdColumn,".", 4, 1)

# Remove id columns from the assay to get only the features.
features <- assay[, -..allIdColumns] # features has samples as rows.

# Update ntop if it's too large.
if (ntop > ncol(features)) {
if (verbose) {
message("ntop is larger than the number of features. Using all features.")
}
ntop <- min(ntop, ncol(features))
}

# Ensure ntop is at least 1.
if (ntop <= 1) {
stop("ntop must be at least 2.")
}

# Use prcomp to perform PCA.
# The following is heavily borrowed from the deseq2 plotPCA function.
rowVariances <- matrixStats::rowVars(t(as.matrix(features)))
keepFeatures <- order(rowVariances, decreasing=TRUE)[seq_len(ntop)]
pcaResult <- prcomp(features[, ..keepFeatures])


# Assemble the output ComputeResult data and variable metadata.
dt <- assay[, ..allIdColumns]
# The PCA results are in pcaResult$x. Keep the first nPCs PCS.
dt <- cbind(dt, pcaResult$x[, 1:nPCs]) # this works fine even with one id column

variableMetadataList <- lapply(1:nPCs, function(i) {
veupathUtils::VariableMetadata(
variableClass = veupathUtils::VariableClass(value = "computed"),
variableSpec = veupathUtils::VariableSpec(variableId = paste0("PC",i), entityId = entity),
displayName = paste0("PC ",i),
displayRangeMin = min(pcaResult$x[,i]),
displayRangeMax = max(pcaResult$x[,i]),
dataType = veupathUtils::DataType(value = "NUMBER"),
dataShape = veupathUtils::DataShape(value = "CONTINUOUS"),
plotReference = veupathUtils::PlotReference(value = "undefined")
)
})

result <- new("ComputeResult")
result@name <- 'pca'
result@recordIdColumn <- recordIdColumn
result@ancestorIdColumns <- ancestorIdColumns
result@data <- dt
result@parameters <- paste0('recordIdColumn = ', recordIdColumn,", nPCs = ", nPCs, ', ntop = ', ntop)
result@computedVariableMetadata <- veupathUtils::VariableMetadataList(
S4Vectors::SimpleList(variableMetadataList)
)

return(result)
}
)
23 changes: 23 additions & 0 deletions man/pca.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

100 changes: 100 additions & 0 deletions tests/testthat/test-method-pca.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
# Test PCA

test_that("The pca function produces the expected output", {

testData <- testCountDataCollection

# Run pca
load_all()
output <- veupathUtils::pca(testData, nPCs = 2)
expect_s4_class(output, "ComputeResult")
outputData <- output@data
expect_s3_class(outputData, "data.table")
expect_equal(nrow(outputData), nrow(getCollectionData(testData)))
expect_equal(ncol(outputData), 3)
expect_equal(names(outputData), c("entity.SampleID", "PC1", "PC2"))
computedVariableMetadata <- output@computedVariableMetadata
expect_s4_class(computedVariableMetadata, "VariableMetadataList")
expect_equal(length(computedVariableMetadata), 2)
expect_s4_class(computedVariableMetadata[[1]], "VariableMetadata")
expect_s4_class(computedVariableMetadata[[2]], "VariableMetadata")
expect_equal(computedVariableMetadata[[1]]@variableSpec@variableId, "PC1")
expect_equal(computedVariableMetadata[[2]]@variableSpec@variableId, "PC2")


# Generate some fake data
nSamples <- 500
fakeData <- data.table(
entity.SampleID = paste0("Sample", 1:nSamples),
feat1 = rnorm(nSamples),
feat2 = rnorm(nSamples),
feat3 = rnorm(nSamples)
)

fakeCollection <- new("Collection", data = fakeData, recordIdColumn = "entity.SampleID", name="test")

output <- veupathUtils::pca(fakeCollection, nPCs = 2, ntop=20)
expect_s4_class(output, "ComputeResult")
outputData <- output@data
expect_s3_class(outputData, "data.table")
expect_equal(nrow(outputData), nSamples)
expect_equal(ncol(outputData), 3)
expect_equal(names(outputData), c("entity.SampleID", "PC1", "PC2"))
computedVariableMetadata <- output@computedVariableMetadata
expect_s4_class(computedVariableMetadata, "VariableMetadataList")
expect_equal(length(computedVariableMetadata), 2)
expect_s4_class(computedVariableMetadata[[1]], "VariableMetadata")
expect_s4_class(computedVariableMetadata[[2]], "VariableMetadata")
expect_equal(computedVariableMetadata[[1]]@variableSpec@variableId, "PC1")
expect_equal(computedVariableMetadata[[2]]@variableSpec@variableId, "PC2")
})

test_that("The pca function can handle messy data", {

# Generate some fake data
nSamples <- 500
fakeData <- data.table(
entity.SampleID = paste0("Sample", 1:nSamples),
entity.feat1 = rnorm(nSamples),
entity.feat2 = rnorm(nSamples),
entity.feat3 = rnorm(nSamples)
)

fakeCollection <- new("Collection",
data = fakeData,
recordIdColumn = "entity.SampleID",
name="test",
imputeZero=TRUE
)

# Add some missing values and ensure one sample gets dropped
fakeCollection@data[1:50, "entity.feat1"] <- NA
fakeCollection@data[25:100, "entity.feat2"] <- 0
fakeCollection@data[(nSamples-50):nSamples, "entity.feat3"] <- NA
fakeCollection@data[26, "entity.feat3"] <- NA

output <- pca(fakeCollection, nPCs = 3, ntop=5, verbose=T)
expect_s4_class(output, "ComputeResult")
outputData <- output@data
expect_s3_class(outputData, "data.table")
expect_equal(nrow(outputData), nSamples-1)
expect_equal(ncol(outputData), 4)
expect_equal(names(outputData), c("entity.SampleID", "PC1", "PC2", "PC3"))
computedVariableMetadata <- output@computedVariableMetadata
expect_s4_class(computedVariableMetadata, "VariableMetadataList")
expect_equal(length(computedVariableMetadata), 3)
expect_s4_class(computedVariableMetadata[[1]], "VariableMetadata")
expect_s4_class(computedVariableMetadata[[2]], "VariableMetadata")
expect_s4_class(computedVariableMetadata[[3]], "VariableMetadata")
expect_equal(computedVariableMetadata[[1]]@variableSpec@variableId, "PC1")
expect_equal(computedVariableMetadata[[1]]@variableSpec@entityId, "entity")
expect_equal(computedVariableMetadata[[2]]@variableSpec@variableId, "PC2")
expect_equal(computedVariableMetadata[[2]]@variableSpec@entityId, "entity")
expect_equal(computedVariableMetadata[[3]]@variableSpec@variableId, "PC3")
expect_equal(computedVariableMetadata[[3]]@variableSpec@entityId, "entity")


# Test with not enough features
expect_error(veupathUtils::pca(fakeCollection, nPCs = 2, ntop=1, verbose=T), "ntop must be at least 2.")

})

0 comments on commit 7239006

Please sign in to comment.