diff --git a/.Rbuildignore b/.Rbuildignore
index 4b3815dd8..708509a59 100644
--- a/.Rbuildignore
+++ b/.Rbuildignore
@@ -2,6 +2,7 @@
^renv\.lock$
^.*\.Rproj$
^\.Rproj\.user$
+^\.vscode$
standalone
deploy.sh
extras/*
@@ -11,5 +12,6 @@ compare_versions
docs/*
_pkgdown.yml
^vignettes/articles$
+^vignettes/Videos\.Rmd$
^doc$
^Meta$
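.Rbuildignore entries are Perl-compatible regular expressions matched against file paths relative to the package root, so an unescaped dot matches any character; the escaped dots above pin the patterns to literal names. A quick illustration in R:

grepl("^\\.vscode$", ".vscode", perl = TRUE)   # TRUE: matches the intended directory
grepl("^.vscode$", "xvscode", perl = TRUE)     # TRUE: an unescaped dot also matches this
grepl("^\\.vscode$", "xvscode", perl = TRUE)   # FALSE: escaping pins it to a literal dot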
diff --git a/.cproject b/.cproject
deleted file mode 100644
index fb7beacc2..000000000
--- a/.cproject
+++ /dev/null
@@ -1,143 +0,0 @@
diff --git a/.gitignore b/.gitignore
index cb6091781..a079224c7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -25,3 +25,8 @@ standalone/build/*
/inst/shiny/DiagnosticsExplorer/rsconnect/*
/doc/
/Meta/
+/extras/
+/results/
+/.vscode/
+.project
+.cproject
diff --git a/.project b/.project
deleted file mode 100644
index cdbed80c5..000000000
--- a/.project
+++ /dev/null
@@ -1,27 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<projectDescription>
-    <name>PatientLevelPrediction</name>
-    <comment></comment>
-    <projects>
-    </projects>
-    <buildSpec>
-        <buildCommand>
-            <name>org.eclipse.cdt.managedbuilder.core.genmakebuilder</name>
-            <triggers>clean,full,incremental,</triggers>
-            <arguments>
-            </arguments>
-        </buildCommand>
-        <buildCommand>
-            <name>org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder</name>
-            <triggers>full,incremental,</triggers>
-            <arguments>
-            </arguments>
-        </buildCommand>
-    </buildSpec>
-    <natures>
-        <nature>org.eclipse.cdt.core.cnature</nature>
-        <nature>org.eclipse.cdt.core.ccnature</nature>
-        <nature>org.eclipse.cdt.managedbuilder.core.managedBuildNature</nature>
-        <nature>org.eclipse.cdt.managedbuilder.core.ScannerConfigNature</nature>
-    </natures>
-</projectDescription>
diff --git a/DESCRIPTION b/DESCRIPTION
index f6f7a6b08..a56b87c28 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,7 +1,6 @@
Package: PatientLevelPrediction
Type: Package
-Title: Developing patient level prediction using data in the OMOP Common Data
- Model
+Title: Developing Patient Level Prediction Models Using the Observational Medical Outcomes Partnership Common Data Model
Version: 6.3.9.9999
Date: 2024-10-10
Authors@R: c(
@@ -10,12 +9,12 @@ Authors@R: c(
person("Martijn", "Schuemie", role = c("aut")),
person("Marc", "Suchard", role = c("aut")),
person("Patrick", "Ryan", role = c("aut")),
- person("Peter", "Rijnbeek", role = c("aut"))
- )
-Description: A user friendly way to create patient level prediction models using the OMOP common data model. Given a
+ person("Peter", "Rijnbeek", role = c("aut")),
+ person("Observational Health Data Science and Informatics", role = c("cph")))
+Description: A user-friendly way to create patient level prediction models using the OMOP (Observational Medical Outcomes Partnership) common data model. Given a
cohort of interest and an outcome of interest, the package can use data in the
OMOP Common Data Model to build a large set of features. These features can then
- be assessed to fit a predictive model using a number of machine learning algorithms.
+ be used to fit a predictive model with a number of machine learning algorithms.
Several performance measures are implemented for model evaluation.
License: Apache License 2.0
URL: https://ohdsi.github.io/PatientLevelPrediction, https://github.com/OHDSI/PatientLevelPrediction
@@ -27,50 +26,41 @@ Imports:
Andromeda,
Cyclops (>= 3.0.0),
DatabaseConnector (>= 6.0.0),
+ digest,
dplyr,
FeatureExtraction (>= 3.0.0),
- ggplot2,
- gridExtra,
Matrix,
memuse,
- mgcv,
ParallelLogger (>= 2.0.0),
- polspline,
pROC,
PRROC,
- reticulate (>= 1.30),
rlang,
SqlRender (>= 1.1.3),
- survival,
tidyr,
utils
Suggests:
- AUC,
- BigKnn (>= 1.0.0),
- devtools,
- Eunomia,
+ Eunomia (>= 2.0.0),
+ ggplot2,
+ gridExtra,
IterativeHardThresholding,
knitr,
- markdown,
Metrics,
+ mgcv,
parallel,
- plyr,
- pool,
+ polspline,
readr,
ResourceSelection,
ResultModelManager (>= 0.2.0),
+ reticulate (>= 1.30),
rmarkdown,
RSQLite,
scoring,
- ShinyAppBuilder (>= 1.1.1),
+ OhdsiShinyAppBuilder (>= 1.0.0),
+ survival,
survminer,
testthat,
withr,
xgboost (> 1.3.2.1),
lightgbm
-Remotes:
- ohdsi/BigKnn,
- ohdsi/ShinyAppBuilder,
- ohdsi/ResultModelManager
RoxygenNote: 7.3.2
Encoding: UTF-8
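With ggplot2, survival, reticulate, polspline, and mgcv moving from Imports to Suggests above, call sites need a runtime guard before using them. A minimal sketch of the pattern (plotExample is a hypothetical name; the real guards appear in e.g. R/CalibrationSummary.R below):

plotExample <- function(data) {
  # fail early with an informative message when the suggested package is missing
  rlang::check_installed("ggplot2", reason = "plotExample requires ggplot2")
  ggplot2::ggplot(data, ggplot2::aes(x = x, y = y)) +
    ggplot2::geom_point()
}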
diff --git a/NAMESPACE b/NAMESPACE
index 33f338f90..de50241d5 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -22,6 +22,7 @@ export(createDatabaseSchemaSettings)
export(createDefaultExecuteSettings)
export(createDefaultSplitSetting)
export(createExecuteSettings)
+export(createExistingSplitSettings)
export(createFeatureEngineeringMapColumnsSettings)
export(createFeatureEngineeringSettings)
export(createGlmModel)
@@ -103,7 +104,6 @@ export(setCoxModel)
export(setDecisionTree)
export(setGradientBoostingMachine)
export(setIterativeHardThresholding)
-export(setKNN)
export(setLassoLogisticRegression)
export(setLightGBM)
export(setMLP)
diff --git a/NEWS.md b/NEWS.md
index 627b8acca..ab03023bc 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,5 +1,7 @@
PatientLevelPrediction develop
======================
+- Added existingSplit settings where users can split data based on an
+existing split
- Fixed linting errors and R codestyle in docs to conform to HADES style
- Remove links to pdf's, point to website instead.
- Fix broken links in Readme and BuildingPredictiveModels vignette
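The first NEWS entry corresponds to the new createExistingSplitSettings export in NAMESPACE. A sketch of the intended use, assuming it accepts a data.frame assigning each rowId to a fold index (positive indices for CV folds, -1 for the test set, matching the convention of other PLP split settings; verify against the function's help page):

existingSplit <- data.frame(
  rowId = c(1, 2, 3, 4),
  index = c(1, 2, -1, -1)  # rows 1-2: CV folds 1 and 2; rows 3-4: test set
)
splitSettings <- createExistingSplitSettings(splitIds = existingSplit)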
diff --git a/PatientLevelPrediction.Rproj b/PatientLevelPrediction.Rproj
index 6284fc105..396fe1267 100644
--- a/PatientLevelPrediction.Rproj
+++ b/PatientLevelPrediction.Rproj
@@ -1,4 +1,5 @@
Version: 1.0
+ProjectId: 6acb9f49-7428-4e24-8a2a-6b10f35b95e2
RestoreWorkspace: No
SaveWorkspace: No
diff --git a/R/AdditionalCovariates.R b/R/AdditionalCovariates.R
index dfb8abca8..141d4560a 100644
--- a/R/AdditionalCovariates.R
+++ b/R/AdditionalCovariates.R
@@ -24,7 +24,9 @@
#' cohort during the time periods relative to target population cohort index
#'
#' @param connection The database connection
-#' @param oracleTempSchema The temp schema if using oracle
+#'
+#' @param tempEmulationSchema The schema to use for temp tables
+#' @param oracleTempSchema DEPRECATED. The temp schema if using Oracle; use tempEmulationSchema instead
#' @param cdmDatabaseSchema The schema of the OMOP CDM data
#' @param cdmVersion version of the OMOP CDM data
#' @param cohortTable the table name that contains the target population cohort
@@ -39,18 +41,25 @@
#'
#' @export
getCohortCovariateData <- function(
- connection,
- oracleTempSchema = NULL,
- cdmDatabaseSchema,
- cdmVersion = "5",
- cohortTable = "#cohort_person",
- rowIdField = "row_id",
- aggregated,
- cohortIds,
- covariateSettings,
- ...
- ){
-
+ connection,
+ tempEmulationSchema = NULL,
+ oracleTempSchema = NULL,
+ cdmDatabaseSchema,
+ cdmVersion = "5",
+ cohortTable = "#cohort_person",
+ rowIdField = "row_id",
+ aggregated,
+ cohortIds,
+ covariateSettings,
+ ...) {
+ if (!is.null(oracleTempSchema) && oracleTempSchema != "") {
+ rlang::warn("The 'oracleTempSchema' argument is deprecated. Use 'tempEmulationSchema' instead.",
+ .frequency = "regularly",
+ .frequency_id = "oracleTempSchema"
+ )
+ tempEmulationSchema <- oracleTempSchema
+ }
+
# Some SQL to construct the covariate:
sql <- paste(
"select a.@row_id_field AS row_id, @covariate_id AS covariate_id,",
@@ -66,26 +75,27 @@ getCohortCovariateData <- function(
"where b.cohort_definition_id = @covariate_cohort_id
group by a.@row_id_field; "
)
-
+
if (is.null(covariateSettings$cohortTable) &&
- (is.null(covariateSettings$cohortDatabaseSchema))) {
- ParallelLogger::logInfo("cohortTable and cohortDatabaseSchema not specified in
+ (is.null(covariateSettings$cohortDatabaseSchema))) {
+ ParallelLogger::logInfo("cohortTable and cohortDatabaseSchema not specified in
cohort covariateSettings. Attempting to fetch from databaseDetails")
- # use settings from databaseDetails which is two frames up
+ # use settings from databaseDetails which is two frames up
# in the call stack
tryCatch(databaseDetails <- get("databaseDetails", parent.frame(n = 2)),
error = function(e) {
- stop("cohortTable and cohortDatabaseSchema not specified in
- cohort covariateSettings. Attempt to fetch databaseDetails from parent
+ stop("cohortTable and cohortDatabaseSchema not specified in
+ cohort covariateSettings. Attempt to fetch databaseDetails from parent
frame failed with error: ", e$message)
- })
+ }
+ )
cohortCovariateTable <- databaseDetails$cohortTable
cohortCovariateDatabaseSchema <- databaseDetails$cohortDatabaseSchema
} else {
cohortCovariateTable <- covariateSettings$cohortTable
cohortCovariateDatabaseSchema <- covariateSettings$cohortDatabaseSchema
}
-
+
sql <- SqlRender::render(
sql,
covariate_cohort_schema = cohortCovariateDatabaseSchema,
@@ -100,48 +110,49 @@ getCohortCovariateData <- function(
ageInteraction = covariateSettings$ageInteraction,
lnAgeInteraction = covariateSettings$lnAgeInteraction,
cdm_database_schema = cdmDatabaseSchema
- )
-
+ )
+
sql <- SqlRender::translate(
- sql = sql,
+ sql = sql,
targetDialect = attr(connection, "dbms"),
- tempEmulationSchema = oracleTempSchema
- )
-
+ tempEmulationSchema = tempEmulationSchema
+ )
+
# Retrieve the covariate:
covariates <- DatabaseConnector::querySql(connection, sql)
-
+
# Convert colum names to camelCase:
colnames(covariates) <- SqlRender::snakeCaseToCamelCase(colnames(covariates))
# Construct covariate reference:
sql <- "select @covariate_id as covariate_id, '@concept_set' as covariate_name,
@analysis_id as analysis_id, -1 as concept_id;"
sql <- SqlRender::render(
- sql = sql,
+ sql = sql,
covariate_id = covariateSettings$covariateId,
analysis_id = covariateSettings$analysisId,
- concept_set = paste('Cohort_covariate during day',
+ concept_set = paste(
+ "Cohort_covariate during day",
covariateSettings$startDay,
- 'through',
+ "through",
covariateSettings$endDays,
- 'days relative to index:',
- ifelse(covariateSettings$count, 'Number of', ''),
+ "days relative to index:",
+ ifelse(covariateSettings$count, "Number of", ""),
covariateSettings$covariateName,
- ifelse(covariateSettings$ageInteraction, ' X Age', ''),
- ifelse(covariateSettings$lnAgeInteraction, ' X ln(Age)', '')
+ ifelse(covariateSettings$ageInteraction, " X Age", ""),
+ ifelse(covariateSettings$lnAgeInteraction, " X ln(Age)", "")
)
)
-
+
sql <- SqlRender::translate(
- sql = sql,
- targetDialect = attr(connection, "dbms"),
- tempEmulationSchema = oracleTempSchema
- )
-
+ sql = sql,
+ targetDialect = attr(connection, "dbms"),
+ tempEmulationSchema = tempEmulationSchema
+ )
+
# Retrieve the covariateRef:
- covariateRef <- DatabaseConnector::querySql(connection, sql)
+ covariateRef <- DatabaseConnector::querySql(connection, sql)
colnames(covariateRef) <- SqlRender::snakeCaseToCamelCase(colnames(covariateRef))
-
+
analysisRef <- data.frame(
analysisId = covariateSettings$analysisId,
analysisName = "cohort covariate",
@@ -150,17 +161,17 @@ getCohortCovariateData <- function(
endDay = 0,
isBinary = "Y",
missingMeansZero = "Y"
- )
-
+ )
+
metaData <- list(sql = sql, call = match.call())
result <- Andromeda::andromeda(
covariates = covariates,
covariateRef = covariateRef,
analysisRef = analysisRef
- )
-
+ )
+
attr(result, "metaData") <- metaData
- class(result) <- "CovariateData"
+ class(result) <- "CovariateData"
return(result)
}
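The oracleTempSchema shim above relies on rlang's throttled warnings, so a deprecated argument keeps warning without flooding the log. The pattern in isolation (argument names are placeholders):

rlang::warn(
  "The 'oldArg' argument is deprecated. Use 'newArg' instead.",
  .frequency = "regularly",  # alternatives: "always", "once"
  .frequency_id = "oldArg"   # warnings sharing an id are throttled together
)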
@@ -172,19 +183,19 @@ getCohortCovariateData <- function(
#' cohort during the time periods relative to target population cohort index
#'
#' @param cohortName Name for the cohort
-#' @param settingId A unique id for the covariate time and
-#' @param cohortDatabaseSchema The schema of the database with the cohort. If
+#' @param settingId A unique id (between 1 and 100) used together with cohortId and analysisId to construct the covariateId
+#' @param cohortDatabaseSchema The schema of the database with the cohort. If
#' nothing is specified then the cohortDatabaseSchema from databaseDetails at runtime is used.
-#' @param cohortTable the table name that contains the covariate cohort. If
+#' @param cohortTable the table name that contains the covariate cohort. If
#' nothing is specified then the cohortTable from databaseDetails at runtime is used.
#' @param cohortId cohort id for the covariate cohort
#' @param startDay The number of days prior to index to start observing the cohort
#' @param endDay The number of days prior to index to stop observing the cohort
#' @param count If FALSE the covariate value is binary (1 means cohort occurred between index+startDay and index+endDay, 0 means it did not)
-#' If TRUE then the covariate value is the number of unique cohort_start_dates between index+startDay and index+endDay
-#' @param ageInteraction If TRUE multiple covariate value by the patient's age in years
-#' @param lnAgeInteraction If TRUE multiple covariate value by the log of the patient's age in years
-#' @param analysisId The analysisId for the covariate
+#' If TRUE then the covariate value is the number of unique cohort_start_dates between index+startDay and index+endDay
+#' @param ageInteraction If TRUE multiply the covariate value by the patient's age in years
+#' @param lnAgeInteraction If TRUE multiply the covariate value by the log of the patient's age in years
+#' @param analysisId The analysisId for the covariate
#'
#' @return
#' An object of class covariateSettings specifying how to create the cohort covariate with the covariateId
@@ -192,26 +203,24 @@ getCohortCovariateData <- function(
#'
#' @export
createCohortCovariateSettings <- function(
- cohortName,
- settingId,
- cohortDatabaseSchema=NULL,
- cohortTable=NULL,
- cohortId,
- startDay = -30,
- endDay = 0,
- count = F,
- ageInteraction = F,
- lnAgeInteraction = F,
- analysisId = 456
- ){
-
- if( settingId > 100 || settingId < 0){
- stop('settingId must be between 1 and 100')
+ cohortName,
+ settingId,
+ cohortDatabaseSchema = NULL,
+ cohortTable = NULL,
+ cohortId,
+ startDay = -30,
+ endDay = 0,
+ count = FALSE,
+ ageInteraction = FALSE,
+ lnAgeInteraction = FALSE,
+ analysisId = 456) {
+  if (settingId > 100 || settingId < 1) {
+ stop("settingId must be between 1 and 100")
}
-
+
covariateSettings <- list(
- covariateName = cohortName,
- covariateId = cohortId*100000+settingId*1000+analysisId,
+ covariateName = cohortName,
+ covariateId = cohortId * 100000 + settingId * 1000 + analysisId,
cohortDatabaseSchema = cohortDatabaseSchema,
cohortTable = cohortTable,
cohortIds = cohortId,
@@ -221,8 +230,8 @@ createCohortCovariateSettings <- function(
ageInteraction = ageInteraction,
lnAgeInteraction = lnAgeInteraction,
analysisId = analysisId
- )
-
+ )
+
attr(covariateSettings, "fun") <- "PatientLevelPrediction::getCohortCovariateData"
class(covariateSettings) <- "covariateSettings"
return(covariateSettings)
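createCohortCovariateSettings packs three identifiers into one covariateId as cohortId * 100000 + settingId * 1000 + analysisId, which is why settingId is capped at 100. A worked check with illustrative values:

cohortId <- 42; settingId <- 1; analysisId <- 456
cohortId * 100000 + settingId * 1000 + analysisId  # 4201456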
diff --git a/R/AndromedaHelperFunctions.R b/R/AndromedaHelperFunctions.R
index b6182610c..8b030f9cb 100644
--- a/R/AndromedaHelperFunctions.R
+++ b/R/AndromedaHelperFunctions.R
@@ -1,15 +1,15 @@
-# @file ffHelperFunctions.R
-#
-# Copyright 2021 Observational Health Data Sciences and Informatics
-#
-# This file is part of PatientLevelPrediction
-#
+# @file AndromedaHelperFunctions.R
+#
+# Copyright 2021 Observational Health Data Sciences and Informatics
+#
+# This file is part of PatientLevelPrediction
+#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
-#
+#
# http://www.apache.org/licenses/LICENSE-2.0
-#
+#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -17,48 +12,54 @@
# limitations under the License.
limitCovariatesToPopulation <- function(covariateData, rowIds) {
- ParallelLogger::logTrace(paste0('Starting to limit covariate data to population...'))
-
- metaData <- attr(covariateData, 'metaData')
-
- newCovariateData <- Andromeda::andromeda(covariateRef = covariateData$covariateRef,
- analysisRef = covariateData$analysisRef)
-
+ ParallelLogger::logTrace(paste0("Starting to limit covariate data to population..."))
+
+ metaData <- attr(covariateData, "metaData")
+
+ newCovariateData <- Andromeda::andromeda(
+ covariateRef = covariateData$covariateRef,
+ analysisRef = covariateData$analysisRef
+ )
+
covariateData$pop <- data.frame(rowId = rowIds)
- Andromeda::createIndex(tbl = covariateData$pop, columnNames = 'rowId',
- indexName = 'pop_rowIds')
-
- on.exit(covariateData$pop <- NULL, add = T)
-
- newCovariateData$covariates <- covariateData$covariates %>%
- dplyr::inner_join(covariateData$pop, by = 'rowId')
- Andromeda::createIndex(tbl = newCovariateData$covariates, columnNames = 'covariateId',
- indexName = 'covariates_ncovariateIds')
-
+ Andromeda::createIndex(
+ tbl = covariateData$pop, columnNames = "rowId",
+ indexName = "pop_rowIds"
+ )
+
+ on.exit(covariateData$pop <- NULL, add = TRUE)
+
+ newCovariateData$covariates <- covariateData$covariates %>%
+ dplyr::inner_join(covariateData$pop, by = "rowId")
+ Andromeda::createIndex(
+ tbl = newCovariateData$covariates, columnNames = "covariateId",
+ indexName = "covariates_ncovariateIds"
+ )
+
metaData$populationSize <- length(rowIds)
- attr(newCovariateData, 'metaData') <- metaData
+ attr(newCovariateData, "metaData") <- metaData
class(newCovariateData) <- "CovariateData"
- ParallelLogger::logTrace(paste0('Finished limiting covariate data to population...'))
+ ParallelLogger::logTrace(paste0("Finished limiting covariate data to population..."))
return(newCovariateData)
}
-batchRestrict <- function(covariateData, population, sizeN = 10000000){
-
- ParallelLogger::logInfo('Due to data size using batchRestrict to limit covariate data to population')
-
+batchRestrict <- function(covariateData, population, sizeN = 10000000) {
+ ParallelLogger::logInfo("Due to data size using batchRestrict to limit covariate data to population")
+
start <- Sys.time()
-
- metaData <- attr(covariateData, 'metaData')
-
- newCovariateData <- Andromeda::andromeda(covariateRef = covariateData$covariateRef,
- analysisRef = covariateData$analysisRef)
-
+
+ metaData <- attr(covariateData, "metaData")
+
+ newCovariateData <- Andromeda::andromeda(
+ covariateRef = covariateData$covariateRef,
+ analysisRef = covariateData$analysisRef
+ )
+
Andromeda::batchApply(covariateData$covariates, function(tempData) {
-
- filtered <- dplyr::inner_join(tempData, population, by = 'rowId')
-
+ filtered <- dplyr::inner_join(tempData, population, by = "rowId")
+
if ("covariates" %in% names(newCovariateData)) {
Andromeda::appendToTable(newCovariateData$covariates, data = filtered)
} else {
@@ -66,54 +67,60 @@ batchRestrict <- function(covariateData, population, sizeN = 10000000){
}
},
progressBar = TRUE,
- batchSize = sizeN)
-
- Andromeda::createIndex(tbl = newCovariateData$covariates,
- columnNames = 'covariateId',
- indexName = 'covariates_ncovariateIds')
- Andromeda::createIndex(tbl = newCovariateData$covariates,
- columnNames = 'rowId',
- indexName = 'covariates_rowId')
- Andromeda::createIndex(tbl = newCovariateData$covariates,
- columnNames = c('covariateId', 'covariateValue'),
- indexName = 'covariates_covariateId_value')
-
+ batchSize = sizeN
+ )
+
+ Andromeda::createIndex(
+ tbl = newCovariateData$covariates,
+ columnNames = "covariateId",
+ indexName = "covariates_ncovariateIds"
+ )
+ Andromeda::createIndex(
+ tbl = newCovariateData$covariates,
+ columnNames = "rowId",
+ indexName = "covariates_rowId"
+ )
+ Andromeda::createIndex(
+ tbl = newCovariateData$covariates,
+ columnNames = c("covariateId", "covariateValue"),
+ indexName = "covariates_covariateId_value"
+ )
+
metaData$populationSize <- nrow(population)
- attr(newCovariateData, 'metaData') <- metaData
+ attr(newCovariateData, "metaData") <- metaData
class(newCovariateData) <- "CovariateData"
-
+
timeTaken <- as.numeric(Sys.time() - start, units = "mins")
- ParallelLogger::logInfo(paste0('Limiting covariate data took: ', timeTaken, ' mins'))
-
+ ParallelLogger::logInfo(paste0("Limiting covariate data took: ", timeTaken, " mins"))
+
return(newCovariateData)
}
# is this used?
-# return prev of ffdf
-calculatePrevs <- function(plpData, population){
- #===========================
+# return prev of ffdf
+calculatePrevs <- function(plpData, population) {
+ # ===========================
# outcome prevs
- #===========================
-
+ # ===========================
+
# add population to sqllite
population <- dplyr::as_tibble(population)
- plpData$covariateData$population <- population %>%
+ plpData$covariateData$population <- population %>%
dplyr::select("rowId", "outcomeCount")
-
+
outCount <- nrow(plpData$covariateData$population %>% dplyr::filter(.data$outcomeCount == 1))
nonOutCount <- nrow(plpData$covariateData$population %>% dplyr::filter(.data$outcomeCount == 0))
-
+
# join covariate with label
- prevs <- plpData$covariateData$covariates %>% dplyr::inner_join(plpData$covariateData$population) %>%
- dplyr::group_by(.data$covariateId) %>%
- dplyr::summarise(prev.out = 1.0*sum(.data$outcomeCount==1, na.rm = TRUE)/outCount,
- prev.noout = 1.0*sum(.data$outcomeCount==0, na.rm = TRUE)/nonOutCount) %>%
+ prevs <- plpData$covariateData$covariates %>%
+ dplyr::inner_join(plpData$covariateData$population) %>%
+ dplyr::group_by(.data$covariateId) %>%
+ dplyr::summarise(
+ prev.out = 1.0 * sum(.data$outcomeCount == 1, na.rm = TRUE) / outCount,
+ prev.noout = 1.0 * sum(.data$outcomeCount == 0, na.rm = TRUE) / nonOutCount
+ ) %>%
dplyr::select("covariateId", "prev.out", "prev.noout")
-
- #clear up data
- ##plpData$covariateData$population <- NULL
-
+
return(as.data.frame(prevs))
}
-
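batchRestrict streams the covariates table through Andromeda in fixed-size chunks so the population join never has to fit in memory at once. The core idea in isolation (table and column names are illustrative):

library(Andromeda)
andr <- andromeda(covariates = data.frame(rowId = 1:1000, covariateValue = runif(1000)))
keep <- data.frame(rowId = 1:100)
out <- andromeda()
batchApply(andr$covariates, function(chunk) {
  # join each chunk against the rows we want to keep, then append
  filtered <- dplyr::inner_join(chunk, keep, by = "rowId")
  if ("covariates" %in% names(out)) {
    appendToTable(out$covariates, filtered)
  } else {
    out$covariates <- filtered
  }
}, batchSize = 250)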
diff --git a/R/CalibrationSummary.R b/R/CalibrationSummary.R
index e8094350b..ab7f69512 100644
--- a/R/CalibrationSummary.R
+++ b/R/CalibrationSummary.R
@@ -6,8 +6,8 @@
#'
#' @param prediction A prediction object as generated using the
#' \code{\link{predict}} functions.
-#' @param predictionType The type of prediction (binary or survival)
-#' @param typeColumn A column that is used to stratify the results
+#' @param predictionType The type of prediction (binary or survival)
+#' @param typeColumn A column that is used to stratify the results
#' @param numberOfStrata The number of strata in the plot.
#' @param truncateFraction This fraction of probability values will be ignored when plotting, to
#' avoid the x-axis scale being dominated by a few outliers.
@@ -16,187 +16,203 @@
#' A dataframe with the calibration summary
#' @export
getCalibrationSummary <- function(
- prediction,
- predictionType,
- typeColumn = 'evaluation',
- numberOfStrata = 100,
- truncateFraction = 0.05
- ){
+ prediction,
+ predictionType,
+ typeColumn = "evaluation",
+ numberOfStrata = 100,
+ truncateFraction = 0.05) {
evaluation <- do.call(
- what = paste0('getCalibrationSummary_', predictionType),
+ what = paste0("getCalibrationSummary_", predictionType),
args = list(
- prediction = prediction,
+ prediction = prediction,
evalColumn = typeColumn,
numberOfStrata = numberOfStrata,
truncateFraction = truncateFraction,
- timepoint = attr(prediction, 'metaData')$timepoint
+ timepoint = attr(prediction, "metaData")$timepoint
)
)
-
+
return(evaluation)
}
getCalibrationSummary_binary <- function(
- prediction,
- evalColumn,
- numberOfStrata = 10,
- truncateFraction = 0.05,
- ...) {
-
+ prediction,
+ evalColumn,
+ numberOfStrata = 10,
+ truncateFraction = 0.05,
+ ...) {
result <- c()
- evalTypes <- unique(as.data.frame(prediction)[,evalColumn])
-
- for(evalType in evalTypes){
-
+ evalTypes <- unique(as.data.frame(prediction)[, evalColumn])
+
+ for (evalType in evalTypes) {
predictionOfInterest <- prediction %>% dplyr::filter(.data[[evalColumn]] == evalType)
-
-
- q <- unique(stats::quantile(predictionOfInterest$value, (1:(numberOfStrata - 1))/numberOfStrata))
+
+
+ q <- unique(stats::quantile(predictionOfInterest$value, (1:(numberOfStrata - 1)) / numberOfStrata))
predictionOfInterest$predictionThresholdId <- cut(predictionOfInterest$value,
breaks = unique(c(-0.00001, q, max(predictionOfInterest$value))),
- labels = FALSE)
-
+ labels = FALSE
+ )
+
predictionOfInterest <- merge(predictionOfInterest,
- data.frame(predictionThresholdId=1:(length(q)+1), predictionThreshold=c(0, q)),
- by='predictionThresholdId', all.x=T)
-
- computeStratumStats <- function(data) {
- return(data.frame(minx = min(data$value),
- maxx = max(data$value),
- fraction = sum(data$outcomeCount)/nrow(data)))
- }
-
+ data.frame(predictionThresholdId = 1:(length(q) + 1), predictionThreshold = c(0, q)),
+ by = "predictionThresholdId", all.x = TRUE
+ )
+
# count the number of persons with the age/gender strata
- PersonCountAtRisk <- stats::aggregate(rowId ~ predictionThreshold, data = predictionOfInterest, length)
+ PersonCountAtRisk <- stats::aggregate(rowId ~ predictionThreshold,
+ data = predictionOfInterest, length
+ )
names(PersonCountAtRisk)[2] <- "PersonCountAtRisk"
-
+
# count the number of persons with the age/gender strata in T also in O at time-at-risk
- PersonCountWithOutcome <- stats::aggregate(outcomeCount ~ predictionThreshold, data = predictionOfInterest, sum)
+ PersonCountWithOutcome <- stats::aggregate(outcomeCount ~ predictionThreshold,
+ data = predictionOfInterest, sum
+ )
names(PersonCountWithOutcome)[2] <- "PersonCountWithOutcome"
-
- strataData <- merge(PersonCountAtRisk,
- PersonCountWithOutcome)
-
+
+ strataData <- merge(
+ PersonCountAtRisk,
+ PersonCountWithOutcome
+ )
+
# Select all persons within the predictionThreshold, compute their average predicted probability
- averagePredictedProbability <- stats::aggregate(predictionOfInterest$value, list(predictionOfInterest$predictionThreshold),
- mean)
- colnames(averagePredictedProbability) <- c('predictionThreshold', 'averagePredictedProbability')
- strataData <- merge(strataData,averagePredictedProbability)
-
- StDevPredictedProbability <- stats::aggregate(predictionOfInterest$value, list(predictionOfInterest$predictionThreshold),
- stats::sd)
- colnames(StDevPredictedProbability) <- c('predictionThreshold', 'StDevPredictedProbability')
+ averagePredictedProbability <- stats::aggregate(
+ predictionOfInterest$value, list(predictionOfInterest$predictionThreshold),
+ mean
+ )
+ colnames(averagePredictedProbability) <- c("predictionThreshold", "averagePredictedProbability")
+ strataData <- merge(strataData, averagePredictedProbability)
+
+ StDevPredictedProbability <- stats::aggregate(
+ predictionOfInterest$value, list(predictionOfInterest$predictionThreshold),
+ stats::sd
+ )
+ colnames(StDevPredictedProbability) <- c("predictionThreshold", "StDevPredictedProbability")
strataData <- merge(strataData, StDevPredictedProbability)
-
+
# MinPredictedProbability, P25PredictedProbability, MedianPredictedProbability,
# P75PredictedProbability, MaxPredictedProbability
- quantiles <- stats::aggregate(predictionOfInterest$value, list(predictionOfInterest$predictionThreshold),
- function(x) stats::quantile(x, probs = c(0,0.25,0.5,0.75,1))
+ quantiles <- stats::aggregate(
+ predictionOfInterest$value, list(predictionOfInterest$predictionThreshold),
+ function(x) stats::quantile(x, probs = c(0, 0.25, 0.5, 0.75, 1))
)
quantiles <- as.matrix(quantiles)
- colnames(quantiles) <- c('predictionThreshold', 'MinPredictedProbability',
- 'P25PredictedProbability', 'MedianPredictedProbability',
- 'P75PredictedProbability', 'MaxPredictedProbability')
+ colnames(quantiles) <- c(
+ "predictionThreshold", "MinPredictedProbability",
+ "P25PredictedProbability", "MedianPredictedProbability",
+ "P75PredictedProbability", "MaxPredictedProbability"
+ )
strataData <- merge(strataData, quantiles)
-
+
# From among all persons in T within the age/gender strata, compute the proportion in O during the time-at-risk (PersonCountWithOutcome / PersonCountAtRisk)
# TODO: need to make sure PersonCountAtRisk is not zero - how to handle zeros?
- strataData$observedIncidence <- strataData$PersonCountWithOutcome/strataData$PersonCountAtRisk
-
+ strataData$observedIncidence <- strataData$PersonCountWithOutcome / strataData$PersonCountAtRisk
+
strataData$evaluation <- evalType
-
+
# what is this used for?
- attr(strataData,'lims') <- stats::quantile(predictionOfInterest$value, c(truncateFraction, 1 - truncateFraction))
-
+ attr(strataData, "lims") <- stats::quantile(
+ predictionOfInterest$value,
+ c(truncateFraction, 1 - truncateFraction)
+ )
+
result <- rbind(
- result,
+ result,
strataData
)
-
}
-
+
return(result)
}
getCalibrationSummary_survival <- function(
- prediction,
- evalColumn,
- numberOfStrata = 10,
- truncateFraction = 0.01,
- timepoint,
- ...) {
-
+ prediction,
+ evalColumn,
+ numberOfStrata = 10,
+ truncateFraction = 0.01,
+ timepoint,
+ ...) {
+ rlang::check_installed(
+ pkg = c("survival"),
+ reason = "getCalibrationSummary_survival requires the survival package to be installed"
+ )
result <- c()
- evalTypes <- unique(as.data.frame(prediction)[,evalColumn])
-
- for(evalType in evalTypes){
-
+ evalTypes <- unique(as.data.frame(prediction)[, evalColumn])
+
+ for (evalType in evalTypes) {
predictionOfInterest <- prediction %>% dplyr::filter(.data[[evalColumn]] == evalType)
-
- # add in calibration for survival
+
+ # add in calibration for survival
t <- predictionOfInterest$survivalTime
y <- ifelse(predictionOfInterest$outcomeCount > 0, 1, 0)
-
- S <- survival::Surv(t, y)
- if(length(unique(predictionOfInterest$value)) <= numberOfStrata){
- gval <- 10
- } else{
- gval <- numberOfStrata
- }
- groups<-cut2(predictionOfInterest$value,g=gval)
- n.groups<-length(levels(groups))
- pred<-tapply(predictionOfInterest$value,groups,mean)
- sizesN<-tapply(predictionOfInterest$value,groups,length)
- obs.q<-NULL
- obs.lower.q<-NULL
- obs.upper.q<-NULL
- avPred <- NULL
- for (q in 1:n.groups){
- KM <- tryCatch({survival::survfit(S ~ 1,sub=groups==levels(groups)[q])},
- error = function(e){ParallelLogger::logError(e); return(list(surv = 0,
- upper = 0,
- lower = 0 ))})
- obs.q<-c(obs.q,max(1-KM$surv)) # maximum OK because of prediction_horizon
- obs.lower.q<-c(obs.lower.q,obs.lower<-max(1-KM$upper))
- obs.upper.q<-c(obs.upper.q,obs.upper<-max(1-KM$lower))
- }
-
- midpoints <- function(x, dp=6){
- if(length(grep(',',x))>0){
- lower <- as.double(gsub('\\[','',gsub('\\(','',strsplit(x, ',')[[1]][1])))
- upper <- as.double(gsub('\\]','',gsub('\\)','',strsplit(x, ',')[[1]][2])))
- return(round(lower+(upper-lower)/2, dp))
- } else{
- return(as.double(x))
+
+ S <- survival::Surv(t, y)
+ if (length(unique(predictionOfInterest$value)) <= numberOfStrata) {
+ gval <- 10
+ } else {
+ gval <- numberOfStrata
}
- }
-
- calibrationSummary <- data.frame(
- evaluation = evalType,
- predictionThreshold = unlist(
- lapply(
- levels(groups),
- midpoints
+ groups <- cut2(predictionOfInterest$value, g = gval)
+ n.groups <- length(levels(groups))
+ pred <- tapply(predictionOfInterest$value, groups, mean)
+ sizesN <- tapply(predictionOfInterest$value, groups, length)
+ obs.q <- NULL
+ obs.lower.q <- NULL
+ obs.upper.q <- NULL
+ avPred <- NULL
+ for (q in 1:n.groups) {
+ KM <- tryCatch(
+ {
+ survival::survfit(S ~ 1, sub = groups == levels(groups)[q])
+ },
+ error = function(e) {
+ ParallelLogger::logError(e)
+ return(list(
+ surv = 0,
+ upper = 0,
+ lower = 0
+ ))
+ }
)
- ),
- averagePredictedProbability = pred,
- observedIncidence = obs.q,
- observedIncidenceLB = obs.lower.q,
- observedIncidenceUB = obs.upper.q,
- PersonCountAtRisk = sizesN
- )
-
- result <- rbind(
- result,
- calibrationSummary
- )
-
+ obs.q <- c(obs.q, max(1 - KM$surv)) # maximum OK because of prediction_horizon
+ obs.lower.q <- c(obs.lower.q, obs.lower <- max(1 - KM$upper))
+ obs.upper.q <- c(obs.upper.q, obs.upper <- max(1 - KM$lower))
+ }
+
+ midpoints <- function(x, dp = 6) {
+ if (length(grep(",", x)) > 0) {
+ lower <- as.double(gsub("\\[", "", gsub("\\(", "", strsplit(x, ",")[[1]][1])))
+ upper <- as.double(gsub("\\]", "", gsub("\\)", "", strsplit(x, ",")[[1]][2])))
+ return(round(lower + (upper - lower) / 2, dp))
+ } else {
+ return(as.double(x))
+ }
+ }
+
+ calibrationSummary <- data.frame(
+ evaluation = evalType,
+ predictionThreshold = unlist(
+ lapply(
+ levels(groups),
+ midpoints
+ )
+ ),
+ averagePredictedProbability = pred,
+ observedIncidence = obs.q,
+ observedIncidenceLB = obs.lower.q,
+ observedIncidenceUB = obs.upper.q,
+ PersonCountAtRisk = sizesN
+ )
+
+ result <- rbind(
+ result,
+ calibrationSummary
+ )
}
-
+
return(result)
-
}
-
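Typical downstream use of the summary, assuming prediction is a PLP prediction data.frame with rowId, value, outcomeCount, and an evaluation column (an illustrative call, not part of this diff):

calSummary <- getCalibrationSummary(
  prediction = prediction,
  predictionType = "binary",
  typeColumn = "evaluation",
  numberOfStrata = 10,
  truncateFraction = 0.05
)
calSummary[, c("evaluation", "predictionThreshold",
               "averagePredictedProbability", "observedIncidence")]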
diff --git a/R/CovariateSummary.R b/R/CovariateSummary.R
index 6972441a5..1ed60f4d6 100644
--- a/R/CovariateSummary.R
+++ b/R/CovariateSummary.R
@@ -25,382 +25,365 @@
#' by train and test set
#' @details
#' The function calculates various metrics to measure the performance of the model
-#' @param covariateData The covariateData part of the plpData that is
+#' @param covariateData The covariateData part of the plpData that is
#' extracted using \code{getPlpData}
-#' @param cohort The patient cohort to calculate the summary
+#' @param cohort The patient cohort to calculate the summary
#' @param labels A data.frame with the columns rowId and outcomeCount
#' @param strata A data.frame containing the columns rowId, strataName
-#' @param variableImportance A data.frame with the columns covariateId and
-#' value (the variable importance value)
+#' @param variableImportance A data.frame with the columns covariateId and
+#' value (the variable importance value)
#' @param featureEngineering (currently not used )
#' A function or list of functions specifying any feature engineering
-#' to create covariates before summarising
-#'
+#' to create covariates before summarising
+#'
#' @return
#' A data.frame containing: CovariateCount CovariateMean and CovariateStDev plus these values
#' for any specified stratification
#' @export
covariateSummary <- function(
- covariateData,
- cohort,
- labels = NULL,
- strata = NULL,
- variableImportance = NULL,
- featureEngineering = NULL
-){
-
- ParallelLogger::logInfo(paste0('Calculating covariate summary @ ', Sys.time()))
- ParallelLogger::logInfo('This can take a while...')
-
- if(missing(covariateData)){
- stop('Must enter the covariateData')
+ covariateData,
+ cohort,
+ labels = NULL,
+ strata = NULL,
+ variableImportance = NULL,
+ featureEngineering = NULL) {
+ start <- Sys.time()
+ ParallelLogger::logInfo(paste0("Calculating covariate summary @ ", start))
+ ParallelLogger::logInfo("This can take a while...")
+
+ if (missing(covariateData)) {
+ stop("Must enter the covariateData")
}
-
- if(missing(cohort)){
- stop('Must enter the cohort of patients')
+
+ if (missing(cohort)) {
+ stop("Must enter the cohort of patients")
}
-
+
subsetList <- createCovariateSubsets(
cohort = cohort,
- labels = labels,
+ labels = labels,
strata = strata
)
-
+
# apply feature engineering
- if(!is.null(featureEngineering)){
-
+ if (!is.null(featureEngineering)) {
# create copy of covariateData
newCovariateData <- Andromeda::andromeda(
covariateRef = covariateData$covariateRef,
analysisRef = covariateData$analysisRef,
covariates = covariateData$covariates
- )
+ )
covariateData <- newCovariateData
-
- if(!is.null(featureEngineering$funct)){
+
+ if (!is.null(featureEngineering$funct)) {
featureEngineering <- list(featureEngineering)
}
-
- for(fe in featureEngineering){
+
+ for (fe in featureEngineering) {
feSettings <- fe$settings
- feSettings$trainData = list(covariateData = covariateData)
+ feSettings$trainData <- list(covariateData = covariateData)
covariateData <- do.call(fe$funct, feSettings)$covariateData
}
}
-
+
# make this run in parallel for big speed improvements..
- covariateSummariesPerStrata <- lapply(subsetList,
- function(x){
+ covariateSummariesPerStrata <- lapply(
+ subsetList,
+ function(x) {
do.call(
- covariateSummarySubset,
+ covariateSummarySubset,
list(
covariateData = covariateData,
subset = x$subset$rowId,
subsetName = x$subsetName,
- restrictCovariateDataToSubsetIds = T
+ restrictCovariateDataToSubsetIds = TRUE
)
- )}
+ )
+ }
)
-
+
covariateSummary <- aggregateCovariateSummaries(
covariateSummariesPerStrata = do.call(rbind, covariateSummariesPerStrata),
- labels = labels,
+ labels = labels,
strata = strata
)
-
+
# add variable importance if input
- if(!is.null(variableImportance)){
- covariateSummary <- covariateSummary %>%
- dplyr::left_join(variableImportance, by = 'covariateId')
+ if (!is.null(variableImportance)) {
+ covariateSummary <- covariateSummary %>%
+ dplyr::left_join(variableImportance, by = "covariateId")
}
-
- # add covariate names
- covariateSummary <- covariateData$covariateRef %>%
- dplyr::collect() %>% dplyr::left_join(covariateSummary, by ='covariateId')
-
- ParallelLogger::logInfo(paste0('Finished covariate summary @ ', Sys.time()))
-
+
+ # add covariate names
+ covariateSummary <- covariateData$covariateRef %>%
+ dplyr::collect() %>%
+ dplyr::left_join(covariateSummary, by = "covariateId")
+
+ ParallelLogger::logInfo(paste0("Finished covariate summary @ ", Sys.time()))
+ delta <- Sys.time() - start
+ ParallelLogger::logInfo("Time to calculate covariate summary: ",
+ signif(delta, 3), " ", attr(delta, "units"))
return(covariateSummary)
}
# aggregate the covariateSummaries:
aggregateCovariateSummaries <- function(
- covariateSummariesPerStrata,
- labels,
- strata
-){
-
+ covariateSummariesPerStrata,
+ labels,
+ strata) {
# if no labels or strata
- if(is.null(labels) & is.null(strata)){
- ParallelLogger::logInfo('Aggregating with no labels or strata')
- result <- covariateSummariesPerStrata %>%
+ if (is.null(labels) && is.null(strata)) {
+ ParallelLogger::logInfo("Aggregating with no labels or strata")
+ result <- covariateSummariesPerStrata %>%
dplyr::select(
- "covariateId",
- "CovariateCount",
- "CovariateMean",
- "CovariateStDev",
+ "covariateId",
+ "CovariateCount",
+ "CovariateMean",
+ "CovariateStDev",
)
}
-
+
# if labels but no strata or strata and no labels
- if( ( !is.null(labels) & is.null(strata) ) | ( is.null(labels) & !is.null(strata) ) ){
- ParallelLogger::logInfo('Aggregating with only labels or strata')
- resultLabels <- covariateSummariesPerStrata %>%
+ if ((!is.null(labels) && is.null(strata)) || (is.null(labels) && !is.null(strata))) {
+ ParallelLogger::logInfo("Aggregating with only labels or strata")
+ resultLabels <- covariateSummariesPerStrata %>%
dplyr::select(
"group",
- "covariateId",
- "CovariateCount",
- "CovariateMean",
- "CovariateStDev",
+ "covariateId",
+ "CovariateCount",
+ "CovariateMean",
+ "CovariateStDev",
)
resultLabels <- tidyr::pivot_longer(
- data = resultLabels,
- cols = colnames(resultLabels)[!colnames(resultLabels) %in% c('covariateId','group')],
- names_to = 'variable',
- values_to = 'value'
- )
-
- resultLabels <- resultLabels %>%
- dplyr::mutate(group_variable = paste(.data$group, .data$variable, sep ='_')) %>%
+ data = resultLabels,
+ cols = colnames(resultLabels)[!colnames(resultLabels) %in% c("covariateId", "group")],
+ names_to = "variable",
+ values_to = "value"
+ )
+
+ resultLabels <- resultLabels %>%
+ dplyr::mutate(group_variable = paste(.data$group, .data$variable, sep = "_")) %>%
dplyr::select(-"group", -"variable")
-
+
resultLabels <- tidyr::pivot_wider(
- data = resultLabels,
- names_from = 'group_variable',
- values_from = 'value',
+ data = resultLabels,
+ names_from = "group_variable",
+ values_from = "value",
values_fill = 0
- )
-
- #resultLabels <- reshape2::melt(resultLabels, id.vars = c('covariateId','group'))
- #resultLabels <- reshape2::dcast(resultLabels, covariateId~group+variable, fill = 0)
-
- resultLabels <- resultLabels %>%
- dplyr::mutate(StandardizedMeanDiff = (.data$WithOutcome_CovariateMean - .data$WithNoOutcome_CovariateMean)/sqrt((.data$WithOutcome_CovariateStDev^2 + .data$WithNoOutcome_CovariateStDev^2)/2) )
-
-
- resultAll <- covariateSummariesPerStrata %>%
- dplyr::group_by(.data$covariateId) %>%
+ )
+
+ resultLabels <- resultLabels %>%
+ dplyr::mutate(StandardizedMeanDiff = (.data$WithOutcome_CovariateMean - .data$WithNoOutcome_CovariateMean) / sqrt((.data$WithOutcome_CovariateStDev^2 + .data$WithNoOutcome_CovariateStDev^2) / 2))
+
+
+ resultAll <- covariateSummariesPerStrata %>%
+ dplyr::group_by(.data$covariateId) %>%
dplyr::summarise(
- CovariateCount = sum(.data$CovariateCount),
- CovariateMean = sum(.data$sumVal)/sum(.data$N),
- CovariateStDev = sqrt(sum(.data$sumSquares)/sum(.data$N) - (sum(.data$sumVal)/sum(.data$N))^2 )
+ CovariateCount = sum(.data$CovariateCount),
+ CovariateMean = sum(.data$sumVal) / sum(.data$N),
+ CovariateStDev = sqrt(sum(.data$sumSquares) / sum(.data$N) - (sum(.data$sumVal) / sum(.data$N))^2)
)
-
- result <- resultAll %>% dplyr::left_join(resultLabels, by = 'covariateId')
+ result <- resultAll %>% dplyr::left_join(resultLabels, by = "covariateId")
}
-
+
# if strata and labels
- if( !is.null(labels) & !is.null(strata) ) {
- ParallelLogger::logInfo('Aggregating with labels and strata')
+ if (!is.null(labels) && !is.null(strata)) {
+ ParallelLogger::logInfo("Aggregating with labels and strata")
# labels and strata
- resultLabelStratas <- covariateSummariesPerStrata %>%
+ resultLabelStratas <- covariateSummariesPerStrata %>%
dplyr::select(
"group",
- "covariateId",
- "CovariateCount",
- "CovariateMean",
- "CovariateStDev",
+ "covariateId",
+ "CovariateCount",
+ "CovariateMean",
+ "CovariateStDev",
)
-
+
resultLabelStratas <- tidyr::pivot_longer(
- data = resultLabelStratas,
- cols = colnames(resultLabelStratas)[!colnames(resultLabelStratas) %in% c('covariateId','group')],
- names_to = 'variable',
- values_to = 'value'
+ data = resultLabelStratas,
+ cols = colnames(resultLabelStratas)[!colnames(resultLabelStratas) %in% c("covariateId", "group")],
+ names_to = "variable",
+ values_to = "value"
)
-
- resultLabelStratas <- resultLabelStratas %>%
- dplyr::mutate(group_variable = paste(.data$group, .data$variable, sep ='_')) %>%
+
+ resultLabelStratas <- resultLabelStratas %>%
+ dplyr::mutate(group_variable = paste(.data$group, .data$variable, sep = "_")) %>%
dplyr::select(-"group", -"variable")
-
+
resultLabelStratas <- tidyr::pivot_wider(
- data = resultLabelStratas,
- names_from = 'group_variable',
- values_from = 'value',
+ data = resultLabelStratas,
+ names_from = "group_variable",
+ values_from = "value",
values_fill = 0
)
-
- #resultLabelStratas <- reshape2::melt(resultLabelStratas, id.vars = c('covariateId','group'))
- #resultLabelStratas <- reshape2::dcast(resultLabelStratas, covariateId~group+variable, fill = 0)
-
+
# labels only
- resultLabels <- covariateSummariesPerStrata %>%
+ resultLabels <- covariateSummariesPerStrata %>%
dplyr::mutate(
- groupLabel = sapply(.data$group, function(x){ ifelse(
- length( grep('WithNoOutcome', x))>0,
- 'WithNoOutcome',
- 'WithOutcome'
- )})
+ groupLabel = sapply(.data$group, function(x) {
+ ifelse(
+ length(grep("WithNoOutcome", x)) > 0,
+ "WithNoOutcome",
+ "WithOutcome"
+ )
+ })
) %>%
dplyr::group_by(.data$covariateId, .data$groupLabel) %>%
- dplyr::summarise(
- CovariateCount = sum(.data$CovariateCount),
- CovariateMean = sum(.data$sumVal)/sum(.data$N),
- CovariateStDev = sqrt(sum(.data$sumSquares)/sum(.data$N) - (sum(.data$sumVal)/sum(.data$N))^2 )
- ) %>%
+ dplyr::summarise(
+ CovariateCount = sum(.data$CovariateCount),
+ CovariateMean = sum(.data$sumVal) / sum(.data$N),
+ CovariateStDev = sqrt(sum(.data$sumSquares) / sum(.data$N) - (sum(.data$sumVal) / sum(.data$N))^2)
+ ) %>%
dplyr::select(
"groupLabel",
- "covariateId",
- "CovariateCount",
- "CovariateMean",
+ "covariateId",
+ "CovariateCount",
+ "CovariateMean",
"CovariateStDev"
)
-
+
resultLabels <- tidyr::pivot_longer(
- data = resultLabels,
- cols = colnames(resultLabels)[!colnames(resultLabels) %in% c('covariateId','groupLabel')],
- names_to = 'variable',
- values_to = 'value'
+ data = resultLabels,
+ cols = colnames(resultLabels)[!colnames(resultLabels) %in% c("covariateId", "groupLabel")],
+ names_to = "variable",
+ values_to = "value"
)
-
- resultLabels <- resultLabels %>%
- dplyr::mutate(group_variable = paste(.data$groupLabel, .data$variable, sep ='_')) %>%
+
+ resultLabels <- resultLabels %>%
+ dplyr::mutate(group_variable = paste(.data$groupLabel, .data$variable, sep = "_")) %>%
dplyr::select(-"groupLabel", -"variable")
-
+
resultLabels <- tidyr::pivot_wider(
- data = resultLabels,
- names_from = 'group_variable',
- values_from = 'value',
+ data = resultLabels,
+ names_from = "group_variable",
+ values_from = "value",
values_fill = 0
)
-
- #resultLabels <- reshape2::melt(resultLabels, id.vars = c('covariateId','groupLabel'))
- #resultLabels <- reshape2::dcast(resultLabels, covariateId~groupLabel+variable, fill = 0)
-
- resultLabels <- resultLabels %>%
- dplyr::mutate(StandardizedMeanDiff = (.data$WithOutcome_CovariateMean - .data$WithNoOutcome_CovariateMean)/sqrt((.data$WithOutcome_CovariateStDev^2 + .data$WithNoOutcome_CovariateStDev^2)/2) )
-
-
+
+ resultLabels <- resultLabels %>%
+ dplyr::mutate(StandardizedMeanDiff = (.data$WithOutcome_CovariateMean - .data$WithNoOutcome_CovariateMean) / sqrt((.data$WithOutcome_CovariateStDev^2 + .data$WithNoOutcome_CovariateStDev^2) / 2))
+
+
# all results
- resultAll <- covariateSummariesPerStrata %>%
- dplyr::group_by(.data$covariateId) %>%
- dplyr::summarise(
- CovariateCount = sum(.data$CovariateCount),
- CovariateMean = sum(.data$sumVal)/sum(.data$N),
- CovariateStDev = sqrt(sum(.data$sumSquares)/sum(.data$N) - (sum(.data$sumVal)/sum(.data$N))^2 )
- )
-
- result <- resultAll %>%
- dplyr::left_join(resultLabels, by = 'covariateId') %>%
- dplyr::left_join(resultLabelStratas, by = 'covariateId')
-
+ resultAll <- covariateSummariesPerStrata %>%
+ dplyr::group_by(.data$covariateId) %>%
+ dplyr::summarise(
+ CovariateCount = sum(.data$CovariateCount),
+ CovariateMean = sum(.data$sumVal) / sum(.data$N),
+ CovariateStDev = sqrt(sum(.data$sumSquares) / sum(.data$N) - (sum(.data$sumVal) / sum(.data$N))^2)
+ )
+
+ result <- resultAll %>%
+ dplyr::left_join(resultLabels, by = "covariateId") %>%
+ dplyr::left_join(resultLabelStratas, by = "covariateId")
}
- return(result)
+ return(result)
}
createCovariateSubsets <- function(
- cohort,
- labels = NULL,
- strata = NULL
-){
-
- if(!is.null(labels)){
- ParallelLogger::logInfo('Creating binary labels')
- cohort <- cohort %>% dplyr::inner_join(labels, by = 'rowId') %>%
- dplyr::mutate(label = ifelse(.data$outcomeCount==0, 'WithNoOutcome','WithOutcome'))
- } else{
- cohort$label <- ''
+ cohort,
+ labels = NULL,
+ strata = NULL) {
+ if (!is.null(labels)) {
+ ParallelLogger::logInfo("Creating binary labels")
+ cohort <- cohort %>%
+ dplyr::inner_join(labels, by = "rowId") %>%
+ dplyr::mutate(label = ifelse(.data$outcomeCount == 0, "WithNoOutcome", "WithOutcome"))
+ } else {
+ cohort$label <- ""
}
-
- if(!is.null(strata)){
- ParallelLogger::logInfo('Joining with strata')
- cohort <- cohort %>% dplyr::inner_join(strata, by = 'rowId')
- } else{
- cohort$strataName <- ''
+
+ if (!is.null(strata)) {
+ ParallelLogger::logInfo("Joining with strata")
+ cohort <- cohort %>% dplyr::inner_join(strata, by = "rowId")
+ } else {
+ cohort$strataName <- ""
}
-
- cohort <- cohort %>%
- dplyr::mutate(finalStrata = paste0(.data$strataName,.data$label))
-
+
+ cohort <- cohort %>%
+ dplyr::mutate(finalStrata = paste0(.data$strataName, .data$label))
+
finalStratas <- unique(cohort$finalStrata)
-
+
result <- list()
length(result) <- length(finalStratas)
-
- for(i in 1:length(finalStratas)){
- ParallelLogger::logInfo(paste0('calculating subset of strata ',i))
- subset <- cohort %>%
+
+ for (i in 1:length(finalStratas)) {
+ ParallelLogger::logInfo(paste0("calculating subset of strata ", i))
+ subset <- cohort %>%
dplyr::filter(.data$finalStrata == finalStratas[[i]]) %>%
dplyr::select("rowId")
-
+
result[[i]] <- list(
- subset = subset,
+ subset = subset,
subsetName = finalStratas[[i]]
)
-
}
-
+
return(result)
}
covariateSummarySubset <- function(
- covariateData,
- subset,
- subsetName = '',
- restrictCovariateDataToSubsetIds = T){
-
+ covariateData,
+ subset,
+ subsetName = "",
+ restrictCovariateDataToSubsetIds = TRUE) {
N <- length(subset)
-
- if(restrictCovariateDataToSubsetIds){
- ParallelLogger::logInfo('Restricting to subgroup')
+
+ if (restrictCovariateDataToSubsetIds) {
+ ParallelLogger::logInfo("Restricting to subgroup")
covariates <- getCovariatesForGroup(
- covariateData,
+ covariateData,
restrictIds = subset
- )
- } else{
+ )
+ } else {
covariates <- covariateData$covariates
}
-
- ParallelLogger::logInfo(paste0('Calculating summary for subgroup ', subsetName))
-
+
+ ParallelLogger::logInfo(paste0("Calculating summary for subgroup ", subsetName))
+
result <- covariates %>%
dplyr::group_by(.data$covariateId) %>%
dplyr::summarise(
CovariateCount = dplyr::n(),
- sumVal = sum(.data$covariateValue,na.rm = TRUE),
- sumSquares = sum(.data$covariateValue^2,na.rm = TRUE)
+ sumVal = sum(.data$covariateValue, na.rm = TRUE),
+ sumSquares = sum(.data$covariateValue^2, na.rm = TRUE)
) %>%
dplyr::mutate(
- CovariateMean = 1.0*.data$sumVal/N,
- CovariateStDev = sqrt(.data$sumSquares/N - (.data$sumVal/N)^2 ),
+ CovariateMean = 1.0 * .data$sumVal / N,
+ CovariateStDev = sqrt(.data$sumSquares / N - (.data$sumVal / N)^2),
N = N,
group = subsetName
- ) %>%
+ ) %>%
dplyr::collect()
-
+
return(result)
}
-getCovariatesForGroup <- function(covariateData, restrictIds){
+getCovariatesForGroup <- function(covariateData, restrictIds) {
# restrict covaraiteData to specified rowIds
- if(length(restrictIds)<200000){
-
+ if (length(restrictIds) < 200000) {
covariateData$restrictIds <- data.frame(rowId = restrictIds)
- #on.exit(covariateData$restrictIds <- NULL, add = T)
-
- newCovariates <- covariateData$covariates %>%
- dplyr::inner_join(covariateData$restrictIds, by= 'rowId')
-
- } else{
+ newCovariates <- covariateData$covariates %>%
+ dplyr::inner_join(covariateData$restrictIds, by = "rowId")
+ } else {
newCovariateData <- batchRestrict(
- covariateData,
- data.frame(rowId = restrictIds),
+ covariateData,
+ data.frame(rowId = restrictIds),
sizeN = 10000000
- )
-
+ )
+
newCovariates <- newCovariateData$covariates
-
}
-
+
# add index to rowId and covariateId?
-
+
return(newCovariates)
}
-
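covariateSummary aggregates strata from running sums (sumVal, sumSquares, N), so the reported standard deviation is the population form sqrt(E[x^2] - E[x]^2) rather than the sample SD. A quick equivalence check:

x <- c(0, 1, 1, 0, 1)
N <- length(x)
sumVal <- sum(x)
sumSquares <- sum(x^2)
sumVal / N                             # 0.6, identical to mean(x)
sqrt(sumSquares / N - (sumVal / N)^2)  # ~0.49; sd(x) differs (~0.55) as it uses N - 1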
diff --git a/R/CyclopsModels.R b/R/CyclopsModels.R
index 16f035164..bb0b395f7 100644
--- a/R/CyclopsModels.R
+++ b/R/CyclopsModels.R
@@ -18,42 +18,40 @@
fitCyclopsModel <- function(
- trainData,
- modelSettings, # old:param,
- search='adaptive',
- analysisId,
- ...){
-
+ trainData,
+ modelSettings, # old:param,
+ search = "adaptive",
+ analysisId,
+ ...) {
param <- modelSettings$param
-
+
# check plpData is coo format:
- if (!FeatureExtraction::isCovariateData(trainData$covariateData)){
+ if (!FeatureExtraction::isCovariateData(trainData$covariateData)) {
stop("Needs correct covariateData")
}
-
- settings <- attr(param, 'settings')
-
- trainData$covariateData$labels <- trainData$labels %>%
+
+ settings <- attr(param, "settings")
+
+ trainData$covariateData$labels <- trainData$labels %>%
dplyr::mutate(
- y = sapply(.data$outcomeCount, function(x) min(1,x)),
+ y = sapply(.data$outcomeCount, function(x) min(1, x)),
time = .data$survivalTime
)
-
+
covariates <- filterCovariateIds(param, trainData$covariateData)
-
+
if (!is.null(param$priorCoefs)) {
sourceCoefs <- param$priorCoefs %>%
- dplyr::filter(abs(.data$betas)>0 & .data$covariateIds != "(Intercept)")
-
+ dplyr::filter(abs(.data$betas) > 0 & .data$covariateIds != "(Intercept)")
+
newCovariates <- covariates %>%
dplyr::filter(.data$covariateId %in% !!sourceCoefs$covariateIds) %>%
- dplyr::mutate(newCovariateId = .data$covariateId*-1) %>%
+ dplyr::mutate(newCovariateId = .data$covariateId * -1) %>%
dplyr::select(-"covariateId") %>%
dplyr::rename(covariateId = .data$newCovariateId) %>%
dplyr::collect()
-
+
Andromeda::appendToTable(covariates, newCovariates)
-
}
start <- Sys.time()
@@ -66,155 +64,161 @@ fitCyclopsModel <- function(
checkRowIds = FALSE,
normalize = NULL,
quiet = TRUE
- )
-
+ )
+
if (!is.null(param$priorCoefs)) {
- fixedCoefficients <- c(FALSE,
- rep(TRUE, nrow(sourceCoefs)),
- rep(FALSE, length(cyclopsData$coefficientNames)-(nrow(sourceCoefs)+1)))
-
+ fixedCoefficients <- c(
+ FALSE,
+ rep(TRUE, nrow(sourceCoefs)),
+ rep(FALSE, length(cyclopsData$coefficientNames) - (nrow(sourceCoefs) + 1))
+ )
+
startingCoefficients <- rep(0, length(fixedCoefficients))
-
+
# skip intercept index
- startingCoefficients[2:(nrow(sourceCoefs)+1)] <- sourceCoefs$betas
+ startingCoefficients[2:(nrow(sourceCoefs) + 1)] <- sourceCoefs$betas
} else {
startingCoefficients <- NULL
- fixedCoefficients <- NULL
+ fixedCoefficients <- NULL
}
- if(settings$crossValidationInPrior){
- param$priorParams$useCrossValidation <- max(trainData$folds$index)>1
+ if (settings$crossValidationInPrior) {
+ param$priorParams$useCrossValidation <- max(trainData$folds$index) > 1
}
prior <- do.call(eval(parse(text = settings$priorfunction)), param$priorParams)
- if(settings$useControl){
-
+ if (settings$useControl) {
control <- Cyclops::createControl(
cvType = "auto",
fold = max(trainData$folds$index),
startingVariance = param$priorParams$variance,
lowerLimit = param$lowerLimit,
upperLimit = param$upperLimit,
- tolerance = settings$tolerance,
+ tolerance = settings$tolerance,
cvRepetitions = 1, # make an option?
selectorType = settings$selectorType,
noiseLevel = "silent",
threads = settings$threads,
maxIterations = settings$maxIterations,
seed = settings$seed
- )
-
- fit <- tryCatch({
- ParallelLogger::logInfo('Running Cyclops')
- Cyclops::fitCyclopsModel(
- cyclopsData = cyclopsData,
- prior = prior,
- control = control,
- fixedCoefficients = fixedCoefficients,
- startingCoefficients = startingCoefficients
- )},
- finally = ParallelLogger::logInfo('Done.')
- )
- } else{
- fit <- tryCatch({
- ParallelLogger::logInfo('Running Cyclops with fixed varience')
- Cyclops::fitCyclopsModel(cyclopsData, prior = prior)},
- finally = ParallelLogger::logInfo('Done.'))
+ )
+
+ fit <- tryCatch(
+ {
+ ParallelLogger::logInfo("Running Cyclops")
+ Cyclops::fitCyclopsModel(
+ cyclopsData = cyclopsData,
+ prior = prior,
+ control = control,
+ fixedCoefficients = fixedCoefficients,
+ startingCoefficients = startingCoefficients
+ )
+ },
+ finally = ParallelLogger::logInfo("Done.")
+ )
+ } else {
+ fit <- tryCatch(
+ {
+        ParallelLogger::logInfo("Running Cyclops with fixed variance")
+ Cyclops::fitCyclopsModel(cyclopsData, prior = prior)
+ },
+ finally = ParallelLogger::logInfo("Done.")
+ )
}
-
-
+
+
modelTrained <- createCyclopsModel(
- fit = fit,
- modelType = settings$modelType,
- useCrossValidation = max(trainData$folds$index)>1,
- cyclopsData = cyclopsData,
+ fit = fit,
+ modelType = settings$modelType,
+ useCrossValidation = max(trainData$folds$index) > 1,
+ cyclopsData = cyclopsData,
labels = trainData$covariateData$labels,
folds = trainData$folds,
priorType = param$priorParams$priorType
- )
-
+ )
+
if (!is.null(param$priorCoefs)) {
modelTrained$coefficients <- reparamTransferCoefs(modelTrained$coefficients)
}
-
+
# TODO get optimal lambda value
- ParallelLogger::logTrace('Returned from fitting to LassoLogisticRegression')
+ ParallelLogger::logTrace("Returned from fitting to LassoLogisticRegression")
comp <- Sys.time() - start
- ParallelLogger::logTrace('Getting variable importance')
+ ParallelLogger::logTrace("Getting variable importance")
variableImportance <- getVariableImportance(modelTrained, trainData)
-
- #get prediction on test set:
- ParallelLogger::logTrace('Getting predictions on train set')
+
+ # get prediction on test set:
+ ParallelLogger::logTrace("Getting predictions on train set")
tempModel <- list(model = modelTrained)
- attr(tempModel, "modelType") <- attr(param, 'modelType')
+ attr(tempModel, "modelType") <- attr(param, "modelType")
prediction <- predictCyclops(
plpModel = tempModel,
- cohort = trainData$labels,
+ cohort = trainData$labels,
data = trainData
- )
- prediction$evaluationType <- 'Train'
-
+ )
+ prediction$evaluationType <- "Train"
+
# get cv AUC if exists
cvPerFold <- data.frame()
- if(!is.null(modelTrained$cv)){
- cvPrediction <- do.call(rbind, lapply(modelTrained$cv, function(x){x$predCV}))
- cvPrediction$evaluationType <- 'CV'
- # fit date issue convertion caused by andromeda
- cvPrediction$cohortStartDate <- as.Date(cvPrediction$cohortStartDate, origin = '1970-01-01')
-
- prediction <- rbind(prediction, cvPrediction[,colnames(prediction)])
-
- cvPerFold <- unlist(lapply(modelTrained$cv, function(x){x$out_sample_auc}))
- if(length(cvPerFold)>0){
+ if (!is.null(modelTrained$cv)) {
+ cvPrediction <- do.call(rbind, lapply(modelTrained$cv, function(x) {
+ x$predCV
+ }))
+ cvPrediction$evaluationType <- "CV"
+    # fix date conversion issue caused by Andromeda
+ cvPrediction$cohortStartDate <- as.Date(cvPrediction$cohortStartDate, origin = "1970-01-01")
+
+ prediction <- rbind(prediction, cvPrediction[, colnames(prediction)])
+
+ cvPerFold <- unlist(lapply(modelTrained$cv, function(x) {
+ x$out_sample_auc
+ }))
+ if (length(cvPerFold) > 0) {
cvPerFold <- data.frame(
- metric = 'AUC',
+ metric = "AUC",
fold = 1:length(cvPerFold),
value = cvPerFold,
- startingVariance = ifelse(is.null(param$priorParams$variance), 'NULL', param$priorParams$variance),
- lowerLimit = ifelse(is.null(param$lowerLimit), 'NULL', param$lowerLimit),
- upperLimit = ifelse(is.null(param$upperLimit), 'NULL', param$upperLimit),
- tolerance = ifelse(is.null(settings$tolerance), 'NULL', settings$tolerance)
+ startingVariance = ifelse(is.null(param$priorParams$variance), "NULL", param$priorParams$variance),
+ lowerLimit = ifelse(is.null(param$lowerLimit), "NULL", param$lowerLimit),
+ upperLimit = ifelse(is.null(param$upperLimit), "NULL", param$upperLimit),
+ tolerance = ifelse(is.null(settings$tolerance), "NULL", settings$tolerance)
)
- } else{
+ } else {
cvPerFold <- data.frame()
}
-
+
# remove the cv from the model:
modelTrained$cv <- NULL
}
-
+
result <- list(
model = modelTrained,
-
preprocessing = list(
- featureEngineering = attr(trainData, "metaData")$featureEngineering,#learned mapping
- tidyCovariates = attr(trainData$covariateData, "metaData")$tidyCovariateDataSettings, #learned mapping
- requireDenseMatrix = F
+ featureEngineering = attr(trainData$covariateData, "metaData")$featureEngineering,
+ tidyCovariates = attr(trainData$covariateData, "metaData")$tidyCovariateDataSettings,
+ requireDenseMatrix = FALSE
),
-
prediction = prediction,
-
modelDesign = PatientLevelPrediction::createModelDesign(
targetId = attr(trainData, "metaData")$targetId, # added
outcomeId = attr(trainData, "metaData")$outcomeId, # added
restrictPlpDataSettings = attr(trainData, "metaData")$restrictPlpDataSettings, # made this restrictPlpDataSettings
covariateSettings = attr(trainData, "metaData")$covariateSettings,
- populationSettings = attr(trainData, "metaData")$populationSettings,
+ populationSettings = attr(trainData, "metaData")$populationSettings,
featureEngineeringSettings = attr(trainData, "metaData")$featureEngineeringSettings,
preprocessSettings = attr(trainData$covariateData, "metaData")$preprocessSettings,
- modelSettings = modelSettings, #modified
+ modelSettings = modelSettings, # modified
splitSettings = attr(trainData, "metaData")$splitSettings,
sampleSettings = attr(trainData, "metaData")$sampleSettings
),
-
trainDetails = list(
- analysisId = analysisId,
- analysisSource = '', #TODO add from model
+ analysisId = analysisId,
+ analysisSource = "", # TODO add from model
developmentDatabase = attr(trainData, "metaData")$cdmDatabaseName,
- developmentDatabaseSchema = attr(trainData, "metaData")$cdmDatabaseSchema,
- attrition = attr(trainData, "metaData")$attrition,
- trainingTime = paste(as.character(abs(comp)), attr(comp,'units')),
+ developmentDatabaseSchema = attr(trainData, "metaData")$cdmDatabaseSchema,
+ attrition = attr(trainData, "metaData")$attrition,
+ trainingTime = paste(as.character(abs(comp)), attr(comp, "units")),
trainingDate = Sys.Date(),
modelName = settings$modelType,
finalModelParameters = list(
@@ -223,15 +227,14 @@ fitCyclopsModel <- function(
),
hyperParamSearch = cvPerFold
),
-
covariateImportance = variableImportance
)
-
-
- class(result) <- 'plpModel'
- attr(result, 'predictionFunction') <- 'predictCyclops'
- attr(result, 'modelType') <- attr(param, 'modelType')
- attr(result, 'saveType') <- attr(param, 'saveType')
+
+
+ class(result) <- "plpModel"
+ attr(result, "predictionFunction") <- "predictCyclops"
+ attr(result, "modelType") <- attr(param, "modelType")
+ attr(result, "saveType") <- attr(param, "saveType")
return(result)
}
@@ -248,119 +251,117 @@ fitCyclopsModel <- function(
#'
 #' @param plpModel An object of type \code{plpModel} as generated using
#' \code{\link{fitPlp}}.
-#' @param data The new plpData containing the covariateData for the new population
+#' @param data The new plpData containing the covariateData for the new population
#' @param cohort The cohort to calculate the prediction for
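+#' @examples
+#' \dontrun{
+#' # a hypothetical sketch: plpModel, plpData and population are placeholder
+#' # objects from earlier fitPlp and createStudyPopulation calls
+#' prediction <- predictCyclops(
+#'   plpModel = plpModel,
+#'   data = plpData,
+#'   cohort = population
+#' )
+#' }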
#' @export
-predictCyclops <- function(plpModel, data, cohort ) {
+predictCyclops <- function(plpModel, data, cohort) {
start <- Sys.time()
-
- ParallelLogger::logTrace('predictProbabilities - predictAndromeda start')
-
+
+ ParallelLogger::logTrace("predictProbabilities - predictAndromeda start")
+
prediction <- predictCyclopsType(
plpModel$model$coefficients,
cohort,
data$covariateData,
plpModel$model$modelType
)
-
+
# survival cyclops use baseline hazard to convert to risk from exp(LP) to 1-S^exp(LP)
- if(attr(plpModel, 'modelType') == 'survival'){
- if(!is.null(plpModel$model$baselineSurvival)){
- if(is.null(attr(cohort, 'timepoint'))){
- timepoint <- attr(cohort,'metaData')$populationSettings$riskWindowEnd
- } else{
- timepoint <- attr(cohort, 'timepoint')
+ if (attr(plpModel, "modelType") == "survival") {
+ if (!is.null(plpModel$model$baselineSurvival)) {
+ if (is.null(attr(cohort, "timepoint"))) {
+ timepoint <- attr(cohort, "metaData")$populationSettings$riskWindowEnd
+ } else {
+ timepoint <- attr(cohort, "timepoint")
}
- bhind <- which.min(abs(plpModel$model$baselineSurvival$time-timepoint))
+ bhind <- which.min(abs(plpModel$model$baselineSurvival$time - timepoint))
# 1- baseline survival(time)^ (exp(betas*values))
- prediction$value <- 1-plpModel$model$baselineSurvival$surv[bhind]^prediction$value
-
-
+ prediction$value <- 1 - plpModel$model$baselineSurvival$surv[bhind]^prediction$value
+
+
metaData <- list()
metaData$baselineSurvivalTimepoint <- plpModel$model$baselineSurvival$time[bhind]
metaData$baselineSurvival <- plpModel$model$baselineSurvival$surv[bhind]
metaData$offset <- 0
-
- attr(prediction, 'metaData') <- metaData
+
+ attr(prediction, "metaData") <- metaData
}
}
-
+
delta <- Sys.time() - start
ParallelLogger::logInfo("Prediction took ", signif(delta, 3), " ", attr(delta, "units"))
return(prediction)
}
predictCyclopsType <- function(coefficients, population, covariateData, modelType = "logistic") {
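+  # computes the linear predictor per rowId from the non-zero betas plus the
+  # intercept, then applies the link implied by modelType
+  # (logistic: inverse logit; poisson/survival/cox: exp)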
- if (!(modelType %in% c("logistic", "poisson", "survival","cox"))) {
+ if (!(modelType %in% c("logistic", "poisson", "survival", "cox"))) {
stop(paste("Unknown modelType:", modelType))
}
- if (!FeatureExtraction::isCovariateData(covariateData)){
+ if (!FeatureExtraction::isCovariateData(covariateData)) {
stop("Needs correct covariateData")
}
-
- intercept <- coefficients$betas[coefficients$covariateId%in%'(Intercept)']
- if(length(intercept)==0) intercept <- 0
- betas <- coefficients$betas[!coefficients$covariateIds%in%'(Intercept)']
- coefficients <- data.frame(beta = betas,
- covariateId = coefficients$covariateIds[coefficients$covariateIds!='(Intercept)']
+
+ intercept <- coefficients$betas[coefficients$covariateIds %in% "(Intercept)"]
+ if (length(intercept) == 0) intercept <- 0
+ betas <- coefficients$betas[!coefficients$covariateIds %in% "(Intercept)"]
+ coefficients <- data.frame(
+ beta = betas,
+ covariateId = coefficients$covariateIds[coefficients$covariateIds != "(Intercept)"]
)
coefficients <- coefficients[coefficients$beta != 0, ]
- if(sum(coefficients$beta != 0)>0){
+ if (sum(coefficients$beta != 0) > 0) {
covariateData$coefficients <- coefficients
on.exit(covariateData$coefficients <- NULL, add = TRUE)
-
- prediction <- covariateData$covariates %>%
- dplyr::inner_join(covariateData$coefficients, by= 'covariateId') %>%
- dplyr::mutate(values = .data$covariateValue*.data$beta) %>%
+
+ prediction <- covariateData$covariates %>%
+ dplyr::inner_join(covariateData$coefficients, by = "covariateId") %>%
+ dplyr::mutate(values = .data$covariateValue * .data$beta) %>%
dplyr::group_by(.data$rowId) %>%
dplyr::summarise(value = sum(.data$values, na.rm = TRUE)) %>%
dplyr::select("rowId", "value")
-
+
prediction <- as.data.frame(prediction)
- prediction <- merge(population, prediction, by ="rowId", all.x = TRUE, fill = 0)
+    prediction <- merge(population, prediction, by = "rowId", all.x = TRUE) # merge has no fill argument; NAs are zeroed below
prediction$value[is.na(prediction$value)] <- 0
prediction$value <- prediction$value + intercept
- } else{
- warning('Model had no non-zero coefficients so predicted same for all population...')
+ } else {
+ warning("Model had no non-zero coefficients so predicted same for all population...")
prediction <- population
prediction$value <- rep(0, nrow(population)) + intercept
}
if (modelType == "logistic") {
link <- function(x) {
- return(1/(1 + exp(0 - x)))
+ return(1 / (1 + exp(0 - x)))
}
prediction$value <- link(prediction$value)
- attr(prediction, "metaData")$modelType <- 'binary'
+ attr(prediction, "metaData")$modelType <- "binary"
} else if (modelType == "poisson" || modelType == "survival" || modelType == "cox") {
-
# add baseline hazard stuff
-
+
prediction$value <- exp(prediction$value)
- attr(prediction, "metaData")$modelType <- 'survival'
- if(modelType == "survival"){ # is this needed?
- attr(prediction, 'metaData')$timepoint <- max(population$survivalTime, na.rm = T)
+ attr(prediction, "metaData")$modelType <- "survival"
+ if (modelType == "survival") { # is this needed?
+ attr(prediction, "metaData")$timepoint <- max(population$survivalTime, na.rm = TRUE)
}
-
}
return(prediction)
}
createCyclopsModel <- function(fit, modelType, useCrossValidation, cyclopsData, labels, folds,
- priorType){
-
+ priorType) {
if (is.character(fit)) {
coefficients <- c(0)
- names(coefficients) <- ''
+ names(coefficients) <- ""
status <- fit
} else if (fit$return_flag == "ILLCONDITIONED") {
coefficients <- c(0)
- names(coefficients) <- ''
+ names(coefficients) <- ""
status <- "ILL CONDITIONED, CANNOT FIT"
ParallelLogger::logWarn(paste("GLM fitting issue: ", status))
} else if (fit$return_flag == "MAX_ITERATIONS") {
coefficients <- c(0)
- names(coefficients) <- ''
+ names(coefficients) <- ""
status <- "REACHED MAXIMUM NUMBER OF ITERATIONS, CANNOT FIT"
ParallelLogger::logWarn(paste("GLM fitting issue: ", status))
} else {
@@ -368,12 +369,12 @@ createCyclopsModel <- function(fit, modelType, useCrossValidation, cyclopsData,
     coefficients <- stats::coef(fit) # stats::coef dispatches to the Cyclops coef method
ParallelLogger::logInfo(paste("GLM fit status: ", status))
}
-
+
# use a dataframe for the coefficients
betas <- as.numeric(coefficients)
betaNames <- names(coefficients)
- coefficients <- data.frame(betas=betas, covariateIds=betaNames)
-
+ coefficients <- data.frame(betas = betas, covariateIds = betaNames)
+
outcomeModel <- list(
priorVariance = fit$variance,
log_likelihood = fit$log_likelihood,
@@ -382,130 +383,136 @@ createCyclopsModel <- function(fit, modelType, useCrossValidation, cyclopsData,
coefficients = coefficients
)
- if(modelType == "cox" || modelType == "survival") {
- baselineSurvival <- tryCatch({survival::survfit(fit, type = "aalen")},
- error = function(e) {ParallelLogger::logInfo(e); return(NULL)})
- if(is.null(baselineSurvival)){
- ParallelLogger::logInfo('No baseline hazard function returned')
+ if (modelType == "cox" || modelType == "survival") {
+ baselineSurvival <- tryCatch(
+ {
+ survival::survfit(fit, type = "aalen")
+ },
+ error = function(e) {
+ ParallelLogger::logInfo(e)
+ return(NULL)
+ }
+ )
+ if (is.null(baselineSurvival)) {
+ ParallelLogger::logInfo("No baseline hazard function returned")
}
outcomeModel$baselineSurvival <- baselineSurvival
}
class(outcomeModel) <- "plpModel"
-
- #get CV - added && status == "OK" to only run if the model fit sucsessfully
- if(modelType == "logistic" && useCrossValidation && status == "OK"){
- outcomeModel$cv <- getCV(cyclopsData, labels, cvVariance = fit$variance, folds = folds,
- priorType = priorType)
+
+  # get CV - added && status == "OK" to only run if the model fit successfully
+ if (modelType == "logistic" && useCrossValidation && status == "OK") {
+ outcomeModel$cv <- getCV(cyclopsData, labels,
+ cvVariance = fit$variance, folds = folds,
+ priorType = priorType
+ )
}
-
- return(outcomeModel)
+ return(outcomeModel)
}
-modelTypeToCyclopsModelType <- function(modelType, stratified=F) {
+modelTypeToCyclopsModelType <- function(modelType, stratified = FALSE) {
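+  # maps the plp modelType to the Cyclops model code:
+  # logistic -> "lr" ("clr" if stratified), poisson -> "pr" ("cpr" if
+  # stratified), cox -> "cox"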
if (modelType == "logistic") {
- if (stratified)
+ if (stratified) {
return("clr")
- else
+ } else {
return("lr")
+ }
} else if (modelType == "poisson") {
- if (stratified)
+ if (stratified) {
return("cpr")
- else
+ } else {
return("pr")
+ }
} else if (modelType == "cox") {
return("cox")
} else {
ParallelLogger::logError(paste("Unknown model type:", modelType))
stop()
}
-
}
getCV <- function(
- cyclopsData,
- labels,
- cvVariance,
- folds,
- priorType
-)
-{
- fixed_prior <- Cyclops::createPrior(priorType = priorType,
- variance = cvVariance,
- useCrossValidation = FALSE)
-
+ cyclopsData,
+ labels,
+ cvVariance,
+ folds,
+ priorType) {
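+  # for each fold: refit Cyclops at the fixed variance selected earlier
+  # (cvVariance), setting the held-out fold's weights to zero, then score the
+  # held-out rows to get an out-of-sample AUC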
+ fixed_prior <- Cyclops::createPrior(
+ priorType = priorType,
+ variance = cvVariance,
+ useCrossValidation = FALSE
+ )
+
# add the index to the labels
- labels <- merge(labels, folds, by = 'rowId')
-
+ labels <- merge(labels, folds, by = "rowId")
+
result <- lapply(1:max(labels$index), function(i) {
- hold_out <- labels$index==i
+ hold_out <- labels$index == i
weights <- rep(1.0, Cyclops::getNumberOfRows(cyclopsData))
weights[hold_out] <- 0.0
subset_fit <- suppressWarnings(Cyclops::fitCyclopsModel(cyclopsData,
prior = fixed_prior,
- weights = weights))
+ weights = weights
+ ))
predict <- stats::predict(subset_fit)
-
+
auc <- aucWithoutCi(predict[hold_out], labels$y[hold_out])
-
- predCV <- cbind(labels[hold_out,],
- value = predict[hold_out])
- #predCV$outcomeCount <- predCV$y
-
- return(list(out_sample_auc = auc,
+
+ predCV <- cbind(labels[hold_out, ],
+ value = predict[hold_out]
+ )
+ return(list(
+ out_sample_auc = auc,
predCV = predCV,
log_likelihood = subset_fit$log_likelihood,
log_prior = subset_fit$log_prior,
- coef = stats::coef(subset_fit)))
+ coef = stats::coef(subset_fit)
+ ))
})
-
- return(result)
-
+
+ return(result)
}
-getVariableImportance <- function(modelTrained, trainData){
+getVariableImportance <- function(modelTrained, trainData) {
varImp <- data.frame(
- covariateId = as.double(modelTrained$coefficients$covariateIds[modelTrained$coefficients$covariateIds!='(Intercept)']),
- value = modelTrained$coefficients$betas[modelTrained$coefficients$covariateIds!='(Intercept)']
+ covariateId = as.double(modelTrained$coefficients$covariateIds[modelTrained$coefficients$covariateIds != "(Intercept)"]),
+ value = modelTrained$coefficients$betas[modelTrained$coefficients$covariateIds != "(Intercept)"]
)
-if(sum(abs(varImp$value)>0)==0){
- ParallelLogger::logWarn('No non-zero coefficients')
- varImp <- NULL
-} else {
- ParallelLogger::logInfo('Creating variable importance data frame')
-
- #trainData$covariateData$varImp <- varImp
- #on.exit(trainData$covariateData$varImp <- NULL, add = T)
-
- varImp <- trainData$covariateData$covariateRef %>%
- dplyr::collect() %>%
- #dplyr::left_join(trainData$covariateData$varImp) %>%
- dplyr::left_join(varImp, by = 'covariateId') %>%
- dplyr::mutate(covariateValue = ifelse(is.na(.data$value), 0, .data$value)) %>%
- dplyr::select(-"value") %>%
- dplyr::arrange(-abs(.data$covariateValue)) %>%
- dplyr::collect()
-}
-
+ if (sum(abs(varImp$value) > 0) == 0) {
+ ParallelLogger::logWarn("No non-zero coefficients")
+ varImp <- NULL
+ } else {
+ ParallelLogger::logInfo("Creating variable importance data frame")
+
+ varImp <- trainData$covariateData$covariateRef %>%
+ dplyr::collect() %>%
+ dplyr::left_join(varImp, by = "covariateId") %>%
+ dplyr::mutate(covariateValue = ifelse(is.na(.data$value), 0, .data$value)) %>%
+ dplyr::select(-"value") %>%
+ dplyr::arrange(-abs(.data$covariateValue)) %>%
+ dplyr::collect()
+ }
+
return(varImp)
}
-filterCovariateIds <- function(param, covariateData){
- if ( (length(param$includeCovariateIds) != 0) & (length(param$excludeCovariateIds) != 0)) {
- covariates <- covariateData$covariates %>%
- dplyr::filter(.data$covariateId %in% param$includeCovariateIds) %>%
+filterCovariateIds <- function(param, covariateData) {
+ if ((length(param$includeCovariateIds) != 0) && (length(param$excludeCovariateIds) != 0)) {
+ covariates <- covariateData$covariates %>%
+ dplyr::filter(.data$covariateId %in% param$includeCovariateIds) %>%
dplyr::filter(!.data$covariateId %in% param$excludeCovariateIds) # does not work
- } else if ( (length(param$includeCovariateIds) == 0) & (length(param$excludeCovariateIds) != 0)) {
- covariates <- covariateData$covariates %>%
+ } else if ((length(param$includeCovariateIds) == 0) && (length(param$excludeCovariateIds) != 0)) {
+ covariates <- covariateData$covariates %>%
dplyr::filter(!.data$covariateId %in% param$excludeCovariateIds) # does not work
- } else if ( (length(param$includeCovariateIds) != 0) & (length(param$excludeCovariateIds) == 0)) {
+ } else if ((length(param$includeCovariateIds) != 0) && (length(param$excludeCovariateIds) == 0)) {
includeCovariateIds <- as.double(param$includeCovariateIds) # fixes odd dplyr issue with param
- covariates <- covariateData$covariates %>%
- dplyr::filter(.data$covariateId %in% includeCovariateIds)
+ covariates <- covariateData$covariates %>%
+ dplyr::filter(.data$covariateId %in% includeCovariateIds)
} else {
covariates <- covariateData$covariates
}
@@ -515,15 +522,15 @@ filterCovariateIds <- function(param, covariateData){
reparamTransferCoefs <- function(inCoefs) {
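+  # transfer-learning coefficients carry a "-" prefix on their covariateIds;
+  # strip the prefix and sum the betas per covariateId to merge them with the
+  # original coefficients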
transferCoefs <- inCoefs %>%
dplyr::filter(grepl("-", .data$covariateIds))
-
+
transferCoefs$covariateIds <- substring(transferCoefs$covariateIds, 2)
-
+
originalCoefs <- inCoefs %>%
dplyr::filter(!grepl("-", .data$covariateIds))
-
+
coefs <- rbind(originalCoefs, transferCoefs)
coefs <- rowsum(coefs$betas, coefs$covariateIds)
coefs <- data.frame(betas = coefs, covariateIds = rownames(coefs), row.names = NULL)
return(coefs)
-}
\ No newline at end of file
+}
diff --git a/R/CyclopsSettings.R b/R/CyclopsSettings.R
index fb688f03e..7000933ec 100644
--- a/R/CyclopsSettings.R
+++ b/R/CyclopsSettings.R
@@ -3,7 +3,7 @@
#' @param variance Numeric: prior distribution starting variance
#' @param seed An option to add a seed when training the model
 #' @param includeCovariateIds a set of covariate IDs to limit the analysis to
-#' @param noShrinkage a set of covariates whcih are to be forced to be included in the final model. default is the intercept
+#' @param noShrinkage a set of covariates which are forced to be included in the final model. The default is the intercept
#' @param threads An option to set number of threads when training model
#' @param forceIntercept Logical: Force intercept coefficient into prior
#' @param upperLimit Numeric: Upper prior variance limit for grid-search
@@ -11,74 +11,72 @@
#' @param tolerance Numeric: maximum relative change in convergence criterion from successive iterations to achieve convergence
#' @param maxIterations Integer: maximum iterations of Cyclops to attempt before returning a failed-to-converge error
#' @param priorCoefs Use coefficients from a previous model as starting points for model fit (transfer learning)
-#'
+#'
#' @examples
#' model.lr <- setLassoLogisticRegression()
#' @export
-setLassoLogisticRegression<- function(
- variance = 0.01,
- seed = NULL,
- includeCovariateIds = c(),
- noShrinkage = c(0),
- threads = -1,
- forceIntercept = F,
- upperLimit = 20,
- lowerLimit = 0.01,
- tolerance = 2e-06,
- maxIterations = 3000,
- priorCoefs = NULL
- ){
-
- checkIsClass(seed, c('numeric','NULL','integer'))
- if(is.null(seed[1])){
- seed <- as.integer(sample(100000000,1))
+setLassoLogisticRegression <- function(
+ variance = 0.01,
+ seed = NULL,
+ includeCovariateIds = c(),
+ noShrinkage = c(0),
+ threads = -1,
+ forceIntercept = FALSE,
+ upperLimit = 20,
+ lowerLimit = 0.01,
+ tolerance = 2e-06,
+ maxIterations = 3000,
+ priorCoefs = NULL) {
+ checkIsClass(seed, c("numeric", "NULL", "integer"))
+ if (is.null(seed[1])) {
+ seed <- as.integer(sample(100000000, 1))
}
- checkIsClass(threads, c('numeric','integer'))
- checkIsClass(variance, c('numeric','integer'))
+ checkIsClass(threads, c("numeric", "integer"))
+ checkIsClass(variance, c("numeric", "integer"))
checkHigherEqual(variance, 0)
-
- checkIsClass(lowerLimit, c('numeric','integer'))
- checkIsClass(upperLimit, c('numeric','integer'))
+
+ checkIsClass(lowerLimit, c("numeric", "integer"))
+ checkIsClass(upperLimit, c("numeric", "integer"))
checkHigherEqual(upperLimit, lowerLimit)
-
+
param <- list(
priorParams = list(
- priorType = "laplace",
+ priorType = "laplace",
forceIntercept = forceIntercept,
- variance = variance,
+ variance = variance,
exclude = noShrinkage
- ),
- includeCovariateIds = includeCovariateIds,
- upperLimit = upperLimit,
+ ),
+ includeCovariateIds = includeCovariateIds,
+ upperLimit = upperLimit,
lowerLimit = lowerLimit,
priorCoefs = priorCoefs
- )
-
- attr(param, 'settings') <- list(
- priorfunction = 'Cyclops::createPrior',
- selectorType = "byPid", # is this correct?
- crossValidationInPrior = T,
- modelType = 'logistic',
- addIntercept = T,
- useControl = T,
+ )
+
+ attr(param, "settings") <- list(
+ priorfunction = "Cyclops::createPrior",
+ selectorType = "byPid", # is this correct?
+ crossValidationInPrior = TRUE,
+ modelType = "logistic",
+ addIntercept = TRUE,
+ useControl = TRUE,
seed = seed[1],
name = "Lasso Logistic Regression",
- threads = threads[1],
- tolerance = tolerance[1], #2e-06
- cvRepetitions = 1, #1
- maxIterations = maxIterations[1] #3000
+ threads = threads[1],
+ tolerance = tolerance[1], # 2e-06
+ cvRepetitions = 1, # 1
+ maxIterations = maxIterations[1] # 3000
)
-
- attr(param, 'modelType') <- 'binary'
- attr(param, 'saveType') <- 'RtoJson'
-
+
+ attr(param, "modelType") <- "binary"
+ attr(param, "saveType") <- "RtoJson"
+
result <- list(
fitFunction = "fitCyclopsModel",
param = param
)
class(result) <- "modelSettings"
-
+
return(result)
}
@@ -89,7 +87,7 @@ setLassoLogisticRegression<- function(
#' @param variance Numeric: prior distribution starting variance
#' @param seed An option to add a seed when training the model
 #' @param includeCovariateIds a set of covariate IDs to limit the analysis to
-#' @param noShrinkage a set of covariates whcih are to be forced to be included in the final model. default is the intercept
+#' @param noShrinkage a set of covariates which are forced to be included in the final model. The default is the intercept
#' @param threads An option to set number of threads when training model
#' @param upperLimit Numeric: Upper prior variance limit for grid-search
#' @param lowerLimit Numeric: Lower prior variance limit for grid-search
@@ -100,79 +98,77 @@ setLassoLogisticRegression<- function(
#' model.lr <- setCoxModel()
#' @export
setCoxModel <- function(
- variance = 0.01,
- seed = NULL,
- includeCovariateIds = c(),
- noShrinkage = c(),
- threads = -1,
- upperLimit = 20,
- lowerLimit = 0.01,
- tolerance = 2e-07,
- maxIterations = 3000
-){
-
- checkIsClass(seed, c('numeric','NULL','integer'))
- if(is.null(seed[1])){
- seed <- as.integer(sample(100000000,1))
+ variance = 0.01,
+ seed = NULL,
+ includeCovariateIds = c(),
+ noShrinkage = c(),
+ threads = -1,
+ upperLimit = 20,
+ lowerLimit = 0.01,
+ tolerance = 2e-07,
+ maxIterations = 3000) {
+
+ checkSurvivalPackages()
+ checkIsClass(seed, c("numeric", "NULL", "integer"))
+ if (is.null(seed[1])) {
+ seed <- as.integer(sample(100000000, 1))
}
- checkIsClass(threads, c('numeric','integer'))
- checkIsClass(variance, c('numeric','integer'))
+ checkIsClass(threads, c("numeric", "integer"))
+ checkIsClass(variance, c("numeric", "integer"))
checkHigherEqual(variance, 0)
-
- checkIsClass(lowerLimit, c('numeric','integer'))
- checkIsClass(upperLimit, c('numeric','integer'))
-
+
+ checkIsClass(lowerLimit, c("numeric", "integer"))
+ checkIsClass(upperLimit, c("numeric", "integer"))
+
checkHigherEqual(upperLimit, lowerLimit)
-
- #selectorType = "byRow",
param <- list(
priorParams = list(
- priorType = "laplace",
- variance = variance,
+ priorType = "laplace",
+ variance = variance,
exclude = noShrinkage
),
- includeCovariateIds = includeCovariateIds,
- upperLimit = upperLimit,
+ includeCovariateIds = includeCovariateIds,
+ upperLimit = upperLimit,
lowerLimit = lowerLimit
)
-
- attr(param, 'settings') <- list(
- priorfunction = 'Cyclops::createPrior',
+
+ attr(param, "settings") <- list(
+ priorfunction = "Cyclops::createPrior",
selectorType = "byRow",
- crossValidationInPrior = T,
- modelType = 'cox',
- addIntercept = F,
- useControl = T,
+ crossValidationInPrior = TRUE,
+ modelType = "cox",
+ addIntercept = FALSE,
+ useControl = TRUE,
seed = seed[1],
name = "LASSO Cox Regression",
- threads = threads[1],
- tolerance = tolerance[1], #2e-07
- cvRepetitions = 1, #1
- maxIterations = maxIterations[1] #3000
+ threads = threads[1],
+ tolerance = tolerance[1], # 2e-07
+ cvRepetitions = 1, # 1
+ maxIterations = maxIterations[1] # 3000
)
-
- attr(param, 'modelType') <- 'survival'
- attr(param, 'saveType') <- 'RtoJson'
-
+
+ attr(param, "modelType") <- "survival"
+ attr(param, "saveType") <- "RtoJson"
+
result <- list(
fitFunction = "fitCyclopsModel",
param = param
)
class(result) <- "modelSettings"
-
+
return(result)
}
 #' Create setting for Iterative Hard Thresholding model
-#'
+#'
#' @param K The maximum number of non-zero predictors
#' @param penalty Specifies the IHT penalty; possible values are `BIC` or `AIC` or a numeric value
#' @param seed An option to add a seed when training the model
#' @param exclude A vector of numbers or covariateId names to exclude from prior
#' @param forceIntercept Logical: Force intercept coefficient into regularization
-#' @param fitBestSubset Logical: Fit final subset with no regularization
+#' @param fitBestSubset Logical: Fit final subset with no regularization
#' @param initialRidgeVariance integer
#' @param tolerance numeric
#' @param maxIterations integer
@@ -182,74 +178,76 @@ setCoxModel <- function(
#' @examples
 #' model.iht <- setIterativeHardThresholding()
#' @export
-setIterativeHardThresholding<- function(
- K = 10,
- penalty = "bic",
- seed = sample(100000,1),
- exclude = c(),
- forceIntercept = F,
- fitBestSubset = FALSE,
- initialRidgeVariance = 10000,
- tolerance = 1e-08,
- maxIterations = 10000,
- threshold = 1e-06,
- delta = 0
- ){
-
- ensure_installed("IterativeHardThresholding")
-
- if(K<1)
- stop('Invalid maximum number of predictors')
- if(!(penalty %in% c("aic", "bic") || is.numeric(penalty)))
+setIterativeHardThresholding <- function(
+ K = 10,
+ penalty = "bic",
+ seed = sample(100000, 1),
+ exclude = c(),
+ forceIntercept = FALSE,
+ fitBestSubset = FALSE,
+ initialRidgeVariance = 0.1,
+ tolerance = 1e-08,
+ maxIterations = 10000,
+ threshold = 1e-06,
+ delta = 0) {
+ rlang::check_installed("IterativeHardThresholding")
+
+ if (K < 1) {
+ stop("Invalid maximum number of predictors")
+ }
+ if (!(penalty %in% c("aic", "bic") || is.numeric(penalty))) {
stop('Penalty must be "aic", "bic" or numeric')
- if(!is.logical(forceIntercept))
+ }
+ if (!is.logical(forceIntercept)) {
stop("forceIntercept must be of type: logical")
- if(!is.logical(fitBestSubset))
+ }
+ if (!is.logical(fitBestSubset)) {
stop("fitBestSubset must be of type: logical")
- if(!inherits(x = seed, what = c('numeric','NULL','integer')))
- stop('Invalid seed')
+ }
+ if (!inherits(x = seed, what = c("numeric", "NULL", "integer"))) {
+ stop("Invalid seed")
+ }
# set seed
- if(is.null(seed[1])){
- seed <- as.integer(sample(100000000,1))
+ if (is.null(seed[1])) {
+ seed <- as.integer(sample(100000000, 1))
}
-
+
param <- list(
priorParams = list(
K = K,
- penalty = penalty,
+ penalty = penalty,
exclude = exclude,
forceIntercept = forceIntercept,
fitBestSubset = fitBestSubset,
initialRidgeVariance = initialRidgeVariance,
- tolerance = tolerance[1],
- maxIterations = maxIterations[1],
- threshold = threshold,
+ tolerance = tolerance[1],
+ maxIterations = maxIterations[1],
+ threshold = threshold,
delta = delta
)
)
-
- attr(param, 'settings') <- list(
- priorfunction = 'IterativeHardThresholding::createIhtPrior',
+
+ attr(param, "settings") <- list(
+ priorfunction = "IterativeHardThresholding::createIhtPrior",
selectorType = "byRow",
- crossValidationInPrior = F,
- modelType = 'logistic',
- addIntercept = F,
- useControl = F,
+ crossValidationInPrior = FALSE,
+ modelType = "logistic",
+ addIntercept = FALSE,
+ useControl = FALSE,
seed = seed[1],
name = "Iterative Hard Thresholding"
)
-
- attr(param, 'modelType') <- 'binary'
- attr(param, 'saveType') <- 'RtoJson'
-
+
+ attr(param, "modelType") <- "binary"
+ attr(param, "saveType") <- "RtoJson"
+
result <- list(
fitFunction = "fitCyclopsModel",
param = param
)
class(result) <- "modelSettings"
-
+
return(result)
}
-
diff --git a/R/DataSplitting.R b/R/DataSplitting.R
index 93c7a060e..3d3e5a2ec 100644
--- a/R/DataSplitting.R
+++ b/R/DataSplitting.R
@@ -2,13 +2,13 @@
# Copyright 2021 Observational Health Data Sciences and Informatics
#
# This file is part of PatientLevelPrediction
-#
+#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
-#
+#
# http://www.apache.org/licenses/LICENSE-2.0
-#
+#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -16,397 +16,445 @@
# limitations under the License.
-#' Create the settings for defining how the plpData are split into test/validation/train sets using
-#' default splitting functions (either random stratified by outcome, time or subject splitting)
+#' Create the settings for defining how the plpData are split into
+#' test/validation/train sets using default splitting functions
+#' (either random stratified by outcome, time or subject splitting)
#'
#' @details
-#' Returns an object of class \code{splitSettings} that specifies the splitting function that will be called and the settings
+#' Returns an object of class \code{splitSettings} that specifies the
+#' splitting function that will be called and the settings
#'
-#' @param testFraction (numeric) A real number between 0 and 1 indicating the test set fraction of the data
-#' @param trainFraction (numeric) A real number between 0 and 1 indicating the train set fraction of the data.
-#' If not set train is equal to 1 - test
-#' @param nfold (numeric) An integer > 1 specifying the number of folds used in cross validation
-#' @param splitSeed (numeric) A seed to use when splitting the data for reproducibility (if not set a random number will be generated)
-#' @param type (character) Choice of: \itemize{
-#' \item'stratified' Each data point is randomly assigned into the test or a train fold set but this is done stratified such that the outcome rate is consistent in each partition
-#' \item'time' Older data are assigned into the training set and newer data are assigned into the test set
-#' \item'subject' Data are partitioned by subject, if a subject is in the data more than once, all the data points for the subject are assigned either into the test data or into the train data (not both).
-#' }
+#' @param testFraction (numeric) A real number between 0 and 1
+#' indicating the test set fraction of the data
+#' @param trainFraction (numeric) A real number between 0 and 1 indicating the
+#' train set fraction of the data. If not set, the train fraction is 1 - test
+#' @param nfold (numeric) An integer > 1 specifying the number of
+#' folds used in cross validation
+#' @param splitSeed (numeric) A seed to use when splitting the data for
+#' reproducibility (if not set a random number will be generated)
+#' @param type (character) Choice of: \itemize{
+#' \item'stratified' Each data point is
+#' randomly assigned to the test set or a train fold, stratified
+#' such that the outcome rate is consistent in each partition
+#' \item'time' Older data are assigned
+#' into the training set and newer data are assigned into the test set
+#' \item'subject' Data are partitioned by
+#' subject; if a subject is in the data more than once, all the data points for
+#' the subject are assigned either to the test data or to the train data
+#' (not both).
+#' }
#'
#' @return
#' An object of class \code{splitSettings}
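+#' @examples
+#' # a minimal sketch: 25% test / 75% train split into 3 stratified folds,
+#' # with a fixed seed for reproducibility
+#' splitSettings <- createDefaultSplitSetting(
+#'   testFraction = 0.25,
+#'   trainFraction = 0.75,
+#'   splitSeed = 42,
+#'   nfold = 3,
+#'   type = "stratified"
+#' )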
#' @export
-createDefaultSplitSetting <- function(testFraction=0.25,
- trainFraction = 0.75,
- splitSeed=sample(100000,1),
- nfold=3,
- type = 'stratified'){
-
- checkIsClass(testFraction, c('numeric','integer'))
- checkHigherEqual(testFraction,0)
- checkHigher(-1*testFraction,-1)
-
- checkIsClass(trainFraction, c('numeric','integer'))
- checkHigher(trainFraction,0)
- checkHigherEqual(-1*trainFraction,-1)
-
- checkIsClass(nfold, c('numeric','integer'))
+createDefaultSplitSetting <- function(testFraction = 0.25,
+ trainFraction = 0.75,
+ splitSeed = sample(100000, 1),
+ nfold = 3,
+ type = "stratified") {
+ checkIsClass(testFraction, c("numeric", "integer"))
+ checkHigherEqual(testFraction, 0)
+ checkHigher(-1 * testFraction, -1)
+
+ checkIsClass(trainFraction, c("numeric", "integer"))
+ checkHigher(trainFraction, 0)
+ checkHigherEqual(-1 * trainFraction, -1)
+
+ checkIsClass(nfold, c("numeric", "integer"))
checkHigher(nfold, 1)
-
- checkIsClass(splitSeed, c('numeric','integer'))
-
+
+ checkIsClass(splitSeed, c("numeric", "integer"))
+
# add type check
checkIsClass(type, c("character"))
- if(!type %in% c('stratified','time','subject')){
- ParallelLogger::logError("Invalid type setting. Pick from: 'stratified','time','subject'")
- stop('Incorrect Type')
+ if (!type %in% c("stratified", "time", "subject")) {
+ ParallelLogger::logError("Invalid type setting.
+ Pick from: 'stratified','time','subject'")
+ stop("Incorrect Type")
}
-
- splitSettings <- list(test = testFraction,
- train = trainFraction,
- seed = splitSeed,
- nfold = nfold)
-
- if(type == 'stratified'){
+
+ splitSettings <- list(
+ test = testFraction,
+ train = trainFraction,
+ seed = splitSeed,
+ nfold = nfold
+ )
+
+ if (type == "stratified") {
attr(splitSettings, "fun") <- "randomSplitter"
}
- if(type == 'time'){
+ if (type == "time") {
attr(splitSettings, "fun") <- "timeSplitter"
}
- if(type == 'subject'){
+ if (type == "subject") {
attr(splitSettings, "fun") <- "subjectSplitter"
}
class(splitSettings) <- "splitSettings"
return(splitSettings)
}
+#' Create the settings for defining how the plpData are split into
+#' test/validation/train sets using an existing split - good to use for
+#' reproducing results from a different run
+#' @param splitIds (data.frame) A data frame with rowId and index columns of
+#' type integer/numeric. Index is -1 for test set, positive integer for train
+#' set folds
+#' @return An object of class \code{splitSettings}
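+#' @examples
+#' # a minimal sketch: rowIds 1 and 2 go to the test set (index -1),
+#' # rowIds 3 and 4 to train folds 1 and 2
+#' splitIds <- data.frame(rowId = 1:4, index = c(-1, -1, 1, 2))
+#' splitSettings <- createExistingSplitSettings(splitIds)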
+#' @export
+createExistingSplitSettings <- function(splitIds) {
+ checkIsClass(splitIds, "data.frame")
+ checkColumnNames(splitIds, c("rowId", "index"))
+ checkIsClass(splitIds$rowId, c("integer", "numeric"))
+ checkIsClass(splitIds$index, c("integer", "numeric"))
+ checkHigherEqual(splitIds$index, -1)
+
+ splitSettings <- list(splitIds = splitIds)
+ attr(splitSettings, "fun") <- "existingSplitter"
+ class(splitSettings) <- "splitSettings"
+ return(splitSettings)
+}
-# may want to remove 94 -126 as this makes it less flexible
-#' Split the plpData into test/train sets using a splitting settings of class \code{splitSettings}
+#' Split the plpData into test/train sets using a splitting settings of class
+#' \code{splitSettings}
#'
#' @details
-#' Returns a list containing the training data (Train) and optionally the test data (Test). Train is an Andromeda object containing
-#' \itemize{\item covariates: a table (rowId, covariateId, covariateValue) containing the covariates for each data point in the train data
+#' Returns a list containing the training data (Train) and optionally the test
+#' data (Test). Train is an Andromeda object containing
+#' \itemize{\item covariates: a table (rowId, covariateId, covariateValue)
+#' containing the covariates for each data point in the train data
#' \item covariateRef: a table with the covariate information
-#' \item labels: a table (rowId, outcomeCount, ...) for each data point in the train data (outcomeCount is the class label)
-#' \item folds: a table (rowId, index) specifying which training fold each data point is in.
-#' }
+#' \item labels: a table (rowId, outcomeCount, ...) for each data point
+#' in the train data (outcomeCount is the class label)
+#' \item folds: a table (rowId, index) specifying which training
+#' fold each data point is in.
+#' }
#' Test is an Andromeda object containing
-#' \itemize{\item covariates: a table (rowId, covariateId, covariateValue) containing the covariates for each data point in the test data
+#' \itemize{\item covariates: a table (rowId, covariateId, covariateValue)
+#' containing the covariates for each data point in the test data
#' \item covariateRef: a table with the covariate information
-#' \item labels: a table (rowId, outcomeCount, ...) for each data point in the test data (outcomeCount is the class label)
-#' }
-#'
-#'
-#'
-#' @param plpData An object of type \code{plpData} - the patient level prediction
-#' data extracted from the CDM.
-#' @param population The population created using \code{createStudyPopulation} that define who will be used to develop the model
-#' @param splitSettings An object of type \code{splitSettings} specifying the split - the default can be created using \code{createDefaultSplitSetting}
-#'
+#' \item labels: a table (rowId, outcomeCount, ...) for each data
+#' point in the test data (outcomeCount is the class label)
+#' }
+#' @param plpData An object of type \code{plpData} - the patient level
+#' prediction data extracted from the CDM.
+#' @param population The population created using \code{createStudyPopulation}
+#' that defines who will be used to develop the model
+#' @param splitSettings An object of type \code{splitSettings} specifying the
+#' split - the default can be created using \code{createDefaultSplitSetting}
#'
#' @return
 #' An object of class \code{splitData} containing the Train data and, if a test fraction was specified, the Test data
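+#' @examples
+#' \dontrun{
+#' # a hypothetical sketch: plpData and population are placeholder objects
+#' # from earlier data extraction and createStudyPopulation calls
+#' data <- splitData(
+#'   plpData = plpData,
+#'   population = population,
+#'   splitSettings = createDefaultSplitSetting(testFraction = 0.25)
+#' )
+#' }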
#' @export
splitData <- function(plpData = plpData,
population = population,
- splitSettings = splitSettings){
-
+ splitSettings = splitSettings) {
+ start <- Sys.time()
fun <- attr(splitSettings, "fun")
- args <- list(population = population,
- splitSettings = splitSettings)
+ args <- list(
+ population = population,
+ splitSettings = splitSettings
+ )
splitId <- do.call(eval(parse(text = fun)), args)
# now separate the data:
- if(sum(splitId$index<0)==0){
+ if (sum(splitId$index < 0) == 0) {
# NO TEST SET
- trainId <- splitId[splitId$index>0,]
+ trainId <- splitId[splitId$index > 0, ]
trainData <- list()
- class(trainData) <- 'plpData'
- trainData$labels <- population %>% dplyr::filter(.data$rowId %in% trainId$rowId)
+ class(trainData) <- "plpData"
+ trainData$labels <- population %>%
+ dplyr::filter(.data$rowId %in% trainId$rowId)
trainData$folds <- trainId
-
- #restrict to trainIds
- if(length(trainId$rowId)<200000){
+
+ # restrict to trainIds
+ if (length(trainId$rowId) < 200000) {
trainData$covariateData <- limitCovariatesToPopulation(
- plpData$covariateData,
+ plpData$covariateData,
trainId$rowId
- )
- } else{
+ )
+ } else {
trainData$covariateData <- batchRestrict(
- plpData$covariateData,
- data.frame(rowId = trainId$rowId),
+ plpData$covariateData,
+ data.frame(rowId = trainId$rowId),
sizeN = 10000000
- )
+ )
}
-
- #trainData$covariateData <- Andromeda::andromeda()
- #trainData$covariateData$covariates <- plpData$covariateData$covariates %>% dplyr::filter(.data$rowId %in% trainId$rowId)
- #trainData$covariateData$covariateRef <- plpData$covariateRef
+ metaData <- attr(population, "metaData")
attr(trainData, "metaData") <- list(
- outcomeId = attr(population, "metaData")$outcomeId,
- targetId = attr(population, "metaData")$targetId,
+ outcomeId = metaData$outcomeId,
+ targetId = metaData$targetId,
cdmDatabaseSchema = plpData$metaData$databaseDetails$cdmDatabaseSchema,
cdmDatabaseName = plpData$metaData$databaseDetails$cdmDatabaseName,
cdmDatabaseId = plpData$metaData$databaseDetails$cdmDatabaseId,
- restrictPlpDataSettings = attr(population, "metaData")$restrictPlpDataSettings,
+ restrictPlpDataSettings = metaData$restrictPlpDataSettings,
covariateSettings = plpData$metaData$covariateSettings,
- populationSettings = attr(population, "metaData")$populationSettings,
- attrition = attr(population, "metaData")$attrition,
+ populationSettings = metaData$populationSettings,
+ attrition = metaData$attrition,
splitSettings = splitSettings,
populationSize = nrow(trainData$labels)
)
# add pop size to covariateData as used in tidyCovariates
- attr(trainData$covariateData, "metaData") <- list(populationSize = nrow(trainData$labels))
+ attr(trainData$covariateData, "metaData") <-
+ list(populationSize = nrow(trainData$labels))
class(trainData$covariateData) <- "CovariateData"
-
- result <- list(Train = trainData)
-
- }else{
- trainId <- splitId[splitId$index>0,]
+
+ result <- list(Train = trainData)
+ } else {
+ trainId <- splitId[splitId$index > 0, ]
trainData <- list()
- class(trainData) <- 'plpData'
- trainData$labels <- population %>% dplyr::filter(.data$rowId %in% trainId$rowId)
+ class(trainData) <- "plpData"
+ trainData$labels <- population %>%
+ dplyr::filter(.data$rowId %in% trainId$rowId)
trainData$folds <- trainId
-
- #restrict to trainIds
- if(length(trainId$rowId)<200000){
+
+ # restrict to trainIds
+ if (length(trainId$rowId) < 200000) {
trainData$covariateData <- limitCovariatesToPopulation(
- plpData$covariateData,
+ plpData$covariateData,
trainId$rowId
)
- } else{
+ } else {
trainData$covariateData <- batchRestrict(
- plpData$covariateData,
- data.frame(rowId = trainId$rowId),
+ plpData$covariateData,
+ data.frame(rowId = trainId$rowId),
sizeN = 10000000
- )
+ )
}
+ metaData <- attr(population, "metaData")
attr(trainData, "metaData") <- list(
- outcomeId = attr(population, "metaData")$outcomeId,
- targetId = attr(population, "metaData")$targetId,
+ outcomeId = metaData$outcomeId,
+ targetId = metaData$targetId,
cdmDatabaseSchema = plpData$metaData$databaseDetails$cdmDatabaseSchema,
cdmDatabaseName = plpData$metaData$databaseDetails$cdmDatabaseName,
cdmDatabaseId = plpData$metaData$databaseDetails$cdmDatabaseId,
- restrictPlpDataSettings = attr(population, "metaData")$restrictPlpDataSettings,
+ restrictPlpDataSettings = metaData$restrictPlpDataSettings,
covariateSettings = plpData$metaData$covariateSettings,
- populationSettings = attr(population, "metaData")$populationSettings,
+ populationSettings = metaData$populationSettings,
       attrition = metaData$attrition,
splitSettings = splitSettings,
populationSize = nrow(trainData$labels)
- )
-
- testId <- splitId[splitId$index<0,]
+ )
+
+ testId <- splitId[splitId$index < 0, ]
testData <- list()
- class(testData) <- 'plpData'
- testData$labels <- population %>% dplyr::filter(.data$rowId %in% testId$rowId)
-
- if(length(testId$rowId)<200000){
+ class(testData) <- "plpData"
+ testData$labels <- population %>%
+ dplyr::filter(.data$rowId %in% testId$rowId)
+
+ if (length(testId$rowId) < 200000) {
testData$covariateData <- limitCovariatesToPopulation(
- plpData$covariateData,
+ plpData$covariateData,
testId$rowId
)
- } else{
- testData$covariateData <- batchRestrict(plpData$covariateData,
- data.frame(rowId = testId$rowId),
- sizeN = 10000000)
+ } else {
+ testData$covariateData <- batchRestrict(plpData$covariateData,
+ data.frame(rowId = testId$rowId),
+ sizeN = 10000000
+ )
}
-
+
result <- list(
- Train = trainData,
+ Train = trainData,
Test = testData
- )
+ )
}
-
- class(result) <- 'splitData'
+
+ class(result) <- "splitData"
+ delta <- Sys.time() - start
+ ParallelLogger::logInfo("Data split in ", signif(delta, 3), " ", attr(delta, "units"))
return(result)
}
-dataSummary <- function(data){
- #Test/Train: lsit(covariates,covariateRef, label, folds)
-
- ParallelLogger::logInfo('Train Set:')
- result <- data$Train$labels %>%
- dplyr::inner_join(data$Train$folds, by = 'rowId') %>%
+dataSummary <- function(data) {
+ ParallelLogger::logInfo("Train Set:")
+ result <- data$Train$labels %>%
+ dplyr::inner_join(data$Train$folds, by = "rowId") %>%
dplyr::group_by(.data$index) %>%
- dplyr::summarise(N = length(.data$outcomeCount),
- outcomes = sum(.data$outcomeCount)) %>%
- dplyr::collect()
-
- ParallelLogger::logInfo(paste('Fold ', result$index ,' ', result$N, ' patients with ', result$outcomes, ' outcomes', sep ='', collapse = ' - '))
-
-
- result <- data$Train$covariateData$covariates %>%
- dplyr::group_by(.data$covariateId) %>%
- dplyr::summarise(N = length(.data$covariateValue)) %>%
+ dplyr::summarise(
+ N = length(.data$outcomeCount),
+ outcomes = sum(.data$outcomeCount)
+ ) %>%
dplyr::collect()
-
- ParallelLogger::logInfo(paste0(nrow(result), ' covariates in train data'))
-
- if('Test' %in% names(data)){
- ParallelLogger::logInfo('Test Set:')
- result <- data$Test$label %>%
+
+ ParallelLogger::logInfo(paste("Fold ", result$index, " ", result$N,
+ " patients with ", result$outcomes, " outcomes", sep = "",
+ collapse = " - "))
+
+
+ result <- data$Train$covariateData$covariates %>%
+ dplyr::distinct(.data$covariateId) %>%
+ dplyr::count() %>%
+ dplyr::pull()
+
+ ParallelLogger::logInfo(paste0(result, " covariates in train data"))
+
+ if ("Test" %in% names(data)) {
+ ParallelLogger::logInfo("Test Set:")
+    result <- data$Test$labels %>%
dplyr::collect()
-
- ParallelLogger::logInfo(paste0(nrow(result), ' patients with ', sum(result$outcomeCount>0), ' outcomes'))
-
+
+ ParallelLogger::logInfo(paste0(nrow(result), " patients with ",
+ sum(result$outcomeCount > 0), " outcomes"))
}
-
+
return(invisible(TRUE))
}
randomSplitter <- function(population, splitSettings) {
-
test <- splitSettings$test
- train <- splitSettings$train
+ train <- splitSettings$train
nfold <- splitSettings$nfold
seed <- splitSettings$seed
-
+
checkInputsSplit(test, train, nfold, seed)
# parameter checking
- if (!is.null(seed))
+ if (!is.null(seed)) {
set.seed(seed)
-
- if (length(table(population$outcomeCount)) <= 1 | sum(population$outcomeCount > 0) < 10) {
- stop("Outcome only occurs in fewer than 10 people or only one class")
}
-
- if (floor(sum(population$outcomeCount > 0) * train/nfold) < 5) {
- stop(paste0("Insufficient (",sum(population$outcomeCount > 0),") outcomes for choosen nfold value, please reduce"))
- }
-
-
- ParallelLogger::logInfo(paste0("Creating a ",
- test * 100,
- "% test and ",
- train * 100,
- "% train (into ",
- nfold,
- " folds) random stratified split by class"))
+ checkOutcomes(population, train, nfold)
+ ParallelLogger::logInfo(paste0(
+ "Creating a ",
+ test * 100,
+ "% test and ",
+ train * 100,
+ "% train (into ",
+ nfold,
+ " folds) random stratified split by class"
+ ))
outPpl <- population$rowId[population$outcomeCount == 1]
nonPpl <- population$rowId[population$outcomeCount == 0]
- # give random number to all and shuffle then assign to test/train/cv if using set.seed() then use
- # permutation <- stats::runif(length(nonPpl)+length(outPpl)) and nonPpl <-
- # nonPpl[order(permutation[1:length(nonPpl)])] and outPpl <-
- # outPpl[order(permutation[(1+length(nonPpl)):length(permutation)])]
nonPpl <- nonPpl[order(stats::runif(length(nonPpl)))]
outPpl <- outPpl[order(stats::runif(length(outPpl)))]
# reset all to not included (index=0)
- nonPpl.group <- rep(0, length(nonPpl))
-
+ nonPplGroup <- rep(0, length(nonPpl))
+
# set test set (index=-1)
- if(test>0){
- test.ind <- 1:floor(length(nonPpl) * test)
- nonPpl.group[test.ind] <- -1
+ if (test > 0) {
+ testInd <- 1:floor(length(nonPpl) * test)
+ nonPplGroup[testInd] <- -1
}
-
+
# set train set (index>0)
- train.ind <- (floor(length(nonPpl) * test) + round(length(nonPpl) * (1-train-test)) + 1):length(nonPpl)
- reps <- floor(length(train.ind)/nfold)
- leftOver <- length(train.ind)%%nfold
- if (leftOver > 0)
- nonPpl.group[train.ind] <- c(rep(1:nfold, each = reps), 1:leftOver)
- if (leftOver == 0)
- nonPpl.group[train.ind] <- rep(1:nfold, each = reps)
-
+ trainInd <- (floor(length(nonPpl) * test) + round(length(nonPpl) *
+ (1 - train - test)) + 1):length(nonPpl)
+ reps <- floor(length(trainInd) / nfold)
+ leftOver <- length(trainInd) %% nfold
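+  # e.g. 100 train rows with nfold = 3: reps = 33, leftOver = 1, so fold 1
+  # gets 34 rows and folds 2 and 3 get 33 each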
+ if (leftOver > 0) {
+ nonPplGroup[trainInd] <- c(rep(1:nfold, each = reps), 1:leftOver)
+ }
+ if (leftOver == 0) {
+ nonPplGroup[trainInd] <- rep(1:nfold, each = reps)
+ }
+
# same for outcome = 1
- outPpl.group <- rep(0, length(outPpl))
- if(test>0){
- test.ind <- 1:floor(length(outPpl) * test)
- outPpl.group[test.ind] <- -1
+ outPplGroup <- rep(0, length(outPpl))
+ if (test > 0) {
+ testInd <- 1:floor(length(outPpl) * test)
+ outPplGroup[testInd] <- -1
+ }
+ trainInd <- (floor(length(outPpl) * test) + round(length(outPpl) *
+ (1 - train - test)) + 1):length(outPpl)
+ reps <- floor(length(trainInd) / nfold)
+ leftOver <- length(trainInd) %% nfold
+ if (leftOver > 0) {
+ outPplGroup[trainInd] <- c(rep(1:nfold, each = reps), 1:leftOver)
+ }
+ if (leftOver == 0) {
+ outPplGroup[trainInd] <- rep(1:nfold, each = reps)
}
- train.ind <- (floor(length(outPpl) * test) + round(length(outPpl) * (1-train-test)) + 1):length(outPpl)
- reps <- floor(length(train.ind)/nfold)
- leftOver <- length(train.ind)%%nfold
- if (leftOver > 0)
- outPpl.group[train.ind] <- c(rep(1:nfold, each = reps), 1:leftOver)
- if (leftOver == 0)
- outPpl.group[train.ind] <- rep(1:nfold, each = reps)
-
- split <- data.frame(rowId = c(nonPpl, outPpl), index = c(nonPpl.group, outPpl.group))
+
+ split <- data.frame(rowId = c(nonPpl, outPpl),
+ index = c(nonPplGroup, outPplGroup))
split <- split[order(-split$rowId), ]
foldSizesTrain <- utils::tail(table(split$index), nfold)
- ParallelLogger::logInfo(paste0("Data split into ", sum(split$index < 0), " test cases and ", sum(split$index >
+ ParallelLogger::logInfo(paste0("Data split into ", sum(split$index < 0),
+ " test cases and ", sum(split$index >
0), " train cases", " (", toString(foldSizesTrain), ")"))
- if (test+train<1)
- ParallelLogger::logInfo(paste0(sum(split$index == 0), " were not used for training or testing"))
-
+ if (test + train < 1) {
+ ParallelLogger::logInfo(paste0(sum(split$index == 0),
+ " were not used for training or testing"))
+ }
# return index vector
return(split)
}
timeSplitter <- function(population, splitSettings) {
-
test <- splitSettings$test
- train <- splitSettings$train
+ train <- splitSettings$train
nfold <- splitSettings$nfold
seed <- splitSettings$seed
-
+
checkInputsSplit(test, train, nfold, seed)
-
+
# parameter checking
- if (!is.null(seed))
+ if (!is.null(seed)) {
set.seed(seed)
-
+ }
+
dates <- as.Date(population$cohortStartDate, format = "%Y-%m-%d")
   # find the date below which (1 - test) of the subjects fall; subjects with later dates form the test set
- dates.ord <- dates[order(dates)]
- if(test>0){
- testDate <- dates.ord[round(length(dates.ord) * (1 - test))]
- } else{
- testDate <- dates.ord[length(dates.ord)] # if no test use all for train
+ datesOrd <- dates[order(dates)]
+ if (test > 0) {
+ testDate <- datesOrd[round(length(datesOrd) * (1 - test))]
+ } else {
+ testDate <- datesOrd[length(datesOrd)] # if no test use all for train
}
- outPpl <- data.frame(rowId = population$rowId[population$outcomeCount == 1],
- date = dates[population$outcomeCount ==
- 1])
- nonPpl <- data.frame(rowId = population$rowId[population$outcomeCount == 0],
- date = dates[population$outcomeCount ==
- 0])
-
- ParallelLogger::logInfo(paste0("Creating ",
- test * 100,
- "% test and ",
- train * 100,
- "% train (into ",
- nfold,
- " folds) stratified split at ",
- testDate))
+ outPpl <- data.frame(
+ rowId = population$rowId[population$outcomeCount == 1],
+ date = dates[population$outcomeCount == 1]
+ )
+ nonPpl <- data.frame(
+ rowId = population$rowId[population$outcomeCount == 0],
+ date = dates[population$outcomeCount == 0]
+ )
+
+ ParallelLogger::logInfo(paste0(
+ "Creating ", test * 100, "% test and ",
+ train * 100, "% train (into ", nfold, " folds) stratified split at ",
+ testDate
+ ))
# shuffle the data
nonPpl <- nonPpl[order(stats::runif(nrow(nonPpl))), ]
outPpl <- outPpl[order(stats::runif(nrow(outPpl))), ]
- nonPpl.group <- rep(-1, nrow(nonPpl))
- nonPpl.group[nonPpl$date <= testDate] <- rep(1:nfold,
- each = ceiling(sum(nonPpl$date <= testDate)*(train/(1-test))/nfold))[1:sum(nonPpl$date <=
+ nonPplGroup <- rep(-1, nrow(nonPpl))
+  nonPplGroup[nonPpl$date <= testDate] <- rep(1:nfold,
+    each = ceiling(sum(nonPpl$date <= testDate) * (train / (1 - test)) / nfold)
+  )[1:sum(nonPpl$date <= testDate)]
-
- # Fill NA values with 0
- nonPpl.group[is.na(nonPpl.group)] <- 0
-
- outPpl.group <- rep(-1, nrow(outPpl))
- outPpl.group[outPpl$date <= testDate] <- rep(1:nfold,
- each = ceiling(sum(outPpl$date <= testDate)*(train/(1-test))/nfold))[1:sum(outPpl$date <=
+
+ # Fill NA values with 0
+ nonPplGroup[is.na(nonPplGroup)] <- 0
+
+ outPplGroup <- rep(-1, nrow(outPpl))
+  outPplGroup[outPpl$date <= testDate] <- rep(1:nfold,
+    each = ceiling(sum(outPpl$date <= testDate) * (train / (1 - test)) / nfold)
+  )[1:sum(outPpl$date <= testDate)]
-
- # Fill NA values with 0
- outPpl.group[is.na(outPpl.group)] <- 0
- split <- data.frame(rowId = c(nonPpl$rowId, outPpl$rowId), index = c(nonPpl.group, outPpl.group))
+ # Fill NA values with 0
+ outPplGroup[is.na(outPplGroup)] <- 0
+
+ split <- data.frame(rowId = c(nonPpl$rowId, outPpl$rowId),
+ index = c(nonPplGroup, outPplGroup))
split <- split[order(split$rowId), ]
foldSizesTrain <- utils::tail(table(split$index), nfold)
- ParallelLogger::logInfo(paste0("Data split into ", sum(split$index < 0), " test cases and ", sum(split$index >
+ ParallelLogger::logInfo(paste0("Data split into ", sum(split$index < 0),
+ " test cases and ", sum(split$index >
0), " train samples", " (", toString(foldSizesTrain), ")"))
- if (test+train<1)
- ParallelLogger::logInfo(paste0(sum(split$index == 0), " were not used for training or testing"))
+ if (test + train < 1) {
+ ParallelLogger::logInfo(paste0(sum(split$index == 0),
+ " were not used for training or testing"))
+ }
# return index vector
return(split)
}
@@ -414,115 +462,149 @@ timeSplitter <- function(population, splitSettings) {
subjectSplitter <- function(population, splitSettings) {
test <- splitSettings$test
- train <- splitSettings$train
+ train <- splitSettings$train
nfold <- splitSettings$nfold
seed <- splitSettings$seed
-
+
checkInputsSplit(test, train, nfold, seed)
-
+
# parameter checking
- if (!is.null(seed))
+ if (!is.null(seed)) {
set.seed(seed)
-
- if (length(table(population$outcomeCount)) <= 1 | sum(population$outcomeCount > 0) < 10) {
+ }
+
+ if (length(table(population$outcomeCount)) <= 1 ||
+ sum(population$outcomeCount > 0) < 10) {
stop("Outcome only occurs in fewer than 10 people or only one class")
}
-
- if (floor(sum(population$outcomeCount > 0) * train/nfold) < 5) {
+
+ if (floor(sum(population$outcomeCount > 0) * train / nfold) < 5) {
stop("Insufficient outcomes for choosen nfold value, please reduce")
}
-
-
- ParallelLogger::logInfo(paste0("Creating a ",
- test * 100,
- "% test and ",
- train * 100,
- "% train (into ",
- nfold,
- " folds) stratified split by subject"))
-
-
+
+ ParallelLogger::logInfo(paste0(
+ "Creating a ", test * 100, "% test and ", train * 100, "% train (into ",
+ nfold, " folds) stratified split by subject"
+ ))
+
   # get subject ids for outcome and split them up
outPpl <- unique(population$subjectId[population$outcomeCount == 1])
-
- # get subject id for non outcome and split them up
+
+ # get subject id for non outcome and split them up
nonPpl <- setdiff(unique(population$subjectId), outPpl)
-
- # give random number to all and shuffle then assign to test/train/cv if using set.seed() then use
+
+ # give random number to all and shuffle then assign to test/train/cv
nonPpl <- nonPpl[order(stats::runif(length(nonPpl)))]
outPpl <- outPpl[order(stats::runif(length(outPpl)))]
-
+
# default all to not included (index=0)
nonPplGroup <- rep(0, length(nonPpl))
-
- if(test>0){
- # set test set (index=-1)
- # use floor(x + 0.5) to get rounding to nearest integer
- # instead of to nearest even number when x is .5
- testInd <- 1:floor(length(nonPpl) * test + 0.5)
+
+ if (test > 0) {
+ # set test set (index=-1)
+ # use floor(x + 0.5) to get rounding to nearest integer
+ # instead of to nearest even number when x is .5
+ testInd <- 1:floor(length(nonPpl) * test + 0.5)
nonPplGroup[testInd] <- -1
}
-
+
# set train set (index>0)
- trainInd <- floor(length(nonPpl) * test + length(nonPpl) * (1-train-test) + 1.5):length(nonPpl)
- reps <- floor(length(trainInd)/nfold)
- leftOver <- length(trainInd)%%nfold
- if (leftOver > 0){
+ trainInd <- floor(length(nonPpl) * test +
+ length(nonPpl) * (1 - train - test) + 1.5):length(nonPpl)
+ reps <- floor(length(trainInd) / nfold)
+ leftOver <- length(trainInd) %% nfold
+ if (leftOver > 0) {
nonPplGroup[trainInd] <- c(rep(1:nfold, each = reps), 1:leftOver)
}
- if (leftOver == 0){
+ if (leftOver == 0) {
nonPplGroup[trainInd] <- rep(1:nfold, each = reps)
}
-
+
# same for outcome = 1
outPplGroup <- rep(0, length(outPpl))
- if(test>0){
+ if (test > 0) {
testInd <- 1:floor(length(outPpl) * test + 0.5)
outPplGroup[testInd] <- -1
}
- trainInd <- floor(length(outPpl) * test + length(outPpl) * (1-train-test) + 1.5):length(outPpl)
- reps <- floor(length(trainInd)/nfold)
- leftOver <- length(trainInd)%%nfold
- if (leftOver > 0){
+ trainInd <- floor(length(outPpl) * test +
+ length(outPpl) * (1 - train - test) + 1.5):length(outPpl)
+ reps <- floor(length(trainInd) / nfold)
+ leftOver <- length(trainInd) %% nfold
+ if (leftOver > 0) {
outPplGroup[trainInd] <- c(rep(1:nfold, each = reps), 1:leftOver)
}
- if (leftOver == 0){
+ if (leftOver == 0) {
outPplGroup[trainInd] <- rep(1:nfold, each = reps)
}
-
- split <- data.frame(subjectId = c(nonPpl, outPpl), index = c(nonPplGroup, outPplGroup))
-
- split <- merge(population[, c('rowId','subjectId')], split, by = 'subjectId')
- split <- split[, c('rowId', 'index')]
+
+ split <- data.frame(subjectId = c(nonPpl, outPpl),
+ index = c(nonPplGroup, outPplGroup))
+
+ split <- merge(population[, c("rowId", "subjectId")], split, by = "subjectId")
+ split <- split[, c("rowId", "index")]
split <- split[order(-split$rowId), ]
-
+
foldSizesTrain <- utils::tail(table(split$index), nfold)
- ParallelLogger::logInfo(paste0("Data split into ", sum(split$index < 0), " test cases and ", sum(split$index >
- 0), " train cases", " (", toString(foldSizesTrain), ")"))
- if (test+train<1)
- ParallelLogger::logInfo(paste0(sum(split$index == 0), " were not used for training or testing"))
-
+ ParallelLogger::logInfo(paste0("Data split into ", sum(split$index < 0),
+ " test cases and ", sum(split$index >
+ 0), " train cases", " (", toString(foldSizesTrain), ")"))
+ if (test + train < 1) {
+ ParallelLogger::logInfo(paste0(sum(split$index == 0),
+ " were not used for training or testing"))
+ }
+
# return index vector
return(split)
}
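
To make the index coding concrete, here is a minimal sketch of calling the splitter on toy data (it assumes the package's internal helpers such as checkInputsSplit are loaded, e.g. via devtools::load_all()); index -1 marks test rows, 0 unused rows, and 1..nfold the cross-validation folds:

    # Toy population: 50 subjects with two rows each; 25 subjects have the outcome.
    population <- data.frame(
      rowId = 1:100,
      subjectId = rep(1:50, each = 2),
      outcomeCount = rep(c(0, 0, 0, 1), length.out = 100)
    )
    splitSettings <- list(test = 0.25, train = 0.75, nfold = 3, seed = 42)
    split <- subjectSplitter(population, splitSettings)
    table(split$index) # -1 = test, 1..3 = train folds; a subject's rows stay together
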
# this is not needed for every function - which split setting does not use it? (fix in future)
-checkInputsSplit <- function(test, train, nfold, seed){
- ParallelLogger::logDebug(paste0('test: ', test))
- checkIsClass(test, c('numeric','integer'))
- checkHigherEqual(test,0)
- checkHigher(-1*test,-1)
-
- ParallelLogger::logDebug(paste0('train: ', train))
- checkIsClass(train, c('numeric','integer'))
- checkHigherEqual(train,0)
- checkHigherEqual(-1*train,-1)
-
- ParallelLogger::logDebug(paste0('nfold: ', nfold))
- checkIsClass(nfold, c('numeric','integer'))
+checkInputsSplit <- function(test, train, nfold, seed) {
+ ParallelLogger::logDebug(paste0("test: ", test))
+ checkIsClass(test, c("numeric", "integer"))
+ checkHigherEqual(test, 0)
+ checkHigher(-1 * test, -1)
+
+ ParallelLogger::logDebug(paste0("train: ", train))
+ checkIsClass(train, c("numeric", "integer"))
+ checkHigherEqual(train, 0)
+ checkHigherEqual(-1 * train, -1)
+
+ ParallelLogger::logDebug(paste0("nfold: ", nfold))
+ checkIsClass(nfold, c("numeric", "integer"))
checkHigher(nfold, 1)
-
- ParallelLogger::logInfo(paste0('seed: ', seed))
- checkIsClass(seed, c('numeric','integer'))
+
+ ParallelLogger::logInfo(paste0("seed: ", seed))
+ checkIsClass(seed, c("numeric", "integer"))
+}
+
+existingSplitter <- function(population, splitSettings) {
+ splitIds <- splitSettings$splitIds
+ # check all row Ids are in population
+ if (sum(!splitIds$rowId %in% population$rowId) > 0) {
+ stop("Not all rowIds in splitIds are in the population")
+ }
+ return(splitIds)
+}
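
The new existingSplitter simply validates and passes through a user-supplied split. A hypothetical splitIds table, reusing the same index coding as the other splitters (values are illustrative only):

    population <- data.frame(rowId = 1:6)
    splitIds <- data.frame(
      rowId = 1:6,
      index = c(-1, -1, 1, 2, 3, 0) # -1 = test, 1..3 = train folds, 0 = excluded
    )
    existingSplitter(population, splitSettings = list(splitIds = splitIds))
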
+
+checkOutcomes <- function(population, train, nfold) {
+ if (!is.null(options("plp.outcomes")[[1]])) {
+ if (sum(population$outcomeCount > 0) < options("plp.outcomes")[[1]]) {
+ stop("Outcome count is less than specified option plp.outcomes: ",
+ options("plp.outcomes")[[1]])
+ }
+
+ } else {
+    # plp default minimum outcomes: fewer than 10 in total or fewer than 5 per train fold
+ if (length(table(population$outcomeCount)) <= 1 ||
+ sum(population$outcomeCount > 0) < 10) {
+ stop("Outcome only occurs in fewer than 10 people or only one class")
+ }
+
+ if (floor(sum(population$outcomeCount > 0) * train / nfold) < 5) {
+ stop(paste0("Insufficient (", sum(population$outcomeCount > 0), ")
+ outcomes for choosen nfold value, please reduce"))
+ }
+ }
+ return(invisible(TRUE))
}
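
The plp.outcomes option appears to let users raise the minimum outcome count above the package default of 10; a sketch of how checkOutcomes reacts on toy data:

    options(plp.outcomes = 50)
    population <- data.frame(outcomeCount = c(rep(0, 960), rep(1, 40)))
    tryCatch(
      checkOutcomes(population, train = 0.75, nfold = 3),
      error = function(e) message(conditionMessage(e)) # 40 outcomes < 50 requested
    )
    options(plp.outcomes = NULL) # restore the default thresholds
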
diff --git a/R/DatabaseMigration.R b/R/DatabaseMigration.R
index bc480ce32..406a20d97 100644
--- a/R/DatabaseMigration.R
+++ b/R/DatabaseMigration.R
@@ -56,7 +56,7 @@ migrateDataModel <- function(connectionDetails, databaseSchema, tablePrefix = ""
getDataMigrator <- function(connectionDetails, databaseSchema, tablePrefix = "") {
- ensure_installed("ResultModelManager")
+ rlang::check_installed("ResultModelManager")
ResultModelManager::DataMigrationManager$new(
connectionDetails = connectionDetails,
@@ -65,4 +65,4 @@ getDataMigrator <- function(connectionDetails, databaseSchema, tablePrefix = "")
migrationPath = "migrations",
packageName = utils::packageName()
)
-}
\ No newline at end of file
+}
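
The switch from the package-internal ensure_installed to rlang::check_installed means a missing suggested package now triggers an interactive install prompt (or an informative error in non-interactive sessions). The pattern, shown in a hypothetical helper:

    plotSomething <- function(data) {
      # Prompts to install ggplot2 if it is absent; errors with the reason otherwise.
      rlang::check_installed("ggplot2", reason = "to plot results")
      ggplot2::ggplot(data, ggplot2::aes(x = x, y = y)) +
        ggplot2::geom_point()
    }
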
diff --git a/R/DemographicSummary.R b/R/DemographicSummary.R
index b3df14305..f4bf7ee36 100644
--- a/R/DemographicSummary.R
+++ b/R/DemographicSummary.R
@@ -3,162 +3,161 @@
#' @details
#' Generates a data.frame with the calibration per each 5 year age group and gender group
#'
-#' @param prediction A prediction object
-#' @param predictionType The type of prediction (binary or survival)
-#' @param typeColumn A column that is used to stratify the results
+#' @param prediction A prediction object
+#' @param predictionType The type of prediction (binary or survival)
+#' @param typeColumn A column that is used to stratify the results
#'
#' @return
#' A dataframe with the calibration summary
#' @export
getDemographicSummary <- function(
- prediction,
- predictionType,
- typeColumn = 'evaluation'
-){
+ prediction,
+ predictionType,
+ typeColumn = "evaluation") {
evaluation <- do.call(
- what = paste0('getDemographicSummary_', predictionType),
+ what = paste0("getDemographicSummary_", predictionType),
args = list(
- prediction = prediction,
+ prediction = prediction,
evalColumn = typeColumn,
- timepoint = attr(prediction, 'metaData')$timepoint
+ timepoint = attr(prediction, "metaData")$timepoint
)
)
-
+
return(evaluation)
}
-getDemographicSummary_binary <- function(prediction, evalColumn , ...){
-
+getDemographicSummary_binary <- function(prediction, evalColumn, ...) {
result <- c()
- evalTypes <- unique(as.data.frame(prediction)[,evalColumn])
-
- for(evalType in evalTypes){
-
+ evalTypes <- unique(as.data.frame(prediction)[, evalColumn])
+
+ for (evalType in evalTypes) {
predictionOfInterest <- prediction %>% dplyr::filter(.data[[evalColumn]] == evalType)
-
- demographicData <- predictionOfInterest[,c('rowId','ageYear','gender')] %>%
+
+ demographicData <- predictionOfInterest[, c("rowId", "ageYear", "gender")] %>%
dplyr::mutate(
- ageId = floor(.data$ageYear/5),
- ageGroup = paste0('Age group: ', floor(.data$ageYear/5)*5, '-',floor(.data$ageYear/5)*5+4),
+ ageId = floor(.data$ageYear / 5),
+ ageGroup = paste0("Age group: ", floor(.data$ageYear / 5) * 5, "-", floor(.data$ageYear / 5) * 5 + 4),
genId = .data$gender,
- genGroup = ifelse(.data$gender==8507, 'Male', 'Female')) %>%
- dplyr::select("rowId","ageId","ageGroup","genId","genGroup") %>%
- dplyr::inner_join(predictionOfInterest[,colnames(predictionOfInterest)%in%c('rowId', 'value','outcomeCount','survivalTime')], by='rowId')
-
+ genGroup = ifelse(.data$gender == 8507, "Male", "Female")
+ ) %>%
+ dplyr::select("rowId", "ageId", "ageGroup", "genId", "genGroup") %>%
+ dplyr::inner_join(predictionOfInterest[, colnames(predictionOfInterest) %in% c("rowId", "value", "outcomeCount", "survivalTime")], by = "rowId")
+
demographicData <- demographicData %>%
- dplyr::group_by(.data$ageGroup,.data$genGroup) %>%
+ dplyr::group_by(.data$ageGroup, .data$genGroup) %>%
dplyr::summarise(
- PersonCountAtRisk = length(.data$outcomeCount),
+ PersonCountAtRisk = length(.data$outcomeCount),
PersonCountWithOutcome = sum(.data$outcomeCount),
- averagePredictedProbability = mean(.data$value, na.rm = T),
- StDevPredictedProbability = stats::sd(.data$value, na.rm = T),
- MinPredictedProbability =stats::quantile(.data$value, probs = 0),
- P25PredictedProbability =stats::quantile(.data$value, probs = 0.25),
- P50PredictedProbability =stats::quantile(.data$value, probs = 0.50),
- P75PredictedProbability =stats::quantile(.data$value, probs = 0.75),
- MaxPredictedProbability =stats::quantile(.data$value, probs = 1),
+ averagePredictedProbability = mean(.data$value, na.rm = TRUE),
+ StDevPredictedProbability = stats::sd(.data$value, na.rm = TRUE),
+ MinPredictedProbability = stats::quantile(.data$value, probs = 0),
+ P25PredictedProbability = stats::quantile(.data$value, probs = 0.25),
+ P50PredictedProbability = stats::quantile(.data$value, probs = 0.50),
+ P75PredictedProbability = stats::quantile(.data$value, probs = 0.75),
+ MaxPredictedProbability = stats::quantile(.data$value, probs = 1),
)
-
- demographicData$evaluation = evalType
-
+
+ demographicData$evaluation <- evalType
+
result <- rbind(result, demographicData)
-
}
-
+
result <- as.data.frame(result)
return(result)
-
}
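
The 5-year bucketing above maps an age to the label of its bin; a quick worked example:

    age <- 37
    floor(age / 5) # bin index 7
    paste0("Age group: ", floor(age / 5) * 5, "-", floor(age / 5) * 5 + 4)
    # "Age group: 35-39"
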
-getDemographicSummary_survival <- function(prediction, evalColumn, timepoint = NULL, ...){
-
+getDemographicSummary_survival <- function(prediction, evalColumn, timepoint = NULL, ...) {
+ rlang::check_installed(
+ pkg = c("survival"),
+ reason = "getDemographicSummary_survival requires the survival package to be installed"
+ )
result <- c()
- evalTypes <- unique(as.data.frame(prediction)[,evalColumn])
-
- for(evalType in evalTypes){
-
- predictionOfInterest <- prediction %>%
+ evalTypes <- unique(as.data.frame(prediction)[, evalColumn])
+
+ for (evalType in evalTypes) {
+ predictionOfInterest <- prediction %>%
dplyr::filter(.data[[evalColumn]] == evalType)
-
- demographicData <- predictionOfInterest[,c('rowId','ageYear','gender')] %>%
- dplyr::mutate(ageId = floor(.data$ageYear/5),
- ageGroup = paste0('Age group: ', floor(.data$ageYear/5)*5, '-',floor(.data$ageYear/5)*5+4),
+
+ demographicData <- predictionOfInterest[, c("rowId", "ageYear", "gender")] %>%
+ dplyr::mutate(
+ ageId = floor(.data$ageYear / 5),
+ ageGroup = paste0("Age group: ", floor(.data$ageYear / 5) * 5, "-", floor(.data$ageYear / 5) * 5 + 4),
genId = .data$gender,
- genGroup = ifelse(.data$gender==8507, 'Male', 'Female')) %>%
- dplyr::select("rowId","ageId","ageGroup","genId","genGroup" ) %>%
- dplyr::inner_join(predictionOfInterest[,colnames(predictionOfInterest)%in%c('rowId', 'value','outcomeCount','survivalTime')], by='rowId')
-
-
- if(is.null(timepoint)){
+ genGroup = ifelse(.data$gender == 8507, "Male", "Female")
+ ) %>%
+ dplyr::select("rowId", "ageId", "ageGroup", "genId", "genGroup") %>%
+ dplyr::inner_join(predictionOfInterest[, colnames(predictionOfInterest) %in% c("rowId", "value", "outcomeCount", "survivalTime")], by = "rowId")
+
+
+ if (is.null(timepoint)) {
timepoint <- max(demographicData$survivalTime)
}
- demographicSum <- demographicData %>%
+ demographicSum <- demographicData %>%
dplyr::mutate(
t = .data$survivalTime,
y = ifelse(.data$outcomeCount > 0, 1, 0)
)
-
- gen <- unique(demographicData$genGroup)
- ageGroup <- unique(demographicData$ageGroup)
-
+
+ genList <- unique(demographicData$genGroup)
+ ageGroups <- unique(demographicData$ageGroup)
+
demographicData <- NULL
- for(gen in gen){
- for(age in ageGroup){
-
- tempDemo <- demographicSum %>%
- dplyr::filter( .data$genGroup == gen & .data$ageGroup == age )
-
- if (nrow(tempDemo) > 1 & length(unique(tempDemo$y)) > 1) {
+ for (gen in genList) {
+ for (age in ageGroups) {
+ tempDemo <- demographicSum %>%
+ dplyr::filter(.data$genGroup == gen & .data$ageGroup == age)
+
+ if (nrow(tempDemo) > 1 && length(unique(tempDemo$y)) > 1) {
t <- tempDemo$t
y <- tempDemo$y
value <- tempDemo$value
-
+
out <- tryCatch(
{
summary(
- survival::survfit(survival::Surv(t, y) ~ 1),
+ survival::survfit(survival::Surv(t, y) ~ 1),
times = timepoint
)
},
- error = function(e){ParallelLogger::logError(e);return(NULL)}
+ error = function(e) {
+ ParallelLogger::logError(e)
+ return(NULL)
+ }
)
-
- if(!is.null(out)){
+
+ if (!is.null(out) && !is.null(out$surv)) {
demoTemp <- c(
- genGroup = gen,
- ageGroup = age,
+ genGroup = gen,
+ ageGroup = age,
PersonCountAtRisk = length(value),
- PersonCountWithOutcome = round(length(value)*(1-out$surv)),
- observedRisk = 1-out$surv,
- averagePredictedProbability = mean(value, na.rm = T),
- StDevPredictedProbability = stats::sd(value, na.rm = T)
- )
-
+ PersonCountWithOutcome = round(length(value) * (1 - out$surv)),
+ observedRisk = 1 - out$surv,
+ averagePredictedProbability = mean(value, na.rm = TRUE),
+ StDevPredictedProbability = stats::sd(value, na.rm = TRUE)
+ )
+
demographicData <- rbind(demographicData, demoTemp)
}
}
-
}
-
}
demographicData <- as.data.frame(demographicData)
- demographicData$averagePredictedProbability <- as.double(as.character(demographicData$averagePredictedProbability ))
- demographicData$StDevPredictedProbability <- as.double(as.character(demographicData$StDevPredictedProbability ))
- demographicData$PersonCountAtRisk <- as.double(as.character(demographicData$PersonCountAtRisk ))
- demographicData$PersonCountWithOutcome <- as.double(as.character(demographicData$PersonCountWithOutcome ))
-
+ demographicData$averagePredictedProbability <- as.double(as.character(demographicData$averagePredictedProbability))
+ demographicData$StDevPredictedProbability <- as.double(as.character(demographicData$StDevPredictedProbability))
+ demographicData$PersonCountAtRisk <- as.double(as.character(demographicData$PersonCountAtRisk))
+ demographicData$PersonCountWithOutcome <- as.double(as.character(demographicData$PersonCountWithOutcome))
+
demographicData$evaluation <- evalType
-
+
result <- rbind(
- result,
+ result,
demographicData
)
-
}
-
+
result <- as.data.frame(result)
return(result)
}
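
The survival branch derives the observed risk per age/gender stratum as one minus the Kaplan-Meier survival estimate at the timepoint; a self-contained sketch of that calculation with simulated data:

    library(survival)
    set.seed(1)
    t <- rexp(200, rate = 0.01) # follow-up times in days
    y <- rbinom(200, 1, 0.3)    # event indicator
    fit <- summary(survfit(Surv(t, y) ~ 1), times = 100)
    observedRisk <- 1 - fit$surv # KM-based observed risk at day 100
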
diff --git a/R/DiagnosePlp.R b/R/DiagnosePlp.R
index dee4bba82..7a8bdb448 100644
--- a/R/DiagnosePlp.R
+++ b/R/DiagnosePlp.R
@@ -19,51 +19,48 @@
#' Run a list of predictions diagnoses
#'
#' @details
-#' This function will run all specified prediction design diagnoses as defined using .
+#' This function will run all specified prediction design diagnoses as defined using \code{createModelDesign()}.
#'
#' @param databaseDetails The database settings created using \code{createDatabaseDetails()}
#' @param modelDesignList A list of model designs created using \code{createModelDesign()}
#' @param cohortDefinitions A list of cohort definitions for the target and outcome cohorts
 #' @param logSettings The settings specifying the logging for the analyses, created using \code{createLogSettings()}
 #' @param saveDirectory Name of the folder to which all the outputs will be written.
-#'
+#'
#' @return
#' A data frame with the following columns: \tabular{ll}{ \verb{analysisId} \tab The unique identifier
#' for a set of analysis choices.\cr \verb{targetId} \tab The ID of the target cohort populations.\cr
-#' \verb{outcomeId} \tab The ID of the outcomeId.\cr \verb{dataLocation} \tab The location where the plpData was saved
+#' \verb{outcomeId} \tab The ID of the outcome cohort.\cr \verb{dataLocation} \tab The location where the plpData was saved
#' \cr \verb{the settings ids} \tab The ids for all other settings used for model development.\cr }
#'
#' @export
diagnoseMultiplePlp <- function(
- databaseDetails = createDatabaseDetails(),
- modelDesignList = list(
- createModelDesign(targetId = 1, outcomeId = 2, modelSettings = setLassoLogisticRegression()),
- createModelDesign(targetId = 1, outcomeId = 3, modelSettings = setLassoLogisticRegression())
- ),
- cohortDefinitions = NULL,
- logSettings = createLogSettings(
- verbosity = "DEBUG",
- timeStamp = T,
- logName = "diagnosePlp Log"
- ),
- saveDirectory = getwd()
-){
-
- #input checks
- checkIsClass(databaseDetails, c('databaseDetails'))
- checkIsClass(modelDesignList, c('list', 'modelDesign'))
- checkIsClass(logSettings, 'logSettings')
- checkIsClass(saveDirectory, 'character')
- if(!dir.exists(saveDirectory)){
- dir.create(saveDirectory, recursive = T)
+ databaseDetails = createDatabaseDetails(),
+ modelDesignList = list(
+ createModelDesign(targetId = 1, outcomeId = 2, modelSettings = setLassoLogisticRegression()),
+ createModelDesign(targetId = 1, outcomeId = 3, modelSettings = setLassoLogisticRegression())
+ ),
+ cohortDefinitions = NULL,
+ logSettings = createLogSettings(
+ verbosity = "DEBUG",
+ timeStamp = TRUE,
+ logName = "diagnosePlp Log"
+ ),
+ saveDirectory = getwd()) {
+ # input checks
+ checkIsClass(databaseDetails, c("databaseDetails"))
+ checkIsClass(modelDesignList, c("list", "modelDesign"))
+ checkIsClass(logSettings, "logSettings")
+ checkIsClass(saveDirectory, "character")
+ if (!dir.exists(saveDirectory)) {
+ dir.create(saveDirectory, recursive = TRUE)
}
-
- if(is.null(cohortDefinitions)){
-
+
+ if (is.null(cohortDefinitions)) {
cohortIds <- unlist(
lapply(
- X = 1:length(modelDesignList),
- FUN = function(i){
+ X = 1:length(modelDesignList),
+ FUN = function(i) {
c(
modelDesignList[[i]]$targetId,
modelDesignList[[i]]$outcomeId
@@ -71,91 +68,94 @@ diagnoseMultiplePlp <- function(
}
)
)
-
+
cohortDefinitions <- lapply(
- X = cohortIds,
- FUN = function(x){
+ X = cohortIds,
+ FUN = function(x) {
list(
- id = x,
- name = paste0('Cohort: ', x)
+ id = x,
+ name = paste0("Cohort: ", x)
)
}
)
-
}
-
- settingstable <- convertToJson(modelDesignList,cohortDefinitions) # from runMultiplePlp.R
-
- if(nrow(settingstable) != length(modelDesignList)){
- stop('Error in settingstable')
+
+ settingstable <- convertToJson(modelDesignList, cohortDefinitions) # from runMultiplePlp.R
+
+ if (nrow(settingstable) != length(modelDesignList)) {
+ stop("Error in settingstable")
}
-
+
# save the settings: TODO fix
utils::write.csv(
x = settingstable %>% dplyr::select(
"analysisId",
- "targetId",
+ "targetId",
"targetName",
- "outcomeId",
+ "outcomeId",
"outcomeName",
"dataLocation"
- ),
- file.path(saveDirectory,'settings.csv'),
- row.names = F
+ ),
+ file.path(saveDirectory, "settings.csv"),
+ row.names = FALSE
)
# group the outcomeIds per combination of data extraction settings
- dataSettings <- settingstable %>%
+ dataSettings <- settingstable %>%
dplyr::group_by(
.data$targetId,
.data$covariateSettings,
.data$restrictPlpDataSettings,
.data$dataLocation
- ) %>%
+ ) %>%
dplyr::summarise(
- outcomeIds = paste(unique(.data$outcomeId), collapse = ',')
+ outcomeIds = paste(unique(.data$outcomeId), collapse = ",")
)
-
+
# extract data
- for(i in 1:nrow(as.data.frame(dataSettings))){
- dataExists <- length(dir(file.path(saveDirectory, dataSettings$dataLocation[i])))>0
- if(!dataExists){
- ParallelLogger::logInfo(paste('Extracting data for cohort', dataSettings$targetId[i], 'to', file.path(saveDirectory, dataSettings$dataLocation[i])))
-
+ for (i in 1:nrow(as.data.frame(dataSettings))) {
+ dataExists <- length(dir(file.path(saveDirectory, dataSettings$dataLocation[i]))) > 0
+ if (!dataExists) {
+ ParallelLogger::logInfo(paste("Extracting data for cohort", dataSettings$targetId[i], "to", file.path(saveDirectory, dataSettings$dataLocation[i])))
+
databaseDetails$targetId <- dataSettings$targetId[i]
- databaseDetails$outcomeIds <- strsplit(dataSettings$outcomeIds[i], ',')[[1]]
-
+ databaseDetails$outcomeIds <- strsplit(dataSettings$outcomeIds[i], ",")[[1]]
+
plpDataSettings <- list(
databaseDetails = databaseDetails,
covariateSettings = ParallelLogger::convertJsonToSettings(dataSettings$covariateSettings[i]),
restrictPlpDataSettings = ParallelLogger::convertJsonToSettings(dataSettings$restrictPlpDataSettings[i])
)
-
+
plpData <- tryCatch(
- {do.call(getPlpData, plpDataSettings)},
- error = function(e){ParallelLogger::logInfo(e); return(NULL)}
+ {
+ do.call(getPlpData, plpDataSettings)
+ },
+ error = function(e) {
+ ParallelLogger::logInfo(e)
+ return(NULL)
+ }
)
- if(!is.null(plpData)){
+ if (!is.null(plpData)) {
savePlpData(plpData, file.path(saveDirectory, dataSettings$dataLocation[i]))
}
- } else{
- ParallelLogger::logInfo(paste('Data for cohort', dataSettings$targetId[i], 'exists at', file.path(saveDirectory, dataSettings$dataLocation[i])))
+ } else {
+ ParallelLogger::logInfo(paste("Data for cohort", dataSettings$targetId[i], "exists at", file.path(saveDirectory, dataSettings$dataLocation[i])))
}
}
-
+
# diagnosePlp
- for(i in 1:nrow(as.data.frame(settingstable))){
+ for (i in 1:nrow(as.data.frame(settingstable))) {
modelDesign <- modelDesignList[[i]]
- settings <- settingstable[i,] # just the data locations?
-
- dataExists <- length(dir(file.path(saveDirectory, settings$dataLocation)))>0
-
- if(dataExists){
+ settings <- settingstable[i, ] # just the data locations?
+
+ dataExists <- length(dir(file.path(saveDirectory, settings$dataLocation))) > 0
+
+ if (dataExists) {
plpData <- PatientLevelPrediction::loadPlpData(file.path(saveDirectory, settings$dataLocation))
-
- diagnoseExists <- file.exists(file.path(saveDirectory, settings$analysisId, 'diagnosePlp.rds'))
- if(!diagnoseExists){
-
+
+ diagnoseExists <- file.exists(file.path(saveDirectory, settings$analysisId, "diagnosePlp.rds"))
+ if (!diagnoseExists) {
diagnosePlpSettings <- list(
plpData = plpData,
outcomeId = modelDesign$outcomeId,
@@ -169,16 +169,20 @@ diagnoseMultiplePlp <- function(
logSettings = logSettings,
saveDirectory = saveDirectory
)
-
+
result <- tryCatch(
- {do.call(diagnosePlp, diagnosePlpSettings)},
- error = function(e){ParallelLogger::logInfo(e); return(NULL)}
+ {
+ do.call(diagnosePlp, diagnosePlpSettings)
+ },
+ error = function(e) {
+ ParallelLogger::logInfo(e)
+ return(NULL)
+ }
)
- } else{
- ParallelLogger::logInfo(paste('Diagnosis ', settings$analysisId, 'exists at', file.path(saveDirectory, settings$analysisId)))
+ } else {
+ ParallelLogger::logInfo(paste("Diagnosis ", settings$analysisId, "exists at", file.path(saveDirectory, settings$analysisId)))
}
} # end run per setting
-
}
return(invisible(settingstable))
}
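
A hedged sketch of a call, mirroring the defaults above; databaseDetails must be built beforehand with createDatabaseDetails() for a real CDM, and the cohort IDs are placeholders:

    settingsTable <- diagnoseMultiplePlp(
      databaseDetails = databaseDetails, # created elsewhere via createDatabaseDetails()
      modelDesignList = list(
        createModelDesign(targetId = 1, outcomeId = 2,
                          modelSettings = setLassoLogisticRegression()),
        createModelDesign(targetId = 1, outcomeId = 3,
                          modelSettings = setLassoLogisticRegression())
      ),
      saveDirectory = file.path(tempdir(), "diagnostics")
    )
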
@@ -187,30 +191,30 @@ diagnoseMultiplePlp <- function(
#' diagnostic - Investigates the prediction problem settings - use before training a model
#'
#' @description
-#' This function runs a set of prediction diagnoses to help pick a suitable T, O, TAR and determine
+#' This function runs a set of prediction diagnoses to help pick a suitable T, O, TAR and determine
#' whether the prediction problem is worth executing.
-#'
+#'
#' @details
 #' Users can define a set of Ts, Os, databases and population settings. A list of data.frames is returned containing details such as
-#' follow-up time distribution, time-to-event information, characteriszation details, time from last prior event,
-#' observation time distribution.
+#' follow-up time distribution, time-to-event information, characterization details, time from last prior event,
+#' observation time distribution.
#'
#' @param plpData An object of type \code{plpData} - the patient level prediction
-#' data extracted from the CDM. Can also include an initial population as
+#' data extracted from the CDM. Can also include an initial population as
 #'                                       plpData$population.
-#' @param outcomeId (integer) The ID of the outcome.
+#' @param outcomeId (integer) The ID of the outcome.
#' @param analysisId (integer) Identifier for the analysis. It is used to create, e.g., the result folder. Default is a timestamp.
#' @param populationSettings An object of type \code{populationSettings} created using \code{createStudyPopulationSettings} that
-#' specifies how the data class labels are defined and addition any exclusions to apply to the
+#'                                       specifies how the data class labels are defined and, additionally, any exclusions to apply to the
#' plpData cohort
-#' @param splitSettings An object of type \code{splitSettings} that specifies how to split the data into train/validation/test.
-#' The default settings can be created using \code{createDefaultSplitSetting}.
+#' @param splitSettings An object of type \code{splitSettings} that specifies how to split the data into train/validation/test.
+#' The default settings can be created using \code{createDefaultSplitSetting}.
#' @param sampleSettings An object of type \code{sampleSettings} that specifies any under/over sampling to be done.
#' The default is none.
-#' @param featureEngineeringSettings An object of \code{featureEngineeringSettings} specifying any feature engineering to be learned (using the train data)
-#' @param preprocessSettings An object of \code{preprocessSettings}. This setting specifies the minimum fraction of
-#' target population who must have a covariate for it to be included in the model training
-#' and whether to normalise the covariates before training
+#' @param featureEngineeringSettings An object of \code{featureEngineeringSettings} specifying any feature engineering to be learned (using the train data)
+#' @param preprocessSettings An object of \code{preprocessSettings}. This setting specifies the minimum fraction of
+#' target population who must have a covariate for it to be included in the model training
+#' and whether to normalise the covariates before training
#' @param modelSettings An object of class \code{modelSettings} created using one of the function:
#' \itemize{
#' \item setLassoLogisticRegression() A lasso logistic regression model
@@ -219,12 +223,12 @@ diagnoseMultiplePlp <- function(
#' \item setRandomForest() A random forest model
#' \item setDecisionTree() A decision tree model
#' \item setKNN() A KNN model
-#'
-#' }
-#' @param logSettings An object of \code{logSettings} created using \code{createLogSettings}
-#' specifying how the logging is done
+#'
+#' }
+#' @param logSettings An object of \code{logSettings} created using \code{createLogSettings}
+#' specifying how the logging is done
#' @param saveDirectory The path to the directory where the results will be saved (if NULL uses working directory)
-#'
+#'
#' @return
#' An object containing the model or location where the model is save, the data selection settings, the preprocessing
#' and training settings as well as various performance measures obtained by the model.
@@ -237,62 +241,60 @@ diagnoseMultiplePlp <- function(
#' @export
#' @examples
#' \dontrun{
-#' #******** EXAMPLE 1 *********
-#' }
+#' #******** EXAMPLE 1 *********
+#' }
diagnosePlp <- function(
- plpData = NULL,
- outcomeId,
- analysisId,
- populationSettings,
- splitSettings = createDefaultSplitSetting(),
- sampleSettings = createSampleSettings(), # default none
- saveDirectory = NULL,
- featureEngineeringSettings = createFeatureEngineeringSettings(), # default none
- modelSettings = setLassoLogisticRegression(), # default to logistic regression
- logSettings = createLogSettings(
- verbosity = 'DEBUG',
- timeStamp = T,
- logName = 'diagnosePlp Log'
- ),
- preprocessSettings = createPreprocessSettings()
-){
-
- # start log
+ plpData = NULL,
+ outcomeId,
+ analysisId,
+ populationSettings,
+ splitSettings = createDefaultSplitSetting(),
+ sampleSettings = createSampleSettings(), # default none
+ saveDirectory = NULL,
+ featureEngineeringSettings = createFeatureEngineeringSettings(), # default none
+ modelSettings = setLassoLogisticRegression(), # default to logistic regression
+ logSettings = createLogSettings(
+ verbosity = "DEBUG",
+ timeStamp = TRUE,
+ logName = "diagnosePlp Log"
+ ),
+ preprocessSettings = createPreprocessSettings()) {
+ # start log
analysisPath <- file.path(saveDirectory, analysisId)
logSettings$saveDirectory <- analysisPath
- logSettings$logFileName <- 'plpLog'
- logger <- do.call(createLog,logSettings)
+ logSettings$logFileName <- "plpLog"
+ logger <- do.call(createLog, logSettings)
ParallelLogger::registerLogger(logger)
on.exit(closeLog(logger))
-
+
participantsDiag <- probastParticipants(
- plpData,
+ plpData,
outcomeId = outcomeId,
populationSettings = populationSettings
)
-
+
predictorDiag <- probastPredictors(
- plpData,
+ plpData,
outcomeId = outcomeId,
populationSettings = populationSettings
)
-
+
outcomeDiag <- probastOutcome(
- plpData,
+ plpData,
outcomeId = outcomeId,
populationSettings = populationSettings
)
-
+
designDiag <- probastDesign(
- plpData,
+ plpData,
outcomeId = outcomeId,
populationSettings = populationSettings
)
-
- # Question: what about
- # splitSettings, sampleSettings,
+
+ # Question: what about
+ # splitSettings, sampleSettings,
# FeatureEngineeringSettings, modelSettings
-
+
result <- list(
summary = rbind(
participantsDiag$diagnosticParticipantsAggregate,
@@ -319,60 +321,58 @@ diagnosePlp <- function(
databaseSchema = plpData$metaData$databaseDetails$cdmDatabaseSchema,
databaseId = plpData$metaData$databaseDetails$cdmDatabaseId
)
-
- class(result) <- 'diagnosePlp'
-
- if(!is.null(saveDirectory)){
- if(!dir.exists(file.path(saveDirectory, analysisId))){
- dir.create(file.path(saveDirectory, analysisId), recursive = T)
+
+ class(result) <- "diagnosePlp"
+
+ if (!is.null(saveDirectory)) {
+ if (!dir.exists(file.path(saveDirectory, analysisId))) {
+ dir.create(file.path(saveDirectory, analysisId), recursive = TRUE)
}
- saveLocation <- file.path(saveDirectory, analysisId, 'diagnosePlp.rds')
- ParallelLogger::logInfo(paste0('Saving diagnosePlp to ', saveLocation))
+ saveLocation <- file.path(saveDirectory, analysisId, "diagnosePlp.rds")
+ ParallelLogger::logInfo(paste0("Saving diagnosePlp to ", saveLocation))
saveRDS(result, saveLocation)
}
-
+
return(result)
}
probastDesign <- function(
- plpData,
- outcomeId,
- populationSettings
-){
-
+ plpData,
+ outcomeId,
+ populationSettings) {
diagnosticAggregate <- c()
diagnosticFull <- c()
-
+
population <- PatientLevelPrediction::createStudyPopulation(
- plpData = plpData,
- outcomeId = outcomeId,
+ plpData = plpData,
+ outcomeId = outcomeId,
populationSettings = populationSettings
)
-
- probastId <- '4.1'
- if(min(sum(population$outcomeCount > 0),nrow(population) - sum(population$outcomeCount > 0)) >= 1000){
+
+ probastId <- "4.1"
+ if (min(sum(population$outcomeCount > 0), nrow(population) - sum(population$outcomeCount > 0)) >= 1000) {
diagnosticAggregate <- rbind(
diagnosticAggregate,
- c(probastId, 'Pass')
+ c(probastId, "Pass")
)
- }else if((min(sum(population$outcomeCount > 0),nrow(population) - sum(population$outcomeCount > 0)) >= 100)){
+ } else if ((min(sum(population$outcomeCount > 0), nrow(population) - sum(population$outcomeCount > 0)) >= 100)) {
diagnosticAggregate <- rbind(
diagnosticAggregate,
- c(probastId, 'Unknown')
+ c(probastId, "Unknown")
)
- } else{
+ } else {
diagnosticAggregate <- rbind(
diagnosticAggregate,
- c(probastId, 'Fail')
+ c(probastId, "Fail")
)
}
-
- if(!is.null(dim(diagnosticAggregate))){
- diagnosticAggregate <- as.data.frame(diagnosticAggregate) %>%
+
+ if (!is.null(dim(diagnosticAggregate))) {
+ diagnosticAggregate <- as.data.frame(diagnosticAggregate) %>%
dplyr::mutate_if(is.factor, as.character)
- colnames(diagnosticAggregate) <- c('probastId','resultValue')
+ colnames(diagnosticAggregate) <- c("probastId", "resultValue")
}
-
+
return(
list
(
@@ -380,140 +380,132 @@ probastDesign <- function(
diagnosticDesignAggregate = diagnosticAggregate
)
)
-
-
}
probastParticipants <- function(
- plpData,
- outcomeId,
- populationSettings
-){
-
+ plpData,
+ outcomeId,
+ populationSettings) {
diagnosticAggregate <- c()
diagnosticFull <- c()
-
+
# true due to plp
- probastId <- '1.1'
+ probastId <- "1.1"
diagnosticAggregate <- rbind(
diagnosticAggregate,
- c(probastId, 'Pass')
+ c(probastId, "Pass")
)
-
+
# appropriate inclusions
## 1.2.1 min prior observation
- if(populationSettings$washoutPeriod != 0 ){
- var <- 'washoutPeriod'
- probastId <- '1.2.1'
-
+ if (populationSettings$washoutPeriod != 0) {
+ var <- "washoutPeriod"
+ probastId <- "1.2.1"
+
result <- getDiagnostic(
probastId,
- plpData,
- outcomeId,
- populationSettings,
- var,
- 0
+ plpData,
+ outcomeId,
+ populationSettings,
+ var,
+ 0
)
-
+
diagnosticAggregate <- rbind(
diagnosticAggregate,
result$diagnosticAggregate
)
-
+
diagnosticFull <- rbind(
diagnosticFull,
result$diagnosticFull
)
-
}
-
+
## 1.2.2 min time-at-risk
- if(populationSettings$requireTimeAtRisk & populationSettings$minTimeAtRisk > 0){
- probastId <- '1.2.2'
- var <- 'minTimeAtRisk'
-
+ if (populationSettings$requireTimeAtRisk && populationSettings$minTimeAtRisk > 0) {
+ probastId <- "1.2.2"
+ var <- "minTimeAtRisk"
+
result <- getDiagnostic(
probastId,
- plpData,
- outcomeId,
- populationSettings,
- var,
- 0
+ plpData,
+ outcomeId,
+ populationSettings,
+ var,
+ 0
)
-
+
diagnosticAggregate <- rbind(
diagnosticAggregate,
result$diagnosticAggregate
)
-
+
diagnosticFull <- rbind(
diagnosticFull,
result$diagnosticFull
)
-
}
-
-
+
+
## 1.2.3 first exposure only
- if(populationSettings$firstExposureOnly){
- probastId <- '1.2.3'
- var <- 'firstExposureOnly'
- default <- F
-
+ if (populationSettings$firstExposureOnly) {
+ probastId <- "1.2.3"
+ var <- "firstExposureOnly"
+ default <- FALSE
+
result <- getDiagnostic(
probastId,
- plpData,
- outcomeId,
- populationSettings,
- var,
- default
+ plpData,
+ outcomeId,
+ populationSettings,
+ var,
+ default
)
-
+
diagnosticAggregate <- rbind(
diagnosticAggregate,
result$diagnosticAggregate
)
-
+
diagnosticFull <- rbind(
diagnosticFull,
result$diagnosticFull
)
-
}
-
- ## 1.2.4 prior observation
- if(populationSettings$removeSubjectsWithPriorOutcome & populationSettings$priorOutcomeLookback > 0){
- probastId <- '1.2.4'
- var <- 'priorOutcomeLookback'
+
+ ## 1.2.4 prior observation
+ if (populationSettings$removeSubjectsWithPriorOutcome && populationSettings$priorOutcomeLookback > 0) {
+ probastId <- "1.2.4"
+ var <- "priorOutcomeLookback"
default <- 0
-
+
result <- getDiagnostic(
probastId,
- plpData,
- outcomeId,
- populationSettings,
- var,
- default
+ plpData,
+ outcomeId,
+ populationSettings,
+ var,
+ default
)
-
+
diagnosticAggregate <- rbind(
diagnosticAggregate,
result$diagnosticAggregate
)
-
+
diagnosticFull <- rbind(
diagnosticFull,
result$diagnosticFull
)
-
}
-
- if(!is.null(dim(diagnosticAggregate))){
- diagnosticAggregate <- as.data.frame(diagnosticAggregate) %>%
+
+ if (!is.null(dim(diagnosticAggregate))) {
+ diagnosticAggregate <- as.data.frame(diagnosticAggregate) %>%
dplyr::mutate_if(is.factor, as.character)
- colnames(diagnosticAggregate) <- c('probastId','resultValue')
+ colnames(diagnosticAggregate) <- c("probastId", "resultValue")
}
-
+
return(
list
(
@@ -524,17 +516,18 @@ probastParticipants <- function(
}
-getMaxEndDaysFromCovariates <- function(covariateSettings){
-
- if(inherits(covariateSettings, 'covariateSettings')){
+getMaxEndDaysFromCovariates <- function(covariateSettings) {
+ if (inherits(covariateSettings, "covariateSettings")) {
covariateSettings <- list(covariateSettings)
}
-
- vals <- unlist(lapply(covariateSettings, function(x){x$endDays}))
-
- if(length(vals) == 0){
+
+ vals <- unlist(lapply(covariateSettings, function(x) {
+ x$endDays
+ }))
+
+ if (length(vals) == 0) {
return(0)
- } else{
+ } else {
return(max(vals))
}
}
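
Behaviour of the helper on illustrative covariateSettings-like objects (only endDays is inspected; if no setting carries endDays the result is 0):

    cs1 <- structure(list(endDays = -1), class = "covariateSettings")
    cs2 <- structure(list(endDays = 30), class = "covariateSettings")
    getMaxEndDaysFromCovariates(list(cs1, cs2)) # 30
    getMaxEndDaysFromCovariates(cs1)            # a single object is wrapped in a list: -1
    getMaxEndDaysFromCovariates(list(structure(list(), class = "covariateSettings"))) # 0
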
@@ -542,136 +535,132 @@ getMaxEndDaysFromCovariates <- function(covariateSettings){
probastPredictors <- function(
- plpData,
- outcomeId,
- populationSettings
-){
-
+ plpData,
+ outcomeId,
+ populationSettings) {
diagnosticAggregate <- c()
diagnosticFull <- c()
-
+
# true due to plp
- probastId <- '2.1'
+ probastId <- "2.1"
diagnosticAggregate <- rbind(
diagnosticAggregate,
- c(probastId, 'Pass')
+ c(probastId, "Pass")
)
-
+
# 2.2.1
# cov end date < tar_start
# covariate + outcome correlation; km of outcome (close to index or not)?
- probastId <- '2.2'
- if(populationSettings$startAnchor == 'cohort start'){
- if(populationSettings$riskWindowStart > getMaxEndDaysFromCovariates(plpData$metaData$covariateSettings)){
+ probastId <- "2.2"
+ if (populationSettings$startAnchor == "cohort start") {
+ if (populationSettings$riskWindowStart > getMaxEndDaysFromCovariates(plpData$metaData$covariateSettings)) {
diagnosticAggregate <- rbind(
diagnosticAggregate,
- c(probastId, 'Pass')
+ c(probastId, "Pass")
)
- } else{
+ } else {
diagnosticAggregate <- rbind(
diagnosticAggregate,
- c(probastId, 'Fail')
+ c(probastId, "Fail")
)
}
- } else{
+ } else {
diagnosticAggregate <- rbind(
diagnosticAggregate,
- c(probastId, 'Unknown')
+ c(probastId, "Unknown")
)
}
-
+
# KM of outcome
populationSettingsFull <- populationSettings
- populationSettingsFull$riskWindowEnd <- 10*365
-
+ populationSettingsFull$riskWindowEnd <- 10 * 365
+
population <- PatientLevelPrediction::createStudyPopulation(
- plpData = plpData,
- outcomeId = outcomeId,
+ plpData = plpData,
+ outcomeId = outcomeId,
populationSettings = populationSettings
)
-
+
populationFull <- PatientLevelPrediction::createStudyPopulation(
- plpData = plpData,
- outcomeId = outcomeId,
+ plpData = plpData,
+ outcomeId = outcomeId,
populationSettings = populationSettingsFull
)
-
- #dayOfEvent, outcomeAtTime, observedAtStartOfDay
- kmObservation <- population %>%
- dplyr::group_by(.data$daysToEvent) %>%
+
+ # dayOfEvent, outcomeAtTime, observedAtStartOfDay
+ kmObservation <- population %>%
+ dplyr::group_by(.data$daysToEvent) %>%
dplyr::summarise(
outcomeAtTime = sum(!is.na(.data$daysToEvent))
)
-
- kmObservation$observedAtStartOfDay <- unlist(
+
+ kmObservation$observedAtStartOfDay <- unlist(
lapply(
kmObservation$daysToEvent,
- function(x){
- population %>%
+ function(x) {
+ population %>%
dplyr::filter(.data$survivalTime >= x) %>%
- dplyr::tally() %>%
+ dplyr::tally() %>%
dplyr::select("n")
}
)
)
-
+
kmObservation$probastId <- probastId
- kmObservation$inputType <- 'populationSettings'
-
- kmObservationFull <- populationFull %>%
- dplyr::group_by(.data$daysToEvent) %>%
+ kmObservation$inputType <- "populationSettings"
+
+ kmObservationFull <- populationFull %>%
+ dplyr::group_by(.data$daysToEvent) %>%
dplyr::summarise(
outcomeAtTime = sum(!is.na(.data$daysToEvent))
)
-
- kmObservationFull$observedAtStartOfDay <- unlist(
+
+ kmObservationFull$observedAtStartOfDay <- unlist(
lapply(
kmObservationFull$daysToEvent,
- function(x){
- populationFull %>%
+ function(x) {
+ populationFull %>%
dplyr::filter(.data$survivalTime >= x) %>%
- dplyr::tally() %>%
+ dplyr::tally() %>%
dplyr::select("n")
}
)
)
-
+
kmObservationFull$probastId <- probastId
- kmObservationFull$inputType <- '10-year'
-
+ kmObservationFull$inputType <- "10-year"
+
diagnosticFull <- rbind(kmObservation, kmObservationFull)
-
-
+
+
# 2.3.1
# cov end_date <=0
- probastId <- '2.3'
- if(getMaxEndDaysFromCovariates(plpData$metaData$covariateSettings) <= 0){
-
- if(getMaxEndDaysFromCovariates(plpData$metaData$covariateSettings) < 0){
+ probastId <- "2.3"
+ if (getMaxEndDaysFromCovariates(plpData$metaData$covariateSettings) <= 0) {
+ if (getMaxEndDaysFromCovariates(plpData$metaData$covariateSettings) < 0) {
diagnosticAggregate <- rbind(
diagnosticAggregate,
- c(probastId, 'Pass')
+ c(probastId, "Pass")
)
- }
- else{
+ } else {
diagnosticAggregate <- rbind(
diagnosticAggregate,
- c(probastId, 'Unknown')
+ c(probastId, "Unknown")
)
}
- } else{
+ } else {
diagnosticAggregate <- rbind(
diagnosticAggregate,
- c(probastId, 'Fail')
+ c(probastId, "Fail")
)
}
-
- if(!is.null(dim(diagnosticAggregate))){
- diagnosticAggregate <- as.data.frame(diagnosticAggregate) %>%
+
+ if (!is.null(dim(diagnosticAggregate))) {
+ diagnosticAggregate <- as.data.frame(diagnosticAggregate) %>%
dplyr::mutate_if(is.factor, as.character)
- colnames(diagnosticAggregate) <- c('probastId','resultValue')
+ colnames(diagnosticAggregate) <- c("probastId", "resultValue")
}
-
+
return(
list
(
@@ -679,102 +668,99 @@ probastPredictors <- function(
diagnosticPredictorsAggregate = diagnosticAggregate
)
)
-
}
probastOutcome <- function(
- plpData,
- outcomeId,
- populationSettings
-){
-
+ plpData,
+ outcomeId,
+ populationSettings) {
diagnosticAggregate <- c()
diagnosticFull <- c()
-
+
# true due to plp
- probastId <- '3.4'
+ probastId <- "3.4"
diagnosticAggregate <- rbind(
diagnosticAggregate,
- c(probastId, 'Pass')
+ c(probastId, "Pass")
)
-
+
# 3.5 - check the outcome definition doesn't use things before index?
-
+
# 3.6 - check tar after covariate end_days
- probastId <- '3.6'
- if(populationSettings$startAnchor == 'cohort start'){
- if(populationSettings$riskWindowStart > getMaxEndDaysFromCovariates(plpData$metaData$covariateSettings)){
+ probastId <- "3.6"
+ if (populationSettings$startAnchor == "cohort start") {
+ if (populationSettings$riskWindowStart > getMaxEndDaysFromCovariates(plpData$metaData$covariateSettings)) {
diagnosticAggregate <- rbind(
diagnosticAggregate,
- c(probastId, 'Pass')
+ c(probastId, "Pass")
)
- } else{
+ } else {
diagnosticAggregate <- rbind(
diagnosticAggregate,
- c(probastId, 'Fail')
+ c(probastId, "Fail")
)
}
- } else{
+ } else {
diagnosticAggregate <- rbind(
diagnosticAggregate,
- c(probastId, 'Unknown')
+ c(probastId, "Unknown")
)
}
-
+
# 3.1.1 - check the outcome rate per gender/age/index year
- probastId <- '3.1.1'
-
+ probastId <- "3.1.1"
+
# all cohort vs pop
pop <- PatientLevelPrediction::createStudyPopulation(
- plpData = plpData,
- outcomeId = outcomeId,
+ plpData = plpData,
+ outcomeId = outcomeId,
populationSettings = populationSettings
)
-
+
popSum <- getOutcomeSummary(
- type = 'population',
+ type = "population",
population = pop
)
-
+
cohort <- PatientLevelPrediction::createStudyPopulation(
- plpData = plpData,
- outcomeId = outcomeId,
+ plpData = plpData,
+ outcomeId = outcomeId,
populationSettings = PatientLevelPrediction::createStudyPopulationSettings(
- includeAllOutcomes = F,
- firstExposureOnly = F,
- washoutPeriod = 0,
- removeSubjectsWithPriorOutcome = F,
- priorOutcomeLookback = 0,
- requireTimeAtRisk = F,
- minTimeAtRisk = 0,
- riskWindowStart = populationSettings$riskWindowStart,
- startAnchor = populationSettings$startAnchor,
- riskWindowEnd = populationSettings$riskWindowEnd,
+ includeAllOutcomes = FALSE,
+ firstExposureOnly = FALSE,
+ washoutPeriod = 0,
+ removeSubjectsWithPriorOutcome = FALSE,
+ priorOutcomeLookback = 0,
+ requireTimeAtRisk = FALSE,
+ minTimeAtRisk = 0,
+ riskWindowStart = populationSettings$riskWindowStart,
+ startAnchor = populationSettings$startAnchor,
+ riskWindowEnd = populationSettings$riskWindowEnd,
endAnchor = populationSettings$endAnchor
)
)
-
+
cohortSum <- getOutcomeSummary(
- type = 'cohort',
+ type = "cohort",
population = cohort
)
-
-
-
- #dayOfEvent, outcomeAtTime, observedAtStartOfDay
+
+ # dayOfEvent, outcomeAtTime, observedAtStartOfDay
diagnosticFull <- rbind(
do.call(rbind, popSum),
do.call(rbind, cohortSum)
)
-
- if(!is.null(dim(diagnosticAggregate))){
- diagnosticAggregate <- as.data.frame(diagnosticAggregate) %>%
+
+ if (!is.null(dim(diagnosticAggregate))) {
+ diagnosticAggregate <- as.data.frame(diagnosticAggregate) %>%
dplyr::mutate_if(is.factor, as.character)
- colnames(diagnosticAggregate) <- c('probastId','resultValue')
+ colnames(diagnosticAggregate) <- c("probastId", "resultValue")
}
-
+
return(
list
(
@@ -782,152 +768,145 @@ probastOutcome <- function(
diagnosticOutcomeAggregate = diagnosticAggregate
)
)
-
}
getOutcomeSummary <- function(
- type = 'population',
- population
-){
-
+ type = "population",
+ population) {
res <- list()
length(res) <- 4
- probastId <- '3'
-
- res[[1]] <- population %>%
+ probastId <- "3"
+
+ res[[1]] <- population %>%
dplyr::group_by(.data$ageYear) %>%
- dplyr::summarise(outcomePercent = sum(.data$outcomeCount>0)/length(.data$outcomeCount)) %>%
+ dplyr::summarise(outcomePercent = sum(.data$outcomeCount > 0) / length(.data$outcomeCount)) %>%
dplyr::mutate(
probastId = probastId,
- aggregation = 'age',
+ aggregation = "age",
inputType = type
- ) %>%
+ ) %>%
dplyr::rename(xvalue = "ageYear")
-
- res[[2]] <- population %>%
+
+ res[[2]] <- population %>%
dplyr::group_by(.data$gender) %>%
- dplyr::summarise(outcomePercent = sum(.data$outcomeCount>0)/length(.data$outcomeCount)) %>%
+ dplyr::summarise(outcomePercent = sum(.data$outcomeCount > 0) / length(.data$outcomeCount)) %>%
dplyr::mutate(
probastId = probastId,
- aggregation = 'gender',
+ aggregation = "gender",
inputType = type
- )%>%
+ ) %>%
dplyr::rename(xvalue = "gender")
-
- res[[3]] <- population %>%
+
+ res[[3]] <- population %>%
dplyr::mutate(
- year = substring(.data$cohortStartDate,1,4)
+ year = substring(.data$cohortStartDate, 1, 4)
) %>%
dplyr::group_by(.data$year) %>%
- dplyr::summarise(outcomePercent = sum(.data$outcomeCount>0)/length(.data$outcomeCount)) %>%
+ dplyr::summarise(outcomePercent = sum(.data$outcomeCount > 0) / length(.data$outcomeCount)) %>%
dplyr::mutate(
probastId = probastId,
- aggregation = 'year',
+ aggregation = "year",
inputType = type
- ) %>%
+ ) %>%
dplyr::rename(xvalue = "year")
-
- res[[4]] <- population %>%
+
+ res[[4]] <- population %>%
dplyr::mutate(
- year = substring(.data$cohortStartDate,6,7)
+      year = substring(.data$cohortStartDate, 6, 7) # month of cohort start, despite the column name
) %>%
dplyr::group_by(.data$year) %>%
- dplyr::summarise(outcomePercent = sum(.data$outcomeCount>0)/length(.data$outcomeCount)) %>%
+ dplyr::summarise(outcomePercent = sum(.data$outcomeCount > 0) / length(.data$outcomeCount)) %>%
dplyr::mutate(
probastId = probastId,
- aggregation = 'month',
+ aggregation = "month",
inputType = type
- ) %>%
+ ) %>%
dplyr::rename(xvalue = "year")
-
+
return(res)
}
-cos_sim <- function(a,b)
-{
- return( sum(a*b)/sqrt(sum(a^2)*sum(b^2)) )
-}
+cos_sim <- function(a, b) {
+ return(sum(a * b) / sqrt(sum(a^2) * sum(b^2)))
+}
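
cos_sim is used below to compare the characteristic vectors of two candidate populations; for near-identical vectors it returns a value close to 1:

    a <- c(10.0, 40.2, 65.0, 52.3) # e.g. outcomePercent, meanAge, maxAge, malePercent
    b <- c(11.5, 39.8, 64.0, 51.9)
    cos_sim(a, b) # ~0.999: the two populations barely differ
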
getDiagnostic <- function(
- probastId,
- plpData,
- outcomeId,
- populationSettings,
- var,
- defaultValue = 0
-){
- ParallelLogger::logInfo(paste0('Diagnosing impact of ',var,' in populationSettings'))
-
+ probastId,
+ plpData,
+ outcomeId,
+ populationSettings,
+ var,
+ defaultValue = 0) {
+ ParallelLogger::logInfo(paste0("Diagnosing impact of ", var, " in populationSettings"))
+
populationSettingsCheck <- populationSettings
populationSettingsCheck[var] <- defaultValue
-
+
pop <- PatientLevelPrediction::createStudyPopulation(
- plpData = plpData,
- outcomeId = outcomeId,
+ plpData = plpData,
+ outcomeId = outcomeId,
populationSettings = populationSettings
)
-
+
popCheck <- PatientLevelPrediction::createStudyPopulation(
- plpData = plpData,
- outcomeId = outcomeId,
- populationSettings = populationSettingsCheck
+ plpData = plpData,
+ outcomeId = outcomeId,
+ populationSettings = populationSettingsCheck
)
-
- #compare the populations:
+
+ # compare the populations:
diag <- rbind(
data.frame(
probastId = probastId,
- design = paste0(var,': ', defaultValue),
+ design = paste0(var, ": ", defaultValue),
metric = c(
- 'N', 'outcomePercent', 'minAge',
- 'meanAge', 'medianAge', 'maxAge',
- 'malePercent'
+ "N", "outcomePercent", "minAge",
+ "meanAge", "medianAge", "maxAge",
+ "malePercent"
),
value = c(
- nrow(popCheck), sum(popCheck$outcomeCount>0)/nrow(popCheck)*100, min(popCheck$ageYear),
+ nrow(popCheck), sum(popCheck$outcomeCount > 0) / nrow(popCheck) * 100, min(popCheck$ageYear),
mean(popCheck$ageYear), stats::median(popCheck$ageYear), max(popCheck$ageYear),
- sum(popCheck$gender == 8507)/nrow(popCheck)*100
+ sum(popCheck$gender == 8507) / nrow(popCheck) * 100
)
),
data.frame(
probastId = probastId,
- design = paste0(var,': ',populationSettings[var]),
+ design = paste0(var, ": ", populationSettings[var]),
metric = c(
- 'N', 'outcomePercent', 'minAge',
- 'meanAge', 'medianAge', 'maxAge',
- 'malePercent'
+ "N", "outcomePercent", "minAge",
+ "meanAge", "medianAge", "maxAge",
+ "malePercent"
),
value = c(
- nrow(pop), sum(pop$outcomeCount>0)/nrow(pop)*100, min(pop$ageYear),
+ nrow(pop), sum(pop$outcomeCount > 0) / nrow(pop) * 100, min(pop$ageYear),
mean(pop$ageYear), stats::median(pop$ageYear), max(pop$ageYear),
- sum(pop$gender == 8507)/nrow(pop)*100
+ sum(pop$gender == 8507) / nrow(pop) * 100
)
)
)
-
+
diagSim <- cos_sim(
- diag %>%
+ diag %>%
dplyr::filter(.data$design == unique(diag$design)[1]) %>%
- dplyr::filter(.data$metric != 'N') %>%
+ dplyr::filter(.data$metric != "N") %>%
dplyr::arrange(.data$metric) %>%
- dplyr::select("value")
- ,
- diag %>%
+ dplyr::select("value"),
+ diag %>%
dplyr::filter(.data$design == unique(diag$design)[2]) %>%
- dplyr::filter(.data$metric != 'N') %>%
+ dplyr::filter(.data$metric != "N") %>%
dplyr::arrange(.data$metric) %>%
dplyr::select("value")
)
-
-
+
+
return(
list(
diagnosticAggregate = c(
- probastId,
+ probastId,
diagSim
),
diagnosticFull = diag
)
)
}
-
diff --git a/R/EvaluatePlp.R b/R/EvaluatePlp.R
index 20f1cbe8e..8ab987618 100644
--- a/R/EvaluatePlp.R
+++ b/R/EvaluatePlp.R
@@ -23,93 +23,113 @@
#' @details
#' The function calculates various metrics to measure the performance of the model
#' @param prediction The patient level prediction model's prediction
-#' @param typeColumn The column name in the prediction object that is used to
+#' @param typeColumn The column name in the prediction object that is used to
#' stratify the evaluation
#' @return
#' A list containing the performance values
#'
#' @export
-evaluatePlp <- function(prediction, typeColumn = 'evaluationType'){
-
+evaluatePlp <- function(prediction, typeColumn = "evaluationType") {
+ start <- Sys.time()
# checking inputs
- #========================================
+ # ========================================
modelType <- attr(prediction, "metaData")$modelType
-
+
# could remove the bit below to let people add custom types (but currently
- # we are thinking this should be set - so people should add a new type
+ # we are thinking this should be set - so people should add a new type
# evaluation into the package rather than use custom
- if (!modelType %in% c("binary","survival")) {
- stop('Currently only support binary or survival classification models')
+ if (!modelType %in% c("binary", "survival")) {
+ stop("Currently only support binary or survival classification models")
}
-
- if(is.null(prediction$outcomeCount)){
- stop('No outcomeCount column present')
+
+ if (is.null(prediction$outcomeCount)) {
+ stop("No outcomeCount column present")
}
- if(length(unique(prediction$value))==1){
- stop('Cannot evaluate as predictions all the same value')
+ if (length(unique(prediction$value)) == 1) {
+ stop("Cannot evaluate as predictions all the same value")
}
-
+
# 1) evaluationSummary
- ParallelLogger::logTrace(paste0('Calulating evaluation summary Started @ ',Sys.time()))
+ ParallelLogger::logTrace(paste0("Calulating evaluation summary Started @ ", Sys.time()))
evaluationStatistics <- getEvaluationStatistics(
- prediction = prediction,
+ prediction = prediction,
predictionType = modelType,
typeColumn = typeColumn
)
-
+
# 2) thresholdSummary
# need to update thresholdSummary this with all the requested values
- ParallelLogger::logTrace(paste0('Calulating threshold summary Started @ ',Sys.time()))
- thresholdSummary <- tryCatch({
- getThresholdSummary(
- prediction = prediction,
- predictionType = modelType,
- typeColumn = typeColumn
- )
- },
- error = function(e){ParallelLogger::logInfo('getThresholdSummary error');ParallelLogger::logInfo(e);return(NULL)}
+ ParallelLogger::logTrace(paste0("Calulating threshold summary Started @ ", Sys.time()))
+ thresholdSummary <- tryCatch(
+ {
+ getThresholdSummary(
+ prediction = prediction,
+ predictionType = modelType,
+ typeColumn = typeColumn
+ )
+ },
+ error = function(e) {
+ ParallelLogger::logInfo("getThresholdSummary error")
+ ParallelLogger::logInfo(e)
+ return(NULL)
+ }
)
-
+
# 3) demographicSummary
- ParallelLogger::logTrace(paste0('Calulating Demographic Based Evaluation Started @ ',Sys.time()))
- demographicSummary <- tryCatch({
- getDemographicSummary(
- prediction = prediction,
- predictionType = modelType,
- typeColumn = typeColumn
+ ParallelLogger::logTrace(paste0("Calulating Demographic Based Evaluation Started @ ", Sys.time()))
+ demographicSummary <- tryCatch(
+ {
+ getDemographicSummary(
+ prediction = prediction,
+ predictionType = modelType,
+ typeColumn = typeColumn
)
},
- error = function(e){ParallelLogger::logInfo('getDemographicSummary error');ParallelLogger::logInfo(e);return(NULL)}
+ error = function(e) {
+ ParallelLogger::logInfo("getDemographicSummary error")
+ ParallelLogger::logInfo(e)
+ return(NULL)
+ }
)
-
+
# 4) calibrationSummary
- ParallelLogger::logTrace(paste0('Calculating Calibration Summary Started @ ',Sys.time()))
- calibrationSummary <- tryCatch({
- getCalibrationSummary(
- prediction = prediction,
- predictionType = modelType,
- typeColumn = typeColumn,
- numberOfStrata = 100,
- truncateFraction = 0.01
- )
- },
- error = function(e){ParallelLogger::logInfo('getCalibrationSummary error');ParallelLogger::logInfo(e);return(NULL)}
+ ParallelLogger::logTrace(paste0("Calculating Calibration Summary Started @ ", Sys.time()))
+ calibrationSummary <- tryCatch(
+ {
+ getCalibrationSummary(
+ prediction = prediction,
+ predictionType = modelType,
+ typeColumn = typeColumn,
+ numberOfStrata = 100,
+ truncateFraction = 0.01
+ )
+ },
+ error = function(e) {
+ ParallelLogger::logInfo("getCalibrationSummary error")
+ ParallelLogger::logInfo(e)
+ return(NULL)
+ }
)
-
-
+
+
# 5) predictionDistribution - done
- ParallelLogger::logTrace(paste0('Calculating Quantiles Started @ ',Sys.time()))
- predictionDistribution <- tryCatch({
- getPredictionDistribution(
- prediction = prediction,
- predictionType = modelType,
- typeColumn = typeColumn
- )
- },
- error = function(e){ParallelLogger::logInfo('getPredictionDistribution error');ParallelLogger::logInfo(e);return(NULL)}
+ ParallelLogger::logTrace(paste0("Calculating Quantiles Started @ ", Sys.time()))
+ predictionDistribution <- tryCatch(
+ {
+ getPredictionDistribution(
+ prediction = prediction,
+ predictionType = modelType,
+ typeColumn = typeColumn
+ )
+ },
+ error = function(e) {
+ ParallelLogger::logInfo("getPredictionDistribution error")
+ ParallelLogger::logInfo(e)
+ return(NULL)
+ }
)
-
+
result <- list(
evaluationStatistics = evaluationStatistics,
thresholdSummary = thresholdSummary,
@@ -118,10 +138,11 @@ evaluatePlp <- function(prediction, typeColumn = 'evaluationType'){
predictionDistribution = predictionDistribution
)
- class(result) <- 'plpEvaluation'
-
+ class(result) <- "plpEvaluation"
+ delta <- Sys.time() - start
+ ParallelLogger::logInfo("Time to calculate evaluation metrics: ",
+ signif(delta, 3), " ", attr(delta, "units"))
return(result)
-
}
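
A minimal sketch of a prediction object the binary path accepts (the metaData attribute is required, as checked above; real prediction objects from this package carry extra columns such as ageYear and gender, without which the demographic summary falls back to NULL via its tryCatch):

    set.seed(42)
    prediction <- data.frame(
      rowId = 1:1000,
      value = runif(1000),                 # predicted risks
      outcomeCount = rbinom(1000, 1, 0.1), # observed outcomes
      evaluationType = rep(c("Train", "Test"), each = 500)
    )
    attr(prediction, "metaData") <- list(modelType = "binary")
    performance <- evaluatePlp(prediction, typeColumn = "evaluationType")
    performance$evaluationStatistics # AUROC, AUPRC, Brier score, Eavg, ... per type
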
@@ -129,27 +150,27 @@ evaluatePlp <- function(prediction, typeColumn = 'evaluationType'){
#' Calculate the model-based concordance, which is a calculation of the expected discrimination performance of a model under the assumption the model predicts the "TRUE" outcome
#' as detailed in van Klaveren et al. https://pubmed.ncbi.nlm.nih.gov/27251001/
-#'
+#'
#' @details
#' Calculate the model-based concordance
#'
#' @param prediction the prediction object found in the plpResult object
-#'
+#'
#' @return
#' model-based concordance value
#'
#' @export
-modelBasedConcordance <- function(prediction){
- if (!length(prediction$value >0)){
+modelBasedConcordance <- function(prediction) {
+  if (length(prediction$value) == 0) {
stop("Prediction object not found")
}
prediction <- prediction$value
- n<-length(prediction)
- ord<-order(prediction)
- prediction<-prediction[ord]
- q.hat<-1-prediction
- V1<-(prediction*(cumsum(q.hat)-q.hat)+q.hat*(sum(prediction)-cumsum(prediction)))/(n-1)
- V2<-(prediction*(sum(q.hat)-q.hat)+q.hat*(sum(prediction)-prediction))/(n-1)
- mb.c<-sum(V1)/sum(V2)
+ n <- length(prediction)
+ ord <- order(prediction)
+ prediction <- prediction[ord]
+ q.hat <- 1 - prediction
+ V1 <- (prediction * (cumsum(q.hat) - q.hat) + q.hat * (sum(prediction) - cumsum(prediction))) / (n - 1)
+ V2 <- (prediction * (sum(q.hat) - q.hat) + q.hat * (sum(prediction) - prediction)) / (n - 1)
+ mb.c <- sum(V1) / sum(V2)
return(mb.c)
}
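
Because the model-based concordance is computed from the predicted risks alone, under the assumption that they equal the true risks, it needs no observed outcomes; it estimates the AUROC the model would attain if it were perfectly calibrated:

    set.seed(7)
    p <- runif(5000, 0.01, 0.30)                 # predicted risks only
    modelBasedConcordance(data.frame(value = p)) # expected c-statistic under calibration
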
diff --git a/R/EvaluationSummary.R b/R/EvaluationSummary.R
index 34924fd7a..3404c295b 100644
--- a/R/EvaluationSummary.R
+++ b/R/EvaluationSummary.R
@@ -1,236 +1,249 @@
getEvaluationStatistics <- function(
- prediction,
- predictionType,
- typeColumn = 'evaluation'
-){
-
+ prediction,
+ predictionType,
+ typeColumn = "evaluation") {
evaluation <- do.call(
- what = paste0('getEvaluationStatistics_', predictionType),
+ what = paste0("getEvaluationStatistics_", predictionType),
args = list(
- prediction = prediction,
+ prediction = prediction,
evalColumn = typeColumn,
- timepoint = attr(prediction, 'metaData')$timepoint
+ timepoint = attr(prediction, "metaData")$timepoint
)
)
-
+
return(evaluation)
}
# get all the standard metrics for a given evaluation type
# function to calculate evaluation summary data.frame with columns: evaluation, metric, value
-getEvaluationStatistics_binary <- function(prediction, evalColumn, ...){
-
+getEvaluationStatistics_binary <- function(prediction, evalColumn, ...) {
result <- c()
- evalTypes <- unique(as.data.frame(prediction)[,evalColumn])
+ evalTypes <- unique(as.data.frame(prediction)[, evalColumn])
- for(evalType in evalTypes){
-
+ for (evalType in evalTypes) {
predictionOfInterest <- prediction %>% dplyr::filter(.data[[evalColumn]] == evalType)
-
+
result <- rbind(
result,
- c(evalType, 'populationSize', nrow(predictionOfInterest)),
- c(evalType, 'outcomeCount', sum(predictionOfInterest$outcomeCount))
+ c(evalType, "populationSize", nrow(predictionOfInterest)),
+ c(evalType, "outcomeCount", sum(predictionOfInterest$outcomeCount))
)
-
+
# auc
- ParallelLogger::logInfo(paste0('Calculating Performance for ', evalType))
- ParallelLogger::logInfo('=============')
-
- ParallelLogger::logTrace('Calculating AUC')
- auc <- computeAuc(predictionOfInterest, confidenceInterval = T)
-
+ ParallelLogger::logInfo(paste0("Calculating Performance for ", evalType))
+ ParallelLogger::logInfo("=============")
+
+ ParallelLogger::logTrace("Calculating AUC")
+ auc <- computeAuc(predictionOfInterest, confidenceInterval = TRUE)
+
result <- rbind(
- result,
- c(evalType, 'AUROC', auc[1]),
- c(evalType, '95% lower AUROC', auc[2]),
- c(evalType, '95% upper AUROC', auc[3])
+ result,
+ c(evalType, "AUROC", auc[1]),
+ c(evalType, "95% lower AUROC", auc[2]),
+ c(evalType, "95% upper AUROC", auc[3])
)
- ParallelLogger::logInfo(sprintf('%-20s%.2f', 'AUC', auc[1]*100))
- ParallelLogger::logInfo(sprintf('%-20s%.2f', '95% lower AUC: ', auc[2]*100))
- ParallelLogger::logInfo(sprintf('%-20s%.2f', '95% upper AUC: ', auc[3]*100))
-
+ ParallelLogger::logInfo(sprintf("%-20s%.2f", "AUC", auc[1] * 100))
+ ParallelLogger::logInfo(sprintf("%-20s%.2f", "95% lower AUC: ", auc[2] * 100))
+ ParallelLogger::logInfo(sprintf("%-20s%.2f", "95% upper AUC: ", auc[3] * 100))
+
# auprc
- ParallelLogger::logTrace('Calculating AUPRC')
+ ParallelLogger::logTrace("Calculating AUPRC")
positive <- predictionOfInterest$value[predictionOfInterest$outcomeCount == 1]
negative <- predictionOfInterest$value[predictionOfInterest$outcomeCount == 0]
pr <- PRROC::pr.curve(scores.class0 = positive, scores.class1 = negative)
auprc <- pr$auc.integral
result <- rbind(
- result,
- c(evalType, 'AUPRC', auprc)
+ result,
+ c(evalType, "AUPRC", auprc)
)
- ParallelLogger::logInfo(sprintf('%-20s%.2f', 'AUPRC: ', auprc*100))
-
+ ParallelLogger::logInfo(sprintf("%-20s%.2f", "AUPRC: ", auprc * 100))
+
# brier scores - returns: brier, brierScaled
- ParallelLogger::logTrace('Calculating Brier Score')
+ ParallelLogger::logTrace("Calculating Brier Score")
brier <- brierScore(predictionOfInterest)
result <- rbind(
- result,
- c(evalType, 'brier score', brier$brier),
- c(evalType, 'brier score scaled', brier$brierScaled)
+ result,
+ c(evalType, "brier score", brier$brier),
+ c(evalType, "brier score scaled", brier$brierScaled)
)
- ParallelLogger::logInfo(sprintf('%-20s%.2f', 'Brier: ', brier$brier))
+ ParallelLogger::logInfo(sprintf("%-20s%.2f", "Brier: ", brier$brier))
+
-
# e-statistics (Eavg/E90/Emax) from a smoothed calibration curve, in the style of rms::val.prob
- indValProb <- predictionOfInterest$value>0 & predictionOfInterest$value < 1
+ indValProb <- predictionOfInterest$value > 0 & predictionOfInterest$value < 1
valProb <- tryCatch(
calculateEStatisticsBinary(prediction = predictionOfInterest[indValProb, ]),
error = function(e) {
- ParallelLogger::logInfo(e); return(
+ ParallelLogger::logInfo(e)
+ return(
c(
- Eavg = 0,
- E90 = 0,
+ Eavg = 0,
+ E90 = 0,
Emax = 0
)
)
}
)
result <- rbind(
- result,
- c(evalType, 'Eavg', valProb['Eavg']),
- c(evalType, 'E90', valProb['E90']),
- c(evalType, 'Emax', valProb['Emax'])
+ result,
+ c(evalType, "Eavg", valProb["Eavg"]),
+ c(evalType, "E90", valProb["E90"]),
+ c(evalType, "Emax", valProb["Emax"])
)
- ParallelLogger::logInfo(sprintf('%-20s%.2f', 'Eavg: ', round(valProb['Eavg'], digits = 4)))
+ ParallelLogger::logInfo(sprintf("%-20s%.2f", "Eavg: ", round(valProb["Eavg"], digits = 4)))
+
+
-
-
# Removing for now as too slow...
- #ici <- ici(prediction)
- #result <- rbind(
- # result,
+ # ici <- ici(prediction)
+ # result <- rbind(
+ # result,
# c(evalType, 'ici', ifelse(is.null(ici), 'NA', ici))
- #)
- #ParallelLogger::logInfo(paste0('ICI ', round(ifelse(is.null(ici), 'NA', ici), digits = 4)))
-
-
+ # )
+ # ParallelLogger::logInfo(paste0('ICI ', round(ifelse(is.null(ici), 'NA', ici), digits = 4)))
+
+
# calibration linear fit- returns gradient, intercept
- ParallelLogger::logTrace('Calculating Calibration-in-large')
+ ParallelLogger::logTrace("Calculating Calibration-in-large")
calinlarge <- calibrationInLarge(predictionOfInterest)
result <- rbind(
- result,
- c(evalType, 'calibrationInLarge mean prediction', calinlarge$meanPredictionRisk),
- c(evalType, 'calibrationInLarge observed risk', calinlarge$observedRisk)
+ result,
+ c(evalType, "calibrationInLarge mean prediction", calinlarge$meanPredictionRisk),
+ c(evalType, "calibrationInLarge observed risk", calinlarge$observedRisk)
)
- ParallelLogger::logInfo(paste0('Calibration in large- Mean predicted risk ', round(calinlarge$meanPredictionRisk, digits = 4), ' : observed risk ',round(calinlarge$observedRisk, digits = 4)))
-
+ ParallelLogger::logInfo(paste0("Calibration in large- Mean predicted risk ", round(calinlarge$meanPredictionRisk, digits = 4), " : observed risk ", round(calinlarge$observedRisk, digits = 4)))
+
calinlargeInt <- calibrationInLargeIntercept(predictionOfInterest)
result <- rbind(
- result,
- c(evalType, 'calibrationInLarge intercept', calinlargeInt)
+ result,
+ c(evalType, "calibrationInLarge intercept", calinlargeInt)
)
- ParallelLogger::logInfo(paste0('Calibration in large- Intercept ', round(calinlargeInt, digits = 4)))
-
-
- ParallelLogger::logTrace('Calculating Weak Calibration')
+ ParallelLogger::logInfo(paste0("Calibration in large- Intercept ", round(calinlargeInt, digits = 4)))
+
+
+ ParallelLogger::logTrace("Calculating Weak Calibration")
weakCal <- calibrationWeak(predictionOfInterest)
result <- rbind(
- result,
- c(evalType, 'weak calibration intercept', weakCal$intercept),
- c(evalType, 'weak calibration gradient', weakCal$gradient)
+ result,
+ c(evalType, "weak calibration intercept", weakCal$intercept),
+ c(evalType, "weak calibration gradient", weakCal$gradient)
)
- ParallelLogger::logInfo(paste0('Weak calibration intercept: ',
- round(weakCal$intercept, digits = 4),
- ' - gradient:',round(weakCal$gradient, digits = 4)))
-
- ParallelLogger::logTrace('Calculating Hosmer-Lemeshow Calibration Line')
+ ParallelLogger::logInfo(paste0(
+ "Weak calibration intercept: ",
+ round(weakCal$intercept, digits = 4),
+ " - gradient:", round(weakCal$gradient, digits = 4)
+ ))
+
+ ParallelLogger::logTrace("Calculating Hosmer-Lemeshow Calibration Line")
calLine10 <- calibrationLine(predictionOfInterest, numberOfStrata = 10)
result <- rbind(
- result,
- c(evalType, 'Hosmer-Lemeshow calibration intercept', calLine10$lm[1]),
- c(evalType, 'Hosmer-Lemeshow calibration gradient', calLine10$lm[2])
+ result,
+ c(evalType, "Hosmer-Lemeshow calibration intercept", calLine10$lm[1]),
+ c(evalType, "Hosmer-Lemeshow calibration gradient", calLine10$lm[2])
)
- ParallelLogger::logInfo(sprintf('%-20s%.2f%-20s%.2f', 'Hosmer-Lemeshow calibration gradient: ', calLine10$lm[2], ' intercept: ',calLine10$lm[1]))
-
+ ParallelLogger::logInfo(sprintf("%-20s%.2f%-20s%.2f", "Hosmer-Lemeshow calibration gradient: ", calLine10$lm[2], " intercept: ", calLine10$lm[1]))
+
# Extra: Average Precision
aveP.val <- averagePrecision(predictionOfInterest)
result <- rbind(
- result,
- c(evalType, 'Average Precision', aveP.val)
+ result,
+ c(evalType, "Average Precision", aveP.val)
)
- ParallelLogger::logInfo(sprintf('%-20s%.2f', 'Average Precision: ', aveP.val))
-
+ ParallelLogger::logInfo(sprintf("%-20s%.2f", "Average Precision: ", aveP.val))
}
-
+
result <- as.data.frame(result)
- colnames(result) <- c('evaluation','metric','value')
-
+ colnames(result) <- c("evaluation", "metric", "value")
+
return(result)
}
-getEvaluationStatistics_survival <- function(prediction, evalColumn, timepoint, ...){
-
- if(is.null(prediction$survivalTime)){
- stop('No survival time column present')
+getEvaluationStatistics_survival <- function(prediction, evalColumn, timepoint, ...) {
+ if (is.null(prediction$survivalTime)) {
+ stop("No survival time column present")
}
-
+ rlang::check_installed(
+ pkg = c("survival", "polspline"),
+ reason = "This function requires these package to be installed"
+ )
+
result <- c()
- evalTypes <- unique(as.data.frame(prediction)[,evalColumn])
-
- for(evalType in evalTypes){
-
+ evalTypes <- unique(as.data.frame(prediction)[, evalColumn])
+
+ for (evalType in evalTypes) {
predictionOfInterest <- prediction %>% dplyr::filter(.data[[evalColumn]] == evalType)
-
+
result <- rbind(
result,
- c(evalType, timepoint, 'populationSize', nrow(predictionOfInterest)),
- c(evalType, timepoint, 'outcomeCount', sum(predictionOfInterest$outcomeCount))
+ c(evalType, timepoint, "populationSize", nrow(predictionOfInterest)),
+ c(evalType, timepoint, "outcomeCount", sum(predictionOfInterest$outcomeCount))
)
-
- #============================
-
- ##timepoint <- attr(prediction, 'metaData')$timepoint #max(prediction$survivalTime)
- ParallelLogger::logInfo(paste0('Evaluating survival model at time: ', timepoint, ' days'))
-
+
+ # ============================
+
+ ParallelLogger::logInfo(paste0("Evaluating survival model at time: ", timepoint, " days"))
+
t <- predictionOfInterest$survivalTime
y <- ifelse(predictionOfInterest$outcomeCount > 0, 1, 0)
-
- S <- survival::Surv(t, y)
+
+ S <- survival::Surv(t, y)
p <- predictionOfInterest$value
-
- out <- tryCatch({summary(survival::survfit(survival::Surv(t, y) ~ 1), times = timepoint)},
- error = function(e){ParallelLogger::logError(e); return(NULL)})
- survVal <- 1-out$surv
+
+ out <- tryCatch(
+ {
+ summary(survival::survfit(survival::Surv(t, y) ~ 1), times = timepoint)
+ },
+ error = function(e) {
+ ParallelLogger::logError(e)
+ return(NULL)
+ }
+ )
+ survVal <- 1 - out$surv
meanSurvivalTime <- mean(t)
-
+
result <- rbind(
- result,
- c(evalType, timepoint, 'Survival', survVal),
- c(evalType, timepoint, 'Mean survival time', meanSurvivalTime)
+ result,
+ c(evalType, timepoint, "Survival", survVal),
+ c(evalType, timepoint, "Mean survival time", meanSurvivalTime)
)
-
+
# add c-stat
- ParallelLogger::logTrace('Calculating C-statistic')
-
- conc <- tryCatch({survival::concordance(S~p, reverse=TRUE)},
- error = function(e){ParallelLogger::logError(e); return(NULL)})
+ ParallelLogger::logTrace("Calculating C-statistic")
+
+ conc <- tryCatch(
+ {
+ survival::concordance(S ~ p, reverse = TRUE)
+ },
+ error = function(e) {
+ ParallelLogger::logError(e)
+ return(NULL)
+ }
+ )
cStatistic <- 0
cStatistic_l95CI <- 0
cStatistic_u95CI <- 0
-
- if(!is.null(conc)){
- cStatistic <- round(conc$concordance,5)
+
+ if (!is.null(conc)) {
+ cStatistic <- round(conc$concordance, 5)
c.se <- sqrt(conc$var)
- cStatistic_l95CI <- round(conc$concordance+stats::qnorm(.025)*c.se,3)
- cStatistic_u95CI <- round(conc$concordance+stats::qnorm(.975)*c.se,3)
+ cStatistic_l95CI <- round(conc$concordance + stats::qnorm(.025) * c.se, 3)
+ cStatistic_u95CI <- round(conc$concordance + stats::qnorm(.975) * c.se, 3)
}
result <- rbind(
- result,
- c(evalType, timepoint, 'C-statistic', cStatistic),
- c(evalType, timepoint, 'C-statistic lower 95% CI', cStatistic_l95CI),
- c(evalType, timepoint, 'C-statistic upper 95% CI', cStatistic_u95CI)
+ result,
+ c(evalType, timepoint, "C-statistic", cStatistic),
+ c(evalType, timepoint, "C-statistic lower 95% CI", cStatistic_l95CI),
+ c(evalType, timepoint, "C-statistic upper 95% CI", cStatistic_u95CI)
)
- ParallelLogger::logInfo(paste0('C-statistic: ',cStatistic, ' (',cStatistic_l95CI ,'-', cStatistic_u95CI ,')'))
+ ParallelLogger::logInfo(paste0("C-statistic: ", cStatistic, " (", cStatistic_l95CI, "-", cStatistic_u95CI, ")"))
# add e-stat
.validateSurvival <- function(p, S, timepoint) {
estimatedSurvival <- 1 - p
notMissing <- !is.na(estimatedSurvival + S[, 1] + S[, 2])
- estimatedSurvival <- estimatedSurvival[notMissing]
+ estimatedSurvival <- estimatedSurvival[notMissing]
S <- S[notMissing, ]
.curtail <- function(x) pmin(.9999, pmax(x, .0001))
f <- polspline::hare(
@@ -250,35 +263,39 @@ getEvaluationStatistics_survival <- function(prediction, evalColumn, timepoint,
}
w <- tryCatch(
- {
- .validateSurvival(
- p = p,
- S = S,
- timepoint = timepoint
- )
- },
- error = function(e){ParallelLogger::logError(e); return(NULL)}
+ {
+ .validateSurvival(
+ p = p,
+ S = S,
+ timepoint = timepoint
+ )
+ },
+ error = function(e) {
+ ParallelLogger::logError(e)
+ return(NULL)
+ }
)
eStatistic <- eStatistic90 <- -1
if (!is.null(w)) {
- eStatistic <- mean(abs(w$actual - w$estimatedSurvival))
+ eStatistic <- mean(abs(w$actual - w$estimatedSurvival))
eStatistic90 <- stats::quantile(abs(w$actual - w$estimatedSurvival),
- probs = .9, na.rm = TRUE)
+ probs = .9, na.rm = TRUE
+ )
}
result <- rbind(
- result,
- c(evalType, timepoint, 'E-statistic', eStatistic),
- c(evalType, timepoint, 'E-statistic 90%', eStatistic90)
+ result,
+ c(evalType, timepoint, "E-statistic", eStatistic),
+ c(evalType, timepoint, "E-statistic 90%", eStatistic90)
)
- ParallelLogger::logInfo(paste0('E-statistic: ',eStatistic))
- ParallelLogger::logInfo(paste0('E-statistic 90%: ',eStatistic90))
+ ParallelLogger::logInfo(paste0("E-statistic: ", eStatistic))
+ ParallelLogger::logInfo(paste0("E-statistic 90%: ", eStatistic90))
}
-
+
result <- as.data.frame(result)
- colnames(result) <- c('evaluation','timepoint','metric','value')
-
+ colnames(result) <- c("evaluation", "timepoint", "metric", "value")
+
return(result)
}
@@ -286,7 +303,7 @@ getEvaluationStatistics_survival <- function(prediction, evalColumn, timepoint,
calculateEStatisticsBinary <- function(prediction) {
risk <- prediction$value
outcome <- prediction$outcomeCount
- notna <- ! is.na(risk + outcome)
+ notna <- !is.na(risk + outcome)
risk <- risk[notna]
outcome <- outcome[notna]
smoothFit <- stats::lowess(risk, outcome, iter = 0)
@@ -306,9 +323,9 @@ calculateEStatisticsBinary <- function(prediction) {
}
-#==================================
+# ==================================
# Functions for the summary
-#==================================
+# ==================================
#' Compute the area under the ROC curve
#'
@@ -321,11 +338,13 @@ calculateEStatisticsBinary <- function(prediction) {
#' @param confidenceInterval Should 95 percent confidence intervals be computed?
#'
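+#' @examples
+#' # A hedged sketch: assumes a prediction data.frame with `value` and
+#' # `outcomeCount` columns and a metaData attribute with modelType = "binary"
+#' set.seed(42)
+#' prediction <- data.frame(
+#'   value = runif(100),
+#'   outcomeCount = rbinom(100, 1, 0.3)
+#' )
+#' attr(prediction, "metaData") <- list(modelType = "binary")
+#' computeAuc(prediction)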
#' @export
-computeAuc <- function(prediction,
- confidenceInterval = FALSE) {
- if (attr(prediction, "metaData")$modelType != "binary")
+computeAuc <- function(
+ prediction,
+ confidenceInterval = FALSE) {
+ if (attr(prediction, "metaData")$modelType != "binary") {
stop("Computing AUC is only implemented for binary classification models")
-
+ }
+
if (confidenceInterval) {
return(aucWithCi(prediction = prediction$value, truth = prediction$outcomeCount))
} else {
@@ -333,14 +352,14 @@ computeAuc <- function(prediction,
}
}
-aucWithCi <- function(prediction, truth){
- auc <- pROC::auc(as.factor(truth), prediction, direction="<", quiet=TRUE)
- aucci <-pROC::ci(auc)
+aucWithCi <- function(prediction, truth) {
+ auc <- pROC::auc(as.factor(truth), prediction, direction = "<", quiet = TRUE)
+ aucci <- pROC::ci(auc)
return(data.frame(auc = aucci[2], auc_lb95ci = aucci[1], auc_ub95ci = aucci[3]))
}
-aucWithoutCi <- function(prediction, truth){
- auc <- pROC::auc(as.factor(truth), prediction, direction="<", quiet=TRUE)
+aucWithoutCi <- function(prediction, truth) {
+ auc <- pROC::auc(as.factor(truth), prediction, direction = "<", quiet = TRUE)
return(as.double(auc))
}
@@ -356,12 +375,11 @@ aucWithoutCi <- function(prediction, truth){
#' A list containing the brier score and the scaled brier score
#'
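+#' @examples
+#' # Minimal sketch on simulated labels and predicted risks
+#' set.seed(42)
+#' prediction <- data.frame(
+#'   value = runif(100),
+#'   outcomeCount = rbinom(100, 1, 0.1)
+#' )
+#' brierScore(prediction)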
#' @export
-brierScore <- function(prediction){
-
- brier <- sum((prediction$outcomeCount -prediction$value)^2)/nrow(prediction)
- brierMax <- mean(prediction$value)*(1-mean(prediction$value))
- brierScaled <- 1-brier/brierMax
- return(list(brier=brier,brierScaled=brierScaled))
+brierScore <- function(prediction) {
+ brier <- sum((prediction$outcomeCount - prediction$value)^2) / nrow(prediction)
+ brierMax <- mean(prediction$value) * (1 - mean(prediction$value))
+ brierScaled <- 1 - brier / brierMax
+ return(list(brier = brier, brierScaled = brierScaled))
}
#' calibrationLine
@@ -373,60 +391,59 @@ brierScore <- function(prediction){
#' Calculates the calibration from prediction object
#'
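+#' @examples
+#' # Sketch under assumed inputs: a prediction data.frame with rowId,
+#' # value and outcomeCount columns, stratified into deciles of risk
+#' set.seed(42)
+#' prediction <- data.frame(
+#'   rowId = 1:100,
+#'   value = runif(100),
+#'   outcomeCount = rbinom(100, 1, 0.5)
+#' )
+#' calibrationLine(prediction, numberOfStrata = 10)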
#' @export
-calibrationLine <- function(prediction,numberOfStrata=10){
+calibrationLine <- function(prediction, numberOfStrata = 10) {
outPpl <- unique(prediction$rowId)
-
- q <- unique(stats::quantile(prediction$value, c((1:(numberOfStrata - 1))/numberOfStrata, 1)))
-
- if(length(unique(c(0,q)))==2){
- warning('Prediction not spread')
- #res <- c(0,0)
- #lmData <- NULL
- #hosmerlemeshow <- c(0,0,0)
+
+ q <- unique(stats::quantile(prediction$value, c((1:(numberOfStrata - 1)) / numberOfStrata, 1)))
+
+ if (length(unique(c(0, q))) == 2) {
+ warning("Prediction not spread")
prediction$strata <- cut(prediction$value,
- breaks = c(-0.1,0.5,1), #,max(prediction$value)),
- labels = FALSE)
+ breaks = c(-0.1, 0.5, 1),
+ labels = FALSE
+ )
} else {
prediction$strata <- cut(prediction$value,
- breaks = unique(c(-0.1,q)), #,max(prediction$value)),
- labels = FALSE)
+ breaks = unique(c(-0.1, q)),
+ labels = FALSE
+ )
}
-
+
# get observed events:
- obs.Points <- stats::aggregate(prediction$outcomeCount, by=list(prediction$strata), FUN=mean)
- colnames(obs.Points) <- c('group','obs')
- pred.Points <- stats::aggregate(prediction$value, by=list(prediction$strata), FUN=mean)
- colnames(pred.Points) <- c('group','pred')
-
+ obs.Points <- stats::aggregate(prediction$outcomeCount, by = list(prediction$strata), FUN = mean)
+ colnames(obs.Points) <- c("group", "obs")
+ pred.Points <- stats::aggregate(prediction$value, by = list(prediction$strata), FUN = mean)
+ colnames(pred.Points) <- c("group", "pred")
+
# Hosmer-Lemeshow goodness-of-fit test
- obs.count <- stats::aggregate(prediction$outcomeCount, by=list(prediction$strata), FUN=sum)
- colnames(obs.count) <- c('group','observed')
- expected.count <- stats::aggregate(prediction$value, by=list(prediction$strata), FUN=sum)
- colnames(expected.count) <- c('group','expected')
- hoslem <- merge(obs.count, expected.count, by='group')
- obs.count2 <- stats::aggregate(1-prediction$outcomeCount, by=list(prediction$strata), FUN=sum)
- colnames(obs.count2) <- c('group','observed')
- expected.count2 <- stats::aggregate(1-prediction$value, by=list(prediction$strata), FUN=sum)
- colnames(expected.count2) <- c('group','expected')
- nhoslem <- merge(obs.count2, expected.count2, by='group')
- Xsquared <- sum((hoslem$observed-hoslem$expected)^2/hoslem$expected) +
- sum((nhoslem$observed-nhoslem$expected)^2/nhoslem$expected)
- pvalue <- stats::pchisq(Xsquared, df=numberOfStrata-2, lower.tail = F)
- hosmerlemeshow <- data.frame(Xsquared=Xsquared, df=numberOfStrata-2, pvalue=pvalue)
-
+ obs.count <- stats::aggregate(prediction$outcomeCount, by = list(prediction$strata), FUN = sum)
+ colnames(obs.count) <- c("group", "observed")
+ expected.count <- stats::aggregate(prediction$value, by = list(prediction$strata), FUN = sum)
+ colnames(expected.count) <- c("group", "expected")
+ hoslem <- merge(obs.count, expected.count, by = "group")
+ obs.count2 <- stats::aggregate(1 - prediction$outcomeCount, by = list(prediction$strata), FUN = sum)
+ colnames(obs.count2) <- c("group", "observed")
+ expected.count2 <- stats::aggregate(1 - prediction$value, by = list(prediction$strata), FUN = sum)
+ colnames(expected.count2) <- c("group", "expected")
+ nhoslem <- merge(obs.count2, expected.count2, by = "group")
+ Xsquared <- sum((hoslem$observed - hoslem$expected)^2 / hoslem$expected) +
+ sum((nhoslem$observed - nhoslem$expected)^2 / nhoslem$expected)
+ pvalue <- stats::pchisq(Xsquared, df = numberOfStrata - 2, lower.tail = FALSE)
+ hosmerlemeshow <- data.frame(Xsquared = Xsquared, df = numberOfStrata - 2, pvalue = pvalue)
+
# linear model fitting obs to pred:
- lmData <- merge(obs.Points, pred.Points, by='group')
- model <- stats::lm(obs ~pred, data=lmData)
-
- ##graphics::plot(lmData$pred, lmData$obs)
- ##graphics::abline(a = model$coefficients[1], b = model$coefficients[2], col='red')
+ lmData <- merge(obs.Points, pred.Points, by = "group")
+ model <- stats::lm(obs ~ pred, data = lmData)
+
res <- model$coefficients
- names(res) <- c('Intercept','Gradient')
+ names(res) <- c("Intercept", "Gradient")
#
-
- result <- list(lm=res,
+
+ result <- list(
+ lm = res,
aggregateLmData = lmData,
- hosmerlemeshow = hosmerlemeshow)
+ hosmerlemeshow = hosmerlemeshow
+ )
return(result)
}
@@ -441,102 +458,115 @@ calibrationLine <- function(prediction,numberOfStrata=10){
#' The average precision
#'
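+#' @examples
+#' # Deterministic sketch: both outcomes rank above both non-outcomes,
+#' # so the average precision is 1
+#' prediction <- data.frame(
+#'   value = c(0.9, 0.8, 0.2, 0.1),
+#'   outcomeCount = c(1, 1, 0, 0)
+#' )
+#' averagePrecision(prediction)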
#' @export
-averagePrecision <- function(prediction){
+averagePrecision <- function(prediction) {
lab.order <- prediction$outcomeCount[order(-prediction$value)]
n <- nrow(prediction)
- P <- sum(prediction$outcomeCount>0)
+ P <- sum(prediction$outcomeCount > 0)
val <- rep(0, n)
- val[lab.order>0] <- 1:P
- return(sum(val/(1:n))/P)
+ val[lab.order > 0] <- 1:P
+ return(sum(val / (1:n)) / P)
}
-
-calibrationInLarge <- function(prediction){
-
- result <- data.frame(meanPredictionRisk = mean(prediction$value),
- observedRisk = sum(prediction$outcomeCount)/nrow(prediction),
+#' Calculate the calibration in large
+#' @param prediction A prediction dataframe
+#' @return data.frame with meanPredictionRisk, observedRisk, and N
+#' @keywords internal
+calibrationInLarge <- function(prediction) {
+ result <- data.frame(
+ meanPredictionRisk = mean(prediction$value),
+ observedRisk = sum(prediction$outcomeCount) / nrow(prediction),
N = nrow(prediction)
)
-
+
return(result)
}
-calibrationInLargeIntercept <- function(prediction){
-
- #do invert of log function:
+calibrationInLargeIntercept <- function(prediction) {
+ # compute the logit (inverse of the logistic function):
# log(p/(1-p))
-
+
# edit the 0 and 1 values
- prediction$value[prediction$value==0] <- 0.000000000000001
- prediction$value[prediction$value==1] <- 1-0.000000000000001
-
- inverseLog <- log(prediction$value/(1-prediction$value))
- y <- ifelse(prediction$outcomeCount>0, 1, 0)
-
- intercept <- suppressWarnings(stats::glm(y ~ offset(1*inverseLog), family = 'binomial'))
+ prediction$value[prediction$value == 0] <- 0.000000000000001
+ prediction$value[prediction$value == 1] <- 1 - 0.000000000000001
+
+ inverseLog <- log(prediction$value / (1 - prediction$value))
+ y <- ifelse(prediction$outcomeCount > 0, 1, 0)
+
+ intercept <- suppressWarnings(stats::glm(y ~ offset(1 * inverseLog), family = "binomial"))
intercept <- intercept$coefficients[1]
-
+
return(intercept)
}
-calibrationWeak <- function(prediction){
-
- #do invert of log function:
+calibrationWeak <- function(prediction) {
+ # compute the logit (inverse of the logistic function):
# log(p/(1-p))
-
+
# edit the 0 and 1 values
- prediction$value[prediction$value==0] <- 0.000000000000001
- prediction$value[prediction$value==1] <- 1-0.000000000000001
-
- inverseLog <- log(prediction$value/(1-prediction$value))
- y <- ifelse(prediction$outcomeCount>0, 1, 0)
-
- #intercept <- suppressWarnings(stats::glm(y ~ offset(1*inverseLog), family = 'binomial'))
- #intercept <- intercept$coefficients[1]
- #gradient <- suppressWarnings(stats::glm(y ~ inverseLog+0, family = 'binomial',
- # offset = rep(intercept,length(inverseLog))))
- #gradient <- gradient$coefficients[1]
-
- vals <- suppressWarnings(stats::glm(y ~ inverseLog, family = 'binomial'))
-
- result <- data.frame(intercept = vals$coefficients[1],
- gradient = vals$coefficients[2])
-
+ prediction$value[prediction$value == 0] <- 0.000000000000001
+ prediction$value[prediction$value == 1] <- 1 - 0.000000000000001
+
+ inverseLog <- log(prediction$value / (1 - prediction$value))
+ y <- ifelse(prediction$outcomeCount > 0, 1, 0)
+
+ vals <- suppressWarnings(stats::glm(y ~ inverseLog, family = "binomial"))
+
+ result <- data.frame(
+ intercept = vals$coefficients[1],
+ gradient = vals$coefficients[2]
+ )
+
return(result)
}
#' Calculate the Integrated Calibration Information from Austin and Steyerberg
#' https://onlinelibrary.wiley.com/doi/full/10.1002/sim.8281
-#'
+#'
#' @details
#' Calculate the Integrated Calibration Information
#'
#' @param prediction the prediction object found in the plpResult object
-#'
+#'
#' @return
#' Integrated Calibration Information
#'
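+#' @examples
+#' # Sketch: ICI of a roughly calibrated model on simulated data, where the
+#' # outcome probability equals the predicted value by construction
+#' set.seed(42)
+#' prediction <- data.frame(value = runif(1000))
+#' prediction$outcomeCount <- rbinom(1000, 1, prediction$value)
+#' ici(prediction)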
#' @export
-ici <- function(prediction){
- #remove na
- if(sum(!is.finite(prediction$value))>0){
- prediction <- prediction[is.finite(prediction$value),]
+ici <- function(prediction) {
+ # remove na
+ if (sum(!is.finite(prediction$value)) > 0) {
+ prediction <- prediction[is.finite(prediction$value), ]
}
- loess.calibrate <- tryCatch({stats::loess(prediction$outcomeCount ~ prediction$value)},
- warning = function(w){ParallelLogger::logInfo(w)},
- error = function(e){ParallelLogger::logInfo(e); return(NULL)}
+ loess.calibrate <- tryCatch(
+ {
+ stats::loess(prediction$outcomeCount ~ prediction$value)
+ },
+ warning = function(w) {
+ ParallelLogger::logInfo(w)
+ },
+ error = function(e) {
+ ParallelLogger::logInfo(e)
+ return(NULL)
+ }
)
- if(!is.null(loess.calibrate)){
+ if (!is.null(loess.calibrate)) {
# Estimate loess-based smoothed calibration curve
- P.calibrate <- tryCatch({stats::predict(loess.calibrate, newdata = prediction$value)},
- warning = function(w){ParallelLogger::logInfo(w)},
- error = function(e){ParallelLogger::logInfo(e); return(NULL)}
+ P.calibrate <- tryCatch(
+ {
+ stats::predict(loess.calibrate, newdata = prediction$value)
+ },
+ warning = function(w) {
+ ParallelLogger::logInfo(w)
+ },
+ error = function(e) {
+ ParallelLogger::logInfo(e)
+ return(NULL)
+ }
)
- if(!is.null(P.calibrate)){
+ if (!is.null(P.calibrate)) {
# This is the point on the loess calibration curve corresponding to a given predicted probability.
- ICI <- mean (abs(P.calibrate - prediction$value))
+ ICI <- mean(abs(P.calibrate - prediction$value))
return(ICI)
}
}
diff --git a/R/ExistingPython.R b/R/ExistingPython.R
index d1fc6c2f2..3f1e6dde3 100644
--- a/R/ExistingPython.R
+++ b/R/ExistingPython.R
@@ -1,6 +1,6 @@
# @file ExistingPython.R
#
-# Copyright 2024 Observational Health Data Sciences and Informatics
+# Copyright 2025 Observational Health Data Sciences and Informatics
#
# This file is part of PatientLevelPrediction
#
@@ -17,7 +17,7 @@
# limitations under the License.
-#' Plug an existing scikit learn python model developed outside OHDSI into the
+#' Plug an existing scikit learn python model into the
#' PLP framework
#'
#' @details
@@ -29,43 +29,41 @@
#'
#' @param pythonModelLocation The location of the folder that contains the model as model.pkl
#' @param covariateMap A data.frame with the columns: columnId specifying the column order for the
-#' covariate, covariateId the covariate ID from FeatureExtraction and modelCovariateIdName which is the
-#' column name used when fitting the model. For example, if you had a column called 'age' in your model and this was the 3rd
+#' covariate, covariateId the covariate ID from FeatureExtraction and modelCovariateIdName which is the
+#' column name used when fitting the model. For example, if you had a column called 'age' in your model and this was the 3rd
#' column when fitting the model, then the values for columnId would be 3, covariateId would be 1002 (the covariateId for age in years) and
#' modelCovariateIdName would be 'age'.
#' @param covariateSettings The settings for the standardized covariates
-#' @param populationSettings The settings for the population, this includes the time-at-risk settings and
+#' @param populationSettings The settings for the population, this includes the time-at-risk settings and
#' and inclusion criteria.
#' @param isPickle If the model is saved as a pickle, set this to TRUE; if it is saved as json, set this to FALSE
-#'
+#'
#' @return
#' An object of class plpModel, this is a list that contains: model (the location of the model.pkl),
#' preprocessing (settings for mapping the covariateIds to the model column names), modelDesign (specification
-#' of the model design), trainDetails (information about the model fitting) and covariateImportance. You can use the output
+#' of the model design), trainDetails (information about the model fitting) and covariateImportance. You can use the output
#' as an input in PatientLevelPrediction::predictPlp to apply the model and calculate the risk for patients.
#'
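+#' @examples
+#' # A hedged sketch of the 'age' mapping described above; the model folder
+#' # and settings are placeholders, not shipped files
+#' \dontrun{
+#' plpModel <- createSciKitLearnModel(
+#'   pythonModelLocation = "/model", # folder containing model.pkl
+#'   covariateMap = data.frame(
+#'     columnId = 3,
+#'     covariateId = 1002,
+#'     modelCovariateIdName = "age"
+#'   ),
+#'   covariateSettings = FeatureExtraction::createCovariateSettings(
+#'     useDemographicsAge = TRUE
+#'   ),
+#'   populationSettings = createStudyPopulationSettings(),
+#'   isPickle = TRUE
+#' )
+#' }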
#' @export
createSciKitLearnModel <- function(
- pythonModelLocation = '/model', # model needs to be saved here as "model.pkl"
+ pythonModelLocation = "/model", # model needs to be saved here as "model.pkl"
covariateMap = data.frame(
columnId = 1:2,
- covariateId = c(1,2),
- modelCovariateIdName = c('pred_1', 'pred_2')
+ covariateId = c(1, 2),
+ modelCovariateIdName = c("pred_1", "pred_2")
),
covariateSettings, # specify the covariates
populationSettings, # specify time at risk used to develop model
- isPickle = T
-){
-
- existingModel <- list(model = 'existingPython')
- class(existingModel) <- 'modelSettings'
-
+ isPickle = TRUE) {
+ existingModel <- list(model = "existingPython")
+ class(existingModel) <- "modelSettings"
+
plpModel <- list(
# use plpModel$preprocessing$featureEngineering to rename columns
# set plpModel$preprocessing$tidyCovariates to NULL
preprocessing = list(
featureEngineering = list(
- funct = 'mapColumns',
+ funct = "mapColumns",
settings = list(
featureEngineeringSettings = createFeatureEngineeringMapColumnsSettings(
columnMap = covariateMap
@@ -73,123 +71,118 @@ createSciKitLearnModel <- function(
)
),
tidyCovariates = NULL,
- requireDenseMatrix = F
+ requireDenseMatrix = FALSE
),
covariateImportance = data.frame(
columnId = covariateMap$columnId,
covariateId = covariateMap$modelCovariateIdName,
- included = T
+ included = TRUE
),
modelDesign = PatientLevelPrediction::createModelDesign(
- targetId = 1,
+ targetId = 1,
outcomeId = 2,
- restrictPlpDataSettings = PatientLevelPrediction::createRestrictPlpDataSettings(),
+ restrictPlpDataSettings = PatientLevelPrediction::createRestrictPlpDataSettings(),
covariateSettings = covariateSettings,
populationSettings = populationSettings,
- sampleSettings = PatientLevelPrediction::createSampleSettings(),
+ sampleSettings = PatientLevelPrediction::createSampleSettings(),
featureEngineeringSettings = createFeatureEngineeringMapColumnsSettings(
columnMap = covariateMap
),
preprocessSettings = PatientLevelPrediction::createPreprocessSettings(
- minFraction = 0,
- normalize = F,
- removeRedundancy = F
- ),
- modelSettings = existingModel,
+ minFraction = 0,
+ normalize = FALSE,
+ removeRedundancy = FALSE
+ ),
+ modelSettings = existingModel,
splitSettings = PatientLevelPrediction::createDefaultSplitSetting()
),
- model = pythonModelLocation,
+ model = pythonModelLocation,
trainDetails = list(
- analysisId = 'exisitingPython',
- developmentDatabase = 'nonOMOP',
- developmentDatabaseId = 'nonOMOP',
+ analysisId = "exisitingPython",
+ developmentDatabase = "nonOMOP",
+ developmentDatabaseId = "nonOMOP",
trainingTime = -1,
- modelName = 'existing'
- )
+ modelName = "existing"
+ )
)
-
+
attr(plpModel, "modelType") <- "binary"
- attr(plpModel, "saveType") <- "file"
- attr(plpModel, 'predictionFunction') <- "predictPythonSklearn"
- attr(plpModel, 'saveToJson') <- !isPickle
+ attr(plpModel, "saveType") <- "file"
+ attr(plpModel, "predictionFunction") <- "predictPythonSklearn"
+ attr(plpModel, "saveToJson") <- !isPickle
class(plpModel) <- "plpModel"
return(plpModel)
-
}
-#' Create settings that enable you to convert from standard covariateIds to
+#' Create settings that enable you to convert from standard covariateIds to
#' model covariate names - this is useful when implementing a model developed
#' outside of the OHDSI tools.
#'
#' @details
-#' This function create settings that let you rename the covariates in the plpData object
+#' This function creates settings that let you rename the covariates in the plpData object
+#'
+#' @param columnMap A data.frame containing the columns: covariateId the covariate ID from FeatureExtraction and
+#' modelCovariateIdName which is the column name used when fitting the model.
#'
-#' @param columnMap A data.frame containing the columns: covariateId the covariate ID from FeatureExtraction and
-#' modelCovariateIdName which is the column name used when fitting the model.
-#'
#' @return
-#' An object of class featureEngineeringSettings that will convert column names
-#'
+#' An object of class \code{featureEngineeringSettings} that will convert column names
+#'
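+#' @examples
+#' # Minimal sketch: map two FeatureExtraction covariateIds to the column
+#' # names an external model was trained with (the names are illustrative)
+#' columnMap <- data.frame(
+#'   covariateId = c(1002, 8532001),
+#'   modelCovariateIdName = c("age", "is_female")
+#' )
+#' settings <- createFeatureEngineeringMapColumnsSettings(columnMap)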
#' @export
createFeatureEngineeringMapColumnsSettings <- function(
- columnMap
-){
-
+ columnMap) {
featureEngineeringSettings <- list(
columnMap = columnMap
)
-
+
attr(featureEngineeringSettings, "fun") <- "mapColumns"
class(featureEngineeringSettings) <- "featureEngineeringSettings"
return(featureEngineeringSettings)
-
}
mapColumns <- function(
- trainData,
- featureEngineeringSettings
- ){
- ParallelLogger::logInfo('Changing column names')
-
+ trainData,
+ featureEngineeringSettings) {
+ ParallelLogger::logInfo("Changing column names")
+
# map the columns - swap the covariateId with the modelCovariateIdName
- trainData$covariateData$columnMap <- featureEngineeringSettings$columnMap %>%
- dplyr::select("covariateId","modelCovariateIdName")
+ trainData$covariateData$columnMap <- featureEngineeringSettings$columnMap %>%
+ dplyr::select("covariateId", "modelCovariateIdName")
trainData$covariateData$covariates <- dplyr::inner_join(
- trainData$covariateData$covariates,
- trainData$covariateData$columnMap,
+ trainData$covariateData$covariates,
+ trainData$covariateData$columnMap,
by = "covariateId"
- ) %>%
+ ) %>%
dplyr::select(-"covariateId") %>%
dplyr::rename(
covariateId = "modelCovariateIdName"
)
-
+
trainData$covariateData$covariateRef <- dplyr::inner_join(
- trainData$covariateData$covariateRef,
- trainData$covariateData$columnMap,
+ trainData$covariateData$covariateRef,
+ trainData$covariateData$columnMap,
by = "covariateId"
- ) %>%
+ ) %>%
dplyr::select(-"covariateId") %>%
dplyr::rename(
covariateId = "modelCovariateIdName"
)
-
- # remove the columnMap
+
+ # remove the columnMap
trainData$covariateData$columnMap <- NULL
-
+
# add attribute for FE
featureEngineering <- list(
- funct = 'mapColumns',
+ funct = "mapColumns",
settings = list(
featureEngineeringSettings = featureEngineeringSettings
)
)
-
- attr(trainData, 'metaData')$featureEngineering = listAppend(
- attr(trainData, 'metaData')$featureEngineering,
+
+ attr(trainData, "metaData")$featureEngineering <- listAppend(
+ attr(trainData, "metaData")$featureEngineering,
featureEngineering
)
-
+
return(trainData)
}
diff --git a/R/ExternalValidatePlp.R b/R/ExternalValidatePlp.R
index de6ed9f27..202925901 100644
--- a/R/ExternalValidatePlp.R
+++ b/R/ExternalValidatePlp.R
@@ -20,69 +20,82 @@
externalValidatePlp <- function(plpModel,
plpData,
population,
- settings = list(# add covariateSummary option?
- recalibrate = 'weakRecalibration')) {
+ settings = list(
+ recalibrate = "weakRecalibration"
+ )) {
# Apply model
- #=======
- prediction <- tryCatch({
- predictPlp(plpModel = plpModel,
- plpData = plpData,
- population = population)
- },
- error = function(e) {
- ParallelLogger::logError(e)
- })
-
- prediction$evaluationType <- 'Validation'
-
+ # =======
+ prediction <- tryCatch(
+ {
+ predictPlp(
+ plpModel = plpModel,
+ plpData = plpData,
+ population = population
+ )
+ },
+ error = function(e) {
+ ParallelLogger::logError(e)
+ }
+ )
+
+ prediction$evaluationType <- "Validation"
+
# Recalibrate
- #=======
+ # =======
if (!is.null(settings$recalibrate)) {
for (recalMethod in settings$recalibrate) {
prediction <-
- tryCatch({
- recalibratePlp(prediction = prediction, method = recalMethod)
- },
- error = function(e) {
- ParallelLogger::logError(e)
- })
+ tryCatch(
+ {
+ recalibratePlp(prediction = prediction, method = recalMethod)
+ },
+ error = function(e) {
+ ParallelLogger::logError(e)
+ }
+ )
}
}
-
+
# Evaluate
- #=======
- performance <- tryCatch({
- evaluatePlp(prediction = prediction, typeColumn = 'evaluationType')
- },
- error = function(e) {
- ParallelLogger::logError(e)
- })
-
+ # =======
+ performance <- tryCatch(
+ {
+ evaluatePlp(prediction = prediction, typeColumn = "evaluationType")
+ },
+ error = function(e) {
+ ParallelLogger::logError(e)
+ }
+ )
+
# Covariate summary
- labels <- tryCatch({
- population %>% dplyr::select("rowId", "outcomeCount")
- },
- error = function(e) {
- return(NULL)
- })
-
- if (settings$runCovariateSummary) {
- covariateSum <- tryCatch({
- covariateSummary(
- covariateData = plpData$covariateData,
- cohort = population[, colnames(population) != 'outcomeCount'],
- labels = labels,
- variableImportance = plpModel$covariateImportance %>% dplyr::select("covariateId", "covariateValue")
- )
+ labels <- tryCatch(
+ {
+ population %>% dplyr::select("rowId", "outcomeCount")
},
error = function(e) {
- ParallelLogger::logInfo(e)
return(NULL)
- })
- } else{
+ }
+ )
+
+ if (settings$runCovariateSummary) {
+ covariateSum <- tryCatch(
+ {
+ covariateSummary(
+ covariateData = plpData$covariateData,
+ cohort = population[, colnames(population) != "outcomeCount"],
+ labels = labels,
+ variableImportance = plpModel$covariateImportance %>% dplyr::select("covariateId", "covariateValue")
+ )
+ },
+ error = function(e) {
+ ParallelLogger::logInfo(e)
+ return(NULL)
+ }
+ )
+ } else {
covariateSum <- NULL
}
-
+
executionSummary <- list(
ExecutionDateTime = Sys.Date(),
PackageVersion = list(
@@ -91,36 +104,36 @@ externalValidatePlp <- function(plpModel,
),
PlatformDetails = list(
platform = R.Version()$platform,
- cores = Sys.getenv('NUMBER_OF_PROCESSORS'),
+ cores = Sys.getenv("NUMBER_OF_PROCESSORS"),
RAM = memuse::Sys.meminfo()[1]
) # TODO: needs testing on non-Windows platforms
)
-
- model = list(
- model = 'external validation of model',
+
+ model <- list(
+ model = "external validation of model",
modelDesign = plpModel$modelDesign,
# was settings
- validationDetails = list(
- analysisId = '',
- #TODO add from model
- analysisSource = '',
- #TODO add from model
+ validationDetails = list(
+ analysisId = "",
+ # TODO add from model
+ analysisSource = "",
+ # TODO add from model
developmentDatabase = plpModel$trainDetails$developmentDatabase,
developmentDatabaseId = plpModel$trainDetails$developmentDatabaseId,
validationDatabase = plpData$metaData$databaseDetails$cdmDatabaseSchema,
validationDatabaseId = plpData$metaData$databaseDetails$cdmDatabaseId,
- populationSettings = attr(population, 'metaData')$populationSettings,
+ populationSettings = attr(population, "metaData")$populationSettings,
restrictPlpDataSettings = plpData$metaData$restrictPlpDataSettings,
- outcomeId = attr(population, 'metaData')$outcomeId,
+ outcomeId = attr(population, "metaData")$outcomeId,
targetId = plpData$metaData$databaseDetails$targetId,
- attrition = attr(population, 'metaData')$attrition,
+ attrition = attr(population, "metaData")$attrition,
validationDate = Sys.Date() # is this needed?
)
)
- attr(model, "predictionFunction") <- 'none'
- attr(model, "saveType") <- 'RtoJson'
- class(model) <- 'plpModel'
-
+ attr(model, "predictionFunction") <- "none"
+ attr(model, "saveType") <- "RtoJson"
+ class(model) <- "plpModel"
+
result <- list(
model = model,
executionSummary = executionSummary,
@@ -128,10 +141,9 @@ externalValidatePlp <- function(plpModel,
performanceEvaluation = performance,
covariateSummary = covariateSum
)
-
- class(result) <- 'externalValidatePlp'
+
+ class(result) <- "externalValidatePlp"
return(result)
-
}
@@ -159,52 +171,60 @@ externalValidatePlp <- function(plpModel,
externalValidateDbPlp <- function(plpModel,
validationDatabaseDetails = createDatabaseDetails(),
validationRestrictPlpDataSettings = createRestrictPlpDataSettings(),
- settings = createValidationSettings(recalibrate = 'weakRecalibration'),
- logSettings = createLogSettings(verbosity = 'INFO', logName = 'validatePLP'),
+ settings = createValidationSettings(recalibrate = "weakRecalibration"),
+ logSettings = createLogSettings(verbosity = "INFO", logName = "validatePLP"),
outputFolder = getwd()) {
# Input checks
- #=======
-
- checkIsClass(plpModel, 'plpModel')
-
+ # =======
+
+ checkIsClass(plpModel, "plpModel")
+
# check the class and make a list if a single database setting
- if (inherits(validationDatabaseDetails, 'list')) {
- lapply(validationDatabaseDetails, function(x)
- checkIsClass(x, 'databaseDetails'))
- } else{
- checkIsClass(validationDatabaseDetails, 'databaseDetails')
+ if (inherits(validationDatabaseDetails, "list")) {
+ lapply(validationDatabaseDetails, function(x) {
+ checkIsClass(x, "databaseDetails")
+ })
+ } else {
+ checkIsClass(validationDatabaseDetails, "databaseDetails")
validationDatabaseDetails <- list(validationDatabaseDetails)
}
- checkIsClass(validationRestrictPlpDataSettings,
- 'restrictPlpDataSettings')
- checkIsClass(settings, 'validationSettings')
-
+ checkIsClass(
+ validationRestrictPlpDataSettings,
+ "restrictPlpDataSettings"
+ )
+ checkIsClass(settings, "validationSettings")
+
# create results list with the names of the databases to validate across
result <- list()
length(result) <- length(validationDatabaseDetails)
names(result) <-
- unlist(lapply(validationDatabaseDetails, function(x)
- attr(x, 'cdmDatabaseName')))
-
+ unlist(lapply(validationDatabaseDetails, function(x) {
+ attr(x, "cdmDatabaseName")
+ }))
+
for (databaseDetails in validationDatabaseDetails) {
- databaseName <- attr(databaseDetails, 'cdmDatabaseName')
-
+ databaseName <- attr(databaseDetails, "cdmDatabaseName")
+
# initiate log
logSettings$saveDirectory <-
- file.path(outputFolder,
- databaseName,
- plpModel$trainDetails$analysisId)
- logSettings$logFileName <- 'validationLog'
+ file.path(
+ outputFolder,
+ databaseName,
+ plpModel$trainDetails$analysisId
+ )
+ logSettings$logFileName <- "validationLog"
logger <- do.call(createLog, logSettings)
ParallelLogger::registerLogger(logger)
-
- ParallelLogger::logInfo(paste('Validating model on', databaseName))
-
+
+ ParallelLogger::logInfo(paste("Validating model on", databaseName))
+
# Step 1: get data
- #=======
-
- getPlpDataSettings <- list(databaseDetails = databaseDetails,
- restrictPlpDataSettings = validationRestrictPlpDataSettings)
+ # =======
+
+ getPlpDataSettings <- list(
+ databaseDetails = databaseDetails,
+ restrictPlpDataSettings = validationRestrictPlpDataSettings
+ )
if (is.null(getPlpDataSettings$databaseDetails$targetId)) {
ParallelLogger::logInfo("targetId not in databaseSettings so using model's")
getPlpDataSettings$databaseDetails$targetId <-
@@ -215,7 +235,7 @@ externalValidateDbPlp <- function(plpModel,
getPlpDataSettings$databaseDetails$outcomeIds <-
plpModel$modelDesign$outcomeId
}
-
+
if (is.null(getPlpDataSettings$restrictPlpDataSettings$firstExposureOnly)) {
ParallelLogger::logInfo("firstExposureOnly not in restrictPlpDataSettings so using model's")
getPlpDataSettings$restrictPlpDataSettings$firstExposureOnly <-
@@ -226,63 +246,69 @@ externalValidateDbPlp <- function(plpModel,
getPlpDataSettings$restrictPlpDataSettings$washoutPeriod <-
plpModel$modelDesign$restrictPlpDataSettings$washoutPeriod
}
-
+
# TODO: we need to update this to restrict to model covariates and update custom features
getPlpDataSettings$covariateSettings <-
plpModel$modelDesign$covariateSettings
-
- plpData <- tryCatch({
- do.call(getPlpData, getPlpDataSettings)
- },
- error = function(e) {
- ParallelLogger::logError(e)
- return(NULL)
- })
-
+
+ plpData <- tryCatch(
+ {
+ do.call(getPlpData, getPlpDataSettings)
+ },
+ error = function(e) {
+ ParallelLogger::logError(e)
+ return(NULL)
+ }
+ )
+
if (is.null(plpData)) {
closeLog(logger)
}
-
+
# Step 2: create population
- #=======
-
- population <- tryCatch({
- do.call(
- createStudyPopulation,
- list(
- plpData = plpData,
- outcomeId = getPlpDataSettings$databaseDetails$outcomeIds,
- populationSettings = plpModel$modelDesign$populationSettings
+ # =======
+
+ population <- tryCatch(
+ {
+ do.call(
+ createStudyPopulation,
+ list(
+ plpData = plpData,
+ outcomeId = getPlpDataSettings$databaseDetails$outcomeIds,
+ populationSettings = plpModel$modelDesign$populationSettings
+ )
)
- )
- },
- error = function(e) {
- ParallelLogger::logError(e)
- return(NULL)
- })
-
+ },
+ error = function(e) {
+ ParallelLogger::logError(e)
+ return(NULL)
+ }
+ )
+
if (is.null(population)) {
closeLog(logger)
}
-
+
# Step 3: Apply model to plpData and population
- #=======
-
- result[[databaseName]] <- tryCatch({
- externalValidatePlp(plpModel,
- plpData,
- #databaseName = databaseName,
- population,
- settings = settings)
- },
- error = function(e) {
- ParallelLogger::logInfo(e)
- return(NULL)
- })
-
+ # =======
+
+ result[[databaseName]] <- tryCatch(
+ {
+ externalValidatePlp(plpModel,
+ plpData,
+ population,
+ settings = settings
+ )
+ },
+ error = function(e) {
+ ParallelLogger::logInfo(e)
+ return(NULL)
+ }
+ )
+
if (is.null(result[[databaseName]])) {
closeLog(logger)
- } else{
+ } else {
if (!dir.exists(file.path(
outputFolder,
databaseName,
@@ -294,22 +320,22 @@ externalValidateDbPlp <- function(plpModel,
databaseName,
plpModel$trainDetails$analysisId
),
- recursive = T
+ recursive = TRUE
)
}
-
+
savePlpResult(
result[[databaseName]],
dirPath = file.path(
outputFolder,
databaseName,
plpModel$trainDetails$analysisId,
- 'validationResult'
+ "validationResult"
)
)
}
}
-
+
# Now return results
return(invisible(result))
}
@@ -329,20 +355,22 @@ externalValidateDbPlp <- function(plpModel,
#'
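+#' @examples
+#' # Sketch: apply weak recalibration and run the covariate summary
+#' settings <- createValidationSettings(
+#'   recalibrate = "weakRecalibration",
+#'   runCovariateSummary = TRUE
+#' )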
#' @export
createValidationSettings <- function(recalibrate = NULL,
- runCovariateSummary = T) {
- checkIsClass(recalibrate, c('character', 'NULL'))
+ runCovariateSummary = TRUE) {
+ checkIsClass(recalibrate, c("character", "NULL"))
if (!is.null(recalibrate)) {
- if (sum(recalibrate %in% c('recalibrationInTheLarge', 'weakRecalibration')) !=
- length(recalibrate)) {
+ if (sum(recalibrate %in% c("recalibrationInTheLarge", "weakRecalibration")) !=
+ length(recalibrate)) {
ParallelLogger::logError(
- 'Incorrect recalibrate options used. Must be recalibrationInTheLarge or weakRecalibration'
+ "Incorrect recalibrate options used. Must be recalibrationInTheLarge or weakRecalibration"
)
}
}
-
- result <- list(recalibrate = recalibrate,
- runCovariateSummary = runCovariateSummary)
- class(result) <- 'validationSettings'
+
+ result <- list(
+ recalibrate = recalibrate,
+ runCovariateSummary = runCovariateSummary
+ )
+ class(result) <- "validationSettings"
return(result)
}
@@ -350,7 +378,7 @@ createValidationSettings <- function(recalibrate = NULL,
#'
#' @param targetId The targetId of the target cohort to validate on
#' @param outcomeId The outcomeId of the outcome cohort to validate on
-#' @param populationSettings A list of population restriction settings created
+#' @param populationSettings A list of population restriction settings created
#' by \code{createPopulationSettings}. Default is NULL and then this is taken
#' from the model
#' @param restrictPlpDataSettings A list of plpData restriction settings
@@ -359,7 +387,7 @@ createValidationSettings <- function(recalibrate = NULL,
#' @param plpModelList A list of plpModels objects created by \code{runPlp} or a path to such objects
#' @param recalibrate A vector of characters specifying the recalibration method to apply,
#' @param runCovariateSummary whether to run the covariate summary for the validation data
-#' @return A validation design object of class \code{validationDesign}
+#' @return A validation design object of class \code{validationDesign} or a list of such objects
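+#' @examples
+#' # Hedged sketch: validate one saved model on a new target/outcome pair;
+#' # the model path is a placeholder
+#' \dontrun{
+#' design <- createValidationDesign(
+#'   targetId = 1,
+#'   outcomeId = 2,
+#'   plpModelList = list("path/to/plpModel"),
+#'   recalibrate = "weakRecalibration",
+#'   runCovariateSummary = TRUE
+#' )
+#' }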
#' @export
createValidationDesign <-
function(targetId,
@@ -381,8 +409,8 @@ createValidationDesign <-
checkIsClass(x, "restrictPlpDataSettings")
})
} else {
- checkIsClass(restrictPlpDataSettings, "restrictPlpDataSettings")
- }
+ checkIsClass(restrictPlpDataSettings, "restrictPlpDataSettings")
+ }
}
checkIsClass(plpModelList, "list")
lapply(plpModelList, function(x) {
@@ -391,14 +419,14 @@ createValidationDesign <-
checkIsClass(recalibrate, c("character", "NULL"))
if (!is.null(recalibrate)) {
if (sum(recalibrate %in% c("recalibrationInTheLarge", "weakRecalibration")) !=
- length(recalibrate)) {
+ length(recalibrate)) {
ParallelLogger::logError(
"Incorrect recalibrate options used. Must be recalibrationInTheLarge or weakRecalibration"
)
}
}
checkIsClass(runCovariateSummary, "logical")
-
+
# if restrictPlpDataSettings is a list make a list of designs with each
# settings
if (is.list(restrictPlpDataSettings) && !inherits(restrictPlpDataSettings, "restrictPlpDataSettings")) {
@@ -412,8 +440,8 @@ createValidationDesign <-
recalibrate = recalibrate,
runCovariateSummary = runCovariateSummary
)
- class(design) <- "validationDesign"
- return(design)
+ class(design) <- "validationDesign"
+ return(design)
})
} else {
design <- list(
@@ -426,7 +454,7 @@ createValidationDesign <-
runCovariateSummary = runCovariateSummary
)
class(design) <- "validationDesign"
- }
+ }
return(design)
}
@@ -447,10 +475,12 @@ validateExternal <- function(validationDesignList,
logSettings,
outputFolder) {
# Input checks
- changedInputs <- checkValidateExternalInputs(validationDesignList,
- databaseDetails,
- logSettings,
- outputFolder)
+ changedInputs <- checkValidateExternalInputs(
+ validationDesignList,
+ databaseDetails,
+ logSettings,
+ outputFolder
+ )
validationDesignList <- changedInputs[["validationDesignList"]]
databaseDetails <- changedInputs[["databaseDetails"]]
@@ -459,42 +489,43 @@ validateExternal <- function(validationDesignList,
length(result) <- length(databaseDetails)
names(result) <-
unlist(lapply(databaseDetails, function(x) {
- attr(x, "cdmDatabaseName")}))
-
+ attr(x, "cdmDatabaseName")
+ }))
+
# Need to keep track of incremental analysisId's for each database
databaseNames <- unlist(lapply(databaseDetails, function(x) {
- x$cdmDatabaseName}))
+ x$cdmDatabaseName
+ }))
analysisInfo <- list()
for (name in databaseNames) {
analysisInfo[name] <- 1
}
-
+
# initiate log
logSettings$saveDirectory <- outputFolder
logSettings$logFileName <- "validationLog"
logger <- do.call(createLog, logSettings)
ParallelLogger::registerLogger(logger)
on.exit(closeLog(logger))
-
+
+ downloadTasks <- createDownloadTasks(validationDesignList)
+
results <- NULL
for (design in validationDesignList) {
for (database in databaseDetails) {
databaseName <- database$cdmDatabaseName
-
- ParallelLogger::logInfo(paste("Validating model on", database$cdmDatabaseName))
-
- database$targetId <- design$targetId
-
- database$outcomeIds <- design$outcomeId
-
+
+ ParallelLogger::logInfo(paste(
+ "Validating models on", database$cdmDatabaseName,
+ "with targetId:", design$targetId, "and outcomeId:", design$outcomeId
+ ))
+
modelDesigns <- extractModelDesigns(design$plpModelList)
- allCovSettings <- lapply(modelDesigns, function(x) x$covariateSettings)
design <- fromDesignOrModel(design, modelDesigns, "restrictPlpDataSettings")
- checkAllSameInModels(allCovSettings, "covariateSettings")
-
+
# get plpData
- plpData <- getData(design, database, outputFolder, allCovSettings)
+ plpData <- getData(design, database, outputFolder, downloadTasks)
if (is.null(plpData)) {
ParallelLogger::logInfo("Couldn't extract plpData for the given design and database, proceeding to the next one.")
next
@@ -502,6 +533,18 @@ validateExternal <- function(validationDesignList,
# create study population
population <- getPopulation(design, modelDesigns, plpData)
+ if (sum(population$outcomeCount) < 10) {
+ ParallelLogger::logInfo(
+ paste(
+ "Outcome size is less than 10, skipping validation for design and database:",
+ databaseName,
+ "and targetId:", design$targetId,
+ "outcomeId", design$outcomeId
+ )
+ )
+ next
+ }
+
results <- lapply(design$plpModelList, function(model) {
analysisName <- paste0("Analysis_", analysisInfo[databaseName])
analysisDone <- file.exists(
@@ -522,10 +565,13 @@ validateExternal <- function(validationDesignList,
runCovariateSummary = design$runCovariateSummary,
outputFolder = outputFolder,
databaseName = databaseName,
- analysisName = analysisName)
+ analysisName = analysisName
+ )
} else {
- ParallelLogger::logInfo(paste0("Analysis ", analysisName, " already done",
- ", Proceeding to the next one."))
+ ParallelLogger::logInfo(paste0(
+ "Analysis ", analysisName, " already done",
+ ", Proceeding to the next one."
+ ))
}
analysisInfo[[databaseName]] <<- analysisInfo[[databaseName]] + 1
@@ -536,19 +582,20 @@ validateExternal <- function(validationDesignList,
databaseName <- database$cdmDatabaseName
sqliteLocation <- file.path(outputFolder, "sqlite")
- tryCatch({
- insertResultsToSqlite(
- resultLocation = file.path(outputFolder, databaseName),
- cohortDefinitions = NULL,
- databaseList = createDatabaseList(
- cdmDatabaseSchemas = database$cdmDatabaseSchema,
- cdmDatabaseNames = database$cdmDatabaseName,
- databaseRefIds = database$cdmDatabaseId
- ),
- sqliteLocation = sqliteLocation
- )
- },
- error = function(e) ParallelLogger::logInfo(e)
+ tryCatch(
+ {
+ insertResultsToSqlite(
+ resultLocation = file.path(outputFolder, databaseName),
+ cohortDefinitions = NULL,
+ databaseList = createDatabaseList(
+ cdmDatabaseSchemas = database$cdmDatabaseSchema,
+ cdmDatabaseNames = database$cdmDatabaseName,
+ databaseRefIds = database$cdmDatabaseId
+ ),
+ sqliteLocation = sqliteLocation
+ )
+ },
+ error = function(e) ParallelLogger::logInfo(e)
)
}
return(invisible(results))
@@ -563,8 +610,6 @@ validateModel <-
outputFolder,
databaseName,
analysisName) {
-
-
if (is.character(plpModel)) {
plpModel <- loadPlpModel(plpModel)
}
@@ -573,31 +618,36 @@ validateModel <-
plpModel = plpModel,
plpData = plpData,
population = population,
- settings = list(recalibrate = recalibrate,
- runCovariateSummary = runCovariateSummary)
+ settings = list(
+ recalibrate = recalibrate,
+ runCovariateSummary = runCovariateSummary
+ )
)
savePlpResult(result,
- dirPath = file.path(
- outputFolder,
- databaseName,
- analysisName,
-
- "validationResult"
- ))
+ dirPath = file.path(
+ outputFolder,
+ databaseName,
+ analysisName,
+ "validationResult"
+ )
+ )
return(result)
-}
+ }
#' checkAllSameInModels - Check if all settings are the same across models
#' @param settingsList A list of settings to check
#' @param settingName The name of the setting to check
#' @keywords internal
checkAllSameInModels <- function(settingsList, settingName) {
- if (!Reduce(function(x, y) {
- x &&
- identical(y, settingsList[[1]])},
+ if (!Reduce(
+ function(x, y) {
+ x &&
+ identical(y, settingsList[[1]])
+ },
settingsList[-1],
- init = TRUE)) {
- stop(paste0(settingName, "are not the same across models which is not supported yet"))
+ init = TRUE
+ )) {
+ stop(paste0(settingName, "are not the same across models which is not supported"))
}
}
@@ -613,7 +663,7 @@ extractModelDesigns <- function(plpModelList) {
)
return(modelDesign)
} else {
- plpModel$modelDesign
+ plpModel$modelDesign
}
})
}
@@ -631,22 +681,26 @@ checkValidateExternalInputs <- function(validationDesignList,
outputFolder) {
if (inherits(validationDesignList, "list")) {
lapply(validationDesignList, function(x) {
- checkIsClass(x, "validationDesign")})
+ checkIsClass(x, "validationDesign")
+ })
} else {
checkIsClass(validationDesignList, "validationDesign")
validationDesignList <- list(validationDesignList)
}
-
+
# check the class and make a list if a single database setting
if (inherits(databaseDetails, "list")) {
lapply(databaseDetails, function(x) {
- checkIsClass(x, "databaseDetails")})
+ checkIsClass(x, "databaseDetails")
+ })
} else {
checkIsClass(databaseDetails, "databaseDetails")
databaseDetails <- list(databaseDetails)
}
- results <- list(validationDesignList = validationDesignList,
- databaseDetails = databaseDetails)
+ results <- list(
+ validationDesignList = validationDesignList,
+ databaseDetails = databaseDetails
+ )
return(results)
}
@@ -664,9 +718,10 @@ fromDesignOrModel <- function(validationDesign, modelDesigns, settingName) {
ParallelLogger::logInfo(paste0(settingName, " not set in design, using model's"))
} else {
if (any(unlist(lapply(modelDesigns, function(x) {
- !identical(x[[settingName]], validationDesign[[settingName]])
- })))) {
- ParallelLogger::logWarn(settingName, " are not the same in models and validationDesign")
+ !identical(x[[settingName]], validationDesign[[settingName]])
+ })))) {
+ ParallelLogger::logWarn(settingName, " are not the same in models and validationDesign,
+ using from design")
}
}
return(validationDesign)
@@ -676,39 +731,67 @@ fromDesignOrModel <- function(validationDesign, modelDesigns, settingName) {
#' @param design The validationDesign object
#' @param database The databaseDetails object
#' @param outputFolder The directory to save the validation results to
-#' @param allCovSettings A list of covariateSettings from the models
+#' @param downloadTasks A list of download tasks determined by unique
+#' combinations of targetId and restrictPlpDataSettings
#' @return The plpData object
#' @keywords internal
-getData <- function(design, database, outputFolder, allCovSettings) {
+getData <- function(design, database, outputFolder, downloadTasks) {
+ # find task associated with design and the index of the task in downloadTasks
+ task <- downloadTasks %>%
+ dplyr::filter(.data$targetId == design$targetId) %>%
+ dplyr::mutate(
+ taskId = dplyr::row_number(),
+ collapsed = sapply(.data$restrictPlpDataSettings, paste0, collapse = "|")
+ ) %>%
+ dplyr::filter(.data$collapsed == paste0(design$restrictPlpDataSettings, collapse = "|")) %>%
+ dplyr::select(-"collapsed")
+ covariateSettings <- task$covariateSettings[[1]]
+ task$covariateSettings <- NULL
+ if (length(covariateSettings) > 1) {
+ task$covariateSettings <- list(unlist(covariateSettings, recursive = FALSE))
+ } else {
+ task$covariateSettings <- covariateSettings
+ }
+
databaseName <- database$cdmDatabaseName
+ database$targetId <- task$targetId
+ database$outcomeIds <- task$outcomeIds[[1]]
plpDataName <-
- paste0("targetId_", design$targetId, "_L", "1")
+ paste0("targetId_", design$targetId, "_L", task$taskId)
plpDataLocation <-
file.path(outputFolder, databaseName, plpDataName)
if (!dir.exists(plpDataLocation)) {
- plpData <- tryCatch({
- do.call(
- getPlpData,
- list(
- databaseDetails = database,
- restrictPlpDataSettings = design$restrictPlpDataSettings,
- covariateSettings = allCovSettings[[1]]
+ plpData <- tryCatch(
+ {
+ do.call(
+ getPlpData,
+ list(
+ databaseDetails = database,
+ restrictPlpDataSettings = task$restrictPlpDataSettings[[1]],
+ covariateSettings = task$covariateSettings[[1]]
+ )
)
- )
- },
- error = function(e) {
- ParallelLogger::logError(e)
- return(NULL)
- })
+ },
+ error = function(e) {
+ ParallelLogger::logError(e)
+ return(NULL)
+ }
+ )
if (!is.null(plpData)) {
if (!dir.exists(file.path(outputFolder, databaseName))) {
dir.create(file.path(outputFolder, databaseName), recursive = TRUE)
}
+ if (length(covariateSettings) > 1) {
+ plpData$covariateData <-
+ deDuplicateCovariateData(plpData$covariateData)
+ }
savePlpData(plpData, file = plpDataLocation)
}
} else {
- ParallelLogger::logInfo(paste0("Data already extracted for ",
- plpDataName, ": Loading from disk"))
+ ParallelLogger::logInfo(paste0(
+ "Data already extracted for ",
+ plpDataName, ": Loading from disk"
+ ))
plpData <- loadPlpData(plpDataLocation)
}
return(plpData)
@@ -722,20 +805,123 @@ getData <- function(design, database, outputFolder, allCovSettings) {
#' @keywords internal
getPopulation <- function(validationDesign, modelDesigns, plpData) {
design <- fromDesignOrModel(validationDesign, modelDesigns, "populationSettings")
- population <- tryCatch({
- do.call(
- createStudyPopulation,
- list(
- plpData = plpData,
- outcomeId = design$outcomeId,
- populationSettings = design$populationSettings
+ population <- tryCatch(
+ {
+ do.call(
+ createStudyPopulation,
+ list(
+ plpData = plpData,
+ outcomeId = design$outcomeId,
+ populationSettings = design$populationSettings
+ )
)
- )
- },
- error = function(e) {
- ParallelLogger::logError(e)
- return(NULL)
- })
+ },
+ error = function(e) {
+ ParallelLogger::logError(e)
+ return(NULL)
+ }
+ )
return(population)
}
+#' createDownloadTasks
+#' Create download tasks based on unique combinations of targetId and
+#' restrictPlpDataSettings. Each task gathers all the covariateSettings and
+#' outcomeIds that share that targetId and restrictPlpDataSettings. This is
+#' used to avoid downloading the same data multiple times.
+#' @param validationDesignList A list of validationDesign objects
+#' @return A dataframe where each row is a downloadTask
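+#' @examples
+#' \dontrun{
+#' # a sketch, assuming designA and designB were created with
+#' # createValidationDesign(); designs sharing a targetId and
+#' # restrictPlpDataSettings collapse into one task, so data is extracted once
+#' downloadTasks <- createDownloadTasks(list(designA, designB))
+#' }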
+#' @keywords internal
+createDownloadTasks <- function(validationDesignList) {
+ ParallelLogger::logInfo("Extracting unique combinations of targetId, \
+ restrictPlpDataSettings and covariateSettings for extracting data")
+
+ rowsList <- list()
+ modelCache <- list()
+
+ for (design in validationDesignList) {
+ targetId <- design$targetId
+ outcomeId <- design$outcomeId
+ restrictPlpDataSettings <- design$restrictPlpDataSettings
+ if (is.null(restrictPlpDataSettings)) {
+ restrictList <- lapply(
+ design$plpModelList,
+ function(x) x$modelDesign$restrictPlpDataSettings
+ )
+ checkAllSameInModels(restrictList, "restrictPlpDataSettings")
+ restrictPlpDataSettings <- restrictList[[1]]
+ }
+ plpModelList <- design$plpModelList
+ for (model in plpModelList) {
+ if (is.character(model)) {
+ modelKey <- model
+ if (!is.null(modelCache[[modelKey]])) {
+ model <- modelCache[[modelKey]]
+ } else {
+ model <- loadPlpModel(modelKey)
+ modelCache[[modelKey]] <- model
+ }
+ } else {
+ modelKey <- digest::digest(model)
+ if (!is.null(modelCache[[modelKey]])) {
+ model <- modelCache[[modelKey]]
+ } else {
+ modelCache[[modelKey]] <- model
+ }
+ }
+ covariateSettings <- model$modelDesign$covariateSettings
+ row <- list(
+ targetId = targetId,
+ outcomeIds = outcomeId,
+ restrictPlpDataSettings = list(restrictPlpDataSettings),
+ covariateSettings = list(covariateSettings)
+ )
+ rowsList[[length(rowsList) + 1]] <- row
+ }
+ }
+ rowsDf <- dplyr::bind_rows(rowsList)
+
+ rowsDf <- rowsDf %>%
+ dplyr::rowwise() %>%
+ dplyr::mutate(
+ restrictKey = digest::digest(restrictPlpDataSettings),
+ covariateKey = digest::digest(covariateSettings)
+ ) %>%
+ dplyr::ungroup()
+
+ uniqueCovariateSettings <- function(settingsList, settingsKeys) {
+ uniqueKeys <- unique(settingsKeys)
+ indices <- match(uniqueKeys, settingsKeys)
+ uniqueSettings <- settingsList[indices]
+ return(uniqueSettings)
+ }
+
+ downloadTasks <- rowsDf %>%
+ dplyr::group_by(.data$targetId, .data$restrictKey) %>%
+ dplyr::summarise(
+ outcomeIds = list(unique(.data$outcomeIds)),
+ restrictPlpDataSettings = .data$restrictPlpDataSettings[1],
+ covariateSettings = list(uniqueCovariateSettings(.data$covariateSettings, .data$covariateKey)),
+ .groups = "drop"
+ ) %>%
+ dplyr::select(c(
+ "targetId", "outcomeIds", "restrictPlpDataSettings",
+ "covariateSettings"
+ ))
+
+ return(downloadTasks)
+}
+
+#' deDuplicateCovariateData - Remove duplicate covariate data
+#' When downloading data with multiple different covariateSettings, duplicated
+#' analysisIds can occur and need to be removed
+#' @param covariateData The covariate data Andromeda object
+#' @return The deduplicated covariate data
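+#' @examples
+#' \dontrun{
+#' # a sketch: covariate rows extracted twice under pooled covariateSettings
+#' # are reduced to a single copy
+#' plpData$covariateData <- deDuplicateCovariateData(plpData$covariateData)
+#' }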
+#' @keywords internal
+deDuplicateCovariateData <- function(covariateData) {
+ covariateData$covariateRef <- covariateData$covariateRef %>%
+ dplyr::distinct()
+ covariateData$covariates <- covariateData$covariates %>%
+ dplyr::distinct()
+ return(covariateData)
+}
diff --git a/R/ExtractData.R b/R/ExtractData.R
index e754eb802..4dbd6bcfe 100644
--- a/R/ExtractData.R
+++ b/R/ExtractData.R
@@ -24,7 +24,7 @@
#' This function creates the settings used to restrict the target cohort when calling getPlpData
#' @details
#' Users need to specify the extra restrictions to apply when downloading the target cohort
-#'
+#'
#' @param studyStartDate A calendar date specifying the minimum date that a cohort index
#' date can appear. Date format is 'yyyymmdd'.
#' @param studyEndDate A calendar date specifying the maximum date that a cohort index
@@ -39,27 +39,27 @@
#' this is typically done in the \code{createStudyPopulation} function,
#' but can already be done here for efficiency reasons.
#' @param sampleSize If not NULL, the number of people to sample from the target cohort
-#'
+#'
#' @return
#' A setting object of class \code{restrictPlpDataSettings} containing a list of extra settings for \code{getPlpData}
#'
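+#' @examples
+#' # a minimal sketch: restrict the target cohort to index dates in 2015-2019,
+#' # keep first exposures only and require 365 days of prior observation
+#' settings <- createRestrictPlpDataSettings(
+#'   studyStartDate = "20150101",
+#'   studyEndDate = "20191231",
+#'   firstExposureOnly = TRUE,
+#'   washoutPeriod = 365
+#' )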
#' @export
createRestrictPlpDataSettings <- function(
- studyStartDate = "",
- studyEndDate = "",
- firstExposureOnly = F,
- washoutPeriod = 0,
- sampleSize = NULL
-){
-
- if (studyStartDate != "" && regexpr("^[12][0-9]{3}[01][0-9][0-3][0-9]$", studyStartDate) == -1)
+ studyStartDate = "",
+ studyEndDate = "",
+ firstExposureOnly = FALSE,
+ washoutPeriod = 0,
+ sampleSize = NULL) {
+ if (studyStartDate != "" && regexpr("^[12][0-9]{3}[01][0-9][0-3][0-9]$", studyStartDate) == -1) {
stop("Study start date must have format YYYYMMDD")
- if (studyEndDate != "" && regexpr("^[12][0-9]{3}[01][0-9][0-3][0-9]$", studyEndDate) == -1)
+ }
+ if (studyEndDate != "" && regexpr("^[12][0-9]{3}[01][0-9][0-3][0-9]$", studyEndDate) == -1) {
stop("Study end date must have format YYYYMMDD")
-
+ }
+
# add input checks
- checkIsClass(sampleSize, c('integer','numeric','NULL'))
-
+ checkIsClass(sampleSize, c("integer", "numeric", "NULL"))
+
result <- list(
studyStartDate = studyStartDate,
studyEndDate = studyEndDate,
@@ -67,16 +67,16 @@ createRestrictPlpDataSettings <- function(
washoutPeriod = washoutPeriod,
sampleSize = sampleSize
)
-
- class(result) <- 'restrictPlpDataSettings'
+
+ class(result) <- "restrictPlpDataSettings"
return(result)
}
#' Create a setting that holds the details about the cdmDatabase connection for data extraction
#'
#' @details
-#' This function simply stores the settings for communicating with the cdmDatabase when extracting
-#' the target cohort and outcomes
+#' This function simply stores the settings for communicating with the cdmDatabase when extracting
+#' the target cohort and outcomes
#'
#' @param connectionDetails An R object of type \code{connectionDetails} created using the
#' function \code{createConnectionDetails} in the
@@ -106,53 +106,51 @@ createRestrictPlpDataSettings <- function(
#' @param outcomeIds A single integer or vector of integers specifying the cohort ids for the outcome cohorts
#' @param cdmVersion Define the OMOP CDM version used: currently supports "4" and "5".
#' @param cohortId (deprecated: use targetId) old input for the target cohort id
-#'
+#'
#' @return
#' A list with the database-specific settings (this is used by the runMultiplePlp function and the skeleton packages)
#'
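+#' @examples
+#' \dontrun{
+#' # a sketch, assuming a PostgreSQL CDM; adjust the connection details and
+#' # schema names to your own environment
+#' connectionDetails <- DatabaseConnector::createConnectionDetails(
+#'   dbms = "postgresql",
+#'   server = "localhost/ohdsi",
+#'   user = "joe",
+#'   password = "secret"
+#' )
+#' databaseDetails <- createDatabaseDetails(
+#'   connectionDetails = connectionDetails,
+#'   cdmDatabaseSchema = "cdm",
+#'   cdmDatabaseName = "my_cdm",
+#'   cdmDatabaseId = "my_cdm_v1",
+#'   targetId = 1,
+#'   outcomeIds = 3
+#' )
+#' }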
#' @export
createDatabaseDetails <- function(
- connectionDetails,
- cdmDatabaseSchema,
- cdmDatabaseName,
- cdmDatabaseId, # added for strategus
- tempEmulationSchema = cdmDatabaseSchema,
- cohortDatabaseSchema = cdmDatabaseSchema,
- cohortTable = "cohort",
- outcomeDatabaseSchema = cdmDatabaseSchema,
- outcomeTable = "cohort",
- targetId = NULL,
- outcomeIds = NULL,
- cdmVersion = 5,
- cohortId = NULL
-){
-
- if(is.null(targetId)){
- if(!is.null(cohortId)){
- ParallelLogger::logWarn('cohortId has been depreciated. Please use targetId.')
+ connectionDetails,
+ cdmDatabaseSchema,
+ cdmDatabaseName,
+ cdmDatabaseId, # added for strategus
+ tempEmulationSchema = cdmDatabaseSchema,
+ cohortDatabaseSchema = cdmDatabaseSchema,
+ cohortTable = "cohort",
+ outcomeDatabaseSchema = cdmDatabaseSchema,
+ outcomeTable = "cohort",
+ targetId = NULL,
+ outcomeIds = NULL,
+ cdmVersion = 5,
+ cohortId = NULL) {
+ if (is.null(targetId)) {
+ if (!is.null(cohortId)) {
+ ParallelLogger::logWarn("cohortId has been depreciated. Please use targetId.")
targetId <- cohortId
}
}
-
- if(missing(cdmDatabaseName)){
- ParallelLogger::logInfo('No cdm database name entered so using cdmDatabaseSchema')
+
+ if (missing(cdmDatabaseName)) {
+ ParallelLogger::logInfo("No cdm database name entered so using cdmDatabaseSchema")
cdmDatabaseName <- removeInvalidString(cdmDatabaseSchema)
}
- if(missing(cdmDatabaseId)){
- ParallelLogger::logInfo('No cdm database id entered so using cdmDatabaseSchema - if cdmDatabaseSchema is the same for multiple different databases, please use cdmDatabaseId to specify a unique identifier for the database and version')
+ if (missing(cdmDatabaseId)) {
+ ParallelLogger::logInfo("No cdm database id entered so using cdmDatabaseSchema - if cdmDatabaseSchema is the same for multiple different databases, please use cdmDatabaseId to specify a unique identifier for the database and version")
cdmDatabaseId <- removeInvalidString(cdmDatabaseSchema)
}
-
- if(length(cdmDatabaseId) == 0 ){
- stop('cdmDatabaseId must be a string with length > 0')
+
+ if (length(cdmDatabaseId) == 0) {
+ stop("cdmDatabaseId must be a string with length > 0")
}
-
+
# check to make sure cdmDatabaseId is not an int as that will cause issues
- if(!inherits(cdmDatabaseId, 'character')){
- ParallelLogger::logInfo('cdmDatabaseId is not a string - this will cause issues when inserting into a result database so casting it')
+ if (!inherits(cdmDatabaseId, "character")) {
+ ParallelLogger::logInfo("cdmDatabaseId is not a string - this will cause issues when inserting into a result database so casting it")
cdmDatabaseId <- as.character(cdmDatabaseId)
}
-
+
result <- list(
connectionDetails = connectionDetails,
cdmDatabaseSchema = cdmDatabaseSchema,
@@ -167,13 +165,13 @@ createDatabaseDetails <- function(
outcomeIds = outcomeIds,
cdmVersion = cdmVersion
)
-
- attr(result, 'cdmDatabaseName') <- cdmDatabaseName
- class(result) <- 'databaseDetails'
+
+ attr(result, "cdmDatabaseName") <- cdmDatabaseName
+ class(result) <- "databaseDetails"
return(result)
}
-#' Get the patient level prediction data from the server
+#' Extract the patient level prediction data from the server
#' @description
#' This function executes a large set of SQL statements against the database in OMOP CDM format to
#' extract the data needed to perform the analysis.
@@ -182,19 +180,20 @@ createDatabaseDetails <- function(
#' Based on the arguments, the at risk cohort data is retrieved, as well as outcomes
#' occurring in these subjects. The at risk cohort is identified through
#' user-defined cohorts in a cohort table either inside the CDM instance or in a separate schema.
-#' Similarly, outcomes are identified
+#' Similarly, outcomes are identified
#' through user-defined cohorts in a cohort table either inside the CDM instance or in a separate
#' schema. Covariates are automatically extracted from the appropriate tables within the CDM.
#' If you wish to exclude concepts from covariates you will need to
#' manually add the concept_ids and descendants to the \code{excludedCovariateConceptIds} of the
#' \code{covariateSettings} argument.
#'
-#'
+#'
#' @param databaseDetails The cdm database details created using \code{createDatabaseDetails()}
-#' @param covariateSettings An object of type \code{covariateSettings} as created using the
+#' @param covariateSettings An object of type \code{covariateSettings} or a list of such objects as created using the
#' \code{createCovariateSettings} function in the
#' \code{FeatureExtraction} package.
-#' @param restrictPlpDataSettings Extra settings to apply to the target population while extracting data. Created using \code{createRestrictPlpDataSettings()}.
+#' @param restrictPlpDataSettings Extra settings to apply to the target population while extracting data.
+#' Created using \code{createRestrictPlpDataSettings()}. This is optional.
#'
#' @return
#' Returns an object of type \code{plpData}, containing information on the cohorts, their
@@ -204,103 +203,120 @@ createDatabaseDetails <- function(
#' the outcome id. Outcomes are not yet filtered based on risk window, since this is done at
#' a later stage.} \item{cohorts}{A data frame listing the persons in each cohort, listing their
#' exposure status as well as the time to the end of the observation period and time to the end of the
-#' cohort (usually the end of the exposure era).} \item{covariates}{An ffdf object listing the
+#' cohort (usually the end of the exposure era).} \item{covariates}{An Andromeda object listing the
#' baseline covariates per person in the two cohorts. This is done using a sparse representation:
-#' covariates with a value of 0 are omitted to save space.} \item{covariateRef}{An ffdf object describing the covariates that have been extracted.}
-#' \item{metaData}{A list of objects with information on how the cohortMethodData object was
-#' constructed.} } The generic \code{()} and \code{summary()} functions have been implemented for this object.
-#'
+#' covariates with a value of 0 are omitted to save space. Usually has three columns,
+#' rowId, covariateId and covariateValue.} \item{covariateRef}{An Andromeda object describing the covariates that have been extracted.}
+#' \item{AnalysisRef}{An Andromeda object with information about which analysisIds from 'FeatureExtraction' were used.} }
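+#' @examples
+#' \dontrun{
+#' # a sketch, assuming databaseDetails was created with createDatabaseDetails();
+#' # covariateSettings may also be a list of covariateSettings objects
+#' covariateSettings <- FeatureExtraction::createCovariateSettings(
+#'   useDemographicsGender = TRUE,
+#'   useDemographicsAgeGroup = TRUE
+#' )
+#' plpData <- getPlpData(
+#'   databaseDetails = databaseDetails,
+#'   covariateSettings = covariateSettings,
+#'   restrictPlpDataSettings = createRestrictPlpDataSettings()
+#' )
+#' }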
#' @export
getPlpData <- function(
- databaseDetails,
- covariateSettings,
- restrictPlpDataSettings
-) {
-
- checkIsClass(databaseDetails, 'databaseDetails')
- checkIsClass(restrictPlpDataSettings, 'restrictPlpDataSettings')
-
- if(is.null(databaseDetails$targetId))
- stop('User must input targetId')
- if(length(databaseDetails$targetId)>1)
- stop('Currently only supports one targetId at a time')
- if(is.null(databaseDetails$outcomeIds))
- stop('User must input outcomeIds')
- #ToDo: add other checks the inputs are valid
-
+ databaseDetails,
+ covariateSettings,
+ restrictPlpDataSettings = NULL) {
+ checkIsClass(databaseDetails, "databaseDetails")
+ if (!inherits(covariateSettings, "list")) {
+ checkIsClass(covariateSettings, "covariateSettings")
+ } else {
+ for (i in seq_along(covariateSettings)) {
+ checkIsClass(covariateSettings[[i]], "covariateSettings")
+ }
+ }
+ if (!is.null(restrictPlpDataSettings)) {
+ checkIsClass(restrictPlpDataSettings, "restrictPlpDataSettings")
+ }
+
+ if (is.null(databaseDetails$targetId)) {
+ stop("User must input targetId")
+ }
+ if (length(databaseDetails$targetId) > 1) {
+ stop("Currently only supports one targetId at a time")
+ }
+ if (is.null(databaseDetails$outcomeIds)) {
+ stop("User must input outcomeIds")
+ }
+  # ToDo: add other checks that the inputs are valid
+
connection <- DatabaseConnector::connect(databaseDetails$connectionDetails)
on.exit(DatabaseConnector::disconnect(connection))
dbms <- databaseDetails$connectionDetails$dbms
-
+
ParallelLogger::logTrace("\nConstructing the at risk cohort")
- if(!is.null(restrictPlpDataSettings$sampleSize)) writeLines(paste("\n Sampling ",restrictPlpDataSettings$sampleSize, " people"))
+ if (!is.null(restrictPlpDataSettings$sampleSize)) writeLines(paste("\n Sampling ", restrictPlpDataSettings$sampleSize, " people"))
pathToSql <- system.file(
- paste("sql/", "sql_server",
- sep = ""),
- 'CreateCohorts.sql',
+ paste("sql/", "sql_server",
+ sep = ""
+ ),
+ "CreateCohorts.sql",
package = "PatientLevelPrediction"
)
-
- renderedSql <- readChar(pathToSql, file.info(pathToSql)$size)
- renderedSql <- SqlRender::render(
- sql = renderedSql,
+
+ renderedSql <- readChar(pathToSql, file.info(pathToSql)$size)
+ renderArgs <- list(sql = renderedSql,
cdm_database_schema = databaseDetails$cdmDatabaseSchema,
cohort_database_schema = databaseDetails$cohortDatabaseSchema,
cohort_table = databaseDetails$cohortTable,
cdm_version = databaseDetails$cdmVersion,
- target_id = databaseDetails$targetId,
- study_start_date = restrictPlpDataSettings$studyStartDate,
- study_end_date = restrictPlpDataSettings$studyEndDate,
- first_only = restrictPlpDataSettings$firstExposureOnly,
- washout_period = restrictPlpDataSettings$washoutPeriod,
- use_sample = !is.null(restrictPlpDataSettings$sampleSize),
- sample_number = restrictPlpDataSettings$sampleSize
- )
+ target_id = databaseDetails$targetId)
+
+  if (!is.null(restrictPlpDataSettings)) {
+ renderArgs$study_start_date <- restrictPlpDataSettings$studyStartDate
+ renderArgs$study_end_date <- restrictPlpDataSettings$studyEndDate
+ renderArgs$first_only <- restrictPlpDataSettings$firstExposureOnly
+ renderArgs$washout_period <- restrictPlpDataSettings$washoutPeriod
+ renderArgs$use_sample <- !is.null(restrictPlpDataSettings$sampleSize)
+ renderArgs$sample_number <- restrictPlpDataSettings$sampleSize
+ }
+
+ renderedSql <- do.call(SqlRender::render, renderArgs)
+
renderedSql <- SqlRender::translate(
- sql = renderedSql,
- targetDialect = dbms,
+ sql = renderedSql,
+ targetDialect = dbms,
tempEmulationSchema = databaseDetails$tempEmulationSchema
)
-
+
DatabaseConnector::executeSql(connection, renderedSql)
-
+
ParallelLogger::logTrace("Fetching cohorts from server")
start <- Sys.time()
-
+
pathToSql <- system.file(
- paste("sql/", "sql_server",
- sep = ""),
- "GetCohorts.sql",
+ paste("sql/", "sql_server",
+ sep = ""
+ ),
+ "GetCohorts.sql",
package = "PatientLevelPrediction"
)
-
- cohortSql <- readChar(pathToSql, file.info(pathToSql)$size)
-
+
+ cohortSql <- readChar(pathToSql, file.info(pathToSql)$size)
+
cohortSql <- SqlRender::render(
sql = cohortSql,
cdm_version = databaseDetails$cdmVersion
)
-
+
cohortSql <- SqlRender::translate(
- sql = cohortSql,
- targetDialect = dbms,
+ sql = cohortSql,
+ targetDialect = dbms,
tempEmulationSchema = databaseDetails$tempEmulationSchema
)
cohorts <- DatabaseConnector::querySql(connection, cohortSql)
colnames(cohorts) <- SqlRender::snakeCaseToCamelCase(colnames(cohorts))
- metaData.cohort <- list(targetId = databaseDetails$targetId)
-
- if(nrow(cohorts)==0){
- stop('Target population is empty')
+ metaDataCohort <- list(targetId = databaseDetails$targetId)
+
+ if (nrow(cohorts) == 0) {
+ stop("Target population is empty")
}
-
+
delta <- Sys.time() - start
ParallelLogger::logTrace(paste("Loading cohorts took", signif(delta, 3), attr(delta, "units")))
covariateData <- FeatureExtraction::getDbCovariateData(
- connection = connection,
- oracleTempSchema = databaseDetails$tempEmulationSchema,
+ connection = connection,
+ tempEmulationSchema = databaseDetails$tempEmulationSchema,
cdmDatabaseSchema = databaseDetails$cdmDatabaseSchema,
cdmVersion = databaseDetails$cdmVersion,
cohortTable = "#cohort_person",
@@ -309,26 +325,30 @@ getPlpData <- function(
covariateSettings = covariateSettings
)
# add indexes for tidyCov + covariate summary
- Andromeda::createIndex(covariateData$covariates, c('rowId'),
- indexName = 'covariates_rowId')
- Andromeda::createIndex(covariateData$covariates, c('covariateId'),
- indexName = 'covariates_covariateId')
- Andromeda::createIndex(covariateData$covariates, c('covariateId', 'covariateValue'),
- indexName = 'covariates_covariateId_value')
-
- if(max(databaseDetails$outcomeIds)!=-999){
+ Andromeda::createIndex(covariateData$covariates, c("rowId"),
+ indexName = "covariates_rowId"
+ )
+ Andromeda::createIndex(covariateData$covariates, c("covariateId"),
+ indexName = "covariates_covariateId"
+ )
+ Andromeda::createIndex(covariateData$covariates, c("covariateId", "covariateValue"),
+ indexName = "covariates_covariateId_value"
+ )
+
+ if (max(databaseDetails$outcomeIds) != -999) {
ParallelLogger::logTrace("Fetching outcomes from server")
start <- Sys.time()
-
+
pathToSql <- system.file(
- paste("sql/", "sql_server",
- sep = ""),
- "GetOutcomes.sql",
+ paste("sql/", "sql_server",
+ sep = ""
+ ),
+ "GetOutcomes.sql",
package = "PatientLevelPrediction"
)
-
- outcomeSql <- readChar(pathToSql, file.info(pathToSql)$size)
-
+
+ outcomeSql <- readChar(pathToSql, file.info(pathToSql)$size)
+
outcomeSql <- SqlRender::render(
sql = outcomeSql,
cdm_database_schema = databaseDetails$cdmDatabaseSchema,
@@ -337,75 +357,80 @@ getPlpData <- function(
outcome_ids = databaseDetails$outcomeIds,
cdm_version = databaseDetails$cdmVersion
)
-
+
outcomeSql <- SqlRender::translate(
- sql = outcomeSql,
- targetDialect = dbms,
+ sql = outcomeSql,
+ targetDialect = dbms,
tempEmulationSchema = databaseDetails$tempEmulationSchema
)
-
+
outcomes <- DatabaseConnector::querySql(connection, outcomeSql)
colnames(outcomes) <- SqlRender::snakeCaseToCamelCase(colnames(outcomes))
- metaData.outcome <- data.frame(outcomeIds = databaseDetails$outcomeIds)
- attr(outcomes, "metaData") <- metaData.outcome
- if(nrow(outcomes)==0){
- stop('No Outcomes')
+ metaDataOutcome <- data.frame(outcomeIds = databaseDetails$outcomeIds)
+ attr(outcomes, "metaData") <- metaDataOutcome
+ if (nrow(outcomes) == 0) {
+ stop("No Outcomes")
}
-
- metaData.cohort$attrition <- getCounts2(cohorts,outcomes, "Original cohorts")
- attr(cohorts, "metaData") <- metaData.cohort
-
+
+ metaDataCohort$attrition <- getCounts2(cohorts, outcomes, "Original cohorts")
+ attr(cohorts, "metaData") <- metaDataCohort
+
delta <- Sys.time() - start
ParallelLogger::logTrace(paste("Loading outcomes took", signif(delta, 3), attr(delta, "units")))
} else {
outcomes <- NULL
}
-
-
+
+
# Remove temp tables:
pathToSql <- system.file(
- paste("sql/", "sql_server",
- sep = ""),
- "RemoveCohortTempTables.sql",
+ paste("sql/", "sql_server",
+ sep = ""
+ ),
+ "RemoveCohortTempTables.sql",
package = "PatientLevelPrediction"
)
-
- removeSql <- readChar(pathToSql, file.info(pathToSql)$size)
+
+ removeSql <- readChar(pathToSql, file.info(pathToSql)$size)
removeSql <- SqlRender::translate(
- sql = removeSql,
- targetDialect = dbms,
+ sql = removeSql,
+ targetDialect = dbms,
tempEmulationSchema = databaseDetails$tempEmulationSchema
)
-
+
DatabaseConnector::executeSql(connection, removeSql, progressBar = FALSE, reportOverallTime = FALSE)
- #DatabaseConnector::disconnect(connection)
-
+
metaData <- covariateData$metaData
metaData$databaseDetails <- databaseDetails
- metaData$databaseDetails$connectionDetails = NULL
- metaData$databaseDetails$connection = NULL
+ metaData$databaseDetails$connectionDetails <- NULL
+ metaData$databaseDetails$connection <- NULL
metaData$restrictPlpDataSettings <- restrictPlpDataSettings
metaData$covariateSettings <- covariateSettings
-
+
# create the temporal settings (if temporal use)
timeReference <- NULL
- if(!is.null(covariateSettings$temporal)){
- if(covariateSettings$temporal){
+ if (!is.null(covariateSettings$temporal)) {
+ if (covariateSettings$temporal) {
# make sure time days populated
- if(length(covariateSettings$temporalStartDays)>0){
- timeReference = data.frame(timeId=1:length(covariateSettings$temporalStartDays),
- startDay = covariateSettings$temporalStartDays,
- endDay = covariateSettings$temporalEndDays)
+ if (length(covariateSettings$temporalStartDays) > 0) {
+ timeReference <- data.frame(
+ timeId = seq_along(covariateSettings$temporalStartDays),
+ startDay = covariateSettings$temporalStartDays,
+ endDay = covariateSettings$temporalEndDays
+ )
}
- }}
-
-
- result <- list(cohorts = cohorts,
- outcomes = outcomes,
- covariateData = covariateData,
- timeRef = timeReference,
- metaData = metaData)
-
+ }
+ }
+
+
+ result <- list(
+ cohorts = cohorts,
+ outcomes = outcomes,
+ covariateData = covariateData,
+ timeRef = timeReference,
+ metaData = metaData
+ )
+
class(result) <- "plpData"
return(result)
}
@@ -421,22 +446,26 @@ print.plpData <- function(x, ...) {
#' @method summary plpData
#' @export
-summary.plpData <- function(object,...){
+summary.plpData <- function(object, ...) {
people <- length(unique(object$cohorts$subjectId))
- outcomeCounts <- data.frame(outcomeId = attr(object$outcomes, "metaData")$outcomeIds,
- eventCount = 0,
- personCount = 0)
- for (i in 1:nrow(outcomeCounts)) {
+ outcomeCounts <- data.frame(
+ outcomeId = attr(object$outcomes, "metaData")$outcomeIds,
+ eventCount = 0,
+ personCount = 0
+ )
+ for (i in seq_len(nrow(outcomeCounts))) {
outcomeCounts$eventCount[i] <- sum(object$outcomes$outcomeId == attr(object$outcomes, "metaData")$outcomeIds[i])
outcomeCounts$personCount[i] <- length(unique(object$outcomes$rowId[object$outcomes$outcomeId == attr(object$outcomes, "metaData")$outcomeIds[i]]))
}
-
+
covDetails <- FeatureExtraction::summary(object$covariateData)
- result <- list(metaData = append(append(object$metaData, attr(object$cohorts, "metaData")), attr(object$outcomes, "metaData")),
- people = people,
- outcomeCounts = outcomeCounts,
- covariateCount = covDetails$covariateCount,
- covariateValueCount = covDetails$covariateValueCount)
+ result <- list(
+ metaData = append(append(object$metaData, attr(object$cohorts, "metaData")), attr(object$outcomes, "metaData")),
+ people = people,
+ outcomeCounts = outcomeCounts,
+ covariateCount = covDetails$covariateCount,
+ covariateValueCount = covDetails$covariateValueCount
+ )
class(result) <- "summary.plpData"
return(result)
}
diff --git a/R/FeatureEngineering.R b/R/FeatureEngineering.R
index a5f4bc1ca..aca2dfb7f 100644
--- a/R/FeatureEngineering.R
+++ b/R/FeatureEngineering.R
@@ -2,13 +2,13 @@
# Copyright 2021 Observational Health Data Sciences and Informatics
#
# This file is part of PatientLevelPrediction
-#
+#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
-#
+#
# http://www.apache.org/licenses/LICENSE-2.0
-#
+#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -16,29 +16,30 @@
# limitations under the License.
-featureEngineer <- function(data, featureEngineeringSettings){
-
- ParallelLogger::logInfo('Starting Feature Engineering')
-
+featureEngineer <- function(data, featureEngineeringSettings) {
+ start <- Sys.time()
+ ParallelLogger::logInfo("Starting Feature Engineering")
+
# if a single setting, make it a list
- if(inherits(featureEngineeringSettings, 'featureEngineeringSettings')){
+ if (inherits(featureEngineeringSettings, "featureEngineeringSettings")) {
featureEngineeringSettings <- list(featureEngineeringSettings)
}
-
- for(featureEngineeringSetting in featureEngineeringSettings){
+
+ for (featureEngineeringSetting in featureEngineeringSettings) {
fun <- attr(featureEngineeringSetting, "fun")
- args <- list(trainData = data,
- featureEngineeringSettings = featureEngineeringSetting)
- ParallelLogger::logInfo(paste0('Applying ',fun))
+ args <- list(
+ trainData = data,
+ featureEngineeringSettings = featureEngineeringSetting
+ )
+ ParallelLogger::logInfo(paste0("Applying ", fun))
data <- do.call(eval(parse(text = fun)), args)
}
-
- attr(data, 'metaData')$featureEngineeringSettings <- featureEngineeringSettings
-
- ParallelLogger::logInfo('Done Feature Engineering')
-
+
+ attr(data, "metaData")$featureEngineeringSettings <- featureEngineeringSettings
+ delta <- Sys.time() - start
+ ParallelLogger::logInfo("Feature Engineering completed in ",
+ signif(delta, 3), " ", attr(delta, "units"))
return(data)
-
}
#' Create the settings for defining any feature engineering that will be done
@@ -47,22 +48,20 @@ featureEngineer <- function(data, featureEngineeringSettings){
#' Returns an object of class \code{featureEngineeringSettings} that specifies the sampling function that will be called and the settings
#'
#' @param type (character) Choice of: \itemize{
-#' \item'none' No feature engineering - this is the default
-#' }
+#' \item'none' No feature engineering - this is the default
+#' }
#'
#' @return
#' An object of class \code{featureEngineeringSettings}
#' @export
-createFeatureEngineeringSettings <- function(type = 'none'){
-
+createFeatureEngineeringSettings <- function(type = "none") {
featureEngineeringSettings <- list()
-
- if(type == 'none'){
+
+ if (type == "none") {
attr(featureEngineeringSettings, "fun") <- "sameData"
}
class(featureEngineeringSettings) <- "featureEngineeringSettings"
return(featureEngineeringSettings)
-
}
@@ -76,22 +75,29 @@ createFeatureEngineeringSettings <- function(type = 'none'){
#' @return
#' An object of class \code{featureEngineeringSettings}
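+#' @examples
+#' \dontrun{
+#' # a sketch; requires reticulate with scikit-learn available
+#' featureSelection <- createUnivariateFeatureSelection(k = 100)
+#' }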
#' @export
-createUnivariateFeatureSelection <- function(k = 100){
-
- if (inherits(k, 'numeric')) {
+createUnivariateFeatureSelection <- function(k = 100) {
+ if (inherits(k, "numeric")) {
k <- as.integer(k)
}
-
- checkIsClass(k, 'integer')
+ rlang::check_installed(
+ "reticulate",
+ reason = "This function requires the reticulate package to be installed"
+ )
+ tryCatch({
+ reticulate::import("sklearn")
+ }, error = function(e) {
+ stop("This function requires the scikit-learn package to be installed")
+ })
+
+ checkIsClass(k, "integer")
checkHigherEqual(k, 0)
-
- featureEngineeringSettings <- list(k = k)
-
+
+ featureEngineeringSettings <- list(k = k)
+
attr(featureEngineeringSettings, "fun") <- "univariateFeatureSelection"
class(featureEngineeringSettings) <- "featureEngineeringSettings"
-
+
return(featureEngineeringSettings)
-
}
#' Create the settings for random forest based feature selection
@@ -105,21 +111,29 @@ createUnivariateFeatureSelection <- function(k = 100){
#' @return
#' An object of class \code{featureEngineeringSettings}
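+#' @examples
+#' \dontrun{
+#' # a sketch; requires reticulate with scikit-learn available
+#' rfFeatureSelection <- createRandomForestFeatureSelection(ntrees = 500, maxDepth = 10)
+#' }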
#' @export
-createRandomForestFeatureSelection <- function(ntrees = 2000, maxDepth = 17){
-
- checkIsClass(ntrees, c('numeric','integer'))
- checkIsClass(maxDepth, c('numeric','integer'))
+createRandomForestFeatureSelection <- function(ntrees = 2000, maxDepth = 17) {
+ rlang::check_installed(
+ "reticulate",
+ reason = "This function requires the reticulate package to be installed"
+ )
+ tryCatch({
+ reticulate::import("sklearn")
+ }, error = function(e) {
+ stop("This function requires the scikit-learn package to be installed")
+ })
+ checkIsClass(ntrees, c("numeric", "integer"))
+ checkIsClass(maxDepth, c("numeric", "integer"))
checkHigher(ntrees, 0)
checkHigher(maxDepth, 0)
-
+
featureEngineeringSettings <- list(
ntrees = ntrees,
max_depth = maxDepth
- )
-
+ )
+
attr(featureEngineeringSettings, "fun") <- "randomForestFeatureSelection"
class(featureEngineeringSettings) <- "featureEngineeringSettings"
-
+
return(featureEngineeringSettings)
}
@@ -130,7 +144,7 @@ createRandomForestFeatureSelection <- function(ntrees = 2000, maxDepth = 17){
#'
#' @param continousCovariateId The covariateId to apply splines to
#' @param knots Either number of knots of vector of split values
-#' @param analysisId The analysisId to use for the spline covariates
+#' @param analysisId The analysisId to use for the spline covariates
#'
#' @return
#' An object of class \code{featureEngineeringSettings}
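+#' @examples
+#' # a sketch using a hypothetical continuous covariateId and 5 knots
+#' splineSettings <- createSplineSettings(continousCovariateId = 12101, knots = 5)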
@@ -138,34 +152,29 @@ createRandomForestFeatureSelection <- function(ntrees = 2000, maxDepth = 17){
createSplineSettings <- function(
continousCovariateId,
knots,
- analysisId = 683
- ){
-
- checkIsClass(continousCovariateId, c('numeric','integer'))
- checkIsClass(knots, c('numeric','integer'))
-
+ analysisId = 683) {
+ checkIsClass(continousCovariateId, c("numeric", "integer"))
+ checkIsClass(knots, c("numeric", "integer"))
+
featureEngineeringSettings <- list(
continousCovariateId = continousCovariateId,
knots = knots,
analysisId = analysisId
)
-
+
attr(featureEngineeringSettings, "fun") <- "splineCovariates"
class(featureEngineeringSettings) <- "featureEngineeringSettings"
-
+
return(featureEngineeringSettings)
}
-
+
splineCovariates <- function(
- trainData,
+ trainData,
featureEngineeringSettings,
- knots = NULL
- ){
-
- ParallelLogger::logInfo('Starting splineCovariates')
-
- if(is.null(knots)){
-
+ knots = NULL) {
+ ParallelLogger::logInfo("Starting splineCovariates")
+
+ if (is.null(knots)) {
if (length(featureEngineeringSettings$knots) == 1) {
measurements <- trainData$covariateData$covariates %>%
dplyr::filter(.data$covariateId == !!featureEngineeringSettings$continousCovariateId) %>%
@@ -175,112 +184,109 @@ splineCovariates <- function(
} else {
knots <- featureEngineeringSettings$knots
}
-
}
-
- # apply the spline mapping
+
+ # apply the spline mapping
trainData <- splineMap(
data = trainData,
covariateId = featureEngineeringSettings$continousCovariateId,
analysisId = featureEngineeringSettings$analysisId,
knots = knots
- )
+ )
featureEngineering <- list(
- funct = 'splineCovariates',
+ funct = "splineCovariates",
settings = list(
featureEngineeringSettings = featureEngineeringSettings,
knots = knots
)
)
-
+
# add the feature engineering in
- attr(trainData, 'metaData')$featureEngineering = listAppend(
- attr(trainData, 'metaData')$featureEngineering,
+ attr(trainData, "metaData")$featureEngineering <- listAppend(
+ attr(trainData, "metaData")$featureEngineering,
featureEngineering
)
- ParallelLogger::logInfo('Finished splineCovariates')
-
+ ParallelLogger::logInfo("Finished splineCovariates")
+
return(trainData)
}
# create the spline map to add spline columns
splineMap <- function(
- data,
+ data,
covariateId,
analysisId,
- knots
-){
-
- ParallelLogger::logInfo('Starting splineMap')
+ knots) {
+ ParallelLogger::logInfo("Starting splineMap")
measurements <- data$covariateData$covariates %>%
dplyr::filter(.data$covariateId == !!covariateId) %>%
as.data.frame()
-
+
designMatrix <- splines::bs(
- x = measurements$covariateValue,#knots[1]:knots[length(knots)],
+ x = measurements$covariateValue,
knots = knots[2:(length(knots) - 1)],
Boundary.knots = knots[c(1, length(knots))]
)
-
+
data$covariateData$covariates <- data$covariateData$covariates %>%
dplyr::filter(.data$covariateId != !!covariateId)
-
+
# get the covariate name
details <- data$covariateData$covariateRef %>%
dplyr::filter(.data$covariateId == !!covariateId) %>%
as.data.frame()
covariateName <- details$covariateName
-
+
data$covariateData$covariateRef <- data$covariateData$covariateRef %>%
dplyr::filter(.data$covariateId != !!covariateId)
-
+
# remove last 3 numbers as this was old analysis id
- covariateId <- floor(covariateId/1000)
-
+ covariateId <- floor(covariateId / 1000)
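+  # e.g. an old covariateId of 1002101 (conceptId 1002, old analysisId 101)
+  # becomes 1002; spline component i below is then stored with covariateId
+  # 1002 * 10000 + i * 1000 + analysisId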
+
# add the spline columns
- for(i in 1:ncol(designMatrix)){
+ for (i in 1:ncol(designMatrix)) {
Andromeda::appendToTable(
- tbl = data$covariateData$covariates,
+ tbl = data$covariateData$covariates,
data = data.frame(
rowId = measurements$rowId,
- covariateId = covariateId*10000+i*1000+analysisId,
- covariateValue = designMatrix[,i]
+ covariateId = covariateId * 10000 + i * 1000 + analysisId,
+ covariateValue = designMatrix[, i]
)
)
}
-
+
# add the covariates to the ref table
Andromeda::appendToTable(
tbl = data$covariateData$covariateRef,
data = data.frame(
- covariateId = covariateId*10000+(1:(ncol(designMatrix)))*1000+analysisId,
+ covariateId = covariateId * 10000 + (1:(ncol(designMatrix))) * 1000 + analysisId,
covariateName = paste(
- paste0(covariateName," spline component "),
+ paste0(covariateName, " spline component "),
1:ncol(designMatrix)
),
conceptId = 0,
analysisId = analysisId
)
)
-
+
# add analysisRef for the first time a spline is added
analysisRef <- data$covariateData$analysisRef %>% as.data.frame()
- if(!analysisId %in% analysisRef$analysisId){
+ if (!analysisId %in% analysisRef$analysisId) {
Andromeda::appendToTable(
tbl = data$covariateData$analysisRef,
data = data.frame(
analysisId = analysisId,
- analysisName = 'splines',
- domainId = 'feature engineering',
+ analysisName = "splines",
+ domainId = "feature engineering",
startDay = 0,
endDay = 0,
- isBinary = 'N',
- missingMeansZero = 'N'
+ isBinary = "N",
+ missingMeansZero = "N"
)
)
}
- ParallelLogger::logInfo('Finished splineMap')
+ ParallelLogger::logInfo("Finished splineMap")
return(data)
}
@@ -299,251 +305,222 @@ splineMap <- function(
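+#' @examples
+#' # a sketch using a hypothetical measurement covariateId, imputing within
+#' # strata defined by gender and age groups split at 30, 50 and 70 years
+#' imputationSettings <- createStratifiedImputationSettings(
+#'   covariateId = 3020891210,
+#'   ageSplits = c(30, 50, 70)
+#' )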
#' @export
createStratifiedImputationSettings <- function(
covariateId,
- ageSplits = NULL
-){
-
- checkIsClass(covariateId, c('numeric','integer'))
- checkIsClass(ageSplits, c('numeric','integer'))
-
+ ageSplits = NULL) {
+ checkIsClass(covariateId, c("numeric", "integer"))
+ checkIsClass(ageSplits, c("numeric", "integer"))
+
featureEngineeringSettings <- list(
covariateId = covariateId,
ageSplits = ageSplits
)
-
+
attr(featureEngineeringSettings, "fun") <- "stratifiedImputeCovariates"
class(featureEngineeringSettings) <- "featureEngineeringSettings"
-
+
return(featureEngineeringSettings)
}
stratifiedImputeCovariates <- function(
- trainData,
+ trainData,
featureEngineeringSettings,
- stratifiedMeans = NULL
-){
-
- if(is.null(stratifiedMeans)){
-
+ stratifiedMeans = NULL) {
+ if (is.null(stratifiedMeans)) {
stratifiedMeans <- calculateStratifiedMeans(
trainData = trainData,
featureEngineeringSettings = featureEngineeringSettings
)
-
}
-
+
trainData <- imputeMissingMeans(
- trainData = trainData,
+ trainData = trainData,
covariateId = featureEngineeringSettings$covariateId,
ageSplits = featureEngineeringSettings$ageSplits,
stratifiedMeans = stratifiedMeans
- )
-
+ )
+
return(trainData)
}
calculateStratifiedMeans <- function(
trainData,
- featureEngineeringSettings
-){
- if(is.null(featureEngineeringSettings$ageSplits)){
- trainData$cohorts$ageGroup <- floor(trainData$cohorts$ageYear/5)
- } else{
+ featureEngineeringSettings) {
+ if (is.null(featureEngineeringSettings$ageSplits)) {
+ trainData$cohorts$ageGroup <- floor(trainData$cohorts$ageYear / 5)
+ } else {
trainData$cohorts$ageGroup <- rep(0, length(trainData$cohorts$ageYear))
- for(i in 1:length(featureEngineeringSettings$ageSplits)){
+ for (i in seq_along(featureEngineeringSettings$ageSplits)) {
trainData$cohorts$ageGroup[trainData$cohorts$ageYear > featureEngineeringSettings$ageSplits[i]] <- i
}
}
-
- trainData$covariateData$cohorts <- trainData$cohorts[,c('rowId', 'ageGroup', 'gender')]
-
+
+ trainData$covariateData$cohorts <- trainData$cohorts[, c("rowId", "ageGroup", "gender")]
+
stratifiedMeans <- trainData$covariateData$covariates %>%
dplyr::filter(.data$covariateId == !!featureEngineeringSettings$covariateId) %>%
dplyr::inner_join(
- y = trainData$covariateData$cohorts,
- by = c('rowId')
+ y = trainData$covariateData$cohorts,
+ by = c("rowId")
) %>%
dplyr::group_by(.data$ageGroup, .data$gender) %>%
dplyr::summarise(covariateValue = mean(.data$covariateValue, na.rm = TRUE)) %>%
as.data.frame()
-
+
return(stratifiedMeans)
}
imputeMissingMeans <- function(
- trainData,
- covariateId,
- ageSplits,
- stratifiedMeans
-){
-
- if(is.null(ageSplits)){
- trainData$cohorts$ageGroup <- floor(trainData$cohorts$ageYear/5)
- } else{
+ trainData,
+ covariateId,
+ ageSplits,
+ stratifiedMeans) {
+ if (is.null(ageSplits)) {
+ trainData$cohorts$ageGroup <- floor(trainData$cohorts$ageYear / 5)
+ } else {
trainData$cohorts$ageGroup <- rep(0, length(trainData$cohorts$ageYear))
- for(i in 1:length(ageSplits)){
+ for (i in seq_along(ageSplits)) {
trainData$cohorts$ageGroup[trainData$cohorts$ageYear > ageSplits[i]] <- i
}
}
-
+
rowIdsWithValues <- trainData$covariateData$covariates %>%
- dplyr::filter(.data$covariateId == !! covariateId) %>%
- dplyr::select('rowId') %>%
+ dplyr::filter(.data$covariateId == !!covariateId) %>%
+ dplyr::select("rowId") %>%
dplyr::pull()
rowIdsWithMissingValues <- trainData$cohorts$rowId[!trainData$cohorts$rowId %in% rowIdsWithValues]
-
- imputedData <- trainData$cohorts %>%
+
+ imputedData <- trainData$cohorts %>%
dplyr::filter(.data$rowId %in% rowIdsWithMissingValues) %>%
- dplyr::select('rowId', 'ageGroup', 'gender') %>%
+ dplyr::select("rowId", "ageGroup", "gender") %>%
dplyr::left_join(
- y = stratifiedMeans,
- by = c('ageGroup', 'gender')
- ) %>%
+ y = stratifiedMeans,
+ by = c("ageGroup", "gender")
+ ) %>%
dplyr::mutate(
covariateId = !!covariateId,
covariateValue = .data$covariateValue
- ) %>%
- dplyr::select('rowId', 'covariateId', 'covariateValue')
-
+ ) %>%
+ dplyr::select("rowId", "covariateId", "covariateValue")
+
Andromeda::appendToTable(
- tbl = trainData$covariateData$covariates,
+ tbl = trainData$covariateData$covariates,
data = imputedData
)
-
+
return(trainData)
}
univariateFeatureSelection <- function(
- trainData,
- featureEngineeringSettings,
- covariateIdsInclude = NULL){
-
- if(is.null(covariateIdsInclude)){
- #convert data into matrix:
+ trainData,
+ featureEngineeringSettings,
+ covariateIdsInclude = NULL) {
+ if (is.null(covariateIdsInclude)) {
+ # convert data into matrix:
mappedData <- toSparseM(trainData, trainData$labels)
-
+
matrixData <- mappedData$dataMatrix
labels <- mappedData$labels
covariateMap <- mappedData$covariateMap
-
+
X <- reticulate::r_to_py(matrixData)
- y <- reticulate::r_to_py(labels[,'outcomeCount'])
-
- np <- reticulate::import('numpy')
- os <- reticulate::import('os')
- sys <- reticulate::import('sys')
- math <- reticulate::import('math')
- scipy <- reticulate::import('scipy')
-
- sklearn <- reticulate::import('sklearn')
-
+ y <- reticulate::r_to_py(labels[, "outcomeCount"])
+
+ np <- reticulate::import("numpy")
+ sklearn <- reticulate::import("sklearn")
+
SelectKBest <- sklearn$feature_selection$SelectKBest
chi2 <- sklearn$feature_selection$chi2
-
+
kbest <- SelectKBest(chi2, k = featureEngineeringSettings$k)$fit(X, y$outcomeCount)
kbest$scores_ <- np$nan_to_num(kbest$scores_)
# taken from sklearn code, matches the application during transform call
k <- featureEngineeringSettings$k
- mask <- np$zeros(length(kbest$scores_), dtype='bool')
- mask[np$argsort(kbest$scores_, kind="mergesort")+1][(length(kbest$scores_)-k+1):length(kbest$scores_)] <- TRUE
-
- covariateIdsInclude <- covariateMap[mask,]$covariateId
+ mask <- np$zeros(length(kbest$scores_), dtype = "bool")
+ mask[np$argsort(kbest$scores_, kind = "mergesort") + 1][(length(kbest$scores_) - k + 1):length(kbest$scores_)] <- TRUE
+
+ covariateIdsInclude <- covariateMap[mask, ]$covariateId
}
-
- trainData$covariateData$covariates <- trainData$covariateData$covariates %>%
+
+ trainData$covariateData$covariates <- trainData$covariateData$covariates %>%
dplyr::filter(.data$covariateId %in% covariateIdsInclude)
-
- trainData$covariateData$covariateRef <- trainData$covariateData$covariateRef %>%
+
+ trainData$covariateData$covariateRef <- trainData$covariateData$covariateRef %>%
dplyr::filter(.data$covariateId %in% covariateIdsInclude)
-
+
featureEngineering <- list(
- funct = 'univariateFeatureSelection',
+ funct = "univariateFeatureSelection",
settings = list(
featureEngineeringSettings = featureEngineeringSettings,
covariateIdsInclude = covariateIdsInclude
)
)
-
- attr(trainData, 'metaData')$featureEngineering = listAppend(
- attr(trainData, 'metaData')$featureEngineering,
+
+ attr(trainData, "metaData")$featureEngineering <- listAppend(
+ attr(trainData, "metaData")$featureEngineering,
featureEngineering
)
-
+
return(trainData)
-
}
randomForestFeatureSelection <- function(
- trainData,
- featureEngineeringSettings,
- covariateIdsInclude = NULL
-){
-
- if(is.null(covariateIdsInclude)){
- #convert data into matrix:
+ trainData,
+ featureEngineeringSettings,
+ covariateIdsInclude = NULL) {
+ if (is.null(covariateIdsInclude)) {
+ # convert data into matrix:
mappedData <- toSparseM(trainData)
-
+
matrixData <- mappedData$dataMatrix
labels <- mappedData$labels
covariateMap <- mappedData$covariateMap
-
+
X <- reticulate::r_to_py(matrixData)
- y <- reticulate::r_to_py(matrix(labels$outcomeCount, ncol=1))
-
- np <- reticulate::import('numpy')
- os <- reticulate::import('os')
- sys <- reticulate::import('sys')
- math <- reticulate::import('math')
- scipy <- reticulate::import('scipy')
-
- sklearn <- reticulate::import('sklearn')
-
- ntrees = featureEngineeringSettings$ntrees #2000
- max_depth = featureEngineeringSettings$max_depth #17
-
- rf = sklearn$ensemble$RandomForestClassifier(
- max_features = 'sqrt',
+ y <- reticulate::r_to_py(matrix(labels$outcomeCount, ncol = 1))
+
+ sklearn <- reticulate::import("sklearn")
+
+ ntrees <- featureEngineeringSettings$ntrees # 2000
+ max_depth <- featureEngineeringSettings$max_depth # 17
+
+ rf <- sklearn$ensemble$RandomForestClassifier(
+ max_features = "sqrt",
n_estimators = as.integer(ntrees),
max_depth = as.integer(max_depth),
- min_samples_split = as.integer(2),
+ min_samples_split = as.integer(2),
      random_state = as.integer(10), # make this an input for consistency
- n_jobs = as.integer(-1),
- bootstrap = F
+ n_jobs = as.integer(-1),
+ bootstrap = FALSE
)
-
- rf = rf$fit(X, y$ravel())
-
- inc <- rf$feature_importances_ > 0
-
+
+ rf <- rf$fit(X, y$ravel())
+
+ inc <- rf$feature_importances_ > 0
+
covariateIdsInclude <- covariateMap$covariateId[inc]
- }
-
- trainData$covariateData$covariates <- trainData$covariateData$covariates %>%
+ }
+
+ trainData$covariateData$covariates <- trainData$covariateData$covariates %>%
dplyr::filter(.data$covariateId %in% covariateIdsInclude)
-
- trainData$covariateData$covariateRef <- trainData$covariateData$covariateRef %>%
+
+ trainData$covariateData$covariateRef <- trainData$covariateData$covariateRef %>%
dplyr::filter(.data$covariateId %in% covariateIdsInclude)
-
-
+
+
  featureEngineering <- list(
- funct = 'randomForestFeatureSelection',
+ funct = "randomForestFeatureSelection",
settings = list(
featureEngineeringSettings = featureEngineeringSettings,
covariateIdsInclude = covariateIdsInclude
)
)
-
- attr(trainData, 'metaData')$featureEngineering = listAppend(
- attr(trainData, 'metaData')$featureEngineering,
+
+ attr(trainData, "metaData")$featureEngineering <- listAppend(
+ attr(trainData, "metaData")$featureEngineering,
    featureEngineering
)
-
+
return(trainData)
-
}
-
-
-
-
diff --git a/R/FeatureImportance.R b/R/FeatureImportance.R
index 6c73b1223..ddd1ff013 100644
--- a/R/FeatureImportance.R
+++ b/R/FeatureImportance.R
@@ -1,18 +1,18 @@
# permutation feature importance - use parallel computing...
-##Input: Trained model f, feature matrix X, target vector y, error measure L(y,f).
+## Input: Trained model f, feature matrix X, target vector y, error measure L(y,f).
-##Estimate the original model error eorig = L(y, f(X)) (e.g. mean squared error)
-##For each feature j = 1,...,p do:
+## Estimate the original model error eorig = L(y, f(X)) (e.g. mean squared error)
+## For each feature j = 1,...,p do:
## Generate feature matrix Xperm by permuting feature j in the data X. This breaks the association between feature j and true outcome y.
-##Estimate error eperm = L(Y,f(Xperm)) based on the predictions of the permuted data.
-##Calculate permutation feature importance FIj= eperm/eorig. Alternatively, the difference can be used: FIj = eperm - eorig
-##Sort features by descending FI.
+## Estimate error eperm = L(y, f(Xperm)) based on the predictions of the permuted data.
+## Calculate permutation feature importance FIj = eperm / eorig. Alternatively, the difference can be used: FIj = eperm - eorig.
+## Sort features by descending FI.
#' pfi
#'
#' @description
-#' Calculate the permutation feature importance for a PLP model.
+#' Calculate the permutation feature importance for a PLP model.
#' @details
#' The function permutes each covariate/feature \code{repeats} times and calculates the mean AUC change caused by the permutation.
#' @param plpResult An object of type \code{runPlp}
@@ -24,7 +24,7 @@
#' @param cores Number of cores to use when running this (it runs in parallel)
#' @param log A location to save the log for running pfi
#' @param logthreshold The log threshold (e.g., INFO, TRACE, ...)
-#'
+#'
#' @return
#' A dataframe with the covariateIds and the pfi (change in AUC caused by permuting the covariate) value
#'
@@ -32,100 +32,106 @@
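+#' @examples
+#' \dontrun{
+#' # a sketch, assuming plpResult from runPlp(), a matching study population
+#' # from createStudyPopulation() and the plpData used to fit the model
+#' varImp <- pfi(plpResult, population, plpData, repeats = 2, cores = 2)
+#' }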
#' @export
pfi <- function(plpResult, population, plpData, repeats = 1,
covariates = NULL, cores = NULL, log = NULL,
- logthreshold = "INFO"){
-
-
- if(!is.null(log)){
- appender <- ParallelLogger::createFileAppender(layout = ParallelLogger::layoutParallel,
- fileName = log)
-
- logger <- ParallelLogger::createLogger(name = "PAR",
- threshold = logthreshold,
- appenders = list(appender))
- ParallelLogger::registerLogger(logger)
+ logthreshold = "INFO") {
+ if (!is.null(log)) {
+ appender <- ParallelLogger::createFileAppender(
+ layout = ParallelLogger::layoutParallel,
+ fileName = log
+ )
+
+ logger <- ParallelLogger::createLogger(
+ name = "PAR",
+ threshold = logthreshold,
+ appenders = list(appender)
+ )
+ ParallelLogger::registerLogger(logger)
}
-
-
- if(is.null(covariates)){
- covariates <- plpResult$model$covariateImportance %>%
- dplyr::filter(.data$covariateValue != 0) %>%
- dplyr::select("covariateId") %>%
+
+
+ if (is.null(covariates)) {
+ covariates <- plpResult$model$covariateImportance %>%
+ dplyr::filter(.data$covariateValue != 0) %>%
+ dplyr::select("covariateId") %>%
dplyr::pull()
}
-
+
# add code to format covariateData based on plpModel
-
- if(!is.null(plpResult$model$preprocessing$featureEngineering)){
+
+ if (!is.null(plpResult$model$preprocessing$featureEngineering)) {
# do feature engineering/selection
- ParallelLogger::logInfo('Running FE in model')
+ ParallelLogger::logInfo("Running FE in model")
plpData <- do.call(
- applyFeatureengineering,
- list(plpData = plpData,
- settings = plpResult$model$preprocessing$featureEngineering
+ applyFeatureengineering,
+ list(
+ plpData = plpData,
+ settings = plpResult$model$preprocessing$featureEngineering
)
)
- } else{
- ParallelLogger::logInfo('No FE in model')
+ } else {
+ ParallelLogger::logInfo("No FE in model")
}
-
- if(!is.null(plpResult$model$preprocessing$tidyCovariates)){
+
+ if (!is.null(plpResult$model$preprocessing$tidyCovariates)) {
# do preprocessing
- ParallelLogger::logInfo('Applying data tidying done in model')
+ ParallelLogger::logInfo("Applying data tidying done in model")
plpData$covariateData <- do.call(
- applyTidyCovariateData,
- list(covariateData = plpData$covariateData,
- preprocessSettings = plpResult$model$preprocessing$tidyCovariates
+ applyTidyCovariateData,
+ list(
+ covariateData = plpData$covariateData,
+ preprocessSettings = plpResult$model$preprocessing$tidyCovariates
)
)
- } else{
- ParallelLogger::logInfo('No data tidying done in model')
+ } else {
+ ParallelLogger::logInfo("No data tidying done in model")
}
-
+
# apply prediction function
pred <- do.call(
- attr(plpResult$model, "predictionFunction"),
+ attr(plpResult$model, "predictionFunction"),
list(
- plpModel = plpResult$model,
- data = plpData,
+ plpModel = plpResult$model,
+ data = plpData,
cohort = population
)
)
-
+
auc <- computeAuc(pred)
-
- #do permulation and savePlpData to temp loc
- plpDataLocation <- file.path(tempdir(), paste0('data'))
- #plpDataLocation <- file.path('D:/testing', paste0('data'))
+
+  # do permutation and savePlpData to a temporary location
+  plpDataLocation <- file.path(tempdir(), "data")
savePlpData(plpData, file = plpDataLocation)
-
- if(is.null(cores)){
- ensure_installed('parallel')
- ParallelLogger::logInfo(paste0('Number of cores not specified'))
+
+ if (is.null(cores)) {
+ rlang::check_installed("parallel")
+ ParallelLogger::logInfo(paste0("Number of cores not specified"))
cores <- parallel::detectCores()
- ParallelLogger::logInfo(paste0('Using all ', cores))
- ParallelLogger::logInfo(paste0('Set cores input to use fewer...'))
+ ParallelLogger::logInfo(paste0("Using all ", cores))
+ ParallelLogger::logInfo(paste0("Set cores input to use fewer..."))
}
getVpiSettings <- function(i) {
- result <- list(plpModel = plpResult$model,
- population = population,
- plpDataLocation = plpDataLocation,
- covariateId = covariates[i],
- repeats = repeats)
+ result <- list(
+ plpModel = plpResult$model,
+ population = population,
+ plpDataLocation = plpDataLocation,
+ covariateId = covariates[i],
+ repeats = repeats
+ )
return(result)
}
if (cores > 1) {
- cluster <- ParallelLogger::makeCluster(numberOfThreads = cores)
- ParallelLogger::clusterRequire(cluster, c("PatientLevelPrediction", "Andromeda"))
-
- vpiSettings <- lapply(1:length(covariates), getVpiSettings)
-
- aucP <- ParallelLogger::clusterApply(cluster = cluster,
- x = vpiSettings,
- fun = permutePerf,
- stopOnError = FALSE,
- progressBar = TRUE)
- ParallelLogger::stopCluster(cluster)
-
+ cluster <- ParallelLogger::makeCluster(numberOfThreads = cores)
+ ParallelLogger::clusterRequire(cluster, c("PatientLevelPrediction", "Andromeda"))
+
+    vpiSettings <- lapply(seq_along(covariates), getVpiSettings)
+
+ aucP <- ParallelLogger::clusterApply(
+ cluster = cluster,
+ x = vpiSettings,
+ fun = permutePerf,
+ stopOnError = FALSE,
+ progressBar = TRUE
+ )
+ ParallelLogger::stopCluster(cluster)
} else {
ParallelLogger::logInfo("Running in serial")
aucP <- lapply(1:length(covariates), function(i) {
@@ -133,42 +139,47 @@ pfi <- function(plpResult, population, plpData, repeats = 1,
})
}
aucP <- do.call(c, aucP)
- varImp <- data.frame(covariateId = covariates,
- pfi = auc - aucP)
+ varImp <- data.frame(
+ covariateId = covariates,
+ pfi = auc - aucP
+ )
return(varImp)
-
}
# function to take a plpModel and a plpData location, load the data, permute it and then calculate the auc
-permutePerf <- function(settings){
-
+permutePerf <- function(settings) {
auc <- c()
- for(i in 1:settings$repeats){
- ParallelLogger::logInfo(paste0('Starting to permute data for covariate: ',settings$covariateId))
- plpData <- tryCatch({suppressWarnings(permute(settings$plpDataLocation, settings$covariateId,
- settings$population ))},
- warning = function(war) {
- ParallelLogger::logInfo(paste0('a warning: ', war))
- },
- error = function(err) {
- ParallelLogger::logError(paste0('an error: ', err))
- return(NULL)
- }
+  for (i in seq_len(settings$repeats)) {
+ ParallelLogger::logInfo(paste0("Starting to permute data for covariate: ", settings$covariateId))
+ plpData <- tryCatch(
+ {
+ suppressWarnings(permute(
+ settings$plpDataLocation, settings$covariateId,
+ settings$population
+ ))
+ },
+ warning = function(war) {
+ ParallelLogger::logInfo(paste0("a warning: ", war))
+ },
+ error = function(err) {
+ ParallelLogger::logError(paste0("an error: ", err))
+ return(NULL)
+ }
)
-
- if(is.null(plpData)){
- ParallelLogger::logInfo(paste0('plpData NULL for covariate: ',settings$covariateId))
+
+ if (is.null(plpData)) {
+ ParallelLogger::logInfo(paste0("plpData NULL for covariate: ", settings$covariateId))
return(0)
}
- ParallelLogger::logInfo(paste0('Calculating prediction for permutation of covariate: ',settings$covariateId))
-
+ ParallelLogger::logInfo(paste0("Calculating prediction for permutation of covariate: ", settings$covariateId))
+
# need to stop preprocessing and do it once...
pred <- do.call(
- attr(settings$plpModel, "predictionFunction"),
+ attr(settings$plpModel, "predictionFunction"),
list(
- plpModel = settings$plpModel,
- data = plpData,
+ plpModel = settings$plpModel,
+ data = plpData,
cohort = settings$population
)
)
@@ -177,74 +188,72 @@ permutePerf <- function(settings){
auc <- c(auc, auct)
}
return(mean(auc))
-
}
-permute <- function(plpDataLocation,cId,population){
-
+permute <- function(plpDataLocation, cId, population) {
plpData <- suppressWarnings(PatientLevelPrediction::loadPlpData(plpDataLocation))
-
- #get analysisId
- aId <- plpData$covariateData$covariateRef %>%
+
+ # get analysisId
+ aId <- plpData$covariateData$covariateRef %>%
dplyr::filter(.data$covariateId == !!cId) %>%
- dplyr::select("analysisId") %>% dplyr::collect()
-
+ dplyr::select("analysisId") %>%
+ dplyr::collect()
+
# if analysis id is not 3 (age group), 4 (race) or 5 (ethnicity)
- if(!aId$analysisId %in% c(3,4,5)){
-
+ if (!aId$analysisId %in% c(3, 4, 5)) {
# select covariateId data
- coi <- plpData$covariateData$covariates %>% dplyr::filter(.data$covariateId == !!cId) %>% dplyr::collect()
+ coi <- plpData$covariateData$covariates %>%
+ dplyr::filter(.data$covariateId == !!cId) %>%
+ dplyr::collect()
nSamp <- length(coi$rowId)
-
+
# find a new random selection of people and give them the covariate and value
- newPlp <- sample(population$rowId,nSamp)
- newData <- dplyr::as_tibble(cbind(rowId = newPlp,coi[,-1]))
-
+ newPlp <- sample(population$rowId, nSamp)
+ newData <- dplyr::as_tibble(cbind(rowId = newPlp, coi[, -1]))
+
# swap old covariate data with new
- plpData$covariateData$covariates <- plpData$covariateData$covariates %>% dplyr::filter(.data$covariateId != !!cId) %>% dplyr::collect()
+ plpData$covariateData$covariates <- plpData$covariateData$covariates %>%
+ dplyr::filter(.data$covariateId != !!cId) %>%
+ dplyr::collect()
Andromeda::appendToTable(plpData$covariateData$covariates, newData)
-
- } else{
+ } else {
    # handle connected covariates (e.g. age group, race, ethnicity) that have more than two categories
-
+
# get the ppl with covariates
- haveCidData <- plpData$covariateData$covariates %>% dplyr::filter(.data$covariateId == !!cId) %>% dplyr::collect()
+ haveCidData <- plpData$covariateData$covariates %>%
+ dplyr::filter(.data$covariateId == !!cId) %>%
+ dplyr::collect()
nSamp <- length(haveCidData$rowId)
-
- # sample the pop to replace
- swapPlp <- sample(population$rowId,nSamp)
- haveCidDataSwapped <- dplyr::as_tibble(cbind(rowId = swapPlp,haveCidData[,-1]))
-
- # find the swapped people to switch
- connectedCovs <- plpData$covariateData$covariateRef %>%
- dplyr::filter(.data$analysisId == !!aId$analysisId) %>%
- dplyr::group_by(.data$covariateId) %>%
- dplyr::select("covariateId") %>%
+
+ # sample the pop to replace
+ swapPlp <- sample(population$rowId, nSamp)
+ haveCidDataSwapped <- dplyr::as_tibble(cbind(rowId = swapPlp, haveCidData[, -1]))
+
+ # find the swapped people to switch
+ connectedCovs <- plpData$covariateData$covariateRef %>%
+ dplyr::filter(.data$analysisId == !!aId$analysisId) %>%
+ dplyr::group_by(.data$covariateId) %>%
+ dplyr::select("covariateId") %>%
dplyr::collect()
- plpToSwap <- plpData$covariateData$covariates %>%
- dplyr::filter(.data$covariateId %in% !!connectedCovs$covariateId) %>%
- dplyr::filter(.data$rowId %in% swapPlp) %>%
+ plpToSwap <- plpData$covariateData$covariates %>%
+ dplyr::filter(.data$covariateId %in% !!connectedCovs$covariateId) %>%
+ dplyr::filter(.data$rowId %in% swapPlp) %>%
dplyr::collect()
-
- swappedForCid <- dplyr::as_tibble(cbind(rowId = haveCidData$rowId[1:nrow(plpToSwap)],plpToSwap[,-1]))
-
+
+ swappedForCid <- dplyr::as_tibble(cbind(rowId = haveCidData$rowId[1:nrow(plpToSwap)], plpToSwap[, -1]))
+
# swap old covariate data with new
- plpData$covariateData$covariates <- plpData$covariateData$covariates %>%
- dplyr::filter(.data$covariateId != !!cId) %>%
- dplyr::filter(!.data$rowId %in% !!swapPlp) %>%
+ plpData$covariateData$covariates <- plpData$covariateData$covariates %>%
+ dplyr::filter(.data$covariateId != !!cId) %>%
+ dplyr::filter(!.data$rowId %in% !!swapPlp) %>%
dplyr::collect()
- Andromeda::appendToTable(plpData$covariateData$covariates,
- rbind(haveCidDataSwapped, swappedForCid)
- )
-
+ Andromeda::appendToTable(
+ plpData$covariateData$covariates,
+ rbind(haveCidDataSwapped, swappedForCid)
+ )
}
-
+
return(plpData)
}
-
-
-
-
-
diff --git a/R/Fit.R b/R/Fit.R
index 30ef99710..06212a2df 100644
--- a/R/Fit.R
+++ b/R/Fit.R
@@ -24,7 +24,7 @@
#' @details
#' The user can define the machine learning model to train (regularised logistic regression, random forest,
#' gradient boosting machine, neural network and others)
-#'
+#'
#' @param trainData An object of type \code{TrainData} created using \code{splitData}
#' data extracted from the CDM.
#' @param modelSettings An object of class \code{modelSettings} created using one of the function:
@@ -34,12 +34,12 @@
#' \item setRandomForest() A random forest model
#' \item setKNN() A KNN model
#' }
-#' @param search The search strategy for the hyper-parameter selection (currently not used)
+#' @param search The search strategy for the hyper-parameter selection (currently not used)
#' @param analysisId The id of the analysis
#' @param analysisPath The path of the analysis
#' @return
#' An object of class \code{plpModel} containing:
-#'
+#'
#' \item{model}{The trained prediction model}
#' \item{preprocessing}{The preprocessing required when applying the model}
#' \item{prediction}{The cohort data.frame with the predicted risk column added}
@@ -50,26 +50,27 @@
#'
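+#' @examples
+#' \dontrun{
+#' # a minimal sketch, assuming trainData was created by splitData()
+#' plpModel <- fitPlp(
+#'   trainData = trainData,
+#'   modelSettings = setLassoLogisticRegression(),
+#'   analysisId = "Analysis_1",
+#'   analysisPath = tempdir()
+#' )
+#' }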
#' @export
fitPlp <- function(
- trainData,
- modelSettings,
- search = "grid",
- analysisId,
- analysisPath
- )
- {
-
- if(is.null(trainData))
- stop('trainData is NULL')
- if(is.null(trainData$covariateData))
- stop('covariateData is NULL')
- checkIsClass(trainData$covariateData, 'CovariateData')
- if(is.null(modelSettings$fitFunction))
- stop('No model specified')
- checkIsClass(modelSettings, 'modelSettings')
-
- #=========================================================
+ trainData,
+ modelSettings,
+ search = "grid",
+ analysisId,
+ analysisPath) {
+ start <- Sys.time()
+ if (is.null(trainData)) {
+ stop("trainData is NULL")
+ }
+ if (is.null(trainData$covariateData)) {
+ stop("covariateData is NULL")
+ }
+ checkIsClass(trainData$covariateData, "CovariateData")
+ if (is.null(modelSettings$fitFunction)) {
+ stop("No model specified")
+ }
+ checkIsClass(modelSettings, "modelSettings")
+
+ # =========================================================
# run through pipeline list and apply:
- #=========================================================
+ # =========================================================
# Now apply the classifier:
fun <- eval(parse(text = modelSettings$fitFunction))
@@ -79,16 +80,17 @@ fitPlp <- function(
search = search,
analysisId = analysisId,
analysisPath = analysisPath
- )
+ )
plpModel <- do.call(fun, args)
- ParallelLogger::logTrace('Returned from classifier function')
-
+ ParallelLogger::logTrace("Returned from classifier function")
+
# adding trainDetails databaseId to all classifiers
# TODO - move other details into fit
- plpModel$trainDetails$developmentDatabaseId = attr(trainData, "metaData")$cdmDatabaseId
-
- class(plpModel) <- 'plpModel'
-
+ plpModel$trainDetails$developmentDatabaseId <- attr(trainData, "metaData")$cdmDatabaseId
+ class(plpModel) <- "plpModel"
+ delta <- Sys.time() - start
+ ParallelLogger::logInfo("Time to fit model: ",
+ signif(delta, 3), " ", attr(delta, "units"))
+
return(plpModel)
-
}
diff --git a/R/Formatting.R b/R/Formatting.R
index 1e545b818..b4421f225 100644
--- a/R/Formatting.R
+++ b/R/Formatting.R
@@ -26,10 +26,10 @@
#' the package Matrix
#' @param plpData An object of type \code{plpData} with covariate in coo format - the patient level prediction
#' data extracted from the CDM.
-#' @param cohort If specified the plpData is restricted to the rowIds in the cohort (otherwise plpData$labels is used)
+#' @param cohort If specified, the plpData is restricted to the rowIds in the cohort (otherwise plpData$labels is used)
#' @param map A covariate map (telling us the column number for covariates)
#' @examples
-#' #TODO
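+#' \dontrun{
+#' # a minimal sketch, assuming plpData was extracted with getPlpData()
+#' sparse <- toSparseM(plpData)
+#' dim(sparse$dataMatrix)
+#' }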
#'
#' @return
#' Returns a list, containing the data as a sparse matrix, the plpData covariateRef
@@ -41,64 +41,65 @@
#' }
#'
#' @export
-toSparseM <- function(plpData, cohort = NULL, map=NULL){
-
- ParallelLogger::logInfo(paste0('starting toSparseM'))
-
+toSparseM <- function(plpData, cohort = NULL, map = NULL) {
+ ParallelLogger::logInfo(paste0("starting toSparseM"))
+
ParallelLogger::logDebug(
paste0(
- 'Max covariateId in original covariates: ',
- plpData$covariateData$covariates %>%
- dplyr::summarise(max = max(.data$covariateId, na.rm=T)) %>%
- dplyr::pull()
+ "Max covariateId in original covariates: ",
+ plpData$covariateData$covariates %>%
+ dplyr::summarise(max = max(.data$covariateId, na.rm = TRUE)) %>%
+ dplyr::pull()
)
)
-
+
  # assign covariateId to columnId specified in map (if null create columnIds)
- # assign rowId to xId
+ # assign rowId to xId
# return the new covariateData (rowId changes in covariates plus covariates/covariateRef with modified columnIds)
# return labels with modified rowId if plpData$labels exists
- if(!is.null(plpData$labels)){
+ if (!is.null(plpData$labels)) {
newcovariateData <- MapIds(
plpData$covariateData,
cohort = plpData$labels,
- mapping=map
- )
- } else{
+ mapping = map
+ )
+ } else {
newcovariateData <- MapIds(
plpData$covariateData,
cohort = cohort,
mapping = map
)
}
-
-
- ParallelLogger::logDebug(paste0('# covariates in mapped covariateRef: ', nrow(newcovariateData$covariateRef)))
- maxY <- newcovariateData$mapping %>%
- dplyr::summarise(max=max(.data$columnId, na.rm = TRUE)) %>%
+
+ ParallelLogger::logDebug(paste0("# covariates in mapped covariateRef: ", nrow(newcovariateData$covariateRef)))
+
+ maxY <- newcovariateData$mapping %>%
+ dplyr::summarise(max = max(.data$columnId, na.rm = TRUE)) %>%
+ dplyr::pull()
+ ParallelLogger::logDebug(paste0("Max newCovariateId in mapping: ", maxY))
+ maxX <- newcovariateData$cohort %>%
+ dplyr::summarise(max = max(.data$rowId, na.rm = TRUE)) %>%
dplyr::pull()
- ParallelLogger::logDebug(paste0('Max newCovariateId in mapping: ',maxY))
- maxX <- newcovariateData$cohort %>% dplyr::summarise(max = max(.data$rowId, na.rm=T)) %>% dplyr::pull()
- ParallelLogger::logDebug(paste0('Max rowId in new : ',maxX))
-
-
- ParallelLogger::logInfo(paste0('toSparseM non temporal used'))
-
- checkRam(newcovariateData) # estimates size of RAM required and prints it
-
+  ParallelLogger::logDebug(paste0("Max rowId in new data: ", maxX))
+
+
+  ParallelLogger::logInfo(paste0("toSparseM non-temporal used"))
+
+ checkRam(newcovariateData) # estimates size of RAM required and prints it
+
data <- Matrix::sparseMatrix(
i = newcovariateData$covariates %>% dplyr::select("rowId") %>% dplyr::pull(),
j = newcovariateData$covariates %>% dplyr::select("columnId") %>% dplyr::pull(),
x = newcovariateData$covariates %>% dplyr::select("covariateValue") %>% dplyr::pull(),
- dims=c(maxX,maxY)
+ dims = c(maxX, maxY)
)
-
- ParallelLogger::logDebug(paste0('Sparse matrix with dimensionality: ', paste(dim(data), collapse=',') ))
- ParallelLogger::logInfo(paste0('finishing toSparseM'))
-
+ ParallelLogger::logDebug(paste0("Sparse matrix with dimensionality: ", paste(dim(data), collapse = ",")))
+
+ ParallelLogger::logInfo(paste0("finishing toSparseM"))
+
result <- list(
dataMatrix = data,
labels = newcovariateData$cohort %>% dplyr::collect(),
@@ -109,94 +110,93 @@ toSparseM <- function(plpData, cohort = NULL, map=NULL){
}
#' Map covariate and row Ids so they start from 1
-#' @description this functions takes covariate data and a cohort/population and remaps
+#' @description this function takes covariate data and a cohort/population and remaps
#' the covariate and row ids, restricts to the population and saves/creates the mapping
#' @param covariateData a covariateData object
#' @param cohort if specified, rowIds are restricted to the ones in the cohort
-#' @param mapping A pre defined mapping to use
+#' @param mapping A predefined mapping to use
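+#' @examples
+#' \dontrun{
+#' # a minimal sketch, assuming a covariateData object from FeatureExtraction
+#' mappedData <- MapIds(covariateData, cohort = NULL, mapping = NULL)
+#' }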
#' @export
MapIds <- function(
- covariateData,
- cohort = NULL,
- mapping = NULL
- ){
-
- ParallelLogger::logInfo(paste0('starting to map the columns and rows'))
-
-
+ covariateData,
+ cohort = NULL,
+ mapping = NULL) {
+ ParallelLogger::logInfo(paste0("starting to map the columns and rows"))
+
+
# change the rowIds in cohort (if exists)
- if(!is.null(cohort)){
+ if (!is.null(cohort)) {
rowMap <- data.frame(
rowId = cohort %>% dplyr::select("rowId")
)
rowMap$xId <- 1:nrow(rowMap)
- } else{
+ } else {
rowMap <- data.frame(
- rowId = covariateData$covariates %>%
- dplyr::distinct(.data$rowId) %>%
+ rowId = covariateData$covariates %>%
+ dplyr::distinct(.data$rowId) %>%
dplyr::pull()
)
rowMap$xId <- 1:nrow(rowMap)
}
-
+
covariateData$rowMap <- rowMap
on.exit(covariateData$rowMap <- NULL)
-
+
# change the rowIds in covariateData$covariates
- if(is.null(mapping)){
-
+ if (is.null(mapping)) {
mapping <- data.frame(
- covariateId = covariateData$covariates %>%
- dplyr::inner_join(covariateData$rowMap, by = 'rowId') %>% # first restrict the covariates to the rowMap$rowId
- dplyr::distinct(.data$covariateId) %>%
+ covariateId = covariateData$covariates %>%
+ dplyr::inner_join(covariateData$rowMap, by = "rowId") %>% # first restrict the covariates to the rowMap$rowId
+ dplyr::distinct(.data$covariateId) %>%
+ dplyr::arrange(.data$covariateId) %>%
dplyr::pull()
)
mapping$columnId <- 1:nrow(mapping)
}
covariateData$mapping <- mapping
- on.exit(covariateData$mapping <- NULL, add = T)
-
+ on.exit(covariateData$mapping <- NULL, add = TRUE)
+
newCovariateData <- Andromeda::andromeda()
# change the covariateIds in covariates
- newCovariateData$covariates <- covariateData$covariates %>%
- dplyr::inner_join(covariateData$mapping, by = 'covariateId')
-
-
- # change the covariateIds in covariateRef
- newCovariateData$covariateRef <- covariateData$mapping %>%
- dplyr::inner_join(covariateData$covariateRef, by = 'covariateId')
-
- # change the rowId in covariates
- newCovariateData$rowMap <- rowMap
- newCovariateData$covariates <- newCovariateData$covariates %>%
- dplyr::inner_join(newCovariateData$rowMap, by = 'rowId') %>%
- dplyr::select(- "rowId") %>%
- dplyr::rename(rowId = "xId")
-
- if(!is.null(cohort)){
- # change the rowId in labels
- newCovariateData$cohort <- cohort %>%
- dplyr::inner_join(rowMap, by = 'rowId') %>%
- dplyr::rename(
- originalRowId = "rowId",
- rowId = "xId"
- ) %>%
- dplyr::arrange(.data$rowId) # make sure it is ordered lowest to highest
- }
-
+ newCovariateData$covariates <- covariateData$covariates %>%
+ dplyr::inner_join(covariateData$mapping, by = "covariateId")
+
+
+ # change the covariateIds in covariateRef
+ newCovariateData$covariateRef <- covariateData$mapping %>%
+ dplyr::inner_join(covariateData$covariateRef, by = "covariateId")
+
+ # change the rowId in covariates
+ newCovariateData$rowMap <- rowMap
+ newCovariateData$covariates <- newCovariateData$covariates %>%
+ dplyr::inner_join(newCovariateData$rowMap, by = "rowId") %>%
+ dplyr::select(-"rowId") %>%
+ dplyr::rename(rowId = "xId")
+
+ if (!is.null(cohort)) {
+ # change the rowId in labels
+ newCovariateData$cohort <- cohort %>%
+ dplyr::inner_join(rowMap, by = "rowId") %>%
+ dplyr::rename(
+ originalRowId = "rowId",
+ rowId = "xId"
+ ) %>%
+ dplyr::arrange(.data$rowId) # make sure it is ordered lowest to highest
+ }
+
newCovariateData$mapping <- mapping
-
- ParallelLogger::logInfo(paste0('finished MapCovariates'))
-
+
+  ParallelLogger::logInfo(paste0("finished MapIds"))
+
return(newCovariateData)
}
-checkRam <- function(covariateData){
-
- nrowV <- covariateData$covariates %>% dplyr::summarise(size = dplyr::n()) %>% dplyr::collect()
- estRamB <- (nrowV$size/1000000*24000984)
-
- ParallelLogger::logInfo(paste0('plpData size estimated to use ', round(estRamB/1000000000,1), 'GBs of RAM'))
-
+checkRam <- function(covariateData) {
+ nrowV <- covariateData$covariates %>%
+ dplyr::summarise(size = dplyr::n()) %>%
+ dplyr::collect()
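+  # heuristic: roughly 24 MB of RAM per million covariate rows (the 24000984 constant)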
+ estRamB <- (nrowV$size / 1000000 * 24000984)
+
+  ParallelLogger::logInfo(paste0("plpData size estimated to use ", round(estRamB / 1000000000, 1), " GB of RAM"))
+
return(invisible(TRUE))
}
diff --git a/R/Glm.R b/R/Glm.R
index 2f7295139..4cbb779c0 100644
--- a/R/Glm.R
+++ b/R/Glm.R
@@ -25,7 +25,7 @@
#' @param data An object of type \code{plpData} - the patient level prediction
#' data extracted from the CDM.
#' @param cohort The population dataframe created using
-#' /code{createStudyPopulation} who will have their risks predicted or a cohort
+#' \code{createStudyPopulation} who will have their risks predicted or a cohort
#' without the outcome known
#' @export
#' @return A dataframe containing the prediction for each person in the
@@ -75,7 +75,7 @@ predictGlm <- function(plpModel, data, cohort) {
#' PatientLevelPrediction package.
#' @param coefficients A dataframe containing two columns, coefficients and
#' covariateId, both of type numeric. The covariateId column must contain
-#' valid covariateIds that match those used in the /code{FeatureExtraction}
+#' valid covariateIds that match those used in the \code{FeatureExtraction}
#' package.
#' @param intercept A numeric value representing the intercept of the model.
#' @param finalMapping A string representing the final mapping from the
diff --git a/R/GradientBoostingMachine.R b/R/GradientBoostingMachine.R
index fc2a48151..169509c31 100644
--- a/R/GradientBoostingMachine.R
+++ b/R/GradientBoostingMachine.R
@@ -18,11 +18,11 @@
#' Create setting for gradient boosting machine model using gbm_xgboost implementation
#'
-#' @param ntrees The number of trees to build
+#' @param ntrees The number of trees to build
#' @param nthread The number of computer threads to use (how many cores do you have?)
#' @param earlyStopRound If the performance does not increase over earlyStopRound number of trees then training stops (this prevents overfitting)
#' @param maxDepth Maximum depth of each tree - a large value will lead to slow model training
-#' @param minChildWeight Minimum sum of of instance weight in a child node - larger values are more conservative
+#' @param minChildWeight Minimum sum of instance weight in a child node - larger values are more conservative
#' @param learnRate The boosting learn rate
#' @param scalePosWeight Controls weight of positive class in loss - useful for imbalanced classes
#' @param lambda L2 regularization on weights - larger is more conservative
@@ -30,214 +30,213 @@
#' @param seed An option to add a seed when training the final model
#'
#' @examples
-#' model.gbm <- setGradientBoostingMachine(ntrees=c(10,100), nthread=20,
-#' maxDepth=c(4,6), learnRate=c(0.1,0.3))
+#' model.gbm <- setGradientBoostingMachine(
+#' ntrees = c(10, 100), nthread = 20,
+#' maxDepth = c(4, 6), learnRate = c(0.1, 0.3)
+#' )
#'
#' @export
-setGradientBoostingMachine <- function(ntrees=c(100, 300), nthread=20, earlyStopRound = 25,
- maxDepth=c(4,6,8), minChildWeight=1, learnRate=c(0.05, 0.1,0.3),
- scalePosWeight=1, lambda=1, alpha=0,
- seed= sample(10000000,1)){
-
- ensure_installed("xgboost")
-
- checkIsClass(seed, c('numeric', 'integer'))
-
- if(length(nthread)>1){
- stop(paste('nthreads must be length 1'))
+setGradientBoostingMachine <- function(ntrees = c(100, 300),
+ nthread = 20,
+ earlyStopRound = 25,
+ maxDepth = c(4, 6, 8),
+ minChildWeight = 1,
+ learnRate = c(0.05, 0.1, 0.3),
+ scalePosWeight = 1,
+ lambda = 1,
+ alpha = 0,
+ seed = sample(10000000, 1)) {
+ rlang::check_installed("xgboost")
+
+ checkIsClass(seed, c("numeric", "integer"))
+
+ if (length(nthread) > 1) {
+ stop(paste("nthreads must be length 1"))
}
- if(!inherits(x = seed, what = c('numeric','NULL', 'integer'))){
- stop('Invalid seed')
+ if (!inherits(x = seed, what = c("numeric", "NULL", "integer"))) {
+ stop("Invalid seed")
}
- if(!inherits(x = ntrees, what = c("numeric", "integer"))){
- stop('ntrees must be a numeric value ')
+ if (!inherits(x = ntrees, what = c("numeric", "integer"))) {
+ stop("ntrees must be a numeric value ")
}
- if(sum(ntrees < 1) > 0 ){
- stop('ntrees must be greater than 0 or -1')
+ if (sum(ntrees < 1) > 0) {
+ stop("ntrees must be greater than 0 or -1")
}
- if(!inherits(x = maxDepth, what = c("numeric", "integer"))){
- stop('maxDepth must be a numeric value')
- }
- if(sum(maxDepth < 1) > 0){
- stop('maxDepth must be greater that 0')
+ if (!inherits(x = maxDepth, what = c("numeric", "integer"))) {
+ stop("maxDepth must be a numeric value")
+ }
+ if (sum(maxDepth < 1) > 0) {
+ stop("maxDepth must be greater that 0")
}
- if(!inherits(x = minChildWeight, what = c("numeric", "integer"))){
- stop('minChildWeight must be a numeric value')
+ if (!inherits(x = minChildWeight, what = c("numeric", "integer"))) {
+ stop("minChildWeight must be a numeric value")
}
- if(sum(minChildWeight < 0) > 0){
- stop('minChildWeight must be greater that 0')
+ if (sum(minChildWeight < 0) > 0) {
+ stop("minChildWeight must be greater that 0")
}
- if(!inherits(x = learnRate, what = 'numeric')){
- stop('learnRate must be a numeric value')
+ if (!inherits(x = learnRate, what = "numeric")) {
+ stop("learnRate must be a numeric value")
}
- if(sum(learnRate <= 0) > 0){
- stop('learnRate must be greater that 0')
+ if (sum(learnRate <= 0) > 0) {
+ stop("learnRate must be greater that 0")
}
- if(sum(learnRate > 1) > 0){
- stop('learnRate must be less that or equal to 1')
+ if (sum(learnRate > 1) > 0) {
+ stop("learnRate must be less that or equal to 1")
}
- if(!inherits(x = earlyStopRound, what = c("numeric", "integer", "NULL"))){
- stop('incorrect class for earlyStopRound')
+ if (!inherits(x = earlyStopRound, what = c("numeric", "integer", "NULL"))) {
+ stop("incorrect class for earlyStopRound")
}
- if (!inherits(x = lambda, what = c("numeric", "integer"))){
- stop('lambda must be a numeric value')
+ if (!inherits(x = lambda, what = c("numeric", "integer"))) {
+ stop("lambda must be a numeric value")
}
- if (sum(lambda < 0) > 0){
- stop('lambda must be 0 or greater')
+ if (sum(lambda < 0) > 0) {
+ stop("lambda must be 0 or greater")
}
- if(!inherits(x = alpha, what = c("numeric", "integer"))){
- stop('alpha must be a numeric value')
+ if (!inherits(x = alpha, what = c("numeric", "integer"))) {
+ stop("alpha must be a numeric value")
}
- if (sum(alpha < 0) > 0){
- stop('alpha must be 0 or greater')
+ if (sum(alpha < 0) > 0) {
+ stop("alpha must be 0 or greater")
}
- if (!inherits(x = scalePosWeight, what = c("numeric", "integer"))){
- stop('scalePosWeight must be a numeric value >= 0')
+ if (!inherits(x = scalePosWeight, what = c("numeric", "integer"))) {
+ stop("scalePosWeight must be a numeric value >= 0")
}
- if (sum(scalePosWeight < 0) > 0){
- stop('scalePosWeight must be 0 or greater')
+ if (sum(scalePosWeight < 0) > 0) {
+ stop("scalePosWeight must be 0 or greater")
}
paramGrid <- list(
- ntrees = ntrees,
- earlyStopRound = earlyStopRound,
- maxDepth = maxDepth,
- minChildWeight = minChildWeight,
- learnRate = learnRate,
- lambda = lambda,
- alpha = alpha,
- scalePosWeight = scalePosWeight
- )
-
+ ntrees = ntrees,
+ earlyStopRound = earlyStopRound,
+ maxDepth = maxDepth,
+ minChildWeight = minChildWeight,
+ learnRate = learnRate,
+ lambda = lambda,
+ alpha = alpha,
+ scalePosWeight = scalePosWeight
+ )
+
param <- listCartesian(paramGrid)
-
- attr(param, 'settings') <- list(
- modelType = 'Xgboost',
+
+ attr(param, "settings") <- list(
+ modelType = "Xgboost",
seed = seed[[1]],
modelName = "Gradient Boosting Machine",
threads = nthread[1],
- varImpRFunction = 'varImpXgboost',
- trainRFunction = 'fitXgboost',
- predictRFunction = 'predictXgboost'
+ varImpRFunction = "varImpXgboost",
+ trainRFunction = "fitXgboost",
+ predictRFunction = "predictXgboost"
)
-
- attr(param, 'saveType') <- 'xgboost'
-
+
+ attr(param, "saveType") <- "xgboost"
+
result <- list(
fitFunction = "fitRclassifier",
param = param
)
- class(result) <- 'modelSettings'
-
+ class(result) <- "modelSettings"
+
return(result)
}
varImpXgboost <- function(
- model,
- covariateMap
- ){
-
+ model,
+ covariateMap) {
varImp <- xgboost::xgb.importance(model = model)
-
- varImp$Feature <- as.numeric(substring(varImp$Feature,2))+1 # adding +1 as xgboost index starts at 0
- varImp <- merge(covariateMap, varImp, by.x='columnId', by.y='Feature')
- varImp <- varImp %>%
+
+ varImp$Feature <- as.numeric(substring(varImp$Feature, 2)) + 1 # adding +1 as xgboost index starts at 0
+ varImp <- merge(covariateMap, varImp, by.x = "columnId", by.y = "Feature")
+ varImp <- varImp %>%
dplyr::mutate(included = 1) %>%
- dplyr::rename(covariateValue = "Gain") %>%
+ dplyr::rename(covariateValue = "Gain") %>%
dplyr::select("covariateId", "covariateValue", "included")
-
+
return(varImp)
-
}
predictXgboost <- function(
- plpModel,
- data,
- cohort
- ){
-
- if(inherits(data , 'plpData')){
+ plpModel,
+ data,
+ cohort) {
+ if (inherits(data, "plpData")) {
# convert
matrixObjects <- toSparseM(
- plpData = data,
+ plpData = data,
cohort = cohort,
- map = plpModel$covariateImportance %>%
+ map = plpModel$covariateImportance %>%
dplyr::select("columnId", "covariateId")
)
-
+
# use the include??
-
+
newData <- matrixObjects$dataMatrix
cohort <- matrixObjects$labels
-
- }else{
+ } else {
newData <- data
}
-
- if(inherits(plpModel, 'plpModel')){
+
+ if (inherits(plpModel, "plpModel")) {
model <- plpModel$model
- } else{
+ } else {
model <- plpModel
}
-
+
pred <- data.frame(value = stats::predict(model, newData))
prediction <- cohort
prediction$value <- pred$value
-
+
  # restore the original rowIds: use originalRowId and drop the matrix rowId
- prediction <- prediction %>%
+ prediction <- prediction %>%
dplyr::select(-"rowId") %>%
dplyr::rename(rowId = "originalRowId")
attr(prediction, "metaData") <- list(modelType = attr(plpModel, "modelType"))
-
+
return(prediction)
}
fitXgboost <- function(
- dataMatrix,
- labels,
- hyperParameters,
- settings
- ){
+ dataMatrix,
+ labels,
+ hyperParameters,
+ settings) {
set.seed(settings$seed)
if (!is.null(hyperParameters$earlyStopRound)) {
- trainInd <- sample(nrow(dataMatrix), nrow(dataMatrix)*0.9)
+ trainInd <- sample(nrow(dataMatrix), nrow(dataMatrix) * 0.9)
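+    # the remaining 10% is held out as an evaluation set used to trigger early stopping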
if (sum(labels$outcomeCount[-trainInd]) == 0) {
stop("No outcomes in early stopping set, either increase size of training
set or turn off early stopping")
}
train <- xgboost::xgb.DMatrix(
- data = dataMatrix[trainInd,, drop = F],
+ data = dataMatrix[trainInd, , drop = FALSE],
label = labels$outcomeCount[trainInd]
- )
+ )
test <- xgboost::xgb.DMatrix(
- data = dataMatrix[-trainInd,, drop = F],
+ data = dataMatrix[-trainInd, , drop = FALSE],
label = labels$outcomeCount[-trainInd]
- )
+ )
watchlist <- list(train = train, test = test)
-
} else {
train <- xgboost::xgb.DMatrix(
- data = dataMatrix,
+ data = dataMatrix,
label = labels$outcomeCount
)
watchlist <- list()
}
-
+
outcomes <- sum(labels$outcomeCount > 0)
N <- nrow(labels)
- outcomeProportion <- outcomes/N
+ outcomeProportion <- outcomes / N
model <- xgboost::xgb.train(
- data = train,
+ data = train,
params = list(
- booster = 'gbtree',
+ booster = "gbtree",
max_depth = hyperParameters$maxDepth,
- eta = hyperParameters$learnRate,
+ eta = hyperParameters$learnRate,
min_child_weight = hyperParameters$minChildWeight,
scale_pos_weight = hyperParameters$scalePosWeight,
lambda = hyperParameters$lambda,
@@ -246,13 +245,13 @@ fitXgboost <- function(
base_score = outcomeProportion,
eval_metric = "auc"
),
- nthread = settings$threads, #?
+ nthread = settings$threads, # ?
nrounds = hyperParameters$ntrees,
watchlist = watchlist,
print_every_n = 10,
early_stopping_rounds = hyperParameters$earlyStopRound,
- maximize = T
- )
-
+ maximize = TRUE
+ )
+
return(model)
}
diff --git a/R/HelperFunctions.R b/R/HelperFunctions.R
index d73a4e9f0..d5e86d13b 100644
--- a/R/HelperFunctions.R
+++ b/R/HelperFunctions.R
@@ -1,64 +1,26 @@
-# fix issue with nrow - temp fix for me locally
-nrow <- function(x){UseMethod("nrow",x)}
-#' @exportS3Method NULL
-nrow.default <- base::nrow
-#' @exportS3Method NULL
-nrow.tbl <- function(x){x %>% dplyr::tally() %>% dplyr::pull()}
-
-
-removeInvalidString <- function(string){
- modString <- gsub('_', ' ', string)
- modString <- gsub('\\.', ' ', modString)
+removeInvalidString <- function(string) {
+ modString <- gsub("_", " ", string)
+ modString <- gsub("\\.", " ", modString)
modString <- gsub("[[:punct:]]", "", modString)
- modString <- gsub(' ', '_', modString)
+ modString <- gsub(" ", "_", modString)
return(modString)
}
-
-# Borrowed from devtools: https://github.com/hadley/devtools/blob/ba7a5a4abd8258c52cb156e7b26bb4bf47a79f0b/R/utils.r#L44
-is_installed <- function (pkg) {
- installed_version <- tryCatch(utils::packageVersion(pkg),
- error = function(e) NA)
- !is.na(installed_version)
-}
-
-# Borrowed and adapted from devtools: https://github.com/hadley/devtools/blob/ba7a5a4abd8258c52cb156e7b26bb4bf47a79f0b/R/utils.r#L74
-ensure_installed <- function(pkg) {
- if (!is_installed(pkg)) {
- msg <- paste0(sQuote(pkg), " must be installed for this functionality.")
- if (interactive()) {
- message(msg, "\nWould you like to install it?")
- if (utils::menu(c("Yes", "No")) == 1) {
- if(pkg%in%c('BigKnn', "IterativeHardThresholding", "ShinyAppBuilder", "ResultModelManager")){
-
- # add code to check for devtools...
- dvtCheck <- tryCatch(utils::packageVersion('devtools'),
- error = function(e) NA)
- if(is.na(dvtCheck)){
- utils::install.packages('devtools')
- }
-
- devtools::install_github(paste0('OHDSI/',pkg))
- }else{
- utils::install.packages(pkg)
- }
- } else {
- stop(msg, call. = FALSE)
- }
- } else {
- stop(msg, call. = FALSE)
- }
- }
+#' Check if the required packages for survival analysis are installed
+#' @keywords internal
+checkSurvivalPackages <- function() {
+ rlang::check_installed(c("survival", "polspline"),
+ reason = "Please install the required packages for survival analysis"
+ )
}
#' Create a temporary model location
-#'
+#'
#' @export
-createTempModelLoc <- function(){
- repeat{
- ##loc <- paste(tempdir(), paste0('python_models_',sample(10002323,1)), sep = '\\')
- loc <- file.path(tempdir(), paste0('python_models_',sample(10002323,1)))
- if(!dir.exists(loc)){
+createTempModelLoc <- function() {
+ repeat {
+ loc <- file.path(tempdir(), paste0("python_models_", sample(10002323, 1)))
+ if (!dir.exists(loc)) {
return(loc)
}
}
@@ -68,19 +30,19 @@ createTempModelLoc <- function(){
#'
#' @details
#' This function joins two lists
-#' @param a A list
+#' @param a A list
#' @param b Another list
#'
#' @export
-listAppend <- function(a, b){
+listAppend <- function(a, b) {
size <- length(a) + length(b)
x <- list()
length(x) <- size
- for(i in 1:size){
- if(i<=length(a)){
+ for (i in 1:size) {
+ if (i <= length(a)) {
x[[i]] <- a[[i]]
- } else{
- x[[i]] <- b[[i-length(a)]]
+ } else {
+ x[[i]] <- b[[i - length(a)]]
}
}
names(x) <- c(names(a), names(b))
@@ -88,107 +50,111 @@ listAppend <- function(a, b){
}
-#' Sets up a virtual environment to use for PLP (can be conda or python)
+#' Sets up a virtual environment to use for PLP (can be conda or python)
#'
#' @details
#' This function creates a virtual environment that can be used by PatientLevelPrediction
#' and installs all the required package dependencies. If using python, pip must be set up.
#'
-#' @param envname A string for the name of the virtual environment (default is 'PLP')
-#' @param envtype An option for specifying the environment as'conda' or 'python'. If NULL then the default is 'conda' for windows users and 'python' for non-windows users
+#' @param envname A string for the name of the virtual environment (default is 'PLP')
+#' @param envtype An option for specifying the environment as 'conda' or 'python'. If NULL then the default is 'conda' for windows users and 'python' for non-windows users
#' @param condaPythonVersion String, Python version to use when creating a conda environment
#'
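+#' @examples
+#' \dontrun{
+#' # a minimal sketch: create a conda environment called PLP with the python dependencies
+#' configurePython(envname = "PLP", envtype = "conda")
+#' }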
#' @export
-configurePython <- function(envname='PLP', envtype=NULL, condaPythonVersion="3.11"){
-
- if(is.null(envtype)){
- if(getOs()=='windows'){
+configurePython <- function(envname = "PLP", envtype = NULL, condaPythonVersion = "3.11") {
+ if (is.null(envtype)) {
+ if (getOs() == "windows") {
envtype <- "conda"
} else {
envtype <- "python"
}
}
-
- if(envtype=='conda'){
+
+ if (envtype == "conda") {
pEnvironments <- reticulate::conda_list()
- if(length(pEnvironments) > 0 && envname %in% pEnvironments$name){
- location <- ''
- warning(paste0('Conda environment ', envname,' exists. You can use reticulate::conda_remove() to remove if you want to fresh config'))
+ if (length(pEnvironments) > 0 && envname %in% pEnvironments$name) {
+ location <- ""
+ warning(paste0("Conda environment ", envname, " exists. You can use reticulate::conda_remove() to remove if you want to fresh config"))
} else {
- ParallelLogger::logInfo(paste0('Creating virtual conda environment called ', envname))
- location <- reticulate::conda_create(envname=envname, packages = paste0("python==", condaPythonVersion), conda = "auto")
+ ParallelLogger::logInfo(paste0("Creating virtual conda environment called ", envname))
+ location <- reticulate::conda_create(envname = envname, packages = paste0("python==", condaPythonVersion), conda = "auto")
}
- packages <- c('numpy','scipy','scikit-learn', 'pandas','pydotplus','joblib')
- ParallelLogger::logInfo(paste0('Adding python dependancies to ', envname))
- reticulate::conda_install(envname=envname, packages = packages, forge = TRUE, pip = FALSE,
- pip_ignore_installed = TRUE, conda = "auto")
+ packages <- c("numpy", "scipy", "scikit-learn", "pandas", "pydotplus", "joblib")
+    ParallelLogger::logInfo(paste0("Adding python dependencies to ", envname))
+ reticulate::conda_install(
+ envname = envname, packages = packages, forge = TRUE, pip = FALSE,
+ pip_ignore_installed = TRUE, conda = "auto"
+ )
} else {
pEnvironments <- reticulate::virtualenv_list()
- if(length(pEnvironments) > 0 && envname %in% pEnvironments){
- location <- ''
- warning(paste0('Python environment ', envname,' exists.'))
+ if (length(pEnvironments) > 0 && envname %in% pEnvironments) {
+ location <- ""
+ warning(paste0("Python environment ", envname, " exists."))
} else {
- ParallelLogger::logInfo(paste0('Creating virtual python environment called ', envname))
- location <- reticulate::virtualenv_create(envname=envname)
+ ParallelLogger::logInfo(paste0("Creating virtual python environment called ", envname))
+ location <- reticulate::virtualenv_create(envname = envname)
}
- packages <- c('numpy', 'scikit-learn','scipy', 'pandas','pydotplus')
- ParallelLogger::logInfo(paste0('Adding python dependancies to ', envname))
- reticulate::virtualenv_install(envname=envname, packages = packages,
- ignore_installed = TRUE)
+ packages <- c("numpy", "scikit-learn", "scipy", "pandas", "pydotplus")
+    ParallelLogger::logInfo(paste0("Adding python dependencies to ", envname))
+ reticulate::virtualenv_install(
+ envname = envname, packages = packages,
+ ignore_installed = TRUE
+ )
}
-
+
return(invisible(location))
}
#' Use the virtual environment created using configurePython()
#'
#' @details
-#' This function sets PatientLevelPrediction to use a virtual environment
+#' This function sets PatientLevelPrediction to use a virtual environment
#'
-#' @param envname A string for the name of the virtual environment (default is 'PLP')
-#' @param envtype An option for specifying the environment as'conda' or 'python'. If NULL then the default is 'conda' for windows users and 'python' for non-windows users
+#' @param envname A string for the name of the virtual environment (default is 'PLP')
+#' @param envtype An option for specifying the environment as 'conda' or 'python'. If NULL then the default is 'conda' for windows users and 'python' for non-windows users
#'
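+#' @examples
+#' \dontrun{
+#' # a minimal sketch: activate an environment previously created by configurePython()
+#' setPythonEnvironment(envname = "PLP", envtype = "conda")
+#' }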
#' @export
-setPythonEnvironment <- function(envname='PLP', envtype=NULL){
-
- if(is.null(envtype)){
- if(getOs()=='windows'){
- envtype=='conda'
+setPythonEnvironment <- function(envname = "PLP", envtype = NULL) {
+ if (is.null(envtype)) {
+ if (getOs() == "windows") {
+ envtype == "conda"
} else {
- envtype=='python'
+ envtype == "python"
}
}
-
- if(envtype=='conda'){
+
+ if (envtype == "conda") {
pEnvironments <- reticulate::conda_list()
- if(!envname%in%pEnvironments$name){
- return(paste0('Conda environment ', envname,' not found. Please set up using configurePython()'))
+ if (!envname %in% pEnvironments$name) {
+ return(paste0("Conda environment ", envname, " not found. Please set up using configurePython()"))
}
reticulate::use_condaenv(envname)
- return(paste0('Using conda environment ',envname))
+ return(paste0("Using conda environment ", envname))
} else {
pEnvironments <- reticulate::virtualenv_list()
- if(!envname%in%pEnvironments$name){
- return(paste0('Python environment ', envname,' not found. Please set up using configurePython()'))
+ if (!envname %in% pEnvironments$name) {
+ return(paste0("Python environment ", envname, " not found. Please set up using configurePython()"))
}
reticulate::use_virtualenv(envname)
- return(paste0('Using python environment ',envname))
+ return(paste0("Using python environment ", envname))
}
-
}
-getOs <- function(){
+getOs <- function() {
sysinf <- Sys.info()
- if (!is.null(sysinf)){
- os <- sysinf['sysname']
- if (os == 'Darwin')
+ if (!is.null(sysinf)) {
+ os <- sysinf["sysname"]
+ if (os == "Darwin") {
os <- "osx"
+ }
} else { ## mystery machine
os <- .Platform$OS.type
- if (grepl("^darwin", R.version$os))
+ if (grepl("^darwin", R.version$os)) {
os <- "osx"
- if (grepl("linux-gnu", R.version$os))
+ }
+ if (grepl("linux-gnu", R.version$os)) {
os <- "linux"
+ }
}
tolower(os)
}
@@ -196,96 +162,110 @@ getOs <- function(){
# Borrowed and adapted from Hmisc: https://github.com/harrelfe/Hmisc/blob/39011dae3af3c943e67401ed6000644014707e8b/R/cut2.s
cut2 <- function(x, g, m = 150, digits = 3) {
-
method <- 1 ## 20may02
x.unique <- sort(unique(c(x[!is.na(x)])))
- min.dif <- min(diff(x.unique))/2
+ min.dif <- min(diff(x.unique)) / 2
min.dif.factor <- 1
- oldopt <- options('digits')
- options(digits=digits)
+ oldopt <- options("digits")
+ options(digits = digits)
on.exit(options(oldopt))
- xlab <- attr(x, 'label')
+ xlab <- attr(x, "label")
nnm <- sum(!is.na(x))
- if(missing(g)) g <- max(1,floor(nnm/m))
- if(g < 1)
- stop('g must be >=1, m must be positive')
+ if (missing(g)) g <- max(1, floor(nnm / m))
+ if (g < 1) {
+ stop("g must be >=1, m must be positive")
+ }
- options(digits=15)
+ options(digits = 15)
n <- table(x)
xx <- as.double(names(n))
options(digits = digits)
cum <- cumsum(n)
m <- length(xx)
- y <- as.integer(ifelse(is.na(x),NA,1))
+ y <- as.integer(ifelse(is.na(x), NA, 1))
labs <- character(g)
- cuts <- stats::approx(cum, xx, xout=(1:g)*nnm/g,
- method='constant', rule=2, f=1)$y
+ cuts <- stats::approx(cum, xx,
+ xout = (1:g) * nnm / g,
+ method = "constant", rule = 2, f = 1
+ )$y
cuts[length(cuts)] <- max(xx)
lower <- xx[1]
upper <- 1e45
up <- low <- double(g)
i <- 0
- for(j in 1:g) {
- cj <- if(method==1 || j==1) cuts[j] else {
- if(i==0)
- stop('program logic error')
+ for (j in 1:g) {
+ cj <- if (method == 1 || j == 1) {
+ cuts[j]
+ } else {
+ if (i == 0) {
+ stop("program logic error")
+ }
# Not used unique values found in table(x)
- s <- if(is.na(lower)) FALSE else xx >= lower
- cum.used <- if(all(s)) 0 else max(cum[!s])
- if(j==m) max(xx) else if(sum(s)<2) max(xx) else
- stats::approx(cum[s]-cum.used, xx[s], xout=(nnm-cum.used)/(g-j+1),
- method='constant', rule=2, f=1)$y
+ s <- if (is.na(lower)) FALSE else xx >= lower
+ cum.used <- if (all(s)) 0 else max(cum[!s])
+ if (j == m) {
+ max(xx)
+ } else if (sum(s) < 2) {
+ max(xx)
+ } else {
+ stats::approx(cum[s] - cum.used, xx[s],
+ xout = (nnm - cum.used) / (g - j + 1),
+ method = "constant", rule = 2, f = 1
+ )$y
+ }
}
-
- if(cj==upper) next
-
+
+ if (cj == upper) next
+
i <- i + 1
upper <- cj
# assign elements to group i
# y contains the group number in the end
- y[x >= (lower-min.dif.factor*min.dif)] <- i
+ y[x >= (lower - min.dif.factor * min.dif)] <- i
low[i] <- lower
- lower <- if(j==g) upper else min(xx[xx > upper])
-
- if(is.na(lower)) lower <- upper
-
- up[i] <- lower
+ lower <- if (j == g) upper else min(xx[xx > upper])
+
+ if (is.na(lower)) lower <- upper
+
+ up[i] <- lower
}
-
- low <- low[1:i]
- up <- up[1:i]
+
+ low <- low[1:i]
+ up <- up[1:i]
# Are the bounds different?
variation <- logical(i)
- for(ii in 1:i) {
- r <- range(x[y==ii], na.rm=TRUE)
+ for (ii in 1:i) {
+ r <- range(x[y == ii], na.rm = TRUE)
variation[ii] <- diff(r) > 0
}
- flow <- do.call(format,c(list(low), digits = 3))
- fup <- do.call(format,c(list(up), digits = 3))
- bb <- c(rep(')',i-1),']')
- labs <- ifelse(low==up | (!variation), flow,
- paste('[',flow,',',fup,bb,sep=''))
- ss <- y==0 & !is.na(y)
- if(any(ss))
- stop(paste('categorization error in cut2. Values of x not appearing in any interval:\n',
- paste(format(x[ss],digits=12),collapse=' '),
- '\nLower endpoints:',
- paste(format(low,digits=12), collapse=' '),
- '\nUpper endpoints:',
- paste(format(up,digits=12),collapse=' ')))
+ flow <- do.call(format, c(list(low), digits = 3))
+ fup <- do.call(format, c(list(up), digits = 3))
+ bb <- c(rep(")", i - 1), "]")
+ labs <- ifelse(low == up | (!variation), flow,
+ paste("[", flow, ",", fup, bb, sep = "")
+ )
+ ss <- y == 0 & !is.na(y)
+ if (any(ss)) {
+ stop(paste(
+ "categorization error in cut2. Values of x not appearing in any interval:\n",
+ paste(format(x[ss], digits = 12), collapse = " "),
+ "\nLower endpoints:",
+ paste(format(low, digits = 12), collapse = " "),
+ "\nUpper endpoints:",
+ paste(format(up, digits = 12), collapse = " ")
+ ))
+ }
- y <- structure(y, class='factor', levels=labs)
+ y <- structure(y, class = "factor", levels = labs)
- attr(y,'class') <- "factor"
- if(length(xlab)){
- #label(y) <- xlab # what is label?
- # think the below does the same as the line above
- class(y) <- 'labelled'
- attr(y, 'label') <- xlab
+ attr(y, "class") <- "factor"
+ if (length(xlab)) {
+ class(y) <- "labelled"
+ attr(y, "label") <- xlab
}
return(y)
diff --git a/R/ImportFromCsv.R b/R/ImportFromCsv.R
index 61536acd3..f41520f53 100644
--- a/R/ImportFromCsv.R
+++ b/R/ImportFromCsv.R
@@ -3,107 +3,109 @@
#' This function converts a folder with csv results into plp objects and loads them into a plp result database
#'
#' @details
-#' The user needs to have plp csv results in a single folder and an existing plp result database
-#'
+#' The user needs to have plp csv results in a single folder and an existing plp result database
+#'
#' @param csvFolder The location to the csv folder with the plp results
#' @param connectionDetails The connection details for the plp results database that the csv results will be inserted into
-#' @param databaseSchemaSettings A object created by \code{createDatabaseSchemaSettings} with all the settings specifying the result tables to insert the csv results into
+#' @param databaseSchemaSettings An object created by \code{createDatabaseSchemaSettings} with all the settings specifying the result tables to insert the csv results into
#' @param modelSaveLocation The location to save any models from the csv folder - this should be the same location you picked when inserting other models into the database
-#' @param csvTableAppend A string that appends the csv file names
-#'
+#' @param csvTableAppend A string appended to the csv file names
+#'
#' @return
#' Returns TRUE if the csv results were successfully imported into the database
-#'
+#'
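+#' @examples
+#' \dontrun{
+#' # a minimal sketch; the folder locations and sqlite file are hypothetical
+#' connectionDetails <- DatabaseConnector::createConnectionDetails(
+#'   dbms = "sqlite",
+#'   server = "myPlpResults.sqlite"
+#' )
+#' insertCsvToDatabase(
+#'   csvFolder = "./myCsvExport",
+#'   connectionDetails = connectionDetails,
+#'   databaseSchemaSettings = createDatabaseSchemaSettings(resultSchema = "main"),
+#'   modelSaveLocation = "./savedModels"
+#' )
+#' }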
#' @export
insertCsvToDatabase <- function(
- csvFolder,
- connectionDetails,
- databaseSchemaSettings,
- modelSaveLocation,
- csvTableAppend = ''
-){
-
- ensure_installed('readr')
-
- ParallelLogger::logInfo('Starting input checks')
-
- csvFileNames <- tryCatch({
- dir(csvFolder, pattern = 'csv')
- },
- error = function(e){ParallelLogger::logInfo(e); return(NULL)}
+ csvFolder,
+ connectionDetails,
+ databaseSchemaSettings,
+ modelSaveLocation,
+ csvTableAppend = "") {
+ rlang::check_installed("readr")
+
+ ParallelLogger::logInfo("Starting input checks")
+
+ csvFileNames <- tryCatch(
+ {
+ dir(csvFolder, pattern = "csv")
+ },
+ error = function(e) {
+ ParallelLogger::logInfo(e)
+ return(NULL)
+ }
)
- if(is.null(csvFileNames)){
+ if (is.null(csvFileNames)) {
return(invisible(NULL))
}
-
- if(!missing(csvTableAppend)){
- csvFileNamesNoAppend <- sub(csvTableAppend, '', csvFileNames)
- } else{
+
+ if (!missing(csvTableAppend)) {
+ csvFileNamesNoAppend <- sub(csvTableAppend, "", csvFileNames)
+ } else {
csvFileNamesNoAppend <- csvFileNames
- }
-
+ }
+
# check all tables are in folder
# settings/resultsDataModelSpecification.csv table_name
resultNames <- paste0(unique(
readr::read_csv(
system.file(
- 'settings',
- 'resultsDataModelSpecification.csv',
+ "settings",
+ "resultsDataModelSpecification.csv",
package = "PatientLevelPrediction"
)
)$table_name
- ), '.csv')
- if(sum(csvFileNamesNoAppend %in% resultNames) != length(resultNames)){
- missingTables <- paste(resultNames[!resultNames %in% csvFileNamesNoAppend], collapse = ',')
- ParallelLogger::logInfo(paste0('CSV folder missing these tables: ', missingTables))
+ ), ".csv")
+ if (sum(csvFileNamesNoAppend %in% resultNames) != length(resultNames)) {
+ missingTables <- paste(resultNames[!resultNames %in% csvFileNamesNoAppend], collapse = ",")
+ ParallelLogger::logInfo(paste0("CSV folder missing these tables: ", missingTables))
return(invisible(NULL))
}
-
+
alltables <- getTableNamesPlp(
connectionDetails = connectionDetails,
databaseSchema = databaseSchemaSettings$resultSchema
)
-
- if(!tolower(paste0(databaseSchemaSettings$tablePrefix,'PERFORMANCES')) %in% alltables){
+
+ if (!tolower(paste0(databaseSchemaSettings$tablePrefix, "PERFORMANCES")) %in% alltables) {
ParallelLogger::logInfo(
paste0(
- 'performance table: ',paste0(toupper(databaseSchemaSettings$tablePrefix),'PERFORMANCES'),' not found, result database only contains ',
- paste(alltables, collapse = ',')
- )
+ "performance table: ", paste0(toupper(databaseSchemaSettings$tablePrefix), "PERFORMANCES"), " not found, result database only contains ",
+ paste(alltables, collapse = ",")
)
+ )
return(invisible(NULL))
}
-
- ParallelLogger::logInfo('Input checks passed')
- ParallelLogger::logInfo('Extracting cohort definitions')
+
+ ParallelLogger::logInfo("Input checks passed")
+ ParallelLogger::logInfo("Extracting cohort definitions")
# create cohortDefinitions:
cohortDefinitions <- extractCohortDefinitionsCSV(
csvFolder = csvFolder
)
-
- ParallelLogger::logInfo('Extracting database details')
+
+ ParallelLogger::logInfo("Extracting database details")
# create databaseList
databaseList <- extractDatabaseListCSV(
csvFolder = csvFolder
)
-
- ParallelLogger::logInfo('Extracting performance ids')
- performanceIds <- readr::read_csv(file.path(csvFolder, csvFileNames[grep('performances', csvFileNames)]))$performance_id
-
- if(length(performanceIds) > 0 ){
- for(performanceId in performanceIds){
+
+ ParallelLogger::logInfo("Extracting performance ids")
+ performanceIds <- readr::read_csv(file.path(csvFolder, csvFileNames[grep("performances", csvFileNames)]))$performance_id
+
+ if (length(performanceIds) > 0) {
+ for (performanceId in performanceIds) {
ParallelLogger::logInfo(
paste0(
- 'Converting and inserting performance id',
+ "Converting and inserting performance id",
performanceId
)
)
# convert to runPlp
runPlp <- extractObjectFromCsv(
- performanceId = performanceId,
+ performanceId = performanceId,
csvFolder = csvFolder
)
-
+
# load into database
addRunPlpToDatabase(
runPlp = runPlp,
@@ -115,22 +117,22 @@ insertCsvToDatabase <- function(
)
}
}
-
- diagnosticIds <- readr::read_csv(file.path(csvFolder, csvFileNames[grep('diagnostics', csvFileNames)]))$diagnostic_id
- if(length(diagnosticIds) > 0){
- for(diagnosticId in diagnosticIds){
+ diagnosticIds <- readr::read_csv(file.path(csvFolder, csvFileNames[grep("diagnostics", csvFileNames)]))$diagnostic_id
+
+ if (length(diagnosticIds) > 0) {
+ for (diagnosticId in diagnosticIds) {
ParallelLogger::logInfo(
paste0(
- 'Converting and inserting diagnostic id',
+ "Converting and inserting diagnostic id",
diagnosticId
)
)
diagnosePlp <- extractDiagnosticFromCsv(
- diagnosticId = diagnosticId,
+ diagnosticId = diagnosticId,
csvFolder = csvFolder
)
- if(!is.null(diagnosePlp)){
+ if (!is.null(diagnosePlp)) {
tryCatch(
{
addDiagnosePlpToDatabase(
@@ -140,16 +142,17 @@ insertCsvToDatabase <- function(
cohortDefinitions = cohortDefinitions,
databaseList = databaseList
)
- }, error = function(e){ParallelLogger::logError(e)}
+ },
+ error = function(e) {
+ ParallelLogger::logError(e)
+ }
)
}
-
}
}
-
-
+
+
return(TRUE)
-
}
@@ -157,90 +160,95 @@ insertCsvToDatabase <- function(
extractCohortDefinitionsCSV <- function(
- csvFolder
-){
-
+ csvFolder) {
# cohorts: cohort_id, cohort_definition_id, cohort_name
- # cohort_definition: cohort_definition_id cohort_name description json sql_command
-
- cohortDefinitionName <- dir(csvFolder, pattern = 'cohort_definition.csv')
+ # cohort_definition: cohort_definition_id cohort_name description json sql_command
+
+ cohortDefinitionName <- dir(csvFolder, pattern = "cohort_definition.csv")
cohort_definition <- readr::read_csv(file.path(csvFolder, cohortDefinitionName))
-
+
result <- data.frame(
cohortId = cohort_definition$cohort_definition_id,
cohortName = cohort_definition$cohort_name,
json = cohort_definition$json,
- sql = cohort_definition$sql_command
- )
-
+ sql = cohort_definition$sql_command
+ )
+
return(result)
}
extractDatabaseListCSV <- function(
- csvFolder
-){
+ csvFolder) {
# database_meta_data: database_id cdm_source_name cdm_source_abbreviation
# database_details: database_id database_meta_data_id
- databaseMetaDataName <- dir(csvFolder, pattern = 'database_meta_data.csv')
- databaseMetaData <- readr::read_csv(file.path(csvFolder, databaseMetaDataName))
+ databaseMetaDataName <- dir(csvFolder, pattern = "database_meta_data.csv")
+ databaseMetaData <- readr::read_csv(file.path(csvFolder, databaseMetaDataName))
databaseList <- createDatabaseList(
cdmDatabaseSchemas = databaseMetaData$cdm_source_name,
cdmDatabaseNames = databaseMetaData$cdm_source_abbreviation,
databaseRefIds = databaseMetaData$database_id
)
-
+
return(databaseList)
}
-getModelDesignSettingTable <- function(modeldesignsRow){
+getModelDesignSettingTable <- function(modeldesignsRow) {
result <- data.frame(
- tableName = c('cohorts', 'cohorts',
- 'population_settings', 'plp_data_settings',
- 'model_settings', 'covariate_settings', 'sample_settings',
- 'split_settings', 'feature_engineering_settings',
- 'tidy_covariates_settings'),
- idColumn = c('cohort_id', 'cohort_id',
- 'population_setting_id', 'plp_data_setting_id',
- 'model_setting_id', 'covariate_setting_id', 'sample_setting_id',
- 'split_setting_id', 'feature_engineering_setting_id',
- 'tidy_covariates_setting_id'),
- jsonColumn = c('cohort_definition_id', 'cohort_definition_id',
- 'population_settings_json', 'plp_data_settings_json',
- 'model_settings_json', 'covariate_settings_json', 'sample_settings_json',
- 'split_settings_json', 'feature_engineering_settings_json',
- 'tidy_covariates_settings_json'),
- convertJson = c(rep(F,2), rep(T, 8)),
- value = c(modeldesignsRow$target_id, modeldesignsRow$outcome_id,
- modeldesignsRow$population_setting_id, modeldesignsRow$plp_data_setting_id,
- modeldesignsRow$model_setting_id, modeldesignsRow$covariate_setting_id, modeldesignsRow$sample_setting_id,
- modeldesignsRow$split_setting_id, modeldesignsRow$feature_engineering_setting_id ,
- modeldesignsRow$tidy_covariates_setting_id),
- modelDesignInput = c('targetId', 'outcomeId',
- 'populationSettings', 'restrictPlpDataSettings',
- 'modelSettings', 'covariateSettings', 'sampleSettings',
- 'splitSettings', 'featureEngineeringSettings',
- 'preprocessSettings')
+ tableName = c(
+ "cohorts", "cohorts",
+ "population_settings", "plp_data_settings",
+ "model_settings", "covariate_settings", "sample_settings",
+ "split_settings", "feature_engineering_settings",
+ "tidy_covariates_settings"
+ ),
+ idColumn = c(
+ "cohort_id", "cohort_id",
+ "population_setting_id", "plp_data_setting_id",
+ "model_setting_id", "covariate_setting_id", "sample_setting_id",
+ "split_setting_id", "feature_engineering_setting_id",
+ "tidy_covariates_setting_id"
+ ),
+ jsonColumn = c(
+ "cohort_definition_id", "cohort_definition_id",
+ "population_settings_json", "plp_data_settings_json",
+ "model_settings_json", "covariate_settings_json", "sample_settings_json",
+ "split_settings_json", "feature_engineering_settings_json",
+ "tidy_covariates_settings_json"
+ ),
+ convertJson = c(rep(FALSE, 2), rep(TRUE, 8)),
+ value = c(
+ modeldesignsRow$target_id, modeldesignsRow$outcome_id,
+ modeldesignsRow$population_setting_id, modeldesignsRow$plp_data_setting_id,
+ modeldesignsRow$model_setting_id, modeldesignsRow$covariate_setting_id, modeldesignsRow$sample_setting_id,
+ modeldesignsRow$split_setting_id, modeldesignsRow$feature_engineering_setting_id,
+ modeldesignsRow$tidy_covariates_setting_id
+ ),
+ modelDesignInput = c(
+ "targetId", "outcomeId",
+ "populationSettings", "restrictPlpDataSettings",
+ "modelSettings", "covariateSettings", "sampleSettings",
+ "splitSettings", "featureEngineeringSettings",
+ "preprocessSettings"
+ )
)
return(result)
}
getModelDesignCsv <- function(
- modelDesignSettingTable,
- csvFolder = csvFolder
-) {
-
- csvFileNames <- dir(csvFolder, pattern = '.csv')
-
+ modelDesignSettingTable,
+ csvFolder = csvFolder) {
+ csvFileNames <- dir(csvFolder, pattern = ".csv")
+
result <- list()
- for(i in 1:nrow(modelDesignSettingTable)){
+ for (i in 1:nrow(modelDesignSettingTable)) {
table <- readr::read_csv(file.path(csvFolder, csvFileNames[grep(modelDesignSettingTable$tableName[i], csvFileNames)]))
ind <- table[modelDesignSettingTable$idColumn[i]] == modelDesignSettingTable$value[i]
- result[[i]] <- table[ind,][modelDesignSettingTable$jsonColumn[i]]
- if(modelDesignSettingTable$convertJson[i]){
+ result[[i]] <- table[ind, ][modelDesignSettingTable$jsonColumn[i]]
+ if (modelDesignSettingTable$convertJson[i]) {
result[[i]] <- ParallelLogger::convertJsonToSettings(as.character(result[[i]]))
- } else{
+ } else {
      # ids need to be numeric
result[[i]] <- as.double(result[[i]])
}
@@ -248,307 +256,311 @@ getModelDesignCsv <- function(
names(result) <- modelDesignSettingTable$modelDesignInput
modelDesign <- do.call(what = PatientLevelPrediction::createModelDesign, args = result)
-
+
return(modelDesign)
}
getPerformanceEvaluationCsv <- function(
- performanceId,
- csvFolder
-){
-
- csvFileNames <- dir(csvFolder, pattern = '.csv')
-
+ performanceId,
+ csvFolder) {
+ csvFileNames <- dir(csvFolder, pattern = ".csv")
+
result <- list(
-
evaluationStatistics = tryCatch(
{
- res <- readr::read_csv(file.path(csvFolder, csvFileNames[grep('evaluation_statistics', csvFileNames)])) %>%
- dplyr::filter(.data$performance_id == !!performanceId) %>%
- dplyr::select(-"performance_id");
- colnames(res) <- SqlRender::snakeCaseToCamelCase( colnames(res));
- res
- },
- error = function(e){ParallelLogger::logInfo(e); return(NULL)}
+ res <- readr::read_csv(file.path(csvFolder, csvFileNames[grep("evaluation_statistics", csvFileNames)])) %>%
+ dplyr::filter(.data$performance_id == !!performanceId) %>%
+ dplyr::select(-"performance_id")
+ colnames(res) <- SqlRender::snakeCaseToCamelCase(colnames(res))
+ res
+ },
+ error = function(e) {
+ ParallelLogger::logInfo(e)
+ return(NULL)
+ }
),
-
- thresholdSummary = tryCatch({
- res <- readr::read_csv(file.path(csvFolder, csvFileNames[grep('threshold_summary', csvFileNames)])) %>%
- dplyr::filter(.data$performance_id == !!performanceId) %>%
- dplyr::select(-"performance_id");
- colnames(res) <- SqlRender::snakeCaseToCamelCase( colnames(res));
- res
- },
- error = function(e){ParallelLogger::logInfo(e); return(NULL)}
+ thresholdSummary = tryCatch(
+ {
+ res <- readr::read_csv(file.path(csvFolder, csvFileNames[grep("threshold_summary", csvFileNames)])) %>%
+ dplyr::filter(.data$performance_id == !!performanceId) %>%
+ dplyr::select(-"performance_id")
+ colnames(res) <- SqlRender::snakeCaseToCamelCase(colnames(res))
+ res
+ },
+ error = function(e) {
+ ParallelLogger::logInfo(e)
+ return(NULL)
+ }
),
-
- calibrationSummary = tryCatch({
- res <- readr::read_csv(file.path(csvFolder, csvFileNames[grep('calibration_summary', csvFileNames)])) %>%
- dplyr::filter(.data$performance_id == !!performanceId) %>%
- dplyr::select(-"performance_id");
- colnames(res) <- SqlRender::snakeCaseToCamelCase( colnames(res));
- res
- },
- error = function(e){ParallelLogger::logInfo(e); return(NULL)}
+ calibrationSummary = tryCatch(
+ {
+ res <- readr::read_csv(file.path(csvFolder, csvFileNames[grep("calibration_summary", csvFileNames)])) %>%
+ dplyr::filter(.data$performance_id == !!performanceId) %>%
+ dplyr::select(-"performance_id")
+ colnames(res) <- SqlRender::snakeCaseToCamelCase(colnames(res))
+ res
+ },
+ error = function(e) {
+ ParallelLogger::logInfo(e)
+ return(NULL)
+ }
),
-
- demographicSummary = tryCatch({
- res <- readr::read_csv(file.path(csvFolder, csvFileNames[grep('demographic_summary', csvFileNames)])) %>%
- dplyr::filter(.data$performance_id == !!performanceId) %>%
- dplyr::select(-"performance_id");
- colnames(res) <- SqlRender::snakeCaseToCamelCase( colnames(res));
- res
- },
- error = function(e){ParallelLogger::logInfo(e); return(NULL)}
+ demographicSummary = tryCatch(
+ {
+ res <- readr::read_csv(file.path(csvFolder, csvFileNames[grep("demographic_summary", csvFileNames)])) %>%
+ dplyr::filter(.data$performance_id == !!performanceId) %>%
+ dplyr::select(-"performance_id")
+ colnames(res) <- SqlRender::snakeCaseToCamelCase(colnames(res))
+ res
+ },
+ error = function(e) {
+ ParallelLogger::logInfo(e)
+ return(NULL)
+ }
),
-
- predictionDistribution = tryCatch({
- res <- readr::read_csv(file.path(csvFolder, csvFileNames[grep('prediction_distribution', csvFileNames)])) %>%
- dplyr::filter(.data$performance_id == !!performanceId) %>%
- dplyr::select(-"performance_id");
- colnames(res) <- SqlRender::snakeCaseToCamelCase( colnames(res));
- res
- },
- error = function(e){ParallelLogger::logInfo(e); return(NULL)}
+ predictionDistribution = tryCatch(
+ {
+ res <- readr::read_csv(file.path(csvFolder, csvFileNames[grep("prediction_distribution", csvFileNames)])) %>%
+ dplyr::filter(.data$performance_id == !!performanceId) %>%
+ dplyr::select(-"performance_id")
+ colnames(res) <- SqlRender::snakeCaseToCamelCase(colnames(res))
+ res
+ },
+ error = function(e) {
+ ParallelLogger::logInfo(e)
+ return(NULL)
+ }
)
)
-
+
return(result)
-
}
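+# Usage sketch (hypothetical id and path): gather the five evaluation tables
+# exported as csvs for a single performance id:
+# perf <- getPerformanceEvaluationCsv(performanceId = 1, csvFolder = "export")
+# names(perf) # evaluationStatistics, thresholdSummary, calibrationSummary, ...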
extractObjectFromCsv <- function(
- performanceId,
- csvFolder
-){
-
- csvFileNames <- dir(csvFolder, pattern = '.csv')
-
+ performanceId,
+ csvFolder) {
+ csvFileNames <- dir(csvFolder, pattern = ".csv")
+
# get the model design
# performance_id model_design_id development_database_id validation_database_id target_id outcome_id tar_id plp_data_setting_id population_setting_id model_development execution_date_time plp_version
- performances <- readr::read_csv(file.path(csvFolder, csvFileNames[grep('performances', csvFileNames)]))
- poi <- performances[performances$performance_id == performanceId,,]
-
+ performances <- readr::read_csv(file.path(csvFolder, csvFileNames[grep("performances", csvFileNames)]))
+ poi <- performances[performances$performance_id == performanceId, , ]
+
modelDesignId <- poi$model_design_id
- modeldesigns <- readr::read_csv(file.path(csvFolder, csvFileNames[grep('model_designs', csvFileNames)]))
+ modeldesigns <- readr::read_csv(file.path(csvFolder, csvFileNames[grep("model_designs", csvFileNames)]))
# model_design_id target_id outcome_id tar_id plp_data_setting_id population_setting_id model_setting_id covariate_setting_id sample_setting_id split_setting_id feature_engineering_setting_id tidy_covariates_setting_id
- modeldesigns <- modeldesigns[modeldesigns$model_design_id == modelDesignId,,]
-
+ modeldesigns <- modeldesigns[modeldesigns$model_design_id == modelDesignId, , ]
+
modelDesignSettingTable <- getModelDesignSettingTable(
modeldesignsRow = modeldesigns
)
-
+
modelDesign <- getModelDesignCsv(
- modelDesignSettingTable = modelDesignSettingTable,
+ modelDesignSettingTable = modelDesignSettingTable,
csvFolder = csvFolder
- )
-
- covariateSummary <- readr::read_csv(file.path(csvFolder, csvFileNames[grep('covariate_summary', csvFileNames)])) %>%
+ )
+
+ covariateSummary <- readr::read_csv(file.path(csvFolder, csvFileNames[grep("covariate_summary", csvFileNames)])) %>%
dplyr::filter(.data$performance_id == !!poi$performance_id) %>%
dplyr::select(-"performance_id")
colnames(covariateSummary) <- SqlRender::snakeCaseToCamelCase(colnames(covariateSummary))
-
+
performanceEvaluation <- getPerformanceEvaluationCsv(
- performanceId = poi$performance_id,
+ performanceId = poi$performance_id,
csvFolder = csvFolder
)
-
- modelMissing <- F
- if(poi$model_development == 1){
-
- modelsName <- dir(csvFolder, pattern = 'models.csv')
- models <- readr::read_csv(file.path(csvFolder, modelsName))
- models <- models %>%
- dplyr::filter(.data$model_design_id == !!poi$model_design_id ) %>%
+
+ modelMissing <- FALSE
+ if (poi$model_development == 1) {
+ modelsName <- dir(csvFolder, pattern = "models.csv")
+ models <- readr::read_csv(file.path(csvFolder, modelsName))
+ models <- models %>%
+ dplyr::filter(.data$model_design_id == !!poi$model_design_id) %>%
dplyr::filter(.data$database_id == !!poi$development_database_id)
-
- modelLoc <- strsplit(x = models$plp_model_file, split = '/')[[1]][length(strsplit(x = models$plp_model_file, split = '/')[[1]])]
- plpModel <- tryCatch({
- PatientLevelPrediction::loadPlpModel(file.path(csvFolder, 'models', modelLoc))
- },
- error = function(e){ParallelLogger::logInfo(e); return(NULL)}
+
+ modelLoc <- strsplit(x = models$plp_model_file, split = "/")[[1]][length(strsplit(x = models$plp_model_file, split = "/")[[1]])]
+ plpModel <- tryCatch(
+ {
+ PatientLevelPrediction::loadPlpModel(file.path(csvFolder, "models", modelLoc))
+ },
+ error = function(e) {
+ ParallelLogger::logInfo(e)
+ return(NULL)
+ }
)
-
- resultClass <- 'runPlp'
-
- if(is.null(modelLoc)){
- ParallelLogger::logInfo('Models missing from csv folder - just adding performance')
- modelMissing <- T
+
+ resultClass <- "runPlp"
+
+  if (is.null(plpModel)) { # plpModel is NULL when loadPlpModel failed above
+ ParallelLogger::logInfo("Models missing from csv folder - just adding performance")
+ modelMissing <- TRUE
}
-
}
-
- if(poi$model_development == 0 | modelMissing){
-
+
+ if (poi$model_development == 0 || modelMissing) {
# database_details: database_id database_meta_data_id
- databaseMetaDataName <- dir(csvFolder, pattern = 'database_meta_data.csv')
- databaseMetaData <- readr::read_csv(file.path(csvFolder, databaseMetaDataName))
- databaseDetailsName <- dir(csvFolder, pattern = 'database_details.csv')
- databaseDetails <- readr::read_csv(file.path(csvFolder, databaseDetailsName))
- databases <- merge(databaseDetails, databaseMetaData, by.x = 'database_meta_data_id', by.y = 'database_id')
-
- dev <- databases[databases$database_id == poi$development_database_id,,]
- val <- databases[databases$database_id == poi$validation_database_id,,]
-
+ databaseMetaDataName <- dir(csvFolder, pattern = "database_meta_data.csv")
+ databaseMetaData <- readr::read_csv(file.path(csvFolder, databaseMetaDataName))
+ databaseDetailsName <- dir(csvFolder, pattern = "database_details.csv")
+ databaseDetails <- readr::read_csv(file.path(csvFolder, databaseDetailsName))
+ databases <- merge(databaseDetails, databaseMetaData, by.x = "database_meta_data_id", by.y = "database_id")
+
+ dev <- databases[databases$database_id == poi$development_database_id, , ]
+ val <- databases[databases$database_id == poi$validation_database_id, , ]
+
developmentDatabase <- dev$cdm_source_name
developmentDatabaseId <- dev$database_meta_data_id
validationDatabase <- val$cdm_source_name
validationDatabaseId <- val$database_meta_data_id
-
- attritionName <- dir(csvFolder, pattern = 'attrition.csv')
- attrition <- readr::read_csv(file.path(csvFolder, attritionName)) %>%
- dplyr::filter(.data$performance_id == !!poi$performance_id) %>%
- dplyr::select(-"performance_id")
- colnames(attrition) <- SqlRender::snakeCaseToCamelCase(colnames(attrition))
-
- cohortsName <- dir(csvFolder, pattern = 'cohorts.csv')
- cohorts <- readr::read_csv(file.path(csvFolder, cohortsName))
- plpDataSetName <- dir(csvFolder, pattern = 'plp_data_settings.csv')
- plpDataSet <- readr::read_csv(file.path(csvFolder, plpDataSetName))
- popSetName <- dir(csvFolder, pattern = 'population_settings.csv')
- popSet <- readr::read_csv(file.path(csvFolder, popSetName))
-
- # get the model
- plpModel <- list(
- model = 'external validation of model',
- modelDesign = modelDesign,
- validationDetails = list(
- analysisId = '',
- analysisSource = '',
- developmentDatabase = developmentDatabase,
- developmentDatabaseId = developmentDatabaseId,
- validationDatabase = validationDatabase,
- validationDatabaseId = validationDatabaseId,
-
- populationSettings = ParallelLogger::convertJsonToSettings(
- as.character(
- popSet %>%
- dplyr::filter(.data$population_setting_id == !!poi$population_setting_id) %>%
- dplyr::select("population_settings_json")
- )
- ),
- restrictPlpDataSettings = ParallelLogger::convertJsonToSettings(
- as.character(
- plpDataSet %>%
+
+ attritionName <- dir(csvFolder, pattern = "attrition.csv")
+ attrition <- readr::read_csv(file.path(csvFolder, attritionName)) %>%
+ dplyr::filter(.data$performance_id == !!poi$performance_id) %>%
+ dplyr::select(-"performance_id")
+ colnames(attrition) <- SqlRender::snakeCaseToCamelCase(colnames(attrition))
+
+ cohortsName <- dir(csvFolder, pattern = "cohorts.csv")
+ cohorts <- readr::read_csv(file.path(csvFolder, cohortsName))
+ plpDataSetName <- dir(csvFolder, pattern = "plp_data_settings.csv")
+ plpDataSet <- readr::read_csv(file.path(csvFolder, plpDataSetName))
+ popSetName <- dir(csvFolder, pattern = "population_settings.csv")
+ popSet <- readr::read_csv(file.path(csvFolder, popSetName))
+
+ # get the model
+ plpModel <- list(
+ model = "external validation of model",
+ modelDesign = modelDesign,
+ validationDetails = list(
+ analysisId = "",
+ analysisSource = "",
+ developmentDatabase = developmentDatabase,
+ developmentDatabaseId = developmentDatabaseId,
+ validationDatabase = validationDatabase,
+ validationDatabaseId = validationDatabaseId,
+ populationSettings = ParallelLogger::convertJsonToSettings(
+ as.character(
+ popSet %>%
+ dplyr::filter(.data$population_setting_id == !!poi$population_setting_id) %>%
+ dplyr::select("population_settings_json")
+ )
+ ),
+ restrictPlpDataSettings = ParallelLogger::convertJsonToSettings(
+ as.character(
+ plpDataSet %>%
dplyr::filter(.data$plp_data_setting_id == !!poi$plp_data_setting_id) %>%
dplyr::select("plp_data_settings_json")
- )
- ),
-
- outcomeId = as.double(
- cohorts %>%
- dplyr::filter(.data$cohort_id == !!poi$outcome_id) %>%
- dplyr::select("cohort_definition_id")
- ),
- targetId = as.double(
- cohorts %>%
- dplyr::filter(.data$cohort_id == !!poi$target_id) %>%
- dplyr::select("cohort_definition_id")
- ),
-
- attrition = attrition
- )
+ )
+ ),
+ outcomeId = as.double(
+ cohorts %>%
+ dplyr::filter(.data$cohort_id == !!poi$outcome_id) %>%
+ dplyr::select("cohort_definition_id")
+ ),
+ targetId = as.double(
+ cohorts %>%
+ dplyr::filter(.data$cohort_id == !!poi$target_id) %>%
+ dplyr::select("cohort_definition_id")
+ ),
+ attrition = attrition
)
- attr(plpModel, "predictionFunction") <- 'none'
- attr(plpModel, "saveType") <- 'RtoJson'
- class(plpModel) <- 'plpModel'
-
- resultClass <- 'externalValidatePlp'
+ )
+ attr(plpModel, "predictionFunction") <- "none"
+ attr(plpModel, "saveType") <- "RtoJson"
+ class(plpModel) <- "plpModel"
+
+ resultClass <- "externalValidatePlp"
}
-
+
result <- list(
executionSummary = list(
PackageVersion = list(
packageVersion = poi$plp_version
),
- #TotalExecutionElapsedTime = ,
- ExecutionDateTime = poi$execution_date_time
+ # TotalExecutionElapsedTime = ,
+ ExecutionDateTime = poi$execution_date_time
),
model = plpModel,
performanceEvaluation = performanceEvaluation,
covariateSummary = covariateSummary,
analysisRef = list(
- analysisId = ''
- )
+ analysisId = ""
+ )
)
class(result) <- resultClass
-
+
# return the object
return(result)
-
}
extractDiagnosticFromCsv <- function(
- diagnosticId,
- csvFolder
-){
-
+ diagnosticId,
+ csvFolder) {
# diagnostic_id model_design_id database_id execution_date_time
- csvFileNames <- dir(csvFolder, pattern = '.csv')
-
+ csvFileNames <- dir(csvFolder, pattern = ".csv")
+
# get the model design
# performance_id model_design_id development_database_id validation_database_id target_id outcome_id tar_id plp_data_setting_id population_setting_id model_development execution_date_time plp_version
- diagnostics <- readr::read_csv(file.path(csvFolder, csvFileNames[grep('diagnostics', csvFileNames)]))
- if(length(diagnostics) == 0){
- ParallelLogger::logInfo('No diagnostics in csv results')
+ diagnostics <- readr::read_csv(file.path(csvFolder, csvFileNames[grep("diagnostics", csvFileNames)]))
+ if (length(diagnostics) == 0) {
+ ParallelLogger::logInfo("No diagnostics in csv results")
return(NULL)
}
- doi <- diagnostics[diagnostics$diagnostic_id == diagnosticId,,]
- if(nrow(doi) == 0){
- ParallelLogger::logInfo('No diagnostics in csv results with specified diagnosticId')
+ doi <- diagnostics[diagnostics$diagnostic_id == diagnosticId, , ]
+ if (nrow(doi) == 0) {
+ ParallelLogger::logInfo("No diagnostics in csv results with specified diagnosticId")
return(NULL)
}
-
+
modelDesignId <- doi$model_design_id
- modeldesigns <- readr::read_csv(file.path(csvFolder, csvFileNames[grep('model_designs', csvFileNames)]))
+ modeldesigns <- readr::read_csv(file.path(csvFolder, csvFileNames[grep("model_designs", csvFileNames)]))
# model_design_id target_id outcome_id tar_id plp_data_setting_id population_setting_id model_setting_id covariate_setting_id sample_setting_id split_setting_id feature_engineering_setting_id tidy_covariates_setting_id
- modeldesigns <- modeldesigns[modeldesigns$model_design_id == modelDesignId,,]
-
+ modeldesigns <- modeldesigns[modeldesigns$model_design_id == modelDesignId, , ]
+
modelDesignSettingTable <- getModelDesignSettingTable(
modeldesignsRow = modeldesigns
)
-
+
modelDesign <- getModelDesignCsv(
- modelDesignSettingTable = modelDesignSettingTable,
+ modelDesignSettingTable = modelDesignSettingTable,
csvFolder = csvFolder
)
-
- databaseMetaDataName <- dir(csvFolder, pattern = 'database_meta_data.csv')
- databaseMetaData <- readr::read_csv(file.path(csvFolder, databaseMetaDataName))
- databaseDetailsName <- dir(csvFolder, pattern = 'database_details.csv')
- databaseDetails <- readr::read_csv(file.path(csvFolder, databaseDetailsName))
- databases <- merge(databaseDetails, databaseMetaData, by.x = 'database_meta_data_id', by.y = 'database_id')
-
+
+ databaseMetaDataName <- dir(csvFolder, pattern = "database_meta_data.csv")
+ databaseMetaData <- readr::read_csv(file.path(csvFolder, databaseMetaDataName))
+ databaseDetailsName <- dir(csvFolder, pattern = "database_details.csv")
+ databaseDetails <- readr::read_csv(file.path(csvFolder, databaseDetailsName))
+ databases <- merge(databaseDetails, databaseMetaData, by.x = "database_meta_data_id", by.y = "database_id")
+
-  db <- databases[databases$database_id == doi$database_id]
+  db <- databases[databases$database_id == doi$database_id, , ]
-
+
databaseSchema <- db$cdm_source_name
databaseId <- db$database_meta_data_id
-
- outcomesName <- dir(csvFolder, pattern = 'diagnostic_outcomes.csv')
- outcomes <- readr::read_csv(file.path(csvFolder, outcomesName)) %>%
- dplyr::filter(.data$diagnostic_id == !! diagnosticId) %>%
+
+ outcomesName <- dir(csvFolder, pattern = "diagnostic_outcomes.csv")
+ outcomes <- readr::read_csv(file.path(csvFolder, outcomesName)) %>%
+ dplyr::filter(.data$diagnostic_id == !!diagnosticId) %>%
dplyr::select(-"diagnostic_id")
- colnames(outcomes) <- SqlRender::snakeCaseToCamelCase(colnames(outcomes))
-
- predictorsName <- dir(csvFolder, pattern = 'diagnostic_predictors.csv')
- predictors <- readr::read_csv(file.path(csvFolder, predictorsName)) %>%
- dplyr::filter(.data$diagnostic_id == !! diagnosticId) %>%
+ colnames(outcomes) <- SqlRender::snakeCaseToCamelCase(colnames(outcomes))
+
+ predictorsName <- dir(csvFolder, pattern = "diagnostic_predictors.csv")
+ predictors <- readr::read_csv(file.path(csvFolder, predictorsName)) %>%
+ dplyr::filter(.data$diagnostic_id == !!diagnosticId) %>%
dplyr::select(-"diagnostic_id")
- colnames(predictors) <- SqlRender::snakeCaseToCamelCase(colnames(predictors))
-
- participantsName <- dir(csvFolder, pattern = 'diagnostic_participants.csv')
- participants <- readr::read_csv(file.path(csvFolder, participantsName)) %>%
- dplyr::filter(.data$diagnostic_id == !! diagnosticId) %>%
+ colnames(predictors) <- SqlRender::snakeCaseToCamelCase(colnames(predictors))
+
+ participantsName <- dir(csvFolder, pattern = "diagnostic_participants.csv")
+ participants <- readr::read_csv(file.path(csvFolder, participantsName)) %>%
+ dplyr::filter(.data$diagnostic_id == !!diagnosticId) %>%
dplyr::select(-"diagnostic_id")
- colnames(participants) <- SqlRender::snakeCaseToCamelCase(colnames(participants))
-
- summaryName <- dir(csvFolder, pattern = 'diagnostic_summary.csv')
- summary <- readr::read_csv(file.path(csvFolder, summaryName)) %>%
- dplyr::filter(.data$diagnostic_id == !! diagnosticId) %>%
+ colnames(participants) <- SqlRender::snakeCaseToCamelCase(colnames(participants))
+
+ summaryName <- dir(csvFolder, pattern = "diagnostic_summary.csv")
+ summary <- readr::read_csv(file.path(csvFolder, summaryName)) %>%
+ dplyr::filter(.data$diagnostic_id == !!diagnosticId) %>%
dplyr::select(-"diagnostic_id")
- colnames(summary) <- SqlRender::snakeCaseToCamelCase(colnames(summary))
-
+ colnames(summary) <- SqlRender::snakeCaseToCamelCase(colnames(summary))
+
result <- list(
summary = summary,
participants = participants,
@@ -559,26 +571,24 @@ extractDiagnosticFromCsv <- function(
databaseSchema = databaseSchema,
databaseId = databaseId
)
-
- class(result) <- 'diagnosePlp'
-
+
+ class(result) <- "diagnosePlp"
+
return(result)
}
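+# Sketch (hypothetical inputs): rebuild a diagnosePlp object from csv exports
+# diag <- extractDiagnosticFromCsv(diagnosticId = 1, csvFolder = "export")
+# class(diag) # "diagnosePlp"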
getTableNamesPlp <- function(
connectionDetails,
- databaseSchema
-){
-
- # check some plp tables exists in databaseSchemaSettings
- conn <- DatabaseConnector::connect(connectionDetails)
+ databaseSchema) {
+ # check some plp tables exists in databaseSchemaSettings
+ conn <- DatabaseConnector::connect(connectionDetails)
on.exit(DatabaseConnector::disconnect(conn))
-
+
result <- DatabaseConnector::getTableNames(
- connection = conn,
+ connection = conn,
databaseSchema = databaseSchema
)
-
+
return(tolower(result))
}
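+# Sketch (hypothetical connection details): list the plp tables in a schema
+# connectionDetails <- DatabaseConnector::createConnectionDetails(
+#   dbms = "sqlite", server = "results.sqlite")
+# getTableNamesPlp(connectionDetails, databaseSchema = "main")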
diff --git a/R/KNN.R b/R/KNN.R
deleted file mode 100644
index 834b26eaa..000000000
--- a/R/KNN.R
+++ /dev/null
@@ -1,202 +0,0 @@
-# @file knn.R
-#
-# Copyright 2021 Observational Health Data Sciences and Informatics
-#
-# This file is part of PatientLevelPrediction
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-#' Create setting for knn model
-#'
-#' @param k The number of neighbors to consider
-#' @param indexFolder The directory where the results and intermediate steps are output
-#' @param threads The number of threads to use when applying big knn
-#'
-#' @examples
-#' \dontrun{
-#' model.knn <- setKNN(k=10000)
-#' }
-#' @export
-setKNN <- function(k=1000, indexFolder=file.path(getwd(),'knn'), threads = 1 ){
- ensure_installed("BigKnn")
-
- checkIsClass(indexFolder, c('character'))
-
- checkIsClass(k, c('numeric','integer'))
- checkHigher(k, 0)
-
- if(length(k)>1){
- ParallelLogger::logWarn('k can only be a single value - using first value only')
- k <- k[1]
- }
-
-
- param <- list(
- k = k,
- indexFolder = indexFolder,
- threads = threads
- )
-
- attr(param, 'settings') <- list(
- modelType = 'knn',
- modelName = 'K Nearest Neighbors'
- )
-
- attr(param, 'saveType') <- 'file'
-
- result <- list(
- fitFunction = "fitKNN",
- param = param
- )
-
- class(result) <- 'modelSettings'
-
- return(result)
-}
-
-fitKNN <- function(trainData, modelSettings, search = 'none', analysisId, ...){
-
- param <- modelSettings$param
-
- if (!FeatureExtraction::isCovariateData(trainData$covariateData)){
- stop("Needs correct covariateData")
- }
-
- settings <- attr(param, 'settings')
- ParallelLogger::logInfo(paste0('Training ', settings$modelName))
-
- start <- Sys.time()
- k <- param$k
- if(is.null(k))
- k <- 10
- if(is.null(param$threads)){
- param$threads<- 1
- }
- indexFolder <- param$indexFolder
-
-
- BigKnn::buildKnnFromPlpData(
- plpData = trainData,
- population = trainData$labels %>%
- dplyr::mutate(
- y = sapply(.data$outcomeCount, function(x) min(1,x))
- ),
- indexFolder = indexFolder,
- overwrite = TRUE
- )
-
- comp <- Sys.time() - start
-
- ParallelLogger::logInfo(paste0('Model knn trained - took:', format(comp, digits=3)))
-
- variableImportance <- as.data.frame(trainData$covariateData$covariateRef)
- variableImportance$covariateValue <- rep(1, nrow(variableImportance))
-
- prediction <- predictKnn(
- data = trainData,
- cohort = trainData$labels,
- plpModel = list(
- model = indexFolder,
- trainDetails = list(
- finalModelParameters = list(
- k = k,
- threads = param$threads
- )
- )
- )
- )
-
-
- prediction$evaluationType <- 'Train'
-
- result <- list(
- model = indexFolder,
-
- preprocessing = list(
- featureEngineering = attr(trainData, "metaData")$featureEngineering,#learned mapping
- tidyCovariates = attr(trainData$covariateData, "metaData")$tidyCovariateDataSettings, #learned mapping
- requireDenseMatrix = F
- ),
-
- prediction = prediction,
-
- modelDesign = PatientLevelPrediction::createModelDesign(
- targetId = attr(trainData, "metaData")$targetId,
- outcomeId = attr(trainData, "metaData")$outcomeId,
- restrictPlpDataSettings = attr(trainData, "metaData")$restrictPlpDataSettings,
- covariateSettings = attr(trainData, "metaData")$covariateSettings,
- populationSettings = attr(trainData, "metaData")$populationSettings,
- featureEngineeringSettings = attr(trainData$covariateData, "metaData")$featureEngineeringSettings,
- preprocessSettings = attr(trainData$covariateData, "metaData")$preprocessSettings,
- modelSetting = modelSettings,
- splitSettings = attr(trainData, "metaData")$splitSettings,
- sampleSettings = attr(trainData, "metaData")$sampleSettings
- ),
-
- trainDetails = list(
- analysisId = analysisId,
- developmentDatabase = attr(trainData, "metaData")$cdmDatabaseName,
- developmentDatabaseSchema = attr(trainData, "metaData")$cdmDatabaseSchema,
- attrition = attr(trainData, "metaData")$attrition,
- trainingTime = paste(as.character(abs(comp)), attr(comp,'units')),
- trainingDate = Sys.Date(),
- modelName = 'KNN',
- hyperParamSearch = data.frame(),
- finalModelParameters = list(
- k = k,
- threads = param$threads
- )
- ),
-
- covariateImportance = variableImportance
- )
-
-
- class(result) <- 'plpModel'
- attr(result, 'predictionFunction') <- 'predictKnn'
- attr(result, 'modelType') <- 'binary'
- attr(result, 'saveType') <- attr(param, 'saveType')
- return(result)
-}
-
-
-predictKnn <- function(
- plpModel,
- data,
- cohort
-){
-
- prediction <- BigKnn::predictKnn(
- covariates = data$covariateData$covariates,
- cohort = cohort[,!colnames(cohort)%in%'cohortStartDate'],
- indexFolder = plpModel$model,
- k = plpModel$trainDetails$finalModelParameters$k,
- weighted = TRUE,
- threads = plpModel$trainDetails$finalModelParameters$threads
- )
-
- # can add: threads = 1 in the future
-
- # return the cohorts as a data frame with the prediction added as
- # a new column with the column name 'value'
- prediction <- merge(cohort, prediction[,c('rowId','value')], by='rowId',
- all.x=T, fill=0)
- prediction$value[is.na(prediction$value)] <- 0
-
- attr(prediction, "metaData")$modelType <- 'binary'
-
- return(prediction)
-
-}
-
-
diff --git a/R/LearningCurve.R b/R/LearningCurve.R
index 863d7e778..a9a8d6cd3 100644
--- a/R/LearningCurve.R
+++ b/R/LearningCurve.R
@@ -20,16 +20,16 @@
#'
#' @description Creates a learning curve object, which can be plotted using the
#' \code{plotLearningCurve()} function.
-#'
+#'
#' @param plpData An object of type \code{plpData} - the patient level prediction
#' data extracted from the CDM.
-#' @param outcomeId (integer) The ID of the outcome.
+#' @param outcomeId (integer) The ID of the outcome.
#' @param analysisId (integer) Identifier for the analysis. It is used to create, e.g., the result folder. Default is a timestamp.
#' @param populationSettings An object of type \code{populationSettings} created using \code{createStudyPopulationSettings} that
-#' specifies how the data class labels are defined and addition any exclusions to apply to the
+#'                                  specifies how the data class labels are defined and additionally any exclusions to apply to the
#' plpData cohort
-#' @param splitSettings An object of type \code{splitSettings} that specifies how to split the data into train/validation/test.
-#' The default settings can be created using \code{createDefaultSplitSetting}.
+#' @param splitSettings An object of type \code{splitSettings} that specifies how to split the data into train/validation/test.
+#' The default settings can be created using \code{createDefaultSplitSetting}.
#' @param sampleSettings An object of type \code{sampleSettings} that specifies any under/over sampling to be done.
#' The default is none.
#' @param trainFractions A list of training fractions to create models for.
@@ -42,10 +42,10 @@
#' \itemize{
#' \item \code{c(500, 1000, 1500) } - a list of training events
#' }
-#' @param featureEngineeringSettings An object of \code{featureEngineeringSettings} specifying any feature engineering to be learned (using the train data)
-#' @param preprocessSettings An object of \code{preprocessSettings}. This setting specifies the minimum fraction of
-#' target population who must have a covariate for it to be included in the model training
-#' and whether to normalise the covariates before training
+#' @param featureEngineeringSettings An object of \code{featureEngineeringSettings} specifying any feature engineering to be learned (using the train data)
+#' @param preprocessSettings An object of \code{preprocessSettings}. This setting specifies the minimum fraction of
+#' target population who must have a covariate for it to be included in the model training
+#' and whether to normalise the covariates before training
#' @param modelSettings An object of class \code{modelSettings} created using one of the function:
#' \itemize{
#' \item \code{setLassoLogisticRegression()} A lasso logistic regression model
@@ -54,12 +54,12 @@
#' \item \code{setRandomForest()} A random forest model
#' \item \code{setDecisionTree()} A decision tree model
#' \item \code{setKNN()} A KNN model
-#' }
-#' @param logSettings An object of \code{logSettings} created using \code{createLogSettings}
-#' specifying how the logging is done
+#' }
+#' @param logSettings An object of \code{logSettings} created using \code{createLogSettings}
+#' specifying how the logging is done
#' @param executeSettings An object of \code{executeSettings} specifying which parts of the analysis to run
-#'
-#'
+#'
+#'
#' @param saveDirectory The path to the directory where the results will be saved (if NULL uses working directory)
#' @param cores The number of computer cores to use if running in parallel
#' @param parallel Whether to run the code in parallel
@@ -67,89 +67,87 @@
#' @return A learning curve object containing the various performance measures
#' obtained by the model for each training set fraction. It can be plotted
#' using \code{plotLearningCurve}.
-#'
+#'
#' @examples
#' \dontrun{
#' # define model
-#' modelSettings = PatientLevelPrediction::setLassoLogisticRegression()
-#'
+#' modelSettings <- PatientLevelPrediction::setLassoLogisticRegression()
+#'
#' # create learning curve
-#' learningCurve <- PatientLevelPrediction::createLearningCurve(population,
-#' plpData,
-#' modelSettings)
+#' learningCurve <- PatientLevelPrediction::createLearningCurve(
+#' population,
+#' plpData,
+#' modelSettings
+#' )
#' # plot learning curve
#' PatientLevelPrediction::plotLearningCurve(learningCurve)
#' }
-#'
+#'
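+#' # Alternative sketch: supplying hypothetical event counts via trainEvents,
+#' # e.g. trainEvents = c(500, 1000, 1500), overrides trainFractions above.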
#' @export
createLearningCurve <- function(
- plpData,
- outcomeId,
- parallel = T,
- cores = 4,
- modelSettings,
- saveDirectory = getwd(),
- analysisId = 'learningCurve',
- populationSettings = createStudyPopulationSettings(),
- splitSettings = createDefaultSplitSetting(),
- trainFractions = c(0.25, 0.50, 0.75),
- trainEvents = NULL,
- sampleSettings = createSampleSettings(),
- featureEngineeringSettings = createFeatureEngineeringSettings(),
- preprocessSettings = createPreprocessSettings(
- minFraction = 0.001,
- normalize = T
- ),
- logSettings = createLogSettings(),
- executeSettings = createExecuteSettings(
- runSplitData = T,
- runSampleData = F,
- runfeatureEngineering = F,
- runPreprocessData = T,
- runModelDevelopment = T,
- runCovariateSummary = F
- )
-){
-
+ plpData,
+ outcomeId,
+ parallel = TRUE,
+ cores = 4,
+ modelSettings,
+ saveDirectory = getwd(),
+ analysisId = "learningCurve",
+ populationSettings = createStudyPopulationSettings(),
+ splitSettings = createDefaultSplitSetting(),
+ trainFractions = c(0.25, 0.50, 0.75),
+ trainEvents = NULL,
+ sampleSettings = createSampleSettings(),
+ featureEngineeringSettings = createFeatureEngineeringSettings(),
+ preprocessSettings = createPreprocessSettings(
+ minFraction = 0.001,
+ normalize = TRUE
+ ),
+ logSettings = createLogSettings(),
+ executeSettings = createExecuteSettings(
+ runSplitData = TRUE,
+ runSampleData = FALSE,
+ runfeatureEngineering = FALSE,
+ runPreprocessData = TRUE,
+ runModelDevelopment = TRUE,
+ runCovariateSummary = FALSE
+ )) {
if (is.null(analysisId)) {
- analysisId <- gsub(':', '', gsub('-', '', gsub(' ', '', Sys.time())))
+ analysisId <- gsub(":", "", gsub("-", "", gsub(" ", "", Sys.time())))
}
-
-
+
+
# if trainEvents is provided override trainFractions input
if (!is.null(trainEvents)) {
-
trainFractions <- getTrainFractions(
trainEvents,
- plpData,
- outcomeId,
+ plpData,
+ outcomeId,
populationSettings,
splitSettings
)
-
}
-
+
# record global start time
ExecutionDateTime <- Sys.time()
-
- if(parallel){
- ensure_installed('parallel')
- if(is.null(cores)){
- ParallelLogger::logInfo(paste0('Number of cores not specified'))
+
+ if (parallel) {
+ rlang::check_installed("parallel")
+ if (is.null(cores)) {
+ ParallelLogger::logInfo(paste0("Number of cores not specified"))
cores <- parallel::detectCores()
- ParallelLogger::logInfo(paste0('Using all ', cores))
- ParallelLogger::logInfo(paste0('Set cores input to use fewer...'))
+ ParallelLogger::logInfo(paste0("Using all ", cores))
+ ParallelLogger::logInfo(paste0("Set cores input to use fewer..."))
}
-
+
# save data
- savePlpData(plpData, file.path(saveDirectory,'data'))
-
+ savePlpData(plpData, file.path(saveDirectory, "data"))
+
# code to run in parallel
- getLcSettings <- function(i){
+ getLcSettings <- function(i) {
result <- list(
- plpData = file.path(saveDirectory,'data'),
+ plpData = file.path(saveDirectory, "data"),
outcomeId = outcomeId,
- analysisId = paste0(analysisId,i),
+ analysisId = paste0(analysisId, i),
populationSettings = populationSettings,
splitSettings = splitSettings,
sampleSettings = sampleSettings,
@@ -161,28 +159,28 @@ createLearningCurve <- function(
saveDirectory = saveDirectory
)
result$splitSettings$train <- trainFractions[i]
-
+
return(result)
}
lcSettings <- lapply(1:length(trainFractions), getLcSettings)
-
+
cluster <- ParallelLogger::makeCluster(numberOfThreads = cores)
ParallelLogger::clusterRequire(cluster, c("PatientLevelPrediction", "Andromeda", "FeatureExtraction"))
-
- learningCurve <- ParallelLogger::clusterApply(cluster = cluster,
- x = lcSettings,
- fun = lcWrapper,
+
+ learningCurve <- ParallelLogger::clusterApply(
+ cluster = cluster,
+ x = lcSettings,
+ fun = lcWrapper,
stopOnError = FALSE,
- progressBar = TRUE)
+ progressBar = TRUE
+ )
ParallelLogger::stopCluster(cluster)
-
- } else{
-
+ } else {
# code to run not in parallel
# number of training set fractions
nRuns <- length(trainFractions)
-
- settings = list(
+
+ settings <- list(
plpData = quote(plpData),
outcomeId = outcomeId,
analysisId = analysisId,
@@ -196,59 +194,57 @@ createLearningCurve <- function(
executeSettings = executeSettings,
saveDirectory = saveDirectory
)
-
- learningCurve <- lapply(1:nRuns, function(i){
-
- settings$splitSettings$train = trainFractions[i]
- settings$analysisId = paste0(settings$analysisId, '_', i)
- result <- do.call(runPlp, settings)
-
- result <- learningCurveHelper(
- result = result,
- trainFractions = trainFractions[i]
+
+ learningCurve <- lapply(1:nRuns, function(i) {
+ settings$splitSettings$train <- trainFractions[i]
+ settings$analysisId <- paste0(settings$analysisId, "_", i)
+ result <- do.call(runPlp, settings)
+
+ result <- learningCurveHelper(
+ result = result,
+ trainFractions = trainFractions[i]
)
- return(result)
-
- })
-
+ return(result)
+ })
}
-
- learningCurve <- do.call(rbind,learningCurve)
-
- learningCurve <- tidyr::pivot_wider(
- data = learningCurve,
- names_from = 'name',
- values_from = 'value'
- )
- #learningCurve <- reshape2::dcast(data = learningCurve, trainFraction ~ name)
+ learningCurve <- do.call(rbind, learningCurve)
+
+ learningCurve <- tidyr::pivot_wider(
+ data = learningCurve,
+ names_from = "name",
+ values_from = "value"
+ )
endTime <- Sys.time()
TotalExecutionElapsedTime <-
as.numeric(difftime(endTime, ExecutionDateTime,
- units = "secs"))
- ParallelLogger::logInfo('Finished in ', round(TotalExecutionElapsedTime), ' secs.')
-
+ units = "secs"
+ ))
+ ParallelLogger::logInfo("Finished in ", round(TotalExecutionElapsedTime), " secs.")
+
return(learningCurve)
}
-lcWrapper <- function(settings){
+lcWrapper <- function(settings) {
plpData <- PatientLevelPrediction::loadPlpData(settings$plpData)
settings$plpData <- quote(plpData)
- result <- tryCatch({do.call(runPlp, settings)},
- warning = function(war) {
- ParallelLogger::logInfo(paste0('a warning: ', war))
- },
- error = function(err) {
- ParallelLogger::logError(paste0('an error: ', err))
- return(NULL)
- }
+ result <- tryCatch(
+ {
+ do.call(runPlp, settings)
+ },
+ warning = function(war) {
+ ParallelLogger::logInfo(paste0("a warning: ", war))
+ },
+ error = function(err) {
+ ParallelLogger::logError(paste0("an error: ", err))
+ return(NULL)
+ }
)
- if(!is.null(result)){
- ParallelLogger::logInfo('Extracting performance for learning curve...')
+ if (!is.null(result)) {
+ ParallelLogger::logInfo("Extracting performance for learning curve...")
final <- learningCurveHelper(result, settings$splitSettings$train)
return(final)
-
- } else{
+ } else {
return(c())
}
}
@@ -256,62 +252,62 @@ lcWrapper <- function(settings){
getTrainFractions <- function(
- trainEvents,
- plpData,
- outcomeId,
- populationSettings,
- splitSettings
-){
-
+ trainEvents,
+ plpData,
+ outcomeId,
+ populationSettings,
+ splitSettings) {
population <- do.call(
- createStudyPopulation,
+ createStudyPopulation,
list(
plpData = plpData,
outcomeId = outcomeId,
populationSettings = populationSettings
)
)
-
+
# compute training set fractions from training events
- samplesRequired <- trainEvents/(sum(population$outcomeCount/nrow(population)))
- trainFractionsTemp <- samplesRequired/nrow(population)
-
+ samplesRequired <- trainEvents / (sum(population$outcomeCount / nrow(population)))
+ trainFractionsTemp <- samplesRequired / nrow(population)
+
# filter out no. of events that would exceed the available training set size
binaryMask <- trainFractionsTemp <= (1.0 - splitSettings$test)
-
+
# override any input to trainFractions with event-based training fractions
trainFractions <- trainFractionsTemp[binaryMask]
-
+
# Check if any train fractions could be associated with the provided events
- if(!length(trainFractions)) {
+ if (!length(trainFractions)) {
# If not, fall back on default train fractions
trainFractions <- c(0.25, 0.50, 0.75)
}
-
+
return(trainFractions)
}
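+# Worked sketch of the computation above (hypothetical numbers): with
+# 100,000 rows and a 5% event rate, trainEvents = 1000 requires
+# 1000 / 0.05 = 20,000 rows, i.e. a train fraction of 0.2, which is kept
+# only if 0.2 <= 1 - splitSettings$test.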
-learningCurveHelper <- function(result, trainFractions){
-
+learningCurveHelper <- function(result, trainFractions) {
executeTime <- result$executionSummary$TotalExecutionElapsedTime
- nPredictors <- result$model$covariateImportance %>% dplyr::filter(.data$covariateValue != 0) %>% dplyr::tally() %>% dplyr::pull()
-
+ nPredictors <- result$model$covariateImportance %>%
+ dplyr::filter(.data$covariateValue != 0) %>%
+ dplyr::tally() %>%
+ dplyr::pull()
+
# evaluationStatistics is a data.frame with columns 'evaluation','metric','value'
result <- result$performanceEvaluation$evaluationStatistics
-
- result$name <- paste(result$evaluation, result$metric, sep='_')
-
+
+ result$name <- paste(result$evaluation, result$metric, sep = "_")
+
result <- result %>% dplyr::select("name", "value")
-
+
result <- rbind(
- c('executionTime', executeTime),
- result,
- c('nPredictors', nPredictors)
+ c("executionTime", executeTime),
+ result,
+ c("nPredictors", nPredictors)
)
-
- result$trainFraction <- trainFractions * 100
-
+
+ result$trainFraction <- trainFractions * 100
+
return(result)
}
@@ -340,119 +336,117 @@ learningCurveHelper <- function(result, trainFractions){
#' @param plotTitle Title of the learning curve plot.
#' @param plotSubtitle Subtitle of the learning curve plot.
#' @param fileName Filename of plot to be saved, for example \code{'plot.png'}.
-#' See the function \code{ggsave} in the ggplot2 package for supported file
+#' See the function \code{ggsave} in the ggplot2 package for supported file
#' formats.
#'
#' @return
-#' A ggplot object. Use the \code{\link[ggplot2]{ggsave}} function to save to
+#' A ggplot object. Use the \code{\link[ggplot2]{ggsave}} function to save to
#' file in a different format.
-#'
+#'
#' @examples
#' \dontrun{
#' # create learning curve object
-#' learningCurve <- createLearningCurve(population,
-#' plpData,
-#' modelSettings)
+#' learningCurve <- createLearningCurve(
+#' population,
+#' plpData,
+#' modelSettings
+#' )
#' # plot the learning curve
#' plotLearningCurve(learningCurve)
#' }
-#'
+#'
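+#' # Sketch of the other options: metric may be "AUROC", "AUPRC" or "sBrier",
+#' # and abscissa "events" or "observations", e.g.
+#' # plotLearningCurve(learningCurve, metric = "AUPRC", abscissa = "observations")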
#' @export
-plotLearningCurve <- function(learningCurve,
- metric = "AUROC",
- abscissa = "events",
- plotTitle = "Learning Curve",
- plotSubtitle = NULL,
- fileName = NULL){
-
+plotLearningCurve <- function(
+ learningCurve,
+ metric = "AUROC",
+ abscissa = "events",
+ plotTitle = "Learning Curve",
+ plotSubtitle = NULL,
+ fileName = NULL) {
+ rlang::check_installed("ggplot2",
+ reason = "plotLearningCurve requires the ggplot2 package to be installed.")
tidyLearningCurve <- NULL
yAxisRange <- NULL
y <- NULL
-
+
learningCurve <- as.data.frame(learningCurve)
-
+
# check for performance metric to plot
- if(metric == "AUROC") {
+ if (metric == "AUROC") {
-    # create a data.frame with evalautionType, AUROC
+    # create a data.frame with evaluationType, AUROC
- tidyLearningCurve <- learningCurve %>%
+ tidyLearningCurve <- learningCurve %>%
dplyr::rename(
- Occurrences = "Train_outcomeCount",
- Observations = "Train_populationSize" ) %>%
+ Occurrences = "Train_outcomeCount",
+ Observations = "Train_populationSize"
+ ) %>%
dplyr::select("trainFraction", "Occurrences", "Observations", "Test_AUROC", "Train_AUROC")
-
- for(i in 1:ncol(tidyLearningCurve)){
- tidyLearningCurve[,i] <- as.double(as.character(tidyLearningCurve[,i]))
+
+ for (i in 1:ncol(tidyLearningCurve)) {
+ tidyLearningCurve[, i] <- as.double(as.character(tidyLearningCurve[, i]))
}
-
+
tidyLearningCurve <- tidyr::pivot_longer(
data = as.data.frame(tidyLearningCurve),
- cols = colnames(as.data.frame(tidyLearningCurve))[!colnames(as.data.frame(tidyLearningCurve)) %in% c('trainFraction', 'Occurrences', 'Observations')],
- values_to = "value",
- names_to = 'variable'
+ cols = colnames(as.data.frame(tidyLearningCurve))[!colnames(as.data.frame(tidyLearningCurve)) %in% c("trainFraction", "Occurrences", "Observations")],
+ values_to = "value",
+ names_to = "variable"
)
-
- #tidyLearningCurve <- reshape2::melt(as.data.frame(tidyLearningCurve), id.vars = c('trainFraction', 'Occurrences', 'Observations'))
-
- tidyLearningCurve$Dataset <- sapply(tidyLearningCurve$variable, function(x)strsplit(as.character(x), '_')[[1]][1])
-
+
+ tidyLearningCurve$Dataset <- sapply(tidyLearningCurve$variable, function(x) strsplit(as.character(x), "_")[[1]][1])
+
# define plot properties
yAxisRange <- c(0.5, 1.0)
-
} else if (metric == "AUPRC") {
# tidy up dataframe
- tidyLearningCurve <- learningCurve %>%
+ tidyLearningCurve <- learningCurve %>%
dplyr::rename(
- Occurrences = "Train_outcomeCount",
- Observations = "Train_populationSize" ) %>%
+ Occurrences = "Train_outcomeCount",
+ Observations = "Train_populationSize"
+ ) %>%
dplyr::select("trainFraction", "Occurrences", "Observations", "Test_AUPRC", "Train_AUPRC")
-
- for(i in 1:ncol(tidyLearningCurve)){
- tidyLearningCurve[,i] <- as.double(as.character(tidyLearningCurve[,i]))
+
+ for (i in 1:ncol(tidyLearningCurve)) {
+ tidyLearningCurve[, i] <- as.double(as.character(tidyLearningCurve[, i]))
}
tidyLearningCurve <- tidyr::pivot_longer(
data = as.data.frame(tidyLearningCurve),
- cols = colnames(as.data.frame(tidyLearningCurve))[!colnames(as.data.frame(tidyLearningCurve)) %in% c('trainFraction', 'Occurrences', 'Observations')],
- values_to = "value",
- names_to = 'variable'
+ cols = colnames(as.data.frame(tidyLearningCurve))[!colnames(as.data.frame(tidyLearningCurve)) %in% c("trainFraction", "Occurrences", "Observations")],
+ values_to = "value",
+ names_to = "variable"
)
- #tidyLearningCurve <- reshape2::melt(as.data.frame(tidyLearningCurve), id.vars = c('trainFraction', 'Occurrences', 'Observations'))
-
- tidyLearningCurve$Dataset <- sapply(tidyLearningCurve$variable, function(x)strsplit(as.character(x), '_')[[1]][1])
-
+ tidyLearningCurve$Dataset <- sapply(tidyLearningCurve$variable, function(x) strsplit(as.character(x), "_")[[1]][1])
+
# define plot properties
yAxisRange <- c(0.0, 1.0)
-
} else if (metric == "sBrier") {
# tidy up dataframe
- tidyLearningCurve <- learningCurve %>%
+ tidyLearningCurve <- learningCurve %>%
dplyr::rename(
- Occurrences = "Train_outcomeCount",
- Observations = "Train_populationSize" ) %>%
+ Occurrences = "Train_outcomeCount",
+ Observations = "Train_populationSize"
+ ) %>%
dplyr::select("trainFraction", "Occurrences", "Observations", "Test_brier score scaled", "Train_brier score scaled")
-
- for(i in 1:ncol(tidyLearningCurve)){
- tidyLearningCurve[,i] <- as.double(as.character(tidyLearningCurve[,i]))
+
+ for (i in 1:ncol(tidyLearningCurve)) {
+ tidyLearningCurve[, i] <- as.double(as.character(tidyLearningCurve[, i]))
}
-
+
tidyLearningCurve <- tidyr::pivot_longer(
data = as.data.frame(tidyLearningCurve),
- cols = colnames(as.data.frame(tidyLearningCurve))[!colnames(as.data.frame(tidyLearningCurve)) %in% c('trainFraction', 'Occurrences', 'Observations')],
- values_to = "value",
- names_to = 'variable'
+ cols = colnames(as.data.frame(tidyLearningCurve))[!colnames(as.data.frame(tidyLearningCurve)) %in% c("trainFraction", "Occurrences", "Observations")],
+ values_to = "value",
+ names_to = "variable"
)
- #tidyLearningCurve <- reshape2::melt(as.data.frame(tidyLearningCurve), id.vars = c('trainFraction', 'Occurrences', 'Observations'))
-
- tidyLearningCurve$Dataset <- sapply(tidyLearningCurve$variable, function(x)strsplit(as.character(x), '_')[[1]][1])
-
-
+ tidyLearningCurve$Dataset <- sapply(tidyLearningCurve$variable, function(x) strsplit(as.character(x), "_")[[1]][1])
+
+
# define plot properties
yAxisRange <- c(0.0, 1.0)
-
} else {
stop("An incorrect metric has been specified.")
}
-
+
if (abscissa == "observations") {
abscissa <- "Observations"
abscissaLabel <- "No. of observations"
@@ -462,21 +456,25 @@ plotLearningCurve <- function(learningCurve,
} else {
stop("An incorrect abscissa has been specified.")
}
-
+
# create plot object
plot <- tidyLearningCurve %>%
- ggplot2::ggplot(ggplot2::aes(x = .data[[abscissa]], y = .data[['value']],
- col = .data[["Dataset"]])) +
+ ggplot2::ggplot(ggplot2::aes(
+ x = .data[[abscissa]], y = .data[["value"]],
+ col = .data[["Dataset"]]
+ )) +
ggplot2::geom_line() +
ggplot2::coord_cartesian(ylim = yAxisRange, expand = FALSE) +
- ggplot2::labs(title = plotTitle, subtitle = plotSubtitle,
- x = abscissaLabel, y = metric) +
+ ggplot2::labs(
+ title = plotTitle, subtitle = plotSubtitle,
+ x = abscissaLabel, y = metric
+ ) +
ggplot2::theme_light()
-
+
-  # save plot, if fucntion call provides a file name
+  # save plot, if function call provides a file name
- if ((!is.null(fileName)) & (is.character(fileName))) {
+ if ((!is.null(fileName)) && (is.character(fileName))) {
ggplot2::ggsave(fileName, plot, width = 5, height = 4.5, dpi = 400)
}
-
+
return(plot)
}
diff --git a/R/LightGBM.R b/R/LightGBM.R
index 69f1b6c6e..5d2e165a2 100644
--- a/R/LightGBM.R
+++ b/R/LightGBM.R
@@ -17,9 +17,9 @@
#' Create setting for gradient boosting machine model using lightGBM (https://github.com/microsoft/LightGBM/tree/master/R-package).
#'
-#' @param nthread The number of computer threads to use (how many cores do you have?)
+#' @param nthread The number of computer threads to use (how many cores do you have?)
#' @param earlyStopRound If the performance does not increase over earlyStopRound number of trees then training stops (this prevents overfitting)
-#' @param numIterations Number of boosting iterations.
+#' @param numIterations Number of boosting iterations.
#' @param numLeaves This hyperparameter sets the maximum number of leaves. Increasing this parameter can lead to higher model complexity and potential overfitting.
-#' @param maxDepth  This hyperparameter sets the maximum depth . Increasing this parameter can also lead to higher model complexity and potential overfitting.
+#' @param maxDepth  This hyperparameter sets the maximum depth. Increasing this parameter can also lead to higher model complexity and potential overfitting.
#' @param minDataInLeaf This hyperparameter sets the minimum number of data points that must be present in a leaf node. Increasing this parameter can help to reduce overfitting
@@ -27,18 +27,18 @@
#' @param lambdaL1 This hyperparameter controls L1 regularization, which can help to reduce overfitting by encouraging sparse models.
#' @param lambdaL2 This hyperparameter controls L2 regularization, which can also help to reduce overfitting by discouraging large weights in the model.
#' @param scalePosWeight Controls weight of positive class in loss - useful for imbalanced classes
-#' @param isUnbalance This parameter cannot be used at the same time with scalePosWeight, choose only one of them. While enabling this should increase the overall performance metric of your model, it will also result in poor estimates of the individual class probabilities.
+#' @param isUnbalance  This parameter cannot be used at the same time as scalePosWeight; choose only one of them. While enabling this should increase the overall performance metric of your model, it will also result in poor estimates of the individual class probabilities.
#' @param seed An option to add a seed when training the final model
#'
#' @examples
#' model.lightgbm <- setLightGBM(
-#' numLeaves = c(20, 31, 50), maxDepth = c(-1, 5, 10),
-#' minDataInLeaf = c(10, 20, 30), learningRate = c(0.05, 0.1, 0.3)
+#' numLeaves = c(20, 31, 50), maxDepth = c(-1, 5, 10),
+#' minDataInLeaf = c(10, 20, 30), learningRate = c(0.05, 0.1, 0.3)
#' )
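+#' # the grids above are expanded to their Cartesian product, giving
+#' # 3 x 3 x 3 x 3 = 81 candidate hyperparameter combinations in this example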
#'
#' @export
-setLightGBM <- function(nthread = 20,
- earlyStopRound = 25,
+setLightGBM <- function(nthread = 20,
+ earlyStopRound = 25,
numIterations = c(100),
numLeaves = c(31),
maxDepth = c(5, 10),
@@ -49,40 +49,40 @@ setLightGBM <- function(nthread = 20,
scalePosWeight = 1,
isUnbalance = FALSE,
seed = sample(10000000, 1)) {
- ensure_installed("lightgbm")
+ rlang::check_installed("lightgbm")
checkIsClass(seed, c("numeric", "integer"))
-
+
if (length(nthread) > 1) {
stop("nthread must be length 1")
}
if (!inherits(x = seed, what = c("numeric", "integer"))) {
stop("Invalid seed")
}
- if(sum(numIterations < 1) > 0){
- stop('numIterations must be greater that 0')
+ if (sum(numIterations < 1) > 0) {
+ stop("numIterations must be greater that 0")
}
- if(sum(numLeaves < 2) > 0){
- stop('numLeaves must be greater that 1')
+ if (sum(numLeaves < 2) > 0) {
+ stop("numLeaves must be greater that 1")
}
- if(sum(numLeaves > 131072) > 0){
- stop('numLeaves must be less that or equal 131072')
+ if (sum(numLeaves > 131072) > 0) {
+ stop("numLeaves must be less that or equal 131072")
}
- if(sum(learningRate <= 0) > 0){
- stop('learningRate must be greater that 0')
+ if (sum(learningRate <= 0) > 0) {
+ stop("learningRate must be greater that 0")
}
- if (sum(lambdaL1 < 0) > 0){
- stop('lambdaL1 must be 0 or greater')
+ if (sum(lambdaL1 < 0) > 0) {
+ stop("lambdaL1 must be 0 or greater")
}
- if (sum(lambdaL2 < 0) > 0){
- stop('lambdaL2 must be 0 or greater')
+ if (sum(lambdaL2 < 0) > 0) {
+ stop("lambdaL2 must be 0 or greater")
}
- if (sum(scalePosWeight < 0) > 0){
- stop('scalePosWeight must be 0 or greater')
+ if (sum(scalePosWeight < 0) > 0) {
+ stop("scalePosWeight must be 0 or greater")
}
- if (isUnbalance == TRUE & sum(scalePosWeight != 1) > 0){
- stop('isUnbalance cannot be used at the same time with scale_pos_weight != 1, choose only one of them')
+ if (isUnbalance == TRUE && sum(scalePosWeight != 1) > 0) {
+ stop("isUnbalance cannot be used at the same time with scale_pos_weight != 1, choose only one of them")
}
-
+
paramGrid <- list(
earlyStopRound = earlyStopRound,
numIterations = numIterations,
@@ -95,9 +95,9 @@ setLightGBM <- function(nthread = 20,
isUnbalance = isUnbalance,
scalePosWeight = scalePosWeight
)
-
+
param <- listCartesian(paramGrid)
-
+
attr(param, "settings") <- list(
modelType = "LightGBM",
seed = seed[[1]],
@@ -107,16 +107,16 @@ setLightGBM <- function(nthread = 20,
trainRFunction = "fitLightGBM",
predictRFunction = "predictLightGBM"
)
-
+
attr(param, "saveType") <- "lightgbm"
-
+
result <- list(
fitFunction = "fitRclassifier",
param = param
)
-
+
class(result) <- "modelSettings"
-
+
return(result)
}
@@ -124,18 +124,18 @@ setLightGBM <- function(nthread = 20,
varImpLightGBM <- function(model,
covariateMap) {
- varImp <- lightgbm::lgb.importance(model, percentage = T) %>% dplyr::select("Feature", "Gain")
-
+ varImp <- lightgbm::lgb.importance(model, percentage = TRUE) %>% dplyr::select("Feature", "Gain")
+
varImp <- data.frame(
- covariateId = gsub(".*_","",varImp$Feature),
+ covariateId = gsub(".*_", "", varImp$Feature),
covariateValue = varImp$Gain,
included = 1
)
-
+
varImp <- merge(covariateMap, varImp, by.x = "columnId", by.y = "covariateId")
varImp <- varImp %>%
dplyr::select("covariateId", "covariateValue", "included")
-
+
return(varImp)
}
@@ -150,31 +150,31 @@ predictLightGBM <- function(plpModel,
map = plpModel$covariateImportance %>%
dplyr::select("columnId", "covariateId")
)
-
+
# use the include??
-
+
newData <- matrixObjects$dataMatrix
cohort <- matrixObjects$labels
} else {
newData <- data
}
-
+
if (inherits(plpModel, "plpModel")) {
model <- plpModel$model
} else {
model <- plpModel
}
-
+
pred <- data.frame(value = stats::predict(model, newData))
prediction <- cohort
prediction$value <- pred$value
-
+
prediction <- prediction %>%
dplyr::select(-"rowId") %>%
dplyr::rename(rowId = "originalRowId")
-
+
attr(prediction, "metaData") <- list(modelType = attr(plpModel, "modelType"))
-
+
return(prediction)
}
@@ -185,11 +185,11 @@ fitLightGBM <- function(dataMatrix,
if (!is.null(hyperParameters$earlyStopRound)) {
trainInd <- sample(nrow(dataMatrix), nrow(dataMatrix) * 0.9)
train <- lightgbm::lgb.Dataset(
- data = dataMatrix[trainInd, , drop = F],
+ data = dataMatrix[trainInd, , drop = FALSE],
label = labels$outcomeCount[trainInd]
)
test <- lightgbm::lgb.Dataset(
- data = dataMatrix[-trainInd, , drop = F],
+ data = dataMatrix[-trainInd, , drop = FALSE],
label = labels$outcomeCount[-trainInd]
)
watchlist <- list(train = train, test = test)
@@ -201,10 +201,7 @@ fitLightGBM <- function(dataMatrix,
)
watchlist <- list()
}
-
- outcomes <- sum(labels$outcomeCount > 0)
- N <- nrow(labels)
- outcomeProportion <- outcomes / N
+
set.seed(settings$seed)
model <- lightgbm::lgb.train(
data = train,
@@ -216,7 +213,7 @@ fitLightGBM <- function(dataMatrix,
num_leaves = hyperParameters$numLeaves,
max_depth = hyperParameters$maxDepth,
learning_rate = hyperParameters$learningRate,
- feature_pre_filter=FALSE,
+ feature_pre_filter = FALSE,
min_data_in_leaf = hyperParameters$minDataInLeaf,
scale_pos_weight = hyperParameters$scalePosWeight,
lambda_l1 = hyperParameters$lambdaL1,
@@ -229,8 +226,7 @@ fitLightGBM <- function(dataMatrix,
verbose = 1,
early_stopping_rounds = hyperParameters$earlyStopRound,
valids = watchlist
- # categorical_feature = 'auto' # future work
)
-
+
return(model)
}
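+# Usage sketch: setLightGBM() returns a modelSettings object whose fitFunction
+# ("fitRclassifier") dispatches to fitLightGBM/predictLightGBM internally.
+# Hypothetical call:
+# results <- runPlp(plpData = plpData, outcomeId = 3,
+#   modelSettings = setLightGBM(nthread = 4), saveDirectory = "./output")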
diff --git a/R/ParamChecks.R b/R/ParamChecks.R
index 542e7e803..be424c42c 100644
--- a/R/ParamChecks.R
+++ b/R/ParamChecks.R
@@ -19,73 +19,92 @@
# Functions used to check parameter values
checkBoolean <- function(parameter) {
- name = deparse(substitute(parameter))
+ name <- deparse(substitute(parameter))
if (!is.logical(parameter)) {
- ParallelLogger::logError(paste0(name, ' needs to be a boolean'))
- stop(paste0(name, ' not defined correctly'))
+ ParallelLogger::logError(paste0(name, " needs to be a boolean"))
+ stop(paste0(name, " not defined correctly"))
}
return(TRUE)
-}
+}
-checkHigherEqual <- function(parameter,value) {
- name = deparse(substitute(parameter))
-  if (!is.numeric(parameter) | parameter<value) {
-    ParallelLogger::logError(paste0(name, ' needs to be >= ',value))
- stop(paste0(name, ' needs to be >= ', value))
+checkHigherEqual <- function(parameter, value) {
+ name <- deparse(substitute(parameter))
+ if (!is.numeric(parameter) || any(parameter < value)) {
+ ParallelLogger::logError(paste0(name, " needs to be >= ", value))
+ stop(paste0(name, " needs to be >= ", value))
}
return(TRUE)
-}
+}
-checkLowerEqual <- function(parameter,value) {
- name = deparse(substitute(parameter))
- if (!is.numeric(parameter) | parameter>value) {
- ParallelLogger::logError(paste0(name, ' needs to be <= ',value))
- stop(paste0(name, ' needs to be <= ', value))
+checkLowerEqual <- function(parameter, value) {
+ name <- deparse(substitute(parameter))
+ if (!is.numeric(parameter) || any(parameter > value)) {
+ ParallelLogger::logError(paste0(name, " needs to be <= ", value))
+ stop(paste0(name, " needs to be <= ", value))
}
return(TRUE)
-}
+}
-checkHigher <- function(parameter,value) {
- name = deparse(substitute(parameter))
- if (!is.numeric(parameter) | parameter<=value) {
- ParallelLogger::logError(paste0(name, ' needs to be > ',value))
- stop(paste0(name, ' needs to be > ', value))
+checkHigher <- function(parameter, value) {
+ name <- deparse(substitute(parameter))
+ if (!is.numeric(parameter) || any(parameter <= value)) {
+ ParallelLogger::logError(paste0(name, " needs to be > ", value))
+ stop(paste0(name, " needs to be > ", value))
}
return(TRUE)
}
-checkLower <- function(parameter,value) {
- name = deparse(substitute(parameter))
- if (!is.numeric(parameter) | parameter>=value) {
- ParallelLogger::logError(paste0(name, ' needs to be < ',value))
- stop(paste0(name, ' needs to be < ', value))
+checkLower <- function(parameter, value) {
+ name <- deparse(substitute(parameter))
+ if (!is.numeric(parameter) || any(parameter >= value)) {
+ ParallelLogger::logError(paste0(name, " needs to be < ", value))
+ stop(paste0(name, " needs to be < ", value))
}
return(TRUE)
}
checkNotNull <- function(parameter) {
- name = deparse(substitute(parameter))
+ name <- deparse(substitute(parameter))
if (is.null(parameter)) {
- ParallelLogger::logError(paste0(name, ' cannot be empty'))
- stop(paste0(name, ' cannot be empty'))
+ ParallelLogger::logError(paste0(name, " cannot be empty"))
+ stop(paste0(name, " cannot be empty"))
}
return(TRUE)
}
-checkIsClass<- function(parameter,classes) {
- name = deparse(substitute(parameter))
+checkIsClass <- function(parameter, classes) {
+ name <- deparse(substitute(parameter))
if (!inherits(x = parameter, what = classes)) {
- ParallelLogger::logError(paste0(name, ' should be of class:', classes))
- stop(paste0(name, ' is wrong class'))
+ ParallelLogger::logError(paste0(name, " should be of class:", classes))
+ stop(paste0(name, " is wrong class"))
+ }
+ return(TRUE)
+}
+
+checkInStringVector <- function(parameter, values) {
+ name <- deparse(substitute(parameter))
+ if (!parameter %in% values) {
+ ParallelLogger::logError(paste0(name, " should be ", paste0(as.character(values), collapse = "or ")))
+ stop(paste0(name, " has incorrect value"))
+ }
+ return(TRUE)
+}
+
+# check column names of dataframe
+checkColumnNames <- function(parameter, columnNames) {
+ name <- deparse(substitute(parameter))
+ if (!all(columnNames %in% names(parameter))) {
+ ParallelLogger::logError(paste0("Column names of ", name, " are not correct"))
+ stop(paste0("Column names of ", name, " are not correct"))
}
return(TRUE)
}
-checkInStringVector<- function(parameter,values) {
- name = deparse(substitute(parameter))
- if (!parameter%in%values) {
- ParallelLogger::logError(paste0(name, ' should be ', paste0(as.character(values), collapse="or ")))
- stop(paste0(name, ' has incorrect value'))
+checkIsEqual <- function(parameter, value) {
+ name <- deparse(substitute(parameter))
+ if (!identical(parameter, value)) {
+ ParallelLogger::logError(paste0(name, " should be equal to ", value))
+ stop(paste0(name, " is not equal to ", value))
}
return(TRUE)
}
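+# Usage sketch for the checkers above (hypothetical arguments):
+# checkHigherEqual(k, 1)                           # all values of k must be >= 1
+# checkIsClass(k, c("numeric", "integer"))
+# checkInStringVector(metric, c("AUROC", "AUPRC"))
+# checkColumnNames(df, c("rowId", "outcomeCount"))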
diff --git a/R/PatientLevelPrediction.R b/R/PatientLevelPrediction.R
index 848c42705..e16026f62 100644
--- a/R/PatientLevelPrediction.R
+++ b/R/PatientLevelPrediction.R
@@ -42,4 +42,3 @@
#' @usage
#' data(plpDataSimulationProfile)
NULL
-
diff --git a/R/Plotting.R b/R/Plotting.R
index e3cadcda1..078ca5961 100644
--- a/R/Plotting.R
+++ b/R/Plotting.R
@@ -23,88 +23,86 @@
#' This creates a survival plot that can be used to pick a suitable time-at-risk period
#'
#' @param plpData The plpData object returned by running getPlpData()
-#' @param outcomeId The cohort id corresponding to the outcome
+#' @param outcomeId The cohort id corresponding to the outcome
#' @param populationSettings The population settings created using \code{createStudyPopulationSettings}
#' @param riskTable (binary) Whether to include a table at the bottom of the plot showing the number of people at risk over time
#' @param confInt (binary) Whether to include a confidence interval
-#' @param yLabel (string) The label for the y-axis
+#' @param yLabel (string) The label for the y-axis
#'
#' @return
-#' TRUE if it ran
+#' TRUE if it ran
#'
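+#' @examples
+#' \dontrun{
+#' # sketch with hypothetical data: survival plot for outcome cohort 3
+#' outcomeSurvivalPlot(plpData = plpData, outcomeId = 3)
+#' }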
#' @export
outcomeSurvivalPlot <- function(
- plpData,
- outcomeId,
- populationSettings = createStudyPopulationSettings(
- binary = T,
- includeAllOutcomes = T,
- firstExposureOnly = FALSE,
- washoutPeriod = 0,
- removeSubjectsWithPriorOutcome = TRUE,
- priorOutcomeLookback = 99999,
- requireTimeAtRisk = F,
- riskWindowStart = 1,
- startAnchor = 'cohort start',
- riskWindowEnd = 3650,
- endAnchor = "cohort start"
- ),
- riskTable = T,
- confInt= T,
- yLabel = 'Fraction of those who are outcome free in target population'
-)
-{
-
- ensure_installed("survminer")
- if(missing(plpData)){
- stop('plpData missing')
+ plpData,
+ outcomeId,
+ populationSettings = createStudyPopulationSettings(
+ binary = TRUE,
+ includeAllOutcomes = TRUE,
+ firstExposureOnly = FALSE,
+ washoutPeriod = 0,
+ removeSubjectsWithPriorOutcome = TRUE,
+ priorOutcomeLookback = 99999,
+ requireTimeAtRisk = FALSE,
+ riskWindowStart = 1,
+ startAnchor = "cohort start",
+ riskWindowEnd = 3650,
+ endAnchor = "cohort start"
+ ),
+ riskTable = TRUE,
+ confInt = TRUE,
+ yLabel = "Fraction of those who are outcome free in target population") {
+ rlang::check_installed(c("survival", "survminer"))
+ if (missing(plpData)) {
+ stop("plpData missing")
}
- if(missing(outcomeId)){
- stop('outcomeId missing')
+ if (missing(outcomeId)) {
+ stop("outcomeId missing")
}
- if(!inherits(x = plpData, what = 'plpData')){
- stop('Incorrect plpData object')
+ if (!inherits(x = plpData, what = "plpData")) {
+ stop("Incorrect plpData object")
}
- if(!outcomeId%in%unique(plpData$outcomes$outcomeId)){
- stop('outcome id not in data')
+ if (!outcomeId %in% unique(plpData$outcomes$outcomeId)) {
+ stop("outcome id not in data")
}
-
- populationSettings$plpData <- plpData
+
+ populationSettings$plpData <- plpData
populationSettings$outcomeId <- outcomeId
-
+
population <- do.call(
- what = 'createStudyPopulation',
+ what = "createStudyPopulation",
args = list(
- plpData = plpData,
- outcomeId = outcomeId,
+ plpData = plpData,
+ outcomeId = outcomeId,
populationSettings = populationSettings
- )
)
-
- population$daysToEvent[is.na(population$daysToEvent)] <- population$survivalTime[is.na(population$daysToEvent)]
+ )
+
+ population$daysToEvent[is.na(population$daysToEvent)] <-
+ population$survivalTime[is.na(population$daysToEvent)]
survivalFit <- survival::survfit(
- survival::Surv(daysToEvent, outcomeCount)~targetId,
- #riskDecile,
- population,
+ survival::Surv(daysToEvent, outcomeCount) ~ targetId,
+ # riskDecile,
+ population,
conf.int = TRUE
)
-
- if(!is.null(survivalFit$surv)){
+
+ if (!is.null(survivalFit$surv)) {
yliml <- min(survivalFit$surv)
- } else{
+ } else {
yliml <- 0.5
}
-
+
result <- survminer::ggsurvplot(
- fit=survivalFit,
+ fit = survivalFit,
data = population,
risk.table = riskTable,
- pval = F,
- xlim = c(0,populationSettings$riskWindowEnd),
- ylim = c(yliml*0.95,1),
+ pval = FALSE,
+ xlim = c(0, populationSettings$riskWindowEnd),
+ ylim = c(yliml * 0.95, 1),
conf.int = confInt,
ggtheme = ggplot2::theme_minimal(),
- risk.table.y.text.col = T,
+ risk.table.y.text.col = TRUE,
risk.table.y.text = FALSE,
ylab = yLabel
)
@@ -119,149 +117,208 @@ outcomeSurvivalPlot <- function(
#'
#' @param plpResult Object returned by the runPlp() function
#' @param saveLocation Name of the directory where the plots should be saved (NULL means no saving)
-#' @param typeColumn The name of the column specifying the evaluation type
-#' (to stratify the plots)
+#' @param typeColumn The name of the column specifying the evaluation type
+#' (to stratify the plots)
#'
#' @return
-#' TRUE if it ran
+#' TRUE if it ran
#'
#' @export
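+#' @examples
+#' # Hedged sketch (not part of this changeset): generates every supported
+#' # plot for a plpResult assumed to come from an earlier runPlp() call.
+#' \dontrun{
+#' plotPlp(plpResult, saveLocation = file.path(tempdir(), "plots"))
+#' }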
plotPlp <- function(
- plpResult,
- saveLocation = NULL,
- typeColumn = 'evaluation'
- ){
-
+ plpResult,
+ saveLocation = NULL,
+ typeColumn = "evaluation") {
# check inputs
- if(!is.null(saveLocation)){
- if(!dir.exists(saveLocation)){
- dir.create(saveLocation, recursive =T)
+ if (!is.null(saveLocation)) {
+ if (!dir.exists(saveLocation)) {
+ dir.create(saveLocation, recursive = TRUE)
}
}
-
+ rlang::check_installed(
+ pkg = c("ggplot2", "gridExtra"),
+ reason = "These packages are required for plotting"
+ )
+
# run each of the plots:
- tryCatch({
- plotSparseRoc(
- plpResult = plpResult,
- typeColumn = typeColumn,
- saveLocation = saveLocation,
- fileName = 'sparseROC.pdf'
- )},
- error = function(e){ParallelLogger::logError('Issue with plotSparseRoc')})
-
- tryCatch({
- plotPredictedPDF(
- plpResult,
- saveLocation = saveLocation,
- fileName = 'predictedPDF.pdf',
- typeColumn = typeColumn)
- },
- error = function(e){ParallelLogger::logError('Issue with plotPredictedPDF')})
-
- tryCatch({
- plotPreferencePDF(
- plpResult,
- saveLocation = saveLocation,
- fileName = 'preferencePDF.pdf',
- typeColumn = typeColumn)
- },
- error = function(e){ParallelLogger::logError('Issue with plotPreferencePDF')})
-
- tryCatch({
- plotPrecisionRecall(
- plpResult,
- saveLocation = saveLocation,
- fileName = 'precisionRecall.pdf',
- typeColumn = typeColumn)
- },
- error = function(e){ParallelLogger::logError('Issue with plotPrecisionRecall')})
-
- tryCatch({
- plotF1Measure(
- plpResult,
- saveLocation = saveLocation,
- fileName = 'f1Measure.pdf',
- typeColumn = typeColumn)
- },
- error = function(e){ParallelLogger::logError('Issue with plotF1Measure')})
-
- tryCatch({
- plotDemographicSummary(
- plpResult,
- saveLocation = saveLocation,
- fileName = 'demographicSummary.pdf',
- typeColumn = typeColumn)
- },
- error = function(e){ParallelLogger::logError('Issue with plotDemographicSummary')})
-
+ tryCatch(
+ {
+ plotSparseRoc(
+ plpResult = plpResult,
+ typeColumn = typeColumn,
+ saveLocation = saveLocation,
+ fileName = "sparseROC.pdf"
+ )
+ },
+ error = function(e) {
+ ParallelLogger::logError("Issue with plotSparseRoc")
+ }
+ )
+
+ tryCatch(
+ {
+ plotPredictedPDF(
+ plpResult,
+ saveLocation = saveLocation,
+ fileName = "predictedPDF.pdf",
+ typeColumn = typeColumn
+ )
+ },
+ error = function(e) {
+ ParallelLogger::logError("Issue with plotPredictedPDF")
+ }
+ )
+
+ tryCatch(
+ {
+ plotPreferencePDF(
+ plpResult,
+ saveLocation = saveLocation,
+ fileName = "preferencePDF.pdf",
+ typeColumn = typeColumn
+ )
+ },
+ error = function(e) {
+ ParallelLogger::logError("Issue with plotPreferencePDF")
+ }
+ )
+
+ tryCatch(
+ {
+ plotPrecisionRecall(
+ plpResult,
+ saveLocation = saveLocation,
+ fileName = "precisionRecall.pdf",
+ typeColumn = typeColumn
+ )
+ },
+ error = function(e) {
+ ParallelLogger::logError("Issue with plotPrecisionRecall")
+ }
+ )
+
+ tryCatch(
+ {
+ plotF1Measure(
+ plpResult,
+ saveLocation = saveLocation,
+ fileName = "f1Measure.pdf",
+ typeColumn = typeColumn
+ )
+ },
+ error = function(e) {
+ ParallelLogger::logError("Issue with plotF1Measure")
+ }
+ )
+
+ tryCatch(
+ {
+ plotDemographicSummary(
+ plpResult,
+ saveLocation = saveLocation,
+ fileName = "demographicSummary.pdf",
+ typeColumn = typeColumn
+ )
+ },
+ error = function(e) {
+ ParallelLogger::logError("Issue with plotDemographicSummary")
+ }
+ )
+
# add smooth calibration
- tryCatch({
- plotSmoothCalibration(
- plpResult = plpResult,
- smooth = 'loess',
- typeColumn = typeColumn,
- saveLocation = saveLocation,
- fileName = 'smoothCalibration.pdf'
+ tryCatch(
+ {
+ plotSmoothCalibration(
+ plpResult = plpResult,
+ smooth = "loess",
+ typeColumn = typeColumn,
+ saveLocation = saveLocation,
+ fileName = "smoothCalibration.pdf"
+ )
+ },
+ error = function(e) {
+ ParallelLogger::logError("Issue with plotSmoothCalibration")
+ }
)
- },
- error = function(e){ParallelLogger::logError('Issue with plotSmoothCalibration')})
-
- tryCatch({
- plotSparseCalibration(
- plpResult,
- saveLocation = saveLocation,
- fileName = 'sparseCalibration.pdf',
- typeColumn = typeColumn)
- },
- error = function(e){ParallelLogger::logError('Issue with plotSparseCalibration')})
-
- tryCatch({
- plotSparseCalibration2(
- plpResult,
- saveLocation = saveLocation,
- fileName = 'sparseCalibrationConventional.pdf',
- typeColumn = typeColumn)
- },
- error = function(e){ParallelLogger::logError('Issue with plotSparseCalibration2')})
-
- tryCatch({
- plotPredictionDistribution(
- plpResult,
- saveLocation = saveLocation,
- fileName = 'predictionDistribution.pdf',
- typeColumn = typeColumn)
- },
- error = function(e){ParallelLogger::logError('Issue with plotPredictionDistribution')})
-
- tryCatch({
- plotVariableScatterplot(
- plpResult$covariateSummary,
- saveLocation = saveLocation,
- fileName = 'variableScatterplot.pdf'
- )
- },
- error = function(e){ParallelLogger::logError('Issue with plotVariableScatterplot')})
-
-
- if(sum(c('TrainWithNoOutcome_CovariateMean', 'TestWithNoOutcome_CovariateMean') %in% colnames(plpResult$covariateSummary))==2){
- tryCatch({
- plotGeneralizability(
- plpResult$covariateSummary,
- saveLocation = saveLocation,
- fileName = 'generalizability.pdf'
+
+ tryCatch(
+ {
+ plotSparseCalibration(
+ plpResult,
+ saveLocation = saveLocation,
+ fileName = "sparseCalibration.pdf",
+ typeColumn = typeColumn
+ )
+ },
+ error = function(e) {
+ ParallelLogger::logError("Issue with plotSparseCalibration")
+ }
+ )
+
+ tryCatch(
+ {
+ plotSparseCalibration2(
+ plpResult,
+ saveLocation = saveLocation,
+ fileName = "sparseCalibrationConventional.pdf",
+ typeColumn = typeColumn
+ )
+ },
+ error = function(e) {
+ ParallelLogger::logError("Issue with plotSparseCalibration2")
+ }
+ )
+
+ tryCatch(
+ {
+ plotPredictionDistribution(
+ plpResult,
+ saveLocation = saveLocation,
+ fileName = "predictionDistribution.pdf",
+ typeColumn = typeColumn
+ )
+ },
+ error = function(e) {
+ ParallelLogger::logError("Issue with plotPredictionDistribution")
+ }
+ )
+
+ tryCatch(
+ {
+ plotVariableScatterplot(
+ plpResult$covariateSummary,
+ saveLocation = saveLocation,
+ fileName = "variableScatterplot.pdf"
)
},
- error = function(e){ParallelLogger::logError('Issue with plotGeneralizability')})
+ error = function(e) {
+ ParallelLogger::logError("Issue with plotVariableScatterplot")
+ }
+ )
+
+
+ if (sum(c("TrainWithNoOutcome_CovariateMean", "TestWithNoOutcome_CovariateMean") %in% colnames(plpResult$covariateSummary)) == 2) {
+ tryCatch(
+ {
+ plotGeneralizability(
+ plpResult$covariateSummary,
+ saveLocation = saveLocation,
+ fileName = "generalizability.pdf"
+ )
+ },
+ error = function(e) {
+ ParallelLogger::logError("Issue with plotGeneralizability")
+ }
+ )
}
-
+
return(invisible(TRUE))
}
-#=============
-# THERSHOLDSUMMARY PLOTS
-#=============
+# =============
+# THRESHOLDSUMMARY PLOTS
+# =============
#' Plot the ROC curve using the sparse thresholdSummary data frame
@@ -282,52 +339,49 @@ plotPlp <- function(
#'
#' @export
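+#' @examples
+#' # Hedged sketch, not part of this changeset: plpResult is assumed to be an
+#' # object created earlier by runPlp().
+#' \dontrun{
+#' plotSparseRoc(plpResult, saveLocation = tempdir(), fileName = "roc.png")
+#' }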
plotSparseRoc <- function(
- plpResult,
- typeColumn = 'evaluation',
- saveLocation = NULL,
- fileName = 'roc.png'
-){
-
- evalTypes <- unique(plpResult$performanceEvaluation$thresholdSummary[,typeColumn])
-
+ plpResult,
+ typeColumn = "evaluation",
+ saveLocation = NULL,
+ fileName = "roc.png") {
+ evalTypes <- unique(plpResult$performanceEvaluation$thresholdSummary[, typeColumn])
+
plots <- list()
length(plots) <- length(evalTypes)
-
- for (i in 1:length(evalTypes)){
+
+ for (i in 1:length(evalTypes)) {
evalType <- evalTypes[i]
- x <- plpResult$performanceEvaluation$thresholdSummary %>%
- dplyr::filter(.data[[typeColumn]] == evalType) %>%
+ x <- plpResult$performanceEvaluation$thresholdSummary %>%
+ dplyr::filter(.data[[typeColumn]] == evalType) %>%
dplyr::select("falsePositiveRate", "sensitivity")
-
- #x <- thresholdSummary[,c('falsePositiveRate','sensitivity')]
- x <- x[order(x$falsePositiveRate, x$sensitivity),]
-
+
+ x <- x[order(x$falsePositiveRate, x$sensitivity), ]
+
# add the bit to get the step
- stepsExtra <- cbind(x[-1,1], x[-nrow(x),2])
- colnames( stepsExtra) <- colnames(x)
- x <- rbind(c(1,1), x, stepsExtra, c(0,0))
- x <- x[order(x$falsePositiveRate, x$sensitivity),]
-
+ stepsExtra <- cbind(x[-1, 1], x[-nrow(x), 2])
+ colnames(stepsExtra) <- colnames(x)
+ x <- rbind(c(1, 1), x, stepsExtra, c(0, 0))
+ x <- x[order(x$falsePositiveRate, x$sensitivity), ]
+
plots[[i]] <- ggplot2::ggplot(
- x,
+ x,
ggplot2::aes(
- .data$falsePositiveRate,
+ .data$falsePositiveRate,
.data$sensitivity
- )
+ )
) +
ggplot2::geom_polygon(fill = "blue", alpha = 0.2) +
ggplot2::geom_line(linewidth = 1) +
- ggplot2::geom_abline(intercept = 0, slope = 1,linetype = 2) +
- ggplot2::scale_x_continuous("1 - specificity", limits = c(0,1)) +
- ggplot2::scale_y_continuous("Sensitivity", limits = c(0,1)) +
+ ggplot2::geom_abline(intercept = 0, slope = 1, linetype = 2) +
+ ggplot2::scale_x_continuous("1 - specificity", limits = c(0, 1)) +
+ ggplot2::scale_y_continuous("Sensitivity", limits = c(0, 1)) +
ggplot2::ggtitle(evalType)
}
-
- plot <- gridExtra::marrangeGrob(plots, nrow =length(plots), ncol = 1)
-
- if (!is.null(saveLocation)){
- if(!dir.exists(saveLocation)){
- dir.create(saveLocation, recursive = T)
+
+ plot <- gridExtra::marrangeGrob(plots, nrow = length(plots), ncol = 1)
+
+ if (!is.null(saveLocation)) {
+ if (!dir.exists(saveLocation)) {
+ dir.create(saveLocation, recursive = TRUE)
}
ggplot2::ggsave(file.path(saveLocation, fileName), plot, width = 5, height = 4.5, dpi = 400)
}
@@ -353,22 +407,19 @@ plotSparseRoc <- function(
#'
#' @export
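+#' @examples
+#' # Illustration only (assumes an existing plpResult from runPlp()).
+#' \dontrun{
+#' plotPredictedPDF(plpResult, typeColumn = "evaluation")
+#' }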
plotPredictedPDF <- function(
- plpResult,
- typeColumn = 'evaluation',
- saveLocation = NULL,
- fileName = 'PredictedPDF.png'
- )
- {
-
- evalTypes <- unique(plpResult$performanceEvaluation$thresholdSummary[,typeColumn])
-
+ plpResult,
+ typeColumn = "evaluation",
+ saveLocation = NULL,
+ fileName = "PredictedPDF.png") {
+ evalTypes <- unique(plpResult$performanceEvaluation$thresholdSummary[, typeColumn])
+
plots <- list()
length(plots) <- length(evalTypes)
-
- for(ind in 1:length(evalTypes)){
+
+ for (ind in 1:length(evalTypes)) {
evalType <- evalTypes[ind]
- x <- plpResult$performanceEvaluation$thresholdSummary %>%
- dplyr::filter(.data[[typeColumn]] == evalType) %>%
+ x <- plpResult$performanceEvaluation$thresholdSummary %>%
+ dplyr::filter(.data[[typeColumn]] == evalType) %>%
dplyr::select(
"predictionThreshold",
"truePositiveCount",
@@ -376,48 +427,63 @@ plotPredictedPDF <- function(
"falsePositiveCount",
"falseNegativeCount"
)
-
- x<- x[order(x$predictionThreshold,-x$truePositiveCount, -x$falsePositiveCount),]
- x$out <- c(x$truePositiveCount[-length(x$truePositiveCount)]-x$truePositiveCount[-1], x$truePositiveCount[length(x$truePositiveCount)])
- x$nout <- c(x$falsePositiveCount[-length(x$falsePositiveCount)]-x$falsePositiveCount[-1], x$falsePositiveCount[length(x$falsePositiveCount)])
-
- vals <- c()
- for(i in 1:length(x$predictionThreshold)){
- if(i!=length(x$predictionThreshold)){
- upper <- x$predictionThreshold[i+1]} else {upper <- min(x$predictionThreshold[i]+0.01,1)}
- val <- x$predictionThreshold[i]+stats::runif(x$out[i])*(upper-x$predictionThreshold[i])
- vals <- c(val, vals)
- }
- vals <- vals[!is.na(vals)] #assigned
-
- vals2 <- c()
- for(i in 1:length(x$predictionThreshold)){
- if(i!=length(x$predictionThreshold)){
- upper <- x$predictionThreshold[i+1]} else {upper <- min(x$predictionThreshold[i]+0.01,1)}
- val2 <- x$predictionThreshold[i]+stats::runif(x$nout[i])*(upper-x$predictionThreshold[i])
- vals2 <- c(val2, vals2)
- }
- vals2 <- vals2[!is.na(vals2)] #assigned
-
- x <- rbind(data.frame(variable=rep('outcome',length(vals)), value=vals),
- data.frame(variable=rep('No outcome',length(vals2)), value=vals2)
- )
-
- plots[[ind]] <- ggplot2::ggplot(x, ggplot2::aes(x=.data$value,
- group=.data$variable,
- fill=.data$variable)) +
- ggplot2::geom_density(ggplot2::aes(x=.data$value, fill=.data$variable), alpha=.3) +
- ggplot2::scale_x_continuous("Prediction Threshold") + #, limits=c(0,1)) +
- ggplot2::scale_y_continuous("Density") +
- ggplot2::guides(fill=ggplot2::guide_legend(title="Class")) +
- ggplot2::ggtitle(evalType)
+
+ x <- x[order(x$predictionThreshold, -x$truePositiveCount, -x$falsePositiveCount), ]
+ x$out <- c(
+ x$truePositiveCount[-length(x$truePositiveCount)] - x$truePositiveCount[-1],
+ x$truePositiveCount[length(x$truePositiveCount)]
+ )
+ x$nout <- c(
+ x$falsePositiveCount[-length(x$falsePositiveCount)] - x$falsePositiveCount[-1],
+ x$falsePositiveCount[length(x$falsePositiveCount)]
+ )
+
+ vals <- c()
+ for (i in 1:length(x$predictionThreshold)) {
+ if (i != length(x$predictionThreshold)) {
+ upper <- x$predictionThreshold[i + 1]
+ } else {
+ upper <- min(x$predictionThreshold[i] + 0.01, 1)
+ }
+ val <- x$predictionThreshold[i] + stats::runif(x$out[i]) * (upper - x$predictionThreshold[i])
+ vals <- c(val, vals)
+ }
+ vals <- vals[!is.na(vals)] # assigned
+
+ vals2 <- c()
+ for (i in 1:length(x$predictionThreshold)) {
+ if (i != length(x$predictionThreshold)) {
+ upper <- x$predictionThreshold[i + 1]
+ } else {
+ upper <- min(x$predictionThreshold[i] + 0.01, 1)
+ }
+ val2 <- x$predictionThreshold[i] + stats::runif(x$nout[i]) * (upper - x$predictionThreshold[i])
+ vals2 <- c(val2, vals2)
+ }
+ vals2 <- vals2[!is.na(vals2)] # assigned
+
+ x <- rbind(
+ data.frame(variable = rep("outcome", length(vals)), value = vals),
+ data.frame(variable = rep("No outcome", length(vals2)), value = vals2)
+ )
+
+ plots[[ind]] <- ggplot2::ggplot(x, ggplot2::aes(
+ x = .data$value,
+ group = .data$variable,
+ fill = .data$variable
+ )) +
+ ggplot2::geom_density(ggplot2::aes(x = .data$value, fill = .data$variable), alpha = .3) +
+ ggplot2::scale_x_continuous("Prediction Threshold") + # , limits=c(0,1)) +
+ ggplot2::scale_y_continuous("Density") +
+ ggplot2::guides(fill = ggplot2::guide_legend(title = "Class")) +
+ ggplot2::ggtitle(evalType)
}
-
- plot <- gridExtra::marrangeGrob(plots, nrow=length(plots), ncol=1)
-
- if (!is.null(saveLocation)){
- if(!dir.exists(saveLocation)){
- dir.create(saveLocation, recursive = T)
+
+ plot <- gridExtra::marrangeGrob(plots, nrow = length(plots), ncol = 1)
+
+ if (!is.null(saveLocation)) {
+ if (!dir.exists(saveLocation)) {
+ dir.create(saveLocation, recursive = TRUE)
}
ggplot2::ggsave(file.path(saveLocation, fileName), plot, width = 5, height = 4.5, dpi = 400)
}
@@ -444,21 +510,19 @@ plotPredictedPDF <- function(
#'
#' @export
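+#' @examples
+#' # Illustration only (assumes an existing plpResult from runPlp()).
+#' \dontrun{
+#' plotPreferencePDF(plpResult, saveLocation = tempdir())
+#' }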
plotPreferencePDF <- function(
- plpResult,
- typeColumn = 'evaluation',
- saveLocation = NULL,
- fileName = 'plotPreferencePDF.png'
-){
-
- evalTypes <- unique(plpResult$performanceEvaluation$thresholdSummary[,typeColumn])
-
+ plpResult,
+ typeColumn = "evaluation",
+ saveLocation = NULL,
+ fileName = "plotPreferencePDF.png") {
+ evalTypes <- unique(plpResult$performanceEvaluation$thresholdSummary[, typeColumn])
+
plots <- list()
length(plots) <- length(evalTypes)
-
- for(ind in 1:length(evalTypes)){
+
+ for (ind in 1:length(evalTypes)) {
evalType <- evalTypes[ind]
- x <- plpResult$performanceEvaluation$thresholdSummary %>%
- dplyr::filter(.data[[typeColumn]] == evalType) %>%
+ x <- plpResult$performanceEvaluation$thresholdSummary %>%
+ dplyr::filter(.data[[typeColumn]] == evalType) %>%
dplyr::select(
"preferenceThreshold",
"truePositiveCount",
@@ -466,49 +530,63 @@ plotPreferencePDF <- function(
"falsePositiveCount",
"falseNegativeCount"
)
-
- x<- x[order(x$preferenceThreshold,-x$truePositiveCount, x$trueNegativeCount),]
- x$out <- c(x$truePositiveCount[-length(x$truePositiveCount)]-x$truePositiveCount[-1], x$truePositiveCount[length(x$truePositiveCount)])
- x$nout <- c(x$falsePositiveCount[-length(x$falsePositiveCount)]-x$falsePositiveCount[-1], x$falsePositiveCount[length(x$falsePositiveCount)])
-
- vals <- c()
- for(i in 1:length(x$preferenceThreshold)){
- if(i!=length(x$preferenceThreshold)){
- upper <- x$preferenceThreshold[i+1]} else {upper <- 1}
- val <- x$preferenceThreshold[i]+stats::runif(x$out[i])*(upper-x$preferenceThreshold[i])
- vals <- c(val, vals)
- }
- vals <- vals[!is.na(vals)]
-
- vals2 <- c()
- for(i in 1:length(x$preferenceThreshold)){
- if(i!=length(x$preferenceThreshold)){
- upper <- x$preferenceThreshold[i+1]} else {upper <- 1}
- val2 <- x$preferenceThreshold[i]+stats::runif(x$nout[i])*(upper-x$preferenceThreshold[i])
- vals2 <- c(val2, vals2)
- }
- vals2 <- vals2[!is.na(vals2)]
-
- x <- rbind(data.frame(variable=rep('outcome',length(vals)), value=vals),
- data.frame(variable=rep('No outcome',length(vals2)), value=vals2)
- )
-
- plots[[ind]] <- ggplot2::ggplot(x, ggplot2::aes(x=.data$value,
- group=.data$variable,
- fill=.data$variable)) +
- ggplot2::geom_density(ggplot2::aes(x=.data$value, fill=.data$variable), alpha=.3) +
- ggplot2::scale_x_continuous("Preference Threshold")+#, limits=c(0,1)) +
- ggplot2::scale_y_continuous("Density") +
- ggplot2::guides(fill=ggplot2::guide_legend(title="Class")) +
- ggplot2::ggtitle(evalType)
-
+
+ x <- x[order(x$preferenceThreshold, -x$truePositiveCount, x$trueNegativeCount), ]
+ x$out <- c(
+ x$truePositiveCount[-length(x$truePositiveCount)] - x$truePositiveCount[-1],
+ x$truePositiveCount[length(x$truePositiveCount)]
+ )
+ x$nout <- c(
+ x$falsePositiveCount[-length(x$falsePositiveCount)] - x$falsePositiveCount[-1],
+ x$falsePositiveCount[length(x$falsePositiveCount)]
+ )
+
+ vals <- c()
+ for (i in 1:length(x$preferenceThreshold)) {
+ if (i != length(x$preferenceThreshold)) {
+ upper <- x$preferenceThreshold[i + 1]
+ } else {
+ upper <- 1
+ }
+ val <- x$preferenceThreshold[i] + stats::runif(x$out[i]) * (upper - x$preferenceThreshold[i])
+ vals <- c(val, vals)
+ }
+ vals <- vals[!is.na(vals)]
+
+ vals2 <- c()
+ for (i in 1:length(x$preferenceThreshold)) {
+ if (i != length(x$preferenceThreshold)) {
+ upper <- x$preferenceThreshold[i + 1]
+ } else {
+ upper <- 1
+ }
+ val2 <- x$preferenceThreshold[i] + stats::runif(x$nout[i]) * (upper - x$preferenceThreshold[i])
+ vals2 <- c(val2, vals2)
+ }
+ vals2 <- vals2[!is.na(vals2)]
+
+ x <- rbind(
+ data.frame(variable = rep("outcome", length(vals)), value = vals),
+ data.frame(variable = rep("No outcome", length(vals2)), value = vals2)
+ )
+
+ plots[[ind]] <- ggplot2::ggplot(x, ggplot2::aes(
+ x = .data$value,
+ group = .data$variable,
+ fill = .data$variable
+ )) +
+ ggplot2::geom_density(ggplot2::aes(x = .data$value, fill = .data$variable), alpha = .3) +
+ ggplot2::scale_x_continuous("Preference Threshold") + # , limits=c(0,1)) +
+ ggplot2::scale_y_continuous("Density") +
+ ggplot2::guides(fill = ggplot2::guide_legend(title = "Class")) +
+ ggplot2::ggtitle(evalType)
}
-
- plot <- gridExtra::marrangeGrob(plots, nrow=length(plots), ncol=1)
-
- if (!is.null(saveLocation)){
- if(!dir.exists(saveLocation)){
- dir.create(saveLocation, recursive = T)
+
+ plot <- gridExtra::marrangeGrob(plots, nrow = length(plots), ncol = 1)
+
+ if (!is.null(saveLocation)) {
+ if (!dir.exists(saveLocation)) {
+ dir.create(saveLocation, recursive = TRUE)
}
ggplot2::ggsave(file.path(saveLocation, fileName), plot, width = 5, height = 4.5, dpi = 400)
}
@@ -535,51 +613,51 @@ plotPreferencePDF <- function(
#'
#' @export
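+#' @examples
+#' # Sketch under the assumption that plpResult came from runPlp().
+#' \dontrun{
+#' plotPrecisionRecall(plpResult, fileName = "precisionRecall.png")
+#' }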
plotPrecisionRecall <- function(
- plpResult,
- typeColumn = 'evaluation',
- saveLocation = NULL,
- fileName = 'roc.png'
-){
-
- evalTypes <- unique(plpResult$performanceEvaluation$thresholdSummary[,typeColumn])
-
+ plpResult,
+ typeColumn = "evaluation",
+ saveLocation = NULL,
+ fileName = "roc.png") {
+ evalTypes <- unique(plpResult$performanceEvaluation$thresholdSummary[, typeColumn])
+
plots <- list()
length(plots) <- length(evalTypes)
-
- for(i in 1:length(evalTypes)){
+
+ for (i in 1:length(evalTypes)) {
evalType <- evalTypes[i]
-
- N <- max(plpResult$performanceEvaluation$thresholdSummary %>%
- dplyr::filter(.data[[typeColumn]] == evalType) %>%
- dplyr::select("falseCount") %>%
- dplyr::pull(), na.rm = T)
-
-
- O <- max(plpResult$performanceEvaluation$thresholdSummary %>%
- dplyr::filter(.data[[typeColumn]] == evalType) %>%
- dplyr::select("trueCount") %>%
- dplyr::pull(), na.rm = T)
-
- inc <- O/(O + N)
-
- x <- plpResult$performanceEvaluation$thresholdSummary %>%
- dplyr::filter(.data[[typeColumn]] == evalType) %>%
+
+ N <- max(plpResult$performanceEvaluation$thresholdSummary %>%
+ dplyr::filter(.data[[typeColumn]] == evalType) %>%
+ dplyr::select("falseCount") %>%
+ dplyr::pull(), na.rm = TRUE)
+
+
+ O <- max(plpResult$performanceEvaluation$thresholdSummary %>%
+ dplyr::filter(.data[[typeColumn]] == evalType) %>%
+ dplyr::select("trueCount") %>%
+ dplyr::pull(), na.rm = TRUE)
+
+ inc <- O / (O + N)
+
+ x <- plpResult$performanceEvaluation$thresholdSummary %>%
+ dplyr::filter(.data[[typeColumn]] == evalType) %>%
dplyr::select("positivePredictiveValue", "sensitivity")
-
+
plots[[i]] <- ggplot2::ggplot(x, ggplot2::aes(.data$sensitivity, .data$positivePredictiveValue)) +
- ggplot2::geom_line(linewidth=1) +
- ggplot2::scale_x_continuous("Recall")+#, limits=c(0,1)) +
- ggplot2::scale_y_continuous("Precision") + #, limits=c(0,1))
- ggplot2::geom_hline(yintercept = inc, linetype="dashed",
- color = "red", linewidth = 1) +
+ ggplot2::geom_line(linewidth = 1) +
+ ggplot2::scale_x_continuous("Recall") + # , limits=c(0,1)) +
+ ggplot2::scale_y_continuous("Precision") + # , limits=c(0,1))
+ ggplot2::geom_hline(
+ yintercept = inc, linetype = "dashed",
+ color = "red", linewidth = 1
+ ) +
ggplot2::ggtitle(evalType)
}
-
- plot <- gridExtra::marrangeGrob(plots, nrow=length(plots), ncol=1)
-
- if (!is.null(saveLocation)){
- if(!dir.exists(saveLocation)){
- dir.create(saveLocation, recursive = T)
+
+ plot <- gridExtra::marrangeGrob(plots, nrow = length(plots), ncol = 1)
+
+ if (!is.null(saveLocation)) {
+ if (!dir.exists(saveLocation)) {
+ dir.create(saveLocation, recursive = TRUE)
}
ggplot2::ggsave(file.path(saveLocation, fileName), plot, width = 5, height = 4.5, dpi = 400)
}
@@ -606,42 +684,42 @@ plotPrecisionRecall <- function(
#'
#' @export
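+#' @examples
+#' # Sketch under the assumption that plpResult came from runPlp().
+#' \dontrun{
+#' plotF1Measure(plpResult, typeColumn = "evaluation")
+#' }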
plotF1Measure <- function(
- plpResult,
- typeColumn = 'evaluation',
- saveLocation = NULL,
- fileName = 'roc.png'
-){
- evalTypes <- unique(plpResult$performanceEvaluation$thresholdSummary[,typeColumn])
-
+ plpResult,
+ typeColumn = "evaluation",
+ saveLocation = NULL,
+ fileName = "roc.png") {
+ evalTypes <- unique(plpResult$performanceEvaluation$thresholdSummary[, typeColumn])
+
plots <- list()
length(plots) <- length(evalTypes)
-
- for(i in 1:length(evalTypes)){
-
+
+ for (i in 1:length(evalTypes)) {
evalType <- evalTypes[i]
-
- x <- plpResult$performanceEvaluation$thresholdSummary %>%
- dplyr::filter(.data[[typeColumn]] == evalType) %>%
+
+ x <- plpResult$performanceEvaluation$thresholdSummary %>%
+ dplyr::filter(.data[[typeColumn]] == evalType) %>%
dplyr::select("predictionThreshold", "f1Score")
-
- if(sum(is.nan(x$f1Score))>0){
- x <- x[!is.nan(x$f1Score),]
- if(nrow(x)==0){return(NULL)}
- }
-
- plots[[i]] <- ggplot2::ggplot(x, ggplot2::aes(.data$predictionThreshold, .data$f1Score)) +
- ggplot2::geom_line(linewidth = 1) +
- ggplot2::geom_point(size = 1) +
- ggplot2::scale_x_continuous("predictionThreshold")+#, limits=c(0,1)) +
- ggplot2::scale_y_continuous("F1Score") +#, limits=c(0,1))
- ggplot2::ggtitle(evalType)
+
+ if (sum(is.nan(x$f1Score)) > 0) {
+ x <- x[!is.nan(x$f1Score), ]
+ if (nrow(x) == 0) {
+ return(NULL)
+ }
+ }
+
+ plots[[i]] <- ggplot2::ggplot(x, ggplot2::aes(.data$predictionThreshold, .data$f1Score)) +
+ ggplot2::geom_line(linewidth = 1) +
+ ggplot2::geom_point(size = 1) +
+ ggplot2::scale_x_continuous("predictionThreshold") + # , limits=c(0,1)) +
+ ggplot2::scale_y_continuous("F1Score") + # , limits=c(0,1))
+ ggplot2::ggtitle(evalType)
}
-
- plot <- gridExtra::marrangeGrob(plots, nrow=length(plots), ncol=1)
-
- if (!is.null(saveLocation)){
- if(!dir.exists(saveLocation)){
- dir.create(saveLocation, recursive = T)
+
+ plot <- gridExtra::marrangeGrob(plots, nrow = length(plots), ncol = 1)
+
+ if (!is.null(saveLocation)) {
+ if (!dir.exists(saveLocation)) {
+ dir.create(saveLocation, recursive = TRUE)
}
ggplot2::ggsave(file.path(saveLocation, fileName), plot, width = 5, height = 4.5, dpi = 400)
}
@@ -650,9 +728,9 @@ plotF1Measure <- function(
-#=============
-# DEMOGRAPHICSUMMARY PLOTS
-#=============
+# =============
+# DEMOGRAPHICSUMMARY PLOTS
+# =============
#' Plot the Observed vs. expected incidence, by age and gender
#'
#' @details
@@ -671,113 +749,118 @@ plotF1Measure <- function(
#'
#' @export
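+#' @examples
+#' # Hedged example; plpResult is assumed to exist (output of runPlp()).
+#' \dontrun{
+#' plotDemographicSummary(plpResult, saveLocation = tempdir())
+#' }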
plotDemographicSummary <- function(
- plpResult,
- typeColumn = 'evaluation',
- saveLocation = NULL,
- fileName = 'roc.png'
-)
-{
-
- evalTypes <- unique(plpResult$performanceEvaluation$demographicSummary[,typeColumn])
-
+ plpResult,
+ typeColumn = "evaluation",
+ saveLocation = NULL,
+ fileName = "roc.png") {
+ evalTypes <- unique(plpResult$performanceEvaluation$demographicSummary[, typeColumn])
+
plots <- list()
length(plots) <- length(evalTypes)
-
- for(i in 1:length(evalTypes)){
+
+ for (i in 1:length(evalTypes)) {
evalType <- evalTypes[i]
- x <- plpResult$performanceEvaluation$demographicSummary %>%
- dplyr::filter(.data[[typeColumn]] == evalType) %>%
+ x <- plpResult$performanceEvaluation$demographicSummary %>%
+ dplyr::filter(.data[[typeColumn]] == evalType) %>%
dplyr::select(
"ageGroup",
"genGroup",
"averagePredictedProbability",
- "PersonCountAtRisk",
+ "PersonCountAtRisk",
"PersonCountWithOutcome"
- )
-
+ )
+
# remove -1 values:
x$averagePredictedProbability[is.na(x$averagePredictedProbability)] <- 0
- x <- x[x$PersonCountWithOutcome != -1,]
- if(nrow(x)==0){
+ x <- x[x$PersonCountWithOutcome != -1, ]
+ if (nrow(x) == 0) {
return(NULL)
}
-
- x$observed <- x$PersonCountWithOutcome/x$PersonCountAtRisk
-
- x <- x[,colnames(x)%in%c('ageGroup','genGroup','averagePredictedProbability','observed')]
-
- # if age or gender missing add
- if(sum(colnames(x)=='ageGroup')==1 && sum(colnames(x)=='genGroup')==0 ){
- x$genGroup = rep('Non', nrow(x))
- evaluation$demographicSummary$genGroup = rep('Non', nrow(evaluation$demographicSummary))
- }
- if(sum(colnames(x)=='ageGroup')==0 && sum(colnames(x)=='genGroup')==1 ){
- x$ageGroup = rep('-1', nrow(x))
- evaluation$demographicSummary$ageGroup = rep('-1', nrow(evaluation$demographicSummary))
-
+
+ x$observed <- x$PersonCountWithOutcome / x$PersonCountAtRisk
+
+ x <- x[, colnames(x) %in% c("ageGroup", "genGroup", "averagePredictedProbability", "observed")]
+
+    # if the age or gender grouping is missing, add a placeholder group
+    if (sum(colnames(x) == "ageGroup") == 1 && sum(colnames(x) == "genGroup") == 0) {
+      x$genGroup <- rep("Non", nrow(x))
+      plpResult$performanceEvaluation$demographicSummary$genGroup <-
+        rep("Non", nrow(plpResult$performanceEvaluation$demographicSummary))
+    }
+    if (sum(colnames(x) == "ageGroup") == 0 && sum(colnames(x) == "genGroup") == 1) {
+      x$ageGroup <- rep("-1", nrow(x))
+      plpResult$performanceEvaluation$demographicSummary$ageGroup <-
+        rep("-1", nrow(plpResult$performanceEvaluation$demographicSummary))
}
x <- tidyr::pivot_longer(
- data = x,
- cols = colnames(x)[!colnames(x) %in% c('ageGroup','genGroup')],
- names_to = 'variable',
+ data = x,
+ cols = colnames(x)[!colnames(x) %in% c("ageGroup", "genGroup")],
+ names_to = "variable",
values_to = "value"
- )
- #x <- reshape2::melt(x, id.vars=c('ageGroup','genGroup'))
-
- # 1.96*StDevPredictedProbability
- ci <- plpResult$performanceEvaluation$demographicSummary %>%
- dplyr::filter(.data[[typeColumn]] == evalType) %>%
+ )
+ ci <- plpResult$performanceEvaluation$demographicSummary %>%
+ dplyr::filter(.data[[typeColumn]] == evalType) %>%
dplyr::select(
"ageGroup",
"genGroup",
"averagePredictedProbability",
"StDevPredictedProbability"
)
-
+
ci$StDevPredictedProbability[is.na(ci$StDevPredictedProbability)] <- 1
- ci$lower <- ci$averagePredictedProbability-1.96*ci$StDevPredictedProbability
- ci$lower[ci$lower <0] <- 0
- ci$upper <- ci$averagePredictedProbability+1.96*ci$StDevPredictedProbability
- ci$upper[ci$upper >1] <- max(ci$upper[ci$upper <1])
-
- x$age <- gsub('Age group:','', x$ageGroup)
- x$age <- factor(x$age,levels=c(" 0-4"," 5-9"," 10-14",
- " 15-19"," 20-24"," 25-29"," 30-34"," 35-39"," 40-44",
- " 45-49"," 50-54"," 55-59"," 60-64"," 65-69"," 70-74",
- " 75-79"," 80-84"," 85-89"," 90-94"," 95-99","-1"),ordered=TRUE)
-
- x <- merge(x, ci[,c('ageGroup','genGroup','lower','upper')], by=c('ageGroup','genGroup'))
- x <- x[!is.na(x$value),]
-
- plots[[i]] <- ggplot2::ggplot(data=x,
- ggplot2::aes(x=.data$age,
- group=interaction(.data$variable,.data$genGroup))) +
-
- ggplot2::geom_line(ggplot2::aes(y=.data$value, group=.data$variable,
- color=.data$variable,
- linetype = .data$variable))+
- ggplot2::geom_ribbon(data=x[x$variable!='observed',],
- ggplot2::aes(ymin=.data$lower, ymax=.data$upper
- , group=.data$genGroup),
- fill="blue", alpha=0.2) +
- ggplot2::facet_grid(.~ .data$genGroup, scales = "free") +
+ ci$lower <- ci$averagePredictedProbability - 1.96 * ci$StDevPredictedProbability
+ ci$lower[ci$lower < 0] <- 0
+ ci$upper <- ci$averagePredictedProbability + 1.96 * ci$StDevPredictedProbability
+ ci$upper[ci$upper > 1] <- max(ci$upper[ci$upper < 1])
+
+ x$age <- gsub("Age group:", "", x$ageGroup)
+ x$age <- factor(x$age, levels = c(
+ " 0-4", " 5-9", " 10-14",
+ " 15-19", " 20-24", " 25-29", " 30-34", " 35-39", " 40-44",
+ " 45-49", " 50-54", " 55-59", " 60-64", " 65-69", " 70-74",
+ " 75-79", " 80-84", " 85-89", " 90-94", " 95-99", "-1"
+ ), ordered = TRUE)
+
+ x <- merge(x, ci[, c("ageGroup", "genGroup", "lower", "upper")], by = c("ageGroup", "genGroup"))
+ x <- x[!is.na(x$value), ]
+
+ plots[[i]] <- ggplot2::ggplot(
+ data = x,
+ ggplot2::aes(
+ x = .data$age,
+ group = interaction(.data$variable, .data$genGroup)
+ )
+ ) +
+ ggplot2::geom_line(ggplot2::aes(
+ y = .data$value, group = .data$variable,
+ color = .data$variable,
+ linetype = .data$variable
+ )) +
+ ggplot2::geom_ribbon(
+ data = x[x$variable != "observed", ],
+ ggplot2::aes(
+ ymin = .data$lower, ymax = .data$upper,
+ group = .data$genGroup
+ ),
+ fill = "blue", alpha = 0.2
+ ) +
+ ggplot2::facet_grid(. ~ .data$genGroup, scales = "free") +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 90, hjust = 1)) +
ggplot2::scale_y_continuous("Fraction") +
ggplot2::scale_x_discrete("Age") +
- ggplot2::scale_color_manual(values = c("royalblue4","red"),
- guide = ggplot2::guide_legend(title = NULL),
- labels = c("Expected", "Observed")) +
-
- ggplot2::guides( linetype = "none") + # change from FALSE due to warning
+ ggplot2::scale_color_manual(
+ values = c("royalblue4", "red"),
+ guide = ggplot2::guide_legend(title = NULL),
+ labels = c("Expected", "Observed")
+ ) +
+ ggplot2::guides(linetype = "none") + # change from FALSE due to warning
ggplot2::ggtitle(evalType)
}
-
- plot <- gridExtra::marrangeGrob(plots, nrow=length(plots), ncol=1)
-
- if (!is.null(saveLocation)){
- if(!dir.exists(saveLocation)){
- dir.create(saveLocation, recursive = T)
+
+ plot <- gridExtra::marrangeGrob(plots, nrow = length(plots), ncol = 1)
+
+ if (!is.null(saveLocation)) {
+ if (!dir.exists(saveLocation)) {
+ dir.create(saveLocation, recursive = TRUE)
}
ggplot2::ggsave(file.path(saveLocation, fileName), plot, width = 5, height = 4.5, dpi = 400)
}
@@ -785,9 +868,9 @@ plotDemographicSummary <- function(
}
-#=============
-# CALIBRATIONSUMMARY PLOTS
-#=============
+# =============
+# CALIBRATIONSUMMARY PLOTS
+# =============
#' Plot the calibration
#'
#' @details
@@ -806,70 +889,78 @@ plotDemographicSummary <- function(
#'
#' @export
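+#' @examples
+#' # Hedged example; plpResult is assumed to exist (output of runPlp()).
+#' \dontrun{
+#' plotSparseCalibration(plpResult, fileName = "calibration.png")
+#' }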
plotSparseCalibration <- function(
- plpResult,
- typeColumn = 'evaluation',
- saveLocation = NULL,
- fileName = 'roc.png'
-){
-
- evalTypes <- unique(plpResult$performanceEvaluation$calibrationSummary[,typeColumn])
-
+ plpResult,
+ typeColumn = "evaluation",
+ saveLocation = NULL,
+ fileName = "roc.png") {
+ evalTypes <- unique(plpResult$performanceEvaluation$calibrationSummary[, typeColumn])
+
plots <- list()
length(plots) <- length(evalTypes)
-
- for(i in 1:length(evalTypes)){
+
+ for (i in 1:length(evalTypes)) {
evalType <- evalTypes[i]
- x <- plpResult$performanceEvaluation$calibrationSummary %>%
- dplyr::filter(.data[[typeColumn]] == evalType) %>%
+ x <- plpResult$performanceEvaluation$calibrationSummary %>%
+ dplyr::filter(.data[[typeColumn]] == evalType) %>%
dplyr::select("averagePredictedProbability", "observedIncidence")
- maxVal <- max(x$averagePredictedProbability,x$observedIncidence)
- model <- stats::lm(observedIncidence~averagePredictedProbability, data=x)
- res <- model$coefficients
- names(res) <- c('Intercept','Gradient')
-
- # confidence int
- interceptConf <- stats::confint(model)[1,]
- gradientConf <- stats::confint(model)[2,]
-
- cis <- data.frame(lci = interceptConf[1]+seq(0,1,length.out = nrow(x))*gradientConf[1],
- uci = interceptConf[2]+seq(0,1,length.out = nrow(x))*gradientConf[2],
- x=seq(0,1,length.out = nrow(x)))
-
- x <- cbind(x, cis)
- # TODO: CHECK INPUT
- plots[[i]] <- ggplot2::ggplot(data=x,
- ggplot2::aes(x=.data$averagePredictedProbability,
- y=.data$observedIncidence
- )) +
- ggplot2::geom_ribbon(ggplot2::aes(ymin=.data$lci,ymax=.data$uci, x=x),
- fill="blue", alpha=0.2) +
- ggplot2::geom_point(size = 1, color='darkblue') +
- ggplot2::coord_cartesian(ylim = c(0, maxVal), xlim =c(0,maxVal)) +
- ggplot2::geom_abline(intercept = 0, slope = 1, linetype = 2, linewidth = 1,
- show.legend = TRUE) +
- ggplot2::geom_abline(intercept = res['Intercept'], slope = res['Gradient'],
- linetype = 1,show.legend = TRUE,
- color='darkblue') +
- ggplot2::scale_x_continuous("Average Predicted Probability") +
- ggplot2::scale_y_continuous("Observed Fraction With Outcome") +
- ggplot2::ggtitle(evalType)
+ maxVal <- max(x$averagePredictedProbability, x$observedIncidence)
+ model <- stats::lm(observedIncidence ~ averagePredictedProbability, data = x)
+ res <- model$coefficients
+ names(res) <- c("Intercept", "Gradient")
+
+ # confidence int
+ interceptConf <- stats::confint(model)[1, ]
+ gradientConf <- stats::confint(model)[2, ]
+
+ cis <- data.frame(
+ lci = interceptConf[1] + seq(0, 1, length.out = nrow(x)) * gradientConf[1],
+ uci = interceptConf[2] + seq(0, 1, length.out = nrow(x)) * gradientConf[2],
+ x = seq(0, 1, length.out = nrow(x))
+ )
+
+ x <- cbind(x, cis)
+ # TODO: CHECK INPUT
+ plots[[i]] <- ggplot2::ggplot(
+ data = x,
+ ggplot2::aes(
+ x = .data$averagePredictedProbability,
+ y = .data$observedIncidence
+ )
+ ) +
+ ggplot2::geom_ribbon(ggplot2::aes(ymin = .data$lci, ymax = .data$uci, x = x),
+ fill = "blue", alpha = 0.2
+ ) +
+ ggplot2::geom_point(size = 1, color = "darkblue") +
+ ggplot2::coord_cartesian(ylim = c(0, maxVal), xlim = c(0, maxVal)) +
+ ggplot2::geom_abline(
+ intercept = 0, slope = 1, linetype = 2, linewidth = 1,
+ show.legend = TRUE
+ ) +
+ ggplot2::geom_abline(
+ intercept = res["Intercept"], slope = res["Gradient"],
+ linetype = 1, show.legend = TRUE,
+ color = "darkblue"
+ ) +
+ ggplot2::scale_x_continuous("Average Predicted Probability") +
+ ggplot2::scale_y_continuous("Observed Fraction With Outcome") +
+ ggplot2::ggtitle(evalType)
}
- plot <- gridExtra::marrangeGrob(plots, nrow=length(plots), ncol=1)
-
- if (!is.null(saveLocation)){
- if(!dir.exists(saveLocation)){
- dir.create(saveLocation, recursive = T)
+ plot <- gridExtra::marrangeGrob(plots, nrow = length(plots), ncol = 1)
+
+ if (!is.null(saveLocation)) {
+ if (!dir.exists(saveLocation)) {
+ dir.create(saveLocation, recursive = TRUE)
}
ggplot2::ggsave(file.path(saveLocation, fileName), plot, width = 5, height = 4.5, dpi = 400)
}
return(plot)
}
-#=============
+# =============
# CALIBRATIONSUMMARY PLOTS 2
-#=============
+# =============
#' Plot the conventional calibration
#'
#' @details
@@ -888,59 +979,60 @@ plotSparseCalibration <- function(
#'
#' @export
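+#' @examples
+#' # Illustration only, assuming a plpResult produced by runPlp().
+#' \dontrun{
+#' plotSparseCalibration2(plpResult, saveLocation = tempdir())
+#' }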
plotSparseCalibration2 <- function(
- plpResult,
- typeColumn = 'evaluation',
- saveLocation = NULL,
- fileName = 'roc.png'
-){
-
- evalTypes <- unique(plpResult$performanceEvaluation$calibrationSummary[,typeColumn])
-
+ plpResult,
+ typeColumn = "evaluation",
+ saveLocation = NULL,
+ fileName = "roc.png") {
+ evalTypes <- unique(plpResult$performanceEvaluation$calibrationSummary[, typeColumn])
+
plots <- list()
length(plots) <- length(evalTypes)
-
- for(i in 1:length(evalTypes)){
+
+ for (i in 1:length(evalTypes)) {
evalType <- evalTypes[i]
- x <- plpResult$performanceEvaluation$calibrationSummary %>%
- dplyr::filter(.data[[typeColumn]] == evalType) %>%
+ x <- plpResult$performanceEvaluation$calibrationSummary %>%
+ dplyr::filter(.data[[typeColumn]] == evalType) %>%
dplyr::select("averagePredictedProbability", "observedIncidence", "PersonCountAtRisk")
- cis <- apply(x, 1, function(x) stats::binom.test(round(x[2]*x[3]), x[3], alternative = c("two.sided"), conf.level = 0.95)$conf.int)
- x$lci <- cis[1,]
- x$uci <- cis[2,]
-
- maxes <- max(max(x$averagePredictedProbability), max(x$observedIncidence))*1.1
-
- limits <- ggplot2::aes(ymax = .data$uci, ymin= .data$lci)
-
- plots[[i]] <- ggplot2::ggplot(data=x,
- ggplot2::aes(x=.data$averagePredictedProbability, y=.data$observedIncidence
- )) +
- ggplot2::geom_point(size=2, color='black') +
- ggplot2::geom_errorbar(limits) +
- ggplot2::geom_line(colour='darkgrey') +
- ggplot2::geom_abline(intercept = 0, slope = 1, linetype = 5, linewidth=0.4,
- show.legend = TRUE) +
- ggplot2::scale_x_continuous("Average Predicted Probability") +
- ggplot2::scale_y_continuous("Observed Fraction With Outcome") +
- ggplot2::coord_cartesian(xlim = c(0, maxes), ylim=c(0,maxes)) +
- ggplot2::ggtitle(evalType)
+ cis <- apply(x, 1, function(x) stats::binom.test(round(x[2] * x[3]), x[3], alternative = c("two.sided"), conf.level = 0.95)$conf.int)
+ x$lci <- cis[1, ]
+ x$uci <- cis[2, ]
+
+ maxes <- max(max(x$averagePredictedProbability), max(x$observedIncidence)) * 1.1
+
+ limits <- ggplot2::aes(ymax = .data$uci, ymin = .data$lci)
+
+ plots[[i]] <- ggplot2::ggplot(
+ data = x,
+ ggplot2::aes(x = .data$averagePredictedProbability, y = .data$observedIncidence)
+ ) +
+ ggplot2::geom_point(size = 2, color = "black") +
+ ggplot2::geom_errorbar(limits) +
+ ggplot2::geom_line(colour = "darkgrey") +
+ ggplot2::geom_abline(
+ intercept = 0, slope = 1, linetype = 5, linewidth = 0.4,
+ show.legend = TRUE
+ ) +
+ ggplot2::scale_x_continuous("Average Predicted Probability") +
+ ggplot2::scale_y_continuous("Observed Fraction With Outcome") +
+ ggplot2::coord_cartesian(xlim = c(0, maxes), ylim = c(0, maxes)) +
+ ggplot2::ggtitle(evalType)
}
-
- plot <- gridExtra::marrangeGrob(plots, nrow=length(plots), ncol=1)
-
- if (!is.null(saveLocation)){
- if(!dir.exists(saveLocation)){
- dir.create(saveLocation, recursive = T)
+
+ plot <- gridExtra::marrangeGrob(plots, nrow = length(plots), ncol = 1)
+
+ if (!is.null(saveLocation)) {
+ if (!dir.exists(saveLocation)) {
+ dir.create(saveLocation, recursive = TRUE)
}
ggplot2::ggsave(file.path(saveLocation, fileName), plot, width = 5, height = 4.5, dpi = 400)
}
return(plot)
}
-#=============
+# =============
# SMOOTHCALIBRATION plot
-#=============
+# =============
-#' Plot the smooth calibration as detailed in Calster et al. "A calibration heirarchy for risk models
+#' Plot the smooth calibration as detailed in Van Calster et al. "A calibration hierarchy for risk models
#' was defined: from utopia to empirical data" (2016)
#'
@@ -956,7 +1048,7 @@ plotSparseCalibration2 <- function(
#' @param nKnots The number of knots to be used by the rcs evaluation. Default is 5
#' @param scatter plot the decile calibrations as points on the graph. Default is False
#' @param bins The number of bins for the histogram. Default is 20.
-#' @param sample If using loess then by default 20,000 patients will be sampled to save time
+#' @param sample If using loess then by default 20,000 patients will be sampled to save time
#' @param typeColumn The name of the column specifying the evaluation type
#' @param saveLocation Directory to save plot (if NULL plot is not saved)
#' @param fileName Name of the file to save to plot, for example
@@ -973,31 +1065,39 @@ plotSmoothCalibration <- function(plpResult,
scatter = FALSE,
bins = 20,
sample = TRUE,
- typeColumn = 'evaluation',
+ typeColumn = "evaluation",
saveLocation = NULL,
fileName = "smoothCalibration.pdf") {
-
- if (!smooth %in% c("loess", "rcs")) stop(ParallelLogger::logError("Smooth type must be either 'loess' or 'rcs"))
- if (nKnots < 3) stop(ParallelLogger::logError("Number of knots must be larger than 3"))
+ if (!smooth %in% c("loess", "rcs")) {
+ stop(ParallelLogger::logError("Smooth type must be either 'loess' or 'rcs"))
+ }
+ if (nKnots < 3) {
+ stop(ParallelLogger::logError("Number of knots must be larger than 3"))
+ }
+ if (smooth == "rcs") {
+ rlang::check_installed("mgcv",
+ reason = "mgcv is required for restricted cubic spline smoothing"
+ )
+ }
- evalTypes <- unique(plpResult$performanceEvaluation$calibrationSummary[,typeColumn])
+ evalTypes <-
+ unique(plpResult$performanceEvaluation$calibrationSummary[, typeColumn])
plots <- list()
length(plots) <- length(evalTypes)
-
+
failedEvalType <- rep(FALSE, length(evalTypes))
names(failedEvalType) <- evalTypes
for (i in seq_along(evalTypes)) {
evalType <- evalTypes[i]
ParallelLogger::logInfo(paste("Smooth calibration plot for "), evalType)
-
- if('prediction'%in%names(plpResult)) {
- x <- plpResult$performanceEvaluation$calibrationSummary %>%
- dplyr::filter(.data[[typeColumn]] == evalType) %>%
+
+ if ("prediction" %in% names(plpResult)) {
+ x <- plpResult$performanceEvaluation$calibrationSummary %>%
+ dplyr::filter(.data[[typeColumn]] == evalType) %>%
dplyr::select("averagePredictedProbability", "observedIncidence")
-
- prediction <- plpResult$prediction %>% dplyr::filter(.data$evaluationType == evalType)
-
- maxVal <- max(x$averagePredictedProbability, x$observedIncidence)
+
+ prediction <- plpResult$prediction %>% dplyr::filter(.data$evaluationType == evalType)
+
maxes <- max(max(x$averagePredictedProbability), max(x$observedIncidence))
if (smooth == "rcs") {
@@ -1009,18 +1109,20 @@ plotSmoothCalibration <- function(plpResult,
y <- prediction$outcomeCount
p <- prediction$value
- nma <- !is.na(p + y) # select only non-missing cases
+ nma <- !is.na(p + y) # select only non-missing cases
sumNA <- sum(!nma)
- if (sumNA > 0)
+ if (sumNA > 0) {
warning(ParallelLogger::logWarn(paste(sumNA, "observations deleted due to NA probabilities or outcomes")))
+ }
y <- y[nma]
p <- p[nma]
- logit <- log(p/(1 - p)) # delete cases with 0 and 1 probs
+ logit <- log(p / (1 - p)) # delete cases with 0 and 1 probs
nonInf <- !is.infinite(logit)
sumNonInf <- sum(!nonInf)
- if (sumNonInf > 0)
+ if (sumNonInf > 0) {
warning(ParallelLogger::logWarn(paste(sumNonInf, "observations deleted due to probabilities of 0 or 1")))
+ }
y <- y[nonInf]
p <- p[nonInf]
@@ -1028,21 +1130,21 @@ plotSmoothCalibration <- function(plpResult,
p <- p[order(p)]
if (smooth == "loess") {
- if(sample){
- if(length(p)>40000){
- inds <- unique(c(0,seq(0, length(p), by = floor(length(p)/20000)),
- length(p)))
+ if (sample) {
+ if (length(p) > 40000) {
+ inds <- unique(c(0, seq(0, length(p),
+ by = floor(length(p) / 20000)
+ ), length(p)))
p <- p[inds]
y <- y[inds]
- } else if(length(p)>20000){
+ } else if (length(p) > 20000) {
inds <- sample(length(p), 20000)
p <- p[inds]
- y <- y[inds]
+ y <- y[inds]
}
}
# loess
smoothData <- data.frame(y, p)
- # xlim <- ylim <- c(0, 1)
smoothPlot <- plotSmoothCalibrationLoess(data = smoothData, span = span) +
ggplot2::coord_cartesian(
xlim = c(0, maxes),
@@ -1072,12 +1174,13 @@ plotSmoothCalibration <- function(plpResult,
data = x,
ggplot2::aes(
x = .data$averagePredictedProbability,
- y = .data$observedIncidence),
+ y = .data$observedIncidence
+ ),
color = "black",
size = 2
)
}
-
+
-      # Histogram object detailing the distibution of event/noevent for each probability interval
+      # Histogram object detailing the distribution of event/no-event for each probability interval
count <- NULL
histPlot <- ggplot2::ggplot() +
@@ -1085,9 +1188,9 @@ plotSmoothCalibration <- function(plpResult,
data = prediction,
ggplot2::aes(
x = .data$value,
- y = ggplot2::after_stat(count),
- fill = as.character(.data$outcomeCount) #MAYBE ISSUE
- ),
+ y = ggplot2::after_stat(count),
+          fill = as.character(.data$outcomeCount) # assumes outcomeCount is binary (0/1)
+ ),
bins = bins,
position = "stack",
alpha = 0.5,
@@ -1102,10 +1205,9 @@ plotSmoothCalibration <- function(plpResult,
) +
ggplot2::labs(x = "Predicted Probability") +
ggplot2::coord_cartesian(xlim = c(0, maxes))
-
} else {
# use calibrationSummary
- sparsePred <- plpResult$performanceEvaluation$calibrationSummary %>%
+ sparsePred <- plpResult$performanceEvaluation$calibrationSummary %>%
dplyr::filter(.data[[typeColumn]] == evalType) %>%
dplyr::mutate(
y = .data$observedIncidence,
@@ -1113,7 +1215,7 @@ plotSmoothCalibration <- function(plpResult,
)
smoothPlot <- plotSmoothCalibrationLoess(data = sparsePred, span = span)
-
+
# construct the plot grid
if (scatter) {
smoothPlot <- smoothPlot +
@@ -1126,46 +1228,56 @@ plotSmoothCalibration <- function(plpResult,
size = 1,
color = "black",
show.legend = FALSE
- )
+ )
}
-
+
-      # Histogram object detailing the distibution of event/noevent for each probability interval
+      # Histogram object detailing the distribution of event/no-event for each probability interval
-
- popData1 <- sparsePred[, c('averagePredictedProbability', 'PersonCountWithOutcome')]
+
+ popData1 <- sparsePred[, c("averagePredictedProbability", "PersonCountWithOutcome")]
popData1$Label <- "Outcome"
- colnames(popData1) <- c('averagePredictedProbability', 'PersonCount', "Label")
- popData2 <- sparsePred[,c('averagePredictedProbability', 'PersonCountAtRisk')]
+ colnames(popData1) <- c(
+ "averagePredictedProbability", "PersonCount",
+ "Label"
+ )
+ popData2 <- sparsePred[, c(
+ "averagePredictedProbability",
+ "PersonCountAtRisk"
+ )]
popData2$Label <- "No Outcome"
popData2$PersonCountAtRisk <- -1 * (popData2$PersonCountAtRisk - popData1$PersonCount)
- colnames(popData2) <- c('averagePredictedProbability', 'PersonCount', "Label")
+ colnames(popData2) <- c(
+ "averagePredictedProbability", "PersonCount",
+ "Label"
+ )
popData <- rbind(popData1, popData2)
- popData$averagePredictedProbability <- factor(popData$averagePredictedProbability)
+ popData$averagePredictedProbability <-
+ factor(popData$averagePredictedProbability)
histPlot <- ggplot2::ggplot(
data = popData,
ggplot2::aes(
y = .data$averagePredictedProbability,
- x = .data$PersonCount,
+ x = .data$PersonCount,
fill = .data$Label
)
- ) +
+ ) +
ggplot2::geom_bar(
- data = popData[popData$Label == "Outcome",],
+ data = popData[popData$Label == "Outcome", ],
stat = "identity"
- ) +
+ ) +
ggplot2::geom_bar(
- data = popData[popData$Label == "No Outcome",],
+ data = popData[popData$Label == "No Outcome", ],
stat = "identity"
) +
- ggplot2::geom_bar(stat = "identity") +
- ggplot2::scale_x_continuous(labels = abs) +
- ggplot2::coord_flip( ) +
+ ggplot2::geom_bar(stat = "identity") +
+ ggplot2::scale_x_continuous(labels = abs) +
+ ggplot2::coord_flip() +
ggplot2::theme(
- axis.title.x=ggplot2::element_blank(),
- axis.text.x=ggplot2::element_blank(),
- axis.ticks.x=ggplot2::element_blank()
+ axis.title.x = ggplot2::element_blank(),
+ axis.text.x = ggplot2::element_blank(),
+ axis.ticks.x = ggplot2::element_blank()
)
}
-
+
if (!failedEvalType[i]) {
plots[[i]]$smoothPlot <- smoothPlot
plots[[i]]$histPlot <- histPlot
@@ -1173,10 +1285,10 @@ plotSmoothCalibration <- function(plpResult,
}
names(plots) <- tolower(evalTypes)
-
+
if (!is.null(saveLocation)) {
- if(!dir.exists(saveLocation)) {
- dir.create(saveLocation, recursive = T)
+ if (!dir.exists(saveLocation)) {
+ dir.create(saveLocation, recursive = TRUE)
}
for (i in seq_along(evalTypes)) {
if (!failedEvalType[i]) {
@@ -1185,7 +1297,7 @@ plotSmoothCalibration <- function(plpResult,
plots[[i]]$histPlot,
ncol = 1,
nrow = 2,
- heights=c(2,1)
+ heights = c(2, 1)
)
fileNameComponents <- unlist(strsplit(fileName, split = "\\."))
n <- length(fileNameComponents)
@@ -1195,7 +1307,7 @@ plotSmoothCalibration <- function(plpResult,
collapse = "."
)
} else {
- actualFileName = fileNameComponents[1]
+ actualFileName <- fileNameComponents[1]
}
saveFileName <- paste0(
actualFileName,
@@ -1216,7 +1328,7 @@ plotSmoothCalibration <- function(plpResult,
return(plots)
}
-#' Plot the net benefit
+#' Plot the net benefit
#' @param plpResult A plp result object as generated using the \code{\link{runPlp}} function.
#' @param typeColumn The name of the column specifying the evaluation type
#' @param saveLocation Directory to save plot (if NULL plot is not saved)
@@ -1232,14 +1344,13 @@ plotNetBenefit <- function(plpResult,
fileName = "netBenefit.png",
evalType = NULL,
ylim = NULL,
- xlim = NULL
-) {
+ xlim = NULL) {
if (is.null(evalType)) {
evalTypes <- unique(plpResult$performanceEvaluation$thresholdSummary[, typeColumn])
} else {
evalTypes <- evalType
- }
-
+ }
+
plots <- list()
length(plots) <- length(evalTypes)
for (i in 1:length(evalTypes)) {
@@ -1247,26 +1358,31 @@ plotNetBenefit <- function(plpResult,
# calculate net benefit straight from predictions instead of thresholdSummary.
nbData <- getNetBenefit(plpResult, evalType)
if (is.null(ylim)) {
- ylim <- c(min(nbData$netBenefit),
- max(nbData$netBenefit))
+ ylim <- c(
+ min(nbData$netBenefit),
+ max(nbData$netBenefit)
+ )
}
if (is.null(xlim)) {
# match limit in Smooth Calibration by default
- xlim <- c(min(nbData$threshold),
- max(max(plpResult$performanceEvaluation$calibrationSummary$averagePredictedProbability),
- max(plpResult$performanceEvaluation$calibrationSummary$observedIncidence)))
-
+ xlim <- c(
+ min(nbData$threshold),
+ max(
+ max(plpResult$performanceEvaluation$calibrationSummary$averagePredictedProbability),
+ max(plpResult$performanceEvaluation$calibrationSummary$observedIncidence)
+ )
+ )
}
-
+
plots[[i]] <- ggplot2::ggplot(data = nbData, ggplot2::aes(x = .data$threshold)) +
- ggplot2::geom_line(ggplot2::aes(y = treatAll, color = "Treat All", linetype = "Treat All")) +
- ggplot2::geom_line(ggplot2::aes(y = treatNone, color = "Treat None", linetype = "Treat None")) +
- ggplot2::geom_line(ggplot2::aes(y = netBenefit, color = "Net Benefit", linetype = "Net Benefit")) +
+ ggplot2::geom_line(ggplot2::aes(y = .data$treatAll, color = "Treat All", linetype = "Treat All")) +
+ ggplot2::geom_line(ggplot2::aes(y = .data$treatNone, color = "Treat None", linetype = "Treat None")) +
+ ggplot2::geom_line(ggplot2::aes(y = .data$netBenefit, color = "Net Benefit", linetype = "Net Benefit")) +
ggplot2::scale_color_manual(
name = "Strategies",
values = c(
- "Net Benefit" = "blue",
- "Treat All" = "red",
+ "Model" = "blue",
+ "Treat all" = "red",
"Treat None" = "brown"
)
) +
@@ -1285,12 +1401,12 @@ plotNetBenefit <- function(plpResult,
ggplot2::ggtitle(evalType) +
ggplot2::coord_cartesian(xlim = xlim, ylim = ylim)
}
-
+
plot <- gridExtra::marrangeGrob(plots, nrow = length(plots), ncol = 1)
-
+
if (!is.null(saveLocation)) {
if (!dir.exists(saveLocation)) {
- dir.create(saveLocation, recursive = T)
+ dir.create(saveLocation, recursive = TRUE)
}
ggplot2::ggsave(file.path(saveLocation, fileName), plot, width = 5, height = 4.5, dpi = 400)
}
@@ -1307,7 +1423,8 @@ getNetBenefit <- function(plpResult, evalType) {
if (nrow(prediction) == 0) {
stop("No prediction data found for evaluation type ", evalType)
}
- prediction <- prediction %>% dplyr::arrange(dplyr::desc(.data$value)) %>%
+ prediction <- prediction %>%
+ dplyr::arrange(dplyr::desc(.data$value)) %>%
dplyr::mutate(
cumsumTrue = cumsum(.data$outcomeCount),
cumsumFalse = cumsum(1 - .data$outcomeCount)
@@ -1316,16 +1433,16 @@ getNetBenefit <- function(plpResult, evalType) {
n <- nrow(prediction)
falseCount <- n - trueCount
outcomeRate <- trueCount / n
-
- nbData <- prediction %>%
- dplyr::group_by(.data$value) %>%
+
+ nbData <- prediction %>%
+ dplyr::group_by(.data$value) %>%
dplyr::summarise(
threshold = unique(.data$value),
TP = max(.data$cumsumTrue),
FP = max(.data$cumsumFalse),
- ) %>%
+ ) %>%
dplyr::ungroup()
-
+
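+  # Net benefit at threshold t: TP/n - (FP/n) * t / (1 - t), i.e. the
+  # true-positive rate minus the false-positive rate weighted by the odds of
+  # the threshold (decision curve analysis, Vickers & Elkin 2006).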
nbData <- nbData %>%
dplyr::mutate(
netBenefit = (.data$TP / n) - (.data$FP / n) * (.data$threshold / (1 - .data$threshold)),
@@ -1345,7 +1462,7 @@ plotSmoothCalibrationLoess <- function(data, span = 0.75) {
lci = .data$calibration - stats::qt(.975, predictedFit$df) * .data$se,
uci = .data$calibration + stats::qt(.975, predictedFit$df) * .data$se
)
-
+
plot <- ggplot2::ggplot(
data = data,
ggplot2::aes(
@@ -1353,47 +1470,47 @@ plotSmoothCalibrationLoess <- function(data, span = 0.75) {
y = .data$calibration
)
) +
- ggplot2::geom_line(
- ggplot2::aes(
- color = "Loess",
- linetype = "Loess"
- )
- ) +
- ggplot2::geom_ribbon(
- ggplot2::aes(
- ymin = .data$lci,
- ymax = .data$uci
- ),
- fill = "blue",
- alpha = 0.2
- ) +
- ggplot2::annotate(
- geom = "segment",
- x = 0,
- xend = 1,
- y = 0,
- yend = 1,
- color = "red",
- linetype = "dashed"
- ) +
- ggplot2::scale_linetype_manual(
- name = "Models",
+ ggplot2::geom_line(
+ ggplot2::aes(
+ color = "Loess",
+ linetype = "Loess"
+ )
+ ) +
+ ggplot2::geom_ribbon(
+ ggplot2::aes(
+ ymin = .data$lci,
+ ymax = .data$uci
+ ),
+ fill = "blue",
+ alpha = 0.2
+ ) +
+ ggplot2::annotate(
+ geom = "segment",
+ x = 0,
+ xend = 1,
+ y = 0,
+ yend = 1,
+ color = "red",
+ linetype = "dashed"
+ ) +
+ ggplot2::scale_linetype_manual(
+ name = "Models",
values = c(
Loess = "solid",
Ideal = "dashed"
)
- ) +
- ggplot2::scale_color_manual(
- name = "Models",
- values = c(
- Loess = "blue",
- Ideal = "red"
+ ) +
+ ggplot2::scale_color_manual(
+ name = "Models",
+ values = c(
+ Loess = "blue",
+ Ideal = "red"
+ )
+ ) +
+ ggplot2::labs(
+ x = "Predicted Probability",
+ y = "Observed Probability"
)
- ) +
- ggplot2::labs(
- x = "Predicted Probability",
- y = "Observed Probability"
- )
return(plot)
}
@@ -1407,15 +1524,15 @@ plotSmoothCalibrationRcs <- function(data, numberOfKnots) {
if (numberOfKnots == 3) {
lowestQuantile <- .1
highestQuantile <- .9
- } else if (numberOfKnots > 3 & numberOfKnots <= 6) {
+ } else if (numberOfKnots > 3 && numberOfKnots <= 6) {
lowestQuantile <- .05
highestQuantile <- .95
} else if (numberOfKnots == 7) {
lowestQuantile <- .025
highestQuantile <- .975
} else {
- # use mgcv defaults
- return(numberOfKnots)
+ # use mgcv defaults
+ return(numberOfKnots)
}
knotQuantiles <- seq(
lowestQuantile,
@@ -1437,7 +1554,7 @@ plotSmoothCalibrationRcs <- function(data, numberOfKnots) {
smoothFit <- tryCatch(
expr = {
mgcv::gam(
- y ~ s(p, bs = 'cr', k = k, m = 2),
+ y ~ s(p, bs = "cr", k = k, m = 2),
data = data,
family = stats::binomial()
)
@@ -1450,7 +1567,7 @@ plotSmoothCalibrationRcs <- function(data, numberOfKnots) {
smoothFit <- tryCatch(
expr = {
mgcv::gam(
- y ~ s(p, bs = 'cr', k = k, m = 2),
+ y ~ s(p, bs = "cr", k = k, m = 2),
data = data,
knots = list(p = .defineKnots(p, k)),
family = stats::binomial()
@@ -1463,16 +1580,16 @@ plotSmoothCalibrationRcs <- function(data, numberOfKnots) {
}
if (is.character(smoothFit)) {
if (k > 3) {
- ParallelLogger::logInfo(paste0("Setting number of Knots to ", k, " led to estimation problems. Switching to nKnots = ", k-1))
+ ParallelLogger::logInfo(paste0("Setting number of Knots to ", k, " led to estimation problems. Switching to nKnots = ", k - 1))
} else {
- ParallelLogger::logInfo(paste0('Unable to fit model'))
+ ParallelLogger::logInfo(paste0("Unable to fit model"))
}
} else {
break
}
}
- if (is.character(smoothFit)){
+ if (is.character(smoothFit)) {
return("Failed")
}
@@ -1528,9 +1645,9 @@ plotSmoothCalibrationRcs <- function(data, numberOfKnots) {
return(plot)
}
-#=============
-# PREDICTIONSUMMARY PLOTS
-#=============
+# =============
+# PREDICTIONSUMMARY PLOTS
+# =============
#' Plot the side-by-side boxplots of prediction distribution, by class
#'
#' @details
@@ -1549,56 +1666,66 @@ plotSmoothCalibrationRcs <- function(data, numberOfKnots) {
#'
#' @export
plotPredictionDistribution <- function(
- plpResult,
- typeColumn = 'evaluation',
- saveLocation = NULL,
- fileName = 'PredictionDistribution.png'
-){
-
- evalTypes <- unique(plpResult$performanceEvaluation$predictionDistribution[,typeColumn])
-
+ plpResult,
+ typeColumn = "evaluation",
+ saveLocation = NULL,
+ fileName = "PredictionDistribution.png") {
+ evalTypes <- unique(plpResult$performanceEvaluation$predictionDistribution[, typeColumn])
+
plots <- list()
length(plots) <- length(evalTypes)
-
+
for (i in 1:length(evalTypes)) {
evalType <- evalTypes[i]
- x <- plpResult$performanceEvaluation$predictionDistribution %>%
- dplyr::filter(.data[[typeColumn]] == evalType)
-
- non05 <- x$P05PredictedProbability[x$class == 0]
- non95 <- x$P95PredictedProbability[x$class == 0]
- one05 <- x$P05PredictedProbability[x$class == 1]
- one95 <- x$P95PredictedProbability[x$class == 1]
-
- plots[[i]] <- ggplot2::ggplot(x,
- ggplot2::aes(x = as.factor(class),
- ymin = .data$MinPredictedProbability,
- lower = .data$P25PredictedProbability,
- middle = .data$MedianPredictedProbability,
- upper = .data$P75PredictedProbability,
- ymax = .data$MaxPredictedProbability,
- color = as.factor(.data$class))) +
- ggplot2::coord_flip() +
- ggplot2::geom_boxplot(stat = "identity") +
- ggplot2::scale_x_discrete("Class") +
- ggplot2::scale_y_continuous("Predicted Probability") +
- ggplot2::theme(legend.position = "none") +
- ggplot2::annotate("segment", x = 0.9, xend = 1.1, y = non05, yend = non05,
- color = "red") +
- ggplot2::annotate("segment", x = 0.9, xend = 1.1, y = non95, yend = non95,
- color = "red") +
- ggplot2::annotate("segment", x = 1.9, xend = 2.1, y = one05, yend = one05,
- color = "#00BFC4") +
- ggplot2::annotate("segment", x = 1.9, xend = 2.1, y = one95, yend = one95,
- color = "#00BFC4") +
- ggplot2::ggtitle(evalType)
+ x <- plpResult$performanceEvaluation$predictionDistribution %>%
+ dplyr::filter(.data[[typeColumn]] == evalType)
+
+ non05 <- x$P05PredictedProbability[x$class == 0]
+ non95 <- x$P95PredictedProbability[x$class == 0]
+ one05 <- x$P05PredictedProbability[x$class == 1]
+ one95 <- x$P95PredictedProbability[x$class == 1]
+
+ plots[[i]] <- ggplot2::ggplot(
+ x,
+ ggplot2::aes(
+        x = as.factor(.data$class),
+ ymin = .data$MinPredictedProbability,
+ lower = .data$P25PredictedProbability,
+ middle = .data$MedianPredictedProbability,
+ upper = .data$P75PredictedProbability,
+ ymax = .data$MaxPredictedProbability,
+ color = as.factor(.data$class)
+ )
+ ) +
+ ggplot2::coord_flip() +
+ ggplot2::geom_boxplot(stat = "identity") +
+ ggplot2::scale_x_discrete("Class") +
+ ggplot2::scale_y_continuous("Predicted Probability") +
+ ggplot2::theme(legend.position = "none") +
+ ggplot2::annotate("segment",
+ x = 0.9, xend = 1.1, y = non05, yend = non05,
+ color = "red"
+ ) +
+ ggplot2::annotate("segment",
+ x = 0.9, xend = 1.1, y = non95, yend = non95,
+ color = "red"
+ ) +
+ ggplot2::annotate("segment",
+ x = 1.9, xend = 2.1, y = one05, yend = one05,
+ color = "#00BFC4"
+ ) +
+ ggplot2::annotate("segment",
+ x = 1.9, xend = 2.1, y = one95, yend = one95,
+ color = "#00BFC4"
+ ) +
+ ggplot2::ggtitle(evalType)
}
-
+
plot <- gridExtra::marrangeGrob(plots, nrow = length(plots), ncol = 1)
-
+
if (!is.null(saveLocation)) {
if (!dir.exists(saveLocation)) {
- dir.create(saveLocation, recursive = T)
+ dir.create(saveLocation, recursive = TRUE)
}
ggplot2::ggsave(file.path(saveLocation, fileName), plot, width = 5, height = 4.5, dpi = 400)
}
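A minimal usage sketch for plotPredictionDistribution() as reformatted above; plpResult is assumed to be the output of an earlier runPlp() call and the save location is illustrative:

library(PatientLevelPrediction)
plotPredictionDistribution(
  plpResult = plpResult,        # assumed: result of runPlp()
  typeColumn = "evaluation",
  saveLocation = "plots",       # illustrative output folder
  fileName = "PredictionDistribution.png"
)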
@@ -1607,9 +1734,9 @@ plotPredictionDistribution <- function(
-#=============
-# COVARIATESUMMARY PLOTS
-#=============
+# =============
+# COVARIATESUMMARY PLOTS
+# =============
#' Plot the variable importance scatterplot
#'
@@ -1629,32 +1756,29 @@ plotPredictionDistribution <- function(
#'
#' @export
plotVariableScatterplot <- function(
- covariateSummary,
- saveLocation = NULL,
- fileName = 'VariableScatterplot.png'
- )
- {
-
+ covariateSummary,
+ saveLocation = NULL,
+ fileName = "VariableScatterplot.png") {
# remove the non-incidence variables from the plot
- covariateSummary <- covariateSummary[covariateSummary$WithNoOutcome_CovariateMean <= 1,]
-
+ covariateSummary <- covariateSummary[covariateSummary$WithNoOutcome_CovariateMean <= 1, ]
+
covariateSummary$size <- rep(0.1, nrow(covariateSummary))
- covariateSummary$size[covariateSummary$covariateValue!=0] <- 0.5
-
- plot <- ggplot2::ggplot(covariateSummary, ggplot2::aes(y=.data$WithOutcome_CovariateMean,
- x=.data$WithNoOutcome_CovariateMean,
- #size=abs(covariateValue)+0.1
- size=.data$size
- )) +
- ggplot2::geom_point(ggplot2::aes(color = .data$size)) +
- ggplot2::scale_size(range = c(0, 1)) +
- ggplot2::scale_colour_gradient2(low = "red",mid = "blue", high="green") +
+ covariateSummary$size[covariateSummary$covariateValue != 0] <- 0.5
+
+ plot <- ggplot2::ggplot(covariateSummary, ggplot2::aes(
+ y = .data$WithOutcome_CovariateMean,
+ x = .data$WithNoOutcome_CovariateMean,
+ size = .data$size
+ )) +
+ ggplot2::geom_point(ggplot2::aes(color = .data$size)) +
+ ggplot2::scale_size(range = c(0, 1)) +
+ ggplot2::scale_colour_gradient2(low = "red", mid = "blue", high = "green") +
ggplot2::scale_y_continuous("Outcome Covariate Mean") +
- ggplot2::scale_x_continuous("Non-outcome Covariate Mean") +
- ggplot2::geom_abline(intercept = 0, slope = 1,linetype = 2) +
- ggplot2::theme(legend.position="none")
-
- if (!is.null(saveLocation)){
+ ggplot2::scale_x_continuous("Non-outcome Covariate Mean") +
+ ggplot2::geom_abline(intercept = 0, slope = 1, linetype = 2) +
+ ggplot2::theme(legend.position = "none")
+
+ if (!is.null(saveLocation)) {
suppressWarnings(ggplot2::ggsave(file.path(saveLocation, fileName), plot, width = 5, height = 3.5, dpi = 400))
}
return(plot)
@@ -1679,44 +1803,50 @@ plotVariableScatterplot <- function(
#'
#' @export
plotGeneralizability <- function(
- covariateSummary,
- saveLocation = NULL,
- fileName = 'Generalizability.png'){
-
+ covariateSummary,
+ saveLocation = NULL,
+ fileName = "Generalizability.png") {
covariateSummary$TrainWithOutcome_CovariateMean[is.na(covariateSummary$TrainWithOutcome_CovariateMean)] <- 0
covariateSummary$TestWithOutcome_CovariateMean[is.na(covariateSummary$TestWithOutcome_CovariateMean)] <- 0
-
- covariateSummary <- covariateSummary[covariateSummary$TrainWithOutcome_CovariateMean <= 1,]
-
- plot1 <- ggplot2::ggplot(covariateSummary,
- ggplot2::aes(.data$TrainWithOutcome_CovariateMean,
- .data$TestWithOutcome_CovariateMean)) +
+
+ covariateSummary <- covariateSummary[covariateSummary$TrainWithOutcome_CovariateMean <= 1, ]
+
+ plot1 <- ggplot2::ggplot(
+ covariateSummary,
+ ggplot2::aes(
+ .data$TrainWithOutcome_CovariateMean,
+ .data$TestWithOutcome_CovariateMean
+ )
+ ) +
ggplot2::geom_point() +
ggplot2::scale_x_continuous("Train set Mean") +
- ggplot2::scale_y_continuous("Test Set Mean") +
- ggplot2::theme(legend.position="none") +
- ggplot2::geom_abline(intercept = 0, slope = 1,linetype = 2)+
+ ggplot2::scale_y_continuous("Test Set Mean") +
+ ggplot2::theme(legend.position = "none") +
+ ggplot2::geom_abline(intercept = 0, slope = 1, linetype = 2) +
ggplot2::ggtitle("Outcome")
-
-
+
+
covariateSummary$TrainWithNoOutcome_CovariateMean[is.na(covariateSummary$TrainWithNoOutcome_CovariateMean)] <- 0
covariateSummary$TestWithNoOutcome_CovariateMean[is.na(covariateSummary$TestWithNoOutcome_CovariateMean)] <- 0
-
- plot2 <- ggplot2::ggplot(covariateSummary,
- ggplot2::aes(.data$TrainWithNoOutcome_CovariateMean,
- .data$TestWithNoOutcome_CovariateMean)) +
+
+ plot2 <- ggplot2::ggplot(
+ covariateSummary,
+ ggplot2::aes(
+ .data$TrainWithNoOutcome_CovariateMean,
+ .data$TestWithNoOutcome_CovariateMean
+ )
+ ) +
ggplot2::geom_point() +
ggplot2::scale_x_continuous("Train set Mean") +
- ggplot2::scale_y_continuous("Test Set Mean") +
- ggplot2::theme(legend.position="none") +
- ggplot2::geom_abline(intercept = 0, slope = 1,linetype = 2) +
+ ggplot2::scale_y_continuous("Test Set Mean") +
+ ggplot2::theme(legend.position = "none") +
+ ggplot2::geom_abline(intercept = 0, slope = 1, linetype = 2) +
ggplot2::ggtitle("No Outcome")
-
- plot <- gridExtra::grid.arrange(plot1, plot2, ncol=2)
-
- if (!is.null(saveLocation))
+
+ plot <- gridExtra::grid.arrange(plot1, plot2, ncol = 2)
+
+ if (!is.null(saveLocation)) {
ggplot2::ggsave(file.path(saveLocation, fileName), plot, width = 5, height = 3.5, dpi = 400)
+ }
return(plot)
}
-
-
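Both covariate-summary plots above consume the covariateSummary data frame from a development run; a hedged sketch (plpResult is again an assumption):

plotVariableScatterplot(covariateSummary = plpResult$covariateSummary)
plotGeneralizability(
  covariateSummary = plpResult$covariateSummary,
  saveLocation = NULL # return the arranged grob without saving to disk
)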
diff --git a/R/PopulationSettings.R b/R/PopulationSettings.R
index 098a784f7..1d23ff4d4 100644
--- a/R/PopulationSettings.R
+++ b/R/PopulationSettings.R
@@ -21,7 +21,7 @@
#'
#' @details
#' Takes as input the inputs to create study population
-#' @param binary Forces the outcomeCount to be 0 or 1 (use for binary prediction problems)
+#' @param binary Forces the outcomeCount to be 0 or 1 (use for binary prediction problems)
#' @param includeAllOutcomes (binary) indicating whether to include people with outcomes who are not observed for the whole at risk period
#' @param firstExposureOnly Should only the first exposure per subject be included? Note that
#' this is typically done in the \code{createStudyPopulation} function,
@@ -44,79 +44,77 @@
#' A list containing all the settings required for creating the study population
#' @export
createStudyPopulationSettings <- function(
- binary = T,
- includeAllOutcomes = T,
- firstExposureOnly = FALSE,
- washoutPeriod = 0,
- removeSubjectsWithPriorOutcome = TRUE,
- priorOutcomeLookback = 99999,
- requireTimeAtRisk = T,
- minTimeAtRisk = 364,
- riskWindowStart = 1,
- startAnchor = 'cohort start',
- riskWindowEnd = 365,
- endAnchor = "cohort start",
- restrictTarToCohortEnd = F
-){
-
+ binary = TRUE,
+ includeAllOutcomes = TRUE,
+ firstExposureOnly = FALSE,
+ washoutPeriod = 0,
+ removeSubjectsWithPriorOutcome = TRUE,
+ priorOutcomeLookback = 99999,
+ requireTimeAtRisk = TRUE,
+ minTimeAtRisk = 364,
+ riskWindowStart = 1,
+ startAnchor = "cohort start",
+ riskWindowEnd = 365,
+ endAnchor = "cohort start",
+ restrictTarToCohortEnd = FALSE) {
checkIsClass(binary, "logical")
checkNotNull(binary)
-
+
checkIsClass(includeAllOutcomes, "logical")
checkNotNull(includeAllOutcomes)
-
+
checkIsClass(firstExposureOnly, "logical")
checkNotNull(firstExposureOnly)
-
+
checkIsClass(washoutPeriod, c("numeric", "integer"))
checkNotNull(washoutPeriod)
checkHigherEqual(washoutPeriod, 0)
-
+
checkIsClass(removeSubjectsWithPriorOutcome, "logical")
checkNotNull(removeSubjectsWithPriorOutcome)
-
+
checkIsClass(priorOutcomeLookback, c("numeric", "integer"))
checkNotNull(priorOutcomeLookback)
checkHigherEqual(priorOutcomeLookback, 0)
-
+
checkIsClass(requireTimeAtRisk, "logical")
checkNotNull(requireTimeAtRisk)
-
- if(requireTimeAtRisk){
+
+ if (requireTimeAtRisk) {
checkIsClass(minTimeAtRisk, c("numeric", "integer"))
checkNotNull(minTimeAtRisk)
checkHigherEqual(minTimeAtRisk, 0)
}
-
+
checkIsClass(riskWindowStart, c("numeric", "integer"))
checkNotNull(riskWindowStart)
-
+
checkIsClass(startAnchor, c("character"))
checkNotNull(startAnchor)
- if(!startAnchor%in%c('cohort start', 'cohort end')){
- stop('Incorrect startAnchor')
+ if (!startAnchor %in% c("cohort start", "cohort end")) {
+ stop("Incorrect startAnchor")
}
-
+
checkIsClass(riskWindowEnd, c("numeric", "integer"))
checkNotNull(riskWindowEnd)
-
+
checkIsClass(endAnchor, c("character"))
checkNotNull(endAnchor)
- if(!endAnchor%in%c('cohort start', 'cohort end')){
- stop('Incorrect endAnchor')
+ if (!endAnchor %in% c("cohort start", "cohort end")) {
+ stop("Incorrect endAnchor")
}
-
- if(startAnchor == endAnchor){
+
+ if (startAnchor == endAnchor) {
checkHigherEqual(riskWindowEnd, riskWindowStart)
}
checkIsClass(restrictTarToCohortEnd, "logical")
checkNotNull(restrictTarToCohortEnd)
-
- if(requireTimeAtRisk){
- if(startAnchor==endAnchor){
- if(minTimeAtRisk>(riskWindowEnd-riskWindowStart)){
- warning('issue: minTimeAtRisk is greater than max possible time-at-risk')
+
+ if (requireTimeAtRisk) {
+ if (startAnchor == endAnchor) {
+ if (minTimeAtRisk > (riskWindowEnd - riskWindowStart)) {
+ warning("issue: minTimeAtRisk is greater than max possible time-at-risk")
}
}
}
@@ -128,18 +126,17 @@ createStudyPopulationSettings <- function(
washoutPeriod = washoutPeriod,
removeSubjectsWithPriorOutcome = removeSubjectsWithPriorOutcome,
priorOutcomeLookback = priorOutcomeLookback,
- requireTimeAtRisk = requireTimeAtRisk,
+ requireTimeAtRisk = requireTimeAtRisk,
minTimeAtRisk = minTimeAtRisk,
riskWindowStart = riskWindowStart,
startAnchor = startAnchor,
riskWindowEnd = riskWindowEnd,
- endAnchor = endAnchor,
+ endAnchor = endAnchor,
restrictTarToCohortEnd = restrictTarToCohortEnd
- )
-
- class(result) <- 'populationSettings'
+ )
+
+ class(result) <- "populationSettings"
return(result)
-
}
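The cleaned-up defaults define a one-year time-at-risk starting the day after cohort entry; a sketch spelling out only the arguments documented in this hunk:

populationSettings <- createStudyPopulationSettings(
  binary = TRUE,                # force outcomeCount to 0/1
  requireTimeAtRisk = TRUE,
  minTimeAtRisk = 364,          # nearly the full window below
  riskWindowStart = 1,          # day 1 after cohort start ...
  startAnchor = "cohort start",
  riskWindowEnd = 365,          # ... through day 365 after cohort start
  endAnchor = "cohort start"
)
class(populationSettings) # "populationSettings"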
#' Create a study population
@@ -168,18 +165,17 @@ createStudyPopulationSettings <- function(
#'
#' @export
createStudyPopulation <- function(
- plpData,
- outcomeId,
- populationSettings,
- population = NULL
- ) {
-
- checkIsClass(populationSettings, 'populationSettings')
-
+ plpData,
+ outcomeId,
+ populationSettings,
+ population = NULL) {
+ start <- Sys.time()
+ checkIsClass(populationSettings, "populationSettings")
+
binary <- populationSettings$binary
includeAllOutcomes <- populationSettings$includeAllOutcomes
firstExposureOnly <- populationSettings$firstExposureOnly
- washoutPeriod <- populationSettings$washoutPeriod
+ washoutPeriod <- populationSettings$washoutPeriod
removeSubjectsWithPriorOutcome <- populationSettings$removeSubjectsWithPriorOutcome
priorOutcomeLookback <- populationSettings$priorOutcomeLookback
requireTimeAtRisk <- populationSettings$requireTimeAtRisk
@@ -189,281 +185,323 @@ createStudyPopulation <- function(
riskWindowEnd <- populationSettings$riskWindowEnd
endAnchor <- populationSettings$endAnchor
restrictTarToCohortEnd <- populationSettings$restrictTarToCohortEnd
-
+
# parameter checks
- if(!inherits(x = plpData, what = c('plpData'))){
- ParallelLogger::logError('Check plpData format')
- stop('Wrong plpData input')
+ if (!inherits(x = plpData, what = c("plpData"))) {
+ ParallelLogger::logError("Check plpData format")
+ stop("Wrong plpData input")
}
- ParallelLogger::logDebug(paste0('outcomeId: ', outcomeId))
+ ParallelLogger::logDebug(paste0("outcomeId: ", outcomeId))
checkNotNull(outcomeId)
-
- ParallelLogger::logDebug(paste0('binary: ', binary))
- ParallelLogger::logDebug(paste0('includeAllOutcomes: ', includeAllOutcomes))
- ParallelLogger::logDebug(paste0('firstExposureOnly: ', firstExposureOnly))
- ParallelLogger::logDebug(paste0('washoutPeriod: ', washoutPeriod))
- ParallelLogger::logDebug(paste0('removeSubjectsWithPriorOutcome: ', removeSubjectsWithPriorOutcome))
- ParallelLogger::logDebug(paste0('priorOutcomeLookback: ', priorOutcomeLookback))
- ParallelLogger::logDebug(paste0('requireTimeAtRisk: ', requireTimeAtRisk))
- ParallelLogger::logDebug(paste0('minTimeAtRisk: ', minTimeAtRisk))
- ParallelLogger::logDebug(paste0('restrictTarToCohortEnd: ', restrictTarToCohortEnd))
- ParallelLogger::logDebug(paste0('riskWindowStart: ', riskWindowStart))
- ParallelLogger::logDebug(paste0('startAnchor: ', startAnchor))
- ParallelLogger::logDebug(paste0('riskWindowEnd: ', riskWindowEnd))
- ParallelLogger::logDebug(paste0('endAnchor: ', endAnchor))
- ParallelLogger::logDebug(paste0('restrictTarToCohortEnd: ', restrictTarToCohortEnd))
+
+ ParallelLogger::logDebug(paste0("binary: ", binary))
+ ParallelLogger::logDebug(paste0("includeAllOutcomes: ", includeAllOutcomes))
+ ParallelLogger::logDebug(paste0("firstExposureOnly: ", firstExposureOnly))
+ ParallelLogger::logDebug(paste0("washoutPeriod: ", washoutPeriod))
+ ParallelLogger::logDebug(paste0("removeSubjectsWithPriorOutcome: ", removeSubjectsWithPriorOutcome))
+ ParallelLogger::logDebug(paste0("priorOutcomeLookback: ", priorOutcomeLookback))
+ ParallelLogger::logDebug(paste0("requireTimeAtRisk: ", requireTimeAtRisk))
+ ParallelLogger::logDebug(paste0("minTimeAtRisk: ", minTimeAtRisk))
+ ParallelLogger::logDebug(paste0("restrictTarToCohortEnd: ", restrictTarToCohortEnd))
+ ParallelLogger::logDebug(paste0("riskWindowStart: ", riskWindowStart))
+ ParallelLogger::logDebug(paste0("startAnchor: ", startAnchor))
+ ParallelLogger::logDebug(paste0("riskWindowEnd: ", riskWindowEnd))
+ ParallelLogger::logDebug(paste0("endAnchor: ", endAnchor))
+ ParallelLogger::logDebug(paste0("restrictTarToCohortEnd: ", restrictTarToCohortEnd))
if (is.null(population)) {
population <- plpData$cohorts
} else {
- population <- plpData$cohorts %>%
+ population <- plpData$cohorts %>%
dplyr::filter(.data$rowId %in% (population %>% dplyr::pull(.data$rowId)))
}
-
+
  # save the metadata (should have the targetId, outcomeId, plpDataSettings and population settings)
metaData <- attr(population, "metaData")
metaData$restrictPlpDataSettings <- plpData$metaData$restrictPlpDataSettings
metaData$outcomeId <- outcomeId
metaData$populationSettings <- populationSettings # this will overwrite an existing setting
-
+
# set the existing attrition
- if(is.null(metaData$attrition)){
- metaData$attrition <- attr(plpData$cohorts, 'metaData')$attrition
+ if (is.null(metaData$attrition)) {
+ metaData$attrition <- attr(plpData$cohorts, "metaData")$attrition
}
- if(!is.null(metaData$attrition)){
- metaData$attrition <- data.frame(outcomeId = metaData$attrition$outcomeId,
- description = metaData$attrition$description,
- targetCount = metaData$attrition$targetCount,
- uniquePeople = metaData$attrition$uniquePeople,
- outcomes = metaData$attrition$outcomes)
- if(sum(metaData$attrition$outcomeId==outcomeId)>0){
- metaData$attrition <- metaData$attrition[metaData$attrition$outcomeId==outcomeId,]
- } else{
+ if (!is.null(metaData$attrition)) {
+ metaData$attrition <- data.frame(
+ outcomeId = metaData$attrition$outcomeId,
+ description = metaData$attrition$description,
+ targetCount = metaData$attrition$targetCount,
+ uniquePeople = metaData$attrition$uniquePeople,
+ outcomes = metaData$attrition$outcomes
+ )
+ if (sum(metaData$attrition$outcomeId == outcomeId) > 0) {
+ metaData$attrition <- metaData$attrition[metaData$attrition$outcomeId == outcomeId, ]
+ } else {
metaData$attrition <- NULL
}
}
-
-
+
+
# ADD TAR
oId <- outcomeId
- population <- population %>%
- dplyr::mutate(startAnchor = startAnchor, startDay = riskWindowStart,
- endAnchor = endAnchor, endDay = riskWindowEnd) %>%
- dplyr::mutate(tarStart = ifelse(.data$startAnchor == 'cohort start', .data$startDay, .data$startDay+ .data$daysToCohortEnd),
- tarEnd = ifelse(.data$endAnchor == 'cohort start', .data$endDay, .data$endDay+ .data$daysToCohortEnd)) %>%
- dplyr::mutate(tarEnd = ifelse(.data$tarEnd>.data$daysToObsEnd, .data$daysToObsEnd,.data$tarEnd ))
-
-
+ population <- population %>%
+ dplyr::mutate(
+ startAnchor = startAnchor, startDay = riskWindowStart,
+ endAnchor = endAnchor, endDay = riskWindowEnd
+ ) %>%
+ dplyr::mutate(
+ tarStart = ifelse(.data$startAnchor == "cohort start", .data$startDay, .data$startDay + .data$daysToCohortEnd),
+ tarEnd = ifelse(.data$endAnchor == "cohort start", .data$endDay, .data$endDay + .data$daysToCohortEnd)
+ ) %>%
+ dplyr::mutate(tarEnd = ifelse(.data$tarEnd > .data$daysToObsEnd, .data$daysToObsEnd, .data$tarEnd))
+
+
# censor at cohortEndDate:
- if(max(population$daysToCohortEnd)>0 & restrictTarToCohortEnd){
- ParallelLogger::logInfo('Restricting tarEnd to end of target cohort')
- population <- population %>% dplyr::mutate(tarEnd = ifelse(.data$tarEnd>.data$daysToCohortEnd, .data$daysToCohortEnd,.data$tarEnd ))
+ if (max(population$daysToCohortEnd) > 0 && restrictTarToCohortEnd) {
+ ParallelLogger::logInfo("Restricting tarEnd to end of target cohort")
+ population <- population %>% dplyr::mutate(tarEnd = ifelse(.data$tarEnd > .data$daysToCohortEnd, .data$daysToCohortEnd, .data$tarEnd))
}
-
-
+
+
# get the outcomes during TAR
- outcomeTAR <- plpData$outcomes %>%
+ outcomeTAR <- plpData$outcomes %>%
dplyr::filter(.data$outcomeId == get("oId")) %>%
dplyr::inner_join(population, by = "rowId") %>%
- dplyr::select("rowId", "daysToEvent", "tarStart", "tarEnd") %>%
- dplyr::filter(.data$daysToEvent >= .data$tarStart & .data$daysToEvent <= .data$tarEnd)
-
+ dplyr::select("rowId", "daysToEvent", "tarStart", "tarEnd") %>%
+ dplyr::filter(.data$daysToEvent >= .data$tarStart & .data$daysToEvent <= .data$tarEnd)
+
# prevent warnings when no results left
- if(nrow(as.data.frame(outcomeTAR))>0){
- outcomeTAR <- outcomeTAR %>%
- dplyr::group_by(.data$rowId) %>%
- dplyr::summarise(first = min(.data$daysToEvent),
- ocount = length(unique(.data$daysToEvent))) %>%
- dplyr::select("rowId", "first", "ocount")
+ if (nrow(as.data.frame(outcomeTAR)) > 0) {
+ outcomeTAR <- outcomeTAR %>%
+ dplyr::group_by(.data$rowId) %>%
+ dplyr::summarise(
+ first = min(.data$daysToEvent),
+ ocount = length(unique(.data$daysToEvent))
+ ) %>%
+ dplyr::select("rowId", "first", "ocount")
} else {
- outcomeTAR <- outcomeTAR %>%
- dplyr::mutate(first = 0, ocount = 0) %>%
- dplyr::select("rowId", "first", "ocount")
+ outcomeTAR <- outcomeTAR %>%
+ dplyr::mutate(first = 0, ocount = 0) %>%
+ dplyr::select("rowId", "first", "ocount")
}
-
+
population <- population %>%
- dplyr::left_join(outcomeTAR, by = 'rowId')
-
-
+ dplyr::left_join(outcomeTAR, by = "rowId")
+
+
# get the initial row
- attrRow <- population %>% dplyr::group_by() %>%
- dplyr::summarise(outcomeId = get('oId'),
- description = 'Initial plpData cohort or population',
- targetCount = length(.data$rowId),
- uniquePeople = length(unique(.data$subjectId)),
- outcomes = sum(!is.na(.data$first)))
+ attrRow <- population %>%
+ dplyr::group_by() %>%
+ dplyr::summarise(
+ outcomeId = get("oId"),
+ description = "Initial plpData cohort or population",
+ targetCount = length(.data$rowId),
+ uniquePeople = length(unique(.data$subjectId)),
+ outcomes = sum(!is.na(.data$first))
+ )
metaData$attrition <- rbind(metaData$attrition, attrRow)
-
+
if (firstExposureOnly) {
ParallelLogger::logTrace(paste("Restricting to first exposure"))
-
+
if (nrow(population) > dplyr::n_distinct(population$subjectId)) {
population <- population %>%
- dplyr::arrange(.data$subjectId,.data$cohortStartDate) %>%
+ dplyr::arrange(.data$subjectId, .data$cohortStartDate) %>%
dplyr::group_by(.data$subjectId) %>%
- dplyr::filter(dplyr::row_number(.data$subjectId)==1)
+ dplyr::filter(dplyr::row_number(.data$subjectId) == 1)
}
-
- attrRow <- population %>% dplyr::group_by() %>%
- dplyr::summarise(outcomeId = get('oId'),
- description = 'First Exposure',
- targetCount = length(.data$rowId),
- uniquePeople = length(unique(.data$subjectId)),
- outcomes = sum(!is.na(.data$first)))
+
+ attrRow <- population %>%
+ dplyr::group_by() %>%
+ dplyr::summarise(
+ outcomeId = get("oId"),
+ description = "First Exposure",
+ targetCount = length(.data$rowId),
+ uniquePeople = length(unique(.data$subjectId)),
+ outcomes = sum(!is.na(.data$first))
+ )
metaData$attrition <- rbind(metaData$attrition, attrRow)
}
-
- if(washoutPeriod) {
+
+ if (washoutPeriod) {
ParallelLogger::logTrace(paste("Requiring", washoutPeriod, "days of observation prior index date"))
msg <- paste("At least", washoutPeriod, "days of observation prior")
population <- population %>%
dplyr::mutate(washoutPeriod = washoutPeriod) %>%
dplyr::filter(.data$daysFromObsStart >= .data$washoutPeriod)
-
- attrRow <- population %>% dplyr::group_by() %>%
- dplyr::summarise(outcomeId = get('oId'),
- description = msg,
- targetCount = length(.data$rowId),
- uniquePeople = length(unique(.data$subjectId)),
- outcomes = sum(!is.na(.data$first)))
+
+ attrRow <- population %>%
+ dplyr::group_by() %>%
+ dplyr::summarise(
+ outcomeId = get("oId"),
+ description = msg,
+ targetCount = length(.data$rowId),
+ uniquePeople = length(unique(.data$subjectId)),
+ outcomes = sum(!is.na(.data$first))
+ )
metaData$attrition <- rbind(metaData$attrition, attrRow)
}
-
- if(removeSubjectsWithPriorOutcome) {
- ParallelLogger::logTrace("Removing subjects with prior outcomes (if any)")
-
+
+ if (removeSubjectsWithPriorOutcome) {
+ ParallelLogger::logTrace("Removing subjects with prior outcomes (if any)")
+
# get the outcomes during TAR
- outcomeBefore <- plpData$outcomes %>%
- dplyr::filter(outcomeId == get('oId')) %>%
- dplyr::inner_join(population, by = 'rowId') %>%
- dplyr::select("rowId", "daysToEvent", "tarStart") %>%
- dplyr::filter(.data$daysToEvent < .data$tarStart & .data$daysToEvent > -get('priorOutcomeLookback') )
-
- if(nrow(as.data.frame(outcomeBefore))>0){
+ outcomeBefore <- plpData$outcomes %>%
+ dplyr::filter(outcomeId == get("oId")) %>%
+ dplyr::inner_join(population, by = "rowId") %>%
+ dplyr::select("rowId", "daysToEvent", "tarStart") %>%
+ dplyr::filter(.data$daysToEvent < .data$tarStart & .data$daysToEvent > -get("priorOutcomeLookback"))
+
+ if (nrow(as.data.frame(outcomeBefore)) > 0) {
outcomeBefore %>%
dplyr::group_by(.data$rowId) %>%
- dplyr::summarise(first = min(.data$daysToEvent)) %>%
+ dplyr::summarise(first = min(.data$daysToEvent)) %>%
dplyr::select("rowId")
}
-
+
population <- population %>%
- dplyr::filter(!.data$rowId %in% outcomeBefore$rowId )
-
- attrRow <- population %>% dplyr::group_by() %>%
- dplyr::summarise(outcomeId = get('oId'),
- description = "No prior outcome",
- targetCount = length(.data$rowId),
- uniquePeople = length(unique(.data$subjectId)),
- outcomes = sum(!is.na(.data$first)))
- metaData$attrition <- rbind(metaData$attrition, attrRow)
+ dplyr::filter(!.data$rowId %in% outcomeBefore$rowId)
+
+ attrRow <- population %>%
+ dplyr::group_by() %>%
+ dplyr::summarise(
+ outcomeId = get("oId"),
+ description = "No prior outcome",
+ targetCount = length(.data$rowId),
+ uniquePeople = length(unique(.data$subjectId)),
+ outcomes = sum(!is.na(.data$first))
+ )
+ metaData$attrition <- rbind(metaData$attrition, attrRow)
}
-
+
if (requireTimeAtRisk) {
- if(includeAllOutcomes){
+ if (includeAllOutcomes) {
ParallelLogger::logTrace("Removing non outcome subjects with insufficient time at risk (if any)")
-
-
+
+
population <- population %>%
- dplyr::filter(!is.na(.data$first) | .data$tarEnd >= .data$tarStart + minTimeAtRisk )
-
- attrRow <- population %>% dplyr::group_by() %>%
- dplyr::summarise(outcomeId = get('oId'),
- description = "Removing non-outcome subjects with insufficient time at risk (if any)",
- targetCount = length(.data$rowId),
- uniquePeople = length(unique(.data$subjectId)),
- outcomes = sum(!is.na(.data$first)))
- metaData$attrition <- rbind(metaData$attrition, attrRow)
+ dplyr::filter(!is.na(.data$first) | .data$tarEnd >= .data$tarStart + minTimeAtRisk)
- }
- else {
+ attrRow <- population %>%
+ dplyr::group_by() %>%
+ dplyr::summarise(
+ outcomeId = get("oId"),
+ description = "Removing non-outcome subjects with insufficient time at risk (if any)",
+ targetCount = length(.data$rowId),
+ uniquePeople = length(unique(.data$subjectId)),
+ outcomes = sum(!is.na(.data$first))
+ )
+ metaData$attrition <- rbind(metaData$attrition, attrRow)
+ } else {
ParallelLogger::logTrace("Removing subjects with insufficient time at risk (if any)")
-
+
population <- population %>%
- dplyr::filter( .data$tarEnd >= .data$tarStart + minTimeAtRisk )
-
- attrRow <- population %>% dplyr::group_by() %>%
- dplyr::summarise(outcomeId = get('oId'),
- description = "Removing subjects with insufficient time at risk (if any)",
- targetCount = length(.data$rowId),
- uniquePeople = length(unique(.data$subjectId)),
- outcomes = sum(!is.na(.data$first)))
+ dplyr::filter(.data$tarEnd >= .data$tarStart + minTimeAtRisk)
+
+ attrRow <- population %>%
+ dplyr::group_by() %>%
+ dplyr::summarise(
+ outcomeId = get("oId"),
+ description = "Removing subjects with insufficient time at risk (if any)",
+ targetCount = length(.data$rowId),
+ uniquePeople = length(unique(.data$subjectId)),
+ outcomes = sum(!is.na(.data$first))
+ )
metaData$attrition <- rbind(metaData$attrition, attrRow)
-
}
} else {
    # remove any patients with negative timeAtRisk
ParallelLogger::logTrace("Removing subjects with no time at risk (if any)")
-
+
population <- population %>%
- dplyr::filter( .data$tarEnd >= .data$tarStart )
-
- attrRow <- population %>% dplyr::group_by() %>%
- dplyr::summarise(outcomeId = get('oId'),
- description = "Removing subjects with no time at risk (if any))",
- targetCount = length(.data$rowId),
- uniquePeople = length(unique(.data$subjectId)),
- outcomes = sum(!is.na(.data$first)))
+ dplyr::filter(.data$tarEnd >= .data$tarStart)
+
+ attrRow <- population %>%
+ dplyr::group_by() %>%
+ dplyr::summarise(
+ outcomeId = get("oId"),
+        description = "Removing subjects with no time at risk (if any)",
+ targetCount = length(.data$rowId),
+ uniquePeople = length(unique(.data$subjectId)),
+ outcomes = sum(!is.na(.data$first))
+ )
metaData$attrition <- rbind(metaData$attrition, attrRow)
}
-
+
# now add columns to pop
-
- if(binary){
+
+ if (binary) {
ParallelLogger::logInfo("Outcome is 0 or 1")
- population <- population %>%
- dplyr::mutate(outcomeCount = ifelse(is.na(.data$ocount),0,1))
- } else{
+ population <- population %>%
+ dplyr::mutate(outcomeCount = ifelse(is.na(.data$ocount), 0, 1))
+ } else {
ParallelLogger::logTrace("Outcome is count")
population <- population %>%
- dplyr::mutate(outcomeCount = ifelse(is.na(.data$ocount),0,.data$ocount))
+ dplyr::mutate(outcomeCount = ifelse(is.na(.data$ocount), 0, .data$ocount))
}
-
+
population <- population %>%
- dplyr::mutate(timeAtRisk = .data$tarEnd - .data$tarStart + 1 ,
- survivalTime = ifelse(.data$outcomeCount == 0, .data$tarEnd -.data$tarStart + 1, .data$first - .data$tarStart + 1),
- daysToEvent = .data$first) %>%
- dplyr::select("rowId", "subjectId", "targetId", "cohortStartDate", "daysFromObsStart",
- "daysToCohortEnd", "daysToObsEnd", "ageYear", "gender",
- "outcomeCount", "timeAtRisk", "daysToEvent", "survivalTime")
-
- # check outcome still there
- if(sum(!is.na(population$daysToEvent))==0){
- ParallelLogger::logWarn('No outcomes left...')
- return(NULL)
- }
-
+ dplyr::mutate(
+ timeAtRisk = .data$tarEnd - .data$tarStart + 1,
+ survivalTime = ifelse(.data$outcomeCount == 0, .data$tarEnd - .data$tarStart + 1, .data$first - .data$tarStart + 1),
+ daysToEvent = .data$first
+ ) %>%
+ dplyr::select(
+ "rowId", "subjectId", "targetId", "cohortStartDate", "daysFromObsStart",
+ "daysToCohortEnd", "daysToObsEnd", "ageYear", "gender",
+ "outcomeCount", "timeAtRisk", "daysToEvent", "survivalTime"
+ )
+
+ # check outcome still there
+ if (sum(!is.na(population$daysToEvent)) == 0) {
+ ParallelLogger::logWarn("No outcomes left...")
+ return(NULL)
+ }
+ ParallelLogger::logInfo(
+ "Population created with: ", nrow(population),
+ " observations, ", length(unique(population$subjectId)),
+ " unique subjects and ", sum(population$outcomeCount), " outcomes"
+ )
population <- as.data.frame(population)
-
+
attr(population, "metaData") <- metaData
+ delta <- Sys.time() - start
+ ParallelLogger::logInfo("Population created in ", signif(delta, 3), " ", attr(delta, "units"))
return(population)
}
-getCounts <- function(population,description = "") {
+getCounts <- function(population, description = "") {
persons <- length(unique(population$subjectId))
targets <- nrow(population)
-
- counts <- data.frame(description = description,
- targetCount= targets,
- uniquePeople = persons)
+
+ counts <- data.frame(
+ description = description,
+ targetCount = targets,
+ uniquePeople = persons
+ )
return(counts)
}
-getCounts2 <- function(cohort,outcomes, description = "") {
+getCounts2 <- function(cohort, outcomes, description = "") {
persons <- length(unique(cohort$subjectId))
targets <- nrow(cohort)
-
- outcomes <- stats::aggregate(cbind(count = outcomeId) ~ outcomeId,
- data = outcomes,
- FUN = function(x){NROW(x)})
-
- counts <- data.frame(outcomeId = outcomes$outcomeId,
- description = description,
- targetCount= targets,
- uniquePeople = persons,
- outcomes = outcomes$count)
+
+ outcomes <- stats::aggregate(cbind(count = outcomeId) ~ outcomeId,
+ data = outcomes,
+ FUN = function(x) {
+ NROW(x)
+ }
+ )
+
+ counts <- data.frame(
+ outcomeId = outcomes$outcomeId,
+ description = description,
+ targetCount = targets,
+ uniquePeople = persons,
+ outcomes = outcomes$count
+ )
return(counts)
}
-
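Putting the settings to work; the plpData object and outcome id below are assumptions (plpData would come from getPlpData()):

population <- createStudyPopulation(
  plpData = plpData,   # assumed: output of getPlpData()
  outcomeId = 3,       # illustrative outcome cohort id
  populationSettings = populationSettings
)
attr(population, "metaData")$attrition # the attrition table travels as metadata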
diff --git a/R/Predict.R b/R/Predict.R
index 14eff8787..f267b4a3c 100644
--- a/R/Predict.R
+++ b/R/Predict.R
@@ -3,13 +3,13 @@
# Copyright 2021 Observational Health Data Sciences and Informatics
#
# This file is part of PatientLevelPrediction
-#
+#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
-#
+#
# http://www.apache.org/licenses/LICENSE-2.0
-#
+#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -33,181 +33,188 @@
# parent predict that calls the others
#' @export
-predictPlp <- function(plpModel, plpData, population, timepoint){
-
- if(is.null(plpModel))
- stop('No model input')
- if(is.null(population))
- stop('No population input')
- if(is.null(plpData))
- stop('No plpData input')
-
-
+predictPlp <- function(plpModel, plpData, population, timepoint) {
+ start <- Sys.time()
+ if (is.null(plpModel)) {
+ stop("No model input")
+ }
+ if (is.null(population)) {
+ stop("No population input")
+ }
+ if (is.null(plpData)) {
+ stop("No plpData input")
+ }
+
+
# do feature engineering/selection
- if(!is.null(plpModel$preprocessing$featureEngineering)){
+ if (!is.null(plpModel$preprocessing$featureEngineering)) {
plpData <- do.call(
- applyFeatureengineering,
+ applyFeatureengineering,
list(
plpData = plpData,
settings = plpModel$preprocessing$featureEngineering
)
)
- featureEngineering <- T
- } else{
- featureEngineering <- F
+ featureEngineering <- TRUE
+ } else {
+ featureEngineering <- FALSE
}
-
- ParallelLogger::logTrace('did FE')
-
- if(!is.null(plpModel$preprocessing$tidyCovariates)){
+
+ ParallelLogger::logTrace("did FE")
+
+ if (!is.null(plpModel$preprocessing$tidyCovariates)) {
# do preprocessing
plpData$covariateData <- do.call(
- applyTidyCovariateData,
+ applyTidyCovariateData,
list(
covariateData = plpData$covariateData,
preprocessSettings = plpModel$preprocessing$tidyCovariates
)
)
- tidyCovariates <- T
- } else{
- tidyCovariates <- F
+ tidyCovariates <- TRUE
+ } else {
+ tidyCovariates <- FALSE
}
-
- ParallelLogger::logTrace('did tidy')
-
+
+ ParallelLogger::logTrace("did tidy")
+
# add timepoint if not missing to population attribute
- if(!missing(timepoint)){
- attr(population, 'timepoint') <- timepoint
- } else{
- timepoint <- attr(population,'metaData')$populationSettings$riskWindowEnd
+ if (!missing(timepoint)) {
+ attr(population, "timepoint") <- timepoint
+ } else {
+ timepoint <- attr(population, "metaData")$populationSettings$riskWindowEnd
}
-
+
# apply prediction function
prediction <- do.call(
- eval(parse(text = attr(plpModel, "predictionFunction"))),
+ eval(parse(text = attr(plpModel, "predictionFunction"))),
list(
- plpModel = plpModel,
- data = plpData,
+ plpModel = plpModel,
+ data = plpData,
cohort = population
- )
)
-
- if(!is.null(attr(prediction, "metaData"))){
+ )
+
+ if (!is.null(attr(prediction, "metaData"))) {
metaData <- attr(prediction, "metaData")
- } else{
+ } else {
metaData <- list()
}
-
+
# add metaData
- metaData$modelType <- attr(plpModel, 'modelType') #"binary",
- metaData$targetId <- attr(population,'metaData')$targetId
- metaData$outcomeId <- attr(population,'metaData')$outcomeId
+ metaData$modelType <- attr(plpModel, "modelType") # "binary",
+ metaData$targetId <- attr(population, "metaData")$targetId
+ metaData$outcomeId <- attr(population, "metaData")$outcomeId
metaData$timepoint <- timepoint
-
+
# added information about running preprocessing/FE
metaData$tidyCovariates <- tidyCovariates
metaData$featureEngineering <- featureEngineering
-
+
attr(prediction, "metaData") <- metaData
+ delta <- Sys.time() - start
+ ParallelLogger::logInfo("Prediction done in: ",
+ signif(delta, 3), " ", attr(delta, "units"))
return(prediction)
}
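A minimal call sketch for predictPlp() as rewritten above; all three inputs are assumptions from an earlier development run, and when timepoint is missing the function falls back to riskWindowEnd from the population settings:

prediction <- predictPlp(
  plpModel = plpResult$model, # assumed: a fitted plpModel
  plpData = validationData,   # assumed: plpData for the new database
  population = validationPop  # assumed: built with createStudyPopulation()
)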
applyFeatureengineering <- function(
- plpData,
- settings
-){
-
+ plpData,
+ settings) {
# if a single setting make it into a list
- if(!is.null(settings$funct)){
+ if (!is.null(settings$funct)) {
settings <- list(settings)
}
-
+
# add code for implementing the feature engineering
- for(set in settings){
+ for (set in settings) {
set$settings$trainData <- plpData
plpData <- do.call(eval(parse(text = set$funct)), set$settings)
}
-
+
# dont do anything for now
return(plpData)
-
}
-#NEED TO UPDATE....
+# NEED TO UPDATE....
# function for implementing the pre-processing (normalisation and redundant feature removal)
applyTidyCovariateData <- function(
- covariateData,
- preprocessSettings
-)
-{
-
- if(!FeatureExtraction::isCovariateData(covariateData)){stop("Data not of class CovariateData")}
-
- newCovariateData <- Andromeda::andromeda(covariateRef = covariateData$covariateRef,
- analysisRef = covariateData$analysisRef)
-
+ covariateData,
+ preprocessSettings) {
+ if (!FeatureExtraction::isCovariateData(covariateData)) {
+ stop("Data not of class CovariateData")
+ }
+
+ newCovariateData <- Andromeda::andromeda(
+ covariateRef = covariateData$covariateRef,
+ analysisRef = covariateData$analysisRef
+ )
+
maxs <- preprocessSettings$normFactors
deleteRedundantCovariateIds <- preprocessSettings$deletedRedundantCovariateIds
deletedInfrequentCovariateIds <- preprocessSettings$deletedInfrequentCovariateIds
-
+
# --- added for speed
- deleteCovariateIds <- c(deleteRedundantCovariateIds,deletedInfrequentCovariateIds)
+ deleteCovariateIds <- c(deleteRedundantCovariateIds, deletedInfrequentCovariateIds)
temp <- covariateData$covariateRef %>% dplyr::collect()
allCovariateIds <- temp$covariateId
- covariateData$includeCovariates <- data.frame(covariateId = allCovariateIds[!allCovariateIds%in%deleteCovariateIds])
- Andromeda::createIndex(covariateData$includeCovariates, c('covariateId'),
- indexName = 'includeCovariates_covariateId')
+ covariateData$includeCovariates <- data.frame(covariateId = allCovariateIds[!allCovariateIds %in% deleteCovariateIds])
+ Andromeda::createIndex(covariateData$includeCovariates, c("covariateId"),
+ indexName = "includeCovariates_covariateId"
+ )
on.exit(covariateData$includeCovariates <- NULL, add = TRUE)
# ---
-
+
ParallelLogger::logInfo("Removing infrequent and redundant covariates and normalizing")
- start <- Sys.time()
-
- if(!is.null(maxs)){
- if('bins'%in%colnames(maxs)){
- covariateData$maxes <- dplyr::as_tibble(maxs) %>% dplyr::rename(covariateId = "bins") %>%
+ start <- Sys.time()
+
+ if (!is.null(maxs)) {
+ if ("bins" %in% colnames(maxs)) {
+ covariateData$maxes <- dplyr::as_tibble(maxs) %>%
+ dplyr::rename(covariateId = "bins") %>%
dplyr::rename(maxValue = "maxs")
- } else{
- covariateData$maxes <- maxs
+ } else {
+ covariateData$maxes <- maxs
}
on.exit(covariateData$maxes <- NULL, add = TRUE)
-
+
# --- added for speed
- Andromeda::createIndex(covariateData$maxes, c('covariateId'),
- indexName = 'maxes_covariateId')
+ Andromeda::createIndex(covariateData$maxes, c("covariateId"),
+ indexName = "maxes_covariateId"
+ )
# ---
-
- newCovariateData$covariates <- covariateData$covariates %>%
- dplyr::inner_join(covariateData$includeCovariates, by='covariateId') %>% # added as join
- dplyr::inner_join(covariateData$maxes, by = 'covariateId') %>%
- dplyr::mutate(value = 1.0*.data$covariateValue/.data$maxValue) %>%
+
+ newCovariateData$covariates <- covariateData$covariates %>%
+ dplyr::inner_join(covariateData$includeCovariates, by = "covariateId") %>% # added as join
+ dplyr::inner_join(covariateData$maxes, by = "covariateId") %>%
+ dplyr::mutate(value = 1.0 * .data$covariateValue / .data$maxValue) %>%
dplyr::select(-"covariateValue") %>%
dplyr::rename(covariateValue = "value")
- } else{
- newCovariateData$covariates <- covariateData$covariates %>%
- dplyr::inner_join(covariateData$includeCovariates, by='covariateId')
+ } else {
+ newCovariateData$covariates <- covariateData$covariates %>%
+ dplyr::inner_join(covariateData$includeCovariates, by = "covariateId")
}
-
+
# reduce covariateRef
- newCovariateData$covariateRef <- covariateData$covariateRef %>%
- dplyr::inner_join(covariateData$includeCovariates, by='covariateId')
-
+ newCovariateData$covariateRef <- covariateData$covariateRef %>%
+ dplyr::inner_join(covariateData$includeCovariates, by = "covariateId")
+
# adding index for restrict to pop
Andromeda::createIndex(
- newCovariateData$covariates,
- c('rowId'),
- indexName = 'ncovariates_rowId'
+ newCovariateData$covariates,
+ c("rowId"),
+ indexName = "ncovariates_rowId"
)
-
-
+
+
class(newCovariateData) <- "CovariateData"
-
+
delta <- Sys.time() - start
writeLines(paste("Removing infrequent and redundant covariates covariates and normalizing took", signif(delta, 3), attr(delta, "units")))
-
+
# return processed data
return(newCovariateData)
}
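The normalization branch above rescales each covariate by its training-time maximum (the stored normFactors); a trivial worked illustration of the mutate():

covariateValue <- 5  # raw value at prediction time
maxValue <- 20       # maximum seen for this covariateId during training
1.0 * covariateValue / maxValue # 0.25, matching the training-time scaling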
diff --git a/R/PredictionDistribution.R b/R/PredictionDistribution.R
index 0219b2bae..465031aad 100644
--- a/R/PredictionDistribution.R
+++ b/R/PredictionDistribution.R
@@ -3,28 +3,27 @@
#' @details
#' Calculates the quantiles from a prediction object
#'
-#' @param prediction A prediction object
-#' @param predictionType The type of prediction (binary or survival)
-#' @param typeColumn A column that is used to stratify the results
+#' @param prediction A prediction object
+#' @param predictionType The type of prediction (binary or survival)
+#' @param typeColumn A column that is used to stratify the results
#'
#' @return
#' The 0.00, 0.05, 0.25, 0.5, 0.75, 0.95, 1.00 quantiles of the prediction,
#' the mean and standard deviation per class
#' @export
getPredictionDistribution <- function(
- prediction,
- predictionType,
- typeColumn = 'evaluation'
-){
+ prediction,
+ predictionType,
+ typeColumn = "evaluation") {
evaluation <- do.call(
- what = paste0('getPredictionDistribution_', predictionType),
+ what = paste0("getPredictionDistribution_", predictionType),
args = list(
- prediction = prediction,
+ prediction = prediction,
evalColumn = typeColumn,
- timepoint = attr(prediction, 'metaData')$timepoint
+ timepoint = attr(prediction, "metaData")$timepoint
)
)
-
+
return(evaluation)
}
@@ -34,79 +33,78 @@ getPredictionDistribution <- function(
#' @details
#' Calculates the quantiles from a prediction object
#'
-#' @param prediction A prediction object
-#' @param evalColumn A column that is used to stratify the results
+#' @param prediction A prediction object
+#' @param evalColumn A column that is used to stratify the results
#' @param ... Other inputs
#'
#' @return
#' The 0.00, 0.05, 0.25, 0.5, 0.75, 0.95, 1.00 quantiles of the prediction,
#' the mean and standard deviation per class
#'
-getPredictionDistribution_binary <- function(prediction, evalColumn, ...){
-
-
+getPredictionDistribution_binary <- function(prediction, evalColumn, ...) {
result <- c()
- evalTypes <- unique(as.data.frame(prediction)[,evalColumn])
-
- for(evalType in evalTypes){
-
+ evalTypes <- unique(as.data.frame(prediction)[, evalColumn])
+
+ for (evalType in evalTypes) {
predictionOfInterest <- prediction %>% dplyr::filter(.data[[evalColumn]] == evalType)
-
-
+
+
# PersonCount - count the number of persons in the class
predictionDistribution <- stats::aggregate(
- predictionOfInterest$value,
+ predictionOfInterest$value,
list(predictionOfInterest$outcomeCount),
length
)
-
- colnames(predictionDistribution) <- c('class', 'PersonCount')
-
+
+ colnames(predictionDistribution) <- c("class", "PersonCount")
+
# averagePredictedProbability StDevPredictedProbability
averagePredictedProbability <- stats::aggregate(
- predictionOfInterest$value,
+ predictionOfInterest$value,
list(predictionOfInterest$outcomeCount),
mean
)
- colnames(averagePredictedProbability) <- c('class', 'averagePredictedProbability')
-
+ colnames(averagePredictedProbability) <- c("class", "averagePredictedProbability")
+
StDevPredictedProbability <- stats::aggregate(
- predictionOfInterest$value,
+ predictionOfInterest$value,
list(predictionOfInterest$outcomeCount),
stats::sd
)
- colnames(StDevPredictedProbability) <- c('class', 'StDevPredictedProbability')
-
- predictionDistribution <- merge(predictionDistribution, averagePredictedProbability )
- predictionDistribution <- merge(predictionDistribution, StDevPredictedProbability )
-
+ colnames(StDevPredictedProbability) <- c("class", "StDevPredictedProbability")
+
+ predictionDistribution <- merge(predictionDistribution, averagePredictedProbability)
+ predictionDistribution <- merge(predictionDistribution, StDevPredictedProbability)
+
quantiles <- stats::aggregate(
- predictionOfInterest$value,
+ predictionOfInterest$value,
list(predictionOfInterest$outcomeCount),
- function(x){
+ function(x) {
stats::quantile(
- x,
- probs = c(0.00,0.05, 0.25, 0.5, 0.75,0.95, 1.00)
+ x,
+ probs = c(0.00, 0.05, 0.25, 0.5, 0.75, 0.95, 1.00)
)
}
)
quantiles <- as.matrix(quantiles)
- colnames(quantiles) <- c('class', 'MinPredictedProbability', 'P05PredictedProbability',
- 'P25PredictedProbability', 'MedianPredictedProbability',
- 'P75PredictedProbability', 'P95PredictedProbability',
- 'MaxPredictedProbability')
-
+ colnames(quantiles) <- c(
+ "class", "MinPredictedProbability", "P05PredictedProbability",
+ "P25PredictedProbability", "MedianPredictedProbability",
+ "P75PredictedProbability", "P95PredictedProbability",
+ "MaxPredictedProbability"
+ )
+
predictionDistribution <- merge(predictionDistribution, quantiles)
predictionDistribution$evaluation <- evalType
-
+
result <- rbind(result, predictionDistribution)
}
-
+
return(result)
}
-getPredictionDistribution_survival <- function(prediction, evalColumn, timepoint, ...){
- ParallelLogger::logWarn('PredictionDistribution not available for survival models')
+getPredictionDistribution_survival <- function(prediction, evalColumn, timepoint, ...) {
+ ParallelLogger::logWarn("PredictionDistribution not available for survival models")
return(NULL)
}
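A usage sketch for the binary path; the toy prediction object is an assumption, but it carries the columns (value, outcomeCount, evaluation) and the metaData attribute the code above expects:

set.seed(1)
prediction <- data.frame(
  value = runif(100),                 # predicted risks
  outcomeCount = rbinom(100, 1, 0.2), # observed 0/1 outcomes
  evaluation = "Test"
)
attr(prediction, "metaData") <- list(modelType = "binary")
getPredictionDistribution(
  prediction = prediction,
  predictionType = "binary",
  typeColumn = "evaluation"
)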
diff --git a/R/PreprocessingData.R b/R/PreprocessingData.R
index 834d27a85..e4c6e01ee 100644
--- a/R/PreprocessingData.R
+++ b/R/PreprocessingData.R
@@ -2,13 +2,13 @@
# Copyright 2021 Observational Health Data Sciences and Informatics
#
# This file is part of PatientLevelPrediction
-#
+#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
-#
+#
# http://www.apache.org/licenses/LICENSE-2.0
-#
+#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -20,34 +20,31 @@
#' @details
#' Returns an object of class \code{preprocessingSettings} that specifies how to preprocess the training data
#'
-#' @param minFraction The minimum fraction of target population who must have a covariate for it to be included in the model training
+#' @param minFraction The minimum fraction of target population who must have a covariate for it to be included in the model training
#' @param normalize Whether to normalise the covariates before training (Default: TRUE)
#' @param removeRedundancy Whether to remove redundant features (Default: TRUE)
#' @return
#' An object of class \code{preprocessingSettings}
#' @export
createPreprocessSettings <- function(
- minFraction = 0.001,
- normalize = TRUE,
- removeRedundancy = TRUE
- ){
-
- checkIsClass(minFraction, c('numeric','integer'))
- checkHigherEqual(minFraction,0)
-
- checkIsClass(normalize, c("logical"))
+ minFraction = 0.001,
+ normalize = TRUE,
+ removeRedundancy = TRUE) {
+ checkIsClass(minFraction, c("numeric", "integer"))
+ checkHigherEqual(minFraction, 0)
+
+ checkIsClass(normalize, c("logical"))
+
+ checkIsClass(removeRedundancy, c("logical"))
+
+ preprocessingSettings <- list(
+ minFraction = minFraction,
+ normalize = normalize,
+ removeRedundancy = removeRedundancy
+ )
- checkIsClass(removeRedundancy, c("logical"))
-
- preprocessingSettings <- list(
- minFraction = minFraction,
- normalize = normalize,
- removeRedundancy = removeRedundancy
- )
-
class(preprocessingSettings) <- "preprocessSettings"
return(preprocessingSettings)
-
}
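A sketch using the documented defaults from this hunk:

preprocessSettings <- createPreprocessSettings(
  minFraction = 0.001,    # keep covariates present in at least 0.1% of the target
  normalize = TRUE,
  removeRedundancy = TRUE
)
class(preprocessSettings) # "preprocessSettings"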
@@ -57,39 +54,37 @@ createPreprocessSettings <- function(
#' @details
#' Returns an object of class \code{covariateData} that has been processed
#'
-#' @param covariateData The covariate part of the training data created by \code{splitData} after being sampled and having
-#' any required feature engineering
-#' @param preprocessSettings The settings for the preprocessing created by \code{createPreprocessSettings}
+#' @param covariateData The covariate part of the training data created by \code{splitData} after being sampled and having
+#' any required feature engineering
+#' @param preprocessSettings The settings for the preprocessing created by \code{createPreprocessSettings}
#' @return
-#' The data processed
-preprocessData <- function (covariateData,
- preprocessSettings){
-
+#' The data processed
+preprocessData <- function(covariateData,
+ preprocessSettings) {
metaData <- attr(covariateData, "metaData")
preprocessSettingsInput <- preprocessSettings # saving this before adding covariateData
-
+
checkIsClass(covariateData, c("CovariateData"))
checkIsClass(preprocessSettings, c("preprocessSettings"))
-
- ParallelLogger::logDebug(paste0('minFraction: ', preprocessSettings$minFraction))
- ParallelLogger::logDebug(paste0('normalize: ', preprocessSettings$normalize))
- ParallelLogger::logDebug(paste0('removeRedundancy: ', preprocessSettings$removeRedundancy))
-
+
+ ParallelLogger::logDebug(paste0("minFraction: ", preprocessSettings$minFraction))
+ ParallelLogger::logDebug(paste0("normalize: ", preprocessSettings$normalize))
+ ParallelLogger::logDebug(paste0("removeRedundancy: ", preprocessSettings$removeRedundancy))
+
preprocessSettings$covariateData <- covariateData
covariateData <- do.call(FeatureExtraction::tidyCovariateData, preprocessSettings)
-
- #update covariateRef
+
+ # update covariateRef
removed <- unique(c(
attr(covariateData, "metaData")$deletedInfrequentCovariateIds,
attr(covariateData, "metaData")$deletedRedundantCovariateIds
- )
- )
- covariateData$covariateRef <- covariateData$covariateRef %>%
- dplyr::filter(!.data$covariateId %in% removed)
-
+ ))
+ covariateData$covariateRef <- covariateData$covariateRef %>%
+ dplyr::filter(!.data$covariateId %in% removed)
+
metaData$tidyCovariateDataSettings <- attr(covariateData, "metaData")
metaData$preprocessSettings <- preprocessSettingsInput
attr(covariateData, "metaData") <- metaData
-
+
return(covariateData)
}
diff --git a/R/RClassifier.R b/R/RClassifier.R
index a74e2358b..c6c93ae3a 100644
--- a/R/RClassifier.R
+++ b/R/RClassifier.R
@@ -1,72 +1,69 @@
# this is a generic wrapper for training models using classifiers in R
fitRclassifier <- function(
- trainData,
- modelSettings,
- search = 'grid',
- analysisId,
- ...){
-
+ trainData,
+ modelSettings,
+ search = "grid",
+ analysisId,
+ ...) {
param <- modelSettings$param
-
- if (!FeatureExtraction::isCovariateData(trainData$covariateData)){
+
+ if (!FeatureExtraction::isCovariateData(trainData$covariateData)) {
stop("Needs correct covariateData")
}
-
-
+
+
# add folds to labels if present:
- if(!is.null(trainData$folds)){
- trainData$labels <- merge(trainData$labels, trainData$folds, by = 'rowId')
+ if (!is.null(trainData$folds)) {
+ trainData$labels <- merge(trainData$labels, trainData$folds, by = "rowId")
}
-
- settings <- attr(param, 'settings')
- ParallelLogger::logInfo(paste0('Training ', settings$modelName))
-
+
+ settings <- attr(param, "settings")
+ ParallelLogger::logInfo(paste0("Training ", settings$modelName))
+
set.seed(settings$seed)
-
+
# convert data into sparse Matrix:
result <- toSparseM(
trainData,
- map=NULL
+ map = NULL
)
-
+
dataMatrix <- result$dataMatrix
labels <- result$labels
covariateRef <- result$covariateRef
-
+
# set test/train sets (for printing performance as it trains)
start <- Sys.time()
-
+
# use the new R CV wrapper
cvResult <- applyCrossValidationInR(
- dataMatrix,
- labels,
- hyperparamGrid = param,
+ dataMatrix,
+ labels,
+ hyperparamGrid = param,
covariateMap = result$covariateMap
- )
-
+ )
+
hyperSummary <- do.call(rbind, lapply(cvResult$paramGridSearch, function(x) x$hyperSummary))
-
+
prediction <- cvResult$prediction
variableImportance <- cvResult$variableImportance
- covariateRef <- merge(covariateRef, variableImportance, all.x = T, by = 'covariateId')
+ covariateRef <- merge(covariateRef, variableImportance, all.x = TRUE,
+ by = "covariateId")
covariateRef$covariateValue[is.na(covariateRef$covariateValue)] <- 0
covariateRef$included[is.na(covariateRef$included)] <- 0
-
+
comp <- start - Sys.time()
-
+
result <- list(
model = cvResult$model,
-
preprocessing = list(
- featureEngineering = attr(trainData, "metaData")$featureEngineering,
- tidyCovariates = attr(trainData$covariateData, "metaData")$tidyCovariateDataSettings,
- requireDenseMatrix = F
+ featureEngineering = attr(trainData$covariateData, "metaData")$featureEngineering,
+ tidyCovariates = attr(trainData$covariateData, "metaData")$tidyCovariateDataSettings,
+ requireDenseMatrix = FALSE
),
-
prediction = prediction,
-
modelDesign = PatientLevelPrediction::createModelDesign(
targetId = attr(trainData, "metaData")$targetId,
outcomeId = attr(trainData, "metaData")$outcomeId,
@@ -79,71 +76,65 @@ fitRclassifier <- function(
splitSettings = attr(trainData, "metaData")$splitSettings,
sampleSettings = attr(trainData, "metaData")$sampleSettings
),
-
trainDetails = list(
analysisId = analysisId,
- analysisSource = '', #TODO add from model
+ analysisSource = "", # TODO add from model
developmentDatabase = attr(trainData, "metaData")$cdmDatabaseName,
- developmentDatabaseSchema = attr(trainData, "metaData")$cdmDatabaseSchema,
- attrition = attr(trainData, "metaData")$attrition,
- trainingTime = paste(as.character(abs(comp)), attr(comp,'units')),
+ developmentDatabaseSchema = attr(trainData, "metaData")$cdmDatabaseSchema,
+ attrition = attr(trainData, "metaData")$attrition,
+ trainingTime = paste(as.character(abs(comp)), attr(comp, "units")),
trainingDate = Sys.Date(),
- modelName = attr(param, 'settings')$trainRFunction,
+ modelName = attr(param, "settings")$trainRFunction,
finalModelParameters = cvResult$finalParam,
hyperParamSearch = hyperSummary
),
-
covariateImportance = covariateRef
)
-
+
class(result) <- "plpModel"
attr(result, "predictionFunction") <- settings$predictRFunction
attr(result, "modelType") <- "binary"
- attr(result, "saveType") <- attr(param, 'saveType')
-
+ attr(result, "saveType") <- attr(param, "saveType")
+
return(result)
}
-applyCrossValidationInR <- function(dataMatrix, labels, hyperparamGrid, covariateMap){
-
+applyCrossValidationInR <- function(dataMatrix, labels, hyperparamGrid, covariateMap) {
gridSearchPredictions <- list()
length(gridSearchPredictions) <- length(hyperparamGrid)
-
- for(gridId in 1:length(hyperparamGrid)){
-
+
+ for (gridId in 1:length(hyperparamGrid)) {
param <- hyperparamGrid[[gridId]]
-
+
cvPrediction <- c()
- for(i in unique(labels$index)){
-
+ for (i in unique(labels$index)) {
ind <- labels$index != i
-
+
model <- do.call(
- attr(hyperparamGrid, 'settings')$trainRFunction,
+ attr(hyperparamGrid, "settings")$trainRFunction,
list(
- dataMatrix = dataMatrix[ind,],
- labels = labels[ind,],
+ dataMatrix = dataMatrix[ind, ],
+ labels = labels[ind, ],
hyperParameters = param,
- settings = attr(hyperparamGrid, 'settings')
- )
+ settings = attr(hyperparamGrid, "settings")
)
-
+ )
+
cvPrediction <- rbind(
cvPrediction,
do.call(
- attr(hyperparamGrid, 'settings')$predictRFunction,
+ attr(hyperparamGrid, "settings")$predictRFunction,
list(
plpModel = model,
- data = dataMatrix[!ind,],
- cohort = labels[!ind,]
- )
+ data = dataMatrix[!ind, ],
+ cohort = labels[!ind, ]
+ )
)
)
-
}
-
+
attr(cvPrediction, "metaData") <- list(modelType = "binary") # make this some attribute of model
# save hyper-parameter cv prediction
@@ -152,58 +143,60 @@ applyCrossValidationInR <- function(dataMatrix, labels, hyperparamGrid, covariat
param = param
)
}
-
+
# computeGridPerformance function is currently in SklearnClassifier.R
- paramGridSearch <- lapply(gridSearchPredictions, function(x) do.call(computeGridPerformance, x)) # cvAUCmean, cvAUC, param
+ paramGridSearch <- lapply(gridSearchPredictions, function(x) do.call(computeGridPerformance, x)) # cvAUCmean, cvAUC, param
optimalParamInd <- which.max(unlist(lapply(paramGridSearch, function(x) x$cvPerformance)))
-
+
finalParam <- paramGridSearch[[optimalParamInd]]$param
-
+
cvPrediction <- gridSearchPredictions[[optimalParamInd]]$prediction
- cvPrediction$evaluationType <- 'CV'
-
+ cvPrediction$evaluationType <- "CV"
+
# fit final model
-
+
finalModel <- do.call(
- attr(hyperparamGrid, 'settings')$trainRFunction,
+ attr(hyperparamGrid, "settings")$trainRFunction,
list(
dataMatrix = dataMatrix,
labels = labels,
hyperParameters = finalParam,
- settings = attr(hyperparamGrid, 'settings')
- )
+ settings = attr(hyperparamGrid, "settings")
+ )
)
-
+
prediction <- do.call(
- attr(hyperparamGrid, 'settings')$predictRFunction,
+ attr(hyperparamGrid, "settings")$predictRFunction,
list(
plpModel = finalModel,
data = dataMatrix,
cohort = labels
- )
+ )
)
-
- prediction$evaluationType <- 'Train'
-
+
+ prediction$evaluationType <- "Train"
+
prediction <- rbind(
prediction,
cvPrediction
)
-
+
  # variable importance - how to make sure this just returns a vector?
variableImportance <- tryCatch(
- {do.call(
- attr(hyperparamGrid, 'settings')$varImpRFunction,
- list(model = finalModel, covariateMap = covariateMap)
- )}
- ,
- error = function(e){
- ParallelLogger::logInfo('Error calculating variableImportance');
- ParallelLogger::logInfo(e);
+ {
+ do.call(
+ attr(hyperparamGrid, "settings")$varImpRFunction,
+ list(model = finalModel, covariateMap = covariateMap)
+ )
+ },
+ error = function(e) {
+ ParallelLogger::logInfo("Error calculating variableImportance")
+ ParallelLogger::logInfo(e)
return(NULL)
- })
-
+ }
+ )
+
result <- list(
model = finalModel,
prediction = prediction,
@@ -211,6 +204,6 @@ applyCrossValidationInR <- function(dataMatrix, labels, hyperparamGrid, covariat
paramGridSearch = paramGridSearch,
variableImportance = variableImportance
)
-
+
return(result)
}
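+# --- Editor's note: illustrative sketch, not part of this patch ------------
+# applyCrossValidationInR() only assumes that the settings attribute of the
+# hyper-parameter grid names train and predict functions with the call
+# signatures used above. A minimal, hypothetical pair (all names here are
+# assumptions, not package API) could look like:
+if (FALSE) {
+  exampleTrainRFunction <- function(dataMatrix, labels, hyperParameters, settings) {
+    # unregularised logistic regression as a stand-in model; an intercept
+    # column is prepended because glm.fit does not add one
+    fit <- stats::glm.fit(
+      x = cbind(1, as.matrix(dataMatrix)),
+      y = as.numeric(labels$outcomeCount > 0),
+      family = stats::binomial()
+    )
+    structure(list(coefficients = fit$coefficients), class = "exampleModel")
+  }
+  examplePredictRFunction <- function(plpModel, data, cohort) {
+    lp <- as.vector(cbind(1, as.matrix(data)) %*% plpModel$coefficients)
+    cohort$value <- stats::plogis(lp) # predicted risk in (0, 1)
+    cohort
+  }
+}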
diff --git a/R/Recalibration.R b/R/Recalibration.R
index 2053f7ed7..e49181145 100644
--- a/R/Recalibration.R
+++ b/R/Recalibration.R
@@ -20,264 +20,275 @@
#' recalibratePlpRefit
#'
#' @description
-#' Train various models using a default parameter gird search or user specified parameters
+#' Recalibrate a model by refitting it on new data
#'
-#' @details
-#' The user can define the machine learning model to train (regularised logistic regression, random forest,
-#' gradient boosting machine, neural network and )
-#'
#' @param plpModel The trained plpModel (runPlp$model)
#' @param newPopulation The population created using createStudyPopulation() who will have their risks predicted
#' @param newData An object of type \code{plpData} - the patient level prediction
#' data extracted from the CDM.
+#' @param returnModel Logical: return the refitted model
#' @return
-#' An object of class \code{runPlp} that is recalibrated on the new data
+#' A prediction dataframe with the predictions of the recalibrated model added
#'
#' @export
recalibratePlpRefit <- function(
- plpModel,
- newPopulation,
- newData
-){
- if (is.null(newPopulation)){
- stop("NULL population")
- }
- if (!inherits(x = newData, what = "plpData")){
- stop("Incorrect plpData class")
- }
- if (!inherits(x = plpModel, what = "plpModel")){
- stop("plpModel is not of class plpModel")
- }
-
- #get selected covariates
- includeCovariateIds <- plpModel$covariateImportance %>%
- dplyr::filter(.data$covariateValue != 0) %>%
- dplyr::select("covariateId") %>%
+ plpModel,
+ newPopulation,
+ newData,
+ returnModel = FALSE) {
+ checkNotNull(plpModel)
+ checkNotNull(newPopulation)
+ checkNotNull(newData)
+ checkIsClass(plpModel, "plpModel")
+ checkIsClass(newData, "plpData")
+ checkBoolean(returnModel)
+
+ # get selected covariates
+ includeCovariateIds <- plpModel$covariateImportance %>%
+ dplyr::filter(.data$covariateValue != 0) %>%
+ dplyr::select("covariateId") %>%
dplyr::pull()
-
+
# check which covariates are included in new data
containedIds <- newData$covariateData$covariateRef %>% dplyr::collect()
noShrinkage <- intersect(includeCovariateIds, containedIds$covariateId)
-
+
# add intercept
noShrinkage <- append(noShrinkage, 0, 0)
-
+
setLassoRefit <- setLassoLogisticRegression(
includeCovariateIds = includeCovariateIds,
- noShrinkage = noShrinkage,
+ noShrinkage = noShrinkage,
maxIterations = 10000 # increasing this due to test code often not converging
)
-
- newData$labels <- newPopulation
-
+
+ newData$labels <- newPopulation
+
newData$folds <- data.frame(
- rowId = newData$labels$rowId,
- index = sample(2, length(newData$labels$rowId), replace = T)
- )
-
+ rowId = newData$labels$rowId,
+ index = sample(2, length(newData$labels$rowId), replace = TRUE)
+ )
+
# add dummy settings to fit model
- attr(newData, "metaData")$outcomeId <- attr(newPopulation, 'metaData')$outcomeId
- attr(newData, "metaData")$targetId <- attr(newPopulation, 'metaData')$targetId
- attr(newData, "metaData")$restrictPlpDataSettings <- attr(newPopulation, 'metaData')$restrictPlpDataSettings
+ attr(newData, "metaData")$outcomeId <- attr(newPopulation, "metaData")$outcomeId
+ attr(newData, "metaData")$targetId <- attr(newPopulation, "metaData")$targetId
+ attr(newData, "metaData")$restrictPlpDataSettings <- attr(newPopulation, "metaData")$restrictPlpDataSettings
attr(newData, "metaData")$covariateSettings <- newData$metaData$covariateSettings
- attr(newData, "metaData")$populationSettings <- attr(newPopulation, 'metaData')$populationSettings
+ attr(newData, "metaData")$populationSettings <- attr(newPopulation, "metaData")$populationSettings
attr(newData$covariateData, "metaData")$featureEngineeringSettings <- PatientLevelPrediction::createFeatureEngineeringSettings()
attr(newData$covariateData, "metaData")$preprocessSettings <- PatientLevelPrediction::createPreprocessSettings()
attr(newData, "metaData")$splitSettings <- PatientLevelPrediction::createDefaultSplitSetting()
attr(newData, "metaData")$sampleSettings <- PatientLevelPrediction::createSampleSettings()
-
- newModel <- tryCatch({
- fitPlp(
- trainData = newData,
- modelSettings = setLassoRefit,
- analysisId = 'recalibrationRefit',
- analysisPath = NULL
- )
- },
- error = function(e){ParallelLogger::logInfo(e); return(NULL)}
+
+ newModel <- tryCatch(
+ {
+ fitPlp(
+ trainData = newData,
+ modelSettings = setLassoRefit,
+ analysisId = "recalibrationRefit",
+ analysisPath = NULL
+ )
+ },
+ error = function(e) {
+ ParallelLogger::logInfo(e)
+ return(NULL)
+ }
)
- if(is.null(newModel)){
- ParallelLogger::logInfo('Recalibration fit failed')
+ if (is.null(newModel)) {
+ ParallelLogger::logInfo("Recalibration fit failed")
return(NULL)
}
-
- newModel$prediction$evaluationType <- 'recalibrationRefit'
+
+ newModel$prediction$evaluationType <- "recalibrationRefit"
oldPred <- predictPlp(
- plpModel = plpModel,
- plpData = newData,
- population = newPopulation,
+ plpModel = plpModel,
+ plpData = newData,
+ population = newPopulation,
timepoint = 0
- )
-
- oldPred$evaluationType <- 'validation'
-
+ )
+
+ oldPred$evaluationType <- "validation"
+
prediction <- rbind(
- oldPred,
+ oldPred,
newModel$prediction[, colnames(oldPred)]
- )
+ )
- if(!is.null(newModel$covariateImportance)){
- adjust <- newModel$covariateImportance %>%
- dplyr::filter(.data$covariateValue != 0) %>%
+ if (!is.null(newModel$covariateImportance)) {
+ adjust <- newModel$covariateImportance %>%
+ dplyr::filter(.data$covariateValue != 0) %>%
dplyr::select(
- "covariateId",
+ "covariateId",
"covariateValue"
)
- } else{
+ } else {
adjust <- c()
}
-
- newIntercept <- newModel$model$coefficients[names(newModel$model$coefficients) == '(Intercept)']
-
+
+ newIntercept <- newModel$model$coefficients[names(newModel$model$coefficients) == "(Intercept)"]
+
attr(prediction, "metaData")$recalibratePlpRefit <- list(adjust = adjust, newIntercept = newIntercept)
-
- return(prediction)
+
+ if (returnModel) {
+ return(list(prediction = prediction, model = newModel))
+ } else {
+ return(prediction)
+ }
}
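+# --- Editor's note: illustrative usage sketch; the input objects below are
+# placeholders for a previously trained model and new-database data ---------
+if (FALSE) {
+  recalResult <- recalibratePlpRefit(
+    plpModel = plpResult$model, # plpModel from runPlp()
+    newPopulation = newPopulation, # from createStudyPopulation()
+    newData = newPlpData, # plpData extracted from the new database
+    returnModel = TRUE
+  )
+  recalResult$prediction # original + refit predictions, tagged by evaluationType
+}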
#' recalibratePlp
#'
#' @description
-#' Train various models using a default parameter gird search or user specified parameters
+#' Recalibrate a model using the recalibrationInTheLarge or weakRecalibration method
#'
#' @details
-#' The user can define the machine learning model to train (regularised logistic regression, random forest,
-#' gradient boosting machine, neural network and )
-#'
+#' 'recalibrationInTheLarge' calculates a single correction factor for the
+#' average predicted risks to match the average observed risks.
+#' 'weakRecalibration' fits a glm model to the logit of the predicted risks,
+#' also known as Platt scaling/logistic recalibration.
+#'
#' @param prediction A prediction dataframe
#' @param analysisId The model analysisId
#' @param typeColumn The column name where the strata types are specified
#' @param method Method used to recalibrate ('recalibrationInTheLarge' or 'weakRecalibration')
#' @return
-#' An object of class \code{runPlp} that is recalibrated on the new data
+#' A prediction dataframe with the recalibrated predictions added
#'
#' @export
-recalibratePlp <- function(prediction, analysisId, typeColumn = 'evaluationType',
- method = c('recalibrationInTheLarge', 'weakRecalibration')){
+recalibratePlp <- function(prediction, analysisId, typeColumn = "evaluationType",
+ method = c("recalibrationInTheLarge", "weakRecalibration")) {
# check input:
- if (!inherits(x = prediction, what = 'data.frame')){
- stop("Incorrect prediction")
- }
-
- if(!method %in% c('recalibrationInTheLarge', 'weakRecalibration')){
+ if (!inherits(x = prediction, what = "data.frame")) {
+ stop("Incorrect prediction")
+ }
+
+ if (!method %in% c("recalibrationInTheLarge", "weakRecalibration")) {
stop("Unknown recalibration method type. must be of type: recalibrationInTheLarge, weakRecalibration")
}
-
+
prediction <- do.call(method, list(prediction = prediction, columnType = typeColumn))
-
+
return(prediction)
-
}
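+# --- Editor's note: illustrative usage sketch ------------------------------
+if (FALSE) {
+  # append weakly recalibrated (Platt scaled) predictions to a prediction
+  # dataframe produced by predictPlp()
+  prediction <- recalibratePlp(
+    prediction = prediction,
+    analysisId = "Analysis_1",
+    method = "weakRecalibration"
+  )
+}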
-
-recalibrationInTheLarge <- function(prediction, columnType = 'evaluationType'){
-
- if(attr(prediction, "metaData")$modelType == 'binary'){
+#' recalibrationInTheLarge
+#'
+#' @description
+#' Recalibrate a model using the recalibrationInTheLarge method which calculates a single correction factor
+#' for the average predicted risks to match the average observed risks
+#'
+#' @param prediction A prediction dataframe
+#' @param columnType The column name where the strata types are specified
+#' @return
+#' A prediction dataframe with the recalibrated predictions added
+#' @keywords internal
+recalibrationInTheLarge <- function(prediction, columnType = "evaluationType") {
+ if (attr(prediction, "metaData")$modelType == "binary") {
misCal <- calibrationInLarge(prediction)
- obsOdds <- misCal$observedRisk/ (1-misCal$observedRisk)
- predOdds <- misCal$meanPredictionRisk/ (1 - misCal$meanPredictionRisk)
+ obsOdds <- misCal$observedRisk / (1 - misCal$observedRisk)
+ predOdds <- misCal$meanPredictionRisk / (1 - misCal$meanPredictionRisk)
correctionFactor <- log(obsOdds / predOdds)
-
+
recalibrated <- prediction
- recalibrated$value = logFunct(inverseLog(recalibrated$value) + correctionFactor)
-
- recalibrated[,columnType] <- 'recalibrationInTheLarge'
+ recalibrated$value <- stats::plogis(stats::qlogis(recalibrated$value) + correctionFactor)
+
+ recalibrated[, columnType] <- "recalibrationInTheLarge"
prediction <- rbind(prediction, recalibrated)
- attr(prediction, 'metaData')$recalibrationInTheLarge = list(correctionFactor = correctionFactor)
-
+ attr(prediction, "metaData")$recalibrationInTheLarge <-
+ list(correctionFactor = correctionFactor)
+
return(prediction)
}
-
- if(attr(prediction, "metaData")$modelType == 'survival'){
- ParallelLogger::logError('Survival recal in the large not currently available')
+ if (attr(prediction, "metaData")$modelType == "survival") {
+    ParallelLogger::logError("Recalibration in the large is not currently available for survival models")
}
-
-
}
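+# --- Editor's note: worked toy example of the correction factor above ------
+if (FALSE) {
+  # observed risk 10% vs mean predicted risk 20%: the correction shifts all
+  # predictions down by a constant on the log-odds scale
+  obsOdds <- 0.10 / (1 - 0.10) # ~0.111
+  predOdds <- 0.20 / (1 - 0.20) # 0.25
+  correctionFactor <- log(obsOdds / predOdds) # ~ -0.81
+  stats::plogis(stats::qlogis(0.20) + correctionFactor) # 0.10
+}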
-
-weakRecalibration <- function(prediction, columnType = 'evaluationType'){
-
+#' weakRecalibration
+#'
+#' @description
+#' Recalibrate a model using the weakRecalibration method which fits a glm model
+#' to the logit of the predicted risks.
+#' Also known as Platt scaling or logistic recalibration.
+#' @param prediction A prediction dataframe
+#' @param columnType The column name where the strata types are specified
+#' @return
+#' A prediction dataframe with the recalibrated predictions added
+#' @keywords internal
+weakRecalibration <- function(prediction, columnType = "evaluationType") {
# if binary:
- if(attr(prediction, "metaData")$modelType == 'binary'){
+ if (attr(prediction, "metaData")$modelType == "binary") {
recalibrated <- prediction
- recalibrated$value[recalibrated$value==0] <- 0.000000000000001
- recalibrated$value[recalibrated$value==1] <- 1-0.000000000000001
-
- y <- ifelse(recalibrated$outcomeCount>0, 1, 0)
- inverseLog <- inverseLog(recalibrated$value)
- refit <- suppressWarnings(stats::glm(y ~ inverseLog, family = 'binomial'))
-
- recalibrated$value <- logFunct((inverseLog * refit$coefficients[2]) + refit$coefficients[1])
-
- recalibrated[,columnType] <- 'weakRecalibration'
+ epsilon <- .Machine$double.eps
+ recalibrated$value <- pmin(pmax(recalibrated$value, epsilon), 1 - epsilon)
+
+ y <- ifelse(recalibrated$outcomeCount > 0, 1, 0)
+
+    # convert risk probabilities to logits
+ inverseLog <- stats::qlogis(recalibrated$value)
+ refit <- suppressWarnings(stats::glm(y ~ inverseLog, family = "binomial"))
+
+ recalibrated$value <- stats::plogis((inverseLog * refit$coefficients[2]) + refit$coefficients[1])
+
+ recalibrated[, columnType] <- "weakRecalibration"
prediction <- rbind(prediction, recalibrated)
- attr(prediction, 'metaData')$weakRecalibration = list(
- adjustGradient = refit$coefficients[2],
+ attr(prediction, "metaData")$weakRecalibration <- list(
+ adjustGradient = refit$coefficients[2],
adjustIntercept = refit$coefficients[1]
- )
-
+ )
+
return(prediction)
- }
-
+ }
+
# add if survival
- if(attr(prediction, "metaData")$modelType == 'survival'){
-
+ if (attr(prediction, "metaData")$modelType == "survival") {
+ rlang::check_installed("survival",
+ reason = "weakRecalibration for survival models requires the survival package to be installed"
+ )
recalibrated <- prediction
-
+
# this will make the recalibration work if the baselineSurvival is missing
baseline <- ifelse(is.null(attr(recalibrated, "baselineSurvival")), 0.9, attr(recalibrated, "baselineSurvival"))
- ParallelLogger::logInfo(paste0('recal initial baseline hazard: ',baseline))
-
+ ParallelLogger::logInfo(paste0("recal initial baseline hazard: ", baseline))
+
offset <- ifelse(is.null(attr(recalibrated, "offset")), 0, attr(recalibrated, "offset"))
- ParallelLogger::logInfo(paste0('recal initial offset: ',offset))
-
+ ParallelLogger::logInfo(paste0("recal initial offset: ", offset))
+
timepoint <- ifelse(is.null(attr(recalibrated, "timePoint")), 365, attr(recalibrated, "timePoint"))
- ParallelLogger::logInfo(paste0('recal initial timepoint: ',timepoint))
-
- if(!is.null(baseline)){
- lp <- log(log(1-recalibrated$value)/log(baseline)) + offset
- } else{
+ ParallelLogger::logInfo(paste0("recal initial timepoint: ", timepoint))
+
+ if (!is.null(baseline)) {
+ lp <- log(log(1 - recalibrated$value) / log(baseline)) + offset
+ } else {
lp <- log(recalibrated$value)
}
-
-
+
+
t <- apply(cbind(recalibrated$daysToCohortEnd, recalibrated$survivalTime), 1, min)
- y <- ifelse(recalibrated$outcomeCount>0,1,0) # observed outcome
- y[t>timepoint] <- 0
- t[t>timepoint] <- timepoint
- S<- survival::Surv(t, y)
+ y <- ifelse(recalibrated$outcomeCount > 0, 1, 0) # observed outcome
+ y[t > timepoint] <- 0
+ t[t > timepoint] <- timepoint
+ S <- survival::Surv(t, y)
#### Intercept + Slope recalibration
- f.slope <- survival::coxph(S~lp)
- h.slope <- max(survival::basehaz(f.slope)$hazard) # maximum OK because of prediction_horizon
+ f.slope <- survival::coxph(S ~ lp)
+ h.slope <- max(survival::basehaz(f.slope)$hazard) # maximum OK because of prediction_horizon
lp.slope <- stats::predict(f.slope)
- recalibrated$value <- 1-exp(-h.slope*exp(lp.slope))
+ recalibrated$value <- 1 - exp(-h.slope * exp(lp.slope))
# 1-h.slope^exp(lp.slope)
-
-
- recalibrated[,columnType] <- 'weakRecalibration'
+
+
+ recalibrated[, columnType] <- "weakRecalibration"
prediction <- rbind(prediction, recalibrated)
- attr(prediction, 'metaData')$weakRecalibration = list(
- adjustGradient = f.slope$coefficients['lp'],
+ attr(prediction, "metaData")$weakRecalibration <- list(
+ adjustGradient = f.slope$coefficients["lp"],
adjustIntercept = h.slope
)
-
- return(prediction)
-
- }
-
-}
-logFunct <- function(values){
- return(1/(1 + exp(0 - values)))
-}
-
-inverseLog <- function(values){
- res <- log(values/(1-values))
- return(res)
+ return(prediction)
+ }
}
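+# --- Editor's note: minimal standalone Platt scaling sketch ----------------
+# weakRecalibration() above applies this to a prediction dataframe; the same
+# idea on simulated vectors:
+if (FALSE) {
+  set.seed(42)
+  p <- runif(1000, 0.01, 0.99) # predicted risks
+  y <- rbinom(1000, 1, stats::plogis(2 * stats::qlogis(p))) # miscalibrated outcomes
+  lp <- stats::qlogis(p)
+  refit <- stats::glm(y ~ lp, family = "binomial") # intercept + slope
+  recalibrated <- stats::plogis(refit$coefficients[1] + refit$coefficients[2] * lp)
+}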
-
diff --git a/R/RunMultiplePlp.R b/R/RunMultiplePlp.R
index 9e90f5452..2ddf97129 100644
--- a/R/RunMultiplePlp.R
+++ b/R/RunMultiplePlp.R
@@ -20,7 +20,7 @@
#' Run a list of predictions analyses
#'
#' @details
-#' This function will run all specified predictions as defined using .
+#' This function will run all specified predictions as defined using .
#'
#' @param databaseDetails The database settings created using \code{createDatabaseDetails()}
#' @param modelDesignList A list of model designs created using \code{createModelDesign()}
@@ -29,112 +29,115 @@
#' @param logSettings The setting specifying the logging for the analyses created using \code{createLogSettings()}
#' @param saveDirectory Name of the folder where all the outputs will written to.
#' @param sqliteLocation (optional) The location of the sqlite database with the results
-#'
+#'
#' @return
#' A data frame with the following columns: \tabular{ll}{ \verb{analysisId} \tab The unique identifier
#' for a set of analysis choices.\cr \verb{targetId} \tab The ID of the target cohort populations.\cr
-#' \verb{outcomeId} \tab The ID of the outcomeId.\cr \verb{dataLocation} \tab The location where the plpData was saved
+#' \verb{outcomeId} \tab The ID of the outcomeId.\cr \verb{dataLocation} \tab The location where the plpData was saved
#' \cr \verb{the settings ids} \tab The ids for all other settings used for model development.\cr }
#'
#' @export
runMultiplePlp <- function(
- databaseDetails = createDatabaseDetails(),
- modelDesignList = list(
- createModelDesign(targetId = 1, outcomeId = 2, modelSettings = setLassoLogisticRegression()),
- createModelDesign(targetId = 1, outcomeId = 3, modelSettings = setLassoLogisticRegression())
- ),
- onlyFetchData = F,
- cohortDefinitions = NULL,
- logSettings = createLogSettings(
- verbosity = "DEBUG",
- timeStamp = T,
- logName = "runPlp Log"
- ),
- saveDirectory = getwd(),
- sqliteLocation = file.path(saveDirectory, 'sqlite')
-){
-
- #input checks
- checkIsClass(databaseDetails, c('databaseDetails'))
- checkIsClass(modelDesignList, c('list', 'modelDesign'))
- checkIsClass(onlyFetchData, 'logical')
- checkIsClass(logSettings, 'logSettings')
- checkIsClass(saveDirectory, 'character')
- if(!dir.exists(saveDirectory)){
- dir.create(saveDirectory, recursive = T)
+ databaseDetails = createDatabaseDetails(),
+ modelDesignList = list(
+ createModelDesign(targetId = 1, outcomeId = 2, modelSettings = setLassoLogisticRegression()),
+ createModelDesign(targetId = 1, outcomeId = 3, modelSettings = setLassoLogisticRegression())
+ ),
+ onlyFetchData = FALSE,
+ cohortDefinitions = NULL,
+ logSettings = createLogSettings(
+ verbosity = "DEBUG",
+ timeStamp = TRUE,
+ logName = "runPlp Log"
+ ),
+ saveDirectory = getwd(),
+ sqliteLocation = file.path(saveDirectory, "sqlite")) {
+ # input checks
+ checkIsClass(databaseDetails, c("databaseDetails"))
+ checkIsClass(modelDesignList, c("list", "modelDesign"))
+ checkIsClass(onlyFetchData, "logical")
+ checkIsClass(logSettings, "logSettings")
+ checkIsClass(saveDirectory, "character")
+ if (!dir.exists(saveDirectory)) {
+ dir.create(saveDirectory, recursive = TRUE)
}
-
- settingstable <- convertToJson(modelDesignList,cohortDefinitions)
-
- if(nrow(settingstable) != length(modelDesignList)){
- stop('Error in settingstable')
+
+ settingstable <- convertToJson(modelDesignList, cohortDefinitions)
+
+ if (nrow(settingstable) != length(modelDesignList)) {
+    stop("Number of rows in settingstable does not match length of modelDesignList")
}
-
+
# save the settings - TODO change this to save jsons in csv
utils::write.csv(
x = settingstable %>% dplyr::select(
"analysisId",
- "targetId",
+ "targetId",
"targetName",
- "outcomeId",
+ "outcomeId",
"outcomeName",
"dataLocation"
- ),
- file.path(saveDirectory,'settings.csv'),
- row.names = F
- )
+ ),
+ file.path(saveDirectory, "settings.csv"),
+ row.names = FALSE
+ )
# group the outcomeIds per combination of data extraction settings
- dataSettings <- settingstable %>%
+ dataSettings <- settingstable %>%
dplyr::group_by(
.data$targetId,
.data$covariateSettings,
.data$restrictPlpDataSettings,
.data$dataLocation
- ) %>%
+ ) %>%
dplyr::summarise(
- outcomeIds = paste(unique(.data$outcomeId), collapse = ',')
- )
-
+ outcomeIds = paste(unique(.data$outcomeId), collapse = ",")
+ )
+
# extract data
- for(i in 1:nrow(as.data.frame(dataSettings))){
- dataExists <- length(dir(file.path(saveDirectory, dataSettings$dataLocation[i])))>0
- if(!dataExists){
- ParallelLogger::logInfo(paste('Extracting data for cohort', dataSettings$targetId[i], 'to', file.path(saveDirectory, dataSettings$dataLocation[i])))
-
+ for (i in 1:nrow(as.data.frame(dataSettings))) {
+ dataExists <- length(dir(file.path(saveDirectory, dataSettings$dataLocation[i]))) > 0
+ if (!dataExists) {
+ ParallelLogger::logInfo(paste("Extracting data for cohort", dataSettings$targetId[i], "to", file.path(saveDirectory, dataSettings$dataLocation[i])))
+
databaseDetails$targetId <- dataSettings$targetId[i]
- databaseDetails$outcomeIds <- strsplit(dataSettings$outcomeIds[i], ',')[[1]]
-
+ databaseDetails$outcomeIds <- strsplit(dataSettings$outcomeIds[i], ",")[[1]]
+
plpDataSettings <- list(
databaseDetails = databaseDetails,
covariateSettings = ParallelLogger::convertJsonToSettings(dataSettings$covariateSettings[i]),
restrictPlpDataSettings = ParallelLogger::convertJsonToSettings(dataSettings$restrictPlpDataSettings[i])
)
-
+
plpData <- tryCatch(
- {do.call(getPlpData, plpDataSettings)},
- error = function(e){ParallelLogger::logInfo(e); return(NULL)}
+ {
+ do.call(getPlpData, plpDataSettings)
+ },
+ error = function(e) {
+ ParallelLogger::logInfo(e)
+ return(NULL)
+ }
)
- if(!is.null(plpData)){
+ if (!is.null(plpData)) {
savePlpData(plpData, file.path(saveDirectory, dataSettings$dataLocation[i]))
}
- } else{
- ParallelLogger::logInfo(paste('Data for target', dataSettings$targetId[i], 'exists at', file.path(saveDirectory, dataSettings$dataLocation[i])))
+ } else {
+ ParallelLogger::logInfo(paste("Data for target", dataSettings$targetId[i], "exists at", file.path(saveDirectory, dataSettings$dataLocation[i])))
}
}
-
+
# runDiagnosis - NEW
- if(!onlyFetchData){
- for(i in 1:nrow(as.data.frame(settingstable))){
+ if (!onlyFetchData) {
+ for (i in 1:nrow(as.data.frame(settingstable))) {
modelDesign <- modelDesignList[[i]]
- settings <- settingstable[i,] # just the data locations?
-
- dataExists <- length(dir(file.path(saveDirectory, settings$dataLocation)))>0
-
- if(dataExists){
- analysisExists <- file.exists(file.path(saveDirectory, settings$analysisId,'diagnosePlp.rds'))
-
- if(!analysisExists){
+ settings <- settingstable[i, ] # just the data locations?
+
+ dataExists <- length(dir(file.path(saveDirectory, settings$dataLocation))) > 0
+
+ if (dataExists) {
+ analysisExists <- file.exists(file.path(saveDirectory, settings$analysisId, "diagnosePlp.rds"))
+
+ if (!analysisExists) {
plpData <- PatientLevelPrediction::loadPlpData(file.path(saveDirectory, settings$dataLocation))
diagnosePlpSettings <- list(
plpData = plpData,
@@ -149,30 +152,35 @@ runMultiplePlp <- function(
logSettings = logSettings,
saveDirectory = saveDirectory
)
-
+
diagnose <- tryCatch(
- {do.call(diagnosePlp, diagnosePlpSettings)},
- error = function(e){ParallelLogger::logInfo(e); return(NULL)}
+ {
+ do.call(diagnosePlp, diagnosePlpSettings)
+ },
+ error = function(e) {
+ ParallelLogger::logInfo(e)
+ return(NULL)
+ }
)
- } else{
- ParallelLogger::logInfo(paste('Diagnosis ', settings$analysisId, 'exists at', file.path(saveDirectory, settings$analysisId)))
+ } else {
+        ParallelLogger::logInfo(paste("Diagnosis", settings$analysisId, "exists at", file.path(saveDirectory, settings$analysisId)))
}
}
}
}
-
+
# runPlp
- if(!onlyFetchData){
- for(i in 1:nrow(as.data.frame(settingstable))){
+ if (!onlyFetchData) {
+ for (i in 1:nrow(as.data.frame(settingstable))) {
modelDesign <- modelDesignList[[i]]
- settings <- settingstable[i,] # just the data locations?
-
- dataExists <- length(dir(file.path(saveDirectory, settings$dataLocation)))>0
-
- if(dataExists){
- analysisExists <- file.exists(file.path(saveDirectory, settings$analysisId,'plpResult', 'runPlp.rds'))
-
- if(!analysisExists){
+ settings <- settingstable[i, ] # just the data locations?
+
+ dataExists <- length(dir(file.path(saveDirectory, settings$dataLocation))) > 0
+
+ if (dataExists) {
+ analysisExists <- file.exists(file.path(saveDirectory, settings$analysisId, "plpResult", "runPlp.rds"))
+
+ if (!analysisExists) {
plpData <- PatientLevelPrediction::loadPlpData(file.path(saveDirectory, settings$dataLocation))
runPlpSettings <- list(
plpData = quote(plpData),
@@ -188,22 +196,27 @@ runMultiplePlp <- function(
executeSettings = modelDesign$executeSettings,
saveDirectory = saveDirectory
)
-
+
result <- tryCatch(
- {do.call(runPlp, runPlpSettings)},
- error = function(e){ParallelLogger::logInfo(e); return(NULL)}
+ {
+ do.call(runPlp, runPlpSettings)
+ },
+ error = function(e) {
+ ParallelLogger::logInfo(e)
+ return(NULL)
+ }
)
- } else{
- ParallelLogger::logInfo(paste('Analysis ', settings$analysisId, 'exists at', file.path(saveDirectory, settings$analysisId)))
+ } else {
+        ParallelLogger::logInfo(paste("Analysis", settings$analysisId, "exists at", file.path(saveDirectory, settings$analysisId)))
}
}
} # end run per setting
}
-
+
# [TODO] add code to create sqlite database and populate with results...
- if(!onlyFetchData){
+ if (!onlyFetchData) {
insertResultsToSqlite(
- resultLocation = saveDirectory,
+ resultLocation = saveDirectory,
cohortDefinitions = cohortDefinitions,
databaseList = createDatabaseList(
cdmDatabaseSchemas = databaseDetails$cohortDatabaseSchema,
@@ -213,13 +226,13 @@ runMultiplePlp <- function(
sqliteLocation = sqliteLocation
)
}
-
-
+
+
return(invisible(settingstable))
}
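+# --- Editor's note: illustrative usage sketch; connection details and ids
+# are placeholders -----------------------------------------------------------
+if (FALSE) {
+  runMultiplePlp(
+    databaseDetails = databaseDetails, # from createDatabaseDetails()
+    modelDesignList = list(
+      createModelDesign(
+        targetId = 1,
+        outcomeId = 2,
+        modelSettings = setLassoLogisticRegression()
+      )
+    ),
+    saveDirectory = "./multiplePlpResults"
+  )
+}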
-#' Specify settings for deceloping a single model
+#' Specify settings for developing a single model
#'
#' @details
#' This specifies a single analysis for developing a single model
@@ -235,76 +248,73 @@ runMultiplePlp <- function(
#' @param modelSettings The model settings such as \code{setLassoLogisticRegression()}
#' @param splitSettings The train/validation/test splitting used by all analyses created using \code{createDefaultSplitSetting()}
#' @param runCovariateSummary Whether to run the covariateSummary
-#'
+#'
#' @return
#' A list with analysis settings used to develop a single prediction model
#'
#' @export
createModelDesign <- function(
- targetId,
- outcomeId,
- restrictPlpDataSettings = createRestrictPlpDataSettings(),
- populationSettings = createStudyPopulationSettings(),
- covariateSettings = FeatureExtraction::createDefaultCovariateSettings(),
- featureEngineeringSettings = NULL,
- sampleSettings = NULL,
- preprocessSettings = NULL,
- modelSettings = NULL,
- splitSettings = createDefaultSplitSetting(
- type = "stratified",
- testFraction = 0.25,
- trainFraction = 0.75,
- splitSeed = 123,
- nfold = 3
- ),
- runCovariateSummary = T
-){
-
- checkIsClass(targetId, c('numeric','integer'))
- checkIsClass(outcomeId, c('numeric','integer'))
-
- checkIsClass(populationSettings, c('populationSettings'))
- checkIsClass(restrictPlpDataSettings, 'restrictPlpDataSettings')
- checkIsClass(covariateSettings, c('covariateSettings', 'list'))
- checkIsClass(splitSettings, 'splitSettings')
-
- useFE <- F
- if(!is.null(featureEngineeringSettings)){
- if(inherits(featureEngineeringSettings, 'featureEngineeringSettings')){
+ targetId,
+ outcomeId,
+ restrictPlpDataSettings = createRestrictPlpDataSettings(),
+ populationSettings = createStudyPopulationSettings(),
+ covariateSettings = FeatureExtraction::createDefaultCovariateSettings(),
+ featureEngineeringSettings = NULL,
+ sampleSettings = NULL,
+ preprocessSettings = NULL,
+ modelSettings = NULL,
+ splitSettings = createDefaultSplitSetting(
+ type = "stratified",
+ testFraction = 0.25,
+ trainFraction = 0.75,
+ splitSeed = 123,
+ nfold = 3
+ ),
+ runCovariateSummary = TRUE) {
+ checkIsClass(targetId, c("numeric", "integer"))
+ checkIsClass(outcomeId, c("numeric", "integer"))
+
+ checkIsClass(populationSettings, c("populationSettings"))
+ checkIsClass(restrictPlpDataSettings, "restrictPlpDataSettings")
+ checkIsClass(covariateSettings, c("covariateSettings", "list"))
+ checkIsClass(splitSettings, "splitSettings")
+
+ useFE <- FALSE
+ if (!is.null(featureEngineeringSettings)) {
+ if (inherits(featureEngineeringSettings, "featureEngineeringSettings")) {
featureEngineeringSettings <- list(featureEngineeringSettings)
}
- lapply(featureEngineeringSettings, function(x) checkIsClass(x, c('featureEngineeringSettings')))
- useFE <- T
- } else{
+ lapply(featureEngineeringSettings, function(x) checkIsClass(x, c("featureEngineeringSettings")))
+ useFE <- TRUE
+ } else {
featureEngineeringSettings <- list(createFeatureEngineeringSettings(type = "none"))
}
-
- useSample <- F
- if(!is.null(sampleSettings)){
-
- if(inherits(sampleSettings, 'sampleSettings')){
+
+ useSample <- FALSE
+ if (!is.null(sampleSettings)) {
+ if (inherits(sampleSettings, "sampleSettings")) {
sampleSettings <- list(sampleSettings)
}
- lapply(sampleSettings, function(x) checkIsClass(x, c('sampleSettings')))
-
- useSample <- T
- } else{
+ lapply(sampleSettings, function(x) checkIsClass(x, c("sampleSettings")))
+
+ useSample <- TRUE
+ } else {
sampleSettings <- list(createSampleSettings(type = "none"))
}
-
- usePreprocess <- F
- if(!is.null(preprocessSettings)){
- checkIsClass(preprocessSettings, c('preprocessSettings'))
- usePreprocess <- T
- } else{
+
+ usePreprocess <- FALSE
+ if (!is.null(preprocessSettings)) {
+ checkIsClass(preprocessSettings, c("preprocessSettings"))
+ usePreprocess <- TRUE
+ } else {
preprocessSettings <- createPreprocessSettings(
minFraction = 0.001,
- normalize = T
+ normalize = TRUE
)
}
-
- checkIsClass(modelSettings, c('modelSettings'))
-
+
+ checkIsClass(modelSettings, c("modelSettings"))
+
settings <- list(
targetId = targetId,
outcomeId = outcomeId,
@@ -317,19 +327,17 @@ createModelDesign <- function(
modelSettings = modelSettings,
splitSettings = splitSettings,
executeSettings = createExecuteSettings(
- runSplitData = T,
+ runSplitData = TRUE,
runSampleData = useSample,
runfeatureEngineering = useFE,
runPreprocessData = usePreprocess,
runModelDevelopment = !is.null(modelSettings),
- runCovariateSummary = runCovariateSummary
+ runCovariateSummary = runCovariateSummary
)
-
)
-
- class(settings) <- 'modelDesign'
+
+ class(settings) <- "modelDesign"
return(settings)
-
}
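+# --- Editor's note: illustrative sketch of a fuller design; the sampling
+# type value is an assumption, check createSampleSettings() for valid types -
+if (FALSE) {
+  design <- createModelDesign(
+    targetId = 1,
+    outcomeId = 2,
+    modelSettings = setLassoLogisticRegression(),
+    sampleSettings = createSampleSettings(type = "underSample"),
+    preprocessSettings = createPreprocessSettings(minFraction = 0.01, normalize = TRUE)
+  )
+}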
@@ -337,58 +345,58 @@ createModelDesign <- function(
#'
#' @details
#' This function creates a json file with the modelDesignList saved
-#'
+#'
#' @param modelDesignList A list of modelDesigns created using \code{createModelDesign()}
#' @param cohortDefinitions A list of the cohortDefinitions (generally extracted from ATLAS)
#' @param saveDirectory The directory to save the modelDesignList settings
-#'
+#'
#' @examples
#' \dontrun{
#' savePlpAnalysesJson(
-#' modelDesignList = list(
-#' createModelDesign(targetId = 1, outcomeId = 2, modelSettings = setLassoLogisticRegression()),
-#' createModelDesign(targetId = 1, outcomeId = 3, modelSettings = setLassoLogisticRegression())
-#' ),
-#' saveDirectory = 'C:/bestModels'
+#' modelDesignList = list(
+#' createModelDesign(targetId = 1, outcomeId = 2, modelSettings = setLassoLogisticRegression()),
+#' createModelDesign(targetId = 1, outcomeId = 3, modelSettings = setLassoLogisticRegression())
+#' ),
+#' saveDirectory = "C:/bestModels"
#' )
#' }
#'
#' @export
savePlpAnalysesJson <- function(
- modelDesignList = list(
- createModelDesign(targetId = 1, outcomeId = 2, modelSettings = setLassoLogisticRegression()),
- createModelDesign(targetId = 1, outcomeId = 3, modelSettings = setLassoLogisticRegression())
- ),
- cohortDefinitions = NULL,
- # add cohortDefinitions
- saveDirectory = NULL
- ){
-
- if(inherits(modelDesignList, 'modelDesign')){
+ modelDesignList = list(
+ createModelDesign(targetId = 1, outcomeId = 2, modelSettings = setLassoLogisticRegression()),
+ createModelDesign(targetId = 1, outcomeId = 3, modelSettings = setLassoLogisticRegression())
+ ),
+ cohortDefinitions = NULL,
+ # add cohortDefinitions
+ saveDirectory = NULL) {
+ if (inherits(modelDesignList, "modelDesign")) {
modelDesignList <- list(modelDesignList)
}
-
- lapply(modelDesignList, function(x){checkIsClass(x, 'modelDesign')})
-
- if(!is.null(saveDirectory)){
- checkIsClass(saveDirectory, 'character')
-
- if(!dir.exists(saveDirectory)){
- dir.create(saveDirectory, recursive = T)
+
+ lapply(modelDesignList, function(x) {
+ checkIsClass(x, "modelDesign")
+ })
+
+ if (!is.null(saveDirectory)) {
+ checkIsClass(saveDirectory, "character")
+
+ if (!dir.exists(saveDirectory)) {
+ dir.create(saveDirectory, recursive = TRUE)
}
-
+
ParallelLogger::saveSettingsToJson(
object = list(
plpVersion = as.character(utils::packageVersion("PatientLevelPrediction")),
analyses = modelDesignList,
cohortDefinitions = cohortDefinitions
- ),
- fileName = file.path(saveDirectory,"predictionAnalysisList.json")
+ ),
+ fileName = file.path(saveDirectory, "predictionAnalysisList.json")
)
-
- return(file.path(saveDirectory,"predictionAnalysisList.json"))
+
+ return(file.path(saveDirectory, "predictionAnalysisList.json"))
}
-
+
return(
ParallelLogger::convertSettingsToJson(
object = list(
@@ -404,28 +412,26 @@ savePlpAnalysesJson <- function(
#' Load the multiple prediction json settings from a file
#'
#' @details
-#' This function interprets a json with the multiple prediction settings and creates a list
+#' This function interprets a json with the multiple prediction settings and creates a list
#' that can be combined with connection settings to run a multiple prediction study
-#'
-#' @param jsonFileLocation The location of the file 'predictionAnalysisList.json' with the modelDesignList
-#'
+#'
+#' @param jsonFileLocation The location of the file 'predictionAnalysisList.json' with the modelDesignList
+#'
#' @examples
#' \dontrun{
-#' modelDesignList <- loadPlpAnalysesJson('location of json settings')$analysis
+#' modelDesignList <- loadPlpAnalysesJson("location of json settings")$analysis
#' }
#'
#' @export
loadPlpAnalysesJson <- function(
- jsonFileLocation
-){
-
- checkIsClass(jsonFileLocation, 'character')
- if(!file.exists(jsonFileLocation)){
- ParallelLogger::logError('Invalid directory - does not exist')
+ jsonFileLocation) {
+ checkIsClass(jsonFileLocation, "character")
+ if (!file.exists(jsonFileLocation)) {
+    ParallelLogger::logError("Invalid file location - the json file does not exist")
}
-
+
rList <- ParallelLogger::loadSettingsFromJson(fileName = jsonFileLocation)
-
+
return(rList)
}
@@ -440,65 +446,64 @@ loadPlpAnalysesJson <- function(
#' @details
#' Users need to input a location where the results of the multiple plp analyses
#' are found and the connection and database settings for the new data
-#'
+#'
#' @param analysesLocation The location where the multiple plp analyses are
#' @param validationDatabaseDetails A single or list of validation database settings created using \code{createDatabaseDetails()}
#' @param validationRestrictPlpDataSettings The settings specifying the extra restriction settings when extracting the data created using \code{createRestrictPlpDataSettings()}.
#' @param recalibrate A vector of recalibration methods (currently supports 'recalibrationInTheLarge' and/or 'weakRecalibration')
#' @param cohortDefinitions A list of cohortDefinitions
#' @param saveDirectory The location to save the validation results
-#'
-#' @export
+#'
+#' @export
validateMultiplePlp <- function(
- analysesLocation,
- validationDatabaseDetails,
- validationRestrictPlpDataSettings = createRestrictPlpDataSettings(),
- recalibrate = NULL,
- cohortDefinitions = NULL,
- saveDirectory = NULL
- ){
-
- # add input checks
- checkIsClass(analysesLocation, 'character')
-
- if(inherits(validationDatabaseDetails, 'databaseDetails')){
+ analysesLocation,
+ validationDatabaseDetails,
+ validationRestrictPlpDataSettings = createRestrictPlpDataSettings(),
+ recalibrate = NULL,
+ cohortDefinitions = NULL,
+ saveDirectory = NULL) {
+ # add input checks
+ checkIsClass(analysesLocation, "character")
+
+ if (inherits(validationDatabaseDetails, "databaseDetails")) {
validationDatabaseDetails <- list(validationDatabaseDetails)
}
lapply(
- validationDatabaseDetails,
- function(x){checkIsClass(x, 'databaseDetails')}
- )
-
- checkIsClass(validationRestrictPlpDataSettings, 'restrictPlpDataSettings')
-
- checkIsClass(recalibrate, c('character', 'NULL'))
- checkIsClass(saveDirectory, c('character', 'NULL'))
+ validationDatabaseDetails,
+ function(x) {
+ checkIsClass(x, "databaseDetails")
+ }
+ )
+
+ checkIsClass(validationRestrictPlpDataSettings, "restrictPlpDataSettings")
+
+ checkIsClass(recalibrate, c("character", "NULL"))
+ checkIsClass(saveDirectory, c("character", "NULL"))
# for each model run externalValidateDbPlp()
- analyses <- dir(analysesLocation, recursive = F, full.names = F)
-
+ analyses <- dir(analysesLocation, recursive = FALSE, full.names = FALSE)
+
  # now find all analysis folders
- analyses <- analyses[grep('Analysis_',analyses)]
-
- for(i in 1:length(analyses)){
-
- if(is.null(saveDirectory)){
- saveLocation <- file.path(analysesLocation, 'Validation')
- } else{
+ analyses <- analyses[grep("Analysis_", analyses)]
+
+ for (i in 1:length(analyses)) {
+ if (is.null(saveDirectory)) {
+ saveLocation <- file.path(analysesLocation, "Validation")
+ } else {
saveLocation <- saveDirectory
}
-
+
analysis <- analyses[i]
modelSettings <- file.path(analysesLocation, analysis)
-
- ParallelLogger::logInfo(paste0('Evaluating model in ',modelSettings ))
-
- if(dir.exists(file.path(modelSettings,'plpResult'))){
- ParallelLogger::logInfo(paste0('plpResult found in ',modelSettings ))
-
- plpModel <- loadPlpModel(file.path(modelSettings,'plpResult','model'))
-
- validations <- tryCatch(
+
+ ParallelLogger::logInfo(paste0("Evaluating model in ", modelSettings))
+
+ if (dir.exists(file.path(modelSettings, "plpResult"))) {
+ ParallelLogger::logInfo(paste0("plpResult found in ", modelSettings))
+
+ plpModel <- loadPlpModel(file.path(modelSettings, "plpResult", "model"))
+
+ validations <- tryCatch(
{
externalValidateDbPlp(
plpModel = plpModel,
@@ -506,58 +511,58 @@ validateMultiplePlp <- function(
validationRestrictPlpDataSettings = validationRestrictPlpDataSettings,
settings = createValidationSettings(
recalibrate = recalibrate,
- runCovariateSummary = F
+ runCovariateSummary = FALSE
),
outputFolder = saveLocation
- )},
- error = function(cont){ParallelLogger::logInfo(paste0('Error: ',cont ))
- ;return(NULL)}
+ )
+ },
+ error = function(cont) {
+ ParallelLogger::logInfo(paste0("Error: ", cont))
+ return(NULL)
+ }
)
-
}
}
-
+
# add to sqlite database - needed for shiny app
- #=======================
-
- if(saveLocation == file.path(analysesLocation, 'Validation')){
- ParallelLogger::logInfo('Saving validation results into the development sqlite database')
- sqliteLocation <- file.path(analysesLocation, 'sqlite')
- } else{
- ParallelLogger::logInfo('Saving validation results into validation sqlite')
- sqliteLocation <- file.path(saveDirectory,'sqlite')
+ # =======================
+
+ if (saveLocation == file.path(analysesLocation, "Validation")) {
+ ParallelLogger::logInfo("Saving validation results into the development sqlite database")
+ sqliteLocation <- file.path(analysesLocation, "sqlite")
+ } else {
+ ParallelLogger::logInfo("Saving validation results into validation sqlite")
+ sqliteLocation <- file.path(saveDirectory, "sqlite")
}
-
- for(validationDatabaseDetail in validationDatabaseDetails){
+
+ for (validationDatabaseDetail in validationDatabaseDetails) {
tryCatch({
insertResultsToSqlite(
- resultLocation = file.path(saveLocation, validationDatabaseDetail$cdmDatabaseName),
+ resultLocation = file.path(saveLocation, validationDatabaseDetail$cdmDatabaseName),
cohortDefinitions = cohortDefinitions,
databaseList = createDatabaseList(
cdmDatabaseSchemas = validationDatabaseDetail$cdmDatabaseSchema,
cdmDatabaseNames = validationDatabaseDetail$cdmDatabaseName,
- databaseRefIds = validationDatabaseDetail$cdmDatabaseId
+ databaseRefIds = validationDatabaseDetail$cdmDatabaseId
),
sqliteLocation = sqliteLocation
)
})
}
-
}
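+# --- Editor's note: illustrative usage sketch ------------------------------
+if (FALSE) {
+  validateMultiplePlp(
+    analysesLocation = "./multiplePlpResults",
+    validationDatabaseDetails = validationDatabaseDetails, # createDatabaseDetails()
+    recalibrate = "weakRecalibration"
+  )
+}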
-convertToJson <-function(
- modelDesignList,
- cohortDefinitions = NULL
-){
-
- convertToJsonString <- function(x){as.character(ParallelLogger::convertSettingsToJson(x))}
-
- if(is.null(cohortDefinitions)){
-
+convertToJson <- function(
+ modelDesignList,
+ cohortDefinitions = NULL) {
+ convertToJsonString <- function(x) {
+ as.character(ParallelLogger::convertSettingsToJson(x))
+ }
+
+ if (is.null(cohortDefinitions)) {
cohortIds <- unlist(
lapply(
- X = 1:length(modelDesignList),
- FUN = function(i){
+ X = 1:length(modelDesignList),
+ FUN = function(i) {
c(
modelDesignList[[i]]$targetId,
modelDesignList[[i]]$outcomeId
@@ -566,62 +571,60 @@ convertToJson <-function(
)
)
cohortIds <- unique(cohortIds)
-
+
cohortDefinitions <- data.frame(
cohortId = cohortIds,
- cohortName = paste0('Cohort: ', cohortIds)
+ cohortName = paste0("Cohort: ", cohortIds)
)
-
- } else{
- cohortDefinitions <- cohortDefinitions %>%
+ } else {
+ cohortDefinitions <- cohortDefinitions %>%
dplyr::select(
- "cohortId",
+ "cohortId",
"cohortName"
- )
+ )
}
-
+
result <- data.frame(
- analysisId = paste0('Analysis_', 1:length(modelDesignList)),
+ analysisId = paste0("Analysis_", 1:length(modelDesignList)),
targetId = unlist(lapply(modelDesignList, function(x) ifelse(is.null(x$targetId), x$cohortId, x$targetId))),
outcomeId = unlist(lapply(modelDesignList, function(x) x$outcomeId)),
covariateSettings = unlist(lapply(modelDesignList, function(x) convertToJsonString(x$covariateSettings))),
- restrictPlpDataSettings = unlist(lapply(modelDesignList, function(x) convertToJsonString(x$restrictPlpDataSettings))),
- populationSettings = unlist(lapply(modelDesignList, function(x) convertToJsonString(x$populationSettings))),
- sampleSettings = unlist(lapply(modelDesignList, function(x) convertToJsonString(x$sampleSettings))),
- splitSettings = unlist(lapply(modelDesignList, function(x) convertToJsonString(x$splitSettings))),
- featureEngineeringSettings = unlist(lapply(modelDesignList, function(x) convertToJsonString(x$featureEngineeringSettings))),
- preprocessSettings = unlist(lapply(modelDesignList, function(x) convertToJsonString(x$preprocessSettings))),
- modelSettings = unlist(lapply(modelDesignList, function(x) convertToJsonString(x$modelSettings))),
- executeSettings = unlist(lapply(modelDesignList, function(x) convertToJsonString(x$executeSettings)))
+ restrictPlpDataSettings = unlist(lapply(modelDesignList, function(x) convertToJsonString(x$restrictPlpDataSettings))),
+ populationSettings = unlist(lapply(modelDesignList, function(x) convertToJsonString(x$populationSettings))),
+ sampleSettings = unlist(lapply(modelDesignList, function(x) convertToJsonString(x$sampleSettings))),
+ splitSettings = unlist(lapply(modelDesignList, function(x) convertToJsonString(x$splitSettings))),
+ featureEngineeringSettings = unlist(lapply(modelDesignList, function(x) convertToJsonString(x$featureEngineeringSettings))),
+ preprocessSettings = unlist(lapply(modelDesignList, function(x) convertToJsonString(x$preprocessSettings))),
+ modelSettings = unlist(lapply(modelDesignList, function(x) convertToJsonString(x$modelSettings))),
+ executeSettings = unlist(lapply(modelDesignList, function(x) convertToJsonString(x$executeSettings)))
)
-
- result <- result %>%
- dplyr::left_join(cohortDefinitions, by = c("outcomeId" = "cohortId")) %>%
- dplyr::rename(outcomeName = "cohortName") %>%
- dplyr::left_join(cohortDefinitions, by = c('targetId' = 'cohortId')) %>%
- dplyr::rename(targetName = "cohortName") # new
-
+
+ result <- result %>%
+ dplyr::left_join(cohortDefinitions, by = c("outcomeId" = "cohortId")) %>%
+ dplyr::rename(outcomeName = "cohortName") %>%
+ dplyr::left_join(cohortDefinitions, by = c("targetId" = "cohortId")) %>%
+ dplyr::rename(targetName = "cohortName") # new
+
# get the names
- uniqueSettings <- result %>%
+ uniqueSettings <- result %>%
dplyr::distinct(
- .data$targetId,
- .data$covariateSettings,
+ .data$targetId,
+ .data$covariateSettings,
.data$restrictPlpDataSettings
) %>%
- dplyr::group_by(.data$targetId) %>%
- dplyr::mutate(dataLocation = paste0('targetId_',.data$targetId, '_L', dplyr::row_number()))
-
+ dplyr::group_by(.data$targetId) %>%
+ dplyr::mutate(dataLocation = paste0("targetId_", .data$targetId, "_L", dplyr::row_number()))
+
# add the data names
- result <- result %>%
+ result <- result %>%
dplyr::left_join(
- uniqueSettings,
+ uniqueSettings,
by = c(
"targetId" = "targetId",
"covariateSettings" = "covariateSettings",
"restrictPlpDataSettings" = "restrictPlpDataSettings"
)
)
-
+
return(result)
}
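+# --- Editor's note: the dataLocation naming above lets model designs that
+# share a targetId and identical covariate/restrict settings reuse one
+# extracted dataset; a toy illustration of the grouping:
+if (FALSE) {
+  data.frame(targetId = c(1, 1, 1), covariateSettings = c("a", "a", "b")) %>%
+    dplyr::distinct(.data$targetId, .data$covariateSettings) %>%
+    dplyr::group_by(.data$targetId) %>%
+    dplyr::mutate(dataLocation = paste0("targetId_", .data$targetId, "_L", dplyr::row_number()))
+  # -> "targetId_1_L1" for settings "a" and "targetId_1_L2" for settings "b"
+}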
-
diff --git a/R/RunPlp.R b/R/RunPlp.R
index 2d7119388..c1c178d45 100644
--- a/R/RunPlp.R
+++ b/R/RunPlp.R
@@ -188,37 +188,38 @@
runPlp <- function(
plpData,
outcomeId = plpData$metaData$call$outcomeIds[1],
- analysisId = paste(Sys.Date(), plpData$metaData$call$outcomeIds[1], sep = '-'),
- analysisName = 'Study details',
+ analysisId = paste(Sys.Date(), plpData$metaData$call$outcomeIds[1], sep = "-"),
+ analysisName = "Study details",
populationSettings = createStudyPopulationSettings(),
splitSettings = createDefaultSplitSetting(
- type = 'stratified',
- testFraction=0.25,
+ type = "stratified",
+ testFraction = 0.25,
trainFraction = 0.75,
- splitSeed=123,
- nfold=3
+ splitSeed = 123,
+ nfold = 3
),
- sampleSettings = createSampleSettings(type = 'none'),
- featureEngineeringSettings = createFeatureEngineeringSettings(type = 'none'),
+ sampleSettings = createSampleSettings(type = "none"),
+ featureEngineeringSettings = createFeatureEngineeringSettings(type = "none"),
preprocessSettings = createPreprocessSettings(
minFraction = 0.001,
- normalize = T
+ normalize = TRUE
),
modelSettings = setLassoLogisticRegression(),
logSettings = createLogSettings(
- verbosity = 'DEBUG',
- timeStamp = T,
- logName = 'runPlp Log'
+ verbosity = "DEBUG",
+ timeStamp = TRUE,
+ logName = "runPlp Log"
),
executeSettings = createDefaultExecuteSettings(),
saveDirectory = getwd()
-){
+) {
+ start <- Sys.time()
# start log
analysisPath <- file.path(saveDirectory, analysisId)
logSettings$saveDirectory <- analysisPath
- logSettings$logFileName <- 'plpLog'
- logger <- do.call(createLog,logSettings)
+ logSettings$logFileName <- "plpLog"
+ logger <- do.call(createLog, logSettings)
ParallelLogger::registerLogger(logger)
on.exit(closeLog(logger))
@@ -239,15 +240,17 @@ runPlp <- function(
)
)
},
- error = function(e){ParallelLogger::logError(e); return(NULL)}
- )
+ error = function(e) {
+ ParallelLogger::logError(e)
+ return(NULL)
+ })
- if(is.null(settingsValid)){
- stop('Settings are invalid - check log for error message')
+ if (is.null(settingsValid)) {
+ stop("Settings are invalid - check log for error message")
}
# log the start time:
- ExecutionDateTime <- Sys.time()
+ executionDateTime <- Sys.time()
# print the header in the log
tryCatch({
@@ -257,16 +260,16 @@ runPlp <- function(
outcomeId,
analysisId,
analysisName,
- ExecutionDateTime
+ executionDateTime
)
})
# create the population
- if(!is.null(plpData$population)) {
- ParallelLogger::logInfo('Using existing population')
+ if (!is.null(plpData$population)) {
+ ParallelLogger::logInfo("Using existing population")
population <- plpData$population
} else {
- ParallelLogger::logInfo('Creating population')
+ ParallelLogger::logInfo("Creating population")
population <- tryCatch({
do.call(createStudyPopulation,
list(plpData = plpData,
@@ -275,15 +278,18 @@ runPlp <- function(
population = plpData$population
)
)},
- error = function(e){ParallelLogger::logError(e); return(NULL)}
+ error = function(e) {
+ ParallelLogger::logError(e)
+ return(NULL)
+ }
)
}
- if(is.null(population)){
- stop('population NULL')
+ if (is.null(population)) {
+ stop("population NULL")
}
- if(executeSettings$runSplitData){
+ if (executeSettings$runSplitData) {
# split the data (test/train/cv) + summarise at the end
data <- tryCatch(
{
@@ -293,16 +299,19 @@ runPlp <- function(
splitSettings = splitSettings
)
},
- error = function(e){ParallelLogger::logError(e); return(NULL)}
+ error = function(e) {
+ ParallelLogger::logError(e)
+ return(NULL)
+ }
)
- if(is.null(data)){
- stop('data NULL after splitting')
+ if (is.null(data)) {
+ stop("data NULL after splitting")
}
dataSummary(data)
}
- if(executeSettings$runSampleData){
+ if (executeSettings$runSampleData) {
# sampling
data$Train <- tryCatch(
{
@@ -311,15 +320,18 @@ runPlp <- function(
sampleSettings = sampleSettings
)
},
- error = function(e){ParallelLogger::logError(e); return(NULL)}
+ error = function(e) {
+ ParallelLogger::logError(e)
+ return(NULL)
+ }
)
- if(is.null(data$Train)){
- stop('train data NULL after sample')
+ if (is.null(data$Train)) {
+ stop("train data NULL after sample")
}
dataSummary(data)
}
- if(executeSettings$runfeatureEngineering){
+ if (executeSettings$runfeatureEngineering) {
data$Train <- tryCatch(
{
@@ -328,15 +340,18 @@ runPlp <- function(
featureEngineeringSettings = featureEngineeringSettings
)
},
- error = function(e){ParallelLogger::logError(e); return(NULL)}
+ error = function(e) {
+ ParallelLogger::logError(e)
+ return(NULL)
+ }
)
- if(is.null(data$Train)){
- stop('train data NULL after feature engineering')
+ if (is.null(data$Train)) {
+ stop("train data NULL after feature engineering")
}
dataSummary(data)
}
- if(executeSettings$runPreprocessData){
+ if (executeSettings$runPreprocessData) {
data$Train$covariateData <- tryCatch(
{
@@ -345,10 +360,13 @@ runPlp <- function(
preprocessSettings = preprocessSettings
)
},
- error = function(e){ParallelLogger::logError(e); return(NULL)}
+ error = function(e) {
+ ParallelLogger::logError(e)
+ return(NULL)
+ }
)
- if(is.null(data$Train$covariateData)){
- stop('train data NULL after preprocessing')
+ if (is.null(data$Train$covariateData)) {
+ stop("train data NULL after preprocessing")
}
dataSummary(data)
}
@@ -357,7 +375,7 @@ runPlp <- function(
model <- NULL
prediction <- NULL
performance <- NULL
- if(executeSettings$runModelDevelopment){
+ if (executeSettings$runModelDevelopment) {
# fit model
settings <- list(
trainData = data$Train,
@@ -366,21 +384,26 @@ runPlp <- function(
analysisPath = analysisPath
)
- ParallelLogger::logInfo(sprintf('Training %s model',settings$modelSettings$name))
+ ParallelLogger::logInfo(sprintf("Training %s model",
+ settings$modelSettings$name))
+
model <- tryCatch(
{
do.call(fitPlp, settings)
},
- error = function(e) { ParallelLogger::logError(e); return(NULL)}
+ error = function(e) {
+ ParallelLogger::logError(e)
+ return(NULL)
+ }
)
- if(!is.null(model)){
+ if (!is.null(model)) {
prediction <- model$prediction
# remove prediction from model
model$prediction <- NULL
#apply to test data if exists:
- if('Test' %in% names(data)){
+ if ("Test" %in% names(data)) {
predictionTest <- tryCatch(
{
predictPlp(
@@ -389,13 +412,16 @@ runPlp <- function(
population = data$Test$labels
)
},
- error = function(e) { ParallelLogger::logError(e); return(NULL)}
+ error = function(e) {
+ ParallelLogger::logError(e)
+ return(NULL)
+ }
)
- predictionTest$evaluationType <- 'Test'
+ predictionTest$evaluationType <- "Test"
- if(!is.null(predictionTest)){
- prediction <- rbind(predictionTest, prediction[, colnames(prediction)!='index'])
+ if (!is.null(predictionTest)) {
+ prediction <- rbind(predictionTest, prediction[, colnames(prediction) != "index"])
}
@@ -404,9 +430,12 @@ runPlp <- function(
# evaluate model
performance <- tryCatch(
{
- evaluatePlp(prediction, typeColumn = 'evaluationType')
+ evaluatePlp(prediction, typeColumn = "evaluationType")
},
- error = function(e) { ParallelLogger::logError(e); return(NULL)}
+ error = function(e) {
+ ParallelLogger::logError(e)
+ return(NULL)
+ }
)
}
@@ -415,23 +444,23 @@ runPlp <- function(
# covariateSummary
covariateSummaryResult <- NULL
- if(executeSettings$runCovariateSummary){
+ if (executeSettings$runCovariateSummary) {
- if(!is.null(data$Test)){
+ if (!is.null(data$Test)) {
strata <- data.frame(
rowId = c(
data$Train$labels$rowId,
data$Test$labels$rowId
),
strataName = c(
- rep('Train', nrow(data$Train$labels)),
- rep('Test', nrow(data$Test$labels))
+ rep("Train", nrow(data$Train$labels)),
+ rep("Test", nrow(data$Test$labels))
)
)
- } else{
+ } else {
strata <- data.frame(
- rowId = c( data$Train$labels$rowId ),
- strataName = c( rep('Train', nrow(data$Train$labels)) )
+ rowId = c(data$Train$labels$rowId),
+ strataName = c(rep("Train", nrow(data$Train$labels)))
)
}
@@ -439,8 +468,8 @@ runPlp <- function(
dplyr::mutate(covariateValue = 0) %>%
dplyr::select("covariateId", "covariateValue") %>%
dplyr::collect()
- if(!is.null(model)){
- if(!is.null(model$covariateImportance)){
+ if (!is.null(model)) {
+ if (!is.null(model$covariateImportance)) {
variableImportance <- model$covariateImportance %>%
dplyr::select("covariateId", "covariateValue")
}
@@ -448,7 +477,7 @@ runPlp <- function(
# apply FE if it is used
featureEngineering <- NULL
- if(!is.null(model)){
+ if (!is.null(model)) {
featureEngineering <- model$preprocessing$featureEngineering
}
@@ -468,34 +497,33 @@ runPlp <- function(
# ExecutionSummary details:
# log the end time:
endTime <- Sys.time()
- TotalExecutionElapsedTime <- difftime(endTime, ExecutionDateTime, units='mins')
+ TotalExecutionElapsedTime <- difftime(endTime, executionDateTime, units = "mins")
executionSummary <- list(
PackageVersion = list(
- rVersion= R.Version()$version.string,
+ rVersion = R.Version()$version.string,
packageVersion = utils::packageVersion("PatientLevelPrediction")
),
- PlatformDetails= list(
+ PlatformDetails = list(
platform = R.Version()$platform,
- cores = Sys.getenv('NUMBER_OF_PROCESSORS'),
+ cores = Sys.getenv("NUMBER_OF_PROCESSORS"),
RAM = memuse::Sys.meminfo()[1]
),
TotalExecutionElapsedTime = TotalExecutionElapsedTime,
- ExecutionDateTime = ExecutionDateTime,
+ ExecutionDateTime = executionDateTime,
Log = logSettings$logFileName # location for now
#Not available at the moment: CDM_SOURCE - meta-data containing CDM version, release date, vocabulary version
)
# if model is NULL convert it to list for saving
- if(is.null(model)){
- model <- list(noModel = T)
- attr(model, "predictionFunction") <- 'noModel'
- attr(model, "saveType") <- 'RtoJson'
- class(model) <- 'plpModel'
+ if (is.null(model)) {
+ model <- list(noModel = TRUE)
+ attr(model, "predictionFunction") <- "noModel"
+ attr(model, "saveType") <- "RtoJson"
+ class(model) <- "plpModel"
}
results <- list(
- #inputSetting = inputSetting,
executionSummary = executionSummary,
model = model,
prediction = prediction,
@@ -506,18 +534,17 @@ runPlp <- function(
analysisName = analysisName
)
)
- class(results) <- c('runPlp')
+ class(results) <- c("runPlp")
ParallelLogger::logInfo("Run finished successfully.")
# save the results
- ParallelLogger::logInfo(paste0('Saving PlpResult'))
- tryCatch(savePlpResult(results, file.path(analysisPath,'plpResult')),
- finally= ParallelLogger::logTrace('Done.'))
- ParallelLogger::logInfo(paste0('plpResult saved to ..\\', analysisPath ,'\\plpResult'))
-
+ ParallelLogger::logInfo(paste0("Saving PlpResult"))
+ tryCatch(savePlpResult(results, file.path(analysisPath, "plpResult")),
+ finally = ParallelLogger::logTrace("Done."))
+ ParallelLogger::logInfo(paste0("plpResult saved to ..\\", analysisPath, "\\plpResult"))
+ delta <- Sys.time() - start
+ ParallelLogger::logInfo(paste0("Total time taken: ", signif(delta, 3), " ", attr(delta, "units")))
return(results)
}
-
-
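# Illustrative sketch (R) of the error-handling and timing pattern this
# refactor standardizes on: wrap a step in tryCatch so a failure is logged
# and NULL is returned instead of aborting, then report elapsed time via
# difftime units. riskyStep() is a hypothetical placeholder.
start <- Sys.time()
result <- tryCatch(
  {
    riskyStep() # e.g. evaluatePlp(prediction, typeColumn = "evaluationType")
  },
  error = function(e) {
    ParallelLogger::logError(e)
    return(NULL)
  }
)
delta <- Sys.time() - start
ParallelLogger::logInfo(paste0("Step took: ", signif(delta, 3), " ", attr(delta, "units")))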
diff --git a/R/RunPlpHelpers.R b/R/RunPlpHelpers.R
index 607e1872c..3357852f4 100644
--- a/R/RunPlpHelpers.R
+++ b/R/RunPlpHelpers.R
@@ -1,57 +1,85 @@
-printHeader <- function(plpData, targetId, outcomeId , analysisId, analysisName, ExecutionDateTime){
-
- ParallelLogger::logInfo(paste0('Patient-Level Prediction Package version ', utils::packageVersion("PatientLevelPrediction")))
-
- ParallelLogger::logInfo(paste0('Study started at: ', ExecutionDateTime))
-
- ParallelLogger::logInfo(sprintf('%-20s%s', 'AnalysisID: ',analysisId))
- ParallelLogger::logInfo(sprintf('%-20s%s', 'AnalysisName: ',analysisName))
-
+# @file RunPlpHelpers.R
+#
+# Copyright 2024 Observational Health Data Sciences and Informatics
+#
+# This file is part of PatientLevelPrediction
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+printHeader <- function(plpData,
+ targetId,
+ outcomeId,
+ analysisId,
+ analysisName,
+ executionDateTime) {
+ ParallelLogger::logInfo(paste0("Patient-Level Prediction Package version ",
+ utils::packageVersion("PatientLevelPrediction")))
+
+ ParallelLogger::logInfo(paste0("Study started at: ", executionDateTime))
+
+ ParallelLogger::logInfo(sprintf("%-20s%s", "AnalysisID: ", analysisId))
+ ParallelLogger::logInfo(sprintf("%-20s%s", "AnalysisName: ", analysisName))
+
# add header to analysis log
- ParallelLogger::logInfo(sprintf('%-20s%s', 'TargetID: ', targetId))
- ParallelLogger::logInfo(sprintf('%-20s%s', 'OutcomeID: ', outcomeId))
- ParallelLogger::logInfo(sprintf('%-20s%s', 'Cohort size: ', nrow(plpData$cohorts)))
- if(!is.null(plpData$population)){
- ParallelLogger::logInfo(sprintf('%-20s%s', 'Initial population size: ', nrow(plpData$population)))
- ParallelLogger::logInfo(sprintf('%-20s%s', 'Initial cases: ', sum(plpData$population$outcomeCount>0)))
+ ParallelLogger::logInfo(sprintf("%-20s%s", "TargetID: ", targetId))
+ ParallelLogger::logInfo(sprintf("%-20s%s", "OutcomeID: ", outcomeId))
+ ParallelLogger::logInfo(sprintf("%-20s%s", "Cohort size: ", nrow(plpData$cohorts)))
+ if (!is.null(plpData$population)) {
+ ParallelLogger::logInfo(sprintf("%-20s%s", "Initial population size: ",
+ nrow(plpData$population)))
+ ParallelLogger::logInfo(sprintf("%-20s%s", "Initial cases: ",
+ sum(plpData$population$outcomeCount > 0)))
}
- ParallelLogger::logInfo(sprintf('%-20s%s', 'Covariates: ', nrow(plpData$covariateData$covariateRef)))
- ## ParallelLogger::logInfo(sprintf('%-20s%s', 'Population size: ', nrow(population)))
- ## ParallelLogger::logInfo(sprintf('%-20s%s', 'Cases: ', sum(population$outcomeCount>0)))
-
+ ParallelLogger::logInfo(sprintf("%-20s%s", "Covariates: ",
+ plpData$covariateData$covariateRef %>%
+ dplyr::pull(.data$covariateId) %>%
+ length()))
return(invisible(TRUE))
}
checkInputs <- function(inputs) {
-
inputNames <- names(inputs)
+
+ checkIsClass(inputs[["plpData"]], c("plpData"))
+ checkIsClass(inputs[["outcomeId"]], c("numeric", "integer"))
- checkIsClass(inputs[['plpData']], c('plpData'))
- checkIsClass(inputs[['outcomeId']], c("numeric", "integer"))
-
- for(inputName in inputNames[!inputNames %in% c('plpData', 'outcomeId')]){
-
- ParallelLogger::logDebug(paste0(names(inputs[[inputName]]), ' : ',
- unlist(lapply(inputs[[inputName]], function(x) paste0(unlist(x), collapse= '-')))
- )
- )
-
+ if (FeatureExtraction::isTemporalCovariateData(inputs$plpData$covariateData)
+ && (is.null(attr(inputs$modelSettings$param, "temporalModel")))) {
+ stop("Temporal covariates detected but chosen model does not support temporal covariates")
+ }
+
+ for (inputName in inputNames[!inputNames %in% c("plpData", "outcomeId")]) {
+ ParallelLogger::logDebug(paste0(
+ names(inputs[[inputName]]), " : ",
+ unlist(lapply(inputs[[inputName]], function(x) paste0(unlist(x), collapse = "-")))
+ ))
+
# check class is correct
- if(!inherits(x = inputs[[inputName]], what = c(inputName,'list'))){
- ParallelLogger::logError(paste0('Incorrect ', inputName))
- stop('Bad input')
- }
-
- if(inherits(x = inputs[[inputName]], what = 'list')){
- if(sum(unlist(lapply(inputs[[inputName]], function(obj){inherits(x = obj, what = inputName)}))) != length(inputs[[inputName]])){
- ParallelLogger::logError(paste0('Incorrect ', inputName))
- stop('Bad input list')
+ if (!inherits(x = inputs[[inputName]], what = c(inputName, "list"))) {
+ ParallelLogger::logError(paste0("Incorrect ", inputName))
+ stop("Bad input")
+ }
+
+ if (inherits(x = inputs[[inputName]], what = "list")) {
+ if (sum(unlist(lapply(inputs[[inputName]], function(obj) {
+ inherits(x = obj, what = inputName)
+ }))) != length(inputs[[inputName]])) {
+ ParallelLogger::logError(paste0("Incorrect ", inputName))
+ stop("Bad input list")
}
- }
-
+ }
}
-
+
# return all the settings
return(invisible(TRUE))
}
@@ -68,28 +96,26 @@ checkInputs <- function(inputs) {
#' @param runfeatureEngineering TRUE or FALSE whether to do feature engineering
#' @param runPreprocessData TRUE or FALSE whether to do preprocessing
#' @param runModelDevelopment TRUE or FALSE whether to develop the model
-#' @param runCovariateSummary TRUE or FALSE whether to create covariate summary
+#' @param runCovariateSummary TRUE or FALSE whether to create covariate summary
#'
#' @return
#' list with TRUE/FALSE for each part of runPlp
#'
#' @export
createExecuteSettings <- function(
- runSplitData = F,
- runSampleData = F,
- runfeatureEngineering = F,
- runPreprocessData = F,
- runModelDevelopment = F,
- runCovariateSummary = F
-){
-
+ runSplitData = FALSE,
+ runSampleData = FALSE,
+ runfeatureEngineering = FALSE,
+ runPreprocessData = FALSE,
+ runModelDevelopment = FALSE,
+ runCovariateSummary = FALSE) {
checkIsClass(runSplitData, "logical")
checkIsClass(runSampleData, "logical")
checkIsClass(runfeatureEngineering, "logical")
checkIsClass(runPreprocessData, "logical")
checkIsClass(runModelDevelopment, "logical")
checkIsClass(runCovariateSummary, "logical")
-
+
result <- list(
runSplitData = runSplitData,
runSampleData = runSampleData,
@@ -98,7 +124,7 @@ createExecuteSettings <- function(
runModelDevelopment = runModelDevelopment,
runCovariateSummary = runCovariateSummary
)
- class(result) <- 'executeSettings'
+ class(result) <- "executeSettings"
return(result)
}
@@ -112,13 +138,13 @@ createExecuteSettings <- function(
#' list with TRUE for split, preprocess, model development and covariate summary
#'
#' @export
-createDefaultExecuteSettings <- function(){
+createDefaultExecuteSettings <- function() {
createExecuteSettings(
- runSplitData = T,
- runSampleData = F,
- runfeatureEngineering = F,
- runPreprocessData = T,
- runModelDevelopment = T,
- runCovariateSummary = T
+ runSplitData = TRUE,
+ runSampleData = FALSE,
+ runfeatureEngineering = FALSE,
+ runPreprocessData = TRUE,
+ runModelDevelopment = TRUE,
+ runCovariateSummary = TRUE
)
}
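# Illustrative usage of the execute settings defined above; a minimal sketch
# using only the exported functions from RunPlpHelpers.R.
executeSettings <- createExecuteSettings(
  runSplitData = TRUE,
  runSampleData = FALSE,
  runfeatureEngineering = FALSE,
  runPreprocessData = TRUE,
  runModelDevelopment = TRUE,
  runCovariateSummary = TRUE
)
# identical to the packaged default:
identical(executeSettings, createDefaultExecuteSettings()) # TRUE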
diff --git a/R/Sampling.R b/R/Sampling.R
index f97b024e0..7604ded65 100644
--- a/R/Sampling.R
+++ b/R/Sampling.R
@@ -2,20 +2,20 @@
# Copyright 2021 Observational Health Data Sciences and Informatics
#
# This file is part of PatientLevelPrediction
-#
+#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
-#
+#
# http://www.apache.org/licenses/LICENSE-2.0
-#
+#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-#' Create the settings for defining how the trainData from \code{splitData} are sampled using
+#' Create the settings for defining how the trainData from \code{splitData} are sampled using
#' default sample functions.
#'
#' @details
@@ -23,235 +23,243 @@
#'
#' @param type (character) Choice of: \itemize{
#' \item 'none' No sampling is applied - this is the default
-#' \item 'underSample' Undersample the non-outcome class to make the data more ballanced
+#' \item 'underSample' Undersample the non-outcome class to make the data more balanced
#' \item 'overSample' Oversample the outcome class by adding in each outcome multiple times
-#' }
-#' @param numberOutcomestoNonOutcomes (numeric) An numeric specifying the require number of non-outcomes per outcome
+#' }
+#' @param numberOutcomestoNonOutcomes (numeric) The required number of outcomes per non-outcome
#' @param sampleSeed (numeric) A seed to use when splitting the data for reproducibility (if not set a random number will be generated)
#'
#' @return
#' An object of class \code{sampleSettings}
#' @export
-createSampleSettings <- function(type = 'none',
- numberOutcomestoNonOutcomes = 1,
- sampleSeed = sample(10000,1)){
-
- checkIsClass(numberOutcomestoNonOutcomes, c('numeric','integer'))
- checkHigher(numberOutcomestoNonOutcomes,0)
- checkIsClass(sampleSeed, c('numeric','integer'))
- checkIsClass(type, c('character'))
- if(! type %in% c('none', 'underSample', 'overSample')){
- stop('Incorrect type. Pick: none/underSample/overSample')
+createSampleSettings <- function(type = "none",
+ numberOutcomestoNonOutcomes = 1,
+ sampleSeed = sample(10000, 1)) {
+ checkIsClass(numberOutcomestoNonOutcomes, c("numeric", "integer"))
+ checkHigher(numberOutcomestoNonOutcomes, 0)
+ checkIsClass(sampleSeed, c("numeric", "integer"))
+ checkIsClass(type, c("character"))
+ if (!type %in% c("none", "underSample", "overSample")) {
+ stop("Incorrect type. Pick: none/underSample/overSample")
}
-
+
+ if (type %in% c("underSample", "overSample")) {
+ ParallelLogger::logWarn("The previous documentation for `numberOutcomestoNonOutcomes` used to not reflect the functionality and has now been changed. The user needs to make sure the code is not relying on what was in the docs previously.")
+ }
+
sampleSettings <- list(
numberOutcomestoNonOutcomes = numberOutcomestoNonOutcomes,
- sampleSeed = ifelse(type == 'none', 1, sampleSeed) # to make it the same for none
- )
-
- if(type == 'none'){
+ sampleSeed = ifelse(type == "none", 1, sampleSeed) # to make it the same for none
+ )
+
+ if (type == "none") {
attr(sampleSettings, "fun") <- "sameData"
}
- if(type == 'underSample'){
+ if (type == "underSample") {
attr(sampleSettings, "fun") <- "underSampleData"
}
- if(type == 'overSample'){
+ if (type == "overSample") {
attr(sampleSettings, "fun") <- "overSampleData" # TODO
}
class(sampleSettings) <- "sampleSettings"
return(sampleSettings)
-
}
# code to run the sampling - add desc
-sampleData <- function(trainData, sampleSettings){
-
+sampleData <- function(trainData, sampleSettings) {
+ start <- Sys.time()
metaData <- attr(trainData, "metaData")
-
- ParallelLogger::logInfo('Starting data sampling')
-
+
+ ParallelLogger::logInfo("Starting data sampling")
+
# if a single setting, make it a list
- if(inherits(sampleSettings,'sampleSettings')){
+ if (inherits(sampleSettings, "sampleSettings")) {
sampleSettings <- list(sampleSettings)
}
-
- for(sampleSetting in sampleSettings){
+
+ for (sampleSetting in sampleSettings) {
fun <- attr(sampleSetting, "fun")
- args <- list(trainData = trainData,
- sampleSettings = sampleSetting)
- ParallelLogger::logInfo(paste0('Applying ', fun))
+ args <- list(
+ trainData = trainData,
+ sampleSettings = sampleSetting
+ )
+ ParallelLogger::logInfo(paste0("Applying ", fun))
trainData <- do.call(eval(parse(text = fun)), args)
}
-
- ParallelLogger::logInfo('Finished data sampling')
-
+
+ ParallelLogger::logInfo("Finished data sampling")
+
metaData$sampleSettings <- sampleSettings
-
+
attr(trainData, "metaData") <- metaData
+ delta <- Sys.time() - start
+ ParallelLogger::logInfo("Sampling took ",
+ signif(delta, 3), " ", attr(delta, "units"))
return(trainData)
}
-sameData <- function(trainData, ...){
- ParallelLogger::logInfo('No sampling - returning same data')
-
+sameData <- function(trainData, ...) {
+ ParallelLogger::logInfo("No sampling - returning same data")
+
# add attribute for FE
featureEngeering <- list(
- funct = 'sameData',
+ funct = "sameData",
settings = list(
- none = T
+ none = TRUE
)
)
- attr(trainData, 'metaData')$featureEngineering = listAppend(
- attr(trainData, 'metaData')$featureEngineering,
+ attr(trainData, "metaData")$featureEngineering <- listAppend(
+ attr(trainData, "metaData")$featureEngineering,
     featureEngineering
)
-
+
return(trainData)
}
-underSampleData <- function(trainData, sampleSettings){
-
- checkIsClass(sampleSettings$sampleSeed, c('numeric', 'integer'))
- checkIsClass(sampleSettings$numberOutcomestoNonOutcomes, c('numeric', 'integer'))
+underSampleData <- function(trainData, sampleSettings) {
+ checkIsClass(sampleSettings$sampleSeed, c("numeric", "integer"))
+ checkIsClass(sampleSettings$numberOutcomestoNonOutcomes, c("numeric", "integer"))
checkHigherEqual(sampleSettings$numberOutcomestoNonOutcomes, 0)
-
- ParallelLogger::logInfo(paste0('sampleSeed: ', sampleSettings$sampleSeed))
- ParallelLogger::logInfo(paste0('numberOutcomestoNonOutcomes: ', sampleSettings$numberOutcomestoNonOutcomes))
-
+
+ ParallelLogger::logInfo(paste0("sampleSeed: ", sampleSettings$sampleSeed))
+ ParallelLogger::logInfo(paste0("numberOutcomestoNonOutcomes: ", sampleSettings$numberOutcomestoNonOutcomes))
+
set.seed(sampleSettings$sampleSeed)
- ParallelLogger::logInfo(paste0('Starting undersampling with seed ', sampleSettings$sampleSeed))
-
+ ParallelLogger::logInfo(paste0("Starting undersampling with seed ", sampleSettings$sampleSeed))
+
population <- trainData$labels %>% dplyr::collect()
folds <- trainData$folds %>% dplyr::collect()
-
- population <- merge(population, folds, by = 'rowId')
-
- ParallelLogger::logInfo(paste0('Initial train data has ',sum(population$outcomeCount > 0),' outcomes to ',
- sum(population$outcomeCount == 0), ' non-outcomes'))
-
+
+ population <- merge(population, folds, by = "rowId")
+
+ ParallelLogger::logInfo(paste0(
+ "Initial train data has ", sum(population$outcomeCount > 0), " outcomes to ",
+ sum(population$outcomeCount == 0), " non-outcomes"
+ ))
+
pplOfInterest <- c()
- for(i in unique(folds$index)){
+ for (i in unique(folds$index)) {
outcomeIds <- population$rowId[population$outcomeCount > 0 & population$index == i]
nonoutcomeIds <- population$rowId[population$outcomeCount == 0 & population$index == i]
-
- sampleSize <- length(outcomeIds)/sampleSettings$numberOutcomestoNonOutcomes
-
- if(sampleSize > length(nonoutcomeIds)){
- ParallelLogger::logWarn('Non-outcome count less that require sample size')
+
+ sampleSize <- length(outcomeIds) / sampleSettings$numberOutcomestoNonOutcomes
+
+ if (sampleSize > length(nonoutcomeIds)) {
+ ParallelLogger::logWarn("Non-outcome count less that require sample size")
sampleSize <- length(nonoutcomeIds)
}
-
+
# randomly pick non-outcome people
sampleNonoutcomeIds <- sample(nonoutcomeIds, sampleSize)
-
+
pplOfInterest <- c(pplOfInterest, outcomeIds, sampleNonoutcomeIds)
}
-
- # filter to these patients
+
+ # filter to these patients
sampleTrainData <- list()
- class(sampleTrainData) <- 'plpData'
+ class(sampleTrainData) <- "plpData"
sampleTrainData$labels <- trainData$labels %>% dplyr::filter(.data$rowId %in% pplOfInterest)
sampleTrainData$folds <- trainData$folds %>% dplyr::filter(.data$rowId %in% pplOfInterest)
-
-
+
+
sampleTrainData$covariateData <- Andromeda::andromeda()
sampleTrainData$covariateData$covariateRef <- trainData$covariateData$covariateRef
- sampleTrainData$covariateData$covariates <- trainData$covariateData$covariates %>%
+ sampleTrainData$covariateData$covariates <- trainData$covariateData$covariates %>%
dplyr::filter(.data$rowId %in% pplOfInterest)
-
- #update metaData$populationSize = nrow(trainData$labels)
- metaData <- attr(trainData$covariateData, 'metaData')
- metaData$populationSize = nrow(sampleTrainData$labels)
- attr(sampleTrainData$covariateData, 'metaData') <- metaData
-
- class(sampleTrainData$covariateData) <- 'CovariateData'
-
+
+ # update metaData$populationSize = nrow(trainData$labels)
+ metaData <- attr(trainData$covariateData, "metaData")
+ metaData$populationSize <- nrow(sampleTrainData$labels)
+ attr(sampleTrainData$covariateData, "metaData") <- metaData
+
+ class(sampleTrainData$covariateData) <- "CovariateData"
+
return(sampleTrainData)
}
-overSampleData <- function(trainData, sampleSettings){
-
- checkIsClass(sampleSettings$sampleSeed, c('numeric', 'integer'))
- checkIsClass(sampleSettings$numberOutcomestoNonOutcomes, c('numeric', 'integer'))
+overSampleData <- function(trainData, sampleSettings) {
+ checkIsClass(sampleSettings$sampleSeed, c("numeric", "integer"))
+ checkIsClass(sampleSettings$numberOutcomestoNonOutcomes, c("numeric", "integer"))
checkHigherEqual(sampleSettings$numberOutcomestoNonOutcomes, 0)
-
- ParallelLogger::logInfo(paste0('sampleSeed: ', sampleSettings$sampleSeed))
- ParallelLogger::logInfo(paste0('numberOutcomestoNonOutcomes: ', sampleSettings$numberOutcomestoNonOutcomes))
-
+
+ ParallelLogger::logInfo(paste0("sampleSeed: ", sampleSettings$sampleSeed))
+ ParallelLogger::logInfo(paste0("numberOutcomestoNonOutcomes: ", sampleSettings$numberOutcomestoNonOutcomes))
+
set.seed(sampleSettings$sampleSeed)
- ParallelLogger::logInfo(paste0('Starting oversampling with seed ', sampleSettings$sampleSeed))
-
+ ParallelLogger::logInfo(paste0("Starting oversampling with seed ", sampleSettings$sampleSeed))
+
population <- trainData$labels %>% dplyr::collect()
folds <- trainData$folds %>% dplyr::collect()
-
- population <- merge(population, folds, by = 'rowId')
-
- ParallelLogger::logInfo(paste0('Initial train data has ',sum(population$outcomeCount > 0),' outcomes to ',
- sum(population$outcomeCount == 0), ' non-outcomes'))
-
+
+ population <- merge(population, folds, by = "rowId")
+
+ ParallelLogger::logInfo(paste0(
+ "Initial train data has ", sum(population$outcomeCount > 0), " outcomes to ",
+ sum(population$outcomeCount == 0), " non-outcomes"
+ ))
+
sampleTrainData <- list()
- class(sampleTrainData) <- 'plpData'
+ class(sampleTrainData) <- "plpData"
sampleTrainData$labels <- trainData$labels %>% dplyr::collect()
sampleTrainData$folds <- trainData$folds %>% dplyr::collect()
-
+
sampleTrainData$covariateData <- Andromeda::andromeda()
sampleTrainData$covariateData$covariateRef <- trainData$covariateData$covariateRef
sampleTrainData$covariateData$covariates <- trainData$covariateData$covariates
-
- for(i in unique(folds$index)){
-
+
+ for (i in unique(folds$index)) {
outcomeIds <- population$rowId[population$outcomeCount > 0 & population$index == i]
nonoutcomeIds <- population$rowId[population$outcomeCount == 0 & population$index == i]
-
- sampleSize <- floor(length(nonoutcomeIds)*sampleSettings$numberOutcomestoNonOutcomes)
-
- if(sampleSize > length(nonoutcomeIds)){
- ParallelLogger::logWarn('Non-outcome count less than required sample size')
+
+ sampleSize <- floor(length(nonoutcomeIds) * sampleSettings$numberOutcomestoNonOutcomes)
+
+ if (sampleSize > length(nonoutcomeIds)) {
+ ParallelLogger::logWarn("Non-outcome count less than required sample size")
sampleSize <- length(nonoutcomeIds) - length(outcomeIds)
} else {
sampleSize <- sampleSize - length(outcomeIds)
}
-
- while (sampleSize > 0){
- tempSampleSize = min(length(outcomeIds),sampleSize)
+
+ while (sampleSize > 0) {
+ tempSampleSize <- min(length(outcomeIds), sampleSize)
# randomly oversample outcome people
sampleOutcomeIds <- sample(outcomeIds, tempSampleSize, replace = TRUE)
pplOfInterest <- unique(sampleOutcomeIds) # to enable oversampling with replacement
- sampleSize <- sampleSize-length(pplOfInterest)
-
+ sampleSize <- sampleSize - length(pplOfInterest)
+
addTrainData <- list()
- addTrainData$labels <- trainData$labels %>% dplyr::filter(.data$rowId %in% pplOfInterest)
+ addTrainData$labels <- trainData$labels %>% dplyr::filter(.data$rowId %in% pplOfInterest)
addTrainData$labels <- addTrainData$labels %>% dplyr::mutate(newRowId = 1:nrow(addTrainData$labels))
addTrainData$labels <- addTrainData$labels %>% dplyr::mutate(newRowId = .data$newRowId + max(sampleTrainData$labels$rowId))
-
- addTrainData$folds <- trainData$folds %>% dplyr::filter(.data$rowId %in% pplOfInterest)
- addTrainData$folds <- dplyr::inner_join(addTrainData$folds, addTrainData$labels[, c("rowId", "newRowId")], copy=TRUE)
+
+ addTrainData$folds <- trainData$folds %>% dplyr::filter(.data$rowId %in% pplOfInterest)
+ addTrainData$folds <- dplyr::inner_join(addTrainData$folds, addTrainData$labels[, c("rowId", "newRowId")], copy = TRUE)
addTrainData$folds <- addTrainData$folds %>% dplyr::mutate(rowId = .data$newRowId)
- addTrainData$folds <- dplyr::select(addTrainData$folds,-dplyr::starts_with("newRowId"))
-
- addTrainData$covariateData$covariates <- trainData$covariateData$covariates %>% dplyr::filter(.data$rowId %in% pplOfInterest)
- addTrainData$covariateData$covariates <- dplyr::inner_join(addTrainData$covariateData$covariates, addTrainData$labels[, c("rowId", "newRowId")], copy=TRUE)
- addTrainData$covariateData$covariates <- addTrainData$covariateData$covariates %>% dplyr::mutate(rowId = .data$newRowId)
- addTrainData$covariateData$covariates <- dplyr::select(addTrainData$covariateData$covariates,-dplyr::starts_with("newRowId"))
-
- addTrainData$labels <- addTrainData$labels %>% dplyr::mutate(rowId = .data$newRowId)
- addTrainData$labels <- dplyr::select(addTrainData$labels,-dplyr::starts_with("newRowId"))
-
+ addTrainData$folds <- dplyr::select(addTrainData$folds, -dplyr::starts_with("newRowId"))
+
+ addTrainData$covariateData$covariates <- trainData$covariateData$covariates %>% dplyr::filter(.data$rowId %in% pplOfInterest)
+ addTrainData$covariateData$covariates <- dplyr::inner_join(addTrainData$covariateData$covariates, addTrainData$labels[, c("rowId", "newRowId")], copy = TRUE)
+ addTrainData$covariateData$covariates <- addTrainData$covariateData$covariates %>% dplyr::mutate(rowId = .data$newRowId)
+ addTrainData$covariateData$covariates <- dplyr::select(addTrainData$covariateData$covariates, -dplyr::starts_with("newRowId"))
+
+ addTrainData$labels <- addTrainData$labels %>% dplyr::mutate(rowId = .data$newRowId)
+ addTrainData$labels <- dplyr::select(addTrainData$labels, -dplyr::starts_with("newRowId"))
+
sampleTrainData$labels <- dplyr::bind_rows(sampleTrainData$labels, addTrainData$labels)
sampleTrainData$folds <- dplyr::bind_rows(sampleTrainData$folds, addTrainData$folds)
Andromeda::appendToTable(sampleTrainData$covariateData$covariates, addTrainData$covariateData$covariates)
}
}
-
- #update metaData$populationSize = nrow(trainData$labels)
- metaData <- attr(trainData$covariateData, 'metaData')
- metaData$populationSize = nrow(sampleTrainData$labels)
- attr(sampleTrainData$covariateData, 'metaData') <- metaData
-
- class(sampleTrainData$covariateData) <- 'CovariateData'
-
+
+ # update metaData$populationSize = nrow(trainData$labels)
+ metaData <- attr(trainData$covariateData, "metaData")
+ metaData$populationSize <- nrow(sampleTrainData$labels)
+ attr(sampleTrainData$covariateData, "metaData") <- metaData
+
+ class(sampleTrainData$covariateData) <- "CovariateData"
+
return(sampleTrainData)
}
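# Illustrative usage of createSampleSettings() under the corrected semantics:
# numberOutcomestoNonOutcomes is the number of outcomes per non-outcome, so 1
# undersamples the non-outcome class to a 1:1 ratio. A minimal sketch; the
# seed value is arbitrary.
sampleSettings <- createSampleSettings(
  type = "underSample",
  numberOutcomestoNonOutcomes = 1,
  sampleSeed = 42
)
attr(sampleSettings, "fun") # "underSampleData", dispatched by sampleData()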
diff --git a/R/SaveLoadPlp.R b/R/SaveLoadPlp.R
index 47353fcf3..7079e0009 100644
--- a/R/SaveLoadPlp.R
+++ b/R/SaveLoadPlp.R
@@ -34,31 +34,31 @@
#' # todo
#'
#' @export
-savePlpData <- function(plpData, file, envir=NULL, overwrite=F) {
- if (missing(plpData)){
+savePlpData <- function(plpData, file, envir = NULL, overwrite = FALSE) {
+ if (missing(plpData)) {
stop("Must specify plpData")
}
- if (missing(file)){
+ if (missing(file)) {
stop("Must specify file")
}
- if (!inherits(x = plpData, what = c("plpData"))){
+ if (!inherits(x = plpData, what = c("plpData"))) {
stop("Data not of class plpData")
}
- if(dir.exists(file.path(file, "covariates"))){
- stop('Folder to save covariates already exists...')
+ if (dir.exists(file.path(file, "covariates"))) {
+ stop("Folder to save covariates already exists...")
}
-
- if(!dir.exists(file)){
- dir.create(file, recursive = T)
+
+ if (!dir.exists(file)) {
+ dir.create(file, recursive = TRUE)
}
-
+
# save the actual values in the metaData
# TODO - only do this if exists in parent or environ
- if(is.null(plpData$metaData$call$sampleSize)){ # fixed a bug when sampleSize is NULL
- plpData$metaData$call$sampleSize <- 'NULL'
+ if (is.null(plpData$metaData$call$sampleSize)) { # fixed a bug when sampleSize is NULL
+ plpData$metaData$call$sampleSize <- "NULL"
}
-
- Andromeda::saveAndromeda(plpData$covariateData, file = file.path(file, "covariates"), maintainConnection = T)
+
+ Andromeda::saveAndromeda(plpData$covariateData, file = file.path(file, "covariates"), maintainConnection = TRUE)
saveRDS(plpData$timeRef, file = file.path(file, "timeRef.rds"))
saveRDS(plpData$cohorts, file = file.path(file, "cohorts.rds"))
saveRDS(plpData$outcomes, file = file.path(file, "outcomes.rds"))
@@ -85,16 +85,20 @@ savePlpData <- function(plpData, file, envir=NULL, overwrite=F) {
#'
#' @export
loadPlpData <- function(file, readOnly = TRUE) {
- if (!file.exists(file))
+ if (!file.exists(file)) {
stop(paste("Cannot find folder", file))
- if (!file.info(file)$isdir)
+ }
+ if (!file.info(file)$isdir) {
stop(paste("Not a folder", file))
-
- result <- list(covariateData = FeatureExtraction::loadCovariateData(file = file.path(file, "covariates")),
- timeRef = readRDS(file.path(file, "timeRef.rds")),
- cohorts = readRDS(file.path(file, "cohorts.rds")),
- outcomes = readRDS(file.path(file, "outcomes.rds")),
- metaData = readRDS(file.path(file, "metaData.rds")))
+ }
+
+ result <- list(
+ covariateData = FeatureExtraction::loadCovariateData(file = file.path(file, "covariates")),
+ timeRef = readRDS(file.path(file, "timeRef.rds")),
+ cohorts = readRDS(file.path(file, "cohorts.rds")),
+ outcomes = readRDS(file.path(file, "outcomes.rds")),
+ metaData = readRDS(file.path(file, "metaData.rds"))
+ )
class(result) <- "plpData"
@@ -110,117 +114,118 @@ loadPlpData <- function(file, readOnly = TRUE) {
#' @param dirPath A location to save the model to
#'
#' @export
-savePlpModel <- function(plpModel, dirPath){
- if (missing(plpModel)){
+savePlpModel <- function(plpModel, dirPath) {
+ if (missing(plpModel)) {
stop("Must specify plpModel")
}
- if (missing(dirPath)){
+ if (missing(dirPath)) {
stop("Must specify directory path")
}
- if (!inherits(x = plpModel, what = "plpModel")){
+ if (!inherits(x = plpModel, what = "plpModel")) {
stop("Not a plpModel")
}
-
- if(!dir.exists(dirPath)){
- ParallelLogger::logInfo('Creating directory to save model')
- dir.create(dirPath, recursive = T)
+
+ if (!dir.exists(dirPath)) {
+ ParallelLogger::logInfo("Creating directory to save model")
+ dir.create(dirPath, recursive = TRUE)
}
-
+
# save the covariateImportance
utils::write.csv(
- x = plpModel$covariateImportance,
- file = file.path(dirPath, 'covariateImportance.csv'),
- row.names = F
+ x = plpModel$covariateImportance,
+ file = file.path(dirPath, "covariateImportance.csv"),
+ row.names = FALSE
)
-
+
# save the trainDetails
- if(!is.null(plpModel$trainDetails)){
+ if (!is.null(plpModel$trainDetails)) {
ParallelLogger::saveSettingsToJson(
- object = plpModel$trainDetails,
- fileName = file.path(dirPath, 'trainDetails.json')
- )
+ object = plpModel$trainDetails,
+ fileName = file.path(dirPath, "trainDetails.json")
+ )
}
-
+
# save the validationDetails
- if(!is.null(plpModel$validationDetails)){
+ if (!is.null(plpModel$validationDetails)) {
ParallelLogger::saveSettingsToJson(
- object = plpModel$validationDetails,
- fileName = file.path(dirPath, 'validationDetails.json')
+ object = plpModel$validationDetails,
+ fileName = file.path(dirPath, "validationDetails.json")
)
}
-
-
+
+
# save the settings
ParallelLogger::saveSettingsToJson(
- object = plpModel$modelDesign,
- fileName = file.path(dirPath, 'modelDesign.json')
+ object = plpModel$modelDesign,
+ fileName = file.path(dirPath, "modelDesign.json")
)
-
- if(!is.null(plpModel$preprocessing)){
-
+
+ if (!is.null(plpModel$preprocessing)) {
# cheap fix to get past bug in ParallelLogger::saveSettingsToJson with tibbles
- plpModel$preprocessing$tidyCovariates$normFactors <-
+ plpModel$preprocessing$tidyCovariates$normFactors <-
as.data.frame(plpModel$preprocessing$tidyCovariates$normFactors)
-
+
ParallelLogger::saveSettingsToJson(
- object = plpModel$preprocessing,
- fileName = file.path(dirPath, 'preprocessing.json')
- )
+ object = plpModel$preprocessing,
+ fileName = file.path(dirPath, "preprocessing.json")
+ )
}
-
-
- # save the model part function to file
+
+
+ # save the model part function to file
saveModelPart(
- model = plpModel$model,
- savetype = attr(plpModel, 'saveType'),
+ model = plpModel$model,
+ savetype = attr(plpModel, "saveType"),
dirPath = dirPath
- )
-
+ )
+
# save the attributes of plpModel
modelAttributes <- attributes(plpModel)
modelAttributes$names <- NULL
ParallelLogger::saveSettingsToJson(
- object = modelAttributes,
- fileName = file.path(dirPath, 'attributes.json')
+ object = modelAttributes,
+ fileName = file.path(dirPath, "attributes.json")
)
-
+
return(dirPath)
}
-saveModelPart <- function(model, savetype, dirPath){
+saveModelPart <- function(model, savetype, dirPath) {
# save the model based on saveType
- if(savetype == "xgboost"){
+ if (savetype == "xgboost") {
xgboost::xgb.save(
- model = model,
+ model = model,
fname = file.path(dirPath, "model.json")
)
- } else if(savetype == "lightgbm"){
- lightgbm::lgb.save(booster = model,
- filename = file.path(dirPath, "model.json"))
- } else if(savetype == "RtoJson"){
+ } else if (savetype == "lightgbm") {
+ lightgbm::lgb.save(
+ booster = model,
+ filename = file.path(dirPath, "model.json")
+ )
+ } else if (savetype == "RtoJson") {
ParallelLogger::saveSettingsToJson(
- object = model,
- fileName = file.path(dirPath, 'model.json')
+ object = model,
+ fileName = file.path(dirPath, "model.json")
)
- } else if(savetype == "file"){
+ } else if (savetype == "file") {
# move the model into model
- if(!dir.exists(file.path(dirPath, 'model'))){
- dir.create(file.path(dirPath, 'model'), recursive = T)
+ if (!dir.exists(file.path(dirPath, "model"))) {
+ dir.create(file.path(dirPath, "model"), recursive = TRUE)
}
- for(file in dir(model)){
+ for (file in dir(model)) {
file.copy(
- file.path(model,file),
- file.path(dirPath,'model'),
- overwrite = TRUE,
+ file.path(model, file),
+ file.path(dirPath, "model"),
+ overwrite = TRUE,
recursive = FALSE,
- copy.mode = TRUE,
- copy.date = FALSE)
+ copy.mode = TRUE,
+ copy.date = FALSE
+ )
}
- } else{
- ParallelLogger::logWarn('Not sure how to save model - invalid saveType')
+ } else {
+ ParallelLogger::logWarn("Not sure how to save model - invalid saveType")
}
-
}
@@ -233,73 +238,89 @@ saveModelPart <- function(model, savetype, dirPath){
#'
#' @export
loadPlpModel <- function(dirPath) {
- if (!file.exists(dirPath))
+ if (!file.exists(dirPath)) {
stop(paste("Cannot find folder", dirPath))
- if (!file.info(dirPath)$isdir)
+ }
+ if (!file.info(dirPath)$isdir) {
stop(paste("Not a folder", dirPath))
-
+ }
+
plpModel <- list()
modelAttributes <- tryCatch(
- ParallelLogger::loadSettingsFromJson(file.path(dirPath, 'attributes.json')),
- error = function(e){NULL}
+ ParallelLogger::loadSettingsFromJson(file.path(dirPath, "attributes.json")),
+ error = function(e) {
+ NULL
+ }
)
-
- if(is.null(modelAttributes)){
- ParallelLogger::logWarn('Incorrect plpModel object - is this an old model?')
+
+ if (is.null(modelAttributes)) {
+ ParallelLogger::logWarn("Incorrect plpModel object - is this an old model?")
return(NULL)
}
-
+
attributes(plpModel) <- modelAttributes
-
+
plpModel$covariateImportance <- tryCatch(
utils::read.csv(file.path(dirPath, "covariateImportance.csv")),
- error = function(e){NULL}
+ error = function(e) {
+ NULL
+ }
)
-
- if(file.exists(file.path(dirPath, "trainDetails.json"))){
+
+ if (file.exists(file.path(dirPath, "trainDetails.json"))) {
plpModel$trainDetails <- tryCatch(
ParallelLogger::loadSettingsFromJson(file.path(dirPath, "trainDetails.json")),
- error = function(e){NULL}
+ error = function(e) {
+ NULL
+ }
)
}
- if(file.exists(file.path(dirPath, "validationDetails.json"))){
+ if (file.exists(file.path(dirPath, "validationDetails.json"))) {
plpModel$validationDetails <- tryCatch(
ParallelLogger::loadSettingsFromJson(file.path(dirPath, "validationDetails.json")),
- error = function(e){NULL}
+ error = function(e) {
+ NULL
+ }
)
}
-
+
plpModel$modelDesign <- tryCatch(
ParallelLogger::loadSettingsFromJson(file.path(dirPath, "modelDesign.json")),
- error = function(e){NULL}
+ error = function(e) {
+ NULL
+ }
)
-
- # we don't use "preprocess" anymore, should be "preprocessing",
+
+ # we don't use "preprocess" anymore, should be "preprocessing",
# but leave this here if loading an older model
- if(file.exists(file.path(dirPath, "preprocess.json"))){
+ if (file.exists(file.path(dirPath, "preprocess.json"))) {
plpModel$preprocessing <- tryCatch(
ParallelLogger::loadSettingsFromJson(file.path(dirPath, "preprocess.json")),
- error = function(e){NULL}
+ error = function(e) {
+ NULL
+ }
)
}
- if(file.exists(file.path(dirPath, "preprocessing.json")) & is.null(plpModel$preprocessing)){
+ if (file.exists(file.path(dirPath, "preprocessing.json")) && is.null(plpModel$preprocessing)) {
plpModel$preprocessing <- tryCatch(
ParallelLogger::loadSettingsFromJson(file.path(dirPath, "preprocessing.json")),
- error = function(e){NULL}
+ error = function(e) {
+ NULL
+ }
)
}
-
-
- if(attr(plpModel, 'saveType') == "xgboost"){
- ensure_installed("xgboost")
+
+
+ if (attr(plpModel, "saveType") == "xgboost") {
+ rlang::check_installed("xgboost")
plpModel$model <- xgboost::xgb.load(file.path(dirPath, "model.json"))
- } else if(attr(plpModel, 'saveType') == "lightgbm"){
- ensure_installed("lightgbm")
+ } else if (attr(plpModel, "saveType") == "lightgbm") {
+ rlang::check_installed("lightgbm")
plpModel$model <- lightgbm::lgb.load(file.path(dirPath, "model.json"))
- } else if(attr(plpModel, 'saveType') %in% c("RtoJson")){
+ } else if (attr(plpModel, "saveType") %in% c("RtoJson")) {
plpModel$model <- ParallelLogger::loadSettingsFromJson(file.path(dirPath, "model.json"))
- } else{
- plpModel$model <- file.path(dirPath, 'model')
+ } else {
+ plpModel$model <- file.path(dirPath, "model")
}
return(plpModel)
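# Illustrative save/load round trip for a plpModel; a sketch assuming
# `plpModel` is an existing object of class plpModel (e.g. from runPlp()).
modelDir <- file.path(tempdir(), "plpModelExample") # hypothetical location
savePlpModel(plpModel, dirPath = modelDir)
restored <- loadPlpModel(modelDir)
# the model part is restored according to attr(plpModel, "saveType"):
# "xgboost", "lightgbm", "RtoJson", or a plain file directory.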
@@ -314,16 +335,16 @@ loadPlpModel <- function(dirPath) {
 #' @param prediction The prediction data.frame
#' @param dirPath The directory to save the prediction RDS
#' @param fileName The name of the RDS file that will be saved in dirPath
-#'
+#'
#' @export
-savePrediction <- function(prediction, dirPath, fileName='prediction.rds'){
- #TODO check inupts
+savePrediction <- function(prediction, dirPath, fileName = "prediction.rds") {
+  # TODO check inputs
ParallelLogger::saveSettingsToJson(
- object = prediction,
- fileName = file.path(dirPath,fileName)
- )
-
- return(file.path(dirPath,fileName))
+ object = prediction,
+ fileName = file.path(dirPath, fileName)
+ )
+
+ return(file.path(dirPath, fileName))
}
 #' Loads the prediction dataframe
@@ -332,10 +353,10 @@ savePrediction <- function(prediction, dirPath, fileName='prediction.rds'){
#' Loads the prediciton RDS file
#'
#' @param fileLocation The location with the saved prediction
-#'
+#'
#' @export
-loadPrediction <- function(fileLocation){
- #TODO check inupts
+loadPrediction <- function(fileLocation) {
+  # TODO check inputs
prediction <- ParallelLogger::loadSettingsFromJson(fileName = fileLocation)
return(prediction)
}
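# Illustrative usage; note that despite the "prediction.rds" default file
# name, both functions write and read JSON via ParallelLogger. A sketch
# assuming `prediction` is an existing prediction data frame.
path <- savePrediction(prediction, dirPath = tempdir())
prediction2 <- loadPrediction(path)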
@@ -347,51 +368,49 @@ loadPrediction <- function(fileLocation){
#'
#' @param result The result of running runPlp()
#' @param dirPath The directory to save the csv
-#'
+#'
#' @export
-savePlpResult <- function(result, dirPath){
- if (missing(result)){
+savePlpResult <- function(result, dirPath) {
+ if (missing(result)) {
stop("Must specify runPlp output")
}
- if (missing(dirPath)){
+ if (missing(dirPath)) {
stop("Must specify directory location")
}
- if(!dir.exists(dirPath)){
- dir.create(dirPath, recursive = T)
+ if (!dir.exists(dirPath)) {
+ dir.create(dirPath, recursive = TRUE)
}
-
- savePlpModel(result$model, dirPath=file.path(dirPath,'model') )
+
+ savePlpModel(result$model, dirPath = file.path(dirPath, "model"))
result$model <- NULL
saveRDS(result, file = file.path(dirPath, "runPlp.rds"))
-
}
 #' Loads the evaluation dataframe
#'
#' @details
-#' Loads the evaluation
+#' Loads the evaluation
#'
#' @param dirPath The directory where the evaluation was saved
-#'
+#'
#' @export
-loadPlpResult <- function(dirPath){
- if (!file.exists(dirPath)){
+loadPlpResult <- function(dirPath) {
+ if (!file.exists(dirPath)) {
stop(paste("Cannot find folder", dirPath))
}
- if (!file.info(dirPath)$isdir){
+ if (!file.info(dirPath)$isdir) {
stop(paste("Not a folder", dirPath))
}
-
+
result <- readRDS(file.path(dirPath, "runPlp.rds"))
- result$model = loadPlpModel(file.path(dirPath, "model"))
-
+ result$model <- loadPlpModel(file.path(dirPath, "model"))
+
if (is.null(class(result))) {
- class(result) <- 'runPlp'
+ class(result) <- "runPlp"
}
return(result)
-
}
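# Illustrative round trip for full runPlp results; a sketch assuming
# `plpResult` is the output of runPlp(). The model goes into a "model"
# subfolder and the remainder into runPlp.rds, mirroring the code above.
resultDir <- file.path(tempdir(), "plpResultExample") # hypothetical location
savePlpResult(plpResult, dirPath = resultDir)
plpResult2 <- loadPlpResult(resultDir)
class(plpResult2) # "runPlp"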
@@ -403,79 +422,78 @@ loadPlpResult <- function(dirPath){
#' @param result An object of class runPlp with development or validation results
#' @param saveDirectory The directory the save the results as csv files
#' @param minCellCount Minimum cell count for the covariateSummary and certain evaluation results
-#'
+#'
#' @export
-savePlpShareable <- function(result, saveDirectory, minCellCount = 10){
-
- if(!dir.exists(saveDirectory)) dir.create(saveDirectory, recursive = T)
-
- #executionSummary
+savePlpShareable <- function(result, saveDirectory, minCellCount = 10) {
+ if (!dir.exists(saveDirectory)) dir.create(saveDirectory, recursive = TRUE)
+
+ # executionSummary
result$executionSummary$PackageVersion$packageVersion <- as.character(result$executionSummary$PackageVersion$packageVersion)
result$executionSummary$PlatformDetails$RAM <- as.character(result$executionSummary$PlatformDetails$RAM)
ParallelLogger::saveSettingsToJson(
- object = result$executionSummary,
- fileName = file.path(saveDirectory, 'executionSummary.json')
- )
-
- #save model as json files
- savePlpModel(result$model, file.path(saveDirectory, 'model'))
-
- #performanceEvaluation
- if(!dir.exists(file.path(saveDirectory, 'performanceEvaluation'))){dir.create(file.path(saveDirectory, 'performanceEvaluation'), recursive = T)}
- utils::write.csv(removeList(result$performanceEvaluation$evaluationStatistics), file = file.path(saveDirectory, 'performanceEvaluation','evaluationStatistics.csv'), row.names = F)
- utils::write.csv(result$performanceEvaluation$thresholdSummary, file = file.path(saveDirectory, 'performanceEvaluation','thresholdSummary.csv'), row.names = F)
+ object = result$executionSummary,
+ fileName = file.path(saveDirectory, "executionSummary.json")
+ )
+
+ # save model as json files
+ savePlpModel(result$model, file.path(saveDirectory, "model"))
+
+ # performanceEvaluation
+ if (!dir.exists(file.path(saveDirectory, "performanceEvaluation"))) {
+ dir.create(file.path(saveDirectory, "performanceEvaluation"), recursive = TRUE)
+ }
+ utils::write.csv(removeList(result$performanceEvaluation$evaluationStatistics), file = file.path(saveDirectory, "performanceEvaluation", "evaluationStatistics.csv"), row.names = FALSE)
+ utils::write.csv(result$performanceEvaluation$thresholdSummary, file = file.path(saveDirectory, "performanceEvaluation", "thresholdSummary.csv"), row.names = FALSE)
utils::write.csv(
removeCellCount(
- result$performanceEvaluation$demographicSummary,
- minCellCount = minCellCount,
- filterColumns = c('PersonCountAtRisk', 'PersonCountWithOutcome')
- ),
- file = file.path(saveDirectory, 'performanceEvaluation','demographicSummary.csv'),
- row.names = F
+ result$performanceEvaluation$demographicSummary,
+ minCellCount = minCellCount,
+ filterColumns = c("PersonCountAtRisk", "PersonCountWithOutcome")
+ ),
+ file = file.path(saveDirectory, "performanceEvaluation", "demographicSummary.csv"),
+ row.names = FALSE
)
- utils::write.csv(result$performanceEvaluation$calibrationSummary, file = file.path(saveDirectory, 'performanceEvaluation','calibrationSummary.csv'), row.names = F)
- utils::write.csv(result$performanceEvaluation$predictionDistribution, file = file.path(saveDirectory, 'performanceEvaluation','predictionDistribution.csv'), row.names = F)
-
- if(!is.null(result$covariateSummary)){
- #covariateSummary
+ utils::write.csv(result$performanceEvaluation$calibrationSummary, file = file.path(saveDirectory, "performanceEvaluation", "calibrationSummary.csv"), row.names = FALSE)
+ utils::write.csv(result$performanceEvaluation$predictionDistribution, file = file.path(saveDirectory, "performanceEvaluation", "predictionDistribution.csv"), row.names = FALSE)
+
+ if (!is.null(result$covariateSummary)) {
+ # covariateSummary
utils::write.csv(
removeCellCount(
result$covariateSummary,
- minCellCount = minCellCount,
- filterColumns = c('CovariateCount', 'WithOutcome_CovariateCount', 'WithNoOutcome_CovariateCount'),
- extraCensorColumns = c('WithOutcome_CovariateMean', 'WithNoOutcome_CovariateMean'),
- restrictColumns = c('covariateId','covariateName', 'analysisId', 'conceptId','CovariateCount', 'covariateValue','WithOutcome_CovariateCount','WithNoOutcome_CovariateCount','WithOutcome_CovariateMean','WithNoOutcome_CovariateMean','StandardizedMeanDiff')
- ),
- file = file.path(saveDirectory,'covariateSummary.csv'),
- row.names = F
+ minCellCount = minCellCount,
+ filterColumns = c("CovariateCount", "WithOutcome_CovariateCount", "WithNoOutcome_CovariateCount"),
+ extraCensorColumns = c("WithOutcome_CovariateMean", "WithNoOutcome_CovariateMean"),
+ restrictColumns = c("covariateId", "covariateName", "analysisId", "conceptId", "CovariateCount", "covariateValue", "WithOutcome_CovariateCount", "WithNoOutcome_CovariateCount", "WithOutcome_CovariateMean", "WithNoOutcome_CovariateMean", "StandardizedMeanDiff")
+ ),
+ file = file.path(saveDirectory, "covariateSummary.csv"),
+ row.names = FALSE
)
}
-
- #analysisRef
+
+ # analysisRef
ParallelLogger::saveSettingsToJson(
- object = result$analysisRef,
- fileName = file.path(saveDirectory, 'analysisRef.json')
- )
-
+ object = result$analysisRef,
+ fileName = file.path(saveDirectory, "analysisRef.json")
+ )
+
return(invisible(saveDirectory))
}
-removeList <- function(x){
-
- if(is.null(x)){
+removeList <- function(x) {
+ if (is.null(x)) {
return(x)
}
-
- for(i in 1:ncol(x)){
- x[,i] <- unlist(x[,i])
+
+ for (i in 1:ncol(x)) {
+ x[, i] <- unlist(x[, i])
}
-
- if('value' %in% colnames(x)){
+
+ if ("value" %in% colnames(x)) {
x$value <- as.double(x$value)
}
-
+
return(x)
-
}
#' Loads the plp result saved as json/csv files for transparent sharing
@@ -484,72 +502,118 @@ removeList <- function(x){
#' Load the main results from json/csv files into a runPlp object
#'
#' @param loadDirectory The directory with the results as json/csv files
-#'
+#'
#' @export
-loadPlpShareable <- function(loadDirectory){
-
+loadPlpShareable <- function(loadDirectory) {
result <- list()
- objects <- gsub('.json', '', gsub('.csv','',dir(loadDirectory)))
- if(sum(!c('covariateSummary','executionSummary','performanceEvaluation', 'model', 'analysisRef')%in%objects)>0){
- stop('Incorrect results file')
+ objects <- gsub(".json", "", gsub(".csv", "", dir(loadDirectory)))
+ if (sum(!c("covariateSummary", "executionSummary", "performanceEvaluation", "model", "analysisRef") %in% objects) > 0) {
+ stop("Incorrect results file")
}
-
+
length(result) <- length(objects)
names(result) <- objects
-
+
# load model settings
- result$model <- loadPlpModel(file.path(loadDirectory,'model'))
-
- #executionSummary
- result$executionSummary <- tryCatch({ParallelLogger::loadSettingsFromJson(fileName = file.path(loadDirectory, 'executionSummary.json'))}, error = function(e){return(NULL)})
-
- #performanceEvaluation
+ result$model <- loadPlpModel(file.path(loadDirectory, "model"))
+
+ # executionSummary
+ result$executionSummary <- tryCatch(
+ {
+ ParallelLogger::loadSettingsFromJson(fileName = file.path(loadDirectory, "executionSummary.json"))
+ },
+ error = function(e) {
+ return(NULL)
+ }
+ )
+
+ # performanceEvaluation
result$performanceEvaluation <- list()
- result$performanceEvaluation$evaluationStatistics <- tryCatch({utils::read.csv(file = file.path(loadDirectory, 'performanceEvaluation','evaluationStatistics.csv'))}, error = function(e){return(NULL)})
- result$performanceEvaluation$thresholdSummary <- tryCatch({utils::read.csv(file = file.path(loadDirectory, 'performanceEvaluation','thresholdSummary.csv'))}, error = function(e){return(NULL)})
- result$performanceEvaluation$demographicSummary <- tryCatch({utils::read.csv(file = file.path(loadDirectory, 'performanceEvaluation','demographicSummary.csv'))}, error = function(e){return(NULL)})
- result$performanceEvaluation$calibrationSummary <- tryCatch({utils::read.csv(file = file.path(loadDirectory, 'performanceEvaluation','calibrationSummary.csv'))}, error = function(e){return(NULL)})
- result$performanceEvaluation$predictionDistribution <- tryCatch({utils::read.csv(file = file.path(loadDirectory, 'performanceEvaluation','predictionDistribution.csv'))}, error = function(e){return(NULL)})
-
- #covariateSummary
- result$covariateSummary <- utils::read.csv(file = file.path(loadDirectory,'covariateSummary.csv'))
-
- #analysisRef
- result$analysisRef <- tryCatch({ParallelLogger::loadSettingsFromJson(fileName = file.path(loadDirectory, 'analysisRef.json'))}, error = function(e){return(NULL)})
-
+ result$performanceEvaluation$evaluationStatistics <- tryCatch(
+ {
+ utils::read.csv(file = file.path(loadDirectory, "performanceEvaluation", "evaluationStatistics.csv"))
+ },
+ error = function(e) {
+ return(NULL)
+ }
+ )
+ result$performanceEvaluation$thresholdSummary <- tryCatch(
+ {
+ utils::read.csv(file = file.path(loadDirectory, "performanceEvaluation", "thresholdSummary.csv"))
+ },
+ error = function(e) {
+ return(NULL)
+ }
+ )
+ result$performanceEvaluation$demographicSummary <- tryCatch(
+ {
+ utils::read.csv(file = file.path(loadDirectory, "performanceEvaluation", "demographicSummary.csv"))
+ },
+ error = function(e) {
+ return(NULL)
+ }
+ )
+ result$performanceEvaluation$calibrationSummary <- tryCatch(
+ {
+ utils::read.csv(file = file.path(loadDirectory, "performanceEvaluation", "calibrationSummary.csv"))
+ },
+ error = function(e) {
+ return(NULL)
+ }
+ )
+ result$performanceEvaluation$predictionDistribution <- tryCatch(
+ {
+ utils::read.csv(file = file.path(loadDirectory, "performanceEvaluation", "predictionDistribution.csv"))
+ },
+ error = function(e) {
+ return(NULL)
+ }
+ )
+
+ # covariateSummary
+ result$covariateSummary <- utils::read.csv(file = file.path(loadDirectory, "covariateSummary.csv"))
+
+ # analysisRef
+ result$analysisRef <- tryCatch(
+ {
+ ParallelLogger::loadSettingsFromJson(fileName = file.path(loadDirectory, "analysisRef.json"))
+ },
+ error = function(e) {
+ return(NULL)
+ }
+ )
+
class(result) <- "runPlp"
return(result)
}
removeCellCount <- function(
- data,
- minCellCount = minCellCount,
- filterColumns = c('CovariateCount', 'WithOutcome_CovariateCount', 'WithNoOutcome_CovariateCount'),
- extraCensorColumns = c('WithOutcome_CovariateMean', 'WithNoOutcome_CovariateMean'),
- restrictColumns = NULL
-){
-
+ data,
+ minCellCount = minCellCount,
+ filterColumns = c("CovariateCount", "WithOutcome_CovariateCount", "WithNoOutcome_CovariateCount"),
+ extraCensorColumns = c("WithOutcome_CovariateMean", "WithNoOutcome_CovariateMean"),
+ restrictColumns = NULL) {
# first restrict to certain columns if required
- if(!is.null(restrictColumns)){
- data <- data[,restrictColumns]
+ if (!is.null(restrictColumns)) {
+ data <- data[, restrictColumns]
}
-
- #next find the rows that need censoring
- ind <- rep(F, nrow(data))
- for(i in 1:length(filterColumns)){
- data[,filterColumns[i]][is.na(data[,filterColumns[i]])] <- 0
- ind <- ind | (data[,filterColumns[i]] < minCellCount)
+
+ # next find the rows that need censoring
+ ind <- rep(FALSE, nrow(data))
+ for (i in 1:length(filterColumns)) {
+ data[, filterColumns[i]][is.na(data[, filterColumns[i]])] <- 0
+ ind <- ind | (data[, filterColumns[i]] < minCellCount)
}
-
+
# now replace these value with -1
-
- removeColumns <- c(filterColumns,extraCensorColumns)[c(filterColumns,extraCensorColumns) %in% colnames(data)]
-
- for(i in 1:length(removeColumns)){
- data[ind,removeColumns[i]] <- NA
+
+ removeColumns <- c(filterColumns, extraCensorColumns)[c(filterColumns, extraCensorColumns) %in% colnames(data)]
+
+ for (i in 1:length(removeColumns)) {
+ data[ind, removeColumns[i]] <- NA
}
-
+
return(data)
}
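# Worked example of the internal censoring helper above (a sketch with
# made-up numbers). Any row where a filter column falls below minCellCount
# gets NA in all filter and extra-censor columns; note the self-referential
# default for minCellCount, so always pass it explicitly.
df <- data.frame(
  covariateId = 1:3,
  CovariateCount = c(100, 4, 50),
  WithOutcome_CovariateCount = c(20, 1, 3),
  WithNoOutcome_CovariateCount = c(80, 3, 47),
  WithOutcome_CovariateMean = c(0.2, 0.1, 0.05),
  WithNoOutcome_CovariateMean = c(0.8, 0.9, 0.95)
)
removeCellCount(df, minCellCount = 10)
# rows 2 and 3 have a count below 10, so their count and mean columns become NA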
@@ -567,145 +631,141 @@ removeCellCount <- function(
#' @param minCellCount The min value to show in cells that are sensitive (values less than this value will be replaced with -1)
#' @param sensitiveColumns A named list (name of table columns belong to) with a list of columns to apply the minCellCount to.
#' @param fileAppend If set to a string this will be appended to the start of the csv file names
-#'
+#'
#' @export
extractDatabaseToCsv <- function(
- conn = NULL,
- connectionDetails,
- databaseSchemaSettings = createDatabaseSchemaSettings(resultSchema = 'main'),
- csvFolder,
- minCellCount = 5,
- sensitiveColumns = getPlpSensitiveColumns(),
- fileAppend = NULL
- ){
-
- ensure_installed('readr')
-
+ conn = NULL,
+ connectionDetails,
+ databaseSchemaSettings = createDatabaseSchemaSettings(resultSchema = "main"),
+ csvFolder,
+ minCellCount = 5,
+ sensitiveColumns = getPlpSensitiveColumns(),
+ fileAppend = NULL) {
+ rlang::check_installed("readr")
+
# check inputs
- if(!is.null(fileAppend)){
- fileAppend <- paste0(gsub('_','',gsub(' ','', fileAppend)), '_')
+ if (!is.null(fileAppend)) {
+ fileAppend <- paste0(gsub("_", "", gsub(" ", "", fileAppend)), "_")
}
-
- if(is.null(conn)){
+
+ if (is.null(conn)) {
# connect
conn <- DatabaseConnector::connect(connectionDetails)
on.exit(DatabaseConnector::disconnect(conn))
}
-
+
# create the folder to save the csv files
- if(!dir.exists(csvFolder)){
- dir.create(csvFolder, recursive = T)
+ if (!dir.exists(csvFolder)) {
+ dir.create(csvFolder, recursive = TRUE)
}
-
+
# get the table names using the function in uploadToDatabase.R
tables <- getPlpResultTables()
-
+
# extract result per table - give option to extract from different cohort/database tables?
modelLocations <- list()
- for(table in tables){
+ for (table in tables) {
sql <- "select * from @resultSchema.@appendtotable@tablename"
sql <- SqlRender::render(
- sql,
+ sql,
resultSchema = databaseSchemaSettings$resultSchema,
appendtotable = databaseSchemaSettings$stringAppendToResultSchemaTables,
- tablename = table
+ tablename = table
)
sql <- SqlRender::translate(
- sql = sql,
- targetDialect = databaseSchemaSettings$targetDialect,
- tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema)
+ sql = sql,
+ targetDialect = databaseSchemaSettings$targetDialect,
+ tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema
+ )
result <- DatabaseConnector::querySql(conn, sql)
-
+
# get the model locations
- if(table == 'MODELS'){
+ if (table == "MODELS") {
modelLocations <- result$PLP_MODEL_FILE
}
-
+
# lower case for consistency in sharing csv results
colnames(result) <- tolower(colnames(result))
-
+
# TODO: add min cell count filter here
- if(tolower(table) %in% names(sensitiveColumns)){
+ if (tolower(table) %in% names(sensitiveColumns)) {
result <- applyMinCellCount(
tableName = table,
sensitiveColumns = sensitiveColumns,
result = result,
minCellCount = minCellCount
- )
+ )
}
-
+
# save the results as a csv
readr::write_excel_csv(
- x = result,
- file = file.path(csvFolder, paste0(fileAppend,tolower(table),'.csv'))
- )
+ x = result,
+ file = file.path(csvFolder, paste0(fileAppend, tolower(table), ".csv"))
+ )
}
-
-
+
+
# load plpModels from database file and save into csv file
- if(length(modelLocations)>0){
- if(!dir.exists(file.path(csvFolder, 'models'))){
- dir.create(file.path(csvFolder, 'models'), recursive = T)
+ if (length(modelLocations) > 0) {
+ if (!dir.exists(file.path(csvFolder, "models"))) {
+ dir.create(file.path(csvFolder, "models"), recursive = TRUE)
}
- for(modelLocation in modelLocations){
- modelLocAppend <- strsplit(x = modelLocation, split = '/')[[1]][length(strsplit(x = modelLocation, split = '/')[[1]])]
+ for (modelLocation in modelLocations) {
+ modelLocAppend <- strsplit(x = modelLocation, split = "/")[[1]][length(strsplit(x = modelLocation, split = "/")[[1]])]
plpModel <- tryCatch(
{
PatientLevelPrediction::loadPlpModel(file.path(modelLocation))
- }, error = function(e){ParallelLogger::logInfo(e); return(NULL)}
+ },
+ error = function(e) {
+ ParallelLogger::logInfo(e)
+ return(NULL)
+ }
)
- if(!is.null(plpModel)){
- PatientLevelPrediction::savePlpModel(plpModel, file.path(csvFolder, 'models', modelLocAppend))
+ if (!is.null(plpModel)) {
+ PatientLevelPrediction::savePlpModel(plpModel, file.path(csvFolder, "models", modelLocAppend))
}
}
}
-
+
return(invisible(NULL))
-
}
-getPlpSensitiveColumns <- function(){
-
+getPlpSensitiveColumns <- function() {
result <- list(
-
prediction_distribution = list(
- c('person_count')
- ),
+ c("person_count")
+ ),
covariate_summary = list(
- c('covariate_count'),
- c('with_no_outcome_covariate_count', 'with_outcome_covariate_count')
+ c("covariate_count"),
+ c("with_no_outcome_covariate_count", "with_outcome_covariate_count")
),
calibration_summary = list(
- c('person_count_at_risk', 'person_count_with_outcome')
+ c("person_count_at_risk", "person_count_with_outcome")
),
-
demographic_summary = list(
- c('person_count_at_risk'),
- c('person_count_with_outcome')
+ c("person_count_at_risk"),
+ c("person_count_with_outcome")
)
-
)
-
+
return(result)
}
applyMinCellCount <- function(
- tableName,
- sensitiveColumns,
- result,
- minCellCount
-){
-
+ tableName,
+ sensitiveColumns,
+ result,
+ minCellCount) {
columnsToCensor <- sensitiveColumns[[tableName]]
-
- for(columns in columnsToCensor){
- rowInd <- apply(result[,columns, drop = F] < minCellCount, 1, sum) > 0
- if(sum(rowInd) > 0){
- result[rowInd , columns] <- -1
+
+ for (columns in columnsToCensor) {
+ rowInd <- apply(result[, columns, drop = FALSE] < minCellCount, 1, sum) > 0
+ if (sum(rowInd) > 0) {
+ result[rowInd, columns] <- -1
}
}
-
+
return(result)
}
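# Illustrative export of stored results to csv files; a sketch with a
# hypothetical SQLite result database (any DatabaseConnector-supported dbms
# would work the same way).
connectionDetails <- DatabaseConnector::createConnectionDetails(
  dbms = "sqlite",
  server = file.path(tempdir(), "plpResults.sqlite") # hypothetical database
)
extractDatabaseToCsv(
  connectionDetails = connectionDetails,
  databaseSchemaSettings = createDatabaseSchemaSettings(resultSchema = "main"),
  csvFolder = file.path(tempdir(), "csvExport"),
  minCellCount = 5
)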
diff --git a/R/Simulation.R b/R/Simulation.R
index b1d115c14..75d8e11f4 100644
--- a/R/Simulation.R
+++ b/R/Simulation.R
@@ -16,65 +16,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-# Create simulation profile
-#
-# @description
-# \code{createplpDataSimulationProfile} creates a profile based on the provided plpData object, which
-# can be used to generate simulated data that has similar characteristics.
-#
-# @param plpData An object of type \code{plpData} as generated using \code{getDbplpData}.
-#
-# @details
-# The output of this function is an object that can be used by the \code{simulateplpData} function to
-# generate a plpData object.
-#
-# @return
-# An object of type \code{plpDataSimulationProfile}.
-#
-# @export
-##createPlpSimulationProfile <- function(plpData) {
-## writeLines("Computing covariate prevalence") # (Note: currently assuming binary covariates)
-## sums <- bySumFf(plpData$covariates$covariateValue, plpData$covariates$covariateId)
-## covariatePrevalence <- sums$sums/nrow(plpData$cohorts)
-## attr(covariatePrevalence, "names") <- sums$bins
-##
-##
-## writeLines("Fitting outcome model(s)")
-## outcomeModels <- vector("list", length(plpData$metaData$outcomeIds))
-## for (i in 1:length(plpData$metaData$outcomeIds)) {
-## outcomeId <- plpData$metaData$outcomeIds[i]
-## model <- fitPredictiveModel(plpData = plpData,
-## outcomeId = outcomeId,
-## modelType = "poisson",
-## prior = Cyclops::createPrior("laplace",
-## exclude = c(0),
-## useCrossValidation = TRUE),
-## control = Cyclops::createControl(noiseLevel = "quiet",
-## cvType = "auto",
-## startingVariance = 0.001,
-## threads = 10))
-## model$coefficients <- model$coefficients[model$coefficients != 0]
-## outcomeModels[[i]] <- model$coefficients
-## }
-##
-## writeLines("Computing time distribution")
-## timePrevalence <- table(ff::as.ram(plpData$cohorts$time))/nrow(plpData$cohorts)
-##
-## if (!is.null(plpData$exclude)) {
-## writeLines("Computing prevalence of exlusion")
-## exclusionPrevalence <- table(ff::as.ram(plpData$exclude$outcomeId))/nrow(plpData$cohorts)
-## } else {
-## exclusionPrevalence <- NULL
-## }
-## result <- list(covariatePrevalence = covariatePrevalence,
-## outcomeModels = outcomeModels,
-## metaData = plpData$metaData,
-## covariateRef = ff::as.ram(plpData$covariateRef),
-## timePrevalence = timePrevalence,
-## exclusionPrevalence = exclusionPrevalence)
-## class(result) <- "plpDataSimulationProfile"
-## return(result)
-##}
#' Generate simulated data
#'
@@ -98,52 +39,60 @@ simulatePlpData <- function(plpDataSimulationProfile, n = 10000) {
# Note: currently, simulation is done completely in-memory. Could easily do batch-wise
writeLines("Generating covariates")
covariatePrevalence <- plpDataSimulationProfile$covariatePrevalence
-
+
personsPerCov <- stats::rpois(n = length(covariatePrevalence), lambda = covariatePrevalence * n)
personsPerCov[personsPerCov > n] <- n
rowId <- sapply(personsPerCov, function(x, n) sample.int(size = x, n), n = n)
rowId <- do.call("c", rowId)
covariateIds <- as.numeric(names(covariatePrevalence))
covariateId <- unlist(sapply(1:length(personsPerCov),
- function(x, personsPerCov, covariateIds) rep(covariateIds[x],
- personsPerCov[x]),
- personsPerCov = personsPerCov,
- covariateIds = covariateIds))
+ function(x, personsPerCov, covariateIds) {
+ rep(covariateIds[x], personsPerCov[x])
+ },
+ personsPerCov = personsPerCov,
+ covariateIds = covariateIds
+ ))
covariateValue <- rep(1, length(covariateId))
- covariateData <- Andromeda::andromeda(covariates = data.frame(rowId = rowId,
- covariateId = covariateId,
- covariateValue = covariateValue),
- covariateRef = plpDataSimulationProfile$covariateRef,
- analysisRef = data.frame(analysisId = 1))
-
+ covariateData <- Andromeda::andromeda(
+ covariates = data.frame(
+ rowId = rowId,
+ covariateId = covariateId,
+ covariateValue = covariateValue
+ ),
+ covariateRef = plpDataSimulationProfile$covariateRef,
+ analysisRef = data.frame(analysisId = 1)
+ )
+
class(covariateData) <- "CovariateData"
-
+
writeLines("Generating cohorts")
cohorts <- data.frame(rowId = 1:n, subjectId = 2e+10 + (1:n), targetId = 1)
breaks <- cumsum(plpDataSimulationProfile$timePrevalence)
r <- stats::runif(n)
cohorts$time <- as.numeric(as.character(cut(r, breaks = c(0, breaks), labels = names(breaks))))
- cohorts$cohortStartDate <- sample(-1000:1000,n,replace=TRUE) + as.Date("2010-01-01")
- cohorts$daysFromObsStart <- sample(1:1000,n,replace=TRUE)
- cohorts$daysToCohortEnd <- sample(1:1000,n,replace=TRUE)
- cohorts$daysToObsEnd <- cohorts$daysToCohortEnd + sample(1:1000,n,replace=TRUE)
- cohorts$ageYear <- sample(0:95,n,replace=TRUE)
- cohorts$gender <- 8532 #female
- cohorts$gender[sample((1:nrow(cohorts)), nrow(cohorts)/2)] <- 8507
-
+ cohorts$cohortStartDate <- sample(-1000:1000, n, replace = TRUE) + as.Date("2010-01-01")
+ cohorts$daysFromObsStart <- sample(1:1000, n, replace = TRUE)
+ cohorts$daysToCohortEnd <- sample(1:1000, n, replace = TRUE)
+ cohorts$daysToObsEnd <- cohorts$daysToCohortEnd + sample(1:1000, n, replace = TRUE)
+ cohorts$ageYear <- sample(0:95, n, replace = TRUE)
+ cohorts$gender <- 8532 # female
+ cohorts$gender[sample((1:nrow(cohorts)), nrow(cohorts) / 2)] <- 8507
+
writeLines("Generating outcomes")
allOutcomes <- data.frame()
for (i in 1:length(plpDataSimulationProfile$metaData$outcomeIds)) {
- coefficients <- data.frame(betas=as.numeric(plpDataSimulationProfile$outcomeModels[[i]]),
- covariateIds=names(plpDataSimulationProfile$outcomeModels[[i]])
- )
+ coefficients <- data.frame(
+ betas = as.numeric(plpDataSimulationProfile$outcomeModels[[i]]),
+ covariateIds = names(plpDataSimulationProfile$outcomeModels[[i]])
+ )
prediction <- predictCyclopsType(coefficients,
- cohorts,
- covariateData,
- modelType = "poisson")
+ cohorts,
+ covariateData,
+ modelType = "poisson"
+ )
outcomes <- merge(prediction, cohorts[, c("rowId", "time")])
- outcomes$value <- outcomes$value * outcomes$time #Value is lambda
+ outcomes$value <- outcomes$value * outcomes$time # Value is lambda
outcomes$outcomeCount <- as.numeric(stats::rpois(n, outcomes$value))
outcomes <- outcomes[outcomes$outcomeCount != 0, ]
outcomes$outcomeId <- plpDataSimulationProfile$metaData$outcomeIds[i]
@@ -151,18 +100,18 @@ simulatePlpData <- function(plpDataSimulationProfile, n = 10000) {
outcomes <- outcomes[, c("rowId", "outcomeId", "outcomeCount", "daysToEvent")]
allOutcomes <- rbind(allOutcomes, outcomes)
}
-
- covariateData$coefficients <- NULL
-
- # add indexes for covariate summary
- Andromeda::createIndex(tbl = covariateData$covariates, columnNames = 'rowId', indexName = 'covsum_rowId')
- Andromeda::createIndex(tbl = covariateData$covariates, columnNames = 'covariateId', indexName = 'covsum_covariateId')
+
+ covariateData$coefficients <- NULL
+
+ # add indexes for covariate summary
+ Andromeda::createIndex(tbl = covariateData$covariates, columnNames = "rowId", indexName = "covsum_rowId")
+ Andromeda::createIndex(tbl = covariateData$covariates, columnNames = "covariateId", indexName = "covsum_covariateId")
# Remove rownames else they will be copied to the ffdf objects:
- metaData = list()
-
+ metaData <- list()
+
metaData$databaseDetails <- list(
- cdmDatabaseSchema = 'CDM_SCHEMA',
+ cdmDatabaseSchema = "CDM_SCHEMA",
cdmDatabaseName = "CDM_NAME",
outcomeDatabaseSchema = NULL,
cohortDatabaseSchema = NULL,
@@ -171,31 +120,34 @@ simulatePlpData <- function(plpDataSimulationProfile, n = 10000) {
cohortTable = NULL,
cdmVersion = 5,
targetId = 1,
- outcomeIds = c(2,3)
+ outcomeIds = c(2, 3)
)
metaData$restrictPlpDataSettings <- PatientLevelPrediction::createRestrictPlpDataSettings()
- metaData$covariateSettings <- FeatureExtraction::createCovariateSettings(useDemographicsAgeGroup = T)
-
-
- attrition <- data.frame(outcomeId=2,description='Simulated data',
- targetCount=nrow(cohorts), uniquePeople=nrow(cohorts),
- outcomes=nrow(outcomes))
+ metaData$covariateSettings <- FeatureExtraction::createCovariateSettings(useDemographicsAgeGroup = TRUE)
+
+
+ attrition <- data.frame(
+ outcomeId = 2, description = "Simulated data",
+ targetCount = nrow(cohorts), uniquePeople = nrow(cohorts),
+ outcomes = nrow(outcomes)
+ )
attr(cohorts, "metaData") <- list(
- targetId = 1,
+ targetId = 1,
attrition = attrition
- )
-
- attr(allOutcomes, "metaData") <- data.frame(outcomeIds = c(2,3))
-
+ )
+
+ attr(allOutcomes, "metaData") <- data.frame(outcomeIds = c(2, 3))
+
attr(covariateData, "metaData") <- list(populationSize = n, cohortId = 1)
-
- result <- list(cohorts = cohorts,
- outcomes = allOutcomes,
- covariateData = covariateData,
- timeRef = NULL,
- metaData = metaData)
-
+
+ result <- list(
+ cohorts = cohorts,
+ outcomes = allOutcomes,
+ covariateData = covariateData,
+ timeRef = NULL,
+ metaData = metaData
+ )
+
class(result) <- "plpData"
return(result)
}
-
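
The covariate step in simulatePlpData draws, for each covariate with prevalence p, a carrier count from Poisson(p * n) (capped at n) and then samples which of the n persons carry it. A standalone sketch of that idea, using two made-up covariate IDs:

    set.seed(42)
    n <- 1000
    covariatePrevalence <- c("1002" = 0.30, "8532001" = 0.55)
    personsPerCov <- stats::rpois(length(covariatePrevalence),
                                  lambda = covariatePrevalence * n)
    personsPerCov <- pmin(personsPerCov, n) # cannot exceed the population
    # sample which persons receive each covariate
    rowId <- unlist(lapply(personsPerCov, function(x) sample.int(n, size = x)))
    covariateId <- rep(as.numeric(names(covariatePrevalence)), personsPerCov)
    head(data.frame(rowId = rowId, covariateId = covariateId, covariateValue = 1))
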
diff --git a/R/SklearnClassifier.R b/R/SklearnClassifier.R
index 850e7b130..21591017e 100644
--- a/R/SklearnClassifier.R
+++ b/R/SklearnClassifier.R
@@ -22,43 +22,36 @@ fitSklearn <- function(trainData,
analysisId,
...) {
param <- modelSettings$param
-
+
# check covariate data
if (!FeatureExtraction::isCovariateData(trainData$covariateData)) {
stop("Needs correct covariateData")
}
-
+
# get the settings from the param
- pySettings <- attr(param, 'settings')
-
+ pySettings <- attr(param, "settings")
+
# make sure the inputs are valid
checkPySettings(pySettings)
-
+
start <- Sys.time()
-
+
if (!is.null(trainData$folds)) {
trainData$labels <-
- merge(trainData$labels, trainData$fold, by = 'rowId')
+ merge(trainData$labels, trainData$fold, by = "rowId")
}
-
+
# convert the data to a sparse R Matrix and then use reticulate to convert to python sparse
# need to save the covariateMap so we know the covariateId to columnId when applying model
mappedData <- toSparseM(trainData)
-
+
matrixData <- mappedData$dataMatrix
labels <- mappedData$labels
covariateRef <- mappedData$covariateRef
-
+
# save the model to outLoc
outLoc <- createTempModelLoc()
-
- # functions does CV and fits final models
- # returns: prediction (Train/CV),
- # finalParam (optimal hyper-parameters)
- # variableImportance (final model)
- # paramGridSearch list with performance and params for complete grid search
- # at the moment it uses AUC as performance but this could be modified to let user
- # specify the performance metric to optimise
+
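+  # gridCvPython performs the hyper-parameter grid search with cross-validation
+  # and then fits the final model; it returns the Train/CV predictions, the
+  # optimal hyper-parameters, the variable importances and the performance
+  # summary of the full grid search (currently optimised on AUC)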
cvResult <- do.call(
what = gridCvPython,
args = list(
@@ -71,38 +64,38 @@ fitSklearn <- function(trainData,
pythonClass = pySettings$pythonClass,
modelLocation = outLoc,
paramSearch = param,
- saveToJson = attr(param, 'saveToJson')
+ saveToJson = attr(param, "saveToJson")
)
)
-
+
hyperSummary <-
- do.call(rbind,
- lapply(cvResult$paramGridSearch, function(x)
- x$hyperSummary))
-
+ do.call(
+ rbind,
+ lapply(cvResult$paramGridSearch, function(x) {
+ x$hyperSummary
+ })
+ )
+
prediction <- cvResult$prediction
-
+
variableImportance <- cvResult$variableImportance
variableImportance[is.na(variableImportance)] <- 0
-
+
incs <- rep(1, nrow(covariateRef))
covariateRef$included <- incs
covariateRef$covariateValue <-
unlist(variableImportance) # check this is correct order
-
+
comp <- start - Sys.time()
-
+
result <- list(
model = file.path(outLoc),
-
preprocessing = list(
- featureEngineering = attr(trainData, "metaData")$featureEngineering,
+ featureEngineering = attr(trainData$covariateData, "metaData")$featureEngineering,
tidyCovariates = attr(trainData$covariateData, "metaData")$tidyCovariateDataSettings,
- requireDenseMatrix = attr(param, 'settings')$requiresDenseMatrix
+ requireDenseMatrix = attr(param, "settings")$requiresDenseMatrix
),
-
prediction = prediction,
-
modelDesign = PatientLevelPrediction::createModelDesign(
targetId = attr(trainData, "metaData")$targetId,
outcomeId = attr(trainData, "metaData")$outcomeId,
@@ -115,32 +108,30 @@ fitSklearn <- function(trainData,
splitSettings = attr(trainData, "metaData")$splitSettings,
sampleSettings = attr(trainData, "metaData")$sampleSettings
),
-
trainDetails = list(
analysisId = analysisId,
- analysisSource = '',
- #TODO add from model
+ analysisSource = "",
+ # TODO add from model
developmentDatabase = attr(trainData, "metaData")$cdmDatabaseName,
developmentDatabaseSchema = attr(trainData, "metaData")$cdmDatabaseSchema,
attrition = attr(trainData, "metaData")$attrition,
- trainingTime = paste(as.character(abs(comp)), attr(comp, 'units')),
+ trainingTime = paste(as.character(abs(comp)), attr(comp, "units")),
trainingDate = Sys.Date(),
modelName = pySettings$name,
finalModelParameters = cvResult$finalParam,
hyperParamSearch = hyperSummary
),
-
covariateImportance = covariateRef
)
-
+
class(result) <- "plpModel"
attr(result, "predictionFunction") <- "predictPythonSklearn"
attr(result, "modelType") <- "binary"
attr(result, "saveType") <-
- attr(param, 'saveType') # in save/load plp
+ attr(param, "saveType") # in save/load plp
attr(result, "saveToJson") <-
- attr(param, 'saveToJson') # when saving in reticulate
-
+ attr(param, "saveToJson") # when saving in reticulate
+
return(result)
}
@@ -148,7 +139,7 @@ fitSklearn <- function(trainData,
predictPythonSklearn <- function(plpModel,
data,
cohort) {
- if (inherits(data, 'plpData')) {
+ if (inherits(data, "plpData")) {
# convert
matrixObjects <- toSparseM(
plpData = data,
@@ -156,74 +147,73 @@ predictPythonSklearn <- function(plpModel,
map = plpModel$covariateImportance %>%
dplyr::select("columnId", "covariateId")
)
-
+
newData <- matrixObjects$dataMatrix
cohort <- matrixObjects$labels
-
- } else{
+ } else {
newData <- data
}
-
+
# load model
- if (attr(plpModel, 'saveToJson')) {
+ if (attr(plpModel, "saveToJson")) {
modelLocation <-
reticulate::r_to_py(file.path(plpModel$model, "model.json"))
model <- sklearnFromJson(path = modelLocation)
- } else{
- os <- reticulate::import('os')
- joblib <- reticulate::import('joblib', convert = FALSE)
+ } else {
+ os <- reticulate::import("os")
+ joblib <- reticulate::import("joblib", convert = FALSE)
modelLocation <-
reticulate::r_to_py(file.path(plpModel$model, "model.pkl"))
model <- joblib$load(os$path$join(modelLocation))
}
included <-
plpModel$covariateImportance$columnId[plpModel$covariateImportance$included >
- 0] # does this include map?
- pythonData <- reticulate::r_to_py(newData[, included, drop = F])
-
+ 0] # does this include map?
+ pythonData <- reticulate::r_to_py(newData[, included, drop = FALSE])
+
# make dense if needed
if (plpModel$preprocessing$requireDenseMatrix) {
pythonData <- pythonData$toarray()
}
-
+
cohort <- predictValues(
model = model,
data = pythonData,
cohort = cohort,
- type = attr(plpModel, 'modelType')
+ type = attr(plpModel, "modelType")
)
-
+
return(cohort)
}
-predictValues <- function(model, data, cohort, type = 'binary') {
- predictionValue <- model$predict_proba(data)
+predictValues <- function(model, data, cohort, type = "binary") {
+ predictionValue <- model$predict_proba(data)
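+  # predict_proba returns an n x 2 matrix; column 2 is the predicted
+  # probability of the positive (outcome) class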
cohort$value <- reticulate::py_to_r(predictionValue)[, 2]
-
+
cohort <- cohort %>%
dplyr::select(-"rowId") %>%
dplyr::rename(rowId = "originalRowId")
-
- attr(cohort, "metaData")$modelType <- type
-
+
+ attr(cohort, "metaData")$modelType <- type
+
return(cohort)
}
checkPySettings <- function(settings) {
- checkIsClass(settings$seed, c('numeric', 'integer'))
- ParallelLogger::logDebug(paste0('classifier seed: ', settings$seed))
-
- checkIsClass(settings$requiresDenseMatrix, c('logical'))
- ParallelLogger::logDebug(paste0('requiresDenseMatrix: ', settings$requiresDenseMatrix))
-
- checkIsClass(settings$name, c('character'))
- ParallelLogger::logDebug(paste0('name: ', settings$name))
-
- checkIsClass(settings$pythonModule, c('character'))
- ParallelLogger::logDebug(paste0('pythonModule: ', settings$pythonModule))
-
- checkIsClass(settings$pythonClass, c('character'))
- ParallelLogger::logDebug(paste0('pythonClass: ', settings$pythonClass))
+ checkIsClass(settings$seed, c("numeric", "integer"))
+ ParallelLogger::logDebug(paste0("classifier seed: ", settings$seed))
+
+ checkIsClass(settings$requiresDenseMatrix, c("logical"))
+ ParallelLogger::logDebug(paste0("requiresDenseMatrix: ", settings$requiresDenseMatrix))
+
+ checkIsClass(settings$name, c("character"))
+ ParallelLogger::logDebug(paste0("name: ", settings$name))
+
+ checkIsClass(settings$pythonModule, c("character"))
+ ParallelLogger::logDebug(paste0("pythonModule: ", settings$pythonModule))
+
+ checkIsClass(settings$pythonClass, c("character"))
+ ParallelLogger::logDebug(paste0("pythonClass: ", settings$pythonClass))
}
gridCvPython <- function(matrixData,
@@ -235,53 +225,50 @@ gridCvPython <- function(matrixData,
pythonClass,
modelLocation,
paramSearch,
- saveToJson)
-{
+ saveToJson) {
ParallelLogger::logInfo(paste0("Running CV for ", modelName, " model"))
-
- np <- reticulate::import('numpy')
- os <- reticulate::import('os')
- sys <- reticulate::import('sys')
- math <- reticulate::import('math')
- scipy <- reticulate::import('scipy')
- joblib <- reticulate::import('joblib')
-
+
+ np <- reticulate::import("numpy")
+ joblib <- reticulate::import("joblib")
+
module <- reticulate::import(pythonModule, convert = FALSE)
classifier <- module[pythonClass]
-
+
###########################################################################
-
+
gridSearchPredictons <- list()
length(gridSearchPredictons) <- length(paramSearch)
-
+
for (gridId in 1:length(paramSearch)) {
# initiate prediction
prediction <- c()
-
+
fold <- labels$index
- ParallelLogger::logInfo(paste0('Max fold: ', max(fold)))
-
+ ParallelLogger::logInfo(paste0("Max fold: ", max(fold)))
+
for (i in 1:max(fold)) {
- ParallelLogger::logInfo(paste0('Fold ', i))
+ ParallelLogger::logInfo(paste0("Fold ", i))
trainY <- reticulate::r_to_py(labels$outcomeCount[fold != i])
trainX <- reticulate::r_to_py(matrixData[fold != i, ])
testX <- reticulate::r_to_py(matrixData[fold == i, ])
-
+
if (requiresDenseMatrix) {
- ParallelLogger::logInfo('Converting sparse martix to dense matrix (CV)')
+        ParallelLogger::logInfo("Converting sparse matrix to dense matrix (CV)")
trainX <- trainX$toarray()
testX <- testX$toarray()
}
-
+
model <-
- fitPythonModel(classifier,
- paramSearch[[gridId]],
- seed,
- trainX,
- trainY,
- np,
- pythonClass)
-
+ fitPythonModel(
+ classifier,
+ paramSearch[[gridId]],
+ seed,
+ trainX,
+ trainY,
+ np,
+ pythonClass
+ )
+
ParallelLogger::logInfo("Calculating predictions on left out fold set...")
prediction <-
rbind(
@@ -290,84 +277,95 @@ gridCvPython <- function(matrixData,
model = model,
data = testX,
cohort = labels[fold == i, ],
- type = 'binary'
+ type = "binary"
)
)
-
}
-
- gridSearchPredictons[[gridId]] <- list(prediction = prediction,
- param = paramSearch[[gridId]])
+
+ gridSearchPredictons[[gridId]] <- list(
+ prediction = prediction,
+ param = paramSearch[[gridId]]
+ )
}
-
+
# get best para (this could be modified to enable any metric instead of AUC, just need metric input in function)
-
+
paramGridSearch <-
lapply(gridSearchPredictons, function(x) {
do.call(computeGridPerformance, x)
- }) # cvAUCmean, cvAUC, param
-
+ }) # cvAUCmean, cvAUC, param
+
optimalParamInd <-
- which.max(unlist(lapply(paramGridSearch, function(x)
- x$cvPerformance)))
-
+ which.max(unlist(lapply(paramGridSearch, function(x) {
+ x$cvPerformance
+ })))
+
finalParam <- paramGridSearch[[optimalParamInd]]$param
-
+
cvPrediction <- gridSearchPredictons[[optimalParamInd]]$prediction
- cvPrediction$evaluationType <- 'CV'
-
- ParallelLogger::logInfo('Training final model using optimal parameters')
-
+ cvPrediction$evaluationType <- "CV"
+
+ ParallelLogger::logInfo("Training final model using optimal parameters")
+
trainY <- reticulate::r_to_py(labels$outcomeCount)
trainX <- reticulate::r_to_py(matrixData)
-
+
if (requiresDenseMatrix) {
- ParallelLogger::logInfo('Converting sparse martix to dense matrix (final model)')
+    ParallelLogger::logInfo("Converting sparse matrix to dense matrix (final model)")
trainX <- trainX$toarray()
}
-
+
model <-
- fitPythonModel(classifier,
- finalParam ,
- seed,
- trainX,
- trainY,
- np,
- pythonClass)
-
+ fitPythonModel(
+ classifier,
+ finalParam,
+ seed,
+ trainX,
+ trainY,
+ np,
+ pythonClass
+ )
+
ParallelLogger::logInfo("Calculating predictions on all train data...")
prediction <-
predictValues(
model = model,
data = trainX,
cohort = labels,
- type = 'binary'
+ type = "binary"
)
- prediction$evaluationType <- 'Train'
-
- prediction <- rbind(prediction,
- cvPrediction)
-
+ prediction$evaluationType <- "Train"
+
+ prediction <- rbind(
+ prediction,
+ cvPrediction
+ )
+
# saving model
if (!dir.exists(file.path(modelLocation))) {
- dir.create(file.path(modelLocation), recursive = T)
+ dir.create(file.path(modelLocation), recursive = TRUE)
}
if (saveToJson) {
- sklearnToJson(model = model,
- path = file.path(modelLocation, "model.json"))
- } else{
- joblib$dump(model, file.path(modelLocation, "model.pkl"), compress = T)
+ sklearnToJson(
+ model = model,
+ path = file.path(modelLocation, "model.json")
+ )
+ } else {
+ joblib$dump(model, file.path(modelLocation, "model.pkl"), compress = TRUE)
}
-
+
# feature importance
variableImportance <-
- tryCatch({
- reticulate::py_to_r(model$feature_importances_)
- }, error = function(e) {
- ParallelLogger::logInfo(e)
- return(rep(1, ncol(matrixData)))
- })
-
+ tryCatch(
+ {
+ reticulate::py_to_r(model$feature_importances_)
+ },
+ error = function(e) {
+ ParallelLogger::logInfo(e)
+ return(rep(1, ncol(matrixData)))
+ }
+ )
+
return(
list(
prediction = prediction,
@@ -376,7 +374,6 @@ gridCvPython <- function(matrixData,
variableImportance = variableImportance
)
)
-
}
@@ -388,47 +385,49 @@ fitPythonModel <-
trainY,
np,
pythonClass) {
- ParallelLogger::logInfo(paste0('data X dim: ', trainX$shape[0], 'x', trainX$shape[1]))
+ ParallelLogger::logInfo(paste0("data X dim: ", trainX$shape[0], "x", trainX$shape[1]))
ParallelLogger::logInfo(paste0(
- 'data Y length: ',
+ "data Y length: ",
np$shape(trainY)[[1]],
- ' with ',
+ " with ",
np$sum(trainY),
- ' outcomes'
+ " outcomes"
))
-
+
timeStart <- Sys.time()
-
+
# print parameters
# convert NULL to NA values
paramString <- param
for (ind in 1:length(paramString)) {
if (is.null(paramString[[ind]])) {
- paramString[[ind]] <- 'null'
+ paramString[[ind]] <- "null"
}
}
ParallelLogger::logInfo(paste(
names(param),
unlist(paramString),
- sep = ':',
- collapse = ' '
+ sep = ":",
+ collapse = " "
))
-
+
if (!is.null(param)) {
model <-
- do.call(paste0(pythonClass, 'Inputs'),
- list(classifier = classifier, param = param))
- } else{
+ do.call(
+ paste0(pythonClass, "Inputs"),
+ list(classifier = classifier, param = param)
+ )
+ } else {
model <- classifier()
}
model <- model$fit(trainX, trainY)
timeEnd <- Sys.time()
-
+
ParallelLogger::logInfo(paste0(
"Training model took (mins): ",
- difftime(timeEnd, timeStart, units = 'mins')
+ difftime(timeEnd, timeStart, units = "mins")
))
-
+
return(model)
}
@@ -443,36 +442,33 @@ fitPythonModel <-
#' @return A list with overview of the performance
#' @export
computeGridPerformance <-
- function(prediction, param, performanceFunct = 'computeAuc') {
- performance <- do.call(what = eval(parse(text = performanceFunct)),
- args = list(prediction = prediction))
-
+ function(prediction, param, performanceFunct = "computeAuc") {
+ performance <- do.call(
+ what = eval(parse(text = performanceFunct)),
+ args = list(prediction = prediction)
+ )
+
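+    # also compute the metric within each fold so the spread across folds is reported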
performanceFold <- c()
for (i in 1:max(prediction$index)) {
- performanceFold <- c(performanceFold,
- do.call(
- what = eval(parse(text = performanceFunct)),
- args = list(prediction = prediction[prediction$index == i, ])
- ))
+ performanceFold <- c(
+ performanceFold,
+ do.call(
+ what = eval(parse(text = performanceFunct)),
+ args = list(prediction = prediction[prediction$index == i, ])
+ )
+ )
}
-
+
paramString <- param
for (ind in 1:length(paramString)) {
if (is.null(paramString[[ind]])) {
- paramString[[ind]] <- 'null'
+ paramString[[ind]] <- "null"
}
}
-
- #hyperSummary <- c(performanceFunct, performance, performanceFold, unlist(paramString))
- #names(hyperSummary) <- c(
- # 'Metric',
- # 'cvPerformance',
- # paste0('cvPerformanceFold',1:length(performanceFold)),
- # names(param)
- #)
+
paramValues <- unlist(paramString)
names(paramValues) <- names(param)
-
+
hyperSummary <- as.data.frame(c(
data.frame(
metric = performanceFunct,
diff --git a/R/SklearnClassifierHelpers.R b/R/SklearnClassifierHelpers.R
index 0535d833e..e4d85753b 100644
--- a/R/SklearnClassifierHelpers.R
+++ b/R/SklearnClassifierHelpers.R
@@ -29,4 +29,3 @@ listCartesian <- function(allList) {
function(i) lapply(combinations, function(x) x[i][[1]]))
return(results)
}
-
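
listCartesian, used by all of the set* functions that follow, expands the per-parameter lists into one named list per hyper-parameter combination. A sketch of the expected behaviour:

    grid <- listCartesian(list(
      nEstimators = list(10, 50),
      learningRate = list(1, 0.1)
    ))
    length(grid) # 4 combinations from the 2 x 2 grid
    grid[[1]]    # e.g. list(nEstimators = 10, learningRate = 1)
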
diff --git a/R/SklearnClassifierSettings.R b/R/SklearnClassifierSettings.R
index 1b6446b70..bcb25c5d0 100644
--- a/R/SklearnClassifierSettings.R
+++ b/R/SklearnClassifierSettings.R
@@ -20,90 +20,106 @@
#' @param nEstimators (list) The maximum number of estimators at which boosting is terminated. In case of perfect fit, the learning procedure is stopped early.
#' @param learningRate (list) Weight applied to each classifier at each boosting iteration. A higher learning rate increases the contribution of each classifier. There is a trade-off between the learningRate and nEstimators parameters
#' There is a trade-off between learningRate and nEstimators.
-#' @param algorithm (list) If ‘SAMME.R’ then use the SAMME.R real boosting algorithm. base_estimator must support calculation of class probabilities. If ‘SAMME’ then use the SAMME discrete boosting algorithm. The SAMME.R algorithm typically converges faster than SAMME, achieving a lower test error with fewer boosting iterations.
+#' @param algorithm (list) Only ‘SAMME’ can be provided; the ‘algorithm’ argument will be deprecated in scikit-learn 1.8.
#' @param seed A seed for the model
#'
#' @examples
#' \dontrun{
-#' model.adaBoost <- setAdaBoost(nEstimators = list(10,50,200), learningRate = list(1, 0.5, 0.1),
-#' algorithm = list('SAMME.R'), seed = sample(1000000,1)
-#' )
+#' model.adaBoost <- setAdaBoost(
+#' nEstimators = list(10, 50, 200), learningRate = list(1, 0.5, 0.1),
+#'   algorithm = list("SAMME"), seed = sample(1000000, 1)
+#' )
#' }
#' @export
setAdaBoost <- function(nEstimators = list(10, 50, 200),
learningRate = list(1, 0.5, 0.1),
- algorithm = list('SAMME.R'),
+ algorithm = list("SAMME"),
seed = sample(1000000, 1)) {
+ rlang::check_installed(
+ "reticulate",
+ reason = "Reticulate is required to use the Python models"
+ )
+  tryCatch(
+    {
+      reticulate::import("sklearn")
+    },
+    error = function(e) {
+      stop("scikit-learn must be installed in a Python environment reachable by reticulate to use the Python models")
+    }
+  )
+
checkIsClass(seed[[1]], c("numeric", "integer"))
- checkIsClass(nEstimators, 'list')
- checkIsClass(learningRate, 'list')
- checkIsClass(algorithm, 'list')
-
- lapply(1:length(nEstimators), function(i)
- checkIsClass(nEstimators[[i]] , c("integer", "numeric")))
- lapply(1:length(nEstimators), function(i)
- checkHigher(nEstimators[[i]] , 0))
-
+ checkIsClass(nEstimators, "list")
+ checkIsClass(learningRate, "list")
+ checkIsClass(algorithm, "list")
+
+ lapply(1:length(nEstimators), function(i) {
+ checkIsClass(nEstimators[[i]], c("integer", "numeric"))
+ })
+ lapply(1:length(nEstimators), function(i) {
+ checkHigher(nEstimators[[i]], 0)
+ })
+
for (i in 1:length(nEstimators)) {
if (inherits(x = nEstimators[[i]], what = c("numeric", "integer"))) {
nEstimators[[i]] <- as.integer(nEstimators[[i]])
}
}
-
- lapply(1:length(learningRate), function(i)
- checkIsClass(learningRate[[i]] , c("numeric")))
- lapply(1:length(learningRate), function(i)
- checkHigher(learningRate[[i]] , 0))
-
- lapply(1:length(algorithm), function(i)
- checkIsClass(algorithm[[i]] , c("character")))
-
- # test python is available and the required dependancies are there:
- ##checkPython()
-
+
+ lapply(1:length(learningRate), function(i) {
+ checkIsClass(learningRate[[i]], c("numeric"))
+ })
+ lapply(1:length(learningRate), function(i) {
+ checkHigher(learningRate[[i]], 0)
+ })
+
+ lapply(1:length(algorithm), function(i) {
+ checkIsClass(algorithm[[i]], c("character"))
+ checkIsEqual(algorithm[[i]], "SAMME")
+ })
+
paramGrid <- list(
nEstimators = nEstimators,
learningRate = learningRate,
algorithm = algorithm,
seed = list(as.integer(seed[[1]]))
)
-
+
param <- listCartesian(paramGrid)
-
- attr(param, 'settings') <- list(
- modelType = 'adaBoost',
+
+ attr(param, "settings") <- list(
+ modelType = "adaBoost",
seed = seed[[1]],
paramNames = names(paramGrid),
- #use this for logging params
- requiresDenseMatrix = F,
+ # use this for logging params
+ requiresDenseMatrix = FALSE,
name = "AdaBoost",
pythonModule = "sklearn.ensemble",
pythonClass = "AdaBoostClassifier"
)
-
- attr(param, 'saveToJson') <- T
- attr(param, 'saveType') <- 'file'
-
- result <- list(fitFunction = "fitSklearn",
- param = param)
+
+ attr(param, "saveToJson") <- TRUE
+ attr(param, "saveType") <- "file"
+
+ result <- list(
+ fitFunction = "fitSklearn",
+ param = param
+ )
class(result) <- "modelSettings"
-
+
return(result)
}
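+# note: param[[which.max(names(param) == "x")]] in the *Inputs helpers picks the
+# first element of param named "x" (which.max returns the first TRUE index)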
AdaBoostClassifierInputs <- function(classifier, param) {
model <- classifier(
- n_estimators = param[[which.max(names(param) == 'nEstimators')]],
- learning_rate = param[[which.max(names(param) == 'learningRate')]],
- algorithm = param[[which.max(names(param) == 'algorithm')]],
- random_state = param[[which.max(names(param) == 'seed')]]
+ n_estimators = param[[which.max(names(param) == "nEstimators")]],
+ learning_rate = param[[which.max(names(param) == "learningRate")]],
+ algorithm = param[[which.max(names(param) == "algorithm")]],
+ random_state = param[[which.max(names(param) == "seed")]]
)
-
+
return(model)
}
-#' Create setting for the scikit-learn 1.0.1 DecisionTree with python
+#' Create setting for the scikit-learn DecisionTree with python
#' @param criterion The function to measure the quality of a split. Supported criteria are “gini” for the Gini impurity and “entropy” for the information gain.
#' @param splitter The strategy used to choose the split at each node. Supported strategies are “best” to choose the best split and “random” to choose the best random split.
#' @param maxDepth (list) The maximum depth of the tree. If NULL, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.
@@ -118,144 +134,183 @@ AdaBoostClassifierInputs <- function(classifier, param) {
#'
#' @examples
#' \dontrun{
-#' model.decisionTree <- setDecisionTree(maxDepth=10,minSamplesLeaf=10, seed=NULL )
+#' model.decisionTree <- setDecisionTree(maxDepth = list(10), minSamplesLeaf = list(10), seed = sample(1000000, 1))
#' }
#' @export
-setDecisionTree <- function(criterion = list('gini'),
- splitter = list('best'),
+setDecisionTree <- function(criterion = list("gini"),
+ splitter = list("best"),
maxDepth = list(as.integer(4), as.integer(10), NULL),
minSamplesSplit = list(2, 10),
minSamplesLeaf = list(10, 50),
minWeightFractionLeaf = list(0),
- maxFeatures = list(100, 'sqrt', NULL),
+ maxFeatures = list(100, "sqrt", NULL),
maxLeafNodes = list(NULL),
- minImpurityDecrease = list(10 ^ -7),
+ minImpurityDecrease = list(10^-7),
classWeight = list(NULL),
seed = sample(1000000, 1)) {
- if (!inherits(x = seed[[1]], what = c('numeric', 'integer'))) {
- stop('Invalid seed')
+ rlang::check_installed(
+ "reticulate",
+ reason = "Reticulate is required to use the Python models"
+ )
+  tryCatch(
+    {
+      reticulate::import("sklearn")
+    },
+    error = function(e) {
+      stop("scikit-learn must be installed in a Python environment reachable by reticulate to use the Python models")
+    }
+  )
+ if (!inherits(x = seed[[1]], what = c("numeric", "integer"))) {
+ stop("Invalid seed")
}
-
- checkIsClass(criterion, 'list')
- checkIsClass(splitter, 'list')
- checkIsClass(maxDepth, 'list')
- checkIsClass(minSamplesSplit, 'list')
- checkIsClass(minSamplesLeaf, 'list')
- checkIsClass(minWeightFractionLeaf, 'list')
- checkIsClass(maxFeatures, 'list')
- checkIsClass(maxLeafNodes, 'list')
- checkIsClass(minImpurityDecrease, 'list')
- checkIsClass(classWeight, 'list')
-
- lapply(1:length(criterion), function(i)
- checkIsClass(criterion[[i]] , 'character'))
- lapply(1:length(splitter), function(i)
- checkIsClass(splitter[[i]] , 'character'))
-
-
- lapply(1:length(criterion),
- function(i) {
- if (!criterion[[i]] %in% c('gini', 'entropy')) {
- stop('Incorrect criterion')
- }
- })
-
-
- lapply(1:length(maxDepth), function(i)
- checkIsClass(maxDepth[[i]] , c("numeric", "integer", "NULL")))
- lapply(1:length(maxDepth), function(i)
- checkHigher(ifelse(is.null(maxDepth[[i]]), 1, maxDepth[[i]]) , 0))
+
+ checkIsClass(criterion, "list")
+ checkIsClass(splitter, "list")
+ checkIsClass(maxDepth, "list")
+ checkIsClass(minSamplesSplit, "list")
+ checkIsClass(minSamplesLeaf, "list")
+ checkIsClass(minWeightFractionLeaf, "list")
+ checkIsClass(maxFeatures, "list")
+ checkIsClass(maxLeafNodes, "list")
+ checkIsClass(minImpurityDecrease, "list")
+ checkIsClass(classWeight, "list")
+
+ lapply(1:length(criterion), function(i) {
+ checkIsClass(criterion[[i]], "character")
+ })
+ lapply(1:length(splitter), function(i) {
+ checkIsClass(splitter[[i]], "character")
+ })
+
+
+ lapply(
+ 1:length(criterion),
+ function(i) {
+ if (!criterion[[i]] %in% c("gini", "entropy")) {
+ stop("Incorrect criterion")
+ }
+ }
+ )
+
+
+ lapply(1:length(maxDepth), function(i) {
+ checkIsClass(maxDepth[[i]], c("numeric", "integer", "NULL"))
+ })
+ lapply(1:length(maxDepth), function(i) {
+ checkHigher(ifelse(is.null(maxDepth[[i]]), 1, maxDepth[[i]]), 0)
+ })
for (i in 1:length(maxDepth)) {
- if (inherits(x = maxDepth[[i]], what = c("numeric", "integer"))) {
+ if (inherits(x = maxDepth[[i]], what = c("numeric", "integer"))) {
maxDepth[[i]] <- as.integer(maxDepth[[i]])
}
}
-
- lapply(1:length(minSamplesSplit),
- function(i)
- checkIsClass(minSamplesSplit[[i]] , c("numeric", "integer", "NULL")))
- lapply(1:length(minSamplesSplit),
- function(i)
- checkHigher(ifelse(
- is.null(minSamplesSplit[[i]]), 1, minSamplesSplit[[i]]
- ) , 0))
-
+
+ lapply(
+ 1:length(minSamplesSplit),
+ function(i) {
+ checkIsClass(minSamplesSplit[[i]], c("numeric", "integer", "NULL"))
+ }
+ )
+ lapply(
+ 1:length(minSamplesSplit),
+ function(i) {
+ checkHigher(ifelse(
+ is.null(minSamplesSplit[[i]]), 1, minSamplesSplit[[i]]
+ ), 0)
+ }
+ )
+
# convert to integer if >= 1
for (i in 1:length(minSamplesSplit)) {
if (minSamplesSplit[[i]] >= 1) {
minSamplesSplit[[i]] <- as.integer(minSamplesSplit[[i]])
}
}
-
-
- lapply(1:length(minSamplesLeaf),
- function(i)
- checkIsClass(minSamplesLeaf[[i]] , c("numeric", "integer")))
- lapply(1:length(minSamplesLeaf),
- function(i)
- checkHigher(minSamplesLeaf[[i]] , 0))
-
+
+
+ lapply(
+ 1:length(minSamplesLeaf),
+ function(i) {
+ checkIsClass(minSamplesLeaf[[i]], c("numeric", "integer"))
+ }
+ )
+ lapply(
+ 1:length(minSamplesLeaf),
+ function(i) {
+ checkHigher(minSamplesLeaf[[i]], 0)
+ }
+ )
+
# convert to integer if >= 1
for (i in 1:length(minSamplesLeaf)) {
if (minSamplesLeaf[[i]] >= 1) {
minSamplesLeaf[[i]] <- as.integer(minSamplesLeaf[[i]])
}
}
-
- lapply(1:length(minWeightFractionLeaf),
- function(i)
- checkIsClass(minWeightFractionLeaf[[i]] , c("numeric")))
- lapply(1:length(minWeightFractionLeaf),
- function(i)
- checkHigherEqual(minWeightFractionLeaf[[i]] , 0))
-
- lapply(1:length(maxFeatures),
- function(i)
- checkIsClass(maxFeatures[[i]] , c(
- "numeric", "integer", "character", "NULL"
- )))
-
+
+ lapply(
+ 1:length(minWeightFractionLeaf),
+ function(i) {
+ checkIsClass(minWeightFractionLeaf[[i]], c("numeric"))
+ }
+ )
+ lapply(
+ 1:length(minWeightFractionLeaf),
+ function(i) {
+ checkHigherEqual(minWeightFractionLeaf[[i]], 0)
+ }
+ )
+
+ lapply(
+ 1:length(maxFeatures),
+ function(i) {
+ checkIsClass(maxFeatures[[i]], c(
+ "numeric", "integer", "character", "NULL"
+ ))
+ }
+ )
+
for (i in 1:length(maxFeatures)) {
- if (inherits(x = maxFeatures[[i]], what = c("numeric", "integer"))) {
+ if (inherits(x = maxFeatures[[i]], what = c("numeric", "integer"))) {
maxFeatures[[i]] <- as.integer(maxFeatures[[i]])
}
}
-
- lapply(1:length(maxLeafNodes),
- function(i)
- checkIsClass(maxLeafNodes[[i]], c("integer", "NULL")))
- lapply(1:length(maxLeafNodes),
- function(i)
- checkHigher(ifelse(
- is.null(maxLeafNodes[[i]]), 1, maxLeafNodes[[i]]
- ) , 0))
-
+
+ lapply(
+ 1:length(maxLeafNodes),
+ function(i) {
+ checkIsClass(maxLeafNodes[[i]], c("integer", "NULL"))
+ }
+ )
+ lapply(
+ 1:length(maxLeafNodes),
+ function(i) {
+ checkHigher(ifelse(
+ is.null(maxLeafNodes[[i]]), 1, maxLeafNodes[[i]]
+ ), 0)
+ }
+ )
+
for (i in 1:length(maxLeafNodes)) {
- if (inherits(x = maxLeafNodes[[i]], what = c("numeric", "integer"))) {
+ if (inherits(x = maxLeafNodes[[i]], what = c("numeric", "integer"))) {
maxLeafNodes[[i]] <- as.integer(maxLeafNodes[[i]])
}
}
-
- lapply(1:length(minImpurityDecrease),
- function(i)
- checkIsClass(minImpurityDecrease[[i]] , c("numeric")))
- lapply(1:length(minImpurityDecrease),
- function(i)
- checkHigherEqual(minImpurityDecrease[[i]], 0))
-
- lapply(1:length(classWeight), function(i)
- checkIsClass(classWeight[[i]] , c('character', 'NULL')))
-
- # test python is available and the required dependancies are there:
- ##checkPython()
-
- # scikit-learn 1.0.1 inputs:
- # criterion='gini', splitter='best', max_depth=None, min_samples_split=2,
- # min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None,
- # max_leaf_nodes=None, min_impurity_decrease=0.0, class_weight=None, ccp_alpha=0.0
-
- # must be correct order for python classifier as I can't find a way to do.call a named list
- # using reticulate
+
+ lapply(
+ 1:length(minImpurityDecrease),
+ function(i) {
+ checkIsClass(minImpurityDecrease[[i]], c("numeric"))
+ }
+ )
+ lapply(
+ 1:length(minImpurityDecrease),
+ function(i) {
+ checkHigherEqual(minImpurityDecrease[[i]], 0)
+ }
+ )
+
+ lapply(1:length(classWeight), function(i) {
+ checkIsClass(classWeight[[i]], c("character", "NULL"))
+ })
+
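+  # paramGrid order must match the positional argument order expected by the
+  # python classifier, since reticulate cannot do.call a named list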
paramGrid <- list(
criterion = criterion,
splitter = splitter,
@@ -270,44 +325,45 @@ setDecisionTree <- function(criterion = list('gini'),
classWeight = classWeight
)
param <- listCartesian(paramGrid)
-
- attr(param, 'settings') <- list(
- modelType = 'decisionTree',
+
+ attr(param, "settings") <- list(
+ modelType = "decisionTree",
seed = seed[[1]],
paramNames = names(paramGrid),
- #use this for logging params
- requiresDenseMatrix = F,
+ requiresDenseMatrix = FALSE,
name = "Decision Tree",
pythonModule = "sklearn.tree",
pythonClass = "DecisionTreeClassifier"
)
-
- attr(param, 'saveToJson') <- T
- attr(param, 'saveType') <- 'file'
-
- result <- list(fitFunction = "fitSklearn",
- param = param)
+
+ attr(param, "saveToJson") <- TRUE
+ attr(param, "saveType") <- "file"
+
+ result <- list(
+ fitFunction = "fitSklearn",
+ param = param
+ )
class(result) <- "modelSettings"
-
+
return(result)
}
DecisionTreeClassifierInputs <- function(classifier, param) {
model <- classifier(
- criterion = param[[which.max(names(param) == 'criterion')]],
- splitter = param[[which.max(names(param) == 'splitter')]],
- max_depth = param[[which.max(names(param) == 'maxDepth')]],
- min_samples_split = param[[which.max(names(param) == 'minSamplesSplit')]],
- min_samples_leaf = param[[which.max(names(param) == 'minSamplesLeaf')]],
- min_weight_fraction_leaf = param[[which.max(names(param) == 'minWeightFractionLeaf')]],
- max_features = param[[which.max(names(param) == 'maxFeatures')]],
- random_state = param[[which.max(names(param) == 'seed')]],
- max_leaf_nodes = param[[which.max(names(param) == 'maxLeafNodes')]],
- min_impurity_decrease = param[[which.max(names(param) == 'minImpurityDecrease')]],
- class_weight = param[[which.max(names(param) == 'classWeight')]]
- )
-
+ criterion = param[[which.max(names(param) == "criterion")]],
+ splitter = param[[which.max(names(param) == "splitter")]],
+ max_depth = param[[which.max(names(param) == "maxDepth")]],
+ min_samples_split = param[[which.max(names(param) == "minSamplesSplit")]],
+ min_samples_leaf = param[[which.max(names(param) == "minSamplesLeaf")]],
+ min_weight_fraction_leaf = param[[which.max(names(param) == "minWeightFractionLeaf")]],
+ max_features = param[[which.max(names(param) == "maxFeatures")]],
+ random_state = param[[which.max(names(param) == "seed")]],
+ max_leaf_nodes = param[[which.max(names(param) == "maxLeafNodes")]],
+ min_impurity_decrease = param[[which.max(names(param) == "minImpurityDecrease")]],
+ class_weight = param[[which.max(names(param) == "classWeight")]]
+ )
+
return(model)
}
@@ -348,12 +404,12 @@ DecisionTreeClassifierInputs <- function(classifier, param) {
#' }
#' @export
setMLP <- function(hiddenLayerSizes = list(c(100), c(20)),
- #must be integers
- activation = list('relu'),
- solver = list('adam'),
+ # must be integers
+ activation = list("relu"),
+ solver = list("adam"),
alpha = list(0.3, 0.01, 0.0001, 0.000001),
- batchSize = list('auto'),
- learningRate = list('constant'),
+ batchSize = list("auto"),
+ learningRate = list("constant"),
learningRateInit = list(0.001),
powerT = list(0.5),
maxIter = list(200, 100),
@@ -369,52 +425,61 @@ setMLP <- function(hiddenLayerSizes = list(c(100), c(20)),
epsilon = list(0.00000001),
nIterNoChange = list(10),
seed = sample(100000, 1)) {
- checkIsClass(seed, c('numeric', 'integer'))
- checkIsClass(hiddenLayerSizes, c('list'))
- checkIsClass(activation, c('list'))
- checkIsClass(solver, c('list'))
- checkIsClass(alpha, c('list'))
- checkIsClass(batchSize, c('list'))
- checkIsClass(learningRate, c('list'))
- checkIsClass(learningRateInit, c('list'))
- checkIsClass(powerT, c('list'))
- checkIsClass(maxIter, c('list'))
- checkIsClass(shuffle, c('list'))
- checkIsClass(tol, c('list'))
- checkIsClass(warmStart, c('list'))
- checkIsClass(momentum, c('list'))
- checkIsClass(nesterovsMomentum, c('list'))
- checkIsClass(earlyStopping, c('list'))
- checkIsClass(validationFraction, c('list'))
- checkIsClass(beta1, c('list'))
- checkIsClass(beta2, c('list'))
- checkIsClass(epsilon, c('list'))
- checkIsClass(nIterNoChange, c('list'))
-
-
+ rlang::check_installed(
+ "reticulate",
+ reason = "Reticulate is required to use the Python models"
+ )
+  tryCatch(
+    {
+      reticulate::import("sklearn")
+    },
+    error = function(e) {
+      stop("scikit-learn must be installed in a Python environment reachable by reticulate to use the Python models")
+    }
+  )
+ checkIsClass(seed, c("numeric", "integer"))
+ checkIsClass(hiddenLayerSizes, c("list"))
+ checkIsClass(activation, c("list"))
+ checkIsClass(solver, c("list"))
+ checkIsClass(alpha, c("list"))
+ checkIsClass(batchSize, c("list"))
+ checkIsClass(learningRate, c("list"))
+ checkIsClass(learningRateInit, c("list"))
+ checkIsClass(powerT, c("list"))
+ checkIsClass(maxIter, c("list"))
+ checkIsClass(shuffle, c("list"))
+ checkIsClass(tol, c("list"))
+ checkIsClass(warmStart, c("list"))
+ checkIsClass(momentum, c("list"))
+ checkIsClass(nesterovsMomentum, c("list"))
+ checkIsClass(earlyStopping, c("list"))
+ checkIsClass(validationFraction, c("list"))
+ checkIsClass(beta1, c("list"))
+ checkIsClass(beta2, c("list"))
+ checkIsClass(epsilon, c("list"))
+ checkIsClass(nIterNoChange, c("list"))
+
+
for (i in 1:length(hiddenLayerSizes)) {
hiddenLayerSizes[[i]] <- as.integer(hiddenLayerSizes[[i]])
}
-
-
+
+
for (i in 1:length(batchSize)) {
- if (inherits(x = batchSize[[i]], what = c("numeric", "integer"))) {
+ if (inherits(x = batchSize[[i]], what = c("numeric", "integer"))) {
batchSize[[i]] <- as.integer(batchSize[[i]])
}
}
-
+
for (i in 1:length(maxIter)) {
- if (inherits(x = maxIter[[i]], what = c("numeric", "integer"))) {
+ if (inherits(x = maxIter[[i]], what = c("numeric", "integer"))) {
maxIter[[i]] <- as.integer(maxIter[[i]])
}
}
-
+
for (i in 1:length(nIterNoChange)) {
- if (inherits(x = nIterNoChange[[i]], what = c("numeric", "integer"))) {
+ if (inherits(x = nIterNoChange[[i]], what = c("numeric", "integer"))) {
nIterNoChange[[i]] <- as.integer(nIterNoChange[[i]])
}
}
-
+
# add lapply for values...
paramGrid <- list(
hiddenLayerSizes = hiddenLayerSizes,
@@ -436,60 +501,62 @@ setMLP <- function(hiddenLayerSizes = list(c(100), c(20)),
earlyStopping = earlyStopping,
validationFraction = validationFraction,
beta1 = beta1,
- beta2 = beta2 ,
+ beta2 = beta2,
epsilon = epsilon,
nIterNoChange = nIterNoChange
)
-
+
param <- listCartesian(paramGrid)
-
- attr(param, 'settings') <- list(
- modelType = 'mlp',
+
+ attr(param, "settings") <- list(
+ modelType = "mlp",
seed = seed[[1]],
paramNames = names(paramGrid),
- #use this for logging params
- requiresDenseMatrix = F,
+ # use this for logging params
+ requiresDenseMatrix = FALSE,
name = "Neural Network",
pythonModule = "sklearn.neural_network",
pythonClass = "MLPClassifier"
)
-
- attr(param, 'saveToJson') <- T
- attr(param, 'saveType') <- 'file'
-
- result <- list(fitFunction = "fitSklearn",
- param = param)
+
+ attr(param, "saveToJson") <- TRUE
+ attr(param, "saveType") <- "file"
+
+ result <- list(
+ fitFunction = "fitSklearn",
+ param = param
+ )
class(result) <- "modelSettings"
-
+
return(result)
}
MLPClassifierInputs <- function(classifier, param) {
model <- classifier(
- hidden_layer_sizes = param[[which.max(names(param) == 'hiddenLayerSizes')]],
- activation = param[[which.max(names(param) == 'activation')]],
- solver = param[[which.max(names(param) == 'solver')]],
- alpha = param[[which.max(names(param) == 'alpha')]],
- batch_size = param[[which.max(names(param) == 'batchSize')]],
- learning_rate = param[[which.max(names(param) == 'learningRate')]],
- learning_rate_init = param[[which.max(names(param) == 'learningRateInit')]],
- power_t = param[[which.max(names(param) == 'powerT')]],
- max_iter = param[[which.max(names(param) == 'maxIter')]],
- shuffle = param[[which.max(names(param) == 'shuffle')]],
- random_state = param[[which.max(names(param) == 'seed')]],
- tol = param[[which.max(names(param) == 'tol')]],
- verbose = F,
- warm_start = param[[which.max(names(param) == 'warmStart')]],
- momentum = param[[which.max(names(param) == 'momentum')]],
- nesterovs_momentum = param[[which.max(names(param) == 'nesterovsMomentum')]],
- early_stopping = param[[which.max(names(param) == 'earlyStopping')]],
- validation_fraction = param[[which.max(names(param) == 'validationFraction')]],
- beta_1 = param[[which.max(names(param) == 'beta1')]],
- beta_2 = param[[which.max(names(param) == 'beta2')]],
- epsilon = param[[which.max(names(param) == 'epsilon')]],
- n_iter_no_change = param[[which.max(names(param) == 'nIterNoChange')]]
- )
-
+ hidden_layer_sizes = param[[which.max(names(param) == "hiddenLayerSizes")]],
+ activation = param[[which.max(names(param) == "activation")]],
+ solver = param[[which.max(names(param) == "solver")]],
+ alpha = param[[which.max(names(param) == "alpha")]],
+ batch_size = param[[which.max(names(param) == "batchSize")]],
+ learning_rate = param[[which.max(names(param) == "learningRate")]],
+ learning_rate_init = param[[which.max(names(param) == "learningRateInit")]],
+ power_t = param[[which.max(names(param) == "powerT")]],
+ max_iter = param[[which.max(names(param) == "maxIter")]],
+ shuffle = param[[which.max(names(param) == "shuffle")]],
+ random_state = param[[which.max(names(param) == "seed")]],
+ tol = param[[which.max(names(param) == "tol")]],
+ verbose = FALSE,
+ warm_start = param[[which.max(names(param) == "warmStart")]],
+ momentum = param[[which.max(names(param) == "momentum")]],
+ nesterovs_momentum = param[[which.max(names(param) == "nesterovsMomentum")]],
+ early_stopping = param[[which.max(names(param) == "earlyStopping")]],
+ validation_fraction = param[[which.max(names(param) == "validationFraction")]],
+ beta_1 = param[[which.max(names(param) == "beta1")]],
+ beta_2 = param[[which.max(names(param) == "beta2")]],
+ epsilon = param[[which.max(names(param) == "epsilon")]],
+ n_iter_no_change = param[[which.max(names(param) == "nIterNoChange")]]
+ )
+
return(model)
}
@@ -503,35 +570,41 @@ MLPClassifierInputs <- function(classifier, param) {
#' }
#' @export
setNaiveBayes <- function() {
- # test python is available and the required dependancies are there:
- ##checkPython()
-
- param <- list(none = 'true')
-
- attr(param, 'settings') <- list(
- modelType = 'naiveBayes',
+ param <- list(none = "true")
+
+ attr(param, "settings") <- list(
+ modelType = "naiveBayes",
seed = as.integer(0),
paramNames = c(),
- #use this for logging params
- requiresDenseMatrix = T,
+ # use this for logging params
+ requiresDenseMatrix = TRUE,
name = "Naive Bayes",
pythonModule = "sklearn.naive_bayes",
pythonClass = "GaussianNB"
)
-
- attr(param, 'saveToJson') <- T
- attr(param, 'saveType') <- 'file'
-
- result <- list(fitFunction = "fitSklearn",
- param = param)
+ rlang::check_installed(
+ "reticulate",
+ reason = "Reticulate is required to use the Python models"
+ )
+  tryCatch(
+    {
+      reticulate::import("sklearn")
+    },
+    error = function(e) {
+      stop("scikit-learn must be installed in a Python environment reachable by reticulate to use the Python models")
+    }
+  )
+ attr(param, "saveToJson") <- TRUE
+ attr(param, "saveType") <- "file"
+
+ result <- list(
+ fitFunction = "fitSklearn",
+ param = param
+ )
class(result) <- "modelSettings"
-
+
return(result)
}
GaussianNBInputs <- function(classifier, param) {
model <- classifier()
-
return(model)
}
@@ -563,17 +636,19 @@ GaussianNBInputs <- function(classifier, param) {
#'
#' @examples
#' \dontrun{
-#' model.rf <- setRandomForest(mtries=list('auto',5,20), ntrees=c(10,100),
-#' maxDepth=c(5,20))
+#' model.rf <- setRandomForest(
+#'   mtries = list("sqrt", 5, 20), ntrees = list(10, 100),
+#'   maxDepth = list(5, 20)
+#' )
#' }
#' @export
-setRandomForest <- function(ntrees = list(100, 500),
- criterion = list('gini'),
+setRandomForest <- function(ntrees = list(100, 500),
+ criterion = list("gini"),
maxDepth = list(4, 10, 17),
minSamplesSplit = list(2, 5),
minSamplesLeaf = list(1, 10),
minWeightFractionLeaf = list(0),
- mtries = list('sqrt', 'log2'),
+ mtries = list("sqrt", "log2"),
maxLeafNodes = list(NULL),
minImpurityDecrease = list(0),
bootstrap = list(TRUE),
@@ -582,22 +657,31 @@ setRandomForest <- function(ntrees = list(100, 500),
nJobs = list(NULL),
classWeight = list(NULL),
seed = sample(100000, 1)) {
- checkIsClass(seed, c('numeric', 'integer'))
- checkIsClass(ntrees, c('list'))
- checkIsClass(criterion, c('list'))
- checkIsClass(maxDepth, c('list'))
- checkIsClass(minSamplesSplit, c('list'))
- checkIsClass(minSamplesLeaf, c('list'))
- checkIsClass(minWeightFractionLeaf, c('list'))
- checkIsClass(mtries, c('list'))
- checkIsClass(maxLeafNodes, c('list'))
- checkIsClass(minImpurityDecrease, c('list'))
- checkIsClass(bootstrap, c('list'))
- checkIsClass(maxSamples, c('list'))
- checkIsClass(oobScore, c('list'))
- checkIsClass(nJobs, c('list'))
- checkIsClass(classWeight, c('list'))
-
+ rlang::check_installed(
+ "reticulate",
+ reason = "Reticulate is required to use the Python models"
+ )
+  tryCatch(
+    {
+      reticulate::import("sklearn")
+    },
+    error = function(e) {
+      stop("scikit-learn must be installed in a Python environment reachable by reticulate to use the Python models")
+    }
+  )
+ checkIsClass(seed, c("numeric", "integer"))
+ checkIsClass(ntrees, c("list"))
+ checkIsClass(criterion, c("list"))
+ checkIsClass(maxDepth, c("list"))
+ checkIsClass(minSamplesSplit, c("list"))
+ checkIsClass(minSamplesLeaf, c("list"))
+ checkIsClass(minWeightFractionLeaf, c("list"))
+ checkIsClass(mtries, c("list"))
+ checkIsClass(maxLeafNodes, c("list"))
+ checkIsClass(minImpurityDecrease, c("list"))
+ checkIsClass(bootstrap, c("list"))
+ checkIsClass(maxSamples, c("list"))
+ checkIsClass(oobScore, c("list"))
+ checkIsClass(nJobs, c("list"))
+ checkIsClass(classWeight, c("list"))
+
# convert to integer when needed
for (i in 1:length(ntrees)) {
if (inherits(x = ntrees[[i]], what = c("numeric", "integer"))) {
@@ -605,46 +689,46 @@ setRandomForest <- function(ntrees = list(100, 500),
}
}
for (i in 1:length(maxDepth)) {
- if (inherits(x = maxDepth[[i]], what = c("numeric", "integer"))) {
+ if (inherits(x = maxDepth[[i]], what = c("numeric", "integer"))) {
maxDepth[[i]] <- as.integer(maxDepth[[i]])
}
}
-
+
for (i in 1:length(minSamplesSplit)) {
if (minSamplesSplit[[i]] >= 1) {
minSamplesSplit[[i]] <- as.integer(minSamplesSplit[[i]])
}
}
-
+
for (i in 1:length(minSamplesLeaf)) {
if (minSamplesLeaf[[i]] >= 1) {
minSamplesLeaf[[i]] <- as.integer(minSamplesLeaf[[i]])
}
}
-
+
for (i in 1:length(maxLeafNodes)) {
- if (inherits(x = maxLeafNodes[[i]], what = c("numeric", "integer"))) {
+ if (inherits(x = maxLeafNodes[[i]], what = c("numeric", "integer"))) {
maxLeafNodes[[i]] <- as.integer(maxLeafNodes[[i]])
}
}
-
+
for (i in 1:length(nJobs)) {
- if (inherits(x = nJobs[[i]], what = c("numeric", "integer"))) {
+ if (inherits(x = nJobs[[i]], what = c("numeric", "integer"))) {
nJobs[[i]] <- as.integer(nJobs[[i]])
}
}
-
+
for (i in 1:length(maxSamples)) {
- if (inherits(x = maxSamples[[i]], what = c("numeric", "integer"))) {
+ if (inherits(x = maxSamples[[i]], what = c("numeric", "integer"))) {
if (maxSamples[[i]] >= 1) {
maxSamples[[i]] <- as.integer(maxSamples[[i]])
}
}
}
-
+
# add value checks
- paramGrid = list(
- ntrees = ntrees,
+ paramGrid <- list(
+ ntrees = ntrees,
criterion = criterion,
maxDepth = maxDepth,
minSamplesSplit = minSamplesSplit,
@@ -661,50 +745,52 @@ setRandomForest <- function(ntrees = list(100, 500),
maxSamples = maxSamples
)
param <- listCartesian(paramGrid)
-
- attr(param, 'settings') <- list(
- modelType = 'randomForest',
+
+ attr(param, "settings") <- list(
+ modelType = "randomForest",
seed = seed[[1]],
paramNames = names(paramGrid),
- #use this for logging params
- requiresDenseMatrix = F,
+ # use this for logging params
+ requiresDenseMatrix = FALSE,
name = "Random forest",
pythonModule = "sklearn.ensemble",
pythonClass = "RandomForestClassifier"
)
-
- attr(param, 'saveToJson') <- T
- attr(param, 'saveType') <- 'file'
-
- result <- list(fitFunction = "fitSklearn",
- param = param)
+
+ attr(param, "saveToJson") <- TRUE
+ attr(param, "saveType") <- "file"
+
+ result <- list(
+ fitFunction = "fitSklearn",
+ param = param
+ )
class(result) <- "modelSettings"
-
+
return(result)
}
RandomForestClassifierInputs <- function(classifier, param) {
model <- classifier(
- n_estimators = param[[which.max(names(param) == 'ntrees')]],
- criterion = param[[which.max(names(param) == 'criterion')]],
- max_depth = param[[which.max(names(param) == 'maxDepth')]],
- min_samples_split = param[[which.max(names(param) == 'minSamplesSplit')]],
- min_samples_leaf = param[[which.max(names(param) == 'minSamplesLeaf')]],
- min_weight_fraction_leaf = param[[which.max(names(param) == 'minWeightFractionLeaf')]],
- max_features = param[[which.max(names(param) == 'mtries')]],
- max_leaf_nodes = param[[which.max(names(param) == 'maxLeafNodes')]],
- min_impurity_decrease = param[[which.max(names(param) == 'minImpurityDecrease')]],
- bootstrap = param[[which.max(names(param) == 'bootstrap')]],
- max_samples = param[[which.max(names(param) == 'maxSamples')]],
- oob_score = param[[which.max(names(param) == 'oobScore')]],
- n_jobs = param[[which.max(names(param) == 'nJobs')]],
- random_state = param[[which.max(names(param) == 'seed')]],
+ n_estimators = param[[which.max(names(param) == "ntrees")]],
+ criterion = param[[which.max(names(param) == "criterion")]],
+ max_depth = param[[which.max(names(param) == "maxDepth")]],
+ min_samples_split = param[[which.max(names(param) == "minSamplesSplit")]],
+ min_samples_leaf = param[[which.max(names(param) == "minSamplesLeaf")]],
+ min_weight_fraction_leaf = param[[which.max(names(param) == "minWeightFractionLeaf")]],
+ max_features = param[[which.max(names(param) == "mtries")]],
+ max_leaf_nodes = param[[which.max(names(param) == "maxLeafNodes")]],
+ min_impurity_decrease = param[[which.max(names(param) == "minImpurityDecrease")]],
+ bootstrap = param[[which.max(names(param) == "bootstrap")]],
+ max_samples = param[[which.max(names(param) == "maxSamples")]],
+ oob_score = param[[which.max(names(param) == "oobScore")]],
+ n_jobs = param[[which.max(names(param) == "nJobs")]],
+ random_state = param[[which.max(names(param) == "seed")]],
verbose = 0L,
- warm_start = F,
- class_weight = param[[which.max(names(param) == 'classWeight')]]
+ warm_start = FALSE,
+ class_weight = param[[which.max(names(param) == "classWeight")]]
)
-
+
return(model)
}
@@ -724,37 +810,46 @@ RandomForestClassifierInputs <- function(classifier, param) {
#'
#' @examples
#' \dontrun{
-#' model.svm <- setSVM(kernel='rbf', seed = NULL)
+#' model.svm <- setSVM(kernel = list("rbf"), seed = sample(100000, 1))
#' }
#' @export
setSVM <- function(C = list(1, 0.9, 2, 0.1),
- kernel = list('rbf'),
+ kernel = list("rbf"),
degree = list(1, 3, 5),
- gamma = list('scale', 1e-04, 3e-05, 0.001, 0.01, 0.25),
+ gamma = list("scale", 1e-04, 3e-05, 0.001, 0.01, 0.25),
coef0 = list(0.0),
shrinking = list(TRUE),
tol = list(0.001),
classWeight = list(NULL),
- cacheSize = 500,
+ cacheSize = 500,
seed = sample(100000, 1)) {
- checkIsClass(seed, c('numeric', 'integer'))
- checkIsClass(cacheSize, c('numeric', 'integer'))
- checkIsClass(C, c('list'))
- checkIsClass(kernel, c('list'))
- checkIsClass(degree, c('list'))
- checkIsClass(gamma, c('list'))
- checkIsClass(coef0, c('list'))
- checkIsClass(shrinking, c('list'))
- checkIsClass(tol, c('list'))
- checkIsClass(classWeight, c('list'))
-
+ rlang::check_installed(
+ "reticulate",
+ reason = "Reticulate is required to use the Python models"
+ )
+  tryCatch(
+    {
+      reticulate::import("sklearn")
+    },
+    error = function(e) {
+      stop("Cannot import scikit-learn in Python. scikit-learn must be installed in a Python environment reachable by reticulate to use the Python models. Please check your Python setup with reticulate::py_config() followed by reticulate::import('sklearn')")
+    }
+  )
+ checkIsClass(seed, c("numeric", "integer"))
+ checkIsClass(cacheSize, c("numeric", "integer"))
+ checkIsClass(C, c("list"))
+ checkIsClass(kernel, c("list"))
+ checkIsClass(degree, c("list"))
+ checkIsClass(gamma, c("list"))
+ checkIsClass(coef0, c("list"))
+ checkIsClass(shrinking, c("list"))
+ checkIsClass(tol, c("list"))
+ checkIsClass(classWeight, c("list"))
+
for (i in 1:length(degree)) {
- if (inherits(x = degree[[i]], what = c("numeric", "integer"))) {
+ if (inherits(x = degree[[i]], what = c("numeric", "integer"))) {
degree[[i]] <- as.integer(degree[[i]])
}
}
-
- paramGrid = list(
+
+ paramGrid <- list(
C = C,
kernel = kernel,
degree = degree,
@@ -766,49 +861,51 @@ setSVM <- function(C = list(1, 0.9, 2, 0.1),
classWeight = classWeight,
seed = list(as.integer(seed[[1]]))
)
-
+
param <- listCartesian(paramGrid)
-
-
- attr(param, 'settings') <- list(
- modelType = 'svm',
+
+
+ attr(param, "settings") <- list(
+ modelType = "svm",
seed = seed[[1]],
paramNames = names(paramGrid),
- #use this for logging params
- requiresDenseMatrix = F,
+ # use this for logging params
+ requiresDenseMatrix = FALSE,
name = "Support Vector Machine",
pythonModule = "sklearn.svm",
pythonClass = "SVC"
)
-
- attr(param, 'saveToJson') <- T
- attr(param, 'saveType') <- 'file'
-
- result <- list(fitFunction = "fitSklearn",
- param = param)
+
+ attr(param, "saveToJson") <- TRUE
+ attr(param, "saveType") <- "file"
+
+ result <- list(
+ fitFunction = "fitSklearn",
+ param = param
+ )
class(result) <- "modelSettings"
-
+
return(result)
}
SVCInputs <- function(classifier, param) {
model <- classifier(
- C = param[[which.max(names(param) == 'C')]],
- kernel = param[[which.max(names(param) == 'kernel')]],
- degree = param[[which.max(names(param) == 'degree')]],
- gamma = param[[which.max(names(param) == 'gamma')]],
- coef0 = param[[which.max(names(param) == 'coef0')]],
- shrinking = param[[which.max(names(param) == 'shrinking')]],
- probability = T,
- tol = param[[which.max(names(param) == 'tol')]],
- cache_size = param[[which.max(names(param) == 'cacheSize')]],
- class_weight = param[[which.max(names(param) == 'classWeight')]],
- verbose = F,
+ C = param[[which.max(names(param) == "C")]],
+ kernel = param[[which.max(names(param) == "kernel")]],
+ degree = param[[which.max(names(param) == "degree")]],
+ gamma = param[[which.max(names(param) == "gamma")]],
+ coef0 = param[[which.max(names(param) == "coef0")]],
+ shrinking = param[[which.max(names(param) == "shrinking")]],
+ probability = TRUE,
+ tol = param[[which.max(names(param) == "tol")]],
+ cache_size = param[[which.max(names(param) == "cacheSize")]],
+ class_weight = param[[which.max(names(param) == "classWeight")]],
+ verbose = FALSE,
max_iter = as.integer(-1),
- decision_function_shape = 'ovr',
- break_ties = F,
- random_state = param[[which.max(names(param) == 'seed')]]
+ decision_function_shape = "ovr",
+ break_ties = FALSE,
+ random_state = param[[which.max(names(param) == "seed")]]
)
-
+
return(model)
}
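
A minimal usage sketch for the settings constructor above (hypothetical call; the hyperparameter lists are expanded into a Cartesian grid, so each element of param is one combination to tune over):

svmSettings <- setSVM(kernel = list("rbf"), C = list(1, 2), seed = 42)
length(svmSettings$param) # number of hyperparameter combinations in the grid
svmSettings$fitFunction   # "fitSklearn", the shared sklearn fit entry point
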
diff --git a/R/SklearnToJson.R b/R/SklearnToJson.R
index 2a2771338..bf5030a18 100644
--- a/R/SklearnToJson.R
+++ b/R/SklearnToJson.R
@@ -21,8 +21,8 @@
#' @param path path to the saved model file
#' @export
sklearnToJson <- function(model, path) {
- py <- reticulate::import_builtins(convert=FALSE)
- json <- reticulate::import("json", convert=FALSE)
+ py <- reticulate::import_builtins(convert = FALSE)
+ json <- reticulate::import("json", convert = FALSE)
if (inherits(model, "sklearn.tree._classes.DecisionTreeClassifier")) {
serializedModel <- serializeDecisionTree(model)
} else if (inherits(model, "sklearn.ensemble._forest.RandomForestClassifier")) {
@@ -32,27 +32,27 @@ sklearnToJson <- function(model, path) {
} else if (inherits(model, "sklearn.naive_bayes.GaussianNB")) {
serializedModel <- serializeNaiveBayes(model)
} else if (inherits(model, "sklearn.neural_network._multilayer_perceptron.MLPClassifier")) {
- serializedModel <- serializeMLP(model)
- } else if (inherits(model, "sklearn.svm._classes.SVC" )) {
+ serializedModel <- serializeMLP(model)
+ } else if (inherits(model, "sklearn.svm._classes.SVC")) {
serializedModel <- serializeSVM(model)
} else {
stop("Unsupported model")
}
-
- with(py$open(path, "w"), as=file, {
- json$dump(serializedModel, fp=file)
+
+ with(py$open(path, "w"), as = file, {
+ json$dump(serializedModel, fp = file)
})
return(invisible())
}
-#' Loads sklearn python model from json
+#' Loads a scikit-learn Python model from JSON
#' @param path path to the model json file
#' @export
sklearnFromJson <- function(path) {
- py <- reticulate::import_builtins(convert=FALSE)
- json <- reticulate::import("json", convert=FALSE)
- with(py$open(path, "r"), as=file, {
- model <- json$load(fp=file)
+ py <- reticulate::import_builtins(convert = FALSE)
+ json <- reticulate::import("json", convert = FALSE)
+ with(py$open(path, "r"), as = file, {
+ model <- json$load(fp = file)
})
if (reticulate::py_bool(model["meta"] == "decision-tree")) {
model <- deSerializeDecisionTree(model)
@@ -64,7 +64,7 @@ sklearnFromJson <- function(path) {
model <- deSerializeNaiveBayes(model)
} else if (reticulate::py_bool(model["meta"] == "mlp")) {
model <- deSerializeMlp(model)
- } else if (reticulate::py_bool(model["meta"] == "svm")) {
+ } else if (reticulate::py_bool(model["meta"] == "svm")) {
model <- deSerializeSVM(model)
} else {
stop("Unsupported model")
@@ -75,90 +75,100 @@ sklearnFromJson <- function(path) {
serializeTree <- function(tree) {
serializedTree <- tree$`__getstate__`()
dtypes <- serializedTree["nodes"]$dtype
-
+
serializedTree["nodes"] <- serializedTree["nodes"]$tolist()
serializedTree["values"] <- serializedTree["values"]$tolist()
-
- return(list(serializedTree, dtypes))
+
+ return(list(serializedTree, dtypes))
}
deSerializeTree <- function(tree_dict, nFeatures, nClasses, nOutputs) {
# TODO the below only works for tree_dict loaded from json, if not it
- for (i in 0:(length(tree_dict["nodes"])-1)) {
- reticulate::py_set_item(tree_dict["nodes"], i,
- reticulate::tuple(reticulate::py_to_r(tree_dict["nodes"][i])))
+ for (i in 0:(length(tree_dict["nodes"]) - 1)) {
+ reticulate::py_set_item(
+ tree_dict["nodes"], i,
+ reticulate::tuple(reticulate::py_to_r(tree_dict["nodes"][i]))
+ )
}
-
+
names <- list("left_child", "right_child", "feature", "threshold", "impurity", "n_node_samples", "weighted_n_node_samples")
- if (length(tree_dict["nodes"][0])==8) {
+ if (length(tree_dict["nodes"][0]) == 8) {
# model used sklearn>=1.3 which added a parameter
names[[8]] <- "missing_go_to_left"
}
-
+
sklearn <- reticulate::import("sklearn")
np <- reticulate::import("numpy", convert = FALSE)
-
- tree_dict["nodes"] <- np$array(tree_dict["nodes"],
- dtype=np$dtype(reticulate::dict(
- names = names,
- formats = tree_dict["nodes_dtype"]
- )))
+
+ tree_dict["nodes"] <- np$array(tree_dict["nodes"],
+ dtype = np$dtype(reticulate::dict(
+ names = names,
+ formats = tree_dict["nodes_dtype"]
+ ))
+ )
tree_dict["values"] <- np$array(tree_dict["values"])
-
- Tree <- sklearn$tree$`_tree`$Tree(nFeatures,
- np$array(reticulate::tuple(nClasses),
- dtype=np$intp),
- nOutputs)
-
+
+ Tree <- sklearn$tree$`_tree`$Tree(
+ nFeatures,
+ np$array(reticulate::tuple(nClasses),
+ dtype = np$intp
+ ),
+ nOutputs
+ )
+
Tree$`__setstate__`(tree_dict)
-
+
return(Tree)
}
serializeDecisionTree <- function(model) {
- tree <- serializeTree(model$tree_)
- dtypes <- tree[[2]]
- tree <- tree[[1]]
- py <- reticulate::import_builtins(convert=FALSE)
- serialized_model <- reticulate::dict(
- "meta" = "decision-tree",
- "feature_importances_" = model$feature_importances_$tolist(),
- "max_features_" = model$max_features_,
- "n_classes_" = py$int(model$n_classes_),
- "n_features_in_" = model$n_features_in_,
- "n_outputs_" = model$n_outputs_,
- "tree_" = tree,
- "classes_" = model$classes_$tolist(),
- "params" = model$get_params()
- )
-
- tree_dtypes <- list()
- for (i in 0:(length(dtypes)-1)) {
- tree_dtypes <- c(tree_dtypes, dtypes[[i]]$str)
- }
-
- serialized_model["tree_"]["nodes_dtype"] <- tree_dtypes
- return(serialized_model)
+ tree <- serializeTree(model$tree_)
+ dtypes <- tree[[2]]
+ tree <- tree[[1]]
+ py <- reticulate::import_builtins(convert = FALSE)
+ serialized_model <- reticulate::dict(
+ "meta" = "decision-tree",
+ "feature_importances_" = model$feature_importances_$tolist(),
+ "max_features_" = model$max_features_,
+ "n_classes_" = py$int(model$n_classes_),
+ "n_features_in_" = model$n_features_in_,
+ "n_outputs_" = model$n_outputs_,
+ "tree_" = tree,
+ "classes_" = model$classes_$tolist(),
+ "params" = model$get_params()
+ )
+
+ tree_dtypes <- list()
+ for (i in 0:(length(dtypes) - 1)) {
+ tree_dtypes <- c(tree_dtypes, dtypes[[i]]$str)
+ }
+
+ serialized_model["tree_"]["nodes_dtype"] <- tree_dtypes
+ return(serialized_model)
}
deSerializeDecisionTree <- function(model_dict) {
- np <- reticulate::import("numpy", convert=FALSE)
- sklearn <- reticulate::import("sklearn", convert=FALSE)
- deserialized_model <- do.call(sklearn$tree$DecisionTreeClassifier,
- reticulate::py_to_r(model_dict["params"]))
-
+ np <- reticulate::import("numpy", convert = FALSE)
+ sklearn <- reticulate::import("sklearn", convert = FALSE)
+ deserialized_model <- do.call(
+ sklearn$tree$DecisionTreeClassifier,
+ reticulate::py_to_r(model_dict["params"])
+ )
+
deserialized_model$classes_ <- np$array(model_dict["classes_"])
deserialized_model$max_features_ <- model_dict["max_features_"]
deserialized_model$n_classes_ <- model_dict["n_classes_"]
deserialized_model$n_features_in <- model_dict["n_features_in_"]
deserialized_model$n_outputs_ <- model_dict["n_outputs_"]
-
- tree <- deSerializeTree(model_dict["tree_"],
- model_dict["n_features_in_"],
- model_dict["n_classes_"],
- model_dict["n_outputs_"])
+
+ tree <- deSerializeTree(
+ model_dict["tree_"],
+ model_dict["n_features_in_"],
+ model_dict["n_classes_"],
+ model_dict["n_outputs_"]
+ )
deserialized_model$tree_ <- tree
-
+
return(deserialized_model)
}
@@ -167,7 +177,7 @@ serializeRandomForest <- function(model) {
for (i in 1:length(model$estimators_)) {
estimators <- c(estimators, serializeDecisionTree(model$estimators_[i - 1]))
}
-
+
serialized_model <- reticulate::dict(
"meta" = "rf",
"max_depth" = model$max_depth,
@@ -183,32 +193,35 @@ serializeRandomForest <- function(model) {
"classes_" = model$classes_$tolist(),
"estimators_" = reticulate::r_to_py(estimators),
"params" = model$get_params(),
- "n_classes_" = model$n_classes_)
-
- if (reticulate::py_bool(model$`__dict__`["oob_score_"] != reticulate::py_none())) {
- serialized_model["oob_score_"] <- model$oob_score_
+ "n_classes_" = model$n_classes_
+ )
+
+ if (reticulate::py_bool(model$`__dict__`["oob_score_"] != reticulate::py_none())) {
+ serialized_model["oob_score_"] <- model$oob_score_
serialized_model["oob_decision_function_"] <- model$oob_decision_function_$tolist()
}
-
+
return(serialized_model)
}
-
+
deSerializeRandomForest <- function(model_dict) {
- np <- reticulate::import("numpy", convert=FALSE)
- sklearn <- reticulate::import("sklearn", convert=FALSE)
- model <- do.call(sklearn$ensemble$RandomForestClassifier,
- reticulate::py_to_r(model_dict["params"]))
-
+ np <- reticulate::import("numpy", convert = FALSE)
+ sklearn <- reticulate::import("sklearn", convert = FALSE)
+ model <- do.call(
+ sklearn$ensemble$RandomForestClassifier,
+ reticulate::py_to_r(model_dict["params"])
+ )
+
estimators <- list()
for (i in 1:length(model_dict$estimators_)) {
estimators <- c(estimators, deSerializeDecisionTree(model_dict["estimators_"][i - 1]))
}
-
+
model$estimators_ <- np$array(estimators)
-
+
model$classes_ <- np$array(model_dict["classes_"])
model$n_features_in_ <- model_dict["n_features_in_"]
- model$n_outputs_ <- model_dict["n_outputs_"]
+ model$n_outputs_ <- model_dict["n_outputs_"]
model$max_depth <- model_dict["max_depth"]
model$min_samples_split <- model_dict["min_samples_split"]
model$min_samples_leaf <- model_dict["min_samples_leaf"]
@@ -218,12 +231,12 @@ deSerializeRandomForest <- function(model_dict) {
model$min_impurity_decrease <- model_dict["min_impurity_decrease"]
model$min_impurity_split <- model_dict["min_impurity_split"]
model$n_classes_ <- model_dict["n_classes_"]
-
- if (reticulate::py_bool(model_dict$oob_score_ != reticulate::py_none())){
+
+ if (reticulate::py_bool(model_dict$oob_score_ != reticulate::py_none())) {
model$oob_score_ <- model_dict["oob_score_"]
- model$oob_decision_function_ <- model_dict["oob_decision_function_"]
+ model$oob_decision_function_ <- model_dict["oob_decision_function_"]
}
- return(model)
+ return(model)
}
serializeAdaboost <- function(model) {
@@ -238,32 +251,35 @@ serializeAdaboost <- function(model) {
"n_classes_" = model$n_classes_,
"params" = model$get_params(),
"classes_" = model$classes_$tolist(),
- "estimator_weights_" = model$estimator_weights_$tolist())
-
+ "estimator_weights_" = model$estimator_weights_$tolist()
+ )
+
return(serialized_model)
}
deSerializeAdaboost <- function(model_dict) {
- np <- reticulate::import("numpy", convert=FALSE)
- sklearn <- reticulate::import("sklearn", convert=FALSE)
- model <- do.call(sklearn$ensemble$AdaBoostClassifier,
- reticulate::py_to_r(model_dict["params"]))
+ np <- reticulate::import("numpy", convert = FALSE)
+ sklearn <- reticulate::import("sklearn", convert = FALSE)
+ model <- do.call(
+ sklearn$ensemble$AdaBoostClassifier,
+ reticulate::py_to_r(model_dict["params"])
+ )
estimators <- list()
for (i in 1:length(model_dict$estimators_)) {
estimators <- c(estimators, deSerializeDecisionTree(model_dict["estimators_"][i - 1]))
}
-
+
model$estimators_ <- np$array(estimators)
model$classes_ <- np$array(model_dict["classes_"])
model$n_features_in_ <- model_dict["n_features_in_"]
model$n_classes_ <- model_dict["n_classes_"]
model$estimator_weights_ <- np$array(model_dict["estimator_weights_"])
-
+
return(model)
}
serializeNaiveBayes <- function(model) {
- serialized_model = reticulate::dict(
+ serialized_model <- reticulate::dict(
"meta" = "naive-bayes",
"classes_" = model$classes_$tolist(),
"class_count_" = model$class_count_$tolist(),
@@ -277,28 +293,34 @@ serializeNaiveBayes <- function(model) {
}
deSerializeNaiveBayes <- function(model_dict) {
- sklearn <- reticulate::import("sklearn", convert=FALSE)
- np <- reticulate::import("numpy", convert=FALSE)
- model <- do.call(sklearn$naive_bayes$GaussianNB,
- reticulate::py_to_r(model_dict["params"]))
-
+ sklearn <- reticulate::import("sklearn", convert = FALSE)
+ np <- reticulate::import("numpy", convert = FALSE)
+ model <- do.call(
+ sklearn$naive_bayes$GaussianNB,
+ reticulate::py_to_r(model_dict["params"])
+ )
+
model$classes_ <- np$array(model_dict["classes_"])
model$class_count_ <- np$array(model_dict["class_count_"])
model$class_prior_ <- np$array(model_dict["class_prior_"])
model$theta_ <- np$array(model_dict["theta_"])
model$epsilon_ <- model_dict["epsilon_"]
model$var_ <- np$array(model_dict["var_"])
-
+
return(model)
}
serializeMLP <- function(model) {
# TODO Check if length(intercepts_) is ever different from length(coefs_)
for (i in 0:(length(model$coefs_) - 1)) {
- reticulate::py_set_item(model$coefs_, i,
- model$coefs_[i]$tolist())
- reticulate::py_set_item(model$intercepts_, i,
- model$intercepts_[i]$tolist())
+ reticulate::py_set_item(
+ model$coefs_, i,
+ model$coefs_[i]$tolist()
+ )
+ reticulate::py_set_item(
+ model$intercepts_, i,
+ model$intercepts_[i]$tolist()
+ )
}
serialized_model <- reticulate::dict(
"meta" = "mlp",
@@ -316,32 +338,37 @@ serializeMLP <- function(model) {
}
deSerializeMlp <- function(model_dict) {
- sklearn <- reticulate::import("sklearn", convert=FALSE)
- np <- reticulate::import("numpy", convert=FALSE)
-
- model <- do.call(sklearn$neural_network$MLPClassifier,
- reticulate::py_to_r(model_dict["params"]))
+ sklearn <- reticulate::import("sklearn", convert = FALSE)
+ np <- reticulate::import("numpy", convert = FALSE)
+
+ model <- do.call(
+ sklearn$neural_network$MLPClassifier,
+ reticulate::py_to_r(model_dict["params"])
+ )
for (i in 0:(length(model_dict["coefs_"]) - 1)) {
- reticulate::py_set_item(model_dict["coefs_"], i,
- np$array(model_dict["coefs_"][i]))
- reticulate::py_set_item(model_dict["intercepts_"], i,
- np$array(model_dict["intercepts_"][i]))
-
+ reticulate::py_set_item(
+ model_dict["coefs_"], i,
+ np$array(model_dict["coefs_"][i])
+ )
+ reticulate::py_set_item(
+ model_dict["intercepts_"], i,
+ np$array(model_dict["intercepts_"][i])
+ )
}
- model$coefs_ = model_dict["coefs_"]
- model$loss_ = model_dict["loss_"]
- model$intercepts_ = model_dict["intercepts_"]
- model$n_iter_ = model_dict["n_iter_"]
- model$n_layers_ = model_dict["n_layers_"]
- model$n_outputs_ = model_dict["n_outputs_"]
- model$out_activation_ = model_dict["out_activation_"]
- model$classes_ = np$array(model_dict["classes_"])
-
+ model$coefs_ <- model_dict["coefs_"]
+ model$loss_ <- model_dict["loss_"]
+ model$intercepts_ <- model_dict["intercepts_"]
+ model$n_iter_ <- model_dict["n_iter_"]
+ model$n_layers_ <- model_dict["n_layers_"]
+ model$n_outputs_ <- model_dict["n_outputs_"]
+ model$out_activation_ <- model_dict["out_activation_"]
+ model$classes_ <- np$array(model_dict["classes_"])
+
return(model)
}
serializeSVM <- function(model) {
- serialized_model = reticulate::dict(
+ serialized_model <- reticulate::dict(
"meta" = "svm",
"class_weight_" = model$class_weight_$tolist(),
"classes_" = model$classes_$tolist(),
@@ -360,84 +387,87 @@ serializeSVM <- function(model) {
} else {
serialized_model["support_vectors_"] <- serializeCsrMatrix(model$support_vectors_)
}
-
+
if (inherits(model$dual_coef_, "numpy.ndarray")) {
serialized_model["dual_coef_"] <- model$dual_coef_$tolist()
} else {
serialized_model["dual_coef_"] <- serializeCsrMatrix(model$dual_coef_)
}
-
+
if (inherits(model$`_dual_coef_`, "numpy.ndarray")) {
serialized_model["_dual_coef_"] <- model$`_dual_coef_`$tolist()
} else {
serialized_model["_dual_coef_"] <- serializeCsrMatrix(model$`_dual_coef_`)
}
- return(serialized_model)
+ return(serialized_model)
}
deSerializeSVM <- function(model_dict) {
- sklearn <- reticulate::import("sklearn", convert=FALSE)
- np <- reticulate::import("numpy", convert=FALSE)
- model <- do.call(sklearn$svm$SVC,
- reticulate::py_to_r(model_dict["params"]))
+ sklearn <- reticulate::import("sklearn", convert = FALSE)
+ np <- reticulate::import("numpy", convert = FALSE)
+ model <- do.call(
+ sklearn$svm$SVC,
+ reticulate::py_to_r(model_dict["params"])
+ )
model$shape_fit_ <- model_dict$shape_fit_
- model$`_gamma`<- model_dict["_gamma"]
+ model$`_gamma` <- model_dict["_gamma"]
model$class_weight_ <- np$array(model_dict$class_weight_)$astype(np$float64)
- model$classes_ <- np$array(model_dict["classes_"])
+ model$classes_ <- np$array(model_dict["classes_"])
model$support_ <- np$array(model_dict["support_"])$astype(np$int32)
model$`_n_support` <- np$array(model_dict["n_support_"])$astype(np$int32)
model$intercept_ <- np$array(model_dict["intercept_"])$astype(np$float64)
model$`_probA` <- np$array(model_dict["probA_"])$astype(np$float64)
model$`_probB` <- np$array(model_dict["probB_"])$astype(np$float64)
model$`_intercept_` <- np$array(model_dict["_intercept_"])$astype(np$float64)
-
- if (reticulate::py_bool((model_dict$support_vectors_["meta"] != reticulate::py_none())) &
- (reticulate::py_bool(model_dict$support_vectors_["meta"] == "csr"))) {
+
+ if (reticulate::py_bool((model_dict$support_vectors_["meta"] != reticulate::py_none())) &
+ (reticulate::py_bool(model_dict$support_vectors_["meta"] == "csr"))) {
model$support_vectors_ <- deSerializeCsrMatrix(model_dict$support_vectors_)
model$`_sparse` <- TRUE
} else {
model$support_vectors_ <- np$array(model_dict$support_vectors_)$astype(np$float64)
model$`_sparse` <- FALSE
}
- if (reticulate::py_bool((model_dict$dual_coef_["meta"] != reticulate::py_none())) &
- (reticulate::py_bool(model_dict$dual_coef_["meta"] == "csr"))) {
+ if (reticulate::py_bool((model_dict$dual_coef_["meta"] != reticulate::py_none())) &
+ (reticulate::py_bool(model_dict$dual_coef_["meta"] == "csr"))) {
model$dual_coef_ <- deSerializeCsrMatrix(model_dict$dual_coef_)
} else {
model$dual_coef_ <- np$array(model_dict$dual_coef_)$astype(np$float64)
}
-
- if (reticulate::py_bool((model_dict$`_dual_coef_`["meta"] != reticulate::py_none())) &
- (reticulate::py_bool(model_dict$`_dual_coef_`["meta"] == "csr"))) {
+
+ if (reticulate::py_bool((model_dict$`_dual_coef_`["meta"] != reticulate::py_none())) &
+ (reticulate::py_bool(model_dict$`_dual_coef_`["meta"] == "csr"))) {
    model$`_dual_coef_` <- deSerializeCsrMatrix(model_dict$`_dual_coef_`)
} else {
model$`_dual_coef_` <- np$array(model_dict$`_dual_coef_`)$astype(np$float64)
}
return(model)
-}
+}
serializeCsrMatrix <- function(csr_matrix) {
- serialized_csr_matrix = reticulate::dict(
+ serialized_csr_matrix <- reticulate::dict(
"meta" = "csr",
"indices" = csr_matrix$indices$tolist(),
"indptr" = csr_matrix$indptr$tolist(),
- "_shape"= csr_matrix$`_shape`)
+ "_shape" = csr_matrix$`_shape`
+ )
serialized_csr_matrix["data"] <- csr_matrix$data$tolist()
return(serialized_csr_matrix)
}
-deSerializeCsrMatrix <- function(csr_dict,
- data_type=np$float64,
- indices_type=np$int32,
- indptr_type=np$int32) {
- sp <- reticulate::import("scipy", convert=FALSE)
- np <- reticulate::import("numpy", convert=FALSE)
+deSerializeCsrMatrix <- function(csr_dict,
+ data_type = np$float64,
+ indices_type = np$int32,
+ indptr_type = np$int32) {
+ sp <- reticulate::import("scipy", convert = FALSE)
+ np <- reticulate::import("numpy", convert = FALSE)
csr_matrix <- sp$sparse$csr_matrix(
- reticulate::tuple(list(np$array(csr_dict["data"])$astype(data_type),
- np$array(csr_dict["indices"])$astype(indices_type),
- np$array(csr_dict["indptr"])$astype(indptr_type))),
- shape=csr_dict["shape"]
+ reticulate::tuple(list(
+ np$array(csr_dict["data"])$astype(data_type),
+ np$array(csr_dict["indices"])$astype(indices_type),
+ np$array(csr_dict["indptr"])$astype(indptr_type)
+ )),
+ shape = csr_dict["shape"]
)
return(csr_matrix)
}
-
-
\ No newline at end of file
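
For reference, serializeCsrMatrix and deSerializeCsrMatrix above exchange sparse matrices as the standard CSR triple (data, indices, indptr). A small sketch mirroring the deserializer with toy values:

sp <- reticulate::import("scipy", convert = FALSE)
np <- reticulate::import("numpy", convert = FALSE)
# the 2 x 3 matrix [1 0 2; 0 0 3] in CSR form:
csr <- sp$sparse$csr_matrix(
  reticulate::tuple(list(
    np$array(c(1, 2, 3)),    # data: stored values, row by row
    np$array(c(0L, 2L, 2L)), # indices: column of each stored value
    np$array(c(0L, 2L, 3L))  # indptr: row i spans data[indptr[i]:indptr[i + 1]]
  )),
  shape = reticulate::tuple(2L, 3L)
)
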
diff --git a/R/ThresholdSummary.R b/R/ThresholdSummary.R
index b36d86014..7fac675d4 100644
--- a/R/ThresholdSummary.R
+++ b/R/ThresholdSummary.R
@@ -4,27 +4,26 @@
#' Calculates the TP, FP, TN, FN, TPR, FPR, accuracy, PPV, FOR and F-measure
#' from a prediction object
#'
-#' @param prediction A prediction object
-#' @param predictionType The type of prediction (binary or survival)
-#' @param typeColumn A column that is used to stratify the results
+#' @param prediction A prediction object
+#' @param predictionType The type of prediction (binary or survival)
+#' @param typeColumn A column that is used to stratify the results
#'
#' @return
#' A data.frame with all the measures
#' @export
getThresholdSummary <- function(
- prediction,
- predictionType,
- typeColumn = 'evaluation'
-){
+ prediction,
+ predictionType,
+ typeColumn = "evaluation") {
evaluation <- do.call(
- what = paste0('getThresholdSummary_', predictionType),
+ what = paste0("getThresholdSummary_", predictionType),
args = list(
- prediction = prediction,
+ prediction = prediction,
evalColumn = typeColumn,
- timepoint = attr(prediction, 'metaData')$timepoint
+ timepoint = attr(prediction, "metaData")$timepoint
)
)
-
+
return(evaluation)
}
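
A minimal sketch of calling this on a toy prediction object (hypothetical data; a real prediction object typically comes from predictPlp):

prediction <- data.frame(
  value = runif(500),                 # predicted risks in (0, 1)
  outcomeCount = rbinom(500, 1, 0.3), # observed outcomes
  evaluation = "Test"
)
thresholdSummary <- getThresholdSummary(prediction, predictionType = "binary")
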
@@ -36,129 +35,127 @@ getThresholdSummary <- function(
#' Calculates the TP, FP, TN, FN, TPR, FPR, accuracy, PPV, FOR and F-measure
#' from a prediction object
#'
-#' @param prediction A prediction object
-#' @param evalColumn A column that is used to stratify the results
+#' @param prediction A prediction object
+#' @param evalColumn A column that is used to stratify the results
#' @param ... Other inputs
#'
#' @return
#' A data.frame with all the measures
#'
-getThresholdSummary_binary <- function(prediction, evalColumn, ...){
-
+getThresholdSummary_binary <- function(prediction, evalColumn, ...) {
result <- c()
- evalTypes <- unique(as.data.frame(prediction)[,evalColumn])
-
- for(evalType in evalTypes){
-
+ evalTypes <- unique(as.data.frame(prediction)[, evalColumn])
+
+ for (evalType in evalTypes) {
predictionOfInterest <- prediction %>% dplyr::filter(.data[[evalColumn]] == evalType)
-
+
# do input checks
- if(nrow(predictionOfInterest)<1){
- warning('sparse threshold summary not calculated due to empty dataset')
+ if (nrow(predictionOfInterest) < 1) {
+ warning("sparse threshold summary not calculated due to empty dataset")
return(NULL)
}
-
+
n <- nrow(predictionOfInterest)
- P <- sum(predictionOfInterest$outcomeCount>0)
+ P <- sum(predictionOfInterest$outcomeCount > 0)
N <- n - P
-
- if(N==0){
- warning('Number of negatives is zero')
+
+ if (N == 0) {
+ warning("Number of negatives is zero")
return(NULL)
}
- if(P==0){
- warning('Number of positives is zero')
+ if (P == 0) {
+ warning("Number of positives is zero")
return(NULL)
}
-
+
# add the preference score:
- proportion <- sum(predictionOfInterest$outcomeCount>0)/nrow(predictionOfInterest)
+ proportion <- sum(predictionOfInterest$outcomeCount > 0) / nrow(predictionOfInterest)
   # issue with calibration: replace any predictions of exactly 1
- predictionOfInterest$value[predictionOfInterest$value==1] <- 0.99999999
- x <- exp(log(predictionOfInterest$value/(1 - predictionOfInterest$value)) - log(proportion/(1 - proportion)))
- predictionOfInterest$preferenceScore <- x/(x + 1)
-
+ predictionOfInterest$value[predictionOfInterest$value == 1] <- 0.99999999
+ x <- exp(log(predictionOfInterest$value / (1 - predictionOfInterest$value)) - log(proportion / (1 - proportion)))
+ predictionOfInterest$preferenceScore <- x / (x + 1)
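+    # the preference score shifts each prediction's log odds down by the log
+    # odds of the observed outcome proportion, so value = 0.5 at proportion
+    # 0.5 maps to exp(0) / (1 + exp(0)) = 0.5, i.e. no preference either way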
+
# sort prediction
- predictionOfInterest <- predictionOfInterest[order(-predictionOfInterest$value),]
-
- # because of numerical precision issues (I think), in very rare cases the preferenceScore
+ predictionOfInterest <- predictionOfInterest[order(-predictionOfInterest$value), ]
+
+ # because of numerical precision issues (I think), in very rare cases the preferenceScore
# is not monotonically decreasing after this sort (it should follow the predictions)
# as a fix I remove the troublesome row from influencing the thresholdSummary
if (!all(predictionOfInterest$preferenceScore == cummin(predictionOfInterest$preferenceScore))) {
- troubleRow <- (which((predictionOfInterest$preferenceScore == cummin(predictionOfInterest$preferenceScore))==FALSE))
- predictionOfInterest <- predictionOfInterest[-troubleRow,]
- }
-
+ troubleRow <- (which((predictionOfInterest$preferenceScore == cummin(predictionOfInterest$preferenceScore)) == FALSE))
+ predictionOfInterest <- predictionOfInterest[-troubleRow, ]
+ }
+
# create indexes
- if(length(predictionOfInterest$preferenceScore)>100){
+ if (length(predictionOfInterest$preferenceScore) > 100) {
indexesOfInt <- c(
1:100,
seq(
- 1,
+ 1,
length(predictionOfInterest$preferenceScore),
- floor(length(predictionOfInterest$preferenceScore)/100 )
+ floor(length(predictionOfInterest$preferenceScore) / 100)
),
length(predictionOfInterest$preferenceScore)
)
- } else{
+ } else {
indexesOfInt <- 1:length(predictionOfInterest$preferenceScore)
}
pt <- unique(predictionOfInterest$value[indexesOfInt])
- indexesOfInt <- unique(unlist(lapply(pt, function(pt) max(which(predictionOfInterest$value>=pt))))) # made this >= to match net benefit
-
+ indexesOfInt <- unique(unlist(lapply(pt, function(pt) max(which(predictionOfInterest$value >= pt))))) # made this >= to match net benefit
+
# get thresholds
- predictionThreshold = predictionOfInterest$value[indexesOfInt]
- preferenceThreshold = predictionOfInterest$preferenceScore[indexesOfInt]
-
+ predictionThreshold <- predictionOfInterest$value[indexesOfInt]
+ preferenceThreshold <- predictionOfInterest$preferenceScore[indexesOfInt]
+
# improve speed create vector with cum sum
- lab.order <- ifelse(predictionOfInterest$outcomeCount>0,1,0)
+ lab.order <- ifelse(predictionOfInterest$outcomeCount > 0, 1, 0)
temp.cumsum <- rep(0, length(lab.order))
- temp.cumsum[lab.order==0] <- 1:sum(lab.order==0)
- temp.cumsum[lab.order==1] <- which(lab.order==1, arr.ind=T)-1:sum(lab.order==1)
- TP <- sapply(indexesOfInt, function(x) x-temp.cumsum[x])
+ temp.cumsum[lab.order == 0] <- 1:sum(lab.order == 0)
+ temp.cumsum[lab.order == 1] <- which(lab.order == 1, arr.ind = TRUE) - 1:sum(lab.order == 1)
+ TP <- sapply(indexesOfInt, function(x) x - temp.cumsum[x])
FP <- sapply(indexesOfInt, function(x) temp.cumsum[x])
-
- TN <- N-FP
- FN <- P-TP
-
- positiveCount <- TP+FP
- negativeCount <- TN+FN
- trueCount <- TP+FN
+
+ TN <- N - FP
+ FN <- P - TP
+
+ positiveCount <- TP + FP
+ negativeCount <- TN + FN
+ trueCount <- TP + FN
falseCount <- TN + FP
truePositiveCount <- TP
trueNegativeCount <- TN
falsePositiveCount <- FP
falseNegativeCount <- FN
-
- f1Score <- f1Score(TP,TN,FN,FP)
- accuracy <- accuracy(TP,TN,FN,FP)
- sensitivity <- sensitivity(TP,TN,FN,FP)
- falseNegativeRate <- falseNegativeRate(TP,TN,FN,FP)
- falsePositiveRate <- falsePositiveRate(TP,TN,FN,FP)
- specificity <- specificity(TP,TN,FN,FP)
- positivePredictiveValue <- positivePredictiveValue(TP,TN,FN,FP)
- falseDiscoveryRate <- falseDiscoveryRate(TP,TN,FN,FP)
- negativePredictiveValue <- negativePredictiveValue(TP,TN,FN,FP)
- falseOmissionRate <- falseOmissionRate(TP,TN,FN,FP)
- positiveLikelihoodRatio <- positiveLikelihoodRatio(TP,TN,FN,FP)
- negativeLikelihoodRatio <- negativeLikelihoodRatio(TP,TN,FN,FP)
- diagnosticOddsRatio <- diagnosticOddsRatio(TP,TN,FN,FP)
-
+
+ f1Score <- f1Score(TP, TN, FN, FP)
+ accuracy <- accuracy(TP, TN, FN, FP)
+ sensitivity <- sensitivity(TP, TN, FN, FP)
+ falseNegativeRate <- falseNegativeRate(TP, TN, FN, FP)
+ falsePositiveRate <- falsePositiveRate(TP, TN, FN, FP)
+ specificity <- specificity(TP, TN, FN, FP)
+ positivePredictiveValue <- positivePredictiveValue(TP, TN, FN, FP)
+ falseDiscoveryRate <- falseDiscoveryRate(TP, TN, FN, FP)
+ negativePredictiveValue <- negativePredictiveValue(TP, TN, FN, FP)
+ falseOmissionRate <- falseOmissionRate(TP, TN, FN, FP)
+ positiveLikelihoodRatio <- positiveLikelihoodRatio(TP, TN, FN, FP)
+ negativeLikelihoodRatio <- negativeLikelihoodRatio(TP, TN, FN, FP)
+ diagnosticOddsRatio <- diagnosticOddsRatio(TP, TN, FN, FP)
+
result <- rbind(
- result,
+ result,
data.frame(
evaluation = evalType,
predictionThreshold = predictionThreshold,
preferenceThreshold = preferenceThreshold,
positiveCount = positiveCount,
negativeCount = negativeCount,
- trueCount = trueCount,
+ trueCount = trueCount,
falseCount = falseCount,
truePositiveCount = truePositiveCount,
trueNegativeCount = trueNegativeCount,
falsePositiveCount = falsePositiveCount,
falseNegativeCount = falseNegativeCount,
- f1Score = f1Score,
+ f1Score = f1Score,
accuracy = accuracy,
sensitivity = sensitivity,
falseNegativeRate = falseNegativeRate,
@@ -173,70 +170,68 @@ getThresholdSummary_binary <- function(prediction, evalColumn, ...){
diagnosticOddsRatio = diagnosticOddsRatio
)
)
-
-
}
-
+
result <- as.data.frame(result)
return(result)
}
-getThresholdSummary_survival <- function(prediction, evalColumn, timepoint, ...){
-
+getThresholdSummary_survival <- function(prediction, evalColumn, timepoint, ...) {
result <- c()
- evalTypes <- unique(as.data.frame(prediction)[,evalColumn])
-
- for(evalType in evalTypes){
-
- predictionOfInterest <- prediction %>%
+ evalTypes <- unique(as.data.frame(prediction)[, evalColumn])
+
+ for (evalType in evalTypes) {
+ predictionOfInterest <- prediction %>%
dplyr::filter(.data[[evalColumn]] == evalType)
-
+
t <- predictionOfInterest$survivalTime
y <- ifelse(predictionOfInterest$outcomeCount > 0, 1, 0)
-
- S <- survival::Surv(t, y)
+
+ S <- survival::Surv(t, y)
p <- predictionOfInterest$value
-
+
# add netbenefit
preddat <- data.frame(
- p = p/max(p),
- t=t,
- y=y
+ p = p / max(p),
+ t = t,
+ y = y
)
-
+
nbSummary <- tryCatch(
- { xstart <- max(min(preddat$p),0.001);
- xstop <- min(max(preddat$p),0.99);
+ {
+ xstart <- max(min(preddat$p), 0.001)
+ xstop <- min(max(preddat$p), 0.99)
stdca(
- data = preddat,
- outcome = "y",
- ttoutcome = "t",
- timepoint = timepoint,
- predictors = "p",
- xstart = xstart,
- xstop = xstop,
- xby = (xstop - xstart)/100,
+ data = preddat,
+ outcome = "y",
+ ttoutcome = "t",
+ timepoint = timepoint,
+ predictors = "p",
+ xstart = xstart,
+ xstop = xstop,
+ xby = (xstop - xstart) / 100,
smooth = FALSE,
graph = FALSE
)
},
- error = function(e){ParallelLogger::logError(e); return(NULL)}
+ error = function(e) {
+ ParallelLogger::logError(e)
+ return(NULL)
+ }
)
-
- if(!is.null(nbSummary$net.benefit)){
-
+
+ if (!is.null(nbSummary$net.benefit)) {
tempResult <- as.data.frame(nbSummary$net.benefit)
tempResult$evaluation <- evalType
-
+
result <- rbind(
- result,
+ result,
tempResult
)
}
}
-
+
return(result)
-
}
@@ -244,155 +239,167 @@ getThresholdSummary_survival <- function(prediction, evalColumn, timepoint, ...)
###############################
#
-# FUNCTIONS TO HELP
+# FUNCTIONS TO HELP
###############################
# taken from the dca package @ http://www.danieldsjoberg.com/dca/articles/survival-outcomes.html
-stdca <- function (data, outcome, ttoutcome, timepoint, predictors, xstart = 0.01,
- xstop = 0.99, xby = 0.01, ymin = -0.05, probability = NULL,
- harm = NULL, graph = TRUE, intervention = FALSE, interventionper = 100,
- smooth = FALSE, loess.span = 0.1, cmprsk = FALSE)
-{
- data = data[stats::complete.cases(data[c(outcome, ttoutcome,
- predictors)]), c(outcome, ttoutcome, predictors)]
- if ((length(data[!(data[outcome] == 0 | data[outcome] ==
- 1), outcome]) > 0) & cmprsk == FALSE) {
+stdca <- function(
+ data, outcome, ttoutcome, timepoint, predictors, xstart = 0.01,
+ xstop = 0.99, xby = 0.01, ymin = -0.05, probability = NULL,
+ harm = NULL, graph = TRUE, intervention = FALSE, interventionper = 100,
+ smooth = FALSE, loess.span = 0.1, cmprsk = FALSE) {
+ data <- data[stats::complete.cases(data[c(
+ outcome, ttoutcome,
+ predictors
+ )]), c(outcome, ttoutcome, predictors)]
+ if ((length(data[!(data[outcome] == 0 | data[outcome] ==
+ 1), outcome]) > 0) && cmprsk == FALSE) {
stop("outcome must be coded as 0 and 1")
}
if (!inherits(x = data, what = "data.frame")) {
stop("Input data must be class data.frame")
}
- if (xstart < 0 | xstart > 1) {
+ if (xstart < 0 || xstart > 1) {
stop("xstart must lie between 0 and 1")
}
- if (xstop < 0 | xstop > 1) {
+ if (xstop < 0 || xstop > 1) {
stop("xstop must lie between 0 and 1")
}
- if (xby <= 0 | xby >= 1) {
+ if (xby <= 0 || xby >= 1) {
stop("xby must lie between 0 and 1")
}
if (xstart >= xstop) {
stop("xstop must be larger than xstart")
}
- pred.n = length(predictors)
- if (length(probability) > 0 & pred.n != length(probability)) {
+ pred.n <- length(predictors)
+ if (length(probability) > 0 && pred.n != length(probability)) {
stop("Number of probabilities specified must be the same as the number of predictors being checked.")
}
- if (length(harm) > 0 & pred.n != length(harm)) {
+ if (length(harm) > 0 && pred.n != length(harm)) {
stop("Number of harms specified must be the same as the number of predictors being checked.")
}
if (length(harm) == 0) {
- harm = rep(0, pred.n)
+ harm <- rep(0, pred.n)
}
if (length(probability) == 0) {
- probability = rep(TRUE, pred.n)
+ probability <- rep(TRUE, pred.n)
}
- if (length(predictors[predictors == "all" | predictors ==
- "none"])) {
+ if (length(predictors[predictors == "all" | predictors ==
+ "none"])) {
stop("Prediction names cannot be equal to all or none.")
}
for (m in 1:pred.n) {
- if (probability[m] != TRUE & probability[m] != FALSE) {
+ if (probability[m] != TRUE && probability[m] != FALSE) {
stop("Each element of probability vector must be TRUE or FALSE")
}
- if (probability[m] == TRUE & (max(data[predictors[m]]) >
- 1 | min(data[predictors[m]]) < 0)) {
- stop(paste(predictors[m], "must be between 0 and 1 OR sepcified as a non-probability in the probability option",
- sep = " "))
+ if (probability[m] == TRUE && (max(data[predictors[m]]) >
+ 1 || min(data[predictors[m]]) < 0)) {
+ stop(paste(predictors[m], "must be between 0 and 1 OR sepcified as a non-probability in the probability option",
+ sep = " "
+ ))
}
if (probability[m] == FALSE) {
- model = NULL
- pred = NULL
- model = survival::coxph(survival::Surv(data.matrix(data[ttoutcome]),
- data.matrix(data[outcome])) ~ data.matrix(data[predictors[m]]))
- surv.data = data.frame(0)
- pred = data.frame(1 - c(summary(survival::survfit(model,
- newdata = surv.data), time = timepoint)$surv))
- names(pred) = predictors[m]
- data = cbind(data[names(data) != predictors[m]],
- pred)
- print(paste(predictors[m], "converted to a probability with Cox regression. Due to linearity and proportional hazards assumption, miscalibration may occur.",
- sep = " "))
+ model <- NULL
+ pred <- NULL
+ model <- survival::coxph(survival::Surv(
+ data.matrix(data[ttoutcome]),
+ data.matrix(data[outcome])
+ ) ~ data.matrix(data[predictors[m]]))
+ surv.data <- data.frame(0)
+ pred <- data.frame(1 - c(summary(survival::survfit(model,
+ newdata = surv.data
+ ), time = timepoint)$surv))
+ names(pred) <- predictors[m]
+ data <- cbind(
+ data[names(data) != predictors[m]],
+ pred
+ )
+ print(paste(predictors[m], "converted to a probability with Cox regression. Due to linearity and proportional hazards assumption, miscalibration may occur.",
+ sep = " "
+ ))
}
}
- N = dim(data)[1]
+ N <- dim(data)[1]
if (cmprsk == FALSE) {
- km.cuminc = survival::survfit(survival::Surv(data.matrix(data[ttoutcome]), #fixed missing survival::
- data.matrix(data[outcome])) ~ 1)
- pd = 1 - summary(km.cuminc, times = timepoint)$surv
+ km.cuminc <- survival::survfit(survival::Surv(
+ data.matrix(data[ttoutcome]), # fixed missing survival::
+ data.matrix(data[outcome])
+ ) ~ 1)
+ pd <- 1 - summary(km.cuminc, times = timepoint)$surv
+ } else {
+ stop("not supported")
}
- else {
- #cr.cuminc = cmprsk::cuminc(data[[ttoutcome]], data[[outcome]])
- #pd = cmprsk::timepoints(cr.cuminc, times = timepoint)$est[1]
- stop('not supported')
- }
- nb = data.frame(seq(from = xstart, to = xstop, by = xby))
- names(nb) = "threshold"
- interv = nb
- error = NULL
- nb["all"] = pd - (1 - pd) * nb$threshold/(1 - nb$threshold)
- nb["none"] = 0
+ nb <- data.frame(seq(from = xstart, to = xstop, by = xby))
+ names(nb) <- "threshold"
+ interv <- nb
+ error <- NULL
+ nb["all"] <- pd - (1 - pd) * nb$threshold / (1 - nb$threshold)
+ nb["none"] <- 0
for (m in 1:pred.n) {
- nb[predictors[m]] = NA
+ nb[predictors[m]] <- NA
for (t in 1:length(nb$threshold)) {
- px = sum(data[predictors[m]] > nb$threshold[t])/N
+ px <- sum(data[predictors[m]] > nb$threshold[t]) / N
if (px == 0) {
- error = rbind(error, paste(predictors[m], ": No observations with risk greater than ",
- nb$threshold[t] * 100, "%", sep = ""))
+ error <- rbind(error, paste(predictors[m], ": No observations with risk greater than ",
+ nb$threshold[t] * 100, "%",
+ sep = ""
+ ))
break
- }
- else {
+ } else {
if (cmprsk == FALSE) {
- km.cuminc = survival::survfit(survival::Surv(data.matrix(data[data[predictors[m]] >
- nb$threshold[t], ttoutcome]), data.matrix(data[data[predictors[m]] >
- nb$threshold[t], outcome])) ~ 1)
- pdgivenx = (1 - summary(km.cuminc, times = timepoint)$surv)
+ km.cuminc <- survival::survfit(survival::Surv(data.matrix(data[data[predictors[m]] >
+ nb$threshold[t], ttoutcome]), data.matrix(data[data[predictors[m]] >
+ nb$threshold[t], outcome])) ~ 1)
+ pdgivenx <- (1 - summary(km.cuminc, times = timepoint)$surv)
if (length(pdgivenx) == 0) {
- error = rbind(error, paste(predictors[m],
- ": No observations with risk greater than ",
- nb$threshold[t] * 100, "% that have followup through the timepoint selected",
- sep = ""))
+ error <- rbind(error, paste(predictors[m],
+ ": No observations with risk greater than ",
+ nb$threshold[t] * 100, "% that have followup through the timepoint selected",
+ sep = ""
+ ))
break
}
- }
- else {
- #cr.cuminc = cmprsk::cuminc(data[[ttoutcome]][data[[predictors[m]]] >
- # nb$threshold[t]], data[[outcome]][data[[predictors[m]]] >
- # nb$threshold[t]])
- #pdgivenx = cmprsk::timepoints(cr.cuminc, times = timepoint)$est[1]
- #if (is.na(pdgivenx)) {
- # error = rbind(error, paste(predictors[m],
- # ": No observations with risk greater than ",
- # nb$threshold[t] * 100, "% that have followup through the timepoint selected",
- # sep = ""))
- # break
- stop('not supported')
+ } else {
+ stop("not supported")
}
}
- nb[t, predictors[m]] = pdgivenx * px - (1 - pdgivenx) *
- px * nb$threshold[t]/(1 - nb$threshold[t]) -
+ nb[t, predictors[m]] <- pdgivenx * px - (1 - pdgivenx) *
+ px * nb$threshold[t] / (1 - nb$threshold[t]) -
harm[m]
}
- interv[predictors[m]] = (nb[predictors[m]] - nb["all"]) *
- interventionper/(interv$threshold/(1 - interv$threshold))
+ interv[predictors[m]] <- (nb[predictors[m]] - nb["all"]) *
+ interventionper / (interv$threshold / (1 - interv$threshold))
}
if (length(error) > 0) {
- print(paste(error, ", and therefore net benefit not calculable in this range.",
- sep = ""))
+ print(paste(error, ", and therefore net benefit not calculable in this range.",
+ sep = ""
+ ))
}
for (m in 1:pred.n) {
if (smooth == TRUE) {
- lws = stats::loess(data.matrix(nb[!is.na(nb[[predictors[m]]]),
- predictors[m]]) ~ data.matrix(nb[!is.na(nb[[predictors[m]]]),
- "threshold"]), span = loess.span)
- nb[!is.na(nb[[predictors[m]]]), paste(predictors[m],
- "_sm", sep = "")] = lws$fitted
- lws = stats::loess(data.matrix(interv[!is.na(nb[[predictors[m]]]),
- predictors[m]]) ~ data.matrix(interv[!is.na(nb[[predictors[m]]]),
- "threshold"]), span = loess.span)
- interv[!is.na(nb[[predictors[m]]]), paste(predictors[m],
- "_sm", sep = "")] = lws$fitted
+ lws <- stats::loess(data.matrix(nb[
+ !is.na(nb[[predictors[m]]]),
+ predictors[m]
+ ]) ~ data.matrix(nb[
+ !is.na(nb[[predictors[m]]]),
+ "threshold"
+ ]), span = loess.span)
+ nb[!is.na(nb[[predictors[m]]]), paste(predictors[m],
+ "_sm",
+ sep = ""
+ )] <- lws$fitted
+ lws <- stats::loess(data.matrix(interv[
+ !is.na(nb[[predictors[m]]]),
+ predictors[m]
+ ]) ~ data.matrix(interv[
+ !is.na(nb[[predictors[m]]]),
+ "threshold"
+ ]), span = loess.span)
+ interv[!is.na(nb[[predictors[m]]]), paste(predictors[m],
+ "_sm",
+ sep = ""
+ )] <- lws$fitted
}
}
if (graph == TRUE) {
@@ -401,47 +408,62 @@ stdca <- function (data, outcome, ttoutcome, timepoint, predictors, xstart = 0.0
legendcolor <- NULL
legendwidth <- NULL
legendpattern <- NULL
- ymax = max(interv[predictors], na.rm = TRUE)
- graphics::plot(x = nb$threshold, y = nb$all, type = "n",
- xlim = c(xstart, xstop), ylim = c(ymin, ymax),
- xlab = "Threshold probability", ylab = paste("Net reduction in interventions per",
- interventionper, "patients"))
+ ymax <- max(interv[predictors], na.rm = TRUE)
+ graphics::plot(
+ x = nb$threshold, y = nb$all, type = "n",
+ xlim = c(xstart, xstop), ylim = c(ymin, ymax),
+ xlab = "Threshold probability", ylab = paste(
+ "Net reduction in interventions per",
+ interventionper, "patients"
+ )
+ )
for (m in 1:pred.n) {
if (smooth == TRUE) {
- graphics::lines(interv$threshold, data.matrix(interv[paste(predictors[m],
- "_sm", sep = "")]), col = m,
- lty = 2)
- }
- else {
- graphics::lines(interv$threshold, data.matrix(interv[predictors[m]]),
- col = m, lty = 2)
+ graphics::lines(interv$threshold, data.matrix(interv[paste(predictors[m],
+ "_sm",
+ sep = ""
+ )]),
+ col = m,
+ lty = 2
+ )
+ } else {
+ graphics::lines(interv$threshold, data.matrix(interv[predictors[m]]),
+ col = m, lty = 2
+ )
}
legendlabel <- c(legendlabel, predictors[m])
legendcolor <- c(legendcolor, m)
legendwidth <- c(legendwidth, 1)
legendpattern <- c(legendpattern, 2)
}
- }
- else {
+ } else {
legendlabel <- c("None", "All")
legendcolor <- c(17, 8)
legendwidth <- c(2, 2)
legendpattern <- c(1, 1)
- ymax = max(nb[names(nb) != "threshold"], na.rm = TRUE)
- graphics::plot(x = nb$threshold, y = nb$all, type = "l",
- col = 8, lwd = 2, xlim = c(xstart, xstop), ylim = c(ymin,
- ymax), xlab = "Threshold probability",
- ylab = "Net benefit")
+ ymax <- max(nb[names(nb) != "threshold"], na.rm = TRUE)
+ graphics::plot(
+ x = nb$threshold, y = nb$all, type = "l",
+ col = 8, lwd = 2, xlim = c(xstart, xstop), ylim = c(
+ ymin,
+ ymax
+ ), xlab = "Threshold probability",
+ ylab = "Net benefit"
+ )
graphics::lines(x = nb$threshold, y = nb$none, lwd = 2)
for (m in 1:pred.n) {
if (smooth == TRUE) {
- graphics::lines(nb$threshold, data.matrix(nb[paste(predictors[m],
- "_sm", sep = "")]), col = m,
- lty = 2)
- }
- else {
- graphics::lines(nb$threshold, data.matrix(nb[predictors[m]]),
- col = m, lty = 2)
+ graphics::lines(nb$threshold, data.matrix(nb[paste(predictors[m],
+ "_sm",
+ sep = ""
+ )]),
+ col = m,
+ lty = 2
+ )
+ } else {
+ graphics::lines(nb$threshold, data.matrix(nb[predictors[m]]),
+ col = m, lty = 2
+ )
}
legendlabel <- c(legendlabel, predictors[m])
legendcolor <- c(legendcolor, m)
@@ -449,50 +471,54 @@ stdca <- function (data, outcome, ttoutcome, timepoint, predictors, xstart = 0.0
legendpattern <- c(legendpattern, 2)
}
}
- graphics::legend("topright", legendlabel, cex = 0.8,
- col = legendcolor, lwd = legendwidth, lty = legendpattern)
+ graphics::legend("topright", legendlabel,
+ cex = 0.8,
+ col = legendcolor, lwd = legendwidth, lty = legendpattern
+ )
}
- results = list()
- results$N = N
- results$predictors = data.frame(cbind(predictors, harm, probability))
- names(results$predictors) = c("predictor", "harm.applied",
- "probability")
- results$interventions.avoided.per = interventionper
- results$net.benefit = nb
- results$interventions.avoided = interv
+ results <- list()
+ results$N <- N
+ results$predictors <- data.frame(cbind(predictors, harm, probability))
+ names(results$predictors) <- c(
+ "predictor", "harm.applied",
+ "probability"
+ )
+ results$interventions.avoided.per <- interventionper
+ results$net.benefit <- nb
+ results$interventions.avoided <- interv
return(results)
}
-checkToByTwoTableInputs <- function(TP,FP,FN,TN){
+checkToByTwoTableInputs <- function(TP, FP, FN, TN) {
# check classes
- if(!inherits(x = TP, what = c('integer','numeric'))){
- stop('Incorrect TP class')
+ if (!inherits(x = TP, what = c("integer", "numeric"))) {
+ stop("Incorrect TP class")
}
- if(!inherits(x = FP, what = c('integer','numeric'))){
- stop('Incorrect FP class')
+ if (!inherits(x = FP, what = c("integer", "numeric"))) {
+ stop("Incorrect FP class")
}
- if(!inherits(x = TN, what = c('integer','numeric'))){
- stop('Incorrect TN class')
+ if (!inherits(x = TN, what = c("integer", "numeric"))) {
+ stop("Incorrect TN class")
}
- if(!inherits(x = FN, what = c('integer','numeric'))){
- stop('Incorrect FN class')
+ if (!inherits(x = FN, what = c("integer", "numeric"))) {
+ stop("Incorrect FN class")
}
-
+
# check positive values
- if(sum(TP<0)>0){
- stop('TP < 0')
+ if (sum(TP < 0) > 0) {
+ stop("TP < 0")
}
- if(sum(FP<0)>0){
- stop('FP < 0')
+ if (sum(FP < 0) > 0) {
+ stop("FP < 0")
}
- if(sum(TN<0)>0){
- stop('TN < 0')
+ if (sum(TN < 0) > 0) {
+ stop("TN < 0")
}
- if(sum(FN<0)>0){
- stop('FN < 0')
+ if (sum(FN < 0) > 0) {
+ stop("FN < 0")
}
-
+
return(invisible(TRUE))
}
# each measure is kept as a single small function for easy unit testing
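
The measure helpers below all take the same 2x2 confusion-table counts; a small worked example with toy counts:

TP <- 90; FP <- 10; FN <- 20; TN <- 880
sensitivity(TP, TN, FN, FP)             # TP / (TP + FN) = 90 / 110, about 0.818
specificity(TP, TN, FN, FP)             # TN / (FP + TN) = 880 / 890, about 0.989
positivePredictiveValue(TP, TN, FN, FP) # TP / (TP + FP) = 90 / 100 = 0.9
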
@@ -509,15 +535,15 @@ checkToByTwoTableInputs <- function(TP,FP,FN,TN){
#' @return
#' f1Score value
#'
-f1Score <- function(TP,TN,FN,FP){
+f1Score <- function(TP, TN, FN, FP) {
checkToByTwoTableInputs(
TP = TP,
FP = FP,
FN = FN,
TN = TN
- )
+ )
- return(2*(TP/(TP+FP))*(TP/(TP+FN))/((TP/(TP+FP))+(TP/(TP+FN))))
+ return(2 * (TP / (TP + FP)) * (TP / (TP + FN)) / ((TP / (TP + FP)) + (TP / (TP + FN))))
}
#' Calculate the accuracy
#'
@@ -532,15 +558,15 @@ f1Score <- function(TP,TN,FN,FP){
#' @return
#' accuracy value
#'
-accuracy <- function(TP,TN,FN,FP){
+accuracy <- function(TP, TN, FN, FP) {
checkToByTwoTableInputs(
TP = TP,
FP = FP,
FN = FN,
TN = TN
)
-
- return((TP+TN)/(TP+TN+FP+FN))
+
+ return((TP + TN) / (TP + TN + FP + FN))
}
#' Calculate the sensitivity
@@ -556,7 +582,7 @@ accuracy <- function(TP,TN,FN,FP){
#' @return
#' sensitivity value
#'
-sensitivity <- function(TP,TN,FN,FP){
+sensitivity <- function(TP, TN, FN, FP) {
checkToByTwoTableInputs(
TP = TP,
FP = FP,
@@ -564,7 +590,7 @@ sensitivity <- function(TP,TN,FN,FP){
TN = TN
)
- return(TP/(TP+FN))
+ return(TP / (TP + FN))
}
#' Calculate the falseNegativeRate
@@ -580,15 +606,15 @@ sensitivity <- function(TP,TN,FN,FP){
#' @return
#' falseNegativeRate value
#'
-falseNegativeRate <- function(TP,TN,FN,FP){
+falseNegativeRate <- function(TP, TN, FN, FP) {
checkToByTwoTableInputs(
TP = TP,
FP = FP,
FN = FN,
TN = TN
)
-
- return(FN/(TP+FN))
+
+ return(FN / (TP + FN))
}
#' Calculate the falsePositiveRate
@@ -604,7 +630,7 @@ falseNegativeRate <- function(TP,TN,FN,FP){
#' @return
#' falsePositiveRate value
#'
-falsePositiveRate <- function(TP,TN,FN,FP){
+falsePositiveRate <- function(TP, TN, FN, FP) {
checkToByTwoTableInputs(
TP = TP,
FP = FP,
@@ -612,7 +638,7 @@ falsePositiveRate <- function(TP,TN,FN,FP){
TN = TN
)
- return(FP/(FP+TN))
+ return(FP / (FP + TN))
}
#' Calculate the specificity
@@ -628,16 +654,15 @@ falsePositiveRate <- function(TP,TN,FN,FP){
#' @return
#' specificity value
#'
-specificity <- function(TP,TN,FN,FP){
-
+specificity <- function(TP, TN, FN, FP) {
checkToByTwoTableInputs(
TP = TP,
FP = FP,
FN = FN,
TN = TN
)
-
- return(TN/(FP+TN))
+
+ return(TN / (FP + TN))
}
#' Calculate the positivePredictiveValue
@@ -653,16 +678,15 @@ specificity <- function(TP,TN,FN,FP){
#' @return
#' positivePredictiveValue value
#'
-positivePredictiveValue <- function(TP,TN,FN,FP){
-
+positivePredictiveValue <- function(TP, TN, FN, FP) {
checkToByTwoTableInputs(
TP = TP,
FP = FP,
FN = FN,
TN = TN
)
-
- return(TP/(TP+FP))
+
+ return(TP / (TP + FP))
}
#' Calculate the falseDiscoveryRate
@@ -678,17 +702,16 @@ positivePredictiveValue <- function(TP,TN,FN,FP){
#' @return
#' falseDiscoveryRate value
#'
-falseDiscoveryRate <- function(TP,TN,FN,FP){
-
+falseDiscoveryRate <- function(TP, TN, FN, FP) {
checkToByTwoTableInputs(
TP = TP,
FP = FP,
FN = FN,
TN = TN
)
-
- return(FP/(TP+FP))
- }
+
+ return(FP / (TP + FP))
+}
#' Calculate the negativePredictiveValue
#'
@@ -703,16 +726,15 @@ falseDiscoveryRate <- function(TP,TN,FN,FP){
#' @return
#' negativePredictiveValue value
#'
-negativePredictiveValue <- function(TP,TN,FN,FP){
-
+negativePredictiveValue <- function(TP, TN, FN, FP) {
checkToByTwoTableInputs(
TP = TP,
FP = FP,
FN = FN,
TN = TN
)
-
- return(TN/(FN+TN))
+
+ return(TN / (FN + TN))
}
#' Calculate the falseOmissionRate
@@ -728,16 +750,15 @@ negativePredictiveValue <- function(TP,TN,FN,FP){
#' @return
#' falseOmissionRate value
#'
-falseOmissionRate <- function(TP,TN,FN,FP){
-
+falseOmissionRate <- function(TP, TN, FN, FP) {
checkToByTwoTableInputs(
TP = TP,
FP = FP,
FN = FN,
TN = TN
)
-
- return(FN/(FN+TN))
+
+ return(FN / (FN + TN))
}
#' Calculate the positiveLikelihoodRatio
@@ -752,17 +773,16 @@ falseOmissionRate <- function(TP,TN,FN,FP){
#'
#' @return
#' positiveLikelihoodRatio value
-#'
-positiveLikelihoodRatio <- function(TP,TN,FN,FP){
-
+#'
+positiveLikelihoodRatio <- function(TP, TN, FN, FP) {
checkToByTwoTableInputs(
TP = TP,
FP = FP,
FN = FN,
TN = TN
)
-
- return((TP/(TP+FN))/(FP/(FP+TN)))
+
+ return((TP / (TP + FN)) / (FP / (FP + TN)))
}
#' Calculate the negativeLikelihoodRatio
@@ -778,16 +798,15 @@ positiveLikelihoodRatio <- function(TP,TN,FN,FP){
#' @return
#' negativeLikelihoodRatio value
#'
-negativeLikelihoodRatio <- function(TP,TN,FN,FP){
-
+negativeLikelihoodRatio <- function(TP, TN, FN, FP) {
checkToByTwoTableInputs(
TP = TP,
FP = FP,
FN = FN,
TN = TN
)
-
- return((FN/(TP+FN))/(TN/(FP+TN)))
+
+ return((FN / (TP + FN)) / (TN / (FP + TN)))
}
@@ -804,14 +823,13 @@ negativeLikelihoodRatio <- function(TP,TN,FN,FP){
#' @return
#' diagnosticOddsRatio value
#'
-diagnosticOddsRatio <- function(TP,TN,FN,FP){
-
+diagnosticOddsRatio <- function(TP, TN, FN, FP) {
checkToByTwoTableInputs(
TP = TP,
FP = FP,
FN = FN,
TN = TN
)
-
- return(((TP/(TP+FN))/(FP/(FP+TN)))/((FN/(TP+FN))/(TN/(FP+TN))))
+
+ return(((TP / (TP + FN)) / (FP / (FP + TN))) / ((FN / (TP + FN)) / (TN / (FP + TN))))
}
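
Because the two likelihood ratios are built from the same row totals, the diagnostic odds ratio above reduces to the cross-product (TP * TN) / (FP * FN); a quick sanity check with toy counts:

TP <- 90; FP <- 10; FN <- 20; TN <- 880
lrPlus <- (TP / (TP + FN)) / (FP / (FP + TN))
lrMinus <- (FN / (TP + FN)) / (TN / (FP + TN))
all.equal(lrPlus / lrMinus, (TP * TN) / (FP * FN)) # TRUE
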
diff --git a/R/ViewShinyPlp.R b/R/ViewShinyPlp.R
index 71943ed27..cc7d0c067 100644
--- a/R/ViewShinyPlp.R
+++ b/R/ViewShinyPlp.R
@@ -4,30 +4,29 @@
#' Opens a shiny app for viewing the results of the models from various
#' target (T), outcome (O), time-at-risk (TAR) and model settings.
#' @param analysesLocation The directory containing the results (with the analysis_x folders)
-#'
+#'
#' @export
-viewMultiplePlp <- function(analysesLocation){
-
- if(!file.exists(file.path(analysesLocation, 'sqlite', 'databaseFile.sqlite'))){
- stop('No database found')
+viewMultiplePlp <- function(analysesLocation) {
+ if (!file.exists(file.path(analysesLocation, "sqlite", "databaseFile.sqlite"))) {
+ stop("No database found")
}
-
+
connectionDetailSettings <- list(
- dbms = 'sqlite',
- server = file.path(analysesLocation, 'sqlite', 'databaseFile.sqlite')
+ dbms = "sqlite",
+ server = file.path(analysesLocation, "sqlite", "databaseFile.sqlite")
)
-
+
databaseSettings <- list(
- connectionDetailSettings = connectionDetailSettings,
- schema = 'main',
- tablePrefix = '',
- dbms = 'sqlite',
- server = file.path(analysesLocation, 'sqlite', 'databaseFile.sqlite'),
- user = NULL,
+ connectionDetailSettings = connectionDetailSettings,
+ schema = "main",
+ tablePrefix = "",
+ dbms = "sqlite",
+ server = file.path(analysesLocation, "sqlite", "databaseFile.sqlite"),
+ user = NULL,
password = NULL,
port = NULL
)
-
+
viewPlps(databaseSettings)
}
@@ -45,31 +44,29 @@ viewMultiplePlp <- function(analysesLocation){
#'
#' @export
viewPlp <- function(runPlp, validatePlp = NULL, diagnosePlp = NULL) {
-
server <- insertRunPlpToSqlite(
- runPlp = runPlp,
+ runPlp = runPlp,
externalValidatePlp = validatePlp,
diagnosePlp = diagnosePlp
- )
-
+ )
+
connectionDetailSettings <- list(
- dbms = 'sqlite',
+ dbms = "sqlite",
server = server
)
-
+
databaseSettings <- list(
- connectionDetailSettings = connectionDetailSettings,
- schema = 'main',
- tablePrefix = '',
- dbms = 'sqlite',
+ connectionDetailSettings = connectionDetailSettings,
+ schema = "main",
+ tablePrefix = "",
+ dbms = "sqlite",
server = server,
- user = NULL,
+ user = NULL,
password = NULL,
port = NULL
)
-
- viewPlps(databaseSettings)
+ viewPlps(databaseSettings)
}
@@ -77,104 +74,97 @@ viewPlp <- function(runPlp, validatePlp = NULL, diagnosePlp = NULL) {
#'
#' @details
#' Opens a shiny app for viewing the results of the models from a database
-#'
+#'
#' @param mySchema Database result schema containing the result tables
-#' @param myServer server with the result database
+#' @param myServer server with the result database
#' @param myUser Username for the connection to the result database
#' @param myPassword Password for the connection to the result database
#' @param myDbms database management system for the result database
#' @param myPort Port for the connection to the result database
#' @param myTableAppend A string appended to the results tables (optional)
-#'
+#'
#' @export
viewDatabaseResultPlp <- function(
- mySchema,
- myServer,
- myUser,
- myPassword,
- myDbms,
- myPort = NULL,
- myTableAppend
- ){
-
+ mySchema,
+ myServer,
+ myUser,
+ myPassword,
+ myDbms,
+ myPort = NULL,
+ myTableAppend) {
connectionDetailSettings <- list(
dbms = myDbms,
- server = myServer,
- user = myUser,
- password = myPassword,
+ server = myServer,
+ user = myUser,
+ password = myPassword,
port = myPort
)
-
+
databaseSettings <- list(
- connectionDetailSettings = connectionDetailSettings,
+ connectionDetailSettings = connectionDetailSettings,
schema = mySchema,
tablePrefix = myTableAppend,
dbms = myDbms,
server = myServer,
- user = myUser,
+ user = myUser,
password = myPassword,
port = myPort
)
-
+
viewPlps(databaseSettings)
-
}
# code for multiple and single together
-# one shiny app
+# one shiny app
-viewPlps <- function(databaseSettings){
- ensure_installed("ShinyAppBuilder")
- ensure_installed("ResultModelManager")
-
+viewPlps <- function(databaseSettings) {
+ rlang::check_installed("OhdsiShinyAppBuilder")
+ rlang::check_installed("ResultModelManager")
connectionDetails <- do.call(
- DatabaseConnector::createConnectionDetails,
+ DatabaseConnector::createConnectionDetails,
databaseSettings$connectionDetailSettings
- )
+ )
connection <- ResultModelManager::ConnectionHandler$new(connectionDetails)
databaseSettings$connectionDetailSettings <- NULL
-
- shinyAppVersion <- strsplit(x = as.character(utils::packageVersion('ShinyAppBuilder')), split = '\\.')[[1]]
-
- if((shinyAppVersion[1] <= 1 & shinyAppVersion[2] < 2)){
+
+ shinyAppVersion <- strsplit(x = as.character(utils::packageVersion("OhdsiShinyAppBuilder")), split = "\\.")[[1]]
+
+  if (as.numeric(shinyAppVersion[1]) <= 1 && as.numeric(shinyAppVersion[2]) < 2) {
    # Old code to be backwards compatible
config <- ParallelLogger::loadSettingsFromJson(
fileName = system.file(
- 'shinyConfig.json',
+ "shinyConfig.json",
package = "PatientLevelPrediction"
)
)
# set database settings into system variables
- Sys.setenv("resultDatabaseDetails_prediction" = as.character(ParallelLogger::convertSettingsToJson(databaseSettings)))
- ShinyAppBuilder::viewShiny(
- config = config,
- connection = connection
+ Sys.setenv("resultDatabaseDetails_prediction" = as.character(ParallelLogger::convertSettingsToJson(databaseSettings)))
+ OhdsiShinyAppBuilder::viewShiny(
+ config = config,
+ connection = connection
)
- } else{
- ohdsiModulesVersion <- strsplit(x = as.character(utils::packageVersion('OhdsiShinyModules')), split = '\\.')[[1]]
- if(paste0(ohdsiModulesVersion[1], ".", ohdsiModulesVersion[2])>= 1.2){
+ } else {
+ ohdsiModulesVersion <- strsplit(x = as.character(utils::packageVersion("OhdsiShinyModules")), split = "\\.")[[1]]
+ if (paste0(ohdsiModulesVersion[1], ".", ohdsiModulesVersion[2]) >= 1.2) {
config <- ParallelLogger::loadSettingsFromJson(
fileName = system.file(
- 'shinyConfigUpdate.json',
+ "shinyConfigUpdate.json",
package = "PatientLevelPrediction"
)
)
- databaseSettings$plpTablePrefix = databaseSettings$tablePrefix
- databaseSettings$cgTablePrefix = databaseSettings$tablePrefix
- databaseSettings$databaseTable = 'database_meta_data'
- databaseSettings$databaseTablePrefix = databaseSettings$tablePrefix
- ShinyAppBuilder::viewShiny(
- config = config,
- connection = connection,
- resultDatabaseSettings = databaseSettings
- )
- } else{
- ParallelLogger::logWarn('Need to update package OhdsiShinyModules')
+ databaseSettings$plpTablePrefix <- databaseSettings$tablePrefix
+ databaseSettings$cgTablePrefix <- databaseSettings$tablePrefix
+ databaseSettings$databaseTable <- "database_meta_data"
+ databaseSettings$databaseTablePrefix <- databaseSettings$tablePrefix
+ OhdsiShinyAppBuilder::viewShiny(
+ config = config,
+ connection = connection,
+ resultDatabaseSettings = databaseSettings
+ )
+ } else {
+ ParallelLogger::logWarn("Need to update package OhdsiShinyModules")
}
-
}
-
-
-}
\ No newline at end of file
+}
diff --git a/R/uploadToDatabase.R b/R/uploadToDatabase.R
index 9f94f1464..66d769e7d 100644
--- a/R/uploadToDatabase.R
+++ b/R/uploadToDatabase.R
@@ -17,82 +17,79 @@
# limitations under the License.
insertRunPlpToSqlite <- function(
- runPlp,
- externalValidatePlp = NULL,
- diagnosePlp = NULL
- ){
-
+ runPlp,
+ externalValidatePlp = NULL,
+ diagnosePlp = NULL) {
sqliteLocation <- tempdir()
-
- ensure_installed('RSQLite')
-
+
+ rlang::check_installed("RSQLite")
+
# create sqlite database
connectionDetails <- DatabaseConnector::createConnectionDetails(
- dbms = 'sqlite',
- server = file.path(sqliteLocation,'databaseFile.sqlite')
+ dbms = "sqlite",
+ server = file.path(sqliteLocation, "databaseFile.sqlite")
)
createPlpResultTables(
connectionDetails = connectionDetails,
- targetDialect = 'sqlite',
- resultSchema = 'main',
- deleteTables = T,
- createTables = T,
- tablePrefix = ''
+ targetDialect = "sqlite",
+ resultSchema = "main",
+ deleteTables = TRUE,
+ createTables = TRUE,
+ tablePrefix = ""
)
-
- #cohortDefinitions <- data.frame(
- # cohortId = c(runPlp$model$modelDesign$targetId, runPlp$model$modelDesign$outcomeId),
- # cohortName = c('Target', 'Outcome'),
- # json = c('{}', '{}')
- # )
-
addRunPlpToDatabase(
runPlp = runPlp,
connectionDetails = connectionDetails,
- databaseSchemaSettings = createDatabaseSchemaSettings(resultSchema = 'main'),
- cohortDefinitions = NULL,#cohortDefinitions,
- databaseList = NULL,
+ databaseSchemaSettings = createDatabaseSchemaSettings(resultSchema = "main"),
+ cohortDefinitions = NULL, # cohortDefinitions,
+ databaseList = NULL,
modelSaveLocation = sqliteLocation
)
-
+
# add validation results if entered
- if(!is.null(externalValidatePlp)){
- if(inherits(x = externalValidatePlp, what = 'list')){
- for(i in 1:length(externalValidatePlp)){
+ if (!is.null(externalValidatePlp)) {
+ if (inherits(x = externalValidatePlp, what = "list")) {
+ for (i in seq_along(externalValidatePlp)) {
tryCatch(
{
addRunPlpToDatabase(
runPlp = externalValidatePlp[[i]],
connectionDetails = connectionDetails,
- databaseSchemaSettings = createDatabaseSchemaSettings(resultSchema = 'main'),
- cohortDefinitions = NULL,#cohortDefinitions,
- databaseList = NULL,
+ databaseSchemaSettings = createDatabaseSchemaSettings(resultSchema = "main"),
+ cohortDefinitions = NULL, # cohortDefinitions,
+ databaseList = NULL,
modelSaveLocation = sqliteLocation
)
- }, error = function(e){ParallelLogger::logError(e)}
+ },
+ error = function(e) {
+ ParallelLogger::logError(e)
+ }
)
}
}
}
-
+
# add diagnosis results if entered
- if(!is.null(diagnosePlp)){
+ if (!is.null(diagnosePlp)) {
tryCatch(
{
addDiagnosePlpToDatabase(
diagnosePlp = diagnosePlp,
connectionDetails = connectionDetails,
- databaseSchemaSettings = createDatabaseSchemaSettings(resultSchema = 'main'),
- cohortDefinitions = NULL,#cohortDefinitions,
+ databaseSchemaSettings = createDatabaseSchemaSettings(resultSchema = "main"),
+ cohortDefinitions = NULL, # cohortDefinitions,
databaseList = NULL
)
- }, error = function(e){ParallelLogger::logError(e)}
+ },
+ error = function(e) {
+ ParallelLogger::logError(e)
+ }
)
}
-
-
- return(file.path(sqliteLocation,'databaseFile.sqlite'))
+
+
+ return(file.path(sqliteLocation, "databaseFile.sqlite"))
}
#' Create sqlite database with the results
@@ -106,60 +103,56 @@ insertRunPlpToSqlite <- function(
#' @param cohortDefinitions A set of one or more cohorts extracted using ROhdsiWebApi::exportCohortDefinitionSet()
#' @param databaseList A list created by \code{createDatabaseList} to specify the databases
#' @param sqliteLocation (string) location of directory where the sqlite database will be saved
-#'
+#'
#' @return
#' Returns the location of the sqlite database file
-#'
+#'
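+#' @examples
+#' \dontrun{
+#' # A sketch with a hypothetical result folder; assumes cohortDefinitions was
+#' # created with ROhdsiWebApi::exportCohortDefinitionSet().
+#' sqliteFile <- insertResultsToSqlite(
+#'   resultLocation = "output/plpResults",
+#'   cohortDefinitions = cohortDefinitions
+#' )
+#' }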
#' @export
insertResultsToSqlite <- function(
- resultLocation,
- cohortDefinitions,
- databaseList = NULL,
- sqliteLocation = file.path(resultLocation, 'sqlite')
-){
-
- if(!dir.exists(sqliteLocation)){
- dir.create(sqliteLocation, recursive = T)
+ resultLocation,
+ cohortDefinitions,
+ databaseList = NULL,
+ sqliteLocation = file.path(resultLocation, "sqlite")) {
+ if (!dir.exists(sqliteLocation)) {
+ dir.create(sqliteLocation, recursive = TRUE)
}
-
- ensure_installed('RSQLite')
-
+ rlang::check_installed("RSQLite")
# create sqlite database
connectionDetails <- DatabaseConnector::createConnectionDetails(
- dbms = 'sqlite',
- server = file.path(sqliteLocation,'databaseFile.sqlite')
+ dbms = "sqlite",
+ server = file.path(sqliteLocation, "databaseFile.sqlite")
)
-
+
# create tables if they dont exist
createPlpResultTables(
- connectionDetails = connectionDetails,
- targetDialect = 'sqlite',
- resultSchema = 'main',
- deleteTables = T,
- createTables = T,
- tablePrefix = ''
- )
+ connectionDetails = connectionDetails,
+ targetDialect = "sqlite",
+ resultSchema = "main",
+ deleteTables = TRUE,
+ createTables = TRUE,
+ tablePrefix = ""
+ )
# run insert models
addMultipleRunPlpToDatabase(
- connectionDetails = connectionDetails,
- databaseSchemaSettings = createDatabaseSchemaSettings(resultSchema = 'main'),
+ connectionDetails = connectionDetails,
+ databaseSchemaSettings = createDatabaseSchemaSettings(resultSchema = "main"),
cohortDefinitions = cohortDefinitions,
databaseList = databaseList,
resultLocation = resultLocation,
modelSaveLocation = sqliteLocation
)
-
+
# run insert diagnosis
addMultipleDiagnosePlpToDatabase(
connectionDetails = connectionDetails,
- databaseSchemaSettings = createDatabaseSchemaSettings(resultSchema = 'main'),
+ databaseSchemaSettings = createDatabaseSchemaSettings(resultSchema = "main"),
cohortDefinitions = cohortDefinitions,
databaseList = databaseList,
resultLocation = resultLocation
)
-
- return(file.path(sqliteLocation,'databaseFile.sqlite'))
+
+ return(file.path(sqliteLocation, "databaseFile.sqlite"))
}
#' Create the results tables to store PatientLevelPrediction models and results into a database
@@ -169,81 +162,80 @@ insertResultsToSqlite <- function(
#' @details
#' This function can be used to create (or delete) PatientLevelPrediction result tables
#'
-#' @param connectionDetails The database connection details
+#' @param connectionDetails The database connection details
#' @param targetDialect The database management system being used
#' @param resultSchema The name of the database schema in which the result tables will be created.
#' @param deleteTables If true any existing tables matching the PatientLevelPrediction result tables names will be deleted
#' @param createTables If true the PatientLevelPrediction result tables will be created
#' @param tablePrefix A string that appends to the PatientLevelPrediction result tables
#' @param tempEmulationSchema The temp schema used when the database management system is oracle
-#'
+#'
#' @param testFile (used for testing) The location of an sql file with the table creation code
#'
#' @return
#' Returns NULL but creates the required tables into the specified database schema(s).
-#'
+#'
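+#' @examples
+#' \dontrun{
+#' # A sketch creating the result tables in a local sqlite file; the file path
+#' # is a hypothetical placeholder.
+#' connectionDetails <- DatabaseConnector::createConnectionDetails(
+#'   dbms = "sqlite",
+#'   server = "results.sqlite"
+#' )
+#' createPlpResultTables(
+#'   connectionDetails = connectionDetails,
+#'   targetDialect = "sqlite",
+#'   resultSchema = "main"
+#' )
+#' }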
#' @export
createPlpResultTables <- function(
connectionDetails,
- targetDialect = 'postgresql',
- resultSchema,
- deleteTables = T,
- createTables = T,
- tablePrefix = '',
- tempEmulationSchema = getOption("sqlRenderTempEmulationSchema"),
- testFile = NULL
-){
-
+ targetDialect = "postgresql",
+ resultSchema,
+ deleteTables = TRUE,
+ createTables = TRUE,
+ tablePrefix = "",
+ tempEmulationSchema = getOption("sqlRenderTempEmulationSchema"),
+ testFile = NULL) {
conn <- DatabaseConnector::connect(connectionDetails = connectionDetails)
on.exit(DatabaseConnector::disconnect(conn))
-
+
tablesExists <- sum(tolower(getPlpResultTables()) %in% tolower(DatabaseConnector::getTableNames(connection = conn, databaseSchema = resultSchema)))
tablesExists <- tablesExists == length(getPlpResultTables())
-
- if(!tablesExists){
- ParallelLogger::logInfo('All or some PLP result tables do not exist, tables being recreated')
- if(deleteTables){
- ParallelLogger::logInfo('Deleting existing tables')
-
+
+ if (!tablesExists) {
+ ParallelLogger::logInfo("Some or all PLP result tables do not exist; the tables will be recreated")
+ if (deleteTables) {
+ ParallelLogger::logInfo("Deleting existing tables")
+
tableNames <- getPlpResultTables()
-
+
deleteTables(
conn = conn,
databaseSchema = resultSchema,
targetDialect = targetDialect,
tempEmulationSchema = tempEmulationSchema,
- tableNames = tableNames,
+ tableNames = tableNames,
tablePrefix = tablePrefix
)
-
}
-
-
- if(createTables){
- ParallelLogger::logInfo('Creating PLP results tables')
-
- if(tablePrefix != ''){
- tablePrefix <- paste0(toupper(gsub('_','',gsub(' ','', tablePrefix))), '_')
+
+
+ if (createTables) {
+ ParallelLogger::logInfo("Creating PLP results tables")
+
+ if (tablePrefix != "") {
+ tablePrefix <- paste0(toupper(gsub("_", "", gsub(" ", "", tablePrefix))), "_")
}
-
+
pathToSql <- system.file(
- paste("sql/", targetDialect,
- sep = ""),
- "PlpResultTables.sql",
+ paste("sql/", targetDialect,
+ sep = ""
+ ),
+ "PlpResultTables.sql",
package = "PatientLevelPrediction"
)
-
- if(!file.exists(pathToSql)){
+
+ if (!file.exists(pathToSql)) {
# if no dbms specific file use sql_server
pathToSql <- system.file(
- paste("sql/", 'sql_server',
- sep = ""),
- "PlpResultTables.sql",
+ paste("sql/", "sql_server",
+ sep = ""
+ ),
+ "PlpResultTables.sql",
package = "PatientLevelPrediction"
)
}
-
- sql <- readChar(pathToSql, file.info(pathToSql)$size)
+
+ sql <- readChar(pathToSql, file.info(pathToSql)$size)
renderedSql <- SqlRender::render(
sql = sql[1],
my_schema = resultSchema,
@@ -254,22 +246,20 @@ createPlpResultTables <- function(
targetDialect = targetDialect,
tempEmulationSchema = tempEmulationSchema
)
-
+
DatabaseConnector::executeSql(conn, renderedSql)
}
-
- } else{
- ParallelLogger::logInfo('PLP result tables already exist')
+ } else {
+ ParallelLogger::logInfo("PLP result tables already exist")
}
-
+
# then migrate
- ParallelLogger::logInfo('PLP result migrration being applied')
+ ParallelLogger::logInfo("PLP result migration being applied")
migrateDataModel(
connectionDetails = connectionDetails, # input is connection
databaseSchema = resultSchema,
tablePrefix = tablePrefix
)
-
}
#' Populate the PatientLevelPrediction results tables
@@ -282,72 +272,76 @@ createPlpResultTables <- function(
#' @param connectionDetails A connection details created by using the
#' function \code{createConnectionDetails} in the
#' \code{DatabaseConnector} package.
-#' @param databaseSchemaSettings A object created by \code{createDatabaseSchemaSettings} with all the settings specifying the result tables
+#' @param databaseSchemaSettings An object created by \code{createDatabaseSchemaSettings} with all the settings specifying the result tables
#' @param cohortDefinitions A set of one or more cohorts extracted using ROhdsiWebApi::exportCohortDefinitionSet()
#' @param databaseList (Optional) A list created by \code{createDatabaseList} to specify the databases
#' @param resultLocation (string) location of directory where the main package results were saved
-#' @param resultLocationVector (only used when resultLocation is missing) a vector of locations with development or validation results
-#' @param modelSaveLocation The location of the file system for saving the models in a subdirectory
-#'
+#' @param resultLocationVector (only used when resultLocation is missing) a vector of locations with development or validation results
+#' @param modelSaveLocation The location of the file system for saving the models in a subdirectory
+#'
#' @return
#' Returns NULL but uploads all the results in resultLocation to the PatientLevelPrediction result tables in resultSchema
-#'
+#'
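+#' @examples
+#' \dontrun{
+#' # A sketch with hypothetical locations; assumes connectionDetails and
+#' # cohortDefinitions already exist in the session.
+#' addMultipleRunPlpToDatabase(
+#'   connectionDetails = connectionDetails,
+#'   cohortDefinitions = cohortDefinitions,
+#'   resultLocation = "output/plpResults",
+#'   modelSaveLocation = "output/models"
+#' )
+#' }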
#' @export
addMultipleRunPlpToDatabase <- function(
- connectionDetails,
- databaseSchemaSettings = createDatabaseSchemaSettings(resultSchema = 'main'),
+ connectionDetails,
+ databaseSchemaSettings = createDatabaseSchemaSettings(resultSchema = "main"),
cohortDefinitions,
databaseList = NULL,
resultLocation = NULL,
resultLocationVector,
- modelSaveLocation
-){
+ modelSaveLocation) {
conn <- DatabaseConnector::connect(connectionDetails = connectionDetails)
on.exit(DatabaseConnector::disconnect(conn))
-
+
# for each development result add it to the database:
-
- if(missing(resultLocationVector)){
+
+ if (missing(resultLocationVector)) {
resultLocationVector <- getResultLocations(resultLocation)
}
-
- if(length(resultLocationVector) == 0){
- ParallelLogger::logInfo('No results found')
+
+ if (length(resultLocationVector) == 0) {
+ ParallelLogger::logInfo("No results found")
return(NULL)
}
-
- for(runPlpLocation in resultLocationVector){
- ParallelLogger::logInfo(paste0('Inserting result @ ', runPlpLocation, ' into database'))
-
+
+ for (runPlpLocation in resultLocationVector) {
+ ParallelLogger::logInfo(paste0("Inserting result @ ", runPlpLocation, " into database"))
+
# TODO edit csv here
runPlp <- tryCatch(
- {PatientLevelPrediction::loadPlpResult(runPlpLocation)},
- error = function(e){ParallelLogger::logInfo(e);return(NULL)}
+ {
+ PatientLevelPrediction::loadPlpResult(runPlpLocation)
+ },
+ error = function(e) {
+ ParallelLogger::logInfo(e)
+ return(NULL)
+ }
)
-
- if(!is.null(runPlp)){
- ParallelLogger::logInfo('result loaded')
-
+
+ if (!is.null(runPlp)) {
+ ParallelLogger::logInfo("result loaded")
+
# Add runPlp to the database
tryCatch(
- {addRunPlpToDatabase(
- runPlp = runPlp,
- connectionDetails = connectionDetails,
- databaseSchemaSettings = databaseSchemaSettings,
- cohortDefinitions = cohortDefinitions,
- databaseList = databaseList,
- modelSaveLocation = modelSaveLocation
- )}, error = function(e){
- ParallelLogger::logInfo('result upload failed: ');
+ {
+ addRunPlpToDatabase(
+ runPlp = runPlp,
+ connectionDetails = connectionDetails,
+ databaseSchemaSettings = databaseSchemaSettings,
+ cohortDefinitions = cohortDefinitions,
+ databaseList = databaseList,
+ modelSaveLocation = modelSaveLocation
+ )
+ },
+ error = function(e) {
+ ParallelLogger::logInfo("result upload failed: ")
ParallelLogger::logInfo(e)
- }
+ }
)
-
- } #model not null
-
+ } # model not null
} # per model
-
-} #end funct
+} # end funct
#' Create the PatientLevelPrediction database result schema settings
@@ -365,51 +359,49 @@ addMultipleRunPlpToDatabase <- function(
#' @param tablePrefixCohortDefinitionTables (string) A string that appends to the cohort definition tables
#' @param databaseDefinitionSchema (string) The name of the database schema with the database definition tables (defaults to resultSchema).
#' @param tablePrefixDatabaseDefinitionTables (string) A string that appends to the database definition tables
-#'
+#'
#' @return
#' Returns a list of class 'plpDatabaseResultSchema' with all the database settings
-#'
+#'
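+#' @examples
+#' \dontrun{
+#' # A sketch targeting a postgres result schema with a table prefix; the
+#' # schema name and prefix are hypothetical.
+#' databaseSchemaSettings <- createDatabaseSchemaSettings(
+#'   resultSchema = "plp_results",
+#'   tablePrefix = "plp",
+#'   targetDialect = "postgresql"
+#' )
+#' }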
#' @export
createDatabaseSchemaSettings <- function(
- resultSchema = 'main',
- tablePrefix = '',
- targetDialect = 'sqlite',
- tempEmulationSchema = getOption("sqlRenderTempEmulationSchema"),
- cohortDefinitionSchema = resultSchema,
- tablePrefixCohortDefinitionTables = tablePrefix,
- databaseDefinitionSchema = resultSchema,
- tablePrefixDatabaseDefinitionTables = tablePrefix
-){
-
- if(missing(resultSchema)){
- stop('resultSchema required')
+ resultSchema = "main",
+ tablePrefix = "",
+ targetDialect = "sqlite",
+ tempEmulationSchema = getOption("sqlRenderTempEmulationSchema"),
+ cohortDefinitionSchema = resultSchema,
+ tablePrefixCohortDefinitionTables = tablePrefix,
+ databaseDefinitionSchema = resultSchema,
+ tablePrefixDatabaseDefinitionTables = tablePrefix) {
+ if (missing(resultSchema)) {
+ stop("resultSchema required")
}
- if(!inherits(x = resultSchema, what = "character")){
- stop('resultSchema must be a string')
+ if (!inherits(x = resultSchema, what = "character")) {
+ stop("resultSchema must be a string")
}
-
- if(tablePrefix != ''){
- tablePrefix <- paste0(toupper(gsub('_','',gsub(' ','', tablePrefix))), '_')
+
+ if (tablePrefix != "") {
+ tablePrefix <- paste0(toupper(gsub("_", "", gsub(" ", "", tablePrefix))), "_")
}
- if(tablePrefixCohortDefinitionTables != ''){
- tablePrefixCohortDefinitionTables <- paste0(toupper(gsub('_','',gsub(' ','', tablePrefixCohortDefinitionTables))), '_')
+ if (tablePrefixCohortDefinitionTables != "") {
+ tablePrefixCohortDefinitionTables <- paste0(toupper(gsub("_", "", gsub(" ", "", tablePrefixCohortDefinitionTables))), "_")
}
- if(tablePrefixDatabaseDefinitionTables != ''){
- tablePrefixDatabaseDefinitionTables <- paste0(toupper(gsub('_','',gsub(' ','', tablePrefixDatabaseDefinitionTables))), '_')
+ if (tablePrefixDatabaseDefinitionTables != "") {
+ tablePrefixDatabaseDefinitionTables <- paste0(toupper(gsub("_", "", gsub(" ", "", tablePrefixDatabaseDefinitionTables))), "_")
}
-
+
result <- list(
resultSchema = resultSchema,
tablePrefix = tablePrefix,
targetDialect = targetDialect,
tempEmulationSchema = tempEmulationSchema,
- cohortDefinitionSchema = cohortDefinitionSchema, # could be removed
+ cohortDefinitionSchema = cohortDefinitionSchema, # could be removed
tablePrefixCohortDefinitionTables = tablePrefixCohortDefinitionTables, # could be removed
databaseDefinitionSchema = databaseDefinitionSchema, # could be removed
tablePrefixDatabaseDefinitionTables = tablePrefixDatabaseDefinitionTables # could be removed
)
-
- class(result) <- 'plpDatabaseResultSchema'
+
+ class(result) <- "plpDatabaseResultSchema"
return(result)
}
@@ -420,58 +412,58 @@ createDatabaseSchemaSettings <- function(
#'
#' @details
#' This function is used when inserting database details into the PatientLevelPrediction database results schema
-#'
+#'
#' @param cdmDatabaseSchemas (string vector) A vector of the cdmDatabaseSchemas used in the study - if the schemas are not unique per database please also specify databaseRefId
#' @param cdmDatabaseNames Sharable names for the databases
#' @param databaseRefIds (string vector) Unique database identifiers - what you specified as cdmDatabaseId in \code{PatientLevelPrediction::createDatabaseDetails()} when developing the models
#'
#' @return
#' Returns a named list with the database details
-#'
+#'
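+#' @examples
+#' \dontrun{
+#' # A sketch with hypothetical schema names and sharable database names.
+#' databaseList <- createDatabaseList(
+#'   cdmDatabaseSchemas = c("cdm_a", "cdm_b"),
+#'   cdmDatabaseNames = c("Database A", "Database B")
+#' )
+#' }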
#' @export
createDatabaseList <- function(
- cdmDatabaseSchemas,
- cdmDatabaseNames,
- databaseRefIds = NULL
-){
- if(missing(cdmDatabaseSchemas)){
- stop('Need to specify cdmDatabaseSchemas')
+ cdmDatabaseSchemas,
+ cdmDatabaseNames,
+ databaseRefIds = NULL) {
+ if (missing(cdmDatabaseSchemas)) {
+ stop("Need to specify cdmDatabaseSchemas")
}
-
- if(is.null(databaseRefIds)){
- ParallelLogger::logInfo('No databaseRefId specified so using schema as unique database identifier')
+
+ if (is.null(databaseRefIds)) {
+ ParallelLogger::logInfo("No databaseRefId specified so using schema as unique database identifier")
databaseRefIds <- removeInvalidString(cdmDatabaseSchemas)
}
- if(missing(cdmDatabaseNames)){
+ if (missing(cdmDatabaseNames)) {
cdmDatabaseNames <- removeInvalidString(cdmDatabaseSchemas)
}
-
-
+
+
result <- lapply(
1:length(cdmDatabaseSchemas),
-
- function(i) list(
- databaseDetails = list(
- databaseMetaDataId = databaseRefIds[i]
- ),
- databaseMetaData = list(
- databaseId = databaseRefIds[i],
- cdmSourceName = cdmDatabaseSchemas[i],
- cdmSourceAbbreviation = cdmDatabaseNames[i],
- cdmHolder = '', # could get this from CDM_source inside runPlp in future
- sourceDesciption = '',
- sourceDocumentReference = '',
- cdmEtlReference = '',
- sourceReleaseDate = '',
- cdmReleaseDate = '',
- cdmVersion = '',
- vocabularyVersion = '',
- maxObsPeriodEndDate = ''
+ function(i) {
+ list(
+ databaseDetails = list(
+ databaseMetaDataId = databaseRefIds[i]
+ ),
+ databaseMetaData = list(
+ databaseId = databaseRefIds[i],
+ cdmSourceName = cdmDatabaseSchemas[i],
+ cdmSourceAbbreviation = cdmDatabaseNames[i],
+ cdmHolder = "", # could get this from CDM_source inside runPlp in future
+ sourceDesciption = "",
+ sourceDocumentReference = "",
+ cdmEtlReference = "",
+ sourceReleaseDate = "",
+ cdmReleaseDate = "",
+ cdmVersion = "",
+ vocabularyVersion = "",
+ maxObsPeriodEndDate = ""
+ )
)
- )
+ }
)
-
- names(result) <- databaseRefIds #cdmDatabaseSchemas
+
+ names(result) <- databaseRefIds # cdmDatabaseSchemas
# using id as schema may not be unique
# id uses schema if it is not set
@@ -486,94 +478,90 @@ createDatabaseList <- function(
#'
#' @details
#' This function is used when inserting results into the PatientLevelPrediction database results schema
-#'
+#'
#' @param runPlp An object of class \code{runPlp} or class \code{externalValidatePlp}
#' @param connectionDetails A connection details created by using the
#' function \code{createConnectionDetails} in the
#' \code{DatabaseConnector} package.
-#' @param databaseSchemaSettings A object created by \code{createDatabaseSchemaSettings} with all the settings specifying the result tables
+#' @param databaseSchemaSettings An object created by \code{createDatabaseSchemaSettings} with all the settings specifying the result tables
#' @param cohortDefinitions A set of one or more cohorts extracted using ROhdsiWebApi::exportCohortDefinitionSet()
#' @param modelSaveLocation The location of the directory that models will be saved to
#' @param databaseList (Optional) If you want to change the database name then use \code{createDatabaseList} to specify the database settings but use the same cdmDatabaseId as was used in model development/validation
-#'
+#'
#' @return
#' Returns NULL but uploads the model and results into the database schema specified in databaseSchemaSettings
-#'
+#'
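+#' @examples
+#' \dontrun{
+#' # A sketch with a hypothetical result path; assumes connectionDetails and
+#' # cohortDefinitions already exist in the session.
+#' runPlp <- loadPlpResult("output/Analysis_1/plpResult")
+#' addRunPlpToDatabase(
+#'   runPlp = runPlp,
+#'   connectionDetails = connectionDetails,
+#'   databaseSchemaSettings = createDatabaseSchemaSettings(resultSchema = "main"),
+#'   cohortDefinitions = cohortDefinitions,
+#'   modelSaveLocation = "output/models"
+#' )
+#' }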
#' @export
addRunPlpToDatabase <- function(
- runPlp,
- connectionDetails,
- databaseSchemaSettings,
- cohortDefinitions,
- modelSaveLocation,
- databaseList = NULL
-){
-
+ runPlp,
+ connectionDetails,
+ databaseSchemaSettings,
+ cohortDefinitions,
+ modelSaveLocation,
+ databaseList = NULL) {
conn <- DatabaseConnector::connect(connectionDetails = connectionDetails)
on.exit(DatabaseConnector::disconnect(conn))
-
+
modelDesignId <- insertModelDesignInDatabase(
- object = runPlp$model$modelDesign,
- conn = conn,
+ object = runPlp$model$modelDesign,
+ conn = conn,
databaseSchemaSettings = databaseSchemaSettings,
- cohortDefinitions = cohortDefinitions
+ cohortDefinitions = cohortDefinitions
)
-
+
# Add model if runPlp
- if(inherits(runPlp, 'runPlp')){
- includesModel <- T
+ if (inherits(runPlp, "runPlp")) {
+ includesModel <- TRUE
developmentDatabase <- runPlp$model$trainDetails$developmentDatabase
validationDatabase <- runPlp$model$trainDetails$developmentDatabase
developmentDatabaseRefId <- runPlp$model$trainDetails$developmentDatabaseId
validationDatabaseRefId <- runPlp$model$trainDetails$developmentDatabaseId
-
+
populationSettings <- runPlp$model$modelDesign$populationSettings
targetId <- runPlp$model$modelDesign$targetId
outcomeId <- runPlp$model$modelDesign$outcomeId
restrictPlpDataSettings <- runPlp$model$modelDesign$restrictPlpDataSettings
-
- modelDevelopment <- 1 #added
-
- attrition <- runPlp$model$trainDetails$attrition
- } else{
- includesModel <- F
+ modelDevelopment <- 1 # added
+
+ attrition <- runPlp$model$trainDetails$attrition
+ } else {
+ includesModel <- FALSE
developmentDatabase <- runPlp$model$validationDetails$developmentDatabase
validationDatabase <- runPlp$model$validationDetails$validationDatabase
developmentDatabaseRefId <- runPlp$model$validationDetails$developmentDatabaseId
validationDatabaseRefId <- runPlp$model$validationDetails$validationDatabaseId
-
+
populationSettings <- runPlp$model$validationDetails$populationSettings
targetId <- runPlp$model$validationDetails$targetId
outcomeId <- runPlp$model$validationDetails$outcomeId
restrictPlpDataSettings <- runPlp$model$validationDetails$restrictPlpDataSettings
-
- modelDevelopment <- 0 #added
-
+
+ modelDevelopment <- 0 # added
+
attrition <- runPlp$model$validationDetails$attrition
-
}
-
+
# Add databases
developmentDatabaseId <- addDatabase(
- conn = conn,
+ conn = conn,
databaseSchemaSettings = databaseSchemaSettings,
databaseList = databaseList,
- databaseSchema = developmentDatabase,
+ databaseSchema = developmentDatabase,
databaseId = developmentDatabaseRefId
)
-
+
validationDatabaseId <- addDatabase(
- conn = conn,
+ conn = conn,
databaseSchemaSettings = databaseSchemaSettings,
databaseList = databaseList,
databaseSchema = validationDatabase,
databaseId = validationDatabaseRefId
)
-
-
+
+
# add model if the result contains it
- if(includesModel){
+ if (includesModel) {
insertModelInDatabase(
model = runPlp$model,
conn = conn,
@@ -583,79 +571,76 @@ addRunPlpToDatabase <- function(
modelSaveLocation = modelSaveLocation
)
}
-
+
# get validation settings
validationTarId <- addTar(
- conn = conn,
- resultSchema = databaseSchemaSettings$resultSchema,
+ conn = conn,
+ resultSchema = databaseSchemaSettings$resultSchema,
targetDialect = databaseSchemaSettings$targetDialect,
- startDay = populationSettings$riskWindowStart,
+ startDay = populationSettings$riskWindowStart,
startAnchor = populationSettings$startAnchor,
- endDay = populationSettings$riskWindowEnd,
- endAnchor = populationSettings$endAnchor,
+ endDay = populationSettings$riskWindowEnd,
+ endAnchor = populationSettings$endAnchor,
tablePrefix = databaseSchemaSettings$tablePrefix,
tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema
)
-
- validationTargetId <- addCohort(
- conn = conn,
- resultSchema = databaseSchemaSettings$cohortDefinitionSchema,
+
+ validationTargetId <- addCohort(
+ conn = conn,
+ resultSchema = databaseSchemaSettings$cohortDefinitionSchema,
targetDialect = databaseSchemaSettings$targetDialect,
- cohortDefinition = getCohortDef(cohortDefinitions,targetId),
+ cohortDefinition = getCohortDef(cohortDefinitions, targetId),
tablePrefix = databaseSchemaSettings$tablePrefixCohortDefinitionTables,
tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema
)
-
- validationOutcomeId <- addCohort(
- conn = conn,
- resultSchema = databaseSchemaSettings$cohortDefinitionSchema,
+
+ validationOutcomeId <- addCohort(
+ conn = conn,
+ resultSchema = databaseSchemaSettings$cohortDefinitionSchema,
targetDialect = databaseSchemaSettings$targetDialect,
- cohortDefinition = getCohortDef(cohortDefinitions,outcomeId),
+ cohortDefinition = getCohortDef(cohortDefinitions, outcomeId),
tablePrefix = databaseSchemaSettings$tablePrefixCohortDefinitionTables,
tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema
)
-
+
validationPopulationId <- addPopulationSetting(
- conn = conn,
- resultSchema = databaseSchemaSettings$resultSchema,
+ conn = conn,
+ resultSchema = databaseSchemaSettings$resultSchema,
targetDialect = databaseSchemaSettings$targetDialect,
- json = populationSettings,
+ json = populationSettings,
tablePrefix = databaseSchemaSettings$tablePrefix,
tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema
)
-
- validationPlpDataId <- addPlpDataSetting(
- conn = conn,
- resultSchema = databaseSchemaSettings$resultSchema,
+
+ validationPlpDataId <- addPlpDataSetting(
+ conn = conn,
+ resultSchema = databaseSchemaSettings$resultSchema,
targetDialect = databaseSchemaSettings$targetDialect,
- json = restrictPlpDataSettings,
+ json = restrictPlpDataSettings,
tablePrefix = databaseSchemaSettings$tablePrefix,
tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema
)
-
+
# Add performance
insertPerformanceInDatabase(
performanceEvaluation = runPlp$performanceEvaluation,
covariateSummary = runPlp$covariateSummary,
attrition = attrition,
- executionDateTime = format(runPlp$executionSummary$ExecutionDateTime, format="%Y-%m-%d"),
- plpVersion = runPlp$executionSummary$PackageVersion$packageVersion,
+ executionDateTime = format(runPlp$executionSummary$ExecutionDateTime, format = "%Y-%m-%d"),
+ plpVersion = runPlp$executionSummary$PackageVersion$packageVersion,
conn = conn,
databaseSchemaSettings = databaseSchemaSettings,
-
modelDesignId = modelDesignId,
developmentDatabaseId = developmentDatabaseId,
-
validationDatabaseId = validationDatabaseId,
validationTarId = validationTarId,
- validationPopulationId= validationPopulationId,
+ validationPopulationId = validationPopulationId,
validationPlpDataId = validationPlpDataId,
validationTargetId = validationTargetId,
validationOutcomeId = validationOutcomeId,
-
modelDevelopment = modelDevelopment
)
-
+
return(invisible(NULL))
}
@@ -665,134 +650,124 @@ addRunPlpToDatabase <- function(
# INSERT MODEL
####################
insertModelInDatabase <- function(
- model,
- conn,
- databaseSchemaSettings,
- databaseId,
- modelDesignId,
- modelSaveLocation
-){
-
+ model,
+ conn,
+ databaseSchemaSettings,
+ databaseId,
+ modelDesignId,
+ modelSaveLocation) {
# save the model to the file system
modelLocation <- file.path(
- modelSaveLocation,
- 'models',
- paste0('folder-', modelDesignId, '-', databaseId)
- )
- if(!dir.exists(modelLocation)){
- dir.create(modelLocation, recursive = T)
+ modelSaveLocation,
+ "models",
+ paste0("folder-", modelDesignId, "-", databaseId)
+ )
+ if (!dir.exists(modelLocation)) {
+ dir.create(modelLocation, recursive = TRUE)
}
# saving the whole model as the preprocessing was too large
# for the database
savePlpModel(
- plpModel = model,
+ plpModel = model,
dirPath = modelLocation
- )
-
- #saveModelPart(
- # model = model$model,
- # savetype = attr(model, 'saveType'),
- # dirPath = modelLocation
- #)
-
+ )
+
# need hyperParamSearch for shiny app but the other parts
# are too large to store into the database
trainDetails <- list(hyperParamSearch = model$trainDetails$hyperParamSearch)
-
- # create this function
- modelId <- addModel(
- conn = conn,
- resultSchema = databaseSchemaSettings$resultSchema,
- targetDialect = databaseSchemaSettings$targetDialect,
- analysisId = ifelse(
- is.null(model$trainDetails$analysisId),
- 'missing',
- model$trainDetails$analysisId
- ),
- modelDesignId = modelDesignId,
- databaseId = databaseId,
- modelType = model$trainDetails$modelName,
- plpModelFile = modelLocation, # save the model to a location and add location here
- trainDetails = as.character(ParallelLogger::convertSettingsToJson(trainDetails)),
- preprocessing = "",#as.character(ParallelLogger::convertSettingsToJson(model$preprocessing)),
-
- executionDateTime = format(model$trainDetails$trainingDate, format="%Y-%m-%d"),
- trainingTime = model$trainDetails$trainingTime,
- intercept = ifelse(is.list(model$model) & attr(model, 'saveType') != 'xgboost', model$model$coefficients$betas[1], 0), # using the param useIntercept?
-
- tablePrefix = databaseSchemaSettings$tablePrefix,
- tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema
- )
- ParallelLogger::logInfo(
- paste0('modelId: ', modelId,
- ' inserted for modelDesignId ', modelDesignId,
- 'and databaseId ', databaseId)
+
+ # create this function
+ modelId <- addModel(
+ conn = conn,
+ resultSchema = databaseSchemaSettings$resultSchema,
+ targetDialect = databaseSchemaSettings$targetDialect,
+ analysisId = ifelse(
+ is.null(model$trainDetails$analysisId),
+ "missing",
+ model$trainDetails$analysisId
+ ),
+ modelDesignId = modelDesignId,
+ databaseId = databaseId,
+ modelType = model$trainDetails$modelName,
+ plpModelFile = modelLocation, # save the model to a location and add location here
+ trainDetails = as.character(ParallelLogger::convertSettingsToJson(trainDetails)),
+ preprocessing = "",
+
+ executionDateTime = format(model$trainDetails$trainingDate, format = "%Y-%m-%d"),
+ trainingTime = model$trainDetails$trainingTime,
+ intercept = ifelse(is.list(model$model) & attr(model, "saveType") != "xgboost", model$model$coefficients$betas[1], 0), # using the param useIntercept?
+
+ tablePrefix = databaseSchemaSettings$tablePrefix,
+ tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema
+ )
+ ParallelLogger::logInfo(
+ paste0(
+ "modelId: ", modelId,
+ " inserted for modelDesignId ", modelDesignId,
+ " and databaseId ", databaseId
)
-
+ )
+
return(invisible(modelId))
}
addModel <- function(
- conn,
- resultSchema,
- targetDialect,
- tablePrefix,
- analysisId,
- modelDesignId,
- databaseId,
-
- modelType,
- plpModelFile,
- trainDetails,
- preprocessing,
-
- executionDateTime,
- trainingTime,
- intercept,
- tempEmulationSchema = getOption("sqlRenderTempEmulationSchema")
-){
-
- if(is.null(analysisId)){
- stop('analysisId is null')
+ conn,
+ resultSchema,
+ targetDialect,
+ tablePrefix,
+ analysisId,
+ modelDesignId,
+ databaseId,
+ modelType,
+ plpModelFile,
+ trainDetails,
+ preprocessing,
+ executionDateTime,
+ trainingTime,
+ intercept,
+ tempEmulationSchema = getOption("sqlRenderTempEmulationSchema")) {
+ if (is.null(analysisId)) {
+ stop("analysisId is null")
}
- if(is.null(modelDesignId)){
- stop('modelDesignId is null')
+ if (is.null(modelDesignId)) {
+ stop("modelDesignId is null")
}
- if(is.null(databaseId)){
- stop('databaseId is null')
+ if (is.null(databaseId)) {
+ stop("databaseId is null")
}
-
- if(is.null(plpModelFile)){
- stop('plpModelFile is null')
+
+ if (is.null(plpModelFile)) {
+ stop("plpModelFile is null")
}
- if(is.null(executionDateTime)){
- stop('executionDateTime is null')
+ if (is.null(executionDateTime)) {
+ stop("executionDateTime is null")
}
- if(is.null(intercept)){
- stop('intercept is null')
+ if (is.null(intercept)) {
+ stop("intercept is null")
}
-
+
# process json to make it ordered...
# TODO
-
+
result <- checkTable(
- conn = conn,
- resultSchema = resultSchema,
+ conn = conn,
+ resultSchema = resultSchema,
tablePrefix = tablePrefix,
- targetDialect = targetDialect,
- tableName = 'models',
+ targetDialect = targetDialect,
+ tableName = "models",
columnNames = c(
- 'model_design_id',
- 'database_id'
- ),
+ "model_design_id",
+ "database_id"
+ ),
values = c(
modelDesignId,
databaseId
),
tempEmulationSchema = tempEmulationSchema
)
-
- if(nrow(result)==0){
+
+ if (nrow(result) == 0) {
# model
sql <- "INSERT INTO @my_schema.@string_to_appendmodels(
analysis_id,
@@ -805,110 +780,100 @@ addModel <- function(
execution_date_time,
training_time,
intercept
- ) VALUES
- ('@analysis_id',
+ ) VALUES
+ ('@analysis_id',
@model_design_id,
- @database_id,
-
+ @database_id,
+
'@model_type',
- '@plp_model_file',
- '@train_details',
- '@preprocessing',
-
- '@execution_date_time',
- '@training_time',
+ '@plp_model_file',
+ '@train_details',
+ '@preprocessing',
+
+ '@execution_date_time',
+ '@training_time',
@intercept
)"
sql <- SqlRender::render(
- sql,
+ sql,
my_schema = resultSchema,
analysis_id = analysisId,
model_design_id = modelDesignId,
database_id = databaseId,
-
model_type = modelType,
plp_model_file = plpModelFile,
train_details = trainDetails,
preprocessing = preprocessing,
-
execution_date_time = executionDateTime,
training_time = trainingTime,
intercept = intercept,
-
string_to_append = tablePrefix
)
sql <- SqlRender::translate(
- sql,
+ sql,
targetDialect = targetDialect,
tempEmulationSchema = tempEmulationSchema
)
DatabaseConnector::executeSql(conn, sql)
-
- #getId of new
+
+ # getId of new
result <- checkTable(
- conn = conn,
- resultSchema = resultSchema,
+ conn = conn,
+ resultSchema = resultSchema,
tablePrefix = tablePrefix,
- targetDialect = targetDialect,
- tableName = 'models',
+ targetDialect = targetDialect,
+ tableName = "models",
columnNames = c(
- 'model_design_id',
- 'database_id'
- ),
+ "model_design_id",
+ "database_id"
+ ),
values = c(
modelDesignId,
databaseId
),
tempEmulationSchema = tempEmulationSchema
)
-
- }
-
+ }
+
return(result$modelId[1])
}
-#======================
+# ======================
# Helpers
-#======================
+# ======================
# get a vector with all the result table names
-getPlpResultTables <- function(){
+getPlpResultTables <- function() {
return(
c(
- "CALIBRATION_SUMMARY",
- "COVARIATE_SUMMARY",
+ "CALIBRATION_SUMMARY",
+ "COVARIATE_SUMMARY",
"DEMOGRAPHIC_SUMMARY",
- "EVALUATION_STATISTICS",
- "PREDICTION_DISTRIBUTION",
+ "EVALUATION_STATISTICS",
+ "PREDICTION_DISTRIBUTION",
"THRESHOLD_SUMMARY",
-
- "ATTRITION", #new
-
+ "ATTRITION", # new
+
"DIAGNOSTIC_SUMMARY",
"DIAGNOSTIC_PARTICIPANTS",
"DIAGNOSTIC_PREDICTORS",
"DIAGNOSTIC_OUTCOMES",
"DIAGNOSTIC_DESIGNS",
-
- "DIAGNOSTICS", #new
- "RECALIBRATIONS", #new
-
- "PERFORMANCES",
-
- "MODELS",
-
- "MODEL_DESIGNS",
-
- "MODEL_SETTINGS",
+ "DIAGNOSTICS", # new
+ "RECALIBRATIONS", # new
+
+ "PERFORMANCES",
+ "MODELS",
+ "MODEL_DESIGNS",
+ "MODEL_SETTINGS",
"COVARIATE_SETTINGS",
- "POPULATION_SETTINGS",
- "FEATURE_ENGINEERING_SETTINGS",
- "SPLIT_SETTINGS",
- "PLP_DATA_SETTINGS", #new
- "SAMPLE_SETTINGS",
- "TIDY_COVARIATES_SETTINGS", #new
- "TARS",
-
+ "POPULATION_SETTINGS",
+ "FEATURE_ENGINEERING_SETTINGS",
+ "SPLIT_SETTINGS",
+ "PLP_DATA_SETTINGS", # new
+ "SAMPLE_SETTINGS",
+ "TIDY_COVARIATES_SETTINGS", # new
+ "TARS",
"DATABASE_DETAILS",
"DATABASE_META_DATA",
"COHORT_DEFINITION",
@@ -917,199 +882,196 @@ getPlpResultTables <- function(){
)
}
-getResultLocations <- function(resultLocation){
+getResultLocations <- function(resultLocation) {
# get the model locations...
-
+
resultLocs <- dir(
- resultLocation,
- pattern = 'Analysis_',
- full.names = T
+ resultLocation,
+ pattern = "Analysis_",
+ full.names = TRUE
)
# automatically find Results folder, to handle both plpResult/ and validationResult/
- resultLocs <- file.path(resultLocs, dir(resultLocs, pattern='Result'))
-
-
- if(dir.exists(file.path(resultLocation, 'Validation'))){
- validationDatabases <- dir(file.path(resultLocation, 'Validation'))
-
+ resultLocs <- file.path(resultLocs, dir(resultLocs, pattern = "Result"))
+
+
+ if (dir.exists(file.path(resultLocation, "Validation"))) {
+ validationDatabases <- dir(file.path(resultLocation, "Validation"))
+
valLocs <- dir(
unlist(
lapply(
validationDatabases,
- function(x) dir(file.path(resultLocation, 'Validation', x),
- pattern = 'Analysis_',
- full.names = T
- )
+ function(x) {
+ dir(file.path(resultLocation, "Validation", x),
+ pattern = "Analysis_",
+ full.names = TRUE
+ )
+ }
)
),
- full.names = T
+ full.names = TRUE
)
-
+
resultLocs <- c(resultLocs, valLocs)
-
}
return(resultLocs)
-
}
deleteTables <- function(
- conn,
- databaseSchema,
- targetDialect,
- tempEmulationSchema,
- tableNames,
- tablePrefix
-){
-
- if(tablePrefix != ''){
- tableNames <- tolower(paste0(gsub('_','',gsub(' ','', tablePrefix)), '_', tableNames))
+ conn,
+ databaseSchema,
+ targetDialect,
+ tempEmulationSchema,
+ tableNames,
+ tablePrefix) {
+ if (tablePrefix != "") {
+ tableNames <- tolower(paste0(gsub("_", "", gsub(" ", "", tablePrefix)), "_", tableNames))
}
-
+
alltables <- tolower(DatabaseConnector::getTableNames(
- connection = conn,
+ connection = conn,
databaseSchema = databaseSchema
))
-
-
- for(tb in tableNames){
- if(tb %in% alltables){
-
- if(targetDialect != 'sqlite'){
- sql <- 'TRUNCATE TABLE @my_schema.@table'
+
+
+ for (tb in tableNames) {
+ if (tb %in% alltables) {
+ if (targetDialect != "sqlite") {
+ sql <- "TRUNCATE TABLE @my_schema.@table"
sql <- SqlRender::render(
- sql,
- my_schema = databaseSchema,
+ sql,
+ my_schema = databaseSchema,
table = tb
)
sql <- SqlRender::translate(
- sql,
- targetDialect = targetDialect,
+ sql,
+ targetDialect = targetDialect,
tempEmulationSchema = tempEmulationSchema
)
DatabaseConnector::executeSql(conn, sql)
- } else{
- sql <- 'DELETE FROM @my_schema.@table'
+ } else {
+ sql <- "DELETE FROM @my_schema.@table"
sql <- SqlRender::render(
- sql,
- my_schema = databaseSchema,
+ sql,
+ my_schema = databaseSchema,
table = tb
)
sql <- SqlRender::translate(
- sql,
+ sql,
targetDialect = targetDialect,
tempEmulationSchema = tempEmulationSchema
)
DatabaseConnector::executeSql(conn, sql)
}
- sql <- 'DROP TABLE @my_schema.@table'
- sql <- SqlRender::render(
- sql,
- my_schema = databaseSchema,
- table = tb
- )
- sql <- SqlRender::translate(
- sql, targetDialect = targetDialect,
- tempEmulationSchema = tempEmulationSchema
- )
- DatabaseConnector::executeSql(conn, sql)
+ sql <- "DROP TABLE @my_schema.@table"
+ sql <- SqlRender::render(
+ sql,
+ my_schema = databaseSchema,
+ table = tb
+ )
+ sql <- SqlRender::translate(
+ sql,
+ targetDialect = targetDialect,
+ tempEmulationSchema = tempEmulationSchema
+ )
+ DatabaseConnector::executeSql(conn, sql)
}
-
}
-
}
## Template Helpers
-enc <- function(x){
+enc <- function(x) {
return(paste0("'", x, "'"))
}
-cleanNum <- function(x){
- types <- unlist(lapply(1:ncol(x), function(i) class(x[,i])))
-
- ids <- which(types%in% c("numeric", "integer" ))
-
- for(id in ids){
- okVals <- is.finite(x[,id])
-
- if(sum(okVals)!=length(okVals)){
- x[!okVals,id] <- NA
+cleanNum <- function(x) {
+ types <- unlist(lapply(1:ncol(x), function(i) class(x[, i])))
+
+ ids <- which(types %in% c("numeric", "integer"))
+
+ for (id in ids) {
+ okVals <- is.finite(x[, id])
+
+ if (sum(okVals) != length(okVals)) {
+ x[!okVals, id] <- NA
}
-
}
- return(x)
+ return(x)
}
checkTable <- function(conn,
- resultSchema,
- tablePrefix = '',
+ resultSchema,
+ tablePrefix = "",
targetDialect,
tableName,
- columnNames,
+ columnNames,
values,
- tempEmulationSchema = getOption("sqlRenderTempEmulationSchema")
-){
-
- vals <- paste0(paste0(columnNames," = ", values), collapse = " and ")
-
+ tempEmulationSchema = getOption("sqlRenderTempEmulationSchema")) {
+ vals <- paste0(paste0(columnNames, " = ", values), collapse = " and ")
+
sql <- "SELECT * from @my_schema.@string_to_append@table where @input_vals;"
- sql <- SqlRender::render(sql,
- my_schema = resultSchema,
- table = tableName,
- input_vals = vals,
- string_to_append = tablePrefix)
- sql <- SqlRender::translate(sql, targetDialect = targetDialect,
- tempEmulationSchema = tempEmulationSchema)
- result <- DatabaseConnector::querySql(conn, sql, snakeCaseToCamelCase = T)
-
+ sql <- SqlRender::render(sql,
+ my_schema = resultSchema,
+ table = tableName,
+ input_vals = vals,
+ string_to_append = tablePrefix
+ )
+ sql <- SqlRender::translate(sql,
+ targetDialect = targetDialect,
+ tempEmulationSchema = tempEmulationSchema
+ )
+ result <- DatabaseConnector::querySql(conn, sql, snakeCaseToCamelCase = TRUE)
+
return(result)
}
checkJson <- function(conn,
- resultSchema,
- tablePrefix = '',
+ resultSchema,
+ tablePrefix = "",
targetDialect,
tableName,
jsonColumnName,
id,
json,
- tempEmulationSchema = getOption("sqlRenderTempEmulationSchema")
-){
-
+ tempEmulationSchema = getOption("sqlRenderTempEmulationSchema")) {
sql <- "SELECT * from @my_schema.@string_to_append@table;"
- sql <- SqlRender::render(sql,
- my_schema = resultSchema,
- table = tableName,
- string_to_append = tablePrefix)
- sql <- SqlRender::translate(sql, targetDialect = targetDialect,
- tempEmulationSchema = tempEmulationSchema)
- result <- DatabaseConnector::querySql(conn, sql, snakeCaseToCamelCase = T)
-
+ sql <- SqlRender::render(sql,
+ my_schema = resultSchema,
+ table = tableName,
+ string_to_append = tablePrefix
+ )
+ sql <- SqlRender::translate(sql,
+ targetDialect = targetDialect,
+ tempEmulationSchema = tempEmulationSchema
+ )
+ result <- DatabaseConnector::querySql(conn, sql, snakeCaseToCamelCase = TRUE)
+
resultId <- NULL
- if(nrow(result)>0){
- colId <- result[,jsonColumnName] == json
- if(sum(colId)>0){
- resultId <- result[colId,id][1]
+ if (nrow(result) > 0) {
+ colId <- result[, jsonColumnName] == json
+ if (sum(colId) > 0) {
+ resultId <- result[colId, id][1]
}
}
-
+
return(resultId)
}
-getCohortDef <- function(cohortDefinitions, cohortId){
- if(!is.null(cohortDefinitions)){
- if(sum(cohortDefinitions$cohortId == cohortId) > 0){
+getCohortDef <- function(cohortDefinitions, cohortId) {
+ if (!is.null(cohortDefinitions)) {
+ if (sum(cohortDefinitions$cohortId == cohortId) > 0) {
return(cohortDefinitions[cohortDefinitions$cohortId == cohortId, ])
}
}
return(
data.frame(
- cohortId = cohortId,
- cohortName = paste0('Cohort: ', cohortId),
- json = '{}'
+ cohortId = cohortId,
+ cohortName = paste0("Cohort: ", cohortId),
+ json = "{}"
)
)
}
@@ -1117,254 +1079,250 @@ getCohortDef <- function(cohortDefinitions, cohortId){
# adds json from package unless json is specified
addCohort <- function(
- conn,
- resultSchema,
- targetDialect,
- tablePrefix = '',
- cohortDefinition, # this is the R data.frame of the cohortDefinition
- tempEmulationSchema = getOption("sqlRenderTempEmulationSchema")
-){
-
-
- # make sure the json has been converted
+ conn,
+ resultSchema,
+ targetDialect,
+ tablePrefix = "",
+ cohortDefinition, # this is the R data.frame of the cohortDefinition
+ tempEmulationSchema = getOption("sqlRenderTempEmulationSchema")) {
+ # make sure the json has been converted
json <- cohortDefinition$json
- if(!inherits(x = json , what = 'character')){
- ParallelLogger::logInfo('converting json to character')
+ if (!inherits(x = json, what = "character")) {
+ ParallelLogger::logInfo("converting json to character")
json <- as.character(json) # now convert to character
}
-
+
# reduce the size to save
- if(!targetDialect %in% c('sqlite', 'postgres')){
- json <- substr(json, 1, 4000) # TESTING - FIX THIS [TODO]
+ if (!targetDialect %in% c("sqlite", "postgres")) {
+ json <- substr(json, 1, 4000) # TESTING - FIX THIS [TODO]
}
-
- #check whether cohort already in COHORT_DEFINITION table:
- result <- checkTable(conn = conn,
- resultSchema = resultSchema,
- tablePrefix = tablePrefix,
- targetDialect = targetDialect,
- tableName = 'cohort_definition',
- columnNames = c('cohort_name'),
- values = c(paste0("'",gsub('\'', '', cohortDefinition$cohortName),"'")),
- tempEmulationSchema = tempEmulationSchema
+
+ # check whether cohort already in COHORT_DEFINITION table:
+ result <- checkTable(
+ conn = conn,
+ resultSchema = resultSchema,
+ tablePrefix = tablePrefix,
+ targetDialect = targetDialect,
+ tableName = "cohort_definition",
+ columnNames = c("cohort_name"),
+ values = c(paste0("'", gsub("'", "", cohortDefinition$cohortName), "'")),
+ tempEmulationSchema = tempEmulationSchema
)
-
- addNew <- F
- if(nrow(result)>0){
+
+ addNew <- FALSE
+ if (nrow(result) > 0) {
addNew <- json %in% result$json
- ParallelLogger::logInfo(paste0('json in jsons:', addNew))
+ ParallelLogger::logInfo(paste0("json in jsons:", addNew))
}
-
- if(addNew){
+
+ if (addNew) {
cohortDefinitionId <- result$cohortDefinitionId[result$json %in% json]
- ParallelLogger::logInfo(paste0('Cohort ',gsub('\'', '', cohortDefinition$cohortName),' exists in cohort_definition with cohort id', result$cohortDefinitionId[result$json %in% json]))
- } else{
- ParallelLogger::logInfo(paste0('Adding cohort ',gsub('\'', '', cohortDefinition$cohortName)))
-
+ ParallelLogger::logInfo(paste0("Cohort ", gsub("'", "", cohortDefinition$cohortName), " exists in cohort_definition with cohort id ", result$cohortDefinitionId[result$json %in% json]))
+ } else {
+ ParallelLogger::logInfo(paste0("Adding cohort ", gsub("'", "", cohortDefinition$cohortName)))
+
data <- data.frame(
- cohortName = gsub('\'', '', cohortDefinition$cohortName),
+ cohortName = gsub("'", "", cohortDefinition$cohortName),
cohortDefinitionId = cohortDefinition$cohortId,
json = json
)
DatabaseConnector::insertTable(
- connection = conn,
- databaseSchema = resultSchema,
- tableName = paste0(tablePrefix, 'cohort_definition'),
+ connection = conn,
+ databaseSchema = resultSchema,
+ tableName = paste0(tablePrefix, "cohort_definition"),
data = data,
- dropTableIfExists = F,
- createTable = F,
- tempTable = F,
- progressBar = T,
- camelCaseToSnakeCase = T,
+ dropTableIfExists = FALSE,
+ createTable = FALSE,
+ tempTable = FALSE,
+ progressBar = TRUE,
+ camelCaseToSnakeCase = TRUE,
tempEmulationSchema = tempEmulationSchema
)
-
+
# now check and get id
result <- checkTable(
- conn = conn,
- resultSchema = resultSchema,
+ conn = conn,
+ resultSchema = resultSchema,
tablePrefix = tablePrefix,
- targetDialect = targetDialect,
- tableName = 'cohort_definition',
- columnNames = c('cohort_name', 'cohort_definition_id'),
- values = c(paste0("'",gsub('\'', '', cohortDefinition$cohortName),"'"), cohortDefinition$cohortId),
+ targetDialect = targetDialect,
+ tableName = "cohort_definition",
+ columnNames = c("cohort_name", "cohort_definition_id"),
+ values = c(paste0("'", gsub("'", "", cohortDefinition$cohortName), "'"), cohortDefinition$cohortId),
tempEmulationSchema = tempEmulationSchema
)
-
+
jsonInd <- result$json %in% json
cohortDefinitionId <- result$cohortDefinitionId[jsonInd]
}
-
+
# now add to cohorts table
- result <- checkTable(conn = conn,
- resultSchema = resultSchema,
- tablePrefix = tablePrefix,
- targetDialect = targetDialect,
- tableName = 'cohorts',
- columnNames = c('cohort_definition_id','cohort_name'),
- values = c(cohortDefinitionId, paste0("'",gsub('\'', '', cohortDefinition$cohortName),"'")),
- tempEmulationSchema = tempEmulationSchema
+ result <- checkTable(
+ conn = conn,
+ resultSchema = resultSchema,
+ tablePrefix = tablePrefix,
+ targetDialect = targetDialect,
+ tableName = "cohorts",
+ columnNames = c("cohort_definition_id", "cohort_name"),
+ values = c(cohortDefinitionId, paste0("'", gsub("'", "", cohortDefinition$cohortName), "'")),
+ tempEmulationSchema = tempEmulationSchema
)
-
- if(nrow(result)>0){
- ParallelLogger::logInfo(paste0('Cohort ',gsub('\'', '', cohortDefinition$cohortName),' exists in cohorts with cohort id', result$cohortId))
- } else{
- ParallelLogger::logInfo(paste0('Adding cohort ',gsub('\'', '', cohortDefinition$cohortName)))
-
+
+ if (nrow(result) > 0) {
+ ParallelLogger::logInfo(paste0("Cohort ", gsub("'", "", cohortDefinition$cohortName), " exists in cohorts with cohort id ", result$cohortId))
+ } else {
+ ParallelLogger::logInfo(paste0("Adding cohort ", gsub("'", "", cohortDefinition$cohortName)))
+
data <- data.frame(
cohortDefinitionId = cohortDefinitionId,
- cohortName = gsub('\'', '', cohortDefinition$cohortName)
+ cohortName = gsub("'", "", cohortDefinition$cohortName)
)
DatabaseConnector::insertTable(
- connection = conn,
- databaseSchema = resultSchema,
- tableName = paste0(tablePrefix, 'cohorts'),
+ connection = conn,
+ databaseSchema = resultSchema,
+ tableName = paste0(tablePrefix, "cohorts"),
data = data,
- dropTableIfExists = F,
- createTable = F,
- tempTable = F,
- progressBar = T,
- camelCaseToSnakeCase = T,
+ dropTableIfExists = FALSE,
+ createTable = FALSE,
+ tempTable = FALSE,
+ progressBar = TRUE,
+ camelCaseToSnakeCase = TRUE,
tempEmulationSchema = tempEmulationSchema
)
-
+
# now check and get id
- result <- checkTable(conn = conn,
- resultSchema = resultSchema,
- tablePrefix = tablePrefix,
- targetDialect = targetDialect,
- tableName = 'cohorts',
- columnNames = c('cohort_definition_id','cohort_name'),
- values = c(cohortDefinitionId, paste0("'",gsub('\'', '', cohortDefinition$cohortName),"'")),
- tempEmulationSchema = tempEmulationSchema
+ result <- checkTable(
+ conn = conn,
+ resultSchema = resultSchema,
+ tablePrefix = tablePrefix,
+ targetDialect = targetDialect,
+ tableName = "cohorts",
+ columnNames = c("cohort_definition_id", "cohort_name"),
+ values = c(cohortDefinitionId, paste0("'", gsub("'", "", cohortDefinition$cohortName), "'")),
+ tempEmulationSchema = tempEmulationSchema
)
}
-
+
return(result$cohortId[1])
}
addDatabase <- function(
- conn,
- databaseSchemaSettings,
- databaseList = NULL, # list with the database details
- databaseId = NULL, # the database id
- databaseSchema # the database schema
-){
-
- if(is.null(databaseId)){
+ conn,
+ databaseSchemaSettings,
+ databaseList = NULL, # list with the database details
+ databaseId = NULL, # the database id
+ databaseSchema # the database schema
+ ) {
+ if (is.null(databaseId)) {
databaseId <- removeInvalidString(databaseSchema)
}
-
- # get the database tables for the databaseId
- if(is.null(databaseList)){
+
+ # get the database tables for the databaseId
+ if (is.null(databaseList)) {
databaseDataFrames <- createDatabaseList(cdmDatabaseSchemas = databaseSchema, databaseRefIds = databaseId)[[1]]
- } else{
- if(databaseId %in% names(databaseList)){
+ } else {
+ if (databaseId %in% names(databaseList)) {
databaseDataFrames <- databaseList[[as.character(databaseId)]]
- } else{
- ParallelLogger::logInfo('database ID not found in databaseList so added new entry')
+ } else {
+ ParallelLogger::logInfo("database ID not found in databaseList so adding a new entry")
databaseDataFrames <- createDatabaseList(cdmDatabaseSchemas = databaseSchema, databaseRefIds = databaseId)[[1]]
}
}
-
-
+
+
# check the database_meta_data
- result <- checkTable(conn = conn,
- resultSchema = databaseSchemaSettings$resultSchema,
- tablePrefix = databaseSchemaSettings$tablePrefix,
- targetDialect = databaseSchemaSettings$targetDialect,
- tableName = 'database_meta_data',
- columnNames = c('database_id'),
- values = c(paste0("'",databaseDataFrames$databaseMetaData$databaseId,"'")),
- tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema
+ result <- checkTable(
+ conn = conn,
+ resultSchema = databaseSchemaSettings$resultSchema,
+ tablePrefix = databaseSchemaSettings$tablePrefix,
+ targetDialect = databaseSchemaSettings$targetDialect,
+ tableName = "database_meta_data",
+ columnNames = c("database_id"),
+ values = c(paste0("'", databaseDataFrames$databaseMetaData$databaseId, "'")),
+ tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema
)
-
- if(nrow(result)>0){
- ParallelLogger::logInfo(paste0('Database meta data ', databaseDataFrames$database_meta_data$databaseId ,' already exists'))
+
+ if (nrow(result) > 0) {
+ ParallelLogger::logInfo(paste0("Database meta data ", databaseDataFrames$databaseMetaData$databaseId, " already exists"))
} else {
-
sql <- "INSERT INTO @my_schema.@string_to_appenddatabase_meta_data(
database_id,
cdm_source_name,
cdm_source_abbreviation
- )
+ )
VALUES ('@database_id','@cdm_source_name', '@cdm_source_abbreviation');"
sql <- SqlRender::render(
- sql,
+ sql,
my_schema = databaseSchemaSettings$resultSchema,
- database_id = databaseDataFrames$databaseMetaData$databaseId,
+ database_id = databaseDataFrames$databaseMetaData$databaseId,
cdm_source_name = databaseDataFrames$databaseMetaData$cdmSourceName,
cdm_source_abbreviation = databaseDataFrames$databaseMetaData$cdmSourceAbbreviation,
string_to_append = databaseSchemaSettings$tablePrefix
)
- sql <- SqlRender::translate(sql, targetDialect = databaseSchemaSettings$targetDialect,
- tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema)
+ sql <- SqlRender::translate(sql,
+ targetDialect = databaseSchemaSettings$targetDialect,
+ tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema
+ )
DatabaseConnector::executeSql(conn, sql)
-
- result <- checkTable(conn = conn,
- resultSchema = databaseSchemaSettings$resultSchema,
- tablePrefix = databaseSchemaSettings$tablePrefix,
- targetDialect = databaseSchemaSettings$targetDialect,
- tableName = 'database_meta_data',
- columnNames = c('database_id', 'cdm_source_name',
- 'cdm_source_abbreviation'),
- values = c(paste0("'",databaseDataFrames$databaseMetaData$databaseId,"'"),
- paste0("'",databaseDataFrames$databaseMetaData$cdmSourceName,"'"),
- paste0("'",databaseDataFrames$databaseMetaData$cdmSourceAbbreviation,"'")),
- tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema
+
+ result <- checkTable(
+ conn = conn,
+ resultSchema = databaseSchemaSettings$resultSchema,
+ tablePrefix = databaseSchemaSettings$tablePrefix,
+ targetDialect = databaseSchemaSettings$targetDialect,
+ tableName = "database_meta_data",
+ columnNames = c(
+ "database_id", "cdm_source_name",
+ "cdm_source_abbreviation"
+ ),
+ values = c(
+ paste0("'", databaseDataFrames$databaseMetaData$databaseId, "'"),
+ paste0("'", databaseDataFrames$databaseMetaData$cdmSourceName, "'"),
+ paste0("'", databaseDataFrames$databaseMetaData$cdmSourceAbbreviation, "'")
+ ),
+ tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema
)
-
}
-
- result <- checkTable(conn = conn,
- resultSchema = databaseSchemaSettings$resultSchema,
- tablePrefix = databaseSchemaSettings$tablePrefix,
- targetDialect = databaseSchemaSettings$targetDialect,
- tableName = 'database_details',
- columnNames = c('database_meta_data_id'),
- values = c(paste0("'",databaseDataFrames$databaseDetails$databaseMetaDataId,"'")
- ),
- tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema
+
+ result <- checkTable(
+ conn = conn,
+ resultSchema = databaseSchemaSettings$resultSchema,
+ tablePrefix = databaseSchemaSettings$tablePrefix,
+ targetDialect = databaseSchemaSettings$targetDialect,
+ tableName = "database_details",
+ columnNames = c("database_meta_data_id"),
+ values = c(paste0("'", databaseDataFrames$databaseDetails$databaseMetaDataId, "'")),
+ tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema
)
-
- if(nrow(result)>0){
- ParallelLogger::logInfo(paste0('Database', result$databaseId ,' already exists'))
+
+ if (nrow(result) > 0) {
+ ParallelLogger::logInfo(paste0("Database", result$databaseId, " already exists"))
} else {
-
- sql <- "INSERT INTO @my_schema.@string_to_appenddatabase_details(database_meta_data_id)
+ sql <- "INSERT INTO @my_schema.@string_to_appenddatabase_details(database_meta_data_id)
VALUES ('@database_meta_data_id');"
- sql <- SqlRender::render(sql,
- my_schema = databaseSchemaSettings$resultSchema,
- database_meta_data_id = databaseDataFrames$databaseDetails$databaseMetaDataId,
- string_to_append = databaseSchemaSettings$tablePrefix)
- sql <- SqlRender::translate(sql, targetDialect = databaseSchemaSettings$targetDialect,
- tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema)
+ sql <- SqlRender::render(sql,
+ my_schema = databaseSchemaSettings$resultSchema,
+ database_meta_data_id = databaseDataFrames$databaseDetails$databaseMetaDataId,
+ string_to_append = databaseSchemaSettings$tablePrefix
+ )
+ sql <- SqlRender::translate(sql,
+ targetDialect = databaseSchemaSettings$targetDialect,
+ tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema
+ )
DatabaseConnector::executeSql(conn, sql)
-
- result <- checkTable(conn = conn,
- resultSchema = databaseSchemaSettings$resultSchema,
- tablePrefix = databaseSchemaSettings$tablePrefix,
- targetDialect = databaseSchemaSettings$targetDialect,
- tableName = 'database_details',
- columnNames = c('database_meta_data_id'),
- values = c(paste0("'",databaseDataFrames$databaseDetails$databaseMetaDataId,"'")
- ),
- tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema
+
+ result <- checkTable(
+ conn = conn,
+ resultSchema = databaseSchemaSettings$resultSchema,
+ tablePrefix = databaseSchemaSettings$tablePrefix,
+ targetDialect = databaseSchemaSettings$targetDialect,
+ tableName = "database_details",
+ columnNames = c("database_meta_data_id"),
+ values = c(paste0("'", databaseDataFrames$databaseDetails$databaseMetaDataId, "'")),
+ tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema
)
-
}
-
+
return(result$databaseId[1])
-
}
-
-
-
-
-
-
-
-
-
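Note on the refactor above: every add* helper in these upload files shares the same check-then-insert idiom, which is what makes repeated uploads idempotent. A minimal sketch of the idiom, with lookupRow and insertRow as hypothetical stand-ins for the checkTable/executeSql pairs in the real code:

# Sketch only: query first, insert only when the row is absent, then
# re-query so the id generated on insert is returned to the caller.
upsertAndGetId <- function(conn, lookupRow, insertRow) {
  result <- lookupRow(conn)
  if (nrow(result) == 0) {
    insertRow(conn)
    result <- lookupRow(conn)
  }
  result[[1]][1]
}

Re-running an upload therefore finds the existing row and returns its id instead of inserting a duplicate.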
diff --git a/R/uploadToDatabaseDiagnostics.R b/R/uploadToDatabaseDiagnostics.R
index 0d10506e0..12896a933 100644
--- a/R/uploadToDatabaseDiagnostics.R
+++ b/R/uploadToDatabaseDiagnostics.R
@@ -8,32 +8,30 @@
#' @param connectionDetails A connection details created by using the
#' function \code{createConnectionDetails} in the
#' \code{DatabaseConnector} package.
-#' @param databaseSchemaSettings A object created by \code{createDatabaseSchemaSettings} with all the settings specifying the result tables
+#' @param databaseSchemaSettings An object created by \code{createDatabaseSchemaSettings} with all the settings specifying the result tables
#' @param cohortDefinitions (list) A list of cohortDefinitions (each list must contain: name, id)
#' @param databaseList (Optional) ...
#' @param resultLocation The location of the diagnostic results
-#'
+#'
#' @return
#' Returns NULL but uploads multiple diagnosePlp results into the database schema specified in databaseSchemaSettings
-#'
+#'
#' @export
addMultipleDiagnosePlpToDatabase <- function(
- connectionDetails,
- databaseSchemaSettings,
- cohortDefinitions,
- databaseList = NULL,
- resultLocation
-){
-
- diagnosisFiles <- file.path(resultLocation, dir(resultLocation, pattern = 'Analysis_'), 'diagnosePlp.rds')
-
- if(length(diagnosisFiles) == 0){
- ParallelLogger::logInfo('No diagnostic results found')
+ connectionDetails,
+ databaseSchemaSettings,
+ cohortDefinitions,
+ databaseList = NULL,
+ resultLocation) {
+ diagnosisFiles <- file.path(resultLocation, dir(resultLocation, pattern = "Analysis_"), "diagnosePlp.rds")
+
+ if (length(diagnosisFiles) == 0) {
+ ParallelLogger::logInfo("No diagnostic results found")
return(NULL)
}
-
- for(diagnosisFile in diagnosisFiles){
- if(file.exists(diagnosisFile)){
+
+ for (diagnosisFile in diagnosisFiles) {
+ if (file.exists(diagnosisFile)) {
diagnosePlp <- readRDS(diagnosisFile)
addDiagnosePlpToDatabase(
diagnosePlp = diagnosePlp,
@@ -54,46 +52,44 @@ addMultipleDiagnosePlpToDatabase <- function(
#' @details
#' This function can be used to upload a diagnostic result into a database
#'
-#' @param diagnosePlp An object of class \code{diagnosePlp}
+#' @param diagnosePlp An object of class \code{diagnosePlp}
#' @param connectionDetails A connection details created by using the
#' function \code{createConnectionDetails} in the
#' \code{DatabaseConnector} package.
-#' @param databaseSchemaSettings A object created by \code{createDatabaseSchemaSettings} with all the settings specifying the result tables
+#' @param databaseSchemaSettings An object created by \code{createDatabaseSchemaSettings} with all the settings specifying the result tables
#' @param cohortDefinitions A set of one or more cohorts extracted using ROhdsiWebApi::exportCohortDefinitionSet()
#' @param databaseList (Optional) If you wish to overwrite the settings in the plp object use \code{createDatabaseList} to specify the databases
#' @param overWriteIfExists (default: TRUE) Whether to delete existing results and overwrite them
-#'
+#'
#' @return
#' Returns NULL but uploads the diagnostic into the database schema specified in databaseSchemaSettings
-#'
+#'
#' @export
addDiagnosePlpToDatabase <- function(
- diagnosePlp,
- connectionDetails,
- databaseSchemaSettings,
- cohortDefinitions,
- databaseList = NULL,
- overWriteIfExists = T
-){
-
+ diagnosePlp,
+ connectionDetails,
+ databaseSchemaSettings,
+ cohortDefinitions,
+ databaseList = NULL,
+ overWriteIfExists = TRUE) {
conn <- DatabaseConnector::connect(connectionDetails = connectionDetails)
on.exit(DatabaseConnector::disconnect(conn))
-
+
modelDesignId <- insertModelDesignInDatabase(
- object = diagnosePlp$modelDesign,
- conn = conn,
+ object = diagnosePlp$modelDesign,
+ conn = conn,
databaseSchemaSettings = databaseSchemaSettings,
- cohortDefinitions = cohortDefinitions
+ cohortDefinitions = cohortDefinitions
)
-
+
databaseId <- addDatabase(
- conn = conn,
+ conn = conn,
databaseSchemaSettings = databaseSchemaSettings,
databaseList = databaseList,
- databaseSchema = diagnosePlp$databaseSchema,
- databaseId = diagnosePlp$databaseId
+ databaseSchema = diagnosePlp$databaseSchema,
+ databaseId = diagnosePlp$databaseId
)
-
+
diagnoseId <- insertDiagnosisToDatabase(
diagnostics = diagnosePlp,
conn = conn,
@@ -102,268 +98,272 @@ addDiagnosePlpToDatabase <- function(
databaseId = databaseId,
overWriteIfExists = overWriteIfExists
)
-
+
return(invisible(diagnoseId))
-}
-
-
+}
+
+
insertDiagnosisToDatabase <- function(
- diagnostics,
- conn,
- databaseSchemaSettings,
- modelDesignId,
- databaseId,
- overWriteIfExists = T
-){
-
- diagnosticId <- addDiagnostic(
- conn = conn,
- resultSchema = databaseSchemaSettings$resultSchema,
+ diagnostics,
+ conn,
+ databaseSchemaSettings,
+ modelDesignId,
+ databaseId,
+ overWriteIfExists = TRUE) {
+ diagnosticId <- addDiagnostic(
+ conn = conn,
+ resultSchema = databaseSchemaSettings$resultSchema,
targetDialect = databaseSchemaSettings$targetDialect,
-
modelDesignId = modelDesignId,
databaseId = databaseId,
-
tablePrefix = databaseSchemaSettings$tablePrefix,
tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema
)
- ParallelLogger::logInfo(paste0('diagnosticId: ', diagnosticId))
-
+ ParallelLogger::logInfo(paste0("diagnosticId: ", diagnosticId))
+
# now add the five tables
-
- ParallelLogger::logInfo('Adding DiagnosticSummary')
- tryCatch({
- addResultTable(
- conn = conn,
- resultSchema = databaseSchemaSettings$resultSchema,
- targetDialect = databaseSchemaSettings$targetDialect,
-
- tableName = 'diagnostic_summary',
- resultIdName = 'diagnosticId',
- resultId = diagnosticId,
- object = diagnostics$summary,
-
- tablePrefix = databaseSchemaSettings$tablePrefix,
- tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema,
- overWriteIfExists = overWriteIfExists
- )},
- error = function(e){ParallelLogger::logError(e);}
+
+ ParallelLogger::logInfo("Adding DiagnosticSummary")
+ tryCatch(
+ {
+ addResultTable(
+ conn = conn,
+ resultSchema = databaseSchemaSettings$resultSchema,
+ targetDialect = databaseSchemaSettings$targetDialect,
+ tableName = "diagnostic_summary",
+ resultIdName = "diagnosticId",
+ resultId = diagnosticId,
+ object = diagnostics$summary,
+ tablePrefix = databaseSchemaSettings$tablePrefix,
+ tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema,
+ overWriteIfExists = overWriteIfExists
+ )
+ },
+ error = function(e) {
+ ParallelLogger::logError(e)
+ }
)
-
- ParallelLogger::logInfo('Adding DiagnosticParticipants')
- tryCatch({
- addResultTable(
- conn = conn,
- resultSchema = databaseSchemaSettings$resultSchema,
- targetDialect = databaseSchemaSettings$targetDialect,
-
- tableName = 'diagnostic_participants',
- resultIdName = 'diagnosticId',
- resultId = diagnosticId,
- object = diagnostics$participants,
-
- tablePrefix = databaseSchemaSettings$tablePrefix,
- tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema,
- overWriteIfExists = overWriteIfExists
- )},
- error = function(e){ParallelLogger::logError(e);}
+
+ ParallelLogger::logInfo("Adding DiagnosticParticipants")
+ tryCatch(
+ {
+ addResultTable(
+ conn = conn,
+ resultSchema = databaseSchemaSettings$resultSchema,
+ targetDialect = databaseSchemaSettings$targetDialect,
+ tableName = "diagnostic_participants",
+ resultIdName = "diagnosticId",
+ resultId = diagnosticId,
+ object = diagnostics$participants,
+ tablePrefix = databaseSchemaSettings$tablePrefix,
+ tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema,
+ overWriteIfExists = overWriteIfExists
+ )
+ },
+ error = function(e) {
+ ParallelLogger::logError(e)
+ }
)
-
- ParallelLogger::logInfo('Adding DiagnosticPredictors')
- tryCatch({
- addResultTable(
- conn = conn,
- resultSchema = databaseSchemaSettings$resultSchema,
- targetDialect = databaseSchemaSettings$targetDialect,
-
- tableName = 'diagnostic_predictors',
- resultIdName = 'diagnosticId',
- resultId = diagnosticId,
- object = diagnostics$predictors,
-
- tablePrefix = databaseSchemaSettings$tablePrefix,
- tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema,
- overWriteIfExists = overWriteIfExists
- )},
- error = function(e){ParallelLogger::logError(e);}
+
+ ParallelLogger::logInfo("Adding DiagnosticPredictors")
+ tryCatch(
+ {
+ addResultTable(
+ conn = conn,
+ resultSchema = databaseSchemaSettings$resultSchema,
+ targetDialect = databaseSchemaSettings$targetDialect,
+ tableName = "diagnostic_predictors",
+ resultIdName = "diagnosticId",
+ resultId = diagnosticId,
+ object = diagnostics$predictors,
+ tablePrefix = databaseSchemaSettings$tablePrefix,
+ tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema,
+ overWriteIfExists = overWriteIfExists
+ )
+ },
+ error = function(e) {
+ ParallelLogger::logError(e)
+ }
)
-
- ParallelLogger::logInfo('Adding DiagnosticOutcomes')
- tryCatch({
- addResultTable(
- conn = conn,
- resultSchema = databaseSchemaSettings$resultSchema,
- targetDialect = databaseSchemaSettings$targetDialect,
-
- tableName = 'diagnostic_outcomes',
- resultIdName = 'diagnosticId',
- resultId = diagnosticId,
- object = diagnostics$outcomes,
-
- tablePrefix = databaseSchemaSettings$tablePrefix,
- tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema,
- overWriteIfExists = overWriteIfExists
- )},
- error = function(e){ParallelLogger::logError(e);}
+
+ ParallelLogger::logInfo("Adding DiagnosticOutcomes")
+ tryCatch(
+ {
+ addResultTable(
+ conn = conn,
+ resultSchema = databaseSchemaSettings$resultSchema,
+ targetDialect = databaseSchemaSettings$targetDialect,
+ tableName = "diagnostic_outcomes",
+ resultIdName = "diagnosticId",
+ resultId = diagnosticId,
+ object = diagnostics$outcomes,
+ tablePrefix = databaseSchemaSettings$tablePrefix,
+ tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema,
+ overWriteIfExists = overWriteIfExists
+ )
+ },
+ error = function(e) {
+ ParallelLogger::logError(e)
+ }
)
-
- ParallelLogger::logInfo('Adding DiagnosticDesigns')
- tryCatch({
- addResultTable(
- conn = conn,
- resultSchema = databaseSchemaSettings$resultSchema,
- targetDialect = databaseSchemaSettings$targetDialect,
-
- tableName = 'diagnostic_designs',
- resultIdName = 'diagnosticId',
- resultId = diagnosticId,
- object = diagnostics$designs,
-
- tablePrefix = databaseSchemaSettings$tablePrefix,
- tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema,
- overWriteIfExists = overWriteIfExists
- )},
- error = function(e){ParallelLogger::logError(e);}
+
+ ParallelLogger::logInfo("Adding DiagnosticDesigns")
+ tryCatch(
+ {
+ addResultTable(
+ conn = conn,
+ resultSchema = databaseSchemaSettings$resultSchema,
+ targetDialect = databaseSchemaSettings$targetDialect,
+ tableName = "diagnostic_designs",
+ resultIdName = "diagnosticId",
+ resultId = diagnosticId,
+ object = diagnostics$designs,
+ tablePrefix = databaseSchemaSettings$tablePrefix,
+ tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema,
+ overWriteIfExists = overWriteIfExists
+ )
+ },
+ error = function(e) {
+ ParallelLogger::logError(e)
+ }
)
-
+
return(invisible(diagnosticId))
}
addDiagnostic <- function(
- conn,
- resultSchema,
- targetDialect,
-
- modelDesignId,
- databaseId,
-
- tablePrefix,
- tempEmulationSchema
-){
-
- result <- checkTable(conn = conn,
- resultSchema = resultSchema,
- tablePrefix = tablePrefix,
- targetDialect = targetDialect,
- tableName = 'diagnostics',
- columnNames = c(
- 'model_design_id',
- 'database_id'
- ),
- values = c(
- modelDesignId,
- databaseId
- ),
- tempEmulationSchema = tempEmulationSchema
+ conn,
+ resultSchema,
+ targetDialect,
+ modelDesignId,
+ databaseId,
+ tablePrefix,
+ tempEmulationSchema) {
+ result <- checkTable(
+ conn = conn,
+ resultSchema = resultSchema,
+ tablePrefix = tablePrefix,
+ targetDialect = targetDialect,
+ tableName = "diagnostics",
+ columnNames = c(
+ "model_design_id",
+ "database_id"
+ ),
+ values = c(
+ modelDesignId,
+ databaseId
+ ),
+ tempEmulationSchema = tempEmulationSchema
)
-
- if(nrow(result)==0){
+
+ if (nrow(result) == 0) {
# model
sql <- "INSERT INTO @my_schema.@string_to_appenddiagnostics (
model_design_id,
database_id
- )
+ )
VALUES (
- @model_design_id,
+ @model_design_id,
@database_id
)"
- sql <- SqlRender::render(sql,
- my_schema = resultSchema,
- model_design_id = modelDesignId,
- database_id = databaseId,
- string_to_append = tablePrefix)
- sql <- SqlRender::translate(sql, targetDialect = targetDialect,
- tempEmulationSchema = tempEmulationSchema)
+ sql <- SqlRender::render(sql,
+ my_schema = resultSchema,
+ model_design_id = modelDesignId,
+ database_id = databaseId,
+ string_to_append = tablePrefix
+ )
+ sql <- SqlRender::translate(sql,
+ targetDialect = targetDialect,
+ tempEmulationSchema = tempEmulationSchema
+ )
DatabaseConnector::executeSql(conn, sql)
-
- #getId of new
- result <- checkTable(conn = conn,
- resultSchema = resultSchema,
- tablePrefix = tablePrefix,
- targetDialect = targetDialect,
- tableName = 'diagnostics',
- columnNames = c(
- 'model_design_id',
- 'database_id'
- ),
- values = c(
- modelDesignId,
- databaseId
- ),
- tempEmulationSchema = tempEmulationSchema
+
+ # getId of new
+ result <- checkTable(
+ conn = conn,
+ resultSchema = resultSchema,
+ tablePrefix = tablePrefix,
+ targetDialect = targetDialect,
+ tableName = "diagnostics",
+ columnNames = c(
+ "model_design_id",
+ "database_id"
+ ),
+ values = c(
+ modelDesignId,
+ databaseId
+ ),
+ tempEmulationSchema = tempEmulationSchema
)
-
- }
-
+ }
+
return(result$diagnosticId[1])
}
# replace the performance inserts with this single function...
addResultTable <- function(
- conn = conn,
- resultSchema,
- targetDialect,
- tableName = 'diagnostic_summary',
- resultIdName = 'diagnosticId',
- resultId,
- object,
- tablePrefix,
- tempEmulationSchema,
- overWriteIfExists = T
-){
-
+    conn,
+ resultSchema,
+ targetDialect,
+ tableName = "diagnostic_summary",
+ resultIdName = "diagnosticId",
+ resultId,
+ object,
+ tablePrefix,
+ tempEmulationSchema,
+ overWriteIfExists = TRUE) {
object[resultIdName] <- resultId
-
- # get column names and check all present in object
+
+ # get column names and check all present in object
columnNames <- getColumnNames(
- conn = conn,
- resultSchema = resultSchema,
- targetDialect = targetDialect,
- tableName = paste0(tablePrefix,tableName),
+ conn = conn,
+ resultSchema = resultSchema,
+ targetDialect = targetDialect,
+ tableName = paste0(tablePrefix, tableName),
tempEmulationSchema = tempEmulationSchema
)
- isValid <- sum(colnames(object)%in%columnNames) == length(columnNames)
-
- exists <- checkResultExists(
- conn = conn,
- resultSchema = resultSchema,
- targetDialect = targetDialect,
- tableName = paste0(tablePrefix,tableName),
- resultIdName = SqlRender::camelCaseToSnakeCase(resultIdName),
- resultId = resultId,
- tempEmulationSchema = tempEmulationSchema
- )
-
- if(isValid && (!exists || overWriteIfExists)){
-
- # REMOVE existing result
- if(exists){
- sql <- "delete from @result_schema.@table_name where @result_id_name = @result_id;"
- sql <- SqlRender::render(sql,
- result_id_name = SqlRender::camelCaseToSnakeCase(resultIdName),
- result_id = resultId,
- result_schema = resultSchema,
- table_name = paste0(tablePrefix,tableName)
- )
- sql <- SqlRender::translate(sql,
- targetDialect = targetDialect,
- tempEmulationSchema = tempEmulationSchema)
- DatabaseConnector::executeSql(conn, sql)
- }
-
- # add
- DatabaseConnector::insertTable(
- connection = conn,
- databaseSchema = resultSchema,
- tableName = paste0(tablePrefix,tableName),
- data = as.data.frame(object[,columnNames]),
- dropTableIfExists = F, createTable = F, tempTable = F,
- bulkLoad = F, camelCaseToSnakeCase = T, progressBar = T,
+ isValid <- sum(colnames(object) %in% columnNames) == length(columnNames)
+
+ exists <- checkResultExists(
+ conn = conn,
+ resultSchema = resultSchema,
+ targetDialect = targetDialect,
+ tableName = paste0(tablePrefix, tableName),
+ resultIdName = SqlRender::camelCaseToSnakeCase(resultIdName),
+ resultId = resultId,
+ tempEmulationSchema = tempEmulationSchema
+ )
+
+ if (isValid && (!exists || overWriteIfExists)) {
+ # REMOVE existing result
+ if (exists) {
+ sql <- "delete from @result_schema.@table_name where @result_id_name = @result_id;"
+ sql <- SqlRender::render(sql,
+ result_id_name = SqlRender::camelCaseToSnakeCase(resultIdName),
+ result_id = resultId,
+ result_schema = resultSchema,
+ table_name = paste0(tablePrefix, tableName)
+ )
+ sql <- SqlRender::translate(sql,
+ targetDialect = targetDialect,
tempEmulationSchema = tempEmulationSchema
)
+ DatabaseConnector::executeSql(conn, sql)
}
-
- return(invisible(NULL))
+
+ # add
+ DatabaseConnector::insertTable(
+ connection = conn,
+ databaseSchema = resultSchema,
+ tableName = paste0(tablePrefix, tableName),
+ data = as.data.frame(object[, columnNames]),
+ dropTableIfExists = FALSE, createTable = FALSE, tempTable = FALSE,
+ bulkLoad = FALSE, camelCaseToSnakeCase = TRUE, progressBar = TRUE,
+ tempEmulationSchema = tempEmulationSchema
+ )
}
+
+ return(invisible(NULL))
+}
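For orientation, a hedged usage sketch of the exported uploader from this file; the connection values, the "main" schema name, and the cohortDefinitions object are placeholders, and createDatabaseSchemaSettings arguments beyond resultSchema are assumed rather than taken from this patch:

library(PatientLevelPrediction)

# placeholder connection: a local SQLite file; any DatabaseConnector dbms works
connectionDetails <- DatabaseConnector::createConnectionDetails(
  dbms = "sqlite",
  server = file.path(tempdir(), "plpResults.sqlite")
)
databaseSchemaSettings <- createDatabaseSchemaSettings(
  resultSchema = "main" # placeholder schema holding the plp result tables
)
# uploads every Analysis_*/diagnosePlp.rds found under resultLocation
addMultipleDiagnosePlpToDatabase(
  connectionDetails = connectionDetails,
  databaseSchemaSettings = databaseSchemaSettings,
  cohortDefinitions = cohortDefinitions, # e.g. from ROhdsiWebApi::exportCohortDefinitionSet()
  resultLocation = "./results"
)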
diff --git a/R/uploadToDatabaseModelDesign.R b/R/uploadToDatabaseModelDesign.R
index f99decd90..20f4f4fc0 100644
--- a/R/uploadToDatabaseModelDesign.R
+++ b/R/uploadToDatabaseModelDesign.R
@@ -9,23 +9,20 @@
#' @param conn A connection to a database created by using the
#' function \code{connect} in the
#' \code{DatabaseConnector} package.
-#' @param databaseSchemaSettings A object created by \code{createDatabaseSchemaSettings} with all the settings specifying the result tables
+#' @param databaseSchemaSettings An object created by \code{createDatabaseSchemaSettings} with all the settings specifying the result tables
#' @param cohortDefinitions A set of one or more cohorts extracted using ROhdsiWebApi::exportCohortDefinitionSet()
-#'
+#'
#' @return
#' Returns NULL but uploads the model design into the database schema specified in databaseSchemaSettings
-#'
+#'
#' @export
insertModelDesignInDatabase <- function(
- object,
- conn,
- databaseSchemaSettings,
- cohortDefinitions
-){
-
+ object,
+ conn,
+ databaseSchemaSettings,
+ cohortDefinitions) {
# REMOVE THIS
- if(inherits(object, 'externalValidatePlp') | inherits(object, 'runPlp')){
-
+ if (inherits(object, "externalValidatePlp") || inherits(object, "runPlp")) {
object <- PatientLevelPrediction::createModelDesign(
targetId = object$model$modelDesign$targetId,
outcomeId = object$model$modelDesign$outcomeId,
@@ -36,12 +33,11 @@ insertModelDesignInDatabase <- function(
sampleSettings = object$model$modelDesign$sampleSettings,
preprocessSettings = object$model$modelDesign$preprocessSettings,
modelSettings = object$model$modelDesign$modelSettings,
- runCovariateSummary = T
+ runCovariateSummary = TRUE
)
-
}
-
- if(inherits(object, 'modelDesign')){
+
+ if (inherits(object, "modelDesign")) {
modelDesignId <- insertModelDesignSettings(
object = object,
conn = conn,
@@ -50,7 +46,7 @@ insertModelDesignInDatabase <- function(
)
return(modelDesignId)
}
-
+
return(NULL)
}
@@ -58,142 +54,140 @@ insertModelDesignInDatabase <- function(
# this function inserts all the settings for the model design
# it returns the model_design_id for the database
insertModelDesignSettings <- function(
- object,
- conn,
- databaseSchemaSettings,
- cohortDefinitions
-){
-
- if(!inherits(x = object, what = 'modelDesign')){
- stop('object in insertModelDesign() is not a modelDesign')
+ object,
+ conn,
+ databaseSchemaSettings,
+ cohortDefinitions) {
+ if (!inherits(x = object, what = "modelDesign")) {
+ stop("object in insertModelDesign() is not a modelDesign")
}
-
+
# add TAR
tarId <- addTar(
- conn = conn,
- resultSchema = databaseSchemaSettings$resultSchema,
+ conn = conn,
+ resultSchema = databaseSchemaSettings$resultSchema,
targetDialect = databaseSchemaSettings$targetDialect,
- startDay = object$populationSettings$riskWindowStart,
+ startDay = object$populationSettings$riskWindowStart,
startAnchor = object$populationSettings$startAnchor,
- endDay = object$populationSettings$riskWindowEnd,
- endAnchor = object$populationSettings$endAnchor,
+ endDay = object$populationSettings$riskWindowEnd,
+ endAnchor = object$populationSettings$endAnchor,
tablePrefix = databaseSchemaSettings$tablePrefix,
tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema
)
- ParallelLogger::logInfo(paste0('tarId: ', tarId))
-
+ ParallelLogger::logInfo(paste0("tarId: ", tarId))
+
tId <- addCohort(
- conn = conn,
- resultSchema = databaseSchemaSettings$cohortDefinitionSchema,
+ conn = conn,
+ resultSchema = databaseSchemaSettings$cohortDefinitionSchema,
targetDialect = databaseSchemaSettings$targetDialect,
cohortDefinition = getCohortDef(cohortDefinitions, object$targetId),
tablePrefix = databaseSchemaSettings$tablePrefixCohortDefinitionTables,
tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema
)
- ParallelLogger::logInfo(paste0('tId: ', tId))
-
+ ParallelLogger::logInfo(paste0("tId: ", tId))
+
oId <- addCohort(
- conn = conn,
- resultSchema = databaseSchemaSettings$cohortDefinitionSchema,
+ conn = conn,
+ resultSchema = databaseSchemaSettings$cohortDefinitionSchema,
targetDialect = databaseSchemaSettings$targetDialect,
cohortDefinition = getCohortDef(cohortDefinitions, object$outcomeId),
tablePrefix = databaseSchemaSettings$tablePrefixCohortDefinitionTables,
tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema
)
- ParallelLogger::logInfo(paste0('oId: ', oId))
-
+ ParallelLogger::logInfo(paste0("oId: ", oId))
+
popSetId <- addPopulationSetting(
- conn = conn,
- resultSchema = databaseSchemaSettings$resultSchema,
+ conn = conn,
+ resultSchema = databaseSchemaSettings$resultSchema,
targetDialect = databaseSchemaSettings$targetDialect,
- json = object$populationSettings,
+ json = object$populationSettings,
tablePrefix = databaseSchemaSettings$tablePrefix,
tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema
)
- ParallelLogger::logInfo(paste0('popSetId: ', popSetId))
-
+ ParallelLogger::logInfo(paste0("popSetId: ", popSetId))
+
covSetId <- addCovariateSetting(
- conn = conn,
- resultSchema = databaseSchemaSettings$resultSchema,
+ conn = conn,
+ resultSchema = databaseSchemaSettings$resultSchema,
targetDialect = databaseSchemaSettings$targetDialect,
- json = object$covariateSettings,
+ json = object$covariateSettings,
tablePrefix = databaseSchemaSettings$tablePrefix,
tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema
)
- ParallelLogger::logInfo(paste0('covSetId: ', covSetId))
-
+ ParallelLogger::logInfo(paste0("covSetId: ", covSetId))
+
modSetId <- addModelSetting(
- conn = conn,
- resultSchema = databaseSchemaSettings$resultSchema,
+ conn = conn,
+ resultSchema = databaseSchemaSettings$resultSchema,
targetDialect = databaseSchemaSettings$targetDialect,
- modelType = attr(object$modelSettings$param, 'settings')$modelType, # make this the same as model$trainDetails$modelName?
- json = object$modelSettings,
+ modelType = attr(object$modelSettings$param, "settings")$modelType, # make this the same as model$trainDetails$modelName?
+ json = object$modelSettings,
tablePrefix = databaseSchemaSettings$tablePrefix,
tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema
)
- ParallelLogger::logInfo(paste0('modSetId: ', modSetId))
-
+ ParallelLogger::logInfo(paste0("modSetId: ", modSetId))
+
# NEW: add plp_data_settings
plpDataSetId <- addPlpDataSetting(
- conn = conn,
- resultSchema = databaseSchemaSettings$resultSchema,
+ conn = conn,
+ resultSchema = databaseSchemaSettings$resultSchema,
targetDialect = databaseSchemaSettings$targetDialect,
- json = object$restrictPlpDataSettings,
+ json = object$restrictPlpDataSettings,
tablePrefix = databaseSchemaSettings$tablePrefix,
tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema
)
- ParallelLogger::logInfo(paste0('plpDataSetId: ', plpDataSetId))
-
+ ParallelLogger::logInfo(paste0("plpDataSetId: ", plpDataSetId))
+
# NEW: add FE_settings
FESetId <- addFESetting(
- conn = conn,
- resultSchema = databaseSchemaSettings$resultSchema,
+ conn = conn,
+ resultSchema = databaseSchemaSettings$resultSchema,
targetDialect = databaseSchemaSettings$targetDialect,
- json = object$featureEngineeringSettings,
+ json = object$featureEngineeringSettings,
tablePrefix = databaseSchemaSettings$tablePrefix,
tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema
)
- ParallelLogger::logInfo(paste0('FESetId: ', FESetId))
-
+ ParallelLogger::logInfo(paste0("FESetId: ", FESetId))
+
# NEW: add sample_settings
sampleSetId <- addSampleSetting(
- conn = conn,
- resultSchema = databaseSchemaSettings$resultSchema,
+ conn = conn,
+ resultSchema = databaseSchemaSettings$resultSchema,
targetDialect = databaseSchemaSettings$targetDialect,
- json = object$sampleSettings,
+ json = object$sampleSettings,
tablePrefix = databaseSchemaSettings$tablePrefix,
tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema
)
- ParallelLogger::logInfo(paste0('sampleSetId: ', sampleSetId))
-
+ ParallelLogger::logInfo(paste0("sampleSetId: ", sampleSetId))
+
# NEW: add tidy_covariate_settings
tidySetId <- addTidySetting(
- conn = conn,
- resultSchema = databaseSchemaSettings$resultSchema,
+ conn = conn,
+ resultSchema = databaseSchemaSettings$resultSchema,
targetDialect = databaseSchemaSettings$targetDialect,
- json = object$preprocessSettings,
+ json = object$preprocessSettings,
tablePrefix = databaseSchemaSettings$tablePrefix,
tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema
)
- ParallelLogger::logInfo(paste0('tidySetId: ', tidySetId))
-
-
+ ParallelLogger::logInfo(paste0("tidySetId: ", tidySetId))
+
+
# this is now split setting - update this function
splitId <- addSplitSettings(
- conn = conn,
- resultSchema = databaseSchemaSettings$resultSchema,
+ conn = conn,
+ resultSchema = databaseSchemaSettings$resultSchema,
targetDialect = databaseSchemaSettings$targetDialect,
json = object$splitSettings,
tablePrefix = databaseSchemaSettings$tablePrefix,
tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema
)
- ParallelLogger::logInfo(paste0('splitId: ', splitId))
-
+ ParallelLogger::logInfo(paste0("splitId: ", splitId))
+
# create this function
modelDesignId <- addModelDesign( # need to create
- conn = conn,
- resultSchema = databaseSchemaSettings$resultSchema,
+ conn = conn,
+ resultSchema = databaseSchemaSettings$resultSchema,
targetDialect = databaseSchemaSettings$targetDialect,
targetId = tId,
outcomeId = oId,
@@ -209,86 +203,84 @@ insertModelDesignSettings <- function(
tablePrefix = databaseSchemaSettings$tablePrefix,
tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema
)
- ParallelLogger::logInfo(paste0('modelDesignId: ', modelDesignId))
-
+ ParallelLogger::logInfo(paste0("modelDesignId: ", modelDesignId))
+
return(modelDesignId)
}
addModelDesign <- function(
- conn,
- resultSchema, targetDialect,
- tablePrefix = tablePrefix,
- targetId,
- outcomeId,
- tarId,
- plpDataSettingId,
- populationSettingId,
- modelSettingId,
- covariateSettingId,
- sampleSettingId,
- splitSettingId,
- featureEngineeringSettingId,
- tidyCovariatesSettingId,
- tempEmulationSchema = getOption("sqlRenderTempEmulationSchema")
-){
-
- if(is.null(targetId)){
- stop('targetId is null')
- }
- if(is.null(outcomeId)){
- stop('outcomeId is null')
- }
- if(is.null(tarId)){
- stop('tarId is null')
- }
-
- if(is.null(plpDataSettingId)){
- stop('plpDataSettingId is null')
- }
- if(is.null(populationSettingId)){
- stop('populationSettingId is null')
- }
- if(is.null(modelSettingId)){
- stop('modelSettingId is null')
- }
- if(is.null(covariateSettingId)){
- stop('covariateSettingId is null')
- }
- if(is.null(sampleSettingId)){
- stop('sampleSettingId is null')
- }
- if(is.null(splitSettingId)){
- stop('splitSettingId is null')
- }
- if(is.null(featureEngineeringSettingId)){
- stop('featureEngineeringSettingId is null')
- }
- if(is.null(tidyCovariatesSettingId)){
- stop('tidyCovariatesSettingId is null')
- }
-
+ conn,
+ resultSchema, targetDialect,
+    tablePrefix,
+ targetId,
+ outcomeId,
+ tarId,
+ plpDataSettingId,
+ populationSettingId,
+ modelSettingId,
+ covariateSettingId,
+ sampleSettingId,
+ splitSettingId,
+ featureEngineeringSettingId,
+ tidyCovariatesSettingId,
+ tempEmulationSchema = getOption("sqlRenderTempEmulationSchema")) {
+ if (is.null(targetId)) {
+ stop("targetId is null")
+ }
+ if (is.null(outcomeId)) {
+ stop("outcomeId is null")
+ }
+ if (is.null(tarId)) {
+ stop("tarId is null")
+ }
+
+ if (is.null(plpDataSettingId)) {
+ stop("plpDataSettingId is null")
+ }
+ if (is.null(populationSettingId)) {
+ stop("populationSettingId is null")
+ }
+ if (is.null(modelSettingId)) {
+ stop("modelSettingId is null")
+ }
+ if (is.null(covariateSettingId)) {
+ stop("covariateSettingId is null")
+ }
+ if (is.null(sampleSettingId)) {
+ stop("sampleSettingId is null")
+ }
+ if (is.null(splitSettingId)) {
+ stop("splitSettingId is null")
+ }
+ if (is.null(featureEngineeringSettingId)) {
+ stop("featureEngineeringSettingId is null")
+ }
+ if (is.null(tidyCovariatesSettingId)) {
+ stop("tidyCovariatesSettingId is null")
+ }
+
# process json to make it ordered...
# TODO
-
+
result <- checkTable(
- conn = conn,
- resultSchema = resultSchema,
+ conn = conn,
+ resultSchema = resultSchema,
tablePrefix = tablePrefix,
- targetDialect = targetDialect,
- tableName = 'model_designs',
+ targetDialect = targetDialect,
+ tableName = "model_designs",
columnNames = c(
- 'target_id',
- 'outcome_id',
- 'tar_id',
- 'plp_data_setting_id',
- 'population_setting_id',
- 'model_setting_id',
- 'covariate_setting_id',
- 'sample_setting_id',
- 'split_setting_id',
- 'feature_engineering_setting_id',
- 'tidy_covariates_setting_id'
- ),
+ "target_id",
+ "outcome_id",
+ "tar_id",
+ "plp_data_setting_id",
+ "population_setting_id",
+ "model_setting_id",
+ "covariate_setting_id",
+ "sample_setting_id",
+ "split_setting_id",
+ "feature_engineering_setting_id",
+ "tidy_covariates_setting_id"
+ ),
values = c(
targetId,
outcomeId,
@@ -304,8 +296,8 @@ addModelDesign <- function(
),
tempEmulationSchema = tempEmulationSchema
)
-
- if(nrow(result)==0){
+
+ if (nrow(result) == 0) {
# model
sql <- "INSERT INTO @my_schema.@string_to_appendmodel_designs(
target_id,
@@ -319,27 +311,27 @@ addModelDesign <- function(
split_setting_id,
feature_engineering_setting_id,
tidy_covariates_setting_id
- ) VALUES
- (
- @target_id,
- @outcome_id,
- @tar_id,
+ ) VALUES
+ (
+ @target_id,
+ @outcome_id,
+ @tar_id,
@plp_data_setting_id,
- @population_setting_id,
- @model_setting_id,
- @covariate_setting_id,
- @sample_setting_id,
- @split_setting_id,
- @feature_engineering_setting_id,
+ @population_setting_id,
+ @model_setting_id,
+ @covariate_setting_id,
+ @sample_setting_id,
+ @split_setting_id,
+ @feature_engineering_setting_id,
@tidy_covariates_setting_id
)"
sql <- SqlRender::render(
- sql,
+ sql,
my_schema = resultSchema,
target_id = targetId,
outcome_id = outcomeId,
tar_id = tarId,
- plp_data_setting_id= plpDataSettingId,
+ plp_data_setting_id = plpDataSettingId,
population_setting_id = populationSettingId,
model_setting_id = modelSettingId,
covariate_setting_id = covariateSettingId,
@@ -349,647 +341,642 @@ addModelDesign <- function(
tidy_covariates_setting_id = tidyCovariatesSettingId,
string_to_append = tablePrefix
)
- sql <- SqlRender::translate(sql, targetDialect = targetDialect,
- tempEmulationSchema = tempEmulationSchema)
+ sql <- SqlRender::translate(sql,
+ targetDialect = targetDialect,
+ tempEmulationSchema = tempEmulationSchema
+ )
DatabaseConnector::executeSql(conn, sql)
-
- #getId of new
- result <- checkTable(conn = conn,
- resultSchema = resultSchema,
- tablePrefix = tablePrefix,
- targetDialect = targetDialect,
- tableName = 'model_designs',
- columnNames = c(
- 'target_id',
- 'outcome_id',
- 'tar_id',
- 'plp_data_setting_id',
- 'population_setting_id',
- 'model_setting_id',
- 'covariate_setting_id',
- 'sample_setting_id',
- 'split_setting_id',
- 'feature_engineering_setting_id',
- 'tidy_covariates_setting_id'
- ),
- values = c(targetId,
- outcomeId,
- tarId,
- plpDataSettingId,
- populationSettingId,
- modelSettingId,
- covariateSettingId,
- sampleSettingId,
- splitSettingId,
- featureEngineeringSettingId,
- tidyCovariatesSettingId
- ),
- tempEmulationSchema = tempEmulationSchema
+
+ # getId of new
+ result <- checkTable(
+ conn = conn,
+ resultSchema = resultSchema,
+ tablePrefix = tablePrefix,
+ targetDialect = targetDialect,
+ tableName = "model_designs",
+ columnNames = c(
+ "target_id",
+ "outcome_id",
+ "tar_id",
+ "plp_data_setting_id",
+ "population_setting_id",
+ "model_setting_id",
+ "covariate_setting_id",
+ "sample_setting_id",
+ "split_setting_id",
+ "feature_engineering_setting_id",
+ "tidy_covariates_setting_id"
+ ),
+ values = c(
+ targetId,
+ outcomeId,
+ tarId,
+ plpDataSettingId,
+ populationSettingId,
+ modelSettingId,
+ covariateSettingId,
+ sampleSettingId,
+ splitSettingId,
+ featureEngineeringSettingId,
+ tidyCovariatesSettingId
+ ),
+ tempEmulationSchema = tempEmulationSchema
)
-
- }
-
+ }
+
return(result$modelDesignId[1])
}
addTar <- function(conn, resultSchema, targetDialect,
- tablePrefix = '',
- startDay,
+ tablePrefix = "",
+ startDay,
startAnchor,
- endDay,
+ endDay,
endAnchor,
- tempEmulationSchema = getOption("sqlRenderTempEmulationSchema")
-){
-
- result <- checkTable(conn = conn,
- resultSchema = resultSchema,
- tablePrefix = tablePrefix,
- targetDialect = targetDialect,
- tableName = 'tars',
- columnNames = c('tar_start_day', 'tar_start_anchor',
- 'tar_end_day', 'tar_end_anchor'),
- values = c(startDay,
- paste0("'",startAnchor,"'"),
- endDay,
- paste0("'",endAnchor,"'")),
- tempEmulationSchema = tempEmulationSchema
+ tempEmulationSchema = getOption("sqlRenderTempEmulationSchema")) {
+ result <- checkTable(
+ conn = conn,
+ resultSchema = resultSchema,
+ tablePrefix = tablePrefix,
+ targetDialect = targetDialect,
+ tableName = "tars",
+ columnNames = c(
+ "tar_start_day", "tar_start_anchor",
+ "tar_end_day", "tar_end_anchor"
+ ),
+ values = c(
+ startDay,
+ paste0("'", startAnchor, "'"),
+ endDay,
+ paste0("'", endAnchor, "'")
+ ),
+ tempEmulationSchema = tempEmulationSchema
)
-
- if(nrow(result)==0){
-
- ParallelLogger::logInfo('Adding TAR')
+
+ if (nrow(result) == 0) {
+ ParallelLogger::logInfo("Adding TAR")
# tars - id 1
sql <- "INSERT INTO @my_schema.@string_to_appendtars(tar_start_day, tar_start_anchor,
- tar_end_day, tar_end_anchor)
+ tar_end_day, tar_end_anchor)
VALUES (@tar_start_day, @tar_start_anchor, @tar_end_day, @tar_end_anchor);"
- sql <- SqlRender::render(sql,
- my_schema = resultSchema,
- tar_start_day = startDay,
- tar_start_anchor = paste0("'",startAnchor,"'"),
- tar_end_day = endDay,
- tar_end_anchor = paste0("'",endAnchor,"'"),
- string_to_append = tablePrefix)
-
- sql <- SqlRender::translate(sql, targetDialect = targetDialect,
- tempEmulationSchema = tempEmulationSchema)
-
+ sql <- SqlRender::render(sql,
+ my_schema = resultSchema,
+ tar_start_day = startDay,
+ tar_start_anchor = paste0("'", startAnchor, "'"),
+ tar_end_day = endDay,
+ tar_end_anchor = paste0("'", endAnchor, "'"),
+ string_to_append = tablePrefix
+ )
+
+ sql <- SqlRender::translate(sql,
+ targetDialect = targetDialect,
+ tempEmulationSchema = tempEmulationSchema
+ )
+
DatabaseConnector::executeSql(conn, sql)
-
- #getId of new
- result <- checkTable(conn = conn,
- resultSchema = resultSchema,
- tablePrefix = tablePrefix,
- targetDialect = targetDialect,
- tableName = 'tars',
- columnNames = c('tar_start_day', 'tar_start_anchor',
- 'tar_end_day', 'tar_end_anchor'),
- values = c(startDay,
- paste0("'",startAnchor,"'"),
- endDay,
- paste0("'",endAnchor,"'")),
- tempEmulationSchema = tempEmulationSchema
+
+ # getId of new
+ result <- checkTable(
+ conn = conn,
+ resultSchema = resultSchema,
+ tablePrefix = tablePrefix,
+ targetDialect = targetDialect,
+ tableName = "tars",
+ columnNames = c(
+ "tar_start_day", "tar_start_anchor",
+ "tar_end_day", "tar_end_anchor"
+ ),
+ values = c(
+ startDay,
+ paste0("'", startAnchor, "'"),
+ endDay,
+ paste0("'", endAnchor, "'")
+ ),
+ tempEmulationSchema = tempEmulationSchema
)
-
} else {
- ParallelLogger::logInfo('TAR exists')
+ ParallelLogger::logInfo("TAR exists")
}
-
-
+
+
return(result$tarId[1])
-
}
addPopulationSetting <- function(conn, resultSchema, targetDialect,
- tablePrefix = '',
+ tablePrefix = "",
json,
- tempEmulationSchema = getOption("sqlRenderTempEmulationSchema")
-){
-
+ tempEmulationSchema = getOption("sqlRenderTempEmulationSchema")) {
# process json to make it ordered...
- # make sure the json has been converted
- if(!inherits(x = json, what = 'character')){
- json <- orderJson(json) # to ensure attributes are alphabetic order
- json <- ParallelLogger::convertSettingsToJson(json)
- json <- as.character(json) # now convert to character
- }
-
- jsonId <- checkJson(conn = conn,
- resultSchema = resultSchema,
- tablePrefix = tablePrefix,
- targetDialect = targetDialect,
- tableName = 'population_settings',
- jsonColumnName = 'populationSettingsJson',
- id = 'populationSettingId',
- json = json,
- tempEmulationSchema = tempEmulationSchema)
-
- if(is.null(jsonId)){
- ParallelLogger::logInfo('Adding new population settings')
-
+ # make sure the json has been converted
+ if (!inherits(x = json, what = "character")) {
+ json <- orderJson(json) # to ensure attributes are alphabetic order
+ json <- ParallelLogger::convertSettingsToJson(json)
+ json <- as.character(json) # now convert to character
+ }
+
+ jsonId <- checkJson(
+ conn = conn,
+ resultSchema = resultSchema,
+ tablePrefix = tablePrefix,
+ targetDialect = targetDialect,
+ tableName = "population_settings",
+ jsonColumnName = "populationSettingsJson",
+ id = "populationSettingId",
+ json = json,
+ tempEmulationSchema = tempEmulationSchema
+ )
+
+ if (is.null(jsonId)) {
+ ParallelLogger::logInfo("Adding new population settings")
+
data <- data.frame(populationSettingsJson = json)
- DatabaseConnector::insertTable(connection = conn,
- databaseSchema = resultSchema,
- tableName = paste0(tablePrefix, 'population_settings'),
- data = data,
- dropTableIfExists = F,
- createTable = F,
- tempTable = F,
- progressBar = T,
- camelCaseToSnakeCase = T,
- tempEmulationSchema = tempEmulationSchema
+ DatabaseConnector::insertTable(
+ connection = conn,
+ databaseSchema = resultSchema,
+ tableName = paste0(tablePrefix, "population_settings"),
+ data = data,
+ dropTableIfExists = FALSE,
+ createTable = FALSE,
+ tempTable = FALSE,
+ progressBar = TRUE,
+ camelCaseToSnakeCase = TRUE,
+ tempEmulationSchema = tempEmulationSchema
)
-
- #getId of new
- jsonId <- checkJson(conn = conn,
- resultSchema = resultSchema,
- tablePrefix = tablePrefix,
- targetDialect = targetDialect,
- tableName = 'population_settings',
- jsonColumnName = 'populationSettingsJson',
- id = 'populationSettingId',
- json = json,
- tempEmulationSchema = tempEmulationSchema)
-
- } else{
- ParallelLogger::logInfo('Population settings exists')
- }
-
+
+ # getId of new
+ jsonId <- checkJson(
+ conn = conn,
+ resultSchema = resultSchema,
+ tablePrefix = tablePrefix,
+ targetDialect = targetDialect,
+ tableName = "population_settings",
+ jsonColumnName = "populationSettingsJson",
+ id = "populationSettingId",
+ json = json,
+ tempEmulationSchema = tempEmulationSchema
+ )
+ } else {
+ ParallelLogger::logInfo("Population settings exists")
+ }
+
return(jsonId)
}
addCovariateSetting <- function(conn, resultSchema, targetDialect,
- tablePrefix = '',
+ tablePrefix = "",
json,
- tempEmulationSchema = getOption("sqlRenderTempEmulationSchema")
-){
-
+ tempEmulationSchema = getOption("sqlRenderTempEmulationSchema")) {
# process json to make it ordered...
- # make sure the json has been converted
- if(!inherits(x = json, what = 'character')){
+ # make sure the json has been converted
+ if (!inherits(x = json, what = "character")) {
json <- orderJson(json) # to ensure attributes are alphabetic order
json <- ParallelLogger::convertSettingsToJson(json)
json <- as.character(json) # now convert to character
}
-
+
jsonId <- checkJson(
conn = conn,
- resultSchema = resultSchema,
+ resultSchema = resultSchema,
tablePrefix = tablePrefix,
- targetDialect = targetDialect,
- tableName = 'covariate_settings',
- jsonColumnName = 'covariateSettingsJson',
- id = 'covariateSettingId',
+ targetDialect = targetDialect,
+ tableName = "covariate_settings",
+ jsonColumnName = "covariateSettingsJson",
+ id = "covariateSettingId",
json = json,
tempEmulationSchema = tempEmulationSchema
)
-
- if(is.null(jsonId)){
-
- ParallelLogger::logInfo('Adding new covariate settings')
-
+
+ if (is.null(jsonId)) {
+ ParallelLogger::logInfo("Adding new covariate settings")
+
data <- data.frame(covariateSettingsJson = json)
- DatabaseConnector::insertTable(connection = conn,
- databaseSchema = resultSchema,
- tableName = paste0(tablePrefix, 'covariate_settings'),
- data = data,
- dropTableIfExists = F,
- createTable = F,
- tempTable = F,
- progressBar = T,
- camelCaseToSnakeCase = T,
- tempEmulationSchema = tempEmulationSchema
+ DatabaseConnector::insertTable(
+ connection = conn,
+ databaseSchema = resultSchema,
+ tableName = paste0(tablePrefix, "covariate_settings"),
+ data = data,
+ dropTableIfExists = FALSE,
+ createTable = FALSE,
+ tempTable = FALSE,
+ progressBar = TRUE,
+ camelCaseToSnakeCase = TRUE,
+ tempEmulationSchema = tempEmulationSchema
+ )
+
+ # getId of new
+ jsonId <- checkJson(
+ conn = conn,
+ resultSchema = resultSchema,
+ tablePrefix = tablePrefix,
+ targetDialect = targetDialect,
+ tableName = "covariate_settings",
+ jsonColumnName = "covariateSettingsJson",
+ id = "covariateSettingId",
+ json = json,
+ tempEmulationSchema = tempEmulationSchema
)
-
- #getId of new
- jsonId <- checkJson(conn = conn,
- resultSchema = resultSchema,
- tablePrefix = tablePrefix,
- targetDialect = targetDialect,
- tableName = 'covariate_settings',
- jsonColumnName = 'covariateSettingsJson',
- id = 'covariateSettingId',
- json = json,
- tempEmulationSchema = tempEmulationSchema)
-
- } else{
- ParallelLogger::logInfo('Covariate setting exists')
- }
-
+ } else {
+ ParallelLogger::logInfo("Covariate setting exists")
+ }
+
return(jsonId)
}
addModelSetting <- function(conn, resultSchema, targetDialect,
- tablePrefix = '',
+ tablePrefix = "",
modelType, json,
- tempEmulationSchema = getOption("sqlRenderTempEmulationSchema")
-){
-
+ tempEmulationSchema = getOption("sqlRenderTempEmulationSchema")) {
# process json to make it ordered...
- # make sure the json has been converted
- if(!inherits(x = json, what = 'character')){
+ # make sure the json has been converted
+ if (!inherits(x = json, what = "character")) {
json <- orderJson(json) # to ensure attributes are alphabetic order
json <- ParallelLogger::convertSettingsToJson(json)
json <- as.character(json) # now convert to character
}
-
- jsonId <- checkJson(conn = conn,
- resultSchema = resultSchema,
- tablePrefix = tablePrefix,
- targetDialect = targetDialect,
- tableName = 'model_settings',
- jsonColumnName = 'modelSettingsJson',
- id = 'modelSettingId',
- json = json,
- tempEmulationSchema = tempEmulationSchema)
-
- if(is.null(jsonId)){
-
- ParallelLogger::logInfo('Adding new model settings')
-
- data <- data.frame(modelType = modelType,
- modelSettingsJson = json)
- DatabaseConnector::insertTable(connection = conn,
- databaseSchema = resultSchema,
- tableName = paste0(tablePrefix, 'model_settings'),
- data = data,
- dropTableIfExists = F,
- createTable = F,
- tempTable = F,
- progressBar = T,
- camelCaseToSnakeCase = T,
- tempEmulationSchema = tempEmulationSchema)
-
- #getId of new
- jsonId <- checkJson(conn = conn,
- resultSchema = resultSchema,
- tablePrefix = tablePrefix,
- targetDialect = targetDialect,
- tableName = 'model_settings',
- jsonColumnName = 'modelSettingsJson',
- id = 'modelSettingId',
- json = json,
- tempEmulationSchema = tempEmulationSchema)
-
- } else{
- ParallelLogger::logInfo('Model setting exists')
- }
-
+
+ jsonId <- checkJson(
+ conn = conn,
+ resultSchema = resultSchema,
+ tablePrefix = tablePrefix,
+ targetDialect = targetDialect,
+ tableName = "model_settings",
+ jsonColumnName = "modelSettingsJson",
+ id = "modelSettingId",
+ json = json,
+ tempEmulationSchema = tempEmulationSchema
+ )
+
+ if (is.null(jsonId)) {
+ ParallelLogger::logInfo("Adding new model settings")
+
+ data <- data.frame(
+ modelType = modelType,
+ modelSettingsJson = json
+ )
+ DatabaseConnector::insertTable(
+ connection = conn,
+ databaseSchema = resultSchema,
+ tableName = paste0(tablePrefix, "model_settings"),
+ data = data,
+ dropTableIfExists = FALSE,
+ createTable = FALSE,
+ tempTable = FALSE,
+ progressBar = TRUE,
+ camelCaseToSnakeCase = TRUE,
+ tempEmulationSchema = tempEmulationSchema
+ )
+
+ # getId of new
+ jsonId <- checkJson(
+ conn = conn,
+ resultSchema = resultSchema,
+ tablePrefix = tablePrefix,
+ targetDialect = targetDialect,
+ tableName = "model_settings",
+ jsonColumnName = "modelSettingsJson",
+ id = "modelSettingId",
+ json = json,
+ tempEmulationSchema = tempEmulationSchema
+ )
+ } else {
+ ParallelLogger::logInfo("Model setting exists")
+ }
+
return(jsonId)
}
addTidySetting <- function(
- conn,
- resultSchema,
- targetDialect,
- tablePrefix = '',
- json,
- tempEmulationSchema = getOption("sqlRenderTempEmulationSchema")
-){
-
- if(!inherits(x = json, what = 'character')){
+ conn,
+ resultSchema,
+ targetDialect,
+ tablePrefix = "",
+ json,
+ tempEmulationSchema = getOption("sqlRenderTempEmulationSchema")) {
+ if (!inherits(x = json, what = "character")) {
json <- orderJson(json) # to ensure attributes are alphabetic order
json <- ParallelLogger::convertSettingsToJson(json)
json <- as.character(json) # now convert to character
}
-
- jsonId <- checkJson(conn = conn,
- resultSchema = resultSchema,
- tablePrefix = tablePrefix,
- targetDialect = targetDialect,
- tableName = 'tidy_covariates_settings',
- jsonColumnName = 'tidyCovariatesSettingsJson',
- id = 'tidyCovariatesSettingId',
- json = json,
- tempEmulationSchema = tempEmulationSchema)
-
- if(is.null(jsonId)){
-
- ParallelLogger::logInfo('Adding new tidy covariates settings')
-
+
+ jsonId <- checkJson(
+ conn = conn,
+ resultSchema = resultSchema,
+ tablePrefix = tablePrefix,
+ targetDialect = targetDialect,
+ tableName = "tidy_covariates_settings",
+ jsonColumnName = "tidyCovariatesSettingsJson",
+ id = "tidyCovariatesSettingId",
+ json = json,
+ tempEmulationSchema = tempEmulationSchema
+ )
+
+ if (is.null(jsonId)) {
+ ParallelLogger::logInfo("Adding new tidy covariates settings")
+
data <- data.frame(
tidyCovariatesSettingsJson = json
)
-
+
DatabaseConnector::insertTable(
- connection = conn,
- databaseSchema = resultSchema,
- tableName = paste0(tablePrefix, 'tidy_covariates_settings'),
- data = data,
- dropTableIfExists = F,
- createTable = F,
- tempTable = F,
- progressBar = T,
- camelCaseToSnakeCase = T,
+ connection = conn,
+ databaseSchema = resultSchema,
+ tableName = paste0(tablePrefix, "tidy_covariates_settings"),
+ data = data,
+ dropTableIfExists = FALSE,
+ createTable = FALSE,
+ tempTable = FALSE,
+ progressBar = TRUE,
+ camelCaseToSnakeCase = TRUE,
tempEmulationSchema = tempEmulationSchema
)
-
- #getId of new
+
+ # getId of new
jsonId <- checkJson(
conn = conn,
- resultSchema = resultSchema,
+ resultSchema = resultSchema,
tablePrefix = tablePrefix,
- targetDialect = targetDialect,
- tableName = 'tidy_covariates_settings',
- jsonColumnName = 'tidyCovariatesSettingsJson',
- id = 'tidyCovariatesSettingId',
+ targetDialect = targetDialect,
+ tableName = "tidy_covariates_settings",
+ jsonColumnName = "tidyCovariatesSettingsJson",
+ id = "tidyCovariatesSettingId",
json = json,
tempEmulationSchema = tempEmulationSchema
)
-
- } else{
- ParallelLogger::logInfo('tidy covariates setting exists')
+ } else {
+ ParallelLogger::logInfo("tidy covariates setting exists")
}
-
+
return(jsonId)
-
}
addSampleSetting <- function(
- conn,
- resultSchema,
- targetDialect,
- tablePrefix = '',
- json,
- tempEmulationSchema = getOption("sqlRenderTempEmulationSchema")
-){
-
- if(!inherits(x = json, what = 'character')){
+ conn,
+ resultSchema,
+ targetDialect,
+ tablePrefix = "",
+ json,
+ tempEmulationSchema = getOption("sqlRenderTempEmulationSchema")) {
+ if (!inherits(x = json, what = "character")) {
json <- orderJson(json) # to ensure attributes are alphabetic order
json <- ParallelLogger::convertSettingsToJson(json)
json <- as.character(json) # now convert to character
}
-
+
jsonId <- checkJson(
conn = conn,
- resultSchema = resultSchema,
+ resultSchema = resultSchema,
tablePrefix = tablePrefix,
- targetDialect = targetDialect,
- tableName = 'sample_settings',
- jsonColumnName = 'sampleSettingsJson',
- id = 'sampleSettingId',
+ targetDialect = targetDialect,
+ tableName = "sample_settings",
+ jsonColumnName = "sampleSettingsJson",
+ id = "sampleSettingId",
json = json,
tempEmulationSchema = tempEmulationSchema
)
-
- if(is.null(jsonId)){
-
- ParallelLogger::logInfo('Adding new sample settings')
-
+
+ if (is.null(jsonId)) {
+ ParallelLogger::logInfo("Adding new sample settings")
+
data <- data.frame(
sampleSettingsJson = json
)
-
+
DatabaseConnector::insertTable(
- connection = conn,
- databaseSchema = resultSchema,
- tableName = paste0(tablePrefix, 'sample_settings'),
- data = data,
- dropTableIfExists = F,
- createTable = F,
- tempTable = F,
- progressBar = T,
- camelCaseToSnakeCase = T,
+ connection = conn,
+ databaseSchema = resultSchema,
+ tableName = paste0(tablePrefix, "sample_settings"),
+ data = data,
+ dropTableIfExists = FALSE,
+ createTable = FALSE,
+ tempTable = FALSE,
+ progressBar = TRUE,
+ camelCaseToSnakeCase = TRUE,
tempEmulationSchema = tempEmulationSchema
)
-
- #getId of new
+
+ # getId of new
jsonId <- checkJson(
conn = conn,
- resultSchema = resultSchema,
+ resultSchema = resultSchema,
tablePrefix = tablePrefix,
- targetDialect = targetDialect,
- tableName = 'sample_settings',
- jsonColumnName = 'sampleSettingsJson',
- id = 'sampleSettingId',
+ targetDialect = targetDialect,
+ tableName = "sample_settings",
+ jsonColumnName = "sampleSettingsJson",
+ id = "sampleSettingId",
json = json,
tempEmulationSchema = tempEmulationSchema
)
-
- } else{
- ParallelLogger::logInfo('sample setting exists')
+ } else {
+ ParallelLogger::logInfo("sample setting exists")
}
-
+
return(jsonId)
-
}
addPlpDataSetting <- function(
- conn,
- resultSchema,
- targetDialect,
- tablePrefix = '',
- json,
- tempEmulationSchema = getOption("sqlRenderTempEmulationSchema")
-){
-
- if(!inherits(x = json, what = 'character')){
+ conn,
+ resultSchema,
+ targetDialect,
+ tablePrefix = "",
+ json,
+ tempEmulationSchema = getOption("sqlRenderTempEmulationSchema")) {
+ if (!inherits(x = json, what = "character")) {
json <- orderJson(json) # to ensure attributes are alphabetic order
json <- ParallelLogger::convertSettingsToJson(json)
json <- as.character(json) # now convert to character
}
-
- jsonId <- checkJson(conn = conn,
- resultSchema = resultSchema,
- tablePrefix = tablePrefix,
- targetDialect = targetDialect,
- tableName = 'plp_data_settings',
- jsonColumnName = 'plpDataSettingsJson',
- id = 'plpDataSettingId',
- json = json,
- tempEmulationSchema = tempEmulationSchema)
-
- if(is.null(jsonId)){
-
- ParallelLogger::logInfo('Adding new plp data settings')
-
+
+ jsonId <- checkJson(
+ conn = conn,
+ resultSchema = resultSchema,
+ tablePrefix = tablePrefix,
+ targetDialect = targetDialect,
+ tableName = "plp_data_settings",
+ jsonColumnName = "plpDataSettingsJson",
+ id = "plpDataSettingId",
+ json = json,
+ tempEmulationSchema = tempEmulationSchema
+ )
+
+ if (is.null(jsonId)) {
+ ParallelLogger::logInfo("Adding new plp data settings")
+
data <- data.frame(
plpDataSettingsJson = json
)
-
+
DatabaseConnector::insertTable(
- connection = conn,
- databaseSchema = resultSchema,
- tableName = paste0(tablePrefix, 'plp_data_settings'),
- data = data,
- dropTableIfExists = F,
- createTable = F,
- tempTable = F,
- progressBar = T,
- camelCaseToSnakeCase = T,
+ connection = conn,
+ databaseSchema = resultSchema,
+ tableName = paste0(tablePrefix, "plp_data_settings"),
+ data = data,
+ dropTableIfExists = FALSE,
+ createTable = FALSE,
+ tempTable = FALSE,
+ progressBar = TRUE,
+ camelCaseToSnakeCase = TRUE,
tempEmulationSchema = tempEmulationSchema
)
-
- #getId of new
+
+ # getId of new
jsonId <- checkJson(
conn = conn,
- resultSchema = resultSchema,
+ resultSchema = resultSchema,
tablePrefix = tablePrefix,
- targetDialect = targetDialect,
- tableName = 'plp_data_settings',
- jsonColumnName = 'plpDataSettingsJson',
- id = 'plpDataSettingId',
+ targetDialect = targetDialect,
+ tableName = "plp_data_settings",
+ jsonColumnName = "plpDataSettingsJson",
+ id = "plpDataSettingId",
json = json,
tempEmulationSchema = tempEmulationSchema
)
-
- } else{
- ParallelLogger::logInfo('Split setting exists')
+ } else {
+ ParallelLogger::logInfo("Split setting exists")
}
-
+
return(jsonId)
-
}
addFESetting <- function(
- conn,
- resultSchema,
- targetDialect,
- tablePrefix = '',
- json,
- tempEmulationSchema = getOption("sqlRenderTempEmulationSchema")
-){
-
- if(!inherits(x = json, what = 'character')){
+ conn,
+ resultSchema,
+ targetDialect,
+ tablePrefix = "",
+ json,
+ tempEmulationSchema = getOption("sqlRenderTempEmulationSchema")) {
+ if (!inherits(x = json, what = "character")) {
json <- orderJson(json) # to ensure attributes are alphabetic order
json <- ParallelLogger::convertSettingsToJson(json)
json <- as.character(json) # now convert to character
}
-
+
jsonId <- checkJson(
conn = conn,
- resultSchema = resultSchema,
+ resultSchema = resultSchema,
tablePrefix = tablePrefix,
- targetDialect = targetDialect,
- tableName = 'feature_engineering_settings',
- jsonColumnName = 'featureEngineeringSettingsJson',
- id = 'featureEngineeringSettingId',
+ targetDialect = targetDialect,
+ tableName = "feature_engineering_settings",
+ jsonColumnName = "featureEngineeringSettingsJson",
+ id = "featureEngineeringSettingId",
json = json,
tempEmulationSchema = tempEmulationSchema
)
-
- if(is.null(jsonId)){
-
- ParallelLogger::logInfo('Adding new feature_engineering settings')
-
+
+ if (is.null(jsonId)) {
+ ParallelLogger::logInfo("Adding new feature_engineering settings")
+
data <- data.frame(
featureEngineeringSettingsJson = json
)
-
+
DatabaseConnector::insertTable(
- connection = conn,
- databaseSchema = resultSchema,
- tableName = paste0(tablePrefix, 'feature_engineering_settings'),
- data = data,
- dropTableIfExists = F,
- createTable = F,
- tempTable = F,
- progressBar = T,
- camelCaseToSnakeCase = T,
+ connection = conn,
+ databaseSchema = resultSchema,
+ tableName = paste0(tablePrefix, "feature_engineering_settings"),
+ data = data,
+ dropTableIfExists = FALSE,
+ createTable = FALSE,
+ tempTable = FALSE,
+ progressBar = TRUE,
+ camelCaseToSnakeCase = TRUE,
tempEmulationSchema = tempEmulationSchema
)
-
- #getId of new
+
+ # getId of new
jsonId <- checkJson(
conn = conn,
- resultSchema = resultSchema,
+ resultSchema = resultSchema,
tablePrefix = tablePrefix,
- targetDialect = targetDialect,
- tableName = 'feature_engineering_settings',
- jsonColumnName = 'featureEngineeringSettingsJson',
- id = 'featureEngineeringSettingId',
+ targetDialect = targetDialect,
+ tableName = "feature_engineering_settings",
+ jsonColumnName = "featureEngineeringSettingsJson",
+ id = "featureEngineeringSettingId",
json = json,
tempEmulationSchema = tempEmulationSchema
)
-
- } else{
- ParallelLogger::logInfo('feature engineering setting exists')
+ } else {
+ ParallelLogger::logInfo("feature engineering setting exists")
}
-
+
return(jsonId)
-
}
addSplitSettings <- function(
- conn,
- resultSchema,
- targetDialect,
- tablePrefix = '',
- json,
- tempEmulationSchema = getOption("sqlRenderTempEmulationSchema")
-){
-
- if(!inherits(x = json, what = 'character')){
+ conn,
+ resultSchema,
+ targetDialect,
+ tablePrefix = "",
+ json,
+ tempEmulationSchema = getOption("sqlRenderTempEmulationSchema")) {
+ if (!inherits(x = json, what = "character")) {
json <- orderJson(json) # to ensure attributes are alphabetic order
json <- ParallelLogger::convertSettingsToJson(json)
json <- as.character(json) # now convert to character
}
-
+
jsonId <- checkJson(
conn = conn,
- resultSchema = resultSchema,
+ resultSchema = resultSchema,
tablePrefix = tablePrefix,
- targetDialect = targetDialect,
- tableName = 'split_settings',
- jsonColumnName = 'splitSettingsJson',
- id = 'splitSettingId',
+ targetDialect = targetDialect,
+ tableName = "split_settings",
+ jsonColumnName = "splitSettingsJson",
+ id = "splitSettingId",
json = json,
tempEmulationSchema = tempEmulationSchema
)
-
- if(is.null(jsonId)){
-
- ParallelLogger::logInfo('Adding new split settings')
-
+
+ if (is.null(jsonId)) {
+ ParallelLogger::logInfo("Adding new split settings")
+
data <- data.frame(
splitSettingsJson = json
)
-
+
DatabaseConnector::insertTable(
- connection = conn,
- databaseSchema = resultSchema,
- tableName = paste0(tablePrefix, 'split_settings'),
- data = data,
- dropTableIfExists = F,
- createTable = F,
- tempTable = F,
- progressBar = T,
- camelCaseToSnakeCase = T,
+ connection = conn,
+ databaseSchema = resultSchema,
+ tableName = paste0(tablePrefix, "split_settings"),
+ data = data,
+ dropTableIfExists = FALSE,
+ createTable = FALSE,
+ tempTable = FALSE,
+ progressBar = TRUE,
+ camelCaseToSnakeCase = TRUE,
tempEmulationSchema = tempEmulationSchema
)
-
- #getId of new
+
+ # getId of new
jsonId <- checkJson(
conn = conn,
- resultSchema = resultSchema,
+ resultSchema = resultSchema,
tablePrefix = tablePrefix,
- targetDialect = targetDialect,
- tableName = 'split_settings',
- jsonColumnName = 'splitSettingsJson',
- id = 'splitSettingId',
+ targetDialect = targetDialect,
+ tableName = "split_settings",
+ jsonColumnName = "splitSettingsJson",
+ id = "splitSettingId",
json = json,
tempEmulationSchema = tempEmulationSchema
)
-
- } else{
- ParallelLogger::logInfo('Split setting exists')
+ } else {
+ ParallelLogger::logInfo("Split setting exists")
}
-
+
return(jsonId)
-
}
# the ParallelLogger conversion orders attributes - use this for consistency
-orderJson <- function(x){
-newx <- ParallelLogger::convertJsonToSettings(
- ParallelLogger::convertSettingsToJson(
- x
+orderJson <- function(x) {
+ newx <- ParallelLogger::convertJsonToSettings(
+ ParallelLogger::convertSettingsToJson(
+ x
)
)
-return(newx)
+ return(newx)
}
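
The settings helpers above all follow the same dedupe path: serialize the settings to a canonical JSON string, look it up with checkJson(), insert only when no row matches, then re-query for the id. A minimal standalone sketch (not package code) of why the orderJson() round trip matters; it assumes, per the comment above, that ParallelLogger's conversion orders attributes:

canonical <- function(x) {
  # round-trip through ParallelLogger so logically equal settings
  # serialize to byte-identical JSON strings
  as.character(ParallelLogger::convertSettingsToJson(
    ParallelLogger::convertJsonToSettings(
      ParallelLogger::convertSettingsToJson(x)
    )
  ))
}
a <- list(test = 0.25, train = 0.75, seed = 42)
b <- list(seed = 42, train = 0.75, test = 0.25)
identical(canonical(a), canonical(b)) # expected TRUE, so both map to one stored row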
diff --git a/R/uploadToDatabasePerformance.R b/R/uploadToDatabasePerformance.R
index 4b36af84c..eefb4308b 100644
--- a/R/uploadToDatabasePerformance.R
+++ b/R/uploadToDatabasePerformance.R
@@ -1,835 +1,890 @@
insertPerformanceInDatabase <- function(
- performanceEvaluation = NULL,
- covariateSummary = NULL,
- attrition = NULL,
- executionDateTime,
- plpVersion,
-
- conn,
- databaseSchemaSettings,
-
- modelDesignId,
- developmentDatabaseId,
-
- validationDatabaseId,
- validationTarId,
- validationPopulationId,
- validationPlpDataId,
- validationTargetId,
- validationOutcomeId,
-
- modelDevelopment #added
-
-){
-
- ParallelLogger::logInfo(paste0('Inserting performance...'))
-
+ performanceEvaluation = NULL,
+ covariateSummary = NULL,
+ attrition = NULL,
+ executionDateTime,
+ plpVersion,
+ conn,
+ databaseSchemaSettings,
+ modelDesignId,
+ developmentDatabaseId,
+ validationDatabaseId,
+ validationTarId,
+ validationPopulationId,
+ validationPlpDataId,
+ validationTargetId,
+ validationOutcomeId,
+ modelDevelopment # added
+ ) {
+  ParallelLogger::logInfo("Inserting performance...")
+
# add the results
performanceId <- addPerformance(
- conn = conn,
- resultSchema = databaseSchemaSettings$resultSchema,
+ conn = conn,
+ resultSchema = databaseSchemaSettings$resultSchema,
targetDialect = databaseSchemaSettings$targetDialect,
-
modelDesignId = modelDesignId,
developmentDatabaseId = developmentDatabaseId,
-
validationDatabaseId = validationDatabaseId,
validationTargetId = validationTargetId,
validationOutcomeId = validationOutcomeId,
validationTarId = validationTarId,
validationPlpDataId = validationPlpDataId,
validationPopulationId = validationPopulationId,
-
modelDevelopment = modelDevelopment,
executionDateTime = executionDateTime,
- plpVersion = plpVersion,
+ plpVersion = plpVersion,
tablePrefix = databaseSchemaSettings$tablePrefix,
tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema
)
- ParallelLogger::logInfo(paste0('performanceId: ', performanceId))
-
+ ParallelLogger::logInfo(paste0("performanceId: ", performanceId))
+
# add attrition
- if(!is.null(attrition)){
+ if (!is.null(attrition)) {
addAttrition(
- conn = conn,
- resultSchema = databaseSchemaSettings$resultSchema,
+ conn = conn,
+ resultSchema = databaseSchemaSettings$resultSchema,
targetDialect = databaseSchemaSettings$targetDialect,
performanceId = performanceId,
attrition = attrition,
- overWriteIfExists = T,
+ overWriteIfExists = TRUE,
tablePrefix = databaseSchemaSettings$tablePrefix,
tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema
)
}
-
- # add performance
- #=============
- if(!is.null(performanceEvaluation)){
+
+ # add performance
+ # =============
+ if (!is.null(performanceEvaluation)) {
addEvaluation(
- conn = conn,
- resultSchema = databaseSchemaSettings$resultSchema,
+ conn = conn,
+ resultSchema = databaseSchemaSettings$resultSchema,
targetDialect = databaseSchemaSettings$targetDialect,
performanceId = performanceId,
performanceEvaluation = performanceEvaluation,
- overWriteIfExists = T,
+ overWriteIfExists = TRUE,
tablePrefix = databaseSchemaSettings$tablePrefix,
tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema
)
}
-
- if(!is.null(covariateSummary)){
+
+ if (!is.null(covariateSummary)) {
addCovariateSummary(
- conn = conn,
- resultSchema = databaseSchemaSettings$resultSchema,
+ conn = conn,
+ resultSchema = databaseSchemaSettings$resultSchema,
targetDialect = databaseSchemaSettings$targetDialect,
performanceId = performanceId,
covariateSummary = covariateSummary,
- restrictToIncluded = T,
- overWriteIfExists = T,
+ restrictToIncluded = TRUE,
+ overWriteIfExists = TRUE,
tablePrefix = databaseSchemaSettings$tablePrefix,
tempEmulationSchema = databaseSchemaSettings$tempEmulationSchema
)
}
-
+
return(invisible(performanceId))
-
}
addPerformance <- function(
- conn,
- resultSchema,
- targetDialect,
-
- modelDesignId,
- developmentDatabaseId,
-
- validationDatabaseId,
- validationTargetId,
- validationOutcomeId,
- validationTarId,
- validationPlpDataId,
- validationPopulationId,
-
- modelDevelopment,
- executionDateTime,
- plpVersion,
- tablePrefix,
- tempEmulationSchema
-
-){
-
- result <- checkTable(conn = conn,
- resultSchema = resultSchema,
- tablePrefix = tablePrefix,
- targetDialect = targetDialect,
- tableName = 'performances',
- columnNames = c(
- 'model_design_id',
- 'development_database_id',
-
- 'validation_database_id',
- 'target_id',
- 'outcome_id',
- 'tar_id',
- 'plp_data_setting_id',
- 'population_setting_id',
- 'model_development'
- ),
- values = c(
- modelDesignId,
- developmentDatabaseId,
-
- validationDatabaseId,
- validationTargetId,
- validationOutcomeId,
- validationTarId,
- validationPlpDataId,
- validationPopulationId,
- modelDevelopment
- ),
- tempEmulationSchema = tempEmulationSchema
+ conn,
+ resultSchema,
+ targetDialect,
+ modelDesignId,
+ developmentDatabaseId,
+ validationDatabaseId,
+ validationTargetId,
+ validationOutcomeId,
+ validationTarId,
+ validationPlpDataId,
+ validationPopulationId,
+ modelDevelopment,
+ executionDateTime,
+ plpVersion,
+ tablePrefix,
+ tempEmulationSchema) {
+ result <- checkTable(
+ conn = conn,
+ resultSchema = resultSchema,
+ tablePrefix = tablePrefix,
+ targetDialect = targetDialect,
+ tableName = "performances",
+ columnNames = c(
+ "model_design_id",
+ "development_database_id",
+ "validation_database_id",
+ "target_id",
+ "outcome_id",
+ "tar_id",
+ "plp_data_setting_id",
+ "population_setting_id",
+ "model_development"
+ ),
+ values = c(
+ modelDesignId,
+ developmentDatabaseId,
+ validationDatabaseId,
+ validationTargetId,
+ validationOutcomeId,
+ validationTarId,
+ validationPlpDataId,
+ validationPopulationId,
+ modelDevelopment
+ ),
+ tempEmulationSchema = tempEmulationSchema
)
-
- if(nrow(result)==0){
+
+ if (nrow(result) == 0) {
# model
sql <- "INSERT INTO @my_schema.@string_to_appendperformances (
model_design_id,
development_database_id,
-
+
validation_database_id,
target_id,
outcome_id,
tar_id,
plp_data_setting_id,
population_setting_id,
-
+
model_development,
-
+
execution_date_time,
plp_version
- )
+ )
VALUES (
- @model_design_id, @development_database_id,
- @validation_database_id, @validation_target_id, @validation_outcome_id, @validation_tar_id,
- @validation_plp_data_setting_id, @validation_population_setting_id,
- @model_development,
+ @model_design_id, @development_database_id,
+ @validation_database_id, @validation_target_id, @validation_outcome_id, @validation_tar_id,
+ @validation_plp_data_setting_id, @validation_population_setting_id,
+ @model_development,
'@execution_date_time', '@plp_version')"
- sql <- SqlRender::render(sql,
- my_schema = resultSchema,
- model_design_id = modelDesignId,
- development_database_id = developmentDatabaseId,
-
- validation_database_id = validationDatabaseId,
- validation_target_id = validationTargetId,
- validation_outcome_id = validationOutcomeId,
- validation_tar_id = validationTarId,
- validation_plp_data_setting_id = validationPlpDataId,
- validation_population_setting_id = validationPopulationId,
-
- model_development = modelDevelopment,
-
- execution_date_time = executionDateTime,
- plp_version = plpVersion,
- string_to_append = tablePrefix)
- sql <- SqlRender::translate(sql, targetDialect = targetDialect,
- tempEmulationSchema = tempEmulationSchema)
+ sql <- SqlRender::render(sql,
+ my_schema = resultSchema,
+ model_design_id = modelDesignId,
+ development_database_id = developmentDatabaseId,
+ validation_database_id = validationDatabaseId,
+ validation_target_id = validationTargetId,
+ validation_outcome_id = validationOutcomeId,
+ validation_tar_id = validationTarId,
+ validation_plp_data_setting_id = validationPlpDataId,
+ validation_population_setting_id = validationPopulationId,
+ model_development = modelDevelopment,
+ execution_date_time = executionDateTime,
+ plp_version = plpVersion,
+ string_to_append = tablePrefix
+ )
+ sql <- SqlRender::translate(sql,
+ targetDialect = targetDialect,
+ tempEmulationSchema = tempEmulationSchema
+ )
DatabaseConnector::executeSql(conn, sql)
-
- #getId of new
- result <- checkTable(conn = conn,
- resultSchema = resultSchema,
- tablePrefix = tablePrefix,
- targetDialect = targetDialect,
- tableName = 'performances',
- columnNames = c(
- 'model_design_id',
- 'development_database_id',
-
- 'validation_database_id',
- 'target_id',
- 'outcome_id',
- 'tar_id',
- 'plp_data_setting_id',
- 'population_setting_id',
-
- 'model_development'
- ),
- values = c(
- modelDesignId,
- developmentDatabaseId,
-
- validationDatabaseId,
- validationTargetId,
- validationOutcomeId,
- validationTarId,
- validationPlpDataId,
- validationPopulationId,
-
- modelDevelopment
- ),
- tempEmulationSchema = tempEmulationSchema
+
+ # getId of new
+ result <- checkTable(
+ conn = conn,
+ resultSchema = resultSchema,
+ tablePrefix = tablePrefix,
+ targetDialect = targetDialect,
+ tableName = "performances",
+ columnNames = c(
+ "model_design_id",
+ "development_database_id",
+ "validation_database_id",
+ "target_id",
+ "outcome_id",
+ "tar_id",
+ "plp_data_setting_id",
+ "population_setting_id",
+ "model_development"
+ ),
+ values = c(
+ modelDesignId,
+ developmentDatabaseId,
+ validationDatabaseId,
+ validationTargetId,
+ validationOutcomeId,
+ validationTarId,
+ validationPlpDataId,
+ validationPopulationId,
+ modelDevelopment
+ ),
+ tempEmulationSchema = tempEmulationSchema
)
-
- }
-
+ }
+
return(result$performanceId[1])
}
# attrition
addAttrition <- function(
- conn, resultSchema, targetDialect,
- tablePrefix = '',
- performanceId,
- attrition,
- overWriteIfExists = T,
- tempEmulationSchema = getOption("sqlRenderTempEmulationSchema")){
-
+ conn, resultSchema, targetDialect,
+ tablePrefix = "",
+ performanceId,
+ attrition,
+ overWriteIfExists = TRUE,
+ tempEmulationSchema = getOption("sqlRenderTempEmulationSchema")) {
value <- attrition
- if(is.null(value)){
+ if (is.null(value)) {
return(NULL)
}
-
+
# edit names
firstLower <- function(x) {
substr(x, 1, 1) <- tolower(substr(x, 1, 1))
return(x)
}
- colnames(value) <- sapply(colnames(value), firstLower )
-
+ colnames(value) <- sapply(colnames(value), firstLower)
+
value$performanceId <- performanceId
-
+
# get column names and check all present in object
- columnNames <- getColumnNames(conn = conn,
- resultSchema = resultSchema,
- targetDialect = targetDialect,
- tableName = paste0(tablePrefix,'attrition'),
- tempEmulationSchema = tempEmulationSchema)
- isValid <- sum(colnames(value)%in%columnNames) == length(columnNames)
-
- exists <- checkResultExists(conn = conn,
- resultSchema = resultSchema,
- targetDialect = targetDialect,
- tableName = paste0(tablePrefix,'attrition'),
- resultId = performanceId,
- tempEmulationSchema = tempEmulationSchema)
-
- if(isValid && (!exists || overWriteIfExists)){
-
+ columnNames <- getColumnNames(
+ conn = conn,
+ resultSchema = resultSchema,
+ targetDialect = targetDialect,
+ tableName = paste0(tablePrefix, "attrition"),
+ tempEmulationSchema = tempEmulationSchema
+ )
+ isValid <- sum(colnames(value) %in% columnNames) == length(columnNames)
+
+ exists <- checkResultExists(
+ conn = conn,
+ resultSchema = resultSchema,
+ targetDialect = targetDialect,
+ tableName = paste0(tablePrefix, "attrition"),
+ resultId = performanceId,
+ tempEmulationSchema = tempEmulationSchema
+ )
+
+ if (isValid && (!exists || overWriteIfExists)) {
# REMOVE existing result
- if(exists){
+ if (exists) {
sql <- "delete from @result_schema.@table_name where performance_id = @performance_id;"
- sql <- SqlRender::render(sql,
- performance_id = performanceId,
- result_schema = resultSchema,
- table_name = paste0(tablePrefix,'attrition'))
- sql <- SqlRender::translate(sql,
- targetDialect = targetDialect,
- tempEmulationSchema = tempEmulationSchema)
+ sql <- SqlRender::render(sql,
+ performance_id = performanceId,
+ result_schema = resultSchema,
+ table_name = paste0(tablePrefix, "attrition")
+ )
+ sql <- SqlRender::translate(sql,
+ targetDialect = targetDialect,
+ tempEmulationSchema = tempEmulationSchema
+ )
DatabaseConnector::executeSql(conn, sql)
}
-
- # add
- ParallelLogger::logInfo(paste0('Inserting attrition for performance ',performanceId))
- DatabaseConnector::insertTable(connection = conn,
- databaseSchema = resultSchema,
- tableName = paste0(tablePrefix,'attrition'),
- data = value[,columnNames],
- dropTableIfExists = F, createTable = F, tempTable = F,
- bulkLoad = F, camelCaseToSnakeCase = T, progressBar = T,
- tempEmulationSchema = tempEmulationSchema)
+
+ # add
+ ParallelLogger::logInfo(paste0("Inserting attrition for performance ", performanceId))
+ DatabaseConnector::insertTable(
+ connection = conn,
+ databaseSchema = resultSchema,
+ tableName = paste0(tablePrefix, "attrition"),
+ data = value[, columnNames],
+ dropTableIfExists = FALSE, createTable = FALSE, tempTable = FALSE,
+ bulkLoad = FALSE, camelCaseToSnakeCase = TRUE, progressBar = TRUE,
+ tempEmulationSchema = tempEmulationSchema
+ )
}
-
+
return(invisible(NULL))
}
# evals
addEvaluation <- function(conn, resultSchema, targetDialect,
- tablePrefix = '',
+ tablePrefix = "",
performanceId,
performanceEvaluation,
- overWriteIfExists = T,
- tempEmulationSchema = getOption("sqlRenderTempEmulationSchema")){
-
- ParallelLogger::logInfo('Adding PredictionDistribution')
- tryCatch({addPredictionDistribution(conn = conn, resultSchema = resultSchema, targetDialect = targetDialect,
- tablePrefix = tablePrefix,
- performanceId = performanceId,
- performanceEvaluation = performanceEvaluation,
- overWriteIfExists = overWriteIfExists,
- tempEmulationSchema = tempEmulationSchema)},
- error = function(e){ParallelLogger::logError(e);})
-
- ParallelLogger::logInfo('Adding ThresholdSummary')
- tryCatch({addThresholdSummary(conn = conn, resultSchema = resultSchema, targetDialect = targetDialect,
- tablePrefix = tablePrefix,
- performanceId = performanceId,
- performanceEvaluation = performanceEvaluation,
- overWriteIfExists = overWriteIfExists,
- tempEmulationSchema = tempEmulationSchema)},
- error = function(e){ParallelLogger::logError(e);})
-
- ParallelLogger::logInfo('Adding EvaluationStatistics')
- tryCatch({addEvaluationStatistics(conn = conn, resultSchema = resultSchema, targetDialect = targetDialect,
- tablePrefix = tablePrefix,
- performanceId = performanceId,
- performanceEvaluation = performanceEvaluation,
- overWriteIfExists = overWriteIfExists,
- tempEmulationSchema = tempEmulationSchema)},
- error = function(e){ParallelLogger::logError(e);})
-
- ParallelLogger::logInfo('Adding CalibrationSummary')
- tryCatch({addCalibrationSummary(conn = conn, resultSchema = resultSchema, targetDialect = targetDialect,
- tablePrefix = tablePrefix,
- performanceId = performanceId,
- performanceEvaluation = performanceEvaluation,
- overWriteIfExists = overWriteIfExists,
- tempEmulationSchema = tempEmulationSchema)},
- error = function(e){ParallelLogger::logError(e);})
-
- ParallelLogger::logInfo('Adding DemographicSummary')
- tryCatch({addDemographicSummary(conn = conn, resultSchema = resultSchema, targetDialect = targetDialect,
- tablePrefix = tablePrefix,
- performanceId = performanceId,
- performanceEvaluation = performanceEvaluation,
- overWriteIfExists = overWriteIfExists,
- tempEmulationSchema = tempEmulationSchema)},
- error = function(e){ParallelLogger::logError(e);})
-
+ overWriteIfExists = TRUE,
+ tempEmulationSchema = getOption("sqlRenderTempEmulationSchema")) {
+ ParallelLogger::logInfo("Adding PredictionDistribution")
+ tryCatch(
+ {
+ addPredictionDistribution(
+ conn = conn, resultSchema = resultSchema, targetDialect = targetDialect,
+ tablePrefix = tablePrefix,
+ performanceId = performanceId,
+ performanceEvaluation = performanceEvaluation,
+ overWriteIfExists = overWriteIfExists,
+ tempEmulationSchema = tempEmulationSchema
+ )
+ },
+ error = function(e) {
+ ParallelLogger::logError(e)
+ }
+ )
+
+ ParallelLogger::logInfo("Adding ThresholdSummary")
+ tryCatch(
+ {
+ addThresholdSummary(
+ conn = conn, resultSchema = resultSchema, targetDialect = targetDialect,
+ tablePrefix = tablePrefix,
+ performanceId = performanceId,
+ performanceEvaluation = performanceEvaluation,
+ overWriteIfExists = overWriteIfExists,
+ tempEmulationSchema = tempEmulationSchema
+ )
+ },
+ error = function(e) {
+ ParallelLogger::logError(e)
+ }
+ )
+
+ ParallelLogger::logInfo("Adding EvaluationStatistics")
+ tryCatch(
+ {
+ addEvaluationStatistics(
+ conn = conn, resultSchema = resultSchema, targetDialect = targetDialect,
+ tablePrefix = tablePrefix,
+ performanceId = performanceId,
+ performanceEvaluation = performanceEvaluation,
+ overWriteIfExists = overWriteIfExists,
+ tempEmulationSchema = tempEmulationSchema
+ )
+ },
+ error = function(e) {
+ ParallelLogger::logError(e)
+ }
+ )
+
+ ParallelLogger::logInfo("Adding CalibrationSummary")
+ tryCatch(
+ {
+ addCalibrationSummary(
+ conn = conn, resultSchema = resultSchema, targetDialect = targetDialect,
+ tablePrefix = tablePrefix,
+ performanceId = performanceId,
+ performanceEvaluation = performanceEvaluation,
+ overWriteIfExists = overWriteIfExists,
+ tempEmulationSchema = tempEmulationSchema
+ )
+ },
+ error = function(e) {
+ ParallelLogger::logError(e)
+ }
+ )
+
+ ParallelLogger::logInfo("Adding DemographicSummary")
+ tryCatch(
+ {
+ addDemographicSummary(
+ conn = conn, resultSchema = resultSchema, targetDialect = targetDialect,
+ tablePrefix = tablePrefix,
+ performanceId = performanceId,
+ performanceEvaluation = performanceEvaluation,
+ overWriteIfExists = overWriteIfExists,
+ tempEmulationSchema = tempEmulationSchema
+ )
+ },
+ error = function(e) {
+ ParallelLogger::logError(e)
+ }
+ )
+
return(invisible(NULL))
-
}
addPredictionDistribution <- function(conn, resultSchema, targetDialect,
- tablePrefix = '',
+ tablePrefix = "",
performanceId,
performanceEvaluation,
- overWriteIfExists = T,
- tempEmulationSchema = getOption("sqlRenderTempEmulationSchema")){
-
+ overWriteIfExists = TRUE,
+ tempEmulationSchema = getOption("sqlRenderTempEmulationSchema")) {
value <- performanceEvaluation$predictionDistribution
- if(is.null(value)){
+ if (is.null(value)) {
return(NULL)
}
-
+
# edit names
firstLower <- function(x) {
substr(x, 1, 1) <- tolower(substr(x, 1, 1))
return(x)
}
- colnames(value) <- sapply(colnames(value), firstLower )
- if(sum(colnames(value)=='class')>0){
- colnames(value)[colnames(value)=='class'] <- 'classLabel'
+ colnames(value) <- sapply(colnames(value), firstLower)
+ if (sum(colnames(value) == "class") > 0) {
+ colnames(value)[colnames(value) == "class"] <- "classLabel"
}
-
+
value$performanceId <- performanceId
-
+
# get column names and check all present in object
- columnNames <- getColumnNames(conn = conn,
- resultSchema = resultSchema,
- targetDialect = targetDialect,
- tableName = paste0(tablePrefix,'prediction_distribution'),
- tempEmulationSchema = tempEmulationSchema)
- isValid <- sum(colnames(value)%in%columnNames) == length(columnNames)
-
- exists <- checkResultExists(conn = conn,
- resultSchema = resultSchema,
- targetDialect = targetDialect,
- tableName = paste0(tablePrefix,'prediction_distribution'),
- resultIdName = 'performance_id',
- resultId = performanceId,
- tempEmulationSchema = tempEmulationSchema)
-
- if(isValid && (!exists || overWriteIfExists)){
-
+ columnNames <- getColumnNames(
+ conn = conn,
+ resultSchema = resultSchema,
+ targetDialect = targetDialect,
+ tableName = paste0(tablePrefix, "prediction_distribution"),
+ tempEmulationSchema = tempEmulationSchema
+ )
+ isValid <- sum(colnames(value) %in% columnNames) == length(columnNames)
+
+ exists <- checkResultExists(
+ conn = conn,
+ resultSchema = resultSchema,
+ targetDialect = targetDialect,
+ tableName = paste0(tablePrefix, "prediction_distribution"),
+ resultIdName = "performance_id",
+ resultId = performanceId,
+ tempEmulationSchema = tempEmulationSchema
+ )
+
+ if (isValid && (!exists || overWriteIfExists)) {
# REMOVE existing result
- if(exists){
+ if (exists) {
sql <- "delete from @result_schema.@table_name where performance_id = @performance_id;"
- sql <- SqlRender::render(sql,
- performance_id = performanceId,
- result_schema = resultSchema,
- table_name = paste0(tablePrefix,'prediction_distribution'))
- sql <- SqlRender::translate(sql,
- targetDialect = targetDialect,
- tempEmulationSchema = tempEmulationSchema)
+ sql <- SqlRender::render(sql,
+ performance_id = performanceId,
+ result_schema = resultSchema,
+ table_name = paste0(tablePrefix, "prediction_distribution")
+ )
+ sql <- SqlRender::translate(sql,
+ targetDialect = targetDialect,
+ tempEmulationSchema = tempEmulationSchema
+ )
DatabaseConnector::executeSql(conn, sql)
}
-
- # add
- ParallelLogger::logInfo(paste0('Inserting predictionDistribution for performance ', performanceId))
- DatabaseConnector::insertTable(connection = conn,
- databaseSchema = resultSchema,
- tableName = paste0(tablePrefix,'prediction_distribution'),
- data = value[,columnNames],
- dropTableIfExists = F, createTable = F, tempTable = F,
- bulkLoad = F, camelCaseToSnakeCase = T, progressBar = T,
- tempEmulationSchema = tempEmulationSchema)
+
+ # add
+ ParallelLogger::logInfo(paste0("Inserting predictionDistribution for performance ", performanceId))
+ DatabaseConnector::insertTable(
+ connection = conn,
+ databaseSchema = resultSchema,
+ tableName = paste0(tablePrefix, "prediction_distribution"),
+ data = value[, columnNames],
+ dropTableIfExists = FALSE, createTable = FALSE, tempTable = FALSE,
+ bulkLoad = FALSE, camelCaseToSnakeCase = TRUE, progressBar = TRUE,
+ tempEmulationSchema = tempEmulationSchema
+ )
}
-
+
return(invisible(NULL))
}
addThresholdSummary <- function(conn, resultSchema, targetDialect,
- tablePrefix = '',
+ tablePrefix = "",
performanceId,
performanceEvaluation,
- overWriteIfExists = T,
- tempEmulationSchema = getOption("sqlRenderTempEmulationSchema")){
-
-
+ overWriteIfExists = TRUE,
+ tempEmulationSchema = getOption("sqlRenderTempEmulationSchema")) {
value <- performanceEvaluation$thresholdSummary
- if(is.null(value)){
+ if (is.null(value)) {
return(NULL)
}
-
+
# check numerical columns:
value <- cleanNum(value)
-
+
# edit names
firstLower <- function(x) {
substr(x, 1, 1) <- tolower(substr(x, 1, 1))
return(x)
}
- colnames(value) <- sapply(colnames(value), firstLower )
+ colnames(value) <- sapply(colnames(value), firstLower)
value$performanceId <- performanceId
-
+
# get column names and check all present in object
- columnNames <- getColumnNames(conn = conn,
- resultSchema = resultSchema,
- targetDialect = targetDialect,
- tablePrefix = tablePrefix,
- tableName = 'threshold_summary',
- tempEmulationSchema = tempEmulationSchema)
- isValid <- sum(colnames(value)%in%columnNames) == length(columnNames)
-
- exists <- checkResultExists(conn = conn,
- resultSchema = resultSchema,
- targetDialect = targetDialect,
- tableName = paste0(tablePrefix,'threshold_summary'),
- resultIdName = 'performance_id',
- resultId = performanceId,
- tempEmulationSchema = tempEmulationSchema)
-
- if(isValid && (!exists || overWriteIfExists)){
-
+ columnNames <- getColumnNames(
+ conn = conn,
+ resultSchema = resultSchema,
+ targetDialect = targetDialect,
+ tablePrefix = tablePrefix,
+ tableName = "threshold_summary",
+ tempEmulationSchema = tempEmulationSchema
+ )
+ isValid <- sum(colnames(value) %in% columnNames) == length(columnNames)
+
+ exists <- checkResultExists(
+ conn = conn,
+ resultSchema = resultSchema,
+ targetDialect = targetDialect,
+ tableName = paste0(tablePrefix, "threshold_summary"),
+ resultIdName = "performance_id",
+ resultId = performanceId,
+ tempEmulationSchema = tempEmulationSchema
+ )
+
+ if (isValid && (!exists || overWriteIfExists)) {
# REMOVE existing result
- if(exists){
+ if (exists) {
sql <- "delete from @result_schema.@table_name where performance_id = @performance_id;"
- sql <- SqlRender::render(sql,
- result_schema = resultSchema,
- performance_id = performanceId,
- table_name = paste0(tablePrefix,'threshold_summary'))
- sql <- SqlRender::translate(sql,
- targetDialect = targetDialect,
- tempEmulationSchema = tempEmulationSchema)
+ sql <- SqlRender::render(sql,
+ result_schema = resultSchema,
+ performance_id = performanceId,
+ table_name = paste0(tablePrefix, "threshold_summary")
+ )
+ sql <- SqlRender::translate(sql,
+ targetDialect = targetDialect,
+ tempEmulationSchema = tempEmulationSchema
+ )
DatabaseConnector::executeSql(conn, sql)
}
-
- # add
- ParallelLogger::logInfo(paste0('Inserting thresholdSummary for performance ',performanceId))
- DatabaseConnector::insertTable(connection = conn,
- databaseSchema = resultSchema,
- tableName = paste0(tablePrefix,'threshold_summary'),
- data = value[,columnNames],
- dropTableIfExists = F, createTable = F, tempTable = F,
- bulkLoad = F, camelCaseToSnakeCase = T, progressBar = T,
- tempEmulationSchema = tempEmulationSchema)
+
+ # add
+ ParallelLogger::logInfo(paste0("Inserting thresholdSummary for performance ", performanceId))
+ DatabaseConnector::insertTable(
+ connection = conn,
+ databaseSchema = resultSchema,
+ tableName = paste0(tablePrefix, "threshold_summary"),
+ data = value[, columnNames],
+ dropTableIfExists = FALSE, createTable = FALSE, tempTable = FALSE,
+ bulkLoad = FALSE, camelCaseToSnakeCase = TRUE, progressBar = TRUE,
+ tempEmulationSchema = tempEmulationSchema
+ )
}
-
+
return(invisible(NULL))
}
-addCalibrationSummary <- function(conn, resultSchema, targetDialect,
- tablePrefix = '',
+addCalibrationSummary <- function(conn, resultSchema, targetDialect,
+ tablePrefix = "",
performanceId,
performanceEvaluation,
- overWriteIfExists = T,
- tempEmulationSchema = getOption("sqlRenderTempEmulationSchema")){
-
-
+ overWriteIfExists = TRUE,
+ tempEmulationSchema = getOption("sqlRenderTempEmulationSchema")) {
value <- performanceEvaluation$calibrationSummary
- if(is.null(value)){
+ if (is.null(value)) {
return(NULL)
}
-
+
# check numerical columns:
value <- cleanNum(value)
-
+
# edit names
firstLower <- function(x) {
substr(x, 1, 1) <- tolower(substr(x, 1, 1))
return(x)
}
- colnames(value) <- sapply(colnames(value), firstLower )
-
+ colnames(value) <- sapply(colnames(value), firstLower)
+
value$performanceId <- performanceId
-
+
# get column names and check all present in object
- columnNames <- getColumnNames(conn = conn,
- resultSchema = resultSchema,
- targetDialect = targetDialect,
- tablePrefix = tablePrefix,
- tableName = 'calibration_summary',
- tempEmulationSchema = tempEmulationSchema)
- isValid <- sum(colnames(value)%in%columnNames) == length(columnNames)
-
- exists <- checkResultExists(conn = conn,
- resultSchema = resultSchema,
- targetDialect = targetDialect,
- tableName = paste0(tablePrefix,'calibration_summary'),
- resultIdName = 'performance_id',
- resultId = performanceId,
- tempEmulationSchema = tempEmulationSchema)
-
- if(isValid && (!exists || overWriteIfExists)){
-
+ columnNames <- getColumnNames(
+ conn = conn,
+ resultSchema = resultSchema,
+ targetDialect = targetDialect,
+ tablePrefix = tablePrefix,
+ tableName = "calibration_summary",
+ tempEmulationSchema = tempEmulationSchema
+ )
+ isValid <- sum(colnames(value) %in% columnNames) == length(columnNames)
+
+ exists <- checkResultExists(
+ conn = conn,
+ resultSchema = resultSchema,
+ targetDialect = targetDialect,
+ tableName = paste0(tablePrefix, "calibration_summary"),
+ resultIdName = "performance_id",
+ resultId = performanceId,
+ tempEmulationSchema = tempEmulationSchema
+ )
+
+ if (isValid && (!exists || overWriteIfExists)) {
# REMOVE existing result
- if(exists){
+ if (exists) {
sql <- "delete from @result_schema.@table_name where performance_id = @performance_id;"
- sql <- SqlRender::render(sql,
- result_schema = resultSchema,
- performance_id= performanceId,
- table_name = paste0(tablePrefix,'calibration_summary'))
- sql <- SqlRender::translate(sql,
- targetDialect = targetDialect,
- tempEmulationSchema = tempEmulationSchema)
+ sql <- SqlRender::render(sql,
+ result_schema = resultSchema,
+ performance_id = performanceId,
+ table_name = paste0(tablePrefix, "calibration_summary")
+ )
+ sql <- SqlRender::translate(sql,
+ targetDialect = targetDialect,
+ tempEmulationSchema = tempEmulationSchema
+ )
DatabaseConnector::executeSql(conn, sql)
}
-
- # add
- ParallelLogger::logInfo(paste0('Inserting calibrationSummary for performance ', performanceId))
- DatabaseConnector::insertTable(connection = conn,
- databaseSchema = resultSchema,
- tableName = paste0(tablePrefix,'calibration_summary'),
- data = value[,columnNames],
- dropTableIfExists = F, createTable = F, tempTable = F,
- bulkLoad = F, camelCaseToSnakeCase = T, progressBar = T,
- tempEmulationSchema = tempEmulationSchema)
+
+ # add
+ ParallelLogger::logInfo(paste0("Inserting calibrationSummary for performance ", performanceId))
+ DatabaseConnector::insertTable(
+ connection = conn,
+ databaseSchema = resultSchema,
+ tableName = paste0(tablePrefix, "calibration_summary"),
+ data = value[, columnNames],
+ dropTableIfExists = FALSE, createTable = FALSE, tempTable = FALSE,
+ bulkLoad = FALSE, camelCaseToSnakeCase = TRUE, progressBar = TRUE,
+ tempEmulationSchema = tempEmulationSchema
+ )
}
-
+
return(invisible(NULL))
}
addEvaluationStatistics <- function(conn, resultSchema, targetDialect,
- tablePrefix = '',
+ tablePrefix = "",
performanceId,
performanceEvaluation,
- overWriteIfExists = T,
- tempEmulationSchema = getOption("sqlRenderTempEmulationSchema")){
-
-
+ overWriteIfExists = TRUE,
+ tempEmulationSchema = getOption("sqlRenderTempEmulationSchema")) {
value <- data.frame(
evaluation = unlist(performanceEvaluation$evaluationStatistics$evaluation),
metric = unlist(performanceEvaluation$evaluationStatistics$metric),
value = as.numeric(unlist(performanceEvaluation$evaluationStatistics$value))
)
-
- if(is.null(value)){
+
+  # a data.frame is never NULL, so guard on emptiness instead
+  if (nrow(value) == 0) {
return(NULL)
}
-
+
# edit names
firstLower <- function(x) {
substr(x, 1, 1) <- tolower(substr(x, 1, 1))
return(x)
}
- colnames(value) <- sapply(colnames(value), firstLower )
+ colnames(value) <- sapply(colnames(value), firstLower)
value$performanceId <- performanceId
-
+
# get column names and check all present in object
- columnNames <- getColumnNames(conn = conn,
- resultSchema = resultSchema,
- targetDialect = targetDialect,
- tablePrefix = tablePrefix,
- tableName = 'evaluation_statistics',
- tempEmulationSchema = tempEmulationSchema)
- isValid <- sum(colnames(value)%in%columnNames) == length(columnNames)
-
- exists <- checkResultExists(conn = conn,
- resultSchema = resultSchema,
- targetDialect = targetDialect,
- tableName = paste0(tablePrefix,'evaluation_statistics'),
- resultIdName = 'performance_id',
- resultId = performanceId,
- tempEmulationSchema = tempEmulationSchema)
-
- if(isValid && (!exists || overWriteIfExists)){
-
+ columnNames <- getColumnNames(
+ conn = conn,
+ resultSchema = resultSchema,
+ targetDialect = targetDialect,
+ tablePrefix = tablePrefix,
+ tableName = "evaluation_statistics",
+ tempEmulationSchema = tempEmulationSchema
+ )
+ isValid <- sum(colnames(value) %in% columnNames) == length(columnNames)
+
+ exists <- checkResultExists(
+ conn = conn,
+ resultSchema = resultSchema,
+ targetDialect = targetDialect,
+ tableName = paste0(tablePrefix, "evaluation_statistics"),
+ resultIdName = "performance_id",
+ resultId = performanceId,
+ tempEmulationSchema = tempEmulationSchema
+ )
+
+ if (isValid && (!exists || overWriteIfExists)) {
# REMOVE existing result
- if(exists){
+ if (exists) {
sql <- "delete from @result_schema.@table_name where performance_id = @performance_id;"
- sql <- SqlRender::render(sql,
- result_schema = resultSchema,
- performance_id = performanceId,
- table_name = paste0(tablePrefix,'evaluation_statistics'))
- sql <- SqlRender::translate(sql,
- targetDialect = targetDialect,
- tempEmulationSchema = tempEmulationSchema)
+ sql <- SqlRender::render(sql,
+ result_schema = resultSchema,
+ performance_id = performanceId,
+ table_name = paste0(tablePrefix, "evaluation_statistics")
+ )
+ sql <- SqlRender::translate(sql,
+ targetDialect = targetDialect,
+ tempEmulationSchema = tempEmulationSchema
+ )
DatabaseConnector::executeSql(conn, sql)
}
-
- # add
- ParallelLogger::logInfo(paste0('Inserting evaluationSummary for performance ',performanceId))
- DatabaseConnector::insertTable(connection = conn,
- databaseSchema = resultSchema,
- tableName = paste0(tablePrefix,'evaluation_statistics'),
- data = value[,columnNames],
- dropTableIfExists = F, createTable = F, tempTable = F,
- bulkLoad = F, camelCaseToSnakeCase = T, progressBar = T,
- tempEmulationSchema = tempEmulationSchema)
+
+ # add
+ ParallelLogger::logInfo(paste0("Inserting evaluationSummary for performance ", performanceId))
+ DatabaseConnector::insertTable(
+ connection = conn,
+ databaseSchema = resultSchema,
+ tableName = paste0(tablePrefix, "evaluation_statistics"),
+ data = value[, columnNames],
+ dropTableIfExists = FALSE, createTable = FALSE, tempTable = FALSE,
+ bulkLoad = FALSE, camelCaseToSnakeCase = TRUE, progressBar = TRUE,
+ tempEmulationSchema = tempEmulationSchema
+ )
}
-
+
return(invisible(NULL))
}
-addDemographicSummary <- function(conn, resultSchema, targetDialect,
- tablePrefix = '',
+addDemographicSummary <- function(conn, resultSchema, targetDialect,
+ tablePrefix = "",
performanceId,
performanceEvaluation,
- overWriteIfExists = T,
- tempEmulationSchema = getOption("sqlRenderTempEmulationSchema")){
-
-
+ overWriteIfExists = TRUE,
+ tempEmulationSchema = getOption("sqlRenderTempEmulationSchema")) {
value <- performanceEvaluation$demographicSummary
- if(is.null(value)){
+ if (is.null(value)) {
return(NULL)
}
-
+
# edit names
firstLower <- function(x) {
substr(x, 1, 1) <- tolower(substr(x, 1, 1))
return(x)
}
- colnames(value) <- sapply(colnames(value), firstLower )
- #if(sum(colnames(value)=="p50PredictedProbability")>0){
- # colnames(value)[colnames(value)=="p50PredictedProbability"] <- 'medianPredictedProbability'
- #}
-
+ colnames(value) <- sapply(colnames(value), firstLower)
+
value$performanceId <- performanceId
-
+
# get column names and check all present in object
- columnNames <- getColumnNames(conn = conn,
- resultSchema = resultSchema,
- targetDialect = targetDialect,
- tablePrefix = tablePrefix,
- tableName = 'demographic_summary',
- tempEmulationSchema = tempEmulationSchema)
- isValid <- sum(colnames(value)%in%columnNames) == length(columnNames)
-
- exists <- checkResultExists(conn = conn,
- resultSchema = resultSchema,
- targetDialect = targetDialect,
- tableName = paste0(tablePrefix,'demographic_summary'),
- resultIdName = 'performance_id',
- resultId = performanceId,
- tempEmulationSchema = tempEmulationSchema)
-
- if(isValid && (!exists || overWriteIfExists)){
-
+ columnNames <- getColumnNames(
+ conn = conn,
+ resultSchema = resultSchema,
+ targetDialect = targetDialect,
+ tablePrefix = tablePrefix,
+ tableName = "demographic_summary",
+ tempEmulationSchema = tempEmulationSchema
+ )
+ isValid <- sum(colnames(value) %in% columnNames) == length(columnNames)
+
+ exists <- checkResultExists(
+ conn = conn,
+ resultSchema = resultSchema,
+ targetDialect = targetDialect,
+ tableName = paste0(tablePrefix, "demographic_summary"),
+ resultIdName = "performance_id",
+ resultId = performanceId,
+ tempEmulationSchema = tempEmulationSchema
+ )
+
+ if (isValid && (!exists || overWriteIfExists)) {
# REMOVE existing result
- if(exists){
+ if (exists) {
sql <- "delete from @result_schema.@table_name where performance_id = @performance_id;"
- sql <- SqlRender::render(sql,
- result_schema = resultSchema,
- performance_id = performanceId,
- table_name = paste0(tablePrefix,'demographic_summary'))
- sql <- SqlRender::translate(sql,
- targetDialect = targetDialect,
- tempEmulationSchema = tempEmulationSchema)
+ sql <- SqlRender::render(sql,
+ result_schema = resultSchema,
+ performance_id = performanceId,
+ table_name = paste0(tablePrefix, "demographic_summary")
+ )
+ sql <- SqlRender::translate(sql,
+ targetDialect = targetDialect,
+ tempEmulationSchema = tempEmulationSchema
+ )
DatabaseConnector::executeSql(conn, sql)
}
-
- # add
- ParallelLogger::logInfo(paste0('Inserting demographicSummary for performance ',performanceId))
- DatabaseConnector::insertTable(connection = conn,
- databaseSchema = resultSchema,
- tableName = paste0(tablePrefix,'demographic_summary'),
- data = value[,columnNames],
- dropTableIfExists = F, createTable = F, tempTable = F,
- bulkLoad = F, camelCaseToSnakeCase = T, progressBar = T,
- tempEmulationSchema = tempEmulationSchema)
+
+ # add
+ ParallelLogger::logInfo(paste0("Inserting demographicSummary for performance ", performanceId))
+ DatabaseConnector::insertTable(
+ connection = conn,
+ databaseSchema = resultSchema,
+ tableName = paste0(tablePrefix, "demographic_summary"),
+ data = value[, columnNames],
+ dropTableIfExists = FALSE, createTable = FALSE, tempTable = FALSE,
+ bulkLoad = FALSE, camelCaseToSnakeCase = TRUE, progressBar = TRUE,
+ tempEmulationSchema = tempEmulationSchema
+ )
}
-
+
return(invisible(NULL))
}
-addCovariateSummary <- function(conn, resultSchema, targetDialect,
- tablePrefix = '',
+addCovariateSummary <- function(conn, resultSchema, targetDialect,
+ tablePrefix = "",
performanceId,
covariateSummary,
- restrictToIncluded = T,
- overWriteIfExists = T,
- tempEmulationSchema = getOption("sqlRenderTempEmulationSchema")){
-
-
+ restrictToIncluded = TRUE,
+ overWriteIfExists = TRUE,
+ tempEmulationSchema = getOption("sqlRenderTempEmulationSchema")) {
value <- covariateSummary
- if(is.null(value)){
+ if (is.null(value)) {
return(NULL)
}
-
+
# edit names
firstLower <- function(x) {
substr(x, 1, 1) <- tolower(substr(x, 1, 1))
return(x)
}
- colnames(value) <- sapply(colnames(value), firstLower )
+ colnames(value) <- sapply(colnames(value), firstLower)
value$performanceId <- performanceId
# remove _ from names
- colnames(value) <- gsub('_','', colnames(value))
-
- if(restrictToIncluded){
- ParallelLogger::logInfo('Restricting to covariates included in model')
- value <- value[value$covariateValue!=0 & !is.na(value$covariateValue),]
+ colnames(value) <- gsub("_", "", colnames(value))
+
+ if (restrictToIncluded) {
+ ParallelLogger::logInfo("Restricting to covariates included in model")
+ value <- value[value$covariateValue != 0 & !is.na(value$covariateValue), ]
}
-
+
# get column names and check all present in object
- columnNames <- getColumnNames(conn = conn,
- resultSchema = resultSchema,
- targetDialect = targetDialect,
- tablePrefix = tablePrefix,
- tableName = 'covariate_summary',
- tempEmulationSchema = tempEmulationSchema)
- isValid <- sum(colnames(value)%in%columnNames) == length(columnNames)
-
- exists <- checkResultExists(conn = conn,
- resultSchema = resultSchema,
- targetDialect = targetDialect,
- tableName = paste0(tablePrefix,'covariate_summary'),
- resultIdName = 'performance_id',
- resultId = performanceId,
- tempEmulationSchema = tempEmulationSchema)
-
- if(isValid && (!exists || overWriteIfExists)){
-
+ columnNames <- getColumnNames(
+ conn = conn,
+ resultSchema = resultSchema,
+ targetDialect = targetDialect,
+ tablePrefix = tablePrefix,
+ tableName = "covariate_summary",
+ tempEmulationSchema = tempEmulationSchema
+ )
+ isValid <- sum(colnames(value) %in% columnNames) == length(columnNames)
+
+ exists <- checkResultExists(
+ conn = conn,
+ resultSchema = resultSchema,
+ targetDialect = targetDialect,
+ tableName = paste0(tablePrefix, "covariate_summary"),
+ resultIdName = "performance_id",
+ resultId = performanceId,
+ tempEmulationSchema = tempEmulationSchema
+ )
+
+ if (isValid && (!exists || overWriteIfExists)) {
# REMOVE existing result
- if(exists){
- ParallelLogger::logTrace('Removing existing covariateSummary')
+ if (exists) {
+ ParallelLogger::logTrace("Removing existing covariateSummary")
sql <- "delete from @result_schema.@table_name where performance_id = @performance_id;"
- sql <- SqlRender::render(sql,
- result_schema = resultSchema,
- performance_id = performanceId,
- table_name = paste0(tablePrefix,'covariate_summary'))
- sql <- SqlRender::translate(sql,
- targetDialect = targetDialect,
- tempEmulationSchema = tempEmulationSchema)
+ sql <- SqlRender::render(sql,
+ result_schema = resultSchema,
+ performance_id = performanceId,
+ table_name = paste0(tablePrefix, "covariate_summary")
+ )
+ sql <- SqlRender::translate(sql,
+ targetDialect = targetDialect,
+ tempEmulationSchema = tempEmulationSchema
+ )
DatabaseConnector::executeSql(conn, sql)
}
-
- # add
- ParallelLogger::logInfo(paste0('Inserting covariateSummary for result ', performanceId))
- DatabaseConnector::insertTable(connection = conn,
- databaseSchema = resultSchema,
- tableName = paste0(tablePrefix,'covariate_summary'),
- data = value[,columnNames],
- dropTableIfExists = F, createTable = F, tempTable = F,
- bulkLoad = F, camelCaseToSnakeCase = T, progressBar = T,
- tempEmulationSchema = tempEmulationSchema)
+
+ # add
+ ParallelLogger::logInfo(paste0("Inserting covariateSummary for result ", performanceId))
+ DatabaseConnector::insertTable(
+ connection = conn,
+ databaseSchema = resultSchema,
+ tableName = paste0(tablePrefix, "covariate_summary"),
+ data = value[, columnNames],
+ dropTableIfExists = FALSE, createTable = FALSE, tempTable = FALSE,
+ bulkLoad = FALSE, camelCaseToSnakeCase = TRUE, progressBar = TRUE,
+ tempEmulationSchema = tempEmulationSchema
+ )
}
-
+
return(invisible(NULL))
}
-#====================
+# ====================
# Helpers
-#====================
+# ====================
# gets the column names in camelCase of a table
-getColumnNames <- function(conn, resultSchema, targetDialect, tableName, tablePrefix = '',
- tempEmulationSchema = getOption("sqlRenderTempEmulationSchema")
-){
+getColumnNames <- function(conn, resultSchema, targetDialect, tableName, tablePrefix = "",
+ tempEmulationSchema = getOption("sqlRenderTempEmulationSchema")) {
sql <- "select top 1 * from @my_schema.@string_to_append@table;"
- sql <- SqlRender::render(sql,
- my_schema = resultSchema,
- table = tableName,
- string_to_append = tablePrefix)
- sql <- SqlRender::translate(sql, targetDialect = targetDialect,
- tempEmulationSchema = tempEmulationSchema)
- result <- DatabaseConnector::querySql(connection = conn, sql = sql, snakeCaseToCamelCase = T)
-
+ sql <- SqlRender::render(sql,
+ my_schema = resultSchema,
+ table = tableName,
+ string_to_append = tablePrefix
+ )
+ sql <- SqlRender::translate(sql,
+ targetDialect = targetDialect,
+ tempEmulationSchema = tempEmulationSchema
+ )
+ result <- DatabaseConnector::querySql(connection = conn, sql = sql, snakeCaseToCamelCase = TRUE)
+
return(colnames(result))
}
-# True/False check whether results exist in table
+# True/False check whether results exist in table
checkResultExists <- function(conn, resultSchema, targetDialect,
snakeCaseToCamelCase,
tableName,
- resultIdName = 'performance_id',
+ resultIdName = "performance_id",
resultId,
- tempEmulationSchema = getOption("sqlRenderTempEmulationSchema")
-){
-
+ tempEmulationSchema = getOption("sqlRenderTempEmulationSchema")) {
sql <- "select * from @my_schema.@table where @result_id_name = @result_id;"
- sql <- SqlRender::render(sql,
- my_schema = resultSchema,
- table = tableName,
- result_id_name = resultIdName,
- result_id = resultId)
- sql <- SqlRender::translate(sql, targetDialect = targetDialect,
- tempEmulationSchema = tempEmulationSchema)
- result <- DatabaseConnector::querySql(connection = conn, sql = sql, snakeCaseToCamelCase = T)
- return(nrow(result)>0)
-}
\ No newline at end of file
+ sql <- SqlRender::render(sql,
+ my_schema = resultSchema,
+ table = tableName,
+ result_id_name = resultIdName,
+ result_id = resultId
+ )
+ sql <- SqlRender::translate(sql,
+ targetDialect = targetDialect,
+ tempEmulationSchema = tempEmulationSchema
+ )
+ result <- DatabaseConnector::querySql(connection = conn, sql = sql, snakeCaseToCamelCase = TRUE)
+ return(nrow(result) > 0)
+}
+
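
Every add* helper in this file shares one write path: read the target table's columns with getColumnNames(), verify the data covers them, test for existing rows with checkResultExists(), delete those rows when overWriteIfExists = TRUE, and bulk insert the rest. A condensed sketch of that pattern (upsertByPerformanceId is a hypothetical name; the calls mirror the ones above, with tempEmulationSchema omitted for brevity):

upsertByPerformanceId <- function(conn, schema, table, performanceId, data,
                                  targetDialect) {
  # remove any earlier result for this performance id
  sql <- SqlRender::render(
    "delete from @schema.@table where performance_id = @id;",
    schema = schema, table = table, id = performanceId
  )
  sql <- SqlRender::translate(sql, targetDialect = targetDialect)
  DatabaseConnector::executeSql(conn, sql)
  # append the fresh rows to the existing result table
  DatabaseConnector::insertTable(
    connection = conn, databaseSchema = schema, tableName = table,
    data = data, dropTableIfExists = FALSE, createTable = FALSE,
    camelCaseToSnakeCase = TRUE
  )
}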
diff --git a/README.md b/README.md
index 9587cfa6f..567ba37c8 100644
--- a/README.md
+++ b/README.md
@@ -18,11 +18,11 @@ Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek PR. [Design and implementati
The figure below illustrates the prediction problem we address. Among a population at risk, we aim to predict which patients at a defined moment in time (t = 0) will experience some outcome during a time-at-risk. Prediction is done using only information about the patients in an observation window prior to that moment in time.
-![](vignettes/Figure1.webp)
+![](vignettes/images/Figure1.avif)
To define a prediction problem we have to define t=0 by a Target Cohort (T), the outcome we would like to predict by an outcome cohort (O), and the time-at-risk (TAR). Furthermore, we have to make design choices for the model we would like to develop, and determine the observational datasets to perform internal and external validation. This conceptual framework works for all types of prediction problems, for example those presented below (T=green, O=red).
-![](vignettes/problems.webp)
+![](vignettes/images/problems.avif)
Features
========
@@ -51,11 +51,11 @@ Screenshots
-
+
-
+
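
The README prose above maps one-to-one onto package settings: T and O are cohort ids, and the TAR becomes a risk window. A minimal sketch (assuming the signature of createStudyPopulationSettings(), which appears in the reference index below), predicting the outcome during days 1 to 365 after entry into the target cohort:

populationSettings <- PatientLevelPrediction::createStudyPopulationSettings(
  riskWindowStart = 1, # TAR starts one day after t = 0
  startAnchor = "cohort start",
  riskWindowEnd = 365, # TAR ends one year after t = 0
  endAnchor = "cohort start",
  requireTimeAtRisk = TRUE, # require subjects to be observed during the TAR
  removeSubjectsWithPriorOutcome = TRUE
)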
diff --git a/_pkgdown.yml b/_pkgdown.yml
index 5e7e42c5c..447991ad2 100644
--- a/_pkgdown.yml
+++ b/_pkgdown.yml
@@ -1,6 +1,8 @@
template:
+ bootstrap: 5
params:
bootswatch: cosmo
+ light-switch: true
development:
mode: auto
@@ -16,10 +18,8 @@ navbar:
left:
- home
- intro
- - videos
- reference
- articles
- - tutorial
- benchmarks
- predictors
- bestpractice
@@ -81,6 +81,7 @@ reference:
contents:
- createStudyPopulationSettings
- createDefaultSplitSetting
+ - createExistingSplitSettings
- createSampleSettings
- createFeatureEngineeringSettings
- createPreprocessSettings
@@ -114,7 +115,6 @@ reference:
- setAdaBoost
- setDecisionTree
- setGradientBoostingMachine
- - setKNN
- setLassoLogisticRegression
- setMLP
- setNaiveBayes
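
With setKNN dropped from the index, the split-settings entries gain createExistingSplitSettings next to the default splitter. A minimal sketch (assuming the documented defaults) of the default splitter kept in the list:

splitSettings <- PatientLevelPrediction::createDefaultSplitSetting(
  testFraction = 0.25, # hold out 25% of subjects for testing
  trainFraction = 0.75,
  splitSeed = 42, # fix the seed so the split is reproducible
  nfold = 3, # 3-fold cross-validation within the training set
  type = "stratified" # keep the outcome rate similar across folds
)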
diff --git a/extras/PackageMaintenance.R b/extras/PackageMaintenance.R
index ba1ee177c..cfdd6442d 100644
--- a/extras/PackageMaintenance.R
+++ b/extras/PackageMaintenance.R
@@ -24,7 +24,6 @@ OhdsiRTools::fixHadesLogo()
OhdsiRTools::formatRFolder()
OhdsiRTools::checkUsagePackage("PatientLevelPrediction")
OhdsiRTools::updateCopyrightYearFolder()
-devtools::spell_check()
# Create manual and vignettes
unlink("extras/PatientLevelPrediction.pdf")
diff --git a/inst/doc/AddingCustomFeatureEngineering.pdf b/inst/doc/AddingCustomFeatureEngineering.pdf
deleted file mode 100644
index e4e8220ce..000000000
Binary files a/inst/doc/AddingCustomFeatureEngineering.pdf and /dev/null differ
diff --git a/inst/doc/AddingCustomModels.pdf b/inst/doc/AddingCustomModels.pdf
deleted file mode 100644
index f58d39dbb..000000000
Binary files a/inst/doc/AddingCustomModels.pdf and /dev/null differ
diff --git a/inst/doc/AddingCustomSamples.pdf b/inst/doc/AddingCustomSamples.pdf
deleted file mode 100644
index e85e0baaf..000000000
Binary files a/inst/doc/AddingCustomSamples.pdf and /dev/null differ
diff --git a/inst/doc/AddingCustomSplitting.pdf b/inst/doc/AddingCustomSplitting.pdf
deleted file mode 100644
index 6bc846f51..000000000
Binary files a/inst/doc/AddingCustomSplitting.pdf and /dev/null differ
diff --git a/inst/doc/BuildingMultiplePredictiveModels.pdf b/inst/doc/BuildingMultiplePredictiveModels.pdf
deleted file mode 100644
index 31c3cb98a..000000000
Binary files a/inst/doc/BuildingMultiplePredictiveModels.pdf and /dev/null differ
diff --git a/inst/doc/BuildingPredictiveModels.pdf b/inst/doc/BuildingPredictiveModels.pdf
deleted file mode 100644
index fd77a14bc..000000000
Binary files a/inst/doc/BuildingPredictiveModels.pdf and /dev/null differ
diff --git a/inst/doc/BuildingPredictiveModels.tex b/inst/doc/BuildingPredictiveModels.tex
deleted file mode 100644
index ae3ce9ba2..000000000
--- a/inst/doc/BuildingPredictiveModels.tex
+++ /dev/null
@@ -1,2473 +0,0 @@
-% Options for packages loaded elsewhere
-\PassOptionsToPackage{unicode}{hyperref}
-\PassOptionsToPackage{hyphens}{url}
-%
-\documentclass[
-]{article}
-\usepackage{amsmath,amssymb}
-\usepackage{iftex}
-\ifPDFTeX
- \usepackage[T1]{fontenc}
- \usepackage[utf8]{inputenc}
- \usepackage{textcomp} % provide euro and other symbols
-\else % if luatex or xetex
- \usepackage{unicode-math} % this also loads fontspec
- \defaultfontfeatures{Scale=MatchLowercase}
- \defaultfontfeatures[\rmfamily]{Ligatures=TeX,Scale=1}
-\fi
-\usepackage{lmodern}
-\ifPDFTeX\else
- % xetex/luatex font selection
-\fi
-% Use upquote if available, for straight quotes in verbatim environments
-\IfFileExists{upquote.sty}{\usepackage{upquote}}{}
-\IfFileExists{microtype.sty}{% use microtype if available
- \usepackage[]{microtype}
- \UseMicrotypeSet[protrusion]{basicmath} % disable protrusion for tt fonts
-}{}
-\makeatletter
-\@ifundefined{KOMAClassName}{% if non-KOMA class
- \IfFileExists{parskip.sty}{%
- \usepackage{parskip}
- }{% else
- \setlength{\parindent}{0pt}
- \setlength{\parskip}{6pt plus 2pt minus 1pt}}
-}{% if KOMA class
- \KOMAoptions{parskip=half}}
-\makeatother
-\usepackage{xcolor}
-\usepackage[margin=1in]{geometry}
-\usepackage{color}
-\usepackage{fancyvrb}
-\newcommand{\VerbBar}{|}
-\newcommand{\VERB}{\Verb[commandchars=\\\{\}]}
-\DefineVerbatimEnvironment{Highlighting}{Verbatim}{commandchars=\\\{\}}
-% Add ',fontsize=\small' for more characters per line
-\usepackage{framed}
-\definecolor{shadecolor}{RGB}{248,248,248}
-\newenvironment{Shaded}{\begin{snugshade}}{\end{snugshade}}
-\newcommand{\AlertTok}[1]{\textcolor[rgb]{0.94,0.16,0.16}{#1}}
-\newcommand{\AnnotationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}}
-\newcommand{\AttributeTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{#1}}
-\newcommand{\BaseNTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}}
-\newcommand{\BuiltInTok}[1]{#1}
-\newcommand{\CharTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}}
-\newcommand{\CommentTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textit{#1}}}
-\newcommand{\CommentVarTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}}
-\newcommand{\ConstantTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{#1}}
-\newcommand{\ControlFlowTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{\textbf{#1}}}
-\newcommand{\DataTypeTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{#1}}
-\newcommand{\DecValTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}}
-\newcommand{\DocumentationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}}
-\newcommand{\ErrorTok}[1]{\textcolor[rgb]{0.64,0.00,0.00}{\textbf{#1}}}
-\newcommand{\ExtensionTok}[1]{#1}
-\newcommand{\FloatTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}}
-\newcommand{\FunctionTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{\textbf{#1}}}
-\newcommand{\ImportTok}[1]{#1}
-\newcommand{\InformationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}}
-\newcommand{\KeywordTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{\textbf{#1}}}
-\newcommand{\NormalTok}[1]{#1}
-\newcommand{\OperatorTok}[1]{\textcolor[rgb]{0.81,0.36,0.00}{\textbf{#1}}}
-\newcommand{\OtherTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{#1}}
-\newcommand{\PreprocessorTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textit{#1}}}
-\newcommand{\RegionMarkerTok}[1]{#1}
-\newcommand{\SpecialCharTok}[1]{\textcolor[rgb]{0.81,0.36,0.00}{\textbf{#1}}}
-\newcommand{\SpecialStringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}}
-\newcommand{\StringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}}
-\newcommand{\VariableTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}}
-\newcommand{\VerbatimStringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}}
-\newcommand{\WarningTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}}
-\usepackage{longtable,booktabs,array}
-\usepackage{calc} % for calculating minipage widths
-% Correct order of tables after \paragraph or \subparagraph
-\usepackage{etoolbox}
-\makeatletter
-\patchcmd\longtable{\par}{\if@noskipsec\mbox{}\fi\par}{}{}
-\makeatother
-% Allow footnotes in longtable head/foot
-\IfFileExists{footnotehyper.sty}{\usepackage{footnotehyper}}{\usepackage{footnote}}
-\makesavenoteenv{longtable}
-\usepackage{graphicx}
-\makeatletter
-\def\maxwidth{\ifdim\Gin@nat@width>\linewidth\linewidth\else\Gin@nat@width\fi}
-\def\maxheight{\ifdim\Gin@nat@height>\textheight\textheight\else\Gin@nat@height\fi}
-\makeatother
-% Scale images if necessary, so that they will not overflow the page
-% margins by default, and it is still possible to overwrite the defaults
-% using explicit options in \includegraphics[width, height, ...]{}
-\setkeys{Gin}{width=\maxwidth,height=\maxheight,keepaspectratio}
-% Set default figure placement to htbp
-\makeatletter
-\def\fps@figure{htbp}
-\makeatother
-\setlength{\emergencystretch}{3em} % prevent overfull lines
-\providecommand{\tightlist}{%
- \setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}}
-\setcounter{secnumdepth}{5}
-\usepackage{fancyhdr}
-\pagestyle{fancy}
-\fancyhead{}
-\fancyhead[CO,CE]{Installation Guide}
-\fancyfoot[CO,CE]{PatientLevelPrediction Package Version 6.3.7.9999}
-\fancyfoot[LE,RO]{\thepage}
-\renewcommand{\headrulewidth}{0.4pt}
-\renewcommand{\footrulewidth}{0.4pt}
-\ifLuaTeX
- \usepackage{selnolig} % disable illegal ligatures
-\fi
-\IfFileExists{bookmark.sty}{\usepackage{bookmark}}{\usepackage{hyperref}}
-\IfFileExists{xurl.sty}{\usepackage{xurl}}{} % add URL line breaks if available
-\urlstyle{same}
-\hypersetup{
- pdftitle={Building patient-level predictive models},
- pdfauthor={Jenna Reps, Martijn J. Schuemie, Patrick B. Ryan, Peter R. Rijnbeek},
- hidelinks,
- pdfcreator={LaTeX via pandoc}}
-
-\title{Building patient-level predictive models}
-\author{Jenna Reps, Martijn J. Schuemie, Patrick B. Ryan, Peter R.
-Rijnbeek}
-\date{2024-04-26}
-
-\begin{document}
-\maketitle
-
-{
-\setcounter{tocdepth}{3}
-\tableofcontents
-}
-\hypertarget{introduction}{%
-\section{Introduction}\label{introduction}}
-
-Observational healthcare data, such as administrative claims and
-electronic health records, are increasingly used for clinical
-characterization of disease progression, quality improvement, and
-population-level effect estimation for medical product safety
-surveillance and comparative effectiveness. Advances in machine learning
-for large dataset analysis have led to increased interest in applying
-patient-level prediction to this type of data. Patient-level prediction
-offers the potential for medical practice to move beyond average
-treatment effects and to consider personalized risks as part of clinical
-decision-making. However, many published efforts in
-patient-level prediction do not follow model development guidelines,
-fail to perform extensive external validation, or provide insufficient
-model details, which limits the ability of independent researchers to
-reproduce the models and perform external validation. This makes it hard
-to fairly evaluate the predictive performance of the models and reduces
-the likelihood of the models being used appropriately in clinical
-practice. To improve standards, several papers have been written
-detailing guidelines for best practices in developing and reporting
-prediction models.
-
-The Transparent Reporting of a multivariable prediction model for
-\href{https://www.equator-network.org/reporting-guidelines/tripod-statement/}{\texttt{Individual\ Prognosis\ Or\ Diagnosis\ (TRIPOD)\ statement}}
-provides clear recommendations for reporting prediction model
-development and validation and addresses some of the concerns related to
-transparency. However, data structure heterogeneity and inconsistent
-terminologies still make collaboration and model sharing difficult as
-different researchers are often required to write new code to extract
-the data from their databases and may define variables differently.
-
-In our
-\href{https://academic.oup.com/jamia/article/25/8/969/4989437}{\texttt{paper}},
-we propose a standardised framework for patient-level prediction that
-utilizes the OMOP Common Data Model (CDM) and standardized vocabularies,
-and describe the open-source software that we developed implementing the
-framework's pipeline. The framework is the first to support existing
-best practice guidelines and will enable open dissemination of models
-that can be extensively validated across the network of OHDSI
-collaborators.
-
-Figure 1 illustrates the prediction problem we address. Among a
-population at risk, we aim to predict which patients at a defined moment
-in time (t = 0) will experience some outcome during a time-at-risk.
-Prediction is done using only information about the patients in an
-observation window prior to that moment in time.
-
-\begin{figure}
-\centering
-\includegraphics{Figure1.webp}
-\caption{The prediction problem}
-\end{figure}
-
-As shown in Figure 2, to define a prediction problem we have to define
-t=0 by a Target Cohort (T), the outcome we would like to predict by an
-outcome cohort (O), and the time-at-risk (TAR). Furthermore, we have to
-make design choices for the model we would like to develop, and
-determine the observational datasets to perform internal and external
-validation on. This conceptual framework works for all types of
-prediction problems, for example those presented in Figure 3.
-
-\begin{figure}
-\centering
-\includegraphics{studydesign.webp}
-\caption{Design choices}
-\end{figure}
-
-\begin{figure}
-\centering
-\includegraphics{problems.webp}
-\caption{Examples of prediction problems}
-\end{figure}
-
-This vignette describes how you can use the
-\texttt{PatientLevelPrediction} package to build patient-level
-predictive models. The package enables data extraction, model building,
-and model evaluation using data from databases that are translated into
-the OMOP CDM. In this vignette we assume you have installed the package
-correctly using the
-\href{https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/InstallationGuide.pdf}{\texttt{InstallationGuide}}.
-
-\hypertarget{study-specification}{%
-\section{Study specification}\label{study-specification}}
-
-We have to clearly specify our study upfront to be able to implement it.
-This means we need to define the prediction problem we would like to
-address, in which population we will build the model, which model we
-will build and how we will evaluate its performance. To guide you
-through this process we will use a ``Disease onset and progression''
-prediction type as an example.
-
-\hypertarget{problem-definition-1-stroke-in-afibrilation-patients}{%
-\subsection{Problem definition 1: Stroke in atrial fibrillation
-patients}\label{problem-definition-1-stroke-in-afibrilation-patients}}
-
-Atrial fibrillation is a disease characterized by an irregular heart
-rate that can cause poor blood flow. Patients with atrial fibrillation
-are at increased risk of ischemic stroke. Anticoagulation is a
-recommended prophylaxis treatment strategy for patients at high risk of
-stroke, though the underuse of anticoagulants and persistent severity of
-ischemic stroke represents a substantial unmet medical need. Various
-strategies have been developed to predict risk of ischemic stroke in
-patients with atrial fibrillation. CHADS2 (Gage JAMA 2001) was developed
-as a risk score based on history of congestive heart failure,
-hypertension, age\textgreater=75, diabetes and stroke. CHADS2 was
-initially derived using Medicare claims data, where it achieved good
-discrimination (AUC=0.82). However, subsequent external validation
-studies revealed that CHADS2 had substantially lower predictive accuracy
-(Keogh Thromb Haemost 2011). Subsequent stroke risk calculators have
-been developed and evaluated, including the CHADS2Vasc extension. The
-management of atrial fibrillation has evolved substantially over the
-last decade, for various reasons that include the introduction of novel
-oral anticoagulants. With these innovations has come a renewed interest
-in greater precision medicine for stroke prevention.
-
-We will apply the PatientLevelPrediction package to observational
-healthcare data to address the following patient-level prediction
-question:
-
-Amongst patients who are newly diagnosed with Atrial Fibrillation, which
-patients will go on to have Ischemic Stroke within 1 year?
-
-We will define `patients who are newly diagnosed with Atrial
-Fibrillation' as the first condition record of cardiac arrhythmia, which
-is followed by another cardiac arrhythmia condition record, at least two
-drug records for a drug used to treat arrhythmias, or a procedure to
-treat arrhythmias. We will define `Ischemic stroke events' as ischemic
-stroke condition records during an inpatient or ER visit; successive
-records with \textgreater{} 180 day gap are considered independent
-episodes.
-
-\hypertarget{problem-definition-2-angioedema-in-ace-inhibitor-users}{%
-\subsection{Problem definition 2: Angioedema in ACE inhibitor
-users}\label{problem-definition-2-angioedema-in-ace-inhibitor-users}}
-
-Angiotensin converting enzyme inhibitors (ACE inhibitors) are
-medications used by patients with hypertension that widen the blood
-vessels, thereby increasing the amount of blood pumped by the heart
-and decreasing blood pressure. ACE inhibitors reduce a patient's risk of
-cardiovascular disease but can lead to drug-induced angioedema.
-
-We will apply the PatientLevelPrediction package to observational
-healthcare data to address the following patient-level prediction
-question:
-
-Amongst patients who are newly dispensed an ACE inhibitor, which
-patients will go on to have angioedema within 1 year?
-
-We will define `patients who are newly dispensed an ACE inhibitor' as
-the first drug record of any ACE inhibitor. We will define `angioedema'
-as an angioedema condition record during an inpatient or ER visit.
-
-\hypertarget{study-population-definition}{%
-\subsection{Study population
-definition}\label{study-population-definition}}
-
-The final study population in which we will develop our model is often a
-subset of the target population, because we may, for example, apply
-criteria that depend on T and O, or we may want to perform sensitivity
-analyses on subpopulations of T. For this we have to answer the
-following questions:
-
-\begin{itemize}
-\item
-  \emph{What is the minimum amount of observation time we require before
-  the start of the target cohort?} This choice could depend on the
-  available patient time in your training data, but also on the time you
-  expect to be available in the data sources you want to apply the model
-  to in the future. The longer the minimum observation time, the more
-  baseline history is available for each person for feature extraction,
-  but the fewer patients will qualify for analysis. Moreover, there
-  could be clinical reasons to choose a shorter or longer lookback
-  period. For our example, we will use a 1095-day prior history as the
-  lookback (washout) period.
-\item
-  \emph{Can patients enter the target cohort multiple times?} In the
-  target cohort definition, a person may qualify for the cohort multiple
-  times during different spans of time, for example if they had
-  different episodes of a disease or separate periods of exposure to a
-  medical product. The cohort definition does not necessarily apply a
-  restriction to only let the patients enter once, but in the context of
-  a particular patient-level prediction problem, a user may want to
-  restrict the cohort to the first qualifying episode. In our example, a
-  person could only enter the target cohort once since our criterion was
-  based on the first occurrence of atrial fibrillation.
-\item
-  \emph{Do we allow persons to enter the cohort if they experienced the
-  outcome before?} Do we allow persons to enter the target cohort if
-  they experienced the outcome before qualifying for the target cohort?
-  Depending on the particular patient-level prediction problem, there
-  may be a desire to predict `incident' first occurrence of an outcome,
-  in which case patients who have previously experienced the outcome are
-  not `at-risk' for having a first occurrence and therefore should be
-  excluded from the target cohort. In other circumstances, there may be
-  a desire to predict `prevalent' episodes, whereby patients with prior
-  outcomes can be included in the analysis and the prior outcome itself
-  can be a predictor of future outcomes. For our prediction example, the
-  answer to this question is `Yes, allow persons with prior outcomes',
-  because we know from the CHADS2 score that prior strokes are very
-  predictive of future strokes. If the answer had been `No', we would
-  also have to decide how long to look back for previous occurrences of
-  the outcome.
-\item
- \emph{How do we define the period in which we will predict our outcome
- relative to the target cohort start?} We actually have to make two
- decisions to answer that question. First, does the time-at-risk window
- start at the date of the start of the target cohort or later?
- Arguments to make it start later could be that you want to avoid
- outcomes that were entered late in the record that actually occurred
- before the start of the target cohort or you want to leave a gap where
- interventions to prevent the outcome could theoretically be
- implemented. Second, you need to define the time-at-risk by setting
- the risk window end, as some specification of days offset relative to
- the target cohort start or end dates. For our problem we will predict
- in a `time-at-risk' window starting 1 day after the start of the
- target cohort up to 365 days later (to look for 1-year risk following
- atrial fibrillation diagnosis).
-\item
-  \emph{Do we require a minimum amount of time-at-risk?} We have to
-  decide if we want to include patients that did not experience the
-  outcome but left the database before the end of our time-at-risk
-  period. These patients may experience the outcome when we do not
-  observe them. For our prediction problem we decide to answer this
-  question with `Yes, require a minimum time-at-risk' for that reason.
-  Furthermore, we have to decide whether this constraint also applies
-  to persons who experienced the outcome, or whether we include all
-  persons with the outcome irrespective of their total time at risk.
-  For example, if the outcome is death, then persons with the outcome
-  are likely censored before the full time-at-risk period is complete.
-  (These answers map onto the population settings sketched below.)
-\end{itemize}
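-
-As a preview, these answers translate directly into the
-\texttt{createStudyPopulationSettings()} arguments used later in this
-vignette; a minimal sketch for the stroke example (the full call is
-shown in the implementation section):
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{ populationSettings }\OtherTok{\textless{}{-}} \FunctionTok{createStudyPopulationSettings}\NormalTok{(}
-   \AttributeTok{washoutPeriod =} \DecValTok{1095}\NormalTok{,                      }\CommentTok{\# minimum prior observation}
-   \AttributeTok{removeSubjectsWithPriorOutcome =} \ConstantTok{FALSE}\NormalTok{,  }\CommentTok{\# allow prior outcomes}
-   \AttributeTok{riskWindowStart =} \DecValTok{1}\NormalTok{,                       }\CommentTok{\# TAR starts 1 day after index}
-   \AttributeTok{riskWindowEnd =} \DecValTok{365}\NormalTok{,                      }\CommentTok{\# TAR ends 365 days after index}
-   \AttributeTok{requireTimeAtRisk =} \ConstantTok{TRUE}\NormalTok{,                 }\CommentTok{\# require a minimum time at risk}
-   \AttributeTok{minTimeAtRisk =} \DecValTok{364}
-\NormalTok{ )}
-\end{Highlighting}
-\end{Shaded}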
-
-\hypertarget{model-development-settings}{%
-\subsection{Model development
-settings}\label{model-development-settings}}
-
-To develop the model we have to decide which algorithm(s) we would like
-to train. We see the selection of the best algorithm for a certain
-prediction problem as an empirical question, i.e.~you need to let the
-data speak for itself and try different approaches to find the best one.
-There is no algorithm that will work best for all problems (no free
-lunch). In our package we therefore aim to implement many algorithms.
-Furthermore, we made the system modular so you can add your own custom
-algorithms as described in more detail in the
-\href{https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/AddingCustomModels.pdf}{\texttt{AddingCustomModels}}
-vignette.
-
-Our package currently contains the following algorithms to choose from:
-
-\begin{longtable}[]{@{}
- >{\raggedright\arraybackslash}p{(\columnwidth - 4\tabcolsep) * \real{0.1190}}
- >{\raggedright\arraybackslash}p{(\columnwidth - 4\tabcolsep) * \real{0.6071}}
- >{\raggedright\arraybackslash}p{(\columnwidth - 4\tabcolsep) * \real{0.2738}}@{}}
-\toprule\noalign{}
-\begin{minipage}[b]{\linewidth}\raggedright
-Algorithm
-\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
-Description
-\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
-Hyper-parameters
-\end{minipage} \\
-\midrule\noalign{}
-\endhead
-\bottomrule\noalign{}
-\endlastfoot
-Regularized Logistic Regression & Lasso logistic regression belongs to
-the family of generalized linear models, where a linear combination of
-the variables is learned and finally a logistic function maps the linear
-combination to a value between 0 and 1. The lasso regularization adds a
-cost based on model complexity to the objective function when training
-the model. This cost is the sum of the absolute values of the linear
-combination of the coefficients. The model automatically performs
-feature selection by minimizing this cost. We use the Cyclic coordinate
-descent for logistic, Poisson and survival analysis (Cyclops) package to
-perform large-scale regularized logistic regression:
-\url{https://github.com/OHDSI/Cyclops} & var (starting variance),
-seed \\
-Gradient boosting machines & Gradient boosting machines are a boosting
-ensemble technique that, in our framework, combines multiple decision
-trees. Boosting works by iteratively adding decision trees but gives
-more weight, in the cost function used to train the next tree, to the
-data-points that were misclassified by prior decision trees. We use
-Extreme Gradient Boosting, an efficient implementation of the gradient
-boosting framework provided by the xgboost R package available from
-CRAN. & ntree (number of trees), max depth (max levels in tree), min
-rows (minimum data points in a node), learning rate, balance (balance
-class labels), seed \\
-Random forest & Random forest is a bagging ensemble technique that
-combines multiple decision trees. The idea behind bagging is to reduce
-the likelihood of overfitting by using weak classifiers, but combining
-multiple diverse weak classifiers into a strong classifier. Random
-forest accomplishes this by training multiple decision trees but only
-using a subset of the variables in each tree, where the subset of
-variables differs between trees. Our package uses the scikit-learn
-implementation of random forest in Python. & mtry (number of features in
-each tree), ntree (number of trees), maxDepth (max levels in tree),
-minRows (minimum data points in a node), balance (balance class labels),
-seed \\
-K-nearest neighbors & K-nearest neighbors (KNN) is an algorithm that
-uses some metric to find the K closest labelled data-points to a new
-unlabelled data-point. The prediction for the new data-point is then the
-most prevalent class among the K-nearest labelled data-points. KNN has a
-sharing limitation: the model requires labelled data to perform the
-prediction on new data, and it is often not possible to share this data
-across data sites. We included the BigKnn classifier developed in OHDSI,
-which is a large scale k-nearest neighbor classifier using the Lucene
-search engine: \url{https://github.com/OHDSI/BigKnn} & k (number of
-neighbours), weighted (weight by inverse frequency) \\
-Naive Bayes & The Naive Bayes algorithm applies Bayes' theorem with the
-`naive' assumption of conditional independence between every pair of
-features given the value of the class variable. Based on the likelihood
-that the data belong to a class and the prior distribution of the class,
-a posterior distribution is obtained. & none \\
-AdaBoost & AdaBoost is a boosting ensemble technique. Boosting works by
-iteratively adding classifiers but adds more weight to the data-points
-that are misclassified by prior classifiers in the cost function when
-training the next classifier. We use the sklearn `AdaboostClassifier'
-implementation in Python. & nEstimators (the maximum number of
-estimators at which boosting is terminated), learningRate (learning rate
-shrinks the contribution of each classifier by learning\_rate. There is
-a trade-off between learningRate and nEstimators) \\
-Decision Tree & A decision tree is a classifier that partitions the
-variable space using individual tests selected using a greedy approach.
-It aims to find partitions that have the highest information gain to
-separate the classes. The decision tree can easily overfit by enabling a
-large number of partitions (tree depth) and often needs some
-regularization (e.g., pruning or specifying hyper-parameters that limit
-the complexity of the model). We use the sklearn
-`DecisionTreeClassifier' implementation in Python. & maxDepth (the
-maximum depth of the tree), minSamplesSplit, minSamplesLeaf,
-minImpuritySplit (threshold for early stopping in tree growth; a node
-will split if its impurity is above the threshold, otherwise it is a
-leaf), seed, classWeight (`Balance' or `None') \\
-Multilayer Perceptron & Neural networks contain multiple layers that
-weight their inputs using a non-linear function. The first layer is the
-input layer, the last layer is the output layer, and in between are the
-hidden layers. Neural networks are generally trained using feed-forward
-back-propagation. This is when you go through the network with a
-data-point and calculate the error between the true label and predicted
-label, then go backwards through the network and update the linear
-function weights based on the error. This can also be performed in
-batches, where multiple data-points are fed through the network before
-the weights are updated. & size (the number of hidden
-nodes), alpha (the l2 regularisation), seed \\
-Deep Learning (now in the separate DeepPatientLevelPrediction R package)
-& Deep learning models such as deep nets, convolutional neural networks
-or recurrent neural networks are similar to a neural network but have
-multiple hidden layers that aim to learn latent representations useful
-for prediction. In the separate BuildingDeepLearningModels vignette we
-describe these models and their hyper-parameters in more detail. & see
-OHDSI/DeepPatientLevelPrediction \\
-\end{longtable}
-
-Furthermore, we have to decide on the \textbf{covariates} that we will
-use to train our model. This choice can be driven by domain knowledge or
-by the available computational resources. In our example, we would like
-to add gender, age, conditions, drug groups, and visit count. We also
-have to specify the time windows in which we will look, and we decide to
-look in the year before and any time prior.
-
-Finally, we have to define how we will train and test our model on our
-data, i.e.~how we perform \textbf{internal validation}. For this we have
-to decide how we divide our dataset into a training and a testing
-dataset and how we randomly assign patients to these two sets. Depending
-on the size of the training set we can decide how much data we would
-like to use for training; typically this is a 75\%-25\% split. If you
-have very large datasets you can use more data for training. To randomly
-assign patients to the training and testing set, there are two commonly
-used approaches:
-
-\begin{enumerate}
-\def\labelenumi{\arabic{enumi}.}
-\tightlist
-\item
-  split by person. In this case a random seed is used to assign each
-  patient to one of the two sets.
-\item
-  split by time. In this case a time point is used to split the persons,
-  e.g.~75\% of the data is before and 25\% is after this date. The
-  advantage of this is that you take into consideration that the health
-  care system has changed over time (see the sketch below).
-\end{enumerate}
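-
-For example, a time-based split could be specified through the package's
-\texttt{createDefaultSplitSetting} function (a minimal sketch; the
-stratified person split used for our examples is shown later in this
-vignette):
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{ splitSettings }\OtherTok{\textless{}{-}} \FunctionTok{createDefaultSplitSetting}\NormalTok{(}
-   \AttributeTok{trainFraction =} \FloatTok{0.75}\NormalTok{,}
-   \AttributeTok{testFraction =} \FloatTok{0.25}\NormalTok{,}
-   \AttributeTok{type =} \StringTok{\textquotesingle{}time\textquotesingle{}}  \CommentTok{\# split on calendar time instead of by person}
-\NormalTok{ )}
-\end{Highlighting}
-\end{Shaded}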
-
-We have now completely defined our studies and can implement them:
-
-\begin{itemize}
-\tightlist
-\item
-  \protect\hyperlink{example1}{See example 1: Stroke in atrial
-  fibrillation patients}
-\item
-  \protect\hyperlink{example2}{See example 2: Angioedema in ACE
-  inhibitor new users}
-\end{itemize}
-
-\hypertarget{example1}{%
-\section{Example 1: Stroke in atrial fibrillation
-patients}\label{example1}}
-
-\hypertarget{study-specification-1}{%
-\subsection{Study Specification}\label{study-specification-1}}
-
-For our first prediction model we decide to start with a Regularized
-Logistic Regression and will use the default parameters. We will do a
-75\%-25\% split by person.
-
-\begin{longtable}[]{@{}
- >{\raggedright\arraybackslash}p{(\columnwidth - 2\tabcolsep) * \real{0.2361}}
- >{\raggedright\arraybackslash}p{(\columnwidth - 2\tabcolsep) * \real{0.7639}}@{}}
-\toprule\noalign{}
-\begin{minipage}[b]{\linewidth}\raggedright
-Definition
-\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
-Value
-\end{minipage} \\
-\midrule\noalign{}
-\endhead
-\bottomrule\noalign{}
-\endlastfoot
-\textbf{Problem Definition} & \\
-Target Cohort (T) & `Patients who are newly diagnosed with Atrial
-Fibrillation' defined as the first condition record of cardiac
-arrhythmia, which is followed by another cardiac arrhythmia condition
-record, at least two drug records for a drug used to treat arrhythmias,
-or a procedure to treat arrhythmias. \\
-Outcome Cohort (O) & `Ischemic stroke events' defined as ischemic stroke
-condition records during an inpatient or ER visit; successive records
-with \textgreater{} 180 day gap are considered independent episodes. \\
-Time-at-risk (TAR) & 1 day until 365 days from cohort start \\
-& \\
-\textbf{Population Definition} & \\
-Washout Period & 1095 days \\
-Enter the target cohort multiple times? & No \\
-Allow prior outcomes? & Yes \\
-Start of time-at-risk & 1 day \\
-End of time-at-risk & 365 days \\
-Require a minimum amount of time-at-risk? & Yes (364 days) \\
-& \\
-\textbf{Model Development} & \\
-Algorithm & Regularized Logistic Regression \\
-Hyper-parameters & variance = 0.01 (Default) \\
-Covariates & Gender, Age, Conditions (ever before, \textless365), Drug
-Groups (ever before, \textless365), and Visit Count \\
-Data split & 75\% train, 25\% test. Randomly assigned by person \\
-\end{longtable}
-
-According to best practices, we need to create a protocol that
-completely specifies how we plan to execute our study. This protocol
-will be assessed by the governance boards of the participating data
-sources in your network study. For this a template could be used, but we
-prefer to automate this process as much as possible by adding
-functionality to automatically generate the study protocol from a study
-specification. We will discuss this in more detail later.
-
-\hypertarget{study-implementation}{%
-\subsection{Study implementation}\label{study-implementation}}
-
-Now that we have completely designed our study, we have to implement it.
-We have to generate the target and outcome cohorts, and we need to
-develop the R code that will run against our CDM and execute the full
-study.
-
-\hypertarget{cohort-instantiation}{%
-\subsubsection{Cohort instantiation}\label{cohort-instantiation}}
-
-For our study we need to know when a person enters the target and
-outcome cohorts. This is stored in a table on the server that contains
-the cohort start date and cohort end date for all subjects for a
-specific cohort definition. This cohort table has a very simple
-structure as shown below:
-
-\begin{itemize}
-\tightlist
-\item
- \texttt{cohort\_definition\_id}, a unique identifier for
- distinguishing between different types of cohorts, e.g.~cohorts of
- interest and outcome cohorts.
-\item
- \texttt{subject\_id}, a unique identifier corresponding to the
- \texttt{person\_id} in the CDM.
-\item
- \texttt{cohort\_start\_date}, the date the subject enters the cohort.
-\item
- \texttt{cohort\_end\_date}, the date the subject leaves the cohort.
-\end{itemize}
-
-How do we fill this table according to our cohort definitions? There are
-two options for this:
-
-\begin{enumerate}
-\def\labelenumi{\arabic{enumi})}
-\item
-  use the interactive cohort builder in
-  \href{www.github.com/OHDSI/ATLAS}{ATLAS}, which creates cohorts based
-  on inclusion criteria and automatically populates this cohort table.
-\item
- write your own custom SQL statements to fill the cohort table.
-\end{enumerate}
-
-Both methods are described below for our example prediction problem.
-
-\hypertarget{atlas-cohort-builder}{%
-\subsubsection{ATLAS cohort builder}\label{atlas-cohort-builder}}
-
-\begin{figure}
-\centering
-\includegraphics{example1/ATLAS_T.webp}
-\caption{Target Cohort Atrial Fibrillation}
-\end{figure}
-
-ATLAS allows you to define cohorts interactively by specifying cohort
-entry and cohort exit criteria. Cohort entry criteria involve selecting
-one or more initial events, which determine the start date for cohort
-entry, and optionally specifying additional inclusion criteria which
-filter to the qualifying events. Cohort exit criteria are applied to
-each cohort entry record to determine the end date when the person's
-episode no longer qualifies for the cohort. For the outcome cohort the
-end date is less relevant. As an example, Figure 4 shows how we created
-the Atrial Fibrillation cohort and Figure 5 shows how we created the
-stroke cohort in ATLAS.
-
-\begin{figure}
-\centering
-\includegraphics{example1/ATLAS_O.webp}
-\caption{Outcome Cohort Stroke}
-\end{figure}
-
-The T and O cohorts can be found here:
-
-\begin{itemize}
-\tightlist
-\item
- Atrial Fibrillaton (T):
- \url{http://www.ohdsi.org/web/atlas/\#/cohortdefinition/1769447}
-\item
- Stroke (O) :
- \url{http://www.ohdsi.org/web/atlas/\#/cohortdefinition/1769448}
-\end{itemize}
-
-An in-depth explanation of cohort creation in ATLAS is out of the scope
-of this vignette but can be found on the OHDSI wiki pages
-\href{http://www.ohdsi.org/web/wiki/doku.php?id=documentation:software:atlas}{(link)}.
-
-Note that when a cohort is created in ATLAS, its cohort id is needed to
-extract the data in R. The cohort id can be found at the top of the
-ATLAS screen, e.g.~1769447 in Figure 4.
-
-\hypertarget{custom-cohorts}{%
-\subsubsection{Custom cohorts}\label{custom-cohorts}}
-
-It is also possible to create cohorts without the use of ATLAS. Using
-custom cohort code (SQL) you can make more advanced cohorts if needed.
-
-For our example study, we need to create a table to hold the cohort
-data, and we need to create SQL code to instantiate this table for both
-the AF and stroke cohorts. Therefore, we create a file called
-\emph{AfStrokeCohorts.sql} with the following contents:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\CommentTok{/***********************************}
-\CommentTok{File AfStrokeCohorts.sql }
-\CommentTok{***********************************/}
-\CommentTok{/*}
-\CommentTok{Create a table to store the persons in the T and O cohorts}
-\CommentTok{*/}
-
-\ControlFlowTok{IF}\NormalTok{ OBJECT\_ID(}\StringTok{\textquotesingle{}@resultsDatabaseSchema.AFibStrokeCohort\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}U\textquotesingle{}}\NormalTok{) }\KeywordTok{IS} \KeywordTok{NOT} \KeywordTok{NULL}
-\KeywordTok{DROP} \KeywordTok{TABLE}\NormalTok{ @resultsDatabaseSchema.AFibStrokeCohort;}
-
-\KeywordTok{CREATE} \KeywordTok{TABLE}\NormalTok{ @resultsDatabaseSchema.AFibStrokeCohort }
-\NormalTok{( }
-\NormalTok{cohort\_definition\_id }\DataTypeTok{INT}\NormalTok{, }
-\NormalTok{subject\_id BIGINT,}
-\NormalTok{cohort\_start\_date }\DataTypeTok{DATE}\NormalTok{, }
-\NormalTok{cohort\_end\_date }\DataTypeTok{DATE}
-\NormalTok{);}
-
-
-\CommentTok{/*}
-\CommentTok{T cohort: [PatientLevelPrediction vignette]: T : patients who are newly }
-\CommentTok{diagnosed with Atrial fibrillation}
-\CommentTok{{-} persons with a condition occurrence record of \textquotesingle{}Atrial fibrillation\textquotesingle{} or }
-\CommentTok{any descendants, indexed at the first diagnosis}
-\CommentTok{{-} who have \textgreater{}1095 days of prior observation before their first diagnosis}
-\CommentTok{{-} and have no warfarin exposure any time prior to first AFib diagnosis}
-\CommentTok{*/}
-\KeywordTok{INSERT} \KeywordTok{INTO}\NormalTok{ @resultsDatabaseSchema.AFibStrokeCohort (cohort\_definition\_id, }
-\NormalTok{subject\_id, }
-\NormalTok{cohort\_start\_date, }
-\NormalTok{cohort\_end\_date)}
-\KeywordTok{SELECT} \DecValTok{1} \KeywordTok{AS}\NormalTok{ cohort\_definition\_id,}
-\NormalTok{AFib.person\_id }\KeywordTok{AS}\NormalTok{ subject\_id,}
-\NormalTok{AFib.condition\_start\_date }\KeywordTok{AS}\NormalTok{ cohort\_start\_date,}
-\NormalTok{observation\_period.observation\_period\_end\_date }\KeywordTok{AS}\NormalTok{ cohort\_end\_date}
-\KeywordTok{FROM}
-\NormalTok{(}
- \KeywordTok{SELECT}\NormalTok{ person\_id, }\FunctionTok{min}\NormalTok{(condition\_start\_date) }\KeywordTok{as}\NormalTok{ condition\_start\_date}
- \KeywordTok{FROM}\NormalTok{ @cdmDatabaseSchema.condition\_occurrence}
- \KeywordTok{WHERE}\NormalTok{ condition\_concept\_id }\KeywordTok{IN}\NormalTok{ (}\KeywordTok{SELECT}\NormalTok{ descendant\_concept\_id }\KeywordTok{FROM}
-\NormalTok{ @cdmDatabaseSchema.concept\_ancestor }\KeywordTok{WHERE}\NormalTok{ ancestor\_concept\_id }\KeywordTok{IN}
-\NormalTok{ (}\DecValTok{313217} \CommentTok{/*atrial fibrillation*/}\NormalTok{))}
- \KeywordTok{GROUP} \KeywordTok{BY}\NormalTok{ person\_id}
-\NormalTok{) AFib}
- \KeywordTok{INNER} \KeywordTok{JOIN}\NormalTok{ @cdmDatabaseSchema.observation\_period}
- \KeywordTok{ON}\NormalTok{ AFib.person\_id }\OperatorTok{=}\NormalTok{ observation\_period.person\_id}
- \KeywordTok{AND}\NormalTok{ AFib.condition\_start\_date }\OperatorTok{\textgreater{}=}\NormalTok{ dateadd(dd,}\DecValTok{1095}\NormalTok{, }
-\NormalTok{ observation\_period.observation\_period\_start\_date)}
- \KeywordTok{AND}\NormalTok{ AFib.condition\_start\_date }\OperatorTok{\textless{}=}\NormalTok{ observation\_period.observation\_period\_end\_date}
- \KeywordTok{LEFT} \KeywordTok{JOIN}
-\NormalTok{ (}
- \KeywordTok{SELECT}\NormalTok{ person\_id, }\FunctionTok{min}\NormalTok{(drug\_exposure\_start\_date) }\KeywordTok{as}\NormalTok{ drug\_exposure\_start\_date}
- \KeywordTok{FROM}\NormalTok{ @cdmDatabaseSchema.drug\_exposure}
- \KeywordTok{WHERE}\NormalTok{ drug\_concept\_id }\KeywordTok{IN}\NormalTok{ (}\KeywordTok{SELECT}\NormalTok{ descendant\_concept\_id }\KeywordTok{FROM}
-\NormalTok{ @cdmDatabaseSchema.concept\_ancestor }\KeywordTok{WHERE}\NormalTok{ ancestor\_concept\_id }\KeywordTok{IN}
-\NormalTok{ (}\DecValTok{1310149} \CommentTok{/*warfarin*/}\NormalTok{))}
- \KeywordTok{GROUP} \KeywordTok{BY}\NormalTok{ person\_id}
-\NormalTok{ ) warfarin}
- \KeywordTok{ON}\NormalTok{ Afib.person\_id }\OperatorTok{=}\NormalTok{ warfarin.person\_id}
- \KeywordTok{AND}\NormalTok{ Afib.condition\_start\_date }\OperatorTok{\textgreater{}}\NormalTok{ warfarin.drug\_exposure\_start\_date}
- \KeywordTok{WHERE}\NormalTok{ warfarin.person\_id }\KeywordTok{IS} \KeywordTok{NULL}
-\NormalTok{ ;}
-
- \CommentTok{/*}
-\CommentTok{ O cohort: [PatientLevelPrediction vignette]: O: Ischemic stroke events}
-\CommentTok{ {-} inpatient visits that include a condition occurrence record for }
-\CommentTok{ \textquotesingle{}cerebral infarction\textquotesingle{} and descendants, \textquotesingle{}cerebral thrombosis\textquotesingle{}, }
-\CommentTok{ \textquotesingle{}cerebral embolism\textquotesingle{}, \textquotesingle{}cerebral artery occlusion\textquotesingle{} }
-\CommentTok{ */}
- \KeywordTok{INSERT} \KeywordTok{INTO}\NormalTok{ @resultsDatabaseSchema.AFibStrokeCohort (cohort\_definition\_id, }
-\NormalTok{ subject\_id, }
-\NormalTok{ cohort\_start\_date, }
-\NormalTok{ cohort\_end\_date)}
- \KeywordTok{SELECT} \DecValTok{2} \KeywordTok{AS}\NormalTok{ cohort\_definition\_id,}
-\NormalTok{ visit\_occurrence.person\_id }\KeywordTok{AS}\NormalTok{ subject\_id,}
-\NormalTok{ visit\_occurrence.visit\_start\_date }\KeywordTok{AS}\NormalTok{ cohort\_start\_date,}
-\NormalTok{ visit\_occurrence.visit\_end\_date }\KeywordTok{AS}\NormalTok{ cohort\_end\_date}
- \KeywordTok{FROM}
-\NormalTok{ (}
- \KeywordTok{SELECT}\NormalTok{ person\_id, condition\_start\_date}
- \KeywordTok{FROM}\NormalTok{ @cdmDatabaseSchema.condition\_occurrence}
- \KeywordTok{WHERE}\NormalTok{ condition\_concept\_id }\KeywordTok{IN}\NormalTok{ (}\KeywordTok{SELECT} \KeywordTok{DISTINCT}\NormalTok{ descendant\_concept\_id }\KeywordTok{FROM}
-\NormalTok{ @cdmDatabaseSchema.concept\_ancestor }\KeywordTok{WHERE}\NormalTok{ ancestor\_concept\_id }\KeywordTok{IN}
-\NormalTok{ (}\DecValTok{443454} \CommentTok{/*cerebral infarction*/}\NormalTok{) }\KeywordTok{OR}\NormalTok{ descendant\_concept\_id }\KeywordTok{IN}
-\NormalTok{ (}\DecValTok{441874} \CommentTok{/*cerebral thrombosis*/}\NormalTok{, }\DecValTok{375557} \CommentTok{/*cerebral embolism*/}\NormalTok{, }
- \DecValTok{372924} \CommentTok{/*cerebral artery occlusion*/}\NormalTok{))}
-\NormalTok{ ) stroke}
- \KeywordTok{INNER} \KeywordTok{JOIN}\NormalTok{ @cdmDatabaseSchema.visit\_occurrence}
- \KeywordTok{ON}\NormalTok{ stroke.person\_id }\OperatorTok{=}\NormalTok{ visit\_occurrence.person\_id}
- \KeywordTok{AND}\NormalTok{ stroke.condition\_start\_date }\OperatorTok{\textgreater{}=}\NormalTok{ visit\_occurrence.visit\_start\_date}
- \KeywordTok{AND}\NormalTok{ stroke.condition\_start\_date }\OperatorTok{\textless{}=}\NormalTok{ visit\_occurrence.visit\_end\_date}
- \KeywordTok{AND}\NormalTok{ visit\_occurrence.visit\_concept\_id }\KeywordTok{IN}\NormalTok{ (}\DecValTok{9201}\NormalTok{, }\DecValTok{262} \CommentTok{/*\textquotesingle{}Inpatient Visit\textquotesingle{} or }
-\CommentTok{ \textquotesingle{}Emergency Room and Inpatient Visit\textquotesingle{}*/}\NormalTok{)}
- \KeywordTok{GROUP} \KeywordTok{BY}\NormalTok{ visit\_occurrence.person\_id, visit\_occurrence.visit\_start\_date, }
-\NormalTok{ visit\_occurrence.visit\_end\_date}
-\NormalTok{ ;}
-
-\end{Highlighting}
-\end{Shaded}
-
-This is parameterized SQL which can be used by the
-\href{http://github.com/OHDSI/SqlRender}{\texttt{SqlRender}} package. We
-use parameterized SQL so we do not have to pre-specify the names of the
-CDM and result schemas. That way, if we want to run the SQL on a
-different schema, we only need to change the parameter values; we do not
-have to change the SQL code. By also making use of translation
-functionality in \texttt{SqlRender}, we can make sure the SQL code can
-be run in many different environments.
-
-To execute this SQL against our CDM we first need to tell R how to
-connect to the server. \texttt{PatientLevelPrediction} uses the
-\href{http://github.com/ohdsi/DatabaseConnector}{\texttt{DatabaseConnector}}
-package, which provides a function called
-\texttt{createConnectionDetails}. Type \texttt{?createConnectionDetails}
-for the specific settings required for the various database management
-systems (DBMS). For example, one might connect to a PostgreSQL database
-using this code:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{ connectionDetails }\OtherTok{\textless{}{-}} \FunctionTok{createConnectionDetails}\NormalTok{(}\AttributeTok{dbms =} \StringTok{"postgresql"}\NormalTok{, }
- \AttributeTok{server =} \StringTok{"localhost/ohdsi"}\NormalTok{, }
- \AttributeTok{user =} \StringTok{"joe"}\NormalTok{, }
- \AttributeTok{password =} \StringTok{"supersecret"}\NormalTok{)}
-
-\NormalTok{ cdmDatabaseSchema }\OtherTok{\textless{}{-}} \StringTok{"my\_cdm\_data"}
-\NormalTok{ cohortsDatabaseSchema }\OtherTok{\textless{}{-}} \StringTok{"my\_results"}
-\NormalTok{ cdmVersion }\OtherTok{\textless{}{-}} \StringTok{"5"}
-\end{Highlighting}
-\end{Shaded}
-
-The last three lines define the \texttt{cdmDatabaseSchema} and
-\texttt{cohortsDatabaseSchema} variables, as well as the CDM version. We
-will use these later to tell R where the data in CDM format live, where
-we want to create the cohorts of interest, and what version CDM is used.
-Note that for Microsoft SQL Server, database schemas need to specify both
-the database and the schema, so for example
-\texttt{cdmDatabaseSchema\ \textless{}-\ "my\_cdm\_data.dbo"}.
-
-\begin{Shaded}
-\begin{Highlighting}[]
- \FunctionTok{library}\NormalTok{(SqlRender)}
-\NormalTok{ sql }\OtherTok{\textless{}{-}} \FunctionTok{readSql}\NormalTok{(}\StringTok{"AfStrokeCohorts.sql"}\NormalTok{)}
-\NormalTok{ sql }\OtherTok{\textless{}{-}} \FunctionTok{renderSql}\NormalTok{(sql,}
-                   \AttributeTok{cdmDatabaseSchema =}\NormalTok{ cdmDatabaseSchema,}
-                   \AttributeTok{resultsDatabaseSchema =}\NormalTok{ cohortsDatabaseSchema)}\SpecialCharTok{$}\NormalTok{sql}
-\NormalTok{ sql }\OtherTok{\textless{}{-}} \FunctionTok{translateSql}\NormalTok{(sql, }\AttributeTok{targetDialect =}\NormalTok{ connectionDetails}\SpecialCharTok{$}\NormalTok{dbms)}\SpecialCharTok{$}\NormalTok{sql}
-
-\NormalTok{ connection }\OtherTok{\textless{}{-}} \FunctionTok{connect}\NormalTok{(connectionDetails)}
- \FunctionTok{executeSql}\NormalTok{(connection, sql)}
-\end{Highlighting}
-\end{Shaded}
-
-In this code, we first read the SQL from the file into memory. In the
-next line, we replace the two schema parameters with the actual values.
-We then translate the SQL into the dialect appropriate for the DBMS we
-already specified in the \texttt{connectionDetails}. Next, we connect to
-the server and submit the rendered and translated SQL.
-
-If all went well, we now have a table with the events of interest. We
-can check how many events there are per type:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{ sql }\OtherTok{\textless{}{-}} \FunctionTok{paste}\NormalTok{(}\StringTok{"SELECT cohort\_definition\_id, COUNT(*) AS count"}\NormalTok{,}
- \StringTok{"FROM @cohortsDatabaseSchema.AFibStrokeCohort"}\NormalTok{,}
- \StringTok{"GROUP BY cohort\_definition\_id"}\NormalTok{)}
-\NormalTok{ sql }\OtherTok{\textless{}{-}} \FunctionTok{renderSql}\NormalTok{(sql, }\AttributeTok{cohortsDatabaseSchema =}\NormalTok{ cohortsDatabaseSchema)}\SpecialCharTok{$}\NormalTok{sql}
-\NormalTok{ sql }\OtherTok{\textless{}{-}} \FunctionTok{translateSql}\NormalTok{(sql, }\AttributeTok{targetDialect =}\NormalTok{ connectionDetails}\SpecialCharTok{$}\NormalTok{dbms)}\SpecialCharTok{$}\NormalTok{sql}
-
- \FunctionTok{querySql}\NormalTok{(connection, sql)}
-\end{Highlighting}
-\end{Shaded}
-
-\begin{verbatim}
-## cohort_definition_id count
-## 1 1 527616
-## 2 2 221555
-\end{verbatim}
-
-\hypertarget{study-script-creation}{%
-\subsubsection{Study script creation}\label{study-script-creation}}
-
-In this section we assume that our cohorts have been created either by
-using ATLAS or a custom SQL script. We will first explain how to create
-an R script yourself that will execute our study as defined earlier.
-
-\hypertarget{data-extraction}{%
-\subsubsection{Data extraction}\label{data-extraction}}
-
-Now we can tell \texttt{PatientLevelPrediction} to extract all necessary
-data for our analysis. This is done using the
-\href{https://github.com/OHDSI/FeatureExtraction}{\texttt{FeatureExtraction}}
-package. In short, the FeatureExtraction package allows you to specify
-which features (covariates) need to be extracted, e.g.~all conditions
-and drug exposures. It also supports the creation of custom covariates.
-For more detailed information on the FeatureExtraction package see its
-\href{https://github.com/OHDSI/FeatureExtraction}{vignettes}. For our
-example study we decided to use these settings:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{ covariateSettings }\OtherTok{\textless{}{-}} \FunctionTok{createCovariateSettings}\NormalTok{(}\AttributeTok{useDemographicsGender =} \ConstantTok{TRUE}\NormalTok{,}
- \AttributeTok{useDemographicsAge =} \ConstantTok{TRUE}\NormalTok{,}
- \AttributeTok{useConditionGroupEraLongTerm =} \ConstantTok{TRUE}\NormalTok{,}
- \AttributeTok{useConditionGroupEraAnyTimePrior =} \ConstantTok{TRUE}\NormalTok{,}
- \AttributeTok{useDrugGroupEraLongTerm =} \ConstantTok{TRUE}\NormalTok{,}
- \AttributeTok{useDrugGroupEraAnyTimePrior =} \ConstantTok{TRUE}\NormalTok{,}
- \AttributeTok{useVisitConceptCountLongTerm =} \ConstantTok{TRUE}\NormalTok{,}
- \AttributeTok{longTermStartDays =} \SpecialCharTok{{-}}\DecValTok{365}\NormalTok{,}
- \AttributeTok{endDays =} \SpecialCharTok{{-}}\DecValTok{1}\NormalTok{)}
-\end{Highlighting}
-\end{Shaded}
-
-The final step for extracting the data is to run the \texttt{getPlpData}
-function, providing the connection details, the database schema where
-the cohorts are stored, the cohort definition ids for the cohort and
-outcome, the washoutPeriod (the minimum number of days prior to the
-cohort index date that the person must have been observed to be included
-in the data), and finally the previously constructed covariate settings.
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{databaseDetails }\OtherTok{\textless{}{-}} \FunctionTok{createDatabaseDetails}\NormalTok{(}
-  \AttributeTok{connectionDetails =}\NormalTok{ connectionDetails,}
-  \AttributeTok{cdmDatabaseSchema =}\NormalTok{ cdmDatabaseSchema,}
-  \AttributeTok{cdmDatabaseName =} \StringTok{\textquotesingle{}\textquotesingle{}}\NormalTok{,}
-  \AttributeTok{cohortDatabaseSchema =}\NormalTok{ cohortsDatabaseSchema,}
-  \AttributeTok{cohortTable =} \StringTok{\textquotesingle{}AFibStrokeCohort\textquotesingle{}}\NormalTok{,}
-  \AttributeTok{cohortId =} \DecValTok{1}\NormalTok{,}
-  \AttributeTok{outcomeDatabaseSchema =}\NormalTok{ cohortsDatabaseSchema,}
-  \AttributeTok{outcomeTable =} \StringTok{\textquotesingle{}AFibStrokeCohort\textquotesingle{}}\NormalTok{,}
-  \AttributeTok{outcomeIds =} \DecValTok{2}\NormalTok{,}
-  \AttributeTok{cdmVersion =} \DecValTok{5}
-\NormalTok{)}
-
-\CommentTok{\# here you can define whether you want to sample the target cohort and add any}
-\CommentTok{\# restrictions based on minimum prior observation, index date restrictions}
-\CommentTok{\# or restricting to first index date (if people can be in target cohort multiple times)}
-\NormalTok{restrictPlpDataSettings }\OtherTok{\textless{}{-}} \FunctionTok{createRestrictPlpDataSettings}\NormalTok{(}\AttributeTok{sampleSize =} \DecValTok{10000}\NormalTok{)}
-
-\NormalTok{ plpData }\OtherTok{\textless{}{-}} \FunctionTok{getPlpData}\NormalTok{(}
- \AttributeTok{databaseDetails =}\NormalTok{ databaseDetails, }
- \AttributeTok{covariateSettings =}\NormalTok{ covariateSettings,}
- \AttributeTok{restrictPlpDataSettings =}\NormalTok{ restrictPlpDataSettings}
-\NormalTok{ )}
-\end{Highlighting}
-\end{Shaded}
-
-Note that if the cohorts are created in ATLAS, the corresponding cohort
-database schema needs to be used. There are many additional
-parameters for the \texttt{createRestrictPlpDataSettings} function which
-are all documented in the \texttt{PatientLevelPrediction} manual. The
-resulting \texttt{plpData} object uses the package \texttt{Andromeda}
-(which uses \href{https://www.sqlite.org/index.html}{SQLite}) to store
-information in a way that ensures R does not run out of memory, even
-when the data are large.
-
-Creating the \texttt{plpData} object can take considerable computing
-time, and it is probably a good idea to save it for future sessions.
-Because \texttt{plpData} uses \texttt{Andromeda}, we cannot use R's
-regular save function. Instead, we'll have to use the
-\texttt{savePlpData()} function:
-
-\begin{Shaded}
-\begin{Highlighting}[]
- \FunctionTok{savePlpData}\NormalTok{(plpData, }\StringTok{"stroke\_in\_af\_data"}\NormalTok{)}
-\end{Highlighting}
-\end{Shaded}
-
-We can use the \texttt{loadPlpData()} function to load the data in a
-future session.
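-
-For example:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{ plpData }\OtherTok{\textless{}{-}} \FunctionTok{loadPlpData}\NormalTok{(}\StringTok{"stroke\_in\_af\_data"}\NormalTok{)}
-\end{Highlighting}
-\end{Shaded}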
-
-\hypertarget{additional-inclusion-criteria}{%
-\subsubsection{Additional inclusion
-criteria}\label{additional-inclusion-criteria}}
-
-To completely define the prediction problem, the final study population
-is obtained by applying additional constraints on the two earlier
-defined cohorts. For example, a minimum time at risk can be enforced
-(\texttt{requireTimeAtRisk,\ minTimeAtRisk}) and we can specify if this
-also applies to patients with the outcome (\texttt{includeAllOutcomes}).
-Here we also specify the start and end of the risk window relative to
-the target cohort start. For example, if we want the risk window to
-start 30 days after the at-risk cohort start and end a year later, we
-can set \texttt{riskWindowStart\ =\ 30} and
-\texttt{riskWindowEnd\ =\ 365}. In some cases the risk window needs to
-start at the cohort end date. This can be achieved by setting
-\texttt{addExposureToStart\ =\ TRUE}, which adds the cohort (exposure)
-time to the start date.
-
-In Appendix 1, we demonstrate the effect of these settings on the subset
-of the persons in the target cohort that end up in the final study
-population.
-
-In the example below all the settings we defined for our study are
-imposed:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{ populationSettings }\OtherTok{\textless{}{-}} \FunctionTok{createStudyPopulationSettings}\NormalTok{(}
- \AttributeTok{washoutPeriod =} \DecValTok{1095}\NormalTok{,}
- \AttributeTok{firstExposureOnly =} \ConstantTok{FALSE}\NormalTok{,}
- \AttributeTok{removeSubjectsWithPriorOutcome =} \ConstantTok{FALSE}\NormalTok{,}
- \AttributeTok{priorOutcomeLookback =} \DecValTok{1}\NormalTok{,}
- \AttributeTok{riskWindowStart =} \DecValTok{1}\NormalTok{,}
- \AttributeTok{riskWindowEnd =} \DecValTok{365}\NormalTok{,}
- \AttributeTok{startAnchor =} \StringTok{\textquotesingle{}cohort start\textquotesingle{}}\NormalTok{,}
- \AttributeTok{endAnchor =} \StringTok{\textquotesingle{}cohort start\textquotesingle{}}\NormalTok{,}
- \AttributeTok{minTimeAtRisk =} \DecValTok{364}\NormalTok{,}
- \AttributeTok{requireTimeAtRisk =} \ConstantTok{TRUE}\NormalTok{,}
- \AttributeTok{includeAllOutcomes =} \ConstantTok{TRUE}
-\NormalTok{ )}
-\end{Highlighting}
-\end{Shaded}
-
-\hypertarget{spliting-the-data-into-trainingvalidationtesting-datasets}{%
-\subsubsection{Splitting the data into training/validation/testing
-datasets}\label{spliting-the-data-into-trainingvalidationtesting-datasets}}
-
-When developing a prediction model using supervised learning (when you
-have features paired with labels for a set of patients), the first step
-is to design the development/internal validation process. This requires
-specifying how to select the model hyper-parameters, how to learn the
-model parameters and how to fairly evaluate the model. In general, the
-validation set is used to pick hyper-parameters, the training set is
-used to learn the model parameters and the test set is used to perform
-fair internal validation. However, cross-validation can be implemented
-to pick the hyper-parameters on the training data (so a validation data
-set is not required). Cross validation can also be used to estimate
-internal validation (so a testing data set is not required).
-
-In small data the best approach for internal validation has been shown
-to be bootstrapping. However, in big data (many patients and many
-features) bootstrapping is generally not feasible. In big data our
-research has shown that what matters is having some form of fair
-evaluation (use a test set or cross validation). For full details see
-our BMJ Open paper.
-
-In the PatientLevelPrediction package, the splitSettings define how the
-plpData are partitioned into training/validation/testing data. Cross
-validation is always done, but using a test set is optional (when the
-data are small, it may be optimal to not use a test set). For the
-splitSettings we can use the type (stratified/time/subject) and
-testFraction parameters to split the data into a 75\%-25\% split and run
-the patient-level prediction pipeline:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{ splitSettings }\OtherTok{\textless{}{-}} \FunctionTok{createDefaultSplitSetting}\NormalTok{(}
- \AttributeTok{trainFraction =} \FloatTok{0.75}\NormalTok{,}
- \AttributeTok{testFraction =} \FloatTok{0.25}\NormalTok{,}
- \AttributeTok{type =} \StringTok{\textquotesingle{}stratified\textquotesingle{}}\NormalTok{,}
- \AttributeTok{nfold =} \DecValTok{2}\NormalTok{, }
- \AttributeTok{splitSeed =} \DecValTok{1234}
-\NormalTok{ )}
-\end{Highlighting}
-\end{Shaded}
-
-Note: it is possible to add a custom method to specify how the plpData
-are partitioned into training/validation/testing data, see
-\href{https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/AddingCustomSplitting.pdf}{vignette
-for custom splitting}.
-
-\hypertarget{preprocessing-the-training-data}{%
-\subsubsection{Preprocessing the training
-data}\label{preprocessing-the-training-data}}
-
-There are numerous data processing settings that a user must specify
-when developing a prediction model. These are:
-
-\begin{itemize}
-\tightlist
-\item
-  Whether to under-sample or over-sample the training data (this may be
-  useful when there is class imbalance, e.g., the outcome is very rare
-  or very common)
-\item
-  Whether to perform feature engineering or feature selection (e.g.,
-  create latent variables that are not observed in the data or reduce
-  the dimensionality of the data)
-\item
-  Whether to remove redundant features and normalize the data (this is
-  required for some models)
-\end{itemize}
-
-The default sample setting does nothing; it simply returns the trainData
-as input, see below:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{ sampleSettings }\OtherTok{\textless{}{-}} \FunctionTok{createSampleSettings}\NormalTok{()}
-\end{Highlighting}
-\end{Shaded}
-
-However, the current package contains methods for under-sampling the
-non-outcome patients. To perform under-sampling, the \texttt{type} input
-should be `underSample' and \texttt{numberOutcomestoNonOutcomes} must be
-specified (an integer specifying the number of non-outcomes per
-outcome). It is possible to add any custom function for over/under
-sampling, see
-\href{https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/AddingCustomSamples.pdf}{vignette
-for custom sampling}.
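-
-As a sketch, under-sampling to one non-outcome per outcome could be
-specified as follows (the \texttt{sampleSeed} value is an arbitrary
-choice, assumed here to make the random selection reproducible):
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{ sampleSettings }\OtherTok{\textless{}{-}} \FunctionTok{createSampleSettings}\NormalTok{(}
-   \AttributeTok{type =} \StringTok{\textquotesingle{}underSample\textquotesingle{}}\NormalTok{,}
-   \AttributeTok{numberOutcomestoNonOutcomes =} \DecValTok{1}\NormalTok{,}
-   \AttributeTok{sampleSeed =} \DecValTok{1234}
-\NormalTok{ )}
-\end{Highlighting}
-\end{Shaded}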
-
-It is possible to specify a combination of feature engineering functions
-that take as input the trainData and output a new trainData with
-different features. The default feature engineering setting does
-nothing:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{ featureEngineeringSettings }\OtherTok{\textless{}{-}} \FunctionTok{createFeatureEngineeringSettings}\NormalTok{()}
-\end{Highlighting}
-\end{Shaded}
-
-However, it is possible to add custom feature engineering functions into
-the pipeline, see
-\href{https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/AddingCustomFeatureEngineering.pdf}{vignette
-for custom feature engineering}.
-
-Finally, the preprocessing settings are required. Here the user can
-define \texttt{minFraction}: any feature that is observed in the
-training data for less than this fraction of the patients is removed.
-So, if \texttt{minFraction\ =\ 0.01}, any feature that is seen in less
-than 1 percent of the target population is removed. The input
-\texttt{normalize} specifies whether the features are scaled between 0
-and 1, which is required for certain models (e.g., LASSO logistic
-regression). The input \texttt{removeRedundancy} specifies whether
-features that are observed in all of the target population are removed.
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{ preprocessSettings }\OtherTok{\textless{}{-}} \FunctionTok{createPreprocessSettings}\NormalTok{(}
- \AttributeTok{minFraction =} \FloatTok{0.01}\NormalTok{, }
- \AttributeTok{normalize =}\NormalTok{ T, }
- \AttributeTok{removeRedundancy =}\NormalTok{ T}
-\NormalTok{ )}
-\end{Highlighting}
-\end{Shaded}
-
-\hypertarget{model-development}{%
-\subsubsection{Model Development}\label{model-development}}
-
-In the set function of an algorithm the user can specify a list of
-eligible values for each hyper-parameter. All possible combinations of
-the hyper-parameters are included in a so-called grid search using
-cross-validation on the training set. If a user does not specify any
-value then the default value is used instead.
-
-For example, if we use the following settings for the
-gradientBoostingMachine: ntrees=c(100,200), maxDepth=4, the grid search
-will apply the gradient boosting machine algorithm twice: once with
-ntrees=100 and maxDepth=4, and once with ntrees=200 and maxDepth=4, in
-both cases using the default settings for the other hyper-parameters.
-The hyper-parameters that lead to the best cross-validation performance
-will then be chosen for the final model.
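-
-A sketch of this example grid for the gradient boosting machine (all
-other hyper-parameters keep their defaults):
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{ gbmModel }\OtherTok{\textless{}{-}} \FunctionTok{setGradientBoostingMachine}\NormalTok{(}
-   \AttributeTok{ntrees =} \FunctionTok{c}\NormalTok{(}\DecValTok{100}\NormalTok{, }\DecValTok{200}\NormalTok{),}
-   \AttributeTok{maxDepth =} \DecValTok{4}
-\NormalTok{ )}
-\end{Highlighting}
-\end{Shaded}
-
-For our problem we choose to build a logistic regression model with the
-default hyper-parameters: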
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{lrModel }\OtherTok{\textless{}{-}} \FunctionTok{setLassoLogisticRegression}\NormalTok{()}
-\end{Highlighting}
-\end{Shaded}
-
-The \texttt{runPlp} function requires the \texttt{plpData}, the
-\texttt{outcomeId} specifying the outcome being predicted and the
-settings: \texttt{populationSettings}, \texttt{splitSettings},
-\texttt{sampleSettings}, \texttt{featureEngineeringSettings},
-\texttt{preprocessSettings} and \texttt{modelSettings} to train and
-evaluate the model.
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{ lrResults }\OtherTok{\textless{}{-}} \FunctionTok{runPlp}\NormalTok{(}
- \AttributeTok{plpData =}\NormalTok{ plpData,}
- \AttributeTok{outcomeId =} \DecValTok{2}\NormalTok{, }
- \AttributeTok{analysisId =} \StringTok{\textquotesingle{}singleDemo\textquotesingle{}}\NormalTok{,}
- \AttributeTok{analysisName =} \StringTok{\textquotesingle{}Demonstration of runPlp for training single PLP models\textquotesingle{}}\NormalTok{,}
- \AttributeTok{populationSettings =}\NormalTok{ populationSettings, }
- \AttributeTok{splitSettings =}\NormalTok{ splitSettings,}
- \AttributeTok{sampleSettings =}\NormalTok{ sampleSettings, }
- \AttributeTok{featureEngineeringSettings =}\NormalTok{ featureEngineeringSettings, }
- \AttributeTok{preprocessSettings =}\NormalTok{ preprocessSettings,}
- \AttributeTok{modelSettings =}\NormalTok{ lrModel,}
- \AttributeTok{logSettings =} \FunctionTok{createLogSettings}\NormalTok{(), }
- \AttributeTok{executeSettings =} \FunctionTok{createExecuteSettings}\NormalTok{(}
- \AttributeTok{runSplitData =}\NormalTok{ T, }
- \AttributeTok{runSampleData =}\NormalTok{ T, }
- \AttributeTok{runfeatureEngineering =}\NormalTok{ T, }
- \AttributeTok{runPreprocessData =}\NormalTok{ T, }
- \AttributeTok{runModelDevelopment =}\NormalTok{ T, }
- \AttributeTok{runCovariateSummary =}\NormalTok{ T}
-\NormalTok{ ), }
- \AttributeTok{saveDirectory =} \FunctionTok{file.path}\NormalTok{(}\FunctionTok{getwd}\NormalTok{(), }\StringTok{\textquotesingle{}singlePlp\textquotesingle{}}\NormalTok{)}
-\NormalTok{ )}
-\end{Highlighting}
-\end{Shaded}
-
-Under the hood the package will now use the
-\href{www.github.com/OHDSI/Cyclops}{\texttt{Cyclops}} package to fit a
-large-scale regularized regression using 75\% of the data and will
-evaluate the model on the remaining 25\%. A results data structure is
-returned containing information about the model, its performance, and so
-on.
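-
-For example, assuming the standard \texttt{runPlp} result structure and
-the package's \texttt{viewPlp} Shiny viewer, you can inspect the
-internal validation statistics or browse the results interactively:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\CommentTok{\# test/train performance metrics such as AUROC}
-\NormalTok{ lrResults}\SpecialCharTok{$}\NormalTok{performanceEvaluation}\SpecialCharTok{$}\NormalTok{evaluationStatistics}
-
-\CommentTok{\# interactive exploration of the results}
-\FunctionTok{viewPlp}\NormalTok{(lrResults)}
-\end{Highlighting}
-\end{Shaded}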
-
-You can save the model using:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\FunctionTok{savePlpModel}\NormalTok{(lrResults}\SpecialCharTok{$}\NormalTok{model, }\AttributeTok{dirPath =} \FunctionTok{file.path}\NormalTok{(}\FunctionTok{getwd}\NormalTok{(), }\StringTok{"model"}\NormalTok{))}
-\end{Highlighting}
-\end{Shaded}
-
-You can load the model using:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{ plpModel }\OtherTok{\textless{}{-}} \FunctionTok{loadPlpModel}\NormalTok{(}\FunctionTok{file.path}\NormalTok{(}\FunctionTok{getwd}\NormalTok{(),}\StringTok{\textquotesingle{}model\textquotesingle{}}\NormalTok{))}
-\end{Highlighting}
-\end{Shaded}
-
-You can also save the full results structure using:
-
-\begin{Shaded}
-\begin{Highlighting}[]
- \FunctionTok{savePlpResult}\NormalTok{(lrResults, }\AttributeTok{location =} \FunctionTok{file.path}\NormalTok{(}\FunctionTok{getwd}\NormalTok{(),}\StringTok{\textquotesingle{}lr\textquotesingle{}}\NormalTok{))}
-\end{Highlighting}
-\end{Shaded}
-
-To load the full results structure use:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{ lrResults }\OtherTok{\textless{}{-}} \FunctionTok{loadPlpResult}\NormalTok{(}\FunctionTok{file.path}\NormalTok{(}\FunctionTok{getwd}\NormalTok{(),}\StringTok{\textquotesingle{}lr\textquotesingle{}}\NormalTok{))}
-\end{Highlighting}
-\end{Shaded}
-
-\newpage
-
-\hypertarget{example2}{%
-\section{Example 2: Angioedema in ACE inhibitor users}\label{example2}}
-
-\hypertarget{study-specification-2}{%
-\subsection{Study Specification}\label{study-specification-2}}
-
-\begin{longtable}[]{@{}
- >{\raggedright\arraybackslash}p{(\columnwidth - 2\tabcolsep) * \real{0.3056}}
- >{\raggedright\arraybackslash}p{(\columnwidth - 2\tabcolsep) * \real{0.6944}}@{}}
-\toprule\noalign{}
-\begin{minipage}[b]{\linewidth}\raggedright
-Definition
-\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
-Value
-\end{minipage} \\
-\midrule\noalign{}
-\endhead
-\bottomrule\noalign{}
-\endlastfoot
-\textbf{Problem Definition} & \\
-Target Cohort (T) & `Patients who are newly dispensed an ACE inhibitor'
-defined as the first drug record of any ACE inhibitor \\
-Outcome Cohort (O) & `Angioedema' defined as an angioedema condition
-record during an inpatient or ER visit \\
-Time-at-risk (TAR) & 1 day till 365 days from cohort start \\
-& \\
-\textbf{Population Definition} & \\
-Washout Period & 365 \\
-Enter the target cohort multiple times? & No \\
-Allow prior outcomes? & No \\
-Start of time-at-risk & 1 day \\
-End of time-at-risk & 365 days \\
-Require a minimum amount of time-at-risk? & Yes (364 days) \\
-& \\
-\textbf{Model Development} & \\
-Algorithm & Gradient Boosting Machine \\
-Hyper-parameters & ntree:5000, max depth:4 or 7 or 10 and learning rate:
-0.001 or 0.01 or 0.1 or 0.9 \\
-Covariates & Gender, Age, Conditions (ever before, \textless365), Drugs
-Groups (ever before, \textless365), and Visit Count \\
-Data split & 75\% train, 25\% test. Randomly assigned by person \\
-\end{longtable}
-
-According to best practices, we need to create a protocol that
-completely specifies how we plan to execute our study. This protocol
-will be assessed by the governance boards of the participating data
-sources in the network study. A template could be used for this, but we
-prefer to automate the process as much as possible by adding
-functionality to automatically generate the study protocol from a study
-specification. We will discuss this in more detail later.
-
-\hypertarget{study-implementation-1}{%
-\subsection{Study implementation}\label{study-implementation-1}}
-
-Now that we have completely designed our study, we have to implement it.
-We have to generate the target and outcome cohorts, and we need to
-develop the R code to run against our CDM that will execute the full
-study.
-
-\hypertarget{cohort-instantiation-1}{%
-\subsubsection{Cohort instantiation}\label{cohort-instantiation-1}}
-
-For our study we need to know when a person enters the target and
-outcome cohorts. This is stored in a table on the server that contains
-the cohort start date and cohort end date for all subjects for a
-specific cohort definition. This cohort table has a very simple
-structure as shown below:
-
-\begin{itemize}
-\tightlist
-\item
- \texttt{cohort\_definition\_id}, a unique identifier for
- distinguishing between different types of cohorts, e.g.~cohorts of
- interest and outcome cohorts.
-\item
- \texttt{subject\_id}, a unique identifier corresponding to the
- \texttt{person\_id} in the CDM.
-\item
- \texttt{cohort\_start\_date}, the date the subject enters the cohort.
-\item
- \texttt{cohort\_end\_date}, the date the subject leaves the cohort.
-\end{itemize}
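-
-For example, the (hypothetical) rows below indicate that subject 123
-entered the target cohort (cohort\_definition\_id 1) on 2015-03-02, and
-that subject 456 experienced the outcome (cohort\_definition\_id 2) on
-2015-07-14:
-
-\begin{verbatim}
-cohort_definition_id  subject_id  cohort_start_date  cohort_end_date
-                   1         123         2015-03-02       2016-03-01
-                   2         456         2015-07-14       2015-07-14
-\end{verbatim}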
-
-How do we fill this table according to our cohort definitions? There are
-two options for this:
-
-\begin{enumerate}
-\def\labelenumi{\arabic{enumi})}
-\item
-  use the interactive cohort builder tool in
-  \href{https://github.com/OHDSI/ATLAS}{ATLAS}, which creates cohorts
-  based on inclusion criteria and automatically populates the cohort
-  table.
-\item
- write your own custom SQL statements to fill the cohort table.
-\end{enumerate}
-
-Both methods are described below for our example prediction problem.
-
-\hypertarget{atlas-cohort-builder-1}{%
-\subsubsection{ATLAS cohort builder}\label{atlas-cohort-builder-1}}
-
-\begin{figure}
-\centering
-\includegraphics{example2/aceinhibitors.webp}
-\caption{Target Cohort ACE inhibitors}
-\end{figure}
-
-ATLAS allows you to define cohorts interactively by specifying cohort
-entry and cohort exit criteria. Cohort entry criteria involve selecting
-one or more initial events, which determine the start date for cohort
-entry, and optionally specifying additional inclusion criteria which
-filter to the qualifying events. Cohort exit criteria are applied to
-each cohort entry record to determine the end date when the person's
-episode no longer qualifies for the cohort. For the outcome cohort the
-end date is less relevant. As an example, Figure 6 shows how we created
-the ACE inhibitors cohort and Figure 7 shows how we created the
-angioedema cohort in ATLAS.
-
-\begin{figure}
-\centering
-\includegraphics{example2/angioedema.webp}
-\caption{Outcome Cohort Angioedema}
-\end{figure}
-
-The T and O cohorts can be found here:
-
-\begin{itemize}
-\tightlist
-\item
- Ace inhibitors (T):
- \url{http://www.ohdsi.org/web/atlas/\#/cohortdefinition/1770617}
-\item
- Angioedema (O) :
- \url{http://www.ohdsi.org/web/atlas/\#/cohortdefinition/1770616}
-\end{itemize}
-
-An in-depth explanation of cohort creation in ATLAS is out of the scope
-of this vignette but can be found on the OHDSI wiki pages
-\href{http://www.ohdsi.org/web/wiki/doku.php?id=documentation:software:atlas}{(link)}.
-
-Note that when a cohort is created in ATLAS the cohort id is needed to
-extract the data in R. The cohort id can be found at the top of the
-ATLAS screen, e.g.~1770617 in Figure 6.
-
-\hypertarget{custom-cohorts-1}{%
-\subsubsection{Custom cohorts}\label{custom-cohorts-1}}
-
-It is also possible to create cohorts without the use of ATLAS. Using
-custom cohort code (SQL) you can make more advanced cohorts if needed.
-
-For our example study, we need to create a table to hold the cohort
-data and we need to create SQL code to instantiate this table for both
-the ACE inhibitor and angioedema cohorts. Therefore, we create a file
-called \emph{AceAngioCohorts.sql} with the following contents:
-
-\begin{Shaded}
-\begin{Highlighting}[]
- \CommentTok{/***********************************}
-\CommentTok{ File AceAngioCohorts.sql }
-\CommentTok{ ***********************************/}
- \CommentTok{/*}
-\CommentTok{  Create a table to store the persons in the T and O cohorts}
-\CommentTok{  */}
-
-  \ControlFlowTok{IF}\NormalTok{ OBJECT\_ID(}\StringTok{\textquotesingle{}@resultsDatabaseSchema.AceAngioCohort\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}U\textquotesingle{}}\NormalTok{) }\KeywordTok{IS} \KeywordTok{NOT} \KeywordTok{NULL}
-  \KeywordTok{DROP} \KeywordTok{TABLE}\NormalTok{ @resultsDatabaseSchema.AceAngioCohort;}
-
-  \KeywordTok{CREATE} \KeywordTok{TABLE}\NormalTok{ @resultsDatabaseSchema.AceAngioCohort
-\NormalTok{ ( }
-\NormalTok{ cohort\_definition\_id }\DataTypeTok{INT}\NormalTok{, }
-\NormalTok{ subject\_id BIGINT,}
-\NormalTok{ cohort\_start\_date }\DataTypeTok{DATE}\NormalTok{, }
-\NormalTok{ cohort\_end\_date }\DataTypeTok{DATE}
-\NormalTok{ );}
-
-
- \CommentTok{/*}
-\CommentTok{ T cohort: [PatientLevelPrediction vignette]: T : patients who are newly }
-\CommentTok{ dispensed an ACE inhibitor}
-\CommentTok{ {-} persons with a drug exposure record of any \textquotesingle{}ACE inhibitor\textquotesingle{} or }
-\CommentTok{     any descendants, indexed at the first dispensing}
-\CommentTok{ {-} who have \textgreater{}364 days of prior observation before their first dispensing}
-\CommentTok{ */}
- \KeywordTok{INSERT} \KeywordTok{INTO}\NormalTok{ @resultsDatabaseSchema.AceAngioCohort (cohort\_definition\_id, }
-\NormalTok{ subject\_id, }
-\NormalTok{ cohort\_start\_date, }
-\NormalTok{ cohort\_end\_date)}
- \KeywordTok{SELECT} \DecValTok{1} \KeywordTok{AS}\NormalTok{ cohort\_definition\_id,}
-\NormalTok{ Ace.person\_id }\KeywordTok{AS}\NormalTok{ subject\_id,}
-\NormalTok{ Ace.drug\_start\_date }\KeywordTok{AS}\NormalTok{ cohort\_start\_date,}
-\NormalTok{ observation\_period.observation\_period\_end\_date }\KeywordTok{AS}\NormalTok{ cohort\_end\_date}
- \KeywordTok{FROM}
-\NormalTok{ (}
- \KeywordTok{SELECT}\NormalTok{ person\_id, }\FunctionTok{min}\NormalTok{(drug\_exposure\_date) }\KeywordTok{as}\NormalTok{ drug\_start\_date}
- \KeywordTok{FROM}\NormalTok{ @cdmDatabaseSchema.drug\_exposure}
- \KeywordTok{WHERE}\NormalTok{ drug\_concept\_id }\KeywordTok{IN}\NormalTok{ (}\KeywordTok{SELECT}\NormalTok{ descendant\_concept\_id }\KeywordTok{FROM}
-\NormalTok{ @cdmDatabaseSchema.concept\_ancestor }\KeywordTok{WHERE}\NormalTok{ ancestor\_concept\_id }\KeywordTok{IN}
-\NormalTok{ (}\DecValTok{1342439}\NormalTok{,}\DecValTok{1334456}\NormalTok{, }\DecValTok{1331235}\NormalTok{, }\DecValTok{1373225}\NormalTok{, }\DecValTok{1310756}\NormalTok{, }\DecValTok{1308216}\NormalTok{, }\DecValTok{1363749}\NormalTok{, }\DecValTok{1341927}\NormalTok{, }\DecValTok{1340128}\NormalTok{, }\DecValTok{1335471} \CommentTok{/*ace inhibitors*/}\NormalTok{))}
- \KeywordTok{GROUP} \KeywordTok{BY}\NormalTok{ person\_id}
-\NormalTok{ ) Ace}
- \KeywordTok{INNER} \KeywordTok{JOIN}\NormalTok{ @cdmDatabaseSchema.observation\_period}
- \KeywordTok{ON}\NormalTok{ Ace.person\_id }\OperatorTok{=}\NormalTok{ observation\_period.person\_id}
- \KeywordTok{AND}\NormalTok{ Ace.drug\_start\_date }\OperatorTok{\textgreater{}=}\NormalTok{ dateadd(dd,}\DecValTok{364}\NormalTok{, }
-\NormalTok{ observation\_period.observation\_period\_start\_date)}
- \KeywordTok{AND}\NormalTok{ Ace.drug\_start\_date }\OperatorTok{\textless{}=}\NormalTok{ observation\_period.observation\_period\_end\_date}
-\NormalTok{ ;}
-
- \CommentTok{/*}
-\CommentTok{  O cohort: [PatientLevelPrediction vignette]: O: Angioedema}
-\CommentTok{ */}
- \KeywordTok{INSERT} \KeywordTok{INTO}\NormalTok{ @resultsDatabaseSchema.AceAngioCohort (cohort\_definition\_id, }
-\NormalTok{ subject\_id, }
-\NormalTok{ cohort\_start\_date, }
-\NormalTok{ cohort\_end\_date)}
- \KeywordTok{SELECT} \DecValTok{2} \KeywordTok{AS}\NormalTok{ cohort\_definition\_id,}
-\NormalTok{ angioedema.person\_id }\KeywordTok{AS}\NormalTok{ subject\_id,}
-\NormalTok{ angioedema.condition\_start\_date }\KeywordTok{AS}\NormalTok{ cohort\_start\_date,}
-\NormalTok{ angioedema.condition\_start\_date }\KeywordTok{AS}\NormalTok{ cohort\_end\_date}
- \KeywordTok{FROM}
-\NormalTok{ (}
- \KeywordTok{SELECT}\NormalTok{ person\_id, condition\_start\_date}
- \KeywordTok{FROM}\NormalTok{ @cdmDatabaseSchema.condition\_occurrence}
- \KeywordTok{WHERE}\NormalTok{ condition\_concept\_id }\KeywordTok{IN}\NormalTok{ (}\KeywordTok{SELECT} \KeywordTok{DISTINCT}\NormalTok{ descendant\_concept\_id }\KeywordTok{FROM}
-\NormalTok{ @cdmDatabaseSchema.concept\_ancestor }\KeywordTok{WHERE}\NormalTok{ ancestor\_concept\_id }\KeywordTok{IN}
-\NormalTok{ (}\DecValTok{432791} \CommentTok{/*angioedema*/}\NormalTok{) }\KeywordTok{OR}\NormalTok{ descendant\_concept\_id }\KeywordTok{IN}
-\NormalTok{ (}\DecValTok{432791} \CommentTok{/*angioedema*/}\NormalTok{)}
-\NormalTok{ ) angioedema}
-
-\NormalTok{ ;}
-
-\end{Highlighting}
-\end{Shaded}
-
-This is parameterized SQL which can be used by the
-\href{http://github.com/OHDSI/SqlRender}{\texttt{SqlRender}} package. We
-use parameterized SQL so we do not have to pre-specify the names of the
-CDM and result schemas. That way, if we want to run the SQL on a
-different schema, we only need to change the parameter values; we do not
-have to change the SQL code. By also making use of translation
-functionality in \texttt{SqlRender}, we can make sure the SQL code can
-be run in many different environments.
-
-To execute this SQL against our CDM we first need to tell R how to
-connect to the server. \texttt{PatientLevelPrediction} uses the
-\href{http://github.com/ohdsi/DatabaseConnector}{\texttt{DatabaseConnector}}
-package, which provides a function called
-\texttt{createConnectionDetails}. Type \texttt{?createConnectionDetails}
-for the specific settings required for the various database management
-systems (DBMS). For example, one might connect to a PostgreSQL database
-using this code:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{ connectionDetails }\OtherTok{\textless{}{-}} \FunctionTok{createConnectionDetails}\NormalTok{(}\AttributeTok{dbms =} \StringTok{"postgresql"}\NormalTok{, }
- \AttributeTok{server =} \StringTok{"localhost/ohdsi"}\NormalTok{, }
- \AttributeTok{user =} \StringTok{"joe"}\NormalTok{, }
- \AttributeTok{password =} \StringTok{"supersecret"}\NormalTok{)}
-
-\NormalTok{ cdmDatabaseSchema }\OtherTok{\textless{}{-}} \StringTok{"my\_cdm\_data"}
-\NormalTok{ cohortsDatabaseSchema }\OtherTok{\textless{}{-}} \StringTok{"my\_results"}
-\NormalTok{ cdmVersion }\OtherTok{\textless{}{-}} \StringTok{"5"}
-\end{Highlighting}
-\end{Shaded}
-
-The last three lines define the \texttt{cdmDatabaseSchema} and
-\texttt{cohortsDatabaseSchema} variables, as well as the CDM version. We
-will use these later to tell R where the data in CDM format live, where
-we want to create the cohorts of interest, and what version CDM is used.
-Note that for Microsoft SQL Server, database schemas need to specify
-both the database and the schema, so for example
-\texttt{cdmDatabaseSchema\ \textless{}-\ "my\_cdm\_data.dbo"}.
-
-\begin{Shaded}
-\begin{Highlighting}[]
- \FunctionTok{library}\NormalTok{(SqlRender)}
-\NormalTok{ sql }\OtherTok{\textless{}{-}} \FunctionTok{readSql}\NormalTok{(}\StringTok{"AceAngioCohorts.sql"}\NormalTok{)}
-\NormalTok{  sql }\OtherTok{\textless{}{-}} \FunctionTok{render}\NormalTok{(sql,}
-                \AttributeTok{cdmDatabaseSchema =}\NormalTok{ cdmDatabaseSchema,}
-                \AttributeTok{resultsDatabaseSchema =}\NormalTok{ cohortsDatabaseSchema)}
-\NormalTok{ sql }\OtherTok{\textless{}{-}} \FunctionTok{translate}\NormalTok{(sql, }\AttributeTok{targetDialect =}\NormalTok{ connectionDetails}\SpecialCharTok{$}\NormalTok{dbms)}
-
-\NormalTok{ connection }\OtherTok{\textless{}{-}} \FunctionTok{connect}\NormalTok{(connectionDetails)}
- \FunctionTok{executeSql}\NormalTok{(connection, sql)}
-\end{Highlighting}
-\end{Shaded}
-
-In this code, we first read the SQL from the file into memory. In the
-next line, we replace the two parameter names with the actual values. We
-then translate the SQL into the dialect appropriate for the DBMS we
-already specified in the \texttt{connectionDetails}. Next, we connect to
-the server and submit the rendered and translated SQL.
-
-If all went well, we now have a table with the events of interest. We
-can check how many events there are per type:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{ sql }\OtherTok{\textless{}{-}} \FunctionTok{paste}\NormalTok{(}\StringTok{"SELECT cohort\_definition\_id, COUNT(*) AS count"}\NormalTok{,}
- \StringTok{"FROM @cohortsDatabaseSchema.AceAngioCohort"}\NormalTok{,}
- \StringTok{"GROUP BY cohort\_definition\_id"}\NormalTok{)}
-\NormalTok{ sql }\OtherTok{\textless{}{-}} \FunctionTok{render}\NormalTok{(sql, }\AttributeTok{cohortsDatabaseSchema =}\NormalTok{ cohortsDatabaseSchema)}
-\NormalTok{ sql }\OtherTok{\textless{}{-}} \FunctionTok{translate}\NormalTok{(sql, }\AttributeTok{targetDialect =}\NormalTok{ connectionDetails}\SpecialCharTok{$}\NormalTok{dbms)}
-
- \FunctionTok{querySql}\NormalTok{(connection, sql)}
-\end{Highlighting}
-\end{Shaded}
-
-\begin{verbatim}
-## cohort_definition_id count
-## 1 1 0
-## 2 2 0
-\end{verbatim}
-
-\hypertarget{study-script-creation-1}{%
-\subsubsection{Study script creation}\label{study-script-creation-1}}
-
-In this section we assume that our cohorts have been created either by
-using ATLAS or a custom SQL script. We will first explain how to create
-an R script yourself that will execute our study as we have defined
-earlier.
-
-\hypertarget{data-extraction-1}{%
-\subsubsection{Data extraction}\label{data-extraction-1}}
-
-Now we can tell \texttt{PatientLevelPrediction} to extract all necessary
-data for our analysis. This is done using the
-\href{https://github.com/OHDSI/FeatureExtraction}{\texttt{FeatureExtraction}}
-package. In short, the FeatureExtraction package allows you to specify
-which features (covariates) need to be extracted, e.g.~all conditions
-and drug exposures. It also supports the creation of custom covariates.
-For more detailed information on the FeatureExtraction package see its
-\href{https://github.com/OHDSI/FeatureExtraction}{vignettes}. For our
-example study we decided to use these settings:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{ covariateSettings }\OtherTok{\textless{}{-}} \FunctionTok{createCovariateSettings}\NormalTok{(}\AttributeTok{useDemographicsGender =} \ConstantTok{TRUE}\NormalTok{,}
- \AttributeTok{useDemographicsAge =} \ConstantTok{TRUE}\NormalTok{,}
- \AttributeTok{useConditionGroupEraLongTerm =} \ConstantTok{TRUE}\NormalTok{,}
- \AttributeTok{useConditionGroupEraAnyTimePrior =} \ConstantTok{TRUE}\NormalTok{,}
- \AttributeTok{useDrugGroupEraLongTerm =} \ConstantTok{TRUE}\NormalTok{,}
- \AttributeTok{useDrugGroupEraAnyTimePrior =} \ConstantTok{TRUE}\NormalTok{,}
- \AttributeTok{useVisitConceptCountLongTerm =} \ConstantTok{TRUE}\NormalTok{,}
- \AttributeTok{longTermStartDays =} \SpecialCharTok{{-}}\DecValTok{365}\NormalTok{,}
- \AttributeTok{endDays =} \SpecialCharTok{{-}}\DecValTok{1}\NormalTok{)}
-\end{Highlighting}
-\end{Shaded}
-
-The final step for extracting the data is to run the \texttt{getPlpData}
-function. Its inputs are the connection details, the database schema
-where the cohorts are stored, the cohort definition ids for the cohort
-and outcome, the washoutPeriod (the minimum number of days prior to the
-cohort index date that a person must have been observed to be included
-in the data), and the previously constructed covariate settings.
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{databaseDetails }\OtherTok{\textless{}{-}} \FunctionTok{createDatabaseDetails}\NormalTok{(}
-  \AttributeTok{connectionDetails =}\NormalTok{ connectionDetails,}
-  \AttributeTok{cdmDatabaseSchema =}\NormalTok{ cdmDatabaseSchema,}
-  \AttributeTok{cohortDatabaseSchema =}\NormalTok{ cohortsDatabaseSchema,}
-  \AttributeTok{cohortTable =} \StringTok{\textquotesingle{}AceAngioCohort\textquotesingle{}}\NormalTok{,}
-  \AttributeTok{cohortId =} \DecValTok{1}\NormalTok{,}
-  \AttributeTok{outcomeDatabaseSchema =}\NormalTok{ cohortsDatabaseSchema,}
-  \AttributeTok{outcomeTable =} \StringTok{\textquotesingle{}AceAngioCohort\textquotesingle{}}\NormalTok{,}
-  \AttributeTok{outcomeIds =} \DecValTok{2}
-\NormalTok{)}
-
-\NormalTok{restrictPlpDataSettings }\OtherTok{\textless{}{-}} \FunctionTok{createRestrictPlpDataSettings}\NormalTok{(}
- \AttributeTok{sampleSize =} \DecValTok{10000}
-\NormalTok{ )}
-
-\NormalTok{plpData }\OtherTok{\textless{}{-}} \FunctionTok{getPlpData}\NormalTok{(}
- \AttributeTok{databaseDetails =}\NormalTok{ databaseDetails, }
- \AttributeTok{covariateSettings =}\NormalTok{ covariateSettings, }
- \AttributeTok{restrictPlpDataSettings =}\NormalTok{ restrictPlpDataSettings}
-\NormalTok{ )}
-\end{Highlighting}
-\end{Shaded}
-
-Note that if the cohorts are created in ATLAS, the corresponding cohort
-database schema needs to be selected. There are many additional
-parameters for the \texttt{getPlpData} function which are all documented
-in the \texttt{PatientLevelPrediction} manual. The resulting
-\texttt{plpData} object uses the package \texttt{Andromeda} to store
-information on disk in a way that ensures R does not run out of memory,
-even when the data are large.
-
-Creating the \texttt{plpData} object can take considerable computing
-time, and it is probably a good idea to save it for future sessions.
-Because \texttt{plpData} uses \texttt{Andromeda}, we cannot use R's
-regular save function. Instead, we have to use the
-\texttt{savePlpData()} function:
-
-\begin{Shaded}
-\begin{Highlighting}[]
- \FunctionTok{savePlpData}\NormalTok{(plpData, }\StringTok{"angio\_in\_ace\_data"}\NormalTok{)}
-\end{Highlighting}
-\end{Shaded}
-
-We can use the \texttt{loadPlpData()} function to load the data in a
-future session.
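-
-For example (assuming the folder created by \texttt{savePlpData} in the
-previous step):
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{plpData }\OtherTok{\textless{}{-}} \FunctionTok{loadPlpData}\NormalTok{(}\StringTok{"angio\_in\_ace\_data"}\NormalTok{)}
-\end{Highlighting}
-\end{Shaded}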
-
-\hypertarget{additional-inclusion-criteria-1}{%
-\subsubsection{Additional inclusion
-criteria}\label{additional-inclusion-criteria-1}}
-
-To completely define the prediction problem, the final study population
-is obtained by applying additional constraints to the two earlier
-defined cohorts, e.g., a minimum time at risk can be enforced
-(\texttt{requireTimeAtRisk,\ minTimeAtRisk}) and we can specify if this
-also applies to patients with the outcome (\texttt{includeAllOutcomes}).
-Here we also specify the start and end of the risk window relative to
-the target cohort start. For example, if we want the risk window to
-start 30 days after the at-risk cohort start and end a year later, we
-can set \texttt{riskWindowStart\ =\ 30} and
-\texttt{riskWindowEnd\ =\ 365}. In some cases the risk window needs to
-start at the cohort end date; this can be achieved by setting
-\texttt{startAnchor\ =\ "cohort\ end"}, which anchors the start of the
-risk window to the cohort (exposure) end date.
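-
-As a sketch (illustrative only, not the settings used in this study),
-such a 30-to-365-day window starting at cohort start would look like:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\CommentTok{\# illustrative; our study uses the settings shown further below}
-\NormalTok{exampleSettings }\OtherTok{\textless{}{-}} \FunctionTok{createStudyPopulationSettings}\NormalTok{(}
-  \AttributeTok{riskWindowStart =} \DecValTok{30}\NormalTok{,}
-  \AttributeTok{startAnchor =} \StringTok{\textquotesingle{}cohort start\textquotesingle{}}\NormalTok{,}
-  \AttributeTok{riskWindowEnd =} \DecValTok{365}\NormalTok{,}
-  \AttributeTok{endAnchor =} \StringTok{\textquotesingle{}cohort start\textquotesingle{}}
-\NormalTok{)}
-\end{Highlighting}
-\end{Shaded}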
-
-In Appendix 1, we demonstrate the effect of these settings on the subset
-of the persons in the target cohort that end up in the final study
-population.
-
-In the example below all the settings we defined for our study are
-imposed:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{ populationSettings }\OtherTok{\textless{}{-}} \FunctionTok{createStudyPopulationSettings}\NormalTok{(}
- \AttributeTok{washoutPeriod =} \DecValTok{364}\NormalTok{,}
- \AttributeTok{firstExposureOnly =} \ConstantTok{FALSE}\NormalTok{,}
- \AttributeTok{removeSubjectsWithPriorOutcome =} \ConstantTok{TRUE}\NormalTok{,}
- \AttributeTok{priorOutcomeLookback =} \DecValTok{9999}\NormalTok{,}
- \AttributeTok{riskWindowStart =} \DecValTok{1}\NormalTok{,}
- \AttributeTok{riskWindowEnd =} \DecValTok{365}\NormalTok{, }
- \AttributeTok{minTimeAtRisk =} \DecValTok{364}\NormalTok{,}
- \AttributeTok{startAnchor =} \StringTok{\textquotesingle{}cohort start\textquotesingle{}}\NormalTok{,}
- \AttributeTok{endAnchor =} \StringTok{\textquotesingle{}cohort start\textquotesingle{}}\NormalTok{,}
- \AttributeTok{requireTimeAtRisk =} \ConstantTok{TRUE}\NormalTok{,}
- \AttributeTok{includeAllOutcomes =} \ConstantTok{TRUE}
-\NormalTok{ )}
-\end{Highlighting}
-\end{Shaded}
-
-\hypertarget{spliting-the-data-into-trainingvalidationtesting-datasets-1}{%
-\subsubsection{Splitting the data into training/validation/testing
-datasets}\label{spliting-the-data-into-trainingvalidationtesting-datasets-1}}
-
-When developing a prediction model using supervised learning (when you
-have features paired with labels for a set of patients), the first step
-is to design the development/internal validation process. This requires
-specifying how to select the model hyper-parameters, how to learn the
-model parameters and how to fairly evaluate the model. In general, the
-validation set is used to pick hyper-parameters, the training set is
-used to learn the model parameters and the test set is used to perform
-fair internal validation. However, cross-validation can be implemented
-to pick the hyper-parameters on the training data (so a validation data
-set is not required). Cross validation can also be used to estimate
-internal validation (so a testing data set is not required).
-
-For small datasets, the best approach for internal validation has been
-shown to be bootstrapping. However, for big data (many patients and many
-features) bootstrapping is generally not feasible. For big data, our
-research has shown that it is most important to have some form of fair
-evaluation (use a test set or cross-validation). For full details see
-our BMJ Open paper.
-
-In the PatientLevelPrediction package, the splitSettings define how the
-plpData are partitioned into training/validation/testing data.
-Cross-validation is always done, but using a test set is optional (when
-the data are small, it may be optimal to not use a test set). For the
-splitSettings we can use the type (stratified/time/subject) and
-testFraction parameters to split the data 75\%-25\% and run the
-patient-level prediction pipeline:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{ splitSettings }\OtherTok{\textless{}{-}} \FunctionTok{createDefaultSplitSetting}\NormalTok{(}
- \AttributeTok{trainFraction =} \FloatTok{0.75}\NormalTok{,}
- \AttributeTok{testFraction =} \FloatTok{0.25}\NormalTok{,}
- \AttributeTok{type =} \StringTok{\textquotesingle{}stratified\textquotesingle{}}\NormalTok{,}
- \AttributeTok{nfold =} \DecValTok{2}\NormalTok{, }
- \AttributeTok{splitSeed =} \DecValTok{1234}
-\NormalTok{ )}
-\end{Highlighting}
-\end{Shaded}
-
-Note: it is possible to add a custom method to specify how the plpData
-are partitioned into training/validation/testing data, see
-\href{https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/AddingCustomSplitting.pdf}{vignette
-for custom splitting}.
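-
-For instance, the \texttt{type} parameter also accepts `time' and
-`subject' splits; a minimal sketch of a temporal split (illustrative
-only) using the same function:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\CommentTok{\# sketch: split by index date instead of randomly by person}
-\NormalTok{timeSplitSettings }\OtherTok{\textless{}{-}} \FunctionTok{createDefaultSplitSetting}\NormalTok{(}
-  \AttributeTok{trainFraction =} \FloatTok{0.75}\NormalTok{,}
-  \AttributeTok{testFraction =} \FloatTok{0.25}\NormalTok{,}
-  \AttributeTok{type =} \StringTok{\textquotesingle{}time\textquotesingle{}}\NormalTok{,}
-  \AttributeTok{nfold =} \DecValTok{2}\NormalTok{,}
-  \AttributeTok{splitSeed =} \DecValTok{1234}
-\NormalTok{)}
-\end{Highlighting}
-\end{Shaded}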
-
-\hypertarget{preprocessing-the-training-data-1}{%
-\subsubsection{Preprocessing the training
-data}\label{preprocessing-the-training-data-1}}
-
-There are numerous data processing settings that a user must specify
-when developing a prediction model. These are:
-
-\begin{itemize}
-\tightlist
-\item
-  Whether to under-sample or over-sample the training data (this may be
-  useful when there is class imbalance, e.g., the outcome is very rare
-  or very common)
-\item
-  Whether to perform feature engineering or feature selection (e.g.,
-  create latent variables that are not observed in the data or reduce
-  the dimensionality of the data)
-\item
-  Whether to remove redundant features and normalize the data (this is
-  required for some models)
-\end{itemize}
-
-The default sample setting does nothing; it simply returns the
-trainData as input, see below:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{ sampleSettings }\OtherTok{\textless{}{-}} \FunctionTok{createSampleSettings}\NormalTok{()}
-\end{Highlighting}
-\end{Shaded}
-
-However, the package currently contains a method for under-sampling the
-non-outcome patients. To perform under-sampling, the \texttt{type} input
-should be `underSample' and \texttt{numberOutcomestoNonOutcomes} must be
-specified (an integer specifying the number of non-outcomes per
-outcome). It is also possible to add any custom function for over/under
-sampling, see the
-\href{https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/AddingCustomSamples.pdf}{vignette
-for custom sampling}.
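-
-A minimal sketch of the built-in under-sampling setting described above
-(a 1:1 ratio of outcomes to non-outcomes; the \texttt{sampleSeed}
-argument is assumed to control reproducibility):
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{underSampleSettings }\OtherTok{\textless{}{-}} \FunctionTok{createSampleSettings}\NormalTok{(}
-  \AttributeTok{type =} \StringTok{\textquotesingle{}underSample\textquotesingle{}}\NormalTok{,}
-  \AttributeTok{numberOutcomestoNonOutcomes =} \DecValTok{1}\NormalTok{,}
-  \AttributeTok{sampleSeed =} \DecValTok{1}
-\NormalTok{)}
-\end{Highlighting}
-\end{Shaded}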
-
-It is possible to specify a combination of feature engineering functions
-that take as input the trainData and output a new trainData with
-different features. The default feature engineering setting does
-nothing:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{ featureEngineeringSettings }\OtherTok{\textless{}{-}} \FunctionTok{createFeatureEngineeringSettings}\NormalTok{()}
-\end{Highlighting}
-\end{Shaded}
-
-However, it is possible to add custom feature engineering functions into
-the pipeline, see
-\href{https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/AddingCustomfeatureEngineering.pdf}{vignette
-for custom feature engineering}.
-
-Finally, the preprocessing setting is required. For this setting the
-user can define \texttt{minFraction}: any feature observed in the
-training data for less than this fraction of the patients is removed.
-So, if \texttt{minFraction\ =\ 0.01}, any feature seen in less than 1
-percent of the target population is removed. The input
-\texttt{normalize} specifies whether the features are scaled between 0
-and 1; this is required for certain models (e.g., LASSO logistic
-regression). The input \texttt{removeRedundancy} specifies whether
-features that are observed in all of the target population are removed.
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{  preprocessSettings }\OtherTok{\textless{}{-}} \FunctionTok{createPreprocessSettings}\NormalTok{(}
-    \AttributeTok{minFraction =} \FloatTok{0.01}\NormalTok{,}
-    \AttributeTok{normalize =} \ConstantTok{TRUE}\NormalTok{,}
-    \AttributeTok{removeRedundancy =} \ConstantTok{TRUE}
-\NormalTok{  )}
-\end{Highlighting}
-\end{Shaded}
-
-\hypertarget{model-development-1}{%
-\subsubsection{Model Development}\label{model-development-1}}
-
-In the set function of an algorithm the user can specify a list of
-eligible values for each hyper-parameter. All possible combinations of
-the hyper-parameters are included in a so-called grid search using
-cross-validation on the training set. If a user does not specify any
-value then the default value is used instead.
-
-For example, if we use the following settings for the
-gradientBoostingMachine: ntrees=c(100,200), maxDepth=4 the grid search
-will apply the gradient boosting machine algorithm with ntrees=100 and
-maxDepth=4 plus the default settings for the other hyper-parameters, and
-with ntrees=200 and maxDepth=4 plus the default settings for the other
-hyper-parameters. The hyper-parameters that lead to the best
-cross-validation performance will then be chosen for the final model.
-For this problem we choose to build a gradient boosting machine with the
-hyper-parameter grid specified in our study design:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{ gbmModel }\OtherTok{\textless{}{-}} \FunctionTok{setGradientBoostingMachine}\NormalTok{(}
- \AttributeTok{ntrees =} \DecValTok{5000}\NormalTok{, }
- \AttributeTok{maxDepth =} \FunctionTok{c}\NormalTok{(}\DecValTok{4}\NormalTok{,}\DecValTok{7}\NormalTok{,}\DecValTok{10}\NormalTok{), }
- \AttributeTok{learnRate =} \FunctionTok{c}\NormalTok{(}\FloatTok{0.001}\NormalTok{,}\FloatTok{0.01}\NormalTok{,}\FloatTok{0.1}\NormalTok{,}\FloatTok{0.9}\NormalTok{)}
-\NormalTok{ )}
-\end{Highlighting}
-\end{Shaded}
-
-The \texttt{runPlp} function requires the \texttt{plpData}, the
-\texttt{outcomeId} specifying the outcome being predicted and the
-settings: \texttt{populationSettings}, \texttt{splitSettings},
-\texttt{sampleSettings}, \texttt{featureEngineeringSettings},
-\texttt{preprocessSettings} and \texttt{modelSettings} to train and
-evaluate the model.
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{  gbmResults }\OtherTok{\textless{}{-}} \FunctionTok{runPlp}\NormalTok{(}
-    \AttributeTok{plpData =}\NormalTok{ plpData,}
-    \AttributeTok{outcomeId =} \DecValTok{2}\NormalTok{,}
-    \AttributeTok{analysisId =} \StringTok{\textquotesingle{}singleDemo2\textquotesingle{}}\NormalTok{,}
-    \AttributeTok{analysisName =} \StringTok{\textquotesingle{}Demonstration of runPlp for training single PLP models\textquotesingle{}}\NormalTok{,}
-    \AttributeTok{populationSettings =}\NormalTok{ populationSettings,}
-    \AttributeTok{splitSettings =}\NormalTok{ splitSettings,}
-    \AttributeTok{sampleSettings =}\NormalTok{ sampleSettings,}
-    \AttributeTok{featureEngineeringSettings =}\NormalTok{ featureEngineeringSettings,}
-    \AttributeTok{preprocessSettings =}\NormalTok{ preprocessSettings,}
-    \AttributeTok{modelSettings =}\NormalTok{ gbmModel,}
-    \AttributeTok{logSettings =} \FunctionTok{createLogSettings}\NormalTok{(),}
-    \AttributeTok{executeSettings =} \FunctionTok{createExecuteSettings}\NormalTok{(}
-      \AttributeTok{runSplitData =} \ConstantTok{TRUE}\NormalTok{,}
-      \AttributeTok{runSampleData =} \ConstantTok{TRUE}\NormalTok{,}
-      \AttributeTok{runfeatureEngineering =} \ConstantTok{TRUE}\NormalTok{,}
-      \AttributeTok{runPreprocessData =} \ConstantTok{TRUE}\NormalTok{,}
-      \AttributeTok{runModelDevelopment =} \ConstantTok{TRUE}\NormalTok{,}
-      \AttributeTok{runCovariateSummary =} \ConstantTok{TRUE}
-\NormalTok{    ),}
-    \AttributeTok{saveDirectory =} \FunctionTok{file.path}\NormalTok{(}\FunctionTok{getwd}\NormalTok{(), }\StringTok{\textquotesingle{}singlePlpExample2\textquotesingle{}}\NormalTok{)}
-\NormalTok{  )}
-\end{Highlighting}
-\end{Shaded}
-
-Under the hood the package will now use the R xgboost package to fit a
-gradient boosting machine model using 75\% of the data and will evaluate
-the model on the remaining 25\%. A results data structure is returned
-containing information about the model, its performance, etc.
-
-You can save the model using:
-
-\begin{Shaded}
-\begin{Highlighting}[]
- \FunctionTok{savePlpModel}\NormalTok{(gbmResults}\SpecialCharTok{$}\NormalTok{model, }\AttributeTok{dirPath =} \FunctionTok{file.path}\NormalTok{(}\FunctionTok{getwd}\NormalTok{(), }\StringTok{"model"}\NormalTok{))}
-\end{Highlighting}
-\end{Shaded}
-
-You can load the model using:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{ plpModel }\OtherTok{\textless{}{-}} \FunctionTok{loadPlpModel}\NormalTok{(}\FunctionTok{file.path}\NormalTok{(}\FunctionTok{getwd}\NormalTok{(),}\StringTok{\textquotesingle{}model\textquotesingle{}}\NormalTok{))}
-\end{Highlighting}
-\end{Shaded}
-
-You can also save the full results structure using:
-
-\begin{Shaded}
-\begin{Highlighting}[]
- \FunctionTok{savePlpResult}\NormalTok{(gbmResults, }\AttributeTok{location =} \FunctionTok{file.path}\NormalTok{(}\FunctionTok{getwd}\NormalTok{(),}\StringTok{\textquotesingle{}gbm\textquotesingle{}}\NormalTok{))}
-\end{Highlighting}
-\end{Shaded}
-
-To load the full results structure use:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{ gbmResults }\OtherTok{\textless{}{-}} \FunctionTok{loadPlpResult}\NormalTok{(}\FunctionTok{file.path}\NormalTok{(}\FunctionTok{getwd}\NormalTok{(),}\StringTok{\textquotesingle{}gbm\textquotesingle{}}\NormalTok{))}
-\end{Highlighting}
-\end{Shaded}
-
-\newpage
-
-\hypertarget{study-package-creation}{%
-\section{Study package creation}\label{study-package-creation}}
-
-The script we created manually above can also be automatically created
-using a powerful feature in ATLAS. By creating a new prediction study
-(left menu) you can select the Target and Outcome as created in ATLAS,
-set all the study parameters, and then download an R package that you
-can use to execute your study. What is really powerful is that you can
-add multiple Ts, Os, covariate settings, etc. The package will then run
-all the combinations automatically as separate analyses. The screenshots
-below explain this process.
-
-\begin{enumerate}
-\def\labelenumi{\arabic{enumi})}
-\item
- Create a new prediction study and select your target and outcome
- cohorts.
-
- \includegraphics{atlasplp1.webp}
-\item
-  Specify one or more analysis settings.
-
-  \includegraphics{atlasplp2.webp}
-
- \newpage
-\item
-  Specify the training settings.
-
-  \includegraphics{atlasplp3.webp}
-\item
-  Specify the execution settings.
-
-  \includegraphics{atlasplp4.webp}
-
-  \newpage
-\end{enumerate}
-
-ATLAS can build an R package for you that will execute the full study
-against your CDM. The steps below explain how to do this in ATLAS.
-
-\begin{enumerate}
-\def\labelenumi{\arabic{enumi})}
-\item
-  Under Utilities you can find the download option. Click on the button
-  to review the full study specification.
-
- \begin{figure}
- \centering
- \includegraphics{atlasdownload1.webp}
- \caption{R package download functionality in ATLAS}
- \end{figure}
-\item
-  You now have to confirm that you indeed want to run all these
-  analyses (the cartesian product of all the settings for each T and O
-  combination).
-
- \begin{figure}
- \centering
- \includegraphics{atlasdownload2.webp}
- \caption{R package download functionality in ATLAS}
- \end{figure}
-\item
-  If you agree, you give the package a name and download the package as
-  a zip file.
-\item
-  By opening the R package in RStudio and building the package, you can
-  run the study using the \texttt{execute} function (a sketch follows
-  this list). There is also an example CodeToRun.R script available in
-  the extras folder of the package with extra instructions.
-\end{enumerate}
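-
-As a hedged sketch, running such a generated package typically boils
-down to a single \texttt{execute} call; the exact function arguments
-vary by ATLAS version, so the generated CodeToRun.R is the authoritative
-template (the package name below is hypothetical):
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\CommentTok{\# sketch only; consult the generated CodeToRun.R}
-\FunctionTok{library}\NormalTok{(myStudyPackage)}
-\FunctionTok{execute}\NormalTok{(}
-  \AttributeTok{connectionDetails =}\NormalTok{ connectionDetails,}
-  \AttributeTok{cdmDatabaseSchema =}\NormalTok{ cdmDatabaseSchema,}
-  \AttributeTok{outputFolder =} \StringTok{\textquotesingle{}./results\textquotesingle{}}
-\NormalTok{)}
-\end{Highlighting}
-\end{Shaded}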
-
-\hypertarget{internal-validation}{%
-\section{Internal validation}\label{internal-validation}}
-
-Once we execute the study, the runPlp() function returns the trained
-model and the evaluation of the model on the train/test sets.
-
-You can interactively view the results by running the \texttt{viewPlp}
-function as shown below. This will generate a Shiny App in your browser
-in which you can view all performance measures created by the framework;
-an example is shown in the figure below.
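-
-\begin{Shaded}
-\begin{Highlighting}[]
-\FunctionTok{viewPlp}\NormalTok{(}\AttributeTok{runPlp =}\NormalTok{ lrResults)}
-\end{Highlighting}
-\end{Shaded}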
-
-\begin{figure}
-\centering
-\includegraphics{shinysummary.webp}
-\caption{Summary of all the performance measures of the analyses}
-\end{figure}
-
-Furthermore, many interactive plots are available in the Shiny App, for
-example the ROC curve in which you can move over the plot to see the
-threshold and the corresponding sensitivity and specificity values.
-
-\begin{figure}
-\centering
-\includegraphics{shinyroc.webp}
-\caption{Example of the interactive ROC curve}
-\end{figure}
-
-To generate and save all the evaluation plots to a folder run the
-following code:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\FunctionTok{plotPlp}\NormalTok{(lrResults, }\AttributeTok{dirPath=}\FunctionTok{getwd}\NormalTok{())}
-\end{Highlighting}
-\end{Shaded}
-
-The plots are described in more detail in the next sections.
-
-\newpage
-
-\hypertarget{discrimination}{%
-\subsection{Discrimination}\label{discrimination}}
-
-The Receiver Operating Characteristic (ROC) plot shows the sensitivity
-against 1-specificity on the test set. The plot illustrates how well the
-model is able to discriminate between the people with the outcome and
-those without. The dashed diagonal line is the performance of a model
-that randomly assigns predictions. The higher the area under the ROC
-plot the better the discrimination of the model. The plot is created by
-changing the probability threshold to assign the positive class.
-
-\begin{figure}
-\centering
-\includegraphics{sparseRoc.webp}
-\caption{Receiver Operating Characteristic Plot}
-\end{figure}
-
-\newpage
-
-\hypertarget{calibration}{%
-\subsection{Calibration}\label{calibration}}
-
-The calibration plot shows how close the predicted risk is to the
-observed risk. The diagonal dashed line thus indicates a perfectly
-calibrated model. The ten (or fewer) dots represent the mean predicted
-values for each quantile plotted against the observed fraction of people
-in that quantile who had the outcome (observed fraction). The straight
-black line is the linear regression fitted through these quantile
-points, and the vertical lines represent the 95\% lower and upper
-confidence intervals of the slope of the fitted line.
-
-\begin{figure}
-\centering
-\includegraphics{sparseCalibration.webp}
-\caption{Calibration Plot}
-\end{figure}
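-
-To illustrate how the plotted points are constructed, here is a rough
-sketch in base R. It assumes the prediction data frame returned by
-\texttt{runPlp} has a \texttt{value} column (predicted risk) and an
-\texttt{outcomeCount} column (0/1 outcome), and that the decile breaks
-are unique:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\CommentTok{\# sketch: mean predicted vs observed risk per decile}
-\NormalTok{prediction }\OtherTok{\textless{}{-}}\NormalTok{ lrResults}\SpecialCharTok{$}\NormalTok{prediction}
-\NormalTok{deciles }\OtherTok{\textless{}{-}} \FunctionTok{cut}\NormalTok{(prediction}\SpecialCharTok{$}\NormalTok{value,}
-  \FunctionTok{quantile}\NormalTok{(prediction}\SpecialCharTok{$}\NormalTok{value, }\FunctionTok{seq}\NormalTok{(}\DecValTok{0}\NormalTok{, }\DecValTok{1}\NormalTok{, }\FloatTok{0.1}\NormalTok{)), }\AttributeTok{include.lowest =} \ConstantTok{TRUE}\NormalTok{)}
-\NormalTok{predicted }\OtherTok{\textless{}{-}} \FunctionTok{tapply}\NormalTok{(prediction}\SpecialCharTok{$}\NormalTok{value, deciles, mean)}
-\NormalTok{observed }\OtherTok{\textless{}{-}} \FunctionTok{tapply}\NormalTok{(prediction}\SpecialCharTok{$}\NormalTok{outcomeCount, deciles, mean)}
-\FunctionTok{plot}\NormalTok{(predicted, observed); }\FunctionTok{abline}\NormalTok{(}\DecValTok{0}\NormalTok{, }\DecValTok{1}\NormalTok{, }\AttributeTok{lty =} \DecValTok{2}\NormalTok{)}
-\end{Highlighting}
-\end{Shaded}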
-
-\newpage
-
-\hypertarget{smooth-calibration}{%
-\subsection{Smooth Calibration}\label{smooth-calibration}}
-
-Similar to the traditional calibration plot shown above, the smooth
-calibration plot shows the relationship between predicted and observed
-risk. The major difference is that the smooth fit allows for a more
-fine-grained examination of this relationship. Whereas the traditional
-plot is heavily influenced by the areas with the highest density of
-data, the smooth plot provides the same information for this region as
-well as a more accurate interpretation of areas with lower density. The
-plot also contains information on the distribution of the outcomes
-relative to predicted risk.
-
-However, the increased information gain comes at a computational cost.
-It is recommended to use the traditional plot for examination and then
-to produce the smooth plot for final versions. To create the smooth
-calibration plot you have to run the following command:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\FunctionTok{plotSmoothCalibration}\NormalTok{(lrResults)}
-\end{Highlighting}
-\end{Shaded}
-
-See the help function for more information on how to set the smoothing
-method, etc.
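-
-For instance, assuming the function exposes a \texttt{smooth} argument
-(argument names may differ across package versions; check
-\texttt{?plotSmoothCalibration}), one might request a loess fit:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\CommentTok{\# sketch; argument names may differ across package versions}
-\FunctionTok{plotSmoothCalibration}\NormalTok{(lrResults, }\AttributeTok{smooth =} \StringTok{"loess"}\NormalTok{)}
-\end{Highlighting}
-\end{Shaded}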
-
-The example below is from another study that better demonstrates the
-impact of using a smooth calibration plot. The default line fit would
-not highlight the miscalibration at the lower predicted probability
-levels that well.
-
-\begin{figure}
-\centering
-\includegraphics{smoothCalibration.jpeg}
-\caption{Smooth Calibration plot}
-\end{figure}
-
-\newpage
-
-\hypertarget{preference-distribution}{%
-\subsection{Preference distribution}\label{preference-distribution}}
-
-The preference distribution plots are the preference score distributions
-corresponding to i) people in the test set with the outcome (red) and
-ii) people in the test set without the outcome (blue).
-
-\begin{figure}
-\centering
-\includegraphics{preferencePDF.webp}
-\caption{Preference Plot}
-\end{figure}
-
-\newpage
-
-\hypertarget{predicted-probability-distribution}{%
-\subsection{Predicted probability
-distribution}\label{predicted-probability-distribution}}
-
-The prediction distribution box plots are for the predicted risks of the
-people in the test set with the outcome (class 1: blue) and without the
-outcome (class 0: red).
-
-The box plots in the figure show that the predicted probability of the
-outcome is indeed higher for those with the outcome, but there is also
-overlap between the two distributions, which leads to imperfect
-discrimination.
-
-\begin{figure}
-\centering
-\includegraphics{predictionDistribution.webp}
-\caption{Prediction Distribution Box Plot}
-\end{figure}
-
-\newpage
-
-\hypertarget{test-train-similarity}{%
-\subsection{Test-Train similarity}\label{test-train-similarity}}
-
-The test-train similarity is assessed by plotting the mean covariate
-values in the train set against those in the test set for people with
-and without the outcome.
-
-The results for our example look very promising since the mean values
-of the covariates are on the diagonal.
-
-\begin{figure}
-\centering
-\includegraphics{generalizability.webp}
-\caption{Similarity plots of train and test set}
-\end{figure}
-
-\newpage
-
-\hypertarget{variable-scatter-plot}{%
-\subsection{Variable scatter plot}\label{variable-scatter-plot}}
-
-The variable scatter plot shows the mean covariate value for the people
-with the outcome against the mean covariate value for the people without
-the outcome. The color of the dots corresponds to inclusion in the model
-(green) or exclusion from the model (blue). It is highly recommended to
-use the Shiny App since this allows you to hover over a covariate to
-show more details (name, value, etc.).
-
-The plot shows that the mean of most of the covariates is higher for
-subjects with the outcome compared to those without.
-
-\begin{figure}
-\centering
-\includegraphics{variableScatterplot.webp}
-\caption{Variable Scatter Plot}
-\end{figure}
-
-\newpage
-
-\hypertarget{precision-recall}{%
-\subsection{Precision recall}\label{precision-recall}}
-
-Precision (P) is defined as the number of true positives (Tp) over the
-number of true positives plus the number of false positives (Fp).
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{P }\OtherTok{\textless{}{-}}\NormalTok{ Tp}\SpecialCharTok{/}\NormalTok{(Tp}\SpecialCharTok{+}\NormalTok{Fp)}
-\end{Highlighting}
-\end{Shaded}
-
-Recall (R) is defined as the number of true positives (Tp) over the
-number of true positives plus the number of false negatives (Fn).
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{R }\OtherTok{\textless{}{-}}\NormalTok{ Tp}\SpecialCharTok{/}\NormalTok{(Tp }\SpecialCharTok{+}\NormalTok{ Fn)}
-\end{Highlighting}
-\end{Shaded}
-
-These quantities are also related to the (F1) score, which is defined as
-the harmonic mean of precision and recall.
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{F1 }\OtherTok{\textless{}{-}} \DecValTok{2}\SpecialCharTok{*}\NormalTok{P}\SpecialCharTok{*}\NormalTok{R}\SpecialCharTok{/}\NormalTok{(P}\SpecialCharTok{+}\NormalTok{R)}
-\end{Highlighting}
-\end{Shaded}
-
-Note that the precision can either decrease or increase if the threshold
-is lowered. Lowering the threshold of a classifier may increase the
-denominator, by increasing the number of results returned. If the
-threshold was previously set too high, the new results may all be true
-positives, which will increase precision. If the previous threshold was
-about right or too low, further lowering the threshold will introduce
-false positives, decreasing precision.
-
-For Recall the denominator does not depend on the classifier threshold
-(Tp+Fn is a constant). This means that lowering the classifier threshold
-may increase recall, by increasing the number of true positive results.
-It is also possible that lowering the threshold may leave recall
-unchanged, while the precision fluctuates.
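-
-As a small worked example with made-up confusion-matrix counts:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\CommentTok{\# hypothetical counts at some threshold}
-\NormalTok{Tp }\OtherTok{\textless{}{-}} \DecValTok{80}\NormalTok{; Fp }\OtherTok{\textless{}{-}} \DecValTok{20}\NormalTok{; Fn }\OtherTok{\textless{}{-}} \DecValTok{40}
-\NormalTok{P }\OtherTok{\textless{}{-}}\NormalTok{ Tp}\SpecialCharTok{/}\NormalTok{(Tp }\SpecialCharTok{+}\NormalTok{ Fp)       }\CommentTok{\# 0.80}
-\NormalTok{R }\OtherTok{\textless{}{-}}\NormalTok{ Tp}\SpecialCharTok{/}\NormalTok{(Tp }\SpecialCharTok{+}\NormalTok{ Fn)       }\CommentTok{\# 0.67}
-\NormalTok{F1 }\OtherTok{\textless{}{-}} \DecValTok{2}\SpecialCharTok{*}\NormalTok{P}\SpecialCharTok{*}\NormalTok{R}\SpecialCharTok{/}\NormalTok{(P }\SpecialCharTok{+}\NormalTok{ R)  }\CommentTok{\# 0.73}
-\end{Highlighting}
-\end{Shaded}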
-
-\begin{figure}
-\centering
-\includegraphics{precisionRecall.webp}
-\caption{Precision Recall Plot}
-\end{figure}
-
-\newpage
-
-\hypertarget{demographic-summary}{%
-\subsection{Demographic summary}\label{demographic-summary}}
-
-This plot shows for females and males the expected and observed risk in
-different age groups together with a confidence area.
-
-The results show that our model is well calibrated across gender and age
-groups.
-
-\begin{figure}
-\centering
-\includegraphics{demographicSummary.webp}
-\caption{Demographic Summary Plot}
-\end{figure}
-
-\newpage
-
-\hypertarget{external-validation}{%
-\section{External validation}\label{external-validation}}
-
-We recommend always performing external validation, i.e.~applying the
-final model to as many new datasets as feasible and evaluating its
-performance.
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\CommentTok{\# load the trained model}
-\NormalTok{plpModel }\OtherTok{\textless{}{-}} \FunctionTok{loadPlpModel}\NormalTok{(}\FunctionTok{file.path}\NormalTok{(}\FunctionTok{getwd}\NormalTok{(), }\StringTok{\textquotesingle{}model\textquotesingle{}}\NormalTok{))}
-
-\CommentTok{\# add details of new database}
-\NormalTok{validationDatabaseDetails }\OtherTok{\textless{}{-}} \FunctionTok{createDatabaseDetails}\NormalTok{()}
-
-\CommentTok{\# to externally validate the model and perform recalibration run:}
-\FunctionTok{externalValidateDbPlp}\NormalTok{(}
- \AttributeTok{plpModel =}\NormalTok{ plpModel,}
- \AttributeTok{validationDatabaseDetails =}\NormalTok{ validationDatabaseDetails,}
- \AttributeTok{validationRestrictPlpDataSettings =}\NormalTok{ plpModel}\SpecialCharTok{$}\NormalTok{settings}\SpecialCharTok{$}\NormalTok{plpDataSettings,}
- \AttributeTok{settings =} \FunctionTok{createValidationSettings}\NormalTok{(}
- \AttributeTok{recalibrate =} \StringTok{\textquotesingle{}weakRecalibration\textquotesingle{}}
-\NormalTok{ ),}
- \AttributeTok{outputFolder =} \FunctionTok{getwd}\NormalTok{()}
-\NormalTok{)}
-\end{Highlighting}
-\end{Shaded}
-
-This will extract the new plpData from the specified schemas and cohort
-tables. It will then apply the same population settings and the trained
-plp model. Finally, it will evaluate the performance and return the
-standard output as \texttt{validation\$performanceEvaluation}; it will
-also return the prediction on the population as
-\texttt{validation\$prediction}. These can be inserted into the Shiny
-app for viewing the model and validation by running:
-\texttt{viewPlp(runPlp=plpResult,\ validatePlp=validation)}.
-
-\newpage
-
-\hypertarget{other-functionality}{%
-\section{Other functionality}\label{other-functionality}}
-
-The package has much more functionality than described in this vignette,
-and contributions have been made by many people in the OHDSI community.
-The table below provides an overview:
-
-\begin{longtable}[]{@{}
- >{\raggedright\arraybackslash}p{(\columnwidth - 4\tabcolsep) * \real{0.2361}}
- >{\raggedright\arraybackslash}p{(\columnwidth - 4\tabcolsep) * \real{0.5278}}
- >{\raggedright\arraybackslash}p{(\columnwidth - 4\tabcolsep) * \real{0.2361}}@{}}
-\toprule\noalign{}
-\begin{minipage}[b]{\linewidth}\raggedright
-Functionality
-\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
-Description
-\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
-Vignette
-\end{minipage} \\
-\midrule\noalign{}
-\endhead
-\bottomrule\noalign{}
-\endlastfoot
-Building Multiple Models & This vignette describes how you can run
-multiple models automatically &
-\href{https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/BuildingMultiplePredictiveModels.pdf}{\texttt{Vignette}} \\
-Custom Models & This vignette describes how you can add your own custom
-algorithms in the framework &
-\href{https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/AddingCustomModels.pdf}{\texttt{Vignette}} \\
-Custom Splitting Functions & This vignette describes how you can add
-your own custom training/validation/testing splitting functions in the
-framework &
-\href{https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/AddingCustomSplitting.pdf}{\texttt{Vignette}} \\
-Custom Sampling Functions & This vignette describes how you can add your
-own custom sampling functions in the framework &
-\href{https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/AddingCustomSamples.pdf}{\texttt{Vignette}} \\
-Custom Feature Engineering/Selection & This vignette describes how you
-can add your own custom feature engineering and selection functions in
-the framework &
-\href{https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/AddingCustomFeatureEngineering.pdf}{\texttt{Vignette}} \\
-Ensemble models & This vignette describes how you can use the framework
-to build ensemble models, i.e., combine multiple models in a super
-learner &
-\href{https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/BuildingEnsembleModels.pdf}{\texttt{Vignette}} \\
-Learning curves & Learning curves assess the effect of training set size
-on model performance by training a sequence of prediction models on
-successively larger subsets of the training set. A learning curve plot
-can also help in diagnosing a bias or variance problem as explained
-below. &
-\href{https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/GeneratingLearningCurves.pdf}{\texttt{Vignette}} \\
-\end{longtable}
-
-\hypertarget{demos}{%
-\section{Demos}\label{demos}}
-
-We have added several demos in the package that run on simulated data:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\CommentTok{\# Show all demos in our package: }
-\FunctionTok{demo}\NormalTok{(}\AttributeTok{package =} \StringTok{"PatientLevelPrediction"}\NormalTok{)}
-
-\CommentTok{\# For example, to run the SingleModelDemo that runs Lasso and shows you how to run the Shiny App use this call}
-\FunctionTok{demo}\NormalTok{(}\StringTok{"SingleModelDemo"}\NormalTok{, }\AttributeTok{package =} \StringTok{"PatientLevelPrediction"}\NormalTok{)}
-\end{Highlighting}
-\end{Shaded}
-
-\newpage
-
-\hypertarget{acknowledgments}{%
-\section{Acknowledgments}\label{acknowledgments}}
-
-Considerable work has been dedicated to providing the
-\texttt{PatientLevelPrediction} package.
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\FunctionTok{citation}\NormalTok{(}\StringTok{"PatientLevelPrediction"}\NormalTok{)}
-\end{Highlighting}
-\end{Shaded}
-
-\begin{verbatim}
-## To cite PatientLevelPrediction in publications use:
-##
-## Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek P (2018). "Design and
-## implementation of a standardized framework to generate and evaluate patient-level
-## prediction models using observational healthcare data." _Journal of the American
-## Medical Informatics Association_, *25*(8), 969-975.
-## <https://doi.org/10.1093/jamia/ocy032>.
-##
-## A BibTeX entry for LaTeX users is
-##
-## @Article{,
-## author = {J. M. Reps and M. J. Schuemie and M. A. Suchard and P. B. Ryan and P. Rijnbeek},
-## title = {Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data},
-## journal = {Journal of the American Medical Informatics Association},
-## volume = {25},
-## number = {8},
-## pages = {969-975},
-## year = {2018},
-## url = {https://doi.org/10.1093/jamia/ocy032},
-## }
-\end{verbatim}
-
-Further, \texttt{PatientLevelPrediction} makes extensive use of the
-\texttt{Cyclops} package.
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\FunctionTok{citation}\NormalTok{(}\StringTok{"Cyclops"}\NormalTok{)}
-\end{Highlighting}
-\end{Shaded}
-
-\begin{verbatim}
-## To cite Cyclops in publications use:
-##
-## Suchard MA, Simpson SE, Zorych I, Ryan P, Madigan D (2013). "Massive parallelization of
-## serial inference algorithms for complex generalized linear models." _ACM Transactions
-## on Modeling and Computer Simulation_, *23*, 10.
-## <https://dl.acm.org/doi/10.1145/2414416.2414791>.
-##
-## A BibTeX entry for LaTeX users is
-##
-## @Article{,
-## author = {M. A. Suchard and S. E. Simpson and I. Zorych and P. Ryan and D. Madigan},
-## title = {Massive parallelization of serial inference algorithms for complex generalized linear models},
-## journal = {ACM Transactions on Modeling and Computer Simulation},
-## volume = {23},
-## pages = {10},
-## year = {2013},
-## url = {https://dl.acm.org/doi/10.1145/2414416.2414791},
-## }
-\end{verbatim}
-
-\textbf{Please reference this paper if you use the PLP Package in your
-work:}
-
-\href{http://dx.doi.org/10.1093/jamia/ocy032}{Reps JM, Schuemie MJ,
-Suchard MA, Ryan PB, Rijnbeek PR. Design and implementation of a
-standardized framework to generate and evaluate patient-level prediction
-models using observational healthcare data. J Am Med Inform Assoc.
-2018;25(8):969-975.}
-
-This work is supported in part through the National Science Foundation
-grant IIS 1251151.
-
-\newpage
-
-\hypertarget{appendix-1-study-population-settings-details}{%
-\section*{Appendix 1: Study population settings
-details}\label{appendix-1-study-population-settings-details}}
-\addcontentsline{toc}{section}{Appendix 1: Study population settings
-details}
-
-In the figures below the effect is shown of the
-removeSubjectsWithPriorOutcome, requireTimeAtRisk, and includeAllOutcomes
-booleans on the final study population. We start with a Target Cohort
-with firstExposureOnly = false and we require a washout period = 1095.
-We then subset the target cohort based on additional constraints. The
-final study population in the Venn diagrams below is colored green.
-
-\begin{enumerate}
-\def\labelenumi{\arabic{enumi})}
-\item
-  Require minimum time-at-risk for all persons in the target cohort
-
- \includegraphics{popdef1.webp}
-\item
-  Require minimum time-at-risk for the target cohort, except for
-  persons with outcomes during time-at-risk.
-
- \includegraphics{popdef2.webp}
-\end{enumerate}
-
-\newpage
-
-\begin{enumerate}
-\def\labelenumi{\arabic{enumi})}
-\setcounter{enumi}{2}
-\item
-  Include all persons in the target cohort, exclude persons with prior
-  outcomes
-
-  \includegraphics{popdef3.webp}
-\end{enumerate}
-
-\begin{enumerate}
-\def\labelenumi{\arabic{enumi})}
-\setcounter{enumi}{3}
-\item
- Require minimum time-at-risk for target cohort, except for persons
- with outcomes during time-at-risk, exclude persons with prior outcomes
-
- \includegraphics{popdef4.webp}
-\end{enumerate}
-
-\newpage
-
-\begin{enumerate}
-\def\labelenumi{\arabic{enumi})}
-\setcounter{enumi}{4}
-\item
-  Include all persons in target cohort, exclude persons with prior
-  outcomes
-
-  \includegraphics{popdef5.webp}
-\end{enumerate}
-
-\begin{enumerate}
-\def\labelenumi{\arabic{enumi})}
-\setcounter{enumi}{5}
-\item
- Include all persons in target cohort
-
- \includegraphics{popdef6.webp}
-\end{enumerate}
-
-\end{document}
diff --git a/inst/doc/CreatingLearningCurves.pdf b/inst/doc/CreatingLearningCurves.pdf
deleted file mode 100644
index cfa6cf8ef..000000000
Binary files a/inst/doc/CreatingLearningCurves.pdf and /dev/null differ
diff --git a/inst/doc/CreatingNetworkstudies.pdf b/inst/doc/CreatingNetworkstudies.pdf
deleted file mode 100644
index 57a187a82..000000000
Binary files a/inst/doc/CreatingNetworkstudies.pdf and /dev/null differ
diff --git a/inst/doc/CreatingShinyApp.pdf b/inst/doc/CreatingShinyApp.pdf
deleted file mode 100644
index 6ad7fa645..000000000
Binary files a/inst/doc/CreatingShinyApp.pdf and /dev/null differ
diff --git a/inst/doc/InstallationGuide.pdf b/inst/doc/InstallationGuide.pdf
deleted file mode 100644
index 605054361..000000000
Binary files a/inst/doc/InstallationGuide.pdf and /dev/null differ
diff --git a/inst/sql/sql_server/CreateCohorts.sql b/inst/sql/sql_server/CreateCohorts.sql
index 20279339f..90a945ee8 100644
--- a/inst/sql/sql_server/CreateCohorts.sql
+++ b/inst/sql/sql_server/CreateCohorts.sql
@@ -1,5 +1,5 @@
/************************************************************************
-@file GetCohorts.sql
+@file CreateCohorts.sql
Copyright 2021 Observational Health Data Sciences and Informatics
This file is part of CohortMethod
Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/man/MapIds.Rd b/man/MapIds.Rd
index e10d16ba3..6c5b5f728 100644
--- a/man/MapIds.Rd
+++ b/man/MapIds.Rd
@@ -14,6 +14,6 @@ MapIds(covariateData, cohort = NULL, mapping = NULL)
\item{mapping}{A pre defined mapping to use}
}
\description{
-this functions takes covariate data and a cohort/population and remaps
+this functions takes covariate data and a cohort/population and remaps
the covariate and row ids, restricts to pop and saves/creates mapping
}
diff --git a/man/PatientLevelPrediction.Rd b/man/PatientLevelPrediction.Rd
index 4946949d9..4a632609e 100644
--- a/man/PatientLevelPrediction.Rd
+++ b/man/PatientLevelPrediction.Rd
@@ -29,5 +29,10 @@ Authors:
\item Peter Rijnbeek
}
+Other contributors:
+\itemize{
+ \item Observational Health Data Science and Informatics [copyright holder]
+}
+
}
\keyword{internal}
diff --git a/man/addDiagnosePlpToDatabase.Rd b/man/addDiagnosePlpToDatabase.Rd
index 483575654..e94a5188d 100644
--- a/man/addDiagnosePlpToDatabase.Rd
+++ b/man/addDiagnosePlpToDatabase.Rd
@@ -10,7 +10,7 @@ addDiagnosePlpToDatabase(
databaseSchemaSettings,
cohortDefinitions,
databaseList = NULL,
- overWriteIfExists = T
+ overWriteIfExists = TRUE
)
}
\arguments{
diff --git a/man/calibrationInLarge.Rd b/man/calibrationInLarge.Rd
new file mode 100644
index 000000000..28505d42d
--- /dev/null
+++ b/man/calibrationInLarge.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/EvaluationSummary.R
+\name{calibrationInLarge}
+\alias{calibrationInLarge}
+\title{Calculate the calibration in large}
+\usage{
+calibrationInLarge(prediction)
+}
+\arguments{
+\item{prediction}{A prediction dataframe}
+}
+\value{
+data.frame with meanPredictionRisk, observedRisk, and N
+}
+\description{
+Calculate the calibration in large
+}
+\keyword{internal}
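
For orientation, the calculation returned above can be sketched in a few lines of R. A minimal sketch, assuming the prediction dataframe carries the predicted risk in a 'value' column and the observed 0/1 outcome in an 'outcomeCount' column (the column names are assumptions, not taken from this file):

    # sketch: compare the mean predicted risk with the observed outcome rate
    calibrationInLargeSketch <- function(prediction) {
      data.frame(
        meanPredictionRisk = mean(prediction$value),   # average predicted risk
        observedRisk = mean(prediction$outcomeCount),  # observed event rate
        N = nrow(prediction)                           # number of subjects
      )
    }
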
diff --git a/man/checkSurvivalPackages.Rd b/man/checkSurvivalPackages.Rd
new file mode 100644
index 000000000..aa1a4ddba
--- /dev/null
+++ b/man/checkSurvivalPackages.Rd
@@ -0,0 +1,12 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/HelperFunctions.R
+\name{checkSurvivalPackages}
+\alias{checkSurvivalPackages}
+\title{Check if the required packages for survival analysis are installed}
+\usage{
+checkSurvivalPackages()
+}
+\description{
+Check if the required packages for survival analysis are installed
+}
+\keyword{internal}
diff --git a/man/covariateSummary.Rd b/man/covariateSummary.Rd
index f0fb6a27b..5bf3e30f9 100644
--- a/man/covariateSummary.Rd
+++ b/man/covariateSummary.Rd
@@ -14,7 +14,7 @@ covariateSummary(
)
}
\arguments{
-\item{covariateData}{The covariateData part of the plpData that is
+\item{covariateData}{The covariateData part of the plpData that is
extracted using \code{getPlpData}}
\item{cohort}{The patient cohort to calculate the summary}
@@ -23,7 +23,7 @@ extracted using \code{getPlpData}}
\item{strata}{A data.frame containing the columns rowId, strataName}
-\item{variableImportance}{A data.frame with the columns covariateId and
+\item{variableImportance}{A data.frame with the columns covariateId and
value (the variable importance value)}
\item{featureEngineering}{(currently not used )
diff --git a/man/createCohortCovariateSettings.Rd b/man/createCohortCovariateSettings.Rd
index 37ec35177..5458e8913 100644
--- a/man/createCohortCovariateSettings.Rd
+++ b/man/createCohortCovariateSettings.Rd
@@ -12,9 +12,9 @@ createCohortCovariateSettings(
cohortId,
startDay = -30,
endDay = 0,
- count = F,
- ageInteraction = F,
- lnAgeInteraction = F,
+ count = FALSE,
+ ageInteraction = FALSE,
+ lnAgeInteraction = FALSE,
analysisId = 456
)
}
@@ -23,10 +23,10 @@ createCohortCovariateSettings(
\item{settingId}{A unique id for the covariate time and}
-\item{cohortDatabaseSchema}{The schema of the database with the cohort. If
+\item{cohortDatabaseSchema}{The schema of the database with the cohort. If
nothing is specified then the cohortDatabaseSchema from databaseDetails at runtime is used.}
-\item{cohortTable}{the table name that contains the covariate cohort. If
+\item{cohortTable}{the table name that contains the covariate cohort. If
nothing is specified then the cohortTable from databaseDetails at runtime is used.}
\item{cohortId}{cohort id for the covariate cohort}
diff --git a/man/createDatabaseDetails.Rd b/man/createDatabaseDetails.Rd
index f848843cd..86c7fbc72 100644
--- a/man/createDatabaseDetails.Rd
+++ b/man/createDatabaseDetails.Rd
@@ -69,6 +69,6 @@ A list with the the database specific settings (this is used by the runMultipleP
Create a setting that holds the details about the cdmDatabase connection for data extraction
}
\details{
-This function simply stores the settings for communicating with the cdmDatabase when extracting
+This function simply stores the settings for communicating with the cdmDatabase when extracting
the target cohort and outcomes
}
diff --git a/man/createDefaultSplitSetting.Rd b/man/createDefaultSplitSetting.Rd
index b9020bd45..e63e74375 100644
--- a/man/createDefaultSplitSetting.Rd
+++ b/man/createDefaultSplitSetting.Rd
@@ -2,8 +2,9 @@
% Please edit documentation in R/DataSplitting.R
\name{createDefaultSplitSetting}
\alias{createDefaultSplitSetting}
-\title{Create the settings for defining how the plpData are split into test/validation/train sets using
-default splitting functions (either random stratified by outcome, time or subject splitting)}
+\title{Create the settings for defining how the plpData are split into
+test/validation/train sets using default splitting functions
+(either random stratified by outcome, time or subject splitting)}
\usage{
createDefaultSplitSetting(
testFraction = 0.25,
@@ -14,28 +15,39 @@ createDefaultSplitSetting(
)
}
\arguments{
-\item{testFraction}{(numeric) A real number between 0 and 1 indicating the test set fraction of the data}
+\item{testFraction}{(numeric) A real number between 0 and 1
+indicating the test set fraction of the data}
-\item{trainFraction}{(numeric) A real number between 0 and 1 indicating the train set fraction of the data.
-If not set train is equal to 1 - test}
+\item{trainFraction}{(numeric) A real number between 0 and 1 indicating the
+train set fraction of the data. If not set train is equal to 1 - test}
-\item{splitSeed}{(numeric) A seed to use when splitting the data for reproducibility (if not set a random number will be generated)}
+\item{splitSeed}{(numeric) A seed to use when splitting the data for
+reproducibility (if not set a random number will be generated)}
-\item{nfold}{(numeric) An integer > 1 specifying the number of folds used in cross validation}
+\item{nfold}{(numeric) An integer > 1 specifying the number of
+folds used in cross validation}
-\item{type}{(character) Choice of: \itemize{
-\item'stratified' Each data point is randomly assigned into the test or a train fold set but this is done stratified such that the outcome rate is consistent in each partition
-\item'time' Older data are assigned into the training set and newer data are assigned into the test set
-\item'subject' Data are partitioned by subject, if a subject is in the data more than once, all the data points for the subject are assigned either into the test data or into the train data (not both).
-}}
+\item{type}{(character) Choice of: \itemize{
+ \item'stratified' Each data point is
+randomly assigned into the test or a train fold set but this is done
+stratified such that the outcome rate is consistent in each partition
+ \item'time' Older data are assigned
+into the training set and newer data are assigned into the test set
+ \item'subject' Data are partitioned by
+subject, if a subject is in the data more than once, all the data points for
+the subject are assigned either into the test data or into the train data
+(not both).
+ }}
}
\value{
An object of class \code{splitSettings}
}
\description{
-Create the settings for defining how the plpData are split into test/validation/train sets using
-default splitting functions (either random stratified by outcome, time or subject splitting)
+Create the settings for defining how the plpData are split into
+test/validation/train sets using default splitting functions
+(either random stratified by outcome, time or subject splitting)
}
\details{
-Returns an object of class \code{splitSettings} that specifies the splitting function that will be called and the settings
+Returns an object of class \code{splitSettings} that specifies the
+splitting function that will be called and the settings
}
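
A short usage sketch of the default stratified split described above; the argument values are illustrative, mirroring the defaults used elsewhere in this diff:

    splitSettings <- PatientLevelPrediction::createDefaultSplitSetting(
      testFraction = 0.25,  # 25% of the data held out for testing
      trainFraction = 0.75,
      splitSeed = 42,       # fixed seed for a reproducible split
      nfold = 3,            # 3-fold cross validation within the train set
      type = "stratified"   # keep the outcome rate consistent across partitions
    )
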
diff --git a/man/createDownloadTasks.Rd b/man/createDownloadTasks.Rd
new file mode 100644
index 000000000..194503eea
--- /dev/null
+++ b/man/createDownloadTasks.Rd
@@ -0,0 +1,26 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/ExternalValidatePlp.R
+\name{createDownloadTasks}
+\alias{createDownloadTasks}
+\title{createDownloadTasks
+create download tasks based on unique combinations of targetId and
+restrictPlpDataSettings. It adds all covariateSettings and outcomes that
+have that targetId and restrictPlpDataSettings. This is used to avoid
+downloading the same data multiple times.}
+\usage{
+createDownloadTasks(validationDesignList)
+}
+\arguments{
+\item{validationDesignList}{A list of validationDesign objects}
+}
+\value{
+A dataframe where each row is a downloadTask
+}
+\description{
+createDownloadTasks
+create download tasks based on unique combinations of targetId and
+restrictPlpDataSettings. It adds all covariateSettings and outcomes that
+have that targetId and restrictPlpDataSettings. This is used to avoid
+downloading the same data multiple times.
+}
+\keyword{internal}
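
Conceptually, the deduplication described above boils down to keying each design by its (targetId, restrictPlpDataSettings) pair and keeping the unique pairs. A rough sketch of the idea, not the package's internal code; the list-element names and the use of digest() to fingerprint the settings object are assumptions:

    # one fingerprint per design, then keep unique (targetId, settings) pairs
    designKeys <- data.frame(
      targetId = vapply(validationDesignList, function(x) x$targetId, numeric(1)),
      settingsKey = vapply(
        validationDesignList,
        function(x) digest::digest(x$restrictPlpDataSettings),  # hash the settings
        character(1)
      )
    )
    downloadTasks <- unique(designKeys)  # each row = one data extraction
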
diff --git a/man/createExecuteSettings.Rd b/man/createExecuteSettings.Rd
index 8d58eb259..7cb647de6 100644
--- a/man/createExecuteSettings.Rd
+++ b/man/createExecuteSettings.Rd
@@ -5,12 +5,12 @@
\title{Creates list of settings specifying what parts of runPlp to execute}
\usage{
createExecuteSettings(
- runSplitData = F,
- runSampleData = F,
- runfeatureEngineering = F,
- runPreprocessData = F,
- runModelDevelopment = F,
- runCovariateSummary = F
+ runSplitData = FALSE,
+ runSampleData = FALSE,
+ runfeatureEngineering = FALSE,
+ runPreprocessData = FALSE,
+ runModelDevelopment = FALSE,
+ runCovariateSummary = FALSE
)
}
\arguments{
diff --git a/man/createExistingSplitSettings.Rd b/man/createExistingSplitSettings.Rd
new file mode 100644
index 000000000..18ffda495
--- /dev/null
+++ b/man/createExistingSplitSettings.Rd
@@ -0,0 +1,23 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/DataSplitting.R
+\name{createExistingSplitSettings}
+\alias{createExistingSplitSettings}
+\title{Create the settings for defining how the plpData are split into
+test/validation/train sets using an existing split - good to use for
+reproducing results from a different run}
+\usage{
+createExistingSplitSettings(splitIds)
+}
+\arguments{
+\item{splitIds}{(data.frame) A data frame with rowId and index columns of
+type integer/numeric. Index is -1 for test set, positive integer for train
+set folds}
+}
+\value{
+An object of class \code{splitSettings}
+}
+\description{
+Create the settings for defining how the plpData are split into
+test/validation/train sets using an existing split - good to use for
+reproducing results from a different run
+}
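
A small sketch of a valid splitIds input per the argument description above (the row counts are illustrative):

    splitIds <- data.frame(
      rowId = 1:6,
      index = c(-1, -1, 1, 2, 3, 1)  # -1 = test set; 1..3 = train folds
    )
    splitSettings <- PatientLevelPrediction::createExistingSplitSettings(splitIds)
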
diff --git a/man/createFeatureEngineeringMapColumnsSettings.Rd b/man/createFeatureEngineeringMapColumnsSettings.Rd
index dc592321f..e6494813d 100644
--- a/man/createFeatureEngineeringMapColumnsSettings.Rd
+++ b/man/createFeatureEngineeringMapColumnsSettings.Rd
@@ -2,24 +2,24 @@
% Please edit documentation in R/ExistingPython.R
\name{createFeatureEngineeringMapColumnsSettings}
\alias{createFeatureEngineeringMapColumnsSettings}
-\title{Create settings that enable you to convert from standard covariateIds to
+\title{Create settings that enable you to convert from standard covariateIds to
model covariate names - this is useful when implementing a model developed
outside of the OHDSI tools.}
\usage{
createFeatureEngineeringMapColumnsSettings(columnMap)
}
\arguments{
-\item{columnMap}{A data.frame containing the columns: covariateId the covariate ID from FeatureExtraction and
+\item{columnMap}{A data.frame containing the columns: covariateId the covariate ID from FeatureExtraction and
modelCovariateIdName which is the column name used when fitting the model.}
}
\value{
-An object of class featureEngineeringSettings that will convert column names
+An object of class \code{featureEngineeringSettings} that will convert column names
}
\description{
-Create settings that enable you to convert from standard covariateIds to
+Create settings that enable you to convert from standard covariateIds to
model covariate names - this is useful when implementing a model developed
outside of the OHDSI tools.
}
\details{
-This function create settings that let you rename the covariates in the plpData object
+This function creates settings that let you rename the covariates in the plpData object
}
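
As an illustration of the columnMap argument (covariateId 1002, age in years, is the example used in the createSciKitLearnModel docs later in this diff; the model column name is hypothetical):

    columnMap <- data.frame(
      covariateId = 1002,           # FeatureExtraction ID for age in years
      modelCovariateIdName = "age"  # column name the external model was fit with
    )
    featureEngineeringSettings <-
      PatientLevelPrediction::createFeatureEngineeringMapColumnsSettings(columnMap)
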
diff --git a/man/createFeatureEngineeringSettings.Rd b/man/createFeatureEngineeringSettings.Rd
index 0b7a0a8d8..9772a9a15 100644
--- a/man/createFeatureEngineeringSettings.Rd
+++ b/man/createFeatureEngineeringSettings.Rd
@@ -8,7 +8,7 @@ createFeatureEngineeringSettings(type = "none")
}
\arguments{
\item{type}{(character) Choice of: \itemize{
-\item'none' No feature engineering - this is the default
+\item'none' No feature engineering - this is the default
}}
}
\value{
diff --git a/man/createGlmModel.Rd b/man/createGlmModel.Rd
index 6a6509fd7..1ba11ca09 100644
--- a/man/createGlmModel.Rd
+++ b/man/createGlmModel.Rd
@@ -9,7 +9,7 @@ createGlmModel(coefficients, intercept = 0, finalMapping = "logistic")
\arguments{
\item{coefficients}{A dataframe containing two columns, coefficients and
covariateId, both of type numeric. The covariateId column must contain
-valid covariateIds that match those used in the /code{FeatureExtraction}
+valid covariateIds that match those used in the \code{FeatureExtraction}
package.}
\item{intercept}{A numeric value representing the intercept of the model.}
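
A minimal sketch of plugging in an existing logistic model via the usage above; the covariateIds and coefficient values are made up for illustration:

    coefficients <- data.frame(
      covariateId = c(1002, 8532001),  # hypothetical FeatureExtraction IDs
      coefficients = c(0.05, 0.8)
    )
    plpModel <- PatientLevelPrediction::createGlmModel(
      coefficients = coefficients,
      intercept = -3,              # illustrative intercept
      finalMapping = "logistic"    # map the linear predictor to a risk
    )
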
diff --git a/man/createLearningCurve.Rd b/man/createLearningCurve.Rd
index e6c113a13..ad8604843 100644
--- a/man/createLearningCurve.Rd
+++ b/man/createLearningCurve.Rd
@@ -7,7 +7,7 @@
createLearningCurve(
plpData,
outcomeId,
- parallel = T,
+ parallel = TRUE,
cores = 4,
modelSettings,
saveDirectory = getwd(),
@@ -18,11 +18,11 @@ createLearningCurve(
trainEvents = NULL,
sampleSettings = createSampleSettings(),
featureEngineeringSettings = createFeatureEngineeringSettings(),
- preprocessSettings = createPreprocessSettings(minFraction = 0.001, normalize = T),
+ preprocessSettings = createPreprocessSettings(minFraction = 0.001, normalize = TRUE),
logSettings = createLogSettings(),
- executeSettings = createExecuteSettings(runSplitData = T, runSampleData = F,
- runfeatureEngineering = F, runPreprocessData = T, runModelDevelopment = T,
- runCovariateSummary = F)
+ executeSettings = createExecuteSettings(runSplitData = TRUE, runSampleData = FALSE,
+ runfeatureEngineering = FALSE, runPreprocessData = TRUE, runModelDevelopment = TRUE,
+ runCovariateSummary = FALSE)
)
}
\arguments{
@@ -50,7 +50,7 @@ data extracted from the CDM.}
\item{analysisId}{(integer) Identifier for the analysis. It is used to create, e.g., the result folder. Default is a timestamp.}
\item{populationSettings}{An object of type \code{populationSettings} created using \code{createStudyPopulationSettings} that
-specifies how the data class labels are defined and addition any exclusions to apply to the
+specifies how the data class labels are defined and, in addition, any exclusions to apply to the
plpData cohort}
\item{splitSettings}{An object of type \code{splitSettings} that specifies how to split the data into train/validation/test.
@@ -73,11 +73,11 @@ The default is none.}
\item{featureEngineeringSettings}{An object of \code{featureEngineeringSettings} specifying any feature engineering to be learned (using the train data)}
-\item{preprocessSettings}{An object of \code{preprocessSettings}. This setting specifies the minimum fraction of
-target population who must have a covariate for it to be included in the model training
+\item{preprocessSettings}{An object of \code{preprocessSettings}. This setting specifies the minimum fraction of
+target population who must have a covariate for it to be included in the model training
and whether to normalise the covariates before training}
-\item{logSettings}{An object of \code{logSettings} created using \code{createLogSettings}
+\item{logSettings}{An object of \code{logSettings} created using \code{createLogSettings}
specifying how the logging is done}
\item{executeSettings}{An object of \code{executeSettings} specifying which parts of the analysis to run}
@@ -94,12 +94,14 @@ Creates a learning curve object, which can be plotted using the
\examples{
\dontrun{
# define model
-modelSettings = PatientLevelPrediction::setLassoLogisticRegression()
+modelSettings <- PatientLevelPrediction::setLassoLogisticRegression()
# create learning curve
-learningCurve <- PatientLevelPrediction::createLearningCurve(population,
- plpData,
- modelSettings)
+learningCurve <- PatientLevelPrediction::createLearningCurve(
+ population,
+ plpData,
+ modelSettings
+)
# plot learning curve
PatientLevelPrediction::plotLearningCurve(learningCurve)
}
diff --git a/man/createLogSettings.Rd b/man/createLogSettings.Rd
index 83476c541..b889f0e95 100644
--- a/man/createLogSettings.Rd
+++ b/man/createLogSettings.Rd
@@ -4,7 +4,11 @@
\alias{createLogSettings}
\title{Create the settings for logging the progression of the analysis}
\usage{
-createLogSettings(verbosity = "DEBUG", timeStamp = T, logName = "runPlp Log")
+createLogSettings(
+ verbosity = "DEBUG",
+ timeStamp = TRUE,
+ logName = "runPlp Log"
+)
}
\arguments{
\item{verbosity}{Sets the level of the verbosity. If the log level is at or higher in priority than the logger threshold, a message will print. The levels are:
diff --git a/man/createModelDesign.Rd b/man/createModelDesign.Rd
index 593a0d8c9..d51a1afcd 100644
--- a/man/createModelDesign.Rd
+++ b/man/createModelDesign.Rd
@@ -16,7 +16,7 @@ createModelDesign(
modelSettings = NULL,
splitSettings = createDefaultSplitSetting(type = "stratified", testFraction = 0.25,
trainFraction = 0.75, splitSeed = 123, nfold = 3),
- runCovariateSummary = T
+ runCovariateSummary = TRUE
)
}
\arguments{
diff --git a/man/createPlpResultTables.Rd b/man/createPlpResultTables.Rd
index 8bdd6fa6c..279215b34 100644
--- a/man/createPlpResultTables.Rd
+++ b/man/createPlpResultTables.Rd
@@ -8,8 +8,8 @@ createPlpResultTables(
connectionDetails,
targetDialect = "postgresql",
resultSchema,
- deleteTables = T,
- createTables = T,
+ deleteTables = TRUE,
+ createTables = TRUE,
tablePrefix = "",
tempEmulationSchema = getOption("sqlRenderTempEmulationSchema"),
testFile = NULL
diff --git a/man/createRestrictPlpDataSettings.Rd b/man/createRestrictPlpDataSettings.Rd
index 59aa85886..aebc419b9 100644
--- a/man/createRestrictPlpDataSettings.Rd
+++ b/man/createRestrictPlpDataSettings.Rd
@@ -7,7 +7,7 @@
createRestrictPlpDataSettings(
studyStartDate = "",
studyEndDate = "",
- firstExposureOnly = F,
+ firstExposureOnly = FALSE,
washoutPeriod = 0,
sampleSize = NULL
)
diff --git a/man/createSampleSettings.Rd b/man/createSampleSettings.Rd
index 9c8d0b918..754258768 100644
--- a/man/createSampleSettings.Rd
+++ b/man/createSampleSettings.Rd
@@ -2,7 +2,7 @@
% Please edit documentation in R/Sampling.R
\name{createSampleSettings}
\alias{createSampleSettings}
-\title{Create the settings for defining how the trainData from \code{splitData} are sampled using
+\title{Create the settings for defining how the trainData from \code{splitData} are sampled using
default sample functions.}
\usage{
createSampleSettings(
@@ -14,11 +14,11 @@ createSampleSettings(
\arguments{
\item{type}{(character) Choice of: \itemize{
\item 'none' No sampling is applied - this is the default
-\item 'underSample' Undersample the non-outcome class to make the data more ballanced
+\item 'underSample' Undersample the non-outcome class to make the data more balanced
\item 'overSample' Oversample the outcome class by adding in each outcome multiple times
}}
-\item{numberOutcomestoNonOutcomes}{(numeric) An numeric specifying the require number of non-outcomes per outcome}
+\item{numberOutcomestoNonOutcomes}{(numeric) A numeric specifying the required number of outcomes per non-outcome}
\item{sampleSeed}{(numeric) A seed to use when sampling the data for reproducibility (if not set a random number will be generated)}
}
@@ -26,7 +26,7 @@ createSampleSettings(
An object of class \code{sampleSettings}
}
\description{
-Create the settings for defining how the trainData from \code{splitData} are sampled using
+Create the settings for defining how the trainData from \code{splitData} are sampled using
default sample functions.
}
\details{
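
A usage sketch of the undersampling option described above; the 1:1 outcome to non-outcome ratio and the seed are illustrative choices:

    sampleSettings <- PatientLevelPrediction::createSampleSettings(
      type = "underSample",
      numberOutcomestoNonOutcomes = 1,  # aim for a balanced train set
      sampleSeed = 42
    )
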
diff --git a/man/createSciKitLearnModel.Rd b/man/createSciKitLearnModel.Rd
index 6317eeb4d..f608ca1af 100644
--- a/man/createSciKitLearnModel.Rd
+++ b/man/createSciKitLearnModel.Rd
@@ -2,7 +2,7 @@
% Please edit documentation in R/ExistingPython.R
\name{createSciKitLearnModel}
\alias{createSciKitLearnModel}
-\title{Plug an existing scikit learn python model developed outside OHDSI into the
+\title{Plug an existing scikit learn python model into the
PLP framework}
\usage{
createSciKitLearnModel(
@@ -11,21 +11,21 @@ createSciKitLearnModel(
c("pred_1", "pred_2")),
covariateSettings,
populationSettings,
- isPickle = T
+ isPickle = TRUE
)
}
\arguments{
\item{pythonModelLocation}{The location of the folder that contains the model as model.pkl}
\item{covariateMap}{A data.frame with the columns: columnId specifying the column order for the
-covariate, covariateId the covariate ID from FeatureExtraction and modelCovariateIdName which is the
-column name used when fitting the model. For example, if you had a column called 'age' in your model and this was the 3rd
+covariate, covariateId the covariate ID from FeatureExtraction and modelCovariateIdName which is the
+column name used when fitting the model. For example, if you had a column called 'age' in your model and this was the 3rd
column when fitting the model, then the values for columnId would be 3, covariateId would be 1002 (the covariateId for age in years) and
modelCovariateIdName would be 'age'.}
\item{covariateSettings}{The settings for the standardized covariates}
-\item{populationSettings}{The settings for the population, this includes the time-at-risk settings and
+\item{populationSettings}{The settings for the population, this includes the time-at-risk settings and
and inclusion criteria.}
\item{isPickle}{If the model is saved as a pickle set this to T if it is a json set this to F}
@@ -33,11 +33,11 @@ and inclusion criteria.}
\value{
An object of class plpModel, this is a list that contains: model (the location of the model.pkl),
preprocessing (settings for mapping the covariateIds to the model column names), modelDesign (specification
-of the model design), trainDetails (information about the model fitting) and covariateImportance. You can use the output
+of the model design), trainDetails (information about the model fitting) and covariateImportance. You can use the output
as an input in PatientLevelPrediction::predictPlp to apply the model and calculate the risk for patients.
}
\description{
-Plug an existing scikit learn python model developed outside OHDSI into the
+Plug an existing scikit learn python model into the
PLP framework
}
\details{
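
The 'age' example in the covariateMap description above, written out as the data.frame it describes:

    covariateMap <- data.frame(
      columnId = 3,                 # 'age' was the 3rd column when fitting the model
      covariateId = 1002,           # FeatureExtraction covariateId for age in years
      modelCovariateIdName = "age"
    )
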
diff --git a/man/createStudyPopulationSettings.Rd b/man/createStudyPopulationSettings.Rd
index d7dbcabde..019a37e5b 100644
--- a/man/createStudyPopulationSettings.Rd
+++ b/man/createStudyPopulationSettings.Rd
@@ -5,19 +5,19 @@
\title{create the study population settings}
\usage{
createStudyPopulationSettings(
- binary = T,
- includeAllOutcomes = T,
+ binary = TRUE,
+ includeAllOutcomes = TRUE,
firstExposureOnly = FALSE,
washoutPeriod = 0,
removeSubjectsWithPriorOutcome = TRUE,
priorOutcomeLookback = 99999,
- requireTimeAtRisk = T,
+ requireTimeAtRisk = TRUE,
minTimeAtRisk = 364,
riskWindowStart = 1,
startAnchor = "cohort start",
riskWindowEnd = 365,
endAnchor = "cohort start",
- restrictTarToCohortEnd = F
+ restrictTarToCohortEnd = FALSE
)
}
\arguments{
diff --git a/man/createValidationDesign.Rd b/man/createValidationDesign.Rd
index 79bbaeb78..1f78a8034 100644
--- a/man/createValidationDesign.Rd
+++ b/man/createValidationDesign.Rd
@@ -19,7 +19,7 @@ createValidationDesign(
\item{outcomeId}{The outcomeId of the outcome cohort to validate on}
-\item{populationSettings}{A list of population restriction settings created
+\item{populationSettings}{A list of population restriction settings created
by \code{createPopulationSettings}. Default is NULL and then this is taken
from the model}
@@ -34,7 +34,7 @@ this is taken from the model.}
\item{runCovariateSummary}{whether to run the covariate summary for the validation data}
}
\value{
-A validation design object of class \code{validationDesign}
+A validation design object of class \code{validationDesign} or a list of such objects
}
\description{
createValidationDesign - Define the validation design for external validation
diff --git a/man/createValidationSettings.Rd b/man/createValidationSettings.Rd
index 04b8decff..b2f722013 100644
--- a/man/createValidationSettings.Rd
+++ b/man/createValidationSettings.Rd
@@ -4,7 +4,7 @@
\alias{createValidationSettings}
\title{createValidationSettings define optional settings for performing external validation}
\usage{
-createValidationSettings(recalibrate = NULL, runCovariateSummary = T)
+createValidationSettings(recalibrate = NULL, runCovariateSummary = TRUE)
}
\arguments{
\item{recalibrate}{A vector of characters specifying the recalibration method to apply}
diff --git a/man/deDuplicateCovariateData.Rd b/man/deDuplicateCovariateData.Rd
new file mode 100644
index 000000000..e1f94213d
--- /dev/null
+++ b/man/deDuplicateCovariateData.Rd
@@ -0,0 +1,22 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/ExternalValidatePlp.R
+\name{deDuplicateCovariateData}
+\alias{deDuplicateCovariateData}
+\title{deDuplicateCovariateData - Remove duplicate covariate data
+when downloading data with multiple different covariateSettings, sometimes
+there will be duplicated analysisIds which need to be removed}
+\usage{
+deDuplicateCovariateData(covariateData)
+}
+\arguments{
+\item{covariateData}{The covariate data Andromeda object}
+}
+\value{
+The deduplicated covariate data
+}
+\description{
+deDuplicateCovariateData - Remove duplicate covariate data
+when downloading data with multiple different covariateSettings, sometimes
+there will be duplicated analysisIds which need to be removed
+}
+\keyword{internal}
diff --git a/man/diagnoseMultiplePlp.Rd b/man/diagnoseMultiplePlp.Rd
index a8a1c6ff7..d46ae33ab 100644
--- a/man/diagnoseMultiplePlp.Rd
+++ b/man/diagnoseMultiplePlp.Rd
@@ -10,7 +10,7 @@ diagnoseMultiplePlp(
setLassoLogisticRegression()), createModelDesign(targetId = 1, outcomeId = 3,
modelSettings = setLassoLogisticRegression())),
cohortDefinitions = NULL,
- logSettings = createLogSettings(verbosity = "DEBUG", timeStamp = T, logName =
+ logSettings = createLogSettings(verbosity = "DEBUG", timeStamp = TRUE, logName =
"diagnosePlp Log"),
saveDirectory = getwd()
)
@@ -29,7 +29,7 @@ diagnoseMultiplePlp(
\value{
A data frame with the following columns: \tabular{ll}{ \verb{analysisId} \tab The unique identifier
for a set of analysis choices.\cr \verb{targetId} \tab The ID of the target cohort populations.\cr
-\verb{outcomeId} \tab The ID of the outcomeId.\cr \verb{dataLocation} \tab The location where the plpData was saved
+\verb{outcomeId} \tab The ID of the outcome cohort.\cr \verb{dataLocation} \tab The location where the plpData was saved
\cr \verb{the settings ids} \tab The ids for all other settings used for model development.\cr }
}
\description{
diff --git a/man/diagnosePlp.Rd b/man/diagnosePlp.Rd
index e62e999ea..d6637cc4f 100644
--- a/man/diagnosePlp.Rd
+++ b/man/diagnosePlp.Rd
@@ -14,14 +14,14 @@ diagnosePlp(
saveDirectory = NULL,
featureEngineeringSettings = createFeatureEngineeringSettings(),
modelSettings = setLassoLogisticRegression(),
- logSettings = createLogSettings(verbosity = "DEBUG", timeStamp = T, logName =
+ logSettings = createLogSettings(verbosity = "DEBUG", timeStamp = TRUE, logName =
"diagnosePlp Log"),
preprocessSettings = createPreprocessSettings()
)
}
\arguments{
\item{plpData}{An object of type \code{plpData} - the patient level prediction
-data extracted from the CDM. Can also include an initial population as
+data extracted from the CDM. Can also include an initial population as
plpData$population.}
\item{outcomeId}{(integer) The ID of the outcome.}
@@ -29,7 +29,7 @@ plpData$popualtion.}
\item{analysisId}{(integer) Identifier for the analysis. It is used to create, e.g., the result folder. Default is a timestamp.}
\item{populationSettings}{An object of type \code{populationSettings} created using \code{createStudyPopulationSettings} that
-specifies how the data class labels are defined and addition any exclusions to apply to the
+specifies how the data class labels are defined and, in addition, any exclusions to apply to the
plpData cohort}
\item{splitSettings}{An object of type \code{splitSettings} that specifies how to split the data into train/validation/test.
@@ -43,21 +43,21 @@ The default is none.}
\item{featureEngineeringSettings}{An object of \code{featureEngineeringSettings} specifying any feature engineering to be learned (using the train data)}
\item{modelSettings}{An object of class \code{modelSettings} created using one of the function:
-\itemize{
-\item setLassoLogisticRegression() A lasso logistic regression model
-\item setGradientBoostingMachine() A gradient boosting machine
-\item setAdaBoost() An ada boost model
-\item setRandomForest() A random forest model
-\item setDecisionTree() A decision tree model
-\item setKNN() A KNN model
+ \itemize{
+ \item setLassoLogisticRegression() A lasso logistic regression model
+ \item setGradientBoostingMachine() A gradient boosting machine
+ \item setAdaBoost() An ada boost model
+ \item setRandomForest() A random forest model
+ \item setDecisionTree() A decision tree model
+ \item setKNN() A KNN model
-}}
+ }}
-\item{logSettings}{An object of \code{logSettings} created using \code{createLogSettings}
+\item{logSettings}{An object of \code{logSettings} created using \code{createLogSettings}
specifying how the logging is done}
-\item{preprocessSettings}{An object of \code{preprocessSettings}. This setting specifies the minimum fraction of
-target population who must have a covariate for it to be included in the model training
+\item{preprocessSettings}{An object of \code{preprocessSettings}. This setting specifies the minimum fraction of
+target population who must have a covariate for it to be included in the model training
and whether to normalise the covariates before training}
}
\value{
@@ -69,16 +69,16 @@ and training settings as well as various performance measures obtained by the mo
\item{characterization}{list for each O of Characterization of T, TnO, Tn~O}
}
\description{
-This function runs a set of prediction diagnoses to help pick a suitable T, O, TAR and determine
+This function runs a set of prediction diagnoses to help pick a suitable T, O, TAR and determine
whether the prediction problem is worth executing.
}
\details{
Users can define set of Ts, Os, databases and population settings. A list of data.frames containing details such as
-follow-up time distribution, time-to-event information, characteriszation details, time from last prior event,
+follow-up time distribution, time-to-event information, characterization details, time from last prior event,
observation time distribution.
}
\examples{
\dontrun{
-#******** EXAMPLE 1 *********
-}
+#******** EXAMPLE 1 *********
+}
}
diff --git a/man/evaluatePlp.Rd b/man/evaluatePlp.Rd
index 811a1678d..7413bb6a8 100644
--- a/man/evaluatePlp.Rd
+++ b/man/evaluatePlp.Rd
@@ -9,7 +9,7 @@ evaluatePlp(prediction, typeColumn = "evaluationType")
\arguments{
\item{prediction}{The patient level prediction model's prediction}
-\item{typeColumn}{The column name in the prediction object that is used to
+\item{typeColumn}{The column name in the prediction object that is used to
stratify the evaluation}
}
\value{
diff --git a/man/getCohortCovariateData.Rd b/man/getCohortCovariateData.Rd
index 5c0c515f3..3cb66cbed 100644
--- a/man/getCohortCovariateData.Rd
+++ b/man/getCohortCovariateData.Rd
@@ -6,6 +6,7 @@
\usage{
getCohortCovariateData(
connection,
+ tempEmulationSchema = NULL,
oracleTempSchema = NULL,
cdmDatabaseSchema,
cdmVersion = "5",
@@ -20,7 +21,9 @@ getCohortCovariateData(
\arguments{
\item{connection}{The database connection}
-\item{oracleTempSchema}{The temp schema if using oracle}
+\item{tempEmulationSchema}{The schema to use for temp tables}
+
+\item{oracleTempSchema}{DEPRECATED The temp schema if using oracle}
\item{cdmDatabaseSchema}{The schema of the OMOP CDM data}
diff --git a/man/getData.Rd b/man/getData.Rd
index 175d75831..075f73f3d 100644
--- a/man/getData.Rd
+++ b/man/getData.Rd
@@ -4,7 +4,7 @@
\alias{getData}
\title{getData - Get the plpData for the validation}
\usage{
-getData(design, database, outputFolder, allCovSettings)
+getData(design, database, outputFolder, downloadTasks)
}
\arguments{
\item{design}{The validationDesign object}
@@ -13,7 +13,8 @@ getData(design, database, outputFolder, allCovSettings)
\item{outputFolder}{The directory to save the validation results to}
-\item{allCovSettings}{A list of covariateSettings from the models}
+\item{downloadTasks}{A list of download tasks determined by unique
+combinations of targetId and restrictPlpDataSettings}
}
\value{
The plpData object
diff --git a/man/getPlpData.Rd b/man/getPlpData.Rd
index dc0faff7a..9cb50fb25 100644
--- a/man/getPlpData.Rd
+++ b/man/getPlpData.Rd
@@ -2,18 +2,19 @@
% Please edit documentation in R/ExtractData.R
\name{getPlpData}
\alias{getPlpData}
-\title{Get the patient level prediction data from the server}
+\title{Extract the patient level prediction data from the server}
\usage{
-getPlpData(databaseDetails, covariateSettings, restrictPlpDataSettings)
+getPlpData(databaseDetails, covariateSettings, restrictPlpDataSettings = NULL)
}
\arguments{
\item{databaseDetails}{The cdm database details created using \code{createDatabaseDetails()}}
-\item{covariateSettings}{An object of type \code{covariateSettings} as created using the
+\item{covariateSettings}{An object of type \code{covariateSettings} or a list of such objects as created using the
\code{createCovariateSettings} function in the
\code{FeatureExtraction} package.}
-\item{restrictPlpDataSettings}{Extra settings to apply to the target population while extracting data. Created using \code{createRestrictPlpDataSettings()}.}
+\item{restrictPlpDataSettings}{Extra settings to apply to the target population while extracting data.
+Created using \code{createRestrictPlpDataSettings()}. This is optional.}
}
\value{
Returns an object of type \code{plpData}, containing information on the cohorts, their
@@ -23,11 +24,11 @@ efficiency reasons. This object is a list with the following components: \descri
the outcome id. Outcomes are not yet filtered based on risk window, since this is done at
a later stage.} \item{cohorts}{A data frame listing the persons in each cohort, listing their
exposure status as well as the time to the end of the observation period and time to the end of the
-cohort (usually the end of the exposure era).} \item{covariates}{An ffdf object listing the
+cohort (usually the end of the exposure era).} \item{covariates}{An Andromeda object listing the
baseline covariates per person in the two cohorts. This is done using a sparse representation:
-covariates with a value of 0 are omitted to save space.} \item{covariateRef}{An ffdf object describing the covariates that have been extracted.}
-\item{metaData}{A list of objects with information on how the cohortMethodData object was
-constructed.} } The generic \code{()} and \code{summary()} functions have been implemented for this object.
+covariates with a value of 0 are omitted to save space. Usually has three columns,
+rowId, covariateId and covariateValue.} \item{covariateRef}{An Andromeda object describing the covariates that have been extracted.}
+\item{analysisRef}{An Andromeda object with information about which analysisIds from 'FeatureExtraction' were used.} }
}
\description{
This function executes a large set of SQL statements against the database in OMOP CDM format to
@@ -37,7 +38,7 @@ extract the data needed to perform the analysis.
Based on the arguments, the at risk cohort data is retrieved, as well as outcomes
occurring in these subjects. The at risk cohort is identified through
user-defined cohorts in a cohort table either inside the CDM instance or in a separate schema.
-Similarly, outcomes are identified
+Similarly, outcomes are identified
through user-defined cohorts in a cohort table either inside the CDM instance or in a separate
schema. Covariates are automatically extracted from the appropriate tables within the CDM.
If you wish to exclude concepts from covariates you will need to
diff --git a/man/loadPlpAnalysesJson.Rd b/man/loadPlpAnalysesJson.Rd
index dcd52b0f8..1053e1b7e 100644
--- a/man/loadPlpAnalysesJson.Rd
+++ b/man/loadPlpAnalysesJson.Rd
@@ -13,12 +13,12 @@ loadPlpAnalysesJson(jsonFileLocation)
Load the multiple prediction json settings from a file
}
\details{
-This function interprets a json with the multiple prediction settings and creates a list
+This function interprets a json with the multiple prediction settings and creates a list
that can be combined with connection settings to run a multiple prediction study
}
\examples{
\dontrun{
-modelDesignList <- loadPlpAnalysesJson('location of json settings')$analysis
+modelDesignList <- loadPlpAnalysesJson("location of json settings")$analysis
}
}
diff --git a/man/outcomeSurvivalPlot.Rd b/man/outcomeSurvivalPlot.Rd
index 39100718d..ca2d09bed 100644
--- a/man/outcomeSurvivalPlot.Rd
+++ b/man/outcomeSurvivalPlot.Rd
@@ -7,12 +7,12 @@
outcomeSurvivalPlot(
plpData,
outcomeId,
- populationSettings = createStudyPopulationSettings(binary = T, includeAllOutcomes = T,
- firstExposureOnly = FALSE, washoutPeriod = 0, removeSubjectsWithPriorOutcome = TRUE,
- priorOutcomeLookback = 99999, requireTimeAtRisk = F, riskWindowStart = 1, startAnchor
- = "cohort start", riskWindowEnd = 3650, endAnchor = "cohort start"),
- riskTable = T,
- confInt = T,
+ populationSettings = createStudyPopulationSettings(binary = TRUE, includeAllOutcomes =
+ TRUE, firstExposureOnly = FALSE, washoutPeriod = 0, removeSubjectsWithPriorOutcome =
+ TRUE, priorOutcomeLookback = 99999, requireTimeAtRisk = FALSE, riskWindowStart = 1,
+ startAnchor = "cohort start", riskWindowEnd = 3650, endAnchor = "cohort start"),
+ riskTable = TRUE,
+ confInt = TRUE,
yLabel = "Fraction of those who are outcome free in target population"
)
}
diff --git a/man/plotLearningCurve.Rd b/man/plotLearningCurve.Rd
index 8056ca95b..de6969a2a 100644
--- a/man/plotLearningCurve.Rd
+++ b/man/plotLearningCurve.Rd
@@ -36,11 +36,11 @@ function.}
\item{plotSubtitle}{Subtitle of the learning curve plot.}
\item{fileName}{Filename of plot to be saved, for example \code{'plot.png'}.
-See the function \code{ggsave} in the ggplot2 package for supported file
+See the function \code{ggsave} in the ggplot2 package for supported file
formats.}
}
\value{
-A ggplot object. Use the \code{\link[ggplot2]{ggsave}} function to save to
+A ggplot object. Use the \code{\link[ggplot2]{ggsave}} function to save to
file in a different format.
}
\description{
@@ -50,9 +50,11 @@ from \code{createLearningCurve}.
\examples{
\dontrun{
# create learning curve object
-learningCurve <- createLearningCurve(population,
- plpData,
- modelSettings)
+learningCurve <- createLearningCurve(
+ population,
+ plpData,
+ modelSettings
+)
# plot the learning curve
plotLearningCurve(learningCurve)
}
diff --git a/man/plotPlp.Rd b/man/plotPlp.Rd
index b4cfa6e69..18033f88e 100644
--- a/man/plotPlp.Rd
+++ b/man/plotPlp.Rd
@@ -11,7 +11,7 @@ plotPlp(plpResult, saveLocation = NULL, typeColumn = "evaluation")
\item{saveLocation}{Name of the directory where the plots should be saved (NULL means no saving)}
-\item{typeColumn}{The name of the column specifying the evaluation type
+\item{typeColumn}{The name of the column specifying the evaluation type
(to stratify the plots)}
}
\value{
diff --git a/man/predictGlm.Rd b/man/predictGlm.Rd
index e7e09a57b..6f9e21cc5 100644
--- a/man/predictGlm.Rd
+++ b/man/predictGlm.Rd
@@ -14,7 +14,7 @@ prediction model}
data extracted from the CDM.}
\item{cohort}{The population dataframe created using
-/code{createStudyPopulation} who will have their risks predicted or a cohort
+\code{createStudyPopulation} who will have their risks predicted or a cohort
without the outcome known}
}
\value{
diff --git a/man/preprocessData.Rd b/man/preprocessData.Rd
index 2ace52208..4f9b18eb2 100644
--- a/man/preprocessData.Rd
+++ b/man/preprocessData.Rd
@@ -8,7 +8,7 @@ and remove rare or redundant features}
preprocessData(covariateData, preprocessSettings)
}
\arguments{
-\item{covariateData}{The covariate part of the training data created by \code{splitData} after being sampled and having
+\item{covariateData}{The covariate part of the training data created by \code{splitData} after being sampled and having
any required feature engineering}
\item{preprocessSettings}{The settings for the preprocessing created by \code{createPreprocessSettings}}
diff --git a/man/recalibratePlp.Rd b/man/recalibratePlp.Rd
index 148e3c3e6..3ede5aa45 100644
--- a/man/recalibratePlp.Rd
+++ b/man/recalibratePlp.Rd
@@ -21,12 +21,14 @@ recalibratePlp(
\item{method}{Method used to recalibrate ('recalibrationInTheLarge' or 'weakRecalibration' )}
}
\value{
-An object of class \code{runPlp} that is recalibrated on the new data
+A prediction dataframe with the recalibrated predictions added
}
\description{
-Train various models using a default parameter gird search or user specified parameters
+Recalibrate a model using the recalibrationInTheLarge or weakRecalibration methods
}
\details{
-The user can define the machine learning model to train (regularised logistic regression, random forest,
-gradient boosting machine, neural network and )
+'recalibrationInTheLarge' calculates a single correction factor for the
+average predicted risks to match the average observed risks.
+'weakRecalibration' fits a glm model to the logit of the predicted risks,
+also known as Platt scaling/logistic recalibration.
}
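
Both methods described above fit in a few lines of base R. A conceptual sketch, not the package internals; 'risk' is a vector of predicted risks and 'y' the observed 0/1 outcomes:

    # recalibration in the large: one additive correction on the logit scale so
    # the average predicted risk is pulled toward the average observed risk
    correction <- qlogis(mean(y)) - qlogis(mean(risk))
    riskInLarge <- plogis(qlogis(risk) + correction)

    # weak recalibration (Platt scaling / logistic recalibration): refit an
    # intercept and slope on the logit of the predicted risks
    fit <- glm(y ~ qlogis(risk), family = binomial())
    riskWeak <- predict(fit, type = "response")
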
diff --git a/man/recalibratePlpRefit.Rd b/man/recalibratePlpRefit.Rd
index 41202c9b3..ec83f986f 100644
--- a/man/recalibratePlpRefit.Rd
+++ b/man/recalibratePlpRefit.Rd
@@ -4,7 +4,7 @@
\alias{recalibratePlpRefit}
\title{recalibratePlpRefit}
\usage{
-recalibratePlpRefit(plpModel, newPopulation, newData)
+recalibratePlpRefit(plpModel, newPopulation, newData, returnModel = FALSE)
}
\arguments{
\item{plpModel}{The trained plpModel (runPlp$model)}
@@ -13,14 +13,12 @@ recalibratePlpRefit(plpModel, newPopulation, newData)
\item{newData}{An object of type \code{plpData} - the patient level prediction
data extracted from the CDM.}
+
+\item{returnModel}{Logical: return the refitted model}
}
\value{
-An object of class \code{runPlp} that is recalibrated on the new data
+A prediction dataframe with the predictions of the recalibrated model added
}
\description{
-Train various models using a default parameter gird search or user specified parameters
-}
-\details{
-The user can define the machine learning model to train (regularised logistic regression, random forest,
-gradient boosting machine, neural network and )
+Recalibrating a model by refitting it
}
diff --git a/man/recalibrationInTheLarge.Rd b/man/recalibrationInTheLarge.Rd
new file mode 100644
index 000000000..4a678d2e1
--- /dev/null
+++ b/man/recalibrationInTheLarge.Rd
@@ -0,0 +1,21 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/Recalibration.R
+\name{recalibrationInTheLarge}
+\alias{recalibrationInTheLarge}
+\title{recalibrationInTheLarge}
+\usage{
+recalibrationInTheLarge(prediction, columnType = "evaluationType")
+}
+\arguments{
+\item{prediction}{A prediction dataframe}
+
+\item{columnType}{The column name where the strata types are specified}
+}
+\value{
+A prediction dataframe with the recalibrated predictions added
+}
+\description{
+Recalibrate a model using the recalibrationInTheLarge method which calculates a single correction factor
+for the average predicted risks to match the average observed risks
+}
+\keyword{internal}
diff --git a/man/runMultiplePlp.Rd b/man/runMultiplePlp.Rd
index 22192b5a8..b79819921 100644
--- a/man/runMultiplePlp.Rd
+++ b/man/runMultiplePlp.Rd
@@ -9,9 +9,9 @@ runMultiplePlp(
modelDesignList = list(createModelDesign(targetId = 1, outcomeId = 2, modelSettings =
setLassoLogisticRegression()), createModelDesign(targetId = 1, outcomeId = 3,
modelSettings = setLassoLogisticRegression())),
- onlyFetchData = F,
+ onlyFetchData = FALSE,
cohortDefinitions = NULL,
- logSettings = createLogSettings(verbosity = "DEBUG", timeStamp = T, logName =
+ logSettings = createLogSettings(verbosity = "DEBUG", timeStamp = TRUE, logName =
"runPlp Log"),
saveDirectory = getwd(),
sqliteLocation = file.path(saveDirectory, "sqlite")
@@ -35,7 +35,7 @@ runMultiplePlp(
\value{
A data frame with the following columns: \tabular{ll}{ \verb{analysisId} \tab The unique identifier
for a set of analysis choices.\cr \verb{targetId} \tab The ID of the target cohort populations.\cr
-\verb{outcomeId} \tab The ID of the outcomeId.\cr \verb{dataLocation} \tab The location where the plpData was saved
+\verb{outcomeId} \tab The ID of the outcome cohort.\cr \verb{dataLocation} \tab The location where the plpData was saved
\cr \verb{the settings ids} \tab The ids for all other settings used for model development.\cr }
}
\description{
diff --git a/man/runPlp.Rd b/man/runPlp.Rd
index ad000f4b7..778a2ff28 100644
--- a/man/runPlp.Rd
+++ b/man/runPlp.Rd
@@ -14,9 +14,9 @@ runPlp(
trainFraction = 0.75, splitSeed = 123, nfold = 3),
sampleSettings = createSampleSettings(type = "none"),
featureEngineeringSettings = createFeatureEngineeringSettings(type = "none"),
- preprocessSettings = createPreprocessSettings(minFraction = 0.001, normalize = T),
+ preprocessSettings = createPreprocessSettings(minFraction = 0.001, normalize = TRUE),
modelSettings = setLassoLogisticRegression(),
- logSettings = createLogSettings(verbosity = "DEBUG", timeStamp = T, logName =
+ logSettings = createLogSettings(verbosity = "DEBUG", timeStamp = TRUE, logName =
"runPlp Log"),
executeSettings = createDefaultExecuteSettings(),
saveDirectory = getwd()
diff --git a/man/savePlpAnalysesJson.Rd b/man/savePlpAnalysesJson.Rd
index 742631e6f..0e5bbbfbb 100644
--- a/man/savePlpAnalysesJson.Rd
+++ b/man/savePlpAnalysesJson.Rd
@@ -28,11 +28,11 @@ This function creates a json file with the modelDesignList saved
\examples{
\dontrun{
savePlpAnalysesJson(
-modelDesignList = list(
-createModelDesign(targetId = 1, outcomeId = 2, modelSettings = setLassoLogisticRegression()),
-createModelDesign(targetId = 1, outcomeId = 3, modelSettings = setLassoLogisticRegression())
-),
-saveDirectory = 'C:/bestModels'
+ modelDesignList = list(
+ createModelDesign(targetId = 1, outcomeId = 2, modelSettings = setLassoLogisticRegression()),
+ createModelDesign(targetId = 1, outcomeId = 3, modelSettings = setLassoLogisticRegression())
+ ),
+ saveDirectory = "C:/bestModels"
)
}
diff --git a/man/savePlpData.Rd b/man/savePlpData.Rd
index 2562b0fdf..4f38dd50f 100644
--- a/man/savePlpData.Rd
+++ b/man/savePlpData.Rd
@@ -4,7 +4,7 @@
\alias{savePlpData}
\title{Save the cohort data to folder}
\usage{
-savePlpData(plpData, file, envir = NULL, overwrite = F)
+savePlpData(plpData, file, envir = NULL, overwrite = FALSE)
}
\arguments{
\item{plpData}{An object of type \code{plpData} as generated using
diff --git a/man/setAdaBoost.Rd b/man/setAdaBoost.Rd
index 971948d00..e4af2568d 100644
--- a/man/setAdaBoost.Rd
+++ b/man/setAdaBoost.Rd
@@ -7,7 +7,7 @@
setAdaBoost(
nEstimators = list(10, 50, 200),
learningRate = list(1, 0.5, 0.1),
- algorithm = list("SAMME.R"),
+ algorithm = list("SAMME"),
seed = sample(1e+06, 1)
)
}
@@ -17,7 +17,7 @@ setAdaBoost(
\item{learningRate}{(list) Weight applied to each classifier at each boosting iteration. A higher learning rate increases the contribution of each classifier. There is a trade-off between the learningRate and nEstimators parameters.}
-\item{algorithm}{(list) If ‘SAMME.R’ then use the SAMME.R real boosting algorithm. base_estimator must support calculation of class probabilities. If ‘SAMME’ then use the SAMME discrete boosting algorithm. The SAMME.R algorithm typically converges faster than SAMME, achieving a lower test error with fewer boosting iterations.}
+\item{algorithm}{Only 'SAMME' can be provided. The 'algorithm' argument will be deprecated in scikit-learn 1.8.}
\item{seed}{A seed for the model}
}
@@ -26,8 +26,9 @@ Create setting for AdaBoost with python DecisionTreeClassifier base estimator
}
\examples{
\dontrun{
-model.adaBoost <- setAdaBoost(nEstimators = list(10,50,200), learningRate = list(1, 0.5, 0.1),
- algorithm = list('SAMME.R'), seed = sample(1000000,1)
- )
+model.adaBoost <- setAdaBoost(
+ nEstimators = list(10, 50, 200), learningRate = list(1, 0.5, 0.1),
+  algorithm = list("SAMME"), seed = sample(1000000, 1)
+)
}
}
diff --git a/man/setDecisionTree.Rd b/man/setDecisionTree.Rd
index d977d0ee9..1e6b994cb 100644
--- a/man/setDecisionTree.Rd
+++ b/man/setDecisionTree.Rd
@@ -2,7 +2,7 @@
% Please edit documentation in R/SklearnClassifierSettings.R
\name{setDecisionTree}
\alias{setDecisionTree}
-\title{Create setting for the scikit-learn 1.0.1 DecisionTree with python}
+\title{Create setting for the scikit-learn DecisionTree with python}
\usage{
setDecisionTree(
criterion = list("gini"),
@@ -42,10 +42,10 @@ setDecisionTree(
\item{seed}{The random state seed}
}
\description{
-Create setting for the scikit-learn 1.0.1 DecisionTree with python
+Create setting for the scikit-learn DecisionTree with python
}
\examples{
\dontrun{
-model.decisionTree <- setDecisionTree(maxDepth=10,minSamplesLeaf=10, seed=NULL )
+model.decisionTree <- setDecisionTree(maxDepth = 10, minSamplesLeaf = 10, seed = NULL)
}
}
diff --git a/man/setGradientBoostingMachine.Rd b/man/setGradientBoostingMachine.Rd
index a42e2b71c..560e5683f 100644
--- a/man/setGradientBoostingMachine.Rd
+++ b/man/setGradientBoostingMachine.Rd
@@ -42,7 +42,9 @@ setGradientBoostingMachine(
Create setting for gradient boosting machine model using gbm_xgboost implementation
}
\examples{
-model.gbm <- setGradientBoostingMachine(ntrees=c(10,100), nthread=20,
- maxDepth=c(4,6), learnRate=c(0.1,0.3))
+model.gbm <- setGradientBoostingMachine(
+ ntrees = c(10, 100), nthread = 20,
+ maxDepth = c(4, 6), learnRate = c(0.1, 0.3)
+)
}
diff --git a/man/setIterativeHardThresholding.Rd b/man/setIterativeHardThresholding.Rd
index 63d7fd257..8cac5a628 100644
--- a/man/setIterativeHardThresholding.Rd
+++ b/man/setIterativeHardThresholding.Rd
@@ -9,9 +9,9 @@ setIterativeHardThresholding(
penalty = "bic",
seed = sample(1e+05, 1),
exclude = c(),
- forceIntercept = F,
+ forceIntercept = FALSE,
fitBestSubset = FALSE,
- initialRidgeVariance = 10000,
+ initialRidgeVariance = 0.1,
tolerance = 1e-08,
maxIterations = 10000,
threshold = 1e-06,
diff --git a/man/setKNN.Rd b/man/setKNN.Rd
deleted file mode 100644
index fc4c25d90..000000000
--- a/man/setKNN.Rd
+++ /dev/null
@@ -1,23 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/KNN.R
-\name{setKNN}
-\alias{setKNN}
-\title{Create setting for knn model}
-\usage{
-setKNN(k = 1000, indexFolder = file.path(getwd(), "knn"), threads = 1)
-}
-\arguments{
-\item{k}{The number of neighbors to consider}
-
-\item{indexFolder}{The directory where the results and intermediate steps are output}
-
-\item{threads}{The number of threads to use when applying big knn}
-}
-\description{
-Create setting for knn model
-}
-\examples{
-\dontrun{
-model.knn <- setKNN(k=10000)
-}
-}
diff --git a/man/setLassoLogisticRegression.Rd b/man/setLassoLogisticRegression.Rd
index 533aa53b3..ecd38da3c 100644
--- a/man/setLassoLogisticRegression.Rd
+++ b/man/setLassoLogisticRegression.Rd
@@ -10,7 +10,7 @@ setLassoLogisticRegression(
includeCovariateIds = c(),
noShrinkage = c(0),
threads = -1,
- forceIntercept = F,
+ forceIntercept = FALSE,
upperLimit = 20,
lowerLimit = 0.01,
tolerance = 2e-06,
diff --git a/man/setLightGBM.Rd b/man/setLightGBM.Rd
index 6380df304..5dee039c3 100644
--- a/man/setLightGBM.Rd
+++ b/man/setLightGBM.Rd
@@ -49,8 +49,8 @@ Create setting for gradient boosting machine model using lightGBM (https://githu
}
\examples{
model.lightgbm <- setLightGBM(
- numLeaves = c(20, 31, 50), maxDepth = c(-1, 5, 10),
- minDataInLeaf = c(10, 20, 30), learningRate = c(0.05, 0.1, 0.3)
+ numLeaves = c(20, 31, 50), maxDepth = c(-1, 5, 10),
+ minDataInLeaf = c(10, 20, 30), learningRate = c(0.05, 0.1, 0.3)
)
}
diff --git a/man/setRandomForest.Rd b/man/setRandomForest.Rd
index 360b532ae..15ee62bba 100644
--- a/man/setRandomForest.Rd
+++ b/man/setRandomForest.Rd
@@ -65,7 +65,9 @@ Create setting for random forest model with python (very fast)
}
\examples{
\dontrun{
-model.rf <- setRandomForest(mtries=list('auto',5,20), ntrees=c(10,100),
- maxDepth=c(5,20))
+model.rf <- setRandomForest(
+ mtries = list("auto", 5, 20), ntrees = c(10, 100),
+ maxDepth = c(5, 20)
+)
}
}
diff --git a/man/setSVM.Rd b/man/setSVM.Rd
index 2def0720b..f2d273f2f 100644
--- a/man/setSVM.Rd
+++ b/man/setSVM.Rd
@@ -43,6 +43,6 @@ Create setting for the python sklearn SVM (SVC function)
}
\examples{
\dontrun{
-model.svm <- setSVM(kernel='rbf', seed = NULL)
+model.svm <- setSVM(kernel = "rbf", seed = NULL)
}
}
diff --git a/man/splitData.Rd b/man/splitData.Rd
index 4c03f61b8..a4368cf4e 100644
--- a/man/splitData.Rd
+++ b/man/splitData.Rd
@@ -2,7 +2,8 @@
% Please edit documentation in R/DataSplitting.R
\name{splitData}
\alias{splitData}
-\title{Split the plpData into test/train sets using a splitting settings of class \code{splitSettings}}
+\title{Split the plpData into test/train sets using a splitting settings of class
+\code{splitSettings}}
\usage{
splitData(
plpData = plpData,
@@ -11,29 +12,38 @@ splitData(
)
}
\arguments{
-\item{plpData}{An object of type \code{plpData} - the patient level prediction
-data extracted from the CDM.}
+\item{plpData}{An object of type \code{plpData} - the patient level
+prediction data extracted from the CDM.}
-\item{population}{The population created using \code{createStudyPopulation} that define who will be used to develop the model}
+\item{population}{The population created using \code{createStudyPopulation}
+that define who will be used to develop the model}
-\item{splitSettings}{An object of type \code{splitSettings} specifying the split - the default can be created using \code{createDefaultSplitSetting}}
+\item{splitSettings}{An object of type \code{splitSettings} specifying the
+split - the default can be created using \code{createDefaultSplitSetting}}
}
\value{
An object of class \code{splitSettings}
}
\description{
-Split the plpData into test/train sets using a splitting settings of class \code{splitSettings}
+Split the plpData into test/train sets using a splitting settings of class
+\code{splitSettings}
}
\details{
-Returns a list containing the training data (Train) and optionally the test data (Test). Train is an Andromeda object containing
-\itemize{\item covariates: a table (rowId, covariateId, covariateValue) containing the covariates for each data point in the train data
+Returns a list containing the training data (Train) and optionally the test
+data (Test). Train is an Andromeda object containing
+\itemize{\item covariates: a table (rowId, covariateId, covariateValue)
+containing the covariates for each data point in the train data
\item covariateRef: a table with the covariate information
- \item labels: a table (rowId, outcomeCount, ...) for each data point in the train data (outcomeCount is the class label)
- \item folds: a table (rowId, index) specifying which training fold each data point is in.
- }
+ \item labels: a table (rowId, outcomeCount, ...) for each data point
+in the train data (outcomeCount is the class label)
+ \item folds: a table (rowId, index) specifying which training
+fold each data point is in.
+ }
Test is an Andromeda object containing
-\itemize{\item covariates: a table (rowId, covariateId, covariateValue) containing the covariates for each data point in the test data
+\itemize{\item covariates: a table (rowId, covariateId, covariateValue)
+containing the covariates for each data point in the test data
\item covariateRef: a table with the covariate information
- \item labels: a table (rowId, outcomeCount, ...) for each data point in the test data (outcomeCount is the class label)
+ \item labels: a table (rowId, outcomeCount, ...) for each data
+point in the test data (outcomeCount is the class label)
}
}
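
The details above describe the returned Train/Test structure, but the page ships no example. A minimal usage sketch, assuming a `plpData` object and `population` built beforehand with `getPlpData()` and `createStudyPopulation()` as in the test helpers later in this diff (the `testFraction`/`nfold` arguments are assumed from the function's documented defaults):

```r
# A minimal sketch, assuming plpData and population already exist
# (created via getPlpData() and createStudyPopulation()).
splitSettings <- createDefaultSplitSetting(
  testFraction = 0.25, # hold out 25% of subjects for the Test set
  nfold = 3,           # cross-validation folds within Train
  splitSeed = 42
)
data <- splitData(
  plpData = plpData,
  population = population,
  splitSettings = splitSettings
)
# data$Train and data$Test carry the covariates/covariateRef/labels
# (and, for Train, folds) tables described above.
nrow(data$Train$labels) # number of training subjects
```
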
diff --git a/man/toSparseM.Rd b/man/toSparseM.Rd
index 3e75eda7f..9e3cc4ae7 100644
--- a/man/toSparseM.Rd
+++ b/man/toSparseM.Rd
@@ -31,6 +31,6 @@ This function converts the covariate file from ffdf in COO format into a sparse
the package Matrix
}
\examples{
-#TODO
+# TODO
}
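
The example slot above is still a TODO; as a stand-in, here is a self-contained sketch of the COO-to-sparse conversion the description refers to, using a made-up covariate table and the Matrix package directly rather than the package's internal code:

```r
library(Matrix)
# Hypothetical COO-format covariates: one (rowId, covariateId, value) triple per row
covariates <- data.frame(
  rowId = c(1L, 1L, 2L, 3L),
  covariateId = c(1001L, 1002L, 1002L, 1003L),
  covariateValue = c(1, 0.5, 1, 2)
)
# Map the sparse covariateIds onto contiguous column indices, then build the matrix
columnIndex <- match(covariates$covariateId, sort(unique(covariates$covariateId)))
sparseMat <- Matrix::sparseMatrix(
  i = covariates$rowId,
  j = columnIndex,
  x = covariates$covariateValue
)
sparseMat # a 3 x 3 dgCMatrix: rows = subjects, columns = covariates
```
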
diff --git a/man/weakRecalibration.Rd b/man/weakRecalibration.Rd
new file mode 100644
index 000000000..24f1639cb
--- /dev/null
+++ b/man/weakRecalibration.Rd
@@ -0,0 +1,22 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/Recalibration.R
+\name{weakRecalibration}
+\alias{weakRecalibration}
+\title{weakRecalibration}
+\usage{
+weakRecalibration(prediction, columnType = "evaluationType")
+}
+\arguments{
+\item{prediction}{A prediction dataframe}
+
+\item{columnType}{The column name where the strata types are specified}
+}
+\value{
+A prediction dataframe with the recalibrated predictions added
+}
+\description{
+Recalibrate a model using the weakRecalibration method, which fits a glm
+to the logit of the predicted risks.
+Also known as Platt scaling/logistic recalibration.
+}
+\keyword{internal}
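
For readers of this new page, the named method (fitting a glm to the logit of the predicted risks) can be sketched in base R on simulated predictions; this illustrates Platt scaling itself, not the package's internal implementation:

```r
# Illustrative weak recalibration / Platt scaling on simulated predictions
set.seed(42)
prediction <- data.frame(
  value = runif(100, 0.01, 0.99),    # predicted risks
  outcomeCount = rbinom(100, 1, 0.3) # observed binary outcomes
)
logit <- function(p) log(p / (1 - p))
# Fit intercept and slope on the logit of the predicted risk
fit <- glm(outcomeCount ~ logit(value), data = prediction, family = binomial)
# Recalibrated risks are the inverse logit of (intercept + slope * logit(p))
prediction$recalibratedValue <- predict(fit, type = "response")
coef(fit) # recalibration intercept and slope
```
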
diff --git a/tests/testthat.R b/tests/testthat.R
index f18e635a2..de2dd523e 100644
--- a/tests/testthat.R
+++ b/tests/testthat.R
@@ -1,6 +1,4 @@
-#Sys.setenv("R_TESTS" = "")
-#options(fftempdir = file.path(getwd(),'fftemp'))
library(testthat)
library(PatientLevelPrediction)
test_check("PatientLevelPrediction")
-unlink('T:/Temp', recursive = T)
+unlink("T:/Temp", recursive = TRUE)
diff --git a/tests/testthat/helper-expectations.R b/tests/testthat/helper-expectations.R
index 25210fdf6..7dd3a6b9a 100644
--- a/tests/testthat/helper-expectations.R
+++ b/tests/testthat/helper-expectations.R
@@ -1,28 +1,28 @@
# common tests that can be grouped together, such as testing the output from fitPlp
-
expect_correct_fitPlp <- function(plpModel, trainData) {
-
# predictions are same amount as labels
- multiplicativeFactor <- dplyr::n_distinct(plpModel$prediction %>% dplyr::pull(evaluationType))
- expect_equal(NROW(trainData$labels)*multiplicativeFactor, NROW(plpModel$prediction))
-
+ multiplicativeFactor <- dplyr::n_distinct(plpModel$prediction %>%
+ dplyr::pull(.data$evaluationType))
+ expect_equal(NROW(trainData$labels) * multiplicativeFactor, NROW(plpModel$prediction))
+
# predictions are all between 0 and 1
- expect_true(all((plpModel$prediction$value >= 0) &
- (plpModel$prediction$value <= 1)))
+ expect_true(all((plpModel$prediction$value >= 0) &
+ (plpModel$prediction$value <= 1)))
# model directory exists
expect_true(dir.exists(plpModel$model))
-
+
expect_equal(plpModel$modelDesign$outcomeId, outcomeId)
expect_equal(plpModel$modelDesign$targetId, 1)
-
+
# structure of plpModel is correct
- expect_equal(names(plpModel), c("model", "preprocessing", "prediction",
- "modelDesign", "trainDetails", "covariateImportance"))
+ expect_equal(names(plpModel), c(
+ "model", "preprocessing", "prediction",
+ "modelDesign", "trainDetails", "covariateImportance"
+ ))
}
expect_correct_predictions <- function(predictions, testData) {
-
# predictions are all between 0 and 1
expect_true(all((predictions$value >= 0) & (predictions$value <= 1)))
expect_equal(NROW(testData$labels), NROW(predictions))
diff --git a/tests/testthat/helper-functions.R b/tests/testthat/helper-functions.R
index 7170cf2aa..9a72b1b24 100644
--- a/tests/testthat/helper-functions.R
+++ b/tests/testthat/helper-functions.R
@@ -1,9 +1,9 @@
# helper functions for tests
-# copies trainData and makes sure andromeda object is copied correctly
+# copies trainData and makes sure andromeda object is copied correctly
copyTrainData <- function(trainData) {
newTrainData <- trainData
-
+
# force andromeda to copy
newTrainData$covariateData <- Andromeda::copyAndromeda(trainData$covariateData)
@@ -12,21 +12,29 @@ copyTrainData <- function(trainData) {
}
# create tiny dataset with subset of covariates based on lasso fit
-createTinyPlpData <- function(plpData, plpResult, n= 20) {
-
- covariates <- plpResult$model$covariateImportance %>%
- dplyr::slice_max(order_by = abs(.data$covariateValue),
- n = n, with_ties = F) %>%
+createTinyPlpData <- function(plpData, plpResult, n = 20) {
+ covariates <- plpResult$model$covariateImportance %>%
+ dplyr::slice_max(
+ order_by = abs(.data$covariateValue),
+ n = n, with_ties = FALSE
+ ) %>%
dplyr::pull(.data$covariateId)
tinyPlpData <- plpData
tinyPlpData$covariateData <- Andromeda::copyAndromeda(plpData$covariateData)
-
- tinyPlpData$covariateData$covariates <- plpData$covariateData$covariates %>%
- dplyr::filter(covariateId %in% covariates)
- tinyPlpData$covariateData$covariateRef <- plpData$covariateData$covariateRef %>%
- dplyr::filter(covariateId %in% covariates)
-
- attributes(tinyPlpData$covariateData)$metaData <- attributes(plpData$covariateData)$metaData
+
+ tinyPlpData$covariateData$covariates <- plpData$covariateData$covariates %>%
+ dplyr::filter(.data$covariateId %in% covariates)
+ tinyPlpData$covariateData$covariateRef <- plpData$covariateData$covariateRef %>%
+ dplyr::filter(.data$covariateId %in% covariates)
+
+ rowIds <- tinyPlpData$covariateData$covariates %>%
+ dplyr::pull(.data$rowId) %>%
+ unique()
+ tinyPlpData$cohorts <- plpData$cohorts %>%
+ dplyr::filter(.data$rowId %in% rowIds)
+
+ attributes(tinyPlpData$covariateData)$metaData <-
+ attributes(plpData$covariateData)$metaData
class(tinyPlpData$covariateData) <- class(plpData$covariateData)
attributes(tinyPlpData)$metaData <- attributes(plpData)$metaData
class(tinyPlpData) <- class(plpData)
@@ -42,22 +50,24 @@ createData <- function(observations, features, totalFeatures,
columnId <- sample(1:totalFeatures, observations * features, replace = TRUE)
})
covariateValue <- rep(1, observations * features)
- covariates <- data.frame(rowId = rowId, columnId = columnId, covariateValue = covariateValue)
+ covariates <- data.frame(rowId = rowId, columnId = columnId, covariateValue = covariateValue)
if (numCovs) {
numRow <- 1:observations
numCol <- rep(totalFeatures + 1, observations)
withr::with_seed(seed, {
numVal <- runif(observations)
})
- numCovariates <- data.frame(rowId = as.integer(numRow),
- columnId = as.integer(numCol),
- covariateValue = numVal)
+ numCovariates <- data.frame(
+ rowId = as.integer(numRow),
+ columnId = as.integer(numCol),
+ covariateValue = numVal
+ )
covariates <- rbind(covariates, numCovariates)
}
withr::with_seed(seed, {
- labels <- as.numeric(sample(0:1, observations, replace = TRUE, prob = c(1 - outcomeRate, outcomeRate)))
+ labels <- as.numeric(sample(0:1, observations, replace = TRUE, prob = c(1 - outcomeRate, outcomeRate)))
})
-
- data <- list(covariates = covariates, labels = labels)
+
+ data <- list(covariates = covariates, labels = labels)
return(data)
}
diff --git a/tests/testthat/helper-objects.R b/tests/testthat/helper-objects.R
index fccf9bf1d..d4084221b 100644
--- a/tests/testthat/helper-objects.R
+++ b/tests/testthat/helper-objects.R
@@ -1,6 +1,5 @@
-
# this file contains the objects used in the tests:
-if (Sys.getenv('GITHUB_ACTIONS') == 'true') {
+if (Sys.getenv("GITHUB_ACTIONS") == "true") {
  # Download the PostgreSQL driver ---------------------------
# If DATABASECONNECTOR_JAR_FOLDER exists, assume driver has been downloaded
jarFolder <- Sys.getenv("DATABASECONNECTOR_JAR_FOLDER", unset = "")
@@ -9,22 +8,28 @@ if (Sys.getenv('GITHUB_ACTIONS') == 'true') {
dir.create(tempJarFolder)
Sys.setenv("DATABASECONNECTOR_JAR_FOLDER" = tempJarFolder)
DatabaseConnector::downloadJdbcDrivers("postgresql")
-
- withr::defer({
- unlink(tempJarFolder, recursive = TRUE, force = TRUE)
- Sys.unsetenv("DATABASECONNECTOR_JAR_FOLDER")
- }, testthat::teardown_env())
+
+ withr::defer(
+ {
+ unlink(tempJarFolder, recursive = TRUE, force = TRUE)
+ Sys.unsetenv("DATABASECONNECTOR_JAR_FOLDER")
+ },
+ testthat::teardown_env()
+ )
}
-
+
# configure and activate python
- PatientLevelPrediction::configurePython(envname = 'r-reticulate', envtype = "conda")
- PatientLevelPrediction::setPythonEnvironment(envname = 'r-reticulate', envtype = "conda")
-
+ rlang::check_installed("reticulate")
+ PatientLevelPrediction::configurePython(envname = "r-reticulate", envtype = "conda")
+ PatientLevelPrediction::setPythonEnvironment(envname = "r-reticulate", envtype = "conda")
+
# if mac install nomkl -- trying to fix github actions
- if (ifelse(is.null(Sys.info()), F, Sys.info()['sysname'] == 'Darwin')){
- reticulate::conda_install(envname = 'r-reticulate', packages = c('nomkl'),
- forge = TRUE, pip = FALSE, pip_ignore_installed = TRUE,
- conda = "auto")
+ if (ifelse(is.null(Sys.info()), FALSE, Sys.info()["sysname"] == "Darwin")) {
+ reticulate::conda_install(
+ envname = "r-reticulate", packages = c("nomkl"),
+ forge = TRUE, pip = FALSE, pip_ignore_installed = TRUE,
+ conda = "auto"
+ )
}
}
@@ -32,28 +37,23 @@ if (Sys.getenv('GITHUB_ACTIONS') == 'true') {
saveLoc <- tempfile("saveLoc")
dir.create(saveLoc)
-
-#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-# simulated data Tests
-#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-data("plpDataSimulationProfile")
-
# PLPDATA
connectionDetails <- Eunomia::getEunomiaConnectionDetails()
Eunomia::createCohorts(connectionDetails)
outcomeId <- 3 # GIbleed
databaseDetails <- createDatabaseDetails(
- connectionDetails = connectionDetails,
- cdmDatabaseSchema = "main",
+ connectionDetails = connectionDetails,
+ cdmDatabaseSchema = "main",
cdmDatabaseName = "main",
- cohortDatabaseSchema = "main",
- cohortTable = "cohort",
- outcomeDatabaseSchema = "main",
- outcomeTable = "cohort",
- targetId = 1,
+ cohortDatabaseSchema = "main",
+ cohortTable = "cohort",
+ outcomeDatabaseSchema = "main",
+ outcomeTable = "cohort",
+ targetId = 1,
outcomeIds = outcomeId,
- cdmVersion = 5)
+ cdmVersion = 5
+)
covariateSettings <- FeatureExtraction::createCovariateSettings(
useDemographicsAge = TRUE,
@@ -61,9 +61,11 @@ covariateSettings <- FeatureExtraction::createCovariateSettings(
useConditionOccurrenceAnyTimePrior = TRUE
)
-plpData <- getPlpData(databaseDetails = databaseDetails,
- covariateSettings = covariateSettings,
- restrictPlpDataSettings = createRestrictPlpDataSettings())
+plpData <- getPlpData(
+ databaseDetails = databaseDetails,
+ covariateSettings = covariateSettings,
+ restrictPlpDataSettings = createRestrictPlpDataSettings()
+)
# POPULATION
populationSettings <- createStudyPopulationSettings(
@@ -71,50 +73,49 @@ populationSettings <- createStudyPopulationSettings(
washoutPeriod = 0,
removeSubjectsWithPriorOutcome = FALSE,
priorOutcomeLookback = 99999,
- requireTimeAtRisk = T,
+ requireTimeAtRisk = TRUE,
minTimeAtRisk = 10,
riskWindowStart = 0,
- startAnchor = 'cohort start',
+ startAnchor = "cohort start",
riskWindowEnd = 365,
- endAnchor = 'cohort start'
- )
-
+ endAnchor = "cohort start"
+)
# MODEL SETTINGS
lrSet <- setLassoLogisticRegression(seed = 42)
# RUNPLP - LASSO LR
plpResult <- runPlp(
- plpData = plpData,
- outcomeId = outcomeId,
- analysisId = 'Test',
- analysisName = 'Testing analysis',
- populationSettings = populationSettings,
+ plpData = plpData,
+ outcomeId = outcomeId,
+ analysisId = "Test",
+ analysisName = "Testing analysis",
+ populationSettings = populationSettings,
splitSettings = createDefaultSplitSetting(splitSeed = 12),
- preprocessSettings = createPreprocessSettings(),
- modelSettings = lrSet,
- logSettings = createLogSettings(verbosity = 'ERROR'),
- executeSettings = createDefaultExecuteSettings(),
+ preprocessSettings = createPreprocessSettings(),
+ modelSettings = lrSet,
+ logSettings = createLogSettings(verbosity = "ERROR"),
+ executeSettings = createDefaultExecuteSettings(),
saveDirectory = saveLoc
- )
+)
# now diagnose
diagnoseResult <- diagnosePlp(
plpData = plpData,
outcomeId = outcomeId,
- analysisId = 'Test',
+ analysisId = "Test",
populationSettings = populationSettings,
splitSettings = createDefaultSplitSetting(splitSeed = 12),
saveDirectory = saveLoc,
modelSettings = lrSet,
- logSettings = createLogSettings(
- verbosity = 'DEBUG',
- timeStamp = T,
- logName = 'diagnosePlp Log'
+ logSettings = createLogSettings(
+ verbosity = "DEBUG",
+ timeStamp = TRUE,
+ logName = "diagnosePlp Log"
),
- preprocessSettings = createPreprocessSettings(),
- sampleSettings = NULL,
+ preprocessSettings = createPreprocessSettings(),
+ sampleSettings = NULL,
featureEngineeringSettings = NULL
)
@@ -123,24 +124,28 @@ diagnoseResult <- diagnosePlp(
population <- createStudyPopulation(
plpData = plpData,
- outcomeId = outcomeId,
+ outcomeId = outcomeId,
populationSettings = populationSettings
- )
+)
-createTrainData <- function(plpData, population){
- data <- PatientLevelPrediction::splitData(plpData = plpData,
- population = population,
- splitSettings = PatientLevelPrediction::createDefaultSplitSetting(splitSeed = 12))
+createTrainData <- function(plpData, population) {
+ data <- PatientLevelPrediction::splitData(
+ plpData = plpData,
+ population = population,
+ splitSettings = PatientLevelPrediction::createDefaultSplitSetting(splitSeed = 12)
+ )
trainData <- data$Train
return(trainData)
}
trainData <- createTrainData(plpData, population)
-createTestData <- function(plpData, population){
- data <- PatientLevelPrediction::splitData(plpData = plpData,
- population = population,
- splitSettings = PatientLevelPrediction::createDefaultSplitSetting(splitSeed = 12))
+createTestData <- function(plpData, population) {
+ data <- PatientLevelPrediction::splitData(
+ plpData = plpData,
+ population = population,
+ splitSettings = PatientLevelPrediction::createDefaultSplitSetting(splitSeed = 12)
+ )
testData <- data$Test
return(testData)
}
@@ -148,27 +153,29 @@ createTestData <- function(plpData, population){
testData <- createTestData(plpData, population)
# reduced trainData to only use n most important features (as decided by LR)
-reduceTrainData <- function(trainData, n=20) {
- covariates <- plpResult$model$covariateImportance %>%
- dplyr::slice_max(order_by = abs(.data$covariateValue),n = n, with_ties = F) %>%
+reduceTrainData <- function(trainData, n = 20) {
+ covariates <- plpResult$model$covariateImportance %>%
+ dplyr::slice_max(order_by = abs(.data$covariateValue), n = n, with_ties = FALSE) %>%
dplyr::pull(.data$covariateId)
-
- reducedTrainData <- list(labels = trainData$labels,
- folds = trainData$folds,
- covariateData = Andromeda::andromeda(
- analysisRef = trainData$covariateData$analysisRef
- ))
-
-
- reducedTrainData$covariateData$covariates <- trainData$covariateData$covariates %>%
- dplyr::filter(covariateId %in% covariates)
- reducedTrainData$covariateData$covariateRef <- trainData$covariateData$covariateRef %>%
- dplyr::filter(covariateId %in% covariates)
-
+
+ reducedTrainData <- list(
+ labels = trainData$labels,
+ folds = trainData$folds,
+ covariateData = Andromeda::andromeda(
+ analysisRef = trainData$covariateData$analysisRef
+ )
+ )
+
+
+ reducedTrainData$covariateData$covariates <- trainData$covariateData$covariates %>%
+ dplyr::filter(.data$covariateId %in% covariates)
+ reducedTrainData$covariateData$covariateRef <- trainData$covariateData$covariateRef %>%
+ dplyr::filter(.data$covariateId %in% covariates)
+
attributes(reducedTrainData$covariateData)$metaData <- attributes(trainData$covariateData)$metaData
class(reducedTrainData$covariateData) <- class(trainData$covariateData)
attributes(reducedTrainData)$metaData <- attributes(trainData)$metaData
- return(reducedTrainData)
+ return(reducedTrainData)
}
tinyTrainData <- reduceTrainData(trainData)
@@ -176,16 +183,18 @@ tinyTrainData <- reduceTrainData(trainData)
tinyPlpData <- createTinyPlpData(plpData, plpResult)
nanoData <- createTinyPlpData(plpData, plpResult, n = 2)
-tinyResults <- runPlp(plpData = nanoData,
- populationSettings = populationSettings,
- outcomeId = outcomeId,
- analysisId = 'tinyFit',
- executeSettings = createExecuteSettings(
- runSplitData = T,
- runSampleData = F,
- runfeatureEngineering = F,
- runPreprocessData = T,
- runModelDevelopment = T,
- runCovariateSummary = F
- ),
- saveDirectory = file.path(saveLoc, 'tinyResults'))
+tinyResults <- runPlp(
+ plpData = nanoData,
+ populationSettings = populationSettings,
+ outcomeId = outcomeId,
+ analysisId = "tinyFit",
+ executeSettings = createExecuteSettings(
+ runSplitData = TRUE,
+ runSampleData = FALSE,
+ runfeatureEngineering = FALSE,
+ runPreprocessData = TRUE,
+ runModelDevelopment = TRUE,
+ runCovariateSummary = FALSE
+ ),
+ saveDirectory = file.path(saveLoc, "tinyResults")
+)
diff --git a/tests/testthat/test-KNN.R b/tests/testthat/test-KNN.R
deleted file mode 100644
index 001e20f3b..000000000
--- a/tests/testthat/test-KNN.R
+++ /dev/null
@@ -1,31 +0,0 @@
-
-
-test_that('KNN fit works', {
- skip_on_ci()
- modelSettings = setKNN(k = 2)
- nanoTrainData <- reduceTrainData(tinyTrainData, n = 2)
- subjectToKeep <- nanoTrainData$labels[sample.int(nrow(nanoTrainData$labels), 50),"rowId"]
- nanoTrainData$labels <- nanoTrainData$labels[nanoTrainData$labels$rowId %in% subjectToKeep,]
- nanoTrainData$folds <- nanoTrainData$folds[nanoTrainData$folds$rowId %in% subjectToKeep,]
- nanoTrainData$covariateData$covariates <- nanoTrainData$covariateData$covariates %>% dplyr::filter(.data$rowId %in% subjectToKeep)
- plpModel <- fitPlp(
- trainData = nanoTrainData,
- modelSettings = modelSettings,
- analysisId = 'KNN',
- analysisPath = tempdir()
- )
-
- expect_correct_fitPlp(plpModel, nanoTrainData)
-
-})
-
-
-test_that("KNN settings", {
- skip_on_ci()
-
-model_set <- setKNN(k=5)
-testthat::expect_is(model_set, "modelSettings")
-testthat::expect_length(model_set,2)
-testthat::expect_error(setKNN(k = 0))
-testthat::expect_error(setKNN(indexFolder = 2372))
-})
diff --git a/tests/testthat/test-LightGBM.R b/tests/testthat/test-LightGBM.R
index 35e742cfb..071d10be2 100644
--- a/tests/testthat/test-LightGBM.R
+++ b/tests/testthat/test-LightGBM.R
@@ -20,41 +20,42 @@ context("LightGBM")
test_that("LightGBM settings work", {
-
- seed <- sample(10000000,1)
- #=====================================
+ skip_if_not_installed("lightgbm")
+ skip_on_cran()
+ seed <- sample(10000000, 1)
+ # =====================================
# checking Light GBM
- #=====================================
+ # =====================================
lgbmSet <- setLightGBM(
- nthread = 5,
+ nthread = 5,
earlyStopRound = 25,
numIterations = 10,
numLeaves = c(31, 20),
- maxDepth = 5,
- minDataInLeaf = 10,
+ maxDepth = 5,
+ minDataInLeaf = 10,
learningRate = 0.1,
lambdaL1 = 0,
- lambdaL2 =0,
+ lambdaL2 = 0,
scalePosWeight = 1,
- isUnbalance = F,
+ isUnbalance = FALSE,
seed = seed
)
-
- expect_is(lgbmSet, 'modelSettings')
- expect_equal(lgbmSet$fitFunction, 'fitRclassifier')
- expect_is(lgbmSet$param, 'list')
-
- expect_equal(attr(lgbmSet$param, 'settings')$modelType, 'LightGBM')
- expect_equal(attr(lgbmSet$param, 'settings')$seed, seed)
- expect_equal(attr(lgbmSet$param, 'settings')$modelName, "LightGBM")
-
- expect_equal(attr(lgbmSet$param, 'settings')$threads, 5)
- expect_equal(attr(lgbmSet$param, 'settings')$varImpRFunction, 'varImpLightGBM')
- expect_equal(attr(lgbmSet$param, 'settings')$trainRFunction, 'fitLightGBM')
- expect_equal(attr(lgbmSet$param, 'settings')$predictRFunction, 'predictLightGBM')
-
- expect_equal(length(lgbmSet$param),2)
-
+
+ expect_is(lgbmSet, "modelSettings")
+ expect_equal(lgbmSet$fitFunction, "fitRclassifier")
+ expect_is(lgbmSet$param, "list")
+
+ expect_equal(attr(lgbmSet$param, "settings")$modelType, "LightGBM")
+ expect_equal(attr(lgbmSet$param, "settings")$seed, seed)
+ expect_equal(attr(lgbmSet$param, "settings")$modelName, "LightGBM")
+
+ expect_equal(attr(lgbmSet$param, "settings")$threads, 5)
+ expect_equal(attr(lgbmSet$param, "settings")$varImpRFunction, "varImpLightGBM")
+ expect_equal(attr(lgbmSet$param, "settings")$trainRFunction, "fitLightGBM")
+ expect_equal(attr(lgbmSet$param, "settings")$predictRFunction, "predictLightGBM")
+
+ expect_equal(length(lgbmSet$param), 2)
+
expect_equal(length(unique(unlist(lapply(lgbmSet$param, function(x) x$numIterations)))), 1)
expect_equal(length(unique(unlist(lapply(lgbmSet$param, function(x) x$numLeaves)))), 2)
expect_equal(length(unique(unlist(lapply(lgbmSet$param, function(x) x$earlyStopRound)))), 1)
@@ -65,57 +66,57 @@ test_that("LightGBM settings work", {
expect_equal(length(unique(unlist(lapply(lgbmSet$param, function(x) x$lambdaL2)))), 1)
expect_equal(length(unique(unlist(lapply(lgbmSet$param, function(x) x$scalePosWeight)))), 1)
expect_equal(length(unique(unlist(lapply(lgbmSet$param, function(x) x$isUnbalance)))), 1)
-
})
test_that("LightGBM settings expected errors", {
- #=====================================
+ skip_if_not_installed("lightgbm")
+ skip_on_cran()
+ # =====================================
  # checking LightGBM
- #=====================================
-
+ # =====================================
+
testthat::expect_error(setLightGBM(numIterations = -1))
testthat::expect_error(setLightGBM(numLeaves = -1))
testthat::expect_error(setLightGBM(numLeaves = 10000000))
testthat::expect_error(setLightGBM(learningRate = -2))
- testthat::expect_error(setLightGBM(seed = 'F'))
+ testthat::expect_error(setLightGBM(seed = "F"))
testthat::expect_error(setLightGBM(lambdaL1 = -1))
testthat::expect_error(setLightGBM(lambdaL2 = -1))
testthat::expect_error(setLightGBM(scalePosWeight = -1))
testthat::expect_error(setLightGBM(isUnbalance = TRUE, scalePosWeight = 0.5))
-
})
test_that("LightGBM working checks", {
-
+ skip_if_not_installed("lightgbm")
+ skip_on_cran()
modelSettings <- setLightGBM(numIterations = 10, maxDepth = 3, learningRate = 0.1, numLeaves = 31, minDataInLeaf = 10, lambdaL1 = 0, lambdaL2 = 0)
-
+
fitModel <- fitPlp(
- trainData = trainData,
- modelSettings = modelSettings,
- analysisId = 'lgbmTest',
+ trainData = trainData,
+ modelSettings = modelSettings,
+ analysisId = "lgbmTest",
analysisPath = tempdir()
)
-
- expect_equal(nrow(fitModel$prediction), nrow(trainData$labels)*2)
- expect_equal(length(unique(fitModel$prediction$evaluationType)),2)
-
+
+ expect_equal(nrow(fitModel$prediction), nrow(trainData$labels) * 2)
+ expect_equal(length(unique(fitModel$prediction$evaluationType)), 2)
+
# check prediction between 0 and 1
expect_gte(min(fitModel$prediction$value), 0)
expect_lte(max(fitModel$prediction$value), 1)
-
+
expect_equal(class(fitModel$model), c("lgb.Booster", "R6"))
-
+
expect_lte(nrow(fitModel$covariateImportance), trainData$covariateData$covariateRef %>% dplyr::tally() %>% dplyr::pull())
-
+
expect_equal(fitModel$modelDesign$outcomeId, outcomeId)
expect_equal(fitModel$modelDesign$targetId, 1)
# TODO check other model design values?
-
+
# test that at least some features have importances that are not zero
- expect_equal(sum(abs(fitModel$covariateImportance$covariateValue))>0, TRUE)
-
+ expect_equal(sum(abs(fitModel$covariateImportance$covariateValue)) > 0, TRUE)
})
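
The `length(lgbmSet$param)` and per-hyperparameter uniqueness expectations above are consistent with the settings object holding one entry per combination of the supplied hyperparameter values; a hypothetical sketch of that cross-product idea (not the package internals):

```r
# Hypothetical sketch of a hyperparameter grid like the one the tests expect:
# one list entry per combination of the supplied values.
grid <- expand.grid(
  numIterations = 10,
  numLeaves = c(31, 20),
  maxDepth = 5,
  learningRate = 0.1
)
paramList <- split(grid, seq_len(nrow(grid)))
length(paramList) # 2, matching length(lgbmSet$param) above
```
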
diff --git a/tests/testthat/test-PredictionDistribution.R b/tests/testthat/test-PredictionDistribution.R
index 0d2b88b05..5105a6e64 100644
--- a/tests/testthat/test-PredictionDistribution.R
+++ b/tests/testthat/test-PredictionDistribution.R
@@ -17,47 +17,43 @@ context("PredictionDistribution")
test_that("getPredictionDistribution binary type", {
Eprediction <- data.frame(
- value= runif(100),
- outcomeCount =round(runif(100)),
- evaluation = rep('Test',100)
- )
+ value = runif(100),
+ outcomeCount = round(runif(100)),
+ evaluation = rep("Test", 100)
+ )
predSum <- getPredictionDistribution(
prediction = Eprediction,
- predictionType = 'binary',
- typeColumn = 'evaluation'
- )
-
- expect_that(nrow(predSum ), equals(2))
- expect_that(ncol(predSum ), equals(12))
-
-
-
+ predictionType = "binary",
+ typeColumn = "evaluation"
+ )
+
+ expect_that(nrow(predSum), equals(2))
+ expect_that(ncol(predSum), equals(12))
+
+
+
predBinary <- getPredictionDistribution_binary(
- prediction = Eprediction,
- evaluation = rep('Test',100),
- evalColumn = 'evaluation'
- )
-
+ prediction = Eprediction,
+ evaluation = rep("Test", 100),
+ evalColumn = "evaluation"
+ )
+
expect_equal(predBinary, predSum)
-
})
test_that("getPredictionDistribution survival type", {
Eprediction <- data.frame(
- value= runif(100),
- outcomeCount =round(runif(100)),
- evaluation = rep('Test',100)
+ value = runif(100),
+ outcomeCount = round(runif(100)),
+ evaluation = rep("Test", 100)
)
-
- predSurvival <- getPredictionDistribution_survival(
- prediction = Eprediction,
- evaluation = rep('Test',100),
- evalColumn = 'evaluation'
+
+ predSurvival <- getPredictionDistribution_survival(
+ prediction = Eprediction,
+ evaluation = rep("Test", 100),
+ evalColumn = "evaluation"
)
-
- expect_true(is.null(predSurvival))
-
+ expect_true(is.null(predSurvival))
})
-
diff --git a/tests/testthat/test-ThresholdSummary.R b/tests/testthat/test-ThresholdSummary.R
index 9d8e5ba8d..70c42ae22 100644
--- a/tests/testthat/test-ThresholdSummary.R
+++ b/tests/testthat/test-ThresholdSummary.R
@@ -20,258 +20,258 @@ context("ThresholdSummary")
test_that("getThresholdSummary binary", {
Eprediction <- data.frame(
- value = runif(100),
+ value = runif(100),
outcomeCount = round(runif(100)),
- evaluation = rep('Test', 100)
- )
-
+ evaluation = rep("Test", 100)
+ )
+
thresSum <- getThresholdSummary(
prediction = Eprediction,
- predictionType = 'binary',
- typeColumn = 'evaluation'
+ predictionType = "binary",
+ typeColumn = "evaluation"
)
-
- expect_true('evaluation' %in% colnames(thresSum))
- expect_that(nrow(thresSum), equals(length(unique(Eprediction$value))))
+
+ expect_true("evaluation" %in% colnames(thresSum))
+ expect_that(nrow(thresSum), equals(length(unique(Eprediction$value))))
expect_that(ncol(thresSum), equals(24))
-
- expect_that(thresSum$truePositiveCount+thresSum$falseNegativeCount,
- equals(rep(sum(Eprediction$outcomeCount),length(thresSum$truePositiveCount))))
-
- expect_that(thresSum$truePositiveCount+thresSum$falsePositiveCount+
- thresSum$trueNegativeCount+thresSum$falseNegativeCount,
- equals(rep(nrow(Eprediction),length(thresSum$truePositiveCount))))
-
+
+ expect_that(
+ thresSum$truePositiveCount + thresSum$falseNegativeCount,
+ equals(rep(sum(Eprediction$outcomeCount), length(thresSum$truePositiveCount)))
+ )
+
+ expect_that(
+ thresSum$truePositiveCount + thresSum$falsePositiveCount +
+ thresSum$trueNegativeCount + thresSum$falseNegativeCount,
+ equals(rep(nrow(Eprediction), length(thresSum$truePositiveCount)))
+ )
+
  # now do binary directly
-
+
thresSumBin <- getThresholdSummary_binary(
- prediction = Eprediction,
- evalColumn= 'evaluation'
- )
-
+ prediction = Eprediction,
+ evalColumn = "evaluation"
+ )
+
expect_equal(thresSumBin, thresSum)
-
})
test_that("getThresholdSummary survival", {
Eprediction <- data.frame(
value = c(
- (100+sample(10,50, replace = T)),
-
- (105+sample(10,150,replace = T))
- ),
- outcomeCount = c(rep(1,50), rep(0,150)),
- evaluation = rep('Test', 200),
- survivalTime = 50 + sample(365*2,200, replace = T)
+ (100 + sample(10, 50, replace = TRUE)),
+ (105 + sample(10, 150, replace = TRUE))
+ ),
+ outcomeCount = c(rep(1, 50), rep(0, 150)),
+ evaluation = rep("Test", 200),
+ survivalTime = 50 + sample(365 * 2, 200, replace = TRUE)
)
-
+
thresSum <- getThresholdSummary_survival(
- prediction = Eprediction,
- evalColumn= 'evaluation',
+ prediction = Eprediction,
+ evalColumn = "evaluation",
timepoint = 365
)
-
- expect_true('evaluation' %in% colnames(thresSum))
- expect_true(nrow(thresSum)>0 )
+
+ expect_true("evaluation" %in% colnames(thresSum))
+ expect_true(nrow(thresSum) > 0)
expect_that(ncol(thresSum), equals(5))
-
})
-if(F){
-test_that("f1Score", {
- netben <- stdca(
- data,
- outcome,
- ttoutcome,
- timepoint,
- predictors,
- xstart = 0.01,
- xstop = 0.99,
- xby = 0.01,
- ymin = -0.05,
- probability = NULL,
- harm = NULL,
- graph = TRUE,
- intervention = FALSE,
- interventionper = 100,
- smooth = FALSE,
- loess.span = 0.1,
- cmprsk = FALSE
- )
-
-})
+if (FALSE) {
+ test_that("f1Score", {
+ netben <- stdca(
+ data,
+ outcome,
+ ttoutcome,
+ timepoint,
+ predictors,
+ xstart = 0.01,
+ xstop = 0.99,
+ xby = 0.01,
+ ymin = -0.05,
+ probability = NULL,
+ harm = NULL,
+ graph = TRUE,
+ intervention = FALSE,
+ interventionper = 100,
+ smooth = FALSE,
+ loess.span = 0.1,
+ cmprsk = FALSE
+ )
+ })
}
test_that("f1Score", {
- expect_that(f1Score(TP=0,TN=0,FN=0,FP=0), equals(NaN))
- expect_that(f1Score(TP=-1,TN=0,FN=0,FP=0), throws_error())
- expect_that(f1Score(TP=1,TN=-1,FN=0,FP=0), throws_error())
- expect_that(f1Score(TP=1,TN=3,FN=-1,FP=0), throws_error())
- expect_that(f1Score(TP=1,TN=1,FN=5,FP=-1), throws_error())
- expect_that(f1Score(TP=NULL,TN=0,FN=0,FP=0), throws_error())
- expect_that(f1Score(TP=1,TN=NULL,FN=0,FP=0), throws_error())
- expect_that(f1Score(TP=1,TN=3,FN=NULL,FP=0), throws_error())
- expect_that(f1Score(TP=1,TN=1,FN=5,FP=NULL), throws_error())
- expect_that(f1Score(TP=10,TN=3,FN=5,FP=5), equals(0.6666667,tolerance = 0.0001) )
+ expect_that(f1Score(TP = 0, TN = 0, FN = 0, FP = 0), equals(NaN))
+ expect_that(f1Score(TP = -1, TN = 0, FN = 0, FP = 0), throws_error())
+ expect_that(f1Score(TP = 1, TN = -1, FN = 0, FP = 0), throws_error())
+ expect_that(f1Score(TP = 1, TN = 3, FN = -1, FP = 0), throws_error())
+ expect_that(f1Score(TP = 1, TN = 1, FN = 5, FP = -1), throws_error())
+ expect_that(f1Score(TP = NULL, TN = 0, FN = 0, FP = 0), throws_error())
+ expect_that(f1Score(TP = 1, TN = NULL, FN = 0, FP = 0), throws_error())
+ expect_that(f1Score(TP = 1, TN = 3, FN = NULL, FP = 0), throws_error())
+ expect_that(f1Score(TP = 1, TN = 1, FN = 5, FP = NULL), throws_error())
+ expect_that(f1Score(TP = 10, TN = 3, FN = 5, FP = 5), equals(0.6666667, tolerance = 0.0001))
})
test_that("accuracy", {
- expect_that(accuracy(TP=0,TN=0,FN=0,FP=0), equals(NaN))
- expect_that(accuracy(TP=-1,TN=0,FN=0,FP=0), throws_error())
- expect_that(accuracy(TP=1,TN=-1,FN=0,FP=0), throws_error())
- expect_that(accuracy(TP=1,TN=3,FN=-1,FP=0), throws_error())
- expect_that(accuracy(TP=1,TN=1,FN=5,FP=-1), throws_error())
- expect_that(accuracy(TP=NULL,TN=0,FN=0,FP=0), throws_error())
- expect_that(accuracy(TP=1,TN=NULL,FN=0,FP=0), throws_error())
- expect_that(accuracy(TP=1,TN=3,FN=NULL,FP=0), throws_error())
- expect_that(accuracy(TP=1,TN=1,FN=5,FP=NULL), throws_error())
- expect_that(accuracy(TP=10,TN=3,FN=5,FP=5), equals(13/23, tolerance = 0.0001))
+ expect_that(accuracy(TP = 0, TN = 0, FN = 0, FP = 0), equals(NaN))
+ expect_that(accuracy(TP = -1, TN = 0, FN = 0, FP = 0), throws_error())
+ expect_that(accuracy(TP = 1, TN = -1, FN = 0, FP = 0), throws_error())
+ expect_that(accuracy(TP = 1, TN = 3, FN = -1, FP = 0), throws_error())
+ expect_that(accuracy(TP = 1, TN = 1, FN = 5, FP = -1), throws_error())
+ expect_that(accuracy(TP = NULL, TN = 0, FN = 0, FP = 0), throws_error())
+ expect_that(accuracy(TP = 1, TN = NULL, FN = 0, FP = 0), throws_error())
+ expect_that(accuracy(TP = 1, TN = 3, FN = NULL, FP = 0), throws_error())
+ expect_that(accuracy(TP = 1, TN = 1, FN = 5, FP = NULL), throws_error())
+ expect_that(accuracy(TP = 10, TN = 3, FN = 5, FP = 5), equals(13 / 23, tolerance = 0.0001))
})
test_that("sensitivity", {
- expect_that(sensitivity(TP=0,TN=0,FN=0,FP=0), equals(NaN))
- expect_that(sensitivity(TP=-1,TN=0,FN=0,FP=0), throws_error())
- expect_that(sensitivity(TP=1,TN=-1,FN=0,FP=0), throws_error())
- expect_that(sensitivity(TP=1,TN=3,FN=-1,FP=0), throws_error())
- expect_that(sensitivity(TP=1,TN=1,FN=5,FP=-1), throws_error())
- expect_that(sensitivity(TP=NULL,TN=0,FN=0,FP=0), throws_error())
- expect_that(sensitivity(TP=1,TN=NULL,FN=0,FP=0), throws_error())
- expect_that(sensitivity(TP=1,TN=3,FN=NULL,FP=0), throws_error())
- expect_that(sensitivity(TP=1,TN=1,FN=5,FP=NULL), throws_error())
- expect_that(sensitivity(TP=10,TN=3,FN=5,FP=5), equals(10/(10+5),tolerance = 0.0001))
+ expect_that(sensitivity(TP = 0, TN = 0, FN = 0, FP = 0), equals(NaN))
+ expect_that(sensitivity(TP = -1, TN = 0, FN = 0, FP = 0), throws_error())
+ expect_that(sensitivity(TP = 1, TN = -1, FN = 0, FP = 0), throws_error())
+ expect_that(sensitivity(TP = 1, TN = 3, FN = -1, FP = 0), throws_error())
+ expect_that(sensitivity(TP = 1, TN = 1, FN = 5, FP = -1), throws_error())
+ expect_that(sensitivity(TP = NULL, TN = 0, FN = 0, FP = 0), throws_error())
+ expect_that(sensitivity(TP = 1, TN = NULL, FN = 0, FP = 0), throws_error())
+ expect_that(sensitivity(TP = 1, TN = 3, FN = NULL, FP = 0), throws_error())
+ expect_that(sensitivity(TP = 1, TN = 1, FN = 5, FP = NULL), throws_error())
+ expect_that(sensitivity(TP = 10, TN = 3, FN = 5, FP = 5), equals(10 / (10 + 5), tolerance = 0.0001))
})
test_that("falseNegativeRate", {
- expect_that(falseNegativeRate(TP=0,TN=0,FN=0,FP=0), equals(NaN))
- expect_that(falseNegativeRate(TP=-1,TN=0,FN=0,FP=0), throws_error())
- expect_that(falseNegativeRate(TP=1,TN=-1,FN=0,FP=0), throws_error())
- expect_that(falseNegativeRate(TP=1,TN=3,FN=-1,FP=0), throws_error())
- expect_that(falseNegativeRate(TP=1,TN=1,FN=5,FP=-1), throws_error())
- expect_that(falseNegativeRate(TP=NULL,TN=0,FN=0,FP=0), throws_error())
- expect_that(falseNegativeRate(TP=1,TN=NULL,FN=0,FP=0), throws_error())
- expect_that(falseNegativeRate(TP=1,TN=3,FN=NULL,FP=0), throws_error())
- expect_that(falseNegativeRate(TP=1,TN=1,FN=5,FP=NULL), throws_error())
- expect_that(falseNegativeRate(TP=10,TN=3,FN=5,FP=5), equals(5/(10+5), tolerance = 0.0001))
+ expect_that(falseNegativeRate(TP = 0, TN = 0, FN = 0, FP = 0), equals(NaN))
+ expect_that(falseNegativeRate(TP = -1, TN = 0, FN = 0, FP = 0), throws_error())
+ expect_that(falseNegativeRate(TP = 1, TN = -1, FN = 0, FP = 0), throws_error())
+ expect_that(falseNegativeRate(TP = 1, TN = 3, FN = -1, FP = 0), throws_error())
+ expect_that(falseNegativeRate(TP = 1, TN = 1, FN = 5, FP = -1), throws_error())
+ expect_that(falseNegativeRate(TP = NULL, TN = 0, FN = 0, FP = 0), throws_error())
+ expect_that(falseNegativeRate(TP = 1, TN = NULL, FN = 0, FP = 0), throws_error())
+ expect_that(falseNegativeRate(TP = 1, TN = 3, FN = NULL, FP = 0), throws_error())
+ expect_that(falseNegativeRate(TP = 1, TN = 1, FN = 5, FP = NULL), throws_error())
+ expect_that(falseNegativeRate(TP = 10, TN = 3, FN = 5, FP = 5), equals(5 / (10 + 5), tolerance = 0.0001))
})
test_that("falsePositiveRate", {
- expect_that(falsePositiveRate(TP=0,TN=0,FN=0,FP=0), equals(NaN))
- expect_that(falsePositiveRate(TP=-1,TN=0,FN=0,FP=0), throws_error())
- expect_that(falsePositiveRate(TP=1,TN=-1,FN=0,FP=0), throws_error())
- expect_that(falsePositiveRate(TP=1,TN=3,FN=-1,FP=0), throws_error())
- expect_that(falsePositiveRate(TP=1,TN=1,FN=5,FP=-1), throws_error())
- expect_that(falsePositiveRate(TP=NULL,TN=0,FN=0,FP=0), throws_error())
- expect_that(falsePositiveRate(TP=1,TN=NULL,FN=0,FP=0), throws_error())
- expect_that(falsePositiveRate(TP=1,TN=3,FN=NULL,FP=0), throws_error())
- expect_that(falsePositiveRate(TP=1,TN=1,FN=5,FP=NULL), throws_error())
- expect_that(falsePositiveRate(TP=10,TN=3,FN=5,FP=5), equals(5/(5+3), tolerance = 0.0001))
+ expect_that(falsePositiveRate(TP = 0, TN = 0, FN = 0, FP = 0), equals(NaN))
+ expect_that(falsePositiveRate(TP = -1, TN = 0, FN = 0, FP = 0), throws_error())
+ expect_that(falsePositiveRate(TP = 1, TN = -1, FN = 0, FP = 0), throws_error())
+ expect_that(falsePositiveRate(TP = 1, TN = 3, FN = -1, FP = 0), throws_error())
+ expect_that(falsePositiveRate(TP = 1, TN = 1, FN = 5, FP = -1), throws_error())
+ expect_that(falsePositiveRate(TP = NULL, TN = 0, FN = 0, FP = 0), throws_error())
+ expect_that(falsePositiveRate(TP = 1, TN = NULL, FN = 0, FP = 0), throws_error())
+ expect_that(falsePositiveRate(TP = 1, TN = 3, FN = NULL, FP = 0), throws_error())
+ expect_that(falsePositiveRate(TP = 1, TN = 1, FN = 5, FP = NULL), throws_error())
+ expect_that(falsePositiveRate(TP = 10, TN = 3, FN = 5, FP = 5), equals(5 / (5 + 3), tolerance = 0.0001))
})
test_that("specificity", {
- expect_that(specificity(TP=0,TN=0,FN=0,FP=0), equals(NaN))
- expect_that(specificity(TP=-1,TN=0,FN=0,FP=0), throws_error())
- expect_that(specificity(TP=1,TN=-1,FN=0,FP=0), throws_error())
- expect_that(specificity(TP=1,TN=3,FN=-1,FP=0), throws_error())
- expect_that(specificity(TP=1,TN=1,FN=5,FP=-1), throws_error())
- expect_that(specificity(TP=NULL,TN=0,FN=0,FP=0), throws_error())
- expect_that(specificity(TP=1,TN=NULL,FN=0,FP=0), throws_error())
- expect_that(specificity(TP=1,TN=3,FN=NULL,FP=0), throws_error())
- expect_that(specificity(TP=1,TN=1,FN=5,FP=NULL), throws_error())
- expect_that(specificity(TP=10,TN=3,FN=5,FP=5), equals(3/(5+3), tolerance = 0.0001))
+ expect_that(specificity(TP = 0, TN = 0, FN = 0, FP = 0), equals(NaN))
+ expect_that(specificity(TP = -1, TN = 0, FN = 0, FP = 0), throws_error())
+ expect_that(specificity(TP = 1, TN = -1, FN = 0, FP = 0), throws_error())
+ expect_that(specificity(TP = 1, TN = 3, FN = -1, FP = 0), throws_error())
+ expect_that(specificity(TP = 1, TN = 1, FN = 5, FP = -1), throws_error())
+ expect_that(specificity(TP = NULL, TN = 0, FN = 0, FP = 0), throws_error())
+ expect_that(specificity(TP = 1, TN = NULL, FN = 0, FP = 0), throws_error())
+ expect_that(specificity(TP = 1, TN = 3, FN = NULL, FP = 0), throws_error())
+ expect_that(specificity(TP = 1, TN = 1, FN = 5, FP = NULL), throws_error())
+ expect_that(specificity(TP = 10, TN = 3, FN = 5, FP = 5), equals(3 / (5 + 3), tolerance = 0.0001))
})
test_that("positivePredictiveValue", {
- expect_that(positivePredictiveValue(TP=0,TN=0,FN=0,FP=0), equals(NaN))
- expect_that(positivePredictiveValue(TP=-1,TN=0,FN=0,FP=0), throws_error())
- expect_that(positivePredictiveValue(TP=1,TN=-1,FN=0,FP=0), throws_error())
- expect_that(positivePredictiveValue(TP=1,TN=3,FN=-1,FP=0), throws_error())
- expect_that(positivePredictiveValue(TP=1,TN=1,FN=5,FP=-1), throws_error())
- expect_that(positivePredictiveValue(TP=NULL,TN=0,FN=0,FP=0), throws_error())
- expect_that(positivePredictiveValue(TP=1,TN=NULL,FN=0,FP=0), throws_error())
- expect_that(positivePredictiveValue(TP=1,TN=3,FN=NULL,FP=0), throws_error())
- expect_that(positivePredictiveValue(TP=1,TN=1,FN=5,FP=NULL), throws_error())
- expect_that(positivePredictiveValue(TP=10,TN=3,FN=5,FP=5), equals(10/(10+5), tolerance = 0.0001))
+ expect_that(positivePredictiveValue(TP = 0, TN = 0, FN = 0, FP = 0), equals(NaN))
+ expect_that(positivePredictiveValue(TP = -1, TN = 0, FN = 0, FP = 0), throws_error())
+ expect_that(positivePredictiveValue(TP = 1, TN = -1, FN = 0, FP = 0), throws_error())
+ expect_that(positivePredictiveValue(TP = 1, TN = 3, FN = -1, FP = 0), throws_error())
+ expect_that(positivePredictiveValue(TP = 1, TN = 1, FN = 5, FP = -1), throws_error())
+ expect_that(positivePredictiveValue(TP = NULL, TN = 0, FN = 0, FP = 0), throws_error())
+ expect_that(positivePredictiveValue(TP = 1, TN = NULL, FN = 0, FP = 0), throws_error())
+ expect_that(positivePredictiveValue(TP = 1, TN = 3, FN = NULL, FP = 0), throws_error())
+ expect_that(positivePredictiveValue(TP = 1, TN = 1, FN = 5, FP = NULL), throws_error())
+ expect_that(positivePredictiveValue(TP = 10, TN = 3, FN = 5, FP = 5), equals(10 / (10 + 5), tolerance = 0.0001))
})
test_that("falseDiscoveryRate", {
- expect_that(falseDiscoveryRate(TP=0,TN=0,FN=0,FP=0), equals(NaN))
- expect_that(falseDiscoveryRate(TP=-1,TN=0,FN=0,FP=0), throws_error())
- expect_that(falseDiscoveryRate(TP=1,TN=-1,FN=0,FP=0), throws_error())
- expect_that(falseDiscoveryRate(TP=1,TN=3,FN=-1,FP=0), throws_error())
- expect_that(falseDiscoveryRate(TP=1,TN=1,FN=5,FP=-1), throws_error())
- expect_that(falseDiscoveryRate(TP=NULL,TN=0,FN=0,FP=0), throws_error())
- expect_that(falseDiscoveryRate(TP=1,TN=NULL,FN=0,FP=0), throws_error())
- expect_that(falseDiscoveryRate(TP=1,TN=3,FN=NULL,FP=0), throws_error())
- expect_that(falseDiscoveryRate(TP=1,TN=1,FN=5,FP=NULL), throws_error())
- expect_that(falseDiscoveryRate(TP=10,TN=3,FN=5,FP=5), equals(5/(10+5), tolerance = 0.0001))
+ expect_that(falseDiscoveryRate(TP = 0, TN = 0, FN = 0, FP = 0), equals(NaN))
+ expect_that(falseDiscoveryRate(TP = -1, TN = 0, FN = 0, FP = 0), throws_error())
+ expect_that(falseDiscoveryRate(TP = 1, TN = -1, FN = 0, FP = 0), throws_error())
+ expect_that(falseDiscoveryRate(TP = 1, TN = 3, FN = -1, FP = 0), throws_error())
+ expect_that(falseDiscoveryRate(TP = 1, TN = 1, FN = 5, FP = -1), throws_error())
+ expect_that(falseDiscoveryRate(TP = NULL, TN = 0, FN = 0, FP = 0), throws_error())
+ expect_that(falseDiscoveryRate(TP = 1, TN = NULL, FN = 0, FP = 0), throws_error())
+ expect_that(falseDiscoveryRate(TP = 1, TN = 3, FN = NULL, FP = 0), throws_error())
+ expect_that(falseDiscoveryRate(TP = 1, TN = 1, FN = 5, FP = NULL), throws_error())
+ expect_that(falseDiscoveryRate(TP = 10, TN = 3, FN = 5, FP = 5), equals(5 / (10 + 5), tolerance = 0.0001))
})
test_that("negativePredictiveValue", {
- expect_that(negativePredictiveValue(TP=0,TN=0,FN=0,FP=0), equals(NaN))
- expect_that(negativePredictiveValue(TP=-1,TN=0,FN=0,FP=0), throws_error())
- expect_that(negativePredictiveValue(TP=1,TN=-1,FN=0,FP=0), throws_error())
- expect_that(negativePredictiveValue(TP=1,TN=3,FN=-1,FP=0), throws_error())
- expect_that(negativePredictiveValue(TP=1,TN=1,FN=5,FP=-1), throws_error())
- expect_that(negativePredictiveValue(TP=NULL,TN=0,FN=0,FP=0), throws_error())
- expect_that(negativePredictiveValue(TP=1,TN=NULL,FN=0,FP=0), throws_error())
- expect_that(negativePredictiveValue(TP=1,TN=3,FN=NULL,FP=0), throws_error())
- expect_that(negativePredictiveValue(TP=1,TN=1,FN=5,FP=NULL), throws_error())
- expect_that(negativePredictiveValue(TP=10,TN=3,FN=5,FP=5), equals(3/(5+3), tolerance = 0.0001))
+ expect_that(negativePredictiveValue(TP = 0, TN = 0, FN = 0, FP = 0), equals(NaN))
+ expect_that(negativePredictiveValue(TP = -1, TN = 0, FN = 0, FP = 0), throws_error())
+ expect_that(negativePredictiveValue(TP = 1, TN = -1, FN = 0, FP = 0), throws_error())
+ expect_that(negativePredictiveValue(TP = 1, TN = 3, FN = -1, FP = 0), throws_error())
+ expect_that(negativePredictiveValue(TP = 1, TN = 1, FN = 5, FP = -1), throws_error())
+ expect_that(negativePredictiveValue(TP = NULL, TN = 0, FN = 0, FP = 0), throws_error())
+ expect_that(negativePredictiveValue(TP = 1, TN = NULL, FN = 0, FP = 0), throws_error())
+ expect_that(negativePredictiveValue(TP = 1, TN = 3, FN = NULL, FP = 0), throws_error())
+ expect_that(negativePredictiveValue(TP = 1, TN = 1, FN = 5, FP = NULL), throws_error())
+ expect_that(negativePredictiveValue(TP = 10, TN = 3, FN = 5, FP = 5), equals(3 / (5 + 3), tolerance = 0.0001))
})
test_that("falseOmissionRate", {
- expect_that(falseOmissionRate(TP=0,TN=0,FN=0,FP=0), equals(NaN))
- expect_that(falseOmissionRate(TP=-1,TN=0,FN=0,FP=0), throws_error())
- expect_that(falseOmissionRate(TP=1,TN=-1,FN=0,FP=0), throws_error())
- expect_that(falseOmissionRate(TP=1,TN=3,FN=-1,FP=0), throws_error())
- expect_that(falseOmissionRate(TP=1,TN=1,FN=5,FP=-1), throws_error())
- expect_that(falseOmissionRate(TP=NULL,TN=0,FN=0,FP=0), throws_error())
- expect_that(falseOmissionRate(TP=1,TN=NULL,FN=0,FP=0), throws_error())
- expect_that(falseOmissionRate(TP=1,TN=3,FN=NULL,FP=0), throws_error())
- expect_that(falseOmissionRate(TP=1,TN=1,FN=5,FP=NULL), throws_error())
- expect_that(falseOmissionRate(TP=10,TN=3,FN=5,FP=5), equals(5/(5+3), tolerance = 0.0001))
+ expect_that(falseOmissionRate(TP = 0, TN = 0, FN = 0, FP = 0), equals(NaN))
+ expect_that(falseOmissionRate(TP = -1, TN = 0, FN = 0, FP = 0), throws_error())
+ expect_that(falseOmissionRate(TP = 1, TN = -1, FN = 0, FP = 0), throws_error())
+ expect_that(falseOmissionRate(TP = 1, TN = 3, FN = -1, FP = 0), throws_error())
+ expect_that(falseOmissionRate(TP = 1, TN = 1, FN = 5, FP = -1), throws_error())
+ expect_that(falseOmissionRate(TP = NULL, TN = 0, FN = 0, FP = 0), throws_error())
+ expect_that(falseOmissionRate(TP = 1, TN = NULL, FN = 0, FP = 0), throws_error())
+ expect_that(falseOmissionRate(TP = 1, TN = 3, FN = NULL, FP = 0), throws_error())
+ expect_that(falseOmissionRate(TP = 1, TN = 1, FN = 5, FP = NULL), throws_error())
+ expect_that(falseOmissionRate(TP = 10, TN = 3, FN = 5, FP = 5), equals(5 / (5 + 3), tolerance = 0.0001))
})
test_that("negativeLikelihoodRatio", {
- expect_that(negativeLikelihoodRatio(TP=0,TN=0,FN=0,FP=0), equals(NaN))
- expect_that(negativeLikelihoodRatio(TP=-1,TN=0,FN=0,FP=0), throws_error())
- expect_that(negativeLikelihoodRatio(TP=1,TN=-1,FN=0,FP=0), throws_error())
- expect_that(negativeLikelihoodRatio(TP=1,TN=3,FN=-1,FP=0), throws_error())
- expect_that(negativeLikelihoodRatio(TP=1,TN=1,FN=5,FP=-1), throws_error())
- expect_that(negativeLikelihoodRatio(TP=NULL,TN=0,FN=0,FP=0), throws_error())
- expect_that(negativeLikelihoodRatio(TP=1,TN=NULL,FN=0,FP=0), throws_error())
- expect_that(negativeLikelihoodRatio(TP=1,TN=3,FN=NULL,FP=0), throws_error())
- expect_that(negativeLikelihoodRatio(TP=1,TN=1,FN=5,FP=NULL), throws_error())
- expect_that(negativeLikelihoodRatio(TP=10,TN=3,FN=5,FP=5), equals((5/(10+5))/(3/(5+3)), tolerance = 0.0001))
+ expect_that(negativeLikelihoodRatio(TP = 0, TN = 0, FN = 0, FP = 0), equals(NaN))
+ expect_that(negativeLikelihoodRatio(TP = -1, TN = 0, FN = 0, FP = 0), throws_error())
+ expect_that(negativeLikelihoodRatio(TP = 1, TN = -1, FN = 0, FP = 0), throws_error())
+ expect_that(negativeLikelihoodRatio(TP = 1, TN = 3, FN = -1, FP = 0), throws_error())
+ expect_that(negativeLikelihoodRatio(TP = 1, TN = 1, FN = 5, FP = -1), throws_error())
+ expect_that(negativeLikelihoodRatio(TP = NULL, TN = 0, FN = 0, FP = 0), throws_error())
+ expect_that(negativeLikelihoodRatio(TP = 1, TN = NULL, FN = 0, FP = 0), throws_error())
+ expect_that(negativeLikelihoodRatio(TP = 1, TN = 3, FN = NULL, FP = 0), throws_error())
+ expect_that(negativeLikelihoodRatio(TP = 1, TN = 1, FN = 5, FP = NULL), throws_error())
+ expect_that(negativeLikelihoodRatio(TP = 10, TN = 3, FN = 5, FP = 5), equals((5 / (10 + 5)) / (3 / (5 + 3)), tolerance = 0.0001))
})
test_that("positiveLikelihoodRatio", {
- expect_that(positiveLikelihoodRatio(TP=0,TN=0,FN=0,FP=0), equals(NaN))
- expect_that(positiveLikelihoodRatio(TP=-1,TN=0,FN=0,FP=0), throws_error())
- expect_that(positiveLikelihoodRatio(TP=1,TN=-1,FN=0,FP=0), throws_error())
- expect_that(positiveLikelihoodRatio(TP=1,TN=3,FN=-1,FP=0), throws_error())
- expect_that(positiveLikelihoodRatio(TP=1,TN=1,FN=5,FP=-1), throws_error())
- expect_that(positiveLikelihoodRatio(TP=NULL,TN=0,FN=0,FP=0), throws_error())
- expect_that(positiveLikelihoodRatio(TP=1,TN=NULL,FN=0,FP=0), throws_error())
- expect_that(positiveLikelihoodRatio(TP=1,TN=3,FN=NULL,FP=0), throws_error())
- expect_that(positiveLikelihoodRatio(TP=1,TN=1,FN=5,FP=NULL), throws_error())
- expect_that(positiveLikelihoodRatio(TP=10,TN=3,FN=5,FP=5), equals((10/(10+5))/(5/(5+3)), tolerance = 0.0001))
+ expect_that(positiveLikelihoodRatio(TP = 0, TN = 0, FN = 0, FP = 0), equals(NaN))
+ expect_that(positiveLikelihoodRatio(TP = -1, TN = 0, FN = 0, FP = 0), throws_error())
+ expect_that(positiveLikelihoodRatio(TP = 1, TN = -1, FN = 0, FP = 0), throws_error())
+ expect_that(positiveLikelihoodRatio(TP = 1, TN = 3, FN = -1, FP = 0), throws_error())
+ expect_that(positiveLikelihoodRatio(TP = 1, TN = 1, FN = 5, FP = -1), throws_error())
+ expect_that(positiveLikelihoodRatio(TP = NULL, TN = 0, FN = 0, FP = 0), throws_error())
+ expect_that(positiveLikelihoodRatio(TP = 1, TN = NULL, FN = 0, FP = 0), throws_error())
+ expect_that(positiveLikelihoodRatio(TP = 1, TN = 3, FN = NULL, FP = 0), throws_error())
+ expect_that(positiveLikelihoodRatio(TP = 1, TN = 1, FN = 5, FP = NULL), throws_error())
+ expect_that(positiveLikelihoodRatio(TP = 10, TN = 3, FN = 5, FP = 5), equals((10 / (10 + 5)) / (5 / (5 + 3)), tolerance = 0.0001))
})
test_that("diagnosticOddsRatio", {
- expect_that(diagnosticOddsRatio(TP=0,TN=0,FN=0,FP=0), equals(NaN))
- expect_that(diagnosticOddsRatio(TP=-1,TN=0,FN=0,FP=0), throws_error())
- expect_that(diagnosticOddsRatio(TP=1,TN=-1,FN=0,FP=0), throws_error())
- expect_that(diagnosticOddsRatio(TP=1,TN=3,FN=-1,FP=0), throws_error())
- expect_that(diagnosticOddsRatio(TP=1,TN=1,FN=5,FP=-1), throws_error())
- expect_that(diagnosticOddsRatio(TP=NULL,TN=0,FN=0,FP=0), throws_error())
- expect_that(diagnosticOddsRatio(TP=1,TN=NULL,FN=0,FP=0), throws_error())
- expect_that(diagnosticOddsRatio(TP=1,TN=3,FN=NULL,FP=0), throws_error())
- expect_that(diagnosticOddsRatio(TP=1,TN=1,FN=5,FP=NULL), throws_error())
- expect_that(diagnosticOddsRatio(TP=10,TN=3,FN=5,FP=5), equals(((10/(10+5))/(5/(5+3)))/((5/(10+5))/(3/(5+3))), tolerance = 0.0001))
+ expect_that(diagnosticOddsRatio(TP = 0, TN = 0, FN = 0, FP = 0), equals(NaN))
+ expect_that(diagnosticOddsRatio(TP = -1, TN = 0, FN = 0, FP = 0), throws_error())
+ expect_that(diagnosticOddsRatio(TP = 1, TN = -1, FN = 0, FP = 0), throws_error())
+ expect_that(diagnosticOddsRatio(TP = 1, TN = 3, FN = -1, FP = 0), throws_error())
+ expect_that(diagnosticOddsRatio(TP = 1, TN = 1, FN = 5, FP = -1), throws_error())
+ expect_that(diagnosticOddsRatio(TP = NULL, TN = 0, FN = 0, FP = 0), throws_error())
+ expect_that(diagnosticOddsRatio(TP = 1, TN = NULL, FN = 0, FP = 0), throws_error())
+ expect_that(diagnosticOddsRatio(TP = 1, TN = 3, FN = NULL, FP = 0), throws_error())
+ expect_that(diagnosticOddsRatio(TP = 1, TN = 1, FN = 5, FP = NULL), throws_error())
+ expect_that(diagnosticOddsRatio(TP = 10, TN = 3, FN = 5, FP = 5), equals(((10 / (10 + 5)) / (5 / (5 + 3))) / ((5 / (10 + 5)) / (3 / (5 + 3))), tolerance = 0.0001))
})
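
All of the expected values in the metric tests above follow from the standard confusion-matrix formulas applied to the worked case TP = 10, TN = 3, FN = 5, FP = 5; a short standalone sketch makes the arithmetic explicit:

```r
# Confusion-matrix arithmetic behind the expectations above
TP <- 10; TN <- 3; FN <- 5; FP <- 5
sensitivity <- TP / (TP + FN)                  # 10/15
specificity <- TN / (TN + FP)                  # 3/8
ppv         <- TP / (TP + FP)                  # 10/15
npv         <- TN / (TN + FN)                  # 3/8
f1          <- 2 * TP / (2 * TP + FP + FN)     # 20/30 ~ 0.6667, matching f1Score
accuracy    <- (TP + TN) / (TP + TN + FP + FN) # 13/23
c(sensitivity = sensitivity, specificity = specificity,
  ppv = ppv, npv = npv, f1 = f1, accuracy = accuracy)
```
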
diff --git a/tests/testthat/test-UploadToDatabase.R b/tests/testthat/test-UploadToDatabase.R
index 908ab44d3..3f4f03990 100644
--- a/tests/testthat/test-UploadToDatabase.R
+++ b/tests/testthat/test-UploadToDatabase.R
@@ -1,5 +1,3 @@
-# Copyright 2021 Observational Health Data Sciences and Informatics
-#
# This file is part of PatientLevelPrediction
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -18,441 +16,436 @@ library("testthat")
context("UploadToDatabase")
-# only run this during CI
-if (Sys.getenv('CI') == 'true') {
-cdmDatabaseSchema <- Sys.getenv("CDM5_POSTGRESQL_CDM_SCHEMA")
-ohdsiDatabaseSchema <- Sys.getenv("CDM5_POSTGRESQL_OHDSI_SCHEMA")
-connectionRedshift <- DatabaseConnector::createConnectionDetails(
- dbms = "postgresql",
- user = Sys.getenv("CDM5_POSTGRESQL_USER"),
- password = URLdecode(Sys.getenv("CDM5_POSTGRESQL_PASSWORD")),
- server = Sys.getenv("CDM5_POSTGRESQL_SERVER"),
+# only run this during CI in main repo
+if (Sys.getenv("CI") == "true" &&
+ Sys.getenv("GITHUB_REPOSITORY") == "OHDSI/PatientLevelPrediction") {
+ cdmDatabaseSchema <- Sys.getenv("CDM5_POSTGRESQL_CDM_SCHEMA")
+ ohdsiDatabaseSchema <- Sys.getenv("CDM5_POSTGRESQL_OHDSI_SCHEMA")
+ connectionRedshift <- DatabaseConnector::createConnectionDetails(
+ dbms = "postgresql",
+ user = Sys.getenv("CDM5_POSTGRESQL_USER"),
+ password = URLdecode(Sys.getenv("CDM5_POSTGRESQL_PASSWORD")),
+    server = Sys.getenv("CDM5_POSTGRESQL_SERVER")
)
-conn <- DatabaseConnector::connect(connectionRedshift)
-targetDialect <- 'postgresql'
-
-set.seed(NULL)
-randVar <- rawToChar(as.raw(sample(c(65:90,97:122), 5, replace=T)))
+ conn <- DatabaseConnector::connect(connectionRedshift)
+ targetDialect <- "postgresql"
-appendRandom <- function(x, rand = randVar){
- return(paste("plp", rand, x, sep=''))
-}
+ set.seed(NULL)
+ randVar <- rawToChar(as.raw(sample(c(65:90, 97:122), 5, replace = TRUE)))
+ appendRandom <- function(x, rand = randVar) {
+ return(paste("plp", rand, x, sep = ""))
+ }
}
test_that("test createDatabaseSchemaSettings works", {
- skip_if(Sys.getenv('CI') != 'true', 'not run locally')
+ skip_if(Sys.getenv("CI") != "true", "not run on CI")
+ skip_if(Sys.getenv("GITHUB_REPOSITORY") != "OHDSI/PatientLevelPrediction", "not run in fork")
databaseSchemaSettings <- createDatabaseSchemaSettings(
- resultSchema = ohdsiDatabaseSchema,
- tablePrefix = '',
+ resultSchema = ohdsiDatabaseSchema,
+ tablePrefix = "",
targetDialect = targetDialect
)
-
+
# check inputs as expected
testthat::expect_true(databaseSchemaSettings$resultSchema == ohdsiDatabaseSchema)
- testthat::expect_true(databaseSchemaSettings$tablePrefix == '')
+ testthat::expect_true(databaseSchemaSettings$tablePrefix == "")
testthat::expect_true(databaseSchemaSettings$targetDialect == targetDialect)
testthat::expect_true(databaseSchemaSettings$cohortDefinitionSchema == ohdsiDatabaseSchema)
testthat::expect_true(databaseSchemaSettings$databaseDefinitionSchema == ohdsiDatabaseSchema)
- testthat::expect_true(databaseSchemaSettings$tablePrefixCohortDefinitionTables == '')
- testthat::expect_true(databaseSchemaSettings$tablePrefixDatabaseDefinitionTables == '')
-
+ testthat::expect_true(databaseSchemaSettings$tablePrefixCohortDefinitionTables == "")
+ testthat::expect_true(databaseSchemaSettings$tablePrefixDatabaseDefinitionTables == "")
+
databaseSchemaSettings <- createDatabaseSchemaSettings(
- resultSchema = ohdsiDatabaseSchema,
- tablePrefix = '',
+ resultSchema = ohdsiDatabaseSchema,
+ tablePrefix = "",
targetDialect = targetDialect,
- cohortDefinitionSchema = 'test 123',
- tablePrefixCohortDefinitionTables = 'a',
- databaseDefinitionSchema = 'test234',
- tablePrefixDatabaseDefinitionTables = 'b'
+ cohortDefinitionSchema = "test 123",
+ tablePrefixCohortDefinitionTables = "a",
+ databaseDefinitionSchema = "test234",
+ tablePrefixDatabaseDefinitionTables = "b"
)
-
- testthat::expect_true(databaseSchemaSettings$cohortDefinitionSchema == 'test 123')
- testthat::expect_true(databaseSchemaSettings$databaseDefinitionSchema == 'test234')
- testthat::expect_true(databaseSchemaSettings$tablePrefixCohortDefinitionTables == 'A_')
- testthat::expect_true(databaseSchemaSettings$tablePrefixDatabaseDefinitionTables == 'B_')
-
-
- testthat::expect_true(class(databaseSchemaSettings) == 'plpDatabaseResultSchema')
-
-}
-)
+
+ testthat::expect_true(databaseSchemaSettings$cohortDefinitionSchema == "test 123")
+ testthat::expect_true(databaseSchemaSettings$databaseDefinitionSchema == "test234")
+ testthat::expect_true(databaseSchemaSettings$tablePrefixCohortDefinitionTables == "A_")
+ testthat::expect_true(databaseSchemaSettings$tablePrefixDatabaseDefinitionTables == "B_")
+
+
+ testthat::expect_true(class(databaseSchemaSettings) == "plpDatabaseResultSchema")
+})
test_that("test createDatabaseDetails works", {
-
databaseList <- createDatabaseList(
- cdmDatabaseSchemas = paste0('database', 1:5)
+ cdmDatabaseSchemas = paste0("database", 1:5)
)
-
- testthat::expect_true(length(databaseList) == length(paste0('database', 1:5)))
- testthat::expect_true(class(databaseList) == 'list')
+
+ testthat::expect_true(length(databaseList) == length(paste0("database", 1:5)))
+ testthat::expect_true(class(databaseList) == "list")
testthat::expect_true(!is.null(databaseList$database1$databaseDetails))
testthat::expect_true(!is.null(databaseList$database1$databaseMetaData))
-
+
testthat::expect_equal(
databaseList$database1$databaseDetails$databaseMetaDataId,
databaseList$database1$databaseMetaData$databaseId
)
-
-}
-)
+})
test_that("database creation", {
- skip_if(Sys.getenv('CI') != 'true', 'not run locally')
+ skip_if(Sys.getenv("CI") != "true", "not run on CI")
+ skip_if(Sys.getenv("GITHUB_REPOSITORY") != "OHDSI/PatientLevelPrediction", "not run in fork")
createPlpResultTables(
- connectionDetails = connectionRedshift,
- resultSchema = ohdsiDatabaseSchema,
+ connectionDetails = connectionRedshift,
+ resultSchema = ohdsiDatabaseSchema,
targetDialect = targetDialect,
- deleteTables = T,
- createTables = T,
- tablePrefix = appendRandom('test')
+ deleteTables = TRUE,
+ createTables = TRUE,
+ tablePrefix = appendRandom("test")
)
-
+
# check the results table is created
testthat::expect_true(DatabaseConnector::existsTable(
- connection = conn,
+ connection = conn,
databaseSchema = ohdsiDatabaseSchema,
- tableName = paste0(appendRandom('test'),'_PERFORMANCES')
+ tableName = paste0(appendRandom("test"), "_PERFORMANCES")
))
-
})
test_that("results uploaded to database", {
- skip_if(Sys.getenv('CI') != 'true', 'not run locally')
- resultsLoc <- file.path(saveLoc,'dbUp')
-
- plpResult$model$trainDetails$developmentDatabase <- 'test'
- savePlpResult(plpResult, file.path(resultsLoc, 'Analysis_1','plpResult'))
+ skip_if(Sys.getenv("CI") != "true", "not run on CI")
+ skip_if(Sys.getenv("GITHUB_REPOSITORY") != "OHDSI/PatientLevelPrediction", "not run in fork")
+ resultsLoc <- file.path(saveLoc, "dbUp")
+
+ plpResult$model$trainDetails$developmentDatabase <- "test"
+ savePlpResult(plpResult, file.path(resultsLoc, "Analysis_1", "plpResult"))
# save validation
- if(!dir.exists(file.path(resultsLoc,'Validation','test', 'Analysis_1'))){
- dir.create(file.path(resultsLoc,'Validation','test', 'Analysis_1'), recursive = T)
+ if (!dir.exists(file.path(resultsLoc, "Validation", "test", "Analysis_1"))) {
+ dir.create(file.path(resultsLoc, "Validation", "test", "Analysis_1"), recursive = TRUE)
}
plpResult$model$validationDetails <- list(
- targetId = 1,
+ targetId = 1,
outcomeId = outcomeId,
- developmentDatabase = 'test',
- validationDatabase = 'test',
- populationSettings = plpResult$model$modelDesign$populationSettings,
+ developmentDatabase = "test",
+ validationDatabase = "test",
+ populationSettings = plpResult$model$modelDesign$populationSettings,
restrictPlpDataSettings = plpResult$model$modelDesign$restrictPlpDataSettings
- )
- savePlpResult(plpResult, file.path(resultsLoc,'Validation','test', 'Analysis_1', 'validationResult'))
-
+ )
+ savePlpResult(plpResult, file.path(resultsLoc, "Validation", "test", "Analysis_1", "validationResult"))
+
# add results:
addMultipleRunPlpToDatabase(
- connectionDetails = connectionRedshift,
+ connectionDetails = connectionRedshift,
databaseSchemaSettings = createDatabaseSchemaSettings(
- resultSchema = ohdsiDatabaseSchema,
- tablePrefix = appendRandom('test'),
+ resultSchema = ohdsiDatabaseSchema,
+ tablePrefix = appendRandom("test"),
targetDialect = targetDialect
- ),
+ ),
cohortDefinitions = data.frame(
- cohortName = c('blank1','blank2','blank3'),
- cohortId = c(1,2,3),
- json = rep('bla',3)
- ),
+ cohortName = c("blank1", "blank2", "blank3"),
+ cohortId = c(1, 2, 3),
+ json = rep("bla", 3)
+ ),
databaseList = createDatabaseList(
- cdmDatabaseSchemas = c('test')
+ cdmDatabaseSchemas = c("test")
),
resultLocation = resultsLoc,
- modelSaveLocation = file.path(saveLoc,'modelLocation') # new
+ modelSaveLocation = file.path(saveLoc, "modelLocation") # new
)
-
+
# check the results table is populated
- sql <- 'select count(*) as N from @resultSchema.@appendperformances;'
- sql <- SqlRender::render(sql, resultSchema = ohdsiDatabaseSchema, append = appendRandom('test_'))
+ sql <- "select count(*) as N from @resultSchema.@appendperformances;"
+ sql <- SqlRender::render(sql, resultSchema = ohdsiDatabaseSchema, append = appendRandom("test_"))
res <- DatabaseConnector::querySql(conn, sql)
- testthat::expect_true(res$N[1]>0)
-
- # add test: check model location has result?
+ testthat::expect_true(res$N[1] > 0)
+ # add test: check model location has result?
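+ # a minimal sketch of that check, assuming models are written as files
+ # under the modelSaveLocation passed to addMultipleRunPlpToDatabase above
+ testthat::expect_true(dir.exists(file.path(saveLoc, "modelLocation")))
+ testthat::expect_true(length(dir(file.path(saveLoc, "modelLocation"))) > 0)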
})
test_that("database deletion", {
- skip_if(Sys.getenv('CI') != 'true', 'not run locally')
+ skip_if(Sys.getenv("CI") != "true", "not run on CI")
+ skip_if(Sys.getenv("GITHUB_REPOSITORY") != "OHDSI/PatientLevelPrediction", "not run in fork")
createPlpResultTables(
- connectionDetails = connectionRedshift,
- resultSchema = ohdsiDatabaseSchema,
+ connectionDetails = connectionRedshift,
+ resultSchema = ohdsiDatabaseSchema,
targetDialect = targetDialect,
- deleteTables = T,
- createTables = F,
- tablePrefix = appendRandom('test')
+ deleteTables = TRUE,
+ createTables = FALSE,
+ tablePrefix = appendRandom("test")
)
-
+
# check the results table is then deleted
testthat::expect_false(DatabaseConnector::existsTable(
- connection = conn,
+ connection = conn,
databaseSchema = ohdsiDatabaseSchema,
- tableName = paste0(appendRandom('test'),'_PERFORMANCES')
+ tableName = paste0(appendRandom("test"), "_PERFORMANCES")
))
-
})
# disconnect
-if (Sys.getenv('CI') == 'true') {
+if (Sys.getenv("CI") == "true" && Sys.getenv("GITHUB_REPOSITORY") == "OHDSI/PatientLevelPrediction") {
DatabaseConnector::disconnect(conn)
}
# code to test sqlite creation, result and diagnostic upload all in one
test_that("temporary sqlite with results works", {
-
- resultsLoc <- file.path(saveLoc,'sqliteTest')
-
- savePlpResult(plpResult, file.path(resultsLoc, 'Analysis_1','plpResult'))
+ skip_if_not_installed("ResultModelManager")
+ skip_on_cran()
+ resultsLoc <- file.path(saveLoc, "sqliteTest")
+
+ savePlpResult(plpResult, file.path(resultsLoc, "Analysis_1", "plpResult"))
# save diagnostic
- saveRDS(diagnoseResult, file.path(resultsLoc,'Analysis_1','diagnosePlp.rds'))
-
+ saveRDS(diagnoseResult, file.path(resultsLoc, "Analysis_1", "diagnosePlp.rds"))
+
sqliteLocation <- insertResultsToSqlite(
- resultLocation = resultsLoc,
+ resultLocation = resultsLoc,
cohortDefinitions = data.frame(
- cohortName = c('blank1','blank2','blank3'),
- cohortId = c(1,2,3),
- json = rep('bla',3)
+ cohortName = c("blank1", "blank2", "blank3"),
+ cohortId = c(1, 2, 3),
+ json = rep("bla", 3)
),
databaseList = createDatabaseList(
- cdmDatabaseSchemas = c('test')
+ cdmDatabaseSchemas = c("test")
),
- sqliteLocation = file.path(resultsLoc, 'sqlite')
+ sqliteLocation = file.path(resultsLoc, "sqlite")
)
-
+
# expect the database to exist
testthat::expect_true(file.exists(sqliteLocation))
-
- cdmDatabaseSchema <- 'main'
- ohdsiDatabaseSchema <- 'main'
+
+ cdmDatabaseSchema <- "main"
+ ohdsiDatabaseSchema <- "main"
connectionDetails <- DatabaseConnector::createConnectionDetails(
- dbms = 'sqlite',
+ dbms = "sqlite",
server = sqliteLocation
)
conn <- DatabaseConnector::connect(connectionDetails = connectionDetails)
- targetDialect <- 'sqlite'
-
+ targetDialect <- "sqlite"
+
# check the results table is populated
- sql <- 'select count(*) as N from main.performances;'
+ sql <- "select count(*) as N from main.performances;"
res <- DatabaseConnector::querySql(conn, sql)
- testthat::expect_true(res$N[1]>0)
-
+ testthat::expect_true(res$N[1] > 0)
+
# check the diagnostic table is populated
- sql <- 'select count(*) as N from main.diagnostics;'
+ sql <- "select count(*) as N from main.diagnostics;"
res <- DatabaseConnector::querySql(conn, sql)
- testthat::expect_true(res$N[1]>0)
-
+ testthat::expect_true(res$N[1] > 0)
+
# disconnect
DatabaseConnector::disconnect(conn)
-
})
# SQLite test
test_that("temporary sqlite with results works", {
-
+ skip_if_not_installed("ResultModelManager")
+ skip_on_cran()
externalVal <- plpResult
- externalVal$model$model <- 'none'
+ externalVal$model$model <- "none"
externalVal$model$trainDetails <- NULL
externalVal$model$validationDetails <- list(
- targetId = 1,
+ targetId = 1,
outcomeId = 3,
- developmentDatabase = 'test',
- validationDatabase = 'test',
- populationSettings = plpResult$model$modelDesign$populationSettings,
+ developmentDatabase = "test",
+ validationDatabase = "test",
+ populationSettings = plpResult$model$modelDesign$populationSettings,
restrictPlpDataSettings = plpResult$model$modelDesign$restrictPlpDataSettings
)
-
-sqliteLocation <- insertRunPlpToSqlite(
- runPlp = plpResult,
- externalValidatePlp = NULL
+
+ sqliteLocation <- insertRunPlpToSqlite(
+ runPlp = plpResult,
+ externalValidatePlp = NULL
)
-# expect the database to exist
-testthat::expect_true(file.exists(sqliteLocation))
-
-cdmDatabaseSchema <- 'main'
-ohdsiDatabaseSchema <- 'main'
-connectionDetails <- DatabaseConnector::createConnectionDetails(
- dbms = 'sqlite',
- server = sqliteLocation
-)
-conn <- DatabaseConnector::connect(connectionDetails = connectionDetails)
-targetDialect <- 'sqlite'
-
-# check the results table is populated
-sql <- 'select count(*) as N from main.performances;'
-res <- DatabaseConnector::querySql(conn, sql)
-testthat::expect_true(res$N[1]>0)
-
-
-# check export to csv
-extractDatabaseToCsv(
- connectionDetails = connectionDetails,
- databaseSchemaSettings = createDatabaseSchemaSettings(resultSchema = 'main'),
- csvFolder = file.path(saveLoc, 'csvFolder')
-)
-
-testthat::expect_true(dir.exists(file.path(saveLoc, 'csvFolder')))
-testthat::expect_true(length(dir(file.path(saveLoc, 'csvFolder'))) > 0 )
-testthat::expect_true(dir.exists(file.path(saveLoc, 'csvFolder', 'models'))) # new
-testthat::expect_true(length(dir(file.path(saveLoc, 'csvFolder', 'models'))) > 0 ) # new
-# disconnect
-DatabaseConnector::disconnect(conn)
+ # expect the database to exist
+ testthat::expect_true(file.exists(sqliteLocation))
+ cdmDatabaseSchema <- "main"
+ ohdsiDatabaseSchema <- "main"
+ connectionDetails <- DatabaseConnector::createConnectionDetails(
+ dbms = "sqlite",
+ server = sqliteLocation
+ )
+ conn <- DatabaseConnector::connect(connectionDetails = connectionDetails)
+ targetDialect <- "sqlite"
+ # check the results table is populated
+ sql <- "select count(*) as N from main.performances;"
+ res <- DatabaseConnector::querySql(conn, sql)
+ testthat::expect_true(res$N[1] > 0)
+
+
+ # check export to csv
+ extractDatabaseToCsv(
+ connectionDetails = connectionDetails,
+ databaseSchemaSettings = createDatabaseSchemaSettings(resultSchema = "main"),
+ csvFolder = file.path(saveLoc, "csvFolder")
+ )
+
+ testthat::expect_true(dir.exists(file.path(saveLoc, "csvFolder")))
+ testthat::expect_true(length(dir(file.path(saveLoc, "csvFolder"))) > 0)
+ testthat::expect_true(dir.exists(file.path(saveLoc, "csvFolder", "models"))) # new
+ testthat::expect_true(length(dir(file.path(saveLoc, "csvFolder", "models"))) > 0) # new
+ # disconnect
+ DatabaseConnector::disconnect(conn)
})
# importFromCsv test here as can use previous csv saving
test_that("import from csv", {
-
+ # TODO remove dependency on previous test
+ skip_if_not_installed("ResultModelManager")
+ skip_on_cran()
cohortDef <- extractCohortDefinitionsCSV(
- csvFolder = file.path(saveLoc, 'csvFolder')
+ csvFolder = file.path(saveLoc, "csvFolder")
)
- testthat::expect_true(inherits(cohortDef, 'data.frame'))
+ testthat::expect_true(inherits(cohortDef, "data.frame"))
testthat::expect_true(ncol(cohortDef) == 4)
-
+
databaseList <- extractDatabaseListCSV(
- csvFolder = file.path(saveLoc, 'csvFolder')
+ csvFolder = file.path(saveLoc, "csvFolder")
)
- testthat::expect_true(inherits(databaseList, 'list'))
+ testthat::expect_true(inherits(databaseList, "list"))
testthat::expect_true(!is.null(databaseList[[1]]$databaseDetails))
testthat::expect_true(!is.null(databaseList[[1]]$databaseMetaData))
-
+
# model designs work
modeldesignsRow <- data.frame(
- target_id = 1, outcome_id = 2, population_setting_id = 1,
- plp_data_setting_id = 1, model_setting_id = 1,
+ target_id = 1, outcome_id = 2, population_setting_id = 1,
+ plp_data_setting_id = 1, model_setting_id = 1,
covariate_setting_id = 1, sample_setting_id = 1,
- split_setting_id = 1, feature_engineering_setting_id =1 ,
+ split_setting_id = 1, feature_engineering_setting_id = 1,
tidy_covariates_setting_id = 1
- )
+ )
res <- getModelDesignSettingTable(modeldesignsRow)
# expect res to be a data.frame, check values?
- testthat::expect_true(inherits(res, 'data.frame'))
-
+ testthat::expect_true(inherits(res, "data.frame"))
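+ # per the note above, a lightweight value check, assuming the setting
+ # table keeps at least one row for the model design row supplied
+ testthat::expect_true(nrow(res) >= 1)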
+
modelDesign <- getModelDesignCsv(
- modelDesignSettingTable = res,
- csvFolder = file.path(saveLoc, 'csvFolder')
- )
- testthat::expect_true(inherits(modelDesign, 'modelDesign'))
-
+ modelDesignSettingTable = res,
+ csvFolder = file.path(saveLoc, "csvFolder")
+ )
+ testthat::expect_true(inherits(modelDesign, "modelDesign"))
+
# performance works
res <- getPerformanceEvaluationCsv(
- performanceId = 1,
- csvFolder = file.path(saveLoc, 'csvFolder')
+ performanceId = 1,
+ csvFolder = file.path(saveLoc, "csvFolder")
)
- testthat::expect_true(inherits(res, 'list'))
+ testthat::expect_true(inherits(res, "list"))
testthat::expect_true(
- sum(names(res) %in%
- c('evaluationStatistics', 'thresholdSummary',
- 'calibrationSummary', 'demographicSummary',
- 'predictionDistribution'
- )
- ) == 5
+ sum(names(res) %in%
+ c(
+ "evaluationStatistics", "thresholdSummary",
+ "calibrationSummary", "demographicSummary",
+ "predictionDistribution"
+ )) == 5
)
-
-
- # test object extracts
+
+
+ # test object extracts
obj <- extractObjectFromCsv(
- performanceId = 1,
- csvFolder = file.path(saveLoc, 'csvFolder')
+ performanceId = 1,
+ csvFolder = file.path(saveLoc, "csvFolder")
)
- testthat::expect_true(inherits(obj, 'externalValidatePlp') | inherits(obj, 'runPlp'))
-
+ testthat::expect_true(inherits(obj, "externalValidatePlp") | inherits(obj, "runPlp"))
+
# test diagnostic extracted
diag <- extractDiagnosticFromCsv(
- diagnosticId = 1,
- csvFolder = file.path(saveLoc, 'csvFolder')
+ diagnosticId = 1,
+ csvFolder = file.path(saveLoc, "csvFolder")
)
- testthat::expect_true(inherits(diag, 'diagnosePlp') | is.null(diag))
-
-
-
+ testthat::expect_true(inherits(diag, "diagnosePlp") | is.null(diag))
+
+
+
# Testing everything together
- csvServerLoc <- file.path(tempdir(), 'newCsvDatabase')
- if(!dir.exists(file.path(tempdir(), 'newCsvDatabase'))){
- dir.create(file.path(tempdir(), 'newCsvDatabase'), recursive = T)
+ csvServerLoc <- file.path(tempdir(), "newCsvDatabase")
+ if (!dir.exists(file.path(tempdir(), "newCsvDatabase"))) {
+ dir.create(file.path(tempdir(), "newCsvDatabase"), recursive = TRUE)
}
newResultConnDetails <- DatabaseConnector::createConnectionDetails(
- dbms = 'sqlite',
- server = file.path(csvServerLoc,'newCsv.sqlite')
- )
+ dbms = "sqlite",
+ server = file.path(csvServerLoc, "newCsv.sqlite")
+ )
newResultConn <- DatabaseConnector::connect(newResultConnDetails)
- csvDatabaseSchemaSettings <- PatientLevelPrediction::createDatabaseSchemaSettings(
- resultSchema = 'main',
- tablePrefix = '',
- targetDialect = 'sqlite',
+ csvDatabaseSchemaSettings <- PatientLevelPrediction::createDatabaseSchemaSettings(
+ resultSchema = "main",
+ tablePrefix = "",
+ targetDialect = "sqlite",
tempEmulationSchema = NULL
- )
-
+ )
+
# create empty tables to insert csv into
PatientLevelPrediction::createPlpResultTables(
- connectionDetails = newResultConnDetails,
- targetDialect = 'sqlite',
- resultSchema = 'main',
- createTables = T,
- deleteTables = T,
- tablePrefix = '',
+ connectionDetails = newResultConnDetails,
+ targetDialect = "sqlite",
+ resultSchema = "main",
+ createTables = TRUE,
+ deleteTables = TRUE,
+ tablePrefix = "",
tempEmulationSchema = NULL
- )
-
+ )
+
res <- insertCsvToDatabase(
- csvFolder = file.path(saveLoc, 'csvFolder'),
+ csvFolder = file.path(saveLoc, "csvFolder"),
connectionDetails = newResultConnDetails,
databaseSchemaSettings = csvDatabaseSchemaSettings,
- modelSaveLocation = file.path(csvServerLoc,'models'),
- csvTableAppend = ''
+ modelSaveLocation = file.path(csvServerLoc, "models"),
+ csvTableAppend = ""
)
testthat::expect_true(res)
-
+
# check some of the tables
-
-
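+ # a minimal sketch for the check above, assuming the result schema created
+ # earlier includes a model_designs table populated by the csv insert
+ sql <- "select count(*) as N from main.model_designs;"
+ countRes <- DatabaseConnector::querySql(newResultConn, sql)
+ testthat::expect_true(countRes$N[1] > 0)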
})
# new - check null model just reports message
test_that("message if model is null", {
-
- model2 <- list(noModel = T)
- attr(model2, "predictionFunction") <- 'noModel'
- attr(model2, "saveType") <- 'RtoJson'
- class(model2) <- 'plpModel'
-
+ skip_if_not_installed("ResultModelManager")
+ skip_on_cran()
+ model2 <- list(noModel = TRUE)
+ attr(model2, "predictionFunction") <- "noModel"
+ attr(model2, "saveType") <- "RtoJson"
+ class(model2) <- "plpModel"
+
plpResult2 <- plpResult
plpResult2$model <- model2
-
- savePlpResult(plpResult2, file.path(tempdir(), 'null_model', 'Analysis_1', 'plpResult'))
-
- nullModelServerLoc <- file.path(tempdir(), 'nullModelDatabase')
- if(!dir.exists(file.path(tempdir(), 'nullModelDatabase'))){
- dir.create(file.path(tempdir(), 'nullModelDatabase'), recursive = T)
+
+ savePlpResult(plpResult2, file.path(tempdir(), "null_model", "Analysis_1", "plpResult"))
+
+ nullModelServerLoc <- file.path(tempdir(), "nullModelDatabase")
+ if (!dir.exists(file.path(tempdir(), "nullModelDatabase"))) {
+ dir.create(file.path(tempdir(), "nullModelDatabase"), recursive = TRUE)
}
nullModelResultConnDetails <- DatabaseConnector::createConnectionDetails(
- dbms = 'sqlite',
- server = file.path(nullModelServerLoc,'sqlite.sqlite')
+ dbms = "sqlite",
+ server = file.path(nullModelServerLoc, "sqlite.sqlite")
)
- nullModelDatabaseSchemaSettings <- createDatabaseSchemaSettings(
- resultSchema = 'main',
- tablePrefix = '',
- targetDialect = 'sqlite',
+ nullModelDatabaseSchema <- createDatabaseSchemaSettings(
+ resultSchema = "main",
+ tablePrefix = "",
+ targetDialect = "sqlite",
tempEmulationSchema = NULL
)
-
+
createPlpResultTables(
connectionDetails = nullModelResultConnDetails,
- targetDialect = 'sqlite',
- resultSchema = 'main',
- deleteTables = T,
- createTables = T,
- tablePrefix = ''
+ targetDialect = "sqlite",
+ resultSchema = "main",
+ deleteTables = TRUE,
+ createTables = TRUE,
+ tablePrefix = ""
)
testthat::expect_message(
addMultipleRunPlpToDatabase(
- connectionDetails = nullModelResultConnDetails,
- databaseSchemaSettings = nullModelDatabaseSchemaSettings,
- resultLocation = file.path(tempdir(), 'null_model'),
- modelSaveLocation = file.path(tempdir(), 'null_model', 'models')
+ connectionDetails = nullModelResultConnDetails,
+ databaseSchemaSettings = nullModelDatabaseSchema,
+ resultLocation = file.path(tempdir(), "null_model"),
+ modelSaveLocation = file.path(tempdir(), "null_model", "models")
)
)
-
})
-
-
diff --git a/tests/testthat/test-andromedahelperfunctions.R b/tests/testthat/test-andromedahelperfunctions.R
index d63bb397d..f48997fc8 100644
--- a/tests/testthat/test-andromedahelperfunctions.R
+++ b/tests/testthat/test-andromedahelperfunctions.R
@@ -17,18 +17,16 @@
library("testthat")
context("AndromedaHelperFunctions")
-# add limitCovariatesToPopulation(covariateData, rowIds) test
+# add limitCovariatesToPopulation(covariateData, rowIds) test
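+# a minimal sketch for the TODO above, assuming the internal helper returns
+# a covariateData restricted to the supplied rowIds
+test_that("limitCovariatesToPopulation restricts to rowIds", {
+ covariateData <- PatientLevelPrediction:::limitCovariatesToPopulation(
+ plpData$covariateData,
+ population$rowId
+ )
+ remaining <- covariateData$covariates %>% dplyr::collect()
+ testthat::expect_true(all(remaining$rowId %in% population$rowId))
+})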
-# batcheRestrict test
+# batchRestrict test
test_that("batchRestrict", {
-
- metaData <- attr(plpData$covariateData, 'metaData')
+ metaData <- attr(plpData$covariateData, "metaData")
covariateData <- PatientLevelPrediction:::batchRestrict(plpData$covariateData, population, sizeN = 1000000)
- testthat::expect_is(covariateData, 'CovariateData')
-
- expect_equal(names(metaData), names(attr(covariateData, 'metaData')))
-
+ testthat::expect_is(covariateData, "CovariateData")
+
+ expect_equal(names(metaData), names(attr(covariateData, "metaData")))
})
diff --git a/tests/testthat/test-covariateExtras.R b/tests/testthat/test-covariateExtras.R
index 17c9e04ec..d3c30a067 100644
--- a/tests/testthat/test-covariateExtras.R
+++ b/tests/testthat/test-covariateExtras.R
@@ -19,105 +19,98 @@ library("testthat")
context("CovariateExtras")
test_that("settings creation", {
-
-covSet <- createCohortCovariateSettings(
- cohortName = 'covariateName',
- settingId = 4,
- cohortDatabaseSchema = 'main',
- cohortTable = 'cohort',
- cohortId = 2,
- startDay = -350,
- endDay = -2,
- count = F,
- ageInteraction = F,
- lnAgeInteraction = F,
- analysisId = 456
-)
-
-expect_equal(class(covSet), "covariateSettings")
+ covSet <- createCohortCovariateSettings(
+ cohortName = "covariateName",
+ settingId = 4,
+ cohortDatabaseSchema = "main",
+ cohortTable = "cohort",
+ cohortId = 2,
+ startDay = -350,
+ endDay = -2,
+ count = FALSE,
+ ageInteraction = FALSE,
+ lnAgeInteraction = FALSE,
+ analysisId = 456
+ )
-expect_equal(attr(covSet, 'fun'), "PatientLevelPrediction::getCohortCovariateData")
+ expect_equal(class(covSet), "covariateSettings")
+ expect_equal(attr(covSet, "fun"), "PatientLevelPrediction::getCohortCovariateData")
})
test_that("settings creation errors", {
-
-expect_error(createCohortCovariateSettings(
- cohortName = 'covariateName',
- settingId = -1,
- cohortDatabaseSchema = 'test_cdm',
- cohortTable = 'table',
- cohortId = 2,
- startDay = -350,
- endDay = -2,
- count = F,
- ageInteraction = F,
- lnAgeInteraction = F,
- analysisId = 456
-))
-
-expect_error(createCohortCovariateSettings(
- cohortName = 'covariateName',
- settingId = 101,
- cohortDatabaseSchema = 'test_cdm',
- cohortTable = 'table',
- cohortId = 2,
- startDay = -350,
- endDay = -2,
- count = F,
- ageInteraction = F,
- lnAgeInteraction = F,
- analysisId = 456
-))
+ expect_error(createCohortCovariateSettings(
+ cohortName = "covariateName",
+ settingId = -1,
+ cohortDatabaseSchema = "test_cdm",
+ cohortTable = "table",
+ cohortId = 2,
+ startDay = -350,
+ endDay = -2,
+ count = FALSE,
+ ageInteraction = FALSE,
+ lnAgeInteraction = FALSE,
+ analysisId = 456
+ ))
-expect_error(createCohortCovariateSettings(
- cohortName = 'covariateName',
- settingId = '101',
- cohortDatabaseSchema = 'test_cdm',
- cohortTable = 'table',
- cohortId = 2,
- startDay = -350,
- endDay = -2,
- count = F,
- ageInteraction = F,
- lnAgeInteraction = F,
- analysisId = 456
-))
+ expect_error(createCohortCovariateSettings(
+ cohortName = "covariateName",
+ settingId = 101,
+ cohortDatabaseSchema = "test_cdm",
+ cohortTable = "table",
+ cohortId = 2,
+ startDay = -350,
+ endDay = -2,
+ count = FALSE,
+ ageInteraction = FALSE,
+ lnAgeInteraction = FALSE,
+ analysisId = 456
+ ))
+ expect_error(createCohortCovariateSettings(
+ cohortName = "covariateName",
+ settingId = "101",
+ cohortDatabaseSchema = "test_cdm",
+ cohortTable = "table",
+ cohortId = 2,
+ startDay = -350,
+ endDay = -2,
+ count = FALSE,
+ ageInteraction = FALSE,
+ lnAgeInteraction = FALSE,
+ analysisId = 456
+ ))
})
test_that("extraction works", {
-
covSet <- createCohortCovariateSettings(
- cohortName = 'covariateName',
+ cohortName = "covariateName",
settingId = 4,
- cohortDatabaseSchema = 'main',
- cohortTable = 'cohort',
+ cohortDatabaseSchema = "main",
+ cohortTable = "cohort",
cohortId = 2,
- startDay = -350,
- endDay = -2,
- count = F,
- ageInteraction = F,
- lnAgeInteraction = F,
+ startDay = -350,
+ endDay = -2,
+ count = FALSE,
+ ageInteraction = FALSE,
+ lnAgeInteraction = FALSE,
analysisId = 456
)
-
-covs <- FeatureExtraction::getDbCovariateData(
- connectionDetails = connectionDetails,
- cdmDatabaseSchema = "main",
- cdmVersion = 5,
- cohortTable = "cohort",
- cohortDatabaseSchema = "main",
- cohortTableIsTemp = F,
- cohortIds = c(1),
- rowIdField = 'rowId',
- covariateSettings = covSet,
- aggregated = F
- )
-expect_equal(1, covs$covariateRef %>% dplyr::tally() %>% dplyr::pull())
-expect_equal(as.double(covs$covariateRef %>% dplyr::select("covariateId") %>% dplyr::collect()), covSet$covariateId)
-expect_true(covs$covariates %>% dplyr::tally() %>% dplyr::pull() > 0)
+ covs <- FeatureExtraction::getDbCovariateData(
+ connectionDetails = connectionDetails,
+ cdmDatabaseSchema = "main",
+ cdmVersion = 5,
+ cohortTable = "cohort",
+ cohortDatabaseSchema = "main",
+ cohortTableIsTemp = FALSE,
+ cohortIds = c(1),
+ rowIdField = "rowId",
+ covariateSettings = covSet,
+ aggregated = FALSE
+ )
+ expect_equal(1, covs$covariateRef %>% dplyr::tally() %>% dplyr::pull())
+ expect_equal(as.double(covs$covariateRef %>% dplyr::select("covariateId") %>% dplyr::collect()), covSet$covariateId)
+ expect_true(covs$covariates %>% dplyr::tally() %>% dplyr::pull() > 0)
})
-
diff --git a/tests/testthat/test-covariateSummary.R b/tests/testthat/test-covariateSummary.R
index ce84af5d7..0e6598826 100644
--- a/tests/testthat/test-covariateSummary.R
+++ b/tests/testthat/test-covariateSummary.R
@@ -18,73 +18,69 @@ context("covariateSummary")
testCovariateData <- Andromeda::andromeda()
testCovariateData$covariates <- data.frame(
- rowId = c(1,2,5,7,10),
+ rowId = c(1, 2, 5, 7, 10),
covariateId = rep(1001, 5),
- covariateValue = rep(1,5)
+ covariateValue = rep(1, 5)
)
testCovariateData$covariateRef <- data.frame(
covariateId = 1001,
- covariateName = 'Made up covariate'
+ covariateName = "Made up covariate"
)
labels <- data.frame(
- rowId = 1:20,
- outcomeCount = c(rep(1,5), rep(0,15))
+ rowId = 1:20,
+ outcomeCount = c(rep(1, 5), rep(0, 15))
)
test_that("covariateSummary works with no strata or labels", {
-
-res <- covariateSummary(
- covariateData = testCovariateData,
- cohort = data.frame(rowId = 1:10),
- labels = NULL,
- strata = NULL,
- variableImportance = NULL,
- featureEngineering = NULL
-)
-
-expect_equal(res$CovariateCount, 5)
-expect_equal(res$CovariateMean, 0.5)
-expect_equal(res$CovariateStDev, 0.5)
+ res <- covariateSummary(
+ covariateData = testCovariateData,
+ cohort = data.frame(rowId = 1:10),
+ labels = NULL,
+ strata = NULL,
+ variableImportance = NULL,
+ featureEngineering = NULL
+ )
+ expect_equal(res$CovariateCount, 5)
+ expect_equal(res$CovariateMean, 0.5)
+ expect_equal(res$CovariateStDev, 0.5)
-res <- covariateSummary(
- covariateData = testCovariateData,
- cohort = data.frame(rowId = 1:20),
- labels = NULL,
- strata = NULL,
- variableImportance = NULL,
- featureEngineering = NULL
-)
-expect_equal(res$CovariateCount, 5)
-expect_equal(res$CovariateMean, 0.25)
-expect_equal(res$CovariateStDev, sqrt((5*(0.25-1)^2+15*(0.25-0)^2)/20))
+ res <- covariateSummary(
+ covariateData = testCovariateData,
+ cohort = data.frame(rowId = 1:20),
+ labels = NULL,
+ strata = NULL,
+ variableImportance = NULL,
+ featureEngineering = NULL
+ )
+ expect_equal(res$CovariateCount, 5)
+ expect_equal(res$CovariateMean, 0.25)
+ expect_equal(res$CovariateStDev, sqrt((5 * (0.25 - 1)^2 + 15 * (0.25 - 0)^2) / 20))
})
test_that("covariateSummary works with labels", {
-
res <- covariateSummary(
covariateData = testCovariateData,
cohort = data.frame(rowId = 1:20),
- labels = labels,
+ labels = labels,
strata = NULL,
variableImportance = NULL,
featureEngineering = NULL
)
-
+
expect_equal(res$WithNoOutcome_CovariateCount, 2)
expect_equal(res$WithOutcome_CovariateCount, 3)
-
- expect_equal(res$WithNoOutcome_CovariateMean, 2/15)
- expect_equal(res$WithOutcome_CovariateMean, 3/5)
-
- expect_equal(res$WithNoOutcome_CovariateStDev, sqrt((2*(2/15-1)^2+13*(2/15)^2)/15))
- expect_equal(res$WithOutcome_CovariateStDev, sqrt((3*(3/5-1)^2+2*(3/5)^2)/5))
-
- expect_equal(abs(res$StandardizedMeanDiff),abs((2/15-3/5)/sqrt(((2*(2/15-1)^2+13*(2/15)^2)/15+(3*(3/5-1)^2+2*(3/5)^2)/5)/2)))
-
+
+ expect_equal(res$WithNoOutcome_CovariateMean, 2 / 15)
+ expect_equal(res$WithOutcome_CovariateMean, 3 / 5)
+
+ expect_equal(res$WithNoOutcome_CovariateStDev, sqrt((2 * (2 / 15 - 1)^2 + 13 * (2 / 15)^2) / 15))
+ expect_equal(res$WithOutcome_CovariateStDev, sqrt((3 * (3 / 5 - 1)^2 + 2 * (3 / 5)^2) / 5))
+
+ expect_equal(abs(res$StandardizedMeanDiff), abs((2 / 15 - 3 / 5) / sqrt(((2 * (2 / 15 - 1)^2 + 13 * (2 / 15)^2) / 15 + (3 * (3 / 5 - 1)^2 + 2 * (3 / 5)^2) / 5) / 2)))
})
diff --git a/tests/testthat/test-cyclopsModels.R b/tests/testthat/test-cyclopsModels.R
index 0bbf6d779..e02df86a6 100644
--- a/tests/testthat/test-cyclopsModels.R
+++ b/tests/testthat/test-cyclopsModels.R
@@ -19,154 +19,148 @@ library("testthat")
context("CyclopsModels")
-#================ SETTING TESTINGS
+# ================ SETTINGS TESTING
test_that("set LR inputs", {
-
-#=====================================
-# checking Logistic Regression
-#=====================================
+ # =====================================
+ # checking Logistic Regression
+ # =====================================
-model_set <- setLassoLogisticRegression()
-testthat::expect_that(model_set, testthat::is_a("modelSettings"))
-
-expect_equal(model_set$fitFunction, 'fitCyclopsModel')
-expect_is(model_set$param, 'list')
-
-expect_equal(model_set$param$priorParams$priorType, "laplace")
+ model_set <- setLassoLogisticRegression()
+ testthat::expect_that(model_set, testthat::is_a("modelSettings"))
-expect_equal(attr(model_set$param, 'settings')$modelType, 'logistic')
-expect_equal(attr(model_set$param, 'settings')$priorfunction, 'Cyclops::createPrior')
-expect_equal(attr(model_set$param, 'settings')$addIntercept, T)
-expect_equal(attr(model_set$param, 'settings')$useControl, T)
-expect_equal(attr(model_set$param, 'settings')$name, "Lasso Logistic Regression")
-expect_equal(attr(model_set$param, 'settings')$cvRepetitions, 1)
+ expect_equal(model_set$fitFunction, "fitCyclopsModel")
+ expect_is(model_set$param, "list")
+ expect_equal(model_set$param$priorParams$priorType, "laplace")
-variance <- runif(1)
-model_set <- setLassoLogisticRegression(variance = variance)
-expect_equal(model_set$param$priorParams$variance, variance)
+ expect_equal(attr(model_set$param, "settings")$modelType, "logistic")
+ expect_equal(attr(model_set$param, "settings")$priorfunction, "Cyclops::createPrior")
+ expect_equal(attr(model_set$param, "settings")$addIntercept, TRUE)
+ expect_equal(attr(model_set$param, "settings")$useControl, TRUE)
+ expect_equal(attr(model_set$param, "settings")$name, "Lasso Logistic Regression")
+ expect_equal(attr(model_set$param, "settings")$cvRepetitions, 1)
-seed <- sample(10,1)
-model_set <- setLassoLogisticRegression(seed = seed)
-expect_equal(attr(model_set$param, 'settings')$seed, seed)
-model_set <- setLassoLogisticRegression(includeCovariateIds = c(1,2))
-expect_equal(model_set$param$includeCovariateIds, c(1,2))
+ variance <- runif(1)
+ model_set <- setLassoLogisticRegression(variance = variance)
+ expect_equal(model_set$param$priorParams$variance, variance)
-model_set <- setLassoLogisticRegression(noShrinkage = c(1,3))
-expect_equal(model_set$param$priorParams$exclude, c(1,3))
+ seed <- sample(10, 1)
+ model_set <- setLassoLogisticRegression(seed = seed)
+ expect_equal(attr(model_set$param, "settings")$seed, seed)
-threads <- sample(10,1)
-model_set <- setLassoLogisticRegression(threads = threads)
-expect_equal(attr(model_set$param, 'settings')$threads, threads)
+ model_set <- setLassoLogisticRegression(includeCovariateIds = c(1, 2))
+ expect_equal(model_set$param$includeCovariateIds, c(1, 2))
-model_set <- setLassoLogisticRegression(forceIntercept = T)
-expect_equal(model_set$param$priorParams$forceIntercept, T)
+ model_set <- setLassoLogisticRegression(noShrinkage = c(1, 3))
+ expect_equal(model_set$param$priorParams$exclude, c(1, 3))
-model_set <- setLassoLogisticRegression(upperLimit = 1)
-expect_equal(model_set$param$upperLimit, 1)
+ threads <- sample(10, 1)
+ model_set <- setLassoLogisticRegression(threads = threads)
+ expect_equal(attr(model_set$param, "settings")$threads, threads)
-model_set <- setLassoLogisticRegression(lowerLimit = 1)
-expect_equal(model_set$param$lowerLimit, 1)
+ model_set <- setLassoLogisticRegression(forceIntercept = TRUE)
+ expect_equal(model_set$param$priorParams$forceIntercept, TRUE)
-tolerance <- runif(1)
-model_set <- setLassoLogisticRegression(tolerance = tolerance)
-expect_equal(attr(model_set$param, 'settings')$tolerance, tolerance)
+ model_set <- setLassoLogisticRegression(upperLimit = 1)
+ expect_equal(model_set$param$upperLimit, 1)
-maxIterations <- sample(100,1)
-model_set <- setLassoLogisticRegression(maxIterations = maxIterations)
-expect_equal(attr(model_set$param, 'settings')$maxIterations, maxIterations)
+ model_set <- setLassoLogisticRegression(lowerLimit = 1)
+ expect_equal(model_set$param$lowerLimit, 1)
+ tolerance <- runif(1)
+ model_set <- setLassoLogisticRegression(tolerance = tolerance)
+ expect_equal(attr(model_set$param, "settings")$tolerance, tolerance)
+ maxIterations <- sample(100, 1)
+ model_set <- setLassoLogisticRegression(maxIterations = maxIterations)
+ expect_equal(attr(model_set$param, "settings")$maxIterations, maxIterations)
})
test_that("set LR incorrect inputs", {
-
expect_error(setLassoLogisticRegression(variance = -0.01))
- expect_error(setLassoLogisticRegression(variance = 'variance'))
- expect_error(setLassoLogisticRegression(seed = 'seed'))
- expect_error(setLassoLogisticRegression(threads = 'threads'))
-
- expect_error(setLassoLogisticRegression(lowerLimit = 'lowerLimit'))
- expect_error(setLassoLogisticRegression(upperLimit = 'upperLimit'))
- expect_error(setLassoLogisticRegression(lowerLimit=3, upperLimit = 1))
-
+ expect_error(setLassoLogisticRegression(variance = "variance"))
+ expect_error(setLassoLogisticRegression(seed = "seed"))
+ expect_error(setLassoLogisticRegression(threads = "threads"))
+
+ expect_error(setLassoLogisticRegression(lowerLimit = "lowerLimit"))
+ expect_error(setLassoLogisticRegression(upperLimit = "upperLimit"))
+ expect_error(setLassoLogisticRegression(lowerLimit = 3, upperLimit = 1))
})
test_that("set cox regression inputs", {
-
- #=====================================
- # checking Cox Regression
- #=====================================
-
- model_set <- setCoxModel()
- testthat::expect_that(model_set, testthat::is_a("modelSettings"))
-
- expect_equal(model_set$fitFunction, 'fitCyclopsModel')
- expect_is(model_set$param, 'list')
-
- expect_equal(model_set$param$priorParams$priorType, "laplace")
-
- expect_equal(attr(model_set$param, 'settings')$modelType, 'cox')
- expect_equal(attr(model_set$param, 'settings')$priorfunction, 'Cyclops::createPrior')
- expect_equal(attr(model_set$param, 'settings')$addIntercept, F)
- expect_equal(attr(model_set$param, 'settings')$useControl, T)
- expect_equal(attr(model_set$param, 'settings')$name, "LASSO Cox Regression")
- expect_equal(attr(model_set$param, 'settings')$cvRepetitions, 1)
-
+ skip_if_not_installed("polspline")
+ skip_on_cran()
+ # =====================================
+ # checking Cox Regression
+ # =====================================
+
+ modelSet <- setCoxModel()
+ testthat::expect_that(modelSet, testthat::is_a("modelSettings"))
+
+ expect_equal(modelSet$fitFunction, "fitCyclopsModel")
+ expect_is(modelSet$param, "list")
+
+ expect_equal(modelSet$param$priorParams$priorType, "laplace")
+
+ expect_equal(attr(modelSet$param, "settings")$modelType, "cox")
+ expect_equal(attr(modelSet$param, "settings")$priorfunction, "Cyclops::createPrior")
+ expect_equal(attr(modelSet$param, "settings")$addIntercept, FALSE)
+ expect_equal(attr(modelSet$param, "settings")$useControl, TRUE)
+ expect_equal(attr(modelSet$param, "settings")$name, "LASSO Cox Regression")
+ expect_equal(attr(modelSet$param, "settings")$cvRepetitions, 1)
+
variance <- runif(1)
- model_set <- setCoxModel(variance = variance)
- expect_equal(model_set$param$priorParams$variance, variance)
-
- seed <- sample(10,1)
- model_set <- setCoxModel(seed = seed)
- expect_equal(attr(model_set$param, 'settings')$seed, seed)
-
- model_set <- setCoxModel(includeCovariateIds = c(1,2))
- expect_equal(model_set$param$includeCovariateIds, c(1,2))
-
- model_set <- setCoxModel(upperLimit = 1)
- expect_equal(model_set$param$upperLimit, 1)
-
- model_set <- setCoxModel(lowerLimit = 1)
- expect_equal(model_set$param$lowerLimit, 1)
-
- model_set <- setCoxModel(noShrinkage = c(1,3))
- expect_equal(model_set$param$priorParams$exclude, c(1,3))
-
- threads <- sample(10,1)
- model_set <- setCoxModel(threads = threads)
- expect_equal(attr(model_set$param, 'settings')$threads, threads)
-
+ modelSet <- setCoxModel(variance = variance)
+ expect_equal(modelSet$param$priorParams$variance, variance)
+
+ seed <- sample(10, 1)
+ modelSet <- setCoxModel(seed = seed)
+ expect_equal(attr(modelSet$param, "settings")$seed, seed)
+
+ modelSet <- setCoxModel(includeCovariateIds = c(1, 2))
+ expect_equal(modelSet$param$includeCovariateIds, c(1, 2))
+
+ modelSet <- setCoxModel(upperLimit = 1)
+ expect_equal(modelSet$param$upperLimit, 1)
+
+ modelSet <- setCoxModel(lowerLimit = 1)
+ expect_equal(modelSet$param$lowerLimit, 1)
+
+ modelSet <- setCoxModel(noShrinkage = c(1, 3))
+ expect_equal(modelSet$param$priorParams$exclude, c(1, 3))
+
+ threads <- sample(10, 1)
+ modelSet <- setCoxModel(threads = threads)
+ expect_equal(attr(modelSet$param, "settings")$threads, threads)
+
tolerance <- runif(1)
- model_set <- setCoxModel(tolerance = tolerance)
- expect_equal(attr(model_set$param, 'settings')$tolerance, tolerance)
-
- maxIterations <- sample(100,1)
- model_set <- setCoxModel(maxIterations = maxIterations)
- expect_equal(attr(model_set$param, 'settings')$maxIterations, maxIterations)
-
-
+ modelSet <- setCoxModel(tolerance = tolerance)
+ expect_equal(attr(modelSet$param, "settings")$tolerance, tolerance)
+
+ maxIterations <- sample(100, 1)
+ modelSet <- setCoxModel(maxIterations = maxIterations)
+ expect_equal(attr(modelSet$param, "settings")$maxIterations, maxIterations)
})
test_that("set cox regression incorrect inputs", {
-
+ skip_if_not_installed("polspline")
+ skip_on_cran()
expect_error(setCoxModel(variance = -0.01))
- expect_error(setCoxModel(variance = 'variance'))
- expect_error(setCoxModel(seed = 'seed'))
- expect_error(setCoxModel(threads = 'threads'))
-
- expect_error(setCoxModel(lowerLimit = 'lowerLimit'))
- expect_error(setCoxModel(upperLimit = 'upperLimit'))
- expect_error(setCoxModel(lowerLimit=3, upperLimit = 1))
-
+ expect_error(setCoxModel(variance = "variance"))
+ expect_error(setCoxModel(seed = "seed"))
+ expect_error(setCoxModel(threads = "threads"))
+
+ expect_error(setCoxModel(lowerLimit = "lowerLimit"))
+ expect_error(setCoxModel(upperLimit = "upperLimit"))
+ expect_error(setCoxModel(lowerLimit = 3, upperLimit = 1))
})
@@ -174,96 +168,96 @@ test_that("set cox regression incorrect inputs", {
test_that("set IHT inputs", {
-
- #=====================================
+ skip_if_not_installed("IterativeHardThresholding")
+ skip_on_cran()
+ # =====================================
# checking IHT
- #=====================================
- model_set <- setIterativeHardThresholding()
- testthat::expect_that(model_set, testthat::is_a("modelSettings"))
-
- expect_equal(model_set$fitFunction, 'fitCyclopsModel')
- expect_is(model_set$param, 'list')
-
- expect_equal(attr(model_set$param, 'settings')$modelType, 'logistic')
- expect_equal(attr(model_set$param, 'settings')$priorfunction,'IterativeHardThresholding::createIhtPrior')
- expect_equal(attr(model_set$param, 'settings')$addIntercept, F)
- expect_equal(attr(model_set$param, 'settings')$useControl, F)
- expect_equal(attr(model_set$param, 'settings')$name, "Iterative Hard Thresholding")
- expect_equal(attr(model_set$param, 'settings')$crossValidationInPrior, F)
-
- k <- sample(100,1)
- model_set <- setIterativeHardThresholding(K = k)
- expect_equal(model_set$param$priorParams$K, k)
-
-
- penalty <- sample(c('bic', 'aic'),1)
- model_set <- setIterativeHardThresholding(penalty = penalty)
- expect_equal(model_set$param$priorParams$penalty, penalty)
-
- model_set <- setIterativeHardThresholding(exclude = c(1,2))
- expect_equal(model_set$param$priorParams$exclude, c(1,2))
-
- model_set <- setIterativeHardThresholding(forceIntercept = T)
- expect_equal(model_set$param$priorParams$forceIntercept, T)
-
- model_set <- setIterativeHardThresholding(fitBestSubset = T)
- expect_equal(model_set$param$priorParams$fitBestSubset, T)
-
+ # =====================================
+ modelSet <- setIterativeHardThresholding()
+ testthat::expect_that(modelSet, testthat::is_a("modelSettings"))
+
+ expect_equal(modelSet$fitFunction, "fitCyclopsModel")
+ expect_is(modelSet$param, "list")
+
+ expect_equal(attr(modelSet$param, "settings")$modelType, "logistic")
+ expect_equal(attr(modelSet$param, "settings")$priorfunction, "IterativeHardThresholding::createIhtPrior")
+ expect_equal(attr(modelSet$param, "settings")$addIntercept, FALSE)
+ expect_equal(attr(modelSet$param, "settings")$useControl, FALSE)
+ expect_equal(attr(modelSet$param, "settings")$name, "Iterative Hard Thresholding")
+ expect_equal(attr(modelSet$param, "settings")$crossValidationInPrior, FALSE)
+
+ k <- sample(100, 1)
+ modelSet <- setIterativeHardThresholding(K = k)
+ expect_equal(modelSet$param$priorParams$K, k)
+
+
+ penalty <- sample(c("bic", "aic"), 1)
+ modelSet <- setIterativeHardThresholding(penalty = penalty)
+ expect_equal(modelSet$param$priorParams$penalty, penalty)
+
+ modelSet <- setIterativeHardThresholding(exclude = c(1, 2))
+ expect_equal(modelSet$param$priorParams$exclude, c(1, 2))
+
+ modelSet <- setIterativeHardThresholding(forceIntercept = TRUE)
+ expect_equal(modelSet$param$priorParams$forceIntercept, TRUE)
+
+ modelSet <- setIterativeHardThresholding(fitBestSubset = TRUE)
+ expect_equal(modelSet$param$priorParams$fitBestSubset, TRUE)
+
# add other parameter checks
## initialRidgeVariance
## tolerance
## maxIterations
## threshold
- ## delta
-
- seed <- sample(10,1)
- model_set <- setIterativeHardThresholding(seed = seed)
- expect_equal(attr(model_set$param, 'settings')$seed, seed)
-
+ ## delta
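+ # a minimal sketch for the checks listed above, assuming these arguments
+ # pass through to priorParams the same way K and penalty do
+ modelSet <- setIterativeHardThresholding(tolerance = 1e-07)
+ expect_equal(modelSet$param$priorParams$tolerance, 1e-07)
+
+ modelSet <- setIterativeHardThresholding(maxIterations = 100)
+ expect_equal(modelSet$param$priorParams$maxIterations, 100)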
+
+ seed <- sample(10, 1)
+ modelSet <- setIterativeHardThresholding(seed = seed)
+ expect_equal(attr(modelSet$param, "settings")$seed, seed)
})
test_that("test IHT incorrect inputs", {
-
-testthat::expect_error(setIterativeHardThresholding(K = 0))
-testthat::expect_error(setIterativeHardThresholding(penalty = 'L1'))
-testthat::expect_error(setIterativeHardThresholding(fitBestSubset = "true"))
-testthat::expect_error(setIterativeHardThresholding(seed = 'F'))
-
+ skip_if_not_installed("IterativeHardThresholding")
+ skip_on_cran()
+ testthat::expect_error(setIterativeHardThresholding(K = 0))
+ testthat::expect_error(setIterativeHardThresholding(penalty = "L1"))
+ testthat::expect_error(setIterativeHardThresholding(fitBestSubset = "true"))
+ testthat::expect_error(setIterativeHardThresholding(seed = "F"))
})
-#================ FUNCTION TESTING
+# ================ FUNCTION TESTING
test_that("test logistic regression runs", {
-
-modelSettings <- setLassoLogisticRegression()
-
-fitModel <- fitPlp(
- trainData = trainData,
- modelSettings = modelSettings,
- search = "grid",
- analysisId = 'lrTest',
- analysisPath = tempdir()
+ modelSettings <- setLassoLogisticRegression()
+
+ fitModel <- fitPlp(
+ trainData = trainData,
+ modelSettings = modelSettings,
+ search = "grid",
+ analysisId = "lrTest",
+ analysisPath = tempdir()
)
-expect_equal(length(unique(fitModel$prediction$evaluationType)),2)
-expect_equal(nrow(fitModel$prediction), nrow(trainData$labels)*2)
-expect_true(length(fitModel$model$coefficients) < trainData$covariateData$covariateRef %>% dplyr::tally() %>% dplyr::pull()+1)
+ expect_equal(length(unique(fitModel$prediction$evaluationType)), 2)
+ expect_equal(nrow(fitModel$prediction), nrow(trainData$labels) * 2)
+ expect_true(length(fitModel$model$coefficients) < trainData$covariateData$covariateRef %>%
+ dplyr::tally() %>%
+ dplyr::pull() + 1)
-expect_true(!is.null(fitModel$trainDetails$trainingTime))
-expect_equal(fitModel$trainDetails$trainingDate,Sys.Date())
+ expect_true(!is.null(fitModel$trainDetails$trainingTime))
+ expect_equal(fitModel$trainDetails$trainingDate, Sys.Date())
-expect_equal(
- nrow(fitModel$covariateImportance),
- trainData$covariateData$covariateRef %>% dplyr::tally() %>% dplyr::pull()
-)
-
-expect_true('covariateValue' %in% colnames(fitModel$covariateImportance))
+ expect_equal(
+ nrow(fitModel$covariateImportance),
+ trainData$covariateData$covariateRef %>% dplyr::tally() %>% dplyr::pull()
+ )
-expect_equal(fitModel$modelDesign$outcomeId, attr(trainData, 'metaData')$outcomeId)
-expect_equal(fitModel$modelDesign$targetId, attr(trainData, 'metaData')$targetId)
+ expect_true("covariateValue" %in% colnames(fitModel$covariateImportance))
+ expect_equal(fitModel$modelDesign$outcomeId, attr(trainData, "metaData")$outcomeId)
+ expect_equal(fitModel$modelDesign$targetId, attr(trainData, "metaData")$targetId)
})
diff --git a/tests/testthat/test-dataSplitting.R b/tests/testthat/test-dataSplitting.R
index b8ce628bb..78a78f3b2 100644
--- a/tests/testthat/test-dataSplitting.R
+++ b/tests/testthat/test-dataSplitting.R
@@ -2,13 +2,13 @@
# Copyright 2021 Observational Health Data Sciences and Informatics
#
# This file is part of PatientLevelPrediction
-#
+#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
-#
+#
# http://www.apache.org/licenses/LICENSE-2.0
-#
+#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -21,30 +21,29 @@ context("Data splitting")
# make sure pop is all plpData people
populationT <- plpData$cohorts
-populationT$outcomeCount <- sample(c(0,1), nrow(populationT), replace = T)
+populationT$outcomeCount <- sample(c(0, 1), nrow(populationT), replace = TRUE)
attr(populationT, "metaData")$outcomeId <- outcomeId
-attr(populationT, "metaData")$populationSettings <- list(madeup = T)
-attr(populationT, "metaData")$restrictPlpDataSettings <- list(madeup = T)
-attr(populationT, "metaData")$attrition <- c(1,2,3)
+attr(populationT, "metaData")$populationSettings <- list(madeup = TRUE)
+attr(populationT, "metaData")$restrictPlpDataSettings <- list(madeup = TRUE)
+attr(populationT, "metaData")$attrition <- c(1, 2, 3)
# check correct inputs
-testFraction1 <- sample(9,1)/10
-trainFraction1 <- 1-testFraction1
-splitSeed1 <- sample(100000,1)
-nfold1 <- 1+sample(10,1)
-type1 <- sample(c('stratified', 'time', 'subject'), 1)
+testFraction1 <- sample(9, 1) / 10
+trainFraction1 <- 1 - testFraction1
+splitSeed1 <- sample(100000, 1)
+nfold1 <- 1 + sample(10, 1)
+type1 <- sample(c("stratified", "time", "subject"), 1)
defaultSetting <- function(
- testFraction = testFraction1,
- trainFraction = trainFraction1,
- splitSeed = splitSeed1,
- nfold = nfold1,
- type = type1
-){
+ testFraction = testFraction1,
+ trainFraction = trainFraction1,
+ splitSeed = splitSeed1,
+ nfold = nfold1,
+ type = type1) {
result <- createDefaultSplitSetting(
- testFraction = testFraction,
- trainFraction = trainFraction,
- splitSeed = splitSeed,
+ testFraction = testFraction,
+ trainFraction = trainFraction,
+ splitSeed = splitSeed,
nfold = nfold,
type = type
)
@@ -53,369 +52,418 @@ defaultSetting <- function(
test_that("createDefaultSplitSetting", {
-
splitSettings <- defaultSetting()
-
- expect_is(splitSettings, 'splitSettings')
-
- expectFun <- 'randomSplitter'
- if(type1 == 'time'){
- expectFun <- 'timeSplitter'
+
+ expect_is(splitSettings, "splitSettings")
+
+ expectFun <- "randomSplitter"
+ if (type1 == "time") {
+ expectFun <- "timeSplitter"
}
- if(type1 == 'subject'){
- expectFun <- 'subjectSplitter'
+ if (type1 == "subject") {
+ expectFun <- "subjectSplitter"
}
-
+
expect_equal(attr(splitSettings, "fun"), expectFun)
-
+
expect_equal(splitSettings$test, testFraction1)
expect_equal(splitSettings$train, trainFraction1)
expect_equal(splitSettings$seed, splitSeed1)
expect_equal(splitSettings$nfold, nfold1)
-
- #check input errors for testFraction
+
+ # check input errors for testFraction
expect_error(
- defaultSetting(testFraction = 'character')
+ defaultSetting(testFraction = "character")
)
-
+
expect_error(
defaultSetting(testFraction = -0.1)
)
-
+
expect_error(
defaultSetting(testFraction = 1.001)
)
-
+
# check input error for trainFraction
expect_error(
- defaultSetting(trainFraction = 'trainFraction')
+ defaultSetting(trainFraction = "trainFraction")
)
-
+
expect_error(
defaultSetting(trainFraction = 1.2)
)
-
+
expect_error(
defaultSetting(trainFraction = -0.2)
)
-
- #check error for splitSeed
-
+
+ # check error for splitSeed
+
expect_error(
- defaultSetting(splitSeed = NULL)
+ defaultSetting(splitSeed = NULL)
)
-
+
expect_error(
- defaultSetting(splitSeed = 'NULL')
+ defaultSetting(splitSeed = "NULL")
)
-
+
# check error for nfold
expect_error(
defaultSetting(nfold = NULL)
)
expect_error(
- defaultSetting(nfold = 'NULL')
+ defaultSetting(nfold = "NULL")
)
-
+
# incorrect type
expect_error(
- defaultSetting(type = 'madeup')
+ defaultSetting(type = "madeup")
)
expect_error(
- defaultSetting(type = NULL)
+ defaultSetting(type = NULL)
)
expect_error(
- defaultSetting(type = 1)
+ defaultSetting(type = 1)
)
-
})
test_that("Main split function: splitData", {
-
# check default settings with test/train
splitSettings <- defaultSetting()
-
+
splitData <- splitData(
plpData = plpData,
population = populationT,
splitSettings = splitSettings
)
-
+
# check class
- expect_is(splitData, 'splitData')
-
+ expect_is(splitData, "splitData")
+
# should have test/train
- expect_equal(names(splitData), c('Train', 'Test'))
-
+ expect_equal(names(splitData), c("Train", "Test"))
+
# train and test are CovariateData
- expect_is(splitData$Train$covariateData, 'CovariateData')
- expect_is(splitData$Test$covariateData, 'CovariateData')
-
+ expect_is(splitData$Train$covariateData, "CovariateData")
+ expect_is(splitData$Test$covariateData, "CovariateData")
+
# Train has labels/folds/covariateData
- expect_equal(names(splitData$Train), c('labels', 'folds', 'covariateData'))
-
+ expect_equal(names(splitData$Train), c("labels", "folds", "covariateData"))
+
# Test has labels/covariateData
- expect_equal(names(splitData$Test), c('labels', 'covariateData'))
-
+ expect_equal(names(splitData$Test), c("labels", "covariateData"))
+
# check attributes for Train
expect_equal(attr(splitData$Train, "metaData")$outcomeId, attr(populationT, "metaData")$outcomeId)
expect_equal(attr(splitData$Train, "metaData")$targetId, plpData$metaData$databaseDetails$targetId)
expect_equal(
- attr(splitData$Train, "metaData")$cdmDatabaseSchema,
+ attr(splitData$Train, "metaData")$cdmDatabaseSchema,
plpData$metaData$databaseDetails$cdmDatabaseSchema
- )
-
- expect_is(attr(splitData$Train, "metaData")$restrictPlpDataSettings, 'list')
+ )
+
+ expect_is(attr(splitData$Train, "metaData")$restrictPlpDataSettings, "list")
expect_equal(
- attr(splitData$Train, "metaData")$covariateSettings,
+ attr(splitData$Train, "metaData")$covariateSettings,
plpData$metaData$covariateSettings
)
expect_equal(
- attr(splitData$Train, "metaData")$populationSettings,
+ attr(splitData$Train, "metaData")$populationSettings,
attr(populationT, "metaData")$populationSettings
)
expect_equal(
- attr(splitData$Train, "metaData")$attrition,
+ attr(splitData$Train, "metaData")$attrition,
attr(populationT, "metaData")$attrition
)
-
+
expect_equal(
- attr(splitData$Train, "metaData")$splitSettings,
+ attr(splitData$Train, "metaData")$splitSettings,
splitSettings
)
-
+
# train+test should be full data as train+test = 1
expect_equal(
- nrow(splitData$Train$labels) + nrow(splitData$Test$labels),
+ nrow(splitData$Train$labels) + nrow(splitData$Test$labels),
nrow(populationT)
)
expect_equal(
- splitData$Train$covariateData$covariates %>% dplyr::tally() %>% dplyr::pull() +
- splitData$Test$covariateData$covariates %>% dplyr::tally() %>% dplyr::pull(),
+ splitData$Train$covariateData$covariates %>% dplyr::tally() %>% dplyr::pull() +
+ splitData$Test$covariateData$covariates %>% dplyr::tally() %>% dplyr::pull(),
plpData$covariateData$covariates %>% dplyr::tally() %>% dplyr::pull()
)
-
+
# make sure nfolds is correct
expect_equal(
min(splitData$Train$folds$index),
1
)
-
+
expect_equal(
max(splitData$Train$folds$index),
splitSettings$nfold
)
-
-
+
+
# check when test is 0
splitSettings <- defaultSetting(
- testFraction = 0,
+ testFraction = 0,
trainFraction = 1
- )
-
+ )
+
splitData <- splitData(
plpData = plpData,
population = populationT,
splitSettings = splitSettings
)
-
+
# should just have train
- expect_equal(names(splitData), c('Train'))
-
+ expect_equal(names(splitData), c("Train"))
+
# train labels should be the same size at the population
expect_equal(
- nrow(splitData$Train$labels),
+ nrow(splitData$Train$labels),
nrow(populationT)
)
expect_equal(
- splitData$Train$covariateData$covariates %>% dplyr::tally() %>% dplyr::pull(),
+ splitData$Train$covariateData$covariates %>% dplyr::tally() %>% dplyr::pull(),
plpData$covariateData$covariates %>% dplyr::tally() %>% dplyr::pull()
)
-
})
test_that("dataSummary works", {
-
splitSettings <- defaultSetting(
- testFraction = 0,
+ testFraction = 0,
trainFraction = 1
)
-
+
splitData <- splitData(
plpData = plpData,
population = populationT,
splitSettings = splitSettings
)
-summaryPrint <- dataSummary(splitData)
-expect_equal(summaryPrint, TRUE)
-
+ summaryPrint <- dataSummary(splitData)
+ expect_equal(summaryPrint, TRUE)
})
test_that("Data stratified splitting", {
-
splitSettings <- defaultSetting(
- test=0.3,
- nfold=3
- )
+ test = 0.3,
+ nfold = 3
+ )
# error due to insufficient outcomes
- DSpopulation1 <- data.frame(rowId=1:20, outcomeCount=c(1,1,1,1,rep(0,16)))
- expect_error(randomSplitter(population = DSpopulation1, splitSettings = splitSettings))
-
- DSpopulation2 <- data.frame(rowId=1:200, outcomeCount=c(rep(1,42),rep(0,158)))
+ dsPopulation1 <- data.frame(rowId = 1:20, outcomeCount = c(1, 1, 1, 1, rep(0, 16)))
+ expect_error(randomSplitter(population = dsPopulation1, splitSettings = splitSettings))
+
+ dsPopulation2 <- data.frame(rowId = 1:200, outcomeCount = c(rep(1, 42), rep(0, 158)))
splitSettings <- defaultSetting(
train = 0.7,
- test = 0.3,
+ test = 0.3,
nfold = 4
)
# fold creation check 1 (fixed)
- test <- randomSplitter(population = DSpopulation2, splitSettings = splitSettings)
- test <- merge(DSpopulation2, test)
+ test <- randomSplitter(population = dsPopulation2, splitSettings = splitSettings)
+ test <- merge(dsPopulation2, test)
test <- table(test$outcomeCount, test$index)
- test.returned <- paste(test, collapse='-')
- test.expected <- paste(matrix(c(47,28,28,28,27,12,8,8,7,7), ncol=5, byrow=T),collapse='-')
- expect_identical(test.returned, test.expected)
-
+ testReturned <- paste(test, collapse = "-")
+ testExpected <- paste(matrix(c(47, 28, 28, 28, 27, 12, 8, 8, 7, 7), ncol = 5, byrow = TRUE), collapse = "-")
+ expect_identical(testReturned, testExpected)
+
# fold creation check 2 (sum)
size <- 500
- DSpopulation3 <- data.frame(rowId=1:size, outcomeCount=c(rep(1,floor(size/3)),rep(0,size-floor(size/3))))
+ dsPopulation3 <- data.frame(rowId = 1:size, outcomeCount = c(rep(1, floor(size / 3)), rep(0, size - floor(size / 3))))
splitSettings <- defaultSetting(
train = 0.8,
- test = 0.2,
+ test = 0.2,
nfold = 4
)
- test <- randomSplitter(population = DSpopulation3, splitSettings = splitSettings)
- test <- merge(DSpopulation3, test)
+ test <- randomSplitter(population = dsPopulation3, splitSettings = splitSettings)
+ test <- merge(dsPopulation3, test)
test <- table(test$outcomeCount, test$index)
expect_that(sum(test), equals(size))
-
+
# test the training fraction parameter for learning curves
- size = 500
- DSpopulation4 <- data.frame(rowId=1:size,
- outcomeCount=c(rep(1,floor(size/3)),
- rep(0,size-floor(size/3))))
+ size <- 500
+ dsPopulation4 <- data.frame(
+ rowId = 1:size,
+ outcomeCount = c(
+ rep(1, floor(size / 3)),
+ rep(0, size - floor(size / 3))
+ )
+ )
splitSettings <- defaultSetting(
train = 0.4,
- test = 0.2,
+ test = 0.2,
nfold = 4
)
- test <- randomSplitter(population = DSpopulation4, splitSettings = splitSettings)
+ test <- randomSplitter(population = dsPopulation4, splitSettings = splitSettings)
- tolerance = 5
- excludedPatients = 200
+ tolerance <- 5
+ excludedPatients <- 200
# test, if the number of patients in each fold are roughly the same
expect_equal(length(test$index[test$index == 1]),
- length(test$index[test$index == 3]),
- tolerance = tolerance)
+ length(test$index[test$index == 3]),
+ tolerance = tolerance
+ )
expect_equal(length(test$index[test$index == 2]),
- length(test$index[test$index == 4]),
- tolerance = tolerance)
+ length(test$index[test$index == 4]),
+ tolerance = tolerance
+ )
expect_equal(length(test$index[test$index == 1]),
- length(test$index[test$index == 4]),
- tolerance = tolerance)
+ length(test$index[test$index == 4]),
+ tolerance = tolerance
+ )
# test, if patients were excluded according to the training fraction
- expect_equal(length(test$index[test$index == 0]),
- excludedPatients)
+ expect_equal(
+ length(test$index[test$index == 0]),
+ excludedPatients
+ )
})
test_that("Data splitting by time", {
-
# fold creation check (sum)
size <- 500
set.seed(1)
- DSpopulation2 <- data.frame(
- rowId=1:size,
- outcomeCount=sample(0:1,size,replace=TRUE),
+ dsPopulation2 <- data.frame(
+ rowId = 1:size,
+ outcomeCount = sample(0:1, size, replace = TRUE),
cohortStartDate = as.Date("2010-01-01") + c(1:size)
- )
+ )
splitSettings <- defaultSetting(
train = 0.8,
- test = 0.2,
+ test = 0.2,
nfold = 4
)
-
- test <- timeSplitter(population = DSpopulation2, splitSettings = splitSettings)
- test <- merge(DSpopulation2, test)
+
+ test <- timeSplitter(population = dsPopulation2, splitSettings = splitSettings)
+ test <- merge(dsPopulation2, test)
test <- table(test$outcomeCount, test$index)
expect_that(sum(test), equals(size))
-
+
# test the training fraction parameter for learning curves
size <- 500
set.seed(1)
- DSpopulation3 <- data.frame(rowId=1:size,
- outcomeCount=sample(0:1,size,replace=TRUE),
- cohortStartDate = as.Date("2010-01-01") + c(1:size))
+ dsPopulation3 <- data.frame(
+ rowId = 1:size,
+ outcomeCount = sample(0:1, size, replace = TRUE),
+ cohortStartDate = as.Date("2010-01-01") + c(1:size)
+ )
splitSettings <- defaultSetting(
train = 0.4,
- test = 0.2,
+ test = 0.2,
nfold = 4
)
- test <- timeSplitter(population = DSpopulation3, splitSettings = splitSettings)
-
- tolerance = 5
- excludedPatients = 196
+ test <- timeSplitter(population = dsPopulation3, splitSettings = splitSettings)
+
+ tolerance <- 5
+ excludedPatients <- 196
# test, if the number of patients in each fold are roughly the same
expect_equal(length(test$index[test$index == 1]),
- length(test$index[test$index == 3]),
- tolerance = tolerance)
+ length(test$index[test$index == 3]),
+ tolerance = tolerance
+ )
expect_equal(length(test$index[test$index == 2]),
- length(test$index[test$index == 4]),
- tolerance = tolerance)
+ length(test$index[test$index == 4]),
+ tolerance = tolerance
+ )
expect_equal(length(test$index[test$index == 1]),
- length(test$index[test$index == 4]),
- tolerance = tolerance)
+ length(test$index[test$index == 4]),
+ tolerance = tolerance
+ )
# test, if patients were excluded according to the training fraction
- expect_equal(length(test$index[test$index == 0]),
- excludedPatients)
-
+ expect_equal(
+ length(test$index[test$index == 0]),
+ excludedPatients
+ )
})
test_that("Data splitting by subject", {
-# error message checks
- DSpopulation1 <- data.frame(rowId=1:20, subjectId = 1:20, outcomeCount=c(1,1,1,1,rep(0,16)))
+ # error message checks
+ dsPopulation1 <- data.frame(rowId = 1:20, subjectId = 1:20, outcomeCount = c(1, 1, 1, 1, rep(0, 16)))
splitSettings <- defaultSetting(
train = 0.7,
- test = 0.3,
+ test = 0.3,
nfold = 3
)
- expect_error(subjectSplitter(population = DSpopulation1, splitSettings = splitSettings ))
+ expect_error(subjectSplitter(population = dsPopulation1, splitSettings = splitSettings))
- DSpopulation2 <- data.frame(rowId=1:200,subjectId = 1:200, outcomeCount=c(rep(1,42),rep(0,158)))
+ dsPopulation2 <- data.frame(rowId = 1:200, subjectId = 1:200, outcomeCount = c(rep(1, 42), rep(0, 158)))
splitSettings <- defaultSetting(
train = 0.8,
- test = 0.2,
+ test = 0.2,
nfold = 4
)
- test <- subjectSplitter(population = DSpopulation2, splitSettings = splitSettings)
- test <- merge(DSpopulation2, test)
+ test <- subjectSplitter(population = dsPopulation2, splitSettings = splitSettings)
+ test <- merge(dsPopulation2, test)
test <- table(test$outcomeCount, test$index)
- test.returned <- paste(test, collapse='-')
- test.expected <- paste(matrix(c(32,32,32,31,31,8,9,9,8,8), ncol=5, byrow=T),collapse='-')
- expect_identical(test.returned, test.expected)
+ testReturned <- paste(test, collapse = "-")
+ testExpected <- paste(matrix(c(32, 32, 32, 31, 31, 8, 9, 9, 8, 8), ncol = 5, byrow = TRUE), collapse = "-")
+ expect_identical(testReturned, testExpected)
-# test that people are not in multiple folds
- DSpopulation3 <- data.frame(rowId=1:200,subjectId = rep(1:50,4), outcomeCount=c(rep(1,42),rep(0,158)))
+ # test that people are not in multiple folds
+ dsPopulation3 <- data.frame(rowId = 1:200, subjectId = rep(1:50, 4), outcomeCount = c(rep(1, 42), rep(0, 158)))
splitSettings <- defaultSetting(
train = 0.75,
- test = 0.25,
+ test = 0.25,
nfold = 3
)
- test <- subjectSplitter(population = DSpopulation3, splitSettings = splitSettings )
- test <- merge(DSpopulation3, test)
+ test <- subjectSplitter(population = dsPopulation3, splitSettings = splitSettings)
+ test <- merge(dsPopulation3, test)
+
+ expect_equal(unique(table(test$subjectId[test$index == -1])), 4)
+ expect_equal(unique(table(test$subjectId[test$index == 2])), 4)
+ expect_equal(unique(table(test$subjectId[test$index == 3])), 4)
+ expect_equal(unique(table(test$subjectId[test$index == 1])), 4)
- expect_equal(unique(table(test$subjectId[test$index==-1])), 4)
- expect_equal(unique(table(test$subjectId[test$index==2])), 4)
- expect_equal(unique(table(test$subjectId[test$index==3])), 4)
- expect_equal(unique(table(test$subjectId[test$index==1])), 4)
-
-# test that no subject is not assigned a fold
- expect_equal(sum(test$index==0), 0)
+  # test that every subject is assigned a fold
+ expect_equal(sum(test$index == 0), 0)
+})
+
+test_that("Existing data splitter works", {
+ # split by age
+ age <- population$ageYear
+  # create an empty index of the same length as age
+ index <- rep(0, length(age))
+ index[age > 43] <- -1 # test set
+ index[age <= 35] <- 1 # train fold 1
+ index[age > 35 & age <= 43] <- 2 # train fold 2
+ splitIds <- data.frame(rowId = population$rowId, index = index)
+ splitSettings <- createExistingSplitSettings(splitIds)
+ ageSplit <- splitData(
+ plpData = plpData,
+ population = population,
+ splitSettings = splitSettings
+ )
+
+  # test that only older subjects are in the test set
+ expect_equal(
+ length(ageSplit$Test$labels$rowId),
+ sum(age > 43)
+ )
+  # only younger subjects in the train set
+ expect_equal(
+ length(ageSplit$Train$labels$rowId),
+ sum(age <= 43)
+ )
+ # no overlap
+ expect_equal(
+ length(intersect(ageSplit$Test$labels$rowId, ageSplit$Train$labels$rowId)),
+ 0
+ )
+})
+test_that("Outcome options works", {
+ testPop <- data.frame(outcomeCount = sample(c(0, 1), 9, replace = TRUE))
+ # regular plp outcome check
+ expect_error(checkOutcomes(testPop, train = 0.75, nfold = 3))
+ withr::with_options(list("plp.outcomes" = 100), {
+ testPop <- data.frame(outcomeCount = sample(c(0, 1), 90, replace = TRUE))
+ expect_error(checkOutcomes(testPop, train = 0.75, nfold = 3))
+ })
})
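The new "Existing data splitter works" test above documents the index convention for user-supplied splits: -1 marks test rows, 0 marks excluded rows, and positive integers number the training folds. A minimal sketch of that workflow, assuming the `plpData` object and matching `population` data frame (with `rowId` and `ageYear` columns) that the test helpers provide; the companion test also shows the minimum-outcome check can be tightened through the `plp.outcomes` option (set here via `withr::with_options`).

    # assign rows by hand; -1 = test, 0 = excluded, 1..k = training folds
    index <- rep(0, nrow(population))
    index[population$ageYear > 43] <- -1
    index[population$ageYear <= 35] <- 1
    index[population$ageYear > 35 & population$ageYear <= 43] <- 2
    splitIds <- data.frame(rowId = population$rowId, index = index)

    split <- splitData(
      plpData = plpData,
      population = population,
      splitSettings = createExistingSplitSettings(splitIds)
    )
    # split$Train and split$Test now follow the supplied assignment exactly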
diff --git a/tests/testthat/test-demographicSummary.R b/tests/testthat/test-demographicSummary.R
index 90a3312a9..909648126 100644
--- a/tests/testthat/test-demographicSummary.R
+++ b/tests/testthat/test-demographicSummary.R
@@ -19,59 +19,56 @@ context("DemographicSummary binary")
test_that("getDemographicSummary", {
prediction <- data.frame(
- rowId = 1:100,
- ageYear = sample(100, 100, replace = T),
- gender = sample(c(8507,'female'), 100, replace = T),
- value= runif(100),
+ rowId = 1:100,
+ ageYear = sample(100, 100, replace = TRUE),
+ gender = sample(c(8507, "female"), 100, replace = TRUE),
+ value = runif(100),
outcomeCount = round(runif(100)),
- evaluation = rep('Test', 100)
- )
+ evaluation = rep("Test", 100)
+ )
demoSum <- getDemographicSummary(
prediction = prediction,
- predictionType = 'binary',
- typeColumn = 'evaluation'
- )
-
+ predictionType = "binary",
+ typeColumn = "evaluation"
+ )
+
expect_that(ncol(demoSum), equals(12))
- expect_true('evaluation' %in% colnames(demoSum))
-
+ expect_true("evaluation" %in% colnames(demoSum))
+
# check correct gender length
expect_equal(length(unique(prediction$gender)), length(unique(demoSum$genGroup)))
-
-
+
+
demoSumBin <- getDemographicSummary_binary(
prediction = prediction,
- evalColumn = 'evaluation'
+ evalColumn = "evaluation"
)
- expect_equal(demoSum,demoSumBin)
-
+ expect_equal(demoSum, demoSumBin)
})
test_that("getDemographicSummary", {
prediction <- data.frame(
- rowId = 1:100,
- ageYear = sample(100, 100, replace = T),
- gender = sample(c(8507,'female'), 100, replace = T),
- value= runif(100),
+ rowId = 1:100,
+ ageYear = sample(100, 100, replace = TRUE),
+ gender = sample(c(8507, "female"), 100, replace = TRUE),
+ value = runif(100),
outcomeCount = round(runif(100)),
- evaluation = rep('Test', 100),
- survivalTime = 50 + sample(730, 100, replace = T)
+ evaluation = rep("Test", 100),
+ survivalTime = 50 + sample(730, 100, replace = TRUE)
)
-
+
demoSumSurv <- getDemographicSummary_survival(
prediction = prediction,
- evalColumn = 'evaluation',
+ evalColumn = "evaluation",
timepoint = 365
)
-
- expect_is(demoSumSurv, 'data.frame')
+
+ expect_is(demoSumSurv, "data.frame")
expect_that(ncol(demoSumSurv), equals(8))
- expect_true('evaluation' %in% colnames(demoSumSurv))
-
+ expect_true("evaluation" %in% colnames(demoSumSurv))
+
# check correct gender length
expect_equal(length(unique(prediction$gender)), length(unique(demoSumSurv$genGroup)))
-
-
})
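As a usage sketch, `getDemographicSummary` only needs a prediction data frame with age, gender, predicted value, outcome count, and an evaluation column; the test above deliberately mixes a gender concept id (8507) with a free-text level to confirm both are grouped. A minimal binary call, using the standard OMOP gender concept ids (8507 male, 8532 female) instead:

    prediction <- data.frame(
      rowId = 1:100,
      ageYear = sample(100, 100, replace = TRUE),
      gender = sample(c(8507, 8532), 100, replace = TRUE),
      value = runif(100),
      outcomeCount = rbinom(100, 1, 0.3),
      evaluation = rep("Test", 100)
    )
    demoSum <- getDemographicSummary(
      prediction = prediction,
      predictionType = "binary",
      typeColumn = "evaluation"
    )
    # 12 columns, including the evaluation column and one genGroup per gender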
diff --git a/tests/testthat/test-diagnostic.R b/tests/testthat/test-diagnostic.R
index 2c261dd16..6b7d31b6b 100644
--- a/tests/testthat/test-diagnostic.R
+++ b/tests/testthat/test-diagnostic.R
@@ -18,106 +18,106 @@ context("Diagnostic")
test_that("getMaxEndDaysFromCovariates works", {
-
covariateSettings <- FeatureExtraction::createCovariateSettings(
- useDemographicsGender = T,
+ useDemographicsGender = TRUE,
endDays = -1
)
testthat::expect_equal(getMaxEndDaysFromCovariates(covariateSettings), -1)
-
+
covariateSettings <- list(
FeatureExtraction::createCovariateSettings(
- useDemographicsGender = T,
- endDays = -1
- ),
- FeatureExtraction::createCovariateSettings(
- useDemographicsGender = T,
- endDays = 2
- )
+ useDemographicsGender = TRUE,
+ endDays = -1
+ ),
+ FeatureExtraction::createCovariateSettings(
+ useDemographicsGender = TRUE,
+ endDays = 2
+ )
)
testthat::expect_equal(getMaxEndDaysFromCovariates(covariateSettings), 2)
-
+
covariateSettings <- list(
FeatureExtraction::createCovariateSettings(
- useDemographicsGender = T,
+ useDemographicsGender = TRUE,
endDays = -1,
),
PatientLevelPrediction::createCohortCovariateSettings(
- endDay = 5,
- settingId = 1,
- cohortName = 'test',
- cohortId = 1,
- analysisId = 111,
- cohortDatabaseSchema = '', cohortTable = '')
+ endDay = 5,
+ settingId = 1,
+ cohortName = "test",
+ cohortId = 1,
+ analysisId = 111,
+ cohortDatabaseSchema = "", cohortTable = ""
+ )
)
testthat::expect_equal(getMaxEndDaysFromCovariates(covariateSettings), 5)
-
+
# if no covariate setting has endDays return 0
testthat::expect_equal(
- getMaxEndDaysFromCovariates(list(empty = list(gfg=2), empty2 = list(ff=1))),
+ getMaxEndDaysFromCovariates(list(empty = list(gfg = 2), empty2 = list(ff = 1))),
0
- )
-
+ )
})
test_that("test diagnosePlp works", {
test <- diagnosePlp(
plpData = tinyPlpData,
outcomeId = outcomeId,
- analysisId = 'diagnoseTest',
+ analysisId = "diagnoseTest",
populationSettings = createStudyPopulationSettings(
- riskWindowStart = 1,
- startAnchor = 'cohort start',
+ riskWindowStart = 1,
+ startAnchor = "cohort start",
riskWindowEnd = 365,
- endAnchor = 'cohort start'
+ endAnchor = "cohort start"
),
splitSettings = createDefaultSplitSetting(),
sampleSettings = createSampleSettings(), # default none
- saveDirectory = file.path(saveLoc, 'diagnostics'),
+ saveDirectory = file.path(saveLoc, "diagnostics"),
featureEngineeringSettings = createFeatureEngineeringSettings(), # default none
modelSettings = setLassoLogisticRegression(), # default to logistic regression
preprocessSettings = createPreprocessSettings()
)
- #check results are a list
- testthat::expect_is(test, 'diagnosePlp')
-
+  # check the result is a diagnosePlp object
+ testthat::expect_is(test, "diagnosePlp")
+
# check list names
testthat::expect_equal(
- sum( names(test) %in%
- c('summary','participants','predictors',
- 'outcomes', 'designs', 'modelDesign',
- 'databaseSchema')
- )
- , 7)
-
+ sum(names(test) %in%
+ c(
+ "summary", "participants", "predictors",
+ "outcomes", "designs", "modelDesign",
+ "databaseSchema"
+ )),
+ 7
+ )
+
  # check the results are saved into the diagnostics directory
- testthat::expect_equal(T, dir.exists(file.path(saveLoc, 'diagnostics')))
- testthat::expect_equal(T, file.exists(file.path(saveLoc, 'diagnostics','diagnoseTest','diagnosePlp.rds')))
-
- testthat::expect_is(test$summary, 'data.frame')
- testthat::expect_is(test$participants, 'data.frame')
- testthat::expect_is(test$predictors, 'data.frame') # rename this outcome survival?
- testthat::expect_is(test$outcomes, 'data.frame')
- testthat::expect_is(test$databaseSchema, 'character')
-
+ testthat::expect_equal(TRUE, dir.exists(file.path(saveLoc, "diagnostics")))
+ testthat::expect_equal(TRUE, file.exists(file.path(saveLoc, "diagnostics", "diagnoseTest", "diagnosePlp.rds")))
+
+ testthat::expect_is(test$summary, "data.frame")
+ testthat::expect_is(test$participants, "data.frame")
+ testthat::expect_is(test$predictors, "data.frame") # rename this outcome survival?
+ testthat::expect_is(test$outcomes, "data.frame")
+ testthat::expect_is(test$databaseSchema, "character")
+
testthat::expect_true(!is.null(test$modelDesign$targetId))
testthat::expect_true(!is.null(test$modelDesign$outcomeId))
testthat::expect_true(!is.null(test$modelDesign$restrictPlpDataSettings))
testthat::expect_true(!is.null(test$modelDesign$covariateSettings))
testthat::expect_true(!is.null(test$modelDesign$populationSettings))
-
-
})
test_that("test diagnoseMultiplePlp works", {
-
analysis1 <- createModelDesign(
targetId = 1,
outcomeId = outcomeId,
- restrictPlpDataSettings = createRestrictPlpDataSettings(firstExposureOnly = F,
- washoutPeriod = 0,
- sampleSize = 100),
+ restrictPlpDataSettings = createRestrictPlpDataSettings(
+ firstExposureOnly = FALSE,
+ washoutPeriod = 0,
+ sampleSize = 100
+ ),
populationSettings = createStudyPopulationSettings(),
covariateSettings = covariateSettings,
featureEngineeringSettings = NULL,
@@ -126,13 +126,15 @@ test_that("test diagnoseMultiplePlp works", {
preprocessSettings = createPreprocessSettings(),
modelSettings = setLassoLogisticRegression(seed = 12)
)
-
+
analysis2 <- createModelDesign(
targetId = 1,
outcomeId = outcomeId,
- restrictPlpDataSettings = createRestrictPlpDataSettings(firstExposureOnly = F,
- washoutPeriod = 0,
- sampleSize = 100),
+ restrictPlpDataSettings = createRestrictPlpDataSettings(
+ firstExposureOnly = FALSE,
+ washoutPeriod = 0,
+ sampleSize = 100
+ ),
populationSettings = createStudyPopulationSettings(washoutPeriod = 400),
covariateSettings = covariateSettings,
featureEngineeringSettings = NULL,
@@ -141,26 +143,25 @@ test_that("test diagnoseMultiplePlp works", {
preprocessSettings = createPreprocessSettings(),
modelSettings = setLassoLogisticRegression(seed = 12)
)
-
+
diagnoseMultiplePlp(
databaseDetails = databaseDetails,
modelDesignList = list(
analysis1,
analysis2
- ),
+ ),
cohortDefinitions = data.frame(
cohortId = c(1, outcomeId),
- cohortName = c('target', 'outcome')
- ),
- saveDirectory = file.path(saveLoc, 'diagnosticsMultiple')
+ cohortName = c("target", "outcome")
+ ),
+ saveDirectory = file.path(saveLoc, "diagnosticsMultiple")
)
-
+
  # file.path(saveDirectory, 'settings.csv') exists
- testthat::expect_true(file.exists(file.path(saveLoc, 'diagnosticsMultiple', 'settings.csv')))
-
+ testthat::expect_true(file.exists(file.path(saveLoc, "diagnosticsMultiple", "settings.csv")))
+
# file.path(saveDirectory, settings$analysisId, 'diagnosePlp.rds') exists
- testthat::expect_true(length(dir(file.path(saveLoc, 'diagnosticsMultiple'), pattern = 'Analysis_')) == 2)
-
- testthat::expect_true(file.exists(file.path(saveLoc, 'diagnosticsMultiple', 'Analysis_1', 'diagnosePlp.rds')))
-
+ testthat::expect_true(length(dir(file.path(saveLoc, "diagnosticsMultiple"), pattern = "Analysis_")) == 2)
+
+ testthat::expect_true(file.exists(file.path(saveLoc, "diagnosticsMultiple", "Analysis_1", "diagnosePlp.rds")))
})
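The assertions above fix the on-disk layout of diagnostics: a single run writes `<saveDirectory>/<analysisId>/diagnosePlp.rds`, while `diagnoseMultiplePlp` adds a `settings.csv` plus one `Analysis_<n>` directory per model design. A sketch for loading a saved diagnostic back, assuming the `saveLoc` used by the test helpers:

    diagnosis <- readRDS(file.path(saveLoc, "diagnostics", "diagnoseTest", "diagnosePlp.rds"))
    class(diagnosis) # "diagnosePlp"
    names(diagnosis) # summary, participants, predictors, outcomes,
                     # designs, modelDesign, databaseSchema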
diff --git a/tests/testthat/test-evaluation.R b/tests/testthat/test-evaluation.R
index 6669aa488..2def26bfb 100644
--- a/tests/testthat/test-evaluation.R
+++ b/tests/testthat/test-evaluation.R
@@ -15,13 +15,6 @@
# limitations under the License.
context("Evaluation")
-library("testthat")
-library("pROC")
-library("AUC")
-library("scoring")
-library("Metrics")
-library("PRROC")
-
test_that("evaluatePlp", {
eval <- evaluatePlp(
diff --git a/tests/testthat/test-extractData.R b/tests/testthat/test-extractData.R
index 0e15eb37a..77826dc7c 100644
--- a/tests/testthat/test-extractData.R
+++ b/tests/testthat/test-extractData.R
@@ -19,24 +19,24 @@ context("extractPlp")
test_that("summary.plpData", {
attr(plpData$outcomes, "metaData")$outcomeIds <- c(outcomeId)
sum <- summary.plpData(plpData)
- testthat::expect_equal(class(sum),'summary.plpData')
+ testthat::expect_equal(class(sum), "summary.plpData")
})
test_that("getPlpData errors", {
testthat::expect_error(
getPlpData(
databaseDetails = list(targetId = NULL)
- )
+ )
)
testthat::expect_error(
getPlpData(
- databaseDetails = list(targetId = c(1,2))
- )
+ databaseDetails = list(targetId = c(1, 2))
+ )
)
testthat::expect_error(
getPlpData(
databaseDetails = list(targetId = 1, outcomeIds = NULL)
- )
+ )
)
})
@@ -51,11 +51,40 @@ test_that("getCovariateData", {
test_that("createDatabaseDetails with NULL cdmDatabaseId errors", {
testthat::expect_error(createDatabaseDetails(
- connectionDetails = list(),
- cdmDatabaseSchema = 'main',
- cdmDatabaseId = NULL,
- targetId = 1,
+ connectionDetails = list(),
+ cdmDatabaseSchema = "main",
+ cdmDatabaseId = NULL,
+ targetId = 1,
outcomeIds = outcomeId
))
})
+test_that("getPlpData checks covariateSettings object", {
+ testthat::expect_error(getPlpData(
+ databaseDetails = list(targetId = 1, outcomeIds = outcomeId),
+ covariateSettings = list()
+ ))
+
+ settings1 <-
+ FeatureExtraction::createCovariateSettings(useDemographicsGender = TRUE)
+ settings2 <-
+ FeatureExtraction::createCovariateSettings(useDemographicsAge = TRUE)
+ plpData <- getPlpData(
+ databaseDetails = databaseDetails,
+ covariateSettings = list(settings1, settings2)
+ )
+ expect_equal(
+ plpData$covariateData$covariateRef %>%
+ dplyr::pull(.data$analysisId) %>%
+ length(),
+ 3
+ )
+
+ settings3 <- list(covariateId = 3)
+ class(settings3) <- "NotCovariateSettings"
+
+ expect_error(getPlpData(
+ databaseDetails = databaseDetails,
+ covariateSettings = list(settings1, settings3)
+ ))
+})
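The new covariateSettings check means `getPlpData` accepts either a single settings object or a list of them, and each entry must be a proper covariateSettings object or extraction fails fast. A sketch combining two FeatureExtraction settings, assuming a valid `databaseDetails` such as the Eunomia-backed one from the test helpers:

    settingsGender <- FeatureExtraction::createCovariateSettings(useDemographicsGender = TRUE)
    settingsAge <- FeatureExtraction::createCovariateSettings(useDemographicsAge = TRUE)

    plpData <- getPlpData(
      databaseDetails = databaseDetails,
      covariateSettings = list(settingsGender, settingsAge)
    )
    # covariateRef now contains the analyses from both settings objects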
diff --git a/tests/testthat/test-featureEngineering.R b/tests/testthat/test-featureEngineering.R
index 8ae88f5ce..c283b2af1 100644
--- a/tests/testthat/test-featureEngineering.R
+++ b/tests/testthat/test-featureEngineering.R
@@ -18,275 +18,283 @@ library("testthat")
context("FeatureEngineering")
-testFEFun <- function(type = 'none'){
-
+testFEFun <- function(type = "none") {
result <- createFeatureEngineeringSettings(type = type)
-
+
return(result)
}
-
test_that("createFeatureEngineeringSettings correct class", {
-
featureEngineeringSettings <- testFEFun()
-
- expect_is(featureEngineeringSettings, 'featureEngineeringSettings')
-
- checkFun <- 'sameData' # this is the only option at the moment, edit this when more are added
+
+ expect_is(featureEngineeringSettings, "featureEngineeringSettings")
+
+ checkFun <- "sameData"
expect_equal(attr(featureEngineeringSettings, "fun"), checkFun)
-
})
-
-testUniFun <- function(k = 100){
-
+testUniFun <- function(k = 100) {
result <- createUnivariateFeatureSelection(k = k)
-
return(result)
}
-
-
test_that("createUnivariateFeatureSelection correct class", {
- k <- sample(1000,1)
+ skip_if_not_installed("reticulate")
+ skip_on_cran()
+ k <- sample(1000, 1)
featureEngineeringSettings <- testUniFun(k = k)
-
- expect_is(featureEngineeringSettings, 'featureEngineeringSettings')
+
+ expect_is(featureEngineeringSettings, "featureEngineeringSettings")
expect_equal(featureEngineeringSettings$k, k)
- expect_equal(attr(featureEngineeringSettings, "fun"), 'univariateFeatureSelection')
-
- expect_error(testUniFun(k = 'ffdff'))
+ expect_equal(attr(featureEngineeringSettings, "fun"), "univariateFeatureSelection")
+
+ expect_error(testUniFun(k = "ffdff"))
expect_error(testUniFun(k = NULL))
expect_error(testUniFun(k = -1))
})
test_that("univariateFeatureSelection", {
-
- k <- 20+sample(10,1)
+ skip_if_not_installed("reticulate")
+ skip_on_cran()
+ k <- 20 + sample(10, 1)
featureEngineeringSettings <- testUniFun(k = k)
newTrainData <- copyTrainData(trainData)
-
- trainDataCovariateSize <- newTrainData$covariateData$covariates %>% dplyr::tally() %>% dplyr::pull()
-
+
+ trainDataCovariateSize <- newTrainData$covariateData$covariates %>%
+ dplyr::tally() %>%
+ dplyr::pull()
+
reducedTrainData <- univariateFeatureSelection(
- trainData = newTrainData,
+ trainData = newTrainData,
featureEngineeringSettings = featureEngineeringSettings,
covariateIdsInclude = NULL
- )
-
- newDataCovariateSize <- reducedTrainData$covariateData$covariates %>% dplyr::tally() %>% dplyr::pull()
+ )
+
+ newDataCovariateSize <- reducedTrainData$covariateData$covariates %>%
+ dplyr::tally() %>%
+ dplyr::pull()
expect_true(newDataCovariateSize <= trainDataCovariateSize)
-
+
# expect k many covariates left
- expect_equal(k,reducedTrainData$covariateData$covariateRef %>% dplyr::tally() %>% dplyr::pull())
-
+ expect_equal(k, reducedTrainData$covariateData$covariateRef %>% dplyr::tally() %>% dplyr::pull())
})
test_that("createRandomForestFeatureSelection correct class", {
- ntreesTest <- sample(1000,1)
- maxDepthTest <- sample(20,1)
+ skip_if_not_installed("reticulate")
+ skip_on_cran()
+ ntreesTest <- sample(1000, 1)
+ maxDepthTest <- sample(20, 1)
featureEngineeringSettings <- createRandomForestFeatureSelection(
- ntrees = ntreesTest,
+ ntrees = ntreesTest,
maxDepth = maxDepthTest
- )
-
- expect_is(featureEngineeringSettings, 'featureEngineeringSettings')
+ )
+
+ expect_is(featureEngineeringSettings, "featureEngineeringSettings")
expect_equal(featureEngineeringSettings$ntrees, ntreesTest)
expect_equal(featureEngineeringSettings$max_depth, maxDepthTest)
- expect_equal(attr(featureEngineeringSettings, "fun"), 'randomForestFeatureSelection')
-
+ expect_equal(attr(featureEngineeringSettings, "fun"), "randomForestFeatureSelection")
+
# error due to params
expect_error(
createRandomForestFeatureSelection(
- ntrees = -1,
+ ntrees = -1,
maxDepth = maxDepthTest
)
)
-
+
expect_error(
createRandomForestFeatureSelection(
- ntrees = 'dfdfd',
+ ntrees = "dfdfd",
maxDepth = maxDepthTest
)
)
-
+
expect_error(
createRandomForestFeatureSelection(
- ntrees = 50,
- maxDepth = 'maxDepthTest'
+ ntrees = 50,
+ maxDepth = "maxDepthTest"
)
)
-
+
expect_error(
createRandomForestFeatureSelection(
- ntrees = 50,
+ ntrees = 50,
maxDepth = -1
)
)
-
})
test_that("randomForestFeatureSelection", {
-
- ntreesTest <- sample(1000,1)
- maxDepthTest <- sample(20,1)
+ skip_if_not_installed("reticulate")
+ skip_on_cran()
+ ntreesTest <- sample(1000, 1)
+ maxDepthTest <- sample(20, 1)
featureEngineeringSettings <- createRandomForestFeatureSelection(
- ntrees = ntreesTest,
+ ntrees = ntreesTest,
maxDepth = maxDepthTest
)
-
+
newTrainData <- copyTrainData(trainData)
- trainDataCovariateSize <- newTrainData$covariateData$covariates %>% dplyr::tally() %>% dplyr::pull()
-
+ trainDataCovariateSize <- newTrainData$covariateData$covariates %>%
+ dplyr::tally() %>%
+ dplyr::pull()
+
reducedTrainData <- randomForestFeatureSelection(
- trainData = newTrainData,
+ trainData = newTrainData,
featureEngineeringSettings = featureEngineeringSettings,
covariateIdsInclude = NULL
)
-
- newDataCovariateSize <- reducedTrainData$covariateData$covariates %>% dplyr::tally() %>% dplyr::pull()
- expect_true(newDataCovariateSize < trainDataCovariateSize)
+ newDataCovariateSize <- reducedTrainData$covariateData$covariates %>%
+ dplyr::tally() %>%
+ dplyr::pull()
+ expect_true(newDataCovariateSize < trainDataCovariateSize)
})
test_that("featureSelection is applied on test_data", {
+ skip_if_not_installed("reticulate")
+ skip_on_cran()
k <- 20
featureEngineeringSettings <- testUniFun(k = k)
newTrainData <- copyTrainData(trainData)
newTrainData <- univariateFeatureSelection(
- trainData = newTrainData,
+ trainData = newTrainData,
featureEngineeringSettings = featureEngineeringSettings,
covariateIdsInclude = NULL
)
-
+
modelSettings <- setLassoLogisticRegression()
-
+
# added try catch due to model sometimes not fitting
plpModel <- tryCatch(
- {fitPlp(newTrainData, modelSettings, analysisId='FE')},
- error = function(e){return(NULL)}
+ {
+ fitPlp(newTrainData, modelSettings, analysisId = "FE")
+ },
+ error = function(e) {
+ return(NULL)
+ }
)
-
- if(!is.null(plpModel)){ # if the model fit then check this
+
+ if (!is.null(plpModel)) { # if the model fit then check this
prediction <- predictPlp(plpModel, testData, population)
- expect_true(attr(prediction, 'metaData')$featureEngineering)
+ expect_true(attr(prediction, "metaData")$featureEngineering)
}
})
test_that("createSplineSettings correct class", {
-
featureEngineeringSettings <- createSplineSettings(
- continousCovariateId = 12,
+ continousCovariateId = 12,
knots = 4
- )
-
- expect_is(featureEngineeringSettings, 'featureEngineeringSettings')
+ )
+
+ expect_is(featureEngineeringSettings, "featureEngineeringSettings")
expect_equal(featureEngineeringSettings$knots, 4)
expect_equal(featureEngineeringSettings$continousCovariateId, 12)
- expect_equal(attr(featureEngineeringSettings, "fun"), 'splineCovariates')
-
- expect_error(createSplineSettings(knots = 'ffdff'))
+ expect_equal(attr(featureEngineeringSettings, "fun"), "splineCovariates")
+
+ expect_error(createSplineSettings(knots = "ffdff"))
expect_error(createSplineSettings(knots = NULL))
})
test_that("createSplineSettings correct class", {
-
knots <- 4
featureEngineeringSettings <- createSplineSettings(
- continousCovariateId = 12101,
+ continousCovariateId = 12101,
knots = knots
)
-
+ data(plpDataSimulationProfile)
trainData <- simulatePlpData(plpDataSimulationProfile, n = 200)
-
+
N <- 50
trainData$covariateData$covariates <- data.frame(
rowId = sample(trainData$cohorts$rowId, N),
covariateId = rep(12101, N),
- covariateValue = sample(10, N, replace = T)
+ covariateValue = sample(10, N, replace = TRUE)
)
-
+
trainData$covariateData$analysisRef <- data.frame(
analysisId = 101,
- analysisName = 'cond',
- domainId = 'madeup',
+ analysisName = "cond",
+ domainId = "madeup",
startDay = 0,
endDay = 0,
- isBinary = 'N',
- missingMeansZero = 'N'
+ isBinary = "N",
+ missingMeansZero = "N"
)
-
+
trainData$covariateData$covariateRef <- data.frame(
covariateId = 12101,
- covariateName = 'test',
+ covariateName = "test",
analysisId = 101,
conceptId = 1
)
-
-newData <- splineCovariates(
- trainData = trainData,
+
+ newData <- splineCovariates(
+ trainData = trainData,
featureEngineeringSettings = featureEngineeringSettings
-)
+ )
-testthat::expect_true(1 < nrow(as.data.frame(newData$covariateData$analysisRef)))
-testthat::expect_true((knots+1) == nrow(as.data.frame(newData$covariateData$covariateRef)))
-testthat::expect_true((knots+1) == length(table(as.data.frame(newData$covariateData$covariates)$covariateId)))
-
+ testthat::expect_true(1 < nrow(as.data.frame(newData$covariateData$analysisRef)))
+ testthat::expect_true((knots + 1) == nrow(as.data.frame(newData$covariateData$covariateRef)))
+ testthat::expect_true((knots + 1) == length(table(as.data.frame(newData$covariateData$covariates)$covariateId)))
})
test_that("createStratifiedImputationSettings correct class", {
-
+ ageSplits <- c(33, 38, 42)
featureEngineeringSettings <- createStratifiedImputationSettings(
- covariateId = 12101,
- ageSplits = c(20,50,70)
- )
-
- trainData <- simulatePlpData(plpDataSimulationProfile, n = 200)
-
- N <- 50
- trainData$covariateData$covariates <- data.frame(
- rowId = sample(trainData$cohorts$rowId, N),
- covariateId = rep(12101, N),
- covariateValue = sample(10, N, replace = T)
+ covariateId = 12101,
+ ageSplits = ageSplits
)
-
- trainData$covariateData$analysisRef <- data.frame(
+
+ numSubjects <- nanoData$covariateData$covariates %>%
+ dplyr::pull(.data$rowId) %>%
+ dplyr::n_distinct()
+ Andromeda::appendToTable(nanoData$covariateData$covariates, data.frame(
+ rowId = sample(nanoData$cohorts$rowId, floor(numSubjects / 2)),
+ covariateId = rep(12101, floor(numSubjects / 2)),
+ covariateValue = sample(10, floor(numSubjects / 2), replace = TRUE)
+ ))
+
+ Andromeda::appendToTable(nanoData$covariateData$analysisRef, data.frame(
analysisId = 101,
- analysisName = 'cond',
- domainId = 'madeup',
+ analysisName = "cond",
+ domainId = "madeup",
startDay = 0,
endDay = 0,
- isBinary = 'N',
- missingMeansZero = 'N'
- )
-
- trainData$covariateData$covariateRef <- data.frame(
+ isBinary = "N",
+ missingMeansZero = "N"
+ ))
+
+ Andromeda::appendToTable(nanoData$covariateData$covariateRef, data.frame(
covariateId = 12101,
- covariateName = 'test',
+ covariateName = "test",
analysisId = 101,
conceptId = 1
- )
-
+ ))
+
stratifiedMeans <- calculateStratifiedMeans(
- trainData = trainData,
+ trainData = nanoData,
featureEngineeringSettings = featureEngineeringSettings
)
-
- testthat::expect_true(nrow(stratifiedMeans) == 8)
-
-imputedData <- imputeMissingMeans(
- trainData = trainData,
+
+ testthat::expect_true(nrow(stratifiedMeans) == 8)
+
+ imputedData <- imputeMissingMeans(
+ trainData = nanoData,
covariateId = 12101,
- ageSplits = c(20,50,70),
+ ageSplits = ageSplits,
stratifiedMeans = stratifiedMeans
-)
-
-testthat::expect_true(
- nrow(as.data.frame(imputedData$covariateData$covariates)) == 200
-)
+ )
-})
\ No newline at end of file
+ testthat::expect_equal(
+ imputedData$covariateData$covariates %>%
+ dplyr::filter(.data$covariateId == 12101) %>%
+ dplyr::pull(.data$rowId) %>%
+ dplyr::n_distinct(),
+ numSubjects
+ )
+})
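The rewritten imputation test no longer overwrites whole Andromeda tables; it appends rows with `Andromeda::appendToTable`, the idiomatic way to grow an out-of-memory table in place. A standalone sketch of the pattern:

    covData <- Andromeda::andromeda()
    covData$covariateRef <- data.frame(
      covariateId = 1002,
      covariateName = "age",
      analysisId = 2,
      conceptId = 0
    )
    # append keeps existing rows, unlike assignment, which replaces the table
    Andromeda::appendToTable(covData$covariateRef, data.frame(
      covariateId = 12101,
      covariateName = "test",
      analysisId = 101,
      conceptId = 1
    ))
    nrow(as.data.frame(covData$covariateRef)) # 2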
diff --git a/tests/testthat/test-featureImportance.R b/tests/testthat/test-featureImportance.R
index d0140adda..64e78fdd1 100644
--- a/tests/testthat/test-featureImportance.R
+++ b/tests/testthat/test-featureImportance.R
@@ -20,39 +20,37 @@ context("FeatureImportance")
test_that("pfi feature importance returns data.frame", {
-
# limit to a sample of 2 covariates for faster test
- covariates <- plpResult$model$covariateImportance %>%
- dplyr::filter("covariateValue" != 0) %>%
- dplyr::select("covariateId") %>%
+ covariates <- plpResult$model$covariateImportance %>%
+ dplyr::filter("covariateValue" != 0) %>%
+ dplyr::select("covariateId") %>%
dplyr::arrange(desc("covariateValue")) %>%
dplyr::pull()
-
+
# if the model had non-zero covariates
- if(length(covariates) > 0){
+ if (length(covariates) > 0) {
if (length(covariates) > 2) {
covariates <- covariates[1:2]
}
- pfiTest <- pfi(plpResult, population, plpData, repeats = 1,
- covariates = covariates, cores = 1, log = NULL,
- logthreshold = "INFO")
-
- expect_equal(class(pfiTest), 'data.frame')
+ pfiTest <- pfi(plpResult, population, plpData,
+ repeats = 1,
+ covariates = covariates, cores = 1, log = NULL,
+ logthreshold = "INFO"
+ )
+
+ expect_equal(class(pfiTest), "data.frame")
expect_equal(sum(names(pfiTest) %in% c("covariateId", "pfi")), 2)
expect_true(all(!is.nan(pfiTest$pfi)))
-
}
-
})
-test_that('pfi feature importance works with logger or without covariates', {
+test_that("pfi feature importance works with logger or without covariates", {
+ pfiTest <- pfi(tinyResults, population, nanoData,
+ cores = 1,
+ covariates = NULL, log = file.path(tempdir(), "pfiLog")
+ )
- pfiTest <- pfi(tinyResults, population, nanoData, cores = 1,
- covariates = NULL, log = file.path(tempdir(), 'pfiLog'))
-
- expect_equal(class(pfiTest), 'data.frame')
+ expect_equal(class(pfiTest), "data.frame")
expect_equal(sum(names(pfiTest) %in% c("covariateId", "pfi")), 2)
expect_true(all(!is.nan(pfiTest$pfi)))
-
})
-
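Both tests pin down the `pfi` contract: it returns a plain data.frame with `covariateId` and `pfi` columns, and `covariates = NULL` means importance is computed for every non-zero model covariate. A sketch, assuming a fitted `plpResult` with its `population` and `plpData` as set up in the test helpers:

    pfiTest <- pfi(
      plpResult, population, plpData,
      repeats = 1,
      covariates = NULL, # NULL = permute all model covariates
      cores = 1,
      log = file.path(tempdir(), "pfiLog")
    )
    head(pfiTest[, c("covariateId", "pfi")])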
diff --git a/tests/testthat/test-fitting.R b/tests/testthat/test-fitting.R
index ad22e92bf..4c5b866dc 100644
--- a/tests/testthat/test-fitting.R
+++ b/tests/testthat/test-fitting.R
@@ -21,54 +21,49 @@ context("Fitting")
modelSettings <- setLassoLogisticRegression()
test_that("fitPlp", {
-
-plpModel <- fitPlp(
- trainData = trainData,
- modelSettings = modelSettings,
- search = "grid",
- analysisId = 'fitting',
- analysisPath = tempdir()
+ plpModel <- fitPlp(
+ trainData = trainData,
+ modelSettings = modelSettings,
+ search = "grid",
+ analysisId = "fitting",
+ analysisPath = tempdir()
)
-expect_is(plpModel, 'plpModel')
-
+ expect_is(plpModel, "plpModel")
})
test_that("fitPlp input errors", {
-
expect_error(
fitPlp(
- trainData = trainData,
+ trainData = trainData,
modelSettings = modelSettings,
analysisPath = tempDir()
)
)
-
+
expect_error(
fitPlp(
- trainData = list(covariateData = NULL),
+ trainData = list(covariateData = NULL),
modelSettings = modelSettings,
- analysisId = 'fitting',
+ analysisId = "fitting",
analysisPath = tempDir()
)
)
-
+
expect_error(
fitPlp(
- trainData = trainData,
+ trainData = trainData,
modelSettings = NULL,
- analysisId = 'fitting',
+ analysisId = "fitting",
analysisPath = tempDir()
)
)
-
+
expect_error(
fitPlp(
- trainData = trainData,
+ trainData = trainData,
modelSettings = modelSettings,
- analysisId = 'fitting'
+ analysisId = "fitting"
)
)
-
})
-
diff --git a/tests/testthat/test-formatting.R b/tests/testthat/test-formatting.R
index 0e3ac0121..d8d80fd89 100644
--- a/tests/testthat/test-formatting.R
+++ b/tests/testthat/test-formatting.R
@@ -17,165 +17,174 @@
library("testthat")
context("Formatting")
-createCovariateData <- function(){
+createCovariateData <- function() {
# testing manually constructed data...
- covs <- data.frame(rowId=c(1,1,1,2,4,4,4,6,6), # 3 and 5 have nothing
- covariateId=c(123,2002,10,123,2002,3,4,9,8),
- covariateValue=rep(1,9))
- covref <- data.frame(covariateId=c(123,2002,3,4,5,6,7,8,9,10),
- covariateName=1:10,
- analysisId=rep(1,10),
- conceptId=1:10)
-
+ covs <- data.frame(
+ rowId = c(1, 1, 1, 2, 4, 4, 4, 6, 6), # 3 and 5 have nothing
+ covariateId = c(123, 2002, 10, 123, 2002, 3, 4, 9, 8),
+ covariateValue = rep(1, 9)
+ )
+ covref <- data.frame(
+ covariateId = c(123, 2002, 3, 4, 5, 6, 7, 8, 9, 10),
+ covariateName = 1:10,
+ analysisId = rep(1, 10),
+ conceptId = 1:10
+ )
+
covariateData <- Andromeda::andromeda()
covariateData$covariates <- covs
covariateData$covariateRef <- covref
-
+
return(covariateData)
}
test_that("MapIds with no cohort", {
-
covariateDataTest <- createCovariateData()
-
+
mappings <- MapIds(
covariateData = covariateDataTest,
cohort = NULL,
mapping = NULL
)
- #rowMap, covariates, mapping, covariateRef (no cohort as NULL)
- expect_true('rowMap' %in% names(mappings))
- expect_true('covariates' %in% names(mappings))
- expect_true('mapping' %in% names(mappings))
- expect_true('covariateRef' %in% names(mappings))
- expect_false('cohort' %in% names(mappings))
-
+  # rowMap, covariates, mapping, covariateRef (no cohort, since cohort is NULL)
+ expect_true("rowMap" %in% names(mappings))
+ expect_true("covariates" %in% names(mappings))
+ expect_true("mapping" %in% names(mappings))
+ expect_true("covariateRef" %in% names(mappings))
+ expect_false("cohort" %in% names(mappings))
+
# 4 rowIds in the data
expect_equal(mappings$rowMap %>% dplyr::tally() %>% dplyr::pull(), 4)
-
+
# some covariates not in data 5,6,7 so should be removed from covRef
expect_equal(mappings$covariateRef %>% dplyr::tally() %>% dplyr::pull(), 7)
-
- correctCov <- mappings$covariateRef %>% dplyr::select("covariateId") %>% dplyr::pull() %in% c(123,2002,10,3,4,9,8)
+
+ correctCov <- mappings$covariateRef %>%
+ dplyr::select("covariateId") %>%
+ dplyr::pull() %in% c(123, 2002, 10, 3, 4, 9, 8)
expect_equal(sum(correctCov), length(correctCov))
-
})
test_that("MapIds with a cohort", {
-
covariateDataTest <- createCovariateData()
-
- cohort <- data.frame(rowId=c(2,6), outcomeCount = c(1,0))
-
+
+ cohort <- data.frame(rowId = c(2, 6), outcomeCount = c(1, 0))
+
mappings <- MapIds(
covariateData = covariateDataTest,
cohort = cohort,
mapping = NULL
)
-
- #rowMap, covariates, mapping, covariateRef (no cohort as NULL)
- expect_true('rowMap' %in% names(mappings))
- expect_true('covariates' %in% names(mappings))
- expect_true('mapping' %in% names(mappings))
- expect_true('covariateRef' %in% names(mappings))
- expect_true('cohort' %in% names(mappings))
-
+
+  # rowMap, covariates, mapping, covariateRef, and cohort
+ expect_true("rowMap" %in% names(mappings))
+ expect_true("covariates" %in% names(mappings))
+ expect_true("mapping" %in% names(mappings))
+ expect_true("covariateRef" %in% names(mappings))
+ expect_true("cohort" %in% names(mappings))
+
# 4 rowIds in the data
expect_equal(mappings$rowMap %>% dplyr::tally() %>% dplyr::pull(), 2)
-
+
# no covariates should be lost
expect_equal(mappings$covariates %>% dplyr::tally() %>% dplyr::pull(), 3)
-
+
# some covariates not in data 5,6,7 so should be removed from covRef
expect_equal(mappings$covariateRef %>% dplyr::tally() %>% dplyr::pull(), 3)
-
- correctCov <- mappings$covariateRef %>% dplyr::select("covariateId") %>% dplyr::pull() %in% c(123,9,8)
+
+ correctCov <- mappings$covariateRef %>%
+ dplyr::select("covariateId") %>%
+ dplyr::pull() %in% c(123, 9, 8)
expect_equal(sum(correctCov), length(correctCov))
-
})
# switch off all messages
test_that("toSparseM", {
-
- cohorts <- data.frame(rowId=1:6,
- subjectId=1:6,
- targetId=rep(1,6),
- cohortStartDate= rep('2007-12-28 00:00:00.0',6),
- daysFromObsStart= c(500,50,500,500,500,500),
- daysToCohortEnd= rep(200,6),
- daysToObsEnd=rep(200,6),
- ageYear = rep(25, 6),
- gender = rep(8507, 6))
-
- attr(cohorts, "metaData") <- list(attrition=data.frame(outcomeId=2,description='test',
- targetCount=6,uniquePeople=6,
- outcomes=2))
-
- outcomes <- data.frame(rowId=c(1,2),
- outcomeId=rep(outcomeId,2),
- daysToEvent=c(150,40))
-
- FplpData <- list(cohorts=cohorts,
- outcomes=outcomes,
- covariateData = createCovariateData())
- class(FplpData) <- 'plpData'
+ cohorts <- data.frame(
+ rowId = 1:6,
+ subjectId = 1:6,
+ targetId = rep(1, 6),
+ cohortStartDate = rep("2007-12-28 00:00:00.0", 6),
+ daysFromObsStart = c(500, 50, 500, 500, 500, 500),
+ daysToCohortEnd = rep(200, 6),
+ daysToObsEnd = rep(200, 6),
+ ageYear = rep(25, 6),
+ gender = rep(8507, 6)
+ )
+
+ attr(cohorts, "metaData") <- list(attrition = data.frame(
+ outcomeId = 2, description = "test",
+ targetCount = 6, uniquePeople = 6,
+ outcomes = 2
+ ))
+
+ outcomes <- data.frame(
+ rowId = c(1, 2),
+ outcomeId = rep(outcomeId, 2),
+ daysToEvent = c(150, 40)
+ )
+
+ FplpData <- list(
+ cohorts = cohorts,
+ outcomes = outcomes,
+ covariateData = createCovariateData()
+ )
+ class(FplpData) <- "plpData"
Fpopulation <- data.frame(
- rowId = c(1,3:6),
- outcomeCount = c(1,0,0,0,0)
- )
-
- # test gbm coo to sparse matrix
- sparseMat.test <- toSparseM(FplpData,Fpopulation, map=NULL)
- matrix.real <- matrix(rep(0, 5*7), ncol=7)
- x <- c(1,1,1,3,3,3,5,5)
- y <- c(1,2,3,2,4,5,6,7)
- for(a in 1:8) matrix.real[x[a],y[a]] <- 1
+ rowId = c(1, 3:6),
+ outcomeCount = c(1, 0, 0, 0, 0)
+ )
+
+  # test conversion of COO triplet data to a sparse matrix
+ sparseMat.test <- toSparseM(FplpData, Fpopulation, map = NULL)
+ matrix.real <- matrix(rep(0, 5 * 7), ncol = 7)
+ x <- c(1, 1, 1, 3, 3, 3, 5, 5)
+ y <- c(5, 6, 7, 1, 2, 7, 3, 4)
+ for (a in 1:8) matrix.real[x[a], y[a]] <- 1
expect_that(as.matrix(sparseMat.test$dataMatrix), is_equivalent_to(matrix.real))
-
-
+
+
# check map works by permuting it and checking the result
sparseMat.test$covariateMap <- sparseMat.test$covariateMap %>% dplyr::arrange(.data$covariateId)
sparseMat.test$covariateMap$columnId <- 1:7
- withMapTest <- toSparseM(FplpData,Fpopulation, map=sparseMat.test$covariateMap)
-
-
- matrix.real <- matrix(rep(0, 5*7), ncol=7)
- x <- c(1,1,1,3,3,3,5,5)
- y <- c(6,7,5,7,1,2,3,4)
- for(a in 1:8) matrix.real[x[a],y[a]] <- 1
+ withMapTest <- toSparseM(FplpData, Fpopulation, map = sparseMat.test$covariateMap)
+
+
+ matrix.real <- matrix(rep(0, 5 * 7), ncol = 7)
+ x <- c(1, 1, 1, 3, 3, 3, 5, 5)
+ y <- c(6, 7, 5, 7, 1, 2, 3, 4)
+ for (a in 1:8) matrix.real[x[a], y[a]] <- 1
expect_that(as.matrix(withMapTest$dataMatrix), is_equivalent_to(matrix.real))
-
-
- #==================================
+
+
+ # ==================================
# check sizes using simulated data
- #==================================
-# objects from helper-object.R
- test <- toSparseM(plpData,population, map=NULL)
+ # ==================================
+ # objects from helper-object.R
+ test <- toSparseM(plpData, population, map = NULL)
compTest <- as.matrix(test$dataMatrix)
testthat::expect_equal(test$labels %>% dplyr::tally() %>% dplyr::pull(), length(population$rowId))
testthat::expect_equal(nrow(compTest), length(population$rowId))
- testthat::expect_true(ncol(compTest) <= plpData$covariateData$covariateRef %>% dplyr::tally() %>% dplyr::pull())
+ testthat::expect_true(ncol(compTest) <= plpData$covariateData$covariateRef %>%
+ dplyr::tally() %>%
+ dplyr::pull())
testthat::expect_equal(ncol(compTest), test$covariateRef %>% dplyr::tally() %>% dplyr::pull())
testthat::expect_equal(ncol(compTest), test$covariateMap %>% dplyr::tally() %>% dplyr::pull())
-
-
})
test_that("checkRam", {
-
ramCheck <- checkRam(createCovariateData())
expect_true(ramCheck)
-
})
formattingCovs <- Andromeda::andromeda()
formattingCovs$covariates <- data.frame(
rowId = c(
- rep(1, 5),
- rep(2, 3),
+ rep(1, 5),
+ rep(2, 3),
rep(4, 2),
rep(50, 6)
),
@@ -183,20 +192,20 @@ formattingCovs$covariates <- data.frame(
c(1001, 1213104, 1233105, 1, 99),
c(1001, 2, 99),
c(1001, 4),
- c(1,99, 98, 2, 4,3)
+ c(1, 99, 98, 2, 4, 3)
),
covariateValue = rep(1, 16)
)
formattingCovs$covariateRef <- data.frame(
- covariateId = c(1001, 1213104, 25,26, 1233105, 1, 99, 2,4,98,3),
- covariateName = c(paste0('Covariate_', 1:11))
- )
+ covariateId = c(1001, 1213104, 25, 26, 1233105, 1, 99, 2, 4, 98, 3),
+ covariateName = c(paste0("Covariate_", 1:11))
+)
formattingcohort <- data.frame(
- rowId = c(1:4,50),
- outcomeCount = c(1,1,0,0,0)
- )
+ rowId = c(1:4, 50),
+ outcomeCount = c(1, 1, 0, 0, 0)
+)
formattingData <- list(
labels = formattingcohort,
@@ -204,18 +213,16 @@ formattingData <- list(
)
test_that("testCorrectLables", {
-
data <- toSparseM(plpData = formattingData)
-
+
expect_equal(
data.frame(
outcomeCount = data$labels$outcomeCount,
rowId = data$labels$rowId
- ),
+ ),
data.frame(
- outcomeCount = c(1,1,0,0,0),
+ outcomeCount = c(1, 1, 0, 0, 0),
rowId = 1:5
- )
)
-
+ )
})
diff --git a/tests/testthat/test-getCalibration.R b/tests/testthat/test-getCalibration.R
index 522fb6b98..ad09912f1 100644
--- a/tests/testthat/test-getCalibration.R
+++ b/tests/testthat/test-getCalibration.R
@@ -18,34 +18,33 @@ context("Calibration")
test_that("getCalibration binary", {
Eprediction <- data.frame(
- rowId = 1:100,
- evaluation = rep('Test',100),
- value = runif(100),
+ rowId = 1:100,
+ evaluation = rep("Test", 100),
+ value = runif(100),
outcomeCount = round(runif(100))
- )
- attr(Eprediction, "metaData")$predictionType <- "binary"
+ )
+ attr(Eprediction, "metaData")$predictionType <- "binary"
calib <- getCalibrationSummary(
prediction = Eprediction,
- predictionType = 'binary',
- typeColumn = 'evaluation',
+ predictionType = "binary",
+ typeColumn = "evaluation",
numberOfStrata = 100,
truncateFraction = 0.05
)
-
- expect_that(nrow(calib ), equals(100))
- expect_that(ncol(calib ), equals(12))
- expect_true('evaluation' %in% colnames(calib))
-
-
+
+ expect_that(nrow(calib), equals(100))
+ expect_that(ncol(calib), equals(12))
+ expect_true("evaluation" %in% colnames(calib))
+
+
calibBinary <- getCalibrationSummary_binary(
prediction = Eprediction,
- evalColumn = 'evaluation',
+ evalColumn = "evaluation",
numberOfStrata = 100,
truncateFraction = 0.05
- )
-
+ )
+
expect_equal(calib, calibBinary)
-
})
@@ -53,23 +52,22 @@ test_that("getCalibration binary", {
test_that("getCalibration survival", {
Eprediction <- data.frame(
- rowId = 1:100,
- evaluation = rep('Test',100),
- value = runif(100),
- survivalTime = 50+sample(2*365,100),
+ rowId = 1:100,
+ evaluation = rep("Test", 100),
+ value = runif(100),
+ survivalTime = 50 + sample(2 * 365, 100),
outcomeCount = round(runif(100))
)
-
+
calib <- getCalibrationSummary_survival(
prediction = Eprediction,
- evalColumn = 'evaluation',
+ evalColumn = "evaluation",
numberOfStrata = 50,
truncateFraction = 0.05,
timepoint = 365
)
-
- expect_true('evaluation' %in% colnames(calib))
- expect_that(nrow(calib ), equals(50))
- expect_that(ncol(calib ), equals(7))
-
+
+ expect_true("evaluation" %in% colnames(calib))
+ expect_that(nrow(calib), equals(50))
+ expect_that(ncol(calib), equals(7))
})
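Both calibration summaries key off an evaluation-type column, and the binary path additionally reads the prediction type from the `metaData` attribute, as the test sets up explicitly. A minimal binary sketch mirroring it:

    prediction <- data.frame(
      rowId = 1:100,
      evaluation = rep("Test", 100),
      value = runif(100),
      outcomeCount = rbinom(100, 1, 0.3)
    )
    attr(prediction, "metaData")$predictionType <- "binary"

    calib <- getCalibrationSummary(
      prediction = prediction,
      predictionType = "binary",
      typeColumn = "evaluation",
      numberOfStrata = 10, # one summary row per stratum
      truncateFraction = 0.05
    )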
diff --git a/tests/testthat/test-helperfunctions.R b/tests/testthat/test-helperfunctions.R
index 526d6466b..49825d566 100644
--- a/tests/testthat/test-helperfunctions.R
+++ b/tests/testthat/test-helperfunctions.R
@@ -17,15 +17,6 @@
library("testthat")
context("HelperFunctions")
-test_that("is_installed", {
- testthat::expect_equal(PatientLevelPrediction:::is_installed('FeatureExtraction'), T)
- testthat::expect_equal(PatientLevelPrediction:::is_installed('MadeUp4u834t3f'), F)
-})
-
-test_that("ensure_installed", {
- testthat::expect_equal(PatientLevelPrediction:::ensure_installed('FeatureExtraction'), NULL)
-})
-
# how to test checkPlpInstallation?
@@ -33,8 +24,8 @@ test_that("createTempModelLoc", {
testthat::expect_equal(class(PatientLevelPrediction:::createTempModelLoc()), "character")
})
-list1 <- list(a=1:2, b=5:6)
-list2 <- list(c=1:5)
+list1 <- list(a = 1:2, b = 5:6)
+list2 <- list(c = 1:5)
test_that("listAppend", {
testthat::expect_equal(length(listAppend(list1, list2)), 3)
})
@@ -42,16 +33,18 @@ test_that("listAppend", {
# how to test configurePython?
test_that("setPythonEnvironment", {
- testthat::expect_error(setPythonEnvironment(envname='madeup34343'))
- testthat::expect_equal(class(setPythonEnvironment(envname='madeup34343', envtype = 'conda')), "character")
+ skip_if_not_installed("reticulate")
+ skip_on_cran()
+ testthat::expect_error(setPythonEnvironment(envname = "madeup34343"))
+ testthat::expect_equal(class(setPythonEnvironment(envname = "madeup34343", envtype = "conda")), "character")
})
test_that("Borrowed cut2", {
- x <- c(1, rep(2, 2), rep(4, 4), rep(5, 5), rep(6, 6))
- groups <- PatientLevelPrediction:::cut2(x, g = 3)
- expect_true(
- all(levels(groups) == c("[1,5)", "5", "6"))
- )
+ x <- c(1, rep(2, 2), rep(4, 4), rep(5, 5), rep(6, 6))
+ groups <- PatientLevelPrediction:::cut2(x, g = 3)
+ expect_true(
+ all(levels(groups) == c("[1,5)", "5", "6"))
+ )
})
# getOs test?
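The pattern introduced here, and repeated across the suite, guards every test that needs an optional dependency with `skip_if_not_installed` plus `skip_on_cran`, rather than attaching packages at the top of the file. An illustrative skeleton (the test name and body are hypothetical):

    test_that("python environment helpers", {
      skip_if_not_installed("reticulate") # optional dependency, now in Suggests
      skip_on_cran() # environment-dependent, not suitable for CRAN machines
      # ... test body runs only where reticulate is actually available
    })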
diff --git a/tests/testthat/test-learningCurves.R b/tests/testthat/test-learningCurves.R
index 15da60bcf..dfc8141e0 100644
--- a/tests/testthat/test-learningCurves.R
+++ b/tests/testthat/test-learningCurves.R
@@ -16,67 +16,72 @@
context("LearningCurves")
-# learningCurve
+# learningCurve
learningCurve <- PatientLevelPrediction::createLearningCurve(
plpData = plpData,
- outcomeId = outcomeId, parallel = F, cores = -1,
+ outcomeId = outcomeId, parallel = FALSE, cores = -1,
modelSettings = setLassoLogisticRegression(),
- saveDirectory = file.path(saveLoc, 'lcc'),
- splitSettings = createDefaultSplitSetting(testFraction = 0.2, nfold=2),
- trainFractions = c(0.6,0.7),
+ saveDirectory = file.path(saveLoc, "lcc"),
+ splitSettings = createDefaultSplitSetting(testFraction = 0.2, nfold = 2),
+ trainFractions = c(0.6, 0.7),
trainEvents = NULL,
preprocessSettings = createPreprocessSettings(
minFraction = 0.001,
- normalize = T
+ normalize = TRUE
)
)
test_that("learningCurve output correct", {
-
-
testthat::expect_true(is.data.frame(learningCurve))
- testthat::expect_equal(sum(colnames(learningCurve)%in%c(
+ testthat::expect_equal(sum(colnames(learningCurve) %in% c(
"trainFraction",
"Train_AUROC",
"nPredictors",
"Train_populationSize",
- "Train_outcomeCount") ),5)
-
- testthat::expect_equal(learningCurve$trainFraction, c(0.6,0.7)*100)
-
+ "Train_outcomeCount"
+ )), 5)
+
+ testthat::expect_equal(learningCurve$trainFraction, c(0.6, 0.7) * 100)
})
test_that("plotLearningCurve", {
-
- test <- plotLearningCurve(learningCurve = learningCurve,
- metric = 'AUROC')
-
+ skip_if_not_installed("ggplot2")
+ skip_on_cran()
+ test <- plotLearningCurve(
+ learningCurve = learningCurve,
+ metric = "AUROC"
+ )
+
# test the plot works
- testthat::expect_s3_class(test, 'ggplot')
-
- test <- plotLearningCurve(learningCurve = learningCurve,
- metric = "AUPRC")
- testthat::expect_s3_class(test, 'ggplot')
-
- test <- plotLearningCurve(learningCurve = learningCurve,
- metric = "sBrier")
- testthat::expect_s3_class(test, 'ggplot')
-
+ testthat::expect_s3_class(test, "ggplot")
+
+ test <- plotLearningCurve(
+ learningCurve = learningCurve,
+ metric = "AUPRC"
+ )
+ testthat::expect_s3_class(test, "ggplot")
+
+ test <- plotLearningCurve(
+ learningCurve = learningCurve,
+ metric = "sBrier"
+ )
+ testthat::expect_s3_class(test, "ggplot")
})
test_that("getTrainFractions works", {
-
learningCurve <- PatientLevelPrediction::createLearningCurve(
plpData = tinyPlpData,
- outcomeId = outcomeId, parallel = F, cores = -1,
+ outcomeId = outcomeId, parallel = FALSE, cores = -1,
modelSettings = setLassoLogisticRegression(seed = 42),
- saveDirectory = file.path(saveLoc, 'lcc'),
- splitSettings = createDefaultSplitSetting(testFraction = 0.33, nfold = 2,
- splitSeed = 42),
- trainEvents = c(150,200),
+ saveDirectory = file.path(saveLoc, "lcc"),
+ splitSettings = createDefaultSplitSetting(
+ testFraction = 0.33, nfold = 2,
+ splitSeed = 42
+ ),
+ trainEvents = c(150, 200),
preprocessSettings = createPreprocessSettings(
minFraction = 0.001,
- normalize = T
+ normalize = TRUE
)
)
testthat::expect_true(is.data.frame(learningCurve))
@@ -85,7 +90,6 @@ test_that("getTrainFractions works", {
"Train_AUROC",
"nPredictors",
"Train_populationSize",
- "Train_outcomeCount") ),5)
-
+ "Train_outcomeCount"
+ )), 5)
})
-
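`plotLearningCurve` takes the learning-curve data frame produced above plus a metric name; the tests cover three of them. A short sketch, assuming the `learningCurve` object built in this file:

    # one ggplot per supported metric exercised by the tests
    for (metric in c("AUROC", "AUPRC", "sBrier")) {
      print(plotLearningCurve(learningCurve = learningCurve, metric = metric))
    }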
diff --git a/tests/testthat/test-multiplePlp.R b/tests/testthat/test-multiplePlp.R
index 14cf651dc..54513fc03 100644
--- a/tests/testthat/test-multiplePlp.R
+++ b/tests/testthat/test-multiplePlp.R
@@ -20,7 +20,8 @@ context("MultiplePlp")
analysis1 <- createModelDesign(
targetId = 1,
outcomeId = outcomeId,
- restrictPlpDataSettings = createRestrictPlpDataSettings(firstExposureOnly = F, washoutPeriod = 0),
+ restrictPlpDataSettings =
+ createRestrictPlpDataSettings(firstExposureOnly = FALSE, washoutPeriod = 0),
populationSettings = createStudyPopulationSettings(),
covariateSettings = covariateSettings,
featureEngineeringSettings = NULL,
@@ -31,67 +32,64 @@ analysis1 <- createModelDesign(
)
test_that("createModelDesign - test working", {
-
expect_equal(analysis1$targetId, 1)
expect_equal(analysis1$outcomeId, outcomeId)
- expect_equal(analysis1$restrictPlpDataSettings, createRestrictPlpDataSettings(firstExposureOnly = F, washoutPeriod = 0))
+ expect_equal(analysis1$restrictPlpDataSettings,
+ createRestrictPlpDataSettings(firstExposureOnly = FALSE, washoutPeriod = 0))
expect_equal(analysis1$covariateSettings, covariateSettings)
- expect_equal(analysis1$featureEngineeringSettings, list(createFeatureEngineeringSettings(type= "none")))
- expect_equal(analysis1$sampleSettings, list(createSampleSettings(type = 'none')))
+ expect_equal(analysis1$featureEngineeringSettings, list(createFeatureEngineeringSettings(type = "none")))
+ expect_equal(analysis1$sampleSettings, list(createSampleSettings(type = "none")))
expect_equal(analysis1$preprocessSettings, createPreprocessSettings())
expect_equal(analysis1$splitSettings, createDefaultSplitSetting(splitSeed = 1))
expect_equal(analysis1$modelSettings, setLassoLogisticRegression(seed = 12))
expect_equal(
- analysis1$executeSettings,
+ analysis1$executeSettings,
createExecuteSettings(
- runSplitData = T,
- runSampleData = F,
- runfeatureEngineering = F,
- runPreprocessData = T,
- runModelDevelopment = T,
- runCovariateSummary = T
+ runSplitData = TRUE,
+ runSampleData = FALSE,
+ runfeatureEngineering = FALSE,
+ runPreprocessData = TRUE,
+ runModelDevelopment = TRUE,
+ runCovariateSummary = TRUE
)
)
-
})
test_that("saving analyses settings", {
-
fileLocation <- savePlpAnalysesJson(
modelDesignList = list(analysis1),
- saveDirectory = file.path(saveLoc, 'settings')
+ saveDirectory = file.path(saveLoc, "settings")
)
-
+
expect_true(file.exists(fileLocation))
-
-}
-)
+})
test_that("loading analyses settings", {
-
- analysisSetting <- loadPlpAnalysesJson(file.path(saveLoc, 'settings',"predictionAnalysisList.json"))
-
+ analysisSetting <- loadPlpAnalysesJson(file.path(saveLoc, "settings", "predictionAnalysisList.json"))
+
expect_equal(analysis1$targetId, analysisSetting$analyses[[1]]$targetId)
expect_equal(analysis1$outcomeId, analysisSetting$analyses[[1]]$outcomeId)
expect_equal(analysis1$restrictPlpDataSettings, analysisSetting$analyses[[1]]$restrictPlpDataSettings)
- expect_equal(attr(analysis1$covariateSettings, 'fun'), attr(analysisSetting$analyses[[1]]$covariateSettings,'fun') )
+ expect_equal(attr(analysis1$covariateSettings, "fun"), attr(analysisSetting$analyses[[1]]$covariateSettings, "fun"))
expect_equal(analysis1$populationSettings, analysisSetting$analyses[[1]]$populationSettings)
expect_equal(analysis1$sampleSettings, analysisSetting$analyses[[1]]$sampleSettings)
- expect_equal(attr(analysis1$featureEngineeringSettings,'class'), attr(analysisSetting$analyses[[1]]$featureEngineeringSettings,'class'))
- expect_equal(attr(analysis1$featureEngineeringSettings,'fun'), attr(analysisSetting$analyses[[1]]$featureEngineeringSettings,'fun'))
+ expect_equal(attr(analysis1$featureEngineeringSettings, "class"), attr(analysisSetting$analyses[[1]]$featureEngineeringSettings, "class"))
+ expect_equal(attr(analysis1$featureEngineeringSettings, "fun"), attr(analysisSetting$analyses[[1]]$featureEngineeringSettings, "fun"))
expect_equal(analysis1$preprocessSettings, analysisSetting$analyses[[1]]$preprocessSettings)
expect_equal(analysis1$modelSettings, analysisSetting$analyses[[1]]$modelSettings)
expect_equal(analysis1$splitSettings, analysisSetting$analyses[[1]]$splitSettings)
expect_equal(analysis1$executeSettings, analysisSetting$analyses[[1]]$executeSettings)
-}
-)
+})
test_that("test run multiple", {
-
+ skip_if_not_installed("ResultModelManager")
+ skip_on_cran()
+
analysis3 <- createModelDesign(
targetId = 1,
outcomeId = outcomeId,
- restrictPlpDataSettings = createRestrictPlpDataSettings(firstExposureOnly = F, washoutPeriod = 0),
+ restrictPlpDataSettings =
+ createRestrictPlpDataSettings(firstExposureOnly = FALSE, washoutPeriod = 0),
populationSettings = createStudyPopulationSettings(),
covariateSettings = covariateSettings,
featureEngineeringSettings = createFeatureEngineeringSettings(),
@@ -99,55 +97,54 @@ test_that("test run multiple", {
preprocessSettings = createPreprocessSettings(),
modelSettings = setLassoLogisticRegression(seed = 12),
splitSettings = createDefaultSplitSetting(
- type = "stratified",
+ type = "stratified",
testFraction = 0.25,
- trainFraction = 0.75,
- splitSeed = 123,
+ trainFraction = 0.75,
+ splitSeed = 123,
nfold = 3
),
runCovariateSummary = FALSE
)
-
+
runMultiplePlp(
databaseDetails = databaseDetails,
modelDesignList = list(
# add this twice to make sure no issue with overlapping ids?
analysis3
),
- onlyFetchData = F,
+ onlyFetchData = FALSE,
logSettings = createLogSettings(
- verbosity = "DEBUG",
- timeStamp = T,
+ verbosity = "DEBUG",
+ timeStamp = TRUE,
logName = "runPlp Log"
),
- saveDirectory = file.path(saveLoc, 'multiple')
+ saveDirectory = file.path(saveLoc, "multiple")
)
-
- expect_true(file.exists(file.path(saveLoc, 'multiple', 'settings.csv')))
- expect_true(dir.exists(file.path(saveLoc, 'multiple', 'Analysis_1')))
- expect_true(file.exists(file.path(saveLoc, 'multiple', 'Analysis_1','plpResult', 'runPlp.rds')))
-
+
+ expect_true(file.exists(file.path(saveLoc, "multiple", "settings.csv")))
+ expect_true(dir.exists(file.path(saveLoc, "multiple", "Analysis_1")))
+ expect_true(file.exists(file.path(saveLoc, "multiple", "Analysis_1", "plpResult", "runPlp.rds")))
})
test_that("validateMultiplePlp errors", {
-
+ skip_if_not_installed("ResultModelManager")
+ skip_on_cran()
PatientLevelPrediction::validateMultiplePlp(
- analysesLocation = file.path(saveLoc,'multiple'),
- validationDatabaseDetails = databaseDetails,
- validationRestrictPlpDataSettings = createRestrictPlpDataSettings(),
+ analysesLocation = file.path(saveLoc, "multiple"),
+ validationDatabaseDetails = databaseDetails,
+ validationRestrictPlpDataSettings = createRestrictPlpDataSettings(),
recalibrate = NULL
- )
-
-expect_true(dir.exists(file.path(saveLoc, 'multiple', 'Validation', 'main')))
-expect_true(dir.exists(file.path(saveLoc, 'multiple', 'Validation', 'main', 'Analysis_1', 'validationResult')))
-expect_true(file.exists(file.path(saveLoc, 'multiple', 'Validation', 'main', 'Analysis_1', 'validationResult', 'runPlp.rds')))
+ )
+
+ expect_true(dir.exists(file.path(saveLoc, "multiple", "Validation", "main")))
+ expect_true(dir.exists(file.path(saveLoc, "multiple", "Validation", "main", "Analysis_1", "validationResult")))
+ expect_true(file.exists(file.path(saveLoc, "multiple", "Validation", "main", "Analysis_1", "validationResult", "runPlp.rds")))
# no results error
expect_error(evaluateMultiplePlp(
- analysesLocation = file.path(saveLoc,'madeup123') ,
- validationDatabaseDetails = databaseDetails,
- validationRestrictPlpDataSettings = createRestrictPlpDataSettings(),
+ analysesLocation = file.path(saveLoc, "madeup123"),
+ validationDatabaseDetails = databaseDetails,
+ validationRestrictPlpDataSettings = createRestrictPlpDataSettings(),
recalibrate = NULL
- ))
+ ))
})
-
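The save/load tests pin the settings round trip: `savePlpAnalysesJson` writes `predictionAnalysisList.json` and returns its path, and `loadPlpAnalysesJson` reproduces each design under `$analyses`. A sketch, assuming a model design like `analysis1` above:

    fileLocation <- savePlpAnalysesJson(
      modelDesignList = list(analysis1),
      saveDirectory = file.path(tempdir(), "settings")
    )
    analysisSettings <- loadPlpAnalysesJson(fileLocation)
    # the design survives the round trip
    identical(analysis1$outcomeId, analysisSettings$analyses[[1]]$outcomeId)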
diff --git a/tests/testthat/test-paramchecks.R b/tests/testthat/test-paramchecks.R
index 1a7967fdb..eae484f25 100644
--- a/tests/testthat/test-paramchecks.R
+++ b/tests/testthat/test-paramchecks.R
@@ -21,84 +21,66 @@ context("ParamChecks")
test_that("checkBoolean", {
-
testthat::expect_error(checkBoolean(1))
- testthat::expect_error(checkBoolean('tertet'))
-
- testthat::expect_equal(checkBoolean(T), T)
- testthat::expect_equal(checkBoolean(F), T)
-
-
+ testthat::expect_error(checkBoolean("tertet"))
+
+ testthat::expect_equal(checkBoolean(TRUE), TRUE)
+ testthat::expect_equal(checkBoolean(FALSE), TRUE)
})
test_that("checkHigherEqual", {
-
- testthat::expect_error(checkHigherEqual(1,2))
- testthat::expect_error(checkHigherEqual('tertet',1))
-
- testthat::expect_equal(checkHigherEqual(1,0), T)
- testthat::expect_equal(checkHigherEqual(0,0), T)
-
-
+ testthat::expect_error(checkHigherEqual(1, 2))
+ testthat::expect_error(checkHigherEqual("tertet", 1))
+
+ testthat::expect_equal(checkHigherEqual(1, 0), TRUE)
+ testthat::expect_equal(checkHigherEqual(0, 0), TRUE)
})
test_that("checkLowerEqual", {
-
- testthat::expect_error(checkLowerEqual(2,1))
- testthat::expect_error(checkLowerEqual('tertet',1))
-
- testthat::expect_equal(checkLowerEqual(0,1), T)
- testthat::expect_equal(checkLowerEqual(0,0), T)
+ testthat::expect_error(checkLowerEqual(2, 1))
+ testthat::expect_error(checkLowerEqual("tertet", 1))
+ testthat::expect_equal(checkLowerEqual(0, 1), TRUE)
+ testthat::expect_equal(checkLowerEqual(0, 0), TRUE)
})
test_that("checkHigher", {
-
- testthat::expect_error(checkHigher(1,2))
- testthat::expect_error(checkHigher(0,0))
- testthat::expect_error(checkHigher('tertet',1))
-
- testthat::expect_equal(checkHigher(1,0), T)
-
+ testthat::expect_error(checkHigher(1, 2))
+ testthat::expect_error(checkHigher(0, 0))
+ testthat::expect_error(checkHigher("tertet", 1))
+
+ testthat::expect_equal(checkHigher(1, 0), TRUE)
})
test_that("checkLower", {
-
- testthat::expect_error(checkLower(2,1))
- testthat::expect_error(checkLower(0,0))
- testthat::expect_error(checkLower('tertet',1))
-
- testthat::expect_equal(checkLower(0,1), T)
-
+ testthat::expect_error(checkLower(2, 1))
+ testthat::expect_error(checkLower(0, 0))
+ testthat::expect_error(checkLower("tertet", 1))
+
+ testthat::expect_equal(checkLower(0, 1), TRUE)
})
test_that("checkNotNull", {
-
testthat::expect_error(checkNotNull(NULL))
-
- testthat::expect_equal(checkNotNull(0), T)
- testthat::expect_equal(checkNotNull(T), T)
- testthat::expect_equal(checkNotNull('yeryey'), T)
-
+
+ testthat::expect_equal(checkNotNull(0), TRUE)
+ testthat::expect_equal(checkNotNull(TRUE), TRUE)
+ testthat::expect_equal(checkNotNull("yeryey"), TRUE)
})
test_that("checkIsClass", {
-
- testthat::expect_error(checkIsClass('dsdsds', 'double'))
-
- testthat::expect_equal(checkIsClass('dsdsds', "character"), T)
+ testthat::expect_error(checkIsClass("dsdsds", "double"))
+ testthat::expect_equal(checkIsClass("dsdsds", "character"), TRUE)
})
test_that("checkInStringVector", {
-
- testthat::expect_error(checkInStringVector('dsdsds', c('dsds','double')))
-
- testthat::expect_equal(checkInStringVector('dsdsds', c('dsdsds','double')), T)
-
+ testthat::expect_error(checkInStringVector("dsdsds", c("dsds", "double")))
+
+ testthat::expect_equal(checkInStringVector("dsdsds", c("dsdsds", "double")), TRUE)
})
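
These checkers are internal helpers that either stop() with an informative message or return TRUE, which is what the expect_equal(..., TRUE) assertions above rely on. A minimal sketch of the contract, not the package's actual implementation:

checkHigherEqualSketch <- function(value, minimum) {
  # error on invalid input, return TRUE on success, mirroring the tested contract
  if (!is.numeric(value) || value < minimum) {
    stop("Value must be a number >= ", minimum)
  }
  TRUE
}
checkHigherEqualSketch(1, 0) # TRUE
checkHigherEqualSketch(0, 0) # TRUE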
diff --git a/tests/testthat/test-plotting.R b/tests/testthat/test-plotting.R
index a250376e4..5275b8590 100644
--- a/tests/testthat/test-plotting.R
+++ b/tests/testthat/test-plotting.R
@@ -20,6 +20,8 @@ context("Plotting")
# TODO: add input checks and test these...
test_that("plots", {
+ skip_if_not_installed("ggplot2")
+ skip_on_cran()
# test all the outputs are ggplots
test <- plotSparseRoc(plpResult, typeColumn = "evaluation")
testthat::expect_s3_class(test, "arrangelist")
@@ -56,6 +58,8 @@ test_that("plots", {
test_that("outcomeSurvivalPlot", {
+ skip_if_not_installed("survminer")
+ skip_on_cran()
# test the plot works
test <- outcomeSurvivalPlot(plpData = plpData, outcomeId = outcomeId)
testthat::expect_s3_class(test, "ggsurvplot")
@@ -67,6 +71,8 @@ test_that("outcomeSurvivalPlot", {
test_that("plotPlp", {
+  skip_if_not_installed("ggplot2")
+  skip_if_not_installed("gridExtra")
+ skip_on_cran()
# test the plot works
test <- plotPlp(
plpResult = plpResult,
@@ -81,6 +87,8 @@ test_that("plotPlp", {
})
test_that("plotSmoothCalibration", {
+ skip_if_not_installed("ggplot2")
+ skip_on_cran()
# test the plot works
test <- plotSmoothCalibration(
plpResult = plpResult,
@@ -109,8 +117,12 @@ test_that("plotSmoothCalibration", {
saveLocation = NULL
)
testthat::expect_s3_class(test2$test$smoothPlot, c("gg", "ggplot"))
- plpResult$prediction <- pred
+})
+test_that("Smooth calibration plot works with rcs", {
+ skip_if_not_installed("ggplot2")
+ skip_if_not_installed("mgcv")
+ skip_on_cran()
test3 <- plotSmoothCalibration(plpResult,
smooth = "rcs",
span = 1,
@@ -151,11 +163,15 @@ test_that("getNetBenefit handles invalid evalType gracefully", {
})
test_that("plotNetBenefit returns a grob object", {
+ skip_if_not_installed("ggplot2")
+ skip_on_cran()
plot <- plotNetBenefit(plpResult, evalType = "Test")
expect_true(inherits(plot, "arrangelist"))
})
test_that("plotNetBenefit saves plot when saveLocation is specified", {
+ skip_if_not_installed("ggplot2")
+ skip_on_cran()
tempDir <- tempfile()
plotNetBenefit(plpResult, saveLocation = tempDir, fileName = "netBenefit.png", evalType = "Test")
expect_true(file.exists(file.path(tempDir, "netBenefit.png")))
@@ -164,14 +180,18 @@ test_that("plotNetBenefit saves plot when saveLocation is specified", {
})
test_that("plotNetBenefit handles NULL evalType", {
+ skip_if_not_installed("ggplot2")
+ skip_on_cran()
plot <- plotNetBenefit(plpResult, evalType = NULL)
expect_true(inherits(plot, "arrangelist"))
})
test_that("plotNetBenefit creates correct number of plots when evalType is NULL", {
+ skip_if_not_installed("ggplot2")
+ skip_on_cran()
plot <- plotNetBenefit(plpResult, evalType = NULL)
# Since evalType is NULL, it should plot for all unique evaluation types
evalTypes <- unique(plpResult$performanceEvaluation$thresholdSummary$evaluation)
- expect_equal(length(plot[[1]]$grobs) - 1 , length(evalTypes)) # -1 for text grob
+ expect_equal(length(plot[[1]]$grobs) - 1, length(evalTypes)) # -1 for text grob
})
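
The skip pattern added throughout this file is the standard guard for tests that rely on suggested packages: each skip_if_not_installed() call takes a single package name, and skip_on_cran() keeps graphics-heavy tests out of CRAN checks. A minimal illustration:

library(testthat)
test_that("guarding a plot test on a suggested package", {
  skip_if_not_installed("ggplot2")
  skip_on_cran()
  p <- ggplot2::ggplot(data.frame(x = 1, y = 1), ggplot2::aes(x, y)) +
    ggplot2::geom_point()
  expect_s3_class(p, "ggplot")
})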
diff --git a/tests/testthat/test-population.R b/tests/testthat/test-population.R
index 6add83fee..cc65857fd 100644
--- a/tests/testthat/test-population.R
+++ b/tests/testthat/test-population.R
@@ -18,19 +18,18 @@ library("testthat")
context("Population")
defaultSettings <- function(
- binary = TRUE,
- includeAllOutcomes = F,
- firstExposureOnly = FALSE,
- washoutPeriod = 0,
- removeSubjectsWithPriorOutcome = FALSE,
- priorOutcomeLookback = 99999,
- requireTimeAtRisk = FALSE,
- minTimeAtRisk=0,
- riskWindowStart = 0,
- startAnchor = 'cohort start',
- riskWindowEnd = 365,
- endAnchor = 'cohort start'
-){
+ binary = TRUE,
+ includeAllOutcomes = FALSE,
+ firstExposureOnly = FALSE,
+ washoutPeriod = 0,
+ removeSubjectsWithPriorOutcome = FALSE,
+ priorOutcomeLookback = 99999,
+ requireTimeAtRisk = FALSE,
+ minTimeAtRisk = 0,
+ riskWindowStart = 0,
+ startAnchor = "cohort start",
+ riskWindowEnd = 365,
+ endAnchor = "cohort start") {
result <- createStudyPopulationSettings(
binary = binary,
includeAllOutcomes = includeAllOutcomes,
@@ -45,462 +44,436 @@ defaultSettings <- function(
riskWindowEnd = riskWindowEnd,
endAnchor = endAnchor
)
-
+
return(result)
}
test_that("createStudyPopulation correct class", {
-
populationSettings <- createStudyPopulationSettings()
expect_is(populationSettings, "populationSettings")
-
})
test_that("createStudyPopulation binary", {
-
populationSettings <- createStudyPopulationSettings(
binary = TRUE,
)
expect_true(populationSettings$binary)
-
+
populationSettings <- createStudyPopulationSettings(
binary = FALSE,
)
expect_false(populationSettings$binary)
-
})
test_that("createStudyPopulation includeAllOutcomes", {
-
expect_error(
createStudyPopulationSettings(includeAllOutcomes = NULL)
)
-
+
expect_error(
createStudyPopulationSettings(includeAllOutcomes = 12)
)
-
+
populationSettings <- createStudyPopulationSettings(
includeAllOutcomes = TRUE,
)
expect_true(populationSettings$includeAllOutcomes)
-
+
populationSettings <- createStudyPopulationSettings(
includeAllOutcomes = FALSE,
)
expect_false(populationSettings$includeAllOutcomes)
-
})
test_that("createStudyPopulation firstExposureOnly", {
-
expect_error(
createStudyPopulationSettings(firstExposureOnly = NULL)
)
-
+
expect_error(
createStudyPopulationSettings(firstExposureOnly = 12)
)
-
+
populationSettings <- createStudyPopulationSettings(
firstExposureOnly = TRUE,
)
expect_true(populationSettings$firstExposureOnly)
-
+
populationSettings <- createStudyPopulationSettings(
firstExposureOnly = FALSE,
)
expect_false(populationSettings$firstExposureOnly)
-
})
test_that("createStudyPopulation washoutPeriod", {
-
expect_error(
createStudyPopulationSettings(washoutPeriod = NULL)
)
-
+
expect_error(
createStudyPopulationSettings(washoutPeriod = -1)
)
-
- rand <- sample(1000,1)
+
+ rand <- sample(1000, 1)
populationSettings <- createStudyPopulationSettings(
washoutPeriod = rand,
)
expect_equal(populationSettings$washoutPeriod, rand)
-
})
test_that("createStudyPopulation removeSubjectsWithPriorOutcome", {
-
expect_error(
createStudyPopulationSettings(removeSubjectsWithPriorOutcome = NULL)
)
-
+
expect_error(
createStudyPopulationSettings(removeSubjectsWithPriorOutcome = 12)
)
-
+
populationSettings <- createStudyPopulationSettings(
removeSubjectsWithPriorOutcome = TRUE,
)
expect_true(populationSettings$removeSubjectsWithPriorOutcome)
-
+
populationSettings <- createStudyPopulationSettings(
removeSubjectsWithPriorOutcome = FALSE,
)
expect_false(populationSettings$removeSubjectsWithPriorOutcome)
-
})
test_that("createStudyPopulation priorOutcomeLookback", {
-
expect_error(
createStudyPopulationSettings(priorOutcomeLookback = NULL)
)
-
+
expect_error(
createStudyPopulationSettings(priorOutcomeLookback = -1)
)
-
- rand <- sample(1000,1)
+
+ rand <- sample(1000, 1)
populationSettings <- createStudyPopulationSettings(
priorOutcomeLookback = rand,
)
expect_equal(populationSettings$priorOutcomeLookback, rand)
-
})
test_that("createStudyPopulation requireTimeAtRisk", {
-
expect_error(
createStudyPopulationSettings(requireTimeAtRisk = NULL)
)
-
+
expect_error(
createStudyPopulationSettings(requireTimeAtRisk = 12)
)
-
+
populationSettings <- createStudyPopulationSettings(
requireTimeAtRisk = TRUE,
)
expect_true(populationSettings$requireTimeAtRisk)
-
+
populationSettings <- createStudyPopulationSettings(
requireTimeAtRisk = FALSE,
)
expect_false(populationSettings$requireTimeAtRisk)
-
})
test_that("createStudyPopulation minTimeAtRisk", {
-
expect_error(
createStudyPopulationSettings(minTimeAtRisk = NULL)
)
-
+
expect_error(
createStudyPopulationSettings(minTimeAtRisk = -1)
)
-
- rand <- 365+sample(365,1)
+
+ rand <- 365 + sample(365, 1)
populationSettings <- createStudyPopulationSettings(
minTimeAtRisk = rand,
riskWindowEnd = 1000
)
expect_equal(populationSettings$minTimeAtRisk, rand)
-
})
test_that("createStudyPopulation riskWindowStart", {
-
expect_error(
createStudyPopulationSettings(riskWindowStart = NULL)
)
-
- rand <- sample(1000,1)
+
+ rand <- sample(1000, 1)
populationSettings <- createStudyPopulationSettings(
riskWindowStart = rand,
riskWindowEnd = rand + 365
)
expect_equal(populationSettings$riskWindowStart, rand)
-
})
test_that("createStudyPopulation riskWindowEnd", {
-
expect_error(
createStudyPopulationSettings(riskWindowEnd = NULL)
)
-
-
- rand <- sample(1000,1)
+
+
+ rand <- sample(1000, 1)
populationSettings <- createStudyPopulationSettings(
- riskWindowEnd = rand,
+ riskWindowEnd = rand,
minTimeAtRisk = 1
)
expect_equal(populationSettings$riskWindowEnd, rand)
-
})
test_that("createStudyPopulation startAnchor", {
-
expect_error(
createStudyPopulationSettings(startAnchor = NULL)
)
-
+
expect_error(
- createStudyPopulationSettings(startAnchor = 'blabla')
+ createStudyPopulationSettings(startAnchor = "blabla")
)
-
- rand <- c('cohort end', 'cohort start')[sample(2,1)]
+
+ rand <- c("cohort end", "cohort start")[sample(2, 1)]
populationSettings <- createStudyPopulationSettings(
- startAnchor = rand ,
+ startAnchor = rand,
)
expect_equal(populationSettings$startAnchor, rand)
-
})
test_that("createStudyPopulation endAnchor", {
-
expect_error(
createStudyPopulationSettings(endAnchor = NULL)
)
-
+
expect_error(
- createStudyPopulationSettings(endAnchor = 'blabla')
+ createStudyPopulationSettings(endAnchor = "blabla")
)
-
- rand <- c('cohort end', 'cohort start')[sample(2,1)]
+
+ rand <- c("cohort end", "cohort start")[sample(2, 1)]
populationSettings <- createStudyPopulationSettings(
- endAnchor = rand ,
+ endAnchor = rand,
)
expect_equal(populationSettings$endAnchor, rand)
-
})
-# Test unit for the creation of the study population. The firstExposureOnly,
+# Unit tests for the creation of the study population. The firstExposureOnly,
# washout, requireTimeAtRisk are checked. Additionally, error messages are checked.
test_that("population creation parameters", {
-
studyPopulation <- createStudyPopulation(
plpData = plpData,
outcomeId = outcomeId,
populationSettings = defaultSettings()
)
-
- #plpData = plpData
+
expect_is(studyPopulation, "data.frame")
-
- nrOutcomes1 <- sum(studyPopulation$outcomeCount)
- expect_gt(nrOutcomes1,0)
-
- #firstExposureOnly test (should have no effect on simulated data)
+
+ nrOutcomes1 <- sum(studyPopulation$outcomeCount)
+ expect_gt(nrOutcomes1, 0)
+
+ # firstExposureOnly test (should have no effect on simulated data)
studyPopulation <- createStudyPopulation(
plpData = plpData,
outcomeId = outcomeId,
- populationSettings = defaultSettings(firstExposureOnly = T)
- )
-
+ populationSettings = defaultSettings(firstExposureOnly = TRUE)
+ )
+
nrOutcomes2 <- sum(studyPopulation$outcomeCount)
- expect_gt(nrOutcomes2,0)
- expect_equal(nrOutcomes1,nrOutcomes2)
-
- #requireTimeAtRisk
+ expect_gt(nrOutcomes2, 0)
+ expect_equal(nrOutcomes1, nrOutcomes2)
+
+ # requireTimeAtRisk
studyPopulation <- createStudyPopulation(
plpData = plpData,
- outcomeId = outcomeId,
- populationSettings = defaultSettings(requireTimeAtRisk = T)
- )
-
+ outcomeId = outcomeId,
+ populationSettings = defaultSettings(requireTimeAtRisk = TRUE)
+ )
+
nrOutcomes3 <- sum(studyPopulation$outcomeCount)
- expect_gt(nrOutcomes3,0)
- expect_true(nrOutcomes3 <= nrOutcomes1)
-
+ expect_gt(nrOutcomes3, 0)
+ expect_true(nrOutcomes3 <= nrOutcomes1)
+
# expect warning due to no people left with
  # minTimeAtRisk of 99999
expect_warning(
createStudyPopulation(
plpData = plpData,
- outcomeId = outcomeId,
- populationSettings = defaultSettings(requireTimeAtRisk = T, minTimeAtRisk = 99999)
+ outcomeId = outcomeId,
+ populationSettings = defaultSettings(requireTimeAtRisk = TRUE, minTimeAtRisk = 99999)
)
)
-
- #washoutPeriod = 365,
+
studyPopulation <- createStudyPopulation(
plpData = plpData,
- outcomeId = outcomeId,
+ outcomeId = outcomeId,
populationSettings = defaultSettings(washoutPeriod = 365)
)
nrOutcomes4 <- sum(studyPopulation$outcomeCount)
- expect_gt(nrOutcomes4,0)
- expect_true(nrOutcomes4 <= nrOutcomes1)
-
- #washoutPeriod <0 error
+ expect_gt(nrOutcomes4, 0)
+ expect_true(nrOutcomes4 <= nrOutcomes1)
+
+ # washoutPeriod <0 error
expect_error(
createStudyPopulation(
plpData = plpData,
- outcomeId = outcomeId,
+ outcomeId = outcomeId,
populationSettings = defaultSettings(washoutPeriod = -1)
)
)
-
- #priorOutcomeLookback < 0 error
+
+ # priorOutcomeLookback < 0 error
expect_error(
createStudyPopulation(
plpData = plpData,
- outcomeId = outcomeId,
+ outcomeId = outcomeId,
populationSettings = defaultSettings(
- priorOutcomeLookback = -1,
- removeSubjectsWithPriorOutcome = T
- )
+ priorOutcomeLookback = -1,
+ removeSubjectsWithPriorOutcome = TRUE
+ )
)
)
-
- #minTimeAtRisk <0
+
expect_error(
createStudyPopulation(
plpData = plpData,
- outcomeId = outcomeId,
+ outcomeId = outcomeId,
populationSettings = defaultSettings(
- minTimeAtRisk = -1,
- requireTimeAtRisk = T
+ minTimeAtRisk = -1,
+ requireTimeAtRisk = TRUE
)
)
)
-
+
# Incorrect startAnchor
expect_error(
createStudyPopulation(
plpData = plpData,
- outcomeId = outcomeId,
+ outcomeId = outcomeId,
populationSettings = defaultSettings(
- startAnchor = 'cohort stard'
+ startAnchor = "cohort stard"
)
)
-
)
-
+
# Incorrect endAnchor
expect_error(
createStudyPopulation(
plpData = plpData,
- outcomeId = outcomeId,
+ outcomeId = outcomeId,
populationSettings = defaultSettings(
- endAnchor = 'cohort ent'
+ endAnchor = "cohort ent"
)
)
)
-
-
+
+
# check outcomes that only have partial timeatrisk are included:
-
- outcomes <- data.frame(rowId= c(1,1,1,4,5),
- outcomeId=c(1,1,1,1,2),
- outcomeCount=rep(1,5),
- daysToEvent=c(-30,30,180,60,4)
- )
- cohorts <- data.frame(rowId=1:20,
- subjectId=1:20,
- targetId=rep(2,20),
- time=rep(365,20),
- ageYear = rep(18,20),
- gender = rep(8507,20),
- cohortStartDate=rep('2012-04-12',20),
- daysFromObsStart=rep(740,20),
- daysToCohortEnd=rep(1,20),
- daysToObsEnd=c(40, rep(900,19))
+
+ outcomes <- data.frame(
+ rowId = c(1, 1, 1, 4, 5),
+ outcomeId = c(1, 1, 1, 1, 2),
+ outcomeCount = rep(1, 5),
+ daysToEvent = c(-30, 30, 180, 60, 4)
+ )
+ cohorts <- data.frame(
+ rowId = 1:20,
+ subjectId = 1:20,
+ targetId = rep(2, 20),
+ time = rep(365, 20),
+ ageYear = rep(18, 20),
+ gender = rep(8507, 20),
+ cohortStartDate = rep("2012-04-12", 20),
+ daysFromObsStart = rep(740, 20),
+ daysToCohortEnd = rep(1, 20),
+ daysToObsEnd = c(40, rep(900, 19))
)
PplpData <- plpData
PplpData$outcomes <- outcomes
PplpData$cohorts <- cohorts
-
- attr(PplpData$cohorts, "metaData") <- list(attrition=data.frame(outcomeId=1,description='test',
- targetCount=20,uniquePeople=20,
- outcomes=3))
-
+
+ attr(PplpData$cohorts, "metaData") <- list(attrition = data.frame(
+ outcomeId = 1, description = "test",
+ targetCount = 20, uniquePeople = 20,
+ outcomes = 3
+ ))
+
Ppop <- createStudyPopulation(
plpData = PplpData,
- outcomeId = 1,
+ outcomeId = 1,
populationSettings = defaultSettings(
- includeAllOutcomes = T,
- requireTimeAtRisk = T,
+ includeAllOutcomes = TRUE,
+ requireTimeAtRisk = TRUE,
minTimeAtRisk = 365
- )
)
-
+ )
+
  # persons 1 and 4 should be returned
- expect_equal(Ppop$rowId[Ppop$outcomeCount>0], c(1,4))
-
+ expect_equal(Ppop$rowId[Ppop$outcomeCount > 0], c(1, 4))
+
Ppop2 <- createStudyPopulation(
plpData = PplpData,
outcomeId = 1,
populationSettings = defaultSettings(
- includeAllOutcomes = T,
- removeSubjectsWithPriorOutcome = T,
+ includeAllOutcomes = TRUE,
+ removeSubjectsWithPriorOutcome = TRUE,
priorOutcomeLookback = 99999,
- requireTimeAtRisk = T,
+ requireTimeAtRisk = TRUE,
minTimeAtRisk = 365
)
)
-
+
  # person 4 only, as person 1 has a prior outcome
- expect_equal(Ppop2$rowId[Ppop2$outcomeCount>0], c(4))
-
- Ppop3 <- createStudyPopulation(plpData = PplpData,
+ expect_equal(Ppop2$rowId[Ppop2$outcomeCount > 0], c(4))
+
+ Ppop3 <- createStudyPopulation(
+ plpData = PplpData,
outcomeId = 1,
populationSettings = defaultSettings(
- includeAllOutcomes = F,
+ includeAllOutcomes = FALSE,
firstExposureOnly = FALSE,
- removeSubjectsWithPriorOutcome = F,
+ removeSubjectsWithPriorOutcome = FALSE,
priorOutcomeLookback = 99999,
- requireTimeAtRisk = T,
+ requireTimeAtRisk = TRUE,
minTimeAtRisk = 365
)
)
-
+
  # only person 4 should be returned
- expect_equal(Ppop3$rowId[Ppop3$outcomeCount>0], c(4))
-
+ expect_equal(Ppop3$rowId[Ppop3$outcomeCount > 0], c(4))
+
  # creates a min() warning due to no data...
Ppop5 <- createStudyPopulation(
plpData = PplpData,
outcomeId = 1,
populationSettings = defaultSettings(
- binary = T,
- includeAllOutcomes = F,
+ binary = TRUE,
+ includeAllOutcomes = FALSE,
firstExposureOnly = FALSE,
washoutPeriod = 0,
- removeSubjectsWithPriorOutcome = F,
+ removeSubjectsWithPriorOutcome = FALSE,
priorOutcomeLookback = 99999,
- requireTimeAtRisk = T,
- minTimeAtRisk=303,
+ requireTimeAtRisk = TRUE,
+ minTimeAtRisk = 303,
riskWindowStart = 62,
- startAnchor = 'cohort start',
+ startAnchor = "cohort start",
riskWindowEnd = 365,
- endAnchor = 'cohort start')
+ endAnchor = "cohort start"
+ )
)
-
+
  # should return NULL as no outcomes remain
expect_equal(is.null(Ppop5), TRUE)
-
})
testthat::test_that("Providing an existing population and skipping population creation works", {
popSize <- 400
- newPopulation <- population[sample.int(nrow.default(population), popSize), ]
-
+ newPopulation <- population[sample.int(nrow(population), popSize), ]
+
tinyPlpData$population <- newPopulation
-
+
plpResults <- runPlp(
plpData = tinyPlpData,
outcomeId = 2,
@@ -515,11 +488,12 @@ testthat::test_that("Providing an existing population and skipping population cr
runModelDevelopment = TRUE
)
)
-
+
trainPredictions <- plpResults$prediction %>%
- dplyr::filter(.data$evaluationType == "Train") %>% nrow.default()
+ dplyr::filter(.data$evaluationType == "Train") %>%
+ nrow()
testPredictions <- plpResults$prediction %>%
- dplyr::filter(.data$evaluationType == "Test") %>% nrow.default()
+ dplyr::filter(.data$evaluationType == "Test") %>%
+ nrow()
expect_equal(popSize, trainPredictions + testPredictions)
})
-
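
For orientation, the settings object these tests construct controls how the labelled study population is derived from plpData. An illustrative call with the package attached; the one-year time-at-risk values follow the package vignettes rather than this test file:

populationSettings <- createStudyPopulationSettings(
  firstExposureOnly = FALSE,
  washoutPeriod = 365, # days of prior observation required
  removeSubjectsWithPriorOutcome = TRUE,
  priorOutcomeLookback = 99999,
  requireTimeAtRisk = TRUE,
  minTimeAtRisk = 364,
  riskWindowStart = 1,
  startAnchor = "cohort start",
  riskWindowEnd = 365,
  endAnchor = "cohort start"
)
population <- createStudyPopulation(
  plpData = plpData, # assumed to exist, as in the test setup
  outcomeId = outcomeId,
  populationSettings = populationSettings
)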
diff --git a/tests/testthat/test-prediction.R b/tests/testthat/test-prediction.R
index 85aecf052..3cf8246e9 100644
--- a/tests/testthat/test-prediction.R
+++ b/tests/testthat/test-prediction.R
@@ -20,87 +20,94 @@ context("Prediction")
test_that("prediction inputs", {
- #=====================================
+ # =====================================
# check prediction
- #=====================================
- testthat::expect_error(predictPlp(model=NULL, population=population,
- plpData=plpData))
- testthat::expect_error(predictPlp(model=list(), population=NULL,
- plpData=plpData))
- testthat::expect_error(predictPlp(model=list(), population=population,
- plpData=NULL))
-
-
- })
+ # =====================================
+ testthat::expect_error(predictPlp(
+ model = NULL, population = population,
+ plpData = plpData
+ ))
+ testthat::expect_error(predictPlp(
+ model = list(), population = NULL,
+ plpData = plpData
+ ))
+ testthat::expect_error(predictPlp(
+ model = list(), population = population,
+ plpData = NULL
+ ))
+})
test_that("prediction works", {
- #=====================================
+ # =====================================
# check prediction
- #=====================================
- pred <- predictPlp(plpModel=plpResult$model,
- population=population,
- plpData=plpData
+ # =====================================
+ pred <- predictPlp(
+ plpModel = plpResult$model,
+ population = population,
+ plpData = plpData
+ )
+ pred <- pred[order(pred$rowId), ]
+ plpResult$prediction <- plpResult$prediction[order(plpResult$prediction$rowId), ]
+ testthat::expect_equal(
+ nrow(pred),
+ nrow(population)
)
- pred <- pred[order(pred$rowId),]
- plpResult$prediction <- plpResult$prediction[order(plpResult$prediction$rowId),]
- testthat::expect_equal(nrow(pred),
- nrow(population))
-
- rowId <- plpResult$prediction$rowId[plpResult$prediction$evaluationType == 'Test'][1]
-
- testthat::expect_equal(pred$value[pred$rowId == rowId],
- plpResult$prediction$value[
- plpResult$prediction$evaluationType == 'Test' &
- plpResult$prediction$rowId == rowId
- ]
- )
-
+
+ rowId <- plpResult$prediction$rowId[plpResult$prediction$evaluationType == "Test"][1]
+
+ testthat::expect_equal(
+ pred$value[pred$rowId == rowId],
+ plpResult$prediction$value[
+ plpResult$prediction$evaluationType == "Test" &
+ plpResult$prediction$rowId == rowId
+ ]
+ )
+
# check metaData
- expect_equal(length(names(attr(pred, "metaData"))), 6) # 8 if survivial
-
+  expect_equal(length(names(attr(pred, "metaData"))), 6) # 8 if survival
+
# add single person pred and compare with manual cal
-
+
# add prediction of other models
-
})
# predict.*
test_that("applyTidyCovariateData", {
-
- covariateIds <- plpData$covariateData$covariateRef %>% dplyr::select("covariateId") %>% dplyr::pull()
+ covariateIds <- plpData$covariateData$covariateRef %>%
+ dplyr::select("covariateId") %>%
+ dplyr::pull()
remove <- sample(covariateIds, 10)
- deletedRedundantCovariateIds = remove[1:5]
- deletedInfrequentCovariateIds = remove[6:10]
-
- prepocessSettings = list(
+ deletedRedundantCovariateIds <- remove[1:5]
+ deletedInfrequentCovariateIds <- remove[6:10]
+
+ prepocessSettings <- list(
normFactors = data.frame(
covariateId = covariateIds,
- maxValue = rep(0.1,length(covariateIds))
- ),
+ maxValue = rep(0.1, length(covariateIds))
+ ),
deletedRedundantCovariateIds = deletedRedundantCovariateIds,
deletedInfrequentCovariateIds = deletedInfrequentCovariateIds
)
-
- # get covariateSize before
- covariateCount <- plpData$covariateData$covariates %>% dplyr::tally() %>% dplyr::pull()
-
+
+ # get covariateSize before
+ covariateCount <- plpData$covariateData$covariates %>%
+ dplyr::tally() %>%
+ dplyr::pull()
+
newCovariateData <- applyTidyCovariateData(
covariateData = plpData$covariateData,
preprocessSettings = prepocessSettings
)
-
+
# some covariates removed
expect_true(newCovariateData$covariates %>% dplyr::tally() %>% dplyr::pull() < covariateCount)
-
- newCovs <- newCovariateData$covariateRef %>% dplyr::select("covariateId") %>% dplyr::pull()
-
- expect_equal(sum(covariateIds[!covariateIds %in% newCovs] %in% remove),10)
-
-})
-
-
+ newCovs <- newCovariateData$covariateRef %>%
+ dplyr::select("covariateId") %>%
+ dplyr::pull()
+ expect_equal(sum(covariateIds[!covariateIds %in% newCovs] %in% remove), 10)
+})
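
The hand-built list in the last test mirrors the structure that tidying normally attaches to covariate data; the field names here are read off this test rather than a documented public contract:

preprocessSettings <- list(
  normFactors = data.frame( # per-covariate max values used for scaling
    covariateId = c(1002, 1003), # illustrative ids
    maxValue = c(0.1, 0.1)
  ),
  deletedRedundantCovariateIds = c(2002),
  deletedInfrequentCovariateIds = c(3003)
)
newCovariateData <- applyTidyCovariateData(
  covariateData = plpData$covariateData, # plpData assumed to exist
  preprocessSettings = preprocessSettings
)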
diff --git a/tests/testthat/test-preprocessingData.R b/tests/testthat/test-preprocessingData.R
index 9b3755b72..d0e823eba 100644
--- a/tests/testthat/test-preprocessingData.R
+++ b/tests/testthat/test-preprocessingData.R
@@ -19,10 +19,9 @@ context("preprocessingData")
createDefaultSettings <- function(
- minCovariateFraction = 0.001,
- normalizeData = TRUE,
- removeRedundancy = TRUE
-){
+ minCovariateFraction = 0.001,
+ normalizeData = TRUE,
+ removeRedundancy = TRUE) {
result <- createPreprocessSettings(
minFraction = minCovariateFraction,
normalize = normalizeData,
@@ -32,28 +31,28 @@ createDefaultSettings <- function(
}
test_that("createPreprocessSettings", {
-
- normalizeDataTest <- sample(c(T,F),1)
- removeRedundancyTest <- sample(c(T,F),1)
- minCovariateFractionTest <- 1/(1000+sample(1000,1))
-
+ normalizeDataTest <- sample(c(TRUE, FALSE), 1)
+ removeRedundancyTest <- sample(c(TRUE, FALSE), 1)
+ minCovariateFractionTest <- 1 / (1000 + sample(1000, 1))
+
settings <- createDefaultSettings(
minCovariateFraction = minCovariateFractionTest,
- normalizeData = normalizeDataTest,
- removeRedundancy = removeRedundancyTest)
-
+ normalizeData = normalizeDataTest,
+ removeRedundancy = removeRedundancyTest
+ )
+
expect_is(settings, "preprocessSettings")
expect_equal(settings$minFraction, minCovariateFractionTest)
expect_equal(settings$normalize, normalizeDataTest)
expect_equal(settings$removeRedundancy, removeRedundancyTest)
-
+
expect_error(createDefaultSettings(minCovariateFraction = -1))
- expect_error(createDefaultSettings(minCovariateFraction = 'dfdfdf'))
-
- expect_error(createDefaultSettings(removeRedundancy = 'dfdfdf'))
+ expect_error(createDefaultSettings(minCovariateFraction = "dfdfdf"))
+
+ expect_error(createDefaultSettings(removeRedundancy = "dfdfdf"))
expect_error(createDefaultSettings(removeRedundancy = NULL))
-
- expect_error(createDefaultSettings(normalizeData = 'dfdfdf'))
+
+ expect_error(createDefaultSettings(normalizeData = "dfdfdf"))
expect_error(createDefaultSettings(normalizeData = NULL))
})
@@ -61,53 +60,62 @@ test_that("createPreprocessSettings", {
attr(trainData$covariateData, "metaData")$preprocessSettings <- NULL # removing for test
metaData <- attr(trainData$covariateData, "metaData")
metaLength <- length(metaData)
- covSize <- trainData$covariateData$covariates %>% dplyr::tally() %>% dplyr::pull()
- oldFeatureCount <- trainData$covariateData$covariateRef %>% dplyr::tally() %>% dplyr::pull()
-
+ covSize <- trainData$covariateData$covariates %>%
+ dplyr::tally() %>%
+ dplyr::pull()
+ oldFeatureCount <- trainData$covariateData$covariateRef %>%
+ dplyr::tally() %>%
+ dplyr::pull()
+
preprocessSettings <- createDefaultSettings(
minCovariateFraction = 0.01,
- normalizeData = F,
- removeRedundancy = F
+ normalizeData = FALSE,
+ removeRedundancy = FALSE
)
newData <- preprocessData(trainData$covariateData, preprocessSettings)
-
- expect_is(newData, 'CovariateData')
+
+ expect_is(newData, "CovariateData")
expect_true(newData$covariates %>% dplyr::tally() %>% dplyr::pull() < covSize)
-
+
# metaData should have tidyCovariateDataSettings + preprocessSettings (so 2 bigger)
- expect_equal(length(attr(newData, "metaData")), metaLength+2)
-
- expect_true(length(attr(newData, "metaData")$tidyCovariateDataSettings$deletedInfrequentCovariateIds)>=0)
+ expect_equal(length(attr(newData, "metaData")), metaLength + 2)
+
+ expect_true(length(attr(newData, "metaData")$tidyCovariateDataSettings$deletedInfrequentCovariateIds) >= 0)
expect_equal(attr(newData, "metaData")$tidyCovariateDataSettings$deletedRedundantCovariateIds, NULL)
expect_equal(attr(newData, "metaData")$tidyCovariateDataSettings$normFactors, NULL)
-
- newFeatureCount <- newData$covariateRef %>% dplyr::tally() %>% dplyr::pull() + length(attr(newData, "metaData")$tidyCovariateDataSettings$deletedInfrequentCovariateIds)
-
+
+ newFeatureCount <- newData$covariateRef %>%
+ dplyr::tally() %>%
+ dplyr::pull() + length(attr(newData, "metaData")$tidyCovariateDataSettings$deletedInfrequentCovariateIds)
+
expect_equal(newFeatureCount, oldFeatureCount)
-
- oldFeatureCount <- trainData$covariateData$covariateRef %>% dplyr::tally() %>% dplyr::pull()
-
+
+ oldFeatureCount <- trainData$covariateData$covariateRef %>%
+ dplyr::tally() %>%
+ dplyr::pull()
+
metaData <- attr(trainData$covariateData, "metaData")
preprocessSettings <- createDefaultSettings(
minCovariateFraction = 0,
- normalizeData = T,
- removeRedundancy = T
+ normalizeData = TRUE,
+ removeRedundancy = TRUE
)
newData <- preprocessData(trainData$covariateData, preprocessSettings)
- expect_true(length(attr(newData, "metaData")$tidyCovariateDataSettings$deletedInfrequentCovariateIds)==0)
- expect_true(length(attr(newData, "metaData")$tidyCovariateDataSettings$deletedRedundantCovariateIds)>=0)
- expect_true(length(attr(newData, "metaData")$tidyCovariateDataSettings$normFactors)>=0)
-
- newFeatureCount <- newData$covariateRef %>% dplyr::tally() %>% dplyr::pull() +
- length(attr(newData, "metaData")$tidyCovariateDataSettings$deletedRedundantCovariateIds)
-
+ expect_true(length(attr(newData, "metaData")$tidyCovariateDataSettings$deletedInfrequentCovariateIds) == 0)
+ expect_true(length(attr(newData, "metaData")$tidyCovariateDataSettings$deletedRedundantCovariateIds) >= 0)
+ expect_true(length(attr(newData, "metaData")$tidyCovariateDataSettings$normFactors) >= 0)
+
+ newFeatureCount <- newData$covariateRef %>%
+ dplyr::tally() %>%
+ dplyr::pull() +
+ length(attr(newData, "metaData")$tidyCovariateDataSettings$deletedRedundantCovariateIds)
+
expect_equal(newFeatureCount, oldFeatureCount) # sometimes differ?
-
+
# check settings are saved
expect_equal(attr(newData, "metaData")$preprocessSettings, preprocessSettings)
-
})
-test_that('Did tidy on test', {
- expect_true(attr(plpResult$prediction, 'metaData')$tidyCovariates)
+test_that("Did tidy on test", {
+ expect_true(attr(plpResult$prediction, "metaData")$tidyCovariates)
})
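
For reference, the public constructor exercised here takes the three arguments validated above; preprocessData() then applies them to a covariateData object. An illustrative call, with trainData assumed to exist as in the test setup:

preprocessSettings <- createPreprocessSettings(
  minFraction = 0.01, # drop covariates observed in fewer than 1% of subjects
  normalize = TRUE, # scale covariate values (recorded as normFactors in metaData)
  removeRedundancy = TRUE # drop redundant covariates
)
newData <- preprocessData(trainData$covariateData, preprocessSettings)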
diff --git a/tests/testthat/test-rclassifier.R b/tests/testthat/test-rclassifier.R
index 72f6d024d..c75b6a56e 100644
--- a/tests/testthat/test-rclassifier.R
+++ b/tests/testthat/test-rclassifier.R
@@ -20,38 +20,39 @@ context("RClassifier")
test_that("GBM settings work", {
-
- seed <- sample(10000000,1)
- #=====================================
+ skip_if_not_installed("xgboost")
+ skip_on_cran()
+ seed <- sample(10000000, 1)
+ # =====================================
# checking Gradient Boosting Machine
- #=====================================
+ # =====================================
gbmSet <- setGradientBoostingMachine(
- ntrees = c(2, 10),
- nthread = 5,
+ ntrees = c(2, 10),
+ nthread = 5,
earlyStopRound = 25,
- maxDepth = 4,
- minChildWeight = 1,
+ maxDepth = 4,
+ minChildWeight = 1,
learnRate = 0.1,
alpha = 0,
- lambda =1,
+ lambda = 1,
seed = seed
- )
-
- expect_is(gbmSet, 'modelSettings')
- expect_equal(gbmSet$fitFunction, 'fitRclassifier')
- expect_is(gbmSet$param, 'list')
-
- expect_equal(attr(gbmSet$param, 'settings')$modelType, 'Xgboost')
- expect_equal(attr(gbmSet$param, 'settings')$seed, seed)
- expect_equal(attr(gbmSet$param, 'settings')$modelName, "Gradient Boosting Machine")
-
- expect_equal(attr(gbmSet$param, 'settings')$threads, 5)
- expect_equal(attr(gbmSet$param, 'settings')$varImpRFunction, 'varImpXgboost')
- expect_equal(attr(gbmSet$param, 'settings')$trainRFunction, 'fitXgboost')
- expect_equal(attr(gbmSet$param, 'settings')$predictRFunction, 'predictXgboost')
-
- expect_equal(length(gbmSet$param),2)
-
+ )
+
+ expect_is(gbmSet, "modelSettings")
+ expect_equal(gbmSet$fitFunction, "fitRclassifier")
+ expect_is(gbmSet$param, "list")
+
+ expect_equal(attr(gbmSet$param, "settings")$modelType, "Xgboost")
+ expect_equal(attr(gbmSet$param, "settings")$seed, seed)
+ expect_equal(attr(gbmSet$param, "settings")$modelName, "Gradient Boosting Machine")
+
+ expect_equal(attr(gbmSet$param, "settings")$threads, 5)
+ expect_equal(attr(gbmSet$param, "settings")$varImpRFunction, "varImpXgboost")
+ expect_equal(attr(gbmSet$param, "settings")$trainRFunction, "fitXgboost")
+ expect_equal(attr(gbmSet$param, "settings")$predictRFunction, "predictXgboost")
+
+ expect_equal(length(gbmSet$param), 2)
+
expect_equal(length(unique(unlist(lapply(gbmSet$param, function(x) x$ntrees)))), 2)
expect_equal(length(unique(unlist(lapply(gbmSet$param, function(x) x$earlyStopRound)))), 1)
expect_equal(length(unique(unlist(lapply(gbmSet$param, function(x) x$maxDepth)))), 1)
@@ -59,66 +60,65 @@ test_that("GBM settings work", {
expect_equal(length(unique(unlist(lapply(gbmSet$param, function(x) x$learnRate)))), 1)
expect_equal(length(unique(unlist(lapply(gbmSet$param, function(x) x$lambda)))), 1)
expect_equal(length(unique(unlist(lapply(gbmSet$param, function(x) x$alpha)))), 1)
-
-
-
})
test_that("GBM settings expected errors", {
-#=====================================
-# checking Gradient Boosting Machine
-#=====================================
-
-testthat::expect_error(setGradientBoostingMachine(ntrees = -1))
-testthat::expect_error(setGradientBoostingMachine(minChildWeight = -1))
-testthat::expect_error(setGradientBoostingMachine(maxDepth = 0))
-testthat::expect_error(setGradientBoostingMachine(learnRate = -2))
-testthat::expect_error(setGradientBoostingMachine(seed = 'F'))
-testthat::expect_error(setGradientBoostingMachine(lambda = -1))
-testthat::expect_error(setGradientBoostingMachine(alpha = -1))
-testthat::expect_error(setGradientBoostingMachine(scalePosWeight = -1))
-
-
-
+ skip_if_not_installed("xgboost")
+ skip_on_cran()
+ # =====================================
+ # checking Gradient Boosting Machine
+ # =====================================
+
+ testthat::expect_error(setGradientBoostingMachine(ntrees = -1))
+ testthat::expect_error(setGradientBoostingMachine(minChildWeight = -1))
+ testthat::expect_error(setGradientBoostingMachine(maxDepth = 0))
+ testthat::expect_error(setGradientBoostingMachine(learnRate = -2))
+ testthat::expect_error(setGradientBoostingMachine(seed = "F"))
+ testthat::expect_error(setGradientBoostingMachine(lambda = -1))
+ testthat::expect_error(setGradientBoostingMachine(alpha = -1))
+ testthat::expect_error(setGradientBoostingMachine(scalePosWeight = -1))
})
test_that("GBM working checks", {
-
- modelSettings <- setGradientBoostingMachine(ntrees = 10, maxDepth = 3, learnRate = 0.1)
-
+ skip_if_not_installed("xgboost")
+ skip_on_cran()
+ modelSettings <- setGradientBoostingMachine(ntrees = 10,
+ maxDepth = 3, learnRate = 0.1)
+
fitModel <- fitPlp(
- trainData = trainData,
- modelSettings = modelSettings,
- analysisId = 'gbmTest',
+ trainData = trainData,
+ modelSettings = modelSettings,
+ analysisId = "gbmTest",
analysisPath = tempdir()
)
-
- expect_equal(nrow(fitModel$prediction), nrow(trainData$labels)*2)
- expect_equal(length(unique(fitModel$prediction$evaluationType)),2)
-
+
+ expect_equal(nrow(fitModel$prediction), nrow(trainData$labels) * 2)
+ expect_equal(length(unique(fitModel$prediction$evaluationType)), 2)
+
# check prediction between 0 and 1
expect_gte(min(fitModel$prediction$value), 0)
expect_lte(max(fitModel$prediction$value), 1)
-
- expect_equal(class(fitModel$model),"xgb.Booster")
-
+
+ expect_equal(class(fitModel$model), "xgb.Booster")
+
expect_lte(nrow(fitModel$covariateImportance), trainData$covariateData$covariateRef %>% dplyr::tally() %>% dplyr::pull())
-
+
expect_equal(fitModel$modelDesign$outcomeId, outcomeId)
expect_equal(fitModel$modelDesign$targetId, 1)
# TODO check other model design values?
-
+
# test that at least some features have importances that are not zero
- expect_equal(sum(abs(fitModel$covariateImportance$covariateValue))>0, TRUE)
-
+ expect_equal(sum(abs(fitModel$covariateImportance$covariateValue)) > 0, TRUE)
})
test_that("GBM without outcomes in early stopping set errors", {
+ skip_if_not_installed("xgboost")
+ skip_on_cran()
hyperParameters <- list(
ntrees = 10,
earlyStopRound = 2,
@@ -131,22 +131,26 @@ test_that("GBM without outcomes in early stopping set errors", {
)
observations <- 100
features <- 10
- data <- createData(observations = observations, features = features,
- totalFeatures = 10,
- numCovs = FALSE, outcomeRate = 0.05)
+ data <- createData(
+ observations = observations, features = features,
+ totalFeatures = 10,
+ numCovs = FALSE, outcomeRate = 0.05
+ )
dataMatrix <- Matrix::sparseMatrix(
i = data$covariates %>% dplyr::pull("rowId"),
j = data$covariates %>% dplyr::pull("columnId"),
x = data$covariates %>% dplyr::pull("covariateValue"),
- dims = c(observations,features)
+ dims = c(observations, features)
)
labels <- data.frame(outcomeCount = data$labels)
settings <- list(seed = 42, threads = 2)
- expect_error(fitXgboost(dataMatrix = dataMatrix,
- labels = labels,
- hyperParameters = hyperParameters,
- settings = settings),
- regexp = "* or turn off early stopping")
-
+ expect_error(
+ fitXgboost(
+ dataMatrix = dataMatrix,
+ labels = labels,
+ hyperParameters = hyperParameters,
+ settings = settings
+ ),
+ regexp = "* or turn off early stopping"
+ )
})
-
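
The workflow tested above reduces to building a modelSettings object and handing it to fitPlp() with training data. A minimal sketch, with trainData assumed to exist from earlier setup; a vector-valued hyperparameter defines a search grid:

modelSettings <- setGradientBoostingMachine(
  ntrees = c(100, 300), # two grid points over the number of trees
  maxDepth = 4,
  learnRate = 0.1,
  seed = 42
)
fitModel <- fitPlp(
  trainData = trainData,
  modelSettings = modelSettings,
  analysisId = "gbmExample",
  analysisPath = tempdir()
)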
diff --git a/tests/testthat/test-recalibration.R b/tests/testthat/test-recalibration.R
index 33e717197..712e9b5ca 100644
--- a/tests/testthat/test-recalibration.R
+++ b/tests/testthat/test-recalibration.R
@@ -21,16 +21,16 @@ context("Recalibration")
prediction <- data.frame(
rowId = 1:100,
- value = c(runif(20)/30,runif(80)/300),
- outcomeCount = c(runif(20)>0.5, runif(80)>0.9)*1,
- gender = sample(c(8507, 1111), 100, replace = T),
- ageYear = sample(1:100,100, replace = T ),
- survivalTime = rep(365,100),
- evaluationType = rep('Test', 100)
- )
+ value = c(runif(20) / 30, runif(80) / 300),
+ outcomeCount = c(runif(20) > 0.5, runif(80) > 0.9) * 1,
+ gender = sample(c(8507, 1111), 100, replace = TRUE),
+ ageYear = sample(1:100, 100, replace = TRUE),
+ survivalTime = rep(365, 100),
+ evaluationType = rep("Test", 100)
+)
metaData <- list(
- modelType = "binary",
+ modelType = "binary",
targetId = 1,
outcomeId = outcomeId,
timepoint = 365
@@ -39,75 +39,80 @@ metaData <- list(
attr(prediction, "metaData") <- metaData
test_that("recalibrationInTheLarge", {
-
- test <- recalibratePlp(prediction, analysisId = 'Analysis_1',
- method = 'recalibrationInTheLarge')
-
- testthat::expect_true(sum(test$evaluationType == 'recalibrationInTheLarge') == 100)
-
+ test <- recalibratePlp(prediction,
+ analysisId = "Analysis_1",
+ method = "recalibrationInTheLarge"
+ )
+
+ testthat::expect_true(sum(test$evaluationType == "recalibrationInTheLarge") == 100)
})
-#'weakRecalibration'
+# "weakRecalibration"
test_that("weakRecalibration", {
+ test <- recalibratePlp(prediction,
+ analysisId = "Analysis_1",
+ method = "weakRecalibration"
+ )
-test <- recalibratePlp(prediction, analysisId = 'Analysis_1',
- method = 'weakRecalibration')
-
-testthat::expect_true(sum(test$evaluationType == 'weakRecalibration') == 100)
-
+ testthat::expect_true(sum(test$evaluationType == "weakRecalibration") == 100)
})
-
-
test_that("recalibratePlpRefit", {
-
- newPop <- plpResult$prediction %>% dplyr::select(-"value") %>% dplyr::filter(.data$evaluationType %in% c('Test','Train'))
- attr(newPop, 'metaData') <- list(
- targetId = 1,
+ newPop <- plpResult$prediction %>%
+ dplyr::select(-"value") %>%
+ dplyr::filter(.data$evaluationType %in% c("Test", "Train"))
+ attr(newPop, "metaData") <- list(
+ targetId = 1,
outcomeId = outcomeId,
restrictPlpDataSettings = PatientLevelPrediction::createRestrictPlpDataSettings(),
populationSettings = PatientLevelPrediction::createStudyPopulationSettings()
)
-
+
testRecal <- recalibratePlpRefit(
- plpModel = plpResult$model,
- newPopulation = newPop,
+ plpModel = plpResult$model,
+ newPopulation = newPop,
newData = plpData
)
-
- if(!is.null(testRecal)){
+
+ if (!is.null(testRecal)) {
testthat::expect_true(
- sum(testRecal$evaluationType == 'recalibrationRefit')>0
+ sum(testRecal$evaluationType == "recalibrationRefit") > 0
)
-
- testthat::expect_s3_class(testRecal, 'data.frame')
+ testthat::expect_s3_class(testRecal, "data.frame")
}
-
+
+ testRecalWithModel <- recalibratePlpRefit(
+ plpModel = plpResult$model,
+ newPopulation = newPop,
+ newData = plpData,
+ returnModel = TRUE
+ )
+ expect_type(testRecalWithModel, "list")
+ expect_s3_class(testRecalWithModel$model, "plpModel")
+ expect_s3_class(testRecalWithModel$prediction, "data.frame")
+
# add more test...
})
test_that("survival", {
-# survival
-metaData <- list(
- modelType = "survival",
- targetId = 1,
- outcomeId = outcomeId,
- timepoint = 365
-)
+ # survival
+ metaData <- list(
+ modelType = "survival",
+ targetId = 1,
+ outcomeId = outcomeId,
+ timepoint = 365
+ )
-attr(prediction, "metaData") <- metaData
+ attr(prediction, "metaData") <- metaData
- test <- recalibratePlp(prediction, analysisId = 'Analysis_1',
- method = 'weakRecalibration')
+ test <- recalibratePlp(prediction,
+ analysisId = "Analysis_1",
+ method = "weakRecalibration"
+ )
- testthat::expect_true(sum(test$evaluationType == 'weakRecalibration') == 100)
-
+ testthat::expect_true(sum(test$evaluationType == "weakRecalibration") == 100)
})
-
-
-
-
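
Both recalibration methods are driven through the same entry point; the prediction data frame must carry the metaData attribute (modelType, targetId, outcomeId, timepoint) set up at the top of this file. Illustrative call:

recalibrated <- recalibratePlp(
  prediction, # a prediction data.frame with metaData attached
  analysisId = "Analysis_1",
  method = "weakRecalibration" # or "recalibrationInTheLarge"
)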
diff --git a/tests/testthat/test-runPlpHelpers.R b/tests/testthat/test-runPlpHelpers.R
index 4d361410d..d4d3f74cd 100644
--- a/tests/testthat/test-runPlpHelpers.R
+++ b/tests/testthat/test-runPlpHelpers.R
@@ -18,78 +18,75 @@ library("testthat")
context("runPlpHelpers")
test_that("check printHeader runs", {
-
header <- printHeader(
- plpData = plpData,
- targetId = 1,
- outcomeId = outcomeId,
- analysisId = 123,
- analysisName = 'test',
- ExecutionDateTime = Sys.time()
- )
+ plpData = plpData,
+ targetId = 1,
+ outcomeId = outcomeId,
+ analysisId = 123,
+ analysisName = "test",
+ executionDateTime = Sys.time()
+ )
expect_is(header, "logical")
-
})
test_that("checkInputs", {
-
check <- checkInputs(
list(
- plpData = plpData,
- outcomeId = outcomeId,
+ plpData = plpData,
+ outcomeId = outcomeId,
populationSettings = populationSettings
)
)
-
+
expect_is(check, "logical")
expect_equal(check, TRUE)
-
+
# error as NULL plpData
expect_error(
checkInputs(
list(
- plpData = NULL,
- outcomeId = outcomeId,
+ plpData = NULL,
+ outcomeId = outcomeId,
populationSettings = populationSettings
)
)
)
-
+
# error as incorrect outcomeId
expect_error(
checkInputs(
list(
- plpData = plpData,
- outcomeId = 'test',
+ plpData = plpData,
+ outcomeId = "test",
populationSettings = populationSettings
)
)
)
-
+
# error as incorrect populationSettings
expect_error(
checkInputs(
list(
- plpData = plpData,
- outcomeId = outcomeId,
- populationSettings = 'populationSettings'
+ plpData = plpData,
+ outcomeId = outcomeId,
+ populationSettings = "populationSettings"
)
)
)
-
})
test_that("createExecuteSettings", {
-
- getTF <- function(){return(sample(c(T,F), 1))}
+ getTF <- function() {
+ return(sample(c(TRUE, FALSE), 1))
+ }
runSplitData <- getTF()
runSampleData <- getTF()
runfeatureEngineering <- getTF()
runPreprocessData <- getTF()
runModelDevelopment <- getTF()
runCovariateSummary <- getTF()
-
+
executeSettings <- createExecuteSettings(
runSplitData = runSplitData,
runSampleData = runSampleData,
@@ -98,15 +95,15 @@ test_that("createExecuteSettings", {
runModelDevelopment = runModelDevelopment,
runCovariateSummary = runCovariateSummary
)
-
+
expect_is(executeSettings, "executeSettings")
expect_equal(executeSettings$runSplitData, runSplitData)
- expect_equal(executeSettings$runSampleData , runSampleData)
+ expect_equal(executeSettings$runSampleData, runSampleData)
expect_equal(executeSettings$runfeatureEngineering, runfeatureEngineering)
expect_equal(executeSettings$runPreprocessData, runPreprocessData)
expect_equal(executeSettings$runModelDevelopment, runModelDevelopment)
expect_equal(executeSettings$runCovariateSummary, runCovariateSummary)
-
+
expect_error(
executeSettings <- createExecuteSettings(
runSplitData = 12,
@@ -117,7 +114,7 @@ test_that("createExecuteSettings", {
runCovariateSummary = runCovariateSummary
)
)
-
+
expect_error(
executeSettings <- createExecuteSettings(
runSplitData = runSplitData,
@@ -128,7 +125,7 @@ test_that("createExecuteSettings", {
runCovariateSummary = runCovariateSummary
)
)
-
+
expect_error(
executeSettings <- createExecuteSettings(
runSplitData = runSplitData,
@@ -139,7 +136,7 @@ test_that("createExecuteSettings", {
runCovariateSummary = runCovariateSummary
)
)
-
+
expect_error(
executeSettings <- createExecuteSettings(
runSplitData = runSplitData,
@@ -150,7 +147,7 @@ test_that("createExecuteSettings", {
runCovariateSummary = runCovariateSummary
)
)
-
+
expect_error(
executeSettings <- createExecuteSettings(
runSplitData = runSplitData,
@@ -161,7 +158,7 @@ test_that("createExecuteSettings", {
runCovariateSummary = runCovariateSummary
)
)
-
+
expect_error(
executeSettings <- createExecuteSettings(
runSplitData = runSplitData,
@@ -172,21 +169,17 @@ test_that("createExecuteSettings", {
runCovariateSummary = 12
)
)
-
})
test_that("createDefaultExecuteSettings", {
-
executeSettings <- createDefaultExecuteSettings()
expect_is(executeSettings, "executeSettings")
- expect_equal(executeSettings$runSplitData, T)
- expect_equal(executeSettings$runSampleData , F)
- expect_equal(executeSettings$runfeatureEngineering, F)
- expect_equal(executeSettings$runPreprocessData, T)
- expect_equal(executeSettings$runModelDevelopment, T)
- expect_equal(executeSettings$runCovariateSummary, T)
-
-
+ expect_equal(executeSettings$runSplitData, TRUE)
+ expect_equal(executeSettings$runSampleData, FALSE)
+ expect_equal(executeSettings$runfeatureEngineering, FALSE)
+ expect_equal(executeSettings$runPreprocessData, TRUE)
+ expect_equal(executeSettings$runModelDevelopment, TRUE)
+ expect_equal(executeSettings$runCovariateSummary, TRUE)
})
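
The defaults asserted in the last test can be spelled out explicitly; the two calls below are equivalent (note the lower-case "f" in runfeatureEngineering, which is the actual argument name):

executeSettings <- createExecuteSettings(
  runSplitData = TRUE,
  runSampleData = FALSE,
  runfeatureEngineering = FALSE,
  runPreprocessData = TRUE,
  runModelDevelopment = TRUE,
  runCovariateSummary = TRUE
)
executeSettings <- createDefaultExecuteSettings()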
diff --git a/tests/testthat/test-sampling.R b/tests/testthat/test-sampling.R
index 967dd6dc6..eed4aa036 100644
--- a/tests/testthat/test-sampling.R
+++ b/tests/testthat/test-sampling.R
@@ -17,184 +17,171 @@
library("testthat")
context("Sampling")
-testType <- sample(c('none', 'underSample', 'overSample'), 1)
+testType <- sample(c("none", "underSample", "overSample"), 1)
testNumberOutcomestoNonOutcomes <- 2
-testSampleSeed <- sample(10000,1)
+testSampleSeed <- sample(10000, 1)
sampleSettingFunc <- function(
- type = testType,
- numberOutcomestoNonOutcomes = testNumberOutcomestoNonOutcomes,
- sampleSeed = testSampleSeed
-){
-
+ type = testType,
+ numberOutcomestoNonOutcomes = testNumberOutcomestoNonOutcomes,
+ sampleSeed = testSampleSeed) {
result <- createSampleSettings(
- type = type ,
+ type = type,
numberOutcomestoNonOutcomes = numberOutcomestoNonOutcomes,
sampleSeed = sampleSeed
)
-
+
return(result)
-
}
-
+
test_that("createSampleSettings works", {
-
sampleSettings <- sampleSettingFunc()
expect_is(sampleSettings, "sampleSettings")
-
- sampleFun <- 'sameData'
- if(testType == 'underSample'){
- sampleFun <- 'underSampleData'
+
+ sampleFun <- "sameData"
+ if (testType == "underSample") {
+ sampleFun <- "underSampleData"
}
- if(testType == 'overSample'){
- sampleFun <- 'overSampleData'
+ if (testType == "overSample") {
+ sampleFun <- "overSampleData"
}
-
+
expect_equal(
- attr(sampleSettings, "fun") ,
+ attr(sampleSettings, "fun"),
sampleFun
)
-
+
expect_equal(
- sampleSettings$numberOutcomestoNonOutcomes,
+ sampleSettings$numberOutcomestoNonOutcomes,
testNumberOutcomestoNonOutcomes
- )
-
- # the seed is ignored if sameData
- if(testType == 'none'){
+ )
+
+ # the seed is ignored if sameData
+ if (testType == "none") {
testSampleSeed <- 1
}
expect_equal(
- sampleSettings$sampleSeed,
- testSampleSeed
+ sampleSettings$sampleSeed,
+ testSampleSeed
)
-
})
test_that("createSampleSettings expected errors", {
-
expect_error(
- sampleSettingFunc(numberOutcomestoNonOutcomes = 'fsfd')
+ sampleSettingFunc(numberOutcomestoNonOutcomes = "fsfd")
)
expect_error(
sampleSettingFunc(numberOutcomestoNonOutcomes = -1)
)
-
+
expect_error(
- sampleSettingFunc(sampleSeed = 'fsfd')
+ sampleSettingFunc(sampleSeed = "fsfd")
)
-
+
expect_error(
- sampleSettingFunc(type = 'fsfd')
+ sampleSettingFunc(type = "fsfd")
)
expect_error(
- sampleSettingFunc(type = NULL)
+ sampleSettingFunc(type = NULL)
)
-
})
test_that("sampleData outputs are correct", {
-
newTrainData <- trainData
attr(newTrainData, "metaData")$sampleSettings <- NULL # remove for test
-
- sampleSettings <- sampleSettingFunc(type = 'none')
-
+
+ sampleSettings <- sampleSettingFunc(type = "none")
+
sampleData <- sampleData(newTrainData, sampleSettings)
-
+
# make sure metaData captures
expect_equal(
length(attr(sampleData, "metaData")),
- length(attr(newTrainData, "metaData"))+1
+ length(attr(newTrainData, "metaData")) + 1
)
-
+
expect_equal(
attr(sampleData, "metaData")$sampleSettings[[1]],
sampleSettings
)
-
+
# check the data is the same:
expect_equal(
- nrow(sampleData$labels),
+ nrow(sampleData$labels),
nrow(newTrainData$labels)
)
-
+
expect_equal(
- nrow(sampleData$folds),
+ nrow(sampleData$folds),
nrow(newTrainData$folds)
)
-
+
expect_equal(
- sampleData$covariateData$covariates %>% dplyr::tally() %>% dplyr::pull(),
- newTrainData$covariateData$covariates %>% dplyr::tally() %>% dplyr::pull()
+ sampleData$covariateData$covariates %>% dplyr::tally() %>% dplyr::pull(),
+ newTrainData$covariateData$covariates %>% dplyr::tally() %>% dplyr::pull()
)
-
-
})
# specific functions for sampling
-
+
test_that("underSampleData works", {
-
newTrainData <- trainData
-
+
sampleSettings <- list(
sampleSeed = 1,
numberOutcomestoNonOutcomes = 1
- )
-
+ )
+
underSampleData <- underSampleData(trainData, sampleSettings)
-
- expect_true(inherits(underSampleData, 'plpData')) # add test based on github issue
-
+
+ expect_true(inherits(underSampleData, "plpData")) # add test based on github issue
+
# the sampled data should be smaller...
expect_true(nrow(underSampleData$labels) <= nrow(newTrainData$labels))
-
+
expect_true(nrow(underSampleData$folds) <= nrow(newTrainData$folds))
-
+
expect_true(
- underSampleData$covariateData$covariates %>%
- dplyr::tally() %>%
- dplyr::pull() <= newTrainData$covariateData$covariates %>%
- dplyr::tally() %>% dplyr::pull()
+ underSampleData$covariateData$covariates %>%
+ dplyr::tally() %>%
+ dplyr::pull() <= newTrainData$covariateData$covariates %>%
+ dplyr::tally() %>%
+ dplyr::pull()
)
-
+
# perhaps add manual data test
-
-
})
test_that("overSampleData works", {
-
newTrainData <- trainData
-
+
sampleSettings <- list(
sampleSeed = 1,
numberOutcomestoNonOutcomes = 0.5
)
-
+
overSampleData <- overSampleData(newTrainData, sampleSettings)
-
- expect_true(inherits(overSampleData, 'plpData')) # add test based on github issue
-
+
+ expect_true(inherits(overSampleData, "plpData")) # add test based on github issue
+
# the sampled data should be smaller...
expect_true(nrow(overSampleData$labels) >= nrow(newTrainData$labels))
-
+
expect_true(nrow(overSampleData$folds) >= nrow(newTrainData$folds))
-
+
expect_true(
- overSampleData$covariateData$covariates %>% dplyr::tally() %>%
- dplyr::pull() >= newTrainData$covariateData$covariates %>%
- dplyr::tally() %>% dplyr::pull()
+ overSampleData$covariateData$covariates %>%
+ dplyr::tally() %>%
+ dplyr::pull() >= newTrainData$covariateData$covariates %>%
+ dplyr::tally() %>%
+ dplyr::pull()
)
-
+
# perhaps add manual data test
-
-
})
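
The three sampling modes map onto the internal functions tested above (sameData, underSampleData, overSampleData). An illustrative configuration; per the tests, the seed is ignored when type = "none":

sampleSettings <- createSampleSettings(
  type = "underSample", # one of "none", "underSample", "overSample"
  numberOutcomestoNonOutcomes = 1, # target outcome to non-outcome ratio
  sampleSeed = 42
)
sampledData <- sampleData(trainData, sampleSettings) # trainData assumed to exist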
diff --git a/tests/testthat/test-saveloadplp.R b/tests/testthat/test-saveloadplp.R
index ed71009ce..a0d144d6b 100644
--- a/tests/testthat/test-saveloadplp.R
+++ b/tests/testthat/test-saveloadplp.R
@@ -20,8 +20,8 @@ saveLoc <- tempdir()
test_that("savePlpDataError", {
expect_error(savePlpData())
- expect_error(savePlpData(plpData=1))
- expect_error(savePlpData(plpData=1, file='testing'))
+ expect_error(savePlpData(plpData = 1))
+ expect_error(savePlpData(plpData = 1, file = "testing"))
})
@@ -30,24 +30,30 @@ oldOutcomes <- plpData$outcomes
oldCovariates <- as.data.frame(plpData$covariateData$covariates)
oldCovariateRef <- as.data.frame(plpData$covariateData$covariateRef)
test_that("savePlpData", {
- savePlpData(plpData = plpData,
- file = file.path(saveLoc,"saveDataTest"), overwrite = T)
- testExist <- dir.exists(file.path(saveLoc,"saveDataTest"))
- expect_equal(testExist, T)
+ savePlpData(
+ plpData = plpData,
+ file = file.path(saveLoc, "saveDataTest"), overwrite = TRUE
+ )
+ testExist <- dir.exists(file.path(saveLoc, "saveDataTest"))
+ expect_equal(testExist, TRUE)
})
test_that("loadPlpDataError", {
- expect_error(loadPlpData(file='madeup/dffdf/testing'))
+ expect_error(loadPlpData(file = "madeup/dffdf/testing"))
})
test_that("loadPlpData", {
- loadedData <- loadPlpData(file = file.path(saveLoc,"saveDataTest"))
+ loadedData <- loadPlpData(file = file.path(saveLoc, "saveDataTest"))
expect_identical(loadedData$cohorts, oldCohorts)
expect_identical(loadedData$outcomes, oldOutcomes)
- expect_equal(as.data.frame(loadedData$covariateData$covariates),
- oldCovariates)
- expect_equal(as.data.frame(loadedData$covariateData$covariateRef),
- oldCovariateRef)
+ expect_equal(
+ as.data.frame(loadedData$covariateData$covariates),
+ oldCovariates
+ )
+ expect_equal(
+ as.data.frame(loadedData$covariateData$covariateRef),
+ oldCovariateRef
+ )
})
# add tests using simulated data...
@@ -65,126 +71,127 @@ test_that("print.summary.plpData", {
test_that("savePlpModelError", {
- expect_error(savePlpModel(dirPath=NULL))
- expect_error(savePlpModel(plpModel=NULL))
- expect_error(savePlpModel(plpModel=NULL,dirPath=NULL))
+ expect_error(savePlpModel(dirPath = NULL))
+ expect_error(savePlpModel(plpModel = NULL))
+ expect_error(savePlpModel(plpModel = NULL, dirPath = NULL))
})
test_that("loadPlpModelError", {
- expect_error(loadPlpModel(dirPath=NULL))
- expect_error(loadPlpModel(dirPath='madeup.txt'))
+ expect_error(loadPlpModel(dirPath = NULL))
+ expect_error(loadPlpModel(dirPath = "madeup.txt"))
})
# make an sklearn model to test file transport
-dir.create(file.path(saveLoc, 'testMoveStart'))
-write.csv(data.frame(a=1,b=2), file = file.path(saveLoc, 'testMoveStart', 'file.csv'), row.names = F)
+dir.create(file.path(saveLoc, "testMoveStart"))
+write.csv(data.frame(a = 1, b = 2), file = file.path(saveLoc, "testMoveStart", "file.csv"), row.names = FALSE)
plpModelTemp <- plpResult$model
-plpModelTemp$model = file.path(saveLoc, 'testMoveStart')
-attr(plpModelTemp, 'saveType') <- 'file'
+plpModelTemp$model <- file.path(saveLoc, "testMoveStart")
+attr(plpModelTemp, "saveType") <- "file"
test_that("move model files when saveType is file", {
-
- savePlpModel(plpModel = plpModelTemp, dirPath = file.path(saveLoc, 'testMoveEnd'))
-
- expect_equal(dir(file.path(saveLoc, 'testMoveStart')), dir(file.path(saveLoc, 'testMoveEnd', 'model')))
+ savePlpModel(plpModel = plpModelTemp, dirPath = file.path(saveLoc, "testMoveEnd"))
+
+ expect_equal(dir(file.path(saveLoc, "testMoveStart")), dir(file.path(saveLoc, "testMoveEnd", "model")))
})
test_that("savePrediction", {
- predLoc <- savePrediction(prediction = data.frame(rowId=1:10, value=1:10),
- dirPath = saveLoc, fileName = "pred.json" )
- expect_equal(file.exists(predLoc), T)
-
+ predLoc <- savePrediction(
+ prediction = data.frame(rowId = 1:10, value = 1:10),
+ dirPath = saveLoc, fileName = "pred.json"
+ )
+ expect_equal(file.exists(predLoc), TRUE)
})
test_that("loadPrediction", {
- pred <- loadPrediction(file.path(saveLoc,"pred.json"))
- expect_identical(data.frame(rowId=1:10, value=1:10), pred)
+ pred <- loadPrediction(file.path(saveLoc, "pred.json"))
+ expect_identical(data.frame(rowId = 1:10, value = 1:10), pred)
})
test_that("savePlpResultError", {
- expect_error(savePlpResult(dirPath=NULL))
- expect_error(savePlpResult(result=NULL))
+ expect_error(savePlpResult(dirPath = NULL))
+ expect_error(savePlpResult(result = NULL))
})
test_that("savePlpResult", {
emptyModel <- list()
- attr(emptyModel, 'predictionFunction') <- 'none'
- attr(emptyModel, 'saveType') <- 'RtoJson'
+ attr(emptyModel, "predictionFunction") <- "none"
+ attr(emptyModel, "saveType") <- "RtoJson"
class(emptyModel) <- "plpModel"
emptyResult <- list(
model = emptyModel,
- prediction = data.frame(rowId=1:5, value = 1:5),
+ prediction = data.frame(rowId = 1:5, value = 1:5),
performanceEvaluation = data.frame(),
covariateSummary = NULL,
executionSettings = NULL
- )
- class(emptyResult) <- 'runPlp'
-
- savePlpResult(result = emptyResult, dirPath = file.path(saveLoc,"plpResultTest"))
- expect_equal(dir.exists(file.path(saveLoc,"plpResultTest")), T)
- expect_equal(dir(file.path(saveLoc,"plpResultTest")), c("model","runPlp.rds"))
-
+ )
+ class(emptyResult) <- "runPlp"
+
+ savePlpResult(result = emptyResult, dirPath = file.path(saveLoc, "plpResultTest"))
+ expect_equal(dir.exists(file.path(saveLoc, "plpResultTest")), TRUE)
+ expect_equal(dir(file.path(saveLoc, "plpResultTest")), c("model", "runPlp.rds"))
})
test_that("loadPlpResultError", {
- expect_error(loadPlpResult(dirPath=NULL))
- expect_error(loadPlpResult(dirPath = 'madeup/dfdfd/j'))
- write.csv(c(1), file.path(saveLoc,"file2.csv"))
- expect_error(loadPlpResult(dirPath = file.path(saveLoc,"file2.csv")))
+ expect_error(loadPlpResult(dirPath = NULL))
+ expect_error(loadPlpResult(dirPath = "madeup/dfdfd/j"))
+ write.csv(c(1), file.path(saveLoc, "file2.csv"))
+ expect_error(loadPlpResult(dirPath = file.path(saveLoc, "file2.csv")))
})
test_that("loadPlpResult", {
emptyModel <- list()
- attr(emptyModel, 'predictionFunction') <- 'none'
- attr(emptyModel, 'saveType') <- 'RtoJson'
+ attr(emptyModel, "predictionFunction") <- "none"
+ attr(emptyModel, "saveType") <- "RtoJson"
class(emptyModel) <- "plpModel"
emptyResult <- list(
model = emptyModel,
- prediction = data.frame(rowId=1:5, value = 1:5),
+ prediction = data.frame(rowId = 1:5, value = 1:5),
performanceEvaluation = data.frame(),
covariateSummary = NULL,
executionSettings = NULL
)
- class(emptyResult) <- 'runPlp'
-
- plpResultLoaded <- loadPlpResult(file.path(saveLoc,"plpResultTest"))
+ class(emptyResult) <- "runPlp"
+
+ plpResultLoaded <- loadPlpResult(file.path(saveLoc, "plpResultTest"))
expect_identical(plpResultLoaded$covariateSummary, emptyResult$covariateSummary)
expect_identical(plpResultLoaded$executionSummary, emptyResult$executionSummary)
expect_identical(plpResultLoaded$performanceEvaluation, emptyResult$performanceEvaluation)
expect_identical(plpResultLoaded$prediction, emptyResult$prediction)
-
})
test_that("savePlpShareable works", {
-
- #check it works
- savePlpShareable(plpResult, file.path(saveLoc,"plpFriendly"), minCellCount = 0)
- shareableLoad <- loadPlpShareable(file.path(saveLoc,"plpFriendly"))
-
+ # check it works
+ savePlpShareable(plpResult, file.path(saveLoc, "plpFriendly"), minCellCount = 0)
+ shareableLoad <- loadPlpShareable(file.path(saveLoc, "plpFriendly"))
+
# check covariateSummary
testthat::expect_equal(nrow(shareableLoad$covariateSummary), nrow(plpResult$covariateSummary))
-
+
# check performanceEvaluation
- expect_equal(dim(shareableLoad$performanceEvaluation$evaluationStatistics),
- dim(plpResult$performanceEvaluation$evaluationStatistics)
- )
- expect_equal(dim(shareableLoad$performanceEvaluation$thresholdSummary),
- dim(plpResult$performanceEvaluation$thresholdSummary)
+ expect_equal(
+ dim(shareableLoad$performanceEvaluation$evaluationStatistics),
+ dim(plpResult$performanceEvaluation$evaluationStatistics)
)
- expect_equal(dim(shareableLoad$performanceEvaluation$demographicSummary),
- dim(plpResult$performanceEvaluation$demographicSummary)
+ expect_equal(
+ dim(shareableLoad$performanceEvaluation$thresholdSummary),
+ dim(plpResult$performanceEvaluation$thresholdSummary)
)
- expect_equal(dim(shareableLoad$performanceEvaluation$calibrationSummary),
- dim(plpResult$performanceEvaluation$calibrationSummary)
+ expect_equal(
+ dim(shareableLoad$performanceEvaluation$demographicSummary),
+ dim(plpResult$performanceEvaluation$demographicSummary)
)
- expect_equal(dim(shareableLoad$performanceEvaluation$predictionDistribution),
- dim(plpResult$performanceEvaluation$predictionDistribution)
+ expect_equal(
+ dim(shareableLoad$performanceEvaluation$calibrationSummary),
+ dim(plpResult$performanceEvaluation$calibrationSummary)
+ )
+ expect_equal(
+ dim(shareableLoad$performanceEvaluation$predictionDistribution),
+ dim(plpResult$performanceEvaluation$predictionDistribution)
)
-
})
# Note: saving from database to csv is in the database upload test file
@@ -192,36 +199,35 @@ test_that("savePlpShareable works", {
test_that("applyMinCellCount works", {
-
result <- data.frame(
performance_id = 1:2,
covariate_id = 1:2,
- covariate_name = paste0('name', 1:2),
+ covariate_name = paste0("name", 1:2),
concept_id = 1:2,
covariate_value = runif(2),
- covariate_count = c(100,50),
+ covariate_count = c(100, 50),
covariate_mean = runif(2),
covariate_st_dev = runif(2),
- with_no_outcome_covariate_count = c(10,5),
+ with_no_outcome_covariate_count = c(10, 5),
with_no_outcome_covariate_mean = runif(2),
with_no_outcome_covariate_st_dev = runif(2),
- with_outcome_covariate_count = c(90,45),
+ with_outcome_covariate_count = c(90, 45),
with_outcome_covariate_mean = runif(2),
with_outcome_covariate_st_dev = runif(2),
standardized_mean_diff = runif(2)
)
-
+
minCellResult <- applyMinCellCount(
tableName = "covariate_summary",
sensitiveColumns = getPlpSensitiveColumns(),
result = result,
minCellCount = 5
)
- # check nothing removed
- testthat::expect_equal(2,sum(minCellResult$covariate_count != -1))
- testthat::expect_equal(2,sum(minCellResult$with_no_outcome_covariate_count != -1))
- testthat::expect_equal(2,sum(minCellResult$with_outcome_covariate_count != -1))
-
+ # check nothing removed
+ testthat::expect_equal(2, sum(minCellResult$covariate_count != -1))
+ testthat::expect_equal(2, sum(minCellResult$with_no_outcome_covariate_count != -1))
+ testthat::expect_equal(2, sum(minCellResult$with_outcome_covariate_count != -1))
+
# now check values are removed
minCellResult <- applyMinCellCount(
tableName = "covariate_summary",
@@ -229,10 +235,8 @@ test_that("applyMinCellCount works", {
result = result,
minCellCount = 10
)
- testthat::expect_equal(0,sum(minCellResult$covariate_count == -1))
- testthat::expect_equal(minCellResult$with_no_outcome_covariate_count[2],-1)
- testthat::expect_equal(1,sum(minCellResult$with_no_outcome_covariate_count == -1))
- testthat::expect_equal(1,sum(minCellResult$with_outcome_covariate_count == -1))
-
-
+ testthat::expect_equal(0, sum(minCellResult$covariate_count == -1))
+ testthat::expect_equal(minCellResult$with_no_outcome_covariate_count[2], -1)
+ testthat::expect_equal(1, sum(minCellResult$with_no_outcome_covariate_count == -1))
+ testthat::expect_equal(1, sum(minCellResult$with_outcome_covariate_count == -1))
})
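In miniature, the masking rule these assertions pin down is the following (a sketch assuming only what the test asserts: sensitive counts strictly below `minCellCount` are replaced with `-1`, never silently dropped):

```r
# Sketch of the rule the assertions above check: counts strictly below
# minCellCount are masked as -1; counts at or above it are kept.
counts <- c(100, 50, 10, 5)
minCellCount <- 10
ifelse(counts < minCellCount, -1, counts)
#> [1] 100  50  10  -1
```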
diff --git a/tests/testthat/test-sklearnClassifier.R b/tests/testthat/test-sklearnClassifier.R
index c87a1937b..7b616493d 100644
--- a/tests/testthat/test-sklearnClassifier.R
+++ b/tests/testthat/test-sklearnClassifier.R
@@ -1,134 +1,138 @@
+test_that("DecisionTree settings work checks", {
+ skip_if_not_installed("reticulate")
+ skip_on_cran()
+ dtset <- setDecisionTree(
+ criterion = list("gini"),
+ splitter = list("best"),
+ maxDepth = list(4, 10, NULL),
+ minSamplesSplit = list(2, 10),
+ minSamplesLeaf = list(10, 50),
+ minWeightFractionLeaf = list(0),
+ maxFeatures = list(100, "sqrt", NULL),
+ maxLeafNodes = list(NULL),
+ minImpurityDecrease = list(10^-7),
+ classWeight = list(NULL),
+ seed = sample(1000000, 1)
+ )
+ expect_equal(dtset$fitFunction, "fitSklearn")
-test_that("DecisionTree settings work checks", {
-
-dtset <- setDecisionTree(
- criterion = list('gini'),
- splitter = list('best'),
- maxDepth = list(4, 10, NULL),
- minSamplesSplit = list(2, 10),
- minSamplesLeaf = list(10, 50),
- minWeightFractionLeaf = list(0),
- maxFeatures = list(100,'sqrt', NULL),
- maxLeafNodes = list(NULL),
- minImpurityDecrease = list(10^-7),
- classWeight = list(NULL),
- seed = sample(1000000,1)
-)
-
-expect_equal(dtset$fitFunction, "fitSklearn")
-
-expect_equal(length(dtset$param), 3*2*2*3*1)
-
-expect_equal(unique(unlist(lapply(dtset$param, function(x) x[[1]]))), 'gini')
-expect_equal(unique(unlist(lapply(dtset$param, function(x) x[[2]]))), 'best')
-expect_equal(length(unique(lapply(dtset$param, function(x) x[[3]]))), 3)
-
-expect_false(attr(dtset$param, 'settings')$requiresDenseMatrix)
-expect_equal(attr(dtset$param, 'settings')$name, 'Decision Tree')
-expect_equal(attr(dtset$param, 'settings')$pythonModule, 'sklearn.tree')
-expect_equal(attr(dtset$param, 'settings')$pythonClass, "DecisionTreeClassifier")
+ expect_equal(length(dtset$param), 3 * 2 * 2 * 3 * 1)
+ expect_equal(unique(unlist(lapply(dtset$param, function(x) x[[1]]))), "gini")
+ expect_equal(unique(unlist(lapply(dtset$param, function(x) x[[2]]))), "best")
+ expect_equal(length(unique(lapply(dtset$param, function(x) x[[3]]))), 3)
+ expect_false(attr(dtset$param, "settings")$requiresDenseMatrix)
+ expect_equal(attr(dtset$param, "settings")$name, "Decision Tree")
+ expect_equal(attr(dtset$param, "settings")$pythonModule, "sklearn.tree")
+ expect_equal(attr(dtset$param, "settings")$pythonClass, "DecisionTreeClassifier")
})
test_that("DecisionTree errors as expected", {
-
- expect_error(setDecisionTree(criterion = list('madeup')))
-
- expect_error(setDecisionTree(maxDepth = list(-1)))
- expect_error(setDecisionTree(minSamplesSplit = list(-1)))
- expect_error(setDecisionTree(minSamplesLeaf = list(-1)))
-
+ skip_if_not_installed("reticulate")
+ skip_on_cran()
+ expect_error(setDecisionTree(criterion = list("madeup")))
+
+ expect_error(setDecisionTree(maxDepth = list(-1)))
+ expect_error(setDecisionTree(minSamplesSplit = list(-1)))
+ expect_error(setDecisionTree(minSamplesLeaf = list(-1)))
})
test_that("check fit of DecisionTree", {
-
+ skip_if_not_installed("reticulate")
+ skip_on_cran()
modelSettings <- setDecisionTree(
- criterion = list('gini'),
- splitter = list('best'),
+ criterion = list("gini"),
+ splitter = list("best"),
maxDepth = list(as.integer(4)),
minSamplesSplit = list(2),
minSamplesLeaf = list(10),
minWeightFractionLeaf = list(0),
- maxFeatures = list('sqrt'),
+ maxFeatures = list("sqrt"),
maxLeafNodes = list(NULL),
minImpurityDecrease = list(10^-7),
classWeight = list(NULL),
- seed = sample(1000000,1)
+ seed = sample(1000000, 1)
)
-
+
plpModel <- fitPlp(
- trainData = tinyTrainData,
+ trainData = tinyTrainData,
modelSettings = modelSettings,
- analysisId = 'DecisionTree',
+ analysisId = "DecisionTree",
analysisPath = tempdir()
- )
-
+ )
+
expect_correct_fitPlp(plpModel, trainData)
# add check for other model design settings
-
})
-test_that('fitSklearn errors with wrong covariateData', {
-
+test_that("fitSklearn errors with wrong covariateData", {
+ skip_if_not_installed("reticulate")
+ skip_on_cran()
newTrainData <- copyTrainData(trainData)
- class(newTrainData$covariateData) <- 'notCovariateData'
+ class(newTrainData$covariateData) <- "notCovariateData"
modelSettings <- setAdaBoost()
analysisId <- 42
-
+
expect_error(fitSklearn(newTrainData,
- modelSettings,
- search='grid',
- analysisId))
+ modelSettings,
+ search = "grid",
+ analysisId
+ ))
})
-test_that('AdaBoost fit works', {
-
- modelSettings <- setAdaBoost(nEstimators = list(10),
- learningRate = list(0.1),
- )
-
+test_that("AdaBoost fit works", {
+ skip_if_not_installed("reticulate")
+ skip_on_cran()
+ modelSettings <- setAdaBoost(
+ nEstimators = list(10),
+    learningRate = list(0.1)
+ )
+
plpModel <- fitPlp(
- trainData = tinyTrainData,
+ trainData = tinyTrainData,
modelSettings = modelSettings,
- analysisId = 'Adaboost',
+ analysisId = "Adaboost",
analysisPath = tempdir()
)
-
+
expect_correct_fitPlp(plpModel, trainData)
- expect_equal(dir(plpModel$model),"model.json")
-
+ expect_equal(dir(plpModel$model), "model.json")
})
-test_that('RandomForest fit works', {
-
- modelSettings <- setRandomForest(ntrees=list(10),
- maxDepth=list(4),
- minSamplesSplit = list(2),
- minSamplesLeaf = list(10),
- mtries = list("sqrt"),
- maxSamples = list(0.9),
- classWeight = list(NULL))
-
+test_that("RandomForest fit works", {
+ skip_if_not_installed("reticulate")
+ skip_on_cran()
+ modelSettings <- setRandomForest(
+ ntrees = list(10),
+ maxDepth = list(4),
+ minSamplesSplit = list(2),
+ minSamplesLeaf = list(10),
+ mtries = list("sqrt"),
+ maxSamples = list(0.9),
+ classWeight = list(NULL)
+ )
+
plpModel <- fitPlp(
- trainData = tinyTrainData,
+ trainData = tinyTrainData,
modelSettings = modelSettings,
- analysisId = 'RandomForest',
+ analysisId = "RandomForest",
analysisPath = tempdir()
)
-
+
expect_correct_fitPlp(plpModel, trainData)
- expect_equal(dir(plpModel$model),"model.json")
-
+ expect_equal(dir(plpModel$model), "model.json")
})
-test_that('MLP fit works', {
+test_that("MLP fit works", {
+ skip_if_not_installed("reticulate")
+ skip_on_cran()
modelSettings <- setMLP(
hiddenLayerSizes = list(c(20)),
alpha = list(1e-6),
@@ -137,69 +141,76 @@ test_that('MLP fit works', {
learningRateInit = list(0.01),
tol = list(1e-2) # reduce tol so I don't get convergence warnings
)
-
+
plpModel <- fitPlp(
- trainData = tinyTrainData,
+ trainData = tinyTrainData,
modelSettings = modelSettings,
- analysisId = 'MLP',
+ analysisId = "MLP",
analysisPath = tempdir()
)
-
+
expect_correct_fitPlp(plpModel, trainData)
- expect_equal(dir(plpModel$model),"model.json")
-
+ expect_equal(dir(plpModel$model), "model.json")
})
-test_that('Naive bayes fit works', {
+test_that("Naive bayes fit works", {
+ skip_if_not_installed("reticulate")
+ skip_on_cran()
modelSettings <- setNaiveBayes()
-
+
plpModel <- fitPlp(
- trainData = tinyTrainData,
+ trainData = tinyTrainData,
modelSettings = modelSettings,
- analysisId = 'Naive bayes',
+ analysisId = "Naive bayes",
analysisPath = tempdir()
)
-
- expect_correct_fitPlp(plpModel, trainData)
- expect_equal(dir(plpModel$model),"model.json")
-
+
+ expect_correct_fitPlp(plpModel, trainData)
+ expect_equal(dir(plpModel$model), "model.json")
})
-test_that('Support vector machine fit works', {
- modelSettings <- setSVM(C = list(1),
- degree = list(1),
- gamma = list('scale'),
- classWeight = list(NULL))
-
+test_that("Support vector machine fit works", {
+ skip_if_not_installed("reticulate")
+ skip_on_cran()
+ modelSettings <- setSVM(
+ C = list(1),
+ degree = list(1),
+ gamma = list("scale"),
+ classWeight = list(NULL)
+ )
+
plpModel <- fitPlp(
- trainData = tinyTrainData,
+ trainData = tinyTrainData,
modelSettings = modelSettings,
- analysisId = 'SVM',
+ analysisId = "SVM",
analysisPath = tempdir()
)
-
- expect_correct_fitPlp(plpModel, trainData)
- expect_equal(dir(plpModel$model),"model.json")
-
+
+ expect_correct_fitPlp(plpModel, trainData)
+ expect_equal(dir(plpModel$model), "model.json")
})
-test_that('Sklearn predict works', {
-
- modelSettings <- setAdaBoost(nEstimators = list(10),
- learningRate = list(0.1),
+test_that("Sklearn predict works", {
+ skip_if_not_installed("reticulate")
+ skip_on_cran()
+ modelSettings <- setAdaBoost(
+ nEstimators = list(10),
+    learningRate = list(0.1)
)
-
+
plpModel <- fitPlp(
- trainData = tinyTrainData,
+ trainData = tinyTrainData,
modelSettings = modelSettings,
- analysisId = 'Adaboost',
+ analysisId = "Adaboost",
analysisPath = tempdir()
)
-
- predictions <- predictPythonSklearn(plpModel,
- testData,
- population)
+
+ predictions <- predictPythonSklearn(
+ plpModel,
+ testData,
+ population
+ )
expect_correct_predictions(predictions, testData)
})
diff --git a/tests/testthat/test-sklearnClassifierHelpers.R b/tests/testthat/test-sklearnClassifierHelpers.R
index 7467fb739..0b2e698d6 100644
--- a/tests/testthat/test-sklearnClassifierHelpers.R
+++ b/tests/testthat/test-sklearnClassifierHelpers.R
@@ -1,11 +1,9 @@
test_that("listCartesian works", {
-
- allList <- list(a=list(1,2), b=list(NULL, 'auto'), c=list(-1))
-
+ allList <- list(a = list(1, 2), b = list(NULL, "auto"), c = list(-1))
+
paramLists <- listCartesian(allList)
-
- expect_equal(length(paramLists), 2*2*1)
- expect_equal(names(paramLists[[1]]), c('a', 'b', 'c'))
+
+ expect_equal(length(paramLists), 2 * 2 * 1)
+ expect_equal(names(paramLists[[1]]), c("a", "b", "c"))
expect_equal(length(paramLists[[1]]), 3)
-
})
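For readers of this diff, the behavior the test pins down can be sketched in base R; `listCartesian()` itself is internal to the package, so this is an illustration of the expected cross-product, not its implementation:

```r
# Base-R sketch of the cross-product behavior tested above (not the
# package's implementation): every combination of candidate values,
# with the names of allList preserved on each combination.
allList <- list(a = list(1, 2), b = list(NULL, "auto"), c = list(-1))
grid <- expand.grid(lapply(allList, seq_along))
paramLists <- lapply(seq_len(nrow(grid)), function(i) {
  Map(function(values, j) values[[j]], allList, grid[i, ])
})
length(paramLists) # 4 = 2 * 2 * 1
names(paramLists[[1]]) # "a" "b" "c"
```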
diff --git a/tests/testthat/test-sklearnClassifierSettings.R b/tests/testthat/test-sklearnClassifierSettings.R
index 29c3f7f3d..c22b65403 100644
--- a/tests/testthat/test-sklearnClassifierSettings.R
+++ b/tests/testthat/test-sklearnClassifierSettings.R
@@ -1,57 +1,58 @@
test_that("setAdaBoost settings work checks", {
-
+ skip_if_not_installed("reticulate")
+ skip_on_cran()
adset <- setAdaBoost(
- nEstimators = list(10,50, 200),
- learningRate = list(1, 0.5, 0.1),
- algorithm = list('SAMME.R'),
- seed = sample(1000000,1)
+ nEstimators = list(10, 50, 200),
+ learningRate = list(1, 0.5, 0.1),
+ algorithm = list("SAMME"),
+ seed = sample(1000000, 1)
)
-
+
expect_equal(adset$fitFunction, "fitSklearn")
-
- expect_equal(length(adset$param), 3*3*1)
-
- expect_equal(unique(unlist(lapply(adset$param, function(x) x[[1]]))), c(10,50, 200))
+
+ expect_equal(length(adset$param), 3 * 3 * 1)
+
+ expect_equal(unique(unlist(lapply(adset$param, function(x) x[[1]]))), c(10, 50, 200))
expect_equal(unique(unlist(lapply(adset$param, function(x) x[[2]]))), c(1, 0.5, 0.1))
- expect_equal(unique(lapply(adset$param, function(x) x[[3]])), list('SAMME.R'))
-
- expect_false(attr(adset$param, 'settings')$requiresDenseMatrix)
- expect_equal(attr(adset$param, 'settings')$name, 'AdaBoost')
- expect_equal(attr(adset$param, 'settings')$pythonModule, 'sklearn.ensemble')
- expect_equal(attr(adset$param, 'settings')$pythonClass, "AdaBoostClassifier")
-
-
+ expect_equal(unique(lapply(adset$param, function(x) x[[3]])), list("SAMME"))
+
+ expect_false(attr(adset$param, "settings")$requiresDenseMatrix)
+ expect_equal(attr(adset$param, "settings")$name, "AdaBoost")
+ expect_equal(attr(adset$param, "settings")$pythonModule, "sklearn.ensemble")
+ expect_equal(attr(adset$param, "settings")$pythonClass, "AdaBoostClassifier")
+
+
inputs <- AdaBoostClassifierInputs(list, adset$param[[1]])
expect_equal(
- names(inputs),
- c("n_estimators","learning_rate","algorithm","random_state" )
- )
-
+ names(inputs),
+ c("n_estimators", "learning_rate", "algorithm", "random_state")
+ )
})
test_that("setAdaBoost errors as expected", {
-
+ skip_if_not_installed("reticulate")
+ skip_on_cran()
expect_error(setAdaBoost(nEstimators = list(-1)))
- expect_error(setAdaBoost(learningRate = list(-1)))
- expect_error(setAdaBoost(algorithm = list(-1)))
- expect_error(setAdaBoost(seed = list('seed')))
-
+ expect_error(setAdaBoost(learningRate = list(-1)))
+ expect_error(setAdaBoost(algorithm = list(-1)))
+ expect_error(setAdaBoost(seed = list("seed")))
})
test_that("setMLP settings work checks", {
-
+ skip_if_not_installed("reticulate")
+ skip_on_cran()
mlpset <- setMLP(
- hiddenLayerSizes = list(c(100), c(20,4)), #must be integers
- activation = list('relu'),
- solver = list('adam'),
- alpha = list(0.3,0.01,0.0001,0.000001),
- batchSize = list('auto'),
- learningRate = list('constant'),
+ hiddenLayerSizes = list(c(100), c(20, 4)), # must be integers
+ activation = list("relu"),
+ solver = list("adam"),
+ alpha = list(0.3, 0.01, 0.0001, 0.000001),
+ batchSize = list("auto"),
+ learningRate = list("constant"),
learningRateInit = list(0.001),
powerT = list(0.5),
- maxIter = list(200, 100),
+ maxIter = list(200, 100),
shuffle = list(TRUE),
tol = list(0.0001),
warmStart = list(TRUE),
@@ -59,69 +60,71 @@ test_that("setMLP settings work checks", {
nesterovsMomentum = list(TRUE),
earlyStopping = list(FALSE),
validationFraction = list(0.1),
- beta1 = list(0.9),
- beta2 = list(0.999),
- epsilon = list(1,0.1,0.00000001),
+ beta1 = list(0.9),
+ beta2 = list(0.999),
+ epsilon = list(1, 0.1, 0.00000001),
nIterNoChange = list(10),
- seed = sample(100000,1)
+ seed = sample(100000, 1)
)
-
+
expect_equal(mlpset$fitFunction, "fitSklearn")
-
- expect_equal(length(mlpset$param), 2*4*2*3)
-
- expect_equal(unique(lapply(mlpset$param, function(x) x[[1]])), list(c(100), c(20,4)))
- expect_equal(unique(unlist(lapply(mlpset$param, function(x) x[[2]]))), 'relu')
- expect_equal(unique(unlist(lapply(mlpset$param, function(x) x[[4]]))), c(0.3,0.01,0.0001,0.000001))
+
+ expect_equal(length(mlpset$param), 2 * 4 * 2 * 3)
+
+ expect_equal(unique(lapply(mlpset$param, function(x) x[[1]])), list(c(100), c(20, 4)))
+ expect_equal(unique(unlist(lapply(mlpset$param, function(x) x[[2]]))), "relu")
+ expect_equal(unique(unlist(lapply(mlpset$param, function(x) x[[4]]))), c(0.3, 0.01, 0.0001, 0.000001))
expect_equal(unique(lapply(mlpset$param, function(x) x[[9]])), list(200, 100))
-
- expect_false(attr(mlpset$param, 'settings')$requiresDenseMatrix)
- expect_equal(attr(mlpset$param, 'settings')$name, 'Neural Network')
- expect_equal(attr(mlpset$param, 'settings')$pythonModule, 'sklearn.neural_network')
- expect_equal(attr(mlpset$param, 'settings')$pythonClass, "MLPClassifier")
-
+
+ expect_false(attr(mlpset$param, "settings")$requiresDenseMatrix)
+ expect_equal(attr(mlpset$param, "settings")$name, "Neural Network")
+ expect_equal(attr(mlpset$param, "settings")$pythonModule, "sklearn.neural_network")
+ expect_equal(attr(mlpset$param, "settings")$pythonClass, "MLPClassifier")
+
inputs <- MLPClassifierInputs(list, mlpset$param[[1]])
expect_equal(
- names(inputs),
- c("hidden_layer_sizes", "activation", "solver", "alpha", "batch_size",
+ names(inputs),
+ c(
+ "hidden_layer_sizes", "activation", "solver", "alpha", "batch_size",
"learning_rate", "learning_rate_init", "power_t", "max_iter", "shuffle",
"random_state", "tol", "verbose", "warm_start", "momentum", "nesterovs_momentum",
"early_stopping", "validation_fraction", "beta_1", "beta_2", "epsilon",
- "n_iter_no_change" )
+ "n_iter_no_change"
+ )
)
})
test_that("setNaiveBayes settings work checks", {
-
- nbset <- setNaiveBayes(
- )
-
+ skip_if_not_installed("reticulate")
+ skip_on_cran()
+ nbset <- setNaiveBayes()
+
expect_equal(nbset$fitFunction, "fitSklearn")
-
+
expect_equal(length(nbset$param), 1)
-
- expect_true(attr(nbset$param, 'settings')$requiresDenseMatrix)
- expect_equal(attr(nbset$param, 'settings')$name, 'Naive Bayes')
- expect_equal(attr(nbset$param, 'settings')$pythonModule, 'sklearn.naive_bayes')
- expect_equal(attr(nbset$param, 'settings')$pythonClass, "GaussianNB")
-
+
+ expect_true(attr(nbset$param, "settings")$requiresDenseMatrix)
+ expect_equal(attr(nbset$param, "settings")$name, "Naive Bayes")
+ expect_equal(attr(nbset$param, "settings")$pythonModule, "sklearn.naive_bayes")
+ expect_equal(attr(nbset$param, "settings")$pythonClass, "GaussianNB")
+
inputs <- GaussianNBInputs(list, nbset$param[[1]])
- expect_equal(names(inputs),NULL)
-
+ expect_equal(names(inputs), NULL)
})
test_that("setRandomForest settings work checks", {
-
+ skip_if_not_installed("reticulate")
+ skip_on_cran()
rfset <- setRandomForest(
- ntrees = list(100,500),
- criterion = list('gini'),
- maxDepth = list(4,10,17),
- minSamplesSplit = list(2,5),
- minSamplesLeaf = list(1,10),
+ ntrees = list(100, 500),
+ criterion = list("gini"),
+ maxDepth = list(4, 10, 17),
+ minSamplesSplit = list(2, 5),
+ minSamplesLeaf = list(1, 10),
minWeightFractionLeaf = list(0),
- mtries = list('sqrt', 'log2'),
+ mtries = list("sqrt", "log2"),
maxLeafNodes = list(NULL),
minImpurityDecrease = list(0),
bootstrap = list(TRUE),
@@ -129,65 +132,70 @@ test_that("setRandomForest settings work checks", {
oobScore = list(FALSE),
nJobs = list(NULL),
classWeight = list(NULL),
- seed = sample(100000,1)
+ seed = sample(100000, 1)
)
-
+
expect_equal(rfset$fitFunction, "fitSklearn")
-
- expect_equal(length(rfset$param), 2*3*2*2*2*2*1)
-
- expect_equal(unique(lapply(rfset$param, function(x) x[[1]])), list(100,500))
- expect_equal(unique(unlist(lapply(rfset$param, function(x) x[[3]]))), c(4,10,17))
-
- expect_false(attr(rfset$param, 'settings')$requiresDenseMatrix)
- expect_equal(attr(rfset$param, 'settings')$name, 'Random forest')
- expect_equal(attr(rfset$param, 'settings')$pythonModule, 'sklearn.ensemble')
- expect_equal(attr(rfset$param, 'settings')$pythonClass, "RandomForestClassifier")
-
- inputs <- RandomForestClassifierInputs(list, rfset$param[[1]])
+
+ expect_equal(length(rfset$param), 2 * 3 * 2 * 2 * 2 * 2 * 1)
+
+ expect_equal(unique(lapply(rfset$param, function(x) x[[1]])), list(100, 500))
+ expect_equal(unique(unlist(lapply(rfset$param, function(x) x[[3]]))), c(4, 10, 17))
+
+ expect_false(attr(rfset$param, "settings")$requiresDenseMatrix)
+ expect_equal(attr(rfset$param, "settings")$name, "Random forest")
+ expect_equal(attr(rfset$param, "settings")$pythonModule, "sklearn.ensemble")
+ expect_equal(attr(rfset$param, "settings")$pythonClass, "RandomForestClassifier")
+
+ inputs <- RandomForestClassifierInputs(list, rfset$param[[1]])
expect_equal(
- names(inputs),
- c("n_estimators", "criterion", "max_depth", "min_samples_split", "min_samples_leaf",
+ names(inputs),
+ c(
+ "n_estimators", "criterion", "max_depth", "min_samples_split", "min_samples_leaf",
"min_weight_fraction_leaf", "max_features", "max_leaf_nodes", "min_impurity_decrease",
"bootstrap", "max_samples", "oob_score", "n_jobs", "random_state", "verbose",
- "warm_start","class_weight")
+ "warm_start", "class_weight"
+ )
)
})
test_that("setSVM settings work checks", {
-
- svmset <- setSVM (
- C = list(1,0.9,2,0.1),
- kernel = list('rbf'),
- degree = list(1,3,5),
- gamma = list('scale', 1e-04, 3e-05, 0.001, 0.01, 0.25),
+ skip_if_not_installed("reticulate")
+ skip_on_cran()
+ svmset <- setSVM(
+ C = list(1, 0.9, 2, 0.1),
+ kernel = list("rbf"),
+ degree = list(1, 3, 5),
+ gamma = list("scale", 1e-04, 3e-05, 0.001, 0.01, 0.25),
coef0 = list(0.0),
- shrinking = list(TRUE),
+ shrinking = list(TRUE),
tol = list(0.001),
- classWeight = list(NULL),
- cacheSize = 500,
- seed = sample(100000,1)
+ classWeight = list(NULL),
+ cacheSize = 500,
+ seed = sample(100000, 1)
)
-
+
expect_equal(svmset$fitFunction, "fitSklearn")
-
- expect_equal(length(svmset$param), 4*3*6*1)
-
- expect_equal(unique(lapply(svmset$param, function(x) x[[4]])), list('scale', 1e-04, 3e-05, 0.001, 0.01, 0.25))
- expect_equal(unique(unlist(lapply(svmset$param, function(x) x[[1]]))), c(1,0.9,2,0.1))
-
- expect_false(attr(svmset$param, 'settings')$requiresDenseMatrix)
- expect_equal(attr(svmset$param, 'settings')$name, 'Support Vector Machine')
- expect_equal(attr(svmset$param, 'settings')$pythonModule, 'sklearn.svm')
- expect_equal(attr(svmset$param, 'settings')$pythonClass, "SVC")
-
- inputs <- SVCInputs(list, svmset$param[[1]])
+
+ expect_equal(length(svmset$param), 4 * 3 * 6 * 1)
+
+ expect_equal(unique(lapply(svmset$param, function(x) x[[4]])), list("scale", 1e-04, 3e-05, 0.001, 0.01, 0.25))
+ expect_equal(unique(unlist(lapply(svmset$param, function(x) x[[1]]))), c(1, 0.9, 2, 0.1))
+
+ expect_false(attr(svmset$param, "settings")$requiresDenseMatrix)
+ expect_equal(attr(svmset$param, "settings")$name, "Support Vector Machine")
+ expect_equal(attr(svmset$param, "settings")$pythonModule, "sklearn.svm")
+ expect_equal(attr(svmset$param, "settings")$pythonClass, "SVC")
+
+ inputs <- SVCInputs(list, svmset$param[[1]])
expect_equal(
- names(inputs),
- c("C", "kernel", "degree", "gamma", "coef0",
- "shrinking", "probability", "tol", "cache_size",
+ names(inputs),
+ c(
+ "C", "kernel", "degree", "gamma", "coef0",
+ "shrinking", "probability", "tol", "cache_size",
"class_weight", "verbose", "max_iter", "decision_function_shape",
- "break_ties", "random_state")
+ "break_ties", "random_state"
+ )
)
})
diff --git a/tests/testthat/test-sklearnJson.R b/tests/testthat/test-sklearnJson.R
index 6d36b5f07..d8e85a2ba 100644
--- a/tests/testthat/test-sklearnJson.R
+++ b/tests/testthat/test-sklearnJson.R
@@ -1,131 +1,145 @@
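+# Module-level fixtures shared by the tests below. The is_installed() guard
+# lets this file be sourced on machines without reticulate; importing
+# sklearn and numpy still assumes a Python environment providing both.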
+if (rlang::is_installed("reticulate")) {
+ sklearn <- reticulate::import("sklearn", convert = FALSE)
+ np <- reticulate::import("numpy", convert = FALSE)
+ data <- sklearn$datasets$make_classification(
+ n_samples = 500L, n_features = 3L,
+ n_classes = 2L, n_informative = 3L,
+ n_redundant = 0L, random_state = 0L,
+ shuffle = FALSE
+ )
-sklearn <- reticulate::import('sklearn', convert=FALSE)
-np <- reticulate::import('numpy', convert=FALSE)
-
-data <- sklearn$datasets$make_classification(n_samples=500L, n_features=3L,
- n_classes=2L, n_informative=3L,
- n_redundant=0L, random_state=0L,
- shuffle=FALSE)
+ xUnseen <- sklearn$datasets$make_classification(
+ n_samples = 100L, n_features = 3L,
+ n_classes = 2L, n_informative = 3L,
+ n_redundant = 0L, random_state = 42L,
+ shuffle = FALSE
+ )[[0]]
+ X <- data[[0]]
+ y <- data[[1]]
+}
+test_that("Decision tree to json is correct", {
+ skip_if_not_installed("reticulate")
+ skip_on_cran()
+ classifier <- sklearn$tree$DecisionTreeClassifier(max_depth = 3L)
-X_unseen <- sklearn$datasets$make_classification(n_samples=100L, n_features=3L,
- n_classes=2L, n_informative=3L,
- n_redundant=0L, random_state=42L,
- shuffle=FALSE)[[0]]
-X <- data[[0]]
-y <- data[[1]]
+ model <- classifier$fit(X, y)
+ predictions <- reticulate::py_to_r(model$predict_proba(xUnseen))
+ path <- file.path(tempdir(), "model.json")
-test_that("Decision tree to json is correct", {
- classifier <- sklearn$tree$DecisionTreeClassifier(max_depth=3L)
-
- model <- classifier$fit(X,y)
- predictions <- reticulate::py_to_r(model$predict_proba(X_unseen))
- path <- file.path(tempdir(),"model.json")
-
sklearnToJson(model, path)
-
+
loadedModel <- sklearnFromJson(path)
-
- loadedPredictions <- reticulate::py_to_r(loadedModel$predict_proba(X_unseen))
-
+
+ loadedPredictions <- reticulate::py_to_r(loadedModel$predict_proba(xUnseen))
+
expect_true(all.equal(predictions, loadedPredictions))
})
test_that("Random forest to json is correct", {
- classifier <- sklearn$ensemble$RandomForestClassifier(n_estimators=10L)
-
- model <- classifier$fit(X,y)
- predictions <- reticulate::py_to_r(model$predict_proba(X_unseen))
- path <- file.path(tempdir(),"model.json")
-
+ skip_if_not_installed("reticulate")
+ skip_on_cran()
+ classifier <- sklearn$ensemble$RandomForestClassifier(n_estimators = 10L)
+
+ model <- classifier$fit(X, y)
+ predictions <- reticulate::py_to_r(model$predict_proba(xUnseen))
+ path <- file.path(tempdir(), "model.json")
+
sklearnToJson(model, path)
-
+
loadedModel <- sklearnFromJson(path)
-
- loadedPredictions <- reticulate::py_to_r(loadedModel$predict_proba(X_unseen))
-
+
+ loadedPredictions <- reticulate::py_to_r(loadedModel$predict_proba(xUnseen))
+
expect_true(all.equal(predictions, loadedPredictions))
})
test_that("Adaboost to json is correct", {
- classifier <- sklearn$ensemble$AdaBoostClassifier(n_estimators=10L)
-
- model <- classifier$fit(X,y)
- predictions <- reticulate::py_to_r(model$predict_proba(X_unseen))
+ skip_if_not_installed("reticulate")
+ skip_on_cran()
+ classifier <- sklearn$ensemble$AdaBoostClassifier(n_estimators = 10L)
+
+ model <- classifier$fit(X, y)
+ predictions <- reticulate::py_to_r(model$predict_proba(xUnseen))
path <- file.path(tempdir(), "model.json")
-
+
sklearnToJson(model, path)
-
+
loadedModel <- sklearnFromJson(path)
-
- loadedPredictions <- reticulate::py_to_r(loadedModel$predict_proba(X_unseen))
-
+
+ loadedPredictions <- reticulate::py_to_r(loadedModel$predict_proba(xUnseen))
+
expect_true(all.equal(predictions, loadedPredictions))
})
test_that("Naive Bayes to json is correct", {
+ skip_if_not_installed("reticulate")
+ skip_on_cran()
classifier <- sklearn$naive_bayes$GaussianNB()
-
- model <- classifier$fit(X,y)
- predictions <- reticulate::py_to_r(model$predict_proba(X_unseen))
+
+ model <- classifier$fit(X, y)
+ predictions <- reticulate::py_to_r(model$predict_proba(xUnseen))
path <- file.path(tempdir(), "model.json")
-
+
sklearnToJson(model, path)
-
+
loadedModel <- sklearnFromJson(path)
-
- loadedPredictions <- reticulate::py_to_r(loadedModel$predict_proba(X_unseen))
-
+
+ loadedPredictions <- reticulate::py_to_r(loadedModel$predict_proba(xUnseen))
+
expect_true(all.equal(predictions, loadedPredictions))
})
test_that("MLP to json is correct", {
+ skip_if_not_installed("reticulate")
+ skip_on_cran()
# lower tolerance to not get convergence warning
- classifier <- sklearn$neural_network$MLPClassifier(tol=1e-2)
-
- model <- classifier$fit(X,y)
- predictions <- reticulate::py_to_r(model$predict_proba(X_unseen))
+ classifier <- sklearn$neural_network$MLPClassifier(tol = 1e-2)
+
+ model <- classifier$fit(X, y)
+ predictions <- reticulate::py_to_r(model$predict_proba(xUnseen))
path <- file.path(tempdir(), "model.json")
-
+
sklearnToJson(model, path)
-
+
loadedModel <- sklearnFromJson(path)
-
- loadedPredictions <- reticulate::py_to_r(loadedModel$predict_proba(X_unseen))
-
+
+ loadedPredictions <- reticulate::py_to_r(loadedModel$predict_proba(xUnseen))
+
expect_true(all.equal(predictions, loadedPredictions))
})
test_that("SVM to json is correct", {
- classifier <- sklearn$svm$SVC(probability=TRUE)
-
- # create sparse data because then some of the internal fields in the
+ skip_if_not_installed("reticulate")
+ skip_on_cran()
+ classifier <- sklearn$svm$SVC(probability = TRUE)
+
+ # create sparse data because then some of the internal fields in the
# SVM will be sparse
- feature_hasher <- sklearn$feature_extraction$FeatureHasher(n_features=3L)
- random <- reticulate::import("random", convert=FALSE)
+ featureHasher <- sklearn$feature_extraction$FeatureHasher(n_features = 3L)
+ random <- reticulate::import("random", convert = FALSE)
features <- list()
- y_sparse <- np$empty(100L)
+ ySparse <- np$empty(100L)
for (i in 1:100) {
- row <- reticulate::dict(a=random$randint(0,2),
- b=random$randint(3,5),
- c=random$randint(6,8))
+ row <- reticulate::dict(
+ a = random$randint(0, 2),
+ b = random$randint(3, 5),
+ c = random$randint(6, 8)
+ )
features <- c(features, row)
- reticulate::py_set_item(y_sparse, i - 1L, random$randint(0, 2))
- }
- X_sparse <- feature_hasher$transform(features)
-
- model <- classifier$fit(X_sparse,y_sparse)
- predictions <- reticulate::py_to_r(model$predict_proba(X_unseen))
+ reticulate::py_set_item(ySparse, i - 1L, random$randint(0, 2))
+ }
+ xSparse <- featureHasher$transform(features)
+
+ model <- classifier$fit(xSparse, ySparse)
+ predictions <- reticulate::py_to_r(model$predict_proba(xUnseen))
path <- file.path(tempdir(), "model.json")
-
+
sklearnToJson(model, path)
-
- loadedModel <- sklearnFromJson(path)
-
- loadedPredictions <- reticulate::py_to_r(loadedModel$predict_proba(X_unseen))
-
- expect_true(all.equal(predictions, loadedPredictions))
-})
+ loadedModel <- sklearnFromJson(path)
+ loadedPredictions <- reticulate::py_to_r(loadedModel$predict_proba(xUnseen))
+ expect_true(all.equal(predictions, loadedPredictions))
+})
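Every test in this file follows the same serialize-and-compare pattern; compactly, it is (a sketch assuming a working reticulate/scikit-learn setup and the `X`, `y`, `xUnseen` fixtures defined at the top of the file):

```r
# Compact sketch of the round-trip pattern used throughout this file,
# assuming working reticulate/scikit-learn and the X, y, xUnseen fixtures.
model <- sklearn$naive_bayes$GaussianNB()$fit(X, y)
before <- reticulate::py_to_r(model$predict_proba(xUnseen))
path <- file.path(tempdir(), "roundtrip.json")
sklearnToJson(model, path) # serialize the fitted model to JSON
after <- reticulate::py_to_r(sklearnFromJson(path)$predict_proba(xUnseen))
all.equal(before, after) # TRUE when predictions survive the round trip
```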
diff --git a/tests/testthat/test-validation.R b/tests/testthat/test-validation.R
index 1fb3ec2e2..a66818b35 100644
--- a/tests/testthat/test-validation.R
+++ b/tests/testthat/test-validation.R
@@ -17,19 +17,18 @@
context("Validation")
# Test unit for the creation of the study externalValidatePlp
-modelVal <- loadPlpModel(file.path(saveLoc, 'Test', 'plpResult', 'model'))
-validationDatabaseDetailsVal <- databaseDetails # from run multiple tests
+modelVal <- loadPlpModel(file.path(saveLoc, "Test", "plpResult", "model"))
+validationDatabaseDetailsVal <- databaseDetails # from run multiple tests
validationRestrictPlpDataSettingsVal <- createRestrictPlpDataSettings(washoutPeriod = 0, sampleSize = NULL)
-recalSet <- createValidationSettings(recalibrate = 'weakRecalibration')
-saveLocation <- file.path(saveLoc, 'extern')
+recalSet <- createValidationSettings(recalibrate = "weakRecalibration")
+saveLocation <- file.path(saveLoc, "extern")
setEV <- function(
- model = modelVal,
- validationDatabaseDetails = validationDatabaseDetailsVal,
- validationRestrictPlpDataSettings = validationRestrictPlpDataSettingsVal,
- settings = recalSet,
- outputFolder = saveLocation
-){
+ model = modelVal,
+ validationDatabaseDetails = validationDatabaseDetailsVal,
+ validationRestrictPlpDataSettings = validationRestrictPlpDataSettingsVal,
+ settings = recalSet,
+ outputFolder = saveLocation) {
result <- externalValidateDbPlp(
plpModel = model,
validationDatabaseDetails = validationDatabaseDetails,
@@ -37,100 +36,109 @@ setEV <- function(
settings = settings,
outputFolder = outputFolder
)
-
+
return(result)
}
test_that("incorrect input externalValidateDbPlp checks work", {
-
# fails when plpResult is NULL
- expect_error(externalValidateDbPlp(setEV(model=NULL)))
+ expect_error(externalValidateDbPlp(setEV(model = NULL)))
# fails when plpResult is not class 'plpResult'
- expect_error(externalValidateDbPlp(setEV(model=list())))
-
-
+ expect_error(externalValidateDbPlp(setEV(model = list())))
+
+
expect_error(externalValidateDbPlp(
setEV(validationDatabaseDetails = NULL)
))
-
+
expect_error(externalValidateDbPlp(
setEV(validationRestrictPlpDataSettings = NULL)
))
-
+
expect_error(externalValidateDbPlp(
setEV(outputFolder = NULL)
))
-
-
})
test_that("external validate", {
-
exVal <- setEV()
- testthat::expect_equal(class(exVal[[1]]), 'externalValidatePlp')
-
+ testthat::expect_equal(class(exVal[[1]]), "externalValidatePlp")
})
test_that("fromDesignOrModel helper works", {
-
- settingName <- "restrictPlpDataSettings"
- validationDesign <- list(targetId = 1,
- outcomeId = 2,
- restrictPlpDataSettings = list(a = 1, b = 2)
- )
- modelDesigns <- list(
- list(targetId = 1,
- outcomeId = 2,
- restrictPlpDataSettings = list(a = 3, b = 4)
- ),
- list(targetId = 1,
- outcomeId = 2,
- restrictPlpDataSettings = list(a = 3, b = 4)
- )
- )
- output <- fromDesignOrModel(validationDesign, modelDesigns, settingName)
-
- expect_equal(output[[settingName]], list(a = 1, b = 2))
- validationDesign[[settingName]] <- NULL
- output <- fromDesignOrModel(validationDesign, modelDesigns, settingName)
- expect_equal(output[[settingName]], list(a = 3, b = 4))
+ settingName <- "restrictPlpDataSettings"
+ validationDesign <- list(
+ targetId = 1,
+ outcomeId = 2,
+ restrictPlpDataSettings = list(a = 1, b = 2)
+ )
+ modelDesigns <- list(
+ list(
+ targetId = 1,
+ outcomeId = 2,
+ restrictPlpDataSettings = list(a = 3, b = 4)
+ ),
+ list(
+ targetId = 1,
+ outcomeId = 2,
+ restrictPlpDataSettings = list(a = 3, b = 4)
+ )
+ )
+ output <- fromDesignOrModel(validationDesign, modelDesigns, settingName)
+ expect_equal(output[[settingName]], list(a = 1, b = 2))
+ validationDesign[[settingName]] <- NULL
+ output <- fromDesignOrModel(validationDesign, modelDesigns, settingName)
+ expect_equal(output[[settingName]], list(a = 3, b = 4))
})
test_that("createValidationDesign errors", {
-
- expect_error(createValidationDesign(targetId = NULL, outcomeId = 2,
- plpModelList = list()))
- expect_error(createValidationDesign(targetId = 1, outcomeId = NULL,
- plpModelList = list()))
- expect_error(createValidationDesign(targetId = "a", outcomeId = 2,
- plpModelList = list()))
- expect_error(createValidationDesign(targetId = 1, outcomeId = "a",
- plpModelList = list()))
- expect_error(createValidationDesign(targetId = 1, outcomeId = 2,
- plpModelList = list(),
- populationSettings = list()))
- expect_error(createValidationDesign(targetId = 1, outcomeId = 2,
- plpModelList = list(),
- recalibrate = 1))
- expect_error(createValidationDesign(targetId = 1, outcomeId = 2,
- plpModelList = list(),
- runCovariateSummary = 1))
+ expect_error(createValidationDesign(
+ targetId = NULL, outcomeId = 2,
+ plpModelList = list()
+ ))
+ expect_error(createValidationDesign(
+ targetId = 1, outcomeId = NULL,
+ plpModelList = list()
+ ))
+ expect_error(createValidationDesign(
+ targetId = "a", outcomeId = 2,
+ plpModelList = list()
+ ))
+ expect_error(createValidationDesign(
+ targetId = 1, outcomeId = "a",
+ plpModelList = list()
+ ))
+ expect_error(createValidationDesign(
+ targetId = 1, outcomeId = 2,
+ plpModelList = list(),
+ populationSettings = list()
+ ))
+ expect_error(createValidationDesign(
+ targetId = 1, outcomeId = 2,
+ plpModelList = list(),
+ recalibrate = 1
+ ))
+ expect_error(createValidationDesign(
+ targetId = 1, outcomeId = 2,
+ plpModelList = list(),
+ runCovariateSummary = 1
+ ))
})
test_that("createValidationDesign works with minimal required arguments", {
targetId <- 1
outcomeId <- 2
plpModelList <- list()
-
+
design <- createValidationDesign(
- targetId = targetId,
- outcomeId = outcomeId,
+ targetId = targetId,
+ outcomeId = outcomeId,
plpModelList = plpModelList
- )
+ )
expect_s3_class(design, "validationDesign")
expect_equal(design$targetId, targetId)
expect_equal(design$outcomeId, outcomeId)
@@ -141,20 +149,20 @@ test_that("single createValidationDesign works with all arguments", {
targetId <- 1
outcomeId <- 2
plpModelList <- list("model1", "model2")
- populationSettings <- createStudyPopulationSettings()
- restrictPlpDataSettings <- createRestrictPlpDataSettings() # Replace with actual restrictPlpDataSettings object
- recalibrate <- c("recalibrationInTheLarge")
+ populationSettings <- createStudyPopulationSettings()
+  restrictPlpDataSettings <- createRestrictPlpDataSettings()
+ recalibrate <- c("recalibrationInTheLarge")
runCovariateSummary <- FALSE
-
+
design <- createValidationDesign(
- targetId = targetId,
- outcomeId = outcomeId,
- plpModelList = plpModelList,
- populationSettings = populationSettings,
- restrictPlpDataSettings = restrictPlpDataSettings,
- recalibrate = recalibrate,
+ targetId = targetId,
+ outcomeId = outcomeId,
+ plpModelList = plpModelList,
+ populationSettings = populationSettings,
+ restrictPlpDataSettings = restrictPlpDataSettings,
+ recalibrate = recalibrate,
runCovariateSummary = runCovariateSummary
- )
+ )
expect_s3_class(design, "validationDesign")
expect_equal(design$targetId, targetId)
expect_equal(design$outcomeId, outcomeId)
@@ -170,13 +178,13 @@ test_that("createValidationDesigns correctly handles multiple restrictSettings",
outcomeId <- 2
plpModelList <- list()
restrictPlpDataSettings <- list(createRestrictPlpDataSettings(), createRestrictPlpDataSettings())
-
+
design <- createValidationDesign(
- targetId = targetId,
- outcomeId =outcomeId,
- plpModelList = plpModelList,
+ targetId = targetId,
+ outcomeId = outcomeId,
+ plpModelList = plpModelList,
restrictPlpDataSettings = restrictPlpDataSettings
- )
+ )
expect_s3_class(design[[1]], "validationDesign")
expect_equal(design[[1]]$targetId, targetId)
expect_equal(design[[1]]$outcomeId, outcomeId)
@@ -184,4 +192,167 @@ test_that("createValidationDesigns correctly handles multiple restrictSettings",
expect_equal(design[[1]]$restrictPlpDataSettings, restrictPlpDataSettings[[1]])
expect_equal(design[[2]]$restrictPlpDataSettings, restrictPlpDataSettings[[2]])
expect_equal(length(design), length(restrictPlpDataSettings))
-})
\ No newline at end of file
+})
+
+test_that("createValidationSettings errors with <10 outcomes", {
+ tinyRestrictPlpDataSettings <- createRestrictPlpDataSettings(
+    sampleSize = 30
+ )
+
+ validationDesign <- createValidationDesign(
+ targetId = 1,
+ outcomeId = 3,
+ plpModelList = list(modelVal),
+ restrictPlpDataSettings = tinyRestrictPlpDataSettings
+ )
+
+ expect_output(
+ validateExternal(
+ validationDesignList = validationDesign,
+ databaseDetails = databaseDetails,
+ logSettings = createLogSettings(),
+ outputFolder = saveLocation
+ ),
+ "skipping validation for design and database"
+ )
+})
+
+test_that("createDownloadTasks handles single design correctly", {
+ design <- createValidationDesign(
+ targetId = 1,
+ outcomeId = 2,
+ plpModelList = list(modelVal),
+ restrictPlpDataSettings = createRestrictPlpDataSettings()
+ )
+ result <- createDownloadTasks(list(design))
+ expect_s3_class(result, "data.frame")
+ expect_equal(nrow(result), 1)
+ expect_equal(ncol(result), 4)
+})
+
+test_that("createDownloadTasks handles multiple designs correctly", {
+ design1 <- createValidationDesign(
+ targetId = 1,
+ outcomeId = 2,
+ plpModelList = list(modelVal),
+ restrictPlpDataSettings = createRestrictPlpDataSettings()
+ )
+ design2 <- createValidationDesign(
+ targetId = 3,
+ outcomeId = 4,
+ plpModelList = list(modelVal),
+ restrictPlpDataSettings = createRestrictPlpDataSettings()
+ )
+ result <- createDownloadTasks(list(design1, design2))
+ expect_s3_class(result, "data.frame")
+ expect_equal(nrow(result), 2)
+ expect_equal(ncol(result), 4)
+})
+
+test_that("createDownloadTasks handles duplicated designs correctly", {
+ design <- createValidationDesign(
+ targetId = 1,
+ outcomeId = 2,
+ plpModelList = list(modelVal),
+ restrictPlpDataSettings = createRestrictPlpDataSettings()
+ )
+ result <- createDownloadTasks(list(design, design))
+ expect_s3_class(result, "data.frame")
+ expect_equal(nrow(result), 1)
+
+ design2 <- createValidationDesign(
+ targetId = 3,
+ outcomeId = 4,
+ plpModelList = list(modelVal),
+ restrictPlpDataSettings = createRestrictPlpDataSettings()
+ )
+
+ results <- createDownloadTasks(list(design, design2, design))
+ expect_s3_class(results, "data.frame")
+ expect_equal(nrow(results), 2)
+})
+
+test_that("createDownloadTasks with different restrictSettings", {
+ design <- createValidationDesign(
+ targetId = 1,
+ outcomeId = 2,
+ plpModelList = list(modelVal),
+ restrictPlpDataSettings = createRestrictPlpDataSettings()
+ )
+ design2 <- createValidationDesign(
+ targetId = 3,
+ outcomeId = 4,
+ plpModelList = list(modelVal),
+ restrictPlpDataSettings = createRestrictPlpDataSettings()
+ )
+ design3 <- createValidationDesign(
+ targetId = 1,
+ outcomeId = 2,
+ plpModelList = list(modelVal),
+ restrictPlpDataSettings = createRestrictPlpDataSettings(sampleSize = 100)
+ )
+
+ result <- createDownloadTasks(list(design, design2, design3))
+
+ expect_s3_class(result, "data.frame")
+ expect_equal(nrow(result), 3)
+})
+
+test_that("createDownloadTasks works with multiple outcomeIds", {
+ design1 <- createValidationDesign(
+ targetId = 1,
+ outcomeId = 2,
+ plpModelList = list(modelVal),
+ restrictPlpDataSettings = createRestrictPlpDataSettings()
+ )
+ design2 <- createValidationDesign(
+ targetId = 1,
+ outcomeId = 3,
+ plpModelList = list(modelVal),
+ restrictPlpDataSettings = createRestrictPlpDataSettings()
+ )
+ result <- createDownloadTasks(list(design1, design2))
+ expect_s3_class(result, "data.frame")
+ expect_equal(nrow(result), 1)
+ expect_equal(length(result[1, ]$outcomeIds[[1]]), 2)
+
+ design3 <- createValidationDesign(
+ targetId = 1,
+ outcomeId = 3,
+ plpModelList = list(modelVal),
+ restrictPlpDataSettings = createRestrictPlpDataSettings(sampleSize = 100)
+ )
+ result <- createDownloadTasks(list(design1, design2, design3))
+ expect_equal(nrow(result), 2)
+})
+
+test_that("createDownloadTasks with multiple covSettings", {
+ modelVal2 <- modelVal
+ design1 <- createValidationDesign(
+ targetId = 1,
+ outcomeId = 2,
+ plpModelList = list(modelVal),
+ restrictPlpDataSettings = createRestrictPlpDataSettings()
+ )
+ modelVal2$modelDesign$covariateSettings <-
+ FeatureExtraction::createCovariateSettings(useChads2 = TRUE)
+ design2 <- createValidationDesign(
+ targetId = 1,
+ outcomeId = 2,
+ plpModelList = list(modelVal2),
+ restrictPlpDataSettings = createRestrictPlpDataSettings()
+ )
+ result <- createDownloadTasks(list(design1, design2))
+ expect_equal(nrow(result), 1)
+ expect_equal(length(result[1, ]$covariateSettings[[1]]), 2)
+})
+
+test_that("createDownloadTasks when restrictSettings come from models", {
+ design1 <- createValidationDesign(
+ targetId = 1,
+ outcomeId = 2,
+ plpModelList = list(modelVal)
+ )
+ result <- createDownloadTasks(list(design1))
+ expect_s3_class(result[1, ]$restrictPlpDataSettings[[1]], "restrictPlpDataSettings")
+})
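The deduplication these `createDownloadTasks` tests establish can be summarized in one sketch (reusing the `modelVal` fixture from this file; `createDownloadTasks()` is internal, so the point is the expected collapsing, not a public API):

```r
# Sketch of the dedup behavior tested above: designs sharing a targetId and
# extraction settings collapse into one download task with merged outcomeIds.
designs <- list(
  createValidationDesign(
    targetId = 1, outcomeId = 2, plpModelList = list(modelVal),
    restrictPlpDataSettings = createRestrictPlpDataSettings()
  ),
  createValidationDesign(
    targetId = 1, outcomeId = 3, plpModelList = list(modelVal),
    restrictPlpDataSettings = createRestrictPlpDataSettings()
  )
)
tasks <- createDownloadTasks(designs)
nrow(tasks) # 1: one data extraction serves both designs
tasks$outcomeIds[[1]] # both outcomes, 2 and 3
```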
diff --git a/vignettes/AddingCustomFeatureEngineering.Rmd b/vignettes/AddingCustomFeatureEngineering.Rmd
index 0ec7c921f..c0d11df14 100644
--- a/vignettes/AddingCustomFeatureEngineering.Rmd
+++ b/vignettes/AddingCustomFeatureEngineering.Rmd
@@ -2,30 +2,13 @@
title: "Adding Custom Feature Engineering Functions"
author: "Jenna Reps, Egill Fridgeirsson"
date: "`r Sys.Date()`"
-header-includes:
- - \usepackage{fancyhdr}
- - \pagestyle{fancy}
- - \fancyhead{}
- - \fancyhead[CO,CE]{Custom Patient-Level Prediction Algorithms}
- - \fancyfoot[CO,CE]{PatientLevelPrediction Package Version `r utils::packageVersion("PatientLevelPrediction")`}
- - \fancyfoot[LE,RO]{\thepage}
- - \renewcommand{\headrulewidth}{0.4pt}
- - \renewcommand{\footrulewidth}{0.4pt}
-output:
- pdf_document:
- number_sections: yes
- toc: yes
- html_document:
- number_sections: yes
- toc: yes
+output: rmarkdown::html_vignette
+vignette: >
+ %\VignetteIndexEntry{Adding Custom Feature Engineering Functions}
+ %\VignetteEngine{knitr::rmarkdown}
+ %\VignetteEncoding{UTF-8}
---
-```{=html}
-
-```
```{r, echo = FALSE, message = FALSE, warning = FALSE}
library(PatientLevelPrediction)
```
diff --git a/vignettes/AddingCustomModels.Rmd b/vignettes/AddingCustomModels.Rmd
index c7d187625..dc52dc15d 100644
--- a/vignettes/AddingCustomModels.Rmd
+++ b/vignettes/AddingCustomModels.Rmd
@@ -2,28 +2,14 @@
title: "Adding Custom Patient-Level Prediction Algorithms"
author: "Jenna Reps, Martijn J. Schuemie, Patrick B. Ryan, Peter R. Rijnbeek"
date: "`r Sys.Date()`"
-header-includes:
- - \usepackage{fancyhdr}
- - \pagestyle{fancy}
- - \fancyhead{}
- - \fancyhead[CO,CE]{Custom Patient-Level Prediction Algorithms}
- - \fancyfoot[CO,CE]{PatientLevelPrediction Package Version `r utils::packageVersion("PatientLevelPrediction")`}
- - \fancyfoot[LE,RO]{\thepage}
- - \renewcommand{\headrulewidth}{0.4pt}
- - \renewcommand{\footrulewidth}{0.4pt}
-output:
- pdf_document:
- number_sections: yes
- toc: yes
- html_document:
- number_sections: yes
- toc: yes
+output: rmarkdown::html_vignette
+vignette: >
+  %\VignetteIndexEntry{Adding Custom Patient-Level Prediction Algorithms}
+  %\VignetteEngine{knitr::rmarkdown}
+  %\VignetteEncoding{UTF-8}
---
```{=html}
```
```{r, echo = FALSE, message = FALSE, warning = FALSE}
diff --git a/vignettes/AddingCustomSamples.Rmd b/vignettes/AddingCustomSamples.Rmd
index 93241c085..fe6c1c3a2 100644
--- a/vignettes/AddingCustomSamples.Rmd
+++ b/vignettes/AddingCustomSamples.Rmd
@@ -2,31 +2,14 @@
title: "Adding Custom Sampling Functions"
author: "Jenna Reps"
date: "`r Sys.Date()`"
-header-includes:
- - \usepackage{fancyhdr}
- - \pagestyle{fancy}
- - \fancyhead{}
- - \fancyhead[CO,CE]{Custom Patient-Level Prediction Algorithms}
- - \fancyfoot[LE,RO]{\thepage}
- - \renewcommand{\headrulewidth}{0.4pt}
- - \renewcommand{\footrulewidth}{0.4pt}
- - \fancyfoot[CO,CE]{PatientLevelPrediction Package Version `r utils::packageVersion("PatientLevelPrediction")`}
-output:
- pdf_document:
- number_sections: yes
- toc: yes
- html_document:
- number_sections: yes
- toc: yes
-editor_options:
- markdown:
- wrap: 72
+output: rmarkdown::html_vignette
+vignette: >
+  %\VignetteIndexEntry{Adding Custom Sampling Functions}
+  %\VignetteEngine{knitr::rmarkdown}
+  %\VignetteEncoding{UTF-8}
---
```{=html}
```
```{r, echo = FALSE, message = FALSE, warning = FALSE}
diff --git a/vignettes/AddingCustomSplitting.Rmd b/vignettes/AddingCustomSplitting.Rmd
index d0e318d1c..fa687d2da 100644
--- a/vignettes/AddingCustomSplitting.Rmd
+++ b/vignettes/AddingCustomSplitting.Rmd
@@ -1,29 +1,15 @@
---
-title: "Adding Custom Data Splitting Functions"
+title: "Adding Custom Data Splitting"
author: "Jenna Reps"
date: "`r Sys.Date()`"
-header-includes:
- - \usepackage{fancyhdr}
- - \pagestyle{fancy}
- - \fancyhead{}
- - \fancyhead[CO,CE]{Custom Patient-Level Prediction Algorithms}
- - \fancyfoot[CO,CE]{PatientLevelPrediction Package Version `r utils::packageVersion("PatientLevelPrediction")`}
- - \fancyfoot[LE,RO]{\thepage}
- - \renewcommand{\headrulewidth}{0.4pt}
- - \renewcommand{\footrulewidth}{0.4pt}
-output:
- pdf_document:
- number_sections: yes
- toc: yes
- html_document:
- number_sections: yes
- toc: yes
+output: rmarkdown::html_vignette
+vignette: >
+  %\VignetteIndexEntry{Adding Custom Data Splitting}
+  %\VignetteEngine{knitr::rmarkdown}
+  %\VignetteEncoding{UTF-8}
---
```{=html}
```
```{r, echo = FALSE, message = FALSE, warning = FALSE}
diff --git a/vignettes/BenchmarkTasks.Rmd b/vignettes/BenchmarkTasks.Rmd
index d046b5890..725b2a62a 100644
--- a/vignettes/BenchmarkTasks.Rmd
+++ b/vignettes/BenchmarkTasks.Rmd
@@ -2,32 +2,14 @@
title: "Benchmark Tasks"
author: "Jenna Reps, Ross Williams, Peter R. Rijnbeek"
date: '`r Sys.Date()`'
-header-includes:
- - \usepackage{fancyhdr}
- - \pagestyle{fancy}
- - \fancyhead{}
- - \fancyhead[CO,CE]{Installation Guide}
- - \fancyfoot[CO,CE]{PatientLevelPrediction Package Version `r utils::packageVersion("PatientLevelPrediction")`}
- - \fancyfoot[LE,RO]{\thepage}
- - \renewcommand{\headrulewidth}{0.4pt}
- - \renewcommand{\footrulewidth}{0.4pt}
-output:
- pdf_document:
- includes:
- in_header: preamble.tex
- number_sections: yes
- toc: yes
- word_document:
- toc: yes
- html_document:
- number_sections: yes
- toc: yes
+output: rmarkdown::html_vignette
+vignette: >
+  %\VignetteIndexEntry{Benchmark Tasks}
+  %\VignetteEngine{knitr::rmarkdown}
+  %\VignetteEncoding{UTF-8}
---
```{=html}
```
## Benchmark Tasks For Large-Scale Empirical Analyses
diff --git a/vignettes/BestPractices.Rmd b/vignettes/BestPractices.Rmd
index 1fba21806..bcb2c152b 100644
--- a/vignettes/BestPractices.Rmd
+++ b/vignettes/BestPractices.Rmd
@@ -2,32 +2,14 @@
title: "Best Practice Research"
author: "Jenna Reps, Peter R. Rijnbeek"
date: '`r Sys.Date()`'
-header-includes:
- - \usepackage{fancyhdr}
- - \pagestyle{fancy}
- - \fancyhead{}
- - \fancyhead[CO,CE]{Installation Guide}
- - \fancyfoot[CO,CE]{PatientLevelPrediction Package Version `r utils::packageVersion("PatientLevelPrediction")`}
- - \fancyfoot[LE,RO]{\thepage}
- - \renewcommand{\headrulewidth}{0.4pt}
- - \renewcommand{\footrulewidth}{0.4pt}
-output:
- pdf_document:
- includes:
- in_header: preamble.tex
- number_sections: yes
- toc: yes
- word_document:
- toc: yes
- html_document:
- number_sections: yes
- toc: yes
+output: rmarkdown::html_vignette
+vignette: >
+  %\VignetteIndexEntry{Best Practice Research}
+  %\VignetteEngine{knitr::rmarkdown}
+  %\VignetteEncoding{UTF-8}
---
```{=html}
```
## Best practice publications using the OHDSI PatientLevelPrediction framework
diff --git a/vignettes/BuildingMultiplePredictiveModels.Rmd b/vignettes/BuildingMultiplePredictiveModels.Rmd
index 93206f3c8..604fa428b 100644
--- a/vignettes/BuildingMultiplePredictiveModels.Rmd
+++ b/vignettes/BuildingMultiplePredictiveModels.Rmd
@@ -2,32 +2,14 @@
title: "Automatically Build Multiple Patient-Level Predictive Models"
author: "Jenna Reps, Martijn J. Schuemie, Patrick B. Ryan, Peter R. Rijnbeek"
date: '`r Sys.Date()`'
-header-includes:
- - \usepackage{fancyhdr}
- - \pagestyle{fancy}
- - \fancyhead{}
- - \fancyhead[CO,CE]{Automatically Build Multiple Patient-Level Predictive Models}
- - \fancyfoot[CO,CE]{PatientLevelPrediction Package Version `r utils::packageVersion("PatientLevelPrediction")`}
- - \fancyfoot[LE,RO]{\thepage}
- - \renewcommand{\headrulewidth}{0.4pt}
- - \renewcommand{\footrulewidth}{0.4pt}
-output:
- pdf_document:
- includes:
- in_header: preamble.tex
- number_sections: yes
- toc: yes
- word_document:
- toc: yes
- html_document:
- number_sections: yes
- toc: yes
+output: rmarkdown::html_vignette
+vignette: >
+  %\VignetteIndexEntry{Automatically Build Multiple Patient-Level Predictive Models}
+  %\VignetteEngine{knitr::rmarkdown}
+  %\VignetteEncoding{UTF-8}
---
```{=html}
```
# Introduction
diff --git a/vignettes/BuildingPredictiveModels.Rmd b/vignettes/BuildingPredictiveModels.Rmd
index 9e5099cc2..ea293d39a 100644
--- a/vignettes/BuildingPredictiveModels.Rmd
+++ b/vignettes/BuildingPredictiveModels.Rmd
@@ -2,32 +2,18 @@
title: "Building patient-level predictive models"
author: "Jenna Reps, Martijn J. Schuemie, Patrick B. Ryan, Peter R. Rijnbeek"
date: '`r Sys.Date()`'
-header-includes:
- - \usepackage{fancyhdr}
- - \pagestyle{fancy}
- - \fancyhead{}
- - \fancyhead[CO,CE]{Installation Guide}
- - \fancyfoot[CO,CE]{PatientLevelPrediction Package Version `r utils::packageVersion("PatientLevelPrediction")`}
- - \fancyfoot[LE,RO]{\thepage}
- - \renewcommand{\headrulewidth}{0.4pt}
- - \renewcommand{\footrulewidth}{0.4pt}
-output:
- pdf_document:
- includes:
- in_header: preamble.tex
- number_sections: yes
- toc: yes
- word_document:
- toc: yes
- html_document:
- number_sections: yes
- toc: yes
+output: rmarkdown::html_vignette
+vignette: >
+ %\VignetteEngine{knitr::rmarkdown}
+ %\VignetteIndexEntry{Building patient-level predictive models}
+ %\VignetteEncoding{UTF-8}
---
-
+```{css, echo=FALSE}
+img {
+ max-width: 100%;
+}
+```
```{r echo=FALSE,message=FALSE,warning=FALSE,eval=TRUE}
library(PatientLevelPrediction)
@@ -53,13 +39,13 @@ In our [`paper`](https://academic.oup.com/jamia/article/25/8/969/4989437), we pr
Figure 1 illustrates the prediction problem we address. Among a population at risk, we aim to predict which patients at a defined moment in time (t = 0) will experience some outcome during a time-at-risk. Prediction is done using only information about the patients in an observation window prior to that moment in time.
-![The prediction problem](Figure1.webp)
+![The prediction problem](./images/Figure1.avif)
As shown in Figure 2, to define a prediction problem we have to define t=0 by a Target Cohort (T), the outcome we would like to predict by an outcome cohort (O), and the time-at-risk (TAR). Furthermore, we have to make design choices for the model we would like to develop, and determine the observational datasets to perform internal and external validation. This conceptual framework works for all types of prediction problems, for example those presented in Figure 3.
-![Design choices](studydesign.webp)
+![Design choices](./images/studydesign.avif)
-![Examples of prediction problems](problems.webp)
+![Examples of prediction problems](./images/problems.avif)
This vignette describes how you can use the `PatientLevelPrediction` package to build patient-level predictive models. The package enables data extraction, model building, and model evaluation using data from databases that are translated into the OMOP CDM. In this vignette we assume you have installed the package correctly by following the `vignette('InstallationGuide')`.
@@ -114,7 +100,6 @@ tabl <- "
| Regularized Logistic Regression | Lasso logistic regression belongs to the family of generalized linear models, where a linear combination of the variables is learned and finally a logistic function maps the linear combination to a value between 0 and 1. The lasso regularization adds a cost based on model complexity to the objective function when training the model. This cost is the sum of the absolute values of the linear combination of the coefficients. The model automatically performs feature selection by minimizing this cost. We use the Cyclic coordinate descent for logistic, Poisson and survival analysis (Cyclops) package to perform large-scale regularized logistic regression: https://github.com/OHDSI/Cyclops | var (starting variance), seed |
| Gradient boosting machines | Gradient boosting machines is a boosting ensemble technique and in our framework it combines multiple decision trees. Boosting works by iteratively adding decision trees but adds more weight to the data-points that are misclassified by prior decision trees in the cost function when training the next tree. We use Extreme Gradient Boosting, which is an efficient implementation of the gradient boosting framework implemented in the xgboost R package available from CRAN. | ntree (number of trees), max depth (max levels in tree), min rows (minimum data points in a node), learning rate, balance (balance class labels), seed |
| Random forest | Random forest is a bagging ensemble technique that combines multiple decision trees. The idea behind bagging is to reduce the likelihood of overfitting by using weak classifiers and combining multiple diverse weak classifiers into a strong classifier. Random forest accomplishes this by training multiple decision trees, but only using a subset of the variables in each tree, and the subset of variables differs between trees. Our package uses the scikit-learn implementation of Random Forest in Python. | mtry (number of features in each tree), ntree (number of trees), maxDepth (max levels in tree), minRows (minimum data points in a node), balance (balance class labels), seed |
-| K-nearest neighbors | K-nearest neighbors (KNN) is an algorithm that uses some metric to find the K closest labelled data-points, given the specified metric, to a new unlabelled data-point. The prediction of the new data-points is then the most prevalent class of the K-nearest labelled data-points. There is a sharing limitation of KNN, as the model requires labelled data to perform the prediction on new data, and it is often not possible to share this data across data sites.We included the BigKnn classifier developed in OHDSI which is a large scale k-nearest neighbor classifier using the Lucene search engine: https://github.com/OHDSI/BigKnn | k (number of neighbours),weighted (weight by inverse frequency) |
| Naive Bayes | The Naive Bayes algorithm applies Bayes' theorem with the 'naive' assumption of conditional independence between every pair of features given the value of the class variable. Based on the likelihood that the data belongs to a class and the prior distribution of the class, a posterior distribution is obtained. | none |
| AdaBoost | AdaBoost is a boosting ensemble technique. Boosting works by iteratively adding classifiers, giving more weight in the cost function to the data points that were misclassified by prior classifiers when training the next classifier. We use the sklearn 'AdaboostClassifier' implementation in Python. | nEstimators (the maximum number of estimators at which boosting is terminated), learningRate (shrinks the contribution of each classifier; there is a trade-off between learningRate and nEstimators) |
| Decision Tree | A decision tree is a classifier that partitions the variable space using individual tests selected with a greedy approach. It aims to find partitions that have the highest information gain to separate the classes. A decision tree can easily overfit when a large number of partitions (tree depth) is enabled and often needs some regularization (e.g., pruning or specifying hyper-parameters that limit the complexity of the model). We use the sklearn 'DecisionTreeClassifier' implementation in Python. | maxDepth (the maximum depth of the tree), minSamplesSplit, minSamplesLeaf, minImpuritySplit (threshold for early stopping in tree growth; a node will split if its impurity is above the threshold, otherwise it is a leaf), seed, classWeight ('Balance' or 'None') |
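
As a minimal sketch, two of these classifiers could be configured with the package's settings functions; `setLassoLogisticRegression()` appears elsewhere in this vignette, while `setGradientBoostingMachine()` and the hyper-parameter values shown are assumptions for illustration:

```r
# Illustrative settings objects for two classifiers from the table above
lassoSettings <- setLassoLogisticRegression(seed = 42)
gbmSettings <- setGradientBoostingMachine(
  ntrees = 300,    # number of trees
  maxDepth = 6,    # max levels in each tree
  learnRate = 0.1, # learning rate
  seed = 42
)
```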
@@ -188,11 +173,11 @@ Both methods are described below for our example prediction problem.
### ATLAS cohort builder
-![Figure 4: Target Cohort Atrial Fibrillation](example1/ATLAS_T.webp)
+![Figure 4: Target Cohort Atrial Fibrillation](example1/ATLAS_T.avif)
ATLAS allows you to define cohorts interactively by specifying cohort entry and cohort exit criteria. Cohort entry criteria involve selecting one or more initial events, which determine the start date for cohort entry, and optionally specifying additional inclusion criteria which filter to the qualifying events. Cohort exit criteria are applied to each cohort entry record to determine the end date when the person's episode no longer qualifies for the cohort. For the outcome cohort the end date is less relevant. As an example, Figure 4 shows how we created the Atrial Fibrillation cohort and Figure 5 shows how we created the stroke cohort in ATLAS.
-![Figure 5: Outcome Cohort Stroke](example1/ATLAS_O.webp)
+![Figure 5: Outcome Cohort Stroke](example1/ATLAS_O.avif)
The T and O cohorts can be found here:
@@ -611,11 +596,11 @@ Both methods are described below for our example prediction problem.
### ATLAS cohort builder
-![Target Cohort ACE inhibitors](example2/aceinhibitors.webp)
+![Target Cohort ACE inhibitors](example2/aceinhibitors.avif)
ATLAS allows you to define cohorts interactively by specifying cohort entry and cohort exit criteria. Cohort entry criteria involve selecting one or more initial events, which determine the start date for cohort entry, and optionally specifying additional inclusion criteria which filter to the qualifying events. Cohort exit criteria are applied to each cohort entry record to determine the end date when the person's episode no longer qualifies for the cohort. For the outcome cohort the end date is less relevant. As an example, Figure 6 shows how we created the ACE inhibitors cohort and Figure 7 shows how we created the angioedema cohort in ATLAS.
-![Outcome Cohort Angioedema](example2/angioedema.webp)
+![Outcome Cohort Angioedema](example2/angioedema.avif)
The T and O cohorts can be found here:
@@ -845,7 +830,7 @@ populationSettings <- createStudyPopulationSettings(
When developing a prediction model using supervised learning (when you have features paired with labels for a set of patients), the first step is to design the development/internal validation process. This requires specifying how to select the model hyper-parameters, how to learn the model parameters and how to fairly evaluate the model. In general, the validation set is used to pick hyper-parameters, the training set is used to learn the model parameters and the test set is used to perform fair internal validation. However, cross-validation can be implemented to pick the hyper-parameters on the training data (so a validation data set is not required). Cross validation can also be used to estimate internal validation (so a testing data set is not required).
-In small data the best approach for internal validation has been shown to be boostrapping. However, in big data (many patients and many features) bootstrapping is generally not feasible. In big data our research has shown that it is just important to have some form of fair evaluation (use a test set or cross validation). For full details see [our BMJ open paper](add%20link).
+In small data the best approach for internal validation has been shown to be bootstrapping. However, in big data (many patients and many features) bootstrapping is generally not feasible. In big data our research has shown that what matters is having some form of fair evaluation (a test set or cross validation). For full details see [our BMJ open paper](https://bmjopen.bmj.com/content/11/12/e050146.long).
In the PatientLevelPrediction package, the splitSettings define how the plpData are partitioned into training/validation/testing data. Cross validation is always done, but using a test set is optional (when the data are small, it may be optimal not to use a test set). For the splitSettings we can use the type (stratified/time/subject) and testFraction parameters to split the data into a 75%-25% train-test split and run the patient-level prediction pipeline:
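
For example, a minimal sketch of such settings using `createDefaultSplitSetting()` (the seed value is illustrative):

```r
splitSettings <- createDefaultSplitSetting(
  type = "stratified",  # stratify the splits by outcome
  testFraction = 0.25,  # 25% of subjects held out as the test set
  trainFraction = 0.75, # 75% used for training/cross validation
  nfold = 3,            # 3-fold cross validation to pick hyper-parameters
  splitSeed = 1234
)
```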
@@ -968,7 +953,7 @@ The script we created manually above can also be automatically created using a p
Create a new prediction study and select your target and outcome cohorts.
-
![](atlasplp1.webp)
+
![](images/atlasplp1.avif)
@@ -976,7 +961,7 @@ The script we created manually above can also be automatically created using a p
Specify one or more analysis settings.
-
![](atlasplp2.web)
+
![](images/atlasplp2.avif)
@@ -986,7 +971,7 @@ The script we created manually above can also be automatically created using a p
Specify the training settings
-
![](atlasplp3.webp)
+
![](images/atlasplp3.avif)
@@ -994,7 +979,7 @@ The script we created manually above can also be automatically created using a p
Specify the execution settings
-
![](atlasplp4.web)
+
![](images/atlasplp4.avif)
@@ -1008,7 +993,7 @@ ATLAS can build a R package for you that will execute the full study against you
- ![R package download functionality in ATLAS](atlasdownload1.webp)
+ ![R package download functionality in ATLAS](images/atlasdownload1.avif)
@@ -1020,7 +1005,7 @@ ATLAS can build a R package for you that will execute the full study against you
- ![R package download functionality in ATLAS](atlasdownload2.webp)
+ ![R package download functionality in ATLAS](images/atlasdownload2.avif)
@@ -1038,7 +1023,7 @@ You can interactively view the results by running: `viewPlp(runPlp=lrResults)`.
-![Summary of all the performance measures of the analyses](shinysummary.webp)
+![Summary of all the performance measures of the analyses](images/shinysummary.avif)
@@ -1046,7 +1031,7 @@ Furthermore, many interactive plots are available in the Shiny App, for example
-![Example of the interactive ROC curve](shinyroc.webp)
+![Example of the interactive ROC curve](images/shinyroc.avif)
@@ -1066,7 +1051,7 @@ The Receiver Operating Characteristics (ROC) plot shows the sensitivity against
@@ -1290,7 +1275,7 @@ In the figures below the effect is shown of the removeSubjectsWithPriorOutcome,
Require minimum time-at-risk for all persons in the target cohort
-
![](popdef1.webp)
+
![](images/popdef1.avif)
@@ -1298,7 +1283,7 @@ In the figures below the effect is shown of the removeSubjectsWithPriorOutcome,
Require minimum time-at-risk for the target cohort, except for persons with outcomes during the time-at-risk.
-
![](popdef2.webp)
+
![](images/popdef2.avif)
@@ -1311,7 +1296,7 @@ In the figures below the effect is shown of the removeSubjectsWithPriorOutcome,
Include all persons in the target cohort; exclude persons with prior outcomes
-
![](popdef3.webp)
+
![](images/popdef3.avif)
@@ -1319,7 +1304,7 @@ In the figures below the effect is shown of the removeSubjectsWithPriorOutcome,
Require minimum time-at-risk for the target cohort, except for persons with outcomes during the time-at-risk; exclude persons with prior outcomes
-
![](popdef4.webp)
+
![](images/popdef4.avif)
@@ -1332,7 +1317,7 @@ In the figures below the effect is shown of the removeSubjectsWithPriorOutcome,
Include all persons in the target cohort; exclude persons with prior outcomes
-
![](popdef5.webp)
+
![](images/popdef5.avif)
@@ -1340,6 +1325,6 @@ In the figures below the effect is shown of the removeSubjectsWithPriorOutcome,
Include all persons in the target cohort
-
![](popdef6.webp)
+
![](images/popdef6.avif)
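
As a sketch, the first and last of these designs map onto `createStudyPopulationSettings()` roughly as follows; `removeSubjectsWithPriorOutcome` is the argument discussed above, while `requireTimeAtRisk` and `minTimeAtRisk` are assumed argument names shown for illustration:

```r
# Sketch: require a minimum time-at-risk for everyone in the target cohort
popStrict <- createStudyPopulationSettings(
  requireTimeAtRisk = TRUE,
  minTimeAtRisk = 364,
  removeSubjectsWithPriorOutcome = FALSE
)

# Sketch: include all persons in the target cohort, with no restrictions
popAll <- createStudyPopulationSettings(
  requireTimeAtRisk = FALSE,
  removeSubjectsWithPriorOutcome = FALSE
)
```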
diff --git a/vignettes/ClinicalModels.Rmd b/vignettes/ClinicalModels.Rmd
index 3b6a5e5ae..65e747dd3 100644
--- a/vignettes/ClinicalModels.Rmd
+++ b/vignettes/ClinicalModels.Rmd
@@ -2,35 +2,13 @@
title: "Clinical Models"
author: "Jenna Reps, Peter R. Rijnbeek"
date: '`r Sys.Date()`'
-header-includes:
- - \usepackage{fancyhdr}
- - \pagestyle{fancy}
- - \fancyhead{}
- - \fancyhead[CO,CE]{Installation Guide}
- - \fancyfoot[CO,CE]{PatientLevelPrediction Package Version `r utils::packageVersion("PatientLevelPrediction")`}
- - \fancyfoot[LE,RO]{\thepage}
- - \renewcommand{\headrulewidth}{0.4pt}
- - \renewcommand{\footrulewidth}{0.4pt}
-output:
- pdf_document:
- includes:
- in_header: preamble.tex
- number_sections: yes
- toc: yes
- word_document:
- toc: yes
- html_document:
- number_sections: yes
- toc: yes
+output: rmarkdown::html_vignette
+vignette: >
+ %\VignetteEngine{knitr::rmarkdown}
+ %\VignetteIndexEntry{Clinical Models}
+ %\VignetteEncoding{UTF-8}
---
-```{=html}
-
-```
-
## Clinical models developed using the OHDSI PatientLevelPrediction framework
| Title | Link |
@@ -43,4 +21,4 @@ output:
| Seek COVER: using a disease proxy to rapidly develop and validate a personalized risk calculator for COVID-19 outcomes in an international network | [BMC Medical Research Methodology](https://link.springer.com/article/10.1186/s12874-022-01505-z) |
| 90-Day all-cause mortality can be predicted following a total knee replacement: an international, network study to develop and validate a prediction model | [Knee Surgery, Sports Traumatology, Arthroscopy](https://link.springer.com/article/10.1007/s00167-021-06799-y) |
| Machine learning and real-world data to predict lung cancer risk in routine care | [Cancer Epidemiology, Biomarkers & Prevention](https://aacrjournals.org/cebp/article-abstract/32/3/337/718495) |
-| Development and validation of a patient-level model to predict dementia across a network of observational databases | [BMC medicine](https://link.springer.com/article/10.1186/s12916-024-03530-9) |
\ No newline at end of file
+| Development and validation of a patient-level model to predict dementia across a network of observational databases | [BMC medicine](https://link.springer.com/article/10.1186/s12916-024-03530-9) |
diff --git a/vignettes/ConstrainedPredictors.Rmd b/vignettes/ConstrainedPredictors.Rmd
index 420f218ac..7bd1e93fb 100644
--- a/vignettes/ConstrainedPredictors.Rmd
+++ b/vignettes/ConstrainedPredictors.Rmd
@@ -1,35 +1,14 @@
---
-title: "Constrained predictors"
+title: "Constrained Predictors"
author: "Jenna Reps"
date: '`r Sys.Date()`'
-header-includes:
- - \usepackage{fancyhdr}
- - \pagestyle{fancy}
- - \fancyhead{}
- - \fancyhead[CO,CE]{Installation Guide}
- - \fancyfoot[CO,CE]{PatientLevelPrediction Package Version `r utils::packageVersion("PatientLevelPrediction")`}
- - \fancyfoot[LE,RO]{\thepage}
- - \renewcommand{\headrulewidth}{0.4pt}
- - \renewcommand{\footrulewidth}{0.4pt}
-output:
- pdf_document:
- includes:
- in_header: preamble.tex
- number_sections: yes
- toc: yes
- word_document:
- toc: yes
- html_document:
- number_sections: yes
- toc: yes
+output: rmarkdown::html_vignette
+vignette: >
+ %\VignetteIndexEntry{Constrained Predictors}
+ %\VignetteEncoding{UTF-8}
+ %\VignetteEngine{knitr::rmarkdown}
---
-```{=html}
-
-```
## Constrained Predictors
### How to use the PhenotypeLibrary R package
@@ -122,4 +101,3 @@ phenotypeDefinitions <- PhenotypeLibrary::getPlCohortDefinitionSet(1152:1215)
| Aspirin | Vascular | 1158 |
| Deep vein thrombosis (DVT) | Vascular | 1152 |
| Edema | Vascular | 1196 |
-| Inpatient visit | NA | NA |
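
A sketch of retrieving just a subset of the phenotypes above by their library ids (ids copied from the table rows):

```r
# Sketch: fetch only the three vascular phenotypes from the table above
vascularIds <- c(1158, 1152, 1196) # Aspirin, Deep vein thrombosis, Edema
vascularDefinitions <- PhenotypeLibrary::getPlCohortDefinitionSet(vascularIds)
```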
diff --git a/vignettes/CreatingLearningCurves.Rmd b/vignettes/CreatingLearningCurves.Rmd
index 6430f7c5b..36ffc81b7 100644
--- a/vignettes/CreatingLearningCurves.Rmd
+++ b/vignettes/CreatingLearningCurves.Rmd
@@ -2,36 +2,14 @@
title: "Creating Learning Curves"
author: "Luis H. John, Jenna M. Reps, Peter R. Rijnbeek"
date: '`r Sys.Date()`'
-header-includes:
- - \usepackage{fancyhdr}
- - \pagestyle{fancy}
- - \fancyhead{}
- - \fancyhead[CO,CE]{Generating Learning Curves}
- - \fancyfoot[LE,RO]{\thepage}
- - \renewcommand{\headrulewidth}{0.4pt}
- - \renewcommand{\footrulewidth}{0.4pt}
- - \fancyfoot[CO,CE]{PatientLevelPrediction Package Version `r utils::packageVersion("PatientLevelPrediction")`}
-output:
- pdf_document:
- includes:
- in_header: preamble.tex
- number_sections: yes
- toc: yes
- html_document:
- number_sections: yes
- toc: yes
- html_notebook:
- toc: yes
- word_document:
- toc: yes
+output:
+ rmarkdown::html_vignette
+vignette: >
+ %\VignetteIndexEntry{Creating Learning Curves}
+ %\VignetteEngine{knitr::rmarkdown}
+ %\VignetteEncoding{UTF-8}
---
-```{=html}
-
-```
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
@@ -54,17 +32,17 @@ Prediction models will show overly-optimistic performance when predicting on the
Learning curves assess the effect of training set size on model performance by training a sequence of prediction models on successively larger subsets of the training set. A learning curve plot can also help in diagnosing a bias or variance problem as explained below.
-![Learning curve example.](learningCurve.png)
+![Learning curve example.](images/learningCurve.avif)
Figure 1 shows an example of a learning curve plot in which the vertical axis represents the model performance and the horizontal axis the training set size. If the training set size is small, the performance on the training set is high, because a model can often be fitted well to a limited number of training examples. At the same time, the performance on the testing set will be poor, because the model trained on such a limited number of training examples will not generalize well to unseen data in the testing set. As the training set size increases, the performance of the model on the training set will decrease. It becomes more difficult for the model to find a good fit through all the training examples. Also, the model will be trained on a more representative portion of training examples, making it generalize better to unseen data. This can be observed by the increasing testing set performance.
The learning curve can help us diagnose bias and variance problems with our classifier, which provides guidance on how to further improve the model. We can observe high variance (overfitting) in a prediction model if it performs well on the training set, but poorly on the testing set (Figure 2). Adding additional data is a common approach to counteract high variance. From the learning curve it becomes apparent that adding additional data may improve performance on the testing set a little further, as the learning curve has not yet plateaued and, thus, the model is not yet saturated. Therefore, adding more data will decrease the gap between training set and testing set, which is the main indicator of a high variance problem.
-![Prediction model suffering from high variance.](learningCurveVariance.png)
+![Prediction model suffering from high variance.](images/learningCurveVariance.avif)
Furthermore, we can observe high bias (underfitting) if a prediction model performs poorly on the training set as well as on the testing set (Figure 3). The learning curves of training set and testing set have flattened at a low performance with only a small gap between them. Adding additional data will in this case have little to no impact on the model performance. Choosing another prediction algorithm that can find more complex (for example non-linear) relationships in the data may be an alternative approach to consider in this high bias situation.
-![Prediction model suffering from high bias.](learningCurveBias.png)
+![Prediction model suffering from high bias.](images/learningCurveBias.avif)
# Creating the learning curve
@@ -114,9 +92,6 @@ splitSettings <- createDefaultSplitSetting(
)
trainFractions <- seq(0.1, 0.8, 0.1) # Create eight training set fractions
-
-# alternatively use a sequence of training events by uncommenting the line below.
-# trainEvents <- seq(100, 5000, 100)
```
Create the learning curve object.
@@ -161,7 +136,7 @@ plotLearningCurve(
)
```
-![Learning curve plot.](learningCurvePlot.png)
+![Learning curve plot.](images/learningCurvePlot.avif)
# Parallel processing
diff --git a/vignettes/CreatingLearningCurves.nb.html b/vignettes/CreatingLearningCurves.nb.html
index 94e6c1c44..2e3751cb6 100644
--- a/vignettes/CreatingLearningCurves.nb.html
+++ b/vignettes/CreatingLearningCurves.nb.html
@@ -11,18 +11,28 @@
-
+
Creating Learning Curves
-
+
-
+
-
+
+
diff --git a/vignettes/CreatingNetworkStudies.Rmd b/vignettes/CreatingNetworkStudies.Rmd
index 88f91a779..8728252a1 100644
--- a/vignettes/CreatingNetworkStudies.Rmd
+++ b/vignettes/CreatingNetworkStudies.Rmd
@@ -1,26 +1,14 @@
---
-title: "Making patient-level predictive network study packages"
+title: "Making patient-level predictive network study packages"
author: "Jenna Reps, Martijn J. Schuemie, Patrick B. Ryan, Peter R. Rijnbeek"
date: '`r Sys.Date()`'
-output:
- pdf_document:
- includes:
- in_header: preamble.tex
- number_sections: yes
- toc: yes
- word_document:
- toc: yes
- html_document:
- number_sections: yes
- toc: yes
+output: rmarkdown::html_vignette
+vignette: >
+ %\VignetteEngine{knitr::rmarkdown}
+ %\VignetteIndexEntry{Making patient-level predictive network study packages}
+ %\VignetteEncoding{UTF-8}
---
-```{=html}
-
-```
```{r echo=FALSE,message=FALSE,warning=FALSE,eval=TRUE}
library(PatientLevelPrediction)
vignetteDataFolder <- "s:/temp/plpVignette"
diff --git a/vignettes/Diagram.png b/vignettes/Diagram.png
deleted file mode 100644
index 5c8f0c148..000000000
Binary files a/vignettes/Diagram.png and /dev/null differ
diff --git a/vignettes/Figure1.webp b/vignettes/Figure1.webp
deleted file mode 100644
index 42ad71d7f..000000000
Binary files a/vignettes/Figure1.webp and /dev/null differ
diff --git a/vignettes/GISExample.Rmd b/vignettes/GISExample.Rmd
new file mode 100644
index 000000000..448006690
--- /dev/null
+++ b/vignettes/GISExample.Rmd
@@ -0,0 +1,270 @@
+---
+title: "Integration of GIS Data Into OHDSI Model Building"
+author: "Jared Houghtaling"
+date: "`r Sys.Date()`"
+output: rmarkdown::html_vignette
+vignette: >
+ %\VignetteIndexEntry{Integration of GIS Data Into OHDSI Model Building}
+ %\VignetteEngine{knitr::rmarkdown}
+ %\VignetteEncoding{UTF-8}
+---
+
+# Integration of GIS Data into OHDSI Model Building
+
+## Motivation
+
+Although the proposed GIS extension tables have been approved by the broader OHDSI community, they are not yet integrated natively into the OHDSI tooling. These tables and the data they capture, however, can still be referenced in standard model building and analytics workflows. The purpose of this analytics demonstration is to show how data in the EXPOSURE_OCCURRENCE table can be utilized in the training and evaluation of an OMOP-specific patient-level prediction (PLP) model. The analytic process we executed and describe below relies on a GIS version of the Tufts Synthetic Dataset that has both EHR and geospatial data integrated; see the description of that dataset for more details about contents or access. Much of the work is based on the detailed vignette that describes custom feature engineering in PLP:
+`vignette('AddingCustomFeatureEngineering')`
+
+## Step-by-Step Process
+
+### Step 1: Create Target & Outcome Cohorts
+
+We defined our `target` cohort within the sampling procedures when creating a subset of the Tufts Synthetic Data, and we have described this process at length elsewhere. For the purposes of using the PLP package, we formalized this group of individuals in the cohort table using a simple cohort definition that includes all individuals with "any visit", and we include that Atlas-compatible JSON definition here for reference.
+
+We also created our `outcome` cohort - in this case, those patients with COPD or a conceptual descendant thereof - in Atlas and have shared the JSON definition (we also included an equivalent SQL script).
+
+### Step 2: Create Generic PLP Lasso Logistic Regression Model in R
+
+It is possible to create an R package that serves as a basis for a PLP model using Atlas, but given that Atlas does not yet support the GIS extension, we have created this demo model directly using the PLP package.
+
+After configuring the environment appropriately (very important!), we imported the necessary packages and defined the relevant parameters for the data source:
+
+```{r, echo=TRUE, message=FALSE, warning=FALSE, eval=FALSE}
+library(PatientLevelPrediction)
+library(dplyr)
+outputFolder <- "/ohdsi-gis/copdResultsPM25_NEW"
+saveDirectory <- outputFolder
+ExecutionDateTime <- Sys.time()
+logSettings <- createLogSettings(verbosity = "DEBUG", timeStamp = TRUE,
+                                 logName = "runPlp Log")
+analysisName <- "Generic PLP"
+
+# Details for connecting to the server:
+connectionDetails <- DatabaseConnector::createConnectionDetails(
+ dbms = 'spark',
+ server = '/default',
+ connectionString = ''
+ )
+# Add the database containing the OMOP CDM data
+cdmDatabaseSchema <- 'gis_syn_dataset_5_4'
+# Add a shareable name for the database containing the OMOP CDM data
+cdmDatabaseName <- 'TSD-GIS'
+# Add a database with read/write access as this is where the cohorts will be generated
+cohortDatabaseSchema <- 'gis_syn_dataset_5_4'
+tempEmulationSchema <- NULL
+# table name where the cohorts will be generated
+cohortTable <- 'cohort'
+```
+
+After defining parameters, we created a database settings object and launched the `runMultiplePlp` function to create a base `plpData` object:
+
+```{r, echo=TRUE, message=FALSE, warning=FALSE, eval=FALSE}
+databaseDetails <- PatientLevelPrediction::createDatabaseDetails(
+ connectionDetails = connectionDetails,
+ cdmDatabaseSchema = cdmDatabaseSchema,
+ cdmDatabaseName = cdmDatabaseName,
+ tempEmulationSchema = tempEmulationSchema,
+ cohortDatabaseSchema = cohortDatabaseSchema,
+ cohortTable = cohortTable,
+ outcomeDatabaseSchema = cohortDatabaseSchema,
+ outcomeTable = cohortTable,
+ cdmVersion = 5
+)
+
+
+# Run very simple LR model against two cohorts created in Atlas. Use model
+# as basis for augmented model with pollutants below
+runMultiplePlp(
+  databaseDetails = databaseDetails,
+  modelDesignList = list(createModelDesign(targetId = 9, outcomeId = 8,
+    modelSettings = setLassoLogisticRegression())),
+  onlyFetchData = FALSE,
+  cohortDefinitions = NULL,
+  logSettings = createLogSettings(verbosity = "DEBUG", timeStamp = TRUE,
+    logName = "runPlp Log"),
+  saveDirectory = outputFolder,
+  sqliteLocation = file.path(saveDirectory, "sqlite")
+)
+```
+
+### Step 3: Split the plpData object into train/test and augment labels with EXPOSURE_OCCURRENCE values
+
+The labels sub-object within `plpData` contains per-individual data elements like gender and age; we added an additional data element to this object derived from the `EXPOSURE_OCCURRENCE` table:
+
+```{r, echo=TRUE, message=FALSE, warning=FALSE, eval=FALSE}
+cohortDefinitions <- NULL
+modelDesign <- createModelDesign(targetId = 9, outcomeId = 8, modelSettings = setLassoLogisticRegression())
+populationSettings <- modelDesign$populationSettings
+splitSettings <- modelDesign$splitSettings
+
+plpData <- loadPlpData("/ohdsi-gis/copdResultsPM25_B/targetId_9_L1")
+
+mySplit <- splitData(plpData = plpData,
+ population = createStudyPopulation(plpData, 8, populationSettings),
+ splitSettings = splitSettings)
+
+
+labelTrain <- mySplit$Train$labels
+conn <- DatabaseConnector::connect(connectionDetails)
+pollutants <- DatabaseConnector::querySql(conn, "SELECT person_id as subjectID, CAST(MEAN(value_as_number) AS DOUBLE) AS pmValue FROM gis_syn_dataset_5_4.exposure_occurrence WHERE value_as_number IS NOT NULL GROUP BY person_id;")
+labelTrainPol <- merge(x=labelTrain, y=pollutants, by.x = "subjectId", by.y = "SUBJECTID")
+
+mySplit$Train$labels <- labelTrainPol
+
+labelTest <- mySplit$Test$labels
+labelTestPol <- merge(x=labelTest, y=pollutants, by.x = "subjectId", by.y = "SUBJECTID")
+
+mySplit$Test$labels <- labelTestPol
+
+trainData <- mySplit$Train
+
+testData <- mySplit$Test
+```
+
+### Step 4: Reference augmented label objects in custom feature engineering function
+
+We would like to convert our per-patient labels into the `covariateData` structure referenced by the PLP workflow. To do this, we were able to follow the feature engineering vignette and create two functions, `createPollutants` and `implementPollutants`:
+
+```{r, echo=TRUE, message=FALSE, warning=FALSE, eval=FALSE}
+createPollutants <- function(
+ method = 'QNCV'
+ ){
+
+ # create list of inputs to implement function
+ featureEngineeringSettings <- list(
+ method = method
+ )
+
+  # specify the function that will implement the feature engineering
+ attr(featureEngineeringSettings, "fun") <- "implementPollutants"
+
+  # make sure the object returned is of class "featureEngineeringSettings"
+ class(featureEngineeringSettings) <- "featureEngineeringSettings"
+ return(featureEngineeringSettings)
+
+}
+
+
+implementPollutants <- function(trainData, featureEngineeringSettings, model=NULL) {
+ if (is.null(model)) {
+ method <- featureEngineeringSettings$method
+ gisData <- trainData$labels
+ y <- gisData$outcomeCount
+ X <- gisData$PMVALUE
+ model <- mgcv::gam(
+ y ~ s(X, bs='cr', k=5, m=2)
+ )
+ newData <- data.frame(
+ rowId = gisData$rowId,
+ covariateId = 2052499839,
+ covariateValue = model$fitted.values
+ )
+ }
+ else {
+ gisData <- trainData$labels
+ X <- gisData$PMVALUE
+ y <- gisData$outcomeCount
+ newData <- data.frame(y=y, X=X)
+ yHat <- predict(model, newData)
+ newData <- data.frame(
+ rowId = gisData$rowId,
+ covariateId = 2052499839,
+ covariateValue = yHat
+ )
+ }
+ # update covRef
+ Andromeda::appendToTable(trainData$covariateData$covariateRef,
+ data.frame(covariateId=2052499839,
+ covariateName='Average PM2.5 Concentrations',
+ analysisId=1,
+ conceptId=2052499839))
+
+ # update covariates
+ Andromeda::appendToTable(trainData$covariateData$covariates, newData)
+
+ featureEngineering <- list(
+ funct = 'implementPollutants',
+ settings = list(
+ featureEngineeringSettings = featureEngineeringSettings,
+ model = model
+ )
+ )
+
+ attr(trainData$covariateData, 'metaData')$featureEngineering = listAppend(
+ attr(trainData$covariateData, 'metaData')$featureEngineering,
+ featureEngineering
+ )
+
+ trainData$model <- model
+
+ return(trainData)
+}
+```
+
+We can then execute these functions to create training and test data objects that contain our extended covariates:
+
+```{r, echo=TRUE, message=FALSE, warning=FALSE, eval=FALSE}
+featureEngineeringSettingsPol <- createPollutants('QNCV')
+trainDataPol <- implementPollutants(trainData, featureEngineeringSettingsPol)
+testDataPol <- implementPollutants(testData, featureEngineeringSettingsPol, trainDataPol$model)
+```
+
+Note that if we plot the GAM fitted in the `implementPollutants` function, we end up with a plot that aligns well with the underlying relationship between odds ratio and PM2.5 concentration that we used to distribute our synthetic data by location:
+
+![Covariate Fit]()
+
+![OR Estimation]()
+
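+One way to produce such a plot is to call `plot()` on the fitted `mgcv` GAM (a sketch; the axis labels are illustrative, and `trainDataPol$model` is the model stored by `implementPollutants` above):
+
+```{r, echo=TRUE, message=FALSE, warning=FALSE, eval=FALSE}
+# Sketch: visualize the fitted smooth of outcome risk against PM2.5
+plot(trainDataPol$model, shade = TRUE,
+     xlab = "PM2.5 concentration", ylab = "Smoothed effect on COPD risk")
+```
+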
+### Step 5: Apply new train and test datasets to `runPlp` and evaluate output
+
+```{r, echo=TRUE, message=FALSE, warning=FALSE, eval=FALSE}
+analysisId <- '1'
+analysisPath <- file.path(saveDirectory, analysisId)
+
+settings <- list(
+ trainData = trainDataPol,
+ modelSettings = setLassoLogisticRegression(),
+ analysisId = analysisId,
+ analysisPath = analysisPath
+)
+
+ParallelLogger::logInfo(sprintf('Training %s model',settings$modelSettings$name))
+model <- tryCatch(
+ {
+ do.call(fitPlp, settings)
+ },
+ error = function(e) { ParallelLogger::logError(e); return(NULL)}
+)
+
+
+prediction <- model$prediction
+# remove prediction from model
+model$prediction <- NULL
+
+# apply to the test data if it exists:
+if ("Test" %in% names(mySplit)) {
+  predictionTest <- tryCatch(
+    {
+      predictPlp(
+        plpModel = model,
+        plpData = testDataPol,
+        population = testDataPol$labels
+      )
+    },
+    error = function(e) { ParallelLogger::logError(e); return(NULL) }
+  )
+
+  if (!is.null(predictionTest)) {
+    predictionTest$evaluationType <- "Test"
+    prediction <- rbind(predictionTest, prediction[, colnames(prediction) != "index"])
+  }
+}
+
+```
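+
+As a quick, informal check of test-set discrimination, the `prediction` data frame can be summarized directly (a sketch; it assumes the standard PLP prediction columns `value` and `outcomeCount`, and uses the `pROC` package):
+
+```{r, echo=TRUE, message=FALSE, warning=FALSE, eval=FALSE}
+# Sketch: AUROC on the held-out test set
+testPred <- prediction[prediction$evaluationType == "Test", ]
+pROC::auc(pROC::roc(testPred$outcomeCount, testPred$value))
+```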
+
+
+
diff --git a/vignettes/InstallationGuide.Rmd b/vignettes/InstallationGuide.Rmd
index 7b3af179d..9c8b8c447 100644
--- a/vignettes/InstallationGuide.Rmd
+++ b/vignettes/InstallationGuide.Rmd
@@ -2,34 +2,13 @@
title: "Patient-Level Prediction Installation Guide"
author: "Jenna Reps, Peter R. Rijnbeek, Egill Fridgeirsson"
date: '`r Sys.Date()`'
-header-includes:
- - \usepackage{fancyhdr}
- - \pagestyle{fancy}
- - \fancyhead{}
- - \fancyhead[CO,CE]{Installation Guide}
- - \fancyfoot[CO,CE]{PatientLevelPrediction Package Version `r utils::packageVersion("PatientLevelPrediction")`}
- - \fancyfoot[LE,RO]{\thepage}
- - \renewcommand{\headrulewidth}{0.4pt}
- - \renewcommand{\footrulewidth}{0.4pt}
-output:
- pdf_document:
- includes:
- in_header: preamble.tex
- number_sections: yes
- toc: yes
- word_document:
- toc: yes
- html_document:
- number_sections: yes
- toc: yes
+output: rmarkdown::html_vignette
+vignette: >
+ %\VignetteEngine{knitr::rmarkdown}
+ %\VignetteIndexEntry{Patient-Level Prediction Installation Guide}
+ %\VignetteEncoding{UTF-8}
---
-```{=html}
-
-```
# Introduction
This vignette describes how to install the Observational Health Data Science and Informatics (OHDSI) [`PatientLevelPrediction`](http://github.com/OHDSI/PatientLevelPrediction) package under Windows, Mac, and Linux.
diff --git a/vignettes/Videos.Rmd b/vignettes/Videos.Rmd
index 5f278c92b..a39547958 100644
--- a/vignettes/Videos.Rmd
+++ b/vignettes/Videos.Rmd
@@ -2,34 +2,13 @@
title: "Demo Videos"
author: "Jenna Reps, Peter R. Rijnbeek"
date: '`r Sys.Date()`'
-header-includes:
- - \usepackage{fancyhdr}
- - \pagestyle{fancy}
- - \fancyhead{}
- - \fancyhead[CO,CE]{Installation Guide}
- - \fancyfoot[CO,CE]{PatientLevelPrediction Package Version `r utils::packageVersion("PatientLevelPrediction")`}
- - \fancyfoot[LE,RO]{\thepage}
- - \renewcommand{\headrulewidth}{0.4pt}
- - \renewcommand{\footrulewidth}{0.4pt}
-output:
- pdf_document:
- includes:
- in_header: preamble.tex
- number_sections: yes
- toc: yes
- word_document:
- toc: yes
- html_document:
- number_sections: yes
- toc: yes
+output: rmarkdown::html_vignette
+vignette: >
+ %\VignetteEngine{knitr::rmarkdown}
+ %\VignetteIndexEntry{Demo Videos}
+ %\VignetteEncoding{UTF-8}
---
-```{=html}
-
-```
## What is a cohort table?
| Click To Launch | Description of Demo |
diff --git a/vignettes/atlasdownload1.webp b/vignettes/atlasdownload1.webp
deleted file mode 100644
index 6cac340ed..000000000
Binary files a/vignettes/atlasdownload1.webp and /dev/null differ
diff --git a/vignettes/atlasdownload2.webp b/vignettes/atlasdownload2.webp
deleted file mode 100644
index 452c5ca21..000000000
Binary files a/vignettes/atlasdownload2.webp and /dev/null differ
diff --git a/vignettes/atlasplp1.webp b/vignettes/atlasplp1.webp
deleted file mode 100644
index 71a3c1ce9..000000000
Binary files a/vignettes/atlasplp1.webp and /dev/null differ
diff --git a/vignettes/atlasplp2.webp b/vignettes/atlasplp2.webp
deleted file mode 100644
index 668202de4..000000000
Binary files a/vignettes/atlasplp2.webp and /dev/null differ
diff --git a/vignettes/atlasplp3.webp b/vignettes/atlasplp3.webp
deleted file mode 100644
index 523d0143c..000000000
Binary files a/vignettes/atlasplp3.webp and /dev/null differ
diff --git a/vignettes/atlasplp4.webp b/vignettes/atlasplp4.webp
deleted file mode 100644
index d90a43ea2..000000000
Binary files a/vignettes/atlasplp4.webp and /dev/null differ
diff --git a/vignettes/demographicSummary.webp b/vignettes/demographicSummary.webp
deleted file mode 100644
index 7d0437deb..000000000
Binary files a/vignettes/demographicSummary.webp and /dev/null differ
diff --git a/vignettes/ensemble.png b/vignettes/ensemble.png
deleted file mode 100644
index 6e2173a48..000000000
Binary files a/vignettes/ensemble.png and /dev/null differ
diff --git a/vignettes/example1/ATLAS_O.avif b/vignettes/example1/ATLAS_O.avif
new file mode 100644
index 000000000..9b773bdb9
Binary files /dev/null and b/vignettes/example1/ATLAS_O.avif differ
diff --git a/vignettes/example1/ATLAS_T.avif b/vignettes/example1/ATLAS_T.avif
new file mode 100644
index 000000000..20bdde0fc
Binary files /dev/null and b/vignettes/example1/ATLAS_T.avif differ
diff --git a/vignettes/example2/aceinhibitors.avif b/vignettes/example2/aceinhibitors.avif
new file mode 100644
index 000000000..c76837cb4
Binary files /dev/null and b/vignettes/example2/aceinhibitors.avif differ
diff --git a/vignettes/example2/angioedema.avif b/vignettes/example2/angioedema.avif
new file mode 100644
index 000000000..40d42b8ae
Binary files /dev/null and b/vignettes/example2/angioedema.avif differ
diff --git a/vignettes/f1Measure.png b/vignettes/f1Measure.png
deleted file mode 100644
index 437b6893f..000000000
Binary files a/vignettes/f1Measure.png and /dev/null differ
diff --git a/vignettes/f1score.png b/vignettes/f1score.png
deleted file mode 100644
index 5113b25c5..000000000
Binary files a/vignettes/f1score.png and /dev/null differ
diff --git a/vignettes/generalizability.webp b/vignettes/generalizability.webp
deleted file mode 100644
index ba6d14de4..000000000
Binary files a/vignettes/generalizability.webp and /dev/null differ
diff --git a/vignettes/images/Figure1.avif b/vignettes/images/Figure1.avif
new file mode 100644
index 000000000..f4f2c0d61
Binary files /dev/null and b/vignettes/images/Figure1.avif differ
diff --git a/vignettes/images/atlasdownload1.avif b/vignettes/images/atlasdownload1.avif
new file mode 100644
index 000000000..9f2ed9d8d
Binary files /dev/null and b/vignettes/images/atlasdownload1.avif differ
diff --git a/vignettes/images/atlasdownload2.avif b/vignettes/images/atlasdownload2.avif
new file mode 100644
index 000000000..bc9906b78
Binary files /dev/null and b/vignettes/images/atlasdownload2.avif differ
diff --git a/vignettes/images/atlasplp1.avif b/vignettes/images/atlasplp1.avif
new file mode 100644
index 000000000..873e06411
Binary files /dev/null and b/vignettes/images/atlasplp1.avif differ
diff --git a/vignettes/images/atlasplp2.avif b/vignettes/images/atlasplp2.avif
new file mode 100644
index 000000000..95704629c
Binary files /dev/null and b/vignettes/images/atlasplp2.avif differ
diff --git a/vignettes/images/atlasplp3.avif b/vignettes/images/atlasplp3.avif
new file mode 100644
index 000000000..2a2eeab1f
Binary files /dev/null and b/vignettes/images/atlasplp3.avif differ
diff --git a/vignettes/images/atlasplp4.avif b/vignettes/images/atlasplp4.avif
new file mode 100644
index 000000000..e613e5072
Binary files /dev/null and b/vignettes/images/atlasplp4.avif differ
diff --git a/vignettes/images/demographicSummary.avif b/vignettes/images/demographicSummary.avif
new file mode 100644
index 000000000..63a77156f
Binary files /dev/null and b/vignettes/images/demographicSummary.avif differ
diff --git a/vignettes/images/generalizability.avif b/vignettes/images/generalizability.avif
new file mode 100644
index 000000000..c8e5bfbf2
Binary files /dev/null and b/vignettes/images/generalizability.avif differ
diff --git a/vignettes/images/learningCurve.avif b/vignettes/images/learningCurve.avif
new file mode 100644
index 000000000..be40aa00f
Binary files /dev/null and b/vignettes/images/learningCurve.avif differ
diff --git a/vignettes/images/learningCurveBias.avif b/vignettes/images/learningCurveBias.avif
new file mode 100644
index 000000000..948431b28
Binary files /dev/null and b/vignettes/images/learningCurveBias.avif differ
diff --git a/vignettes/images/learningCurvePlot.avif b/vignettes/images/learningCurvePlot.avif
new file mode 100644
index 000000000..3f8ddb9f5
Binary files /dev/null and b/vignettes/images/learningCurvePlot.avif differ
diff --git a/vignettes/images/learningCurveVariance.avif b/vignettes/images/learningCurveVariance.avif
new file mode 100644
index 000000000..00e48fdd3
Binary files /dev/null and b/vignettes/images/learningCurveVariance.avif differ
diff --git a/vignettes/images/popdef1.avif b/vignettes/images/popdef1.avif
new file mode 100644
index 000000000..7584bc72e
Binary files /dev/null and b/vignettes/images/popdef1.avif differ
diff --git a/vignettes/images/popdef2.avif b/vignettes/images/popdef2.avif
new file mode 100644
index 000000000..be4403626
Binary files /dev/null and b/vignettes/images/popdef2.avif differ
diff --git a/vignettes/images/popdef3.avif b/vignettes/images/popdef3.avif
new file mode 100644
index 000000000..985babd29
Binary files /dev/null and b/vignettes/images/popdef3.avif differ
diff --git a/vignettes/images/popdef4.avif b/vignettes/images/popdef4.avif
new file mode 100644
index 000000000..62a590262
Binary files /dev/null and b/vignettes/images/popdef4.avif differ
diff --git a/vignettes/images/popdef5.avif b/vignettes/images/popdef5.avif
new file mode 100644
index 000000000..a13bcb8ac
Binary files /dev/null and b/vignettes/images/popdef5.avif differ
diff --git a/vignettes/images/popdef6.avif b/vignettes/images/popdef6.avif
new file mode 100644
index 000000000..91495baa7
Binary files /dev/null and b/vignettes/images/popdef6.avif differ
diff --git a/vignettes/images/precisionRecall.avif b/vignettes/images/precisionRecall.avif
new file mode 100644
index 000000000..62472ee9c
Binary files /dev/null and b/vignettes/images/precisionRecall.avif differ
diff --git a/vignettes/images/predictionDistribution.avif b/vignettes/images/predictionDistribution.avif
new file mode 100644
index 000000000..f4403112c
Binary files /dev/null and b/vignettes/images/predictionDistribution.avif differ
diff --git a/vignettes/images/preferencePDF.avif b/vignettes/images/preferencePDF.avif
new file mode 100644
index 000000000..107871a9a
Binary files /dev/null and b/vignettes/images/preferencePDF.avif differ
diff --git a/vignettes/images/problems.avif b/vignettes/images/problems.avif
new file mode 100644
index 000000000..e0a2c8500
Binary files /dev/null and b/vignettes/images/problems.avif differ
diff --git a/vignettes/images/shinyroc.avif b/vignettes/images/shinyroc.avif
new file mode 100644
index 000000000..1be2230dd
Binary files /dev/null and b/vignettes/images/shinyroc.avif differ
diff --git a/vignettes/images/shinysummary.avif b/vignettes/images/shinysummary.avif
new file mode 100644
index 000000000..905aabce0
Binary files /dev/null and b/vignettes/images/shinysummary.avif differ
diff --git a/vignettes/images/smoothCalibration.avif b/vignettes/images/smoothCalibration.avif
new file mode 100644
index 000000000..854cc2485
Binary files /dev/null and b/vignettes/images/smoothCalibration.avif differ
diff --git a/vignettes/images/sparseCalibration.avif b/vignettes/images/sparseCalibration.avif
new file mode 100644
index 000000000..b8f4a5221
Binary files /dev/null and b/vignettes/images/sparseCalibration.avif differ
diff --git a/vignettes/images/sparseRoc.avif b/vignettes/images/sparseRoc.avif
new file mode 100644
index 000000000..bb5d14ec3
Binary files /dev/null and b/vignettes/images/sparseRoc.avif differ
diff --git a/vignettes/images/studydesign.avif b/vignettes/images/studydesign.avif
new file mode 100644
index 000000000..e9c028fbc
Binary files /dev/null and b/vignettes/images/studydesign.avif differ
diff --git a/vignettes/images/variableScatterplot.avif b/vignettes/images/variableScatterplot.avif
new file mode 100644
index 000000000..7a498180a
Binary files /dev/null and b/vignettes/images/variableScatterplot.avif differ
diff --git a/vignettes/learningCurve.png b/vignettes/learningCurve.png
deleted file mode 100644
index 19cd06691..000000000
Binary files a/vignettes/learningCurve.png and /dev/null differ
diff --git a/vignettes/learningCurveBias.png b/vignettes/learningCurveBias.png
deleted file mode 100644
index 3bd9f580a..000000000
Binary files a/vignettes/learningCurveBias.png and /dev/null differ
diff --git a/vignettes/learningCurvePlot.png b/vignettes/learningCurvePlot.png
deleted file mode 100644
index a5e1f9e96..000000000
Binary files a/vignettes/learningCurvePlot.png and /dev/null differ
diff --git a/vignettes/learningCurveVariance.png b/vignettes/learningCurveVariance.png
deleted file mode 100644
index 3212e6106..000000000
Binary files a/vignettes/learningCurveVariance.png and /dev/null differ
diff --git a/vignettes/plpDiagram.png b/vignettes/plpDiagram.png
deleted file mode 100644
index 301f4bcb4..000000000
Binary files a/vignettes/plpDiagram.png and /dev/null differ
diff --git a/vignettes/popdef1.webp b/vignettes/popdef1.webp
deleted file mode 100644
index 83ef7afd6..000000000
Binary files a/vignettes/popdef1.webp and /dev/null differ
diff --git a/vignettes/popdef2.webp b/vignettes/popdef2.webp
deleted file mode 100644
index 31887dd1b..000000000
Binary files a/vignettes/popdef2.webp and /dev/null differ
diff --git a/vignettes/popdef3.webp b/vignettes/popdef3.webp
deleted file mode 100644
index 8b409ed49..000000000
Binary files a/vignettes/popdef3.webp and /dev/null differ
diff --git a/vignettes/popdef4.webp b/vignettes/popdef4.webp
deleted file mode 100644
index 2709497e7..000000000
Binary files a/vignettes/popdef4.webp and /dev/null differ
diff --git a/vignettes/popdef5.webp b/vignettes/popdef5.webp
deleted file mode 100644
index 748b8901b..000000000
Binary files a/vignettes/popdef5.webp and /dev/null differ
diff --git a/vignettes/popdef6.webp b/vignettes/popdef6.webp
deleted file mode 100644
index 583dc9fba..000000000
Binary files a/vignettes/popdef6.webp and /dev/null differ
diff --git a/vignettes/preamble.tex b/vignettes/preamble.tex
deleted file mode 100644
index 204026792..000000000
--- a/vignettes/preamble.tex
+++ /dev/null
@@ -1,8 +0,0 @@
-\usepackage{float}
-\let\origfigure\figure
-\let\endorigfigure\endfigure
-\renewenvironment{figure}[1][2] {
- \expandafter\origfigure\expandafter[H]
-} {
- \endorigfigure
-}
\ No newline at end of file
diff --git a/vignettes/precisionRecall.webp b/vignettes/precisionRecall.webp
deleted file mode 100644
index af6b0cfe5..000000000
Binary files a/vignettes/precisionRecall.webp and /dev/null differ
diff --git a/vignettes/predictedPDF.png b/vignettes/predictedPDF.png
deleted file mode 100644
index fbf4e85b5..000000000
Binary files a/vignettes/predictedPDF.png and /dev/null differ
diff --git a/vignettes/predictionDistribution.webp b/vignettes/predictionDistribution.webp
deleted file mode 100644
index c7756d788..000000000
Binary files a/vignettes/predictionDistribution.webp and /dev/null differ
diff --git a/vignettes/preferencePDF.webp b/vignettes/preferencePDF.webp
deleted file mode 100644
index 189a356be..000000000
Binary files a/vignettes/preferencePDF.webp and /dev/null differ
diff --git a/vignettes/problems.webp b/vignettes/problems.webp
deleted file mode 100644
index 5c1c27bb4..000000000
Binary files a/vignettes/problems.webp and /dev/null differ
diff --git a/vignettes/shinyroc.webp b/vignettes/shinyroc.webp
deleted file mode 100644
index a11724623..000000000
Binary files a/vignettes/shinyroc.webp and /dev/null differ
diff --git a/vignettes/shinysummary.webp b/vignettes/shinysummary.webp
deleted file mode 100644
index 0d256ade1..000000000
Binary files a/vignettes/shinysummary.webp and /dev/null differ
diff --git a/vignettes/smoothCalibration.jpeg b/vignettes/smoothCalibration.jpeg
deleted file mode 100644
index 72c3cdb7a..000000000
Binary files a/vignettes/smoothCalibration.jpeg and /dev/null differ
diff --git a/vignettes/sparseCalibration.webp b/vignettes/sparseCalibration.webp
deleted file mode 100644
index 043019e5b..000000000
Binary files a/vignettes/sparseCalibration.webp and /dev/null differ
diff --git a/vignettes/sparseROC.webp b/vignettes/sparseROC.webp
deleted file mode 100644
index 2ea3ea56f..000000000
Binary files a/vignettes/sparseROC.webp and /dev/null differ
diff --git a/vignettes/studydesign.webp b/vignettes/studydesign.webp
deleted file mode 100644
index 28717c7d2..000000000
Binary files a/vignettes/studydesign.webp and /dev/null differ
diff --git a/vignettes/variableScatterplot.webp b/vignettes/variableScatterplot.webp
deleted file mode 100644
index de6f8999d..000000000
Binary files a/vignettes/variableScatterplot.webp and /dev/null differ