diff --git a/NAMESPACE b/NAMESPACE index de50241d..86c5b495 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -23,7 +23,6 @@ export(createDefaultExecuteSettings) export(createDefaultSplitSetting) export(createExecuteSettings) export(createExistingSplitSettings) -export(createFeatureEngineeringMapColumnsSettings) export(createFeatureEngineeringSettings) export(createGlmModel) export(createLearningCurve) @@ -34,7 +33,7 @@ export(createPreprocessSettings) export(createRandomForestFeatureSelection) export(createRestrictPlpDataSettings) export(createSampleSettings) -export(createSciKitLearnModel) +export(createSklearnModel) export(createSplineSettings) export(createStratifiedImputationSettings) export(createStudyPopulation) diff --git a/R/ExistingPython.R b/R/ExistingPython.R deleted file mode 100644 index f78dc279..00000000 --- a/R/ExistingPython.R +++ /dev/null @@ -1,200 +0,0 @@ -# @file ExistingPython.R -# -# Copyright 2025 Observational Health Data Sciences and Informatics -# -# This file is part of PatientLevelPrediction -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -#' Plug an existing scikit learn python model into the -#' PLP framework -#' -#' @details -#' This function lets users add an existing scikit learn that is saved as model.pkl -#' into PLP format. covariateMap is a mapping between standard covariateIds and the model column names -#' and order are required in addition to pythonModelLocation, the location of the model that must be saved -#' as `model.pkl`. 
The user also needs to specify the covariate settings and population settings as these -#' are used to determine the standard PLP model design. -#' -#' @param modelLocation The location of the folder that contains the model as model.pkl -#' @param covariateMap A data.frame with the columns: columnId specifying the column order for the -#' covariate, covariateId the covariate ID from FeatureExtraction and modelCovariateIdName which is the -#' column name used when fitting the model. For example, if you had a column called 'age' in your model and this was the 3rd -#' column when fitting the model, then the values for columnId would be 3, covariateId would be 1002 (the covariateId for age in years) and -#' modelCovariateIdName would be 'age'. -#' @param covariateSettings The settings for the standardized covariates -#' @param populationSettings The settings for the population, this includes the time-at-risk settings and -#' and inclusion criteria. -#' @param isPickle If the model is saved as a pickle set this to T if it is a json set this to F -#' -#' @return -#' An object of class plpModel, this is a list that contains: model (the location of the model.pkl), -#' preprocessing (settings for mapping the covariateIds to the model column mames), modelDesign (specification -#' of the model design), trainDetails (information about the model fitting) and covariateImportance. You can use the output -#' as an input in PatientLevelPrediction::predictPlp to apply the model and calculate the risk for patients. 
-#' -#' @export -createSciKitLearnModel <- function( - modelLocation = "/model", # model needs to be saved here as "model.pkl" - covariateMap = data.frame( - columnId = 1:2, - covariateId = c(1, 2), - modelCovariateIdName = c("pred_1", "pred_2") - ), - covariateSettings, # specify the covariates - populationSettings, # specify time at risk used to develop model - isPickle = TRUE) { - checkSklearn() - checkFileExists(modelLocation) - checkIsClass(covariateMap, "data.frame") - checkIsClass(covariateSettings, "covariateSettings") - checkIsClass(populationSettings, "populationSettings") - checkBoolean(isPickle) - checkDataframe(covariateMap, c("columnId", "covariateId", "modelCovariateIdName"), - columnTypes = list(c("numeric", "integer"), "numeric", "character") - ) - existingModel <- list(model = "existingPython") - class(existingModel) <- "modelSettings" - - plpModel <- list( - # use plpModel$preprocessing$featureEngineering to rename columns - # set plpModel$preprocessing$tidyCovariates to NULL - preprocessing = list( - featureEngineering = list( - funct = "mapColumns", - settings = list( - featureEngineeringSettings = createFeatureEngineeringMapColumnsSettings( - columnMap = covariateMap - ) - ) - ), - tidyCovariates = NULL, - requireDenseMatrix = FALSE - ), - covariateImportance = data.frame( - columnId = covariateMap$columnId, - covariateId = covariateMap$modelCovariateIdName, - included = TRUE - ), - modelDesign = PatientLevelPrediction::createModelDesign( - targetId = 1, - outcomeId = 2, - restrictPlpDataSettings = PatientLevelPrediction::createRestrictPlpDataSettings(), - covariateSettings = covariateSettings, - populationSettings = populationSettings, - sampleSettings = PatientLevelPrediction::createSampleSettings(), - featureEngineeringSettings = createFeatureEngineeringMapColumnsSettings( - columnMap = covariateMap - ), - preprocessSettings = PatientLevelPrediction::createPreprocessSettings( - minFraction = 0, - normalize = FALSE, - removeRedundancy = 
FALSE - ), - modelSettings = existingModel, - splitSettings = PatientLevelPrediction::createDefaultSplitSetting() - ), - model = modelLocation, - trainDetails = list( - analysisId = "exisitingPython", - developmentDatabase = "nonOMOP", - developmentDatabaseId = "nonOMOP", - trainingTime = -1, - modelName = "existing" - ) - ) - - attr(plpModel, "modelType") <- "binary" - attr(plpModel, "saveType") <- "file" - attr(plpModel, "predictionFunction") <- "predictPythonSklearn" - attr(plpModel, "saveToJson") <- !isPickle - class(plpModel) <- "plpModel" - return(plpModel) -} - - -#' Create settings that enable you to convert from standard covariateIds to -#' model covariate names - this is useful when implementing a model developed -#' outside of the OHDSI tools. -#' -#' @details -#' This function creates settings that let you rename the covariates in the plpData object -#' -#' @param columnMap A data.frame containing the columns: covariateId the covariate ID from FeatureExtraction and -#' modelCovariateIdName which is the column name used when fitting the model. 
-#' -#' @return -#' An object of class \code{featureEngineeringSettings} that will convert column names -#' -#' @export -createFeatureEngineeringMapColumnsSettings <- function( - columnMap) { - featureEngineeringSettings <- list( - columnMap = columnMap - ) - - attr(featureEngineeringSettings, "fun") <- "mapColumns" - class(featureEngineeringSettings) <- "featureEngineeringSettings" - return(featureEngineeringSettings) -} - -mapColumns <- function( - trainData, - featureEngineeringSettings) { - ParallelLogger::logInfo("Changing column names") - - # map the columns - swap the covariateId with the modelCovariateIdName - trainData$covariateData$columnMap <- featureEngineeringSettings$columnMap %>% - dplyr::select("covariateId", "modelCovariateIdName") - trainData$covariateData$covariates <- trainData$covariateData$covariates %>% - dplyr::rename(newId = "rowId") %>% # duckdb issue with implicit rowid - dplyr::collapse() %>% - dplyr::inner_join( - trainData$covariateData$columnMap, - by = "covariateId" - ) %>% - dplyr::select("newId", "modelCovariateIdName", "covariateValue") %>% - dplyr::rename( - rowId = "newId", # duckdb issue with implicit rowid - covariateId = "modelCovariateIdName" - ) - - trainData$covariateData$covariateRef <- dplyr::inner_join( - trainData$covariateData$covariateRef, - trainData$covariateData$columnMap, - by = "covariateId" - ) %>% - dplyr::select(-"covariateId") %>% - dplyr::rename( - covariateId = "modelCovariateIdName" - ) - - # remove the columnMap - trainData$covariateData$columnMap <- NULL - - # add attribute for FE - featureEngineering <- list( - funct = "mapColumns", - settings = list( - featureEngineeringSettings = featureEngineeringSettings - ) - ) - - attr(trainData, "metaData")$featureEngineering <- listAppend( - attr(trainData, "metaData")$featureEngineering, - featureEngineering - ) - - return(trainData) -} diff --git a/R/ExistingSklearn.R b/R/ExistingSklearn.R new file mode 100644 index 00000000..5e02b778 --- /dev/null +++ 
b/R/ExistingSklearn.R @@ -0,0 +1,119 @@ +# @file ExistingSklearn.R +# +# Copyright 2025 Observational Health Data Sciences and Informatics +# +# This file is part of PatientLevelPrediction +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +#' Plug an existing scikit learn python model into the +#' PLP framework +#' +#' @details +#' This function lets users add an existing scikit learn model that is saved as +#' model.pkl into PLP format. covariateMap is a mapping between standard +#' covariateIds and the model columns. The user also needs to specify the +#' covariate settings and population settings as these are used to determine +#' the standard PLP model design. +#' +#' @param modelLocation The location of the folder that contains the model as +#' model.pkl +#' @param covariateMap A data.frame with the columns: columnId and covariateId. +#' `covariateId` from FeatureExtraction is the standard OHDSI covariateId. +#' `columnId` is the column location the model expects that covariate to be in. +#' For example, if you had a column called 'age' in your model and this was the +#' 3rd column when fitting the model, then the value for columnId would be 3 +#' and covariateId would be 1002 (the covariateId for age in years). +#' @param covariateSettings The settings for the standardized covariates +#' @param populationSettings The settings for the population, this includes the +#' time-at-risk settings and inclusion criteria. 
+#' @param isPickle Set this to TRUE if the model should be saved as a pickle, +#' or to FALSE if it should be saved as json. +#' +#' @return +#' An object of class plpModel, this is a list that contains: +#' model (the location of the model.pkl), +#' preprocessing (preprocessing settings; tidyCovariates is NULL and +#' requireDenseMatrix is FALSE), +#' modelDesign (specification of the model design), +#' trainDetails (information about the model fitting) and +#' covariateImportance (the columnId and covariateId pairs from covariateMap). +#' +#' You can use the output as an input in PatientLevelPrediction::predictPlp to +#' apply the model and calculate the risk for patients. +#' +#' @export +createSklearnModel <- function( + modelLocation = "/model", # model needs to be saved here as "model.pkl" + covariateMap = data.frame( + columnId = 1:2, + covariateId = c(1, 2) + ), + covariateSettings, # specify the covariates + populationSettings, # specify time at risk used to develop model + isPickle = TRUE) { + checkSklearn() + checkFileExists(modelLocation) + checkIsClass(covariateMap, "data.frame") + checkIsClass(covariateSettings, "covariateSettings") + checkIsClass(populationSettings, "populationSettings") + checkBoolean(isPickle) + checkDataframe(covariateMap, c("columnId", "covariateId"), + columnTypes = list(c("numeric", "integer"), "numeric") + ) + existingModel <- list(model = "existingSklearn") + class(existingModel) <- "modelSettings" + + plpModel <- list( + preprocessing = list( + tidyCovariates = NULL, + requireDenseMatrix = FALSE + ), + covariateImportance = data.frame( + columnId = covariateMap$columnId, + covariateId = covariateMap$covariateId, + included = TRUE + ), + modelDesign = PatientLevelPrediction::createModelDesign( + targetId = 1, + outcomeId = 2, + restrictPlpDataSettings = PatientLevelPrediction::createRestrictPlpDataSettings(), + covariateSettings = covariateSettings, + populationSettings = populationSettings, + sampleSettings = 
PatientLevelPrediction::createSampleSettings(), + preprocessSettings = 
PatientLevelPrediction::createPreprocessSettings( + minFraction = 0, + normalize = FALSE, + removeRedundancy = FALSE + ), + modelSettings = existingModel, + splitSettings = PatientLevelPrediction::createDefaultSplitSetting() + ), + model = modelLocation, + trainDetails = list( + analysisId = "existingSklearn", + developmentDatabase = "unknown", + developmentDatabaseId = "unknown", + trainingTime = -1, + modelName = "existingSklearn" + ) + ) + + attr(plpModel, "modelType") <- "binary" + attr(plpModel, "saveType") <- "file" + attr(plpModel, "predictionFunction") <- "predictPythonSklearn" + attr(plpModel, "saveToJson") <- !isPickle + class(plpModel) <- "plpModel" + return(plpModel) +} diff --git a/man/createFeatureEngineeringMapColumnsSettings.Rd b/man/createFeatureEngineeringMapColumnsSettings.Rd deleted file mode 100644 index e6494813..00000000 --- a/man/createFeatureEngineeringMapColumnsSettings.Rd +++ /dev/null @@ -1,25 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/ExistingPython.R -\name{createFeatureEngineeringMapColumnsSettings} -\alias{createFeatureEngineeringMapColumnsSettings} -\title{Create settings that enable you to convert from standard covariateIds to -model covariate names - this is useful when implementing a model developed -outside of the OHDSI tools.} -\usage{ -createFeatureEngineeringMapColumnsSettings(columnMap) -} -\arguments{ -\item{columnMap}{A data.frame containing the columns: covariateId the covariate ID from FeatureExtraction and -modelCovariateIdName which is the column name used when fitting the model.} -} -\value{ -An object of class \code{featureEngineeringSettings} that will convert column names -} -\description{ -Create settings that enable you to convert from standard covariateIds to -model covariate names - this is useful when implementing a model developed -outside of the OHDSI tools. 
-} -\details{ -This function creates settings that let you rename the covariates in the plpData object -} diff --git a/man/createSciKitLearnModel.Rd b/man/createSciKitLearnModel.Rd deleted file mode 100644 index 92819d19..00000000 --- a/man/createSciKitLearnModel.Rd +++ /dev/null @@ -1,49 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/ExistingPython.R -\name{createSciKitLearnModel} -\alias{createSciKitLearnModel} -\title{Plug an existing scikit learn python model into the -PLP framework} -\usage{ -createSciKitLearnModel( - modelLocation = "/model", - covariateMap = data.frame(columnId = 1:2, covariateId = c(1, 2), modelCovariateIdName = - c("pred_1", "pred_2")), - covariateSettings, - populationSettings, - isPickle = TRUE -) -} -\arguments{ -\item{modelLocation}{The location of the folder that contains the model as model.pkl} - -\item{covariateMap}{A data.frame with the columns: columnId specifying the column order for the -covariate, covariateId the covariate ID from FeatureExtraction and modelCovariateIdName which is the -column name used when fitting the model. 
For example, if you had a column called 'age' in your model and this was the 3rd -column when fitting the model, then the values for columnId would be 3, covariateId would be 1002 (the covariateId for age in years) and -modelCovariateIdName would be 'age'.} - -\item{covariateSettings}{The settings for the standardized covariates} - -\item{populationSettings}{The settings for the population, this includes the time-at-risk settings and -and inclusion criteria.} - -\item{isPickle}{If the model is saved as a pickle set this to T if it is a json set this to F} -} -\value{ -An object of class plpModel, this is a list that contains: model (the location of the model.pkl), -preprocessing (settings for mapping the covariateIds to the model column mames), modelDesign (specification -of the model design), trainDetails (information about the model fitting) and covariateImportance. You can use the output -as an input in PatientLevelPrediction::predictPlp to apply the model and calculate the risk for patients. -} -\description{ -Plug an existing scikit learn python model into the -PLP framework -} -\details{ -This function lets users add an existing scikit learn that is saved as model.pkl -into PLP format. covariateMap is a mapping between standard covariateIds and the model column names -and order are required in addition to pythonModelLocation, the location of the model that must be saved -as `model.pkl`. The user also needs to specify the covariate settings and population settings as these -are used to determine the standard PLP model design. 
-} diff --git a/man/createSklearnModel.Rd b/man/createSklearnModel.Rd new file mode 100644 index 00000000..f66e8cca --- /dev/null +++ b/man/createSklearnModel.Rd @@ -0,0 +1,57 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ExistingSklearn.R +\name{createSklearnModel} +\alias{createSklearnModel} +\title{Plug an existing scikit learn python model into the +PLP framework} +\usage{ +createSklearnModel( + modelLocation = "/model", + covariateMap = data.frame(columnId = 1:2, covariateId = c(1, 2)), + covariateSettings, + populationSettings, + isPickle = TRUE +) +} +\arguments{ +\item{modelLocation}{The location of the folder that contains the model as +model.pkl} + +\item{covariateMap}{A data.frame with the columns: columnId and covariateId. +`covariateId` from FeatureExtraction is the standard OHDSI covariateId. +`columnId` is the column location the model expects that covariate to be in. +For example, if you had a column called 'age' in your model and this was the +3rd column when fitting the model, then the value for columnId would be 3 +and covariateId would be 1002 (the covariateId for age in years).} + +\item{covariateSettings}{The settings for the standardized covariates} + +\item{populationSettings}{The settings for the population, this includes the +time-at-risk settings and inclusion criteria.} + +\item{isPickle}{Set this to TRUE if the model should be saved as a pickle, +or to FALSE if it should be saved as json.} +} +\value{ +An object of class plpModel, this is a list that contains: + model (the location of the model.pkl), + preprocessing (preprocessing settings; tidyCovariates is NULL and + requireDenseMatrix is FALSE), + modelDesign (specification of the model design), + trainDetails (information about the model fitting) and + covariateImportance (the columnId and covariateId pairs from covariateMap). + +You can use the output as an input in PatientLevelPrediction::predictPlp to +apply the model and calculate the risk for patients. 
+} +\description{ +Plug an existing scikit learn python model into the +PLP framework +} +\details{ +This function lets users add an existing scikit learn model that is saved as +model.pkl into PLP format. covariateMap is a mapping between standard +covariateIds and the model columns. The user also needs to specify the +covariate settings and population settings as these are used to determine +the standard PLP model design. +} diff --git a/tests/testthat/test-existingModel.R b/tests/testthat/test-existingModel.R index 11a22f5d..eb59ddb1 100644 --- a/tests/testthat/test-existingModel.R +++ b/tests/testthat/test-existingModel.R @@ -1,47 +1,46 @@ test_that("Create existing sklearn works", { - expect_error(createSciKitLearnModel("existing")) + expect_error(createSklearnModel("existing")) # create a file model.pkl for testing file.create("model.pkl") covariateSettings <- FeatureExtraction::createCovariateSettings(useDemographicsAge = TRUE) populationSettings <- createStudyPopulationSettings() # dataframe wrong type - expect_error(createSciKitLearnModel( + expect_error(createSklearnModel( modelLocation = "model.pkl", covariateMap = list( columnId = "columnId", - modelCovariateIdName = "modelCovariateIdName" + covariateId = c(1) ), covariateSettings = covariateSettings, populationSettings = populationSettings )) # dataframe wrong column names - expect_error(createSciKitLearnModel( + expect_error(createSklearnModel( modelLocation = "model.pkl", covariateMap = data.frame( - columnId = "columnId", - modelCovariateIdName = "modelCovariateIdName" + columnId = c(1), + notCovariateId = c(1002) ), covariateSettings = covariateSettings, populationSettings = populationSettings )) # dataframe wrong column types - expect_error(createSciKitLearnModel( + expect_error(createSklearnModel( modelLocation = "model.pkl", covariateMap = data.frame( columnId = 1, - modelCovariateIdName = 2 + covariateId = "2" ), covariateSettings = covariateSettings, populationSettings = populationSettings )) 
- model <- createSciKitLearnModel( + model <- createSklearnModel( modelLocation = "model.pkl", covariateMap = data.frame( columnId = c(1, 2), - covariateId = c(1002, 1003), - modelCovariateIdName = c("feature1", "feature2") + covariateId = c(1002, 1003) ), covariateSettings = covariateSettings, populationSettings = populationSettings @@ -51,9 +50,6 @@ test_that("Create existing sklearn works", { expect_equal(attr(model, "predictionFunction"), "predictPythonSklearn") expect_equal(attr(model, "saveToJson"), FALSE) expect_equal(class(model), "plpModel") - expect_equal(model$preprocessing$featureEngineering$funct, "mapColumns") - expect_equal(model$preprocessing$featureEngineering$settings$featureEngineeringSettings$columnMap$columnId, c(1, 2)) - expect_equal(model$preprocessing$featureEngineering$settings$featureEngineeringSettings$columnMap$modelCovariateIdName, c("feature1", "feature2")) unlink("model.pkl") }) @@ -87,11 +83,10 @@ test_that("existing sklearn model works", { joblib <- reticulate::import("joblib") joblib$dump(model, file.path(plpModel$model, "model.pkl")) - # extract covariatMap from plpModel + # extract covariateMap from plpModel covariateMap <- plpModel$covariateImportance %>% dplyr::select(columnId, covariateId) - covariateMap$modelCovariateIdName <- as.character(covariateMap$covariateId) - existingModel <- createSciKitLearnModel( + existingModel <- createSklearnModel( modelLocation = file.path(plpModel$model), covariateMap = covariateMap, covariateSettings = plpModel$modelDesign$covariateSettings,