Skip to content

Commit

Permalink
adding code for existing scikit learn model
Browse files Browse the repository at this point in the history
- code that lets users plug in scikit learn models into the PLP framework
  • Loading branch information
jreps committed Nov 11, 2024
1 parent 253d1b7 commit 18d5782
Show file tree
Hide file tree
Showing 4 changed files with 271 additions and 0 deletions.
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ export(createDatabaseSchemaSettings)
export(createDefaultExecuteSettings)
export(createDefaultSplitSetting)
export(createExecuteSettings)
export(createFeatureEngineeringMapColumnsSettings)
export(createFeatureEngineeringSettings)
export(createGlmModel)
export(createLearningCurve)
Expand All @@ -32,6 +33,7 @@ export(createPreprocessSettings)
export(createRandomForestFeatureSelection)
export(createRestrictPlpDataSettings)
export(createSampleSettings)
export(createSciKitLearnModel)
export(createSplineSettings)
export(createStratifiedImputationSettings)
export(createStudyPopulation)
Expand Down
195 changes: 195 additions & 0 deletions R/ExistingPython.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,195 @@
# @file ExistingPython.R
#
# Copyright 2024 Observational Health Data Sciences and Informatics
#
# This file is part of PatientLevelPrediction
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


#' Plug an existing scikit learn python model developed outside OHDSI into the
#' PLP framework
#'
#' @details
#' This function lets users add an existing scikit learn model that is saved as
#' model.pkl into PLP format. In addition to pythonModelLocation (the location
#' of the folder where the model must be saved as model.pkl), a covariateMap is
#' required: this is a mapping between the standard covariateIds and the model
#' column names and column order. The user also needs to specify the covariate
#' settings and population settings as these are used to determine the standard
#' PLP model design. The model design is created with the hard-coded values
#' targetId = 1 and outcomeId = 2.
#'
#' @param pythonModelLocation The location of the folder that contains the
#' model as model.pkl
#' @param covariateMap A data.frame with the columns: columnId specifying the
#' column order for the covariate, covariateId the covariate ID from
#' FeatureExtraction and modelCovariateIdName which is the column name used
#' when fitting the model. For example, if you had a column called 'age' in
#' your model and this was the 3rd column when fitting the model, then the
#' values for columnId would be 3, covariateId would be 1002 (the covariateId
#' for age in years) and modelCovariateIdName would be 'age'. An error is
#' raised if covariateMap is not a data.frame, has no rows or is missing any of
#' these columns.
#' @param covariateSettings The settings for the standardized covariates
#' @param populationSettings The settings for the population, this includes
#' the time-at-risk settings and inclusion criteria.
#' @param isPickle If the model is saved as a pickle set this to TRUE, if it is
#' saved as a json set this to FALSE
#'
#' @return
#' An object of class plpModel, this is a list that contains: model (the
#' location of the model.pkl), preprocessing (settings for mapping the
#' covariateIds to the model column names), modelDesign (specification of the
#' model design), trainDetails (information about the model fitting) and
#' covariateImportance. You can use the output as an input in
#' PatientLevelPrediction::predictPlp to apply the model and calculate the risk
#' for patients.
#'
#' @export
createSciKitLearnModel <- function(
    pythonModelLocation = "/model", # model needs to be saved here as "model.pkl"
    covariateMap = data.frame(
      columnId = 1:2,
      covariateId = c(1, 2),
      modelCovariateIdName = c("pred_1", "pred_2")
    ),
    covariateSettings, # specify the covariates
    populationSettings, # specify time at risk used to develop model
    isPickle = TRUE
) {
  # Fail early with a clear message: a covariateMap without the expected
  # columns would otherwise yield a malformed covariateImportance table
  # (missing columns evaluate to NULL) or an obscure error much later.
  if (!is.data.frame(covariateMap)) {
    stop("covariateMap must be a data.frame", call. = FALSE)
  }
  requiredColumns <- c("columnId", "covariateId", "modelCovariateIdName")
  missingColumns <- setdiff(requiredColumns, colnames(covariateMap))
  if (length(missingColumns) > 0) {
    stop(
      "covariateMap is missing required column(s): ",
      paste(missingColumns, collapse = ", "),
      call. = FALSE
    )
  }
  if (nrow(covariateMap) == 0) {
    stop("covariateMap must contain at least one row", call. = FALSE)
  }

  existingModel <- list(model = "existingPython")
  class(existingModel) <- "modelSettings"

  # The same column-mapping settings are stored in the preprocessing step and
  # in the model design, so build them once.
  mapColumnsSettings <- createFeatureEngineeringMapColumnsSettings(
    columnMap = covariateMap
  )

  plpModel <- list(
    # use plpModel$preprocessing$featureEngineering to rename columns
    # set plpModel$preprocessing$tidyCovariates to NULL
    preprocessing = list(
      featureEngineering = list(
        funct = "mapColumns",
        settings = list(
          featureEngineeringSettings = mapColumnsSettings
        )
      ),
      tidyCovariates = NULL,
      requireDenseMatrix = FALSE
    ),
    # covariateId holds the model column names here, not the standard
    # covariateIds from FeatureExtraction
    covariateImportance = data.frame(
      columnId = covariateMap[["columnId"]],
      covariateId = covariateMap[["modelCovariateIdName"]],
      included = TRUE
    ),
    modelDesign = PatientLevelPrediction::createModelDesign(
      targetId = 1,
      outcomeId = 2,
      restrictPlpDataSettings = PatientLevelPrediction::createRestrictPlpDataSettings(),
      covariateSettings = covariateSettings,
      populationSettings = populationSettings,
      sampleSettings = PatientLevelPrediction::createSampleSettings(),
      featureEngineeringSettings = mapColumnsSettings,
      # no filtering, normalization or redundancy removal is requested
      preprocessSettings = PatientLevelPrediction::createPreprocessSettings(
        minFraction = 0,
        normalize = FALSE,
        removeRedundancy = FALSE
      ),
      modelSettings = existingModel,
      splitSettings = PatientLevelPrediction::createDefaultSplitSetting()
    ),
    model = pythonModelLocation,
    trainDetails = list(
      # NOTE(review): 'exisitingPython' is misspelt; the value is kept as-is
      # because other code may match on it - confirm before correcting
      analysisId = "exisitingPython",
      developmentDatabase = "nonOMOP",
      developmentDatabaseId = "nonOMOP",
      trainingTime = -1,
      modelName = "existing"
    )
  )

  attr(plpModel, "modelType") <- "binary"
  attr(plpModel, "saveType") <- "file"
  attr(plpModel, "predictionFunction") <- "predictPythonSklearn"
  attr(plpModel, "saveToJson") <- !isPickle
  class(plpModel) <- "plpModel"
  return(plpModel)
}


#' Create settings that enable you to convert from standard covariateIds to
#' model covariate names - this is useful when implementing a model developed
#' outside of the OHDSI tools.
#'
#' @details
#' This function creates settings that let you rename the covariates in the
#' plpData object. The settings carry the attribute "fun" set to "mapColumns".
#'
#' @param columnMap A data.frame containing the columns: covariateId the
#' covariate ID from FeatureExtraction and modelCovariateIdName which is the
#' column name used when fitting the model. An error is raised if columnMap is
#' not a data.frame or is missing either column.
#'
#' @return
#' An object of class featureEngineeringSettings that will convert column names
#'
#' @export
createFeatureEngineeringMapColumnsSettings <- function(
    columnMap
) {
  # Validate here so that a bad map is reported when the settings are created
  # rather than when the mapping is eventually applied.
  if (!is.data.frame(columnMap)) {
    stop("columnMap must be a data.frame", call. = FALSE)
  }
  requiredColumns <- c("covariateId", "modelCovariateIdName")
  missingColumns <- setdiff(requiredColumns, colnames(columnMap))
  if (length(missingColumns) > 0) {
    stop(
      "columnMap is missing required column(s): ",
      paste(missingColumns, collapse = ", "),
      call. = FALSE
    )
  }

  featureEngineeringSettings <- list(
    columnMap = columnMap
  )

  attr(featureEngineeringSettings, "fun") <- "mapColumns"
  class(featureEngineeringSettings) <- "featureEngineeringSettings"
  return(featureEngineeringSettings)
}

# Rename the covariates in trainData from standard covariateIds to the model
# column names given in featureEngineeringSettings$columnMap.
#
# trainData: a list-like object with a covariateData element containing the
#   covariates and covariateRef tables and a 'metaData' attribute.
# featureEngineeringSettings: settings holding a columnMap with the columns
#   covariateId and modelCovariateIdName.
#
# Returns trainData where, in both covariates and covariateRef, covariateId
# now holds the modelCovariateIdName values. Rows whose covariateId is not in
# the columnMap are dropped (inner join). The applied step is appended to
# attr(trainData, 'metaData')$featureEngineering.
mapColumns <- function(
    trainData,
    featureEngineeringSettings
) {
  ParallelLogger::logInfo("Changing column names")

  # swap the covariateId with the modelCovariateIdName in a single table
  swapCovariateId <- function(covariateTable, columnMap) {
    dplyr::inner_join(
      covariateTable,
      columnMap,
      by = "covariateId"
    ) %>%
      dplyr::select(-"covariateId") %>%
      dplyr::rename(
        covariateId = "modelCovariateIdName"
      )
  }

  # Temporarily store the map inside covariateData so it can be joined to the
  # covariate tables.
  # NOTE(review): presumably covariateData is an Andromeda object whose tables
  # can only be joined with tables in the same object - confirm
  trainData$covariateData$columnMap <- featureEngineeringSettings$columnMap %>%
    dplyr::select("covariateId", "modelCovariateIdName")

  trainData$covariateData$covariates <- swapCovariateId(
    trainData$covariateData$covariates,
    trainData$covariateData$columnMap
  )

  trainData$covariateData$covariateRef <- swapCovariateId(
    trainData$covariateData$covariateRef,
    trainData$covariateData$columnMap
  )

  # remove the columnMap
  trainData$covariateData$columnMap <- NULL

  # add attribute for FE
  featureEngineering <- list(
    funct = "mapColumns",
    settings = list(
      featureEngineeringSettings = featureEngineeringSettings
    )
  )

  attr(trainData, "metaData")$featureEngineering <- listAppend(
    attr(trainData, "metaData")$featureEngineering,
    featureEngineering
  )

  return(trainData)
}
25 changes: 25 additions & 0 deletions man/createFeatureEngineeringMapColumnsSettings.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

49 changes: 49 additions & 0 deletions man/createSciKitLearnModel.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 18d5782

Please sign in to comment.