diff --git a/.gitignore b/.gitignore index 8245e9a78..a079224c7 100644 --- a/.gitignore +++ b/.gitignore @@ -25,5 +25,8 @@ standalone/build/* /inst/shiny/DiagnosticsExplorer/rsconnect/* /doc/ /Meta/ +/extras/ +/results/ +/.vscode/ .project .cproject diff --git a/DESCRIPTION b/DESCRIPTION index 1065bba92..a56b87c28 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,6 @@ Package: PatientLevelPrediction Type: Package -Title: Developing patient level prediction using data in the OMOP Common Data - Model +Title: Developing Patient Level Prediction Models Using the Observational Medical Outcomes Partnership Common Data Model Version: 6.3.9.9999 Date: 2024-10-10 Authors@R: c( @@ -10,12 +9,12 @@ Authors@R: c( person("Martijn", "Schuemie", role = c("aut")), person("Marc", "Suchard", role = c("aut")), person("Patrick", "Ryan", role = c("aut")), - person("Peter", "Rijnbeek", role = c("aut")) - ) -Description: A user friendly way to create patient level prediction models using the OMOP common data model. Given a + person("Peter", "Rijnbeek", role = c("aut")), + person("Observational Health Data Science and Informatics", role = c("cph"))) +Description: A user friendly way to create patient level prediction models using the OMOP (Observational Medical Outcomes Partnership) common data model. Given a cohort of interest and an outcome of interest, the package can use data in the OMOP Common Data Model to build a large set of features. These features can then - be assessed to fit a predictive model using a number of machine learning algorithms. + be used to fit a predictive model with a number of machine learning algorithms. Several performance measures are implemented for model evaluation. 
License: Apache License 2.0 URL: https://ohdsi.github.io/PatientLevelPrediction, https://github.com/OHDSI/PatientLevelPrediction diff --git a/README.md b/README.md index 9587cfa6f..567ba37c8 100644 --- a/README.md +++ b/README.md @@ -18,11 +18,11 @@ Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek PR. [Design and implementati The figure below illustrates the prediction problem we address. Among a population at risk, we aim to predict which patients at a defined moment in time (t = 0) will experience some outcome during a time-at-risk. Prediction is done using only information about the patients in an observation window prior to that moment in time. -![](vignettes/Figure1.webp) +![](vignettes/images/Figure1.avif) To define a prediction problem we have to define t=0 by a Target Cohort (T), the outcome we like to predict by an outcome cohort (O), and the time-at-risk (TAR). Furthermore, we have to make design choices for the model we like to develop, and determine the observational datasets to perform internal and external validation. This conceptual framework works for all type of prediction problems, for example those presented below (T=green, O=red). 
-![](vignettes/problems.webp) +![](vignettes/images/problems.avif) Features ======== @@ -51,11 +51,11 @@ Screenshots -Calibration plot +Calibration plot -ROC plot +ROC plot diff --git a/_pkgdown.yml b/_pkgdown.yml index ade5b412b..447991ad2 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -1,6 +1,8 @@ template: + bootstrap: 5 params: bootswatch: cosmo + light-switch: true development: mode: auto @@ -16,10 +18,8 @@ navbar: left: - home - intro - - videos - reference - articles - - tutorial - benchmarks - predictors - bestpractice diff --git a/inst/doc/AddingCustomFeatureEngineering.pdf b/inst/doc/AddingCustomFeatureEngineering.pdf deleted file mode 100644 index e4e8220ce..000000000 Binary files a/inst/doc/AddingCustomFeatureEngineering.pdf and /dev/null differ diff --git a/inst/doc/AddingCustomModels.pdf b/inst/doc/AddingCustomModels.pdf deleted file mode 100644 index f58d39dbb..000000000 Binary files a/inst/doc/AddingCustomModels.pdf and /dev/null differ diff --git a/inst/doc/AddingCustomSamples.pdf b/inst/doc/AddingCustomSamples.pdf deleted file mode 100644 index e85e0baaf..000000000 Binary files a/inst/doc/AddingCustomSamples.pdf and /dev/null differ diff --git a/inst/doc/AddingCustomSplitting.pdf b/inst/doc/AddingCustomSplitting.pdf deleted file mode 100644 index 6bc846f51..000000000 Binary files a/inst/doc/AddingCustomSplitting.pdf and /dev/null differ diff --git a/inst/doc/BuildingMultiplePredictiveModels.pdf b/inst/doc/BuildingMultiplePredictiveModels.pdf deleted file mode 100644 index 31c3cb98a..000000000 Binary files a/inst/doc/BuildingMultiplePredictiveModels.pdf and /dev/null differ diff --git a/inst/doc/BuildingPredictiveModels.pdf b/inst/doc/BuildingPredictiveModels.pdf deleted file mode 100644 index fd77a14bc..000000000 Binary files a/inst/doc/BuildingPredictiveModels.pdf and /dev/null differ diff --git a/inst/doc/BuildingPredictiveModels.tex b/inst/doc/BuildingPredictiveModels.tex deleted file mode 100644 index ae3ce9ba2..000000000 --- 
a/inst/doc/BuildingPredictiveModels.tex +++ /dev/null @@ -1,2473 +0,0 @@ -% Options for packages loaded elsewhere -\PassOptionsToPackage{unicode}{hyperref} -\PassOptionsToPackage{hyphens}{url} -% -\documentclass[ -]{article} -\usepackage{amsmath,amssymb} -\usepackage{iftex} -\ifPDFTeX - \usepackage[T1]{fontenc} - \usepackage[utf8]{inputenc} - \usepackage{textcomp} % provide euro and other symbols -\else % if luatex or xetex - \usepackage{unicode-math} % this also loads fontspec - \defaultfontfeatures{Scale=MatchLowercase} - \defaultfontfeatures[\rmfamily]{Ligatures=TeX,Scale=1} -\fi -\usepackage{lmodern} -\ifPDFTeX\else - % xetex/luatex font selection -\fi -% Use upquote if available, for straight quotes in verbatim environments -\IfFileExists{upquote.sty}{\usepackage{upquote}}{} -\IfFileExists{microtype.sty}{% use microtype if available - \usepackage[]{microtype} - \UseMicrotypeSet[protrusion]{basicmath} % disable protrusion for tt fonts -}{} -\makeatletter -\@ifundefined{KOMAClassName}{% if non-KOMA class - \IfFileExists{parskip.sty}{% - \usepackage{parskip} - }{% else - \setlength{\parindent}{0pt} - \setlength{\parskip}{6pt plus 2pt minus 1pt}} -}{% if KOMA class - \KOMAoptions{parskip=half}} -\makeatother -\usepackage{xcolor} -\usepackage[margin=1in]{geometry} -\usepackage{color} -\usepackage{fancyvrb} -\newcommand{\VerbBar}{|} -\newcommand{\VERB}{\Verb[commandchars=\\\{\}]} -\DefineVerbatimEnvironment{Highlighting}{Verbatim}{commandchars=\\\{\}} -% Add ',fontsize=\small' for more characters per line -\usepackage{framed} -\definecolor{shadecolor}{RGB}{248,248,248} -\newenvironment{Shaded}{\begin{snugshade}}{\end{snugshade}} -\newcommand{\AlertTok}[1]{\textcolor[rgb]{0.94,0.16,0.16}{#1}} -\newcommand{\AnnotationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\newcommand{\AttributeTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{#1}} -\newcommand{\BaseNTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}} -\newcommand{\BuiltInTok}[1]{#1} 
-\newcommand{\CharTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}} -\newcommand{\CommentTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textit{#1}}} -\newcommand{\CommentVarTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\newcommand{\ConstantTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{#1}} -\newcommand{\ControlFlowTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{\textbf{#1}}} -\newcommand{\DataTypeTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{#1}} -\newcommand{\DecValTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}} -\newcommand{\DocumentationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\newcommand{\ErrorTok}[1]{\textcolor[rgb]{0.64,0.00,0.00}{\textbf{#1}}} -\newcommand{\ExtensionTok}[1]{#1} -\newcommand{\FloatTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}} -\newcommand{\FunctionTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{\textbf{#1}}} -\newcommand{\ImportTok}[1]{#1} -\newcommand{\InformationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\newcommand{\KeywordTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{\textbf{#1}}} -\newcommand{\NormalTok}[1]{#1} -\newcommand{\OperatorTok}[1]{\textcolor[rgb]{0.81,0.36,0.00}{\textbf{#1}}} -\newcommand{\OtherTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{#1}} -\newcommand{\PreprocessorTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textit{#1}}} -\newcommand{\RegionMarkerTok}[1]{#1} -\newcommand{\SpecialCharTok}[1]{\textcolor[rgb]{0.81,0.36,0.00}{\textbf{#1}}} -\newcommand{\SpecialStringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}} -\newcommand{\StringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}} -\newcommand{\VariableTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}} -\newcommand{\VerbatimStringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}} -\newcommand{\WarningTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}} -\usepackage{longtable,booktabs,array} -\usepackage{calc} % for calculating minipage widths -% Correct order of tables after \paragraph or \subparagraph -\usepackage{etoolbox} -\makeatletter 
-\patchcmd\longtable{\par}{\if@noskipsec\mbox{}\fi\par}{}{} -\makeatother -% Allow footnotes in longtable head/foot -\IfFileExists{footnotehyper.sty}{\usepackage{footnotehyper}}{\usepackage{footnote}} -\makesavenoteenv{longtable} -\usepackage{graphicx} -\makeatletter -\def\maxwidth{\ifdim\Gin@nat@width>\linewidth\linewidth\else\Gin@nat@width\fi} -\def\maxheight{\ifdim\Gin@nat@height>\textheight\textheight\else\Gin@nat@height\fi} -\makeatother -% Scale images if necessary, so that they will not overflow the page -% margins by default, and it is still possible to overwrite the defaults -% using explicit options in \includegraphics[width, height, ...]{} -\setkeys{Gin}{width=\maxwidth,height=\maxheight,keepaspectratio} -% Set default figure placement to htbp -\makeatletter -\def\fps@figure{htbp} -\makeatother -\setlength{\emergencystretch}{3em} % prevent overfull lines -\providecommand{\tightlist}{% - \setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}} -\setcounter{secnumdepth}{5} -\usepackage{fancyhdr} -\pagestyle{fancy} -\fancyhead{} -\fancyhead[CO,CE]{Installation Guide} -\fancyfoot[CO,CE]{PatientLevelPrediction Package Version 6.3.7.9999} -\fancyfoot[LE,RO]{\thepage} -\renewcommand{\headrulewidth}{0.4pt} -\renewcommand{\footrulewidth}{0.4pt} -\ifLuaTeX - \usepackage{selnolig} % disable illegal ligatures -\fi -\IfFileExists{bookmark.sty}{\usepackage{bookmark}}{\usepackage{hyperref}} -\IfFileExists{xurl.sty}{\usepackage{xurl}}{} % add URL line breaks if available -\urlstyle{same} -\hypersetup{ - pdftitle={Building patient-level predictive models}, - pdfauthor={Jenna Reps, Martijn J. Schuemie, Patrick B. Ryan, Peter R. Rijnbeek}, - hidelinks, - pdfcreator={LaTeX via pandoc}} - -\title{Building patient-level predictive models} -\author{Jenna Reps, Martijn J. Schuemie, Patrick B. Ryan, Peter R. 
-Rijnbeek} -\date{2024-04-26} - -\begin{document} -\maketitle - -{ -\setcounter{tocdepth}{3} -\tableofcontents -} -\hypertarget{introduction}{% -\section{Introduction}\label{introduction}} - -Observational healthcare data, such as administrative claims and -electronic health records, are increasingly used for clinical -characterization of disease progression, quality improvement, and -population-level effect estimation for medical product safety -surveillance and comparative effectiveness. Advances in machine learning -for large dataset analysis have led to increased interest in applying -patient-level prediction on this type of data. Patient-level prediction -offers the potential for medical practice to move beyond average -treatment effects and to consider personalized risks as part of clinical -decision-making. However, many published efforts in -patient-level-prediction do not follow the model development guidelines, -fail to perform extensive external validation, or provide insufficient -model details that limits the ability of independent researchers to -reproduce the models and perform external validation. This makes it hard -to fairly evaluate the predictive performance of the models and reduces -the likelihood of the model being used appropriately in clinical -practice. To improve standards, several papers have been written -detailing guidelines for best practices in developing and reporting -prediction models. - -The Transparent Reporting of a multivariable prediction model for -\href{https://www.equator-network.org/reporting-guidelines/tripod-statement/}{\texttt{Individual\ Prognosis\ Or\ Diagnosis\ (TRIPOD)\ statement}} -provides clear recommendations for reporting prediction model -development and validation and addresses some of the concerns related to -transparency. 
However, data structure heterogeneity and inconsistent -terminologies still make collaboration and model sharing difficult as -different researchers are often required to write new code to extract -the data from their databases and may define variables differently. - -In our -\href{https://academic.oup.com/jamia/article/25/8/969/4989437}{\texttt{paper}}, -we propose a standardised framework for patient-level prediction that -utilizes the OMOP Common Data Model (CDM) and standardized vocabularies, -and describe the open-source software that we developed implementing the -framework's pipeline. The framework is the first to support existing -best practice guidelines and will enable open dissemination of models -that can be extensively validated across the network of OHDSI -collaborators. - -Figure 1, illustrates the prediction problem we address. Among a -population at risk, we aim to predict which patients at a defined moment -in time (t = 0) will experience some outcome during a time-at-risk. -Prediction is done using only information about the patients in an -observation window prior to that moment in time. - -\begin{figure} -\centering -\includegraphics{Figure1.webp} -\caption{The prediction problem} -\end{figure} - -As shown in Figure 2, to define a prediction problem we have to define -t=0 by a Target Cohort (T), the outcome we like to predict by an outcome -cohort (O), and the time-at-risk (TAR). Furthermore, we have to make -design choices for the model we like to develop, and determine the -observational datasets to perform internal and external validation. This -conceptual framework works for all type of prediction problems, for -example those presented in Figure 3. 
- -\begin{figure} -\centering -\includegraphics{studydesign.webp} -\caption{Design choices} -\end{figure} - -\begin{figure} -\centering -\includegraphics{problems.webp} -\caption{Examples of prediction problems} -\end{figure} - -This vignette describes how you can use the -\texttt{PatientLevelPrediction} package to build patient-level -predictive models. The package enables data extraction, model building, -and model evaluation using data from databases that are translated into -the OMOP CDM. In this vignette we assume you have installed the package -correctly using the -\href{https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/InstallationGuide.pdf}{\texttt{InstallationGuide}}. - -\hypertarget{study-specification}{% -\section{Study specification}\label{study-specification}} - -We have to clearly specify our study upfront to be able to implement it. -This means we need to define the prediction problem we like to address, -in which population we will build the model, which model we will build -and how we will evaluate its performance. To guide you through this -process we will use a ``Disease onset and progression'' prediction type -as an example. - -\hypertarget{problem-definition-1-stroke-in-afibrilation-patients}{% -\subsection{Problem definition 1: Stroke in afibrilation -patients}\label{problem-definition-1-stroke-in-afibrilation-patients}} - -Atrial fibrillation is a disease characterized by an irregular heart -rate that can cause poor blood flow. Patients with atrial fibrillation -are at increased risk of ischemic stroke. Anticoagulation is a -recommended prophylaxis treatment strategy for patients at high risk of -stroke, though the underuse of anticoagulants and persistent severity of -ischemic stroke represents a substantial unmet medical need. Various -strategies have been developed to predict risk of ischemic stroke in -patients with atrial fibrillation. 
CHADS2 (Gage JAMA 2001) was developed -as a risk score based on history of congestive heart failure, -hypertension, age\textgreater=75, diabetes and stroke. CHADS2 was -initially derived using Medicare claims data, where it achieved good -discrimination (AUC=0.82). However, subsequent external validation -studies revealed the CHADS2 had substantially lower predictive accuracy -(Keogh Thromb Haemost 2011). Subsequent stroke risk calculators have -been developed and evaluated, including the extension of CHADS2Vasc. The -management of atrial fibrillation has evolved substantially over the -last decade, for various reasons that include the introduction of novel -oral anticoagulants. With these innovations has come a renewed interest -in greater precision medicine for stroke prevention. - -We will apply the PatientLevelPrediction package to observational -healthcare data to address the following patient-level prediction -question: - -Amongst patients who are newly diagnosed with Atrial Fibrillation, which -patients will go on to have Ischemic Stroke within 1 year? - -We will define `patients who are newly diagnosed with Atrial -Fibrillation' as the first condition record of cardiac arrhythmia, which -is followed by another cardiac arrhythmia condition record, at least two -drug records for a drug used to treat arrhythmias, or a procedure to -treat arrhythmias. We will define `Ischemic stroke events' as ischemic -stroke condition records during an inpatient or ER visit; successive -records with \textgreater{} 180 day gap are considered independent -episodes. 
- -\hypertarget{problem-definition-2-angioedema-in-ace-inhibitor-users}{% -\subsection{Problem definition 2: Angioedema in ACE inhibitor -users}\label{problem-definition-2-angioedema-in-ace-inhibitor-users}} - -Angiotensin converting enzyme inhibitors (ACE inhibitors) are -medications used by patients with hypertension that widen the blood -vessles and therefore increse the amount of blood pumped by the heart -and decreases blood pressure. Ace inhibitors reduce a patients risk of -cardiovasular disease but can lead to drug-induced angioedema. - -We will apply the PatientLevelPrediction package to observational -healthcare data to address the following patient-level prediction -question: - -Amongst patients who are newly dispensed an ACE inhibitor, which -patients will go on to have angioedema within 1 year? - -We will define `patients who are newly dispensed an ACE inhibitor' as -the first drug record of sny ACE inhibitor, {[}\ldots{]}which is -followed by another cardiac arrhythmia condition record, at least two -drug records for a drug used to treat arrhythmias, or a procedure to -treat arrhythmias. We will define `angioedema' as an angioedema -condition record. - -\hypertarget{study-population-definition}{% -\subsection{Study population -definition}\label{study-population-definition}} - -The final study population in which we will develop our model is often a -subset of the Target population, because we will e.g.~apply criteria -that are dependent on T and O or we want to do sensitivity analyses with -subpopulations of T. For this we have to answer the following questions: - -\begin{itemize} -\item - \emph{What is the minimum amount of observation time we require before - the start of the target cohort?} This choice could depend on the - available patient time in your training data, but also on the time you - expect to be available in the data sources you want to apply the model - on in the future. 
The longer the minimum observation time, the more - baseline history time is available for each person to use for feature - extraction, but the fewer patients will qualify for analysis. - Moreover, there could be clinical reasons to choose a short or longer - lookback period. For our example, we will use a prior history as - lookback period (washout period). -\item - \emph{Can patients enter the target cohort multiple times?} In the - target cohort definition, a person may qualify for the cohort multiple - times during different spans of time, for example if they had - different episodes of a disease or separate periods of exposure to a - medical product. The cohort definition does not necessarily apply a - restriction to only let the patients enter once, but in the context of - a particular patient-level prediction problem, a user may want to - restrict the cohort to the first qualifying episode. In our example, a - person could only enter the target cohort once since our criteria was - based on first occurrence of atrial fibrillation. -\item - \emph{Do we allow persons to enter the cohort if they experienced the - outcome before?} Do we allow persons to enter the target cohort if - they experienced the outcome before qualifying for the target cohort? - Depending on the particular patient-level prediction problem, there - may be a desire to predict `incident' first occurrence of an outcome, - in which case patients who have previously experienced the outcome are - not `at-risk' for having a first occurrence and therefore should be - excluded from the target cohort. In other circumstances, there may be - a desire to predict `prevalent' episodes, whereby patients with prior - outcomes can be included in the analysis and the prior outcome itself - can be a predictor of future outcomes. 
For our prediction example, the - answer to this question is `Yes, allow persons with prior outcomes' - because we know from the CHADS2 score that prior strokes are very - predictive of future strokes. If this answer would have been `No' we - also have to decide how long we would look back for previous - occurrences of the outcome. -\item - \emph{How do we define the period in which we will predict our outcome - relative to the target cohort start?} We actually have to make two - decisions to answer that question. First, does the time-at-risk window - start at the date of the start of the target cohort or later? - Arguments to make it start later could be that you want to avoid - outcomes that were entered late in the record that actually occurred - before the start of the target cohort or you want to leave a gap where - interventions to prevent the outcome could theoretically be - implemented. Second, you need to define the time-at-risk by setting - the risk window end, as some specification of days offset relative to - the target cohort start or end dates. For our problem we will predict - in a `time-at-risk' window starting 1 day after the start of the - target cohort up to 365 days later (to look for 1-year risk following - atrial fibrillation diagnosis). -\item - \emph{Do we require a minimum amount of time-at-risk?} We have to - decide if we want to include patients that did not experience the - outcome but did leave the database earlier than the end of our - time-at-risk period. These patients may experience the outcome when we - do not observe them. For our prediction problem we decide to answer - this question with `Yes, require a mimimum time-at-risk' for that - reason. Furthermore, we have to decide if this constraint also applies - to persons who experienced the outcome or we will include all persons - with the outcome irrespective of their total time at risk. 
For - example, if the outcome is death, then persons with the outcome are - likely censored before the full time-at-risk period is complete. -\end{itemize} - -\hypertarget{model-development-settings}{% -\subsection{Model development -settings}\label{model-development-settings}} - -To develop the model we have to decide which algorithm(s) we like to -train. We see the selection of the best algorithm for a certain -prediction problem as an empirical question, i.e.~you need to let the -data speak for itself and try different approaches to find the best one. -There is no algorithm that will work best for all problems (no free -lunch). In our package we therefore aim to implement many algorithms. -Furthermore, we made the system modular so you can add your own custom -algorithms as described in more detail in the -\href{https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/AddingCustomModels.pdf}{\texttt{AddingCustomModels}} -vignette. - -Our package currently contains the following algorithms to choose from: - -\begin{longtable}[]{@{} - >{\raggedright\arraybackslash}p{(\columnwidth - 4\tabcolsep) * \real{0.1190}} - >{\raggedright\arraybackslash}p{(\columnwidth - 4\tabcolsep) * \real{0.6071}} - >{\raggedright\arraybackslash}p{(\columnwidth - 4\tabcolsep) * \real{0.2738}}@{}} -\toprule\noalign{} -\begin{minipage}[b]{\linewidth}\raggedright -Algorihm -\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright -Description -\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright -Hyper-parameters -\end{minipage} \\ -\midrule\noalign{} -\endhead -\bottomrule\noalign{} -\endlastfoot -Regularized Logistic Regression & Lasso logistic regression belongs to -the family of generalized linear models, where a linear combination of -the variables is learned and finally a logistic function maps the linear -combination to a value between 0 and 1. The lasso regularization adds a -cost based on model complexity to the objective function when training -the model. 
This cost is the sum of the absolute values of the linear -combination of the coefficients. The model automatically performs -feature selection by minimizing this cost. We use the Cyclic coordinate -descent for logistic, Poisson and survival analysis (Cyclops) package to -perform large-scale regularized logistic regression: -\url{https://github.com/OHDSI/Cyclops} & var (starting variance), -seed \\ -Gradient boosting machines & Gradient boosting machines is a boosting -ensemble technique and in our framework it combines multiple decision -trees. Boosting works by iteratively adding decision trees but adds more -weight to the data-points that are misclassified by prior decision trees -in the cost function when training the next tree. We use Extreme -Gradient Boosting, which is an efficient implementation of the gradient -boosting framework implemented in the xgboost R package available from -CRAN. & ntree (number of trees), max depth (max levels in tree), min -rows (minimum data points in in node), learning rate, balance (balance -class labels), seed \\ -Random forest & Random forest is a bagging ensemble technique that -combines multiple decision trees. The idea behind bagging is to reduce -the likelihood of overfitting, by using weak classifiers, but combining -multiple diverse weak classifiers into a strong classifier. Random -forest accomplishes this by training multiple decision trees but only -using a subset of the variables in each tree and the subset of variables -differ between trees. Our packages uses the sklearn learn implementation -of Random Forest in python. & mtry (number of features in each -tree),ntree (number of trees), maxDepth (max levels in tree), minRows -(minimum data points in in node),balance (balance class labels), seed \\ -K-nearest neighbors & K-nearest neighbors (KNN) is an algorithm that -uses some metric to find the K closest labelled data-points, given the -specified metric, to a new unlabelled data-point. 
The prediction of the -new data-points is then the most prevalent class of the K-nearest -labelled data-points. There is a sharing limitation of KNN, as the model -requires labelled data to perform the prediction on new data, and it is -often not possible to share this data across data sites.We included the -BigKnn classifier developed in OHDSI which is a large scale k-nearest -neighbor classifier using the Lucene search engine: -\url{https://github.com/OHDSI/BigKnn} & k (number of -neighbours),weighted (weight by inverse frequency) \\ -Naive Bayes & The Naive Bayes algorithm applies the Bayes theorem with -the `naive' assumption of conditional independence between every pair of -features given the value of the class variable. Based on the likelihood -the data belongs to a class and the prior distribution of the class, a -posterior distribution is obtained. & none \\ -AdaBoost & AdaBoost is a boosting ensemble technique. Boosting works by -iteratively adding classifiers but adds more weight to the data-points -that are misclassified by prior classifiers in the cost function when -training the next classifier. We use the sklearn `AdaboostClassifier' -implementation in Python. & nEstimators (the maximum number of -estimators at which boosting is terminated), learningRate (learning rate -shrinks the contribution of each classifier by learning\_rate. There is -a trade-off between learningRate and nEstimators) \\ -Decision Tree & A decision tree is a classifier that partitions the -variable space using individual tests selected using a greedy approach. -It aims to find partitions that have the highest information gain to -separate the classes. The decision tree can easily overfit by enabling a -large number of partitions (tree depth) and often needs some -regularization (e.g., pruning or specifying hyper-parameters that limit -the complexity of the model). We use the sklearn -`DecisionTreeClassifier' implementation in Python. 
& maxDepth (the -maximum depth of the tree), minSamplesSplit,minSamplesLeaf, -minImpuritySplit (threshold for early stopping in tree growth. A node -will split if its impurity is above the threshold, otherwise it is a -leaf.), seed,classWeight (`Balance' or `None') \\ -Multilayer Perception & Neural networks contain multiple layers that -weight their inputs using a non-linear function. The first layer is the -input layer, the last layer is the output layer the between are the -hidden layers. Neural networks are generally trained using feed forward -back-propagation. This is when you go through the network with a -data-point and calculate the error between the true label and predicted -label, then go backwards through the network and update the linear -function weights based on the error. This can also be performed as a -batch, where multiple data-points are fee & size (the number of hidden -nodes), alpha (the l2 regularisation), seed \\ -Deep Learning (now in seperate DeepPatientLevelPrediction R package) & -Deep learning such as deep nets, convolutional neural networks or -recurrent neural networks are similar to a neural network but have -multiple hidden layers that aim to learn latent representations useful -for prediction. In the seperate BuildingDeepLearningModels vignette we -describe these models and hyper-parameters in more detail & see -OHDSI/DeepPatientLevelPrediction \\ -\end{longtable} - -Furthermore, we have to decide on the \textbf{covariates} that we will -use to train our model. This choice can be driven by domain knowledge of -available computational resources. In our example, we like to add the -Gender, Age, Conditions, Drugs Groups, and Visit Count. We also have to -specify in which time windows we will look and we decide to look in year -before and any time prior. - -Finally, we have to define how we will train and test our model on our -data, i.e.~how we perform \textbf{internal validation}. 
For this we have -to decide how we divide our dataset in a training and testing dataset -and how we randomly assign patients to these two sets. Dependent on the -size of the training set we can decide how much data we like to use for -training, typically this is a 75\%, 25\% split. If you have very large -datasets you can use more data for training. To randomly assign patients -to the training and testing set, there are two commonly used approaches: - -\begin{enumerate} -\def\labelenumi{\arabic{enumi}.} -\tightlist -\item - split by person. In this case a random seed is used to assign the - patient to either sets. -\item - split by time. In this case a time point is used to split the persons, - e.g.~75\% of the data is before and 25\% is after this date. The - advantage of this is that you take into consideration that the health - care system has changed over time. -\end{enumerate} - -We now completely defined our studies and implement them: - -\begin{itemize} -\tightlist -\item - \protect\hyperlink{example1}{See example 1: Stroke in afibrilation - patients} -\item - \protect\hyperlink{example2}{See example 2: Agioedema in ACE inhibitor - new users} -\end{itemize} - -\hypertarget{example1}{% -\section{Example 1: Stroke in afibrilation patients}\label{example1}} - -\hypertarget{study-specification-1}{% -\subsection{Study Specification}\label{study-specification-1}} - -For our first prediction model we decide to start with a Regularized -Logistic Regression and will use the default parameters. We will do a -75\%-25\% split by person. 
- -\begin{longtable}[]{@{} - >{\raggedright\arraybackslash}p{(\columnwidth - 2\tabcolsep) * \real{0.2361}} - >{\raggedright\arraybackslash}p{(\columnwidth - 2\tabcolsep) * \real{0.7639}}@{}} -\toprule\noalign{} -\begin{minipage}[b]{\linewidth}\raggedright -Definition -\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright -Value -\end{minipage} \\ -\midrule\noalign{} -\endhead -\bottomrule\noalign{} -\endlastfoot -\textbf{Problem Definition} & \\ -Target Cohort (T) & `Patients who are newly diagnosed with Atrial -Fibrillation' defined as the first condition record of cardiac -arrhythmia, which is followed by another cardiac arrhythmia condition -record, at least two drug records for a drug used to treat arrhythmias, -or a procedure to treat arrhythmias. \\ -Outcome Cohort (O) & `Ischemic stroke events' defined as ischemic stroke -condition records during an inpatient or ER visit; successive records -with \textgreater{} 180 day gap are considered independent episodes. \\ -Time-at-risk (TAR) & 1 day till 365 days from cohort start \\ -& \\ -\textbf{Population Definition} & \\ -Washout Period & 1095 \\ -Enter the target cohort multiple times? & No \\ -Allow prior outcomes? & Yes \\ -Start of time-at-risk & 1 day \\ -End of time-at-risk & 365 days \\ -Require a minimum amount of time-at-risk? & Yes (364 days) \\ -& \\ -\textbf{Model Development} & \\ -Algorithm & Regularized Logistic Regression \\ -Hyper-parameters & variance = 0.01 (Default) \\ -Covariates & Gender, Age, Conditions (ever before, \textless365), Drugs -Groups (ever before, \textless365), and Visit Count \\ -Data split & 75\% train, 25\% test. Randomly assigned by person \\ -\end{longtable} - -According to the best practices we need to make a protocol that -completely specifies how we plan to execute our study. This protocol -will be assessed by the governance boards of the participating data -sources in your network study. 
For this a template could be used but we -prefer to automate this process as much as possible by adding -functionality to automatically generate study protocol from a study -specification. We will discuss this in more detail later. - -\hypertarget{study-implementation}{% -\subsection{Study implementation}\label{study-implementation}} - -Now we have completely design our study we have to implement the study. -We have to generate the target and outcome cohorts and we need to -develop the R code to run against our CDM that will execute the full -study. - -\hypertarget{cohort-instantiation}{% -\subsubsection{Cohort instantiation}\label{cohort-instantiation}} - -For our study we need to know when a person enters the target and -outcome cohorts. This is stored in a table on the server that contains -the cohort start date and cohort end date for all subjects for a -specific cohort definition. This cohort table has a very simple -structure as shown below: - -\begin{itemize} -\tightlist -\item - \texttt{cohort\_definition\_id}, a unique identifier for - distinguishing between different types of cohorts, e.g.~cohorts of - interest and outcome cohorts. -\item - \texttt{subject\_id}, a unique identifier corresponding to the - \texttt{person\_id} in the CDM. -\item - \texttt{cohort\_start\_date}, the date the subject enters the cohort. -\item - \texttt{cohort\_end\_date}, the date the subject leaves the cohort. -\end{itemize} - -How do we fill this table according to our cohort definitions? There are -two options for this: - -\begin{enumerate} -\def\labelenumi{\arabic{enumi})} -\item - use the interactive cohort builder tool in - \href{www.github.com/OHDSI/ATLAS}{ATLAS} which can be used to create - cohorts based on inclusion criteria and will automatically populate - this cohort table. -\item - write your own custom SQL statements to fill the cohort table. -\end{enumerate} - -Both methods are described below for our example prediction problem. 
- -\hypertarget{atlas-cohort-builder}{% -\subsubsection{ATLAS cohort builder}\label{atlas-cohort-builder}} - -\begin{figure} -\centering -\includegraphics{example1/ATLAS_T.webp} -\caption{Target Cohort Atrial Fibrillation} -\end{figure} - -ATLAS allows you to define cohorts interactively by specifying cohort -entry and cohort exit criteria. Cohort entry criteria involve selecting -one or more initial events, which determine the start date for cohort -entry, and optionally specifying additional inclusion criteria which -filter to the qualifying events. Cohort exit criteria are applied to -each cohort entry record to determine the end date when the person's -episode no longer qualifies for the cohort. For the outcome cohort the -end date is less relevant. As an example, Figure 4 shows how we created -the Atrial Fibrillation cohort and Figure 5 shows how we created the -stroke cohort in ATLAS. - -\begin{figure} -\centering -\includegraphics{example1/ATLAS_O.webp} -\caption{Outcome Cohort Stroke} -\end{figure} - -The T and O cohorts can be found here: - -\begin{itemize} -\tightlist -\item - Atrial Fibrillaton (T): - \url{http://www.ohdsi.org/web/atlas/\#/cohortdefinition/1769447} -\item - Stroke (O) : - \url{http://www.ohdsi.org/web/atlas/\#/cohortdefinition/1769448} -\end{itemize} - -In depth explanation of cohort creation in ATLAS is out of scope of this -vignette but can be found on the OHDSI wiki pages -\href{http://www.ohdsi.org/web/wiki/doku.php?id=documentation:software:atlas}{(link)}. - -Note that when a cohort is created in ATLAS the cohortid is needed to -extract the data in R. The cohortid can be found at the top of the ATLAS -screen, e.g.~1769447 in Figure 4. - -\hypertarget{custom-cohorts}{% -\subsubsection{Custom cohorts}\label{custom-cohorts}} - -It is also possible to create cohorts without the use of ATLAS. Using -custom cohort code (SQL) you can make more advanced cohorts if needed. 
- -For our example study, we need to create at table to hold the cohort -data and we need to create SQL code to instantiate this table for both -the AF and Stroke cohorts. Therefore, we create a file called -\emph{AfStrokeCohorts.sql} with the following contents: - -\begin{Shaded} -\begin{Highlighting}[] -\CommentTok{/***********************************} -\CommentTok{File AfStrokeCohorts.sql } -\CommentTok{***********************************/} -\CommentTok{/*} -\CommentTok{Create a table to store the persons in the T and C cohort} -\CommentTok{*/} - -\ControlFlowTok{IF}\NormalTok{ OBJECT\_ID(}\StringTok{\textquotesingle{}@resultsDatabaseSchema.PLPAFibStrokeCohort\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}U\textquotesingle{}}\NormalTok{) }\KeywordTok{IS} \KeywordTok{NOT} \KeywordTok{NULL} -\KeywordTok{DROP} \KeywordTok{TABLE}\NormalTok{ @resultsDatabaseSchema.PLPAFibStrokeCohort;} - -\KeywordTok{CREATE} \KeywordTok{TABLE}\NormalTok{ @resultsDatabaseSchema.PLPAFibStrokeCohort } -\NormalTok{( } -\NormalTok{cohort\_definition\_id }\DataTypeTok{INT}\NormalTok{, } -\NormalTok{subject\_id BIGINT,} -\NormalTok{cohort\_start\_date }\DataTypeTok{DATE}\NormalTok{, } -\NormalTok{cohort\_end\_date }\DataTypeTok{DATE} -\NormalTok{);} - - -\CommentTok{/*} -\CommentTok{T cohort: [PatientLevelPrediction vignette]: T : patients who are newly } -\CommentTok{diagnosed with Atrial fibrillation} -\CommentTok{{-} persons with a condition occurrence record of \textquotesingle{}Atrial fibrillation\textquotesingle{} or } -\CommentTok{any descendants, indexed at the first diagnosis} -\CommentTok{{-} who have \textgreater{}1095 days of prior observation before their first diagnosis} -\CommentTok{{-} and have no warfarin exposure any time prior to first AFib diagnosis} -\CommentTok{*/} -\KeywordTok{INSERT} \KeywordTok{INTO}\NormalTok{ @resultsDatabaseSchema.AFibStrokeCohort (cohort\_definition\_id, } -\NormalTok{subject\_id, } -\NormalTok{cohort\_start\_date, } 
-\NormalTok{cohort\_end\_date)} -\KeywordTok{SELECT} \DecValTok{1} \KeywordTok{AS}\NormalTok{ cohort\_definition\_id,} -\NormalTok{AFib.person\_id }\KeywordTok{AS}\NormalTok{ subject\_id,} -\NormalTok{AFib.condition\_start\_date }\KeywordTok{AS}\NormalTok{ cohort\_start\_date,} -\NormalTok{observation\_period.observation\_period\_end\_date }\KeywordTok{AS}\NormalTok{ cohort\_end\_date} -\KeywordTok{FROM} -\NormalTok{(} - \KeywordTok{SELECT}\NormalTok{ person\_id, }\FunctionTok{min}\NormalTok{(condition\_start\_date) }\KeywordTok{as}\NormalTok{ condition\_start\_date} - \KeywordTok{FROM}\NormalTok{ @cdmDatabaseSchema.condition\_occurrence} - \KeywordTok{WHERE}\NormalTok{ condition\_concept\_id }\KeywordTok{IN}\NormalTok{ (}\KeywordTok{SELECT}\NormalTok{ descendant\_concept\_id }\KeywordTok{FROM} -\NormalTok{ @cdmDatabaseSchema.concept\_ancestor }\KeywordTok{WHERE}\NormalTok{ ancestor\_concept\_id }\KeywordTok{IN} -\NormalTok{ (}\DecValTok{313217} \CommentTok{/*atrial fibrillation*/}\NormalTok{))} - \KeywordTok{GROUP} \KeywordTok{BY}\NormalTok{ person\_id} -\NormalTok{) AFib} - \KeywordTok{INNER} \KeywordTok{JOIN}\NormalTok{ @cdmDatabaseSchema.observation\_period} - \KeywordTok{ON}\NormalTok{ AFib.person\_id }\OperatorTok{=}\NormalTok{ observation\_period.person\_id} - \KeywordTok{AND}\NormalTok{ AFib.condition\_start\_date }\OperatorTok{\textgreater{}=}\NormalTok{ dateadd(dd,}\DecValTok{1095}\NormalTok{, } -\NormalTok{ observation\_period.observation\_period\_start\_date)} - \KeywordTok{AND}\NormalTok{ AFib.condition\_start\_date }\OperatorTok{\textless{}=}\NormalTok{ observation\_period.observation\_period\_end\_date} - \KeywordTok{LEFT} \KeywordTok{JOIN} -\NormalTok{ (} - \KeywordTok{SELECT}\NormalTok{ person\_id, }\FunctionTok{min}\NormalTok{(drug\_exposure\_start\_date) }\KeywordTok{as}\NormalTok{ drug\_exposure\_start\_date} - \KeywordTok{FROM}\NormalTok{ @cdmDatabaseSchema.drug\_exposure} - \KeywordTok{WHERE}\NormalTok{ drug\_concept\_id 
}\KeywordTok{IN}\NormalTok{ (}\KeywordTok{SELECT}\NormalTok{ descendant\_concept\_id }\KeywordTok{FROM} -\NormalTok{ @cdmDatabaseSchema.concept\_ancestor }\KeywordTok{WHERE}\NormalTok{ ancestor\_concept\_id }\KeywordTok{IN} -\NormalTok{ (}\DecValTok{1310149} \CommentTok{/*warfarin*/}\NormalTok{))} - \KeywordTok{GROUP} \KeywordTok{BY}\NormalTok{ person\_id} -\NormalTok{ ) warfarin} - \KeywordTok{ON}\NormalTok{ Afib.person\_id }\OperatorTok{=}\NormalTok{ warfarin.person\_id} - \KeywordTok{AND}\NormalTok{ Afib.condition\_start\_date }\OperatorTok{\textgreater{}}\NormalTok{ warfarin.drug\_exposure\_start\_date} - \KeywordTok{WHERE}\NormalTok{ warfarin.person\_id }\KeywordTok{IS} \KeywordTok{NULL} -\NormalTok{ ;} - - \CommentTok{/*} -\CommentTok{ C cohort: [PatientLevelPrediction vignette]: O: Ischemic stroke events} -\CommentTok{ {-} inpatient visits that include a condition occurrence record for } -\CommentTok{ \textquotesingle{}cerebral infarction\textquotesingle{} and descendants, \textquotesingle{}cerebral thrombosis\textquotesingle{}, } -\CommentTok{ \textquotesingle{}cerebral embolism\textquotesingle{}, \textquotesingle{}cerebral artery occlusion\textquotesingle{} } -\CommentTok{ */} - \KeywordTok{INSERT} \KeywordTok{INTO}\NormalTok{ @resultsDatabaseSchema.AFibStrokeCohort (cohort\_definition\_id, } -\NormalTok{ subject\_id, } -\NormalTok{ cohort\_start\_date, } -\NormalTok{ cohort\_end\_date)} - \KeywordTok{SELECT} \DecValTok{2} \KeywordTok{AS}\NormalTok{ cohort\_definition\_id,} -\NormalTok{ visit\_occurrence.person\_id }\KeywordTok{AS}\NormalTok{ subject\_id,} -\NormalTok{ visit\_occurrence.visit\_start\_date }\KeywordTok{AS}\NormalTok{ cohort\_start\_date,} -\NormalTok{ visit\_occurrence.visit\_end\_date }\KeywordTok{AS}\NormalTok{ cohort\_end\_date} - \KeywordTok{FROM} -\NormalTok{ (} - \KeywordTok{SELECT}\NormalTok{ person\_id, condition\_start\_date} - \KeywordTok{FROM}\NormalTok{ @cdmDatabaseSchema.condition\_occurrence} - \KeywordTok{WHERE}\NormalTok{ 
condition\_concept\_id }\KeywordTok{IN}\NormalTok{ (}\KeywordTok{SELECT} \KeywordTok{DISTINCT}\NormalTok{ descendant\_concept\_id }\KeywordTok{FROM} -\NormalTok{ @cdmDatabaseSchema.concept\_ancestor }\KeywordTok{WHERE}\NormalTok{ ancestor\_concept\_id }\KeywordTok{IN} -\NormalTok{ (}\DecValTok{443454} \CommentTok{/*cerebral infarction*/}\NormalTok{) }\KeywordTok{OR}\NormalTok{ descendant\_concept\_id }\KeywordTok{IN} -\NormalTok{ (}\DecValTok{441874} \CommentTok{/*cerebral thrombosis*/}\NormalTok{, }\DecValTok{375557} \CommentTok{/*cerebral embolism*/}\NormalTok{, } - \DecValTok{372924} \CommentTok{/*cerebral artery occlusion*/}\NormalTok{))} -\NormalTok{ ) stroke} - \KeywordTok{INNER} \KeywordTok{JOIN}\NormalTok{ @cdmDatabaseSchema.visit\_occurrence} - \KeywordTok{ON}\NormalTok{ stroke.person\_id }\OperatorTok{=}\NormalTok{ visit\_occurrence.person\_id} - \KeywordTok{AND}\NormalTok{ stroke.condition\_start\_date }\OperatorTok{\textgreater{}=}\NormalTok{ visit\_occurrence.visit\_start\_date} - \KeywordTok{AND}\NormalTok{ stroke.condition\_start\_date }\OperatorTok{\textless{}=}\NormalTok{ visit\_occurrence.visit\_end\_date} - \KeywordTok{AND}\NormalTok{ visit\_occurrence.visit\_concept\_id }\KeywordTok{IN}\NormalTok{ (}\DecValTok{9201}\NormalTok{, }\DecValTok{262} \CommentTok{/*\textquotesingle{}Inpatient Visit\textquotesingle{} or } -\CommentTok{ \textquotesingle{}Emergency Room and Inpatient Visit\textquotesingle{}*/}\NormalTok{)} - \KeywordTok{GROUP} \KeywordTok{BY}\NormalTok{ visit\_occurrence.person\_id, visit\_occurrence.visit\_start\_date, } -\NormalTok{ visit\_occurrence.visit\_end\_date} -\NormalTok{ ;} - -\end{Highlighting} -\end{Shaded} - -This is parameterized SQL which can be used by the -\href{http://github.com/OHDSI/SqlRender}{\texttt{SqlRender}} package. We -use parameterized SQL so we do not have to pre-specify the names of the -CDM and result schemas. 
That way, if we want to run the SQL on a -different schema, we only need to change the parameter values; we do not -have to change the SQL code. By also making use of translation -functionality in \texttt{SqlRender}, we can make sure the SQL code can -be run in many different environments. - -To execute this sql against our CDM we first need to tell R how to -connect to the server. \texttt{PatientLevelPrediction} uses the -\href{http://github.com/ohdsi/DatabaseConnector}{\texttt{DatabaseConnector}} -package, which provides a function called -\texttt{createConnectionDetails}. Type \texttt{?createConnectionDetails} -for the specific settings required for the various database management -systems (DBMS). For example, one might connect to a PostgreSQL database -using this code: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{ connectionDetails }\OtherTok{\textless{}{-}} \FunctionTok{createConnectionDetails}\NormalTok{(}\AttributeTok{dbms =} \StringTok{"postgresql"}\NormalTok{, } - \AttributeTok{server =} \StringTok{"localhost/ohdsi"}\NormalTok{, } - \AttributeTok{user =} \StringTok{"joe"}\NormalTok{, } - \AttributeTok{password =} \StringTok{"supersecret"}\NormalTok{)} - -\NormalTok{ cdmDatabaseSchema }\OtherTok{\textless{}{-}} \StringTok{"my\_cdm\_data"} -\NormalTok{ cohortsDatabaseSchema }\OtherTok{\textless{}{-}} \StringTok{"my\_results"} -\NormalTok{ cdmVersion }\OtherTok{\textless{}{-}} \StringTok{"5"} -\end{Highlighting} -\end{Shaded} - -The last three lines define the \texttt{cdmDatabaseSchema} and -\texttt{cohortsDatabaseSchema} variables, as well as the CDM version. We -will use these later to tell R where the data in CDM format live, where -we want to create the cohorts of interest, and what version CDM is used. -Note that for Microsoft SQL Server, databaseschemas need to specify both -the database and the schema, so for example -\texttt{cdmDatabaseSchema\ \textless{}-\ "my\_cdm\_data.dbo"}. 
- -\begin{Shaded} -\begin{Highlighting}[] - \FunctionTok{library}\NormalTok{(SqlRender)} -\NormalTok{ sql }\OtherTok{\textless{}{-}} \FunctionTok{readSql}\NormalTok{(}\StringTok{"AfStrokeCohorts.sql"}\NormalTok{)} -\NormalTok{ sql }\OtherTok{\textless{}{-}} \FunctionTok{renderSql}\NormalTok{(sql,} - \AttributeTok{cdmDatabaseSchema =}\NormalTok{ cdmDatabaseSchema,} - \AttributeTok{cohortsDatabaseSchema =}\NormalTok{ cohortsDatabaseSchema,} - \AttributeTok{post\_time =} \DecValTok{30}\NormalTok{,} - \AttributeTok{pre\_time =} \DecValTok{365}\NormalTok{)}\SpecialCharTok{$}\NormalTok{sql} -\NormalTok{ sql }\OtherTok{\textless{}{-}} \FunctionTok{translateSql}\NormalTok{(sql, }\AttributeTok{targetDialect =}\NormalTok{ connectionDetails}\SpecialCharTok{$}\NormalTok{dbms)}\SpecialCharTok{$}\NormalTok{sql} - -\NormalTok{ connection }\OtherTok{\textless{}{-}} \FunctionTok{connect}\NormalTok{(connectionDetails)} - \FunctionTok{executeSql}\NormalTok{(connection, sql)} -\end{Highlighting} -\end{Shaded} - -In this code, we first read the SQL from the file into memory. In the -next line, we replace four parameter names with the actual values. We -then translate the SQL into the dialect appropriate for the DBMS we -already specified in the \texttt{connectionDetails}. Next, we connect to -the server, and submit the rendered and translated SQL. - -If all went well, we now have a table with the events of interest. 
We -can see how many events per type: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{ sql }\OtherTok{\textless{}{-}} \FunctionTok{paste}\NormalTok{(}\StringTok{"SELECT cohort\_definition\_id, COUNT(*) AS count"}\NormalTok{,} - \StringTok{"FROM @cohortsDatabaseSchema.AFibStrokeCohort"}\NormalTok{,} - \StringTok{"GROUP BY cohort\_definition\_id"}\NormalTok{)} -\NormalTok{ sql }\OtherTok{\textless{}{-}} \FunctionTok{renderSql}\NormalTok{(sql, }\AttributeTok{cohortsDatabaseSchema =}\NormalTok{ cohortsDatabaseSchema)}\SpecialCharTok{$}\NormalTok{sql} -\NormalTok{ sql }\OtherTok{\textless{}{-}} \FunctionTok{translateSql}\NormalTok{(sql, }\AttributeTok{targetDialect =}\NormalTok{ connectionDetails}\SpecialCharTok{$}\NormalTok{dbms)}\SpecialCharTok{$}\NormalTok{sql} - - \FunctionTok{querySql}\NormalTok{(connection, sql)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -## cohort_definition_id count -## 1 1 527616 -## 2 2 221555 -\end{verbatim} - -\hypertarget{study-script-creation}{% -\subsubsection{Study script creation}\label{study-script-creation}} - -In this section we assume that our cohorts have been created either by -using ATLAS or a custom SQL script. We will first explain how to create -an R script yourself that will execute our study as we have defined -earlier. - -\hypertarget{data-extraction}{% -\subsubsection{Data extraction}\label{data-extraction}} - -Now we can tell \texttt{PatientLevelPrediction} to extract all necessary -data for our analysis. This is done using the -\href{https://github.com/OHDSI/FeatureExtraction}{\texttt{FeatureExtractionPackage}}. -In short the FeatureExtractionPackage allows you to specify which -features (covariates) need to be extracted, e.g.~all conditions and drug -exposures. It also supports the creation of custom covariates. For more -detailed information on the FeatureExtraction package see its -\href{https://github.com/OHDSI/FeatureExtraction}{vignettes}. 
For our -example study we decided to use these settings: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{ covariateSettings }\OtherTok{\textless{}{-}} \FunctionTok{createCovariateSettings}\NormalTok{(}\AttributeTok{useDemographicsGender =} \ConstantTok{TRUE}\NormalTok{,} - \AttributeTok{useDemographicsAge =} \ConstantTok{TRUE}\NormalTok{,} - \AttributeTok{useConditionGroupEraLongTerm =} \ConstantTok{TRUE}\NormalTok{,} - \AttributeTok{useConditionGroupEraAnyTimePrior =} \ConstantTok{TRUE}\NormalTok{,} - \AttributeTok{useDrugGroupEraLongTerm =} \ConstantTok{TRUE}\NormalTok{,} - \AttributeTok{useDrugGroupEraAnyTimePrior =} \ConstantTok{TRUE}\NormalTok{,} - \AttributeTok{useVisitConceptCountLongTerm =} \ConstantTok{TRUE}\NormalTok{,} - \AttributeTok{longTermStartDays =} \SpecialCharTok{{-}}\DecValTok{365}\NormalTok{,} - \AttributeTok{endDays =} \SpecialCharTok{{-}}\DecValTok{1}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -The final step for extracting the data is to run the \texttt{getPlpData} -function and input the connection details, the database schema where the -cohorts are stored, the cohort definition ids for the cohort and -outcome, and the washoutPeriod which is the minimum number of days prior -to cohort index date that the person must have been observed to be -included into the data, and finally input the previously constructed -covariate settings. 
- -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{databaseDetails }\OtherTok{\textless{}{-}} \FunctionTok{createDatabaseDetails}\NormalTok{(} - \AttributeTok{connectionDetails =}\NormalTok{ connectionDetails,} - \AttributeTok{cdmDatabaseSchema =}\NormalTok{ cdmDatabaseSchema,} - \AttributeTok{cdmDatabaseName =} \StringTok{\textquotesingle{}\textquotesingle{}}\NormalTok{,} - \AttributeTok{cohortDatabaseSchema =}\NormalTok{ resultsDatabaseSchema,} - \AttributeTok{cohortTable =} \StringTok{\textquotesingle{}AFibStrokeCohort\textquotesingle{}}\NormalTok{,} - \AttributeTok{cohortId =} \DecValTok{1}\NormalTok{,} - \AttributeTok{outcomeDatabaseSchema =}\NormalTok{ resultsDatabaseSchema,} - \AttributeTok{outcomeTable =} \StringTok{\textquotesingle{}AFibStrokeCohort\textquotesingle{}}\NormalTok{,} - \AttributeTok{outcomeIds =} \DecValTok{2}\NormalTok{,} - \AttributeTok{cdmVersion =} \DecValTok{5} -\NormalTok{ )} - -\CommentTok{\# here you can define whether you want to sample the target cohort and add any} -\CommentTok{\# restrictions based on minimum prior observation, index date restrictions} -\CommentTok{\# or restricting to first index date (if people can be in target cohort multiple times)} -\NormalTok{restrictPlpDataSettings }\OtherTok{\textless{}{-}} \FunctionTok{createRestrictPlpDataSettings}\NormalTok{(}\AttributeTok{sampleSize =} \DecValTok{10000}\NormalTok{)} - -\NormalTok{ plpData }\OtherTok{\textless{}{-}} \FunctionTok{getPlpData}\NormalTok{(} - \AttributeTok{databaseDetails =}\NormalTok{ databaseDetails, } - \AttributeTok{covariateSettings =}\NormalTok{ covariateSettings,} - \AttributeTok{restrictPlpDataSettings =}\NormalTok{ restrictPlpDataSettings} -\NormalTok{ )} -\end{Highlighting} -\end{Shaded} - -Note that if the cohorts are created in ATLAS its corresponding cohort -database schema needs to be selected. 
There are many additional -parameters for the \texttt{createRestrictPlpDataSettings} function which -are all documented in the \texttt{PatientLevelPrediction} manual. The -resulting \texttt{plpData} object uses the package \texttt{Andromeda} -(which uses \href{https://www.sqlite.org/index.html}{SQLite}) to store -information in a way that ensures R does not run out of memory, even -when the data are large. - -Creating the \texttt{plpData} object can take considerable computing -time, and it is probably a good idea to save it for future sessions. -Because \texttt{plpData} uses \texttt{Andromeda}, we cannot use R's -regular save function. Instead, we'll have to use the -\texttt{savePlpData()} function: - -\begin{Shaded} -\begin{Highlighting}[] - \FunctionTok{savePlpData}\NormalTok{(plpData, }\StringTok{"stroke\_in\_af\_data"}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -We can use the \texttt{loadPlpData()} function to load the data in a -future session. - -\hypertarget{additional-inclusion-criteria}{% -\subsubsection{Additional inclusion -criteria}\label{additional-inclusion-criteria}} - -To completely define the prediction problem the final study population -is obtained by applying additional constraints on the two earlier -defined cohorts, e.g., a minumim time at risk can be enforced -(\texttt{requireTimeAtRisk,\ minTimeAtRisk}) and we can specify if this -also applies to patients with the outcome (\texttt{includeAllOutcomes}). -Here we also specify the start and end of the risk window relative to -target cohort start. For example, if we like the risk window to start 30 -days after the at-risk cohort start and end a year later we can set -\texttt{riskWindowStart\ =\ 30} and \texttt{riskWindowEnd\ =\ 365}. In -some cases the risk window needs to start at the cohort end date. This -can be achieved by setting \texttt{addExposureToStart\ =\ TRUE} which -adds the cohort (exposure) time to the start date. 
- -In Appendix 1, we demonstrate the effect of these settings on the subset -of the persons in the target cohort that end up in the final study -population. - -In the example below all the settings we defined for our study are -imposed: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{ populationSettings }\OtherTok{\textless{}{-}} \FunctionTok{createStudyPopulationSettings}\NormalTok{(} - \AttributeTok{washoutPeriod =} \DecValTok{1095}\NormalTok{,} - \AttributeTok{firstExposureOnly =} \ConstantTok{FALSE}\NormalTok{,} - \AttributeTok{removeSubjectsWithPriorOutcome =} \ConstantTok{FALSE}\NormalTok{,} - \AttributeTok{priorOutcomeLookback =} \DecValTok{1}\NormalTok{,} - \AttributeTok{riskWindowStart =} \DecValTok{1}\NormalTok{,} - \AttributeTok{riskWindowEnd =} \DecValTok{365}\NormalTok{,} - \AttributeTok{startAnchor =} \StringTok{\textquotesingle{}cohort start\textquotesingle{}}\NormalTok{,} - \AttributeTok{endAnchor =} \StringTok{\textquotesingle{}cohort start\textquotesingle{}}\NormalTok{,} - \AttributeTok{minTimeAtRisk =} \DecValTok{364}\NormalTok{,} - \AttributeTok{requireTimeAtRisk =} \ConstantTok{TRUE}\NormalTok{,} - \AttributeTok{includeAllOutcomes =} \ConstantTok{TRUE} -\NormalTok{ )} -\end{Highlighting} -\end{Shaded} - -\hypertarget{spliting-the-data-into-trainingvalidationtesting-datasets}{% -\subsubsection{Spliting the data into training/validation/testing -datasets}\label{spliting-the-data-into-trainingvalidationtesting-datasets}} - -When developing a prediction model using supervised learning (when you -have features paired with labels for a set of patients), the first step -is to design the development/internal validation process. This requires -specifying how to select the model hyper-parameters, how to learn the -model parameters and how to fairly evaluate the model. 
In general, the -validation set is used to pick hyper-parameters, the training set is -used to learn the model parameters and the test set is used to perform -fair internal validation. However, cross-validation can be implemented -to pick the hyper-parameters on the training data (so a validation data -set is not required). Cross validation can also be used to estimate -internal validation (so a testing data set is not required). - -In small data the best approach for internal validation has been shown -to be boostrapping. However, in big data (many patients and many -features) bootstrapping is generally not feasible. In big data our -research has shown that it is just important to have some form of fair -evaluation (use a test set or cross validation). For full details see -\href{add\%20link}{our BMJ open paper}. - -In the PatientLevelPrediction package, the splitSettings define how the -plpData are partitioned into training/validation/testing data. Cross -validation is always done, but using a test set is optional (when the -data are small, it may be optimal to not use a test set). 
For the -splitSettings we can use the type (stratified/time/subject) and -testFraction parameters to split the data in a 75\%-25\% split and run -the patient-level prediction pipeline: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{ splitSettings }\OtherTok{\textless{}{-}} \FunctionTok{createDefaultSplitSetting}\NormalTok{(} - \AttributeTok{trainFraction =} \FloatTok{0.75}\NormalTok{,} - \AttributeTok{testFraction =} \FloatTok{0.25}\NormalTok{,} - \AttributeTok{type =} \StringTok{\textquotesingle{}stratified\textquotesingle{}}\NormalTok{,} - \AttributeTok{nfold =} \DecValTok{2}\NormalTok{, } - \AttributeTok{splitSeed =} \DecValTok{1234} -\NormalTok{ )} -\end{Highlighting} -\end{Shaded} - -Note: it is possible to add a custom method to specify how the plpData -are partitioned into training/validation/testing data, see -\href{https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/AddingCustomSplitting.pdf}{vignette -for custom splitting}. - -\hypertarget{preprocessing-the-training-data}{% -\subsubsection{Preprocessing the training -data}\label{preprocessing-the-training-data}} - -There a numerous data processing settings that a user must specify when -developing a prediction model. 
These are: * Whether to under-sample or -over-sample the training data (this may be useful when there is class -imballance (e.g., the outcome is very rare or very common)) * Whether to -perform feature engineering or feature selection (e.g., create latent -variables that are not observed in the data or reduce the dimensionality -of the data) * Whether to remove redundant features and normalize the -data (this is required for some models) - -The default sample settings does nothing, it simply returns the -trainData as input, see below: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{ sampleSettings }\OtherTok{\textless{}{-}} \FunctionTok{createSampleSettings}\NormalTok{()} -\end{Highlighting} -\end{Shaded} - -However, the current package contains methods of under-sampling the -non-outcome patients. To perform undersampling, the \texttt{type} input -should be `underSample' and \texttt{numberOutcomestoNonOutcomes} must be -specified (an integer specifying the number of non-outcomes per -outcome). It is possible to add any custom function for over/under -sampling, see -\href{https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/AddingCustomSamples.pdf}{vignette -for custom sampling}. - -It is possible to specify a combination of feature engineering functions -that take as input the trainData and output a new trainData with -different features. The default feature engineering setting does -nothing: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{ featureEngineeringSettings }\OtherTok{\textless{}{-}} \FunctionTok{createFeatureEngineeringSettings}\NormalTok{()} -\end{Highlighting} -\end{Shaded} - -However, it is possible to add custom feature engineering functions into -the pipeline, see -\href{https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/AddingCustomFeatureEngineering.pdf}{vignette -for custom feature engineering}. - -Finally, the preprocessing setting is required. 
For this setting the -user can define \texttt{minFraction}, this removes any features that is -observed in the training data for less than 0.01 fraction of the -patients. So, if \texttt{minFraction\ =\ 0.01} then any feature that is -seen in less than 1 percent of the target population is removed. The -input \texttt{normalize} specifies whether the features are scaled -between 0 and 1, this is required for certain models (e.g., LASSO -logistic regression). The input \texttt{removeRedundancy} specifies -whether features that are observed in all of the target population are -removed. - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{ preprocessSettingsSettings }\OtherTok{\textless{}{-}} \FunctionTok{createPreprocessSettings}\NormalTok{(} - \AttributeTok{minFraction =} \FloatTok{0.01}\NormalTok{, } - \AttributeTok{normalize =}\NormalTok{ T, } - \AttributeTok{removeRedundancy =}\NormalTok{ T} -\NormalTok{ )} -\end{Highlighting} -\end{Shaded} - -\hypertarget{model-development}{% -\subsubsection{Model Development}\label{model-development}} - -In the set function of an algorithm the user can specify a list of -eligible values for each hyper-parameter. All possible combinations of -the hyper-parameters are included in a so-called grid search using -cross-validation on the training set. If a user does not specify any -value then the default value is used instead. - -For example, if we use the following settings for the -gradientBoostingMachine: ntrees=c(100,200), maxDepth=4 the grid search -will apply the gradient boosting machine algorithm with ntrees=100 and -maxDepth=4 plus the default settings for other hyper-parameters and -ntrees=200 and maxDepth=4 plus the default settings for other -hyper-parameters. The hyper-parameters that lead to the -bestcross-validation performance will then be chosen for the final -model. 
For our problem we choose to build a logistic regression model -with the default hyper-parameters - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{lrModel }\OtherTok{\textless{}{-}} \FunctionTok{setLassoLogisticRegression}\NormalTok{()} -\end{Highlighting} -\end{Shaded} - -The \texttt{runPlP} function requires the \texttt{plpData}, the -\texttt{outcomeId} specifying the outcome being predicted and the -settings: \texttt{populationSettings}, \texttt{splitSettings}, -\texttt{sampleSettings}, \texttt{featureEngineeringSettings}, -\texttt{preprocessSettings} and \texttt{modelSettings} to train and -evaluate the model. - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{ lrResults }\OtherTok{\textless{}{-}} \FunctionTok{runPlp}\NormalTok{(} - \AttributeTok{plpData =}\NormalTok{ plpData,} - \AttributeTok{outcomeId =} \DecValTok{2}\NormalTok{, } - \AttributeTok{analysisId =} \StringTok{\textquotesingle{}singleDemo\textquotesingle{}}\NormalTok{,} - \AttributeTok{analysisName =} \StringTok{\textquotesingle{}Demonstration of runPlp for training single PLP models\textquotesingle{}}\NormalTok{,} - \AttributeTok{populationSettings =}\NormalTok{ populationSettings, } - \AttributeTok{splitSettings =}\NormalTok{ splitSettings,} - \AttributeTok{sampleSettings =}\NormalTok{ sampleSettings, } - \AttributeTok{featureEngineeringSettings =}\NormalTok{ featureEngineeringSettings, } - \AttributeTok{preprocessSettings =}\NormalTok{ preprocessSettings,} - \AttributeTok{modelSettings =}\NormalTok{ lrModel,} - \AttributeTok{logSettings =} \FunctionTok{createLogSettings}\NormalTok{(), } - \AttributeTok{executeSettings =} \FunctionTok{createExecuteSettings}\NormalTok{(} - \AttributeTok{runSplitData =}\NormalTok{ T, } - \AttributeTok{runSampleData =}\NormalTok{ T, } - \AttributeTok{runfeatureEngineering =}\NormalTok{ T, } - \AttributeTok{runPreprocessData =}\NormalTok{ T, } - \AttributeTok{runModelDevelopment =}\NormalTok{ T, } - \AttributeTok{runCovariateSummary =}\NormalTok{ T} 
-\NormalTok{ ), } - \AttributeTok{saveDirectory =} \FunctionTok{file.path}\NormalTok{(}\FunctionTok{getwd}\NormalTok{(), }\StringTok{\textquotesingle{}singlePlp\textquotesingle{}}\NormalTok{)} -\NormalTok{ )} -\end{Highlighting} -\end{Shaded} - -Under the hood the package will now use the -\href{www.github.com/OHDSI/Cyclops}{\texttt{Cyclops}} package to fit a -large-scale regularized regression using 75\% of the data and will -evaluate the model on the remaining 25\%. A results data structure is -returned containing information about the model, its performance etc. - -You can save the model using: - -\begin{Shaded} -\begin{Highlighting}[] -\FunctionTok{savePlpModel}\NormalTok{(lrResults}\SpecialCharTok{$}\NormalTok{model, }\AttributeTok{dirPath =} \FunctionTok{file.path}\NormalTok{(}\FunctionTok{getwd}\NormalTok{(), }\StringTok{"model"}\NormalTok{))} -\end{Highlighting} -\end{Shaded} - -You can load the model using: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{ plpModel }\OtherTok{\textless{}{-}} \FunctionTok{loadPlpModel}\NormalTok{(}\FunctionTok{file.path}\NormalTok{(}\FunctionTok{getwd}\NormalTok{(),}\StringTok{\textquotesingle{}model\textquotesingle{}}\NormalTok{))} -\end{Highlighting} -\end{Shaded} - -You can also save the full results structure using: - -\begin{Shaded} -\begin{Highlighting}[] - \FunctionTok{savePlpResult}\NormalTok{(lrResults, }\AttributeTok{location =} \FunctionTok{file.path}\NormalTok{(}\FunctionTok{getwd}\NormalTok{(),}\StringTok{\textquotesingle{}lr\textquotesingle{}}\NormalTok{))} -\end{Highlighting} -\end{Shaded} - -To load the full results structure use: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{ lrResults }\OtherTok{\textless{}{-}} \FunctionTok{loadPlpResult}\NormalTok{(}\FunctionTok{file.path}\NormalTok{(}\FunctionTok{getwd}\NormalTok{(),}\StringTok{\textquotesingle{}lr\textquotesingle{}}\NormalTok{))} -\end{Highlighting} -\end{Shaded} - -\newpage - -\hypertarget{example2}{% -\section{Example 2: Angioedema in ACE 
inhibitor users}\label{example2}} - -\hypertarget{study-specification-2}{% -\subsection{Study Specification}\label{study-specification-2}} - -\begin{longtable}[]{@{} - >{\raggedright\arraybackslash}p{(\columnwidth - 2\tabcolsep) * \real{0.3056}} - >{\raggedright\arraybackslash}p{(\columnwidth - 2\tabcolsep) * \real{0.6944}}@{}} -\toprule\noalign{} -\begin{minipage}[b]{\linewidth}\raggedright -Definition -\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright -Value -\end{minipage} \\ -\midrule\noalign{} -\endhead -\bottomrule\noalign{} -\endlastfoot -\textbf{Problem Definition} & \\ -Target Cohort (T) & `Patients who are newly dispensed an ACE inhibitor' -defined as the first drug record of any ACE inhibitor \\ -Outcome Cohort (O) & `Angioedema' defined as an angioedema condition -record during an inpatient or ER visit \\ -Time-at-risk (TAR) & 1 day till 365 days from cohort start \\ -& \\ -\textbf{Population Definition} & \\ -Washout Period & 365 \\ -Enter the target cohort multiple times? & No \\ -Allow prior outcomes? & No \\ -Start of time-at-risk & 1 day \\ -End of time-at-risk & 365 days \\ -Require a minimum amount of time-at-risk? & Yes (364 days) \\ -& \\ -\textbf{Model Development} & \\ -Algorithm & Gradient Boosting Machine \\ -Hyper-parameters & ntree:5000, max depth:4 or 7 or 10 and learning rate: -0.001 or 0.01 or 0.1 or 0.9 \\ -Covariates & Gender, Age, Conditions (ever before, \textless365), Drugs -Groups (ever before, \textless365), and Visit Count \\ -Data split & 75\% train, 25\% test. Randomly assigned by person \\ -\end{longtable} - -According to the best practices we need to make a protocol that -completely specifies how we plan to execute our study. This protocol -will be assessed by the governance boards of the participating data -sources in your network study. 
For this a template could be used but we -prefer to automate this process as much as possible by adding -functionality to automatically generate study protocol from a study -specification. We will discuss this in more detail later. - -\hypertarget{study-implementation-1}{% -\subsection{Study implementation}\label{study-implementation-1}} - -Now we have completely design our study we have to implement the study. -We have to generate the target and outcome cohorts and we need to -develop the R code to run against our CDM that will execute the full -study. - -\hypertarget{cohort-instantiation-1}{% -\subsubsection{Cohort instantiation}\label{cohort-instantiation-1}} - -For our study we need to know when a person enters the target and -outcome cohorts. This is stored in a table on the server that contains -the cohort start date and cohort end date for all subjects for a -specific cohort definition. This cohort table has a very simple -structure as shown below: - -\begin{itemize} -\tightlist -\item - \texttt{cohort\_definition\_id}, a unique identifier for - distinguishing between different types of cohorts, e.g.~cohorts of - interest and outcome cohorts. -\item - \texttt{subject\_id}, a unique identifier corresponding to the - \texttt{person\_id} in the CDM. -\item - \texttt{cohort\_start\_date}, the date the subject enters the cohort. -\item - \texttt{cohort\_end\_date}, the date the subject leaves the cohort. -\end{itemize} - -How do we fill this table according to our cohort definitions? There are -two options for this: - -\begin{enumerate} -\def\labelenumi{\arabic{enumi})} -\item - use the interactive cohort builder tool in - \href{www.github.com/OHDSI/ATLAS}{ATLAS} which can be used to create - cohorts based on inclusion criteria and will automatically populate - this cohort table. -\item - write your own custom SQL statements to fill the cohort table. -\end{enumerate} - -Both methods are described below for our example prediction problem. 
- -\hypertarget{atlas-cohort-builder-1}{% -\subsubsection{ATLAS cohort builder}\label{atlas-cohort-builder-1}} - -\begin{figure} -\centering -\includegraphics{example2/aceinhibitors.webp} -\caption{Target Cohort ACE inhibitors} -\end{figure} - -ATLAS allows you to define cohorts interactively by specifying cohort -entry and cohort exit criteria. Cohort entry criteria involve selecting -one or more initial events, which determine the start date for cohort -entry, and optionally specifying additional inclusion criteria which -filter to the qualifying events. Cohort exit criteria are applied to -each cohort entry record to determine the end date when the person's -episode no longer qualifies for the cohort. For the outcome cohort the -end date is less relevant. As an example, Figure 6 shows how we created -the ACE inhibitors cohort and Figure 7 shows how we created the -angioedema cohort in ATLAS. - -\begin{figure} -\centering -\includegraphics{example2/angioedema.webp} -\caption{Outcome Cohort Angioedema} -\end{figure} - -The T and O cohorts can be found here: - -\begin{itemize} -\tightlist -\item - Ace inhibitors (T): - \url{http://www.ohdsi.org/web/atlas/\#/cohortdefinition/1770617} -\item - Angioedema (O) : - \url{http://www.ohdsi.org/web/atlas/\#/cohortdefinition/1770616} -\end{itemize} - -In depth explanation of cohort creation in ATLAS is out of scope of this -vignette but can be found on the OHDSI wiki pages -\href{http://www.ohdsi.org/web/wiki/doku.php?id=documentation:software:atlas}{(link)}. - -Note that when a cohort is created in ATLAS the cohortid is needed to -extract the data in R. The cohortid can be found at the top of the ATLAS -screen, e.g.~1770617 in Figure 6. - -\hypertarget{custom-cohorts-1}{% -\subsubsection{Custom cohorts}\label{custom-cohorts-1}} - -It is also possible to create cohorts without the use of ATLAS. Using -custom cohort code (SQL) you can make more advanced cohorts if needed. 
- -For our example study, we need to create at table to hold the cohort -data and we need to create SQL code to instantiate this table for both -the AF and Stroke cohorts. Therefore, we create a file called -\emph{AceAngioCohorts.sql} with the following contents: - -\begin{Shaded} -\begin{Highlighting}[] - \CommentTok{/***********************************} -\CommentTok{ File AceAngioCohorts.sql } -\CommentTok{ ***********************************/} - \CommentTok{/*} -\CommentTok{ Create a table to store the persons in the T and C cohort} -\CommentTok{ */} - - \ControlFlowTok{IF}\NormalTok{ OBJECT\_ID(}\StringTok{\textquotesingle{}@resultsDatabaseSchema.PLPAceAngioCohort\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}U\textquotesingle{}}\NormalTok{) }\KeywordTok{IS} \KeywordTok{NOT} \KeywordTok{NULL} - \KeywordTok{DROP} \KeywordTok{TABLE}\NormalTok{ @resultsDatabaseSchema.PLPAceAngioCohort;} - - \KeywordTok{CREATE} \KeywordTok{TABLE}\NormalTok{ @resultsDatabaseSchema.PLPAceAngioCohort } -\NormalTok{ ( } -\NormalTok{ cohort\_definition\_id }\DataTypeTok{INT}\NormalTok{, } -\NormalTok{ subject\_id BIGINT,} -\NormalTok{ cohort\_start\_date }\DataTypeTok{DATE}\NormalTok{, } -\NormalTok{ cohort\_end\_date }\DataTypeTok{DATE} -\NormalTok{ );} - - - \CommentTok{/*} -\CommentTok{ T cohort: [PatientLevelPrediction vignette]: T : patients who are newly } -\CommentTok{ dispensed an ACE inhibitor} -\CommentTok{ {-} persons with a drug exposure record of any \textquotesingle{}ACE inhibitor\textquotesingle{} or } -\CommentTok{ any descendants, indexed at the first diagnosis} -\CommentTok{ {-} who have \textgreater{}364 days of prior observation before their first dispensing} -\CommentTok{ */} - \KeywordTok{INSERT} \KeywordTok{INTO}\NormalTok{ @resultsDatabaseSchema.AceAngioCohort (cohort\_definition\_id, } -\NormalTok{ subject\_id, } -\NormalTok{ cohort\_start\_date, } -\NormalTok{ cohort\_end\_date)} - \KeywordTok{SELECT} \DecValTok{1} \KeywordTok{AS}\NormalTok{ 
cohort\_definition\_id,} -\NormalTok{ Ace.person\_id }\KeywordTok{AS}\NormalTok{ subject\_id,} -\NormalTok{ Ace.drug\_start\_date }\KeywordTok{AS}\NormalTok{ cohort\_start\_date,} -\NormalTok{ observation\_period.observation\_period\_end\_date }\KeywordTok{AS}\NormalTok{ cohort\_end\_date} - \KeywordTok{FROM} -\NormalTok{ (} - \KeywordTok{SELECT}\NormalTok{ person\_id, }\FunctionTok{min}\NormalTok{(drug\_exposure\_date) }\KeywordTok{as}\NormalTok{ drug\_start\_date} - \KeywordTok{FROM}\NormalTok{ @cdmDatabaseSchema.drug\_exposure} - \KeywordTok{WHERE}\NormalTok{ drug\_concept\_id }\KeywordTok{IN}\NormalTok{ (}\KeywordTok{SELECT}\NormalTok{ descendant\_concept\_id }\KeywordTok{FROM} -\NormalTok{ @cdmDatabaseSchema.concept\_ancestor }\KeywordTok{WHERE}\NormalTok{ ancestor\_concept\_id }\KeywordTok{IN} -\NormalTok{ (}\DecValTok{1342439}\NormalTok{,}\DecValTok{1334456}\NormalTok{, }\DecValTok{1331235}\NormalTok{, }\DecValTok{1373225}\NormalTok{, }\DecValTok{1310756}\NormalTok{, }\DecValTok{1308216}\NormalTok{, }\DecValTok{1363749}\NormalTok{, }\DecValTok{1341927}\NormalTok{, }\DecValTok{1340128}\NormalTok{, }\DecValTok{1335471} \CommentTok{/*ace inhibitors*/}\NormalTok{))} - \KeywordTok{GROUP} \KeywordTok{BY}\NormalTok{ person\_id} -\NormalTok{ ) Ace} - \KeywordTok{INNER} \KeywordTok{JOIN}\NormalTok{ @cdmDatabaseSchema.observation\_period} - \KeywordTok{ON}\NormalTok{ Ace.person\_id }\OperatorTok{=}\NormalTok{ observation\_period.person\_id} - \KeywordTok{AND}\NormalTok{ Ace.drug\_start\_date }\OperatorTok{\textgreater{}=}\NormalTok{ dateadd(dd,}\DecValTok{364}\NormalTok{, } -\NormalTok{ observation\_period.observation\_period\_start\_date)} - \KeywordTok{AND}\NormalTok{ Ace.drug\_start\_date }\OperatorTok{\textless{}=}\NormalTok{ observation\_period.observation\_period\_end\_date} -\NormalTok{ ;} - - \CommentTok{/*} -\CommentTok{ C cohort: [PatientLevelPrediction vignette]: O: Angioedema} -\CommentTok{ */} - \KeywordTok{INSERT} \KeywordTok{INTO}\NormalTok{ 
@resultsDatabaseSchema.AceAngioCohort (cohort\_definition\_id, } -\NormalTok{ subject\_id, } -\NormalTok{ cohort\_start\_date, } -\NormalTok{ cohort\_end\_date)} - \KeywordTok{SELECT} \DecValTok{2} \KeywordTok{AS}\NormalTok{ cohort\_definition\_id,} -\NormalTok{ angioedema.person\_id }\KeywordTok{AS}\NormalTok{ subject\_id,} -\NormalTok{ angioedema.condition\_start\_date }\KeywordTok{AS}\NormalTok{ cohort\_start\_date,} -\NormalTok{ angioedema.condition\_start\_date }\KeywordTok{AS}\NormalTok{ cohort\_end\_date} - \KeywordTok{FROM} -\NormalTok{ (} - \KeywordTok{SELECT}\NormalTok{ person\_id, condition\_start\_date} - \KeywordTok{FROM}\NormalTok{ @cdmDatabaseSchema.condition\_occurrence} - \KeywordTok{WHERE}\NormalTok{ condition\_concept\_id }\KeywordTok{IN}\NormalTok{ (}\KeywordTok{SELECT} \KeywordTok{DISTINCT}\NormalTok{ descendant\_concept\_id }\KeywordTok{FROM} -\NormalTok{ @cdmDatabaseSchema.concept\_ancestor }\KeywordTok{WHERE}\NormalTok{ ancestor\_concept\_id }\KeywordTok{IN} -\NormalTok{ (}\DecValTok{432791} \CommentTok{/*angioedema*/}\NormalTok{) }\KeywordTok{OR}\NormalTok{ descendant\_concept\_id }\KeywordTok{IN} -\NormalTok{ (}\DecValTok{432791} \CommentTok{/*angioedema*/}\NormalTok{)} -\NormalTok{ ) angioedema} - -\NormalTok{ ;} - -\end{Highlighting} -\end{Shaded} - -This is parameterized SQL which can be used by the -\href{http://github.com/OHDSI/SqlRender}{\texttt{SqlRender}} package. We -use parameterized SQL so we do not have to pre-specify the names of the -CDM and result schemas. That way, if we want to run the SQL on a -different schema, we only need to change the parameter values; we do not -have to change the SQL code. By also making use of translation -functionality in \texttt{SqlRender}, we can make sure the SQL code can -be run in many different environments. - -To execute this sql against our CDM we first need to tell R how to -connect to the server. 
\texttt{PatientLevelPrediction} uses the -\href{http://github.com/ohdsi/DatabaseConnector}{\texttt{DatabaseConnector}} -package, which provides a function called -\texttt{createConnectionDetails}. Type \texttt{?createConnectionDetails} -for the specific settings required for the various database management -systems (DBMS). For example, one might connect to a PostgreSQL database -using this code: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{ connectionDetails }\OtherTok{\textless{}{-}} \FunctionTok{createConnectionDetails}\NormalTok{(}\AttributeTok{dbms =} \StringTok{"postgresql"}\NormalTok{, } - \AttributeTok{server =} \StringTok{"localhost/ohdsi"}\NormalTok{, } - \AttributeTok{user =} \StringTok{"joe"}\NormalTok{, } - \AttributeTok{password =} \StringTok{"supersecret"}\NormalTok{)} - -\NormalTok{ cdmDatabaseSchema }\OtherTok{\textless{}{-}} \StringTok{"my\_cdm\_data"} -\NormalTok{ cohortsDatabaseSchema }\OtherTok{\textless{}{-}} \StringTok{"my\_results"} -\NormalTok{ cdmVersion }\OtherTok{\textless{}{-}} \StringTok{"5"} -\end{Highlighting} -\end{Shaded} - -The last three lines define the \texttt{cdmDatabaseSchema} and -\texttt{cohortsDatabaseSchema} variables, as well as the CDM version. We -will use these later to tell R where the data in CDM format live, where -we want to create the cohorts of interest, and what version CDM is used. -Note that for Microsoft SQL Server, databaseschemas need to specify both -the database and the schema, so for example -\texttt{cdmDatabaseSchema\ \textless{}-\ "my\_cdm\_data.dbo"}. 
- -\begin{Shaded} -\begin{Highlighting}[] - \FunctionTok{library}\NormalTok{(SqlRender)} -\NormalTok{ sql }\OtherTok{\textless{}{-}} \FunctionTok{readSql}\NormalTok{(}\StringTok{"AceAngioCohorts.sql"}\NormalTok{)} -\NormalTok{ sql }\OtherTok{\textless{}{-}} \FunctionTok{render}\NormalTok{(sql,} - \AttributeTok{cdmDatabaseSchema =}\NormalTok{ cdmDatabaseSchema,} - \AttributeTok{cohortsDatabaseSchema =}\NormalTok{ cohortsDatabaseSchema)} -\NormalTok{ sql }\OtherTok{\textless{}{-}} \FunctionTok{translate}\NormalTok{(sql, }\AttributeTok{targetDialect =}\NormalTok{ connectionDetails}\SpecialCharTok{$}\NormalTok{dbms)} - -\NormalTok{ connection }\OtherTok{\textless{}{-}} \FunctionTok{connect}\NormalTok{(connectionDetails)} - \FunctionTok{executeSql}\NormalTok{(connection, sql)} -\end{Highlighting} -\end{Shaded} - -In this code, we first read the SQL from the file into memory. In the -next line, we replace four parameter names with the actual values. We -then translate the SQL into the dialect appropriate for the DBMS we -already specified in the \texttt{connectionDetails}. Next, we connect to -the server, and submit the rendered and translated SQL. - -If all went well, we now have a table with the events of interest. 
We -can see how many events per type: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{ sql }\OtherTok{\textless{}{-}} \FunctionTok{paste}\NormalTok{(}\StringTok{"SELECT cohort\_definition\_id, COUNT(*) AS count"}\NormalTok{,} - \StringTok{"FROM @cohortsDatabaseSchema.AceAngioCohort"}\NormalTok{,} - \StringTok{"GROUP BY cohort\_definition\_id"}\NormalTok{)} -\NormalTok{ sql }\OtherTok{\textless{}{-}} \FunctionTok{render}\NormalTok{(sql, }\AttributeTok{cohortsDatabaseSchema =}\NormalTok{ cohortsDatabaseSchema)} -\NormalTok{ sql }\OtherTok{\textless{}{-}} \FunctionTok{translate}\NormalTok{(sql, }\AttributeTok{targetDialect =}\NormalTok{ connectionDetails}\SpecialCharTok{$}\NormalTok{dbms)} - - \FunctionTok{querySql}\NormalTok{(connection, sql)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -## cohort_definition_id count -## 1 1 0 -## 2 2 0 -\end{verbatim} - -\hypertarget{study-script-creation-1}{% -\subsubsection{Study script creation}\label{study-script-creation-1}} - -In this section we assume that our cohorts have been created either by -using ATLAS or a custom SQL script. We will first explain how to create -an R script yourself that will execute our study as we have defined -earlier. - -\hypertarget{data-extraction-1}{% -\subsubsection{Data extraction}\label{data-extraction-1}} - -Now we can tell \texttt{PatientLevelPrediction} to extract all necessary -data for our analysis. This is done using the -\href{https://github.com/OHDSI/FeatureExtraction}{\texttt{FeatureExtractionPackage}}. -In short the FeatureExtractionPackage allows you to specify which -features (covariates) need to be extracted, e.g.~all conditions and drug -exposures. It also supports the creation of custom covariates. For more -detailed information on the FeatureExtraction package see its -\href{https://github.com/OHDSI/FeatureExtraction}{vignettes}. 
For our -example study we decided to use these settings: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{ covariateSettings }\OtherTok{\textless{}{-}} \FunctionTok{createCovariateSettings}\NormalTok{(}\AttributeTok{useDemographicsGender =} \ConstantTok{TRUE}\NormalTok{,} - \AttributeTok{useDemographicsAge =} \ConstantTok{TRUE}\NormalTok{,} - \AttributeTok{useConditionGroupEraLongTerm =} \ConstantTok{TRUE}\NormalTok{,} - \AttributeTok{useConditionGroupEraAnyTimePrior =} \ConstantTok{TRUE}\NormalTok{,} - \AttributeTok{useDrugGroupEraLongTerm =} \ConstantTok{TRUE}\NormalTok{,} - \AttributeTok{useDrugGroupEraAnyTimePrior =} \ConstantTok{TRUE}\NormalTok{,} - \AttributeTok{useVisitConceptCountLongTerm =} \ConstantTok{TRUE}\NormalTok{,} - \AttributeTok{longTermStartDays =} \SpecialCharTok{{-}}\DecValTok{365}\NormalTok{,} - \AttributeTok{endDays =} \SpecialCharTok{{-}}\DecValTok{1}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -The final step for extracting the data is to run the \texttt{getPlpData} -function and input the connection details, the database schema where the -cohorts are stored, the cohort definition ids for the cohort and -outcome, and the washoutPeriod which is the minimum number of days prior -to cohort index date that the person must have been observed to be -included into the data, and finally input the previously constructed -covariate settings. 
- -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{databaseDetails }\OtherTok{\textless{}{-}} \FunctionTok{createDatabaseDetails}\NormalTok{(} - \AttributeTok{connectionDetails =}\NormalTok{ connectionDetails,} - \AttributeTok{cdmDatabaseSchema =}\NormalTok{ cdmDatabaseSchema,} - \AttributeTok{cohortDatabaseSchema =}\NormalTok{ resultsDatabaseSchema,} - \AttributeTok{cohortTable =} \StringTok{\textquotesingle{}AceAngioCohort\textquotesingle{}}\NormalTok{,} - \AttributeTok{cohortId =} \DecValTok{1}\NormalTok{,} - \AttributeTok{outcomeDatabaseSchema =}\NormalTok{ resultsDatabaseSchema,} - \AttributeTok{outcomeTable =} \StringTok{\textquotesingle{}AceAngioCohort\textquotesingle{}}\NormalTok{,} - \AttributeTok{outcomeIds =} \DecValTok{2} -\NormalTok{ )} - -\NormalTok{restrictPlpDataSettings }\OtherTok{\textless{}{-}} \FunctionTok{createRestrictPlpDataSettings}\NormalTok{(} - \AttributeTok{sampleSize =} \DecValTok{10000} -\NormalTok{ )} - -\NormalTok{plpData }\OtherTok{\textless{}{-}} \FunctionTok{getPlpData}\NormalTok{(} - \AttributeTok{databaseDetails =}\NormalTok{ databaseDetails, } - \AttributeTok{covariateSettings =}\NormalTok{ covariateSettings, } - \AttributeTok{restrictPlpDataSettings =}\NormalTok{ restrictPlpDataSettings} -\NormalTok{ )} -\end{Highlighting} -\end{Shaded} - -Note that if the cohorts are created in ATLAS its corresponding cohort -database schema needs to be selected. There are many additional -parameters for the \texttt{getPlpData} function which are all documented -in the \texttt{PatientLevelPrediction} manual. The resulting -\texttt{plpData} object uses the package \texttt{ff} to store -information in a way that ensures R does not run out of memory, even -when the data are large. - -Creating the \texttt{plpData} object can take considerable computing -time, and it is probably a good idea to save it for future sessions. -Because \texttt{plpData} uses \texttt{ff}, we cannot use R's regular -save function. 
Instead, we'll have to use the \texttt{savePlpData()} -function: - -\begin{Shaded} -\begin{Highlighting}[] - \FunctionTok{savePlpData}\NormalTok{(plpData, }\StringTok{"angio\_in\_ace\_data"}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -We can use the \texttt{loadPlpData()} function to load the data in a -future session. - -\hypertarget{additional-inclusion-criteria-1}{% -\subsubsection{Additional inclusion -criteria}\label{additional-inclusion-criteria-1}} - -To completely define the prediction problem the final study population -is obtained by applying additional constraints on the two earlier -defined cohorts, e.g., a minumim time at risk can be enforced -(\texttt{requireTimeAtRisk,\ minTimeAtRisk}) and we can specify if this -also applies to patients with the outcome (\texttt{includeAllOutcomes}). -Here we also specify the start and end of the risk window relative to -target cohort start. For example, if we like the risk window to start 30 -days after the at-risk cohort start and end a year later we can set -\texttt{riskWindowStart\ =\ 30} and \texttt{riskWindowEnd\ =\ 365}. In -some cases the risk window needs to start at the cohort end date. This -can be achieved by setting \texttt{addExposureToStart\ =\ TRUE} which -adds the cohort (exposure) time to the start date. - -In Appendix 1, we demonstrate the effect of these settings on the subset -of the persons in the target cohort that end up in the final study -population. 
- -In the example below all the settings we defined for our study are -imposed: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{ populationSettings }\OtherTok{\textless{}{-}} \FunctionTok{createStudyPopulationSettings}\NormalTok{(} - \AttributeTok{washoutPeriod =} \DecValTok{364}\NormalTok{,} - \AttributeTok{firstExposureOnly =} \ConstantTok{FALSE}\NormalTok{,} - \AttributeTok{removeSubjectsWithPriorOutcome =} \ConstantTok{TRUE}\NormalTok{,} - \AttributeTok{priorOutcomeLookback =} \DecValTok{9999}\NormalTok{,} - \AttributeTok{riskWindowStart =} \DecValTok{1}\NormalTok{,} - \AttributeTok{riskWindowEnd =} \DecValTok{365}\NormalTok{, } - \AttributeTok{minTimeAtRisk =} \DecValTok{364}\NormalTok{,} - \AttributeTok{startAnchor =} \StringTok{\textquotesingle{}cohort start\textquotesingle{}}\NormalTok{,} - \AttributeTok{endAnchor =} \StringTok{\textquotesingle{}cohort start\textquotesingle{}}\NormalTok{,} - \AttributeTok{requireTimeAtRisk =} \ConstantTok{TRUE}\NormalTok{,} - \AttributeTok{includeAllOutcomes =} \ConstantTok{TRUE} -\NormalTok{ )} -\end{Highlighting} -\end{Shaded} - -\hypertarget{spliting-the-data-into-trainingvalidationtesting-datasets-1}{% -\subsubsection{Spliting the data into training/validation/testing -datasets}\label{spliting-the-data-into-trainingvalidationtesting-datasets-1}} - -When developing a prediction model using supervised learning (when you -have features paired with labels for a set of patients), the first step -is to design the development/internal validation process. This requires -specifying how to select the model hyper-parameters, how to learn the -model parameters and how to fairly evaluate the model. In general, the -validation set is used to pick hyper-parameters, the training set is -used to learn the model parameters and the test set is used to perform -fair internal validation. However, cross-validation can be implemented -to pick the hyper-parameters on the training data (so a validation data -set is not required). 
Cross validation can also be used to estimate -internal validation (so a testing data set is not required). - -In small data the best approach for internal validation has been shown -to be boostrapping. However, in big data (many patients and many -features) bootstrapping is generally not feasible. In big data our -research has shown that it is just important to have some form of fair -evaluation (use a test set or cross validation). For full details see -\href{add\%20link}{our BMJ open paper}. - -In the PatientLevelPrediction package, the splitSettings define how the -plpData are partitioned into training/validation/testing data. Cross -validation is always done, but using a test set is optional (when the -data are small, it may be optimal to not use a test set). For the -splitSettings we can use the type (stratified/time/subject) and -testFraction parameters to split the data in a 75\%-25\% split and run -the patient-level prediction pipeline: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{ splitSettings }\OtherTok{\textless{}{-}} \FunctionTok{createDefaultSplitSetting}\NormalTok{(} - \AttributeTok{trainFraction =} \FloatTok{0.75}\NormalTok{,} - \AttributeTok{testFraction =} \FloatTok{0.25}\NormalTok{,} - \AttributeTok{type =} \StringTok{\textquotesingle{}stratified\textquotesingle{}}\NormalTok{,} - \AttributeTok{nfold =} \DecValTok{2}\NormalTok{, } - \AttributeTok{splitSeed =} \DecValTok{1234} -\NormalTok{ )} -\end{Highlighting} -\end{Shaded} - -Note: it is possible to add a custom method to specify how the plpData -are partitioned into training/validation/testing data, see -\href{https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/AddingCustomSplitting.pdf}{vignette -for custom splitting}. - -\hypertarget{preprocessing-the-training-data-1}{% -\subsubsection{Preprocessing the training -data}\label{preprocessing-the-training-data-1}} - -There a numerous data processing settings that a user must specify when -developing a prediction model. 
These are: * Whether to under-sample or -over-sample the training data (this may be useful when there is class -imballance (e.g., the outcome is very rare or very common)) * Whether to -perform feature engineering or feature selection (e.g., create latent -variables that are not observed in the data or reduce the dimensionality -of the data) * Whether to remove redundant features and normalize the -data (this is required for some models) - -The default sample settings does nothing, it simply returns the -trainData as input, see below: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{ sampleSettings }\OtherTok{\textless{}{-}} \FunctionTok{createSampleSettings}\NormalTok{()} -\end{Highlighting} -\end{Shaded} - -However, the current package contains methods of under-sampling the -non-outcome patients. To perform undersampling, the \texttt{type} input -should be `underSample' and \texttt{numberOutcomestoNonOutcomes} must be -specified (an integer specifying the number of non-outcomes per -outcome). It is possible to add any custom function for over/under -sampling, see -\href{https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/AddingCustomSamples.pdf}{vignette -for custom sampling}. - -It is possible to specify a combination of feature engineering functions -that take as input the trainData and output a new trainData with -different features. The default feature engineering setting does -nothing: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{ featureEngineeringSettings }\OtherTok{\textless{}{-}} \FunctionTok{createFeatureEngineeringSettings}\NormalTok{()} -\end{Highlighting} -\end{Shaded} - -However, it is possible to add custom feature engineering functions into -the pipeline, see -\href{https://github.com/OHDSI/PatientLevelPrediction/blob/master/inst/doc/AddingCustomfeatureEngineering.pdf}{vignette -for custom feature engineering}. - -Finally, the preprocessing setting is required. 
For this setting the -user can define \texttt{minFraction}, this removes any features that is -observed in the training data for less than 0.01 fraction of the -patients. So, if \texttt{minFraction\ =\ 0.01} then any feature that is -seen in less than 1 percent of the target population is removed. The -input \texttt{normalize} specifies whether the features are scaled -between 0 and 1, this is required for certain models (e.g., LASSO -logistic regression). The input \texttt{removeRedundancy} specifies -whether features that are observed in all of the target population are -removed. - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{ preprocessSettingsSettings }\OtherTok{\textless{}{-}} \FunctionTok{createPreprocessSettings}\NormalTok{(} - \AttributeTok{minFraction =} \FloatTok{0.01}\NormalTok{, } - \AttributeTok{normalize =}\NormalTok{ T, } - \AttributeTok{removeRedundancy =}\NormalTok{ T} -\NormalTok{ )} -\end{Highlighting} -\end{Shaded} - -\hypertarget{model-development-1}{% -\subsubsection{Model Development}\label{model-development-1}} - -In the set function of an algorithm the user can specify a list of -eligible values for each hyper-parameter. All possible combinations of -the hyper-parameters are included in a so-called grid search using -cross-validation on the training set. If a user does not specify any -value then the default value is used instead. - -For example, if we use the following settings for the -gradientBoostingMachine: ntrees=c(100,200), maxDepth=4 the grid search -will apply the gradient boosting machine algorithm with ntrees=100 and -maxDepth=4 plus the default settings for other hyper-parameters and -ntrees=200 and maxDepth=4 plus the default settings for other -hyper-parameters. The hyper-parameters that lead to the -bestcross-validation performance will then be chosen for the final -model. 
For our problem we choose to build a gradient boosting machine model
-with a grid of hyper-parameters

-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{ gbmModel }\OtherTok{\textless{}{-}} \FunctionTok{setGradientBoostingMachine}\NormalTok{(}
- \AttributeTok{ntrees =} \DecValTok{5000}\NormalTok{, }
- \AttributeTok{maxDepth =} \FunctionTok{c}\NormalTok{(}\DecValTok{4}\NormalTok{,}\DecValTok{7}\NormalTok{,}\DecValTok{10}\NormalTok{), }
- \AttributeTok{learnRate =} \FunctionTok{c}\NormalTok{(}\FloatTok{0.001}\NormalTok{,}\FloatTok{0.01}\NormalTok{,}\FloatTok{0.1}\NormalTok{,}\FloatTok{0.9}\NormalTok{)}
-\NormalTok{ )}
-\end{Highlighting}
-\end{Shaded}
-
-The \texttt{runPlp} function requires the \texttt{plpData}, the
-\texttt{outcomeId} specifying the outcome being predicted and the
-settings: \texttt{populationSettings}, \texttt{splitSettings},
-\texttt{sampleSettings}, \texttt{featureEngineeringSettings},
-\texttt{preprocessSettings} and \texttt{modelSettings} to train and
-evaluate the model. 
- -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{ gbmResults }\OtherTok{\textless{}{-}} \FunctionTok{runPlp}\NormalTok{(} - \AttributeTok{plpData =}\NormalTok{ plpData,} - \AttributeTok{outcomeId =} \DecValTok{2}\NormalTok{, } - \AttributeTok{analysisId =} \StringTok{\textquotesingle{}singleDemo2\textquotesingle{}}\NormalTok{,} - \AttributeTok{analysisName =} \StringTok{\textquotesingle{}Demonstration of runPlp for training single PLP models\textquotesingle{}}\NormalTok{,} - \AttributeTok{populationSettings =}\NormalTok{ populationSettings, } - \AttributeTok{splitSettings =}\NormalTok{ splitSettings,} - \AttributeTok{sampleSettings =}\NormalTok{ sampleSettings, } - \AttributeTok{featureEngineeringSettings =}\NormalTok{ featureEngineeringSettings, } - \AttributeTok{preprocessSettings =}\NormalTok{ preprocessSettings,} - \AttributeTok{modelSettings =}\NormalTok{ gbmModel,} - \AttributeTok{logSettings =} \FunctionTok{createLogSettings}\NormalTok{(), } - \AttributeTok{executeSettings =} \FunctionTok{createExecuteSettings}\NormalTok{(} - \AttributeTok{runSplitData =}\NormalTok{ T, } - \AttributeTok{runSampleData =}\NormalTok{ T, } - \AttributeTok{runfeatureEngineering =}\NormalTok{ T, } - \AttributeTok{runPreprocessData =}\NormalTok{ T, } - \AttributeTok{runModelDevelopment =}\NormalTok{ T, } - \AttributeTok{runCovariateSummary =}\NormalTok{ T} -\NormalTok{ ), } - \AttributeTok{saveDirectory =} \FunctionTok{file.path}\NormalTok{(}\FunctionTok{getwd}\NormalTok{(), }\StringTok{\textquotesingle{}singlePlpExample2\textquotesingle{}}\NormalTok{)} -\NormalTok{ )} -\end{Highlighting} -\end{Shaded} - -Under the hood the package will now use the R xgboost package to fit a a -gradient boosting machine model using 75\% of the data and will evaluate -the model on the remaining 25\%. A results data structure is returned -containing information about the model, its performance etc. 
- -You can save the model using: - -\begin{Shaded} -\begin{Highlighting}[] - \FunctionTok{savePlpModel}\NormalTok{(gbmResults}\SpecialCharTok{$}\NormalTok{model, }\AttributeTok{dirPath =} \FunctionTok{file.path}\NormalTok{(}\FunctionTok{getwd}\NormalTok{(), }\StringTok{"model"}\NormalTok{))} -\end{Highlighting} -\end{Shaded} - -You can load the model using: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{ plpModel }\OtherTok{\textless{}{-}} \FunctionTok{loadPlpModel}\NormalTok{(}\FunctionTok{file.path}\NormalTok{(}\FunctionTok{getwd}\NormalTok{(),}\StringTok{\textquotesingle{}model\textquotesingle{}}\NormalTok{))} -\end{Highlighting} -\end{Shaded} - -You can also save the full results structure using: - -\begin{Shaded} -\begin{Highlighting}[] - \FunctionTok{savePlpResult}\NormalTok{(gbmResults, }\AttributeTok{location =} \FunctionTok{file.path}\NormalTok{(}\FunctionTok{getwd}\NormalTok{(),}\StringTok{\textquotesingle{}gbm\textquotesingle{}}\NormalTok{))} -\end{Highlighting} -\end{Shaded} - -To load the full results structure use: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{ gbmResults }\OtherTok{\textless{}{-}} \FunctionTok{loadPlpResult}\NormalTok{(}\FunctionTok{file.path}\NormalTok{(}\FunctionTok{getwd}\NormalTok{(),}\StringTok{\textquotesingle{}gbm\textquotesingle{}}\NormalTok{))} -\end{Highlighting} -\end{Shaded} - -\newpage - -\hypertarget{study-package-creation}{% -\section{Study package creation}\label{study-package-creation}} - -The script we created manually above can also be automatically created -using a powerful feature in ATLAS. By creating a new prediction study -(left menu) you can select the Target and Outcome as created in ATLAS, -set all the study parameters, and then you can download a R package that -you can use to execute your study. What is really powerful is that you -can add multiple Ts, Os, covariate settings etc. The package will then -run all the combinations of automatically as separate analyses. 
The
-screenshots below explain this process.
-
-\begin{enumerate}
-\def\labelenumi{\arabic{enumi})}
-\item
-  Create a new prediction study and select your target and outcome
-  cohorts.
-
-  \includegraphics{atlasplp1.webp}
-\item
-  Specify one or more analysis settings.
-
-  \includegraphics{atlasplp2.webp}
-
-  \newpage
-\item
-  Specify the training settings
-
-  \includegraphics{atlasplp3.webp}
-\item
-  Specify the execution settings
-
-  \includegraphics{atlasplp4.webp}
-
-  \newpage
-\end{enumerate}
-
-ATLAS can build an R package for you that will execute the full study
-against your CDM. Below the steps are explained how to do this in ATLAS.
-
-\begin{enumerate}
-\def\labelenumi{\arabic{enumi})}
-\item
-  Under utilities you can find download. Click on the button to review
-  the full study specification
-
-  \begin{figure}
-  \centering
-  \includegraphics{atlasdownload1.webp}
-  \caption{R package download functionality in ATLAS}
-  \end{figure}
-\item
-  You now have to review that you indeed want to run all these analyses
-  (cartesian product of all the settings for each T and O combination).
-
-  \begin{figure}
-  \centering
-  \includegraphics{atlasdownload2.webp}
-  \caption{R package download functionality in ATLAS}
-  \end{figure}
-\item
-  If you agree, you give the package a name, and download the package as
-  a zipfile.
-\item
-  By opening the R package in R studio and building the package you can
-  run the study using the \texttt{execute} function. There is also an
-  example CodeToRun.R script available in the extras folder of the
-  package with extra instructions.
-\end{enumerate}
-
-\hypertarget{internal-validation}{%
-\section{Internal validation}\label{internal-validation}}
-
-Once we execute the study, the runPlp() function returns the trained
-model and the evaluation of the model on the train/test sets.
-
-You can interactively view the results by running:
-\texttt{viewPlp(runPlp=lrResults)}. 
This will generate a Shiny App in -your browser in which you can view all performance measures created by -the framework as shown in the figure below. - -\begin{figure} -\centering -\includegraphics{shinysummary.webp} -\caption{Summary of all the performance measures of the analyses} -\end{figure} - -Furthermore, many interactive plots are available in the Shiny App, for -example the ROC curve in which you can move over the plot to see the -threshold and the corresponding sensitivity and specificity values. - -\begin{figure} -\centering -\includegraphics{shinyroc.webp} -\caption{Example of the interactive ROC curve} -\end{figure} - -To generate and save all the evaluation plots to a folder run the -following code: - -\begin{Shaded} -\begin{Highlighting}[] -\FunctionTok{plotPlp}\NormalTok{(lrResults, }\AttributeTok{dirPath=}\FunctionTok{getwd}\NormalTok{())} -\end{Highlighting} -\end{Shaded} - -The plots are described in more detail in the next sections. - -\newpage - -\hypertarget{discrimination}{% -\subsection{Discrimination}\label{discrimination}} - -The Receiver Operating Characteristics (ROC) plot shows the sensitivity -against 1-specificity on the test set. The plot illustrates how well the -model is able to discriminate between the people with the outcome and -those without. The dashed diagonal line is the performance of a model -that randomly assigns predictions. The higher the area under the ROC -plot the better the discrimination of the model. The plot is created by -changing the probability threshold to assign the positive class. - -\begin{figure} -\centering -\includegraphics{sparseRoc.webp} -\caption{Receiver Operating Characteristic Plot} -\end{figure} - -\newpage - -\#\# Calibration - -The calibration plot shows how close the predicted risk is to the -observed risk. The diagonal dashed line thus indicates a perfectly -calibrated model. 
The ten (or fewer) dots represent the mean predicted -values for each quantile plotted against the observed fraction of people -in that quantile who had the outcome (observed fraction). The straight -black line is the linear regression using these 10 plotted quantile mean -predicted vs observed fraction points. The straight vertical lines -represented the 95\% lower and upper confidence intervals of the slope -of the fitted line. - -\begin{figure} -\centering -\includegraphics{sparseCalibration.webp} -\caption{Calibration Plot} -\end{figure} - -\newpage - -\hypertarget{smooth-calibration}{% -\subsection{Smooth Calibration}\label{smooth-calibration}} - -Similar to the traditional calibration shown above the Smooth -Calibration plot shows the relationship between predicted and observed -risk. the major difference is that the smooth fit allows for a more fine -grained examination of this. Whereas the traditional plot will be -heavily influenced by the areas with the highest density of data the -smooth plot will provide the same information for this region as well as -a more accurate interpretation of areas with lower density. the plot -also contains information on the distribution of the outcomes relative -to predicted risk. - -However, the increased information gain comes at a computational cost. -It is recommended to use the traditional plot for examination and then -to produce the smooth plot for final versions. To create the smooth -calibarion plot you have to run the follow command: - -\begin{Shaded} -\begin{Highlighting}[] -\FunctionTok{plotSmoothCalibration}\NormalTok{(lrResults)} -\end{Highlighting} -\end{Shaded} - -See the help function for more information, on how to set the smoothing -method etc. - -The example below is from another study that better demonstrates the -impact of using a smooth calibration plot. The default line fit would -not highlight the miss-calibration at the lower predicted probability -levels that well. 
-
-\begin{figure}
-\centering
-\includegraphics{smoothCalibration.jpeg}
-\caption{Smooth Calibration plot}
-\end{figure}
-
-\newpage
-
-\#\# Preference distribution
-
-The preference distribution plots are the preference score distributions
-corresponding to i) people in the test set with the outcome (red) and
-ii) people in the test set without the outcome (blue).
-
-\begin{figure}
-\centering
-\includegraphics{preferencePDF.webp}
-\caption{Preference Plot}
-\end{figure}
-
-\newpage
-
-\#\# Predicted probability distribution
-
-The prediction distribution box plots are for the predicted risks of the
-people in the test set with the outcome (class 1: blue) and without the
-outcome (class 0: red).
-
-The box plots in the Figure show that the predicted probability of the
-outcome is indeed higher for those with the outcome but there is also
-overlap between the two distributions which leads to an imperfect
-discrimination.
-
-\begin{figure}
-\centering
-\includegraphics{predictionDistribution.webp}
-\caption{Prediction Distribution Box Plot}
-\end{figure}
-
-\newpage
-
-\#\# Test-Train similarity
-
-The test-train similarity is assessed by plotting the mean covariate
-values in the train set against those in the test set for people with
-and without the outcome.
-
-The results for our example look very promising since the mean values
-of the covariates are on the diagonal.
-
-\begin{figure}
-\centering
-\includegraphics{generalizability.webp}
-\caption{Similarity plots of train and test set}
-\end{figure}
-
-\newpage
-
-\#\# Variable scatter plot
-
-The variable scatter plot shows the mean covariate value for the people
-with the outcome against the mean covariate value for the people without
-the outcome. The color of the dots corresponds to the inclusion (green)
-or exclusion in the model (blue), respectively. It is highly recommended
-to use the Shiny App since this allows you to hover over a covariate to
-show more details (name, value etc). 
- -The plot shows that the mean of most of the covariates is higher for -subjects with the outcome compared to those without. - -\begin{figure} -\centering -\includegraphics{variableScatterplot.webp} -\caption{Variabel scatter Plot} -\end{figure} - -\newpage - -\#\# Precision recall - -Precision (P) is defined as the number of true positives (Tp) over the -number of true positives plus the number of false positives (Fp). - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{P }\OtherTok{\textless{}{-}}\NormalTok{ Tp}\SpecialCharTok{/}\NormalTok{(Tp}\SpecialCharTok{+}\NormalTok{Fp)} -\end{Highlighting} -\end{Shaded} - -Recall (R) is defined as the number of true positives (Tp) over the -number of true positives plus the number of false negatives (Fn). - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{R }\OtherTok{\textless{}{-}}\NormalTok{ Tp}\SpecialCharTok{/}\NormalTok{(Tp }\SpecialCharTok{+}\NormalTok{ Fn)} -\end{Highlighting} -\end{Shaded} - -These quantities are also related to the (F1) score, which is defined as -the harmonic mean of precision and recall. - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{F1 }\OtherTok{\textless{}{-}} \DecValTok{2}\SpecialCharTok{*}\NormalTok{P}\SpecialCharTok{*}\NormalTok{R}\SpecialCharTok{/}\NormalTok{(P}\SpecialCharTok{+}\NormalTok{R)} -\end{Highlighting} -\end{Shaded} - -Note that the precision can either decrease or increase if the threshold -is lowered. Lowering the threshold of a classifier may increase the -denominator, by increasing the number of results returned. If the -threshold was previously set too high, the new results may all be true -positives, which will increase precision. If the previous threshold was -about right or too low, further lowering the threshold will introduce -false positives, decreasing precision. - -For Recall the denominator does not depend on the classifier threshold -(Tp+Fn is a constant). 
This means that lowering the classifier threshold -may increase recall, by increasing the number of true positive results. -It is also possible that lowering the threshold may leave recall -unchanged, while the precision fluctuates. - -\begin{figure} -\centering -\includegraphics{precisionRecall.webp} -\caption{Precision Recall Plot} -\end{figure} - -\newpage - -\#\# Demographic summary - -This plot shows for females and males the expected and observed risk in -different age groups together with a confidence area. - -The results show that our model is well calibrated across gender and age -groups. - -\begin{figure} -\centering -\includegraphics{demographicSummary.webp} -\caption{Demographic Summary Plot} -\end{figure} - -\newpage - -\# External validation - -We recommend to always perform external validation, i.e.~apply the final -model on as much new datasets as feasible and evaluate its performance. - -\begin{Shaded} -\begin{Highlighting}[] -\CommentTok{\# load the trained model} -\NormalTok{plpModel }\OtherTok{\textless{}{-}} \FunctionTok{loadPlpModel}\NormalTok{(}\FunctionTok{getwd}\NormalTok{(),}\StringTok{\textquotesingle{}model\textquotesingle{}}\NormalTok{)} - -\CommentTok{\# add details of new database} -\NormalTok{validationDatabaseDetails }\OtherTok{\textless{}{-}} \FunctionTok{createDatabaseDetails}\NormalTok{()} - -\CommentTok{\# to externally validate the model and perform recalibration run:} -\FunctionTok{externalValidateDbPlp}\NormalTok{(} - \AttributeTok{plpModel =}\NormalTok{ plpModel,} - \AttributeTok{validationDatabaseDetails =}\NormalTok{ validationDatabaseDetails,} - \AttributeTok{validationRestrictPlpDataSettings =}\NormalTok{ plpModel}\SpecialCharTok{$}\NormalTok{settings}\SpecialCharTok{$}\NormalTok{plpDataSettings,} - \AttributeTok{settings =} \FunctionTok{createValidationSettings}\NormalTok{(} - \AttributeTok{recalibrate =} \StringTok{\textquotesingle{}weakRecalibration\textquotesingle{}} -\NormalTok{ ),} - \AttributeTok{outputFolder =} 
\FunctionTok{getwd}\NormalTok{()} -\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -This will extract the new plpData from the specified schemas and cohort -tables. It will then apply the same population settings and the trained -plp model. Finally, it will evaluate the performance and return the -standard output as \texttt{validation\$performanceEvaluation} and it -will also return the prediction on the population as -\texttt{validation\$prediction}. They can be inserted into the shiny app -for viewing the model and validation by running: -\texttt{viewPlp(runPlp=plpResult,\ validatePlp=validation\ )}. - -\newpage - -\hypertarget{other-functionality}{% -\section{Other functionality}\label{other-functionality}} - -The package has much more functionality than described in this vignette -and contributions have been made my many persons in the OHDSI community. -The table below provides an overview: - -\begin{longtable}[]{@{} - >{\raggedright\arraybackslash}p{(\columnwidth - 4\tabcolsep) * \real{0.2361}} - >{\raggedright\arraybackslash}p{(\columnwidth - 4\tabcolsep) * \real{0.5278}} - >{\raggedright\arraybackslash}p{(\columnwidth - 4\tabcolsep) * \real{0.2361}}@{}} -\toprule\noalign{} -\begin{minipage}[b]{\linewidth}\raggedright -Functionality -\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright -Description -\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright -Vignette -\end{minipage} \\ -\midrule\noalign{} -\endhead -\bottomrule\noalign{} -\endlastfoot -Builing Multiple Models & This vignette describes how you can run -multiple models automatically & -\href{https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/BuildingMultiplePredictiveModels.pdf}{\texttt{Vignette}} \\ -Custom Models & This vignette describes how you can add your own custom -algorithms in the framework & -\href{https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/AddingCustomModels.pdf}{\texttt{Vignette}} \\ -Custom Splitting Functions & This vignette 
describes how you can add -your own custom training/validation/testing splitting functions in the -framework & -\href{https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/AddingCustomSplitting.pdf}{\texttt{Vignette}} \\ -Custom Sampling Functions & This vignette describes how you can add your -own custom sampling functions in the framework & -\href{https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/AddingCustomSamples.pdf}{\texttt{Vignette}} \\ -Custom Feature Engineering/Selection & This vignette describes how you -can add your own custom feature engineering and selection functions in -the framework & -\href{https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/AddingCustomFeatureEngineering.pdf}{\texttt{Vignette}} \\ -Ensemble models & This vignette describes how you can use the framework -to build ensemble models, i.e combine multiple models in a super learner -& -\href{https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/BuildingEnsembleModels.pdf}{\texttt{Vignette}} \\ -Learning curves & Learning curves assess the effect of training set size -on model performance by training a sequence of prediction models on -successively larger subsets of the training set. A learning curve plot -can also help in diagnosing a bias or variance problem as explained -below. 
& -\href{https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/GeneratingLearningCurves.pdf}{\texttt{Vignette}} \\ -\end{longtable} - -\hypertarget{demos}{% -\section{Demos}\label{demos}} - -We have added several demos in the package that run on simulated data: - -\begin{Shaded} -\begin{Highlighting}[] -\CommentTok{\# Show all demos in our package: } -\FunctionTok{demo}\NormalTok{(}\AttributeTok{package =} \StringTok{"PatientLevelPrediction"}\NormalTok{)} - -\CommentTok{\# For example, to run the SingleModelDemo that runs Lasso and shows you how to run the Shiny App use this call} -\FunctionTok{demo}\NormalTok{(}\StringTok{"SingleModelDemo"}\NormalTok{, }\AttributeTok{package =} \StringTok{"PatientLevelPrediction"}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\newpage - -\hypertarget{acknowledgments}{% -\section{Acknowledgments}\label{acknowledgments}} - -Considerable work has been dedicated to provide the -\texttt{PatientLevelPrediction} package. - -\begin{Shaded} -\begin{Highlighting}[] -\FunctionTok{citation}\NormalTok{(}\StringTok{"PatientLevelPrediction"}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -## To cite PatientLevelPrediction in publications use: -## -## Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek P (2018). "Design and -## implementation of a standardized framework to generate and evaluate patient-level -## prediction models using observational healthcare data." _Journal of the American -## Medical Informatics Association_, *25*(8), 969-975. -## . -## -## A BibTeX entry for LaTeX users is -## -## @Article{, -## author = {J. M. Reps and M. J. Schuemie and M. A. Suchard and P. B. Ryan and P. 
Rijnbeek}, -## title = {Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data}, -## journal = {Journal of the American Medical Informatics Association}, -## volume = {25}, -## number = {8}, -## pages = {969-975}, -## year = {2018}, -## url = {https://doi.org/10.1093/jamia/ocy032}, -## } -\end{verbatim} - -Further, \texttt{PatientLevelPrediction} makes extensive use of the -\texttt{Cyclops} package. - -\begin{Shaded} -\begin{Highlighting}[] -\FunctionTok{citation}\NormalTok{(}\StringTok{"Cyclops"}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -## To cite Cyclops in publications use: -## -## Suchard MA, Simpson SE, Zorych I, Ryan P, Madigan D (2013). "Massive parallelization of -## serial inference algorithms for complex generalized linear models." _ACM Transactions -## on Modeling and Computer Simulation_, *23*, 10. -## . -## -## A BibTeX entry for LaTeX users is -## -## @Article{, -## author = {M. A. Suchard and S. E. Simpson and I. Zorych and P. Ryan and D. Madigan}, -## title = {Massive parallelization of serial inference algorithms for complex generalized linear models}, -## journal = {ACM Transactions on Modeling and Computer Simulation}, -## volume = {23}, -## pages = {10}, -## year = {2013}, -## url = {https://dl.acm.org/doi/10.1145/2414416.2414791}, -## } -\end{verbatim} - -\textbf{Please reference this paper if you use the PLP Package in your -work:} - -\href{http://dx.doi.org/10.1093/jamia/ocy032}{Reps JM, Schuemie MJ, -Suchard MA, Ryan PB, Rijnbeek PR. Design and implementation of a -standardized framework to generate and evaluate patient-level prediction -models using observational healthcare data. J Am Med Inform Assoc. -2018;25(8):969-975.} - -This work is supported in part through the National Science Foundation -grant IIS 1251151. 
- -\newpage - -\hypertarget{appendix-1-study-population-settings-details}{% -\section*{Appendix 1: Study population settings -details}\label{appendix-1-study-population-settings-details}} -\addcontentsline{toc}{section}{Appendix 1: Study population settings -details} - -In the figures below the effect is shown of the -removeSubjectsWithPriorOutcome, requireTimAtRisk, and includeAllOutcomes -booleans on the final study population. We start with a Target Cohort -with firstExposureOnly = false and we require a washout period = 1095. -We then subset the target cohort based on additional constraints. The -final study population in the Venn diagrams below are colored green. - -\begin{enumerate} -\def\labelenumi{\arabic{enumi})} -\item - Require minimum time-at-risk for all person in the target cohort - - \includegraphics{popdef1.webp} -\item - Require minumum time-at-risk for target cohort, except for persons - with outcomes during time-at-risk. - - \includegraphics{popdef2.webp} -\end{enumerate} - -\newpage -3 - -) - -Include all persons in the target cohort exclude persons with prior -outcomes - -\includegraphics{popdef3.webp} - -\begin{enumerate} -\def\labelenumi{\arabic{enumi})} -\setcounter{enumi}{3} -\item - Require minimum time-at-risk for target cohort, except for persons - with outcomes during time-at-risk, exclude persons with prior outcomes - - \includegraphics{popdef4.webp} -\end{enumerate} - -\newpage -5 - -) - -Include all persons in target cohort exclude persons with prior outcomes - -\includegraphics{popdef5.webp} - -\begin{enumerate} -\def\labelenumi{\arabic{enumi})} -\setcounter{enumi}{5} -\item - Include all persons in target cohort - - \includegraphics{popdef6.webp} -\end{enumerate} - -\end{document} diff --git a/inst/doc/CreatingLearningCurves.pdf b/inst/doc/CreatingLearningCurves.pdf deleted file mode 100644 index cfa6cf8ef..000000000 Binary files a/inst/doc/CreatingLearningCurves.pdf and /dev/null differ diff --git 
a/inst/doc/CreatingNetworkstudies.pdf b/inst/doc/CreatingNetworkstudies.pdf deleted file mode 100644 index 57a187a82..000000000 Binary files a/inst/doc/CreatingNetworkstudies.pdf and /dev/null differ diff --git a/inst/doc/CreatingShinyApp.pdf b/inst/doc/CreatingShinyApp.pdf deleted file mode 100644 index 6ad7fa645..000000000 Binary files a/inst/doc/CreatingShinyApp.pdf and /dev/null differ diff --git a/inst/doc/InstallationGuide.pdf b/inst/doc/InstallationGuide.pdf deleted file mode 100644 index 605054361..000000000 Binary files a/inst/doc/InstallationGuide.pdf and /dev/null differ diff --git a/man/PatientLevelPrediction.Rd b/man/PatientLevelPrediction.Rd index 4946949d9..4a632609e 100644 --- a/man/PatientLevelPrediction.Rd +++ b/man/PatientLevelPrediction.Rd @@ -29,5 +29,10 @@ Authors: \item Peter Rijnbeek } +Other contributors: +\itemize{ + \item Observational Health Data Science and Informatics [copyright holder] +} + } \keyword{internal} diff --git a/vignettes/AddingCustomFeatureEngineering.Rmd b/vignettes/AddingCustomFeatureEngineering.Rmd index dc7ce30f4..c0d11df14 100644 --- a/vignettes/AddingCustomFeatureEngineering.Rmd +++ b/vignettes/AddingCustomFeatureEngineering.Rmd @@ -2,31 +2,13 @@ title: "Adding Custom Feature Engineering Functions" author: "Jenna Reps, Egill Fridgeirsson" date: "`r Sys.Date()`" -header-includes: - - \usepackage{fancyhdr} - - \pagestyle{fancy} - - \fancyhead{} - - \fancyhead[C]{Custom FeatureEngineering} - - \fancyfoot[C]{PatientLevelPrediction Package Version `r utils::packageVersion("PatientLevelPrediction")`} - - \fancyfoot[C]{\thepage} - - \renewcommand{\headrulewidth}{0.4pt} - - \renewcommand{\footrulewidth}{0.4pt} -output: - pdf_document: - number_sections: yes - toc: yes - html_document: - number_sections: yes - toc: yes +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Adding Custom Feature Engineering Functions} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} --- -```{=html} - -``` - 
```{r, echo = FALSE, message = FALSE, warning = FALSE} library(PatientLevelPrediction) ``` diff --git a/vignettes/AddingCustomModels.Rmd b/vignettes/AddingCustomModels.Rmd index a2d3066e0..dc52dc15d 100644 --- a/vignettes/AddingCustomModels.Rmd +++ b/vignettes/AddingCustomModels.Rmd @@ -2,28 +2,14 @@ title: "Adding Custom Patient-Level Prediction Algorithms" author: "Jenna Reps, Martijn J. Schuemie, Patrick B. Ryan, Peter R. Rijnbeek" date: "`r Sys.Date()`" -header-includes: - - \usepackage{fancyhdr} - - \pagestyle{fancy} - - \fancyhead{} - - \fancyhead[C]{Custom Patient-Level Prediction Algorithms} - - \fancyfoot[C]{PatientLevelPrediction Package Version `r utils::packageVersion("PatientLevelPrediction")`} - - \fancyfoot[C]{\thepage} - - \renewcommand{\headrulewidth}{0.4pt} - - \renewcommand{\footrulewidth}{0.4pt} -output: - pdf_document: - number_sections: yes - toc: yes - html_document: - number_sections: yes - toc: yes +output: rmarkdown::html_vignette --- ```{=html} ``` ```{r, echo = FALSE, message = FALSE, warning = FALSE} diff --git a/vignettes/AddingCustomSamples.Rmd b/vignettes/AddingCustomSamples.Rmd index b3ff4450d..fe6c1c3a2 100644 --- a/vignettes/AddingCustomSamples.Rmd +++ b/vignettes/AddingCustomSamples.Rmd @@ -2,28 +2,14 @@ title: "Adding Custom Sampling Functions" author: "Jenna Reps" date: "`r Sys.Date()`" -header-includes: - - \usepackage{fancyhdr} - - \pagestyle{fancy} - - \fancyhead{} - - \fancyhead[C]{Custom Sampling} - - \fancyfoot[C]{\thepage} - - \renewcommand{\headrulewidth}{0.4pt} - - \renewcommand{\footrulewidth}{0.4pt} - - \fancyfoot[C]{PatientLevelPrediction Package Version `r utils::packageVersion("PatientLevelPrediction")`} -output: - pdf_document: - number_sections: yes - toc: yes - html_document: - number_sections: yes - toc: yes +output: rmarkdown::html_vignette --- ```{=html} ``` ```{r, echo = FALSE, message = FALSE, warning = FALSE} diff --git a/vignettes/AddingCustomSplitting.Rmd b/vignettes/AddingCustomSplitting.Rmd index 
b9c3c6c99..fa687d2da 100644 --- a/vignettes/AddingCustomSplitting.Rmd +++ b/vignettes/AddingCustomSplitting.Rmd @@ -1,29 +1,15 @@ --- -title: "Adding Custom Data Splitting Functions" +title: "Adding Custom Data Splitting" author: "Jenna Reps" date: "`r Sys.Date()`" -header-includes: - - \usepackage{fancyhdr} - - \pagestyle{fancy} - - \fancyhead{} - - \fancyhead[C]{Custom Splitting} - - \fancyfoot[C]{PatientLevelPrediction Package Version `r utils::packageVersion("PatientLevelPrediction")`} - - \fancyfoot[C]{\thepage} - - \renewcommand{\headrulewidth}{0.4pt} - - \renewcommand{\footrulewidth}{0.4pt} -output: - pdf_document: - number_sections: yes - toc: yes - html_document: - number_sections: yes - toc: yes +output: rmarkdown::html_vignette --- ```{=html} ``` ```{r, echo = FALSE, message = FALSE, warning = FALSE} diff --git a/vignettes/BenchmarkTasks.Rmd b/vignettes/BenchmarkTasks.Rmd index 3c5333833..725b2a62a 100644 --- a/vignettes/BenchmarkTasks.Rmd +++ b/vignettes/BenchmarkTasks.Rmd @@ -2,28 +2,14 @@ title: "Benchmark Tasks" author: "Jenna Reps, Ross Williams, Peter R. Rijnbeek" date: '`r Sys.Date()`' -header-includes: - - \usepackage{fancyhdr} - - \pagestyle{fancy} - - \fancyhead{} - - \fancyhead[C]{Benchmark Tasks} - - \fancyfoot[C]{PatientLevelPrediction Package Version `r utils::packageVersion("PatientLevelPrediction")`} - - \fancyfoot[C]{\thepage} - - \renewcommand{\headrulewidth}{0.4pt} - - \renewcommand{\footrulewidth}{0.4pt} -output: - pdf_document: - number_sections: yes - toc: yes - html_document: - number_sections: yes - toc: yes +output: rmarkdown::html_vignette --- ```{=html} ``` ## Benchmark Tasks For Large-Scale Empirical Analyses diff --git a/vignettes/BestPractices.Rmd b/vignettes/BestPractices.Rmd index 1094c0a4a..bcb2c152b 100644 --- a/vignettes/BestPractices.Rmd +++ b/vignettes/BestPractices.Rmd @@ -2,30 +2,14 @@ title: "Best Practice Research" author: "Jenna Reps, Peter R. 
Rijnbeek" date: '`r Sys.Date()`' -header-includes: - - \usepackage{fancyhdr} - - \pagestyle{fancy} - - \fancyhead{} - - \fancyhead[C]{Best Practices} - - \fancyfoot[C]{PatientLevelPrediction Package Version `r utils::packageVersion("PatientLevelPrediction")`} - - \fancyfoot[C]{\thepage} - - \renewcommand{\headrulewidth}{0.4pt} - - \renewcommand{\footrulewidth}{0.4pt} -output: - pdf_document: - includes: - in_header: preamble.tex - number_sections: yes - toc: yes - html_document: - number_sections: yes - toc: yes +output: rmarkdown::html_vignette --- ```{=html} ``` ## Best practice publications using the OHDSI PatientLevelPrediction framework diff --git a/vignettes/BuildingMultiplePredictiveModels.Rmd b/vignettes/BuildingMultiplePredictiveModels.Rmd index f3dd638b6..604fa428b 100644 --- a/vignettes/BuildingMultiplePredictiveModels.Rmd +++ b/vignettes/BuildingMultiplePredictiveModels.Rmd @@ -2,30 +2,14 @@ title: "Automatically Build Multiple Patient-Level Predictive Models" author: "Jenna Reps, Martijn J. Schuemie, Patrick B. Ryan, Peter R. 
Rijnbeek" date: '`r Sys.Date()`' -header-includes: - - \usepackage{fancyhdr} - - \pagestyle{fancy} - - \fancyhead{} - - \fancyhead[C]{Automatically Build Multiple Patient-Level Predictive Models} - - \fancyfoot[C]{PatientLevelPrediction Package Version `r utils::packageVersion("PatientLevelPrediction")`} - - \fancyfoot[C]{\thepage} - - \renewcommand{\headrulewidth}{0.4pt} - - \renewcommand{\footrulewidth}{0.4pt} -output: - pdf_document: - includes: - in_header: preamble.tex - number_sections: yes - toc: yes - html_document: - number_sections: yes - toc: yes +output: rmarkdown::html_vignette --- ```{=html} ``` # Introduction diff --git a/vignettes/BuildingPredictiveModels.Rmd b/vignettes/BuildingPredictiveModels.Rmd index 8c4c9c484..ea293d39a 100644 --- a/vignettes/BuildingPredictiveModels.Rmd +++ b/vignettes/BuildingPredictiveModels.Rmd @@ -2,25 +2,18 @@ title: "Building patient-level predictive models" author: "Jenna Reps, Martijn J. Schuemie, Patrick B. Ryan, Peter R. Rijnbeek" date: '`r Sys.Date()`' -header-includes: - - \usepackage{fancyhdr} - - \pagestyle{fancy} - - \fancyhead{} - - \fancyhead[C]{Building Predictive Models} - - \fancyfoot[C]{PatientLevelPrediction Package Version `r utils::packageVersion("PatientLevelPrediction")`} - - \fancyfoot[C]{\thepage} - - \renewcommand{\headrulewidth}{0.4pt} - - \renewcommand{\footrulewidth}{0.4pt} -output: - html_document: - number_sections: yes - toc: yes +output: rmarkdown::html_vignette +vignette: > + %\VignetteEngine{knitr::rmarkdown} + %\VignetteIndexEntry{Building patient-level predictive models} + %\VignetteEncoding{UTF-8} --- - +```{css, echo=FALSE} +img { + max-width: 100%; +} +``` ```{r echo=FALSE,message=FALSE,warning=FALSE,eval=TRUE} library(PatientLevelPrediction) @@ -46,13 +39,13 @@ In our [`paper`](https://academic.oup.com/jamia/article/25/8/969/4989437), we pr Figure 1, illustrates the prediction problem we address. 
Among a population at risk, we aim to predict which patients at a defined moment in time (t = 0) will experience some outcome during a time-at-risk. Prediction is done using only information about the patients in an observation window prior to that moment in time. -![The prediction problem](Figure1.webp) +![The prediction problem](./images/Figure1.avif) As shown in Figure 2, to define a prediction problem we have to define t=0 by a Target Cohort (T), the outcome we like to predict by an outcome cohort (O), and the time-at-risk (TAR). Furthermore, we have to make design choices for the model we like to develop, and determine the observational datasets to perform internal and external validation. This conceptual framework works for all type of prediction problems, for example those presented in Figure 3. -![Design choices](studydesign.webp) +![Design choices](./images/studydesign.avif) -![Examples of prediction problems](problems.webp) +![Examples of prediction problems](./images/problems.avif) This vignette describes how you can use the `PatientLevelPrediction` package to build patient-level predictive models. The package enables data extraction, model building, and model evaluation using data from databases that are translated into the OMOP CDM. In this vignette we assume you have installed the package correctly using the `vignette('InstallationGuide')`. @@ -180,11 +173,11 @@ Both methods are described below for our example prediction problem. ### ATLAS cohort builder -![Figure 4: Target Cohort Atrial Fibrillation](example1/ATLAS_T.webp) +![Figure 4: Target Cohort Atrial Fibrillation](example1/ATLAS_T.avif) ATLAS allows you to define cohorts interactively by specifying cohort entry and cohort exit criteria. Cohort entry criteria involve selecting one or more initial events, which determine the start date for cohort entry, and optionally specifying additional inclusion criteria which filter to the qualifying events. 
Cohort exit criteria are applied to each cohort entry record to determine the end date when the person's episode no longer qualifies for the cohort. For the outcome cohort the end date is less relevant. As an example, Figure 4 shows how we created the Atrial Fibrillation cohort and Figure 5 shows how we created the stroke cohort in ATLAS. -![Figure 5: Outcome Cohort Stroke](example1/ATLAS_O.webp) +![Figure 5: Outcome Cohort Stroke](example1/ATLAS_O.avif) The T and O cohorts can be found here: @@ -603,11 +596,11 @@ Both methods are described below for our example prediction problem. ### ATLAS cohort builder -![Target Cohort ACE inhibitors](example2/aceinhibitors.webp) +![Target Cohort ACE inhibitors](example2/aceinhibitors.avif) ATLAS allows you to define cohorts interactively by specifying cohort entry and cohort exit criteria. Cohort entry criteria involve selecting one or more initial events, which determine the start date for cohort entry, and optionally specifying additional inclusion criteria which filter to the qualifying events. Cohort exit criteria are applied to each cohort entry record to determine the end date when the person's episode no longer qualifies for the cohort. For the outcome cohort the end date is less relevant. As an example, Figure 6 shows how we created the ACE inhibitors cohort and Figure 7 shows how we created the angioedema cohort in ATLAS. -![Outcome Cohort Angioedema](example2/angioedema.webp) +![Outcome Cohort Angioedema](example2/angioedema.avif) The T and O cohorts can be found here: @@ -837,7 +830,7 @@ populationSettings <- createStudyPopulationSettings( When developing a prediction model using supervised learning (when you have features paired with labels for a set of patients), the first step is to design the development/internal validation process. This requires specifying how to select the model hyper-parameters, how to learn the model parameters and how to fairly evaluate the model. 
In general, the validation set is used to pick hyper-parameters, the training set is used to learn the model parameters and the test set is used to perform fair internal validation. However, cross-validation can be implemented to pick the hyper-parameters on the training data (so a validation data set is not required). Cross validation can also be used to estimate internal validation (so a testing data set is not required). -In small data the best approach for internal validation has been shown to be boostrapping. However, in big data (many patients and many features) bootstrapping is generally not feasible. In big data our research has shown that it is just important to have some form of fair evaluation (use a test set or cross validation). For full details see [our BMJ open paper](add%20link). +In small data the best approach for internal validation has been shown to be bootstrapping. However, in big data (many patients and many features) bootstrapping is generally not feasible. In big data our research has shown that it is just important to have some form of fair evaluation (use a test set or cross validation). For full details see [our BMJ open paper](https://bmjopen.bmj.com/content/11/12/e050146.long). In the PatientLevelPrediction package, the splitSettings define how the plpData are partitioned into training/validation/testing data. Cross validation is always done, but using a test set is optional (when the data are small, it may be optimal to not use a test set). For the splitSettings we can use the type (stratified/time/subject) and testFraction parameters to split the data in a 75%-25% split and run the patient-level prediction pipeline: @@ -960,7 +953,7 @@ The script we created manually above can also be automatically created using a p
Create a new prediction study and select your target and outcome cohorts.
-
![](atlasplp1.webp)
+
![](images/atlasplp1.avif)
@@ -968,7 +961,7 @@ The script we created manually above can also be automatically created using a p
Specify one or more analysis settings.
-
![](atlasplp2.web)
+
![](images/atlasplp2.avif)
@@ -978,7 +971,7 @@ The script we created manually above can also be automatically created using a p
Specify the trainings settigns
-
![](atlasplp3.webp)
+
![](images/atlasplp3.avif)
@@ -986,7 +979,7 @@ The script we created manually above can also be automatically created using a p
Specify the execution settings
-
![](atlasplp4.web)
+
![](images/atlasplp4.avif)
@@ -1000,7 +993,7 @@ ATLAS can build a R package for you that will execute the full study against you
- ![R package download functionality in ATLAS](atlasdownload1.webp) + ![R package download functionality in ATLAS](images/atlasdownload1.avif)
@@ -1012,7 +1005,7 @@ ATLAS can build a R package for you that will execute the full study against you
- ![R package download functionality in ATLAS](atlasdownload2.webp) + ![R package download functionality in ATLAS](images/atlasdownload2.avif)
@@ -1030,7 +1023,7 @@ You can interactively view the results by running: `viewPlp(runPlp=lrResults)`.
-![Summary of all the performance measures of the analyses](shinysummary.webp) +![Summary of all the performance measures of the analyses](images/shinysummary.avif)
@@ -1038,7 +1031,7 @@ Furthermore, many interactive plots are available in the Shiny App, for example
-![Example of the interactive ROC curve](shinyroc.webp) +![Example of the interactive ROC curve](images/shinyroc.avif)
@@ -1058,7 +1051,7 @@ The Receiver Operating Characteristics (ROC) plot shows the sensitivity against
-![Receiver Operating Characteristic Plot](sparseRoc.webp) +![Receiver Operating Characteristic Plot](images/sparseRoc.avif)
@@ -1070,7 +1063,7 @@ The calibration plot shows how close the predicted risk is to the observed risk.
-![Calibration Plot](sparseCalibration.webp) +![Calibration Plot](images/sparseCalibration.avif)
@@ -1092,7 +1085,7 @@ The example below is from another study that better demonstrates the impact of u
-![Smooth Calibration plot](smoothCalibration.jpeg) +![Smooth Calibration plot](images/smoothCalibration.avif)
@@ -1104,7 +1097,7 @@ The preference distribution plots are the preference score distributions corresp
-![Preference Plot](preferencePDF.webp) +![Preference Plot](images/preferencePDF.avif)
@@ -1118,7 +1111,7 @@ The box plots in the Figure show that the predicted probability of the outcome i
-![Prediction Distribution Box Plot](predictionDistribution.wwebp) +![Prediction Distribution Box Plot](images/predictionDistribution.avif)
@@ -1132,7 +1125,7 @@ The results for our example of look very promising since the mean values of the
-![Similarity plots of train and test set](generalizability.webp) +![Similarity plots of train and test set](images/generalizability.avif)
@@ -1146,7 +1139,7 @@ The plot shows that the mean of most of the covariates is higher for subjects wi
-![Variabel scatter Plot](variableScatterplot.webp) +![Variable scatter Plot](images/variableScatterplot.avif)
@@ -1178,7 +1171,7 @@ For Recall the denominator does not depend on the classifier threshold (Tp+Fn is
-![Precision Recall Plot](precisionRecall.webp) +![Precision Recall Plot](images/precisionRecall.avif)
@@ -1192,7 +1185,7 @@ The results show that our model is well calibrated across gender and age groups.
-![Demographic Summary Plot](demographicSummary.webp) +![Demographic Summary Plot](images/demographicSummary.avif)
@@ -1282,7 +1275,7 @@ In the figures below the effect is shown of the removeSubjectsWithPriorOutcome,
Require minimum time-at-risk for all person in the target cohort
-
![](popdef1.webp)
+
![](images/popdef1.avif)
@@ -1290,7 +1283,7 @@ In the figures below the effect is shown of the removeSubjectsWithPriorOutcome,
Require minumum time-at-risk for target cohort, except for persons with outcomes during time-at-risk.
-
![](popdef2.webp)
+
![](images/popdef2.avif)
@@ -1303,7 +1296,7 @@ In the figures below the effect is shown of the removeSubjectsWithPriorOutcome,
Include all persons in the target cohort exclude persons with prior outcomes
-
![](popdef3.webp)
+
![](images/popdef3.avif)
@@ -1311,7 +1304,7 @@ In the figures below the effect is shown of the removeSubjectsWithPriorOutcome,
Require minimum time-at-risk for target cohort, except for persons with outcomes during time-at-risk, exclude persons with prior outcomes
-
![](popdef4.webp)
+
![](images/popdef4.avif)
@@ -1324,7 +1317,7 @@ In the figures below the effect is shown of the removeSubjectsWithPriorOutcome,
Include all persons in target cohort exclude persons with prior outcomes
-
![](popdef5.webp)
+
![](images/popdef5.avif)
@@ -1332,6 +1325,6 @@ In the figures below the effect is shown of the removeSubjectsWithPriorOutcome,
Include all persons in target cohort
-
![](popdef6.webp)
+
![](images/popdef6.avif)
diff --git a/vignettes/ClinicalModels.Rmd b/vignettes/ClinicalModels.Rmd index e06959431..65e747dd3 100644 --- a/vignettes/ClinicalModels.Rmd +++ b/vignettes/ClinicalModels.Rmd @@ -2,33 +2,13 @@ title: "Clinical Models" author: "Jenna Reps, Peter R. Rijnbeek" date: '`r Sys.Date()`' -header-includes: - - \usepackage{fancyhdr} - - \pagestyle{fancy} - - \fancyhead{} - - \fancyhead[C]{Clinical Models} - - \fancyfoot[C]{PatientLevelPrediction Package Version `r utils::packageVersion("PatientLevelPrediction")`} - - \fancyfoot[C]{\thepage} - - \renewcommand{\headrulewidth}{0.4pt} - - \renewcommand{\footrulewidth}{0.4pt} -output: - pdf_document: - includes: - in_header: preamble.tex - number_sections: yes - toc: yes - html_document: - number_sections: yes - toc: yes +output: rmarkdown::html_vignette +vignette: > + %\VignetteEngine{knitr::rmarkdown} + %\VignetteIndexEntry{Clinical Models} + %\VignetteEncoding{UTF-8} --- -```{=html} - -``` - ## Clinical models developed using the OHDSI PatientLevelPrediction framework | Title | Link | diff --git a/vignettes/ConstrainedPredictors.Rmd b/vignettes/ConstrainedPredictors.Rmd index 96052b864..7bd1e93fb 100644 --- a/vignettes/ConstrainedPredictors.Rmd +++ b/vignettes/ConstrainedPredictors.Rmd @@ -1,33 +1,14 @@ --- -title: "Constrained predictors" +title: "Constrained Predictors" author: "Jenna Reps" date: '`r Sys.Date()`' -header-includes: - - \usepackage{fancyhdr} - - \pagestyle{fancy} - - \fancyhead{} - - \fancyhead[C]{Constrained Predictors} - - \fancyfoot[C]{PatientLevelPrediction Package Version `r utils::packageVersion("PatientLevelPrediction")`} - - \fancyfoot[C]{\thepage} - - \renewcommand{\headrulewidth}{0.4pt} - - \renewcommand{\footrulewidth}{0.4pt} -output: - pdf_document: - includes: - in_header: preamble.tex - number_sections: yes - toc: yes - html_document: - number_sections: yes - toc: yes +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Constrained Predictors} + %\VignetteEncoding{UTF-8} + 
%\VignetteEngine{knitr::rmarkdown} --- -```{=html} - -``` ## Constrained Predictors ### How to use the PhenotypeLibrary R package @@ -120,4 +101,3 @@ phenotypeDefinitions <- PhenotypeLibrary::getPlCohortDefinitionSet(1152:1215) | Aspirin | Vascular | 1158 | | Deep vein thrombosis (DVT) | Vascular | 1152 | | Edema | Vascular | 1196 | -| Inpatient visit | NA | NA | diff --git a/vignettes/CreatingLearningCurves.Rmd b/vignettes/CreatingLearningCurves.Rmd index 4fb76b7b0..36ffc81b7 100644 --- a/vignettes/CreatingLearningCurves.Rmd +++ b/vignettes/CreatingLearningCurves.Rmd @@ -2,34 +2,14 @@ title: "Creating Learning Curves" author: "Luis H. John, Jenna M. Reps, Peter R. Rijnbeek" date: '`r Sys.Date()`' -header-includes: - - \usepackage{fancyhdr} - - \pagestyle{fancy} - - \fancyhead{} - - \fancyhead[C]{Generating Learning Curves} - - \fancyfoot[C]{\thepage} - - \renewcommand{\headrulewidth}{0.4pt} - - \renewcommand{\footrulewidth}{0.4pt} - - \fancyfoot[C]{PatientLevelPrediction Package Version `r utils::packageVersion("PatientLevelPrediction")`} -output: - pdf_document: - includes: - in_header: preamble.tex - number_sections: yes - toc: yes - html_document: - number_sections: yes - toc: yes - html_notebook: - toc: yes +output: + rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Creating Learning Curves} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} --- -```{=html} - -``` ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) ``` @@ -52,17 +32,17 @@ Prediction models will show overly-optimistic performance when predicting on the Learning curves assess the effect of training set size on model performance by training a sequence of prediction models on successively larger subsets of the training set. A learning curve plot can also help in diagnosing a bias or variance problem as explained below. 
-![Learning curve example.](learningCurve.png) +![Learning curve example.](images/learningCurve.avif) Figure 1, shows an example of learning curve plot in which the vertical axis represents the model performance and the horizontal axis the training set size. If training set size is small, the performance on the training set is high, because a model can often be fitted well to a limited number of training examples. At the same time, the performance on the testing set will be poor, because the model trained on such a limited number of training examples will not generalize well to unseen data in the testing set. As the training set size increases, the performance of the model on the training set will decrease. It becomes more difficult for the model to find a good fit through all the training examples. Also, the model will be trained on a more representative portion of training examples, making it generalize better to unseen data. This can be observed by the increasin testing set performance. The learning curve can help us in diagnosing bias and variance problems with our classifier which will provide guidance on how to further improve our model. We can observe high variance (overfitting) in a prediction model if it performs well on the training set, but poorly on the testing set (Figure 2). Adding additional data is a common approach to counteract high variance. From the learning curve it becomes apparent, that adding additional data may improve performance on the testing set a little further, as the learning curve has not yet plateaued and, thus, the model is not saturated yet. Therefore, adding more data will decrease the gap between training set and testing set, which is the main indicator for a high variance problem. 
-![Prediction model suffering from high variance.](learningCurveVariance.png) +![Prediction model suffering from high variance.](images/learningCurveVariance.avif) Furthermore, we can observe high bias (underfitting) if a prediction model performs poorly on the training set as well as on the testing set (Figure 3). The learning curves of training set and testing set have flattened on a low performance with only a small gap in between them. Adding additional data will in this case have little to no impact on the model performance. Choosing another prediction algorithm that can find more complex (for example non-linear) relationships in the data may be an alternative approach to consider in this high bias situation. -![Prediction model suffering from high bias.](learningCurveBias.png) +![Prediction model suffering from high bias.](images/learningCurveBias.avif) # Creating the learning curve @@ -112,9 +92,6 @@ splitSettings <- createDefaultSplitSetting( ) trainFractions <- seq(0.1, 0.8, 0.1) # Create eight training set fractions - -# alternatively use a sequence of training events by uncommenting the line below. -# trainEvents <- seq(100, 5000, 100) ``` Create the learning curve object. @@ -159,7 +136,7 @@ plotLearningCurve( ) ``` -![Learning curve plot.](learningCurvePlot.png) +![Learning curve plot.](images/learningCurvePlot.avif) # Parallel processing diff --git a/vignettes/CreatingNetworkStudies.Rmd b/vignettes/CreatingNetworkStudies.Rmd index 21cd35968..8728252a1 100644 --- a/vignettes/CreatingNetworkStudies.Rmd +++ b/vignettes/CreatingNetworkStudies.Rmd @@ -1,33 +1,14 @@ --- -title: "Making patient-level predictive network study packages" +title: "Making patient-level predictive network study packages" author: "Jenna Reps, Martijn J. Schuemie, Patrick B. Ryan, Peter R. 
Rijnbeek" date: '`r Sys.Date()`' -header-includes: - - \usepackage{fancyhdr} - - \pagestyle{fancy} - - \fancyhead{} - - \fancyhead[C]{Network Studies} - - \fancyfoot[C]{PatientLevelPrediction Package Version `r utils::packageVersion("PatientLevelPrediction")`} - - \fancyfoot[C]{\thepage} - - \renewcommand{\headrulewidth}{0.4pt} - - \renewcommand{\footrulewidth}{0.4pt} -output: - pdf_document: - includes: - in_header: preamble.tex - number_sections: yes - toc: yes - html_document: - number_sections: yes - toc: yes +output: rmarkdown::html_vignette +vignette: > + %\VignetteEngine{knitr::rmarkdown} + %\VignetteIndexEntry{Making patient-level predictive network study packages} + %\VignetteEncoding{UTF-8} --- -```{=html} - -``` ```{r echo=FALSE,message=FALSE,warning=FALSE,eval=TRUE} library(PatientLevelPrediction) vignetteDataFolder <- "s:/temp/plpVignette" diff --git a/vignettes/Diagram.png b/vignettes/Diagram.png deleted file mode 100644 index 5c8f0c148..000000000 Binary files a/vignettes/Diagram.png and /dev/null differ diff --git a/vignettes/Figure1.webp b/vignettes/Figure1.webp deleted file mode 100644 index 42ad71d7f..000000000 Binary files a/vignettes/Figure1.webp and /dev/null differ diff --git a/vignettes/GISExample.Rmd b/vignettes/GISExample.Rmd new file mode 100644 index 000000000..448006690 --- /dev/null +++ b/vignettes/GISExample.Rmd @@ -0,0 +1,270 @@ +--- +title: "Integration of GIS Data Into OHDSI Model Building" +author: "Jared Houghtaling" +date: "`r Sys.Date()`" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Integration of GIS Data Into OHDSI Model Building} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +# Integration of GIS Data into OHDSI Model Building + +## Motivation + +Although the proposed GIS extension tables have been approved by the broader OHDSI community, they are not yet integrated natively into the OHDSI tooling.
These tables and the data they capture, however, can still be referenced in standard model building and analytics workflows. The purpose of this analytics demonstration is to show how data in the EXPOSURE_OCCURRENCE table can be utilized in the training and evaluation of an OMOP-specific patient-level-prediction (PLP) model. The analytic process we executed and describe below relies on a GIS-version of the Tufts Synthetic Dataset that has both EHR and geospatial data integrated; see the description of that dataset for more details about contents or access. Much of the work is based on the detailed vignette that describes custom feature engineering in PLP +`vignette('AddingCustomFeatureEngineering')` + +## Step-by-Step Process + +### Step 1: Create Target & Outcome Cohorts + +We defined our `target` cohort within the sampling procedures when creating a subset of the Tufts Synthetic Data, and we described this process at length elsewhere. For the purposes of using the PLP package, we formalized this group of individuals in the cohort table using a simple cohort definition including all individuals with "any visit", and include that atlas-compatible json definition here for reference. + +We also created our `outcome` cohort - in this case, those patients with COPD or a conceptual descendant thereof - in Atlas and have shared the json definition (we also included an equivalent SQL script). + +### Step 2: Create Generic PLP Lasso Logistic Regression Model in R + +It is possible to create an R package that serves as a basis for a PLP model using Atlas, but given Atlas does not yet support the GIS extension, we have created this demo model directly using the PLP package. + +After configuring the environment (very important!) 
appropriately, we imported necessary packages and defined the relevant parameters about the data source: + +```{r, echo=TRUE, message=FALSE, warning=FALSE, eval=FALSE} +library(PatientLevelPrediction) +library(dplyr) +outputFolder <- "/ohdsi-gis/copdResultsPM25_NEW" +saveDirectory <- outputFolder +ExecutionDateTime <- Sys.time() +logSettings = createLogSettings(verbosity = "DEBUG", timeStamp = T, logName = + "runPlp Log") +analysisName = 'Generic PLP' + +# Details for connecting to the server: +connectionDetails <- DatabaseConnector::createConnectionDetails( + dbms = 'spark', + server = '/default', + connectionString = '' + ) +# Add the database containing the OMOP CDM data +cdmDatabaseSchema <- 'gis_syn_dataset_5_4' +# Add a sharebale name for the database containing the OMOP CDM data +cdmDatabaseName <- 'TSD-GIS' +# Add a database with read/write access as this is where the cohorts will be generated +cohortDatabaseSchema <- 'gis_syn_dataset_5_4' +tempEmulationSchema <- NULL +# table name where the cohorts will be generated +cohortTable <- 'cohort' +``` + +After defining parameters, we created a database settings object and launched the `runMultiplePLP` function to create a base `plpData` object: + +```{r, echo=TRUE, message=FALSE, warning=FALSE, eval=FALSE} +databaseDetails <- PatientLevelPrediction::createDatabaseDetails( + connectionDetails = connectionDetails, + cdmDatabaseSchema = cdmDatabaseSchema, + cdmDatabaseName = cdmDatabaseName, + tempEmulationSchema = tempEmulationSchema, + cohortDatabaseSchema = cohortDatabaseSchema, + cohortTable = cohortTable, + outcomeDatabaseSchema = cohortDatabaseSchema, + outcomeTable = cohortTable, + cdmVersion = 5 +) + + +# Run very simple LR model against two cohorts created in Atlas. 
Use model +# as basis for augmented model with pollutants below +runMultiplePlp( + databaseDetails = databaseDetails, + modelDesignList = list(createModelDesign(targetId = 9, outcomeId = 8, modelSettings = + setLassoLogisticRegression())), + onlyFetchData = F, + cohortDefinitions = NULL, + logSettings = createLogSettings(verbosity = "DEBUG", timeStamp = T, logName = + "runPlp Log"), + saveDirectory = outputFolder, + sqliteLocation = file.path(saveDirectory, "sqlite") + ) +``` + +### Step 3: Split plpData object to train/test, augment labels with EXPOSURE_OCCURRENCE values + +The labels sub-object within `plpData` contains per-individual data elements like gender and age; we added an additional data element to this object derived from the `EXPOSURE_OCCURRENCE` table: + +```{r, echo=TRUE, message=FALSE, warning=FALSE, eval=FALSE} +cohortDefinitions <- NULL +modelDesign <- createModelDesign(targetId = 9, outcomeId = 8, modelSettings = setLassoLogisticRegression()) +populationSettings <- modelDesign$populationSettings +splitSettings <- modelDesign$splitSettings + +plpData <- loadPlpData("/ohdsi-gis/copdResultsPM25_B/targetId_9_L1") + +mySplit <- splitData (plpData = plpData, + population = createStudyPopulation(plpData, 8, populationSettings), + splitSettings = splitSettings) + + +labelTrain <- mySplit$Train$labels +conn <- DatabaseConnector::connect(connectionDetails) +pollutants <- DatabaseConnector::querySql(conn, "SELECT person_id as subjectID, CAST(MEAN(value_as_number) AS DOUBLE) AS pmValue FROM gis_syn_dataset_5_4.exposure_occurrence WHERE value_as_number IS NOT NULL GROUP BY person_id;") +labelTrainPol <- merge(x=labelTrain, y=pollutants, by.x = "subjectId", by.y = "SUBJECTID") + +mySplit$Train$labels <- labelTrainPol + +labelTest <- mySplit$Test$labels +labelTestPol <- merge(x=labelTest, y=pollutants, by.x = "subjectId", by.y = "SUBJECTID") + +mySplit$Test$labels <- labelTestPol + +trainData <- mySplit$Train + +testData <- mySplit$Test +``` + +### Step 4: 
Reference augmented label objects in custom feature engineering function + +We would like to convert our per-patient labels into the `covariateData` structure referenced by the PLP workflow. To do this, we were able to follow the feature engineering vignette and create two functions, `createPollutants` and `implementPollutants`: + +```{r, echo=TRUE, message=FALSE, warning=FALSE, eval=FALSE} +createPollutants <- function( + method = 'QNCV' + ){ + + # create list of inputs to implement function + featureEngineeringSettings <- list( + method = method + ) + + # specify the function that will implement the sampling + attr(featureEngineeringSettings, "fun") <- "implementPollutants" + + # make sure the object returned is of class "sampleSettings" + class(featureEngineeringSettings) <- "featureEngineeringSettings" + return(featureEngineeringSettings) + +} + + +implementPollutants <- function(trainData, featureEngineeringSettings, model=NULL) { + if (is.null(model)) { + method <- featureEngineeringSettings$method + gisData <- trainData$labels + y <- gisData$outcomeCount + X <- gisData$PMVALUE + model <- mgcv::gam( + y ~ s(X, bs='cr', k=5, m=2) + ) + newData <- data.frame( + rowId = gisData$rowId, + covariateId = 2052499839, + covariateValue = model$fitted.values + ) + } + else { + gisData <- trainData$labels + X <- gisData$PMVALUE + y <- gisData$outcomeCount + newData <- data.frame(y=y, X=X) + yHat <- predict(model, newData) + newData <- data.frame( + rowId = gisData$rowId, + covariateId = 2052499839, + covariateValue = yHat + ) + } + # update covRef + Andromeda::appendToTable(trainData$covariateData$covariateRef, + data.frame(covariateId=2052499839, + covariateName='Average PM2.5 Concentrations', + analysisId=1, + conceptId=2052499839)) + + # update covariates + Andromeda::appendToTable(trainData$covariateData$covariates, newData) + + featureEngineering <- list( + funct = 'implementPollutants', + settings = list( + featureEngineeringSettings = featureEngineeringSettings, + 
model = model + ) + ) + + attr(trainData$covariateData, 'metaData')$featureEngineering = listAppend( + attr(trainData$covariateData, 'metaData')$featureEngineering, + featureEngineering + ) + + trainData$model <- model + + return(trainData) +} +``` + +We can then execute these functions to create training and test data objects that contain our extended covariates: + +```{r, echo=TRUE, message=FALSE, warning=FALSE, eval=FALSE} +featureEngineeringSettingsPol <- createPollutants('QNCV') +trainDataPol <- implementPollutants(trainData, featureEngineeringSettingsPol) +testDataPol <- implementPollutants(testData, featureEngineeringSettingsPol, trainDataPol$model) +``` + +Note that if we plot the output model of the `GAM` fitting in the `implementPollutants` function, we end up with a plot that aligns well with our underlying relationship between Odds Ratio and PM2.5 concentration that we used to distribute our synthetic data by location: + +![Covariate Fit]() + +![OR Estimation]() + +### Step 5: Apply new train and test datasets to `runPlp` and evaluate output + +```{r, echo=TRUE, message=FALSE, warning=FALSE, eval=FALSE} +analysisId <- '1' +analysisPath = file.path(saveDirectory, analysisId) + +settings <- list( + trainData = trainDataPol, + modelSettings = setLassoLogisticRegression(), + analysisId = analysisId, + analysisPath = analysisPath +) + +ParallelLogger::logInfo(sprintf('Training %s model',settings$modelSettings$name)) +model <- tryCatch( + { + do.call(fitPlp, settings) + }, + error = function(e) { ParallelLogger::logError(e); return(NULL)} +) + + +prediction <- model$prediction +# remove prediction from model +model$prediction <- NULL + +#apply to test data if exists: +if('Test' %in% names(mySplit)){ +predictionTest <- tryCatch( + { + predictPlp( + plpModel = model, + plpData = testDataPol, + population = testDataPol$labels + ) + }, + error = function(e) { ParallelLogger::logError(e); return(NULL)} +) + +predictionTest$evaluationType <- 'Test' + 
+if(!is.null(predictionTest)){ + prediction <- rbind(predictionTest, prediction[, colnames(prediction)!='index']) +} + + +} + +``` + + + diff --git a/vignettes/InstallationGuide.Rmd b/vignettes/InstallationGuide.Rmd index 0e8ba0c13..9c8b8c447 100644 --- a/vignettes/InstallationGuide.Rmd +++ b/vignettes/InstallationGuide.Rmd @@ -2,32 +2,13 @@ title: "Patient-Level Prediction Installation Guide" author: "Jenna Reps, Peter R. Rijnbeek, Egill Fridgeirsson" date: '`r Sys.Date()`' -header-includes: - - \usepackage{fancyhdr} - - \pagestyle{fancy} - - \fancyhead{} - - \fancyhead[C]{Installation Guide} - - \fancyfoot[C]{PatientLevelPrediction Package Version `r utils::packageVersion("PatientLevelPrediction")`} - - \fancyfoot[C]{\thepage} - - \renewcommand{\headrulewidth}{0.4pt} - - \renewcommand{\footrulewidth}{0.4pt} -output: - pdf_document: - includes: - in_header: preamble.tex - number_sections: yes - toc: yes - html_document: - number_sections: yes - toc: yes +output: rmarkdown::html_vignette +vignette: > + %\VignetteEngine{knitr::rmarkdown} + %\VignetteIndexEntry{Patient-Level Prediction Installation Guide} + %\VignetteEncoding{UTF-8} --- -```{=html} - -``` # Introduction This vignette describes how you need to install the Observational Health Data Science and Informatics (OHDSI) [`PatientLevelPrediction`](http://github.com/OHDSI/PatientLevelPrediction) package under Windows, Mac, and Linux. diff --git a/vignettes/Videos.Rmd b/vignettes/Videos.Rmd index d4711a4e0..a39547958 100644 --- a/vignettes/Videos.Rmd +++ b/vignettes/Videos.Rmd @@ -2,32 +2,13 @@ title: "Demo Videos" author: "Jenna Reps, Peter R. 
Rijnbeek" date: '`r Sys.Date()`' -header-includes: - - \usepackage{fancyhdr} - - \pagestyle{fancy} - - \fancyhead{} - - \fancyhead[C]{Videos} - - \fancyfoot[C]{PatientLevelPrediction Package Version `r utils::packageVersion("PatientLevelPrediction")`} - - \fancyfoot[C]{\thepage} - - \renewcommand{\headrulewidth}{0.4pt} - - \renewcommand{\footrulewidth}{0.4pt} -output: - pdf_document: - includes: - in_header: preamble.tex - number_sections: yes - toc: yes - html_document: - number_sections: yes - toc: yes +output: rmarkdown::html_vignette +vignette: > + %\VignetteEngine{knitr::rmarkdown} + %\VignetteIndexEntry{Demo Videos} + %\VignetteEncoding{UTF-8} --- -```{=html} - -``` ## What is a cohort table? | Click To Launch | Description of Demo | diff --git a/vignettes/atlasdownload1.webp b/vignettes/atlasdownload1.webp deleted file mode 100644 index 6cac340ed..000000000 Binary files a/vignettes/atlasdownload1.webp and /dev/null differ diff --git a/vignettes/atlasdownload2.webp b/vignettes/atlasdownload2.webp deleted file mode 100644 index 452c5ca21..000000000 Binary files a/vignettes/atlasdownload2.webp and /dev/null differ diff --git a/vignettes/atlasplp1.webp b/vignettes/atlasplp1.webp deleted file mode 100644 index 71a3c1ce9..000000000 Binary files a/vignettes/atlasplp1.webp and /dev/null differ diff --git a/vignettes/atlasplp2.webp b/vignettes/atlasplp2.webp deleted file mode 100644 index 668202de4..000000000 Binary files a/vignettes/atlasplp2.webp and /dev/null differ diff --git a/vignettes/atlasplp3.webp b/vignettes/atlasplp3.webp deleted file mode 100644 index 523d0143c..000000000 Binary files a/vignettes/atlasplp3.webp and /dev/null differ diff --git a/vignettes/atlasplp4.webp b/vignettes/atlasplp4.webp deleted file mode 100644 index d90a43ea2..000000000 Binary files a/vignettes/atlasplp4.webp and /dev/null differ diff --git a/vignettes/demographicSummary.webp b/vignettes/demographicSummary.webp deleted file mode 100644 index 7d0437deb..000000000 Binary files 
a/vignettes/demographicSummary.webp and /dev/null differ diff --git a/vignettes/ensemble.png b/vignettes/ensemble.png deleted file mode 100644 index 6e2173a48..000000000 Binary files a/vignettes/ensemble.png and /dev/null differ diff --git a/vignettes/example1/ATLAS_O.avif b/vignettes/example1/ATLAS_O.avif new file mode 100644 index 000000000..9b773bdb9 Binary files /dev/null and b/vignettes/example1/ATLAS_O.avif differ diff --git a/vignettes/example1/ATLAS_T.avif b/vignettes/example1/ATLAS_T.avif new file mode 100644 index 000000000..20bdde0fc Binary files /dev/null and b/vignettes/example1/ATLAS_T.avif differ diff --git a/vignettes/example2/aceinhibitors.avif b/vignettes/example2/aceinhibitors.avif new file mode 100644 index 000000000..c76837cb4 Binary files /dev/null and b/vignettes/example2/aceinhibitors.avif differ diff --git a/vignettes/example2/angioedema.avif b/vignettes/example2/angioedema.avif new file mode 100644 index 000000000..40d42b8ae Binary files /dev/null and b/vignettes/example2/angioedema.avif differ diff --git a/vignettes/f1Measure.png b/vignettes/f1Measure.png deleted file mode 100644 index 437b6893f..000000000 Binary files a/vignettes/f1Measure.png and /dev/null differ diff --git a/vignettes/f1score.png b/vignettes/f1score.png deleted file mode 100644 index 5113b25c5..000000000 Binary files a/vignettes/f1score.png and /dev/null differ diff --git a/vignettes/generalizability.webp b/vignettes/generalizability.webp deleted file mode 100644 index ba6d14de4..000000000 Binary files a/vignettes/generalizability.webp and /dev/null differ diff --git a/vignettes/images/Figure1.avif b/vignettes/images/Figure1.avif new file mode 100644 index 000000000..f4f2c0d61 Binary files /dev/null and b/vignettes/images/Figure1.avif differ diff --git a/vignettes/images/atlasdownload1.avif b/vignettes/images/atlasdownload1.avif new file mode 100644 index 000000000..9f2ed9d8d Binary files /dev/null and b/vignettes/images/atlasdownload1.avif differ diff --git 
a/vignettes/images/atlasdownload2.avif b/vignettes/images/atlasdownload2.avif new file mode 100644 index 000000000..bc9906b78 Binary files /dev/null and b/vignettes/images/atlasdownload2.avif differ diff --git a/vignettes/images/atlasplp1.avif b/vignettes/images/atlasplp1.avif new file mode 100644 index 000000000..873e06411 Binary files /dev/null and b/vignettes/images/atlasplp1.avif differ diff --git a/vignettes/images/atlasplp2.avif b/vignettes/images/atlasplp2.avif new file mode 100644 index 000000000..95704629c Binary files /dev/null and b/vignettes/images/atlasplp2.avif differ diff --git a/vignettes/images/atlasplp3.avif b/vignettes/images/atlasplp3.avif new file mode 100644 index 000000000..2a2eeab1f Binary files /dev/null and b/vignettes/images/atlasplp3.avif differ diff --git a/vignettes/images/atlasplp4.avif b/vignettes/images/atlasplp4.avif new file mode 100644 index 000000000..e613e5072 Binary files /dev/null and b/vignettes/images/atlasplp4.avif differ diff --git a/vignettes/images/demographicSummary.avif b/vignettes/images/demographicSummary.avif new file mode 100644 index 000000000..63a77156f Binary files /dev/null and b/vignettes/images/demographicSummary.avif differ diff --git a/vignettes/images/generalizability.avif b/vignettes/images/generalizability.avif new file mode 100644 index 000000000..c8e5bfbf2 Binary files /dev/null and b/vignettes/images/generalizability.avif differ diff --git a/vignettes/images/learningCurve.avif b/vignettes/images/learningCurve.avif new file mode 100644 index 000000000..be40aa00f Binary files /dev/null and b/vignettes/images/learningCurve.avif differ diff --git a/vignettes/images/learningCurveBias.avif b/vignettes/images/learningCurveBias.avif new file mode 100644 index 000000000..948431b28 Binary files /dev/null and b/vignettes/images/learningCurveBias.avif differ diff --git a/vignettes/images/learningCurvePlot.avif b/vignettes/images/learningCurvePlot.avif new file mode 100644 index 000000000..3f8ddb9f5 Binary files 
/dev/null and b/vignettes/images/learningCurvePlot.avif differ diff --git a/vignettes/images/learningCurveVariance.avif b/vignettes/images/learningCurveVariance.avif new file mode 100644 index 000000000..00e48fdd3 Binary files /dev/null and b/vignettes/images/learningCurveVariance.avif differ diff --git a/vignettes/images/popdef1.avif b/vignettes/images/popdef1.avif new file mode 100644 index 000000000..7584bc72e Binary files /dev/null and b/vignettes/images/popdef1.avif differ diff --git a/vignettes/images/popdef2.avif b/vignettes/images/popdef2.avif new file mode 100644 index 000000000..be4403626 Binary files /dev/null and b/vignettes/images/popdef2.avif differ diff --git a/vignettes/images/popdef3.avif b/vignettes/images/popdef3.avif new file mode 100644 index 000000000..985babd29 Binary files /dev/null and b/vignettes/images/popdef3.avif differ diff --git a/vignettes/images/popdef4.avif b/vignettes/images/popdef4.avif new file mode 100644 index 000000000..62a590262 Binary files /dev/null and b/vignettes/images/popdef4.avif differ diff --git a/vignettes/images/popdef5.avif b/vignettes/images/popdef5.avif new file mode 100644 index 000000000..a13bcb8ac Binary files /dev/null and b/vignettes/images/popdef5.avif differ diff --git a/vignettes/images/popdef6.avif b/vignettes/images/popdef6.avif new file mode 100644 index 000000000..91495baa7 Binary files /dev/null and b/vignettes/images/popdef6.avif differ diff --git a/vignettes/images/precisionRecall.avif b/vignettes/images/precisionRecall.avif new file mode 100644 index 000000000..62472ee9c Binary files /dev/null and b/vignettes/images/precisionRecall.avif differ diff --git a/vignettes/images/predictionDistribution.avif b/vignettes/images/predictionDistribution.avif new file mode 100644 index 000000000..f4403112c Binary files /dev/null and b/vignettes/images/predictionDistribution.avif differ diff --git a/vignettes/images/preferencePDF.avif b/vignettes/images/preferencePDF.avif new file mode 100644 index 
000000000..107871a9a Binary files /dev/null and b/vignettes/images/preferencePDF.avif differ diff --git a/vignettes/images/problems.avif b/vignettes/images/problems.avif new file mode 100644 index 000000000..e0a2c8500 Binary files /dev/null and b/vignettes/images/problems.avif differ diff --git a/vignettes/images/shinyroc.avif b/vignettes/images/shinyroc.avif new file mode 100644 index 000000000..1be2230dd Binary files /dev/null and b/vignettes/images/shinyroc.avif differ diff --git a/vignettes/images/shinysummary.avif b/vignettes/images/shinysummary.avif new file mode 100644 index 000000000..905aabce0 Binary files /dev/null and b/vignettes/images/shinysummary.avif differ diff --git a/vignettes/images/smoothCalibration.avif b/vignettes/images/smoothCalibration.avif new file mode 100644 index 000000000..854cc2485 Binary files /dev/null and b/vignettes/images/smoothCalibration.avif differ diff --git a/vignettes/images/sparseCalibration.avif b/vignettes/images/sparseCalibration.avif new file mode 100644 index 000000000..b8f4a5221 Binary files /dev/null and b/vignettes/images/sparseCalibration.avif differ diff --git a/vignettes/images/sparseRoc.avif b/vignettes/images/sparseRoc.avif new file mode 100644 index 000000000..bb5d14ec3 Binary files /dev/null and b/vignettes/images/sparseRoc.avif differ diff --git a/vignettes/images/studydesign.avif b/vignettes/images/studydesign.avif new file mode 100644 index 000000000..e9c028fbc Binary files /dev/null and b/vignettes/images/studydesign.avif differ diff --git a/vignettes/images/variableScatterplot.avif b/vignettes/images/variableScatterplot.avif new file mode 100644 index 000000000..7a498180a Binary files /dev/null and b/vignettes/images/variableScatterplot.avif differ diff --git a/vignettes/learningCurve.png b/vignettes/learningCurve.png deleted file mode 100644 index 19cd06691..000000000 Binary files a/vignettes/learningCurve.png and /dev/null differ diff --git a/vignettes/learningCurveBias.png 
b/vignettes/learningCurveBias.png deleted file mode 100644 index 3bd9f580a..000000000 Binary files a/vignettes/learningCurveBias.png and /dev/null differ diff --git a/vignettes/learningCurvePlot.png b/vignettes/learningCurvePlot.png deleted file mode 100644 index a5e1f9e96..000000000 Binary files a/vignettes/learningCurvePlot.png and /dev/null differ diff --git a/vignettes/learningCurveVariance.png b/vignettes/learningCurveVariance.png deleted file mode 100644 index 3212e6106..000000000 Binary files a/vignettes/learningCurveVariance.png and /dev/null differ diff --git a/vignettes/plpDiagram.png b/vignettes/plpDiagram.png deleted file mode 100644 index 301f4bcb4..000000000 Binary files a/vignettes/plpDiagram.png and /dev/null differ diff --git a/vignettes/popdef1.webp b/vignettes/popdef1.webp deleted file mode 100644 index 83ef7afd6..000000000 Binary files a/vignettes/popdef1.webp and /dev/null differ diff --git a/vignettes/popdef2.webp b/vignettes/popdef2.webp deleted file mode 100644 index 31887dd1b..000000000 Binary files a/vignettes/popdef2.webp and /dev/null differ diff --git a/vignettes/popdef3.webp b/vignettes/popdef3.webp deleted file mode 100644 index 8b409ed49..000000000 Binary files a/vignettes/popdef3.webp and /dev/null differ diff --git a/vignettes/popdef4.webp b/vignettes/popdef4.webp deleted file mode 100644 index 2709497e7..000000000 Binary files a/vignettes/popdef4.webp and /dev/null differ diff --git a/vignettes/popdef5.webp b/vignettes/popdef5.webp deleted file mode 100644 index 748b8901b..000000000 Binary files a/vignettes/popdef5.webp and /dev/null differ diff --git a/vignettes/popdef6.webp b/vignettes/popdef6.webp deleted file mode 100644 index 583dc9fba..000000000 Binary files a/vignettes/popdef6.webp and /dev/null differ diff --git a/vignettes/preamble.tex b/vignettes/preamble.tex deleted file mode 100644 index 204026792..000000000 --- a/vignettes/preamble.tex +++ /dev/null @@ -1,8 +0,0 @@ -\usepackage{float} -\let\origfigure\figure 
-\let\endorigfigure\endfigure -\renewenvironment{figure}[1][2] { - \expandafter\origfigure\expandafter[H] -} { - \endorigfigure -} \ No newline at end of file diff --git a/vignettes/precisionRecall.webp b/vignettes/precisionRecall.webp deleted file mode 100644 index af6b0cfe5..000000000 Binary files a/vignettes/precisionRecall.webp and /dev/null differ diff --git a/vignettes/predictedPDF.png b/vignettes/predictedPDF.png deleted file mode 100644 index fbf4e85b5..000000000 Binary files a/vignettes/predictedPDF.png and /dev/null differ diff --git a/vignettes/predictionDistribution.webp b/vignettes/predictionDistribution.webp deleted file mode 100644 index c7756d788..000000000 Binary files a/vignettes/predictionDistribution.webp and /dev/null differ diff --git a/vignettes/preferencePDF.webp b/vignettes/preferencePDF.webp deleted file mode 100644 index 189a356be..000000000 Binary files a/vignettes/preferencePDF.webp and /dev/null differ diff --git a/vignettes/problems.webp b/vignettes/problems.webp deleted file mode 100644 index 5c1c27bb4..000000000 Binary files a/vignettes/problems.webp and /dev/null differ diff --git a/vignettes/shinyroc.webp b/vignettes/shinyroc.webp deleted file mode 100644 index a11724623..000000000 Binary files a/vignettes/shinyroc.webp and /dev/null differ diff --git a/vignettes/shinysummary.webp b/vignettes/shinysummary.webp deleted file mode 100644 index 0d256ade1..000000000 Binary files a/vignettes/shinysummary.webp and /dev/null differ diff --git a/vignettes/smoothCalibration.jpeg b/vignettes/smoothCalibration.jpeg deleted file mode 100644 index 72c3cdb7a..000000000 Binary files a/vignettes/smoothCalibration.jpeg and /dev/null differ diff --git a/vignettes/sparseCalibration.webp b/vignettes/sparseCalibration.webp deleted file mode 100644 index 043019e5b..000000000 Binary files a/vignettes/sparseCalibration.webp and /dev/null differ diff --git a/vignettes/sparseROC.webp b/vignettes/sparseROC.webp deleted file mode 100644 index 
2ea3ea56f..000000000 Binary files a/vignettes/sparseROC.webp and /dev/null differ diff --git a/vignettes/studydesign.webp b/vignettes/studydesign.webp deleted file mode 100644 index 28717c7d2..000000000 Binary files a/vignettes/studydesign.webp and /dev/null differ diff --git a/vignettes/variableScatterplot.webp b/vignettes/variableScatterplot.webp deleted file mode 100644 index de6f8999d..000000000 Binary files a/vignettes/variableScatterplot.webp and /dev/null differ