diff --git a/.github/ISSUE_TEMPLATE/cran-release-checklist.md b/.github/ISSUE_TEMPLATE/cran-release-checklist.md index ea9c7954..98e422b6 100644 --- a/.github/ISSUE_TEMPLATE/cran-release-checklist.md +++ b/.github/ISSUE_TEMPLATE/cran-release-checklist.md @@ -13,13 +13,15 @@ assignees: '' - [ ] update NEWS.md - [ ] code check: run `devtools::check(args=c("--no-examples", "--no-tests"), vignettes=FALSE)`, alternatively `rcmdcheck::rcmdcheck(args=c("--no-examples", "--no-tests"))` - [ ] test check: run `devtools::test()`: set `options("testthat.progress.max_fails"=Inf)` and `options("Ncpus"=10)` -- [ ] create archives for testing backward compatibility against future versions: run tests/old_experiments/archiveExperiments.R +- [ ] create archives for testing backward compatibility against future versions: run `tests/old_experiments/archiveExperiments.R` - [ ] test backward compatibility against archived experiments: manually run `tests/testthat/test-update_object.R` - [ ] pre-compile vignettes: run `vignettes/compile.R` - [ ] code and vignette check: `rcmdcheck::rcmdcheck(args=c("--no-examples", "--as-cran"))` - [ ] check reverse dependencies - [ ] create pull request - [ ] check continuous integration tests +- [ ] Build source tarball using `devtools::build()`. +- [ ] Run `devtools::check_built(path=pkg, env_vars = list("_R_CHECK_DEPENDS_ONLY_" = TRUE))`, where `pkg` is the path to the tarball. - [ ] update cran_comments.md - [ ] merge dev branch into master @@ -28,14 +30,14 @@ assignees: '' - [ ] Run a post-merge code check: `rcmdcheck::rcmdcheck(args=c("--no-examples", "--as-cran"))` - [ ] Run post-merge test suite, if required. - [ ] Pre-compile vignettes, if required. + - [ ] Rebuild source tarball using `devtools::build()`, if required. **CRAN** -- [ ] Build source tarball using `devtools::build()`. - [ ] Check CRAN-policies on https://cran.r-project.org/web/packages/policies.html - [ ] Upload source tarball to https://cran.r-project.org/submit.html - [ ] Check CRAN checks **Github release** - [ ] prepare a GitHub release - - [ ] create source tarball using devtools::build and add as binary.
- [ ] copy news from NEWS.md + - [ ] add source tarball as data diff --git a/.github/workflows/auto-test-no-suggests-pull.yml b/.github/workflows/auto-test-no-suggests-pull.yml index 541fb759..ac4d705a 100644 --- a/.github/workflows/auto-test-no-suggests-pull.yml +++ b/.github/workflows/auto-test-no-suggests-pull.yml @@ -20,7 +20,7 @@ jobs: fail-fast: false matrix: r-version: ["release"] - os: [ ubuntu-latest ] + os: [ windows-latest ] steps: - uses: actions/checkout@v4 @@ -38,8 +38,19 @@ jobs: any::testthat needs: check + - name: Identify and remove all non-critical packages + run: | + installed_packages <- data.table::data.table(utils::installed.packages()) + required_packages <- c("familiar", "rcmdcheck", "testthat") + required_packages <- union(required_packages, unlist(tools::package_dependencies(required_packages, recursive = TRUE), use.names=FALSE)) + required_packages <- union(required_packages, installed_packages[Priority == "base"]$Package) + non_critical_packages <- setdiff(installed_packages$Package, required_packages) + for (package in non_critical_packages) {utils::remove.packages(installed_packages[Package == package]$Package, installed_packages[Package == package]$LibPath)} + shell: Rscript {0} + - uses: r-lib/actions/check-r-package@v2 with: upload-snapshots: true - args: 'c("--no-manual", "--as-cran")' + args: 'c("--no-manual", "--as-cran", "--no-vignettes", "--no-build-vignettes")' + build_args: 'c("--no-build-vignettes", "--no-manual")' error-on: '"error"' diff --git a/.github/workflows/auto-test-package_pull.yml b/.github/workflows/auto-test-package_pull.yml index 791fe9fa..28f085f4 100644 --- a/.github/workflows/auto-test-package_pull.yml +++ b/.github/workflows/auto-test-package_pull.yml @@ -38,5 +38,6 @@ jobs: - uses: r-lib/actions/check-r-package@v2 with: upload-snapshots: true - args: 'c("--no-manual", "--as-cran")' + args: 'c("--no-manual", "--as-cran", "--no-vignettes", "--no-build-vignettes")' + build_args: 'c("--no-build-vignettes", "--no-manual")' error-on: '"error"' diff --git a/.gitignore b/.gitignore index 27ee1a0d..98fbb26c 100644 --- a/.gitignore +++ b/.gitignore @@ -5,10 +5,7 @@ doc Meta tests/testthat/Rplots.pdf -inst/doc /doc/ /Meta/ -/man/ -*.Rd tests/old_experiments/_experiment Rplots.pdf diff --git a/NEWS.md b/NEWS.md index 37a6d50d..369e87a2 100644 --- a/NEWS.md +++ b/NEWS.md @@ -4,6 +4,8 @@ - Adapted tests to work when packages are missing. +- Fixed an issue that prevented hyperparameter optimisation of `xgboost` models for survival tasks. + # Version 1.4.7 (Uncertain Unicorn) ## Bug fixes diff --git a/R/LearnerS4XGBoost.R b/R/LearnerS4XGBoost.R index 804f6eb3..137f34dd 100644 --- a/R/LearnerS4XGBoost.R +++ b/R/LearnerS4XGBoost.R @@ -365,9 +365,7 @@ setMethod("get_prediction_type", signature(object="familiarXGBoost"), # The prediction type is a bit more complicated for xgboost methods. 
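# Illustrative aside, not part of the patch: the removed check below wraps the whole
# membership test in as.character(), i.e. as.character(learn_objective %in% c("cox")),
# rather than the presumably intended as.character(learn_objective) %in% c("cox").
# A minimal sketch of the difference, using a hypothetical objective value:
#   learn_objective <- "cox"
#   as.character(learn_objective) %in% c("cox")   # TRUE, compares the objective name
#   as.character(learn_objective %in% c("cox"))   # "TRUE", a character string rather than a logical
# When the hyperparameter is not yet set, for example during hyperparameter optimisation
# (see the NEWS entry above), the condition may not evaluate to a usable logical, so the
# patch simply returns "hazard_ratio" for the default prediction type.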
if(type == "default"){ - if(as.character(object@hyperparameters$learn_objective %in% c("cox"))){ - return("hazard_ratio") - } + return("hazard_ratio") } else if(type == "survival_probability"){ return("survival_probability") diff --git a/R/TestDataCreators.R b/R/TestDataCreators.R index d5a35583..7f75b4d0 100644 --- a/R/TestDataCreators.R +++ b/R/TestDataCreators.R @@ -9,7 +9,7 @@ test_data_package_installed <- function(outcome_type) { "count" = "MASS" ) - if (!is_package_installed(data_packages[[outcome_type]])) run_test <- FALSE + if (!rlang::is_installed(data_packages[[outcome_type]])) run_test <- FALSE if (!run_test) { rlang::inform( diff --git a/man/aggregate_vimp_table-methods.Rd b/man/aggregate_vimp_table-methods.Rd new file mode 100644 index 00000000..e77ae5d8 --- /dev/null +++ b/man/aggregate_vimp_table-methods.Rd @@ -0,0 +1,43 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/VimpTable.R +\name{aggregate_vimp_table} +\alias{aggregate_vimp_table} +\alias{aggregate_vimp_table,list-method} +\alias{aggregate_vimp_table,character-method} +\alias{aggregate_vimp_table,vimpTable-method} +\alias{aggregate_vimp_table,NULL-method} +\alias{aggregate_vimp_table,experimentData-method} +\title{Aggregate variable importance from multiple variable importance +objects.} +\usage{ +aggregate_vimp_table(x, aggregation_method, rank_threshold = NULL, ...) + +\S4method{aggregate_vimp_table}{list}(x, aggregation_method, rank_threshold = NULL, ...) + +\S4method{aggregate_vimp_table}{character}(x, aggregation_method, rank_threshold = NULL, ...) + +\S4method{aggregate_vimp_table}{vimpTable}(x, aggregation_method, rank_threshold = NULL, ...) + +\S4method{aggregate_vimp_table}{NULL}(x, aggregation_method, rank_threshold = NULL, ...) + +\S4method{aggregate_vimp_table}{experimentData}(x, aggregation_method, rank_threshold = NULL, ...) +} +\arguments{ +\item{x}{Variable importance (\code{vimpTable}) object, a list thereof, or one or +more paths to these objects.} + +\item{aggregation_method}{Method used to aggregate variable importance. The +available methods are described in the \emph{feature selection methods} vignette.} + +\item{rank_threshold}{Rank threshold used within several aggregation methods. +See the \emph{feature selection methods} vignette for more details.} + +\item{...}{unused parameters.} +} +\value{ +A \code{vimpTable} object with aggregated variable importance data. +} +\description{ +This methods aggregates variable importance from one or more +\code{vimpTable} objects. +} diff --git a/man/as_data_object-methods.Rd b/man/as_data_object-methods.Rd new file mode 100644 index 00000000..984cd977 --- /dev/null +++ b/man/as_data_object-methods.Rd @@ -0,0 +1,217 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/DataObject.R +\name{as_data_object} +\alias{as_data_object} +\alias{as_data_object,dataObject-method} +\alias{as_data_object,data.table-method} +\alias{as_data_object,ANY-method} +\title{Creates a valid data object from input data.} +\usage{ +as_data_object(data, ...) + +\S4method{as_data_object}{dataObject}(data, object = NULL, ...) 
+ +\S4method{as_data_object}{data.table}( + data, + object = NULL, + sample_id_column = waiver(), + batch_id_column = waiver(), + series_id_column = waiver(), + development_batch_id = waiver(), + validation_batch_id = waiver(), + outcome_name = waiver(), + outcome_column = waiver(), + outcome_type = waiver(), + event_indicator = waiver(), + censoring_indicator = waiver(), + competing_risk_indicator = waiver(), + class_levels = waiver(), + exclude_features = waiver(), + include_features = waiver(), + reference_method = waiver(), + check_stringency = "strict", + ... +) + +\S4method{as_data_object}{ANY}( + data, + object = NULL, + sample_id_column = waiver(), + batch_id_column = waiver(), + series_id_column = waiver(), + ... +) +} +\arguments{ +\item{data}{A \code{data.frame} or \code{data.table}, a path to such tables on a local +or network drive, or a path to tabular data that may be converted to these +formats.} + +\item{...}{Unused arguments.} + +\item{object}{A \code{familiarEnsemble} or \code{familiarModel} object that is used to +check consistency of these objects.} + +\item{sample_id_column}{(\strong{recommended}) Name of the column containing +sample or subject identifiers. See \code{batch_id_column} above for more +details. + +If unset, every row will be identified as a single sample.} + +\item{batch_id_column}{(\strong{recommended}) Name of the column containing batch +or cohort identifiers. This parameter is required if more than one dataset +is provided, or if external validation is performed. + +In familiar any row of data is organised by four identifiers: +\itemize{ +\item The batch identifier \code{batch_id_column}: This denotes the group to which a +set of samples belongs, e.g. patients from a single study, samples measured +in a batch, etc. The batch identifier is used for batch normalisation, as +well as selection of development and validation datasets. +\item The sample identifier \code{sample_id_column}: This denotes the sample level, +e.g. data from a single individual. Subsets of data, e.g. bootstraps or +cross-validation folds, are created at this level. +\item The series identifier \code{series_id_column}: Indicates measurements on a +single sample that may not share the same outcome value, e.g. a time +series, or the number of cells in a view. +\item The repetition identifier: Indicates repeated measurements in a single +series where any feature values may differ, but the outcome does not. +Repetition identifiers are always implicitly set when multiple entries for +the same series of the same sample in the same batch that share the same +outcome are encountered. +}} + +\item{series_id_column}{(\strong{optional}) Name of the column containing series +identifiers, which distinguish between measurements that are part of a +series for a single sample. See \code{batch_id_column} above for more details. + +If unset, rows which share the same batch and sample identifiers but have a +different outcome are assigned unique series identifiers.} + +\item{development_batch_id}{(\emph{optional}) One or more batch or cohort +identifiers to constitute data sets for development. Defaults to all, or +all minus the identifiers in \code{validation_batch_id} for external validation. +Required if external validation is performed and \code{validation_batch_id} is +not provided.} + +\item{validation_batch_id}{(\emph{optional}) One or more batch or cohort +identifiers to constitute data sets for external validation. 
Defaults to +all data sets except those in \code{development_batch_id} for external +validation, or none if not. Required if \code{development_batch_id} is not +provided.} + +\item{outcome_name}{(\emph{optional}) Name of the modelled outcome. This name will +be used in figures created by \code{familiar}. + +If not set, the column name in \code{outcome_column} will be used for +\code{binomial}, \code{multinomial}, \code{count} and \code{continuous} outcomes. For other +outcomes (\code{survival} and \code{competing_risk}) no default is used.} + +\item{outcome_column}{(\strong{recommended}) Name of the column containing the +outcome of interest. May be identified from a formula, if a formula is +provided as an argument. Otherwise an error is raised. Note that \code{survival} +and \code{competing_risk} outcome type outcomes require two columns that +indicate the time-to-event or the time of last follow-up and the event +status.} + +\item{outcome_type}{(\strong{recommended}) Type of outcome found in the outcome +column. The outcome type determines many aspects of the overall process, +e.g. the available feature selection methods and learners, but also the +type of assessments that can be conducted to evaluate the resulting models. +Implemented outcome types are: +\itemize{ +\item \code{binomial}: categorical outcome with 2 levels. +\item \code{multinomial}: categorical outcome with 2 or more levels. +\item \code{count}: Poisson-distributed numeric outcomes. +\item \code{continuous}: general continuous numeric outcomes. +\item \code{survival}: survival outcome for time-to-event data. +} + +If not provided, the algorithm will attempt to obtain outcome_type from +contents of the outcome column. This may lead to unexpected results, and we +therefore advise to provide this information manually. + +Note that \code{competing_risk} survival analysis are not fully supported, and +is currently not a valid choice for \code{outcome_type}.} + +\item{event_indicator}{(\strong{recommended}) Indicator for events in \code{survival} +and \code{competing_risk} analyses. \code{familiar} will automatically recognise \code{1}, +\code{true}, \code{t}, \code{y} and \code{yes} as event indicators, including different +capitalisations. If this parameter is set, it replaces the default values.} + +\item{censoring_indicator}{(\strong{recommended}) Indicator for right-censoring in +\code{survival} and \code{competing_risk} analyses. \code{familiar} will automatically +recognise \code{0}, \code{false}, \code{f}, \code{n}, \code{no} as censoring indicators, including +different capitalisations. If this parameter is set, it replaces the +default values.} + +\item{competing_risk_indicator}{(\strong{recommended}) Indicator for competing +risks in \code{competing_risk} analyses. There are no default values, and if +unset, all values other than those specified by the \code{event_indicator} and +\code{censoring_indicator} parameters are considered to indicate competing +risks.} + +\item{class_levels}{(\emph{optional}) Class levels for \code{binomial} or \code{multinomial} +outcomes. This argument can be used to specify the ordering of levels for +categorical outcomes. These class levels must exactly match the levels +present in the outcome column.} + +\item{exclude_features}{(\emph{optional}) Feature columns that will be removed +from the data set. 
Cannot overlap with features in \code{signature}, +\code{novelty_features} or \code{include_features}.} + +\item{include_features}{(\emph{optional}) Feature columns that are specifically +included in the data set. By default all features are included. Cannot +overlap with \code{exclude_features}, but may overlap \code{signature}. Features in +\code{signature} and \code{novelty_features} are always included. If both +\code{exclude_features} and \code{include_features} are provided, \code{include_features} +takes precedence, provided that there is no overlap between the two.} + +\item{reference_method}{(\emph{optional}) Method used to set reference levels for +categorical features. There are several options: +\itemize{ +\item \code{auto} (default): Categorical features that are not explicitly set by the +user, i.e. columns containing boolean values or characters, use the most +frequent level as reference. Categorical features that are explicitly set, +i.e. as factors, are used as is. +\item \code{always}: Both automatically detected and user-specified categorical +features have the reference level set to the most frequent level. Ordinal +features are not altered, but are used as is. +\item \code{never}: User-specified categorical features are used as is. +Automatically detected categorical features are simply sorted, and the +first level is then used as the reference level. This was the behaviour +prior to familiar version 1.3.0. +}} + +\item{check_stringency}{Specifies stringency of various checks. This is mostly: +\itemize{ +\item \code{strict}: default value used for \code{summon_familiar}. Thoroughly checks +input data. Used internally for checking development data. +\item \code{external_warn}: value used for \code{extract_data} and related methods. Less +stringent checks, but will warn for possible issues. Used internally for +checking data for evaluation and explanation. +\item \code{external}: value used for external methods such as \code{predict}. Less +stringent checks, particularly for identifier and outcome columns, which may +be completely absent. Used internally for \code{predict}. +}} +} +\value{ +A \code{dataObject} object. +} +\description{ +Creates \code{dataObject} a object from input data. Input data can be +a \code{data.frame} or \code{data.table}, a path to such tables on a local or network +drive, or a path to tabular data that may be converted to these formats. + +In addition, a \code{familiarEnsemble} or \code{familiarModel} object can be passed +along to check whether the data are formatted correctly, e.g. by checking +the levels of categorical features, whether all expected columns are +present, etc. +} +\details{ +You can specify settings for your data manually, e.g. the column for +sample identifiers (\code{sample_id_column}). This prevents you from having to +change the column name externally. In the case you provide a \code{familiarModel} +or \code{familiarEnsemble} for the \code{object} argument, any parameters you provide +take precedence over parameters specified by the object. 
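As a minimal illustrative sketch (the column names, level names and feature values below
are hypothetical, not taken from the package examples), a call on a \code{data.table} with
batch, sample and outcome columns might look as follows:

  library(familiar)
  library(data.table)

  # Hypothetical tabular data with batch (cohort), sample and outcome columns.
  dt <- data.table(
    cohort = rep(c("site_a", "site_b"), each = 5),
    subject = 1:10,
    feature_1 = rnorm(10),
    feature_2 = rnorm(10),
    status = factor(rep(c("no", "yes"), times = 5))
  )

  # Create a dataObject; class_levels fixes the ordering of the categorical outcome levels.
  data_obj <- as_data_object(
    data = dt,
    sample_id_column = "subject",
    batch_id_column = "cohort",
    outcome_column = "status",
    outcome_type = "binomial",
    class_levels = c("no", "yes")
  )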
+} diff --git a/man/as_familiar_collection-methods.Rd b/man/as_familiar_collection-methods.Rd new file mode 100644 index 00000000..e3744918 --- /dev/null +++ b/man/as_familiar_collection-methods.Rd @@ -0,0 +1,375 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FamiliarObjectConversion.R +\name{as_familiar_collection} +\alias{as_familiar_collection} +\alias{as_familiar_collection,familiarCollection-method} +\alias{as_familiar_collection,familiarData-method} +\alias{as_familiar_collection,familiarEnsemble-method} +\alias{as_familiar_collection,familiarModel-method} +\alias{as_familiar_collection,list-method} +\alias{as_familiar_collection,character-method} +\alias{as_familiar_collection,ANY-method} +\title{Conversion to familiarCollection object.} +\usage{ +as_familiar_collection( + object, + familiar_data_names = NULL, + collection_name = NULL, + ... +) + +\S4method{as_familiar_collection}{familiarCollection}( + object, + familiar_data_names = NULL, + collection_name = NULL, + ... +) + +\S4method{as_familiar_collection}{familiarData}( + object, + familiar_data_names = NULL, + collection_name = NULL, + ... +) + +\S4method{as_familiar_collection}{familiarEnsemble}( + object, + familiar_data_names = NULL, + collection_name = NULL, + ... +) + +\S4method{as_familiar_collection}{familiarModel}( + object, + familiar_data_names = NULL, + collection_name = NULL, + ... +) + +\S4method{as_familiar_collection}{list}( + object, + familiar_data_names = NULL, + collection_name = NULL, + ... +) + +\S4method{as_familiar_collection}{character}( + object, + familiar_data_names = NULL, + collection_name = NULL, + ... +) + +\S4method{as_familiar_collection}{ANY}( + object, + familiar_data_names = NULL, + collection_name = NULL, + ... +) +} +\arguments{ +\item{object}{\code{familiarCollection} object, or one or more \code{familiarData} +objects, that will be internally converted to a \code{familiarCollection} object. +It is also possible to provide a \code{familiarEnsemble} or one or more +\code{familiarModel} objects together with the data from which data is computed +prior to export. Paths to such files can also be provided.} + +\item{familiar_data_names}{Names of the dataset(s). Only used if the \code{object} parameter +is one or more \code{familiarData} objects.} + +\item{collection_name}{Name of the collection.} + +\item{...}{ + Arguments passed on to \code{\link[=extract_data]{extract_data}} + \describe{ + \item{\code{data}}{A \code{dataObject} object, \code{data.table} or \code{data.frame} that +constitutes the data that are assessed.} + \item{\code{is_pre_processed}}{Flag that indicates whether the data was already +pre-processed externally, e.g. normalised and clustered. Only used if the +\code{data} argument is a \code{data.table} or \code{data.frame}.} + \item{\code{cl}}{Cluster created using the \code{parallel} package. This cluster is then +used to speed up computation through parallellisation.} + \item{\code{time_max}}{Time point which is used as the benchmark for e.g. cumulative +risks generated by random forest, or the cut-off value for Uno's concordance +index. If not provided explicitly, this parameter is read from settings used +at creation of the underlying \code{familiarModel} objects. Only used for +\code{survival} outcomes.} + \item{\code{evaluation_times}}{One or more time points that are used for in analysis of +survival problems when data has to be assessed at a set time, e.g. +calibration. 
If not provided explicitly, this parameter is read from +settings used at creation of the underlying \code{familiarModel} objects. Only +used for \code{survival} outcomes.} + \item{\code{aggregation_method}}{Method for aggregating variable importances for the +purpose of evaluation. Variable importances are determined during feature +selection steps and after training the model. Both types are evaluated, but +feature selection variable importance is only evaluated at run-time. + +See the documentation for the \code{vimp_aggregation_method} argument in +\code{summon_familiar} for information concerning the different available +methods. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + \item{\code{rank_threshold}}{The threshold used to define the subset of highly +important features during evaluation. + +See the documentation for the \code{vimp_aggregation_rank_threshold} argument in +\code{summon_familiar} for more information. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + \item{\code{ensemble_method}}{Method for ensembling predictions from models for the +same sample. Available methods are: +\itemize{ +\item \code{median} (default): Use the median of the predicted values as the ensemble +value for a sample. +\item \code{mean}: Use the mean of the predicted values as the ensemble value for a +sample. +}} + \item{\code{metric}}{One or more metrics for assessing model performance. See the +vignette on performance metrics for the available metrics. If not provided +explicitly, this parameter is read from settings used at creation of the +underlying \code{familiarModel} objects.} + \item{\code{feature_cluster_method}}{The method used to perform clustering. These are +the same methods as for the \code{cluster_method} configuration parameter: +\code{none}, \code{hclust}, \code{agnes}, \code{diana} and \code{pam}. + +\code{none} cannot be used when extracting data regarding mutual correlation or +feature expressions. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + \item{\code{feature_linkage_method}}{The method used for agglomerative clustering in +\code{hclust} and \code{agnes}. These are the same methods as for the +\code{cluster_linkage_method} configuration parameter: \code{average}, \code{single}, +\code{complete}, \code{weighted}, and \code{ward}. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + \item{\code{feature_cluster_cut_method}}{The method used to divide features into +separate clusters. The available methods are the same as for the +\code{cluster_cut_method} configuration parameter: \code{silhouette}, \code{fixed_cut} and +\code{dynamic_cut}. + +\code{silhouette} is available for all cluster methods, but \code{fixed_cut} only +applies to methods that create hierarchical trees (\code{hclust}, \code{agnes} and +\code{diana}). \code{dynamic_cut} requires the \code{dynamicTreeCut} package and can only +be used with \code{agnes} and \code{hclust}. 
+ +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + \item{\code{feature_similarity_threshold}}{The threshold level for pair-wise +similarity that is required to form feature clusters with the \code{fixed_cut} +method. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + \item{\code{feature_similarity_metric}}{Metric to determine pairwise similarity +between features. Similarity is computed in the same manner as for +clustering, and \code{feature_similarity_metric} therefore has the same options +as \code{cluster_similarity_metric}: \code{mcfadden_r2}, \code{cox_snell_r2}, +\code{nagelkerke_r2}, \code{spearman}, \code{kendall} and \code{pearson}. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + \item{\code{sample_cluster_method}}{The method used to perform clustering based on +distance between samples. These are the same methods as for the +\code{cluster_method} configuration parameter: \code{hclust}, \code{agnes}, \code{diana} and +\code{pam}. + +\code{none} cannot be used when extracting data for feature expressions. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + \item{\code{sample_linkage_method}}{The method used for agglomerative clustering in +\code{hclust} and \code{agnes}. These are the same methods as for the +\code{cluster_linkage_method} configuration parameter: \code{average}, \code{single}, +\code{complete}, \code{weighted}, and \code{ward}. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + \item{\code{sample_similarity_metric}}{Metric to determine pairwise similarity +between samples. Similarity is computed in the same manner as for +clustering, but \code{sample_similarity_metric} has different options that are +better suited to computing distance between samples instead of between +features: \code{gower}, \code{euclidean}. + +The underlying feature data is scaled to the \eqn{[0, 1]} range (for +numerical features) using the feature values across the samples. The +normalisation parameters required can optionally be computed from feature +data with the outer 5\% (on both sides) of feature values trimmed or +winsorised. To do so append \verb{_trim} (trimming) or \verb{_winsor} (winsorising) to +the metric name. This reduces the effect of outliers somewhat. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + \item{\code{icc_type}}{String indicating the type of intraclass correlation +coefficient (\code{1}, \code{2} or \code{3}) that should be used to compute robustness for +features in repeated measurements during the evaluation of univariate +importance. These types correspond to the types in Shrout and Fleiss (1979). 
+If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + \item{\code{verbose}}{Flag to indicate whether feedback should be provided on the +computation and extraction of various data elements.} + \item{\code{message_indent}}{Number of indentation steps for messages shown during +computation and extraction of various data elements.} + \item{\code{data_element}}{String indicating which data elements are to be extracted. +Default is \code{all}, but specific elements can be specified to speed up +computations if not all elements are to be computed. This is an internal +parameter that is set by, e.g. the \code{export_model_vimp} method.} + \item{\code{sample_limit}}{(\emph{optional}) Set the upper limit of the number of samples +that are used during evaluation steps. Cannot be less than 20. + +This setting can be specified per data element by providing a parameter +value in a named list with data elements, e.g. +\code{list("sample_similarity"=100, "permutation_vimp"=1000)}. + +This parameter can be set for the following data elements: +\code{sample_similarity} and \code{ice_data}.} + \item{\code{detail_level}}{(\emph{optional}) Sets the level at which results are computed +and aggregated. +\itemize{ +\item \code{ensemble}: Results are computed at the ensemble level, i.e. over all +models in the ensemble. This means that, for example, bias-corrected +estimates of model performance are assessed by creating (at least) 20 +bootstraps and computing the model performance of the ensemble model for +each bootstrap. +\item \code{hybrid} (default): Results are computed at the level of models in an +ensemble. This means that, for example, bias-corrected estimates of model +performance are directly computed using the models in the ensemble. If there +are at least 20 trained models in the ensemble, performance is computed for +each model, in contrast to \code{ensemble} where performance is computed for the +ensemble of models. If there are less than 20 trained models in the +ensemble, bootstraps are created so that at least 20 point estimates can be +made. +\item \code{model}: Results are computed at the model level. This means that, for +example, bias-corrected estimates of model performance are assessed by +creating (at least) 20 bootstraps and computing the performance of the model +for each bootstrap. +} + +Note that each level of detail has a different interpretation for bootstrap +confidence intervals. For \code{ensemble} and \code{model} these are the confidence +intervals for the ensemble and an individual model, respectively. That is, +the confidence interval describes the range where an estimate produced by a +respective ensemble or model trained on a repeat of the experiment may be +found with the probability of the confidence level. For \code{hybrid}, it +represents the range where any single model trained on a repeat of the +experiment may be found with the probability of the confidence level. By +definition, confidence intervals obtained using \code{hybrid} are at least as +wide as those for \code{ensemble}. \code{hybrid} offers the correct interpretation if +the goal of the analysis is to assess the result of a single, unspecified, +model. + +\code{hybrid} is generally computationally less expensive then \code{ensemble}, which +in turn is somewhat less expensive than \code{model}. 
+ +A non-default \code{detail_level} parameter can be specified for separate +evaluation steps by providing a parameter value in a named list with data +elements, e.g. \code{list("auc_data"="ensemble", "model_performance"="hybrid")}. +This parameter can be set for the following data elements: \code{auc_data}, +\code{decision_curve_analyis}, \code{model_performance}, \code{permutation_vimp}, +\code{ice_data}, \code{prediction_data} and \code{confusion_matrix}.} + \item{\code{estimation_type}}{(\emph{optional}) Sets the type of estimation that should be +possible. This has the following options: +\itemize{ +\item \code{point}: Point estimates. +\item \code{bias_correction} or \code{bc}: Bias-corrected estimates. A bias-corrected +estimate is computed from (at least) 20 point estimates, and \code{familiar} may +bootstrap the data to create them. +\item \code{bootstrap_confidence_interval} or \code{bci} (default): Bias-corrected +estimates with bootstrap confidence intervals (Efron and Hastie, 2016). The +number of point estimates required depends on the \code{confidence_level} +parameter, and \code{familiar} may bootstrap the data to create them. +} + +As with \code{detail_level}, a non-default \code{estimation_type} parameter can be +specified for separate evaluation steps by providing a parameter value in a +named list with data elements, e.g. \code{list("auc_data"="bci", "model_performance"="point")}. This parameter can be set for the following +data elements: \code{auc_data}, \code{decision_curve_analyis}, \code{model_performance}, +\code{permutation_vimp}, \code{ice_data}, and \code{prediction_data}.} + \item{\code{aggregate_results}}{(\emph{optional}) Flag that signifies whether results +should be aggregated during evaluation. If \code{estimation_type} is +\code{bias_correction} or \code{bc}, aggregation leads to a single bias-corrected +estimate. If \code{estimation_type} is \code{bootstrap_confidence_interval} or \code{bci}, +aggregation leads to a single bias-corrected estimate with lower and upper +boundaries of the confidence interval. This has no effect if +\code{estimation_type} is \code{point}. + +The default value is equal to \code{TRUE} except when assessing metrics to assess +model performance, as the default violin plot requires underlying data. + +As with \code{detail_level} and \code{estimation_type}, a non-default +\code{aggregate_results} parameter can be specified for separate evaluation steps +by providing a parameter value in a named list with data elements, e.g. +\code{list("auc_data"=TRUE, , "model_performance"=FALSE)}. This parameter exists +for the same elements as \code{estimation_type}.} + \item{\code{confidence_level}}{(\emph{optional}) Numeric value for the level at which +confidence intervals are determined. In the case bootstraps are used to +determine the confidence intervals bootstrap estimation, \code{familiar} uses the +rule of thumb \eqn{n = 20 / ci.level} to determine the number of required +bootstraps. + +The default value is \code{0.95}.} + \item{\code{bootstrap_ci_method}}{(\emph{optional}) Method used to determine bootstrap +confidence intervals (Efron and Hastie, 2016). The following methods are +implemented: +\itemize{ +\item \code{percentile} (default): Confidence intervals obtained using the percentile +method. +\item \code{bc}: Bias-corrected confidence intervals. +} + +Note that the standard method is not implemented because this method is +often not suitable due to non-normal distributions. 
The bias-corrected and +accelerated (BCa) method is not implemented yet.} + \item{\code{stratification_method}}{(\emph{optional}) Method for determining the +stratification threshold for creating survival groups. The actual, +model-dependent, threshold value is obtained from the development data, and +can afterwards be used to perform stratification on validation data. + +The following stratification methods are available: +\itemize{ +\item \code{median} (default): The median predicted value in the development cohort +is used to stratify the samples into two risk groups. For predicted outcome +values that build a continuous spectrum, the two risk groups in the +development cohort will be roughly equal in size. +\item \code{mean}: The mean predicted value in the development cohort is used to +stratify the samples into two risk groups. +\item \code{mean_trim}: As \code{mean}, but based on the set of predicted values +where the 5\% lowest and 5\% highest values are discarded. This reduces the +effect of outliers. +\item \code{mean_winsor}: As \code{mean}, but based on the set of predicted values where +the 5\% lowest and 5\% highest values are winsorised. This reduces the effect +of outliers. +\item \code{fixed}: Samples are stratified based on the sample quantiles of the +predicted values. These quantiles are defined using the +\code{stratification_threshold} parameter. +\item \code{optimised}: Use maximally selected rank statistics to determine the +optimal threshold (Lausen and Schumacher, 1992; Hothorn et al., 2003) to +stratify samples into two optimally separated risk groups. +} + +One or more stratification methods can be selected simultaneously. + +This parameter is only relevant for \code{survival} outcomes.} + \item{\code{dynamic_model_loading}}{(\emph{optional}) Enables dynamic loading of models +during the evaluation process, if \code{TRUE}. Defaults to \code{FALSE}. Dynamic +loading of models may reduce the overall memory footprint, at the cost of +increased disk or network IO. Models can only be dynamically loaded if they +are found at an accessible disk or network location. Setting this parameter +to \code{TRUE} may help if parallel processing causes out-of-memory issues during +evaluation.} + }} +} +\value{ +A \code{familiarCollection} object. +} +\description{ +Creates a \code{familiarCollection} objects from \code{familiarData}, +\code{familiarEnsemble} or \code{familiarModel} objects. +} +\details{ +A \code{data} argument is expected if the \code{object} argument is a +\code{familiarEnsemble} object or one or more \code{familiarModel} objects. +} diff --git a/man/as_familiar_data-methods.Rd b/man/as_familiar_data-methods.Rd new file mode 100644 index 00000000..38cb2281 --- /dev/null +++ b/man/as_familiar_data-methods.Rd @@ -0,0 +1,328 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FamiliarObjectConversion.R +\name{as_familiar_data} +\alias{as_familiar_data} +\alias{as_familiar_data,familiarData-method} +\alias{as_familiar_data,familiarEnsemble-method} +\alias{as_familiar_data,familiarModel-method} +\alias{as_familiar_data,list-method} +\alias{as_familiar_data,character-method} +\alias{as_familiar_data,ANY-method} +\title{Conversion to familiarData object.} +\usage{ +as_familiar_data(object, ...) + +\S4method{as_familiar_data}{familiarData}(object, ...) + +\S4method{as_familiar_data}{familiarEnsemble}(object, name = NULL, ...) + +\S4method{as_familiar_data}{familiarModel}(object, ...) + +\S4method{as_familiar_data}{list}(object, ...) 
+ +\S4method{as_familiar_data}{character}(object, ...) + +\S4method{as_familiar_data}{ANY}(object, ...) +} +\arguments{ +\item{object}{A \code{familiarData} object, or a \code{familiarEnsemble} or +\code{familiarModel} objects that will be internally converted to a +\code{familiarData} object. Paths to such objects can also be provided.} + +\item{...}{ + Arguments passed on to \code{\link[=extract_data]{extract_data}} + \describe{ + \item{\code{data}}{A \code{dataObject} object, \code{data.table} or \code{data.frame} that +constitutes the data that are assessed.} + \item{\code{is_pre_processed}}{Flag that indicates whether the data was already +pre-processed externally, e.g. normalised and clustered. Only used if the +\code{data} argument is a \code{data.table} or \code{data.frame}.} + \item{\code{cl}}{Cluster created using the \code{parallel} package. This cluster is then +used to speed up computation through parallellisation.} + \item{\code{time_max}}{Time point which is used as the benchmark for e.g. cumulative +risks generated by random forest, or the cut-off value for Uno's concordance +index. If not provided explicitly, this parameter is read from settings used +at creation of the underlying \code{familiarModel} objects. Only used for +\code{survival} outcomes.} + \item{\code{evaluation_times}}{One or more time points that are used for in analysis of +survival problems when data has to be assessed at a set time, e.g. +calibration. If not provided explicitly, this parameter is read from +settings used at creation of the underlying \code{familiarModel} objects. Only +used for \code{survival} outcomes.} + \item{\code{aggregation_method}}{Method for aggregating variable importances for the +purpose of evaluation. Variable importances are determined during feature +selection steps and after training the model. Both types are evaluated, but +feature selection variable importance is only evaluated at run-time. + +See the documentation for the \code{vimp_aggregation_method} argument in +\code{summon_familiar} for information concerning the different available +methods. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + \item{\code{rank_threshold}}{The threshold used to define the subset of highly +important features during evaluation. + +See the documentation for the \code{vimp_aggregation_rank_threshold} argument in +\code{summon_familiar} for more information. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + \item{\code{ensemble_method}}{Method for ensembling predictions from models for the +same sample. Available methods are: +\itemize{ +\item \code{median} (default): Use the median of the predicted values as the ensemble +value for a sample. +\item \code{mean}: Use the mean of the predicted values as the ensemble value for a +sample. +}} + \item{\code{metric}}{One or more metrics for assessing model performance. See the +vignette on performance metrics for the available metrics. If not provided +explicitly, this parameter is read from settings used at creation of the +underlying \code{familiarModel} objects.} + \item{\code{feature_cluster_method}}{The method used to perform clustering. These are +the same methods as for the \code{cluster_method} configuration parameter: +\code{none}, \code{hclust}, \code{agnes}, \code{diana} and \code{pam}. 
+ +\code{none} cannot be used when extracting data regarding mutual correlation or +feature expressions. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + \item{\code{feature_linkage_method}}{The method used for agglomerative clustering in +\code{hclust} and \code{agnes}. These are the same methods as for the +\code{cluster_linkage_method} configuration parameter: \code{average}, \code{single}, +\code{complete}, \code{weighted}, and \code{ward}. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + \item{\code{feature_cluster_cut_method}}{The method used to divide features into +separate clusters. The available methods are the same as for the +\code{cluster_cut_method} configuration parameter: \code{silhouette}, \code{fixed_cut} and +\code{dynamic_cut}. + +\code{silhouette} is available for all cluster methods, but \code{fixed_cut} only +applies to methods that create hierarchical trees (\code{hclust}, \code{agnes} and +\code{diana}). \code{dynamic_cut} requires the \code{dynamicTreeCut} package and can only +be used with \code{agnes} and \code{hclust}. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + \item{\code{feature_similarity_threshold}}{The threshold level for pair-wise +similarity that is required to form feature clusters with the \code{fixed_cut} +method. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + \item{\code{feature_similarity_metric}}{Metric to determine pairwise similarity +between features. Similarity is computed in the same manner as for +clustering, and \code{feature_similarity_metric} therefore has the same options +as \code{cluster_similarity_metric}: \code{mcfadden_r2}, \code{cox_snell_r2}, +\code{nagelkerke_r2}, \code{spearman}, \code{kendall} and \code{pearson}. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + \item{\code{sample_cluster_method}}{The method used to perform clustering based on +distance between samples. These are the same methods as for the +\code{cluster_method} configuration parameter: \code{hclust}, \code{agnes}, \code{diana} and +\code{pam}. + +\code{none} cannot be used when extracting data for feature expressions. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + \item{\code{sample_linkage_method}}{The method used for agglomerative clustering in +\code{hclust} and \code{agnes}. These are the same methods as for the +\code{cluster_linkage_method} configuration parameter: \code{average}, \code{single}, +\code{complete}, \code{weighted}, and \code{ward}. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + \item{\code{sample_similarity_metric}}{Metric to determine pairwise similarity +between samples. Similarity is computed in the same manner as for +clustering, but \code{sample_similarity_metric} has different options that are +better suited to computing distance between samples instead of between +features: \code{gower}, \code{euclidean}. 
+ +The underlying feature data is scaled to the \eqn{[0, 1]} range (for +numerical features) using the feature values across the samples. The +normalisation parameters required can optionally be computed from feature +data with the outer 5\% (on both sides) of feature values trimmed or +winsorised. To do so append \verb{_trim} (trimming) or \verb{_winsor} (winsorising) to +the metric name. This reduces the effect of outliers somewhat. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + \item{\code{icc_type}}{String indicating the type of intraclass correlation +coefficient (\code{1}, \code{2} or \code{3}) that should be used to compute robustness for +features in repeated measurements during the evaluation of univariate +importance. These types correspond to the types in Shrout and Fleiss (1979). +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + \item{\code{verbose}}{Flag to indicate whether feedback should be provided on the +computation and extraction of various data elements.} + \item{\code{message_indent}}{Number of indentation steps for messages shown during +computation and extraction of various data elements.} + \item{\code{data_element}}{String indicating which data elements are to be extracted. +Default is \code{all}, but specific elements can be specified to speed up +computations if not all elements are to be computed. This is an internal +parameter that is set by, e.g. the \code{export_model_vimp} method.} + \item{\code{sample_limit}}{(\emph{optional}) Set the upper limit of the number of samples +that are used during evaluation steps. Cannot be less than 20. + +This setting can be specified per data element by providing a parameter +value in a named list with data elements, e.g. +\code{list("sample_similarity"=100, "permutation_vimp"=1000)}. + +This parameter can be set for the following data elements: +\code{sample_similarity} and \code{ice_data}.} + \item{\code{detail_level}}{(\emph{optional}) Sets the level at which results are computed +and aggregated. +\itemize{ +\item \code{ensemble}: Results are computed at the ensemble level, i.e. over all +models in the ensemble. This means that, for example, bias-corrected +estimates of model performance are assessed by creating (at least) 20 +bootstraps and computing the model performance of the ensemble model for +each bootstrap. +\item \code{hybrid} (default): Results are computed at the level of models in an +ensemble. This means that, for example, bias-corrected estimates of model +performance are directly computed using the models in the ensemble. If there +are at least 20 trained models in the ensemble, performance is computed for +each model, in contrast to \code{ensemble} where performance is computed for the +ensemble of models. If there are less than 20 trained models in the +ensemble, bootstraps are created so that at least 20 point estimates can be +made. +\item \code{model}: Results are computed at the model level. This means that, for +example, bias-corrected estimates of model performance are assessed by +creating (at least) 20 bootstraps and computing the performance of the model +for each bootstrap. +} + +Note that each level of detail has a different interpretation for bootstrap +confidence intervals. For \code{ensemble} and \code{model} these are the confidence +intervals for the ensemble and an individual model, respectively. 
That is, +the confidence interval describes the range where an estimate produced by a +respective ensemble or model trained on a repeat of the experiment may be +found with the probability of the confidence level. For \code{hybrid}, it +represents the range where any single model trained on a repeat of the +experiment may be found with the probability of the confidence level. By +definition, confidence intervals obtained using \code{hybrid} are at least as +wide as those for \code{ensemble}. \code{hybrid} offers the correct interpretation if +the goal of the analysis is to assess the result of a single, unspecified, +model. + +\code{hybrid} is generally computationally less expensive then \code{ensemble}, which +in turn is somewhat less expensive than \code{model}. + +A non-default \code{detail_level} parameter can be specified for separate +evaluation steps by providing a parameter value in a named list with data +elements, e.g. \code{list("auc_data"="ensemble", "model_performance"="hybrid")}. +This parameter can be set for the following data elements: \code{auc_data}, +\code{decision_curve_analyis}, \code{model_performance}, \code{permutation_vimp}, +\code{ice_data}, \code{prediction_data} and \code{confusion_matrix}.} + \item{\code{estimation_type}}{(\emph{optional}) Sets the type of estimation that should be +possible. This has the following options: +\itemize{ +\item \code{point}: Point estimates. +\item \code{bias_correction} or \code{bc}: Bias-corrected estimates. A bias-corrected +estimate is computed from (at least) 20 point estimates, and \code{familiar} may +bootstrap the data to create them. +\item \code{bootstrap_confidence_interval} or \code{bci} (default): Bias-corrected +estimates with bootstrap confidence intervals (Efron and Hastie, 2016). The +number of point estimates required depends on the \code{confidence_level} +parameter, and \code{familiar} may bootstrap the data to create them. +} + +As with \code{detail_level}, a non-default \code{estimation_type} parameter can be +specified for separate evaluation steps by providing a parameter value in a +named list with data elements, e.g. \code{list("auc_data"="bci", "model_performance"="point")}. This parameter can be set for the following +data elements: \code{auc_data}, \code{decision_curve_analyis}, \code{model_performance}, +\code{permutation_vimp}, \code{ice_data}, and \code{prediction_data}.} + \item{\code{aggregate_results}}{(\emph{optional}) Flag that signifies whether results +should be aggregated during evaluation. If \code{estimation_type} is +\code{bias_correction} or \code{bc}, aggregation leads to a single bias-corrected +estimate. If \code{estimation_type} is \code{bootstrap_confidence_interval} or \code{bci}, +aggregation leads to a single bias-corrected estimate with lower and upper +boundaries of the confidence interval. This has no effect if +\code{estimation_type} is \code{point}. + +The default value is equal to \code{TRUE} except when assessing metrics to assess +model performance, as the default violin plot requires underlying data. + +As with \code{detail_level} and \code{estimation_type}, a non-default +\code{aggregate_results} parameter can be specified for separate evaluation steps +by providing a parameter value in a named list with data elements, e.g. +\code{list("auc_data"=TRUE, , "model_performance"=FALSE)}. 
This parameter exists +for the same elements as \code{estimation_type}.} + \item{\code{confidence_level}}{(\emph{optional}) Numeric value for the level at which +confidence intervals are determined. In the case bootstraps are used to +determine the confidence intervals bootstrap estimation, \code{familiar} uses the +rule of thumb \eqn{n = 20 / ci.level} to determine the number of required +bootstraps. + +The default value is \code{0.95}.} + \item{\code{bootstrap_ci_method}}{(\emph{optional}) Method used to determine bootstrap +confidence intervals (Efron and Hastie, 2016). The following methods are +implemented: +\itemize{ +\item \code{percentile} (default): Confidence intervals obtained using the percentile +method. +\item \code{bc}: Bias-corrected confidence intervals. +} + +Note that the standard method is not implemented because this method is +often not suitable due to non-normal distributions. The bias-corrected and +accelerated (BCa) method is not implemented yet.} + \item{\code{stratification_method}}{(\emph{optional}) Method for determining the +stratification threshold for creating survival groups. The actual, +model-dependent, threshold value is obtained from the development data, and +can afterwards be used to perform stratification on validation data. + +The following stratification methods are available: +\itemize{ +\item \code{median} (default): The median predicted value in the development cohort +is used to stratify the samples into two risk groups. For predicted outcome +values that build a continuous spectrum, the two risk groups in the +development cohort will be roughly equal in size. +\item \code{mean}: The mean predicted value in the development cohort is used to +stratify the samples into two risk groups. +\item \code{mean_trim}: As \code{mean}, but based on the set of predicted values +where the 5\% lowest and 5\% highest values are discarded. This reduces the +effect of outliers. +\item \code{mean_winsor}: As \code{mean}, but based on the set of predicted values where +the 5\% lowest and 5\% highest values are winsorised. This reduces the effect +of outliers. +\item \code{fixed}: Samples are stratified based on the sample quantiles of the +predicted values. These quantiles are defined using the +\code{stratification_threshold} parameter. +\item \code{optimised}: Use maximally selected rank statistics to determine the +optimal threshold (Lausen and Schumacher, 1992; Hothorn et al., 2003) to +stratify samples into two optimally separated risk groups. +} + +One or more stratification methods can be selected simultaneously. + +This parameter is only relevant for \code{survival} outcomes.} + \item{\code{dynamic_model_loading}}{(\emph{optional}) Enables dynamic loading of models +during the evaluation process, if \code{TRUE}. Defaults to \code{FALSE}. Dynamic +loading of models may reduce the overall memory footprint, at the cost of +increased disk or network IO. Models can only be dynamically loaded if they +are found at an accessible disk or network location. Setting this parameter +to \code{TRUE} may help if parallel processing causes out-of-memory issues during +evaluation.} + }} + +\item{name}{Name of the \code{familiarData} object. If not set, a name is +automatically generated.} +} +\value{ +A \code{familiarData} object. +} +\description{ +Creates \code{familiarData} a object from \code{familiarEnsemble} or +\code{familiarModel} objects. +} +\details{ +The \code{data} argument is required if \code{familiarEnsemble} or +\code{familiarModel} objects are provided. 
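A minimal sketch of the conversion chain (the model paths and the \code{validation_data}
object below are hypothetical placeholders):

  library(familiar)

  # Hypothetical paths to trained familiarModel objects stored on disk.
  model_paths <- c("model_1.RDS", "model_2.RDS")

  # Combine the models into an ensemble, compute a familiarData object on new data,
  # and bundle the result into a familiarCollection for export or plotting.
  ensemble <- as_familiar_ensemble(object = model_paths)
  fam_data <- as_familiar_data(object = ensemble, data = validation_data, name = "validation")
  collection <- as_familiar_collection(object = fam_data, familiar_data_names = "validation")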
+} diff --git a/man/as_familiar_ensemble-methods.Rd b/man/as_familiar_ensemble-methods.Rd new file mode 100644 index 00000000..f536ce89 --- /dev/null +++ b/man/as_familiar_ensemble-methods.Rd @@ -0,0 +1,36 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FamiliarObjectConversion.R +\name{as_familiar_ensemble} +\alias{as_familiar_ensemble} +\alias{as_familiar_ensemble,familiarEnsemble-method} +\alias{as_familiar_ensemble,familiarModel-method} +\alias{as_familiar_ensemble,list-method} +\alias{as_familiar_ensemble,character-method} +\alias{as_familiar_ensemble,ANY-method} +\title{Conversion to familiarEnsemble object.} +\usage{ +as_familiar_ensemble(object, ...) + +\S4method{as_familiar_ensemble}{familiarEnsemble}(object, ...) + +\S4method{as_familiar_ensemble}{familiarModel}(object, ...) + +\S4method{as_familiar_ensemble}{list}(object, ...) + +\S4method{as_familiar_ensemble}{character}(object, ...) + +\S4method{as_familiar_ensemble}{ANY}(object, ...) +} +\arguments{ +\item{object}{A \code{familiarEnsemble} object, or one or more +\code{familiarModel} objects that will be internally converted to a +\code{familiarEnsemble} object. Paths to such objects can also be provided.} + +\item{...}{Unused arguments.} +} +\value{ +A \code{familiarEnsemble} object. +} +\description{ +Creates \code{familiarEnsemble} a object from \code{familiarModel} objects. +} diff --git a/man/coef-methods.Rd b/man/coef-methods.Rd new file mode 100644 index 00000000..7e3e9aba --- /dev/null +++ b/man/coef-methods.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FamiliarModel.R +\name{coef} +\alias{coef} +\alias{coef,familiarModel-method} +\title{Extract model coefficients} +\usage{ +coef(object, ...) + +\S4method{coef}{familiarModel}(object, ...) +} +\arguments{ +\item{object}{a familiarModel object} + +\item{...}{additional arguments passed to \code{coef} methods for the underlying +model, when available.} +} +\value{ +Coefficients extracted from the model in the familiarModel object, if +any. +} +\description{ +Extract model coefficients +} +\details{ +This method extends the \code{coef} S3 method. For some models \code{coef} +requires information that is trimmed from the model. In this case a copy of +the model coefficient is stored with the model, and returned. +} diff --git a/man/create_randomised_groups.Rd b/man/create_randomised_groups.Rd new file mode 100644 index 00000000..94515239 --- /dev/null +++ b/man/create_randomised_groups.Rd @@ -0,0 +1,43 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/RandomGrouping.R +\name{create_randomised_groups} +\alias{create_randomised_groups} +\title{Create randomised groups +Creates randomised groups, e.g. for tests that depend on splitting (continuous) data into groups, such as the Hosmer-Lemeshow test} +\usage{ +create_randomised_groups( + x, + y = NULL, + sample_identifiers, + n_max_groups = NULL, + n_min_groups = NULL, + n_min_y_in_group = NULL, + n_groups_init = 30, + fast_mode = TRUE +) +} +\arguments{ +\item{x}{Vector with data used for sorting. Groups are formed based on adjacent values.} + +\item{y}{Vector with markers, e.g. the events. Should be 0 or 1 (for an event).} + +\item{sample_identifiers}{data.table with sample_identifiers. 
If provided, a list of grouped sample_identifiers will be returned, and integer indices otherwise.} + +\item{n_max_groups}{Maximum number of groups that need to be formed.} + +\item{n_min_groups}{Minimum number of groups that need to be formed.} + +\item{n_min_y_in_group}{Minimum number of y=1 in each group for a valid group.} + +\item{n_groups_init}{Number of initial groups (default: 30).} + +\item{fast_mode}{Enables fast randomised grouping mode (default: TRUE).} +} +\value{ +List of group sample ids or indices. +} +\description{ +The default fast mode is based on random sampling, whereas the slow mode is based on probabilistic joining of adjacent groups. As +the name suggests, fast mode operates considerably more efficiently. +} +\keyword{internal} diff --git a/man/dataObject-class.Rd b/man/dataObject-class.Rd new file mode 100644 index 00000000..fd5c715c --- /dev/null +++ b/man/dataObject-class.Rd @@ -0,0 +1,42 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FamiliarS4Classes.R +\docType{class} +\name{dataObject-class} +\alias{dataObject-class} +\title{Data object} +\description{ +The dataObject class is used to resolve the issue of keeping track of +pre-processing status and data loading inside complex workflows, e.g. nested +predict functions inside a calibration function. +} +\section{Slots}{ + +\describe{ +\item{\code{data}}{NULL or data table containing the data. This is the data which +will be read and used.} + +\item{\code{preprocessing_level}}{character indicating the level of pre-processing +already conducted.} + +\item{\code{outcome_type}}{character, determines the outcome type.} + +\item{\code{data_column_info}}{Object containing column information.} + +\item{\code{delay_loading}}{logical. Allows delayed loading of data, which enables data +parsing downstream without additional workflow complexity or memory +utilisation.} + +\item{\code{perturb_level}}{numeric. This is the perturbation level for data which +has not been loaded. Used for data retrieval by interacting with the run +table of the accompanying model.} + +\item{\code{load_validation}}{logical. This determines which internal data set will +be loaded. If TRUE, the validation data will be loaded, whereas FALSE loads +the development data.} + +\item{\code{aggregate_on_load}}{logical. Determines whether data is aggregated after +loading.} + +\item{\code{sample_set_on_load}}{NULL or vector of sample identifiers to be loaded.} +}} + diff --git a/man/dot-check_class_level_plausibility.Rd b/man/dot-check_class_level_plausibility.Rd new file mode 100644 index 00000000..51ac1d40 --- /dev/null +++ b/man/dot-check_class_level_plausibility.Rd @@ -0,0 +1,66 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/DataParameterChecks.R +\name{.check_class_level_plausibility} +\alias{.check_class_level_plausibility} +\title{Internal function to test plausibility of provided class levels} +\usage{ +.check_class_level_plausibility( + data, + outcome_type, + outcome_column, + class_levels, + check_stringency = "strict" +) +} +\arguments{ +\item{data}{Data set as loaded using the \code{.load_data} function.} + +\item{outcome_type}{(\strong{recommended}) Type of outcome found in the outcome +column. The outcome type determines many aspects of the overall process, +e.g. the available feature selection methods and learners, but also the +type of assessments that can be conducted to evaluate the resulting models. 
+Implemented outcome types are: +\itemize{ +\item \code{binomial}: categorical outcome with 2 levels. +\item \code{multinomial}: categorical outcome with 2 or more levels. +\item \code{count}: Poisson-distributed numeric outcomes. +\item \code{continuous}: general continuous numeric outcomes. +\item \code{survival}: survival outcome for time-to-event data. +} + +If not provided, the algorithm will attempt to obtain outcome_type from +contents of the outcome column. This may lead to unexpected results, and we +therefore advise to provide this information manually. + +Note that \code{competing_risk} survival analysis are not fully supported, and +is currently not a valid choice for \code{outcome_type}.} + +\item{outcome_column}{(\strong{recommended}) Name of the column containing the +outcome of interest. May be identified from a formula, if a formula is +provided as an argument. Otherwise an error is raised. Note that \code{survival} +and \code{competing_risk} outcome type outcomes require two columns that +indicate the time-to-event or the time of last follow-up and the event +status.} + +\item{class_levels}{(\emph{optional}) Class levels for \code{binomial} or \code{multinomial} +outcomes. This argument can be used to specify the ordering of levels for +categorical outcomes. These class levels must exactly match the levels +present in the outcome column.} + +\item{check_stringency}{Specifies stringency of various checks. This is mostly: +\itemize{ +\item \code{strict}: default value used for \code{summon_familiar}. Thoroughly checks +input data. Used internally for checking development data. +\item \code{external_warn}: value used for \code{extract_data} and related methods. Less +stringent checks, but will warn for possible issues. Used internally for +checking data for evaluation and explanation. +\item \code{external}: value used for external methods such as \code{predict}. Less +stringent checks, particularly for identifier and outcome columns, which may +be completely absent. Used internally for \code{predict}. +}} +} +\description{ +This function checks whether categorical levels are present in the data that +are not found in the user-provided class levels. +} +\keyword{internal} diff --git a/man/dot-check_feature_availability.Rd b/man/dot-check_feature_availability.Rd new file mode 100644 index 00000000..f5c52c5e --- /dev/null +++ b/man/dot-check_feature_availability.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/DataParameterChecks.R +\name{.check_feature_availability} +\alias{.check_feature_availability} +\title{Internal function to check whether feature columns are found in the data} +\usage{ +.check_feature_availability(data, feature) +} +\arguments{ +\item{data}{Data set as loaded using the \code{.load_data} function.} + +\item{feature}{Character string(s) indicating one or more features.} +} +\description{ +This function checks whether feature columns can be found in the data set. +It will raise an error if any feature columns are missing from the data set. 
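For intuition, the base-R sketch below mirrors the kind of availability check described above; \code{data} and \code{feature} stand in for the function arguments, and the actual internal implementation may differ.

```r
# Illustrative sketch only: identify feature columns that are absent from the
# data set and raise an error listing them.
missing_features <- setdiff(feature, colnames(data))

if (length(missing_features) > 0) {
  stop(
    "The following feature columns were not found in the data set: ",
    paste(missing_features, collapse = ", "))
}
```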
+} +\keyword{internal} diff --git a/man/dot-check_input_identifier_column.Rd b/man/dot-check_input_identifier_column.Rd new file mode 100644 index 00000000..8a5c685d --- /dev/null +++ b/man/dot-check_input_identifier_column.Rd @@ -0,0 +1,65 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/DataParameterChecks.R +\name{.check_input_identifier_column} +\alias{.check_input_identifier_column} +\title{Internal function for checking consistency of the identifier columns} +\usage{ +.check_input_identifier_column( + id_column, + data, + signature = NULL, + exclude_features = NULL, + include_features = NULL, + other_id_column = NULL, + outcome_column = NULL, + col_type, + check_stringency = "strict" +) +} +\arguments{ +\item{id_column}{Character string indicating the currently inspected +identifier column.} + +\item{data}{Data set as loaded using the \code{.load_data} function.} + +\item{signature}{(\emph{optional}) One or more names of feature columns that are +considered part of a specific signature. Features specified here will +always be used for modelling. Ranking from feature selection has no effect +for these features.} + +\item{exclude_features}{(\emph{optional}) Feature columns that will be removed +from the data set. Cannot overlap with features in \code{signature}, +\code{novelty_features} or \code{include_features}.} + +\item{include_features}{(\emph{optional}) Feature columns that are specifically +included in the data set. By default all features are included. Cannot +overlap with \code{exclude_features}, but may overlap \code{signature}. Features in +\code{signature} and \code{novelty_features} are always included. If both +\code{exclude_features} and \code{include_features} are provided, \code{include_features} +takes precedence, provided that there is no overlap between the two.} + +\item{other_id_column}{Character string indicating another identifier column.} + +\item{outcome_column}{Character string indicating the outcome column(s).} + +\item{col_type}{Character string indicating the type of column, i.e. \code{sample} +or \code{batch}.} + +\item{check_stringency}{Specifies stringency of various checks. This is mostly: +\itemize{ +\item \code{strict}: default value used for \code{summon_familiar}. Thoroughly checks +input data. Used internally for checking development data. +\item \code{external_warn}: value used for \code{extract_data} and related methods. Less +stringent checks, but will warn for possible issues. Used internally for +checking data for evaluation and explanation. +\item \code{external}: value used for external methods such as \code{predict}. Less +stringent checks, particularly for identifier and outcome columns, which may +be completely absent. Used internally for \code{predict}. +}} +} +\description{ +This function checks whether an identifier column is consistent, i.e. 
+it exists, there is only one, and there is no overlap with any user-provided +feature columns, other identifier columns, or the outcome column. +} +\keyword{internal} diff --git a/man/dot-check_outcome_type_plausibility.Rd b/man/dot-check_outcome_type_plausibility.Rd new file mode 100644 index 00000000..1d62a9d9 --- /dev/null +++ b/man/dot-check_outcome_type_plausibility.Rd @@ -0,0 +1,48 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/DataParameterChecks.R +\name{.check_outcome_type_plausibility} +\alias{.check_outcome_type_plausibility} +\title{Internal function for checking if the outcome type fits well to the data} +\usage{ +.check_outcome_type_plausibility( + data, + outcome_type, + outcome_column, + censoring_indicator, + event_indicator, + competing_risk_indicator, + check_stringency = "strict" +) +} +\arguments{ +\item{data}{Data set as loaded using the \code{.load_data} function.} + +\item{outcome_type}{Character string indicating the type of outcome being +assessed.} + +\item{outcome_column}{Name of the outcome column in the data set.} + +\item{censoring_indicator}{Name of censoring indicator.} + +\item{event_indicator}{Name of event indicator.} + +\item{competing_risk_indicator}{Name of competing risk indicator.} + +\item{check_stringency}{Specifies stringency of various checks. This is mostly: +\itemize{ +\item \code{strict}: default value used for \code{summon_familiar}. Thoroughly checks +input data. Used internally for checking development data. +\item \code{external_warn}: value used for \code{extract_data} and related methods. Less +stringent checks, but will warn for possible issues. Used internally for +checking data for evaluation and explanation. +\item \code{external}: value used for external methods such as \code{predict}. Less +stringent checks, particularly for identifier and outcome columns, which may +be completely absent. Used internally for \code{predict}. +}} +} +\description{ +This function may help identify if the outcome type is plausible +given the outcome data. In practice it also tests whether the outcome column +is actually correct given the outcome type. +} +\keyword{internal} diff --git a/man/dot-check_survival_time_plausibility.Rd b/man/dot-check_survival_time_plausibility.Rd new file mode 100644 index 00000000..7127f058 --- /dev/null +++ b/man/dot-check_survival_time_plausibility.Rd @@ -0,0 +1,49 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/DataParameterChecks.R +\name{.check_survival_time_plausibility} +\alias{.check_survival_time_plausibility} +\title{Internal function to test plausibility of provided survival times.} +\usage{ +.check_survival_time_plausibility( + data, + outcome_type, + outcome_column, + check_stringency = "strict" +) +} +\arguments{ +\item{data}{Data set as loaded using the \code{.load_data} function.} + +\item{outcome_type}{(\strong{recommended}) Type of outcome found in the outcome +column. The outcome type determines many aspects of the overall process, +e.g. the available feature selection methods and learners, but also the +type of assessments that can be conducted to evaluate the resulting models. +Implemented outcome types are: +\itemize{ +\item \code{binomial}: categorical outcome with 2 levels. +\item \code{multinomial}: categorical outcome with 2 or more levels. +\item \code{count}: Poisson-distributed numeric outcomes. +\item \code{continuous}: general continuous numeric outcomes. +\item \code{survival}: survival outcome for time-to-event data. 
+} + +If not provided, the algorithm will attempt to obtain outcome_type from +contents of the outcome column. This may lead to unexpected results, and we +therefore advise to provide this information manually. + +Note that \code{competing_risk} survival analysis are not fully supported, and +is currently not a valid choice for \code{outcome_type}.} + +\item{outcome_column}{(\strong{recommended}) Name of the column containing the +outcome of interest. May be identified from a formula, if a formula is +provided as an argument. Otherwise an error is raised. Note that \code{survival} +and \code{competing_risk} outcome type outcomes require two columns that +indicate the time-to-event or the time of last follow-up and the event +status.} +} +\description{ +This function checks whether non-positive outcome time is present in the +data. This may produce unexpected results for some packages. For example, +glmnet will not train if an instance has a survival time of 0 or lower. +} +\keyword{internal} diff --git a/man/dot-finish_data_preparation.Rd b/man/dot-finish_data_preparation.Rd new file mode 100644 index 00000000..8b44a538 --- /dev/null +++ b/man/dot-finish_data_preparation.Rd @@ -0,0 +1,161 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ParseData.R +\name{.finish_data_preparation} +\alias{.finish_data_preparation} +\title{Internal function for finalising generic data processing} +\usage{ +.finish_data_preparation( + data, + sample_id_column, + batch_id_column, + series_id_column, + outcome_column, + outcome_type, + include_features, + class_levels, + censoring_indicator, + event_indicator, + competing_risk_indicator, + check_stringency = "strict", + reference_method = "auto" +) +} +\arguments{ +\item{data}{data.table with feature data} + +\item{sample_id_column}{(\strong{recommended}) Name of the column containing +sample or subject identifiers. See \code{batch_id_column} above for more +details. + +If unset, every row will be identified as a single sample.} + +\item{batch_id_column}{(\strong{recommended}) Name of the column containing batch +or cohort identifiers. This parameter is required if more than one dataset +is provided, or if external validation is performed. + +In familiar any row of data is organised by four identifiers: +\itemize{ +\item The batch identifier \code{batch_id_column}: This denotes the group to which a +set of samples belongs, e.g. patients from a single study, samples measured +in a batch, etc. The batch identifier is used for batch normalisation, as +well as selection of development and validation datasets. +\item The sample identifier \code{sample_id_column}: This denotes the sample level, +e.g. data from a single individual. Subsets of data, e.g. bootstraps or +cross-validation folds, are created at this level. +\item The series identifier \code{series_id_column}: Indicates measurements on a +single sample that may not share the same outcome value, e.g. a time +series, or the number of cells in a view. +\item The repetition identifier: Indicates repeated measurements in a single +series where any feature values may differ, but the outcome does not. +Repetition identifiers are always implicitly set when multiple entries for +the same series of the same sample in the same batch that share the same +outcome are encountered. +}} + +\item{series_id_column}{(\strong{optional}) Name of the column containing series +identifiers, which distinguish between measurements that are part of a +series for a single sample. 
See \code{batch_id_column} above for more details. + +If unset, rows which share the same batch and sample identifiers but have a +different outcome are assigned unique series identifiers.} + +\item{outcome_column}{(\strong{recommended}) Name of the column containing the +outcome of interest. May be identified from a formula, if a formula is +provided as an argument. Otherwise an error is raised. Note that \code{survival} +and \code{competing_risk} outcome type outcomes require two columns that +indicate the time-to-event or the time of last follow-up and the event +status.} + +\item{outcome_type}{(\strong{recommended}) Type of outcome found in the outcome +column. The outcome type determines many aspects of the overall process, +e.g. the available feature selection methods and learners, but also the +type of assessments that can be conducted to evaluate the resulting models. +Implemented outcome types are: +\itemize{ +\item \code{binomial}: categorical outcome with 2 levels. +\item \code{multinomial}: categorical outcome with 2 or more levels. +\item \code{count}: Poisson-distributed numeric outcomes. +\item \code{continuous}: general continuous numeric outcomes. +\item \code{survival}: survival outcome for time-to-event data. +} + +If not provided, the algorithm will attempt to obtain outcome_type from +contents of the outcome column. This may lead to unexpected results, and we +therefore advise to provide this information manually. + +Note that \code{competing_risk} survival analysis are not fully supported, and +is currently not a valid choice for \code{outcome_type}.} + +\item{include_features}{(\emph{optional}) Feature columns that are specifically +included in the data set. By default all features are included. Cannot +overlap with \code{exclude_features}, but may overlap \code{signature}. Features in +\code{signature} and \code{novelty_features} are always included. If both +\code{exclude_features} and \code{include_features} are provided, \code{include_features} +takes precedence, provided that there is no overlap between the two.} + +\item{class_levels}{(\emph{optional}) Class levels for \code{binomial} or \code{multinomial} +outcomes. This argument can be used to specify the ordering of levels for +categorical outcomes. These class levels must exactly match the levels +present in the outcome column.} + +\item{censoring_indicator}{(\strong{recommended}) Indicator for right-censoring in +\code{survival} and \code{competing_risk} analyses. \code{familiar} will automatically +recognise \code{0}, \code{false}, \code{f}, \code{n}, \code{no} as censoring indicators, including +different capitalisations. If this parameter is set, it replaces the +default values.} + +\item{event_indicator}{(\strong{recommended}) Indicator for events in \code{survival} +and \code{competing_risk} analyses. \code{familiar} will automatically recognise \code{1}, +\code{true}, \code{t}, \code{y} and \code{yes} as event indicators, including different +capitalisations. If this parameter is set, it replaces the default values.} + +\item{competing_risk_indicator}{(\strong{recommended}) Indicator for competing +risks in \code{competing_risk} analyses. There are no default values, and if +unset, all values other than those specified by the \code{event_indicator} and +\code{censoring_indicator} parameters are considered to indicate competing +risks.} + +\item{check_stringency}{Specifies stringency of various checks. This is mostly: +\itemize{ +\item \code{strict}: default value used for \code{summon_familiar}. 
Thoroughly checks +input data. Used internally for checking development data. +\item \code{external_warn}: value used for \code{extract_data} and related methods. Less +stringent checks, but will warn for possible issues. Used internally for +checking data for evaluation and explanation. +\item \code{external}: value used for external methods such as \code{predict}. Less +stringent checks, particularly for identifier and outcome columns, which may +be completely absent. Used internally for \code{predict}. +}} + +\item{reference_method}{(\emph{optional}) Method used to set reference levels for +categorical features. There are several options: +\itemize{ +\item \code{auto} (default): Categorical features that are not explicitly set by the +user, i.e. columns containing boolean values or characters, use the most +frequent level as reference. Categorical features that are explicitly set, +i.e. as factors, are used as is. +\item \code{always}: Both automatically detected and user-specified categorical +features have the reference level set to the most frequent level. Ordinal +features are not altered, but are used as is. +\item \code{never}: User-specified categorical features are used as is. +Automatically detected categorical features are simply sorted, and the +first level is then used as the reference level. This was the behaviour +prior to familiar version 1.3.0. +}} +} +\value{ +data.table with expected column names. +} +\description{ +Internal function for finalising generic data processing +} +\details{ +This function is used to update data.table provided by loading the +data. When part of the main familiar workflow, this function is used after +.parse_initial_settings --> .load_data --> .update_initial_settings. + +When used to parse external data (e.g. in conjunction with familiarModel) +it follows after .load_data. Hence the function contains several checks +which are otherwise part of .update_initial_settings. +} +\keyword{internal} diff --git a/man/dot-get_default_sign_size.Rd b/man/dot-get_default_sign_size.Rd new file mode 100644 index 00000000..7fd02770 --- /dev/null +++ b/man/dot-get_default_sign_size.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/LearnerMain.R +\name{.get_default_sign_size} +\alias{.get_default_sign_size} +\title{Internal function for obtaining a default signature size parameter} +\usage{ +.get_default_sign_size(data_obj, restrict_samples = FALSE) +} +\arguments{ +\item{data_obj}{dataObject class object which contains the data on which the +preset parameters are determined.} + +\item{restrict_samples}{Logical indicating whether the signature size should +be limited by the number of samples in addition to the number of available +features. This may help convergence of OLS-based methods.} +} +\value{ +List containing the preset values for the signature size parameter. 
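Stepping back from the internals, the data preparation arguments documented above mirror the settings that are normally supplied through \code{summon_familiar} or a configuration file. The sketch below is a hedged illustration for a survival task; the data set, column names, feature selection method and learner are placeholders rather than prescribed values.

```r
# Hedged sketch: supplying outcome columns and event/censoring indicators for a
# survival analysis. All names and method choices are illustrative.
library(familiar)

summon_familiar(
  data = survival_data,
  sample_id_column = "patient_id",
  batch_id_column = "cohort",
  outcome_type = "survival",
  outcome_column = c("follow_up_time", "event_status"),
  event_indicator = "event",
  censoring_indicator = "censored",
  experimental_design = "fs+mb",
  fs_method = "mim",
  learner = "cox")
```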
+} +\description{ +Internal function for obtaining a default signature size parameter +} +\keyword{internal} diff --git a/man/dot-get_iteration_data.Rd b/man/dot-get_iteration_data.Rd new file mode 100644 index 00000000..5fd03f50 --- /dev/null +++ b/man/dot-get_iteration_data.Rd @@ -0,0 +1,44 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Iterations.R +\name{.get_iteration_data} +\alias{.get_iteration_data} +\title{Internal function for creating or retrieving iteration data} +\usage{ +.get_iteration_data( + file_paths, + data, + experiment_setup, + settings, + message_indent = 0L, + verbose = TRUE +) +} +\arguments{ +\item{file_paths}{Set of paths to relevant files and directories.} + +\item{data}{Data set as loaded using the \code{.load_data} function.} + +\item{experiment_setup}{data.table with subsampler information at different +levels of the experimental design.} + +\item{settings}{List of parameter settings. Some of these parameters are +relevant to creating iterations.} + +\item{message_indent}{Indenting of messages.} + +\item{verbose}{Sets verbosity.} +} +\value{ +A list with the following elements: +\itemize{ +\item \code{iter_list}: A list containing iteration data at the different levels of +the experiment. +\item \code{project_id}: The unique project identifier. +\item \code{experiment_setup}: data.table with subsampler information at different +levels of the experimental design. +} +} +\description{ +Internal function for creating or retrieving iteration data +} +\keyword{internal} diff --git a/man/dot-impute_outcome_type.Rd b/man/dot-impute_outcome_type.Rd new file mode 100644 index 00000000..ea7d3948 --- /dev/null +++ b/man/dot-impute_outcome_type.Rd @@ -0,0 +1,40 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/DataParameterChecks.R +\name{.impute_outcome_type} +\alias{.impute_outcome_type} +\title{Internal imputation function for the outcome type.} +\usage{ +.impute_outcome_type( + data, + outcome_column, + class_levels, + censoring_indicator, + event_indicator, + competing_risk_indicator +) +} +\arguments{ +\item{data}{Data set as loaded using the \code{.load_data} function.} + +\item{outcome_column}{Name of the outcome column in the data set.} + +\item{class_levels}{User-provided class levels for the outcome.} + +\item{censoring_indicator}{Name of censoring indicator.} + +\item{event_indicator}{Name of event indicator.} + +\item{competing_risk_indicator}{Name of competing risk indicator.} +} +\value{ +The imputed outcome type. +} +\description{ +This function allows for imputation of the most plausible outcome type. +This imputation is only done for trivial cases, where there is little doubt. +As a consequence \code{count} and \code{continuous} outcome types are never imputed. +} +\note{ +It is highly recommended that the user provides the outcome type. 
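To make the note above concrete, the following base-R sketch captures the spirit of imputing only trivial cases; the actual implementation differs and, for instance, also considers the censoring and event indicators.

```r
# Rough sketch: categorical-looking outcomes can be typed with little doubt,
# whereas numeric outcomes (count vs. continuous) are never imputed.
outcome <- data[[outcome_column]]

imputed_outcome_type <- if (is.character(outcome) || is.factor(outcome) || is.logical(outcome)) {
  if (length(unique(outcome)) == 2L) "binomial" else "multinomial"
} else {
  NA_character_
}
```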
+} +\keyword{internal} diff --git a/man/dot-load_iterations.Rd b/man/dot-load_iterations.Rd new file mode 100644 index 00000000..0ac1daef --- /dev/null +++ b/man/dot-load_iterations.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Iterations.R +\name{.load_iterations} +\alias{.load_iterations} +\title{Internal function for loading iteration data from the file system} +\usage{ +.load_iterations(file_dir, iteration_file = NULL) +} +\arguments{ +\item{file_dir}{Path to directory where iteration files are stored.} +} +\value{ +List containing: +\itemize{ +\item \code{iteration_file_exists}: An indicator whether an iteration file was found. +\item \code{iteration_list}: The list of iterations (if available). +\item \code{project_id}: The unique project identifier (if available). +} +} +\description{ +Loads iterations generated by \code{.create_iterations} that were created in a +previous session. If these are not available, this is indicated by setting a +return flag. +} +\keyword{internal} diff --git a/man/dot-parse_categorical_features.Rd b/man/dot-parse_categorical_features.Rd new file mode 100644 index 00000000..08fd0a93 --- /dev/null +++ b/man/dot-parse_categorical_features.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ParseData.R +\name{.parse_categorical_features} +\alias{.parse_categorical_features} +\title{Internal function for setting categorical features} +\usage{ +.parse_categorical_features(data, outcome_type, reference_method = "auto") +} +\arguments{ +\item{data}{data.table with feature data} + +\item{outcome_type}{character, indicating the type of outcome} + +\item{reference_method}{character, indicating the type of method used to set +the reference level.} +} +\value{ +data.table with several features converted to factor. +} +\description{ +Internal function for setting categorical features +} +\details{ +This function parses columns containing feature data to factors if +the data contained therein have logical (TRUE, FALSE), character, or factor +classes. Unless passed as feature names with \code{reference}, numerical data, +including integers, are not converted to factors. 
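In practice this means that the user controls factor handling by encoding columns explicitly before passing the data. The sketch below is illustrative only; the column names and levels are hypothetical.

```r
# Declaring 'smoking_status' as a factor fixes its levels, and with
# reference_method = "auto" it is used as-is. Character or logical columns
# would instead be converted automatically, using the most frequent level as
# the reference.
my_data$smoking_status <- factor(
  my_data$smoking_status,
  levels = c("never", "former", "current"))

# Integer-coded columns are not converted automatically; encode ordinal
# features explicitly if factor handling is intended.
my_data$grade <- factor(my_data$grade, levels = c(1, 2, 3), ordered = TRUE)
```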
+} +\keyword{internal} diff --git a/man/dot-parse_evaluation_settings.Rd b/man/dot-parse_evaluation_settings.Rd new file mode 100644 index 00000000..728a0f6f --- /dev/null +++ b/man/dot-parse_evaluation_settings.Rd @@ -0,0 +1,488 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ParseSettings.R +\name{.parse_evaluation_settings} +\alias{.parse_evaluation_settings} +\title{Internal function for parsing settings related to model evaluation} +\usage{ +.parse_evaluation_settings( + config = NULL, + data, + parallel, + outcome_type, + hpo_metric, + development_batch_id, + vimp_aggregation_method, + vimp_aggregation_rank_threshold, + prep_cluster_method, + prep_cluster_linkage_method, + prep_cluster_cut_method, + prep_cluster_similarity_threshold, + prep_cluster_similarity_metric, + evaluate_top_level_only = waiver(), + skip_evaluation_elements = waiver(), + ensemble_method = waiver(), + evaluation_metric = waiver(), + sample_limit = waiver(), + detail_level = waiver(), + estimation_type = waiver(), + aggregate_results = waiver(), + confidence_level = waiver(), + bootstrap_ci_method = waiver(), + feature_cluster_method = waiver(), + feature_cluster_cut_method = waiver(), + feature_linkage_method = waiver(), + feature_similarity_metric = waiver(), + feature_similarity_threshold = waiver(), + sample_cluster_method = waiver(), + sample_linkage_method = waiver(), + sample_similarity_metric = waiver(), + eval_aggregation_method = waiver(), + eval_aggregation_rank_threshold = waiver(), + eval_icc_type = waiver(), + stratification_method = waiver(), + stratification_threshold = waiver(), + time_max = waiver(), + evaluation_times = waiver(), + dynamic_model_loading = waiver(), + parallel_evaluation = waiver(), + ... +) +} +\arguments{ +\item{config}{A list of settings, e.g. from an xml file.} + +\item{data}{Data set as loaded using the \code{.load_data} function.} + +\item{parallel}{Logical value that whether familiar uses parallelisation. If +\code{FALSE} it will override \code{parallel_evaluation}.} + +\item{outcome_type}{Type of outcome found in the data set.} + +\item{hpo_metric}{Metric defined for hyperparameter optimisation.} + +\item{development_batch_id}{Identifiers of batches used for model development. +These identifiers are used to determine the cohorts used to determine a +setting for \code{time_max}, if the \code{outcome_type} is \code{survival}, and both +\code{time_max} and \code{evaluation_times} are not provided.} + +\item{vimp_aggregation_method}{Method for variable importance aggregation that +was used for feature selection.} + +\item{vimp_aggregation_rank_threshold}{Rank threshold for variable importance +aggregation used during feature selection.} + +\item{prep_cluster_method}{Cluster method used during pre-processing.} + +\item{prep_cluster_linkage_method}{Cluster linkage method used during +pre-processing.} + +\item{prep_cluster_cut_method}{Cluster cut method used during pre-processing.} + +\item{prep_cluster_similarity_threshold}{Cluster similarity threshold used +during pre-processing.} + +\item{prep_cluster_similarity_metric}{Cluster similarity metric used during +pre-processing.} + +\item{evaluate_top_level_only}{(\emph{optional}) Flag that signals that only +evaluation at the most global experiment level is required. Consider a +cross-validation experiment with additional external validation. The global +experiment level consists of data that are used for development, internal +validation and external validation. 
The next lower experiment level are the +individual cross-validation iterations. + +When the flag is \code{true}, evaluations take place on the global level only, +and no results are generated for the next lower experiment levels. In our +example, this means that results from individual cross-validation iterations +are not computed and shown. When the flag is \code{false}, results are computed +from both the global layer and the next lower level. + +Setting the flag to \code{true} saves computation time.} + +\item{skip_evaluation_elements}{(\emph{optional}) Specifies which evaluation steps, +if any, should be skipped as part of the evaluation process. Defaults to +\code{none}, which means that all relevant evaluation steps are performed. It can +have one or more of the following values: +\itemize{ +\item \code{none}, \code{false}: no steps are skipped. +\item \code{all}, \code{true}: all steps are skipped. +\item \code{auc_data}: data for assessing and plotting the area under the receiver +operating characteristic curve are not computed. +\item \code{calibration_data}: data for assessing and plotting model calibration are +not computed. +\item \code{calibration_info}: data required to assess calibration, such as baseline +survival curves, are not collected. These data will still be present in the +models. +\item \code{confusion_matrix}: data for assessing and plotting a confusion matrix are +not collected. +\item \code{decision_curve_analyis}: data for performing a decision curve analysis +are not computed. +\item \code{feature_expressions}: data for assessing and plotting sample clustering +are not computed. +\item \code{feature_similarity}: data for assessing and plotting feature clusters are +not computed. +\item \code{fs_vimp}: data for assessing and plotting feature selection-based +variable importance are not collected. +\item \code{hyperparameters}: data for assessing model hyperparameters are not +collected. These data will still be present in the models. +\item \code{ice_data}: data for individual conditional expectation and partial +dependence plots are not created. +\item \code{model_performance}: data for assessing and visualising model performance +are not created. +\item \code{model_vimp}: data for assessing and plotting model-based variable +importance are not collected. +\item \code{permutation_vimp}: data for assessing and plotting model-agnostic +permutation variable importance are not computed. +\item \code{prediction_data}: predictions for each sample are not made and exported. +\item \code{risk_stratification_data}: data for assessing and plotting Kaplan-Meier +survival curves are not collected. +\item \code{risk_stratification_info}: data for assessing stratification into risk +groups are not computed. +\item \code{univariate_analysis}: data for assessing and plotting univariate feature +importance are not computed. +}} + +\item{ensemble_method}{(\emph{optional}) Method for ensembling predictions from +models for the same sample. Available methods are: +\itemize{ +\item \code{median} (default): Use the median of the predicted values as the ensemble +value for a sample. +\item \code{mean}: Use the mean of the predicted values as the ensemble value for a +sample. +} + +This parameter is only used if \code{detail_level} is \code{ensemble}.} + +\item{evaluation_metric}{(\emph{optional}) One or more metrics for assessing model +performance. See the vignette on performance metrics for the available +metrics. 
+ +Confidence intervals (or rather credibility intervals) are computed for each +metric during evaluation. This is done using bootstraps, the number of which +depends on the value of \code{confidence_level} (Davison and Hinkley, 1997). + +If unset, the metric in the \code{optimisation_metric} variable is used.} + +\item{sample_limit}{(\emph{optional}) Set the upper limit of the number of samples +that are used during evaluation steps. Cannot be less than 20. + +This setting can be specified per data element by providing a parameter +value in a named list with data elements, e.g. +\code{list("sample_similarity"=100, "permutation_vimp"=1000)}. + +This parameter can be set for the following data elements: +\code{sample_similarity} and \code{ice_data}.} + +\item{detail_level}{(\emph{optional}) Sets the level at which results are computed +and aggregated. +\itemize{ +\item \code{ensemble}: Results are computed at the ensemble level, i.e. over all +models in the ensemble. This means that, for example, bias-corrected +estimates of model performance are assessed by creating (at least) 20 +bootstraps and computing the model performance of the ensemble model for +each bootstrap. +\item \code{hybrid} (default): Results are computed at the level of models in an +ensemble. This means that, for example, bias-corrected estimates of model +performance are directly computed using the models in the ensemble. If there +are at least 20 trained models in the ensemble, performance is computed for +each model, in contrast to \code{ensemble} where performance is computed for the +ensemble of models. If there are less than 20 trained models in the +ensemble, bootstraps are created so that at least 20 point estimates can be +made. +\item \code{model}: Results are computed at the model level. This means that, for +example, bias-corrected estimates of model performance are assessed by +creating (at least) 20 bootstraps and computing the performance of the model +for each bootstrap. +} + +Note that each level of detail has a different interpretation for bootstrap +confidence intervals. For \code{ensemble} and \code{model} these are the confidence +intervals for the ensemble and an individual model, respectively. That is, +the confidence interval describes the range where an estimate produced by a +respective ensemble or model trained on a repeat of the experiment may be +found with the probability of the confidence level. For \code{hybrid}, it +represents the range where any single model trained on a repeat of the +experiment may be found with the probability of the confidence level. By +definition, confidence intervals obtained using \code{hybrid} are at least as +wide as those for \code{ensemble}. \code{hybrid} offers the correct interpretation if +the goal of the analysis is to assess the result of a single, unspecified, +model. + +\code{hybrid} is generally computationally less expensive then \code{ensemble}, which +in turn is somewhat less expensive than \code{model}. + +A non-default \code{detail_level} parameter can be specified for separate +evaluation steps by providing a parameter value in a named list with data +elements, e.g. \code{list("auc_data"="ensemble", "model_performance"="hybrid")}. 
+This parameter can be set for the following data elements: \code{auc_data}, +\code{decision_curve_analyis}, \code{model_performance}, \code{permutation_vimp}, +\code{ice_data}, \code{prediction_data} and \code{confusion_matrix}.} + +\item{estimation_type}{(\emph{optional}) Sets the type of estimation that should be +possible. This has the following options: +\itemize{ +\item \code{point}: Point estimates. +\item \code{bias_correction} or \code{bc}: Bias-corrected estimates. A bias-corrected +estimate is computed from (at least) 20 point estimates, and \code{familiar} may +bootstrap the data to create them. +\item \code{bootstrap_confidence_interval} or \code{bci} (default): Bias-corrected +estimates with bootstrap confidence intervals (Efron and Hastie, 2016). The +number of point estimates required depends on the \code{confidence_level} +parameter, and \code{familiar} may bootstrap the data to create them. +} + +As with \code{detail_level}, a non-default \code{estimation_type} parameter can be +specified for separate evaluation steps by providing a parameter value in a +named list with data elements, e.g. \code{list("auc_data"="bci", "model_performance"="point")}. This parameter can be set for the following +data elements: \code{auc_data}, \code{decision_curve_analyis}, \code{model_performance}, +\code{permutation_vimp}, \code{ice_data}, and \code{prediction_data}.} + +\item{aggregate_results}{(\emph{optional}) Flag that signifies whether results +should be aggregated during evaluation. If \code{estimation_type} is +\code{bias_correction} or \code{bc}, aggregation leads to a single bias-corrected +estimate. If \code{estimation_type} is \code{bootstrap_confidence_interval} or \code{bci}, +aggregation leads to a single bias-corrected estimate with lower and upper +boundaries of the confidence interval. This has no effect if +\code{estimation_type} is \code{point}. + +The default value is equal to \code{TRUE} except when assessing metrics to assess +model performance, as the default violin plot requires underlying data. + +As with \code{detail_level} and \code{estimation_type}, a non-default +\code{aggregate_results} parameter can be specified for separate evaluation steps +by providing a parameter value in a named list with data elements, e.g. +\code{list("auc_data"=TRUE, , "model_performance"=FALSE)}. This parameter exists +for the same elements as \code{estimation_type}.} + +\item{confidence_level}{(\emph{optional}) Numeric value for the level at which +confidence intervals are determined. In the case bootstraps are used to +determine the confidence intervals bootstrap estimation, \code{familiar} uses the +rule of thumb \eqn{n = 20 / ci.level} to determine the number of required +bootstraps. + +The default value is \code{0.95}.} + +\item{bootstrap_ci_method}{(\emph{optional}) Method used to determine bootstrap +confidence intervals (Efron and Hastie, 2016). The following methods are +implemented: +\itemize{ +\item \code{percentile} (default): Confidence intervals obtained using the percentile +method. +\item \code{bc}: Bias-corrected confidence intervals. +} + +Note that the standard method is not implemented because this method is +often not suitable due to non-normal distributions. The bias-corrected and +accelerated (BCa) method is not implemented yet.} + +\item{feature_cluster_method}{(\emph{optional}) Method used to perform clustering +of features. 
The same methods as for the \code{cluster_method} configuration +parameter are available: \code{none}, \code{hclust}, \code{agnes}, \code{diana} and \code{pam}. + +The value for the \code{cluster_method} configuration parameter is used by +default. When generating clusters for the purpose of determining mutual +correlation and ordering feature expressions, \code{none} is ignored and \code{hclust} +is used instead.} + +\item{feature_cluster_cut_method}{(\emph{optional}) Method used to divide features +into separate clusters. The available methods are the same as for the +\code{cluster_cut_method} configuration parameter: \code{silhouette}, \code{fixed_cut} and +\code{dynamic_cut}. + +\code{silhouette} is available for all cluster methods, but \code{fixed_cut} only +applies to methods that create hierarchical trees (\code{hclust}, \code{agnes} and +\code{diana}). \code{dynamic_cut} requires the \code{dynamicTreeCut} package and can only +be used with \code{agnes} and \code{hclust}. + +The value for the \code{cluster_cut_method} configuration parameter is used by +default.} + +\item{feature_linkage_method}{(\emph{optional}) Method used for agglomerative +clustering with \code{hclust} and \code{agnes}. Linkage determines how features are +sequentially combined into clusters based on distance. The methods are +shared with the \code{cluster_linkage_method} configuration parameter: \code{average}, +\code{single}, \code{complete}, \code{weighted}, and \code{ward}. + +The value for the \code{cluster_linkage_method} configuration parameters is used +by default.} + +\item{feature_similarity_metric}{(\emph{optional}) Metric to determine pairwise +similarity between features. Similarity is computed in the same manner as +for clustering, and \code{feature_similarity_metric} therefore has the same +options as \code{cluster_similarity_metric}: \code{mcfadden_r2}, \code{cox_snell_r2}, +\code{nagelkerke_r2}, \code{mutual_information}, \code{spearman}, \code{kendall} and \code{pearson}. + +The value used for the \code{cluster_similarity_metric} configuration parameter +is used by default.} + +\item{feature_similarity_threshold}{(\emph{optional}) The threshold level for +pair-wise similarity that is required to form feature clusters with the +\code{fixed_cut} method. This threshold functions in the same manner as the one +defined using the \code{cluster_similarity_threshold} parameter. + +By default, the value for the \code{cluster_similarity_threshold} configuration +parameter is used. + +Unlike for \code{cluster_similarity_threshold}, more than one value can be +supplied here.} + +\item{sample_cluster_method}{(\emph{optional}) The method used to perform +clustering based on distance between samples. These are the same methods as +for the \code{cluster_method} configuration parameter: \code{hclust}, \code{agnes}, \code{diana} +and \code{pam}. + +The value for the \code{cluster_method} configuration parameter is used by +default. When generating clusters for the purpose of ordering samples in +feature expressions, \code{none} is ignored and \code{hclust} is used instead.} + +\item{sample_linkage_method}{(\emph{optional}) The method used for agglomerative +clustering in \code{hclust} and \code{agnes}. These are the same methods as for the +\code{cluster_linkage_method} configuration parameter: \code{average}, \code{single}, +\code{complete}, \code{weighted}, and \code{ward}. 
+ +The value for the \code{cluster_linkage_method} configuration parameters is used +by default.} + +\item{sample_similarity_metric}{(\emph{optional}) Metric to determine pairwise +similarity between samples. Similarity is computed in the same manner as for +clustering, but \code{sample_similarity_metric} has different options that are +better suited to computing distance between samples instead of between +features. The following metrics are available. +\itemize{ +\item \code{gower} (default): compute Gower's distance between samples. By default, +Gower's distance is computed based on winsorised data to reduce the effect +of outliers (see below). +\item \code{euclidean}: compute the Euclidean distance between samples. +} + +The underlying feature data for numerical features is scaled to the +\eqn{[0,1]} range using the feature values across the samples. The +normalisation parameters required can optionally be computed from feature +data with the outer 5\% (on both sides) of feature values trimmed or +winsorised. To do so append \verb{_trim} (trimming) or \verb{_winsor} (winsorising) to +the metric name. This reduces the effect of outliers somewhat. + +Regardless of metric, all categorical features are handled as for the +Gower's distance: distance is 0 if the values in a pair of samples match, +and 1 if they do not.} + +\item{eval_aggregation_method}{(\emph{optional}) Method for aggregating variable +importances for the purpose of evaluation. Variable importances are +determined during feature selection steps and after training the model. Both +types are evaluated, but feature selection variable importance is only +evaluated at run-time. + +See the documentation for the \code{vimp_aggregation_method} argument for +information concerning the different methods available.} + +\item{eval_aggregation_rank_threshold}{(\emph{optional}) The threshold used to +define the subset of highly important features during evaluation. + +See the documentation for the \code{vimp_aggregation_rank_threshold} argument for +more information.} + +\item{eval_icc_type}{(\emph{optional}) String indicating the type of intraclass +correlation coefficient (\code{1}, \code{2} or \code{3}) that should be used to compute +robustness for features in repeated measurements during the evaluation of +univariate importance. These types correspond to the types in Shrout and +Fleiss (1979). The default value is \code{1}.} + +\item{stratification_method}{(\emph{optional}) Method for determining the +stratification threshold for creating survival groups. The actual, +model-dependent, threshold value is obtained from the development data, and +can afterwards be used to perform stratification on validation data. + +The following stratification methods are available: +\itemize{ +\item \code{median} (default): The median predicted value in the development cohort +is used to stratify the samples into two risk groups. For predicted outcome +values that build a continuous spectrum, the two risk groups in the +development cohort will be roughly equal in size. +\item \code{mean}: The mean predicted value in the development cohort is used to +stratify the samples into two risk groups. +\item \code{mean_trim}: As \code{mean}, but based on the set of predicted values +where the 5\% lowest and 5\% highest values are discarded. This reduces the +effect of outliers. +\item \code{mean_winsor}: As \code{mean}, but based on the set of predicted values where +the 5\% lowest and 5\% highest values are winsorised. This reduces the effect +of outliers. 
+\item \code{fixed}: Samples are stratified based on the sample quantiles of the +predicted values. These quantiles are defined using the +\code{stratification_threshold} parameter. +\item \code{optimised}: Use maximally selected rank statistics to determine the +optimal threshold (Lausen and Schumacher, 1992; Hothorn et al., 2003) to +stratify samples into two optimally separated risk groups. +} + +One or more stratification methods can be selected simultaneously. + +This parameter is only relevant for \code{survival} outcomes.} + +\item{stratification_threshold}{(\emph{optional}) Numeric value(s) signifying the +sample quantiles for stratification using the \code{fixed} method. The number of +risk groups will be the number of values +1. + +The default value is \code{c(1/3, 2/3)}, which will yield two thresholds that +divide samples into three equally sized groups. If \code{fixed} is not among the +selected stratification methods, this parameter is ignored. + +This parameter is only relevant for \code{survival} outcomes.} + +\item{time_max}{(\emph{optional}) Time point which is used as the benchmark for +e.g. cumulative risks generated by random forest, or the cutoff for Uno's +concordance index. + +If \code{time_max} is not provided, but \code{evaluation_times} is, the largest value +of \code{evaluation_times} is used. If both are not provided, \code{time_max} is set +to the 98th percentile of the distribution of survival times for samples +with an event in the development data set. + +This parameter is only relevant for \code{survival} outcomes.} + +\item{evaluation_times}{(\emph{optional}) One or more time points that are used for +assessing calibration in survival problems. This is done as expected and +observed survival probabilities depend on time. + +If unset, \code{evaluation_times} will be equal to \code{time_max}. + +This parameter is only relevant for \code{survival} outcomes.} + +\item{dynamic_model_loading}{(\emph{optional}) Enables dynamic loading of models +during the evaluation process, if \code{TRUE}. Defaults to \code{FALSE}. Dynamic +loading of models may reduce the overall memory footprint, at the cost of +increased disk or network IO. Models can only be dynamically loaded if they +are found at an accessible disk or network location. Setting this parameter +to \code{TRUE} may help if parallel processing causes out-of-memory issues during +evaluation.} + +\item{parallel_evaluation}{(\emph{optional}) Enable parallel processing for +hyperparameter optimisation. Defaults to \code{TRUE}. When set to \code{FALSE}, this +will disable the use of parallel processing while performing optimisation, +regardless of the settings of the \code{parallel} parameter. The parameter +moreover specifies whether parallelisation takes place within the evaluation +process steps (\code{inner}, default), or in an outer loop ( \code{outer}) over +learners, data subsamples, etc. + +\code{parallel_evaluation} is ignored if \code{parallel=FALSE}.} + +\item{...}{Unused arguments.} +} +\value{ +List of parameters related to model evaluation. +} +\description{ +Internal function for parsing settings related to model evaluation +} +\references{ +\enumerate{ +\item Davison, A. C. & Hinkley, D. V. Bootstrap methods and their +application. (Cambridge University Press, 1997). +\item Efron, B. & Hastie, T. Computer Age Statistical Inference. (Cambridge +University Press, 2016). +\item Lausen, B. & Schumacher, M. Maximally Selected Rank Statistics. +Biometrics 48, 73 (1992). +\item Hothorn, T. & Lausen, B. 
On the exact distribution of maximally selected +rank statistics. Comput. Stat. Data Anal. 43, 121–137 (2003). +} +} +\keyword{internal} diff --git a/man/dot-parse_experiment_settings.Rd b/man/dot-parse_experiment_settings.Rd new file mode 100644 index 00000000..c2a30dee --- /dev/null +++ b/man/dot-parse_experiment_settings.Rd @@ -0,0 +1,253 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ParseSettings.R +\name{.parse_experiment_settings} +\alias{.parse_experiment_settings} +\title{Internal function for parsing settings related to the computational setup} +\usage{ +.parse_experiment_settings( + config = NULL, + batch_id_column = waiver(), + sample_id_column = waiver(), + series_id_column = waiver(), + development_batch_id = waiver(), + validation_batch_id = waiver(), + outcome_name = waiver(), + outcome_column = waiver(), + outcome_type = waiver(), + event_indicator = waiver(), + censoring_indicator = waiver(), + competing_risk_indicator = waiver(), + class_levels = waiver(), + signature = waiver(), + novelty_features = waiver(), + exclude_features = waiver(), + include_features = waiver(), + reference_method = waiver(), + experimental_design = waiver(), + imbalance_correction_method = waiver(), + imbalance_n_partitions = waiver(), + ... +) +} +\arguments{ +\item{config}{A list of settings, e.g. from an xml file.} + +\item{batch_id_column}{(\strong{recommended}) Name of the column containing batch +or cohort identifiers. This parameter is required if more than one dataset +is provided, or if external validation is performed. + +In familiar any row of data is organised by four identifiers: +\itemize{ +\item The batch identifier \code{batch_id_column}: This denotes the group to which a +set of samples belongs, e.g. patients from a single study, samples measured +in a batch, etc. The batch identifier is used for batch normalisation, as +well as selection of development and validation datasets. +\item The sample identifier \code{sample_id_column}: This denotes the sample level, +e.g. data from a single individual. Subsets of data, e.g. bootstraps or +cross-validation folds, are created at this level. +\item The series identifier \code{series_id_column}: Indicates measurements on a +single sample that may not share the same outcome value, e.g. a time +series, or the number of cells in a view. +\item The repetition identifier: Indicates repeated measurements in a single +series where any feature values may differ, but the outcome does not. +Repetition identifiers are always implicitly set when multiple entries for +the same series of the same sample in the same batch that share the same +outcome are encountered. +}} + +\item{sample_id_column}{(\strong{recommended}) Name of the column containing +sample or subject identifiers. See \code{batch_id_column} above for more +details. + +If unset, every row will be identified as a single sample.} + +\item{series_id_column}{(\strong{optional}) Name of the column containing series +identifiers, which distinguish between measurements that are part of a +series for a single sample. See \code{batch_id_column} above for more details. + +If unset, rows which share the same batch and sample identifiers but have a +different outcome are assigned unique series identifiers.} + +\item{development_batch_id}{(\emph{optional}) One or more batch or cohort +identifiers to constitute data sets for development. Defaults to all, or +all minus the identifiers in \code{validation_batch_id} for external validation. 
+Required if external validation is performed and \code{validation_batch_id} is +not provided.} + +\item{validation_batch_id}{(\emph{optional}) One or more batch or cohort +identifiers to constitute data sets for external validation. Defaults to +all data sets except those in \code{development_batch_id} for external +validation, or none if not. Required if \code{development_batch_id} is not +provided.} + +\item{outcome_name}{(\emph{optional}) Name of the modelled outcome. This name will +be used in figures created by \code{familiar}. + +If not set, the column name in \code{outcome_column} will be used for +\code{binomial}, \code{multinomial}, \code{count} and \code{continuous} outcomes. For other +outcomes (\code{survival} and \code{competing_risk}) no default is used.} + +\item{outcome_column}{(\strong{recommended}) Name of the column containing the +outcome of interest. May be identified from a formula, if a formula is +provided as an argument. Otherwise an error is raised. Note that \code{survival} +and \code{competing_risk} outcome type outcomes require two columns that +indicate the time-to-event or the time of last follow-up and the event +status.} + +\item{outcome_type}{(\strong{recommended}) Type of outcome found in the outcome +column. The outcome type determines many aspects of the overall process, +e.g. the available feature selection methods and learners, but also the +type of assessments that can be conducted to evaluate the resulting models. +Implemented outcome types are: +\itemize{ +\item \code{binomial}: categorical outcome with 2 levels. +\item \code{multinomial}: categorical outcome with 2 or more levels. +\item \code{count}: Poisson-distributed numeric outcomes. +\item \code{continuous}: general continuous numeric outcomes. +\item \code{survival}: survival outcome for time-to-event data. +} + +If not provided, the algorithm will attempt to obtain outcome_type from +contents of the outcome column. This may lead to unexpected results, and we +therefore advise to provide this information manually. + +Note that \code{competing_risk} survival analysis are not fully supported, and +is currently not a valid choice for \code{outcome_type}.} + +\item{event_indicator}{(\strong{recommended}) Indicator for events in \code{survival} +and \code{competing_risk} analyses. \code{familiar} will automatically recognise \code{1}, +\code{true}, \code{t}, \code{y} and \code{yes} as event indicators, including different +capitalisations. If this parameter is set, it replaces the default values.} + +\item{censoring_indicator}{(\strong{recommended}) Indicator for right-censoring in +\code{survival} and \code{competing_risk} analyses. \code{familiar} will automatically +recognise \code{0}, \code{false}, \code{f}, \code{n}, \code{no} as censoring indicators, including +different capitalisations. If this parameter is set, it replaces the +default values.} + +\item{competing_risk_indicator}{(\strong{recommended}) Indicator for competing +risks in \code{competing_risk} analyses. There are no default values, and if +unset, all values other than those specified by the \code{event_indicator} and +\code{censoring_indicator} parameters are considered to indicate competing +risks.} + +\item{class_levels}{(\emph{optional}) Class levels for \code{binomial} or \code{multinomial} +outcomes. This argument can be used to specify the ordering of levels for +categorical outcomes. 
These class levels must exactly match the levels +present in the outcome column.} + +\item{signature}{(\emph{optional}) One or more names of feature columns that are +considered part of a specific signature. Features specified here will +always be used for modelling. Ranking from feature selection has no effect +for these features.} + +\item{novelty_features}{(\emph{optional}) One or more names of feature columns +that should be included for the purpose of novelty detection.} + +\item{exclude_features}{(\emph{optional}) Feature columns that will be removed +from the data set. Cannot overlap with features in \code{signature}, +\code{novelty_features} or \code{include_features}.} + +\item{include_features}{(\emph{optional}) Feature columns that are specifically +included in the data set. By default all features are included. Cannot +overlap with \code{exclude_features}, but may overlap \code{signature}. Features in +\code{signature} and \code{novelty_features} are always included. If both +\code{exclude_features} and \code{include_features} are provided, \code{include_features} +takes precedence, provided that there is no overlap between the two.} + +\item{reference_method}{(\emph{optional}) Method used to set reference levels for +categorical features. There are several options: +\itemize{ +\item \code{auto} (default): Categorical features that are not explicitly set by the +user, i.e. columns containing boolean values or characters, use the most +frequent level as reference. Categorical features that are explicitly set, +i.e. as factors, are used as is. +\item \code{always}: Both automatically detected and user-specified categorical +features have the reference level set to the most frequent level. Ordinal +features are not altered, but are used as is. +\item \code{never}: User-specified categorical features are used as is. +Automatically detected categorical features are simply sorted, and the +first level is then used as the reference level. This was the behaviour +prior to familiar version 1.3.0. +}} + +\item{experimental_design}{(\strong{required}) Defines what the experiment looks +like, e.g. \code{cv(bt(fs,20)+mb,3,2)+ev} for 2 times repeated 3-fold +cross-validation with nested feature selection on 20 bootstraps and +model-building, and external validation. The basic workflow components are: +\itemize{ +\item \code{fs}: (required) feature selection step. +\item \code{mb}: (required) model building step. +\item \code{ev}: (optional) external validation. Note that internal validation due +to subsampling will always be conducted if the subsampling methods create +any validation data sets. +} + +The different components are linked using \code{+}. + +Different subsampling methods can be used in conjunction with the basic +workflow components: +\itemize{ +\item \code{bs(x,n)}: (stratified) .632 bootstrap, with \code{n} the number of +bootstraps. In contrast to \code{bt}, feature pre-processing parameters and +hyperparameter optimisation are conducted on individual bootstraps. +\item \code{bt(x,n)}: (stratified) .632 bootstrap, with \code{n} the number of +bootstraps. Unlike \code{bs} and other subsampling methods, no separate +pre-processing parameters or optimised hyperparameters will be determined +for each bootstrap. +\item \code{cv(x,n,p)}: (stratified) \code{n}-fold cross-validation, repeated \code{p} times. +Pre-processing parameters are determined for each iteration. +\item \code{lv(x)}: leave-one-out-cross-validation. Pre-processing parameters are +determined for each iteration. 
+\item \code{ip(x)}: imbalance partitioning for addressing class imbalances on the +data set. Pre-processing parameters are determined for each partition. The +number of partitions generated depends on the imbalance correction method +(see the \code{imbalance_correction_method} parameter). Imbalance partitioning +does not generate validation sets. +} + +As shown in the example above, sampling algorithms can be nested. + +The simplest valid experimental design is \code{fs+mb}, which corresponds to a +TRIPOD type 1a analysis. Type 1b analyses are only possible using +bootstraps, e.g. \code{bt(fs+mb,100)}. Type 2a analyses can be conducted using +cross-validation, e.g. \code{cv(bt(fs,100)+mb,10,1)}. Depending on the origin of +the external validation data, designs such as \code{fs+mb+ev} or +\code{cv(bt(fs,100)+mb,10,1)+ev} constitute type 2b or type 3 analyses. Type 4 +analyses can be done by obtaining one or more \code{familiarModel} objects from +others and applying them to your own data set. + +Alternatively, the \code{experimental_design} parameter may be used to provide a +path to a file containing iterations, which is named \verb{####_iterations.RDS} +by convention. This path can be relative to the directory of the current +experiment (\code{experiment_dir}), or an absolute path. The absolute path may +thus also point to a file from a different experiment.} + +\item{imbalance_correction_method}{(\emph{optional}) Type of method used to +address class imbalances. Available options are: +\itemize{ +\item \code{full_undersampling} (default): All data will be used in an ensemble +fashion. The full minority class will appear in each partition, but +majority classes are undersampled until all data have been used. +\item \code{random_undersampling}: Randomly undersamples majority classes. This is +useful in cases where full undersampling would lead to the formation of +many models due major overrepresentation of the largest class. +} + +This parameter is only used in combination with imbalance partitioning in +the experimental design, and \code{ip} should therefore appear in the string +that defines the design.} + +\item{imbalance_n_partitions}{(\emph{optional}) Number of times random +undersampling should be repeated. 10 undersampled subsets with balanced +classes are formed by default.} + +\item{...}{Unused arguments.} +} +\value{ +List of parameters related to data parsing and the experiment. +} +\description{ +Internal function for parsing settings related to the computational setup +} +\keyword{internal} diff --git a/man/dot-parse_feature_selection_settings.Rd b/man/dot-parse_feature_selection_settings.Rd new file mode 100644 index 00000000..8c1c09fb --- /dev/null +++ b/man/dot-parse_feature_selection_settings.Rd @@ -0,0 +1,120 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ParseSettings.R +\name{.parse_feature_selection_settings} +\alias{.parse_feature_selection_settings} +\title{Internal function for parsing settings related to feature selection} +\usage{ +.parse_feature_selection_settings( + config = NULL, + data, + parallel, + outcome_type, + fs_method = waiver(), + fs_method_parameter = waiver(), + vimp_aggregation_method = waiver(), + vimp_aggregation_rank_threshold = waiver(), + parallel_feature_selection = waiver(), + ... +) +} +\arguments{ +\item{config}{A list of settings, e.g. from an xml file.} + +\item{data}{Data set as loaded using the \code{.load_data} function.} + +\item{parallel}{Logical value that whether familiar uses parallelisation. 
If
+\code{FALSE} it will override \code{parallel_feature_selection}.}
+
+\item{outcome_type}{Type of outcome found in the data set.}
+
+\item{fs_method}{(\strong{required}) Feature selection method to be used for
+determining variable importance. \code{familiar} implements various feature
+selection methods. Please refer to the vignette on feature selection
+methods for more details.
+
+More than one feature selection method can be chosen. The experiment will
+then be repeated for each feature selection method.
+
+Feature selection methods determine the ranking of features. Actual
+selection of features is done by optimising the signature size model
+hyperparameter during the hyperparameter optimisation step.}
+
+\item{fs_method_parameter}{(\emph{optional}) List of lists containing parameters
+for feature selection methods. Each sublist should have the name of the
+feature selection method it corresponds to.
+
+Most feature selection methods do not have parameters that can be set.
+Please refer to the vignette on feature selection methods for more details.
+Note that if the feature selection method is based on a learner (e.g. lasso
+regression), hyperparameter optimisation may be performed prior to
+assessing variable importance.}
+
+\item{vimp_aggregation_method}{(\emph{optional}) The method used to aggregate
+variable importances over different data subsets, e.g. bootstraps. The
+following methods can be selected:
+\itemize{
+\item \code{none}: Don't aggregate ranks, but rather aggregate the variable
+importance scores themselves.
+\item \code{mean}: Use the mean rank of a feature over the subsets to
+determine the aggregated feature rank.
+\item \code{median}: Use the median rank of a feature over the subsets to determine
+the aggregated feature rank.
+\item \code{best}: Use the best rank the feature obtained in any subset to determine
+the aggregated feature rank.
+\item \code{worst}: Use the worst rank the feature obtained in any subset to
+determine the aggregated feature rank.
+\item \code{stability}: Use the frequency of the feature being in the subset of
+highly ranked features as measure for the aggregated feature rank
+(Meinshausen and Buehlmann, 2010).
+\item \code{exponential}: Use a rank-weighted frequency of occurrence in the subset
+of highly ranked features as measure for the aggregated feature rank (Haury
+et al., 2011).
+\item \code{borda} (default): Use the borda count as measure for the aggregated
+feature rank (Wald et al., 2012).
+\item \code{enhanced_borda}: Use an occurrence frequency-weighted borda count as
+measure for the aggregated feature rank (Wald et al., 2012).
+\item \code{truncated_borda}: Use borda count computed only on features within the
+subset of highly ranked features.
+\item \code{enhanced_truncated_borda}: Apply both the enhanced borda method and the
+truncated borda method and use the resulting borda count as the aggregated
+feature rank.
+}
+
+The \emph{feature selection methods} vignette provides additional information.}
+
+\item{vimp_aggregation_rank_threshold}{(\emph{optional}) The threshold used to
+define the subset of highly important features. If not set, this threshold
+is determined by maximising the variance in the occurrence value over all
+features over the subset size. 
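+
+For illustration, a minimal sketch of how these feature selection settings
+could be supplied through \code{summon_familiar}, which passes them on to this
+parser. The data set is a toy example and \code{mrmr} merely stands in for any
+of the available feature selection methods; the chosen values are
+illustrative only:
+\preformatted{
+  # Toy two-class data set derived from iris.
+  df <- iris[iris$Species != "virginica", ]
+  df$Species <- droplevels(df$Species)
+
+  # Rank features on 20 bootstraps and aggregate the ranks with stability
+  # selection, using a rank threshold of 2.
+  familiar::summon_familiar(
+    data = df,
+    outcome_column = "Species",
+    outcome_type = "binomial",
+    experimental_design = "bt(fs,20)+mb",
+    fs_method = "mrmr",
+    vimp_aggregation_method = "stability",
+    vimp_aggregation_rank_threshold = 2,
+    parallel_feature_selection = FALSE,
+    learner = "glm_logistic"
+  )
+}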
+ +This parameter is only relevant for \code{stability}, \code{exponential}, +\code{enhanced_borda}, \code{truncated_borda} and \code{enhanced_truncated_borda} methods.} + +\item{parallel_feature_selection}{(\emph{optional}) Enable parallel processing for +the feature selection workflow. Defaults to \code{TRUE}. When set to \code{FALSE}, +this will disable the use of parallel processing while performing feature +selection, regardless of the settings of the \code{parallel} parameter. +\code{parallel_feature_selection} is ignored if \code{parallel=FALSE}.} + +\item{...}{Unused arguments.} +} +\value{ +List of parameters related to feature selection. +} +\description{ +Internal function for parsing settings related to feature selection +} +\references{ +\enumerate{ +\item Wald, R., Khoshgoftaar, T. M., Dittman, D., Awada, W. & +Napolitano, A. An extensive comparison of feature ranking aggregation +techniques in bioinformatics. in 2012 IEEE 13th International Conference on +Information Reuse Integration (IRI) 377–384 (2012). +\item Meinshausen, N. & Buehlmann, P. Stability selection. J. R. Stat. Soc. +Series B Stat. Methodol. 72, 417–473 (2010). +\item Haury, A.-C., Gestraud, P. & Vert, J.-P. The influence of feature +selection methods on accuracy, stability and interpretability of molecular +signatures. PLoS One 6, e28210 (2011). +} +} +\keyword{internal} diff --git a/man/dot-parse_file_paths.Rd b/man/dot-parse_file_paths.Rd new file mode 100644 index 00000000..582e8cf2 --- /dev/null +++ b/man/dot-parse_file_paths.Rd @@ -0,0 +1,72 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ParseSettings.R +\name{.parse_file_paths} +\alias{.parse_file_paths} +\title{Internal function for parsing file paths} +\usage{ +.parse_file_paths( + config = NULL, + project_dir = waiver(), + experiment_dir = waiver(), + data_file = waiver(), + verbose = TRUE, + ... +) +} +\arguments{ +\item{config}{A list of settings, e.g. from an xml file.} + +\item{project_dir}{(\emph{optional}) Path to the project directory. \code{familiar} +checks if the directory indicated by \code{experiment_dir} and data files in +\code{data_file} are relative to the \code{project_dir}.} + +\item{experiment_dir}{(\strong{recommended}) Path to the directory where all +intermediate and final results produced by \code{familiar} are written to. + +The \code{experiment_dir} can be a path relative to \code{project_dir} or an absolute +path. + +In case no project directory is provided and the experiment directory is +not on an absolute path, a directory will be created in the temporary R +directory indicated by \code{tempdir()}. This directory is deleted after closing +the R session or once data analysis has finished. All information will be +lost afterwards. Hence, it is recommended to provide either +\code{experiment_dir} as an absolute path, or provide both \code{project_dir} and +\code{experiment_dir}.} + +\item{data_file}{(\emph{optional}) Path to files containing data that should be +analysed. The paths can be relative to \code{project_dir} or absolute paths. An +error will be raised if the file cannot be found. + +The following types of data are supported. +\itemize{ +\item \code{csv} files containing column headers on the first row, and samples per +row. \code{csv} files are read using \code{data.table::fread}. +\item \code{rds} files that contain a \code{data.table} or \code{data.frame} object. \code{rds} +files are imported using \code{base::readRDS}. 
+\item \code{RData} files that contain a single \code{data.table} or \code{data.frame} object. +\code{RData} files are imported using \code{base::load}. +} + +All data are expected in wide format, with sample information organised +row-wise. + +More than one data file can be provided. \code{familiar} will try to combine +data files based on column names and identifier columns. + +Alternatively, data can be provided using the \code{data} argument. These data +are expected to be \code{data.frame} or \code{data.table} objects or paths to data +files. The latter are handled in the same way as file paths provided to +\code{data_file}.} + +\item{verbose}{Sets verbosity.} + +\item{...}{Unused arguments} +} +\value{ +List of paths to important directories and files. +} +\description{ +Internal function for parsing file paths +} +\keyword{internal} diff --git a/man/dot-parse_general_settings.Rd b/man/dot-parse_general_settings.Rd new file mode 100644 index 00000000..a3a44a6f --- /dev/null +++ b/man/dot-parse_general_settings.Rd @@ -0,0 +1,1188 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ParseSettings.R +\name{.parse_general_settings} +\alias{.parse_general_settings} +\title{Internal function for parsing settings that configure various aspects of the +worklow} +\usage{ +.parse_general_settings(settings, config = NULL, data, ...) +} +\arguments{ +\item{settings}{List of settings that was previously generated by +\code{.parse_initial_settings}.} + +\item{config}{A list of settings, e.g. from an xml file.} + +\item{data}{Data set as loaded using the \code{.load_data} function.} + +\item{...}{ + Arguments passed on to \code{\link[=.parse_setup_settings]{.parse_setup_settings}}, \code{\link[=.parse_preprocessing_settings]{.parse_preprocessing_settings}}, \code{\link[=.parse_feature_selection_settings]{.parse_feature_selection_settings}}, \code{\link[=.parse_model_development_settings]{.parse_model_development_settings}}, \code{\link[=.parse_hyperparameter_optimisation_settings]{.parse_hyperparameter_optimisation_settings}}, \code{\link[=.parse_evaluation_settings]{.parse_evaluation_settings}} + \describe{ + \item{\code{parallel}}{(\emph{optional}) Enable parallel processing. Defaults to \code{TRUE}. +When set to \code{FALSE}, this disables all parallel processing, regardless of +specific parameters such as \code{parallel_preprocessing}. However, when +\code{parallel} is \code{TRUE}, parallel processing of different parts of the +workflow can be disabled by setting respective flags to \code{FALSE}.} + \item{\code{parallel_nr_cores}}{(\emph{optional}) Number of cores available for +parallelisation. Defaults to 2. This setting does nothing if +parallelisation is disabled.} + \item{\code{restart_cluster}}{(\emph{optional}) Restart nodes used for parallel computing +to free up memory prior to starting a parallel process. Note that it does +take time to set up the clusters. Therefore setting this argument to \code{TRUE} +may impact processing speed. This argument is ignored if \code{parallel} is +\code{FALSE} or the cluster was initialised outside of familiar. Default is +\code{FALSE}, which causes the clusters to be initialised only once.} + \item{\code{cluster_type}}{(\emph{optional}) Selection of the cluster type for parallel +processing. Available types are the ones supported by the parallel package +that is part of the base R distribution: \code{psock} (default), \code{fork}, \code{mpi}, +\code{nws}, \code{sock}. 
In addition, \code{none} is available, which also disables
+parallel processing.}
+  \item{\code{backend_type}}{(\emph{optional}) Selection of the backend for distributing
+copies of the data. This backend ensures that only a single master copy is
+kept in memory. This limits memory usage during parallel processing.
+
+Several backend options are available, notably \code{socket_server}, and \code{none}
+(default). \code{socket_server} is based on the callr package and R sockets,
+comes with \code{familiar} and is available for any OS. \code{none} uses the package
+environment of familiar to store data, and is available for any OS.
+However, \code{none} requires copying of data to any parallel process, and has a
+larger memory footprint.}
+  \item{\code{server_port}}{(\emph{optional}) Integer indicating the port on which the
+socket server or RServe process should communicate. Defaults to port 6311.
+Note that ports 0 to 1024 and 49152 to 65535 cannot be used.}
+  \item{\code{feature_max_fraction_missing}}{(\emph{optional}) Numeric value between \code{0.0}
+and \code{0.95} that determines the maximum fraction of missing values that
+still allows a feature to be included in the data set. All features with a
+missing value fraction over this threshold are not processed further. The
+default value is \code{0.30}.}
+  \item{\code{sample_max_fraction_missing}}{(\emph{optional}) Numeric value between \code{0.0}
+and \code{0.95} that determines the maximum fraction of missing values that
+still allows a sample to be included in the data set. All samples with a
+missing value fraction over this threshold are excluded and not processed
+further. The default value is \code{0.30}.}
+  \item{\code{filter_method}}{(\emph{optional}) One or more methods used to reduce
+dimensionality of the data set by removing irrelevant or poorly
+reproducible features.
+
+Several methods are available:
+\itemize{
+\item \code{none} (default): None of the features will be filtered.
+\item \code{low_variance}: Features with a variance below the
+\code{low_var_minimum_variance_threshold} are filtered. This can be useful to
+filter, for example, genes that are not differentially expressed.
+\item \code{univariate_test}: Features undergo a univariate regression using an
+outcome-appropriate regression model. The p-value of the model coefficient
+is collected. Features with coefficient p or q-value above the
+\code{univariate_test_threshold} are subsequently filtered.
+\item \code{robustness}: Features that are not sufficiently robust according to the
+intraclass correlation coefficient are filtered. Use of this method
+requires that repeated measurements are present in the data set, i.e. there
+should be entries for which the sample and cohort identifiers are the same.
+}
+
+More than one method can be used simultaneously. Features with singular
+values are always filtered, as these do not contain information.}
+  \item{\code{univariate_test_threshold}}{(\emph{optional}) Numeric value between \code{1.0} and
+\code{0.0} that determines which features are irrelevant and will be filtered by
+the \code{univariate_test}. The p or q-values are compared to this threshold.
+All features with values above the threshold are filtered. The default
+value is \code{0.20}.}
+  \item{\code{univariate_test_threshold_metric}}{(\emph{optional}) Metric whose values are
+compared against the \code{univariate_test_threshold}. 
The following metrics can +be chosen: +\itemize{ +\item \code{p_value} (default): The unadjusted p-value of each feature is used for +to filter features. +\item \code{q_value}: The q-value (Story, 2002), is used to filter features. Some +data sets may have insufficient samples to compute the q-value. The +\code{qvalue} package must be installed from Bioconductor to use this method. +}} + \item{\code{univariate_test_max_feature_set_size}}{(\emph{optional}) Maximum size of the +feature set after the univariate test. P or q values of features are +compared against the threshold, but if the resulting data set would be +larger than this setting, only the most relevant features up to the desired +feature set size are selected. + +The default value is \code{NULL}, which causes features to be filtered based on +their relevance only.} + \item{\code{low_var_minimum_variance_threshold}}{(required, if used) Numeric value +that determines which features will be filtered by the \code{low_variance} +method. The variance of each feature is computed and compared to the +threshold. If it is below the threshold, the feature is removed. + +This parameter has no default value and should be set if \code{low_variance} is +used.} + \item{\code{low_var_max_feature_set_size}}{(\emph{optional}) Maximum size of the feature +set after filtering features with a low variance. All features are first +compared against \code{low_var_minimum_variance_threshold}. If the resulting +feature set would be larger than specified, only the most strongly varying +features will be selected, up to the desired size of the feature set. + +The default value is \code{NULL}, which causes features to be filtered based on +their variance only.} + \item{\code{robustness_icc_type}}{(\emph{optional}) String indicating the type of +intraclass correlation coefficient (\code{1}, \code{2} or \code{3}) that should be used to +compute robustness for features in repeated measurements. These types +correspond to the types in Shrout and Fleiss (1979). The default value is +\code{1}.} + \item{\code{robustness_threshold_metric}}{(\emph{optional}) String indicating which +specific intraclass correlation coefficient (ICC) metric should be used to +filter features. This should be one of: +\itemize{ +\item \code{icc}: The estimated ICC value itself. +\item \code{icc_low} (default): The estimated lower limit of the 95\% confidence +interval of the ICC, as suggested by Koo and Li (2016). +\item \code{icc_panel}: The estimated ICC value over the panel average, i.e. the ICC +that would be obtained if all repeated measurements were averaged. +\item \code{icc_panel_low}: The estimated lower limit of the 95\% confidence interval +of the panel ICC. +}} + \item{\code{robustness_threshold_value}}{(\emph{optional}) The intraclass correlation +coefficient value that is as threshold. The default value is \code{0.70}.} + \item{\code{transformation_method}}{(\emph{optional}) The transformation method used to +change the distribution of the data to be more normal-like. The following +methods are available: +\itemize{ +\item \code{none}: This disables transformation of features. +\item \code{yeo_johnson} (default): Transformation using the Yeo-Johnson +transformation (Yeo and Johnson, 2000). The algorithm tests various lambda +values and selects the lambda that maximises the log-likelihood. +\item \code{yeo_johnson_trim}: As \code{yeo_johnson}, but based on the set of feature +values where the 5\% lowest and 5\% highest values are discarded. 
This
+reduces the effect of outliers.
+\item \code{yeo_johnson_winsor}: As \code{yeo_johnson}, but based on the set of feature
+values where the 5\% lowest and 5\% highest values are winsorised. This
+reduces the effect of outliers.
+\item \code{yeo_johnson_robust}: A robust version of \code{yeo_johnson} after Raymaekers
+and Rousseeuw (2021). This method is less sensitive to outliers.
+\item \code{box_cox}: Transformation using the Box-Cox transformation (Box and Cox,
+1964). Unlike the Yeo-Johnson transformation, the Box-Cox transformation
+requires that all data are positive. Features that contain zero or negative
+values cannot be transformed using this transformation. The algorithm tests
+various lambda values and selects the lambda that maximises the
+log-likelihood.
+\item \code{box_cox_trim}: As \code{box_cox}, but based on the set of feature values
+where the 5\% lowest and 5\% highest values are discarded. This reduces the
+effect of outliers.
+\item \code{box_cox_winsor}: As \code{box_cox}, but based on the set of feature values
+where the 5\% lowest and 5\% highest values are winsorised. This reduces the
+effect of outliers.
+\item \code{box_cox_robust}: A robust version of \code{box_cox} after Raymaekers and
+Rousseeuw (2021). This method is less sensitive to outliers.
+}
+
+Only features that contain numerical data are transformed. Transformation
+parameters obtained in development data are stored within \code{featureInfo}
+objects for later use with validation data sets.}
+  \item{\code{normalisation_method}}{(\emph{optional}) The normalisation method used to
+improve the comparability between numerical features that may have very
+different scales. The following normalisation methods can be chosen:
+\itemize{
+\item \code{none}: This disables feature normalisation.
+\item \code{standardisation}: Features are normalised by subtraction of their mean
+values and division by their standard deviations. This causes every feature
+to have a center value of 0.0 and standard deviation of 1.0.
+\item \code{standardisation_trim}: As \code{standardisation}, but based on the set of
+feature values where the 5\% lowest and 5\% highest values are discarded.
+This reduces the effect of outliers.
+\item \code{standardisation_winsor}: As \code{standardisation}, but based on the set of
+feature values where the 5\% lowest and 5\% highest values are winsorised.
+This reduces the effect of outliers.
+\item \code{standardisation_robust} (default): A robust version of \code{standardisation}
+that relies on computing Huber's M-estimators for location and scale.
+\item \code{normalisation}: Features are normalised by subtraction of their minimum
+values and division by their ranges. This maps all feature values to a
+\eqn{[0, 1]} interval.
+\item \code{normalisation_trim}: As \code{normalisation}, but based on the set of feature
+values where the 5\% lowest and 5\% highest values are discarded. This
+reduces the effect of outliers.
+\item \code{normalisation_winsor}: As \code{normalisation}, but based on the set of
+feature values where the 5\% lowest and 5\% highest values are winsorised.
+This reduces the effect of outliers.
+\item \code{quantile}: Features are normalised by subtraction of their median values
+and division by their interquartile range.
+\item \code{mean_centering}: Features are centered by subtracting the mean, but do
+not undergo rescaling.
+}
+
+Only features that contain numerical data are normalised. 
Normalisation +parameters obtained in development data are stored within \code{featureInfo} +objects for later use with validation data sets.} + \item{\code{batch_normalisation_method}}{(\emph{optional}) The method used for batch +normalisation. Available methods are: +\itemize{ +\item \code{none} (default): This disables batch normalisation of features. +\item \code{standardisation}: Features within each batch are normalised by +subtraction of the mean value and division by the standard deviation in +each batch. +\item \code{standardisation_trim}: As \code{standardisation}, but based on the set of +feature values where the 5\% lowest and 5\% highest values are discarded. +This reduces the effect of outliers. +\item \code{standardisation_winsor}: As \code{standardisation}, but based on the set of +feature values where the 5\% lowest and 5\% highest values are winsorised. +This reduces the effect of outliers. +\item \code{standardisation_robust}: A robust version of \code{standardisation} that +relies on computing Huber's M-estimators for location and scale within each +batch. +\item \code{normalisation}: Features within each batch are normalised by subtraction +of their minimum values and division by their range in each batch. This +maps all feature values in each batch to a \eqn{[0, 1]} interval. +\item \code{normalisation_trim}: As \code{normalisation}, but based on the set of feature +values where the 5\% lowest and 5\% highest values are discarded. This +reduces the effect of outliers. +\item \code{normalisation_winsor}: As \code{normalisation}, but based on the set of +feature values where the 5\% lowest and 5\% highest values are winsorised. +This reduces the effect of outliers. +\item \code{quantile}: Features in each batch are normalised by subtraction of the +median value and division by the interquartile range of each batch. +\item \code{mean_centering}: Features in each batch are centered on 0.0 by +substracting the mean value in each batch, but are not rescaled. +\item \code{combat_parametric}: Batch adjustments using parametric empirical Bayes +(Johnson et al, 2007). \code{combat_p} leads to the same method. +\item \code{combat_non_parametric}: Batch adjustments using non-parametric empirical +Bayes (Johnson et al, 2007). \code{combat_np} and \code{combat} lead to the same +method. Note that we reduced complexity from O(\eqn{n^2}) to O(\eqn{n}) by +only computing batch adjustment parameters for each feature on a subset of +50 randomly selected features, instead of all features. +} + +Only features that contain numerical data are normalised using batch +normalisation. Batch normalisation parameters obtained in development data +are stored within \code{featureInfo} objects for later use with validation data +sets, in case the validation data is from the same batch. + +If validation data contains data from unknown batches, normalisation +parameters are separately determined for these batches. + +Note that for both empirical Bayes methods, the batch effect is assumed to +produce results across the features. This is often true for things such as +gene expressions, but the assumption may not hold generally. + +When performing batch normalisation, it is moreover important to check that +differences between batches or cohorts are not related to the studied +endpoint.} + \item{\code{imputation_method}}{(\emph{optional}) Method used for imputing missing +feature values. 
Two methods are implemented: +\itemize{ +\item \code{simple}: Simple replacement of a missing value by the median value (for +numeric features) or the modal value (for categorical features). +\item \code{lasso}: Imputation of missing value by lasso regression (using \code{glmnet}) +based on information contained in other features. +} + +\code{simple} imputation precedes \code{lasso} imputation to ensure that any missing +values in predictors required for \code{lasso} regression are resolved. The +\code{lasso} estimate is then used to replace the missing value. + +The default value depends on the number of features in the dataset. If the +number is lower than 100, \code{lasso} is used by default, and \code{simple} +otherwise. + +Only single imputation is performed. Imputation models and parameters are +stored within \code{featureInfo} objects for later use with validation data +sets.} + \item{\code{cluster_method}}{(\emph{optional}) Clustering is performed to identify and +replace redundant features, for example those that are highly correlated. +Such features do not carry much additional information and may be removed +or replaced instead (Park et al., 2007; Tolosi and Lengauer, 2011). + +The cluster method determines the algorithm used to form the clusters. The +following cluster methods are implemented: +\itemize{ +\item \code{none}: No clustering is performed. +\item \code{hclust} (default): Hierarchical agglomerative clustering. If the +\code{fastcluster} package is installed, \code{fastcluster::hclust} is used (Muellner +2013), otherwise \code{stats::hclust} is used. +\item \code{agnes}: Hierarchical clustering using agglomerative nesting (Kaufman and +Rousseeuw, 1990). This algorithm is similar to \code{hclust}, but uses the +\code{cluster::agnes} implementation. +\item \code{diana}: Divisive analysis hierarchical clustering. This method uses +divisive instead of agglomerative clustering (Kaufman and Rousseeuw, 1990). +\code{cluster::diana} is used. +\item \code{pam}: Partioning around medioids. This partitions the data into $k$ +clusters around medioids (Kaufman and Rousseeuw, 1990). $k$ is selected +using the \code{silhouette} metric. \code{pam} is implemented using the +\code{cluster::pam} function. +} + +Clusters and cluster information is stored within \code{featureInfo} objects for +later use with validation data sets. This enables reproduction of the same +clusters as formed in the development data set.} + \item{\code{cluster_linkage_method}}{(\emph{optional}) Linkage method used for +agglomerative clustering in \code{hclust} and \code{agnes}. The following linkage +methods can be used: +\itemize{ +\item \code{average} (default): Average linkage. +\item \code{single}: Single linkage. +\item \code{complete}: Complete linkage. +\item \code{weighted}: Weighted linkage, also known as McQuitty linkage. +\item \code{ward}: Linkage using Ward's minimum variance method. +} + +\code{diana} and \code{pam} do not require a linkage method.} + \item{\code{cluster_cut_method}}{(\emph{optional}) The method used to define the actual +clusters. The following methods can be used: +\itemize{ +\item \code{silhouette}: Clusters are formed based on the silhouette score +(Rousseeuw, 1987). The average silhouette score is computed from 2 to +\eqn{n} clusters, with \eqn{n} the number of features. Clusters are only +formed if the average silhouette exceeds 0.50, which indicates reasonable +evidence for structure. This procedure may be slow if the number of +features is large (>100s). 
+\item \code{fixed_cut}: Clusters are formed by cutting the hierarchical tree at the +point indicated by the \code{cluster_similarity_threshold}, e.g. where features +in a cluster have an average Spearman correlation of 0.90. \code{fixed_cut} is +only available for \code{agnes}, \code{diana} and \code{hclust}. +\item \code{dynamic_cut}: Dynamic cluster formation using the cutting algorithm in +the \code{dynamicTreeCut} package. This package should be installed to select +this option. \code{dynamic_cut} can only be used with \code{agnes} and \code{hclust}. +} + +The default options are \code{silhouette} for partioning around medioids (\code{pam}) +and \code{fixed_cut} otherwise.} + \item{\code{cluster_similarity_metric}}{(\emph{optional}) Clusters are formed based on +feature similarity. All features are compared in a pair-wise fashion to +compute similarity, for example correlation. The resulting similarity grid +is converted into a distance matrix that is subsequently used for +clustering. The following metrics are supported to compute pairwise +similarities: +\itemize{ +\item \code{mutual_information} (default): normalised mutual information. +\item \code{mcfadden_r2}: McFadden's pseudo R-squared (McFadden, 1974). +\item \code{cox_snell_r2}: Cox and Snell's pseudo R-squared (Cox and Snell, 1989). +\item \code{nagelkerke_r2}: Nagelkerke's pseudo R-squared (Nagelkerke, 1991). +\item \code{spearman}: Spearman's rank order correlation. +\item \code{kendall}: Kendall rank correlation. +\item \code{pearson}: Pearson product-moment correlation. +} + +The pseudo R-squared metrics can be used to assess similarity between mixed +pairs of numeric and categorical features, as these are based on the +log-likelihood of regression models. In \code{familiar}, the more informative +feature is used as the predictor and the other feature as the reponse +variable. In numeric-categorical pairs, the numeric feature is considered +to be more informative and is thus used as the predictor. In +categorical-categorical pairs, the feature with most levels is used as the +predictor. + +In case any of the classical correlation coefficients (\code{pearson}, +\code{spearman} and \code{kendall}) are used with (mixed) categorical features, the +categorical features are one-hot encoded and the mean correlation over all +resulting pairs is used as similarity.} + \item{\code{cluster_similarity_threshold}}{(\emph{optional}) The threshold level for +pair-wise similarity that is required to form clusters using \code{fixed_cut}. +This should be a numerical value between 0.0 and 1.0. Note however, that a +reasonable threshold value depends strongly on the similarity metric. The +following are the default values used: +\itemize{ +\item \code{mcfadden_r2} and \code{mutual_information}: \code{0.30} +\item \code{cox_snell_r2} and \code{nagelkerke_r2}: \code{0.75} +\item \code{spearman}, \code{kendall} and \code{pearson}: \code{0.90} +} + +Alternatively, if the \verb{fixed cut} method is not used, this value determines +whether any clustering should be performed, because the data may not +contain highly similar features. 
The default values in this situation are: +\itemize{ +\item \code{mcfadden_r2} and \code{mutual_information}: \code{0.25} +\item \code{cox_snell_r2} and \code{nagelkerke_r2}: \code{0.40} +\item \code{spearman}, \code{kendall} and \code{pearson}: \code{0.70} +} + +The threshold value is converted to a distance (1-similarity) prior to +cutting hierarchical trees.} + \item{\code{cluster_representation_method}}{(\emph{optional}) Method used to determine +how the information of co-clustered features is summarised and used to +represent the cluster. The following methods can be selected: +\itemize{ +\item \code{best_predictor} (default): The feature with the highest importance +according to univariate regression with the outcome is used to represent +the cluster. +\item \code{medioid}: The feature closest to the cluster center, i.e. the feature +that is most similar to the remaining features in the cluster, is used to +represent the feature. +\item \code{mean}: A meta-feature is generated by averaging the feature values for +all features in a cluster. This method aligns all features so that all +features will be positively correlated prior to averaging. Should a cluster +contain one or more categorical features, the \code{medioid} method will be used +instead, as averaging is not possible. Note that if this method is chosen, +the \code{normalisation_method} parameter should be one of \code{standardisation}, +\code{standardisation_trim}, \code{standardisation_winsor} or \code{quantile}.` +} + +If the \code{pam} cluster method is selected, only the \code{medioid} method can be +used. In that case 1 medioid is used by default.} + \item{\code{parallel_preprocessing}}{(\emph{optional}) Enable parallel processing for the +preprocessing workflow. Defaults to \code{TRUE}. When set to \code{FALSE}, this will +disable the use of parallel processing while preprocessing, regardless of +the settings of the \code{parallel} parameter. \code{parallel_preprocessing} is +ignored if \code{parallel=FALSE}.} + \item{\code{fs_method}}{(\strong{required}) Feature selection method to be used for +determining variable importance. \code{familiar} implements various feature +selection methods. Please refer to the vignette on feature selection +methods for more details. + +More than one feature selection method can be chosen. The experiment will +then repeated for each feature selection method. + +Feature selection methods determines the ranking of features. Actual +selection of features is done by optimising the signature size model +hyperparameter during the hyperparameter optimisation step.} + \item{\code{fs_method_parameter}}{(\emph{optional}) List of lists containing parameters +for feature selection methods. Each sublist should have the name of the +feature selection method it corresponds to. + +Most feature selection methods do not have parameters that can be set. +Please refer to the vignette on feature selection methods for more details. +Note that if the feature selection method is based on a learner (e.g. lasso +regression), hyperparameter optimisation may be performed prior to +assessing variable importance.} + \item{\code{vimp_aggregation_method}}{(\emph{optional}) The method used to aggregate +variable importances over different data subsets, e.g. bootstraps. The +following methods can be selected: +\itemize{ +\item \code{none}: Don't aggregate ranks, but rather aggregate the variable +importance scores themselves. 
+\item \code{mean}: Use the mean rank of a feature over the subsets to +determine the aggregated feature rank. +\item \code{median}: Use the median rank of a feature over the subsets to determine +the aggregated feature rank. +\item \code{best}: Use the best rank the feature obtained in any subset to determine +the aggregated feature rank. +\item \code{worst}: Use the worst rank the feature obtained in any subset to +determine the aggregated feature rank. +\item \code{stability}: Use the frequency of the feature being in the subset of +highly ranked features as measure for the aggregated feature rank +(Meinshausen and Buehlmann, 2010). +\item \code{exponential}: Use a rank-weighted frequence of occurrence in the subset +of highly ranked features as measure for the aggregated feature rank (Haury +et al., 2011). +\item \code{borda} (default): Use the borda count as measure for the aggregated +feature rank (Wald et al., 2012). +\item \code{enhanced_borda}: Use an occurrence frequency-weighted borda count as +measure for the aggregated feature rank (Wald et al., 2012). +\item \code{truncated_borda}: Use borda count computed only on features within the +subset of highly ranked features. +\item \code{enhanced_truncated_borda}: Apply both the enhanced borda method and the +truncated borda method and use the resulting borda count as the aggregated +feature rank. +} + +The \emph{feature selection methods} vignette provides additional information.} + \item{\code{vimp_aggregation_rank_threshold}}{(\emph{optional}) The threshold used to +define the subset of highly important features. If not set, this threshold +is determined by maximising the variance in the occurrence value over all +features over the subset size. + +This parameter is only relevant for \code{stability}, \code{exponential}, +\code{enhanced_borda}, \code{truncated_borda} and \code{enhanced_truncated_borda} methods.} + \item{\code{parallel_feature_selection}}{(\emph{optional}) Enable parallel processing for +the feature selection workflow. Defaults to \code{TRUE}. When set to \code{FALSE}, +this will disable the use of parallel processing while performing feature +selection, regardless of the settings of the \code{parallel} parameter. +\code{parallel_feature_selection} is ignored if \code{parallel=FALSE}.} + \item{\code{learner}}{(\strong{required}) One or more algorithms used for model +development. A sizeable number learners is supported in \code{familiar}. Please +see the vignette on learners for more information concerning the available +learners.} + \item{\code{hyperparameter}}{(\emph{optional}) List of lists containing hyperparameters +for learners. Each sublist should have the name of the learner method it +corresponds to, with list elements being named after the intended +hyperparameter, e.g. \code{"glm_logistic"=list("sign_size"=3)} + +All learners have hyperparameters. Please refer to the vignette on learners +for more details. If no parameters are provided, sequential model-based +optimisation is used to determine optimal hyperparameters. + +Hyperparameters provided by the user are never optimised. However, if more +than one value is provided for a single hyperparameter, optimisation will +be conducted using these values.} + \item{\code{novelty_detector}}{(\emph{optional}) Specify the algorithm used for training +a novelty detector. This detector can be used to identify +out-of-distribution data prospectively.} + \item{\code{detector_parameters}}{(\emph{optional}) List lists containing hyperparameters +for novelty detectors. 
Currently not used.} + \item{\code{parallel_model_development}}{(\emph{optional}) Enable parallel processing for +the model development workflow. Defaults to \code{TRUE}. When set to \code{FALSE}, +this will disable the use of parallel processing while developing models, +regardless of the settings of the \code{parallel} parameter. +\code{parallel_model_development} is ignored if \code{parallel=FALSE}.} + \item{\code{optimisation_bootstraps}}{(\emph{optional}) Number of bootstraps that should +be generated from the development data set. During the optimisation +procedure one or more of these bootstraps (indicated by +\code{smbo_step_bootstraps}) are used for model development using different +combinations of hyperparameters. The effect of the hyperparameters is then +assessed by comparing in-bag and out-of-bag model performance. + +The default number of bootstraps is \code{50}. Hyperparameter optimisation may +finish before exhausting the set of bootstraps.} + \item{\code{optimisation_determine_vimp}}{(\emph{optional}) Logical value that indicates +whether variable importance is determined separately for each of the +bootstraps created during the optimisation process (\code{TRUE}) or the +applicable results from the feature selection step are used (\code{FALSE}). + +Determining variable importance increases the initial computational +overhead. However, it prevents positive biases for the out-of-bag data due +to overlap of these data with the development data set used for the feature +selection step. In this case, any hyperparameters of the variable +importance method are not determined separately for each bootstrap, but +those obtained during the feature selection step are used instead. In case +multiple of such hyperparameter sets could be applicable, the set that will +be used is randomly selected for each bootstrap. + +This parameter only affects hyperparameter optimisation of learners. The +default is \code{TRUE}.} + \item{\code{smbo_random_initialisation}}{(\emph{optional}) String indicating the +initialisation method for the hyperparameter space. Can be one of +\code{fixed_subsample} (default), \code{fixed}, or \code{random}. \code{fixed} and +\code{fixed_subsample} first create hyperparameter sets from a range of default +values set by familiar. \code{fixed_subsample} then randomly draws up to +\code{smbo_n_random_sets} from the grid. \code{random} does not rely upon a fixed +grid, and randomly draws up to \code{smbo_n_random_sets} hyperparameter sets +from the hyperparameter space.} + \item{\code{smbo_n_random_sets}}{(\emph{optional}) Number of random or subsampled +hyperparameters drawn during the initialisation process. Default: \code{100}. +Cannot be smaller than \code{10}. The parameter is not used when +\code{smbo_random_initialisation} is \code{fixed}, as the entire pre-defined grid +will be explored.} + \item{\code{max_smbo_iterations}}{(\emph{optional}) Maximum number of intensify +iterations of the SMBO algorithm. During an intensify iteration a run-off +occurs between the current \emph{best} hyperparameter combination and either 10 +challenger combination with the highest expected improvement or a set of 20 +random combinations. + +Run-off with random combinations is used to force exploration of the +hyperparameter space, and is performed every second intensify iteration, or +if there is no expected improvement for any challenger combination. 
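+
+For illustration, a minimal sketch of how the optimisation budget could be
+constrained and hyperparameter values pre-specified through
+\code{summon_familiar}, which passes these arguments on. The data set is a toy
+example, \code{mrmr} merely stands in for any available feature selection
+method, and the chosen values are illustrative only:
+\preformatted{
+  # Toy two-class data set derived from iris.
+  df <- iris[iris$Species != "virginica", ]
+  df$Species <- droplevels(df$Species)
+
+  # sign_size is restricted to the listed candidate values; user-provided
+  # hyperparameters are not optimised beyond these candidates.
+  familiar::summon_familiar(
+    data = df,
+    outcome_column = "Species",
+    outcome_type = "binomial",
+    experimental_design = "fs+mb",
+    fs_method = "mrmr",
+    learner = "glm_logistic",
+    hyperparameter = list("glm_logistic" = list("sign_size" = c(2, 3, 4))),
+    optimisation_bootstraps = 25,
+    smbo_step_bootstraps = 3,
+    max_smbo_iterations = 10,
+    optimisation_metric = "auc_roc",
+    optimisation_function = "max_validation"
+  )
+}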
+ +If a combination of hyperparameters leads to better performance on the same +data than the incumbent \emph{best} set of hyperparameters, it replaces the +incumbent set at the end of the intensify iteration. + +The default number of intensify iteration is \code{20}. Iterations may be +stopped early if the incumbent set of hyperparameters remains the same for +\code{smbo_stop_convergent_iterations} iterations, or performance improvement is +minimal. This behaviour is suppressed during the first 4 iterations to +enable the algorithm to explore the hyperparameter space.} + \item{\code{smbo_stop_convergent_iterations}}{(\emph{optional}) The number of subsequent +convergent SMBO iterations required to stop hyperparameter optimisation +early. An iteration is convergent if the \emph{best} parameter set has not +changed or the optimisation score over the 4 most recent iterations has not +changed beyond the tolerance level in \code{smbo_stop_tolerance}. + +The default value is \code{3}.} + \item{\code{smbo_stop_tolerance}}{(\emph{optional}) Tolerance for early stopping due to +convergent optimisation score. + +The default value depends on the square root of the number of samples (at +the series level), and is \code{0.01} for 100 samples. This value is computed as +\code{0.1 * 1 / sqrt(n_samples)}. The upper limit is \code{0.0001} for 1M or more +samples.} + \item{\code{smbo_time_limit}}{(\emph{optional}) Time limit (in minutes) for the +optimisation process. Optimisation is stopped after this limit is exceeded. +Time taken to determine variable importance for the optimisation process +(see the \code{optimisation_determine_vimp} parameter) does not count. + +The default is \code{NULL}, indicating that there is no time limit for the +optimisation process. The time limit cannot be less than 1 minute.} + \item{\code{smbo_initial_bootstraps}}{(\emph{optional}) The number of bootstraps taken +from the set of \code{optimisation_bootstraps} as the bootstraps assessed +initially. + +The default value is \code{1}. The value cannot be larger than +\code{optimisation_bootstraps}.} + \item{\code{smbo_step_bootstraps}}{(\emph{optional}) The number of bootstraps taken from +the set of \code{optimisation_bootstraps} bootstraps as the bootstraps assessed +during the steps of each intensify iteration. + +The default value is \code{3}. The value cannot be larger than +\code{optimisation_bootstraps}.} + \item{\code{smbo_intensify_steps}}{(\emph{optional}) The number of steps in each SMBO +intensify iteration. Each step a new set of \code{smbo_step_bootstraps} +bootstraps is drawn and used in the run-off between the incumbent \emph{best} +hyperparameter combination and its challengers. + +The default value is \code{5}. Higher numbers allow for a more detailed +comparison, but this comes with added computational cost.} + \item{\code{optimisation_metric}}{(\emph{optional}) One or more metrics used to compute +performance scores. See the vignette on performance metrics for the +available metrics. + +If unset, the following metrics are used by default: +\itemize{ +\item \code{auc_roc}: For \code{binomial} and \code{multinomial} models. +\item \code{mse}: Mean squared error for \code{continuous} models. +\item \code{msle}: Mean squared logarithmic error for \code{count} models. +\item \code{concordance_index}: For \code{survival} models. +} + +Multiple optimisation metrics can be specified. 
Actual metric values are
+converted to an objective value by comparison with a baseline metric value
+that derives from a trivial model, i.e. majority class for binomial and
+multinomial outcomes, the median outcome for count and continuous outcomes
+and a fixed risk or time for survival outcomes.}
+  \item{\code{optimisation_function}}{(\emph{optional}) Type of optimisation function used
+to quantify the performance of a hyperparameter set. Model performance is
+assessed using the metric(s) specified by \code{optimisation_metric} on the
+in-bag (IB) and out-of-bag (OOB) samples of a bootstrap. These values are
+converted to objective scores with a standardised interval of \eqn{[-1.0,
+ 1.0]}. Each pair of objective scores is subsequently used to compute an
+optimisation score. The optimisation score across different bootstraps is
+then aggregated to a summary score. This summary score is used to rank
+hyperparameter sets, and select the optimal set.
+
+The combination of optimisation score and summary score is determined by
+the optimisation function indicated by this parameter:
+\itemize{
+\item \code{validation} or \code{max_validation} (default): seeks to maximise OOB score.
+\item \code{balanced}: seeks to balance IB and OOB score.
+\item \code{stronger_balance}: similar to \code{balanced}, but with stronger penalty for
+differences between IB and OOB scores.
+\item \code{validation_minus_sd}: seeks to optimise the average OOB score minus its
+standard deviation.
+\item \code{validation_25th_percentile}: seeks to optimise the 25th percentile of
+OOB scores, and is conceptually similar to \code{validation_minus_sd}.
+\item \code{model_estimate}: seeks to maximise the OOB score estimate predicted by
+the hyperparameter learner (not available for random search).
+\item \code{model_estimate_minus_sd}: seeks to maximise the OOB score estimate minus
+its estimated standard deviation, as predicted by the hyperparameter
+learner (not available for random search).
+\item \code{model_balanced_estimate}: seeks to maximise the estimate of the balanced
+IB and OOB score. This is similar to the \code{balanced} score, and in fact uses
+a hyperparameter learner to predict said score (not available for random
+search).
+\item \code{model_balanced_estimate_minus_sd}: seeks to maximise the estimate of the
+balanced IB and OOB score, minus its estimated standard deviation. This is
+similar to the \code{balanced} score, but takes into account its estimated
+spread.
+}
+
+Additional details are provided in the \emph{Learning algorithms and
+hyperparameter optimisation} vignette.}
+  \item{\code{hyperparameter_learner}}{(\emph{optional}) Any point in the hyperparameter
+space has a single, scalar, optimisation score value that is \emph{a priori}
+unknown. During the optimisation process, the algorithm samples from the
+hyperparameter space by selecting hyperparameter sets and computing the
+optimisation score value for one or more bootstraps. For each
+hyperparameter set the resulting values are distributed around the actual
+value. The learner indicated by \code{hyperparameter_learner} is then used to
+infer optimisation score estimates for unsampled parts of the
+hyperparameter space.
+
+The following models are available:
+\itemize{
+\item \code{bayesian_additive_regression_trees} or \code{bart}: Uses Bayesian Additive
+Regression Trees (Sparapani et al., 2021) for inference. Unlike standard
+random forests, BART allows for estimating posterior distributions directly
+and can extrapolate. 
+\item \code{gaussian_process} (default): Creates a localised approximate Gaussian +process for inference (Gramacy, 2016). This allows for better scaling than +deterministic Gaussian Processes. +\item \code{random_forest}: Creates a random forest for inference. Originally +suggested by Hutter et al. (2011). A weakness of random forests is their +lack of extrapolation beyond observed values, which limits their usefulness +in exploiting promising areas of hyperparameter space. +\item \code{random} or \code{random_search}: Forgoes the use of models to steer +optimisation. Instead, a random search is performed. +}} + \item{\code{acquisition_function}}{(\emph{optional}) The acquisition function influences +how new hyperparameter sets are selected. The algorithm uses the model +learned by the learner indicated by \code{hyperparameter_learner} to search the +hyperparameter space for hyperparameter sets that are either likely better +than the best known set (\emph{exploitation}) or where there is considerable +uncertainty (\emph{exploration}). The acquisition function quantifies this +(Shahriari et al., 2016). + +The following acquisition functions are available, and are described in +more detail in the \emph{learner algorithms} vignette: +\itemize{ +\item \code{improvement_probability}: The probability of improvement quantifies the +probability that the expected optimisation score for a set is better than +the best observed optimisation score +\item \code{improvement_empirical_probability}: Similar to +\code{improvement_probability}, but based directly on optimisation scores +predicted by the individual decision trees. +\item \code{expected_improvement} (default): Computes expected improvement. +\item \code{upper_confidence_bound}: This acquisition function is based on the upper +confidence bound of the distribution (Srinivas et al., 2012). +\item \code{bayes_upper_confidence_bound}: This acquisition function is based on the +upper confidence bound of the distribution (Kaufmann et al., 2012). +}} + \item{\code{exploration_method}}{(\emph{optional}) Method used to steer exploration in +post-initialisation intensive searching steps. As stated earlier, each SMBO +iteration step compares suggested alternative parameter sets with an +incumbent \strong{best} set in a series of steps. The exploration method +controls how the set of alternative parameter sets is pruned after each +step in an iteration. Can be one of the following: +\itemize{ +\item \code{single_shot} (default): The set of alternative parameter sets is not +pruned, and each intensification iteration contains only a single +intensification step that only uses a single bootstrap. This is the fastest +exploration method, but only superficially tests each parameter set. +\item \code{successive_halving}: The set of alternative parameter sets is +pruned by removing the worst performing half of the sets after each step +(Jamieson and Talwalkar, 2016). +\item \code{stochastic_reject}: The set of alternative parameter sets is pruned by +comparing the performance of each parameter set with that of the incumbent +\strong{best} parameter set using a paired Wilcoxon test based on shared +bootstraps. Parameter sets that perform significantly worse, at an alpha +level indicated by \code{smbo_stochastic_reject_p_value}, are pruned. +\item \code{none}: The set of alternative parameter sets is not pruned. +}} + \item{\code{smbo_stochastic_reject_p_value}}{(\emph{optional}) The p-value threshold used +for the \code{stochastic_reject} exploration method. 
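+
+For illustration, a minimal sketch of how the model-based search could be
+steered through \code{summon_familiar}, which passes these arguments on. The
+data set is a toy example, \code{mrmr} merely stands in for any available
+feature selection method, and the chosen values are illustrative only:
+\preformatted{
+  # Toy two-class data set derived from iris.
+  df <- iris[iris$Species != "virginica", ]
+  df$Species <- droplevels(df$Species)
+
+  # Use a random forest surrogate with expected improvement, and prune
+  # challenger sets by stochastic rejection at a stricter alpha of 0.01.
+  familiar::summon_familiar(
+    data = df,
+    outcome_column = "Species",
+    outcome_type = "binomial",
+    experimental_design = "fs+mb",
+    fs_method = "mrmr",
+    learner = "glm_logistic",
+    hyperparameter_learner = "random_forest",
+    acquisition_function = "expected_improvement",
+    exploration_method = "stochastic_reject",
+    smbo_stochastic_reject_p_value = 0.01
+  )
+}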
+ +The default value is \code{0.05}.} + \item{\code{parallel_hyperparameter_optimisation}}{(\emph{optional}) Enable parallel +processing for hyperparameter optimisation. Defaults to \code{TRUE}. When set to +\code{FALSE}, this will disable the use of parallel processing while performing +optimisation, regardless of the settings of the \code{parallel} parameter. The +parameter moreover specifies whether parallelisation takes place within the +optimisation algorithm (\code{inner}, default), or in an outer loop ( \code{outer}) +over learners, data subsamples, etc. + +\code{parallel_hyperparameter_optimisation} is ignored if \code{parallel=FALSE}.} + \item{\code{evaluate_top_level_only}}{(\emph{optional}) Flag that signals that only +evaluation at the most global experiment level is required. Consider a +cross-validation experiment with additional external validation. The global +experiment level consists of data that are used for development, internal +validation and external validation. The next lower experiment level are the +individual cross-validation iterations. + +When the flag is \code{true}, evaluations take place on the global level only, +and no results are generated for the next lower experiment levels. In our +example, this means that results from individual cross-validation iterations +are not computed and shown. When the flag is \code{false}, results are computed +from both the global layer and the next lower level. + +Setting the flag to \code{true} saves computation time.} + \item{\code{skip_evaluation_elements}}{(\emph{optional}) Specifies which evaluation steps, +if any, should be skipped as part of the evaluation process. Defaults to +\code{none}, which means that all relevant evaluation steps are performed. It can +have one or more of the following values: +\itemize{ +\item \code{none}, \code{false}: no steps are skipped. +\item \code{all}, \code{true}: all steps are skipped. +\item \code{auc_data}: data for assessing and plotting the area under the receiver +operating characteristic curve are not computed. +\item \code{calibration_data}: data for assessing and plotting model calibration are +not computed. +\item \code{calibration_info}: data required to assess calibration, such as baseline +survival curves, are not collected. These data will still be present in the +models. +\item \code{confusion_matrix}: data for assessing and plotting a confusion matrix are +not collected. +\item \code{decision_curve_analyis}: data for performing a decision curve analysis +are not computed. +\item \code{feature_expressions}: data for assessing and plotting sample clustering +are not computed. +\item \code{feature_similarity}: data for assessing and plotting feature clusters are +not computed. +\item \code{fs_vimp}: data for assessing and plotting feature selection-based +variable importance are not collected. +\item \code{hyperparameters}: data for assessing model hyperparameters are not +collected. These data will still be present in the models. +\item \code{ice_data}: data for individual conditional expectation and partial +dependence plots are not created. +\item \code{model_performance}: data for assessing and visualising model performance +are not created. +\item \code{model_vimp}: data for assessing and plotting model-based variable +importance are not collected. +\item \code{permutation_vimp}: data for assessing and plotting model-agnostic +permutation variable importance are not computed. +\item \code{prediction_data}: predictions for each sample are not made and exported. 
+\item \code{risk_stratification_data}: data for assessing and plotting Kaplan-Meier +survival curves are not collected. +\item \code{risk_stratification_info}: data for assessing stratification into risk +groups are not computed. +\item \code{univariate_analysis}: data for assessing and plotting univariate feature +importance are not computed. +}} + \item{\code{ensemble_method}}{(\emph{optional}) Method for ensembling predictions from +models for the same sample. Available methods are: +\itemize{ +\item \code{median} (default): Use the median of the predicted values as the ensemble +value for a sample. +\item \code{mean}: Use the mean of the predicted values as the ensemble value for a +sample. +} + +This parameter is only used if \code{detail_level} is \code{ensemble}.} + \item{\code{evaluation_metric}}{(\emph{optional}) One or more metrics for assessing model +performance. See the vignette on performance metrics for the available +metrics. + +Confidence intervals (or rather credibility intervals) are computed for each +metric during evaluation. This is done using bootstraps, the number of which +depends on the value of \code{confidence_level} (Davison and Hinkley, 1997). + +If unset, the metric in the \code{optimisation_metric} variable is used.} + \item{\code{sample_limit}}{(\emph{optional}) Set the upper limit of the number of samples +that are used during evaluation steps. Cannot be less than 20. + +This setting can be specified per data element by providing a parameter +value in a named list with data elements, e.g. +\code{list("sample_similarity"=100, "permutation_vimp"=1000)}. + +This parameter can be set for the following data elements: +\code{sample_similarity} and \code{ice_data}.} + \item{\code{detail_level}}{(\emph{optional}) Sets the level at which results are computed +and aggregated. +\itemize{ +\item \code{ensemble}: Results are computed at the ensemble level, i.e. over all +models in the ensemble. This means that, for example, bias-corrected +estimates of model performance are assessed by creating (at least) 20 +bootstraps and computing the model performance of the ensemble model for +each bootstrap. +\item \code{hybrid} (default): Results are computed at the level of models in an +ensemble. This means that, for example, bias-corrected estimates of model +performance are directly computed using the models in the ensemble. If there +are at least 20 trained models in the ensemble, performance is computed for +each model, in contrast to \code{ensemble} where performance is computed for the +ensemble of models. If there are less than 20 trained models in the +ensemble, bootstraps are created so that at least 20 point estimates can be +made. +\item \code{model}: Results are computed at the model level. This means that, for +example, bias-corrected estimates of model performance are assessed by +creating (at least) 20 bootstraps and computing the performance of the model +for each bootstrap. +} + +Note that each level of detail has a different interpretation for bootstrap +confidence intervals. For \code{ensemble} and \code{model} these are the confidence +intervals for the ensemble and an individual model, respectively. That is, +the confidence interval describes the range where an estimate produced by a +respective ensemble or model trained on a repeat of the experiment may be +found with the probability of the confidence level. 
For \code{hybrid}, it
+represents the range where any single model trained on a repeat of the
+experiment may be found with the probability of the confidence level. By
+definition, confidence intervals obtained using \code{hybrid} are at least as
+wide as those for \code{ensemble}. \code{hybrid} offers the correct interpretation if
+the goal of the analysis is to assess the result of a single, unspecified,
+model.
+
+\code{hybrid} is generally computationally less expensive than \code{ensemble}, which
+in turn is somewhat less expensive than \code{model}.
+
+A non-default \code{detail_level} parameter can be specified for separate
+evaluation steps by providing a parameter value in a named list with data
+elements, e.g. \code{list("auc_data"="ensemble", "model_performance"="hybrid")}.
+This parameter can be set for the following data elements: \code{auc_data},
+\code{decision_curve_analyis}, \code{model_performance}, \code{permutation_vimp},
+\code{ice_data}, \code{prediction_data} and \code{confusion_matrix}.}
+ \item{\code{estimation_type}}{(\emph{optional}) Sets the type of estimation that should be
+possible. This has the following options:
+\itemize{
+\item \code{point}: Point estimates.
+\item \code{bias_correction} or \code{bc}: Bias-corrected estimates. A bias-corrected
+estimate is computed from (at least) 20 point estimates, and \code{familiar} may
+bootstrap the data to create them.
+\item \code{bootstrap_confidence_interval} or \code{bci} (default): Bias-corrected
+estimates with bootstrap confidence intervals (Efron and Hastie, 2016). The
+number of point estimates required depends on the \code{confidence_level}
+parameter, and \code{familiar} may bootstrap the data to create them.
+}
+
+As with \code{detail_level}, a non-default \code{estimation_type} parameter can be
+specified for separate evaluation steps by providing a parameter value in a
+named list with data elements, e.g. \code{list("auc_data"="bci", "model_performance"="point")}. This parameter can be set for the following
+data elements: \code{auc_data}, \code{decision_curve_analyis}, \code{model_performance},
+\code{permutation_vimp}, \code{ice_data}, and \code{prediction_data}.}
+ \item{\code{aggregate_results}}{(\emph{optional}) Flag that signifies whether results
+should be aggregated during evaluation. If \code{estimation_type} is
+\code{bias_correction} or \code{bc}, aggregation leads to a single bias-corrected
+estimate. If \code{estimation_type} is \code{bootstrap_confidence_interval} or \code{bci},
+aggregation leads to a single bias-corrected estimate with lower and upper
+boundaries of the confidence interval. This has no effect if
+\code{estimation_type} is \code{point}.
+
+The default value is equal to \code{TRUE} except when assessing metrics for
+model performance, as the default violin plot requires underlying data.
+
+As with \code{detail_level} and \code{estimation_type}, a non-default
+\code{aggregate_results} parameter can be specified for separate evaluation steps
+by providing a parameter value in a named list with data elements, e.g.
+\code{list("auc_data"=TRUE, "model_performance"=FALSE)}. This parameter exists
+for the same elements as \code{estimation_type}.}
+ \item{\code{confidence_level}}{(\emph{optional}) Numeric value for the level at which
+confidence intervals are determined.
When bootstraps are used to
+estimate the confidence intervals, \code{familiar} uses the
+rule of thumb \eqn{n = 20 / ci.level} to determine the number of required
+bootstraps.
+
+The default value is \code{0.95}.}
+ \item{\code{bootstrap_ci_method}}{(\emph{optional}) Method used to determine bootstrap
+confidence intervals (Efron and Hastie, 2016). The following methods are
+implemented:
+\itemize{
+\item \code{percentile} (default): Confidence intervals obtained using the percentile
+method.
+\item \code{bc}: Bias-corrected confidence intervals.
+}
+
+Note that the standard method is not implemented because this method is
+often not suitable due to non-normal distributions. The bias-corrected and
+accelerated (BCa) method is not implemented yet.}
+ \item{\code{feature_cluster_method}}{(\emph{optional}) Method used to perform clustering
+of features. The same methods as for the \code{cluster_method} configuration
+parameter are available: \code{none}, \code{hclust}, \code{agnes}, \code{diana} and \code{pam}.
+
+The value for the \code{cluster_method} configuration parameter is used by
+default. When generating clusters for the purpose of determining mutual
+correlation and ordering feature expressions, \code{none} is ignored and \code{hclust}
+is used instead.}
+ \item{\code{feature_linkage_method}}{(\emph{optional}) Method used for agglomerative
+clustering with \code{hclust} and \code{agnes}. Linkage determines how features are
+sequentially combined into clusters based on distance. The methods are
+shared with the \code{cluster_linkage_method} configuration parameter: \code{average},
+\code{single}, \code{complete}, \code{weighted}, and \code{ward}.
+
+The value for the \code{cluster_linkage_method} configuration parameter is used
+by default.}
+ \item{\code{feature_cluster_cut_method}}{(\emph{optional}) Method used to divide features
+into separate clusters. The available methods are the same as for the
+\code{cluster_cut_method} configuration parameter: \code{silhouette}, \code{fixed_cut} and
+\code{dynamic_cut}.
+
+\code{silhouette} is available for all cluster methods, but \code{fixed_cut} only
+applies to methods that create hierarchical trees (\code{hclust}, \code{agnes} and
+\code{diana}). \code{dynamic_cut} requires the \code{dynamicTreeCut} package and can only
+be used with \code{agnes} and \code{hclust}.
+
+The value for the \code{cluster_cut_method} configuration parameter is used by
+default.}
+ \item{\code{feature_similarity_metric}}{(\emph{optional}) Metric to determine pairwise
+similarity between features. Similarity is computed in the same manner as
+for clustering, and \code{feature_similarity_metric} therefore has the same
+options as \code{cluster_similarity_metric}: \code{mcfadden_r2}, \code{cox_snell_r2},
+\code{nagelkerke_r2}, \code{mutual_information}, \code{spearman}, \code{kendall} and \code{pearson}.
+
+The value for the \code{cluster_similarity_metric} configuration parameter is
+used by default.}
+ \item{\code{feature_similarity_threshold}}{(\emph{optional}) The threshold level for
+pair-wise similarity that is required to form feature clusters with the
+\code{fixed_cut} method. This threshold functions in the same manner as the one
+defined using the \code{cluster_similarity_threshold} parameter.
+
+By default, the value for the \code{cluster_similarity_threshold} configuration
+parameter is used.
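As a rough illustration of how a fixed similarity threshold translates into feature clusters, the sketch below uses base R with Spearman correlation, average linkage and an example cut-off of 0.7. These choices are examples only and the code does not reproduce familiar's internal implementation.

# Sketch: cluster features on pairwise Spearman similarity and apply a fixed cut.
set.seed(1)
x <- matrix(rnorm(100 * 5), ncol = 5,
            dimnames = list(NULL, paste0("feature_", 1:5)))
x[, 2] <- x[, 1] + rnorm(100, sd = 0.2)  # feature_2 strongly correlates with feature_1

similarity <- abs(stats::cor(x, method = "spearman"))
distance <- stats::as.dist(1 - similarity)
tree <- stats::hclust(distance, method = "average")
# Cutting at height 1 - 0.7 groups features whose average-linkage distance is
# below 0.3, i.e. whose similarity is above roughly 0.7.
clusters <- stats::cutree(tree, h = 1 - 0.7)
split(colnames(x), clusters)
# feature_1 and feature_2 form one cluster; the remaining features stay singletons.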
+ +Unlike for \code{cluster_similarity_threshold}, more than one value can be +supplied here.} + \item{\code{sample_cluster_method}}{(\emph{optional}) The method used to perform +clustering based on distance between samples. These are the same methods as +for the \code{cluster_method} configuration parameter: \code{hclust}, \code{agnes}, \code{diana} +and \code{pam}. + +The value for the \code{cluster_method} configuration parameter is used by +default. When generating clusters for the purpose of ordering samples in +feature expressions, \code{none} is ignored and \code{hclust} is used instead.} + \item{\code{sample_linkage_method}}{(\emph{optional}) The method used for agglomerative +clustering in \code{hclust} and \code{agnes}. These are the same methods as for the +\code{cluster_linkage_method} configuration parameter: \code{average}, \code{single}, +\code{complete}, \code{weighted}, and \code{ward}. + +The value for the \code{cluster_linkage_method} configuration parameters is used +by default.} + \item{\code{sample_similarity_metric}}{(\emph{optional}) Metric to determine pairwise +similarity between samples. Similarity is computed in the same manner as for +clustering, but \code{sample_similarity_metric} has different options that are +better suited to computing distance between samples instead of between +features. The following metrics are available. +\itemize{ +\item \code{gower} (default): compute Gower's distance between samples. By default, +Gower's distance is computed based on winsorised data to reduce the effect +of outliers (see below). +\item \code{euclidean}: compute the Euclidean distance between samples. +} + +The underlying feature data for numerical features is scaled to the +\eqn{[0,1]} range using the feature values across the samples. The +normalisation parameters required can optionally be computed from feature +data with the outer 5\% (on both sides) of feature values trimmed or +winsorised. To do so append \verb{_trim} (trimming) or \verb{_winsor} (winsorising) to +the metric name. This reduces the effect of outliers somewhat. + +Regardless of metric, all categorical features are handled as for the +Gower's distance: distance is 0 if the values in a pair of samples match, +and 1 if they do not.} + \item{\code{eval_aggregation_method}}{(\emph{optional}) Method for aggregating variable +importances for the purpose of evaluation. Variable importances are +determined during feature selection steps and after training the model. Both +types are evaluated, but feature selection variable importance is only +evaluated at run-time. + +See the documentation for the \code{vimp_aggregation_method} argument for +information concerning the different methods available.} + \item{\code{eval_aggregation_rank_threshold}}{(\emph{optional}) The threshold used to +define the subset of highly important features during evaluation. + +See the documentation for the \code{vimp_aggregation_rank_threshold} argument for +more information.} + \item{\code{eval_icc_type}}{(\emph{optional}) String indicating the type of intraclass +correlation coefficient (\code{1}, \code{2} or \code{3}) that should be used to compute +robustness for features in repeated measurements during the evaluation of +univariate importance. These types correspond to the types in Shrout and +Fleiss (1979). The default value is \code{1}.} + \item{\code{stratification_method}}{(\emph{optional}) Method for determining the +stratification threshold for creating survival groups. 
The actual,
+model-dependent, threshold value is obtained from the development data, and
+can afterwards be used to perform stratification on validation data.
+
+The following stratification methods are available:
+\itemize{
+\item \code{median} (default): The median predicted value in the development cohort
+is used to stratify the samples into two risk groups. For predicted outcome
+values that form a continuous spectrum, the two risk groups in the
+development cohort will be roughly equal in size.
+\item \code{mean}: The mean predicted value in the development cohort is used to
+stratify the samples into two risk groups.
+\item \code{mean_trim}: As \code{mean}, but based on the set of predicted values
+where the 5\% lowest and 5\% highest values are discarded. This reduces the
+effect of outliers.
+\item \code{mean_winsor}: As \code{mean}, but based on the set of predicted values where
+the 5\% lowest and 5\% highest values are winsorised. This reduces the effect
+of outliers.
+\item \code{fixed}: Samples are stratified based on the sample quantiles of the
+predicted values. These quantiles are defined using the
+\code{stratification_threshold} parameter.
+\item \code{optimised}: Use maximally selected rank statistics to determine the
+optimal threshold (Lausen and Schumacher, 1992; Hothorn et al., 2003) to
+stratify samples into two optimally separated risk groups.
+}
+
+One or more stratification methods can be selected simultaneously.
+
+This parameter is only relevant for \code{survival} outcomes.}
+ \item{\code{stratification_threshold}}{(\emph{optional}) Numeric value(s) signifying the
+sample quantiles for stratification using the \code{fixed} method. The number of
+risk groups will be the number of values +1.
+
+The default value is \code{c(1/3, 2/3)}, which will yield two thresholds that
+divide samples into three equally sized groups. If \code{fixed} is not among the
+selected stratification methods, this parameter is ignored.
+
+This parameter is only relevant for \code{survival} outcomes.}
+ \item{\code{time_max}}{(\emph{optional}) Time point which is used as the benchmark for
+e.g. cumulative risks generated by random forest, or the cutoff for Uno's
+concordance index.
+
+If \code{time_max} is not provided, but \code{evaluation_times} is, the largest value
+of \code{evaluation_times} is used. If both are not provided, \code{time_max} is set
+to the 98th percentile of the distribution of survival times for samples
+with an event in the development data set.
+
+This parameter is only relevant for \code{survival} outcomes.}
+ \item{\code{evaluation_times}}{(\emph{optional}) One or more time points that are used for
+assessing calibration in survival problems. This is done because expected and
+observed survival probabilities depend on time.
+
+If unset, \code{evaluation_times} will be equal to \code{time_max}.
+
+This parameter is only relevant for \code{survival} outcomes.}
+ \item{\code{dynamic_model_loading}}{(\emph{optional}) Enables dynamic loading of models
+during the evaluation process, if \code{TRUE}. Defaults to \code{FALSE}. Dynamic
+loading of models may reduce the overall memory footprint, at the cost of
+increased disk or network IO. Models can only be dynamically loaded if they
+are found at an accessible disk or network location. Setting this parameter
+to \code{TRUE} may help if parallel processing causes out-of-memory issues during
+evaluation.}
+ \item{\code{parallel_evaluation}}{(\emph{optional}) Enable parallel processing for
+the evaluation process.
Defaults to \code{TRUE}. When set to \code{FALSE}, this
+will disable the use of parallel processing while performing the evaluation,
+regardless of the settings of the \code{parallel} parameter. The parameter
+moreover specifies whether parallelisation takes place within the evaluation
+process steps (\code{inner}, default), or in an outer loop (\code{outer}) over
+learners, data subsamples, etc.
+
+\code{parallel_evaluation} is ignored if \code{parallel=FALSE}.}
+ }}
+}
+\value{
+A list of settings to be used within the workflow
+}
+\description{
+Internal function for parsing settings that configure various aspects of the
+workflow
+}
+\references{
+\enumerate{
+\item Storey, J. D. A direct approach to false discovery rates. J.
+R. Stat. Soc. Series B Stat. Methodol. 64, 479–498 (2002).
+\item Shrout, P. E. & Fleiss, J. L. Intraclass correlations: uses in assessing
+rater reliability. Psychol. Bull. 86, 420–428 (1979).
+\item Koo, T. K. & Li, M. Y. A guideline of selecting and reporting intraclass
+correlation coefficients for reliability research. J. Chiropr. Med. 15,
+155–163 (2016).
+\item Yeo, I. & Johnson, R. A. A new family of power transformations to
+improve normality or symmetry. Biometrika 87, 954–959 (2000).
+\item Box, G. E. P. & Cox, D. R. An analysis of transformations. J. R. Stat.
+Soc. Series B Stat. Methodol. 26, 211–252 (1964).
+\item Raymaekers, J., Rousseeuw, P. J. Transforming variables to central
+normality. Mach Learn. (2021).
+\item Park, M. Y., Hastie, T. & Tibshirani, R. Averaged gene expressions for
+regression. Biostatistics 8, 212–227 (2007).
+\item Tolosi, L. & Lengauer, T. Classification with correlated features:
+unreliability of feature ranking and solutions. Bioinformatics 27,
+1986–1994 (2011).
+\item Johnson, W. E., Li, C. & Rabinovic, A. Adjusting batch effects in
+microarray expression data using empirical Bayes methods. Biostatistics 8,
+118–127 (2007)
+\item Kaufman, L. & Rousseeuw, P. J. Finding groups in data: an introduction
+to cluster analysis. (John Wiley & Sons, 2009).
+\item Muellner, D. fastcluster: fast hierarchical, agglomerative clustering
+routines for R and Python. J. Stat. Softw. 53, 1–18 (2013).
+\item Rousseeuw, P. J. Silhouettes: A graphical aid to the interpretation and
+validation of cluster analysis. J. Comput. Appl. Math. 20, 53–65 (1987).
+\item Langfelder, P., Zhang, B. & Horvath, S. Defining clusters from a
+hierarchical cluster tree: the Dynamic Tree Cut package for R.
+Bioinformatics 24, 719–720 (2008).
+\item McFadden, D. Conditional logit analysis of qualitative choice behavior.
+in Frontiers in Econometrics (ed. Zarembka, P.) 105–142 (Academic Press,
+1974).
+\item Cox, D. R. & Snell, E. J. Analysis of binary data. (Chapman and Hall,
+1989).
+\item Nagelkerke, N. J. D. A note on a general definition of the coefficient
+of determination. Biometrika 78, 691–692 (1991).
+\item Meinshausen, N. & Buehlmann, P. Stability selection. J. R. Stat. Soc.
+Series B Stat. Methodol. 72, 417–473 (2010).
+\item Haury, A.-C., Gestraud, P. & Vert, J.-P. The influence of feature
+selection methods on accuracy, stability and interpretability of molecular
+signatures. PLoS One 6, e28210 (2011).
+\item Wald, R., Khoshgoftaar, T. M., Dittman, D., Awada, W. & Napolitano, A. An
+extensive comparison of feature ranking aggregation techniques in
+bioinformatics. in 2012 IEEE 13th International Conference on Information
+Reuse Integration (IRI) 377–384 (2012).
+\item Hutter, F., Hoos, H. H. & Leyton-Brown, K.
Sequential model-based +optimization for general algorithm configuration. in Learning and +Intelligent Optimization (ed. Coello, C. A. C.) 6683, 507–523 (Springer +Berlin Heidelberg, 2011). +\item Shahriari, B., Swersky, K., Wang, Z., Adams, R. P. & de Freitas, N. +Taking the Human Out of the Loop: A Review of Bayesian Optimization. Proc. +IEEE 104, 148–175 (2016) +\item Srinivas, N., Krause, A., Kakade, S. M. & Seeger, M. W. +Information-Theoretic Regret Bounds for Gaussian Process Optimization in +the Bandit Setting. IEEE Trans. Inf. Theory 58, 3250–3265 (2012) +\item Kaufmann, E., Cappé, O. & Garivier, A. On Bayesian upper confidence +bounds for bandit problems. in Artificial intelligence and statistics +592–600 (2012). +\item Jamieson, K. & Talwalkar, A. Non-stochastic Best Arm Identification and +Hyperparameter Optimization. in Proceedings of the 19th International +Conference on Artificial Intelligence and Statistics (eds. Gretton, A. & +Robert, C. C.) vol. 51 240–248 (PMLR, 2016). +\item Gramacy, R. B. laGP: Large-Scale Spatial Modeling via Local Approximate +Gaussian Processes in R. Journal of Statistical Software 72, 1–46 (2016) +\item Sparapani, R., Spanbauer, C. & McCulloch, R. Nonparametric Machine +Learning and Efficient Computation with Bayesian Additive Regression Trees: +The BART R Package. Journal of Statistical Software 97, 1–66 (2021) +\item Davison, A. C. & Hinkley, D. V. Bootstrap methods and their application. +(Cambridge University Press, 1997). +\item Efron, B. & Hastie, T. Computer Age Statistical Inference. (Cambridge +University Press, 2016). +\item Lausen, B. & Schumacher, M. Maximally Selected Rank Statistics. +Biometrics 48, 73 (1992). +\item Hothorn, T. & Lausen, B. On the exact distribution of maximally selected +rank statistics. Comput. Stat. Data Anal. 43, 121–137 (2003). +} +} +\keyword{internal} diff --git a/man/dot-parse_hyperparameter_optimisation_settings.Rd b/man/dot-parse_hyperparameter_optimisation_settings.Rd new file mode 100644 index 00000000..680288fb --- /dev/null +++ b/man/dot-parse_hyperparameter_optimisation_settings.Rd @@ -0,0 +1,329 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ParseSettings.R +\name{.parse_hyperparameter_optimisation_settings} +\alias{.parse_hyperparameter_optimisation_settings} +\title{Internal function for parsing settings related to model hyperparameter +optimisation} +\usage{ +.parse_hyperparameter_optimisation_settings( + config = NULL, + parallel, + outcome_type, + optimisation_bootstraps = waiver(), + optimisation_determine_vimp = waiver(), + smbo_random_initialisation = waiver(), + smbo_n_random_sets = waiver(), + max_smbo_iterations = waiver(), + smbo_stop_convergent_iterations = waiver(), + smbo_stop_tolerance = waiver(), + smbo_time_limit = waiver(), + smbo_initial_bootstraps = waiver(), + smbo_step_bootstraps = waiver(), + smbo_intensify_steps = waiver(), + smbo_stochastic_reject_p_value = waiver(), + optimisation_function = waiver(), + optimisation_metric = waiver(), + acquisition_function = waiver(), + exploration_method = waiver(), + hyperparameter_learner = waiver(), + parallel_hyperparameter_optimisation = waiver(), + ... +) +} +\arguments{ +\item{config}{A list of settings, e.g. from an xml file.} + +\item{parallel}{Logical value that whether familiar uses parallelisation. 
If +\code{FALSE} it will override \code{parallel_hyperparameter_optimisation}.} + +\item{outcome_type}{Type of outcome found in the data set.} + +\item{optimisation_bootstraps}{(\emph{optional}) Number of bootstraps that should +be generated from the development data set. During the optimisation +procedure one or more of these bootstraps (indicated by +\code{smbo_step_bootstraps}) are used for model development using different +combinations of hyperparameters. The effect of the hyperparameters is then +assessed by comparing in-bag and out-of-bag model performance. + +The default number of bootstraps is \code{50}. Hyperparameter optimisation may +finish before exhausting the set of bootstraps.} + +\item{optimisation_determine_vimp}{(\emph{optional}) Logical value that indicates +whether variable importance is determined separately for each of the +bootstraps created during the optimisation process (\code{TRUE}) or the +applicable results from the feature selection step are used (\code{FALSE}). + +Determining variable importance increases the initial computational +overhead. However, it prevents positive biases for the out-of-bag data due +to overlap of these data with the development data set used for the feature +selection step. In this case, any hyperparameters of the variable +importance method are not determined separately for each bootstrap, but +those obtained during the feature selection step are used instead. In case +multiple of such hyperparameter sets could be applicable, the set that will +be used is randomly selected for each bootstrap. + +This parameter only affects hyperparameter optimisation of learners. The +default is \code{TRUE}.} + +\item{smbo_random_initialisation}{(\emph{optional}) String indicating the +initialisation method for the hyperparameter space. Can be one of +\code{fixed_subsample} (default), \code{fixed}, or \code{random}. \code{fixed} and +\code{fixed_subsample} first create hyperparameter sets from a range of default +values set by familiar. \code{fixed_subsample} then randomly draws up to +\code{smbo_n_random_sets} from the grid. \code{random} does not rely upon a fixed +grid, and randomly draws up to \code{smbo_n_random_sets} hyperparameter sets +from the hyperparameter space.} + +\item{smbo_n_random_sets}{(\emph{optional}) Number of random or subsampled +hyperparameters drawn during the initialisation process. Default: \code{100}. +Cannot be smaller than \code{10}. The parameter is not used when +\code{smbo_random_initialisation} is \code{fixed}, as the entire pre-defined grid +will be explored.} + +\item{max_smbo_iterations}{(\emph{optional}) Maximum number of intensify +iterations of the SMBO algorithm. During an intensify iteration a run-off +occurs between the current \emph{best} hyperparameter combination and either 10 +challenger combination with the highest expected improvement or a set of 20 +random combinations. + +Run-off with random combinations is used to force exploration of the +hyperparameter space, and is performed every second intensify iteration, or +if there is no expected improvement for any challenger combination. + +If a combination of hyperparameters leads to better performance on the same +data than the incumbent \emph{best} set of hyperparameters, it replaces the +incumbent set at the end of the intensify iteration. + +The default number of intensify iteration is \code{20}. 
Iterations may be +stopped early if the incumbent set of hyperparameters remains the same for +\code{smbo_stop_convergent_iterations} iterations, or performance improvement is +minimal. This behaviour is suppressed during the first 4 iterations to +enable the algorithm to explore the hyperparameter space.} + +\item{smbo_stop_convergent_iterations}{(\emph{optional}) The number of subsequent +convergent SMBO iterations required to stop hyperparameter optimisation +early. An iteration is convergent if the \emph{best} parameter set has not +changed or the optimisation score over the 4 most recent iterations has not +changed beyond the tolerance level in \code{smbo_stop_tolerance}. + +The default value is \code{3}.} + +\item{smbo_stop_tolerance}{(\emph{optional}) Tolerance for early stopping due to +convergent optimisation score. + +The default value depends on the square root of the number of samples (at +the series level), and is \code{0.01} for 100 samples. This value is computed as +\code{0.1 * 1 / sqrt(n_samples)}. The upper limit is \code{0.0001} for 1M or more +samples.} + +\item{smbo_time_limit}{(\emph{optional}) Time limit (in minutes) for the +optimisation process. Optimisation is stopped after this limit is exceeded. +Time taken to determine variable importance for the optimisation process +(see the \code{optimisation_determine_vimp} parameter) does not count. + +The default is \code{NULL}, indicating that there is no time limit for the +optimisation process. The time limit cannot be less than 1 minute.} + +\item{smbo_initial_bootstraps}{(\emph{optional}) The number of bootstraps taken +from the set of \code{optimisation_bootstraps} as the bootstraps assessed +initially. + +The default value is \code{1}. The value cannot be larger than +\code{optimisation_bootstraps}.} + +\item{smbo_step_bootstraps}{(\emph{optional}) The number of bootstraps taken from +the set of \code{optimisation_bootstraps} bootstraps as the bootstraps assessed +during the steps of each intensify iteration. + +The default value is \code{3}. The value cannot be larger than +\code{optimisation_bootstraps}.} + +\item{smbo_intensify_steps}{(\emph{optional}) The number of steps in each SMBO +intensify iteration. Each step a new set of \code{smbo_step_bootstraps} +bootstraps is drawn and used in the run-off between the incumbent \emph{best} +hyperparameter combination and its challengers. + +The default value is \code{5}. Higher numbers allow for a more detailed +comparison, but this comes with added computational cost.} + +\item{smbo_stochastic_reject_p_value}{(\emph{optional}) The p-value threshold used +for the \code{stochastic_reject} exploration method. + +The default value is \code{0.05}.} + +\item{optimisation_function}{(\emph{optional}) Type of optimisation function used +to quantify the performance of a hyperparameter set. Model performance is +assessed using the metric(s) specified by \code{optimisation_metric} on the +in-bag (IB) and out-of-bag (OOB) samples of a bootstrap. These values are +converted to objective scores with a standardised interval of \eqn{[-1.0, + 1.0]}. Each pair of objective is subsequently used to compute an +optimisation score. The optimisation score across different bootstraps is +than aggregated to a summary score. This summary score is used to rank +hyperparameter sets, and select the optimal set. 
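The general shape of this computation can be sketched in a few lines of R. The two helper functions below are illustrative assumptions only, not familiar's actual formulas; the optimisation functions that familiar provides are listed next and detailed in the vignette.

# Sketch of the ranking structure: each hyperparameter set contributes one
# in-bag (ib) / out-of-bag (oob) objective-score pair per bootstrap, in [-1, 1].
scores <- data.frame(
  parameter_set = rep(c("set_1", "set_2"), each = 3),
  ib = c(0.90, 0.80, 0.85, 0.60, 0.70, 0.65),
  oob = c(0.40, 0.50, 0.45, 0.50, 0.55, 0.60)
)
optimisation_score <- function(ib, oob) oob - abs(ib - oob)  # assumed "balanced"-style penalty
summary_score <- function(x) mean(x)                         # assumed aggregation over bootstraps

per_bootstrap <- optimisation_score(scores$ib, scores$oob)
per_set <- tapply(per_bootstrap, scores$parameter_set, summary_score)
sort(per_set, decreasing = TRUE)  # the set with the highest summary score ranks best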
+ +The combination of optimisation score and summary score is determined by +the optimisation function indicated by this parameter: +\itemize{ +\item \code{validation} or \code{max_validation} (default): seeks to maximise OOB score. +\item \code{balanced}: seeks to balance IB and OOB score. +\item \code{stronger_balance}: similar to \code{balanced}, but with stronger penalty for +differences between IB and OOB scores. +\item \code{validation_minus_sd}: seeks to optimise the average OOB score minus its +standard deviation. +\item \code{validation_25th_percentile}: seeks to optimise the 25th percentile of +OOB scores, and is conceptually similar to \code{validation_minus_sd}. +\item \code{model_estimate}: seeks to maximise the OOB score estimate predicted by +the hyperparameter learner (not available for random search). +\item \code{model_estimate_minus_sd}: seeks to maximise the OOB score estimate minus +its estimated standard deviation, as predicted by the hyperparameter +learner (not available for random search). +\item \code{model_balanced_estimate}: seeks to maximise the estimate of the balanced +IB and OOB score. This is similar to the \code{balanced} score, and in fact uses +a hyperparameter learner to predict said score (not available for random +search). +\item \code{model_balanced_estimate_minus_sd}: seeks to maximise the estimate of the +balanced IB and OOB score, minus its estimated standard deviation. This is +similar to the \code{balanced} score, but takes into account its estimated +spread. +} + +Additional detail are provided in the \emph{Learning algorithms and +hyperparameter optimisation} vignette.} + +\item{optimisation_metric}{(\emph{optional}) One or more metrics used to compute +performance scores. See the vignette on performance metrics for the +available metrics. + +If unset, the following metrics are used by default: +\itemize{ +\item \code{auc_roc}: For \code{binomial} and \code{multinomial} models. +\item \code{mse}: Mean squared error for \code{continuous} models. +\item \code{msle}: Mean squared logarithmic error for \code{count} models. +\item \code{concordance_index}: For \code{survival} models. +} + +Multiple optimisation metrics can be specified. Actual metric values are +converted to an objective value by comparison with a baseline metric value +that derives from a trivial model, i.e. majority class for binomial and +multinomial outcomes, the median outcome for count and continuous outcomes +and a fixed risk or time for survival outcomes.} + +\item{acquisition_function}{(\emph{optional}) The acquisition function influences +how new hyperparameter sets are selected. The algorithm uses the model +learned by the learner indicated by \code{hyperparameter_learner} to search the +hyperparameter space for hyperparameter sets that are either likely better +than the best known set (\emph{exploitation}) or where there is considerable +uncertainty (\emph{exploration}). The acquisition function quantifies this +(Shahriari et al., 2016). + +The following acquisition functions are available, and are described in +more detail in the \emph{learner algorithms} vignette: +\itemize{ +\item \code{improvement_probability}: The probability of improvement quantifies the +probability that the expected optimisation score for a set is better than +the best observed optimisation score +\item \code{improvement_empirical_probability}: Similar to +\code{improvement_probability}, but based directly on optimisation scores +predicted by the individual decision trees. 
+\item \code{expected_improvement} (default): Computes expected improvement. +\item \code{upper_confidence_bound}: This acquisition function is based on the upper +confidence bound of the distribution (Srinivas et al., 2012). +\item \code{bayes_upper_confidence_bound}: This acquisition function is based on the +upper confidence bound of the distribution (Kaufmann et al., 2012). +}} + +\item{exploration_method}{(\emph{optional}) Method used to steer exploration in +post-initialisation intensive searching steps. As stated earlier, each SMBO +iteration step compares suggested alternative parameter sets with an +incumbent \strong{best} set in a series of steps. The exploration method +controls how the set of alternative parameter sets is pruned after each +step in an iteration. Can be one of the following: +\itemize{ +\item \code{single_shot} (default): The set of alternative parameter sets is not +pruned, and each intensification iteration contains only a single +intensification step that only uses a single bootstrap. This is the fastest +exploration method, but only superficially tests each parameter set. +\item \code{successive_halving}: The set of alternative parameter sets is +pruned by removing the worst performing half of the sets after each step +(Jamieson and Talwalkar, 2016). +\item \code{stochastic_reject}: The set of alternative parameter sets is pruned by +comparing the performance of each parameter set with that of the incumbent +\strong{best} parameter set using a paired Wilcoxon test based on shared +bootstraps. Parameter sets that perform significantly worse, at an alpha +level indicated by \code{smbo_stochastic_reject_p_value}, are pruned. +\item \code{none}: The set of alternative parameter sets is not pruned. +}} + +\item{hyperparameter_learner}{(\emph{optional}) Any point in the hyperparameter +space has a single, scalar, optimisation score value that is \emph{a priori} +unknown. During the optimisation process, the algorithm samples from the +hyperparameter space by selecting hyperparameter sets and computing the +optimisation score value for one or more bootstraps. For each +hyperparameter set the resulting values are distributed around the actual +value. The learner indicated by \code{hyperparameter_learner} is then used to +infer optimisation score estimates for unsampled parts of the +hyperparameter space. + +The following models are available: +\itemize{ +\item \code{bayesian_additive_regression_trees} or \code{bart}: Uses Bayesian Additive +Regression Trees (Sparapani et al., 2021) for inference. Unlike standard +random forests, BART allows for estimating posterior distributions directly +and can extrapolate. +\item \code{gaussian_process} (default): Creates a localised approximate Gaussian +process for inference (Gramacy, 2016). This allows for better scaling than +deterministic Gaussian Processes. +\item \code{random_forest}: Creates a random forest for inference. Originally +suggested by Hutter et al. (2011). A weakness of random forests is their +lack of extrapolation beyond observed values, which limits their usefulness +in exploiting promising areas of hyperparameter space. +\item \code{random} or \code{random_search}: Forgoes the use of models to steer +optimisation. Instead, a random search is performed. +}} + +\item{parallel_hyperparameter_optimisation}{(\emph{optional}) Enable parallel +processing for hyperparameter optimisation. Defaults to \code{TRUE}. 
When set to +\code{FALSE}, this will disable the use of parallel processing while performing +optimisation, regardless of the settings of the \code{parallel} parameter. The +parameter moreover specifies whether parallelisation takes place within the +optimisation algorithm (\code{inner}, default), or in an outer loop ( \code{outer}) +over learners, data subsamples, etc. + +\code{parallel_hyperparameter_optimisation} is ignored if \code{parallel=FALSE}.} + +\item{...}{Unused arguments.} +} +\value{ +List of parameters related to model hyperparameter optimisation. +} +\description{ +Internal function for parsing settings related to model hyperparameter +optimisation +} +\references{ +\enumerate{ +\item Hutter, F., Hoos, H. H. & Leyton-Brown, K. Sequential +model-based optimization for general algorithm configuration. in Learning +and Intelligent Optimization (ed. Coello, C. A. C.) 6683, 507–523 (Springer +Berlin Heidelberg, 2011). +\item Shahriari, B., Swersky, K., Wang, Z., Adams, R. P. & de Freitas, N. +Taking the Human Out of the Loop: A Review of Bayesian Optimization. Proc. +IEEE 104, 148–175 (2016) +\item Srinivas, N., Krause, A., Kakade, S. M. & Seeger, M. W. +Information-Theoretic Regret Bounds for Gaussian Process Optimization in +the Bandit Setting. IEEE Trans. Inf. Theory 58, 3250–3265 (2012) +\item Kaufmann, E., Cappé, O. & Garivier, A. On Bayesian upper confidence +bounds for bandit problems. in Artificial intelligence and statistics +592–600 (2012). +\item Jamieson, K. & Talwalkar, A. Non-stochastic Best Arm Identification and +Hyperparameter Optimization. in Proceedings of the 19th International +Conference on Artificial Intelligence and Statistics (eds. Gretton, A. & +Robert, C. C.) vol. 51 240–248 (PMLR, 2016). +\item Gramacy, R. B. laGP: Large-Scale Spatial Modeling via Local Approximate +Gaussian Processes in R. Journal of Statistical Software 72, 1–46 (2016) +\item Sparapani, R., Spanbauer, C. & McCulloch, R. Nonparametric Machine +Learning and Efficient Computation with Bayesian Additive Regression Trees: +The BART R Package. Journal of Statistical Software 97, 1–66 (2021) +} +} +\keyword{internal} diff --git a/man/dot-parse_initial_settings.Rd b/man/dot-parse_initial_settings.Rd new file mode 100644 index 00000000..c93db933 --- /dev/null +++ b/man/dot-parse_initial_settings.Rd @@ -0,0 +1,226 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ParseSettings.R +\name{.parse_initial_settings} +\alias{.parse_initial_settings} +\title{Internal function for parsing settings required to parse the input data +and define the experiment} +\usage{ +.parse_initial_settings(config = NULL, ...) +} +\arguments{ +\item{config}{A list of settings, e.g. from an xml file.} + +\item{...}{ + Arguments passed on to \code{\link[=.parse_experiment_settings]{.parse_experiment_settings}} + \describe{ + \item{\code{batch_id_column}}{(\strong{recommended}) Name of the column containing batch +or cohort identifiers. This parameter is required if more than one dataset +is provided, or if external validation is performed. + +In familiar any row of data is organised by four identifiers: +\itemize{ +\item The batch identifier \code{batch_id_column}: This denotes the group to which a +set of samples belongs, e.g. patients from a single study, samples measured +in a batch, etc. The batch identifier is used for batch normalisation, as +well as selection of development and validation datasets. 
+\item The sample identifier \code{sample_id_column}: This denotes the sample level, +e.g. data from a single individual. Subsets of data, e.g. bootstraps or +cross-validation folds, are created at this level. +\item The series identifier \code{series_id_column}: Indicates measurements on a +single sample that may not share the same outcome value, e.g. a time +series, or the number of cells in a view. +\item The repetition identifier: Indicates repeated measurements in a single +series where any feature values may differ, but the outcome does not. +Repetition identifiers are always implicitly set when multiple entries for +the same series of the same sample in the same batch that share the same +outcome are encountered. +}} + \item{\code{sample_id_column}}{(\strong{recommended}) Name of the column containing +sample or subject identifiers. See \code{batch_id_column} above for more +details. + +If unset, every row will be identified as a single sample.} + \item{\code{series_id_column}}{(\strong{optional}) Name of the column containing series +identifiers, which distinguish between measurements that are part of a +series for a single sample. See \code{batch_id_column} above for more details. + +If unset, rows which share the same batch and sample identifiers but have a +different outcome are assigned unique series identifiers.} + \item{\code{development_batch_id}}{(\emph{optional}) One or more batch or cohort +identifiers to constitute data sets for development. Defaults to all, or +all minus the identifiers in \code{validation_batch_id} for external validation. +Required if external validation is performed and \code{validation_batch_id} is +not provided.} + \item{\code{validation_batch_id}}{(\emph{optional}) One or more batch or cohort +identifiers to constitute data sets for external validation. Defaults to +all data sets except those in \code{development_batch_id} for external +validation, or none if not. Required if \code{development_batch_id} is not +provided.} + \item{\code{outcome_name}}{(\emph{optional}) Name of the modelled outcome. This name will +be used in figures created by \code{familiar}. + +If not set, the column name in \code{outcome_column} will be used for +\code{binomial}, \code{multinomial}, \code{count} and \code{continuous} outcomes. For other +outcomes (\code{survival} and \code{competing_risk}) no default is used.} + \item{\code{outcome_column}}{(\strong{recommended}) Name of the column containing the +outcome of interest. May be identified from a formula, if a formula is +provided as an argument. Otherwise an error is raised. Note that \code{survival} +and \code{competing_risk} outcome type outcomes require two columns that +indicate the time-to-event or the time of last follow-up and the event +status.} + \item{\code{outcome_type}}{(\strong{recommended}) Type of outcome found in the outcome +column. The outcome type determines many aspects of the overall process, +e.g. the available feature selection methods and learners, but also the +type of assessments that can be conducted to evaluate the resulting models. +Implemented outcome types are: +\itemize{ +\item \code{binomial}: categorical outcome with 2 levels. +\item \code{multinomial}: categorical outcome with 2 or more levels. +\item \code{count}: Poisson-distributed numeric outcomes. +\item \code{continuous}: general continuous numeric outcomes. +\item \code{survival}: survival outcome for time-to-event data. 
+} + +If not provided, the algorithm will attempt to obtain outcome_type from +contents of the outcome column. This may lead to unexpected results, and we +therefore advise to provide this information manually. + +Note that \code{competing_risk} survival analysis are not fully supported, and +is currently not a valid choice for \code{outcome_type}.} + \item{\code{class_levels}}{(\emph{optional}) Class levels for \code{binomial} or \code{multinomial} +outcomes. This argument can be used to specify the ordering of levels for +categorical outcomes. These class levels must exactly match the levels +present in the outcome column.} + \item{\code{event_indicator}}{(\strong{recommended}) Indicator for events in \code{survival} +and \code{competing_risk} analyses. \code{familiar} will automatically recognise \code{1}, +\code{true}, \code{t}, \code{y} and \code{yes} as event indicators, including different +capitalisations. If this parameter is set, it replaces the default values.} + \item{\code{censoring_indicator}}{(\strong{recommended}) Indicator for right-censoring in +\code{survival} and \code{competing_risk} analyses. \code{familiar} will automatically +recognise \code{0}, \code{false}, \code{f}, \code{n}, \code{no} as censoring indicators, including +different capitalisations. If this parameter is set, it replaces the +default values.} + \item{\code{competing_risk_indicator}}{(\strong{recommended}) Indicator for competing +risks in \code{competing_risk} analyses. There are no default values, and if +unset, all values other than those specified by the \code{event_indicator} and +\code{censoring_indicator} parameters are considered to indicate competing +risks.} + \item{\code{signature}}{(\emph{optional}) One or more names of feature columns that are +considered part of a specific signature. Features specified here will +always be used for modelling. Ranking from feature selection has no effect +for these features.} + \item{\code{novelty_features}}{(\emph{optional}) One or more names of feature columns +that should be included for the purpose of novelty detection.} + \item{\code{exclude_features}}{(\emph{optional}) Feature columns that will be removed +from the data set. Cannot overlap with features in \code{signature}, +\code{novelty_features} or \code{include_features}.} + \item{\code{include_features}}{(\emph{optional}) Feature columns that are specifically +included in the data set. By default all features are included. Cannot +overlap with \code{exclude_features}, but may overlap \code{signature}. Features in +\code{signature} and \code{novelty_features} are always included. If both +\code{exclude_features} and \code{include_features} are provided, \code{include_features} +takes precedence, provided that there is no overlap between the two.} + \item{\code{reference_method}}{(\emph{optional}) Method used to set reference levels for +categorical features. There are several options: +\itemize{ +\item \code{auto} (default): Categorical features that are not explicitly set by the +user, i.e. columns containing boolean values or characters, use the most +frequent level as reference. Categorical features that are explicitly set, +i.e. as factors, are used as is. +\item \code{always}: Both automatically detected and user-specified categorical +features have the reference level set to the most frequent level. Ordinal +features are not altered, but are used as is. +\item \code{never}: User-specified categorical features are used as is. 
+Automatically detected categorical features are simply sorted, and the +first level is then used as the reference level. This was the behaviour +prior to familiar version 1.3.0. +}} + \item{\code{experimental_design}}{(\strong{required}) Defines what the experiment looks +like, e.g. \code{cv(bt(fs,20)+mb,3,2)+ev} for 2 times repeated 3-fold +cross-validation with nested feature selection on 20 bootstraps and +model-building, and external validation. The basic workflow components are: +\itemize{ +\item \code{fs}: (required) feature selection step. +\item \code{mb}: (required) model building step. +\item \code{ev}: (optional) external validation. Note that internal validation due +to subsampling will always be conducted if the subsampling methods create +any validation data sets. +} + +The different components are linked using \code{+}. + +Different subsampling methods can be used in conjunction with the basic +workflow components: +\itemize{ +\item \code{bs(x,n)}: (stratified) .632 bootstrap, with \code{n} the number of +bootstraps. In contrast to \code{bt}, feature pre-processing parameters and +hyperparameter optimisation are conducted on individual bootstraps. +\item \code{bt(x,n)}: (stratified) .632 bootstrap, with \code{n} the number of +bootstraps. Unlike \code{bs} and other subsampling methods, no separate +pre-processing parameters or optimised hyperparameters will be determined +for each bootstrap. +\item \code{cv(x,n,p)}: (stratified) \code{n}-fold cross-validation, repeated \code{p} times. +Pre-processing parameters are determined for each iteration. +\item \code{lv(x)}: leave-one-out-cross-validation. Pre-processing parameters are +determined for each iteration. +\item \code{ip(x)}: imbalance partitioning for addressing class imbalances on the +data set. Pre-processing parameters are determined for each partition. The +number of partitions generated depends on the imbalance correction method +(see the \code{imbalance_correction_method} parameter). Imbalance partitioning +does not generate validation sets. +} + +As shown in the example above, sampling algorithms can be nested. + +The simplest valid experimental design is \code{fs+mb}, which corresponds to a +TRIPOD type 1a analysis. Type 1b analyses are only possible using +bootstraps, e.g. \code{bt(fs+mb,100)}. Type 2a analyses can be conducted using +cross-validation, e.g. \code{cv(bt(fs,100)+mb,10,1)}. Depending on the origin of +the external validation data, designs such as \code{fs+mb+ev} or +\code{cv(bt(fs,100)+mb,10,1)+ev} constitute type 2b or type 3 analyses. Type 4 +analyses can be done by obtaining one or more \code{familiarModel} objects from +others and applying them to your own data set. + +Alternatively, the \code{experimental_design} parameter may be used to provide a +path to a file containing iterations, which is named \verb{####_iterations.RDS} +by convention. This path can be relative to the directory of the current +experiment (\code{experiment_dir}), or an absolute path. The absolute path may +thus also point to a file from a different experiment.} + \item{\code{imbalance_correction_method}}{(\emph{optional}) Type of method used to +address class imbalances. Available options are: +\itemize{ +\item \code{full_undersampling} (default): All data will be used in an ensemble +fashion. The full minority class will appear in each partition, but +majority classes are undersampled until all data have been used. +\item \code{random_undersampling}: Randomly undersamples majority classes. 
This is
+useful in cases where full undersampling would lead to the formation of
+many models due to major overrepresentation of the largest class.
+}
+
+This parameter is only used in combination with imbalance partitioning in
+the experimental design, and \code{ip} should therefore appear in the string
+that defines the design.}
+ \item{\code{imbalance_n_partitions}}{(\emph{optional}) Number of times random
+undersampling should be repeated. 10 undersampled subsets with balanced
+classes are formed by default.}
+ }}
+}
+\value{
+A list of settings to be used for configuring the experiments.
+}
+\description{
+This function parses the settings required to interpret the data set, e.g. to
+determine which columns are identifier columns, which column contains outcome
+data, and which type of outcome it is.
+}
+\details{
+Three variants of parameters exist:
+\itemize{
+\item required: this parameter is required and must be set by the user.
+\item recommended: not setting this parameter might cause an error to be thrown,
+dependent on other input.
+\item optional: these parameters have default values that may be altered if
+required.
+}
+}
+\keyword{internal}
diff --git a/man/dot-parse_model_development_settings.Rd b/man/dot-parse_model_development_settings.Rd
new file mode 100644
index 00000000..3ea0abf9
--- /dev/null
+++ b/man/dot-parse_model_development_settings.Rd
@@ -0,0 +1,69 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/ParseSettings.R
+\name{.parse_model_development_settings}
+\alias{.parse_model_development_settings}
+\title{Internal function for parsing settings related to model development}
+\usage{
+.parse_model_development_settings(
+ config = NULL,
+ data,
+ parallel,
+ outcome_type,
+ learner = waiver(),
+ hyperparameter = waiver(),
+ novelty_detector = waiver(),
+ detector_parameters = waiver(),
+ parallel_model_development = waiver(),
+ ...
+)
+}
+\arguments{
+\item{config}{A list of settings, e.g. from an xml file.}
+
+\item{data}{Data set as loaded using the \code{.load_data} function.}
+
+\item{parallel}{Logical value that indicates whether familiar uses
+parallelisation. If \code{FALSE} it will override \code{parallel_model_development}.}
+
+\item{outcome_type}{Type of outcome found in the data set.}
+
+\item{learner}{(\strong{required}) One or more algorithms used for model
+development. A sizeable number of learners is supported in \code{familiar}. Please
+see the vignette on learners for more information concerning the available
+learners.}
+
+\item{hyperparameter}{(\emph{optional}) List of lists containing hyperparameters
+for learners. Each sublist should have the name of the learner method it
+corresponds to, with list elements being named after the intended
+hyperparameter, e.g. \code{"glm_logistic"=list("sign_size"=3)}.
+
+All learners have hyperparameters. Please refer to the vignette on learners
+for more details. If no parameters are provided, sequential model-based
+optimisation is used to determine optimal hyperparameters.
+
+Hyperparameters provided by the user are never optimised. However, if more
+than one value is provided for a single hyperparameter, optimisation will
+be conducted using these values.}
+
+\item{novelty_detector}{(\emph{optional}) Specify the algorithm used for training
+a novelty detector. This detector can be used to identify
+out-of-distribution data prospectively.}
+
+\item{detector_parameters}{(\emph{optional}) List of lists containing hyperparameters
+for novelty detectors.
Currently not used.} + +\item{parallel_model_development}{(\emph{optional}) Enable parallel processing for +the model development workflow. Defaults to \code{TRUE}. When set to \code{FALSE}, +this will disable the use of parallel processing while developing models, +regardless of the settings of the \code{parallel} parameter. +\code{parallel_model_development} is ignored if \code{parallel=FALSE}.} + +\item{...}{Unused arguments.} +} +\value{ +List of parameters related to model development. +} +\description{ +Internal function for parsing settings related to model development +} +\keyword{internal} diff --git a/man/dot-parse_preprocessing_settings.Rd b/man/dot-parse_preprocessing_settings.Rd new file mode 100644 index 00000000..b1d96818 --- /dev/null +++ b/man/dot-parse_preprocessing_settings.Rd @@ -0,0 +1,485 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ParseSettings.R +\name{.parse_preprocessing_settings} +\alias{.parse_preprocessing_settings} +\title{Internal function for parsing settings related to preprocessing} +\usage{ +.parse_preprocessing_settings( + config = NULL, + data, + parallel, + outcome_type, + feature_max_fraction_missing = waiver(), + sample_max_fraction_missing = waiver(), + filter_method = waiver(), + univariate_test_threshold = waiver(), + univariate_test_threshold_metric = waiver(), + univariate_test_max_feature_set_size = waiver(), + low_var_minimum_variance_threshold = waiver(), + low_var_max_feature_set_size = waiver(), + robustness_icc_type = waiver(), + robustness_threshold_metric = waiver(), + robustness_threshold_value = waiver(), + transformation_method = waiver(), + normalisation_method = waiver(), + batch_normalisation_method = waiver(), + imputation_method = waiver(), + cluster_method = waiver(), + cluster_linkage_method = waiver(), + cluster_cut_method = waiver(), + cluster_similarity_metric = waiver(), + cluster_similarity_threshold = waiver(), + cluster_representation_method = waiver(), + parallel_preprocessing = waiver(), + ... +) +} +\arguments{ +\item{config}{A list of settings, e.g. from an xml file.} + +\item{data}{Data set as loaded using the \code{.load_data} function.} + +\item{parallel}{Logical value that indicates whether familiar uses parallelisation. If +\code{FALSE} it will override \code{parallel_preprocessing}.} + +\item{outcome_type}{Type of outcome found in the data set.} + +\item{feature_max_fraction_missing}{(\emph{optional}) Numeric value between \code{0.0} +and \code{0.95} that determines the maximum fraction of missing values that +still allows a feature to be included in the data set. All features with a +missing value fraction over this threshold are not processed further. The +default value is \code{0.30}.} + +\item{sample_max_fraction_missing}{(\emph{optional}) Numeric value between \code{0.0} +and \code{0.95} that determines the maximum fraction of missing values that +still allows a sample to be included in the data set. All samples with a +missing value fraction over this threshold are excluded and not processed +further. The default value is \code{0.30}.} + +\item{filter_method}{(\emph{optional}) One or more methods used to reduce +dimensionality of the data set by removing irrelevant or poorly +reproducible features. + +Several methods are available: +\itemize{ +\item \code{none} (default): None of the features will be filtered. +\item \code{low_variance}: Features with a variance below the +\code{low_var_minimum_variance_threshold} are filtered.
This can be useful to +filter, for example, genes that are not differentially expressed. +\item \code{univariate_test}: Features undergo a univariate regression using an +outcome-appropriate regression model. The p-value of the model coefficient +is collected. Features with coefficient p or q-value above the +\code{univariate_test_threshold} are subsequently filtered. +\item \code{robustness}: Features that are not sufficiently robust according to the +intraclass correlation coefficient are filtered. Use of this method +requires that repeated measurements are present in the data set, i.e. there +should be entries for which the sample and cohort identifiers are the same. +} + +More than one method can be used simultaneously. Features with singular +values are always filtered, as these do not contain information.} + +\item{univariate_test_threshold}{(\emph{optional}) Numeric value between \code{1.0} and +\code{0.0} that determines which features are irrelevant and will be filtered by +the \code{univariate_test}. The p or q-values are compared to this threshold. +All features with values above the threshold are filtered. The default +value is \code{0.20}.} + +\item{univariate_test_threshold_metric}{(\emph{optional}) Metric used to +compare the \code{univariate_test_threshold} against. The following metrics can +be chosen: +\itemize{ +\item \code{p_value} (default): The unadjusted p-value of each feature is used +to filter features. +\item \code{q_value}: The q-value (Storey, 2002) is used to filter features. Some +data sets may have insufficient samples to compute the q-value. The +\code{qvalue} package must be installed from Bioconductor to use this method. +}} + +\item{univariate_test_max_feature_set_size}{(\emph{optional}) Maximum size of the +feature set after the univariate test. P or q values of features are +compared against the threshold, but if the resulting data set would be +larger than this setting, only the most relevant features up to the desired +feature set size are selected. + +The default value is \code{NULL}, which causes features to be filtered based on +their relevance only.} + +\item{low_var_minimum_variance_threshold}{(required, if used) Numeric value +that determines which features will be filtered by the \code{low_variance} +method. The variance of each feature is computed and compared to the +threshold. If it is below the threshold, the feature is removed. + +This parameter has no default value and should be set if \code{low_variance} is +used.} + +\item{low_var_max_feature_set_size}{(\emph{optional}) Maximum size of the feature +set after filtering features with a low variance. All features are first +compared against \code{low_var_minimum_variance_threshold}. If the resulting +feature set would be larger than specified, only the most strongly varying +features will be selected, up to the desired size of the feature set. + +The default value is \code{NULL}, which causes features to be filtered based on +their variance only.} + +\item{robustness_icc_type}{(\emph{optional}) String indicating the type of +intraclass correlation coefficient (\code{1}, \code{2} or \code{3}) that should be used to +compute robustness for features in repeated measurements. These types +correspond to the types in Shrout and Fleiss (1979). The default value is +\code{1}.} + +\item{robustness_threshold_metric}{(\emph{optional}) String indicating which +specific intraclass correlation coefficient (ICC) metric should be used to +filter features.
This should be one of: +\itemize{ +\item \code{icc}: The estimated ICC value itself. +\item \code{icc_low} (default): The estimated lower limit of the 95\% confidence +interval of the ICC, as suggested by Koo and Li (2016). +\item \code{icc_panel}: The estimated ICC value over the panel average, i.e. the ICC +that would be obtained if all repeated measurements were averaged. +\item \code{icc_panel_low}: The estimated lower limit of the 95\% confidence interval +of the panel ICC. +}} + +\item{robustness_threshold_value}{(\emph{optional}) The intraclass correlation +coefficient value that is used as the threshold. The default value is \code{0.70}.} + +\item{transformation_method}{(\emph{optional}) The transformation method used to +change the distribution of the data to be more normal-like. The following +methods are available: +\itemize{ +\item \code{none}: This disables transformation of features. +\item \code{yeo_johnson} (default): Transformation using the Yeo-Johnson +transformation (Yeo and Johnson, 2000). The algorithm tests various lambda +values and selects the lambda that maximises the log-likelihood. +\item \code{yeo_johnson_trim}: As \code{yeo_johnson}, but based on the set of feature +values where the 5\% lowest and 5\% highest values are discarded. This +reduces the effect of outliers. +\item \code{yeo_johnson_winsor}: As \code{yeo_johnson}, but based on the set of feature +values where the 5\% lowest and 5\% highest values are winsorised. This +reduces the effect of outliers. +\item \code{yeo_johnson_robust}: A robust version of \code{yeo_johnson} after Raymaekers +and Rousseeuw (2021). This method is less sensitive to outliers. +\item \code{box_cox}: Transformation using the Box-Cox transformation (Box and Cox, +1964). Unlike the Yeo-Johnson transformation, the Box-Cox transformation +requires that all data are positive. Features that contain zero or negative +values cannot be transformed using this transformation. The algorithm tests +various lambda values and selects the lambda that maximises the +log-likelihood. +\item \code{box_cox_trim}: As \code{box_cox}, but based on the set of feature values +where the 5\% lowest and 5\% highest values are discarded. This reduces the +effect of outliers. +\item \code{box_cox_winsor}: As \code{box_cox}, but based on the set of feature values +where the 5\% lowest and 5\% highest values are winsorised. This reduces the +effect of outliers. +\item \code{box_cox_robust}: A robust version of \code{box_cox} after Raymaekers and +Rousseeuw (2021). This method is less sensitive to outliers. +} + +Only features that contain numerical data are transformed. Transformation +parameters obtained in development data are stored within \code{featureInfo} +objects for later use with validation data sets.} + +\item{normalisation_method}{(\emph{optional}) The normalisation method used to +improve the comparability between numerical features that may have very +different scales. The following normalisation methods can be chosen: +\itemize{ +\item \code{none}: This disables feature normalisation. +\item \code{standardisation}: Features are normalised by subtraction of their mean +values and division by their standard deviations. This causes every feature +to have a center value of 0.0 and standard deviation of 1.0. +\item \code{standardisation_trim}: As \code{standardisation}, but based on the set of +feature values where the 5\% lowest and 5\% highest values are discarded. +This reduces the effect of outliers.
+\item \code{standardisation_winsor}: As \code{standardisation}, but based on the set of +feature values where the 5\% lowest and 5\% highest values are winsorised. +This reduces the effect of outliers. +\item \code{standardisation_robust} (default): A robust version of \code{standardisation} +that relies on computing Huber's M-estimators for location and scale. +\item \code{normalisation}: Features are normalised by subtraction of their minimum +values and division by their ranges. This maps all feature values to a +\eqn{[0, 1]} interval. +\item \code{normalisation_trim}: As \code{normalisation}, but based on the set of feature +values where the 5\% lowest and 5\% highest values are discarded. This +reduces the effect of outliers. +\item \code{normalisation_winsor}: As \code{normalisation}, but based on the set of +feature values where the 5\% lowest and 5\% highest values are winsorised. +This reduces the effect of outliers. +\item \code{quantile}: Features are normalised by subtraction of their median values +and division by their interquartile range. +\item \code{mean_centering}: Features are centered by subtracting the mean, but do +not undergo rescaling. +} + +Only features that contain numerical data are normalised. Normalisation +parameters obtained in development data are stored within \code{featureInfo} +objects for later use with validation data sets.} + +\item{batch_normalisation_method}{(\emph{optional}) The method used for batch +normalisation. Available methods are: +\itemize{ +\item \code{none} (default): This disables batch normalisation of features. +\item \code{standardisation}: Features within each batch are normalised by +subtraction of the mean value and division by the standard deviation in +each batch. +\item \code{standardisation_trim}: As \code{standardisation}, but based on the set of +feature values where the 5\% lowest and 5\% highest values are discarded. +This reduces the effect of outliers. +\item \code{standardisation_winsor}: As \code{standardisation}, but based on the set of +feature values where the 5\% lowest and 5\% highest values are winsorised. +This reduces the effect of outliers. +\item \code{standardisation_robust}: A robust version of \code{standardisation} that +relies on computing Huber's M-estimators for location and scale within each +batch. +\item \code{normalisation}: Features within each batch are normalised by subtraction +of their minimum values and division by their range in each batch. This +maps all feature values in each batch to a \eqn{[0, 1]} interval. +\item \code{normalisation_trim}: As \code{normalisation}, but based on the set of feature +values where the 5\% lowest and 5\% highest values are discarded. This +reduces the effect of outliers. +\item \code{normalisation_winsor}: As \code{normalisation}, but based on the set of +feature values where the 5\% lowest and 5\% highest values are winsorised. +This reduces the effect of outliers. +\item \code{quantile}: Features in each batch are normalised by subtraction of the +median value and division by the interquartile range of each batch. +\item \code{mean_centering}: Features in each batch are centered on 0.0 by +subtracting the mean value in each batch, but are not rescaled. +\item \code{combat_parametric}: Batch adjustments using parametric empirical Bayes +(Johnson et al, 2007). \code{combat_p} leads to the same method. +\item \code{combat_non_parametric}: Batch adjustments using non-parametric empirical +Bayes (Johnson et al, 2007).
\code{combat_np} and \code{combat} lead to the same +method. Note that we reduced complexity from O(\eqn{n^2}) to O(\eqn{n}) by +only computing batch adjustment parameters for each feature on a subset of +50 randomly selected features, instead of all features. +} + +Only features that contain numerical data are normalised using batch +normalisation. Batch normalisation parameters obtained in development data +are stored within \code{featureInfo} objects for later use with validation data +sets, in case the validation data is from the same batch. + +If validation data contains data from unknown batches, normalisation +parameters are separately determined for these batches. + +Note that for both empirical Bayes methods, the batch effect is assumed to +produce similar effects across the features. This is often true for things such as +gene expressions, but the assumption may not hold generally. + +When performing batch normalisation, it is moreover important to check that +differences between batches or cohorts are not related to the studied +endpoint.} + +\item{imputation_method}{(\emph{optional}) Method used for imputing missing +feature values. Two methods are implemented: +\itemize{ +\item \code{simple}: Simple replacement of a missing value by the median value (for +numeric features) or the modal value (for categorical features). +\item \code{lasso}: Imputation of missing values by lasso regression (using \code{glmnet}) +based on information contained in other features. +} + +\code{simple} imputation precedes \code{lasso} imputation to ensure that any missing +values in predictors required for \code{lasso} regression are resolved. The +\code{lasso} estimate is then used to replace the missing value. + +The default value depends on the number of features in the dataset. If the +number is lower than 100, \code{lasso} is used by default, and \code{simple} +otherwise. + +Only single imputation is performed. Imputation models and parameters are +stored within \code{featureInfo} objects for later use with validation data +sets.} + +\item{cluster_method}{(\emph{optional}) Clustering is performed to identify and +replace redundant features, for example those that are highly correlated. +Such features do not carry much additional information and may be removed +or replaced instead (Park et al., 2007; Tolosi and Lengauer, 2011). + +The cluster method determines the algorithm used to form the clusters. The +following cluster methods are implemented: +\itemize{ +\item \code{none}: No clustering is performed. +\item \code{hclust} (default): Hierarchical agglomerative clustering. If the +\code{fastcluster} package is installed, \code{fastcluster::hclust} is used (Muellner, +2013), otherwise \code{stats::hclust} is used. +\item \code{agnes}: Hierarchical clustering using agglomerative nesting (Kaufman and +Rousseeuw, 1990). This algorithm is similar to \code{hclust}, but uses the +\code{cluster::agnes} implementation. +\item \code{diana}: Divisive analysis hierarchical clustering. This method uses +divisive instead of agglomerative clustering (Kaufman and Rousseeuw, 1990). +\code{cluster::diana} is used. +\item \code{pam}: Partitioning around medoids. This partitions the data into \eqn{k} +clusters around medoids (Kaufman and Rousseeuw, 1990). \eqn{k} is selected +using the \code{silhouette} metric. \code{pam} is implemented using the +\code{cluster::pam} function. +} + +Clusters and cluster information are stored within \code{featureInfo} objects for +later use with validation data sets.
This enables reproduction of the same +clusters as formed in the development data set.} + +\item{cluster_linkage_method}{(\emph{optional}) Linkage method used for +agglomerative clustering in \code{hclust} and \code{agnes}. The following linkage +methods can be used: +\itemize{ +\item \code{average} (default): Average linkage. +\item \code{single}: Single linkage. +\item \code{complete}: Complete linkage. +\item \code{weighted}: Weighted linkage, also known as McQuitty linkage. +\item \code{ward}: Linkage using Ward's minimum variance method. +} + +\code{diana} and \code{pam} do not require a linkage method.} + +\item{cluster_cut_method}{(\emph{optional}) The method used to define the actual +clusters. The following methods can be used: +\itemize{ +\item \code{silhouette}: Clusters are formed based on the silhouette score +(Rousseeuw, 1987). The average silhouette score is computed from 2 to +\eqn{n} clusters, with \eqn{n} the number of features. Clusters are only +formed if the average silhouette exceeds 0.50, which indicates reasonable +evidence for structure. This procedure may be slow if the number of +features is large (>100s). +\item \code{fixed_cut}: Clusters are formed by cutting the hierarchical tree at the +point indicated by the \code{cluster_similarity_threshold}, e.g. where features +in a cluster have an average Spearman correlation of 0.90. \code{fixed_cut} is +only available for \code{agnes}, \code{diana} and \code{hclust}. +\item \code{dynamic_cut}: Dynamic cluster formation using the cutting algorithm in +the \code{dynamicTreeCut} package. This package should be installed to select +this option. \code{dynamic_cut} can only be used with \code{agnes} and \code{hclust}. +} + +The default options are \code{silhouette} for partitioning around medoids (\code{pam}) +and \code{fixed_cut} otherwise.} + +\item{cluster_similarity_metric}{(\emph{optional}) Clusters are formed based on +feature similarity. All features are compared in a pair-wise fashion to +compute similarity, for example correlation. The resulting similarity grid +is converted into a distance matrix that is subsequently used for +clustering. The following metrics are supported to compute pairwise +similarities: +\itemize{ +\item \code{mutual_information} (default): Normalised mutual information. +\item \code{mcfadden_r2}: McFadden's pseudo R-squared (McFadden, 1974). +\item \code{cox_snell_r2}: Cox and Snell's pseudo R-squared (Cox and Snell, 1989). +\item \code{nagelkerke_r2}: Nagelkerke's pseudo R-squared (Nagelkerke, 1991). +\item \code{spearman}: Spearman's rank order correlation. +\item \code{kendall}: Kendall rank correlation. +\item \code{pearson}: Pearson product-moment correlation. +} + +The pseudo R-squared metrics can be used to assess similarity between mixed +pairs of numeric and categorical features, as these are based on the +log-likelihood of regression models. In \code{familiar}, the more informative +feature is used as the predictor and the other feature as the response +variable. In numeric-categorical pairs, the numeric feature is considered +to be more informative and is thus used as the predictor. In +categorical-categorical pairs, the feature with the most levels is used as the +predictor.
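As an aside, the following standalone sketch makes the similarity-based clustering described for the \code{cluster_*} parameters concrete: pair-wise Spearman similarity is computed with base R, converted to a distance, and a hierarchical tree is cut at a fixed similarity of 0.90. This only illustrates the general idea and is not familiar's internal implementation; the feature names are made up.

```r
# Standalone sketch of similarity-based feature clustering (not familiar's
# internal implementation). Two of the four features are nearly identical.
set.seed(1)
x <- data.frame(a = rnorm(50), b = rnorm(50), c = rnorm(50))
x$d <- x$a + rnorm(50, sd = 0.1)

similarity <- abs(stats::cor(x, method = "spearman"))  # pair-wise similarity
distance <- stats::as.dist(1.0 - similarity)           # convert to distance

tree <- stats::hclust(distance, method = "average")    # average linkage
clusters <- stats::cutree(tree, h = 1.0 - 0.90)        # fixed cut at 0.90 similarity

split(names(clusters), clusters)                       # "a" and "d" form one cluster
```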
+ +In case any of the classical correlation coefficients (\code{pearson}, +\code{spearman} and \code{kendall}) are used with (mixed) categorical features, the +categorical features are one-hot encoded and the mean correlation over all +resulting pairs is used as similarity.} + +\item{cluster_similarity_threshold}{(\emph{optional}) The threshold level for +pair-wise similarity that is required to form clusters using \code{fixed_cut}. +This should be a numerical value between 0.0 and 1.0. Note, however, that a +reasonable threshold value depends strongly on the similarity metric. The +following are the default values used: +\itemize{ +\item \code{mcfadden_r2} and \code{mutual_information}: \code{0.30} +\item \code{cox_snell_r2} and \code{nagelkerke_r2}: \code{0.75} +\item \code{spearman}, \code{kendall} and \code{pearson}: \code{0.90} +} + +Alternatively, if the \code{fixed_cut} method is not used, this value determines +whether any clustering should be performed, because the data may not +contain highly similar features. The default values in this situation are: +\itemize{ +\item \code{mcfadden_r2} and \code{mutual_information}: \code{0.25} +\item \code{cox_snell_r2} and \code{nagelkerke_r2}: \code{0.40} +\item \code{spearman}, \code{kendall} and \code{pearson}: \code{0.70} +} + +The threshold value is converted to a distance (1-similarity) prior to +cutting hierarchical trees.} + +\item{cluster_representation_method}{(\emph{optional}) Method used to determine +how the information of co-clustered features is summarised and used to +represent the cluster. The following methods can be selected: +\itemize{ +\item \code{best_predictor} (default): The feature with the highest importance +according to univariate regression with the outcome is used to represent +the cluster. +\item \code{medioid}: The feature closest to the cluster center, i.e. the feature +that is most similar to the remaining features in the cluster, is used to +represent the cluster. +\item \code{mean}: A meta-feature is generated by averaging the feature values for +all features in a cluster. This method aligns all features so that all +features will be positively correlated prior to averaging. Should a cluster +contain one or more categorical features, the \code{medioid} method will be used +instead, as averaging is not possible. Note that if this method is chosen, +the \code{normalisation_method} parameter should be one of \code{standardisation}, +\code{standardisation_trim}, \code{standardisation_winsor} or \code{quantile}. +} + +If the \code{pam} cluster method is selected, only the \code{medioid} method can be +used. In that case, a single medoid is used by default.} + +\item{parallel_preprocessing}{(\emph{optional}) Enable parallel processing for the +preprocessing workflow. Defaults to \code{TRUE}. When set to \code{FALSE}, this will +disable the use of parallel processing while preprocessing, regardless of +the settings of the \code{parallel} parameter. \code{parallel_preprocessing} is +ignored if \code{parallel=FALSE}.} + +\item{...}{Unused arguments.} +} +\value{ +List of parameters related to preprocessing. +} +\description{ +Internal function for parsing settings related to preprocessing +} +\references{ +\enumerate{ +\item Storey, J. D. A direct approach to false discovery rates. J. +R. Stat. Soc. Series B Stat. Methodol. 64, 479–498 (2002). +\item Shrout, P. E. & Fleiss, J. L. Intraclass correlations: uses in assessing +rater reliability. Psychol. Bull. 86, 420–428 (1979). +\item Koo, T. K. & Li, M. Y.
A guideline of selecting and reporting intraclass +correlation coefficients for reliability research. J. Chiropr. Med. 15, +155–163 (2016). +\item Yeo, I. & Johnson, R. A. A new family of power transformations to +improve normality or symmetry. Biometrika 87, 954–959 (2000). +\item Box, G. E. P. & Cox, D. R. An analysis of transformations. J. R. Stat. +Soc. Series B Stat. Methodol. 26, 211–252 (1964). +\item Raymaekers, J., Rousseeuw, P. J. Transforming variables to central +normality. Mach Learn. (2021). +\item Park, M. Y., Hastie, T. & Tibshirani, R. Averaged gene expressions for +regression. Biostatistics 8, 212–227 (2007). +\item Tolosi, L. & Lengauer, T. Classification with correlated features: +unreliability of feature ranking and solutions. Bioinformatics 27, +1986–1994 (2011). +\item Johnson, W. E., Li, C. & Rabinovic, A. Adjusting batch effects in +microarray expression data using empirical Bayes methods. Biostatistics 8, +118–127 (2007) +\item Kaufman, L. & Rousseeuw, P. J. Finding groups in data: an introduction +to cluster analysis. (John Wiley & Sons, 2009). +\item Muellner, D. fastcluster: fast hierarchical, agglomerative clustering +routines for R and Python. J. Stat. Softw. 53, 1–18 (2013). +\item Rousseeuw, P. J. Silhouettes: A graphical aid to the interpretation and +validation of cluster analysis. J. Comput. Appl. Math. 20, 53–65 (1987). +\item Langfelder, P., Zhang, B. & Horvath, S. Defining clusters from a +hierarchical cluster tree: the Dynamic Tree Cut package for R. +Bioinformatics 24, 719–720 (2008). +\item McFadden, D. Conditional logit analysis of qualitative choice behavior. +in Frontiers in Econometrics (ed. Zarembka, P.) 105–142 (Academic Press, +1974). +\item Cox, D. R. & Snell, E. J. Analysis of binary data. (Chapman and Hall, +1989). +\item Nagelkerke, N. J. D. A note on a general definition of the coefficient +of determination. Biometrika 78, 691–692 (1991). +} +} +\keyword{internal} diff --git a/man/dot-parse_setup_settings.Rd b/man/dot-parse_setup_settings.Rd new file mode 100644 index 00000000..b9ca748e --- /dev/null +++ b/man/dot-parse_setup_settings.Rd @@ -0,0 +1,67 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ParseSettings.R +\name{.parse_setup_settings} +\alias{.parse_setup_settings} +\title{Internal function for parsing settings related to the computational setup} +\usage{ +.parse_setup_settings( + config = NULL, + parallel = waiver(), + parallel_nr_cores = waiver(), + restart_cluster = waiver(), + cluster_type = waiver(), + backend_type = waiver(), + server_port = waiver(), + ... +) +} +\arguments{ +\item{config}{A list of settings, e.g. from an xml file.} + +\item{parallel}{(\emph{optional}) Enable parallel processing. Defaults to \code{TRUE}. +When set to \code{FALSE}, this disables all parallel processing, regardless of +specific parameters such as \code{parallel_preprocessing}. However, when +\code{parallel} is \code{TRUE}, parallel processing of different parts of the +workflow can be disabled by setting respective flags to \code{FALSE}.} + +\item{parallel_nr_cores}{(\emph{optional}) Number of cores available for +parallelisation. Defaults to 2. This setting does nothing if +parallelisation is disabled.} + +\item{restart_cluster}{(\emph{optional}) Restart nodes used for parallel computing +to free up memory prior to starting a parallel process. Note that it does +take time to set up the clusters. Therefore setting this argument to \code{TRUE} +may impact processing speed. 
This argument is ignored if \code{parallel} is +\code{FALSE} or the cluster was initialised outside of familiar. Default is +\code{FALSE}, which causes the clusters to be initialised only once.} + +\item{cluster_type}{(\emph{optional}) Selection of the cluster type for parallel +processing. Available types are the ones supported by the parallel package +that is part of the base R distribution: \code{psock} (default), \code{fork}, \code{mpi}, +\code{nws}, \code{sock}. In addition, \code{none} is available, which also disables +parallel processing.} + +\item{backend_type}{(\emph{optional}) Selection of the backend for distributing +copies of the data. This backend ensures that only a single master copy is +kept in memory. This limits memory usage during parallel processing. + +Several backend options are available, notably \code{socket_server}, and \code{none} +(default). \code{socket_server} is based on the callr package and R sockets, +comes with \code{familiar} and is available for any OS. \code{none} uses the package +environment of familiar to store data, and is available for any OS. +However, \code{none} requires copying of data to any parallel process, and has a +larger memory footprint.} + +\item{server_port}{(\emph{optional}) Integer indicating the port on which the +socket server or RServe process should communicate. Defaults to port 6311. +Note that ports 0 to 1024 and 49152 to 65535 cannot be used.} + +\item{...}{Unused arguments.} +} +\value{ +List of parameters related to the computational setup. +} +\description{ +Internal function for parsing settings related to the computational setup +} +\keyword{internal} diff --git a/man/dot-plot_permutation_variable_importance.Rd b/man/dot-plot_permutation_variable_importance.Rd new file mode 100644 index 00000000..ad8d469c --- /dev/null +++ b/man/dot-plot_permutation_variable_importance.Rd @@ -0,0 +1,83 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/PlotPermutationVariableImportance.R +\name{.plot_permutation_variable_importance} +\alias{.plot_permutation_variable_importance} +\title{Internal plotting function for permutation variable importance plots} +\usage{ +.plot_permutation_variable_importance( + x, + color_by, + facet_by, + facet_wrap_cols, + ggtheme, + discrete_palette, + x_label, + y_label, + legend_label, + plot_title, + plot_sub_title, + caption, + conf_int_style, + conf_int_alpha, + x_range, + x_breaks +) +} +\arguments{ +\item{color_by}{(\emph{optional}) Variables used to determine fill colour of plot +objects. The variables cannot overlap with those provided to the \code{split_by} +argument, but may overlap with other arguments. See details for available +variables.} + +\item{facet_by}{(\emph{optional}) Variables used to determine how and if facets of +each figure appear. In case the \code{facet_wrap_cols} argument is \code{NULL}, the +first variable is used to define columns, and the remaing variables are +used to define rows of facets. The variables cannot overlap with those +provided to the \code{split_by} argument, but may overlap with other arguments. +See details for available variables.} + +\item{facet_wrap_cols}{(\emph{optional}) Number of columns to generate when facet +wrapping. 
If NULL, a facet grid is produced instead.} + +\item{ggtheme}{(\emph{optional}) \code{ggplot} theme to use for plotting.} + +\item{discrete_palette}{(\emph{optional}) Palette used to fill the bars in case a +non-singular variable was provided to the \code{color_by} argument.} + +\item{x_label}{(\emph{optional}) Label to provide to the x-axis. If NULL, no label +is shown.} + +\item{y_label}{(\emph{optional}) Label to provide to the y-axis. If NULL, no label +is shown.} + +\item{legend_label}{(\emph{optional}) Label to provide to the legend. If NULL, the +legend will not have a name.} + +\item{plot_title}{(\emph{optional}) Label to provide as figure title. If NULL, no +title is shown.} + +\item{plot_sub_title}{(\emph{optional}) Label to provide as figure subtitle. If +NULL, no subtitle is shown.} + +\item{caption}{(\emph{optional}) Label to provide as figure caption. If NULL, no +caption is shown.} + +\item{conf_int_style}{(\emph{optional}) Confidence interval style. See details for +allowed styles.} + +\item{conf_int_alpha}{(\emph{optional}) Alpha value to determine transparency of +confidence intervals or, alternatively, other plot elements with which the +confidence interval overlaps. Only values between 0.0 (fully transparent) +and 1.0 (fully opaque) are allowed.} + +\item{x_range}{(\emph{optional}) Value range for the x-axis.} + +\item{x_breaks}{(\emph{optional}) Break points on the x-axis of the plot.} +} +\value{ +ggplot plot object. +} +\description{ +Internal plotting function for permutation variable importance plots +} +\keyword{internal} diff --git a/man/dot-plot_univariate_importance.Rd b/man/dot-plot_univariate_importance.Rd new file mode 100644 index 00000000..2d892b76 --- /dev/null +++ b/man/dot-plot_univariate_importance.Rd @@ -0,0 +1,92 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/PlotUnivariateImportance.R +\name{.plot_univariate_importance} +\alias{.plot_univariate_importance} +\title{Internal plotting function for univariate plots} +\usage{ +.plot_univariate_importance( + x, + color_by, + facet_by, + facet_wrap_cols, + ggtheme, + show_cluster, + discrete_palette, + gradient_palette, + x_label, + y_label, + legend_label, + plot_title, + plot_sub_title, + caption, + x_range, + x_breaks, + significance_level_shown +) +} +\arguments{ +\item{color_by}{(\emph{optional}) Variables used to determine fill colour of plot +objects. The variables cannot overlap with those provided to the \code{split_by} +argument, but may overlap with other arguments. See details for available +variables.} + +\item{facet_by}{(\emph{optional}) Variables used to determine how and if facets of +each figure appear. In case the \code{facet_wrap_cols} argument is \code{NULL}, the +first variable is used to define columns, and the remaing variables are +used to define rows of facets. The variables cannot overlap with those +provided to the \code{split_by} argument, but may overlap with other arguments. +See details for available variables.} + +\item{facet_wrap_cols}{(\emph{optional}) Number of columns to generate when facet +wrapping. 
If NULL, a facet grid is produced instead.} + +\item{ggtheme}{(\emph{optional}) \code{ggplot} theme to use for plotting.} + +\item{show_cluster}{(\emph{optional}) Show which features were clustered together.} + +\item{discrete_palette}{(\emph{optional}) Palette used to fill the bars in case a +non-singular variable was provided to the \code{color_by} argument.} + +\item{gradient_palette}{(\emph{optional}) Palette to use for filling the bars in +case the \code{color_by} argument is not set. The bars are then coloured +according to their importance. By default, no gradient is used, and the bars +are not filled according to importance. Use \code{NULL} to fill the bars using +the default palette in \code{familiar}.} + +\item{x_label}{(\emph{optional}) Label to provide to the x-axis. If NULL, no label +is shown.} + +\item{y_label}{(\emph{optional}) Label to provide to the y-axis. If NULL, no label +is shown.} + +\item{legend_label}{(\emph{optional}) Label to provide to the legend. If NULL, the +legend will not have a name.} + +\item{plot_title}{(\emph{optional}) Label to provide as figure title. If NULL, no +title is shown.} + +\item{plot_sub_title}{(\emph{optional}) Label to provide as figure subtitle. If +NULL, no subtitle is shown.} + +\item{caption}{(\emph{optional}) Label to provide as figure caption. If NULL, no +caption is shown.} + +\item{x_range}{(\emph{optional}) Value range for the x-axis.} + +\item{x_breaks}{(\emph{optional}) Break points on the x-axis of the plot.} + +\item{significance_level_shown}{Position(s) to draw vertical lines indicating +a significance level, e.g. 0.05. Can be NULL to not draw anything.} +} +\value{ +ggplot plot object. +} +\description{ +Internal plotting function for univariate plots +} +\seealso{ +\itemize{ +\item \code{\link{plot_univariate_importance}} for the user interface. +} +} +\keyword{internal} diff --git a/man/dot-prepare_familiar_data_sets.Rd b/man/dot-prepare_familiar_data_sets.Rd new file mode 100644 index 00000000..8b64b572 --- /dev/null +++ b/man/dot-prepare_familiar_data_sets.Rd @@ -0,0 +1,38 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Evaluation.R +\name{.prepare_familiar_data_sets} +\alias{.prepare_familiar_data_sets} +\title{Prepare familiarData objects for evaluation at runtime.} +\usage{ +.prepare_familiar_data_sets( + cl = NULL, + only_pooling = FALSE, + message_indent = 0L, + verbose = FALSE +) +} +\arguments{ +\item{cl}{Cluster for parallel processing.} + +\item{only_pooling}{Flag that, if set, forces evaluation of only the +top-level data, and not e.g. ensembles.} + +\item{message_indent}{indent that messages should have.} + +\item{verbose}{Sets verbosity} +} +\value{ +A data.table with created links to created data objects. +} +\description{ +Information concerning models, features and the experiment is +processed and stored in familiarData objects. Information can be extracted +from these objects as csv files, or by plotting, or multiple objects can be +combined into familiarCollection objects, which allows aggregated exports. +} +\details{ +This function generates the names of familiarData object files, and +their corresponding generating ensemble, which allows the familiarData +objects to be created. 
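The description above notes that familiarData objects can be combined into a familiarCollection object for aggregated export or plotting. A minimal sketch of that user-facing workflow follows. The file names are hypothetical; it assumes that paths to stored familiarData objects are accepted, as the export documentation below suggests, and the \code{object}/\code{dir_path} argument convention used throughout these export and plot methods is assumed for \code{plot_model_performance}.

```r
# Minimal sketch: combine familiarData objects into a familiarCollection.
# File names are hypothetical.
library(familiar)

data_files <- c(
  "validation_cohort_1_data.RDS",
  "validation_cohort_2_data.RDS"
)

collection <- as_familiar_collection(
  data_files,
  collection_name = "pooled_validation"
)

# Aggregated results can then be plotted or exported from the collection,
# e.g. model performance figures written to a directory.
plot_model_performance(object = collection, dir_path = "./figures")
```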
+} +\keyword{internal} diff --git a/man/dot-update_experimental_design_settings.Rd b/man/dot-update_experimental_design_settings.Rd new file mode 100644 index 00000000..41a1c21c --- /dev/null +++ b/man/dot-update_experimental_design_settings.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/DataParameterChecks.R +\name{.update_experimental_design_settings} +\alias{.update_experimental_design_settings} +\title{Internal function to check batch assignment to development and validation} +\usage{ +.update_experimental_design_settings(section_table, data, settings) +} +\arguments{ +\item{section_table}{data.table generated by the \code{extract_experimental_setup} +function. Contains information regarding the experiment.} + +\item{data}{Data set as loaded using the \code{.load_data} function.} + +\item{settings}{List of parameter settings for data set parsing and setting +up the experiment.} +} +\value{ +A verified and updated list of parameter settings. +} +\description{ +This function checks which batches in the data set are assigned to model +development and external validation. Several errors may be raised if there +are inconsistencies such as an overlapping assignment, name mismatches etc. +} +\keyword{internal} diff --git a/man/dot-update_initial_settings.Rd b/man/dot-update_initial_settings.Rd new file mode 100644 index 00000000..fa22e3a4 --- /dev/null +++ b/man/dot-update_initial_settings.Rd @@ -0,0 +1,40 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/DataParameterChecks.R +\name{.update_initial_settings} +\alias{.update_initial_settings} +\title{Internal check and update of settings related to data set parsing} +\usage{ +.update_initial_settings( + formula = NULL, + data, + settings, + check_stringency = "strict" +) +} +\arguments{ +\item{formula}{User-provided formula, may be absent (\code{NULL}).} + +\item{data}{Data set as loaded using the \code{.load_data} function.} + +\item{settings}{List of parameter settings for data set parsing.} + +\item{check_stringency}{Specifies stringency of various checks. This is mostly: +\itemize{ +\item \code{strict}: default value used for \code{summon_familiar}. Thoroughly checks +input data. Used internally for checking development data. +\item \code{external_warn}: value used for \code{extract_data} and related methods. Less +stringent checks, but will warn for possible issues. Used internally for +checking data for evaluation and explanation. +\item \code{external}: value used for external methods such as \code{predict}. Less +stringent checks, particularly for identifier and outcome columns, which may +be completely absent. Used internally for \code{predict}. +}} +} +\value{ +A verified and updated list of parameter settings. +} +\description{ +This function updates and checks parameters related to data set parsing based +on the available data set. +} +\keyword{internal} diff --git a/man/encapsulate_path.Rd b/man/encapsulate_path.Rd new file mode 100644 index 00000000..f718fad9 --- /dev/null +++ b/man/encapsulate_path.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Utilities.R +\name{encapsulate_path} +\alias{encapsulate_path} +\title{Encapsulate path} +\usage{ +encapsulate_path(path) +} +\value{ +encapsulated_path object +} +\description{ +This function is used to encapsulate paths to allow for behaviour switches. +One use is for example when plotting. 
The plot_all method will encapsulate a +path so that plots may be saved to a directory structure. Other plot methods, +e.g. plot_model_performance, do not encapsulate a path, and if the user calls +these functions directly, the plot may be written to the provided path +instead of a directory structure. +} +\keyword{internal} diff --git a/man/experimentData-class.Rd b/man/experimentData-class.Rd new file mode 100644 index 00000000..91f25f6c --- /dev/null +++ b/man/experimentData-class.Rd @@ -0,0 +1,44 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FamiliarS4Classes.R +\docType{class} +\name{experimentData-class} +\alias{experimentData-class} +\title{Experiment data} +\description{ +An experimentData object contains information concerning the experiment. +These objects can be used to instantiate multiple experiments using the same +iterations, feature information and variable importance. +} +\details{ +experimentData objects are primarily used to improve +reproducibility, since these allow for training models on a shared +foundation. +} +\section{Slots}{ + +\describe{ +\item{\code{experiment_setup}}{Contains information regarding the experimental setup that is used +to generate the iteration list.} + +\item{\code{iteration_list}}{List of iteration data that determines which instances +are assigned to training, validation and test sets.} + +\item{\code{feature_info}}{Feature information objects. Only available if the +experimentData object was generated using the \code{precompute_feature_info} or +\code{precompute_vimp} functions.} + +\item{\code{vimp_table_list}}{List of variable importance table objects. Only +available if the experimentData object was created using the +\code{precompute_vimp} function.} + +\item{\code{project_id}}{Identifier of the project that generated the experimentData +object.} + +\item{\code{familiar_version}}{Version of the familiar package used to create this +experimentData.} +}} + +\seealso{ +\code{\link{precompute_data_assignment}} +\code{\link{precompute_feature_info}}, \code{\link{precompute_vimp}} +} diff --git a/man/export_all-methods.Rd b/man/export_all-methods.Rd new file mode 100644 index 00000000..fdf1cda2 --- /dev/null +++ b/man/export_all-methods.Rd @@ -0,0 +1,313 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FamiliarCollectionExport.R +\name{export_all} +\alias{export_all} +\alias{export_all,familiarCollection-method} +\alias{export_all,ANY-method} +\title{Extract and export all data.} +\usage{ +export_all(object, dir_path = NULL, aggregate_results = waiver(), ...) + +\S4method{export_all}{familiarCollection}(object, dir_path = NULL, aggregate_results = waiver(), ...) + +\S4method{export_all}{ANY}(object, dir_path = NULL, aggregate_results = waiver(), ...) +} +\arguments{ +\item{object}{A \code{familiarCollection} object, or other objects from which +a \code{familiarCollection} can be extracted. See details for more information.} + +\item{dir_path}{Path to folder where extracted data should be saved.
\code{NULL} +will allow export as a structured list of data.tables.} + +\item{aggregate_results}{Flag that signifies whether results should be +aggregated for export.} + +\item{...}{ + Arguments passed on to \code{\link[=extract_data]{extract_data}}, \code{\link[=as_familiar_collection]{as_familiar_collection}} + \describe{ + \item{\code{data}}{A \code{dataObject} object, \code{data.table} or \code{data.frame} that +constitutes the data that are assessed.} + \item{\code{is_pre_processed}}{Flag that indicates whether the data was already +pre-processed externally, e.g. normalised and clustered. Only used if the +\code{data} argument is a \code{data.table} or \code{data.frame}.} + \item{\code{cl}}{Cluster created using the \code{parallel} package. This cluster is then +used to speed up computation through parallellisation.} + \item{\code{time_max}}{Time point which is used as the benchmark for e.g. cumulative +risks generated by random forest, or the cut-off value for Uno's concordance +index. If not provided explicitly, this parameter is read from settings used +at creation of the underlying \code{familiarModel} objects. Only used for +\code{survival} outcomes.} + \item{\code{evaluation_times}}{One or more time points that are used for in analysis of +survival problems when data has to be assessed at a set time, e.g. +calibration. If not provided explicitly, this parameter is read from +settings used at creation of the underlying \code{familiarModel} objects. Only +used for \code{survival} outcomes.} + \item{\code{aggregation_method}}{Method for aggregating variable importances for the +purpose of evaluation. Variable importances are determined during feature +selection steps and after training the model. Both types are evaluated, but +feature selection variable importance is only evaluated at run-time. + +See the documentation for the \code{vimp_aggregation_method} argument in +\code{summon_familiar} for information concerning the different available +methods. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + \item{\code{rank_threshold}}{The threshold used to define the subset of highly +important features during evaluation. + +See the documentation for the \code{vimp_aggregation_rank_threshold} argument in +\code{summon_familiar} for more information. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + \item{\code{ensemble_method}}{Method for ensembling predictions from models for the +same sample. Available methods are: +\itemize{ +\item \code{median} (default): Use the median of the predicted values as the ensemble +value for a sample. +\item \code{mean}: Use the mean of the predicted values as the ensemble value for a +sample. +}} + \item{\code{metric}}{One or more metrics for assessing model performance. See the +vignette on performance metrics for the available metrics. If not provided +explicitly, this parameter is read from settings used at creation of the +underlying \code{familiarModel} objects.} + \item{\code{feature_cluster_method}}{The method used to perform clustering. These are +the same methods as for the \code{cluster_method} configuration parameter: +\code{none}, \code{hclust}, \code{agnes}, \code{diana} and \code{pam}. + +\code{none} cannot be used when extracting data regarding mutual correlation or +feature expressions. 
+ +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + \item{\code{feature_linkage_method}}{The method used for agglomerative clustering in +\code{hclust} and \code{agnes}. These are the same methods as for the +\code{cluster_linkage_method} configuration parameter: \code{average}, \code{single}, +\code{complete}, \code{weighted}, and \code{ward}. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + \item{\code{feature_cluster_cut_method}}{The method used to divide features into +separate clusters. The available methods are the same as for the +\code{cluster_cut_method} configuration parameter: \code{silhouette}, \code{fixed_cut} and +\code{dynamic_cut}. + +\code{silhouette} is available for all cluster methods, but \code{fixed_cut} only +applies to methods that create hierarchical trees (\code{hclust}, \code{agnes} and +\code{diana}). \code{dynamic_cut} requires the \code{dynamicTreeCut} package and can only +be used with \code{agnes} and \code{hclust}. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + \item{\code{feature_similarity_threshold}}{The threshold level for pair-wise +similarity that is required to form feature clusters with the \code{fixed_cut} +method. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + \item{\code{feature_similarity_metric}}{Metric to determine pairwise similarity +between features. Similarity is computed in the same manner as for +clustering, and \code{feature_similarity_metric} therefore has the same options +as \code{cluster_similarity_metric}: \code{mcfadden_r2}, \code{cox_snell_r2}, +\code{nagelkerke_r2}, \code{spearman}, \code{kendall} and \code{pearson}. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + \item{\code{sample_cluster_method}}{The method used to perform clustering based on +distance between samples. These are the same methods as for the +\code{cluster_method} configuration parameter: \code{hclust}, \code{agnes}, \code{diana} and +\code{pam}. + +\code{none} cannot be used when extracting data for feature expressions. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + \item{\code{sample_linkage_method}}{The method used for agglomerative clustering in +\code{hclust} and \code{agnes}. These are the same methods as for the +\code{cluster_linkage_method} configuration parameter: \code{average}, \code{single}, +\code{complete}, \code{weighted}, and \code{ward}. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + \item{\code{sample_similarity_metric}}{Metric to determine pairwise similarity +between samples. Similarity is computed in the same manner as for +clustering, but \code{sample_similarity_metric} has different options that are +better suited to computing distance between samples instead of between +features: \code{gower}, \code{euclidean}. + +The underlying feature data is scaled to the \eqn{[0, 1]} range (for +numerical features) using the feature values across the samples. 
The +normalisation parameters required can optionally be computed from feature +data with the outer 5\% (on both sides) of feature values trimmed or +winsorised. To do so append \verb{_trim} (trimming) or \verb{_winsor} (winsorising) to +the metric name. This reduces the effect of outliers somewhat. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + \item{\code{icc_type}}{String indicating the type of intraclass correlation +coefficient (\code{1}, \code{2} or \code{3}) that should be used to compute robustness for +features in repeated measurements during the evaluation of univariate +importance. These types correspond to the types in Shrout and Fleiss (1979). +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + \item{\code{verbose}}{Flag to indicate whether feedback should be provided on the +computation and extraction of various data elements.} + \item{\code{message_indent}}{Number of indentation steps for messages shown during +computation and extraction of various data elements.} + \item{\code{data_element}}{String indicating which data elements are to be extracted. +Default is \code{all}, but specific elements can be specified to speed up +computations if not all elements are to be computed. This is an internal +parameter that is set by, e.g. the \code{export_model_vimp} method.} + \item{\code{sample_limit}}{(\emph{optional}) Set the upper limit of the number of samples +that are used during evaluation steps. Cannot be less than 20. + +This setting can be specified per data element by providing a parameter +value in a named list with data elements, e.g. +\code{list("sample_similarity"=100, "permutation_vimp"=1000)}. + +This parameter can be set for the following data elements: +\code{sample_similarity} and \code{ice_data}.} + \item{\code{detail_level}}{(\emph{optional}) Sets the level at which results are computed +and aggregated. +\itemize{ +\item \code{ensemble}: Results are computed at the ensemble level, i.e. over all +models in the ensemble. This means that, for example, bias-corrected +estimates of model performance are assessed by creating (at least) 20 +bootstraps and computing the model performance of the ensemble model for +each bootstrap. +\item \code{hybrid} (default): Results are computed at the level of models in an +ensemble. This means that, for example, bias-corrected estimates of model +performance are directly computed using the models in the ensemble. If there +are at least 20 trained models in the ensemble, performance is computed for +each model, in contrast to \code{ensemble} where performance is computed for the +ensemble of models. If there are less than 20 trained models in the +ensemble, bootstraps are created so that at least 20 point estimates can be +made. +\item \code{model}: Results are computed at the model level. This means that, for +example, bias-corrected estimates of model performance are assessed by +creating (at least) 20 bootstraps and computing the performance of the model +for each bootstrap. +} + +Note that each level of detail has a different interpretation for bootstrap +confidence intervals. For \code{ensemble} and \code{model} these are the confidence +intervals for the ensemble and an individual model, respectively. 
That is, +the confidence interval describes the range where an estimate produced by a +respective ensemble or model trained on a repeat of the experiment may be +found with the probability of the confidence level. For \code{hybrid}, it +represents the range where any single model trained on a repeat of the +experiment may be found with the probability of the confidence level. By +definition, confidence intervals obtained using \code{hybrid} are at least as +wide as those for \code{ensemble}. \code{hybrid} offers the correct interpretation if +the goal of the analysis is to assess the result of a single, unspecified +model. + +\code{hybrid} is generally computationally less expensive than \code{ensemble}, which +in turn is somewhat less expensive than \code{model}. + +A non-default \code{detail_level} parameter can be specified for separate +evaluation steps by providing a parameter value in a named list with data +elements, e.g. \code{list("auc_data"="ensemble", "model_performance"="hybrid")}. +This parameter can be set for the following data elements: \code{auc_data}, +\code{decision_curve_analyis}, \code{model_performance}, \code{permutation_vimp}, +\code{ice_data}, \code{prediction_data} and \code{confusion_matrix}.} + \item{\code{estimation_type}}{(\emph{optional}) Sets the type of estimation that should be +possible. This has the following options: +\itemize{ +\item \code{point}: Point estimates. +\item \code{bias_correction} or \code{bc}: Bias-corrected estimates. A bias-corrected +estimate is computed from (at least) 20 point estimates, and \code{familiar} may +bootstrap the data to create them. +\item \code{bootstrap_confidence_interval} or \code{bci} (default): Bias-corrected +estimates with bootstrap confidence intervals (Efron and Hastie, 2016). The +number of point estimates required depends on the \code{confidence_level} +parameter, and \code{familiar} may bootstrap the data to create them. +} + +As with \code{detail_level}, a non-default \code{estimation_type} parameter can be +specified for separate evaluation steps by providing a parameter value in a +named list with data elements, e.g. \code{list("auc_data"="bci", "model_performance"="point")}. This parameter can be set for the following +data elements: \code{auc_data}, \code{decision_curve_analyis}, \code{model_performance}, +\code{permutation_vimp}, \code{ice_data}, and \code{prediction_data}.} + \item{\code{confidence_level}}{(\emph{optional}) Numeric value for the level at which +confidence intervals are determined. In case bootstrap estimation is used to +determine the confidence intervals, \code{familiar} uses the +rule of thumb \eqn{n = 20 / ci.level} to determine the number of required +bootstraps. + +The default value is \code{0.95}.} + \item{\code{bootstrap_ci_method}}{(\emph{optional}) Method used to determine bootstrap +confidence intervals (Efron and Hastie, 2016). The following methods are +implemented: +\itemize{ +\item \code{percentile} (default): Confidence intervals obtained using the percentile +method. +\item \code{bc}: Bias-corrected confidence intervals. +} + +Note that the standard method is not implemented because this method is +often not suitable due to non-normal distributions. The bias-corrected and +accelerated (BCa) method is not implemented yet.} + \item{\code{stratification_method}}{(\emph{optional}) Method for determining the +stratification threshold for creating survival groups.
The actual, +model-dependent, threshold value is obtained from the development data, and +can afterwards be used to perform stratification on validation data. + +The following stratification methods are available: +\itemize{ +\item \code{median} (default): The median predicted value in the development cohort +is used to stratify the samples into two risk groups. For predicted outcome +values that build a continuous spectrum, the two risk groups in the +development cohort will be roughly equal in size. +\item \code{mean}: The mean predicted value in the development cohort is used to +stratify the samples into two risk groups. +\item \code{mean_trim}: As \code{mean}, but based on the set of predicted values +where the 5\% lowest and 5\% highest values are discarded. This reduces the +effect of outliers. +\item \code{mean_winsor}: As \code{mean}, but based on the set of predicted values where +the 5\% lowest and 5\% highest values are winsorised. This reduces the effect +of outliers. +\item \code{fixed}: Samples are stratified based on the sample quantiles of the +predicted values. These quantiles are defined using the +\code{stratification_threshold} parameter. +\item \code{optimised}: Use maximally selected rank statistics to determine the +optimal threshold (Lausen and Schumacher, 1992; Hothorn et al., 2003) to +stratify samples into two optimally separated risk groups. +} + +One or more stratification methods can be selected simultaneously. + +This parameter is only relevant for \code{survival} outcomes.} + \item{\code{dynamic_model_loading}}{(\emph{optional}) Enables dynamic loading of models +during the evaluation process, if \code{TRUE}. Defaults to \code{FALSE}. Dynamic +loading of models may reduce the overall memory footprint, at the cost of +increased disk or network IO. Models can only be dynamically loaded if they +are found at an accessible disk or network location. Setting this parameter +to \code{TRUE} may help if parallel processing causes out-of-memory issues during +evaluation.} + \item{\code{familiar_data_names}}{Names of the dataset(s). Only used if the \code{object} parameter +is one or more \code{familiarData} objects.} + \item{\code{collection_name}}{Name of the collection.} + }} +} +\value{ +A list of data.tables (if \code{dir_path} is not provided), or nothing, as +all data is exported to \code{csv} files. +} +\description{ +Extract and export all data from a familiarCollection. +} +\details{ +Data, such as model performance and calibration information, is +usually collected from a \code{familiarCollection} object. However, you can also +provide one or more \code{familiarData} objects, that will be internally +converted to a \code{familiarCollection} object. It is also possible to provide a +\code{familiarEnsemble} or one or more \code{familiarModel} objects together with the +data from which data is computed prior to export. Paths to the previous +files can also be provided. + +All parameters aside from \code{object} and \code{dir_path} are only used if \code{object} +is not a \code{familiarCollection} object, or a path to one. 
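As an illustrative sketch that is not part of the generated documentation: the named-list forms of \code{detail_level}, \code{estimation_type} and \code{sample_limit} described above could be passed through the exporting function as follows. The \code{export_all} function name, the collection path and the output directory are assumptions made for this example only.

\preformatted{
library(familiar)

# Hypothetical path to a previously saved familiarCollection object.
collection <- readRDS("results/familiar_collection.RDS")

# Compute AUC data at the ensemble level with bootstrap confidence
# intervals, compute model performance as point estimates at the hybrid
# level, and limit sample similarity computations to 100 samples.
export_all(
  object = collection,
  dir_path = "results/export",
  detail_level = list("auc_data" = "ensemble", "model_performance" = "hybrid"),
  estimation_type = list("auc_data" = "bci", "model_performance" = "point"),
  sample_limit = list("sample_similarity" = 100)
)
}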
+} diff --git a/man/export_auc_data-methods.Rd b/man/export_auc_data-methods.Rd new file mode 100644 index 00000000..b5ee5b43 --- /dev/null +++ b/man/export_auc_data-methods.Rd @@ -0,0 +1,176 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FamiliarDataComputationAUCCurves.R +\name{export_auc_data} +\alias{export_auc_data} +\alias{export_auc_data,familiarCollection-method} +\alias{export_auc_data,ANY-method} +\title{Extract and export ROC and Precision-Recall curves.} +\usage{ +export_auc_data( + object, + dir_path = NULL, + aggregate_results = TRUE, + export_collection = FALSE, + ... +) + +\S4method{export_auc_data}{familiarCollection}( + object, + dir_path = NULL, + aggregate_results = TRUE, + export_collection = FALSE, + ... +) + +\S4method{export_auc_data}{ANY}( + object, + dir_path = NULL, + aggregate_results = TRUE, + export_collection = FALSE, + ... +) +} +\arguments{ +\item{object}{A \code{familiarCollection} object, or other other objects from which +a \code{familiarCollection} can be extracted. See details for more information.} + +\item{dir_path}{Path to folder where extracted data should be saved. \code{NULL} +will allow export as a structured list of data.tables.} + +\item{aggregate_results}{Flag that signifies whether results should be +aggregated for export.} + +\item{export_collection}{(\emph{optional}) Exports the collection if TRUE.} + +\item{...}{ + Arguments passed on to \code{\link[=extract_auc_data]{extract_auc_data}}, \code{\link[=as_familiar_collection]{as_familiar_collection}} + \describe{ + \item{\code{data}}{A \code{dataObject} object, \code{data.table} or \code{data.frame} that +constitutes the data that are assessed.} + \item{\code{is_pre_processed}}{Flag that indicates whether the data was already +pre-processed externally, e.g. normalised and clustered. Only used if the +\code{data} argument is a \code{data.table} or \code{data.frame}.} + \item{\code{cl}}{Cluster created using the \code{parallel} package. This cluster is then +used to speed up computation through parallellisation.} + \item{\code{ensemble_method}}{Method for ensembling predictions from models for the +same sample. Available methods are: +\itemize{ +\item \code{median} (default): Use the median of the predicted values as the ensemble +value for a sample. +\item \code{mean}: Use the mean of the predicted values as the ensemble value for a +sample. +}} + \item{\code{verbose}}{Flag to indicate whether feedback should be provided on the +computation and extraction of various data elements.} + \item{\code{message_indent}}{Number of indentation steps for messages shown during +computation and extraction of various data elements.} + \item{\code{detail_level}}{(\emph{optional}) Sets the level at which results are computed +and aggregated. +\itemize{ +\item \code{ensemble}: Results are computed at the ensemble level, i.e. over all +models in the ensemble. This means that, for example, bias-corrected +estimates of model performance are assessed by creating (at least) 20 +bootstraps and computing the model performance of the ensemble model for +each bootstrap. +\item \code{hybrid} (default): Results are computed at the level of models in an +ensemble. This means that, for example, bias-corrected estimates of model +performance are directly computed using the models in the ensemble. If there +are at least 20 trained models in the ensemble, performance is computed for +each model, in contrast to \code{ensemble} where performance is computed for the +ensemble of models. 
If there are less than 20 trained models in the +ensemble, bootstraps are created so that at least 20 point estimates can be +made. +\item \code{model}: Results are computed at the model level. This means that, for +example, bias-corrected estimates of model performance are assessed by +creating (at least) 20 bootstraps and computing the performance of the model +for each bootstrap. +} + +Note that each level of detail has a different interpretation for bootstrap +confidence intervals. For \code{ensemble} and \code{model} these are the confidence +intervals for the ensemble and an individual model, respectively. That is, +the confidence interval describes the range where an estimate produced by a +respective ensemble or model trained on a repeat of the experiment may be +found with the probability of the confidence level. For \code{hybrid}, it +represents the range where any single model trained on a repeat of the +experiment may be found with the probability of the confidence level. By +definition, confidence intervals obtained using \code{hybrid} are at least as +wide as those for \code{ensemble}. \code{hybrid} offers the correct interpretation if +the goal of the analysis is to assess the result of a single, unspecified, +model. + +\code{hybrid} is generally computationally less expensive then \code{ensemble}, which +in turn is somewhat less expensive than \code{model}. + +A non-default \code{detail_level} parameter can be specified for separate +evaluation steps by providing a parameter value in a named list with data +elements, e.g. \code{list("auc_data"="ensemble", "model_performance"="hybrid")}. +This parameter can be set for the following data elements: \code{auc_data}, +\code{decision_curve_analyis}, \code{model_performance}, \code{permutation_vimp}, +\code{ice_data}, \code{prediction_data} and \code{confusion_matrix}.} + \item{\code{estimation_type}}{(\emph{optional}) Sets the type of estimation that should be +possible. This has the following options: +\itemize{ +\item \code{point}: Point estimates. +\item \code{bias_correction} or \code{bc}: Bias-corrected estimates. A bias-corrected +estimate is computed from (at least) 20 point estimates, and \code{familiar} may +bootstrap the data to create them. +\item \code{bootstrap_confidence_interval} or \code{bci} (default): Bias-corrected +estimates with bootstrap confidence intervals (Efron and Hastie, 2016). The +number of point estimates required depends on the \code{confidence_level} +parameter, and \code{familiar} may bootstrap the data to create them. +} + +As with \code{detail_level}, a non-default \code{estimation_type} parameter can be +specified for separate evaluation steps by providing a parameter value in a +named list with data elements, e.g. \code{list("auc_data"="bci", "model_performance"="point")}. This parameter can be set for the following +data elements: \code{auc_data}, \code{decision_curve_analyis}, \code{model_performance}, +\code{permutation_vimp}, \code{ice_data}, and \code{prediction_data}.} + \item{\code{confidence_level}}{(\emph{optional}) Numeric value for the level at which +confidence intervals are determined. In the case bootstraps are used to +determine the confidence intervals bootstrap estimation, \code{familiar} uses the +rule of thumb \eqn{n = 20 / ci.level} to determine the number of required +bootstraps. + +The default value is \code{0.95}.} + \item{\code{bootstrap_ci_method}}{(\emph{optional}) Method used to determine bootstrap +confidence intervals (Efron and Hastie, 2016). 
The following methods are +implemented: +\itemize{ +\item \code{percentile} (default): Confidence intervals obtained using the percentile +method. +\item \code{bc}: Bias-corrected confidence intervals. +} + +Note that the standard method is not implemented because this method is +often not suitable due to non-normal distributions. The bias-corrected and +accelerated (BCa) method is not implemented yet.} + \item{\code{familiar_data_names}}{Names of the dataset(s). Only used if the \code{object} parameter +is one or more \code{familiarData} objects.} + \item{\code{collection_name}}{Name of the collection.} + }} +} +\value{ +A list of data.tables (if \code{dir_path} is not provided), or nothing, as +all data is exported to \code{csv} files. +} +\description{ +Extract and export ROC and Precision-Recall curves for models in +a familiarCollection. +} +\details{ +Data is usually collected from a \code{familiarCollection} object. +However, you can also provide one or more \code{familiarData} objects, that will +be internally converted to a \code{familiarCollection} object. It is also +possible to provide a \code{familiarEnsemble} or one or more \code{familiarModel} +objects together with the data from which data is computed prior to export. +Paths to the previous files can also be provided. + +All parameters aside from \code{object} and \code{dir_path} are only used if \code{object} +is not a \code{familiarCollection} object, or a path to one. + +ROC curve data are exported for individual and ensemble models. For ensemble +models, a credibility interval for the ROC curve is determined using +bootstrapping for each metric. In case of multinomial outcomes, ROC-curves +are computed for each class, using a one-against-all approach. +} diff --git a/man/export_calibration_data-methods.Rd b/man/export_calibration_data-methods.Rd new file mode 100644 index 00000000..0670cd08 --- /dev/null +++ b/man/export_calibration_data-methods.Rd @@ -0,0 +1,183 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FamiliarDataComputationCalibrationData.R +\name{export_calibration_data} +\alias{export_calibration_data} +\alias{export_calibration_data,familiarCollection-method} +\alias{export_calibration_data,ANY-method} +\title{Extract and export calibration and goodness-of-fit tests.} +\usage{ +export_calibration_data( + object, + dir_path = NULL, + aggregate_results = TRUE, + export_collection = FALSE, + ... +) + +\S4method{export_calibration_data}{familiarCollection}( + object, + dir_path = NULL, + aggregate_results = TRUE, + export_collection = FALSE, + ... +) + +\S4method{export_calibration_data}{ANY}( + object, + dir_path = NULL, + aggregate_results = TRUE, + export_collection = FALSE, + ... +) +} +\arguments{ +\item{object}{A \code{familiarCollection} object, or other other objects from which +a \code{familiarCollection} can be extracted. See details for more information.} + +\item{dir_path}{Path to folder where extracted data should be saved. 
\code{NULL} +will allow export as a structured list of data.tables.} + +\item{aggregate_results}{Flag that signifies whether results should be +aggregated for export.} + +\item{export_collection}{(\emph{optional}) Exports the collection if TRUE.} + +\item{...}{ + Arguments passed on to \code{\link[=extract_calibration_data]{extract_calibration_data}}, \code{\link[=as_familiar_collection]{as_familiar_collection}} + \describe{ + \item{\code{data}}{A \code{dataObject} object, \code{data.table} or \code{data.frame} that +constitutes the data that are assessed.} + \item{\code{is_pre_processed}}{Flag that indicates whether the data was already +pre-processed externally, e.g. normalised and clustered. Only used if the +\code{data} argument is a \code{data.table} or \code{data.frame}.} + \item{\code{cl}}{Cluster created using the \code{parallel} package. This cluster is then +used to speed up computation through parallellisation.} + \item{\code{evaluation_times}}{One or more time points that are used for in analysis of +survival problems when data has to be assessed at a set time, e.g. +calibration. If not provided explicitly, this parameter is read from +settings used at creation of the underlying \code{familiarModel} objects. Only +used for \code{survival} outcomes.} + \item{\code{ensemble_method}}{Method for ensembling predictions from models for the +same sample. Available methods are: +\itemize{ +\item \code{median} (default): Use the median of the predicted values as the ensemble +value for a sample. +\item \code{mean}: Use the mean of the predicted values as the ensemble value for a +sample. +}} + \item{\code{verbose}}{Flag to indicate whether feedback should be provided on the +computation and extraction of various data elements.} + \item{\code{message_indent}}{Number of indentation steps for messages shown during +computation and extraction of various data elements.} + \item{\code{detail_level}}{(\emph{optional}) Sets the level at which results are computed +and aggregated. +\itemize{ +\item \code{ensemble}: Results are computed at the ensemble level, i.e. over all +models in the ensemble. This means that, for example, bias-corrected +estimates of model performance are assessed by creating (at least) 20 +bootstraps and computing the model performance of the ensemble model for +each bootstrap. +\item \code{hybrid} (default): Results are computed at the level of models in an +ensemble. This means that, for example, bias-corrected estimates of model +performance are directly computed using the models in the ensemble. If there +are at least 20 trained models in the ensemble, performance is computed for +each model, in contrast to \code{ensemble} where performance is computed for the +ensemble of models. If there are less than 20 trained models in the +ensemble, bootstraps are created so that at least 20 point estimates can be +made. +\item \code{model}: Results are computed at the model level. This means that, for +example, bias-corrected estimates of model performance are assessed by +creating (at least) 20 bootstraps and computing the performance of the model +for each bootstrap. +} + +Note that each level of detail has a different interpretation for bootstrap +confidence intervals. For \code{ensemble} and \code{model} these are the confidence +intervals for the ensemble and an individual model, respectively. 
That is, +the confidence interval describes the range where an estimate produced by a +respective ensemble or model trained on a repeat of the experiment may be +found with the probability of the confidence level. For \code{hybrid}, it +represents the range where any single model trained on a repeat of the +experiment may be found with the probability of the confidence level. By +definition, confidence intervals obtained using \code{hybrid} are at least as +wide as those for \code{ensemble}. \code{hybrid} offers the correct interpretation if +the goal of the analysis is to assess the result of a single, unspecified, +model. + +\code{hybrid} is generally computationally less expensive then \code{ensemble}, which +in turn is somewhat less expensive than \code{model}. + +A non-default \code{detail_level} parameter can be specified for separate +evaluation steps by providing a parameter value in a named list with data +elements, e.g. \code{list("auc_data"="ensemble", "model_performance"="hybrid")}. +This parameter can be set for the following data elements: \code{auc_data}, +\code{decision_curve_analyis}, \code{model_performance}, \code{permutation_vimp}, +\code{ice_data}, \code{prediction_data} and \code{confusion_matrix}.} + \item{\code{estimation_type}}{(\emph{optional}) Sets the type of estimation that should be +possible. This has the following options: +\itemize{ +\item \code{point}: Point estimates. +\item \code{bias_correction} or \code{bc}: Bias-corrected estimates. A bias-corrected +estimate is computed from (at least) 20 point estimates, and \code{familiar} may +bootstrap the data to create them. +\item \code{bootstrap_confidence_interval} or \code{bci} (default): Bias-corrected +estimates with bootstrap confidence intervals (Efron and Hastie, 2016). The +number of point estimates required depends on the \code{confidence_level} +parameter, and \code{familiar} may bootstrap the data to create them. +} + +As with \code{detail_level}, a non-default \code{estimation_type} parameter can be +specified for separate evaluation steps by providing a parameter value in a +named list with data elements, e.g. \code{list("auc_data"="bci", "model_performance"="point")}. This parameter can be set for the following +data elements: \code{auc_data}, \code{decision_curve_analyis}, \code{model_performance}, +\code{permutation_vimp}, \code{ice_data}, and \code{prediction_data}.} + \item{\code{confidence_level}}{(\emph{optional}) Numeric value for the level at which +confidence intervals are determined. In the case bootstraps are used to +determine the confidence intervals bootstrap estimation, \code{familiar} uses the +rule of thumb \eqn{n = 20 / ci.level} to determine the number of required +bootstraps. + +The default value is \code{0.95}.} + \item{\code{bootstrap_ci_method}}{(\emph{optional}) Method used to determine bootstrap +confidence intervals (Efron and Hastie, 2016). The following methods are +implemented: +\itemize{ +\item \code{percentile} (default): Confidence intervals obtained using the percentile +method. +\item \code{bc}: Bias-corrected confidence intervals. +} + +Note that the standard method is not implemented because this method is +often not suitable due to non-normal distributions. The bias-corrected and +accelerated (BCa) method is not implemented yet.} + \item{\code{familiar_data_names}}{Names of the dataset(s). 
Only used if the \code{object} parameter +is one or more \code{familiarData} objects.} + \item{\code{collection_name}}{Name of the collection.} + }} +} +\value{ +A list of data.tables (if \code{dir_path} is not provided), or nothing, as +all data is exported to \code{csv} files. +} +\description{ +Extract and export calibration and goodness-of-fit tests for data +in a familiarCollection. +} +\details{ +Data is usually collected from a \code{familiarCollection} object. +However, you can also provide one or more \code{familiarData} objects, that will +be internally converted to a \code{familiarCollection} object. It is also +possible to provide a \code{familiarEnsemble} or one or more \code{familiarModel} +objects together with the data from which data is computed prior to export. +Paths to the previous files can also be provided. + +All parameters aside from \code{object} and \code{dir_path} are only used if \code{object} +is not a \code{familiarCollection} object, or a path to one. + +Calibration tests are performed based on expected (predicted) and observed +outcomes. For all outcomes, calibration-at-the-large and calibration slopes +are determined. Furthermore, for all but survival outcomes, a repeated, +randomised grouping Hosmer-Lemeshow test is performed. For survival +outcomes, the Nam-D'Agostino and Greenwood-Nam-D'Agostino tests are +performed. +} diff --git a/man/export_calibration_info-methods.Rd b/man/export_calibration_info-methods.Rd new file mode 100644 index 00000000..bf61e533 --- /dev/null +++ b/man/export_calibration_info-methods.Rd @@ -0,0 +1,75 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FamiliarDataComputationCalibrationInfo.R +\name{export_calibration_info} +\alias{export_calibration_info} +\alias{export_calibration_info,familiarCollection-method} +\alias{export_calibration_info,ANY-method} +\title{Extract and export calibration information.} +\usage{ +export_calibration_info( + object, + dir_path = NULL, + aggregate_results = TRUE, + export_collection = FALSE, + ... +) + +\S4method{export_calibration_info}{familiarCollection}( + object, + dir_path = NULL, + aggregate_results = TRUE, + export_collection = FALSE, + ... +) + +\S4method{export_calibration_info}{ANY}( + object, + dir_path = NULL, + aggregate_results = TRUE, + export_collection = FALSE, + ... +) +} +\arguments{ +\item{object}{A \code{familiarCollection} object, or other other objects from which +a \code{familiarCollection} can be extracted. See details for more information.} + +\item{dir_path}{Path to folder where extracted data should be saved. \code{NULL} +will allow export as a structured list of data.tables.} + +\item{aggregate_results}{Flag that signifies whether results should be +aggregated for export.} + +\item{export_collection}{(\emph{optional}) Exports the collection if TRUE.} + +\item{...}{ + Arguments passed on to \code{\link[=as_familiar_collection]{as_familiar_collection}} + \describe{ + \item{\code{familiar_data_names}}{Names of the dataset(s). Only used if the \code{object} parameter +is one or more \code{familiarData} objects.} + \item{\code{collection_name}}{Name of the collection.} + }} +} +\value{ +A data.table (if \code{dir_path} is not provided), or nothing, as all data +is exported to \code{csv} files. +} +\description{ +Extract and export calibration information (e.g. baseline +survival) for data in a familiarCollection. +} +\details{ +Data is usually collected from a \code{familiarCollection} object. 
+However, you can also provide one or more \code{familiarData} objects, that will +be internally converted to a \code{familiarCollection} object. It is also +possible to provide a \code{familiarEnsemble} or one or more \code{familiarModel} +objects together with the data from which data is computed prior to export. +Paths to the previous files can also be provided. + +All parameters aside from \code{object} and \code{dir_path} are only used if \code{object} +is not a \code{familiarCollection} object, or a path to one. + +Currently only baseline survival is exported as supporting calibration +information. See \code{export_calibration_data} for export of direct assessment +of calibration, including calibration and goodness-of-fit tests. +} diff --git a/man/export_confusion_matrix_data-methods.Rd b/man/export_confusion_matrix_data-methods.Rd new file mode 100644 index 00000000..3d99a50c --- /dev/null +++ b/man/export_confusion_matrix_data-methods.Rd @@ -0,0 +1,130 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FamiliarDataComputationConfusionMatrix.R +\name{export_confusion_matrix_data} +\alias{export_confusion_matrix_data} +\alias{export_confusion_matrix_data,familiarCollection-method} +\alias{export_confusion_matrix_data,ANY-method} +\title{Extract and export confusion matrices.} +\usage{ +export_confusion_matrix_data( + object, + dir_path = NULL, + export_collection = FALSE, + ... +) + +\S4method{export_confusion_matrix_data}{familiarCollection}( + object, + dir_path = NULL, + export_collection = FALSE, + ... +) + +\S4method{export_confusion_matrix_data}{ANY}( + object, + dir_path = NULL, + export_collection = FALSE, + ... +) +} +\arguments{ +\item{object}{A \code{familiarCollection} object, or other other objects from which +a \code{familiarCollection} can be extracted. See details for more information.} + +\item{dir_path}{Path to folder where extracted data should be saved. \code{NULL} +will allow export as a structured list of data.tables.} + +\item{export_collection}{(\emph{optional}) Exports the collection if TRUE.} + +\item{...}{ + Arguments passed on to \code{\link[=extract_confusion_matrix]{extract_confusion_matrix}}, \code{\link[=as_familiar_collection]{as_familiar_collection}} + \describe{ + \item{\code{data}}{A \code{dataObject} object, \code{data.table} or \code{data.frame} that +constitutes the data that are assessed.} + \item{\code{is_pre_processed}}{Flag that indicates whether the data was already +pre-processed externally, e.g. normalised and clustered. Only used if the +\code{data} argument is a \code{data.table} or \code{data.frame}.} + \item{\code{cl}}{Cluster created using the \code{parallel} package. This cluster is then +used to speed up computation through parallellisation.} + \item{\code{ensemble_method}}{Method for ensembling predictions from models for the +same sample. Available methods are: +\itemize{ +\item \code{median} (default): Use the median of the predicted values as the ensemble +value for a sample. +\item \code{mean}: Use the mean of the predicted values as the ensemble value for a +sample. +}} + \item{\code{verbose}}{Flag to indicate whether feedback should be provided on the +computation and extraction of various data elements.} + \item{\code{message_indent}}{Number of indentation steps for messages shown during +computation and extraction of various data elements.} + \item{\code{detail_level}}{(\emph{optional}) Sets the level at which results are computed +and aggregated. 
+\itemize{ +\item \code{ensemble}: Results are computed at the ensemble level, i.e. over all +models in the ensemble. This means that, for example, bias-corrected +estimates of model performance are assessed by creating (at least) 20 +bootstraps and computing the model performance of the ensemble model for +each bootstrap. +\item \code{hybrid} (default): Results are computed at the level of models in an +ensemble. This means that, for example, bias-corrected estimates of model +performance are directly computed using the models in the ensemble. If there +are at least 20 trained models in the ensemble, performance is computed for +each model, in contrast to \code{ensemble} where performance is computed for the +ensemble of models. If there are less than 20 trained models in the +ensemble, bootstraps are created so that at least 20 point estimates can be +made. +\item \code{model}: Results are computed at the model level. This means that, for +example, bias-corrected estimates of model performance are assessed by +creating (at least) 20 bootstraps and computing the performance of the model +for each bootstrap. +} + +Note that each level of detail has a different interpretation for bootstrap +confidence intervals. For \code{ensemble} and \code{model} these are the confidence +intervals for the ensemble and an individual model, respectively. That is, +the confidence interval describes the range where an estimate produced by a +respective ensemble or model trained on a repeat of the experiment may be +found with the probability of the confidence level. For \code{hybrid}, it +represents the range where any single model trained on a repeat of the +experiment may be found with the probability of the confidence level. By +definition, confidence intervals obtained using \code{hybrid} are at least as +wide as those for \code{ensemble}. \code{hybrid} offers the correct interpretation if +the goal of the analysis is to assess the result of a single, unspecified, +model. + +\code{hybrid} is generally computationally less expensive than \code{ensemble}, which +in turn is somewhat less expensive than \code{model}. + +A non-default \code{detail_level} parameter can be specified for separate +evaluation steps by providing a parameter value in a named list with data +elements, e.g. \code{list("auc_data"="ensemble", "model_performance"="hybrid")}. +This parameter can be set for the following data elements: \code{auc_data}, +\code{decision_curve_analyis}, \code{model_performance}, \code{permutation_vimp}, +\code{ice_data}, \code{prediction_data} and \code{confusion_matrix}.} + \item{\code{familiar_data_names}}{Names of the dataset(s). Only used if the \code{object} parameter +is one or more \code{familiarData} objects.} + \item{\code{collection_name}}{Name of the collection.} + }} +} +\value{ +A list of data.tables (if \code{dir_path} is not provided), or nothing, as +all data is exported to \code{csv} files. +} +\description{ +Extract and export confusion matrices for models in a +familiarCollection. +} +\details{ +Data is usually collected from a \code{familiarCollection} object. +However, you can also provide one or more \code{familiarData} objects, that will +be internally converted to a \code{familiarCollection} object. It is also +possible to provide a \code{familiarEnsemble} or one or more \code{familiarModel} +objects together with the data from which data is computed prior to export. +Paths to the previous files can also be provided.
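For instance, a minimal usage sketch (not part of the generated documentation), assuming that \code{collection} is an existing \code{familiarCollection} object and that the output directory is hypothetical:

\preformatted{
library(familiar)

# Write confusion matrix data as csv files to the given directory.
export_confusion_matrix_data(
  object = collection,
  dir_path = "results/confusion_matrix"
)

# Alternatively, omit dir_path to keep the confusion matrices in memory
# as a list of data.tables.
confusion_matrices <- export_confusion_matrix_data(object = collection)
}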
+ +All parameters aside from \code{object} and \code{dir_path} are only used if \code{object} +is not a \code{familiarCollection} object, or a path to one. + +Confusion matrices are exported for individual and ensemble models. +} diff --git a/man/export_decision_curve_analysis_data-methods.Rd b/man/export_decision_curve_analysis_data-methods.Rd new file mode 100644 index 00000000..073521ae --- /dev/null +++ b/man/export_decision_curve_analysis_data-methods.Rd @@ -0,0 +1,69 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FamiliarDataComputationDecisionCurveAnalysis.R +\name{export_decision_curve_analysis_data} +\alias{export_decision_curve_analysis_data} +\alias{export_decision_curve_analysis_data,familiarCollection-method} +\alias{export_decision_curve_analysis_data,ANY-method} +\title{Extract and export decision curve analysis data.} +\usage{ +export_decision_curve_analysis_data( + object, + dir_path = NULL, + aggregate_results = TRUE, + ... +) + +\S4method{export_decision_curve_analysis_data}{familiarCollection}( + object, + dir_path = NULL, + aggregate_results = TRUE, + ... +) + +\S4method{export_decision_curve_analysis_data}{ANY}( + object, + dir_path = NULL, + aggregate_results = TRUE, + ... +) +} +\arguments{ +\item{object}{A \code{familiarCollection} object, or other objects from which +a \code{familiarCollection} can be extracted. See details for more information.} + +\item{dir_path}{Path to folder where extracted data should be saved. \code{NULL} +will allow export as a structured list of data.tables.} + +\item{aggregate_results}{Flag that signifies whether results should be +aggregated for export.} + +\item{...}{ + Arguments passed on to \code{\link[=as_familiar_collection]{as_familiar_collection}} + \describe{ + \item{\code{familiar_data_names}}{Names of the dataset(s). Only used if the \code{object} parameter +is one or more \code{familiarData} objects.} + \item{\code{collection_name}}{Name of the collection.} + }} +} +\value{ +A list of data.tables (if \code{dir_path} is not provided), or nothing, as +all data is exported to \code{csv} files. +} +\description{ +Extract and export decision curve analysis data in a +familiarCollection. +} +\details{ +Data is usually collected from a \code{familiarCollection} object. +However, you can also provide one or more \code{familiarData} objects, that will +be internally converted to a \code{familiarCollection} object. It is also +possible to provide a \code{familiarEnsemble} or one or more \code{familiarModel} +objects together with the data from which data is computed prior to export. +Paths to the previous files can also be provided. + +All parameters aside from \code{object} and \code{dir_path} are only used if \code{object} +is not a \code{familiarCollection} object, or a path to one. + +Decision curve analysis data is computed for categorical outcomes, i.e. +binomial and multinomial, as well as survival outcomes.
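A minimal usage sketch (not part of the generated documentation), assuming that \code{collection} is a \code{familiarCollection} object from an experiment with a binomial, multinomial or survival outcome, and that the output directory is hypothetical:

\preformatted{
library(familiar)

# Return decision curve analysis data as a list of data.tables.
dca_data <- export_decision_curve_analysis_data(
  object = collection,
  aggregate_results = TRUE
)

# Or write the same data as csv files to a directory.
export_decision_curve_analysis_data(
  object = collection,
  dir_path = "results/decision_curve_analysis"
)
}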
+} diff --git a/man/export_feature_expressions-methods.Rd b/man/export_feature_expressions-methods.Rd new file mode 100644 index 00000000..be54ce98 --- /dev/null +++ b/man/export_feature_expressions-methods.Rd @@ -0,0 +1,146 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FamiliarDataComputationFeatureExpression.R +\name{export_feature_expressions} +\alias{export_feature_expressions} +\alias{export_feature_expressions,familiarCollection-method} +\alias{export_feature_expressions,ANY-method} +\title{Extract and export feature expressions.} +\usage{ +export_feature_expressions( + object, + dir_path = NULL, + evaluation_time = waiver(), + export_collection = FALSE, + ... +) + +\S4method{export_feature_expressions}{familiarCollection}( + object, + dir_path = NULL, + evaluation_time = waiver(), + export_collection = FALSE, + ... +) + +\S4method{export_feature_expressions}{ANY}( + object, + dir_path = NULL, + evaluation_time = waiver(), + export_collection = FALSE, + ... +) +} +\arguments{ +\item{object}{A \code{familiarCollection} object, or other objects from which +a \code{familiarCollection} can be extracted. See details for more information.} + +\item{dir_path}{Path to folder where extracted data should be saved. \code{NULL} +will allow export as a structured list of data.tables.} + +\item{evaluation_time}{One or more time points that are used to create the +outcome columns in expression plots. If not provided explicitly, this +parameter is read from settings used at creation of the underlying +\code{familiarData} objects. Only used for \code{survival} outcomes.} + +\item{export_collection}{(\emph{optional}) Exports the collection if TRUE.} + +\item{...}{ + Arguments passed on to \code{\link[=extract_feature_expression]{extract_feature_expression}}, \code{\link[=as_familiar_collection]{as_familiar_collection}} + \describe{ + \item{\code{feature_similarity}}{Table containing pairwise distance between +samples. This is used to determine cluster information, and to indicate which +samples are similar. The table is created by the +\code{extract_sample_similarity} method.} + \item{\code{data}}{A \code{dataObject} object, \code{data.table} or \code{data.frame} that +constitutes the data that are assessed.} + \item{\code{evaluation_times}}{One or more time points that are used in the analysis of +survival problems when data has to be assessed at a set time, e.g. +calibration. If not provided explicitly, this parameter is read from +settings used at creation of the underlying \code{familiarModel} objects. Only +used for \code{survival} outcomes.} + \item{\code{feature_cluster_method}}{The method used to perform clustering. These are +the same methods as for the \code{cluster_method} configuration parameter: +\code{none}, \code{hclust}, \code{agnes}, \code{diana} and \code{pam}. + +\code{none} cannot be used when extracting data regarding mutual correlation or +feature expressions. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + \item{\code{feature_linkage_method}}{The method used for agglomerative clustering in +\code{hclust} and \code{agnes}. These are the same methods as for the +\code{cluster_linkage_method} configuration parameter: \code{average}, \code{single}, +\code{complete}, \code{weighted}, and \code{ward}.
+ +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + \item{\code{feature_similarity_metric}}{Metric to determine pairwise similarity +between features. Similarity is computed in the same manner as for +clustering, and \code{feature_similarity_metric} therefore has the same options +as \code{cluster_similarity_metric}: \code{mcfadden_r2}, \code{cox_snell_r2}, +\code{nagelkerke_r2}, \code{spearman}, \code{kendall} and \code{pearson}. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + \item{\code{sample_cluster_method}}{The method used to perform clustering based on +distance between samples. These are the same methods as for the +\code{cluster_method} configuration parameter: \code{hclust}, \code{agnes}, \code{diana} and +\code{pam}. + +\code{none} cannot be used when extracting data for feature expressions. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + \item{\code{sample_linkage_method}}{The method used for agglomerative clustering in +\code{hclust} and \code{agnes}. These are the same methods as for the +\code{cluster_linkage_method} configuration parameter: \code{average}, \code{single}, +\code{complete}, \code{weighted}, and \code{ward}. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + \item{\code{sample_similarity_metric}}{Metric to determine pairwise similarity +between samples. Similarity is computed in the same manner as for +clustering, but \code{sample_similarity_metric} has different options that are +better suited to computing distance between samples instead of between +features: \code{gower}, \code{euclidean}. + +The underlying feature data is scaled to the \eqn{[0, 1]} range (for +numerical features) using the feature values across the samples. The +normalisation parameters required can optionally be computed from feature +data with the outer 5\% (on both sides) of feature values trimmed or +winsorised. To do so append \verb{_trim} (trimming) or \verb{_winsor} (winsorising) to +the metric name. This reduces the effect of outliers somewhat. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + \item{\code{verbose}}{Flag to indicate whether feedback should be provided on the +computation and extraction of various data elements.} + \item{\code{message_indent}}{Number of indentation steps for messages shown during +computation and extraction of various data elements.} + \item{\code{familiar_data_names}}{Names of the dataset(s). Only used if the \code{object} parameter +is one or more \code{familiarData} objects.} + \item{\code{collection_name}}{Name of the collection.} + }} +} +\value{ +A data.table (if \code{dir_path} is not provided), or nothing, as all data +is exported to \code{csv} files. +} +\description{ +Extract and export feature expressions for the features in a +familiarCollection. +} +\details{ +Data is usually collected from a \code{familiarCollection} object. +However, you can also provide one or more \code{familiarData} objects, that will +be internally converted to a \code{familiarCollection} object. 
It is also +possible to provide a \code{familiarEnsemble} or one or more \code{familiarModel} +objects together with the data from which data is computed prior to export. +Paths to the previous files can also be provided. + +All parameters aside from \code{object} and \code{dir_path} are only used if \code{object} +is not a \code{familiarCollection} object, or a path to one. + +Feature expressions are computed by standardising each feature, i.e. sample +mean is 0 and standard deviation is 1. +} diff --git a/man/export_feature_similarity-methods.Rd b/man/export_feature_similarity-methods.Rd new file mode 100644 index 00000000..74a2086b --- /dev/null +++ b/man/export_feature_similarity-methods.Rd @@ -0,0 +1,137 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FamiliarDataComputationFeatureSimilarity.R +\name{export_feature_similarity} +\alias{export_feature_similarity} +\alias{export_feature_similarity,familiarCollection-method} +\alias{export_feature_similarity,ANY-method} +\title{Extract and export mutual correlation between features.} +\usage{ +export_feature_similarity( + object, + dir_path = NULL, + aggregate_results = TRUE, + feature_cluster_method = waiver(), + feature_linkage_method = waiver(), + feature_cluster_cut_method = waiver(), + feature_similarity_threshold = waiver(), + export_dendrogram = FALSE, + export_ordered_data = FALSE, + export_clustering = FALSE, + export_collection = FALSE, + ... +) + +\S4method{export_feature_similarity}{familiarCollection}( + object, + dir_path = NULL, + aggregate_results = TRUE, + feature_cluster_method = waiver(), + feature_linkage_method = waiver(), + feature_cluster_cut_method = waiver(), + feature_similarity_threshold = waiver(), + export_dendrogram = FALSE, + export_ordered_data = FALSE, + export_clustering = FALSE, + export_collection = FALSE, + ... +) + +\S4method{export_feature_similarity}{ANY}( + object, + dir_path = NULL, + aggregate_results = TRUE, + feature_cluster_method = waiver(), + feature_linkage_method = waiver(), + feature_cluster_cut_method = waiver(), + feature_similarity_threshold = waiver(), + export_dendrogram = FALSE, + export_ordered_data = FALSE, + export_clustering = FALSE, + export_collection = FALSE, + ... +) +} +\arguments{ +\item{object}{A \code{familiarCollection} object, or other other objects from which +a \code{familiarCollection} can be extracted. See details for more information.} + +\item{dir_path}{Path to folder where extracted data should be saved. \code{NULL} +will allow export as a structured list of data.tables.} + +\item{aggregate_results}{Flag that signifies whether results should be +aggregated for export.} + +\item{feature_cluster_method}{The method used to perform clustering. These are +the same methods as for the \code{cluster_method} configuration parameter: +\code{none}, \code{hclust}, \code{agnes}, \code{diana} and \code{pam}. + +\code{none} cannot be used when extracting data regarding mutual correlation or +feature expressions. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + +\item{feature_linkage_method}{The method used for agglomerative clustering in +\code{hclust} and \code{agnes}. These are the same methods as for the +\code{cluster_linkage_method} configuration parameter: \code{average}, \code{single}, +\code{complete}, \code{weighted}, and \code{ward}. 
+ +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + +\item{feature_cluster_cut_method}{The method used to divide features into +separate clusters. The available methods are the same as for the +\code{cluster_cut_method} configuration parameter: \code{silhouette}, \code{fixed_cut} and +\code{dynamic_cut}. + +\code{silhouette} is available for all cluster methods, but \code{fixed_cut} only +applies to methods that create hierarchical trees (\code{hclust}, \code{agnes} and +\code{diana}). \code{dynamic_cut} requires the \code{dynamicTreeCut} package and can only +be used with \code{agnes} and \code{hclust}. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + +\item{feature_similarity_threshold}{The threshold level for pair-wise +similarity that is required to form feature clusters with the \code{fixed_cut} +method. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + +\item{export_dendrogram}{Add dendrogram in the data element objects.} + +\item{export_ordered_data}{Add feature label ordering to data in the data +element objects.} + +\item{export_clustering}{Add clustering information to data.} + +\item{export_collection}{(\emph{optional}) Exports the collection if TRUE.} + +\item{...}{ + Arguments passed on to \code{\link[=as_familiar_collection]{as_familiar_collection}} + \describe{ + \item{\code{familiar_data_names}}{Names of the dataset(s). Only used if the \code{object} parameter +is one or more \code{familiarData} objects.} + \item{\code{collection_name}}{Name of the collection.} + }} +} +\value{ +A list containing a data.table (if \code{dir_path} is not provided), or +nothing, as all data is exported to \code{csv} files. +} +\description{ +Extract and export mutual correlation between features in a +familiarCollection. +} +\details{ +Data is usually collected from a \code{familiarCollection} object. +However, you can also provide one or more \code{familiarData} objects, that will +be internally converted to a \code{familiarCollection} object. It is also +possible to provide a \code{familiarEnsemble} or one or more \code{familiarModel} +objects together with the data from which data is computed prior to export. +Paths to the previous files can also be provided. + +All parameters aside from \code{object} and \code{dir_path} are only used if \code{object} +is not a \code{familiarCollection} object, or a path to one. +} diff --git a/man/export_fs_vimp-methods.Rd b/man/export_fs_vimp-methods.Rd new file mode 100644 index 00000000..03e43e90 --- /dev/null +++ b/man/export_fs_vimp-methods.Rd @@ -0,0 +1,128 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FamiliarDataComputationVimp.R +\name{export_fs_vimp} +\alias{export_fs_vimp} +\alias{export_fs_vimp,familiarCollection-method} +\alias{export_fs_vimp,ANY-method} +\title{Extract and export feature selection variable importance.} +\usage{ +export_fs_vimp( + object, + dir_path = NULL, + aggregate_results = TRUE, + aggregation_method = waiver(), + rank_threshold = waiver(), + export_collection = FALSE, + ... +) + +\S4method{export_fs_vimp}{familiarCollection}( + object, + dir_path = NULL, + aggregate_results = TRUE, + aggregation_method = waiver(), + rank_threshold = waiver(), + export_collection = FALSE, + ... 
+) + +\S4method{export_fs_vimp}{ANY}( + object, + dir_path = NULL, + aggregate_results = TRUE, + aggregation_method = waiver(), + rank_threshold = waiver(), + export_collection = FALSE, + ... +) +} +\arguments{ +\item{object}{A \code{familiarCollection} object, or other other objects from which +a \code{familiarCollection} can be extracted. See details for more information.} + +\item{dir_path}{Path to folder where extracted data should be saved. \code{NULL} +will allow export as a structured list of data.tables.} + +\item{aggregate_results}{Flag that signifies whether results should be +aggregated for export.} + +\item{aggregation_method}{(\emph{optional}) The method used to aggregate variable +importances over different data subsets, e.g. bootstraps. The following +methods can be selected: +\itemize{ +\item \code{mean} (default): Use the mean rank of a feature over the subsets to +determine the aggregated feature rank. +\item \code{median}: Use the median rank of a feature over the subsets to determine +the aggregated feature rank. +\item \code{best}: Use the best rank the feature obtained in any subset to determine +the aggregated feature rank. +\item \code{worst}: Use the worst rank the feature obtained in any subset to +determine the aggregated feature rank. +\item \code{stability}: Use the frequency of the feature being in the subset of +highly ranked features as measure for the aggregated feature rank +(Meinshausen and Buehlmann, 2010). +\item \code{exponential}: Use a rank-weighted frequence of occurrence in the subset +of highly ranked features as measure for the aggregated feature rank (Haury +et al., 2011). +\item \code{borda}: Use the borda count as measure for the aggregated feature rank +(Wald et al., 2012). +\item \code{enhanced_borda}: Use an occurrence frequency-weighted borda count as +measure for the aggregated feature rank (Wald et al., 2012). +\item \code{truncated_borda}: Use borda count computed only on features within the +subset of highly ranked features. +\item \code{enhanced_truncated_borda}: Apply both the enhanced borda method and the +truncated borda method and use the resulting borda count as the aggregated +feature rank. +}} + +\item{rank_threshold}{(\emph{optional}) The threshold used to define the subset of +highly important features. If not set, this threshold is determined by +maximising the variance in the occurrence value over all features over the +subset size. + +This parameter is only relevant for \code{stability}, \code{exponential}, +\code{enhanced_borda}, \code{truncated_borda} and \code{enhanced_truncated_borda} methods.} + +\item{export_collection}{(\emph{optional}) Exports the collection if TRUE.} + +\item{...}{ + Arguments passed on to \code{\link[=as_familiar_collection]{as_familiar_collection}} + \describe{ + \item{\code{familiar_data_names}}{Names of the dataset(s). Only used if the \code{object} parameter +is one or more \code{familiarData} objects.} + \item{\code{collection_name}}{Name of the collection.} + }} +} +\value{ +A data.table (if \code{dir_path} is not provided), or nothing, as all data +is exported to \code{csv} files. +} +\description{ +Extract and export feature selection variable importance from a +familiarCollection. +} +\details{ +Data, such as model performance and calibration information, is +usually collected from a \code{familiarCollection} object. However, you can also +provide one or more \code{familiarData} objects, that will be internally +converted to a \code{familiarCollection} object. 
Paths to the previous files can +also be provided. + +Unlike other export function, export using \code{familiarEnsemble} or +\code{familiarModel} objects is not possible. This is because feature selection +variable importance is not stored within \code{familiarModel} objects. + +All parameters aside from \code{object} and \code{dir_path} are only used if \code{object} +is not a \code{familiarCollection} object, or a path to one. + +Variable importance is based on the ranking produced by feature selection +routines. In case feature selection was performed repeatedly, e.g. using +bootstraps, feature ranks are first aggregated using the method defined by +the \code{aggregation_method}, some of which require a \code{rank_threshold} to +indicate a subset of most important features. + +Information concerning highly similar features that form clusters is +provided as well. This information is based on consensus clustering of the +features. This clustering information is also used during aggregation to +ensure that co-clustered features are only taken into account once. +} diff --git a/man/export_hyperparameters-methods.Rd b/man/export_hyperparameters-methods.Rd new file mode 100644 index 00000000..0393c7ce --- /dev/null +++ b/man/export_hyperparameters-methods.Rd @@ -0,0 +1,77 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FamiliarDataComputationHyperparameters.R +\name{export_hyperparameters} +\alias{export_hyperparameters} +\alias{export_hyperparameters,familiarCollection-method} +\alias{export_hyperparameters,ANY-method} +\title{Extract and export model hyperparameters.} +\usage{ +export_hyperparameters( + object, + dir_path = NULL, + aggregate_results = TRUE, + export_collection = FALSE, + ... +) + +\S4method{export_hyperparameters}{familiarCollection}( + object, + dir_path = NULL, + aggregate_results = TRUE, + export_collection = FALSE, + ... +) + +\S4method{export_hyperparameters}{ANY}( + object, + dir_path = NULL, + aggregate_results = TRUE, + export_collection = FALSE, + ... +) +} +\arguments{ +\item{object}{A \code{familiarCollection} object, or other other objects from which +a \code{familiarCollection} can be extracted. See details for more information.} + +\item{dir_path}{Path to folder where extracted data should be saved. \code{NULL} +will allow export as a structured list of data.tables.} + +\item{aggregate_results}{Flag that signifies whether results should be +aggregated for export.} + +\item{export_collection}{(\emph{optional}) Exports the collection if TRUE.} + +\item{...}{ + Arguments passed on to \code{\link[=as_familiar_collection]{as_familiar_collection}} + \describe{ + \item{\code{familiar_data_names}}{Names of the dataset(s). Only used if the \code{object} parameter +is one or more \code{familiarData} objects.} + \item{\code{collection_name}}{Name of the collection.} + }} +} +\value{ +A data.table (if \code{dir_path} is not provided), or nothing, as all data +is exported to \code{csv} files. In case of the latter, hyperparameters are +summarised. +} +\description{ +Extract and export model hyperparameters from models in a +familiarCollection. +} +\details{ +Data, such as model performance and calibration information, is +usually collected from a \code{familiarCollection} object. However, you can also +provide one or more \code{familiarData} objects, that will be internally +converted to a \code{familiarCollection} object. 
It is also possible to provide a +\code{familiarEnsemble} or one or more \code{familiarModel} objects together with the +data from which data is computed prior to export. Paths to the previous +files can also be provided. + +All parameters aside from \code{object} and \code{dir_path} are only used if \code{object} +is not a \code{familiarCollection} object, or a path to one. + +Many model hyperparameters are optimised using sequential model-based +optimisation. The extracted hyperparameters are those that were selected to +construct the underlying models (\code{familiarModel} objects). +} diff --git a/man/export_ice_data-methods.Rd b/man/export_ice_data-methods.Rd new file mode 100644 index 00000000..8d5a1829 --- /dev/null +++ b/man/export_ice_data-methods.Rd @@ -0,0 +1,196 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FamiliarDataComputationICE.R +\name{export_ice_data} +\alias{export_ice_data} +\alias{export_ice_data,familiarCollection-method} +\alias{export_ice_data,ANY-method} +\title{Extract and export individual conditional expectation data.} +\usage{ +export_ice_data( + object, + dir_path = NULL, + aggregate_results = TRUE, + export_collection = FALSE, + ... +) + +\S4method{export_ice_data}{familiarCollection}( + object, + dir_path = NULL, + aggregate_results = TRUE, + export_collection = FALSE, + ... +) + +\S4method{export_ice_data}{ANY}( + object, + dir_path = NULL, + aggregate_results = TRUE, + export_collection = FALSE, + ... +) +} +\arguments{ +\item{object}{A \code{familiarCollection} object, or other other objects from which +a \code{familiarCollection} can be extracted. See details for more information.} + +\item{dir_path}{Path to folder where extracted data should be saved. \code{NULL} +will allow export as a structured list of data.tables.} + +\item{aggregate_results}{Flag that signifies whether results should be +aggregated for export.} + +\item{export_collection}{(\emph{optional}) Exports the collection if TRUE.} + +\item{...}{ + Arguments passed on to \code{\link[=extract_ice]{extract_ice}}, \code{\link[=as_familiar_collection]{as_familiar_collection}} + \describe{ + \item{\code{features}}{Names of the feature or features (2) assessed simultaneously. +By default \code{NULL}, which means that all features are assessed one-by-one.} + \item{\code{feature_x_range}}{When one or two features are defined using \code{features}, +\code{feature_x_range} can be used to set the range of values for the first +feature. For numeric features, a vector of two values is assumed to indicate +a range from which \code{n_sample_points} are uniformly sampled. A vector of more +than two values is interpreted as is, i.e. these represent the values to be +sampled. For categorical features, values should represent a (sub)set of +available levels.} + \item{\code{feature_y_range}}{As \code{feature_x_range}, but for the second feature in +case two features are defined.} + \item{\code{n_sample_points}}{Number of points used to sample continuous features.} + \item{\code{data}}{A \code{dataObject} object, \code{data.table} or \code{data.frame} that +constitutes the data that are assessed.} + \item{\code{is_pre_processed}}{Flag that indicates whether the data was already +pre-processed externally, e.g. normalised and clustered. Only used if the +\code{data} argument is a \code{data.table} or \code{data.frame}.} + \item{\code{cl}}{Cluster created using the \code{parallel} package. 
This cluster is then +used to speed up computation through parallellisation.} + \item{\code{evaluation_times}}{One or more time points that are used for in analysis of +survival problems when data has to be assessed at a set time, e.g. +calibration. If not provided explicitly, this parameter is read from +settings used at creation of the underlying \code{familiarModel} objects. Only +used for \code{survival} outcomes.} + \item{\code{ensemble_method}}{Method for ensembling predictions from models for the +same sample. Available methods are: +\itemize{ +\item \code{median} (default): Use the median of the predicted values as the ensemble +value for a sample. +\item \code{mean}: Use the mean of the predicted values as the ensemble value for a +sample. +}} + \item{\code{verbose}}{Flag to indicate whether feedback should be provided on the +computation and extraction of various data elements.} + \item{\code{message_indent}}{Number of indentation steps for messages shown during +computation and extraction of various data elements.} + \item{\code{sample_limit}}{(\emph{optional}) Set the upper limit of the number of samples +that are used during evaluation steps. Cannot be less than 20. + +This setting can be specified per data element by providing a parameter +value in a named list with data elements, e.g. +\code{list("sample_similarity"=100, "permutation_vimp"=1000)}. + +This parameter can be set for the following data elements: +\code{sample_similarity} and \code{ice_data}.} + \item{\code{detail_level}}{(\emph{optional}) Sets the level at which results are computed +and aggregated. +\itemize{ +\item \code{ensemble}: Results are computed at the ensemble level, i.e. over all +models in the ensemble. This means that, for example, bias-corrected +estimates of model performance are assessed by creating (at least) 20 +bootstraps and computing the model performance of the ensemble model for +each bootstrap. +\item \code{hybrid} (default): Results are computed at the level of models in an +ensemble. This means that, for example, bias-corrected estimates of model +performance are directly computed using the models in the ensemble. If there +are at least 20 trained models in the ensemble, performance is computed for +each model, in contrast to \code{ensemble} where performance is computed for the +ensemble of models. If there are less than 20 trained models in the +ensemble, bootstraps are created so that at least 20 point estimates can be +made. +\item \code{model}: Results are computed at the model level. This means that, for +example, bias-corrected estimates of model performance are assessed by +creating (at least) 20 bootstraps and computing the performance of the model +for each bootstrap. +} + +Note that each level of detail has a different interpretation for bootstrap +confidence intervals. For \code{ensemble} and \code{model} these are the confidence +intervals for the ensemble and an individual model, respectively. That is, +the confidence interval describes the range where an estimate produced by a +respective ensemble or model trained on a repeat of the experiment may be +found with the probability of the confidence level. For \code{hybrid}, it +represents the range where any single model trained on a repeat of the +experiment may be found with the probability of the confidence level. By +definition, confidence intervals obtained using \code{hybrid} are at least as +wide as those for \code{ensemble}. 
\code{hybrid} offers the correct interpretation if +the goal of the analysis is to assess the result of a single, unspecified, +model. + +\code{hybrid} is generally computationally less expensive then \code{ensemble}, which +in turn is somewhat less expensive than \code{model}. + +A non-default \code{detail_level} parameter can be specified for separate +evaluation steps by providing a parameter value in a named list with data +elements, e.g. \code{list("auc_data"="ensemble", "model_performance"="hybrid")}. +This parameter can be set for the following data elements: \code{auc_data}, +\code{decision_curve_analyis}, \code{model_performance}, \code{permutation_vimp}, +\code{ice_data}, \code{prediction_data} and \code{confusion_matrix}.} + \item{\code{estimation_type}}{(\emph{optional}) Sets the type of estimation that should be +possible. This has the following options: +\itemize{ +\item \code{point}: Point estimates. +\item \code{bias_correction} or \code{bc}: Bias-corrected estimates. A bias-corrected +estimate is computed from (at least) 20 point estimates, and \code{familiar} may +bootstrap the data to create them. +\item \code{bootstrap_confidence_interval} or \code{bci} (default): Bias-corrected +estimates with bootstrap confidence intervals (Efron and Hastie, 2016). The +number of point estimates required depends on the \code{confidence_level} +parameter, and \code{familiar} may bootstrap the data to create them. +} + +As with \code{detail_level}, a non-default \code{estimation_type} parameter can be +specified for separate evaluation steps by providing a parameter value in a +named list with data elements, e.g. \code{list("auc_data"="bci", "model_performance"="point")}. This parameter can be set for the following +data elements: \code{auc_data}, \code{decision_curve_analyis}, \code{model_performance}, +\code{permutation_vimp}, \code{ice_data}, and \code{prediction_data}.} + \item{\code{confidence_level}}{(\emph{optional}) Numeric value for the level at which +confidence intervals are determined. In the case bootstraps are used to +determine the confidence intervals bootstrap estimation, \code{familiar} uses the +rule of thumb \eqn{n = 20 / ci.level} to determine the number of required +bootstraps. + +The default value is \code{0.95}.} + \item{\code{bootstrap_ci_method}}{(\emph{optional}) Method used to determine bootstrap +confidence intervals (Efron and Hastie, 2016). The following methods are +implemented: +\itemize{ +\item \code{percentile} (default): Confidence intervals obtained using the percentile +method. +\item \code{bc}: Bias-corrected confidence intervals. +} + +Note that the standard method is not implemented because this method is +often not suitable due to non-normal distributions. The bias-corrected and +accelerated (BCa) method is not implemented yet.} + \item{\code{familiar_data_names}}{Names of the dataset(s). Only used if the \code{object} parameter +is one or more \code{familiarData} objects.} + \item{\code{collection_name}}{Name of the collection.} + }} +} +\value{ +A list of data.tables (if \code{dir_path} is not provided), or nothing, as +all data is exported to \code{csv} files. +} +\description{ +Extract and export individual conditional expectation data. +} +\details{ +Data is usually collected from a \code{familiarCollection} object. +However, you can also provide one or more \code{familiarData} objects, that will +be internally converted to a \code{familiarCollection} object. 
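For instance, a minimal usage sketch (the collection object is a placeholder) of exporting individual conditional expectation data from such a collection:

    library(familiar)

    # Return individual conditional expectation data as a structured list of
    # data.tables; providing dir_path instead writes csv files.
    ice_data <- export_ice_data(object = collection)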
It is also +possible to provide a \code{familiarEnsemble} or one or more \code{familiarModel} +objects together with the data from which data is computed prior to export. +Paths to the previous files can also be provided. + +All parameters aside from \code{object} and \code{dir_path} are only used if \code{object} +is not a \code{familiarCollection} object, or a path to one. +} diff --git a/man/export_model_performance-methods.Rd b/man/export_model_performance-methods.Rd new file mode 100644 index 00000000..74b9ca6e --- /dev/null +++ b/man/export_model_performance-methods.Rd @@ -0,0 +1,184 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FamiliarDataComputationModelPerformance.R +\name{export_model_performance} +\alias{export_model_performance} +\alias{export_model_performance,familiarCollection-method} +\alias{export_model_performance,ANY-method} +\title{Extract and export metrics for model performance.} +\usage{ +export_model_performance( + object, + dir_path = NULL, + aggregate_results = FALSE, + export_collection = FALSE, + ... +) + +\S4method{export_model_performance}{familiarCollection}( + object, + dir_path = NULL, + aggregate_results = FALSE, + export_collection = FALSE, + ... +) + +\S4method{export_model_performance}{ANY}( + object, + dir_path = NULL, + aggregate_results = FALSE, + export_collection = FALSE, + ... +) +} +\arguments{ +\item{object}{A \code{familiarCollection} object, or other other objects from which +a \code{familiarCollection} can be extracted. See details for more information.} + +\item{dir_path}{Path to folder where extracted data should be saved. \code{NULL} +will allow export as a structured list of data.tables.} + +\item{aggregate_results}{Flag that signifies whether results should be +aggregated for export.} + +\item{export_collection}{(\emph{optional}) Exports the collection if TRUE.} + +\item{...}{ + Arguments passed on to \code{\link[=extract_performance]{extract_performance}}, \code{\link[=as_familiar_collection]{as_familiar_collection}} + \describe{ + \item{\code{data}}{A \code{dataObject} object, \code{data.table} or \code{data.frame} that +constitutes the data that are assessed.} + \item{\code{is_pre_processed}}{Flag that indicates whether the data was already +pre-processed externally, e.g. normalised and clustered. Only used if the +\code{data} argument is a \code{data.table} or \code{data.frame}.} + \item{\code{cl}}{Cluster created using the \code{parallel} package. This cluster is then +used to speed up computation through parallellisation.} + \item{\code{evaluation_times}}{One or more time points that are used for in analysis of +survival problems when data has to be assessed at a set time, e.g. +calibration. If not provided explicitly, this parameter is read from +settings used at creation of the underlying \code{familiarModel} objects. Only +used for \code{survival} outcomes.} + \item{\code{ensemble_method}}{Method for ensembling predictions from models for the +same sample. Available methods are: +\itemize{ +\item \code{median} (default): Use the median of the predicted values as the ensemble +value for a sample. +\item \code{mean}: Use the mean of the predicted values as the ensemble value for a +sample. +}} + \item{\code{metric}}{One or more metrics for assessing model performance. See the +vignette on performance metrics for the available metrics. 
If not provided +explicitly, this parameter is read from settings used at creation of the +underlying \code{familiarModel} objects.} + \item{\code{verbose}}{Flag to indicate whether feedback should be provided on the +computation and extraction of various data elements.} + \item{\code{message_indent}}{Number of indentation steps for messages shown during +computation and extraction of various data elements.} + \item{\code{detail_level}}{(\emph{optional}) Sets the level at which results are computed +and aggregated. +\itemize{ +\item \code{ensemble}: Results are computed at the ensemble level, i.e. over all +models in the ensemble. This means that, for example, bias-corrected +estimates of model performance are assessed by creating (at least) 20 +bootstraps and computing the model performance of the ensemble model for +each bootstrap. +\item \code{hybrid} (default): Results are computed at the level of models in an +ensemble. This means that, for example, bias-corrected estimates of model +performance are directly computed using the models in the ensemble. If there +are at least 20 trained models in the ensemble, performance is computed for +each model, in contrast to \code{ensemble} where performance is computed for the +ensemble of models. If there are less than 20 trained models in the +ensemble, bootstraps are created so that at least 20 point estimates can be +made. +\item \code{model}: Results are computed at the model level. This means that, for +example, bias-corrected estimates of model performance are assessed by +creating (at least) 20 bootstraps and computing the performance of the model +for each bootstrap. +} + +Note that each level of detail has a different interpretation for bootstrap +confidence intervals. For \code{ensemble} and \code{model} these are the confidence +intervals for the ensemble and an individual model, respectively. That is, +the confidence interval describes the range where an estimate produced by a +respective ensemble or model trained on a repeat of the experiment may be +found with the probability of the confidence level. For \code{hybrid}, it +represents the range where any single model trained on a repeat of the +experiment may be found with the probability of the confidence level. By +definition, confidence intervals obtained using \code{hybrid} are at least as +wide as those for \code{ensemble}. \code{hybrid} offers the correct interpretation if +the goal of the analysis is to assess the result of a single, unspecified, +model. + +\code{hybrid} is generally computationally less expensive then \code{ensemble}, which +in turn is somewhat less expensive than \code{model}. + +A non-default \code{detail_level} parameter can be specified for separate +evaluation steps by providing a parameter value in a named list with data +elements, e.g. \code{list("auc_data"="ensemble", "model_performance"="hybrid")}. +This parameter can be set for the following data elements: \code{auc_data}, +\code{decision_curve_analyis}, \code{model_performance}, \code{permutation_vimp}, +\code{ice_data}, \code{prediction_data} and \code{confusion_matrix}.} + \item{\code{estimation_type}}{(\emph{optional}) Sets the type of estimation that should be +possible. This has the following options: +\itemize{ +\item \code{point}: Point estimates. +\item \code{bias_correction} or \code{bc}: Bias-corrected estimates. A bias-corrected +estimate is computed from (at least) 20 point estimates, and \code{familiar} may +bootstrap the data to create them. 
+\item \code{bootstrap_confidence_interval} or \code{bci} (default): Bias-corrected +estimates with bootstrap confidence intervals (Efron and Hastie, 2016). The +number of point estimates required depends on the \code{confidence_level} +parameter, and \code{familiar} may bootstrap the data to create them. +} + +As with \code{detail_level}, a non-default \code{estimation_type} parameter can be +specified for separate evaluation steps by providing a parameter value in a +named list with data elements, e.g. \code{list("auc_data"="bci", "model_performance"="point")}. This parameter can be set for the following +data elements: \code{auc_data}, \code{decision_curve_analyis}, \code{model_performance}, +\code{permutation_vimp}, \code{ice_data}, and \code{prediction_data}.} + \item{\code{confidence_level}}{(\emph{optional}) Numeric value for the level at which +confidence intervals are determined. In the case bootstraps are used to +determine the confidence intervals bootstrap estimation, \code{familiar} uses the +rule of thumb \eqn{n = 20 / ci.level} to determine the number of required +bootstraps. + +The default value is \code{0.95}.} + \item{\code{bootstrap_ci_method}}{(\emph{optional}) Method used to determine bootstrap +confidence intervals (Efron and Hastie, 2016). The following methods are +implemented: +\itemize{ +\item \code{percentile} (default): Confidence intervals obtained using the percentile +method. +\item \code{bc}: Bias-corrected confidence intervals. +} + +Note that the standard method is not implemented because this method is +often not suitable due to non-normal distributions. The bias-corrected and +accelerated (BCa) method is not implemented yet.} + \item{\code{familiar_data_names}}{Names of the dataset(s). Only used if the \code{object} parameter +is one or more \code{familiarData} objects.} + \item{\code{collection_name}}{Name of the collection.} + }} +} +\value{ +A list of data.tables (if \code{dir_path} is not provided), or nothing, as +all data is exported to \code{csv} files. +} +\description{ +Extract and export metrics for model performance of models in a +familiarCollection. +} +\details{ +Data is usually collected from a \code{familiarCollection} object. +However, you can also provide one or more \code{familiarData} objects, that will +be internally converted to a \code{familiarCollection} object. It is also +possible to provide a \code{familiarEnsemble} or one or more \code{familiarModel} +objects together with the data from which data is computed prior to export. +Paths to the previous files can also be provided. + +All parameters aside from \code{object} and \code{dir_path} are only used if \code{object} +is not a \code{familiarCollection} object, or a path to one. + +Performance of individual and ensemble models is exported. For ensemble +models, a credibility interval is determined using bootstrapping for each +metric. 
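As a minimal usage sketch (the collection object and the "results" folder are placeholders), performance metrics can be exported as follows:

    library(familiar)

    # Return performance values for individual models and the ensemble as a
    # list of data.tables; by default the underlying values are not aggregated.
    performance <- export_model_performance(object = collection)

    # Alternatively, write aggregated results to csv files in a folder.
    export_model_performance(
      object = collection,
      dir_path = "results",
      aggregate_results = TRUE)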
+} diff --git a/man/export_model_vimp-methods.Rd b/man/export_model_vimp-methods.Rd new file mode 100644 index 00000000..9b8c5879 --- /dev/null +++ b/man/export_model_vimp-methods.Rd @@ -0,0 +1,130 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FamiliarDataComputationVimp.R +\name{export_model_vimp} +\alias{export_model_vimp} +\alias{export_model_vimp,familiarCollection-method} +\alias{export_model_vimp,ANY-method} +\title{Extract and export model-based variable importance.} +\usage{ +export_model_vimp( + object, + dir_path = NULL, + aggregate_results = TRUE, + aggregation_method = waiver(), + rank_threshold = waiver(), + export_collection = FALSE, + ... +) + +\S4method{export_model_vimp}{familiarCollection}( + object, + dir_path = NULL, + aggregate_results = TRUE, + aggregation_method = waiver(), + rank_threshold = waiver(), + export_collection = FALSE, + ... +) + +\S4method{export_model_vimp}{ANY}( + object, + dir_path = NULL, + aggregate_results = TRUE, + aggregation_method = waiver(), + rank_threshold = waiver(), + export_collection = FALSE, + ... +) +} +\arguments{ +\item{object}{A \code{familiarCollection} object, or other objects from which +a \code{familiarCollection} can be extracted. See details for more information.} + +\item{dir_path}{Path to folder where extracted data should be saved. \code{NULL} +will allow export as a structured list of data.tables.} + +\item{aggregate_results}{Flag that signifies whether results should be +aggregated for export.} + +\item{aggregation_method}{(\emph{optional}) The method used to aggregate variable +importances over different data subsets, e.g. bootstraps. The following +methods can be selected: +\itemize{ +\item \code{mean} (default): Use the mean rank of a feature over the subsets to +determine the aggregated feature rank. +\item \code{median}: Use the median rank of a feature over the subsets to determine +the aggregated feature rank. +\item \code{best}: Use the best rank the feature obtained in any subset to determine +the aggregated feature rank. +\item \code{worst}: Use the worst rank the feature obtained in any subset to +determine the aggregated feature rank. +\item \code{stability}: Use the frequency of the feature being in the subset of +highly ranked features as a measure for the aggregated feature rank +(Meinshausen and Buehlmann, 2010). +\item \code{exponential}: Use a rank-weighted frequency of occurrence in the subset +of highly ranked features as a measure for the aggregated feature rank (Haury +et al., 2011). +\item \code{borda}: Use the borda count as a measure for the aggregated feature rank +(Wald et al., 2012). +\item \code{enhanced_borda}: Use an occurrence frequency-weighted borda count as +a measure for the aggregated feature rank (Wald et al., 2012). +\item \code{truncated_borda}: Use the borda count computed only on features within the +subset of highly ranked features. +\item \code{enhanced_truncated_borda}: Apply both the enhanced borda method and the +truncated borda method and use the resulting borda count as the aggregated +feature rank. +}} + +\item{rank_threshold}{(\emph{optional}) The threshold used to define the subset of +highly important features. If not set, this threshold is determined by +maximising the variance in the occurrence value over all features over the +subset size.
+ +This parameter is only relevant for \code{stability}, \code{exponential}, +\code{enhanced_borda}, \code{truncated_borda} and \code{enhanced_truncated_borda} methods.} + +\item{export_collection}{(\emph{optional}) Exports the collection if TRUE.} + +\item{...}{ + Arguments passed on to \code{\link[=as_familiar_collection]{as_familiar_collection}} + \describe{ + \item{\code{familiar_data_names}}{Names of the dataset(s). Only used if the \code{object} parameter +is one or more \code{familiarData} objects.} + \item{\code{collection_name}}{Name of the collection.} + }} +} +\value{ +A data.table (if \code{dir_path} is not provided), or nothing, as all data +is exported to \code{csv} files. +} +\description{ +Extract and export model-based variable importance from a +familiarCollection. +} +\details{ +Data, such as model performance and calibration information, is +usually collected from a \code{familiarCollection} object. However, you can also +provide one or more \code{familiarData} objects, that will be internally +converted to a \code{familiarCollection} object. It is also possible to provide a +\code{familiarEnsemble} or one or more \code{familiarModel} objects together with the +data from which data is computed prior to export. Paths to the previous +files can also be provided. + +All parameters aside from \code{object} and \code{dir_path} are only used if \code{object} +is not a \code{familiarCollection} object, or a path to one. + +Variable importance is based on the ranking produced by model-specific +variable importance routines, e.g. permutation for random forests. If such a +routine is absent, variable importance is based on the feature selection +method that led to the features included in the model. In case multiple +models (\code{familiarModel} objects) are combined, feature ranks are first +aggregated using the method defined by the \code{aggregation_method}, some of +which require a \code{rank_threshold} to indicate a subset of most important +features. + +Information concerning highly similar features that form clusters is +provided as well. This information is based on consensus clustering of the +features that were used in the signatures of the underlying models. This +clustering information is also used during aggregation to ensure that +co-clustered features are only taken into account once. +} diff --git a/man/export_partial_dependence_data-methods.Rd b/man/export_partial_dependence_data-methods.Rd new file mode 100644 index 00000000..042019a4 --- /dev/null +++ b/man/export_partial_dependence_data-methods.Rd @@ -0,0 +1,196 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FamiliarDataComputationICE.R +\name{export_partial_dependence_data} +\alias{export_partial_dependence_data} +\alias{export_partial_dependence_data,familiarCollection-method} +\alias{export_partial_dependence_data,ANY-method} +\title{Extract and export partial dependence data.} +\usage{ +export_partial_dependence_data( + object, + dir_path = NULL, + aggregate_results = TRUE, + export_collection = FALSE, + ... +) + +\S4method{export_partial_dependence_data}{familiarCollection}( + object, + dir_path = NULL, + aggregate_results = TRUE, + export_collection = FALSE, + ... +) + +\S4method{export_partial_dependence_data}{ANY}( + object, + dir_path = NULL, + aggregate_results = TRUE, + export_collection = FALSE, + ... +) +} +\arguments{ +\item{object}{A \code{familiarCollection} object, or other other objects from which +a \code{familiarCollection} can be extracted. 
See details for more information.} + +\item{dir_path}{Path to folder where extracted data should be saved. \code{NULL} +will allow export as a structured list of data.tables.} + +\item{aggregate_results}{Flag that signifies whether results should be +aggregated for export.} + +\item{export_collection}{(\emph{optional}) Exports the collection if TRUE.} + +\item{...}{ + Arguments passed on to \code{\link[=extract_ice]{extract_ice}}, \code{\link[=as_familiar_collection]{as_familiar_collection}} + \describe{ + \item{\code{features}}{Names of the feature or features (2) assessed simultaneously. +By default \code{NULL}, which means that all features are assessed one-by-one.} + \item{\code{feature_x_range}}{When one or two features are defined using \code{features}, +\code{feature_x_range} can be used to set the range of values for the first +feature. For numeric features, a vector of two values is assumed to indicate +a range from which \code{n_sample_points} are uniformly sampled. A vector of more +than two values is interpreted as is, i.e. these represent the values to be +sampled. For categorical features, values should represent a (sub)set of +available levels.} + \item{\code{feature_y_range}}{As \code{feature_x_range}, but for the second feature in +case two features are defined.} + \item{\code{n_sample_points}}{Number of points used to sample continuous features.} + \item{\code{data}}{A \code{dataObject} object, \code{data.table} or \code{data.frame} that +constitutes the data that are assessed.} + \item{\code{is_pre_processed}}{Flag that indicates whether the data was already +pre-processed externally, e.g. normalised and clustered. Only used if the +\code{data} argument is a \code{data.table} or \code{data.frame}.} + \item{\code{cl}}{Cluster created using the \code{parallel} package. This cluster is then +used to speed up computation through parallellisation.} + \item{\code{evaluation_times}}{One or more time points that are used for in analysis of +survival problems when data has to be assessed at a set time, e.g. +calibration. If not provided explicitly, this parameter is read from +settings used at creation of the underlying \code{familiarModel} objects. Only +used for \code{survival} outcomes.} + \item{\code{ensemble_method}}{Method for ensembling predictions from models for the +same sample. Available methods are: +\itemize{ +\item \code{median} (default): Use the median of the predicted values as the ensemble +value for a sample. +\item \code{mean}: Use the mean of the predicted values as the ensemble value for a +sample. +}} + \item{\code{verbose}}{Flag to indicate whether feedback should be provided on the +computation and extraction of various data elements.} + \item{\code{message_indent}}{Number of indentation steps for messages shown during +computation and extraction of various data elements.} + \item{\code{sample_limit}}{(\emph{optional}) Set the upper limit of the number of samples +that are used during evaluation steps. Cannot be less than 20. + +This setting can be specified per data element by providing a parameter +value in a named list with data elements, e.g. +\code{list("sample_similarity"=100, "permutation_vimp"=1000)}. + +This parameter can be set for the following data elements: +\code{sample_similarity} and \code{ice_data}.} + \item{\code{detail_level}}{(\emph{optional}) Sets the level at which results are computed +and aggregated. +\itemize{ +\item \code{ensemble}: Results are computed at the ensemble level, i.e. over all +models in the ensemble. 
This means that, for example, bias-corrected +estimates of model performance are assessed by creating (at least) 20 +bootstraps and computing the model performance of the ensemble model for +each bootstrap. +\item \code{hybrid} (default): Results are computed at the level of models in an +ensemble. This means that, for example, bias-corrected estimates of model +performance are directly computed using the models in the ensemble. If there +are at least 20 trained models in the ensemble, performance is computed for +each model, in contrast to \code{ensemble} where performance is computed for the +ensemble of models. If there are less than 20 trained models in the +ensemble, bootstraps are created so that at least 20 point estimates can be +made. +\item \code{model}: Results are computed at the model level. This means that, for +example, bias-corrected estimates of model performance are assessed by +creating (at least) 20 bootstraps and computing the performance of the model +for each bootstrap. +} + +Note that each level of detail has a different interpretation for bootstrap +confidence intervals. For \code{ensemble} and \code{model} these are the confidence +intervals for the ensemble and an individual model, respectively. That is, +the confidence interval describes the range where an estimate produced by a +respective ensemble or model trained on a repeat of the experiment may be +found with the probability of the confidence level. For \code{hybrid}, it +represents the range where any single model trained on a repeat of the +experiment may be found with the probability of the confidence level. By +definition, confidence intervals obtained using \code{hybrid} are at least as +wide as those for \code{ensemble}. \code{hybrid} offers the correct interpretation if +the goal of the analysis is to assess the result of a single, unspecified, +model. + +\code{hybrid} is generally computationally less expensive then \code{ensemble}, which +in turn is somewhat less expensive than \code{model}. + +A non-default \code{detail_level} parameter can be specified for separate +evaluation steps by providing a parameter value in a named list with data +elements, e.g. \code{list("auc_data"="ensemble", "model_performance"="hybrid")}. +This parameter can be set for the following data elements: \code{auc_data}, +\code{decision_curve_analyis}, \code{model_performance}, \code{permutation_vimp}, +\code{ice_data}, \code{prediction_data} and \code{confusion_matrix}.} + \item{\code{estimation_type}}{(\emph{optional}) Sets the type of estimation that should be +possible. This has the following options: +\itemize{ +\item \code{point}: Point estimates. +\item \code{bias_correction} or \code{bc}: Bias-corrected estimates. A bias-corrected +estimate is computed from (at least) 20 point estimates, and \code{familiar} may +bootstrap the data to create them. +\item \code{bootstrap_confidence_interval} or \code{bci} (default): Bias-corrected +estimates with bootstrap confidence intervals (Efron and Hastie, 2016). The +number of point estimates required depends on the \code{confidence_level} +parameter, and \code{familiar} may bootstrap the data to create them. +} + +As with \code{detail_level}, a non-default \code{estimation_type} parameter can be +specified for separate evaluation steps by providing a parameter value in a +named list with data elements, e.g. \code{list("auc_data"="bci", "model_performance"="point")}. 
This parameter can be set for the following +data elements: \code{auc_data}, \code{decision_curve_analyis}, \code{model_performance}, +\code{permutation_vimp}, \code{ice_data}, and \code{prediction_data}.} + \item{\code{confidence_level}}{(\emph{optional}) Numeric value for the level at which +confidence intervals are determined. In the case bootstraps are used to +determine the confidence intervals bootstrap estimation, \code{familiar} uses the +rule of thumb \eqn{n = 20 / ci.level} to determine the number of required +bootstraps. + +The default value is \code{0.95}.} + \item{\code{bootstrap_ci_method}}{(\emph{optional}) Method used to determine bootstrap +confidence intervals (Efron and Hastie, 2016). The following methods are +implemented: +\itemize{ +\item \code{percentile} (default): Confidence intervals obtained using the percentile +method. +\item \code{bc}: Bias-corrected confidence intervals. +} + +Note that the standard method is not implemented because this method is +often not suitable due to non-normal distributions. The bias-corrected and +accelerated (BCa) method is not implemented yet.} + \item{\code{familiar_data_names}}{Names of the dataset(s). Only used if the \code{object} parameter +is one or more \code{familiarData} objects.} + \item{\code{collection_name}}{Name of the collection.} + }} +} +\value{ +A list of data.tables (if \code{dir_path} is not provided), or nothing, as +all data is exported to \code{csv} files. +} +\description{ +Extract and export partial dependence data. +} +\details{ +Data is usually collected from a \code{familiarCollection} object. +However, you can also provide one or more \code{familiarData} objects, that will +be internally converted to a \code{familiarCollection} object. It is also +possible to provide a \code{familiarEnsemble} or one or more \code{familiarModel} +objects together with the data from which data is computed prior to export. +Paths to the previous files can also be provided. + +All parameters aside from \code{object} and \code{dir_path} are only used if \code{object} +is not a \code{familiarCollection} object, or a path to one. +} diff --git a/man/export_permutation_vimp-methods.Rd b/man/export_permutation_vimp-methods.Rd new file mode 100644 index 00000000..4c3e1ae2 --- /dev/null +++ b/man/export_permutation_vimp-methods.Rd @@ -0,0 +1,244 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FamiliarDataComputationPermutationVimp.R +\name{export_permutation_vimp} +\alias{export_permutation_vimp} +\alias{export_permutation_vimp,familiarCollection-method} +\alias{export_permutation_vimp,ANY-method} +\title{Extract and export permutation variable importance.} +\usage{ +export_permutation_vimp( + object, + dir_path = NULL, + aggregate_results = TRUE, + export_collection = FALSE, + ... +) + +\S4method{export_permutation_vimp}{familiarCollection}( + object, + dir_path = NULL, + aggregate_results = TRUE, + export_collection = FALSE, + ... +) + +\S4method{export_permutation_vimp}{ANY}( + object, + dir_path = NULL, + aggregate_results = TRUE, + export_collection = FALSE, + ... +) +} +\arguments{ +\item{object}{A \code{familiarCollection} object, or other other objects from which +a \code{familiarCollection} can be extracted. See details for more information.} + +\item{dir_path}{Path to folder where extracted data should be saved. 
\code{NULL} +will allow export as a structured list of data.tables.} + +\item{aggregate_results}{Flag that signifies whether results should be +aggregated for export.} + +\item{export_collection}{(\emph{optional}) Exports the collection if TRUE.} + +\item{...}{ + Arguments passed on to \code{\link[=extract_permutation_vimp]{extract_permutation_vimp}}, \code{\link[=as_familiar_collection]{as_familiar_collection}} + \describe{ + \item{\code{data}}{A \code{dataObject} object, \code{data.table} or \code{data.frame} that +constitutes the data that are assessed.} + \item{\code{is_pre_processed}}{Flag that indicates whether the data was already +pre-processed externally, e.g. normalised and clustered. Only used if the +\code{data} argument is a \code{data.table} or \code{data.frame}.} + \item{\code{cl}}{Cluster created using the \code{parallel} package. This cluster is then +used to speed up computation through parallellisation.} + \item{\code{evaluation_times}}{One or more time points that are used for in analysis of +survival problems when data has to be assessed at a set time, e.g. +calibration. If not provided explicitly, this parameter is read from +settings used at creation of the underlying \code{familiarModel} objects. Only +used for \code{survival} outcomes.} + \item{\code{ensemble_method}}{Method for ensembling predictions from models for the +same sample. Available methods are: +\itemize{ +\item \code{median} (default): Use the median of the predicted values as the ensemble +value for a sample. +\item \code{mean}: Use the mean of the predicted values as the ensemble value for a +sample. +}} + \item{\code{metric}}{One or more metrics for assessing model performance. See the +vignette on performance metrics for the available metrics. If not provided +explicitly, this parameter is read from settings used at creation of the +underlying \code{familiarModel} objects.} + \item{\code{feature_cluster_method}}{The method used to perform clustering. These are +the same methods as for the \code{cluster_method} configuration parameter: +\code{none}, \code{hclust}, \code{agnes}, \code{diana} and \code{pam}. + +\code{none} cannot be used when extracting data regarding mutual correlation or +feature expressions. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + \item{\code{feature_linkage_method}}{The method used for agglomerative clustering in +\code{hclust} and \code{agnes}. These are the same methods as for the +\code{cluster_linkage_method} configuration parameter: \code{average}, \code{single}, +\code{complete}, \code{weighted}, and \code{ward}. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + \item{\code{feature_cluster_cut_method}}{The method used to divide features into +separate clusters. The available methods are the same as for the +\code{cluster_cut_method} configuration parameter: \code{silhouette}, \code{fixed_cut} and +\code{dynamic_cut}. + +\code{silhouette} is available for all cluster methods, but \code{fixed_cut} only +applies to methods that create hierarchical trees (\code{hclust}, \code{agnes} and +\code{diana}). \code{dynamic_cut} requires the \code{dynamicTreeCut} package and can only +be used with \code{agnes} and \code{hclust}. 
+ +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + \item{\code{feature_similarity_threshold}}{The threshold level for pair-wise +similarity that is required to form feature clusters with the \code{fixed_cut} +method. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + \item{\code{feature_similarity_metric}}{Metric to determine pairwise similarity +between features. Similarity is computed in the same manner as for +clustering, and \code{feature_similarity_metric} therefore has the same options +as \code{cluster_similarity_metric}: \code{mcfadden_r2}, \code{cox_snell_r2}, +\code{nagelkerke_r2}, \code{spearman}, \code{kendall} and \code{pearson}. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + \item{\code{verbose}}{Flag to indicate whether feedback should be provided on the +computation and extraction of various data elements.} + \item{\code{message_indent}}{Number of indentation steps for messages shown during +computation and extraction of various data elements.} + \item{\code{detail_level}}{(\emph{optional}) Sets the level at which results are computed +and aggregated. +\itemize{ +\item \code{ensemble}: Results are computed at the ensemble level, i.e. over all +models in the ensemble. This means that, for example, bias-corrected +estimates of model performance are assessed by creating (at least) 20 +bootstraps and computing the model performance of the ensemble model for +each bootstrap. +\item \code{hybrid} (default): Results are computed at the level of models in an +ensemble. This means that, for example, bias-corrected estimates of model +performance are directly computed using the models in the ensemble. If there +are at least 20 trained models in the ensemble, performance is computed for +each model, in contrast to \code{ensemble} where performance is computed for the +ensemble of models. If there are less than 20 trained models in the +ensemble, bootstraps are created so that at least 20 point estimates can be +made. +\item \code{model}: Results are computed at the model level. This means that, for +example, bias-corrected estimates of model performance are assessed by +creating (at least) 20 bootstraps and computing the performance of the model +for each bootstrap. +} + +Note that each level of detail has a different interpretation for bootstrap +confidence intervals. For \code{ensemble} and \code{model} these are the confidence +intervals for the ensemble and an individual model, respectively. That is, +the confidence interval describes the range where an estimate produced by a +respective ensemble or model trained on a repeat of the experiment may be +found with the probability of the confidence level. For \code{hybrid}, it +represents the range where any single model trained on a repeat of the +experiment may be found with the probability of the confidence level. By +definition, confidence intervals obtained using \code{hybrid} are at least as +wide as those for \code{ensemble}. \code{hybrid} offers the correct interpretation if +the goal of the analysis is to assess the result of a single, unspecified, +model. + +\code{hybrid} is generally computationally less expensive then \code{ensemble}, which +in turn is somewhat less expensive than \code{model}. 
+ +A non-default \code{detail_level} parameter can be specified for separate +evaluation steps by providing a parameter value in a named list with data +elements, e.g. \code{list("auc_data"="ensemble", "model_performance"="hybrid")}. +This parameter can be set for the following data elements: \code{auc_data}, +\code{decision_curve_analyis}, \code{model_performance}, \code{permutation_vimp}, +\code{ice_data}, \code{prediction_data} and \code{confusion_matrix}.} + \item{\code{estimation_type}}{(\emph{optional}) Sets the type of estimation that should be +possible. This has the following options: +\itemize{ +\item \code{point}: Point estimates. +\item \code{bias_correction} or \code{bc}: Bias-corrected estimates. A bias-corrected +estimate is computed from (at least) 20 point estimates, and \code{familiar} may +bootstrap the data to create them. +\item \code{bootstrap_confidence_interval} or \code{bci} (default): Bias-corrected +estimates with bootstrap confidence intervals (Efron and Hastie, 2016). The +number of point estimates required depends on the \code{confidence_level} +parameter, and \code{familiar} may bootstrap the data to create them. +} + +As with \code{detail_level}, a non-default \code{estimation_type} parameter can be +specified for separate evaluation steps by providing a parameter value in a +named list with data elements, e.g. \code{list("auc_data"="bci", "model_performance"="point")}. This parameter can be set for the following +data elements: \code{auc_data}, \code{decision_curve_analyis}, \code{model_performance}, +\code{permutation_vimp}, \code{ice_data}, and \code{prediction_data}.} + \item{\code{confidence_level}}{(\emph{optional}) Numeric value for the level at which +confidence intervals are determined. In the case bootstraps are used to +determine the confidence intervals bootstrap estimation, \code{familiar} uses the +rule of thumb \eqn{n = 20 / ci.level} to determine the number of required +bootstraps. + +The default value is \code{0.95}.} + \item{\code{bootstrap_ci_method}}{(\emph{optional}) Method used to determine bootstrap +confidence intervals (Efron and Hastie, 2016). The following methods are +implemented: +\itemize{ +\item \code{percentile} (default): Confidence intervals obtained using the percentile +method. +\item \code{bc}: Bias-corrected confidence intervals. +} + +Note that the standard method is not implemented because this method is +often not suitable due to non-normal distributions. The bias-corrected and +accelerated (BCa) method is not implemented yet.} + \item{\code{familiar_data_names}}{Names of the dataset(s). Only used if the \code{object} parameter +is one or more \code{familiarData} objects.} + \item{\code{collection_name}}{Name of the collection.} + }} +} +\value{ +A data.table (if \code{dir_path} is not provided), or nothing, as all data +is exported to \code{csv} files. +} +\description{ +Extract and export model-based variable importance from a +familiarCollection. +} +\details{ +Data, such as permutation variable importance and calibration +information, is usually collected from a \code{familiarCollection} object. +However, you can also provide one or more \code{familiarData} objects, that will +be internally converted to a \code{familiarCollection} object. It is also +possible to provide a \code{familiarEnsemble} or one or more \code{familiarModel} +objects together with the data from which data is computed prior to export. +Paths to the previously mentioned files can also be provided. 
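As a minimal usage sketch (the ensemble and validation_data objects are placeholders), permutation variable importance can, for example, be computed from a familiarEnsemble and accompanying data:

    library(familiar)

    # Compute permutation variable importance on the provided data and return
    # it as a data.table; metrics are read from the underlying models unless
    # passed explicitly via the metric argument.
    permutation_vimp <- export_permutation_vimp(
      object = ensemble,
      data = validation_data)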
+ +All parameters aside from \code{object} and \code{dir_path} are only used if \code{object} +is not a \code{familiarCollection} object, or a path to one. + +Permutation variable importance assesses the improvement in model +performance due to a feature. For this purpose, the performance of the model +is measured as normal, and is measured again with a dataset where the values +of the feature in question have been randomly permuted. The difference +between both performance measurements is the permutation variable +importance. + +In familiar, this basic concept is extended in several ways: +\itemize{ +\item Point estimates of variable importance are based on multiple (21) random +permutations. The difference between model performance on the normal dataset +and the median performance measurement of the randomly permuted datasets is +used as permutation variable importance. +\item Confidence intervals for the ensemble model are determined using bootstrap +methods. +\item Permutation variable importance is assessed for any metric specified using +the \code{metric} argument. +\item Permutation variable importance can take into account similarity between +features and permute similar features simultaneously. +} +} diff --git a/man/export_prediction_data-methods.Rd b/man/export_prediction_data-methods.Rd new file mode 100644 index 00000000..0fcfe71f --- /dev/null +++ b/man/export_prediction_data-methods.Rd @@ -0,0 +1,162 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FamiliarDataComputationPredictionData.R +\name{export_prediction_data} +\alias{export_prediction_data} +\alias{export_prediction_data,familiarCollection-method} +\alias{export_prediction_data,ANY-method} +\title{Extract and export predicted values.} +\usage{ +export_prediction_data(object, dir_path = NULL, export_collection = FALSE, ...) + +\S4method{export_prediction_data}{familiarCollection}(object, dir_path = NULL, export_collection = FALSE, ...) + +\S4method{export_prediction_data}{ANY}(object, dir_path = NULL, export_collection = FALSE, ...) +} +\arguments{ +\item{object}{A \code{familiarCollection} object, or other objects from which +a \code{familiarCollection} can be extracted. See details for more information.} + +\item{dir_path}{Path to folder where extracted data should be saved. \code{NULL} +will allow export as a structured list of data.tables.} + +\item{export_collection}{(\emph{optional}) Exports the collection if TRUE.} + +\item{...}{ + Arguments passed on to \code{\link[=extract_predictions]{extract_predictions}}, \code{\link[=as_familiar_collection]{as_familiar_collection}} + \describe{ + \item{\code{data}}{A \code{dataObject} object, \code{data.table} or \code{data.frame} that +constitutes the data that are assessed.} + \item{\code{is_pre_processed}}{Flag that indicates whether the data was already +pre-processed externally, e.g. normalised and clustered. Only used if the +\code{data} argument is a \code{data.table} or \code{data.frame}.} + \item{\code{cl}}{Cluster created using the \code{parallel} package. This cluster is then +used to speed up computation through parallelisation.} + \item{\code{evaluation_times}}{One or more time points that are used in the analysis of +survival problems when data has to be assessed at a set time, e.g. +calibration. If not provided explicitly, this parameter is read from +settings used at creation of the underlying \code{familiarModel} objects.
Only +used for \code{survival} outcomes.} + \item{\code{ensemble_method}}{Method for ensembling predictions from models for the +same sample. Available methods are: +\itemize{ +\item \code{median} (default): Use the median of the predicted values as the ensemble +value for a sample. +\item \code{mean}: Use the mean of the predicted values as the ensemble value for a +sample. +}} + \item{\code{verbose}}{Flag to indicate whether feedback should be provided on the +computation and extraction of various data elements.} + \item{\code{message_indent}}{Number of indentation steps for messages shown during +computation and extraction of various data elements.} + \item{\code{detail_level}}{(\emph{optional}) Sets the level at which results are computed +and aggregated. +\itemize{ +\item \code{ensemble}: Results are computed at the ensemble level, i.e. over all +models in the ensemble. This means that, for example, bias-corrected +estimates of model performance are assessed by creating (at least) 20 +bootstraps and computing the model performance of the ensemble model for +each bootstrap. +\item \code{hybrid} (default): Results are computed at the level of models in an +ensemble. This means that, for example, bias-corrected estimates of model +performance are directly computed using the models in the ensemble. If there +are at least 20 trained models in the ensemble, performance is computed for +each model, in contrast to \code{ensemble} where performance is computed for the +ensemble of models. If there are less than 20 trained models in the +ensemble, bootstraps are created so that at least 20 point estimates can be +made. +\item \code{model}: Results are computed at the model level. This means that, for +example, bias-corrected estimates of model performance are assessed by +creating (at least) 20 bootstraps and computing the performance of the model +for each bootstrap. +} + +Note that each level of detail has a different interpretation for bootstrap +confidence intervals. For \code{ensemble} and \code{model} these are the confidence +intervals for the ensemble and an individual model, respectively. That is, +the confidence interval describes the range where an estimate produced by a +respective ensemble or model trained on a repeat of the experiment may be +found with the probability of the confidence level. For \code{hybrid}, it +represents the range where any single model trained on a repeat of the +experiment may be found with the probability of the confidence level. By +definition, confidence intervals obtained using \code{hybrid} are at least as +wide as those for \code{ensemble}. \code{hybrid} offers the correct interpretation if +the goal of the analysis is to assess the result of a single, unspecified, +model. + +\code{hybrid} is generally computationally less expensive then \code{ensemble}, which +in turn is somewhat less expensive than \code{model}. + +A non-default \code{detail_level} parameter can be specified for separate +evaluation steps by providing a parameter value in a named list with data +elements, e.g. \code{list("auc_data"="ensemble", "model_performance"="hybrid")}. +This parameter can be set for the following data elements: \code{auc_data}, +\code{decision_curve_analyis}, \code{model_performance}, \code{permutation_vimp}, +\code{ice_data}, \code{prediction_data} and \code{confusion_matrix}.} + \item{\code{estimation_type}}{(\emph{optional}) Sets the type of estimation that should be +possible. 
This has the following options: +\itemize{ +\item \code{point}: Point estimates. +\item \code{bias_correction} or \code{bc}: Bias-corrected estimates. A bias-corrected +estimate is computed from (at least) 20 point estimates, and \code{familiar} may +bootstrap the data to create them. +\item \code{bootstrap_confidence_interval} or \code{bci} (default): Bias-corrected +estimates with bootstrap confidence intervals (Efron and Hastie, 2016). The +number of point estimates required depends on the \code{confidence_level} +parameter, and \code{familiar} may bootstrap the data to create them. +} + +As with \code{detail_level}, a non-default \code{estimation_type} parameter can be +specified for separate evaluation steps by providing a parameter value in a +named list with data elements, e.g. \code{list("auc_data"="bci", "model_performance"="point")}. This parameter can be set for the following +data elements: \code{auc_data}, \code{decision_curve_analyis}, \code{model_performance}, +\code{permutation_vimp}, \code{ice_data}, and \code{prediction_data}.} + \item{\code{aggregate_results}}{(\emph{optional}) Flag that signifies whether results +should be aggregated during evaluation. If \code{estimation_type} is +\code{bias_correction} or \code{bc}, aggregation leads to a single bias-corrected +estimate. If \code{estimation_type} is \code{bootstrap_confidence_interval} or \code{bci}, +aggregation leads to a single bias-corrected estimate with lower and upper +boundaries of the confidence interval. This has no effect if +\code{estimation_type} is \code{point}. + +The default value is equal to \code{TRUE} except when assessing metrics to assess +model performance, as the default violin plot requires underlying data. + +As with \code{detail_level} and \code{estimation_type}, a non-default +\code{aggregate_results} parameter can be specified for separate evaluation steps +by providing a parameter value in a named list with data elements, e.g. +\code{list("auc_data"=TRUE, , "model_performance"=FALSE)}. This parameter exists +for the same elements as \code{estimation_type}.} + \item{\code{confidence_level}}{(\emph{optional}) Numeric value for the level at which +confidence intervals are determined. In the case bootstraps are used to +determine the confidence intervals bootstrap estimation, \code{familiar} uses the +rule of thumb \eqn{n = 20 / ci.level} to determine the number of required +bootstraps. + +The default value is \code{0.95}.} + \item{\code{familiar_data_names}}{Names of the dataset(s). Only used if the \code{object} parameter +is one or more \code{familiarData} objects.} + \item{\code{collection_name}}{Name of the collection.} + }} +} +\value{ +A list of data.tables (if \code{dir_path} is not provided), or nothing, as +all data is exported to \code{csv} files. +} +\description{ +Extract and export the values predicted by single and ensemble +models in a familiarCollection. +} +\details{ +Data, such as model performance and calibration information, is +usually collected from a \code{familiarCollection} object. However, you can also +provide one or more \code{familiarData} objects, that will be internally +converted to a \code{familiarCollection} object. It is also possible to provide a +\code{familiarEnsemble} or one or more \code{familiarModel} objects together with the +data from which data is computed prior to export. Paths to the previous +files can also be provided. 
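As a minimal usage sketch (the collection object and the "results" folder are placeholders):

    library(familiar)

    # Return predicted values for single models and the ensemble as a list of
    # data.tables.
    predictions <- export_prediction_data(object = collection)

    # Alternatively, write the predicted values to csv files in a folder.
    export_prediction_data(object = collection, dir_path = "results")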
+ +All parameters aside from \code{object} and \code{dir_path} are only used if \code{object} +is not a \code{familiarCollection} object, or a path to one. + +Both single and ensemble predictions are exported. +} diff --git a/man/export_risk_stratification_data-methods.Rd b/man/export_risk_stratification_data-methods.Rd new file mode 100644 index 00000000..042f3b76 --- /dev/null +++ b/man/export_risk_stratification_data-methods.Rd @@ -0,0 +1,157 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FamiliarDataComputationRiskStratificationData.R +\name{export_risk_stratification_data} +\alias{export_risk_stratification_data} +\alias{export_risk_stratification_data,familiarCollection-method} +\alias{export_risk_stratification_data,ANY-method} +\title{Extract and export sample risk group stratification and associated +tests.} +\usage{ +export_risk_stratification_data( + object, + dir_path = NULL, + export_strata = TRUE, + time_range = NULL, + export_collection = FALSE, + ... +) + +\S4method{export_risk_stratification_data}{familiarCollection}( + object, + dir_path = NULL, + export_strata = TRUE, + time_range = NULL, + export_collection = FALSE, + ... +) + +\S4method{export_risk_stratification_data}{ANY}( + object, + dir_path = NULL, + export_strata = TRUE, + time_range = NULL, + export_collection = FALSE, + ... +) +} +\arguments{ +\item{object}{A \code{familiarCollection} object, or other other objects from which +a \code{familiarCollection} can be extracted. See details for more information.} + +\item{dir_path}{Path to folder where extracted data should be saved. \code{NULL} +will allow export as a structured list of data.tables.} + +\item{export_strata}{Flag that determines whether the raw data or strata are +exported.} + +\item{time_range}{Time range for which strata should be created. If \code{NULL}, +the full time range is used.} + +\item{export_collection}{(\emph{optional}) Exports the collection if TRUE.} + +\item{...}{ + Arguments passed on to \code{\link[=extract_risk_stratification_data]{extract_risk_stratification_data}}, \code{\link[=as_familiar_collection]{as_familiar_collection}} + \describe{ + \item{\code{data}}{A \code{dataObject} object, \code{data.table} or \code{data.frame} that +constitutes the data that are assessed.} + \item{\code{is_pre_processed}}{Flag that indicates whether the data was already +pre-processed externally, e.g. normalised and clustered. Only used if the +\code{data} argument is a \code{data.table} or \code{data.frame}.} + \item{\code{cl}}{Cluster created using the \code{parallel} package. This cluster is then +used to speed up computation through parallellisation.} + \item{\code{ensemble_method}}{Method for ensembling predictions from models for the +same sample. Available methods are: +\itemize{ +\item \code{median} (default): Use the median of the predicted values as the ensemble +value for a sample. +\item \code{mean}: Use the mean of the predicted values as the ensemble value for a +sample. +}} + \item{\code{verbose}}{Flag to indicate whether feedback should be provided on the +computation and extraction of various data elements.} + \item{\code{message_indent}}{Number of indentation steps for messages shown during +computation and extraction of various data elements.} + \item{\code{detail_level}}{(\emph{optional}) Sets the level at which results are computed +and aggregated. +\itemize{ +\item \code{ensemble}: Results are computed at the ensemble level, i.e. over all +models in the ensemble. 
This means that, for example, bias-corrected +estimates of model performance are assessed by creating (at least) 20 +bootstraps and computing the model performance of the ensemble model for +each bootstrap. +\item \code{hybrid} (default): Results are computed at the level of models in an +ensemble. This means that, for example, bias-corrected estimates of model +performance are directly computed using the models in the ensemble. If there +are at least 20 trained models in the ensemble, performance is computed for +each model, in contrast to \code{ensemble} where performance is computed for the +ensemble of models. If there are less than 20 trained models in the +ensemble, bootstraps are created so that at least 20 point estimates can be +made. +\item \code{model}: Results are computed at the model level. This means that, for +example, bias-corrected estimates of model performance are assessed by +creating (at least) 20 bootstraps and computing the performance of the model +for each bootstrap. +} + +Note that each level of detail has a different interpretation for bootstrap +confidence intervals. For \code{ensemble} and \code{model} these are the confidence +intervals for the ensemble and an individual model, respectively. That is, +the confidence interval describes the range where an estimate produced by a +respective ensemble or model trained on a repeat of the experiment may be +found with the probability of the confidence level. For \code{hybrid}, it +represents the range where any single model trained on a repeat of the +experiment may be found with the probability of the confidence level. By +definition, confidence intervals obtained using \code{hybrid} are at least as +wide as those for \code{ensemble}. \code{hybrid} offers the correct interpretation if +the goal of the analysis is to assess the result of a single, unspecified, +model. + +\code{hybrid} is generally computationally less expensive then \code{ensemble}, which +in turn is somewhat less expensive than \code{model}. + +A non-default \code{detail_level} parameter can be specified for separate +evaluation steps by providing a parameter value in a named list with data +elements, e.g. \code{list("auc_data"="ensemble", "model_performance"="hybrid")}. +This parameter can be set for the following data elements: \code{auc_data}, +\code{decision_curve_analyis}, \code{model_performance}, \code{permutation_vimp}, +\code{ice_data}, \code{prediction_data} and \code{confusion_matrix}.} + \item{\code{confidence_level}}{(\emph{optional}) Numeric value for the level at which +confidence intervals are determined. In the case bootstraps are used to +determine the confidence intervals bootstrap estimation, \code{familiar} uses the +rule of thumb \eqn{n = 20 / ci.level} to determine the number of required +bootstraps. + +The default value is \code{0.95}.} + \item{\code{familiar_data_names}}{Names of the dataset(s). Only used if the \code{object} parameter +is one or more \code{familiarData} objects.} + \item{\code{collection_name}}{Name of the collection.} + }} +} +\value{ +A list of data.tables (if \code{dir_path} is not provided), or nothing, as +all data is exported to \code{csv} files. +} +\description{ +Extract and export sample risk group stratification and +associated tests for data in a familiarCollection. +} +\details{ +Data is usually collected from a \code{familiarCollection} object. +However, you can also provide one or more \code{familiarData} objects, that will +be internally converted to a \code{familiarCollection} object. 
It is also +possible to provide a \code{familiarEnsemble} or one or more \code{familiarModel} +objects together with the data from which data is computed prior to export. +Paths to the previous files can also be provided. + +All parameters aside from \code{object} and \code{dir_path} are only used if \code{object} +is not a \code{familiarCollection} object, or a path to one. + +Three tables are exported in a list: +\itemize{ +\item \code{data}: Contains the assigned risk group for a given sample, along with +its reported survival time and censoring status. +\item \code{hr_ratio}: Contains the hazard ratio between different risk groups. +\item \code{logrank}: Contains the results from the logrank test between different +risk groups. +} +} diff --git a/man/export_risk_stratification_info-methods.Rd b/man/export_risk_stratification_info-methods.Rd new file mode 100644 index 00000000..9689dff7 --- /dev/null +++ b/man/export_risk_stratification_info-methods.Rd @@ -0,0 +1,94 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FamiliarDataComputationRiskStratificationInfo.R +\name{export_risk_stratification_info} +\alias{export_risk_stratification_info} +\alias{export_risk_stratification_info,familiarCollection-method} +\alias{export_risk_stratification_info,ANY-method} +\title{Extract and export cut-off values for risk group stratification.} +\usage{ +export_risk_stratification_info( + object, + dir_path = NULL, + aggregate_results = TRUE, + export_collection = FALSE, + ... +) + +\S4method{export_risk_stratification_info}{familiarCollection}( + object, + dir_path = NULL, + aggregate_results = TRUE, + export_collection = FALSE, + ... +) + +\S4method{export_risk_stratification_info}{ANY}( + object, + dir_path = NULL, + aggregate_results = TRUE, + export_collection = FALSE, + ... +) +} +\arguments{ +\item{object}{A \code{familiarCollection} object, or other other objects from which +a \code{familiarCollection} can be extracted. See details for more information.} + +\item{dir_path}{Path to folder where extracted data should be saved. \code{NULL} +will allow export as a structured list of data.tables.} + +\item{aggregate_results}{Flag that signifies whether results should be +aggregated for export.} + +\item{export_collection}{(\emph{optional}) Exports the collection if TRUE.} + +\item{...}{ + Arguments passed on to \code{\link[=as_familiar_collection]{as_familiar_collection}} + \describe{ + \item{\code{familiar_data_names}}{Names of the dataset(s). Only used if the \code{object} parameter +is one or more \code{familiarData} objects.} + \item{\code{collection_name}}{Name of the collection.} + }} +} +\value{ +A data.table (if \code{dir_path} is not provided), or nothing, as all data +is exported to \code{csv} files. +} +\description{ +Extract and export cut-off values for risk group stratification +by models in a familiarCollection. +} +\details{ +Data is usually collected from a \code{familiarCollection} object. +However, you can also provide one or more \code{familiarData} objects, that will +be internally converted to a \code{familiarCollection} object. It is also +possible to provide a \code{familiarEnsemble} or one or more \code{familiarModel} +objects together with the data from which data is computed prior to export. +Paths to the previous files can also be provided. + +All parameters aside from \code{object} and \code{dir_path} are only used if \code{object} +is not a \code{familiarCollection} object, or a path to one. 
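As a usage sketch for the two risk stratification exporters documented above
(\code{collection} stands for an existing \code{familiarCollection} object and
is assumed here purely for illustration):

\preformatted{
# Cut-off values used for risk group stratification; returned as a
# data.table because dir_path is NULL.
cut_offs <- export_risk_stratification_info(
  object = collection,
  dir_path = NULL,
  aggregate_results = TRUE
)

# Risk group assignments and associated tests; returned as a list
# containing the data, hr_ratio and logrank tables described above.
strata <- export_risk_stratification_data(
  object = collection,
  dir_path = NULL,
  export_strata = TRUE
)
}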
+ +Stratification cut-off values are determined when creating a model, using +one of several methods set by the \code{stratification_method} parameter. These +values are then used to stratify samples in any new dataset. The available +methods are: +\itemize{ +\item \code{median} (default): The median predicted value in the development cohort +is used to stratify the samples into two risk groups. +\item \code{fixed}: Samples are stratified based on the sample quantiles of the +predicted values. These quantiles are defined using the +\code{stratification_threshold} parameter. +\item \code{optimised}: Use maximally selected rank statistics to determine the +optimal threshold (Lausen and Schumacher, 1992; Hothorn et al., 2003) to +stratify samples into two optimally separated risk groups. +} +} +\references{ +\enumerate{ +\item Lausen, B. & Schumacher, M. Maximally Selected Rank Statistics. +Biometrics 48, 73 (1992). +\item Hothorn, T. & Lausen, B. On the exact distribution of maximally selected +rank statistics. Comput. Stat. Data Anal. 43, 121–137 (2003). +} +} diff --git a/man/export_sample_similarity-methods.Rd b/man/export_sample_similarity-methods.Rd new file mode 100644 index 00000000..387c7409 --- /dev/null +++ b/man/export_sample_similarity-methods.Rd @@ -0,0 +1,113 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FamiliarDataComputationSampleSimilarity.R +\name{export_sample_similarity} +\alias{export_sample_similarity} +\alias{export_sample_similarity,familiarCollection-method} +\alias{export_sample_similarity,ANY-method} +\title{Extract and export mutual correlation between features.} +\usage{ +export_sample_similarity( + object, + dir_path = NULL, + aggregate_results = TRUE, + sample_limit = waiver(), + sample_cluster_method = waiver(), + sample_linkage_method = waiver(), + export_dendrogram = FALSE, + export_collection = FALSE, + ... +) + +\S4method{export_sample_similarity}{familiarCollection}( + object, + dir_path = NULL, + aggregate_results = TRUE, + sample_limit = waiver(), + sample_cluster_method = waiver(), + sample_linkage_method = waiver(), + export_dendrogram = FALSE, + export_collection = FALSE, + ... +) + +\S4method{export_sample_similarity}{ANY}( + object, + dir_path = NULL, + aggregate_results = TRUE, + sample_limit = waiver(), + sample_cluster_method = waiver(), + sample_linkage_method = waiver(), + export_dendrogram = FALSE, + export_collection = FALSE, + ... +) +} +\arguments{ +\item{object}{A \code{familiarCollection} object, or other other objects from which +a \code{familiarCollection} can be extracted. See details for more information.} + +\item{dir_path}{Path to folder where extracted data should be saved. \code{NULL} +will allow export as a structured list of data.tables.} + +\item{aggregate_results}{Flag that signifies whether results should be +aggregated for export.} + +\item{sample_limit}{(\emph{optional}) Set the upper limit of the number of samples +that are used during evaluation steps. Cannot be less than 20. + +This setting can be specified per data element by providing a parameter +value in a named list with data elements, e.g. +\code{list("sample_similarity"=100, "permutation_vimp"=1000)}. + +This parameter can be set for the following data elements: +\code{sample_similarity} and \code{ice_data}.} + +\item{sample_cluster_method}{The method used to perform clustering based on +distance between samples. 
These are the same methods as for the +\code{cluster_method} configuration parameter: \code{hclust}, \code{agnes}, \code{diana} and +\code{pam}. + +\code{none} cannot be used when extracting data for feature expressions. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + +\item{sample_linkage_method}{The method used for agglomerative clustering in +\code{hclust} and \code{agnes}. These are the same methods as for the +\code{cluster_linkage_method} configuration parameter: \code{average}, \code{single}, +\code{complete}, \code{weighted}, and \code{ward}. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + +\item{export_dendrogram}{Add dendrogram in the data element objects.} + +\item{export_collection}{(\emph{optional}) Exports the collection if TRUE.} + +\item{...}{ + Arguments passed on to \code{\link[=as_familiar_collection]{as_familiar_collection}} + \describe{ + \item{\code{familiar_data_names}}{Names of the dataset(s). Only used if the \code{object} parameter +is one or more \code{familiarData} objects.} + \item{\code{collection_name}}{Name of the collection.} + }} +} +\value{ +A list containing a data.table (if \code{dir_path} is not provided), or +nothing, as all data is exported to \code{csv} files. +} +\description{ +Extract and export mutual correlation between features in a +familiarCollection. +} +\details{ +Data is usually collected from a \code{familiarCollection} object. +However, you can also provide one or more \code{familiarData} objects, that will +be internally converted to a \code{familiarCollection} object. It is also +possible to provide a \code{familiarEnsemble} or one or more \code{familiarModel} +objects together with the data from which data is computed prior to export. +Paths to the previous files can also be provided. + +All parameters aside from \code{object} and \code{dir_path} are only used if \code{object} +is not a \code{familiarCollection} object, or a path to one. +} diff --git a/man/export_univariate_analysis_data-methods.Rd b/man/export_univariate_analysis_data-methods.Rd new file mode 100644 index 00000000..23a34a6a --- /dev/null +++ b/man/export_univariate_analysis_data-methods.Rd @@ -0,0 +1,133 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FamiliarDataComputationUnivariateAnalysis.R +\name{export_univariate_analysis_data} +\alias{export_univariate_analysis_data} +\alias{export_univariate_analysis_data,familiarCollection-method} +\alias{export_univariate_analysis_data,ANY-method} +\title{Extract and export univariate analysis data of features.} +\usage{ +export_univariate_analysis_data( + object, + dir_path = NULL, + p_adjustment_method = waiver(), + export_collection = FALSE, + ... +) + +\S4method{export_univariate_analysis_data}{familiarCollection}( + object, + dir_path = NULL, + p_adjustment_method = waiver(), + export_collection = FALSE, + ... +) + +\S4method{export_univariate_analysis_data}{ANY}( + object, + dir_path = NULL, + p_adjustment_method = waiver(), + export_collection = FALSE, + ... +) +} +\arguments{ +\item{object}{A \code{familiarCollection} object, or other other objects from which +a \code{familiarCollection} can be extracted. See details for more information.} + +\item{dir_path}{Path to folder where extracted data should be saved. 
\code{NULL} +will allow export as a structured list of data.tables.} + +\item{p_adjustment_method}{(\emph{optional}) Indicates type of p-value that is +shown. One of \code{holm}, \code{hochberg}, \code{hommel}, \code{bonferroni}, \code{BH}, \code{BY}, \code{fdr}, +\code{none}, \code{p_value} or \code{q_value} for adjusted p-values, uncorrected p-values +and q-values. q-values may not be available.} + +\item{export_collection}{(\emph{optional}) Exports the collection if TRUE.} + +\item{...}{ + Arguments passed on to \code{\link[=extract_univariate_analysis]{extract_univariate_analysis}}, \code{\link[=as_familiar_collection]{as_familiar_collection}} + \describe{ + \item{\code{data}}{A \code{dataObject} object, \code{data.table} or \code{data.frame} that +constitutes the data that are assessed.} + \item{\code{cl}}{Cluster created using the \code{parallel} package. This cluster is then +used to speed up computation through parallellisation.} + \item{\code{feature_cluster_method}}{The method used to perform clustering. These are +the same methods as for the \code{cluster_method} configuration parameter: +\code{none}, \code{hclust}, \code{agnes}, \code{diana} and \code{pam}. + +\code{none} cannot be used when extracting data regarding mutual correlation or +feature expressions. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + \item{\code{feature_linkage_method}}{The method used for agglomerative clustering in +\code{hclust} and \code{agnes}. These are the same methods as for the +\code{cluster_linkage_method} configuration parameter: \code{average}, \code{single}, +\code{complete}, \code{weighted}, and \code{ward}. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + \item{\code{feature_cluster_cut_method}}{The method used to divide features into +separate clusters. The available methods are the same as for the +\code{cluster_cut_method} configuration parameter: \code{silhouette}, \code{fixed_cut} and +\code{dynamic_cut}. + +\code{silhouette} is available for all cluster methods, but \code{fixed_cut} only +applies to methods that create hierarchical trees (\code{hclust}, \code{agnes} and +\code{diana}). \code{dynamic_cut} requires the \code{dynamicTreeCut} package and can only +be used with \code{agnes} and \code{hclust}. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + \item{\code{feature_similarity_threshold}}{The threshold level for pair-wise +similarity that is required to form feature clusters with the \code{fixed_cut} +method. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + \item{\code{feature_similarity_metric}}{Metric to determine pairwise similarity +between features. Similarity is computed in the same manner as for +clustering, and \code{feature_similarity_metric} therefore has the same options +as \code{cluster_similarity_metric}: \code{mcfadden_r2}, \code{cox_snell_r2}, +\code{nagelkerke_r2}, \code{spearman}, \code{kendall} and \code{pearson}. 
+ +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + \item{\code{icc_type}}{String indicating the type of intraclass correlation +coefficient (\code{1}, \code{2} or \code{3}) that should be used to compute robustness for +features in repeated measurements during the evaluation of univariate +importance. These types correspond to the types in Shrout and Fleiss (1979). +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + \item{\code{verbose}}{Flag to indicate whether feedback should be provided on the +computation and extraction of various data elements.} + \item{\code{message_indent}}{Number of indentation steps for messages shown during +computation and extraction of various data elements.} + \item{\code{familiar_data_names}}{Names of the dataset(s). Only used if the \code{object} parameter +is one or more \code{familiarData} objects.} + \item{\code{collection_name}}{Name of the collection.} + }} +} +\value{ +A data.table (if \code{dir_path} is not provided), or nothing, as +all data is exported to \code{csv} files. +} +\description{ +Extract and export univariate analysis data of features for data +in a familiarCollection. +} +\details{ +Data is usually collected from a \code{familiarCollection} object. +However, you can also provide one or more \code{familiarData} objects, that will +be internally converted to a \code{familiarCollection} object. It is also +possible to provide a \code{familiarEnsemble} or one or more \code{familiarModel} +objects together with the data from which data is computed prior to export. +Paths to the previous files can also be provided. + +All parameters aside from \code{object} and \code{dir_path} are only used if \code{object} +is not a \code{familiarCollection} object, or a path to one. + +Univariate analysis includes the computation of p and q-values, as well as +robustness (in case of repeated measurements). p-values are derived from +Wald's test. +} diff --git a/man/extract_auc_data.Rd b/man/extract_auc_data.Rd new file mode 100644 index 00000000..ce404ed2 --- /dev/null +++ b/man/extract_auc_data.Rd @@ -0,0 +1,172 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FamiliarDataComputationAUCCurves.R +\name{extract_auc_data} +\alias{extract_auc_data} +\title{Internal function to extract area under the ROC curve information.} +\usage{ +extract_auc_data( + object, + data, + cl = NULL, + ensemble_method = waiver(), + detail_level = waiver(), + estimation_type = waiver(), + aggregate_results = waiver(), + confidence_level = waiver(), + bootstrap_ci_method = waiver(), + is_pre_processed = FALSE, + message_indent = 0L, + verbose = FALSE, + ... +) +} +\arguments{ +\item{object}{A \code{familiarEnsemble} object, which is an ensemble of one or more +\code{familiarModel} objects.} + +\item{data}{A \code{dataObject} object, \code{data.table} or \code{data.frame} that +constitutes the data that are assessed.} + +\item{cl}{Cluster created using the \code{parallel} package. This cluster is then +used to speed up computation through parallellisation.} + +\item{ensemble_method}{Method for ensembling predictions from models for the +same sample. Available methods are: +\itemize{ +\item \code{median} (default): Use the median of the predicted values as the ensemble +value for a sample. +\item \code{mean}: Use the mean of the predicted values as the ensemble value for a +sample. 
+}} + +\item{detail_level}{(\emph{optional}) Sets the level at which results are computed +and aggregated. +\itemize{ +\item \code{ensemble}: Results are computed at the ensemble level, i.e. over all +models in the ensemble. This means that, for example, bias-corrected +estimates of model performance are assessed by creating (at least) 20 +bootstraps and computing the model performance of the ensemble model for +each bootstrap. +\item \code{hybrid} (default): Results are computed at the level of models in an +ensemble. This means that, for example, bias-corrected estimates of model +performance are directly computed using the models in the ensemble. If there +are at least 20 trained models in the ensemble, performance is computed for +each model, in contrast to \code{ensemble} where performance is computed for the +ensemble of models. If there are less than 20 trained models in the +ensemble, bootstraps are created so that at least 20 point estimates can be +made. +\item \code{model}: Results are computed at the model level. This means that, for +example, bias-corrected estimates of model performance are assessed by +creating (at least) 20 bootstraps and computing the performance of the model +for each bootstrap. +} + +Note that each level of detail has a different interpretation for bootstrap +confidence intervals. For \code{ensemble} and \code{model} these are the confidence +intervals for the ensemble and an individual model, respectively. That is, +the confidence interval describes the range where an estimate produced by a +respective ensemble or model trained on a repeat of the experiment may be +found with the probability of the confidence level. For \code{hybrid}, it +represents the range where any single model trained on a repeat of the +experiment may be found with the probability of the confidence level. By +definition, confidence intervals obtained using \code{hybrid} are at least as +wide as those for \code{ensemble}. \code{hybrid} offers the correct interpretation if +the goal of the analysis is to assess the result of a single, unspecified, +model. + +\code{hybrid} is generally computationally less expensive then \code{ensemble}, which +in turn is somewhat less expensive than \code{model}. + +A non-default \code{detail_level} parameter can be specified for separate +evaluation steps by providing a parameter value in a named list with data +elements, e.g. \code{list("auc_data"="ensemble", "model_performance"="hybrid")}. +This parameter can be set for the following data elements: \code{auc_data}, +\code{decision_curve_analyis}, \code{model_performance}, \code{permutation_vimp}, +\code{ice_data}, \code{prediction_data} and \code{confusion_matrix}.} + +\item{estimation_type}{(\emph{optional}) Sets the type of estimation that should be +possible. This has the following options: +\itemize{ +\item \code{point}: Point estimates. +\item \code{bias_correction} or \code{bc}: Bias-corrected estimates. A bias-corrected +estimate is computed from (at least) 20 point estimates, and \code{familiar} may +bootstrap the data to create them. +\item \code{bootstrap_confidence_interval} or \code{bci} (default): Bias-corrected +estimates with bootstrap confidence intervals (Efron and Hastie, 2016). The +number of point estimates required depends on the \code{confidence_level} +parameter, and \code{familiar} may bootstrap the data to create them. 
+}
+
+As with \code{detail_level}, a non-default \code{estimation_type} parameter can be
+specified for separate evaluation steps by providing a parameter value in a
+named list with data elements, e.g. \code{list("auc_data"="bci", "model_performance"="point")}. This parameter can be set for the following
+data elements: \code{auc_data}, \code{decision_curve_analyis}, \code{model_performance},
+\code{permutation_vimp}, \code{ice_data}, and \code{prediction_data}.}
+
+\item{aggregate_results}{(\emph{optional}) Flag that signifies whether results
+should be aggregated during evaluation. If \code{estimation_type} is
+\code{bias_correction} or \code{bc}, aggregation leads to a single bias-corrected
+estimate. If \code{estimation_type} is \code{bootstrap_confidence_interval} or \code{bci},
+aggregation leads to a single bias-corrected estimate with lower and upper
+boundaries of the confidence interval. This has no effect if
+\code{estimation_type} is \code{point}.
+
+The default value is \code{TRUE}, except when assessing model performance
+metrics, as the default violin plot requires the underlying data.
+
+As with \code{detail_level} and \code{estimation_type}, a non-default
+\code{aggregate_results} parameter can be specified for separate evaluation steps
+by providing a parameter value in a named list with data elements, e.g.
+\code{list("auc_data"=TRUE, "model_performance"=FALSE)}. This parameter exists
+for the same elements as \code{estimation_type}.}
+
+\item{confidence_level}{(\emph{optional}) Numeric value for the level at which
+confidence intervals are determined. In the case bootstraps are used to
+determine the confidence intervals, \code{familiar} uses the rule of thumb
+\eqn{n = 20 / ci.level} to determine the number of required bootstraps.
+
+The default value is \code{0.95}.}
+
+\item{bootstrap_ci_method}{(\emph{optional}) Method used to determine bootstrap
+confidence intervals (Efron and Hastie, 2016). The following methods are
+implemented:
+\itemize{
+\item \code{percentile} (default): Confidence intervals obtained using the percentile
+method.
+\item \code{bc}: Bias-corrected confidence intervals.
+}
+
+Note that the standard method is not implemented because this method is
+often not suitable due to non-normal distributions. The bias-corrected and
+accelerated (BCa) method is not implemented yet.}
+
+\item{is_pre_processed}{Flag that indicates whether the data was already
+pre-processed externally, e.g. normalised and clustered. Only used if the
+\code{data} argument is a \code{data.table} or \code{data.frame}.}
+
+\item{message_indent}{Number of indentation steps for messages shown during
+computation and extraction of various data elements.}
+
+\item{verbose}{Flag to indicate whether feedback should be provided on the
+computation and extraction of various data elements.}
+
+\item{...}{Unused arguments.}
+}
+\value{
+A list with data.tables for single and ensemble model ROC curve data.
+}
+\description{
+Computes the ROC curve from a \code{familiarEnsemble}.
+}
+\details{
+This function also computes credibility intervals for the ROC curve
+for the ensemble model, at the level of \code{confidence_level}. In the case of
+multinomial outcomes, an AUC curve is computed per class in a
+one-against-all fashion.
+
+To allow plotting of multiple AUC curves in the same plot and the use of
+ensemble models, the AUC curve is evaluated at 0.01 (1-specificity) intervals.
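To make the \code{percentile} option of \code{bootstrap_ci_method} concrete,
the following is a minimal, standalone R sketch of a percentile bootstrap
confidence interval; it is not the internal implementation used by
\code{familiar}, and the simulated estimates are purely illustrative.

\preformatted{
set.seed(1)

# Hypothetical bootstrap point estimates of a performance metric.
bootstrap_estimates <- rnorm(n = 200, mean = 0.80, sd = 0.05)

# Percentile method: take empirical quantiles of the bootstrap
# distribution at the tail probabilities implied by the confidence level.
confidence_level <- 0.95
alpha <- 1 - confidence_level
ci <- quantile(bootstrap_estimates, probs = c(alpha / 2, 1 - alpha / 2))
ci
}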
+} +\keyword{internal} diff --git a/man/extract_calibration_data.Rd b/man/extract_calibration_data.Rd new file mode 100644 index 00000000..c37d1c26 --- /dev/null +++ b/man/extract_calibration_data.Rd @@ -0,0 +1,176 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FamiliarDataComputationCalibrationData.R +\name{extract_calibration_data} +\alias{extract_calibration_data} +\title{Internal function to extract calibration data.} +\usage{ +extract_calibration_data( + object, + data, + cl = NULL, + ensemble_method = waiver(), + evaluation_times = waiver(), + detail_level = waiver(), + estimation_type = waiver(), + aggregate_results = waiver(), + confidence_level = waiver(), + bootstrap_ci_method = waiver(), + is_pre_processed = FALSE, + message_indent = 0L, + verbose = FALSE, + ... +) +} +\arguments{ +\item{object}{A \code{familiarEnsemble} object, which is an ensemble of one or more +\code{familiarModel} objects.} + +\item{data}{A \code{dataObject} object, \code{data.table} or \code{data.frame} that +constitutes the data that are assessed.} + +\item{cl}{Cluster created using the \code{parallel} package. This cluster is then +used to speed up computation through parallellisation.} + +\item{ensemble_method}{Method for ensembling predictions from models for the +same sample. Available methods are: +\itemize{ +\item \code{median} (default): Use the median of the predicted values as the ensemble +value for a sample. +\item \code{mean}: Use the mean of the predicted values as the ensemble value for a +sample. +}} + +\item{evaluation_times}{One or more time points that are used for in analysis of +survival problems when data has to be assessed at a set time, e.g. +calibration. If not provided explicitly, this parameter is read from +settings used at creation of the underlying \code{familiarModel} objects. Only +used for \code{survival} outcomes.} + +\item{detail_level}{(\emph{optional}) Sets the level at which results are computed +and aggregated. +\itemize{ +\item \code{ensemble}: Results are computed at the ensemble level, i.e. over all +models in the ensemble. This means that, for example, bias-corrected +estimates of model performance are assessed by creating (at least) 20 +bootstraps and computing the model performance of the ensemble model for +each bootstrap. +\item \code{hybrid} (default): Results are computed at the level of models in an +ensemble. This means that, for example, bias-corrected estimates of model +performance are directly computed using the models in the ensemble. If there +are at least 20 trained models in the ensemble, performance is computed for +each model, in contrast to \code{ensemble} where performance is computed for the +ensemble of models. If there are less than 20 trained models in the +ensemble, bootstraps are created so that at least 20 point estimates can be +made. +\item \code{model}: Results are computed at the model level. This means that, for +example, bias-corrected estimates of model performance are assessed by +creating (at least) 20 bootstraps and computing the performance of the model +for each bootstrap. +} + +Note that each level of detail has a different interpretation for bootstrap +confidence intervals. For \code{ensemble} and \code{model} these are the confidence +intervals for the ensemble and an individual model, respectively. 
That is, +the confidence interval describes the range where an estimate produced by a +respective ensemble or model trained on a repeat of the experiment may be +found with the probability of the confidence level. For \code{hybrid}, it +represents the range where any single model trained on a repeat of the +experiment may be found with the probability of the confidence level. By +definition, confidence intervals obtained using \code{hybrid} are at least as +wide as those for \code{ensemble}. \code{hybrid} offers the correct interpretation if +the goal of the analysis is to assess the result of a single, unspecified, +model. + +\code{hybrid} is generally computationally less expensive then \code{ensemble}, which +in turn is somewhat less expensive than \code{model}. + +A non-default \code{detail_level} parameter can be specified for separate +evaluation steps by providing a parameter value in a named list with data +elements, e.g. \code{list("auc_data"="ensemble", "model_performance"="hybrid")}. +This parameter can be set for the following data elements: \code{auc_data}, +\code{decision_curve_analyis}, \code{model_performance}, \code{permutation_vimp}, +\code{ice_data}, \code{prediction_data} and \code{confusion_matrix}.} + +\item{estimation_type}{(\emph{optional}) Sets the type of estimation that should be +possible. This has the following options: +\itemize{ +\item \code{point}: Point estimates. +\item \code{bias_correction} or \code{bc}: Bias-corrected estimates. A bias-corrected +estimate is computed from (at least) 20 point estimates, and \code{familiar} may +bootstrap the data to create them. +\item \code{bootstrap_confidence_interval} or \code{bci} (default): Bias-corrected +estimates with bootstrap confidence intervals (Efron and Hastie, 2016). The +number of point estimates required depends on the \code{confidence_level} +parameter, and \code{familiar} may bootstrap the data to create them. +} + +As with \code{detail_level}, a non-default \code{estimation_type} parameter can be +specified for separate evaluation steps by providing a parameter value in a +named list with data elements, e.g. \code{list("auc_data"="bci", "model_performance"="point")}. This parameter can be set for the following +data elements: \code{auc_data}, \code{decision_curve_analyis}, \code{model_performance}, +\code{permutation_vimp}, \code{ice_data}, and \code{prediction_data}.} + +\item{aggregate_results}{(\emph{optional}) Flag that signifies whether results +should be aggregated during evaluation. If \code{estimation_type} is +\code{bias_correction} or \code{bc}, aggregation leads to a single bias-corrected +estimate. If \code{estimation_type} is \code{bootstrap_confidence_interval} or \code{bci}, +aggregation leads to a single bias-corrected estimate with lower and upper +boundaries of the confidence interval. This has no effect if +\code{estimation_type} is \code{point}. + +The default value is equal to \code{TRUE} except when assessing metrics to assess +model performance, as the default violin plot requires underlying data. + +As with \code{detail_level} and \code{estimation_type}, a non-default +\code{aggregate_results} parameter can be specified for separate evaluation steps +by providing a parameter value in a named list with data elements, e.g. +\code{list("auc_data"=TRUE, , "model_performance"=FALSE)}. This parameter exists +for the same elements as \code{estimation_type}.} + +\item{confidence_level}{(\emph{optional}) Numeric value for the level at which +confidence intervals are determined. 
In the case bootstraps are used to +determine the confidence intervals bootstrap estimation, \code{familiar} uses the +rule of thumb \eqn{n = 20 / ci.level} to determine the number of required +bootstraps. + +The default value is \code{0.95}.} + +\item{bootstrap_ci_method}{(\emph{optional}) Method used to determine bootstrap +confidence intervals (Efron and Hastie, 2016). The following methods are +implemented: +\itemize{ +\item \code{percentile} (default): Confidence intervals obtained using the percentile +method. +\item \code{bc}: Bias-corrected confidence intervals. +} + +Note that the standard method is not implemented because this method is +often not suitable due to non-normal distributions. The bias-corrected and +accelerated (BCa) method is not implemented yet.} + +\item{is_pre_processed}{Flag that indicates whether the data was already +pre-processed externally, e.g. normalised and clustered. Only used if the +\code{data} argument is a \code{data.table} or \code{data.frame}.} + +\item{message_indent}{Number of indentation steps for messages shown during +computation and extraction of various data elements.} + +\item{verbose}{Flag to indicate whether feedback should be provided on the +computation and extraction of various data elements.} + +\item{...}{Unused arguments.} +} +\value{ +A list with data.tables containing calibration test information for +the ensemble model. +} +\description{ +Computes calibration data from a \code{familiarEnsemble} object. +Calibration tests are performed based on expected (predicted) and observed +outcomes. For all outcomes, calibration-at-the-large and calibration slopes +are determined. Furthermore, for all but survival outcomes, a repeated, +randomised grouping Hosmer-Lemeshow test is performed. For survival +outcomes, the Nam-D'Agostino and Greenwood-Nam-D'Agostino tests are +performed. +} +\keyword{internal} diff --git a/man/extract_calibration_info.Rd b/man/extract_calibration_info.Rd new file mode 100644 index 00000000..82d7ec66 --- /dev/null +++ b/man/extract_calibration_info.Rd @@ -0,0 +1,78 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FamiliarDataComputationCalibrationInfo.R +\name{extract_calibration_info} +\alias{extract_calibration_info} +\title{Internal function to extract calibration info from data.} +\usage{ +extract_calibration_info( + object, + detail_level = waiver(), + message_indent = 0L, + verbose = FALSE, + ... +) +} +\arguments{ +\item{object}{A \code{familiarEnsemble} object, which is an ensemble of one or more +\code{familiarModel} objects.} + +\item{detail_level}{(\emph{optional}) Sets the level at which results are computed +and aggregated. +\itemize{ +\item \code{ensemble}: Results are computed at the ensemble level, i.e. over all +models in the ensemble. This means that, for example, bias-corrected +estimates of model performance are assessed by creating (at least) 20 +bootstraps and computing the model performance of the ensemble model for +each bootstrap. +\item \code{hybrid} (default): Results are computed at the level of models in an +ensemble. This means that, for example, bias-corrected estimates of model +performance are directly computed using the models in the ensemble. If there +are at least 20 trained models in the ensemble, performance is computed for +each model, in contrast to \code{ensemble} where performance is computed for the +ensemble of models. 
If there are less than 20 trained models in the
+ensemble, bootstraps are created so that at least 20 point estimates can be
+made.
+\item \code{model}: Results are computed at the model level. This means that, for
+example, bias-corrected estimates of model performance are assessed by
+creating (at least) 20 bootstraps and computing the performance of the model
+for each bootstrap.
+}
+
+Note that each level of detail has a different interpretation for bootstrap
+confidence intervals. For \code{ensemble} and \code{model} these are the confidence
+intervals for the ensemble and an individual model, respectively. That is,
+the confidence interval describes the range where an estimate produced by a
+respective ensemble or model trained on a repeat of the experiment may be
+found with the probability of the confidence level. For \code{hybrid}, it
+represents the range where any single model trained on a repeat of the
+experiment may be found with the probability of the confidence level. By
+definition, confidence intervals obtained using \code{hybrid} are at least as
+wide as those for \code{ensemble}. \code{hybrid} offers the correct interpretation if
+the goal of the analysis is to assess the result of a single, unspecified,
+model.
+
+\code{hybrid} is generally computationally less expensive than \code{ensemble}, which
+in turn is somewhat less expensive than \code{model}.
+
+A non-default \code{detail_level} parameter can be specified for separate
+evaluation steps by providing a parameter value in a named list with data
+elements, e.g. \code{list("auc_data"="ensemble", "model_performance"="hybrid")}.
+This parameter can be set for the following data elements: \code{auc_data},
+\code{decision_curve_analyis}, \code{model_performance}, \code{permutation_vimp},
+\code{ice_data}, \code{prediction_data} and \code{confusion_matrix}.}
+
+\item{message_indent}{Number of indentation steps for messages shown during
+computation and extraction of various data elements.}
+
+\item{verbose}{Flag to indicate whether feedback should be provided on the
+computation and extraction of various data elements.}
+
+\item{...}{Unused arguments.}
+}
+\value{
+A list of familiarDataElements with calibration information.
+}
+\description{
+Collects calibration information, e.g. baseline survival, from the models in
+a \code{familiarEnsemble} object.
+}
+\keyword{internal} diff --git a/man/extract_confusion_matrix.Rd b/man/extract_confusion_matrix.Rd new file mode 100644 index 00000000..d0e6b42c --- /dev/null +++ b/man/extract_confusion_matrix.Rd @@ -0,0 +1,103 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/FamiliarDataComputationConfusionMatrix.R
+\name{extract_confusion_matrix}
+\alias{extract_confusion_matrix}
+\title{Internal function to extract the confusion matrix.}
+\usage{
+extract_confusion_matrix(
+  object,
+  data,
+  cl = NULL,
+  ensemble_method = waiver(),
+  detail_level = waiver(),
+  is_pre_processed = FALSE,
+  message_indent = 0L,
+  verbose = FALSE,
+  ...
+)
+}
+\arguments{
+\item{object}{A \code{familiarEnsemble} object, which is an ensemble of one or more
+\code{familiarModel} objects.}
+
+\item{data}{A \code{dataObject} object, \code{data.table} or \code{data.frame} that
+constitutes the data that are assessed.}
+
+\item{cl}{Cluster created using the \code{parallel} package. This cluster is then
+used to speed up computation through parallelisation.}
+
+\item{ensemble_method}{Method for ensembling predictions from models for the
+same sample.
Available methods are: +\itemize{ +\item \code{median} (default): Use the median of the predicted values as the ensemble +value for a sample. +\item \code{mean}: Use the mean of the predicted values as the ensemble value for a +sample. +}} + +\item{detail_level}{(\emph{optional}) Sets the level at which results are computed +and aggregated. +\itemize{ +\item \code{ensemble}: Results are computed at the ensemble level, i.e. over all +models in the ensemble. This means that, for example, bias-corrected +estimates of model performance are assessed by creating (at least) 20 +bootstraps and computing the model performance of the ensemble model for +each bootstrap. +\item \code{hybrid} (default): Results are computed at the level of models in an +ensemble. This means that, for example, bias-corrected estimates of model +performance are directly computed using the models in the ensemble. If there +are at least 20 trained models in the ensemble, performance is computed for +each model, in contrast to \code{ensemble} where performance is computed for the +ensemble of models. If there are less than 20 trained models in the +ensemble, bootstraps are created so that at least 20 point estimates can be +made. +\item \code{model}: Results are computed at the model level. This means that, for +example, bias-corrected estimates of model performance are assessed by +creating (at least) 20 bootstraps and computing the performance of the model +for each bootstrap. +} + +Note that each level of detail has a different interpretation for bootstrap +confidence intervals. For \code{ensemble} and \code{model} these are the confidence +intervals for the ensemble and an individual model, respectively. That is, +the confidence interval describes the range where an estimate produced by a +respective ensemble or model trained on a repeat of the experiment may be +found with the probability of the confidence level. For \code{hybrid}, it +represents the range where any single model trained on a repeat of the +experiment may be found with the probability of the confidence level. By +definition, confidence intervals obtained using \code{hybrid} are at least as +wide as those for \code{ensemble}. \code{hybrid} offers the correct interpretation if +the goal of the analysis is to assess the result of a single, unspecified, +model. + +\code{hybrid} is generally computationally less expensive then \code{ensemble}, which +in turn is somewhat less expensive than \code{model}. + +A non-default \code{detail_level} parameter can be specified for separate +evaluation steps by providing a parameter value in a named list with data +elements, e.g. \code{list("auc_data"="ensemble", "model_performance"="hybrid")}. +This parameter can be set for the following data elements: \code{auc_data}, +\code{decision_curve_analyis}, \code{model_performance}, \code{permutation_vimp}, +\code{ice_data}, \code{prediction_data} and \code{confusion_matrix}.} + +\item{is_pre_processed}{Flag that indicates whether the data was already +pre-processed externally, e.g. normalised and clustered. 
Only used if the +\code{data} argument is a \code{data.table} or \code{data.frame}.} + +\item{message_indent}{Number of indentation steps for messages shown during +computation and extraction of various data elements.} + +\item{verbose}{Flag to indicate whether feedback should be provided on the +computation and extraction of various data elements.} + +\item{...}{Unused arguments.} +} +\value{ +A data.table containing predicted and observed outcome data together +with a co-occurence count. +} +\description{ +Computes and extracts the confusion matrix for predicted and +observed categorical outcomes used in a \code{familiarEnsemble} object. +} +\keyword{internal} diff --git a/man/extract_data.Rd b/man/extract_data.Rd new file mode 100644 index 00000000..f6638fba --- /dev/null +++ b/man/extract_data.Rd @@ -0,0 +1,368 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FamiliarDataComputation.R +\name{extract_data} +\alias{extract_data} +\title{Internal function to create a familiarData object.} +\usage{ +extract_data( + object, + data, + data_element = waiver(), + is_pre_processed = FALSE, + cl = NULL, + time_max = waiver(), + aggregation_method = waiver(), + rank_threshold = waiver(), + ensemble_method = waiver(), + stratification_method = waiver(), + evaluation_times = waiver(), + metric = waiver(), + feature_cluster_method = waiver(), + feature_cluster_cut_method = waiver(), + feature_linkage_method = waiver(), + feature_similarity_metric = waiver(), + feature_similarity_threshold = waiver(), + sample_cluster_method = waiver(), + sample_linkage_method = waiver(), + sample_similarity_metric = waiver(), + sample_limit = waiver(), + detail_level = waiver(), + estimation_type = waiver(), + aggregate_results = waiver(), + confidence_level = waiver(), + bootstrap_ci_method = waiver(), + icc_type = waiver(), + dynamic_model_loading = FALSE, + message_indent = 0L, + verbose = FALSE, + ... +) +} +\arguments{ +\item{object}{A \code{familiarEnsemble} object, which is an ensemble of one or more +\code{familiarModel} objects.} + +\item{data}{A \code{dataObject} object, \code{data.table} or \code{data.frame} that +constitutes the data that are assessed.} + +\item{data_element}{String indicating which data elements are to be extracted. +Default is \code{all}, but specific elements can be specified to speed up +computations if not all elements are to be computed. This is an internal +parameter that is set by, e.g. the \code{export_model_vimp} method.} + +\item{is_pre_processed}{Flag that indicates whether the data was already +pre-processed externally, e.g. normalised and clustered. Only used if the +\code{data} argument is a \code{data.table} or \code{data.frame}.} + +\item{cl}{Cluster created using the \code{parallel} package. This cluster is then +used to speed up computation through parallellisation.} + +\item{time_max}{Time point which is used as the benchmark for e.g. cumulative +risks generated by random forest, or the cut-off value for Uno's concordance +index. If not provided explicitly, this parameter is read from settings used +at creation of the underlying \code{familiarModel} objects. Only used for +\code{survival} outcomes.} + +\item{aggregation_method}{Method for aggregating variable importances for the +purpose of evaluation. Variable importances are determined during feature +selection steps and after training the model. Both types are evaluated, but +feature selection variable importance is only evaluated at run-time. 
+ +See the documentation for the \code{vimp_aggregation_method} argument in +\code{summon_familiar} for information concerning the different available +methods. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + +\item{rank_threshold}{The threshold used to define the subset of highly +important features during evaluation. + +See the documentation for the \code{vimp_aggregation_rank_threshold} argument in +\code{summon_familiar} for more information. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + +\item{ensemble_method}{Method for ensembling predictions from models for the +same sample. Available methods are: +\itemize{ +\item \code{median} (default): Use the median of the predicted values as the ensemble +value for a sample. +\item \code{mean}: Use the mean of the predicted values as the ensemble value for a +sample. +}} + +\item{stratification_method}{(\emph{optional}) Method for determining the +stratification threshold for creating survival groups. The actual, +model-dependent, threshold value is obtained from the development data, and +can afterwards be used to perform stratification on validation data. + +The following stratification methods are available: +\itemize{ +\item \code{median} (default): The median predicted value in the development cohort +is used to stratify the samples into two risk groups. For predicted outcome +values that build a continuous spectrum, the two risk groups in the +development cohort will be roughly equal in size. +\item \code{mean}: The mean predicted value in the development cohort is used to +stratify the samples into two risk groups. +\item \code{mean_trim}: As \code{mean}, but based on the set of predicted values +where the 5\% lowest and 5\% highest values are discarded. This reduces the +effect of outliers. +\item \code{mean_winsor}: As \code{mean}, but based on the set of predicted values where +the 5\% lowest and 5\% highest values are winsorised. This reduces the effect +of outliers. +\item \code{fixed}: Samples are stratified based on the sample quantiles of the +predicted values. These quantiles are defined using the +\code{stratification_threshold} parameter. +\item \code{optimised}: Use maximally selected rank statistics to determine the +optimal threshold (Lausen and Schumacher, 1992; Hothorn et al., 2003) to +stratify samples into two optimally separated risk groups. +} + +One or more stratification methods can be selected simultaneously. + +This parameter is only relevant for \code{survival} outcomes.} + +\item{evaluation_times}{One or more time points that are used for in analysis of +survival problems when data has to be assessed at a set time, e.g. +calibration. If not provided explicitly, this parameter is read from +settings used at creation of the underlying \code{familiarModel} objects. Only +used for \code{survival} outcomes.} + +\item{metric}{One or more metrics for assessing model performance. See the +vignette on performance metrics for the available metrics. If not provided +explicitly, this parameter is read from settings used at creation of the +underlying \code{familiarModel} objects.} + +\item{feature_cluster_method}{The method used to perform clustering. These are +the same methods as for the \code{cluster_method} configuration parameter: +\code{none}, \code{hclust}, \code{agnes}, \code{diana} and \code{pam}. 
+ +\code{none} cannot be used when extracting data regarding mutual correlation or +feature expressions. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + +\item{feature_cluster_cut_method}{The method used to divide features into +separate clusters. The available methods are the same as for the +\code{cluster_cut_method} configuration parameter: \code{silhouette}, \code{fixed_cut} and +\code{dynamic_cut}. + +\code{silhouette} is available for all cluster methods, but \code{fixed_cut} only +applies to methods that create hierarchical trees (\code{hclust}, \code{agnes} and +\code{diana}). \code{dynamic_cut} requires the \code{dynamicTreeCut} package and can only +be used with \code{agnes} and \code{hclust}. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + +\item{feature_linkage_method}{The method used for agglomerative clustering in +\code{hclust} and \code{agnes}. These are the same methods as for the +\code{cluster_linkage_method} configuration parameter: \code{average}, \code{single}, +\code{complete}, \code{weighted}, and \code{ward}. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + +\item{feature_similarity_metric}{Metric to determine pairwise similarity +between features. Similarity is computed in the same manner as for +clustering, and \code{feature_similarity_metric} therefore has the same options +as \code{cluster_similarity_metric}: \code{mcfadden_r2}, \code{cox_snell_r2}, +\code{nagelkerke_r2}, \code{spearman}, \code{kendall} and \code{pearson}. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + +\item{feature_similarity_threshold}{The threshold level for pair-wise +similarity that is required to form feature clusters with the \code{fixed_cut} +method. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + +\item{sample_cluster_method}{The method used to perform clustering based on +distance between samples. These are the same methods as for the +\code{cluster_method} configuration parameter: \code{hclust}, \code{agnes}, \code{diana} and +\code{pam}. + +\code{none} cannot be used when extracting data for feature expressions. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + +\item{sample_linkage_method}{The method used for agglomerative clustering in +\code{hclust} and \code{agnes}. These are the same methods as for the +\code{cluster_linkage_method} configuration parameter: \code{average}, \code{single}, +\code{complete}, \code{weighted}, and \code{ward}. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + +\item{sample_similarity_metric}{Metric to determine pairwise similarity +between samples. Similarity is computed in the same manner as for +clustering, but \code{sample_similarity_metric} has different options that are +better suited to computing distance between samples instead of between +features: \code{gower}, \code{euclidean}. + +The underlying feature data is scaled to the \eqn{[0, 1]} range (for +numerical features) using the feature values across the samples. 
The +normalisation parameters required can optionally be computed from feature +data with the outer 5\% (on both sides) of feature values trimmed or +winsorised. To do so append \verb{_trim} (trimming) or \verb{_winsor} (winsorising) to +the metric name. This reduces the effect of outliers somewhat. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + +\item{sample_limit}{(\emph{optional}) Set the upper limit of the number of samples +that are used during evaluation steps. Cannot be less than 20. + +This setting can be specified per data element by providing a parameter +value in a named list with data elements, e.g. +\code{list("sample_similarity"=100, "permutation_vimp"=1000)}. + +This parameter can be set for the following data elements: +\code{sample_similarity} and \code{ice_data}.} + +\item{detail_level}{(\emph{optional}) Sets the level at which results are computed +and aggregated. +\itemize{ +\item \code{ensemble}: Results are computed at the ensemble level, i.e. over all +models in the ensemble. This means that, for example, bias-corrected +estimates of model performance are assessed by creating (at least) 20 +bootstraps and computing the model performance of the ensemble model for +each bootstrap. +\item \code{hybrid} (default): Results are computed at the level of models in an +ensemble. This means that, for example, bias-corrected estimates of model +performance are directly computed using the models in the ensemble. If there +are at least 20 trained models in the ensemble, performance is computed for +each model, in contrast to \code{ensemble} where performance is computed for the +ensemble of models. If there are less than 20 trained models in the +ensemble, bootstraps are created so that at least 20 point estimates can be +made. +\item \code{model}: Results are computed at the model level. This means that, for +example, bias-corrected estimates of model performance are assessed by +creating (at least) 20 bootstraps and computing the performance of the model +for each bootstrap. +} + +Note that each level of detail has a different interpretation for bootstrap +confidence intervals. For \code{ensemble} and \code{model} these are the confidence +intervals for the ensemble and an individual model, respectively. That is, +the confidence interval describes the range where an estimate produced by a +respective ensemble or model trained on a repeat of the experiment may be +found with the probability of the confidence level. For \code{hybrid}, it +represents the range where any single model trained on a repeat of the +experiment may be found with the probability of the confidence level. By +definition, confidence intervals obtained using \code{hybrid} are at least as +wide as those for \code{ensemble}. \code{hybrid} offers the correct interpretation if +the goal of the analysis is to assess the result of a single, unspecified, +model. + +\code{hybrid} is generally computationally less expensive then \code{ensemble}, which +in turn is somewhat less expensive than \code{model}. + +A non-default \code{detail_level} parameter can be specified for separate +evaluation steps by providing a parameter value in a named list with data +elements, e.g. \code{list("auc_data"="ensemble", "model_performance"="hybrid")}. 
+This parameter can be set for the following data elements: \code{auc_data}, +\code{decision_curve_analyis}, \code{model_performance}, \code{permutation_vimp}, +\code{ice_data}, \code{prediction_data} and \code{confusion_matrix}.} + +\item{estimation_type}{(\emph{optional}) Sets the type of estimation that should be +possible. This has the following options: +\itemize{ +\item \code{point}: Point estimates. +\item \code{bias_correction} or \code{bc}: Bias-corrected estimates. A bias-corrected +estimate is computed from (at least) 20 point estimates, and \code{familiar} may +bootstrap the data to create them. +\item \code{bootstrap_confidence_interval} or \code{bci} (default): Bias-corrected +estimates with bootstrap confidence intervals (Efron and Hastie, 2016). The +number of point estimates required depends on the \code{confidence_level} +parameter, and \code{familiar} may bootstrap the data to create them. +} + +As with \code{detail_level}, a non-default \code{estimation_type} parameter can be +specified for separate evaluation steps by providing a parameter value in a +named list with data elements, e.g. \code{list("auc_data"="bci", "model_performance"="point")}. This parameter can be set for the following +data elements: \code{auc_data}, \code{decision_curve_analyis}, \code{model_performance}, +\code{permutation_vimp}, \code{ice_data}, and \code{prediction_data}.} + +\item{aggregate_results}{(\emph{optional}) Flag that signifies whether results +should be aggregated during evaluation. If \code{estimation_type} is +\code{bias_correction} or \code{bc}, aggregation leads to a single bias-corrected +estimate. If \code{estimation_type} is \code{bootstrap_confidence_interval} or \code{bci}, +aggregation leads to a single bias-corrected estimate with lower and upper +boundaries of the confidence interval. This has no effect if +\code{estimation_type} is \code{point}. + +The default value is \code{TRUE}, except when assessing model performance +metrics, as the default violin plot requires underlying data. + +As with \code{detail_level} and \code{estimation_type}, a non-default +\code{aggregate_results} parameter can be specified for separate evaluation steps +by providing a parameter value in a named list with data elements, e.g. +\code{list("auc_data"=TRUE, "model_performance"=FALSE)}. This parameter exists +for the same elements as \code{estimation_type}.} + +\item{confidence_level}{(\emph{optional}) Numeric value for the level at which +confidence intervals are determined. In case bootstraps are used to +determine the confidence intervals, \code{familiar} uses the +rule of thumb \eqn{n = 20 / (1 - ci.level)} to determine the number of required +bootstraps. + +The default value is \code{0.95}.} + +\item{bootstrap_ci_method}{(\emph{optional}) Method used to determine bootstrap +confidence intervals (Efron and Hastie, 2016). The following methods are +implemented: +\itemize{ +\item \code{percentile} (default): Confidence intervals obtained using the percentile +method. +\item \code{bc}: Bias-corrected confidence intervals. +} + +Note that the standard method is not implemented because this method is +often not suitable due to non-normal distributions.
The bias-corrected and +accelerated (BCa) method is not implemented yet.} + +\item{icc_type}{String indicating the type of intraclass correlation +coefficient (\code{1}, \code{2} or \code{3}) that should be used to compute robustness for +features in repeated measurements during the evaluation of univariate +importance. These types correspond to the types in Shrout and Fleiss (1979). +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + +\item{dynamic_model_loading}{(\emph{optional}) Enables dynamic loading of models +during the evaluation process, if \code{TRUE}. Defaults to \code{FALSE}. Dynamic +loading of models may reduce the overall memory footprint, at the cost of +increased disk or network IO. Models can only be dynamically loaded if they +are found at an accessible disk or network location. Setting this parameter +to \code{TRUE} may help if parallel processing causes out-of-memory issues during +evaluation.} + +\item{message_indent}{Number of indentation steps for messages shown during +computation and extraction of various data elements.} + +\item{verbose}{Flag to indicate whether feedback should be provided on the +computation and extraction of various data elements.} + +\item{...}{Unused arguments.} +} +\value{ +A \code{familiarData} object. +} +\description{ +Compute various data related to model performance and calibration +from the provided dataset and \code{familiarEnsemble} object and store it as a +\code{familiarData} object. +} +\references{ +\enumerate{ +\item Shrout, P. E. & Fleiss, J. L. Intraclass correlations: uses in +assessing rater reliability. Psychol. Bull. 86, 420–428 (1979). +} +} +\keyword{internal} diff --git a/man/extract_decision_curve_data.Rd b/man/extract_decision_curve_data.Rd new file mode 100644 index 00000000..22b09837 --- /dev/null +++ b/man/extract_decision_curve_data.Rd @@ -0,0 +1,176 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FamiliarDataComputationDecisionCurveAnalysis.R +\name{extract_decision_curve_data} +\alias{extract_decision_curve_data} +\title{Internal function to extract decision curve analysis data.} +\usage{ +extract_decision_curve_data( + object, + data, + cl = NULL, + ensemble_method = waiver(), + evaluation_times = waiver(), + detail_level = waiver(), + estimation_type = waiver(), + aggregate_results = waiver(), + confidence_level = waiver(), + bootstrap_ci_method = waiver(), + is_pre_processed = FALSE, + message_indent = 0L, + verbose = FALSE, + ... +) +} +\arguments{ +\item{object}{A \code{familiarEnsemble} object, which is an ensemble of one or more +\code{familiarModel} objects.} + +\item{data}{A \code{dataObject} object, \code{data.table} or \code{data.frame} that +constitutes the data that are assessed.} + +\item{cl}{Cluster created using the \code{parallel} package. This cluster is then +used to speed up computation through parallellisation.} + +\item{ensemble_method}{Method for ensembling predictions from models for the +same sample. Available methods are: +\itemize{ +\item \code{median} (default): Use the median of the predicted values as the ensemble +value for a sample. +\item \code{mean}: Use the mean of the predicted values as the ensemble value for a +sample. +}} + +\item{evaluation_times}{One or more time points that are used for in analysis of +survival problems when data has to be assessed at a set time, e.g. +calibration. 
If not provided explicitly, this parameter is read from +settings used at creation of the underlying \code{familiarModel} objects. Only +used for \code{survival} outcomes.} + +\item{detail_level}{(\emph{optional}) Sets the level at which results are computed +and aggregated. +\itemize{ +\item \code{ensemble}: Results are computed at the ensemble level, i.e. over all +models in the ensemble. This means that, for example, bias-corrected +estimates of model performance are assessed by creating (at least) 20 +bootstraps and computing the model performance of the ensemble model for +each bootstrap. +\item \code{hybrid} (default): Results are computed at the level of models in an +ensemble. This means that, for example, bias-corrected estimates of model +performance are directly computed using the models in the ensemble. If there +are at least 20 trained models in the ensemble, performance is computed for +each model, in contrast to \code{ensemble} where performance is computed for the +ensemble of models. If there are less than 20 trained models in the +ensemble, bootstraps are created so that at least 20 point estimates can be +made. +\item \code{model}: Results are computed at the model level. This means that, for +example, bias-corrected estimates of model performance are assessed by +creating (at least) 20 bootstraps and computing the performance of the model +for each bootstrap. +} + +Note that each level of detail has a different interpretation for bootstrap +confidence intervals. For \code{ensemble} and \code{model} these are the confidence +intervals for the ensemble and an individual model, respectively. That is, +the confidence interval describes the range where an estimate produced by a +respective ensemble or model trained on a repeat of the experiment may be +found with the probability of the confidence level. For \code{hybrid}, it +represents the range where any single model trained on a repeat of the +experiment may be found with the probability of the confidence level. By +definition, confidence intervals obtained using \code{hybrid} are at least as +wide as those for \code{ensemble}. \code{hybrid} offers the correct interpretation if +the goal of the analysis is to assess the result of a single, unspecified, +model. + +\code{hybrid} is generally computationally less expensive then \code{ensemble}, which +in turn is somewhat less expensive than \code{model}. + +A non-default \code{detail_level} parameter can be specified for separate +evaluation steps by providing a parameter value in a named list with data +elements, e.g. \code{list("auc_data"="ensemble", "model_performance"="hybrid")}. +This parameter can be set for the following data elements: \code{auc_data}, +\code{decision_curve_analyis}, \code{model_performance}, \code{permutation_vimp}, +\code{ice_data}, \code{prediction_data} and \code{confusion_matrix}.} + +\item{estimation_type}{(\emph{optional}) Sets the type of estimation that should be +possible. This has the following options: +\itemize{ +\item \code{point}: Point estimates. +\item \code{bias_correction} or \code{bc}: Bias-corrected estimates. A bias-corrected +estimate is computed from (at least) 20 point estimates, and \code{familiar} may +bootstrap the data to create them. +\item \code{bootstrap_confidence_interval} or \code{bci} (default): Bias-corrected +estimates with bootstrap confidence intervals (Efron and Hastie, 2016). 
The +number of point estimates required depends on the \code{confidence_level} +parameter, and \code{familiar} may bootstrap the data to create them. +} + +As with \code{detail_level}, a non-default \code{estimation_type} parameter can be +specified for separate evaluation steps by providing a parameter value in a +named list with data elements, e.g. \code{list("auc_data"="bci", "model_performance"="point")}. This parameter can be set for the following +data elements: \code{auc_data}, \code{decision_curve_analyis}, \code{model_performance}, +\code{permutation_vimp}, \code{ice_data}, and \code{prediction_data}.} + +\item{aggregate_results}{(\emph{optional}) Flag that signifies whether results +should be aggregated during evaluation. If \code{estimation_type} is +\code{bias_correction} or \code{bc}, aggregation leads to a single bias-corrected +estimate. If \code{estimation_type} is \code{bootstrap_confidence_interval} or \code{bci}, +aggregation leads to a single bias-corrected estimate with lower and upper +boundaries of the confidence interval. This has no effect if +\code{estimation_type} is \code{point}. + +The default value is \code{TRUE}, except when assessing model performance +metrics, as the default violin plot requires underlying data. + +As with \code{detail_level} and \code{estimation_type}, a non-default +\code{aggregate_results} parameter can be specified for separate evaluation steps +by providing a parameter value in a named list with data elements, e.g. +\code{list("auc_data"=TRUE, "model_performance"=FALSE)}. This parameter exists +for the same elements as \code{estimation_type}.} + +\item{confidence_level}{(\emph{optional}) Numeric value for the level at which +confidence intervals are determined. In case bootstraps are used to +determine the confidence intervals, \code{familiar} uses the +rule of thumb \eqn{n = 20 / (1 - ci.level)} to determine the number of required +bootstraps. + +The default value is \code{0.95}.} + +\item{bootstrap_ci_method}{(\emph{optional}) Method used to determine bootstrap +confidence intervals (Efron and Hastie, 2016). The following methods are +implemented: +\itemize{ +\item \code{percentile} (default): Confidence intervals obtained using the percentile +method. +\item \code{bc}: Bias-corrected confidence intervals. +} + +Note that the standard method is not implemented because this method is +often not suitable due to non-normal distributions. The bias-corrected and +accelerated (BCa) method is not implemented yet.} + +\item{is_pre_processed}{Flag that indicates whether the data was already +pre-processed externally, e.g. normalised and clustered. Only used if the +\code{data} argument is a \code{data.table} or \code{data.frame}.} + +\item{message_indent}{Number of indentation steps for messages shown during +computation and extraction of various data elements.} + +\item{verbose}{Flag to indicate whether feedback should be provided on the +computation and extraction of various data elements.} + +\item{...}{Unused arguments.} +} +\value{ +A list with data.tables containing calibration test information for +the ensemble model. +} +\description{ +Computes decision curve analysis data from a \code{familiarEnsemble} object. +Calibration tests are performed based on expected (predicted) and observed +outcomes. For all outcomes, calibration-in-the-large and calibration slopes +are determined. Furthermore, for all but survival outcomes, a repeated, +randomised grouping Hosmer-Lemeshow test is performed.
For survival +outcomes, the Nam-D'Agostino and Greenwood-Nam-D'Agostino tests are +performed. +} +\keyword{internal} diff --git a/man/extract_dispatcher-familiarEnsemble-familiarDataElement-method.Rd b/man/extract_dispatcher-familiarEnsemble-familiarDataElement-method.Rd new file mode 100644 index 00000000..492d6012 --- /dev/null +++ b/man/extract_dispatcher-familiarEnsemble-familiarDataElement-method.Rd @@ -0,0 +1,64 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FamiliarDataElement.R +\name{extract_dispatcher,familiarEnsemble,familiarDataElement-method} +\alias{extract_dispatcher,familiarEnsemble,familiarDataElement-method} +\title{Internal function to dispatch extraction functions.} +\usage{ +\S4method{extract_dispatcher}{familiarEnsemble,familiarDataElement}( + cl = NULL, + FUN, + object, + proto_data_element, + aggregate_results, + has_internal_bootstrap, + ..., + message_indent = 0L, + verbose = TRUE +) +} +\arguments{ +\item{cl}{Cluster created using the \code{parallel} package. This cluster is then +used to speed up computation through parallellisation.} + +\item{FUN}{Extraction function or method to which data and parameters are +dispatched.} + +\item{object}{A \code{familiarEnsemble} object.} + +\item{proto_data_element}{A \code{familiarDataElement} object, or an object that +inherits from it.} + +\item{aggregate_results}{A logical flag indicating whether results should be +aggregated.} + +\item{has_internal_bootstrap}{A logical flag that indicates whether \code{FUN} has +internal bootstrapping capabilities.} + +\item{...}{Unused arguments.} + +\item{message_indent}{Number of indentation steps for messages shown during +computation and extraction of various data elements.} + +\item{verbose}{Flag to indicate whether feedback should be provided on the +computation and extraction of various data elements.} +} +\value{ +A list of \code{familiarDataElement} objects. +} +\description{ +This function provides a unified access point to extraction +functions. Some of these functions require bootstrapping and result +aggregation, which are handled here. +} +\details{ +This function first determines how many data points need to be +evaluated to complete the desired estimation, i.e. 1 for point estimates, 20 +for bias-corrected estimates, and 20 / (1 - confidence level) for bootstrap +confidence intervals. + +Subsequently, we determine the number of models. This number is used to set +external or internal clusters, the number of bootstraps, and to evaluate +whether the estimation can be done in case \code{FUN} does not support +bootstrapping. +} +\keyword{internal} diff --git a/man/extract_experimental_setup.Rd b/man/extract_experimental_setup.Rd new file mode 100644 index 00000000..6e21bdf9 --- /dev/null +++ b/man/extract_experimental_setup.Rd @@ -0,0 +1,81 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ExperimentSetup.R +\name{extract_experimental_setup} +\alias{extract_experimental_setup} +\title{Parse experimental design} +\usage{ +extract_experimental_setup( + experimental_design, + file_dir, + message_indent = 0L, + verbose = TRUE +) +} +\arguments{ +\item{experimental_design}{(\strong{required}) Defines what the experiment looks +like, e.g. \code{cv(bt(fs,20)+mb,3,2)+ev} for 2 times repeated 3-fold +cross-validation with nested feature selection on 20 bootstraps and +model-building, and external validation. The basic workflow components are: +\itemize{ +\item \code{fs}: (required) feature selection step. 
+\item \code{mb}: (required) model building step. +\item \code{ev}: (optional) external validation. Note that internal validation due +to subsampling will always be conducted if the subsampling methods create +any validation data sets. +} + +The different components are linked using \code{+}. + +Different subsampling methods can be used in conjunction with the basic +workflow components: +\itemize{ +\item \code{bs(x,n)}: (stratified) .632 bootstrap, with \code{n} the number of +bootstraps. In contrast to \code{bt}, feature pre-processing parameters and +hyperparameter optimisation are conducted on individual bootstraps. +\item \code{bt(x,n)}: (stratified) .632 bootstrap, with \code{n} the number of +bootstraps. Unlike \code{bs} and other subsampling methods, no separate +pre-processing parameters or optimised hyperparameters will be determined +for each bootstrap. +\item \code{cv(x,n,p)}: (stratified) \code{n}-fold cross-validation, repeated \code{p} times. +Pre-processing parameters are determined for each iteration. +\item \code{lv(x)}: leave-one-out-cross-validation. Pre-processing parameters are +determined for each iteration. +\item \code{ip(x)}: imbalance partitioning for addressing class imbalances on the +data set. Pre-processing parameters are determined for each partition. The +number of partitions generated depends on the imbalance correction method +(see the \code{imbalance_correction_method} parameter). Imbalance partitioning +does not generate validation sets. +} + +As shown in the example above, sampling algorithms can be nested. + +The simplest valid experimental design is \code{fs+mb}, which corresponds to a +TRIPOD type 1a analysis. Type 1b analyses are only possible using +bootstraps, e.g. \code{bt(fs+mb,100)}. Type 2a analyses can be conducted using +cross-validation, e.g. \code{cv(bt(fs,100)+mb,10,1)}. Depending on the origin of +the external validation data, designs such as \code{fs+mb+ev} or +\code{cv(bt(fs,100)+mb,10,1)+ev} constitute type 2b or type 3 analyses. Type 4 +analyses can be done by obtaining one or more \code{familiarModel} objects from +others and applying them to your own data set. + +Alternatively, the \code{experimental_design} parameter may be used to provide a +path to a file containing iterations, which is named \verb{####_iterations.RDS} +by convention. This path can be relative to the directory of the current +experiment (\code{experiment_dir}), or an absolute path. The absolute path may +thus also point to a file from a different experiment.} + +\item{message_indent}{Spacing inserted before messages.} + +\item{verbose}{Sets verbosity.} +} +\value{ +data.table with subsampler information at different levels of the +experimental design. 
+} +\description{ +Parse experimental design +} +\details{ +This function parses the \code{experimental_design} string and converts it +into a table with subsampler information. +} +\keyword{internal} diff --git a/man/extract_feature_expression.Rd b/man/extract_feature_expression.Rd new file mode 100644 index 00000000..6f3bfa1f --- /dev/null +++ b/man/extract_feature_expression.Rd @@ -0,0 +1,118 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FamiliarDataComputationFeatureExpression.R +\name{extract_feature_expression} +\alias{extract_feature_expression} +\title{Internal function to extract feature expressions.} +\usage{ +extract_feature_expression( + object, + data, + feature_similarity, + sample_similarity, + feature_cluster_method = waiver(), + feature_linkage_method = waiver(), + feature_similarity_metric = waiver(), + sample_cluster_method = waiver(), + sample_linkage_method = waiver(), + sample_similarity_metric = waiver(), + evaluation_times = waiver(), + message_indent = 0L, + verbose = FALSE, + ... +) +} +\arguments{ +\item{object}{A \code{familiarEnsemble} object, which is an ensemble of one or more +\code{familiarModel} objects.} + +\item{data}{A \code{dataObject} object, \code{data.table} or \code{data.frame} that +constitutes the data that are assessed.} + +\item{feature_similarity}{Table containing pairwise distance between +samples. This is used to determine cluster information, and to indicate which +samples are similar. The table is created by the +\code{extract_sample_similarity} method.} + +\item{feature_cluster_method}{The method used to perform clustering. These are +the same methods as for the \code{cluster_method} configuration parameter: +\code{none}, \code{hclust}, \code{agnes}, \code{diana} and \code{pam}. + +\code{none} cannot be used when extracting data regarding mutual correlation or +feature expressions. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + +\item{feature_linkage_method}{The method used for agglomerative clustering in +\code{hclust} and \code{agnes}. These are the same methods as for the +\code{cluster_linkage_method} configuration parameter: \code{average}, \code{single}, +\code{complete}, \code{weighted}, and \code{ward}. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + +\item{feature_similarity_metric}{Metric to determine pairwise similarity +between features. Similarity is computed in the same manner as for +clustering, and \code{feature_similarity_metric} therefore has the same options +as \code{cluster_similarity_metric}: \code{mcfadden_r2}, \code{cox_snell_r2}, +\code{nagelkerke_r2}, \code{spearman}, \code{kendall} and \code{pearson}. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + +\item{sample_cluster_method}{The method used to perform clustering based on +distance between samples. These are the same methods as for the +\code{cluster_method} configuration parameter: \code{hclust}, \code{agnes}, \code{diana} and +\code{pam}. + +\code{none} cannot be used when extracting data for feature expressions. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + +\item{sample_linkage_method}{The method used for agglomerative clustering in
These are the same methods as for the +\code{cluster_linkage_method} configuration parameter: \code{average}, \code{single}, +\code{complete}, \code{weighted}, and \code{ward}. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + +\item{sample_similarity_metric}{Metric to determine pairwise similarity +between samples. Similarity is computed in the same manner as for +clustering, but \code{sample_similarity_metric} has different options that are +better suited to computing distance between samples instead of between +features: \code{gower}, \code{euclidean}. + +The underlying feature data is scaled to the \eqn{[0, 1]} range (for +numerical features) using the feature values across the samples. The +normalisation parameters required can optionally be computed from feature +data with the outer 5\% (on both sides) of feature values trimmed or +winsorised. To do so append \verb{_trim} (trimming) or \verb{_winsor} (winsorising) to +the metric name. This reduces the effect of outliers somewhat. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + +\item{evaluation_times}{One or more time points that are used for in analysis of +survival problems when data has to be assessed at a set time, e.g. +calibration. If not provided explicitly, this parameter is read from +settings used at creation of the underlying \code{familiarModel} objects. Only +used for \code{survival} outcomes.} + +\item{message_indent}{Number of indentation steps for messages shown during +computation and extraction of various data elements.} + +\item{verbose}{Flag to indicate whether feedback should be provided on the +computation and extraction of various data elements.} + +\item{...}{Unused arguments.} +} +\value{ +A list with a data.table containing feature expressions. +} +\description{ +Computes and extracts feature expressions for features +used in a \code{familiarEnsemble} object. +} +\keyword{internal} diff --git a/man/extract_feature_similarity.Rd b/man/extract_feature_similarity.Rd new file mode 100644 index 00000000..20d5d268 --- /dev/null +++ b/man/extract_feature_similarity.Rd @@ -0,0 +1,163 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FamiliarDataComputationFeatureSimilarity.R +\name{extract_feature_similarity} +\alias{extract_feature_similarity} +\title{Internal function to extract the feature distance table.} +\usage{ +extract_feature_similarity( + object, + data, + cl = NULL, + estimation_type = waiver(), + aggregate_results = waiver(), + confidence_level = waiver(), + bootstrap_ci_method = waiver(), + is_pre_processed = FALSE, + feature_cluster_method = waiver(), + feature_linkage_method = waiver(), + feature_cluster_cut_method = waiver(), + feature_similarity_threshold = waiver(), + feature_similarity_metric = waiver(), + verbose = FALSE, + message_indent = 0L, + ... +) +} +\arguments{ +\item{object}{A \code{familiarEnsemble} object, which is an ensemble of one or more +\code{familiarModel} objects.} + +\item{data}{A \code{dataObject} object, \code{data.table} or \code{data.frame} that +constitutes the data that are assessed.} + +\item{cl}{Cluster created using the \code{parallel} package. This cluster is then +used to speed up computation through parallellisation.} + +\item{estimation_type}{(\emph{optional}) Sets the type of estimation that should be +possible. 
This has the following options: +\itemize{ +\item \code{point}: Point estimates. +\item \code{bias_correction} or \code{bc}: Bias-corrected estimates. A bias-corrected +estimate is computed from (at least) 20 point estimates, and \code{familiar} may +bootstrap the data to create them. +\item \code{bootstrap_confidence_interval} or \code{bci} (default): Bias-corrected +estimates with bootstrap confidence intervals (Efron and Hastie, 2016). The +number of point estimates required depends on the \code{confidence_level} +parameter, and \code{familiar} may bootstrap the data to create them. +} + +As with \code{detail_level}, a non-default \code{estimation_type} parameter can be +specified for separate evaluation steps by providing a parameter value in a +named list with data elements, e.g. \code{list("auc_data"="bci", "model_performance"="point")}. This parameter can be set for the following +data elements: \code{auc_data}, \code{decision_curve_analyis}, \code{model_performance}, +\code{permutation_vimp}, \code{ice_data}, and \code{prediction_data}.} + +\item{aggregate_results}{(\emph{optional}) Flag that signifies whether results +should be aggregated during evaluation. If \code{estimation_type} is +\code{bias_correction} or \code{bc}, aggregation leads to a single bias-corrected +estimate. If \code{estimation_type} is \code{bootstrap_confidence_interval} or \code{bci}, +aggregation leads to a single bias-corrected estimate with lower and upper +boundaries of the confidence interval. This has no effect if +\code{estimation_type} is \code{point}. + +The default value is equal to \code{TRUE} except when assessing metrics to assess +model performance, as the default violin plot requires underlying data. + +As with \code{detail_level} and \code{estimation_type}, a non-default +\code{aggregate_results} parameter can be specified for separate evaluation steps +by providing a parameter value in a named list with data elements, e.g. +\code{list("auc_data"=TRUE, , "model_performance"=FALSE)}. This parameter exists +for the same elements as \code{estimation_type}.} + +\item{confidence_level}{(\emph{optional}) Numeric value for the level at which +confidence intervals are determined. In the case bootstraps are used to +determine the confidence intervals bootstrap estimation, \code{familiar} uses the +rule of thumb \eqn{n = 20 / ci.level} to determine the number of required +bootstraps. + +The default value is \code{0.95}.} + +\item{bootstrap_ci_method}{(\emph{optional}) Method used to determine bootstrap +confidence intervals (Efron and Hastie, 2016). The following methods are +implemented: +\itemize{ +\item \code{percentile} (default): Confidence intervals obtained using the percentile +method. +\item \code{bc}: Bias-corrected confidence intervals. +} + +Note that the standard method is not implemented because this method is +often not suitable due to non-normal distributions. The bias-corrected and +accelerated (BCa) method is not implemented yet.} + +\item{is_pre_processed}{Flag that indicates whether the data was already +pre-processed externally, e.g. normalised and clustered. Only used if the +\code{data} argument is a \code{data.table} or \code{data.frame}.} + +\item{feature_cluster_method}{The method used to perform clustering. These are +the same methods as for the \code{cluster_method} configuration parameter: +\code{none}, \code{hclust}, \code{agnes}, \code{diana} and \code{pam}. + +\code{none} cannot be used when extracting data regarding mutual correlation or +feature expressions. 
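+A minimal, purely illustrative guard for this constraint, assuming the chosen
+method is held in a plain character variable before it is passed on:
+
+  # "none" is not valid when feature similarity data are extracted;
+  # fall back to a method that supports it, e.g. hierarchical clustering.
+  if (feature_cluster_method == "none") {
+    feature_cluster_method <- "hclust"
+  }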
+ +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + +\item{feature_linkage_method}{The method used for agglomerative clustering in +\code{hclust} and \code{agnes}. These are the same methods as for the +\code{cluster_linkage_method} configuration parameter: \code{average}, \code{single}, +\code{complete}, \code{weighted}, and \code{ward}. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + +\item{feature_cluster_cut_method}{The method used to divide features into +separate clusters. The available methods are the same as for the +\code{cluster_cut_method} configuration parameter: \code{silhouette}, \code{fixed_cut} and +\code{dynamic_cut}. + +\code{silhouette} is available for all cluster methods, but \code{fixed_cut} only +applies to methods that create hierarchical trees (\code{hclust}, \code{agnes} and +\code{diana}). \code{dynamic_cut} requires the \code{dynamicTreeCut} package and can only +be used with \code{agnes} and \code{hclust}. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + +\item{feature_similarity_threshold}{The threshold level for pair-wise +similarity that is required to form feature clusters with the \code{fixed_cut} +method. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + +\item{feature_similarity_metric}{Metric to determine pairwise similarity +between features. Similarity is computed in the same manner as for +clustering, and \code{feature_similarity_metric} therefore has the same options +as \code{cluster_similarity_metric}: \code{mcfadden_r2}, \code{cox_snell_r2}, +\code{nagelkerke_r2}, \code{spearman}, \code{kendall} and \code{pearson}. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + +\item{verbose}{Flag to indicate whether feedback should be provided on the +computation and extraction of various data elements.} + +\item{message_indent}{Number of indentation steps for messages shown during +computation and extraction of various data elements.} + +\item{...}{Unused arguments.} +} +\value{ +A data.table containing pairwise distance between features. This data +is only the upper triangular of the complete matrix (i.e. the sparse +unitriangular representation). Diagonals will always be 0.0 and the lower +triangular is mirrored. +} +\description{ +Computes and extracts the feature distance table for features +used in a \code{familiarEnsemble} object. This table can be used to cluster +features, and is exported directly by \code{export_feature_similarity}. +} +\keyword{internal} diff --git a/man/extract_fs_vimp.Rd b/man/extract_fs_vimp.Rd new file mode 100644 index 00000000..6a508079 --- /dev/null +++ b/man/extract_fs_vimp.Rd @@ -0,0 +1,56 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FamiliarDataComputationVimp.R +\name{extract_fs_vimp} +\alias{extract_fs_vimp} +\title{Internal function to extract feature selection variable importance.} +\usage{ +extract_fs_vimp( + object, + aggregation_method = waiver(), + rank_threshold = waiver(), + message_indent = 0L, + verbose = FALSE, + ... 
+) +} +\arguments{ +\item{object}{A \code{familiarEnsemble} object, which is an ensemble of one or more +\code{familiarModel} objects.} + +\item{aggregation_method}{Method for aggregating variable importances for the +purpose of evaluation. Variable importances are determined during feature +selection steps and after training the model. Both types are evaluated, but +feature selection variable importance is only evaluated at run-time. + +See the documentation for the \code{vimp_aggregation_method} argument in +\code{summon_familiar} for information concerning the different available +methods. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + +\item{rank_threshold}{The threshold used to define the subset of highly +important features during evaluation. + +See the documentation for the \code{vimp_aggregation_rank_threshold} argument in +\code{summon_familiar} for more information. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + +\item{message_indent}{Number of indentation steps for messages shown during +computation and extraction of various data elements.} + +\item{verbose}{Flag to indicate whether feedback should be provided on the +computation and extraction of various data elements.} + +\item{...}{Unused arguments.} +} +\value{ +A list containing feature selection variable importance information. +} +\description{ +Aggregate variable importance obtained during feature selection. +This information can only be obtained as part of the main \code{summon_familiar} process. +} +\keyword{internal} diff --git a/man/extract_hyperparameters.Rd b/man/extract_hyperparameters.Rd new file mode 100644 index 00000000..a7a13fc7 --- /dev/null +++ b/man/extract_hyperparameters.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FamiliarDataComputationHyperparameters.R +\name{extract_hyperparameters} +\alias{extract_hyperparameters} +\title{Internal function to extract hyperparameters from models.} +\usage{ +extract_hyperparameters(object, message_indent = 0L, verbose = FALSE, ...) +} +\arguments{ +\item{object}{A \code{familiarEnsemble} object, which is an ensemble of one or more +\code{familiarModel} objects.} + +\item{message_indent}{Number of indentation steps for messages shown during +computation and extraction of various data elements.} + +\item{verbose}{Flag to indicate whether feedback should be provided on the +computation and extraction of various data elements.} + +\item{...}{Unused arguments.} +} +\value{ +A list of familiarDataElements with hyperparameters. +} +\description{ +Collects hyperparameters from models in a \code{familiarEnsemble}. 
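+A minimal sketch of a direct call, assuming \code{fam_ensemble} is a placeholder
+for a trained \code{familiarEnsemble} object (the function is internal, hence the
+triple-colon accessor):
+
+  # Collect hyperparameters from all models in the ensemble.
+  hyperparameter_data <- familiar:::extract_hyperparameters(
+    object = fam_ensemble,
+    verbose = TRUE
+  )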
+} +\keyword{internal} diff --git a/man/extract_ice.Rd b/man/extract_ice.Rd new file mode 100644 index 00000000..14411f52 --- /dev/null +++ b/man/extract_ice.Rd @@ -0,0 +1,203 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FamiliarDataComputationICE.R +\name{extract_ice} +\alias{extract_ice} +\title{Internal function to extract data for individual conditional +expectation plots.} +\usage{ +extract_ice( + object, + data, + cl = NULL, + features = NULL, + feature_x_range = NULL, + feature_y_range = NULL, + n_sample_points = 50L, + ensemble_method = waiver(), + evaluation_times = waiver(), + sample_limit = waiver(), + detail_level = waiver(), + estimation_type = waiver(), + aggregate_results = waiver(), + confidence_level = waiver(), + bootstrap_ci_method = waiver(), + is_pre_processed = FALSE, + message_indent = 0L, + verbose = FALSE, + ... +) +} +\arguments{ +\item{object}{A \code{familiarEnsemble} object, which is an ensemble of one or more +\code{familiarModel} objects.} + +\item{data}{A \code{dataObject} object, \code{data.table} or \code{data.frame} that +constitutes the data that are assessed.} + +\item{cl}{Cluster created using the \code{parallel} package. This cluster is then +used to speed up computation through parallellisation.} + +\item{features}{Names of the feature or features (2) assessed simultaneously. +By default \code{NULL}, which means that all features are assessed one-by-one.} + +\item{feature_x_range}{When one or two features are defined using \code{features}, +\code{feature_x_range} can be used to set the range of values for the first +feature. For numeric features, a vector of two values is assumed to indicate +a range from which \code{n_sample_points} are uniformly sampled. A vector of more +than two values is interpreted as is, i.e. these represent the values to be +sampled. For categorical features, values should represent a (sub)set of +available levels.} + +\item{feature_y_range}{As \code{feature_x_range}, but for the second feature in +case two features are defined.} + +\item{n_sample_points}{Number of points used to sample continuous features.} + +\item{ensemble_method}{Method for ensembling predictions from models for the +same sample. Available methods are: +\itemize{ +\item \code{median} (default): Use the median of the predicted values as the ensemble +value for a sample. +\item \code{mean}: Use the mean of the predicted values as the ensemble value for a +sample. +}} + +\item{evaluation_times}{One or more time points that are used for in analysis of +survival problems when data has to be assessed at a set time, e.g. +calibration. If not provided explicitly, this parameter is read from +settings used at creation of the underlying \code{familiarModel} objects. Only +used for \code{survival} outcomes.} + +\item{sample_limit}{(\emph{optional}) Set the upper limit of the number of samples +that are used during evaluation steps. Cannot be less than 20. + +This setting can be specified per data element by providing a parameter +value in a named list with data elements, e.g. +\code{list("sample_similarity"=100, "permutation_vimp"=1000)}. + +This parameter can be set for the following data elements: +\code{sample_similarity} and \code{ice_data}.} + +\item{detail_level}{(\emph{optional}) Sets the level at which results are computed +and aggregated. +\itemize{ +\item \code{ensemble}: Results are computed at the ensemble level, i.e. over all +models in the ensemble. 
This means that, for example, bias-corrected +estimates of model performance are assessed by creating (at least) 20 +bootstraps and computing the model performance of the ensemble model for +each bootstrap. +\item \code{hybrid} (default): Results are computed at the level of models in an +ensemble. This means that, for example, bias-corrected estimates of model +performance are directly computed using the models in the ensemble. If there +are at least 20 trained models in the ensemble, performance is computed for +each model, in contrast to \code{ensemble} where performance is computed for the +ensemble of models. If there are less than 20 trained models in the +ensemble, bootstraps are created so that at least 20 point estimates can be +made. +\item \code{model}: Results are computed at the model level. This means that, for +example, bias-corrected estimates of model performance are assessed by +creating (at least) 20 bootstraps and computing the performance of the model +for each bootstrap. +} + +Note that each level of detail has a different interpretation for bootstrap +confidence intervals. For \code{ensemble} and \code{model} these are the confidence +intervals for the ensemble and an individual model, respectively. That is, +the confidence interval describes the range where an estimate produced by a +respective ensemble or model trained on a repeat of the experiment may be +found with the probability of the confidence level. For \code{hybrid}, it +represents the range where any single model trained on a repeat of the +experiment may be found with the probability of the confidence level. By +definition, confidence intervals obtained using \code{hybrid} are at least as +wide as those for \code{ensemble}. \code{hybrid} offers the correct interpretation if +the goal of the analysis is to assess the result of a single, unspecified, +model. + +\code{hybrid} is generally computationally less expensive then \code{ensemble}, which +in turn is somewhat less expensive than \code{model}. + +A non-default \code{detail_level} parameter can be specified for separate +evaluation steps by providing a parameter value in a named list with data +elements, e.g. \code{list("auc_data"="ensemble", "model_performance"="hybrid")}. +This parameter can be set for the following data elements: \code{auc_data}, +\code{decision_curve_analyis}, \code{model_performance}, \code{permutation_vimp}, +\code{ice_data}, \code{prediction_data} and \code{confusion_matrix}.} + +\item{estimation_type}{(\emph{optional}) Sets the type of estimation that should be +possible. This has the following options: +\itemize{ +\item \code{point}: Point estimates. +\item \code{bias_correction} or \code{bc}: Bias-corrected estimates. A bias-corrected +estimate is computed from (at least) 20 point estimates, and \code{familiar} may +bootstrap the data to create them. +\item \code{bootstrap_confidence_interval} or \code{bci} (default): Bias-corrected +estimates with bootstrap confidence intervals (Efron and Hastie, 2016). The +number of point estimates required depends on the \code{confidence_level} +parameter, and \code{familiar} may bootstrap the data to create them. +} + +As with \code{detail_level}, a non-default \code{estimation_type} parameter can be +specified for separate evaluation steps by providing a parameter value in a +named list with data elements, e.g. \code{list("auc_data"="bci", "model_performance"="point")}. 
This parameter can be set for the following +data elements: \code{auc_data}, \code{decision_curve_analyis}, \code{model_performance}, +\code{permutation_vimp}, \code{ice_data}, and \code{prediction_data}.} + +\item{aggregate_results}{(\emph{optional}) Flag that signifies whether results +should be aggregated during evaluation. If \code{estimation_type} is +\code{bias_correction} or \code{bc}, aggregation leads to a single bias-corrected +estimate. If \code{estimation_type} is \code{bootstrap_confidence_interval} or \code{bci}, +aggregation leads to a single bias-corrected estimate with lower and upper +boundaries of the confidence interval. This has no effect if +\code{estimation_type} is \code{point}. + +The default value is equal to \code{TRUE} except when assessing metrics to assess +model performance, as the default violin plot requires underlying data. + +As with \code{detail_level} and \code{estimation_type}, a non-default +\code{aggregate_results} parameter can be specified for separate evaluation steps +by providing a parameter value in a named list with data elements, e.g. +\code{list("auc_data"=TRUE, , "model_performance"=FALSE)}. This parameter exists +for the same elements as \code{estimation_type}.} + +\item{confidence_level}{(\emph{optional}) Numeric value for the level at which +confidence intervals are determined. In the case bootstraps are used to +determine the confidence intervals bootstrap estimation, \code{familiar} uses the +rule of thumb \eqn{n = 20 / ci.level} to determine the number of required +bootstraps. + +The default value is \code{0.95}.} + +\item{bootstrap_ci_method}{(\emph{optional}) Method used to determine bootstrap +confidence intervals (Efron and Hastie, 2016). The following methods are +implemented: +\itemize{ +\item \code{percentile} (default): Confidence intervals obtained using the percentile +method. +\item \code{bc}: Bias-corrected confidence intervals. +} + +Note that the standard method is not implemented because this method is +often not suitable due to non-normal distributions. The bias-corrected and +accelerated (BCa) method is not implemented yet.} + +\item{is_pre_processed}{Flag that indicates whether the data was already +pre-processed externally, e.g. normalised and clustered. Only used if the +\code{data} argument is a \code{data.table} or \code{data.frame}.} + +\item{message_indent}{Number of indentation steps for messages shown during +computation and extraction of various data elements.} + +\item{verbose}{Flag to indicate whether feedback should be provided on the +computation and extraction of various data elements.} + +\item{...}{Unused arguments.} +} +\value{ +A data.table containing predicted and observed outcome data together +with a co-occurence count. +} +\description{ +Computes data for individual conditional expectation plots and +partial dependence plots for the model(s) in a \code{familiarEnsemble} object. +} +\keyword{internal} diff --git a/man/extract_model_vimp.Rd b/man/extract_model_vimp.Rd new file mode 100644 index 00000000..49cd5f8e --- /dev/null +++ b/man/extract_model_vimp.Rd @@ -0,0 +1,60 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FamiliarDataComputationVimp.R +\name{extract_model_vimp} +\alias{extract_model_vimp} +\title{Internal function to extract variable importance from models.} +\usage{ +extract_model_vimp( + object, + data, + aggregation_method = waiver(), + rank_threshold = waiver(), + message_indent = 0L, + verbose = FALSE, + ... 
+) +} +\arguments{ +\item{object}{A \code{familiarEnsemble} object, which is an ensemble of one or more +\code{familiarModel} objects.} + +\item{data}{A \code{dataObject} object, \code{data.table} or \code{data.frame} that +constitutes the data that are assessed.} + +\item{aggregation_method}{Method for aggregating variable importances for the +purpose of evaluation. Variable importances are determined during feature +selection steps and after training the model. Both types are evaluated, but +feature selection variable importance is only evaluated at run-time. + +See the documentation for the \code{vimp_aggregation_method} argument in +\code{summon_familiar} for information concerning the different available +methods. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + +\item{rank_threshold}{The threshold used to define the subset of highly +important features during evaluation. + +See the documentation for the \code{vimp_aggregation_rank_threshold} argument in +\code{summon_familiar} for more information. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + +\item{message_indent}{Number of indentation steps for messages shown during +computation and extraction of various data elements.} + +\item{verbose}{Flag to indicate whether feedback should be provided on the +computation and extraction of various data elements.} + +\item{...}{Unused arguments.} +} +\value{ +A list containing variable importance information. +} +\description{ +Aggregate variable importance from models in a +\code{familiarEnsemble}. +} +\keyword{internal} diff --git a/man/extract_performance.Rd b/man/extract_performance.Rd new file mode 100644 index 00000000..20bc819b --- /dev/null +++ b/man/extract_performance.Rd @@ -0,0 +1,182 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FamiliarDataComputationModelPerformance.R +\name{extract_performance} +\alias{extract_performance} +\title{Internal function to extract performance metrics.} +\usage{ +extract_performance( + object, + data, + cl = NULL, + metric = waiver(), + ensemble_method = waiver(), + evaluation_times = waiver(), + detail_level = waiver(), + estimation_type = waiver(), + aggregate_results = waiver(), + confidence_level = waiver(), + bootstrap_ci_method = waiver(), + is_pre_processed = FALSE, + message_indent = 0L, + verbose = FALSE, + ... +) +} +\arguments{ +\item{object}{A \code{familiarEnsemble} object, which is an ensemble of one or more +\code{familiarModel} objects.} + +\item{data}{A \code{dataObject} object, \code{data.table} or \code{data.frame} that +constitutes the data that are assessed.} + +\item{cl}{Cluster created using the \code{parallel} package. This cluster is then +used to speed up computation through parallellisation.} + +\item{metric}{One or more metrics for assessing model performance. See the +vignette on performance metrics for the available metrics. If not provided +explicitly, this parameter is read from settings used at creation of the +underlying \code{familiarModel} objects.} + +\item{ensemble_method}{Method for ensembling predictions from models for the +same sample. Available methods are: +\itemize{ +\item \code{median} (default): Use the median of the predicted values as the ensemble +value for a sample. +\item \code{mean}: Use the mean of the predicted values as the ensemble value for a +sample. 
+}} + +\item{evaluation_times}{One or more time points that are used in the analysis of +survival problems when data has to be assessed at a set time, e.g. +calibration. If not provided explicitly, this parameter is read from +settings used at creation of the underlying \code{familiarModel} objects. Only +used for \code{survival} outcomes.} + +\item{detail_level}{(\emph{optional}) Sets the level at which results are computed +and aggregated. +\itemize{ +\item \code{ensemble}: Results are computed at the ensemble level, i.e. over all +models in the ensemble. This means that, for example, bias-corrected +estimates of model performance are assessed by creating (at least) 20 +bootstraps and computing the model performance of the ensemble model for +each bootstrap. +\item \code{hybrid} (default): Results are computed at the level of models in an +ensemble. This means that, for example, bias-corrected estimates of model +performance are directly computed using the models in the ensemble. If there +are at least 20 trained models in the ensemble, performance is computed for +each model, in contrast to \code{ensemble} where performance is computed for the +ensemble of models. If there are fewer than 20 trained models in the +ensemble, bootstraps are created so that at least 20 point estimates can be +made. +\item \code{model}: Results are computed at the model level. This means that, for +example, bias-corrected estimates of model performance are assessed by +creating (at least) 20 bootstraps and computing the performance of the model +for each bootstrap. +} + +Note that each level of detail has a different interpretation for bootstrap +confidence intervals. For \code{ensemble} and \code{model} these are the confidence +intervals for the ensemble and an individual model, respectively. That is, +the confidence interval describes the range where an estimate produced by a +respective ensemble or model trained on a repeat of the experiment may be +found with the probability of the confidence level. For \code{hybrid}, it +represents the range where any single model trained on a repeat of the +experiment may be found with the probability of the confidence level. By +definition, confidence intervals obtained using \code{hybrid} are at least as +wide as those for \code{ensemble}. \code{hybrid} offers the correct interpretation if +the goal of the analysis is to assess the result of a single, unspecified, +model. + +\code{hybrid} is generally computationally less expensive than \code{ensemble}, which +in turn is somewhat less expensive than \code{model}. + +A non-default \code{detail_level} parameter can be specified for separate +evaluation steps by providing a parameter value in a named list with data +elements, e.g. \code{list("auc_data"="ensemble", "model_performance"="hybrid")}. +This parameter can be set for the following data elements: \code{auc_data}, +\code{decision_curve_analyis}, \code{model_performance}, \code{permutation_vimp}, +\code{ice_data}, \code{prediction_data} and \code{confusion_matrix}.} + +\item{estimation_type}{(\emph{optional}) Sets the type of estimation that should be +possible. This has the following options: +\itemize{ +\item \code{point}: Point estimates. +\item \code{bias_correction} or \code{bc}: Bias-corrected estimates. A bias-corrected +estimate is computed from (at least) 20 point estimates, and \code{familiar} may +bootstrap the data to create them.
+\item \code{bootstrap_confidence_interval} or \code{bci} (default): Bias-corrected +estimates with bootstrap confidence intervals (Efron and Hastie, 2016). The +number of point estimates required depends on the \code{confidence_level} +parameter, and \code{familiar} may bootstrap the data to create them. +} + +As with \code{detail_level}, a non-default \code{estimation_type} parameter can be +specified for separate evaluation steps by providing a parameter value in a +named list with data elements, e.g. \code{list("auc_data"="bci", "model_performance"="point")}. This parameter can be set for the following +data elements: \code{auc_data}, \code{decision_curve_analyis}, \code{model_performance}, +\code{permutation_vimp}, \code{ice_data}, and \code{prediction_data}.} + +\item{aggregate_results}{(\emph{optional}) Flag that signifies whether results +should be aggregated during evaluation. If \code{estimation_type} is +\code{bias_correction} or \code{bc}, aggregation leads to a single bias-corrected +estimate. If \code{estimation_type} is \code{bootstrap_confidence_interval} or \code{bci}, +aggregation leads to a single bias-corrected estimate with lower and upper +boundaries of the confidence interval. This has no effect if +\code{estimation_type} is \code{point}. + +The default value is equal to \code{TRUE} except when assessing metrics to assess +model performance, as the default violin plot requires underlying data. + +As with \code{detail_level} and \code{estimation_type}, a non-default +\code{aggregate_results} parameter can be specified for separate evaluation steps +by providing a parameter value in a named list with data elements, e.g. +\code{list("auc_data"=TRUE, , "model_performance"=FALSE)}. This parameter exists +for the same elements as \code{estimation_type}.} + +\item{confidence_level}{(\emph{optional}) Numeric value for the level at which +confidence intervals are determined. In the case bootstraps are used to +determine the confidence intervals bootstrap estimation, \code{familiar} uses the +rule of thumb \eqn{n = 20 / ci.level} to determine the number of required +bootstraps. + +The default value is \code{0.95}.} + +\item{bootstrap_ci_method}{(\emph{optional}) Method used to determine bootstrap +confidence intervals (Efron and Hastie, 2016). The following methods are +implemented: +\itemize{ +\item \code{percentile} (default): Confidence intervals obtained using the percentile +method. +\item \code{bc}: Bias-corrected confidence intervals. +} + +Note that the standard method is not implemented because this method is +often not suitable due to non-normal distributions. The bias-corrected and +accelerated (BCa) method is not implemented yet.} + +\item{is_pre_processed}{Flag that indicates whether the data was already +pre-processed externally, e.g. normalised and clustered. Only used if the +\code{data} argument is a \code{data.table} or \code{data.frame}.} + +\item{message_indent}{Number of indentation steps for messages shown during +computation and extraction of various data elements.} + +\item{verbose}{Flag to indicate whether feedback should be provided on the +computation and extraction of various data elements.} + +\item{...}{Unused arguments.} +} +\value{ +A list with data.tables for single and ensemble model assessments. +} +\description{ +Computes and collects discriminative performance metrics from a +\code{familiarEnsemble}. +} +\details{ +This method computes credibility intervals for the ensemble model, at +the level of \code{confidence_level}. 
This is a general method. Metrics with +known, theoretically derived confidence intervals, nevertheless have a +credibility interval computed. +} +\keyword{internal} diff --git a/man/extract_permutation_vimp.Rd b/man/extract_permutation_vimp.Rd new file mode 100644 index 00000000..807e373f --- /dev/null +++ b/man/extract_permutation_vimp.Rd @@ -0,0 +1,233 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FamiliarDataComputationPermutationVimp.R +\name{extract_permutation_vimp} +\alias{extract_permutation_vimp} +\title{Internal function to extract permutation variable importance.} +\usage{ +extract_permutation_vimp( + object, + data, + cl = NULL, + ensemble_method = waiver(), + feature_similarity, + feature_cluster_method = waiver(), + feature_linkage_method = waiver(), + feature_cluster_cut_method = waiver(), + feature_similarity_metric = waiver(), + feature_similarity_threshold = waiver(), + metric = waiver(), + evaluation_times = waiver(), + detail_level = waiver(), + estimation_type = waiver(), + aggregate_results = waiver(), + confidence_level = waiver(), + bootstrap_ci_method = waiver(), + is_pre_processed = FALSE, + message_indent = 0L, + verbose = FALSE, + ... +) +} +\arguments{ +\item{object}{A \code{familiarEnsemble} object, which is an ensemble of one or more +\code{familiarModel} objects.} + +\item{data}{A \code{dataObject} object, \code{data.table} or \code{data.frame} that +constitutes the data that are assessed.} + +\item{cl}{Cluster created using the \code{parallel} package. This cluster is then +used to speed up computation through parallellisation.} + +\item{ensemble_method}{Method for ensembling predictions from models for the +same sample. Available methods are: +\itemize{ +\item \code{median} (default): Use the median of the predicted values as the ensemble +value for a sample. +\item \code{mean}: Use the mean of the predicted values as the ensemble value for a +sample. +}} + +\item{feature_cluster_method}{The method used to perform clustering. These are +the same methods as for the \code{cluster_method} configuration parameter: +\code{none}, \code{hclust}, \code{agnes}, \code{diana} and \code{pam}. + +\code{none} cannot be used when extracting data regarding mutual correlation or +feature expressions. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + +\item{feature_linkage_method}{The method used for agglomerative clustering in +\code{hclust} and \code{agnes}. These are the same methods as for the +\code{cluster_linkage_method} configuration parameter: \code{average}, \code{single}, +\code{complete}, \code{weighted}, and \code{ward}. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + +\item{feature_cluster_cut_method}{The method used to divide features into +separate clusters. The available methods are the same as for the +\code{cluster_cut_method} configuration parameter: \code{silhouette}, \code{fixed_cut} and +\code{dynamic_cut}. + +\code{silhouette} is available for all cluster methods, but \code{fixed_cut} only +applies to methods that create hierarchical trees (\code{hclust}, \code{agnes} and +\code{diana}). \code{dynamic_cut} requires the \code{dynamicTreeCut} package and can only +be used with \code{agnes} and \code{hclust}. 
+ +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + +\item{feature_similarity_metric}{Metric to determine pairwise similarity +between features. Similarity is computed in the same manner as for +clustering, and \code{feature_similarity_metric} therefore has the same options +as \code{cluster_similarity_metric}: \code{mcfadden_r2}, \code{cox_snell_r2}, +\code{nagelkerke_r2}, \code{spearman}, \code{kendall} and \code{pearson}. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + +\item{feature_similarity_threshold}{The threshold level for pair-wise +similarity that is required to form feature clusters with the \code{fixed_cut} +method. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + +\item{metric}{One or more metrics for assessing model performance. See the +vignette on performance metrics for the available metrics. If not provided +explicitly, this parameter is read from settings used at creation of the +underlying \code{familiarModel} objects.} + +\item{evaluation_times}{One or more time points that are used for in analysis of +survival problems when data has to be assessed at a set time, e.g. +calibration. If not provided explicitly, this parameter is read from +settings used at creation of the underlying \code{familiarModel} objects. Only +used for \code{survival} outcomes.} + +\item{detail_level}{(\emph{optional}) Sets the level at which results are computed +and aggregated. +\itemize{ +\item \code{ensemble}: Results are computed at the ensemble level, i.e. over all +models in the ensemble. This means that, for example, bias-corrected +estimates of model performance are assessed by creating (at least) 20 +bootstraps and computing the model performance of the ensemble model for +each bootstrap. +\item \code{hybrid} (default): Results are computed at the level of models in an +ensemble. This means that, for example, bias-corrected estimates of model +performance are directly computed using the models in the ensemble. If there +are at least 20 trained models in the ensemble, performance is computed for +each model, in contrast to \code{ensemble} where performance is computed for the +ensemble of models. If there are less than 20 trained models in the +ensemble, bootstraps are created so that at least 20 point estimates can be +made. +\item \code{model}: Results are computed at the model level. This means that, for +example, bias-corrected estimates of model performance are assessed by +creating (at least) 20 bootstraps and computing the performance of the model +for each bootstrap. +} + +Note that each level of detail has a different interpretation for bootstrap +confidence intervals. For \code{ensemble} and \code{model} these are the confidence +intervals for the ensemble and an individual model, respectively. That is, +the confidence interval describes the range where an estimate produced by a +respective ensemble or model trained on a repeat of the experiment may be +found with the probability of the confidence level. For \code{hybrid}, it +represents the range where any single model trained on a repeat of the +experiment may be found with the probability of the confidence level. By +definition, confidence intervals obtained using \code{hybrid} are at least as +wide as those for \code{ensemble}. 
\code{hybrid} offers the correct interpretation if +the goal of the analysis is to assess the result of a single, unspecified, +model. + +\code{hybrid} is generally computationally less expensive then \code{ensemble}, which +in turn is somewhat less expensive than \code{model}. + +A non-default \code{detail_level} parameter can be specified for separate +evaluation steps by providing a parameter value in a named list with data +elements, e.g. \code{list("auc_data"="ensemble", "model_performance"="hybrid")}. +This parameter can be set for the following data elements: \code{auc_data}, +\code{decision_curve_analyis}, \code{model_performance}, \code{permutation_vimp}, +\code{ice_data}, \code{prediction_data} and \code{confusion_matrix}.} + +\item{estimation_type}{(\emph{optional}) Sets the type of estimation that should be +possible. This has the following options: +\itemize{ +\item \code{point}: Point estimates. +\item \code{bias_correction} or \code{bc}: Bias-corrected estimates. A bias-corrected +estimate is computed from (at least) 20 point estimates, and \code{familiar} may +bootstrap the data to create them. +\item \code{bootstrap_confidence_interval} or \code{bci} (default): Bias-corrected +estimates with bootstrap confidence intervals (Efron and Hastie, 2016). The +number of point estimates required depends on the \code{confidence_level} +parameter, and \code{familiar} may bootstrap the data to create them. +} + +As with \code{detail_level}, a non-default \code{estimation_type} parameter can be +specified for separate evaluation steps by providing a parameter value in a +named list with data elements, e.g. \code{list("auc_data"="bci", "model_performance"="point")}. This parameter can be set for the following +data elements: \code{auc_data}, \code{decision_curve_analyis}, \code{model_performance}, +\code{permutation_vimp}, \code{ice_data}, and \code{prediction_data}.} + +\item{aggregate_results}{(\emph{optional}) Flag that signifies whether results +should be aggregated during evaluation. If \code{estimation_type} is +\code{bias_correction} or \code{bc}, aggregation leads to a single bias-corrected +estimate. If \code{estimation_type} is \code{bootstrap_confidence_interval} or \code{bci}, +aggregation leads to a single bias-corrected estimate with lower and upper +boundaries of the confidence interval. This has no effect if +\code{estimation_type} is \code{point}. + +The default value is equal to \code{TRUE} except when assessing metrics to assess +model performance, as the default violin plot requires underlying data. + +As with \code{detail_level} and \code{estimation_type}, a non-default +\code{aggregate_results} parameter can be specified for separate evaluation steps +by providing a parameter value in a named list with data elements, e.g. +\code{list("auc_data"=TRUE, , "model_performance"=FALSE)}. This parameter exists +for the same elements as \code{estimation_type}.} + +\item{confidence_level}{(\emph{optional}) Numeric value for the level at which +confidence intervals are determined. In the case bootstraps are used to +determine the confidence intervals bootstrap estimation, \code{familiar} uses the +rule of thumb \eqn{n = 20 / ci.level} to determine the number of required +bootstraps. + +The default value is \code{0.95}.} + +\item{bootstrap_ci_method}{(\emph{optional}) Method used to determine bootstrap +confidence intervals (Efron and Hastie, 2016). 
The following methods are +implemented: +\itemize{ +\item \code{percentile} (default): Confidence intervals obtained using the percentile +method. +\item \code{bc}: Bias-corrected confidence intervals. +} + +Note that the standard method is not implemented because this method is +often not suitable due to non-normal distributions. The bias-corrected and +accelerated (BCa) method is not implemented yet.} + +\item{is_pre_processed}{Flag that indicates whether the data was already +pre-processed externally, e.g. normalised and clustered. Only used if the +\code{data} argument is a \code{data.table} or \code{data.frame}.} + +\item{message_indent}{Number of indentation steps for messages shown during +computation and extraction of various data elements.} + +\item{verbose}{Flag to indicate whether feedback should be provided on the +computation and extraction of various data elements.} + +\item{...}{Unused arguments.} +} +\value{ +A list with data.tables for single and ensemble model assessments. +} +\description{ +Computes and collects permutation variable importance from a +\code{familiarEnsemble}. +} +\details{ +This function also computes credibility intervals for the ensemble +model, at the level of \code{confidence_level}. +} +\keyword{internal} diff --git a/man/extract_predictions.Rd b/man/extract_predictions.Rd new file mode 100644 index 00000000..a8372c0d --- /dev/null +++ b/man/extract_predictions.Rd @@ -0,0 +1,155 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FamiliarDataComputationPredictionData.R +\name{extract_predictions} +\alias{extract_predictions} +\title{Internal function to extract predicted values from models.} +\usage{ +extract_predictions( + object, + data, + cl = NULL, + is_pre_processed = FALSE, + ensemble_method = waiver(), + evaluation_times = waiver(), + detail_level = waiver(), + estimation_type = waiver(), + aggregate_results = waiver(), + confidence_level = waiver(), + message_indent = 0L, + verbose = FALSE, + ... +) +} +\arguments{ +\item{object}{A \code{familiarEnsemble} object, which is an ensemble of one or more +\code{familiarModel} objects.} + +\item{data}{A \code{dataObject} object, \code{data.table} or \code{data.frame} that +constitutes the data that are assessed.} + +\item{cl}{Cluster created using the \code{parallel} package. This cluster is then +used to speed up computation through parallellisation.} + +\item{is_pre_processed}{Flag that indicates whether the data was already +pre-processed externally, e.g. normalised and clustered. Only used if the +\code{data} argument is a \code{data.table} or \code{data.frame}.} + +\item{ensemble_method}{Method for ensembling predictions from models for the +same sample. Available methods are: +\itemize{ +\item \code{median} (default): Use the median of the predicted values as the ensemble +value for a sample. +\item \code{mean}: Use the mean of the predicted values as the ensemble value for a +sample. +}} + +\item{evaluation_times}{One or more time points that are used for in analysis of +survival problems when data has to be assessed at a set time, e.g. +calibration. If not provided explicitly, this parameter is read from +settings used at creation of the underlying \code{familiarModel} objects. Only +used for \code{survival} outcomes.} + +\item{detail_level}{(\emph{optional}) Sets the level at which results are computed +and aggregated. +\itemize{ +\item \code{ensemble}: Results are computed at the ensemble level, i.e. over all +models in the ensemble. 
This means that, for example, bias-corrected +estimates of model performance are assessed by creating (at least) 20 +bootstraps and computing the model performance of the ensemble model for +each bootstrap. +\item \code{hybrid} (default): Results are computed at the level of models in an +ensemble. This means that, for example, bias-corrected estimates of model +performance are directly computed using the models in the ensemble. If there +are at least 20 trained models in the ensemble, performance is computed for +each model, in contrast to \code{ensemble} where performance is computed for the +ensemble of models. If there are less than 20 trained models in the +ensemble, bootstraps are created so that at least 20 point estimates can be +made. +\item \code{model}: Results are computed at the model level. This means that, for +example, bias-corrected estimates of model performance are assessed by +creating (at least) 20 bootstraps and computing the performance of the model +for each bootstrap. +} + +Note that each level of detail has a different interpretation for bootstrap +confidence intervals. For \code{ensemble} and \code{model} these are the confidence +intervals for the ensemble and an individual model, respectively. That is, +the confidence interval describes the range where an estimate produced by a +respective ensemble or model trained on a repeat of the experiment may be +found with the probability of the confidence level. For \code{hybrid}, it +represents the range where any single model trained on a repeat of the +experiment may be found with the probability of the confidence level. By +definition, confidence intervals obtained using \code{hybrid} are at least as +wide as those for \code{ensemble}. \code{hybrid} offers the correct interpretation if +the goal of the analysis is to assess the result of a single, unspecified, +model. + +\code{hybrid} is generally computationally less expensive then \code{ensemble}, which +in turn is somewhat less expensive than \code{model}. + +A non-default \code{detail_level} parameter can be specified for separate +evaluation steps by providing a parameter value in a named list with data +elements, e.g. \code{list("auc_data"="ensemble", "model_performance"="hybrid")}. +This parameter can be set for the following data elements: \code{auc_data}, +\code{decision_curve_analyis}, \code{model_performance}, \code{permutation_vimp}, +\code{ice_data}, \code{prediction_data} and \code{confusion_matrix}.} + +\item{estimation_type}{(\emph{optional}) Sets the type of estimation that should be +possible. This has the following options: +\itemize{ +\item \code{point}: Point estimates. +\item \code{bias_correction} or \code{bc}: Bias-corrected estimates. A bias-corrected +estimate is computed from (at least) 20 point estimates, and \code{familiar} may +bootstrap the data to create them. +\item \code{bootstrap_confidence_interval} or \code{bci} (default): Bias-corrected +estimates with bootstrap confidence intervals (Efron and Hastie, 2016). The +number of point estimates required depends on the \code{confidence_level} +parameter, and \code{familiar} may bootstrap the data to create them. +} + +As with \code{detail_level}, a non-default \code{estimation_type} parameter can be +specified for separate evaluation steps by providing a parameter value in a +named list with data elements, e.g. \code{list("auc_data"="bci", "model_performance"="point")}. 
This parameter can be set for the following +data elements: \code{auc_data}, \code{decision_curve_analyis}, \code{model_performance}, +\code{permutation_vimp}, \code{ice_data}, and \code{prediction_data}.} + +\item{aggregate_results}{(\emph{optional}) Flag that signifies whether results +should be aggregated during evaluation. If \code{estimation_type} is +\code{bias_correction} or \code{bc}, aggregation leads to a single bias-corrected +estimate. If \code{estimation_type} is \code{bootstrap_confidence_interval} or \code{bci}, +aggregation leads to a single bias-corrected estimate with lower and upper +boundaries of the confidence interval. This has no effect if +\code{estimation_type} is \code{point}. + +The default value is equal to \code{TRUE} except when assessing metrics to assess +model performance, as the default violin plot requires underlying data. + +As with \code{detail_level} and \code{estimation_type}, a non-default +\code{aggregate_results} parameter can be specified for separate evaluation steps +by providing a parameter value in a named list with data elements, e.g. +\code{list("auc_data"=TRUE, , "model_performance"=FALSE)}. This parameter exists +for the same elements as \code{estimation_type}.} + +\item{confidence_level}{(\emph{optional}) Numeric value for the level at which +confidence intervals are determined. In the case bootstraps are used to +determine the confidence intervals bootstrap estimation, \code{familiar} uses the +rule of thumb \eqn{n = 20 / ci.level} to determine the number of required +bootstraps. + +The default value is \code{0.95}.} + +\item{message_indent}{Number of indentation steps for messages shown during +computation and extraction of various data elements.} + +\item{verbose}{Flag to indicate whether feedback should be provided on the +computation and extraction of various data elements.} + +\item{...}{Unused arguments.} +} +\value{ +A list with single-model and ensemble predictions. +} +\description{ +Collects predicted values from models in a \code{familiarEnsemble}. +} +\keyword{internal} diff --git a/man/extract_risk_stratification_data.Rd b/man/extract_risk_stratification_data.Rd new file mode 100644 index 00000000..898b637b --- /dev/null +++ b/man/extract_risk_stratification_data.Rd @@ -0,0 +1,114 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FamiliarDataComputationRiskStratificationData.R +\name{extract_risk_stratification_data} +\alias{extract_risk_stratification_data} +\title{Internal function to extract stratification data.} +\usage{ +extract_risk_stratification_data( + object, + data, + cl = NULL, + is_pre_processed = FALSE, + ensemble_method = waiver(), + detail_level = waiver(), + confidence_level = waiver(), + message_indent = 0L, + verbose = FALSE, + ... +) +} +\arguments{ +\item{object}{A \code{familiarEnsemble} object, which is an ensemble of one or more +\code{familiarModel} objects.} + +\item{data}{A \code{dataObject} object, \code{data.table} or \code{data.frame} that +constitutes the data that are assessed.} + +\item{cl}{Cluster created using the \code{parallel} package. This cluster is then +used to speed up computation through parallellisation.} + +\item{is_pre_processed}{Flag that indicates whether the data was already +pre-processed externally, e.g. normalised and clustered. Only used if the +\code{data} argument is a \code{data.table} or \code{data.frame}.} + +\item{ensemble_method}{Method for ensembling predictions from models for the +same sample. 
Available methods are: +\itemize{ +\item \code{median} (default): Use the median of the predicted values as the ensemble +value for a sample. +\item \code{mean}: Use the mean of the predicted values as the ensemble value for a +sample. +}} + +\item{detail_level}{(\emph{optional}) Sets the level at which results are computed +and aggregated. +\itemize{ +\item \code{ensemble}: Results are computed at the ensemble level, i.e. over all +models in the ensemble. This means that, for example, bias-corrected +estimates of model performance are assessed by creating (at least) 20 +bootstraps and computing the model performance of the ensemble model for +each bootstrap. +\item \code{hybrid} (default): Results are computed at the level of models in an +ensemble. This means that, for example, bias-corrected estimates of model +performance are directly computed using the models in the ensemble. If there +are at least 20 trained models in the ensemble, performance is computed for +each model, in contrast to \code{ensemble} where performance is computed for the +ensemble of models. If there are less than 20 trained models in the +ensemble, bootstraps are created so that at least 20 point estimates can be +made. +\item \code{model}: Results are computed at the model level. This means that, for +example, bias-corrected estimates of model performance are assessed by +creating (at least) 20 bootstraps and computing the performance of the model +for each bootstrap. +} + +Note that each level of detail has a different interpretation for bootstrap +confidence intervals. For \code{ensemble} and \code{model} these are the confidence +intervals for the ensemble and an individual model, respectively. That is, +the confidence interval describes the range where an estimate produced by a +respective ensemble or model trained on a repeat of the experiment may be +found with the probability of the confidence level. For \code{hybrid}, it +represents the range where any single model trained on a repeat of the +experiment may be found with the probability of the confidence level. By +definition, confidence intervals obtained using \code{hybrid} are at least as +wide as those for \code{ensemble}. \code{hybrid} offers the correct interpretation if +the goal of the analysis is to assess the result of a single, unspecified, +model. + +\code{hybrid} is generally computationally less expensive then \code{ensemble}, which +in turn is somewhat less expensive than \code{model}. + +A non-default \code{detail_level} parameter can be specified for separate +evaluation steps by providing a parameter value in a named list with data +elements, e.g. \code{list("auc_data"="ensemble", "model_performance"="hybrid")}. +This parameter can be set for the following data elements: \code{auc_data}, +\code{decision_curve_analyis}, \code{model_performance}, \code{permutation_vimp}, +\code{ice_data}, \code{prediction_data} and \code{confusion_matrix}.} + +\item{confidence_level}{(\emph{optional}) Numeric value for the level at which +confidence intervals are determined. In the case bootstraps are used to +determine the confidence intervals bootstrap estimation, \code{familiar} uses the +rule of thumb \eqn{n = 20 / ci.level} to determine the number of required +bootstraps. 
+ +The default value is \code{0.95}.} + +\item{message_indent}{Number of indentation steps for messages shown during +computation and extraction of various data elements.} + +\item{verbose}{Flag to indicate whether feedback should be provided on the +computation and extraction of various data elements.} + +\item{...}{Unused arguments.} +} +\value{ +A list with data.tables containing information concerning risk group +stratification. +} +\description{ +Computes and extracts stratification data from a +\code{familiarEnsemble} object. This includes the data required to draw +Kaplan-Meier plots, as well as logrank and hazard-ratio tests between the +respective risk groups. +} +\keyword{internal} diff --git a/man/extract_risk_stratification_info.Rd b/man/extract_risk_stratification_info.Rd new file mode 100644 index 00000000..8b15056a --- /dev/null +++ b/man/extract_risk_stratification_info.Rd @@ -0,0 +1,78 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FamiliarDataComputationRiskStratificationInfo.R +\name{extract_risk_stratification_info} +\alias{extract_risk_stratification_info} +\title{Internal function to extract risk stratification info from data.} +\usage{ +extract_risk_stratification_info( + object, + detail_level = waiver(), + message_indent = 0L, + verbose = FALSE, + ... +) +} +\arguments{ +\item{object}{A \code{familiarEnsemble} object, which is an ensemble of one or more +\code{familiarModel} objects.} + +\item{detail_level}{(\emph{optional}) Sets the level at which results are computed +and aggregated. +\itemize{ +\item \code{ensemble}: Results are computed at the ensemble level, i.e. over all +models in the ensemble. This means that, for example, bias-corrected +estimates of model performance are assessed by creating (at least) 20 +bootstraps and computing the model performance of the ensemble model for +each bootstrap. +\item \code{hybrid} (default): Results are computed at the level of models in an +ensemble. This means that, for example, bias-corrected estimates of model +performance are directly computed using the models in the ensemble. If there +are at least 20 trained models in the ensemble, performance is computed for +each model, in contrast to \code{ensemble} where performance is computed for the +ensemble of models. If there are less than 20 trained models in the +ensemble, bootstraps are created so that at least 20 point estimates can be +made. +\item \code{model}: Results are computed at the model level. This means that, for +example, bias-corrected estimates of model performance are assessed by +creating (at least) 20 bootstraps and computing the performance of the model +for each bootstrap. +} + +Note that each level of detail has a different interpretation for bootstrap +confidence intervals. For \code{ensemble} and \code{model} these are the confidence +intervals for the ensemble and an individual model, respectively. That is, +the confidence interval describes the range where an estimate produced by a +respective ensemble or model trained on a repeat of the experiment may be +found with the probability of the confidence level. For \code{hybrid}, it +represents the range where any single model trained on a repeat of the +experiment may be found with the probability of the confidence level. By +definition, confidence intervals obtained using \code{hybrid} are at least as +wide as those for \code{ensemble}. 
\code{hybrid} offers the correct interpretation if +the goal of the analysis is to assess the result of a single, unspecified, +model. + +\code{hybrid} is generally computationally less expensive then \code{ensemble}, which +in turn is somewhat less expensive than \code{model}. + +A non-default \code{detail_level} parameter can be specified for separate +evaluation steps by providing a parameter value in a named list with data +elements, e.g. \code{list("auc_data"="ensemble", "model_performance"="hybrid")}. +This parameter can be set for the following data elements: \code{auc_data}, +\code{decision_curve_analyis}, \code{model_performance}, \code{permutation_vimp}, +\code{ice_data}, \code{prediction_data} and \code{confusion_matrix}.} + +\item{message_indent}{Number of indentation steps for messages shown during +computation and extraction of various data elements.} + +\item{verbose}{Flag to indicate whether feedback should be provided on the +computation and extraction of various data elements.} + +\item{...}{Unused arguments.} +} +\value{ +A list of familiarDataElements with risk stratification information. +} +\description{ +Collects risk stratification information. +} +\keyword{internal} diff --git a/man/extract_sample_similarity.Rd b/man/extract_sample_similarity.Rd new file mode 100644 index 00000000..80bfdcac --- /dev/null +++ b/man/extract_sample_similarity.Rd @@ -0,0 +1,98 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FamiliarDataComputationSampleSimilarity.R +\name{extract_sample_similarity} +\alias{extract_sample_similarity} +\title{Internal function to extract the sample distance table.} +\usage{ +extract_sample_similarity( + object, + data, + cl = NULL, + is_pre_processed = FALSE, + sample_limit = waiver(), + sample_cluster_method = waiver(), + sample_linkage_method = waiver(), + sample_similarity_metric = waiver(), + verbose = FALSE, + message_indent = 0L, + ... +) +} +\arguments{ +\item{object}{A \code{familiarEnsemble} object, which is an ensemble of one or more +\code{familiarModel} objects.} + +\item{data}{A \code{dataObject} object, \code{data.table} or \code{data.frame} that +constitutes the data that are assessed.} + +\item{cl}{Cluster created using the \code{parallel} package. This cluster is then +used to speed up computation through parallellisation.} + +\item{is_pre_processed}{Flag that indicates whether the data was already +pre-processed externally, e.g. normalised and clustered. Only used if the +\code{data} argument is a \code{data.table} or \code{data.frame}.} + +\item{sample_limit}{(\emph{optional}) Set the upper limit of the number of samples +that are used during evaluation steps. Cannot be less than 20. + +This setting can be specified per data element by providing a parameter +value in a named list with data elements, e.g. +\code{list("sample_similarity"=100, "permutation_vimp"=1000)}. + +This parameter can be set for the following data elements: +\code{sample_similarity} and \code{ice_data}.} + +\item{sample_cluster_method}{The method used to perform clustering based on +distance between samples. These are the same methods as for the +\code{cluster_method} configuration parameter: \code{hclust}, \code{agnes}, \code{diana} and +\code{pam}. + +\code{none} cannot be used when extracting data for feature expressions. 
+ +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + +\item{sample_linkage_method}{The method used for agglomerative clustering in +\code{hclust} and \code{agnes}. These are the same methods as for the +\code{cluster_linkage_method} configuration parameter: \code{average}, \code{single}, +\code{complete}, \code{weighted}, and \code{ward}. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + +\item{sample_similarity_metric}{Metric to determine pairwise similarity +between samples. Similarity is computed in the same manner as for +clustering, but \code{sample_similarity_metric} has different options that are +better suited to computing distance between samples instead of between +features: \code{gower}, \code{euclidean}. + +The underlying feature data is scaled to the \eqn{[0, 1]} range (for +numerical features) using the feature values across the samples. The +normalisation parameters required can optionally be computed from feature +data with the outer 5\% (on both sides) of feature values trimmed or +winsorised. To do so append \verb{_trim} (trimming) or \verb{_winsor} (winsorising) to +the metric name. This reduces the effect of outliers somewhat. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + +\item{verbose}{Flag to indicate whether feedback should be provided on the +computation and extraction of various data elements.} + +\item{message_indent}{Number of indentation steps for messages shown during +computation and extraction of various data elements.} + +\item{...}{Unused arguments.} +} +\value{ +A data.table containing pairwise distance between samples. This data +is only the upper triangular of the complete matrix (i.e. the sparse +unitriangular representation). Diagonals will always be 0.0 and the lower +triangular is mirrored. +} +\description{ +Computes and extracts the sample distance table for samples +analysed using a \code{familiarEnsemble} object to form a \code{familiarData} object. This table can be used to cluster +samples, and is exported directly by \code{extract_feature_expression}. +} +\keyword{internal} diff --git a/man/extract_univariate_analysis.Rd b/man/extract_univariate_analysis.Rd new file mode 100644 index 00000000..c4ca6070 --- /dev/null +++ b/man/extract_univariate_analysis.Rd @@ -0,0 +1,104 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FamiliarDataComputationUnivariateAnalysis.R +\name{extract_univariate_analysis} +\alias{extract_univariate_analysis} +\title{Internal function to extract data from a univariate analysis.} +\usage{ +extract_univariate_analysis( + object, + data, + cl = NULL, + icc_type = waiver(), + feature_similarity = NULL, + feature_cluster_method = waiver(), + feature_cluster_cut_method = waiver(), + feature_linkage_method = waiver(), + feature_similarity_threshold = waiver(), + feature_similarity_metric = waiver(), + message_indent = 0L, + verbose = FALSE, + ... +) +} +\arguments{ +\item{object}{A \code{familiarEnsemble} object, which is an ensemble of one or more +\code{familiarModel} objects.} + +\item{data}{A \code{dataObject} object, \code{data.table} or \code{data.frame} that +constitutes the data that are assessed.} + +\item{cl}{Cluster created using the \code{parallel} package. 
This cluster is then +used to speed up computation through parallellisation.} + +\item{icc_type}{String indicating the type of intraclass correlation +coefficient (\code{1}, \code{2} or \code{3}) that should be used to compute robustness for +features in repeated measurements during the evaluation of univariate +importance. These types correspond to the types in Shrout and Fleiss (1979). +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + +\item{feature_cluster_method}{The method used to perform clustering. These are +the same methods as for the \code{cluster_method} configuration parameter: +\code{none}, \code{hclust}, \code{agnes}, \code{diana} and \code{pam}. + +\code{none} cannot be used when extracting data regarding mutual correlation or +feature expressions. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + +\item{feature_cluster_cut_method}{The method used to divide features into +separate clusters. The available methods are the same as for the +\code{cluster_cut_method} configuration parameter: \code{silhouette}, \code{fixed_cut} and +\code{dynamic_cut}. + +\code{silhouette} is available for all cluster methods, but \code{fixed_cut} only +applies to methods that create hierarchical trees (\code{hclust}, \code{agnes} and +\code{diana}). \code{dynamic_cut} requires the \code{dynamicTreeCut} package and can only +be used with \code{agnes} and \code{hclust}. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + +\item{feature_linkage_method}{The method used for agglomerative clustering in +\code{hclust} and \code{agnes}. These are the same methods as for the +\code{cluster_linkage_method} configuration parameter: \code{average}, \code{single}, +\code{complete}, \code{weighted}, and \code{ward}. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + +\item{feature_similarity_threshold}{The threshold level for pair-wise +similarity that is required to form feature clusters with the \code{fixed_cut} +method. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + +\item{feature_similarity_metric}{Metric to determine pairwise similarity +between features. Similarity is computed in the same manner as for +clustering, and \code{feature_similarity_metric} therefore has the same options +as \code{cluster_similarity_metric}: \code{mcfadden_r2}, \code{cox_snell_r2}, +\code{nagelkerke_r2}, \code{spearman}, \code{kendall} and \code{pearson}. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + +\item{message_indent}{Number of indentation steps for messages shown during +computation and extraction of various data elements.} + +\item{verbose}{Flag to indicate whether feedback should be provided on the +computation and extraction of various data elements.} + +\item{...}{Unused arguments.} +} +\value{ +A list with a data.table containing information concerning the +univariate analysis of important features. +} +\description{ +Computes and extracts univariate analysis for the features used +in a \code{familiarEnsemble} object. 
This assessment includes the computation of
+p and q-values, as well as robustness (in case of repeated measurements).
+}
+\keyword{internal}
diff --git a/man/familiar.Rd b/man/familiar.Rd
new file mode 100644
index 00000000..739fcdba
--- /dev/null
+++ b/man/familiar.Rd
@@ -0,0 +1,50 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/Familiar.R
+\docType{package}
+\name{familiar}
+\alias{familiar-package}
+\alias{familiar}
+\title{familiar: Fully Automated Machine Learning with Interpretable Analysis of Results}
+\description{
+End-to-end, automated machine learning package for creating
+trustworthy and interpretable machine learning models. Familiar supports
+modelling of regression, categorical and time-to-event (survival) outcomes.
+Models created using familiar are self-contained, and their use does not
+require additional information such as baseline survival, feature
+clustering, or feature transformation and normalisation parameters. In
+addition, a novelty or out-of-distribution detector is trained
+simultaneously and contained with every model. Model performance,
+calibration, risk group stratification, (permutation) variable importance,
+individual conditional expectation, partial dependence, and more, are
+assessed automatically as part of the evaluation process, exported in
+tabular format and plotted, and may also be computed manually using export
+and plot functions. Where possible, metrics and values obtained during the
+evaluation process come with confidence intervals.
+}
+\seealso{
+Useful links:
+\itemize{
+  \item \url{https://github.com/alexzwanenburg/familiar}
+  \item Report bugs at \url{https://github.com/alexzwanenburg/familiar/issues}
+}
+
+}
+\author{
+\strong{Maintainer}: Alex Zwanenburg \email{alexander.zwanenburg@nct-dresden.de} (\href{https://orcid.org/0000-0002-0342-9545}{ORCID})
+
+Authors:
+\itemize{
+  \item Steffen Löck
+}
+
+Other contributors:
+\itemize{
+  \item Stefan Leger [contributor]
+  \item Iram Shahzadi [contributor]
+  \item Asier Rabasco Meneghetti [contributor]
+  \item Sebastian Starke [contributor]
+  \item Technische Universität Dresden [copyright holder]
+  \item German Cancer Research Center (DKFZ) [copyright holder]
+}
+
+}
diff --git a/man/familiarCollection-class.Rd b/man/familiarCollection-class.Rd
new file mode 100644
index 00000000..e18184e8
--- /dev/null
+++ b/man/familiarCollection-class.Rd
@@ -0,0 +1,116 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/FamiliarS4Classes.R
+\docType{class}
+\name{familiarCollection-class}
+\alias{familiarCollection-class}
+\title{Collection of familiar data.}
+\description{
+A familiarCollection object aggregates data from one or more familiarData
+objects.
+} +\section{Slots}{ + +\describe{ +\item{\code{name}}{Name of the collection.} + +\item{\code{data_sets}}{Name of the individual underlying datasets.} + +\item{\code{outcome_type}}{Outcome type for which the collection was created.} + +\item{\code{outcome_info}}{Outcome information object, which contains information +concerning the outcome, such as class levels.} + +\item{\code{fs_vimp}}{Variable importance data collected by feature selection +methods.} + +\item{\code{model_vimp}}{Variable importance data collected from model-specific +algorithms implemented by models created by familiar.} + +\item{\code{permutation_vimp}}{Data collected for permutation variable importance.} + +\item{\code{hyperparameters}}{Hyperparameters collected from created models.} + +\item{\code{hyperparameter_data}}{Additional data concerning hyperparameters. This is +currently not used yet.} + +\item{\code{required_features}}{The set of features required for complete +reproduction, i.e. with imputation.} + +\item{\code{model_features}}{The set of features that are required for using the +model, but without imputation.} + +\item{\code{learner}}{Learning algorithm(s) used for data in the collection.} + +\item{\code{fs_method}}{Feature selection method(s) used for data in the collection.} + +\item{\code{prediction_data}}{Model predictions for the data in the collection.} + +\item{\code{confusion_matrix}}{Confusion matrix information for the data in the +collection.} + +\item{\code{decision_curve_data}}{Decision curve analysis data for the data in the +collection.} + +\item{\code{calibration_info}}{Calibration information, e.g. baseline survival in the +development cohort.} + +\item{\code{calibration_data}}{Model calibration data collected from data in the +collection.} + +\item{\code{model_performance}}{Collection of model performance data for data in the +collection.} + +\item{\code{km_info}}{Information concerning risk-stratification cut-off values for +data in the collection.} + +\item{\code{km_data}}{Kaplan-Meier survival data for data in the collection.} + +\item{\code{auc_data}}{AUC-ROC and AUC-PR data for data in the collection.} + +\item{\code{ice_data}}{Individual conditional expectation data for data in the +collection. Partial dependence data are computed on the fly from these +data.} + +\item{\code{univariate_analysis}}{Univariate analysis results of data in the +collection.} + +\item{\code{feature_expressions}}{Feature expression values for data in the +collection.} + +\item{\code{feature_similarity}}{Feature similarity information for data in the +collection.} + +\item{\code{sample_similarity}}{Sample similarity information for data in the +collection.} + +\item{\code{data_set_labels}}{Labels for the different datasets in the collection. +See \code{get_data_set_names} and \code{set_data_set_names}.} + +\item{\code{learner_labels}}{Labels for the different learning algorithms used to +create the collection. See \code{get_learner_names} and \code{set_learner_names}.} + +\item{\code{fs_method_labels}}{Labels for the different feature selection methods +used to create the collection. See \code{get_fs_method_names} and +\code{set_fs_method_names}.} + +\item{\code{feature_labels}}{Labels for the features in this collection. See +\code{get_feature_names} and \code{set_feature_names}.} + +\item{\code{km_group_labels}}{Labels for the risk strata in this collection. See +\code{get_risk_group_names} and \code{set_risk_group_names}.} + +\item{\code{class_labels}}{Labels of the response variable. 
See \code{get_class_names} and
+\code{set_class_names}.}
+
+\item{\code{project_id}}{Identifier of the project that generated this collection.}
+
+\item{\code{familiar_version}}{Version of the familiar package.
+
+familiarCollection objects collect data from one or more familiarData
+objects. These objects are important, as all plotting and export functions
+use them. The fact that one can supply familiarModel, familiarEnsemble and
+familiarData objects as arguments for these methods is because familiar
+internally converts these into familiarCollection objects prior to executing
+the method.}
+}}
+
diff --git a/man/familiarData-class.Rd b/man/familiarData-class.Rd
new file mode 100644
index 00000000..be376c6e
--- /dev/null
+++ b/man/familiarData-class.Rd
@@ -0,0 +1,105 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/FamiliarS4Classes.R
+\docType{class}
+\name{familiarData-class}
+\alias{familiarData-class}
+\title{Dataset obtained after evaluating models on a dataset.}
+\description{
+A familiarData object is created by evaluating familiarEnsemble or
+familiarModel objects on a dataset. Multiple familiarData objects are
+aggregated in a familiarCollection object.
+}
+\section{Slots}{
+
+\describe{
+\item{\code{name}}{Name of the dataset, e.g. training or internal validation.}
+
+\item{\code{outcome_type}}{Outcome type of the data used to create the object.}
+
+\item{\code{outcome_info}}{Outcome information object, which contains additional
+information concerning the outcome, such as class levels.}
+
+\item{\code{fs_vimp}}{Variable importance data collected from feature selection
+methods.}
+
+\item{\code{model_vimp}}{Variable importance data collected from model-specific
+algorithms implemented by models created by familiar.}
+
+\item{\code{permutation_vimp}}{Data collected for permutation variable importance.}
+
+\item{\code{hyperparameters}}{Hyperparameters collected from created models.}
+
+\item{\code{hyperparameter_data}}{Additional data concerning hyperparameters. This is
+currently not used yet.}
+
+\item{\code{required_features}}{The set of features required for complete
+reproduction, i.e. with imputation.}
+
+\item{\code{model_features}}{The set of features that are required for using the
+model or ensemble of models, but without imputation.}
+
+\item{\code{learner}}{Learning algorithm used to create the model or ensemble of
+models.}
+
+\item{\code{fs_method}}{Feature selection method used to determine variable
+importance for the model or ensemble of models.}
+
+\item{\code{pooling_table}}{Run table for the data underlying the familiarData
+object. Used internally.}
+
+\item{\code{prediction_data}}{Model predictions for a model or ensemble of models for
+the underlying dataset.}
+
+\item{\code{confusion_matrix}}{Confusion matrix for a model or ensemble of models,
+based on the underlying dataset.}
+
+\item{\code{decision_curve_data}}{Decision curve analysis data for a model or
+ensemble of models, based on the underlying dataset.}
+
+\item{\code{calibration_info}}{Calibration information, e.g.
baseline survival in the +development cohort.} + +\item{\code{calibration_data}}{Calibration data for a model or ensemble of models, +based on the underlying dataset.} + +\item{\code{model_performance}}{Model performance data for a model or ensemble of +models, based on the underlying dataset.} + +\item{\code{km_info}}{Information concerning risk-stratification cut-off values..} + +\item{\code{km_data}}{Kaplan-Meier survival data for a model or ensemble of models, +based on the underlying dataset.} + +\item{\code{auc_data}}{AUC-ROC and AUC-PR data for a model or ensemble of models, +based on the underlying dataset.} + +\item{\code{ice_data}}{Individual conditional expectation data for features included +in a model or ensemble of models, based on the underlying dataset. Partial +dependence data are computed on the fly from these data.} + +\item{\code{univariate_analysis}}{Univariate analysis of the underlying dataset.} + +\item{\code{feature_expressions}}{Feature expression values of the underlying +dataset.} + +\item{\code{feature_similarity}}{Feature similarity information of the underlying +dataset.} + +\item{\code{sample_similarity}}{Sample similarity information of the underlying +dataset.} + +\item{\code{is_validation}}{Signifies whether the underlying data forms a validation +dataset. Used internally.} + +\item{\code{generating_ensemble}}{Name of the ensemble that was used to generate the +familiarData object.} + +\item{\code{project_id}}{Identifier of the project that generated the familiarData +object.} + +\item{\code{familiar_version}}{Version of the familiar package. + +familiarData objects contain information obtained by evaluating a single +model or single ensemble of models on a dataset.} +}} + diff --git a/man/familiarDataElement-class.Rd b/man/familiarDataElement-class.Rd new file mode 100644 index 00000000..d8332af4 --- /dev/null +++ b/man/familiarDataElement-class.Rd @@ -0,0 +1,107 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FamiliarS4Classes.R +\docType{class} +\name{familiarDataElement-class} +\alias{familiarDataElement-class} +\title{Data container for evaluation data.} +\description{ +Most attributes of the familiarData object are objects of the +familiarDataElement class. This (super-)class is used to allow for +standardised aggregation and processing of evaluation data. +} +\section{Slots}{ + +\describe{ +\item{\code{data}}{Evaluation data, typically a data.table or list.} + +\item{\code{identifiers}}{Identifiers of the data, e.g. the generating model name, +learner, etc.} + +\item{\code{detail_level}}{Sets the level at which results are computed and +aggregated. +\itemize{ +\item \code{ensemble}: Results are computed at the ensemble level, i.e. over all +models in the ensemble. This means that, for example, bias-corrected +estimates of model performance are assessed by creating (at least) 20 +bootstraps and computing the model performance of the ensemble model for +each bootstrap. +\item \code{hybrid} (default): Results are computed at the level of models in an +ensemble. This means that, for example, bias-corrected estimates of model +performance are directly computed using the models in the ensemble. If there +are at least 20 trained models in the ensemble, performance is computed for +each model, in contrast to \code{ensemble} where performance is computed for the +ensemble of models. If there are less than 20 trained models in the +ensemble, bootstraps are created so that at least 20 point estimates can be +made. 
+\item \code{model}: Results are computed at the model level. This means that, for +example, bias-corrected estimates of model performance are assessed by +creating (at least) 20 bootstraps and computing the performance of the model +for each bootstrap. +} + +Note that each level of detail has a different interpretation for bootstrap +confidence intervals. For \code{ensemble} and \code{model} these are the confidence +intervals for the ensemble and an individual model, respectively. That is, +the confidence interval describes the range where an estimate produced by a +respective ensemble or model trained on a repeat of the experiment may be +found with the probability of the confidence level. For \code{hybrid}, it +represents the range where any single model trained on a repeat of the +experiment may be found with the probability of the confidence level. By +definition, confidence intervals obtained using \code{hybrid} are at least as +wide as those for \code{ensemble}. \code{hybrid} offers the correct interpretation if +the goal of the analysis is to assess the result of a single, unspecified, +model. + +Some child classes do not use this parameter.} + +\item{\code{estimation_type}}{Sets the type of estimation that should be possible. +This has the following options: +\itemize{ +\item \code{point}: Point estimates. +\item \code{bias_correction} or \code{bc}: Bias-corrected estimates. A bias-corrected +estimate is computed from (at least) 20 point estimates, and \code{familiar} may +bootstrap the data to create them. +\item \code{bootstrap_confidence_interval} or \code{bci} (default): Bias-corrected +estimates with bootstrap confidence intervals (Efron and Hastie, 2016). The +number of point estimates required depends on the \code{confidence_level} +parameter, and \code{familiar} may bootstrap the data to create them. +} + +Some child classes do not use this parameter.} + +\item{\code{confidence_level}}{(\emph{optional}) Numeric value for the level at which +confidence intervals are determined. In the case bootstraps are used to +determine the confidence intervals bootstrap estimation, \code{familiar} uses the +rule of thumb \eqn{n = 20 / ci.level} to determine the number of required +bootstraps.} + +\item{\code{bootstrap_ci_method}}{Method used to determine bootstrap confidence +intervals (Efron and Hastie, 2016). The following methods are implemented: +\itemize{ +\item \code{percentile} (default): Confidence intervals obtained using the percentile +method. +\item \code{bc}: Bias-corrected confidence intervals. +} + +Note that the standard method is not implemented because this method is +often not suitable due to non-normal distributions. The bias-corrected and +accelerated (BCa) method is not implemented yet.} + +\item{\code{value_column}}{Identifies column(s) in the \code{data} attribute presenting +values.} + +\item{\code{grouping_column}}{Identifies column(s) in the \code{data} attribute presenting +identifier columns for grouping during aggregation. Familiar will +automatically assign items from the \code{identifiers} attribute to the data and +this attribute when combining multiple familiarDataElements of the same +(child) class.} + +\item{\code{is_aggregated}}{Defines whether the object was aggregated.} +}} + +\references{ +\enumerate{ +\item Efron, B. & Hastie, T. Computer Age Statistical Inference. +(Cambridge University Press, 2016). 
+} +} diff --git a/man/familiarEnsemble-class.Rd b/man/familiarEnsemble-class.Rd new file mode 100644 index 00000000..5a414530 --- /dev/null +++ b/man/familiarEnsemble-class.Rd @@ -0,0 +1,65 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FamiliarS4Classes.R +\docType{class} +\name{familiarEnsemble-class} +\alias{familiarEnsemble-class} +\title{Ensemble of familiar models.} +\description{ +A familiarEnsemble object contains one or more familiarModel objects. +} +\section{Slots}{ + +\describe{ +\item{\code{name}}{Name of the familiarEnsemble object.} + +\item{\code{model_list}}{List of attached familiarModel objects, or paths to these +objects. Familiar attaches familiarModel objects when required.} + +\item{\code{outcome_type}}{Outcome type of the data used to create the object.} + +\item{\code{outcome_info}}{Outcome information object, which contains additional +information concerning the outcome, such as class levels.} + +\item{\code{data_column_info}}{Data information object containing information +regarding identifier column names and outcome column names.} + +\item{\code{learner}}{Learning algorithm used to create the models in the ensemble.} + +\item{\code{fs_method}}{Feature selection method used to determine variable +importance for the models in the ensemble.} + +\item{\code{feature_info}}{List of objects containing feature information, e.g., +name, class levels, transformation, normalisation and clustering +parameters.} + +\item{\code{required_features}}{The set of features required for complete +reproduction, i.e. with imputation.} + +\item{\code{model_features}}{The combined set of features that is used to train the +models in the ensemble,} + +\item{\code{novelty_features}}{The combined set of features that is used to train +all novelty detectors in the ensemble.} + +\item{\code{run_table}}{Run table for the data used to train the ensemble. Used +internally.} + +\item{\code{calibration_info}}{Calibration information, e.g. baseline survival in the +development cohort.} + +\item{\code{model_dir_path}}{Path to folder containing the familiarModel objects. Can +be updated using the \code{update_model_dir_path} method.} + +\item{\code{auto_detach}}{Flag used to determine whether models should be detached +from the model after use, or not. Used internally.} + +\item{\code{settings}}{A copy of the evaluation configuration parameters used at +model creation. These are used as default parameters when evaluating the +ensemble to create a familiarData object.} + +\item{\code{project_id}}{Identifier of the project that generated the +underlying familiarModel object(s).} + +\item{\code{familiar_version}}{Version of the familiar package.} +}} + diff --git a/man/familiarHyperparameterLearner-class.Rd b/man/familiarHyperparameterLearner-class.Rd new file mode 100644 index 00000000..ba8e8ed6 --- /dev/null +++ b/man/familiarHyperparameterLearner-class.Rd @@ -0,0 +1,53 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FamiliarS4Classes.R +\docType{class} +\name{familiarHyperparameterLearner-class} +\alias{familiarHyperparameterLearner-class} +\title{Hyperparameter learner.} +\description{ +A familiarHyperparameterLearner object is a self-contained model that can be +applied to predict optimisation scores for a set of hyperparameters. +} +\details{ +Hyperparameter learners are used to infer the optimisation score for +sets of hyperparameters. 
These are then used to either infer utility using
+acquisition functions or to generate summary scores to identify the optimal
+model.
+}
+\section{Slots}{
+
+\describe{
+\item{\code{name}}{Name of the familiarHyperparameterLearner object.}
+
+\item{\code{learner}}{Algorithm used to create the hyperparameter learner.}
+
+\item{\code{target_learner}}{Algorithm for which the hyperparameters are being
+learned.}
+
+\item{\code{target_outcome_type}}{Outcome type of the learner for which
+hyperparameters are being modeled. Used to determine the target
+hyperparameters.}
+
+\item{\code{optimisation_metric}}{One or more metrics used to generate the
+optimisation score.}
+
+\item{\code{optimisation_function}}{Function used to generate the optimisation score.}
+
+\item{\code{model}}{The actual model trained using the specific algorithm, e.g. an
+isolation forest from the \code{isotree} package.}
+
+\item{\code{target_hyperparameters}}{The names of the hyperparameters that are used
+to train the hyperparameter learner.}
+
+\item{\code{project_id}}{Identifier of the project that generated the
+familiarHyperparameterLearner object.}
+
+\item{\code{familiar_version}}{Version of the familiar package.}
+
+\item{\code{package}}{Name of package(s) required to execute the hyperparameter
+learner itself, e.g. \code{laGP}.}
+
+\item{\code{package_version}}{Version of the packages mentioned in the \code{package}
+attribute.}
+}}
+
diff --git a/man/familiarMetric-class.Rd b/man/familiarMetric-class.Rd
new file mode 100644
index 00000000..349df7dc
--- /dev/null
+++ b/man/familiarMetric-class.Rd
@@ -0,0 +1,29 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/FamiliarS4Classes.R
+\docType{class}
+\name{familiarMetric-class}
+\alias{familiarMetric-class}
+\title{Model performance metric.}
+\description{
+Superclass for model performance objects.
+}
+\section{Slots}{
+
+\describe{
+\item{\code{metric}}{Performance metric.}
+
+\item{\code{outcome_type}}{Type of outcome being predicted.}
+
+\item{\code{name}}{Name of the performance metric.}
+
+\item{\code{value_range}}{Range of the performance metric. Can be half-open.}
+
+\item{\code{baseline_value}}{Value of the metric for trivial models, e.g. models that
+always predict the median value, the majority class, or the mean hazard,
+etc.}
+
+\item{\code{higher_better}}{States whether higher metric values correspond to better
+predictive model performance (e.g. accuracy) or not (e.g. root mean squared
+error).}
+}}
+
diff --git a/man/familiarModel-class.Rd b/man/familiarModel-class.Rd
new file mode 100644
index 00000000..cea8aad9
--- /dev/null
+++ b/man/familiarModel-class.Rd
@@ -0,0 +1,87 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/FamiliarS4Classes.R
+\docType{class}
+\name{familiarModel-class}
+\alias{familiarModel-class}
+\title{Familiar model.}
+\description{
+A familiarModel object is a self-contained model that can be applied to
+generate predictions for a dataset. familiarModel objects form the parent
+class of learner-specific child classes.
+}
+\section{Slots}{
+
+\describe{
+\item{\code{name}}{Name of the familiarModel object.}
+
+\item{\code{model}}{The actual model trained using a specific algorithm, e.g.
a
+random forest from the \code{ranger} package, or a LASSO model from \code{glmnet}.}
+
+\item{\code{outcome_type}}{Outcome type of the data used to create the object.}
+
+\item{\code{outcome_info}}{Outcome information object, which contains additional
+information concerning the outcome, such as class levels.}
+
+\item{\code{feature_info}}{List of objects containing feature information, e.g.,
+name, class levels, transformation, normalisation and clustering
+parameters.}
+
+\item{\code{data_column_info}}{Data information object containing information
+regarding identifier column names and outcome column names.}
+
+\item{\code{hyperparameters}}{Set of hyperparameters used to train the model.}
+
+\item{\code{hyperparameter_data}}{Information generated during hyperparameter
+optimisation.}
+
+\item{\code{calibration_model}}{One or more models used to recalibrate the model
+output. Currently only used by some models.}
+
+\item{\code{novelty_detector}}{A familiarNoveltyDetector object that can be used to
+detect out-of-distribution samples.}
+
+\item{\code{learner}}{Learning algorithm used to create the model.}
+
+\item{\code{fs_method}}{Feature selection method used to determine variable
+importance for the model.}
+
+\item{\code{required_features}}{The set of features required for complete
+reproduction, i.e. with imputation.}
+
+\item{\code{model_features}}{The set of features that is used to train the model.}
+
+\item{\code{novelty_features}}{The set of features that is used to train all novelty
+detectors in the ensemble.}
+
+\item{\code{calibration_info}}{Calibration information, e.g. baseline survival in the
+development cohort.}
+
+\item{\code{km_info}}{Data concerning stratification into risk groups.}
+
+\item{\code{run_table}}{Run table for the data used to train the model. Used
+internally.}
+
+\item{\code{settings}}{A copy of the evaluation configuration parameters used at
+model creation. These are used as default parameters when evaluating the
+model (technically, familiarEnsemble) to create a familiarData object.}
+
+\item{\code{is_trimmed}}{Flag that indicates whether the model, stored in the \code{model}
+slot, has been trimmed.}
+
+\item{\code{trimmed_function}}{List of functions whose output has been captured prior
+to trimming the model.}
+
+\item{\code{messages}}{List of warning and error messages generated during training.}
+
+\item{\code{project_id}}{Identifier of the project that generated the familiarModel
+object.}
+
+\item{\code{familiar_version}}{Version of the familiar package.}
+
+\item{\code{package}}{Name of package(s) required to execute the model itself, e.g.
+\code{ranger} or \code{glmnet}.}
+
+\item{\code{package_version}}{Version of the packages mentioned in the \code{package}
+attribute.}
+}}
+
diff --git a/man/familiarNoveltyDetector-class.Rd b/man/familiarNoveltyDetector-class.Rd
new file mode 100644
index 00000000..2fb59637
--- /dev/null
+++ b/man/familiarNoveltyDetector-class.Rd
@@ -0,0 +1,62 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/FamiliarS4Classes.R
+\docType{class}
+\name{familiarNoveltyDetector-class}
+\alias{familiarNoveltyDetector-class}
+\title{Novelty detector.}
+\description{
+A familiarNoveltyDetector object is a self-contained model that can be
+applied to generate out-of-distribution predictions for instances in a
+dataset.
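
The slot listings above map directly onto ordinary S4 slot access in R. A minimal sketch, assuming a familiarModel object was written to disk by an earlier familiar run (the file path below is hypothetical):

# Load a previously saved familiarModel object; the path is hypothetical.
model <- readRDS("experiment/trained_models/model_1.RDS")

# Inspect some of the slots documented above.
model@learner            # learning algorithm used to create the model
model@model_features     # features used to train the model
model@hyperparameters    # hyperparameters used to train the model
model@familiar_version   # version of familiar that created the object
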
+}
+\section{Slots}{
+
+\describe{
+\item{\code{name}}{Name of the familiarNoveltyDetector object.}
+
+\item{\code{learner}}{Learning algorithm used to create the novelty detector.}
+
+\item{\code{model}}{The actual novelty detector trained using a specific algorithm,
+e.g. an isolation forest from the \code{isotree} package.}
+
+\item{\code{feature_info}}{List of objects containing feature information, e.g.,
+name, class levels, transformation, normalisation and clustering
+parameters.}
+
+\item{\code{data_column_info}}{Data information object containing information
+regarding identifier column names.}
+
+\item{\code{conversion_parameters}}{Parameters used to convert raw output to
+statistical probability of being out-of-distribution. Currently unused.}
+
+\item{\code{hyperparameters}}{Set of hyperparameters used to train the detector.}
+
+\item{\code{required_features}}{The set of features required for complete
+reproduction, i.e. with imputation.}
+
+\item{\code{model_features}}{The set of features that is used to train the detector.}
+
+\item{\code{run_table}}{Run table for the data used to train the detector. Used
+internally.}
+
+\item{\code{is_trimmed}}{Flag that indicates whether the detector, stored in the
+\code{model} slot, has been trimmed.}
+
+\item{\code{trimmed_function}}{List of functions whose output has been captured prior
+to trimming the model.}
+
+\item{\code{project_id}}{Identifier of the project that generated the
+familiarNoveltyDetector object.}
+
+\item{\code{familiar_version}}{Version of the familiar package.}
+
+\item{\code{package}}{Name of package(s) required to execute the detector itself,
+e.g. \code{isotree}.}
+
+\item{\code{package_version}}{Version of the packages mentioned in the \code{package}
+attribute.
+
+Note that these objects do not contain any data concerning outcome, as this
+is not relevant for (prospective) out-of-distribution detection.}
+}}
+
diff --git a/man/familiarVimpMethod-class.Rd b/man/familiarVimpMethod-class.Rd
new file mode 100644
index 00000000..a7495472
--- /dev/null
+++ b/man/familiarVimpMethod-class.Rd
@@ -0,0 +1,44 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/FamiliarS4Classes.R
+\docType{class}
+\name{familiarVimpMethod-class}
+\alias{familiarVimpMethod-class}
+\title{Variable importance method object.}
+\description{
+The familiarVimpMethod class is the parent class for all variable importance
+methods in familiar.
+}
+\section{Slots}{
+
+\describe{
+\item{\code{outcome_type}}{Outcome type of the data to be evaluated using the object.}
+
+\item{\code{hyperparameters}}{Set of hyperparameters for the variable importance
+method.}
+
+\item{\code{vimp_method}}{The character string indicating the variable importance
+method.}
+
+\item{\code{multivariate}}{Flags whether the variable importance method is
+multivariate vs. univariate.}
+
+\item{\code{outcome_info}}{Outcome information object, which contains additional
+information concerning the outcome, such as class levels.}
+
+\item{\code{feature_info}}{List of objects containing feature information, e.g.,
+name, class levels, transformation, normalisation and clustering
+parameters.}
+
+\item{\code{required_features}}{The set of features to be assessed by the variable
+importance method.}
+
+\item{\code{package}}{Name of the package(s) required to execute the variable
+importance method.}
+
+\item{\code{run_table}}{Run table for the data to be assessed by the variable
+importance method.
Used internally.}
+
+\item{\code{project_id}}{Identifier of the project that generated the
+familiarVimpMethod object.}
+}}
+
diff --git a/man/featureInfo-class.Rd b/man/featureInfo-class.Rd
new file mode 100644
index 00000000..f559008a
--- /dev/null
+++ b/man/featureInfo-class.Rd
@@ -0,0 +1,89 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/FamiliarS4Classes.R
+\docType{class}
+\name{featureInfo-class}
+\alias{featureInfo-class}
+\title{Feature information object.}
+\description{
+A featureInfo object contains information for a single feature. This
+information is used to check data prospectively for consistency and for data
+preparation. These objects are, for instance, attached to a familiarModel
+object so that data can be pre-processed in the same way as the development
+data.
+}
+\section{Slots}{
+
+\describe{
+\item{\code{name}}{Name of the feature, which by default is the column name of the
+feature.}
+
+\item{\code{set_descriptor}}{Character string describing the set to which the feature
+belongs. Currently not used.}
+
+\item{\code{feature_type}}{Describes the feature type, i.e. \code{factor} or \code{numeric}.}
+
+\item{\code{levels}}{The class levels of categorical features. This is used to check
+prospective datasets.}
+
+\item{\code{ordered}}{Specifies whether the levels of categorical features are
+ordered.}
+
+\item{\code{distribution}}{Five-number summary (numeric) or class frequency
+(categorical).}
+
+\item{\code{data_id}}{Internal identifier for the dataset used to derive the feature
+information.}
+
+\item{\code{run_id}}{Internal identifier for the specific subset of the dataset used
+to derive the feature information.}
+
+\item{\code{in_signature}}{Specifies whether the feature is included in the model
+signature.}
+
+\item{\code{in_novelty}}{Specifies whether the feature is included in the novelty
+detector.}
+
+\item{\code{removed}}{Specifies whether the feature was removed during
+pre-processing.}
+
+\item{\code{removed_unknown_type}}{Specifies whether the feature was removed during
+pre-processing because the type was neither factor nor numeric.}
+
+\item{\code{removed_missing_values}}{Specifies whether the feature was removed during
+pre-processing because it contained too many missing values.}
+
+\item{\code{removed_no_variance}}{Specifies whether the feature was removed during
+pre-processing because it did not contain more than 1 unique value.}
+
+\item{\code{removed_low_variance}}{Specifies whether the feature was removed during
+pre-processing because the variance was too low. Requires applying
+\code{low_variance} as a \code{filter_method}.}
+
+\item{\code{removed_low_robustness}}{Specifies whether the feature was removed during
+pre-processing because it lacks robustness. Requires applying
+\code{robustness} as a \code{filter_method}, as well as repeated measurement.}
+
+\item{\code{removed_low_importance}}{Specifies whether the feature was removed during
+pre-processing because it lacks relevance.
Requires applying +\code{univariate_test} as a \code{filter_method}.} + +\item{\code{fraction_missing}}{Specifies the fraction of missing values.} + +\item{\code{robustness}}{Specifies robustness of the feature, if measured.} + +\item{\code{univariate_importance}}{Specifies the univariate p-value of the feature, if measured.} + +\item{\code{transformation_parameters}}{Details parameters for power transformation of numeric features.} + +\item{\code{normalisation_parameters}}{Details parameters for (global) normalisation of numeric features.} + +\item{\code{batch_normalisation_parameters}}{Details parameters for batch normalisation of numeric features.} + +\item{\code{imputation_parameters}}{Details parameters or models for imputation of missing values.} + +\item{\code{cluster_parameters}}{Details parameters for forming clusters with other features.} + +\item{\code{required_features}}{Details features required for clustering or imputation.} + +\item{\code{familiar_version}}{Version of the familiar package.} +}} + diff --git a/man/featureInfoParameters-class.Rd b/man/featureInfoParameters-class.Rd new file mode 100644 index 00000000..7d9522b2 --- /dev/null +++ b/man/featureInfoParameters-class.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FamiliarS4Classes.R +\docType{class} +\name{featureInfoParameters-class} +\alias{featureInfoParameters-class} +\title{Feature information parameters object.} +\description{ +A featureInfo object contains information for a single feature. Some +information, for example concerning clustering and transformation contains +various parameters that allow for applying the data transformation correctly. +These are stored in featureInfoParameters objects. +} +\details{ +featureInfoParameters is normally a parent class for specific +classes, such as featureInfoParametersTransformation. +} +\section{Slots}{ + +\describe{ +\item{\code{name}}{Name of the feature, which by default is the column name of the +feature. Typically used to correctly assign the data.} + +\item{\code{complete}}{Flags whether the parameters have been completely set.} + +\item{\code{familiar_version}}{Version of the familiar package.} +}} + diff --git a/man/get_class_names-familiarCollection-method.Rd b/man/get_class_names-familiarCollection-method.Rd new file mode 100644 index 00000000..f7352159 --- /dev/null +++ b/man/get_class_names-familiarCollection-method.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FamiliarCollection.R +\name{get_class_names,familiarCollection-method} +\alias{get_class_names,familiarCollection-method} +\alias{get_class_names} +\title{Get outcome class labels} +\usage{ +\S4method{get_class_names}{familiarCollection}(x) +} +\arguments{ +\item{x}{A familiarCollection object.} +} +\value{ +An ordered array of class labels. +} +\description{ +Outcome classes in familiarCollection objects can have custom names for export and plotting. This function retrieves the currently assigned names. +} +\details{ +Labels convert internal class names to the requested label at export or when plotting. Labels can be changed using the \code{set_class_names} method. +} +\seealso{ +\itemize{ +\item \linkS4class{familiarCollection} for information concerning the familiarCollection class. +\item \code{\link{set_class_names}} for updating the name and ordering of classes. 
+} +} diff --git a/man/get_data_set_names-familiarCollection-method.Rd b/man/get_data_set_names-familiarCollection-method.Rd new file mode 100644 index 00000000..d1be26b4 --- /dev/null +++ b/man/get_data_set_names-familiarCollection-method.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FamiliarCollection.R +\name{get_data_set_names,familiarCollection-method} +\alias{get_data_set_names,familiarCollection-method} +\alias{get_data_set_names} +\title{Get current name of datasets} +\usage{ +\S4method{get_data_set_names}{familiarCollection}(x) +} +\arguments{ +\item{x}{A familiarCollection object.} +} +\value{ +An ordered array of dataset name labels. +} +\description{ +Datasets in familiarCollection objects can have custom names for export and plotting. This function retrieves the currently assigned names. +} +\details{ +Labels convert internal naming of data sets to the requested label at export or when plotting. Labels can be changed using the \code{set_data_set_names} method. +} +\seealso{ +\itemize{ +\item \linkS4class{familiarCollection} for information concerning the familiarCollection class. +\item \code{\link{set_data_set_names}} for updating the name of datasets and their ordering. +} +} diff --git a/man/get_feature_names-familiarCollection-method.Rd b/man/get_feature_names-familiarCollection-method.Rd new file mode 100644 index 00000000..f935fc12 --- /dev/null +++ b/man/get_feature_names-familiarCollection-method.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FamiliarCollection.R +\name{get_feature_names,familiarCollection-method} +\alias{get_feature_names,familiarCollection-method} +\alias{get_feature_names} +\title{Get current feature labels} +\usage{ +\S4method{get_feature_names}{familiarCollection}(x) +} +\arguments{ +\item{x}{A familiarCollection object.} +} +\value{ +An ordered array of feature labels. +} +\description{ +Features in familiarCollection objects can have custom names for export and plotting. This function retrieves the currently assigned names. +} +\details{ +Labels convert internal naming of features to the requested label at export or when plotting. Labels can be changed using the \code{set_feature_names} method. +} +\seealso{ +\itemize{ +\item \linkS4class{familiarCollection} for information concerning the familiarCollection class. +\item \code{\link{set_feature_names}} for updating the name and ordering of features. +} +} diff --git a/man/get_fs_method_names-familiarCollection-method.Rd b/man/get_fs_method_names-familiarCollection-method.Rd new file mode 100644 index 00000000..82e783dc --- /dev/null +++ b/man/get_fs_method_names-familiarCollection-method.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FamiliarCollection.R +\name{get_fs_method_names,familiarCollection-method} +\alias{get_fs_method_names,familiarCollection-method} +\alias{get_fs_method_names} +\title{Get current feature selection method name labels} +\usage{ +\S4method{get_fs_method_names}{familiarCollection}(x) +} +\arguments{ +\item{x}{A familiarCollection object.} +} +\value{ +An ordered array of feature selection method name labels. +} +\description{ +Feature selection methods in familiarCollection objects can have custom names for export and plotting. This function retrieves the currently assigned names. +} +\details{ +Labels convert internal naming of feature selection methods to the requested label at export or when plotting. 
Labels can be changed using the \code{set_fs_method_names} method. +} +\seealso{ +\itemize{ +\item \linkS4class{familiarCollection} for information concerning the familiarCollection class. +\item \code{\link{set_fs_method_names}} for updating the name of feature selection methods and their ordering. +} +} diff --git a/man/get_learner_names-familiarCollection-method.Rd b/man/get_learner_names-familiarCollection-method.Rd new file mode 100644 index 00000000..9c906d2f --- /dev/null +++ b/man/get_learner_names-familiarCollection-method.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FamiliarCollection.R +\name{get_learner_names,familiarCollection-method} +\alias{get_learner_names,familiarCollection-method} +\alias{get_learner_names} +\title{Get current learner name labels} +\usage{ +\S4method{get_learner_names}{familiarCollection}(x) +} +\arguments{ +\item{x}{A familiarCollection object.} +} +\value{ +An ordered array of learner name labels. +} +\description{ +Learners in familiarCollection objects can have custom names for export and plotting. This function retrieves the currently assigned names. +} +\details{ +Labels convert internal naming of learners to the requested label at export or when plotting. Labels can be changed using the \code{set_learner_names} method. +} +\seealso{ +\itemize{ +\item \linkS4class{familiarCollection} for information concerning the familiarCollection class. +\item \code{\link{set_learner_names}} for updating the name of learners and their ordering. +} +} diff --git a/man/get_risk_group_names-familiarCollection-method.Rd b/man/get_risk_group_names-familiarCollection-method.Rd new file mode 100644 index 00000000..7349e0e0 --- /dev/null +++ b/man/get_risk_group_names-familiarCollection-method.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FamiliarCollection.R +\name{get_risk_group_names,familiarCollection-method} +\alias{get_risk_group_names,familiarCollection-method} +\alias{get_risk_group_names} +\title{Get current risk group labels} +\usage{ +\S4method{get_risk_group_names}{familiarCollection}(x) +} +\arguments{ +\item{x}{A familiarCollection object.} +} +\value{ +An ordered array of risk group labels. +} +\description{ +Risk groups in familiarCollection objects can have custom names for export and plotting. This function retrieves the currently assigned names. +} +\details{ +Labels convert internal naming of risk groups to the requested label at export or when plotting. Labels can be changed using the \code{set_risk_group_names} method. +} +\seealso{ +\itemize{ +\item \linkS4class{familiarCollection} for information concerning the familiarCollection class. +\item \code{\link{set_risk_group_names}} for updating the name and ordering of risk groups. +} +} diff --git a/man/get_vimp_table-methods.Rd b/man/get_vimp_table-methods.Rd new file mode 100644 index 00000000..2e4ede4c --- /dev/null +++ b/man/get_vimp_table-methods.Rd @@ -0,0 +1,71 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/VimpTable.R +\name{get_vimp_table} +\alias{get_vimp_table} +\alias{get_vimp_table,list-method} +\alias{get_vimp_table,character-method} +\alias{get_vimp_table,vimpTable-method} +\alias{get_vimp_table,NULL-method} +\alias{get_vimp_table,experimentData-method} +\alias{get_vimp_table,familiarModel-method} +\title{Extract variable importance table.} +\usage{ +get_vimp_table(x, state = "ranked", ...) 
+ +\S4method{get_vimp_table}{list}(x, state = "ranked", ...) + +\S4method{get_vimp_table}{character}(x, state = "ranked", ...) + +\S4method{get_vimp_table}{vimpTable}(x, state = "ranked", ...) + +\S4method{get_vimp_table}{NULL}(x, state = "ranked", ...) + +\S4method{get_vimp_table}{experimentData}(x, state = "ranked", ...) + +\S4method{get_vimp_table}{familiarModel}(x, state = "ranked", data = NULL, as_object = FALSE, ...) +} +\arguments{ +\item{x}{Variable importance (\code{vimpTable}) object, a list thereof, or one or +more paths to these objects. This method extracts the variable importance +table from such objects.} + +\item{state}{State of the returned variable importance table. This affects +what contents are shown, and in which format. The variable importance table +can be returned with the following states: +\itemize{ +\item \code{initial}: initial state, directly after the variable importance table is +filled. The returned variable importance table shows the raw, un-processed +data. +\item \code{decoded}: depending on the variable importance method, the initial +variable importance table may contain the scores of individual contrasts for +categorical variables. When decoded, scores from all contrasts are +aggregated to a single score for each feature. +\item \code{declustered}: variable importance is determined from fully processed +features, which includes clustering. This means that a single feature in the +variable importance table may represent multiple original features. When a +variable importance table has been declustered, all clusters have been +turned into their constituent features. +\item \code{ranked} (default): The scores have been used to create ranks, with lower +ranks indicating better features. +} + +Internally, the variable importance table will go through each state, i.e. +an variable importance table in the initial state will be decoded, +declustered and then ranked prior to returning the variable importance +table.} + +\item{...}{Unused arguments.} + +\item{data}{Internally used argument for use with \code{familiarModel} objects.} + +\item{as_object}{Internally used argument for use with \code{familiarModel} +objects.} +} +\value{ +A \code{data.table} with variable importance scores and, with +\code{state="ranked"}, the respective ranks. +} +\description{ +This method retrieves and parses variable importance tables from +their respective \code{vimpTable} objects. +} diff --git a/man/get_xml_config.Rd b/man/get_xml_config.Rd new file mode 100644 index 00000000..9e12de2d --- /dev/null +++ b/man/get_xml_config.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Familiar.R +\name{get_xml_config} +\alias{get_xml_config} +\title{Create an empty xml configuration file} +\usage{ +get_xml_config(dir_path) +} +\arguments{ +\item{dir_path}{Path to the directory where the configuration file should be +created. The directory should exist, and no file named \code{config.xml} should +be present.} +} +\value{ +Nothing. A file named \code{config.xml} is created in the directory +indicated by \code{dir_path}. +} +\description{ +This function creates an empty configuration xml file in the directory +specified by \code{dir_path}. This provides an alternative to the use of input +arguments for familiar. 
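
For the \code{get_vimp_table} method documented above, a minimal sketch of how the different states can be requested; the paths to previously created \code{vimpTable} objects are hypothetical.

# Paths to vimpTable objects created during an earlier familiar run (hypothetical).
vimp_files <- list.files("experiment/variable_importance", full.names=TRUE)

# Default: tables are decoded, declustered and ranked.
ranked_vimp <- get_vimp_table(vimp_files)

# Raw scores, as produced directly by the variable importance method.
raw_vimp <- get_vimp_table(vimp_files, state="initial")
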
+} +\examples{ +\dontrun{ +# Creates a config.xml file in the working directory +get_xml_config(dir_path=getwd()) +} +} +\keyword{IO} diff --git a/man/is.encapsulated_path.Rd b/man/is.encapsulated_path.Rd new file mode 100644 index 00000000..3b42333c --- /dev/null +++ b/man/is.encapsulated_path.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Utilities.R +\name{is.encapsulated_path} +\alias{is.encapsulated_path} +\title{Internal test for encapsulated_path} +\usage{ +is.encapsulated_path(x) +} +\arguments{ +\item{x}{Object to be tested.} +} +\value{ +\code{TRUE} for objects that are \code{encapsulated_path}, \code{FALSE} otherwise. +} +\description{ +This function tests if the object is an \code{encapsulated_path} object. +} +\keyword{internal} diff --git a/man/is.waive.Rd b/man/is.waive.Rd new file mode 100644 index 00000000..cb316376 --- /dev/null +++ b/man/is.waive.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Utilities.R +\name{is.waive} +\alias{is.waive} +\title{Internal test to see if an object is a waiver} +\usage{ +is.waive(x) +} +\arguments{ +\item{x}{Object to be tested.} +} +\value{ +\code{TRUE} for objects that are waivers, \code{FALSE} otherwise. +} +\description{ +This function tests if the object was created by the \code{waiver} function. This +function is functionally identical to \code{ggplot2:::is.waive()}. +} +\keyword{internal} diff --git a/man/outcomeInfo-class.Rd b/man/outcomeInfo-class.Rd new file mode 100644 index 00000000..15941980 --- /dev/null +++ b/man/outcomeInfo-class.Rd @@ -0,0 +1,50 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FamiliarS4Classes.R +\docType{class} +\name{outcomeInfo-class} +\alias{outcomeInfo-class} +\title{Outcome information object.} +\description{ +An outcome information object stores data concerning an outcome. This is used +to prospectively check data. +} +\section{Slots}{ + +\describe{ +\item{\code{name}}{Name of the outcome, inherited from the original column name by +default.} + +\item{\code{outcome_type}}{Type of outcome.} + +\item{\code{outcome_column}}{Name of the outcome column in data.} + +\item{\code{levels}}{Specifies class levels of categorical outcomes.} + +\item{\code{ordered}}{Specifies whether categorical outcomes are ordered.} + +\item{\code{reference}}{Class level used as reference.} + +\item{\code{time}}{Maximum time, as set by the \code{time_max} configuration parameter.} + +\item{\code{censored}}{Censoring indicators for survival outcomes.} + +\item{\code{event}}{Event indicators for survival outcomes.} + +\item{\code{competing_risk}}{Indicators for competing risks in survival outcomes.} + +\item{\code{distribution}}{Five-number summary (numeric outcomes), class frequency +(categorical outcomes), or survival distributions.} + +\item{\code{data_id}}{Internal identifier for the dataset used to derive the outcome +information.} + +\item{\code{run_id}}{Internal identifier for the specific subset of the dataset used +to derive the outcome information.} + +\item{\code{transformation_parameters}}{Parameters used for transforming a numeric +outcomes. Currently unused.} + +\item{\code{normalisation_parameters}}{Parameters used for normalising numeric +outcomes. 
Currently unused.} +}} + diff --git a/man/plot_auc_precision_recall_curve-methods.Rd b/man/plot_auc_precision_recall_curve-methods.Rd new file mode 100644 index 00000000..25b40314 --- /dev/null +++ b/man/plot_auc_precision_recall_curve-methods.Rd @@ -0,0 +1,247 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/PlotAUCcurves.R +\name{plot_auc_precision_recall_curve} +\alias{plot_auc_precision_recall_curve} +\alias{plot_auc_precision_recall_curve,ANY-method} +\alias{plot_auc_precision_recall_curve,familiarCollection-method} +\title{Plot the precision-recall curve.} +\usage{ +plot_auc_precision_recall_curve( + object, + draw = FALSE, + dir_path = NULL, + split_by = NULL, + color_by = NULL, + facet_by = NULL, + facet_wrap_cols = NULL, + ggtheme = NULL, + discrete_palette = NULL, + x_label = waiver(), + y_label = waiver(), + legend_label = waiver(), + plot_title = waiver(), + plot_sub_title = waiver(), + caption = NULL, + x_n_breaks = 5, + x_breaks = NULL, + y_n_breaks = 5, + y_breaks = NULL, + conf_int_style = c("ribbon", "step", "none"), + conf_int_alpha = 0.4, + width = waiver(), + height = waiver(), + units = waiver(), + export_collection = FALSE, + ... +) + +\S4method{plot_auc_precision_recall_curve}{ANY}( + object, + draw = FALSE, + dir_path = NULL, + split_by = NULL, + color_by = NULL, + facet_by = NULL, + facet_wrap_cols = NULL, + ggtheme = NULL, + discrete_palette = NULL, + x_label = waiver(), + y_label = waiver(), + legend_label = waiver(), + plot_title = waiver(), + plot_sub_title = waiver(), + caption = NULL, + x_n_breaks = 5, + x_breaks = NULL, + y_n_breaks = 5, + y_breaks = NULL, + conf_int_style = c("ribbon", "step", "none"), + conf_int_alpha = 0.4, + width = waiver(), + height = waiver(), + units = waiver(), + export_collection = FALSE, + ... +) + +\S4method{plot_auc_precision_recall_curve}{familiarCollection}( + object, + draw = FALSE, + dir_path = NULL, + split_by = NULL, + color_by = NULL, + facet_by = NULL, + facet_wrap_cols = NULL, + ggtheme = NULL, + discrete_palette = NULL, + x_label = waiver(), + y_label = waiver(), + legend_label = waiver(), + plot_title = waiver(), + plot_sub_title = waiver(), + caption = NULL, + x_n_breaks = 5, + x_breaks = NULL, + y_n_breaks = 5, + y_breaks = NULL, + conf_int_style = c("ribbon", "step", "none"), + conf_int_alpha = 0.4, + width = waiver(), + height = waiver(), + units = waiver(), + export_collection = FALSE, + ... +) +} +\arguments{ +\item{object}{\code{familiarCollection} object, or one or more \code{familiarData} +objects, that will be internally converted to a \code{familiarCollection} object. +It is also possible to provide a \code{familiarEnsemble} or one or more +\code{familiarModel} objects together with the data from which data is computed +prior to export. Paths to such files can also be provided.} + +\item{draw}{(\emph{optional}) Draws the plot if TRUE.} + +\item{dir_path}{(\emph{optional}) Path to the directory where the plots of receiver +operating characteristic curves are saved to. Output is saved in the +\code{performance} subdirectory. If \code{NULL} no figures are saved, but are returned +instead.} + +\item{split_by}{(\emph{optional}) Splitting variables. This refers to column names +on which datasets are split. A separate figure is created for each split. +See details for available variables.} + +\item{color_by}{(\emph{optional}) Variables used to determine fill colour of plot +objects. 
The variables cannot overlap with those provided to the \code{split_by} +argument, but may overlap with other arguments. See details for available +variables.} + +\item{facet_by}{(\emph{optional}) Variables used to determine how and if facets of +each figure appear. In case the \code{facet_wrap_cols} argument is \code{NULL}, the +first variable is used to define columns, and the remaing variables are +used to define rows of facets. The variables cannot overlap with those +provided to the \code{split_by} argument, but may overlap with other arguments. +See details for available variables.} + +\item{facet_wrap_cols}{(\emph{optional}) Number of columns to generate when facet +wrapping. If NULL, a facet grid is produced instead.} + +\item{ggtheme}{(\emph{optional}) \code{ggplot} theme to use for plotting.} + +\item{discrete_palette}{(\emph{optional}) Palette to use to color the different +plot elements in case a value was provided to the \code{color_by} argument.} + +\item{x_label}{(\emph{optional}) Label to provide to the x-axis. If NULL, no label +is shown.} + +\item{y_label}{(\emph{optional}) Label to provide to the y-axis. If NULL, no label +is shown.} + +\item{legend_label}{(\emph{optional}) Label to provide to the legend. If NULL, the +legend will not have a name.} + +\item{plot_title}{(\emph{optional}) Label to provide as figure title. If NULL, no +title is shown.} + +\item{plot_sub_title}{(\emph{optional}) Label to provide as figure subtitle. If +NULL, no subtitle is shown.} + +\item{caption}{(\emph{optional}) Label to provide as figure caption. If NULL, no +caption is shown.} + +\item{x_n_breaks}{(\emph{optional}) Number of breaks to show on the x-axis of the +plot. \code{x_n_breaks} is used to determine the \code{x_breaks} argument in case it +is unset.} + +\item{x_breaks}{(\emph{optional}) Break points on the x-axis of the plot.} + +\item{y_n_breaks}{(\emph{optional}) Number of breaks to show on the y-axis of the +plot. \code{y_n_breaks} is used to determine the \code{y_breaks} argument in case it +is unset.} + +\item{y_breaks}{(\emph{optional}) Break points on the y-axis of the plot.} + +\item{conf_int_style}{(\emph{optional}) Confidence interval style. See details for +allowed styles.} + +\item{conf_int_alpha}{(\emph{optional}) Alpha value to determine transparency of +confidence intervals or, alternatively, other plot elements with which the +confidence interval overlaps. Only values between 0.0 (fully transparent) +and 1.0 (fully opaque) are allowed.} + +\item{width}{(\emph{optional}) Width of the plot. A default value is derived from +the number of facets.} + +\item{height}{(\emph{optional}) Height of the plot. A default value is derived from +the number of features and the number of facets.} + +\item{units}{(\emph{optional}) Plot size unit. Either \code{cm} (default), \code{mm} or \verb{in}.} + +\item{export_collection}{(\emph{optional}) Exports the collection if TRUE.} + +\item{...}{ + Arguments passed on to \code{\link[=as_familiar_collection]{as_familiar_collection}}, \code{\link[ggplot2:ggsave]{ggplot2::ggsave}} + \describe{ + \item{\code{familiar_data_names}}{Names of the dataset(s). Only used if the \code{object} parameter +is one or more \code{familiarData} objects.} + \item{\code{collection_name}}{Name of the collection.} + \item{\code{filename}}{File name to create on disk.} + \item{\code{plot}}{Plot to save, defaults to last plot displayed.} + \item{\code{device}}{Device to use. Can either be a device function +(e.g. 
\link{png}), or one of "eps", "ps", "tex" (pictex),
+"pdf", "jpeg", "tiff", "png", "bmp", "svg" or "wmf" (windows only). If
+\code{NULL} (default), the device is guessed based on the \code{filename} extension.}
+  \item{\code{path}}{Path of the directory to save plot to: \code{path} and \code{filename}
+are combined to create the fully qualified file name. Defaults to the
+working directory.}
+  \item{\code{scale}}{Multiplicative scaling factor.}
+  \item{\code{dpi}}{Plot resolution. Also accepts a string input: "retina" (320),
+"print" (300), or "screen" (72). Applies only to raster output types.}
+  \item{\code{limitsize}}{When \code{TRUE} (the default), \code{ggsave()} will not
+save images larger than 50x50 inches, to prevent the common error of
+specifying dimensions in pixels.}
+  \item{\code{bg}}{Background colour. If \code{NULL}, uses the \code{plot.background} fill value
+from the plot theme.}
+  \item{\code{create.dir}}{Whether to create new directories if a non-existing
+directory is specified in the \code{filename} or \code{path} (\code{TRUE}) or return an
+error (\code{FALSE}, default). If \code{FALSE} and run in an interactive session,
+a prompt will appear asking to create a new directory when necessary.}
+  }}
+}
+\value{
+\code{NULL} or list of plot objects, if \code{dir_path} is \code{NULL}.
+}
+\description{
+This method creates precision-recall curves based on data in a
+familiarCollection object.
+}
+\details{
+This function generates area under the precision-recall curve plots.
+
+Available splitting variables are: \code{fs_method}, \code{learner}, \code{data_set} and
+\code{positive_class}. By default, the data is split by \code{fs_method} and \code{learner},
+with faceting by \code{data_set} and colouring by \code{positive_class}.
+
+Available palettes for \code{discrete_palette} are those listed by
+\code{grDevices::palette.pals()} (requires R >= 4.0.0), \code{grDevices::hcl.pals()}
+(requires R >= 3.6.0) and \code{rainbow}, \code{heat.colors}, \code{terrain.colors},
+\code{topo.colors} and \code{cm.colors}, which correspond to the palettes of the same
+name in \code{grDevices}. If not specified, a default palette based on palettes
+in Tableau is used. You may also specify your own palette by using colour
+names listed by \code{grDevices::colors()} or through hexadecimal RGB strings.
+
+Bootstrap confidence intervals of the precision-recall curve (if present) can
+be shown using various styles set by \code{conf_int_style}:
+\itemize{
+\item \code{ribbon} (default): confidence intervals are shown as a ribbon with an
+opacity of \code{conf_int_alpha} around the point estimate of the curve.
+\item \code{step}: confidence intervals are shown as a step function around
+the point estimate of the curve.
+\item \code{none}: confidence intervals are not shown. The point estimate of the
+curve is shown as usual.
+}
+
+Labelling methods such as \code{set_fs_method_names} or \code{set_data_set_names} can
+be applied to the \code{familiarCollection} object to update labels, and order
+the output in the figure.
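
A minimal, hypothetical usage sketch for this plot method; \code{collection} is assumed to be a familiarCollection object created during an earlier evaluation.

# Return ggplot objects instead of writing figures to disk.
plots <- plot_auc_precision_recall_curve(
  object=collection,
  dir_path=NULL,
  color_by="positive_class",
  facet_by="data_set",
  conf_int_style="ribbon")
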
+} diff --git a/man/plot_auc_roc_curve-methods.Rd b/man/plot_auc_roc_curve-methods.Rd new file mode 100644 index 00000000..f9e1539e --- /dev/null +++ b/man/plot_auc_roc_curve-methods.Rd @@ -0,0 +1,363 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/PlotAUCcurves.R +\name{plot_auc_roc_curve} +\alias{plot_auc_roc_curve} +\alias{plot_auc_roc_curve,ANY-method} +\alias{plot_auc_roc_curve,familiarCollection-method} +\title{Plot the receiver operating characteristic curve.} +\usage{ +plot_auc_roc_curve( + object, + draw = FALSE, + dir_path = NULL, + split_by = NULL, + color_by = NULL, + facet_by = NULL, + facet_wrap_cols = NULL, + ggtheme = NULL, + discrete_palette = NULL, + x_label = waiver(), + y_label = waiver(), + legend_label = waiver(), + plot_title = waiver(), + plot_sub_title = waiver(), + caption = NULL, + x_n_breaks = 5, + x_breaks = NULL, + y_n_breaks = 5, + y_breaks = NULL, + conf_int_style = c("ribbon", "step", "none"), + conf_int_alpha = 0.4, + width = waiver(), + height = waiver(), + units = waiver(), + export_collection = FALSE, + ... +) + +\S4method{plot_auc_roc_curve}{ANY}( + object, + draw = FALSE, + dir_path = NULL, + split_by = NULL, + color_by = NULL, + facet_by = NULL, + facet_wrap_cols = NULL, + ggtheme = NULL, + discrete_palette = NULL, + x_label = waiver(), + y_label = waiver(), + legend_label = waiver(), + plot_title = waiver(), + plot_sub_title = waiver(), + caption = NULL, + x_n_breaks = 5, + x_breaks = NULL, + y_n_breaks = 5, + y_breaks = NULL, + conf_int_style = c("ribbon", "step", "none"), + conf_int_alpha = 0.4, + width = waiver(), + height = waiver(), + units = waiver(), + export_collection = FALSE, + ... +) + +\S4method{plot_auc_roc_curve}{familiarCollection}( + object, + draw = FALSE, + dir_path = NULL, + split_by = NULL, + color_by = NULL, + facet_by = NULL, + facet_wrap_cols = NULL, + ggtheme = NULL, + discrete_palette = NULL, + x_label = waiver(), + y_label = waiver(), + legend_label = waiver(), + plot_title = waiver(), + plot_sub_title = waiver(), + caption = NULL, + x_n_breaks = 5, + x_breaks = NULL, + y_n_breaks = 5, + y_breaks = NULL, + conf_int_style = c("ribbon", "step", "none"), + conf_int_alpha = 0.4, + width = waiver(), + height = waiver(), + units = waiver(), + export_collection = FALSE, + ... +) +} +\arguments{ +\item{object}{\code{familiarCollection} object, or one or more \code{familiarData} +objects, that will be internally converted to a \code{familiarCollection} object. +It is also possible to provide a \code{familiarEnsemble} or one or more +\code{familiarModel} objects together with the data from which data is computed +prior to export. Paths to such files can also be provided.} + +\item{draw}{(\emph{optional}) Draws the plot if TRUE.} + +\item{dir_path}{(\emph{optional}) Path to the directory where the plots of receiver +operating characteristic curves are saved to. Output is saved in the +\code{performance} subdirectory. If \code{NULL} no figures are saved, but are returned +instead.} + +\item{split_by}{(\emph{optional}) Splitting variables. This refers to column names +on which datasets are split. A separate figure is created for each split. +See details for available variables.} + +\item{color_by}{(\emph{optional}) Variables used to determine fill colour of plot +objects. The variables cannot overlap with those provided to the \code{split_by} +argument, but may overlap with other arguments. 
See details for available +variables.} + +\item{facet_by}{(\emph{optional}) Variables used to determine how and if facets of +each figure appear. In case the \code{facet_wrap_cols} argument is \code{NULL}, the +first variable is used to define columns, and the remaing variables are +used to define rows of facets. The variables cannot overlap with those +provided to the \code{split_by} argument, but may overlap with other arguments. +See details for available variables.} + +\item{facet_wrap_cols}{(\emph{optional}) Number of columns to generate when facet +wrapping. If NULL, a facet grid is produced instead.} + +\item{ggtheme}{(\emph{optional}) \code{ggplot} theme to use for plotting.} + +\item{discrete_palette}{(\emph{optional}) Palette to use to color the different +plot elements in case a value was provided to the \code{color_by} argument.} + +\item{x_label}{(\emph{optional}) Label to provide to the x-axis. If NULL, no label +is shown.} + +\item{y_label}{(\emph{optional}) Label to provide to the y-axis. If NULL, no label +is shown.} + +\item{legend_label}{(\emph{optional}) Label to provide to the legend. If NULL, the +legend will not have a name.} + +\item{plot_title}{(\emph{optional}) Label to provide as figure title. If NULL, no +title is shown.} + +\item{plot_sub_title}{(\emph{optional}) Label to provide as figure subtitle. If +NULL, no subtitle is shown.} + +\item{caption}{(\emph{optional}) Label to provide as figure caption. If NULL, no +caption is shown.} + +\item{x_n_breaks}{(\emph{optional}) Number of breaks to show on the x-axis of the +plot. \code{x_n_breaks} is used to determine the \code{x_breaks} argument in case it +is unset.} + +\item{x_breaks}{(\emph{optional}) Break points on the x-axis of the plot.} + +\item{y_n_breaks}{(\emph{optional}) Number of breaks to show on the y-axis of the +plot. \code{y_n_breaks} is used to determine the \code{y_breaks} argument in case it +is unset.} + +\item{y_breaks}{(\emph{optional}) Break points on the y-axis of the plot.} + +\item{conf_int_style}{(\emph{optional}) Confidence interval style. See details for +allowed styles.} + +\item{conf_int_alpha}{(\emph{optional}) Alpha value to determine transparency of +confidence intervals or, alternatively, other plot elements with which the +confidence interval overlaps. Only values between 0.0 (fully transparent) +and 1.0 (fully opaque) are allowed.} + +\item{width}{(\emph{optional}) Width of the plot. A default value is derived from +the number of facets.} + +\item{height}{(\emph{optional}) Height of the plot. A default value is derived from +the number of features and the number of facets.} + +\item{units}{(\emph{optional}) Plot size unit. Either \code{cm} (default), \code{mm} or \verb{in}.} + +\item{export_collection}{(\emph{optional}) Exports the collection if TRUE.} + +\item{...}{ + Arguments passed on to \code{\link[=as_familiar_collection]{as_familiar_collection}}, \code{\link[ggplot2:ggsave]{ggplot2::ggsave}}, \code{\link[=extract_auc_data]{extract_auc_data}} + \describe{ + \item{\code{familiar_data_names}}{Names of the dataset(s). Only used if the \code{object} parameter +is one or more \code{familiarData} objects.} + \item{\code{collection_name}}{Name of the collection.} + \item{\code{filename}}{File name to create on disk.} + \item{\code{plot}}{Plot to save, defaults to last plot displayed.} + \item{\code{device}}{Device to use. Can either be a device function +(e.g. \link{png}), or one of "eps", "ps", "tex" (pictex), +"pdf", "jpeg", "tiff", "png", "bmp", "svg" or "wmf" (windows only). 
If +\code{NULL} (default), the device is guessed based on the \code{filename} extension.} + \item{\code{path}}{Path of the directory to save plot to: \code{path} and \code{filename} +are combined to create the fully qualified file name. Defaults to the +working directory.} + \item{\code{scale}}{Multiplicative scaling factor.} + \item{\code{dpi}}{Plot resolution. Also accepts a string input: "retina" (320), +"print" (300), or "screen" (72). Applies only to raster output types.} + \item{\code{limitsize}}{When \code{TRUE} (the default), \code{ggsave()} will not +save images larger than 50x50 inches, to prevent the common error of +specifying dimensions in pixels.} + \item{\code{bg}}{Background colour. If \code{NULL}, uses the \code{plot.background} fill value +from the plot theme.} + \item{\code{create.dir}}{Whether to create new directories if a non-existing +directory is specified in the \code{filename} or \code{path} (\code{TRUE}) or return an +error (\code{FALSE}, default). If \code{FALSE} and run in an interactive session, +a prompt will appear asking to create a new directory when necessary.} + \item{\code{data}}{A \code{dataObject} object, \code{data.table} or \code{data.frame} that +constitutes the data that are assessed.} + \item{\code{is_pre_processed}}{Flag that indicates whether the data was already +pre-processed externally, e.g. normalised and clustered. Only used if the +\code{data} argument is a \code{data.table} or \code{data.frame}.} + \item{\code{cl}}{Cluster created using the \code{parallel} package. This cluster is then +used to speed up computation through parallellisation.} + \item{\code{ensemble_method}}{Method for ensembling predictions from models for the +same sample. Available methods are: +\itemize{ +\item \code{median} (default): Use the median of the predicted values as the ensemble +value for a sample. +\item \code{mean}: Use the mean of the predicted values as the ensemble value for a +sample. +}} + \item{\code{verbose}}{Flag to indicate whether feedback should be provided on the +computation and extraction of various data elements.} + \item{\code{message_indent}}{Number of indentation steps for messages shown during +computation and extraction of various data elements.} + \item{\code{detail_level}}{(\emph{optional}) Sets the level at which results are computed +and aggregated. +\itemize{ +\item \code{ensemble}: Results are computed at the ensemble level, i.e. over all +models in the ensemble. This means that, for example, bias-corrected +estimates of model performance are assessed by creating (at least) 20 +bootstraps and computing the model performance of the ensemble model for +each bootstrap. +\item \code{hybrid} (default): Results are computed at the level of models in an +ensemble. This means that, for example, bias-corrected estimates of model +performance are directly computed using the models in the ensemble. If there +are at least 20 trained models in the ensemble, performance is computed for +each model, in contrast to \code{ensemble} where performance is computed for the +ensemble of models. If there are less than 20 trained models in the +ensemble, bootstraps are created so that at least 20 point estimates can be +made. +\item \code{model}: Results are computed at the model level. This means that, for +example, bias-corrected estimates of model performance are assessed by +creating (at least) 20 bootstraps and computing the performance of the model +for each bootstrap. 
+}
+
+Note that each level of detail has a different interpretation for bootstrap
+confidence intervals. For \code{ensemble} and \code{model} these are the confidence
+intervals for the ensemble and an individual model, respectively. That is,
+the confidence interval describes the range where an estimate produced by a
+respective ensemble or model trained on a repeat of the experiment may be
+found with the probability of the confidence level. For \code{hybrid}, it
+represents the range where any single model trained on a repeat of the
+experiment may be found with the probability of the confidence level. By
+definition, confidence intervals obtained using \code{hybrid} are at least as
+wide as those for \code{ensemble}. \code{hybrid} offers the correct interpretation if
+the goal of the analysis is to assess the result of a single, unspecified,
+model.
+
+\code{hybrid} is generally computationally less expensive than \code{ensemble}, which
+in turn is somewhat less expensive than \code{model}.
+
+A non-default \code{detail_level} parameter can be specified for separate
+evaluation steps by providing a parameter value in a named list with data
+elements, e.g. \code{list("auc_data"="ensemble", "model_performance"="hybrid")}.
+This parameter can be set for the following data elements: \code{auc_data},
+\code{decision_curve_analyis}, \code{model_performance}, \code{permutation_vimp},
+\code{ice_data}, \code{prediction_data} and \code{confusion_matrix}.}
+  \item{\code{estimation_type}}{(\emph{optional}) Sets the type of estimation that should be
+possible. This has the following options:
+\itemize{
+\item \code{point}: Point estimates.
+\item \code{bias_correction} or \code{bc}: Bias-corrected estimates. A bias-corrected
+estimate is computed from (at least) 20 point estimates, and \code{familiar} may
+bootstrap the data to create them.
+\item \code{bootstrap_confidence_interval} or \code{bci} (default): Bias-corrected
+estimates with bootstrap confidence intervals (Efron and Hastie, 2016). The
+number of point estimates required depends on the \code{confidence_level}
+parameter, and \code{familiar} may bootstrap the data to create them.
+}
+
+As with \code{detail_level}, a non-default \code{estimation_type} parameter can be
+specified for separate evaluation steps by providing a parameter value in a
+named list with data elements, e.g. \code{list("auc_data"="bci", "model_performance"="point")}. This parameter can be set for the following
+data elements: \code{auc_data}, \code{decision_curve_analyis}, \code{model_performance},
+\code{permutation_vimp}, \code{ice_data}, and \code{prediction_data}.}
+  \item{\code{aggregate_results}}{(\emph{optional}) Flag that signifies whether results
+should be aggregated during evaluation. If \code{estimation_type} is
+\code{bias_correction} or \code{bc}, aggregation leads to a single bias-corrected
+estimate. If \code{estimation_type} is \code{bootstrap_confidence_interval} or \code{bci},
+aggregation leads to a single bias-corrected estimate with lower and upper
+boundaries of the confidence interval. This has no effect if
+\code{estimation_type} is \code{point}.
+
+The default value is equal to \code{TRUE} except when assessing model performance
+metrics, as the default violin plot requires underlying data.
+
+As with \code{detail_level} and \code{estimation_type}, a non-default
+\code{aggregate_results} parameter can be specified for separate evaluation steps
+by providing a parameter value in a named list with data elements, e.g.
+\code{list("auc_data"=TRUE, , "model_performance"=FALSE)}. This parameter exists +for the same elements as \code{estimation_type}.} + \item{\code{confidence_level}}{(\emph{optional}) Numeric value for the level at which +confidence intervals are determined. In the case bootstraps are used to +determine the confidence intervals bootstrap estimation, \code{familiar} uses the +rule of thumb \eqn{n = 20 / ci.level} to determine the number of required +bootstraps. + +The default value is \code{0.95}.} + \item{\code{bootstrap_ci_method}}{(\emph{optional}) Method used to determine bootstrap +confidence intervals (Efron and Hastie, 2016). The following methods are +implemented: +\itemize{ +\item \code{percentile} (default): Confidence intervals obtained using the percentile +method. +\item \code{bc}: Bias-corrected confidence intervals. +} + +Note that the standard method is not implemented because this method is +often not suitable due to non-normal distributions. The bias-corrected and +accelerated (BCa) method is not implemented yet.} + }} +} +\value{ +\code{NULL} or list of plot objects, if \code{dir_path} is \code{NULL}. +} +\description{ +This method creates receiver operating characteristic curves +based on data in a familiarCollection object. +} +\details{ +This function generates area under the ROC curve plots. + +Available splitting variables are: \code{fs_method}, \code{learner}, \code{data_set} and +\code{positive_class}. By default, the data is split by \code{fs_method} and \code{learner}, +with faceting by \code{data_set} and colouring by \code{positive_class}. + +Available palettes for \code{discrete_palette} are those listed by +\code{grDevices::palette.pals()} (requires R >= 4.0.0), \code{grDevices::hcl.pals()} +(requires R >= 3.6.0) and \code{rainbow}, \code{heat.colors}, \code{terrain.colors}, +\code{topo.colors} and \code{cm.colors}, which correspond to the palettes of the same +name in \code{grDevices}. If not specified, a default palette based on palettes +in Tableau are used. You may also specify your own palette by using colour +names listed by \code{grDevices::colors()} or through hexadecimal RGB strings. + +Bootstrap confidence intervals of the ROC curve (if present) can be shown +using various styles set by \code{conf_int_style}: +\itemize{ +\item \code{ribbon} (default): confidence intervals are shown as a ribbon with an +opacity of \code{conf_int_alpha} around the point estimate of the ROC curve. +\item \code{step} (default): confidence intervals are shown as a step function around +the point estimate of the ROC curve. +\item \code{none}: confidence intervals are not shown. The point estimate of the ROC +curve is shown as usual. +} + +Labelling methods such as \code{set_fs_method_names} or \code{set_data_set_names} can +be applied to the \code{familiarCollection} object to update labels, and order +the output in the figure. 
+} diff --git a/man/plot_calibration_data-methods.Rd b/man/plot_calibration_data-methods.Rd new file mode 100644 index 00000000..5aef6c28 --- /dev/null +++ b/man/plot_calibration_data-methods.Rd @@ -0,0 +1,471 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/PlotCalibration.R +\name{plot_calibration_data} +\alias{plot_calibration_data} +\alias{plot_calibration_data,ANY-method} +\alias{plot_calibration_data,familiarCollection-method} +\title{Plot calibration figures.} +\usage{ +plot_calibration_data( + object, + draw = FALSE, + dir_path = NULL, + split_by = NULL, + color_by = NULL, + facet_by = NULL, + facet_wrap_cols = NULL, + ggtheme = NULL, + discrete_palette = NULL, + x_label = waiver(), + x_label_shared = "column", + y_label = waiver(), + y_label_shared = "row", + legend_label = waiver(), + plot_title = waiver(), + plot_sub_title = waiver(), + caption = NULL, + x_range = NULL, + x_n_breaks = 5, + x_breaks = NULL, + y_range = NULL, + y_n_breaks = 5, + y_breaks = NULL, + conf_int_style = c("ribbon", "step", "none"), + conf_int_alpha = 0.4, + show_density = TRUE, + show_calibration_fit = TRUE, + show_goodness_of_fit = TRUE, + density_plot_height = grid::unit(1, "cm"), + width = waiver(), + height = waiver(), + units = waiver(), + export_collection = FALSE, + ... +) + +\S4method{plot_calibration_data}{ANY}( + object, + draw = FALSE, + dir_path = NULL, + split_by = NULL, + color_by = NULL, + facet_by = NULL, + facet_wrap_cols = NULL, + ggtheme = NULL, + discrete_palette = NULL, + x_label = waiver(), + x_label_shared = "column", + y_label = waiver(), + y_label_shared = "row", + legend_label = waiver(), + plot_title = waiver(), + plot_sub_title = waiver(), + caption = NULL, + x_range = NULL, + x_n_breaks = 5, + x_breaks = NULL, + y_range = NULL, + y_n_breaks = 5, + y_breaks = NULL, + conf_int_style = c("ribbon", "step", "none"), + conf_int_alpha = 0.4, + show_density = TRUE, + show_calibration_fit = TRUE, + show_goodness_of_fit = TRUE, + density_plot_height = grid::unit(1, "cm"), + width = waiver(), + height = waiver(), + units = waiver(), + export_collection = FALSE, + ... +) + +\S4method{plot_calibration_data}{familiarCollection}( + object, + draw = FALSE, + dir_path = NULL, + split_by = NULL, + color_by = NULL, + facet_by = NULL, + facet_wrap_cols = NULL, + ggtheme = NULL, + discrete_palette = NULL, + x_label = waiver(), + x_label_shared = "column", + y_label = waiver(), + y_label_shared = "row", + legend_label = waiver(), + plot_title = waiver(), + plot_sub_title = waiver(), + caption = NULL, + x_range = NULL, + x_n_breaks = 5, + x_breaks = NULL, + y_range = NULL, + y_n_breaks = 5, + y_breaks = NULL, + conf_int_style = c("ribbon", "step", "none"), + conf_int_alpha = 0.4, + show_density = TRUE, + show_calibration_fit = TRUE, + show_goodness_of_fit = TRUE, + density_plot_height = grid::unit(1, "cm"), + width = waiver(), + height = waiver(), + units = waiver(), + export_collection = FALSE, + ... +) +} +\arguments{ +\item{object}{\code{familiarCollection} object, or one or more \code{familiarData} +objects, that will be internally converted to a \code{familiarCollection} object. +It is also possible to provide a \code{familiarEnsemble} or one or more +\code{familiarModel} objects together with the data from which data is computed +prior to export. Paths to such files can also be provided.} + +\item{draw}{(\emph{optional}) Draws the plot if TRUE.} + +\item{dir_path}{(\emph{optional}) Path to the directory where created calibration +plots are saved to. 
Output is saved in the \code{calibration} subdirectory. If +\code{NULL} no figures are saved, but are returned instead.} + +\item{split_by}{(\emph{optional}) Splitting variables. This refers to column names +on which datasets are split. A separate figure is created for each split. +See details for available variables.} + +\item{color_by}{(\emph{optional}) Variables used to determine fill colour of plot +objects. The variables cannot overlap with those provided to the \code{split_by} +argument, but may overlap with other arguments. See details for available +variables.} + +\item{facet_by}{(\emph{optional}) Variables used to determine how and if facets of +each figure appear. In case the \code{facet_wrap_cols} argument is \code{NULL}, the +first variable is used to define columns, and the remaing variables are +used to define rows of facets. The variables cannot overlap with those +provided to the \code{split_by} argument, but may overlap with other arguments. +See details for available variables.} + +\item{facet_wrap_cols}{(\emph{optional}) Number of columns to generate when facet +wrapping. If NULL, a facet grid is produced instead.} + +\item{ggtheme}{(\emph{optional}) \code{ggplot} theme to use for plotting.} + +\item{discrete_palette}{(\emph{optional}) Palette to use to color the different +data points and fit lines in case a non-singular variable was provided to +the \code{color_by} argument.} + +\item{x_label}{(\emph{optional}) Label to provide to the x-axis. If NULL, no label +is shown.} + +\item{x_label_shared}{(\emph{optional}) Sharing of x-axis labels between facets. +One of three values: +\itemize{ +\item \code{overall}: A single label is placed at the bottom of the figure. Tick +text (but not the ticks themselves) is removed for all but the bottom facet +plot(s). +\item \code{column}: A label is placed at the bottom of each column. Tick text (but +not the ticks themselves) is removed for all but the bottom facet plot(s). +\item \code{individual}: A label is placed below each facet plot. Tick text is kept. +}} + +\item{y_label}{(\emph{optional}) Label to provide to the y-axis. If NULL, no label +is shown.} + +\item{y_label_shared}{(\emph{optional}) Sharing of y-axis labels between facets. +One of three values: +\itemize{ +\item \code{overall}: A single label is placed to the left of the figure. Tick text +(but not the ticks themselves) is removed for all but the left-most facet +plot(s). +\item \code{row}: A label is placed to the left of each row. Tick text (but not the +ticks themselves) is removed for all but the left-most facet plot(s). +\item \code{individual}: A label is placed below each facet plot. Tick text is kept. +}} + +\item{legend_label}{(\emph{optional}) Label to provide to the legend. If NULL, the +legend will not have a name.} + +\item{plot_title}{(\emph{optional}) Label to provide as figure title. If NULL, no +title is shown.} + +\item{plot_sub_title}{(\emph{optional}) Label to provide as figure subtitle. If +NULL, no subtitle is shown.} + +\item{caption}{(\emph{optional}) Label to provide as figure caption. If NULL, no +caption is shown.} + +\item{x_range}{(\emph{optional}) Value range for the x-axis.} + +\item{x_n_breaks}{(\emph{optional}) Number of breaks to show on the x-axis of the +plot. 
\code{x_n_breaks} is used to determine the \code{x_breaks} argument in case it
+is unset.}
+
+\item{x_breaks}{(\emph{optional}) Break points on the x-axis of the plot.}
+
+\item{y_range}{(\emph{optional}) Value range for the y-axis.}
+
+\item{y_n_breaks}{(\emph{optional}) Number of breaks to show on the y-axis of the
+plot. \code{y_n_breaks} is used to determine the \code{y_breaks} argument in case it
+is unset.}
+
+\item{y_breaks}{(\emph{optional}) Break points on the y-axis of the plot.}
+
+\item{conf_int_style}{(\emph{optional}) Confidence interval style. See details for
+allowed styles.}
+
+\item{conf_int_alpha}{(\emph{optional}) Alpha value to determine transparency of
+confidence intervals or, alternatively, other plot elements with which the
+confidence interval overlaps. Only values between 0.0 (fully transparent)
+and 1.0 (fully opaque) are allowed.}
+
+\item{show_density}{(\emph{optional}) Show point density in top margin of the
+figure. If \code{color_by} is set, this information will not be shown.}
+
+\item{show_calibration_fit}{(\emph{optional}) Specifies whether the calibration in
+the large and calibration slope are annotated in the plot. If \code{color_by} is
+set, this information will not be shown.}
+
+\item{show_goodness_of_fit}{(\emph{optional}) Specifies whether the results of
+goodness-of-fit tests are annotated in the plot. If \code{color_by} is set, this
+information will not be shown.}
+
+\item{density_plot_height}{(\emph{optional}) Height of the density plot. The height
+is 1 cm by default. Height is expected to be a grid unit (see \code{grid::unit}),
+which also allows for specifying relative heights. Will be ignored if
+\code{show_density} is \code{FALSE}.}
+
+\item{width}{(\emph{optional}) Width of the plot. A default value is derived from
+the number of facets.}
+
+\item{height}{(\emph{optional}) Height of the plot. A default value is derived from
+the number of features and the number of facets.}
+
+\item{units}{(\emph{optional}) Plot size unit. Either \code{cm} (default), \code{mm} or \verb{in}.}
+
+\item{export_collection}{(\emph{optional}) Exports the collection if TRUE.}
+
+\item{...}{
+  Arguments passed on to \code{\link[=as_familiar_collection]{as_familiar_collection}}, \code{\link[ggplot2:ggsave]{ggplot2::ggsave}}, \code{\link[=extract_calibration_data]{extract_calibration_data}}
+  \describe{
+ \item{\code{familiar_data_names}}{Names of the dataset(s). Only used if the \code{object} parameter
+is one or more \code{familiarData} objects.}
+ \item{\code{collection_name}}{Name of the collection.}
+ \item{\code{filename}}{File name to create on disk.}
+ \item{\code{plot}}{Plot to save, defaults to last plot displayed.}
+ \item{\code{device}}{Device to use. Can either be a device function
+(e.g. \link{png}), or one of "eps", "ps", "tex" (pictex),
+"pdf", "jpeg", "tiff", "png", "bmp", "svg" or "wmf" (windows only). If
+\code{NULL} (default), the device is guessed based on the \code{filename} extension.}
+ \item{\code{path}}{Path of the directory to save plot to: \code{path} and \code{filename}
+are combined to create the fully qualified file name. Defaults to the
+working directory.}
+ \item{\code{scale}}{Multiplicative scaling factor.}
+ \item{\code{dpi}}{Plot resolution. Also accepts a string input: "retina" (320),
+"print" (300), or "screen" (72). 
Applies only to raster output types.} + \item{\code{limitsize}}{When \code{TRUE} (the default), \code{ggsave()} will not +save images larger than 50x50 inches, to prevent the common error of +specifying dimensions in pixels.} + \item{\code{bg}}{Background colour. If \code{NULL}, uses the \code{plot.background} fill value +from the plot theme.} + \item{\code{create.dir}}{Whether to create new directories if a non-existing +directory is specified in the \code{filename} or \code{path} (\code{TRUE}) or return an +error (\code{FALSE}, default). If \code{FALSE} and run in an interactive session, +a prompt will appear asking to create a new directory when necessary.} + \item{\code{data}}{A \code{dataObject} object, \code{data.table} or \code{data.frame} that +constitutes the data that are assessed.} + \item{\code{is_pre_processed}}{Flag that indicates whether the data was already +pre-processed externally, e.g. normalised and clustered. Only used if the +\code{data} argument is a \code{data.table} or \code{data.frame}.} + \item{\code{cl}}{Cluster created using the \code{parallel} package. This cluster is then +used to speed up computation through parallellisation.} + \item{\code{evaluation_times}}{One or more time points that are used for in analysis of +survival problems when data has to be assessed at a set time, e.g. +calibration. If not provided explicitly, this parameter is read from +settings used at creation of the underlying \code{familiarModel} objects. Only +used for \code{survival} outcomes.} + \item{\code{ensemble_method}}{Method for ensembling predictions from models for the +same sample. Available methods are: +\itemize{ +\item \code{median} (default): Use the median of the predicted values as the ensemble +value for a sample. +\item \code{mean}: Use the mean of the predicted values as the ensemble value for a +sample. +}} + \item{\code{verbose}}{Flag to indicate whether feedback should be provided on the +computation and extraction of various data elements.} + \item{\code{message_indent}}{Number of indentation steps for messages shown during +computation and extraction of various data elements.} + \item{\code{detail_level}}{(\emph{optional}) Sets the level at which results are computed +and aggregated. +\itemize{ +\item \code{ensemble}: Results are computed at the ensemble level, i.e. over all +models in the ensemble. This means that, for example, bias-corrected +estimates of model performance are assessed by creating (at least) 20 +bootstraps and computing the model performance of the ensemble model for +each bootstrap. +\item \code{hybrid} (default): Results are computed at the level of models in an +ensemble. This means that, for example, bias-corrected estimates of model +performance are directly computed using the models in the ensemble. If there +are at least 20 trained models in the ensemble, performance is computed for +each model, in contrast to \code{ensemble} where performance is computed for the +ensemble of models. If there are less than 20 trained models in the +ensemble, bootstraps are created so that at least 20 point estimates can be +made. +\item \code{model}: Results are computed at the model level. This means that, for +example, bias-corrected estimates of model performance are assessed by +creating (at least) 20 bootstraps and computing the performance of the model +for each bootstrap. +} + +Note that each level of detail has a different interpretation for bootstrap +confidence intervals. 
For \code{ensemble} and \code{model} these are the confidence +intervals for the ensemble and an individual model, respectively. That is, +the confidence interval describes the range where an estimate produced by a +respective ensemble or model trained on a repeat of the experiment may be +found with the probability of the confidence level. For \code{hybrid}, it +represents the range where any single model trained on a repeat of the +experiment may be found with the probability of the confidence level. By +definition, confidence intervals obtained using \code{hybrid} are at least as +wide as those for \code{ensemble}. \code{hybrid} offers the correct interpretation if +the goal of the analysis is to assess the result of a single, unspecified, +model. + +\code{hybrid} is generally computationally less expensive then \code{ensemble}, which +in turn is somewhat less expensive than \code{model}. + +A non-default \code{detail_level} parameter can be specified for separate +evaluation steps by providing a parameter value in a named list with data +elements, e.g. \code{list("auc_data"="ensemble", "model_performance"="hybrid")}. +This parameter can be set for the following data elements: \code{auc_data}, +\code{decision_curve_analyis}, \code{model_performance}, \code{permutation_vimp}, +\code{ice_data}, \code{prediction_data} and \code{confusion_matrix}.} + \item{\code{estimation_type}}{(\emph{optional}) Sets the type of estimation that should be +possible. This has the following options: +\itemize{ +\item \code{point}: Point estimates. +\item \code{bias_correction} or \code{bc}: Bias-corrected estimates. A bias-corrected +estimate is computed from (at least) 20 point estimates, and \code{familiar} may +bootstrap the data to create them. +\item \code{bootstrap_confidence_interval} or \code{bci} (default): Bias-corrected +estimates with bootstrap confidence intervals (Efron and Hastie, 2016). The +number of point estimates required depends on the \code{confidence_level} +parameter, and \code{familiar} may bootstrap the data to create them. +} + +As with \code{detail_level}, a non-default \code{estimation_type} parameter can be +specified for separate evaluation steps by providing a parameter value in a +named list with data elements, e.g. \code{list("auc_data"="bci", "model_performance"="point")}. This parameter can be set for the following +data elements: \code{auc_data}, \code{decision_curve_analyis}, \code{model_performance}, +\code{permutation_vimp}, \code{ice_data}, and \code{prediction_data}.} + \item{\code{aggregate_results}}{(\emph{optional}) Flag that signifies whether results +should be aggregated during evaluation. If \code{estimation_type} is +\code{bias_correction} or \code{bc}, aggregation leads to a single bias-corrected +estimate. If \code{estimation_type} is \code{bootstrap_confidence_interval} or \code{bci}, +aggregation leads to a single bias-corrected estimate with lower and upper +boundaries of the confidence interval. This has no effect if +\code{estimation_type} is \code{point}. + +The default value is equal to \code{TRUE} except when assessing metrics to assess +model performance, as the default violin plot requires underlying data. + +As with \code{detail_level} and \code{estimation_type}, a non-default +\code{aggregate_results} parameter can be specified for separate evaluation steps +by providing a parameter value in a named list with data elements, e.g. +\code{list("auc_data"=TRUE, , "model_performance"=FALSE)}. 
This parameter exists
+for the same elements as \code{estimation_type}.}
+ \item{\code{confidence_level}}{(\emph{optional}) Numeric value for the level at which
+confidence intervals are determined. When bootstraps are used to
+determine the confidence intervals, \code{familiar} uses the
+rule of thumb \eqn{n = 20 / ci.level} to determine the number of required
+bootstraps.
+
+The default value is \code{0.95}.}
+ \item{\code{bootstrap_ci_method}}{(\emph{optional}) Method used to determine bootstrap
+confidence intervals (Efron and Hastie, 2016). The following methods are
+implemented:
+\itemize{
+\item \code{percentile} (default): Confidence intervals obtained using the percentile
+method.
+\item \code{bc}: Bias-corrected confidence intervals.
+}
+
+Note that the standard method is not implemented because this method is
+often not suitable due to non-normal distributions. The bias-corrected and
+accelerated (BCa) method is not implemented yet.}
+ }}
+}
+\value{
+\code{NULL} or list of plot objects, if \code{dir_path} is \code{NULL}.
+}
+\description{
+This method creates calibration plots from calibration data
+stored in a familiarCollection object. For these figures, the expected
+(predicted) values are plotted against the observed values. A
+well-calibrated model should be close to the identity line.
+}
+\details{
+This function generates a calibration plot for each model in each
+dataset. Any data used for calibration (e.g. baseline survival) is obtained
+during model creation.
+
+Available splitting variables are: \code{fs_method}, \code{learner}, \code{data_set},
+\code{evaluation_time} (survival analysis only) and \code{positive_class} (multinomial
+endpoints only). By default, separate figures are created for each
+combination of \code{fs_method} and \code{learner}, with faceting by \code{data_set}.
+
+Calibration in survival analysis is performed at set time points so that
+survival probabilities can be computed from the model, and compared with
+observed survival probabilities. This is done differently depending on the
+underlying model. For Cox proportional hazards regression models, the base
+survival (of the development samples) is used, whereas accelerated failure
+time models (e.g. Weibull) and survival random forests can be used to
+directly predict survival probabilities at a given time point. For survival
+analysis, \code{evaluation_time} is an additional facet variable (by default).
+
+Calibration for multinomial endpoints is performed in a one-against-all
+manner. This yields calibration information for each individual class of the
+endpoint. For such endpoints, \code{positive_class} is an additional facet variable
+(by default).
+
+Calibration plots have a density plot in the margin, which shows the density
+of the plotted points, ordered by the expected probability or value. For
+binomial and multinomial outcomes, the densities for positive and negative
+classes are shown separately. Note that this information is only provided
+when \code{color_by} is not used as a splitting variable (i.e. one calibration
+plot per facet).
+
+Calibration plots are annotated with the intercept and the slope of a linear
+model fitted to the sample points. A well-calibrated model has an intercept
+close to 0.0 and a slope of 1.0. Intercept and slope are shown with their
+respective 95\% confidence intervals. In addition, goodness-of-fit tests may
+be shown. 
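A minimal sketch of how these options might be combined for a survival task; `ensemble` (a `familiarEnsemble`) and `validation_data` are hypothetical objects, and the argument values are illustrative only:

```r
library(familiar)

# Sketch only: `ensemble` is a hypothetical familiarEnsemble from a survival
# experiment and `validation_data` a data.table with the data to assess.
calibration_plots <- plot_calibration_data(
  object = ensemble,
  data = validation_data,             # passed through to extract_calibration_data()
  draw = FALSE,
  dir_path = NULL,                    # return plot objects instead of writing files
  facet_by = c("data_set", "evaluation_time"),
  show_density = TRUE,
  show_calibration_fit = TRUE,
  show_goodness_of_fit = TRUE,
  conf_int_style = "ribbon",
  # evaluation settings, also passed through to extract_calibration_data():
  detail_level = "ensemble",
  estimation_type = "bootstrap_confidence_interval",
  confidence_level = 0.95
)
```

When `show_goodness_of_fit` is `TRUE`, the annotated goodness-of-fit tests depend on the outcome type.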
For most endpoints these are based on the Hosmer-Lemeshow (HL) +test, but for survival endpoints both the Nam-D'Agostino (ND) and the +Greenwood-Nam-D'Agostino (GND) tests are shown. Note that this information +is only annotated when \code{color_by} is not used as a splitting variable (i.e. +one calibration plot per facet). + +Available palettes for \code{discrete_palette} are those listed by +\code{grDevices::palette.pals()} (requires R >= 4.0.0), \code{grDevices::hcl.pals()} +(requires R >= 3.6.0) and \code{rainbow}, \code{heat.colors}, \code{terrain.colors}, +\code{topo.colors} and \code{cm.colors}, which correspond to the palettes of the same +name in \code{grDevices}. If not specified, a default palette based on palettes +in Tableau are used. You may also specify your own palette by using colour +names listed by \code{grDevices::colors()} or through hexadecimal RGB strings. + +Labeling methods such as \code{set_risk_group_names} or \code{set_data_set_names} can +be applied to the \code{familiarCollection} object to update labels, and order +the output in the figure. +} +\references{ +\enumerate{ +\item Hosmer, D. W., Hosmer, T., Le Cessie, S. & Lemeshow, S. A +comparison of goodness-of-fit tests for the logistic regression model. Stat. +Med. 16, 965–980 (1997). +\item D’Agostino, R. B. & Nam, B.-H. Evaluation of the Performance of Survival +Analysis Models: Discrimination and Calibration Measures. in Handbook of +Statistics vol. 23 1–25 (Elsevier, 2003). +\item Demler, O. V., Paynter, N. P. & Cook, N. R. Tests of calibration and +goodness-of-fit in the survival setting. Stat. Med. 34, 1659–1680 (2015). +} +} diff --git a/man/plot_confusion_matrix-methods.Rd b/man/plot_confusion_matrix-methods.Rd new file mode 100644 index 00000000..8e9d329e --- /dev/null +++ b/man/plot_confusion_matrix-methods.Rd @@ -0,0 +1,282 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/PlotConfusionMatrix.R +\name{plot_confusion_matrix} +\alias{plot_confusion_matrix} +\alias{plot_confusion_matrix,ANY-method} +\alias{plot_confusion_matrix,familiarCollection-method} +\title{Plot confusion matrix.} +\usage{ +plot_confusion_matrix( + object, + draw = FALSE, + dir_path = NULL, + split_by = NULL, + facet_by = NULL, + facet_wrap_cols = NULL, + ggtheme = NULL, + discrete_palette = NULL, + x_label = waiver(), + y_label = waiver(), + legend_label = waiver(), + plot_title = waiver(), + plot_sub_title = waiver(), + caption = NULL, + rotate_x_tick_labels = waiver(), + show_alpha = TRUE, + width = waiver(), + height = waiver(), + units = waiver(), + export_collection = FALSE, + ... +) + +\S4method{plot_confusion_matrix}{ANY}( + object, + draw = FALSE, + dir_path = NULL, + split_by = NULL, + facet_by = NULL, + facet_wrap_cols = NULL, + ggtheme = NULL, + discrete_palette = NULL, + x_label = waiver(), + y_label = waiver(), + legend_label = waiver(), + plot_title = waiver(), + plot_sub_title = waiver(), + caption = NULL, + rotate_x_tick_labels = waiver(), + show_alpha = TRUE, + width = waiver(), + height = waiver(), + units = waiver(), + export_collection = FALSE, + ... 
+) + +\S4method{plot_confusion_matrix}{familiarCollection}( + object, + draw = FALSE, + dir_path = NULL, + split_by = NULL, + facet_by = NULL, + facet_wrap_cols = NULL, + ggtheme = NULL, + discrete_palette = NULL, + x_label = waiver(), + y_label = waiver(), + legend_label = waiver(), + plot_title = waiver(), + plot_sub_title = waiver(), + caption = NULL, + rotate_x_tick_labels = waiver(), + show_alpha = TRUE, + width = waiver(), + height = waiver(), + units = waiver(), + export_collection = FALSE, + ... +) +} +\arguments{ +\item{object}{\code{familiarCollection} object, or one or more \code{familiarData} +objects, that will be internally converted to a \code{familiarCollection} object. +It is also possible to provide a \code{familiarEnsemble} or one or more +\code{familiarModel} objects together with the data from which data is computed +prior to export. Paths to such files can also be provided.} + +\item{draw}{(\emph{optional}) Draws the plot if TRUE.} + +\item{dir_path}{(\emph{optional}) Path to the directory where created confusion +matrixes are saved to. Output is saved in the \code{performance} subdirectory. If +\code{NULL} no figures are saved, but are returned instead.} + +\item{split_by}{(\emph{optional}) Splitting variables. This refers to column names +on which datasets are split. A separate figure is created for each split. +See details for available variables.} + +\item{facet_by}{(\emph{optional}) Variables used to determine how and if facets of +each figure appear. In case the \code{facet_wrap_cols} argument is \code{NULL}, the +first variable is used to define columns, and the remaing variables are +used to define rows of facets. The variables cannot overlap with those +provided to the \code{split_by} argument, but may overlap with other arguments. +See details for available variables.} + +\item{facet_wrap_cols}{(\emph{optional}) Number of columns to generate when facet +wrapping. If NULL, a facet grid is produced instead.} + +\item{ggtheme}{(\emph{optional}) \code{ggplot} theme to use for plotting.} + +\item{discrete_palette}{(\emph{optional}) Palette used to colour the confusion +matrix. The colour depends on whether each cell of the confusion matrix is +on the diagonal (observed outcome matched expected outcome) or not.} + +\item{x_label}{(\emph{optional}) Label to provide to the x-axis. If NULL, no label +is shown.} + +\item{y_label}{(\emph{optional}) Label to provide to the y-axis. If NULL, no label +is shown.} + +\item{legend_label}{(\emph{optional}) Label to provide to the legend. If NULL, the +legend will not have a name.} + +\item{plot_title}{(\emph{optional}) Label to provide as figure title. If NULL, no +title is shown.} + +\item{plot_sub_title}{(\emph{optional}) Label to provide as figure subtitle. If +NULL, no subtitle is shown.} + +\item{caption}{(\emph{optional}) Label to provide as figure caption. If NULL, no +caption is shown.} + +\item{rotate_x_tick_labels}{(\emph{optional}) Rotate tick labels on the x-axis by +90 degrees. Defaults to \code{TRUE}. Rotation of x-axis tick labels may also be +controlled through the \code{ggtheme}. In this case, \code{FALSE} should be provided +explicitly.} + +\item{show_alpha}{(\emph{optional}) Interpreting confusion matrices is made easier +by setting the opacity of the cells. \code{show_alpha} takes the following +values: +\itemize{ +\item \code{none}: Cell opacity is not altered. Diagonal and off-diagonal cells are +completely opaque and transparent, respectively. Same as \code{show_alpha=FALSE}. 
+\item \code{by_class}: Cell opacity is normalised by the number of instances for each +observed outcome class in each confusion matrix. +\item \code{by_matrix} (default): Cell opacity is normalised by the number of +instances in the largest observed outcome class in each confusion matrix. +Same as \code{show_alpha=TRUE} +\item \code{by_figure}: Cell opacity is normalised by the number of instances in the +largest observed outcome class across confusion matrices in different +facets. +\item \code{by_all}: Cell opacity is normalised by the number of instances in the +largest observed outcome class across all confusion matrices. +}} + +\item{width}{(\emph{optional}) Width of the plot. A default value is derived from +the number of facets.} + +\item{height}{(\emph{optional}) Height of the plot. A default value is derived from +the number of features and the number of facets.} + +\item{units}{(\emph{optional}) Plot size unit. Either \code{cm} (default), \code{mm} or \verb{in}.} + +\item{export_collection}{(\emph{optional}) Exports the collection if TRUE.} + +\item{...}{ + Arguments passed on to \code{\link[=as_familiar_collection]{as_familiar_collection}}, \code{\link[ggplot2:ggsave]{ggplot2::ggsave}}, \code{\link[=extract_confusion_matrix]{extract_confusion_matrix}} + \describe{ + \item{\code{familiar_data_names}}{Names of the dataset(s). Only used if the \code{object} parameter +is one or more \code{familiarData} objects.} + \item{\code{collection_name}}{Name of the collection.} + \item{\code{filename}}{File name to create on disk.} + \item{\code{plot}}{Plot to save, defaults to last plot displayed.} + \item{\code{device}}{Device to use. Can either be a device function +(e.g. \link{png}), or one of "eps", "ps", "tex" (pictex), +"pdf", "jpeg", "tiff", "png", "bmp", "svg" or "wmf" (windows only). If +\code{NULL} (default), the device is guessed based on the \code{filename} extension.} + \item{\code{path}}{Path of the directory to save plot to: \code{path} and \code{filename} +are combined to create the fully qualified file name. Defaults to the +working directory.} + \item{\code{scale}}{Multiplicative scaling factor.} + \item{\code{dpi}}{Plot resolution. Also accepts a string input: "retina" (320), +"print" (300), or "screen" (72). Applies only to raster output types.} + \item{\code{limitsize}}{When \code{TRUE} (the default), \code{ggsave()} will not +save images larger than 50x50 inches, to prevent the common error of +specifying dimensions in pixels.} + \item{\code{bg}}{Background colour. If \code{NULL}, uses the \code{plot.background} fill value +from the plot theme.} + \item{\code{create.dir}}{Whether to create new directories if a non-existing +directory is specified in the \code{filename} or \code{path} (\code{TRUE}) or return an +error (\code{FALSE}, default). If \code{FALSE} and run in an interactive session, +a prompt will appear asking to create a new directory when necessary.} + \item{\code{data}}{A \code{dataObject} object, \code{data.table} or \code{data.frame} that +constitutes the data that are assessed.} + \item{\code{is_pre_processed}}{Flag that indicates whether the data was already +pre-processed externally, e.g. normalised and clustered. Only used if the +\code{data} argument is a \code{data.table} or \code{data.frame}.} + \item{\code{cl}}{Cluster created using the \code{parallel} package. This cluster is then +used to speed up computation through parallellisation.} + \item{\code{ensemble_method}}{Method for ensembling predictions from models for the +same sample. 
Available methods are:
+\itemize{
+\item \code{median} (default): Use the median of the predicted values as the ensemble
+value for a sample.
+\item \code{mean}: Use the mean of the predicted values as the ensemble value for a
+sample.
+}}
+ \item{\code{verbose}}{Flag to indicate whether feedback should be provided on the
+computation and extraction of various data elements.}
+ \item{\code{message_indent}}{Number of indentation steps for messages shown during
+computation and extraction of various data elements.}
+ \item{\code{detail_level}}{(\emph{optional}) Sets the level at which results are computed
+and aggregated.
+\itemize{
+\item \code{ensemble}: Results are computed at the ensemble level, i.e. over all
+models in the ensemble. This means that, for example, bias-corrected
+estimates of model performance are assessed by creating (at least) 20
+bootstraps and computing the model performance of the ensemble model for
+each bootstrap.
+\item \code{hybrid} (default): Results are computed at the level of models in an
+ensemble. This means that, for example, bias-corrected estimates of model
+performance are directly computed using the models in the ensemble. If there
+are at least 20 trained models in the ensemble, performance is computed for
+each model, in contrast to \code{ensemble} where performance is computed for the
+ensemble of models. If there are less than 20 trained models in the
+ensemble, bootstraps are created so that at least 20 point estimates can be
+made.
+\item \code{model}: Results are computed at the model level. This means that, for
+example, bias-corrected estimates of model performance are assessed by
+creating (at least) 20 bootstraps and computing the performance of the model
+for each bootstrap.
+}
+
+Note that each level of detail has a different interpretation for bootstrap
+confidence intervals. For \code{ensemble} and \code{model} these are the confidence
+intervals for the ensemble and an individual model, respectively. That is,
+the confidence interval describes the range where an estimate produced by a
+respective ensemble or model trained on a repeat of the experiment may be
+found with the probability of the confidence level. For \code{hybrid}, it
+represents the range where any single model trained on a repeat of the
+experiment may be found with the probability of the confidence level. By
+definition, confidence intervals obtained using \code{hybrid} are at least as
+wide as those for \code{ensemble}. \code{hybrid} offers the correct interpretation if
+the goal of the analysis is to assess the result of a single, unspecified,
+model.
+
+\code{hybrid} is generally computationally less expensive than \code{ensemble}, which
+in turn is somewhat less expensive than \code{model}.
+
+A non-default \code{detail_level} parameter can be specified for separate
+evaluation steps by providing a parameter value in a named list with data
+elements, e.g. \code{list("auc_data"="ensemble", "model_performance"="hybrid")}.
+This parameter can be set for the following data elements: \code{auc_data},
+\code{decision_curve_analyis}, \code{model_performance}, \code{permutation_vimp},
+\code{ice_data}, \code{prediction_data} and \code{confusion_matrix}.}
+ }}
+}
+\value{
+\code{NULL} or list of plot objects, if \code{dir_path} is \code{NULL}.
+}
+\description{
+This method creates confusion matrices based on data in a
+familiarCollection object.
+}
+\details{
+This function generates confusion matrix plots.
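A minimal sketch of a typical call; the `collection` object and the argument values are assumed for illustration:

```r
library(familiar)

# Sketch only: `collection` is a hypothetical familiarCollection for a
# categorical outcome.
confusion_plots <- plot_confusion_matrix(
  object = collection,
  draw = FALSE,
  dir_path = NULL,              # return plot objects instead of saving them
  split_by = c("fs_method", "learner"),
  facet_by = "data_set",
  show_alpha = "by_class",      # normalise cell opacity per observed outcome class
  rotate_x_tick_labels = TRUE
)
```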
+ +Available splitting variables are: \code{fs_method}, \code{learner} and \code{data_set}. By +default, the data is split by \code{fs_method} and \code{learner}, with facetting by +\code{data_set}. + +Available palettes for \code{discrete_palette} are those listed by +\code{grDevices::palette.pals()} (requires R >= 4.0.0), \code{grDevices::hcl.pals()} +(requires R >= 3.6.0) and \code{rainbow}, \code{heat.colors}, \code{terrain.colors}, +\code{topo.colors} and \code{cm.colors}, which correspond to the palettes of the same +name in \code{grDevices}. If not specified, a default palette based on palettes +in Tableau are used. You may also specify your own palette by using colour +names listed by \code{grDevices::colors()} or through hexadecimal RGB strings. + +Labeling methods such as \code{set_fs_method_names} or \code{set_data_set_names} can +be applied to the \code{familiarCollection} object to update labels, and order +the output in the figure. +} diff --git a/man/plot_decision_curve-methods.Rd b/man/plot_decision_curve-methods.Rd new file mode 100644 index 00000000..5c753b2b --- /dev/null +++ b/man/plot_decision_curve-methods.Rd @@ -0,0 +1,393 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/PlotDecisionCurves.R +\name{plot_decision_curve} +\alias{plot_decision_curve} +\alias{plot_decision_curve,ANY-method} +\alias{plot_decision_curve,familiarCollection-method} +\title{Plot decision curves.} +\usage{ +plot_decision_curve( + object, + draw = FALSE, + dir_path = NULL, + split_by = NULL, + color_by = NULL, + facet_by = NULL, + facet_wrap_cols = NULL, + ggtheme = NULL, + discrete_palette = NULL, + x_label = waiver(), + y_label = waiver(), + legend_label = waiver(), + plot_title = waiver(), + plot_sub_title = waiver(), + caption = NULL, + x_range = NULL, + x_n_breaks = 5, + x_breaks = NULL, + y_range = NULL, + y_n_breaks = 5, + y_breaks = NULL, + conf_int_style = c("ribbon", "step", "none"), + conf_int_alpha = 0.4, + width = waiver(), + height = waiver(), + units = waiver(), + export_collection = FALSE, + ... +) + +\S4method{plot_decision_curve}{ANY}( + object, + draw = FALSE, + dir_path = NULL, + split_by = NULL, + color_by = NULL, + facet_by = NULL, + facet_wrap_cols = NULL, + ggtheme = NULL, + discrete_palette = NULL, + x_label = waiver(), + y_label = waiver(), + legend_label = waiver(), + plot_title = waiver(), + plot_sub_title = waiver(), + caption = NULL, + x_range = NULL, + x_n_breaks = 5, + x_breaks = NULL, + y_range = NULL, + y_n_breaks = 5, + y_breaks = NULL, + conf_int_style = c("ribbon", "step", "none"), + conf_int_alpha = 0.4, + width = waiver(), + height = waiver(), + units = waiver(), + export_collection = FALSE, + ... +) + +\S4method{plot_decision_curve}{familiarCollection}( + object, + draw = FALSE, + dir_path = NULL, + split_by = NULL, + color_by = NULL, + facet_by = NULL, + facet_wrap_cols = NULL, + ggtheme = NULL, + discrete_palette = NULL, + x_label = waiver(), + y_label = waiver(), + legend_label = waiver(), + plot_title = waiver(), + plot_sub_title = waiver(), + caption = NULL, + x_range = NULL, + x_n_breaks = 5, + x_breaks = NULL, + y_range = NULL, + y_n_breaks = 5, + y_breaks = NULL, + conf_int_style = c("ribbon", "step", "none"), + conf_int_alpha = 0.4, + width = waiver(), + height = waiver(), + units = waiver(), + export_collection = FALSE, + ... +) +} +\arguments{ +\item{object}{\code{familiarCollection} object, or one or more \code{familiarData} +objects, that will be internally converted to a \code{familiarCollection} object. 
+It is also possible to provide a \code{familiarEnsemble} or one or more
+\code{familiarModel} objects together with the data from which data is computed
+prior to export. Paths to such files can also be provided.}
+
+\item{draw}{(\emph{optional}) Draws the plot if TRUE.}
+
+\item{dir_path}{(\emph{optional}) Path to the directory where created decision
+curve plots are saved to. Output is saved in the \code{decision_curve_analysis}
+subdirectory. If \code{NULL} no figures are saved, but are returned
+instead.}
+
+\item{split_by}{(\emph{optional}) Splitting variables. This refers to column names
+on which datasets are split. A separate figure is created for each split.
+See details for available variables.}
+
+\item{color_by}{(\emph{optional}) Variables used to determine fill colour of plot
+objects. The variables cannot overlap with those provided to the \code{split_by}
+argument, but may overlap with other arguments. See details for available
+variables.}
+
+\item{facet_by}{(\emph{optional}) Variables used to determine how and if facets of
+each figure appear. In case the \code{facet_wrap_cols} argument is \code{NULL}, the
+first variable is used to define columns, and the remaining variables are
+used to define rows of facets. The variables cannot overlap with those
+provided to the \code{split_by} argument, but may overlap with other arguments.
+See details for available variables.}
+
+\item{facet_wrap_cols}{(\emph{optional}) Number of columns to generate when facet
+wrapping. If NULL, a facet grid is produced instead.}
+
+\item{ggtheme}{(\emph{optional}) \code{ggplot} theme to use for plotting.}
+
+\item{discrete_palette}{(\emph{optional}) Palette to use to color the different
+plot elements in case a value was provided to the \code{color_by} argument.}
+
+\item{x_label}{(\emph{optional}) Label to provide to the x-axis. If NULL, no label
+is shown.}
+
+\item{y_label}{(\emph{optional}) Label to provide to the y-axis. If NULL, no label
+is shown.}
+
+\item{legend_label}{(\emph{optional}) Label to provide to the legend. If NULL, the
+legend will not have a name.}
+
+\item{plot_title}{(\emph{optional}) Label to provide as figure title. If NULL, no
+title is shown.}
+
+\item{plot_sub_title}{(\emph{optional}) Label to provide as figure subtitle. If
+NULL, no subtitle is shown.}
+
+\item{caption}{(\emph{optional}) Label to provide as figure caption. If NULL, no
+caption is shown.}
+
+\item{x_range}{(\emph{optional}) Value range for the x-axis.}
+
+\item{x_n_breaks}{(\emph{optional}) Number of breaks to show on the x-axis of the
+plot. \code{x_n_breaks} is used to determine the \code{x_breaks} argument in case it
+is unset.}
+
+\item{x_breaks}{(\emph{optional}) Break points on the x-axis of the plot.}
+
+\item{y_range}{(\emph{optional}) Value range for the y-axis.}
+
+\item{y_n_breaks}{(\emph{optional}) Number of breaks to show on the y-axis of the
+plot. \code{y_n_breaks} is used to determine the \code{y_breaks} argument in case it
+is unset.}
+
+\item{y_breaks}{(\emph{optional}) Break points on the y-axis of the plot.}
+
+\item{conf_int_style}{(\emph{optional}) Confidence interval style. See details for
+allowed styles.}
+
+\item{conf_int_alpha}{(\emph{optional}) Alpha value to determine transparency of
+confidence intervals or, alternatively, other plot elements with which the
+confidence interval overlaps. Only values between 0.0 (fully transparent)
+and 1.0 (fully opaque) are allowed.}
+
+\item{width}{(\emph{optional}) Width of the plot. 
A default value is derived from +the number of facets.} + +\item{height}{(\emph{optional}) Height of the plot. A default value is derived from +the number of features and the number of facets.} + +\item{units}{(\emph{optional}) Plot size unit. Either \code{cm} (default), \code{mm} or \verb{in}.} + +\item{export_collection}{(\emph{optional}) Exports the collection if TRUE.} + +\item{...}{ + Arguments passed on to \code{\link[=as_familiar_collection]{as_familiar_collection}}, \code{\link[ggplot2:ggsave]{ggplot2::ggsave}}, \code{\link[=extract_decision_curve_data]{extract_decision_curve_data}} + \describe{ + \item{\code{familiar_data_names}}{Names of the dataset(s). Only used if the \code{object} parameter +is one or more \code{familiarData} objects.} + \item{\code{collection_name}}{Name of the collection.} + \item{\code{filename}}{File name to create on disk.} + \item{\code{plot}}{Plot to save, defaults to last plot displayed.} + \item{\code{device}}{Device to use. Can either be a device function +(e.g. \link{png}), or one of "eps", "ps", "tex" (pictex), +"pdf", "jpeg", "tiff", "png", "bmp", "svg" or "wmf" (windows only). If +\code{NULL} (default), the device is guessed based on the \code{filename} extension.} + \item{\code{path}}{Path of the directory to save plot to: \code{path} and \code{filename} +are combined to create the fully qualified file name. Defaults to the +working directory.} + \item{\code{scale}}{Multiplicative scaling factor.} + \item{\code{dpi}}{Plot resolution. Also accepts a string input: "retina" (320), +"print" (300), or "screen" (72). Applies only to raster output types.} + \item{\code{limitsize}}{When \code{TRUE} (the default), \code{ggsave()} will not +save images larger than 50x50 inches, to prevent the common error of +specifying dimensions in pixels.} + \item{\code{bg}}{Background colour. If \code{NULL}, uses the \code{plot.background} fill value +from the plot theme.} + \item{\code{create.dir}}{Whether to create new directories if a non-existing +directory is specified in the \code{filename} or \code{path} (\code{TRUE}) or return an +error (\code{FALSE}, default). If \code{FALSE} and run in an interactive session, +a prompt will appear asking to create a new directory when necessary.} + \item{\code{data}}{A \code{dataObject} object, \code{data.table} or \code{data.frame} that +constitutes the data that are assessed.} + \item{\code{is_pre_processed}}{Flag that indicates whether the data was already +pre-processed externally, e.g. normalised and clustered. Only used if the +\code{data} argument is a \code{data.table} or \code{data.frame}.} + \item{\code{cl}}{Cluster created using the \code{parallel} package. This cluster is then +used to speed up computation through parallellisation.} + \item{\code{evaluation_times}}{One or more time points that are used for in analysis of +survival problems when data has to be assessed at a set time, e.g. +calibration. If not provided explicitly, this parameter is read from +settings used at creation of the underlying \code{familiarModel} objects. Only +used for \code{survival} outcomes.} + \item{\code{ensemble_method}}{Method for ensembling predictions from models for the +same sample. Available methods are: +\itemize{ +\item \code{median} (default): Use the median of the predicted values as the ensemble +value for a sample. +\item \code{mean}: Use the mean of the predicted values as the ensemble value for a +sample. 
+}} + \item{\code{verbose}}{Flag to indicate whether feedback should be provided on the +computation and extraction of various data elements.} + \item{\code{message_indent}}{Number of indentation steps for messages shown during +computation and extraction of various data elements.} + \item{\code{detail_level}}{(\emph{optional}) Sets the level at which results are computed +and aggregated. +\itemize{ +\item \code{ensemble}: Results are computed at the ensemble level, i.e. over all +models in the ensemble. This means that, for example, bias-corrected +estimates of model performance are assessed by creating (at least) 20 +bootstraps and computing the model performance of the ensemble model for +each bootstrap. +\item \code{hybrid} (default): Results are computed at the level of models in an +ensemble. This means that, for example, bias-corrected estimates of model +performance are directly computed using the models in the ensemble. If there +are at least 20 trained models in the ensemble, performance is computed for +each model, in contrast to \code{ensemble} where performance is computed for the +ensemble of models. If there are less than 20 trained models in the +ensemble, bootstraps are created so that at least 20 point estimates can be +made. +\item \code{model}: Results are computed at the model level. This means that, for +example, bias-corrected estimates of model performance are assessed by +creating (at least) 20 bootstraps and computing the performance of the model +for each bootstrap. +} + +Note that each level of detail has a different interpretation for bootstrap +confidence intervals. For \code{ensemble} and \code{model} these are the confidence +intervals for the ensemble and an individual model, respectively. That is, +the confidence interval describes the range where an estimate produced by a +respective ensemble or model trained on a repeat of the experiment may be +found with the probability of the confidence level. For \code{hybrid}, it +represents the range where any single model trained on a repeat of the +experiment may be found with the probability of the confidence level. By +definition, confidence intervals obtained using \code{hybrid} are at least as +wide as those for \code{ensemble}. \code{hybrid} offers the correct interpretation if +the goal of the analysis is to assess the result of a single, unspecified, +model. + +\code{hybrid} is generally computationally less expensive then \code{ensemble}, which +in turn is somewhat less expensive than \code{model}. + +A non-default \code{detail_level} parameter can be specified for separate +evaluation steps by providing a parameter value in a named list with data +elements, e.g. \code{list("auc_data"="ensemble", "model_performance"="hybrid")}. +This parameter can be set for the following data elements: \code{auc_data}, +\code{decision_curve_analyis}, \code{model_performance}, \code{permutation_vimp}, +\code{ice_data}, \code{prediction_data} and \code{confusion_matrix}.} + \item{\code{estimation_type}}{(\emph{optional}) Sets the type of estimation that should be +possible. This has the following options: +\itemize{ +\item \code{point}: Point estimates. +\item \code{bias_correction} or \code{bc}: Bias-corrected estimates. A bias-corrected +estimate is computed from (at least) 20 point estimates, and \code{familiar} may +bootstrap the data to create them. +\item \code{bootstrap_confidence_interval} or \code{bci} (default): Bias-corrected +estimates with bootstrap confidence intervals (Efron and Hastie, 2016). 
The +number of point estimates required depends on the \code{confidence_level} +parameter, and \code{familiar} may bootstrap the data to create them. +} + +As with \code{detail_level}, a non-default \code{estimation_type} parameter can be +specified for separate evaluation steps by providing a parameter value in a +named list with data elements, e.g. \code{list("auc_data"="bci", "model_performance"="point")}. This parameter can be set for the following +data elements: \code{auc_data}, \code{decision_curve_analyis}, \code{model_performance}, +\code{permutation_vimp}, \code{ice_data}, and \code{prediction_data}.} + \item{\code{aggregate_results}}{(\emph{optional}) Flag that signifies whether results +should be aggregated during evaluation. If \code{estimation_type} is +\code{bias_correction} or \code{bc}, aggregation leads to a single bias-corrected +estimate. If \code{estimation_type} is \code{bootstrap_confidence_interval} or \code{bci}, +aggregation leads to a single bias-corrected estimate with lower and upper +boundaries of the confidence interval. This has no effect if +\code{estimation_type} is \code{point}. + +The default value is equal to \code{TRUE} except when assessing metrics to assess +model performance, as the default violin plot requires underlying data. + +As with \code{detail_level} and \code{estimation_type}, a non-default +\code{aggregate_results} parameter can be specified for separate evaluation steps +by providing a parameter value in a named list with data elements, e.g. +\code{list("auc_data"=TRUE, , "model_performance"=FALSE)}. This parameter exists +for the same elements as \code{estimation_type}.} + \item{\code{confidence_level}}{(\emph{optional}) Numeric value for the level at which +confidence intervals are determined. In the case bootstraps are used to +determine the confidence intervals bootstrap estimation, \code{familiar} uses the +rule of thumb \eqn{n = 20 / ci.level} to determine the number of required +bootstraps. + +The default value is \code{0.95}.} + \item{\code{bootstrap_ci_method}}{(\emph{optional}) Method used to determine bootstrap +confidence intervals (Efron and Hastie, 2016). The following methods are +implemented: +\itemize{ +\item \code{percentile} (default): Confidence intervals obtained using the percentile +method. +\item \code{bc}: Bias-corrected confidence intervals. +} + +Note that the standard method is not implemented because this method is +often not suitable due to non-normal distributions. The bias-corrected and +accelerated (BCa) method is not implemented yet.} + }} +} +\value{ +\code{NULL} or list of plot objects, if \code{dir_path} is \code{NULL}. +} +\description{ +This method creates decision curves based on data in a +familiarCollection object. +} +\details{ +This function generates plots for decision curves. + +Available splitting variables are: \code{fs_method}, \code{learner}, \code{data_set} and +\code{positive_class} (categorical outcomes) or \code{evaluation_time} (survival outcomes). +By default, the data is split by \code{fs_method} and \code{learner}, with faceting by +\code{data_set} and colouring by \code{positive_class} or \code{evaluation_time}. + +Available palettes for \code{discrete_palette} are those listed by +\code{grDevices::palette.pals()} (requires R >= 4.0.0), \code{grDevices::hcl.pals()} +(requires R >= 3.6.0) and \code{rainbow}, \code{heat.colors}, \code{terrain.colors}, +\code{topo.colors} and \code{cm.colors}, which correspond to the palettes of the same +name in \code{grDevices}. 
If not specified, a default palette based on palettes +in Tableau are used. You may also specify your own palette by using colour +names listed by \code{grDevices::colors()} or through hexadecimal RGB strings. + +Bootstrap confidence intervals of the decision curve (if present) can be +shown using various styles set by \code{conf_int_style}: +\itemize{ +\item \code{ribbon} (default): confidence intervals are shown as a ribbon with an +opacity of \code{conf_int_alpha} around the point estimate of the decision curve. +\item \code{step} (default): confidence intervals are shown as a step function around +the point estimate of the decision curve. +\item \code{none}: confidence intervals are not shown. The point estimate of the +decision curve is shown as usual. +} + +Labelling methods such as \code{set_fs_method_names} or \code{set_data_set_names} can +be applied to the \code{familiarCollection} object to update labels, and order +the output in the figure. +} +\references{ +\enumerate{ +\item Vickers, A. J. & Elkin, E. B. Decision curve analysis: a novel +method for evaluating prediction models. Med. Decis. Making 26, 565–574 +(2006). +\item Vickers, A. J., Cronin, A. M., Elkin, E. B. & Gonen, M. Extensions to +decision curve analysis, a novel method for evaluating diagnostic tests, +prediction models and molecular markers. BMC Med. Inform. Decis. Mak. 8, 53 +(2008). +\item Vickers, A. J., van Calster, B. & Steyerberg, E. W. A simple, +step-by-step guide to interpreting decision curve analysis. Diagn Progn Res +3, 18 (2019). +} +} diff --git a/man/plot_feature_similarity-methods.Rd b/man/plot_feature_similarity-methods.Rd new file mode 100644 index 00000000..327f9d0d --- /dev/null +++ b/man/plot_feature_similarity-methods.Rd @@ -0,0 +1,396 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/PlotFeatureSimilarity.R +\name{plot_feature_similarity} +\alias{plot_feature_similarity} +\alias{plot_feature_similarity,ANY-method} +\alias{plot_feature_similarity,familiarCollection-method} +\title{Plot heatmaps for pairwise similarity between features.} +\usage{ +plot_feature_similarity( + object, + feature_cluster_method = waiver(), + feature_linkage_method = waiver(), + feature_cluster_cut_method = waiver(), + feature_similarity_threshold = waiver(), + draw = FALSE, + dir_path = NULL, + split_by = NULL, + facet_by = NULL, + facet_wrap_cols = NULL, + ggtheme = NULL, + gradient_palette = NULL, + gradient_palette_range = NULL, + x_label = waiver(), + x_label_shared = "column", + y_label = waiver(), + y_label_shared = "row", + legend_label = waiver(), + plot_title = waiver(), + plot_sub_title = waiver(), + caption = NULL, + y_range = NULL, + y_n_breaks = 3, + y_breaks = NULL, + rotate_x_tick_labels = waiver(), + show_dendrogram = c("top", "right"), + dendrogram_height = grid::unit(1.5, "cm"), + width = waiver(), + height = waiver(), + units = waiver(), + export_collection = FALSE, + ... 
+) + +\S4method{plot_feature_similarity}{ANY}( + object, + feature_cluster_method = waiver(), + feature_linkage_method = waiver(), + feature_cluster_cut_method = waiver(), + feature_similarity_threshold = waiver(), + draw = FALSE, + dir_path = NULL, + split_by = NULL, + facet_by = NULL, + facet_wrap_cols = NULL, + ggtheme = NULL, + gradient_palette = NULL, + gradient_palette_range = NULL, + x_label = waiver(), + x_label_shared = "column", + y_label = waiver(), + y_label_shared = "row", + legend_label = waiver(), + plot_title = waiver(), + plot_sub_title = waiver(), + caption = NULL, + y_range = NULL, + y_n_breaks = 3, + y_breaks = NULL, + rotate_x_tick_labels = waiver(), + show_dendrogram = c("top", "right"), + dendrogram_height = grid::unit(1.5, "cm"), + width = waiver(), + height = waiver(), + units = waiver(), + export_collection = FALSE, + ... +) + +\S4method{plot_feature_similarity}{familiarCollection}( + object, + feature_cluster_method = waiver(), + feature_linkage_method = waiver(), + feature_cluster_cut_method = waiver(), + feature_similarity_threshold = waiver(), + draw = FALSE, + dir_path = NULL, + split_by = NULL, + facet_by = NULL, + facet_wrap_cols = NULL, + ggtheme = NULL, + gradient_palette = NULL, + gradient_palette_range = NULL, + x_label = waiver(), + x_label_shared = "column", + y_label = waiver(), + y_label_shared = "row", + legend_label = waiver(), + plot_title = waiver(), + plot_sub_title = waiver(), + caption = NULL, + y_range = NULL, + y_n_breaks = 3, + y_breaks = NULL, + rotate_x_tick_labels = waiver(), + show_dendrogram = c("top", "right"), + dendrogram_height = grid::unit(1.5, "cm"), + width = waiver(), + height = waiver(), + units = waiver(), + export_collection = FALSE, + ... +) +} +\arguments{ +\item{object}{A \code{familiarCollection} object, or other other objects from which +a \code{familiarCollection} can be extracted. See details for more information.} + +\item{feature_cluster_method}{The method used to perform clustering. These are +the same methods as for the \code{cluster_method} configuration parameter: +\code{none}, \code{hclust}, \code{agnes}, \code{diana} and \code{pam}. + +\code{none} cannot be used when extracting data regarding mutual correlation or +feature expressions. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + +\item{feature_linkage_method}{The method used for agglomerative clustering in +\code{hclust} and \code{agnes}. These are the same methods as for the +\code{cluster_linkage_method} configuration parameter: \code{average}, \code{single}, +\code{complete}, \code{weighted}, and \code{ward}. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + +\item{feature_cluster_cut_method}{The method used to divide features into +separate clusters. The available methods are the same as for the +\code{cluster_cut_method} configuration parameter: \code{silhouette}, \code{fixed_cut} and +\code{dynamic_cut}. + +\code{silhouette} is available for all cluster methods, but \code{fixed_cut} only +applies to methods that create hierarchical trees (\code{hclust}, \code{agnes} and +\code{diana}). \code{dynamic_cut} requires the \code{dynamicTreeCut} package and can only +be used with \code{agnes} and \code{hclust}. 
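As a sketch of how the clustering arguments above, including `feature_cluster_cut_method`, might be set in a call (the `collection` object and the chosen values are assumptions for illustration):

```r
library(familiar)

# Sketch only: `collection` is a hypothetical familiarCollection that contains
# mutual correlation data; argument values are illustrative.
similarity_plots <- plot_feature_similarity(
  object = collection,
  feature_cluster_method = "hclust",          # hierarchical clustering
  feature_linkage_method = "average",
  feature_cluster_cut_method = "silhouette",
  show_dendrogram = c("top", "right"),
  dendrogram_height = grid::unit(1.5, "cm"),
  gradient_palette_range = c(0, 1),
  draw = FALSE,
  dir_path = NULL                             # return plot objects instead of saving
)
```

Here `feature_cluster_cut_method` is set explicitly.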
+ +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + +\item{feature_similarity_threshold}{The threshold level for pair-wise +similarity that is required to form feature clusters with the \code{fixed_cut} +method. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + +\item{draw}{(\emph{optional}) Draws the plot if TRUE.} + +\item{dir_path}{(\emph{optional}) Path to the directory where created performance +plots are saved to. Output is saved in the \code{feature_similarity} +subdirectory. If \code{NULL} no figures are saved, but are returned instead.} + +\item{split_by}{(\emph{optional}) Splitting variables. This refers to column names +on which datasets are split. A separate figure is created for each split. +See details for available variables.} + +\item{facet_by}{(\emph{optional}) Variables used to determine how and if facets of +each figure appear. In case the \code{facet_wrap_cols} argument is \code{NULL}, the +first variable is used to define columns, and the remaing variables are +used to define rows of facets. The variables cannot overlap with those +provided to the \code{split_by} argument, but may overlap with other arguments. +See details for available variables.} + +\item{facet_wrap_cols}{(\emph{optional}) Number of columns to generate when facet +wrapping. If NULL, a facet grid is produced instead.} + +\item{ggtheme}{(\emph{optional}) \code{ggplot} theme to use for plotting.} + +\item{gradient_palette}{(\emph{optional}) Sequential or divergent palette used to +colour the similarity or distance between features in a heatmap.} + +\item{gradient_palette_range}{(\emph{optional}) Numerical range used to span the +gradient. This should be a range of two values, e.g. \code{c(0, 1)}. Lower or +upper boundary can be unset by using \code{NA}. If not set, the full +metric-specific range is used.} + +\item{x_label}{(\emph{optional}) Label to provide to the x-axis. If NULL, no label +is shown.} + +\item{x_label_shared}{(\emph{optional}) Sharing of x-axis labels between facets. +One of three values: +\itemize{ +\item \code{overall}: A single label is placed at the bottom of the figure. Tick +text (but not the ticks themselves) is removed for all but the bottom facet +plot(s). +\item \code{column}: A label is placed at the bottom of each column. Tick text (but +not the ticks themselves) is removed for all but the bottom facet plot(s). +\item \code{individual}: A label is placed below each facet plot. Tick text is kept. +}} + +\item{y_label}{(\emph{optional}) Label to provide to the y-axis. If NULL, no label +is shown.} + +\item{y_label_shared}{(\emph{optional}) Sharing of y-axis labels between facets. +One of three values: +\itemize{ +\item \code{overall}: A single label is placed to the left of the figure. Tick text +(but not the ticks themselves) is removed for all but the left-most facet +plot(s). +\item \code{row}: A label is placed to the left of each row. Tick text (but not the +ticks themselves) is removed for all but the left-most facet plot(s). +\item \code{individual}: A label is placed below each facet plot. Tick text is kept. +}} + +\item{legend_label}{(\emph{optional}) Label to provide to the legend. If NULL, the +legend will not have a name.} + +\item{plot_title}{(\emph{optional}) Label to provide as figure title. 
If NULL, no +title is shown.} + +\item{plot_sub_title}{(\emph{optional}) Label to provide as figure subtitle. If +NULL, no subtitle is shown.} + +\item{caption}{(\emph{optional}) Label to provide as figure caption. If NULL, no +caption is shown.} + +\item{y_range}{(\emph{optional}) Value range for the y-axis.} + +\item{y_n_breaks}{(\emph{optional}) Number of breaks to show on the y-axis of the +plot. \code{y_n_breaks} is used to determine the \code{y_breaks} argument in case it +is unset.} + +\item{y_breaks}{(\emph{optional}) Break points on the y-axis of the plot.} + +\item{rotate_x_tick_labels}{(\emph{optional}) Rotate tick labels on the x-axis by +90 degrees. Defaults to \code{TRUE}. Rotation of x-axis tick labels may also be +controlled through the \code{ggtheme}. In this case, \code{FALSE} should be provided +explicitly.} + +\item{show_dendrogram}{(\emph{optional}) Show dendrogram around the main panel. Can +be \code{TRUE}, \code{FALSE}, \code{NULL}, or a position, i.e. \code{top}, \code{bottom}, \code{left} and +\code{right}. Up to two positions may be provided, but only as long as the +dendrograms are not on opposite sides of the heatmap: \code{top} and \code{bottom}, +and \code{left} and \code{right} cannot be used together. + +A dendrogram can only be drawn from cluster methods that produce +dendrograms, such as \code{hclust}. A dendrogram can for example not be +constructed using the partitioning around medioids method (\code{pam}). + +By default, a dendrogram is drawn to the top and right of the panel.} + +\item{dendrogram_height}{(\emph{optional}) Height of the dendrogram. The height is +1.5 cm by default. Height is expected to be grid unit (see \code{grid::unit}), +which also allows for specifying relative heights.} + +\item{width}{(\emph{optional}) Width of the plot. A default value is derived from +the number of facets.} + +\item{height}{(\emph{optional}) Height of the plot. A default value is derived from +the number of features and the number of facets.} + +\item{units}{(\emph{optional}) Plot size unit. Either \code{cm} (default), \code{mm} or \verb{in}.} + +\item{export_collection}{(\emph{optional}) Exports the collection if TRUE.} + +\item{...}{ + Arguments passed on to \code{\link[=as_familiar_collection]{as_familiar_collection}}, \code{\link[ggplot2:ggsave]{ggplot2::ggsave}}, \code{\link[=extract_feature_similarity]{extract_feature_similarity}} + \describe{ + \item{\code{familiar_data_names}}{Names of the dataset(s). Only used if the \code{object} parameter +is one or more \code{familiarData} objects.} + \item{\code{collection_name}}{Name of the collection.} + \item{\code{filename}}{File name to create on disk.} + \item{\code{plot}}{Plot to save, defaults to last plot displayed.} + \item{\code{device}}{Device to use. Can either be a device function +(e.g. \link{png}), or one of "eps", "ps", "tex" (pictex), +"pdf", "jpeg", "tiff", "png", "bmp", "svg" or "wmf" (windows only). If +\code{NULL} (default), the device is guessed based on the \code{filename} extension.} + \item{\code{path}}{Path of the directory to save plot to: \code{path} and \code{filename} +are combined to create the fully qualified file name. Defaults to the +working directory.} + \item{\code{scale}}{Multiplicative scaling factor.} + \item{\code{dpi}}{Plot resolution. Also accepts a string input: "retina" (320), +"print" (300), or "screen" (72). 
Applies only to raster output types.} + \item{\code{limitsize}}{When \code{TRUE} (the default), \code{ggsave()} will not +save images larger than 50x50 inches, to prevent the common error of +specifying dimensions in pixels.} + \item{\code{bg}}{Background colour. If \code{NULL}, uses the \code{plot.background} fill value +from the plot theme.} + \item{\code{create.dir}}{Whether to create new directories if a non-existing +directory is specified in the \code{filename} or \code{path} (\code{TRUE}) or return an +error (\code{FALSE}, default). If \code{FALSE} and run in an interactive session, +a prompt will appear asking to create a new directory when necessary.} + \item{\code{data}}{A \code{dataObject} object, \code{data.table} or \code{data.frame} that +constitutes the data that are assessed.} + \item{\code{is_pre_processed}}{Flag that indicates whether the data was already +pre-processed externally, e.g. normalised and clustered. Only used if the +\code{data} argument is a \code{data.table} or \code{data.frame}.} + \item{\code{cl}}{Cluster created using the \code{parallel} package. This cluster is then +used to speed up computation through parallellisation.} + \item{\code{feature_similarity_metric}}{Metric to determine pairwise similarity +between features. Similarity is computed in the same manner as for +clustering, and \code{feature_similarity_metric} therefore has the same options +as \code{cluster_similarity_metric}: \code{mcfadden_r2}, \code{cox_snell_r2}, +\code{nagelkerke_r2}, \code{spearman}, \code{kendall} and \code{pearson}. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + \item{\code{verbose}}{Flag to indicate whether feedback should be provided on the +computation and extraction of various data elements.} + \item{\code{message_indent}}{Number of indentation steps for messages shown during +computation and extraction of various data elements.} + \item{\code{estimation_type}}{(\emph{optional}) Sets the type of estimation that should be +possible. This has the following options: +\itemize{ +\item \code{point}: Point estimates. +\item \code{bias_correction} or \code{bc}: Bias-corrected estimates. A bias-corrected +estimate is computed from (at least) 20 point estimates, and \code{familiar} may +bootstrap the data to create them. +\item \code{bootstrap_confidence_interval} or \code{bci} (default): Bias-corrected +estimates with bootstrap confidence intervals (Efron and Hastie, 2016). The +number of point estimates required depends on the \code{confidence_level} +parameter, and \code{familiar} may bootstrap the data to create them. +} + +As with \code{detail_level}, a non-default \code{estimation_type} parameter can be +specified for separate evaluation steps by providing a parameter value in a +named list with data elements, e.g. \code{list("auc_data"="bci", "model_performance"="point")}. This parameter can be set for the following +data elements: \code{auc_data}, \code{decision_curve_analyis}, \code{model_performance}, +\code{permutation_vimp}, \code{ice_data}, and \code{prediction_data}.} + \item{\code{aggregate_results}}{(\emph{optional}) Flag that signifies whether results +should be aggregated during evaluation. If \code{estimation_type} is +\code{bias_correction} or \code{bc}, aggregation leads to a single bias-corrected +estimate. 
If \code{estimation_type} is \code{bootstrap_confidence_interval} or \code{bci}, +aggregation leads to a single bias-corrected estimate with lower and upper +boundaries of the confidence interval. This has no effect if +\code{estimation_type} is \code{point}. + +The default value is equal to \code{TRUE} except when assessing metrics to assess +model performance, as the default violin plot requires underlying data. + +As with \code{detail_level} and \code{estimation_type}, a non-default +\code{aggregate_results} parameter can be specified for separate evaluation steps +by providing a parameter value in a named list with data elements, e.g. +\code{list("auc_data"=TRUE, "model_performance"=FALSE)}. This parameter exists +for the same elements as \code{estimation_type}.} + \item{\code{confidence_level}}{(\emph{optional}) Numeric value for the level at which +confidence intervals are determined. In case bootstraps are used to +determine the confidence intervals, \code{familiar} uses the +rule of thumb \eqn{n = 20 / ci.level} to determine the number of required +bootstraps. + +The default value is \code{0.95}.} + \item{\code{bootstrap_ci_method}}{(\emph{optional}) Method used to determine bootstrap +confidence intervals (Efron and Hastie, 2016). The following methods are +implemented: +\itemize{ +\item \code{percentile} (default): Confidence intervals obtained using the percentile +method. +\item \code{bc}: Bias-corrected confidence intervals. +} + +Note that the standard method is not implemented because this method is +often not suitable due to non-normal distributions. The bias-corrected and +accelerated (BCa) method is not implemented yet.} + }} +} +\value{ +\code{NULL} or list of plot objects, if \code{dir_path} is \code{NULL}. +} +\description{ +This method creates a heatmap based on data stored in a +\code{familiarCollection} object. Features in the heatmap are ordered so that +more similar features appear together. +} +\details{ +This function generates heatmaps of pairwise feature similarity. + +Available splitting variables are: \code{fs_method}, \code{learner}, and \code{data_set}. +By default, the data is split by \code{fs_method} and \code{learner}, with faceting +by \code{data_set}. + +Note that similarity is determined based on the underlying data. Hence the +ordering of features may differ between facets, and tick labels are +maintained for each panel. + +Available palettes for \code{gradient_palette} are those listed by +\code{grDevices::palette.pals()} (requires R >= 4.0.0), \code{grDevices::hcl.pals()} +(requires R >= 3.6.0) and \code{rainbow}, \code{heat.colors}, \code{terrain.colors}, +\code{topo.colors} and \code{cm.colors}, which correspond to the palettes of the same +name in \code{grDevices}. If not specified, a default palette based on palettes +in Tableau is used. You may also specify your own palette by using colour +names listed by \code{grDevices::colors()} or through hexadecimal RGB strings. + +Labelling methods such as \code{set_fs_method_names} or \code{set_data_set_names} can +be applied to the \code{familiarCollection} object to update labels, and order +the output in the figure.
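A minimal usage sketch for the function documented above; \code{collection} is a placeholder for an existing \code{familiarCollection} object from an earlier familiar analysis, and the argument values shown are illustrative:

# Clustered heatmap of pairwise feature similarity; with dir_path = NULL the
# plot objects are returned instead of being written to disk.
library(familiar)

similarity_plots <- plot_feature_similarity(
  object = collection,                 # placeholder familiarCollection
  feature_cluster_method = "hclust",
  feature_linkage_method = "average",
  show_dendrogram = c("top", "right"),
  rotate_x_tick_labels = TRUE,
  dir_path = NULL
)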
+} diff --git a/man/plot_ice-methods.Rd b/man/plot_ice-methods.Rd new file mode 100644 index 00000000..6d1d34c8 --- /dev/null +++ b/man/plot_ice-methods.Rd @@ -0,0 +1,502 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/PlotICE.R +\name{plot_ice} +\alias{plot_ice} +\alias{plot_ice,ANY-method} +\alias{plot_ice,familiarCollection-method} +\title{Plot individual conditional expectation plots.} +\usage{ +plot_ice( + object, + draw = FALSE, + dir_path = NULL, + split_by = NULL, + color_by = NULL, + facet_by = NULL, + facet_wrap_cols = NULL, + ggtheme = NULL, + discrete_palette = NULL, + gradient_palette = NULL, + gradient_palette_range = NULL, + x_label = waiver(), + y_label = waiver(), + legend_label = waiver(), + plot_title = NULL, + plot_sub_title = NULL, + caption = NULL, + x_range = NULL, + x_n_breaks = 5, + x_breaks = NULL, + y_range = NULL, + y_n_breaks = 5, + y_breaks = NULL, + novelty_range = NULL, + value_scales = waiver(), + novelty_scales = waiver(), + conf_int_style = c("ribbon", "step", "none"), + conf_int_alpha = 0.4, + ice_default_alpha = 0.6, + n_max_samples_shown = 50L, + show_ice = TRUE, + show_pd = TRUE, + show_novelty = TRUE, + anchor_values = NULL, + width = waiver(), + height = waiver(), + units = waiver(), + export_collection = FALSE, + ... +) + +\S4method{plot_ice}{ANY}( + object, + draw = FALSE, + dir_path = NULL, + split_by = NULL, + color_by = NULL, + facet_by = NULL, + facet_wrap_cols = NULL, + ggtheme = NULL, + discrete_palette = NULL, + gradient_palette = NULL, + gradient_palette_range = NULL, + x_label = waiver(), + y_label = waiver(), + legend_label = waiver(), + plot_title = NULL, + plot_sub_title = NULL, + caption = NULL, + x_range = NULL, + x_n_breaks = 5, + x_breaks = NULL, + y_range = NULL, + y_n_breaks = 5, + y_breaks = NULL, + novelty_range = NULL, + value_scales = waiver(), + novelty_scales = waiver(), + conf_int_style = c("ribbon", "step", "none"), + conf_int_alpha = 0.4, + ice_default_alpha = 0.6, + n_max_samples_shown = 50L, + show_ice = TRUE, + show_pd = TRUE, + show_novelty = TRUE, + anchor_values = NULL, + width = waiver(), + height = waiver(), + units = waiver(), + export_collection = FALSE, + ... +) + +\S4method{plot_ice}{familiarCollection}( + object, + draw = FALSE, + dir_path = NULL, + split_by = NULL, + color_by = NULL, + facet_by = NULL, + facet_wrap_cols = NULL, + ggtheme = NULL, + discrete_palette = NULL, + gradient_palette = NULL, + gradient_palette_range = NULL, + x_label = waiver(), + y_label = waiver(), + legend_label = waiver(), + plot_title = waiver(), + plot_sub_title = waiver(), + caption = NULL, + x_range = NULL, + x_n_breaks = 5, + x_breaks = NULL, + y_range = NULL, + y_n_breaks = 5, + y_breaks = NULL, + novelty_range = NULL, + value_scales = waiver(), + novelty_scales = waiver(), + conf_int_style = c("ribbon", "step", "none"), + conf_int_alpha = 0.4, + ice_default_alpha = 0.6, + n_max_samples_shown = 50L, + show_ice = TRUE, + show_pd = TRUE, + show_novelty = TRUE, + anchor_values = NULL, + width = waiver(), + height = waiver(), + units = waiver(), + export_collection = FALSE, + ... +) +} +\arguments{ +\item{object}{\code{familiarCollection} object, or one or more \code{familiarData} +objects, that will be internally converted to a \code{familiarCollection} object. +It is also possible to provide a \code{familiarEnsemble} or one or more +\code{familiarModel} objects together with the data from which data is computed +prior to export. 
Paths to such files can also be provided.} + +\item{draw}{(\emph{optional}) Draws the plot if TRUE.} + +\item{dir_path}{(\emph{optional}) Path to the directory where created individual +conditional expectation plots are saved to. Output is saved in the +\code{explanation} subdirectory. If \code{NULL} no figures are saved, +but are returned instead.} + +\item{split_by}{(\emph{optional}) Splitting variables. This refers to column names +on which datasets are split. A separate figure is created for each split. +See details for available variables.} + +\item{color_by}{(\emph{optional}) Variables used to determine fill colour of plot +objects. The variables cannot overlap with those provided to the \code{split_by} +argument, but may overlap with other arguments. See details for available +variables.} + +\item{facet_by}{(\emph{optional}) Variables used to determine how and if facets of +each figure appear. In case the \code{facet_wrap_cols} argument is \code{NULL}, the +first variable is used to define columns, and the remaining variables are +used to define rows of facets. The variables cannot overlap with those +provided to the \code{split_by} argument, but may overlap with other arguments. +See details for available variables.} + +\item{facet_wrap_cols}{(\emph{optional}) Number of columns to generate when facet +wrapping. If NULL, a facet grid is produced instead.} + +\item{ggtheme}{(\emph{optional}) \code{ggplot} theme to use for plotting.} + +\item{discrete_palette}{(\emph{optional}) Palette to use to colour the different +plot elements in case a value was provided to the \code{color_by} argument. For +2D individual conditional expectation plots without novelty, the initial +colour determines the colour of the points indicating sample values.} + +\item{gradient_palette}{(\emph{optional}) Sequential or divergent palette used to +colour the raster in 2D individual conditional expectation or partial +dependence plots. This argument is not used for 1D plots.} + +\item{gradient_palette_range}{(\emph{optional}) Numerical range used to span the +gradient for 2D plots. This should be a range of two values, e.g. \code{c(0, 1)}. +By default, values are determined from the data, dependent on the +\code{value_scales} parameter. This parameter is ignored for 1D plots.} + +\item{x_label}{(\emph{optional}) Label to provide to the x-axis. If NULL, no label +is shown.} + +\item{y_label}{(\emph{optional}) Label to provide to the y-axis. If NULL, no label +is shown.} + +\item{legend_label}{(\emph{optional}) Label to provide to the legend. If NULL, the +legend will not have a name.} + +\item{plot_title}{(\emph{optional}) Label to provide as figure title. If NULL, no +title is shown.} + +\item{plot_sub_title}{(\emph{optional}) Label to provide as figure subtitle. If +NULL, no subtitle is shown.} + +\item{caption}{(\emph{optional}) Label to provide as figure caption. If NULL, no +caption is shown.} + +\item{x_range}{(\emph{optional}) Value range for the x-axis.} + +\item{x_n_breaks}{(\emph{optional}) Number of breaks to show on the x-axis of the +plot. \code{x_n_breaks} is used to determine the \code{x_breaks} argument in case it +is unset.} + +\item{x_breaks}{(\emph{optional}) Break points on the x-axis of the plot.} + +\item{y_range}{(\emph{optional}) Value range for the y-axis.} + +\item{y_n_breaks}{(\emph{optional}) Number of breaks to show on the y-axis of the +plot.
\code{y_n_breaks} is used to determine the \code{y_breaks} argument in case it +is unset.} + +\item{y_breaks}{(\emph{optional}) Break points on the y-axis of the plot.} + +\item{novelty_range}{(\emph{optional}) Numerical range used to span the range of +novelty values. This determines the size of the bubbles in 2D, and +transparency of lines in 1D. This should be a range of two values, e.g. +\code{c(0, 1)}. By default, values are determined from the data, dependent on the +\code{value_scales} parameter. This parameter is ignored if \code{show_novelty=FALSE}.} + +\item{value_scales}{(\emph{optional}) Sets scaling of predicted values. This +parameter has several options: +\itemize{ +\item \code{fixed} (default): The value axis for all features will have the same +range. +\item \code{feature}: The value axis for each feature will have the same range. This +option is unavailable for 2D plots. +\item \code{figure}: The value axis for all facets in a figure will have the same +range. +\item \code{facet}: Each facet has its own range. This option is unavailable for 2D +plots. +} + +For 1D plots, this option is ignored if the \code{y_range} is provided, whereas +for 2D it is ignored if the \code{gradient_palette_range} is provided.} + +\item{novelty_scales}{(\emph{optional}) Sets scaling of novelty values, similar to +the \code{value_scales} parameter, but with more limited options: +\itemize{ +\item \code{fixed} (default): The novelty will have the same range for all features. +\item \code{figure}: The novelty will have the same range for all facets in a figure. +}} + +\item{conf_int_style}{(\emph{optional}) Confidence interval style. See details for +allowed styles.} + +\item{conf_int_alpha}{(\emph{optional}) Alpha value to determine transparency of +confidence intervals or, alternatively, other plot elements with which the +confidence interval overlaps. Only values between 0.0 (fully transparent) +and 1.0 (fully opaque) are allowed.} + +\item{ice_default_alpha}{(\emph{optional}) Default transparency (value) of sample +lines in a 1D plot. When novelty is shown, this is the transparency +corresponding to the least novel points. The confidence interval alpha +value is scaled by this value.} + +\item{n_max_samples_shown}{(\emph{optional}) Maximum number of samples shown in an +individual conditional expectation plot. Defaults to 50. These samples are +randomly picked from the samples present in the ICE data, but the same +samples are consistently picked. Partial dependence is nonetheless computed +from all available samples.} + +\item{show_ice}{(\emph{optional}) Sets whether individual conditional expectation +plots should be created.} + +\item{show_pd}{(\emph{optional}) Sets whether partial dependence plots should be +created. Note that if an anchor is set for a particular feature, its partial +dependence cannot be shown.} + +\item{show_novelty}{(\emph{optional}) Sets whether novelty is shown in plots.} + +\item{anchor_values}{(\emph{optional}) A single value or a named list or array of +values that are used to centre the individual conditional expectation plot. +A single value is valid if and only if a single feature is assessed; +otherwise, values should be provided as a named list or array. Anchoring has +no effect if the plot is not shown, i.e. \code{show_ice=FALSE}. A partial +dependence plot cannot be shown for anchored features.} + +\item{width}{(\emph{optional}) Width of the plot. A default value is derived from +the number of facets.} + +\item{height}{(\emph{optional}) Height of the plot.
A default value is derived from +the number of features and the number of facets.} + +\item{units}{(\emph{optional}) Plot size unit. Either \code{cm} (default), \code{mm} or \verb{in}.} + +\item{export_collection}{(\emph{optional}) Exports the collection if TRUE.} + +\item{...}{ + Arguments passed on to \code{\link[=export_ice_data]{export_ice_data}}, \code{\link[ggplot2:ggsave]{ggplot2::ggsave}}, \code{\link[=extract_ice]{extract_ice}} + \describe{ + \item{\code{aggregate_results}}{Flag that signifies whether results should be +aggregated for export.} + \item{\code{filename}}{File name to create on disk.} + \item{\code{plot}}{Plot to save, defaults to last plot displayed.} + \item{\code{device}}{Device to use. Can either be a device function +(e.g. \link{png}), or one of "eps", "ps", "tex" (pictex), +"pdf", "jpeg", "tiff", "png", "bmp", "svg" or "wmf" (windows only). If +\code{NULL} (default), the device is guessed based on the \code{filename} extension.} + \item{\code{path}}{Path of the directory to save plot to: \code{path} and \code{filename} +are combined to create the fully qualified file name. Defaults to the +working directory.} + \item{\code{scale}}{Multiplicative scaling factor.} + \item{\code{dpi}}{Plot resolution. Also accepts a string input: "retina" (320), +"print" (300), or "screen" (72). Applies only to raster output types.} + \item{\code{limitsize}}{When \code{TRUE} (the default), \code{ggsave()} will not +save images larger than 50x50 inches, to prevent the common error of +specifying dimensions in pixels.} + \item{\code{bg}}{Background colour. If \code{NULL}, uses the \code{plot.background} fill value +from the plot theme.} + \item{\code{create.dir}}{Whether to create new directories if a non-existing +directory is specified in the \code{filename} or \code{path} (\code{TRUE}) or return an +error (\code{FALSE}, default). If \code{FALSE} and run in an interactive session, +a prompt will appear asking to create a new directory when necessary.} + \item{\code{features}}{Names of the feature or features (2) assessed simultaneously. +By default \code{NULL}, which means that all features are assessed one-by-one.} + \item{\code{feature_x_range}}{When one or two features are defined using \code{features}, +\code{feature_x_range} can be used to set the range of values for the first +feature. For numeric features, a vector of two values is assumed to indicate +a range from which \code{n_sample_points} are uniformly sampled. A vector of more +than two values is interpreted as is, i.e. these represent the values to be +sampled. For categorical features, values should represent a (sub)set of +available levels.} + \item{\code{feature_y_range}}{As \code{feature_x_range}, but for the second feature in +case two features are defined.} + \item{\code{n_sample_points}}{Number of points used to sample continuous features.} + \item{\code{data}}{A \code{dataObject} object, \code{data.table} or \code{data.frame} that +constitutes the data that are assessed.} + \item{\code{is_pre_processed}}{Flag that indicates whether the data was already +pre-processed externally, e.g. normalised and clustered. Only used if the +\code{data} argument is a \code{data.table} or \code{data.frame}.} + \item{\code{cl}}{Cluster created using the \code{parallel} package. This cluster is then +used to speed up computation through parallellisation.} + \item{\code{evaluation_times}}{One or more time points that are used for in analysis of +survival problems when data has to be assessed at a set time, e.g. +calibration. 
If not provided explicitly, this parameter is read from +settings used at creation of the underlying \code{familiarModel} objects. Only +used for \code{survival} outcomes.} + \item{\code{ensemble_method}}{Method for ensembling predictions from models for the +same sample. Available methods are: +\itemize{ +\item \code{median} (default): Use the median of the predicted values as the ensemble +value for a sample. +\item \code{mean}: Use the mean of the predicted values as the ensemble value for a +sample. +}} + \item{\code{verbose}}{Flag to indicate whether feedback should be provided on the +computation and extraction of various data elements.} + \item{\code{message_indent}}{Number of indentation steps for messages shown during +computation and extraction of various data elements.} + \item{\code{sample_limit}}{(\emph{optional}) Set the upper limit of the number of samples +that are used during evaluation steps. Cannot be less than 20. + +This setting can be specified per data element by providing a parameter +value in a named list with data elements, e.g. +\code{list("sample_similarity"=100, "permutation_vimp"=1000)}. + +This parameter can be set for the following data elements: +\code{sample_similarity} and \code{ice_data}.} + \item{\code{detail_level}}{(\emph{optional}) Sets the level at which results are computed +and aggregated. +\itemize{ +\item \code{ensemble}: Results are computed at the ensemble level, i.e. over all +models in the ensemble. This means that, for example, bias-corrected +estimates of model performance are assessed by creating (at least) 20 +bootstraps and computing the model performance of the ensemble model for +each bootstrap. +\item \code{hybrid} (default): Results are computed at the level of models in an +ensemble. This means that, for example, bias-corrected estimates of model +performance are directly computed using the models in the ensemble. If there +are at least 20 trained models in the ensemble, performance is computed for +each model, in contrast to \code{ensemble} where performance is computed for the +ensemble of models. If there are less than 20 trained models in the +ensemble, bootstraps are created so that at least 20 point estimates can be +made. +\item \code{model}: Results are computed at the model level. This means that, for +example, bias-corrected estimates of model performance are assessed by +creating (at least) 20 bootstraps and computing the performance of the model +for each bootstrap. +} + +Note that each level of detail has a different interpretation for bootstrap +confidence intervals. For \code{ensemble} and \code{model} these are the confidence +intervals for the ensemble and an individual model, respectively. That is, +the confidence interval describes the range where an estimate produced by a +respective ensemble or model trained on a repeat of the experiment may be +found with the probability of the confidence level. For \code{hybrid}, it +represents the range where any single model trained on a repeat of the +experiment may be found with the probability of the confidence level. By +definition, confidence intervals obtained using \code{hybrid} are at least as +wide as those for \code{ensemble}. \code{hybrid} offers the correct interpretation if +the goal of the analysis is to assess the result of a single, unspecified, +model. + +\code{hybrid} is generally computationally less expensive then \code{ensemble}, which +in turn is somewhat less expensive than \code{model}. 
+ +A non-default \code{detail_level} parameter can be specified for separate +evaluation steps by providing a parameter value in a named list with data +elements, e.g. \code{list("auc_data"="ensemble", "model_performance"="hybrid")}. +This parameter can be set for the following data elements: \code{auc_data}, +\code{decision_curve_analyis}, \code{model_performance}, \code{permutation_vimp}, +\code{ice_data}, \code{prediction_data} and \code{confusion_matrix}.} + \item{\code{estimation_type}}{(\emph{optional}) Sets the type of estimation that should be +possible. This has the following options: +\itemize{ +\item \code{point}: Point estimates. +\item \code{bias_correction} or \code{bc}: Bias-corrected estimates. A bias-corrected +estimate is computed from (at least) 20 point estimates, and \code{familiar} may +bootstrap the data to create them. +\item \code{bootstrap_confidence_interval} or \code{bci} (default): Bias-corrected +estimates with bootstrap confidence intervals (Efron and Hastie, 2016). The +number of point estimates required depends on the \code{confidence_level} +parameter, and \code{familiar} may bootstrap the data to create them. +} + +As with \code{detail_level}, a non-default \code{estimation_type} parameter can be +specified for separate evaluation steps by providing a parameter value in a +named list with data elements, e.g. \code{list("auc_data"="bci", "model_performance"="point")}. This parameter can be set for the following +data elements: \code{auc_data}, \code{decision_curve_analyis}, \code{model_performance}, +\code{permutation_vimp}, \code{ice_data}, and \code{prediction_data}.} + \item{\code{confidence_level}}{(\emph{optional}) Numeric value for the level at which +confidence intervals are determined. In the case bootstraps are used to +determine the confidence intervals bootstrap estimation, \code{familiar} uses the +rule of thumb \eqn{n = 20 / ci.level} to determine the number of required +bootstraps. + +The default value is \code{0.95}.} + \item{\code{bootstrap_ci_method}}{(\emph{optional}) Method used to determine bootstrap +confidence intervals (Efron and Hastie, 2016). The following methods are +implemented: +\itemize{ +\item \code{percentile} (default): Confidence intervals obtained using the percentile +method. +\item \code{bc}: Bias-corrected confidence intervals. +} + +Note that the standard method is not implemented because this method is +often not suitable due to non-normal distributions. The bias-corrected and +accelerated (BCa) method is not implemented yet.} + }} +} +\value{ +\code{NULL} or list of plot objects, if \code{dir_path} is \code{NULL}. +} +\description{ +This method creates individual conditional expectation plots +based on data in a familiarCollection object. +} +\details{ +This function generates individual conditional expectation plots. +These plots come in two varieties, namely 1D and 2D. 1D plots show the +predicted value as function of a single feature, whereas 2D plots show the +predicted value as a function of two features. + +Available splitting variables are: \code{feature_x}, \code{feature_y} (2D only), +\code{fs_method}, \code{learner}, \code{data_set} and \code{positive_class} (categorical +outcomes) or \code{evaluation_time} (survival outcomes). By default, for 1D ICE +plots the data are split by \code{feature_x}, \code{fs_method} and \code{learner}, with +faceting by \code{data_set}, \code{positive_class} or \code{evaluation_time}. 
If only +partial dependence is shown, \code{positive_class} and \code{evaluation_time} are used +to set colours instead. For 2D plots, by default the data are split by +\code{feature_x}, \code{fs_method} and \code{learner}, with faceting by \code{data_set}, +\code{positive_class} or \code{evaluation_time}. The \code{color_by} argument cannot be +used with 2D plots, and attempting to do so causes an error. Attempting to +specify \code{feature_x} or \code{feature_y} for \code{color_by} will likewise result in an +error, as multiple features cannot be shown in the same facet. + +The splitting variables indicated by \code{color_by} are coloured according to +the \code{discrete_palette} parameter. This parameter is therefore only used for +1D plots. Available palettes for \code{discrete_palette} and \code{gradient_palette} +are those listed by \code{grDevices::palette.pals()} (requires R >= 4.0.0), +\code{grDevices::hcl.pals()} (requires R >= 3.6.0) and \code{rainbow}, \code{heat.colors}, +\code{terrain.colors}, \code{topo.colors} and \code{cm.colors}, which correspond to the +palettes of the same name in \code{grDevices}. If not specified, a default +palette based on palettes in Tableau is used. You may also specify your own +palette by using colour names listed by \code{grDevices::colors()} or through +hexadecimal RGB strings. + +Bootstrap confidence intervals of the partial dependence plots can be shown +using various styles set by \code{conf_int_style}: +\itemize{ +\item \code{ribbon} (default): confidence intervals are shown as a ribbon with an +opacity of \code{conf_int_alpha} around the point estimate of the partial +dependence. +\item \code{step}: confidence intervals are shown as a step function around +the point estimate of the partial dependence. +\item \code{none}: confidence intervals are not shown. The point estimate of the +partial dependence is shown as usual. +} + +Note that when bootstrap confidence intervals were computed, they were also +computed for individual samples in individual conditional expectation plots. +To avoid clutter, only point estimates for individual samples are shown. + +Labelling methods such as \code{set_fs_method_names} or \code{set_data_set_names} can +be applied to the \code{familiarCollection} object to update labels, and order +the output in the figure.
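A minimal usage sketch for the ICE method documented above; \code{collection} is again a placeholder for an existing \code{familiarCollection} object containing individual conditional expectation data, and the argument values are illustrative:

# 1D ICE and partial dependence curves with ribbon-style bootstrap confidence
# intervals around the partial dependence; dir_path = NULL returns the plots.
library(familiar)

ice_plots <- plot_ice(
  object = collection,          # placeholder familiarCollection
  conf_int_style = "ribbon",
  conf_int_alpha = 0.4,
  n_max_samples_shown = 20L,
  show_ice = TRUE,
  show_pd = TRUE,
  show_novelty = FALSE,
  dir_path = NULL
)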
+} diff --git a/man/plot_kaplan_meier-methods.Rd b/man/plot_kaplan_meier-methods.Rd new file mode 100644 index 00000000..564f5e47 --- /dev/null +++ b/man/plot_kaplan_meier-methods.Rd @@ -0,0 +1,404 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/PlotKaplanMeier.R +\name{plot_kaplan_meier} +\alias{plot_kaplan_meier} +\alias{plot_kaplan_meier,ANY-method} +\alias{plot_kaplan_meier,familiarCollection-method} +\title{Plot Kaplan-Meier survival curves.} +\usage{ +plot_kaplan_meier( + object, + draw = FALSE, + dir_path = NULL, + split_by = NULL, + color_by = NULL, + linetype_by = NULL, + facet_by = NULL, + facet_wrap_cols = NULL, + combine_legend = TRUE, + ggtheme = NULL, + discrete_palette = NULL, + x_label = "time", + x_label_shared = "column", + y_label = "survival probability", + y_label_shared = "row", + legend_label = waiver(), + plot_title = waiver(), + plot_sub_title = waiver(), + caption = NULL, + x_range = NULL, + x_n_breaks = 5, + x_breaks = NULL, + y_range = c(0, 1), + y_n_breaks = 5, + y_breaks = NULL, + confidence_level = NULL, + conf_int_style = c("ribbon", "step", "none"), + conf_int_alpha = 0.4, + censoring = TRUE, + censor_shape = "plus", + show_logrank = TRUE, + show_survival_table = TRUE, + width = waiver(), + height = waiver(), + units = waiver(), + export_collection = FALSE, + ... +) + +\S4method{plot_kaplan_meier}{ANY}( + object, + draw = FALSE, + dir_path = NULL, + split_by = NULL, + color_by = NULL, + linetype_by = NULL, + facet_by = NULL, + facet_wrap_cols = NULL, + combine_legend = TRUE, + ggtheme = NULL, + discrete_palette = NULL, + x_label = "time", + x_label_shared = "column", + y_label = "survival probability", + y_label_shared = "row", + legend_label = waiver(), + plot_title = waiver(), + plot_sub_title = waiver(), + caption = NULL, + x_range = NULL, + x_n_breaks = 5, + x_breaks = NULL, + y_range = c(0, 1), + y_n_breaks = 5, + y_breaks = NULL, + confidence_level = NULL, + conf_int_style = c("ribbon", "step", "none"), + conf_int_alpha = 0.4, + censoring = TRUE, + censor_shape = "plus", + show_logrank = TRUE, + show_survival_table = TRUE, + width = waiver(), + height = waiver(), + units = waiver(), + export_collection = FALSE, + ... +) + +\S4method{plot_kaplan_meier}{familiarCollection}( + object, + draw = FALSE, + dir_path = NULL, + split_by = NULL, + color_by = NULL, + linetype_by = NULL, + facet_by = NULL, + facet_wrap_cols = NULL, + combine_legend = TRUE, + ggtheme = NULL, + discrete_palette = NULL, + x_label = "time", + x_label_shared = "column", + y_label = "survival probability", + y_label_shared = "row", + legend_label = waiver(), + plot_title = waiver(), + plot_sub_title = waiver(), + caption = NULL, + x_range = NULL, + x_n_breaks = 5, + x_breaks = NULL, + y_range = c(0, 1), + y_n_breaks = 5, + y_breaks = NULL, + confidence_level = NULL, + conf_int_style = c("ribbon", "step", "none"), + conf_int_alpha = 0.4, + censoring = TRUE, + censor_shape = "plus", + show_logrank = TRUE, + show_survival_table = TRUE, + width = waiver(), + height = waiver(), + units = waiver(), + export_collection = FALSE, + ... +) +} +\arguments{ +\item{object}{\code{familiarCollection} object, or one or more \code{familiarData} +objects, that will be internally converted to a \code{familiarCollection} object. +It is also possible to provide a \code{familiarEnsemble} or one or more +\code{familiarModel} objects together with the data from which data is computed +prior to export. 
Paths to such files can also be provided.} + +\item{draw}{(\emph{optional}) Draws the plot if TRUE.} + +\item{dir_path}{(\emph{optional}) Path to the directory where created figures are +saved to. Output is saved in the \code{stratification} subdirectory. If \code{NULL} no +figures are saved, but are returned instead.} + +\item{split_by}{(\emph{optional}) Splitting variables. This refers to column names +on which datasets are split. A separate figure is created for each split. +See details for available variables.} + +\item{color_by}{(\emph{optional}) Variables used to determine fill colour of plot +objects. The variables cannot overlap with those provided to the \code{split_by} +argument, but may overlap with other arguments. See details for available +variables.} + +\item{linetype_by}{(\emph{optional}) Variables that are used to determine the +linetype of lines in a plot. The variables cannot overlap with those +provided to the \code{split_by} argument, but may overlap with other arguments. +See details for available variables.} + +\item{facet_by}{(\emph{optional}) Variables used to determine how and if facets of +each figure appear. In case the \code{facet_wrap_cols} argument is \code{NULL}, the +first variable is used to define columns, and the remaining variables are +used to define rows of facets. The variables cannot overlap with those +provided to the \code{split_by} argument, but may overlap with other arguments. +See details for available variables.} + +\item{facet_wrap_cols}{(\emph{optional}) Number of columns to generate when facet +wrapping. If NULL, a facet grid is produced instead.} + +\item{combine_legend}{(\emph{optional}) Flag to indicate whether the same legend +is to be shared by multiple aesthetics, such as those specified by +\code{color_by} and \code{linetype_by} arguments.} + +\item{ggtheme}{(\emph{optional}) \code{ggplot} theme to use for plotting.} + +\item{discrete_palette}{(\emph{optional}) Palette to use to color the different +risk strata in case a non-singular variable was provided to the \code{color_by} +argument.} + +\item{x_label}{(\emph{optional}) Label to provide to the x-axis. If NULL, no label +is shown.} + +\item{x_label_shared}{(\emph{optional}) Sharing of x-axis labels between facets. +One of three values: +\itemize{ +\item \code{overall}: A single label is placed at the bottom of the figure. Tick +text (but not the ticks themselves) is removed for all but the bottom facet +plot(s). +\item \code{column}: A label is placed at the bottom of each column. Tick text (but +not the ticks themselves) is removed for all but the bottom facet plot(s). +\item \code{individual}: A label is placed below each facet plot. Tick text is kept. +}} + +\item{y_label}{(\emph{optional}) Label to provide to the y-axis. If NULL, no label +is shown.} + +\item{y_label_shared}{(\emph{optional}) Sharing of y-axis labels between facets. +One of three values: +\itemize{ +\item \code{overall}: A single label is placed to the left of the figure. Tick text +(but not the ticks themselves) is removed for all but the left-most facet +plot(s). +\item \code{row}: A label is placed to the left of each row. Tick text (but not the +ticks themselves) is removed for all but the left-most facet plot(s). +\item \code{individual}: A label is placed below each facet plot. Tick text is kept. +}} + +\item{legend_label}{(\emph{optional}) Label to provide to the legend. If NULL, the +legend will not have a name.} + +\item{plot_title}{(\emph{optional}) Label to provide as figure title.
If NULL, no +title is shown.} + +\item{plot_sub_title}{(\emph{optional}) Label to provide as figure subtitle. If +NULL, no subtitle is shown.} + +\item{caption}{(\emph{optional}) Label to provide as figure caption. If NULL, no +caption is shown.} + +\item{x_range}{(\emph{optional}) Value range for the x-axis.} + +\item{x_n_breaks}{(\emph{optional}) Number of breaks to show on the x-axis of the +plot. \code{x_n_breaks} is used to determine the \code{x_breaks} argument in case it +is unset.} + +\item{x_breaks}{(\emph{optional}) Break points on the x-axis of the plot.} + +\item{y_range}{(\emph{optional}) Value range for the y-axis.} + +\item{y_n_breaks}{(\emph{optional}) Number of breaks to show on the y-axis of the +plot. \code{y_n_breaks} is used to determine the \code{y_breaks} argument in case it +is unset.} + +\item{y_breaks}{(\emph{optional}) Break points on the y-axis of the plot.} + +\item{confidence_level}{(\emph{optional}) Confidence level for the strata in the +plot.} + +\item{conf_int_style}{(\emph{optional}) Confidence interval style. See details for +allowed styles.} + +\item{conf_int_alpha}{(\emph{optional}) Alpha value to determine transparency of +confidence intervals or, alternatively, other plot elements with which the +confidence interval overlaps. Only values between 0.0 (fully transparent) +and 1.0 (fully opaque) are allowed.} + +\item{censoring}{(\emph{optional}) Flag to indicate whether censored samples should +be indicated on the survival curve.} + +\item{censor_shape}{(\emph{optional}) Shape used to indicate censored samples on +the survival curve. Available shapes are documented in the \code{ggplot2} +vignette \emph{Aesthetic specifications}. By default a plus shape is used.} + +\item{show_logrank}{(\emph{optional}) Specifies whether the results of a logrank +test to assess differences between the risk strata is annotated in the plot. +A log-rank test can only be shown when \code{color_by} and \code{linestyle_by} are +either unset, or only contain \code{risk_group}.} + +\item{show_survival_table}{(\emph{optional}) Specifies whether a survival table is +shown below the Kaplan-Meier survival curves. Survival in the risk strata is +assessed for each of the breaks in \code{x_breaks}.} + +\item{width}{(\emph{optional}) Width of the plot. A default value is derived from +the number of facets.} + +\item{height}{(\emph{optional}) Height of the plot. A default value is derived from +number of facets and the inclusion of survival tables.} + +\item{units}{(\emph{optional}) Plot size unit. Either \code{cm} (default), \code{mm} or \verb{in}.} + +\item{export_collection}{(\emph{optional}) Exports the collection if TRUE.} + +\item{...}{ + Arguments passed on to \code{\link[=as_familiar_collection]{as_familiar_collection}}, \code{\link[ggplot2:ggsave]{ggplot2::ggsave}}, \code{\link[=extract_risk_stratification_data]{extract_risk_stratification_data}} + \describe{ + \item{\code{familiar_data_names}}{Names of the dataset(s). Only used if the \code{object} parameter +is one or more \code{familiarData} objects.} + \item{\code{collection_name}}{Name of the collection.} + \item{\code{filename}}{File name to create on disk.} + \item{\code{plot}}{Plot to save, defaults to last plot displayed.} + \item{\code{device}}{Device to use. Can either be a device function +(e.g. \link{png}), or one of "eps", "ps", "tex" (pictex), +"pdf", "jpeg", "tiff", "png", "bmp", "svg" or "wmf" (windows only). 
If +\code{NULL} (default), the device is guessed based on the \code{filename} extension.} + \item{\code{path}}{Path of the directory to save plot to: \code{path} and \code{filename} +are combined to create the fully qualified file name. Defaults to the +working directory.} + \item{\code{scale}}{Multiplicative scaling factor.} + \item{\code{dpi}}{Plot resolution. Also accepts a string input: "retina" (320), +"print" (300), or "screen" (72). Applies only to raster output types.} + \item{\code{limitsize}}{When \code{TRUE} (the default), \code{ggsave()} will not +save images larger than 50x50 inches, to prevent the common error of +specifying dimensions in pixels.} + \item{\code{bg}}{Background colour. If \code{NULL}, uses the \code{plot.background} fill value +from the plot theme.} + \item{\code{create.dir}}{Whether to create new directories if a non-existing +directory is specified in the \code{filename} or \code{path} (\code{TRUE}) or return an +error (\code{FALSE}, default). If \code{FALSE} and run in an interactive session, +a prompt will appear asking to create a new directory when necessary.} + \item{\code{data}}{A \code{dataObject} object, \code{data.table} or \code{data.frame} that +constitutes the data that are assessed.} + \item{\code{is_pre_processed}}{Flag that indicates whether the data was already +pre-processed externally, e.g. normalised and clustered. Only used if the +\code{data} argument is a \code{data.table} or \code{data.frame}.} + \item{\code{cl}}{Cluster created using the \code{parallel} package. This cluster is then +used to speed up computation through parallellisation.} + \item{\code{ensemble_method}}{Method for ensembling predictions from models for the +same sample. Available methods are: +\itemize{ +\item \code{median} (default): Use the median of the predicted values as the ensemble +value for a sample. +\item \code{mean}: Use the mean of the predicted values as the ensemble value for a +sample. +}} + \item{\code{verbose}}{Flag to indicate whether feedback should be provided on the +computation and extraction of various data elements.} + \item{\code{message_indent}}{Number of indentation steps for messages shown during +computation and extraction of various data elements.} + \item{\code{detail_level}}{(\emph{optional}) Sets the level at which results are computed +and aggregated. +\itemize{ +\item \code{ensemble}: Results are computed at the ensemble level, i.e. over all +models in the ensemble. This means that, for example, bias-corrected +estimates of model performance are assessed by creating (at least) 20 +bootstraps and computing the model performance of the ensemble model for +each bootstrap. +\item \code{hybrid} (default): Results are computed at the level of models in an +ensemble. This means that, for example, bias-corrected estimates of model +performance are directly computed using the models in the ensemble. If there +are at least 20 trained models in the ensemble, performance is computed for +each model, in contrast to \code{ensemble} where performance is computed for the +ensemble of models. If there are less than 20 trained models in the +ensemble, bootstraps are created so that at least 20 point estimates can be +made. +\item \code{model}: Results are computed at the model level. This means that, for +example, bias-corrected estimates of model performance are assessed by +creating (at least) 20 bootstraps and computing the performance of the model +for each bootstrap. 
+} + +Note that each level of detail has a different interpretation for bootstrap +confidence intervals. For \code{ensemble} and \code{model} these are the confidence +intervals for the ensemble and an individual model, respectively. That is, +the confidence interval describes the range where an estimate produced by a +respective ensemble or model trained on a repeat of the experiment may be +found with the probability of the confidence level. For \code{hybrid}, it +represents the range where any single model trained on a repeat of the +experiment may be found with the probability of the confidence level. By +definition, confidence intervals obtained using \code{hybrid} are at least as +wide as those for \code{ensemble}. \code{hybrid} offers the correct interpretation if +the goal of the analysis is to assess the result of a single, unspecified, +model. + +\code{hybrid} is generally computationally less expensive than \code{ensemble}, which +in turn is somewhat less expensive than \code{model}. + +A non-default \code{detail_level} parameter can be specified for separate +evaluation steps by providing a parameter value in a named list with data +elements, e.g. \code{list("auc_data"="ensemble", "model_performance"="hybrid")}. +This parameter can be set for the following data elements: \code{auc_data}, +\code{decision_curve_analyis}, \code{model_performance}, \code{permutation_vimp}, +\code{ice_data}, \code{prediction_data} and \code{confusion_matrix}.} + }} +} +\value{ +\code{NULL} or list of plot objects, if \code{dir_path} is \code{NULL}. +} +\description{ +This function creates Kaplan-Meier survival curves from +stratification data stored in a familiarCollection object. +} +\details{ +This function generates a Kaplan-Meier survival plot based on risk +group stratification by the learners. + +\code{familiar} does not determine what units the x-axis has or what kind of +survival the y-axis represents. It is therefore recommended to provide +\code{x_label} and \code{y_label} arguments. + +Available splitting variables are: \code{fs_method}, \code{learner}, \code{data_set}, +\code{risk_group} and \code{stratification_method}. By default, separate figures are +created for each combination of \code{fs_method} and \code{learner}, with faceting by +\code{data_set}, and colouring of the strata in each individual plot by \code{risk_group}. + +Available palettes for \code{discrete_palette} are those listed by +\code{grDevices::palette.pals()} (requires R >= 4.0.0), \code{grDevices::hcl.pals()} +(requires R >= 3.6.0) and \code{rainbow}, \code{heat.colors}, \code{terrain.colors}, +\code{topo.colors} and \code{cm.colors}, which correspond to the palettes of the same +name in \code{grDevices}. If not specified, a default palette based on palettes +in Tableau is used. You may also specify your own palette by using colour +names listed by \code{grDevices::colors()} or through hexadecimal RGB strings. + +Greenwood confidence intervals of the Kaplan-Meier curve can be shown using +various styles set by \code{conf_int_style}: +\itemize{ +\item \code{ribbon} (default): confidence intervals are shown as a ribbon with an +opacity of \code{conf_int_alpha} around the point estimate of the Kaplan-Meier +curve. +\item \code{step}: confidence intervals are shown as a step function around +the point estimate of the Kaplan-Meier curve. +\item \code{none}: confidence intervals are not shown. The point estimate of the +Kaplan-Meier curve is shown as usual.
+} + +Labelling methods such as \code{set_risk_group_names} or \code{set_data_set_names} can +be applied to the \code{familiarCollection} object to update labels, and order +the output in the figure. +} diff --git a/man/plot_model_performance-methods.Rd b/man/plot_model_performance-methods.Rd new file mode 100644 index 00000000..1803f281 --- /dev/null +++ b/man/plot_model_performance-methods.Rd @@ -0,0 +1,422 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/PlotModelPerformance.R +\name{plot_model_performance} +\alias{plot_model_performance} +\alias{plot_model_performance,ANY-method} +\alias{plot_model_performance,familiarCollection-method} +\title{Plot model performance.} +\usage{ +plot_model_performance( + object, + draw = FALSE, + dir_path = NULL, + split_by = NULL, + x_axis_by = NULL, + y_axis_by = NULL, + color_by = NULL, + facet_by = NULL, + facet_wrap_cols = NULL, + plot_type = NULL, + ggtheme = NULL, + discrete_palette = NULL, + gradient_palette = NULL, + gradient_palette_range = waiver(), + x_label = waiver(), + y_label = waiver(), + legend_label = waiver(), + plot_title = waiver(), + plot_sub_title = waiver(), + caption = NULL, + rotate_x_tick_labels = waiver(), + y_range = NULL, + y_n_breaks = 5, + y_breaks = NULL, + width = waiver(), + height = waiver(), + units = waiver(), + annotate_performance = NULL, + export_collection = FALSE, + ... +) + +\S4method{plot_model_performance}{ANY}( + object, + draw = FALSE, + dir_path = NULL, + split_by = NULL, + x_axis_by = NULL, + y_axis_by = NULL, + color_by = NULL, + facet_by = NULL, + facet_wrap_cols = NULL, + plot_type = NULL, + ggtheme = NULL, + discrete_palette = NULL, + gradient_palette = NULL, + gradient_palette_range = waiver(), + x_label = waiver(), + y_label = waiver(), + legend_label = waiver(), + plot_title = waiver(), + plot_sub_title = waiver(), + caption = NULL, + rotate_x_tick_labels = waiver(), + y_range = NULL, + y_n_breaks = 5, + y_breaks = NULL, + width = waiver(), + height = waiver(), + units = waiver(), + annotate_performance = NULL, + export_collection = FALSE, + ... +) + +\S4method{plot_model_performance}{familiarCollection}( + object, + draw = FALSE, + dir_path = NULL, + split_by = NULL, + x_axis_by = NULL, + y_axis_by = NULL, + color_by = NULL, + facet_by = NULL, + facet_wrap_cols = NULL, + plot_type = NULL, + ggtheme = NULL, + discrete_palette = NULL, + gradient_palette = NULL, + gradient_palette_range = waiver(), + x_label = waiver(), + y_label = waiver(), + legend_label = waiver(), + plot_title = waiver(), + plot_sub_title = waiver(), + caption = NULL, + rotate_x_tick_labels = waiver(), + y_range = NULL, + y_n_breaks = 5, + y_breaks = NULL, + width = waiver(), + height = waiver(), + units = waiver(), + annotate_performance = NULL, + export_collection = FALSE, + ... +) +} +\arguments{ +\item{object}{\code{familiarCollection} object, or one or more \code{familiarData} +objects, that will be internally converted to a \code{familiarCollection} object. +It is also possible to provide a \code{familiarEnsemble} or one or more +\code{familiarModel} objects together with the data from which data is computed +prior to export. Paths to such files can also be provided.} + +\item{draw}{(\emph{optional}) Draws the plot if TRUE.} + +\item{dir_path}{(\emph{optional}) Path to the directory where created performance +plots are saved to. Output is saved in the \code{performance} subdirectory. 
If +\code{NULL} no figures are saved, but are returned instead.} + +\item{split_by}{(\emph{optional}) Splitting variables. This refers to column names +on which datasets are split. A separate figure is created for each split. +See details for available variables.} + +\item{x_axis_by}{(\emph{optional}) Variable plotted along the x-axis of a plot. +The variable cannot overlap with variables provided to the \code{split_by} and +\code{y_axis_by} arguments (if used), but may overlap with other arguments. Only +one variable is allowed for this argument. See details for available +variables.} + +\item{y_axis_by}{(\emph{optional}) Variable plotted along the y-axis of a plot. +The variable cannot overlap with variables provided to the \code{split_by} and +\code{x_axis_by} arguments (if used), but may overlap with other arguments. Only +one variable is allowed for this argument. See details for available +variables.} + +\item{color_by}{(\emph{optional}) Variables used to determine fill colour of plot +objects. The variables cannot overlap with those provided to the \code{split_by} +argument, but may overlap with other arguments. See details for available +variables.} + +\item{facet_by}{(\emph{optional}) Variables used to determine how and if facets of +each figure appear. In case the \code{facet_wrap_cols} argument is \code{NULL}, the +first variable is used to define columns, and the remaing variables are +used to define rows of facets. The variables cannot overlap with those +provided to the \code{split_by} argument, but may overlap with other arguments. +See details for available variables.} + +\item{facet_wrap_cols}{(\emph{optional}) Number of columns to generate when facet +wrapping. If NULL, a facet grid is produced instead.} + +\item{plot_type}{(\emph{optional}) Type of plot to draw. This is one of \code{heatmap} +(draws a heatmap), \code{barplot} (draws a barplot with confidence intervals), +\code{boxplot} (draws a boxplot) and \code{violinplot} (draws a violin plot). Defaults +to \code{violinplot}. + +The choice for \code{plot_type} affects several other arguments, e.g. \code{color_by} +is not used for \code{heatmap} and \code{y_axis_by} is only used by \code{heatmap}.} + +\item{ggtheme}{(\emph{optional}) \code{ggplot} theme to use for plotting.} + +\item{discrete_palette}{(\emph{optional}) Palette to use to color the different +plot elements in case a value was provided to the \code{color_by} argument. Only +used when \code{plot_type} is not \code{heatmap}.} + +\item{gradient_palette}{(\emph{optional}) Sequential or divergent palette used to +color the raster in \code{heatmap} plots. This argument is not used for other +\code{plot_type} value.} + +\item{gradient_palette_range}{(\emph{optional}) Numerical range used to span the +gradient. This should be a range of two values, e.g. \code{c(0, 1)}. Lower or +upper boundary can be unset by using \code{NA}. If not set, the full +metric-specific range is used.} + +\item{x_label}{(\emph{optional}) Label to provide to the x-axis. If NULL, no label +is shown.} + +\item{y_label}{(\emph{optional}) Label to provide to the y-axis. If NULL, no label +is shown.} + +\item{legend_label}{(\emph{optional}) Label to provide to the legend. If NULL, the +legend will not have a name.} + +\item{plot_title}{(\emph{optional}) Label to provide as figure title. If NULL, no +title is shown.} + +\item{plot_sub_title}{(\emph{optional}) Label to provide as figure subtitle. 
If +NULL, no subtitle is shown.} + +\item{caption}{(\emph{optional}) Label to provide as figure caption. If NULL, no +caption is shown.} + +\item{rotate_x_tick_labels}{(\emph{optional}) Rotate tick labels on the x-axis by +90 degrees. Defaults to \code{TRUE}. Rotation of x-axis tick labels may also be +controlled through the \code{ggtheme}. In this case, \code{FALSE} should be provided +explicitly.} + +\item{y_range}{(\emph{optional}) Value range for the y-axis.} + +\item{y_n_breaks}{(\emph{optional}) Number of breaks to show on the y-axis of the +plot. \code{y_n_breaks} is used to determine the \code{y_breaks} argument in case it +is unset.} + +\item{y_breaks}{(\emph{optional}) Break points on the y-axis of the plot.} + +\item{width}{(\emph{optional}) Width of the plot. A default value is derived from +the number of facets.} + +\item{height}{(\emph{optional}) Height of the plot. A default value is derived from +the number of features and the number of facets.} + +\item{units}{(\emph{optional}) Plot size unit. Either \code{cm} (default), \code{mm} or \verb{in}.} + +\item{annotate_performance}{(\emph{optional}) Indicates whether performance in +heatmaps should be annotated with text. Can be \code{none}, \code{value} (default), or +\code{value_ci} (median value plus 95\% credibility intervals).} + +\item{export_collection}{(\emph{optional}) Exports the collection if TRUE.} + +\item{...}{ + Arguments passed on to \code{\link[=extract_performance]{extract_performance}}, \code{\link[=as_familiar_collection]{as_familiar_collection}}, \code{\link[ggplot2:ggsave]{ggplot2::ggsave}} + \describe{ + \item{\code{data}}{A \code{dataObject} object, \code{data.table} or \code{data.frame} that +constitutes the data that are assessed.} + \item{\code{is_pre_processed}}{Flag that indicates whether the data was already +pre-processed externally, e.g. normalised and clustered. Only used if the +\code{data} argument is a \code{data.table} or \code{data.frame}.} + \item{\code{cl}}{Cluster created using the \code{parallel} package. This cluster is then +used to speed up computation through parallellisation.} + \item{\code{evaluation_times}}{One or more time points that are used for in analysis of +survival problems when data has to be assessed at a set time, e.g. +calibration. If not provided explicitly, this parameter is read from +settings used at creation of the underlying \code{familiarModel} objects. Only +used for \code{survival} outcomes.} + \item{\code{ensemble_method}}{Method for ensembling predictions from models for the +same sample. Available methods are: +\itemize{ +\item \code{median} (default): Use the median of the predicted values as the ensemble +value for a sample. +\item \code{mean}: Use the mean of the predicted values as the ensemble value for a +sample. +}} + \item{\code{metric}}{One or more metrics for assessing model performance. See the +vignette on performance metrics for the available metrics. If not provided +explicitly, this parameter is read from settings used at creation of the +underlying \code{familiarModel} objects.} + \item{\code{verbose}}{Flag to indicate whether feedback should be provided on the +computation and extraction of various data elements.} + \item{\code{message_indent}}{Number of indentation steps for messages shown during +computation and extraction of various data elements.} + \item{\code{detail_level}}{(\emph{optional}) Sets the level at which results are computed +and aggregated. +\itemize{ +\item \code{ensemble}: Results are computed at the ensemble level, i.e. 
over all +models in the ensemble. This means that, for example, bias-corrected +estimates of model performance are assessed by creating (at least) 20 +bootstraps and computing the model performance of the ensemble model for +each bootstrap. +\item \code{hybrid} (default): Results are computed at the level of models in an +ensemble. This means that, for example, bias-corrected estimates of model +performance are directly computed using the models in the ensemble. If there +are at least 20 trained models in the ensemble, performance is computed for +each model, in contrast to \code{ensemble} where performance is computed for the +ensemble of models. If there are less than 20 trained models in the +ensemble, bootstraps are created so that at least 20 point estimates can be +made. +\item \code{model}: Results are computed at the model level. This means that, for +example, bias-corrected estimates of model performance are assessed by +creating (at least) 20 bootstraps and computing the performance of the model +for each bootstrap. +} + +Note that each level of detail has a different interpretation for bootstrap +confidence intervals. For \code{ensemble} and \code{model} these are the confidence +intervals for the ensemble and an individual model, respectively. That is, +the confidence interval describes the range where an estimate produced by a +respective ensemble or model trained on a repeat of the experiment may be +found with the probability of the confidence level. For \code{hybrid}, it +represents the range where any single model trained on a repeat of the +experiment may be found with the probability of the confidence level. By +definition, confidence intervals obtained using \code{hybrid} are at least as +wide as those for \code{ensemble}. \code{hybrid} offers the correct interpretation if +the goal of the analysis is to assess the result of a single, unspecified, +model. + +\code{hybrid} is generally computationally less expensive then \code{ensemble}, which +in turn is somewhat less expensive than \code{model}. + +A non-default \code{detail_level} parameter can be specified for separate +evaluation steps by providing a parameter value in a named list with data +elements, e.g. \code{list("auc_data"="ensemble", "model_performance"="hybrid")}. +This parameter can be set for the following data elements: \code{auc_data}, +\code{decision_curve_analyis}, \code{model_performance}, \code{permutation_vimp}, +\code{ice_data}, \code{prediction_data} and \code{confusion_matrix}.} + \item{\code{estimation_type}}{(\emph{optional}) Sets the type of estimation that should be +possible. This has the following options: +\itemize{ +\item \code{point}: Point estimates. +\item \code{bias_correction} or \code{bc}: Bias-corrected estimates. A bias-corrected +estimate is computed from (at least) 20 point estimates, and \code{familiar} may +bootstrap the data to create them. +\item \code{bootstrap_confidence_interval} or \code{bci} (default): Bias-corrected +estimates with bootstrap confidence intervals (Efron and Hastie, 2016). The +number of point estimates required depends on the \code{confidence_level} +parameter, and \code{familiar} may bootstrap the data to create them. +} + +As with \code{detail_level}, a non-default \code{estimation_type} parameter can be +specified for separate evaluation steps by providing a parameter value in a +named list with data elements, e.g. \code{list("auc_data"="bci", "model_performance"="point")}. 
This parameter can be set for the following +data elements: \code{auc_data}, \code{decision_curve_analyis}, \code{model_performance}, +\code{permutation_vimp}, \code{ice_data}, and \code{prediction_data}.} + \item{\code{aggregate_results}}{(\emph{optional}) Flag that signifies whether results +should be aggregated during evaluation. If \code{estimation_type} is +\code{bias_correction} or \code{bc}, aggregation leads to a single bias-corrected +estimate. If \code{estimation_type} is \code{bootstrap_confidence_interval} or \code{bci}, +aggregation leads to a single bias-corrected estimate with lower and upper +boundaries of the confidence interval. This has no effect if +\code{estimation_type} is \code{point}. + +The default value is equal to \code{TRUE} except when assessing metrics to assess +model performance, as the default violin plot requires underlying data. + +As with \code{detail_level} and \code{estimation_type}, a non-default +\code{aggregate_results} parameter can be specified for separate evaluation steps +by providing a parameter value in a named list with data elements, e.g. +\code{list("auc_data"=TRUE, , "model_performance"=FALSE)}. This parameter exists +for the same elements as \code{estimation_type}.} + \item{\code{confidence_level}}{(\emph{optional}) Numeric value for the level at which +confidence intervals are determined. In the case bootstraps are used to +determine the confidence intervals bootstrap estimation, \code{familiar} uses the +rule of thumb \eqn{n = 20 / ci.level} to determine the number of required +bootstraps. + +The default value is \code{0.95}.} + \item{\code{bootstrap_ci_method}}{(\emph{optional}) Method used to determine bootstrap +confidence intervals (Efron and Hastie, 2016). The following methods are +implemented: +\itemize{ +\item \code{percentile} (default): Confidence intervals obtained using the percentile +method. +\item \code{bc}: Bias-corrected confidence intervals. +} + +Note that the standard method is not implemented because this method is +often not suitable due to non-normal distributions. The bias-corrected and +accelerated (BCa) method is not implemented yet.} + \item{\code{familiar_data_names}}{Names of the dataset(s). Only used if the \code{object} parameter +is one or more \code{familiarData} objects.} + \item{\code{collection_name}}{Name of the collection.} + \item{\code{filename}}{File name to create on disk.} + \item{\code{plot}}{Plot to save, defaults to last plot displayed.} + \item{\code{device}}{Device to use. Can either be a device function +(e.g. \link{png}), or one of "eps", "ps", "tex" (pictex), +"pdf", "jpeg", "tiff", "png", "bmp", "svg" or "wmf" (windows only). If +\code{NULL} (default), the device is guessed based on the \code{filename} extension.} + \item{\code{path}}{Path of the directory to save plot to: \code{path} and \code{filename} +are combined to create the fully qualified file name. Defaults to the +working directory.} + \item{\code{scale}}{Multiplicative scaling factor.} + \item{\code{dpi}}{Plot resolution. Also accepts a string input: "retina" (320), +"print" (300), or "screen" (72). Applies only to raster output types.} + \item{\code{limitsize}}{When \code{TRUE} (the default), \code{ggsave()} will not +save images larger than 50x50 inches, to prevent the common error of +specifying dimensions in pixels.} + \item{\code{bg}}{Background colour. 
If \code{NULL}, uses the \code{plot.background} fill value
+from the plot theme.}
+    \item{\code{create.dir}}{Whether to create new directories if a non-existing
+directory is specified in the \code{filename} or \code{path} (\code{TRUE}) or return an
+error (\code{FALSE}, default). If \code{FALSE} and run in an interactive session,
+a prompt will appear asking to create a new directory when necessary.}
+  }}
+}
+\value{
+\code{NULL} or list of plot objects, if \code{dir_path} is \code{NULL}.
+}
+\description{
+This method creates plots that show model performance from the
+data stored in a familiarCollection object. This method may create several
+types of plots, as determined by \code{plot_type}.
+}
+\details{
+This function plots model performance based on empirical bootstraps,
+using various plot representations.
+
+Available splitting variables are: \code{fs_method}, \code{learner}, \code{data_set},
+\code{evaluation_time} (survival outcome only) and \code{metric}. The default for
+\code{heatmap} is to split by \code{metric}, facet by \code{data_set} and
+\code{evaluation_time}, position \code{learner} along the x-axis and \code{fs_method} along
+the y-axis. The \code{color_by} argument is not used. The only valid options for
+\code{x_axis_by} and \code{y_axis_by} are \code{learner} and \code{fs_method}.
+
+For the other plot types (\code{barplot}, \code{boxplot} and \code{violinplot}), the
+default layout depends on the number of learners and feature selection
+methods:
+\itemize{
+\item \emph{one feature selection method and one learner}: the default is to split by
+\code{metric}, and have \code{data_set} along the x-axis.
+\item \emph{one feature selection method and multiple learners}: the default is to
+split by \code{metric}, facet by \code{data_set} and have \code{learner} along the x-axis.
+\item \emph{multiple feature selection methods and one learner}: the default is to
+split by \code{metric}, facet by \code{data_set} and have \code{fs_method} along the
+x-axis.
+\item \emph{multiple feature selection methods and learners}: the default is to split
+by \code{metric}, facet by \code{data_set}, colour by \code{fs_method} and have \code{learner}
+along the x-axis.
+}
+
+If applicable, additional faceting is performed for \code{evaluation_time}.
+
+Available palettes for \code{discrete_palette} and \code{gradient_palette} are those
+listed by \code{grDevices::palette.pals()} (requires R >= 4.0.0),
+\code{grDevices::hcl.pals()} (requires R >= 3.6.0) and \code{rainbow}, \code{heat.colors},
+\code{terrain.colors}, \code{topo.colors} and \code{cm.colors}, which correspond to the
+palettes of the same name in \code{grDevices}. If not specified, a default
+palette based on palettes in Tableau is used. You may also specify your own
+palette by using colour names listed by \code{grDevices::colors()} or through
+hexadecimal RGB strings.
+
+Labelling methods such as \code{set_fs_method_names} or \code{set_data_set_names} can
+be applied to the \code{familiarCollection} object to update labels, and order
+the output in the figure.
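+
+As an illustration, a minimal call could look as follows. This is only a
+sketch: the \code{collection} object is assumed to be a \code{familiarCollection}
+created elsewhere, and the arguments shown simply mirror the default layout
+described above.
+
+\preformatted{
+# Draw violin plots of model performance, split by metric and faceted by
+# data set. With dir_path = NULL the plot objects are returned instead of
+# being written to the "performance" subdirectory.
+plots <- plot_model_performance(
+  object = collection,
+  plot_type = "violinplot",
+  facet_by = "data_set",
+  color_by = "fs_method",
+  dir_path = NULL)
+}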
+} diff --git a/man/plot_pd-methods.Rd b/man/plot_pd-methods.Rd new file mode 100644 index 00000000..cffc4174 --- /dev/null +++ b/man/plot_pd-methods.Rd @@ -0,0 +1,429 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/PlotICE.R +\name{plot_pd} +\alias{plot_pd} +\alias{plot_pd,ANY-method} +\title{Plot partial dependence.} +\usage{ +plot_pd( + object, + draw = FALSE, + dir_path = NULL, + split_by = NULL, + color_by = NULL, + facet_by = NULL, + facet_wrap_cols = NULL, + ggtheme = NULL, + discrete_palette = NULL, + gradient_palette = NULL, + gradient_palette_range = NULL, + x_label = waiver(), + y_label = waiver(), + legend_label = waiver(), + plot_title = waiver(), + plot_sub_title = waiver(), + caption = NULL, + x_range = NULL, + x_n_breaks = 5, + x_breaks = NULL, + y_range = NULL, + y_n_breaks = 5, + y_breaks = NULL, + novelty_range = NULL, + value_scales = waiver(), + novelty_scales = waiver(), + conf_int_style = c("ribbon", "step", "none"), + conf_int_alpha = 0.4, + show_novelty = TRUE, + anchor_values = NULL, + width = waiver(), + height = waiver(), + units = waiver(), + export_collection = FALSE, + ... +) + +\S4method{plot_pd}{ANY}( + object, + draw = FALSE, + dir_path = NULL, + split_by = NULL, + color_by = NULL, + facet_by = NULL, + facet_wrap_cols = NULL, + ggtheme = NULL, + discrete_palette = NULL, + gradient_palette = NULL, + gradient_palette_range = NULL, + x_label = waiver(), + y_label = waiver(), + legend_label = waiver(), + plot_title = waiver(), + plot_sub_title = waiver(), + caption = NULL, + x_range = NULL, + x_n_breaks = 5, + x_breaks = NULL, + y_range = NULL, + y_n_breaks = 5, + y_breaks = NULL, + novelty_range = NULL, + value_scales = waiver(), + novelty_scales = waiver(), + conf_int_style = c("ribbon", "step", "none"), + conf_int_alpha = 0.4, + show_novelty = TRUE, + anchor_values = NULL, + width = waiver(), + height = waiver(), + units = waiver(), + export_collection = FALSE, + ... +) +} +\arguments{ +\item{object}{\code{familiarCollection} object, or one or more \code{familiarData} +objects, that will be internally converted to a \code{familiarCollection} object. +It is also possible to provide a \code{familiarEnsemble} or one or more +\code{familiarModel} objects together with the data from which data is computed +prior to export. Paths to such files can also be provided.} + +\item{draw}{(\emph{optional}) Draws the plot if TRUE.} + +\item{dir_path}{(\emph{optional}) Path to the directory where created individual +conditional expectation plots are saved to. Output is saved in the +\code{explanation} subdirectory. If \code{NULL}, figures are written to the folder, +but are returned instead.} + +\item{split_by}{(\emph{optional}) Splitting variables. This refers to column names +on which datasets are split. A separate figure is created for each split. +See details for available variables.} + +\item{color_by}{(\emph{optional}) Variables used to determine fill colour of plot +objects. The variables cannot overlap with those provided to the \code{split_by} +argument, but may overlap with other arguments. See details for available +variables.} + +\item{facet_by}{(\emph{optional}) Variables used to determine how and if facets of +each figure appear. In case the \code{facet_wrap_cols} argument is \code{NULL}, the +first variable is used to define columns, and the remaing variables are +used to define rows of facets. The variables cannot overlap with those +provided to the \code{split_by} argument, but may overlap with other arguments. 
+See details for available variables.} + +\item{facet_wrap_cols}{(\emph{optional}) Number of columns to generate when facet +wrapping. If NULL, a facet grid is produced instead.} + +\item{ggtheme}{(\emph{optional}) \code{ggplot} theme to use for plotting.} + +\item{discrete_palette}{(\emph{optional}) Palette to use to colour the different +plot elements in case a value was provided to the \code{color_by} argument. For +2D individual conditional expectation plots without novelty, the initial +colour determines the colour of the points indicating sample values.} + +\item{gradient_palette}{(\emph{optional}) Sequential or divergent palette used to +colour the raster in 2D individual conditional expectation or partial +dependence plots. This argument is not used for 1D plots.} + +\item{gradient_palette_range}{(\emph{optional}) Numerical range used to span the +gradient for 2D plots. This should be a range of two values, e.g. \code{c(0, 1)}. +By default, values are determined from the data, dependent on the +\code{value_scales} parameter. This parameter is ignored for 1D plots.} + +\item{x_label}{(\emph{optional}) Label to provide to the x-axis. If NULL, no label +is shown.} + +\item{y_label}{(\emph{optional}) Label to provide to the y-axis. If NULL, no label +is shown.} + +\item{legend_label}{(\emph{optional}) Label to provide to the legend. If NULL, the +legend will not have a name.} + +\item{plot_title}{(\emph{optional}) Label to provide as figure title. If NULL, no +title is shown.} + +\item{plot_sub_title}{(\emph{optional}) Label to provide as figure subtitle. If +NULL, no subtitle is shown.} + +\item{caption}{(\emph{optional}) Label to provide as figure caption. If NULL, no +caption is shown.} + +\item{x_range}{(\emph{optional}) Value range for the x-axis.} + +\item{x_n_breaks}{(\emph{optional}) Number of breaks to show on the x-axis of the +plot. \code{x_n_breaks} is used to determine the \code{x_breaks} argument in case it +is unset.} + +\item{x_breaks}{(\emph{optional}) Break points on the x-axis of the plot.} + +\item{y_range}{(\emph{optional}) Value range for the y-axis.} + +\item{y_n_breaks}{(\emph{optional}) Number of breaks to show on the y-axis of the +plot. \code{y_n_breaks} is used to determine the \code{y_breaks} argument in case it +is unset.} + +\item{y_breaks}{(\emph{optional}) Break points on the y-axis of the plot.} + +\item{novelty_range}{(\emph{optional}) Numerical range used to span the range of +novelty values. This determines the size of the bubbles in 2D, and +transparency of lines in 1D. This should be a range of two values, e.g. +\code{c(0, 1)}. By default, values are determined from the data, dependent on the +\code{value_scales} parameter. This parameter is ignored if \code{show_novelty=FALSE}.} + +\item{value_scales}{(\emph{optional}) Sets scaling of predicted values. This +parameter has several options: +\itemize{ +\item \code{fixed} (default): The value axis for all features will have the same +range. +\item \code{feature}: The value axis for each feature will have the same range. This +option is unavailable for 2D plots. +\item \code{figure}: The value axis for all facets in a figure will have the same +range. +\item \code{facet}: Each facet has its own range. This option is unavailable for 2D +plots. 
+}
+
+For 1D plots, this option is ignored if the \code{y_range} is provided, whereas
+for 2D it is ignored if the \code{gradient_palette_range} is provided.}
+
+\item{novelty_scales}{(\emph{optional}) Sets scaling of novelty values, similar to
+the \code{value_scales} parameter, but with more limited options:
+\itemize{
+\item \code{fixed} (default): The novelty will have the same range for all features.
+\item \code{figure}: The novelty will have the same range for all facets in a figure.
+}}
+
+\item{conf_int_style}{(\emph{optional}) Confidence interval style. See details for
+allowed styles.}
+
+\item{conf_int_alpha}{(\emph{optional}) Alpha value to determine transparency of
+confidence intervals or, alternatively, other plot elements with which the
+confidence interval overlaps. Only values between 0.0 (fully transparent)
+and 1.0 (fully opaque) are allowed.}
+
+\item{show_novelty}{(\emph{optional}) Sets whether novelty is shown in plots.}
+
+\item{anchor_values}{(\emph{optional}) A single value or a named list or array of
+values that are used to centre the individual conditional expectation plot.
+A single value is valid if and only if a single feature is assessed.
+Otherwise, values should be named after the corresponding features. This
+argument has no effect if the individual conditional expectation plot is not
+shown, i.e. \code{show_ice=FALSE}. A partial dependence plot cannot be shown for
+those features.}
+
+\item{width}{(\emph{optional}) Width of the plot. A default value is derived from
+the number of facets.}
+
+\item{height}{(\emph{optional}) Height of the plot. A default value is derived from
+the number of features and the number of facets.}
+
+\item{units}{(\emph{optional}) Plot size unit. Either \code{cm} (default), \code{mm} or \verb{in}.}
+
+\item{export_collection}{(\emph{optional}) Exports the collection if TRUE.}
+
+\item{...}{
+  Arguments passed on to \code{\link[=export_ice_data]{export_ice_data}}, \code{\link[ggplot2:ggsave]{ggplot2::ggsave}}, \code{\link[=extract_ice]{extract_ice}}
+  \describe{
+    \item{\code{aggregate_results}}{Flag that signifies whether results should be
+aggregated for export.}
+    \item{\code{filename}}{File name to create on disk.}
+    \item{\code{plot}}{Plot to save, defaults to last plot displayed.}
+    \item{\code{device}}{Device to use. Can either be a device function
+(e.g. \link{png}), or one of "eps", "ps", "tex" (pictex),
+"pdf", "jpeg", "tiff", "png", "bmp", "svg" or "wmf" (windows only). If
+\code{NULL} (default), the device is guessed based on the \code{filename} extension.}
+    \item{\code{path}}{Path of the directory to save plot to: \code{path} and \code{filename}
+are combined to create the fully qualified file name. Defaults to the
+working directory.}
+    \item{\code{scale}}{Multiplicative scaling factor.}
+    \item{\code{dpi}}{Plot resolution. Also accepts a string input: "retina" (320),
+"print" (300), or "screen" (72). Applies only to raster output types.}
+    \item{\code{limitsize}}{When \code{TRUE} (the default), \code{ggsave()} will not
+save images larger than 50x50 inches, to prevent the common error of
+specifying dimensions in pixels.}
+    \item{\code{bg}}{Background colour. If \code{NULL}, uses the \code{plot.background} fill value
+from the plot theme.}
+    \item{\code{create.dir}}{Whether to create new directories if a non-existing
+directory is specified in the \code{filename} or \code{path} (\code{TRUE}) or return an
+error (\code{FALSE}, default).
If \code{FALSE} and run in an interactive session, +a prompt will appear asking to create a new directory when necessary.} + \item{\code{features}}{Names of the feature or features (2) assessed simultaneously. +By default \code{NULL}, which means that all features are assessed one-by-one.} + \item{\code{feature_x_range}}{When one or two features are defined using \code{features}, +\code{feature_x_range} can be used to set the range of values for the first +feature. For numeric features, a vector of two values is assumed to indicate +a range from which \code{n_sample_points} are uniformly sampled. A vector of more +than two values is interpreted as is, i.e. these represent the values to be +sampled. For categorical features, values should represent a (sub)set of +available levels.} + \item{\code{feature_y_range}}{As \code{feature_x_range}, but for the second feature in +case two features are defined.} + \item{\code{n_sample_points}}{Number of points used to sample continuous features.} + \item{\code{data}}{A \code{dataObject} object, \code{data.table} or \code{data.frame} that +constitutes the data that are assessed.} + \item{\code{is_pre_processed}}{Flag that indicates whether the data was already +pre-processed externally, e.g. normalised and clustered. Only used if the +\code{data} argument is a \code{data.table} or \code{data.frame}.} + \item{\code{cl}}{Cluster created using the \code{parallel} package. This cluster is then +used to speed up computation through parallellisation.} + \item{\code{evaluation_times}}{One or more time points that are used for in analysis of +survival problems when data has to be assessed at a set time, e.g. +calibration. If not provided explicitly, this parameter is read from +settings used at creation of the underlying \code{familiarModel} objects. Only +used for \code{survival} outcomes.} + \item{\code{ensemble_method}}{Method for ensembling predictions from models for the +same sample. Available methods are: +\itemize{ +\item \code{median} (default): Use the median of the predicted values as the ensemble +value for a sample. +\item \code{mean}: Use the mean of the predicted values as the ensemble value for a +sample. +}} + \item{\code{verbose}}{Flag to indicate whether feedback should be provided on the +computation and extraction of various data elements.} + \item{\code{message_indent}}{Number of indentation steps for messages shown during +computation and extraction of various data elements.} + \item{\code{sample_limit}}{(\emph{optional}) Set the upper limit of the number of samples +that are used during evaluation steps. Cannot be less than 20. + +This setting can be specified per data element by providing a parameter +value in a named list with data elements, e.g. +\code{list("sample_similarity"=100, "permutation_vimp"=1000)}. + +This parameter can be set for the following data elements: +\code{sample_similarity} and \code{ice_data}.} + \item{\code{detail_level}}{(\emph{optional}) Sets the level at which results are computed +and aggregated. +\itemize{ +\item \code{ensemble}: Results are computed at the ensemble level, i.e. over all +models in the ensemble. This means that, for example, bias-corrected +estimates of model performance are assessed by creating (at least) 20 +bootstraps and computing the model performance of the ensemble model for +each bootstrap. +\item \code{hybrid} (default): Results are computed at the level of models in an +ensemble. 
This means that, for example, bias-corrected estimates of model +performance are directly computed using the models in the ensemble. If there +are at least 20 trained models in the ensemble, performance is computed for +each model, in contrast to \code{ensemble} where performance is computed for the +ensemble of models. If there are less than 20 trained models in the +ensemble, bootstraps are created so that at least 20 point estimates can be +made. +\item \code{model}: Results are computed at the model level. This means that, for +example, bias-corrected estimates of model performance are assessed by +creating (at least) 20 bootstraps and computing the performance of the model +for each bootstrap. +} + +Note that each level of detail has a different interpretation for bootstrap +confidence intervals. For \code{ensemble} and \code{model} these are the confidence +intervals for the ensemble and an individual model, respectively. That is, +the confidence interval describes the range where an estimate produced by a +respective ensemble or model trained on a repeat of the experiment may be +found with the probability of the confidence level. For \code{hybrid}, it +represents the range where any single model trained on a repeat of the +experiment may be found with the probability of the confidence level. By +definition, confidence intervals obtained using \code{hybrid} are at least as +wide as those for \code{ensemble}. \code{hybrid} offers the correct interpretation if +the goal of the analysis is to assess the result of a single, unspecified, +model. + +\code{hybrid} is generally computationally less expensive then \code{ensemble}, which +in turn is somewhat less expensive than \code{model}. + +A non-default \code{detail_level} parameter can be specified for separate +evaluation steps by providing a parameter value in a named list with data +elements, e.g. \code{list("auc_data"="ensemble", "model_performance"="hybrid")}. +This parameter can be set for the following data elements: \code{auc_data}, +\code{decision_curve_analyis}, \code{model_performance}, \code{permutation_vimp}, +\code{ice_data}, \code{prediction_data} and \code{confusion_matrix}.} + \item{\code{estimation_type}}{(\emph{optional}) Sets the type of estimation that should be +possible. This has the following options: +\itemize{ +\item \code{point}: Point estimates. +\item \code{bias_correction} or \code{bc}: Bias-corrected estimates. A bias-corrected +estimate is computed from (at least) 20 point estimates, and \code{familiar} may +bootstrap the data to create them. +\item \code{bootstrap_confidence_interval} or \code{bci} (default): Bias-corrected +estimates with bootstrap confidence intervals (Efron and Hastie, 2016). The +number of point estimates required depends on the \code{confidence_level} +parameter, and \code{familiar} may bootstrap the data to create them. +} + +As with \code{detail_level}, a non-default \code{estimation_type} parameter can be +specified for separate evaluation steps by providing a parameter value in a +named list with data elements, e.g. \code{list("auc_data"="bci", "model_performance"="point")}. This parameter can be set for the following +data elements: \code{auc_data}, \code{decision_curve_analyis}, \code{model_performance}, +\code{permutation_vimp}, \code{ice_data}, and \code{prediction_data}.} + \item{\code{confidence_level}}{(\emph{optional}) Numeric value for the level at which +confidence intervals are determined. 
If bootstrap estimation is used to
+determine the confidence intervals, \code{familiar} uses the
+rule of thumb \eqn{n = 20 / ci.level} to determine the number of required
+bootstraps.
+
+The default value is \code{0.95}.}
+    \item{\code{bootstrap_ci_method}}{(\emph{optional}) Method used to determine bootstrap
+confidence intervals (Efron and Hastie, 2016). The following methods are
+implemented:
+\itemize{
+\item \code{percentile} (default): Confidence intervals obtained using the percentile
+method.
+\item \code{bc}: Bias-corrected confidence intervals.
+}
+
+Note that the standard method is not implemented because this method is
+often not suitable due to non-normal distributions. The bias-corrected and
+accelerated (BCa) method is not implemented yet.}
+  }}
+}
+\value{
+\code{NULL} or list of plot objects, if \code{dir_path} is \code{NULL}.
+}
+\description{
+This method creates partial dependence plots
+based on data in a familiarCollection object.
+}
+\details{
+This function generates partial dependence plots.
+These plots come in two varieties, namely 1D and 2D. 1D plots show the
+predicted value as a function of a single feature, whereas 2D plots show the
+predicted value as a function of two features.
+
+Available splitting variables are: \code{feature_x}, \code{feature_y} (2D only),
+\code{fs_method}, \code{learner}, \code{data_set} and \code{positive_class} (categorical
+outcomes) or \code{evaluation_time} (survival outcomes). By default, for 1D ICE
+plots the data are split by \code{feature_x}, \code{fs_method} and \code{learner}, with
+faceting by \code{data_set}, \code{positive_class} or \code{evaluation_time}. If only
+partial dependence is shown, \code{positive_class} and \code{evaluation_time} are used
+to set colours instead. For 2D plots, by default the data are split by
+\code{feature_x}, \code{fs_method} and \code{learner}, with faceting by \code{data_set},
+\code{positive_class} or \code{evaluation_time}. The \code{color_by} argument cannot be
+used with 2D plots, and attempting to do so causes an error. Attempting to
+specify \code{feature_x} or \code{feature_y} for \code{color_by} will likewise result in an
+error, as multiple features cannot be shown in the same facet.
+
+The splitting variables indicated by \code{color_by} are coloured according to
+the \code{discrete_palette} parameter. This parameter is therefore only used for
+1D plots. Available palettes for \code{discrete_palette} and \code{gradient_palette}
+are those listed by \code{grDevices::palette.pals()} (requires R >= 4.0.0),
+\code{grDevices::hcl.pals()} (requires R >= 3.6.0) and \code{rainbow}, \code{heat.colors},
+\code{terrain.colors}, \code{topo.colors} and \code{cm.colors}, which correspond to the
+palettes of the same name in \code{grDevices}. If not specified, a default
+palette based on palettes in Tableau is used. You may also specify your own
+palette by using colour names listed by \code{grDevices::colors()} or through
+hexadecimal RGB strings.
+
+Bootstrap confidence intervals of the partial dependence plots can be shown
+using various styles set by \code{conf_int_style}:
+\itemize{
+\item \code{ribbon} (default): confidence intervals are shown as a ribbon with an
+opacity of \code{conf_int_alpha} around the point estimate of the partial
+dependence.
+\item \code{step}: confidence intervals are shown as a step function around
+the point estimate of the partial dependence.
+\item \code{none}: confidence intervals are not shown.
The point estimate of the +partial dependence is shown as usual. +} + +Labelling methods such as \code{set_fs_method_names} or \code{set_data_set_names} can +be applied to the \code{familiarCollection} object to update labels, and order +the output in the figure. +} diff --git a/man/plot_permutation_variable_importance-methods.Rd b/man/plot_permutation_variable_importance-methods.Rd new file mode 100644 index 00000000..27a414c2 --- /dev/null +++ b/man/plot_permutation_variable_importance-methods.Rd @@ -0,0 +1,428 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/PlotPermutationVariableImportance.R +\name{plot_permutation_variable_importance} +\alias{plot_permutation_variable_importance} +\alias{plot_permutation_variable_importance,ANY-method} +\alias{plot_permutation_variable_importance,familiarCollection-method} +\title{Plot permutation variable importance.} +\usage{ +plot_permutation_variable_importance( + object, + draw = FALSE, + dir_path = NULL, + split_by = NULL, + color_by = NULL, + facet_by = NULL, + facet_wrap_cols = NULL, + ggtheme = NULL, + discrete_palette = NULL, + x_label = waiver(), + y_label = "feature", + legend_label = waiver(), + plot_title = waiver(), + plot_sub_title = waiver(), + caption = NULL, + x_range = NULL, + x_n_breaks = 5, + x_breaks = NULL, + conf_int_style = c("point_line", "line", "bar_line", "none"), + conf_int_alpha = 0.4, + width = waiver(), + height = waiver(), + units = waiver(), + export_collection = FALSE, + ... +) + +\S4method{plot_permutation_variable_importance}{ANY}( + object, + draw = FALSE, + dir_path = NULL, + split_by = NULL, + color_by = NULL, + facet_by = NULL, + facet_wrap_cols = NULL, + ggtheme = NULL, + discrete_palette = NULL, + x_label = waiver(), + y_label = "feature", + legend_label = waiver(), + plot_title = waiver(), + plot_sub_title = waiver(), + caption = NULL, + x_range = NULL, + x_n_breaks = 5, + x_breaks = NULL, + conf_int_style = c("point_line", "line", "bar_line", "none"), + conf_int_alpha = 0.4, + width = waiver(), + height = waiver(), + units = waiver(), + export_collection = FALSE, + ... +) + +\S4method{plot_permutation_variable_importance}{familiarCollection}( + object, + draw = FALSE, + dir_path = NULL, + split_by = NULL, + color_by = NULL, + facet_by = NULL, + facet_wrap_cols = NULL, + ggtheme = NULL, + discrete_palette = NULL, + x_label = waiver(), + y_label = "feature", + legend_label = waiver(), + plot_title = waiver(), + plot_sub_title = waiver(), + caption = NULL, + x_range = NULL, + x_n_breaks = 5, + x_breaks = NULL, + conf_int_style = c("point_line", "line", "bar_line", "none"), + conf_int_alpha = 0.4, + width = waiver(), + height = waiver(), + units = waiver(), + export_collection = FALSE, + ... +) +} +\arguments{ +\item{object}{\code{familiarCollection} object, or one or more \code{familiarData} +objects, that will be internally converted to a \code{familiarCollection} object. +It is also possible to provide a \code{familiarEnsemble} or one or more +\code{familiarModel} objects together with the data from which data is computed +prior to export. Paths to such files can also be provided.} + +\item{draw}{(\emph{optional}) Draws the plot if TRUE.} + +\item{dir_path}{(\emph{optional}) Path to the directory where created figures are +saved to. Output is saved in the \code{variable_importance} subdirectory. If NULL +no figures are saved, but are returned instead.} + +\item{split_by}{(\emph{optional}) Splitting variables. This refers to column names +on which datasets are split. 
A separate figure is created for each split. +See details for available variables.} + +\item{color_by}{(\emph{optional}) Variables used to determine fill colour of plot +objects. The variables cannot overlap with those provided to the \code{split_by} +argument, but may overlap with other arguments. See details for available +variables.} + +\item{facet_by}{(\emph{optional}) Variables used to determine how and if facets of +each figure appear. In case the \code{facet_wrap_cols} argument is \code{NULL}, the +first variable is used to define columns, and the remaing variables are +used to define rows of facets. The variables cannot overlap with those +provided to the \code{split_by} argument, but may overlap with other arguments. +See details for available variables.} + +\item{facet_wrap_cols}{(\emph{optional}) Number of columns to generate when facet +wrapping. If NULL, a facet grid is produced instead.} + +\item{ggtheme}{(\emph{optional}) \code{ggplot} theme to use for plotting.} + +\item{discrete_palette}{(\emph{optional}) Palette used to fill the bars in case a +non-singular variable was provided to the \code{color_by} argument.} + +\item{x_label}{(\emph{optional}) Label to provide to the x-axis. If NULL, no label +is shown.} + +\item{y_label}{(\emph{optional}) Label to provide to the y-axis. If NULL, no label +is shown.} + +\item{legend_label}{(\emph{optional}) Label to provide to the legend. If NULL, the +legend will not have a name.} + +\item{plot_title}{(\emph{optional}) Label to provide as figure title. If NULL, no +title is shown.} + +\item{plot_sub_title}{(\emph{optional}) Label to provide as figure subtitle. If +NULL, no subtitle is shown.} + +\item{caption}{(\emph{optional}) Label to provide as figure caption. If NULL, no +caption is shown.} + +\item{x_range}{(\emph{optional}) Value range for the x-axis.} + +\item{x_n_breaks}{(\emph{optional}) Number of breaks to show on the x-axis of the +plot. \code{x_n_breaks} is used to determine the \code{x_breaks} argument in case it +is unset.} + +\item{x_breaks}{(\emph{optional}) Break points on the x-axis of the plot.} + +\item{conf_int_style}{(\emph{optional}) Confidence interval style. See details for +allowed styles.} + +\item{conf_int_alpha}{(\emph{optional}) Alpha value to determine transparency of +confidence intervals or, alternatively, other plot elements with which the +confidence interval overlaps. Only values between 0.0 (fully transparent) +and 1.0 (fully opaque) are allowed.} + +\item{width}{(\emph{optional}) Width of the plot. A default value is derived from +the number of facets.} + +\item{height}{(\emph{optional}) Height of the plot. A default value is derived from +the number of features and the number of facets.} + +\item{units}{(\emph{optional}) Plot size unit. Either \code{cm} (default), \code{mm} or \verb{in}.} + +\item{export_collection}{(\emph{optional}) Exports the collection if TRUE.} + +\item{...}{ + Arguments passed on to \code{\link[=as_familiar_collection]{as_familiar_collection}}, \code{\link[ggplot2:ggsave]{ggplot2::ggsave}}, \code{\link[=extract_permutation_vimp]{extract_permutation_vimp}} + \describe{ + \item{\code{familiar_data_names}}{Names of the dataset(s). Only used if the \code{object} parameter +is one or more \code{familiarData} objects.} + \item{\code{collection_name}}{Name of the collection.} + \item{\code{filename}}{File name to create on disk.} + \item{\code{plot}}{Plot to save, defaults to last plot displayed.} + \item{\code{device}}{Device to use. Can either be a device function +(e.g. 
\link{png}), or one of "eps", "ps", "tex" (pictex), +"pdf", "jpeg", "tiff", "png", "bmp", "svg" or "wmf" (windows only). If +\code{NULL} (default), the device is guessed based on the \code{filename} extension.} + \item{\code{path}}{Path of the directory to save plot to: \code{path} and \code{filename} +are combined to create the fully qualified file name. Defaults to the +working directory.} + \item{\code{scale}}{Multiplicative scaling factor.} + \item{\code{dpi}}{Plot resolution. Also accepts a string input: "retina" (320), +"print" (300), or "screen" (72). Applies only to raster output types.} + \item{\code{limitsize}}{When \code{TRUE} (the default), \code{ggsave()} will not +save images larger than 50x50 inches, to prevent the common error of +specifying dimensions in pixels.} + \item{\code{bg}}{Background colour. If \code{NULL}, uses the \code{plot.background} fill value +from the plot theme.} + \item{\code{create.dir}}{Whether to create new directories if a non-existing +directory is specified in the \code{filename} or \code{path} (\code{TRUE}) or return an +error (\code{FALSE}, default). If \code{FALSE} and run in an interactive session, +a prompt will appear asking to create a new directory when necessary.} + \item{\code{data}}{A \code{dataObject} object, \code{data.table} or \code{data.frame} that +constitutes the data that are assessed.} + \item{\code{is_pre_processed}}{Flag that indicates whether the data was already +pre-processed externally, e.g. normalised and clustered. Only used if the +\code{data} argument is a \code{data.table} or \code{data.frame}.} + \item{\code{cl}}{Cluster created using the \code{parallel} package. This cluster is then +used to speed up computation through parallellisation.} + \item{\code{evaluation_times}}{One or more time points that are used for in analysis of +survival problems when data has to be assessed at a set time, e.g. +calibration. If not provided explicitly, this parameter is read from +settings used at creation of the underlying \code{familiarModel} objects. Only +used for \code{survival} outcomes.} + \item{\code{ensemble_method}}{Method for ensembling predictions from models for the +same sample. Available methods are: +\itemize{ +\item \code{median} (default): Use the median of the predicted values as the ensemble +value for a sample. +\item \code{mean}: Use the mean of the predicted values as the ensemble value for a +sample. +}} + \item{\code{metric}}{One or more metrics for assessing model performance. See the +vignette on performance metrics for the available metrics. If not provided +explicitly, this parameter is read from settings used at creation of the +underlying \code{familiarModel} objects.} + \item{\code{feature_cluster_method}}{The method used to perform clustering. These are +the same methods as for the \code{cluster_method} configuration parameter: +\code{none}, \code{hclust}, \code{agnes}, \code{diana} and \code{pam}. + +\code{none} cannot be used when extracting data regarding mutual correlation or +feature expressions. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + \item{\code{feature_linkage_method}}{The method used for agglomerative clustering in +\code{hclust} and \code{agnes}. These are the same methods as for the +\code{cluster_linkage_method} configuration parameter: \code{average}, \code{single}, +\code{complete}, \code{weighted}, and \code{ward}. 
+ +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + \item{\code{feature_cluster_cut_method}}{The method used to divide features into +separate clusters. The available methods are the same as for the +\code{cluster_cut_method} configuration parameter: \code{silhouette}, \code{fixed_cut} and +\code{dynamic_cut}. + +\code{silhouette} is available for all cluster methods, but \code{fixed_cut} only +applies to methods that create hierarchical trees (\code{hclust}, \code{agnes} and +\code{diana}). \code{dynamic_cut} requires the \code{dynamicTreeCut} package and can only +be used with \code{agnes} and \code{hclust}. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + \item{\code{feature_similarity_threshold}}{The threshold level for pair-wise +similarity that is required to form feature clusters with the \code{fixed_cut} +method. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + \item{\code{feature_similarity_metric}}{Metric to determine pairwise similarity +between features. Similarity is computed in the same manner as for +clustering, and \code{feature_similarity_metric} therefore has the same options +as \code{cluster_similarity_metric}: \code{mcfadden_r2}, \code{cox_snell_r2}, +\code{nagelkerke_r2}, \code{spearman}, \code{kendall} and \code{pearson}. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + \item{\code{verbose}}{Flag to indicate whether feedback should be provided on the +computation and extraction of various data elements.} + \item{\code{message_indent}}{Number of indentation steps for messages shown during +computation and extraction of various data elements.} + \item{\code{detail_level}}{(\emph{optional}) Sets the level at which results are computed +and aggregated. +\itemize{ +\item \code{ensemble}: Results are computed at the ensemble level, i.e. over all +models in the ensemble. This means that, for example, bias-corrected +estimates of model performance are assessed by creating (at least) 20 +bootstraps and computing the model performance of the ensemble model for +each bootstrap. +\item \code{hybrid} (default): Results are computed at the level of models in an +ensemble. This means that, for example, bias-corrected estimates of model +performance are directly computed using the models in the ensemble. If there +are at least 20 trained models in the ensemble, performance is computed for +each model, in contrast to \code{ensemble} where performance is computed for the +ensemble of models. If there are less than 20 trained models in the +ensemble, bootstraps are created so that at least 20 point estimates can be +made. +\item \code{model}: Results are computed at the model level. This means that, for +example, bias-corrected estimates of model performance are assessed by +creating (at least) 20 bootstraps and computing the performance of the model +for each bootstrap. +} + +Note that each level of detail has a different interpretation for bootstrap +confidence intervals. For \code{ensemble} and \code{model} these are the confidence +intervals for the ensemble and an individual model, respectively. 
That is, +the confidence interval describes the range where an estimate produced by a +respective ensemble or model trained on a repeat of the experiment may be +found with the probability of the confidence level. For \code{hybrid}, it +represents the range where any single model trained on a repeat of the +experiment may be found with the probability of the confidence level. By +definition, confidence intervals obtained using \code{hybrid} are at least as +wide as those for \code{ensemble}. \code{hybrid} offers the correct interpretation if +the goal of the analysis is to assess the result of a single, unspecified, +model. + +\code{hybrid} is generally computationally less expensive then \code{ensemble}, which +in turn is somewhat less expensive than \code{model}. + +A non-default \code{detail_level} parameter can be specified for separate +evaluation steps by providing a parameter value in a named list with data +elements, e.g. \code{list("auc_data"="ensemble", "model_performance"="hybrid")}. +This parameter can be set for the following data elements: \code{auc_data}, +\code{decision_curve_analyis}, \code{model_performance}, \code{permutation_vimp}, +\code{ice_data}, \code{prediction_data} and \code{confusion_matrix}.} + \item{\code{estimation_type}}{(\emph{optional}) Sets the type of estimation that should be +possible. This has the following options: +\itemize{ +\item \code{point}: Point estimates. +\item \code{bias_correction} or \code{bc}: Bias-corrected estimates. A bias-corrected +estimate is computed from (at least) 20 point estimates, and \code{familiar} may +bootstrap the data to create them. +\item \code{bootstrap_confidence_interval} or \code{bci} (default): Bias-corrected +estimates with bootstrap confidence intervals (Efron and Hastie, 2016). The +number of point estimates required depends on the \code{confidence_level} +parameter, and \code{familiar} may bootstrap the data to create them. +} + +As with \code{detail_level}, a non-default \code{estimation_type} parameter can be +specified for separate evaluation steps by providing a parameter value in a +named list with data elements, e.g. \code{list("auc_data"="bci", "model_performance"="point")}. This parameter can be set for the following +data elements: \code{auc_data}, \code{decision_curve_analyis}, \code{model_performance}, +\code{permutation_vimp}, \code{ice_data}, and \code{prediction_data}.} + \item{\code{aggregate_results}}{(\emph{optional}) Flag that signifies whether results +should be aggregated during evaluation. If \code{estimation_type} is +\code{bias_correction} or \code{bc}, aggregation leads to a single bias-corrected +estimate. If \code{estimation_type} is \code{bootstrap_confidence_interval} or \code{bci}, +aggregation leads to a single bias-corrected estimate with lower and upper +boundaries of the confidence interval. This has no effect if +\code{estimation_type} is \code{point}. + +The default value is equal to \code{TRUE} except when assessing metrics to assess +model performance, as the default violin plot requires underlying data. + +As with \code{detail_level} and \code{estimation_type}, a non-default +\code{aggregate_results} parameter can be specified for separate evaluation steps +by providing a parameter value in a named list with data elements, e.g. +\code{list("auc_data"=TRUE, , "model_performance"=FALSE)}. 
This parameter exists
+for the same elements as \code{estimation_type}.}
+    \item{\code{confidence_level}}{(\emph{optional}) Numeric value for the level at which
+confidence intervals are determined. If bootstrap estimation is used to
+determine the confidence intervals, \code{familiar} uses the
+rule of thumb \eqn{n = 20 / ci.level} to determine the number of required
+bootstraps.
+
+The default value is \code{0.95}.}
+    \item{\code{bootstrap_ci_method}}{(\emph{optional}) Method used to determine bootstrap
+confidence intervals (Efron and Hastie, 2016). The following methods are
+implemented:
+\itemize{
+\item \code{percentile} (default): Confidence intervals obtained using the percentile
+method.
+\item \code{bc}: Bias-corrected confidence intervals.
+}
+
+Note that the standard method is not implemented because this method is
+often not suitable due to non-normal distributions. The bias-corrected and
+accelerated (BCa) method is not implemented yet.}
+  }}
+}
+\value{
+\code{NULL} or list of plot objects, if \code{dir_path} is \code{NULL}.
+}
+\description{
+This function plots the data on permutation variable importance
+stored in a familiarCollection object.
+}
+\details{
+This function generates a horizontal barplot that lists features by
+the estimated model improvement over that of a dataset where the respective
+feature is randomly permuted.
+
+The following splitting variables are available for \code{split_by}, \code{color_by}
+and \code{facet_by}:
+\itemize{
+\item \code{fs_method}: feature selection methods.
+\item \code{learner}: learners.
+\item \code{data_set}: data sets.
+\item \code{metric}: the model performance metrics.
+\item \code{evaluation_time}: the evaluation times (survival outcomes only).
+\item \code{similarity_threshold}: the similarity threshold used to identify groups
+of features to permute simultaneously.
+}
+
+By default, the data are split by \code{fs_method}, \code{learner} and \code{metric},
+faceted by \code{data_set} and \code{evaluation_time}, and coloured by
+\code{similarity_threshold}.
+
+Available palettes for \code{discrete_palette} are those listed by
+\code{grDevices::palette.pals()} (requires R >= 4.0.0), \code{grDevices::hcl.pals()}
+(requires R >= 3.6.0) and \code{rainbow}, \code{heat.colors}, \code{terrain.colors},
+\code{topo.colors} and \code{cm.colors}, which correspond to the palettes of the same
+name in \code{grDevices}. If not specified, a default palette based on palettes
+in Tableau is used. You may also specify your own palette by using colour
+names listed by \code{grDevices::colors()} or through hexadecimal RGB strings.
+
+Labelling methods such as \code{set_fs_method_names} or \code{set_feature_names} can
+be applied to the \code{familiarCollection} object to update labels, and order
+the output in the figure.
+
+Bootstrap confidence intervals (if present) can be shown using various
+styles set by \code{conf_int_style}:
+\itemize{
+\item \code{point_line} (default): confidence intervals are shown as lines, on which
+the point estimate is likewise shown.
+\item \code{line}: confidence intervals are shown as lines, but the point
+estimate is not shown.
+\item \code{bar_line}: confidence intervals are shown as lines, with the point
+estimate shown as a bar plot with the opacity of \code{conf_int_alpha}.
+\item \code{none}: confidence intervals are not shown. The point estimate is shown as
+a bar plot.
+} + +For metrics where lower values indicate better model performance, more +negative permutation variable importance values indicate features that are +more important. Because this may cause confusion, values obtained for these +metrics are mirrored around 0.0 for plotting (but not any tabular data +export). +} diff --git a/man/plot_sample_clustering-methods.Rd b/man/plot_sample_clustering-methods.Rd new file mode 100644 index 00000000..830b0a28 --- /dev/null +++ b/man/plot_sample_clustering-methods.Rd @@ -0,0 +1,502 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/PlotSampleClustering.R +\name{plot_sample_clustering} +\alias{plot_sample_clustering} +\alias{plot_sample_clustering,ANY-method} +\alias{plot_sample_clustering,familiarCollection-method} +\title{Plot heatmaps for pairwise similarity between features.} +\usage{ +plot_sample_clustering( + object, + feature_cluster_method = waiver(), + feature_linkage_method = waiver(), + sample_cluster_method = waiver(), + sample_linkage_method = waiver(), + sample_limit = waiver(), + draw = FALSE, + dir_path = NULL, + split_by = NULL, + x_axis_by = NULL, + y_axis_by = NULL, + facet_by = NULL, + facet_wrap_cols = NULL, + ggtheme = NULL, + gradient_palette = NULL, + gradient_palette_range = waiver(), + outcome_palette = NULL, + outcome_palette_range = waiver(), + x_label = waiver(), + x_label_shared = "column", + y_label = waiver(), + y_label_shared = "row", + legend_label = waiver(), + outcome_legend_label = waiver(), + plot_title = waiver(), + plot_sub_title = waiver(), + caption = NULL, + x_range = NULL, + x_n_breaks = 3, + x_breaks = NULL, + y_range = NULL, + y_n_breaks = 3, + y_breaks = NULL, + rotate_x_tick_labels = waiver(), + show_feature_dendrogram = TRUE, + show_sample_dendrogram = TRUE, + show_normalised_data = TRUE, + show_outcome = TRUE, + dendrogram_height = grid::unit(1.5, "cm"), + outcome_height = grid::unit(0.3, "cm"), + evaluation_times = waiver(), + width = waiver(), + height = waiver(), + units = waiver(), + export_collection = FALSE, + verbose = TRUE, + ... +) + +\S4method{plot_sample_clustering}{ANY}( + object, + feature_cluster_method = waiver(), + feature_linkage_method = waiver(), + sample_cluster_method = waiver(), + sample_linkage_method = waiver(), + sample_limit = waiver(), + draw = FALSE, + dir_path = NULL, + split_by = NULL, + x_axis_by = NULL, + y_axis_by = NULL, + facet_by = NULL, + facet_wrap_cols = NULL, + ggtheme = NULL, + gradient_palette = NULL, + gradient_palette_range = waiver(), + outcome_palette = NULL, + outcome_palette_range = waiver(), + x_label = waiver(), + x_label_shared = "column", + y_label = waiver(), + y_label_shared = "row", + legend_label = waiver(), + outcome_legend_label = waiver(), + plot_title = waiver(), + plot_sub_title = waiver(), + caption = NULL, + x_range = NULL, + x_n_breaks = 3, + x_breaks = NULL, + y_range = NULL, + y_n_breaks = 3, + y_breaks = NULL, + rotate_x_tick_labels = waiver(), + show_feature_dendrogram = TRUE, + show_sample_dendrogram = TRUE, + show_normalised_data = TRUE, + show_outcome = TRUE, + dendrogram_height = grid::unit(1.5, "cm"), + outcome_height = grid::unit(0.3, "cm"), + evaluation_times = waiver(), + width = waiver(), + height = waiver(), + units = waiver(), + export_collection = FALSE, + verbose = TRUE, + ... 
+) + +\S4method{plot_sample_clustering}{familiarCollection}( + object, + feature_cluster_method = waiver(), + feature_linkage_method = waiver(), + sample_cluster_method = waiver(), + sample_linkage_method = waiver(), + sample_limit = waiver(), + draw = FALSE, + dir_path = NULL, + split_by = NULL, + x_axis_by = NULL, + y_axis_by = NULL, + facet_by = NULL, + facet_wrap_cols = NULL, + ggtheme = NULL, + gradient_palette = NULL, + gradient_palette_range = waiver(), + outcome_palette = NULL, + outcome_palette_range = waiver(), + x_label = waiver(), + x_label_shared = "column", + y_label = waiver(), + y_label_shared = "row", + legend_label = waiver(), + outcome_legend_label = waiver(), + plot_title = waiver(), + plot_sub_title = waiver(), + caption = NULL, + x_range = NULL, + x_n_breaks = 3, + x_breaks = NULL, + y_range = NULL, + y_n_breaks = 3, + y_breaks = NULL, + rotate_x_tick_labels = waiver(), + show_feature_dendrogram = TRUE, + show_sample_dendrogram = TRUE, + show_normalised_data = TRUE, + show_outcome = TRUE, + dendrogram_height = grid::unit(1.5, "cm"), + outcome_height = grid::unit(0.3, "cm"), + evaluation_times = waiver(), + width = waiver(), + height = waiver(), + units = waiver(), + export_collection = FALSE, + verbose = TRUE, + ... +) +} +\arguments{ +\item{object}{A \code{familiarCollection} object, or other other objects from which +a \code{familiarCollection} can be extracted. See details for more information.} + +\item{feature_cluster_method}{The method used to perform clustering. These are +the same methods as for the \code{cluster_method} configuration parameter: +\code{none}, \code{hclust}, \code{agnes}, \code{diana} and \code{pam}. + +\code{none} cannot be used when extracting data regarding mutual correlation or +feature expressions. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + +\item{feature_linkage_method}{The method used for agglomerative clustering in +\code{hclust} and \code{agnes}. These are the same methods as for the +\code{cluster_linkage_method} configuration parameter: \code{average}, \code{single}, +\code{complete}, \code{weighted}, and \code{ward}. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + +\item{sample_cluster_method}{The method used to perform clustering based on +distance between samples. These are the same methods as for the +\code{cluster_method} configuration parameter: \code{hclust}, \code{agnes}, \code{diana} and +\code{pam}. + +\code{none} cannot be used when extracting data for feature expressions. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + +\item{sample_linkage_method}{The method used for agglomerative clustering in +\code{hclust} and \code{agnes}. These are the same methods as for the +\code{cluster_linkage_method} configuration parameter: \code{average}, \code{single}, +\code{complete}, \code{weighted}, and \code{ward}. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + +\item{sample_limit}{(\emph{optional}) Set the upper limit of the number of samples +that are used during evaluation steps. Cannot be less than 20. + +This setting can be specified per data element by providing a parameter +value in a named list with data elements, e.g. 
+\code{list("sample_similarity"=100, "permutation_vimp"=1000)}. + +This parameter can be set for the following data elements: +\code{sample_similarity} and \code{ice_data}.} + +\item{draw}{(\emph{optional}) Draws the plot if TRUE.} + +\item{dir_path}{(\emph{optional}) Path to the directory where created performance +plots are saved to. Output is saved in the \code{feature_similarity} +subdirectory. If \code{NULL} no figures are saved, but are returned instead.} + +\item{split_by}{(\emph{optional}) Splitting variables. This refers to column names +on which datasets are split. A separate figure is created for each split. +See details for available variables.} + +\item{x_axis_by}{(\emph{optional}) Variable plotted along the x-axis of a plot. +The variable cannot overlap with variables provided to the \code{split_by} and +\code{y_axis_by} arguments (if used), but may overlap with other arguments. Only +one variable is allowed for this argument. See details for available +variables.} + +\item{y_axis_by}{(\emph{optional}) Variable plotted along the y-axis of a plot. +The variable cannot overlap with variables provided to the \code{split_by} and +\code{x_axis_by} arguments (if used), but may overlap with other arguments. Only +one variable is allowed for this argument. See details for available +variables.} + +\item{facet_by}{(\emph{optional}) Variables used to determine how and if facets of +each figure appear. In case the \code{facet_wrap_cols} argument is \code{NULL}, the +first variable is used to define columns, and the remaing variables are +used to define rows of facets. The variables cannot overlap with those +provided to the \code{split_by} argument, but may overlap with other arguments. +See details for available variables.} + +\item{facet_wrap_cols}{(\emph{optional}) Number of columns to generate when facet +wrapping. If NULL, a facet grid is produced instead.} + +\item{ggtheme}{(\emph{optional}) \code{ggplot} theme to use for plotting.} + +\item{gradient_palette}{(\emph{optional}) Sequential or divergent palette used to +colour the similarity or distance between features in a heatmap.} + +\item{gradient_palette_range}{(\emph{optional}) Numerical range used to span the +gradient. This should be a range of two values, e.g. \code{c(0, 1)}. Lower or +upper boundary can be unset by using \code{NA}. If not set, the full +metric-specific range is used.} + +\item{outcome_palette}{(\emph{optional}) Sequential (\code{continuous}, \code{count} +outcomes) or qualitative (other outcome types) palette used to show outcome +values. This argument is ignored if the outcome is not shown.} + +\item{outcome_palette_range}{(\emph{optional}) Numerical range used to span the +gradient of numeric (\code{continuous}, \code{count}) outcome values. This argument is +ignored for other outcome types or if the outcome is not shown.} + +\item{x_label}{(\emph{optional}) Label to provide to the x-axis. If NULL, no label +is shown.} + +\item{x_label_shared}{(\emph{optional}) Sharing of x-axis labels between facets. +One of three values: +\itemize{ +\item \code{overall}: A single label is placed at the bottom of the figure. Tick +text (but not the ticks themselves) is removed for all but the bottom facet +plot(s). +\item \code{column}: A label is placed at the bottom of each column. Tick text (but +not the ticks themselves) is removed for all but the bottom facet plot(s). +\item \code{individual}: A label is placed below each facet plot. Tick text is kept. 
+}}
+
+\item{y_label}{(\emph{optional}) Label to provide to the y-axis. If NULL, no label
+is shown.}
+
+\item{y_label_shared}{(\emph{optional}) Sharing of y-axis labels between facets.
+One of three values:
+\itemize{
+\item \code{overall}: A single label is placed to the left of the figure. Tick text
+(but not the ticks themselves) is removed for all but the left-most facet
+plot(s).
+\item \code{row}: A label is placed to the left of each row. Tick text (but not the
+ticks themselves) is removed for all but the left-most facet plot(s).
+\item \code{individual}: A label is placed below each facet plot. Tick text is kept.
+}}
+
+\item{legend_label}{(\emph{optional}) Label to provide to the legend. If NULL, the
+legend will not have a name.}
+
+\item{outcome_legend_label}{(\emph{optional}) Label to provide to the legend for
+outcome data. If NULL, the legend will not have a name. By default, \code{class},
+\code{value} and \code{event} are used for \code{binomial} and \code{multinomial}, \code{continuous}
+and \code{count}, and \code{survival} outcome types, respectively.}
+
+\item{plot_title}{(\emph{optional}) Label to provide as figure title. If NULL, no
+title is shown.}
+
+\item{plot_sub_title}{(\emph{optional}) Label to provide as figure subtitle. If
+NULL, no subtitle is shown.}
+
+\item{caption}{(\emph{optional}) Label to provide as figure caption. If NULL, no
+caption is shown.}
+
+\item{x_range}{(\emph{optional}) Value range for the x-axis.}
+
+\item{x_n_breaks}{(\emph{optional}) Number of breaks to show on the x-axis of the
+plot. \code{x_n_breaks} is used to determine the \code{x_breaks} argument in case it
+is unset.}
+
+\item{x_breaks}{(\emph{optional}) Break points on the x-axis of the plot.}
+
+\item{y_range}{(\emph{optional}) Value range for the y-axis.}
+
+\item{y_n_breaks}{(\emph{optional}) Number of breaks to show on the y-axis of the
+plot. \code{y_n_breaks} is used to determine the \code{y_breaks} argument in case it
+is unset.}
+
+\item{y_breaks}{(\emph{optional}) Break points on the y-axis of the plot.}
+
+\item{rotate_x_tick_labels}{(\emph{optional}) Rotate tick labels on the x-axis by
+90 degrees. Defaults to \code{TRUE}. Rotation of x-axis tick labels may also be
+controlled through the \code{ggtheme}. In this case, \code{FALSE} should be provided
+explicitly.}
+
+\item{show_feature_dendrogram}{(\emph{optional}) Show feature dendrogram around the
+main panel. Can be \code{TRUE}, \code{FALSE}, \code{NULL}, or a position, i.e. \code{top},
+\code{bottom}, \code{left} and \code{right}.
+
+If a position is specified, it should be appropriate with regard to the
+\code{x_axis_by} or \code{y_axis_by} argument. If \code{x_axis_by} is \code{feature} (default),
+the only valid positions are \code{top} (default) and \code{bottom}. Alternatively, if
+\code{y_axis_by} is \code{feature}, the only valid positions are \code{right} (default) and
+\code{left}.
+
+A dendrogram can only be drawn from cluster methods that produce
+dendrograms, such as \code{hclust}. A dendrogram can for example not be
+constructed using the partitioning around medoids method (\code{pam}).}
+
+\item{show_sample_dendrogram}{(\emph{optional}) Show sample dendrogram around the
+main panel. Can be \code{TRUE}, \code{FALSE}, \code{NULL}, or a position, i.e. \code{top},
+\code{bottom}, \code{left} and \code{right}.
+
+If a position is specified, it should be appropriate with regard to the
+\code{x_axis_by} or \code{y_axis_by} argument. If \code{y_axis_by} is \code{sample} (default),
+the only valid positions are \code{right} (default) and \code{left}. Alternatively, if
+\code{x_axis_by} is \code{sample}, the only valid positions are \code{top} (default) and
+\code{bottom}.
+
+A dendrogram can only be drawn from cluster methods that produce
+dendrograms, such as \code{hclust}. A dendrogram can for example not be
+constructed using the partitioning around medoids method (\code{pam}).}
+
+\item{show_normalised_data}{(\emph{optional}) Flag that determines whether the data
+shown in the main heatmap is normalised using the same settings as within
+the analysis (\code{fixed}; default), using a standardisation method
+(\code{set_normalisation}) that is applied separately to each dataset, or not at
+all (\code{none}), which shows the data at the original scale, albeit with
+batch-corrections.
+
+Categorical variables are plotted to span 90\% of the entire numerical value
+range, i.e. the levels of categorical variables with 2 levels are
+represented at 5\% and 95\% of the range, with 3 levels at 5\%, 50\%, and 95\%,
+etc.}
+
+\item{show_outcome}{(\emph{optional}) Show outcome column(s) or row(s) in the
+graph. Can be \code{TRUE}, \code{FALSE}, \code{NULL} or a position, i.e. \code{top}, \code{bottom},
+\code{left} and \code{right}.
+
+If a position is specified, it should be appropriate with regard to the
+\code{x_axis_by} or \code{y_axis_by} argument. If \code{y_axis_by} is \code{sample} (default),
+the only valid positions are \code{left} (default) and \code{right}. Alternatively, if
+\code{x_axis_by} is \code{sample}, the only valid positions are \code{top} (default) and
+\code{bottom}.
+
+The outcome data will be drawn between the main panel and the sample
+dendrogram (if any).}
+
+\item{dendrogram_height}{(\emph{optional}) Height of the dendrogram. The height is
+1.5 cm by default. Height is expected to be a grid unit (see \code{grid::unit}),
+which also allows for specifying relative heights.}
+
+\item{outcome_height}{(\emph{optional}) Height of an outcome data column/row. The
+height is 0.3 cm by default. Height is expected to be a grid unit (see
+\code{grid::unit}), which also allows for specifying relative heights. In case of
+\code{survival} outcome data with multiple \code{evaluation_times}, this height is
+multiplied by the number of time points.}
+
+\item{evaluation_times}{(\emph{optional}) Times at which the event status of
+time-to-event survival outcomes are determined. Only used for \code{survival}
+outcome. If not specified, the values used when creating the underlying
+\code{familiarData} objects are used.}
+
+\item{width}{(\emph{optional}) Width of the plot. A default value is derived from
+the number of facets.}
+
+\item{height}{(\emph{optional}) Height of the plot. A default value is derived from
+the number of features and the number of facets.}
+
+\item{units}{(\emph{optional}) Plot size unit. Either \code{cm} (default), \code{mm} or \verb{in}.}
+
+\item{export_collection}{(\emph{optional}) Exports the collection if TRUE.}
+
+\item{verbose}{Flag to indicate whether feedback should be provided for the
+plotting.}
+
+\item{...}{
+ Arguments passed on to \code{\link[=as_familiar_collection]{as_familiar_collection}}, \code{\link[ggplot2:ggsave]{ggplot2::ggsave}}, \code{\link[=extract_feature_expression]{extract_feature_expression}}
+ \describe{
+ \item{\code{familiar_data_names}}{Names of the dataset(s).
Only used if the \code{object} parameter +is one or more \code{familiarData} objects.} + \item{\code{collection_name}}{Name of the collection.} + \item{\code{filename}}{File name to create on disk.} + \item{\code{plot}}{Plot to save, defaults to last plot displayed.} + \item{\code{device}}{Device to use. Can either be a device function +(e.g. \link{png}), or one of "eps", "ps", "tex" (pictex), +"pdf", "jpeg", "tiff", "png", "bmp", "svg" or "wmf" (windows only). If +\code{NULL} (default), the device is guessed based on the \code{filename} extension.} + \item{\code{path}}{Path of the directory to save plot to: \code{path} and \code{filename} +are combined to create the fully qualified file name. Defaults to the +working directory.} + \item{\code{scale}}{Multiplicative scaling factor.} + \item{\code{dpi}}{Plot resolution. Also accepts a string input: "retina" (320), +"print" (300), or "screen" (72). Applies only to raster output types.} + \item{\code{limitsize}}{When \code{TRUE} (the default), \code{ggsave()} will not +save images larger than 50x50 inches, to prevent the common error of +specifying dimensions in pixels.} + \item{\code{bg}}{Background colour. If \code{NULL}, uses the \code{plot.background} fill value +from the plot theme.} + \item{\code{create.dir}}{Whether to create new directories if a non-existing +directory is specified in the \code{filename} or \code{path} (\code{TRUE}) or return an +error (\code{FALSE}, default). If \code{FALSE} and run in an interactive session, +a prompt will appear asking to create a new directory when necessary.} + \item{\code{feature_similarity}}{Table containing pairwise distance between +sample. This is used to determine cluster information, and indicate which +samples are similar. The table is created by the +\code{extract_sample_similarity} method.} + \item{\code{data}}{A \code{dataObject} object, \code{data.table} or \code{data.frame} that +constitutes the data that are assessed.} + \item{\code{feature_similarity_metric}}{Metric to determine pairwise similarity +between features. Similarity is computed in the same manner as for +clustering, and \code{feature_similarity_metric} therefore has the same options +as \code{cluster_similarity_metric}: \code{mcfadden_r2}, \code{cox_snell_r2}, +\code{nagelkerke_r2}, \code{spearman}, \code{kendall} and \code{pearson}. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + \item{\code{sample_similarity_metric}}{Metric to determine pairwise similarity +between samples. Similarity is computed in the same manner as for +clustering, but \code{sample_similarity_metric} has different options that are +better suited to computing distance between samples instead of between +features: \code{gower}, \code{euclidean}. + +The underlying feature data is scaled to the \eqn{[0, 1]} range (for +numerical features) using the feature values across the samples. The +normalisation parameters required can optionally be computed from feature +data with the outer 5\% (on both sides) of feature values trimmed or +winsorised. To do so append \verb{_trim} (trimming) or \verb{_winsor} (winsorising) to +the metric name. This reduces the effect of outliers somewhat. 
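+
+For example, \code{sample_similarity_metric="gower_winsor"} computes Gower's
+distance using normalisation parameters derived from winsorised feature
+values (an illustrative combination; any of the metrics listed above can be
+suffixed in the same way).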
+
+If not provided explicitly, this parameter is read from settings used at
+creation of the underlying \code{familiarModel} objects.}
+ \item{\code{message_indent}}{Number of indentation steps for messages shown during
+computation and extraction of various data elements.}
+ }}
+}
+\value{
+\code{NULL} or list of plot objects, if \code{dir_path} is \code{NULL}.
+}
+\description{
+This method creates a heatmap based on data stored in a
+\code{familiarCollection} object. Features in the heatmap are ordered so that
+more similar features appear together.
+}
+\details{
+This function generates heatmaps that show feature values for individual
+samples, with features along one axis and samples along the other.
+
+Available splitting variables are: \code{fs_method}, \code{learner}, and \code{data_set}.
+By default, the data is split by \code{fs_method}, \code{learner} and \code{data_set},
+since the number of samples will typically differ between data sets, even
+for the same feature selection method and learner.
+
+The \code{x_axis_by} and \code{y_axis_by} arguments determine what data are shown
+along which axis. Each argument takes one of \code{feature} and \code{sample}, and
+both arguments should be unique. By default, features are shown along the
+x-axis and samples along the y-axis.
+
+Note that similarity is determined based on the underlying data. Hence the
+ordering of features may differ between facets, and tick labels are
+maintained for each panel.
+
+Available palettes for \code{gradient_palette} are those listed by
+\code{grDevices::palette.pals()} (requires R >= 4.0.0), \code{grDevices::hcl.pals()}
+(requires R >= 3.6.0) and \code{rainbow}, \code{heat.colors}, \code{terrain.colors},
+\code{topo.colors} and \code{cm.colors}, which correspond to the palettes of the same
+name in \code{grDevices}. If not specified, a default palette based on palettes
+in Tableau is used. You may also specify your own palette by using colour
+names listed by \code{grDevices::colors()} or through hexadecimal RGB strings.
+
+Labeling methods such as \code{set_fs_method_names} or \code{set_data_set_names} can
+be applied to the \code{familiarCollection} object to update labels, and order
+the output in the figure.
+}
diff --git a/man/plot_univariate_importance-methods.Rd b/man/plot_univariate_importance-methods.Rd
new file mode 100644
index 00000000..be6a3134
--- /dev/null
+++ b/man/plot_univariate_importance-methods.Rd
@@ -0,0 +1,332 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/PlotUnivariateImportance.R
+\name{plot_univariate_importance}
+\alias{plot_univariate_importance}
+\alias{plot_univariate_importance,ANY-method}
+\alias{plot_univariate_importance,familiarCollection-method}
+\title{Plot univariate importance.}
+\usage{
+plot_univariate_importance(
+  object,
+  feature_cluster_method = waiver(),
+  feature_linkage_method = waiver(),
+  feature_cluster_cut_method = waiver(),
+  feature_similarity_threshold = waiver(),
+  draw = FALSE,
+  dir_path = NULL,
+  p_adjustment_method = waiver(),
+  split_by = NULL,
+  color_by = NULL,
+  facet_by = NULL,
+  facet_wrap_cols = NULL,
+  show_cluster = TRUE,
+  ggtheme = NULL,
+  discrete_palette = NULL,
+  gradient_palette = waiver(),
+  x_label = waiver(),
+  y_label = "feature",
+  legend_label = waiver(),
+  plot_title = waiver(),
+  plot_sub_title = waiver(),
+  caption = NULL,
+  x_range = NULL,
+  x_n_breaks = 5,
+  x_breaks = NULL,
+  significance_level_shown = 0.05,
+  width = waiver(),
+  height = waiver(),
+  units = waiver(),
+  verbose = TRUE,
+  export_collection = FALSE,
+  ...
+) + +\S4method{plot_univariate_importance}{ANY}( + object, + feature_cluster_method = waiver(), + feature_linkage_method = waiver(), + feature_cluster_cut_method = waiver(), + feature_similarity_threshold = waiver(), + draw = FALSE, + dir_path = NULL, + p_adjustment_method = waiver(), + split_by = NULL, + color_by = NULL, + facet_by = NULL, + facet_wrap_cols = NULL, + show_cluster = TRUE, + ggtheme = NULL, + discrete_palette = NULL, + gradient_palette = waiver(), + x_label = waiver(), + y_label = "feature", + legend_label = waiver(), + plot_title = waiver(), + plot_sub_title = waiver(), + caption = NULL, + x_range = NULL, + x_n_breaks = 5, + x_breaks = NULL, + significance_level_shown = 0.05, + width = waiver(), + height = waiver(), + units = waiver(), + verbose = TRUE, + export_collection = FALSE, + ... +) + +\S4method{plot_univariate_importance}{familiarCollection}( + object, + feature_cluster_method = waiver(), + feature_linkage_method = waiver(), + feature_cluster_cut_method = waiver(), + feature_similarity_threshold = waiver(), + draw = FALSE, + dir_path = NULL, + p_adjustment_method = waiver(), + split_by = NULL, + color_by = NULL, + facet_by = NULL, + facet_wrap_cols = NULL, + show_cluster = TRUE, + ggtheme = NULL, + discrete_palette = NULL, + gradient_palette = waiver(), + x_label = waiver(), + y_label = "feature", + legend_label = waiver(), + plot_title = waiver(), + plot_sub_title = waiver(), + caption = NULL, + x_range = NULL, + x_n_breaks = 5, + x_breaks = NULL, + significance_level_shown = 0.05, + width = waiver(), + height = waiver(), + units = waiver(), + verbose = TRUE, + export_collection = FALSE, + ... +) +} +\arguments{ +\item{object}{A \code{familiarCollection} object, or other other objects from which +a \code{familiarCollection} can be extracted. See details for more information.} + +\item{feature_cluster_method}{The method used to perform clustering. These are +the same methods as for the \code{cluster_method} configuration parameter: +\code{none}, \code{hclust}, \code{agnes}, \code{diana} and \code{pam}. + +\code{none} cannot be used when extracting data regarding mutual correlation or +feature expressions. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + +\item{feature_linkage_method}{The method used for agglomerative clustering in +\code{hclust} and \code{agnes}. These are the same methods as for the +\code{cluster_linkage_method} configuration parameter: \code{average}, \code{single}, +\code{complete}, \code{weighted}, and \code{ward}. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + +\item{feature_cluster_cut_method}{The method used to divide features into +separate clusters. The available methods are the same as for the +\code{cluster_cut_method} configuration parameter: \code{silhouette}, \code{fixed_cut} and +\code{dynamic_cut}. + +\code{silhouette} is available for all cluster methods, but \code{fixed_cut} only +applies to methods that create hierarchical trees (\code{hclust}, \code{agnes} and +\code{diana}). \code{dynamic_cut} requires the \code{dynamicTreeCut} package and can only +be used with \code{agnes} and \code{hclust}. 
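+
+For example (the threshold value is purely illustrative), setting
+\code{feature_cluster_method="hclust"}, \code{feature_cluster_cut_method="fixed_cut"}
+and \code{feature_similarity_threshold=0.8} cuts the hierarchical tree at a
+fixed pairwise similarity of 0.8 to form the feature clusters shown in the
+plot.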
+ +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + +\item{feature_similarity_threshold}{The threshold level for pair-wise +similarity that is required to form feature clusters with the \code{fixed_cut} +method. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + +\item{draw}{(\emph{optional}) Draws the plot if TRUE.} + +\item{dir_path}{(\emph{optional}) Path to the directory where created figures are +saved to. Output is saved in the \code{variable_importance} subdirectory. If NULL +no figures are saved, but are returned instead.} + +\item{p_adjustment_method}{(\emph{optional}) Indicates type of p-value that is +shown. One of \code{holm}, \code{hochberg}, \code{hommel}, \code{bonferroni}, \code{BH}, \code{BY}, \code{fdr}, +\code{none}, \code{p_value} or \code{q_value} for adjusted p-values, uncorrected p-values +and q-values. q-values may not be available.} + +\item{split_by}{(\emph{optional}) Splitting variables. This refers to column names +on which datasets are split. A separate figure is created for each split. +See details for available variables.} + +\item{color_by}{(\emph{optional}) Variables used to determine fill colour of plot +objects. The variables cannot overlap with those provided to the \code{split_by} +argument, but may overlap with other arguments. See details for available +variables.} + +\item{facet_by}{(\emph{optional}) Variables used to determine how and if facets of +each figure appear. In case the \code{facet_wrap_cols} argument is \code{NULL}, the +first variable is used to define columns, and the remaing variables are +used to define rows of facets. The variables cannot overlap with those +provided to the \code{split_by} argument, but may overlap with other arguments. +See details for available variables.} + +\item{facet_wrap_cols}{(\emph{optional}) Number of columns to generate when facet +wrapping. If NULL, a facet grid is produced instead.} + +\item{show_cluster}{(\emph{optional}) Show which features were clustered together.} + +\item{ggtheme}{(\emph{optional}) \code{ggplot} theme to use for plotting.} + +\item{discrete_palette}{(\emph{optional}) Palette used to fill the bars in case a +non-singular variable was provided to the \code{color_by} argument.} + +\item{gradient_palette}{(\emph{optional}) Palette to use for filling the bars in +case the \code{color_by} argument is not set. The bars are then coloured +according to their importance. By default, no gradient is used, and the bars +are not filled according to importance. Use \code{NULL} to fill the bars using +the default palette in \code{familiar}.} + +\item{x_label}{(\emph{optional}) Label to provide to the x-axis. If NULL, no label +is shown.} + +\item{y_label}{(\emph{optional}) Label to provide to the y-axis. If NULL, no label +is shown.} + +\item{legend_label}{(\emph{optional}) Label to provide to the legend. If NULL, the +legend will not have a name.} + +\item{plot_title}{(\emph{optional}) Label to provide as figure title. If NULL, no +title is shown.} + +\item{plot_sub_title}{(\emph{optional}) Label to provide as figure subtitle. If +NULL, no subtitle is shown.} + +\item{caption}{(\emph{optional}) Label to provide as figure caption. If NULL, no +caption is shown.} + +\item{x_range}{(\emph{optional}) Value range for the x-axis.} + +\item{x_n_breaks}{(\emph{optional}) Number of breaks to show on the x-axis of the +plot. 
\code{x_n_breaks} is used to determine the \code{x_breaks} argument in case it +is unset.} + +\item{x_breaks}{(\emph{optional}) Break points on the x-axis of the plot.} + +\item{significance_level_shown}{Position(s) to draw vertical lines indicating +a significance level, e.g. 0.05. Can be NULL to not draw anything.} + +\item{width}{(\emph{optional}) Width of the plot. A default value is derived from +the number of facets.} + +\item{height}{(\emph{optional}) Height of the plot. A default value is derived from +the number of features and the number of facets.} + +\item{units}{(\emph{optional}) Plot size unit. Either \code{cm} (default), \code{mm} or \verb{in}.} + +\item{verbose}{Flag to indicate whether feedback should be provided for the +plotting.} + +\item{export_collection}{(\emph{optional}) Exports the collection if TRUE.} + +\item{...}{ + Arguments passed on to \code{\link[=as_familiar_collection]{as_familiar_collection}}, \code{\link[ggplot2:ggsave]{ggplot2::ggsave}}, \code{\link[=extract_univariate_analysis]{extract_univariate_analysis}} + \describe{ + \item{\code{familiar_data_names}}{Names of the dataset(s). Only used if the \code{object} parameter +is one or more \code{familiarData} objects.} + \item{\code{collection_name}}{Name of the collection.} + \item{\code{filename}}{File name to create on disk.} + \item{\code{plot}}{Plot to save, defaults to last plot displayed.} + \item{\code{device}}{Device to use. Can either be a device function +(e.g. \link{png}), or one of "eps", "ps", "tex" (pictex), +"pdf", "jpeg", "tiff", "png", "bmp", "svg" or "wmf" (windows only). If +\code{NULL} (default), the device is guessed based on the \code{filename} extension.} + \item{\code{path}}{Path of the directory to save plot to: \code{path} and \code{filename} +are combined to create the fully qualified file name. Defaults to the +working directory.} + \item{\code{scale}}{Multiplicative scaling factor.} + \item{\code{dpi}}{Plot resolution. Also accepts a string input: "retina" (320), +"print" (300), or "screen" (72). Applies only to raster output types.} + \item{\code{limitsize}}{When \code{TRUE} (the default), \code{ggsave()} will not +save images larger than 50x50 inches, to prevent the common error of +specifying dimensions in pixels.} + \item{\code{bg}}{Background colour. If \code{NULL}, uses the \code{plot.background} fill value +from the plot theme.} + \item{\code{create.dir}}{Whether to create new directories if a non-existing +directory is specified in the \code{filename} or \code{path} (\code{TRUE}) or return an +error (\code{FALSE}, default). If \code{FALSE} and run in an interactive session, +a prompt will appear asking to create a new directory when necessary.} + \item{\code{data}}{A \code{dataObject} object, \code{data.table} or \code{data.frame} that +constitutes the data that are assessed.} + \item{\code{cl}}{Cluster created using the \code{parallel} package. This cluster is then +used to speed up computation through parallellisation.} + \item{\code{feature_similarity_metric}}{Metric to determine pairwise similarity +between features. Similarity is computed in the same manner as for +clustering, and \code{feature_similarity_metric} therefore has the same options +as \code{cluster_similarity_metric}: \code{mcfadden_r2}, \code{cox_snell_r2}, +\code{nagelkerke_r2}, \code{spearman}, \code{kendall} and \code{pearson}. 
+ +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + \item{\code{icc_type}}{String indicating the type of intraclass correlation +coefficient (\code{1}, \code{2} or \code{3}) that should be used to compute robustness for +features in repeated measurements during the evaluation of univariate +importance. These types correspond to the types in Shrout and Fleiss (1979). +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + \item{\code{message_indent}}{Number of indentation steps for messages shown during +computation and extraction of various data elements.} + }} +} +\value{ +\code{NULL} or list of plot objects, if \code{dir_path} is \code{NULL}. +} +\description{ +This function plots the univariate analysis data stored in a +familiarCollection object. +} +\details{ +This function generates a horizontal barplot with the length of the +bars corresponding to the 10-logarithm of the (multiple-testing corrected) +p-value or q-value. + +Features are assessed univariately using one-sample location t-tests after +fitting a suitable regression model. The fitted model coefficient and the +covariance matrix are then used to compute a p-value. + +The following splitting variables are available for \code{split_by}, \code{color_by} +and \code{facet_by}: +\itemize{ +\item \code{fs_method}: feature selection methods +\item \code{learner}: learners +\item \code{data_set}: data sets +} + +Unlike for plots of feature ranking in feature selection and after modelling +(as assessed by model-specific routines), clusters of features are now found +during creation of underlying \code{familiarData} objects, instead of through +consensus clustering. Hence, clustering results may differ due to +differences in the underlying datasets. + +Available palettes for \code{discrete_palette} and \code{gradient_palette} are those +listed by \code{grDevices::palette.pals()} (requires R >= 4.0.0), +\code{grDevices::hcl.pals()} (requires R >= 3.6.0) and \code{rainbow}, \code{heat.colors}, +\code{terrain.colors}, \code{topo.colors} and \code{cm.colors}, which correspond to the +palettes of the same name in \code{grDevices}. If not specified, a default +palette based on palettes in Tableau are used. You may also specify your own +palette by using colour names listed by \code{grDevices::colors()} or through +hexadecimal RGB strings. + +Labelling methods such as \code{set_fs_method_names} or \code{set_feature_names} can +be applied to the \code{familiarCollection} object to update labels, and order +the output in the figure. 
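+
+A minimal usage sketch is shown below. It assumes that \code{fam_collection} is
+a \code{familiarCollection} object obtained elsewhere, for example through
+\code{as_familiar_collection}; the argument values are illustrative.
+
+\preformatted{
+library(familiar)
+
+# Plot univariate importance with FDR-adjusted p-values and return the plot
+# objects instead of writing figures to disk (dir_path is NULL by default).
+plots <- plot_univariate_importance(
+  object = fam_collection,
+  p_adjustment_method = "fdr",
+  significance_level_shown = 0.05,
+  draw = TRUE)
+}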
+} diff --git a/man/plot_variable_importance-methods.Rd b/man/plot_variable_importance-methods.Rd new file mode 100644 index 00000000..170d4e80 --- /dev/null +++ b/man/plot_variable_importance-methods.Rd @@ -0,0 +1,356 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/PlotFeatureRanking.R +\name{plot_variable_importance} +\alias{plot_variable_importance} +\alias{plot_variable_importance,ANY-method} +\alias{plot_variable_importance,familiarCollection-method} +\alias{plot_feature_selection_occurrence} +\alias{plot_feature_selection_variable_importance} +\alias{plot_model_signature_occurrence} +\alias{plot_model_signature_variable_importance} +\title{Plot variable importance scores of features during feature selection or +after training a model.} +\usage{ +plot_variable_importance( + object, + type, + feature_cluster_method = waiver(), + feature_linkage_method = waiver(), + feature_cluster_cut_method = waiver(), + feature_similarity_threshold = waiver(), + aggregation_method = waiver(), + rank_threshold = waiver(), + draw = FALSE, + dir_path = NULL, + split_by = NULL, + color_by = NULL, + facet_by = NULL, + facet_wrap_cols = NULL, + show_cluster = TRUE, + ggtheme = NULL, + discrete_palette = NULL, + gradient_palette = waiver(), + x_label = "feature", + rotate_x_tick_labels = waiver(), + y_label = waiver(), + legend_label = waiver(), + plot_title = waiver(), + plot_sub_title = waiver(), + caption = NULL, + y_range = NULL, + y_n_breaks = 5, + y_breaks = NULL, + width = waiver(), + height = waiver(), + units = waiver(), + export_collection = FALSE, + ... +) + +\S4method{plot_variable_importance}{ANY}( + object, + type, + feature_cluster_method = waiver(), + feature_linkage_method = waiver(), + feature_cluster_cut_method = waiver(), + feature_similarity_threshold = waiver(), + aggregation_method = waiver(), + rank_threshold = waiver(), + draw = FALSE, + dir_path = NULL, + split_by = NULL, + color_by = NULL, + facet_by = NULL, + facet_wrap_cols = NULL, + show_cluster = TRUE, + ggtheme = NULL, + discrete_palette = NULL, + gradient_palette = waiver(), + x_label = "feature", + rotate_x_tick_labels = waiver(), + y_label = waiver(), + legend_label = waiver(), + plot_title = waiver(), + plot_sub_title = waiver(), + caption = NULL, + y_range = NULL, + y_n_breaks = 5, + y_breaks = NULL, + width = waiver(), + height = waiver(), + units = waiver(), + export_collection = FALSE, + ... +) + +\S4method{plot_variable_importance}{familiarCollection}( + object, + type, + feature_cluster_method = waiver(), + feature_linkage_method = waiver(), + feature_cluster_cut_method = waiver(), + feature_similarity_threshold = waiver(), + aggregation_method = waiver(), + rank_threshold = waiver(), + draw = FALSE, + dir_path = NULL, + split_by = NULL, + color_by = NULL, + facet_by = NULL, + facet_wrap_cols = NULL, + show_cluster = TRUE, + ggtheme = NULL, + discrete_palette = NULL, + gradient_palette = waiver(), + x_label = "feature", + rotate_x_tick_labels = waiver(), + y_label = waiver(), + legend_label = waiver(), + plot_title = waiver(), + plot_sub_title = waiver(), + caption = NULL, + y_range = NULL, + y_n_breaks = 5, + y_breaks = NULL, + width = waiver(), + height = waiver(), + units = waiver(), + export_collection = FALSE, + ... +) + +plot_feature_selection_occurrence(...) + +plot_feature_selection_variable_importance(...) + +plot_model_signature_occurrence(...) + +plot_model_signature_variable_importance(...) 
+} +\arguments{ +\item{object}{A \code{familiarCollection} object, or other other objects from which +a \code{familiarCollection} can be extracted. See details for more information.} + +\item{type}{Determine what variable importance should be shown. Can be +\code{feature_selection} or \code{model} for the variable importance after the +feature selection step and after the model training step, respectively.} + +\item{feature_cluster_method}{The method used to perform clustering. These are +the same methods as for the \code{cluster_method} configuration parameter: +\code{none}, \code{hclust}, \code{agnes}, \code{diana} and \code{pam}. + +\code{none} cannot be used when extracting data regarding mutual correlation or +feature expressions. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + +\item{feature_linkage_method}{The method used for agglomerative clustering in +\code{hclust} and \code{agnes}. These are the same methods as for the +\code{cluster_linkage_method} configuration parameter: \code{average}, \code{single}, +\code{complete}, \code{weighted}, and \code{ward}. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + +\item{feature_cluster_cut_method}{The method used to divide features into +separate clusters. The available methods are the same as for the +\code{cluster_cut_method} configuration parameter: \code{silhouette}, \code{fixed_cut} and +\code{dynamic_cut}. + +\code{silhouette} is available for all cluster methods, but \code{fixed_cut} only +applies to methods that create hierarchical trees (\code{hclust}, \code{agnes} and +\code{diana}). \code{dynamic_cut} requires the \code{dynamicTreeCut} package and can only +be used with \code{agnes} and \code{hclust}. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + +\item{feature_similarity_threshold}{The threshold level for pair-wise +similarity that is required to form feature clusters with the \code{fixed_cut} +method. + +If not provided explicitly, this parameter is read from settings used at +creation of the underlying \code{familiarModel} objects.} + +\item{aggregation_method}{(\emph{optional}) The method used to aggregate variable +importances over different data subsets, e.g. bootstraps. The following +methods can be selected: +\itemize{ +\item \code{mean} (default): Use the mean rank of a feature over the subsets to +determine the aggregated feature rank. +\item \code{median}: Use the median rank of a feature over the subsets to determine +the aggregated feature rank. +\item \code{best}: Use the best rank the feature obtained in any subset to determine +the aggregated feature rank. +\item \code{worst}: Use the worst rank the feature obtained in any subset to +determine the aggregated feature rank. +\item \code{stability}: Use the frequency of the feature being in the subset of +highly ranked features as measure for the aggregated feature rank +(Meinshausen and Buehlmann, 2010). +\item \code{exponential}: Use a rank-weighted frequence of occurrence in the subset +of highly ranked features as measure for the aggregated feature rank (Haury +et al., 2011). +\item \code{borda}: Use the borda count as measure for the aggregated feature rank +(Wald et al., 2012). 
+\item \code{enhanced_borda}: Use an occurrence frequency-weighted borda count as +measure for the aggregated feature rank (Wald et al., 2012). +\item \code{truncated_borda}: Use borda count computed only on features within the +subset of highly ranked features. +\item \code{enhanced_truncated_borda}: Apply both the enhanced borda method and the +truncated borda method and use the resulting borda count as the aggregated +feature rank. +}} + +\item{rank_threshold}{(\emph{optional}) The threshold used to define the subset of +highly important features. If not set, this threshold is determined by +maximising the variance in the occurrence value over all features over the +subset size. + +This parameter is only relevant for \code{stability}, \code{exponential}, +\code{enhanced_borda}, \code{truncated_borda} and \code{enhanced_truncated_borda} methods.} + +\item{draw}{(\emph{optional}) Draws the plot if TRUE.} + +\item{dir_path}{(\emph{optional}) Path to the directory where created figures are +saved to. Output is saved in the \code{variable_importance} subdirectory. If +\code{NULL} no figures are saved, but are returned instead.} + +\item{split_by}{(\emph{optional}) Splitting variables. This refers to column names +on which datasets are split. A separate figure is created for each split. +See details for available variables.} + +\item{color_by}{(\emph{optional}) Variables used to determine fill colour of plot +objects. The variables cannot overlap with those provided to the \code{split_by} +argument, but may overlap with other arguments. See details for available +variables.} + +\item{facet_by}{(\emph{optional}) Variables used to determine how and if facets of +each figure appear. In case the \code{facet_wrap_cols} argument is \code{NULL}, the +first variable is used to define columns, and the remaing variables are +used to define rows of facets. The variables cannot overlap with those +provided to the \code{split_by} argument, but may overlap with other arguments. +See details for available variables.} + +\item{facet_wrap_cols}{(\emph{optional}) Number of columns to generate when facet +wrapping. If NULL, a facet grid is produced instead.} + +\item{show_cluster}{(\emph{optional}) Show which features were clustered together. +Currently not available in combination with variable importance obtained +during feature selection.} + +\item{ggtheme}{(\emph{optional}) \code{ggplot} theme to use for plotting.} + +\item{discrete_palette}{(\emph{optional}) Palette to use for coloring bar plots, in +case a non-singular variable was provided to the \code{color_by} argument.} + +\item{gradient_palette}{(\emph{optional}) Palette to use for filling the bars in +case the \code{color_by} argument is not set. The bars are then coloured +according to the occurrence of features. By default, no gradient is used, +and the bars are not filled according to occurrence. Use \code{NULL} to fill the +bars using the default palette in \code{familiar}.} + +\item{x_label}{(\emph{optional}) Label to provide to the x-axis. If NULL, no label +is shown.} + +\item{rotate_x_tick_labels}{(\emph{optional}) Rotate tick labels on the x-axis by +90 degrees. Defaults to \code{TRUE}. Rotation of x-axis tick labels may also be +controlled through the \code{ggtheme}. In this case, \code{FALSE} should be provided +explicitly.} + +\item{y_label}{(\emph{optional}) Label to provide to the y-axis. If NULL, no label +is shown.} + +\item{legend_label}{(\emph{optional}) Label to provide to the legend. 
If NULL, the +legend will not have a name.} + +\item{plot_title}{(\emph{optional}) Label to provide as figure title. If NULL, no +title is shown.} + +\item{plot_sub_title}{(\emph{optional}) Label to provide as figure subtitle. If +NULL, no subtitle is shown.} + +\item{caption}{(\emph{optional}) Label to provide as figure caption. If NULL, no +caption is shown.} + +\item{y_range}{(\emph{optional}) Value range for the y-axis.} + +\item{y_n_breaks}{(\emph{optional}) Number of breaks to show on the y-axis of the +plot. \code{y_n_breaks} is used to determine the \code{y_breaks} argument in case it +is unset.} + +\item{y_breaks}{(\emph{optional}) Break points on the y-axis of the plot.} + +\item{width}{(\emph{optional}) Width of the plot. A default value is derived from +the number of facets and the number of features.} + +\item{height}{(\emph{optional}) Height of the plot. A default value is derived from +number of facets, and the length of the longest feature name (if +\code{rotate_x_tick_labels} is \code{TRUE}).} + +\item{units}{(\emph{optional}) Plot size unit. Either \code{cm} (default), \code{mm} or \verb{in}.} + +\item{export_collection}{(\emph{optional}) Exports the collection if TRUE.} + +\item{...}{ + Arguments passed on to \code{\link[=as_familiar_collection]{as_familiar_collection}}, \code{\link[ggplot2:ggsave]{ggplot2::ggsave}}, \code{\link[=extract_fs_vimp]{extract_fs_vimp}} + \describe{ + \item{\code{familiar_data_names}}{Names of the dataset(s). Only used if the \code{object} parameter +is one or more \code{familiarData} objects.} + \item{\code{collection_name}}{Name of the collection.} + \item{\code{filename}}{File name to create on disk.} + \item{\code{plot}}{Plot to save, defaults to last plot displayed.} + \item{\code{device}}{Device to use. Can either be a device function +(e.g. \link{png}), or one of "eps", "ps", "tex" (pictex), +"pdf", "jpeg", "tiff", "png", "bmp", "svg" or "wmf" (windows only). If +\code{NULL} (default), the device is guessed based on the \code{filename} extension.} + \item{\code{path}}{Path of the directory to save plot to: \code{path} and \code{filename} +are combined to create the fully qualified file name. Defaults to the +working directory.} + \item{\code{scale}}{Multiplicative scaling factor.} + \item{\code{dpi}}{Plot resolution. Also accepts a string input: "retina" (320), +"print" (300), or "screen" (72). Applies only to raster output types.} + \item{\code{limitsize}}{When \code{TRUE} (the default), \code{ggsave()} will not +save images larger than 50x50 inches, to prevent the common error of +specifying dimensions in pixels.} + \item{\code{bg}}{Background colour. If \code{NULL}, uses the \code{plot.background} fill value +from the plot theme.} + \item{\code{create.dir}}{Whether to create new directories if a non-existing +directory is specified in the \code{filename} or \code{path} (\code{TRUE}) or return an +error (\code{FALSE}, default). If \code{FALSE} and run in an interactive session, +a prompt will appear asking to create a new directory when necessary.} + \item{\code{verbose}}{Flag to indicate whether feedback should be provided on the +computation and extraction of various data elements.} + \item{\code{message_indent}}{Number of indentation steps for messages shown during +computation and extraction of various data elements.} + }} +} +\value{ +\code{NULL} or list of plot objects, if \code{dir_path} is \code{NULL}. 
+}
+\description{
+This function plots variable importance data obtained during feature
+selection or after training a model, as stored in a \code{familiarCollection}
+object.
+}
+\details{
+This function generates a barplot based on variable importance of
+features.
+
+The only allowed values for \code{split_by}, \code{color_by} or \code{facet_by} are
+\code{fs_method} and \code{learner}, but note that \code{learner} has no effect when
+plotting variable importance of features acquired during feature selection.
+
+Available palettes for \code{discrete_palette} and \code{gradient_palette} are those
+listed by \code{grDevices::palette.pals()} (requires R >= 4.0.0),
+\code{grDevices::hcl.pals()} (requires R >= 3.6.0) and \code{rainbow}, \code{heat.colors},
+\code{terrain.colors}, \code{topo.colors} and \code{cm.colors}, which correspond to the
+palettes of the same name in \code{grDevices}. If not specified, a default
+palette based on palettes in Tableau is used. You may also specify your own
+palette by using colour names listed by \code{grDevices::colors()} or through
+hexadecimal RGB strings.
+
+Labeling methods such as \code{set_feature_names} or \code{set_fs_method_names} can be
+applied to the \code{familiarCollection} object to update labels, and order the
+output in the figure.
+}
diff --git a/man/plotting.check_data_handling.Rd b/man/plotting.check_data_handling.Rd
new file mode 100644
index 00000000..ab08c1d1
--- /dev/null
+++ b/man/plotting.check_data_handling.Rd
@@ -0,0 +1,67 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/PlotUtilities.R
+\name{plotting.check_data_handling}
+\alias{plotting.check_data_handling}
+\title{Checks and sanitizes splitting variables for plotting.}
+\usage{
+plotting.check_data_handling(
+  x,
+  split_by = NULL,
+  color_by = NULL,
+  linetype_by = NULL,
+  facet_by = NULL,
+  x_axis_by = NULL,
+  y_axis_by = NULL,
+  available = NULL
+)
+}
+\arguments{
+\item{x}{data.table or data.frame containing the data used for splitting.}
+
+\item{split_by}{(\emph{optional}) Splitting variables. This refers to column names
+on which datasets are split. A separate figure is created for each split.
+See details for available variables.}
+
+\item{color_by}{(\emph{optional}) Variables used to determine fill colour of plot
+objects. The variables cannot overlap with those provided to the \code{split_by}
+argument, but may overlap with other arguments. See details for available
+variables.}
+
+\item{linetype_by}{(\emph{optional}) Variables that are used to determine the
+linetype of lines in a plot. The variables cannot overlap with those
+provided to the \code{split_by} argument, but may overlap with other arguments.
+See details for available variables.}
+
+\item{facet_by}{(\emph{optional}) Variables used to determine how and if facets of
+each figure appear. In case the \code{facet_wrap_cols} argument is \code{NULL}, the
+first variable is used to define columns, and the remaining variables are
+used to define rows of facets. The variables cannot overlap with those
+provided to the \code{split_by} argument, but may overlap with other arguments.
+See details for available variables.}
+
+\item{x_axis_by}{(\emph{optional}) Variable plotted along the x-axis of a plot.
+The variable cannot overlap with variables provided to the \code{split_by} and
+\code{y_axis_by} arguments (if used), but may overlap with other arguments. Only
+one variable is allowed for this argument.
See details for available +variables.} + +\item{y_axis_by}{(\emph{optional}) Variable plotted along the y-axis of a plot. +The variable cannot overlap with variables provided to the \code{split_by} and +\code{x_axis_by} arguments (if used), but may overlap with other arguments. Only +one variable is allowed for this argument. See details for available +variables.} + +\item{available}{Names of columns available for splitting.} +} +\value{ +A sanitized list of splitting variables. +} +\description{ +Checks and sanitizes splitting variables for plotting. +} +\details{ +This internal function allows some flexibility regarding the exact +input. Allowed splitting variables should be defined by the available +argument. +} +\keyword{internal} diff --git a/man/plotting.check_input_args.Rd b/man/plotting.check_input_args.Rd new file mode 100644 index 00000000..bddd0ad0 --- /dev/null +++ b/man/plotting.check_input_args.Rd @@ -0,0 +1,119 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/PlotInputArguments.R +\name{plotting.check_input_args} +\alias{plotting.check_input_args} +\title{Internal checks on common plot input arguments} +\usage{ +plotting.check_input_args( + x_range = waiver(), + y_range = waiver(), + x_n_breaks = waiver(), + y_n_breaks = waiver(), + x_breaks = waiver(), + y_breaks = waiver(), + conf_int = waiver(), + conf_int_alpha = waiver(), + conf_int_style = waiver(), + conf_int_default = c("step", "ribbon", "none"), + facet_wrap_cols = waiver(), + x_label = waiver(), + y_label = waiver(), + x_label_shared = waiver(), + y_label_shared = waiver(), + rotate_x_tick_labels = waiver(), + rotate_y_tick_labels = waiver(), + legend_label = waiver(), + combine_legend = waiver(), + plot_title = waiver(), + plot_sub_title = waiver(), + caption = waiver() +) +} +\arguments{ +\item{x_range}{(\emph{optional}) Value range for the x-axis.} + +\item{y_range}{(\emph{optional}) Value range for the y-axis.} + +\item{x_n_breaks}{(\emph{optional}) Number of breaks to show on the x-axis of the +plot. \code{x_n_breaks} is used to determine the \code{x_breaks} argument in case it +is unset.} + +\item{y_n_breaks}{(\emph{optional}) Number of breaks to show on the y-axis of the +plot. \code{y_n_breaks} is used to determine the \code{y_breaks} argument in case it +is unset.} + +\item{x_breaks}{(\emph{optional}) Break points on the x-axis of the plot.} + +\item{y_breaks}{(\emph{optional}) Break points on the y-axis of the plot.} + +\item{conf_int}{(\emph{optional})} + +\item{conf_int_alpha}{(\emph{optional}) Alpha value to determine transparency of +confidence intervals or, alternatively, other plot elements with which the +confidence interval overlaps. Only values between 0.0 (fully transparent) +and 1.0 (fully opaque) are allowed.} + +\item{conf_int_style}{(\emph{optional}) Confidence interval style. See details for +allowed styles.} + +\item{conf_int_default}{Sets the default options for the confidence interval.} + +\item{facet_wrap_cols}{(\emph{optional}) Number of columns to generate when facet +wrapping. If NULL, a facet grid is produced instead.} + +\item{x_label}{(\emph{optional}) Label to provide to the x-axis. If NULL, no label +is shown.} + +\item{y_label}{(\emph{optional}) Label to provide to the y-axis. If NULL, no label +is shown.} + +\item{x_label_shared}{(\emph{optional}) Sharing of x-axis labels between facets. +One of three values: +\itemize{ +\item \code{overall}: A single label is placed at the bottom of the figure. 
Tick +text (but not the ticks themselves) is removed for all but the bottom facet +plot(s). +\item \code{column}: A label is placed at the bottom of each column. Tick text (but +not the ticks themselves) is removed for all but the bottom facet plot(s). +\item \code{individual}: A label is placed below each facet plot. Tick text is kept. +}} + +\item{y_label_shared}{(\emph{optional}) Sharing of y-axis labels between facets. +One of three values: +\itemize{ +\item \code{overall}: A single label is placed to the left of the figure. Tick text +(but not the ticks themselves) is removed for all but the left-most facet +plot(s). +\item \code{row}: A label is placed to the left of each row. Tick text (but not the +ticks themselves) is removed for all but the left-most facet plot(s). +\item \code{individual}: A label is placed below each facet plot. Tick text is kept. +}} + +\item{rotate_x_tick_labels}{(\emph{optional}) Rotate tick labels on the x-axis by +90 degrees. Defaults to \code{TRUE}. Rotation of x-axis tick labels may also be +controlled through the \code{ggtheme}. In this case, \code{FALSE} should be provided +explicitly.} + +\item{rotate_y_tick_labels}{(\emph{optional}) Rotate tick labels on the y-axis by +45 degrees.} + +\item{legend_label}{(\emph{optional}) Label to provide to the legend. If NULL, the +legend will not have a name.} + +\item{combine_legend}{(\emph{optional}) Flag to indicate whether the same legend +is to be shared by multiple aesthetics, such as those specified by +\code{color_by} and \code{linetype_by} arguments.} + +\item{plot_title}{(\emph{optional}) Label to provide as figure title. If NULL, no +title is shown.} + +\item{plot_sub_title}{(\emph{optional}) Label to provide as figure subtitle. If +NULL, no subtitle is shown.} + +\item{caption}{(\emph{optional}) Label to provide as figure caption. If NULL, no +caption is shown.} +} +\description{ +Internal checks on common plot input arguments +} +\keyword{internal} diff --git a/man/precompute_data_assignment.Rd b/man/precompute_data_assignment.Rd new file mode 100644 index 00000000..76ef73bc --- /dev/null +++ b/man/precompute_data_assignment.Rd @@ -0,0 +1,652 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Familiar.R +\name{precompute_data_assignment} +\alias{precompute_data_assignment} +\title{Pre-compute data assignment} +\usage{ +precompute_data_assignment( + formula = NULL, + data = NULL, + experiment_data = NULL, + cl = NULL, + experimental_design = "fs+mb", + verbose = TRUE, + ... +) +} +\arguments{ +\item{formula}{An R formula. The formula can only contain feature names and +dot (\code{.}). The \code{*} and \code{+1} operators are not supported as these refer to +columns that are not present in the data set. + +Use of the formula interface is optional.} + +\item{data}{A \code{data.table} object, a \code{data.frame} object, list containing +multiple \code{data.table} or \code{data.frame} objects, or paths to data files. + +\code{data} should be provided if no file paths are provided to the \code{data_files} +argument. If both are provided, only \code{data} will be used. + +All data is expected to be in wide format, and ideally has a sample +identifier (see \code{sample_id_column}), batch identifier (see \code{cohort_column}) +and outcome columns (see \code{outcome_column}). + +In case paths are provided, the data should be stored as \code{csv}, \code{rds} or +\code{RData} files. 
See documentation for the \code{data_files} argument for more +information.} + +\item{experiment_data}{Experimental data may provided in the form of} + +\item{cl}{Cluster created using the \code{parallel} package. This cluster is then +used to speed up computation through parallelisation. When a cluster is not +provided, parallelisation is performed by setting up a cluster on the local +machine. + +This parameter has no effect if the \code{parallel} argument is set to \code{FALSE}.} + +\item{experimental_design}{(\strong{required}) Defines what the experiment looks +like, e.g. \code{cv(bt(fs,20)+mb,3,2)} for 2 times repeated 3-fold +cross-validation with nested feature selection on 20 bootstraps and +model-building. The basic workflow components are: +\itemize{ +\item \code{fs}: (required) feature selection step. +\item \code{mb}: (required) model building step. +\item \code{ev}: (optional) external validation. If validation batches or cohorts +are present in the dataset (\code{data}), these should be indicated in the +\code{validation_batch_id} argument. +} + +The different components are linked using \code{+}. + +Different subsampling methods can be used in conjunction with the basic +workflow components: +\itemize{ +\item \code{bs(x,n)}: (stratified) .632 bootstrap, with \code{n} the number of +bootstraps. In contrast to \code{bt}, feature pre-processing parameters and +hyperparameter optimisation are conducted on individual bootstraps. +\item \code{bt(x,n)}: (stratified) .632 bootstrap, with \code{n} the number of +bootstraps. Unlike \code{bs} and other subsampling methods, no separate +pre-processing parameters or optimised hyperparameters will be determined +for each bootstrap. +\item \code{cv(x,n,p)}: (stratified) \code{n}-fold cross-validation, repeated \code{p} times. +Pre-processing parameters are determined for each iteration. +\item \code{lv(x)}: leave-one-out-cross-validation. Pre-processing parameters are +determined for each iteration. +\item \code{ip(x)}: imbalance partitioning for addressing class imbalances on the +data set. Pre-processing parameters are determined for each partition. The +number of partitions generated depends on the imbalance correction method +(see the \code{imbalance_correction_method} parameter). +} + +As shown in the example above, sampling algorithms can be nested. + +Though neither variable importance is determined nor models are learned +within \code{precompute_data_assignment}, the corresponding elements are still +required to prevent issues when using the resulting \code{experimentData} object +to warm-start the experiments. + +The simplest valid experimental design is \code{fs+mb}. This is the default in +\code{precompute_data_assignment}, and will simply assign all instances to the +training set.} + +\item{verbose}{Indicates verbosity of the results. Default is TRUE, and all +messages and warnings are returned.} + +\item{...}{ + Arguments passed on to \code{\link[=.parse_experiment_settings]{.parse_experiment_settings}}, \code{\link[=.parse_setup_settings]{.parse_setup_settings}}, \code{\link[=.parse_preprocessing_settings]{.parse_preprocessing_settings}} + \describe{ + \item{\code{batch_id_column}}{(\strong{recommended}) Name of the column containing batch +or cohort identifiers. This parameter is required if more than one dataset +is provided, or if external validation is performed. 
+ +In familiar any row of data is organised by four identifiers: +\itemize{ +\item The batch identifier \code{batch_id_column}: This denotes the group to which a +set of samples belongs, e.g. patients from a single study, samples measured +in a batch, etc. The batch identifier is used for batch normalisation, as +well as selection of development and validation datasets. +\item The sample identifier \code{sample_id_column}: This denotes the sample level, +e.g. data from a single individual. Subsets of data, e.g. bootstraps or +cross-validation folds, are created at this level. +\item The series identifier \code{series_id_column}: Indicates measurements on a +single sample that may not share the same outcome value, e.g. a time +series, or the number of cells in a view. +\item The repetition identifier: Indicates repeated measurements in a single +series where any feature values may differ, but the outcome does not. +Repetition identifiers are always implicitly set when multiple entries for +the same series of the same sample in the same batch that share the same +outcome are encountered. +}} + \item{\code{sample_id_column}}{(\strong{recommended}) Name of the column containing +sample or subject identifiers. See \code{batch_id_column} above for more +details. + +If unset, every row will be identified as a single sample.} + \item{\code{series_id_column}}{(\strong{optional}) Name of the column containing series +identifiers, which distinguish between measurements that are part of a +series for a single sample. See \code{batch_id_column} above for more details. + +If unset, rows which share the same batch and sample identifiers but have a +different outcome are assigned unique series identifiers.} + \item{\code{development_batch_id}}{(\emph{optional}) One or more batch or cohort +identifiers to constitute data sets for development. Defaults to all, or +all minus the identifiers in \code{validation_batch_id} for external validation. +Required if external validation is performed and \code{validation_batch_id} is +not provided.} + \item{\code{validation_batch_id}}{(\emph{optional}) One or more batch or cohort +identifiers to constitute data sets for external validation. Defaults to +all data sets except those in \code{development_batch_id} for external +validation, or none if not. Required if \code{development_batch_id} is not +provided.} + \item{\code{outcome_name}}{(\emph{optional}) Name of the modelled outcome. This name will +be used in figures created by \code{familiar}. + +If not set, the column name in \code{outcome_column} will be used for +\code{binomial}, \code{multinomial}, \code{count} and \code{continuous} outcomes. For other +outcomes (\code{survival} and \code{competing_risk}) no default is used.} + \item{\code{outcome_column}}{(\strong{recommended}) Name of the column containing the +outcome of interest. May be identified from a formula, if a formula is +provided as an argument. Otherwise an error is raised. Note that \code{survival} +and \code{competing_risk} outcome type outcomes require two columns that +indicate the time-to-event or the time of last follow-up and the event +status.} + \item{\code{outcome_type}}{(\strong{recommended}) Type of outcome found in the outcome +column. The outcome type determines many aspects of the overall process, +e.g. the available feature selection methods and learners, but also the +type of assessments that can be conducted to evaluate the resulting models. 
+Implemented outcome types are: +\itemize{ +\item \code{binomial}: categorical outcome with 2 levels. +\item \code{multinomial}: categorical outcome with 2 or more levels. +\item \code{count}: Poisson-distributed numeric outcomes. +\item \code{continuous}: general continuous numeric outcomes. +\item \code{survival}: survival outcome for time-to-event data. +} + +If not provided, the algorithm will attempt to obtain outcome_type from +contents of the outcome column. This may lead to unexpected results, and we +therefore advise to provide this information manually. + +Note that \code{competing_risk} survival analysis are not fully supported, and +is currently not a valid choice for \code{outcome_type}.} + \item{\code{class_levels}}{(\emph{optional}) Class levels for \code{binomial} or \code{multinomial} +outcomes. This argument can be used to specify the ordering of levels for +categorical outcomes. These class levels must exactly match the levels +present in the outcome column.} + \item{\code{event_indicator}}{(\strong{recommended}) Indicator for events in \code{survival} +and \code{competing_risk} analyses. \code{familiar} will automatically recognise \code{1}, +\code{true}, \code{t}, \code{y} and \code{yes} as event indicators, including different +capitalisations. If this parameter is set, it replaces the default values.} + \item{\code{censoring_indicator}}{(\strong{recommended}) Indicator for right-censoring in +\code{survival} and \code{competing_risk} analyses. \code{familiar} will automatically +recognise \code{0}, \code{false}, \code{f}, \code{n}, \code{no} as censoring indicators, including +different capitalisations. If this parameter is set, it replaces the +default values.} + \item{\code{competing_risk_indicator}}{(\strong{recommended}) Indicator for competing +risks in \code{competing_risk} analyses. There are no default values, and if +unset, all values other than those specified by the \code{event_indicator} and +\code{censoring_indicator} parameters are considered to indicate competing +risks.} + \item{\code{signature}}{(\emph{optional}) One or more names of feature columns that are +considered part of a specific signature. Features specified here will +always be used for modelling. Ranking from feature selection has no effect +for these features.} + \item{\code{novelty_features}}{(\emph{optional}) One or more names of feature columns +that should be included for the purpose of novelty detection.} + \item{\code{exclude_features}}{(\emph{optional}) Feature columns that will be removed +from the data set. Cannot overlap with features in \code{signature}, +\code{novelty_features} or \code{include_features}.} + \item{\code{include_features}}{(\emph{optional}) Feature columns that are specifically +included in the data set. By default all features are included. Cannot +overlap with \code{exclude_features}, but may overlap \code{signature}. Features in +\code{signature} and \code{novelty_features} are always included. If both +\code{exclude_features} and \code{include_features} are provided, \code{include_features} +takes precedence, provided that there is no overlap between the two.} + \item{\code{reference_method}}{(\emph{optional}) Method used to set reference levels for +categorical features. There are several options: +\itemize{ +\item \code{auto} (default): Categorical features that are not explicitly set by the +user, i.e. columns containing boolean values or characters, use the most +frequent level as reference. Categorical features that are explicitly set, +i.e. 
as factors, are used as is. +\item \code{always}: Both automatically detected and user-specified categorical +features have the reference level set to the most frequent level. Ordinal +features are not altered, but are used as is. +\item \code{never}: User-specified categorical features are used as is. +Automatically detected categorical features are simply sorted, and the +first level is then used as the reference level. This was the behaviour +prior to familiar version 1.3.0. +}} + \item{\code{imbalance_correction_method}}{(\emph{optional}) Type of method used to +address class imbalances. Available options are: +\itemize{ +\item \code{full_undersampling} (default): All data will be used in an ensemble +fashion. The full minority class will appear in each partition, but +majority classes are undersampled until all data have been used. +\item \code{random_undersampling}: Randomly undersamples majority classes. This is +useful in cases where full undersampling would lead to the formation of +many models due major overrepresentation of the largest class. +} + +This parameter is only used in combination with imbalance partitioning in +the experimental design, and \code{ip} should therefore appear in the string +that defines the design.} + \item{\code{imbalance_n_partitions}}{(\emph{optional}) Number of times random +undersampling should be repeated. 10 undersampled subsets with balanced +classes are formed by default.} + \item{\code{parallel}}{(\emph{optional}) Enable parallel processing. Defaults to \code{TRUE}. +When set to \code{FALSE}, this disables all parallel processing, regardless of +specific parameters such as \code{parallel_preprocessing}. However, when +\code{parallel} is \code{TRUE}, parallel processing of different parts of the +workflow can be disabled by setting respective flags to \code{FALSE}.} + \item{\code{parallel_nr_cores}}{(\emph{optional}) Number of cores available for +parallelisation. Defaults to 2. This setting does nothing if +parallelisation is disabled.} + \item{\code{restart_cluster}}{(\emph{optional}) Restart nodes used for parallel computing +to free up memory prior to starting a parallel process. Note that it does +take time to set up the clusters. Therefore setting this argument to \code{TRUE} +may impact processing speed. This argument is ignored if \code{parallel} is +\code{FALSE} or the cluster was initialised outside of familiar. Default is +\code{FALSE}, which causes the clusters to be initialised only once.} + \item{\code{cluster_type}}{(\emph{optional}) Selection of the cluster type for parallel +processing. Available types are the ones supported by the parallel package +that is part of the base R distribution: \code{psock} (default), \code{fork}, \code{mpi}, +\code{nws}, \code{sock}. In addition, \code{none} is available, which also disables +parallel processing.} + \item{\code{backend_type}}{(\emph{optional}) Selection of the backend for distributing +copies of the data. This backend ensures that only a single master copy is +kept in memory. This limits memory usage during parallel processing. + +Several backend options are available, notably \code{socket_server}, and \code{none} +(default). \code{socket_server} is based on the callr package and R sockets, +comes with \code{familiar} and is available for any OS. \code{none} uses the package +environment of familiar to store data, and is available for any OS. 
+However, \code{none} requires copying of data to any parallel process, and has a
+larger memory footprint.}
+  \item{\code{server_port}}{(\emph{optional}) Integer indicating the port on which the
+socket server or RServe process should communicate. Defaults to port 6311.
+Note that ports 0 to 1024 and 49152 to 65535 cannot be used.}
+  \item{\code{feature_max_fraction_missing}}{(\emph{optional}) Numeric value between \code{0.0}
+and \code{0.95} that determines the maximum fraction of missing values that
+still allows a feature to be included in the data set. All features with a
+missing value fraction over this threshold are not processed further. The
+default value is \code{0.30}.}
+  \item{\code{sample_max_fraction_missing}}{(\emph{optional}) Numeric value between \code{0.0}
+and \code{0.95} that determines the maximum fraction of missing values that
+still allows a sample to be included in the data set. All samples with a
+missing value fraction over this threshold are excluded and not processed
+further. The default value is \code{0.30}.}
+  \item{\code{filter_method}}{(\emph{optional}) One or more methods used to reduce
+dimensionality of the data set by removing irrelevant or poorly
+reproducible features.
+
+Several methods are available:
+\itemize{
+\item \code{none} (default): None of the features will be filtered.
+\item \code{low_variance}: Features with a variance below the
+\code{low_var_minimum_variance_threshold} are filtered. This can be useful to
+filter, for example, genes that are not differentially expressed.
+\item \code{univariate_test}: Features undergo a univariate regression using an
+outcome-appropriate regression model. The p-value of the model coefficient
+is collected. Features with coefficient p or q-value above the
+\code{univariate_test_threshold} are subsequently filtered.
+\item \code{robustness}: Features that are not sufficiently robust according to the
+intraclass correlation coefficient are filtered. Use of this method
+requires that repeated measurements are present in the data set, i.e. there
+should be entries for which the sample and cohort identifiers are the same.
+}
+
+More than one method can be used simultaneously. Features with singular
+values are always filtered, as these do not contain information.}
+  \item{\code{univariate_test_threshold}}{(\emph{optional}) Numeric value between \code{1.0} and
+\code{0.0} that determines which features are irrelevant and will be filtered by
+the \code{univariate_test}. The p or q-values are compared to this threshold.
+All features with values above the threshold are filtered. The default
+value is \code{0.20}.}
+  \item{\code{univariate_test_threshold_metric}}{(\emph{optional}) Metric that is compared
+against the \code{univariate_test_threshold}. The following metrics can
+be chosen:
+\itemize{
+\item \code{p_value} (default): The unadjusted p-value of each feature is used
+to filter features.
+\item \code{q_value}: The q-value (Storey, 2002) is used to filter features. Some
+data sets may have insufficient samples to compute the q-value. The
+\code{qvalue} package must be installed from Bioconductor to use this method.
+}}
+  \item{\code{univariate_test_max_feature_set_size}}{(\emph{optional}) Maximum size of the
+feature set after the univariate test. P or q values of features are
+compared against the threshold, but if the resulting data set would be
+larger than this setting, only the most relevant features up to the desired
+feature set size are selected.
+ +The default value is \code{NULL}, which causes features to be filtered based on +their relevance only.} + \item{\code{low_var_minimum_variance_threshold}}{(required, if used) Numeric value +that determines which features will be filtered by the \code{low_variance} +method. The variance of each feature is computed and compared to the +threshold. If it is below the threshold, the feature is removed. + +This parameter has no default value and should be set if \code{low_variance} is +used.} + \item{\code{low_var_max_feature_set_size}}{(\emph{optional}) Maximum size of the feature +set after filtering features with a low variance. All features are first +compared against \code{low_var_minimum_variance_threshold}. If the resulting +feature set would be larger than specified, only the most strongly varying +features will be selected, up to the desired size of the feature set. + +The default value is \code{NULL}, which causes features to be filtered based on +their variance only.} + \item{\code{robustness_icc_type}}{(\emph{optional}) String indicating the type of +intraclass correlation coefficient (\code{1}, \code{2} or \code{3}) that should be used to +compute robustness for features in repeated measurements. These types +correspond to the types in Shrout and Fleiss (1979). The default value is +\code{1}.} + \item{\code{robustness_threshold_metric}}{(\emph{optional}) String indicating which +specific intraclass correlation coefficient (ICC) metric should be used to +filter features. This should be one of: +\itemize{ +\item \code{icc}: The estimated ICC value itself. +\item \code{icc_low} (default): The estimated lower limit of the 95\% confidence +interval of the ICC, as suggested by Koo and Li (2016). +\item \code{icc_panel}: The estimated ICC value over the panel average, i.e. the ICC +that would be obtained if all repeated measurements were averaged. +\item \code{icc_panel_low}: The estimated lower limit of the 95\% confidence interval +of the panel ICC. +}} + \item{\code{robustness_threshold_value}}{(\emph{optional}) The intraclass correlation +coefficient value that is as threshold. The default value is \code{0.70}.} + \item{\code{transformation_method}}{(\emph{optional}) The transformation method used to +change the distribution of the data to be more normal-like. The following +methods are available: +\itemize{ +\item \code{none}: This disables transformation of features. +\item \code{yeo_johnson} (default): Transformation using the Yeo-Johnson +transformation (Yeo and Johnson, 2000). The algorithm tests various lambda +values and selects the lambda that maximises the log-likelihood. +\item \code{yeo_johnson_trim}: As \code{yeo_johnson}, but based on the set of feature +values where the 5\% lowest and 5\% highest values are discarded. This +reduces the effect of outliers. +\item \code{yeo_johnson_winsor}: As \code{yeo_johnson}, but based on the set of feature +values where the 5\% lowest and 5\% highest values are winsorised. This +reduces the effect of outliers. +\item \code{yeo_johnson_robust}: A robust version of \code{yeo_johnson} after Raymaekers +and Rousseeuw (2021). This method is less sensitive to outliers. +\item \code{box_cox}: Transformation using the Box-Cox transformation (Box and Cox, +1964). Unlike the Yeo-Johnson transformation, the Box-Cox transformation +requires that all data are positive. Features that contain zero or negative +values cannot be transformed using this transformation. 
The algorithm tests
+various lambda values and selects the lambda that maximises the
+log-likelihood.
+\item \code{box_cox_trim}: As \code{box_cox}, but based on the set of feature values
+where the 5\% lowest and 5\% highest values are discarded. This reduces the
+effect of outliers.
+\item \code{box_cox_winsor}: As \code{box_cox}, but based on the set of feature values
+where the 5\% lowest and 5\% highest values are winsorised. This reduces the
+effect of outliers.
+\item \code{box_cox_robust}: A robust version of \code{box_cox} after Raymaekers and
+Rousseeuw (2021). This method is less sensitive to outliers.
+}
+
+Only features that contain numerical data are transformed. Transformation
+parameters obtained in development data are stored within \code{featureInfo}
+objects for later use with validation data sets.}
+  \item{\code{normalisation_method}}{(\emph{optional}) The normalisation method used to
+improve the comparability between numerical features that may have very
+different scales. The following normalisation methods can be chosen:
+\itemize{
+\item \code{none}: This disables feature normalisation.
+\item \code{standardisation}: Features are normalised by subtraction of their mean
+values and division by their standard deviations. This causes every feature
+to have a center value of 0.0 and standard deviation of 1.0.
+\item \code{standardisation_trim}: As \code{standardisation}, but based on the set of
+feature values where the 5\% lowest and 5\% highest values are discarded.
+This reduces the effect of outliers.
+\item \code{standardisation_winsor}: As \code{standardisation}, but based on the set of
+feature values where the 5\% lowest and 5\% highest values are winsorised.
+This reduces the effect of outliers.
+\item \code{standardisation_robust} (default): A robust version of \code{standardisation}
+that relies on computing Huber's M-estimators for location and scale.
+\item \code{normalisation}: Features are normalised by subtraction of their minimum
+values and division by their ranges. This maps all feature values to a
+\eqn{[0, 1]} interval.
+\item \code{normalisation_trim}: As \code{normalisation}, but based on the set of feature
+values where the 5\% lowest and 5\% highest values are discarded. This
+reduces the effect of outliers.
+\item \code{normalisation_winsor}: As \code{normalisation}, but based on the set of
+feature values where the 5\% lowest and 5\% highest values are winsorised.
+This reduces the effect of outliers.
+\item \code{quantile}: Features are normalised by subtraction of their median values
+and division by their interquartile range.
+\item \code{mean_centering}: Features are centered by subtracting the mean, but do
+not undergo rescaling.
+}
+
+Only features that contain numerical data are normalised. Normalisation
+parameters obtained in development data are stored within \code{featureInfo}
+objects for later use with validation data sets.}
+  \item{\code{batch_normalisation_method}}{(\emph{optional}) The method used for batch
+normalisation. Available methods are:
+\itemize{
+\item \code{none} (default): This disables batch normalisation of features.
+\item \code{standardisation}: Features within each batch are normalised by
+subtraction of the mean value and division by the standard deviation in
+each batch.
+\item \code{standardisation_trim}: As \code{standardisation}, but based on the set of
+feature values where the 5\% lowest and 5\% highest values are discarded.
+This reduces the effect of outliers.
+\item \code{standardisation_winsor}: As \code{standardisation}, but based on the set of +feature values where the 5\% lowest and 5\% highest values are winsorised. +This reduces the effect of outliers. +\item \code{standardisation_robust}: A robust version of \code{standardisation} that +relies on computing Huber's M-estimators for location and scale within each +batch. +\item \code{normalisation}: Features within each batch are normalised by subtraction +of their minimum values and division by their range in each batch. This +maps all feature values in each batch to a \eqn{[0, 1]} interval. +\item \code{normalisation_trim}: As \code{normalisation}, but based on the set of feature +values where the 5\% lowest and 5\% highest values are discarded. This +reduces the effect of outliers. +\item \code{normalisation_winsor}: As \code{normalisation}, but based on the set of +feature values where the 5\% lowest and 5\% highest values are winsorised. +This reduces the effect of outliers. +\item \code{quantile}: Features in each batch are normalised by subtraction of the +median value and division by the interquartile range of each batch. +\item \code{mean_centering}: Features in each batch are centered on 0.0 by +substracting the mean value in each batch, but are not rescaled. +\item \code{combat_parametric}: Batch adjustments using parametric empirical Bayes +(Johnson et al, 2007). \code{combat_p} leads to the same method. +\item \code{combat_non_parametric}: Batch adjustments using non-parametric empirical +Bayes (Johnson et al, 2007). \code{combat_np} and \code{combat} lead to the same +method. Note that we reduced complexity from O(\eqn{n^2}) to O(\eqn{n}) by +only computing batch adjustment parameters for each feature on a subset of +50 randomly selected features, instead of all features. +} + +Only features that contain numerical data are normalised using batch +normalisation. Batch normalisation parameters obtained in development data +are stored within \code{featureInfo} objects for later use with validation data +sets, in case the validation data is from the same batch. + +If validation data contains data from unknown batches, normalisation +parameters are separately determined for these batches. + +Note that for both empirical Bayes methods, the batch effect is assumed to +produce results across the features. This is often true for things such as +gene expressions, but the assumption may not hold generally. + +When performing batch normalisation, it is moreover important to check that +differences between batches or cohorts are not related to the studied +endpoint.} + \item{\code{imputation_method}}{(\emph{optional}) Method used for imputing missing +feature values. Two methods are implemented: +\itemize{ +\item \code{simple}: Simple replacement of a missing value by the median value (for +numeric features) or the modal value (for categorical features). +\item \code{lasso}: Imputation of missing value by lasso regression (using \code{glmnet}) +based on information contained in other features. +} + +\code{simple} imputation precedes \code{lasso} imputation to ensure that any missing +values in predictors required for \code{lasso} regression are resolved. The +\code{lasso} estimate is then used to replace the missing value. + +The default value depends on the number of features in the dataset. If the +number is lower than 100, \code{lasso} is used by default, and \code{simple} +otherwise. + +Only single imputation is performed. 
Imputation models and parameters are
+stored within \code{featureInfo} objects for later use with validation data
+sets.}
+  \item{\code{cluster_method}}{(\emph{optional}) Clustering is performed to identify and
+replace redundant features, for example those that are highly correlated.
+Such features do not carry much additional information and may be removed
+or replaced instead (Park et al., 2007; Tolosi and Lengauer, 2011).
+
+The cluster method determines the algorithm used to form the clusters. The
+following cluster methods are implemented:
+\itemize{
+\item \code{none}: No clustering is performed.
+\item \code{hclust} (default): Hierarchical agglomerative clustering. If the
+\code{fastcluster} package is installed, \code{fastcluster::hclust} is used (Muellner
+2013), otherwise \code{stats::hclust} is used.
+\item \code{agnes}: Hierarchical clustering using agglomerative nesting (Kaufman and
+Rousseeuw, 1990). This algorithm is similar to \code{hclust}, but uses the
+\code{cluster::agnes} implementation.
+\item \code{diana}: Divisive analysis hierarchical clustering. This method uses
+divisive instead of agglomerative clustering (Kaufman and Rousseeuw, 1990).
+\code{cluster::diana} is used.
+\item \code{pam}: Partitioning around medioids. This partitions the data into \eqn{k}
+clusters around medioids (Kaufman and Rousseeuw, 1990). \eqn{k} is selected
+using the \code{silhouette} metric. \code{pam} is implemented using the
+\code{cluster::pam} function.
+}
+
+Clusters and cluster information are stored within \code{featureInfo} objects for
+later use with validation data sets. This enables reproduction of the same
+clusters as formed in the development data set.}
+  \item{\code{cluster_linkage_method}}{(\emph{optional}) Linkage method used for
+agglomerative clustering in \code{hclust} and \code{agnes}. The following linkage
+methods can be used:
+\itemize{
+\item \code{average} (default): Average linkage.
+\item \code{single}: Single linkage.
+\item \code{complete}: Complete linkage.
+\item \code{weighted}: Weighted linkage, also known as McQuitty linkage.
+\item \code{ward}: Linkage using Ward's minimum variance method.
+}
+
+\code{diana} and \code{pam} do not require a linkage method.}
+  \item{\code{cluster_cut_method}}{(\emph{optional}) The method used to define the actual
+clusters. The following methods can be used:
+\itemize{
+\item \code{silhouette}: Clusters are formed based on the silhouette score
+(Rousseeuw, 1987). The average silhouette score is computed from 2 to
+\eqn{n} clusters, with \eqn{n} the number of features. Clusters are only
+formed if the average silhouette exceeds 0.50, which indicates reasonable
+evidence for structure. This procedure may be slow if the number of
+features is large (>100s).
+\item \code{fixed_cut}: Clusters are formed by cutting the hierarchical tree at the
+point indicated by the \code{cluster_similarity_threshold}, e.g. where features
+in a cluster have an average Spearman correlation of 0.90. \code{fixed_cut} is
+only available for \code{agnes}, \code{diana} and \code{hclust}.
+\item \code{dynamic_cut}: Dynamic cluster formation using the cutting algorithm in
+the \code{dynamicTreeCut} package. This package should be installed to select
+this option. \code{dynamic_cut} can only be used with \code{agnes} and \code{hclust}.
+}
+
+The default options are \code{silhouette} for partitioning around medioids (\code{pam})
+and \code{fixed_cut} otherwise.}
+  \item{\code{cluster_similarity_metric}}{(\emph{optional}) Clusters are formed based on
+feature similarity. All features are compared in a pair-wise fashion to
+compute similarity, for example correlation. The resulting similarity grid
+is converted into a distance matrix that is subsequently used for
+clustering. The following metrics are supported to compute pairwise
+similarities:
+\itemize{
+\item \code{mutual_information} (default): normalised mutual information.
+\item \code{mcfadden_r2}: McFadden's pseudo R-squared (McFadden, 1974).
+\item \code{cox_snell_r2}: Cox and Snell's pseudo R-squared (Cox and Snell, 1989).
+\item \code{nagelkerke_r2}: Nagelkerke's pseudo R-squared (Nagelkerke, 1991).
+\item \code{spearman}: Spearman's rank order correlation.
+\item \code{kendall}: Kendall rank correlation.
+\item \code{pearson}: Pearson product-moment correlation.
+}
+
+The pseudo R-squared metrics can be used to assess similarity between mixed
+pairs of numeric and categorical features, as these are based on the
+log-likelihood of regression models. In \code{familiar}, the more informative
+feature is used as the predictor and the other feature as the response
+variable. In numeric-categorical pairs, the numeric feature is considered
+to be more informative and is thus used as the predictor. In
+categorical-categorical pairs, the feature with the most levels is used as the
+predictor.
+
+In case any of the classical correlation coefficients (\code{pearson},
+\code{spearman} and \code{kendall}) are used with (mixed) categorical features, the
+categorical features are one-hot encoded and the mean correlation over all
+resulting pairs is used as similarity.}
+  \item{\code{cluster_similarity_threshold}}{(\emph{optional}) The threshold level for
+pair-wise similarity that is required to form clusters using \code{fixed_cut}.
+This should be a numerical value between 0.0 and 1.0. Note, however, that a
+reasonable threshold value depends strongly on the similarity metric. The
+following are the default values used:
+\itemize{
+\item \code{mcfadden_r2} and \code{mutual_information}: \code{0.30}
+\item \code{cox_snell_r2} and \code{nagelkerke_r2}: \code{0.75}
+\item \code{spearman}, \code{kendall} and \code{pearson}: \code{0.90}
+}
+
+Alternatively, if the \code{fixed_cut} method is not used, this value determines
+whether any clustering should be performed, because the data may not
+contain highly similar features. The default values in this situation are:
+\itemize{
+\item \code{mcfadden_r2} and \code{mutual_information}: \code{0.25}
+\item \code{cox_snell_r2} and \code{nagelkerke_r2}: \code{0.40}
+\item \code{spearman}, \code{kendall} and \code{pearson}: \code{0.70}
+}
+
+The threshold value is converted to a distance (1-similarity) prior to
+cutting hierarchical trees.}
+  \item{\code{cluster_representation_method}}{(\emph{optional}) Method used to determine
+how the information of co-clustered features is summarised and used to
+represent the cluster. The following methods can be selected:
+\itemize{
+\item \code{best_predictor} (default): The feature with the highest importance
+according to univariate regression with the outcome is used to represent
+the cluster.
+\item \code{medioid}: The feature closest to the cluster center, i.e. the feature
+that is most similar to the remaining features in the cluster, is used to
+represent the cluster.
+\item \code{mean}: A meta-feature is generated by averaging the feature values for +all features in a cluster. This method aligns all features so that all +features will be positively correlated prior to averaging. Should a cluster +contain one or more categorical features, the \code{medioid} method will be used +instead, as averaging is not possible. Note that if this method is chosen, +the \code{normalisation_method} parameter should be one of \code{standardisation}, +\code{standardisation_trim}, \code{standardisation_winsor} or \code{quantile}.` +} + +If the \code{pam} cluster method is selected, only the \code{medioid} method can be +used. In that case 1 medioid is used by default.} + \item{\code{parallel_preprocessing}}{(\emph{optional}) Enable parallel processing for the +preprocessing workflow. Defaults to \code{TRUE}. When set to \code{FALSE}, this will +disable the use of parallel processing while preprocessing, regardless of +the settings of the \code{parallel} parameter. \code{parallel_preprocessing} is +ignored if \code{parallel=FALSE}.} + }} +} +\value{ +An \code{experimentData} object. +} +\description{ +Creates data assignment. +} +\details{ +This is a thin wrapper around \code{summon_familiar}, and functions like +it, but automatically skips computation of variable importance, learning +and subsequent evaluation steps. + +The function returns an \code{experimentData} object, which can be used to +warm-start other experiments by providing it to the \code{experiment_data} +argument. +} diff --git a/man/precompute_feature_info.Rd b/man/precompute_feature_info.Rd new file mode 100644 index 00000000..1f18fb8e --- /dev/null +++ b/man/precompute_feature_info.Rd @@ -0,0 +1,655 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Familiar.R +\name{precompute_feature_info} +\alias{precompute_feature_info} +\title{Pre-compute feature information} +\usage{ +precompute_feature_info( + formula = NULL, + data = NULL, + experiment_data = NULL, + cl = NULL, + experimental_design = "fs+mb", + verbose = TRUE, + ... +) +} +\arguments{ +\item{formula}{An R formula. The formula can only contain feature names and +dot (\code{.}). The \code{*} and \code{+1} operators are not supported as these refer to +columns that are not present in the data set. + +Use of the formula interface is optional.} + +\item{data}{A \code{data.table} object, a \code{data.frame} object, list containing +multiple \code{data.table} or \code{data.frame} objects, or paths to data files. + +\code{data} should be provided if no file paths are provided to the \code{data_files} +argument. If both are provided, only \code{data} will be used. + +All data is expected to be in wide format, and ideally has a sample +identifier (see \code{sample_id_column}), batch identifier (see \code{cohort_column}) +and outcome columns (see \code{outcome_column}). + +In case paths are provided, the data should be stored as \code{csv}, \code{rds} or +\code{RData} files. See documentation for the \code{data_files} argument for more +information.} + +\item{experiment_data}{Experimental data may provided in the form of} + +\item{cl}{Cluster created using the \code{parallel} package. This cluster is then +used to speed up computation through parallelisation. When a cluster is not +provided, parallelisation is performed by setting up a cluster on the local +machine. 
+ +This parameter has no effect if the \code{parallel} argument is set to \code{FALSE}.} + +\item{experimental_design}{(\strong{required}) Defines what the experiment looks +like, e.g. \code{cv(bt(fs,20)+mb,3,2)} for 2 times repeated 3-fold +cross-validation with nested feature selection on 20 bootstraps and +model-building. The basic workflow components are: +\itemize{ +\item \code{fs}: (required) feature selection step. +\item \code{mb}: (required) model building step. +\item \code{ev}: (optional) external validation. If validation batches or cohorts +are present in the dataset (\code{data}), these should be indicated in the +\code{validation_batch_id} argument. +} + +The different components are linked using \code{+}. + +Different subsampling methods can be used in conjunction with the basic +workflow components: +\itemize{ +\item \code{bs(x,n)}: (stratified) .632 bootstrap, with \code{n} the number of +bootstraps. In contrast to \code{bt}, feature pre-processing parameters and +hyperparameter optimisation are conducted on individual bootstraps. +\item \code{bt(x,n)}: (stratified) .632 bootstrap, with \code{n} the number of +bootstraps. Unlike \code{bs} and other subsampling methods, no separate +pre-processing parameters or optimised hyperparameters will be determined +for each bootstrap. +\item \code{cv(x,n,p)}: (stratified) \code{n}-fold cross-validation, repeated \code{p} times. +Pre-processing parameters are determined for each iteration. +\item \code{lv(x)}: leave-one-out-cross-validation. Pre-processing parameters are +determined for each iteration. +\item \code{ip(x)}: imbalance partitioning for addressing class imbalances on the +data set. Pre-processing parameters are determined for each partition. The +number of partitions generated depends on the imbalance correction method +(see the \code{imbalance_correction_method} parameter). +} + +As shown in the example above, sampling algorithms can be nested. + +Though neither variable importance is determined nor models are learned +within \code{precompute_feature_info}, the corresponding elements are still +required to prevent issues when using the resulting \code{experimentData} object +to warm-start the experiments. + +The simplest valid experimental design is \code{fs+mb}. This is the default in +\code{precompute_feature_info}, and will determine feature parameters over the +entire dataset. + +This argument is ignored if the \code{experiment_data} argument is set.} + +\item{verbose}{Indicates verbosity of the results. Default is TRUE, and all +messages and warnings are returned.} + +\item{...}{ + Arguments passed on to \code{\link[=.parse_experiment_settings]{.parse_experiment_settings}}, \code{\link[=.parse_setup_settings]{.parse_setup_settings}}, \code{\link[=.parse_preprocessing_settings]{.parse_preprocessing_settings}} + \describe{ + \item{\code{batch_id_column}}{(\strong{recommended}) Name of the column containing batch +or cohort identifiers. This parameter is required if more than one dataset +is provided, or if external validation is performed. + +In familiar any row of data is organised by four identifiers: +\itemize{ +\item The batch identifier \code{batch_id_column}: This denotes the group to which a +set of samples belongs, e.g. patients from a single study, samples measured +in a batch, etc. The batch identifier is used for batch normalisation, as +well as selection of development and validation datasets. +\item The sample identifier \code{sample_id_column}: This denotes the sample level, +e.g. 
data from a single individual. Subsets of data, e.g. bootstraps or +cross-validation folds, are created at this level. +\item The series identifier \code{series_id_column}: Indicates measurements on a +single sample that may not share the same outcome value, e.g. a time +series, or the number of cells in a view. +\item The repetition identifier: Indicates repeated measurements in a single +series where any feature values may differ, but the outcome does not. +Repetition identifiers are always implicitly set when multiple entries for +the same series of the same sample in the same batch that share the same +outcome are encountered. +}} + \item{\code{sample_id_column}}{(\strong{recommended}) Name of the column containing +sample or subject identifiers. See \code{batch_id_column} above for more +details. + +If unset, every row will be identified as a single sample.} + \item{\code{series_id_column}}{(\strong{optional}) Name of the column containing series +identifiers, which distinguish between measurements that are part of a +series for a single sample. See \code{batch_id_column} above for more details. + +If unset, rows which share the same batch and sample identifiers but have a +different outcome are assigned unique series identifiers.} + \item{\code{development_batch_id}}{(\emph{optional}) One or more batch or cohort +identifiers to constitute data sets for development. Defaults to all, or +all minus the identifiers in \code{validation_batch_id} for external validation. +Required if external validation is performed and \code{validation_batch_id} is +not provided.} + \item{\code{validation_batch_id}}{(\emph{optional}) One or more batch or cohort +identifiers to constitute data sets for external validation. Defaults to +all data sets except those in \code{development_batch_id} for external +validation, or none if not. Required if \code{development_batch_id} is not +provided.} + \item{\code{outcome_name}}{(\emph{optional}) Name of the modelled outcome. This name will +be used in figures created by \code{familiar}. + +If not set, the column name in \code{outcome_column} will be used for +\code{binomial}, \code{multinomial}, \code{count} and \code{continuous} outcomes. For other +outcomes (\code{survival} and \code{competing_risk}) no default is used.} + \item{\code{outcome_column}}{(\strong{recommended}) Name of the column containing the +outcome of interest. May be identified from a formula, if a formula is +provided as an argument. Otherwise an error is raised. Note that \code{survival} +and \code{competing_risk} outcome type outcomes require two columns that +indicate the time-to-event or the time of last follow-up and the event +status.} + \item{\code{outcome_type}}{(\strong{recommended}) Type of outcome found in the outcome +column. The outcome type determines many aspects of the overall process, +e.g. the available feature selection methods and learners, but also the +type of assessments that can be conducted to evaluate the resulting models. +Implemented outcome types are: +\itemize{ +\item \code{binomial}: categorical outcome with 2 levels. +\item \code{multinomial}: categorical outcome with 2 or more levels. +\item \code{count}: Poisson-distributed numeric outcomes. +\item \code{continuous}: general continuous numeric outcomes. +\item \code{survival}: survival outcome for time-to-event data. +} + +If not provided, the algorithm will attempt to obtain outcome_type from +contents of the outcome column. 
This may lead to unexpected results, and we +therefore advise to provide this information manually. + +Note that \code{competing_risk} survival analysis are not fully supported, and +is currently not a valid choice for \code{outcome_type}.} + \item{\code{class_levels}}{(\emph{optional}) Class levels for \code{binomial} or \code{multinomial} +outcomes. This argument can be used to specify the ordering of levels for +categorical outcomes. These class levels must exactly match the levels +present in the outcome column.} + \item{\code{event_indicator}}{(\strong{recommended}) Indicator for events in \code{survival} +and \code{competing_risk} analyses. \code{familiar} will automatically recognise \code{1}, +\code{true}, \code{t}, \code{y} and \code{yes} as event indicators, including different +capitalisations. If this parameter is set, it replaces the default values.} + \item{\code{censoring_indicator}}{(\strong{recommended}) Indicator for right-censoring in +\code{survival} and \code{competing_risk} analyses. \code{familiar} will automatically +recognise \code{0}, \code{false}, \code{f}, \code{n}, \code{no} as censoring indicators, including +different capitalisations. If this parameter is set, it replaces the +default values.} + \item{\code{competing_risk_indicator}}{(\strong{recommended}) Indicator for competing +risks in \code{competing_risk} analyses. There are no default values, and if +unset, all values other than those specified by the \code{event_indicator} and +\code{censoring_indicator} parameters are considered to indicate competing +risks.} + \item{\code{signature}}{(\emph{optional}) One or more names of feature columns that are +considered part of a specific signature. Features specified here will +always be used for modelling. Ranking from feature selection has no effect +for these features.} + \item{\code{novelty_features}}{(\emph{optional}) One or more names of feature columns +that should be included for the purpose of novelty detection.} + \item{\code{exclude_features}}{(\emph{optional}) Feature columns that will be removed +from the data set. Cannot overlap with features in \code{signature}, +\code{novelty_features} or \code{include_features}.} + \item{\code{include_features}}{(\emph{optional}) Feature columns that are specifically +included in the data set. By default all features are included. Cannot +overlap with \code{exclude_features}, but may overlap \code{signature}. Features in +\code{signature} and \code{novelty_features} are always included. If both +\code{exclude_features} and \code{include_features} are provided, \code{include_features} +takes precedence, provided that there is no overlap between the two.} + \item{\code{reference_method}}{(\emph{optional}) Method used to set reference levels for +categorical features. There are several options: +\itemize{ +\item \code{auto} (default): Categorical features that are not explicitly set by the +user, i.e. columns containing boolean values or characters, use the most +frequent level as reference. Categorical features that are explicitly set, +i.e. as factors, are used as is. +\item \code{always}: Both automatically detected and user-specified categorical +features have the reference level set to the most frequent level. Ordinal +features are not altered, but are used as is. +\item \code{never}: User-specified categorical features are used as is. +Automatically detected categorical features are simply sorted, and the +first level is then used as the reference level. 
This was the behaviour +prior to familiar version 1.3.0. +}} + \item{\code{imbalance_correction_method}}{(\emph{optional}) Type of method used to +address class imbalances. Available options are: +\itemize{ +\item \code{full_undersampling} (default): All data will be used in an ensemble +fashion. The full minority class will appear in each partition, but +majority classes are undersampled until all data have been used. +\item \code{random_undersampling}: Randomly undersamples majority classes. This is +useful in cases where full undersampling would lead to the formation of +many models due major overrepresentation of the largest class. +} + +This parameter is only used in combination with imbalance partitioning in +the experimental design, and \code{ip} should therefore appear in the string +that defines the design.} + \item{\code{imbalance_n_partitions}}{(\emph{optional}) Number of times random +undersampling should be repeated. 10 undersampled subsets with balanced +classes are formed by default.} + \item{\code{parallel}}{(\emph{optional}) Enable parallel processing. Defaults to \code{TRUE}. +When set to \code{FALSE}, this disables all parallel processing, regardless of +specific parameters such as \code{parallel_preprocessing}. However, when +\code{parallel} is \code{TRUE}, parallel processing of different parts of the +workflow can be disabled by setting respective flags to \code{FALSE}.} + \item{\code{parallel_nr_cores}}{(\emph{optional}) Number of cores available for +parallelisation. Defaults to 2. This setting does nothing if +parallelisation is disabled.} + \item{\code{restart_cluster}}{(\emph{optional}) Restart nodes used for parallel computing +to free up memory prior to starting a parallel process. Note that it does +take time to set up the clusters. Therefore setting this argument to \code{TRUE} +may impact processing speed. This argument is ignored if \code{parallel} is +\code{FALSE} or the cluster was initialised outside of familiar. Default is +\code{FALSE}, which causes the clusters to be initialised only once.} + \item{\code{cluster_type}}{(\emph{optional}) Selection of the cluster type for parallel +processing. Available types are the ones supported by the parallel package +that is part of the base R distribution: \code{psock} (default), \code{fork}, \code{mpi}, +\code{nws}, \code{sock}. In addition, \code{none} is available, which also disables +parallel processing.} + \item{\code{backend_type}}{(\emph{optional}) Selection of the backend for distributing +copies of the data. This backend ensures that only a single master copy is +kept in memory. This limits memory usage during parallel processing. + +Several backend options are available, notably \code{socket_server}, and \code{none} +(default). \code{socket_server} is based on the callr package and R sockets, +comes with \code{familiar} and is available for any OS. \code{none} uses the package +environment of familiar to store data, and is available for any OS. +However, \code{none} requires copying of data to any parallel process, and has a +larger memory footprint.} + \item{\code{server_port}}{(\emph{optional}) Integer indicating the port on which the +socket server or RServe process should communicate. Defaults to port 6311. +Note that ports 0 to 1024 and 49152 to 65535 cannot be used.} + \item{\code{feature_max_fraction_missing}}{(\emph{optional}) Numeric value between \code{0.0} +and \code{0.95} that determines the meximum fraction of missing values that +still allows a feature to be included in the data set. 
All features with a +missing value fraction over this threshold are not processed further. The +default value is \code{0.30}.} + \item{\code{sample_max_fraction_missing}}{(\emph{optional}) Numeric value between \code{0.0} +and \code{0.95} that determines the maximum fraction of missing values that +still allows a sample to be included in the data set. All samples with a +missing value fraction over this threshold are excluded and not processed +further. The default value is \code{0.30}.} + \item{\code{filter_method}}{(\emph{optional}) One or methods used to reduce +dimensionality of the data set by removing irrelevant or poorly +reproducible features. + +Several method are available: +\itemize{ +\item \code{none} (default): None of the features will be filtered. +\item \code{low_variance}: Features with a variance below the +\code{low_var_minimum_variance_threshold} are filtered. This can be useful to +filter, for example, genes that are not differentially expressed. +\item \code{univariate_test}: Features undergo a univariate regression using an +outcome-appropriate regression model. The p-value of the model coefficient +is collected. Features with coefficient p or q-value above the +\code{univariate_test_threshold} are subsequently filtered. +\item \code{robustness}: Features that are not sufficiently robust according to the +intraclass correlation coefficient are filtered. Use of this method +requires that repeated measurements are present in the data set, i.e. there +should be entries for which the sample and cohort identifiers are the same. +} + +More than one method can be used simultaneously. Features with singular +values are always filtered, as these do not contain information.} + \item{\code{univariate_test_threshold}}{(\emph{optional}) Numeric value between \code{1.0} and +\code{0.0} that determines which features are irrelevant and will be filtered by +the \code{univariate_test}. The p or q-values are compared to this threshold. +All features with values above the threshold are filtered. The default +value is \code{0.20}.} + \item{\code{univariate_test_threshold_metric}}{(\emph{optional}) Metric used with the to +compare the \code{univariate_test_threshold} against. The following metrics can +be chosen: +\itemize{ +\item \code{p_value} (default): The unadjusted p-value of each feature is used for +to filter features. +\item \code{q_value}: The q-value (Story, 2002), is used to filter features. Some +data sets may have insufficient samples to compute the q-value. The +\code{qvalue} package must be installed from Bioconductor to use this method. +}} + \item{\code{univariate_test_max_feature_set_size}}{(\emph{optional}) Maximum size of the +feature set after the univariate test. P or q values of features are +compared against the threshold, but if the resulting data set would be +larger than this setting, only the most relevant features up to the desired +feature set size are selected. + +The default value is \code{NULL}, which causes features to be filtered based on +their relevance only.} + \item{\code{low_var_minimum_variance_threshold}}{(required, if used) Numeric value +that determines which features will be filtered by the \code{low_variance} +method. The variance of each feature is computed and compared to the +threshold. If it is below the threshold, the feature is removed. 
+ +This parameter has no default value and should be set if \code{low_variance} is +used.} + \item{\code{low_var_max_feature_set_size}}{(\emph{optional}) Maximum size of the feature +set after filtering features with a low variance. All features are first +compared against \code{low_var_minimum_variance_threshold}. If the resulting +feature set would be larger than specified, only the most strongly varying +features will be selected, up to the desired size of the feature set. + +The default value is \code{NULL}, which causes features to be filtered based on +their variance only.} + \item{\code{robustness_icc_type}}{(\emph{optional}) String indicating the type of +intraclass correlation coefficient (\code{1}, \code{2} or \code{3}) that should be used to +compute robustness for features in repeated measurements. These types +correspond to the types in Shrout and Fleiss (1979). The default value is +\code{1}.} + \item{\code{robustness_threshold_metric}}{(\emph{optional}) String indicating which +specific intraclass correlation coefficient (ICC) metric should be used to +filter features. This should be one of: +\itemize{ +\item \code{icc}: The estimated ICC value itself. +\item \code{icc_low} (default): The estimated lower limit of the 95\% confidence +interval of the ICC, as suggested by Koo and Li (2016). +\item \code{icc_panel}: The estimated ICC value over the panel average, i.e. the ICC +that would be obtained if all repeated measurements were averaged. +\item \code{icc_panel_low}: The estimated lower limit of the 95\% confidence interval +of the panel ICC. +}} + \item{\code{robustness_threshold_value}}{(\emph{optional}) The intraclass correlation +coefficient value that is as threshold. The default value is \code{0.70}.} + \item{\code{transformation_method}}{(\emph{optional}) The transformation method used to +change the distribution of the data to be more normal-like. The following +methods are available: +\itemize{ +\item \code{none}: This disables transformation of features. +\item \code{yeo_johnson} (default): Transformation using the Yeo-Johnson +transformation (Yeo and Johnson, 2000). The algorithm tests various lambda +values and selects the lambda that maximises the log-likelihood. +\item \code{yeo_johnson_trim}: As \code{yeo_johnson}, but based on the set of feature +values where the 5\% lowest and 5\% highest values are discarded. This +reduces the effect of outliers. +\item \code{yeo_johnson_winsor}: As \code{yeo_johnson}, but based on the set of feature +values where the 5\% lowest and 5\% highest values are winsorised. This +reduces the effect of outliers. +\item \code{yeo_johnson_robust}: A robust version of \code{yeo_johnson} after Raymaekers +and Rousseeuw (2021). This method is less sensitive to outliers. +\item \code{box_cox}: Transformation using the Box-Cox transformation (Box and Cox, +1964). Unlike the Yeo-Johnson transformation, the Box-Cox transformation +requires that all data are positive. Features that contain zero or negative +values cannot be transformed using this transformation. The algorithm tests +various lambda values and selects the lambda that maximises the +log-likelihood. +\item \code{box_cox_trim}: As \code{box_cox}, but based on the set of feature values +where the 5\% lowest and 5\% highest values are discarded. This reduces the +effect of outliers. +\item \code{box_cox_winsor}: As \code{box_cox}, but based on the set of feature values +where the 5\% lowest and 5\% highest values are winsorised. This reduces the +effect of outliers. 
+\item \code{box_cox_robust}: A robust version of \code{box_cox} after Raymaekers and +Rousseeuw (2021). This method is less sensitive to outliers. +} + +Only features that contain numerical data are transformed. Transformation +parameters obtained in development data are stored within \code{featureInfo} +objects for later use with validation data sets.} + \item{\code{normalisation_method}}{(\emph{optional}) The normalisation method used to +improve the comparability between numerical features that may have very +different scales. The following normalisation methods can be chosen: +\itemize{ +\item \code{none}: This disables feature normalisation. +\item \code{standardisation}: Features are normalised by subtraction of their mean +values and division by their standard deviations. This causes every feature +to have a center value of 0.0 and standard deviation of 1.0. +\item \code{standardisation_trim}: As \code{standardisation}, but based on the set of +feature values where the 5\% lowest and 5\% highest values are discarded. +This reduces the effect of outliers. +\item \code{standardisation_winsor}: As \code{standardisation}, but based on the set of +feature values where the 5\% lowest and 5\% highest values are winsorised. +This reduces the effect of outliers. +\item \code{standardisation_robust} (default): A robust version of \code{standardisation} +that relies on computing Huber's M-estimators for location and scale. +\item \code{normalisation}: Features are normalised by subtraction of their minimum +values and division by their ranges. This maps all feature values to a +\eqn{[0, 1]} interval. +\item \code{normalisation_trim}: As \code{normalisation}, but based on the set of feature +values where the 5\% lowest and 5\% highest values are discarded. This +reduces the effect of outliers. +\item \code{normalisation_winsor}: As \code{normalisation}, but based on the set of +feature values where the 5\% lowest and 5\% highest values are winsorised. +This reduces the effect of outliers. +\item \code{quantile}: Features are normalised by subtraction of their median values +and division by their interquartile range. +\item \code{mean_centering}: Features are centered by subtracting the mean, but do +not undergo rescaling. +} + +Only features that contain numerical data are normalised. Normalisation +parameters obtained in development data are stored within \code{featureInfo} +objects for later use with validation data sets.} + \item{\code{batch_normalisation_method}}{(\emph{optional}) The method used for batch +normalisation. Available methods are: +\itemize{ +\item \code{none} (default): This disables batch normalisation of features. +\item \code{standardisation}: Features within each batch are normalised by +subtraction of the mean value and division by the standard deviation in +each batch. +\item \code{standardisation_trim}: As \code{standardisation}, but based on the set of +feature values where the 5\% lowest and 5\% highest values are discarded. +This reduces the effect of outliers. +\item \code{standardisation_winsor}: As \code{standardisation}, but based on the set of +feature values where the 5\% lowest and 5\% highest values are winsorised. +This reduces the effect of outliers. +\item \code{standardisation_robust}: A robust version of \code{standardisation} that +relies on computing Huber's M-estimators for location and scale within each +batch.
+\item \code{normalisation}: Features within each batch are normalised by subtraction +of their minimum values and division by their range in each batch. This +maps all feature values in each batch to a \eqn{[0, 1]} interval. +\item \code{normalisation_trim}: As \code{normalisation}, but based on the set of feature +values where the 5\% lowest and 5\% highest values are discarded. This +reduces the effect of outliers. +\item \code{normalisation_winsor}: As \code{normalisation}, but based on the set of +feature values where the 5\% lowest and 5\% highest values are winsorised. +This reduces the effect of outliers. +\item \code{quantile}: Features in each batch are normalised by subtraction of the +median value and division by the interquartile range of each batch. +\item \code{mean_centering}: Features in each batch are centered on 0.0 by +substracting the mean value in each batch, but are not rescaled. +\item \code{combat_parametric}: Batch adjustments using parametric empirical Bayes +(Johnson et al, 2007). \code{combat_p} leads to the same method. +\item \code{combat_non_parametric}: Batch adjustments using non-parametric empirical +Bayes (Johnson et al, 2007). \code{combat_np} and \code{combat} lead to the same +method. Note that we reduced complexity from O(\eqn{n^2}) to O(\eqn{n}) by +only computing batch adjustment parameters for each feature on a subset of +50 randomly selected features, instead of all features. +} + +Only features that contain numerical data are normalised using batch +normalisation. Batch normalisation parameters obtained in development data +are stored within \code{featureInfo} objects for later use with validation data +sets, in case the validation data is from the same batch. + +If validation data contains data from unknown batches, normalisation +parameters are separately determined for these batches. + +Note that for both empirical Bayes methods, the batch effect is assumed to +produce results across the features. This is often true for things such as +gene expressions, but the assumption may not hold generally. + +When performing batch normalisation, it is moreover important to check that +differences between batches or cohorts are not related to the studied +endpoint.} + \item{\code{imputation_method}}{(\emph{optional}) Method used for imputing missing +feature values. Two methods are implemented: +\itemize{ +\item \code{simple}: Simple replacement of a missing value by the median value (for +numeric features) or the modal value (for categorical features). +\item \code{lasso}: Imputation of missing value by lasso regression (using \code{glmnet}) +based on information contained in other features. +} + +\code{simple} imputation precedes \code{lasso} imputation to ensure that any missing +values in predictors required for \code{lasso} regression are resolved. The +\code{lasso} estimate is then used to replace the missing value. + +The default value depends on the number of features in the dataset. If the +number is lower than 100, \code{lasso} is used by default, and \code{simple} +otherwise. + +Only single imputation is performed. Imputation models and parameters are +stored within \code{featureInfo} objects for later use with validation data +sets.} + \item{\code{cluster_method}}{(\emph{optional}) Clustering is performed to identify and +replace redundant features, for example those that are highly correlated. +Such features do not carry much additional information and may be removed +or replaced instead (Park et al., 2007; Tolosi and Lengauer, 2011). 
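As a hedged sketch of the imputation and batch-normalisation settings described above: the data set, the batch and sample column names, and the feature-selection method are hypothetical placeholders.

```r
library(familiar)

# Sketch: explicit imputation and ComBat-style batch normalisation, with the
# batch defined by a hypothetical "cohort" column. All names are placeholders.
precompute_vimp(
  data = my_data,                                   # hypothetical data set
  outcome_type = "binomial",
  outcome_column = "outcome",                       # hypothetical column name
  batch_id_column = "cohort",                       # hypothetical column name
  sample_id_column = "subject_id",                  # hypothetical column name
  fs_method = "mim",                                # illustrative method name
  imputation_method = "simple",
  normalisation_method = "standardisation_robust",
  batch_normalisation_method = "combat_parametric"
)
```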
+ +The cluster method determines the algorithm used to form the clusters. The +following cluster methods are implemented: +\itemize{ +\item \code{none}: No clustering is performed. +\item \code{hclust} (default): Hierarchical agglomerative clustering. If the +\code{fastcluster} package is installed, \code{fastcluster::hclust} is used (Muellner +2013), otherwise \code{stats::hclust} is used. +\item \code{agnes}: Hierarchical clustering using agglomerative nesting (Kaufman and +Rousseeuw, 1990). This algorithm is similar to \code{hclust}, but uses the +\code{cluster::agnes} implementation. +\item \code{diana}: Divisive analysis hierarchical clustering. This method uses +divisive instead of agglomerative clustering (Kaufman and Rousseeuw, 1990). +\code{cluster::diana} is used. +\item \code{pam}: Partioning around medioids. This partitions the data into $k$ +clusters around medioids (Kaufman and Rousseeuw, 1990). $k$ is selected +using the \code{silhouette} metric. \code{pam} is implemented using the +\code{cluster::pam} function. +} + +Clusters and cluster information is stored within \code{featureInfo} objects for +later use with validation data sets. This enables reproduction of the same +clusters as formed in the development data set.} + \item{\code{cluster_linkage_method}}{(\emph{optional}) Linkage method used for +agglomerative clustering in \code{hclust} and \code{agnes}. The following linkage +methods can be used: +\itemize{ +\item \code{average} (default): Average linkage. +\item \code{single}: Single linkage. +\item \code{complete}: Complete linkage. +\item \code{weighted}: Weighted linkage, also known as McQuitty linkage. +\item \code{ward}: Linkage using Ward's minimum variance method. +} + +\code{diana} and \code{pam} do not require a linkage method.} + \item{\code{cluster_cut_method}}{(\emph{optional}) The method used to define the actual +clusters. The following methods can be used: +\itemize{ +\item \code{silhouette}: Clusters are formed based on the silhouette score +(Rousseeuw, 1987). The average silhouette score is computed from 2 to +\eqn{n} clusters, with \eqn{n} the number of features. Clusters are only +formed if the average silhouette exceeds 0.50, which indicates reasonable +evidence for structure. This procedure may be slow if the number of +features is large (>100s). +\item \code{fixed_cut}: Clusters are formed by cutting the hierarchical tree at the +point indicated by the \code{cluster_similarity_threshold}, e.g. where features +in a cluster have an average Spearman correlation of 0.90. \code{fixed_cut} is +only available for \code{agnes}, \code{diana} and \code{hclust}. +\item \code{dynamic_cut}: Dynamic cluster formation using the cutting algorithm in +the \code{dynamicTreeCut} package. This package should be installed to select +this option. \code{dynamic_cut} can only be used with \code{agnes} and \code{hclust}. +} + +The default options are \code{silhouette} for partioning around medioids (\code{pam}) +and \code{fixed_cut} otherwise.} + \item{\code{cluster_similarity_metric}}{(\emph{optional}) Clusters are formed based on +feature similarity. All features are compared in a pair-wise fashion to +compute similarity, for example correlation. The resulting similarity grid +is converted into a distance matrix that is subsequently used for +clustering. The following metrics are supported to compute pairwise +similarities: +\itemize{ +\item \code{mutual_information} (default): normalised mutual information. 
+\item \code{mcfadden_r2}: McFadden's pseudo R-squared (McFadden, 1974). +\item \code{cox_snell_r2}: Cox and Snell's pseudo R-squared (Cox and Snell, 1989). +\item \code{nagelkerke_r2}: Nagelkerke's pseudo R-squared (Nagelkerke, 1991). +\item \code{spearman}: Spearman's rank order correlation. +\item \code{kendall}: Kendall rank correlation. +\item \code{pearson}: Pearson product-moment correlation. +} + +The pseudo R-squared metrics can be used to assess similarity between mixed +pairs of numeric and categorical features, as these are based on the +log-likelihood of regression models. In \code{familiar}, the more informative +feature is used as the predictor and the other feature as the reponse +variable. In numeric-categorical pairs, the numeric feature is considered +to be more informative and is thus used as the predictor. In +categorical-categorical pairs, the feature with most levels is used as the +predictor. + +In case any of the classical correlation coefficients (\code{pearson}, +\code{spearman} and \code{kendall}) are used with (mixed) categorical features, the +categorical features are one-hot encoded and the mean correlation over all +resulting pairs is used as similarity.} + \item{\code{cluster_similarity_threshold}}{(\emph{optional}) The threshold level for +pair-wise similarity that is required to form clusters using \code{fixed_cut}. +This should be a numerical value between 0.0 and 1.0. Note however, that a +reasonable threshold value depends strongly on the similarity metric. The +following are the default values used: +\itemize{ +\item \code{mcfadden_r2} and \code{mutual_information}: \code{0.30} +\item \code{cox_snell_r2} and \code{nagelkerke_r2}: \code{0.75} +\item \code{spearman}, \code{kendall} and \code{pearson}: \code{0.90} +} + +Alternatively, if the \verb{fixed cut} method is not used, this value determines +whether any clustering should be performed, because the data may not +contain highly similar features. The default values in this situation are: +\itemize{ +\item \code{mcfadden_r2} and \code{mutual_information}: \code{0.25} +\item \code{cox_snell_r2} and \code{nagelkerke_r2}: \code{0.40} +\item \code{spearman}, \code{kendall} and \code{pearson}: \code{0.70} +} + +The threshold value is converted to a distance (1-similarity) prior to +cutting hierarchical trees.} + \item{\code{cluster_representation_method}}{(\emph{optional}) Method used to determine +how the information of co-clustered features is summarised and used to +represent the cluster. The following methods can be selected: +\itemize{ +\item \code{best_predictor} (default): The feature with the highest importance +according to univariate regression with the outcome is used to represent +the cluster. +\item \code{medioid}: The feature closest to the cluster center, i.e. the feature +that is most similar to the remaining features in the cluster, is used to +represent the feature. +\item \code{mean}: A meta-feature is generated by averaging the feature values for +all features in a cluster. This method aligns all features so that all +features will be positively correlated prior to averaging. Should a cluster +contain one or more categorical features, the \code{medioid} method will be used +instead, as averaging is not possible. 
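The clustering parameters above can be combined as in the following sketch, which replaces groups of strongly correlated features by their most predictive member; the data set, column names and feature-selection method name are placeholders.

```r
library(familiar)

# Sketch: hierarchical clustering with a fixed cut on Spearman correlation,
# representing each cluster by its best univariate predictor.
precompute_vimp(
  data = my_data,                                    # hypothetical data set
  outcome_type = "multinomial",
  outcome_column = "outcome",                        # hypothetical column name
  fs_method = "mim",                                 # illustrative method name
  cluster_method = "hclust",
  cluster_linkage_method = "average",
  cluster_cut_method = "fixed_cut",
  cluster_similarity_metric = "spearman",
  cluster_similarity_threshold = 0.90,
  cluster_representation_method = "best_predictor"
)
```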
Note that if this method is chosen, +the \code{normalisation_method} parameter should be one of \code{standardisation}, +\code{standardisation_trim}, \code{standardisation_winsor} or \code{quantile}.` +} + +If the \code{pam} cluster method is selected, only the \code{medioid} method can be +used. In that case 1 medioid is used by default.} + \item{\code{parallel_preprocessing}}{(\emph{optional}) Enable parallel processing for the +preprocessing workflow. Defaults to \code{TRUE}. When set to \code{FALSE}, this will +disable the use of parallel processing while preprocessing, regardless of +the settings of the \code{parallel} parameter. \code{parallel_preprocessing} is +ignored if \code{parallel=FALSE}.} + }} +} +\value{ +An \code{experimentData} object. +} +\description{ +Creates data assignment and subsequently extracts feature +information such as normalisation and clustering parameters. +} +\details{ +This is a thin wrapper around \code{summon_familiar}, and functions like +it, but automatically skips computation of variable importance, learning +and subsequent evaluation steps. + +The function returns an \code{experimentData} object, which can be used to +warm-start other experiments by providing it to the \code{experiment_data} +argument. +} diff --git a/man/precompute_vimp.Rd b/man/precompute_vimp.Rd new file mode 100644 index 00000000..051d48b1 --- /dev/null +++ b/man/precompute_vimp.Rd @@ -0,0 +1,684 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Familiar.R +\name{precompute_vimp} +\alias{precompute_vimp} +\title{Pre-compute variable importance} +\usage{ +precompute_vimp( + formula = NULL, + data = NULL, + experiment_data = NULL, + cl = NULL, + experimental_design = "fs+mb", + fs_method = NULL, + fs_method_parameter = NULL, + verbose = TRUE, + ... +) +} +\arguments{ +\item{formula}{An R formula. The formula can only contain feature names and +dot (\code{.}). The \code{*} and \code{+1} operators are not supported as these refer to +columns that are not present in the data set. + +Use of the formula interface is optional.} + +\item{data}{A \code{data.table} object, a \code{data.frame} object, list containing +multiple \code{data.table} or \code{data.frame} objects, or paths to data files. + +\code{data} should be provided if no file paths are provided to the \code{data_files} +argument. If both are provided, only \code{data} will be used. + +All data is expected to be in wide format, and ideally has a sample +identifier (see \code{sample_id_column}), batch identifier (see \code{cohort_column}) +and outcome columns (see \code{outcome_column}). + +In case paths are provided, the data should be stored as \code{csv}, \code{rds} or +\code{RData} files. See documentation for the \code{data_files} argument for more +information.} + +\item{experiment_data}{Experimental data may provided in the form of} + +\item{cl}{Cluster created using the \code{parallel} package. This cluster is then +used to speed up computation through parallelisation. When a cluster is not +provided, parallelisation is performed by setting up a cluster on the local +machine. + +This parameter has no effect if the \code{parallel} argument is set to \code{FALSE}.} + +\item{experimental_design}{(\strong{required}) Defines what the experiment looks +like, e.g. \code{cv(bt(fs,20)+mb,3,2)} for 2 times repeated 3-fold +cross-validation with nested feature selection on 20 bootstraps and +model-building. 
The basic workflow components are: +\itemize{ +\item \code{fs}: (required) feature selection step. +\item \code{mb}: (required) model building step. Though models are not learned by +\code{precompute_vimp}, this element is still required to prevent issues when +using the resulting \code{experimentData} object to warm-start the experiments. +\item \code{ev}: (optional) external validation. If validation batches or cohorts +are present in the dataset (\code{data}), these should be indicated in the +\code{validation_batch_id} argument. +} + +The different components are linked using \code{+}. + +Different subsampling methods can be used in conjunction with the basic +workflow components: +\itemize{ +\item \code{bs(x,n)}: (stratified) .632 bootstrap, with \code{n} the number of +bootstraps. In contrast to \code{bt}, feature pre-processing parameters and +hyperparameter optimisation are conducted on individual bootstraps. +\item \code{bt(x,n)}: (stratified) .632 bootstrap, with \code{n} the number of +bootstraps. Unlike \code{bs} and other subsampling methods, no separate +pre-processing parameters or optimised hyperparameters will be determined +for each bootstrap. +\item \code{cv(x,n,p)}: (stratified) \code{n}-fold cross-validation, repeated \code{p} times. +Pre-processing parameters are determined for each iteration. +\item \code{lv(x)}: leave-one-out cross-validation. Pre-processing parameters are +determined for each iteration. +\item \code{ip(x)}: imbalance partitioning for addressing class imbalances on the +data set. Pre-processing parameters are determined for each partition. The +number of partitions generated depends on the imbalance correction method +(see the \code{imbalance_correction_method} parameter). +} + +As shown in the example above, sampling algorithms can be nested. + +The simplest valid experimental design is \code{fs+mb}. This is the default in +\code{precompute_vimp}, and will compute variable importance over the entire +dataset. + +This argument is ignored if the \code{experiment_data} argument is set.} + +\item{fs_method}{(\strong{required}) Feature selection method to be used for +determining variable importance. \code{familiar} implements various feature +selection methods. Please refer to the vignette on feature selection +methods for more details. + +More than one feature selection method can be chosen. The experiment will +then be repeated for each feature selection method. + +Feature selection methods determine the ranking of features. Actual +selection of features is done by optimising the signature size model +hyperparameter during the hyperparameter optimisation step.} + +\item{fs_method_parameter}{(\emph{optional}) List of lists containing parameters +for feature selection methods. Each sublist should have the name of the +feature selection method it corresponds to. + +Most feature selection methods do not have parameters that can be set. +Please refer to the vignette on feature selection methods for more details. +Note that if the feature selection method is based on a learner (e.g. lasso +regression), hyperparameter optimisation may be performed prior to +assessing variable importance.} + +\item{verbose}{Indicates verbosity of the results.
Default is TRUE, and all +messages and warnings are returned.} + +\item{...}{ + Arguments passed on to \code{\link[=.parse_experiment_settings]{.parse_experiment_settings}}, \code{\link[=.parse_setup_settings]{.parse_setup_settings}}, \code{\link[=.parse_preprocessing_settings]{.parse_preprocessing_settings}}, \code{\link[=.parse_feature_selection_settings]{.parse_feature_selection_settings}} + \describe{ + \item{\code{batch_id_column}}{(\strong{recommended}) Name of the column containing batch +or cohort identifiers. This parameter is required if more than one dataset +is provided, or if external validation is performed. + +In familiar any row of data is organised by four identifiers: +\itemize{ +\item The batch identifier \code{batch_id_column}: This denotes the group to which a +set of samples belongs, e.g. patients from a single study, samples measured +in a batch, etc. The batch identifier is used for batch normalisation, as +well as selection of development and validation datasets. +\item The sample identifier \code{sample_id_column}: This denotes the sample level, +e.g. data from a single individual. Subsets of data, e.g. bootstraps or +cross-validation folds, are created at this level. +\item The series identifier \code{series_id_column}: Indicates measurements on a +single sample that may not share the same outcome value, e.g. a time +series, or the number of cells in a view. +\item The repetition identifier: Indicates repeated measurements in a single +series where any feature values may differ, but the outcome does not. +Repetition identifiers are always implicitly set when multiple entries for +the same series of the same sample in the same batch that share the same +outcome are encountered. +}} + \item{\code{sample_id_column}}{(\strong{recommended}) Name of the column containing +sample or subject identifiers. See \code{batch_id_column} above for more +details. + +If unset, every row will be identified as a single sample.} + \item{\code{series_id_column}}{(\strong{optional}) Name of the column containing series +identifiers, which distinguish between measurements that are part of a +series for a single sample. See \code{batch_id_column} above for more details. + +If unset, rows which share the same batch and sample identifiers but have a +different outcome are assigned unique series identifiers.} + \item{\code{development_batch_id}}{(\emph{optional}) One or more batch or cohort +identifiers to constitute data sets for development. Defaults to all, or +all minus the identifiers in \code{validation_batch_id} for external validation. +Required if external validation is performed and \code{validation_batch_id} is +not provided.} + \item{\code{validation_batch_id}}{(\emph{optional}) One or more batch or cohort +identifiers to constitute data sets for external validation. Defaults to +all data sets except those in \code{development_batch_id} for external +validation, or none if not. Required if \code{development_batch_id} is not +provided.} + \item{\code{outcome_name}}{(\emph{optional}) Name of the modelled outcome. This name will +be used in figures created by \code{familiar}. + +If not set, the column name in \code{outcome_column} will be used for +\code{binomial}, \code{multinomial}, \code{count} and \code{continuous} outcomes. For other +outcomes (\code{survival} and \code{competing_risk}) no default is used.} + \item{\code{outcome_column}}{(\strong{recommended}) Name of the column containing the +outcome of interest. 
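As a small sketch of how the identifier columns described above map onto a wide-format table; all names and values are hypothetical.

```r
library(data.table)

# Hypothetical wide-format input: one row per sample, with explicit batch,
# sample and outcome columns plus two features.
my_data <- data.table(
  cohort     = c("study_A", "study_A", "study_B"),
  subject_id = c("S01", "S02", "S03"),
  outcome    = c("responder", "non_responder", "responder"),
  feature_1  = c(0.8, 1.2, 0.4),
  feature_2  = c(12.1, 9.7, 15.3)
)

# These columns are then declared when calling familiar functions, e.g.
# batch_id_column = "cohort", sample_id_column = "subject_id",
# outcome_column = "outcome", outcome_type = "binomial".
```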
May be identified from a formula, if a formula is +provided as an argument. Otherwise an error is raised. Note that \code{survival} +and \code{competing_risk} outcome types require two columns that +indicate the time-to-event or the time of last follow-up and the event +status.} + \item{\code{outcome_type}}{(\strong{recommended}) Type of outcome found in the outcome +column. The outcome type determines many aspects of the overall process, +e.g. the available feature selection methods and learners, but also the +type of assessments that can be conducted to evaluate the resulting models. +Implemented outcome types are: +\itemize{ +\item \code{binomial}: categorical outcome with 2 levels. +\item \code{multinomial}: categorical outcome with 2 or more levels. +\item \code{count}: Poisson-distributed numeric outcomes. +\item \code{continuous}: general continuous numeric outcomes. +\item \code{survival}: survival outcome for time-to-event data. +} + +If not provided, the algorithm will attempt to obtain outcome_type from the +contents of the outcome column. This may lead to unexpected results, and we +therefore advise providing this information manually. + +Note that \code{competing_risk} survival analyses are not fully supported, and +\code{competing_risk} is currently not a valid choice for \code{outcome_type}.} + \item{\code{class_levels}}{(\emph{optional}) Class levels for \code{binomial} or \code{multinomial} +outcomes. This argument can be used to specify the ordering of levels for +categorical outcomes. These class levels must exactly match the levels +present in the outcome column.} + \item{\code{event_indicator}}{(\strong{recommended}) Indicator for events in \code{survival} +and \code{competing_risk} analyses. \code{familiar} will automatically recognise \code{1}, +\code{true}, \code{t}, \code{y} and \code{yes} as event indicators, including different +capitalisations. If this parameter is set, it replaces the default values.} + \item{\code{censoring_indicator}}{(\strong{recommended}) Indicator for right-censoring in +\code{survival} and \code{competing_risk} analyses. \code{familiar} will automatically +recognise \code{0}, \code{false}, \code{f}, \code{n}, \code{no} as censoring indicators, including +different capitalisations. If this parameter is set, it replaces the +default values.} + \item{\code{competing_risk_indicator}}{(\strong{recommended}) Indicator for competing +risks in \code{competing_risk} analyses. There are no default values, and if +unset, all values other than those specified by the \code{event_indicator} and +\code{censoring_indicator} parameters are considered to indicate competing +risks.} + \item{\code{signature}}{(\emph{optional}) One or more names of feature columns that are +considered part of a specific signature. Features specified here will +always be used for modelling. Ranking from feature selection has no effect +for these features.} + \item{\code{novelty_features}}{(\emph{optional}) One or more names of feature columns +that should be included for the purpose of novelty detection.} + \item{\code{exclude_features}}{(\emph{optional}) Feature columns that will be removed +from the data set. Cannot overlap with features in \code{signature}, +\code{novelty_features} or \code{include_features}.} + \item{\code{include_features}}{(\emph{optional}) Feature columns that are specifically +included in the data set. By default all features are included. Cannot +overlap with \code{exclude_features}, but may overlap \code{signature}.
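For survival outcomes, the event and censoring indicators above might be set as in this sketch, which uses the lung data set from the survival package; the feature-selection method name is illustrative and the recoding of the status column is only one possible convention.

```r
library(familiar)

# Sketch: survival outcome with explicit event/censoring indicators. The
# survival::lung status column uses 1 = censored, 2 = dead.
lung_data <- survival::lung
lung_data$status <- ifelse(lung_data$status == 2, "event", "censored")

precompute_vimp(
  data = lung_data,
  outcome_type = "survival",
  outcome_column = c("time", "status"),   # time-to-event and event status
  event_indicator = "event",
  censoring_indicator = "censored",
  fs_method = "concordance"               # illustrative method name
)
```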
Features in +\code{signature} and \code{novelty_features} are always included. If both +\code{exclude_features} and \code{include_features} are provided, \code{include_features} +takes precedence, provided that there is no overlap between the two.} + \item{\code{reference_method}}{(\emph{optional}) Method used to set reference levels for +categorical features. There are several options: +\itemize{ +\item \code{auto} (default): Categorical features that are not explicitly set by the +user, i.e. columns containing boolean values or characters, use the most +frequent level as reference. Categorical features that are explicitly set, +i.e. as factors, are used as is. +\item \code{always}: Both automatically detected and user-specified categorical +features have the reference level set to the most frequent level. Ordinal +features are not altered, but are used as is. +\item \code{never}: User-specified categorical features are used as is. +Automatically detected categorical features are simply sorted, and the +first level is then used as the reference level. This was the behaviour +prior to familiar version 1.3.0. +}} + \item{\code{imbalance_correction_method}}{(\emph{optional}) Type of method used to +address class imbalances. Available options are: +\itemize{ +\item \code{full_undersampling} (default): All data will be used in an ensemble +fashion. The full minority class will appear in each partition, but +majority classes are undersampled until all data have been used. +\item \code{random_undersampling}: Randomly undersamples majority classes. This is +useful in cases where full undersampling would lead to the formation of +many models due major overrepresentation of the largest class. +} + +This parameter is only used in combination with imbalance partitioning in +the experimental design, and \code{ip} should therefore appear in the string +that defines the design.} + \item{\code{imbalance_n_partitions}}{(\emph{optional}) Number of times random +undersampling should be repeated. 10 undersampled subsets with balanced +classes are formed by default.} + \item{\code{parallel}}{(\emph{optional}) Enable parallel processing. Defaults to \code{TRUE}. +When set to \code{FALSE}, this disables all parallel processing, regardless of +specific parameters such as \code{parallel_preprocessing}. However, when +\code{parallel} is \code{TRUE}, parallel processing of different parts of the +workflow can be disabled by setting respective flags to \code{FALSE}.} + \item{\code{parallel_nr_cores}}{(\emph{optional}) Number of cores available for +parallelisation. Defaults to 2. This setting does nothing if +parallelisation is disabled.} + \item{\code{restart_cluster}}{(\emph{optional}) Restart nodes used for parallel computing +to free up memory prior to starting a parallel process. Note that it does +take time to set up the clusters. Therefore setting this argument to \code{TRUE} +may impact processing speed. This argument is ignored if \code{parallel} is +\code{FALSE} or the cluster was initialised outside of familiar. Default is +\code{FALSE}, which causes the clusters to be initialised only once.} + \item{\code{cluster_type}}{(\emph{optional}) Selection of the cluster type for parallel +processing. Available types are the ones supported by the parallel package +that is part of the base R distribution: \code{psock} (default), \code{fork}, \code{mpi}, +\code{nws}, \code{sock}. 
In addition, \code{none} is available, which also disables +parallel processing.} + \item{\code{backend_type}}{(\emph{optional}) Selection of the backend for distributing +copies of the data. This backend ensures that only a single master copy is +kept in memory. This limits memory usage during parallel processing. + +Several backend options are available, notably \code{socket_server}, and \code{none} +(default). \code{socket_server} is based on the callr package and R sockets, +comes with \code{familiar} and is available for any OS. \code{none} uses the package +environment of familiar to store data, and is available for any OS. +However, \code{none} requires copying of data to any parallel process, and has a +larger memory footprint.} + \item{\code{server_port}}{(\emph{optional}) Integer indicating the port on which the +socket server or RServe process should communicate. Defaults to port 6311. +Note that ports 0 to 1024 and 49152 to 65535 cannot be used.} + \item{\code{feature_max_fraction_missing}}{(\emph{optional}) Numeric value between \code{0.0} +and \code{0.95} that determines the meximum fraction of missing values that +still allows a feature to be included in the data set. All features with a +missing value fraction over this threshold are not processed further. The +default value is \code{0.30}.} + \item{\code{sample_max_fraction_missing}}{(\emph{optional}) Numeric value between \code{0.0} +and \code{0.95} that determines the maximum fraction of missing values that +still allows a sample to be included in the data set. All samples with a +missing value fraction over this threshold are excluded and not processed +further. The default value is \code{0.30}.} + \item{\code{filter_method}}{(\emph{optional}) One or methods used to reduce +dimensionality of the data set by removing irrelevant or poorly +reproducible features. + +Several method are available: +\itemize{ +\item \code{none} (default): None of the features will be filtered. +\item \code{low_variance}: Features with a variance below the +\code{low_var_minimum_variance_threshold} are filtered. This can be useful to +filter, for example, genes that are not differentially expressed. +\item \code{univariate_test}: Features undergo a univariate regression using an +outcome-appropriate regression model. The p-value of the model coefficient +is collected. Features with coefficient p or q-value above the +\code{univariate_test_threshold} are subsequently filtered. +\item \code{robustness}: Features that are not sufficiently robust according to the +intraclass correlation coefficient are filtered. Use of this method +requires that repeated measurements are present in the data set, i.e. there +should be entries for which the sample and cohort identifiers are the same. +} + +More than one method can be used simultaneously. Features with singular +values are always filtered, as these do not contain information.} + \item{\code{univariate_test_threshold}}{(\emph{optional}) Numeric value between \code{1.0} and +\code{0.0} that determines which features are irrelevant and will be filtered by +the \code{univariate_test}. The p or q-values are compared to this threshold. +All features with values above the threshold are filtered. The default +value is \code{0.20}.} + \item{\code{univariate_test_threshold_metric}}{(\emph{optional}) Metric used with the to +compare the \code{univariate_test_threshold} against. 
The following metrics can +be chosen: +\itemize{ +\item \code{p_value} (default): The unadjusted p-value of each feature is used for +to filter features. +\item \code{q_value}: The q-value (Story, 2002), is used to filter features. Some +data sets may have insufficient samples to compute the q-value. The +\code{qvalue} package must be installed from Bioconductor to use this method. +}} + \item{\code{univariate_test_max_feature_set_size}}{(\emph{optional}) Maximum size of the +feature set after the univariate test. P or q values of features are +compared against the threshold, but if the resulting data set would be +larger than this setting, only the most relevant features up to the desired +feature set size are selected. + +The default value is \code{NULL}, which causes features to be filtered based on +their relevance only.} + \item{\code{low_var_minimum_variance_threshold}}{(required, if used) Numeric value +that determines which features will be filtered by the \code{low_variance} +method. The variance of each feature is computed and compared to the +threshold. If it is below the threshold, the feature is removed. + +This parameter has no default value and should be set if \code{low_variance} is +used.} + \item{\code{low_var_max_feature_set_size}}{(\emph{optional}) Maximum size of the feature +set after filtering features with a low variance. All features are first +compared against \code{low_var_minimum_variance_threshold}. If the resulting +feature set would be larger than specified, only the most strongly varying +features will be selected, up to the desired size of the feature set. + +The default value is \code{NULL}, which causes features to be filtered based on +their variance only.} + \item{\code{robustness_icc_type}}{(\emph{optional}) String indicating the type of +intraclass correlation coefficient (\code{1}, \code{2} or \code{3}) that should be used to +compute robustness for features in repeated measurements. These types +correspond to the types in Shrout and Fleiss (1979). The default value is +\code{1}.} + \item{\code{robustness_threshold_metric}}{(\emph{optional}) String indicating which +specific intraclass correlation coefficient (ICC) metric should be used to +filter features. This should be one of: +\itemize{ +\item \code{icc}: The estimated ICC value itself. +\item \code{icc_low} (default): The estimated lower limit of the 95\% confidence +interval of the ICC, as suggested by Koo and Li (2016). +\item \code{icc_panel}: The estimated ICC value over the panel average, i.e. the ICC +that would be obtained if all repeated measurements were averaged. +\item \code{icc_panel_low}: The estimated lower limit of the 95\% confidence interval +of the panel ICC. +}} + \item{\code{robustness_threshold_value}}{(\emph{optional}) The intraclass correlation +coefficient value that is as threshold. The default value is \code{0.70}.} + \item{\code{transformation_method}}{(\emph{optional}) The transformation method used to +change the distribution of the data to be more normal-like. The following +methods are available: +\itemize{ +\item \code{none}: This disables transformation of features. +\item \code{yeo_johnson} (default): Transformation using the Yeo-Johnson +transformation (Yeo and Johnson, 2000). The algorithm tests various lambda +values and selects the lambda that maximises the log-likelihood. +\item \code{yeo_johnson_trim}: As \code{yeo_johnson}, but based on the set of feature +values where the 5\% lowest and 5\% highest values are discarded. 
This +reduces the effect of outliers. +\item \code{yeo_johnson_winsor}: As \code{yeo_johnson}, but based on the set of feature +values where the 5\% lowest and 5\% highest values are winsorised. This +reduces the effect of outliers. +\item \code{yeo_johnson_robust}: A robust version of \code{yeo_johnson} after Raymaekers +and Rousseeuw (2021). This method is less sensitive to outliers. +\item \code{box_cox}: Transformation using the Box-Cox transformation (Box and Cox, +1964). Unlike the Yeo-Johnson transformation, the Box-Cox transformation +requires that all data are positive. Features that contain zero or negative +values cannot be transformed using this transformation. The algorithm tests +various lambda values and selects the lambda that maximises the +log-likelihood. +\item \code{box_cox_trim}: As \code{box_cox}, but based on the set of feature values +where the 5\% lowest and 5\% highest values are discarded. This reduces the +effect of outliers. +\item \code{box_cox_winsor}: As \code{box_cox}, but based on the set of feature values +where the 5\% lowest and 5\% highest values are winsorised. This reduces the +effect of outliers. +\item \code{box_cox_robust}: A robust verson of \code{box_cox} after Raymaekers and +Rousseew (2021). This method is less sensitive to outliers. +} + +Only features that contain numerical data are transformed. Transformation +parameters obtained in development data are stored within \code{featureInfo} +objects for later use with validation data sets.} + \item{\code{normalisation_method}}{(\emph{optional}) The normalisation method used to +improve the comparability between numerical features that may have very +different scales. The following normalisation methods can be chosen: +\itemize{ +\item \code{none}: This disables feature normalisation. +\item \code{standardisation}: Features are normalised by subtraction of their mean +values and division by their standard deviations. This causes every feature +to be have a center value of 0.0 and standard deviation of 1.0. +\item \code{standardisation_trim}: As \code{standardisation}, but based on the set of +feature values where the 5\% lowest and 5\% highest values are discarded. +This reduces the effect of outliers. +\item \code{standardisation_winsor}: As \code{standardisation}, but based on the set of +feature values where the 5\% lowest and 5\% highest values are winsorised. +This reduces the effect of outliers. +\item \code{standardisation_robust} (default): A robust version of \code{standardisation} +that relies on computing Huber's M-estimators for location and scale. +\item \code{normalisation}: Features are normalised by subtraction of their minimum +values and division by their ranges. This maps all feature values to a +\eqn{[0, 1]} interval. +\item \code{normalisation_trim}: As \code{normalisation}, but based on the set of feature +values where the 5\% lowest and 5\% highest values are discarded. This +reduces the effect of outliers. +\item \code{normalisation_winsor}: As \code{normalisation}, but based on the set of +feature values where the 5\% lowest and 5\% highest values are winsorised. +This reduces the effect of outliers. +\item \code{quantile}: Features are normalised by subtraction of their median values +and division by their interquartile range. +\item \code{mean_centering}: Features are centered by substracting the mean, but do +not undergo rescaling. +} + +Only features that contain numerical data are normalised. 
Normalisation +parameters obtained in development data are stored within \code{featureInfo} +objects for later use with validation data sets.} + \item{\code{batch_normalisation_method}}{(\emph{optional}) The method used for batch +normalisation. Available methods are: +\itemize{ +\item \code{none} (default): This disables batch normalisation of features. +\item \code{standardisation}: Features within each batch are normalised by +subtraction of the mean value and division by the standard deviation in +each batch. +\item \code{standardisation_trim}: As \code{standardisation}, but based on the set of +feature values where the 5\% lowest and 5\% highest values are discarded. +This reduces the effect of outliers. +\item \code{standardisation_winsor}: As \code{standardisation}, but based on the set of +feature values where the 5\% lowest and 5\% highest values are winsorised. +This reduces the effect of outliers. +\item \code{standardisation_robust}: A robust version of \code{standardisation} that +relies on computing Huber's M-estimators for location and scale within each +batch. +\item \code{normalisation}: Features within each batch are normalised by subtraction +of their minimum values and division by their range in each batch. This +maps all feature values in each batch to a \eqn{[0, 1]} interval. +\item \code{normalisation_trim}: As \code{normalisation}, but based on the set of feature +values where the 5\% lowest and 5\% highest values are discarded. This +reduces the effect of outliers. +\item \code{normalisation_winsor}: As \code{normalisation}, but based on the set of +feature values where the 5\% lowest and 5\% highest values are winsorised. +This reduces the effect of outliers. +\item \code{quantile}: Features in each batch are normalised by subtraction of the +median value and division by the interquartile range of each batch. +\item \code{mean_centering}: Features in each batch are centered on 0.0 by +substracting the mean value in each batch, but are not rescaled. +\item \code{combat_parametric}: Batch adjustments using parametric empirical Bayes +(Johnson et al, 2007). \code{combat_p} leads to the same method. +\item \code{combat_non_parametric}: Batch adjustments using non-parametric empirical +Bayes (Johnson et al, 2007). \code{combat_np} and \code{combat} lead to the same +method. Note that we reduced complexity from O(\eqn{n^2}) to O(\eqn{n}) by +only computing batch adjustment parameters for each feature on a subset of +50 randomly selected features, instead of all features. +} + +Only features that contain numerical data are normalised using batch +normalisation. Batch normalisation parameters obtained in development data +are stored within \code{featureInfo} objects for later use with validation data +sets, in case the validation data is from the same batch. + +If validation data contains data from unknown batches, normalisation +parameters are separately determined for these batches. + +Note that for both empirical Bayes methods, the batch effect is assumed to +produce results across the features. This is often true for things such as +gene expressions, but the assumption may not hold generally. + +When performing batch normalisation, it is moreover important to check that +differences between batches or cohorts are not related to the studied +endpoint.} + \item{\code{imputation_method}}{(\emph{optional}) Method used for imputing missing +feature values. 
Two methods are implemented: +\itemize{ +\item \code{simple}: Simple replacement of a missing value by the median value (for +numeric features) or the modal value (for categorical features). +\item \code{lasso}: Imputation of missing value by lasso regression (using \code{glmnet}) +based on information contained in other features. +} + +\code{simple} imputation precedes \code{lasso} imputation to ensure that any missing +values in predictors required for \code{lasso} regression are resolved. The +\code{lasso} estimate is then used to replace the missing value. + +The default value depends on the number of features in the dataset. If the +number is lower than 100, \code{lasso} is used by default, and \code{simple} +otherwise. + +Only single imputation is performed. Imputation models and parameters are +stored within \code{featureInfo} objects for later use with validation data +sets.} + \item{\code{cluster_method}}{(\emph{optional}) Clustering is performed to identify and +replace redundant features, for example those that are highly correlated. +Such features do not carry much additional information and may be removed +or replaced instead (Park et al., 2007; Tolosi and Lengauer, 2011). + +The cluster method determines the algorithm used to form the clusters. The +following cluster methods are implemented: +\itemize{ +\item \code{none}: No clustering is performed. +\item \code{hclust} (default): Hierarchical agglomerative clustering. If the +\code{fastcluster} package is installed, \code{fastcluster::hclust} is used (Muellner +2013), otherwise \code{stats::hclust} is used. +\item \code{agnes}: Hierarchical clustering using agglomerative nesting (Kaufman and +Rousseeuw, 1990). This algorithm is similar to \code{hclust}, but uses the +\code{cluster::agnes} implementation. +\item \code{diana}: Divisive analysis hierarchical clustering. This method uses +divisive instead of agglomerative clustering (Kaufman and Rousseeuw, 1990). +\code{cluster::diana} is used. +\item \code{pam}: Partioning around medioids. This partitions the data into $k$ +clusters around medioids (Kaufman and Rousseeuw, 1990). $k$ is selected +using the \code{silhouette} metric. \code{pam} is implemented using the +\code{cluster::pam} function. +} + +Clusters and cluster information is stored within \code{featureInfo} objects for +later use with validation data sets. This enables reproduction of the same +clusters as formed in the development data set.} + \item{\code{cluster_linkage_method}}{(\emph{optional}) Linkage method used for +agglomerative clustering in \code{hclust} and \code{agnes}. The following linkage +methods can be used: +\itemize{ +\item \code{average} (default): Average linkage. +\item \code{single}: Single linkage. +\item \code{complete}: Complete linkage. +\item \code{weighted}: Weighted linkage, also known as McQuitty linkage. +\item \code{ward}: Linkage using Ward's minimum variance method. +} + +\code{diana} and \code{pam} do not require a linkage method.} + \item{\code{cluster_cut_method}}{(\emph{optional}) The method used to define the actual +clusters. The following methods can be used: +\itemize{ +\item \code{silhouette}: Clusters are formed based on the silhouette score +(Rousseeuw, 1987). The average silhouette score is computed from 2 to +\eqn{n} clusters, with \eqn{n} the number of features. Clusters are only +formed if the average silhouette exceeds 0.50, which indicates reasonable +evidence for structure. This procedure may be slow if the number of +features is large (>100s). 
+\item \code{fixed_cut}: Clusters are formed by cutting the hierarchical tree at the +point indicated by the \code{cluster_similarity_threshold}, e.g. where features +in a cluster have an average Spearman correlation of 0.90. \code{fixed_cut} is +only available for \code{agnes}, \code{diana} and \code{hclust}. +\item \code{dynamic_cut}: Dynamic cluster formation using the cutting algorithm in +the \code{dynamicTreeCut} package. This package should be installed to select +this option. \code{dynamic_cut} can only be used with \code{agnes} and \code{hclust}. +} + +The default options are \code{silhouette} for partioning around medioids (\code{pam}) +and \code{fixed_cut} otherwise.} + \item{\code{cluster_similarity_metric}}{(\emph{optional}) Clusters are formed based on +feature similarity. All features are compared in a pair-wise fashion to +compute similarity, for example correlation. The resulting similarity grid +is converted into a distance matrix that is subsequently used for +clustering. The following metrics are supported to compute pairwise +similarities: +\itemize{ +\item \code{mutual_information} (default): normalised mutual information. +\item \code{mcfadden_r2}: McFadden's pseudo R-squared (McFadden, 1974). +\item \code{cox_snell_r2}: Cox and Snell's pseudo R-squared (Cox and Snell, 1989). +\item \code{nagelkerke_r2}: Nagelkerke's pseudo R-squared (Nagelkerke, 1991). +\item \code{spearman}: Spearman's rank order correlation. +\item \code{kendall}: Kendall rank correlation. +\item \code{pearson}: Pearson product-moment correlation. +} + +The pseudo R-squared metrics can be used to assess similarity between mixed +pairs of numeric and categorical features, as these are based on the +log-likelihood of regression models. In \code{familiar}, the more informative +feature is used as the predictor and the other feature as the reponse +variable. In numeric-categorical pairs, the numeric feature is considered +to be more informative and is thus used as the predictor. In +categorical-categorical pairs, the feature with most levels is used as the +predictor. + +In case any of the classical correlation coefficients (\code{pearson}, +\code{spearman} and \code{kendall}) are used with (mixed) categorical features, the +categorical features are one-hot encoded and the mean correlation over all +resulting pairs is used as similarity.} + \item{\code{cluster_similarity_threshold}}{(\emph{optional}) The threshold level for +pair-wise similarity that is required to form clusters using \code{fixed_cut}. +This should be a numerical value between 0.0 and 1.0. Note however, that a +reasonable threshold value depends strongly on the similarity metric. The +following are the default values used: +\itemize{ +\item \code{mcfadden_r2} and \code{mutual_information}: \code{0.30} +\item \code{cox_snell_r2} and \code{nagelkerke_r2}: \code{0.75} +\item \code{spearman}, \code{kendall} and \code{pearson}: \code{0.90} +} + +Alternatively, if the \verb{fixed cut} method is not used, this value determines +whether any clustering should be performed, because the data may not +contain highly similar features. 
The default values in this situation are: +\itemize{ +\item \code{mcfadden_r2} and \code{mutual_information}: \code{0.25} +\item \code{cox_snell_r2} and \code{nagelkerke_r2}: \code{0.40} +\item \code{spearman}, \code{kendall} and \code{pearson}: \code{0.70} +} + +The threshold value is converted to a distance (1-similarity) prior to +cutting hierarchical trees.} + \item{\code{cluster_representation_method}}{(\emph{optional}) Method used to determine +how the information of co-clustered features is summarised and used to +represent the cluster. The following methods can be selected: +\itemize{ +\item \code{best_predictor} (default): The feature with the highest importance +according to univariate regression with the outcome is used to represent +the cluster. +\item \code{medioid}: The feature closest to the cluster center, i.e. the feature +that is most similar to the remaining features in the cluster, is used to +represent the feature. +\item \code{mean}: A meta-feature is generated by averaging the feature values for +all features in a cluster. This method aligns all features so that all +features will be positively correlated prior to averaging. Should a cluster +contain one or more categorical features, the \code{medioid} method will be used +instead, as averaging is not possible. Note that if this method is chosen, +the \code{normalisation_method} parameter should be one of \code{standardisation}, +\code{standardisation_trim}, \code{standardisation_winsor} or \code{quantile}.` +} + +If the \code{pam} cluster method is selected, only the \code{medioid} method can be +used. In that case 1 medioid is used by default.} + \item{\code{parallel_preprocessing}}{(\emph{optional}) Enable parallel processing for the +preprocessing workflow. Defaults to \code{TRUE}. When set to \code{FALSE}, this will +disable the use of parallel processing while preprocessing, regardless of +the settings of the \code{parallel} parameter. \code{parallel_preprocessing} is +ignored if \code{parallel=FALSE}.} + \item{\code{parallel_feature_selection}}{(\emph{optional}) Enable parallel processing for +the feature selection workflow. Defaults to \code{TRUE}. When set to \code{FALSE}, +this will disable the use of parallel processing while performing feature +selection, regardless of the settings of the \code{parallel} parameter. +\code{parallel_feature_selection} is ignored if \code{parallel=FALSE}.} + }} +} +\value{ +An \code{experimentData} object. +} +\description{ +Creates data assignment, extracts feature information and +subsequently computes variable importance. +} +\details{ +This is a thin wrapper around \code{summon_familiar}, and functions like +it, but automatically skips learning and subsequent evaluation steps. + +The function returns an \code{experimentData} object, which can be used to +warm-start other experiments by providing it to the \code{experiment_data} +argument. Variable importance may be retrieved from this object using the +\code{get_vimp_table} and \code{aggregate_vimp_table} methods. 
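Since the details above describe both retrieval of variable importance and warm-starting explicitly, a short usage sketch may help; the feature-selection method name is illustrative.

```r
library(familiar)

# Sketch: pre-compute variable importance on the iris data, then retrieve
# per-run tables and an aggregated ranking from the experimentData object.
vimp_experiment <- precompute_vimp(
  data = iris,
  outcome_type = "multinomial",
  outcome_column = "Species",
  experimental_design = "fs+mb",
  fs_method = "mim"                     # illustrative method name
)

vimp_tables <- get_vimp_table(vimp_experiment)
aggregated_vimp <- aggregate_vimp_table(vimp_experiment)

# The same object can warm-start a full experiment:
# summon_familiar(data = iris, experiment_data = vimp_experiment, ...)
```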
+} +\seealso{ +\code{\link{get_vimp_table}}, \code{\link{aggregate_vimp_table}} +} diff --git a/man/predict-methods.Rd b/man/predict-methods.Rd new file mode 100644 index 00000000..1c644964 --- /dev/null +++ b/man/predict-methods.Rd @@ -0,0 +1,144 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/PredictS4Methods.R +\name{predict} +\alias{predict} +\alias{predict,familiarModel-method} +\alias{predict,familiarEnsemble-method} +\alias{predict,familiarNoveltyDetector-method} +\alias{predict,list-method} +\alias{predict,character-method} +\title{Model predictions for familiar models and model ensembles} +\usage{ +predict(object, ...) + +\S4method{predict}{familiarModel}( + object, + newdata, + type = "default", + time = NULL, + dir_path = NULL, + ensemble_method = "median", + stratification_threshold = NULL, + stratification_method = NULL, + percentiles = NULL, + ... +) + +\S4method{predict}{familiarEnsemble}( + object, + newdata, + type = "default", + time = NULL, + dir_path = NULL, + ensemble_method = "median", + stratification_threshold = NULL, + stratification_method = NULL, + percentiles = NULL, + ... +) + +\S4method{predict}{familiarNoveltyDetector}(object, newdata, type = "novelty", ...) + +\S4method{predict}{list}( + object, + newdata, + type = "default", + time = NULL, + dir_path = NULL, + ensemble_method = "median", + stratification_threshold = NULL, + stratification_method = NULL, + percentiles = NULL, + ... +) + +\S4method{predict}{character}( + object, + newdata, + type = "default", + time = NULL, + dir_path = NULL, + ensemble_method = "median", + stratification_threshold = NULL, + stratification_method = NULL, + percentiles = NULL, + ... +) +} +\arguments{ +\item{object}{A familiar model or ensemble of models that should be used for +prediction. This can also be a path to the ensemble model, one or more paths +to models, or a list of models.} + +\item{...}{to be documented.} + +\item{newdata}{Data to which the models are fitted. \code{familiar} performs checks +on the data to ensure that all features required for fitting the model are +present, and no additional levels are present in categorical features. +Unlike other \code{predict} methods, \code{newdata} cannot be missing in \code{familiar}, +as training data are not stored with the models.} + +\item{type}{Type of prediction made. The following values are directly +supported: +\itemize{ +\item \code{default}: Default prediction, i.e. value estimates for \code{count} and +\code{continuous} outcomes, predicted class probabilities and class for +\code{binomial} and \code{multinomial} and the model response for \code{survival} outcomes. +\item \code{survival_probability}: Predicts survival probabilities at the time +specified by \code{time}. Only applicable to \code{survival} outcomes. Some models may +not allow for predicting survival probabilities based on their response. +\item \code{novelty}: Predicts novelty of each sample, which can be used for +out-of-distribution detection. +\item \code{risk_stratification}: Predicts the strata to which the data belongs. Only +for \code{survival} outcomes. +} + +Other values for type are passed to the fitting method of the actual +underlying model. For example for generalised linear models (\code{glm}) \code{type} +can be \code{link}, \code{response} or \code{terms} as well. 
Some of these model-specific +prediction types may fail to return results if the model has been trimmed.} + +\item{time}{Time at which the response (\code{default}) or survival probability +(\code{survival_probability}) should be predicted for \code{survival} outcomes. Some +models have a response that does not depend on \code{time}, e.g. \code{cox}, whereas +others do, e.g. \code{random_forest}.} + +\item{dir_path}{Path to the folder containing the models. Ensemble objects are +stored with the models detached. In case the models were moved since +creation, \code{dir_path} can be used to specify the current folder. +Alternatively the \code{update_model_dir_path} method can be used to update the +path.} + +\item{ensemble_method}{Method for ensembling predictions from models for the +same sample. Available methods are: +\itemize{ +\item \code{median} (default): Use the median of the predicted values as the ensemble +value for a sample. +\item \code{mean}: Use the mean of the predicted values as the ensemble value for a +sample. +}} + +\item{stratification_threshold}{Threshold value(s) used for stratifying +instances into risk groups. If this parameter is specified, +\code{stratification_method} and any threshold values that come with the model +are ignored, and \code{stratification_threshold} is used instead.} + +\item{stratification_method}{Selects the stratification method from which the +threshold values should be selected. If the model or ensemble of models does +not contain thresholds for the indicated method, an error is returned. In +addition this argument is ignored if a \code{stratification_threshold} is set.} + +\item{percentiles}{Currently unused.} +} +\value{ +A \code{data.table} with predicted values. +} +\description{ +Fits the model or ensemble of models to the data and shows the +result. +} +\details{ +This method is used to predict values for instances specified by the +\code{newdata} using the model or ensemble of models specified by the \code{object} +argument. +} diff --git a/man/set_class_names-familiarCollection-method.Rd b/man/set_class_names-familiarCollection-method.Rd new file mode 100644 index 00000000..597c972c --- /dev/null +++ b/man/set_class_names-familiarCollection-method.Rd @@ -0,0 +1,44 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FamiliarCollection.R +\name{set_class_names,familiarCollection-method} +\alias{set_class_names,familiarCollection-method} +\alias{set_class_names} +\title{Rename outcome classes for plotting and export} +\usage{ +\S4method{set_class_names}{familiarCollection}(x, old = NULL, new = NULL, order = NULL) +} +\arguments{ +\item{x}{A familiarCollection object.} + +\item{old}{(optional) Set of old labels to replace.} + +\item{new}{Set of replacement labels. The number of replacement labels should +be equal to the number of provided old labels or the full number of labels. +If a subset of labels is to be replaced, both \code{old} and \code{new} +should be provided.} + +\item{order}{(optional) Ordered set of replacement labels. This is used to +provide the order in which the labels should be placed, which affects e.g. +levels in a plot. If the ordering is not explicitly provided, the old +ordering is used.} +} +\value{ +A familiarCollection object with updated labels. +} +\description{ +Tabular exports and figures created from a familiarCollection +object can be customised by providing names for outcome classes. 
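A minimal usage sketch for the predict methods documented above (not part of the diff). The model path, `new_data` object and time point are hypothetical; the `type` values are those listed in the documentation.

library(familiar)

# Load a previously trained model or ensemble (hypothetical path).
model <- readRDS("experiment/trained_models/ensemble_1.RDS")

# Default predictions; newdata must always be provided.
predictions <- predict(object = model, newdata = new_data)

# Survival probability at a fixed time point, for survival outcomes.
survival_prob <- predict(
  object = model,
  newdata = new_data,
  type = "survival_probability",
  time = 24
)

# Risk strata based on stratification thresholds stored with the model.
risk_groups <- predict(object = model, newdata = new_data, type = "risk_stratification")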
+} +\details{ +Labels convert the internal naming for class levels to the requested +label at export or when plotting. This enables customisation of class +names. Currently assigned labels can be found using the +\code{get_class_names} method. +} +\seealso{ +\itemize{ +\item \linkS4class{familiarCollection} for information concerning the +familiarCollection class. * \code{\link{get_class_names}} for obtaining +currently assigned class names. +} +} diff --git a/man/set_data_set_names-familiarCollection-method.Rd b/man/set_data_set_names-familiarCollection-method.Rd new file mode 100644 index 00000000..a1e7953e --- /dev/null +++ b/man/set_data_set_names-familiarCollection-method.Rd @@ -0,0 +1,43 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FamiliarCollection.R +\name{set_data_set_names,familiarCollection-method} +\alias{set_data_set_names,familiarCollection-method} +\alias{set_data_set_names} +\title{Name datasets for plotting and export} +\usage{ +\S4method{set_data_set_names}{familiarCollection}(x, old = NULL, new = NULL, order = NULL) +} +\arguments{ +\item{x}{A familiarCollection object.} + +\item{old}{(optional) Set of old labels to replace.} + +\item{new}{Set of replacement labels. The number of replacement labels should +be equal to the number of provided old labels or the full number of labels. +If a subset of labels is to be replaced, both \code{old} and \code{new} +should be provided.} + +\item{order}{(optional) Ordered set of replacement labels. This is used to +provide the order in which the labels should be placed, which affects e.g. +levels in a plot. If the ordering is not explicitly provided, the old +ordering is used.} +} +\value{ +A familiarCollection object with custom names for the data sets. +} +\description{ +Tabular exports and figures created from a familiarCollection +object can be customised by setting data labels. +} +\details{ +Labels convert internal naming of data sets to the requested label +at export or when plotting. Currently assigned labels can be found using +the \code{get_data_set_names} method. +} +\seealso{ +\itemize{ +\item \linkS4class{familiarCollection} for information concerning the +familiarCollection class. * \code{\link{get_data_set_names}} for obtaining +currently assigned labels. +} +} diff --git a/man/set_feature_names-familiarCollection-method.Rd b/man/set_feature_names-familiarCollection-method.Rd new file mode 100644 index 00000000..a1d90106 --- /dev/null +++ b/man/set_feature_names-familiarCollection-method.Rd @@ -0,0 +1,44 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FamiliarCollection.R +\name{set_feature_names,familiarCollection-method} +\alias{set_feature_names,familiarCollection-method} +\alias{set_feature_names} +\title{Rename features for plotting and export} +\usage{ +\S4method{set_feature_names}{familiarCollection}(x, old = NULL, new = NULL, order = NULL) +} +\arguments{ +\item{x}{A familiarCollection object.} + +\item{old}{(optional) Set of old labels to replace.} + +\item{new}{Set of replacement labels. The number of replacement labels should +be equal to the number of provided old labels or the full number of labels. +If a subset of labels is to be replaced, both \code{old} and \code{new} +should be provided.} + +\item{order}{(optional) Ordered set of replacement labels. This is used to +provide the order in which the labels should be placed, which affects e.g. +levels in a plot. 
If the ordering is not explicitly provided, the old +ordering is used.} +} +\value{ +A familiarCollection object with updated labels. +} +\description{ +Tabular exports and figures created from a familiarCollection +object can be customised by providing names for features. +} +\details{ +Labels convert the internal naming for features to the requested +label at export or when plotting. This enables customisation without +redoing the analysis with renamed input data. Currently assigned labels can +be found using the \code{get_feature_names} method. +} +\seealso{ +\itemize{ +\item \linkS4class{familiarCollection} for information concerning the +familiarCollection class. * \code{\link{get_feature_names}} for obtaining +currently assigned feature names. +} +} diff --git a/man/set_fs_method_names-familiarCollection-method.Rd b/man/set_fs_method_names-familiarCollection-method.Rd new file mode 100644 index 00000000..49ede4d5 --- /dev/null +++ b/man/set_fs_method_names-familiarCollection-method.Rd @@ -0,0 +1,46 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FamiliarCollection.R +\name{set_fs_method_names,familiarCollection-method} +\alias{set_fs_method_names,familiarCollection-method} +\alias{set_fs_method_names} +\title{Rename feature selection methods for plotting and export} +\usage{ +\S4method{set_fs_method_names}{familiarCollection}(x, old = NULL, new = NULL, order = NULL) +} +\arguments{ +\item{x}{A familiarCollection object.} + +\item{old}{(optional) Set of old labels to replace.} + +\item{new}{Set of replacement labels. The number of replacement labels should +be equal to the number of provided old labels or the full number of labels. +If a subset of labels is to be replaced, both \code{old} and \code{new} +should be provided.} + +\item{order}{(optional) Ordered set of replacement labels. This is used to +provide the order in which the labels should be placed, which affects e.g. +levels in a plot. If the ordering is not explicitly provided, the old +ordering is used.} +} +\value{ +A familiarCollection object with updated labels. +} +\description{ +Tabular exports and figures created from a familiarCollection +object can be customised by providing names for the feature selection +methods. +} +\details{ +Labels convert the internal naming for feature selection methods to +the requested label at export or when plotting. This enables the use of +more specific naming, e.g. changing \code{mim} to \code{Mutual Information + Maximisation}. Currently assigned labels can be found using the +\code{get_fs_method_names} method. +} +\seealso{ +\itemize{ +\item \linkS4class{familiarCollection} for information concerning the +familiarCollection class. * \code{\link{get_fs_method_names}} for obtaining +currently assigned labels. 
+} +} diff --git a/man/set_learner_names-familiarCollection-method.Rd b/man/set_learner_names-familiarCollection-method.Rd new file mode 100644 index 00000000..2205b10d --- /dev/null +++ b/man/set_learner_names-familiarCollection-method.Rd @@ -0,0 +1,45 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FamiliarCollection.R +\name{set_learner_names,familiarCollection-method} +\alias{set_learner_names,familiarCollection-method} +\alias{set_learner_names} +\title{Rename learners for plotting and export} +\usage{ +\S4method{set_learner_names}{familiarCollection}(x, old = NULL, new = NULL, order = NULL) +} +\arguments{ +\item{x}{A familiarCollection object.} + +\item{old}{(optional) Set of old labels to replace.} + +\item{new}{Set of replacement labels. The number of replacement labels should +be equal to the number of provided old labels or the full number of labels. +If a subset of labels is to be replaced, both \code{old} and \code{new} +should be provided.} + +\item{order}{(optional) Ordered set of replacement labels. This is used to +provide the order in which the labels should be placed, which affects e.g. +levels in a plot. If the ordering is not explicitly provided, the old +ordering is used.} +} +\value{ +A familiarCollection object with custom labels for the learners. +} +\description{ +Tabular exports and figures created from a familiarCollection +object can be customised by providing names for the learners. +} +\details{ +Labels convert the internal naming for learners to the requested +label at export or when plotting. This enables the use of more specific +naming, e.g. changing \code{random_forest_rfsrc} to \code{Random Forest}. +Currently assigned labels can be found using the \code{get_learner_names} +method. +} +\seealso{ +\itemize{ +\item \linkS4class{familiarCollection} for information concerning the +familiarCollection class. * \code{\link{get_learner_names}} for obtaining +currently assigned labels. +} +} diff --git a/man/set_object_name-familiarData-method.Rd b/man/set_object_name-familiarData-method.Rd new file mode 100644 index 00000000..fb2c86ae --- /dev/null +++ b/man/set_object_name-familiarData-method.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FamiliarData.R +\name{set_object_name,familiarData-method} +\alias{set_object_name,familiarData-method} +\title{Set the name of a \code{familiarData} object.} +\usage{ +\S4method{set_object_name}{familiarData}(x, new = NULL) +} +\arguments{ +\item{x}{A \code{familiarData} object.} +} +\value{ +A \code{familiarData} object with a generated or a provided name. +} +\description{ +Set the \code{name} slot using the object name. +} +\keyword{internal} diff --git a/man/set_object_name-familiarEnsemble-method.Rd b/man/set_object_name-familiarEnsemble-method.Rd new file mode 100644 index 00000000..64ccb19f --- /dev/null +++ b/man/set_object_name-familiarEnsemble-method.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FamiliarEnsemble.R +\name{set_object_name,familiarEnsemble-method} +\alias{set_object_name,familiarEnsemble-method} +\title{Set the name of a \code{familiarEnsemble} object.} +\usage{ +\S4method{set_object_name}{familiarEnsemble}(x, new = NULL) +} +\arguments{ +\item{x}{A \code{familiarEnsemble} object.} +} +\value{ +A \code{familiarEnsemble} object with a generated or a provided name. +} +\description{ +Set the \code{name} slot using the object name. 
+} +\keyword{internal} diff --git a/man/set_object_name-familiarModel-method.Rd b/man/set_object_name-familiarModel-method.Rd new file mode 100644 index 00000000..fe2f4e9c --- /dev/null +++ b/man/set_object_name-familiarModel-method.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FamiliarModel.R +\name{set_object_name,familiarModel-method} +\alias{set_object_name,familiarModel-method} +\title{Set the name of a \code{familiarModel} object.} +\usage{ +\S4method{set_object_name}{familiarModel}(x, new = NULL) +} +\arguments{ +\item{x}{A \code{familiarModel} object.} +} +\value{ +A \code{familiarModel} object with a generated or a provided name. +} +\description{ +Set the \code{name} slot using the object name. +} +\keyword{internal} diff --git a/man/set_risk_group_names-familiarCollection-method.Rd b/man/set_risk_group_names-familiarCollection-method.Rd new file mode 100644 index 00000000..de665ed9 --- /dev/null +++ b/man/set_risk_group_names-familiarCollection-method.Rd @@ -0,0 +1,45 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FamiliarCollection.R +\name{set_risk_group_names,familiarCollection-method} +\alias{set_risk_group_names,familiarCollection-method} +\alias{set_risk_group_names} +\title{Rename risk groups for plotting and export} +\usage{ +\S4method{set_risk_group_names}{familiarCollection}(x, old = NULL, new = NULL, order = NULL) +} +\arguments{ +\item{x}{A familiarCollection object.} + +\item{old}{(optional) Set of old labels to replace.} + +\item{new}{Set of replacement labels. The number of replacement labels should +be equal to the number of provided old labels or the full number of labels. +If a subset of labels is to be replaced, both \code{old} and \code{new} +should be provided.} + +\item{order}{(optional) Ordered set of replacement labels. This is used to +provide the order in which the labels should be placed, which affects e.g. +levels in a plot. If the ordering is not explicitly provided, the old +ordering is used.} +} +\value{ +A familiarCollection object with updated labels. +} +\description{ +Tabular exports and figures created from a familiarCollection +object can be customised by providing names for risk groups in survival +analysis. +} +\details{ +Labels convert the internal naming for risk groups to the requested +label at export or when plotting. This enables customisation of risk group +names. Currently assigned labels can be found using the +\code{get_risk_group_names} method. +} +\seealso{ +\itemize{ +\item \linkS4class{familiarCollection} for information concerning the +familiarCollection class. * \code{\link{get_risk_group_names}} for obtaining +currently assigned risk group labels. +} +} diff --git a/man/summary-methods.Rd b/man/summary-methods.Rd new file mode 100644 index 00000000..262221e8 --- /dev/null +++ b/man/summary-methods.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FamiliarModel.R +\name{summary} +\alias{summary} +\alias{summary,familiarModel-method} +\title{Model summaries} +\usage{ +summary(object, ...) + +\S4method{summary}{familiarModel}(object, ...) +} +\arguments{ +\item{object}{a familiarModel object} + +\item{...}{additional arguments passed to \code{summary} methods for the underlying +model, when available.} +} +\value{ +Depends on underlying model. See the documentation for the particular +models. +} +\description{ +\code{summary} produces model summaries. 
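A short sketch of the renaming methods documented above for familiarCollection objects (not part of the diff). The collection path, label values and risk group names are hypothetical; the getter and setter functions are those referenced in the documentation.

library(familiar)

# A familiarCollection produced by an earlier experiment (hypothetical path).
collection <- readRDS("experiment/familiar_collections/validation_data.RDS")

# Inspect the currently assigned labels.
get_class_names(collection)
get_learner_names(collection)

# Replace a subset of labels; both old and new are then required.
collection <- set_class_names(collection, old = "1", new = "Responder")
collection <- set_learner_names(
  collection,
  old = "random_forest_rfsrc",
  new = "Random Forest"
)

# Reorder risk groups so that plots use the intended level ordering.
collection <- set_risk_group_names(
  collection,
  order = c("low_risk", "moderate_risk", "high_risk")
)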
+}
+\details{
+This method extends the \code{summary} S3 method. For some models
+\code{summary} requires information that is trimmed from the model. In this case
+a copy of summary data is stored with the model, and returned.
+} diff --git a/man/summon_familiar.Rd b/man/summon_familiar.Rd new file mode 100644 index 00000000..3c7541d3 --- /dev/null +++ b/man/summon_familiar.Rd @@ -0,0 +1,1465 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/Familiar.R
+\name{summon_familiar}
+\alias{summon_familiar}
+\title{Perform end-to-end machine learning and data analysis}
+\usage{
+summon_familiar(
+ formula = NULL,
+ data = NULL,
+ experiment_data = NULL,
+ cl = NULL,
+ config = NULL,
+ config_id = 1,
+ verbose = TRUE,
+ .stop_after = "evaluation",
+ ...
+)
+}
+\arguments{
+\item{formula}{An R formula. The formula can only contain feature names and
+dot (\code{.}). The \code{*} and \code{+1} operators are not supported as these refer to
+columns that are not present in the data set.
+
+Use of the formula interface is optional.}
+
+\item{data}{A \code{data.table} object, a \code{data.frame} object, a list containing
+multiple \code{data.table} or \code{data.frame} objects, or paths to data files.
+
+\code{data} should be provided if no file paths are provided to the \code{data_files}
+argument. If both are provided, only \code{data} will be used.
+
+All data is expected to be in wide format, and ideally has a sample
+identifier (see \code{sample_id_column}), batch identifier (see \code{cohort_column})
+and outcome columns (see \code{outcome_column}).
+
+In case paths are provided, the data should be stored as \code{csv}, \code{rds} or
+\code{RData} files. See documentation for the \code{data_files} argument for more
+information.}
+
+\item{experiment_data}{Experimental data may be provided in the form of an
+\code{experimentData} object, which can be used to warm-start an experiment with
+previously computed data assignment, feature information and variable
+importance.}
+
+\item{cl}{Cluster created using the \code{parallel} package. This cluster is then
+used to speed up computation through parallelisation. When a cluster is not
+provided, parallelisation is performed by setting up a cluster on the local
+machine.
+
+This parameter has no effect if the \code{parallel} argument is set to \code{FALSE}.}
+
+\item{config}{List containing configuration parameters, or path to an \code{xml}
+file containing these parameters. An empty configuration file can be obtained
+using the \code{get_xml_config} function.
+
+All parameters can also be set programmatically. These supersede any
+arguments derived from the configuration list.}
+
+\item{config_id}{Identifier for the configuration in case the list or \code{xml}
+table indicated by \code{config} contains more than one set of configurations.}
+
+\item{verbose}{Indicates verbosity of the results.
Default is TRUE, and all +messages and warnings are returned.} + +\item{.stop_after}{Variable for internal use.} + +\item{...}{ + Arguments passed on to \code{\link[=.parse_file_paths]{.parse_file_paths}}, \code{\link[=.parse_experiment_settings]{.parse_experiment_settings}}, \code{\link[=.parse_setup_settings]{.parse_setup_settings}}, \code{\link[=.parse_preprocessing_settings]{.parse_preprocessing_settings}}, \code{\link[=.parse_feature_selection_settings]{.parse_feature_selection_settings}}, \code{\link[=.parse_model_development_settings]{.parse_model_development_settings}}, \code{\link[=.parse_hyperparameter_optimisation_settings]{.parse_hyperparameter_optimisation_settings}}, \code{\link[=.parse_evaluation_settings]{.parse_evaluation_settings}} + \describe{ + \item{\code{project_dir}}{(\emph{optional}) Path to the project directory. \code{familiar} +checks if the directory indicated by \code{experiment_dir} and data files in +\code{data_file} are relative to the \code{project_dir}.} + \item{\code{experiment_dir}}{(\strong{recommended}) Path to the directory where all +intermediate and final results produced by \code{familiar} are written to. + +The \code{experiment_dir} can be a path relative to \code{project_dir} or an absolute +path. + +In case no project directory is provided and the experiment directory is +not on an absolute path, a directory will be created in the temporary R +directory indicated by \code{tempdir()}. This directory is deleted after closing +the R session or once data analysis has finished. All information will be +lost afterwards. Hence, it is recommended to provide either +\code{experiment_dir} as an absolute path, or provide both \code{project_dir} and +\code{experiment_dir}.} + \item{\code{data_file}}{(\emph{optional}) Path to files containing data that should be +analysed. The paths can be relative to \code{project_dir} or absolute paths. An +error will be raised if the file cannot be found. + +The following types of data are supported. +\itemize{ +\item \code{csv} files containing column headers on the first row, and samples per +row. \code{csv} files are read using \code{data.table::fread}. +\item \code{rds} files that contain a \code{data.table} or \code{data.frame} object. \code{rds} +files are imported using \code{base::readRDS}. +\item \code{RData} files that contain a single \code{data.table} or \code{data.frame} object. +\code{RData} files are imported using \code{base::load}. +} + +All data are expected in wide format, with sample information organised +row-wise. + +More than one data file can be provided. \code{familiar} will try to combine +data files based on column names and identifier columns. + +Alternatively, data can be provided using the \code{data} argument. These data +are expected to be \code{data.frame} or \code{data.table} objects or paths to data +files. The latter are handled in the same way as file paths provided to +\code{data_file}.} + \item{\code{batch_id_column}}{(\strong{recommended}) Name of the column containing batch +or cohort identifiers. This parameter is required if more than one dataset +is provided, or if external validation is performed. + +In familiar any row of data is organised by four identifiers: +\itemize{ +\item The batch identifier \code{batch_id_column}: This denotes the group to which a +set of samples belongs, e.g. patients from a single study, samples measured +in a batch, etc. The batch identifier is used for batch normalisation, as +well as selection of development and validation datasets. 
+\item The sample identifier \code{sample_id_column}: This denotes the sample level, +e.g. data from a single individual. Subsets of data, e.g. bootstraps or +cross-validation folds, are created at this level. +\item The series identifier \code{series_id_column}: Indicates measurements on a +single sample that may not share the same outcome value, e.g. a time +series, or the number of cells in a view. +\item The repetition identifier: Indicates repeated measurements in a single +series where any feature values may differ, but the outcome does not. +Repetition identifiers are always implicitly set when multiple entries for +the same series of the same sample in the same batch that share the same +outcome are encountered. +}} + \item{\code{sample_id_column}}{(\strong{recommended}) Name of the column containing +sample or subject identifiers. See \code{batch_id_column} above for more +details. + +If unset, every row will be identified as a single sample.} + \item{\code{series_id_column}}{(\strong{optional}) Name of the column containing series +identifiers, which distinguish between measurements that are part of a +series for a single sample. See \code{batch_id_column} above for more details. + +If unset, rows which share the same batch and sample identifiers but have a +different outcome are assigned unique series identifiers.} + \item{\code{development_batch_id}}{(\emph{optional}) One or more batch or cohort +identifiers to constitute data sets for development. Defaults to all, or +all minus the identifiers in \code{validation_batch_id} for external validation. +Required if external validation is performed and \code{validation_batch_id} is +not provided.} + \item{\code{validation_batch_id}}{(\emph{optional}) One or more batch or cohort +identifiers to constitute data sets for external validation. Defaults to +all data sets except those in \code{development_batch_id} for external +validation, or none if not. Required if \code{development_batch_id} is not +provided.} + \item{\code{outcome_name}}{(\emph{optional}) Name of the modelled outcome. This name will +be used in figures created by \code{familiar}. + +If not set, the column name in \code{outcome_column} will be used for +\code{binomial}, \code{multinomial}, \code{count} and \code{continuous} outcomes. For other +outcomes (\code{survival} and \code{competing_risk}) no default is used.} + \item{\code{outcome_column}}{(\strong{recommended}) Name of the column containing the +outcome of interest. May be identified from a formula, if a formula is +provided as an argument. Otherwise an error is raised. Note that \code{survival} +and \code{competing_risk} outcome type outcomes require two columns that +indicate the time-to-event or the time of last follow-up and the event +status.} + \item{\code{outcome_type}}{(\strong{recommended}) Type of outcome found in the outcome +column. The outcome type determines many aspects of the overall process, +e.g. the available feature selection methods and learners, but also the +type of assessments that can be conducted to evaluate the resulting models. +Implemented outcome types are: +\itemize{ +\item \code{binomial}: categorical outcome with 2 levels. +\item \code{multinomial}: categorical outcome with 2 or more levels. +\item \code{count}: Poisson-distributed numeric outcomes. +\item \code{continuous}: general continuous numeric outcomes. +\item \code{survival}: survival outcome for time-to-event data. 
+} + +If not provided, the algorithm will attempt to obtain outcome_type from +contents of the outcome column. This may lead to unexpected results, and we +therefore advise to provide this information manually. + +Note that \code{competing_risk} survival analysis are not fully supported, and +is currently not a valid choice for \code{outcome_type}.} + \item{\code{class_levels}}{(\emph{optional}) Class levels for \code{binomial} or \code{multinomial} +outcomes. This argument can be used to specify the ordering of levels for +categorical outcomes. These class levels must exactly match the levels +present in the outcome column.} + \item{\code{event_indicator}}{(\strong{recommended}) Indicator for events in \code{survival} +and \code{competing_risk} analyses. \code{familiar} will automatically recognise \code{1}, +\code{true}, \code{t}, \code{y} and \code{yes} as event indicators, including different +capitalisations. If this parameter is set, it replaces the default values.} + \item{\code{censoring_indicator}}{(\strong{recommended}) Indicator for right-censoring in +\code{survival} and \code{competing_risk} analyses. \code{familiar} will automatically +recognise \code{0}, \code{false}, \code{f}, \code{n}, \code{no} as censoring indicators, including +different capitalisations. If this parameter is set, it replaces the +default values.} + \item{\code{competing_risk_indicator}}{(\strong{recommended}) Indicator for competing +risks in \code{competing_risk} analyses. There are no default values, and if +unset, all values other than those specified by the \code{event_indicator} and +\code{censoring_indicator} parameters are considered to indicate competing +risks.} + \item{\code{signature}}{(\emph{optional}) One or more names of feature columns that are +considered part of a specific signature. Features specified here will +always be used for modelling. Ranking from feature selection has no effect +for these features.} + \item{\code{novelty_features}}{(\emph{optional}) One or more names of feature columns +that should be included for the purpose of novelty detection.} + \item{\code{exclude_features}}{(\emph{optional}) Feature columns that will be removed +from the data set. Cannot overlap with features in \code{signature}, +\code{novelty_features} or \code{include_features}.} + \item{\code{include_features}}{(\emph{optional}) Feature columns that are specifically +included in the data set. By default all features are included. Cannot +overlap with \code{exclude_features}, but may overlap \code{signature}. Features in +\code{signature} and \code{novelty_features} are always included. If both +\code{exclude_features} and \code{include_features} are provided, \code{include_features} +takes precedence, provided that there is no overlap between the two.} + \item{\code{reference_method}}{(\emph{optional}) Method used to set reference levels for +categorical features. There are several options: +\itemize{ +\item \code{auto} (default): Categorical features that are not explicitly set by the +user, i.e. columns containing boolean values or characters, use the most +frequent level as reference. Categorical features that are explicitly set, +i.e. as factors, are used as is. +\item \code{always}: Both automatically detected and user-specified categorical +features have the reference level set to the most frequent level. Ordinal +features are not altered, but are used as is. +\item \code{never}: User-specified categorical features are used as is. 
+Automatically detected categorical features are simply sorted, and the +first level is then used as the reference level. This was the behaviour +prior to familiar version 1.3.0. +}} + \item{\code{experimental_design}}{(\strong{required}) Defines what the experiment looks +like, e.g. \code{cv(bt(fs,20)+mb,3,2)+ev} for 2 times repeated 3-fold +cross-validation with nested feature selection on 20 bootstraps and +model-building, and external validation. The basic workflow components are: +\itemize{ +\item \code{fs}: (required) feature selection step. +\item \code{mb}: (required) model building step. +\item \code{ev}: (optional) external validation. Note that internal validation due +to subsampling will always be conducted if the subsampling methods create +any validation data sets. +} + +The different components are linked using \code{+}. + +Different subsampling methods can be used in conjunction with the basic +workflow components: +\itemize{ +\item \code{bs(x,n)}: (stratified) .632 bootstrap, with \code{n} the number of +bootstraps. In contrast to \code{bt}, feature pre-processing parameters and +hyperparameter optimisation are conducted on individual bootstraps. +\item \code{bt(x,n)}: (stratified) .632 bootstrap, with \code{n} the number of +bootstraps. Unlike \code{bs} and other subsampling methods, no separate +pre-processing parameters or optimised hyperparameters will be determined +for each bootstrap. +\item \code{cv(x,n,p)}: (stratified) \code{n}-fold cross-validation, repeated \code{p} times. +Pre-processing parameters are determined for each iteration. +\item \code{lv(x)}: leave-one-out-cross-validation. Pre-processing parameters are +determined for each iteration. +\item \code{ip(x)}: imbalance partitioning for addressing class imbalances on the +data set. Pre-processing parameters are determined for each partition. The +number of partitions generated depends on the imbalance correction method +(see the \code{imbalance_correction_method} parameter). Imbalance partitioning +does not generate validation sets. +} + +As shown in the example above, sampling algorithms can be nested. + +The simplest valid experimental design is \code{fs+mb}, which corresponds to a +TRIPOD type 1a analysis. Type 1b analyses are only possible using +bootstraps, e.g. \code{bt(fs+mb,100)}. Type 2a analyses can be conducted using +cross-validation, e.g. \code{cv(bt(fs,100)+mb,10,1)}. Depending on the origin of +the external validation data, designs such as \code{fs+mb+ev} or +\code{cv(bt(fs,100)+mb,10,1)+ev} constitute type 2b or type 3 analyses. Type 4 +analyses can be done by obtaining one or more \code{familiarModel} objects from +others and applying them to your own data set. + +Alternatively, the \code{experimental_design} parameter may be used to provide a +path to a file containing iterations, which is named \verb{####_iterations.RDS} +by convention. This path can be relative to the directory of the current +experiment (\code{experiment_dir}), or an absolute path. The absolute path may +thus also point to a file from a different experiment.} + \item{\code{imbalance_correction_method}}{(\emph{optional}) Type of method used to +address class imbalances. Available options are: +\itemize{ +\item \code{full_undersampling} (default): All data will be used in an ensemble +fashion. The full minority class will appear in each partition, but +majority classes are undersampled until all data have been used. +\item \code{random_undersampling}: Randomly undersamples majority classes. 
This is
+useful in cases where full undersampling would lead to the formation of
+many models due to major overrepresentation of the largest class.
+}
+
+This parameter is only used in combination with imbalance partitioning in
+the experimental design, and \code{ip} should therefore appear in the string
+that defines the design.}
+ \item{\code{imbalance_n_partitions}}{(\emph{optional}) Number of times random
+undersampling should be repeated. 10 undersampled subsets with balanced
+classes are formed by default.}
+ \item{\code{parallel}}{(\emph{optional}) Enable parallel processing. Defaults to \code{TRUE}.
+When set to \code{FALSE}, this disables all parallel processing, regardless of
+specific parameters such as \code{parallel_preprocessing}. However, when
+\code{parallel} is \code{TRUE}, parallel processing of different parts of the
+workflow can be disabled by setting respective flags to \code{FALSE}.}
+ \item{\code{parallel_nr_cores}}{(\emph{optional}) Number of cores available for
+parallelisation. Defaults to 2. This setting does nothing if
+parallelisation is disabled.}
+ \item{\code{restart_cluster}}{(\emph{optional}) Restart nodes used for parallel computing
+to free up memory prior to starting a parallel process. Note that it does
+take time to set up the clusters. Therefore, setting this argument to \code{TRUE}
+may impact processing speed. This argument is ignored if \code{parallel} is
+\code{FALSE} or the cluster was initialised outside of familiar. Default is
+\code{FALSE}, which causes the clusters to be initialised only once.}
+ \item{\code{cluster_type}}{(\emph{optional}) Selection of the cluster type for parallel
+processing. Available types are the ones supported by the parallel package
+that is part of the base R distribution: \code{psock} (default), \code{fork}, \code{mpi},
+\code{nws}, \code{sock}. In addition, \code{none} is available, which also disables
+parallel processing.}
+ \item{\code{backend_type}}{(\emph{optional}) Selection of the backend for distributing
+copies of the data. This backend ensures that only a single master copy is
+kept in memory. This limits memory usage during parallel processing.
+
+Several backend options are available, notably \code{socket_server}, and \code{none}
+(default). \code{socket_server} is based on the callr package and R sockets,
+comes with \code{familiar} and is available for any OS. \code{none} uses the package
+environment of familiar to store data, and is available for any OS.
+However, \code{none} requires copying of data to any parallel process, and has a
+larger memory footprint.}
+ \item{\code{server_port}}{(\emph{optional}) Integer indicating the port on which the
+socket server or RServe process should communicate. Defaults to port 6311.
+Note that ports 0 to 1024 and 49152 to 65535 cannot be used.}
+ \item{\code{feature_max_fraction_missing}}{(\emph{optional}) Numeric value between \code{0.0}
+and \code{0.95} that determines the maximum fraction of missing values that
+still allows a feature to be included in the data set. All features with a
+missing value fraction over this threshold are not processed further. The
+default value is \code{0.30}.}
+ \item{\code{sample_max_fraction_missing}}{(\emph{optional}) Numeric value between \code{0.0}
+and \code{0.95} that determines the maximum fraction of missing values that
+still allows a sample to be included in the data set. All samples with a
+missing value fraction over this threshold are excluded and not processed
+further.
The default value is \code{0.30}.}
+ \item{\code{filter_method}}{(\emph{optional}) One or more methods used to reduce
+dimensionality of the data set by removing irrelevant or poorly
+reproducible features.
+
+Several methods are available:
+\itemize{
+\item \code{none} (default): None of the features will be filtered.
+\item \code{low_variance}: Features with a variance below the
+\code{low_var_minimum_variance_threshold} are filtered. This can be useful to
+filter, for example, genes that are not differentially expressed.
+\item \code{univariate_test}: Features undergo a univariate regression using an
+outcome-appropriate regression model. The p-value of the model coefficient
+is collected. Features with coefficient p or q-value above the
+\code{univariate_test_threshold} are subsequently filtered.
+\item \code{robustness}: Features that are not sufficiently robust according to the
+intraclass correlation coefficient are filtered. Use of this method
+requires that repeated measurements are present in the data set, i.e. there
+should be entries for which the sample and cohort identifiers are the same.
+}
+
+More than one method can be used simultaneously. Features with singular
+values are always filtered, as these do not contain information.}
+ \item{\code{univariate_test_threshold}}{(\emph{optional}) Numeric value between \code{1.0} and
+\code{0.0} that determines which features are irrelevant and will be filtered by
+the \code{univariate_test}. The p or q-values are compared to this threshold.
+All features with values above the threshold are filtered. The default
+value is \code{0.20}.}
+ \item{\code{univariate_test_threshold_metric}}{(\emph{optional}) Metric that is compared
+against the \code{univariate_test_threshold}. The following metrics can
+be chosen:
+\itemize{
+\item \code{p_value} (default): The unadjusted p-value of each feature is used
+to filter features.
+\item \code{q_value}: The q-value (Storey, 2002) is used to filter features. Some
+data sets may have insufficient samples to compute the q-value. The
+\code{qvalue} package must be installed from Bioconductor to use this method.
+}}
+ \item{\code{univariate_test_max_feature_set_size}}{(\emph{optional}) Maximum size of the
+feature set after the univariate test. P or q-values of features are
+compared against the threshold, but if the resulting data set would be
+larger than this setting, only the most relevant features up to the desired
+feature set size are selected.
+
+The default value is \code{NULL}, which causes features to be filtered based on
+their relevance only.}
+ \item{\code{low_var_minimum_variance_threshold}}{(required, if used) Numeric value
+that determines which features will be filtered by the \code{low_variance}
+method. The variance of each feature is computed and compared to the
+threshold. If it is below the threshold, the feature is removed.
+
+This parameter has no default value and should be set if \code{low_variance} is
+used.}
+ \item{\code{low_var_max_feature_set_size}}{(\emph{optional}) Maximum size of the feature
+set after filtering features with a low variance. All features are first
+compared against \code{low_var_minimum_variance_threshold}. If the resulting
+feature set would be larger than specified, only the most strongly varying
+features will be selected, up to the desired size of the feature set.
+
+The default value is \code{NULL}, which causes features to be filtered based on
+their variance only.}
+ \item{\code{robustness_icc_type}}{(\emph{optional}) String indicating the type of
+intraclass correlation coefficient (\code{1}, \code{2} or \code{3}) that should be used to
+compute robustness for features in repeated measurements. These types
+correspond to the types in Shrout and Fleiss (1979). The default value is
+\code{1}.}
+ \item{\code{robustness_threshold_metric}}{(\emph{optional}) String indicating which
+specific intraclass correlation coefficient (ICC) metric should be used to
+filter features. This should be one of:
+\itemize{
+\item \code{icc}: The estimated ICC value itself.
+\item \code{icc_low} (default): The estimated lower limit of the 95\% confidence
+interval of the ICC, as suggested by Koo and Li (2016).
+\item \code{icc_panel}: The estimated ICC value over the panel average, i.e. the ICC
+that would be obtained if all repeated measurements were averaged.
+\item \code{icc_panel_low}: The estimated lower limit of the 95\% confidence interval
+of the panel ICC.
+}}
+ \item{\code{robustness_threshold_value}}{(\emph{optional}) The intraclass correlation
+coefficient value that is used as the threshold. The default value is \code{0.70}.}
+ \item{\code{transformation_method}}{(\emph{optional}) The transformation method used to
+change the distribution of the data to be more normal-like. The following
+methods are available:
+\itemize{
+\item \code{none}: This disables transformation of features.
+\item \code{yeo_johnson} (default): Transformation using the Yeo-Johnson
+transformation (Yeo and Johnson, 2000). The algorithm tests various lambda
+values and selects the lambda that maximises the log-likelihood.
+\item \code{yeo_johnson_trim}: As \code{yeo_johnson}, but based on the set of feature
+values where the 5\% lowest and 5\% highest values are discarded. This
+reduces the effect of outliers.
+\item \code{yeo_johnson_winsor}: As \code{yeo_johnson}, but based on the set of feature
+values where the 5\% lowest and 5\% highest values are winsorised. This
+reduces the effect of outliers.
+\item \code{yeo_johnson_robust}: A robust version of \code{yeo_johnson} after Raymaekers
+and Rousseeuw (2021). This method is less sensitive to outliers.
+\item \code{box_cox}: Transformation using the Box-Cox transformation (Box and Cox,
+1964). Unlike the Yeo-Johnson transformation, the Box-Cox transformation
+requires that all data are positive. Features that contain zero or negative
+values cannot be transformed using this transformation. The algorithm tests
+various lambda values and selects the lambda that maximises the
+log-likelihood.
+\item \code{box_cox_trim}: As \code{box_cox}, but based on the set of feature values
+where the 5\% lowest and 5\% highest values are discarded. This reduces the
+effect of outliers.
+\item \code{box_cox_winsor}: As \code{box_cox}, but based on the set of feature values
+where the 5\% lowest and 5\% highest values are winsorised. This reduces the
+effect of outliers.
+\item \code{box_cox_robust}: A robust version of \code{box_cox} after Raymaekers and
+Rousseeuw (2021). This method is less sensitive to outliers.
+}
+
+Only features that contain numerical data are transformed.
Transformation
+parameters obtained in development data are stored within \code{featureInfo}
+objects for later use with validation data sets.}
+ \item{\code{normalisation_method}}{(\emph{optional}) The normalisation method used to
+improve the comparability between numerical features that may have very
+different scales. The following normalisation methods can be chosen:
+\itemize{
+\item \code{none}: This disables feature normalisation.
+\item \code{standardisation}: Features are normalised by subtraction of their mean
+values and division by their standard deviations. This causes every feature
+to have a center value of 0.0 and standard deviation of 1.0.
+\item \code{standardisation_trim}: As \code{standardisation}, but based on the set of
+feature values where the 5\% lowest and 5\% highest values are discarded.
+This reduces the effect of outliers.
+\item \code{standardisation_winsor}: As \code{standardisation}, but based on the set of
+feature values where the 5\% lowest and 5\% highest values are winsorised.
+This reduces the effect of outliers.
+\item \code{standardisation_robust} (default): A robust version of \code{standardisation}
+that relies on computing Huber's M-estimators for location and scale.
+\item \code{normalisation}: Features are normalised by subtraction of their minimum
+values and division by their ranges. This maps all feature values to a
+\eqn{[0, 1]} interval.
+\item \code{normalisation_trim}: As \code{normalisation}, but based on the set of feature
+values where the 5\% lowest and 5\% highest values are discarded. This
+reduces the effect of outliers.
+\item \code{normalisation_winsor}: As \code{normalisation}, but based on the set of
+feature values where the 5\% lowest and 5\% highest values are winsorised.
+This reduces the effect of outliers.
+\item \code{quantile}: Features are normalised by subtraction of their median values
+and division by their interquartile range.
+\item \code{mean_centering}: Features are centered by subtracting the mean, but do
+not undergo rescaling.
+}
+
+Only features that contain numerical data are normalised. Normalisation
+parameters obtained in development data are stored within \code{featureInfo}
+objects for later use with validation data sets.}
+ \item{\code{batch_normalisation_method}}{(\emph{optional}) The method used for batch
+normalisation. Available methods are:
+\itemize{
+\item \code{none} (default): This disables batch normalisation of features.
+\item \code{standardisation}: Features within each batch are normalised by
+subtraction of the mean value and division by the standard deviation in
+each batch.
+\item \code{standardisation_trim}: As \code{standardisation}, but based on the set of
+feature values where the 5\% lowest and 5\% highest values are discarded.
+This reduces the effect of outliers.
+\item \code{standardisation_winsor}: As \code{standardisation}, but based on the set of
+feature values where the 5\% lowest and 5\% highest values are winsorised.
+This reduces the effect of outliers.
+\item \code{standardisation_robust}: A robust version of \code{standardisation} that
+relies on computing Huber's M-estimators for location and scale within each
+batch.
+\item \code{normalisation}: Features within each batch are normalised by subtraction
+of their minimum values and division by their range in each batch. This
+maps all feature values in each batch to a \eqn{[0, 1]} interval.
+\item \code{normalisation_trim}: As \code{normalisation}, but based on the set of feature
+values where the 5\% lowest and 5\% highest values are discarded. This
+reduces the effect of outliers.
+\item \code{normalisation_winsor}: As \code{normalisation}, but based on the set of
+feature values where the 5\% lowest and 5\% highest values are winsorised.
+This reduces the effect of outliers.
+\item \code{quantile}: Features in each batch are normalised by subtraction of the
+median value and division by the interquartile range of each batch.
+\item \code{mean_centering}: Features in each batch are centered on 0.0 by
+subtracting the mean value in each batch, but are not rescaled.
+\item \code{combat_parametric}: Batch adjustments using parametric empirical Bayes
+(Johnson et al., 2007). \code{combat_p} leads to the same method.
+\item \code{combat_non_parametric}: Batch adjustments using non-parametric empirical
+Bayes (Johnson et al., 2007). \code{combat_np} and \code{combat} lead to the same
+method. Note that we reduced complexity from O(\eqn{n^2}) to O(\eqn{n}) by
+only computing batch adjustment parameters for each feature on a subset of
+50 randomly selected features, instead of all features.
+}
+
+Only features that contain numerical data are normalised using batch
+normalisation. Batch normalisation parameters obtained in development data
+are stored within \code{featureInfo} objects for later use with validation data
+sets, in case the validation data is from the same batch.
+
+If validation data contains data from unknown batches, normalisation
+parameters are separately determined for these batches.
+
+Note that for both empirical Bayes methods, the batch effect is assumed to
+be present across all features. This is often true for things such as
+gene expressions, but the assumption may not hold generally.
+
+When performing batch normalisation, it is moreover important to check that
+differences between batches or cohorts are not related to the studied
+endpoint.}
+ \item{\code{imputation_method}}{(\emph{optional}) Method used for imputing missing
+feature values. Two methods are implemented:
+\itemize{
+\item \code{simple}: Simple replacement of a missing value by the median value (for
+numeric features) or the modal value (for categorical features).
+\item \code{lasso}: Imputation of missing values by lasso regression (using \code{glmnet})
+based on information contained in other features.
+}
+
+\code{simple} imputation precedes \code{lasso} imputation to ensure that any missing
+values in predictors required for \code{lasso} regression are resolved. The
+\code{lasso} estimate is then used to replace the missing value.
+
+The default value depends on the number of features in the dataset. If the
+number is lower than 100, \code{lasso} is used by default, and \code{simple}
+otherwise.
+
+Only single imputation is performed. Imputation models and parameters are
+stored within \code{featureInfo} objects for later use with validation data
+sets.}
+ \item{\code{cluster_method}}{(\emph{optional}) Clustering is performed to identify and
+replace redundant features, for example those that are highly correlated.
+Such features do not carry much additional information and may be removed
+or replaced instead (Park et al., 2007; Tolosi and Lengauer, 2011).
+
+The cluster method determines the algorithm used to form the clusters. The
+following cluster methods are implemented:
+\itemize{
+\item \code{none}: No clustering is performed.
+\item \code{hclust} (default): Hierarchical agglomerative clustering. If the
+\code{fastcluster} package is installed, \code{fastcluster::hclust} is used (Muellner,
+2013), otherwise \code{stats::hclust} is used.
+\item \code{agnes}: Hierarchical clustering using agglomerative nesting (Kaufman and
+Rousseeuw, 1990). This algorithm is similar to \code{hclust}, but uses the
+\code{cluster::agnes} implementation.
+\item \code{diana}: Divisive analysis hierarchical clustering. This method uses
+divisive instead of agglomerative clustering (Kaufman and Rousseeuw, 1990).
+\code{cluster::diana} is used.
+\item \code{pam}: Partitioning around medoids. This partitions the data into \eqn{k}
+clusters around medoids (Kaufman and Rousseeuw, 1990). \eqn{k} is selected
+using the \code{silhouette} metric. \code{pam} is implemented using the
+\code{cluster::pam} function.
+}
+
+Clusters and cluster information are stored within \code{featureInfo} objects for
+later use with validation data sets. This enables reproduction of the same
+clusters as formed in the development data set.}
+ \item{\code{cluster_linkage_method}}{(\emph{optional}) Linkage method used for
+agglomerative clustering in \code{hclust} and \code{agnes}. The following linkage
+methods can be used:
+\itemize{
+\item \code{average} (default): Average linkage.
+\item \code{single}: Single linkage.
+\item \code{complete}: Complete linkage.
+\item \code{weighted}: Weighted linkage, also known as McQuitty linkage.
+\item \code{ward}: Linkage using Ward's minimum variance method.
+}
+
+\code{diana} and \code{pam} do not require a linkage method.}
+ \item{\code{cluster_cut_method}}{(\emph{optional}) The method used to define the actual
+clusters. The following methods can be used:
+\itemize{
+\item \code{silhouette}: Clusters are formed based on the silhouette score
+(Rousseeuw, 1987). The average silhouette score is computed from 2 to
+\eqn{n} clusters, with \eqn{n} the number of features. Clusters are only
+formed if the average silhouette exceeds 0.50, which indicates reasonable
+evidence for structure. This procedure may be slow if the number of
+features is large (>100s).
+\item \code{fixed_cut}: Clusters are formed by cutting the hierarchical tree at the
+point indicated by the \code{cluster_similarity_threshold}, e.g. where features
+in a cluster have an average Spearman correlation of 0.90. \code{fixed_cut} is
+only available for \code{agnes}, \code{diana} and \code{hclust}.
+\item \code{dynamic_cut}: Dynamic cluster formation using the cutting algorithm in
+the \code{dynamicTreeCut} package. This package should be installed to select
+this option. \code{dynamic_cut} can only be used with \code{agnes} and \code{hclust}.
+}
+
+The default options are \code{silhouette} for partitioning around medoids (\code{pam})
+and \code{fixed_cut} otherwise.}
+ \item{\code{cluster_similarity_metric}}{(\emph{optional}) Clusters are formed based on
+feature similarity. All features are compared in a pair-wise fashion to
+compute similarity, for example correlation. The resulting similarity grid
+is converted into a distance matrix that is subsequently used for
+clustering. The following metrics are supported to compute pairwise
+similarities:
+\itemize{
+\item \code{mutual_information} (default): normalised mutual information.
+\item \code{mcfadden_r2}: McFadden's pseudo R-squared (McFadden, 1974).
+\item \code{cox_snell_r2}: Cox and Snell's pseudo R-squared (Cox and Snell, 1989).
+\item \code{nagelkerke_r2}: Nagelkerke's pseudo R-squared (Nagelkerke, 1991).
+\item \code{spearman}: Spearman's rank order correlation. +\item \code{kendall}: Kendall rank correlation. +\item \code{pearson}: Pearson product-moment correlation. +} + +The pseudo R-squared metrics can be used to assess similarity between mixed +pairs of numeric and categorical features, as these are based on the +log-likelihood of regression models. In \code{familiar}, the more informative +feature is used as the predictor and the other feature as the response +variable. In numeric-categorical pairs, the numeric feature is considered +to be more informative and is thus used as the predictor. In +categorical-categorical pairs, the feature with the most levels is used as the +predictor. + +In case any of the classical correlation coefficients (\code{pearson}, +\code{spearman} and \code{kendall}) are used with (mixed) categorical features, the +categorical features are one-hot encoded and the mean correlation over all +resulting pairs is used as similarity.} + \item{\code{cluster_similarity_threshold}}{(\emph{optional}) The threshold level for +pair-wise similarity that is required to form clusters using \code{fixed_cut}. +This should be a numerical value between 0.0 and 1.0. Note, however, that a +reasonable threshold value depends strongly on the similarity metric. The +following are the default values used: +\itemize{ +\item \code{mcfadden_r2} and \code{mutual_information}: \code{0.30} +\item \code{cox_snell_r2} and \code{nagelkerke_r2}: \code{0.75} +\item \code{spearman}, \code{kendall} and \code{pearson}: \code{0.90} +} + +Alternatively, if the \code{fixed_cut} method is not used, this value determines +whether any clustering should be performed, because the data may not +contain highly similar features. The default values in this situation are: +\itemize{ +\item \code{mcfadden_r2} and \code{mutual_information}: \code{0.25} +\item \code{cox_snell_r2} and \code{nagelkerke_r2}: \code{0.40} +\item \code{spearman}, \code{kendall} and \code{pearson}: \code{0.70} +} + +The threshold value is converted to a distance (1-similarity) prior to +cutting hierarchical trees.} + \item{\code{cluster_representation_method}}{(\emph{optional}) Method used to determine +how the information of co-clustered features is summarised and used to +represent the cluster. The following methods can be selected: +\itemize{ +\item \code{best_predictor} (default): The feature with the highest importance +according to univariate regression with the outcome is used to represent +the cluster. +\item \code{medioid}: The feature closest to the cluster center, i.e. the feature +that is most similar to the remaining features in the cluster, is used to +represent the cluster. +\item \code{mean}: A meta-feature is generated by averaging the feature values for +all features in a cluster. This method aligns all features so that all +features will be positively correlated prior to averaging. Should a cluster +contain one or more categorical features, the \code{medioid} method will be used +instead, as averaging is not possible. Note that if this method is chosen, +the \code{normalisation_method} parameter should be one of \code{standardisation}, +\code{standardisation_trim}, \code{standardisation_winsor} or \code{quantile}. +} + +If the \code{pam} cluster method is selected, only the \code{medioid} method can be +used. In that case 1 medioid is used by default.} + \item{\code{parallel_preprocessing}}{(\emph{optional}) Enable parallel processing for the +preprocessing workflow. Defaults to \code{TRUE}.
When set to \code{FALSE}, this will +disable the use of parallel processing while preprocessing, regardless of +the settings of the \code{parallel} parameter. \code{parallel_preprocessing} is +ignored if \code{parallel=FALSE}.} + \item{\code{fs_method}}{(\strong{required}) Feature selection method to be used for +determining variable importance. \code{familiar} implements various feature +selection methods. Please refer to the vignette on feature selection +methods for more details. + +More than one feature selection method can be chosen. The experiment will +then be repeated for each feature selection method. + +Feature selection methods determine the ranking of features. Actual +selection of features is done by optimising the signature size model +hyperparameter during the hyperparameter optimisation step.} + \item{\code{fs_method_parameter}}{(\emph{optional}) List of lists containing parameters +for feature selection methods. Each sublist should have the name of the +feature selection method it corresponds to. + +Most feature selection methods do not have parameters that can be set. +Please refer to the vignette on feature selection methods for more details. +Note that if the feature selection method is based on a learner (e.g. lasso +regression), hyperparameter optimisation may be performed prior to +assessing variable importance.} + \item{\code{vimp_aggregation_method}}{(\emph{optional}) The method used to aggregate +variable importances over different data subsets, e.g. bootstraps. The +following methods can be selected: +\itemize{ +\item \code{none}: Don't aggregate ranks, but rather aggregate the variable +importance scores themselves. +\item \code{mean}: Use the mean rank of a feature over the subsets to +determine the aggregated feature rank. +\item \code{median}: Use the median rank of a feature over the subsets to determine +the aggregated feature rank. +\item \code{best}: Use the best rank the feature obtained in any subset to determine +the aggregated feature rank. +\item \code{worst}: Use the worst rank the feature obtained in any subset to +determine the aggregated feature rank. +\item \code{stability}: Use the frequency of the feature being in the subset of +highly ranked features as a measure for the aggregated feature rank +(Meinshausen and Buehlmann, 2010). +\item \code{exponential}: Use a rank-weighted frequency of occurrence in the subset +of highly ranked features as a measure for the aggregated feature rank (Haury +et al., 2011). +\item \code{borda} (default): Use the borda count as a measure for the aggregated +feature rank (Wald et al., 2012). +\item \code{enhanced_borda}: Use an occurrence frequency-weighted borda count as a +measure for the aggregated feature rank (Wald et al., 2012). +\item \code{truncated_borda}: Use the borda count computed only on features within the +subset of highly ranked features. +\item \code{enhanced_truncated_borda}: Apply both the enhanced borda method and the +truncated borda method and use the resulting borda count as the aggregated +feature rank. +} + +The \emph{feature selection methods} vignette provides additional information.} + \item{\code{vimp_aggregation_rank_threshold}}{(\emph{optional}) The threshold used to +define the subset of highly important features. If not set, this threshold +is determined by maximising the variance in the occurrence value over all +features over the subset size.
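As an illustrative aside (not part of the generated documentation): a minimal, hedged sketch of how the feature selection and rank aggregation settings described above might be passed to `summon_familiar`. The data object `my_data`, the column names, the `mrmr` method and the threshold value of 5 are assumptions rather than documented defaults.

```r
# Hypothetical sketch only: rank aggregation settings for feature selection.
# `my_data`, the column names, "mrmr" and the threshold of 5 are assumptions.
library(familiar)

summon_familiar(
  data = my_data,                        # assumed wide-format data set
  sample_id_column = "id",
  outcome_column = "outcome",
  outcome_type = "binomial",
  experimental_design = "bs(fs+mb,20)",  # 20 bootstraps of feature selection and model building
  fs_method = "mrmr",                    # assumed available; see the feature selection vignette
  vimp_aggregation_method = "stability",
  vimp_aggregation_rank_threshold = 5,   # treat the 5 top-ranked features as highly ranked
  learner = "glm_logistic",
  parallel = FALSE
)
```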
+ +This parameter is only relevant for \code{stability}, \code{exponential}, +\code{enhanced_borda}, \code{truncated_borda} and \code{enhanced_truncated_borda} methods.} + \item{\code{parallel_feature_selection}}{(\emph{optional}) Enable parallel processing for +the feature selection workflow. Defaults to \code{TRUE}. When set to \code{FALSE}, +this will disable the use of parallel processing while performing feature +selection, regardless of the settings of the \code{parallel} parameter. +\code{parallel_feature_selection} is ignored if \code{parallel=FALSE}.} + \item{\code{learner}}{(\strong{required}) One or more algorithms used for model +development. A sizeable number of learners is supported in \code{familiar}. Please +see the vignette on learners for more information concerning the available +learners.} + \item{\code{hyperparameter}}{(\emph{optional}) List of lists containing hyperparameters +for learners. Each sublist should have the name of the learner method it +corresponds to, with list elements being named after the intended +hyperparameter, e.g. \code{"glm_logistic"=list("sign_size"=3)} + +All learners have hyperparameters. Please refer to the vignette on learners +for more details. If no parameters are provided, sequential model-based +optimisation is used to determine optimal hyperparameters. + +Hyperparameters provided by the user are never optimised. However, if more +than one value is provided for a single hyperparameter, optimisation will +be conducted using these values.} + \item{\code{novelty_detector}}{(\emph{optional}) Specify the algorithm used for training +a novelty detector. This detector can be used to identify +out-of-distribution data prospectively.} + \item{\code{detector_parameters}}{(\emph{optional}) List of lists containing hyperparameters +for novelty detectors. Currently not used.} + \item{\code{parallel_model_development}}{(\emph{optional}) Enable parallel processing for +the model development workflow. Defaults to \code{TRUE}. When set to \code{FALSE}, +this will disable the use of parallel processing while developing models, +regardless of the settings of the \code{parallel} parameter. +\code{parallel_model_development} is ignored if \code{parallel=FALSE}.} + \item{\code{optimisation_bootstraps}}{(\emph{optional}) Number of bootstraps that should +be generated from the development data set. During the optimisation +procedure one or more of these bootstraps (indicated by +\code{smbo_step_bootstraps}) are used for model development using different +combinations of hyperparameters. The effect of the hyperparameters is then +assessed by comparing in-bag and out-of-bag model performance. + +The default number of bootstraps is \code{50}. Hyperparameter optimisation may +finish before exhausting the set of bootstraps.} + \item{\code{optimisation_determine_vimp}}{(\emph{optional}) Logical value that indicates +whether variable importance is determined separately for each of the +bootstraps created during the optimisation process (\code{TRUE}) or the +applicable results from the feature selection step are used (\code{FALSE}). + +Determining variable importance increases the initial computational +overhead. However, it prevents positive biases for the out-of-bag data due +to overlap of these data with the development data set used for the feature +selection step. In this case, any hyperparameters of the variable +importance method are not determined separately for each bootstrap, but +those obtained during the feature selection step are used instead.
In case +multiple such hyperparameter sets could be applicable, the set that will +be used is randomly selected for each bootstrap. + +This parameter only affects hyperparameter optimisation of learners. The +default is \code{TRUE}.} + \item{\code{smbo_random_initialisation}}{(\emph{optional}) String indicating the +initialisation method for the hyperparameter space. Can be one of +\code{fixed_subsample} (default), \code{fixed}, or \code{random}. \code{fixed} and +\code{fixed_subsample} first create hyperparameter sets from a range of default +values set by familiar. \code{fixed_subsample} then randomly draws up to +\code{smbo_n_random_sets} hyperparameter sets from the grid. \code{random} does not rely upon a fixed +grid, and randomly draws up to \code{smbo_n_random_sets} hyperparameter sets +from the hyperparameter space.} + \item{\code{smbo_n_random_sets}}{(\emph{optional}) Number of random or subsampled +hyperparameter sets drawn during the initialisation process. Default: \code{100}. +Cannot be smaller than \code{10}. The parameter is not used when +\code{smbo_random_initialisation} is \code{fixed}, as the entire pre-defined grid +will be explored.} + \item{\code{max_smbo_iterations}}{(\emph{optional}) Maximum number of intensify +iterations of the SMBO algorithm. During an intensify iteration a run-off +occurs between the current \emph{best} hyperparameter combination and either the 10 +challenger combinations with the highest expected improvement or a set of 20 +random combinations. + +Run-off with random combinations is used to force exploration of the +hyperparameter space, and is performed every second intensify iteration, or +if there is no expected improvement for any challenger combination. + +If a combination of hyperparameters leads to better performance on the same +data than the incumbent \emph{best} set of hyperparameters, it replaces the +incumbent set at the end of the intensify iteration. + +The default number of intensify iterations is \code{20}. Iterations may be +stopped early if the incumbent set of hyperparameters remains the same for +\code{smbo_stop_convergent_iterations} iterations, or performance improvement is +minimal. This behaviour is suppressed during the first 4 iterations to +enable the algorithm to explore the hyperparameter space.} + \item{\code{smbo_stop_convergent_iterations}}{(\emph{optional}) The number of subsequent +convergent SMBO iterations required to stop hyperparameter optimisation +early. An iteration is convergent if the \emph{best} parameter set has not +changed or the optimisation score over the 4 most recent iterations has not +changed beyond the tolerance level in \code{smbo_stop_tolerance}. + +The default value is \code{3}.} + \item{\code{smbo_stop_tolerance}}{(\emph{optional}) Tolerance for early stopping due to +convergent optimisation score. + +The default value depends on the square root of the number of samples (at +the series level), and is \code{0.01} for 100 samples. This value is computed as +\code{0.1 * 1 / sqrt(n_samples)}. The upper limit is \code{0.0001} for 1M or more +samples.} + \item{\code{smbo_time_limit}}{(\emph{optional}) Time limit (in minutes) for the +optimisation process. Optimisation is stopped after this limit is exceeded. +Time taken to determine variable importance for the optimisation process +(see the \code{optimisation_determine_vimp} parameter) does not count. + +The default is \code{NULL}, indicating that there is no time limit for the +optimisation process.
The time limit cannot be less than 1 minute.} + \item{\code{smbo_initial_bootstraps}}{(\emph{optional}) The number of bootstraps taken +from the set of \code{optimisation_bootstraps} as the bootstraps assessed +initially. + +The default value is \code{1}. The value cannot be larger than +\code{optimisation_bootstraps}.} + \item{\code{smbo_step_bootstraps}}{(\emph{optional}) The number of bootstraps taken from +the set of \code{optimisation_bootstraps} bootstraps as the bootstraps assessed +during the steps of each intensify iteration. + +The default value is \code{3}. The value cannot be larger than +\code{optimisation_bootstraps}.} + \item{\code{smbo_intensify_steps}}{(\emph{optional}) The number of steps in each SMBO +intensify iteration. At each step, a new set of \code{smbo_step_bootstraps} +bootstraps is drawn and used in the run-off between the incumbent \emph{best} +hyperparameter combination and its challengers. + +The default value is \code{5}. Higher numbers allow for a more detailed +comparison, but this comes with added computational cost.} + \item{\code{optimisation_metric}}{(\emph{optional}) One or more metrics used to compute +performance scores. See the vignette on performance metrics for the +available metrics. + +If unset, the following metrics are used by default: +\itemize{ +\item \code{auc_roc}: For \code{binomial} and \code{multinomial} models. +\item \code{mse}: Mean squared error for \code{continuous} models. +\item \code{msle}: Mean squared logarithmic error for \code{count} models. +\item \code{concordance_index}: For \code{survival} models. +} + +Multiple optimisation metrics can be specified. Actual metric values are +converted to an objective value by comparison with a baseline metric value +that derives from a trivial model, i.e. the majority class for binomial and +multinomial outcomes, the median outcome for count and continuous outcomes +and a fixed risk or time for survival outcomes.} + \item{\code{optimisation_function}}{(\emph{optional}) Type of optimisation function used +to quantify the performance of a hyperparameter set. Model performance is +assessed using the metric(s) specified by \code{optimisation_metric} on the +in-bag (IB) and out-of-bag (OOB) samples of a bootstrap. These values are +converted to objective scores with a standardised interval of \eqn{[-1.0, + 1.0]}. Each pair of objective scores is subsequently used to compute an +optimisation score. The optimisation score across different bootstraps is +then aggregated into a summary score. This summary score is used to rank +hyperparameter sets, and select the optimal set. + +The combination of optimisation score and summary score is determined by +the optimisation function indicated by this parameter: +\itemize{ +\item \code{validation} or \code{max_validation} (default): seeks to maximise OOB score. +\item \code{balanced}: seeks to balance IB and OOB score. +\item \code{stronger_balance}: similar to \code{balanced}, but with stronger penalty for +differences between IB and OOB scores. +\item \code{validation_minus_sd}: seeks to optimise the average OOB score minus its +standard deviation. +\item \code{validation_25th_percentile}: seeks to optimise the 25th percentile of +OOB scores, and is conceptually similar to \code{validation_minus_sd}. +\item \code{model_estimate}: seeks to maximise the OOB score estimate predicted by +the hyperparameter learner (not available for random search).
+\item \code{model_estimate_minus_sd}: seeks to maximise the OOB score estimate minus +its estimated standard deviation, as predicted by the hyperparameter +learner (not available for random search). +\item \code{model_balanced_estimate}: seeks to maximise the estimate of the balanced +IB and OOB score. This is similar to the \code{balanced} score, and in fact uses +a hyperparameter learner to predict said score (not available for random +search). +\item \code{model_balanced_estimate_minus_sd}: seeks to maximise the estimate of the +balanced IB and OOB score, minus its estimated standard deviation. This is +similar to the \code{balanced} score, but takes into account its estimated +spread. +} + +Additional details are provided in the \emph{Learning algorithms and +hyperparameter optimisation} vignette.} + \item{\code{hyperparameter_learner}}{(\emph{optional}) Any point in the hyperparameter +space has a single, scalar, optimisation score value that is \emph{a priori} +unknown. During the optimisation process, the algorithm samples from the +hyperparameter space by selecting hyperparameter sets and computing the +optimisation score value for one or more bootstraps. For each +hyperparameter set the resulting values are distributed around the actual +value. The learner indicated by \code{hyperparameter_learner} is then used to +infer optimisation score estimates for unsampled parts of the +hyperparameter space. + +The following models are available: +\itemize{ +\item \code{bayesian_additive_regression_trees} or \code{bart}: Uses Bayesian Additive +Regression Trees (Sparapani et al., 2021) for inference. Unlike standard +random forests, BART allows for estimating posterior distributions directly +and can extrapolate. +\item \code{gaussian_process} (default): Creates a localised approximate Gaussian +process for inference (Gramacy, 2016). This allows for better scaling than +deterministic Gaussian Processes. +\item \code{random_forest}: Creates a random forest for inference. Originally +suggested by Hutter et al. (2011). A weakness of random forests is their +lack of extrapolation beyond observed values, which limits their usefulness +in exploiting promising areas of hyperparameter space. +\item \code{random} or \code{random_search}: Forgoes the use of models to steer +optimisation. Instead, a random search is performed. +}} + \item{\code{acquisition_function}}{(\emph{optional}) The acquisition function influences +how new hyperparameter sets are selected. The algorithm uses the model +learned by the learner indicated by \code{hyperparameter_learner} to search the +hyperparameter space for hyperparameter sets that are either likely better +than the best known set (\emph{exploitation}) or where there is considerable +uncertainty (\emph{exploration}). The acquisition function quantifies this +(Shahriari et al., 2016). + +The following acquisition functions are available, and are described in +more detail in the \emph{learner algorithms} vignette: +\itemize{ +\item \code{improvement_probability}: The probability of improvement quantifies the +probability that the expected optimisation score for a set is better than +the best observed optimisation score. +\item \code{improvement_empirical_probability}: Similar to +\code{improvement_probability}, but based directly on optimisation scores +predicted by the individual decision trees. +\item \code{expected_improvement} (default): Computes expected improvement.
+\item \code{upper_confidence_bound}: This acquisition function is based on the upper +confidence bound of the distribution (Srinivas et al., 2012). +\item \code{bayes_upper_confidence_bound}: This acquisition function is based on the +upper confidence bound of the distribution (Kaufmann et al., 2012). +}} + \item{\code{exploration_method}}{(\emph{optional}) Method used to steer exploration in +post-initialisation intensive searching steps. As stated earlier, each SMBO +iteration step compares suggested alternative parameter sets with an +incumbent \strong{best} set in a series of steps. The exploration method +controls how the set of alternative parameter sets is pruned after each +step in an iteration. Can be one of the following: +\itemize{ +\item \code{single_shot} (default): The set of alternative parameter sets is not +pruned, and each intensification iteration contains only a single +intensification step that only uses a single bootstrap. This is the fastest +exploration method, but only superficially tests each parameter set. +\item \code{successive_halving}: The set of alternative parameter sets is +pruned by removing the worst performing half of the sets after each step +(Jamieson and Talwalkar, 2016). +\item \code{stochastic_reject}: The set of alternative parameter sets is pruned by +comparing the performance of each parameter set with that of the incumbent +\strong{best} parameter set using a paired Wilcoxon test based on shared +bootstraps. Parameter sets that perform significantly worse, at an alpha +level indicated by \code{smbo_stochastic_reject_p_value}, are pruned. +\item \code{none}: The set of alternative parameter sets is not pruned. +}} + \item{\code{smbo_stochastic_reject_p_value}}{(\emph{optional}) The p-value threshold used +for the \code{stochastic_reject} exploration method. + +The default value is \code{0.05}.} + \item{\code{parallel_hyperparameter_optimisation}}{(\emph{optional}) Enable parallel +processing for hyperparameter optimisation. Defaults to \code{TRUE}. When set to +\code{FALSE}, this will disable the use of parallel processing while performing +optimisation, regardless of the settings of the \code{parallel} parameter. The +parameter moreover specifies whether parallelisation takes place within the +optimisation algorithm (\code{inner}, default), or in an outer loop ( \code{outer}) +over learners, data subsamples, etc. + +\code{parallel_hyperparameter_optimisation} is ignored if \code{parallel=FALSE}.} + \item{\code{evaluate_top_level_only}}{(\emph{optional}) Flag that signals that only +evaluation at the most global experiment level is required. Consider a +cross-validation experiment with additional external validation. The global +experiment level consists of data that are used for development, internal +validation and external validation. The next lower experiment level are the +individual cross-validation iterations. + +When the flag is \code{true}, evaluations take place on the global level only, +and no results are generated for the next lower experiment levels. In our +example, this means that results from individual cross-validation iterations +are not computed and shown. When the flag is \code{false}, results are computed +from both the global layer and the next lower level. + +Setting the flag to \code{true} saves computation time.} + \item{\code{skip_evaluation_elements}}{(\emph{optional}) Specifies which evaluation steps, +if any, should be skipped as part of the evaluation process. 
Defaults to +\code{none}, which means that all relevant evaluation steps are performed. It can +have one or more of the following values: +\itemize{ +\item \code{none}, \code{false}: no steps are skipped. +\item \code{all}, \code{true}: all steps are skipped. +\item \code{auc_data}: data for assessing and plotting the area under the receiver +operating characteristic curve are not computed. +\item \code{calibration_data}: data for assessing and plotting model calibration are +not computed. +\item \code{calibration_info}: data required to assess calibration, such as baseline +survival curves, are not collected. These data will still be present in the +models. +\item \code{confusion_matrix}: data for assessing and plotting a confusion matrix are +not collected. +\item \code{decision_curve_analyis}: data for performing a decision curve analysis +are not computed. +\item \code{feature_expressions}: data for assessing and plotting sample clustering +are not computed. +\item \code{feature_similarity}: data for assessing and plotting feature clusters are +not computed. +\item \code{fs_vimp}: data for assessing and plotting feature selection-based +variable importance are not collected. +\item \code{hyperparameters}: data for assessing model hyperparameters are not +collected. These data will still be present in the models. +\item \code{ice_data}: data for individual conditional expectation and partial +dependence plots are not created. +\item \code{model_performance}: data for assessing and visualising model performance +are not created. +\item \code{model_vimp}: data for assessing and plotting model-based variable +importance are not collected. +\item \code{permutation_vimp}: data for assessing and plotting model-agnostic +permutation variable importance are not computed. +\item \code{prediction_data}: predictions for each sample are not made and exported. +\item \code{risk_stratification_data}: data for assessing and plotting Kaplan-Meier +survival curves are not collected. +\item \code{risk_stratification_info}: data for assessing stratification into risk +groups are not computed. +\item \code{univariate_analysis}: data for assessing and plotting univariate feature +importance are not computed. +}} + \item{\code{ensemble_method}}{(\emph{optional}) Method for ensembling predictions from +models for the same sample. Available methods are: +\itemize{ +\item \code{median} (default): Use the median of the predicted values as the ensemble +value for a sample. +\item \code{mean}: Use the mean of the predicted values as the ensemble value for a +sample. +} + +This parameter is only used if \code{detail_level} is \code{ensemble}.} + \item{\code{evaluation_metric}}{(\emph{optional}) One or more metrics for assessing model +performance. See the vignette on performance metrics for the available +metrics. + +Confidence intervals (or rather credibility intervals) are computed for each +metric during evaluation. This is done using bootstraps, the number of which +depends on the value of \code{confidence_level} (Davison and Hinkley, 1997). + +If unset, the metric in the \code{optimisation_metric} variable is used.} + \item{\code{sample_limit}}{(\emph{optional}) Set the upper limit of the number of samples +that are used during evaluation steps. Cannot be less than 20. + +This setting can be specified per data element by providing a parameter +value in a named list with data elements, e.g. +\code{list("sample_similarity"=100, "permutation_vimp"=1000)}. 
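As a hedged illustration (not part of the generated documentation), the named-list convention used by `sample_limit` could be written as follows; the values shown are illustrative, not defaults:

```r
# Hypothetical sketch only; values are illustrative, not defaults.
# The documentation below lists "sample_similarity" and "ice_data" as the
# elements to which a sample limit applies.
sample_limit <- list("sample_similarity" = 100, "ice_data" = 50)

# The same named-list convention is used for detail_level and estimation_type, e.g.:
detail_level <- list("auc_data" = "ensemble", "model_performance" = "hybrid")

# These would then be passed on, e.g.:
# summon_familiar(..., sample_limit = sample_limit, detail_level = detail_level)
```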
+ +This parameter can be set for the following data elements: +\code{sample_similarity} and \code{ice_data}.} + \item{\code{detail_level}}{(\emph{optional}) Sets the level at which results are computed +and aggregated. +\itemize{ +\item \code{ensemble}: Results are computed at the ensemble level, i.e. over all +models in the ensemble. This means that, for example, bias-corrected +estimates of model performance are assessed by creating (at least) 20 +bootstraps and computing the model performance of the ensemble model for +each bootstrap. +\item \code{hybrid} (default): Results are computed at the level of models in an +ensemble. This means that, for example, bias-corrected estimates of model +performance are directly computed using the models in the ensemble. If there +are at least 20 trained models in the ensemble, performance is computed for +each model, in contrast to \code{ensemble} where performance is computed for the +ensemble of models. If there are fewer than 20 trained models in the +ensemble, bootstraps are created so that at least 20 point estimates can be +made. +\item \code{model}: Results are computed at the model level. This means that, for +example, bias-corrected estimates of model performance are assessed by +creating (at least) 20 bootstraps and computing the performance of the model +for each bootstrap. +} + +Note that each level of detail has a different interpretation for bootstrap +confidence intervals. For \code{ensemble} and \code{model} these are the confidence +intervals for the ensemble and an individual model, respectively. That is, +the confidence interval describes the range where an estimate produced by a +respective ensemble or model trained on a repeat of the experiment may be +found with the probability of the confidence level. For \code{hybrid}, it +represents the range where any single model trained on a repeat of the +experiment may be found with the probability of the confidence level. By +definition, confidence intervals obtained using \code{hybrid} are at least as +wide as those for \code{ensemble}. \code{hybrid} offers the correct interpretation if +the goal of the analysis is to assess the result of a single, unspecified, +model. + +\code{hybrid} is generally computationally less expensive than \code{ensemble}, which +in turn is somewhat less expensive than \code{model}. + +A non-default \code{detail_level} parameter can be specified for separate +evaluation steps by providing a parameter value in a named list with data +elements, e.g. \code{list("auc_data"="ensemble", "model_performance"="hybrid")}. +This parameter can be set for the following data elements: \code{auc_data}, +\code{decision_curve_analyis}, \code{model_performance}, \code{permutation_vimp}, +\code{ice_data}, \code{prediction_data} and \code{confusion_matrix}.} + \item{\code{estimation_type}}{(\emph{optional}) Sets the type of estimation that should be +possible. This has the following options: +\itemize{ +\item \code{point}: Point estimates. +\item \code{bias_correction} or \code{bc}: Bias-corrected estimates. A bias-corrected +estimate is computed from (at least) 20 point estimates, and \code{familiar} may +bootstrap the data to create them. +\item \code{bootstrap_confidence_interval} or \code{bci} (default): Bias-corrected +estimates with bootstrap confidence intervals (Efron and Hastie, 2016). The +number of point estimates required depends on the \code{confidence_level} +parameter, and \code{familiar} may bootstrap the data to create them.
+} + +As with \code{detail_level}, a non-default \code{estimation_type} parameter can be +specified for separate evaluation steps by providing a parameter value in a +named list with data elements, e.g. \code{list("auc_data"="bci", "model_performance"="point")}. This parameter can be set for the following +data elements: \code{auc_data}, \code{decision_curve_analyis}, \code{model_performance}, +\code{permutation_vimp}, \code{ice_data}, and \code{prediction_data}.} + \item{\code{aggregate_results}}{(\emph{optional}) Flag that signifies whether results +should be aggregated during evaluation. If \code{estimation_type} is +\code{bias_correction} or \code{bc}, aggregation leads to a single bias-corrected +estimate. If \code{estimation_type} is \code{bootstrap_confidence_interval} or \code{bci}, +aggregation leads to a single bias-corrected estimate with lower and upper +boundaries of the confidence interval. This has no effect if +\code{estimation_type} is \code{point}. + +The default value is \code{TRUE}, except when assessing model performance +metrics, as the default violin plot requires the underlying data. + +As with \code{detail_level} and \code{estimation_type}, a non-default +\code{aggregate_results} parameter can be specified for separate evaluation steps +by providing a parameter value in a named list with data elements, e.g. +\code{list("auc_data"=TRUE, "model_performance"=FALSE)}. This parameter exists +for the same elements as \code{estimation_type}.} + \item{\code{confidence_level}}{(\emph{optional}) Numeric value for the level at which +confidence intervals are determined. In case bootstraps are used to +determine the confidence intervals, \code{familiar} uses the +rule of thumb \eqn{n = 20 / ci.level} to determine the number of required +bootstraps. + +The default value is \code{0.95}.} + \item{\code{bootstrap_ci_method}}{(\emph{optional}) Method used to determine bootstrap +confidence intervals (Efron and Hastie, 2016). The following methods are +implemented: +\itemize{ +\item \code{percentile} (default): Confidence intervals obtained using the percentile +method. +\item \code{bc}: Bias-corrected confidence intervals. +} + +Note that the standard method is not implemented because this method is +often not suitable due to non-normal distributions. The bias-corrected and +accelerated (BCa) method is not implemented yet.} + \item{\code{feature_cluster_method}}{(\emph{optional}) Method used to perform clustering +of features. The same methods as for the \code{cluster_method} configuration +parameter are available: \code{none}, \code{hclust}, \code{agnes}, \code{diana} and \code{pam}. + +The value for the \code{cluster_method} configuration parameter is used by +default. When generating clusters for the purpose of determining mutual +correlation and ordering feature expressions, \code{none} is ignored and \code{hclust} +is used instead.} + \item{\code{feature_linkage_method}}{(\emph{optional}) Method used for agglomerative +clustering with \code{hclust} and \code{agnes}. Linkage determines how features are +sequentially combined into clusters based on distance. The methods are +shared with the \code{cluster_linkage_method} configuration parameter: \code{average}, +\code{single}, \code{complete}, \code{weighted}, and \code{ward}. + +The value for the \code{cluster_linkage_method} configuration parameter is used +by default.} + \item{\code{feature_cluster_cut_method}}{(\emph{optional}) Method used to divide features +into separate clusters.
The available methods are the same as for the +\code{cluster_cut_method} configuration parameter: \code{silhouette}, \code{fixed_cut} and +\code{dynamic_cut}. + +\code{silhouette} is available for all cluster methods, but \code{fixed_cut} only +applies to methods that create hierarchical trees (\code{hclust}, \code{agnes} and +\code{diana}). \code{dynamic_cut} requires the \code{dynamicTreeCut} package and can only +be used with \code{agnes} and \code{hclust}. + +The value for the \code{cluster_cut_method} configuration parameter is used by +default.} + \item{\code{feature_similarity_metric}}{(\emph{optional}) Metric to determine pairwise +similarity between features. Similarity is computed in the same manner as +for clustering, and \code{feature_similarity_metric} therefore has the same +options as \code{cluster_similarity_metric}: \code{mcfadden_r2}, \code{cox_snell_r2}, +\code{nagelkerke_r2}, \code{mutual_information}, \code{spearman}, \code{kendall} and \code{pearson}. + +The value for the \code{cluster_similarity_metric} configuration parameter +is used by default.} + \item{\code{feature_similarity_threshold}}{(\emph{optional}) The threshold level for +pair-wise similarity that is required to form feature clusters with the +\code{fixed_cut} method. This threshold functions in the same manner as the one +defined using the \code{cluster_similarity_threshold} parameter. + +By default, the value for the \code{cluster_similarity_threshold} configuration +parameter is used. + +Unlike for \code{cluster_similarity_threshold}, more than one value can be +supplied here.} + \item{\code{sample_cluster_method}}{(\emph{optional}) The method used to perform +clustering based on distance between samples. These are the same methods as +for the \code{cluster_method} configuration parameter: \code{hclust}, \code{agnes}, \code{diana} +and \code{pam}. + +The value for the \code{cluster_method} configuration parameter is used by +default. When generating clusters for the purpose of ordering samples in +feature expressions, \code{none} is ignored and \code{hclust} is used instead.} + \item{\code{sample_linkage_method}}{(\emph{optional}) The method used for agglomerative +clustering in \code{hclust} and \code{agnes}. These are the same methods as for the +\code{cluster_linkage_method} configuration parameter: \code{average}, \code{single}, +\code{complete}, \code{weighted}, and \code{ward}. + +The value for the \code{cluster_linkage_method} configuration parameter is used +by default.} + \item{\code{sample_similarity_metric}}{(\emph{optional}) Metric to determine pairwise +similarity between samples. Similarity is computed in the same manner as for +clustering, but \code{sample_similarity_metric} has different options that are +better suited to computing distance between samples instead of between +features. The following metrics are available. +\itemize{ +\item \code{gower} (default): compute Gower's distance between samples. By default, +Gower's distance is computed based on winsorised data to reduce the effect +of outliers (see below). +\item \code{euclidean}: compute the Euclidean distance between samples. +} + +The underlying feature data for numerical features is scaled to the +\eqn{[0,1]} range using the feature values across the samples. The +normalisation parameters required can optionally be computed from feature +data with the outer 5\% (on both sides) of feature values trimmed or +winsorised. To do so append \verb{_trim} (trimming) or \verb{_winsor} (winsorising) to +the metric name.
This reduces the effect of outliers somewhat. + +Regardless of metric, all categorical features are handled as for the +Gower's distance: distance is 0 if the values in a pair of samples match, +and 1 if they do not.} + \item{\code{eval_aggregation_method}}{(\emph{optional}) Method for aggregating variable +importances for the purpose of evaluation. Variable importances are +determined during feature selection steps and after training the model. Both +types are evaluated, but feature selection variable importance is only +evaluated at run-time. + +See the documentation for the \code{vimp_aggregation_method} argument for +information concerning the different methods available.} + \item{\code{eval_aggregation_rank_threshold}}{(\emph{optional}) The threshold used to +define the subset of highly important features during evaluation. + +See the documentation for the \code{vimp_aggregation_rank_threshold} argument for +more information.} + \item{\code{eval_icc_type}}{(\emph{optional}) String indicating the type of intraclass +correlation coefficient (\code{1}, \code{2} or \code{3}) that should be used to compute +robustness for features in repeated measurements during the evaluation of +univariate importance. These types correspond to the types in Shrout and +Fleiss (1979). The default value is \code{1}.} + \item{\code{stratification_method}}{(\emph{optional}) Method for determining the +stratification threshold for creating survival groups. The actual, +model-dependent, threshold value is obtained from the development data, and +can afterwards be used to perform stratification on validation data. + +The following stratification methods are available: +\itemize{ +\item \code{median} (default): The median predicted value in the development cohort +is used to stratify the samples into two risk groups. For predicted outcome +values that build a continuous spectrum, the two risk groups in the +development cohort will be roughly equal in size. +\item \code{mean}: The mean predicted value in the development cohort is used to +stratify the samples into two risk groups. +\item \code{mean_trim}: As \code{mean}, but based on the set of predicted values +where the 5\% lowest and 5\% highest values are discarded. This reduces the +effect of outliers. +\item \code{mean_winsor}: As \code{mean}, but based on the set of predicted values where +the 5\% lowest and 5\% highest values are winsorised. This reduces the effect +of outliers. +\item \code{fixed}: Samples are stratified based on the sample quantiles of the +predicted values. These quantiles are defined using the +\code{stratification_threshold} parameter. +\item \code{optimised}: Use maximally selected rank statistics to determine the +optimal threshold (Lausen and Schumacher, 1992; Hothorn et al., 2003) to +stratify samples into two optimally separated risk groups. +} + +One or more stratification methods can be selected simultaneously. + +This parameter is only relevant for \code{survival} outcomes.} + \item{\code{stratification_threshold}}{(\emph{optional}) Numeric value(s) signifying the +sample quantiles for stratification using the \code{fixed} method. The number of +risk groups will be the number of values +1. + +The default value is \code{c(1/3, 2/3)}, which will yield two thresholds that +divide samples into three equally sized groups. If \code{fixed} is not among the +selected stratification methods, this parameter is ignored. 
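For illustration only (not part of the generated documentation), a minimal sketch of fixed-quantile risk stratification for a survival outcome; the data object, column names, feature selection method and learner are assumptions:

```r
# Hypothetical sketch only; `my_survival_data`, the column names, "mrmr" and
# "cox" are assumptions rather than documented defaults.
library(familiar)

summon_familiar(
  data = my_survival_data,
  sample_id_column = "id",
  outcome_column = c("time", "event"),    # time-to-event and event status columns
  outcome_type = "survival",
  experimental_design = "fs+mb",
  fs_method = "mrmr",
  learner = "cox",
  stratification_method = "fixed",
  stratification_threshold = c(1/3, 2/3), # two thresholds yield three risk groups
  parallel = FALSE
)
```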
+ +This parameter is only relevant for \code{survival} outcomes.} + \item{\code{time_max}}{(\emph{optional}) Time point which is used as the benchmark for +e.g. cumulative risks generated by random forest, or the cutoff for Uno's +concordance index. + +If \code{time_max} is not provided, but \code{evaluation_times} is, the largest value +of \code{evaluation_times} is used. If neither is provided, \code{time_max} is set +to the 98th percentile of the distribution of survival times for samples +with an event in the development data set. + +This parameter is only relevant for \code{survival} outcomes.} + \item{\code{evaluation_times}}{(\emph{optional}) One or more time points that are used for +assessing calibration in survival problems. This is done because expected and +observed survival probabilities depend on time. + +If unset, \code{evaluation_times} will be equal to \code{time_max}. + +This parameter is only relevant for \code{survival} outcomes.} + \item{\code{dynamic_model_loading}}{(\emph{optional}) Enables dynamic loading of models +during the evaluation process, if \code{TRUE}. Defaults to \code{FALSE}. Dynamic +loading of models may reduce the overall memory footprint, at the cost of +increased disk or network IO. Models can only be dynamically loaded if they +are found at an accessible disk or network location. Setting this parameter +to \code{TRUE} may help if parallel processing causes out-of-memory issues during +evaluation.} + \item{\code{parallel_evaluation}}{(\emph{optional}) Enable parallel processing for +the evaluation process. Defaults to \code{TRUE}. When set to \code{FALSE}, this +will disable the use of parallel processing while performing evaluation, +regardless of the settings of the \code{parallel} parameter. The parameter +moreover specifies whether parallelisation takes place within the evaluation +process steps (\code{inner}, default), or in an outer loop ( \code{outer}) over +learners, data subsamples, etc. + +\code{parallel_evaluation} is ignored if \code{parallel=FALSE}.} + }} +} +\value{ +Nothing. All output is written to the experiment directory. If the +experiment directory is in a temporary location, a list with all +familiarModel, familiarEnsemble, familiarData and familiarCollection +objects will be returned. +} +\description{ +Perform end-to-end machine learning and data analysis +} +\references{ +\enumerate{ +\item Storey, J. D. A direct approach to false discovery rates. J. +R. Stat. Soc. Series B Stat. Methodol. 64, 479–498 (2002). +\item Shrout, P. E. & Fleiss, J. L. Intraclass correlations: uses in assessing +rater reliability. Psychol. Bull. 86, 420–428 (1979). +\item Koo, T. K. & Li, M. Y. A guideline of selecting and reporting intraclass +correlation coefficients for reliability research. J. Chiropr. Med. 15, +155–163 (2016). +\item Yeo, I. & Johnson, R. A. A new family of power transformations to +improve normality or symmetry. Biometrika 87, 954–959 (2000). +\item Box, G. E. P. & Cox, D. R. An analysis of transformations. J. R. Stat. +Soc. Series B Stat. Methodol. 26, 211–252 (1964). +\item Raymaekers, J., Rousseeuw, P. J. Transforming variables to central +normality. Mach Learn. (2021). +\item Park, M. Y., Hastie, T. & Tibshirani, R. Averaged gene expressions for +regression. Biostatistics 8, 212–227 (2007). +\item Tolosi, L. & Lengauer, T. Classification with correlated features: +unreliability of feature ranking and solutions. Bioinformatics 27, +1986–1994 (2011). +\item Johnson, W. E., Li, C. & Rabinovic, A.
Adjusting batch effects in +microarray expression data using empirical Bayes methods. Biostatistics 8, +118–127 (2007) +\item Kaufman, L. & Rousseeuw, P. J. Finding groups in data: an introduction +to cluster analysis. (John Wiley & Sons, 2009). +\item Muellner, D. fastcluster: fast hierarchical, agglomerative clustering +routines for R and Python. J. Stat. Softw. 53, 1–18 (2013). +\item Rousseeuw, P. J. Silhouettes: A graphical aid to the interpretation and +validation of cluster analysis. J. Comput. Appl. Math. 20, 53–65 (1987). +\item Langfelder, P., Zhang, B. & Horvath, S. Defining clusters from a +hierarchical cluster tree: the Dynamic Tree Cut package for R. +Bioinformatics 24, 719–720 (2008). +\item McFadden, D. Conditional logit analysis of qualitative choice behavior. +in Frontiers in Econometrics (ed. Zarembka, P.) 105–142 (Academic Press, +1974). +\item Cox, D. R. & Snell, E. J. Analysis of binary data. (Chapman and Hall, +1989). +\item Nagelkerke, N. J. D. A note on a general definition of the coefficient +of determination. Biometrika 78, 691–692 (1991). +\item Meinshausen, N. & Buehlmann, P. Stability selection. J. R. Stat. Soc. +Series B Stat. Methodol. 72, 417–473 (2010). +\item Haury, A.-C., Gestraud, P. & Vert, J.-P. The influence of feature +selection methods on accuracy, stability and interpretability of molecular +signatures. PLoS One 6, e28210 (2011). +\item Wald, R., Khoshgoftaar, T. M., Dittman, D., Awada, W. & Napolitano,A. An +extensive comparison of feature ranking aggregation techniques in +bioinformatics. in 2012 IEEE 13th International Conference on Information +Reuse Integration (IRI) 377–384 (2012). +\item Hutter, F., Hoos, H. H. & Leyton-Brown, K. Sequential model-based +optimization for general algorithm configuration. in Learning and +Intelligent Optimization (ed. Coello, C. A. C.) 6683, 507–523 (Springer +Berlin Heidelberg, 2011). +\item Shahriari, B., Swersky, K., Wang, Z., Adams, R. P. & de Freitas, N. +Taking the Human Out of the Loop: A Review of Bayesian Optimization. Proc. +IEEE 104, 148–175 (2016) +\item Srinivas, N., Krause, A., Kakade, S. M. & Seeger, M. W. +Information-Theoretic Regret Bounds for Gaussian Process Optimization in +the Bandit Setting. IEEE Trans. Inf. Theory 58, 3250–3265 (2012) +\item Kaufmann, E., Cappé, O. & Garivier, A. On Bayesian upper confidence +bounds for bandit problems. in Artificial intelligence and statistics +592–600 (2012). +\item Jamieson, K. & Talwalkar, A. Non-stochastic Best Arm Identification and +Hyperparameter Optimization. in Proceedings of the 19th International +Conference on Artificial Intelligence and Statistics (eds. Gretton, A. & +Robert, C. C.) vol. 51 240–248 (PMLR, 2016). +\item Gramacy, R. B. laGP: Large-Scale Spatial Modeling via Local Approximate +Gaussian Processes in R. Journal of Statistical Software 72, 1–46 (2016) +\item Sparapani, R., Spanbauer, C. & McCulloch, R. Nonparametric Machine +Learning and Efficient Computation with Bayesian Additive Regression Trees: +The BART R Package. Journal of Statistical Software 97, 1–66 (2021) +\item Davison, A. C. & Hinkley, D. V. Bootstrap methods and their application. +(Cambridge University Press, 1997). +\item Efron, B. & Hastie, T. Computer Age Statistical Inference. (Cambridge +University Press, 2016). +\item Lausen, B. & Schumacher, M. Maximally Selected Rank Statistics. +Biometrics 48, 73 (1992). +\item Hothorn, T. & Lausen, B. On the exact distribution of maximally selected +rank statistics. Comput. Stat. Data Anal. 43, 121–137 (2003). 
+} +} diff --git a/man/theme_familiar.Rd b/man/theme_familiar.Rd new file mode 100644 index 00000000..7fe559b8 --- /dev/null +++ b/man/theme_familiar.Rd @@ -0,0 +1,30 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/PlotUtilities.R +\name{theme_familiar} +\alias{theme_familiar} +\title{Familiar ggplot2 theme} +\usage{ +theme_familiar( + base_size = 11, + base_family = "", + base_line_size = 0.5, + base_rect_size = 0.5 +) +} +\arguments{ +\item{base_size}{Base font size in points. Size of other plot text elements +is based on this.} + +\item{base_family}{Font family used for text elements.} + +\item{base_line_size}{Base size for line elements, in points.} + +\item{base_rect_size}{Base size for rectangular elements, in points.} +} +\value{ +A complete plotting theme. +} +\description{ +This is the default theme used for plots created by familiar. The theme uses +\code{ggplot2::theme_light} as the base template. +} diff --git a/man/train_familiar.Rd b/man/train_familiar.Rd new file mode 100644 index 00000000..f066a7a3 --- /dev/null +++ b/man/train_familiar.Rd @@ -0,0 +1,979 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Familiar.R +\name{train_familiar} +\alias{train_familiar} +\title{Create models using end-to-end machine learning} +\usage{ +train_familiar( + formula = NULL, + data = NULL, + experiment_data = NULL, + cl = NULL, + experimental_design = "fs+mb", + learner = NULL, + hyperparameter = NULL, + verbose = TRUE, + ... +) +} +\arguments{ +\item{formula}{An R formula. The formula can only contain feature names and +dot (\code{.}). The \code{*} and \code{+1} operators are not supported as these refer to +columns that are not present in the data set. + +Use of the formula interface is optional.} + +\item{data}{A \code{data.table} object, a \code{data.frame} object, list containing +multiple \code{data.table} or \code{data.frame} objects, or paths to data files. + +\code{data} should be provided if no file paths are provided to the \code{data_files} +argument. If both are provided, only \code{data} will be used. + +All data is expected to be in wide format, and ideally has a sample +identifier (see \code{sample_id_column}), batch identifier (see \code{cohort_column}) +and outcome columns (see \code{outcome_column}). + +In case paths are provided, the data should be stored as \code{csv}, \code{rds} or +\code{RData} files. See documentation for the \code{data_files} argument for more +information.} + +\item{experiment_data}{Experimental data may be provided in the form of} + +\item{cl}{Cluster created using the \code{parallel} package. This cluster is then +used to speed up computation through parallelisation. When a cluster is not +provided, parallelisation is performed by setting up a cluster on the local +machine. + +This parameter has no effect if the \code{parallel} argument is set to \code{FALSE}.} + +\item{experimental_design}{(\strong{required}) Defines what the experiment looks +like, e.g. \code{cv(bt(fs,20)+mb,3,2)} for 2 times repeated 3-fold +cross-validation with nested feature selection on 20 bootstraps and +model-building. The basic workflow components are: +\itemize{ +\item \code{fs}: (required) feature selection step. +\item \code{mb}: (required) model building step. +\item \code{ev}: (optional) external validation.
Setting this is not required for +\code{train_familiar}, but if validation batches or cohorts are present in the +dataset (\code{data}), these should be indicated in the \code{validation_batch_id} +argument. +} + +The different components are linked using \code{+}. + +Different subsampling methods can be used in conjunction with the basic +workflow components: +\itemize{ +\item \code{bs(x,n)}: (stratified) .632 bootstrap, with \code{n} the number of +bootstraps. In contrast to \code{bt}, feature pre-processing parameters and +hyperparameter optimisation are conducted on individual bootstraps. +\item \code{bt(x,n)}: (stratified) .632 bootstrap, with \code{n} the number of +bootstraps. Unlike \code{bs} and other subsampling methods, no separate +pre-processing parameters or optimised hyperparameters will be determined +for each bootstrap. +\item \code{cv(x,n,p)}: (stratified) \code{n}-fold cross-validation, repeated \code{p} times. +Pre-processing parameters are determined for each iteration. +\item \code{lv(x)}: leave-one-out-cross-validation. Pre-processing parameters are +determined for each iteration. +\item \code{ip(x)}: imbalance partitioning for addressing class imbalances on the +data set. Pre-processing parameters are determined for each partition. The +number of partitions generated depends on the imbalance correction method +(see the \code{imbalance_correction_method} parameter). +} + +As shown in the example above, sampling algorithms can be nested. + +The simplest valid experimental design is \code{fs+mb}. This is the default in +\code{train_familiar}, and will create one model for each feature selection +method in \code{fs_method}. To create more models, a subsampling method should +be introduced, e.g. \code{bs(fs+mb,20)} to create 20 models based on bootstraps +of the data. + +This argument is ignored if the \code{experiment_data} argument is set.} + +\item{learner}{(\strong{required}) Name of the learner used to develop a model. A +sizeable number of learners is supported in \code{familiar}. Please see the +vignette on learners for more information concerning the available +learners. Unlike the \code{summon_familiar} function, \code{train_familiar} only +allows for a single learner.} + +\item{hyperparameter}{(\emph{optional}) List, or nested list containing +hyperparameters for learners. If a nested list is provided, each sublist +should have the name of the learner method it corresponds to, with list +elements being named after the intended hyperparameter, e.g. +\code{"glm_logistic"=list("sign_size"=3)} + +All learners have hyperparameters. Please refer to the vignette on learners +for more details. If no parameters are provided, sequential model-based +optimisation is used to determine optimal hyperparameters. + +Hyperparameters provided by the user are never optimised. However, if more +than one value is provided for a single hyperparameter, optimisation will +be conducted using these values.} + +\item{verbose}{Indicates verbosity of the results.
Default is TRUE, and all +messages and warnings are returned.} + +\item{...}{ + Arguments passed on to \code{\link[=.parse_experiment_settings]{.parse_experiment_settings}}, \code{\link[=.parse_setup_settings]{.parse_setup_settings}}, \code{\link[=.parse_preprocessing_settings]{.parse_preprocessing_settings}}, \code{\link[=.parse_feature_selection_settings]{.parse_feature_selection_settings}}, \code{\link[=.parse_model_development_settings]{.parse_model_development_settings}}, \code{\link[=.parse_hyperparameter_optimisation_settings]{.parse_hyperparameter_optimisation_settings}} + \describe{ + \item{\code{batch_id_column}}{(\strong{recommended}) Name of the column containing batch +or cohort identifiers. This parameter is required if more than one dataset +is provided, or if external validation is performed. + +In familiar any row of data is organised by four identifiers: +\itemize{ +\item The batch identifier \code{batch_id_column}: This denotes the group to which a +set of samples belongs, e.g. patients from a single study, samples measured +in a batch, etc. The batch identifier is used for batch normalisation, as +well as selection of development and validation datasets. +\item The sample identifier \code{sample_id_column}: This denotes the sample level, +e.g. data from a single individual. Subsets of data, e.g. bootstraps or +cross-validation folds, are created at this level. +\item The series identifier \code{series_id_column}: Indicates measurements on a +single sample that may not share the same outcome value, e.g. a time +series, or the number of cells in a view. +\item The repetition identifier: Indicates repeated measurements in a single +series where any feature values may differ, but the outcome does not. +Repetition identifiers are always implicitly set when multiple entries for +the same series of the same sample in the same batch that share the same +outcome are encountered. +}} + \item{\code{sample_id_column}}{(\strong{recommended}) Name of the column containing +sample or subject identifiers. See \code{batch_id_column} above for more +details. + +If unset, every row will be identified as a single sample.} + \item{\code{series_id_column}}{(\strong{optional}) Name of the column containing series +identifiers, which distinguish between measurements that are part of a +series for a single sample. See \code{batch_id_column} above for more details. + +If unset, rows which share the same batch and sample identifiers but have a +different outcome are assigned unique series identifiers.} + \item{\code{development_batch_id}}{(\emph{optional}) One or more batch or cohort +identifiers to constitute data sets for development. Defaults to all, or +all minus the identifiers in \code{validation_batch_id} for external validation. +Required if external validation is performed and \code{validation_batch_id} is +not provided.} + \item{\code{validation_batch_id}}{(\emph{optional}) One or more batch or cohort +identifiers to constitute data sets for external validation. Defaults to +all data sets except those in \code{development_batch_id} for external +validation, or none if not. Required if \code{development_batch_id} is not +provided.} + \item{\code{outcome_name}}{(\emph{optional}) Name of the modelled outcome. This name will +be used in figures created by \code{familiar}. + +If not set, the column name in \code{outcome_column} will be used for +\code{binomial}, \code{multinomial}, \code{count} and \code{continuous} outcomes. 
For other +outcomes (\code{survival} and \code{competing_risk}) no default is used.} + \item{\code{outcome_column}}{(\strong{recommended}) Name of the column containing the +outcome of interest. May be identified from a formula, if a formula is +provided as an argument. Otherwise an error is raised. Note that the \code{survival} +and \code{competing_risk} outcome types require two columns that +indicate the time-to-event or the time of last follow-up and the event +status.} + \item{\code{outcome_type}}{(\strong{recommended}) Type of outcome found in the outcome +column. The outcome type determines many aspects of the overall process, +e.g. the available feature selection methods and learners, but also the +type of assessments that can be conducted to evaluate the resulting models. +Implemented outcome types are: +\itemize{ +\item \code{binomial}: categorical outcome with 2 levels. +\item \code{multinomial}: categorical outcome with 2 or more levels. +\item \code{count}: Poisson-distributed numeric outcomes. +\item \code{continuous}: general continuous numeric outcomes. +\item \code{survival}: survival outcome for time-to-event data. +} + +If not provided, the algorithm will attempt to obtain the \code{outcome_type} from +the contents of the outcome column. This may lead to unexpected results, and we +therefore advise providing this information manually. + +Note that \code{competing_risk} survival analysis is not fully supported, and +is currently not a valid choice for \code{outcome_type}.} + \item{\code{class_levels}}{(\emph{optional}) Class levels for \code{binomial} or \code{multinomial} +outcomes. This argument can be used to specify the ordering of levels for +categorical outcomes. These class levels must exactly match the levels +present in the outcome column.} + \item{\code{event_indicator}}{(\strong{recommended}) Indicator for events in \code{survival} +and \code{competing_risk} analyses. \code{familiar} will automatically recognise \code{1}, +\code{true}, \code{t}, \code{y} and \code{yes} as event indicators, including different +capitalisations. If this parameter is set, it replaces the default values.} + \item{\code{censoring_indicator}}{(\strong{recommended}) Indicator for right-censoring in +\code{survival} and \code{competing_risk} analyses. \code{familiar} will automatically +recognise \code{0}, \code{false}, \code{f}, \code{n} and \code{no} as censoring indicators, including +different capitalisations. If this parameter is set, it replaces the +default values.} + \item{\code{competing_risk_indicator}}{(\strong{recommended}) Indicator for competing +risks in \code{competing_risk} analyses. There are no default values, and if +unset, all values other than those specified by the \code{event_indicator} and +\code{censoring_indicator} parameters are considered to indicate competing +risks.} + \item{\code{signature}}{(\emph{optional}) One or more names of feature columns that are +considered part of a specific signature. Features specified here will +always be used for modelling. Ranking from feature selection has no effect +for these features.} + \item{\code{novelty_features}}{(\emph{optional}) One or more names of feature columns +that should be included for the purpose of novelty detection.} + \item{\code{exclude_features}}{(\emph{optional}) Feature columns that will be removed +from the data set.
Cannot overlap with features in \code{signature}, +\code{novelty_features} or \code{include_features}.} + \item{\code{include_features}}{(\emph{optional}) Feature columns that are specifically +included in the data set. By default all features are included. Cannot +overlap with \code{exclude_features}, but may overlap \code{signature}. Features in +\code{signature} and \code{novelty_features} are always included. If both +\code{exclude_features} and \code{include_features} are provided, \code{include_features} +takes precedence, provided that there is no overlap between the two.} + \item{\code{reference_method}}{(\emph{optional}) Method used to set reference levels for +categorical features. There are several options: +\itemize{ +\item \code{auto} (default): Categorical features that are not explicitly set by the +user, i.e. columns containing boolean values or characters, use the most +frequent level as reference. Categorical features that are explicitly set, +i.e. as factors, are used as is. +\item \code{always}: Both automatically detected and user-specified categorical +features have the reference level set to the most frequent level. Ordinal +features are not altered, but are used as is. +\item \code{never}: User-specified categorical features are used as is. +Automatically detected categorical features are simply sorted, and the +first level is then used as the reference level. This was the behaviour +prior to familiar version 1.3.0. +}} + \item{\code{imbalance_correction_method}}{(\emph{optional}) Type of method used to +address class imbalances. Available options are: +\itemize{ +\item \code{full_undersampling} (default): All data will be used in an ensemble +fashion. The full minority class will appear in each partition, but +majority classes are undersampled until all data have been used. +\item \code{random_undersampling}: Randomly undersamples majority classes. This is +useful in cases where full undersampling would lead to the formation of +many models due to major overrepresentation of the largest class. +} + +This parameter is only used in combination with imbalance partitioning in +the experimental design, and \code{ip} should therefore appear in the string +that defines the design.} + \item{\code{imbalance_n_partitions}}{(\emph{optional}) Number of times random +undersampling should be repeated. 10 undersampled subsets with balanced +classes are formed by default.} + \item{\code{parallel}}{(\emph{optional}) Enable parallel processing. Defaults to \code{TRUE}. +When set to \code{FALSE}, this disables all parallel processing, regardless of +specific parameters such as \code{parallel_preprocessing}. However, when +\code{parallel} is \code{TRUE}, parallel processing of different parts of the +workflow can be disabled by setting respective flags to \code{FALSE}.} + \item{\code{parallel_nr_cores}}{(\emph{optional}) Number of cores available for +parallelisation. Defaults to 2. This setting does nothing if +parallelisation is disabled.} + \item{\code{restart_cluster}}{(\emph{optional}) Restart nodes used for parallel computing +to free up memory prior to starting a parallel process. Note that it does +take time to set up the clusters. Therefore setting this argument to \code{TRUE} +may impact processing speed. This argument is ignored if \code{parallel} is +\code{FALSE} or the cluster was initialised outside of familiar.
Default is +\code{FALSE}, which causes the clusters to be initialised only once.} + \item{\code{cluster_type}}{(\emph{optional}) Selection of the cluster type for parallel +processing. Available types are the ones supported by the parallel package +that is part of the base R distribution: \code{psock} (default), \code{fork}, \code{mpi}, +\code{nws}, \code{sock}. In addition, \code{none} is available, which also disables +parallel processing.} + \item{\code{backend_type}}{(\emph{optional}) Selection of the backend for distributing +copies of the data. This backend ensures that only a single master copy is +kept in memory. This limits memory usage during parallel processing. + +Several backend options are available, notably \code{socket_server}, and \code{none} +(default). \code{socket_server} is based on the callr package and R sockets, +comes with \code{familiar} and is available for any OS. \code{none} uses the package +environment of familiar to store data, and is available for any OS. +However, \code{none} requires copying of data to any parallel process, and has a +larger memory footprint.} + \item{\code{server_port}}{(\emph{optional}) Integer indicating the port on which the +socket server or RServe process should communicate. Defaults to port 6311. +Note that ports 0 to 1024 and 49152 to 65535 cannot be used.} + \item{\code{feature_max_fraction_missing}}{(\emph{optional}) Numeric value between \code{0.0} +and \code{0.95} that determines the maximum fraction of missing values that +still allows a feature to be included in the data set. All features with a +missing value fraction over this threshold are not processed further. The +default value is \code{0.30}.} + \item{\code{sample_max_fraction_missing}}{(\emph{optional}) Numeric value between \code{0.0} +and \code{0.95} that determines the maximum fraction of missing values that +still allows a sample to be included in the data set. All samples with a +missing value fraction over this threshold are excluded and not processed +further. The default value is \code{0.30}.} + \item{\code{filter_method}}{(\emph{optional}) One or more methods used to reduce +dimensionality of the data set by removing irrelevant or poorly +reproducible features. + +Several methods are available: +\itemize{ +\item \code{none} (default): None of the features will be filtered. +\item \code{low_variance}: Features with a variance below the +\code{low_var_minimum_variance_threshold} are filtered. This can be useful to +filter, for example, genes that are not differentially expressed. +\item \code{univariate_test}: Features undergo a univariate regression using an +outcome-appropriate regression model. The p-value of the model coefficient +is collected. Features with coefficient p or q-value above the +\code{univariate_test_threshold} are subsequently filtered. +\item \code{robustness}: Features that are not sufficiently robust according to the +intraclass correlation coefficient are filtered. Use of this method +requires that repeated measurements are present in the data set, i.e. there +should be entries for which the sample and cohort identifiers are the same. +} + +More than one method can be used simultaneously. Features with singular +values are always filtered, as these do not contain information.} + \item{\code{univariate_test_threshold}}{(\emph{optional}) Numeric value between \code{1.0} and +\code{0.0} that determines which features are irrelevant and will be filtered by +the \code{univariate_test}. The p or q-values are compared to this threshold.
+All features with values above the threshold are filtered. The default +value is \code{0.20}.} + \item{\code{univariate_test_threshold_metric}}{(\emph{optional}) Metric used to +compare the \code{univariate_test_threshold} against. The following metrics can +be chosen: +\itemize{ +\item \code{p_value} (default): The unadjusted p-value of each feature is used +to filter features. +\item \code{q_value}: The q-value (Storey, 2002) is used to filter features. Some +data sets may have insufficient samples to compute the q-value. The +\code{qvalue} package must be installed from Bioconductor to use this method. +}} + \item{\code{univariate_test_max_feature_set_size}}{(\emph{optional}) Maximum size of the +feature set after the univariate test. P or q values of features are +compared against the threshold, but if the resulting data set would be +larger than this setting, only the most relevant features up to the desired +feature set size are selected. + +The default value is \code{NULL}, which causes features to be filtered based on +their relevance only.} + \item{\code{low_var_minimum_variance_threshold}}{(required, if used) Numeric value +that determines which features will be filtered by the \code{low_variance} +method. The variance of each feature is computed and compared to the +threshold. If it is below the threshold, the feature is removed. + +This parameter has no default value and should be set if \code{low_variance} is +used.} + \item{\code{low_var_max_feature_set_size}}{(\emph{optional}) Maximum size of the feature +set after filtering features with a low variance. All features are first +compared against \code{low_var_minimum_variance_threshold}. If the resulting +feature set would be larger than specified, only the most strongly varying +features will be selected, up to the desired size of the feature set. + +The default value is \code{NULL}, which causes features to be filtered based on +their variance only.} + \item{\code{robustness_icc_type}}{(\emph{optional}) String indicating the type of +intraclass correlation coefficient (\code{1}, \code{2} or \code{3}) that should be used to +compute robustness for features in repeated measurements. These types +correspond to the types in Shrout and Fleiss (1979). The default value is +\code{1}.} + \item{\code{robustness_threshold_metric}}{(\emph{optional}) String indicating which +specific intraclass correlation coefficient (ICC) metric should be used to +filter features. This should be one of: +\itemize{ +\item \code{icc}: The estimated ICC value itself. +\item \code{icc_low} (default): The estimated lower limit of the 95\% confidence +interval of the ICC, as suggested by Koo and Li (2016). +\item \code{icc_panel}: The estimated ICC value over the panel average, i.e. the ICC +that would be obtained if all repeated measurements were averaged. +\item \code{icc_panel_low}: The estimated lower limit of the 95\% confidence interval +of the panel ICC. +}} + \item{\code{robustness_threshold_value}}{(\emph{optional}) The intraclass correlation +coefficient value that is used as the threshold. The default value is \code{0.70}.} + \item{\code{transformation_method}}{(\emph{optional}) The transformation method used to +change the distribution of the data to be more normal-like. The following +methods are available: +\itemize{ +\item \code{none}: This disables transformation of features. +\item \code{yeo_johnson} (default): Transformation using the Yeo-Johnson +transformation (Yeo and Johnson, 2000).
The algorithm tests various lambda +values and selects the lambda that maximises the log-likelihood. +\item \code{yeo_johnson_trim}: As \code{yeo_johnson}, but based on the set of feature +values where the 5\% lowest and 5\% highest values are discarded. This +reduces the effect of outliers. +\item \code{yeo_johnson_winsor}: As \code{yeo_johnson}, but based on the set of feature +values where the 5\% lowest and 5\% highest values are winsorised. This +reduces the effect of outliers. +\item \code{yeo_johnson_robust}: A robust version of \code{yeo_johnson} after Raymaekers +and Rousseeuw (2021). This method is less sensitive to outliers. +\item \code{box_cox}: Transformation using the Box-Cox transformation (Box and Cox, +1964). Unlike the Yeo-Johnson transformation, the Box-Cox transformation +requires that all data are positive. Features that contain zero or negative +values cannot be transformed using this transformation. The algorithm tests +various lambda values and selects the lambda that maximises the +log-likelihood. +\item \code{box_cox_trim}: As \code{box_cox}, but based on the set of feature values +where the 5\% lowest and 5\% highest values are discarded. This reduces the +effect of outliers. +\item \code{box_cox_winsor}: As \code{box_cox}, but based on the set of feature values +where the 5\% lowest and 5\% highest values are winsorised. This reduces the +effect of outliers. +\item \code{box_cox_robust}: A robust version of \code{box_cox} after Raymaekers and +Rousseeuw (2021). This method is less sensitive to outliers. +} + +Only features that contain numerical data are transformed. Transformation +parameters obtained in development data are stored within \code{featureInfo} +objects for later use with validation data sets.} + \item{\code{normalisation_method}}{(\emph{optional}) The normalisation method used to +improve the comparability between numerical features that may have very +different scales. The following normalisation methods can be chosen: +\itemize{ +\item \code{none}: This disables feature normalisation. +\item \code{standardisation}: Features are normalised by subtraction of their mean +values and division by their standard deviations. This causes every feature +to have a center value of 0.0 and a standard deviation of 1.0. +\item \code{standardisation_trim}: As \code{standardisation}, but based on the set of +feature values where the 5\% lowest and 5\% highest values are discarded. +This reduces the effect of outliers. +\item \code{standardisation_winsor}: As \code{standardisation}, but based on the set of +feature values where the 5\% lowest and 5\% highest values are winsorised. +This reduces the effect of outliers. +\item \code{standardisation_robust} (default): A robust version of \code{standardisation} +that relies on computing Huber's M-estimators for location and scale. +\item \code{normalisation}: Features are normalised by subtraction of their minimum +values and division by their ranges. This maps all feature values to a +\eqn{[0, 1]} interval. +\item \code{normalisation_trim}: As \code{normalisation}, but based on the set of feature +values where the 5\% lowest and 5\% highest values are discarded. This +reduces the effect of outliers. +\item \code{normalisation_winsor}: As \code{normalisation}, but based on the set of +feature values where the 5\% lowest and 5\% highest values are winsorised. +This reduces the effect of outliers.
+\item \code{quantile}: Features are normalised by subtraction of their median values +and division by their interquartile range. +\item \code{mean_centering}: Features are centered by subtracting the mean, but do +not undergo rescaling. +} + +Only features that contain numerical data are normalised. Normalisation +parameters obtained in development data are stored within \code{featureInfo} +objects for later use with validation data sets.} + \item{\code{batch_normalisation_method}}{(\emph{optional}) The method used for batch +normalisation. Available methods are: +\itemize{ +\item \code{none} (default): This disables batch normalisation of features. +\item \code{standardisation}: Features within each batch are normalised by +subtraction of the mean value and division by the standard deviation in +each batch. +\item \code{standardisation_trim}: As \code{standardisation}, but based on the set of +feature values where the 5\% lowest and 5\% highest values are discarded. +This reduces the effect of outliers. +\item \code{standardisation_winsor}: As \code{standardisation}, but based on the set of +feature values where the 5\% lowest and 5\% highest values are winsorised. +This reduces the effect of outliers. +\item \code{standardisation_robust}: A robust version of \code{standardisation} that +relies on computing Huber's M-estimators for location and scale within each +batch. +\item \code{normalisation}: Features within each batch are normalised by subtraction +of their minimum values and division by their range in each batch. This +maps all feature values in each batch to a \eqn{[0, 1]} interval. +\item \code{normalisation_trim}: As \code{normalisation}, but based on the set of feature +values where the 5\% lowest and 5\% highest values are discarded. This +reduces the effect of outliers. +\item \code{normalisation_winsor}: As \code{normalisation}, but based on the set of +feature values where the 5\% lowest and 5\% highest values are winsorised. +This reduces the effect of outliers. +\item \code{quantile}: Features in each batch are normalised by subtraction of the +median value and division by the interquartile range of each batch. +\item \code{mean_centering}: Features in each batch are centered on 0.0 by +subtracting the mean value in each batch, but are not rescaled. +\item \code{combat_parametric}: Batch adjustments using parametric empirical Bayes +(Johnson et al, 2007). \code{combat_p} leads to the same method. +\item \code{combat_non_parametric}: Batch adjustments using non-parametric empirical +Bayes (Johnson et al, 2007). \code{combat_np} and \code{combat} lead to the same +method. Note that we reduced complexity from O(\eqn{n^2}) to O(\eqn{n}) by +only computing batch adjustment parameters for each feature on a subset of +50 randomly selected features, instead of all features. +} + +Only features that contain numerical data are normalised using batch +normalisation. Batch normalisation parameters obtained in development data +are stored within \code{featureInfo} objects for later use with validation data +sets, in case the validation data is from the same batch. + +If validation data contains data from unknown batches, normalisation +parameters are separately determined for these batches. + +Note that for both empirical Bayes methods, the batch effect is assumed to +affect features in a similar manner. This is often true for things such as +gene expressions, but the assumption may not hold generally.
+ +When performing batch normalisation, it is moreover important to check that +differences between batches or cohorts are not related to the studied +endpoint.} + \item{\code{imputation_method}}{(\emph{optional}) Method used for imputing missing +feature values. Two methods are implemented: +\itemize{ +\item \code{simple}: Simple replacement of a missing value by the median value (for +numeric features) or the modal value (for categorical features). +\item \code{lasso}: Imputation of missing values by lasso regression (using \code{glmnet}) +based on information contained in other features. +} + +\code{simple} imputation precedes \code{lasso} imputation to ensure that any missing +values in predictors required for \code{lasso} regression are resolved. The +\code{lasso} estimate is then used to replace the missing value. + +The default value depends on the number of features in the dataset. If the +number is lower than 100, \code{lasso} is used by default, and \code{simple} +otherwise. + +Only single imputation is performed. Imputation models and parameters are +stored within \code{featureInfo} objects for later use with validation data +sets.} + \item{\code{cluster_method}}{(\emph{optional}) Clustering is performed to identify and +replace redundant features, for example those that are highly correlated. +Such features do not carry much additional information and may be removed +or replaced instead (Park et al., 2007; Tolosi and Lengauer, 2011). + +The cluster method determines the algorithm used to form the clusters. The +following cluster methods are implemented: +\itemize{ +\item \code{none}: No clustering is performed. +\item \code{hclust} (default): Hierarchical agglomerative clustering. If the +\code{fastcluster} package is installed, \code{fastcluster::hclust} is used (Muellner +2013), otherwise \code{stats::hclust} is used. +\item \code{agnes}: Hierarchical clustering using agglomerative nesting (Kaufman and +Rousseeuw, 1990). This algorithm is similar to \code{hclust}, but uses the +\code{cluster::agnes} implementation. +\item \code{diana}: Divisive analysis hierarchical clustering. This method uses +divisive instead of agglomerative clustering (Kaufman and Rousseeuw, 1990). +\code{cluster::diana} is used. +\item \code{pam}: Partitioning around medoids. This partitions the data into \eqn{k} +clusters around medoids (Kaufman and Rousseeuw, 1990). \eqn{k} is selected +using the \code{silhouette} metric. \code{pam} is implemented using the +\code{cluster::pam} function. +} + +Clusters and cluster information are stored within \code{featureInfo} objects for +later use with validation data sets. This enables reproduction of the same +clusters as formed in the development data set.} + \item{\code{cluster_linkage_method}}{(\emph{optional}) Linkage method used for +agglomerative clustering in \code{hclust} and \code{agnes}. The following linkage +methods can be used: +\itemize{ +\item \code{average} (default): Average linkage. +\item \code{single}: Single linkage. +\item \code{complete}: Complete linkage. +\item \code{weighted}: Weighted linkage, also known as McQuitty linkage. +\item \code{ward}: Linkage using Ward's minimum variance method. +} + +\code{diana} and \code{pam} do not require a linkage method.} + \item{\code{cluster_cut_method}}{(\emph{optional}) The method used to define the actual +clusters. The following methods can be used: +\itemize{ +\item \code{silhouette}: Clusters are formed based on the silhouette score +(Rousseeuw, 1987).
The average silhouette score is computed from 2 to +\eqn{n} clusters, with \eqn{n} the number of features. Clusters are only +formed if the average silhouette exceeds 0.50, which indicates reasonable +evidence for structure. This procedure may be slow if the number of +features is large (>100s). +\item \code{fixed_cut}: Clusters are formed by cutting the hierarchical tree at the +point indicated by the \code{cluster_similarity_threshold}, e.g. where features +in a cluster have an average Spearman correlation of 0.90. \code{fixed_cut} is +only available for \code{agnes}, \code{diana} and \code{hclust}. +\item \code{dynamic_cut}: Dynamic cluster formation using the cutting algorithm in +the \code{dynamicTreeCut} package. This package should be installed to select +this option. \code{dynamic_cut} can only be used with \code{agnes} and \code{hclust}. +} + +The default options are \code{silhouette} for partitioning around medoids (\code{pam}) +and \code{fixed_cut} otherwise.} + \item{\code{cluster_similarity_metric}}{(\emph{optional}) Clusters are formed based on +feature similarity. All features are compared in a pair-wise fashion to +compute similarity, for example correlation. The resulting similarity grid +is converted into a distance matrix that is subsequently used for +clustering. The following metrics are supported to compute pairwise +similarities: +\itemize{ +\item \code{mutual_information} (default): normalised mutual information. +\item \code{mcfadden_r2}: McFadden's pseudo R-squared (McFadden, 1974). +\item \code{cox_snell_r2}: Cox and Snell's pseudo R-squared (Cox and Snell, 1989). +\item \code{nagelkerke_r2}: Nagelkerke's pseudo R-squared (Nagelkerke, 1991). +\item \code{spearman}: Spearman's rank order correlation. +\item \code{kendall}: Kendall rank correlation. +\item \code{pearson}: Pearson product-moment correlation. +} + +The pseudo R-squared metrics can be used to assess similarity between mixed +pairs of numeric and categorical features, as these are based on the +log-likelihood of regression models. In \code{familiar}, the more informative +feature is used as the predictor and the other feature as the response +variable. In numeric-categorical pairs, the numeric feature is considered +to be more informative and is thus used as the predictor. In +categorical-categorical pairs, the feature with the most levels is used as the +predictor. + +In case any of the classical correlation coefficients (\code{pearson}, +\code{spearman} and \code{kendall}) are used with (mixed) categorical features, the +categorical features are one-hot encoded and the mean correlation over all +resulting pairs is used as similarity.} + \item{\code{cluster_similarity_threshold}}{(\emph{optional}) The threshold level for +pair-wise similarity that is required to form clusters using \code{fixed_cut}. +This should be a numerical value between 0.0 and 1.0. Note, however, that a +reasonable threshold value depends strongly on the similarity metric. The +following are the default values used: +\itemize{ +\item \code{mcfadden_r2} and \code{mutual_information}: \code{0.30} +\item \code{cox_snell_r2} and \code{nagelkerke_r2}: \code{0.75} +\item \code{spearman}, \code{kendall} and \code{pearson}: \code{0.90} +} + +Alternatively, if the \code{fixed_cut} method is not used, this value determines +whether any clustering should be performed, because the data may not +contain highly similar features.
The default values in this situation are: +\itemize{ +\item \code{mcfadden_r2} and \code{mutual_information}: \code{0.25} +\item \code{cox_snell_r2} and \code{nagelkerke_r2}: \code{0.40} +\item \code{spearman}, \code{kendall} and \code{pearson}: \code{0.70} +} + +The threshold value is converted to a distance (1-similarity) prior to +cutting hierarchical trees.} + \item{\code{cluster_representation_method}}{(\emph{optional}) Method used to determine +how the information of co-clustered features is summarised and used to +represent the cluster. The following methods can be selected: +\itemize{ +\item \code{best_predictor} (default): The feature with the highest importance +according to univariate regression with the outcome is used to represent +the cluster. +\item \code{medioid}: The feature closest to the cluster center, i.e. the feature +that is most similar to the remaining features in the cluster, is used to +represent the cluster. +\item \code{mean}: A meta-feature is generated by averaging the feature values for +all features in a cluster. This method aligns all features so that all +features will be positively correlated prior to averaging. Should a cluster +contain one or more categorical features, the \code{medioid} method will be used +instead, as averaging is not possible. Note that if this method is chosen, +the \code{normalisation_method} parameter should be one of \code{standardisation}, +\code{standardisation_trim}, \code{standardisation_winsor} or \code{quantile}. +} + +If the \code{pam} cluster method is selected, only the \code{medioid} method can be +used. In that case one medoid is used by default.} + \item{\code{parallel_preprocessing}}{(\emph{optional}) Enable parallel processing for the +preprocessing workflow. Defaults to \code{TRUE}. When set to \code{FALSE}, this will +disable the use of parallel processing while preprocessing, regardless of +the settings of the \code{parallel} parameter. \code{parallel_preprocessing} is +ignored if \code{parallel=FALSE}.} + \item{\code{fs_method}}{(\strong{required}) Feature selection method to be used for +determining variable importance. \code{familiar} implements various feature +selection methods. Please refer to the vignette on feature selection +methods for more details. + +More than one feature selection method can be chosen. The experiment will +then be repeated for each feature selection method. + +Feature selection methods determine the ranking of features. Actual +selection of features is done by optimising the signature size model +hyperparameter during the hyperparameter optimisation step.} + \item{\code{fs_method_parameter}}{(\emph{optional}) List of lists containing parameters +for feature selection methods. Each sublist should have the name of the +feature selection method it corresponds to. + +Most feature selection methods do not have parameters that can be set. +Please refer to the vignette on feature selection methods for more details. +Note that if the feature selection method is based on a learner (e.g. lasso +regression), hyperparameter optimisation may be performed prior to +assessing variable importance.} + \item{\code{vimp_aggregation_method}}{(\emph{optional}) The method used to aggregate +variable importances over different data subsets, e.g. bootstraps. The +following methods can be selected: +\itemize{ +\item \code{none}: Don't aggregate ranks, but rather aggregate the variable +importance scores themselves.
+\item \code{mean}: Use the mean rank of a feature over the subsets to +determine the aggregated feature rank. +\item \code{median}: Use the median rank of a feature over the subsets to determine +the aggregated feature rank. +\item \code{best}: Use the best rank the feature obtained in any subset to determine +the aggregated feature rank. +\item \code{worst}: Use the worst rank the feature obtained in any subset to +determine the aggregated feature rank. +\item \code{stability}: Use the frequency of the feature being in the subset of +highly ranked features as a measure for the aggregated feature rank +(Meinshausen and Buehlmann, 2010). +\item \code{exponential}: Use a rank-weighted frequency of occurrence in the subset +of highly ranked features as a measure for the aggregated feature rank (Haury +et al., 2011). +\item \code{borda} (default): Use the borda count as a measure for the aggregated +feature rank (Wald et al., 2012). +\item \code{enhanced_borda}: Use an occurrence frequency-weighted borda count as a +measure for the aggregated feature rank (Wald et al., 2012). +\item \code{truncated_borda}: Use the borda count computed only on features within the +subset of highly ranked features. +\item \code{enhanced_truncated_borda}: Apply both the enhanced borda method and the +truncated borda method and use the resulting borda count as the aggregated +feature rank. +} + +The \emph{feature selection methods} vignette provides additional information.} + \item{\code{vimp_aggregation_rank_threshold}}{(\emph{optional}) The threshold used to +define the subset of highly important features. If not set, this threshold +is determined by maximising the variance in the occurrence value over all +features over the subset size. + +This parameter is only relevant for \code{stability}, \code{exponential}, +\code{enhanced_borda}, \code{truncated_borda} and \code{enhanced_truncated_borda} methods.} + \item{\code{parallel_feature_selection}}{(\emph{optional}) Enable parallel processing for +the feature selection workflow. Defaults to \code{TRUE}. When set to \code{FALSE}, +this will disable the use of parallel processing while performing feature +selection, regardless of the settings of the \code{parallel} parameter. +\code{parallel_feature_selection} is ignored if \code{parallel=FALSE}.} + \item{\code{novelty_detector}}{(\emph{optional}) Specify the algorithm used for training +a novelty detector. This detector can be used to identify +out-of-distribution data prospectively.} + \item{\code{detector_parameters}}{(\emph{optional}) List of lists containing hyperparameters +for novelty detectors. Currently not used.} + \item{\code{parallel_model_development}}{(\emph{optional}) Enable parallel processing for +the model development workflow. Defaults to \code{TRUE}. When set to \code{FALSE}, +this will disable the use of parallel processing while developing models, +regardless of the settings of the \code{parallel} parameter. +\code{parallel_model_development} is ignored if \code{parallel=FALSE}.} + \item{\code{optimisation_bootstraps}}{(\emph{optional}) Number of bootstraps that should +be generated from the development data set. During the optimisation +procedure one or more of these bootstraps (indicated by +\code{smbo_step_bootstraps}) are used for model development using different +combinations of hyperparameters. The effect of the hyperparameters is then +assessed by comparing in-bag and out-of-bag model performance. + +The default number of bootstraps is \code{50}.
Hyperparameter optimisation may +finish before exhausting the set of bootstraps.} + \item{\code{optimisation_determine_vimp}}{(\emph{optional}) Logical value that indicates +whether variable importance is determined separately for each of the +bootstraps created during the optimisation process (\code{TRUE}) or the +applicable results from the feature selection step are used (\code{FALSE}). + +Determining variable importance increases the initial computational +overhead. However, it prevents positive biases for the out-of-bag data due +to overlap of these data with the development data set used for the feature +selection step. In this case, any hyperparameters of the variable +importance method are not determined separately for each bootstrap, but +those obtained during the feature selection step are used instead. In case +multiple such hyperparameter sets could be applicable, the set that will +be used is randomly selected for each bootstrap. + +This parameter only affects hyperparameter optimisation of learners. The +default is \code{TRUE}.} + \item{\code{smbo_random_initialisation}}{(\emph{optional}) String indicating the +initialisation method for the hyperparameter space. Can be one of +\code{fixed_subsample} (default), \code{fixed}, or \code{random}. \code{fixed} and +\code{fixed_subsample} first create hyperparameter sets from a range of default +values set by familiar. \code{fixed_subsample} then randomly draws up to +\code{smbo_n_random_sets} hyperparameter sets from the grid. \code{random} does not rely upon a fixed +grid, and randomly draws up to \code{smbo_n_random_sets} hyperparameter sets +from the hyperparameter space.} + \item{\code{smbo_n_random_sets}}{(\emph{optional}) Number of random or subsampled +hyperparameter sets drawn during the initialisation process. Default: \code{100}. +Cannot be smaller than \code{10}. The parameter is not used when +\code{smbo_random_initialisation} is \code{fixed}, as the entire pre-defined grid +will be explored.} + \item{\code{max_smbo_iterations}}{(\emph{optional}) Maximum number of intensify +iterations of the SMBO algorithm. During an intensify iteration a run-off +occurs between the current \emph{best} hyperparameter combination and either the 10 +challenger combinations with the highest expected improvement or a set of 20 +random combinations. + +Run-off with random combinations is used to force exploration of the +hyperparameter space, and is performed every second intensify iteration, or +if there is no expected improvement for any challenger combination. + +If a combination of hyperparameters leads to better performance on the same +data than the incumbent \emph{best} set of hyperparameters, it replaces the +incumbent set at the end of the intensify iteration. + +The default number of intensify iterations is \code{20}. Iterations may be +stopped early if the incumbent set of hyperparameters remains the same for +\code{smbo_stop_convergent_iterations} iterations, or performance improvement is +minimal. This behaviour is suppressed during the first 4 iterations to +enable the algorithm to explore the hyperparameter space.} + \item{\code{smbo_stop_convergent_iterations}}{(\emph{optional}) The number of subsequent +convergent SMBO iterations required to stop hyperparameter optimisation +early. An iteration is convergent if the \emph{best} parameter set has not +changed or the optimisation score over the 4 most recent iterations has not +changed beyond the tolerance level in \code{smbo_stop_tolerance}.
+ +The default value is \code{3}.} + \item{\code{smbo_stop_tolerance}}{(\emph{optional}) Tolerance for early stopping due to +convergent optimisation score. + +The default value depends on the square root of the number of samples (at +the series level), and is \code{0.01} for 100 samples. This value is computed as +\code{0.1 * 1 / sqrt(n_samples)}. The lower limit is \code{0.0001}, reached at 1 million or more +samples.} + \item{\code{smbo_time_limit}}{(\emph{optional}) Time limit (in minutes) for the +optimisation process. Optimisation is stopped after this limit is exceeded. +Time taken to determine variable importance for the optimisation process +(see the \code{optimisation_determine_vimp} parameter) does not count towards this limit. + +The default is \code{NULL}, indicating that there is no time limit for the +optimisation process. The time limit cannot be less than 1 minute.} + \item{\code{smbo_initial_bootstraps}}{(\emph{optional}) The number of bootstraps taken +from the set of \code{optimisation_bootstraps} as the bootstraps assessed +initially. + +The default value is \code{1}. The value cannot be larger than +\code{optimisation_bootstraps}.} + \item{\code{smbo_step_bootstraps}}{(\emph{optional}) The number of bootstraps taken from +the set of \code{optimisation_bootstraps} bootstraps as the bootstraps assessed +during the steps of each intensify iteration. + +The default value is \code{3}. The value cannot be larger than +\code{optimisation_bootstraps}.} + \item{\code{smbo_intensify_steps}}{(\emph{optional}) The number of steps in each SMBO +intensify iteration. In each step, a new set of \code{smbo_step_bootstraps} +bootstraps is drawn and used in the run-off between the incumbent \emph{best} +hyperparameter combination and its challengers. + +The default value is \code{5}. Higher numbers allow for a more detailed +comparison, but this comes with added computational cost.} + \item{\code{optimisation_metric}}{(\emph{optional}) One or more metrics used to compute +performance scores. See the vignette on performance metrics for the +available metrics. + +If unset, the following metrics are used by default: +\itemize{ +\item \code{auc_roc}: For \code{binomial} and \code{multinomial} models. +\item \code{mse}: Mean squared error for \code{continuous} models. +\item \code{msle}: Mean squared logarithmic error for \code{count} models. +\item \code{concordance_index}: For \code{survival} models. +} + +Multiple optimisation metrics can be specified. Actual metric values are +converted to an objective value by comparison with a baseline metric value +that derives from a trivial model, i.e. majority class for binomial and +multinomial outcomes, the median outcome for count and continuous outcomes +and a fixed risk or time for survival outcomes.} + \item{\code{optimisation_function}}{(\emph{optional}) Type of optimisation function used +to quantify the performance of a hyperparameter set. Model performance is +assessed using the metric(s) specified by \code{optimisation_metric} on the +in-bag (IB) and out-of-bag (OOB) samples of a bootstrap. These values are +converted to objective scores with a standardised interval of \eqn{[-1.0, + 1.0]}. Each pair of objective scores is subsequently used to compute an +optimisation score. The optimisation score across different bootstraps is +then aggregated into a summary score. This summary score is used to rank +hyperparameter sets, and select the optimal set.
+ +The combination of optimisation score and summary score is determined by +the optimisation function indicated by this parameter: +\itemize{ +\item \code{validation} or \code{max_validation} (default): seeks to maximise OOB score. +\item \code{balanced}: seeks to balance IB and OOB score. +\item \code{stronger_balance}: similar to \code{balanced}, but with a stronger penalty for +differences between IB and OOB scores. +\item \code{validation_minus_sd}: seeks to optimise the average OOB score minus its +standard deviation. +\item \code{validation_25th_percentile}: seeks to optimise the 25th percentile of +OOB scores, and is conceptually similar to \code{validation_minus_sd}. +\item \code{model_estimate}: seeks to maximise the OOB score estimate predicted by +the hyperparameter learner (not available for random search). +\item \code{model_estimate_minus_sd}: seeks to maximise the OOB score estimate minus +its estimated standard deviation, as predicted by the hyperparameter +learner (not available for random search). +\item \code{model_balanced_estimate}: seeks to maximise the estimate of the balanced +IB and OOB score. This is similar to the \code{balanced} score, and in fact uses +a hyperparameter learner to predict said score (not available for random +search). +\item \code{model_balanced_estimate_minus_sd}: seeks to maximise the estimate of the +balanced IB and OOB score, minus its estimated standard deviation. This is +similar to the \code{balanced} score, but takes into account its estimated +spread. +} + +Additional details are provided in the \emph{Learning algorithms and +hyperparameter optimisation} vignette.} + \item{\code{hyperparameter_learner}}{(\emph{optional}) Any point in the hyperparameter +space has a single, scalar, optimisation score value that is \emph{a priori} +unknown. During the optimisation process, the algorithm samples from the +hyperparameter space by selecting hyperparameter sets and computing the +optimisation score value for one or more bootstraps. For each +hyperparameter set the resulting values are distributed around the actual +value. The learner indicated by \code{hyperparameter_learner} is then used to +infer optimisation score estimates for unsampled parts of the +hyperparameter space. + +The following models are available: +\itemize{ +\item \code{bayesian_additive_regression_trees} or \code{bart}: Uses Bayesian Additive +Regression Trees (Sparapani et al., 2021) for inference. Unlike standard +random forests, BART allows for estimating posterior distributions directly +and can extrapolate. +\item \code{gaussian_process} (default): Creates a localised approximate Gaussian +process for inference (Gramacy, 2016). This allows for better scaling than +deterministic Gaussian Processes. +\item \code{random_forest}: Creates a random forest for inference. Originally +suggested by Hutter et al. (2011). A weakness of random forests is their +lack of extrapolation beyond observed values, which limits their usefulness +in exploiting promising areas of hyperparameter space. +\item \code{random} or \code{random_search}: Forgoes the use of models to steer +optimisation. Instead, a random search is performed. +}} + \item{\code{acquisition_function}}{(\emph{optional}) The acquisition function influences +how new hyperparameter sets are selected.
The algorithm uses the model +learned by the learner indicated by \code{hyperparameter_learner} to search the +hyperparameter space for hyperparameter sets that are either likely better +than the best known set (\emph{exploitation}) or where there is considerable +uncertainty (\emph{exploration}). The acquisition function quantifies this +(Shahriari et al., 2016). + +The following acquisition functions are available, and are described in +more detail in the \emph{learner algorithms} vignette: +\itemize{ +\item \code{improvement_probability}: The probability of improvement quantifies the +probability that the expected optimisation score for a set is better than +the best observed optimisation score +\item \code{improvement_empirical_probability}: Similar to +\code{improvement_probability}, but based directly on optimisation scores +predicted by the individual decision trees. +\item \code{expected_improvement} (default): Computes expected improvement. +\item \code{upper_confidence_bound}: This acquisition function is based on the upper +confidence bound of the distribution (Srinivas et al., 2012). +\item \code{bayes_upper_confidence_bound}: This acquisition function is based on the +upper confidence bound of the distribution (Kaufmann et al., 2012). +}} + \item{\code{exploration_method}}{(\emph{optional}) Method used to steer exploration in +post-initialisation intensive searching steps. As stated earlier, each SMBO +iteration step compares suggested alternative parameter sets with an +incumbent \strong{best} set in a series of steps. The exploration method +controls how the set of alternative parameter sets is pruned after each +step in an iteration. Can be one of the following: +\itemize{ +\item \code{single_shot} (default): The set of alternative parameter sets is not +pruned, and each intensification iteration contains only a single +intensification step that only uses a single bootstrap. This is the fastest +exploration method, but only superficially tests each parameter set. +\item \code{successive_halving}: The set of alternative parameter sets is +pruned by removing the worst performing half of the sets after each step +(Jamieson and Talwalkar, 2016). +\item \code{stochastic_reject}: The set of alternative parameter sets is pruned by +comparing the performance of each parameter set with that of the incumbent +\strong{best} parameter set using a paired Wilcoxon test based on shared +bootstraps. Parameter sets that perform significantly worse, at an alpha +level indicated by \code{smbo_stochastic_reject_p_value}, are pruned. +\item \code{none}: The set of alternative parameter sets is not pruned. +}} + \item{\code{smbo_stochastic_reject_p_value}}{(\emph{optional}) The p-value threshold used +for the \code{stochastic_reject} exploration method. + +The default value is \code{0.05}.} + \item{\code{parallel_hyperparameter_optimisation}}{(\emph{optional}) Enable parallel +processing for hyperparameter optimisation. Defaults to \code{TRUE}. When set to +\code{FALSE}, this will disable the use of parallel processing while performing +optimisation, regardless of the settings of the \code{parallel} parameter. The +parameter moreover specifies whether parallelisation takes place within the +optimisation algorithm (\code{inner}, default), or in an outer loop ( \code{outer}) +over learners, data subsamples, etc. + +\code{parallel_hyperparameter_optimisation} is ignored if \code{parallel=FALSE}.} + }} +} +\value{ +One or more familiarModel objects. +} +\description{ +Train models using familiar. 
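As a minimal, illustrative sketch of how the arguments documented above combine (the data frame \code{df}, its column names, and the \code{mrmr} feature selection method are assumptions for illustration, not part of this documentation):

# Hypothetical data: 'df' is a data.frame with sample identifiers in 'id',
# cohort labels in 'cohort', a binary outcome in 'status' and feature columns.
models <- familiar::train_familiar(
  data = df,
  experimental_design = "bs(fs+mb,20)",  # 20 bootstraps of feature selection + model building
  fs_method = "mrmr",                    # assumed available; see the feature selection vignette
  learner = "glm_logistic",
  hyperparameter = list("glm_logistic" = list("sign_size" = 3)),
  sample_id_column = "id",
  batch_id_column = "cohort",
  outcome_column = "status",
  outcome_type = "binomial",
  parallel = FALSE,
  verbose = TRUE
)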
Evaluation is not performed. +} +\details{ +This is a thin wrapper around \code{summon_familiar}, and functions like +it, but automatically skips all evaluation steps. Only a single learner is +allowed. +} diff --git a/man/update_model_dir_path-methods.Rd b/man/update_model_dir_path-methods.Rd new file mode 100644 index 00000000..cf758733 --- /dev/null +++ b/man/update_model_dir_path-methods.Rd @@ -0,0 +1,37 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FamiliarEnsemble.R +\name{update_model_dir_path} +\alias{update_model_dir_path} +\alias{update_model_dir_path,familiarEnsemble-method} +\alias{update_model_dir_path,ANY-method} +\title{Updates model directory path for ensemble objects.} +\usage{ +update_model_dir_path(object, dir_path, ...) + +\S4method{update_model_dir_path}{familiarEnsemble}(object, dir_path) + +\S4method{update_model_dir_path}{ANY}(object, dir_path) +} +\arguments{ +\item{object}{A \code{familiarEnsemble} object, or one or more \code{familiarModel} +objects that will be internally converted to a \code{familiarEnsemble} object. +Paths to such objects can also be provided.} + +\item{dir_path}{Path to the directory where models are stored.} + +\item{...}{Unused arguments.} +} +\value{ +A \code{familiarEnsemble} object. +} +\description{ +Updates the model directory path of a \code{familiarEnsemble} object. +} +\details{ +Ensemble models created by familiar are often written to a directory +on a local drive or network. In such cases, the actual models are detached, +and paths to the models are stored instead. When the models are moved from +their original location, they can no longer be found and attached to the +ensemble. This method allows for pointing to the new directory containing +the models. +} diff --git a/man/update_object-methods.Rd b/man/update_object-methods.Rd new file mode 100644 index 00000000..1ee0b0f2 --- /dev/null +++ b/man/update_object-methods.Rd @@ -0,0 +1,53 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FamiliarObjectUpdate.R +\name{update_object} +\alias{update_object} +\alias{update_object,familiarModel-method} +\alias{update_object,familiarEnsemble-method} +\alias{update_object,familiarData-method} +\alias{update_object,familiarCollection-method} +\alias{update_object,vimpTable-method} +\alias{update_object,familiarNoveltyDetector-method} +\alias{update_object,featureInfo-method} +\alias{update_object,experimentData-method} +\alias{update_object,list-method} +\alias{update_object,ANY-method} +\title{Update familiar S4 objects to the most recent version.} +\usage{ +update_object(object, ...) + +\S4method{update_object}{familiarModel}(object, ...) + +\S4method{update_object}{familiarEnsemble}(object, ...) + +\S4method{update_object}{familiarData}(object, ...) + +\S4method{update_object}{familiarCollection}(object, ...) + +\S4method{update_object}{vimpTable}(object, ...) + +\S4method{update_object}{familiarNoveltyDetector}(object, ...) + +\S4method{update_object}{featureInfo}(object, ...) + +\S4method{update_object}{experimentData}(object, ...) + +\S4method{update_object}{list}(object, ...) + +\S4method{update_object}{ANY}(object, ...) +} +\arguments{ +\item{object}{A \code{familiarModel}, a \code{familiarEnsemble}, a \code{familiarData} or +\code{familiarCollection} object.} + +\item{...}{Unused arguments.} +} +\value{ +An up-to-date version of the respective S4 object. +} +\description{ +Provides backward compatibility for familiar objects exported to +a file. 
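A minimal sketch of the intended use (the file path is hypothetical; if \code{update_object} is not exported by the installed version, it can instead be reached as \code{familiar:::update_object}):

library(familiar)
# Read a familiarModel that was saved by an older version of familiar and
# bring it up to date with the currently installed version.
old_model <- readRDS("old_project/familiarModel_1.RDS")
updated_model <- update_object(old_model)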
This mitigates compatibility issues when working with files that +become outdated as new versions of familiar are released, e.g. because slots +have been removed. +} diff --git a/man/vcov-methods.Rd b/man/vcov-methods.Rd new file mode 100644 index 00000000..feddafbd --- /dev/null +++ b/man/vcov-methods.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FamiliarModel.R +\name{vcov} +\alias{vcov} +\alias{vcov,familiarModel-method} +\title{Calculate variance-covariance matrix for a model} +\usage{ +vcov(object, ...) + +\S4method{vcov}{familiarModel}(object, ...) +} +\arguments{ +\item{object}{A \code{familiarModel} object.} + +\item{...}{additional arguments passed to \code{vcov} methods for the underlying +model, when available.} +} +\value{ +Variance-covariance matrix of the model in the familiarModel object, +if any. +} +\description{ +Calculate variance-covariance matrix for a model +} +\details{ +This method extends the \code{vcov} S3 method. For some models \code{vcov} +requires information that is trimmed from the model. In this case a copy of +the variance-covariance matrix is stored with the model, and returned. +} diff --git a/man/vimpTable-class.Rd b/man/vimpTable-class.Rd new file mode 100644 index 00000000..44abbbaa --- /dev/null +++ b/man/vimpTable-class.Rd @@ -0,0 +1,80 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/FamiliarS4Classes.R +\docType{class} +\name{vimpTable-class} +\alias{vimpTable-class} +\title{Variable importance table} +\description{ +A vimpTable object contains information concerning variable importance of one +or more features. These objects are created during feature selection. +} +\details{ +vimpTable objects exist in various states. These states are +generally incremental, i.e. one cannot turn a declustered table into the +initial version. Some methods such as aggregation internally do some state +reshuffling. + +This object replaces the ad hoc lists of information that were used in +versions prior to familiar 1.2.0. +} +\section{Slots}{ + +\describe{ +\item{\code{vimp_table}}{Table containing features with corresponding scores.} + +\item{\code{vimp_method}}{Method used to compute variable importance scores for each +feature.} + +\item{\code{run_table}}{Run table for the data used to compute variable importances. +Used internally.} + +\item{\code{score_aggregation}}{Method used to aggregate the score of contrasts for +each categorical feature, if any.} + +\item{\code{encoding_table}}{Table used to relate categorical features to their +contrasts, if any. Not used for all variable importance methods.} + +\item{\code{cluster_table}}{Table used to relate original features with features +after clustering. Variable importance is determined after feature +processing, which includes clustering.} + +\item{\code{invert}}{Determines whether an increasing score corresponds to an increasing +(\code{FALSE}) or a decreasing (\code{TRUE}) rank. Used internally to determine how +ranks should be formed.} + +\item{\code{project_id}}{Identifier of the project that generated the vimpTable +object.} + +\item{\code{familiar_version}}{Version of the familiar package used to create this +table.} + +\item{\code{state}}{State of the variable importance table. The object can have the +following states: +\itemize{ +\item \code{initial}: initial state, directly after the variable importance table is +filled.
+\item \code{decoded}: depending on the variable importance method, the initial +variable importance table may contain the scores of individual contrasts +for categorical variables. When decoded, data in the \code{encoding_table} +attribute has been used to aggregate scores from all contrasts into a +single score for each feature. +\item \code{declustered}: variable importance is determined from fully processed +features, which includes clustering. This means that a single feature in +the variable importance table may represent multiple original features. +When a variable importance table has been declustered, all clusters have +been turned into their constituent features. +\item \code{reclustered}: When the table is reclustered, features are replaced by +their respective clusters. This is used when updating the cluster +table to ensure that it fits the local context. This prevents issues when +attempting to aggregate or apply variable importance tables in data with +different feature preprocessing, and as a result, different clusters. +\item \code{ranked}: The scores have been used to create ranks, with lower ranks +indicating better features. +\item \code{aggregated}: Scores and ranks from multiple variable importance tables +were aggregated. +}} +}} + +\seealso{ +\code{\link{get_vimp_table}}, \code{\link{aggregate_vimp_table}} +} diff --git a/man/waiver.Rd b/man/waiver.Rd new file mode 100644 index 00000000..8dc9f509 --- /dev/null +++ b/man/waiver.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Utilities.R +\name{waiver} +\alias{waiver} +\title{Create a waiver object} +\usage{ +waiver() +} +\value{ +A waiver object. +} +\description{ +This function is functionally identical to the \code{ggplot2::waiver()} function and +creates a waiver object. A waiver object is an otherwise empty object that +serves the same purpose as \code{NULL}, i.e. as a placeholder for a default value. +Because \code{NULL} can sometimes be a valid input argument, it cannot +be used to switch to an internal default value. +} diff --git a/tests/testthat/test-configuration_file.R b/tests/testthat/test-configuration_file.R index 8a35bf23..d2230c89 100644 --- a/tests/testthat/test-configuration_file.R +++ b/tests/testthat/test-configuration_file.R @@ -1,3 +1,5 @@ +testthat::skip_if_not_installed("xml2") + # Find path to configuration file in package. config <- system.file("config.xml", package="familiar") diff --git a/tests/testthat/test-sample_weights.R b/tests/testthat/test-sample_weights.R index e510ab09..57758418 100644 --- a/tests/testthat/test-sample_weights.R +++ b/tests/testthat/test-sample_weights.R @@ -1,3 +1,5 @@ +if (!familiar:::test_data_package_installed("binomial")) testthat::skip() + # Create test dataset. data <- familiar:::test.create_good_data_set(outcome_type="binomial")
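To make the role of \code{waiver()} documented above concrete, a self-contained sketch of the same pattern follows; the \code{waiver_like} and \code{make_plot} functions are purely illustrative and not part of the familiar API.

# A waiver-like placeholder: an empty object whose class marks "use the
# internal default", leaving NULL free to be a meaningful user-supplied value.
waiver_like <- function() structure(list(), class = "waiver")

make_plot <- function(breaks = waiver_like()) {
  if (inherits(breaks, "waiver")) {
    "computed default breaks"   # argument not supplied: fall back to internal default
  } else if (is.null(breaks)) {
    "no breaks at all"          # NULL is a valid, deliberate user choice
  } else {
    breaks                      # user-supplied value is used as-is
  }
}

make_plot()          # "computed default breaks"
make_plot(NULL)      # "no breaks at all"
make_plot(c(1, 2))   # 1 2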