diff --git a/.Rbuildignore b/.Rbuildignore index 3bbb482..f3e5c66 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -7,3 +7,4 @@ ^README\.Rmd$ ^README\.md$ ^\.travis\.yml$ +^docs$ \ No newline at end of file diff --git a/.gitignore b/.gitignore index 663b1cb..a684e5f 100644 --- a/.gitignore +++ b/.gitignore @@ -8,4 +8,6 @@ alookr.Rproj .DS_Store doc Meta -_pkgdown.yml \ No newline at end of file +_pkgdown.yml +/doc/ +/Meta/ diff --git a/DESCRIPTION b/DESCRIPTION index bb621f2..5a33fd4 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: alookr Type: Package Title: Model Classifier for Binary Classification -Version: 0.3.8.9000 +Version: 0.3.9 Authors@R: c( person("Choonghyun", "Ryu",, "choonghyun.ryu@gmail.com", role = c("aut", "cre")) ) @@ -46,8 +46,8 @@ Suggests: Author: Choonghyun Ryu [aut, cre] Maintainer: Choonghyun Ryu BugReports: https://github.com/choonghyunryu/alookr/issues -License: GPL-2 | file LICENSE +License: GPL-2 Encoding: UTF-8 VignetteBuilder: knitr -RoxygenNote: 7.1.2 +RoxygenNote: 7.2.3 Language: en-US diff --git a/LICENSE b/LICENSE deleted file mode 100644 index d159169..0000000 --- a/LICENSE +++ /dev/null @@ -1,339 +0,0 @@ - GNU GENERAL PUBLIC LICENSE - Version 2, June 1991 - - Copyright (C) 1989, 1991 Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - Everyone is permitted to copy and distribute verbatim copies - of this license document, but changing it is not allowed. - - Preamble - - The licenses for most software are designed to take away your -freedom to share and change it. By contrast, the GNU General Public -License is intended to guarantee your freedom to share and change free -software--to make sure the software is free for all its users. This -General Public License applies to most of the Free Software -Foundation's software and to any other program whose authors commit to -using it. 
(Some other Free Software Foundation software is covered by -the GNU Lesser General Public License instead.) You can apply it to -your programs, too. - - When we speak of free software, we are referring to freedom, not -price. Our General Public Licenses are designed to make sure that you -have the freedom to distribute copies of free software (and charge for -this service if you wish), that you receive source code or can get it -if you want it, that you can change the software or use pieces of it -in new free programs; and that you know you can do these things. - - To protect your rights, we need to make restrictions that forbid -anyone to deny you these rights or to ask you to surrender the rights. -These restrictions translate to certain responsibilities for you if you -distribute copies of the software, or if you modify it. - - For example, if you distribute copies of such a program, whether -gratis or for a fee, you must give the recipients all the rights that -you have. You must make sure that they, too, receive or can get the -source code. And you must show them these terms so they know their -rights. - - We protect your rights with two steps: (1) copyright the software, and -(2) offer you this license which gives you legal permission to copy, -distribute and/or modify the software. - - Also, for each author's protection and ours, we want to make certain -that everyone understands that there is no warranty for this free -software. If the software is modified by someone else and passed on, we -want its recipients to know that what they have is not the original, so -that any problems introduced by others will not reflect on the original -authors' reputations. - - Finally, any free program is threatened constantly by software -patents. We wish to avoid the danger that redistributors of a free -program will individually obtain patent licenses, in effect making the -program proprietary. 
To prevent this, we have made it clear that any -patent must be licensed for everyone's free use or not licensed at all. - - The precise terms and conditions for copying, distribution and -modification follow. - - GNU GENERAL PUBLIC LICENSE - TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION - - 0. This License applies to any program or other work which contains -a notice placed by the copyright holder saying it may be distributed -under the terms of this General Public License. The "Program", below, -refers to any such program or work, and a "work based on the Program" -means either the Program or any derivative work under copyright law: -that is to say, a work containing the Program or a portion of it, -either verbatim or with modifications and/or translated into another -language. (Hereinafter, translation is included without limitation in -the term "modification".) Each licensee is addressed as "you". - -Activities other than copying, distribution and modification are not -covered by this License; they are outside its scope. The act of -running the Program is not restricted, and the output from the Program -is covered only if its contents constitute a work based on the -Program (independent of having been made by running the Program). -Whether that is true depends on what the Program does. - - 1. You may copy and distribute verbatim copies of the Program's -source code as you receive it, in any medium, provided that you -conspicuously and appropriately publish on each copy an appropriate -copyright notice and disclaimer of warranty; keep intact all the -notices that refer to this License and to the absence of any warranty; -and give any other recipients of the Program a copy of this License -along with the Program. - -You may charge a fee for the physical act of transferring a copy, and -you may at your option offer warranty protection in exchange for a fee. - - 2. 
You may modify your copy or copies of the Program or any portion -of it, thus forming a work based on the Program, and copy and -distribute such modifications or work under the terms of Section 1 -above, provided that you also meet all of these conditions: - - a) You must cause the modified files to carry prominent notices - stating that you changed the files and the date of any change. - - b) You must cause any work that you distribute or publish, that in - whole or in part contains or is derived from the Program or any - part thereof, to be licensed as a whole at no charge to all third - parties under the terms of this License. - - c) If the modified program normally reads commands interactively - when run, you must cause it, when started running for such - interactive use in the most ordinary way, to print or display an - announcement including an appropriate copyright notice and a - notice that there is no warranty (or else, saying that you provide - a warranty) and that users may redistribute the program under - these conditions, and telling the user how to view a copy of this - License. (Exception: if the Program itself is interactive but - does not normally print such an announcement, your work based on - the Program is not required to print an announcement.) - -These requirements apply to the modified work as a whole. If -identifiable sections of that work are not derived from the Program, -and can be reasonably considered independent and separate works in -themselves, then this License, and its terms, do not apply to those -sections when you distribute them as separate works. But when you -distribute the same sections as part of a whole which is a work based -on the Program, the distribution of the whole must be on the terms of -this License, whose permissions for other licensees extend to the -entire whole, and thus to each and every part regardless of who wrote it. 
- -Thus, it is not the intent of this section to claim rights or contest -your rights to work written entirely by you; rather, the intent is to -exercise the right to control the distribution of derivative or -collective works based on the Program. - -In addition, mere aggregation of another work not based on the Program -with the Program (or with a work based on the Program) on a volume of -a storage or distribution medium does not bring the other work under -the scope of this License. - - 3. You may copy and distribute the Program (or a work based on it, -under Section 2) in object code or executable form under the terms of -Sections 1 and 2 above provided that you also do one of the following: - - a) Accompany it with the complete corresponding machine-readable - source code, which must be distributed under the terms of Sections - 1 and 2 above on a medium customarily used for software interchange; or, - - b) Accompany it with a written offer, valid for at least three - years, to give any third party, for a charge no more than your - cost of physically performing source distribution, a complete - machine-readable copy of the corresponding source code, to be - distributed under the terms of Sections 1 and 2 above on a medium - customarily used for software interchange; or, - - c) Accompany it with the information you received as to the offer - to distribute corresponding source code. (This alternative is - allowed only for noncommercial distribution and only if you - received the program in object code or executable form with such - an offer, in accord with Subsection b above.) - -The source code for a work means the preferred form of the work for -making modifications to it. For an executable work, complete source -code means all the source code for all modules it contains, plus any -associated interface definition files, plus the scripts used to -control compilation and installation of the executable. 
However, as a -special exception, the source code distributed need not include -anything that is normally distributed (in either source or binary -form) with the major components (compiler, kernel, and so on) of the -operating system on which the executable runs, unless that component -itself accompanies the executable. - -If distribution of executable or object code is made by offering -access to copy from a designated place, then offering equivalent -access to copy the source code from the same place counts as -distribution of the source code, even though third parties are not -compelled to copy the source along with the object code. - - 4. You may not copy, modify, sublicense, or distribute the Program -except as expressly provided under this License. Any attempt -otherwise to copy, modify, sublicense or distribute the Program is -void, and will automatically terminate your rights under this License. -However, parties who have received copies, or rights, from you under -this License will not have their licenses terminated so long as such -parties remain in full compliance. - - 5. You are not required to accept this License, since you have not -signed it. However, nothing else grants you permission to modify or -distribute the Program or its derivative works. These actions are -prohibited by law if you do not accept this License. Therefore, by -modifying or distributing the Program (or any work based on the -Program), you indicate your acceptance of this License to do so, and -all its terms and conditions for copying, distributing or modifying -the Program or works based on it. - - 6. Each time you redistribute the Program (or any work based on the -Program), the recipient automatically receives a license from the -original licensor to copy, distribute or modify the Program subject to -these terms and conditions. You may not impose any further -restrictions on the recipients' exercise of the rights granted herein. 
-You are not responsible for enforcing compliance by third parties to -this License. - - 7. If, as a consequence of a court judgment or allegation of patent -infringement or for any other reason (not limited to patent issues), -conditions are imposed on you (whether by court order, agreement or -otherwise) that contradict the conditions of this License, they do not -excuse you from the conditions of this License. If you cannot -distribute so as to satisfy simultaneously your obligations under this -License and any other pertinent obligations, then as a consequence you -may not distribute the Program at all. For example, if a patent -license would not permit royalty-free redistribution of the Program by -all those who receive copies directly or indirectly through you, then -the only way you could satisfy both it and this License would be to -refrain entirely from distribution of the Program. - -If any portion of this section is held invalid or unenforceable under -any particular circumstance, the balance of the section is intended to -apply and the section as a whole is intended to apply in other -circumstances. - -It is not the purpose of this section to induce you to infringe any -patents or other property right claims or to contest validity of any -such claims; this section has the sole purpose of protecting the -integrity of the free software distribution system, which is -implemented by public license practices. Many people have made -generous contributions to the wide range of software distributed -through that system in reliance on consistent application of that -system; it is up to the author/donor to decide if he or she is willing -to distribute software through any other system and a licensee cannot -impose that choice. - -This section is intended to make thoroughly clear what is believed to -be a consequence of the rest of this License. - - 8. 
If the distribution and/or use of the Program is restricted in -certain countries either by patents or by copyrighted interfaces, the -original copyright holder who places the Program under this License -may add an explicit geographical distribution limitation excluding -those countries, so that distribution is permitted only in or among -countries not thus excluded. In such case, this License incorporates -the limitation as if written in the body of this License. - - 9. The Free Software Foundation may publish revised and/or new versions -of the General Public License from time to time. Such new versions will -be similar in spirit to the present version, but may differ in detail to -address new problems or concerns. - -Each version is given a distinguishing version number. If the Program -specifies a version number of this License which applies to it and "any -later version", you have the option of following the terms and conditions -either of that version or of any later version published by the Free -Software Foundation. If the Program does not specify a version number of -this License, you may choose any version ever published by the Free Software -Foundation. - - 10. If you wish to incorporate parts of the Program into other free -programs whose distribution conditions are different, write to the author -to ask for permission. For software which is copyrighted by the Free -Software Foundation, write to the Free Software Foundation; we sometimes -make exceptions for this. Our decision will be guided by the two goals -of preserving the free status of all derivatives of our free software and -of promoting the sharing and reuse of software generally. - - NO WARRANTY - - 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY -FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 
EXCEPT WHEN -OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES -PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED -OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF -MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS -TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE -PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, -REPAIR OR CORRECTION. - - 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING -WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR -REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, -INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING -OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED -TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY -YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER -PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE -POSSIBILITY OF SUCH DAMAGES. - - END OF TERMS AND CONDITIONS - - How to Apply These Terms to Your New Programs - - If you develop a new program, and you want it to be of the greatest -possible use to the public, the best way to achieve this is to make it -free software which everyone can redistribute and change under these terms. - - To do so, attach the following notices to the program. It is safest -to attach them to the start of each source file to most effectively -convey the exclusion of warranty; and each file should have at least -the "copyright" line and a pointer to where the full notice is found. - - - Copyright (C) - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. 
- - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - -Also add information on how to contact you by electronic and paper mail. - -If the program is interactive, make it output a short notice like this -when it starts in an interactive mode: - - Gnomovision version 69, Copyright (C) year name of author - Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. - This is free software, and you are welcome to redistribute it - under certain conditions; type `show c' for details. - -The hypothetical commands `show w' and `show c' should show the appropriate -parts of the General Public License. Of course, the commands you use may -be called something other than `show w' and `show c'; they could even be -mouse-clicks or menu items--whatever suits your program. - -You should also get your employer (if you work as a programmer) or your -school, if any, to sign a "copyright disclaimer" for the program, if -necessary. Here is a sample; alter the names: - - Yoyodyne, Inc., hereby disclaims all copyright interest in the program - `Gnomovision' (which makes passes at compilers) written by James Hacker. - - , 1 April 1989 - Ty Coon, President of Vice - -This General Public License does not permit incorporating your program into -proprietary programs. If your program is a subroutine library, you may -consider it more useful to permit linking proprietary applications with the -library. If this is what you want to do, use the GNU Lesser General -Public License instead of this License. 
diff --git a/NEWS b/NEWS deleted file mode 100644 index 32bfecf..0000000 --- a/NEWS +++ /dev/null @@ -1,104 +0,0 @@ -CHANGES IN R VERSION 0.3.8: - - BUG FIXES: - - * Fix error in treatment_corr() that is "All columns in a tibble must be - vectors." error. (#6, thanks to Cathy Tomson) - - - -CHANGES IN R VERSION 0.3.7: - - MAJOR CHANGES: - - * Removed plan(multiprocess) from logic for parallel processing. Because, - plan(multiprocess) of future is deprecated. (#2, thanks to Henrik Bengtsson) - - MINOR CHANGES: - - * Remove the waring of "UNRELIABLE VALUE" with seed = TRUE in future function. - - BUG FIXES: - - * Fix error in run_performance() that is "replacement has length zero" error. - (#5, thanks to Muhammad Fawad) - - - -CHANGES IN R VERSION 0.3.6: - - MINOR CHANGES: - - * Implemented a function to replace the unbalanced package used - in the process of performing split data. This is because unbalanced - packages have been removed from CRAN. (#3) - - - -CHANGES IN R VERSION 0.3.5: - - BUG FIXES: - - * Fix error in glmnet when run_predict() is performed with - test data that has more variables than train data. - - - -CHANGES IN R VERSION 0.3.4: - - MAJOR CHANGES: - - * add xgboosting methodlogy for binary classifier. - - * add lasso regression model for binary classifier. - - - -CHANGES IN R VERSION 0.3.3: - - BUG FIXES: - - * run_predict() fixed error when try to predict on dataset without - the response variable (thanks @shivakhanal, #1). - - MINOR CHANGES: - - * run_models(), run_predict(), run_performance() not support - future::multiprocess when running R from RStudio. - - - -CHANGES IN R VERSION 0.3.2: - - BUG FIXES: - - * Fixed explanation errors in `Classification Modeling` vignettes - for debian linux. - - MINOR CHANGES: - - * Renamed compare_category() to compare_target_category(). - This is because it overlaps the function name of the dlookr package. - - * Renamed compare_numeric() to compare_target_numeric(). 
- This is because it overlaps the function name of the dlookr package. - - * compare_target_category() modified from is.tibble(), - as.tibble() to is_tibble(), as_tibble(). - - * compare_diag() modified from is.tibble(), as.tibble() to - is_tibble(), as_tibble(). - - * sampling_target() modified from as.tbl() to tibble::as_tibble(). - - - -CHANGES IN R VERSION 0.3.1: - - BUG FIXES: - - * Fixed explanation errors in `Cleansing the dataset` vignettes. - - * Fixed explanation errors in `Classification Modeling` vignettes. - - * Modified explanation errors in `Splitting the dataset` vignettes. \ No newline at end of file diff --git a/NEWS.md b/NEWS.md new file mode 100644 index 0000000..cd0f9c6 --- /dev/null +++ b/NEWS.md @@ -0,0 +1,101 @@ +# alookr 0.3.9 + +## MINOR CHANGES + +* Fix error in treatment_corr() that is "All columns in a tibble must be vectors." error. + - (#6, thanks to Cathy Tomson) + + + +# alookr 0.3.8 + +## BUG FIXES + +* Fix error in treatment_corr() that is "All columns in a tibble must be vectors." error. + - (#6, thanks to Cathy Tomson) + + + +# alookr 0.3.7 + +## MAJOR CHANGES + +* Removed plan(multiprocess) from logic for parallel processing. + - Because, plan(multiprocess) of future is deprecated. (#2, thanks to Henrik Bengtsson) + +## MINOR CHANGES + +* Remove the warning of "UNRELIABLE VALUE" with seed = TRUE in future function. + +## BUG FIXES + +* Fix error in run_performance() that is "replacement has length zero" error. + - (#5, thanks to Muhammad Fawad) + + + +# alookr 0.3.6 + +## MINOR CHANGES + +* Implemented a function to replace the unbalanced package used in the process of performing split data. + - This is because unbalanced packages have been removed from CRAN. (#3) + + + +# alookr 0.3.5 + +## BUG FIXES + +* Fix error in glmnet when run_predict() is performed with test data that has more variables than train data. + + + +# alookr 0.3.4 + +## MAJOR CHANGES + +* add xgboosting methodology for binary classifier.
+* add lasso regression model for binary classifier. + + + +# alookr 0.3.3 + +## BUG FIXES + +* run_predict() fixed error when try to predict on dataset without the response variable + - (thanks @shivakhanal, #1). + +## MINOR CHANGES + +* run_models(), run_predict(), run_performance() not support future::multiprocess when running R from RStudio. + + + +# alookr 0.3.2 + +## BUG FIXES + +* Fixed explanation errors in `Classification Modeling` vignettes for debian linux. + +## MINOR CHANGES + +* Renamed compare_category() to compare_target_category(). + - This is because it overlaps the function name of the dlookr package. +* Renamed compare_numeric() to compare_target_numeric(). + - This is because it overlaps the function name of the dlookr package. +* compare_target_category() modified from is.tibble(), as.tibble() to is_tibble(), as_tibble(). +* compare_diag() modified from is.tibble(), as.tibble() to is_tibble(), as_tibble(). +* sampling_target() modified from as.tbl() to tibble::as_tibble(). + + + +# alookr 0.3.1 + +## BUG FIXES + +* Fixed explanation errors in `Cleansing the dataset` vignettes. +* Fixed explanation errors in `Classification Modeling` vignettes. +* Modified explanation errors in `Splitting the dataset` vignettes. + \ No newline at end of file diff --git a/R/evaluate.R b/R/evaluate.R index f667886..6e3e773 100644 --- a/R/evaluate.R +++ b/R/evaluate.R @@ -110,7 +110,6 @@ get_cross <- function(predicted, y, positive) { #' In this case, the speed of visualization can be slow. 
#' #' @examples -#' \donttest{ #' library(ggplot2) #' library(rpart) #' data(kyphosis) @@ -130,7 +129,6 @@ get_cross <- function(predicted, y, positive) { #' plot_cutoff(pred, kyphosis$Kyphosis, "present", type = "prob", measure = "mcc") #' plot_cutoff(pred, kyphosis$Kyphosis, "present", type = "prob", measure = "cross") #' plot_cutoff(pred, kyphosis$Kyphosis, "present", type = "prob", measure = "half") -#' } #' #' @import dplyr #' @import ggplot2 @@ -286,7 +284,6 @@ plot_cutoff <- function(predicted, y, positive, type = c("mcc", "density", "prob #' } #' #' @examples -#' \donttest{ #' library(dplyr) #' #' # Divide the train data set and the test data set. @@ -326,7 +323,6 @@ plot_cutoff <- function(predicted, y, positive, type = c("mcc", "density", "prob #' # Calculate Confusion Matrix by cutoff = 0.55. #' performance_metric(attr(pred$predicted[[1]], "pred_prob"), test$Kyphosis, #' "present", "ConfusionMatrix", cutoff = 0.55) -#' } #' #' @importFrom stats density #' @export diff --git a/R/modeling.R b/R/modeling.R index 840728d..0aecd2d 100644 --- a/R/modeling.R +++ b/R/modeling.R @@ -156,10 +156,7 @@ classifier_dispatch <- function(model = c("logistic", "rpart", "ctree", #' # Run the several kinds model fitting by dplyr #' train %>% #' run_models(target = "Kyphosis", positive = "present") -#' -#' # Run the logistic model fitting by dplyr -#' train %>% -#' run_models(target = "Kyphosis", positive = "present", models = "logistic") +#' #' @importFrom stats density #' @importFrom future plan #' @importFrom parallelly supportsMulticore @@ -313,14 +310,10 @@ predictor <- function(model, .data, target, positive, negative, is_factor, #' result <- run_models(.data = train, target = "Kyphosis", positive = "present") #' result #' -#' # Predict the model. 
-#' pred <- run_predict(result, test) -#' pred -#' #' # Run the several kinds model predict by dplyr #' result %>% #' run_predict(test) -#' +#' #' @importFrom stats density #' @importFrom future plan #' @importFrom parallelly supportsMulticore diff --git a/inst/doc/cleansing.html b/inst/doc/cleansing.html index cffbe93..5116ec3 100644 --- a/inst/doc/cleansing.html +++ b/inst/doc/cleansing.html @@ -12,27 +12,220 @@ - + Cleansing the dataset - + - - - - - - - +code{white-space: pre-wrap;} +span.smallcaps{font-variant: small-caps;} +span.underline{text-decoration: underline;} +div.column{display: inline-block; vertical-align: top; width: 50%;} +div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;} +ul.task-list{list-style: none;} + + + + + + + + + @@ -46,20 +239,25 @@

Cleansing the dataset

Choonghyun Ryu

-

2021-11-30

+

2024-02-10

Preface

-

If you created a dataset to create a classification model, you must perform cleansing of the data. After you create the dataset, you should do the following:

+

If you created a dataset to create a classification model, you must +perform cleansing of the data. After you create the dataset, you should +do the following:

  • Cleansing the dataset
      -
    • Optional removal of variables including missing values
    • +
    • Optional removal of variables including missing +values
    • Remove a variable with one unique number
    • -
    • Remove categorical variables with a large number of levels
    • -
    • Convert a character variable to a categorical variable
      +
    • Remove categorical variables with a large number of +levels
    • +
    • Convert a character variable to a categorical +variable
  • Split the data into a train set and a test set
  • @@ -69,7 +267,8 @@

    Preface

How to perform cleansing the dataset

-

For information on how to perform cleansing the dataset, refer to the following website.

+

For information on how to cleanse the dataset, refer to the +following website.

diff --git a/inst/doc/modeling.html b/inst/doc/modeling.html index 3e7cc9a..c709b95 100644 --- a/inst/doc/modeling.html +++ b/inst/doc/modeling.html @@ -12,27 +12,220 @@ - + Classification Modeling - + - - - - - - - +code{white-space: pre-wrap;} +span.smallcaps{font-variant: small-caps;} +span.underline{text-decoration: underline;} +div.column{display: inline-block; vertical-align: top; width: 50%;} +div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;} +ul.task-list{list-style: none;} + + + + + + + + + @@ -46,13 +239,14 @@

Classification Modeling

Choonghyun Ryu

-

2021-11-30

+

2024-02-10

Preface

-

Once the data set is ready for model development, the model is fitted, predicted and evaluated in the following ways:

+

Once the data set is ready for model development, the model is +fitted, predicted and evaluated in the following ways:

  • Cleansing the dataset
  • Split the data into a train set and a test set
  • @@ -81,7 +275,8 @@

    Preface

How to perform modeling

-

For information on how to perform modeling and evaluate, predict, refer to the following website.

+

For information on how to perform modeling, evaluation, and prediction, +refer to the following website.

diff --git a/inst/doc/split.html b/inst/doc/split.html index f38970c..420324f 100644 --- a/inst/doc/split.html +++ b/inst/doc/split.html @@ -12,27 +12,220 @@ - + Splitting the dataset - + - - - - - - - +code{white-space: pre-wrap;} +span.smallcaps{font-variant: small-caps;} +span.underline{text-decoration: underline;} +div.column{display: inline-block; vertical-align: top; width: 50%;} +div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;} +ul.task-list{list-style: none;} + + + + + + + + + @@ -46,18 +239,20 @@

Splitting the dataset

Choonghyun Ryu

-

2021-11-30

+

2024-02-10

Preface

-

To develop a classification model, the original data must be divided into train data set and test data set. You should do the following:

+

To develop a classification model, the original data must be divided +into a train data set and a test data set. You should do the following:

  • Cleansing the dataset
  • Split the data into a train set and a test set
      -
    • Split the data.frame or tbl_df into a train set and a test set
    • +
    • Split the data.frame or tbl_df into a train set and a test +set
    • Compare dataset
      • Comparison of categorical variables
      • @@ -76,7 +271,8 @@

        Preface

How to perform split the data

-

For information on how to perform split the data into a train set and a test set, refer to the following website.

+

For information on how to split the data into a train set and +a test set, refer to the following website.

diff --git a/man/performance_metric.Rd b/man/performance_metric.Rd index 387c428..64da34f 100644 --- a/man/performance_metric.Rd +++ b/man/performance_metric.Rd @@ -59,7 +59,6 @@ The cutoff argument applies only if the metric argument is "ZeroOneLoss", "Accur "Sensitivity", "Specificity", "F1_Score", "Fbeta_Score", "ConfusionMatrix". } \examples{ -\donttest{ library(dplyr) # Divide the train data set and the test data set. @@ -99,6 +98,5 @@ performance_metric(attr(pred$predicted[[1]], "pred_prob"), test$Kyphosis, # Calculate Confusion Matrix by cutoff = 0.55. performance_metric(attr(pred$predicted[[1]], "pred_prob"), test$Kyphosis, "present", "ConfusionMatrix", cutoff = 0.55) -} } diff --git a/man/plot_cutoff.Rd b/man/plot_cutoff.Rd index f47eeda..39e8b23 100644 --- a/man/plot_cutoff.Rd +++ b/man/plot_cutoff.Rd @@ -41,7 +41,6 @@ is less than 100. If the observation is greater than 100, draw a line plot. In this case, the speed of visualization can be slow. } \examples{ -\donttest{ library(ggplot2) library(rpart) data(kyphosis) @@ -61,6 +60,5 @@ plot_cutoff(pred, kyphosis$Kyphosis, "present", type = "density", measure = "hal plot_cutoff(pred, kyphosis$Kyphosis, "present", type = "prob", measure = "mcc") plot_cutoff(pred, kyphosis$Kyphosis, "present", type = "prob", measure = "cross") plot_cutoff(pred, kyphosis$Kyphosis, "present", type = "prob", measure = "half") -} } diff --git a/man/run_models.Rd b/man/run_models.Rd index 1d19bd2..f3202ca 100644 --- a/man/run_models.Rd +++ b/man/run_models.Rd @@ -8,8 +8,7 @@ run_models( .data, target, positive, - models = c("logistic", "rpart", "ctree", "randomForest", "ranger", "xgboost", - "lasso") + models = c("logistic", "rpart", "ctree", "randomForest", "ranger", "xgboost", "lasso") ) } \arguments{ @@ -90,7 +89,4 @@ result train \%>\% run_models(target = "Kyphosis", positive = "present") -# Run the logistic model fitting by dplyr -train \%>\% - run_models(target = "Kyphosis", positive = "present", models = "logistic") } diff 
--git a/man/run_predict.Rd b/man/run_predict.Rd index 54a6554..546d78f 100644 --- a/man/run_predict.Rd +++ b/man/run_predict.Rd @@ -73,10 +73,6 @@ train <- train \%>\% result <- run_models(.data = train, target = "Kyphosis", positive = "present") result -# Predict the model. -pred <- run_predict(result, test) -pred - # Run the several kinds model predict by dplyr result \%>\% run_predict(test)