Skip to content

Commit

Permalink
Improve formatting of console output
Browse files Browse the repository at this point in the history
- The console output needs to be formatted so it is better organized and
  more user friendly. The file read messages need to be in green. The file
  write messages need to be in blue. Each major operation should have a
  heading and a footer.
- Correct formatting error in DESCRIPTION file.
- Remove the references.bib file from .Rbuildignore.
- Add headings to all major functions.
- Correct error in CITATION file.
- Set version number to 0.0.1 in DESCRIPTION file.
- Set development mode to unreleased in pkgdown config file.
- When the wordpredictor package is loaded, it shows a red warning
  message from the pryr package. The only thing used from pryr is the
  object_size method, and it is called just once. Instead, a method for
  calculating the object size may be added to the Model class, and the
  dependency on the pryr package should be removed.
- Build pkgdown website.
  • Loading branch information
pakjiddat committed Jun 14, 2021
1 parent 609cd05 commit a7d7f23
Show file tree
Hide file tree
Showing 79 changed files with 3,050 additions and 717 deletions.
1 change: 0 additions & 1 deletion .Rbuildignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,3 @@
^vignettes/features_cache$
^vignettes/overview_cache$
^CRAN-RELEASE$
^vignettes/references.bib$
3 changes: 1 addition & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: wordpredictor
Title: Develop Text Prediction Models Based on N-Grams
Version: 1.0.0
Version: 0.0.1
URL: https://github.com/pakjiddat/word-predictor, https://pakjiddat.github.io/word-predictor/
BugReports: https://github.com/pakjiddat/word-predictor/issues
Authors@R:
Expand All @@ -12,7 +12,6 @@ Authors@R:
Description: The wordpredictor package allows developing n-gram models for text
prediction. It provides methods for data cleaning, data sampling,
tokenization, model generation, model evaluation and word prediction.

For information on how n-gram models work we referred to: Jurafsky, Daniel &
Martin, James. (2008). "Speech and Language Processing: An Introduction to
Natural Language Processing, Computational Linguistics, and Speech
Expand Down
1 change: 0 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@ importFrom(ggplot2,labs)
importFrom(ggplot2,xlab)
importFrom(ggplot2,ylab)
importFrom(patchwork,plot_annotation)
importFrom(pryr,object_size)
importFrom(stringr,boundary)
importFrom(stringr,str_count)
importFrom(stringr,str_match)
51 changes: 45 additions & 6 deletions R/base.R
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,10 @@ Base <- R6::R6Class(
initialize = function(fn = NULL, lc = 100, ve = 2) {
# If the given file name is not NULL and is not valid
if (!is.null(fn) && !file.exists(fn)) {
private$dm("The given file name is not valid", md = -1, ty = "e")
private$dm(
"The given file name is not valid",
md = -1, ty = "e"
)
}

# The base class attributes are set
Expand Down Expand Up @@ -189,8 +192,10 @@ Base <- R6::R6Class(
# @param is_csv If the data is a csv file
# @return The file data
read_file = function(fn, is_csv) {
# The information message
msg <- paste0("Reading \033[0;", 32, "m'", fn, "'\033[0m")
# Information message is shown
private$dm("Reading file:", fn, md = 1)
private$dm(msg, md = 1)
# If the file is not a csv file
if (!is_csv) {
# File is opened for reading
Expand All @@ -215,8 +220,10 @@ Base <- R6::R6Class(
# @param lc The number of lines to read.
# @return The file data
read_lines = function(fn, lc) {
# The information message
msg <- paste0("Reading \033[0;", 32, "m'", fn, "'\033[0m")
# Information message is shown
private$dm("Reading file:", fn, md = 1)
private$dm(msg, md = 1)
# File is opened for reading
con <- file(fn)
# The file contents are read
Expand All @@ -236,8 +243,10 @@ Base <- R6::R6Class(
# @param fn The name of the file.
# @param is_append Indicates if data should be saved.
write_file = function(data, fn, is_append) {
# The information message
msg <- paste0("Writing \033[0;", 34, "m'", fn, "'\033[0m")
# Information message is shown
private$dm("Saving file:", fn, md = 1)
private$dm(msg, md = 1)
# If the given data is a data frame
if ("data.frame" %in% class(data)) {
# The data frame is written to a file
Expand All @@ -264,8 +273,10 @@ Base <- R6::R6Class(
# @param obj The object to save.
# @param fn The file name.
save_obj = function(obj, fn) {
# The information message
msg <- paste0("Writing \033[0;", 34, "m'", fn, "'\033[0m")
# Information message is shown
private$dm("Saving file:", fn, md = 1)
private$dm(msg, md = 1)
# The object is saved to a file in version 2 format
saveRDS(obj, fn, version = 2)
# The information message is shown
Expand All @@ -278,8 +289,10 @@ Base <- R6::R6Class(
# @param fn The file name.
# @return The loaded R obj.
read_obj = function(fn) {
# The information message
msg <- paste0("Reading \033[0;", 32, "m'", fn, "'\033[0m")
# Information message is shown
private$dm("Reading file:", fn, md = 1)
private$dm(msg, md = 1)
# If the file does not exist
if (!file.exists(fn)) {
# The error message
Expand Down Expand Up @@ -321,6 +334,32 @@ Base <- R6::R6Class(
}
},

# @description
# Displays the given text as a one-line heading of the form
# "<char><char> <text> <char><char>...". The heading is followed by as
# many padding characters as are needed for the whole line to be ll
# characters long. Nothing is displayed if the verbosity level is less
# than md.
# @param text The heading text to display.
# @param char The padding character to use.
# @param md The minimum debugging level.
# @param ll The total length of the line. Default is 80 chars.
dh = function(text, char, md, ll = 80) {
    # If verbose is >= min_debug, then the heading is displayed
    if (private$ve >= md) {
        # The heading prefix. The padding character repeated twice,
        # followed by a space
        pre <- paste0(rep(char, 2), collapse = "")
        pre <- paste0(pre, " ", collapse = "")
        # The heading text and a separating space are added
        msg <- paste0(pre, text, " ", collapse = "")
        # The number of characters left on the line after the prefix, the
        # text and the separating space
        rem <- ll - nchar(msg)
        # The number of times the padding character should be repeated.
        # It is never negative, so a heading that is longer than the line
        # is printed without a suffix instead of causing an error in rep
        n_pad <- max(0, rem %/% max(1, nchar(char)))
        # The heading suffix
        su <- paste0(rep(char, n_pad), collapse = "")
        # The suffix and the line break are added
        msg <- paste0(msg, su, "\n", collapse = "")
        # The complete heading is printed
        cat(msg)
    }
},

# @description
# Performs processing on the data. It should be
# overridden by a derived class.
Expand Down
32 changes: 17 additions & 15 deletions R/data-analyzer.R
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#' It allows analyzing input text files and n-gram token files
#' Analyzes input text files and n-gram token files
#'
#' @description
#' It provides a method that returns information about text files, such as
Expand Down Expand Up @@ -92,22 +92,18 @@ DataAnalyzer <- R6::R6Class(
#' # files generated by the function can be viewed
#' em$td_env()
plot_n_gram_stats = function(opts) {
# The opts is merged with the da_opts attribute
private$da_opts <- modifyList(private$da_opts, opts)
# The da_opts is merged with the base class opts attribute
private$opts <- modifyList(private$opts, private$da_opts)
# The information message is shown
private$dh("Displaying Plot", "-", md = 1)
# The n-gram data is read
df <- private$read_obj(private$fn)
# The information message is shown
private$dm("Displaying Plot", md = 1)
# If the coverage option was specified
if (private$opts[["type"]] == "coverage") {
if (opts[["type"]] == "coverage") {
# The y values
y <- as.character(1:private$opts[["n"]])
y <- as.character(1:opts[["n"]])
# The x values
x <- numeric()
# The percentage frequencies is calculated
for (i in 1:private$opts[["n"]]) {
for (i in 1:opts[["n"]]) {
# The percentage of tokens with frequency i
x[i] <- 100 * (nrow(df[df$freq == i, ]) / nrow(df))
# The percentage is rounded to 2 decimal places
Expand All @@ -123,12 +119,12 @@ DataAnalyzer <- R6::R6Class(
)
}
# If the top_features option was specified
else if (private$opts[["type"]] == "top_features") {
else if (opts[["type"]] == "top_features") {
# The plot labels
labels <- list(
y = "Frequency",
x = "Feature",
title = paste("Top", private$opts[["n"]], "Features")
title = paste("Top", opts[["n"]], "Features")
)
}
# The freq column is converted to numeric
Expand All @@ -138,7 +134,7 @@ DataAnalyzer <- R6::R6Class(
# The data frame is sorted in descending order
df <- (df[order(df$freq, decreasing = T), ])
# The top n terms are extracted
df <- df[1:private$opts[["n"]], ]
df <- df[1:opts[["n"]], ]
# The chart is plotted
g <- private$display_plot(df, labels)

Expand All @@ -162,7 +158,7 @@ DataAnalyzer <- R6::R6Class(
print(g)
}
# The information message is shown
private$dm(" \u2714\n", md = 1)
private$dh("DONE", "=", md = 1)

return(df)
},
Expand Down Expand Up @@ -203,6 +199,8 @@ DataAnalyzer <- R6::R6Class(
#' # files generated by the function can be viewed
#' em$td_env()
get_file_info = function(res) {
# The information message is shown
private$dh("Generating file stats", "-", md = 1)
# The list of files to check
fl <- NULL
# If a directory name was passed
Expand Down Expand Up @@ -280,6 +278,9 @@ DataAnalyzer <- R6::R6Class(

# The required stats
stats <- list("file_stats" = fstats, "overall_stats" = ostats)
# The information message is shown
private$dh("DONE", "=", md = 1)

# The required stats are returned
return(stats)
},
Expand All @@ -301,7 +302,8 @@ DataAnalyzer <- R6::R6Class(
#' # The name of the folder that will contain all the files. It will be
#' # created in the current directory. NULL implies tempdir will be used
#' fn <- NULL
#' # The required files. They are default files that are part of the package
#' # The required files. They are default files that are part of the
#' # package
#' rf <- c("n2.RDS")
#' # An object of class EnvManager is created
#' em <- EnvManager$new(ve = ve, rp = "./")
Expand Down
8 changes: 2 additions & 6 deletions R/data-cleaner.R
Original file line number Diff line number Diff line change
Expand Up @@ -130,19 +130,15 @@ DataCleaner <- R6::R6Class(
#' # files generated by the function can be viewed
#' em$td_env()
clean_file = function() {
# The information message
msg <- paste0("Cleaning the file: ", private$fn, "\n")
# The information message is shown
private$dm(msg, md = 1)
private$dh("Cleaning file", "-", md = 1)
# The base class process_file function is called
private$process_file(
private$pre_process, private$process,
private$post_process
)
# The information message
msg <- paste0("The file: ", private$fn, " has been cleaned\n")
# The information message is shown
private$dm(msg, md = 1)
private$dh("DONE", "=", md = 1)
# If the data should not be saved
if (!private$dc_opts[["save_data"]]) {
# The processed output is returned
Expand Down
40 changes: 18 additions & 22 deletions R/data-sampler.R
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#' It allows generating data samples from text files.
#' Generates data samples from text files
#'
#' @description
#' It provides a method for generating training, testing and validation data
Expand Down Expand Up @@ -108,10 +108,12 @@ DataSampler <- R6::R6Class(
#' # Start of environment setup code
#' # The level of detail in the information messages
#' ve <- 0
#' # The name of the folder that will contain all the files. It will be created in
#' # the current directory. NULL implies tempdir will be used.
#' # The name of the folder that will contain all the files. It will be
#' # created in the current directory. NULL implies tempdir will be
#' # used
#' fn <- NULL
#' # The required files. They are default files that are part of the package
#' # The required files. They are default files that are part of the
#' # package
#' rf <- c("input.txt")
#' # An object of class EnvManager is created
#' em <- EnvManager$new(ve = ve)
Expand All @@ -133,8 +135,8 @@ DataSampler <- R6::R6Class(
#' )
#' )
#'
#' # The test environment is removed. Comment the below line, so the files
#' # generated by the function can be viewed
#' # The test environment is removed. Comment the below line, so the
#' # files generated by the function can be viewed
#' em$td_env()
generate_data = function(fn, percs) {
# The directory containing the input and output files
Expand All @@ -152,10 +154,11 @@ DataSampler <- R6::R6Class(
if (!file.exists(fn)) {
# The error message
private$dm("The input file: ",
fn,
" does not exist",
md = -1,
ty = "e")
fn,
" does not exist",
md = -1,
ty = "e"
)
}
# If the train, test and validation files already exist
if (file.exists(paste0(dir, "/train.txt")) &&
Expand Down Expand Up @@ -196,11 +199,8 @@ DataSampler <- R6::R6Class(
# The validation data is written to file
private$write_file(validate_ds, paste0(dir, "/validate.txt"), F)
}
# The information message
msg <- "Training, testing and validation data sets"
msg <- paste0(msg, " were successfully generated\n")
# The information message is shown
private$dm(msg, md = 1)
private$dh("DONE", "=", md = 1)
}
),
private = list(
Expand All @@ -219,9 +219,8 @@ DataSampler <- R6::R6Class(
# @param dc_opts The options for cleaning the data.
# @return The sampled data is returned
generate_sf_from_f = function(fn = NULL, ss, ic, ir, of, is, dc_opts) {
# The information message
private$dm(
"Generating sample file from the file:", fn, "\n", md = 1)
# The information message is shown
private$dh("Generating sample file", "-", md = 1)
# The input file is read
data <- private$read_file(fn, F)
# The number of lines in the main file
Expand Down Expand Up @@ -258,11 +257,8 @@ DataSampler <- R6::R6Class(
# The sample file is cleaned
data <- dc$clean_file()
}
# Information message is shown
private$dm(
"Sample file was sucessfully generated\n",
md = 1
)
# The information message is shown
private$dh("DONE", "=", md = 1)

return(data)
}
Expand Down
Loading

0 comments on commit a7d7f23

Please sign in to comment.