diff --git a/.Rbuildignore b/.Rbuildignore
index 91f6187..ca60038 100644
--- a/.Rbuildignore
+++ b/.Rbuildignore
@@ -1,2 +1,3 @@
diff --git a/.gitignore b/.gitignore
index 565f2b6..3f32e00 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,3 +3,4 @@
index eadbe32..4276530 100644
@@ -1,16 +1,32 @@
Package: wordpredictor
-Title: What the Package Does (One Line, Title Case)
+Title: Develop Text Prediction Models Based On N-grams
+Version: 1.0.0
- person(given = "First",
- family = "Last",
+ person(given = "Nadir",
+ family = "Latif",
role = c("aut", "cre"),
- email = "first.last@example.com",
- comment = c(ORCID = "YOUR-ORCID-ID"))
-Description: What the package does (one paragraph).
-License: `use_mit_license()`, `use_gpl3_license()` or friends to
- pick a license
+ email = "pakjiddat@gmail.com",
+ comment = c(ORCID = "0000-0002-7543-7405"))
+Description: It allows developing n-gram models for predicting text. It allows
+ cleaning input text using various options such as removal of stop words,
+ profanity, non-dictionary words and punctuation, as well as stemming.
+ It generates n-gram tokens of a given size from a text file. It also
+ generates transition probability data for the n-grams. This data is used
+ to predict the next word given a set of words. The model's performance can
+ be evaluated using Perplexity (intrinsic evaluation) and
+ accuracy (extrinsic evaluation). The package also provides methods for
+ analyzing the n-gram model using bar plots. For example it displays plots
+ of n-gram frequencies and plots of model performance. It provides methods
+ for generating the n-gram model step by step or by using a single method.
+ The performance of the generated model may be evaluated and displayed in
+ plots. The generated model can be easily exported as an R object and used
+ in applications. The package is implemented using R6 classes.
+License: MIT + file LICENSE
Encoding: UTF-8
LazyData: true
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.1.1
+Imports: digest, ggplot2, R6, patchwork, stringr, dplyr, pryr, SnowballC, utils
+ testthat (>= 3.0.0)
+Config/testthat/edition: 3
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..586e115
--- /dev/null
@@ -0,0 +1,2 @@
+YEAR: 2021
diff --git a/LICENSE.md b/LICENSE.md
new file mode 100644
index 0000000..c7b302f
--- /dev/null
+++ b/LICENSE.md
@@ -0,0 +1,21 @@
+# MIT License
+Copyright (c) 2021 Nadir Latif
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
index 6ae9268..0a8a37e 100644
@@ -1,2 +1,30 @@
# Generated by roxygen2: do not edit by hand
diff --git a/R/data-analyzer.R b/R/data-analyzer.R
index 6484074..3ae411c 100644
--- a/R/data-analyzer.R
+++ b/R/data-analyzer.R
@@ -11,6 +11,7 @@
#' length and file size. It also provides a method that takes random samples of
#' lines in an input text file. It provides a method that reads an input text
#' file containing token frequencies. It displays the most occuring tokens.
+#' @importFrom ggplot2 ggplot geom_bar ggtitle coord_flip ylab xlab aes
DataAnalyzer <- R6::R6Class(
inherit = TextFileProcessor,
@@ -21,14 +22,15 @@ DataAnalyzer <- R6::R6Class(
#' @param file_name The path to the input file.
#' @param line_count The number of lines to read at a time.
#' @param verbose If progress information should be displayed.
- initialize = function(file_name = "./data/n1.txt",
+ #' @export
+ initialize = function(file_name = NULL,
verbose = 0) {
# The file name is set
- self$file_name <- file_name
+ private$file_name <- file_name
# The processed output is initialized
- self$p_output <- data.frame()
+ private$p_output <- data.frame()
# The verbose options is set
- self$verbose = verbose
+ private$verbose = verbose
#' @description
@@ -42,23 +44,23 @@ DataAnalyzer <- R6::R6Class(
#' 'top_features', 'coverage'.
#' n -> For 'top_features', it is the number of top most occuring
#' tokens.
- plot_data = function(opts) {
+ plot_n_gram_stats = function(opts) {
# The opts is merged with the da_opts attribute
private$da_opts = modifyList(private$da_opts, opts)
# The da_opts is merged with the base class opts attribute
- self$opts = modifyList(self$opts, private$da_opts)
+ private$opts = modifyList(private$opts, private$da_opts)
# The ngram data is read
- df <- self$read_file(self$file_name, T)
+ df <- private$read_obj(private$file_name)
# The information message is shown
- self$display_msg("Displaying Plot...", 1)
+ private$display_msg("Displaying Plot...", 1)
# If the coverage option was specified
- if (self$opts[["type"]] == "coverage") {
+ if (private$opts[["type"]] == "coverage") {
# The y values
- y <- as.character(1:self$opts[["n"]])
+ y <- as.character(1:private$opts[["n"]])
# The x values
x <- numeric()
# The percentage frequencies is calculated
- for (i in 1:self$opts[["n"]]) {
+ for (i in 1:private$opts[["n"]]) {
# The percentage of tokens with frequency i
x[i] <- round(100*(nrow(df[df$freq == i,])/nrow(df)), 2)
@@ -74,12 +76,12 @@ DataAnalyzer <- R6::R6Class(
private$display_plot(df, labels)
# If the top_features option was specified
- else if (self$opts[["type"]] == "top_features") {
+ else if (private$opts[["type"]] == "top_features") {
# The plot labels
labels <- list(
y = "Frequency",
x = "Feature",
- title = paste("Top", self$opts[["n"]], "Features"))
+ title = paste("Top", private$opts[["n"]], "Features"))
# The chart is plotted
private$display_plot(df, labels)
@@ -87,53 +89,85 @@ DataAnalyzer <- R6::R6Class(
#' @description
- #' Generates and returns information about the given text files.
- #' @param file_list The list of text files to check.
- #' @return A data frame containing the overall file statistics.
- get_file_info = function(file_list = list()) {
- # If the file list is empty, then the file name passed to the
- # current objet is used.
- if (length(file_list) == 0)
- file_list = list(self$file_name)
- # Empty list. Used to store information about each file
- stats <- data.frame(
- "total_line_count" = 0,
- "max_line_length" = 0,
- "min_line_length" = 0,
- "mean_line_length" = 0,
- "total_size" = 0)
+ #' Generates and returns information about text files.
+ #' @param res The name of a directory or a file name.
+ #' @return A data frame containing the file statistics.
+ get_file_info = function(res) {
+ # The list of files to check
+ file_list <- NULL
+ # If a directory name was passed
+ if (dir.exists(res)) {
+ # All files in the directory are fetched
+ file_list = dir(res, full.names = T)
+ }
+ # If a file name was passed
+ else if (file.exists(res)) {
+ # The file name is set
+ file_list <- res
+ }
+ # Used to store overall information about files
+ ostats <- data.frame(
+ "total_lc" = 0,
+ "max_ll" = 0,
+ "min_ll" = 0,
+ "mean_ll" = 0,
+ "total_s" = 0
+ )
+ # Used to store information about each file
+ fstats <- tstats <- data.frame()
# Temporary variables for calculating max, min, mean line length
temp_max <- temp_min <- temp_mean <- 0
# For each file in the list
- for (file_name in file_list) {
+ for (fn in file_list) {
# The file is read
- lines <- self$read_file(file_name, F)
+ lines <- private$read_file(fn, F)
+ # The line count
+ lc <- length(lines)
+ # The file size
+ size <- file.size(fn)
# The file stats are updated
- stats["total_size"] <-
- stats["total_size"] + file.size(file_name)
- stats["total_line_count"] <-
- stats["total_line_count"] + length(lines)
+ ostats[["total_s"]] <- ostats[["total_s"]] + size
+ ostats[["total_lc"]] <- ostats[["total_lc"]] + lc
# The temporary variables are updated
temp_max <- max(nchar(lines))
temp_min <- min(nchar(lines))
- temp_mean <- mean(nchar(lines))
- if (temp_max > stats["max_line_length"])
- stats["max_line_length"] <- temp_max
- if (temp_min > stats["min_line_length"])
- stats["min_line_length"] <- temp_min
- if (temp_mean > stats["mean_line_length"])
- stats["mean_line_length"] <- round(temp_mean)
+ temp_mean <- round(mean(nchar(lines)))
+ # The file stats are updated
+ tstats <- data.frame(
+ "fn" = fn,
+ "total_lc" = lc,
+ "max_ll" = temp_max,
+ "min_ll" = temp_min,
+ "mean_ll" = temp_mean,
+ "size" = size
+ )
+ # The size is formatted
+ tstats["size"] <-
+ utils:::format.object_size(tstats["size"], "auto")
+ # The file stats are appended
+ fstats <- rbind(fstats, tstats)
+ if (temp_max > ostats["max_ll"])
+ ostats["max_ll"] <- temp_max
+ if (temp_min > ostats["min_ll"])
+ ostats["min_ll"] <- temp_min
+ if (temp_mean > ostats["mean_ll"])
+ ostats["mean_ll"] <- temp_mean
# The total size is formatted
- stats["total_size"] <- utils:::format.object_size(
- stats["total_size"], "auto")
+ ostats["total_s"] <-
+ utils:::format.object_size(ostats["total_s"], "auto")
- # The required data is returned
+ # The required stats
+ stats = list("file_stats" = fstats, "overall_stats" = ostats)
+ # The required stats are returned
@@ -149,7 +183,7 @@ DataAnalyzer <- R6::R6Class(
#' @param percs The size of the training, testing and validation sets.
generate_data = function(dir, percs) {
# The information message is shown
- self$display_msg(
+ private$display_msg(
"Generating training, testing and validation data sets...", 1)
# If the train, test and validation files already exist
if (file.exists(paste0(dir, "/train.txt")) &&
@@ -158,11 +192,11 @@ DataAnalyzer <- R6::R6Class(
# The information message
msg <- "The train, test and validate files already exist"
# The information message is shown
- self$display_msg(msg, 1)
+ private$display_msg(msg, 1)
else {
# The input file is read
- data <- self$read_file(self$file_name, F)
+ data <- private$read_file(private$file_name, F)
# The number of lines in the data
lc <- length(data)
# Random indexes are generated
@@ -176,11 +210,11 @@ DataAnalyzer <- R6::R6Class(
# The validation set data
validate_ds <- rd[1:round(lc*percs[["validate"]])]
# The training data is written to file
- self$write_file(train_ds, paste0(dir, "/train.txt"), F)
+ private$write_file(train_ds, paste0(dir, "/train.txt"), F)
# The testing data is written to file
- self$write_file(test_ds, paste0(dir, "/test.txt"), F)
+ private$write_file(test_ds, paste0(dir, "/test.txt"), F)
# The validation data is written to file
- self$write_file(validate_ds, paste0(dir, "/validate.txt"), F)
+ private$write_file(validate_ds, paste0(dir, "/validate.txt"), F)
@@ -194,7 +228,7 @@ DataAnalyzer <- R6::R6Class(
#' @param pre The ngram prefix, given as a regular expression.
get_ngrams = function(fn, c = NULL, pre = NULL) {
# The data is read
- df <- self$read_obj(fn)
+ df <- private$read_obj(fn)
# If the prefix is not given
if (is.null(pre)) {
# The sample indexes
@@ -235,7 +269,7 @@ DataAnalyzer <- R6::R6Class(
# The data frame is sorted in descending order
df <- (df[order(df$freq, decreasing = T),])
# The top n terms are extracted
- df <- df[1:self$opts[["n"]], ]
+ df <- df[1:private$opts[["n"]], ]
# The ngram names and their frequencies are plotted
g <- ggplot(data = df, aes(x = reorder(pre, freq), y = freq)) +
geom_bar(stat = "identity", fill = "red") +
diff --git a/R/data-cleaner.R b/R/data-cleaner.R
index 61ec87c..396ae6c 100644
--- a/R/data-cleaner.R
+++ b/R/data-cleaner.R
@@ -11,48 +11,6 @@ DataCleaner <- R6::R6Class(
inherit = TextFileProcessor,
public = list(
- #' @field dc_opts The options for the data cleaner object.
- #' min_words -> The minimum number of words per sentence.
- #' line_count -> The number of lines to read and clean at a time.
- # save_data -> If the combined processed lines should be saved.
- # output_file -> Name of the output file used to store the data.
- #' sw_file -> The stop words file path.
- # dict_file -> The dictionary file path.
- #' bad_file -> The bad words file path.
- #' to_lower -> If the words should be converted to lower case.
- #' remove_stop -> If stop words should be removed.
- #' remove_punct -> If punctuation symbols should be removed.
- #' remove_non_dict -> If non dictionary words should be removed.
- #' remove_non_alpha -> If non alphabet symbols should be removed.
- #' remove_extra_space -> If leading, trailing and double spaces
- #' should be removed.
- #' remove_bad -> If bad words should be removed
- dc_opts = list(
- "min_words" = 2,
- "line_count" = 1000,
- "save_data" = T,
- "output_file" = "./data/sample-clean.txt",
- "sw_file" = "./data/stop-words.txt",
- "dict_file" = "./data/dict-no-bad.txt",
- "bad_file" = "./data/bad-words.txt",
- "to_lower" = T,
- "remove_stop" = F,
- "remove_punct" = T,
- "remove_non_dict" = T,
- "remove_non_alpha" = T,
- "remove_extra_space" = T,
- "remove_bad" = F
- ),
- #' @field sw The list of stop words.
- sw = list(),
- #' @field bw The list of bad words.
- bw = list(),
- #' @field dw The list of dictionary words.
- dw = list(),
#' @description
#' It initializes the current object. It is used to set the file name
#' and verbose options.
@@ -74,21 +32,33 @@ DataCleaner <- R6::R6Class(
#' should be removed.
#' remove_bad -> If bad words should be removed
#' @param verbose Indicates if progress information should be displayed.
+ #' @export
initialize = function(file_name = NULL,
- opts = self$dc_opts,
+ opts = list(),
verbose = 0) {
+ # The stop words file is checked
+ opts[["sw_file"]] <- private$check_file(
+ opts[["sw_file"]], "stop-words.txt")
+ # The bad words file is checked
+ opts[["bad_file"]] <- private$check_file(
+ opts[["bad_file"]], "bad-words.txt")
+ # The dict words file is checked
+ opts[["dict_file"]] <- private$check_file(
+ opts[["dict_file"]], "dict-no-bad.txt")
# The given options are merged with the opts attribute
- self$dc_opts <- modifyList(self$dc_opts, opts)
- # The dc_opts is merged with the base class opts attribute
- self$opts <- modifyList(self$opts, self$dc_opts)
+ private$dc_opts <- modifyList(private$dc_opts, opts)
+ # The save_data option of base class is set
+ private$opts[["save_data"]] <- private$dc_opts[["save_data"]]
+ # The output_file option of base class is set
+ private$opts[["output_file"]] <- private$dc_opts[["output_file"]]
# The stop words file is read
- self$sw <- self$read_file(self$opts[["sw_file"]], F);
+ private$sw <- private$read_file(private$opts[["sw_file"]], F);
# The dictionary file is read
- self$dw <- self$read_file(self$opts[["dict_file"]], F);
+ private$dw <- private$read_file(private$opts[["dict_file"]], F);
# The bad word file is read
- self$bw <- self$read_file(self$opts[["bad_file"]], F);
+ private$bw <- private$read_file(private$opts[["bad_file"]], F);
# The base class is initialized
- super$initialize(file_name, self$opts[['line_count']], verbose)
+ super$initialize(file_name, private$opts[['line_count']], verbose)
#' @description
@@ -100,12 +70,17 @@ DataCleaner <- R6::R6Class(
#' number of lines, the cleaned lines are saved to the output file.
clean_file = function() {
# The information message
- msg <- paste0("Cleaning the sample file...", self$file_name)
+ msg <- paste0("Cleaning the sample file...", private$file_name)
# The information message is shown
- self$display_msg(msg, 1)
+ private$display_msg(msg, 1)
# The base class process_file function is called
- super$process_file(super$pre_process, private$process,
- super$post_process)
+ private$process_file(private$pre_process, private$process,
+ private$post_process)
+ # If the data should not be saved
+ if (!private$opts[["save_data"]]) {
+ # The processed output is returned
+ return(private$p_output)
+ }
#' @description
@@ -122,41 +97,41 @@ DataCleaner <- R6::R6Class(
# The "." character is replaced with the string "specialdotsep"
l <- gsub("\\.", " specialdotsep ", l)
# If the words should be converted to lower case
- if (self$opts[["to_lower"]]) {
+ if (private$opts[["to_lower"]]) {
# The information message
- self$display_msg("Converting lines to lower case...", 3)
+ private$display_msg("Converting lines to lower case...", 3)
# The line is converted to lower case
l <- tolower(l)
# If punctuation symbols should be removed
- if (self$opts[["remove_punct"]]) {
+ if (private$opts[["remove_punct"]]) {
# The information message
- self$display_msg("Removing punctuation symbols...", 3)
+ private$display_msg("Removing punctuation symbols...", 3)
# The pattern for removing all punctuation symbols
l <- gsub("[[:punct:]]+", "", l)
# If non alphabet symbols should be removed
- if (self$opts[["remove_non_alpha"]]) {
+ if (private$opts[["remove_non_alpha"]]) {
# The information message
- self$display_msg("Removing non alphabet symbols...", 3)
+ private$display_msg("Removing non alphabet symbols...", 3)
# Words containing non alphabetical characters are removed
l <- gsub("([^[:alpha:]\\s])", "", l, perl = T)
# If stop words should be removed
- if (self$opts[["remove_stop"]]) {
+ if (private$opts[["remove_stop"]]) {
# The information message
- self$display_msg("Removing stop words..", 3)
+ private$display_msg("Removing stop words..", 3)
# Stop words are collapsed
- sw <- paste(self$sw, collapse = "|")
+ sw <- paste(private$sw, collapse = "|")
swp <- paste("\\b(", sw, ")\\b", sep = "")
# The stop words are removed
l <- gsub(swp, "", l)
# If extra spaces should be removed
- if (self$opts[["remove_extra_space"]]) {
+ if (private$opts[["remove_extra_space"]]) {
# The information message
- self$display_msg("Removing extra space...", 3)
+ private$display_msg("Removing extra space...", 3)
# Multiple spaces are replaced by single space
l = gsub("\\s{2,}", " ", l)
# Leading and trailing whitespaces are removed
@@ -168,10 +143,10 @@ DataCleaner <- R6::R6Class(
# The words are converted to an atomic list
words <- unlist(words)
# If non dictionary words should be removed
- if (self$opts[["remove_non_dict"]]) {
+ if (private$opts[["remove_non_dict"]]) {
# The "specialdotsep" string is added to list of dictionary
# words
- dw <- c(self$dw, "specialdotsep")
+ dw <- c(private$dw, "specialdotsep")
# The non dictionary words are removed from the data
words <- words[words %in% dw]
# All 1 length words except for 'a' and 'i' are removed
@@ -184,11 +159,11 @@ DataCleaner <- R6::R6Class(
words <- words[i1 | i2]
# If bad words should be removed
- if (self$opts[["remove_bad"]]) {
+ if (private$opts[["remove_bad"]]) {
# The "specialdotsep" string is added to list of bad words
- bw <- c(self$bw, "specialdotsep")
+ bw <- c(private$bw, "specialdotsep")
# The bad words are removed from the data
- words <- words[!words %in% self$bw]
+ words <- words[!words %in% private$bw]
# The words are combined with space
l <- paste(words, collapse = " ")
@@ -199,7 +174,7 @@ DataCleaner <- R6::R6Class(
# The sentences are converted to an atomic list
l <- unlist(l)
# If extra spaces should be removed
- if (self$opts[["remove_extra_space"]]) {
+ if (private$opts[["remove_extra_space"]]) {
# Multiple spaces are replaced by single space
l = gsub("\\s{2,}", " ", l)
# Leading and trailing whitespaces are removed
@@ -207,12 +182,12 @@ DataCleaner <- R6::R6Class(
# If each sentence should have a minimum number of words
- if (self$opts[["min_words"]] > -1) {
+ if (private$opts[["min_words"]] > -1) {
# The number of words in each sentence
wc <- str_count(l, pattern = boundary("word"))
# The lines containing less than min_words number of words are
# removed
- l <- l[wc >= self$opts[["min_words"]]]
+ l <- l[wc >= private$opts[["min_words"]]]
# Consecutive 'a' and 'i' are replaced with single 'a' or 'i'
@@ -225,6 +200,47 @@ DataCleaner <- R6::R6Class(
private = list(
+ # @field dc_opts The options for the data cleaner object.
+ # min_words -> The minimum number of words per sentence.
+ # line_count -> The number of lines to read and clean at a time.
+ # save_data -> If the combined processed lines should be saved.
+ # output_file -> Name of the output file used to store the data.
+ # sw_file -> The stop words file path.
+ # dict_file -> The dictionary file path.
+ # bad_file -> The bad words file path.
+ # to_lower -> If the words should be converted to lower case.
+ # remove_stop -> If stop words should be removed.
+ # remove_punct -> If punctuation symbols should be removed.
+ # remove_non_dict -> If non dictionary words should be removed.
+ # remove_non_alpha -> If non alphabet symbols should be removed.
+ # remove_extra_space -> If leading, trailing and double spaces
+ # should be removed.
+ # remove_bad -> If bad words should be removed
+ dc_opts = list(
+ "min_words" = 2,
+ "line_count" = 1000,
+ "save_data" = T,
+ "output_file" = "./data/sample-clean.txt",
+ "sw_file" = NULL,
+ "dict_file" = NULL,
+ "bad_file" = NULL,
+ "to_lower" = T,
+ "remove_stop" = F,
+ "remove_punct" = T,
+ "remove_non_dict" = T,
+ "remove_non_alpha" = T,
+ "remove_extra_space" = T,
+ "remove_bad" = F
+ ),
+ # @field sw The list of stop words.
+ sw = list(),
+ # @field bw The list of bad words.
+ bw = list(),
+ # @field dw The list of dictionary words.
+ dw = list(),
# @description
# Performs processing for the \code{clean_files} function.
@@ -239,6 +255,35 @@ DataCleaner <- R6::R6Class(
cl <- self$clean_lines(lines)
+ },
+ # @description
+ # Checks if the given file exists. If it does not exist,
+ # then it tries to load the file from the external data folder of the
+ # package. It throws an error if the file was not found
+ # @param fn The file name.
+ # @param dfn The name of the default file in the external data folder of
+ # the package.
+ # @return The name of the file if it exists, or the full path to the
+ # default file.
+ check_file = function(fn, dfn) {
+ # The required file name
+ rfn <- fn
+ # If the file is not given
+ if (is.null(fn)) {
+ # The file path is set to the default file
+ # included with the wordpredictor package
+ rfn <- system.file("extdata", dfn, package = "wordpredictor")
+ # If the file was not found
+ if (!file.exists(rfn))
+ stop(paste0("The file: ", rfn," does not exist !"))
+ }
+ # If the file name is given but the file does not exist
+ else if (!file.exists(fn)) {
+ # An error message is shown
+ stop(paste0("The file: ", fn," does not exist !"))
+ }
+ return (rfn)
diff --git a/R/data-sampler.R b/R/data-sampler.R
new file mode 100644
index 0000000..dc2989c
--- /dev/null
+++ b/R/data-sampler.R
@@ -0,0 +1,196 @@
+#' It is used to generate data samples from text files.
+#' @description
+#' It provides a method for generating training, testing and
+#' validation data sets from a given input text file. It also provides a method
+#' for generating a sample file of given size or number of lines from an input
+#' text file.
+DataSampler <- R6::R6Class(
+ "DataSampler",
+ inherit = TextFileProcessor,
+ public = list(
+ #' @description
+ #' It initializes the current object. It is used to set the data
+ #' directory, the model directory and the verbose option.
+ #' @param ddir The data directory.
+ #' @param mdir The model directory.
+ #' @param verbose If progress information should be displayed.
+ initialize = function(ddir = "./data",
+                       mdir = "./models",
+                       verbose = 0) {
+     # Remember the folder that holds the model files
+     private$mdir <- mdir
+     # Remember the folder that holds the input data files
+     private$ddir <- ddir
+     # The base class is set up last. No file name and no line count are
+     # passed, only the verbose option
+     super$initialize(NULL, NULL, verbose)
+ },
+ #' @description Generates a sample file of given size from the given
+ #' input file. The file is optionally cleaned and saved.
+ #' @param fn The input file name. It is the short file name relative to
+ #' the ddir. If not given, then the file name is auto generated from
+ #' the type parameter.
+ #' @param ss The number of lines or proportion of lines to sample.
+ #' @param ic If the sample file should be cleaned.
+ #' @param t The type of sample. It can be: 'tr' -> training, 'te' ->
+ #' testing, 'va' -> validation.
+ #' @param is If the sampled data should be saved to a file.
+ generate_sample = function(fn = NULL, ss, ic, t, is) {
+     # The short sample file name is derived from the sample type. An
+     # unknown type is rejected here with a clear message, instead of
+     # failing later because the short name was never set
+     sfn <- switch(as.character(t),
+         "tr" = "train",
+         "te" = "test",
+         "va" = "validate",
+         stop(paste0("Invalid sample type: ", t,
+             ". It should be one of 'tr', 'te' or 'va'")))
+     # If the input file name is not given
+     if (is.null(fn)) {
+         # The input file name is generated from the sample type
+         fn <- paste0(private$ddir, "/", sfn, ".txt")
+     } else {
+         # The full path to the given input file
+         fn <- paste0(private$ddir, "/", fn)
+     }
+     # If the input file does not exist, an error is thrown
+     if (!file.exists(fn)) {
+         stop(paste0("The input file: ", fn, " does not exist"))
+     }
+     # The sample file name
+     sf <- paste0(private$mdir, "/", sfn, ".txt")
+     # The clean sample file name
+     csf <- paste0(private$mdir, "/", sfn, "-clean.txt")
+     # The sampled data. It is set explicitly so that the final return
+     # can never resolve to the utils::data function when no branch
+     # below assigns it
+     data <- NULL
+     # If the cleaned sample file already exists
+     if (file.exists(csf)) {
+         # The information message
+         msg <- paste0("The cleaned sample file: ", csf,
+                       " already exists")
+         # Information message is shown
+         private$display_msg(msg, 2)
+         # The existing cleaned sample is what gets returned when the
+         # data is not saved
+         if (!is) data <- private$read_file(csf, FALSE)
+     } else {
+         # The file that is handed to the data cleaner
+         cf <- sf
+         # If the sample file does not exist
+         if (!file.exists(sf)) {
+             # The information message
+             msg <- paste0("Generating sample file from the file: ", fn)
+             # Information message is shown
+             private$display_msg(msg, 2)
+             # The input file is read
+             data <- private$read_file(fn, FALSE)
+             # A sample size below 1 is a proportion of the line count,
+             # otherwise it is a number of lines
+             lc <- if (ss < 1) round(length(data) * ss) else ss
+             # The line count is kept within the available lines, so
+             # that the sample is never padded with NA values
+             lc <- max(0, min(floor(lc), length(data)))
+             # The sample file data (the first lc lines)
+             data <- data[seq_len(lc)]
+             if (is) {
+                 # The sample file data is saved
+                 private$write_file(data, sf, FALSE)
+             } else if (ic) {
+                 # The sample is not saved but must be cleaned. The
+                 # cleaner reads from a file, so a temporary file is
+                 # used and removed when the method exits
+                 cf <- tempfile(fileext = ".txt")
+                 on.exit(unlink(cf), add = TRUE)
+                 private$write_file(data, cf, FALSE)
+             }
+         } else {
+             # The information message
+             msg <- paste0("The sample file: ", sf, " already exists")
+             # Information message is shown
+             private$display_msg(msg, 2)
+             # The existing sample is returned as is, when it is
+             # neither cleaned nor saved
+             if (!is && !ic) data <- private$read_file(sf, FALSE)
+         }
+         # If the sample file should be cleaned
+         if (ic) {
+             # The options for cleaning the data.
+             # NOTE(review): dc_opts is not declared in this class's
+             # private list; as.list() guarantees a list even when the
+             # field is NULL - confirm against the base class
+             opts <- as.list(private$dc_opts)
+             # The line count is set to 5000
+             opts[["line_count"]] <- 5000
+             # The output file name
+             opts[["output_file"]] <- csf
+             # If the data should be saved
+             opts[["save_data"]] <- is
+             # The data cleaner object is created
+             dc <- DataCleaner$new(cf, opts, verbose = private$verbose)
+             # The sample file is cleaned
+             data <- dc$clean_file()
+         }
+     }
+     # If the data should not be saved
+     if (!is) {
+         # The data is returned
+         return(data)
+     }
+ },
+ #' @description
+ #' It generates training, testing and validation data sets
+ #' from the given input file. It first reads the input file, whose name is
+ #' given relative to the data directory. It generates random indexes for the
+ #' data. It partitions the data into training, testing and validation
+ #' sets, according to the given parameters. The files are named
+ #' train.txt, test.txt and validate.txt. The files are saved to the given
+ #' output folder.
+ #' @param fn The input file name. It should be relative to the ddir.
+ #' @param dir The name of the output folder.
+ #' @param percs The size of the training, testing and validation sets.
+ generate_data = function(fn, dir, percs) {
+     # The information message is shown
+     private$display_msg(
+         "Generating training, testing and validation data sets...", 1)
+     # The input file path is built relative to the data directory
+     fn <- paste0(private$ddir, "/", fn)
+     # If the input file does not exist, an error is thrown
+     if (!file.exists(fn)) {
+         stop(paste0("The input file: ", fn, " does not exist"))
+     }
+     # The paths of the train, test and validation files
+     out <- paste0(dir, "/", c("train", "test", "validate"), ".txt")
+     # If the train, test and validation files already exist
+     if (all(file.exists(out))) {
+         # The information message
+         msg <- "The train, test and validate files already exist"
+         # The information message is shown
+         private$display_msg(msg, 1)
+     } else {
+         # The input file is read
+         data <- private$read_file(fn, FALSE)
+         # The number of lines in the data
+         lc <- length(data)
+         # The lines are shuffled. sample.int also copes with an
+         # empty file, unlike sampling from 1:lc
+         rd <- data[sample.int(lc)]
+         # The size of each set. Each size is capped by the lines that
+         # are still unused, so rounding can never run past the data
+         n_train <- min(lc, round(lc * percs[["train"]]))
+         n_test <- min(lc - n_train, round(lc * percs[["test"]]))
+         n_val <- min(lc - n_train - n_test,
+                      round(lc * percs[["validate"]]))
+         # The sets are consecutive, non-overlapping slices of the
+         # shuffled data, so no test or validation line is also a
+         # training line
+         train_ds <- rd[seq_len(n_train)]
+         test_ds <- rd[n_train + seq_len(n_test)]
+         validate_ds <- rd[n_train + n_test + seq_len(n_val)]
+         # The training data is written to file
+         private$write_file(train_ds, out[1], FALSE)
+         # The testing data is written to file
+         private$write_file(test_ds, out[2], FALSE)
+         # The validation data is written to file
+         private$write_file(validate_ds, out[3], FALSE)
+     }
+ }
+ ),
+ private = list(
+ # @field ddir The folder containing the data files
+ ddir = "./data",
+ # @field mdir The folder containing the model files
+ mdir = "./models"
+ )
diff --git a/R/model-evaluator.R b/R/model-evaluator.R
deleted file mode 100644
index ca02aa1..0000000
--- a/R/model-evaluator.R
+++ /dev/null
@@ -1,1136 +0,0 @@
-#' It is used to evaluate the accuracy and performance of the model.
-#' @description
-#' It provides methods that perform extrinsic and intrinsic model
-#' evaluation. It also provides methods for determining the memory and time
-#' requirements for generating the model. It also provides a method for
-#' determining how much memory is used by the final model.
-#' @details
-#' It provides a method that performs intrinsic model evaluation based
-#' on Perplexity. It also provides a method that performs extrinsic model
-#' evaluation based on accuracy. It provides a method for determining how much
-#' memory and time is needed to generate a model for different input data
-#' sizes. It provides a method for determining how much memory is needed by
-#' the final model.
-ModelEvaluator <- R6::R6Class(
- "ModelEvaluator",
- inherit = TextFileProcessor,
- public = list(
- #' @field tp The transition probabilities data frame.
- tp = NULL,
- #' @field wl The list of unique words.
- wl = NULL,
- #' @field dp The default probability is equal to 1/(N+V), where N is the
- #' number of words in the sentence, V is the number of words in the
- #' vocabulary.
- dp = NULL,
- #' @field model The maximum number of ngrams supported by the model.
- model = 4,
- #' @field dc_opts The options for the data cleaner object.
- #' min_words -> The minimum number of words per sentence.
- #' line_count -> The number of lines to read and clean at a time.
- #' sw_file -> The stop words file path.
- #' dict_file -> The dictionary file path.
- #' bad_file -> The bad words file path.
- #' to_lower -> If the words should be converted to lower case.
- #' remove_stop -> If stop words should be removed.
- #' remove_punc -> If punctuation symbols should be removed.
- #' remove_non_dict -> If non dictionary words should be removed.
- #' remove_non_alpha -> If non alphabet symbols should be removed.
- #' remove_extra_space -> If leading, trailing and double spaces
- #' should be removed.
- #' remove_bad -> If bad words should be removed
- dc_opts = list(
-     "min_words" = 2,
-     "line_count" = 1000,
-     "sw_file" = "./data/stop-words.txt",
-     "dict_file" = "./data/dict-no-bad.txt",
-     "bad_file" = "./data/bad-words.txt",
-     "to_lower" = TRUE,
-     "remove_stop" = FALSE,
-     "remove_punc" = TRUE,
-     "remove_non_dict" = TRUE,
-     "remove_non_alpha" = TRUE,
-     "remove_extra_space" = TRUE,
-     "remove_bad" = FALSE
- ),
- #' @field tg_opts The options for the token generator obj.
- #' min_freq -> All ngrams with frequency less than min_freq are
- #' ignored.
- #' n -> The ngram size.
- #' save_ngrams -> If the ngram data should be saved.
- #' line_count -> The number of lines to process at a time.
- #' stem_words -> If words should be converted to their stem.
- #' dir -> The dir where the output file should be saved.
- #' format -> The format for the output. There are two options.
- #' 'plain' -> The data is stored in plain text.
- #' 'obj' -> The data is stored as a R obj.
- # Each option name appears exactly once, so a lookup by name is unambiguous
- tg_opts = list(
-     "min_freq" = -1,
-     "n" = 1,
-     "save_ngrams" = TRUE,
-     "line_count" = 5000,
-     "stem_words" = FALSE,
-     "dir" = "./data/models",
-     "format" = "obj"
- ),
- #' @field ssize The sample size in Mb.
- ssize = 30,
- #' @field ddir The folder containing the data files
- ddir = "./data",
- #' @field mdir The folder containing the model files
- mdir = "./data/models",
- #' @description
- #' It creates a new object. The maximum ngram number, sample size, data
- #' and model directories and the verbose option are stored. Custom data
- #' cleaner and token generator options are merged into the defaults.
- #' @param model The maximum ngram number supported by the model.
- #' @param ssize The sample size in Mb.
- #' @param ddir The data directory.
- #' @param mdir The model directory.
- #' @param dc_opts The data cleaner options.
- #' @param tg_opts The token generator options.
- #' @param verbose If progress information should be displayed.
- initialize = function(model = 4,
-                       ssize = 30,
-                       ddir = "./data",
-                       mdir = "./data/models",
-                       dc_opts = list(),
-                       tg_opts = list(),
-                       verbose = 0) {
-     # The parent class is set up first
-     super$initialize(NULL, NULL, verbose)
-     # The scalar settings are copied to the object
-     self$model <- model
-     self$ssize <- ssize
-     self$ddir <- ddir
-     self$mdir <- mdir
-     # Given data cleaner options override the matching defaults
-     if (length(dc_opts) > 0) {
-         self$dc_opts <- modifyList(self$dc_opts, dc_opts)
-     }
-     # Given token generator options override the matching defaults
-     if (length(tg_opts) > 0) {
-         self$tg_opts <- modifyList(self$tg_opts, tg_opts)
-     }
-     # No transition probabilities are loaded yet
-     self$tp <- NULL
- },
- #' @description
- #' It compares the performance of the specified models by
- #' plotting the performance statistics. The models are specified with
- #' the type parameter.
- #' @param type The models to compare. It can be:
- #' 'basic' -> One model for each ngram.
- #' 'grouped' -> One model for each group and ngram. For
- #' e.g For data size 5 Mb, there are 3 models for the ngrams 2:4.
- #' Any other value raises an error.
- #' @param opts The options for plotting the data.
- #' 'group' -> The field to group by. Used by the 'grouped' type.
- #' 'title' -> The main plot title. Used by the 'grouped' type. The
- #' 'basic' type uses a fixed title.
- #' 'subtitle' -> The plot sub title.
- #' @return The performance stats.
- performance_comparision = function(type, opts) {
-     # If the type is 'basic'
-     if (type == 'basic') {
-         # The performance stats are read
-         pstats <- self$read_obj(paste0(self$mdir, "/pstats.RDS"))
-         # The model configuration is read
-         config <- self$read_obj(paste0(self$mdir, "/config.RDS"))
-         # The y-axis values
-         y <- list("m" = NULL, "t" = NULL, "p" = NULL, "a" = NULL)
-         # The metrics of each model are collected. seq_len() gives no
-         # iterations when the configuration holds no ngram model.
-         for (i in seq_len(config$model - 1)) {
-             # The stats of the current model
-             ps <- pstats[[i]]
-             # The y-axis values are updated
-             y[["m"]] <- c(y[["m"]], self$format_size(ps[["memory"]]))
-             y[["t"]] <- c(y[["t"]], ps[["time"]])
-             y[["p"]] <- c(y[["p"]], ps[["intrinsic"]][["mean"]])
-             y[["a"]] <- c(y[["a"]], ps[["extrinsic"]][["valid_perc"]])
-         }
-         # The data frame containing the data to be plotted
-         df <- data.frame(
-             "n" = 2:(length(pstats) + 1),
-             "m" = y[["m"]],
-             "t" = y[["t"]],
-             "p" = y[["p"]],
-             "a" = y[["a"]]
-         )
-         # The options for plotting
-         opts <- list(
-             "type" = "basic",
-             "title" = "Variation of performance with ngram size",
-             "subtitle" = opts[['subtitle']])
-     }
-     # If the type is 'grouped'
-     else if (type == 'grouped') {
-         # The group name
-         g <- opts[["group"]]
-         # The group items. Each sub folder of the model folder is one item
-         items <- list.dirs(self$mdir, recursive = FALSE, full.names = FALSE)
-         # The output data frame
-         df <- data.frame()
-         # For each folder, the performance stats of all ngram models are
-         # collected
-         for (item in items) {
-             # The stats are read
-             pstats <- self$read_obj(
-                 paste0(self$mdir, "/", item, "/pstats.RDS"))
-             # The config is read
-             config <- self$read_obj(
-                 paste0(self$mdir, "/", item, "/config.RDS"))
-             # The memory usage for each ngram
-             mem <- private$process_stats(pstats, "memory", config$model)
-             # The time taken for each ngram
-             tm <- private$process_stats(pstats, "time", config$model)
-             # The perplexity for each ngram
-             ppl <- private$process_stats(
-                 pstats, "perplexity", config$model)
-             # The accuracy score for each ngram
-             acc <- private$process_stats(pstats, "accuracy", config$model)
-             # The temporary data frame for the current group item
-             tdf <- data.frame(
-                 rep(item, (config$model - 1)),
-                 mem, tm, ppl, acc, 2:(length(pstats) + 1))
-             # The column names are set
-             names(tdf) <- c(g, "m", "t", "p", "a", "n")
-             # The temporary data frame is appended to the output
-             df <- rbind(df, tdf)
-         }
-         # The options for plotting
-         opts <- list(
-             "type" = "grouped",
-             "group" = g,
-             "title" = opts[['title']],
-             "subtitle" = opts[['subtitle']])
-         # The group values as numbers. Values that are not numbers become
-         # NA; the coercion warning is expected and therefore muted.
-         gnum <- suppressWarnings(as.numeric(as.character(df[[g]])))
-         # Only when every group value is a number, the data is ordered
-         # numerically. Other group names are left untouched.
-         if (!anyNA(gnum)) {
-             # The group column is converted to numeric
-             df[[g]] <- gnum
-             # The data is ordered by group
-             df <- df[order(df[[g]]), ]
-             # The group column is converted to factor
-             df[[g]] <- as.factor(df[[g]])
-         }
-     }
-     # Any other type cannot be compared
-     else {
-         stop("The type must be either 'basic' or 'grouped'")
-     }
-     # The performance stats are plotted
-     private$plot_stats(df, opts)
-     # The performance stats are returned
-     return(df)
- },
- #' @description
- #' For each model from the 2-gram model up to the maximum model number,
- #' it performs intrinsic and extrinsic evaluation and measures the time
- #' taken and the size of the transition probabilities data. The
- #' performance metrics are plotted on one page. The performance
- #' statistics are saved to the file pstats.RDS in the model folder.
- performance_evaluation = function() {
-     # The maximum model number
-     mmax <- self$model
-     # Models start at the 2-gram model
-     if (mmax < 2) {
-         stop("The model number must be at least 2")
-     }
-     # The model number is changed inside the loop. It is put back when
-     # the method ends, also when an evaluation fails.
-     on.exit(self$model <- mmax, add = TRUE)
-     # The performance stats
-     pstats <- list()
-     # The y-axis values
-     y <- list("m" = NULL, "t" = NULL, "p" = NULL, "a" = NULL)
-     # For each model the performance metrics are measured
-     for (m in 2:mmax) {
-         # The transition probabilities data is reset, so that the data of
-         # the current model gets loaded
-         self$tp <- NULL
-         # The model number is set
-         self$model <- m
-         # The intrinsic and extrinsic evaluation is performed and timed
-         time_taken <- system.time({
-             # Intrinsic evaluation is performed
-             istats <- self$intrinsic_evaluation()
-             # Extrinsic evaluation is performed
-             estats <- self$extrinsic_evaluation()
-         })
-         # The memory used is the size of the transition probabilities data
-         memory_used <- object_size(self$tp)
-         # The performance stats are updated
-         pstats[[m - 1]] <- list(
-             "intrinsic" = istats,
-             "extrinsic" = estats,
-             "memory" = memory_used,
-             "time" = time_taken[[3]]
-         )
-         # The y-axis values are updated
-         y[["m"]] <- c(y[["m"]], self$format_size(memory_used))
-         y[["t"]] <- c(y[["t"]], time_taken[[3]])
-         y[["p"]] <- c(y[["p"]], istats$mean)
-         y[["a"]] <- c(y[["a"]], estats$valid_perc)
-     }
-     # The information message is shown
-     self$display_msg("Saving model performance stats...", 1)
-     # The performance stats are saved
-     self$save_obj(pstats, paste0(self$mdir, "/pstats.RDS"))
-     # The data to be plotted
-     df <- data.frame(
-         "n" = 2:(length(pstats) + 1),
-         "m" = y[["m"]],
-         "t" = y[["t"]],
-         "p" = y[["p"]],
-         "a" = y[["a"]]
-     )
-     # The performance stats are plotted
-     private$plot_stats(df, list("type" = "basic"))
- },
- #' @description
- #' It loads the given model configuration file and copies the stored
- #' settings to the current object.
- #' @param fn The config file name.
- load_config = function(fn) {
-     # The model configuration is read
-     config <- self$read_obj(fn)
-     # The configuration is copied to the object attributes
-     self$model <- config$model
-     self$ssize <- config$ssize
-     self$dc_opts <- config$dc_opts
-     # save_config also stores the token generator options. They are
-     # restored when present, so that options such as stem_words match the
-     # stored model. A config without them keeps the current options.
-     if (!is.null(config[["tg_opts"]])) {
-         self$tg_opts <- config[["tg_opts"]]
-     }
-     self$ddir <- config$ddir
-     self$mdir <- config$mdir
- },
- #' @description
- #' It loads the model stored in the mdir folder. The configuration file
- #' config.RDS is loaded first, then the transition probabilities data.
- load_model = function() {
-     # The settings stored in the model folder are applied to this object
-     self$load_config(paste0(self$mdir, "/config.RDS"))
-     # The transition probabilities for the configured model are read
-     private$read_tp_data(self$model)
- },
- #' @description
- #' It saves the model configuration to the file config.RDS in the model
- #' folder.
- save_config = function() {
-     # Progress is reported
-     self$display_msg("Saving model configuration...", 1)
-     # The settings that describe the model are written to config.RDS
-     self$save_obj(
-         list(
-             "model" = self$model,
-             "ssize" = self$ssize,
-             "ddir" = self$ddir,
-             "mdir" = self$mdir,
-             "dc_opts" = self$dc_opts,
-             "tg_opts" = self$tg_opts
-         ),
-         paste0(self$mdir, "/config.RDS"))
- },
- #' @description
- #' It generates the model for the configured ngram number, data sample
- #' size and data cleaning options. The training data is sampled and
- #' cleaned, the ngram tokens and the transition probabilities are
- #' generated and the model configuration is saved.
- generate_model = function() {
-     # The training data file name
-     train_fn <- paste0(self$ddir, "/train.txt")
-     # The data analyzer object is created
-     da <- DataAnalyzer$new(train_fn, self$verbose)
-     # The training, testing and validation data sets are generated
-     da$generate_data(self$ddir,
-         list(train = .8, test = .1, validate = .1))
-     # The size of the training data in Mb
-     obj_size <- file.size(train_fn) / 10^6
-     # The proportion of data to sample
-     prop <- (self$ssize / obj_size)
-     # generate_clean_sample reads a value below 1 as a proportion and any
-     # other value as a number of lines. When the requested sample is at
-     # least as large as the training file, the whole file is used, so the
-     # number of lines in the file is passed instead of the proportion.
-     if (prop >= 1) {
-         ss <- length(self$read_file(train_fn, FALSE))
-     }
-     else {
-         ss <- prop
-     }
-     # The sample is taken and cleaned
-     self$generate_clean_sample(ss, TRUE, 'tr')
-     # The model directory is set
-     self$tg_opts$dir <- self$mdir
-     # The clean train data file name
-     fn <- paste0(self$mdir, "/train-clean.txt")
-     # For each ngram number, the ngram token file is generated
-     for (i in seq_len(self$model)) {
-         # The ngram number is set
-         self$tg_opts$n <- i
-         # The TokenGenerator object is created
-         tg <- TokenGenerator$new(fn, self$tg_opts, self$verbose)
-         # The ngram tokens are generated
-         tg$generate_tokens()
-     }
-     # For each ngram number from 2 up to the model number, the transition
-     # probabilities are generated. The range is empty for a model number
-     # below 2.
-     for (i in seq_len(max(self$model - 1, 0)) + 1) {
-         # The TPGenerator object is created
-         tp <- TPGenerator$new()
-         tp_opts <- list(
-             "n_range" = 1:i,
-             "save_tp" = TRUE,
-             "format" = "obj",
-             "dir" = self$mdir
-         )
-         # The transition probabilities are generated
-         tp$generate_tp(tp_opts)
-     }
-     # The model config is saved
-     self$save_config()
- },
- #' @description
- #' Evaluates the model using intrinsic evaluation based on
- #' Perplexity. First a validation data set containing 1000 lines is
- #' generated. It is then cleaned. The first 20 sentences of the cleaned
- #' data, or all of them if there are fewer, are taken. For each
- #' sentence, the Perplexity is calculated.
- #' @return The minimum, maximum and mean Perplexity score.
- intrinsic_evaluation = function() {
-     # The transition probabilities data is read
-     private$read_tp_data(self$model)
-     # The validation sample is taken and cleaned
-     self$generate_clean_sample(1000, TRUE, 'va')
-     # The validation sample data is read
-     data <- self$read_file(
-         paste0(self$mdir, "/validate-clean.txt"), FALSE)
-     # At most 20 sentences are taken. The index never runs past the end
-     # of the data, which would add NA entries to the sample.
-     data <- data[seq_len(min(20, length(data)))]
-     # The information message
-     msg <- paste0(
-         "Calculating Perplexity for ", length(data), " sentences")
-     # The information message is shown
-     self$display_msg(msg, 1)
-     # The perplexity of each sentence
-     pl <- numeric(length(data))
-     # The Perplexity of each sentence in the sample is calculated
-     for (i in seq_along(data)) {
-         # The current sentence
-         line <- data[[i]]
-         # The line is split on space
-         words <- str_split(line, " ")[[1]]
-         # The perplexity for the line is calculated
-         p <- self$calc_perplexity(words)
-         # The information message
-         msg <- paste0(
-             "Perplexity of the sentence '",
-             line, "' is: ", p)
-         # The information message is shown
-         self$display_msg(msg, 2)
-         # The list of perplexities is updated
-         pl[i] <- p
-         # If the number of processed lines is divisible by 10
-         if (i %% 10 == 0) {
-             # The information message is shown
-             self$display_msg(
-                 paste0(i, " lines have been processed"), 1)
-         }
-     }
-     # The perplexity stats
-     stats <- list(
-         "min" = min(pl),
-         "max" = max(pl),
-         "mean" = mean(pl))
-     return(stats)
- },
- #' @description
- #' Evaluates the model using extrinsic evaluation based on
- #' Accuracy. First a validation data set containing 1000 lines is
- #' generated. It is then cleaned. The first 100 sentences of the cleaned
- #' data, or all of them if there are fewer, are taken. For each
- #' sentence, the model is used to predict the last word from the
- #' previous words. A prediction is considered to be correct if one of
- #' the predicted words matched the actual word.
- #' @return The number and percentage of correct and incorrect
- #' predictions.
- extrinsic_evaluation = function() {
-     # The transition probabilities data is read
-     private$read_tp_data(self$model)
-     # The validation sample is taken and cleaned
-     self$generate_clean_sample(1000, TRUE, 'va')
-     # The validation sample data is read
-     data <- self$read_file(
-         paste0(self$mdir, "/validate-clean.txt"), FALSE)
-     # At most 100 sentences are taken. The index never runs past the end
-     # of the data, which would add NA entries to the sample.
-     data <- data[seq_len(min(100, length(data)))]
-     # The information message
-     msg <- paste0(
-         "Predicting the next word for ", length(data), " sentences")
-     # The information message is shown
-     self$display_msg(msg, 1)
-     # The statistics
-     stats <- list("valid" = 0, "invalid" = 0)
-     # The last word for each sentence is predicted
-     for (i in seq_along(data)) {
-         # The line is split on space
-         words <- str_split(data[[i]], " ")[[1]]
-         # The number of words in the line
-         wc <- length(words)
-         # The word to predict
-         w <- words[wc]
-         # The previous words used to predict the word
-         pw <- words[seq_len(wc - 1)]
-         # If the words should be stemmed
-         if (self$tg_opts[["stem_words"]]) {
-             # The previous words are stemmed
-             pw <- wordStem(pw)
-         }
-         # The next word is predicted. The default number of predictions
-         # is requested.
-         res <- self$predict_word(pw)
-         # If one of the predicted words matches the actual word. The
-         # predicted words are extracted as a character vector, so each
-         # of them is compared with the actual word.
-         if (w %in% res[["words"]]) {
-             stats[["valid"]] <- stats[["valid"]] + 1
-             # The information message
-             self$display_msg(
-                 paste0("The word: ", w, " was predicted"), 3)
-         }
-         # If the predicted word does not match
-         else {
-             stats[["invalid"]] <- stats[["invalid"]] + 1
-             # The information message
-             self$display_msg(
-                 paste0("The word: ", w, " could not be predicted"), 3)
-         }
-         # If the number of processed sentences is divisible by 10
-         if (i %% 10 == 0) {
-             # The information message is shown
-             self$display_msg(
-                 paste0(i, " sentences have been processed"), 1)
-         }
-     }
-     # The percentage of valid predictions
-     stats[["valid_perc"]] <-
-         (stats[["valid"]] / (stats[["valid"]] + stats[["invalid"]])) * 100
-     # The percentage of invalid predictions
-     stats[["invalid_perc"]] <- 100 - stats[["valid_perc"]]
-     return(stats)
- },
- #' @description
- #' Predicts the next word for the given previous words. At most the
- #' last 3 previous words are used. They are joined to a key, the key is
- #' hashed and the hash is looked up in the transition probabilities
- #' data. If there is a match, the next words with the highest
- #' probabilities are returned. If there is no match, the first word of
- #' the key is dropped and the lookup is repeated, until only the last
- #' word is left. If nothing matched, an empty result is returned.
- #' @param words A character vector of previous words or a single string
- #' containing the previous words separated by space.
- #' @param count The number of results to return.
- #' @param dc A DataCleaner object. If it is given, then the given words
- #' are cleaned. If the stem_words option is set in the token generator
- #' options, then the cleaned words are also stemmed.
- #' @return A list with the elements 'found', 'words' and 'probs'.
- predict_word = function(words, count = 3, dc = NULL) {
-     # The model data is loaded if needed
-     private$read_tp_data(self$model)
-     # The working copy of the input
-     w <- words
-     # Cleaning is only done when a DataCleaner was passed in
-     if (!is.null(dc)) {
-         # Several words are first joined to one line of text
-         if (length(w) > 1) {
-             w <- paste0(w, collapse = " ")
-         }
-         # The text is cleaned
-         w <- dc$clean_lines(w)
-         # Stemming is applied if the model was built with stemming
-         if (self$tg_opts[["stem_words"]]) {
-             w <- wordStem(w)
-         }
-     }
-     # A single string is broken into words
-     if (length(w) == 1) {
-         w <- strsplit(w, " ")[[1]]
-     }
-     # The answer that is returned when nothing matches
-     result <- list("found" = FALSE, "words" = "", "probs" = "")
-     # The number of available previous words
-     n_words <- length(w)
-     # Without previous words there is nothing to look up
-     if (n_words == 0) {
-         return(result)
-     }
-     # Only the trailing 3 words, or fewer, take part in the lookup
-     pw <- w[max(1, n_words - 2):n_words]
-     # The number of words that take part in the lookup
-     n_pw <- length(pw)
-     # The lookup starts with the longest key. Each round drops the
-     # leading word of the key.
-     for (first in seq_len(n_pw)) {
-         # The lookup key for this round
-         k <- paste(pw[first:n_pw], collapse = "_")
-         # The rows whose prefix hash equals the hash of the key
-         res <- self$tp[self$tp$pre == digest2int(k), ]
-         # A match ends the search
-         if (nrow(res) > 0) {
-             # The matches, most probable first
-             sres <- res[order(res$prob, decreasing = TRUE), ]
-             # The number of matches
-             rcount <- nrow(sres)
-             # No more than the requested number of results is kept
-             rc <- if (rcount > count) count else rcount
-             # The positions of the kept results
-             keep <- 1:rc
-             # The result is filled in
-             result[["words"]] <- as.character(self$wl$pre[sres$nw[keep]])
-             result[["probs"]] <- sres$prob[keep]
-             result[["found"]] <- TRUE
-             # The match is reported
-             self$display_msg(
-                 paste0("The ngram key: ", k, " was found"), 3)
-             break
-         }
-         # The miss is reported
-         self$display_msg(
-             paste0("The ngram key: ", k, " was not found"), 3)
-         # The back off is reported with the number of words in the key
-         # that was just tried
-         self$display_msg(
-             paste0("Backing off to ", (n_pw - first + 1), "-gram"), 3)
-     }
-     return(result)
- },
- #' @description
- #' Calculates the probability of the given word given the previous
- #' words. The word is converted to a number by looking up its position
- #' in the list of unique words. The previous words are joined to a key
- #' and the key is converted to a numeric hash using the digest2int
- #' function. The hash and the word number are looked up in the
- #' transition probabilities data. If they were found, then the stored
- #' probability is returned. If not, then the first previous word is
- #' dropped and the lookup is repeated. This is known as backoff. If no
- #' key matched, then the probability of the word itself is taken from
- #' the list of unique words. If the word is not in that list, then the
- #' default probability is returned.
- #' @param word The word whose probability is to be calculated.
- #' @param pw The previous words.
- #' @return The probability of the word given the previous words.
- get_word_prob = function(word, pw) {
-     # The method cannot work without the default probability
-     if (is.null(self$dp)) {
-         stop("The default probability is not set !")
-     }
-     # The number of previous words
-     n_pw <- length(pw)
-     # The answer starts out as the default probability
-     prob <- self$dp
-     # Tells if a key together with the word was matched
-     found <- FALSE
-     # The position of the word in the list of unique words
-     nw <- match(word, self$wl$pre)
-     # An unknown word cannot be matched
-     if (is.na(nw)) {
-         self$display_msg(
-             paste0("The next word: ", word, " was not found"), 3)
-     }
-     # A known word is looked up with ever shorter keys
-     else if (n_pw > 0) {
-         for (first in seq_len(n_pw)) {
-             # The lookup key for this round
-             k <- paste(pw[first:n_pw], collapse = "_")
-             # The numeric hash of the key
-             h <- digest2int(k)
-             # The rows for this key and this next word
-             res <- self$tp[self$tp$pre == h & self$tp$nw == nw, ]
-             # A match ends the search
-             if (nrow(res) > 0) {
-                 found <- TRUE
-                 # The stored probability is used
-                 prob <- as.numeric(res$prob)
-                 # The match is reported
-                 self$display_msg(
-                     paste0("The ngram key: ",
-                         k, " and the next word: ",
-                         word, " were found"), 3)
-                 break
-             }
-             # The miss is reported
-             self$display_msg(
-                 paste0("The ngram key: ",
-                     k, " and the next word: ",
-                     word, " were not found"), 3)
-             # The back off is reported with the number of words in the
-             # key that was just tried
-             self$display_msg(
-                 paste0("Backing off to ", (n_pw - first + 1), "-gram"), 3)
-         }
-     }
-     # Without a match, the probability of the word itself is looked up
-     if (!found) {
-         # Marks the entries of the unique words list that equal the word
-         hits <- self$wl$pre == word
-         # The word is not in the list, so the default stays in place
-         if (sum(hits) == 0) {
-             self$display_msg("Using default probability", 3)
-         }
-         # The word is in the list, so its probability is used
-         else {
-             prob <- as.numeric(self$wl[hits, "prob"])
-         }
-     }
-     return(prob)
- },
- #' @description
- #' The Perplexity for the given sentence is calculated. For
- #' each word, the probability of the word given the previous words is
- #' calculated. The Perplexity is the nth root of the inverse of the
- #' product of the probabilities, where n is the number of words in the
- #' sentence. It is calculated from the sum of the log probabilities,
- #' because the product itself becomes 0 for long sentences. If the
- #' stem_words tokenization option was specified, then the previous words
- #' are converted to their stem.
- #' @param words The list of words.
- #' @return The perplexity of the given list of words, rounded to a whole
- #' number. NA is returned for an empty list of words.
- calc_perplexity = function(words) {
-     # The number of words in the sentence
-     wl <- length(words)
-     # The Perplexity is not defined for a sentence without words
-     if (wl == 0) {
-         return(NA_real_)
-     }
-     # The sum of the log word probabilities
-     log_prob_sum <- 0
-     # For each word, the probability of the word is calculated
-     for (i in seq_len(wl)) {
-         # The word
-         word <- words[i]
-         # The list of previous words. The first word has none.
-         pw <- NULL
-         # If i is more than 1
-         if (i > 1) {
-             # At most model-1 previous words are used, so the start index
-             # moves forward once i is larger than the model number
-             start <- max(1, i - (self$model - 1))
-             # The list of previous words
-             pw <- words[start:(i - 1)]
-             # If the words should be stemmed
-             if (self$tg_opts[["stem_words"]]) {
-                 # The previous words are stemmed
-                 pw <- wordStem(pw)
-             }
-         }
-         # The word probability
-         prob <- self$get_word_prob(word, pw)
-         # The log probability sum is updated
-         log_prob_sum <- log_prob_sum + log(prob)
-     }
-     # exp(-sum(log(p))/n) equals the nth root of 1/prod(p)
-     p <- round(exp(-log_prob_sum / wl), 0)
-     return(p)
- },
- #' @description
- #' Generates a sample file from the train.txt, test.txt or validate.txt
- #' file in the data folder. The sample consists of the first lines of
- #' that file and is saved to the model folder. The sample file may then
- #' be cleaned. Nothing is done if the cleaned sample file already
- #' exists.
- #' @param ss The number of lines to sample. A value less than 1 is taken
- #' as the proportion of lines to sample. The sample never has more lines
- #' than the input file.
- #' @param ic If the sample file should be cleaned.
- #' @param t The type of sample. It can be:
- #' 'tr' -> training
- #' 'te' -> testing
- #' 'va' -> validation
- #' Any other value raises an error.
- generate_clean_sample = function(ss, ic, t) {
-     # The base file name for the given sample type
-     sfn <- switch(t,
-         'tr' = 'train',
-         'te' = 'test',
-         'va' = 'validate',
-         stop("The sample type must be one of 'tr', 'te' or 'va'"))
-     # The input file name
-     fn <- paste0(self$ddir, "/", sfn, ".txt")
-     # The sample file name
-     sf <- paste0(self$mdir, "/", sfn, ".txt")
-     # The clean sample file name
-     csf <- paste0(self$mdir, "/", sfn, "-clean.txt")
-     # If the cleaned sample file already exists
-     if (file.exists(csf)) {
-         # The information message
-         msg <- paste0("The cleaned sample file: ", csf,
-             " already exists")
-         # Information message is shown
-         self$display_msg(msg, 2)
-     }
-     else {
-         # If the sample file does not exist
-         if (!file.exists(sf)) {
-             # The information message
-             msg <- paste0("Generating sample file from the file: ", fn)
-             # Information message is shown
-             self$display_msg(msg, 2)
-             # The input file is read
-             data <- self$read_file(fn, FALSE)
-             # If the sample size is less than 1, it is a proportion
-             if (ss < 1) {
-                 # The number of lines in the sample file
-                 lc <- round(length(data) * ss)
-             }
-             else {
-                 lc <- ss
-             }
-             # The sample cannot have more lines than the input file.
-             # Without the cap, the missing lines would be written as NA.
-             lc <- max(0, floor(min(lc, length(data))))
-             # The sample file data
-             data <- data[seq_len(lc)]
-             # The sample file data is saved
-             self$write_file(data, sf, FALSE)
-         }
-         # If the sample file should be cleaned
-         if (ic) {
-             # The options for cleaning the data
-             opts <- self$dc_opts
-             # The line count is set to 5000
-             opts[["line_count"]] <- 5000
-             # The output file name
-             opts[["output_file"]] <- csf
-             # The data cleaner object is created
-             dc <- DataCleaner$new(sf, opts, verbose = self$verbose)
-             # The sample file is cleaned
-             dc$clean_file()
-         }
-     }
- }
- ),
- private = list(
- # @description It builds one ggplot2 plot of points with a fitted line.
- # Depending on the 'type' option, the plot is a simple plot or it has
- # one line and colour per group.
- # @param data A data frame containing the data to be plotted. It should
- # have the variables x and y. For the 'grouped' type it also needs the
- # group variable.
- # @param opts The options for plotting the data. It contains:
- # 'x_lab' -> The x-axis label.
- # 'y_lab' -> The y-axis label.
- # 'type' -> The type of plot. It can be 'basic' or 'grouped'.
- # 'group' -> The field to group by.
- # @return A ggplot object representing the plot.
- plot_graph = function(data, opts) {
-     # The upper limit of the y-axis
-     y_max <- max(as.numeric(as.character(data$y)))
-     # The kind of plot that was asked for
-     type <- opts[['type']]
-     # A simple plot gets a linear fit
-     if (type == 'basic') {
-         p <- ggplot(data, aes(x, y)) +
-             geom_point() +
-             geom_smooth(method='lm', formula= y~x)
-     }
-     # A grouped plot gets one loess fit per group
-     else if (type == 'grouped') {
-         # The data is duplicated to prevent the warnings
-         data <- rbind(data, data)
-         # The name of the group variable
-         g <- opts$group
-         p <- ggplot(data, aes_string("x", "y", group = g, col = g)) +
-             geom_point() +
-             geom_smooth(
-                 method = "loess", formula = y ~ x, span = 1.4, se = FALSE)
-     }
-     # The labels and the axis limits are the same for both kinds of plot
-     p <- p +
-         labs(x = opts[["x_lab"]], y = opts[["y_lab"]]) +
-         xlim(min(data$x), max(data$x)) +
-         coord_cartesian(ylim = c(0,y_max))
-     return(p)
- },
- # @description
- # It plots the given model performance stats on one page. For the
- # 'basic' type 5 plots are made: memory, time taken, mean Perplexity and
- # accuracy against the ngram size, and Perplexity against accuracy. For
- # the 'grouped' type the first 4 of these plots are made, each with one
- # line per group.
- # @param data The data to plot.
- # @param opts The options for plotting the data. It contains:
- # 'type' -> The type of plot. It can be 'basic' or 'grouped'.
- # 'group' -> The field to group by.
- # 'title' -> The main plot title.
- # 'subtitle' -> The plot sub title.
- plot_stats = function(data, opts) {
-     # Progress is reported
-     self$display_msg("Plotting model performance stats...", 1)
-     # Simple plots
-     if (opts[["type"]] == 'basic') {
-         # The x values, y values and axis labels of every plot
-         specs <- list(
-             list(x = data$n, y = data$m,
-                 x_lab = "ngram", y_lab = "memory"),
-             list(x = data$n, y = data$t,
-                 x_lab = "ngram", y_lab = "time"),
-             list(x = data$n, y = data$p,
-                 x_lab = "ngram", y_lab = "perplexity"),
-             list(x = data$n, y = data$a,
-                 x_lab = "ngram", y_lab = "accuracy"),
-             list(x = data$a, y = data$p,
-                 x_lab = "accuracy", y_lab = "perplexity"))
-         # One plot is built from every specification
-         plots <- lapply(specs, function(s) {
-             private$plot_graph(
-                 data.frame(x = s[["x"]], y = s[["y"]]),
-                 list("x_lab" = s[["x_lab"]],
-                     "y_lab" = s[["y_lab"]],
-                     "type" = opts[["type"]]))
-         })
-         # The plots are placed on a single page, in the order given above
-         page <- Reduce(`+`, plots)
-         # The titles are added and the page is shown
-         print(page + plot_annotation(
-             "title" = opts[["title"]],
-             "subtitle" = opts[["subtitle"]]))
-     }
-     # Plots with one line per group
-     else if (opts[["type"]] == 'grouped') {
-         # The name of the group variable
-         g <- opts[["group"]]
-         # The y values and the y-axis label of every plot
-         y_values <- list(data$m, data$t, data$p, data$a)
-         y_labels <- c("memory", "time", "perplexity", "accuracy")
-         # One plot is built for every metric
-         plots <- Map(function(y, y_lab) {
-             # The ngram size, the metric and the group of every row
-             df <- data.frame(data$n, y, data[[g]])
-             names(df) <- c("x", "y", g)
-             private$plot_graph(
-                 df,
-                 list("x_lab" = "ngram",
-                     "y_lab" = y_lab,
-                     "group" = opts[["group"]],
-                     "type" = opts[["type"]]))
-         }, y_values, y_labels)
-         # The plots are placed on a single page, in the order given above
-         page <- Reduce(`+`, plots)
-         # The titles are added and the page is shown
-         print(page + plot_annotation(
-             "title" = opts[["title"]],
-             "subtitle" = opts[["subtitle"]]))
-     }
- },
- # @description
- # Reads the model file and sets the current objects attributes.
- # @param model The model number. It is the maximum ngrams supported by
- # the model.
- read_tp_data = function(model) {
- # If the model has already been loaded then function returns
- if (!is.null(self$tp)) return()
- # The information message
- msg <- paste0("Loading model ", model, "...")
- # The information message is shown
- self$display_msg(msg, 1)
- # The model config file name
- fn <- paste0(self$mdir, "/config.RDS")
- # The config file is loaded
- self$load_config(fn)
- # The model file name
- fn <- paste0(self$mdir, "/model-", model, ".RDS")
- # The model is read to a data frame
- self$tp <- self$read_obj(fn)
- # The words file name
- fn <- paste0(self$mdir, "/words.RDS")
- # The list of words is read
- self$wl <- self$read_obj(fn)
- # The dictionary file name
- fn <- self$dc_opts[["dict_file"]]
- # The file contents
- dict <- self$read_file(fn, F)
- # The information message is shown
- self$display_msg("Calculating default probability...", 1)
- # The number of words in the dictionary file. It is used to
- # calculate Perplexity.
- vc <- length(dict)
- # The cleaned training file is read
- fn <- paste0(self$mdir, "/train-clean.txt")
- # The model is read to a data frame
- data <- self$read_file(fn, F)
- # The words are split on " "
- w <- strsplit(data, " ")
- # The words are converted to atomic list
- w <- unlist(w)
- # The number of words
- n <- length(w)
- # The default probability is set
- self$dp <- 1/(n + vc)
- },
- # @description
- # It returns the average value for the required stats, for
- # the given performance stats.
- # @param pstats The performance stats.
- # @param type The type of stats. It can be:
- # 'memory' -> The mean memory used.
- # 'time_taken' -> The mean time taken by the intrinsic and
- # extrinsic evaluation tests.
- # 'perplexity' -> The mean Perplexity score.
- # 'accuracy' -> The mean accuracy score.
- # @param nmax -> The maximum ngram size.
- process_stats = function(pstats, type, nmax) {
- # The stats
- s <- NULL
- # The average memory for each ngram model
- for (n in 1:(nmax-1)) {
- # If the stats is memory
- if (type == "memory") {
- # The stats value for the current ngram number
- v <- pstats[[n]][["memory"]]
- # The stats are formated
- v <- self$format_size(v)
- }
- # If the stats is perplexity
- else if (type == "perplexity") {
- # The stats value for the current ngram number
- v <- pstats[[n]][["intrinsic"]][["mean"]]
- }
- # If the stats is accuracy
- else if (type == "accuracy") {
- # The stats value for the current ngram number
- v <- pstats[[n]][["extrinsic"]][["valid_perc"]]
- }
- # If the stats is time
- else if (type == "time") {
- # The stats value for the current ngram number
- v <- pstats[[n]][["time"]]
- }
- # The stats are updated
- s <- c(s, v)
- }
- return(s)
- }
- )
diff --git a/R/model/evaluator.R b/R/model/evaluator.R
new file mode 100644
index 0000000..d0451cb
--- /dev/null
+++ b/R/model/evaluator.R
@@ -0,0 +1,320 @@
+#' It is used to perform extrinsic and intrinsic evaluation of a model.
+#' @description
+#' It provides methods for performing extrinsic and intrinsic
+#' evaluation. Intrinsic evaluation is based on calculation of Perplexity.
+#' Extrinsic evaluation is based on accuracy. It involves determining the
+#' percentage of correct next word predictions.
+#' @details
+#' Before performing the intrinsic and extrinsic model evaluation, a
+#' validation file must be first generated. This can be done using the Generator
+#' class. Each line in the validation file is evaluated. For intrinsic
+#' evaluation Perplexity for the line is calculated. An overall summary of the
+#' Perplexity calculations is returned. It includes the min, max and mean
+#' Perplexity. For extrinsic evaluation, next word prediction is performed on
+#' each line. If the actual next word is one of the three predicted next words,
+#' then the prediction is considered to be accurate. The extrinsic evaluation
+#' returns the percentage of correct and incorrect predictions.
+Evaluator <- R6::R6Class(
+ "Evaluator",
+ inherit = TextFileProcessor,
+ public = list(
+    #' @description
+    #' It initializes the current object. It sets the model file name and
+    #' the verbose option, and reads the model object from the model file.
+    #' @param mfile The path to the model file. An error is raised if the
+    #'   file does not exist.
+    #' @param verbose If progress information should be displayed.
+    #' @export
+    initialize = function(mfile, verbose = 0) {
+        # The base class is initialized
+        super$initialize(NULL, NULL, verbose)
+        # If the model file does not exist, then an error is thrown
+        if (!file.exists(mfile)) {
+            stop(paste0("Invalid model file: ", mfile))
+        }
+        # The model file name is set
+        private$mfile <- mfile
+        # The model object is read from the model file
+        private$m <- private$read_obj(mfile)
+    },
+    #' @description
+    #' It performs intrinsic and extrinsic evaluation for the given model.
+    #' It also measures the time taken by the two evaluations and the size
+    #' of the model object. The performance statistics are stored in the
+    #' pstats field of the model object, which is then saved back to the
+    #' model file.
+    #' @param lc The number of lines of text in the validation file to be
+    #'   used for the evaluation.
+    #' @param fn The name of the validation file. If it does not exist, then
+    #'   the default file validation-clean.txt is checked in the models
+    #'   folder
+    performance_evaluation = function(lc, fn) {
+        # Both evaluations are timed together. The assignments inside the
+        # timed expression are made in this function's frame.
+        time_taken <- system.time({
+            # Intrinsic evaluation is performed
+            istats <- self$intrinsic_evaluation(lc, fn)
+            # Extrinsic evaluation is performed
+            estats <- self$extrinsic_evaluation(lc, fn)
+        })
+        # The memory used is the size of the model object held in private$m
+        memory_used <- object_size(private$m)
+        # The performance stats: memory, elapsed time, mean Perplexity and
+        # percentage of correct predictions
+        pstats <- list(
+            "m" = private$format_size(memory_used),
+            "t" = time_taken[[3]],
+            "p" = istats$mean,
+            "a" = estats$valid_perc
+        )
+        # The performance stats are saved to the model object
+        private$m$pstats <- pstats
+        # The model is saved
+        private$save_obj(private$m, private$mfile)
+    },
+    #' @description
+    #' It compares the performance of the specified models by plotting the
+    #' performance statistics generated by the performance_evaluation
+    #' function. The models are specified with the type parameter.
+    #' @param type The models to compare. It can be: 'basic' -> The models
+    #'   for each n-gram size are compared. 'grouped' -> Each model folder
+    #'   contains n-gram models for a given input data size. For example,
+    #'   for data size 5 Mb, there are 3 models for the n-grams 2:4. The
+    #'   performance of all models in all folders is plotted on a graph.
+    #'   Any other value raises an error.
+    #' @param opts A list of options for plotting the data. 'group' -> The
+    #'   field to group by. 'title' -> The main plot title. It is only used
+    #'   when type is 'grouped'. 'subtitle' -> The plot sub title.
+    #' @return The data frame of performance stats that was plotted.
+    performance_comparision = function(type, opts) {
+        # If the type is 'basic'
+        if (type == "basic") {
+            # The performance stats are read
+            pstats <- private$read_obj(paste0(private$mdir, "/pstats.RDS"))
+            # The config file is read
+            config <- private$read_obj(paste0(private$mdir, "/config.RDS"))
+            # The y-axis values
+            y <- list("m" = NULL, "t" = NULL, "p" = NULL, "a" = NULL)
+            # For each model the performance metrics are collected. seq_len
+            # gives an empty loop when config$model is 1.
+            for (i in seq_len(config$model - 1)) {
+                # The memory value
+                m <- private$format_size(pstats[[i]][["memory"]])
+                # The y-axis values are updated
+                y[["m"]] <- c(y[["m"]], m)
+                y[["t"]] <- c(y[["t"]], pstats[[i]][["time"]])
+                y[["p"]] <- c(y[["p"]],
+                              pstats[[i]][["intrinsic"]][["mean"]])
+                y[["a"]] <- c(y[["a"]],
+                              pstats[[i]][["extrinsic"]][["valid_perc"]])
+            }
+            # The data frame containing the data to be plotted
+            df <- data.frame(
+                "n" = 2:(length(pstats) + 1),
+                "m" = y[["m"]],
+                "t" = y[["t"]],
+                "p" = y[["p"]],
+                "a" = y[["a"]]
+            )
+            # The options for plotting
+            opts <- list(
+                "type" = "basic",
+                "title" = "Variation of performance with ngram size",
+                "subtitle" = opts[["subtitle"]])
+        }
+        # If the type is 'grouped'
+        else if (type == "grouped") {
+            # The group name
+            g <- opts[["group"]]
+            # The items in the group. Each sub folder is a group item
+            f <- list.dirs(private$mdir, recursive = FALSE,
+                           full.names = FALSE)
+            # One data frame per folder is collected here and the frames
+            # are combined once after the loop
+            parts <- vector("list", length(f))
+            for (k in seq_along(f)) {
+                # The folder name
+                i <- f[[k]]
+                # The performance stats are read
+                fn <- paste0(private$mdir, "/", i, "/pstats.RDS")
+                pstats <- private$read_obj(fn)
+                # The config file is read
+                fn <- paste0(private$mdir, "/", i, "/config.RDS")
+                config <- private$read_obj(fn)
+                # The memory usage for each ngram
+                m <- private$process_stats(pstats, "memory", config$model)
+                # The time taken for each ngram
+                t <- private$process_stats(pstats, "time", config$model)
+                # The perplexity for each ngram
+                p <- private$process_stats(
+                    pstats, "perplexity", config$model)
+                # The accuracy score for each ngram
+                a <- private$process_stats(
+                    pstats, "accuracy", config$model)
+                # The temporary data frame for the current folder
+                tdf <- data.frame(
+                    rep(i, (config$model - 1)),
+                    m, t, p, a, 2:(length(pstats) + 1))
+                # The column names are set
+                names(tdf) <- c(g, "m", "t", "p", "a", "n")
+                parts[[k]] <- tdf
+            }
+            # The per folder data frames are combined. The leading empty
+            # data frame keeps the result a data frame when no folders
+            # were found.
+            df <- do.call(rbind, c(list(data.frame()), parts))
+            # The options for plotting
+            opts <- list(
+                "type" = "grouped",
+                "group" = g,
+                "title" = opts[["title"]],
+                "subtitle" = opts[["subtitle"]])
+            # The group labels are converted to numbers. Labels that are
+            # not numbers become NA.
+            gv <- suppressWarnings(as.numeric(as.character(df[[g]])))
+            # If every group label is a number, then the data is ordered
+            # numerically by group
+            if (length(gv) > 0 && !anyNA(gv)) {
+                # The group column is converted to numeric
+                df[[g]] <- gv
+                # The data is ordered by group
+                df <- df[order(df[[g]]), ]
+                # The group column is converted to factor
+                df[[g]] <- as.factor(df[[g]])
+            }
+        }
+        # The type is not supported
+        else {
+            stop("Invalid comparison type: ", type)
+        }
+        # The performance stats are plotted
+        private$plot_stats(df, opts)
+        # The performance stats are returned
+        return(df)
+    },
+    #' @description
+    #' It performs intrinsic evaluation of the model, which is based on
+    #' Perplexity. Sentences are read from the validation file and the
+    #' Perplexity of each sentence is calculated.
+    #' @param lc The number of lines of text in the validation file to be
+    #'   used for the evaluation.
+    #' @param fn The name of the validation file. If it does not exist, then
+    #'   the default file validation-clean.txt is checked in the models
+    #'   folder
+    #' @return A list with the min, max and mean Perplexity score.
+    intrinsic_evaluation = function(lc, fn) {
+        # The object used to calculate Perplexity
+        predictor <- Predictor$new()
+        # The sentences to evaluate
+        sentences <- private$read_validation_data(lc, "I", fn)
+        # The Perplexity score of each sentence
+        scores <- c()
+        # The number of sentences processed so far
+        processed <- 0
+        for (sentence in sentences) {
+            processed <- processed + 1
+            # The sentence is split into words
+            tokens <- str_split(sentence, " ")[[1]]
+            # The Perplexity of the sentence
+            score <- predictor$calc_perplexity(tokens)
+            # The score is reported at verbosity level 2
+            private$display_msg(
+                paste0("Perplexity of the sentence '",
+                       sentence, "' is: ", score),
+                2)
+            # The score is recorded
+            scores <- c(scores, score)
+            # Progress is reported after every 10 sentences
+            if (processed %% 10 == 0) {
+                private$display_msg(
+                    paste0(processed, " lines have been processed"), 1)
+            }
+        }
+        # The summary of the Perplexity scores
+        list(
+            "min" = min(scores),
+            "max" = max(scores),
+            "mean" = mean(scores))
+    },
+    #' @description
+    #' Evaluates the model using extrinsic evaluation based on
+    #' Accuracy. The given number of sentences are taken from the validation
+    #' file. For each sentence, the model is used to predict the last word
+    #' from the words that come before it. A prediction is considered to be
+    #' correct if one of the predicted words matches the actual word.
+    #' @param lc The number of lines of text in the validation file to be
+    #'   used for the evaluation.
+    #' @param fn The name of the validation file. If it does not exist, then
+    #'   the default file validation-clean.txt is checked in the models
+    #'   folder
+    #' @return A list with the number of correct ('valid') and incorrect
+    #'   ('invalid') predictions and the corresponding percentages
+    #'   ('valid_perc' and 'invalid_perc').
+    extrinsic_evaluation = function(lc, fn) {
+        # The Predictor class object is created
+        pr <- Predictor$new()
+        # The validation data is read
+        data <- private$read_validation_data(lc, "E", fn)
+        # The statistics
+        stats <- list("valid" = 0, "invalid" = 0)
+        # The number of sentences processed so far
+        processed <- 0
+        # The last word for each sentence is predicted
+        for (line in data) {
+            # The line is split on space
+            words <- str_split(line, " ")[[1]]
+            # The word to predict
+            w <- words[length(words)]
+            # The previous words used to predict the word. A negative index
+            # drops the last word and gives an empty vector for a one word
+            # sentence.
+            pw <- words[-length(words)]
+            # If the words should be stemmed. isTRUE treats a missing
+            # stem_words option as FALSE.
+            if (isTRUE(private$tg_opts[["stem_words"]])) {
+                # The previous words are stemmed
+                pw <- wordStem(pw)
+            }
+            # The next word is predicted
+            res <- pr$predict_word(pw, FALSE)
+            # If one of the predicted words matches the actual word. The
+            # [[ operator extracts the predicted words themselves; the
+            # [ operator would wrap them in a one element container.
+            if (w %in% res[["words"]]) {
+                stats[["valid"]] <- stats[["valid"]] + 1
+                # The information message
+                private$display_msg(
+                    paste0("The word: ", w, " was predicted"), 3)
+            }
+            # If the predicted word does not match
+            else {
+                stats[["invalid"]] <- stats[["invalid"]] + 1
+                # The information message
+                private$display_msg(
+                    paste0("The word: ", w, " could not be predicted"), 3)
+            }
+            # The sentence has now been processed
+            processed <- processed + 1
+            # Progress is reported after every 10 sentences
+            if (processed %% 10 == 0) {
+                # The information message
+                msg <- paste0(processed, " sentences have been processed")
+                # The information message is shown
+                private$display_msg(msg, 1)
+            }
+        }
+        # The percentage of valid predictions. It is NaN if no sentences
+        # were evaluated.
+        stats[["valid_perc"]] <-
+            (stats[["valid"]] / (stats[["valid"]] + stats[["invalid"]])) * 100
+        # The percentage of invalid predictions
+        stats[["invalid_perc"]] <- 100 - stats[["valid_perc"]]
+        return(stats)
+    }
+ ),
+ private = list(
+ # NOTE(review): the public methods of this class also use private$mdir,
+ # private$tg_opts, private$plot_stats, private$process_stats and
+ # private$read_validation_data. None of them is declared in this list.
+ # Presumably they are expected from the TextFileProcessor base class -
+ # TODO confirm.
+ # @field m The model object. It is set by the initialize method and
+ # updated with the performance stats by performance_evaluation.
+ m = NULL,
+ # @field mfile The path to the model file. The model object is saved back
+ # to this path by performance_evaluation.
+ mfile = NULL
+ )
diff --git a/R/model/generator.R b/R/model/generator.R
new file mode 100644
index 0000000..c8b0d3c
--- /dev/null
+++ b/R/model/generator.R
@@ -0,0 +1,211 @@
+#' It is used to generate n-gram models for the given data file.
+#' @description
+#' It provides methods that are used for generating n-gram models.
+#' The n-gram models may be customized by specifying the data cleaning and
+#' tokenization options.
+#' @details
+#' It provides a method that generates a n-gram model. The n-gram model
+#' may be customized by specifying the data cleaning and tokenization options.
+#' The data cleaning options include removal of punctuation, stop words, extra
+#' space, non-dictionary words and bad words. The tokenization options include
+#' n-gram number and word stemming.
+Generator <- R6::R6Class(
+ "Generator",
+ inherit = TextFileProcessor,
+ public = list(
+    #' @description
+    #' It creates a new Generator object. The n-gram size, sample size,
+    #' data directory, model directory, data cleaning options, tokenization
+    #' options and verbose option are stored in the object.
+    #' @param n The n-gram size for the model.
+    #' @param ssize The sample size in Mb.
+    #' @param ddir The data directory.
+    #' @param mdir The model directory.
+    #' @param dc_opts The data cleaner options. The given options override
+    #'   the default data cleaner options.
+    #' @param tg_opts The token generator options. The given options
+    #'   override the default token generator options.
+    #' @param verbose If progress information should be displayed.
+    #' @export
+    initialize = function(n = 4,
+                          ssize = 30,
+                          ddir = "./data",
+                          mdir = "./models",
+                          dc_opts = list(),
+                          tg_opts = list(),
+                          verbose = 0) {
+        # The parent class constructor is called
+        super$initialize(NULL, NULL, verbose)
+        # The scalar settings are stored
+        private$n <- n
+        private$ssize <- ssize
+        # The directory names are stored
+        private$ddir <- ddir
+        private$mdir <- mdir
+        # Custom data cleaner options override the defaults
+        if (length(dc_opts) > 0) {
+            private$dc_opts <- modifyList(private$dc_opts, dc_opts)
+        }
+        # Custom token generator options override the defaults
+        if (length(tg_opts) > 0) {
+            private$tg_opts <- modifyList(private$tg_opts, tg_opts)
+        }
+    },
+    #' @description
+    #' It generates the model for the given name, description,
+    #' n-gram size, data sample size, data cleaning options and input file.
+    #' The generated model object is saved to the output file.
+    #' @param name The model name.
+    #' @param desc The model description.
+    #' @param ifn The input file name. If not given then the train.txt file
+    #'   in the data directory folder is used. An error is raised if the
+    #'   file does not exist.
+    #' @param ofn The output file name. If not given, then the file
+    #'   "model.RDS" in the model directory is used.
+    generate_model = function(name, desc, ifn = NULL, ofn = NULL) {
+        # If the input file name is not given
+        if (is.null(ifn)) {
+            # The default training data file name
+            ifn <- paste0(private$ddir, "/train.txt")
+        }
+        # If the input file does not exist
+        if (!file.exists(ifn)) {
+            # An error is thrown
+            stop("The file: ", ifn, " does not exist !")
+        }
+        # If the output file name is not given
+        if (is.null(ofn)) {
+            # The default output file name is used
+            ofn <- paste0(private$mdir, "/model.RDS")
+        }
+        # The data analyzer object is created for the input file
+        da <- DataAnalyzer$new(ifn, private$verbose)
+        # The training, testing and validation data sets are generated
+        da$generate_data(private$ddir,
+                         list(train = .8, test = .1, validate = .1))
+        # The size of the input file in Mb
+        obj_size <- file.size(ifn) / 10^6
+        # The proportion of data to sample
+        prop <- (private$ssize / obj_size)
+        # Random sample is taken and cleaned.
+        # NOTE(review): generate_sample is not defined in this class -
+        # presumably it is inherited; TODO confirm.
+        self$generate_sample(prop, TRUE, 'tr')
+        # The model directory is set
+        private$tg_opts$dir <- private$mdir
+        # The clean train data file name
+        fn <- paste0(private$mdir, "/train-clean.txt")
+        # For each ngram number, the ngram token file is generated
+        for (i in seq_len(private$n)) {
+            # The ngram number is set
+            private$tg_opts$n <- i
+            # The TokenGenerator object is created
+            tg <- TokenGenerator$new(fn, private$tg_opts, private$verbose)
+            # The ngram tokens are generated
+            tg$generate_tokens()
+        }
+        # The TPGenerator object is created
+        tp <- TPGenerator$new()
+        # The options for generating combined transition probabilities
+        tp_opts <- list(
+            "n" = private$n,
+            "save_tp" = TRUE,
+            "format" = "obj",
+            "dir" = private$mdir
+        )
+        # The transition probabilities are generated
+        tp$generate_tp(tp_opts)
+        # The information message is shown
+        private$display_msg("Saving model...", 1)
+        # An object of class Model is created. The n-gram size is passed
+        # as n, which is the name used by the Model initialize method.
+        m <- Model$new(
+            name = name,
+            desc = desc,
+            file_name = ofn,
+            data_file = fn,
+            wl_file = paste0(private$mdir, "/words.RDS"),
+            n = private$n,
+            ssize = private$ssize,
+            ddir = private$ddir,
+            mdir = private$mdir,
+            dc_opts = private$dc_opts,
+            tg_opts = private$tg_opts,
+            verbose = private$verbose
+        )
+        # The model object is loaded
+        m$load_model()
+        # The model object is saved using the output file name
+        private$save_obj(m, ofn)
+    }
+ ),
+    private = list(
+        # @field tp The transition probabilities data frame.
+        tp = NULL,
+        # @field wl The list of unique words.
+        wl = NULL,
+        # @field dp The default probability is equal to 1/(N+V), where N is
+        #   the number of words in the sentence, V is the number of words in
+        #   the vocabulary.
+        dp = NULL,
+        # @field n The n-gram size.
+        n = 4,
+        # @field dc_opts The options for the data cleaner object.
+        #   min_words -> The minimum number of words per sentence.
+        #   line_count -> The number of lines to read and clean at a time.
+        #   sw_file -> The stop words file path.
+        #   dict_file -> The dictionary file path.
+        #   bad_file -> The bad words file path.
+        #   to_lower -> If the words should be converted to lower case.
+        #   remove_stop -> If stop words should be removed.
+        #   remove_punc -> If punctuation symbols should be removed.
+        #   remove_non_dict -> If non dictionary words should be removed.
+        #   remove_non_alpha -> If non alphabet symbols should be removed.
+        #   remove_extra_space -> If leading, trailing and double spaces
+        #     should be removed.
+        #   remove_bad -> If bad words should be removed
+        dc_opts = list(
+            "min_words" = 2,
+            "line_count" = 1000,
+            "sw_file" = NULL,
+            "dict_file" = NULL,
+            "bad_file" = NULL,
+            "to_lower" = TRUE,
+            "remove_stop" = FALSE,
+            "remove_punc" = TRUE,
+            "remove_non_dict" = TRUE,
+            "remove_non_alpha" = TRUE,
+            "remove_extra_space" = TRUE,
+            "remove_bad" = FALSE
+        ),
+        # @field tg_opts The options for the token generator obj.
+        #   min_freq -> All ngrams with frequency less than min_freq are
+        #     ignored.
+        #   n -> The ngram size.
+        #   save_ngrams -> If the ngram data should be saved.
+        #   line_count -> The number of lines to process at a time.
+        #   stem_words -> If words should be converted to their stem.
+        #   dir -> The dir where the output file should be saved.
+        #   format -> The format for the output. There are two options.
+        #     'plain' -> The data is stored in plain text.
+        #     'obj' -> The data is stored as a R obj.
+        # Each option name appears once, so that an override of the option
+        # cannot leave a second entry with the old value behind.
+        tg_opts = list(
+            "min_freq" = -1,
+            "n" = 1,
+            "save_ngrams" = TRUE,
+            "line_count" = 5000,
+            "stem_words" = FALSE,
+            "dir" = "./data/models",
+            "format" = "obj"
+        ),
+        # @field ssize The sample size in Mb.
+        ssize = 30,
+        # @field ddir The folder containing the data files
+        ddir = "./data",
+        # @field mdir The folder containing the model files
+        mdir = "./models"
+    )
diff --git a/R/model/model.R b/R/model/model.R
new file mode 100644
index 0000000..f794742
--- /dev/null
+++ b/R/model/model.R
@@ -0,0 +1,219 @@
+#' The Model class represents n-gram models. An instance of the class is a
+#' single n-gram model.
+#' @description
+#' The attributes of this class are used to store n-gram model
+#' information. The class provides methods for loading and saving the model.
+#' @details
+#' The attributes of this class are used to store n-gram model
+#' information such as model name, model description, model file name, n-gram
+#' number, transition probabilities data, default probability, n-gram
+#' configuration options such as data cleaning and tokenization options, word
+#' list, model path, data directory path and performance stats. The model is
+#' saved to a single file as a R object. A model file contains all the
+#' information required by the model. The model object is used by other model
+#' classes that perform operations on the model such as evaluation of model
+#' performance, making word predictions based on the model and plotting model
+#' performance stats.
+Model <- R6::R6Class(
+ "Model",
+ inherit = TextFileProcessor,
+ public = list(
+ #' @field pstats The performance stats for the model.
+ pstats = list(),
+ #' @field name The model name.
+ name = NULL,
+ #' @field desc The model description.
+ desc = NULL,
+    #' @description
+    #' It initializes the current object. It is used to set the model name,
+    #' model description, model file name, data file name, word list file
+    #' name, maximum ngram number, sample size, data directory, model
+    #' directory, data cleaner options, tokenization options and verbose
+    #' option. An error is raised if the word list file, the data file, the
+    #' data directory, the model directory or the dictionary file is
+    #' missing.
+    #' @param name The model name.
+    #' @param desc The model description.
+    #' @param file_name The model file name.
+    #' @param data_file The path of the file used to generate the model. If
+    #'   the data was cleaned, then data_file is the path to the cleaned
+    #'   file.
+    #' @param wl_file The path of the word list file.
+    #' @param n The maximum ngram number supported by the model.
+    #' @param ssize The sample size in Mb.
+    #' @param ddir The data directory.
+    #' @param mdir The model directory.
+    #' @param dc_opts The data cleaner options. The dict_file option must be
+    #'   the path of an existing dictionary file.
+    #' @param tg_opts The token generator options.
+    #' @param verbose If progress information should be displayed.
+    #' @export
+    initialize = function(name = NULL,
+                          desc = NULL,
+                          file_name = NULL,
+                          data_file = NULL,
+                          wl_file = NULL,
+                          n = 4,
+                          ssize = 30,
+                          ddir = "./data",
+                          mdir = "./models",
+                          dc_opts = list(),
+                          tg_opts = list(),
+                          verbose = 0) {
+        # The base class is initialized
+        super$initialize(NULL, NULL, verbose)
+        # The paths are checked for NULL before they are passed to
+        # file.exists or dir.exists, so that a path that was not given is
+        # reported with the same message as a path that does not exist.
+        # If the wl_file does not exist, then an error is thrown
+        if (is.null(wl_file) || !file.exists(wl_file))
+            stop(paste0("The file: ", wl_file, " does not exist !"))
+        # If the data file does not exist, then an error is thrown
+        if (is.null(data_file) || !file.exists(data_file))
+            stop(paste0("The file: ", data_file, " does not exist !"))
+        # If the data directory does not exist, then an error is thrown
+        if (is.null(ddir) || !dir.exists(ddir))
+            stop(paste0("The dir: ", ddir, " does not exist !"))
+        # If the model directory does not exist, then an error is thrown
+        if (is.null(mdir) || !dir.exists(mdir))
+            stop(paste0("The dir: ", mdir, " does not exist !"))
+        # The dictionary file name
+        dict_file <- dc_opts[["dict_file"]]
+        # If the dict_file does not exist, then an error is thrown
+        if (is.null(dict_file) || !file.exists(dict_file)) {
+            # The error message
+            msg <- paste0("The file: ", dict_file)
+            msg <- paste0(msg, " does not exist !")
+            stop(msg)
+        }
+        # The model name is set
+        self$name <- name
+        # The model description is set
+        self$desc <- desc
+        # The ngram number is set
+        private$n <- n
+        # The sample size is set
+        private$ssize <- ssize
+        # The data directory name is set
+        private$ddir <- ddir
+        # The model directory name is set
+        private$mdir <- mdir
+        # The input file name is set
+        private$data_file <- data_file
+        # The word list file name is set
+        private$wl_file <- wl_file
+        # The model file name is set
+        private$file_name <- file_name
+        # If the dc_opts are given
+        if (length(dc_opts) > 0) {
+            # The custom dc_opts are merged with the default dc_opts
+            private$dc_opts <- modifyList(private$dc_opts, dc_opts)
+        }
+        # If the tg_opts are given
+        if (length(tg_opts) > 0) {
+            # The custom tg_opts are merged with the default tg_opts
+            private$tg_opts <- modifyList(private$tg_opts, tg_opts)
+        }
+    },
+    #' @description
+    #' It loads the model using the information given to the initialize
+    #' method. It reads the transition probabilities file for the model's
+    #' n-gram size from the model directory and reads the word list file.
+    #' It also calculates the default probability, which is 1/(N+V), where
+    #' N is the number of words in the data file and V is the number of
+    #' lines read from the dictionary file.
+    load_model = function() {
+        # The tp file name. The model directory is a private field.
+        fn <- paste0(private$mdir, "/model-", private$n, ".RDS")
+        # The tp file is read
+        private$tp <- private$read_obj(fn)
+        # The wl file is read
+        private$wl <- private$read_obj(private$wl_file)
+        # The dictionary file name
+        fn <- private$dc_opts[["dict_file"]]
+        # The file contents
+        dict <- private$read_file(fn, FALSE)
+        # The information message is shown
+        private$display_msg("Calculating default probability...", 1)
+        # The number of words in the dictionary file. It is used to
+        # calculate Perplexity.
+        vc <- length(dict)
+        # The data file is read
+        data <- private$read_file(private$data_file, FALSE)
+        # The lines are split on " " and flattened to a vector of words
+        w <- unlist(strsplit(data, " "))
+        # The number of words in the data file
+        wc <- length(w)
+        # The default probability is set
+        private$dp <- 1 / (wc + vc)
+    }
+ ),
+    private = list(
+        # @field file_name The path to the model file.
+        file_name = NULL,
+        # @field wl_file The path to the word list file.
+        wl_file = NULL,
+        # @field data_file The path to the input file.
+        data_file = NULL,
+        # @field tp The transition probabilities data frame.
+        tp = NULL,
+        # @field wl The list of unique words.
+        wl = NULL,
+        # @field dp The default probability is equal to 1/(N+V), where N is
+        #   the number of words in the data file, V is the number of words
+        #   in the dictionary file. It is set by the load_model method.
+        dp = NULL,
+        # @field n The maximum number of ngrams supported by the model.
+        n = 4,
+        # @field dc_opts The options for the data cleaner object.
+        #   min_words -> The minimum number of words per sentence.
+        #   line_count -> The number of lines to read and clean at a time.
+        #   sw_file -> The stop words file path.
+        #   dict_file -> The dictionary file path.
+        #   bad_file -> The bad words file path.
+        #   to_lower -> If the words should be converted to lower case.
+        #   remove_stop -> If stop words should be removed.
+        #   remove_punc -> If punctuation symbols should be removed.
+        #   remove_non_dict -> If non dictionary words should be removed.
+        #   remove_non_alpha -> If non alphabet symbols should be removed.
+        #   remove_extra_space -> If leading, trailing and double spaces
+        #     should be removed.
+        #   remove_bad -> If bad words should be removed
+        dc_opts = list(
+            "min_words" = 2,
+            "line_count" = 1000,
+            "sw_file" = NULL,
+            "dict_file" = NULL,
+            "bad_file" = NULL,
+            "to_lower" = TRUE,
+            "remove_stop" = FALSE,
+            "remove_punc" = TRUE,
+            "remove_non_dict" = TRUE,
+            "remove_non_alpha" = TRUE,
+            "remove_extra_space" = TRUE,
+            "remove_bad" = FALSE
+        ),
+        # @field tg_opts The options for the token generator obj.
+        #   min_freq -> All ngrams with frequency less than min_freq are
+        #     ignored.
+        #   n -> The ngram size.
+        #   save_ngrams -> If the ngram data should be saved.
+        #   line_count -> The number of lines to process at a time.
+        #   stem_words -> If words should be converted to their stem.
+        #   dir -> The dir where the output file should be saved.
+        #   format -> The format for the output. There are two options.
+        #     'plain' -> The data is stored in plain text.
+        #     'obj' -> The data is stored as a R obj.
+        # Each option name appears once, so that an override of the option
+        # cannot leave a second entry with the old value behind.
+        tg_opts = list(
+            "min_freq" = -1,
+            "n" = 1,
+            "save_ngrams" = TRUE,
+            "line_count" = 5000,
+            "stem_words" = FALSE,
+            "dir" = "./data/models",
+            "format" = "obj"
+        ),
+        # @field ssize The sample size in Mb.
+        ssize = 30,
+        # @field ddir The folder containing the data files
+        ddir = "./data",
+        # @field mdir The folder containing the model files
+        mdir = "./models"
+    )
diff --git a/R/model/predictor.R b/R/model/predictor.R
new file mode 100644
index 0000000..79d0834
--- /dev/null
+++ b/R/model/predictor.R
@@ -0,0 +1,303 @@
+#' It is used to evaluate the accuracy and performance of the model.
+#' @description
+#' It provides methods that perform extrinsic and intrinsic model
+#' evaluation. It also provides methods for determining the memory and time
+#' requirements for generating the model. It also provides a method for
+#' determining how much memory is used by the final model.
+#' @details
+#' It provides a method that performs intrinsic model evaluation based
+#' on Perplexity. It also provides a method that performs extrinsic model
+#' evaluation based on accuracy. It provides a method for determining how much
+#' memory and time is needed to generate a model for different input data sizes.
+#' It provides a method for determining how much memory is needed by the final
+#' model.
+#' @importFrom ggplot2 ggplot aes aes_string geom_point geom_smooth labs xlim
+#' coord_cartesian
+#' @importFrom digest digest2int
+#' @importFrom SnowballC wordStem
+#' @importFrom patchwork plot_annotation
+#' @importFrom pryr mem_change object_size
+ModelEvaluator <- R6::R6Class(
+ "ModelEvaluator",
+ inherit = TextFileProcessor,
+ public = list(
+ #' @description
+ #' It initializes the current object. It is used to set the
+ #' maximum ngram number, sample size, input file name, data cleaner
+ #' options and verbose option.
+ #' @param model The maximum ngram number supported by the model.
+ #' @param ssize The sample size in Mb.
+ #' @param ddir The data directory.
+ #' @param mdir The model directory.
+ #' @param dc_opts The data cleaner options.
+ #' @param tg_opts The token generator options.
+ #' @param verbose If progress information should be displayed.
+ #' @export
+ initialize = function(model = 4,
+                       ssize = 30,
+                       ddir = "./data",
+                       mdir = "./data/models",
+                       dc_opts = list(),
+                       tg_opts = list(),
+                       verbose = 0) {
+     # The base class is set up without a file name or a line count. Only
+     # the verbose level is passed on.
+     super$initialize(NULL, NULL, verbose)
+     # The scalar settings are copied to the private fields
+     private$model <- model
+     private$ssize <- ssize
+     private$ddir <- ddir
+     private$mdir <- mdir
+     # User supplied data cleaner options override the defaults
+     if (length(dc_opts) > 0) {
+         private$dc_opts <- modifyList(private$dc_opts, dc_opts)
+     }
+     # User supplied token generator options override the defaults
+     if (length(tg_opts) > 0) {
+         private$tg_opts <- modifyList(private$tg_opts, tg_opts)
+     }
+     # The transition probabilities data starts out empty
+     private$tp <- NULL
+ },
+ #' @description
+ #' Predicts the new word given a list of 1, 2 or 3 previous
+ #' words. It checks the given n words in the transition probabilities
+ #' data. If there is a match, the top 3 next words with highest
+ #' probabilities are returned. If there is no match, then the last n-1
+ #' previous words are checked. This process is continued until the last
+ #' word is checked. If there is no match, then empty result is returned.
+ #' @param words A character vector of previous words or a single vector
+ #' containing the previous word text.
+ #' @param count The number of results to return.
+ #' @param dc A DataCleaner object. If it is given, then the given words
+ #' are cleaned. If the stem_words option was set in the TokenGenerator
+ #' object configuration for the current model, then the words are
+ #' converted to their stems.
+ #' @return A list with the elements found, words and probs. It holds up
+ #' to count predicted words along with their probabilities.
+ predict_word = function(words, count = 3, dc = NULL) {
+     # Predicts the next word using back-off. The longest prefix built from
+     # the last (at most 3) previous words is looked up first. On a miss the
+     # leading word is dropped and the lookup is repeated.
+     #
+     # words -> Previous words, as a character vector of words or as a
+     #          single string of space separated words.
+     # count -> The maximum number of predictions to return.
+     # dc    -> Optional DataCleaner obj used to clean the words.
+     #
+     # Returns a list with: found (logical), words (predicted words) and
+     # probs (their probabilities). If nothing matched, found is FALSE and
+     # words and probs are "".
+
+     # The transition probabilities data is read
+     private$read_tp_data(private$model)
+     # The result that is returned if no prefix matches
+     result <- list("found" = FALSE, "words" = "", "probs" = "")
+     # The words are assigned to temp variable
+     w <- words
+     # If the DataCleaner obj was specified
+     if (!is.null(dc)) {
+         # The cleaner is given a line of text, so a vector of words is
+         # joined into a single line first
+         if (length(w) > 1) {
+             w <- paste0(w, collapse = " ")
+         }
+         # The words are cleaned
+         w <- dc$clean_lines(w)
+     }
+     # If the words are in the form of a line, they are split on space
+     if (length(w) == 1) {
+         w <- strsplit(w, " ")[[1]]
+     }
+     # The number of previous words
+     pwl <- length(w)
+     # Without previous words there is nothing to look up
+     if (pwl == 0) {
+         return(result)
+     }
+     # Stemming is done word by word, after the line was split. wordStem
+     # treats every vector element as one word, so stemming the unsplit
+     # line would only change the end of the last word and the prefix
+     # would not match the stemmed prefixes stored in the model.
+     if (!is.null(dc) && isTRUE(private$tg_opts[["stem_words"]])) {
+         w <- wordStem(w)
+     }
+     # Only the last 3 words are used as prefix
+     pw <- w[max(1, pwl - 2):pwl]
+     # The length of the prefix
+     pwl <- length(pw)
+     # Each prefix is checked, starting from the longest one
+     for (start in seq_len(pwl)) {
+         # The key to use for the transition probabilities data
+         k <- paste(pw[start:pwl], collapse = "_")
+         # The key is converted to a numeric hash
+         h <- digest2int(k)
+         # The transition probabilities data is checked
+         res <- private$tp[private$tp$pre == h, ]
+         # If the prefix was found
+         if (nrow(res) > 0) {
+             # The result is sorted by probability
+             sres <- res[order(res$prob, decreasing = TRUE), ]
+             # At most count rows are used. seq_len also handles a count
+             # of 0, where 1:0 would select the first row.
+             top <- seq_len(min(count, nrow(sres)))
+             # The next word ids are mapped back to the words
+             result[["words"]] <- as.character(private$wl$pre[sres$nw[top]])
+             result[["probs"]] <- sres$prob[top]
+             result[["found"]] <- TRUE
+             # Information message is shown
+             msg <- paste0("The ngram key: ", k, " was found")
+             private$display_msg(msg, 3)
+             # The search ends at the first matching prefix
+             break
+         }
+         # Information message is shown
+         msg <- paste0("The ngram key: ", k, " was not found")
+         private$display_msg(msg, 3)
+         # The number of words in the prefix that was just checked
+         msg <- paste0("Backing off to ", (pwl - start + 1), "-gram")
+         private$display_msg(msg, 3)
+     }
+     return(result)
+ },
+ #' @description
+ #' Calculates the probability of the given word given the
+ #' previous model-1 words, where model is the maximum ngram number. It
+ #' looks up the probability of a word given n previous words. The
+ #' previous n words are converted to numeric hash using digest2int
+ #' function. The hash is looked up in a data frame of transition
+ #' probabilities. The word is converted to a number by checking its
+ #' position in a list of unique words. If the hash and the word position
+ #' were found, then the probability of the previous word and hash is
+ #' returned. If it was not found, then the hash of the n-1 previous
+ #' words is taken and the process is repeated. If the data was not
+ #' found in the data frame, then the word probability is returned. This
+ #' is known as backoff. If the word probability could not be found then
+ #' the default probability is returned. The default probability is
+ #' calculated as 1/(N+V), Where N = number of words in corpus and V is
+ #' the number of dictionary words.
+ #' @param word The word whose probability is to be calculated.
+ #' @param pw The previous words.
+ #' @return The probability of the word given the previous words.
+ get_word_prob = function(word, pw) {
+     # Looks up the probability of word given the previous words pw. The
+     # longest prefix is tried first, then shorter ones. If no prefix
+     # matches, the probability stored for the word itself is used, and
+     # the default probability if the word is unknown.
+
+     # The default probability is the last fallback, so it must be set
+     if (is.null(private$dp)) {
+         stop("The default probability is not set !")
+     }
+     # The number of previous words
+     n_prev <- length(pw)
+     # The probability starts out as the default probability
+     prob <- private$dp
+     # Indicates if a prefix and next word combination was found
+     matched <- FALSE
+     # The position of the word in the word list
+     word_id <- match(word, private$wl$pre)
+     if (is.na(word_id)) {
+         # The word is not in the word list, so no prefix is checked
+         private$display_msg(
+             paste0("The next word: ", word, " was not found"), 3)
+     } else if (n_prev > 0) {
+         # Each prefix is checked, starting from the longest one
+         for (start in seq_len(n_prev)) {
+             # The key to use for the transition matrix
+             key <- paste(pw[start:n_prev], collapse = "_")
+             # The key is converted to a numeric hash
+             hash <- digest2int(key)
+             # Rows matching both the prefix hash and the next word id
+             rows <- private$tp[
+                 private$tp$pre == hash & private$tp$nw == word_id, ]
+             if (nrow(rows) > 0) {
+                 matched <- TRUE
+                 # The probability is set
+                 prob <- as.numeric(rows$prob)
+                 # Information message is shown
+                 private$display_msg(
+                     paste0("The ngram key: ", key, " and the next word: ",
+                            word, " were found"), 3)
+                 # The search ends at the first match
+                 break
+             }
+             # Information message is shown
+             private$display_msg(
+                 paste0("The ngram key: ", key, " and the next word: ",
+                        word, " were not found"), 3)
+             # The number of words in the prefix that was just checked
+             private$display_msg(
+                 paste0("Backing off to ", (n_prev - start + 1), "-gram"), 3)
+         }
+     }
+     # Without a prefix match the probability of the word itself is used
+     if (!matched) {
+         if (sum(private$wl$pre == word) == 0) {
+             # The word is unknown, the default probability is kept
+             private$display_msg("Using default probability", 3)
+         } else {
+             # The word probability
+             prob <- as.numeric(private$wl[private$wl$pre == word, "prob"])
+         }
+     }
+     return(prob)
+ }
+ )
diff --git a/R/text-file-processor.R b/R/text-file-processor.R
index e47e187..0331e94 100644
--- a/R/text-file-processor.R
+++ b/R/text-file-processor.R
@@ -9,38 +9,17 @@
#' called before reading a file. The process function is called for processing a
#' line. The post processing function is called on the processed data. It also
#' provides a method for generating a sample file from an input text file
+#' @importFrom pryr object_size
TextFileProcessor <- R6::R6Class(
public = list(
- #' @field opts The list of file processing options.
- #' save_data -> If the combined processed lines should be saved.
- #' output_file -> Name of the output file used to store the data.
- opts = list(
- "save_data" = F,
- "output_file" = "./data/sample-clean.txt"
- ),
- #' @field line_count The number of lines to read and process at a time.
- line_count = 100,
- #' @field pp_output The output of the pre-processing step
- pp_output = NULL,
- #' @field p_output The output of the processing step
- p_output = NULL,
- #' @field file_name The name of the text file to process.
- file_name = "./data/sample.txt",
- #' @field verbose Indicates if progress data should be printed.
- verbose = 0,
#' @description
#' It initializes the current object. It is used to set the file name
#' and verbose options.
#' @param file_name The path to the file to clean.
#' @param line_count The number of lines to read and clean at a time.
#' @param verbose Indicates if progress information should be displayed.
+ #' @export
initialize = function(file_name = "./data/sample.txt",
line_count = 100,
verbose = 2) {
@@ -50,30 +29,96 @@ TextFileProcessor <- R6::R6Class(
# The base class attributes are set
# The file name is set
- self$file_name <- file_name
+ private$file_name <- file_name
# The verbose option is set
- self$verbose <- verbose
+ private$verbose <- verbose
# The line count is set
- self$line_count <- line_count
+ private$line_count <- line_count
# The processed output is set
- self$p_output <- NULL
+ private$p_output <- NULL
+ }
+ ),
+ private = list(
+ # @field opts The list of file processing options.
+ # save_data -> If the combined processed lines should be saved.
+ # output_file -> Name of the output file used to store the data.
+ opts = list(
+ "save_data" = F,
+ "output_file" = "./data/sample-clean.txt"
+ ),
+ # @field line_count The number of lines to read and process at a time.
+ line_count = 100,
+ # @field p_output The output of the processing step
+ p_output = NULL,
+ # @field file_name The name of the text file to process.
+ file_name = "./data/sample.txt",
+ # @field verbose Indicates if progress data should be printed.
+ verbose = 0,
+ # @field con The input file connection
+ con = NULL,
+ # @description
+ # Reads the contents of the given file. Loads the file
+ # contents to a R object, a data frame or character vector.
+ # @param file_name The file name.
+ # @param format The file format. 'plain' or 'obj'
+ # @param opts Options for reading the file.
+ # @return The required data.
+ read_data = function(file_name, format, opts) {
+ # If the format is plain
+ if (format == "plain") {
+ # The file is read
+ data <- private$read_file(file_name, opts)
+ }
+ # If the format is obj
+ else if (format == "obj") {
+ # The file is read
+ data <- private$read_obj(file_name)
+ }
+ return(data)
- #' @description
+ # @description
+ # Writes the given data to a file. The data may be a R object, a
+ # character vector or a data frame.
+ # @param file_name The file name.
+ # @param format The file format. 'plain' or 'obj'
+ # @param opts Options for writing to the file.
+ # @return The required data.
+ write_data = function(data, file_name, format, opts) {
+ # If the format is plain
+ if (format == "plain") {
+ # The data is written to a file
+ private$write_file(data, file_name, opts)
+ }
+ # If the format is obj
+ if (format == "obj") {
+ # The R object is saved
+ private$save_obj(data, file_name)
+ }
+ },
+ # @description
#' Reads the given file one line at a time. It runs the given
#' pre-processing function before reading the file. It runs the given
- #' line processing function for each line. It optionally saves the
- #' output of line processing after reading the file or after processing
- #' certain number of lines.
- #' @param pre_process The pre-processing function.
- #' @param process The function used to process each line.
- #' @param post_process The function used to perform post processing.
- #' @return The combined processed data
+ # line processing function for each line. It optionally saves the
+ # output of line processing after reading the file or after processing
+ # certain number of lines.
+ # @param pre_process The pre-processing function.
+ # @param process The function used to process each line.
+ # @param post_process The function used to perform post processing.
+ # @return The combined processed data
process_file = function(pre_process, process, post_process) {
# Pre-processing is done
# The file is opened
- private$con <- file(self$file_name)
+ private$con <- file(private$file_name)
# The connection is opened for reading
# The lines to be read,
@@ -83,31 +128,38 @@ TextFileProcessor <- R6::R6Class(
# Indicates that data should not be appended
is_app <- F
# The output file name
- of <- self$opts[["output_file"]]
+ of <- private$opts[["output_file"]]
# All lines are read
while (TRUE) {
# The lines are read
- lines <- readLines(private$con, n = self$line_count,
+ lines <- readLines(private$con, n = private$line_count,
skipNul = TRUE)
# If all the lines have been read
if (length(lines) == 0) break
# The lines are processed
p_lines <- process(lines)
+ # If the processed lines are NULL
+ if(is.null(p_lines)) next
# If the data should be saved
- if (self$opts[["save_data"]] && !is.null(p_lines)) {
+ if (private$opts[["save_data"]]) {
# The cleaned data is written to file
- self$write_file(p_lines, of, is_app)
+ private$write_file(p_lines, of, is_app)
# Debug message
- self$display_msg(
+ private$display_msg(
paste(length(p_lines), "lines were written"), 1)
# Indicates that data should be appended
is_app <- T
+ # If the processed data should not be saved
+ else {
+ # The processed output is merged
+ private$p_output <- c(private$p_output, p_lines)
+ }
# The loop counter is increased by 1
c <- c + 1
# Debug message
- self$display_msg(
- paste(self$line_count*c, "lines have been processed"), 1)
+ private$display_msg(
+ paste(private$line_count*c, "lines have been processed"), 1)
# if (c == 2) break;
@@ -115,18 +167,23 @@ TextFileProcessor <- R6::R6Class(
# Post processing is performed
+ # If the data should not be saved
+ if (!private$opts[["save_data"]]) {
+ # The processed output is returned
+ return(private$p_output)
+ }
- #' @description
- #' Reads the given file and returns its contents.
- #' @param file_name The name of the file to read.
- #' @param is_csv If the data is a csv file
- #' @return The file data
+ # @description
+ # Reads the given file and returns its contents.
+ # @param file_name The name of the file to read.
+ # @param is_csv If the data is a csv file
+ # @return The file data
read_file = function(file_name, is_csv) {
# The information message
msg <- paste0("Reading file: ", file_name)
# Information message is shown
- self$display_msg(msg, 1)
+ private$display_msg(msg, 1)
# If the file is not a csv file
if (!is_csv) {
# File is opened for reading
@@ -143,16 +200,16 @@ TextFileProcessor <- R6::R6Class(
return (data)
- #' @description
- #' Reads the given number of lines from the given file.
- #' @param file_name The name of the file to read.
- #' @param line_count The number of lines to read.
- #' @return The file data
+ # @description
+ # Reads the given number of lines from the given file.
+ # @param file_name The name of the file to read.
+ # @param line_count The number of lines to read.
+ # @return The file data
read_lines = function(file_name, line_count) {
# The information message
msg <- paste0("Reading file: ", file_name)
# Information message is shown
- self$display_msg(msg, 1)
+ private$display_msg(msg, 1)
# File is opened for reading
con <- file(file_name)
# The file contents are read
@@ -163,17 +220,17 @@ TextFileProcessor <- R6::R6Class(
return (data)
- #' @description
- #' Writes the given data to the given file. The data may be appended to
- #' an existing file.
- #' @param data The data to be written.
- #' @param file_name The name of the file.
- #' @param is_append Indicates if data should be saved.
+ # @description
+ # Writes the given data to the given file. The data may be appended to
+ # an existing file.
+ # @param data The data to be written.
+ # @param file_name The name of the file.
+ # @param is_append Indicates if data should be saved.
write_file = function(data, file_name, is_append) {
# The information message
msg <- paste0("Saving file: ", file_name)
# Information message is shown
- self$display_msg(msg, 1)
+ private$display_msg(msg, 1)
# If the given data is a data frame
if ("data.frame" %in% class(data)) {
# The data frame is written to a file
@@ -193,13 +250,13 @@ TextFileProcessor <- R6::R6Class(
- #' @description
- #' Calculates the size of the given object or formats the given bytes
- #' object as a number without units. The returned number is the size in
- #' Mb.
- #' @param obj An object of class bytes or an object whoose size is to be
- #' found.
- #' @return The size formatted as a string.
+ # @description
+ # Calculates the size of the given object or formats the given bytes
+ # object as a number without units. The returned number is the size in
+ # Mb.
+ # @param obj An object of class bytes or an object whose size is to be
+ # found.
+ # @return The size formatted as a string.
format_size = function(obj) {
# If the obj is not of class bytes
if (!("bytes" %in% class(obj))) {
@@ -219,29 +276,29 @@ TextFileProcessor <- R6::R6Class(
- #' @description
- #' Saves the given object as a file.
- #' @param obj The object to save.
- #' @param file_name The file name.
+ # @description
+ # Saves the given object as a file.
+ # @param obj The object to save.
+ # @param file_name The file name.
save_obj = function(obj, file_name) {
# The information message
msg <- paste0("Saving file: ", file_name)
# Information message is shown
- self$display_msg(msg, 1)
+ private$display_msg(msg, 1)
# The object is saved to a file
saveRDS(obj, file_name)
- #' @description
- #' Reads the contents of the given file. Loads the file
- #' contents to a R object.
- #' @param file_name The file name.
- #' @return The loaded R obj.
+ # @description
+ # Reads the contents of the given file. Loads the file
+ # contents to a R object.
+ # @param file_name The file name.
+ # @return The loaded R obj.
read_obj = function(file_name) {
# The information message
msg <- paste0("Reading file: ", file_name)
# Information message is shown
- self$display_msg(msg, 1)
+ private$display_msg(msg, 1)
# If the file does not exist
if (!file.exists(file_name)) {
# The information message
@@ -257,83 +314,38 @@ TextFileProcessor <- R6::R6Class(
- #' @description
- #' Prints the given message depending on verbose settings.
- #' @param msg The message to be printed.
- #' @param min_debug The minimum debugging level
+ # @description
+ # Prints the given message depending on verbose settings.
+ # @param msg The message to be printed.
+ # @param min_debug The minimum debugging level
display_msg = function(msg, min_debug) {
# If verbose is >= min_debug , then message is displayed
- if (self$verbose >= min_debug) {
+ if (private$verbose >= min_debug) {
- #' @description
- #' Performs processing on the data. It should be
- #' overriden by a derived class.
- #' @param lines The lines to process
+ # @description
+ # Performs processing on the data. It should be
+ # overridden by a derived class.
+ # @param lines The lines to process
process = function(lines) {
- #' @description
- #' Performs post-processing on the processed data. It should be
- #' overriden by a derived class.
+ # @description
+ # Performs post-processing on the processed data. It should be
+ # overridden by a derived class.
post_process = function() {
- #' @description
- #' Performs pre-processing on the processed data. It should be
- #' overriden by a derived class.
+ # @description
+ # Performs pre-processing on the processed data. It should be
+ # overridden by a derived class.
pre_process = function() {
- ),
- private = list(
- # @field con The input file connection
- con = NULL,
- # @description
- # Reads the contents of the given file. Loads the file
- # contents to a R object, a data frame or character vector.
- # @param file_name The file name.
- # @param format The file format. 'plain' or 'obj'
- # @param opts Options for reading the file.
- # @return The required data.
- read_data = function(file_name, format, opts) {
- # If the format is plain
- if (format == "plain") {
- # The file is read
- data <- self$read_file(file_name, opts)
- }
- # If the format is obj
- else if (format == "obj") {
- # The file is read
- data <- self$read_obj(file_name)
- }
- return(data)
- },
- # @description
- # Writes the given data to a file. The data may be a R object, a
- # character vector or a data frame.
- # @param file_name The file name.
- # @param format The file format. 'plain' or 'obj'
- # @param opts Options for writting to the file.
- # @return The required data.
- write_data = function(data, file_name, format, opts) {
- # If the format is plain
- if (format == "plain") {
- # The data is written to a file
- self$write_file(data, file_name, opts)
- }
- # If the format is obj
- if (format == "obj") {
- # The R object is saved
- self$save_obj(data, file_name)
- }
- }
diff --git a/R/token-generator.R b/R/token-generator.R
index 6bd5a6a..6254971 100644
--- a/R/token-generator.R
+++ b/R/token-generator.R
@@ -1,39 +1,18 @@
#' It generates ngrams of given size from an input text file
#' @description
-#' It generates ngram tokens along with their frequencies.
+#' It generates ngram tokens along with their frequencies. The data
+#' may be saved to a file in plain text format or as a R object.
-#' @details
-#' It provides a method for generating ngrams of given size. It saves
-#' each ngram along with its frequency to a text file.
+#' @importFrom SnowballC wordStem
+#' @importFrom dplyr group_by summarize_all
TokenGenerator <- R6::R6Class(
inherit = TextFileProcessor,
public = list(
- #' @field tg_opts The options for the token generator obj.
- #' n -> The ngram size.
- #' save_ngrams -> If the ngram data should be saved.
- #' min_freq -> All ngrams with frequency less than min_freq are
- #' ignored.
- #' line_count -> The number of lines to process at a time.
- #' stem_words -> If words should be converted to their stem.
- #' dir -> The dir where the output file should be saved.
- #' format -> The format for the output. There are two options.
- #' 'plain' -> The data is stored in plain text.
- #' 'obj' -> The data is stored as a R obj.
- tg_opts = list(
- "n" = 1,
- "save_ngrams" = F,
- "min_freq" = -1,
- "line_count" = 5000,
- "stem_words" = F,
- "dir" = "./data/models",
- "format" = "obj"
- ),
#' @description
- #' It initializes the current obj. It is used to set the file name
- #' and verbose options.
+ #' It initializes the current obj. It is used to set the file name,
+ #' tokenization options and verbose option.
#' @param file_name The path to the input file.
#' @param opts The options for generating the ngram tokens.
#' n -> The ngram size.
@@ -47,57 +26,76 @@ TokenGenerator <- R6::R6Class(
#' 'plain' -> The data is stored in plain text.
#' 'obj' -> The data is stored as a R obj.
#' @param verbose Indicates if progress information should be displayed.
- initialize = function(file_name = "./data/models/validate-clean.txt",
- opts = self$tg_opts,
- verbose = 0) {
+ #' @export
+ initialize = function(file_name = NULL, opts = list(), verbose = 0) {
# The given options are merged with the opts attribute
- self$tg_opts <- modifyList(self$tg_opts, opts)
- # The tg_opts is merged with the base class opts attribute
- self$opts <- modifyList(self$opts, self$tg_opts)
+ private$tg_opts <- modifyList(private$tg_opts, opts)
# The base class is initialized
- super$initialize(file_name, self$opts[["line_count"]], verbose)
+ super$initialize(file_name, private$tg_opts$line_count, verbose)
# The processed output is initialized
- self$p_output <- NULL
+ private$p_output <- NULL
#' @description
- #' It generates ngram tokens and their frequencies from the given file
- #' name. The tokens may be saved to a text file as plain text or a R
- #' obj.
- #' @return The ngram tokens along with their frequencies.
+ #' It generates ngram tokens and their frequencies from the
+ #' given file name. The tokens may be saved to a text file as plain text
+ #' or a R object.
+ #' @return The data frame containing ngram tokens along with their
+ #' frequencies.
generate_tokens = function() {
# The processed output is initialized
- self$p_output <- NULL
+ private$p_output <- NULL
# The output file name
fn <- private$get_file_name()
# If the output file already exists
if (file.exists(fn)) {
# The information message
- msg <- paste0("The ", self$opts[["n"]],
+ msg <- paste0("The ", private$tg_opts[["n"]],
"-gram file already exists")
# The information message is shown
- self$display_msg(msg, 1)
+ private$display_msg(msg, 1)
# If the ngram data should not be saved
- if (!self$opts[["save_ngrams"]]) {
+ if (!private$tg_opts[["save_ngrams"]]) {
# The ngrams file is read
- self$p_output <- self$read_data(
- fn, self$opts[["format"]], T)
+ private$p_output <- private$read_data(
+ fn, private$tg_opts[["format"]], T)
else {
# The information message
msg <- paste0("Generating ",
- self$opts[["n"]], "-gram tokens...")
+ private$tg_opts[["n"]], "-gram tokens...")
# The information message is shown
- self$display_msg(msg, 1)
+ private$display_msg(msg, 1)
# The base class process_file function is called
- super$process_file(super$pre_process, private$process,
+ private$process_file(private$pre_process, private$process,
private = list(
+ # @field tg_opts The options for the token generator obj.
+ # n -> The ngram size.
+ # save_ngrams -> If the ngram data should be saved.
+ # min_freq -> All ngrams with frequency less than min_freq are
+ # ignored.
+ # line_count -> The number of lines to process at a time.
+ # stem_words -> If words should be converted to their stem.
+ # dir -> The dir where the output file should be saved.
+ # format -> The format for the output. There are two options.
+ # 'plain' -> The data is stored in plain text.
+ # 'obj' -> The data is stored as a R obj.
+ tg_opts = list(
+ "n" = 1,
+ "save_ngrams" = F,
+ "min_freq" = -1,
+ "line_count" = 5000,
+ "stem_words" = F,
+ "dir" = "./data/models",
+ "format" = "obj"
+ ),
# @description
# Performs processing for the \code{generate_tokens} function. It
# processes the given line of text. It converts each line of text into
@@ -107,13 +105,13 @@ TokenGenerator <- R6::R6Class(
# Ngrams are extracted from each line
ngrams <- private$generate_ngrams(lines)
# If the processed output is empty
- if (is.null(self$p_output)) {
+ if (is.null(private$p_output)) {
# The ngram words are set to the processed output
- self$p_output <- ngrams
+ private$p_output <- ngrams
else {
# The ngram words are appended to the processed output
- self$p_output <- c(self$p_output, ngrams)
+ private$p_output <- c(private$p_output, ngrams)
@@ -121,11 +119,11 @@ TokenGenerator <- R6::R6Class(
# It returns the name of the output ngram file.
get_file_name = function() {
# The ngram number
- n <- self$opts[["n"]]
+ n <- private$tg_opts[["n"]]
# The format
- fo <- self$opts[["format"]]
+ fo <- private$tg_opts[["format"]]
# The output directory
- dir <- self$opts[["dir"]]
+ dir <- private$tg_opts[["dir"]]
# The file extension
if (fo == "plain") ext <- ".txt"
else ext <- ".RDS"
@@ -141,36 +139,36 @@ TokenGenerator <- R6::R6Class(
post_process = function() {
# The information message
msg <- paste0("Calculating ",
- self$opts[["n"]], "-gram frequencies...")
+ private$tg_opts[["n"]], "-gram frequencies...")
# The information message is shown
- self$display_msg(msg, 1)
+ private$display_msg(msg, 1)
# The output is copied to a variable
- df <- data.frame("pre" = self$p_output)
+ df <- data.frame("pre" = private$p_output)
# A frequency column is added
df$freq <- 1
# Each prefix is grouped and summed
df <- df %>% group_by(pre) %>% summarize_all(sum)
# If the minimum ngram frequency is given
- if (self$opts[["min_freq"]] > -1) {
+ if (private$tg_opts[["min_freq"]] > -1) {
# The information message
msg <- paste0("Removing low frequency ngrams...")
# The information message is shown
- self$display_msg(msg, 2)
+ private$display_msg(msg, 2)
# All ngrams with frequency less than min_freq are ignored
- df <- df[df$freq >= self$opts[["min_freq"]], ]
+ df <- df[df$freq >= private$tg_opts[["min_freq"]], ]
# The column names are set
colnames(df) <- c("pre", "freq")
# The output is set to the updated variable
- self$p_output <- df
+ private$p_output <- df
# If the ngram data should be saved
- if (self$opts[["save_ngrams"]]) {
+ if (private$tg_opts[["save_ngrams"]]) {
# The required file name
fn <- private$get_file_name()
# The format
- fo <- self$opts[["format"]]
+ fo <- private$tg_opts[["format"]]
# The n-gram data frame is written to file
- private$write_data(self$p_output, fn, fo, F)
+ private$write_data(private$p_output, fn, fo, F)
@@ -179,7 +177,7 @@ TokenGenerator <- R6::R6Class(
# @param lines The lines of text to process
generate_ngrams = function(lines) {
# The ngram number
- n <- self$opts[["n"]]
+ n <- private$tg_opts[["n"]]
# If n > 1
if (n > 1) {
# Trailing and leading white space is removed
@@ -199,7 +197,7 @@ TokenGenerator <- R6::R6Class(
# The ngrams are generated
l <- sapply(indexes, function(i) {
# If the words should be stemmed
- if (self$tg_opts[["stem_words"]]) {
+ if (private$tg_opts[["stem_words"]]) {
# The ngram prefix words are stemmed. The next word is
# not stemmed
v <- c(wordStem(w[i:(i+n-2)]), w[(i+n-1)])
diff --git a/R/tp-generator.R b/R/tp-generator.R
index 7a2ebb9..f18e7b0 100644
--- a/R/tp-generator.R
+++ b/R/tp-generator.R
@@ -1,84 +1,74 @@
-#' It is used to generate the transition probabilities
+#' It is used to generate transition probabilities for n-grams.
#' @description
-#' It implements Markov Chains for ngrams. It generates transition
-#' probabilities for ngrams.
+#' It provides a method for generating transition probabilities for the given
+#' n-gram size. It also provides a method for generating the combined transition
+#' probabilities data for n-gram sizes from 1 to the given size. The combined
+#' transition probabilities data can be used to implement back-off.
#' @details
-#' It reads ngram frequencies from an input text file. It parses each
-#' ngram into a prefix, a next word and the next word frequency. The prefix is
-#' converted to a numeric hash using the digest2int function from the digest
-#' package. The next word is replaced with the position of the next word in the
-#' list of all words. The data is stored in a data frame. It may be saved to a
-#' file.
+#' It provides a method for generating n-gram transition probabilities. It reads
+#' ngram frequencies from an input text file that is generated by the
+#' TokenGenerator class. It parses each ngram into a prefix, a next word and the
+#' next word frequency. Maximum Likelihood count is used to generate the next
+#' word probabilities. The n-gram prefix is converted to a numeric hash. The
+#' next word is replaced with the position of the next word in the list of all
+#' words. The transition probabilities data is stored in a file. Another method
+#' is provided that combines the transition probabilities for n-grams of size 1
+#' to the given size. The combined transition probabilities can be saved as a
+#' data frame. This file may be regarded as a n-gram model. The name of the file
+#' is model-n.RDS, where n is the n-gram number. By combining the transition
+#' probabilities of n-grams, back-off may be used to evaluate word probabilities
+#' or predict the next word.
+#' @importFrom stringr str_match
+#' @importFrom digest digest2int
+#' @importFrom dplyr group_by mutate
TPGenerator <- R6::R6Class(
inherit = TextFileProcessor,
public = list(
#' @description
- #' It initializes the current obj. It is used to set the verbose option.
+ #' It initializes the current obj. It is used to set the
+ #' transition probabilities options and verbose option.
+ #' @param opts The options for generating the transition probabilities.
+ #' save_tp -> If the data should be saved.
+ #' n -> The ngram size.
+ #' dir -> The dir where the output file should be saved.
+ #' format -> The format for the output. There are two options.
+ #' 'plain' -> The data is stored in plain text.
+ #' 'obj' -> The data is stored as a R obj.
#' @param verbose If progress information should be displayed.
- initialize = function(verbose = 0) {
+ #' @export
+ initialize = function(opts = NULL, verbose = 0) {
+ # The given options are merged with the opts attribute
+ private$tp_opts <- modifyList(private$tp_opts, opts)
# The base class is initialized
super$initialize(NULL, NULL, verbose)
# The processed output is initialized
- self$p_output <- data.frame()
+ private$p_output <- data.frame()
#' @description
- #' It groups the given tp data by prefix. Each prefix has
- #' the top c next words with the highest probabilities. All other next
- #' words are removed. The trimmed tp data may be saved to a file or
- #' returned. The file is saved with the suffix -min.
- #' e.g model-4-min.RDS.
- #' @param opts The options for trimming the tp data.
- #' save_tp -> If the data should be saved.
- #' dir -> The dir where the output file should be saved.
- #' c -> The top c next words per ngrams prefix.
- #' m -> The maximum ngram number supported by the model.
- trim_tp = function(opts) {
- # The model file name
- fn <- paste0(opts$dir, "/model-", opts$m, ".RDS")
- # The model file is read to a data frame
- df <- self$read_obj(fn)
- },
- #' @description
- #' It generates the transition probabilities for the given
- #' ngram numbers It first generates the transition probabilities for
- #' each ngram number. The transition probabilities are then combined
- #' into a single data frame. The data frame may be saved to a file.
- #' @param opts The options for generating the transition probabilities.
- #' save_tp -> If the data should be saved.
- #' n_range -> The range of ngram numbers.
- #' dir -> The dir where the output file should be saved.
- #' format -> The format for the output. There are two options.
- #' 'plain' -> The data is stored in plain text.
- #' 'obj' -> The data is stored as a R obj.
- generate_tp = function(opts) {
- # The opts is merged with the tp_opts attribute
- private$tp_opts = modifyList(private$tp_opts, opts)
- # The tp_opts is merged with the base class opts attribute
- self$opts = modifyList(self$opts, private$tp_opts)
+ #' It combines the transition probabilities for n-grams
+ #' less than or equal to the given n-gram size. It first generates
+ #' the transition probabilities for each ngram size from 1 to the given
+ #' size. The transition probabilities are then combined into a single
+ #' data frame and saved to the output folder that is given as parameter
+ #' to the current object. By combining the transition probabilities for
+ #' all n-gram sizes from 1 to n, back-off can be used to calculate next
+ #' word probabilities or predict the next word.
+ generate_tp = function() {
# The information message
- msg <- paste0(
- "Generating Transition Probabilities for n = ",
- min(self$opts[["n_range"]]),
- ":",
- max(self$opts[["n_range"]])
- )
+ msg <- paste0("Generating Transition Probabilities for n = ")
+ msg <- paste0(msg, "1:", private$tp_opts[["n"]])
# Information message is shown
- self$display_msg(msg, 1)
+ private$display_msg(msg, 1)
# The processed output is cleared
- self$p_output <- data.frame()
- # The ngram number range
- n_range <- self$opts[["n_range"]]
- # The minimum ngram number
- nmin <- min(n_range)
- # The maximum ngram number
- nmax <- max(n_range)
+ private$p_output <- data.frame()
# The output format
- fo <- self$opts[["format"]]
+ fo <- private$tp_opts[["format"]]
+ # The n-gram number
+ nmax <- private$tp_opts[["n"]]
# The file extension
if (fo == "plain") ext <- ".txt"
else ext <- ".RDS"
@@ -87,17 +77,17 @@ TPGenerator <- R6::R6Class(
n = 1,
format = fo,
save_tp = T,
- dir = self$opts[["dir"]]
+ dir = private$tp_opts[["dir"]]
# The combined tp data
c_pre <- c_nw <- c_prob <- c()
# For each ngram number, the transition probabilities data is
# generated.
- for (n in n_range) {
+ for (n in 1:nmax) {
# The value of n is set
tp_opts$n <- n
# The transition probabilities or word list is generated
- self$generate_tp_for_n(tp_opts)
+ self$generate_tp_for_n(n)
# If n == 1, then word list data is saved
if (n == 1) {
# The combined tp data is saved
@@ -105,22 +95,24 @@ TPGenerator <- R6::R6Class(
else {
# c_pre is updated
- c_pre <- c(c_pre, self$p_output$pre)
+ c_pre <- c(c_pre, private$p_output$pre)
# c_nw is updated
- c_nw <- c(c_nw, self$p_output$nw)
+ c_nw <- c(c_nw, private$p_output$nw)
# c_prob is updated
- c_prob <- c(c_prob, self$p_output$prob)
+ c_prob <- c(c_prob, private$p_output$prob)
# The processed output is cleared
- self$p_output <- data.frame()
+ private$p_output <- data.frame()
# The processed output is set to the combined tp data
- self$p_output <-
+ private$p_output <-
data.frame("pre" = c_pre,
"nw" = c_nw,
"prob" = c_prob)
+ # The model file name
+ fn <- paste0("model-", nmax, ext)
# The combined tp data is saved
- private$save_data(paste0("model-", nmax, ext))
+ private$save_data(fn)
#' @description
@@ -128,22 +120,9 @@ TPGenerator <- R6::R6Class(
#' file. It generates a data frame containing the prefix, next word
#' and next word frequency. The data frame may be saved to a file as
#' plain text or as a R obj. For n = 1, the list of words is saved.
- #' @param opts The options for generating the transition probabilities.
- #' save_tp -> If the data should be saved.
- #' n -> The ngram number
- #' dir -> The location of the input and output files.
- #' format -> The format of the input and output files. Options are:
- #' 'plain' -> The data is stored in plain text.
- #' 'obj' -> The data is stored as a R obj.
- generate_tp_for_n = function(opts) {
- # The opts is merged with the tp_opts attribute
- private$tp_opts = modifyList(private$tp_opts, opts)
- # The tp_opts is merged with the base class opts attribute
- self$opts = modifyList(self$opts, private$tp_opts)
+ generate_tp_for_n = function(n) {
# The output format
- fo <- self$opts[["format"]]
- # The ngram number
- n <- self$opts[["n"]]
+ fo <- private$tp_opts[["format"]]
# The output file name
fn <- private$get_file_name(T)
# If the output file already exists
@@ -151,7 +130,7 @@ TPGenerator <- R6::R6Class(
# The information message
msg <- paste0("The file: ", fn, " already exists")
# The information message is shown
- self$display_msg(msg, 1)
+ private$display_msg(msg, 1)
# The file is read
data <- private$read_data(fn, fo, T)
# If n = 1
@@ -161,19 +140,19 @@ TPGenerator <- R6::R6Class(
else {
# The processed output is set to the data
- self$p_output <- data
+ private$p_output <- data
else {
# The information message
msg <- paste0("Generating Transition Probabilities for n=", n)
# Information message is shown
- self$display_msg(msg, 1)
+ private$display_msg(msg, 1)
# The input file name
- self$file_name <- private$get_file_name(F)
+ private$file_name <- private$get_file_name(F)
# The data is read
- df <- private$read_data(self$file_name, fo, T)
+ df <- private$read_data(private$file_name, fo, T)
# If n = 1
if (n == 1) {
# The word list is set to the data frame
@@ -201,13 +180,13 @@ TPGenerator <- R6::R6Class(
"nw" = nw,
"freq" = nf)
# The processed output is set to the data frame
- self$p_output <- df
+ private$p_output <- df
# The next word probabilities are generated
# The frequency column is removed
- self$p_output$freq <- NULL
+ private$p_output$freq <- NULL
# If the data should be saved
- if (self$opts[["save_tp"]]) {
+ if (private$tp_opts[["save_tp"]]) {
@@ -227,7 +206,7 @@ TPGenerator <- R6::R6Class(
tp_opts = list(
"save_tp" = F,
"n" = 1,
- "dir" = "./data/models",
+ "dir" = "./models",
"format" = "obj"
@@ -239,13 +218,13 @@ TPGenerator <- R6::R6Class(
# saves the transition probability data to a file.
generate_probs = function() {
# Information message is shown
- self$display_msg("Generating Transition Probabilities...", 1)
+ private$display_msg("Generating Transition Probabilities...", 1)
# The ngram number
- n <- self$opts[["n"]]
+ n <- private$tp_opts[["n"]]
# If n > 1
if (n > 1) {
# The output is copied to a variable
- df <- self$p_output
+ df <- private$p_output
# A new probability column is added. It is set to the sum of
# frequency column for each prefix group.
df <- df %>%
@@ -254,7 +233,7 @@ TPGenerator <- R6::R6Class(
# Each frequency is divided by the sum to give the probability.
df$prob <- round(df$freq/df$prob, 8)
# The output is set to the updated variable
- self$p_output <- df
+ private$p_output <- df
@@ -263,11 +242,11 @@ TPGenerator <- R6::R6Class(
# @param is_output If the output file name is required.
get_file_name = function(is_output) {
# The ngram number
- n <- self$opts[["n"]]
+ n <- private$tp_opts[["n"]]
# The directory
- od <- self$opts[["dir"]]
+ od <- private$tp_opts[["dir"]]
# The format
- fo <- self$opts[["format"]]
+ fo <- private$tp_opts[["format"]]
# The file extension
if (fo == "plain") ext <- ".txt"
else ext <- ".RDS"
@@ -300,11 +279,11 @@ TPGenerator <- R6::R6Class(
# @param file_name The file name to use.
save_data = function(file_name = NULL) {
# The ngram number
- n <- self$opts[["n"]]
+ n <- private$tp_opts[["n"]]
# The directory
- od <- self$opts[["dir"]]
+ od <- private$tp_opts[["dir"]]
# The format
- fo <- self$opts[["format"]]
+ fo <- private$tp_opts[["format"]]
# If n = 1
if (n == 1) {
# The data to save
@@ -313,7 +292,7 @@ TPGenerator <- R6::R6Class(
# If n > 1
else if (n > 1) {
# The data to save
- data <- self$p_output
+ data <- private$p_output
# If the file name is given as parameter then it is used
if (!is.null(file_name)) fn <- paste0(od, "/", file_name)
@@ -328,14 +307,14 @@ TPGenerator <- R6::R6Class(
# If the word list has not been read
if (nrow(private$wl) == 0) {
# The format
- fo <- self$opts[["format"]]
+ fo <- private$tp_opts[["format"]]
# The file extension
if (fo == "plain") ext <- ".txt"
else ext <- ".RDS"
# The 1-gram words file name
- fn <- paste0(self$opts[["dir"]], "/words", ext)
+ fn <- paste0(private$tp_opts[["dir"]], "/words", ext)
# The words are read
- private$wl <- private$read_data(fn, self$opts[["format"]], F)
+ private$wl <- private$read_data(fn, private$tp_opts[["format"]], F)
diff --git a/inst/extdata/bad-words.txt b/inst/extdata/bad-words.txt
new file mode 100644
index 0000000..d2dacc0
--- /dev/null
+++ b/inst/extdata/bad-words.txt
@@ -0,0 +1,1734 @@
+2 girls 1 cup
+alabama hot pocket
+alaskan pipeline
+anal impaler
+anal leakage
+ass fuck
+ass hole
+auto erotic
+baby batter
+baby juice
+ball gag
+ball gravy
+ball kicking
+ball licking
+ball sack
+ball sucking
+bang (one's) box
+barely legal
+batty boy
+beaver cleaver
+beaver lips
+beef curtain
+beef curtains
+big black
+big breasts
+big knockers
+big tits
+bitch tit
+black cock
+blonde action
+blonde on blonde action
+bloody hell
+blow job
+blow me
+blow mud
+blow your load
+blue waffle
+booty call
+brown showers
+brunette action
+bull shit
+bullet vibe
+bum boy
+bung hole
+bunny fucker
+bust a load
+butt fuck
+butt plug
+camel toe
+carpet muncher
+chi-chi man
+chick with a dick
+choc ice
+chocolate rosebuds
+chota bags
+cleveland steamer
+clit licker
+clitty litter
+clover clamps
+cock pocket
+cock snot
+cock sucker
+coffin dodger
+cop some wood
+corp whore
+cum chugger
+cum dumpster
+cum freak
+cum guzzler
+cunt hair
+cut rope
+date rape
+deep throat
+dick head
+dick hole
+dick shy
+dirty pillows
+dirty sanchez
+dog style
+doggie style
+doggy style
+donkey punch
+double dong
+double penetration
+dp action
+dry hump
+dumb ass
+eat a dick
+eat hair pie
+eat my ass
+f u c k
+f u c k e r
+female squirting
+fist fuck
+flog the log
+foot fetish
+fuck buttons
+fuck hole
+fuck off
+fuck puppet
+fuck trophy
+fuck yo mama
+fuck you
+fudge packer
+gang bang
+gassy ass
+gay sex
+gender bender
+giant cock
+girl on
+girl on top
+girls gone wild
+god damn
+golden shower
+goo girl
+group sex
+ham flap
+hand job
+hard core
+hard on
+holy shit
+hot carl
+hot chick
+how to kill
+how to murdep
+how to murder
+huge fat
+iberian slap
+jack off
+jail bait
+jelly donut
+jerk off
+jungle bunny
+knob end
+leather restraint
+leather straight jacket
+lemon party
+make me come
+male squirting
+menage a trois
+middle finger
+missionary position
+moo moo foo foo
+mother fucker
+mound of venus
+mr hands
+muff diver
+muff puff
+need the dick
+nig nog
+nob jokey
+nsfw images
+nut butter
+nut sack
+old bag
+one cup two girls
+one guy one jar
+phone sex
+piece of shit
+piss off
+piss pig
+pissed off
+pleasure chest
+pole smoker
+poop chute
+porch monkey
+prince albert piercing
+pussy fart
+pussy palace
+raging boner
+reverse cowgirl
+rosy palm
+rosy palm and her 5 sisters
+rusty trombone
+s hit
+sand nigger
+sausage queen
+shaved beaver
+shaved pussy
+shirt lifter
+shit ass
+shit fucker
+slut bucket
+sod off
+son of a bitch
+son of a motherless goat
+son of a whore
+splooge moose
+spread legs
+strap on
+strip club
+style doggy
+suicide girls
+sultry women
+tainted love
+taking the piss
+taste my
+tea bagging
+tied up
+tight white
+tit wank
+tongue in a
+tub girl
+two fingers
+two fingers with tongue
+two girls one cup
+urethra play
+venus mound
+violet wand
+wet dream
+white power
+window licker
+wrapping men
+wrinkled starfish
+yellow showers
diff --git a/inst/extdata/dict-no-bad.txt b/inst/extdata/dict-no-bad.txt
new file mode 100644
index 0000000..788ce82
--- /dev/null
+++ b/inst/extdata/dict-no-bad.txt
@@ -0,0 +1,54378 @@
diff --git a/inst/extdata/stop-words.txt b/inst/extdata/stop-words.txt
new file mode 100644
index 0000000..f6e9200
--- /dev/null
+++ b/inst/extdata/stop-words.txt
@@ -0,0 +1,174 @@
diff --git a/man/DataAnalyzer.Rd b/man/DataAnalyzer.Rd
new file mode 100644
index 0000000..5aba097
--- /dev/null
+++ b/man/DataAnalyzer.Rd
@@ -0,0 +1,182 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/data-analyzer.R
+\title{It is used to analyze text data}
+It provides information on text files, such as number of lines
+and number of words. It allows generating sample file from an input text
+file. It displays bar plots showing word frequencies.
+It provides a method that returns text file information. The text
+file information includes total number of lines, max, min and mean line
+length and file size. It also provides a method that takes random samples of
+lines in an input text file. It provides a method that reads an input text
+file containing token frequencies. It displays the most occurring tokens.
+\section{Super class}{
+\code{\link[wordpredictor:TextFileProcessor]{wordpredictor::TextFileProcessor}} -> \code{DataAnalyzer}
+\subsection{Public methods}{
+\item \href{#method-new}{\code{DataAnalyzer$new()}}
+\item \href{#method-plot_data}{\code{DataAnalyzer$plot_data()}}
+\item \href{#method-get_file_info}{\code{DataAnalyzer$get_file_info()}}
+\item \href{#method-generate_data}{\code{DataAnalyzer$generate_data()}}
+\item \href{#method-get_ngrams}{\code{DataAnalyzer$get_ngrams()}}
+\item \href{#method-clone}{\code{DataAnalyzer$clone()}}
+\out{Inherited methods
+\item \out{}\href{../../wordpredictor/html/TextFileProcessor.html#method-display_msg}{\code{wordpredictor::TextFileProcessor$display_msg()}}\out{}
+\item \out{}\href{../../wordpredictor/html/TextFileProcessor.html#method-format_size}{\code{wordpredictor::TextFileProcessor$format_size()}}\out{}
+\item \out{}\href{../../wordpredictor/html/TextFileProcessor.html#method-post_process}{\code{wordpredictor::TextFileProcessor$post_process()}}\out{}
+\item \out{}\href{../../wordpredictor/html/TextFileProcessor.html#method-pre_process}{\code{wordpredictor::TextFileProcessor$pre_process()}}\out{}
+\item \out{}\href{../../wordpredictor/html/TextFileProcessor.html#method-process}{\code{wordpredictor::TextFileProcessor$process()}}\out{}
+\item \out{}\href{../../wordpredictor/html/TextFileProcessor.html#method-process_file}{\code{wordpredictor::TextFileProcessor$process_file()}}\out{}
+\item \out{}\href{../../wordpredictor/html/TextFileProcessor.html#method-read_file}{\code{wordpredictor::TextFileProcessor$read_file()}}\out{}
+\item \out{}\href{../../wordpredictor/html/TextFileProcessor.html#method-read_lines}{\code{wordpredictor::TextFileProcessor$read_lines()}}\out{}
+\item \out{}\href{../../wordpredictor/html/TextFileProcessor.html#method-read_obj}{\code{wordpredictor::TextFileProcessor$read_obj()}}\out{}
+\item \out{}\href{../../wordpredictor/html/TextFileProcessor.html#method-save_obj}{\code{wordpredictor::TextFileProcessor$save_obj()}}\out{}
+\item \out{}\href{../../wordpredictor/html/TextFileProcessor.html#method-write_file}{\code{wordpredictor::TextFileProcessor$write_file()}}\out{}