From c83ca0d2f1287df58e993998fd7f24b8accc2072 Mon Sep 17 00:00:00 2001
From: pakjiddat
Date: Tue, 11 May 2021 06:45:53 +0000
Subject: [PATCH] Add source files

---
 R/data-analyzer.R       |  249 +++++++++
 R/data-cleaner.R        |  244 +++++++++
 R/model-evaluator.R     | 1136 +++++++++++++++++++++++++++++++++++++++
 R/text-file-processor.R |  339 ++++++++++++
 R/token-generator.R     |  241 +++++++++
 R/tp-generator.R        |  342 ++++++++++++
 6 files changed, 2551 insertions(+)
 create mode 100644 R/data-analyzer.R
 create mode 100644 R/data-cleaner.R
 create mode 100644 R/model-evaluator.R
 create mode 100644 R/text-file-processor.R
 create mode 100644 R/token-generator.R
 create mode 100644 R/tp-generator.R

diff --git a/R/data-analyzer.R b/R/data-analyzer.R
new file mode 100644
index 0000000..6484074
--- /dev/null
+++ b/R/data-analyzer.R
@@ -0,0 +1,249 @@
+#' It is used to analyze text data
+#'
+#' @description
+#' It provides information on text files, such as the number of lines and
+#' number of words. It allows generating a sample file from an input text
+#' file. It displays bar plots showing word frequencies.
+#'
+#' @details
+#' It provides a method that returns text file information. The text
+#' file information includes the total number of lines, the maximum, minimum
+#' and mean line length, and the file size. It also provides a method that
+#' takes random samples of lines from an input text file. It provides a
+#' method that reads an input text file containing token frequencies and
+#' displays the most frequently occurring tokens.
+DataAnalyzer <- R6::R6Class(
+    "DataAnalyzer",
+    inherit = TextFileProcessor,
+    public = list(
+        #' @description
+        #' It initializes the current object. It is used to set the file name
+        #' and verbose options.
+        #' @param file_name The path to the input file.
+        #' @param verbose If progress information should be displayed.
+        initialize = function(file_name = "./data/n1.txt",
+                              verbose = 0) {
+            # The file name is set
+            self$file_name <- file_name
+            # The processed output is initialized
+            self$p_output <- data.frame()
+            # The verbose option is set
+            self$verbose <- verbose
+        },
+
+        #' @description
+        #' It reads ngram token frequencies from an input text file. The ngram
+        #' frequencies are then displayed in a bar plot. The type of plot is
+        #' specified by the type option. 'top_features' displays the n most
+        #' frequently occurring tokens along with their frequencies.
+        #' 'coverage' displays the percentage of all tokens that occur at each
+        #' frequency from 1 to n.
+        #' @param opts The options for analyzing the data.
+        #'      type -> The type of plot to display. The options are:
+        #'        'top_features', 'coverage'.
+        #'      n -> For 'top_features', it is the number of top tokens to
+        #'        display. For 'coverage', it is the maximum token frequency
+        #'        to consider.
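+        #' A short usage sketch is given below; the ngram file path is an
+        #' illustrative assumption, not part of this patch.
+        #' @examples
+        #' \dontrun{
+        #' da <- DataAnalyzer$new("./data/n1.txt", verbose = 1)
+        #' # The 10 most frequent tokens are plotted
+        #' da$plot_data(list("type" = "top_features", "n" = 10))
+        #' # The percentage of tokens at each frequency from 1 to 10 is
+        #' # plotted
+        #' da$plot_data(list("type" = "coverage", "n" = 10))
+        #' }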
+        plot_data = function(opts) {
+            # The opts is merged with the da_opts attribute
+            private$da_opts <- modifyList(private$da_opts, opts)
+            # The da_opts is merged with the base class opts attribute
+            self$opts <- modifyList(self$opts, private$da_opts)
+            # The ngram data is read
+            df <- self$read_file(self$file_name, T)
+            # The information message is shown
+            self$display_msg("Displaying Plot...", 1)
+            # If the coverage option was specified
+            if (self$opts[["type"]] == "coverage") {
+                # The y values
+                y <- as.character(1:self$opts[["n"]])
+                # The x values
+                x <- numeric()
+                # The percentage frequencies are calculated
+                for (i in 1:self$opts[["n"]]) {
+                    # The percentage of tokens with frequency i
+                    x[i] <- round(100*(nrow(df[df$freq == i,])/nrow(df)), 2)
+                }
+                # A data frame is created
+                df <- data.frame("freq" = x, "pre" = y)
+                # The plot labels
+                labels <- list(
+                    y = "Percentage of total",
+                    x = "Word Frequency",
+                    title = "Coverage")
+
+                # The chart is plotted
+                private$display_plot(df, labels)
+            }
+            # If the top_features option was specified
+            else if (self$opts[["type"]] == "top_features") {
+                # The plot labels
+                labels <- list(
+                    y = "Frequency",
+                    x = "Feature",
+                    title = paste("Top", self$opts[["n"]], "Features"))
+
+                # The chart is plotted
+                private$display_plot(df, labels)
+            }
+        },
+
+        #' @description
+        #' Generates and returns information about the given text files.
+        #' @param file_list The list of text files to check.
+        #' @return A data frame containing the overall file statistics.
+        get_file_info = function(file_list = list()) {
+            # If the file list is empty, then the file name passed to the
+            # current object is used.
+            if (length(file_list) == 0)
+                file_list <- list(self$file_name)
+
+            # A data frame used to store the combined file statistics. The
+            # minimum line length starts at Inf so the first file updates it.
+            stats <- data.frame(
+                "total_line_count" = 0,
+                "max_line_length" = 0,
+                "min_line_length" = Inf,
+                "mean_line_length" = 0,
+                "total_size" = 0)
+            # Temporary variables for calculating max and min line length
+            temp_max <- temp_min <- 0
+            # The total number of characters in all files. It is used to
+            # calculate the mean line length.
+            total_chars <- 0
+
+            # For each file in the list
+            for (file_name in file_list) {
+                # The file is read
+                lines <- self$read_file(file_name, F)
+
+                # The file stats are updated
+                stats["total_size"] <-
+                    stats["total_size"] + file.size(file_name)
+                stats["total_line_count"] <-
+                    stats["total_line_count"] + length(lines)
+
+                # The temporary variables are updated
+                temp_max <- max(nchar(lines))
+                temp_min <- min(nchar(lines))
+                total_chars <- total_chars + sum(nchar(lines))
+
+                if (temp_max > stats["max_line_length"])
+                    stats["max_line_length"] <- temp_max
+                # The minimum is updated if the current file has a smaller
+                # minimum line length
+                if (temp_min < stats["min_line_length"])
+                    stats["min_line_length"] <- temp_min
+            }
+            # The mean line length is calculated over all lines in all files
+            stats["mean_line_length"] <-
+                round(total_chars/stats[["total_line_count"]])
+            # The total size is formatted
+            stats["total_size"] <- utils:::format.object_size(
+                stats["total_size"], "auto")
+
+            # The required data is returned
+            return(stats)
+        },
+
+        #' @description
+        #' It generates training, testing and validation data sets
+        #' from the given input file. It first reads the file given as a
+        #' parameter to the current object. It generates random indexes for
+        #' the data. It partitions the data into disjoint training, testing
+        #' and validation sets, according to the given proportions. The files
+        #' are named train.txt, test.txt and validate.txt. The files are
+        #' saved to the given output folder.
+        #' @param dir The name of the output folder.
+        #' @param percs The proportions of the training, testing and
+        #'      validation sets.
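+        #' A hedged usage sketch; the input file path and the split
+        #' proportions are illustrative assumptions.
+        #' @examples
+        #' \dontrun{
+        #' da <- DataAnalyzer$new("./data/input.txt")
+        #' # 80% training, 10% testing and 10% validation data is generated
+        #' da$generate_data("./data", list(
+        #'     "train" = 0.8, "test" = 0.1, "validate" = 0.1))
+        #' }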
+        generate_data = function(dir, percs) {
+            # The information message is shown
+            self$display_msg(
+                "Generating training, testing and validation data sets...", 1)
+            # If the train, test and validation files already exist
+            if (file.exists(paste0(dir, "/train.txt")) &&
+                file.exists(paste0(dir, "/test.txt")) &&
+                file.exists(paste0(dir, "/validate.txt"))) {
+                # The information message
+                msg <- "The train, test and validate files already exist"
+                # The information message is shown
+                self$display_msg(msg, 1)
+            }
+            else {
+                # The input file is read
+                data <- self$read_file(self$file_name, F)
+                # The number of lines in the data
+                lc <- length(data)
+                # Random indexes are generated
+                indexes <- sample(1:lc, lc)
+                # The randomized data
+                rd <- data[indexes]
+                # The end index of the training set
+                e1 <- round(lc*percs[["train"]])
+                # The end index of the testing set
+                e2 <- e1 + round(lc*percs[["test"]])
+                # The end index of the validation set. It is capped at the
+                # number of lines.
+                e3 <- min(lc, e2 + round(lc*percs[["validate"]]))
+                # The training set data
+                train_ds <- rd[1:e1]
+                # The testing set data. It does not overlap with the
+                # training set.
+                test_ds <- rd[(e1 + 1):e2]
+                # The validation set data. It does not overlap with the
+                # training or testing sets.
+                validate_ds <- rd[(e2 + 1):e3]
+                # The training data is written to file
+                self$write_file(train_ds, paste0(dir, "/train.txt"), F)
+                # The testing data is written to file
+                self$write_file(test_ds, paste0(dir, "/test.txt"), F)
+                # The validation data is written to file
+                self$write_file(validate_ds, paste0(dir, "/validate.txt"), F)
+            }
+        },
+
+        #' @description
+        #' Returns the given number of ngrams and their
+        #' frequencies. If the prefix parameter is not given, then the ngrams
+        #' are randomly chosen. Otherwise ngrams starting with the given
+        #' regular expression are returned.
+        #' @param fn The ngram file name.
+        #' @param c The number of ngrams to return.
+        #' @param pre The ngram prefix, given as a regular expression.
+        get_ngrams = function(fn, c = NULL, pre = NULL) {
+            # The data is read
+            df <- self$read_obj(fn)
+            # If the prefix is not given
+            if (is.null(pre)) {
+                # The sample indexes
+                i <- sample(1:nrow(df), c)
+                # The ngram samples
+                s <- df[i,]
+            }
+            else {
+                # The ngram samples
+                s <- df[grepl(pre, df$pre), ]
+            }
+            return(s)
+        }
+    ),
+
+    private = list(
+        # @field da_opts The options for the data analyzer object.
+        #   type -> The type of plot to display. The options are:
+        #     'top_features', 'coverage'.
+        #   n -> For 'top_features', it is the number of most frequently
+        #     occurring tokens to display.
+        da_opts = list(
+            "type" = "top_features",
+            "n" = 10
+        ),
+
+        # @description
+        # Displays a plot using ggplot2. The plot is a horizontal
+        # bar plot filled with red. It has the given labels and main title.
+        # @param df The data to plot. It is a data frame with pre and freq
+        # columns.
+        # @param labels The main title, x and y axis labels.
+        display_plot = function(df, labels) {
+            # The freq column is converted to numeric
+            df$freq <- as.numeric(df$freq)
+            # The pre column is converted to character
+            df$pre <- as.character(df$pre)
+            # The data frame is sorted in descending order of frequency
+            df <- (df[order(df$freq, decreasing = T),])
+            # The top n terms are extracted
+            df <- df[1:self$opts[["n"]], ]
+            # The ngram names and their frequencies are plotted
+            g <- ggplot(data = df, aes(x = reorder(pre, freq), y = freq)) +
+                geom_bar(stat = "identity", fill = "red") +
+                ggtitle(labels[["title"]]) +
+                coord_flip() +
+                ylab(labels[["y"]]) +
+                xlab(labels[["x"]])
+            print(g)
+        }
+    )
+)
diff --git a/R/data-cleaner.R b/R/data-cleaner.R
new file mode 100644
index 0000000..61ec87c
--- /dev/null
+++ b/R/data-cleaner.R
@@ -0,0 +1,244 @@
+#' Provides data cleaning functionality
+#'
+#' @description
+#' Allows removing unneeded characters from text files.
+#'
+#' @details
+#' It provides a method for cleaning text files. It allows removing bad words,
+#' stop words, non dictionary words, extra space, punctuation and non-alphabet
+#' characters, and converting text to lower case. It supports large text
+#' files.
+DataCleaner <- R6::R6Class(
+    "DataCleaner",
+    inherit = TextFileProcessor,
+    public = list(
+        #' @field dc_opts The options for the data cleaner object.
+        #'      min_words -> The minimum number of words per sentence.
+        #'      line_count -> The number of lines to read and clean at a time.
+        #'      save_data -> If the combined processed lines should be saved.
+        #'      output_file -> Name of the output file used to store the data.
+        #'      sw_file -> The stop words file path.
+        #'      dict_file -> The dictionary file path.
+        #'      bad_file -> The bad words file path.
+        #'      to_lower -> If the words should be converted to lower case.
+        #'      remove_stop -> If stop words should be removed.
+        #'      remove_punct -> If punctuation symbols should be removed.
+        #'      remove_non_dict -> If non dictionary words should be removed.
+        #'      remove_non_alpha -> If non alphabet symbols should be removed.
+        #'      remove_extra_space -> If leading, trailing and double spaces
+        #'        should be removed.
+        #'      remove_bad -> If bad words should be removed.
+        dc_opts = list(
+            "min_words" = 2,
+            "line_count" = 1000,
+            "save_data" = T,
+            "output_file" = "./data/sample-clean.txt",
+            "sw_file" = "./data/stop-words.txt",
+            "dict_file" = "./data/dict-no-bad.txt",
+            "bad_file" = "./data/bad-words.txt",
+            "to_lower" = T,
+            "remove_stop" = F,
+            "remove_punct" = T,
+            "remove_non_dict" = T,
+            "remove_non_alpha" = T,
+            "remove_extra_space" = T,
+            "remove_bad" = F
+        ),
+
+        #' @field sw The list of stop words.
+        sw = list(),
+
+        #' @field bw The list of bad words.
+        bw = list(),
+
+        #' @field dw The list of dictionary words.
+        dw = list(),
+
+        #' @description
+        #' It initializes the current object. It is used to set the file name,
+        #' data cleaning options and verbose option.
+        #' @param file_name The path to the file to clean.
+        #' @param opts The options for data cleaning.
+        #'      min_words -> The minimum number of words per sentence.
+        #'      line_count -> The number of lines to read and clean at a time.
+        #'      save_data -> If the combined processed lines should be saved.
+        #'      output_file -> Name of the output file used to store the data.
+        #'      sw_file -> The stop words file path.
+        #'      dict_file -> The dictionary file path.
+        #'      bad_file -> The bad words file path.
+        #'      to_lower -> If the words should be converted to lower case.
+        #'      remove_stop -> If stop words should be removed.
+        #'      remove_punct -> If punctuation symbols should be removed.
+        #'      remove_non_dict -> If non dictionary words should be removed.
+        #'      remove_non_alpha -> If non alphabet symbols should be removed.
+        #'      remove_extra_space -> If leading, trailing and double spaces
+        #'        should be removed.
+        #'      remove_bad -> If bad words should be removed.
+        #' @param verbose Indicates if progress information should be
+        #'      displayed.
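+        #' A minimal usage sketch, assuming the default stop word, dictionary
+        #' and bad word files exist; the input file path is an assumption.
+        #' @examples
+        #' \dontrun{
+        #' dc <- DataCleaner$new("./data/sample.txt",
+        #'     opts = list("remove_stop" = T, "remove_bad" = T),
+        #'     verbose = 1)
+        #' dc$clean_file()
+        #' }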
+ initialize = function(file_name = NULL, + opts = self$dc_opts, + verbose = 0) { + # The given options are merged with the opts attribute + self$dc_opts <- modifyList(self$dc_opts, opts) + # The dc_opts is merged with the base class opts attribute + self$opts <- modifyList(self$opts, self$dc_opts) + # The stop words file is read + self$sw <- self$read_file(self$opts[["sw_file"]], F); + # The dictionary file is read + self$dw <- self$read_file(self$opts[["dict_file"]], F); + # The bad word file is read + self$bw <- self$read_file(self$opts[["bad_file"]], F); + # The base class is initialized + super$initialize(file_name, self$opts[['line_count']], verbose) + }, + + #' @description + #' It removes unneeded characters from the given text with + #' several options. It allows removing punctuations, numbers, symbols, + #' urls and separators. It allows removing bad words, stop words and + #' words not in the given dictionary file. It reads the given file one + #' line at a time, removing unneeded characters. After every line_count + #' number of lines, the cleaned lines are saved to the output file. + clean_file = function() { + # The information message + msg <- paste0("Cleaning the sample file...", self$file_name) + # The information message is shown + self$display_msg(msg, 1) + # The base class process_file function is called + super$process_file(super$pre_process, private$process, + super$post_process) + }, + + #' @description + #' It cleans the given lines of text using the options + #' passed to the current object. + #' @param lines The input sentences. + #' @return The cleaned lines of text. + clean_lines = function(lines) { + # The lines to clean + l <- lines + # If a line does not end with a ".", then "." is appended to the + # line + l <- gsub("(.+[^\\.])$", "\\1.", l) + # The "." 
character is replaced with the string "specialdotsep" + l <- gsub("\\.", " specialdotsep ", l) + # If the words should be converted to lower case + if (self$opts[["to_lower"]]) { + # The information message + self$display_msg("Converting lines to lower case...", 3) + # The line is converted to lower case + l <- tolower(l) + } + # If punctuation symbols should be removed + if (self$opts[["remove_punct"]]) { + # The information message + self$display_msg("Removing punctuation symbols...", 3) + # The pattern for removing all punctuation symbols + l <- gsub("[[:punct:]]+", "", l) + } + # If non alphabet symbols should be removed + if (self$opts[["remove_non_alpha"]]) { + # The information message + self$display_msg("Removing non alphabet symbols...", 3) + # Words containing non alphabetical characters are removed + l <- gsub("([^[:alpha:]\\s])", "", l, perl = T) + } + + # If stop words should be removed + if (self$opts[["remove_stop"]]) { + # The information message + self$display_msg("Removing stop words..", 3) + # Stop words are collapsed + sw <- paste(self$sw, collapse = "|") + swp <- paste("\\b(", sw, ")\\b", sep = "") + # The stop words are removed + l <- gsub(swp, "", l) + } + # If extra spaces should be removed + if (self$opts[["remove_extra_space"]]) { + # The information message + self$display_msg("Removing extra space...", 3) + # Multiple spaces are replaced by single space + l = gsub("\\s{2,}", " ", l) + # Leading and trailing whitespaces are removed + l = trimws(l) + } + + # The words in the lines are extracted + words <- strsplit(l, split = " ") + # The words are converted to an atomic list + words <- unlist(words) + # If non dictionary words should be removed + if (self$opts[["remove_non_dict"]]) { + # The "specialdotsep" string is added to list of dictionary + # words + dw <- c(self$dw, "specialdotsep") + # The non dictionary words are removed from the data + words <- words[words %in% dw] + # All 1 length words except for 'a' and 'i' are removed + # The indexes position of all words that are "a" or "i" + i1 <- (words == "a" | words == "i") + # The index position of words of length 2 or more + i2 <- (nchar(words) > 1) + # The list of all words of length 2 or more including "a" and + # "i" + words <- words[i1 | i2] + } + # If bad words should be removed + if (self$opts[["remove_bad"]]) { + # The "specialdotsep" string is added to list of bad words + bw <- c(self$bw, "specialdotsep") + # The bad words are removed from the data + words <- words[!words %in% self$bw] + } + # The words are combined with space + l <- paste(words, collapse = " ") + # The "specialdotsep" string is replaced with "." 
+ l <- gsub("specialdotsep", ".", l) + # The sentences in the lines are extracted + l <- strsplit(l, split = "\\.") + # The sentences are converted to an atomic list + l <- unlist(l) + # If extra spaces should be removed + if (self$opts[["remove_extra_space"]]) { + # Multiple spaces are replaced by single space + l = gsub("\\s{2,}", " ", l) + # Leading and trailing whitespaces are removed + l = trimws(l) + } + + # If each sentence should have a minimum number of words + if (self$opts[["min_words"]] > -1) { + # The number of words in each sentence + wc <- str_count(l, pattern = boundary("word")) + # The lines containing less than min_words number of words are + # removed + l <- l[wc >= self$opts[["min_words"]]] + } + + # Consecutive 'a' and 'i' are replaced with single 'a' or 'i' + l <- gsub("(a\\s){2,}", "\\1 ", l) + l <- gsub("(i\\s){2,}", "\\1 ", l) + l <- gsub("a$", "", l) + + return(l) + } + ), + + private = list( + + # @description + # Performs processing for the \code{clean_files} function. + # It processes the given lines of text. It divides the given lines of + # text into sentences by spliting on '.'. Each sentence is then cleaned + # using \code{clean_line}. If the number of words in the cleaned + # sentence is less than min_words, then the sentence is rejected. + # @param lines The lines of text to clean. + # @return The processed line is returned. + process = function(lines) { + # The sentence is cleaned + cl <- self$clean_lines(lines) + + return(cl) + } + ) +) diff --git a/R/model-evaluator.R b/R/model-evaluator.R new file mode 100644 index 0000000..ca02aa1 --- /dev/null +++ b/R/model-evaluator.R @@ -0,0 +1,1136 @@ +#' It is used to evaluate the accuracy and performance of the model. +#' +#' @description +#' It provides methods that perform extrinsic and intrinsic model +#' evaluation. It also provides methods for determining the memory and time +#' requirements for generating the model. It also provides a method for +#' determining how much memory is used by the final model. +#' +#' @details +#' It provides a method that performs intrinsic model evaluation based +#' on Perplexity. It also provides a method that performs extrinsic model +#' evalation based on accuracy. It provides a method for determining how much +#' memory and time is needed to generate a model for different input data +#' sizes. It provides a method for determining how much memory is needed by +#' the final model. +ModelEvaluator <- R6::R6Class( + "ModelEvaluator", + inherit = TextFileProcessor, + public = list( + #' @field tp The transition probabilities data frame. + tp = NULL, + + #' @field wl The list of unique words. + wl = NULL, + + #' @field dp The default probability is equal to 1/(N+V), where N is the + #' number of words in the sentence, V is the number of words in the + #' vocabulary. + dp = NULL, + + #' @field model The maximum number of ngrams supported by the model. + model = 4, + + #' @field dc_opts The options for the data cleaner object. + #' min_words -> The minimum number of words per sentence. + #' line_count -> The number of lines to read and clean at a time. + #' sw_file -> The stop words file path. + # dict_file -> The dictionary file path. + #' bad_file -> The bad words file path. + #' to_lower -> If the words should be converted to lower case. + #' remove_stop -> If stop words should be removed. + #' remove_punct -> If punctuation symbols should be removed. + #' remove_non_dict -> If non dictionary words should be removed. 
+        #'      remove_non_alpha -> If non alphabet symbols should be removed.
+        #'      remove_extra_space -> If leading, trailing and double spaces
+        #'        should be removed.
+        #'      remove_bad -> If bad words should be removed.
+        dc_opts = list(
+            "min_words" = 2,
+            "line_count" = 1000,
+            "sw_file" = "./data/stop-words.txt",
+            "dict_file" = "./data/dict-no-bad.txt",
+            "bad_file" = "./data/bad-words.txt",
+            "to_lower" = T,
+            "remove_stop" = F,
+            "remove_punct" = T,
+            "remove_non_dict" = T,
+            "remove_non_alpha" = T,
+            "remove_extra_space" = T,
+            "remove_bad" = F
+        ),
+
+        #' @field tg_opts The options for the token generator object.
+        #'      n -> The ngram size.
+        #'      save_ngrams -> If the ngram data should be saved.
+        #'      min_freq -> All ngrams with frequency less than min_freq are
+        #'        ignored.
+        #'      line_count -> The number of lines to process at a time.
+        #'      stem_words -> If words should be converted to their stem.
+        #'      dir -> The directory where the output file should be saved.
+        #'      format -> The format for the output. There are two options.
+        #'        'plain' -> The data is stored in plain text.
+        #'        'obj' -> The data is stored as an R object.
+        tg_opts = list(
+            "n" = 1,
+            "save_ngrams" = T,
+            "min_freq" = -1,
+            "line_count" = 5000,
+            "stem_words" = F,
+            "dir" = "./data/models",
+            "format" = "obj"
+        ),
+
+        #' @field ssize The sample size in Mb.
+        ssize = 30,
+
+        #' @field ddir The folder containing the data files.
+        ddir = "./data",
+
+        #' @field mdir The folder containing the model files.
+        mdir = "./data/models",
+
+        #' @description
+        #' It initializes the current object. It is used to set the
+        #' maximum ngram number, sample size, input file name, data cleaner
+        #' options and verbose option.
+        #' @param model The maximum ngram number supported by the model.
+        #' @param ssize The sample size in Mb.
+        #' @param ddir The data directory.
+        #' @param mdir The model directory.
+        #' @param dc_opts The data cleaner options.
+        #' @param tg_opts The token generator options.
+        #' @param verbose If progress information should be displayed.
+        initialize = function(model = 4,
+                              ssize = 30,
+                              ddir = "./data",
+                              mdir = "./data/models",
+                              dc_opts = list(),
+                              tg_opts = list(),
+                              verbose = 0) {
+
+            # The base class is initialized
+            super$initialize(NULL, NULL, verbose)
+
+            # The model number is set
+            self$model <- model
+            # The sample size is set
+            self$ssize <- ssize
+            # The data directory name is set
+            self$ddir <- ddir
+            # The model directory name is set
+            self$mdir <- mdir
+            # If the dc_opts are given
+            if (length(dc_opts) > 0) {
+                # The custom dc_opts are merged with the default dc_opts
+                self$dc_opts <- modifyList(self$dc_opts, dc_opts)
+            }
+            # If the tg_opts are given
+            if (length(tg_opts) > 0) {
+                # The custom tg_opts are merged with the default tg_opts
+                self$tg_opts <- modifyList(self$tg_opts, tg_opts)
+            }
+            # The transition probabilities data is initialized
+            self$tp <- NULL
+        },
+
+        #' @description
+        #' It compares the performance of the specified models by
+        #' plotting the performance statistics. The models are specified with
+        #' the type parameter.
+        #' @param type The models to compare. It can be:
+        #'      'basic' -> One model for each ngram size.
+        #'      'grouped' -> One model for each group and ngram size. For
+        #'        example, for a data size of 5 Mb, there are 3 models, for
+        #'        the ngram sizes 2:4.
+        #' @param opts The options for plotting the data.
+        #'      'group' -> The field to group by.
+        #'      'title' -> The main plot title.
+        #'      'subtitle' -> The plot sub title.
+        #' @return The performance stats.
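+        #' An illustrative sketch; the group column name and the titles are
+        #' assumptions.
+        #' @examples
+        #' \dontrun{
+        #' me <- ModelEvaluator$new(mdir = "./data/models")
+        #' pstats <- me$performance_comparision("grouped", list(
+        #'     "group" = "size",
+        #'     "title" = "Model performance",
+        #'     "subtitle" = "Grouped by data size"))
+        #' }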
+ performance_comparision = function(type, opts) { + # If the type is 'basic' + if (type == 'basic') { + # The pstats file name + fn <- paste0(self$mdir, "/pstats.RDS") + # The stats are read + pstats <- self$read_obj(fn) + # The config file name + fn <- paste0(self$mdir, "/config.RDS") + # The config file is read + config <- self$read_obj(fn) + # The y-axis values + y <- list("m" = NULL, "t" = NULL, "p" = NULL, "a" = NULL) + # For each model the performance metrics are measured + for (i in 1:(config$model-1)) { + # The memory value + m <- self$format_size(pstats[[i]][["memory"]]) + # The y-axis values are updated + y[["m"]] <- c(y[["m"]], m) + y[["t"]] <- c(y[["t"]], pstats[[i]][["time"]]) + y[["p"]] <- c(y[["p"]], + pstats[[i]][["intrinsic"]][["mean"]]) + y[["a"]] <- c(y[["a"]], + pstats[[i]][["extrinsic"]][["valid_perc"]]) + } + # The data frame containing the data to be plotted + df <- data.frame( + "n" = 2:(length(pstats) + 1), + "m" = y[["m"]], + "t" = y[["t"]], + "p" = y[["p"]], + "a" = y[["a"]] + ) + # The options for plotting + opts <- list( + "type" = "basic", + "title" = "Variation of performance with ngram size", + "subtitle" = opts[['subtitle']]) + } + # If the type is 'grouped' + else if (type == 'grouped') { + # The average performance stats for each data size + y <- list("m" = NULL, "t" = NULL, "p" = NULL, "a" = NULL) + # The different items in the group. Each folder is a group item + f <- list.dirs(self$mdir, recursive = F, full.names = F) + # The output data frame + df <- data.frame() + # For each folder, the performance stats for all ngram models is + # calculated. + for (i in f) { + # The performance stats file name + fn <- paste0(self$mdir, "/", i, "/pstats.RDS") + # The stats are read + pstats <- self$read_obj(fn) + # The config file name + fn <- paste0(self$mdir, "/", i, "/config.RDS") + # The stats are read + config <- self$read_obj(fn) + # The memory usage for each ngram + m <- private$process_stats(pstats, "memory", config$model) + # The time taken for each ngram + t <- private$process_stats(pstats, "time", config$model) + # The perplexity for each ngram + p <- private$process_stats(pstats, "perplexity", config$model) + # The accuracy score for each ngram + a <- private$process_stats(pstats, "accuracy", config$model) + # The temprary data frame for the current data size + tdf <- data.frame( + rep(i, (config$model-1)), + m, t, p, a, 2:(length(pstats) + 1)) + # The column names are set + names(tdf) <- c(opts[["group"]], "m", "t", "p", "a", "n") + # The temprary data frame is appended to the output data + # frame + df <- rbind(df, tdf) + } + # The group name + g <- opts[["group"]] + # The options for plotting + opts <- list( + "type" = "grouped", + "group" = g, + "title" = opts[['title']], + "subtitle" = opts[['subtitle']]) + # If the group column is numeric + if (is.numeric(as.numeric(as.character(df[[g]])))) { + # The group column is converted to numeric + df[[g]] <- as.numeric(as.character(df[[g]])) + # The data is ordered by group + df <- df[order(df[[opts[["group"]]]]), ] + # The group column is converted to factor + df[[g]] <- as.factor(df[[g]]) + } + } + # The performance stats are plotted + private$plot_stats(df, opts) + # The performance stats are returned + return(df) + }, + + #' @description + #' For each model it performs intrinsic and extrinsic + #' evaluation. It also measures the memory usage and time taken. The + #' performance metrics are displayed in 5 plots on one page. Performance + #' statistics are saved to the file pstats.RDS. 
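+        #' A minimal sketch, assuming a model has already been generated
+        #' under the model directory:
+        #' @examples
+        #' \dontrun{
+        #' me <- ModelEvaluator$new(model = 4, mdir = "./data/models")
+        #' me$performance_evaluation()
+        #' }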
+ performance_evaluation = function() { + # The performance stats + pstats <- list() + # The maximum model number + mmax <- self$model + # The y-axis values + y <- list("m" = NULL, "t" = NULL, "p" = NULL, "a" = NULL) + # For each model the performance metrics are measured + for (m in 2:mmax) { + # The transition probabilities data is initialized + self$tp <- NULL + # The model number is set + self$model <- m + # The intrinsic and extrinsic evaluation is performed + time_taken <- system.time({ + memory_used <- mem_change({ + # intrinsic evaluation is performed + istats <- self$intrinsic_evaluation() + # Extrinsic evaluation is performed + estats <- self$extrinsic_evaluation() + }) + }) + # The memory used + memory_used <- object_size(self$tp) + # The performance stats are updated + pstats[[m-1]] <- list( + "intrinsic" = istats, + "extrinsic" = estats, + "memory" = memory_used, + "time" = time_taken[[3]] + ) + # The y-axis values are updated + y[["m"]] <- c(y[["m"]], self$format_size(memory_used)) + y[["t"]] <- c(y[["t"]], time_taken[[3]]) + y[["p"]] <- c(y[["p"]], istats$mean) + y[["a"]] <- c(y[["a"]], estats$valid_perc) + } + + # The information message is shown + self$display_msg("Saving model performance stats...", 1) + # The performance stats file name + fn <- paste0(self$mdir, "/pstats.RDS") + # The performance stats are saved + self$save_obj(pstats, fn) + # The data to be plotted + df <- data.frame( + "n" = 2:(length(pstats)+1), + "m" = y[["m"]], + "t" = y[["t"]], + "p" = y[["p"]], + "a" = y[["a"]] + ) + # The options for plotting + opts <- list("type" = "basic") + # The performance stats are plotted + private$plot_stats(df, opts) + }, + + #' @description + #' It loads the given model configuration file. + #' @param fn The config file name. + load_config = function(fn) { + # The model configuration is read + config <- self$read_obj(fn) + # The configuration is copied to the object attributes + self$model <- config$model + self$ssize <- config$ssize + self$dc_opts <- config$dc_opts + self$ddir <- config$ddir + self$mdir <- config$mdir + }, + + #' @description + #' It loads the model located at the mdir location + load_model = function() { + # The configuration file name + fn <- paste0(self$mdir, "/config.RDS") + # The model configuration is loaded to the current object + self$load_config(fn) + # The tp data for the specified ngrams is loaded + private$read_tp_data(self$model) + }, + + #' @description + #' It saves the model configuration to the models + #' subdirectory of the given directory. + save_config = function() { + # The information message is shown + self$display_msg("Saving model configuration...", 1) + # The model configuration + config <- list( + "model" = self$model, + "ssize" = self$ssize, + "ddir" = self$ddir, + "mdir" = self$mdir, + "dc_opts" = self$dc_opts, + "tg_opts" = self$tg_opts + ) + # The configuration file + fn <- paste0(self$mdir, "/config.RDS") + # The configuration is saved + self$save_obj(config, fn) + }, + + #' @description + #' It generates the model for the given ngram number, + #' data sample size, data cleaning options and input file. 
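+        #' A hedged end-to-end sketch; the directories and the sample size
+        #' are illustrative assumptions.
+        #' @examples
+        #' \dontrun{
+        #' me <- ModelEvaluator$new(model = 4, ssize = 30,
+        #'     ddir = "./data", mdir = "./data/models", verbose = 1)
+        #' me$generate_model()
+        #' }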
+ generate_model = function() { + # The data analyzer object is created + da <- DataAnalyzer$new( + paste0(self$ddir, "/train.txt"), self$verbose) + # The training, testing and validation data sets are generated + da$generate_data(self$ddir, + list(train = .8, test = .1, validate = .1)) + # The object size is formatted + obj_size <- file.size(paste0(self$ddir, "/train.txt"))/10^6 + # The proportion of data to sample + prop <- (self$ssize/obj_size) + # Random sample is taken and cleaned + self$generate_clean_sample(prop, T, 'tr') + # The model directory is set + self$tg_opts$dir <- self$mdir + # For each ngram number, the ngram token file is generated + for (i in 1:self$model) { + # The clean train data file name + fn <- paste0(self$mdir, "/train-clean.txt") + # The ngram number is set + self$tg_opts$n <- i + # The TokenGenerator object is created + tg <- TokenGenerator$new(fn, self$tg_opts, self$verbose) + # The ngram tokens are generated + tg$generate_tokens() + } + # For each ngram number, the transition probabilities are generated + for (i in 2:self$model) { + # The TPGenerator object is created + tp <- TPGenerator$new() + tp_opts <- list( + "n_range" = 1:i, + "save_tp" = T, + "format" = "obj", + "dir" = self$mdir + ) + # The transition probabilities are generated + tp$generate_tp(tp_opts) + } + # The model config is saved + self$save_config() + }, + + #' @description + #' Evaluates the model using intrinsic evaluation based on + #' Perplexity. First a validation data set containing 1000 lines is + #' generated. It is then cleaned. 20 random sentences are taken. For + #' each sentence, Perplexity of all the words is calculated. + #' @return The minumum, maximum and mean Perplexity score. + intrinsic_evaluation = function() { + # The transition probabilities data is read + private$read_tp_data(self$model) + # Random sample is taken and cleaned + self$generate_clean_sample(1000, T, 'va') + # The validation sample data is read + data <- self$read_file( + paste0(self$mdir,"/validate-clean.txt"), F) + # A sample of 20 sentences is taken + data <- data[1:20] + # The information message + msg <- paste0( + "Calculating Perplexity for ", length(data), " sentences") + # The information message is shown + self$display_msg(msg, 1) + # The list of perplexities + pl <- c(); + # The loop counter + c <- 1 + # The Perplexity of each sentence in the test data is calculated + for (line in data) { + # The line is split on space + words <- str_split(line, " ")[[1]] + # The perplexity for the line is calculated + p <- self$calc_perplexity(words) + # The information message + msg <- paste0( + "Perplexity of the sentence '", + line, "' is: ", p) + # The information message is shown + self$display_msg(msg, 2) + # The list of perplexities is updated + pl <- c(pl, p); + # If the counter is divisible by 10 + if (c %% 10 == 0) { + # The information message + msg <- paste0(c, " lines have been processed") + # The information message is shown + self$display_msg(msg, 1) + } + # The counter is increased by 1 + c <- c + 1 + } + # The perplexity stats + stats <- list( + "min" = min(pl), + "max" = max(pl), + "mean" = mean(pl)); + + return(stats); + }, + + #' @description + #' Evaluates the model using extrinsic evaluation based on + #' Accuracy. First a validation data set containing 1000 lines is + #' generated. It is then cleaned. 20 random sentences are taken. For + #' each sentence, the model is used to predict the next word. The + #' accuracy stats are returned. 
A prediction is considered to be correct + #' if one of the predicted words matched the actual word. + #' @return The number of correct and incorrect predictions. + extrinsic_evaluation = function() { + # The transition probabilities data is read + private$read_tp_data(self$model) + # Random sample is taken and cleaned + self$generate_clean_sample(1000, T, 'va') + # The validation sample data is read + data <- self$read_file( + paste0(self$mdir,"/validate-clean.txt"), F) + # A Random sample of 100 sentences is taken + data <- data[1:100] + # The information message + msg <- paste0( + "Predicting the next word for ", length(data), " sentences") + # The information message is shown + self$display_msg(msg, 1) + # The statistics + stats <- list("valid" = 0, "invalid" = 0) + # The loop counter + c <- 1 + # The last word for each sentence is predicted + for (line in data) { + # The line is split on space + words <- str_split(line, " ")[[1]] + # The word to predict + w <- words[length(words)] + # The previous words used to predict the word + pw <- words[1:length(words)-1] + # If the words should be stemmed + if (self$tg_opts[["stem_words"]]) { + # The previous words are stemmed + pw <- wordStem(pw) + } + # The next word is predicted + res <- self$predict_word(pw, F) + # If the predicted word matches the actual word + if (w %in% res["words"]) { + stats[["valid"]] <- stats[["valid"]] + 1; + # The information message + self$display_msg( + paste0("The word: ", w, " was predicted"), 3) + } + # If the predicted word does not match + else { + stats[["invalid"]] <- stats[["invalid"]] + 1; + # The information message + self$display_msg( + paste0("The word: ", w, " could not be predicted"), 3) + } + # The counter is increased by 1 + c <- c + 1 + # If the counter is divisible by 10 + if (c %% 10 == 0) { + # The information message + msg <- paste0(c, " sentences have been processed") + # The information message is shown + self$display_msg(msg, 1) + } + } + + # The precentage of valid + stats[["valid_perc"]] <- + (stats[["valid"]]/(stats[["valid"]] + stats[["invalid"]]))*100 + # The precentage of invalid + stats[["invalid_perc"]] <- 100-stats[["valid_perc"]] + + return(stats) + }, + + #' @description + #' Predicts the new word given a list of 1, 2 or 3 previous + #' words. It checks the given n words in the transition probabilities + #' data. If there is a match, the top 3 next words with highest + #' probabilities are returned. If there is no match, then the last n-1 + #' previous words are checked. This process is continued until the last + #' word is checked. If there is no match, then empty result is returned. + #' @param words A character vector of previous words or a single vector + #' containing the previous word text. + #' @param count The number of results to return. + #' @param dc A DataCleaner object. If it is given, then the given words + #' are cleaned. If the stem_words option was set in the TokenGenerator + #' object configuration for the current model, then the words are + #' converted to their stems. + #' @return The top 3 predicted words along with their probabilities. 
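+        #' An illustrative sketch; the previous-word text is an assumption.
+        #' @examples
+        #' \dontrun{
+        #' me <- ModelEvaluator$new(mdir = "./data/models")
+        #' dc <- DataCleaner$new()
+        #' res <- me$predict_word("how are", count = 3, dc = dc)
+        #' # The predicted words are printed if a match was found
+        #' if (res[["found"]]) print(res[["words"]])
+        #' }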
+ predict_word = function(words, count = 3, dc = NULL) { + # The transition probabilities data is read + private$read_tp_data(self$model) + # The words are assigned to temp variable + w <- words + # If the DataCleaner obj was specified + if (!is.null(dc)) { + # If the words is a set of vectors + if (length(w) > 1) { + # The words are converted to a single line of text + w <- paste0(w, collapse = " ") + } + # The words are cleaned + w <- dc$clean_lines(w) + # If the words should be stemmed + if (self$tg_opts[["stem_words"]]) { + # The previous words are stemmed + w <- wordStem(w) + } + } + + # If the words are in the form of a line + if (length(w) == 1) { + # The words are split on space + w <- strsplit(w, " ")[[1]] + } + + # The length of previous words + pwl <- length(w) + # The loop counter + c <- 1 + # Indicates if the word was found + found <- FALSE + # The result + result <- list("found" = F, "words" = "", "probs" = "") + # If the previous words length is 0 + if (pwl == 0) + return(result) + # The last 3 words are extracted. + # If the previous word length is more than 3 + if (pwl > 3) { + pw <- w[(pwl-2):pwl] + } + else { + pw <- w + } + # The length of previous words + pwl <- length(pw) + # Each ngram in the previous word list is checked starting from + # largest ngram + for (i in pwl:1) { + # The previous words to check + tpw <- pw[c:pwl] + # The key to use for the transition probabilities data + k <- paste(tpw, collapse = "_") + # The key is converted to a numeric hash + h <- digest2int(k) + # The transition probabilities data is checked + res <- self$tp[self$tp$pre == h, ] + # If the prefix was found + if (nrow(res) > 0) { + # The word was found + found <- TRUE + # The result is sorted by probability + sres <- res[order(res$prob, decreasing = T),] + # The number of rows in the result set + rcount <- nrow(sres) + # If the number of results is more than the required number + # of results + if (rcount > count) { + # The result count is set to the required number of + # results + rc <- count + } + else { + # The result count is set to the number of results + rc <- rcount + } + # The required word probabilities + probs <- sres$prob[1:rc] + # The next words indexes + ind <- sres$nw[1:rc] + # The required words + nw <- as.character(self$wl$pre[ind]) + + # The result is updated + result[["words"]] <- nw + result[["probs"]] <- probs + result[["found"]] <- T + # The information message + msg <- paste0("The ngram key: ", k, " was found") + # Information message is shown + self$display_msg(msg, 3) + # The loop ends + break; + } + else { + # The information message + msg <- paste0("The ngram key: ", k, " was not found") + # Information message is shown + self$display_msg(msg, 3) + # The result is updated + result[["found"]] <- F + } + # The information message + msg <- paste0("Backing off to ", (i) ,"-gram") + # Information message is shown + self$display_msg(msg, 3); + # The counter is increased by 1 + c <- c + 1; + } + + return(result); + }, + + #' @description + #' Calculates the probability of the given word given the + #' previous model-1 words, where model is the maximum ngram number. It + #' looks up the probability of a word given n previous words. The + #' previous n words are converted to numeric hash using digest2int + #' function. The hash is looked up in a data frame of transition + #' probabilities. The word is converted to a number by checking its + #' position in a list of unique words. 
If the hash and the word position + #' were found, then the probability of the previous word and hash is + #' returned. If it was not found, then the hash of the n-1 previous + #' words is taken and the processed is repeated. If the data was not + #' found in the data frame, then the word probability is returned. This + #' is known as backoff. If the word probability could not be found then + #' the default probability is returned. The default probability is + #' calculated as 1/(N+V), Where N = number of words in corpus and V is + #' the number of dictionary words. + #' @param word The word whoose probability is to be calculated. + #' @param pw The previous words. + #' @return The probability of the word given the previous words. + get_word_prob = function(word, pw) { + # If the default probability is not set, then an error is raised + if (is.null(self$dp)) + stop("The default probability is not set !") + # The length of previous words + pwl <- length(pw) + # The probability of the word given the previous words. It is + # initialized to the default probability, which should be 1/(N+V) + prob <- self$dp + # The loop counter + c <- 1 + # Indicates if the word was found + found <- FALSE + # The next word id + nw <- match(word, self$wl$pre) + # If the next word was not found + if (is.na(nw)) { + # The information message + msg <- paste0( + "The next word: ", word, " was not found") + # Information message is shown + self$display_msg(msg, 3) + } + # If the previous word count is more than 0 + else if (pwl > 0) { + # The previous words are checked + for (i in pwl:1) { + # The previous words to check + tpw <- pw[c:pwl] + # The key to use for the transition matrix + k <- paste(tpw, collapse = "_") + # The key is converted to a numeric hash + h <- digest2int(k) + # The transition probabilities data is checked + res <- self$tp[self$tp$pre == h & self$tp$nw == nw, ] + # If the prefix was found + if (nrow(res) > 0) { + # The word was found + found <- TRUE + # The probability is set + prob <- as.numeric(res$prob) + # The information message + msg <- paste0("The ngram key: ", + k, " and the next word: ", + word, " were found") + # Information message is shown + self$display_msg(msg, 3) + # The loop ends + break + } + else { + # The information message + msg <- paste0("The ngram key: ", + k, " and the next word: ", + word, " were not found") + # Information message is shown + self$display_msg(msg, 3) + } + # The information message + msg <- paste0("Backing off to ", (i) ,"-gram") + # Information message is shown + self$display_msg(msg, 3) + # The counter is increased by 1 + c <- c + 1 + } + } + # If the word was not found then the probability of the word is + # checked in the n1-gram + if (!found) { + # If the word was not found + if (sum(self$wl$pre == word) == 0) { + # Information message is shown + self$display_msg("Using default probability", 3) + } + else { + # The word probability + prob <- as.numeric(self$wl[self$wl$pre == word, "prob"]) + } + } + + return(prob) + }, + + #' @description + #' The Perplexity for the given sentence is calculated. For + #' each word, the probability of the word given the previous words is + #' calculated. The probabilities are multiplied and then inverted. The + #' nth root of the result is the perplexity, where n is the number of + #' words in the sentence. If the stem_words tokenization option was + #' specified, then the previous words are converted to their stem. + #' @param words The list of words. + #' @return The perplexity of the given list of words. 
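+        #' A small sketch, assuming the model files exist and have been
+        #' loaded:
+        #' @examples
+        #' \dontrun{
+        #' me <- ModelEvaluator$new(mdir = "./data/models")
+        #' me$load_model()
+        #' p <- me$calc_perplexity(c("how", "are", "you"))
+        #' }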
+ calc_perplexity = function(words) { + # The number of words in the sentence + wl <- length(words) + # The product of the word probabilities + prob_prod <- 1 + # For each word, the probability of the word is calculated + for (i in 1:wl) { + # The word + word <- words[i] + # The list of previous words + pw <- NULL + # If i is more than 1 + if (i > 1) { + # The start index + start <- 1 + # If i > self$model + if (i > self$model) start <- i-(self$model-1) + # The list of previous words + pw <- words[start:(i-1)] + # If the words should be stemmed + if (self$tg_opts[["stem_words"]]) { + # The previous words are stemmed + pw <- wordStem(pw) + } + } + # The word probability + prob <- self$get_word_prob(word, pw) + # The probability product is updated + prob_prod <- prob_prod * prob + } + # The nth root of the inverse of the probability product is taken + p <- round((1/prob_prod)^(1/wl), 0) + + return(p) + }, + + #' @description + #' Generates a sample file of given size from the main + #' train.txt file file name. The file is cleaned and saved. + #' @param ss The number of lines or proportion of lines to sample. + #' @param ic If the sample file should be cleaned. + #' @param t The type of sample. It can be: + #' 'tr' -> training + #' 'te' -> testing + #' 'va' -> validation + generate_clean_sample = function(ss, ic, t) { + # If the type is 'tr' + if (t == 'tr') sfn <- 'train' + # If the type is 'te' + else if (t == 'te') sfn <- 'test' + # If the type is 'va' + else if (t == 'va') sfn <- 'validate' + # The sample file name + fn <- paste0(self$ddir, "/", sfn, ".txt") + # The sample file name + sf <- paste0(self$mdir, "/", sfn, ".txt") + # The clean sample file name + csf <- paste0(self$mdir, "/", sfn, "-clean.txt") + # If the cleaned sample file already exists + if (file.exists(csf)) { + # The information message + msg <- paste0("The cleaned sample file: ", csf, + " already exists") + # Information message is shown + self$display_msg(msg, 2) + } + else { + # If the sample file does not exist + if (!file.exists(sf)) { + # The information message + msg <- paste0("Generating sample file from the file: ", fn) + # Information message is shown + self$display_msg(msg, 2) + # The input file is read + data <- self$read_file(fn, F) + # If the sample size is less than 1 + if (ss < 1) { + # The number of lines in the main file + lc <- length(data) + # The number of lines in the sample file + lc <- round(lc*ss) + } + else { + lc <- ss + } + # The sample file data + data <- data[1:lc] + # The sample file data is saved + self$write_file(data, sf, F) + } + # If the sample file should be cleaned + if (ic) { + # The options for cleaning the data + opts <- self$dc_opts + # The line count is set to 5000 + opts[["line_count"]] <- 5000 + # The output file name + opts[["output_file"]] <- csf + # The data cleaner object is created + dc <- DataCleaner$new(sf, opts, verbose = self$verbose) + # The sample file is cleaned + dc$clean_file() + } + } + } + ), + + private = list( + + # @description It creates a single plot based on ggplot2. Depending on + # the opt parameter, the plot may contain groups or it may be a simple + # plot. + # @param data A data frame containing the data to be plotted. It should + # have 2 variables, x and y. + # @param opts The options for plotting the data. It contains: + # 'x_lab' -> The x-axis label. + # 'y_lab' -> The y-axis label. + # 'type' -> The type of plot. It can be 'basic' or 'grouped'. + # 'group' -> The field to group by. + # @return A ggplot object representing the plot. 
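+        # For illustration, the expected input is a data frame with numeric
+        # x and y columns, e.g. (assumed values):
+        #   df <- data.frame(x = 2:4, y = c(10, 20, 30))
+        #   p <- private$plot_graph(df, list("x_lab" = "ngram",
+        #       "y_lab" = "time", "type" = "basic"))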
+ plot_graph = function(data, opts) { + # y-max + y_max <- max(as.numeric(as.character(data$y))) + # If the type is basic + if (opts[['type']] == 'basic') { + # The graph is plotted + p <- ggplot(data, aes(x, y)) + geom_point() + + geom_smooth(method='lm', formula= y~x) + + labs(x = opts[["x_lab"]], y = opts[["y_lab"]]) + + xlim(min(data$x), max(data$x)) + + coord_cartesian(ylim = c(0,y_max)) + } + # If the type is grouped + else if (opts[['type']] == 'grouped') { + # The data is duplicated to prevent the warnings + data <- rbind(data, data) + # The group field data + g <- opts$group + # The graph is plotted + p <- ggplot( + data, + aes_string("x", "y", group = g, col = g)) + + geom_point() + + geom_smooth(method = "loess", formula = y ~ x, span = 1.4, se = FALSE) + + labs(x = opts[["x_lab"]], y = opts[["y_lab"]]) + + xlim(min(data$x), max(data$x)) + + coord_cartesian(ylim = c(0,y_max)) + } + return(p) + }, + + # @description + # It plots the given model performance stats. The stats + # are plotted on one page. 5 stats are plotted. The first 4 plots are + # of memory, time taken, mean Perplexity and accuracy. The fifth plot + # is of Perplexity vs accuracy. + # @param data The data to plot. + # @param opts The options for plotting the data. It contains: + # 'type' -> The type of plot. It can be 'basic' or 'grouped'. + # 'group' -> The field to group by. + # 'title' -> The main plot title. + # 'subtitle' -> The plot sub title. + plot_stats = function(data, opts) { + # The information message is shown + self$display_msg("Plotting model performance stats...", 1) + # If the type is basic + if (opts[["type"]] == 'basic') { + # The data frames + df1 <- data.frame(x = data$n, y = data$m) + df2 <- data.frame(x = data$n, y = data$t) + df3 <- data.frame(x = data$n, y = data$p) + df4 <- data.frame(x = data$n, y = data$a) + df5 <- data.frame(x = data$a, y = data$p) + # The options for plot 1 + popts <- list("x_lab" = "ngram", + "y_lab" = "memory", + "type" = opts[["type"]]) + # Plot 1 + p1 <- private$plot_graph(df1, popts) + # The options for plot 2 + popts <- list("x_lab" = "ngram", + "y_lab" = "time", + "type" = opts[["type"]]) + # Plot 2 + p2 <- private$plot_graph(df2, popts) + # The options for plot 3 + popts <- list("x_lab" = "ngram", + "y_lab" = "perplexity", + "type" = opts[["type"]]) + # Plot 3 + p3 <- private$plot_graph(df3, popts) + # The options for plot 4 + popts <- list("x_lab" = "ngram", + "y_lab" = "accuracy", + "type" = opts[["type"]]) + # Plot 4 + p4 <- private$plot_graph(df4, popts) + # The options for plot 5 + popts <- list("x_lab" = "accuracy", + "y_lab" = "perplexity", + "type" = "basic") + # Plot 5 + p5 <- private$plot_graph(df5, popts) + # The plots are displayed on a single page + patchwork <- p1 + p2 + p3 + p4 + p5 + # Main title is added + print(patchwork + plot_annotation( + "title" = opts[["title"]], + "subtitle" = opts[["subtitle"]])) + } + # If the type is grouped + else if (opts[["type"]] == 'grouped') { + # The group name + g <- opts[["group"]] + # The data frames + df1 <- data.frame(data$n, data$m, data[[g]]) + df2 <- data.frame(data$n, data$t, data[[g]]) + df3 <- data.frame(data$n, data$p, data[[g]]) + df4 <- data.frame(data$n, data$a, data[[g]]) + # The column names + n <- c("x", "y", g) + # The column names are set + names(df1) <- n + names(df2) <- n + names(df3) <- n + names(df4) <- n + # The options for plot 1 + popts <- list("x_lab" = "ngram", + "y_lab" = "memory", + "group" = opts[["group"]], + "type" = opts[["type"]]) + # Plot 1 + p1 <- private$plot_graph(df1, 
popts) + # The options for plot 2 + popts <- list("x_lab" = "ngram", + "y_lab" = "time", + "group" = opts[["group"]], + "type" = opts[["type"]]) + # Plot 2 + p2 <- private$plot_graph(df2, popts) + # The options for plot 3 + popts <- list("x_lab" = "ngram", + "y_lab" = "perplexity", + "group" = opts[["group"]], + "type" = opts[["type"]]) + # Plot 3 + p3 <- private$plot_graph(df3, popts) + # The options for plot 4 + popts <- list("x_lab" = "ngram", + "y_lab" = "accuracy", + "group" = opts[["group"]], + "type" = opts[["type"]]) + # Plot 4 + p4 <- private$plot_graph(df4, popts) + # The plots are displayed on a single page + patchwork <- p1 + p2 + p3 + p4 + # Main title is added + # Main title is added + print(patchwork + plot_annotation( + "title" = opts[["title"]], + "subtitle" = opts[["subtitle"]])) + } + }, + + # @description + # Reads the model file and sets the current objects attributes. + # @param model The model number. It is the maximum ngrams supported by + # the model. + read_tp_data = function(model) { + # If the model has already been loaded then function returns + if (!is.null(self$tp)) return() + # The information message + msg <- paste0("Loading model ", model, "...") + # The information message is shown + self$display_msg(msg, 1) + # The model config file name + fn <- paste0(self$mdir, "/config.RDS") + # The config file is loaded + self$load_config(fn) + # The model file name + fn <- paste0(self$mdir, "/model-", model, ".RDS") + # The model is read to a data frame + self$tp <- self$read_obj(fn) + # The words file name + fn <- paste0(self$mdir, "/words.RDS") + # The list of words is read + self$wl <- self$read_obj(fn) + # The dictionary file name + fn <- self$dc_opts[["dict_file"]] + # The file contents + dict <- self$read_file(fn, F) + # The information message is shown + self$display_msg("Calculating default probability...", 1) + # The number of words in the dictionary file. It is used to + # calculate Perplexity. + vc <- length(dict) + # The cleaned training file is read + fn <- paste0(self$mdir, "/train-clean.txt") + # The model is read to a data frame + data <- self$read_file(fn, F) + # The words are split on " " + w <- strsplit(data, " ") + # The words are converted to atomic list + w <- unlist(w) + # The number of words + n <- length(w) + # The default probability is set + self$dp <- 1/(n + vc) + }, + + # @description + # It returns the average value for the required stats, for + # the given performance stats. + # @param pstats The performance stats. + # @param type The type of stats. It can be: + # 'memory' -> The mean memory used. + # 'time_taken' -> The mean time taken by the intrinsic and + # extrinsic evaluation tests. + # 'perplexity' -> The mean Perplexity score. + # 'accuracy' -> The mean accuracy score. + # @param nmax -> The maximum ngram size. 
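+        # For illustration (assumed inputs): with pstats as saved by
+        # performance_evaluation and config as saved by save_config, the
+        # mean time taken per ngram model can be extracted as:
+        #   t <- private$process_stats(pstats, "time", config$model)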
+ process_stats = function(pstats, type, nmax) { + # The stats + s <- NULL + # The average memory for each ngram model + for (n in 1:(nmax-1)) { + # If the stats is memory + if (type == "memory") { + # The stats value for the current ngram number + v <- pstats[[n]][["memory"]] + # The stats are formated + v <- self$format_size(v) + } + # If the stats is perplexity + else if (type == "perplexity") { + # The stats value for the current ngram number + v <- pstats[[n]][["intrinsic"]][["mean"]] + } + # If the stats is accuracy + else if (type == "accuracy") { + # The stats value for the current ngram number + v <- pstats[[n]][["extrinsic"]][["valid_perc"]] + } + # If the stats is time + else if (type == "time") { + # The stats value for the current ngram number + v <- pstats[[n]][["time"]] + } + # The stats are updated + s <- c(s, v) + } + + return(s) + } + ) +) diff --git a/R/text-file-processor.R b/R/text-file-processor.R new file mode 100644 index 0000000..e47e187 --- /dev/null +++ b/R/text-file-processor.R @@ -0,0 +1,339 @@ +#' Base class for text file processing +#' +#' @description +#' Provides basic structure for processing text files. +#' +#' @details +#' It provides pre-processing, processing and post-processing functions, which +#' need to be overridden by derived classes. The pre-processing function is +#' called before reading a file. The process function is called for processing a +#' line. The post processing function is called on the processed data. It also +#' provides a method for generating a sample file from an input text file +TextFileProcessor <- R6::R6Class( + "TextFileProcessor", + public = list( + #' @field opts The list of file processing options. + #' save_data -> If the combined processed lines should be saved. + #' output_file -> Name of the output file used to store the data. + opts = list( + "save_data" = F, + "output_file" = "./data/sample-clean.txt" + ), + + #' @field line_count The number of lines to read and process at a time. + line_count = 100, + + #' @field pp_output The output of the pre-processing step + pp_output = NULL, + + #' @field p_output The output of the processing step + p_output = NULL, + + #' @field file_name The name of the text file to process. + file_name = "./data/sample.txt", + + #' @field verbose Indicates if progress data should be printed. + verbose = 0, + + #' @description + #' It initializes the current object. It is used to set the file name + #' and verbose options. + #' @param file_name The path to the file to clean. + #' @param line_count The number of lines to read and clean at a time. + #' @param verbose Indicates if progress information should be displayed. + initialize = function(file_name = "./data/sample.txt", + line_count = 100, + verbose = 2) { + # If the given file name is not NULL and is not valid + if (!is.null(file_name) && !file.exists(file_name)) + stop("The given file name is not valid") + + # The base class attributes are set + # The file name is set + self$file_name <- file_name + # The verbose option is set + self$verbose <- verbose + # The line count is set + self$line_count <- line_count + # The processed output is set + self$p_output <- NULL + }, + + #' @description + #' Reads the given file one line at a time. It runs the given + #' pre-processing function before reading the file. It runs the given + #' line processing function for each line. It optionally saves the + #' output of line processing after reading the file or after processing + #' certain number of lines. 
+        #' @param pre_process The pre-processing function.
+        #' @param process The function used to process each line.
+        #' @param post_process The function used to perform post-processing.
+        #' @return The combined processed data.
+        process_file = function(pre_process, process, post_process) {
+            # Pre-processing is done
+            pre_process()
+            # The file is opened
+            private$con <- file(self$file_name)
+            # The connection is opened for reading
+            open(private$con)
+            # The lines to be read
+            lines <- c()
+            # The loop counter
+            c <- 0
+            # Indicates that data should not be appended
+            is_app <- F
+            # The output file name
+            of <- self$opts[["output_file"]]
+            # All lines are read
+            while (TRUE) {
+                # The lines are read
+                lines <- readLines(private$con, n = self$line_count,
+                                   skipNul = TRUE)
+                # If all the lines have been read
+                if (length(lines) == 0) break
+                # The lines are processed
+                p_lines <- process(lines)
+                # If the data should be saved
+                if (self$opts[["save_data"]] && !is.null(p_lines)) {
+                    # The cleaned data is written to file
+                    self$write_file(p_lines, of, is_app)
+                    # Debug message
+                    self$display_msg(
+                        paste(length(p_lines), "lines were written"), 1)
+                    # Indicates that data should be appended
+                    is_app <- T
+                }
+                # The loop counter is increased by 1
+                c <- c + 1
+                # Debug message
+                self$display_msg(
+                    paste(self$line_count*c, "lines have been processed"), 1)
+            }
+            # The file connection is closed
+            close(private$con)
+            # Post-processing is performed
+            post_process()
+        },
+
+        #' @description
+        #' Reads the given file and returns its contents.
+        #' @param file_name The name of the file to read.
+        #' @param is_csv If the data is a csv file.
+        #' @return The file data.
+        read_file = function(file_name, is_csv) {
+            # The information message
+            msg <- paste0("Reading file: ", file_name)
+            # Information message is shown
+            self$display_msg(msg, 1)
+            # If the file is not a csv file
+            if (!is_csv) {
+                # File is opened for reading
+                con <- file(file_name)
+                # The file contents are read
+                data <- readLines(con, skipNul = TRUE)
+                # The file connection is closed
+                close(con)
+            }
+            else {
+                data <- read.csv(file_name)
+            }
+            # The data is returned
+            return(data)
+        },
+
+        #' @description
+        #' Reads the given number of lines from the given file.
+        #' @param file_name The name of the file to read.
+        #' @param line_count The number of lines to read.
+        #' @return The file data.
+        read_lines = function(file_name, line_count) {
+            # The information message
+            msg <- paste0("Reading file: ", file_name)
+            # Information message is shown
+            self$display_msg(msg, 1)
+            # File is opened for reading
+            con <- file(file_name)
+            # The file contents are read
+            data <- readLines(con, n = line_count, skipNul = TRUE)
+            # The file connection is closed
+            close(con)
+            # The data is returned
+            return(data)
+        },
+
+        #' @description
+        #' Writes the given data to the given file. The data may be appended
+        #' to an existing file.
+        #' @param data The data to be written.
+        #' @param file_name The name of the file.
+        #' @param is_append Indicates if the data should be appended to the
+        #'   file.
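+        #' @examples
+        #' \dontrun{
+        #' # A minimal sketch (illustrative file names)
+        #' tfp <- TextFileProcessor$new("./data/sample.txt", 100, 0)
+        #' # A character vector is written to a file
+        #' tfp$write_file(c("first line"), "./data/out.txt", F)
+        #' # A second line is appended to the same file
+        #' tfp$write_file(c("second line"), "./data/out.txt", T)
+        #' }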
+        write_file = function(data, file_name, is_append) {
+            # The information message
+            msg <- paste0("Saving file: ", file_name)
+            # Information message is shown
+            self$display_msg(msg, 1)
+            # If the given data is a data frame
+            if ("data.frame" %in% class(data)) {
+                # The data frame is written to a file
+                write.csv(data, file_name, row.names = F)
+            }
+            else {
+                # The file open mode
+                mode <- "w"
+                # If the data should be appended
+                if (is_append) mode <- "a"
+                # The output file is opened for writing
+                con <- file(file_name, open = mode)
+                # The data is written to the output file
+                writeLines(data, con)
+                # The file connection is closed
+                close(con)
+            }
+        },
+
+        #' @description
+        #' Calculates the size of the given object or formats the given
+        #' bytes object as a number without units. The returned number is
+        #' the size in Mb.
+        #' @param obj An object of class bytes or an object whose size is to
+        #'   be found.
+        #' @return The size in Mb as a number.
+        format_size = function(obj) {
+            # If the obj is not of class bytes
+            if (!("bytes" %in% class(obj))) {
+                # The object size
+                obj_size <- object_size(obj)
+            }
+            else {
+                obj_size <- obj
+            }
+            # The bytes obj is formatted as a string
+            obj_size <- utils:::format.object_size(obj_size, units = "Mb")
+            # The Mb suffix is removed
+            obj_size <- sub(" Mb", "", obj_size)
+            # The object size is converted to a number
+            obj_size <- as.numeric(obj_size)
+
+            return(obj_size)
+        },
+
+        #' @description
+        #' Saves the given object as a file.
+        #' @param obj The object to save.
+        #' @param file_name The file name.
+        save_obj = function(obj, file_name) {
+            # The information message
+            msg <- paste0("Saving file: ", file_name)
+            # Information message is shown
+            self$display_msg(msg, 1)
+            # The object is saved to a file
+            saveRDS(obj, file_name)
+        },
+
+        #' @description
+        #' Reads the contents of the given file. Loads the file contents to
+        #' an R object.
+        #' @param file_name The file name.
+        #' @return The loaded R obj.
+        read_obj = function(file_name) {
+            # The information message
+            msg <- paste0("Reading file: ", file_name)
+            # Information message is shown
+            self$display_msg(msg, 1)
+            # If the file does not exist
+            if (!file.exists(file_name)) {
+                # The information message
+                msg <- paste0("The file: ", file_name, " cannot be read!")
+                # Program execution is stopped
+                stop(msg)
+            }
+            else {
+                # The object is read from the file
+                obj <- readRDS(file_name)
+            }
+
+            return(obj)
+        },
+
+        #' @description
+        #' Prints the given message depending on the verbose settings.
+        #' @param msg The message to be printed.
+        #' @param min_debug The minimum debugging level.
+        display_msg = function(msg, min_debug) {
+            # If verbose is >= min_debug, then the message is displayed
+            if (self$verbose >= min_debug) {
+                print(msg)
+            }
+        },
+
+        #' @description
+        #' Performs processing on the data. It should be overridden by a
+        #' derived class.
+        #' @param lines The lines to process.
+        process = function(lines) {
+
+        },
+
+        #' @description
+        #' Performs post-processing on the processed data. It should be
+        #' overridden by a derived class.
+        post_process = function() {
+
+        },
+
+        #' @description
+        #' Performs pre-processing before the file is read. It should be
+        #' overridden by a derived class.
+        pre_process = function() {
+            return(NULL)
+        }
+    ),
+    private = list(
+        # @field con The input file connection
+        con = NULL,
+
+        # @description
+        # Reads the contents of the given file. Loads the file contents to
+        # an R object, a data frame or a character vector.
+        # @param file_name The file name.
+        # @param format The file format. 'plain' or 'obj'.
+        # @param opts Options for reading the file.
+        # @return The required data.
+        read_data = function(file_name, format, opts) {
+            # If the format is plain
+            if (format == "plain") {
+                # The file is read
+                data <- self$read_file(file_name, opts)
+            }
+            # If the format is obj
+            else if (format == "obj") {
+                # The file is read
+                data <- self$read_obj(file_name)
+            }
+
+            return(data)
+        },
+
+        # @description
+        # Writes the given data to a file. The data may be an R object, a
+        # character vector or a data frame.
+        # @param data The data to be written.
+        # @param file_name The file name.
+        # @param format The file format. 'plain' or 'obj'.
+        # @param opts Options for writing to the file.
+        write_data = function(data, file_name, format, opts) {
+            # If the format is plain
+            if (format == "plain") {
+                # The data is written to a file
+                self$write_file(data, file_name, opts)
+            }
+            # If the format is obj
+            else if (format == "obj") {
+                # The R object is saved
+                self$save_obj(data, file_name)
+            }
+        }
+    )
+)
diff --git a/R/token-generator.R b/R/token-generator.R
new file mode 100644
index 0000000..6bd5a6a
--- /dev/null
+++ b/R/token-generator.R
@@ -0,0 +1,241 @@
+#' It generates ngrams of given size from an input text file
+#'
+#' @description
+#' It generates ngram tokens along with their frequencies.
+#'
+#' @details
+#' It provides a method for generating ngrams of a given size. It saves
+#' each ngram along with its frequency to a text file.
+TokenGenerator <- R6::R6Class(
+    "TokenGenerator",
+    inherit = TextFileProcessor,
+    public = list(
+        #' @field tg_opts The options for the token generator obj.
+        #'   n -> The ngram size.
+        #'   save_ngrams -> If the ngram data should be saved.
+        #'   min_freq -> All ngrams with frequency less than min_freq are
+        #'     ignored.
+        #'   line_count -> The number of lines to process at a time.
+        #'   stem_words -> If words should be converted to their stem.
+        #'   dir -> The dir where the output file should be saved.
+        #'   format -> The format for the output. There are two options.
+        #'     'plain' -> The data is stored in plain text.
+        #'     'obj' -> The data is stored as an R obj.
+        tg_opts = list(
+            "n" = 1,
+            "save_ngrams" = F,
+            "min_freq" = -1,
+            "line_count" = 5000,
+            "stem_words" = F,
+            "dir" = "./data/models",
+            "format" = "obj"
+        ),
+
+        #' @description
+        #' It initializes the current obj. It is used to set the file name
+        #' and verbose options.
+        #' @param file_name The path to the input file.
+        #' @param opts The options for generating the ngram tokens.
+        #'   n -> The ngram size.
+        #'   save_ngrams -> If the ngram data should be saved.
+        #'   min_freq -> All ngrams with frequency less than min_freq are
+        #'     ignored.
+        #'   line_count -> The number of lines to process at a time.
+        #'   stem_words -> If words should be converted to their stem.
+        #'   dir -> The dir where the output file should be saved.
+        #'   format -> The format for the output. There are two options.
+        #'     'plain' -> The data is stored in plain text.
+        #'     'obj' -> The data is stored as an R obj.
+        #' @param verbose Indicates if progress information should be
+        #'   displayed.
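+        #' @examples
+        #' \dontrun{
+        #' # A minimal sketch (illustrative): bigram tokens are generated
+        #' # from a cleaned input file and saved to the models dir
+        #' tg <- TokenGenerator$new("./data/models/validate-clean.txt",
+        #'     opts = list("n" = 2, "save_ngrams" = T), verbose = 1)
+        #' tg$generate_tokens()
+        #' }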
+        initialize = function(file_name = "./data/models/validate-clean.txt",
+                              opts = self$tg_opts,
+                              verbose = 0) {
+            # The given options are merged with the tg_opts attribute
+            self$tg_opts <- modifyList(self$tg_opts, opts)
+            # The tg_opts is merged with the base class opts attribute
+            self$opts <- modifyList(self$opts, self$tg_opts)
+            # The base class is initialized
+            super$initialize(file_name, self$opts[["line_count"]], verbose)
+            # The processed output is initialized
+            self$p_output <- NULL
+        },
+
+        #' @description
+        #' It generates ngram tokens and their frequencies from the given
+        #' file. The tokens may be saved to a text file as plain text or as
+        #' an R obj.
+        #' @return The ngram tokens along with their frequencies, in the
+        #'   p_output attribute.
+        generate_tokens = function() {
+            # The processed output is initialized
+            self$p_output <- NULL
+            # The output file name
+            fn <- private$get_file_name()
+            # If the output file already exists
+            if (file.exists(fn)) {
+                # The information message
+                msg <- paste0("The ", self$opts[["n"]],
+                              "-gram file already exists")
+                # The information message is shown
+                self$display_msg(msg, 1)
+                # If the ngram data should not be saved
+                if (!self$opts[["save_ngrams"]]) {
+                    # The ngrams file is read. (read_data is a private
+                    # method, so it is accessed through private)
+                    self$p_output <- private$read_data(
+                        fn, self$opts[["format"]], T)
+                }
+            }
+            else {
+                # The information message
+                msg <- paste0("Generating ",
+                              self$opts[["n"]], "-gram tokens...")
+                # The information message is shown
+                self$display_msg(msg, 1)
+                # The base class process_file function is called
+                super$process_file(super$pre_process, private$process,
+                                   private$post_process)
+            }
+        }
+    ),
+
+    private = list(
+        # @description
+        # Performs processing for the \code{generate_tokens} function. It
+        # processes the given lines of text. It converts each line of text
+        # into ngrams of the given size. The frequency of each ngram is
+        # updated.
+        # @param lines The lines of text.
+        process = function(lines) {
+            # Ngrams are extracted from each line
+            ngrams <- private$generate_ngrams(lines)
+            # If the processed output is empty
+            if (is.null(self$p_output)) {
+                # The ngram words are set to the processed output
+                self$p_output <- ngrams
+            }
+            else {
+                # The ngram words are appended to the processed output
+                self$p_output <- c(self$p_output, ngrams)
+            }
+        },
+
+        # @description
+        # It returns the name of the output ngram file.
+        get_file_name = function() {
+            # The ngram number
+            n <- self$opts[["n"]]
+            # The format
+            fo <- self$opts[["format"]]
+            # The output directory
+            dir <- self$opts[["dir"]]
+            # The file extension
+            if (fo == "plain") ext <- ".txt"
+            else ext <- ".RDS"
+
+            # The file name
+            file_name <- paste0(dir, "/n", n, ext)
+
+            return(file_name)
+        },
+
+        # @description
+        # It saves the ngram tokens and their frequencies to a text file.
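+        # @examples
+        # # e.g. if p_output contains c("a_b", "a_b", "b_c"), then the
+        # # grouping below produces the data frame:
+        # #     pre  freq
+        # #     a_b     2
+        # #     b_c     1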
+        post_process = function() {
+            # The information message
+            msg <- paste0("Calculating ",
+                          self$opts[["n"]], "-gram frequencies...")
+            # The information message is shown
+            self$display_msg(msg, 1)
+            # The output is copied to a data frame
+            df <- data.frame("pre" = self$p_output)
+            # A frequency column is added
+            df$freq <- 1
+            # Each prefix is grouped and its frequencies are summed
+            df <- df %>% group_by(pre) %>% summarize_all(sum)
+            # If the minimum ngram frequency is given
+            if (self$opts[["min_freq"]] > -1) {
+                # The information message
+                msg <- paste0("Removing low frequency ngrams...")
+                # The information message is shown
+                self$display_msg(msg, 2)
+                # All ngrams with frequency less than min_freq are ignored
+                df <- df[df$freq >= self$opts[["min_freq"]], ]
+            }
+            # The column names are set
+            colnames(df) <- c("pre", "freq")
+            # The output is set to the updated variable
+            self$p_output <- df
+            # If the ngram data should be saved
+            if (self$opts[["save_ngrams"]]) {
+                # The required file name
+                fn <- private$get_file_name()
+                # The format
+                fo <- self$opts[["format"]]
+                # The n-gram data frame is written to file
+                private$write_data(self$p_output, fn, fo, F)
+            }
+        },
+
+        # @description
+        # It generates ngram frequencies for the given lines of text.
+        # @param lines The lines of text to process.
+        generate_ngrams = function(lines) {
+            # The ngram number
+            n <- self$opts[["n"]]
+            # If n > 1
+            if (n > 1) {
+                # Trailing and leading white space is removed
+                l <- trimws(lines, "both")
+                # Start and end of sentence tags are added
+                l <- gsub("(^)(.+)($)", "<s>\\2</s>", l)
+                # The lines are split on space
+                w <- strsplit(l, " ")
+                # The words are converted to an atomic vector
+                w <- unlist(w)
+                # The index of empty words
+                i <- (w == "")
+                # The empty words are removed
+                w <- w[!i]
+                # The indexes for the words
+                indexes <- 1:length(w)
+                # The ngrams are generated
+                l <- sapply(indexes, function(i) {
+                    # If the words should be stemmed
+                    if (self$tg_opts[["stem_words"]]) {
+                        # The ngram prefix words are stemmed. The next word
+                        # is not stemmed
+                        v <- c(wordStem(w[i:(i+n-2)]), w[(i+n-1)])
+                    }
+                    else {
+                        # The ngram token
+                        v <- w[i:(i+n-1)]
+                    }
+                    # The ngram token words are joined with "_"
+                    v <- paste0(v, collapse = "_")
+                    # The ngram token is returned
+                    return(v)
+                },
+                simplify = T)
+                # Invalid ngrams, i.e. ngrams that cross a sentence
+                # boundary, need to be removed
+                i <- grepl(".+</s>.+", l)
+                # The list of valid ngrams
+                l <- l[!i]
+                # The start of sentence tokens are removed
+                l <- gsub("<s>", "", l)
+                # The end of sentence tokens are removed
+                l <- gsub("</s>", "", l)
+            }
+            else {
+                # The line is split on " "
+                words <- strsplit(lines, " ")
+                # The list of words is converted to an atomic vector
+                l <- unlist(words)
+                # The index of empty words
+                i <- l == ""
+                # The empty words are removed
+                l <- l[!i]
+            }
+
+            return(l)
+        }
+    )
+)
diff --git a/R/tp-generator.R b/R/tp-generator.R
new file mode 100644
index 0000000..7a2ebb9
--- /dev/null
+++ b/R/tp-generator.R
@@ -0,0 +1,342 @@
+#' It is used to generate the transition probabilities
+#'
+#' @description
+#' It implements Markov Chains for ngrams. It generates transition
+#' probabilities for ngrams.
+#'
+#' @details
+#' It reads ngram frequencies from an input text file. It parses each
+#' ngram into a prefix, a next word and the next word frequency. The prefix
+#' is converted to a numeric hash using the digest2int function from the
+#' digest package. The next word is replaced with the position of the next
+#' word in the list of all words. The data is stored in a data frame. It
+#' may be saved to a file.
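+#'
+#' @examples
+#' \dontrun{
+#' # A minimal usage sketch (illustrative): transition probabilities are
+#' # generated from previously saved n1/n2 token files in ./data/models
+#' tpg <- TPGenerator$new(verbose = 1)
+#' tpg$generate_tp(list("n_range" = 1:2, "save_tp" = T,
+#'     "dir" = "./data/models"))
+#' }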
+TPGenerator <- R6::R6Class(
+    "TPGenerator",
+    inherit = TextFileProcessor,
+    public = list(
+        #' @description
+        #' It initializes the current obj. It is used to set the verbose
+        #' option.
+        #' @param verbose If progress information should be displayed.
+        initialize = function(verbose = 0) {
+            # The base class is initialized
+            super$initialize(NULL, NULL, verbose)
+            # The processed output is initialized
+            self$p_output <- data.frame()
+        },
+
+        #' @description
+        #' It groups the given tp data by prefix. Each prefix keeps the top
+        #' c next words with the highest probabilities. All other next words
+        #' are removed. The trimmed tp data may be saved to a file or
+        #' returned. The file is saved with the suffix -min,
+        #' e.g. model-4-min.RDS.
+        #' @param opts The options for trimming the tp data.
+        #'   save_tp -> If the data should be saved.
+        #'   dir -> The dir where the output file should be saved.
+        #'   c -> The top c next words per ngram prefix.
+        #'   m -> The maximum ngram number supported by the model.
+        trim_tp = function(opts) {
+            # The model file name
+            fn <- paste0(opts$dir, "/model-", opts$m, ".RDS")
+            # The model file is read to a data frame
+            df <- self$read_obj(fn)
+            # For each prefix only the top c next words by probability are
+            # kept. (The trimming step below is an assumed implementation
+            # of the behavior documented above.)
+            df <- df %>% group_by(pre) %>%
+                slice_max(prob, n = opts$c, with_ties = F) %>%
+                ungroup()
+            # If the trimmed data should be saved
+            if (opts$save_tp) {
+                # The file is saved with the suffix -min
+                fn <- paste0(opts$dir, "/model-", opts$m, "-min.RDS")
+                # The trimmed model is saved
+                self$save_obj(df, fn)
+            }
+
+            return(df)
+        },
+
+        #' @description
+        #' It generates the transition probabilities for the given ngram
+        #' numbers. It first generates the transition probabilities for
+        #' each ngram number. The transition probabilities are then combined
+        #' into a single data frame. The data frame may be saved to a file.
+        #' @param opts The options for generating the transition
+        #'   probabilities.
+        #'   save_tp -> If the data should be saved.
+        #'   n_range -> The range of ngram numbers.
+        #'   dir -> The dir where the output file should be saved.
+        #'   format -> The format for the output. There are two options.
+        #'     'plain' -> The data is stored in plain text.
+        #'     'obj' -> The data is stored as an R obj.
+        generate_tp = function(opts) {
+            # The opts is merged with the tp_opts attribute
+            private$tp_opts = modifyList(private$tp_opts, opts)
+            # The tp_opts is merged with the base class opts attribute
+            self$opts = modifyList(self$opts, private$tp_opts)
+            # The information message
+            msg <- paste0(
+                "Generating Transition Probabilities for n = ",
+                min(self$opts[["n_range"]]),
+                ":",
+                max(self$opts[["n_range"]])
+            )
+            # Information message is shown
+            self$display_msg(msg, 1)
+            # The processed output is cleared
+            self$p_output <- data.frame()
+            # The ngram number range
+            n_range <- self$opts[["n_range"]]
+            # The minimum ngram number
+            nmin <- min(n_range)
+            # The maximum ngram number
+            nmax <- max(n_range)
+            # The output format
+            fo <- self$opts[["format"]]
+            # The file extension
+            if (fo == "plain") ext <- ".txt"
+            else ext <- ".RDS"
+            # The options for generating transition probabilities
+            tp_opts <- list(
+                n = 1,
+                format = fo,
+                save_tp = T,
+                dir = self$opts[["dir"]]
+            )
+            # The combined tp data
+            c_pre <- c_nw <- c_prob <- c()
+            # For each ngram number, the transition probabilities data is
+            # generated.
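+            # (For example, with n_range = 1:4 the loop below produces the
+            # word list from the 1-grams and a single combined data frame
+            # of the 2, 3 and 4-gram transition probabilities.)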
+            for (n in n_range) {
+                # The value of n is set
+                tp_opts$n <- n
+                # The transition probabilities or word list is generated
+                self$generate_tp_for_n(tp_opts)
+                # If n == 1, then the word list data is saved
+                if (n == 1) {
+                    # The word list data is saved
+                    private$save_data()
+                }
+                else {
+                    # c_pre is updated
+                    c_pre <- c(c_pre, self$p_output$pre)
+                    # c_nw is updated
+                    c_nw <- c(c_nw, self$p_output$nw)
+                    # c_prob is updated
+                    c_prob <- c(c_prob, self$p_output$prob)
+                    # The processed output is cleared
+                    self$p_output <- data.frame()
+                }
+            }
+            # The processed output is set to the combined tp data
+            self$p_output <-
+                data.frame("pre" = c_pre,
+                           "nw" = c_nw,
+                           "prob" = c_prob)
+            # The combined tp data is saved
+            private$save_data(paste0("model-", nmax, ext))
+        },
+
+        #' @description
+        #' It reads ngram token frequencies from an input text file. It
+        #' generates a data frame containing the prefix, next word and next
+        #' word frequency. The data frame may be saved to a file as plain
+        #' text or as an R obj. For n = 1, the list of words is saved.
+        #' @param opts The options for generating the transition
+        #'   probabilities.
+        #'   save_tp -> If the data should be saved.
+        #'   n -> The ngram number.
+        #'   dir -> The location of the input and output files.
+        #'   format -> The format of the input and output files. Options are:
+        #'     'plain' -> The data is stored in plain text.
+        #'     'obj' -> The data is stored as an R obj.
+        generate_tp_for_n = function(opts) {
+            # The opts is merged with the tp_opts attribute
+            private$tp_opts = modifyList(private$tp_opts, opts)
+            # The tp_opts is merged with the base class opts attribute
+            self$opts = modifyList(self$opts, private$tp_opts)
+            # The output format
+            fo <- self$opts[["format"]]
+            # The ngram number
+            n <- self$opts[["n"]]
+            # The output file name
+            fn <- private$get_file_name(T)
+            # If the output file already exists
+            if (file.exists(fn)) {
+                # The information message
+                msg <- paste0("The file: ", fn, " already exists")
+                # The information message is shown
+                self$display_msg(msg, 1)
+                # The file is read
+                data <- private$read_data(fn, fo, T)
+                # If n = 1
+                if (n == 1) {
+                    # The word list is set to the data
+                    private$wl <- data
+                }
+                else {
+                    # The processed output is set to the data
+                    self$p_output <- data
+                }
+            }
+            else {
+                # The information message
+                msg <- paste0("Generating Transition Probabilities for n=", n)
+                # Information message is shown
+                self$display_msg(msg, 1)
+
+                # The input file name
+                self$file_name <- private$get_file_name(F)
+                # The data is read
+                df <- private$read_data(self$file_name, fo, T)
+                # If n = 1
+                if (n == 1) {
+                    # The word list is set to the data frame
+                    private$wl <- df
+                    # A probabilities column is added
+                    private$wl$prob = (private$wl$freq/sum(private$wl$freq))
+                    # The probabilities are rounded to 8 decimal places
+                    private$wl$prob = round(private$wl$prob, 8)
+                    # The frequency column is removed
+                    private$wl$freq <- NULL
+                }
+                else {
+                    # The 1-gram words are read
+                    private$read_words()
+                    # Each ngram is split into a prefix and a next word on
+                    # the last "_"
+                    m <- str_match(df$pre, "(.+)_(.+)")
+                    # The hash of the prefix is taken
+                    np <- digest2int(m[,2])
+                    # The next word is replaced by its position in the list
+                    # of words
+                    nw <- match(m[,3], private$wl$pre)
+                    # The next word frequencies
+                    nf <- df$freq
+                    # The data is added to a data frame
+                    df <- data.frame("pre" = np,
+                                     "nw" = nw,
+                                     "freq" = nf)
+                    # The processed output is set to the data frame
+                    self$p_output <- df
+                    # The next word probabilities are generated
+                    private$generate_probs()
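+                    # (e.g. a prefix whose next words have frequencies
+                    # c(3, 1) gets the probabilities c(0.75, 0.25))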
+                    # The frequency column is removed
+                    self$p_output$freq <- NULL
+                    # If the data should be saved
+                    if (self$opts[["save_tp"]]) {
+                        private$save_data()
+                    }
+                }
+            }
+        }
+    ),
+
+    private = list(
+        # @field tp_opts The options for generating the transition
+        # probabilities.
+        #   save_tp -> If the data should be saved.
+        #   n -> The ngram number.
+        #   dir -> The dir where the output file should be saved.
+        #   format -> The format for the output. There are two options.
+        #     'plain' -> The data is stored in plain text.
+        #     'obj' -> The data is stored as an R obj.
+        tp_opts = list(
+            "save_tp" = F,
+            "n" = 1,
+            "dir" = "./data/models",
+            "format" = "obj"
+        ),
+
+        # @field wl The list of unique words and their frequencies
+        wl = data.frame(),
+
+        # @description
+        # It calculates the next word probabilities.
+        generate_probs = function() {
+            # Information message is shown
+            self$display_msg("Generating Transition Probabilities...", 1)
+            # The ngram number
+            n <- self$opts[["n"]]
+            # If n > 1
+            if (n > 1) {
+                # The output is copied to a variable
+                df <- self$p_output
+                # A new probability column is added. It is set to the sum
+                # of the frequency column for each prefix group.
+                df <- df %>%
+                    group_by(pre) %>%
+                    mutate(prob = sum(freq))
+                # Each frequency is divided by the sum to give the
+                # probability.
+                df$prob <- round(df$freq/df$prob, 8)
+                # The output is set to the updated variable
+                self$p_output <- df
+            }
+        },
+
+        # @description
+        # It returns the name of the output or input file.
+        # @param is_output If the output file name is required.
+        get_file_name = function(is_output) {
+            # The ngram number
+            n <- self$opts[["n"]]
+            # The directory
+            od <- self$opts[["dir"]]
+            # The format
+            fo <- self$opts[["format"]]
+            # The file extension
+            if (fo == "plain") ext <- ".txt"
+            else ext <- ".RDS"
+            # If the output file name is required
+            if (is_output) {
+                # If n = 1
+                if (n == 1) {
+                    # The file name
+                    fn <- paste0(od, "/words", ext)
+                }
+                # If n > 1
+                else if (n > 1) {
+                    # The file name
+                    fn <- paste0(od, "/tp", n, ext)
+                }
+            }
+            else {
+                # The file name
+                fn <- paste0(od, "/n", n, ext)
+            }
+
+            return(fn)
+        },
+
+        # @description
+        # It saves the transition probabilities or the word list, depending
+        # on the n option, to a file in plain format or as an R obj. If the
+        # file name is not given, then it is generated using the current
+        # object attributes.
+        # @param file_name The file name to use.
+        save_data = function(file_name = NULL) {
+            # The ngram number
+            n <- self$opts[["n"]]
+            # The directory
+            od <- self$opts[["dir"]]
+            # The format
+            fo <- self$opts[["format"]]
+            # If n = 1
+            if (n == 1) {
+                # The data to save
+                data <- private$wl
+            }
+            # If n > 1
+            else if (n > 1) {
+                # The data to save
+                data <- self$p_output
+            }
+            # If the file name is given as a parameter, then it is used
+            if (!is.null(file_name)) fn <- paste0(od, "/", file_name)
+            else fn <- private$get_file_name(T)
+            # The data is written
+            private$write_data(data, fn, fo, F)
+        },
+
+        # @description
+        # It reads the list of 1-gram words.
+        read_words = function() {
+            # If the word list has not been read
+            if (nrow(private$wl) == 0) {
+                # The format
+                fo <- self$opts[["format"]]
+                # The file extension
+                if (fo == "plain") ext <- ".txt"
+                else ext <- ".RDS"
+                # The 1-gram words file name
+                fn <- paste0(self$opts[["dir"]], "/words", ext)
+                # The words are read
+                private$wl <- private$read_data(fn, self$opts[["format"]], F)
+            }
+        }
+    )
+)
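+
+# A minimal usage sketch (illustrative): the combined model file
+# model-4.RDS is trimmed so that each prefix keeps only its top 3 next
+# words
+# tpg <- TPGenerator$new(verbose = 1)
+# trimmed <- tpg$trim_tp(list("save_tp" = T, "dir" = "./data/models",
+#     "c" = 3, "m" = 4))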