From c83ca0d2f1287df58e993998fd7f24b8accc2072 Mon Sep 17 00:00:00 2001
From: pakjiddat
Date: Tue, 11 May 2021 06:45:53 +0000
Subject: [PATCH] Add source files

---
 R/data-analyzer.R       |  249 +++++++++
 R/data-cleaner.R        |  244 +++++++++
 R/model-evaluator.R     | 1136 +++++++++++++++++++++++++++++++++++++++
 R/text-file-processor.R |  339 ++++++++++++
 R/token-generator.R     |  241 +++++++++
 R/tp-generator.R        |  342 ++++++++++++
 6 files changed, 2551 insertions(+)
 create mode 100644 R/data-analyzer.R
 create mode 100644 R/data-cleaner.R
 create mode 100644 R/model-evaluator.R
 create mode 100644 R/text-file-processor.R
 create mode 100644 R/token-generator.R
 create mode 100644 R/tp-generator.R

diff --git a/R/data-analyzer.R b/R/data-analyzer.R
new file mode 100644
index 0000000..6484074
--- /dev/null
+++ b/R/data-analyzer.R
@@ -0,0 +1,249 @@
+#' It is used to analyze text data
+#'
+#' @description
+#' It provides information on text files, such as the number of lines and
+#' number of words. It allows generating a sample file from an input text
+#' file. It displays bar plots showing word frequencies.
+#'
+#' @details
+#' It provides a method that returns text file information. The text
+#' file information includes the total number of lines, the maximum, minimum
+#' and mean line length, and the file size. It also provides a method that
+#' takes random samples of lines from an input text file. It provides a
+#' method that reads an input text file containing token frequencies and
+#' displays the most frequently occurring tokens.
+DataAnalyzer <- R6::R6Class(
+    "DataAnalyzer",
+    inherit = TextFileProcessor,
+    public = list(
+        #' @description
+        #' It initializes the current object. It is used to set the file name
+        #' and verbose options.
+        #' @param file_name The path to the input file.
+        #' @param verbose If progress information should be displayed.
+        initialize = function(file_name = "./data/n1.txt",
+                              verbose = 0) {
+            # The file name is set
+            self$file_name <- file_name
+            # The processed output is initialized
+            self$p_output <- data.frame()
+            # The verbose option is set
+            self$verbose <- verbose
+        },
+
+        #' @description
+        #' It reads ngram token frequencies from an input text file. The ngram
+        #' frequencies are then displayed in a bar plot. The type of plot is
+        #' specified by the type option. 'top_features' displays the n most
+        #' frequently occurring tokens along with their frequencies.
+        #' 'coverage' displays the percentage of all tokens that occur at each
+        #' frequency from 1 to n.
+        #' @param opts The options for analyzing the data.
+        #'      type -> The type of plot to display. The options are:
+        #'        'top_features', 'coverage'.
+        #'      n -> For 'top_features', it is the number of top tokens to
+        #'        display. For 'coverage', it is the maximum token frequency
+        #'        to consider.
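+        #' A short usage sketch is given below; the ngram file path is an
+        #' illustrative assumption, not part of this patch.
+        #' @examples
+        #' \dontrun{
+        #' da <- DataAnalyzer$new("./data/n1.txt", verbose = 1)
+        #' # The 10 most frequent tokens are plotted
+        #' da$plot_data(list("type" = "top_features", "n" = 10))
+        #' # The percentage of tokens at each frequency from 1 to 10 is
+        #' # plotted
+        #' da$plot_data(list("type" = "coverage", "n" = 10))
+        #' }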
+        plot_data = function(opts) {
+            # The opts is merged with the da_opts attribute
+            private$da_opts <- modifyList(private$da_opts, opts)
+            # The da_opts is merged with the base class opts attribute
+            self$opts <- modifyList(self$opts, private$da_opts)
+            # The ngram data is read
+            df <- self$read_file(self$file_name, T)
+            # The information message is shown
+            self$display_msg("Displaying Plot...", 1)
+            # If the coverage option was specified
+            if (self$opts[["type"]] == "coverage") {
+                # The y values
+                y <- as.character(1:self$opts[["n"]])
+                # The x values
+                x <- numeric()
+                # The percentage frequencies are calculated
+                for (i in 1:self$opts[["n"]]) {
+                    # The percentage of tokens with frequency i
+                    x[i] <- round(100*(nrow(df[df$freq == i,])/nrow(df)), 2)
+                }
+                # A data frame is created
+                df <- data.frame("freq" = x, "pre" = y)
+                # The plot labels
+                labels <- list(
+                    y = "Percentage of total",
+                    x = "Word Frequency",
+                    title = "Coverage")
+
+                # The chart is plotted
+                private$display_plot(df, labels)
+            }
+            # If the top_features option was specified
+            else if (self$opts[["type"]] == "top_features") {
+                # The plot labels
+                labels <- list(
+                    y = "Frequency",
+                    x = "Feature",
+                    title = paste("Top", self$opts[["n"]], "Features"))
+
+                # The chart is plotted
+                private$display_plot(df, labels)
+            }
+        },
+
+        #' @description
+        #' Generates and returns information about the given text files.
+        #' @param file_list The list of text files to check.
+        #' @return A data frame containing the overall file statistics.
+        get_file_info = function(file_list = list()) {
+            # If the file list is empty, then the file name passed to the
+            # current object is used.
+            if (length(file_list) == 0)
+                file_list <- list(self$file_name)
+
+            # A data frame used to store the combined file statistics. The
+            # minimum line length starts at Inf so the first file updates it.
+            stats <- data.frame(
+                "total_line_count" = 0,
+                "max_line_length" = 0,
+                "min_line_length" = Inf,
+                "mean_line_length" = 0,
+                "total_size" = 0)
+            # Temporary variables for calculating max and min line length
+            temp_max <- temp_min <- 0
+            # The total number of characters in all files. It is used to
+            # calculate the mean line length.
+            total_chars <- 0
+
+            # For each file in the list
+            for (file_name in file_list) {
+                # The file is read
+                lines <- self$read_file(file_name, F)
+
+                # The file stats are updated
+                stats["total_size"] <-
+                    stats["total_size"] + file.size(file_name)
+                stats["total_line_count"] <-
+                    stats["total_line_count"] + length(lines)
+
+                # The temporary variables are updated
+                temp_max <- max(nchar(lines))
+                temp_min <- min(nchar(lines))
+                total_chars <- total_chars + sum(nchar(lines))
+
+                if (temp_max > stats["max_line_length"])
+                    stats["max_line_length"] <- temp_max
+                # The minimum is updated if the current file has a smaller
+                # minimum line length
+                if (temp_min < stats["min_line_length"])
+                    stats["min_line_length"] <- temp_min
+            }
+            # The mean line length is calculated over all lines in all files
+            stats["mean_line_length"] <-
+                round(total_chars/stats[["total_line_count"]])
+            # The total size is formatted
+            stats["total_size"] <- utils:::format.object_size(
+                stats["total_size"], "auto")
+
+            # The required data is returned
+            return(stats)
+        },
+
+        #' @description
+        #' It generates training, testing and validation data sets
+        #' from the given input file. It first reads the file given as a
+        #' parameter to the current object. It generates random indexes for
+        #' the data. It partitions the data into disjoint training, testing
+        #' and validation sets, according to the given proportions. The files
+        #' are named train.txt, test.txt and validate.txt. The files are
+        #' saved to the given output folder.
+        #' @param dir The name of the output folder.
+        #' @param percs The proportions of the training, testing and
+        #'      validation sets.
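+        #' A hedged usage sketch; the input file path and the split
+        #' proportions are illustrative assumptions.
+        #' @examples
+        #' \dontrun{
+        #' da <- DataAnalyzer$new("./data/input.txt")
+        #' # 80% training, 10% testing and 10% validation data is generated
+        #' da$generate_data("./data", list(
+        #'     "train" = 0.8, "test" = 0.1, "validate" = 0.1))
+        #' }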
+        generate_data = function(dir, percs) {
+            # The information message is shown
+            self$display_msg(
+                "Generating training, testing and validation data sets...", 1)
+            # If the train, test and validation files already exist
+            if (file.exists(paste0(dir, "/train.txt")) &&
+                file.exists(paste0(dir, "/test.txt")) &&
+                file.exists(paste0(dir, "/validate.txt"))) {
+                # The information message
+                msg <- "The train, test and validate files already exist"
+                # The information message is shown
+                self$display_msg(msg, 1)
+            }
+            else {
+                # The input file is read
+                data <- self$read_file(self$file_name, F)
+                # The number of lines in the data
+                lc <- length(data)
+                # Random indexes are generated
+                indexes <- sample(1:lc, lc)
+                # The randomized data
+                rd <- data[indexes]
+                # The end index of the training set
+                e1 <- round(lc*percs[["train"]])
+                # The end index of the testing set
+                e2 <- e1 + round(lc*percs[["test"]])
+                # The end index of the validation set. It is capped at the
+                # number of lines.
+                e3 <- min(lc, e2 + round(lc*percs[["validate"]]))
+                # The training set data
+                train_ds <- rd[1:e1]
+                # The testing set data. It does not overlap with the
+                # training set.
+                test_ds <- rd[(e1 + 1):e2]
+                # The validation set data. It does not overlap with the
+                # training or testing sets.
+                validate_ds <- rd[(e2 + 1):e3]
+                # The training data is written to file
+                self$write_file(train_ds, paste0(dir, "/train.txt"), F)
+                # The testing data is written to file
+                self$write_file(test_ds, paste0(dir, "/test.txt"), F)
+                # The validation data is written to file
+                self$write_file(validate_ds, paste0(dir, "/validate.txt"), F)
+            }
+        },
+
+        #' @description
+        #' Returns the given number of ngrams and their
+        #' frequencies. If the prefix parameter is not given, then the ngrams
+        #' are randomly chosen. Otherwise ngrams starting with the given
+        #' regular expression are returned.
+        #' @param fn The ngram file name.
+        #' @param c The number of ngrams to return.
+        #' @param pre The ngram prefix, given as a regular expression.
+        get_ngrams = function(fn, c = NULL, pre = NULL) {
+            # The data is read
+            df <- self$read_obj(fn)
+            # If the prefix is not given
+            if (is.null(pre)) {
+                # The sample indexes
+                i <- sample(1:nrow(df), c)
+                # The ngram samples
+                s <- df[i,]
+            }
+            else {
+                # The ngram samples
+                s <- df[grepl(pre, df$pre), ]
+            }
+            return(s)
+        }
+    ),
+
+    private = list(
+        # @field da_opts The options for the data analyzer object.
+        #   type -> The type of plot to display. The options are:
+        #     'top_features', 'coverage'.
+        #   n -> For 'top_features', it is the number of most frequently
+        #     occurring tokens to display.
+        da_opts = list(
+            "type" = "top_features",
+            "n" = 10
+        ),
+
+        # @description
+        # Displays a plot using ggplot2. The plot is a horizontal
+        # bar plot filled with red. It has the given labels and main title.
+        # @param df The data to plot. It is a data frame with pre and freq
+        # columns.
+        # @param labels The main title, x and y axis labels.
+        display_plot = function(df, labels) {
+            # The freq column is converted to numeric
+            df$freq <- as.numeric(df$freq)
+            # The pre column is converted to character
+            df$pre <- as.character(df$pre)
+            # The data frame is sorted in descending order of frequency
+            df <- (df[order(df$freq, decreasing = T),])
+            # The top n terms are extracted
+            df <- df[1:self$opts[["n"]], ]
+            # The ngram names and their frequencies are plotted
+            g <- ggplot(data = df, aes(x = reorder(pre, freq), y = freq)) +
+                geom_bar(stat = "identity", fill = "red") +
+                ggtitle(labels[["title"]]) +
+                coord_flip() +
+                ylab(labels[["y"]]) +
+                xlab(labels[["x"]])
+            print(g)
+        }
+    )
+)
diff --git a/R/data-cleaner.R b/R/data-cleaner.R
new file mode 100644
index 0000000..61ec87c
--- /dev/null
+++ b/R/data-cleaner.R
@@ -0,0 +1,244 @@
+#' Provides data cleaning functionality
+#'
+#' @description
+#' Allows removing unneeded characters from text files.
+#'
+#' @details
+#' It provides a method for cleaning text files. It allows removing bad words,
+#' stop words, non dictionary words, extra space, punctuation and non-alphabet
+#' characters, and converting text to lower case. It supports large text
+#' files.
+DataCleaner <- R6::R6Class(
+    "DataCleaner",
+    inherit = TextFileProcessor,
+    public = list(
+        #' @field dc_opts The options for the data cleaner object.
+        #'      min_words -> The minimum number of words per sentence.
+        #'      line_count -> The number of lines to read and clean at a time.
+        #'      save_data -> If the combined processed lines should be saved.
+        #'      output_file -> Name of the output file used to store the data.
+        #'      sw_file -> The stop words file path.
+        #'      dict_file -> The dictionary file path.
+        #'      bad_file -> The bad words file path.
+        #'      to_lower -> If the words should be converted to lower case.
+        #'      remove_stop -> If stop words should be removed.
+        #'      remove_punct -> If punctuation symbols should be removed.
+        #'      remove_non_dict -> If non dictionary words should be removed.
+        #'      remove_non_alpha -> If non alphabet symbols should be removed.
+        #'      remove_extra_space -> If leading, trailing and double spaces
+        #'        should be removed.
+        #'      remove_bad -> If bad words should be removed.
+        dc_opts = list(
+            "min_words" = 2,
+            "line_count" = 1000,
+            "save_data" = T,
+            "output_file" = "./data/sample-clean.txt",
+            "sw_file" = "./data/stop-words.txt",
+            "dict_file" = "./data/dict-no-bad.txt",
+            "bad_file" = "./data/bad-words.txt",
+            "to_lower" = T,
+            "remove_stop" = F,
+            "remove_punct" = T,
+            "remove_non_dict" = T,
+            "remove_non_alpha" = T,
+            "remove_extra_space" = T,
+            "remove_bad" = F
+        ),
+
+        #' @field sw The list of stop words.
+        sw = list(),
+
+        #' @field bw The list of bad words.
+        bw = list(),
+
+        #' @field dw The list of dictionary words.
+        dw = list(),
+
+        #' @description
+        #' It initializes the current object. It is used to set the file name,
+        #' data cleaning options and verbose option.
+        #' @param file_name The path to the file to clean.
+        #' @param opts The options for data cleaning.
+        #'      min_words -> The minimum number of words per sentence.
+        #'      line_count -> The number of lines to read and clean at a time.
+        #'      save_data -> If the combined processed lines should be saved.
+        #'      output_file -> Name of the output file used to store the data.
+        #'      sw_file -> The stop words file path.
+        #'      dict_file -> The dictionary file path.
+        #'      bad_file -> The bad words file path.
+        #'      to_lower -> If the words should be converted to lower case.
+        #'      remove_stop -> If stop words should be removed.
+        #'      remove_punct -> If punctuation symbols should be removed.
+        #'      remove_non_dict -> If non dictionary words should be removed.
+        #'      remove_non_alpha -> If non alphabet symbols should be removed.
+        #'      remove_extra_space -> If leading, trailing and double spaces
+        #'        should be removed.
+        #'      remove_bad -> If bad words should be removed.
+        #' @param verbose Indicates if progress information should be
+        #'      displayed.
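+        #' A minimal usage sketch, assuming the default stop word, dictionary
+        #' and bad word files exist; the input file path is an assumption.
+        #' @examples
+        #' \dontrun{
+        #' dc <- DataCleaner$new("./data/sample.txt",
+        #'     opts = list("remove_stop" = T, "remove_bad" = T),
+        #'     verbose = 1)
+        #' dc$clean_file()
+        #' }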
+ initialize = function(file_name = NULL, + opts = self$dc_opts, + verbose = 0) { + # The given options are merged with the opts attribute + self$dc_opts <- modifyList(self$dc_opts, opts) + # The dc_opts is merged with the base class opts attribute + self$opts <- modifyList(self$opts, self$dc_opts) + # The stop words file is read + self$sw <- self$read_file(self$opts[["sw_file"]], F); + # The dictionary file is read + self$dw <- self$read_file(self$opts[["dict_file"]], F); + # The bad word file is read + self$bw <- self$read_file(self$opts[["bad_file"]], F); + # The base class is initialized + super$initialize(file_name, self$opts[['line_count']], verbose) + }, + + #' @description + #' It removes unneeded characters from the given text with + #' several options. It allows removing punctuations, numbers, symbols, + #' urls and separators. It allows removing bad words, stop words and + #' words not in the given dictionary file. It reads the given file one + #' line at a time, removing unneeded characters. After every line_count + #' number of lines, the cleaned lines are saved to the output file. + clean_file = function() { + # The information message + msg <- paste0("Cleaning the sample file...", self$file_name) + # The information message is shown + self$display_msg(msg, 1) + # The base class process_file function is called + super$process_file(super$pre_process, private$process, + super$post_process) + }, + + #' @description + #' It cleans the given lines of text using the options + #' passed to the current object. + #' @param lines The input sentences. + #' @return The cleaned lines of text. + clean_lines = function(lines) { + # The lines to clean + l <- lines + # If a line does not end with a ".", then "." is appended to the + # line + l <- gsub("(.+[^\\.])$", "\\1.", l) + # The "." 
character is replaced with the string "specialdotsep" + l <- gsub("\\.", " specialdotsep ", l) + # If the words should be converted to lower case + if (self$opts[["to_lower"]]) { + # The information message + self$display_msg("Converting lines to lower case...", 3) + # The line is converted to lower case + l <- tolower(l) + } + # If punctuation symbols should be removed + if (self$opts[["remove_punct"]]) { + # The information message + self$display_msg("Removing punctuation symbols...", 3) + # The pattern for removing all punctuation symbols + l <- gsub("[[:punct:]]+", "", l) + } + # If non alphabet symbols should be removed + if (self$opts[["remove_non_alpha"]]) { + # The information message + self$display_msg("Removing non alphabet symbols...", 3) + # Words containing non alphabetical characters are removed + l <- gsub("([^[:alpha:]\\s])", "", l, perl = T) + } + + # If stop words should be removed + if (self$opts[["remove_stop"]]) { + # The information message + self$display_msg("Removing stop words..", 3) + # Stop words are collapsed + sw <- paste(self$sw, collapse = "|") + swp <- paste("\\b(", sw, ")\\b", sep = "") + # The stop words are removed + l <- gsub(swp, "", l) + } + # If extra spaces should be removed + if (self$opts[["remove_extra_space"]]) { + # The information message + self$display_msg("Removing extra space...", 3) + # Multiple spaces are replaced by single space + l = gsub("\\s{2,}", " ", l) + # Leading and trailing whitespaces are removed + l = trimws(l) + } + + # The words in the lines are extracted + words <- strsplit(l, split = " ") + # The words are converted to an atomic list + words <- unlist(words) + # If non dictionary words should be removed + if (self$opts[["remove_non_dict"]]) { + # The "specialdotsep" string is added to list of dictionary + # words + dw <- c(self$dw, "specialdotsep") + # The non dictionary words are removed from the data + words <- words[words %in% dw] + # All 1 length words except for 'a' and 'i' are removed + # The indexes position of all words that are "a" or "i" + i1 <- (words == "a" | words == "i") + # The index position of words of length 2 or more + i2 <- (nchar(words) > 1) + # The list of all words of length 2 or more including "a" and + # "i" + words <- words[i1 | i2] + } + # If bad words should be removed + if (self$opts[["remove_bad"]]) { + # The "specialdotsep" string is added to list of bad words + bw <- c(self$bw, "specialdotsep") + # The bad words are removed from the data + words <- words[!words %in% self$bw] + } + # The words are combined with space + l <- paste(words, collapse = " ") + # The "specialdotsep" string is replaced with "." 
+ l <- gsub("specialdotsep", ".", l) + # The sentences in the lines are extracted + l <- strsplit(l, split = "\\.") + # The sentences are converted to an atomic list + l <- unlist(l) + # If extra spaces should be removed + if (self$opts[["remove_extra_space"]]) { + # Multiple spaces are replaced by single space + l = gsub("\\s{2,}", " ", l) + # Leading and trailing whitespaces are removed + l = trimws(l) + } + + # If each sentence should have a minimum number of words + if (self$opts[["min_words"]] > -1) { + # The number of words in each sentence + wc <- str_count(l, pattern = boundary("word")) + # The lines containing less than min_words number of words are + # removed + l <- l[wc >= self$opts[["min_words"]]] + } + + # Consecutive 'a' and 'i' are replaced with single 'a' or 'i' + l <- gsub("(a\\s){2,}", "\\1 ", l) + l <- gsub("(i\\s){2,}", "\\1 ", l) + l <- gsub("a$", "", l) + + return(l) + } + ), + + private = list( + + # @description + # Performs processing for the \code{clean_files} function. + # It processes the given lines of text. It divides the given lines of + # text into sentences by spliting on '.'. Each sentence is then cleaned + # using \code{clean_line}. If the number of words in the cleaned + # sentence is less than min_words, then the sentence is rejected. + # @param lines The lines of text to clean. + # @return The processed line is returned. + process = function(lines) { + # The sentence is cleaned + cl <- self$clean_lines(lines) + + return(cl) + } + ) +) diff --git a/R/model-evaluator.R b/R/model-evaluator.R new file mode 100644 index 0000000..ca02aa1 --- /dev/null +++ b/R/model-evaluator.R @@ -0,0 +1,1136 @@ +#' It is used to evaluate the accuracy and performance of the model. +#' +#' @description +#' It provides methods that perform extrinsic and intrinsic model +#' evaluation. It also provides methods for determining the memory and time +#' requirements for generating the model. It also provides a method for +#' determining how much memory is used by the final model. +#' +#' @details +#' It provides a method that performs intrinsic model evaluation based +#' on Perplexity. It also provides a method that performs extrinsic model +#' evalation based on accuracy. It provides a method for determining how much +#' memory and time is needed to generate a model for different input data +#' sizes. It provides a method for determining how much memory is needed by +#' the final model. +ModelEvaluator <- R6::R6Class( + "ModelEvaluator", + inherit = TextFileProcessor, + public = list( + #' @field tp The transition probabilities data frame. + tp = NULL, + + #' @field wl The list of unique words. + wl = NULL, + + #' @field dp The default probability is equal to 1/(N+V), where N is the + #' number of words in the sentence, V is the number of words in the + #' vocabulary. + dp = NULL, + + #' @field model The maximum number of ngrams supported by the model. + model = 4, + + #' @field dc_opts The options for the data cleaner object. + #' min_words -> The minimum number of words per sentence. + #' line_count -> The number of lines to read and clean at a time. + #' sw_file -> The stop words file path. + # dict_file -> The dictionary file path. + #' bad_file -> The bad words file path. + #' to_lower -> If the words should be converted to lower case. + #' remove_stop -> If stop words should be removed. + #' remove_punct -> If punctuation symbols should be removed. + #' remove_non_dict -> If non dictionary words should be removed. 
+        #'      remove_non_alpha -> If non alphabet symbols should be removed.
+        #'      remove_extra_space -> If leading, trailing and double spaces
+        #'        should be removed.
+        #'      remove_bad -> If bad words should be removed.
+        dc_opts = list(
+            "min_words" = 2,
+            "line_count" = 1000,
+            "sw_file" = "./data/stop-words.txt",
+            "dict_file" = "./data/dict-no-bad.txt",
+            "bad_file" = "./data/bad-words.txt",
+            "to_lower" = T,
+            "remove_stop" = F,
+            "remove_punct" = T,
+            "remove_non_dict" = T,
+            "remove_non_alpha" = T,
+            "remove_extra_space" = T,
+            "remove_bad" = F
+        ),
+
+        #' @field tg_opts The options for the token generator object.
+        #'      n -> The ngram size.
+        #'      save_ngrams -> If the ngram data should be saved.
+        #'      min_freq -> All ngrams with frequency less than min_freq are
+        #'        ignored.
+        #'      line_count -> The number of lines to process at a time.
+        #'      stem_words -> If words should be converted to their stem.
+        #'      dir -> The directory where the output file should be saved.
+        #'      format -> The format for the output. There are two options.
+        #'        'plain' -> The data is stored in plain text.
+        #'        'obj' -> The data is stored as an R object.
+        tg_opts = list(
+            "n" = 1,
+            "save_ngrams" = T,
+            "min_freq" = -1,
+            "line_count" = 5000,
+            "stem_words" = F,
+            "dir" = "./data/models",
+            "format" = "obj"
+        ),
+
+        #' @field ssize The sample size in Mb.
+        ssize = 30,
+
+        #' @field ddir The folder containing the data files.
+        ddir = "./data",
+
+        #' @field mdir The folder containing the model files.
+        mdir = "./data/models",
+
+        #' @description
+        #' It initializes the current object. It is used to set the
+        #' maximum ngram number, sample size, input file name, data cleaner
+        #' options and verbose option.
+        #' @param model The maximum ngram number supported by the model.
+        #' @param ssize The sample size in Mb.
+        #' @param ddir The data directory.
+        #' @param mdir The model directory.
+        #' @param dc_opts The data cleaner options.
+        #' @param tg_opts The token generator options.
+        #' @param verbose If progress information should be displayed.
+        initialize = function(model = 4,
+                              ssize = 30,
+                              ddir = "./data",
+                              mdir = "./data/models",
+                              dc_opts = list(),
+                              tg_opts = list(),
+                              verbose = 0) {
+
+            # The base class is initialized
+            super$initialize(NULL, NULL, verbose)
+
+            # The model number is set
+            self$model <- model
+            # The sample size is set
+            self$ssize <- ssize
+            # The data directory name is set
+            self$ddir <- ddir
+            # The model directory name is set
+            self$mdir <- mdir
+            # If the dc_opts are given
+            if (length(dc_opts) > 0) {
+                # The custom dc_opts are merged with the default dc_opts
+                self$dc_opts <- modifyList(self$dc_opts, dc_opts)
+            }
+            # If the tg_opts are given
+            if (length(tg_opts) > 0) {
+                # The custom tg_opts are merged with the default tg_opts
+                self$tg_opts <- modifyList(self$tg_opts, tg_opts)
+            }
+            # The transition probabilities data is initialized
+            self$tp <- NULL
+        },
+
+        #' @description
+        #' It compares the performance of the specified models by
+        #' plotting the performance statistics. The models are specified with
+        #' the type parameter.
+        #' @param type The models to compare. It can be:
+        #'      'basic' -> One model for each ngram size.
+        #'      'grouped' -> One model for each group and ngram size. For
+        #'        example, for a data size of 5 Mb, there are 3 models, for
+        #'        the ngram sizes 2:4.
+        #' @param opts The options for plotting the data.
+        #'      'group' -> The field to group by.
+        #'      'title' -> The main plot title.
+        #'      'subtitle' -> The plot sub title.
+        #' @return The performance stats.
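+        #' An illustrative sketch; the group column name and the titles are
+        #' assumptions.
+        #' @examples
+        #' \dontrun{
+        #' me <- ModelEvaluator$new(mdir = "./data/models")
+        #' pstats <- me$performance_comparision("grouped", list(
+        #'     "group" = "size",
+        #'     "title" = "Model performance",
+        #'     "subtitle" = "Grouped by data size"))
+        #' }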
+ performance_comparision = function(type, opts) { + # If the type is 'basic' + if (type == 'basic') { + # The pstats file name + fn <- paste0(self$mdir, "/pstats.RDS") + # The stats are read + pstats <- self$read_obj(fn) + # The config file name + fn <- paste0(self$mdir, "/config.RDS") + # The config file is read + config <- self$read_obj(fn) + # The y-axis values + y <- list("m" = NULL, "t" = NULL, "p" = NULL, "a" = NULL) + # For each model the performance metrics are measured + for (i in 1:(config$model-1)) { + # The memory value + m <- self$format_size(pstats[[i]][["memory"]]) + # The y-axis values are updated + y[["m"]] <- c(y[["m"]], m) + y[["t"]] <- c(y[["t"]], pstats[[i]][["time"]]) + y[["p"]] <- c(y[["p"]], + pstats[[i]][["intrinsic"]][["mean"]]) + y[["a"]] <- c(y[["a"]], + pstats[[i]][["extrinsic"]][["valid_perc"]]) + } + # The data frame containing the data to be plotted + df <- data.frame( + "n" = 2:(length(pstats) + 1), + "m" = y[["m"]], + "t" = y[["t"]], + "p" = y[["p"]], + "a" = y[["a"]] + ) + # The options for plotting + opts <- list( + "type" = "basic", + "title" = "Variation of performance with ngram size", + "subtitle" = opts[['subtitle']]) + } + # If the type is 'grouped' + else if (type == 'grouped') { + # The average performance stats for each data size + y <- list("m" = NULL, "t" = NULL, "p" = NULL, "a" = NULL) + # The different items in the group. Each folder is a group item + f <- list.dirs(self$mdir, recursive = F, full.names = F) + # The output data frame + df <- data.frame() + # For each folder, the performance stats for all ngram models is + # calculated. + for (i in f) { + # The performance stats file name + fn <- paste0(self$mdir, "/", i, "/pstats.RDS") + # The stats are read + pstats <- self$read_obj(fn) + # The config file name + fn <- paste0(self$mdir, "/", i, "/config.RDS") + # The stats are read + config <- self$read_obj(fn) + # The memory usage for each ngram + m <- private$process_stats(pstats, "memory", config$model) + # The time taken for each ngram + t <- private$process_stats(pstats, "time", config$model) + # The perplexity for each ngram + p <- private$process_stats(pstats, "perplexity", config$model) + # The accuracy score for each ngram + a <- private$process_stats(pstats, "accuracy", config$model) + # The temprary data frame for the current data size + tdf <- data.frame( + rep(i, (config$model-1)), + m, t, p, a, 2:(length(pstats) + 1)) + # The column names are set + names(tdf) <- c(opts[["group"]], "m", "t", "p", "a", "n") + # The temprary data frame is appended to the output data + # frame + df <- rbind(df, tdf) + } + # The group name + g <- opts[["group"]] + # The options for plotting + opts <- list( + "type" = "grouped", + "group" = g, + "title" = opts[['title']], + "subtitle" = opts[['subtitle']]) + # If the group column is numeric + if (is.numeric(as.numeric(as.character(df[[g]])))) { + # The group column is converted to numeric + df[[g]] <- as.numeric(as.character(df[[g]])) + # The data is ordered by group + df <- df[order(df[[opts[["group"]]]]), ] + # The group column is converted to factor + df[[g]] <- as.factor(df[[g]]) + } + } + # The performance stats are plotted + private$plot_stats(df, opts) + # The performance stats are returned + return(df) + }, + + #' @description + #' For each model it performs intrinsic and extrinsic + #' evaluation. It also measures the memory usage and time taken. The + #' performance metrics are displayed in 5 plots on one page. Performance + #' statistics are saved to the file pstats.RDS. 
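+        #' A minimal sketch, assuming a model has already been generated
+        #' under the model directory:
+        #' @examples
+        #' \dontrun{
+        #' me <- ModelEvaluator$new(model = 4, mdir = "./data/models")
+        #' me$performance_evaluation()
+        #' }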
+ performance_evaluation = function() { + # The performance stats + pstats <- list() + # The maximum model number + mmax <- self$model + # The y-axis values + y <- list("m" = NULL, "t" = NULL, "p" = NULL, "a" = NULL) + # For each model the performance metrics are measured + for (m in 2:mmax) { + # The transition probabilities data is initialized + self$tp <- NULL + # The model number is set + self$model <- m + # The intrinsic and extrinsic evaluation is performed + time_taken <- system.time({ + memory_used <- mem_change({ + # intrinsic evaluation is performed + istats <- self$intrinsic_evaluation() + # Extrinsic evaluation is performed + estats <- self$extrinsic_evaluation() + }) + }) + # The memory used + memory_used <- object_size(self$tp) + # The performance stats are updated + pstats[[m-1]] <- list( + "intrinsic" = istats, + "extrinsic" = estats, + "memory" = memory_used, + "time" = time_taken[[3]] + ) + # The y-axis values are updated + y[["m"]] <- c(y[["m"]], self$format_size(memory_used)) + y[["t"]] <- c(y[["t"]], time_taken[[3]]) + y[["p"]] <- c(y[["p"]], istats$mean) + y[["a"]] <- c(y[["a"]], estats$valid_perc) + } + + # The information message is shown + self$display_msg("Saving model performance stats...", 1) + # The performance stats file name + fn <- paste0(self$mdir, "/pstats.RDS") + # The performance stats are saved + self$save_obj(pstats, fn) + # The data to be plotted + df <- data.frame( + "n" = 2:(length(pstats)+1), + "m" = y[["m"]], + "t" = y[["t"]], + "p" = y[["p"]], + "a" = y[["a"]] + ) + # The options for plotting + opts <- list("type" = "basic") + # The performance stats are plotted + private$plot_stats(df, opts) + }, + + #' @description + #' It loads the given model configuration file. + #' @param fn The config file name. + load_config = function(fn) { + # The model configuration is read + config <- self$read_obj(fn) + # The configuration is copied to the object attributes + self$model <- config$model + self$ssize <- config$ssize + self$dc_opts <- config$dc_opts + self$ddir <- config$ddir + self$mdir <- config$mdir + }, + + #' @description + #' It loads the model located at the mdir location + load_model = function() { + # The configuration file name + fn <- paste0(self$mdir, "/config.RDS") + # The model configuration is loaded to the current object + self$load_config(fn) + # The tp data for the specified ngrams is loaded + private$read_tp_data(self$model) + }, + + #' @description + #' It saves the model configuration to the models + #' subdirectory of the given directory. + save_config = function() { + # The information message is shown + self$display_msg("Saving model configuration...", 1) + # The model configuration + config <- list( + "model" = self$model, + "ssize" = self$ssize, + "ddir" = self$ddir, + "mdir" = self$mdir, + "dc_opts" = self$dc_opts, + "tg_opts" = self$tg_opts + ) + # The configuration file + fn <- paste0(self$mdir, "/config.RDS") + # The configuration is saved + self$save_obj(config, fn) + }, + + #' @description + #' It generates the model for the given ngram number, + #' data sample size, data cleaning options and input file. 
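+        #' A hedged end-to-end sketch; the directories and the sample size
+        #' are illustrative assumptions.
+        #' @examples
+        #' \dontrun{
+        #' me <- ModelEvaluator$new(model = 4, ssize = 30,
+        #'     ddir = "./data", mdir = "./data/models", verbose = 1)
+        #' me$generate_model()
+        #' }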
+ generate_model = function() { + # The data analyzer object is created + da <- DataAnalyzer$new( + paste0(self$ddir, "/train.txt"), self$verbose) + # The training, testing and validation data sets are generated + da$generate_data(self$ddir, + list(train = .8, test = .1, validate = .1)) + # The object size is formatted + obj_size <- file.size(paste0(self$ddir, "/train.txt"))/10^6 + # The proportion of data to sample + prop <- (self$ssize/obj_size) + # Random sample is taken and cleaned + self$generate_clean_sample(prop, T, 'tr') + # The model directory is set + self$tg_opts$dir <- self$mdir + # For each ngram number, the ngram token file is generated + for (i in 1:self$model) { + # The clean train data file name + fn <- paste0(self$mdir, "/train-clean.txt") + # The ngram number is set + self$tg_opts$n <- i + # The TokenGenerator object is created + tg <- TokenGenerator$new(fn, self$tg_opts, self$verbose) + # The ngram tokens are generated + tg$generate_tokens() + } + # For each ngram number, the transition probabilities are generated + for (i in 2:self$model) { + # The TPGenerator object is created + tp <- TPGenerator$new() + tp_opts <- list( + "n_range" = 1:i, + "save_tp" = T, + "format" = "obj", + "dir" = self$mdir + ) + # The transition probabilities are generated + tp$generate_tp(tp_opts) + } + # The model config is saved + self$save_config() + }, + + #' @description + #' Evaluates the model using intrinsic evaluation based on + #' Perplexity. First a validation data set containing 1000 lines is + #' generated. It is then cleaned. 20 random sentences are taken. For + #' each sentence, Perplexity of all the words is calculated. + #' @return The minumum, maximum and mean Perplexity score. + intrinsic_evaluation = function() { + # The transition probabilities data is read + private$read_tp_data(self$model) + # Random sample is taken and cleaned + self$generate_clean_sample(1000, T, 'va') + # The validation sample data is read + data <- self$read_file( + paste0(self$mdir,"/validate-clean.txt"), F) + # A sample of 20 sentences is taken + data <- data[1:20] + # The information message + msg <- paste0( + "Calculating Perplexity for ", length(data), " sentences") + # The information message is shown + self$display_msg(msg, 1) + # The list of perplexities + pl <- c(); + # The loop counter + c <- 1 + # The Perplexity of each sentence in the test data is calculated + for (line in data) { + # The line is split on space + words <- str_split(line, " ")[[1]] + # The perplexity for the line is calculated + p <- self$calc_perplexity(words) + # The information message + msg <- paste0( + "Perplexity of the sentence '", + line, "' is: ", p) + # The information message is shown + self$display_msg(msg, 2) + # The list of perplexities is updated + pl <- c(pl, p); + # If the counter is divisible by 10 + if (c %% 10 == 0) { + # The information message + msg <- paste0(c, " lines have been processed") + # The information message is shown + self$display_msg(msg, 1) + } + # The counter is increased by 1 + c <- c + 1 + } + # The perplexity stats + stats <- list( + "min" = min(pl), + "max" = max(pl), + "mean" = mean(pl)); + + return(stats); + }, + + #' @description + #' Evaluates the model using extrinsic evaluation based on + #' Accuracy. First a validation data set containing 1000 lines is + #' generated. It is then cleaned. 20 random sentences are taken. For + #' each sentence, the model is used to predict the next word. The + #' accuracy stats are returned. 
A prediction is considered to be correct + #' if one of the predicted words matched the actual word. + #' @return The number of correct and incorrect predictions. + extrinsic_evaluation = function() { + # The transition probabilities data is read + private$read_tp_data(self$model) + # Random sample is taken and cleaned + self$generate_clean_sample(1000, T, 'va') + # The validation sample data is read + data <- self$read_file( + paste0(self$mdir,"/validate-clean.txt"), F) + # A Random sample of 100 sentences is taken + data <- data[1:100] + # The information message + msg <- paste0( + "Predicting the next word for ", length(data), " sentences") + # The information message is shown + self$display_msg(msg, 1) + # The statistics + stats <- list("valid" = 0, "invalid" = 0) + # The loop counter + c <- 1 + # The last word for each sentence is predicted + for (line in data) { + # The line is split on space + words <- str_split(line, " ")[[1]] + # The word to predict + w <- words[length(words)] + # The previous words used to predict the word + pw <- words[1:length(words)-1] + # If the words should be stemmed + if (self$tg_opts[["stem_words"]]) { + # The previous words are stemmed + pw <- wordStem(pw) + } + # The next word is predicted + res <- self$predict_word(pw, F) + # If the predicted word matches the actual word + if (w %in% res["words"]) { + stats[["valid"]] <- stats[["valid"]] + 1; + # The information message + self$display_msg( + paste0("The word: ", w, " was predicted"), 3) + } + # If the predicted word does not match + else { + stats[["invalid"]] <- stats[["invalid"]] + 1; + # The information message + self$display_msg( + paste0("The word: ", w, " could not be predicted"), 3) + } + # The counter is increased by 1 + c <- c + 1 + # If the counter is divisible by 10 + if (c %% 10 == 0) { + # The information message + msg <- paste0(c, " sentences have been processed") + # The information message is shown + self$display_msg(msg, 1) + } + } + + # The precentage of valid + stats[["valid_perc"]] <- + (stats[["valid"]]/(stats[["valid"]] + stats[["invalid"]]))*100 + # The precentage of invalid + stats[["invalid_perc"]] <- 100-stats[["valid_perc"]] + + return(stats) + }, + + #' @description + #' Predicts the new word given a list of 1, 2 or 3 previous + #' words. It checks the given n words in the transition probabilities + #' data. If there is a match, the top 3 next words with highest + #' probabilities are returned. If there is no match, then the last n-1 + #' previous words are checked. This process is continued until the last + #' word is checked. If there is no match, then empty result is returned. + #' @param words A character vector of previous words or a single vector + #' containing the previous word text. + #' @param count The number of results to return. + #' @param dc A DataCleaner object. If it is given, then the given words + #' are cleaned. If the stem_words option was set in the TokenGenerator + #' object configuration for the current model, then the words are + #' converted to their stems. + #' @return The top 3 predicted words along with their probabilities. 
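+        #' An illustrative sketch; the previous-word text is an assumption.
+        #' @examples
+        #' \dontrun{
+        #' me <- ModelEvaluator$new(mdir = "./data/models")
+        #' dc <- DataCleaner$new()
+        #' res <- me$predict_word("how are", count = 3, dc = dc)
+        #' # The predicted words are printed if a match was found
+        #' if (res[["found"]]) print(res[["words"]])
+        #' }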
+ predict_word = function(words, count = 3, dc = NULL) { + # The transition probabilities data is read + private$read_tp_data(self$model) + # The words are assigned to temp variable + w <- words + # If the DataCleaner obj was specified + if (!is.null(dc)) { + # If the words is a set of vectors + if (length(w) > 1) { + # The words are converted to a single line of text + w <- paste0(w, collapse = " ") + } + # The words are cleaned + w <- dc$clean_lines(w) + # If the words should be stemmed + if (self$tg_opts[["stem_words"]]) { + # The previous words are stemmed + w <- wordStem(w) + } + } + + # If the words are in the form of a line + if (length(w) == 1) { + # The words are split on space + w <- strsplit(w, " ")[[1]] + } + + # The length of previous words + pwl <- length(w) + # The loop counter + c <- 1 + # Indicates if the word was found + found <- FALSE + # The result + result <- list("found" = F, "words" = "", "probs" = "") + # If the previous words length is 0 + if (pwl == 0) + return(result) + # The last 3 words are extracted. + # If the previous word length is more than 3 + if (pwl > 3) { + pw <- w[(pwl-2):pwl] + } + else { + pw <- w + } + # The length of previous words + pwl <- length(pw) + # Each ngram in the previous word list is checked starting from + # largest ngram + for (i in pwl:1) { + # The previous words to check + tpw <- pw[c:pwl] + # The key to use for the transition probabilities data + k <- paste(tpw, collapse = "_") + # The key is converted to a numeric hash + h <- digest2int(k) + # The transition probabilities data is checked + res <- self$tp[self$tp$pre == h, ] + # If the prefix was found + if (nrow(res) > 0) { + # The word was found + found <- TRUE + # The result is sorted by probability + sres <- res[order(res$prob, decreasing = T),] + # The number of rows in the result set + rcount <- nrow(sres) + # If the number of results is more than the required number + # of results + if (rcount > count) { + # The result count is set to the required number of + # results + rc <- count + } + else { + # The result count is set to the number of results + rc <- rcount + } + # The required word probabilities + probs <- sres$prob[1:rc] + # The next words indexes + ind <- sres$nw[1:rc] + # The required words + nw <- as.character(self$wl$pre[ind]) + + # The result is updated + result[["words"]] <- nw + result[["probs"]] <- probs + result[["found"]] <- T + # The information message + msg <- paste0("The ngram key: ", k, " was found") + # Information message is shown + self$display_msg(msg, 3) + # The loop ends + break; + } + else { + # The information message + msg <- paste0("The ngram key: ", k, " was not found") + # Information message is shown + self$display_msg(msg, 3) + # The result is updated + result[["found"]] <- F + } + # The information message + msg <- paste0("Backing off to ", (i) ,"-gram") + # Information message is shown + self$display_msg(msg, 3); + # The counter is increased by 1 + c <- c + 1; + } + + return(result); + }, + + #' @description + #' Calculates the probability of the given word given the + #' previous model-1 words, where model is the maximum ngram number. It + #' looks up the probability of a word given n previous words. The + #' previous n words are converted to numeric hash using digest2int + #' function. The hash is looked up in a data frame of transition + #' probabilities. The word is converted to a number by checking its + #' position in a list of unique words. 
If the hash and the word position + #' were found, then the probability of the previous word and hash is + #' returned. If it was not found, then the hash of the n-1 previous + #' words is taken and the processed is repeated. If the data was not + #' found in the data frame, then the word probability is returned. This + #' is known as backoff. If the word probability could not be found then + #' the default probability is returned. The default probability is + #' calculated as 1/(N+V), Where N = number of words in corpus and V is + #' the number of dictionary words. + #' @param word The word whoose probability is to be calculated. + #' @param pw The previous words. + #' @return The probability of the word given the previous words. + get_word_prob = function(word, pw) { + # If the default probability is not set, then an error is raised + if (is.null(self$dp)) + stop("The default probability is not set !") + # The length of previous words + pwl <- length(pw) + # The probability of the word given the previous words. It is + # initialized to the default probability, which should be 1/(N+V) + prob <- self$dp + # The loop counter + c <- 1 + # Indicates if the word was found + found <- FALSE + # The next word id + nw <- match(word, self$wl$pre) + # If the next word was not found + if (is.na(nw)) { + # The information message + msg <- paste0( + "The next word: ", word, " was not found") + # Information message is shown + self$display_msg(msg, 3) + } + # If the previous word count is more than 0 + else if (pwl > 0) { + # The previous words are checked + for (i in pwl:1) { + # The previous words to check + tpw <- pw[c:pwl] + # The key to use for the transition matrix + k <- paste(tpw, collapse = "_") + # The key is converted to a numeric hash + h <- digest2int(k) + # The transition probabilities data is checked + res <- self$tp[self$tp$pre == h & self$tp$nw == nw, ] + # If the prefix was found + if (nrow(res) > 0) { + # The word was found + found <- TRUE + # The probability is set + prob <- as.numeric(res$prob) + # The information message + msg <- paste0("The ngram key: ", + k, " and the next word: ", + word, " were found") + # Information message is shown + self$display_msg(msg, 3) + # The loop ends + break + } + else { + # The information message + msg <- paste0("The ngram key: ", + k, " and the next word: ", + word, " were not found") + # Information message is shown + self$display_msg(msg, 3) + } + # The information message + msg <- paste0("Backing off to ", (i) ,"-gram") + # Information message is shown + self$display_msg(msg, 3) + # The counter is increased by 1 + c <- c + 1 + } + } + # If the word was not found then the probability of the word is + # checked in the n1-gram + if (!found) { + # If the word was not found + if (sum(self$wl$pre == word) == 0) { + # Information message is shown + self$display_msg("Using default probability", 3) + } + else { + # The word probability + prob <- as.numeric(self$wl[self$wl$pre == word, "prob"]) + } + } + + return(prob) + }, + + #' @description + #' The Perplexity for the given sentence is calculated. For + #' each word, the probability of the word given the previous words is + #' calculated. The probabilities are multiplied and then inverted. The + #' nth root of the result is the perplexity, where n is the number of + #' words in the sentence. If the stem_words tokenization option was + #' specified, then the previous words are converted to their stem. + #' @param words The list of words. + #' @return The perplexity of the given list of words. 
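+        #' A small sketch, assuming the model files exist and have been
+        #' loaded:
+        #' @examples
+        #' \dontrun{
+        #' me <- ModelEvaluator$new(mdir = "./data/models")
+        #' me$load_model()
+        #' p <- me$calc_perplexity(c("how", "are", "you"))
+        #' }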
+ calc_perplexity = function(words) { + # The number of words in the sentence + wl <- length(words) + # The product of the word probabilities + prob_prod <- 1 + # For each word, the probability of the word is calculated + for (i in 1:wl) { + # The word + word <- words[i] + # The list of previous words + pw <- NULL + # If i is more than 1 + if (i > 1) { + # The start index + start <- 1 + # If i > self$model + if (i > self$model) start <- i-(self$model-1) + # The list of previous words + pw <- words[start:(i-1)] + # If the words should be stemmed + if (self$tg_opts[["stem_words"]]) { + # The previous words are stemmed + pw <- wordStem(pw) + } + } + # The word probability + prob <- self$get_word_prob(word, pw) + # The probability product is updated + prob_prod <- prob_prod * prob + } + # The nth root of the inverse of the probability product is taken + p <- round((1/prob_prod)^(1/wl), 0) + + return(p) + }, + + #' @description + #' Generates a sample file of given size from the main + #' train.txt file file name. The file is cleaned and saved. + #' @param ss The number of lines or proportion of lines to sample. + #' @param ic If the sample file should be cleaned. + #' @param t The type of sample. It can be: + #' 'tr' -> training + #' 'te' -> testing + #' 'va' -> validation + generate_clean_sample = function(ss, ic, t) { + # If the type is 'tr' + if (t == 'tr') sfn <- 'train' + # If the type is 'te' + else if (t == 'te') sfn <- 'test' + # If the type is 'va' + else if (t == 'va') sfn <- 'validate' + # The sample file name + fn <- paste0(self$ddir, "/", sfn, ".txt") + # The sample file name + sf <- paste0(self$mdir, "/", sfn, ".txt") + # The clean sample file name + csf <- paste0(self$mdir, "/", sfn, "-clean.txt") + # If the cleaned sample file already exists + if (file.exists(csf)) { + # The information message + msg <- paste0("The cleaned sample file: ", csf, + " already exists") + # Information message is shown + self$display_msg(msg, 2) + } + else { + # If the sample file does not exist + if (!file.exists(sf)) { + # The information message + msg <- paste0("Generating sample file from the file: ", fn) + # Information message is shown + self$display_msg(msg, 2) + # The input file is read + data <- self$read_file(fn, F) + # If the sample size is less than 1 + if (ss < 1) { + # The number of lines in the main file + lc <- length(data) + # The number of lines in the sample file + lc <- round(lc*ss) + } + else { + lc <- ss + } + # The sample file data + data <- data[1:lc] + # The sample file data is saved + self$write_file(data, sf, F) + } + # If the sample file should be cleaned + if (ic) { + # The options for cleaning the data + opts <- self$dc_opts + # The line count is set to 5000 + opts[["line_count"]] <- 5000 + # The output file name + opts[["output_file"]] <- csf + # The data cleaner object is created + dc <- DataCleaner$new(sf, opts, verbose = self$verbose) + # The sample file is cleaned + dc$clean_file() + } + } + } + ), + + private = list( + + # @description It creates a single plot based on ggplot2. Depending on + # the opt parameter, the plot may contain groups or it may be a simple + # plot. + # @param data A data frame containing the data to be plotted. It should + # have 2 variables, x and y. + # @param opts The options for plotting the data. It contains: + # 'x_lab' -> The x-axis label. + # 'y_lab' -> The y-axis label. + # 'type' -> The type of plot. It can be 'basic' or 'grouped'. + # 'group' -> The field to group by. + # @return A ggplot object representing the plot. 
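+        # For illustration, the expected input is a data frame with numeric
+        # x and y columns, e.g. (assumed values):
+        #   df <- data.frame(x = 2:4, y = c(10, 20, 30))
+        #   p <- private$plot_graph(df, list("x_lab" = "ngram",
+        #       "y_lab" = "time", "type" = "basic"))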
+ plot_graph = function(data, opts) { + # y-max + y_max <- max(as.numeric(as.character(data$y))) + # If the type is basic + if (opts[['type']] == 'basic') { + # The graph is plotted + p <- ggplot(data, aes(x, y)) + geom_point() + + geom_smooth(method='lm', formula= y~x) + + labs(x = opts[["x_lab"]], y = opts[["y_lab"]]) + + xlim(min(data$x), max(data$x)) + + coord_cartesian(ylim = c(0,y_max)) + } + # If the type is grouped + else if (opts[['type']] == 'grouped') { + # The data is duplicated to prevent the warnings + data <- rbind(data, data) + # The group field data + g <- opts$group + # The graph is plotted + p <- ggplot( + data, + aes_string("x", "y", group = g, col = g)) + + geom_point() + + geom_smooth(method = "loess", formula = y ~ x, span = 1.4, se = FALSE) + + labs(x = opts[["x_lab"]], y = opts[["y_lab"]]) + + xlim(min(data$x), max(data$x)) + + coord_cartesian(ylim = c(0,y_max)) + } + return(p) + }, + + # @description + # It plots the given model performance stats. The stats + # are plotted on one page. 5 stats are plotted. The first 4 plots are + # of memory, time taken, mean Perplexity and accuracy. The fifth plot + # is of Perplexity vs accuracy. + # @param data The data to plot. + # @param opts The options for plotting the data. It contains: + # 'type' -> The type of plot. It can be 'basic' or 'grouped'. + # 'group' -> The field to group by. + # 'title' -> The main plot title. + # 'subtitle' -> The plot sub title. + plot_stats = function(data, opts) { + # The information message is shown + self$display_msg("Plotting model performance stats...", 1) + # If the type is basic + if (opts[["type"]] == 'basic') { + # The data frames + df1 <- data.frame(x = data$n, y = data$m) + df2 <- data.frame(x = data$n, y = data$t) + df3 <- data.frame(x = data$n, y = data$p) + df4 <- data.frame(x = data$n, y = data$a) + df5 <- data.frame(x = data$a, y = data$p) + # The options for plot 1 + popts <- list("x_lab" = "ngram", + "y_lab" = "memory", + "type" = opts[["type"]]) + # Plot 1 + p1 <- private$plot_graph(df1, popts) + # The options for plot 2 + popts <- list("x_lab" = "ngram", + "y_lab" = "time", + "type" = opts[["type"]]) + # Plot 2 + p2 <- private$plot_graph(df2, popts) + # The options for plot 3 + popts <- list("x_lab" = "ngram", + "y_lab" = "perplexity", + "type" = opts[["type"]]) + # Plot 3 + p3 <- private$plot_graph(df3, popts) + # The options for plot 4 + popts <- list("x_lab" = "ngram", + "y_lab" = "accuracy", + "type" = opts[["type"]]) + # Plot 4 + p4 <- private$plot_graph(df4, popts) + # The options for plot 5 + popts <- list("x_lab" = "accuracy", + "y_lab" = "perplexity", + "type" = "basic") + # Plot 5 + p5 <- private$plot_graph(df5, popts) + # The plots are displayed on a single page + patchwork <- p1 + p2 + p3 + p4 + p5 + # Main title is added + print(patchwork + plot_annotation( + "title" = opts[["title"]], + "subtitle" = opts[["subtitle"]])) + } + # If the type is grouped + else if (opts[["type"]] == 'grouped') { + # The group name + g <- opts[["group"]] + # The data frames + df1 <- data.frame(data$n, data$m, data[[g]]) + df2 <- data.frame(data$n, data$t, data[[g]]) + df3 <- data.frame(data$n, data$p, data[[g]]) + df4 <- data.frame(data$n, data$a, data[[g]]) + # The column names + n <- c("x", "y", g) + # The column names are set + names(df1) <- n + names(df2) <- n + names(df3) <- n + names(df4) <- n + # The options for plot 1 + popts <- list("x_lab" = "ngram", + "y_lab" = "memory", + "group" = opts[["group"]], + "type" = opts[["type"]]) + # Plot 1 + p1 <- private$plot_graph(df1, 
popts) + # The options for plot 2 + popts <- list("x_lab" = "ngram", + "y_lab" = "time", + "group" = opts[["group"]], + "type" = opts[["type"]]) + # Plot 2 + p2 <- private$plot_graph(df2, popts) + # The options for plot 3 + popts <- list("x_lab" = "ngram", + "y_lab" = "perplexity", + "group" = opts[["group"]], + "type" = opts[["type"]]) + # Plot 3 + p3 <- private$plot_graph(df3, popts) + # The options for plot 4 + popts <- list("x_lab" = "ngram", + "y_lab" = "accuracy", + "group" = opts[["group"]], + "type" = opts[["type"]]) + # Plot 4 + p4 <- private$plot_graph(df4, popts) + # The plots are displayed on a single page + patchwork <- p1 + p2 + p3 + p4 + # Main title is added + # Main title is added + print(patchwork + plot_annotation( + "title" = opts[["title"]], + "subtitle" = opts[["subtitle"]])) + } + }, + + # @description + # Reads the model file and sets the current objects attributes. + # @param model The model number. It is the maximum ngrams supported by + # the model. + read_tp_data = function(model) { + # If the model has already been loaded then function returns + if (!is.null(self$tp)) return() + # The information message + msg <- paste0("Loading model ", model, "...") + # The information message is shown + self$display_msg(msg, 1) + # The model config file name + fn <- paste0(self$mdir, "/config.RDS") + # The config file is loaded + self$load_config(fn) + # The model file name + fn <- paste0(self$mdir, "/model-", model, ".RDS") + # The model is read to a data frame + self$tp <- self$read_obj(fn) + # The words file name + fn <- paste0(self$mdir, "/words.RDS") + # The list of words is read + self$wl <- self$read_obj(fn) + # The dictionary file name + fn <- self$dc_opts[["dict_file"]] + # The file contents + dict <- self$read_file(fn, F) + # The information message is shown + self$display_msg("Calculating default probability...", 1) + # The number of words in the dictionary file. It is used to + # calculate Perplexity. + vc <- length(dict) + # The cleaned training file is read + fn <- paste0(self$mdir, "/train-clean.txt") + # The model is read to a data frame + data <- self$read_file(fn, F) + # The words are split on " " + w <- strsplit(data, " ") + # The words are converted to atomic list + w <- unlist(w) + # The number of words + n <- length(w) + # The default probability is set + self$dp <- 1/(n + vc) + }, + + # @description + # It returns the average value for the required stats, for + # the given performance stats. + # @param pstats The performance stats. + # @param type The type of stats. It can be: + # 'memory' -> The mean memory used. + # 'time_taken' -> The mean time taken by the intrinsic and + # extrinsic evaluation tests. + # 'perplexity' -> The mean Perplexity score. + # 'accuracy' -> The mean accuracy score. + # @param nmax -> The maximum ngram size. 
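+        # For illustration (assumed inputs): with pstats as saved by
+        # performance_evaluation and config as saved by save_config, the
+        # mean time taken per ngram model can be extracted as:
+        #   t <- private$process_stats(pstats, "time", config$model)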
+ process_stats = function(pstats, type, nmax) { + # The stats + s <- NULL + # The average memory for each ngram model + for (n in 1:(nmax-1)) { + # If the stats is memory + if (type == "memory") { + # The stats value for the current ngram number + v <- pstats[[n]][["memory"]] + # The stats are formated + v <- self$format_size(v) + } + # If the stats is perplexity + else if (type == "perplexity") { + # The stats value for the current ngram number + v <- pstats[[n]][["intrinsic"]][["mean"]] + } + # If the stats is accuracy + else if (type == "accuracy") { + # The stats value for the current ngram number + v <- pstats[[n]][["extrinsic"]][["valid_perc"]] + } + # If the stats is time + else if (type == "time") { + # The stats value for the current ngram number + v <- pstats[[n]][["time"]] + } + # The stats are updated + s <- c(s, v) + } + + return(s) + } + ) +) diff --git a/R/text-file-processor.R b/R/text-file-processor.R new file mode 100644 index 0000000..e47e187 --- /dev/null +++ b/R/text-file-processor.R @@ -0,0 +1,339 @@ +#' Base class for text file processing +#' +#' @description +#' Provides basic structure for processing text files. +#' +#' @details +#' It provides pre-processing, processing and post-processing functions, which +#' need to be overridden by derived classes. The pre-processing function is +#' called before reading a file. The process function is called for processing a +#' line. The post processing function is called on the processed data. It also +#' provides a method for generating a sample file from an input text file +TextFileProcessor <- R6::R6Class( + "TextFileProcessor", + public = list( + #' @field opts The list of file processing options. + #' save_data -> If the combined processed lines should be saved. + #' output_file -> Name of the output file used to store the data. + opts = list( + "save_data" = F, + "output_file" = "./data/sample-clean.txt" + ), + + #' @field line_count The number of lines to read and process at a time. + line_count = 100, + + #' @field pp_output The output of the pre-processing step + pp_output = NULL, + + #' @field p_output The output of the processing step + p_output = NULL, + + #' @field file_name The name of the text file to process. + file_name = "./data/sample.txt", + + #' @field verbose Indicates if progress data should be printed. + verbose = 0, + + #' @description + #' It initializes the current object. It is used to set the file name + #' and verbose options. + #' @param file_name The path to the file to clean. + #' @param line_count The number of lines to read and clean at a time. + #' @param verbose Indicates if progress information should be displayed. + initialize = function(file_name = "./data/sample.txt", + line_count = 100, + verbose = 2) { + # If the given file name is not NULL and is not valid + if (!is.null(file_name) && !file.exists(file_name)) + stop("The given file name is not valid") + + # The base class attributes are set + # The file name is set + self$file_name <- file_name + # The verbose option is set + self$verbose <- verbose + # The line count is set + self$line_count <- line_count + # The processed output is set + self$p_output <- NULL + }, + + #' @description + #' Reads the given file one line at a time. It runs the given + #' pre-processing function before reading the file. It runs the given + #' line processing function for each line. It optionally saves the + #' output of line processing after reading the file or after processing + #' certain number of lines. 
+        #' @param pre_process The pre-processing function.
+        #' @param process The function used to process each line.
+        #' @param post_process The function used to perform post-processing.
+        #' @return The combined processed data.
+        process_file = function(pre_process, process, post_process) {
+            # Pre-processing is done
+            pre_process()
+            # The file is opened
+            private$con <- file(self$file_name)
+            # The connection is opened for reading
+            open(private$con)
+            # The lines to be read
+            lines <- c()
+            # The loop counter
+            c <- 0
+            # Indicates that data should not be appended
+            is_app <- F
+            # The output file name
+            of <- self$opts[["output_file"]]
+            # All lines are read
+            while (TRUE) {
+                # The lines are read
+                lines <- readLines(private$con, n = self$line_count,
+                                   skipNul = TRUE)
+                # If all the lines have been read
+                if (length(lines) == 0) break
+                # The lines are processed
+                p_lines <- process(lines)
+                # If the data should be saved
+                if (self$opts[["save_data"]] && !is.null(p_lines)) {
+                    # The cleaned data is written to file
+                    self$write_file(p_lines, of, is_app)
+                    # Debug message
+                    self$display_msg(
+                        paste(length(p_lines), "lines were written"), 1)
+                    # Indicates that data should be appended
+                    is_app <- T
+                }
+                # The loop counter is increased by 1
+                c <- c + 1
+                # Debug message
+                self$display_msg(
+                    paste(self$line_count*c, "lines have been processed"), 1)
+            }
+            # The file connection is closed
+            close(private$con)
+            # Post-processing is performed
+            post_process()
+        },
+
+        #' @description
+        #' Reads the given file and returns its contents.
+        #' @param file_name The name of the file to read.
+        #' @param is_csv If the data is a csv file.
+        #' @return The file data.
+        read_file = function(file_name, is_csv) {
+            # The information message
+            msg <- paste0("Reading file: ", file_name)
+            # Information message is shown
+            self$display_msg(msg, 1)
+            # If the file is not a csv file
+            if (!is_csv) {
+                # File is opened for reading
+                con <- file(file_name)
+                # The file contents are read
+                data <- readLines(con, skipNul = TRUE)
+                # The file connection is closed
+                close(con)
+            }
+            else {
+                data <- read.csv(file_name)
+            }
+            # The data is returned
+            return(data)
+        },
+
+        #' @description
+        #' Reads the given number of lines from the given file.
+        #' @param file_name The name of the file to read.
+        #' @param line_count The number of lines to read.
+        #' @return The file data.
+        read_lines = function(file_name, line_count) {
+            # The information message
+            msg <- paste0("Reading file: ", file_name)
+            # Information message is shown
+            self$display_msg(msg, 1)
+            # File is opened for reading
+            con <- file(file_name)
+            # The file contents are read
+            data <- readLines(con, n = line_count, skipNul = TRUE)
+            # The file connection is closed
+            close(con)
+            # The data is returned
+            return(data)
+        },
+
+        #' @description
+        #' Writes the given data to the given file. The data may be appended
+        #' to an existing file.
+        #' @param data The data to be written.
+        #' @param file_name The name of the file.
+        #' @param is_append Indicates if the data should be appended to the
+        #'   file.
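+        #' @examples
+        #' \dontrun{
+        #' # A minimal sketch (illustrative file names)
+        #' tfp <- TextFileProcessor$new("./data/sample.txt", 100, 0)
+        #' # A character vector is written to a file
+        #' tfp$write_file(c("first line"), "./data/out.txt", F)
+        #' # A second line is appended to the same file
+        #' tfp$write_file(c("second line"), "./data/out.txt", T)
+        #' }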
+        write_file = function(data, file_name, is_append) {
+            # The information message
+            msg <- paste0("Saving file: ", file_name)
+            # Information message is shown
+            self$display_msg(msg, 1)
+            # If the given data is a data frame
+            if ("data.frame" %in% class(data)) {
+                # The data frame is written to a file
+                write.csv(data, file_name, row.names = F)
+            }
+            else {
+                # The file open mode
+                mode <- "w"
+                # If the data should be appended
+                if (is_append) mode <- "a"
+                # The output file is opened for writing
+                con <- file(file_name, open = mode)
+                # The data is written to the output file
+                writeLines(data, con)
+                # The file connection is closed
+                close(con)
+            }
+        },
+
+        #' @description
+        #' Calculates the size of the given object or formats the given
+        #' bytes object as a number without units. The returned number is
+        #' the size in Mb.
+        #' @param obj An object of class bytes or an object whose size is to
+        #'   be found.
+        #' @return The size in Mb as a number.
+        format_size = function(obj) {
+            # If the obj is not of class bytes
+            if (!("bytes" %in% class(obj))) {
+                # The object size
+                obj_size <- object_size(obj)
+            }
+            else {
+                obj_size <- obj
+            }
+            # The bytes obj is formatted as a string
+            obj_size <- utils:::format.object_size(obj_size, units = "Mb")
+            # The Mb suffix is removed
+            obj_size <- sub(" Mb", "", obj_size)
+            # The object size is converted to a number
+            obj_size <- as.numeric(obj_size)
+
+            return(obj_size)
+        },
+
+        #' @description
+        #' Saves the given object as a file.
+        #' @param obj The object to save.
+        #' @param file_name The file name.
+        save_obj = function(obj, file_name) {
+            # The information message
+            msg <- paste0("Saving file: ", file_name)
+            # Information message is shown
+            self$display_msg(msg, 1)
+            # The object is saved to a file
+            saveRDS(obj, file_name)
+        },
+
+        #' @description
+        #' Reads the contents of the given file. Loads the file contents to
+        #' an R object.
+        #' @param file_name The file name.
+        #' @return The loaded R obj.
+        read_obj = function(file_name) {
+            # The information message
+            msg <- paste0("Reading file: ", file_name)
+            # Information message is shown
+            self$display_msg(msg, 1)
+            # If the file does not exist
+            if (!file.exists(file_name)) {
+                # The information message
+                msg <- paste0("The file: ", file_name, " cannot be read!")
+                # Program execution is stopped
+                stop(msg)
+            }
+            else {
+                # The object is read from the file
+                obj <- readRDS(file_name)
+            }
+
+            return(obj)
+        },
+
+        #' @description
+        #' Prints the given message depending on the verbose settings.
+        #' @param msg The message to be printed.
+        #' @param min_debug The minimum debugging level.
+        display_msg = function(msg, min_debug) {
+            # If verbose is >= min_debug, then the message is displayed
+            if (self$verbose >= min_debug) {
+                print(msg)
+            }
+        },
+
+        #' @description
+        #' Performs processing on the data. It should be overridden by a
+        #' derived class.
+        #' @param lines The lines to process.
+        process = function(lines) {
+
+        },
+
+        #' @description
+        #' Performs post-processing on the processed data. It should be
+        #' overridden by a derived class.
+        post_process = function() {
+
+        },
+
+        #' @description
+        #' Performs pre-processing before the file is read. It should be
+        #' overridden by a derived class.
+        pre_process = function() {
+            return(NULL)
+        }
+    ),
+    private = list(
+        # @field con The input file connection
+        con = NULL,
+
+        # @description
+        # Reads the contents of the given file. Loads the file contents to
+        # an R object, a data frame or a character vector.
+        # @param file_name The file name.
+        # @param format The file format. 'plain' or 'obj'.
+        # @param opts Options for reading the file.
+        # @return The required data.
+        read_data = function(file_name, format, opts) {
+            # If the format is plain
+            if (format == "plain") {
+                # The file is read
+                data <- self$read_file(file_name, opts)
+            }
+            # If the format is obj
+            else if (format == "obj") {
+                # The file is read
+                data <- self$read_obj(file_name)
+            }
+
+            return(data)
+        },
+
+        # @description
+        # Writes the given data to a file. The data may be an R object, a
+        # character vector or a data frame.
+        # @param data The data to be written.
+        # @param file_name The file name.
+        # @param format The file format. 'plain' or 'obj'.
+        # @param opts Options for writing to the file.
+        write_data = function(data, file_name, format, opts) {
+            # If the format is plain
+            if (format == "plain") {
+                # The data is written to a file
+                self$write_file(data, file_name, opts)
+            }
+            # If the format is obj
+            else if (format == "obj") {
+                # The R object is saved
+                self$save_obj(data, file_name)
+            }
+        }
+    )
+)
diff --git a/R/token-generator.R b/R/token-generator.R
new file mode 100644
index 0000000..6bd5a6a
--- /dev/null
+++ b/R/token-generator.R
@@ -0,0 +1,241 @@
+#' It generates ngrams of given size from an input text file
+#'
+#' @description
+#' It generates ngram tokens along with their frequencies.
+#'
+#' @details
+#' It provides a method for generating ngrams of a given size. It saves
+#' each ngram along with its frequency to a text file.
+TokenGenerator <- R6::R6Class(
+    "TokenGenerator",
+    inherit = TextFileProcessor,
+    public = list(
+        #' @field tg_opts The options for the token generator obj.
+        #'   n -> The ngram size.
+        #'   save_ngrams -> If the ngram data should be saved.
+        #'   min_freq -> All ngrams with frequency less than min_freq are
+        #'     ignored.
+        #'   line_count -> The number of lines to process at a time.
+        #'   stem_words -> If words should be converted to their stem.
+        #'   dir -> The dir where the output file should be saved.
+        #'   format -> The format for the output. There are two options.
+        #'     'plain' -> The data is stored in plain text.
+        #'     'obj' -> The data is stored as an R obj.
+        tg_opts = list(
+            "n" = 1,
+            "save_ngrams" = F,
+            "min_freq" = -1,
+            "line_count" = 5000,
+            "stem_words" = F,
+            "dir" = "./data/models",
+            "format" = "obj"
+        ),
+
+        #' @description
+        #' It initializes the current obj. It is used to set the file name
+        #' and verbose options.
+        #' @param file_name The path to the input file.
+        #' @param opts The options for generating the ngram tokens.
+        #'   n -> The ngram size.
+        #'   save_ngrams -> If the ngram data should be saved.
+        #'   min_freq -> All ngrams with frequency less than min_freq are
+        #'     ignored.
+        #'   line_count -> The number of lines to process at a time.
+        #'   stem_words -> If words should be converted to their stem.
+        #'   dir -> The dir where the output file should be saved.
+        #'   format -> The format for the output. There are two options.
+        #'     'plain' -> The data is stored in plain text.
+        #'     'obj' -> The data is stored as an R obj.
+        #' @param verbose Indicates if progress information should be
+        #'   displayed.
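+        #' @examples
+        #' \dontrun{
+        #' # A minimal sketch (illustrative): bigram tokens are generated
+        #' # from a cleaned input file and saved to the models dir
+        #' tg <- TokenGenerator$new("./data/models/validate-clean.txt",
+        #'     opts = list("n" = 2, "save_ngrams" = T), verbose = 1)
+        #' tg$generate_tokens()
+        #' }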
+        initialize = function(file_name = "./data/models/validate-clean.txt",
+                              opts = self$tg_opts,
+                              verbose = 0) {
+            # The given options are merged with the tg_opts attribute
+            self$tg_opts <- modifyList(self$tg_opts, opts)
+            # The tg_opts is merged with the base class opts attribute
+            self$opts <- modifyList(self$opts, self$tg_opts)
+            # The base class is initialized
+            super$initialize(file_name, self$opts[["line_count"]], verbose)
+            # The processed output is initialized
+            self$p_output <- NULL
+        },
+
+        #' @description
+        #' It generates ngram tokens and their frequencies from the given
+        #' file. The tokens may be saved to a text file as plain text or as
+        #' an R obj.
+        #' @return The ngram tokens along with their frequencies, in the
+        #'   p_output attribute.
+        generate_tokens = function() {
+            # The processed output is initialized
+            self$p_output <- NULL
+            # The output file name
+            fn <- private$get_file_name()
+            # If the output file already exists
+            if (file.exists(fn)) {
+                # The information message
+                msg <- paste0("The ", self$opts[["n"]],
+                              "-gram file already exists")
+                # The information message is shown
+                self$display_msg(msg, 1)
+                # If the ngram data should not be saved
+                if (!self$opts[["save_ngrams"]]) {
+                    # The ngrams file is read. (read_data is a private
+                    # method, so it is accessed through private)
+                    self$p_output <- private$read_data(
+                        fn, self$opts[["format"]], T)
+                }
+            }
+            else {
+                # The information message
+                msg <- paste0("Generating ",
+                              self$opts[["n"]], "-gram tokens...")
+                # The information message is shown
+                self$display_msg(msg, 1)
+                # The base class process_file function is called
+                super$process_file(super$pre_process, private$process,
+                                   private$post_process)
+            }
+        }
+    ),
+
+    private = list(
+        # @description
+        # Performs processing for the \code{generate_tokens} function. It
+        # processes the given lines of text. It converts each line of text
+        # into ngrams of the given size. The frequency of each ngram is
+        # updated.
+        # @param lines The lines of text.
+        process = function(lines) {
+            # Ngrams are extracted from each line
+            ngrams <- private$generate_ngrams(lines)
+            # If the processed output is empty
+            if (is.null(self$p_output)) {
+                # The ngram words are set to the processed output
+                self$p_output <- ngrams
+            }
+            else {
+                # The ngram words are appended to the processed output
+                self$p_output <- c(self$p_output, ngrams)
+            }
+        },
+
+        # @description
+        # It returns the name of the output ngram file.
+        get_file_name = function() {
+            # The ngram number
+            n <- self$opts[["n"]]
+            # The format
+            fo <- self$opts[["format"]]
+            # The output directory
+            dir <- self$opts[["dir"]]
+            # The file extension
+            if (fo == "plain") ext <- ".txt"
+            else ext <- ".RDS"
+
+            # The file name
+            file_name <- paste0(dir, "/n", n, ext)
+
+            return(file_name)
+        },
+
+        # @description
+        # It saves the ngram tokens and their frequencies to a text file.
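+        # @examples
+        # # e.g. if p_output contains c("a_b", "a_b", "b_c"), then the
+        # # grouping below produces the data frame:
+        # #     pre  freq
+        # #     a_b     2
+        # #     b_c     1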
+        post_process = function() {
+            # The information message
+            msg <- paste0("Calculating ",
+                          self$opts[["n"]], "-gram frequencies...")
+            # The information message is shown
+            self$display_msg(msg, 1)
+            # The output is copied to a data frame
+            df <- data.frame("pre" = self$p_output)
+            # A frequency column is added
+            df$freq <- 1
+            # Each prefix is grouped and its frequencies are summed
+            df <- df %>% group_by(pre) %>% summarize_all(sum)
+            # If the minimum ngram frequency is given
+            if (self$opts[["min_freq"]] > -1) {
+                # The information message
+                msg <- paste0("Removing low frequency ngrams...")
+                # The information message is shown
+                self$display_msg(msg, 2)
+                # All ngrams with frequency less than min_freq are ignored
+                df <- df[df$freq >= self$opts[["min_freq"]], ]
+            }
+            # The column names are set
+            colnames(df) <- c("pre", "freq")
+            # The output is set to the updated variable
+            self$p_output <- df
+            # If the ngram data should be saved
+            if (self$opts[["save_ngrams"]]) {
+                # The required file name
+                fn <- private$get_file_name()
+                # The format
+                fo <- self$opts[["format"]]
+                # The n-gram data frame is written to file
+                private$write_data(self$p_output, fn, fo, F)
+            }
+        },
+
+        # @description
+        # It generates ngram frequencies for the given lines of text.
+        # @param lines The lines of text to process.
+        generate_ngrams = function(lines) {
+            # The ngram number
+            n <- self$opts[["n"]]
+            # If n > 1
+            if (n > 1) {
+                # Trailing and leading white space is removed
+                l <- trimws(lines, "both")
+                # Start and end of sentence tags are added
+                l <- gsub("(^)(.+)($)", "<s>\\2</s>", l)
+                # The lines are split on space
+                w <- strsplit(l, " ")
+                # The words are converted to an atomic vector
+                w <- unlist(w)
+                # The index of empty words
+                i <- (w == "")
+                # The empty words are removed
+                w <- w[!i]
+                # The indexes for the words
+                indexes <- 1:length(w)
+                # The ngrams are generated
+                l <- sapply(indexes, function(i) {
+                    # If the words should be stemmed
+                    if (self$tg_opts[["stem_words"]]) {
+                        # The ngram prefix words are stemmed. The next word
+                        # is not stemmed
+                        v <- c(wordStem(w[i:(i+n-2)]), w[(i+n-1)])
+                    }
+                    else {
+                        # The ngram token
+                        v <- w[i:(i+n-1)]
+                    }
+                    # The ngram token words are joined with "_"
+                    v <- paste0(v, collapse = "_")
+                    # The ngram token is returned
+                    return(v)
+                },
+                simplify = T)
+                # Invalid ngrams, i.e. ngrams that cross a sentence
+                # boundary, need to be removed
+                i <- grepl(".+</s>.+", l)
+                # The list of valid ngrams
+                l <- l[!i]
+                # The start of sentence tokens are removed
+                l <- gsub("<s>", "", l)
+                # The end of sentence tokens are removed
+                l <- gsub("</s>", "", l)
+            }
+            else {
+                # The line is split on " "
+                words <- strsplit(lines, " ")
+                # The list of words is converted to an atomic vector
+                l <- unlist(words)
+                # The index of empty words
+                i <- l == ""
+                # The empty words are removed
+                l <- l[!i]
+            }
+
+            return(l)
+        }
+    )
+)
diff --git a/R/tp-generator.R b/R/tp-generator.R
new file mode 100644
index 0000000..7a2ebb9
--- /dev/null
+++ b/R/tp-generator.R
@@ -0,0 +1,342 @@
+#' It is used to generate the transition probabilities
+#'
+#' @description
+#' It implements Markov Chains for ngrams. It generates transition
+#' probabilities for ngrams.
+#'
+#' @details
+#' It reads ngram frequencies from an input text file. It parses each
+#' ngram into a prefix, a next word and the next word frequency. The prefix
+#' is converted to a numeric hash using the digest2int function from the
+#' digest package. The next word is replaced with the position of the next
+#' word in the list of all words. The data is stored in a data frame. It
+#' may be saved to a file.
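+#'
+#' @examples
+#' \dontrun{
+#' # A minimal usage sketch (illustrative): transition probabilities are
+#' # generated from previously saved n1/n2 token files in ./data/models
+#' tpg <- TPGenerator$new(verbose = 1)
+#' tpg$generate_tp(list("n_range" = 1:2, "save_tp" = T,
+#'     "dir" = "./data/models"))
+#' }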
+TPGenerator <- R6::R6Class(
+    "TPGenerator",
+    inherit = TextFileProcessor,
+    public = list(
+        #' @description
+        #' It initializes the current obj. It is used to set the verbose
+        #' option.
+        #' @param verbose If progress information should be displayed.
+        initialize = function(verbose = 0) {
+            # The base class is initialized
+            super$initialize(NULL, NULL, verbose)
+            # The processed output is initialized
+            self$p_output <- data.frame()
+        },
+
+        #' @description
+        #' It groups the given tp data by prefix. Each prefix keeps the top
+        #' c next words with the highest probabilities. All other next words
+        #' are removed. The trimmed tp data may be saved to a file or
+        #' returned. The file is saved with the suffix -min,
+        #' e.g. model-4-min.RDS.
+        #' @param opts The options for trimming the tp data.
+        #'   save_tp -> If the data should be saved.
+        #'   dir -> The dir where the output file should be saved.
+        #'   c -> The top c next words per ngram prefix.
+        #'   m -> The maximum ngram number supported by the model.
+        trim_tp = function(opts) {
+            # The model file name
+            fn <- paste0(opts$dir, "/model-", opts$m, ".RDS")
+            # The model file is read to a data frame
+            df <- self$read_obj(fn)
+            # For each prefix only the top c next words by probability are
+            # kept. (The trimming step below is an assumed implementation
+            # of the behavior documented above.)
+            df <- df %>% group_by(pre) %>%
+                slice_max(prob, n = opts$c, with_ties = F) %>%
+                ungroup()
+            # If the trimmed data should be saved
+            if (opts$save_tp) {
+                # The file is saved with the suffix -min
+                fn <- paste0(opts$dir, "/model-", opts$m, "-min.RDS")
+                # The trimmed model is saved
+                self$save_obj(df, fn)
+            }
+
+            return(df)
+        },
+
+        #' @description
+        #' It generates the transition probabilities for the given ngram
+        #' numbers. It first generates the transition probabilities for
+        #' each ngram number. The transition probabilities are then combined
+        #' into a single data frame. The data frame may be saved to a file.
+        #' @param opts The options for generating the transition
+        #'   probabilities.
+        #'   save_tp -> If the data should be saved.
+        #'   n_range -> The range of ngram numbers.
+        #'   dir -> The dir where the output file should be saved.
+        #'   format -> The format for the output. There are two options.
+        #'     'plain' -> The data is stored in plain text.
+        #'     'obj' -> The data is stored as an R obj.
+        generate_tp = function(opts) {
+            # The opts is merged with the tp_opts attribute
+            private$tp_opts = modifyList(private$tp_opts, opts)
+            # The tp_opts is merged with the base class opts attribute
+            self$opts = modifyList(self$opts, private$tp_opts)
+            # The information message
+            msg <- paste0(
+                "Generating Transition Probabilities for n = ",
+                min(self$opts[["n_range"]]),
+                ":",
+                max(self$opts[["n_range"]])
+            )
+            # Information message is shown
+            self$display_msg(msg, 1)
+            # The processed output is cleared
+            self$p_output <- data.frame()
+            # The ngram number range
+            n_range <- self$opts[["n_range"]]
+            # The minimum ngram number
+            nmin <- min(n_range)
+            # The maximum ngram number
+            nmax <- max(n_range)
+            # The output format
+            fo <- self$opts[["format"]]
+            # The file extension
+            if (fo == "plain") ext <- ".txt"
+            else ext <- ".RDS"
+            # The options for generating transition probabilities
+            tp_opts <- list(
+                n = 1,
+                format = fo,
+                save_tp = T,
+                dir = self$opts[["dir"]]
+            )
+            # The combined tp data
+            c_pre <- c_nw <- c_prob <- c()
+            # For each ngram number, the transition probabilities data is
+            # generated.
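+            # (For example, with n_range = 1:4 the loop below produces the
+            # word list from the 1-grams and a single combined data frame
+            # of the 2, 3 and 4-gram transition probabilities.)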
+            for (n in n_range) {
+                # The value of n is set
+                tp_opts$n <- n
+                # The transition probabilities or word list is generated
+                self$generate_tp_for_n(tp_opts)
+                # If n == 1, then the word list data is saved
+                if (n == 1) {
+                    # The word list data is saved
+                    private$save_data()
+                }
+                else {
+                    # c_pre is updated
+                    c_pre <- c(c_pre, self$p_output$pre)
+                    # c_nw is updated
+                    c_nw <- c(c_nw, self$p_output$nw)
+                    # c_prob is updated
+                    c_prob <- c(c_prob, self$p_output$prob)
+                    # The processed output is cleared
+                    self$p_output <- data.frame()
+                }
+            }
+            # The processed output is set to the combined tp data
+            self$p_output <-
+                data.frame("pre" = c_pre,
+                           "nw" = c_nw,
+                           "prob" = c_prob)
+            # The combined tp data is saved
+            private$save_data(paste0("model-", nmax, ext))
+        },
+
+        #' @description
+        #' It reads ngram token frequencies from an input text file. It
+        #' generates a data frame containing the prefix, next word and next
+        #' word frequency. The data frame may be saved to a file as plain
+        #' text or as an R obj. For n = 1, the list of words is saved.
+        #' @param opts The options for generating the transition
+        #'   probabilities.
+        #'   save_tp -> If the data should be saved.
+        #'   n -> The ngram number.
+        #'   dir -> The location of the input and output files.
+        #'   format -> The format of the input and output files. Options are:
+        #'     'plain' -> The data is stored in plain text.
+        #'     'obj' -> The data is stored as an R obj.
+        generate_tp_for_n = function(opts) {
+            # The opts is merged with the tp_opts attribute
+            private$tp_opts = modifyList(private$tp_opts, opts)
+            # The tp_opts is merged with the base class opts attribute
+            self$opts = modifyList(self$opts, private$tp_opts)
+            # The output format
+            fo <- self$opts[["format"]]
+            # The ngram number
+            n <- self$opts[["n"]]
+            # The output file name
+            fn <- private$get_file_name(T)
+            # If the output file already exists
+            if (file.exists(fn)) {
+                # The information message
+                msg <- paste0("The file: ", fn, " already exists")
+                # The information message is shown
+                self$display_msg(msg, 1)
+                # The file is read
+                data <- private$read_data(fn, fo, T)
+                # If n = 1
+                if (n == 1) {
+                    # The word list is set to the data
+                    private$wl <- data
+                }
+                else {
+                    # The processed output is set to the data
+                    self$p_output <- data
+                }
+            }
+            else {
+                # The information message
+                msg <- paste0("Generating Transition Probabilities for n=", n)
+                # Information message is shown
+                self$display_msg(msg, 1)
+
+                # The input file name
+                self$file_name <- private$get_file_name(F)
+                # The data is read
+                df <- private$read_data(self$file_name, fo, T)
+                # If n = 1
+                if (n == 1) {
+                    # The word list is set to the data frame
+                    private$wl <- df
+                    # A probabilities column is added
+                    private$wl$prob = (private$wl$freq/sum(private$wl$freq))
+                    # The probabilities are rounded to 8 decimal places
+                    private$wl$prob = round(private$wl$prob, 8)
+                    # The frequency column is removed
+                    private$wl$freq <- NULL
+                }
+                else {
+                    # The 1-gram words are read
+                    private$read_words()
+                    # Each ngram is split into a prefix and a next word on
+                    # the last "_"
+                    m <- str_match(df$pre, "(.+)_(.+)")
+                    # The hash of the prefix is taken
+                    np <- digest2int(m[,2])
+                    # The next word is replaced by its position in the list
+                    # of words
+                    nw <- match(m[,3], private$wl$pre)
+                    # The next word frequencies
+                    nf <- df$freq
+                    # The data is added to a data frame
+                    df <- data.frame("pre" = np,
+                                     "nw" = nw,
+                                     "freq" = nf)
+                    # The processed output is set to the data frame
+                    self$p_output <- df
+                    # The next word probabilities are generated
+                    private$generate_probs()
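+                    # (e.g. a prefix whose next words have frequencies
+                    # c(3, 1) gets the probabilities c(0.75, 0.25))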
+                    # The frequency column is removed
+                    self$p_output$freq <- NULL
+                    # If the data should be saved
+                    if (self$opts[["save_tp"]]) {
+                        private$save_data()
+                    }
+                }
+            }
+        }
+    ),
+
+    private = list(
+        # @field tp_opts The options for generating the transition
+        # probabilities.
+        #   save_tp -> If the data should be saved.
+        #   n -> The ngram number.
+        #   dir -> The dir where the output file should be saved.
+        #   format -> The format for the output. There are two options.
+        #     'plain' -> The data is stored in plain text.
+        #     'obj' -> The data is stored as an R obj.
+        tp_opts = list(
+            "save_tp" = F,
+            "n" = 1,
+            "dir" = "./data/models",
+            "format" = "obj"
+        ),
+
+        # @field wl The list of unique words and their frequencies
+        wl = data.frame(),
+
+        # @description
+        # It calculates the next word probabilities.
+        generate_probs = function() {
+            # Information message is shown
+            self$display_msg("Generating Transition Probabilities...", 1)
+            # The ngram number
+            n <- self$opts[["n"]]
+            # If n > 1
+            if (n > 1) {
+                # The output is copied to a variable
+                df <- self$p_output
+                # A new probability column is added. It is set to the sum
+                # of the frequency column for each prefix group.
+                df <- df %>%
+                    group_by(pre) %>%
+                    mutate(prob = sum(freq))
+                # Each frequency is divided by the sum to give the
+                # probability.
+                df$prob <- round(df$freq/df$prob, 8)
+                # The output is set to the updated variable
+                self$p_output <- df
+            }
+        },
+
+        # @description
+        # It returns the name of the output or input file.
+        # @param is_output If the output file name is required.
+        get_file_name = function(is_output) {
+            # The ngram number
+            n <- self$opts[["n"]]
+            # The directory
+            od <- self$opts[["dir"]]
+            # The format
+            fo <- self$opts[["format"]]
+            # The file extension
+            if (fo == "plain") ext <- ".txt"
+            else ext <- ".RDS"
+            # If the output file name is required
+            if (is_output) {
+                # If n = 1
+                if (n == 1) {
+                    # The file name
+                    fn <- paste0(od, "/words", ext)
+                }
+                # If n > 1
+                else if (n > 1) {
+                    # The file name
+                    fn <- paste0(od, "/tp", n, ext)
+                }
+            }
+            else {
+                # The file name
+                fn <- paste0(od, "/n", n, ext)
+            }
+
+            return(fn)
+        },
+
+        # @description
+        # It saves the transition probabilities or the word list, depending
+        # on the n option, to a file in plain format or as an R obj. If the
+        # file name is not given, then it is generated using the current
+        # object attributes.
+        # @param file_name The file name to use.
+        save_data = function(file_name = NULL) {
+            # The ngram number
+            n <- self$opts[["n"]]
+            # The directory
+            od <- self$opts[["dir"]]
+            # The format
+            fo <- self$opts[["format"]]
+            # If n = 1
+            if (n == 1) {
+                # The data to save
+                data <- private$wl
+            }
+            # If n > 1
+            else if (n > 1) {
+                # The data to save
+                data <- self$p_output
+            }
+            # If the file name is given as a parameter, then it is used
+            if (!is.null(file_name)) fn <- paste0(od, "/", file_name)
+            else fn <- private$get_file_name(T)
+            # The data is written
+            private$write_data(data, fn, fo, F)
+        },
+
+        # @description
+        # It reads the list of 1-gram words.
+        read_words = function() {
+            # If the word list has not been read
+            if (nrow(private$wl) == 0) {
+                # The format
+                fo <- self$opts[["format"]]
+                # The file extension
+                if (fo == "plain") ext <- ".txt"
+                else ext <- ".RDS"
+                # The 1-gram words file name
+                fn <- paste0(self$opts[["dir"]], "/words", ext)
+                # The words are read
+                private$wl <- private$read_data(fn, self$opts[["format"]], F)
+            }
+        }
+    )
+)
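+
+# A minimal usage sketch (illustrative): the combined model file
+# model-4.RDS is trimmed so that each prefix keeps only its top 3 next
+# words
+# tpg <- TPGenerator$new(verbose = 1)
+# trimmed <- tpg$trim_tp(list("save_tp" = T, "dir" = "./data/models",
+#     "c" = 3, "m" = 4))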