Skip to content

Commit

Permalink
Merge pull request #31 from vgherard/dev
Browse files Browse the repository at this point in the history
v0.2.0
  • Loading branch information
vgherard authored Oct 6, 2023
2 parents 33af34f + 2e75a8a commit 4b037c4
Show file tree
Hide file tree
Showing 20 changed files with 362 additions and 115 deletions.
60 changes: 31 additions & 29 deletions .github/workflows/test-coverage.yaml
Original file line number Diff line number Diff line change
@@ -1,48 +1,50 @@
# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
on:
push:
branches:
- main
- master
branches: [main, master]
pull_request:
branches:
- main
- master
branches: [main, master]

name: test-coverage

jobs:
test-coverage:
runs-on: macOS-latest
runs-on: ubuntu-latest
env:
GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}

steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3

- uses: r-lib/actions/setup-r@v1
- uses: r-lib/actions/setup-r@v2
with:
use-public-rspm: true

- uses: r-lib/actions/setup-pandoc@v1
- uses: r-lib/actions/setup-r-dependencies@v2
with:
extra-packages: any::covr
needs: coverage

- name: Query dependencies
- name: Test coverage
run: |
install.packages('remotes')
saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2)
writeLines(sprintf("R-%i.%i", getRversion()$major, getRversion()$minor), ".github/R-version")
covr::codecov(
quiet = FALSE,
clean = FALSE,
install_path = file.path(Sys.getenv("RUNNER_TEMP"), "package")
)
shell: Rscript {0}

- name: Cache R packages
uses: actions/cache@v2
with:
path: ${{ env.R_LIBS_USER }}
key: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-${{ hashFiles('.github/depends.Rds') }}
restore-keys: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-

- name: Install dependencies
- name: Show testthat output
if: always()
run: |
install.packages(c("remotes"))
remotes::install_deps(dependencies = TRUE)
remotes::install_cran("covr")
shell: Rscript {0}
## --------------------------------------------------------------------
find ${{ runner.temp }}/package -name 'testthat.Rout*' -exec cat '{}' \; || true
shell: bash

- name: Test coverage
run: covr::codecov()
shell: Rscript {0}
- name: Upload test results
if: failure()
uses: actions/upload-artifact@v3
with:
name: coverage-test-failures
path: ${{ runner.temp }}/package
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: kgrams
Title: Classical k-gram Language Models
Version: 0.1.5
Version: 0.2.0
Authors@R:
person(given = "Valerio",
family = "Gherardi",
Expand Down
23 changes: 23 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,26 @@
# kgrams 0.2.0


### Breaking changes

* `tknz_sent()` and `preprocess()` now have a different implementation on
Windows and UNIX OSs, respectively (since the previous C++ implementation has
unpredictable behaviour on Windows, see #30). This fix also included minor
changes in the `tknz_sent()` output, in some corner cases (e.g. `tknz_sent("")`
now returns `character(0)`, whereas it used to return `""`).

### New features

* `perplexity()` gets a new argument `exp` that allows returning the
cross-entropy per word, rather than perplexity (its exponential).
* `perplexity.character()` gets a new argument `detailed` that allows returning, alongside the total perplexity of the input document, also the
cross-entropies and word lengths of individual sentences. Closes #28.

### Improvements

* Minor documentation improvements.


# kgrams 0.1.5

* Removed "Tools for..." at the beginning of package DESCRIPTION, as per CRAN's
Expand Down
12 changes: 4 additions & 8 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
@@ -1,16 +1,12 @@
# Generated by using Rcpp::compileAttributes() -> do not edit by hand
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393

#' @rdname preprocess
#' @export
preprocess <- function(input, erase = "[^.?!:;'[:alnum:][:space:]]", lower_case = TRUE) {
.Call(`_kgrams_preprocess`, input, erase, lower_case)
preprocess_cpp <- function(input, erase = "[^.?!:;'[:alnum:][:space:]]", lower_case = TRUE) {
.Call(`_kgrams_preprocess_cpp`, input, erase, lower_case)
}

#' @rdname tknz_sent
#' @export
tknz_sent <- function(input, EOS = "[.?!:;]+", keep_first = FALSE) {
.Call(`_kgrams_tknz_sent`, input, EOS, keep_first)
tknz_sent_cpp <- function(input, EOS = "[.?!:;]+", keep_first = FALSE) {
.Call(`_kgrams_tknz_sent_cpp`, input, EOS, keep_first)
}

#' @rdname special_tokens
Expand Down
5 changes: 2 additions & 3 deletions R/language_model.R
Original file line number Diff line number Diff line change
Expand Up @@ -49,9 +49,8 @@
#' \code{language_models()} have methods for computing word continuation and
#' sentence probabilities (see \link[kgrams]{probability}),
#' random text generation (see \link[kgrams]{sample_sentences})
#' and other type of language modeling tasks such as
#' (\strong{not yet implemented}) computing perplexities and word
#' prediction accuracies.
#' and other types of language modeling tasks such as computing perplexities and
#' word prediction accuracies.
#'
#' Smoothers often have tuning parameters, which need to be specified by
#' (exact) name through the \code{...} arguments; otherwise,
Expand Down
29 changes: 27 additions & 2 deletions R/perplexity.R
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,12 @@
#' @param .tknz_sent a function taking a character vector as input and
#' returning a character vector as output. Optional sentence tokenization step
#' applied before computing perplexity.
#' @param exp \code{TRUE} or \code{FALSE}. If \code{TRUE}, returns the actual
#' perplexity - exponential of cross-entropy per token - otherwise returns its
#' natural logarithm.
#' @param detailed \code{TRUE} or \code{FALSE}. If \code{TRUE}, the output has
#' a \code{"details"} attribute, which is a data-frame containing the
#' cross-entropy of each individual sentence tokenized from \code{text}.
#' @param batch_size a length one positive integer or \code{Inf}.
#' Size of text batches when reading text from a \code{connection}.
#' If \code{Inf}, all input text is processed in a single batch.
Expand Down Expand Up @@ -82,6 +88,7 @@ perplexity <- function(text,
model,
.preprocess = attr(model, ".preprocess"),
.tknz_sent = attr(model, ".tknz_sent"),
exp = TRUE,
...
)
{
Expand All @@ -99,15 +106,32 @@ perplexity.character <- function(
model,
.preprocess = attr(model, ".preprocess"),
.tknz_sent = attr(model, ".tknz_sent"),
exp = TRUE,
detailed = FALSE,
...
)
{
assert_character_no_NA(text)
assert_true_or_false(detailed)

text <- .preprocess(text)
text <- .tknz_sent(text)
lp <- attr(model, "cpp_obj")$log_probability_sentence(text)
cross_entropy <- -sum(lp$log_prob) / sum(lp$n_words)
return(exp(cross_entropy))
cross_entropy_normalized <- -sum(lp$log_prob) / sum(lp$n_words)

res <- ifelse(exp,
exp(cross_entropy_normalized),
cross_entropy_normalized)

if (detailed) {
attr(res, "details") <-
data.frame(sentence = text,
cross_entropy = -lp$log_prob,
n_words = lp$n_words
)
}

return(res)
}

#' @rdname perplexity
Expand All @@ -117,6 +141,7 @@ perplexity.connection <- function(
model,
.preprocess = attr(model, ".preprocess"),
.tknz_sent = attr(model, ".tknz_sent"),
exp = TRUE,
batch_size = Inf,
...
)
Expand Down
33 changes: 27 additions & 6 deletions R/preprocess.R
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,33 @@
#' \code{gsub(pattern, "", x)}, respectively, provided that the regular
#' expression 'pattern' is correctly recognized by R.
#'
#' Internally, \code{preprocess()} converts the string 'pattern' is converted
#' into a C++ \code{std::regex} class by the default constructor
#' \code{std::regex::regex(std::string)}.
#' **Note.** This function, as well as \link[kgrams]{tknz_sent}, is included
#' in the library for illustrative purposes only, and is not optimized for
#' performance. Furthermore (for performance reasons) the function has a
#' separate implementation for Windows and UNIX OS types, respectively, so that
#' results obtained in the two cases may differ slightly.
#' In contexts that require full reproducibility, users are encouraged to define
#' their own preprocessing and tokenization custom functions - or to work with
#' externally processed data.
#'
#' @examples
#' preprocess("#This Is An Example@@-@@!#")
#' @name preprocess
NULL

# Defined in UtilitiesR.cpp
#' @export
preprocess <- function(input,
                       erase = "[^.?!:;'[:alnum:][:space:]]",
                       lower_case = TRUE
) {
    # Validate arguments on every platform. Previously the asserts ran only
    # on the Windows fallback path, because the UNIX branch returned early,
    # so invalid 'erase'/'lower_case' reached the C++ routine unvalidated.
    assert_string(erase)
    assert_true_or_false(lower_case)

    # On UNIX-alikes, delegate to the faster C++ implementation; on Windows
    # a base-R fallback is used instead (C++ regex issues there, see #30).
    if (.Platform$OS.type != "windows")
        return(preprocess_cpp(input, erase, lower_case))

    # Base-R fallback: strip characters matching the 'erase' regex, then
    # optionally lower-case the result.
    res <- gsub(erase, "", input)

    if (lower_case)
        return(tolower(res))

    res
}
14 changes: 9 additions & 5 deletions R/sample_sentences.R
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,16 @@
#' as context, starting from the Begin-Of-Sentence context (i.e. \code{N - 1}
#' BOS tokens). Sampling stops either when an End-Of-Sentence token is
#' encountered, or when the string exceeds \code{max_length}, in which case
#' a truncated output is returned.
#' a truncated output is returned.
#'
#' A word of caution on some special smoothers: 'sbo' smoother (Stupid Backoff),
#' does not produce normalized continuation probabilities, but rather
#' continuation \emph{scores}. Sampling is here performed by assuming that
#' Stupid Backoff scores are \emph{proportional} to actual probabilities.
#' Some language models may give a non-zero probability to the Unknown word
#' token, but this is never produced in text generated by
#' \code{sample_sentences()}: when randomly sampled, it is simply ignored.
#'
#' Finally, a word of caution on some special smoothers: \code{"sbo"} smoother
#' (Stupid Backoff), does not produce normalized continuation probabilities,
#' but rather continuation \emph{scores}. Sampling is here performed by assuming
#' that Stupid Backoff scores are \emph{proportional} to actual probabilities.
#' 'ml' smoother (Maximum Likelihood) does not assign probabilities when the
#' k-gram count of the context is zero. When this happens, the next word is
#' chosen uniformly at random from the model's dictionary.
Expand Down
70 changes: 63 additions & 7 deletions R/tokenize_sentences.R
Original file line number Diff line number Diff line change
Expand Up @@ -12,24 +12,80 @@
#' @return a character vector, each entry of which corresponds to a single
#' sentence.
#' @details
#' \code{tknz_sent()} splits text into sentences using a list of
#' single character delimiters, specified by the parameter \code{EOS}.
#' \code{tknz_sent()} splits text into sentences, where sentence delimiters are
#' specified by a regular expression through the \code{EOS} argument.
#' Specifically, when an EOS token is found, the next sentence begins at the
#' first position in the input string not containing any of the EOS tokens
#' \emph{or white space} (so that entries like \code{"Hi there!!!"} or
#' \code{"Hello . . ."} are both recognized as a single sentence).
#'
#' If \code{keep_first} is \code{FALSE}, the delimiters are stripped off from
#' the returned sequences, which means that all delimiters are treated
#' symmetrically.
#' the returned sequences. Otherwise, the first character of the substrings
#' matching the \code{EOS} regular expressions are appended to the corresponding
#' sentences, preceded by a white space.
#'
#' In the absence of any \code{EOS} delimiter, \code{tknz_sent()}
#' returns the input as is, since parts of text corresponding to different
#' entries of the input vector \code{x} are understood as parts of separate
#' sentences.
#'
#' **Note.** This function, as well as \link[kgrams]{preprocess}, is included
#' in the library for illustrative purposes only, and is not optimized for
#' performance. Furthermore (for performance reasons) the function has a
#' separate implementation for Windows and UNIX OS types, respectively, so that
#' results obtained in the two cases may differ slightly.
#' In contexts that require full reproducibility, users are encouraged to define
#' their own preprocessing and tokenization custom functions - or to work with
#' externally processed data.
#'
#' @examples
#' tknz_sent("Hi there! I'm using `sbo`.")
#' tknz_sent("Hi there! I'm using kgrams.")
#' @name tknz_sent
NULL
#' @export
tknz_sent <- function(input, EOS = "[.?!:;]+", keep_first = FALSE) {
    # Select the platform-appropriate sentence splitter: the C++ routine is
    # unreliable on Windows (see #30), so a base-R fallback is used there.
    splitter <- if (.Platform$OS.type == "windows") {
        tknz_sent_win
    } else {
        tknz_sent_cpp
    }

    raw_sentences <- splitter(input, EOS, keep_first)

    # Trim surrounding whitespace and drop empty sentences.
    tknz_sent_postproc(raw_sentences)
}


# Base-R fallback used on Windows, where the C++ implementation misbehaves
# (see https://github.com/RcppCore/Rcpp/issues/810). Splits each input string
# into sentences at substrings matching the 'EOS' regex; when 'keep_first' is
# TRUE, the first character of each delimiter match is appended (space-
# separated) to the sentence it terminates.
tknz_sent_win <- function(input, EOS, keep_first) {
    assert_string(EOS)
    assert_true_or_false(keep_first)

    # An empty pattern means "no delimiters": the input is returned as is.
    if (EOS == "")
        return(input)

    pieces <- strsplit(input, EOS)

    if (!keep_first)
        return(unlist(pieces))

    # Recover the actual delimiter substrings, one list entry per input
    # string, to re-attach their first character to each sentence.
    delims <- regmatches(input, gregexpr(EOS, input))

    tagged <- lapply(seq_along(pieces), function(idx) {
        sents <- pieces[[idx]]
        k <- length(sents)
        first_chars <- substr(delims[[idx]], 1, 1)

        # One delimiter per sentence: tag them all.
        if (length(first_chars) == k)
            return(paste(sents, first_chars))

        # The last sentence had no trailing delimiter: leave it untagged.
        c(paste(sents[-k], first_chars), sents[k])
    })

    unlist(tagged)
}

# Shared post-processing for both tokenizer implementations: strip
# surrounding whitespace from each sentence and discard empty entries.
tknz_sent_postproc <- function(s) {
    cleaned <- trimws(s, which = "both")
    cleaned[cleaned != ""]
}
Loading

0 comments on commit 4b037c4

Please sign in to comment.