Skip to content

Commit

Permalink
Merge pull request #31 from vgherard/dev
Browse files Browse the repository at this point in the history
v0.2.0
  • Loading branch information
vgherard authored Oct 6, 2023
2 parents 33af34f + 2e75a8a commit 4b037c4
Show file tree
Hide file tree
Showing 20 changed files with 362 additions and 115 deletions.
60 changes: 31 additions & 29 deletions .github/workflows/test-coverage.yaml
Original file line number Diff line number Diff line change
@@ -1,48 +1,50 @@
# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
on:
push:
branches:
- main
- master
branches: [main, master]
pull_request:
branches:
- main
- master
branches: [main, master]

name: test-coverage

jobs:
test-coverage:
runs-on: macOS-latest
runs-on: ubuntu-latest
env:
GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}

steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3

- uses: r-lib/actions/setup-r@v1
- uses: r-lib/actions/setup-r@v2
with:
use-public-rspm: true

- uses: r-lib/actions/setup-pandoc@v1
- uses: r-lib/actions/setup-r-dependencies@v2
with:
extra-packages: any::covr
needs: coverage

- name: Query dependencies
- name: Test coverage
run: |
install.packages('remotes')
saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2)
writeLines(sprintf("R-%i.%i", getRversion()$major, getRversion()$minor), ".github/R-version")
covr::codecov(
quiet = FALSE,
clean = FALSE,
install_path = file.path(Sys.getenv("RUNNER_TEMP"), "package")
)
shell: Rscript {0}

- name: Cache R packages
uses: actions/cache@v2
with:
path: ${{ env.R_LIBS_USER }}
key: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-${{ hashFiles('.github/depends.Rds') }}
restore-keys: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-

- name: Install dependencies
- name: Show testthat output
if: always()
run: |
install.packages(c("remotes"))
remotes::install_deps(dependencies = TRUE)
remotes::install_cran("covr")
shell: Rscript {0}
## --------------------------------------------------------------------
find ${{ runner.temp }}/package -name 'testthat.Rout*' -exec cat '{}' \; || true
shell: bash

- name: Test coverage
run: covr::codecov()
shell: Rscript {0}
- name: Upload test results
if: failure()
uses: actions/upload-artifact@v3
with:
name: coverage-test-failures
path: ${{ runner.temp }}/package
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: kgrams
Title: Classical k-gram Language Models
Version: 0.1.5
Version: 0.2.0
Authors@R:
person(given = "Valerio",
family = "Gherardi",
Expand Down
23 changes: 23 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,26 @@
# kgrams 0.2.0


### Breaking changes

* `tknz_sent()` and `preprocess()` now have a different implementation on
Windows and UNIX OSs, respectively (since the previous C++ implementation has
unpredictable behaviour on Windows, see #30). This fix also included minor
changes in the `tknz_sent()` output, in some corner cases (e.g. `tknz_sent("")`
now returns `character(0)`, whereas it used to return `""`).

### New features

* `perplexity()` gets a new argument `exp` that allows returning the
cross-entropy per word, rather than perplexity (its exponential).
* `perplexity.character()` gets a new argument `detailed` that allows returning, alongside the total perplexity of the input document, also the
cross-entropies and word lengths of individual sentences. Closes #28.

### Improvements

* Minor documentation improvements.


# kgrams 0.1.5

* Removed "Tools for..." at the beginning of package DESCRIPTION, as per CRAN's
Expand Down
12 changes: 4 additions & 8 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
@@ -1,16 +1,12 @@
# Generated by using Rcpp::compileAttributes() -> do not edit by hand
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393

#' @rdname preprocess
#' @export
preprocess <- function(input, erase = "[^.?!:;'[:alnum:][:space:]]", lower_case = TRUE) {
.Call(`_kgrams_preprocess`, input, erase, lower_case)
preprocess_cpp <- function(input, erase = "[^.?!:;'[:alnum:][:space:]]", lower_case = TRUE) {
.Call(`_kgrams_preprocess_cpp`, input, erase, lower_case)
}

#' @rdname tknz_sent
#' @export
tknz_sent <- function(input, EOS = "[.?!:;]+", keep_first = FALSE) {
.Call(`_kgrams_tknz_sent`, input, EOS, keep_first)
tknz_sent_cpp <- function(input, EOS = "[.?!:;]+", keep_first = FALSE) {
.Call(`_kgrams_tknz_sent_cpp`, input, EOS, keep_first)
}

#' @rdname special_tokens
Expand Down
5 changes: 2 additions & 3 deletions R/language_model.R
Original file line number Diff line number Diff line change
Expand Up @@ -49,9 +49,8 @@
#' \code{language_models()} have methods for computing word continuation and
#' sentence probabilities (see \link[kgrams]{probability}),
#' random text generation (see \link[kgrams]{sample_sentences})
#' and other type of language modeling tasks such as
#' (\strong{not yet implemented}) computing perplexities and word
#' prediction accuracies.
#' and other types of language modeling tasks such as computing perplexities and
#' word prediction accuracies.
#'
#' Smoothers often have tuning parameters, which need to be specified by
#' (exact) name through the \code{...} arguments; otherwise,
Expand Down
29 changes: 27 additions & 2 deletions R/perplexity.R
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,12 @@
#' @param .tknz_sent a function taking a character vector as input and
#' returning a character vector as output. Optional sentence tokenization step
#' applied before computing perplexity.
#' @param exp \code{TRUE} or \code{FALSE}. If \code{TRUE}, returns the actual
#' perplexity - exponential of cross-entropy per token - otherwise returns its
#' natural logarithm.
#' @param detailed \code{TRUE} or \code{FALSE}. If \code{TRUE}, the output has
#' a \code{"details"} attribute, which is a data-frame containing the
#' cross-entropy of each individual sentence tokenized from \code{text}.
#' @param batch_size a length one positive integer or \code{Inf}.
#' Size of text batches when reading text from a \code{connection}.
#' If \code{Inf}, all input text is processed in a single batch.
Expand Down Expand Up @@ -82,6 +88,7 @@ perplexity <- function(text,
model,
.preprocess = attr(model, ".preprocess"),
.tknz_sent = attr(model, ".tknz_sent"),
exp = TRUE,
...
)
{
Expand All @@ -99,15 +106,32 @@ perplexity.character <- function(
model,
.preprocess = attr(model, ".preprocess"),
.tknz_sent = attr(model, ".tknz_sent"),
exp = TRUE,
detailed = FALSE,
...
)
{
assert_character_no_NA(text)
assert_true_or_false(detailed)

text <- .preprocess(text)
text <- .tknz_sent(text)
lp <- attr(model, "cpp_obj")$log_probability_sentence(text)
cross_entropy <- -sum(lp$log_prob) / sum(lp$n_words)
return(exp(cross_entropy))
cross_entropy_normalized <- -sum(lp$log_prob) / sum(lp$n_words)

res <- ifelse(exp,
exp(cross_entropy_normalized),
cross_entropy_normalized)

if (detailed) {
attr(res, "details") <-
data.frame(sentence = text,
cross_entropy = -lp$log_prob,
n_words = lp$n_words
)
}

return(res)
}

#' @rdname perplexity
Expand All @@ -117,6 +141,7 @@ perplexity.connection <- function(
model,
.preprocess = attr(model, ".preprocess"),
.tknz_sent = attr(model, ".tknz_sent"),
exp = TRUE,
batch_size = Inf,
...
)
Expand Down
33 changes: 27 additions & 6 deletions R/preprocess.R
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,33 @@
#' \code{gsub(pattern, "", x)}, respectively, provided that the regular
#' expression 'pattern' is correctly recognized by R.
#'
#' Internally, \code{preprocess()} converts the string 'pattern' is converted
#' into a C++ \code{std::regex} class by the default constructor
#' \code{std::regex::regex(std::string)}.
#' **Note.** This function, as well as \link[kgrams]{tknz_sent}, is included
#' in the library for illustrative purposes only, and is not optimized for
#' performance. Furthermore (for performance reasons) the function has a
#' separate implementation for Windows and UNIX OS types, respectively, so that
#' results obtained in the two cases may differ slightly.
#' In contexts that require full reproducibility, users are encouraged to define
#' their own preprocessing and tokenization custom functions - or to work with
#' externally processed data.
#'
#' @examples
#' preprocess("#This Is An Example@@-@@!#")
#' @name preprocess
NULL

# Defined in UtilitiesR.cpp
#' @export
preprocess <- function(input,
                       erase = "[^.?!:;'[:alnum:][:space:]]",
                       lower_case = TRUE
) {
    # Validate arguments on every platform. Previously the asserts ran only
    # on the Windows fallback path, because the UNIX branch returned early,
    # so invalid 'erase'/'lower_case' reached the C++ routine unvalidated.
    assert_string(erase)
    assert_true_or_false(lower_case)

    # On UNIX-alikes, delegate to the faster C++ implementation; on Windows
    # a base-R fallback is used instead (C++ regex issues there, see #30).
    if (.Platform$OS.type != "windows")
        return(preprocess_cpp(input, erase, lower_case))

    # Base-R fallback: strip characters matching the 'erase' regex, then
    # optionally lower-case the result.
    res <- gsub(erase, "", input)

    if (lower_case)
        return(tolower(res))

    res
}
14 changes: 9 additions & 5 deletions R/sample_sentences.R
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,16 @@
#' as context, starting from the Begin-Of-Sentence context (i.e. \code{N - 1}
#' BOS tokens). Sampling stops either when an End-Of-Sentence token is
#' encountered, or when the string exceeds \code{max_length}, in which case
#' a truncated output is returned.
#' a truncated output is returned.
#'
#' A word of caution on some special smoothers: 'sbo' smoother (Stupid Backoff),
#' does not produce normalized continuation probabilities, but rather
#' continuation \emph{scores}. Sampling is here performed by assuming that
#' Stupid Backoff scores are \emph{proportional} to actual probabilities.
#' Some language models may give a non-zero probability to the Unknown word
#' token, but this is never produced in text generated by
#' \code{sample_sentences()}: when randomly sampled, it is simply ignored.
#'
#' Finally, a word of caution on some special smoothers: \code{"sbo"} smoother
#' (Stupid Backoff), does not produce normalized continuation probabilities,
#' but rather continuation \emph{scores}. Sampling is here performed by assuming
#' that Stupid Backoff scores are \emph{proportional} to actual probabilities.
#' 'ml' smoother (Maximum Likelihood) does not assign probabilities when the
#' k-gram count of the context is zero. When this happens, the next word is
#' chosen uniformly at random from the model's dictionary.
Expand Down
70 changes: 63 additions & 7 deletions R/tokenize_sentences.R
Original file line number Diff line number Diff line change
Expand Up @@ -12,24 +12,80 @@
#' @return a character vector, each entry of which corresponds to a single
#' sentence.
#' @details
#' \code{tknz_sent()} splits text into sentences using a list of
#' single character delimiters, specified by the parameter \code{EOS}.
#' \code{tknz_sent()} splits text into sentences, where sentence delimiters are
#' specified by a regular expression through the \code{EOS} argument.
#' Specifically, when an EOS token is found, the next sentence begins at the
#' first position in the input string not containing any of the EOS tokens
#' \emph{or white space} (so that entries like \code{"Hi there!!!"} or
#' \code{"Hello . . ."} are both recognized as a single sentence).
#'
#' If \code{keep_first} is \code{FALSE}, the delimiters are stripped off from
#' the returned sequences, which means that all delimiters are treated
#' symmetrically.
#' the returned sequences. Otherwise, the first character of the substrings
#' matching the \code{EOS} regular expressions are appended to the corresponding
#' sentences, preceded by a white space.
#'
#' In the absence of any \code{EOS} delimiter, \code{tknz_sent()}
#' returns the input as is, since parts of text corresponding to different
#' entries of the input vector \code{x} are understood as parts of separate
#' sentences.
#'
#' **Note.** This function, as well as \link[kgrams]{preprocess}, is included
#' in the library for illustrative purposes only, and is not optimized for
#' performance. Furthermore (for performance reasons) the function has a
#' separate implementation for Windows and UNIX OS types, respectively, so that
#' results obtained in the two cases may differ slightly.
#' In contexts that require full reproducibility, users are encouraged to define
#' their own preprocessing and tokenization custom functions - or to work with
#' externally processed data.
#'
#' @examples
#' tknz_sent("Hi there! I'm using `sbo`.")
#' tknz_sent("Hi there! I'm using kgrams.")
#' @name tknz_sent
NULL
#' @export
tknz_sent <- function(input, EOS = "[.?!:;]+", keep_first = FALSE) {
    # Select the platform-appropriate sentence splitter: the C++ routine is
    # unreliable on Windows (see #30), so a base-R fallback is used there.
    splitter <- if (.Platform$OS.type == "windows") {
        tknz_sent_win
    } else {
        tknz_sent_cpp
    }

    raw_sentences <- splitter(input, EOS, keep_first)

    # Trim surrounding whitespace and drop empty sentences.
    tknz_sent_postproc(raw_sentences)
}


# Base-R fallback used on Windows, where the C++ implementation misbehaves
# (see https://github.com/RcppCore/Rcpp/issues/810). Splits each input string
# into sentences at substrings matching the 'EOS' regex; when 'keep_first' is
# TRUE, the first character of each delimiter match is appended (space-
# separated) to the sentence it terminates.
tknz_sent_win <- function(input, EOS, keep_first) {
    assert_string(EOS)
    assert_true_or_false(keep_first)

    # An empty pattern means "no delimiters": the input is returned as is.
    if (EOS == "")
        return(input)

    pieces <- strsplit(input, EOS)

    if (!keep_first)
        return(unlist(pieces))

    # Recover the actual delimiter substrings, one list entry per input
    # string, to re-attach their first character to each sentence.
    delims <- regmatches(input, gregexpr(EOS, input))

    tagged <- lapply(seq_along(pieces), function(idx) {
        sents <- pieces[[idx]]
        k <- length(sents)
        first_chars <- substr(delims[[idx]], 1, 1)

        # One delimiter per sentence: tag them all.
        if (length(first_chars) == k)
            return(paste(sents, first_chars))

        # The last sentence had no trailing delimiter: leave it untagged.
        c(paste(sents[-k], first_chars), sents[k])
    })

    unlist(tagged)
}

# Shared post-processing for both tokenizer implementations: strip
# surrounding whitespace from each sentence and discard empty entries.
tknz_sent_postproc <- function(s) {
    cleaned <- trimws(s, which = "both")
    cleaned[cleaned != ""]
}
Loading

0 comments on commit 4b037c4

Please sign in to comment.