Meredith-Lab · Aariq · Aug 6, 2024 · Aug 6, 2024 · Aug 6, 2024 · Aug 6, 2024
diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml
@@ -18,7 +18,7 @@ jobs:
       fail-fast: false
       matrix:
         config:
-          - {os: macos-latest,   r: 'release'}
+          - {os: macos-13,   r: 'release'} #TODO change back to macos-latest some day when ChemmineOB works on M1
           - {os: windows-latest, r: 'release'}
           - {os: ubuntu-latest,   r: 'devel', http-user-agent: 'release'}
           - {os: ubuntu-latest,   r: 'release'}
@@ -39,7 +39,7 @@ jobs:
           http-user-agent: ${{ matrix.config.http-user-agent }}
           use-public-rspm: true
       - name: macOS openbabel
-        if: matrix.config.os == 'macos-latest'
+        if: contains(matrix.config.os, 'macos')
         run: |
           brew install open-babel
       - name: ubuntu openbabel

diff --git a/DESCRIPTION b/DESCRIPTION
@@ -26,7 +26,6 @@ Imports:
     fs,
     glue,
     httr2,
-    KEGGREST,
     magrittr,
     purrr,
     rlang,
@@ -46,4 +45,4 @@ biocViews:
 Config/testthat/edition: 3
 Encoding: UTF-8
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.3.1
+RoxygenNote: 7.3.2
diff --git a/NEWS.md b/NEWS.md
@@ -1,6 +1,7 @@
 # volcalc (development version)
 
 * adds a `validate = TRUE` option to `calc_vol()` and `get_fx_groups()` that returns `NA`s when there are suspected errors in parsing SMILES or .mol files. This is unfortunately not available on Windows due to differences in the windows version of `ChemmineOB`
+* `KEGGREST` is no longer a dependency of `volcalc` (previously used in `get_mol_kegg()`)
 
 # volcalc 2.1.2
 

diff --git a/R/get_mol_kegg.R b/R/get_mol_kegg.R
@@ -14,8 +14,12 @@ utils::globalVariables(".data")
 #'   if they are found in `dir`. Set this to `TRUE` to download and overwrite
 #'   existing files.
 #'   
+#' @note For additional functionality for interacting with KEGG, try the
+#'   `KEGGREST` package, which this function was inspired by.
+#'   
 #' @returns A tibble with the columns `compound_ids`, `pathway_ids` (if used),
 #'   and `mol_paths` (paths to downloaded .mol files).
+#'   
 #' @export
 #'
 #' @examples
@@ -47,7 +51,7 @@ get_mol_kegg <- function(compound_ids, pathway_ids, dir, force = FALSE){
       stop("Some pathway_ids are not in the correct KEGG format")
     }
     fs::dir_create(dir, pathway_ids)
-    compound_ids_list <- lapply(pathway_ids, keggGetCompounds)
+    compound_ids_list <- lapply(pathway_ids, get_compounds_kegg)
     names(compound_ids_list) <- pathway_ids
     out_tbl <- 
       tibble::enframe(compound_ids_list, name = "pathway_id", value = "compound_id") %>% 
@@ -91,12 +95,9 @@ get_mol_kegg <- function(compound_ids, pathway_ids, dir, force = FALSE){
 
 #' Get list of KEGG compound IDs for given KEGG pathway
 #'
-#' This is a temporary helper function until this function is improved and
-#' pushed into KEGGREST package
-#'
 #' @param pathway string that is a KEGG identifier for a molecular pathway
 #' @noRd
-keggGetCompounds <- function(pathway){
+get_compounds_kegg <- function(pathway){
 
   resp <- 
     httr2::request("https://rest.kegg.jp/")  %>%  
@@ -113,36 +114,48 @@ keggGetCompounds <- function(pathway){
 
 }
 
-dl_mol_kegg <- function(compound_ids) {
-  #balances compound_ids into groups of less than 10 to meet API guidelines
-  compound_id_list <- split_to_list(compound_ids, max_len = 10)
+#' Get and wrangle mol files for a single API request of up to 10 IDs
+#' @noRd
+.dl_mol_kegg <- function(ids) {
+  if (length(ids) > 10) {
+    stop("Provide 10 or fewer IDs at a time")
+  }
+  req_names <- 
+    httr2::request("https://rest.kegg.jp/get") %>% 
+    httr2::req_url_path_append(paste(ids, collapse = "+")) %>% 
+    httr2::req_retry(max_tries = 3)
+
+  resp_names <- httr2::req_perform(req_names) %>% 
+    httr2::resp_body_string()
 
-  #maps over list, but returns it to a single character vector to simplify wrangling code
-  raw <- 
-    purrr::map(compound_id_list, function(x) KEGGREST::keggGet(x, option = "mol")) %>% 
-    purrr::list_c() %>% 
-    glue::glue_collapse()
-  #split into multiples
-  mols <- stringr::str_split(raw, "(?<=\\${4})", n = length(compound_ids)) %>%
+  # The response contains much more than is needed; only the text on each NAME line (the compound name) is kept
+  names <- resp_names %>%
+    stringr::str_extract_all("(?<=NAME).+(?=\\n)") %>%
     unlist() %>% 
-    stringr::str_trim(side = "left")
+    stringr::str_trim() %>% 
+    stringr::str_remove(";")
+
+  # get mol file
+  req_mols <- req_names %>% 
+    httr2::req_url_path_append("mol")
+
+  resp_mols <- httr2::req_perform(req_mols) %>% 
+    httr2::resp_body_string()
 
-  # Adds title to mol file because it is used later on by get_fx_groups()
-  titles <- purrr::map(compound_id_list, function(x) { #for every group of <10 IDs
-    KEGGREST::keggGet(x) %>% 
-      purrr::map_chr(function(names) { #for every ID
-        purrr::pluck(names, "NAME", 1) %>% #get first element of NAME
-          stringr::str_remove(";")
-      })
-  }) %>% unlist()
-  purrr::map2(mols, titles, function(mol, title) {
-    paste0(title, "\n\n\n", gsub(">.*", "", mol))
-  })
+  # wrangle into valid mol files
+  mols <- resp_mols %>% 
+    stringr::str_split("(?<=\\${4})", n = length(ids)) %>%
+    unlist() %>% 
+    stringr::str_trim(side = "left")
+  mols <- 
+    gsub(">.*", "", mols) #base gsub()'s `.` matches newlines (stringr's does not), so this drops everything from the first ">" onward
 
+  #add compound name in correct place
+  paste0(names, "\n\n\n", mols)
 }
 
-
-
+#' Split vector into list elements of max length
+#' @noRd
 split_to_list <- function(x, max_len = 10) {
 
   if(length(x) > max_len) {
@@ -154,3 +167,14 @@ split_to_list <- function(x, max_len = 10) {
   }
 
 }
+
+#' Get mol files for compound_ids by splitting into groups of 10 and calling .dl_mol_kegg
+#' @noRd
+dl_mol_kegg <- function(compound_ids) {
+  #balances compound_ids into groups of at most 10 to meet API guidelines
+  compound_id_list <- split_to_list(compound_ids, max_len = 10)
+
+  #maps over the list of ID groups, then flattens the results back into a single character vector
+  purrr::map(compound_id_list, .dl_mol_kegg) %>% 
+    purrr::list_c()
+}
diff --git a/README.Rmd b/README.Rmd
@@ -77,6 +77,9 @@ For windows, `OpenBabel` is included in the `ChemmineOB` binary and does not nee
 
 For other installation options see the [OpenBabel documentation](https://open-babel.readthedocs.io/en/latest/Installation/install.html) and `ChemmineOB` [install guide](https://github.com/girke-lab/ChemmineOB/blob/master/INSTALL)
 
+> [!NOTE]  
+> As of Dec 2024, `ChemmineOB` may fail to build on macs with Apple silicon (https://github.com/girke-lab/ChemmineOB/issues/35) causing installation failure for `volcalc`.
+
 ## Basic Usage
 
 This is a basic example which shows you how to get an estimated relative volatility index (`rvi`) for two example compounds *beta-2,3,4,5,6-Pentachlorocyclohexanol*, and *Succinate*.

diff --git a/README.md b/README.md
@@ -40,13 +40,11 @@ strings as input, and supports downloading .mol files directly from
 
 ## Installation
 
-<!--
 Install from CRAN with
 
 ``` r
 install.packages("volcalc")
 ```
--->
 
 You can install the development version of `volcalc` from GitHub with
 
@@ -96,6 +94,11 @@ documentation](https://open-babel.readthedocs.io/en/latest/Installation/install.
 and `ChemmineOB` [install
 guide](https://github.com/girke-lab/ChemmineOB/blob/master/INSTALL)
 
+> \[!NOTE\]  
+> As of Dec 2024, `ChemmineOB` may fail to build on macs with Apple
+> silicon (<https://github.com/girke-lab/ChemmineOB/issues/35>) causing
+> installation failure for `volcalc`.
+
 ## Basic Usage
 
 This is a basic example which shows you how to get an estimated relative
@@ -118,7 +121,7 @@ calc_vol(files$mol_path)
 #>   mol_path                                          formula name    rvi category
 #>   <chr>                                             <chr>   <chr> <dbl> <fct>   
 #> 1 /var/folders/wr/by_lst2d2fngf67mknmgf4340000gn/T… C6H7Cl… beta…  6.98 high    
-#> 2 /var/folders/wr/by_lst2d2fngf67mknmgf4340000gn/T… C4H6O4  Succ…  2.57 high
+#> 2 /var/folders/wr/by_lst2d2fngf67mknmgf4340000gn/T… C6H7Cl… beta…  6.98 high
 
 #alternatively, supply a SMILES representation
 calc_vol(c("C1(C(C(C(C(C1Cl)Cl)Cl)Cl)Cl)O",  "C(CC(=O)O)C(=O)O"), from = "smiles")

diff --git a/man/get_mol_kegg.Rd b/man/get_mol_kegg.Rd
diff --git a/tests/testthat/test-get_mol_kegg.R b/tests/testthat/test-get_mol_kegg.R
@@ -110,3 +110,14 @@ test_that("works with pathway modules", {
   expect_true(all(file.exists(out$mol_path)))
 })
 
+test_that("one compound per mol", {
+  skip_on_cran()
+  skip_if_offline()
+
+  dir <- withr::local_tempdir()
+  mols <- get_mol_kegg(c("C16181", "C00042"), dir = dir)
+  mol1 <- readLines(mols$mol_path[1])
+  mol2 <- readLines(mols$mol_path[2])
+  expect_equal(sum(stringr::str_detect(mol1, "END")), 1)
+  expect_equal(sum(stringr::str_detect(mol2, "END")), 1)
+})