Merge pull request #102 from r-world-devs/waisk/34/prepare_release

pre-release
r-world-devs · Feb 17, 2025 · 2444c04 · 2444c04
2 parents 3f8c005 + 8c35f51
commit 2444c04
Show file tree

Hide file tree

Showing 26 changed files with 693 additions and 203 deletions.
diff --git a/.gitignore b/.gitignore
@@ -2,3 +2,7 @@
 .Renviron
 docs
 inst/demo-app/rsconnect/
+
+/.quarto/
+README.html
+inst/doc
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,12 +1,12 @@
 Package: GitAI
 Title: Extracts Knowledge From Git Repositories
-Version: 0.0.0.9015
+Version: 0.0.0.9017
 Authors@R: c(
     person("Kamil", "Wais", , "[email protected]", role = c("aut", "cre")),
     person("Krystian", "Igras", , "[email protected]", role = "aut"),
     person("Maciej", "Banas", , "[email protected]", role = "aut")
   )
-Description: Scan multiple Git repositories, pull specified files content and process it with Large Language Models. You can summarize the content in specific way, extract information and data, or find answers to your questions about the repositories.
+Description: Scan multiple Git repositories, pull specified files content and process it with Large Language Models. You can summarize the content in a specific way, extract information and data, or find answers to your questions about the repositories. The output can be stored in a vector database and used for semantic search or as a part of a RAG (Retrieval Augmented Generation) prompt.
 License: MIT + file LICENSE
 Encoding: UTF-8
 Roxygen: list(markdown = TRUE)
@@ -15,8 +15,8 @@ Depends:
     R (>= 4.1.0)
 Imports:
     cli (>= 3.4.0),
-    elmer,
-    GitStats,
+    ellmer,
+    GitStats (>= 2.2.0),
     httr2,
     lubridate,
     R6,
@@ -28,5 +28,9 @@ Imports:
 Suggests: 
     testthat (>= 3.0.0),
     shiny,
-    withr
+    shinychat,
+    withr,
+    knitr,
+    rmarkdown
 Config/testthat/edition: 3
+VignetteBuilder: knitr
diff --git a/R/GitAI-package.R b/R/GitAI-package.R
@@ -8,6 +8,6 @@
 #' within file marked at '.Rbuildignore' file.
 missing_deps_note_fix <- function() {
   R6::R6Class
-  elmer::chat_ollama
+  ellmer::chat_ollama
   lubridate::as_datetime
 }
diff --git a/R/Pinecone.R b/R/Pinecone.R
@@ -114,7 +114,7 @@ Pinecone <- R6::R6Class(
         })
     },
 
-    list_record_IDs = function() {
+    list_record_ids = function() {
 
       pinecone_api_key <- Sys.getenv("PINECONE_API_KEY")
 
@@ -147,7 +147,7 @@ Pinecone <- R6::R6Class(
         has_next_page <- "pagination" %in% names(response_body)
       }
 
-      return(record_ids)
+      record_ids
     },
 
     purge_records = function(ids) {

diff --git a/R/process_content.R b/R/process_content.R
@@ -16,7 +16,6 @@ process_content <- function(gitai, content, max_words = 80000, verbose) {
   }
 
   llm_clone <- gitai$llm$clone(deep = TRUE)
-
   llm_clone$chat(content)
 
   turn <- llm_clone$last_turn("assistant")

diff --git a/R/process_repos.R b/R/process_repos.R
@@ -21,13 +21,12 @@ process_repos <- function(
     add_contributors = FALSE,
     verbose = verbose
   )
-  GitStats::get_files_structure(
+  files_content <- GitStats::get_files(
     gitstats,
     pattern = paste0(gitai$files, collapse = "|"),
     depth = depth,
     verbose = verbose
   )
-  files_content <- GitStats::get_files_content(gitstats, verbose = verbose)
 
   distinct_repos <- files_content |>
     dplyr::distinct(repo_name, api_url)

diff --git a/R/set_llm.R b/R/set_llm.R
@@ -3,15 +3,15 @@
 #' @name set_llm
 #' @param gitai A \code{GitAI} object.
 #' @param provider Name of LLM provider, a string. Results with setting up LLM using
-#'   \code{elmer::chat_<provider>} function.
-#' @param ... Other arguments to pass to corresponding \code{elmer::chat_<provider>} function.
+#'   \code{ellmer::chat_<provider>} function.
+#' @param ... Other arguments to pass to corresponding \code{ellmer::chat_<provider>} function.
 #'   Please use \link{get_llm_defaults} to get default model arguments.
 #' @return A \code{GitAI} object.
 #' @export
 set_llm <- function(gitai, provider = "openai", ...) {
 
   provider_method <- rlang::env_get(
-    env = asNamespace("elmer"),
+    env = asNamespace("ellmer"),
     nm = glue::glue("chat_{provider}")
   )
   provider_args <- purrr::list_modify(

diff --git a/R/test-helpers.R b/R/test-helpers.R
@@ -136,7 +136,7 @@ PineconeMocked <- R6::R6Class(
         })
     },
 
-    list_record_IDs = function() {
+    list_record_ids = function() {
       pinecone_api_key <- Sys.getenv("PINECONE_API_KEY")
 
       url <- paste0("https://", private$.index_host)
@@ -153,7 +153,7 @@ PineconeMocked <- R6::R6Class(
         )
 
       response <- httr2::response_json(
-        body = test_fixtures[["list_record_IDs"]]
+        body = test_fixtures[["list_record_ids"]]
       )
 
       response_body <- httr2::resp_body_json(response)
@@ -261,7 +261,7 @@ test_fixtures[["read_record"]] <- list(
   "usage" = list("readUnits" = 1L)
 )
 
-test_fixtures[["list_record_IDs"]] <- list(
+test_fixtures[["list_record_ids"]] <- list(
   "vectors" = list(
     list(
       "id" = "project_1"

diff --git a/README.Rmd b/README.Rmd
@@ -19,15 +19,46 @@ knitr::opts_chunk$set(
 [![Codecov test coverage](https://codecov.io/gh/r-world-devs/GitAI/graph/badge.svg)](https://app.codecov.io/gh/r-world-devs/GitAI)
 <!-- badges: end -->
 
-The goal of GitAI is to derive knowledge from GitHub or GitLab repositories with the use of AI/LLM (Large Language Models). With GitAI you can easily:
+> The goal of `GitAI` is to **extract knowledge from Git repositories** with the use of AI/LLM (Large Language Models). 
 
-- set up your project scope (Git repositories),
-- select content of interest (files and file types),
-- choose your LLM backend,
-- define the LLM prompts,
-- process content of all repositories with a single function call.
+## Motivation
 
-And all of that in a nice tidyverse style.
+Large organizations need to deal with a massive number of Git repositories
+(both internal and external). Those repositories can be hosted on different
+platforms (like `GitHub` and `GitLab`).
+
+It is very difficult or even impossible to review all those repositories 
+manually, especially if one needs to perform an exploratory search, 
+not knowing the exact keywords that should be used.
+
+Because of that the reusability of the knowledge (and code) hidden in the 
+repositories is a constant challenge.
+
+## Solution
+
+We propose the `GitAI` framework written in R.
+
+It is applicable to multiple use cases related to extracting knowledge from Git repositories. 
+At the same time, it is IT infrastructure agnostic. It is designed to work with
+different backends, LLMs, embedding models, and vector databases.
+Adapting to particular backends may need implementation of new classes, but
+the core functionality stays the same.
+
+## Workflow
+
+A typical `GitAI` workflow looks like this:
+
+1. Set up your project.
+    1. Set up your project scope (Git repositories).
+    1. Select content type of interest (files and file types).
+    1. Choose your LLM backend.
+    1. Define the LLM prompts.
+    1. (Optional) Choose embedding model and vector database provider.
+1. Process content of all repositories with a single function call.
+    1. (Optional) If a vector database is set up, the results will be stored there.
+1. Use the information extracted from the content of files in the Git repositories.
+    1. (Optional) If results are stored in a vector database, 
+  they can be searched using *semantic search* or used as a part of a RAG (*Retrieval Augmented Generation*) prompt.
 
 ## Installation
 
@@ -38,21 +69,43 @@ You can install the development version of `GitAI` from [GitHub](https://github.
 pak::pak("r-world-devs/GitAI")
 ```
 
-## Example workflow
-
-Basic workflow could look like:
+## Simplified example (without vector database usage)
 
 ```{r}
 library(GitAI)
-# Set up project
+```
+
+Let's set up a project `fascinating_project` that will extract some summaries from the content of the `README.md` files in a few selected Git repositories.
+
+
+```{r}
+options(ellmer_timeout_s = 120)
 verbose_off()
 my_project <- initialize_project("fascinating_project") |>
-  set_github_repos(repos = c("r-world-devs/GitStats", "r-world-devs/GitAI", "openpharma/DataFakeR")) |>
+  set_github_repos(
+    repos = c(
+      "r-world-devs/GitStats", 
+      "r-world-devs/GitAI", 
+      "openpharma/DataFakeR"
+    )
+  ) |>
   add_files(files = "README.md") |>
   set_llm() |>
   set_prompt("Write one-sentence summary for a project based on given input.")
+```
 
-# Get the results
+Now, let's get the results and print them.
+
+```{r}
 results <- process_repos(my_project)
-purrr::map(results, ~.$text)
+
+purrr::walk(results, function(result) {
+  result$text |> stringr::str_wrap(width = 80) |> cat("\n\n")
+})
 ```
+
+## See also
+
+Under the hood, `GitAI` uses the `GitStats` R package.
+If you want to use it directly for pulling git data, check out:
+[https://r-world-devs.github.io/GitStats/](https://r-world-devs.github.io/GitStats/)
diff --git a/README.md b/README.md
@@ -9,17 +9,51 @@
 coverage](https://codecov.io/gh/r-world-devs/GitAI/graph/badge.svg)](https://app.codecov.io/gh/r-world-devs/GitAI)
 <!-- badges: end -->
 
-The goal of GitAI is to derive knowledge from GitHub or GitLab
-repositories with the use of AI/LLM (Large Language Models). With GitAI
-you can easily:
+> The goal of `GitAI` is to **extract knowledge from Git repositories**
+> with the use of AI/LLM (Large Language Models).
 
-- set up your project scope (Git repositories),
-- select content of interest (files and file types),
-- choose your LLM backend,
-- define the LLM prompts,
-- process content of all repositories with a single function call.
+## Motivation
 
-And all of that in a nice tidyverse style.
+Large organizations need to deal with a massive number of Git repositories
+(both internal and external). Those repositories can be hosted on
+different platforms (like `GitHub` and `GitLab`).
+
+It is very difficult or even impossible to review all those repositories
+manually, especially if one needs to perform an exploratory search, not
+knowing the exact keywords that should be used.
+
+Because of that the reusability of the knowledge (and code) hidden in
+the repositories is a constant challenge.
+
+## Solution
+
+We propose the `GitAI` framework written in R.
+
+It is applicable to multiple use cases related to extracting knowledge
+from Git repositories. At the same time, it is IT infrastructure agnostic.
+It is designed to work with different backends, LLMs, embedding models,
+and vector databases. Adapting to particular backends may need
+implementation of new classes, but the core functionality stays the
+same.
+
+## Workflow
+
+A typical `GitAI` workflow looks like this:
+
+1.  Set up your project.
+    1.  Set up your project scope (Git repositories).
+    2.  Select content type of interest (files and file types).
+    3.  Choose your LLM backend.
+    4.  Define the LLM prompts.
+    5.  (Optional) Choose embedding model and vector database provider.
+2.  Process content of all repositories with a single function call.
+    1.  (Optional) If a vector database is set up, the results will be
+        stored there.
+3.  Use the information extracted from the content of files in the Git
+    repositories.
+    1.  (Optional) If results are stored in a vector database, they can be
+        searched using *semantic search* or used as a part of a RAG
+        (*Retrieval Augmented Generation*) prompt.
 
 ## Installation
 
@@ -31,29 +65,56 @@ You can install the development version of `GitAI` from
 pak::pak("r-world-devs/GitAI")
 ```
 
-## Example workflow
-
-Basic workflow could look like:
+## Simplified example (without vector database usage)
 
 ``` r
 library(GitAI)
-# Set up project
+```
+
+Let’s set up a project `fascinating_project` that will extract some
+summaries from the content of the `README.md` files in a few selected
+git repositories.
+
+``` r
+options(ellmer_timeout_s = 120)
 verbose_off()
 my_project <- initialize_project("fascinating_project") |>
-  set_github_repos(repos = c("r-world-devs/GitStats", "r-world-devs/GitAI", "openpharma/DataFakeR")) |>
+  set_github_repos(
+    repos = c(
+      "r-world-devs/GitStats", 
+      "r-world-devs/GitAI", 
+      "openpharma/DataFakeR"
+    )
+  ) |>
   add_files(files = "README.md") |>
   set_llm() |>
   set_prompt("Write one-sentence summary for a project based on given input.")
+```
+
+Now, let’s get the results and print them.
 
-# Get the results
+``` r
 results <- process_repos(my_project)
-purrr::map(results, ~.$text)
-#> $GitStats
-#> [1] "GitStats is an R package that enables users to extract and analyze GitHub and GitLab data, such as repository details, commits, and user activity, in a standardized table format."
+
+purrr::walk(results, function(result) {
+  result$text |> stringr::str_wrap(width = 80) |> cat("\n\n")
+})
+#> GitStats is an experimental R package that facilitates the extraction
+#> and analysis of git data from GitHub and GitLab, providing insights into
+#> repositories, commits, users, and R package usage in a structured format. 
 #> 
-#> $GitAI
-#> [1] "GitAI is an R package designed to harness the power of AI and Large Language Models to extract insights from GitHub or GitLab repositories in a user-friendly, tidyverse style, enabling users to set project scopes, select content of interest, and process repositories with ease."
+#> GitAI is an R package that leverages AI and Large Language Models to extract
+#> insights from GitHub or GitLab repositories, allowing users to define project
+#> scopes, select relevant content, and process repositories efficiently in a
+#> tidyverse-compliant manner. 
 #> 
-#> $DataFakeR
-#> [1] "DataFakeR is an experimental R package designed to generate fake data samples that maintain specified characteristics of original datasets, streamlined through customizable configurations and schema management."
+#> DataFakeR is an R package that enables users to generate synthetic datasets
+#> while maintaining specified assumptions about the original data structure,
+#> facilitating data simulation for testing and analysis.
 ```
+
+## See also
+
+Under the hood, `GitAI` uses the `GitStats` R package. If you want to
+use it directly for pulling git data, check out:
+<https://r-world-devs.github.io/GitStats/>
-Original file line number
+Diff line change
@@ Expand Up @@
       }
       llm_clone <- gitai$llm$clone(deep = TRUE)
       llm_clone$chat(content)
       turn <- llm_clone$last_turn("assistant")
@@ Expand Down @@