diff --git a/.gitignore b/.gitignore index 4c14954..9d7770f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ .Rproj.user .Renviron docs +inst/demo-app/rsconnect/ diff --git a/DESCRIPTION b/DESCRIPTION index 71dfc96..5bee9d5 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -27,6 +27,7 @@ Imports: glue Suggests: testthat (>= 3.0.0), + shiny, withr Config/testthat/edition: 3 Config/testthat/parallel: true diff --git a/R/Pinecone.R b/R/Pinecone.R index aecb880..13c86c4 100644 --- a/R/Pinecone.R +++ b/R/Pinecone.R @@ -48,6 +48,34 @@ Pinecone <- R6::R6Class( response_body <- httr2::resp_body_json(response) response_body }, + + read_record = function(id) { + + pinecone_api_key <- Sys.getenv("PINECONE_API_KEY") + + url <- paste0("https://", private$.index_host) + + request <- httr2::request(url) |> + httr2::req_url_path_append("vectors") |> + httr2::req_url_path_append("fetch") |> + httr2::req_url_query( + ids = id, + namespace = private$.namespace + ) |> + httr2::req_headers( + "Api-Key" = pinecone_api_key, + "X-Pinecone-API-Version" = "2024-10" + ) + + response <- request |> + httr2::req_perform() + + response_body <- httr2::resp_body_json(response) + results <- response_body$vectors + + results + }, + find_records = function(query, top_k = 1) { diff --git a/R/VectorDatabase.R b/R/VectorDatabase.R index 519d299..d6ac8e7 100644 --- a/R/VectorDatabase.R +++ b/R/VectorDatabase.R @@ -11,6 +11,10 @@ VectorDatabase <- R6::R6Class( stop(call. = FALSE, "Not implemented yet.") }, + read_record = function(id) { + stop(call. = FALSE, "Not implemented yet.") + }, + find_records = function(query, top_k = 1) { stop(call. 
= FALSE, "Not implemented yet.") } diff --git a/R/add_metadata.R b/R/add_metadata.R index 7441a47..7557e72 100644 --- a/R/add_metadata.R +++ b/R/add_metadata.R @@ -1,5 +1,5 @@ #' @noRd -add_metadata <- function(result, content) { +add_metadata <- function(result, content, timestamp) { web_url <- content$repo_url[1] api_url <- content$api_url[1] if (grepl("github", api_url)) { @@ -10,7 +10,7 @@ add_metadata <- function(result, content) { result[["metadata"]] <- list( repo_url = web_url, files = paste0(content$file_path, collapse = ", "), - timestamp = get_repo_date(api_url) + timestamp = timestamp ) result } diff --git a/R/process_content.R b/R/process_content.R index a0139fa..e396b58 100644 --- a/R/process_content.R +++ b/R/process_content.R @@ -1,6 +1,19 @@ -process_content <- function(gitai, content) { +process_content <- function(gitai, content, max_words = 80000, verbose) { - # TODO: check if it fits in the context window + words <- strsplit(content, "\\s+")[[1]] + num_words <- length(words) + if (verbose) cli::cli_alert_info("Repo content has {num_words} words") + + if (num_words > max_words) { + if (verbose) { + cli::cli_alert_warning("Repo content is probably too long, trimming...") + } + trimmed_words <- words[seq_len(min(length(words), max_words))] + content <- paste(trimmed_words, collapse = " ") + if (verbose) { + cli::cli_alert_info("Repo content has now {length(trimmed_words)} words.") + } + } llm_clone <- gitai$llm$clone(deep = TRUE) diff --git a/R/process_repos.R b/R/process_repos.R index 1057052..376f54e 100644 --- a/R/process_repos.R +++ b/R/process_repos.R @@ -1,15 +1,19 @@ #' Run LLM on `GitAI` repositories content #' @name process_repos #' @param gitai A \code{GitAI} object. +#' @param depth A numeric, maximum depth of folders to process. #' @param verbose A logical. If \code{FALSE} you won't be getting #' additional diagnostic messages. #' @return A list.
#' @export process_repos <- function( gitai, + depth = 1, verbose = is_verbose() ) { + repo_name <- api_url <- NULL + gitstats <- gitai$gitstats gitai$repos_metadata <- GitStats::get_repos( @@ -18,18 +22,31 @@ process_repos <- function( verbose = verbose ) GitStats::get_files_structure( - gitstats_object = gitstats, + gitstats, pattern = paste0(gitai$files, collapse = "|"), - depth = Inf, + depth = depth, verbose = verbose ) files_content <- GitStats::get_files_content(gitstats, verbose = verbose) - repositories <- unique(files_content$repo_name) + + distinct_repos <- files_content |> + dplyr::distinct(repo_name, api_url) + + repositories <- distinct_repos$repo_name + api_urls <- distinct_repos$api_url + results <- - repositories |> - purrr::map(function(repo_name) { + purrr::map2(repositories, api_urls, function(repo_name, api_url) { + + current_repo_number <- which(repositories == repo_name) + if (verbose) { - cli::cli_alert_info("Processing repository: {.pkg {repo_name}}") + cli::cli_alert(paste0( + "Processing repository ", + "[{current_repo_number}/{length(repositories)} ", + "{round(current_repo_number / length(repositories) * 100, 2)}%]: ", + "{.pkg {repo_name}}" + )) } filtered_content <- files_content |> @@ -39,14 +56,43 @@ process_repos <- function( dplyr::pull(file_content) |> paste(collapse = "\n\n") + if (grepl("github", api_url)) { + api_url <- github_repo(api_url) + } else { + api_url <- gitlab_repo(api_url) + } + repo_timestamp <- get_repo_date(api_url) + + if (!is.null(gitai$db)) { + if (verbose) { + cli::cli_alert_info("Checking repo timestamp...") + } + record <- gitai$db$read_record(id = repo_name) + + if (NROW(record) > 0) { + + record <- record[[1]] + record_timestamp <- as.POSIXct(record$metadata$timestamp, tz = "UTC") + + if (repo_timestamp <= record_timestamp) { + if (verbose) { + cli::cli_alert_info("Repo has not been updated. 
Skipping...") + } + return(NULL) + } + } + } + if (verbose) { cli::cli_alert_info("Processing content with LLM...") } + result <- process_content( gitai = gitai, - content = content_to_process + content = content_to_process, + verbose = verbose ) |> - add_metadata(content = filtered_content) + add_metadata(content = filtered_content, timestamp = repo_timestamp) if (!is.null(gitai$db)) { if (verbose) { diff --git a/R/run_demo.R b/R/run_demo.R new file mode 100644 index 0000000..435f53c --- /dev/null +++ b/R/run_demo.R @@ -0,0 +1,6 @@ +run_demo <- function() { + + app_folder <- system.file("demo-app", package = "GitAI") + + shiny::runApp(app_folder) +} diff --git a/devel/deploy_demo.R b/devel/deploy_demo.R new file mode 100644 index 0000000..11c0762 --- /dev/null +++ b/devel/deploy_demo.R @@ -0,0 +1,17 @@ +rstudioapi::restartSession() + +rsconnect::setAccountInfo( + name = 'kalimu', + token = Sys.getenv("SHINYAPPSIO_TOKEN"), + secret = Sys.getenv("SHINYAPPSIO_SECRET") +) + +# pak::pkg_install("r-world-devs/GitAI") + +rsconnect::deployApp( + appDir = "inst/demo-app", + account = "kalimu", + appName = "GitAI-demo" +) + +# https://kalimu.shinyapps.io/GitAI-demo/ \ No newline at end of file diff --git a/devel/sandbox.R b/devel/sandbox.R index c8e9d41..6f4fe1a 100644 --- a/devel/sandbox.R +++ b/devel/sandbox.R @@ -1,5 +1,4 @@ -my_project <- - initialize_project("gitai-demo") |> +my_project <- initialize_project("gitai-demo") |> set_database( provider = "Pinecone", index = "gitai", @@ -7,86 +6,68 @@ my_project <- ) |> set_llm(seed = 1014, api_args = list(temperature = 0)) -my_project <- - my_project |> - set_github_repos(repos = c( - "r-world-devs/GitStats", - "r-world-devs/GitAI", - "openpharma/DataFakeR" - )) |> - add_files(files = c("README.md")) - -my_project <- - my_project |> - set_prompt(paste( - "Write two paragraphs of summary for a project based on given input.", - "Highlight business value of the project, its functionality, main features,", - "and use cases."
+my_project <- my_project |> + set_github_repos( + # repos = c( + # "r-world-devs/GitStats", + # "r-world-devs/GitAI", + # "r-world-devs/cohortBuilder", + # "r-world-devs/shinyCohortBuilder", + # "r-world-devs/shinyQueryBuilder", + # "r-world-devs/queryBuilder", + # "r-world-devs/shinyGizmo", + # "r-world-devs/shinyTimelines", + # "openpharma/DataFakeR" + # ) + orgs = c( + "insightsengineering", + "openpharma", + "pharmaverse", + "tidymodels", + "r-lib", + "rstudio", + "tidyverse" + ) + ) |> + add_files(c( + "DESCRIPTION", + "*.md", + "*.Rmd" )) + +my_project <- my_project |> + set_prompt(r"( + Write up to ten paragraphs of summary for a project based on given input. + Be precise and to the point in your answers. + Mention core functionality and all main features of the project. + If available, mention briefly the technology used in the project + (like R, Python, etc). + If available, mention briefly if a project is an R package, shiny app, + or other type of tool. + )") + +custom_function <- function(provider, req) { + + req |> + httr2::req_timeout(60) |> + httr2::req_perform() |> + httr2::resp_body_json() +} +unlockBinding("chat_perform_value", asNamespace("elmer")) +assign("chat_perform_value", custom_function, envir = asNamespace("elmer")) +lockBinding("chat_perform_value", asNamespace("elmer")) results <- process_repos(my_project) - -results |> dplyr::glimpse() -purrr::map(results, ~.$text) - - - - - - -# my_project <- -# initialize_project(project_id = "gitai-demo") |> -# set_database(index = "gitai") - -my_project |> find_records("I'm looking for an R package to create synthetic datasets.") - -my_project |> find_records("How can I check statisting of git repositories.") - -my_project |> find_records("Can I somehow extract information from code from git repositories?") - -my_project |> find_records("What are the tools that can help me in my work as a Data Scientist?") +# results |> dplyr::glimpse() +# purrr::map(results, ~.$text) my_project |> -
find_records("As a Manager I look for tools which let me learn what tools can be reused in my company", top_k = 2) -my_project |> find_records("Which data products could have large impact in global company that maintains many repositories?", top_k = 3) + find_records("How can I create synthetic datasets?", top_k = 3) -my_project |> find_records("Szukam narzędzi które wykorzystują sztuczną inteligencję?") - - - - - - -my_chatbot <- - initialize_project("gitai-demo") |> - set_database(index = "gitai") |> - set_llm(seed = 1014, api_args = list(temperature = 0)) |> - set_prompt(paste( - "As a helpful assistant, answer user question using only the provided input." - )) - -get_answer <- function(my_chatbot, query) { - - cat("\n") - my_chatbot$llm$chat(paste( - "User query:", query, "\n\n", - "Known input for the answer:", - my_project$db$find_records(query = query, top_k = 1) - )) |> - cat() -} - -my_chatbot |> - get_answer("I'm looking for an R package to create synthetic datasets.") - -my_chatbot |> - get_answer("How can I check statisting of git repositories?") - -my_chatbot |> - get_answer("Can I somehow extract information from code from git repositories?") +my_project |> + find_records("How can I check statistics of git repositories.", top_k = 3) -my_chatbot |> - get_answer("I would love to use AI to process code files. Is it possible?
Give me the answer writting in one sentence with very funny style.") +my_project |> + find_records("Can I somehow extract information from file content from git repositories using LLM?") +run_demo() diff --git a/inst/demo-app/app.R b/inst/demo-app/app.R new file mode 100644 index 0000000..865482a --- /dev/null +++ b/inst/demo-app/app.R @@ -0,0 +1,62 @@ +library(shiny) +library(shinychat) +library(GitAI) + +readRenviron(".Renviron") + +gitai <- initialize_project("gitai-demo") |> + set_database(index = "gitai") |> + set_llm(seed = 1014, api_args = list(temperature = 0)) |> + set_prompt(r"( + As a helpful assistant, answer user question + using only the provided input. + Use only provided with the query known input + that is most relevant to the user's query. + Do not use any other information + apart from the input provided with the query. + Be concise but provide all important information. + Also always provide link to mentioned git repositories + with visible full URL for example: https://github.com/some_repository. + Do not mask it with any other text. + )") + +ui <- bslib::page_fluid( + bslib::layout_sidebar( + sidebar = shiny::sliderInput( + "top_k", + "Use top K results", + step = 1, + min = 1, + max = 10, + value = 5 + ), + shiny::HTML(markdown::markdownToHTML(fragment.only = TRUE, + "This is a demo app of the outputs from the `GitAI` open-source framework. The `GitAI` allows you to extract knowledge from GitHub and GitLab repositories with the use of AI/LLM (Large Language Models) at scale and with minimum costs.\n\nThe results you see in the chatbot are from processing **800+** repositories in multiple public GitHub organizations: `r-world-devs`, `openpharma`, `pharmaverse`, `tidymodels`, `r-lib`, `rstudio`, `tidyverse`, `insightsengineering`.
In the repositories we are scanning the following files: `DESCRIPTION`, `*.md`, and `*.Rmd` files, so it includes files like `README.md` or R package vignettes.\n\nFor this demo we use a simple and cheap LLM `gpt-4o-mini` from OpenAI. As embeddings we use `multilingual-e5-large` embedding model from Pinecone as well as its vector database with 1024 dimensions. The overall one-time cost of processing all 800+ repositories is **less than $1** with this setup (yes, one USD!). \nEven more impressive results can be achieved with more powerful LLMs, and higher-dimensional embeddings.\n See more: [GitAI](https://github.com/r-world-devs/GitAI)\n\nTry to chat with the chatbot to find and reuse tools for your specific needs, for example, you can ask:\n`I need to filter datasets and build cohorts interactively in a shiny dashboard. What dashboarding component could I use?`" )), + chat_ui("chat") + ) +) + +server <- function(input, output, session) { + + user_chatbot <- gitai$llm$clone() + + shiny::observeEvent(input$chat_user_input, { + + query <- input$chat_user_input + + stream <- user_chatbot$stream_async( + paste( + "User query:", query, "\n\n", + "Known input provided for the answer:\n\n", + gitai$db$find_records(query = query, top_k = input$top_k) + ) + ) + chat_append("chat", stream) + }) +} + +shinyApp(ui, server) + + diff --git a/man/process_repos.Rd b/man/process_repos.Rd index 670aa41..9d24d1a 100644 --- a/man/process_repos.Rd +++ b/man/process_repos.Rd @@ -4,11 +4,13 @@ \alias{process_repos} \title{Run LLM on \code{GitAI} repositories content} \usage{ -process_repos(gitai, verbose = is_verbose()) +process_repos(gitai, depth = 1, verbose = is_verbose()) } \arguments{ \item{gitai}{A \code{GitAI} object.} +\item{depth}{A numeric, maximum depth of folders to process.} + \item{verbose}{A logical.
If \code{FALSE} you won't be getting additional diagnostic messages.} } diff --git a/tests/testthat/test-Pinecone.R b/tests/testthat/test-Pinecone.R index 1d7c078..f18b741 100644 --- a/tests/testthat/test-Pinecone.R +++ b/tests/testthat/test-Pinecone.R @@ -1,10 +1,10 @@ test_that("getting index metadata", { db <- Pinecone$new( - namespace = "test_project_id", + namespace = "test_project_id", index = "gitai" ) - + index <- db$get_index_metadata() index$host |> is.character() |> expect_true() }) @@ -12,10 +12,10 @@ test_that("getting index metadata", { test_that("getting embeddings", { db <- Pinecone$new( - namespace = "test_project_id", + namespace = "test_project_id", index = "gitai" ) - + test_text <- "Apple is a popular fruit known for its sweetness and crisp texture." embeddings <- db$.__enclos_env__$private$.get_embeddings(text = test_text) @@ -23,12 +23,12 @@ test_that("getting embeddings", { }) test_that("writting records", { - + db <- Pinecone$new( - namespace = "test_project_id", + namespace = "test_project_id", index = "gitai" ) - + test_texts <- c( "Apple is a popular fruit known for its sweetness and crisp texture.", "The tech company Apple is known for its innovative products like the iPhone.", @@ -39,11 +39,11 @@ test_that("writting records", { ) for (i in seq_along(test_texts)) { - + result <- db$write_record( id = paste0("id_", i), text = test_texts[i] - ) + ) result$upsertedCount |> expect_equal(1) } @@ -52,14 +52,14 @@ test_that("writting records", { test_that("finding records", { Sys.sleep(3) - + db <- Pinecone$new( namespace = "test_project_id", index = "gitai" ) result <- db$find_records( - query = "Tell me about Apple Tech computer company.", + query = "Tell me about Apple Tech computer company.", top_k = 1 ) @@ -69,10 +69,23 @@ test_that("finding records", { result[[1]]$score |> is.numeric() |> expect_true() result_2 <- db$find_records( - query = "Tell me about apple fruit.", + query = "Tell me about apple fruit.", top_k = 1 ) - + 
expect_false(result_2[[1]]$id == result[[1]]$id) }) +test_that("reading records", { + + db <- Pinecone$new( + namespace = "test_project_id", + index = "gitai" + ) + + result <- db$read_record(id = "id_1") + + result[[1]]$metadata$text |> + is.character() |> + expect_true() +}) diff --git a/tests/testthat/test-add_metadata.R b/tests/testthat/test-add_metadata.R index ea2ea09..35b4541 100644 --- a/tests/testthat/test-add_metadata.R +++ b/tests/testthat/test-add_metadata.R @@ -9,16 +9,13 @@ test_that("metadata is added to content", { repo_url = c("test_URL", "test_URL"), api_url = c("test_URL", "test_URL") ) - testthat::with_mocked_bindings({ - result_with_metadata <- "result" |> - test_mocker$use() |> - add_metadata( - content = mocked_files_content - ) - expect_true("metadata" %in% names(result_with_metadata)) - expect_type(result_with_metadata[["metadata"]], "list") - expect_equal(names(result_with_metadata[["metadata"]]), c("repo_url", "files", "timestamp")) - }, - get_repo_date = function(api_url) Sys.time() - ) + result_with_metadata <- "result" |> + test_mocker$use() |> + add_metadata( + content = mocked_files_content, + timestamp = Sys.Date() + ) + expect_true("metadata" %in% names(result_with_metadata)) + expect_type(result_with_metadata[["metadata"]], "list") + expect_equal(names(result_with_metadata[["metadata"]]), c("repo_url", "files", "timestamp")) }) diff --git a/tests/testthat/test-process_content.R b/tests/testthat/test-process_content.R index a5828f3..a7884ca 100644 --- a/tests/testthat/test-process_content.R +++ b/tests/testthat/test-process_content.R @@ -3,7 +3,7 @@ test_that("processing content have proper output structure", { set_llm() |> set_prompt(system_prompt = "Say 'Hi there!' 
only and nothing else.") - result <- process_content(gitai = my_project, content = "") + result <- process_content(gitai = my_project, content = "", verbose = FALSE) expect_equal(result$text, "Hi there!") expect_true(is.numeric(result$tokens)) expect_true(is.list(result$output)) @@ -26,17 +26,18 @@ test_that("processing a single file content with deterministic output", { httr2::with_verbosity(verbosity = 0, { result <- process_content( gitai = my_project, - content = test_content + content = test_content, + verbose = FALSE ) }) expect_length(gregexpr("\\.", result$text)[[1]], 1) expect_equal( result$text, - process_content(gitai = my_project, content = test_content)$text + process_content(gitai = my_project, content = test_content, verbose = FALSE)$text ) expect_equal( result$text, - process_content(gitai = my_project, content = test_content)$text + process_content(gitai = my_project, content = test_content, verbose = FALSE)$text ) test_mocker$cache(result)