From 86fa8d4ab360825553a3c174d1e0d913f41273f3 Mon Sep 17 00:00:00 2001 From: Kamil Wais Date: Thu, 12 Dec 2024 09:48:15 +0100 Subject: [PATCH 1/6] added demo app --- .gitignore | 1 + R/run_demo.R | 6 +++ devel/sandbox.R | 118 ++++++++++++++++++------------------------------ inst/demo/app.R | 57 +++++++++++++++++++++++ 4 files changed, 109 insertions(+), 73 deletions(-) create mode 100644 R/run_demo.R create mode 100644 inst/demo/app.R diff --git a/.gitignore b/.gitignore index 4c14954..1ead979 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ .Rproj.user .Renviron docs +inst/demo/rsconnect/shinyapps.io diff --git a/R/run_demo.R b/R/run_demo.R new file mode 100644 index 0000000..6d5d557 --- /dev/null +++ b/R/run_demo.R @@ -0,0 +1,6 @@ +run_demo <- function() { + + app_folder <- system.file("demo", package = "GitAI") + + shiny::runApp(app_folder) +} diff --git a/devel/sandbox.R b/devel/sandbox.R index c8e9d41..9c63c10 100644 --- a/devel/sandbox.R +++ b/devel/sandbox.R @@ -1,5 +1,4 @@ -my_project <- - initialize_project("gitai-demo") |> +my_project <- initialize_project("gitai-demo") |> set_database( provider = "Pinecone", index = "gitai", @@ -7,86 +6,59 @@ my_project <- ) |> set_llm(seed = 1014, api_args = list(temperature = 0)) -my_project <- - my_project |> - set_github_repos(repos = c( - "r-world-devs/GitStats", - "r-world-devs/GitAI", - "openpharma/DataFakeR" - )) |> - add_files(files = c("README.md")) - -my_project <- - my_project |> - set_prompt(paste( - "Write two paragraphs of summary for a project based on given input.", - "Highlight business value of the project, its functionality, main features,", - "and use cases." +my_project <- my_project |> + set_github_repos( + repos = c( + "r-world-devs/GitStats", + "r-world-devs/GitAI", + "r-world-devs/cohortBuilder", + "r-world-devs/shinyCohortBuilder", + "r-world-devs/shinyQueryBuilder", + "r-world-devs/queryBuilder", + "r-world-devs/shinyGizmo", + "r-world-devs/shinyTimelines", + "openpharma/DataFakeR" + ) + # orgs = c( + # # "r-lib", + # # "rstudio", + # "tidyverse" + # ) + ) |> + add_files(c( + # "DESCRIPTION", + # "project_metadata.yaml", + # "*.md", + "README.md" )) + + +my_project <- my_project |> + set_prompt(r"( + Write about five paragraphs of summary for a project based on given input. + Be precise and to the point in your answers. + Mention core functionality and all main features of the project. + If available, mention brifly the technology used in the project + (like R, Python, etc). + If available, mention brifly if a project is an R package, shiny app, + or other type of tool. + )") results <- process_repos(my_project) results |> dplyr::glimpse() purrr::map(results, ~.$text) - - - - - -# my_project <- -# initialize_project(project_id = "gitai-demo") |> -# set_database(index = "gitai") - -my_project |> find_records("I'm looking for an R package to create synthetic datasets.") - -my_project |> find_records("How can I check statisting of git repositories.") - -my_project |> find_records("Can I somehow extract information from code from git repositories?") - -my_project |> find_records("What are the tools that can help me in my work as a Data Scientist?") - my_project |> - find_records("As a Manager I look for tools which let me learn what tools can be reused in my company", top_k = 2) - -my_project |> find_records("Which data products could have large impact in global company that maintains many repositories?", top_k = 3) - -my_project |> find_records("Szukam narzędzi które wykorzystują sztuczną inteligencję?") + find_records("How can I create synthetic datasets?") +my_project |> + find_records("How can I check statisting of git repositories.") +my_project |> + find_records("Can I somehow extract information from file content from git repositories using LLM?") +my_project |> + find_records("What could help me managing many git repositories?", top_k = 2) - - - -my_chatbot <- - initialize_project("gitai-demo") |> - set_database(index = "gitai") |> - set_llm(seed = 1014, api_args = list(temperature = 0)) |> - set_prompt(paste( - "As a helpful assistant, answer user question using only the provided input." - )) - -get_answer <- function(my_chatbot, query) { - - cat("\n") - my_chatbot$llm$chat(paste( - "User query:", query, "\n\n", - "Known input for the answer:", - my_project$db$find_records(query = query, top_k = 1) - )) |> - cat() -} - -my_chatbot |> - get_answer("I'm looking for an R package to create synthetic datasets.") - -my_chatbot |> - get_answer("How can I check statisting of git repositories?") - -my_chatbot |> - get_answer("Can I somehow extract information from code from git repositories?") - -my_chatbot |> - get_answer("I would love to use AI to process code files. Is it possible? Give me the answer writting in one sentence with very funny style.") - +run_demo() diff --git a/inst/demo/app.R b/inst/demo/app.R new file mode 100644 index 0000000..98b5a6c --- /dev/null +++ b/inst/demo/app.R @@ -0,0 +1,57 @@ +library(shiny) +library(shinychat) +library(GitAI) + +readRenviron(".Renviron") + +gitai <- initialize_project("gitai-demo") |> + set_database(index = "gitai") |> + set_llm(seed = 1014, api_args = list(temperature = 0)) |> + set_prompt(r"( + As a helpful assistant, answer user question + using only the provided input. + Use only provided with the query known input + that is most relevent to the user's query. + Do know use any other information apart from input provided with the query. + Be concise and to the point in your answers. + Also awalys provide link to mentioned git repositories + with visible full URL for example: https://github.com/some_repository. + Do not mask it with any other text. + )") + +ui <- bslib::page_fluid( + bslib::layout_sidebar( + sidebar = shiny::sliderInput( + "top_k", + "Use top K results", + step = 1, + min = 1, + max = 5, + value = 5 + ), + chat_ui("chat") + ) +) + +server <- function(input, output, session) { + + user_chatbot <- gitai$llm$clone() + + shiny::observeEvent(input$chat_user_input, { + + query <- input$chat_user_input + + stream <- user_chatbot$stream_async( + paste( + "User query:", query, "\n\n", + "Known input provided for the answer:\n\n", + gitai$db$find_records(query = query, top_k = input$top_k) + ) + ) + chat_append("chat", stream) + }) +} + +shinyApp(ui, server) + + From 4cb0022a74c87ee43edb07d99e8c0c8107e6e862 Mon Sep 17 00:00:00 2001 From: Kamil Wais Date: Thu, 12 Dec 2024 09:48:15 +0100 Subject: [PATCH 2/6] added demo app --- .gitignore | 1 + R/run_demo.R | 6 +++ devel/sandbox.R | 118 ++++++++++++++++++------------------------------ inst/demo/app.R | 57 +++++++++++++++++++++++ 4 files changed, 109 insertions(+), 73 deletions(-) create mode 100644 R/run_demo.R create mode 100644 inst/demo/app.R diff --git a/.gitignore b/.gitignore index 4c14954..1ead979 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ .Rproj.user .Renviron docs +inst/demo/rsconnect/shinyapps.io diff --git a/R/run_demo.R b/R/run_demo.R new file mode 100644 index 0000000..6d5d557 --- /dev/null +++ b/R/run_demo.R @@ -0,0 +1,6 @@ +run_demo <- function() { + + app_folder <- system.file("demo", package = "GitAI") + + shiny::runApp(app_folder) +} diff --git a/devel/sandbox.R b/devel/sandbox.R index c8e9d41..9c63c10 100644 --- a/devel/sandbox.R +++ b/devel/sandbox.R @@ -1,5 +1,4 @@ -my_project <- - initialize_project("gitai-demo") |> +my_project <- initialize_project("gitai-demo") |> set_database( provider = "Pinecone", index = "gitai", @@ -7,86 +6,59 @@ my_project <- ) |> set_llm(seed = 1014, api_args = list(temperature = 0)) -my_project <- - my_project |> - set_github_repos(repos = c( - "r-world-devs/GitStats", - "r-world-devs/GitAI", - "openpharma/DataFakeR" - )) |> - add_files(files = c("README.md")) - -my_project <- - my_project |> - set_prompt(paste( - "Write two paragraphs of summary for a project based on given input.", - "Highlight business value of the project, its functionality, main features,", - "and use cases." +my_project <- my_project |> + set_github_repos( + repos = c( + "r-world-devs/GitStats", + "r-world-devs/GitAI", + "r-world-devs/cohortBuilder", + "r-world-devs/shinyCohortBuilder", + "r-world-devs/shinyQueryBuilder", + "r-world-devs/queryBuilder", + "r-world-devs/shinyGizmo", + "r-world-devs/shinyTimelines", + "openpharma/DataFakeR" + ) + # orgs = c( + # # "r-lib", + # # "rstudio", + # "tidyverse" + # ) + ) |> + add_files(c( + # "DESCRIPTION", + # "project_metadata.yaml", + # "*.md", + "README.md" )) + + +my_project <- my_project |> + set_prompt(r"( + Write about five paragraphs of summary for a project based on given input. + Be precise and to the point in your answers. + Mention core functionality and all main features of the project. + If available, mention brifly the technology used in the project + (like R, Python, etc). + If available, mention brifly if a project is an R package, shiny app, + or other type of tool. + )") results <- process_repos(my_project) results |> dplyr::glimpse() purrr::map(results, ~.$text) - - - - - -# my_project <- -# initialize_project(project_id = "gitai-demo") |> -# set_database(index = "gitai") - -my_project |> find_records("I'm looking for an R package to create synthetic datasets.") - -my_project |> find_records("How can I check statisting of git repositories.") - -my_project |> find_records("Can I somehow extract information from code from git repositories?") - -my_project |> find_records("What are the tools that can help me in my work as a Data Scientist?") - my_project |> - find_records("As a Manager I look for tools which let me learn what tools can be reused in my company", top_k = 2) - -my_project |> find_records("Which data products could have large impact in global company that maintains many repositories?", top_k = 3) - -my_project |> find_records("Szukam narzędzi które wykorzystują sztuczną inteligencję?") + find_records("How can I create synthetic datasets?") +my_project |> + find_records("How can I check statisting of git repositories.") +my_project |> + find_records("Can I somehow extract information from file content from git repositories using LLM?") +my_project |> + find_records("What could help me managing many git repositories?", top_k = 2) - - - -my_chatbot <- - initialize_project("gitai-demo") |> - set_database(index = "gitai") |> - set_llm(seed = 1014, api_args = list(temperature = 0)) |> - set_prompt(paste( - "As a helpful assistant, answer user question using only the provided input." - )) - -get_answer <- function(my_chatbot, query) { - - cat("\n") - my_chatbot$llm$chat(paste( - "User query:", query, "\n\n", - "Known input for the answer:", - my_project$db$find_records(query = query, top_k = 1) - )) |> - cat() -} - -my_chatbot |> - get_answer("I'm looking for an R package to create synthetic datasets.") - -my_chatbot |> - get_answer("How can I check statisting of git repositories?") - -my_chatbot |> - get_answer("Can I somehow extract information from code from git repositories?") - -my_chatbot |> - get_answer("I would love to use AI to process code files. Is it possible? Give me the answer writting in one sentence with very funny style.") - +run_demo() diff --git a/inst/demo/app.R b/inst/demo/app.R new file mode 100644 index 0000000..98b5a6c --- /dev/null +++ b/inst/demo/app.R @@ -0,0 +1,57 @@ +library(shiny) +library(shinychat) +library(GitAI) + +readRenviron(".Renviron") + +gitai <- initialize_project("gitai-demo") |> + set_database(index = "gitai") |> + set_llm(seed = 1014, api_args = list(temperature = 0)) |> + set_prompt(r"( + As a helpful assistant, answer user question + using only the provided input. + Use only provided with the query known input + that is most relevent to the user's query. + Do know use any other information apart from input provided with the query. + Be concise and to the point in your answers. + Also awalys provide link to mentioned git repositories + with visible full URL for example: https://github.com/some_repository. + Do not mask it with any other text. + )") + +ui <- bslib::page_fluid( + bslib::layout_sidebar( + sidebar = shiny::sliderInput( + "top_k", + "Use top K results", + step = 1, + min = 1, + max = 5, + value = 5 + ), + chat_ui("chat") + ) +) + +server <- function(input, output, session) { + + user_chatbot <- gitai$llm$clone() + + shiny::observeEvent(input$chat_user_input, { + + query <- input$chat_user_input + + stream <- user_chatbot$stream_async( + paste( + "User query:", query, "\n\n", + "Known input provided for the answer:\n\n", + gitai$db$find_records(query = query, top_k = input$top_k) + ) + ) + chat_append("chat", stream) + }) +} + +shinyApp(ui, server) + + From 2917361e5ebad78520a5a97ed2c34f334ed756da Mon Sep 17 00:00:00 2001 From: Kamil Wais Date: Thu, 12 Dec 2024 14:56:57 +0100 Subject: [PATCH 3/6] added cache checking --- R/Pinecone.R | 28 +++++++++++++++++++++ R/VectorDatabase.R | 4 +++ R/process_content.R | 3 +++ R/process_repos.R | 46 +++++++++++++++++++++++++++++++--- tests/testthat/test-Pinecone.R | 21 +++++++++++++--- 5 files changed, 94 insertions(+), 8 deletions(-) diff --git a/R/Pinecone.R b/R/Pinecone.R index aecb880..13c86c4 100644 --- a/R/Pinecone.R +++ b/R/Pinecone.R @@ -48,6 +48,34 @@ Pinecone <- R6::R6Class( response_body <- httr2::resp_body_json(response) response_body }, + + read_record = function(id) { + + pinecone_api_key <- Sys.getenv("PINECONE_API_KEY") + + url <- paste0("https://", private$.index_host) + + request <- httr2::request(url) |> + httr2::req_url_path_append("vectors") |> + httr2::req_url_path_append("fetch") |> + httr2::req_url_query( + ids = id, + namespace = private$.namespace + ) |> + httr2::req_headers( + "Api-Key" = pinecone_api_key, + "X-Pinecone-API-Version" = "2024-10" + ) + + response <- request |> + httr2::req_perform() + + response_body <- httr2::resp_body_json(response) + results <- response_body$vectors + + results + }, + find_records = function(query, top_k = 1) { diff --git a/R/VectorDatabase.R b/R/VectorDatabase.R index 519d299..d6ac8e7 100644 --- a/R/VectorDatabase.R +++ b/R/VectorDatabase.R @@ -11,6 +11,10 @@ VectorDatabase <- R6::R6Class( stop(call. = FALSE, "Not implemented yet.") }, + read_record = function(id) { + stop(call. = FALSE, "Not implemented yet.") + }, + find_records = function(query, top_k = 1) { stop(call. = FALSE, "Not implemented yet.") } diff --git a/R/process_content.R b/R/process_content.R index a0139fa..b1affeb 100644 --- a/R/process_content.R +++ b/R/process_content.R @@ -2,6 +2,9 @@ process_content <- function(gitai, content) { # TODO: check if it fits in the context window + num_words <- length(strsplit(content, "\\s+")[[1]]) + cli::cli_alert_info("Repo content has {num_words} words") + llm_clone <- gitai$llm$clone(deep = TRUE) llm_clone$chat(content) diff --git a/R/process_repos.R b/R/process_repos.R index 1057052..858def6 100644 --- a/R/process_repos.R +++ b/R/process_repos.R @@ -7,6 +7,7 @@ #' @export process_repos <- function( gitai, + depth = 1, verbose = is_verbose() ) { @@ -20,16 +21,25 @@ process_repos <- function( GitStats::get_files_structure( gitstats_object = gitstats, pattern = paste0(gitai$files, collapse = "|"), - depth = Inf, + depth = depth, verbose = verbose ) files_content <- GitStats::get_files_content(gitstats, verbose = verbose) repositories <- unique(files_content$repo_name) + api_urls <- unique(files_content$api_url) + results <- - repositories |> - purrr::map(function(repo_name) { + purrr::map2(repositories, api_urls, function(repo_name, api_url) { + + current_repo_number <- which(repositories == repo_name) + if (verbose) { - cli::cli_alert_info("Processing repository: {.pkg {repo_name}}") + cli::cli_alert(paste0( + "Processing repository ", + "[{current_repo_number}/{length(repositories)} ", + "{round(current_repo_number / length(repositories) * 100, 2)}%]: ", + "{.pkg {repo_name}}" + )) } filtered_content <- files_content |> @@ -39,6 +49,34 @@ process_repos <- function( dplyr::pull(file_content) |> paste(collapse = "\n\n") + if (!is.null(gitai$db)) { + if (verbose) { + cli::cli_alert_info("Checking repo timestamp...") + } + record <- gitai$db$read_record(id = repo_name) + + if (NROW(record) > 0) { + + record <- record[[1]] + record_timestamp <- as.POSIXct(record$metadata$timestamp, tz = "UTC") + + if (grepl("github", api_url)) { + api_url <- github_repo(api_url) + } else { + api_url <- gitlab_repo(api_url) + } + + repo_timestamp <- get_repo_date(api_url) + + if (repo_timestamp <= record_timestamp) { + if (verbose) { + cli::cli_alert_info("Repo has not been updated. Skipping...") + } + return(NULL) + } + } + } + if (verbose) { cli::cli_alert_info("Processing content with LLM...") } diff --git a/tests/testthat/test-Pinecone.R b/tests/testthat/test-Pinecone.R index 1d7c078..df3341d 100644 --- a/tests/testthat/test-Pinecone.R +++ b/tests/testthat/test-Pinecone.R @@ -50,24 +50,24 @@ test_that("writting records", { }) test_that("finding records", { - + Sys.sleep(3) db <- Pinecone$new( namespace = "test_project_id", index = "gitai" ) - + result <- db$find_records( query = "Tell me about Apple Tech computer company.", top_k = 1 ) - + length(result) |> expect_equal(1) result[[1]]$id |> expect_equal("id_2") result[[1]]$metadata$text |> is.character() |> expect_true() result[[1]]$score |> is.numeric() |> expect_true() - + result_2 <- db$find_records( query = "Tell me about apple fruit.", top_k = 1 @@ -76,3 +76,16 @@ test_that("finding records", { expect_false(result_2[[1]]$id == result[[1]]$id) }) +test_that("reading records", { + + db <- Pinecone$new( + namespace = "test_project_id", + index = "gitai" + ) + + result <- db$read_record(id = "id_1") + + result[[1]]$metadata$text |> + is.character() |> + expect_true() +}) From 547378ad646c96629c9bd0db2c16f7fb0ca67753 Mon Sep 17 00:00:00 2001 From: Kamil Wais Date: Fri, 13 Dec 2024 10:57:34 +0100 Subject: [PATCH 4/6] updated demo app --- R/process_content.R | 14 +++++++--- R/process_repos.R | 12 ++++++-- devel/deploy_demo.R | 17 ++++++++++++ devel/sandbox.R | 67 +++++++++++++++++++++++++-------------------- inst/demo/app.R | 15 ++++++---- 5 files changed, 84 insertions(+), 41 deletions(-) create mode 100644 devel/deploy_demo.R diff --git a/R/process_content.R b/R/process_content.R index b1affeb..070850e 100644 --- a/R/process_content.R +++ b/R/process_content.R @@ -1,10 +1,16 @@ -process_content <- function(gitai, content) { +process_content <- function(gitai, content, max_words = 80000) { - # TODO: check if it fits in the context window - - num_words <- length(strsplit(content, "\\s+")[[1]]) + words <- strsplit(content, "\\s+")[[1]] + num_words <- length(words) cli::cli_alert_info("Repo content has {num_words} words") + if (num_words > max_words) { + cli::cli_alert_warning("Repo content is probably too long, triming...") + trimmed_words <- words[1:min(length(words), max_words)] + content <- paste(trimmed_words, collapse = " ") + cli::cli_alert_info("Repo content has now {length(trimmed_words)} words.") + } + llm_clone <- gitai$llm$clone(deep = TRUE) llm_clone$chat(content) diff --git a/R/process_repos.R b/R/process_repos.R index 858def6..de3a7f3 100644 --- a/R/process_repos.R +++ b/R/process_repos.R @@ -25,9 +25,15 @@ process_repos <- function( verbose = verbose ) files_content <- GitStats::get_files_content(gitstats, verbose = verbose) - repositories <- unique(files_content$repo_name) - api_urls <- unique(files_content$api_url) - + + distinct_repos <- files_content |> + dplyr::distinct(repo_name, api_url) + + # repositories <- unique(files_content$repo_name) + repositories <- distinct_repos$repo_name + # api_urls <- unique(files_content$api_url) + api_urls <- distinct_repos$api_url + results <- purrr::map2(repositories, api_urls, function(repo_name, api_url) { diff --git a/devel/deploy_demo.R b/devel/deploy_demo.R new file mode 100644 index 0000000..11c0762 --- /dev/null +++ b/devel/deploy_demo.R @@ -0,0 +1,17 @@ +rstudioapi::restartSession() + +rsconnect::setAccountInfo( + name = 'kalimu', + token = Sys.getenv("SHINYAPPSIO_TOKEN"), + secret = Sys.getenv("SHINYAPPSIO_SECRET") +) + +# pak::pkg_install("r-world-devs/GitAI") + +rsconnect::deployApp( + appDir = "inst/demo", + account = "kalimu", + appName = "GitAI-demo" +) + +# https://kalimu.shinyapps.io/GitAI-demo/ \ No newline at end of file diff --git a/devel/sandbox.R b/devel/sandbox.R index 9c63c10..6f4fe1a 100644 --- a/devel/sandbox.R +++ b/devel/sandbox.R @@ -8,34 +8,36 @@ my_project <- initialize_project("gitai-demo") |> my_project <- my_project |> set_github_repos( - repos = c( - "r-world-devs/GitStats", - "r-world-devs/GitAI", - "r-world-devs/cohortBuilder", - "r-world-devs/shinyCohortBuilder", - "r-world-devs/shinyQueryBuilder", - "r-world-devs/queryBuilder", - "r-world-devs/shinyGizmo", - "r-world-devs/shinyTimelines", - "openpharma/DataFakeR" - ) - # orgs = c( - # # "r-lib", - # # "rstudio", - # "tidyverse" + # repos = c( + # "r-world-devs/GitStats", + # "r-world-devs/GitAI", + # "r-world-devs/cohortBuilder", + # "r-world-devs/shinyCohortBuilder", + # "r-world-devs/shinyQueryBuilder", + # "r-world-devs/queryBuilder", + # "r-world-devs/shinyGizmo", + # "r-world-devs/shinyTimelines", + # "openpharma/DataFakeR" # ) + orgs = c( + "insightsengineering", + "openpharma", + "pharmaverse", + "tidymodels", + "r-lib", + "rstudio", + "tidyverse" + ) ) |> add_files(c( - # "DESCRIPTION", - # "project_metadata.yaml", - # "*.md", - "README.md" + "DESCRIPTION", + "*.md", + "*.Rmd" )) - my_project <- my_project |> set_prompt(r"( - Write about five paragraphs of summary for a project based on given input. + Write up to ten paragraphs of summary for a project based on given input. Be precise and to the point in your answers. Mention core functionality and all main features of the project. If available, mention brifly the technology used in the project @@ -44,21 +46,28 @@ my_project <- my_project |> or other type of tool. )") -results <- process_repos(my_project) +custom_function <- function(provider, req) { -results |> dplyr::glimpse() -purrr::map(results, ~.$text) + req |> + httr2::req_timeout(60) |> + httr2::req_perform() |> + httr2::resp_body_json() +} +unlockBinding("chat_perform_value", asNamespace("elmer")) +assign("chat_perform_value", custom_function, envir = asNamespace("elmer")) +lockBinding("chat_perform_value", asNamespace("elmer")) -my_project |> - find_records("How can I create synthetic datasets?") +results <- process_repos(my_project) +# results |> dplyr::glimpse() +# purrr::map(results, ~.$text) my_project |> - find_records("How can I check statisting of git repositories.") + find_records("How can I create synthetic datasets?", top_k = 3) my_project |> - find_records("Can I somehow extract information from file content from git repositories using LLM?") + find_records("How can I check statisting of git repositories.", top_k = 3) my_project |> - find_records("What could help me managing many git repositories?", top_k = 2) + find_records("Can I somehow extract information from file content from git repositories using LLM?") run_demo() diff --git a/inst/demo/app.R b/inst/demo/app.R index 98b5a6c..865482a 100644 --- a/inst/demo/app.R +++ b/inst/demo/app.R @@ -12,13 +12,14 @@ gitai <- initialize_project("gitai-demo") |> using only the provided input. Use only provided with the query known input that is most relevent to the user's query. - Do know use any other information apart from input provided with the query. - Be concise and to the point in your answers. + Do not use any other information + apart from the input provided with the query. + Be concise but provide all important information. Also awalys provide link to mentioned git repositories with visible full URL for example: https://github.com/some_repository. Do not mask it with any other text. - )") - + )") + ui <- bslib::page_fluid( bslib::layout_sidebar( sidebar = shiny::sliderInput( @@ -26,9 +27,13 @@ ui <- bslib::page_fluid( "Use top K results", step = 1, min = 1, - max = 5, + max = 10, value = 5 ), + shiny::HTML(markdown::markdownToHTML(fragment.only = TRUE, + "This is a demo app of the outputs from the `GitAI` open-source framework.The `GitAI` allows to extract knowledge from GitHub and GitLab repositories with the use of AI/LLM (Large Language Models) on scale and with minimum costs.\n\nThe results you see in the chatbot are from processing **800+** repositories in mulitple public GitHub organizations: `r-world-devs`, `openpharma`, `pharmaverse`, `tidymodels`, `r-lib`, `rstudio`, `tidyverse`, `insightsengineering`. In the repositories we are scanning the following files: `DESCRIPTION`, `*.md`, and `*.Rmd` files, so it includes files like `README.md` or R package vignettes.\n\nFor this demo we use simple and cheap LLM `gpt-4o-mini` from OpenAI. As embedings we use `multilingual-e5-large` embedding model from Pinecone as well as its vector database with 1024 dimensions. The overall one-time cost of processing all 800+ repositories is **less then $1** with this setup (yes, one USD!). \nEven more impressive results can be achieved with more powerful LLMs, and higher-dimensional embeddings.\n See more: [GitAI](https://github.com/r-world-devs/GitAI)\n\nTry to chat with the chatbot to find and reuse tools for your specific needs, for example, you can ask:\n`I need to filter datasets and build cohorts interactively in a shiny dashboard. What dashboarding component could I use?`" + )), + chat_ui("chat") ) ) From 30edc03078544f60052d04370bdd08c27b896408 Mon Sep 17 00:00:00 2001 From: Kamil Wais Date: Fri, 13 Dec 2024 11:06:34 +0100 Subject: [PATCH 5/6] fixed some checks issues --- .gitignore | 2 +- DESCRIPTION | 1 + R/process_repos.R | 7 ++++--- R/run_demo.R | 2 +- inst/{demo => demo-app}/app.R | 0 man/process_repos.Rd | 4 +++- 6 files changed, 10 insertions(+), 6 deletions(-) rename inst/{demo => demo-app}/app.R (100%) diff --git a/.gitignore b/.gitignore index 1ead979..9d7770f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,4 @@ .Rproj.user .Renviron docs -inst/demo/rsconnect/shinyapps.io +inst/demo-app/rsconnect/ diff --git a/DESCRIPTION b/DESCRIPTION index 71dfc96..5bee9d5 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -27,6 +27,7 @@ Imports: glue Suggests: testthat (>= 3.0.0), + shiny, withr Config/testthat/edition: 3 Config/testthat/parallel: true diff --git a/R/process_repos.R b/R/process_repos.R index de3a7f3..21fea85 100644 --- a/R/process_repos.R +++ b/R/process_repos.R @@ -1,6 +1,7 @@ #' Run LLM on `GitAI` repositories content #' @name process_repos #' @param gitai A \code{GitAI} object. +#' @param depth A numeric, maximum depth of folders to process. #' @param verbose A logical. If \code{FALSE} you won't be getting #' additional diagnostic messages. #' @return A list. @@ -11,6 +12,8 @@ process_repos <- function( verbose = is_verbose() ) { + repo_name <- api_url <- NULL + gitstats <- gitai$gitstats gitai$repos_metadata <- GitStats::get_repos( @@ -29,10 +32,8 @@ process_repos <- function( distinct_repos <- files_content |> dplyr::distinct(repo_name, api_url) - # repositories <- unique(files_content$repo_name) repositories <- distinct_repos$repo_name - # api_urls <- unique(files_content$api_url) - api_urls <- distinct_repos$api_url + api_urls <- distinct_repos$api_url results <- purrr::map2(repositories, api_urls, function(repo_name, api_url) { diff --git a/R/run_demo.R b/R/run_demo.R index 6d5d557..435f53c 100644 --- a/R/run_demo.R +++ b/R/run_demo.R @@ -1,6 +1,6 @@ run_demo <- function() { - app_folder <- system.file("demo", package = "GitAI") + app_folder <- system.file("demo-app", package = "GitAI") shiny::runApp(app_folder) } diff --git a/inst/demo/app.R b/inst/demo-app/app.R similarity index 100% rename from inst/demo/app.R rename to inst/demo-app/app.R diff --git a/man/process_repos.Rd b/man/process_repos.Rd index 670aa41..9d24d1a 100644 --- a/man/process_repos.Rd +++ b/man/process_repos.Rd @@ -4,11 +4,13 @@ \alias{process_repos} \title{Run LLM on \code{GitAI} repositories content} \usage{ -process_repos(gitai, verbose = is_verbose()) +process_repos(gitai, depth = 1, verbose = is_verbose()) } \arguments{ \item{gitai}{A \code{GitAI} object.} +\item{depth}{A numeric, maximum depth of folders to process.} + \item{verbose}{A logical. If \code{FALSE} you won't be getting additional diagnostic messages.} } From bebe6737e7eb7ff3b0edf37b5d8f5bb1ec784dbd Mon Sep 17 00:00:00 2001 From: Maciej Banas Date: Fri, 20 Dec 2024 15:20:41 +0000 Subject: [PATCH 6/6] Some small adjustments with repo date. --- R/add_metadata.R | 4 +-- R/process_content.R | 14 +++++++---- R/process_repos.R | 35 +++++++++++++------------- tests/testthat/test-Pinecone.R | 36 +++++++++++++-------------- tests/testthat/test-add_metadata.R | 21 +++++++--------- tests/testthat/test-process_content.R | 9 ++++--- 6 files changed, 61 insertions(+), 58 deletions(-) diff --git a/R/add_metadata.R b/R/add_metadata.R index 7441a47..7557e72 100644 --- a/R/add_metadata.R +++ b/R/add_metadata.R @@ -1,5 +1,5 @@ #' @noRd -add_metadata <- function(result, content) { +add_metadata <- function(result, content, timestamp) { web_url <- content$repo_url[1] api_url <- content$api_url[1] if (grepl("github", api_url)) { @@ -10,7 +10,7 @@ add_metadata <- function(result, content) { result[["metadata"]] <- list( repo_url = web_url, files = paste0(content$file_path, collapse = ", "), - timestamp = get_repo_date(api_url) + timestamp = timestamp ) result } diff --git a/R/process_content.R b/R/process_content.R index 070850e..e396b58 100644 --- a/R/process_content.R +++ b/R/process_content.R @@ -1,14 +1,18 @@ -process_content <- function(gitai, content, max_words = 80000) { +process_content <- function(gitai, content, max_words = 80000, verbose) { words <- strsplit(content, "\\s+")[[1]] num_words <- length(words) - cli::cli_alert_info("Repo content has {num_words} words") + if (verbose) cli::cli_alert_info("Repo content has {num_words} words") if (num_words > max_words) { - cli::cli_alert_warning("Repo content is probably too long, triming...") - trimmed_words <- words[1:min(length(words), max_words)] + if (verbose) { + cli::cli_alert_warning("Repo content is probably too long, triming...") + } + trimmed_words <- words[seq_len(min(length(words), max_words))] content <- paste(trimmed_words, collapse = " ") - cli::cli_alert_info("Repo content has now {length(trimmed_words)} words.") + if (verbose) { + cli::cli_alert_info("Repo content has now {length(trimmed_words)} words.") + } } llm_clone <- gitai$llm$clone(deep = TRUE) diff --git a/R/process_repos.R b/R/process_repos.R index 21fea85..376f54e 100644 --- a/R/process_repos.R +++ b/R/process_repos.R @@ -22,24 +22,24 @@ process_repos <- function( verbose = verbose ) GitStats::get_files_structure( - gitstats_object = gitstats, + gitstats, pattern = paste0(gitai$files, collapse = "|"), depth = depth, verbose = verbose ) files_content <- GitStats::get_files_content(gitstats, verbose = verbose) - distinct_repos <- files_content |> - dplyr::distinct(repo_name, api_url) + distinct_repos <- files_content |> + dplyr::distinct(repo_name, api_url) repositories <- distinct_repos$repo_name api_urls <- distinct_repos$api_url results <- purrr::map2(repositories, api_urls, function(repo_name, api_url) { - + current_repo_number <- which(repositories == repo_name) - + if (verbose) { cli::cli_alert(paste0( "Processing repository ", @@ -56,25 +56,24 @@ process_repos <- function( dplyr::pull(file_content) |> paste(collapse = "\n\n") + if (grepl("github", api_url)) { + api_url <- github_repo(api_url) + } else { + api_url <- gitlab_repo(api_url) + } + repo_timestamp <- get_repo_date(api_url) + if (!is.null(gitai$db)) { if (verbose) { cli::cli_alert_info("Checking repo timestamp...") } record <- gitai$db$read_record(id = repo_name) - + if (NROW(record) > 0) { record <- record[[1]] record_timestamp <- as.POSIXct(record$metadata$timestamp, tz = "UTC") - - if (grepl("github", api_url)) { - api_url <- github_repo(api_url) - } else { - api_url <- gitlab_repo(api_url) - } - - repo_timestamp <- get_repo_date(api_url) - + if (repo_timestamp <= record_timestamp) { if (verbose) { cli::cli_alert_info("Repo has not been updated. Skipping...") @@ -87,11 +86,13 @@ process_repos <- function( if (verbose) { cli::cli_alert_info("Processing content with LLM...") } + result <- process_content( gitai = gitai, - content = content_to_process + content = content_to_process, + verbose = verbose ) |> - add_metadata(content = filtered_content) + add_metadata(content = filtered_content, timestamp = repo_timestamp) if (!is.null(gitai$db)) { if (verbose) { diff --git a/tests/testthat/test-Pinecone.R b/tests/testthat/test-Pinecone.R index df3341d..f18b741 100644 --- a/tests/testthat/test-Pinecone.R +++ b/tests/testthat/test-Pinecone.R @@ -1,10 +1,10 @@ test_that("getting index metadata", { db <- Pinecone$new( - namespace = "test_project_id", + namespace = "test_project_id", index = "gitai" ) - + index <- db$get_index_metadata() index$host |> is.character() |> expect_true() }) @@ -12,10 +12,10 @@ test_that("getting index metadata", { test_that("getting embeddings", { db <- Pinecone$new( - namespace = "test_project_id", + namespace = "test_project_id", index = "gitai" ) - + test_text <- "Apple is a popular fruit known for its sweetness and crisp texture." embeddings <- db$.__enclos_env__$private$.get_embeddings(text = test_text) @@ -23,12 +23,12 @@ test_that("getting embeddings", { }) test_that("writting records", { - + db <- Pinecone$new( - namespace = "test_project_id", + namespace = "test_project_id", index = "gitai" ) - + test_texts <- c( "Apple is a popular fruit known for its sweetness and crisp texture.", "The tech company Apple is known for its innovative products like the iPhone.", @@ -39,40 +39,40 @@ test_that("writting records", { ) for (i in seq_along(test_texts)) { - + result <- db$write_record( id = paste0("id_", i), text = test_texts[i] - ) + ) result$upsertedCount |> expect_equal(1) } }) test_that("finding records", { - + Sys.sleep(3) - + db <- Pinecone$new( namespace = "test_project_id", index = "gitai" ) - + result <- db$find_records( - query = "Tell me about Apple Tech computer company.", + query = "Tell me about Apple Tech computer company.", top_k = 1 ) - + length(result) |> expect_equal(1) result[[1]]$id |> expect_equal("id_2") result[[1]]$metadata$text |> is.character() |> expect_true() result[[1]]$score |> is.numeric() |> expect_true() - + result_2 <- db$find_records( - query = "Tell me about apple fruit.", + query = "Tell me about apple fruit.", top_k = 1 ) - + expect_false(result_2[[1]]$id == result[[1]]$id) }) @@ -85,7 +85,7 @@ test_that("reading records", { result <- db$read_record(id = "id_1") - result[[1]]$metadata$text |> + result[[1]]$metadata$text |> is.character() |> expect_true() }) diff --git a/tests/testthat/test-add_metadata.R b/tests/testthat/test-add_metadata.R index ea2ea09..35b4541 100644 --- a/tests/testthat/test-add_metadata.R +++ b/tests/testthat/test-add_metadata.R @@ -9,16 +9,13 @@ test_that("metadata is added to content", { repo_url = c("test_URL", "test_URL"), api_url = c("test_URL", "test_URL") ) - testthat::with_mocked_bindings({ - result_with_metadata <- "result" |> - test_mocker$use() |> - add_metadata( - content = mocked_files_content - ) - expect_true("metadata" %in% names(result_with_metadata)) - expect_type(result_with_metadata[["metadata"]], "list") - expect_equal(names(result_with_metadata[["metadata"]]), c("repo_url", "files", "timestamp")) - }, - get_repo_date = function(api_url) Sys.time() - ) + result_with_metadata <- "result" |> + test_mocker$use() |> + add_metadata( + content = mocked_files_content, + timestamp = Sys.Date() + ) + expect_true("metadata" %in% names(result_with_metadata)) + expect_type(result_with_metadata[["metadata"]], "list") + expect_equal(names(result_with_metadata[["metadata"]]), c("repo_url", "files", "timestamp")) }) diff --git a/tests/testthat/test-process_content.R b/tests/testthat/test-process_content.R index a5828f3..a7884ca 100644 --- a/tests/testthat/test-process_content.R +++ b/tests/testthat/test-process_content.R @@ -3,7 +3,7 @@ test_that("processing content have proper output structure", { set_llm() |> set_prompt(system_prompt = "Say 'Hi there!' only and nothing else.") - result <- process_content(gitai = my_project, content = "") + result <- process_content(gitai = my_project, content = "", verbose = FALSE) expect_equal(result$text, "Hi there!") expect_true(is.numeric(result$tokens)) expect_true(is.list(result$output)) @@ -26,17 +26,18 @@ test_that("processing a single file content with deterministic output", { httr2::with_verbosity(verbosity = 0, { result <- process_content( gitai = my_project, - content = test_content + content = test_content, + verbose = FALSE ) }) expect_length(gregexpr("\\.", result$text)[[1]], 1) expect_equal( result$text, - process_content(gitai = my_project, content = test_content)$text + process_content(gitai = my_project, content = test_content, verbose = FALSE)$text ) expect_equal( result$text, - process_content(gitai = my_project, content = test_content)$text + process_content(gitai = my_project, content = test_content, verbose = FALSE)$text ) test_mocker$cache(result)