From 817dfac205367510e03961696f02839724249736 Mon Sep 17 00:00:00 2001 From: Kamil Wais Date: Thu, 28 Nov 2024 15:50:26 +0100 Subject: [PATCH 01/11] added Pinecone class --- R/Pinecone.R | 141 +++++++++++++++++++++++++++ R/VectorDatabase.R | 30 ++++++ devel/sandbox.R | 13 +-- tests/testthat/test-Pinecone.R | 66 +++++++++++++ tests/testthat/test-VectorDatabase.R | 10 ++ 5 files changed, 248 insertions(+), 12 deletions(-) create mode 100644 R/Pinecone.R create mode 100644 R/VectorDatabase.R create mode 100644 tests/testthat/test-Pinecone.R create mode 100644 tests/testthat/test-VectorDatabase.R diff --git a/R/Pinecone.R b/R/Pinecone.R new file mode 100644 index 0000000..b8f73ac --- /dev/null +++ b/R/Pinecone.R @@ -0,0 +1,141 @@ +Pinecone <- R6::R6Class( + classname = "Pinecone", + inherit = VectorDatabase, + public = list( + + get_index_metadata = function() { + + pinecone_api_key <- Sys.getenv("PINECONE_API_KEY") + + url <- paste0("https://api.pinecone.io/indexes/", private$.index_id) + + httr2::request(url) |> + httr2::req_headers( + "Api-Key" = pinecone_api_key + ) |> + httr2::req_perform() |> + httr2::resp_body_json() + + }, + + get_embeddings = function(text) { + + pinecone_api_key <- Sys.getenv("PINECONE_API_KEY") + + url <- "https://api.pinecone.io" + + body <- list( + model = "multilingual-e5-large", + parameters = list( + input_type = "passage", + truncate = "END" + ), + inputs = list( + list(text = text) + ) + ) + + request <- + httr2::request(url) |> + httr2::req_url_path_append("embed") |> + httr2::req_headers( + "Api-Key" = pinecone_api_key, + "X-Pinecone-API-Version" = "2024-10" + ) |> + httr2::req_body_json(body) + + response <- + request |> + httr2::req_perform() + + response_body <- httr2::resp_body_json(response) + + response_body$data[[1]]$values |> unlist() + }, + + write_record = function(id, embeddings, metadata) { + + pinecone_api_key <- Sys.getenv("PINECONE_API_KEY") + + url <- paste0("https://", private$.index_host) + + body <- list( + namespace = private$.project_id, + vectors = list( + id = id, + values = embeddings, + metadata = metadata + ) + ) + + request <- + httr2::request(url) |> + httr2::req_url_path_append("vectors/upsert") |> + httr2::req_headers( + "Api-Key" = pinecone_api_key, + "X-Pinecone-API-Version" = "2024-10" + ) |> + httr2::req_body_json(body) + + response <- + request |> + httr2::req_perform() + + response_body <- httr2::resp_body_json(response) + response_body + }, + + find_records = function(query, top_k = 3) { + + embeddings <- self$get_embeddings(query) + + pinecone_api_key <- Sys.getenv("PINECONE_API_KEY") + + url <- paste0("https://", private$.index_host) + + body <- list( + namespace = private$.project_id, + vector = embeddings, + topK = top_k, + includeValues = FALSE, + includeMetadata = TRUE + ) + + request <- + httr2::request(url) |> + httr2::req_url_path_append("query") |> + httr2::req_headers( + "Api-Key" = pinecone_api_key, + "X-Pinecone-API-Version" = "2024-10" + ) |> + httr2::req_body_json(body) + + response <- + request |> + httr2::req_perform() + + response_body <- httr2::resp_body_json(response) + results <- response_body$matches + + results |> + purrr::map(function(result) { + result$values <- NULL + result + }) + } + ), + + private = list( + + .project_id = NULL, + .index_id = NULL, + .index_host = NULL, + + .initialize = function(index_id) { + + private$.index_id <- index_id + + private$.index_host <- self$get_index_metadata()$host + } + ) +) \ No newline at end of file diff --git a/R/VectorDatabase.R b/R/VectorDatabase.R new file mode 100644 index 0000000..7c2b1f1 --- /dev/null +++ b/R/VectorDatabase.R @@ -0,0 +1,30 @@ +VectorDatabase <- R6::R6Class( + classname = "VectorDatabase", + public = list( + + initialize = function(project_id, ...) { + + private$.project_id <- project_id + + private$.initialize(...) + }, + + get_embeddings = function(text) { + stop(call. = FALSE, "Not implemented yet.") + }, + + write_record = function(id, embeddings, metadata) { + stop(call. = FALSE, "Not implemented yet.") + }, + + find_records = function(query) { + stop(call. = FALSE, "Not implemented yet.") + } + ), + + private = list( + .project_id = NULL, + + .initialize = function(...) {} + ) +) \ No newline at end of file diff --git a/devel/sandbox.R b/devel/sandbox.R index e841ae6..2512aa7 100644 --- a/devel/sandbox.R +++ b/devel/sandbox.R @@ -1,13 +1,2 @@ -pak::pak("tidyverse/elmer") - -chat <- elmer::chat_openai( - model = "gpt-4o-mini", - # system_prompt = "You are a friendly but terse assistant.", - echo = "none" -) -chat -chat$system_prompt <- "You always start with 'HI'" -chat |> str() -result <- chat$chat("What is the meaning of life?") -result +Sys.getenv("PINECONE_API_KEY") diff --git a/tests/testthat/test-Pinecone.R b/tests/testthat/test-Pinecone.R new file mode 100644 index 0000000..1f0b94e --- /dev/null +++ b/tests/testthat/test-Pinecone.R @@ -0,0 +1,66 @@ +test_that("getting index metadata", { + + db <- Pinecone$new( + project_id = "test_project_id", + index_id = "gitai" + ) + + index <- db$get_index_metadata() + index$host |> is.character() |> expect_true() +}) + +test_that("getting embeddings", { + + db <- Pinecone$new(project_id = "test_project_id", index_id = "gitai") + + test_text <- "Apple is a popular fruit known for its sweetness and crisp texture." + embeddings <- db$get_embeddings(text = test_text) + + length(embeddings) |> expect_equal(1024) +}) + +test_that("writting records", { + + db <- Pinecone$new(project_id = "test_project_id", index_id = "gitai") + + test_texts <- c( + "Apple is a popular fruit known for its sweetness and crisp texture.", + "The tech company Apple is known for its innovative products like the iPhone.", + "Many people enjoy eating apples as a healthy snack.", + "Apple Inc. has revolutionized the tech industry with its sleek designs and user-friendly interfaces.", + "An apple a day keeps the doctor away, as the saying goes.", + "Apple Computer Company was founded on April 1, 1976, by Steve Jobs, Steve Wozniak, and Ronald Wayne as a partnership." + ) + + for (i in seq_along(test_texts)) { + + embeddings <- db$get_embeddings(text = test_texts[i]) + + result <- db$write_record( + id = paste0("id_", i), + embeddings = embeddings, + metadata = list(text = test_texts[i]) + ) + + result$upsertedCount |> expect_equal(1) + } + }) + +test_that("finding records", { + + Sys.sleep(3) + + db <- Pinecone$new(project_id = "test_project_id", index_id = "gitai") + + result <- db$find_records(query = "Tell me about Apple Tech computer company.", top_k = 1) + + length(result) |> expect_equal(1) + result[[1]]$id |> expect_equal("id_2") + result[[1]]$metadata$text |> is.character() |> expect_true() + result[[1]]$score |> is.numeric() |> expect_true() + + result_2 <- db$find_records(query = "Tell me about apple fruit.") + + expect_false(result_2[[1]]$id == result[[1]]$id) + }) + diff --git a/tests/testthat/test-VectorDatabase.R b/tests/testthat/test-VectorDatabase.R new file mode 100644 index 0000000..e328329 --- /dev/null +++ b/tests/testthat/test-VectorDatabase.R @@ -0,0 +1,10 @@ +test_that("superclass has expected methods", { + + db <- VectorDatabase$new(project_id = "test_project_id") + + db$get_embeddings(text = "test_text") |> expect_error("Not implemented yet.") + + db$write_record(record = "test_record") |> expect_error("Not implemented yet.") + + db$find_records(query = "test_query") |> expect_error("Not implemented yet.") +}) From 9aed2f6b9dfb51ec60ef52c4e61e26edb955a2e7 Mon Sep 17 00:00:00 2001 From: Kamil Wais Date: Thu, 28 Nov 2024 15:57:45 +0100 Subject: [PATCH 02/11] updated sandbox --- devel/sandbox.R | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/devel/sandbox.R b/devel/sandbox.R index 2512aa7..d79b3a0 100644 --- a/devel/sandbox.R +++ b/devel/sandbox.R @@ -1,2 +1,17 @@ -Sys.getenv("PINECONE_API_KEY") + +my_project <- + initialize_project("gitai-demo") |> + set_github_repos(repos = c( + "r-world-devs/GitStats", + "r-world-devs/GitAI", + "openpharma/DataFakeR" + )) |> + add_files(files = "README.md") |> + set_llm() |> + set_prompt(paste( + "Write a paragraph of summary for a project based on given input.", + "Highlight business value of the project, its use cases and target audience." + )) + +process_repos(my_project) From 473e55b5e27ffd8c25cf4b3a056154a5dd129034 Mon Sep 17 00:00:00 2001 From: Kamil Wais Date: Thu, 5 Dec 2024 13:31:52 +0100 Subject: [PATCH 03/11] added Pinecone support --- DESCRIPTION | 1 + R/GitAI.R | 6 + R/Pinecone.R | 171 +++++++++++++++------------ R/VectorDatabase.R | 23 ++-- R/find_records.R | 14 +++ R/process_repos.R | 38 ++++-- R/set_database.R | 23 ++++ devel/sandbox.R | 89 ++++++++++++-- tests/testthat/test-Pinecone.R | 40 ++++--- tests/testthat/test-VectorDatabase.R | 12 +- tests/testthat/test-set_database.R | 24 ++++ 11 files changed, 313 insertions(+), 128 deletions(-) create mode 100644 R/find_records.R create mode 100644 R/set_database.R create mode 100644 tests/testthat/test-set_database.R diff --git a/DESCRIPTION b/DESCRIPTION index 29e7c80..195902b 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -24,3 +24,4 @@ Imports: Suggests: testthat (>= 3.0.0) Config/testthat/edition: 3 +Config/testthat/parallel: true diff --git a/R/GitAI.R b/R/GitAI.R index 9fc08e7..8ec23db 100644 --- a/R/GitAI.R +++ b/R/GitAI.R @@ -18,6 +18,11 @@ GitAI <- R6::R6Class( private$.llm <- value }, + db = function(value) { + if (missing(value)) return(private$.db) + private$.db <- value + }, + system_prompt = function(value) { if (is.null(private$.llm)) @@ -47,6 +52,7 @@ GitAI <- R6::R6Class( private = list( .project_id = NULL, .llm = NULL, + .db = NULL, .gitstats = NULL, .files = NULL, .repos_metadata = NULL, diff --git a/R/Pinecone.R b/R/Pinecone.R index b8f73ac..9184a8b 100644 --- a/R/Pinecone.R +++ b/R/Pinecone.R @@ -7,7 +7,7 @@ Pinecone <- R6::R6Class( pinecone_api_key <- Sys.getenv("PINECONE_API_KEY") - url <- paste0("https://api.pinecone.io/indexes/", private$.index_id) + url <- paste0("https://api.pinecone.io/indexes/", private$.index) httr2::request(url) |> httr2::req_headers( @@ -17,84 +17,53 @@ Pinecone <- R6::R6Class( httr2::resp_body_json() }, - - get_embeddings = function(text) { - - pinecone_api_key <- Sys.getenv("PINECONE_API_KEY") - - url <- "https://api.pinecone.io" - - body <- list( - model = "multilingual-e5-large", - parameters = list( - input_type = "passage", - truncate = "END" - ), - inputs = list( - list(text = text) - ) - ) - - request <- - httr2::request(url) |> - httr2::req_url_path_append("embed") |> - httr2::req_headers( - "Api-Key" = pinecone_api_key, - "X-Pinecone-API-Version" = "2024-10" - ) |> - httr2::req_body_json(body) - - response <- - request |> - httr2::req_perform() - response_body <- httr2::resp_body_json(response) - - response_body$data[[1]]$values |> unlist() - }, - - write_record = function(id, embeddings, metadata) { - + write_record = function(id, text, metadata = list()) { + pinecone_api_key <- Sys.getenv("PINECONE_API_KEY") - + url <- paste0("https://", private$.index_host) - + + embeddings <- private$.get_embeddings(text = text) + + metadata$text <- text + body <- list( - namespace = private$.project_id, + namespace = private$.namespace, vectors = list( id = id, values = embeddings, metadata = metadata ) ) - + request <- httr2::request(url) |> - httr2::req_url_path_append("vectors/upsert") |> - httr2::req_headers( + httr2::req_url_path_append("vectors/upsert") |> + httr2::req_headers( "Api-Key" = pinecone_api_key, "X-Pinecone-API-Version" = "2024-10" ) |> - httr2::req_body_json(body) - - response <- - request |> - httr2::req_perform() - + httr2::req_body_json(body) + + response <- + request |> + httr2::req_perform() + response_body <- httr2::resp_body_json(response) response_body }, - - find_records = function(query, top_k = 3) { - - embeddings <- self$get_embeddings(query) + + find_records = function(query, top_k = 1) { + + embeddings <- private$.get_embeddings(query) pinecone_api_key <- Sys.getenv("PINECONE_API_KEY") url <- paste0("https://", private$.index_host) body <- list( - namespace = private$.project_id, + namespace = private$.namespace, vector = embeddings, topK = top_k, includeValues = FALSE, @@ -103,39 +72,89 @@ Pinecone <- R6::R6Class( request <- httr2::request(url) |> - httr2::req_url_path_append("query") |> - httr2::req_headers( - "Api-Key" = pinecone_api_key, - "X-Pinecone-API-Version" = "2024-10" - ) |> - httr2::req_body_json(body) - + httr2::req_url_path_append("query") |> + httr2::req_headers( + "Api-Key" = pinecone_api_key, + "X-Pinecone-API-Version" = "2024-10" + ) |> + httr2::req_body_json(body) + response <- request |> - httr2::req_perform() + httr2::req_perform() + + response_body <- httr2::resp_body_json(response) + results <- response_body$matches + + results |> + purrr::map(function(result) { + result$values <- NULL + result + }) + } + ), - response_body <- httr2::resp_body_json(response) - results <- response_body$matches + active = list( + + namespace = function(value) { + if (missing(value)) return(private$.namespace) + private$.namespace <- value + }, - results |> - purrr::map(function(result) { - result$values <- NULL - result - }) + index = function(value) { + if (missing(value)) return(private$.index) + private$.index <- value } ), - + private = list( - + .project_id = NULL, - .index_id = NULL, + .index = NULL, + .namespace = NULL, .index_host = NULL, + + .initialize = function(index, namespace) { - .initialize = function(index_id) { + private$.index <- index + private$.namespace <- namespace + private$.index_host <- self$get_index_metadata()$host + }, - private$.index_id <- index_id + .get_embeddings = function(text) { + + pinecone_api_key <- Sys.getenv("PINECONE_API_KEY") + + url <- "https://api.pinecone.io" + + body <- list( + model = "multilingual-e5-large", + parameters = list( + input_type = "passage", + truncate = "END" + ), + inputs = list( + list(text = text) + ) + ) - private$.index_host <- self$get_index_metadata()$host + request <- + httr2::request(url) |> + httr2::req_url_path_append("embed") |> + httr2::req_headers( + "Api-Key" = pinecone_api_key, + "X-Pinecone-API-Version" = "2024-10" + ) |> + httr2::req_body_json(body) + + response <- + request |> + httr2::req_perform() + + response_body <- httr2::resp_body_json(response) + + response_body$data[[1]]$values |> unlist() + } ) ) \ No newline at end of file diff --git a/R/VectorDatabase.R b/R/VectorDatabase.R index 7c2b1f1..70061bf 100644 --- a/R/VectorDatabase.R +++ b/R/VectorDatabase.R @@ -2,29 +2,26 @@ VectorDatabase <- R6::R6Class( classname = "VectorDatabase", public = list( - initialize = function(project_id, ...) { - - private$.project_id <- project_id + initialize = function(...) { private$.initialize(...) }, - - get_embeddings = function(text) { - stop(call. = FALSE, "Not implemented yet.") - }, - + write_record = function(id, embeddings, metadata) { stop(call. = FALSE, "Not implemented yet.") }, - - find_records = function(query) { + + find_records = function(query, top_k = 1) { stop(call. = FALSE, "Not implemented yet.") } ), - + private = list( - .project_id = NULL, - .initialize = function(...) {} + .initialize = function(...) {}, + + .get_embeddings = function(text) { + stop(call. = FALSE, "Not implemented yet.") + } ) ) \ No newline at end of file diff --git a/R/find_records.R b/R/find_records.R new file mode 100644 index 0000000..8e1b5fb --- /dev/null +++ b/R/find_records.R @@ -0,0 +1,14 @@ +#' @export +find_records <- function( + gitai, + query, + top_k = 1, + verbose = is_verbose() +) { + + gitai$db$find_records( + query = query, + top_k = top_k + ) + +} \ No newline at end of file diff --git a/R/process_repos.R b/R/process_repos.R index e6e622b..d9d2c11 100644 --- a/R/process_repos.R +++ b/R/process_repos.R @@ -5,7 +5,10 @@ #' additional diagnostic messages. #' @return A list. #' @export -process_repos <- function(gitai, verbose = is_verbose()) { +process_repos <- function( + gitai, + verbose = is_verbose() +) { gitstats <- gitai$gitstats @@ -28,25 +31,36 @@ process_repos <- function(gitai, verbose = is_verbose()) { if (verbose) { cli::cli_alert_info("Processing repository: {.pkg {repo_name}}") } - - filtered_content <- - files_content |> + + filtered_content <- files_content |> dplyr::filter(repo_name == !!repo_name) - content_to_process <- - filtered_content |> + + content_to_process <- filtered_content |> dplyr::pull(file_content) |> - paste(collapse = "\n\n") - + paste(collapse = "\n\n") + + if (verbose) { + cli::cli_alert_info("Processing content with LLM...") + } result <- process_content( gitai = gitai, content = content_to_process ) |> - add_metadata( - content = filtered_content + add_metadata(content = filtered_content) + + if (!is.null(gitai$db)) { + if (verbose) { + cli::cli_alert_info("Writing to database...") + } + gitai$db$write_record( + id = repo_name, + text = result$text, + metadata = result$metadata ) - + } + result }) |> purrr::set_names(repositories) - results + invisible(results) } diff --git a/R/set_database.R b/R/set_database.R new file mode 100644 index 0000000..d0e208b --- /dev/null +++ b/R/set_database.R @@ -0,0 +1,23 @@ +#' @export +set_database <- function( + gitai, + provider = "Pinecone", + ... +) { + + provider_class <- get(provider) + + args <- list(...) + + if (is.null(args$namespace)) { + args$namespace <- gitai$project_id + } + + db <- do.call( + what = provider_class$new, + args = args + ) + + gitai$db <- db + invisible(gitai) +} \ No newline at end of file diff --git a/devel/sandbox.R b/devel/sandbox.R index d79b3a0..c8e9d41 100644 --- a/devel/sandbox.R +++ b/devel/sandbox.R @@ -1,17 +1,92 @@ - - my_project <- initialize_project("gitai-demo") |> + set_database( + provider = "Pinecone", + index = "gitai", + namespace = NULL + ) |> + set_llm(seed = 1014, api_args = list(temperature = 0)) + +my_project <- + my_project |> set_github_repos(repos = c( "r-world-devs/GitStats", "r-world-devs/GitAI", "openpharma/DataFakeR" )) |> - add_files(files = "README.md") |> - set_llm() |> + add_files(files = c("README.md")) + +my_project <- + my_project |> set_prompt(paste( - "Write a paragraph of summary for a project based on given input.", - "Highlight business value of the project, its use cases and target audience." + "Write two paragraphs of summary for a project based on given input.", + "Highlight business value of the project, its functionality, main features,", + "and use cases." )) -process_repos(my_project) +results <- process_repos(my_project) + +results |> dplyr::glimpse() +purrr::map(results, ~.$text) + + + + + + +# my_project <- +# initialize_project(project_id = "gitai-demo") |> +# set_database(index = "gitai") + +my_project |> find_records("I'm looking for an R package to create synthetic datasets.") + +my_project |> find_records("How can I check statisting of git repositories.") + +my_project |> find_records("Can I somehow extract information from code from git repositories?") + +my_project |> find_records("What are the tools that can help me in my work as a Data Scientist?") + +my_project |> + find_records("As a Manager I look for tools which let me learn what tools can be reused in my company", top_k = 2) + +my_project |> find_records("Which data products could have large impact in global company that maintains many repositories?", top_k = 3) + +my_project |> find_records("Szukam narzędzi które wykorzystują sztuczną inteligencję?") + + + + + + + +my_chatbot <- + initialize_project("gitai-demo") |> + set_database(index = "gitai") |> + set_llm(seed = 1014, api_args = list(temperature = 0)) |> + set_prompt(paste( + "As a helpful assistant, answer user question using only the provided input." + )) + +get_answer <- function(my_chatbot, query) { + + cat("\n") + my_chatbot$llm$chat(paste( + "User query:", query, "\n\n", + "Known input for the answer:", + my_project$db$find_records(query = query, top_k = 1) + )) |> + cat() +} + +my_chatbot |> + get_answer("I'm looking for an R package to create synthetic datasets.") + +my_chatbot |> + get_answer("How can I check statisting of git repositories?") + +my_chatbot |> + get_answer("Can I somehow extract information from code from git repositories?") + +my_chatbot |> + get_answer("I would love to use AI to process code files. Is it possible? Give me the answer writting in one sentence with very funny style.") + diff --git a/tests/testthat/test-Pinecone.R b/tests/testthat/test-Pinecone.R index 1f0b94e..bc74115 100644 --- a/tests/testthat/test-Pinecone.R +++ b/tests/testthat/test-Pinecone.R @@ -1,8 +1,8 @@ test_that("getting index metadata", { db <- Pinecone$new( - project_id = "test_project_id", - index_id = "gitai" + namespace = "test_project_id", + index = "gitai" ) index <- db$get_index_metadata() @@ -11,17 +11,23 @@ test_that("getting index metadata", { test_that("getting embeddings", { - db <- Pinecone$new(project_id = "test_project_id", index_id = "gitai") + db <- Pinecone$new( + namespace = "test_project_id", + index = "gitai" + ) test_text <- "Apple is a popular fruit known for its sweetness and crisp texture." - embeddings <- db$get_embeddings(text = test_text) + embeddings <- db$.__enclos_env__$private$.get_embeddings(text = test_text) length(embeddings) |> expect_equal(1024) }) test_that("writting records", { - db <- Pinecone$new(project_id = "test_project_id", index_id = "gitai") + db <- Pinecone$new( + namespace = "test_project_id", + index = "gitai" + ) test_texts <- c( "Apple is a popular fruit known for its sweetness and crisp texture.", @@ -34,12 +40,9 @@ test_that("writting records", { for (i in seq_along(test_texts)) { - embeddings <- db$get_embeddings(text = test_texts[i]) - result <- db$write_record( id = paste0("id_", i), - embeddings = embeddings, - metadata = list(text = test_texts[i]) + text = test_texts[i] ) result$upsertedCount |> expect_equal(1) @@ -50,17 +53,26 @@ test_that("finding records", { Sys.sleep(3) - db <- Pinecone$new(project_id = "test_project_id", index_id = "gitai") + db <- Pinecone$new( + namespace = "test_project_id", + index = "gitai" + ) - result <- db$find_records(query = "Tell me about Apple Tech computer company.", top_k = 1) + result <- db$find_records( + query = "Tell me about Apple Tech computer company.", + top_k = 1 + ) length(result) |> expect_equal(1) result[[1]]$id |> expect_equal("id_2") result[[1]]$metadata$text |> is.character() |> expect_true() result[[1]]$score |> is.numeric() |> expect_true() - result_2 <- db$find_records(query = "Tell me about apple fruit.") - + result_2 <- db$find_records( + query = "Tell me about apple fruit.", + top_k = 1 + ) + expect_false(result_2[[1]]$id == result[[1]]$id) - }) +}) diff --git a/tests/testthat/test-VectorDatabase.R b/tests/testthat/test-VectorDatabase.R index e328329..323a0b4 100644 --- a/tests/testthat/test-VectorDatabase.R +++ b/tests/testthat/test-VectorDatabase.R @@ -1,10 +1,10 @@ test_that("superclass has expected methods", { - db <- VectorDatabase$new(project_id = "test_project_id") + db <- VectorDatabase$new(namespace = "test_project_id") - db$get_embeddings(text = "test_text") |> expect_error("Not implemented yet.") - - db$write_record(record = "test_record") |> expect_error("Not implemented yet.") - - db$find_records(query = "test_query") |> expect_error("Not implemented yet.") + db$write_record() |> + expect_error("Not implemented yet.") + + db$find_records(query = "test_query") |> + expect_error("Not implemented yet.") }) diff --git a/tests/testthat/test-set_database.R b/tests/testthat/test-set_database.R new file mode 100644 index 0000000..f3dee7a --- /dev/null +++ b/tests/testthat/test-set_database.R @@ -0,0 +1,24 @@ +test_that("setting database provider with default namespace", { + + gitai <- initialize_project("gitai-demo") |> + set_database( + provider = "Pinecone", + index = "gitai" + ) + + gitai$db$index |> expect_equal("gitai") + gitai$db$namespace |> expect_equal("gitai-demo") +}) + +test_that("setting database provider with custom namepsace", { + + gitai <- initialize_project("gitai-demo") |> + set_database( + provider = "Pinecone", + index = "gitai", + namespace = "test_namespace" + ) + + gitai$db$index |> expect_equal("gitai") + gitai$db$namespace |> expect_equal("test_namespace") +}) From 5004d1ed29fd288c77cc99ddce671a2bea2e1914 Mon Sep 17 00:00:00 2001 From: Kamil Wais Date: Thu, 28 Nov 2024 15:50:26 +0100 Subject: [PATCH 04/11] added Pinecone class --- R/Pinecone.R | 141 +++++++++++++++++++++++++++ R/VectorDatabase.R | 30 ++++++ devel/sandbox.R | 13 +-- tests/testthat/test-Pinecone.R | 66 +++++++++++++ tests/testthat/test-VectorDatabase.R | 10 ++ 5 files changed, 248 insertions(+), 12 deletions(-) create mode 100644 R/Pinecone.R create mode 100644 R/VectorDatabase.R create mode 100644 tests/testthat/test-Pinecone.R create mode 100644 tests/testthat/test-VectorDatabase.R diff --git a/R/Pinecone.R b/R/Pinecone.R new file mode 100644 index 0000000..b8f73ac --- /dev/null +++ b/R/Pinecone.R @@ -0,0 +1,141 @@ +Pinecone <- R6::R6Class( + classname = "Pinecone", + inherit = VectorDatabase, + public = list( + + get_index_metadata = function() { + + pinecone_api_key <- Sys.getenv("PINECONE_API_KEY") + + url <- paste0("https://api.pinecone.io/indexes/", private$.index_id) + + httr2::request(url) |> + httr2::req_headers( + "Api-Key" = pinecone_api_key + ) |> + httr2::req_perform() |> + httr2::resp_body_json() + + }, + + get_embeddings = function(text) { + + pinecone_api_key <- Sys.getenv("PINECONE_API_KEY") + + url <- "https://api.pinecone.io" + + body <- list( + model = "multilingual-e5-large", + parameters = list( + input_type = "passage", + truncate = "END" + ), + inputs = list( + list(text = text) + ) + ) + + request <- + httr2::request(url) |> + httr2::req_url_path_append("embed") |> + httr2::req_headers( + "Api-Key" = pinecone_api_key, + "X-Pinecone-API-Version" = "2024-10" + ) |> + httr2::req_body_json(body) + + response <- + request |> + httr2::req_perform() + + response_body <- httr2::resp_body_json(response) + + response_body$data[[1]]$values |> unlist() + }, + + write_record = function(id, embeddings, metadata) { + + pinecone_api_key <- Sys.getenv("PINECONE_API_KEY") + + url <- paste0("https://", private$.index_host) + + body <- list( + namespace = private$.project_id, + vectors = list( + id = id, + values = embeddings, + metadata = metadata + ) + ) + + request <- + httr2::request(url) |> + httr2::req_url_path_append("vectors/upsert") |> + httr2::req_headers( + "Api-Key" = pinecone_api_key, + "X-Pinecone-API-Version" = "2024-10" + ) |> + httr2::req_body_json(body) + + response <- + request |> + httr2::req_perform() + + response_body <- httr2::resp_body_json(response) + response_body + }, + + find_records = function(query, top_k = 3) { + + embeddings <- self$get_embeddings(query) + + pinecone_api_key <- Sys.getenv("PINECONE_API_KEY") + + url <- paste0("https://", private$.index_host) + + body <- list( + namespace = private$.project_id, + vector = embeddings, + topK = top_k, + includeValues = FALSE, + includeMetadata = TRUE + ) + + request <- + httr2::request(url) |> + httr2::req_url_path_append("query") |> + httr2::req_headers( + "Api-Key" = pinecone_api_key, + "X-Pinecone-API-Version" = "2024-10" + ) |> + httr2::req_body_json(body) + + response <- + request |> + httr2::req_perform() + + response_body <- httr2::resp_body_json(response) + results <- response_body$matches + + results |> + purrr::map(function(result) { + result$values <- NULL + result + }) + } + ), + + private = list( + + .project_id = NULL, + .index_id = NULL, + .index_host = NULL, + + .initialize = function(index_id) { + + private$.index_id <- index_id + + private$.index_host <- self$get_index_metadata()$host + } + ) +) \ No newline at end of file diff --git a/R/VectorDatabase.R b/R/VectorDatabase.R new file mode 100644 index 0000000..7c2b1f1 --- /dev/null +++ b/R/VectorDatabase.R @@ -0,0 +1,30 @@ +VectorDatabase <- R6::R6Class( + classname = "VectorDatabase", + public = list( + + initialize = function(project_id, ...) { + + private$.project_id <- project_id + + private$.initialize(...) + }, + + get_embeddings = function(text) { + stop(call. = FALSE, "Not implemented yet.") + }, + + write_record = function(id, embeddings, metadata) { + stop(call. = FALSE, "Not implemented yet.") + }, + + find_records = function(query) { + stop(call. = FALSE, "Not implemented yet.") + } + ), + + private = list( + .project_id = NULL, + + .initialize = function(...) {} + ) +) \ No newline at end of file diff --git a/devel/sandbox.R b/devel/sandbox.R index e841ae6..2512aa7 100644 --- a/devel/sandbox.R +++ b/devel/sandbox.R @@ -1,13 +1,2 @@ -pak::pak("tidyverse/elmer") - -chat <- elmer::chat_openai( - model = "gpt-4o-mini", - # system_prompt = "You are a friendly but terse assistant.", - echo = "none" -) -chat -chat$system_prompt <- "You always start with 'HI'" -chat |> str() -result <- chat$chat("What is the meaning of life?") -result +Sys.getenv("PINECONE_API_KEY") diff --git a/tests/testthat/test-Pinecone.R b/tests/testthat/test-Pinecone.R new file mode 100644 index 0000000..1f0b94e --- /dev/null +++ b/tests/testthat/test-Pinecone.R @@ -0,0 +1,66 @@ +test_that("getting index metadata", { + + db <- Pinecone$new( + project_id = "test_project_id", + index_id = "gitai" + ) + + index <- db$get_index_metadata() + index$host |> is.character() |> expect_true() +}) + +test_that("getting embeddings", { + + db <- Pinecone$new(project_id = "test_project_id", index_id = "gitai") + + test_text <- "Apple is a popular fruit known for its sweetness and crisp texture." + embeddings <- db$get_embeddings(text = test_text) + + length(embeddings) |> expect_equal(1024) +}) + +test_that("writting records", { + + db <- Pinecone$new(project_id = "test_project_id", index_id = "gitai") + + test_texts <- c( + "Apple is a popular fruit known for its sweetness and crisp texture.", + "The tech company Apple is known for its innovative products like the iPhone.", + "Many people enjoy eating apples as a healthy snack.", + "Apple Inc. has revolutionized the tech industry with its sleek designs and user-friendly interfaces.", + "An apple a day keeps the doctor away, as the saying goes.", + "Apple Computer Company was founded on April 1, 1976, by Steve Jobs, Steve Wozniak, and Ronald Wayne as a partnership." + ) + + for (i in seq_along(test_texts)) { + + embeddings <- db$get_embeddings(text = test_texts[i]) + + result <- db$write_record( + id = paste0("id_", i), + embeddings = embeddings, + metadata = list(text = test_texts[i]) + ) + + result$upsertedCount |> expect_equal(1) + } + }) + +test_that("finding records", { + + Sys.sleep(3) + + db <- Pinecone$new(project_id = "test_project_id", index_id = "gitai") + + result <- db$find_records(query = "Tell me about Apple Tech computer company.", top_k = 1) + + length(result) |> expect_equal(1) + result[[1]]$id |> expect_equal("id_2") + result[[1]]$metadata$text |> is.character() |> expect_true() + result[[1]]$score |> is.numeric() |> expect_true() + + result_2 <- db$find_records(query = "Tell me about apple fruit.") + + expect_false(result_2[[1]]$id == result[[1]]$id) + }) + diff --git a/tests/testthat/test-VectorDatabase.R b/tests/testthat/test-VectorDatabase.R new file mode 100644 index 0000000..e328329 --- /dev/null +++ b/tests/testthat/test-VectorDatabase.R @@ -0,0 +1,10 @@ +test_that("superclass has expected methods", { + + db <- VectorDatabase$new(project_id = "test_project_id") + + db$get_embeddings(text = "test_text") |> expect_error("Not implemented yet.") + + db$write_record(record = "test_record") |> expect_error("Not implemented yet.") + + db$find_records(query = "test_query") |> expect_error("Not implemented yet.") +}) From 645773c6ea64d31bbada7772830528575b1f402d Mon Sep 17 00:00:00 2001 From: Kamil Wais Date: Thu, 28 Nov 2024 15:57:45 +0100 Subject: [PATCH 05/11] updated sandbox --- devel/sandbox.R | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/devel/sandbox.R b/devel/sandbox.R index 2512aa7..d79b3a0 100644 --- a/devel/sandbox.R +++ b/devel/sandbox.R @@ -1,2 +1,17 @@ -Sys.getenv("PINECONE_API_KEY") + +my_project <- + initialize_project("gitai-demo") |> + set_github_repos(repos = c( + "r-world-devs/GitStats", + "r-world-devs/GitAI", + "openpharma/DataFakeR" + )) |> + add_files(files = "README.md") |> + set_llm() |> + set_prompt(paste( + "Write a paragraph of summary for a project based on given input.", + "Highlight business value of the project, its use cases and target audience." + )) + +process_repos(my_project) From e8a251e4c1c1d7ca1ed3118238a5df2353cb128a Mon Sep 17 00:00:00 2001 From: Kamil Wais Date: Thu, 5 Dec 2024 13:31:52 +0100 Subject: [PATCH 06/11] added Pinecone support --- DESCRIPTION | 1 + R/GitAI.R | 6 + R/Pinecone.R | 171 +++++++++++++++------------ R/VectorDatabase.R | 23 ++-- R/find_records.R | 14 +++ R/process_repos.R | 60 ++++++---- R/set_database.R | 23 ++++ devel/sandbox.R | 89 ++++++++++++-- tests/testthat/test-Pinecone.R | 40 ++++--- tests/testthat/test-VectorDatabase.R | 12 +- tests/testthat/test-set_database.R | 24 ++++ 11 files changed, 325 insertions(+), 138 deletions(-) create mode 100644 R/find_records.R create mode 100644 R/set_database.R create mode 100644 tests/testthat/test-set_database.R diff --git a/DESCRIPTION b/DESCRIPTION index 438bc9c..bce20b0 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -28,3 +28,4 @@ Imports: Suggests: testthat (>= 3.0.0) Config/testthat/edition: 3 +Config/testthat/parallel: true diff --git a/R/GitAI.R b/R/GitAI.R index 9fc08e7..8ec23db 100644 --- a/R/GitAI.R +++ b/R/GitAI.R @@ -18,6 +18,11 @@ GitAI <- R6::R6Class( private$.llm <- value }, + db = function(value) { + if (missing(value)) return(private$.db) + private$.db <- value + }, + system_prompt = function(value) { if (is.null(private$.llm)) @@ -47,6 +52,7 @@ GitAI <- R6::R6Class( private = list( .project_id = NULL, .llm = NULL, + .db = NULL, .gitstats = NULL, .files = NULL, .repos_metadata = NULL, diff --git a/R/Pinecone.R b/R/Pinecone.R index b8f73ac..9184a8b 100644 --- a/R/Pinecone.R +++ b/R/Pinecone.R @@ -7,7 +7,7 @@ Pinecone <- R6::R6Class( pinecone_api_key <- Sys.getenv("PINECONE_API_KEY") - url <- paste0("https://api.pinecone.io/indexes/", private$.index_id) + url <- paste0("https://api.pinecone.io/indexes/", private$.index) httr2::request(url) |> httr2::req_headers( @@ -17,84 +17,53 @@ Pinecone <- R6::R6Class( httr2::resp_body_json() }, - - get_embeddings = function(text) { - - pinecone_api_key <- Sys.getenv("PINECONE_API_KEY") - - url <- "https://api.pinecone.io" - - body <- list( - model = "multilingual-e5-large", - parameters = list( - input_type = "passage", - truncate = "END" - ), - inputs = list( - list(text = text) - ) - ) - - request <- - httr2::request(url) |> - httr2::req_url_path_append("embed") |> - httr2::req_headers( - "Api-Key" = pinecone_api_key, - "X-Pinecone-API-Version" = "2024-10" - ) |> - httr2::req_body_json(body) - - response <- - request |> - httr2::req_perform() - response_body <- httr2::resp_body_json(response) - - response_body$data[[1]]$values |> unlist() - }, - - write_record = function(id, embeddings, metadata) { - + write_record = function(id, text, metadata = list()) { + pinecone_api_key <- Sys.getenv("PINECONE_API_KEY") - + url <- paste0("https://", private$.index_host) - + + embeddings <- private$.get_embeddings(text = text) + + metadata$text <- text + body <- list( - namespace = private$.project_id, + namespace = private$.namespace, vectors = list( id = id, values = embeddings, metadata = metadata ) ) - + request <- httr2::request(url) |> - httr2::req_url_path_append("vectors/upsert") |> - httr2::req_headers( + httr2::req_url_path_append("vectors/upsert") |> + httr2::req_headers( "Api-Key" = pinecone_api_key, "X-Pinecone-API-Version" = "2024-10" ) |> - httr2::req_body_json(body) - - response <- - request |> - httr2::req_perform() - + httr2::req_body_json(body) + + response <- + request |> + httr2::req_perform() + response_body <- httr2::resp_body_json(response) response_body }, - - find_records = function(query, top_k = 3) { - - embeddings <- self$get_embeddings(query) + + find_records = function(query, top_k = 1) { + + embeddings <- private$.get_embeddings(query) pinecone_api_key <- Sys.getenv("PINECONE_API_KEY") url <- paste0("https://", private$.index_host) body <- list( - namespace = private$.project_id, + namespace = private$.namespace, vector = embeddings, topK = top_k, includeValues = FALSE, @@ -103,39 +72,89 @@ Pinecone <- R6::R6Class( request <- httr2::request(url) |> - httr2::req_url_path_append("query") |> - httr2::req_headers( - "Api-Key" = pinecone_api_key, - "X-Pinecone-API-Version" = "2024-10" - ) |> - httr2::req_body_json(body) - + httr2::req_url_path_append("query") |> + httr2::req_headers( + "Api-Key" = pinecone_api_key, + "X-Pinecone-API-Version" = "2024-10" + ) |> + httr2::req_body_json(body) + response <- request |> - httr2::req_perform() + httr2::req_perform() + + response_body <- httr2::resp_body_json(response) + results <- response_body$matches + + results |> + purrr::map(function(result) { + result$values <- NULL + result + }) + } + ), - response_body <- httr2::resp_body_json(response) - results <- response_body$matches + active = list( + + namespace = function(value) { + if (missing(value)) return(private$.namespace) + private$.namespace <- value + }, - results |> - purrr::map(function(result) { - result$values <- NULL - result - }) + index = function(value) { + if (missing(value)) return(private$.index) + private$.index <- value } ), - + private = list( - + .project_id = NULL, - .index_id = NULL, + .index = NULL, + .namespace = NULL, .index_host = NULL, + + .initialize = function(index, namespace) { - .initialize = function(index_id) { + private$.index <- index + private$.namespace <- namespace + private$.index_host <- self$get_index_metadata()$host + }, - private$.index_id <- index_id + .get_embeddings = function(text) { + + pinecone_api_key <- Sys.getenv("PINECONE_API_KEY") + + url <- "https://api.pinecone.io" + + body <- list( + model = "multilingual-e5-large", + parameters = list( + input_type = "passage", + truncate = "END" + ), + inputs = list( + list(text = text) + ) + ) - private$.index_host <- self$get_index_metadata()$host + request <- + httr2::request(url) |> + httr2::req_url_path_append("embed") |> + httr2::req_headers( + "Api-Key" = pinecone_api_key, + "X-Pinecone-API-Version" = "2024-10" + ) |> + httr2::req_body_json(body) + + response <- + request |> + httr2::req_perform() + + response_body <- httr2::resp_body_json(response) + + response_body$data[[1]]$values |> unlist() + } ) ) \ No newline at end of file diff --git a/R/VectorDatabase.R b/R/VectorDatabase.R index 7c2b1f1..70061bf 100644 --- a/R/VectorDatabase.R +++ b/R/VectorDatabase.R @@ -2,29 +2,26 @@ VectorDatabase <- R6::R6Class( classname = "VectorDatabase", public = list( - initialize = function(project_id, ...) { - - private$.project_id <- project_id + initialize = function(...) { private$.initialize(...) }, - - get_embeddings = function(text) { - stop(call. = FALSE, "Not implemented yet.") - }, - + write_record = function(id, embeddings, metadata) { stop(call. = FALSE, "Not implemented yet.") }, - - find_records = function(query) { + + find_records = function(query, top_k = 1) { stop(call. = FALSE, "Not implemented yet.") } ), - + private = list( - .project_id = NULL, - .initialize = function(...) {} + .initialize = function(...) {}, + + .get_embeddings = function(text) { + stop(call. = FALSE, "Not implemented yet.") + } ) ) \ No newline at end of file diff --git a/R/find_records.R b/R/find_records.R new file mode 100644 index 0000000..8e1b5fb --- /dev/null +++ b/R/find_records.R @@ -0,0 +1,14 @@ +#' @export +find_records <- function( + gitai, + query, + top_k = 1, + verbose = is_verbose() +) { + + gitai$db$find_records( + query = query, + top_k = top_k + ) + +} \ No newline at end of file diff --git a/R/process_repos.R b/R/process_repos.R index d3a843b..dcc053c 100644 --- a/R/process_repos.R +++ b/R/process_repos.R @@ -5,7 +5,10 @@ #' additional diagnostic messages. #' @return A list. #' @export -process_repos <- function(gitai, verbose = is_verbose()) { +process_repos <- function( + gitai, + verbose = is_verbose() +) { gitstats <- gitai$gitstats @@ -23,29 +26,42 @@ process_repos <- function(gitai, verbose = is_verbose()) { ) files_content <- GitStats::get_files_content(gitstats, verbose = verbose) repositories <- unique(files_content$repo_name) - process_repo_content <- function(repo_name) { - if (verbose) { - cli::cli_alert_info("Processing repository: {.pkg {repo_name}}") - } - - filtered_content <- files_content |> - dplyr::filter(repo_name == !!repo_name) - content_to_process <- filtered_content |> - dplyr::pull(file_content) |> - paste(collapse = "\n\n") - - result <- gitai |> - process_content( + results <- + repositories |> + purrr::map(function(repo_name) { + if (verbose) { + cli::cli_alert_info("Processing repository: {.pkg {repo_name}}") + } + + filtered_content <- files_content |> + dplyr::filter(repo_name == !!repo_name) + + content_to_process <- filtered_content |> + dplyr::pull(file_content) |> + paste(collapse = "\n\n") + + if (verbose) { + cli::cli_alert_info("Processing content with LLM...") + } + result <- process_content( + gitai = gitai, content = content_to_process ) |> - add_metadata( - content = filtered_content - ) - } - - results <- repositories |> - purrr::map(process_repo_content) |> + add_metadata(content = filtered_content) + + if (!is.null(gitai$db)) { + if (verbose) { + cli::cli_alert_info("Writing to database...") + } + gitai$db$write_record( + id = repo_name, + text = result$text, + metadata = result$metadata + ) + } + result + }) |> purrr::set_names(repositories) - results + invisible(results) } diff --git a/R/set_database.R b/R/set_database.R new file mode 100644 index 0000000..d0e208b --- /dev/null +++ b/R/set_database.R @@ -0,0 +1,23 @@ +#' @export +set_database <- function( + gitai, + provider = "Pinecone", + ... +) { + + provider_class <- get(provider) + + args <- list(...) + + if (is.null(args$namespace)) { + args$namespace <- gitai$project_id + } + + db <- do.call( + what = provider_class$new, + args = args + ) + + gitai$db <- db + invisible(gitai) +} \ No newline at end of file diff --git a/devel/sandbox.R b/devel/sandbox.R index d79b3a0..c8e9d41 100644 --- a/devel/sandbox.R +++ b/devel/sandbox.R @@ -1,17 +1,92 @@ - - my_project <- initialize_project("gitai-demo") |> + set_database( + provider = "Pinecone", + index = "gitai", + namespace = NULL + ) |> + set_llm(seed = 1014, api_args = list(temperature = 0)) + +my_project <- + my_project |> set_github_repos(repos = c( "r-world-devs/GitStats", "r-world-devs/GitAI", "openpharma/DataFakeR" )) |> - add_files(files = "README.md") |> - set_llm() |> + add_files(files = c("README.md")) + +my_project <- + my_project |> set_prompt(paste( - "Write a paragraph of summary for a project based on given input.", - "Highlight business value of the project, its use cases and target audience." + "Write two paragraphs of summary for a project based on given input.", + "Highlight business value of the project, its functionality, main features,", + "and use cases." )) -process_repos(my_project) +results <- process_repos(my_project) + +results |> dplyr::glimpse() +purrr::map(results, ~.$text) + + + + + + +# my_project <- +# initialize_project(project_id = "gitai-demo") |> +# set_database(index = "gitai") + +my_project |> find_records("I'm looking for an R package to create synthetic datasets.") + +my_project |> find_records("How can I check statisting of git repositories.") + +my_project |> find_records("Can I somehow extract information from code from git repositories?") + +my_project |> find_records("What are the tools that can help me in my work as a Data Scientist?") + +my_project |> + find_records("As a Manager I look for tools which let me learn what tools can be reused in my company", top_k = 2) + +my_project |> find_records("Which data products could have large impact in global company that maintains many repositories?", top_k = 3) + +my_project |> find_records("Szukam narzędzi które wykorzystują sztuczną inteligencję?") + + + + + + + +my_chatbot <- + initialize_project("gitai-demo") |> + set_database(index = "gitai") |> + set_llm(seed = 1014, api_args = list(temperature = 0)) |> + set_prompt(paste( + "As a helpful assistant, answer user question using only the provided input." + )) + +get_answer <- function(my_chatbot, query) { + + cat("\n") + my_chatbot$llm$chat(paste( + "User query:", query, "\n\n", + "Known input for the answer:", + my_project$db$find_records(query = query, top_k = 1) + )) |> + cat() +} + +my_chatbot |> + get_answer("I'm looking for an R package to create synthetic datasets.") + +my_chatbot |> + get_answer("How can I check statisting of git repositories?") + +my_chatbot |> + get_answer("Can I somehow extract information from code from git repositories?") + +my_chatbot |> + get_answer("I would love to use AI to process code files. Is it possible? Give me the answer writting in one sentence with very funny style.") + diff --git a/tests/testthat/test-Pinecone.R b/tests/testthat/test-Pinecone.R index 1f0b94e..bc74115 100644 --- a/tests/testthat/test-Pinecone.R +++ b/tests/testthat/test-Pinecone.R @@ -1,8 +1,8 @@ test_that("getting index metadata", { db <- Pinecone$new( - project_id = "test_project_id", - index_id = "gitai" + namespace = "test_project_id", + index = "gitai" ) index <- db$get_index_metadata() @@ -11,17 +11,23 @@ test_that("getting index metadata", { test_that("getting embeddings", { - db <- Pinecone$new(project_id = "test_project_id", index_id = "gitai") + db <- Pinecone$new( + namespace = "test_project_id", + index = "gitai" + ) test_text <- "Apple is a popular fruit known for its sweetness and crisp texture." - embeddings <- db$get_embeddings(text = test_text) + embeddings <- db$.__enclos_env__$private$.get_embeddings(text = test_text) length(embeddings) |> expect_equal(1024) }) test_that("writting records", { - db <- Pinecone$new(project_id = "test_project_id", index_id = "gitai") + db <- Pinecone$new( + namespace = "test_project_id", + index = "gitai" + ) test_texts <- c( "Apple is a popular fruit known for its sweetness and crisp texture.", @@ -34,12 +40,9 @@ test_that("writting records", { for (i in seq_along(test_texts)) { - embeddings <- db$get_embeddings(text = test_texts[i]) - result <- db$write_record( id = paste0("id_", i), - embeddings = embeddings, - metadata = list(text = test_texts[i]) + text = test_texts[i] ) result$upsertedCount |> expect_equal(1) @@ -50,17 +53,26 @@ test_that("finding records", { Sys.sleep(3) - db <- Pinecone$new(project_id = "test_project_id", index_id = "gitai") + db <- Pinecone$new( + namespace = "test_project_id", + index = "gitai" + ) - result <- db$find_records(query = "Tell me about Apple Tech computer company.", top_k = 1) + result <- db$find_records( + query = "Tell me about Apple Tech computer company.", + top_k = 1 + ) length(result) |> expect_equal(1) result[[1]]$id |> expect_equal("id_2") result[[1]]$metadata$text |> is.character() |> expect_true() result[[1]]$score |> is.numeric() |> expect_true() - result_2 <- db$find_records(query = "Tell me about apple fruit.") - + result_2 <- db$find_records( + query = "Tell me about apple fruit.", + top_k = 1 + ) + expect_false(result_2[[1]]$id == result[[1]]$id) - }) +}) diff --git a/tests/testthat/test-VectorDatabase.R b/tests/testthat/test-VectorDatabase.R index e328329..323a0b4 100644 --- a/tests/testthat/test-VectorDatabase.R +++ b/tests/testthat/test-VectorDatabase.R @@ -1,10 +1,10 @@ test_that("superclass has expected methods", { - db <- VectorDatabase$new(project_id = "test_project_id") + db <- VectorDatabase$new(namespace = "test_project_id") - db$get_embeddings(text = "test_text") |> expect_error("Not implemented yet.") - - db$write_record(record = "test_record") |> expect_error("Not implemented yet.") - - db$find_records(query = "test_query") |> expect_error("Not implemented yet.") + db$write_record() |> + expect_error("Not implemented yet.") + + db$find_records(query = "test_query") |> + expect_error("Not implemented yet.") }) diff --git a/tests/testthat/test-set_database.R b/tests/testthat/test-set_database.R new file mode 100644 index 0000000..f3dee7a --- /dev/null +++ b/tests/testthat/test-set_database.R @@ -0,0 +1,24 @@ +test_that("setting database provider with default namespace", { + + gitai <- initialize_project("gitai-demo") |> + set_database( + provider = "Pinecone", + index = "gitai" + ) + + gitai$db$index |> expect_equal("gitai") + gitai$db$namespace |> expect_equal("gitai-demo") +}) + +test_that("setting database provider with custom namepsace", { + + gitai <- initialize_project("gitai-demo") |> + set_database( + provider = "Pinecone", + index = "gitai", + namespace = "test_namespace" + ) + + gitai$db$index |> expect_equal("gitai") + gitai$db$namespace |> expect_equal("test_namespace") +}) From 13ec25db79ca09bafb4d3fa42f507b97e56a8d91 Mon Sep 17 00:00:00 2001 From: Kamil Wais Date: Fri, 6 Dec 2024 13:31:47 +0100 Subject: [PATCH 07/11] updated docs --- NAMESPACE | 2 ++ R/find_records.R | 9 +++++++-- R/set_database.R | 6 ++++++ man/find_records.Rd | 21 +++++++++++++++++++++ man/set_database.Rd | 18 ++++++++++++++++++ 5 files changed, 54 insertions(+), 2 deletions(-) create mode 100644 man/find_records.Rd create mode 100644 man/set_database.Rd diff --git a/NAMESPACE b/NAMESPACE index 1da8f0a..0347dce 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,9 +1,11 @@ # Generated by roxygen2: do not edit by hand export(add_files) +export(find_records) export(initialize_project) export(is_verbose) export(process_repos) +export(set_database) export(set_github_repos) export(set_gitlab_repos) export(set_llm) diff --git a/R/find_records.R b/R/find_records.R index 8e1b5fb..5c0dc0c 100644 --- a/R/find_records.R +++ b/R/find_records.R @@ -1,3 +1,9 @@ +#' Finding top K records in a vector database. +#' +#' @param query A character, user query. +#' @param top_k A numeric, number of top K records to return. +#' @inheritParams process_repos +#' #' @export find_records <- function( gitai, @@ -10,5 +16,4 @@ find_records <- function( query = query, top_k = top_k ) - -} \ No newline at end of file +} diff --git a/R/set_database.R b/R/set_database.R index d0e208b..56f1e58 100644 --- a/R/set_database.R +++ b/R/set_database.R @@ -1,3 +1,9 @@ +#' Setting database in `GitAI` object. +#' +#' @inheritParams process_repos +#' @param provider A string. Name of database provider. +#' @param ... Additional arguments to pass to database provider constructor. +#' #' @export set_database <- function( gitai, diff --git a/man/find_records.Rd b/man/find_records.Rd new file mode 100644 index 0000000..e9d1b42 --- /dev/null +++ b/man/find_records.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/find_records.R +\name{find_records} +\alias{find_records} +\title{Finding top K records in a vector database.} +\usage{ +find_records(gitai, query, top_k = 1, verbose = is_verbose()) +} +\arguments{ +\item{gitai}{A \code{GitAI} object.} + +\item{query}{A character, user query.} + +\item{top_k}{A numeric, number of top K records to return.} + +\item{verbose}{A logical. If \code{FALSE} you won't be getting +additional diagnostic messages.} +} +\description{ +Finding top K records in a vector database. +} diff --git a/man/set_database.Rd b/man/set_database.Rd new file mode 100644 index 0000000..b75a54c --- /dev/null +++ b/man/set_database.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/set_database.R +\name{set_database} +\alias{set_database} +\title{Setting database in \code{GitAI} object.} +\usage{ +set_database(gitai, provider = "Pinecone", ...) +} +\arguments{ +\item{gitai}{A \code{GitAI} object.} + +\item{provider}{A string. Name of database provider.} + +\item{...}{Additional arguments to pass to database provider constructor.} +} +\description{ +Setting database in \code{GitAI} object. +} From 9c8350efcf4dc7a787b8d70df6e332e9bacc59d6 Mon Sep 17 00:00:00 2001 From: Kamil Wais Date: Fri, 6 Dec 2024 13:54:10 +0100 Subject: [PATCH 08/11] fixed style --- R/Pinecone.R | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/R/Pinecone.R b/R/Pinecone.R index 9184a8b..2db37ed 100644 --- a/R/Pinecone.R +++ b/R/Pinecone.R @@ -10,12 +10,9 @@ Pinecone <- R6::R6Class( url <- paste0("https://api.pinecone.io/indexes/", private$.index) httr2::request(url) |> - httr2::req_headers( - "Api-Key" = pinecone_api_key - ) |> + httr2::req_headers("Api-Key" = pinecone_api_key) |> httr2::req_perform() |> httr2::resp_body_json() - }, write_record = function(id, text, metadata = list()) { @@ -37,19 +34,17 @@ Pinecone <- R6::R6Class( ) ) - request <- - httr2::request(url) |> - httr2::req_url_path_append("vectors/upsert") |> - httr2::req_headers( + request <- httr2::request(url) |> + httr2::req_url_path_append("vectors/upsert") |> + httr2::req_headers( "Api-Key" = pinecone_api_key, "X-Pinecone-API-Version" = "2024-10" ) |> - httr2::req_body_json(body) + httr2::req_body_json(body) + + response <- request |> + httr2::req_perform() - response <- - request |> - httr2::req_perform() - response_body <- httr2::resp_body_json(response) response_body }, From 424a57fa5268dd7e9882bccd7784cb1101f59a73 Mon Sep 17 00:00:00 2001 From: Kamil Wais Date: Fri, 6 Dec 2024 14:02:00 +0100 Subject: [PATCH 09/11] updated renv snapshot --- renv.lock | 122 +++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 108 insertions(+), 14 deletions(-) diff --git a/renv.lock b/renv.lock index cbb4ea8..5b5c1e4 100644 --- a/renv.lock +++ b/renv.lock @@ -218,6 +218,26 @@ ], "Hash": "33698c4b3127fc9f506654607fb73676" }, + "downlit": { + "Package": "downlit", + "Version": "0.4.4", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "brio", + "desc", + "digest", + "evaluate", + "fansi", + "memoise", + "rlang", + "vctrs", + "withr", + "yaml" + ], + "Hash": "45a6a596bf0108ee1ff16a040a2df897" + }, "dplyr": { "Package": "dplyr", "Version": "1.1.4", @@ -261,7 +281,7 @@ "jsonlite", "rlang" ], - "Hash": "7b6008b2091a6d4a93b8e57bef424449" + "Hash": "b88717af28296eff2bc8f6f7f474ef1a" }, "evaluate": { "Package": "evaluate", @@ -388,7 +408,7 @@ "vctrs", "withr" ], - "Hash": "07091206ec270c43fa3913e40536d5ac" + "Hash": "66c98064fcefeac59159ba0d69e9bb8f" }, "jquerylib": { "Package": "jquerylib", @@ -493,18 +513,6 @@ ], "Hash": "d413e0fef796c9401a4419485f709ca1" }, - "pak": { - "Package": "pak", - "Version": "0.8.0", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "tools", - "utils" - ], - "Hash": "019d4d5634410800a96b69737fb2dbac" - }, "pillar": { "Package": "pillar", "Version": "1.9.0", @@ -547,6 +555,36 @@ ], "Hash": "01f28d4278f15c76cddbea05899c5d6f" }, + "pkgdown": { + "Package": "pkgdown", + "Version": "2.1.1", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "bslib", + "callr", + "cli", + "desc", + "digest", + "downlit", + "fontawesome", + "fs", + "httr2", + "jsonlite", + "openssl", + "purrr", + "ragg", + "rlang", + "rmarkdown", + "tibble", + "whisker", + "withr", + "xml2", + "yaml" + ], + "Hash": "df2912d5873422b55a13002510f02c9f" + }, "pkgload": { "Package": "pkgload", "Version": "1.4.0", @@ -615,6 +653,17 @@ ], "Hash": "1cba04a4e9414bdefc9dcaa99649a8dc" }, + "ragg": { + "Package": "ragg", + "Version": "1.3.3", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "systemfonts", + "textshaping" + ], + "Hash": "0595fe5e47357111f29ad19101c7d271" + }, "rappdirs": { "Package": "rappdirs", "Version": "0.3.3", @@ -730,6 +779,18 @@ "Repository": "CRAN", "Hash": "de342ebfebdbf40477d0758d05426646" }, + "systemfonts": { + "Package": "systemfonts", + "Version": "1.1.0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "cpp11", + "lifecycle" + ], + "Hash": "213b6b8ed5afbf934843e6c3b090d418" + }, "testthat": { "Package": "testthat", "Version": "3.2.1.1", @@ -759,6 +820,19 @@ ], "Hash": "3f6e7e5e2220856ff865e4834766bf2b" }, + "textshaping": { + "Package": "textshaping", + "Version": "0.4.0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "cpp11", + "lifecycle", + "systemfonts" + ], + "Hash": "5142f8bc78ed3d819d26461b641627ce" + }, "tibble": { "Package": "tibble", "Version": "3.2.1", @@ -854,6 +928,13 @@ ], "Hash": "52f574062a7b66e56926988c3fbdb3b7" }, + "whisker": { + "Package": "whisker", + "Version": "0.4.1", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "c6abfa47a46d281a7d5159d0a8891e88" + }, "withr": { "Package": "withr", "Version": "3.0.2", @@ -879,6 +960,19 @@ ], "Hash": "8687398773806cfff9401a2feca96298" }, + "xml2": { + "Package": "xml2", + "Version": "1.3.6", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "cli", + "methods", + "rlang" + ], + "Hash": "1d0336142f4cd25d8d23cd3ba7a8fb61" + }, "yaml": { "Package": "yaml", "Version": "2.3.10", From 7d0a2b175712537b3bb93019ce61a9a34fae4f6d Mon Sep 17 00:00:00 2001 From: Kamil Wais Date: Fri, 6 Dec 2024 14:12:28 +0100 Subject: [PATCH 10/11] fixed env var for cicd --- .github/workflows/R-CMD-check.yaml | 1 + .github/workflows/test-coverage.yaml | 1 + 2 files changed, 2 insertions(+) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index e828c2d..f92b560 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -24,6 +24,7 @@ jobs: env: GITHUB_PAT: ${{ secrets.TEST_GITHUB_PAT }} + PINECONE_API_KEY: ${{ secrets.PINECONE_API_KEY }} R_KEEP_PKG_SOURCE: yes steps: diff --git a/.github/workflows/test-coverage.yaml b/.github/workflows/test-coverage.yaml index 8807a34..dfbfcf5 100644 --- a/.github/workflows/test-coverage.yaml +++ b/.github/workflows/test-coverage.yaml @@ -15,6 +15,7 @@ jobs: runs-on: ubuntu-latest env: GITHUB_PAT: ${{ secrets.TEST_GITHUB_PAT }} + PINECONE_API_KEY: ${{ secrets.PINECONE_API_KEY }} steps: - uses: actions/checkout@v4 From 9c8b318e707cfa035e3e7e3bed30b4c36e7f69fd Mon Sep 17 00:00:00 2001 From: Kamil Wais Date: Fri, 6 Dec 2024 14:51:52 +0100 Subject: [PATCH 11/11] fixed lintr issues --- .lintr | 18 ++++++----- R/Pinecone.R | 58 ++++++++++++++++------------------ R/VectorDatabase.R | 2 +- R/find_records.R | 4 +-- R/process_repos.R | 2 +- R/set_database.R | 2 +- tests/testthat/test-Pinecone.R | 2 +- 7 files changed, 43 insertions(+), 45 deletions(-) diff --git a/.lintr b/.lintr index 37d7fa4..6d7d544 100644 --- a/.lintr +++ b/.lintr @@ -1,11 +1,13 @@ linters: linters_with_defaults( - line_length_linter = line_length_linter(120L), - object_usage_linter = NULL, - object_length_linter = object_length_linter(45L), - object_name_linter = object_name_linter( - styles = c("snake_case", "CamelCase", "symbols"), - regexes = character() - ), - cyclocomp_linter = NULL + trailing_whitespace_linter = NULL, + trailing_blank_lines_linter = NULL, + line_length_linter = NULL, + object_usage_linter = NULL, + object_length_linter = object_length_linter(45L), + object_name_linter = object_name_linter( + styles = c("snake_case", "CamelCase", "symbols"), + regexes = character() + ), + cyclocomp_linter = NULL ) encoding: "UTF-8" diff --git a/R/Pinecone.R b/R/Pinecone.R index 2db37ed..aecb880 100644 --- a/R/Pinecone.R +++ b/R/Pinecone.R @@ -65,27 +65,25 @@ Pinecone <- R6::R6Class( includeMetadata = TRUE ) - request <- - httr2::request(url) |> - httr2::req_url_path_append("query") |> - httr2::req_headers( - "Api-Key" = pinecone_api_key, - "X-Pinecone-API-Version" = "2024-10" - ) |> - httr2::req_body_json(body) - - response <- - request |> - httr2::req_perform() - - response_body <- httr2::resp_body_json(response) - results <- response_body$matches + request <- httr2::request(url) |> + httr2::req_url_path_append("query") |> + httr2::req_headers( + "Api-Key" = pinecone_api_key, + "X-Pinecone-API-Version" = "2024-10" + ) |> + httr2::req_body_json(body) - results |> - purrr::map(function(result) { - result$values <- NULL - result - }) + response <- request |> + httr2::req_perform() + + response_body <- httr2::resp_body_json(response) + results <- response_body$matches + + results |> + purrr::map(function(result) { + result$values <- NULL + result + }) } ), @@ -133,18 +131,16 @@ Pinecone <- R6::R6Class( ) ) - request <- - httr2::request(url) |> - httr2::req_url_path_append("embed") |> - httr2::req_headers( + request <- httr2::request(url) |> + httr2::req_url_path_append("embed") |> + httr2::req_headers( "Api-Key" = pinecone_api_key, - "X-Pinecone-API-Version" = "2024-10" - ) |> - httr2::req_body_json(body) + "X-Pinecone-API-Version" = "2024-10" + ) |> + httr2::req_body_json(body) - response <- - request |> - httr2::req_perform() + response <- request |> + httr2::req_perform() response_body <- httr2::resp_body_json(response) @@ -152,4 +148,4 @@ Pinecone <- R6::R6Class( } ) -) \ No newline at end of file +) diff --git a/R/VectorDatabase.R b/R/VectorDatabase.R index 70061bf..519d299 100644 --- a/R/VectorDatabase.R +++ b/R/VectorDatabase.R @@ -24,4 +24,4 @@ VectorDatabase <- R6::R6Class( stop(call. = FALSE, "Not implemented yet.") } ) -) \ No newline at end of file +) diff --git a/R/find_records.R b/R/find_records.R index 5c0dc0c..c3e7d85 100644 --- a/R/find_records.R +++ b/R/find_records.R @@ -6,14 +6,14 @@ #' #' @export find_records <- function( - gitai, + gitai, query, top_k = 1, verbose = is_verbose() ) { gitai$db$find_records( - query = query, + query = query, top_k = top_k ) } diff --git a/R/process_repos.R b/R/process_repos.R index dcc053c..7fe0c5a 100644 --- a/R/process_repos.R +++ b/R/process_repos.R @@ -38,7 +38,7 @@ process_repos <- function( content_to_process <- filtered_content |> dplyr::pull(file_content) |> - paste(collapse = "\n\n") + paste(collapse = "\n\n") if (verbose) { cli::cli_alert_info("Processing content with LLM...") diff --git a/R/set_database.R b/R/set_database.R index 56f1e58..74cc3a6 100644 --- a/R/set_database.R +++ b/R/set_database.R @@ -26,4 +26,4 @@ set_database <- function( gitai$db <- db invisible(gitai) -} \ No newline at end of file +} diff --git a/tests/testthat/test-Pinecone.R b/tests/testthat/test-Pinecone.R index bc74115..1d7c078 100644 --- a/tests/testthat/test-Pinecone.R +++ b/tests/testthat/test-Pinecone.R @@ -47,7 +47,7 @@ test_that("writting records", { result$upsertedCount |> expect_equal(1) } - }) +}) test_that("finding records", {