Skip to content

Commit

Permalink
Merge pull request #74 from r-world-devs/waisk/63/demo
Browse files Browse the repository at this point in the history
Waisk/63/demo
  • Loading branch information
maciekbanas authored Dec 23, 2024
2 parents 81fb750 + bebe673 commit 032efc8
Show file tree
Hide file tree
Showing 15 changed files with 291 additions and 119 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
.Rproj.user
.Renviron
docs
inst/demo-app/rsconnect/
1 change: 1 addition & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ Imports:
glue
Suggests:
testthat (>= 3.0.0),
shiny,
withr
Config/testthat/edition: 3
Config/testthat/parallel: true
28 changes: 28 additions & 0 deletions R/Pinecone.R
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,34 @@ Pinecone <- R6::R6Class(
response_body <- httr2::resp_body_json(response)
response_body
},

read_record = function(id) {
  # Fetch stored vectors by id from the index host.
  #
  # Builds a request to `https://<index_host>/vectors/fetch`, passing `id`
  # as the `ids` query parameter together with the configured namespace,
  # and authenticates with the key read from the PINECONE_API_KEY
  # environment variable.
  #
  # Returns the `vectors` element of the parsed JSON response body.
  api_key <- Sys.getenv("PINECONE_API_KEY")
  host_url <- paste0("https://", private$.index_host)

  fetch_request <- httr2::request(host_url)
  fetch_request <- httr2::req_url_path_append(fetch_request, "vectors")
  fetch_request <- httr2::req_url_path_append(fetch_request, "fetch")
  fetch_request <- httr2::req_url_query(
    fetch_request,
    ids = id,
    namespace = private$.namespace
  )
  fetch_request <- httr2::req_headers(
    fetch_request,
    "Api-Key" = api_key,
    "X-Pinecone-API-Version" = "2024-10"
  )

  fetch_response <- httr2::req_perform(fetch_request)
  parsed_body <- httr2::resp_body_json(fetch_response)

  parsed_body$vectors
},


find_records = function(query, top_k = 1) {

Expand Down
4 changes: 4 additions & 0 deletions R/VectorDatabase.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@ VectorDatabase <- R6::R6Class(
stop(call. = FALSE, "Not implemented yet.")
},

read_record = function(id) {
  # Placeholder in the base class: always raises an error without the
  # call attached to the message.
  stop("Not implemented yet.", call. = FALSE)
},

find_records = function(query, top_k = 1) {
stop(call. = FALSE, "Not implemented yet.")
}
Expand Down
4 changes: 2 additions & 2 deletions R/add_metadata.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#' @noRd
add_metadata <- function(result, content) {
add_metadata <- function(result, content, timestamp) {
web_url <- content$repo_url[1]
api_url <- content$api_url[1]
if (grepl("github", api_url)) {
Expand All @@ -10,7 +10,7 @@ add_metadata <- function(result, content) {
result[["metadata"]] <- list(
repo_url = web_url,
files = paste0(content$file_path, collapse = ", "),
timestamp = get_repo_date(api_url)
timestamp = timestamp
)
result
}
Expand Down
17 changes: 15 additions & 2 deletions R/process_content.R
Original file line number Diff line number Diff line change
@@ -1,6 +1,19 @@
process_content <- function(gitai, content) {
process_content <- function(gitai, content, max_words = 80000, verbose) {

# TODO: check if it fits in the context window
words <- strsplit(content, "\\s+")[[1]]
num_words <- length(words)
if (verbose) cli::cli_alert_info("Repo content has {num_words} words")

if (num_words > max_words) {
if (verbose) {
cli::cli_alert_warning("Repo content is probably too long, triming...")
}
trimmed_words <- words[seq_len(min(length(words), max_words))]
content <- paste(trimmed_words, collapse = " ")
if (verbose) {
cli::cli_alert_info("Repo content has now {length(trimmed_words)} words.")
}
}

llm_clone <- gitai$llm$clone(deep = TRUE)

Expand Down
62 changes: 54 additions & 8 deletions R/process_repos.R
Original file line number Diff line number Diff line change
@@ -1,15 +1,19 @@
#' Run LLM on `GitAI` repositories content
#' @name process_repos
#' @param gitai A \code{GitAI} object.
#' @param depth A numeric, maximum depth of folders to process.
#' @param verbose A logical. If \code{FALSE} you won't be getting
#' additional diagnostic messages.
#' @return A list.
#' @export
process_repos <- function(
gitai,
depth = 1,
verbose = is_verbose()
) {

repo_name <- api_url <- NULL

gitstats <- gitai$gitstats

gitai$repos_metadata <- GitStats::get_repos(
Expand All @@ -18,18 +22,31 @@ process_repos <- function(
verbose = verbose
)
GitStats::get_files_structure(
gitstats_object = gitstats,
gitstats,
pattern = paste0(gitai$files, collapse = "|"),
depth = Inf,
depth = depth,
verbose = verbose
)
files_content <- GitStats::get_files_content(gitstats, verbose = verbose)
repositories <- unique(files_content$repo_name)

distinct_repos <- files_content |>
dplyr::distinct(repo_name, api_url)

repositories <- distinct_repos$repo_name
api_urls <- distinct_repos$api_url

results <-
repositories |>
purrr::map(function(repo_name) {
purrr::map2(repositories, api_urls, function(repo_name, api_url) {

current_repo_number <- which(repositories == repo_name)

if (verbose) {
cli::cli_alert_info("Processing repository: {.pkg {repo_name}}")
cli::cli_alert(paste0(
"Processing repository ",
"[{current_repo_number}/{length(repositories)} ",
"{round(current_repo_number / length(repositories) * 100, 2)}%]: ",
"{.pkg {repo_name}}"
))
}

filtered_content <- files_content |>
Expand All @@ -39,14 +56,43 @@ process_repos <- function(
dplyr::pull(file_content) |>
paste(collapse = "\n\n")

if (grepl("github", api_url)) {
api_url <- github_repo(api_url)
} else {
api_url <- gitlab_repo(api_url)
}
repo_timestamp <- get_repo_date(api_url)

if (!is.null(gitai$db)) {
if (verbose) {
cli::cli_alert_info("Checking repo timestamp...")
}
record <- gitai$db$read_record(id = repo_name)

if (NROW(record) > 0) {

record <- record[[1]]
record_timestamp <- as.POSIXct(record$metadata$timestamp, tz = "UTC")

if (repo_timestamp <= record_timestamp) {
if (verbose) {
cli::cli_alert_info("Repo has not been updated. Skipping...")
}
return(NULL)
}
}
}

if (verbose) {
cli::cli_alert_info("Processing content with LLM...")
}

result <- process_content(
gitai = gitai,
content = content_to_process
content = content_to_process,
verbose = verbose
) |>
add_metadata(content = filtered_content)
add_metadata(content = filtered_content, timestamp = repo_timestamp)

if (!is.null(gitai$db)) {
if (verbose) {
Expand Down
6 changes: 6 additions & 0 deletions R/run_demo.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#' Run the GitAI demo Shiny app
#'
#' Locates the `demo-app` folder shipped with the `GitAI` package and
#' launches it with `shiny::runApp()`.
#'
#' @return The value returned by `shiny::runApp()`.
#' @noRd
run_demo <- function() {

  # shiny is only a suggested dependency, so fail early with a clear hint
  # instead of a namespace-loading error from `shiny::runApp()`.
  if (!requireNamespace("shiny", quietly = TRUE)) {
    stop(
      call. = FALSE,
      "Package 'shiny' is required to run the demo app. ",
      "Install it with install.packages(\"shiny\")."
    )
  }

  app_folder <- system.file("demo-app", package = "GitAI")

  # system.file() returns "" when the folder (or the package) is not found.
  if (!nzchar(app_folder)) {
    stop(
      call. = FALSE,
      "Demo app folder 'demo-app' was not found in the 'GitAI' package."
    )
  }

  shiny::runApp(app_folder)
}
17 changes: 17 additions & 0 deletions devel/deploy_demo.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Deploys the GitAI demo Shiny app to shinyapps.io.
# Credentials are read from the SHINYAPPSIO_TOKEN and SHINYAPPSIO_SECRET
# environment variables. `appDir` is a relative path, so run this script
# from the package root.
rstudioapi::restartSession()

rsconnect::setAccountInfo(
  name = "kalimu",
  token = Sys.getenv("SHINYAPPSIO_TOKEN"),
  secret = Sys.getenv("SHINYAPPSIO_SECRET")
)

# pak::pkg_install("r-world-devs/GitAI")

# The app lives in inst/demo-app: the same folder that `run_demo()` resolves
# via system.file("demo-app", ...) and whose rsconnect/ subfolder is
# git-ignored.
rsconnect::deployApp(
  appDir = "inst/demo-app",
  account = "kalimu",
  appName = "GitAI-demo"
)

# https://kalimu.shinyapps.io/GitAI-demo/
135 changes: 58 additions & 77 deletions devel/sandbox.R
Original file line number Diff line number Diff line change
@@ -1,92 +1,73 @@
my_project <-
initialize_project("gitai-demo") |>
my_project <- initialize_project("gitai-demo") |>
set_database(
provider = "Pinecone",
index = "gitai",
namespace = NULL
) |>
set_llm(seed = 1014, api_args = list(temperature = 0))

my_project <-
my_project |>
set_github_repos(repos = c(
"r-world-devs/GitStats",
"r-world-devs/GitAI",
"openpharma/DataFakeR"
)) |>
add_files(files = c("README.md"))

my_project <-
my_project |>
set_prompt(paste(
"Write two paragraphs of summary for a project based on given input.",
"Highlight business value of the project, its functionality, main features,",
"and use cases."
my_project <- my_project |>
set_github_repos(
# repos = c(
# "r-world-devs/GitStats",
# "r-world-devs/GitAI",
# "r-world-devs/cohortBuilder",
# "r-world-devs/shinyCohortBuilder",
# "r-world-devs/shinyQueryBuilder",
# "r-world-devs/queryBuilder",
# "r-world-devs/shinyGizmo",
# "r-world-devs/shinyTimelines",
# "openpharma/DataFakeR"
# )
orgs = c(
"insightsengineering",
"openpharma",
"pharmaverse",
"tidymodels",
"r-lib",
"rstudio",
"tidyverse"
)
) |>
add_files(c(
"DESCRIPTION",
"*.md",
"*.Rmd"
))

my_project <- my_project |>
set_prompt(r"(
Write up to ten paragraphs of summary for a project based on given input.
Be precise and to the point in your answers.
Mention core functionality and all main features of the project.
If available, mention brifly the technology used in the project
(like R, Python, etc).
If available, mention brifly if a project is an R package, shiny app,
or other type of tool.
)")

# Performs `req` with a 60-second timeout and returns the parsed JSON body.
# `provider` is accepted but not used in the body.
custom_function <- function(provider, req) {
  timed_request <- httr2::req_timeout(req, 60)
  response <- httr2::req_perform(timed_request)
  httr2::resp_body_json(response)
}
# Replace `chat_perform_value` in the elmer namespace with `custom_function`
# for this session: the binding is unlocked, overwritten, then locked again.
unlockBinding("chat_perform_value", asNamespace("elmer"))
assign("chat_perform_value", custom_function, envir = asNamespace("elmer"))
lockBinding("chat_perform_value", asNamespace("elmer"))

results <- process_repos(my_project)

results |> dplyr::glimpse()
purrr::map(results, ~.$text)






# my_project <-
# initialize_project(project_id = "gitai-demo") |>
# set_database(index = "gitai")

my_project |> find_records("I'm looking for an R package to create synthetic datasets.")

my_project |> find_records("How can I check statisting of git repositories.")

my_project |> find_records("Can I somehow extract information from code from git repositories?")

my_project |> find_records("What are the tools that can help me in my work as a Data Scientist?")
# results |> dplyr::glimpse()
# purrr::map(results, ~.$text)

my_project |>
find_records("As a Manager I look for tools which let me learn what tools can be reused in my company", top_k = 2)

my_project |> find_records("Which data products could have large impact in global company that maintains many repositories?", top_k = 3)
find_records("How can I create synthetic datasets?", top_k = 3)

my_project |> find_records("Szukam narzędzi które wykorzystują sztuczną inteligencję?")







my_chatbot <-
initialize_project("gitai-demo") |>
set_database(index = "gitai") |>
set_llm(seed = 1014, api_args = list(temperature = 0)) |>
set_prompt(paste(
"As a helpful assistant, answer user question using only the provided input."
))

get_answer <- function(my_chatbot, query) {

cat("\n")
my_chatbot$llm$chat(paste(
"User query:", query, "\n\n",
"Known input for the answer:",
my_project$db$find_records(query = query, top_k = 1)
)) |>
cat()
}

my_chatbot |>
get_answer("I'm looking for an R package to create synthetic datasets.")

my_chatbot |>
get_answer("How can I check statisting of git repositories?")

my_chatbot |>
get_answer("Can I somehow extract information from code from git repositories?")
my_project |>
find_records("How can I check statisting of git repositories.", top_k = 3)

my_chatbot |>
get_answer("I would love to use AI to process code files. Is it possible? Give me the answer writting in one sentence with very funny style.")
my_project |>
find_records("Can I somehow extract information from file content from git repositories using LLM?")

run_demo()
Loading

0 comments on commit 032efc8

Please sign in to comment.