Skip to content

Commit

Permalink
add vector retrieval context generation scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
jpoly1219 committed Jul 7, 2024
1 parent bdc6793 commit b047aa3
Showing 1 changed file with 68 additions and 0 deletions.
68 changes: 68 additions & 0 deletions starcoder_generate_rags.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
import os
import json
import openai
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

project_root = os.getcwd()
api_key_path = os.path.join(project_root, "openai-key.txt")
embeddings_path = os.path.join(
project_root, "targets", "combined", "embeddings.json")

# List of directory paths
directory_paths = [
os.path.join(project_root, "targets", "starcoder-todo"),
os.path.join(project_root, "targets", "starcoder-playlist"),
os.path.join(project_root, "targets", "starcoder-emojipaint"),
os.path.join(project_root, "targets", "starcoder-booking"),
os.path.join(project_root, "targets", "starcoder-passwords")
]

with open(api_key_path, "r") as file:
openai.api_key = file.read().strip()


def find_similar_chunks(header, embeddings, top_n=6):
# Convert the header to an embedding
header_embedding = openai.embeddings.create(
input=header, model="text-embedding-ada-002").data[0].embedding

# Calculate the cosine similarity between the header embedding and chunk embeddings
chunk_embeddings = [embedding["embedding"] for embedding in embeddings]
similarities = cosine_similarity([header_embedding], chunk_embeddings)[0]

# Get the indices of the top-n most similar chunks
top_indices = similarities.argsort()[-top_n:][::-1]

# Return the top-n most similar chunks
return [embeddings[i]["chunk"] for i in top_indices]


# Load the pre-generated embeddings from the JSON file
with open(embeddings_path, "r") as file:
embeddings = json.load(file)

for directory_path in directory_paths:
sketch_file_path = os.path.join(directory_path, "sketch.ts")

# Check if the sketch file exists in the directory
if os.path.isfile(sketch_file_path):
with open(sketch_file_path, "r") as file:
header = file.read()

# Find the top-n most similar chunks
similar_chunks = find_similar_chunks(header, embeddings)

# Prepare the result string
result = ""
for i, chunk in enumerate(similar_chunks, start=1):
result += f"// SNIPPET {i}\n{chunk}\n\n"

# Save the result to a file in the same directory
rag_file_path = os.path.join(directory_path, "RAG.txt")
with open(rag_file_path, "w") as file:
file.write(result)

print(f"RAG.txt file created in {directory_path}")
else:
print(f"sketch.ts file not found in {directory_path}")

0 comments on commit b047aa3

Please sign in to comment.