
Commit

add vector retrieval context generation scripts
jpoly1219 committed Jul 7, 2024
1 parent 0ab1e59 commit bdc6793
Showing 2 changed files with 108 additions and 0 deletions.
40 changes: 40 additions & 0 deletions chunker2.py
@@ -0,0 +1,40 @@
import openai
import json
import os

project_root = os.getcwd()
file_path = os.path.join(project_root, "targets",
                         "combined", "vector_prelude.ts")
api_key_path = os.path.join(project_root, "openai-key.txt")
embeddings_path = os.path.join(
    project_root, "targets", "combined", "embeddings.json")
chunk_length = 150
max_chunks = 10000  # Maximum number of chunks to process

with open(api_key_path, "r") as file:
    openai.api_key = file.read().strip()

with open(file_path, "r") as file:
    text = file.read()

# Split the source file into fixed-length character chunks
chunks = [text[i:i+chunk_length] for i in range(0, len(text), chunk_length)]
total_chunks = len(chunks)
print(f"Total chunks: {total_chunks}")

embeddings = []
for i, chunk in enumerate(chunks[:max_chunks], start=1):
    print(f"Processing chunk {i}/{min(total_chunks, max_chunks)}")
    try:
        response = openai.embeddings.create(
            input=chunk, model="text-embedding-ada-002")
        print(f"API Response: {response}")  # Debugging information
        embedding = response.data[0].embedding
        embeddings.append({"chunk": chunk, "embedding": embedding})
    except Exception as e:
        print(f"Error processing chunk {i}: {e}")

# Save embeddings to a JSON file
with open(embeddings_path, "w") as file:
    json.dump(embeddings, file)

print(f"Embeddings saved to {embeddings_path}")
68 changes: 68 additions & 0 deletions generate_rags.py
@@ -0,0 +1,68 @@
import os
import json
import openai
from sklearn.metrics.pairwise import cosine_similarity

project_root = os.getcwd()
api_key_path = os.path.join(project_root, "openai-key.txt")
embeddings_path = os.path.join(
    project_root, "targets", "combined", "embeddings.json")

# List of target directory paths
directory_paths = [
    os.path.join(project_root, "targets", "todo"),
    os.path.join(project_root, "targets", "playlist"),
    os.path.join(project_root, "targets", "emojipaint"),
    os.path.join(project_root, "targets", "booking"),
    os.path.join(project_root, "targets", "passwords")
]

with open(api_key_path, "r") as file:
    openai.api_key = file.read().strip()


def find_similar_chunks(header, embeddings, top_n=6):
    # Convert the header to an embedding
    header_embedding = openai.embeddings.create(
        input=header, model="text-embedding-ada-002").data[0].embedding

    # Calculate the cosine similarity between the header embedding and each chunk embedding
    chunk_embeddings = [embedding["embedding"] for embedding in embeddings]
    similarities = cosine_similarity([header_embedding], chunk_embeddings)[0]

    # Get the indices of the top-n most similar chunks, most similar first
    top_indices = similarities.argsort()[-top_n:][::-1]

    # Return the top-n most similar chunks
    return [embeddings[i]["chunk"] for i in top_indices]


# Load the pre-generated embeddings from the JSON file
with open(embeddings_path, "r") as file:
    embeddings = json.load(file)

for directory_path in directory_paths:
    sketch_file_path = os.path.join(directory_path, "sketch.ts")

    # Check if the sketch file exists in the directory
    if os.path.isfile(sketch_file_path):
        with open(sketch_file_path, "r") as file:
            header = file.read()

        # Find the top-n most similar chunks
        similar_chunks = find_similar_chunks(header, embeddings)

        # Prepare the result string
        result = ""
        for i, chunk in enumerate(similar_chunks, start=1):
            result += f"# SNIPPET {i} #\n{chunk}\n\n"

        # Save the result to a file in the same directory
        rag_file_path = os.path.join(directory_path, "RAG.txt")
        with open(rag_file_path, "w") as file:
            file.write(result)

        print(f"RAG.txt file created in {directory_path}")
    else:
        print(f"sketch.ts file not found in {directory_path}")
