-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
add vector retrieval context generation scripts
- Loading branch information
Showing
2 changed files
with
108 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
import openai
import json
import os

# Generate embeddings for fixed-size character chunks of a TypeScript
# prelude file and persist them to a JSON file for later retrieval.
project_root = os.getcwd()
file_path = os.path.join(project_root, "targets",
                         "combined", "vector_prelude.ts")
api_key_path = os.path.join(project_root, "openai-key.txt")
embeddings_path = os.path.join(
    project_root, "targets", "combined", "embeddings.json")
chunk_length = 150  # characters per chunk
max_chunks = 10000  # Maximum number of chunks to process

# Authenticate using the locally stored API key.
with open(api_key_path, "r") as file:
    openai.api_key = file.read().strip()

with open(file_path, "r") as file:
    text = file.read()

# Split the source text into fixed-length character chunks.
chunks = [text[i:i+chunk_length] for i in range(0, len(text), chunk_length)]
total_chunks = len(chunks)
print(f"Total chunks: {total_chunks}")

# Fix: report progress against the number of chunks actually processed,
# not the hard cap (previously printed e.g. "1/10000" for a 40-chunk file).
num_to_process = min(total_chunks, max_chunks)

embeddings = []
for i, chunk in enumerate(chunks[:max_chunks], start=1):
    print(f"Processing chunk {i}/{num_to_process}")
    try:
        response = openai.embeddings.create(
            input=chunk, model="text-embedding-ada-002")
        # Fix: removed the debug print of the full API response, which
        # dumped the entire embedding vector to stdout for every chunk.
        embedding = response.data[0].embedding
        embeddings.append({"chunk": chunk, "embedding": embedding})
    except Exception as e:
        # Best-effort: log and continue so one failed chunk does not
        # abort the whole run.
        print(f"Error processing chunk {i}: {str(e)}")

# Save embeddings to a JSON file
with open(embeddings_path, "w") as file:
    json.dump(embeddings, file)

print("Embeddings saved to " + embeddings_path)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
import os | ||
import json | ||
import openai | ||
import numpy as np | ||
from sklearn.metrics.pairwise import cosine_similarity | ||
|
||
# Resolve all input/output locations relative to the working directory.
project_root = os.getcwd()
api_key_path = os.path.join(project_root, "openai-key.txt")
embeddings_path = os.path.join(
    project_root, "targets", "combined", "embeddings.json")

# Target directories that may contain a sketch.ts to build context for.
_target_names = ["todo", "playlist", "emojipaint", "booking", "passwords"]
directory_paths = [
    os.path.join(project_root, "targets", name) for name in _target_names
]

# Authenticate the OpenAI client with the locally stored key.
with open(api_key_path, "r") as key_file:
    openai.api_key = key_file.read().strip()
|
||
|
||
def find_similar_chunks(header, embeddings, top_n=6):
    """Return the top_n stored chunks most similar to the given header.

    The header text is embedded with the same model used to build the
    stored embeddings, then every stored chunk is ranked by cosine
    similarity against it.
    """
    # Embed the query header with the same model as the stored chunks.
    query = openai.embeddings.create(
        input=header, model="text-embedding-ada-002")
    header_embedding = query.data[0].embedding

    # Score every stored chunk embedding against the header embedding.
    matrix = [entry["embedding"] for entry in embeddings]
    scores = cosine_similarity([header_embedding], matrix)[0]

    # Indices of the highest-scoring chunks, best first.
    ranked = scores.argsort()[::-1][:top_n]

    return [embeddings[idx]["chunk"] for idx in ranked]
|
||
|
||
# Load the pre-generated chunk embeddings once, up front.
with open(embeddings_path, "r") as emb_file:
    embeddings = json.load(emb_file)

for directory_path in directory_paths:
    sketch_file_path = os.path.join(directory_path, "sketch.ts")

    # Guard clause: skip directories without a sketch to build context for.
    if not os.path.isfile(sketch_file_path):
        print(f"sketch.ts file not found in {directory_path}")
        continue

    with open(sketch_file_path, "r") as sketch_file:
        header = sketch_file.read()

    # Rank the stored chunks by similarity to this sketch's contents.
    similar_chunks = find_similar_chunks(header, embeddings)

    # Assemble the numbered snippets into a single RAG context string.
    result = "".join(
        f"# SNIPPET {i} #\n{chunk}\n\n"
        for i, chunk in enumerate(similar_chunks, start=1)
    )

    # Write the context next to the sketch it was generated for.
    rag_file_path = os.path.join(directory_path, "RAG.txt")
    with open(rag_file_path, "w") as rag_file:
        rag_file.write(result)

    print(f"RAG.txt file created in {directory_path}")