add vector retrieval context generation scripts

jpoly1219 · Jul 7, 2024 · b047aa3 · b047aa3
1 parent bdc6793
commit b047aa3
Showing 1 changed file with 68 additions and 0 deletions.
diff --git a/starcoder_generate_rags.py b/starcoder_generate_rags.py
@@ -0,0 +1,68 @@
+import os
+import json
+import openai
+import numpy as np
+from sklearn.metrics.pairwise import cosine_similarity
+
+project_root = os.getcwd()
+api_key_path = os.path.join(project_root, "openai-key.txt")
+embeddings_path = os.path.join(
+    project_root, "targets", "combined", "embeddings.json")
+
+# List of directory paths
+directory_paths = [
+    os.path.join(project_root, "targets", "starcoder-todo"),
+    os.path.join(project_root, "targets", "starcoder-playlist"),
+    os.path.join(project_root, "targets", "starcoder-emojipaint"),
+    os.path.join(project_root, "targets", "starcoder-booking"),
+    os.path.join(project_root, "targets", "starcoder-passwords")
+]
+
+with open(api_key_path, "r") as file:
+    openai.api_key = file.read().strip()
+
+
+def find_similar_chunks(header, embeddings, top_n=6):
+    # Convert the header to an embedding
+    header_embedding = openai.embeddings.create(
+        input=header, model="text-embedding-ada-002").data[0].embedding
+
+    # Calculate the cosine similarity between the header embedding and chunk embeddings
+    chunk_embeddings = [embedding["embedding"] for embedding in embeddings]
+    similarities = cosine_similarity([header_embedding], chunk_embeddings)[0]
+
+    # Get the indices of the top-n most similar chunks
+    top_indices = similarities.argsort()[-top_n:][::-1]
+
+    # Return the top-n most similar chunks
+    return [embeddings[i]["chunk"] for i in top_indices]
+
+
+# Load the pre-generated embeddings from the JSON file
+with open(embeddings_path, "r") as file:
+    embeddings = json.load(file)
+
+for directory_path in directory_paths:
+    sketch_file_path = os.path.join(directory_path, "sketch.ts")
+
+    # Check if the sketch file exists in the directory
+    if os.path.isfile(sketch_file_path):
+        with open(sketch_file_path, "r") as file:
+            header = file.read()
+
+        # Find the top-n most similar chunks
+        similar_chunks = find_similar_chunks(header, embeddings)
+
+        # Prepare the result string
+        result = ""
+        for i, chunk in enumerate(similar_chunks, start=1):
+            result += f"// SNIPPET {i}\n{chunk}\n\n"
+
+            # Save the result to a file in the same directory
+        rag_file_path = os.path.join(directory_path, "RAG.txt")
+        with open(rag_file_path, "w") as file:
+            file.write(result)
+
+        print(f"RAG.txt file created in {directory_path}")
+    else:
+        print(f"sketch.ts file not found in {directory_path}")