diff --git a/chunker2.py b/chunker2.py
new file mode 100644
index 0000000..39a9059
--- /dev/null
+++ b/chunker2.py
@@ -0,0 +1,40 @@
+import openai
+import json
+import os
+
+project_root = os.getcwd()
+file_path = os.path.join(project_root, "targets",
+                         "combined", "vector_prelude.ts")
+api_key_path = os.path.join(project_root, "openai-key.txt")
+embeddings_path = os.path.join(
+    project_root, "targets", "combined", "embeddings.json")
+chunk_length = 150
+max_chunks = 10000  # Maximum number of chunks to process
+
+with open(api_key_path, "r") as file:
+    openai.api_key = file.read().strip()
+
+with open(file_path, "r") as file:
+    text = file.read()
+
+chunks = [text[i:i+chunk_length] for i in range(0, len(text), chunk_length)]
+total_chunks = len(chunks)
+print(f"Total chunks: {total_chunks}")
+
+embeddings = []
+for i, chunk in enumerate(chunks[:max_chunks], start=1):
+    print(f"Processing chunk {i}/{min(total_chunks, max_chunks)}")
+    try:
+        response = openai.embeddings.create(
+            input=chunk, model="text-embedding-ada-002")
+        print(f"API Response: {response}")  # Debugging information
+        embedding = response.data[0].embedding
+        embeddings.append({"chunk": chunk, "embedding": embedding})
+    except Exception as e:
+        print(f"Error processing chunk {i}: {e}")
+
+# Save embeddings to a JSON file
+with open(embeddings_path, "w") as file:
+    json.dump(embeddings, file)
+
+print(f"Embeddings saved to {embeddings_path}")
diff --git a/generate_rags.py b/generate_rags.py
new file mode 100644
index 0000000..1f405bc
--- /dev/null
+++ b/generate_rags.py
@@ -0,0 +1,67 @@
+import os
+import json
+import openai
+from sklearn.metrics.pairwise import cosine_similarity
+
+project_root = os.getcwd()
+api_key_path = os.path.join(project_root, "openai-key.txt")
+embeddings_path = os.path.join(
+    project_root, "targets", "combined", "embeddings.json")
+
+# Directories whose sketch.ts files need a RAG context
+directory_paths = [
+    os.path.join(project_root, "targets", "todo"),
+    os.path.join(project_root, "targets", "playlist"),
+    os.path.join(project_root, "targets", "emojipaint"),
+    os.path.join(project_root, "targets", "booking"),
+    os.path.join(project_root, "targets", "passwords")
+]
+
+with open(api_key_path, "r") as file:
+    openai.api_key = file.read().strip()
+
+
+def find_similar_chunks(header, embeddings, top_n=6):
+    # Convert the header to an embedding
+    header_embedding = openai.embeddings.create(
+        input=header, model="text-embedding-ada-002").data[0].embedding
+
+    # Calculate the cosine similarity between the header embedding and chunk embeddings
+    chunk_embeddings = [embedding["embedding"] for embedding in embeddings]
+    similarities = cosine_similarity([header_embedding], chunk_embeddings)[0]
+
+    # Get the indices of the top-n most similar chunks
+    top_indices = similarities.argsort()[-top_n:][::-1]
+
+    # Return the top-n most similar chunks
+    return [embeddings[i]["chunk"] for i in top_indices]
+
+
+# Load the pre-generated embeddings from the JSON file
+with open(embeddings_path, "r") as file:
+    embeddings = json.load(file)
+
+for directory_path in directory_paths:
+    sketch_file_path = os.path.join(directory_path, "sketch.ts")
+
+    # Check if the sketch file exists in the directory
+    if os.path.isfile(sketch_file_path):
+        with open(sketch_file_path, "r") as file:
+            header = file.read()
+
+        # Find the top-n most similar chunks
+        similar_chunks = find_similar_chunks(header, embeddings)
+
+        # Prepare the result string
+        result = ""
+        for i, chunk in enumerate(similar_chunks, start=1):
+            result += f"# SNIPPET {i} #\n{chunk}\n\n"
+
+        # Save the result to a file in the same directory
+        rag_file_path = os.path.join(directory_path, "RAG.txt")
+        with open(rag_file_path, "w") as file:
+            file.write(result)
+
+        print(f"RAG.txt file created in {directory_path}")
+    else:
+        print(f"sketch.ts file not found in {directory_path}")