-
Notifications
You must be signed in to change notification settings - Fork 33
/
Copy pathvector_db.py
107 lines (92 loc) · 4.34 KB
/
vector_db.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
# Set environment variable before imports
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)
import logging
from langchain_chroma import Chroma
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
import chromadb
# Set up logging
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
class VectorStore:
def __init__(self, model_name="all-MiniLM-L6-v2", persist_directory="./chroma_db", dry_run=False):
"""Initialize the vector store with the specified embedding model."""
self.dry_run = dry_run
logging.debug(f"[__init__] Initializing VectorStore with model: {model_name}, dry_run: {self.dry_run}")
self.embedding_function = HuggingFaceEmbeddings(model_name=model_name)
self.persist_directory = persist_directory
if not self.dry_run:
# Try to load existing DB
if os.path.exists(persist_directory):
try:
self.db = Chroma(
persist_directory=persist_directory,
embedding_function=self.embedding_function,
client_settings=chromadb.config.Settings(
anonymized_telemetry=False,
is_persistent=True
)
)
logging.info("[__init__] Loaded existing vector store from disk")
except Exception as e:
logging.error(f"[__init__] Error loading existing vector store: {e}")
self.db = None
else:
self.db = None
else:
self.db = None
logging.info("[__init__] Dry run mode - skipping DB load")
def initialize_from_pdf(self, pdf_path, chunk_size=300, chunk_overlap=50):
"""Initialize vector DB with PDF content."""
if self.dry_run:
logging.info(f"[initialize_from_pdf] Dry run mode - skipping PDF initialization for {pdf_path}")
return True
logging.debug(f"[initialize_from_pdf] Loading PDF: {pdf_path}")
try:
loader = PyPDFLoader(pdf_path)
documents = loader.load()
logging.debug(f"[initialize_from_pdf] Loaded {len(documents)} pages from PDF")
text_splitter = CharacterTextSplitter(
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
separator="\n"
)
docs = text_splitter.split_documents(documents)
logging.debug(f"[initialize_from_pdf] Split into {len(docs)} chunks")
self.db = Chroma.from_documents(
documents=docs,
embedding=self.embedding_function,
persist_directory=self.persist_directory,
client_settings=chromadb.config.Settings(
anonymized_telemetry=False,
is_persistent=True
)
)
logging.debug("[initialize_from_pdf] Vector store initialized successfully")
return True
except Exception as e:
logging.error(f"[initialize_from_pdf] Error initializing vector store: {e}")
return False
def search(self, query, n_results=3):
"""Search the vector store for relevant documents."""
if self.dry_run:
logging.info(f"[search] Dry run mode - returning mock results for query: {query}")
return "Mock result: This is a dry run response."
logging.debug(f"[search] Searching for: {query}")
try:
if not self.db:
logging.error("[search] Vector store not initialized")
return ""
docs = self.db.similarity_search(query, k=n_results)
context = "\n".join(doc.page_content for doc in docs)
logging.debug(f"[search] Found {len(docs)} relevant documents")
return context
except Exception as e:
logging.error(f"[search] Error searching vector store: {e}")
return ""
def __del__(self):
"""Cleanup when the object is destroyed."""
logging.debug("[__del__] Vector store cleanup complete")