diff --git "a/src/pages/Products_Catalogue_\342\232\231\357\270\217.py" "b/src/pages/Products_Catalogue_\342\232\231\357\270\217.py" index c7358ac..bb848e1 100644 --- "a/src/pages/Products_Catalogue_\342\232\231\357\270\217.py" +++ "b/src/pages/Products_Catalogue_\342\232\231\357\270\217.py" @@ -1,6 +1,6 @@ __import__("pysqlite3") import sys - +import time sys.modules["sqlite3"] = sys.modules.pop("pysqlite3") import pysqlite3 import os @@ -42,12 +42,42 @@ def load_and_store_embedded_documents( embeddings = TogetherEmbeddings( model=embeddings_model_name, api_key=st.secrets["TOGETHER_API_KEY"] ) - # Vectorstore - vectorstore = Chroma.from_documents( - documents=data, - embedding=embeddings, - persist_directory=vecstore_persist_directory, - ) + # Initialize the progress bar and status text + progress_bar = st.progress(0) + status_text = st.empty() + + # Track the total number of documents + total_docs = len(data) + start_time = time.time() + + for idx, doc in enumerate(data): + # Process each document one at a time + vectorstore = Chroma.from_documents( + documents=[doc], # Pass a single document as a list + embedding=embeddings, + persist_directory=VECSTORE_PERSIST_DIRECTORY, + ) + + # Update the progress bar + progress_bar.progress((idx + 1) / total_docs) + + # Calculate elapsed time and documents per second + elapsed_time = time.time() - start_time + docs_per_sec = (idx + 1) / elapsed_time + remaining_docs = total_docs - (idx + 1) + estimated_time_remaining = remaining_docs / docs_per_sec if docs_per_sec > 0 else float('inf') + + # Convert estimated time remaining to hh:mm:ss format + hrs, rem = divmod(estimated_time_remaining, 3600) + mins, secs = divmod(rem, 60) + time_remaining_str = f"{int(hrs):02}:{int(mins):02}:{int(secs):02}" + + # Update the status text + status_text.text( + f"Processing document {idx + 1}/{total_docs} " + f"({docs_per_sec:.2f} docs/sec, " + f"Estimated time remaining: {time_remaining_str})" + ) return vectorstore