
Commit

Medkallel committed Oct 14, 2024
2 parents 9c68e55 + 72bf020 commit 34c8f56
Showing 2 changed files with 257 additions and 4 deletions.
251 changes: 251 additions & 0 deletions Data/create_llm_ready_csv.ipynb
@@ -0,0 +1,251 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## JSONL Filters and Transformations"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"4043\n"
]
}
],
"source": [
"import pandas as pd\n",
"import jsonlines\n",
"import re\n",
"\n",
"def remove_emojis(text):\n",
" # Regular expression to detect emojis and other special symbols\n",
" emoji_pattern = re.compile(\n",
" \"[\"\n",
" \"\\U0001F600-\\U0001F64F\" # Emoticons\n",
" \"\\U0001F300-\\U0001F5FF\" # Miscellaneous Symbols and Pictographs\n",
" \"\\U0001F680-\\U0001F6FF\" # Transport and Map Symbols\n",
" \"\\U0001F700-\\U0001F77F\" # Alchemical Symbols\n",
" \"\\U0001F780-\\U0001F7FF\" # Geometric Shapes Extended\n",
" \"\\U0001F800-\\U0001F8FF\" # Supplemental Arrows-C\n",
" \"\\U0001F900-\\U0001F9FF\" # Supplemental Symbols and Pictographs\n",
" \"\\U0001FA00-\\U0001FA6F\" # Chess Symbols\n",
" \"\\U0001FA70-\\U0001FAFF\" # Symbols and Pictographs Extended-A\n",
" \"\\U00002705\"\n",
" \"]+\", flags=re.UNICODE\n",
" )\n",
" return emoji_pattern.sub(r'', text)\n",
"\n",
"# Function to join lists into a single string and then remove emojis\n",
"def clean_description(val):\n",
" if isinstance(val, list):\n",
" val = ' '.join(val) # Join the list elements into a single string\n",
" return remove_emojis(str(val)) if pd.notnull(val) else val\n",
"\n",
"def process_amazon_reviews(meta_file, output_file):\n",
" # Read the \"meta\" JSONL file and convert it to a DataFrame\n",
" data_meta = []\n",
" with jsonlines.open(meta_file) as reader:\n",
" for obj in reader:\n",
" data_meta.append(obj)\n",
"\n",
" df = pd.DataFrame(data_meta)\n",
"\n",
" # Filter rows where 'description' is empty\n",
" df = df[df['description'].apply(lambda x: len(x) != 0)]\n",
" df = df[df['features'].apply(lambda x: len(x) != 0)]\n",
"\n",
" # Keep only the necessary columns\n",
" df = df[['parent_asin', 'average_rating', 'rating_number', 'main_category', 'title', 'description', 'features']]\n",
"\n",
" # Filter reviews with a rating > 4.5\n",
" df = df[df['average_rating'].astype(float) > 4.5]\n",
"\n",
" # Filter reviews with a rating number > 200\n",
" df = df[df['rating_number'].astype(int) > 200]\n",
"\n",
" # Apply the cleaning function to the 'description' column\n",
" df['title'] = df['title'].apply(clean_description)\n",
" df['description'] = df['description'].apply(clean_description)\n",
" df['features'] = df['features'].apply(clean_description)\n",
"\n",
" # Replace curly apostrophes with straight ones in text fields\n",
" df['title'] = df['title'].str.replace('’', \"'\")\n",
" df['description'] = df['description'].str.replace('’', \"'\")\n",
" df['features'] = df['features'].str.replace('’', \"'\")\n",
"\n",
" # Save the final DataFrame to a CSV file\n",
" df.to_csv(output_file, index=False)\n",
"\n",
" print(len(df))\n",
"\n",
"# Example function call\n",
"process_amazon_reviews('Data/Input/meta_Video_Games.jsonl', 'Data/Output/meta_Video_Games.csv')\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Special case for Clothing dataset. Takes only one object out of every 3\n",
"Very big dataset"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2933\n"
]
}
],
"source": [
"import pandas as pd\n",
"import jsonlines\n",
"import re\n",
"\n",
"\n",
"def remove_emojis(text):\n",
" # Regular expression to detect emojis and other special symbols\n",
" emoji_pattern = re.compile(\n",
" \"[\" \n",
" \"\\U0001F600-\\U0001F64F\" # Emoticons\n",
" \"\\U0001F300-\\U0001F5FF\" # Miscellaneous Symbols and Pictographs\n",
" \"\\U0001F680-\\U0001F6FF\" # Transport and Map Symbols\n",
" \"\\U0001F700-\\U0001F77F\" # Alchemical Symbols\n",
" \"\\U0001F780-\\U0001F7FF\" # Geometric Shapes Extended\n",
" \"\\U0001F800-\\U0001F8FF\" # Supplemental Arrows-C\n",
" \"\\U0001F900-\\U0001F9FF\" # Supplemental Symbols and Pictographs\n",
" \"\\U0001FA00-\\U0001FA6F\" # Symbols and Pictographs Extended-A\n",
" \"\\U0001FA70-\\U0001FAFF\" # Supplemental Objects\n",
" \"\\U00002705\" # Miscellaneous Check Mark\n",
" \"]+\", flags=re.UNICODE\n",
" )\n",
" return emoji_pattern.sub(r'', text)\n",
"\n",
"# Function to join lists into a string and then remove emojis\n",
"def clean_description(val):\n",
" if isinstance(val, list):\n",
" val = ' '.join(val) # Join the elements of the list into a single string\n",
" return remove_emojis(str(val)) if pd.notnull(val) else val\n",
"\n",
"\n",
"def process_amazon_reviews(meta_file, output_file):\n",
" # Read the \"meta\" JSONL file and convert it into a DataFrame\n",
" data_meta = []\n",
" with jsonlines.open(meta_file) as reader:\n",
" for i, obj in enumerate(reader):\n",
" if i % 3 == 0: # Take only one object out of every 3\n",
" data_meta.append(obj)\n",
"\n",
" df = pd.DataFrame(data_meta)\n",
"\n",
" # Filter rows where the description is empty\n",
" df = df[df['description'].apply(lambda x: len(x) != 0)]\n",
" df = df[df['features'].apply(lambda x: len(x) != 0)]\n",
" \n",
" # Keep only the necessary columns\n",
" df = df[['parent_asin', 'average_rating', 'rating_number', 'main_category', 'title', 'description', 'features']]\n",
"\n",
" # Filter reviews with an average rating > 4.5\n",
" df = df[df['average_rating'].astype(float) > 4.5]\n",
"\n",
" # Filter reviews with a rating number > 2000\n",
" df = df[df['rating_number'].astype(int) > 2000]\n",
"\n",
" # Apply the cleaning function to 'title', 'description', and 'features' columns\n",
" df['title'] = df['title'].apply(clean_description)\n",
" df['description'] = df['description'].apply(clean_description)\n",
" df['features'] = df['features'].apply(clean_description)\n",
"\n",
" # Replace special apostrophe characters in 'title', 'description', and 'features'\n",
" df['title'] = df['title'].str.replace('’', \"'\")\n",
" df['description'] = df['description'].str.replace('’', \"'\")\n",
" df['features'] = df['features'].str.replace('’', \"'\")\n",
" \n",
" # Save the final DataFrame as a CSV file\n",
" df.to_csv(output_file, index=False)\n",
"\n",
" print(len(df))\n",
"\n",
"\n",
"# Example call to the function\n",
"process_amazon_reviews('Data/Input/meta/meta_Clothing_Shoes_and_Jewelry.jsonl', 'Data/Output/meta_Clothing_Shoes_and_Jewelry.csv')\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Concatenation of final csv"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"All CSV files have been concatenated into 'Output/concatenated_28657_products_list.csv'\n"
]
}
],
"source": [
"import os\n",
"import pandas as pd\n",
"\n",
"# Path to the folder containing the CSV files\n",
"csv_folder = 'Data/Output/'\n",
"\n",
"# List all files in the folder\n",
"csv_files = [f for f in os.listdir(csv_folder) if f.endswith('.csv')]\n",
"\n",
"# Read and concatenate all the CSV files\n",
"df_list = [pd.read_csv(os.path.join(csv_folder, file)) for file in csv_files]\n",
"final_df = pd.concat(df_list, ignore_index=True)\n",
"\n",
"# Save the final DataFrame into a single CSV file\n",
"final_df.to_csv(f'Data/Output/LLM_Ready/concatenated_{len(final_df)}_products_list.csv', index=False)\n",
"\n",
"# Print message confirming the concatenation\n",
"print(f\"All CSV files have been concatenated into 'Data/Output/LLM_Ready/concatenated_{len(final_df)}_products_list.csv'\")\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "genenv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
10 changes: 6 additions & 4 deletions README.md
@@ -63,7 +63,7 @@ A **presentation** is available as a **PDF** file in the repo `Gift_Recommendati

## Notebooks Overview

-1. **DataPreprocessing.ipynb**:
+1. **create_llm_ready_csv.ipynb**:
- Used to preprocess the Amazon Dataset and extract products with rich text features for the demo.

---
@@ -82,8 +82,11 @@ $ cd Gift-Recommendation-ChatBot
```sh
$ pip install -r requirements.txt
```
+3. Download the Chroma VecStore & upload it to your Dropbox App:
+[Chroma Store Download Link](https://drive.google.com/drive/folders/1zateQgEBTLoUUb5tSdLyBJIF57wermxC?usp=drive_link)

> [!IMPORTANT]
-> Ensure you have the necessary API keys for TogetherAI set up in a `.env` file.
+> Ensure you have the necessary API keys for TogetherAI and Dropbox set up in the `secrets.toml` file.
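> For reference, a minimal `.streamlit/secrets.toml` might look like the sketch below. The key names shown are placeholders for illustration, not confirmed from the app code; check what the app actually reads via `st.secrets`.

```toml
# Hypothetical layout for .streamlit/secrets.toml — key names are assumptions.
TOGETHER_API_KEY = "your-together-ai-api-key"
DROPBOX_ACCESS_TOKEN = "your-dropbox-access-token"
```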
---

@@ -106,15 +109,14 @@ $ streamlit run src/Gift\ Recommendation\ Bot\ 🎁.py
├── 📁__pycache__/
├── 📁chroma_vectorstore/ # Contains the vector store
├── 📁Data/ # Contains the dataset
-├── └── 📓Data_Preprocessing.ipynb
+├── └── 📓create_llm_ready_csv.ipynb
├── 📁src/
│ ├── 🐍Gift Recommendation Bot 🎁.py
│ ├── 📁pages/
│ │ └── 🐍Products Catalogue ⚙️.py
├── 📁tmp/ # Used to store temporary csv file for data embedding
├── 📁.streamlit/
│ └── 🔑secrets.toml # Used to store api Keys for running locally
-├── 📄.env
├── 📄.gitignore
├── 📄README.md
├── 📄requirements.txt
