-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: add notebook to transform products.jsonl into LLM-embedding-ready CSV
- Loading branch information
Showing
1 changed file
with
251 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,251 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"## JSONL Filters and Transformations" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 46, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"4043\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"import pandas as pd\n", | ||
"import jsonlines\n", | ||
"import re\n", | ||
"\n", | ||
# Compiled once at module level: the original recompiled the pattern on every
# call, which is redundant work (re's internal cache mitigates it, but an
# explicit constant is clearer and faster).
_EMOJI_RE = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # Emoticons
    "\U0001F300-\U0001F5FF"  # Miscellaneous Symbols and Pictographs
    "\U0001F680-\U0001F6FF"  # Transport and Map Symbols
    "\U0001F700-\U0001F77F"  # Alchemical Symbols
    "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
    "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
    "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
    "\U0001FA00-\U0001FA6F"  # Chess Symbols
    "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
    "\U00002705"             # White Heavy Check Mark only — a single code point.
                             # NOTE(review): other U+2600-U+27BF symbols pass
                             # through unfiltered; confirm that is intended.
    "]+",
    flags=re.UNICODE,
)

def remove_emojis(text):
    """Return ``text`` with emoji / pictograph code points removed.

    Parameters
    ----------
    text : str
        Input string (callers pre-convert with ``str(val)``).

    Returns
    -------
    str
        ``text`` stripped of every character matched by ``_EMOJI_RE``.
    """
    return _EMOJI_RE.sub('', text)
"\n", | ||
def clean_description(val):
    """Normalise a metadata field to an emoji-free string.

    List values are flattened into one space-joined string first;
    null-like values (per ``pd.notnull``) are returned untouched.
    """
    if isinstance(val, list):
        # Flatten the list elements into a single string.
        val = ' '.join(val)
    if pd.notnull(val):
        return remove_emojis(str(val))
    return val
"\n", | ||
def process_amazon_reviews(meta_file, output_file, min_rating=4.5, min_ratings=200):
    """Convert an Amazon "meta" JSONL file into a cleaned, embedding-ready CSV.

    Keeps products with a non-empty description and feature list, an average
    rating above ``min_rating`` and more than ``min_ratings`` ratings; strips
    emojis and curly apostrophes from the text columns; writes the result to
    ``output_file`` and prints the number of rows kept.

    Parameters
    ----------
    meta_file : str
        Path to the input ``meta_*.jsonl`` file.
    output_file : str
        Path of the CSV file to write.
    min_rating : float, optional
        Exclusive lower bound on ``average_rating`` (default 4.5, the value
        previously hard-coded).
    min_ratings : int, optional
        Exclusive lower bound on ``rating_number`` (default 200, previously
        hard-coded).
    """
    # Read the "meta" JSONL file and convert it to a DataFrame.
    with jsonlines.open(meta_file) as reader:
        df = pd.DataFrame(list(reader))

    # Drop rows whose 'description'/'features' are empty.  The hasattr guard
    # keeps this from raising TypeError when a field is NaN (a float).
    def _nonempty(x):
        return hasattr(x, '__len__') and len(x) != 0

    df = df[df['description'].apply(_nonempty)]
    df = df[df['features'].apply(_nonempty)]

    # Keep only the columns needed downstream.
    df = df[['parent_asin', 'average_rating', 'rating_number',
             'main_category', 'title', 'description', 'features']]

    # Quality / popularity filters.
    df = df[df['average_rating'].astype(float) > min_rating]
    df = df[df['rating_number'].astype(int) > min_ratings]

    # Clean each text column: join lists, strip emojis, normalise apostrophes.
    for col in ('title', 'description', 'features'):
        df[col] = df[col].apply(clean_description)
        df[col] = df[col].str.replace('’', "'")

    # Save the final DataFrame to a CSV file.
    df.to_csv(output_file, index=False)

    print(len(df))

# Example function call
process_amazon_reviews('Data/Input/meta_Video_Games.jsonl', 'Data/Output/meta_Video_Games.csv')
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"### Special case for Clothing dataset. Takes only one object out of every 3\n", | ||
"Very big dataset" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 38, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"2933\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"import pandas as pd\n", | ||
"import jsonlines\n", | ||
"import re\n", | ||
"\n", | ||
"\n", | ||
# NOTE(review): this duplicates the remove_emojis defined in the first cell —
# consider moving the helper to a shared .py module and importing it.
# Compiled once at module level instead of on every call.
_EMOJI_RE = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # Emoticons
    "\U0001F300-\U0001F5FF"  # Miscellaneous Symbols and Pictographs
    "\U0001F680-\U0001F6FF"  # Transport and Map Symbols
    "\U0001F700-\U0001F77F"  # Alchemical Symbols
    "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
    "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
    "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
    "\U0001FA00-\U0001FA6F"  # Chess Symbols (was mislabelled)
    "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A (was mislabelled)
    "\U00002705"             # White Heavy Check Mark (single code point)
    "]+",
    flags=re.UNICODE,
)

def remove_emojis(text):
    """Return ``text`` with emoji / pictograph code points removed.

    Parameters
    ----------
    text : str
        Input string (callers pre-convert with ``str(val)``).

    Returns
    -------
    str
        ``text`` stripped of every character matched by ``_EMOJI_RE``.
    """
    return _EMOJI_RE.sub('', text)
"\n", | ||
def clean_description(val):
    """Flatten list values to one string and strip emojis.

    Null-like values (per ``pd.notnull``) are passed through unchanged.
    """
    text = ' '.join(val) if isinstance(val, list) else val
    return remove_emojis(str(text)) if pd.notnull(text) else text
"\n", | ||
"\n", | ||
def process_amazon_reviews(meta_file, output_file, sample_every=3,
                           min_rating=4.5, min_ratings=2000):
    """Convert a (very large) Amazon "meta" JSONL file into a cleaned CSV.

    Subsamples the input (one record in every ``sample_every``), keeps
    products with a non-empty description and feature list, an average rating
    above ``min_rating`` and more than ``min_ratings`` ratings; strips emojis
    and curly apostrophes from the text columns; writes the result to
    ``output_file`` and prints the number of rows kept.

    Parameters
    ----------
    meta_file : str
        Path to the input ``meta_*.jsonl`` file.
    output_file : str
        Path of the CSV file to write.
    sample_every : int, optional
        Keep one record out of every ``sample_every`` (default 3, the value
        previously hard-coded for the huge Clothing dataset).
    min_rating : float, optional
        Exclusive lower bound on ``average_rating`` (default 4.5).
    min_ratings : int, optional
        Exclusive lower bound on ``rating_number`` (default 2000).
    """
    # Read the "meta" JSONL file, keeping only every sample_every-th record.
    with jsonlines.open(meta_file) as reader:
        records = [obj for i, obj in enumerate(reader) if i % sample_every == 0]

    df = pd.DataFrame(records)

    # Drop rows whose 'description'/'features' are empty.  The hasattr guard
    # keeps this from raising TypeError when a field is NaN (a float).
    def _nonempty(x):
        return hasattr(x, '__len__') and len(x) != 0

    df = df[df['description'].apply(_nonempty)]
    df = df[df['features'].apply(_nonempty)]

    # Keep only the columns needed downstream.
    df = df[['parent_asin', 'average_rating', 'rating_number',
             'main_category', 'title', 'description', 'features']]

    # Quality / popularity filters.
    df = df[df['average_rating'].astype(float) > min_rating]
    df = df[df['rating_number'].astype(int) > min_ratings]

    # Clean each text column: join lists, strip emojis, normalise apostrophes.
    for col in ('title', 'description', 'features'):
        df[col] = df[col].apply(clean_description)
        df[col] = df[col].str.replace('’', "'")

    # Save the final DataFrame as a CSV file.
    df.to_csv(output_file, index=False)

    print(len(df))


# Example call to the function
process_amazon_reviews('Data/Input/meta/meta_Clothing_Shoes_and_Jewelry.jsonl', 'Data/Output/meta_Clothing_Shoes_and_Jewelry.csv')
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"## Concatenation of final csv" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 50, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"All CSV files have been concatenated into 'Output/concatenated_28657_products_list.csv'\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
import os
import pandas as pd

# Folder containing the per-category CSV files produced by the cells above.
csv_folder = 'Data/Output/'
# Output directory for the final, embedding-ready file.
output_dir = 'Data/Output/LLM_Ready'

# List all CSV files in the folder (subdirectories are excluded by suffix).
csv_files = [f for f in os.listdir(csv_folder) if f.endswith('.csv')]

# Fail with a clear message when there is nothing to concatenate:
# pd.concat([]) would otherwise raise an unhelpful ValueError.
if not csv_files:
    raise FileNotFoundError(f"No CSV files found in {csv_folder!r}")

# Read and concatenate all the CSV files.
df_list = [pd.read_csv(os.path.join(csv_folder, f)) for f in csv_files]
final_df = pd.concat(df_list, ignore_index=True)

# Make sure the output directory exists, then write the result.  Building the
# path once keeps the saved file and the printed message in sync (they had
# already drifted apart in an earlier run of this cell).
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(
    output_dir, f'concatenated_{len(final_df)}_products_list.csv'
)
final_df.to_csv(output_path, index=False)

# Print message confirming the concatenation.
print(f"All CSV files have been concatenated into '{output_path}'")
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "genenv", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.12.7" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |