
Commit

Medkallel committed Oct 14, 2024
2 parents 9c68e55 + 72bf020 commit 34c8f56
Showing 2 changed files with 257 additions and 4 deletions.
251 changes: 251 additions & 0 deletions Data/create_llm_ready_csv.ipynb
@@ -0,0 +1,251 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## JSONL Filters and Transformations"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"4043\n"
]
}
],
"source": [
"import pandas as pd\n",
"import jsonlines\n",
"import re\n",
"\n",
"def remove_emojis(text):\n",
" # Regular expression to detect emojis and other special symbols\n",
" emoji_pattern = re.compile(\n",
" \"[\"\n",
" \"\\U0001F600-\\U0001F64F\" # Emoticons\n",
" \"\\U0001F300-\\U0001F5FF\" # Miscellaneous Symbols and Pictographs\n",
" \"\\U0001F680-\\U0001F6FF\" # Transport and Map Symbols\n",
" \"\\U0001F700-\\U0001F77F\" # Alchemical Symbols\n",
" \"\\U0001F780-\\U0001F7FF\" # Geometric Shapes Extended\n",
" \"\\U0001F800-\\U0001F8FF\" # Supplemental Arrows-C\n",
" \"\\U0001F900-\\U0001F9FF\" # Supplemental Symbols and Pictographs\n",
" \"\\U0001FA00-\\U0001FA6F\" # Chess Symbols\n",
" \"\\U0001FA70-\\U0001FAFF\" # Symbols and Pictographs Extended-A\n",
" \"\\U00002705\"\n",
" \"]+\", flags=re.UNICODE\n",
" )\n",
" return emoji_pattern.sub(r'', text)\n",
"\n",
"# Function to join lists into a single string and then remove emojis\n",
"def clean_description(val):\n",
" if isinstance(val, list):\n",
" val = ' '.join(val) # Join the list elements into a single string\n",
" return remove_emojis(str(val)) if pd.notnull(val) else val\n",
"\n",
"def process_amazon_reviews(meta_file, output_file):\n",
" # Read the \"meta\" JSONL file and convert it to a DataFrame\n",
" data_meta = []\n",
" with jsonlines.open(meta_file) as reader:\n",
" for obj in reader:\n",
" data_meta.append(obj)\n",
"\n",
" df = pd.DataFrame(data_meta)\n",
"\n",
" # Filter rows where 'description' is empty\n",
" df = df[df['description'].apply(lambda x: len(x) != 0)]\n",
" df = df[df['features'].apply(lambda x: len(x) != 0)]\n",
"\n",
" # Keep only the necessary columns\n",
" df = df[['parent_asin', 'average_rating', 'rating_number', 'main_category', 'title', 'description', 'features']]\n",
"\n",
" # Filter reviews with a rating > 4.5\n",
" df = df[df['average_rating'].astype(float) > 4.5]\n",
"\n",
" # Filter reviews with a rating number > 200\n",
" df = df[df['rating_number'].astype(int) > 200]\n",
"\n",
" # Apply the cleaning function to the 'description' column\n",
" df['title'] = df['title'].apply(clean_description)\n",
" df['description'] = df['description'].apply(clean_description)\n",
" df['features'] = df['features'].apply(clean_description)\n",
"\n",
" # Replace curly apostrophes with straight ones in text fields\n",
" df['title'] = df['title'].str.replace('’', \"'\")\n",
" df['description'] = df['description'].str.replace('’', \"'\")\n",
" df['features'] = df['features'].str.replace('’', \"'\")\n",
"\n",
" # Save the final DataFrame to a CSV file\n",
" df.to_csv(output_file, index=False)\n",
"\n",
" print(len(df))\n",
"\n",
"# Example function call\n",
"process_amazon_reviews('Data/Input/meta_Video_Games.jsonl', 'Data/Output/meta_Video_Games.csv')\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Special case for Clothing dataset. Takes only one object out of every 3\n",
"Very big dataset"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2933\n"
]
}
],
"source": [
"import pandas as pd\n",
"import jsonlines\n",
"import re\n",
"\n",
"\n",
"def remove_emojis(text):\n",
" # Regular expression to detect emojis and other special symbols\n",
" emoji_pattern = re.compile(\n",
" \"[\" \n",
" \"\\U0001F600-\\U0001F64F\" # Emoticons\n",
" \"\\U0001F300-\\U0001F5FF\" # Miscellaneous Symbols and Pictographs\n",
" \"\\U0001F680-\\U0001F6FF\" # Transport and Map Symbols\n",
" \"\\U0001F700-\\U0001F77F\" # Alchemical Symbols\n",
" \"\\U0001F780-\\U0001F7FF\" # Geometric Shapes Extended\n",
" \"\\U0001F800-\\U0001F8FF\" # Supplemental Arrows-C\n",
" \"\\U0001F900-\\U0001F9FF\" # Supplemental Symbols and Pictographs\n",
" \"\\U0001FA00-\\U0001FA6F\" # Symbols and Pictographs Extended-A\n",
" \"\\U0001FA70-\\U0001FAFF\" # Supplemental Objects\n",
" \"\\U00002705\" # Miscellaneous Check Mark\n",
" \"]+\", flags=re.UNICODE\n",
" )\n",
" return emoji_pattern.sub(r'', text)\n",
"\n",
"# Function to join lists into a string and then remove emojis\n",
"def clean_description(val):\n",
" if isinstance(val, list):\n",
" val = ' '.join(val) # Join the elements of the list into a single string\n",
" return remove_emojis(str(val)) if pd.notnull(val) else val\n",
"\n",
"\n",
"def process_amazon_reviews(meta_file, output_file):\n",
" # Read the \"meta\" JSONL file and convert it into a DataFrame\n",
" data_meta = []\n",
" with jsonlines.open(meta_file) as reader:\n",
" for i, obj in enumerate(reader):\n",
" if i % 3 == 0: # Take only one object out of every 3\n",
" data_meta.append(obj)\n",
"\n",
" df = pd.DataFrame(data_meta)\n",
"\n",
" # Filter rows where the description is empty\n",
" df = df[df['description'].apply(lambda x: len(x) != 0)]\n",
" df = df[df['features'].apply(lambda x: len(x) != 0)]\n",
" \n",
" # Keep only the necessary columns\n",
" df = df[['parent_asin', 'average_rating', 'rating_number', 'main_category', 'title', 'description', 'features']]\n",
"\n",
" # Filter reviews with an average rating > 4.5\n",
" df = df[df['average_rating'].astype(float) > 4.5]\n",
"\n",
" # Filter reviews with a rating number > 2000\n",
" df = df[df['rating_number'].astype(int) > 2000]\n",
"\n",
" # Apply the cleaning function to 'title', 'description', and 'features' columns\n",
" df['title'] = df['title'].apply(clean_description)\n",
" df['description'] = df['description'].apply(clean_description)\n",
" df['features'] = df['features'].apply(clean_description)\n",
"\n",
" # Replace special apostrophe characters in 'title', 'description', and 'features'\n",
" df['title'] = df['title'].str.replace('’', \"'\")\n",
" df['description'] = df['description'].str.replace('’', \"'\")\n",
" df['features'] = df['features'].str.replace('’', \"'\")\n",
" \n",
" # Save the final DataFrame as a CSV file\n",
" df.to_csv(output_file, index=False)\n",
"\n",
" print(len(df))\n",
"\n",
"\n",
"# Example call to the function\n",
"process_amazon_reviews('Data/Input/meta/meta_Clothing_Shoes_and_Jewelry.jsonl', 'Data/Output/meta_Clothing_Shoes_and_Jewelry.csv')\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Concatenation of final csv"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"All CSV files have been concatenated into 'Output/concatenated_28657_products_list.csv'\n"
]
}
],
"source": [
"import os\n",
"import pandas as pd\n",
"\n",
"# Path to the folder containing the CSV files\n",
"csv_folder = 'Data/Output/'\n",
"\n",
"# List all files in the folder\n",
"csv_files = [f for f in os.listdir(csv_folder) if f.endswith('.csv')]\n",
"\n",
"# Read and concatenate all the CSV files\n",
"df_list = [pd.read_csv(os.path.join(csv_folder, file)) for file in csv_files]\n",
"final_df = pd.concat(df_list, ignore_index=True)\n",
"\n",
"# Save the final DataFrame into a single CSV file\n",
"final_df.to_csv(f'Data/Output/LLM_Ready/concatenated_{len(final_df)}_products_list.csv', index=False)\n",
"\n",
"# Print message confirming the concatenation\n",
"print(f\"All CSV files have been concatenated into 'Data/Output/LLM_Ready/concatenated_{len(final_df)}_products_list.csv'\")\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "genenv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
10 changes: 6 additions & 4 deletions README.md
@@ -63,7 +63,7 @@ A **presentation** is available as a **PDF** file in the repo `Gift_Recommendati

## Notebooks Overview

-1. **DataPreprocessing.ipynb**:
+1. **create_llm_ready_csv.ipynb**:
- Used to preprocess the Amazon Dataset and extract products with rich text features for the demo.

---
@@ -82,8 +82,11 @@ $ cd Gift-Recommendation-ChatBot
```sh
$ pip install -r requirements.txt
```
+3. Download the Chroma VecStore & upload it to your Dropbox App:
+[Chroma Store Download Link](https://drive.google.com/drive/folders/1zateQgEBTLoUUb5tSdLyBJIF57wermxC?usp=drive_link)

> [!IMPORTANT]
-> Ensure you have the necessary API keys for TogetherAI set up in a `.env` file.
+> Ensure you have the necessary API keys for TogetherAI and Dropbox set up in the `secrets.toml` file.
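> For reference, a minimal `.streamlit/secrets.toml` might look like the sketch below. The key names shown are placeholders for illustration, not confirmed from the app code; check what the app actually reads via `st.secrets`.

```toml
# Hypothetical layout for .streamlit/secrets.toml — key names are assumptions.
TOGETHER_API_KEY = "your-together-ai-api-key"
DROPBOX_ACCESS_TOKEN = "your-dropbox-access-token"
```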
---

@@ -106,15 +109,14 @@ $ streamlit run src/Gift\ Recommendation\ Bot\ 🎁.py
├── 📁__pycache__/
├── 📁chroma_vectorstore/ # Contains the vector store
├── 📁Data/ # Contains the dataset
-├── └── 📓Data_Preprocessing.ipynb
+├── └── 📓create_llm_ready_csv.ipynb
├── 📁src/
│ ├── 🐍Gift Recommendation Bot 🎁.py
│ ├── 📁pages/
│ │ └── 🐍Products Catalogue ⚙️.py
├── 📁tmp/ # Used to store temporary csv file for data embedding
├── 📁.streamlit/
│ └── 🔑secrets.toml # Used to store api Keys for running locally
-├── 📄.env
├── 📄.gitignore
├── 📄README.md
├── 📄requirements.txt
