-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: add notebook to transform products.jsonl into LLM-embedding-ready CSV
- Loading branch information
Showing
1 changed file
with
251 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,251 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"## JSONL Filters and Transformations" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 46, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"4043\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"import pandas as pd\n", | ||
"import jsonlines\n", | ||
"import re\n", | ||
"\n", | ||
# Compiled once at module level: the original recompiled the pattern on every
# call, which is redundant work (re's internal cache mitigates it, but an
# explicit constant is clearer and faster).
_EMOJI_RE = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # Emoticons
    "\U0001F300-\U0001F5FF"  # Miscellaneous Symbols and Pictographs
    "\U0001F680-\U0001F6FF"  # Transport and Map Symbols
    "\U0001F700-\U0001F77F"  # Alchemical Symbols
    "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
    "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
    "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
    "\U0001FA00-\U0001FA6F"  # Chess Symbols
    "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
    "\U00002705"             # White Heavy Check Mark only — a single code point.
                             # NOTE(review): other U+2600-U+27BF symbols pass
                             # through unfiltered; confirm that is intended.
    "]+",
    flags=re.UNICODE,
)

def remove_emojis(text):
    """Return ``text`` with emoji / pictograph code points removed.

    Parameters
    ----------
    text : str
        Input string (callers pre-convert with ``str(val)``).

    Returns
    -------
    str
        ``text`` stripped of every character matched by ``_EMOJI_RE``.
    """
    return _EMOJI_RE.sub('', text)
"\n", | ||
def clean_description(val):
    """Normalise a metadata field to an emoji-free string.

    List values are flattened into one space-joined string first;
    null-like values (per ``pd.notnull``) are returned untouched.
    """
    if isinstance(val, list):
        # Flatten the list elements into a single string.
        val = ' '.join(val)
    if pd.notnull(val):
        return remove_emojis(str(val))
    return val
"\n", | ||
def process_amazon_reviews(meta_file, output_file, min_rating=4.5, min_ratings=200):
    """Convert an Amazon "meta" JSONL file into a cleaned, embedding-ready CSV.

    Keeps products with a non-empty description and feature list, an average
    rating above ``min_rating`` and more than ``min_ratings`` ratings; strips
    emojis and curly apostrophes from the text columns; writes the result to
    ``output_file`` and prints the number of rows kept.

    Parameters
    ----------
    meta_file : str
        Path to the input ``meta_*.jsonl`` file.
    output_file : str
        Path of the CSV file to write.
    min_rating : float, optional
        Exclusive lower bound on ``average_rating`` (default 4.5, the value
        previously hard-coded).
    min_ratings : int, optional
        Exclusive lower bound on ``rating_number`` (default 200, previously
        hard-coded).
    """
    # Read the "meta" JSONL file and convert it to a DataFrame.
    with jsonlines.open(meta_file) as reader:
        df = pd.DataFrame(list(reader))

    # Drop rows whose 'description'/'features' are empty.  The hasattr guard
    # keeps this from raising TypeError when a field is NaN (a float).
    def _nonempty(x):
        return hasattr(x, '__len__') and len(x) != 0

    df = df[df['description'].apply(_nonempty)]
    df = df[df['features'].apply(_nonempty)]

    # Keep only the columns needed downstream.
    df = df[['parent_asin', 'average_rating', 'rating_number',
             'main_category', 'title', 'description', 'features']]

    # Quality / popularity filters.
    df = df[df['average_rating'].astype(float) > min_rating]
    df = df[df['rating_number'].astype(int) > min_ratings]

    # Clean each text column: join lists, strip emojis, normalise apostrophes.
    for col in ('title', 'description', 'features'):
        df[col] = df[col].apply(clean_description)
        df[col] = df[col].str.replace('’', "'")

    # Save the final DataFrame to a CSV file.
    df.to_csv(output_file, index=False)

    print(len(df))

# Example function call
process_amazon_reviews('Data/Input/meta_Video_Games.jsonl', 'Data/Output/meta_Video_Games.csv')
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"### Special case for Clothing dataset. Takes only one object out of every 3\n", | ||
"Very big dataset" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 38, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"2933\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"import pandas as pd\n", | ||
"import jsonlines\n", | ||
"import re\n", | ||
"\n", | ||
"\n", | ||
# NOTE(review): this duplicates the remove_emojis defined in the first cell —
# consider moving the helper to a shared .py module and importing it.
# Compiled once at module level instead of on every call.
_EMOJI_RE = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # Emoticons
    "\U0001F300-\U0001F5FF"  # Miscellaneous Symbols and Pictographs
    "\U0001F680-\U0001F6FF"  # Transport and Map Symbols
    "\U0001F700-\U0001F77F"  # Alchemical Symbols
    "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
    "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
    "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
    "\U0001FA00-\U0001FA6F"  # Chess Symbols (was mislabelled)
    "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A (was mislabelled)
    "\U00002705"             # White Heavy Check Mark (single code point)
    "]+",
    flags=re.UNICODE,
)

def remove_emojis(text):
    """Return ``text`` with emoji / pictograph code points removed.

    Parameters
    ----------
    text : str
        Input string (callers pre-convert with ``str(val)``).

    Returns
    -------
    str
        ``text`` stripped of every character matched by ``_EMOJI_RE``.
    """
    return _EMOJI_RE.sub('', text)
"\n", | ||
def clean_description(val):
    """Flatten list values to one string and strip emojis.

    Null-like values (per ``pd.notnull``) are passed through unchanged.
    """
    text = ' '.join(val) if isinstance(val, list) else val
    return remove_emojis(str(text)) if pd.notnull(text) else text
"\n", | ||
"\n", | ||
def process_amazon_reviews(meta_file, output_file, sample_every=3,
                           min_rating=4.5, min_ratings=2000):
    """Convert a (very large) Amazon "meta" JSONL file into a cleaned CSV.

    Subsamples the input (one record in every ``sample_every``), keeps
    products with a non-empty description and feature list, an average rating
    above ``min_rating`` and more than ``min_ratings`` ratings; strips emojis
    and curly apostrophes from the text columns; writes the result to
    ``output_file`` and prints the number of rows kept.

    Parameters
    ----------
    meta_file : str
        Path to the input ``meta_*.jsonl`` file.
    output_file : str
        Path of the CSV file to write.
    sample_every : int, optional
        Keep one record out of every ``sample_every`` (default 3, the value
        previously hard-coded for the huge Clothing dataset).
    min_rating : float, optional
        Exclusive lower bound on ``average_rating`` (default 4.5).
    min_ratings : int, optional
        Exclusive lower bound on ``rating_number`` (default 2000).
    """
    # Read the "meta" JSONL file, keeping only every sample_every-th record.
    with jsonlines.open(meta_file) as reader:
        records = [obj for i, obj in enumerate(reader) if i % sample_every == 0]

    df = pd.DataFrame(records)

    # Drop rows whose 'description'/'features' are empty.  The hasattr guard
    # keeps this from raising TypeError when a field is NaN (a float).
    def _nonempty(x):
        return hasattr(x, '__len__') and len(x) != 0

    df = df[df['description'].apply(_nonempty)]
    df = df[df['features'].apply(_nonempty)]

    # Keep only the columns needed downstream.
    df = df[['parent_asin', 'average_rating', 'rating_number',
             'main_category', 'title', 'description', 'features']]

    # Quality / popularity filters.
    df = df[df['average_rating'].astype(float) > min_rating]
    df = df[df['rating_number'].astype(int) > min_ratings]

    # Clean each text column: join lists, strip emojis, normalise apostrophes.
    for col in ('title', 'description', 'features'):
        df[col] = df[col].apply(clean_description)
        df[col] = df[col].str.replace('’', "'")

    # Save the final DataFrame as a CSV file.
    df.to_csv(output_file, index=False)

    print(len(df))


# Example call to the function
process_amazon_reviews('Data/Input/meta/meta_Clothing_Shoes_and_Jewelry.jsonl', 'Data/Output/meta_Clothing_Shoes_and_Jewelry.csv')
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"## Concatenation of final csv" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 50, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"All CSV files have been concatenated into 'Output/concatenated_28657_products_list.csv'\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
import os
import pandas as pd

# Folder containing the per-category CSV files produced by the cells above.
csv_folder = 'Data/Output/'
# Output directory for the final, embedding-ready file.
output_dir = 'Data/Output/LLM_Ready'

# List all CSV files in the folder (subdirectories are excluded by suffix).
csv_files = [f for f in os.listdir(csv_folder) if f.endswith('.csv')]

# Fail with a clear message when there is nothing to concatenate:
# pd.concat([]) would otherwise raise an unhelpful ValueError.
if not csv_files:
    raise FileNotFoundError(f"No CSV files found in {csv_folder!r}")

# Read and concatenate all the CSV files.
df_list = [pd.read_csv(os.path.join(csv_folder, f)) for f in csv_files]
final_df = pd.concat(df_list, ignore_index=True)

# Make sure the output directory exists, then write the result.  Building the
# path once keeps the saved file and the printed message in sync (they had
# already drifted apart in an earlier run of this cell).
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(
    output_dir, f'concatenated_{len(final_df)}_products_list.csv'
)
final_df.to_csv(output_path, index=False)

# Print message confirming the concatenation.
print(f"All CSV files have been concatenated into '{output_path}'")
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "genenv", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.12.7" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |