Skip to content

Commit

Permalink
feat : add notebook to transform products.jsonl into LLM embedding re…
Browse files Browse the repository at this point in the history
…ady csv
  • Loading branch information
jcrigoni committed Oct 14, 2024
1 parent 01cb7e2 commit c26432c
Showing 1 changed file with 251 additions and 0 deletions.
251 changes: 251 additions & 0 deletions EDA/create_llm_ready_csv.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,251 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## JSONL Filters and Transformations"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"4043\n"
]
}
],
"source": [
"import pandas as pd\n",
"import jsonlines\n",
"import re\n",
"\n",
"def remove_emojis(text):\n",
"    \"\"\"Return ``text`` with emoji and pictographic symbols removed.\n",
"\n",
"    Every run of characters that falls in one of the Unicode ranges listed\n",
"    below is deleted; all other characters are left untouched.\n",
"    \"\"\"\n",
"    # Ranges treated as emoji; they are spliced into a single regex character class.\n",
"    emoji_ranges = (\n",
"        \"\\U0001F600-\\U0001F64F\"  # Emoticons\n",
"        \"\\U0001F300-\\U0001F5FF\"  # Miscellaneous Symbols and Pictographs\n",
"        \"\\U0001F680-\\U0001F6FF\"  # Transport and Map Symbols\n",
"        \"\\U0001F700-\\U0001F77F\"  # Alchemical Symbols\n",
"        \"\\U0001F780-\\U0001F7FF\"  # Geometric Shapes Extended\n",
"        \"\\U0001F800-\\U0001F8FF\"  # Supplemental Arrows-C\n",
"        \"\\U0001F900-\\U0001F9FF\"  # Supplemental Symbols and Pictographs\n",
"        \"\\U0001FA00-\\U0001FA6F\"  # Chess Symbols\n",
"        \"\\U0001FA70-\\U0001FAFF\"  # Symbols and Pictographs Extended-A\n",
"        \"\\U00002705\"  # White Heavy Check Mark\n",
"    )\n",
"    return re.sub(\"[\" + emoji_ranges + \"]+\", \"\", text, flags=re.UNICODE)\n",
"\n",
"# Function to join lists into a single string and then remove emojis\n",
"def clean_description(val):\n",
"    \"\"\"Flatten a list-valued field into one string and strip emojis from it.\n",
"\n",
"    Lists are joined with single spaces. ``None`` entries inside a list are\n",
"    skipped and non-string entries are converted with ``str``, so a malformed\n",
"    record cannot raise ``TypeError`` in ``str.join``. Missing scalar values\n",
"    (``None``/``NaN``) are returned unchanged.\n",
"    \"\"\"\n",
"    if isinstance(val, list):\n",
"        val = ' '.join(str(item) for item in val if item is not None)\n",
"    return remove_emojis(str(val)) if pd.notnull(val) else val\n",
"\n",
"def process_amazon_reviews(meta_file, output_file, min_rating=4.5, min_rating_count=200, sample_every=1):\n",
"    \"\"\"Filter a product-metadata JSONL file and save the cleaned rows as CSV.\n",
"\n",
"    Keeps products that have a non-empty description and feature list, an\n",
"    average rating above ``min_rating`` and more than ``min_rating_count``\n",
"    ratings. Rows whose description, features or rating values are missing\n",
"    or non-numeric are dropped instead of raising. Emojis are then stripped\n",
"    and curly apostrophes normalised in the text columns, and the number of\n",
"    rows written is printed.\n",
"\n",
"    Args:\n",
"        meta_file: Path to the input meta JSONL file.\n",
"        output_file: Path of the CSV file to write.\n",
"        min_rating: Exclusive lower bound on ``average_rating``.\n",
"        min_rating_count: Exclusive lower bound on ``rating_number``.\n",
"        sample_every: Keep one record out of every ``sample_every`` lines\n",
"            (1 keeps all of them).\n",
"\n",
"    Raises:\n",
"        ValueError: If ``sample_every`` is smaller than 1.\n",
"    \"\"\"\n",
"    import os  # local import: only needed to create the output folder\n",
"\n",
"    if sample_every < 1:\n",
"        raise ValueError('sample_every must be >= 1')\n",
"\n",
"    columns = ['parent_asin', 'average_rating', 'rating_number', 'main_category', 'title', 'description', 'features']\n",
"    text_columns = ['title', 'description', 'features']\n",
"\n",
"    def _has_content(value):\n",
"        # Missing values (None/NaN) have no length and count as empty\n",
"        return hasattr(value, '__len__') and len(value) != 0\n",
"\n",
"    def _clean_text(value):\n",
"        # Strip emojis, then replace curly apostrophes with straight ones\n",
"        cleaned = clean_description(value)\n",
"        return cleaned.replace('’', \"'\") if isinstance(cleaned, str) else cleaned\n",
"\n",
"    # Read the JSONL file, keeping one record out of every `sample_every`\n",
"    with jsonlines.open(meta_file) as reader:\n",
"        records = [obj for i, obj in enumerate(reader) if i % sample_every == 0]\n",
"\n",
"    # columns= keeps only the needed fields and still yields a well-formed\n",
"    # (empty) frame when no record was read\n",
"    df = pd.DataFrame(records, columns=columns)\n",
"\n",
"    # to_numeric(..., errors='coerce') turns missing/invalid ratings into NaN,\n",
"    # and NaN compares False, so those rows are dropped instead of crashing\n",
"    keep = (\n",
"        df['description'].map(_has_content).astype(bool)\n",
"        & df['features'].map(_has_content).astype(bool)\n",
"        & (pd.to_numeric(df['average_rating'], errors='coerce') > min_rating)\n",
"        & (pd.to_numeric(df['rating_number'], errors='coerce') > min_rating_count)\n",
"    )\n",
"    # .copy() so the column assignments below do not write into a view\n",
"    df = df[keep].copy()\n",
"\n",
"    for col in text_columns:\n",
"        df[col] = df[col].apply(_clean_text)\n",
"\n",
"    # Create the destination folder if needed, then save the final DataFrame\n",
"    out_dir = os.path.dirname(output_file)\n",
"    if out_dir:\n",
"        os.makedirs(out_dir, exist_ok=True)\n",
"    df.to_csv(output_file, index=False)\n",
"\n",
"    print(len(df))\n",
"\n",
"# Run the pipeline on the Video Games metadata file\n",
"process_amazon_reviews(\n",
"    meta_file='Data/Input/meta_Video_Games.jsonl',\n",
"    output_file='Data/Output/meta_Video_Games.csv',\n",
")\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Special case for the Clothing dataset\n",
"This dataset is very large, so only one object out of every 3 is loaded."
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2933\n"
]
}
],
"source": [
"import pandas as pd\n",
"import jsonlines\n",
"import re\n",
"\n",
"\n",
"def remove_emojis(text):\n",
"    \"\"\"Return ``text`` with emoji and pictographic symbols removed.\n",
"\n",
"    Every run of characters that falls in one of the Unicode ranges listed\n",
"    below is deleted; all other characters are left untouched.\n",
"    \"\"\"\n",
"    # Ranges treated as emoji; they are spliced into a single regex character class.\n",
"    emoji_ranges = (\n",
"        \"\\U0001F600-\\U0001F64F\"  # Emoticons\n",
"        \"\\U0001F300-\\U0001F5FF\"  # Miscellaneous Symbols and Pictographs\n",
"        \"\\U0001F680-\\U0001F6FF\"  # Transport and Map Symbols\n",
"        \"\\U0001F700-\\U0001F77F\"  # Alchemical Symbols\n",
"        \"\\U0001F780-\\U0001F7FF\"  # Geometric Shapes Extended\n",
"        \"\\U0001F800-\\U0001F8FF\"  # Supplemental Arrows-C\n",
"        \"\\U0001F900-\\U0001F9FF\"  # Supplemental Symbols and Pictographs\n",
"        \"\\U0001FA00-\\U0001FA6F\"  # Chess Symbols\n",
"        \"\\U0001FA70-\\U0001FAFF\"  # Symbols and Pictographs Extended-A\n",
"        \"\\U00002705\"  # White Heavy Check Mark\n",
"    )\n",
"    return re.sub(\"[\" + emoji_ranges + \"]+\", \"\", text, flags=re.UNICODE)\n",
"\n",
"# Function to join lists into a string and then remove emojis\n",
"def clean_description(val):\n",
"    \"\"\"Flatten a list-valued field into one string and strip emojis from it.\n",
"\n",
"    Lists are joined with single spaces. ``None`` entries inside a list are\n",
"    skipped and non-string entries are converted with ``str``, so a malformed\n",
"    record cannot raise ``TypeError`` in ``str.join``. Missing scalar values\n",
"    (``None``/``NaN``) are returned unchanged.\n",
"    \"\"\"\n",
"    if isinstance(val, list):\n",
"        val = ' '.join(str(item) for item in val if item is not None)\n",
"    return remove_emojis(str(val)) if pd.notnull(val) else val\n",
"\n",
"\n",
"def process_amazon_reviews(meta_file, output_file, min_rating=4.5, min_rating_count=2000, sample_every=3):\n",
"    \"\"\"Filter a sampled product-metadata JSONL file and save the rows as CSV.\n",
"\n",
"    Variant for very large datasets: by default only one record out of every\n",
"    three is loaded. Keeps products that have a non-empty description and\n",
"    feature list, an average rating above ``min_rating`` and more than\n",
"    ``min_rating_count`` ratings. Rows whose description, features or rating\n",
"    values are missing or non-numeric are dropped instead of raising. Emojis\n",
"    are then stripped and curly apostrophes normalised in the text columns,\n",
"    and the number of rows written is printed.\n",
"\n",
"    Args:\n",
"        meta_file: Path to the input meta JSONL file.\n",
"        output_file: Path of the CSV file to write.\n",
"        min_rating: Exclusive lower bound on ``average_rating``.\n",
"        min_rating_count: Exclusive lower bound on ``rating_number``.\n",
"        sample_every: Keep one record out of every ``sample_every`` lines\n",
"            (1 keeps all of them).\n",
"\n",
"    Raises:\n",
"        ValueError: If ``sample_every`` is smaller than 1.\n",
"    \"\"\"\n",
"    import os  # local import: only needed to create the output folder\n",
"\n",
"    if sample_every < 1:\n",
"        raise ValueError('sample_every must be >= 1')\n",
"\n",
"    columns = ['parent_asin', 'average_rating', 'rating_number', 'main_category', 'title', 'description', 'features']\n",
"    text_columns = ['title', 'description', 'features']\n",
"\n",
"    def _has_content(value):\n",
"        # Missing values (None/NaN) have no length and count as empty\n",
"        return hasattr(value, '__len__') and len(value) != 0\n",
"\n",
"    def _clean_text(value):\n",
"        # Strip emojis, then replace curly apostrophes with straight ones\n",
"        cleaned = clean_description(value)\n",
"        return cleaned.replace('’', \"'\") if isinstance(cleaned, str) else cleaned\n",
"\n",
"    # Read the JSONL file, keeping one record out of every `sample_every`\n",
"    with jsonlines.open(meta_file) as reader:\n",
"        records = [obj for i, obj in enumerate(reader) if i % sample_every == 0]\n",
"\n",
"    # columns= keeps only the needed fields (important for a dataset this\n",
"    # large) and still yields a well-formed (empty) frame with no records\n",
"    df = pd.DataFrame(records, columns=columns)\n",
"\n",
"    # to_numeric(..., errors='coerce') turns missing/invalid ratings into NaN,\n",
"    # and NaN compares False, so those rows are dropped instead of crashing\n",
"    keep = (\n",
"        df['description'].map(_has_content).astype(bool)\n",
"        & df['features'].map(_has_content).astype(bool)\n",
"        & (pd.to_numeric(df['average_rating'], errors='coerce') > min_rating)\n",
"        & (pd.to_numeric(df['rating_number'], errors='coerce') > min_rating_count)\n",
"    )\n",
"    # .copy() so the column assignments below do not write into a view\n",
"    df = df[keep].copy()\n",
"\n",
"    for col in text_columns:\n",
"        df[col] = df[col].apply(_clean_text)\n",
"\n",
"    # Create the destination folder if needed, then save the final DataFrame\n",
"    out_dir = os.path.dirname(output_file)\n",
"    if out_dir:\n",
"        os.makedirs(out_dir, exist_ok=True)\n",
"    df.to_csv(output_file, index=False)\n",
"\n",
"    print(len(df))\n",
"\n",
"\n",
"# Run the pipeline on the Clothing, Shoes and Jewelry metadata file\n",
"process_amazon_reviews(\n",
"    meta_file='Data/Input/meta/meta_Clothing_Shoes_and_Jewelry.jsonl',\n",
"    output_file='Data/Output/meta_Clothing_Shoes_and_Jewelry.csv',\n",
")\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Concatenation of the final CSV files"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"All CSV files have been concatenated into 'Output/concatenated_28657_products_list.csv'\n"
]
}
],
"source": [
"import os\n",
"import pandas as pd\n",
"\n",
"# Folder containing the per-category CSV files, and the folder for the merged file\n",
"csv_folder = 'Data/Output/'\n",
"output_folder = os.path.join(csv_folder, 'LLM_Ready')\n",
"\n",
"# os.listdir returns names in arbitrary order; sort them so the row order of the\n",
"# merged file is reproducible from one run to the next\n",
"csv_files = sorted(f for f in os.listdir(csv_folder) if f.endswith('.csv'))\n",
"if not csv_files:\n",
"    raise FileNotFoundError(f\"No CSV files found in '{csv_folder}'\")\n",
"\n",
"# Read and concatenate all the CSV files\n",
"df_list = [pd.read_csv(os.path.join(csv_folder, file)) for file in csv_files]\n",
"final_df = pd.concat(df_list, ignore_index=True)\n",
"\n",
"# Create the destination folder on first run, then save the merged DataFrame\n",
"os.makedirs(output_folder, exist_ok=True)\n",
"output_path = f'Data/Output/LLM_Ready/concatenated_{len(final_df)}_products_list.csv'\n",
"final_df.to_csv(output_path, index=False)\n",
"\n",
"# Print message confirming the concatenation\n",
"print(f\"All CSV files have been concatenated into '{output_path}'\")\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "genenv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

0 comments on commit c26432c

Please sign in to comment.