diff --git a/milestoneP3.ipynb b/milestoneP3.ipynb index 8ff081b..846d24a 100644 --- a/milestoneP3.ipynb +++ b/milestoneP3.ipynb @@ -12,7 +12,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 7, "id": "746388e8963208b1", "metadata": { "ExecuteTime": { @@ -34,8 +34,7 @@ ] }, { - "cell_type": "code", - "execution_count": 2, + "cell_type": "markdown", "id": "9b5fb4d5f7daba7a", "metadata": { "ExecuteTime": { @@ -43,9 +42,18 @@ "start_time": "2024-12-19T08:02:02.467117Z" } }, - "outputs": [], "source": [ - "# TODO: Make the table of content with the hyperlinks" + "\n", + "[Data pre-processing](#data-pre-processing)\n", + "\n", + "\n", + "[Season Trends](#seasonal-trend-analysis)\n", + "\n", + "\n", + "[Sentiment Analysis](#sentiment-analysis)\n", + "\n", + "\n", + "[Semantic Analysis](#semantic-analysis)" ] }, { @@ -68,12 +76,14 @@ "id": "e3a9526c840c3b4a", "metadata": {}, "source": [ + "<a id=\"data-pre-processing\"></a>\n", + "\n", "## Data pre-processing \n" ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 8, "id": "initial_id", "metadata": { "ExecuteTime": { @@ -82,7 +92,19 @@ }, "collapsed": true }, - "outputs": [], + "outputs": [ + { + "ename": "ModuleNotFoundError", + "evalue": "No module named 'geopandas'", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[8], line 20\u001b[0m\n\u001b[0;32m 18\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtqdm\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m tqdm\n\u001b[0;32m 19\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mconcurrent\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mfutures\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m ThreadPoolExecutor\n\u001b[1;32m---> 20\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m 
\u001b[38;5;21;01mgeopandas\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mgpd\u001b[39;00m\n\u001b[0;32m 23\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mplotly\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mio\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mpio\u001b[39;00m\n\u001b[0;32m 24\u001b[0m pio\u001b[38;5;241m.\u001b[39mrenderers\u001b[38;5;241m.\u001b[39mdefault \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mbrowser\u001b[39m\u001b[38;5;124m\"\u001b[39m\n", + "\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'geopandas'" + ] + } + ], "source": [ "import pandas as pd\n", "import tarfile\n", @@ -189,6 +211,8 @@ "id": "331f5bf59b62323f", "metadata": {}, "source": [ + "<a id=\"seasonal-trend-analysis\"></a>\n", + "\n", "## Seasonal trend analysis\n" ] }, @@ -824,6 +848,7 @@ "id": "b93c1543", "metadata": {}, "source": [ + "<a id=\"sentiment-analysis\"></a>\n", "## Sentiment analysis\n", "\n", "In this section we look at the words used in the review to try to determine the ones that are associated to positive reviews and the ones associated to negative ones.\n", @@ -888,19 +913,19 @@ "evalue": "", "output_type": "error", "traceback": [ - "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m", - "\u001B[0;31mKeyboardInterrupt\u001B[0m Traceback (most recent call last)", - "Cell \u001B[0;32mIn[5], line 2\u001B[0m\n\u001B[1;32m 1\u001B[0m \u001B[38;5;66;03m# We begin by cleaning the reviews \u001B[39;00m\n\u001B[0;32m----> 2\u001B[0m reviews_clean \u001B[38;5;241m=\u001B[39m sent_ana\u001B[38;5;241m.\u001B[39mclean_data(reviews)\n\u001B[1;32m 4\u001B[0m \u001B[38;5;66;03m# Separate the reviews into two groups: those with a rating higher than 4 and those with a rating lower than 3\u001B[39;00m\n\u001B[1;32m 5\u001B[0m reviews_high \u001B[38;5;241m=\u001B[39m 
reviews_clean[reviews_clean[\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mrating\u001B[39m\u001B[38;5;124m'\u001B[39m] \u001B[38;5;241m>\u001B[39m \u001B[38;5;241m4\u001B[39m]\n", - "File \u001B[0;32m~/Documents/epfl/Master/MA3/ADA/ada-2024-project-databrewers/src/scripts/sentiment_analysis.py:35\u001B[0m, in \u001B[0;36mclean_data\u001B[0;34m(reviews)\u001B[0m\n\u001B[1;32m 33\u001B[0m reviews_clean \u001B[38;5;241m=\u001B[39m reviews[reviews[\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mtext\u001B[39m\u001B[38;5;124m'\u001B[39m]\u001B[38;5;241m.\u001B[39mapply(\u001B[38;5;28;01mlambda\u001B[39;00m x: \u001B[38;5;28misinstance\u001B[39m(x, \u001B[38;5;28mstr\u001B[39m))]\n\u001B[1;32m 34\u001B[0m \u001B[38;5;66;03m# Now apply the preprocess_text function\u001B[39;00m\n\u001B[0;32m---> 35\u001B[0m reviews_clean[\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mcleaned_tokens\u001B[39m\u001B[38;5;124m'\u001B[39m] \u001B[38;5;241m=\u001B[39m reviews_clean[\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mtext\u001B[39m\u001B[38;5;124m'\u001B[39m]\u001B[38;5;241m.\u001B[39mapply(preprocess_text)\n\u001B[1;32m 37\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m reviews_clean\n", - "File \u001B[0;32m/opt/anaconda3/lib/python3.12/site-packages/pandas/core/series.py:4924\u001B[0m, in \u001B[0;36mSeries.apply\u001B[0;34m(self, func, convert_dtype, args, by_row, **kwargs)\u001B[0m\n\u001B[1;32m 4789\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21mapply\u001B[39m(\n\u001B[1;32m 4790\u001B[0m \u001B[38;5;28mself\u001B[39m,\n\u001B[1;32m 4791\u001B[0m func: AggFuncType,\n\u001B[0;32m (...)\u001B[0m\n\u001B[1;32m 4796\u001B[0m \u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39mkwargs,\n\u001B[1;32m 4797\u001B[0m ) \u001B[38;5;241m-\u001B[39m\u001B[38;5;241m>\u001B[39m DataFrame \u001B[38;5;241m|\u001B[39m Series:\n\u001B[1;32m 4798\u001B[0m \u001B[38;5;250m \u001B[39m\u001B[38;5;124;03m\"\"\"\u001B[39;00m\n\u001B[1;32m 4799\u001B[0m \u001B[38;5;124;03m Invoke function on values of 
Series.\u001B[39;00m\n\u001B[1;32m 4800\u001B[0m \n\u001B[0;32m (...)\u001B[0m\n\u001B[1;32m 4915\u001B[0m \u001B[38;5;124;03m dtype: float64\u001B[39;00m\n\u001B[1;32m 4916\u001B[0m \u001B[38;5;124;03m \"\"\"\u001B[39;00m\n\u001B[1;32m 4917\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m SeriesApply(\n\u001B[1;32m 4918\u001B[0m \u001B[38;5;28mself\u001B[39m,\n\u001B[1;32m 4919\u001B[0m func,\n\u001B[1;32m 4920\u001B[0m convert_dtype\u001B[38;5;241m=\u001B[39mconvert_dtype,\n\u001B[1;32m 4921\u001B[0m by_row\u001B[38;5;241m=\u001B[39mby_row,\n\u001B[1;32m 4922\u001B[0m args\u001B[38;5;241m=\u001B[39margs,\n\u001B[1;32m 4923\u001B[0m kwargs\u001B[38;5;241m=\u001B[39mkwargs,\n\u001B[0;32m-> 4924\u001B[0m )\u001B[38;5;241m.\u001B[39mapply()\n", - "File \u001B[0;32m/opt/anaconda3/lib/python3.12/site-packages/pandas/core/apply.py:1427\u001B[0m, in \u001B[0;36mSeriesApply.apply\u001B[0;34m(self)\u001B[0m\n\u001B[1;32m 1424\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mapply_compat()\n\u001B[1;32m 1426\u001B[0m \u001B[38;5;66;03m# self.func is Callable\u001B[39;00m\n\u001B[0;32m-> 1427\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mapply_standard()\n", - "File \u001B[0;32m/opt/anaconda3/lib/python3.12/site-packages/pandas/core/apply.py:1507\u001B[0m, in \u001B[0;36mSeriesApply.apply_standard\u001B[0;34m(self)\u001B[0m\n\u001B[1;32m 1501\u001B[0m \u001B[38;5;66;03m# row-wise access\u001B[39;00m\n\u001B[1;32m 1502\u001B[0m \u001B[38;5;66;03m# apply doesn't have a `na_action` keyword and for backward compat reasons\u001B[39;00m\n\u001B[1;32m 1503\u001B[0m \u001B[38;5;66;03m# we need to give `na_action=\"ignore\"` for categorical data.\u001B[39;00m\n\u001B[1;32m 1504\u001B[0m \u001B[38;5;66;03m# TODO: remove the `na_action=\"ignore\"` when that default has been changed in\u001B[39;00m\n\u001B[1;32m 1505\u001B[0m \u001B[38;5;66;03m# Categorical 
(GH51645).\u001B[39;00m\n\u001B[1;32m 1506\u001B[0m action \u001B[38;5;241m=\u001B[39m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mignore\u001B[39m\u001B[38;5;124m\"\u001B[39m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28misinstance\u001B[39m(obj\u001B[38;5;241m.\u001B[39mdtype, CategoricalDtype) \u001B[38;5;28;01melse\u001B[39;00m \u001B[38;5;28;01mNone\u001B[39;00m\n\u001B[0;32m-> 1507\u001B[0m mapped \u001B[38;5;241m=\u001B[39m obj\u001B[38;5;241m.\u001B[39m_map_values(\n\u001B[1;32m 1508\u001B[0m mapper\u001B[38;5;241m=\u001B[39mcurried, na_action\u001B[38;5;241m=\u001B[39maction, convert\u001B[38;5;241m=\u001B[39m\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mconvert_dtype\n\u001B[1;32m 1509\u001B[0m )\n\u001B[1;32m 1511\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mlen\u001B[39m(mapped) \u001B[38;5;129;01mand\u001B[39;00m \u001B[38;5;28misinstance\u001B[39m(mapped[\u001B[38;5;241m0\u001B[39m], ABCSeries):\n\u001B[1;32m 1512\u001B[0m \u001B[38;5;66;03m# GH#43986 Need to do list(mapped) in order to get treated as nested\u001B[39;00m\n\u001B[1;32m 1513\u001B[0m \u001B[38;5;66;03m# See also GH#25959 regarding EA support\u001B[39;00m\n\u001B[1;32m 1514\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m obj\u001B[38;5;241m.\u001B[39m_constructor_expanddim(\u001B[38;5;28mlist\u001B[39m(mapped), index\u001B[38;5;241m=\u001B[39mobj\u001B[38;5;241m.\u001B[39mindex)\n", - "File \u001B[0;32m/opt/anaconda3/lib/python3.12/site-packages/pandas/core/base.py:921\u001B[0m, in \u001B[0;36mIndexOpsMixin._map_values\u001B[0;34m(self, mapper, na_action, convert)\u001B[0m\n\u001B[1;32m 918\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28misinstance\u001B[39m(arr, ExtensionArray):\n\u001B[1;32m 919\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m arr\u001B[38;5;241m.\u001B[39mmap(mapper, na_action\u001B[38;5;241m=\u001B[39mna_action)\n\u001B[0;32m--> 921\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m algorithms\u001B[38;5;241m.\u001B[39mmap_array(arr, 
mapper, na_action\u001B[38;5;241m=\u001B[39mna_action, convert\u001B[38;5;241m=\u001B[39mconvert)\n", - "File \u001B[0;32m/opt/anaconda3/lib/python3.12/site-packages/pandas/core/algorithms.py:1743\u001B[0m, in \u001B[0;36mmap_array\u001B[0;34m(arr, mapper, na_action, convert)\u001B[0m\n\u001B[1;32m 1741\u001B[0m values \u001B[38;5;241m=\u001B[39m arr\u001B[38;5;241m.\u001B[39mastype(\u001B[38;5;28mobject\u001B[39m, copy\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mFalse\u001B[39;00m)\n\u001B[1;32m 1742\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m na_action \u001B[38;5;129;01mis\u001B[39;00m \u001B[38;5;28;01mNone\u001B[39;00m:\n\u001B[0;32m-> 1743\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m lib\u001B[38;5;241m.\u001B[39mmap_infer(values, mapper, convert\u001B[38;5;241m=\u001B[39mconvert)\n\u001B[1;32m 1744\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n\u001B[1;32m 1745\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m lib\u001B[38;5;241m.\u001B[39mmap_infer_mask(\n\u001B[1;32m 1746\u001B[0m values, mapper, mask\u001B[38;5;241m=\u001B[39misna(values)\u001B[38;5;241m.\u001B[39mview(np\u001B[38;5;241m.\u001B[39muint8), convert\u001B[38;5;241m=\u001B[39mconvert\n\u001B[1;32m 1747\u001B[0m )\n", - "File \u001B[0;32mlib.pyx:2972\u001B[0m, in \u001B[0;36mpandas._libs.lib.map_infer\u001B[0;34m()\u001B[0m\n", - "File \u001B[0;32m~/Documents/epfl/Master/MA3/ADA/ada-2024-project-databrewers/src/scripts/sentiment_analysis.py:18\u001B[0m, in \u001B[0;36mpreprocess_text\u001B[0;34m(text)\u001B[0m\n\u001B[1;32m 17\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21mpreprocess_text\u001B[39m(text):\n\u001B[0;32m---> 18\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m re\u001B[38;5;241m.\u001B[39mfindall(\u001B[38;5;124mr\u001B[39m\u001B[38;5;124m'\u001B[39m\u001B[38;5;124m\\\u001B[39m\u001B[38;5;124mb\u001B[39m\u001B[38;5;124m\\\u001B[39m\u001B[38;5;124mw+\u001B[39m\u001B[38;5;124m\\\u001B[39m\u001B[38;5;124mb\u001B[39m\u001B[38;5;124m'\u001B[39m, 
text\u001B[38;5;241m.\u001B[39mlower())\n", - "File \u001B[0;32m/opt/anaconda3/lib/python3.12/re/__init__.py:217\u001B[0m, in \u001B[0;36mfindall\u001B[0;34m(pattern, string, flags)\u001B[0m\n\u001B[1;32m 209\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21mfindall\u001B[39m(pattern, string, flags\u001B[38;5;241m=\u001B[39m\u001B[38;5;241m0\u001B[39m):\n\u001B[1;32m 210\u001B[0m \u001B[38;5;250m \u001B[39m\u001B[38;5;124;03m\"\"\"Return a list of all non-overlapping matches in the string.\u001B[39;00m\n\u001B[1;32m 211\u001B[0m \n\u001B[1;32m 212\u001B[0m \u001B[38;5;124;03m If one or more capturing groups are present in the pattern, return\u001B[39;00m\n\u001B[0;32m (...)\u001B[0m\n\u001B[1;32m 215\u001B[0m \n\u001B[1;32m 216\u001B[0m \u001B[38;5;124;03m Empty matches are included in the result.\"\"\"\u001B[39;00m\n\u001B[0;32m--> 217\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m _compile(pattern, flags)\u001B[38;5;241m.\u001B[39mfindall(string)\n", - "\u001B[0;31mKeyboardInterrupt\u001B[0m: " + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[5], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# We begin by cleaning the reviews \u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m reviews_clean \u001b[38;5;241m=\u001b[39m sent_ana\u001b[38;5;241m.\u001b[39mclean_data(reviews)\n\u001b[1;32m 4\u001b[0m \u001b[38;5;66;03m# Separate the reviews into two groups: those with a rating higher than 4 and those with a rating lower than 3\u001b[39;00m\n\u001b[1;32m 5\u001b[0m reviews_high \u001b[38;5;241m=\u001b[39m reviews_clean[reviews_clean[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mrating\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m4\u001b[39m]\n", + "File 
\u001b[0;32m~/Documents/epfl/Master/MA3/ADA/ada-2024-project-databrewers/src/scripts/sentiment_analysis.py:35\u001b[0m, in \u001b[0;36mclean_data\u001b[0;34m(reviews)\u001b[0m\n\u001b[1;32m 33\u001b[0m reviews_clean \u001b[38;5;241m=\u001b[39m reviews[reviews[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtext\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mapply(\u001b[38;5;28;01mlambda\u001b[39;00m x: \u001b[38;5;28misinstance\u001b[39m(x, \u001b[38;5;28mstr\u001b[39m))]\n\u001b[1;32m 34\u001b[0m \u001b[38;5;66;03m# Now apply the preprocess_text function\u001b[39;00m\n\u001b[0;32m---> 35\u001b[0m reviews_clean[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mcleaned_tokens\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m reviews_clean[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtext\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mapply(preprocess_text)\n\u001b[1;32m 37\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m reviews_clean\n", + "File \u001b[0;32m/opt/anaconda3/lib/python3.12/site-packages/pandas/core/series.py:4924\u001b[0m, in \u001b[0;36mSeries.apply\u001b[0;34m(self, func, convert_dtype, args, by_row, **kwargs)\u001b[0m\n\u001b[1;32m 4789\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mapply\u001b[39m(\n\u001b[1;32m 4790\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 4791\u001b[0m func: AggFuncType,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 4796\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[1;32m 4797\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m DataFrame \u001b[38;5;241m|\u001b[39m Series:\n\u001b[1;32m 4798\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 4799\u001b[0m \u001b[38;5;124;03m Invoke function on values of Series.\u001b[39;00m\n\u001b[1;32m 4800\u001b[0m \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 4915\u001b[0m \u001b[38;5;124;03m dtype: float64\u001b[39;00m\n\u001b[1;32m 4916\u001b[0m 
\u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[1;32m 4917\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m SeriesApply(\n\u001b[1;32m 4918\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 4919\u001b[0m func,\n\u001b[1;32m 4920\u001b[0m convert_dtype\u001b[38;5;241m=\u001b[39mconvert_dtype,\n\u001b[1;32m 4921\u001b[0m by_row\u001b[38;5;241m=\u001b[39mby_row,\n\u001b[1;32m 4922\u001b[0m args\u001b[38;5;241m=\u001b[39margs,\n\u001b[1;32m 4923\u001b[0m kwargs\u001b[38;5;241m=\u001b[39mkwargs,\n\u001b[0;32m-> 4924\u001b[0m )\u001b[38;5;241m.\u001b[39mapply()\n", + "File \u001b[0;32m/opt/anaconda3/lib/python3.12/site-packages/pandas/core/apply.py:1427\u001b[0m, in \u001b[0;36mSeriesApply.apply\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1424\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mapply_compat()\n\u001b[1;32m 1426\u001b[0m \u001b[38;5;66;03m# self.func is Callable\u001b[39;00m\n\u001b[0;32m-> 1427\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mapply_standard()\n", + "File \u001b[0;32m/opt/anaconda3/lib/python3.12/site-packages/pandas/core/apply.py:1507\u001b[0m, in \u001b[0;36mSeriesApply.apply_standard\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1501\u001b[0m \u001b[38;5;66;03m# row-wise access\u001b[39;00m\n\u001b[1;32m 1502\u001b[0m \u001b[38;5;66;03m# apply doesn't have a `na_action` keyword and for backward compat reasons\u001b[39;00m\n\u001b[1;32m 1503\u001b[0m \u001b[38;5;66;03m# we need to give `na_action=\"ignore\"` for categorical data.\u001b[39;00m\n\u001b[1;32m 1504\u001b[0m \u001b[38;5;66;03m# TODO: remove the `na_action=\"ignore\"` when that default has been changed in\u001b[39;00m\n\u001b[1;32m 1505\u001b[0m \u001b[38;5;66;03m# Categorical (GH51645).\u001b[39;00m\n\u001b[1;32m 1506\u001b[0m action \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mignore\u001b[39m\u001b[38;5;124m\"\u001b[39m 
\u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(obj\u001b[38;5;241m.\u001b[39mdtype, CategoricalDtype) \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m-> 1507\u001b[0m mapped \u001b[38;5;241m=\u001b[39m obj\u001b[38;5;241m.\u001b[39m_map_values(\n\u001b[1;32m 1508\u001b[0m mapper\u001b[38;5;241m=\u001b[39mcurried, na_action\u001b[38;5;241m=\u001b[39maction, convert\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconvert_dtype\n\u001b[1;32m 1509\u001b[0m )\n\u001b[1;32m 1511\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(mapped) \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(mapped[\u001b[38;5;241m0\u001b[39m], ABCSeries):\n\u001b[1;32m 1512\u001b[0m \u001b[38;5;66;03m# GH#43986 Need to do list(mapped) in order to get treated as nested\u001b[39;00m\n\u001b[1;32m 1513\u001b[0m \u001b[38;5;66;03m# See also GH#25959 regarding EA support\u001b[39;00m\n\u001b[1;32m 1514\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m obj\u001b[38;5;241m.\u001b[39m_constructor_expanddim(\u001b[38;5;28mlist\u001b[39m(mapped), index\u001b[38;5;241m=\u001b[39mobj\u001b[38;5;241m.\u001b[39mindex)\n", + "File \u001b[0;32m/opt/anaconda3/lib/python3.12/site-packages/pandas/core/base.py:921\u001b[0m, in \u001b[0;36mIndexOpsMixin._map_values\u001b[0;34m(self, mapper, na_action, convert)\u001b[0m\n\u001b[1;32m 918\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(arr, ExtensionArray):\n\u001b[1;32m 919\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m arr\u001b[38;5;241m.\u001b[39mmap(mapper, na_action\u001b[38;5;241m=\u001b[39mna_action)\n\u001b[0;32m--> 921\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m algorithms\u001b[38;5;241m.\u001b[39mmap_array(arr, mapper, na_action\u001b[38;5;241m=\u001b[39mna_action, convert\u001b[38;5;241m=\u001b[39mconvert)\n", + "File 
\u001b[0;32m/opt/anaconda3/lib/python3.12/site-packages/pandas/core/algorithms.py:1743\u001b[0m, in \u001b[0;36mmap_array\u001b[0;34m(arr, mapper, na_action, convert)\u001b[0m\n\u001b[1;32m 1741\u001b[0m values \u001b[38;5;241m=\u001b[39m arr\u001b[38;5;241m.\u001b[39mastype(\u001b[38;5;28mobject\u001b[39m, copy\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)\n\u001b[1;32m 1742\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m na_action \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m-> 1743\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m lib\u001b[38;5;241m.\u001b[39mmap_infer(values, mapper, convert\u001b[38;5;241m=\u001b[39mconvert)\n\u001b[1;32m 1744\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 1745\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m lib\u001b[38;5;241m.\u001b[39mmap_infer_mask(\n\u001b[1;32m 1746\u001b[0m values, mapper, mask\u001b[38;5;241m=\u001b[39misna(values)\u001b[38;5;241m.\u001b[39mview(np\u001b[38;5;241m.\u001b[39muint8), convert\u001b[38;5;241m=\u001b[39mconvert\n\u001b[1;32m 1747\u001b[0m )\n", + "File \u001b[0;32mlib.pyx:2972\u001b[0m, in \u001b[0;36mpandas._libs.lib.map_infer\u001b[0;34m()\u001b[0m\n", + "File \u001b[0;32m~/Documents/epfl/Master/MA3/ADA/ada-2024-project-databrewers/src/scripts/sentiment_analysis.py:18\u001b[0m, in \u001b[0;36mpreprocess_text\u001b[0;34m(text)\u001b[0m\n\u001b[1;32m 17\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mpreprocess_text\u001b[39m(text):\n\u001b[0;32m---> 18\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m re\u001b[38;5;241m.\u001b[39mfindall(\u001b[38;5;124mr\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124mw+\u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m'\u001b[39m, text\u001b[38;5;241m.\u001b[39mlower())\n", + "File \u001b[0;32m/opt/anaconda3/lib/python3.12/re/__init__.py:217\u001b[0m, in 
\u001b[0;36mfindall\u001b[0;34m(pattern, string, flags)\u001b[0m\n\u001b[1;32m 209\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mfindall\u001b[39m(pattern, string, flags\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0\u001b[39m):\n\u001b[1;32m 210\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Return a list of all non-overlapping matches in the string.\u001b[39;00m\n\u001b[1;32m 211\u001b[0m \n\u001b[1;32m 212\u001b[0m \u001b[38;5;124;03m If one or more capturing groups are present in the pattern, return\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 215\u001b[0m \n\u001b[1;32m 216\u001b[0m \u001b[38;5;124;03m Empty matches are included in the result.\"\"\"\u001b[39;00m\n\u001b[0;32m--> 217\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m _compile(pattern, flags)\u001b[38;5;241m.\u001b[39mfindall(string)\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " ] } ], @@ -1065,6 +1090,9 @@ "id": "b771e187", "metadata": {}, "source": [ + "\n", + "<a id=\"semantic-analysis\"></a>\n", + "\n", "## Semantic analysis" ] }, @@ -1077,13 +1105,13 @@ ] }, { - "metadata": {}, "cell_type": "markdown", + "id": "372e36f37fd5722c", + "metadata": {}, "source": [ "In this section we aim to analyze the global semantic of the reviews and associate recurrent words to positive reviews. This will help us understand how different characteristics are associated with the beers' appeal. \n", "We will then perform this semantic analysis across seasons, to evaluate how consumer tastes and expectations vary depending on the time of the year." - ], - "id": "372e36f37fd5722c" + ] }, { "cell_type": "code", @@ -1183,14 +1211,6 @@ "### Flavor analysis " ] }, - { - "cell_type": "markdown", - "id": "bf711142c05d8590", - "metadata": {}, - "source": [ - "We will now analyze the flavor ..." 
- ] - }, { "cell_type": "markdown", "id": "36bedd8ad3816f65", @@ -1210,10 +1230,10 @@ "evalue": "name 'sem_ana' is not defined", "output_type": "error", "traceback": [ - "\u001B[1;31m---------------------------------------------------------------------------\u001B[0m", - "\u001B[1;31mNameError\u001B[0m Traceback (most recent call last)", - "Cell \u001B[1;32mIn[1], line 1\u001B[0m\n\u001B[1;32m----> 1\u001B[0m \u001B[43msem_ana\u001B[49m\u001B[38;5;241m.\u001B[39manalyse_flavours(clean_reviews)\n", - "\u001B[1;31mNameError\u001B[0m: name 'sem_ana' is not defined" + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[1], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m \u001b[43msem_ana\u001b[49m\u001b[38;5;241m.\u001b[39manalyse_flavours(clean_reviews)\n", + "\u001b[1;31mNameError\u001b[0m: name 'sem_ana' is not defined" ] } ], @@ -1264,7 +1284,7 @@ "id": "3e63e568", "metadata": {}, "source": [ - "For example Stout would be a good winter beer contrary to wild beer" + "For example, Stout would be a good winter beer, contrary to wild beer. Our analysis yields a perception profile for each beer type, which can then be mapped to seasonal preferences to guide production or marketing toward seasonal trends." ] }, { @@ -1279,7 +1299,9 @@ "cell_type": "markdown", "id": "51ce79f1", "metadata": {}, - "source": "Now that we did a general semantic analysis on reviews, we will perform a semantic comparison on the season reviews, keeping only the highly rated ones. To do so, we categorize the reviews according to the season when they were written." + "source": [ + "Now that we did a general semantic analysis on reviews, we will perform a semantic comparison on the season reviews, keeping only the highly rated ones. To do so, we categorize the reviews according to the season when they were written." 
+ ] }, { "cell_type": "code", @@ -1345,10 +1367,12 @@ ] }, { - "metadata": {}, "cell_type": "markdown", - "source": "On the BeerAdvocate website, they provide detailed guidance on rating beers, offering examples of adjectives for each characteristic (aroma, palate, mouthfeel, and taste), and grouping them into categories. For instance, in the case of aroma, adjectives like “sweet,” “roasty,” “smoky,” “toasty,” “nutty,” and “chocolate” are grouped under the category “Malt.” Drawing inspiration from their approach, we similarly grouped various adjectives into categories for all four characteristics. ", - "id": "fe018cdaf1175d75" + "id": "fe018cdaf1175d75", + "metadata": {}, + "source": [ + "On the BeerAdvocate website, they provide detailed guidance on rating beers, offering examples of adjectives for each characteristic (aroma, palate, mouthfeel, and taste), and grouping them into categories. For instance, in the case of aroma, adjectives like “sweet,” “roasty,” “smoky,” “toasty,” “nutty,” and “chocolate” are grouped under the category “Malt.” Drawing inspiration from their approach, we similarly grouped various adjectives into categories for all four characteristics. " + ] }, { "cell_type": "code", @@ -1480,7 +1504,9 @@ "cell_type": "markdown", "id": "65dcd9f9d134a993", "metadata": {}, - "source": "#### Wordcloud by seasons" + "source": [ + "#### Wordcloud by seasons" + ] }, { "cell_type": "code", @@ -1528,10 +1554,12 @@ ] }, { - "metadata": {}, "cell_type": "markdown", - "source": "During Winter, we can see that people tend to prefer aromas that are rich, full in flavor, like \"roasted\", \"caramel\", \"chocolate\" aromas. We see the same trend for the palate, with a high occurrence of those same words. Moreover, people apparently prefer a \"smooth\", \"creamy\" and \"rich\" mouthfeel. 
This is also consistent with the taste preferences, with a high occurrence of the words \"sweet\", \"chocolate\".", - "id": "c1e40e335dcd0006" + "id": "c1e40e335dcd0006", + "metadata": {}, + "source": [ + "During Winter, we can see that people tend to prefer aromas that are rich, full in flavor, like \"roasted\", \"caramel\", \"chocolate\" aromas. We see the same trend for the palate, with a high occurrence of those same words. Moreover, people apparently prefer a \"smooth\", \"creamy\" and \"rich\" mouthfeel. This is also consistent with the taste preferences, with a high occurrence of the words \"sweet\", \"chocolate\"." + ] }, { "cell_type": "code", @@ -1555,16 +1583,18 @@ ] }, { - "metadata": {}, "cell_type": "markdown", - "source": "In summer, we can see there is a preference for light aromas, with a variety of words like \"grapefruit\", \"citrus\", \"fruit\", \"fruity\", \"herbal\", etc. Moreover, the word \"bitter\" is overrepresented for the palate category, compared to colder seasons. Additionally, consumers seem to specifically like a \"dry\" mouthfeel, which is also an interesting result compared to the other seasons. Finally, concerning the taste criteria, we have opposing results, with high occurrence of words \"sweet\" and\"citrus\".", - "id": "46b5d2e785942fed" + "id": "46b5d2e785942fed", + "metadata": {}, + "source": [ + "In summer, we can see there is a preference for light aromas, with a variety of words like \"grapefruit\", \"citrus\", \"fruit\", \"fruity\", \"herbal\", etc. Moreover, the word \"bitter\" is overrepresented for the palate category, compared to colder seasons. Additionally, consumers seem to specifically like a \"dry\" mouthfeel, which is also an interesting result compared to the other seasons. Finally, concerning the taste criteria, we have opposing results, with high occurrence of words \"sweet\" and \"citrus\"." 
+ ] }, { - "metadata": {}, "cell_type": "markdown", - "source": "", - "id": "447be7a9b59781e1" + "id": "447be7a9b59781e1", + "metadata": {}, + "source": [] }, { "cell_type": "code", @@ -1588,10 +1618,12 @@ ] }, { - "metadata": {}, "cell_type": "markdown", - "source": "For Autumn, we can see a clear preference for full-bodied aromas, like \"chocolate\", \"caramel\", exactly like winter. Moreover, we can see an interesting change for the palate criteria, with a high occurrence of the word \"spicy\" and \"sweet\", which is relevant with the taste preferences. Finally, people also prefer a \"smooth\" and \"creamy\" mouthfeel during this time period, which is also similar for winter reviews.", - "id": "e48369edce5a49d1" + "id": "e48369edce5a49d1", + "metadata": {}, + "source": [ + "For Autumn, we can see a clear preference for full-bodied aromas, like \"chocolate\", \"caramel\", exactly like winter. Moreover, we can see an interesting change for the palate criteria, with a high occurrence of the word \"spicy\" and \"sweet\", which is relevant with the taste preferences. Finally, people also prefer a \"smooth\" and \"creamy\" mouthfeel during this time period, which is also similar for winter reviews." + ] }, { "cell_type": "code", @@ -1615,14 +1647,14 @@ ] }, { - "metadata": {}, "cell_type": "markdown", + "id": "a4aa5948947ef2e7", + "metadata": {}, "source": [ "During Spring season, there is a clear preference for full-bodied aromas like \"chocolate\", \"caramel\". We also have a lot of fruit word occurrences, like \"citrus\", \"grapefruit\", fruit\", etc. Moreover, we can see a clear difference on the palate level, as people seem to specifically appreciate \"bitter\" and \"sour\" palates.\n", "Additionally, consumers seem to like a \"creamy\" and \"smooth\" mouthfeel, which stays quite common across the seasons.\n", "Finally, \"sweet\" tastes are more favored during this time period." - ], - "id": "a4aa5948947ef2e7" + ] }, { "cell_type": "markdown",