From b8b837b5276e75f0b59515d678beae98e22ef810 Mon Sep 17 00:00:00 2001 From: Rizhong Lin Date: Fri, 15 Nov 2024 20:51:34 +0100 Subject: [PATCH] results rq1 update --- results_new.ipynb | 74 +++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 69 insertions(+), 5 deletions(-) diff --git a/results_new.ipynb b/results_new.ipynb index 69eb4e2..753de34 100644 --- a/results_new.ipynb +++ b/results_new.ipynb @@ -47,7 +47,12 @@ "import pandas as pd\n", "import numpy as np\n", "import seaborn as sns\n", - "import matplotlib.pyplot as plt" + "import matplotlib.pyplot as plt\n", + "\n", + "from src.utils.visualization_utils import *\n", + "\n", + "# Setup\n", + "setup_visualization()" ] }, { @@ -229,8 +234,70 @@ "source": [ "#### 1.1 What we have done for the initial analysis\n", "\n", + "- We examined distributions of key metrics (vote average, revenue, profit, and revenue-to-budget ratio) using histograms with kernel density estimation.\n", + "- We visualized relationships between critical variables through scatter plots, particularly focusing on vote average vs. count and financial metrics vs. budget.\n", + "- We calculated and visualized a correlation matrix to quantify relationships between `vote_average`, `vote_count`, `revenue`, `budget`, and `profit`.\n", + "- We preprocessed the data by removing zero-value entries and calculating derived metrics such as profit and revenue-to-budget ratios.\n", + "- We applied logarithmic transformations to handle the high skewness in financial data distributions.\n", + "\n", + "#### 1.2 Key observations\n", + "\n", + "##### Distribution Analysis\n", + "\n", + "1. Vote Average:\n", + "\n", + " - Normal distribution centered around 6.0-6.5\n", + " - Range primarily between 4-8\n", + " - Very few movies rated below 4 or above 8\n", + "\n", + "2. 
Revenue and Profit (Log Scale):\n", + "\n", + " - Both show right-skewed distributions\n", + " - Revenue peaks around 15-17 on log scale\n", + " - Profit shows similar pattern but with more spread\n", + " - Small cluster of extremely high performers\n", + "\n", + "3. Revenue-to-Budget Ratio (Log Scale):\n", + " - Heavily right-skewed\n", + " - Most movies cluster between 0-2 (log scale)\n", + " - Long tail indicating few movies with exceptional ROI\n", "\n", - "#### 1.2 Key observations\n" + "##### Relationship Analysis\n", + "\n", + "1. Vote Average vs. Vote Count:\n", + "\n", + " - Positive correlation (0.38)\n", + " - Higher vote counts tend toward 6-8 rating range\n", + " - More rating variance with fewer votes\n", + " - Ratings stabilize with increased votes\n", + "\n", + "2. Revenue/Profit vs. Budget:\n", + "\n", + " - Strong positive correlation (revenue-budget: 0.72)\n", + " - Higher budgets show wider profit variance\n", + " - Some high-budget outliers with exceptional returns\n", + " - Most movies cluster in lower budget range with moderate returns\n", + "\n", + "3. Rating vs. Revenue:\n", + " - Weak positive correlation (0.20)\n", + " - Higher-rated movies tend to have slightly better revenue\n", + " - Wide revenue spread regardless of rating\n", + " - Log-scale reveals clearer pattern at higher ratings\n", + "\n", + "##### Correlation Matrix Insights\n", + "\n", + "1. Strongest Correlations:\n", + "\n", + " - Revenue-Profit: 0.98 (very strong)\n", + " - Revenue-Vote Count: 0.77 (strong)\n", + " - Profit-Vote Count: 0.75 (strong)\n", + "\n", + "2. Weak Correlations:\n", + " - Vote Average-Budget: 0.028 (negligible)\n", + " - Vote Average-Revenue: 0.20 (weak)\n", + " - Vote Average-Profit: 0.23 (weak)\n", + "\n", + "These patterns suggest that while ratings have some influence on success, volume of audience engagement (vote count) and budget size are stronger predictors of financial performance. 
The extremely high correlation between revenue and profit (0.98) indicates that profit tracks revenue almost one-to-one, which suggests that cost management might be less critical than revenue generation for overall success.\n" ] }, { @@ -442,10 +509,7 @@ "source": [ "## code\n", "from src.utils.genre_analysis import *\n", - "from src.utils.visualization_utils import *\n", "\n", - "# Setup\n", - "setup_visualization()\n", "df, df_genres = prepare_data(\"data/cmu_tmdb.csv\")\n", "unique_genres = sorted(df_genres[\"genres\"].unique())\n", "genre_colors = create_genre_colors(unique_genres)\n",