Skip to content

Commit

Permalink
Update from juldib for gender part
Browse files Browse the repository at this point in the history
  • Loading branch information
juldib committed Dec 20, 2024
1 parent a279a66 commit 8ea7a3a
Show file tree
Hide file tree
Showing 7 changed files with 3,637 additions and 187 deletions.
102 changes: 0 additions & 102 deletions assets/style.css

This file was deleted.

2 changes: 1 addition & 1 deletion docs/_includes/pie_chart_gender.html

Large diffs are not rendered by default.

Large diffs are not rendered by default.

14 changes: 0 additions & 14 deletions docs/_includes/top5_count_male.html

This file was deleted.

3,492 changes: 3,423 additions & 69 deletions julien_evolution_genre_MF.ipynb

Large diffs are not rendered by default.

108 changes: 108 additions & 0 deletions src/models/trend_by_gender.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
import pandas as pd
import src.utils.pipelines as pip

def load_and_clean_influenced_names(file_path):
    """
    Load the influenced-names dataset and keep only the usable influenced rows.

    A row is kept when its "Influenced" value is strictly positive and its
    "Character Name" is not one of the known identification mistakes
    ("the", "a", "Mr"). The "Count" and "Mean Difference" columns are then removed.
    The number of remaining rows and the frame shape are printed as a quick check.

    :param file_path: The path to the CSV file containing the influenced names data
        (anything accepted by ``pd.read_csv``).
    :return: A new, cleaned DataFrame containing the influenced names.
    """
    prophet = pd.read_csv(file_path)

    # Build both row conditions up front and filter once
    is_influenced = prophet["Influenced"] > 0
    is_misidentified = prophet["Character Name"].isin(["the", "a", "Mr"])
    influenced_prophet = prophet[is_influenced & ~is_misidentified]

    # Print the number of influenced names and the shape of the dataframe
    print("Number of influenced names with prophet: ", len(influenced_prophet))
    print(influenced_prophet.shape)

    # Return a new frame instead of dropping in place: an in-place drop on a
    # filtered frame triggers pandas' chained-assignment warning.
    return influenced_prophet.drop(columns=["Count", 'Mean Difference'])


def load_and_clean_character_data():
    """
    Load the names data and reduce it to a 'Name' / 'Gender' table.

    Only the 'Name' and 'Sex' columns are kept, and 'Sex' is renamed to 'Gender'.

    NOTE(review): merge_influenced_and_character_data in this module selects
    'Year' and 'Count' from its df_character argument; this function does not
    keep those columns, so confirm which frame is meant to be passed there.

    :return: A new DataFrame with 'Name' and 'Gender' columns.
    """
    # read_all_names_data() returns five values; only the first one is used here
    global_names, _, _, _, _ = pip.read_all_names_data()

    # NOTE(review): the first returned value is invoked as a callable to obtain
    # the DataFrame - confirm this matches what src.utils.pipelines returns.
    df_character = global_names()

    # Select and rename in one expression so a new frame is returned; renaming a
    # column subset in place triggers pandas' chained-assignment warning.
    return df_character[['Name', 'Sex']].rename(columns={'Sex': 'Gender'})

def merge_influenced_and_character_data(influenced_prophet, df_character):
    """
    Attach gender and count information to the influenced names.

    Every row of ``influenced_prophet`` is kept (left join). A row is matched with a
    ``df_character`` row when 'Normalized_name' equals 'Name' and both 'Year' values agree.

    :param influenced_prophet: DataFrame of influenced names with 'Normalized_name' and 'Year' columns.
    :param df_character: DataFrame providing 'Name', 'Gender', 'Year' and 'Count' columns.
    :return: The joined DataFrame, without the helper 'Name' column.
    """
    # Only these columns of the character data take part in the join
    character_columns = df_character[['Name', 'Gender', 'Year', 'Count']]

    joined = pd.merge(
        influenced_prophet,
        character_columns,
        how='left',
        left_on=['Normalized_name', 'Year'],
        right_on=['Name', 'Year'],
    )

    # 'Name' only served as the join key on the right-hand side
    return joined.drop(columns='Name')

def fill_missing_gender_and_clean(merged_df):
    """
    Complete missing 'Gender' values from a hand-written lookup table and remove
    three character names ('Daily', 'Hill', 'Lo').

    A lookup entry applies to a row when 'Wikipedia ID', 'Movie Name', 'Year' and
    'Character Name' all match. Gender values that are already present are kept.

    :param merged_df: DataFrame with the four key columns above and a 'Gender' column.
    :return: A DataFrame with 'Gender' filled where a lookup entry exists and the
        'Daily', 'Hill' and 'Lo' rows removed.
    """
    key_columns = ['Wikipedia ID', 'Movie Name', 'Year', 'Character Name']

    # One tuple per character: (Wikipedia ID, Movie Name, Year, Character Name, Gender)
    reference_rows = [
        (323715, 'troy', 2004, 'Briseis', 'F'),
        (920296, 'somewhere in time', 1980, 'McKenna', 'F'),
        (11077335, 'doctor zhivago', 1965, 'Yuri', 'M'),
        (97758, 'doctor zhivago', 1965, 'Yuri', 'M'),
        (97646, 'die hard', 1988, 'Kristoff', 'M'),
        (146947, 'spider-man', 2002, 'Daily', 'F'),
        (8695, 'dr. strangelove or: how i learned to stop worrying and love the bomb', 1964, 'Alexei', 'M'),
        (10645970, 'rocky', 1981, 'Shankar', 'M'),
        (23487440, 'alien', 1979, 'Ash', 'M'),
        (321496, 'pirates of the caribbean: the curse of the black pearl', 2003, 'Sparrow', 'M'),
        (97758, 'doctor zhivago', 1965, 'Pasha', 'M'),
        (68245, 'bonnie and clyde', 1967, 'Moss', 'M'),
        (133648, 'scent of a woman', 1992, 'Ranger', 'M'),
        (1210303, 'constantine', 2005, 'Lucifer', 'M'),
        (950929, 'the haunting', 1963, 'Hill', 'M'),
        (697113, 'big trouble in little china', 1986, 'Lo', 'M'),
        (3917873, 'chitty chitty bang bang', 1968, 'Jemima', 'F'),
        (31557, 'the good, the bad and the ugly', 1966, 'Blondie', 'F'),
    ]
    gender_lookup = pd.DataFrame(reference_rows, columns=key_columns + ['Gender'])

    # The lookup's gender arrives as 'Gender_reference' next to the existing 'Gender'
    with_reference = merged_df.merge(
        gender_lookup,
        on=key_columns,
        how='left',
        suffixes=('', '_reference'),
    )

    # Only gaps are filled; existing gender values win over the lookup
    with_reference['Gender'] = with_reference['Gender'].fillna(with_reference['Gender_reference'])
    with_reference = with_reference.drop(columns=['Gender_reference'])

    # Remove the rows for the three excluded character names
    excluded = with_reference['Character Name'].isin(['Daily', 'Hill', 'Lo'])
    return with_reference[~excluded]
104 changes: 104 additions & 0 deletions src/utils/gender_analysis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
import pandas as pd
import plotly.express as px

def plot_gender_proportion(df):
    """
    Show a pie chart of how often each 'Gender' value occurs in ``df``.
    :param df: DataFrame with a 'Gender' column.
    :return: None. Displays the pie chart on a transparent background.
    """
    # Number of rows per gender value
    counts_by_gender = df['Gender'].value_counts()

    chart_options = {
        'values': counts_by_gender.values,
        'names': counts_by_gender.index,
        'title': 'Proportion of Male vs Female Names Influenced',
    }
    pie = px.pie(counts_by_gender, **chart_options)

    # Same fully transparent colour for the plot area and the surrounding paper
    transparent = 'rgba(0,0,0,0)'
    pie.update_layout(plot_bgcolor=transparent, paper_bgcolor=transparent)

    pie.show()

def _genre_gender_percentages(df, top_n=10):
    """Return long-format 'M'/'F' percentage shares of 'Count' for the top_n genres by total 'Count'."""
    # Explicit copy: the column assignments below must not touch the caller's
    # frame and must not trigger pandas' chained-assignment warning.
    exploded = df.dropna(subset=['Genres']).copy()

    # A movie can belong to several comma-separated genres: one row per genre
    exploded['Genres'] = exploded['Genres'].str.split(',')
    exploded = exploded.explode('Genres')
    exploded['Genres'] = exploded['Genres'].str.strip()

    # Total 'Count' per (genre, gender) pair and per genre
    genre_gender = exploded.groupby(['Genres', 'Gender'])['Count'].sum().reset_index()
    genre_totals = genre_gender.groupby('Genres')['Count'].sum().reset_index()

    # Keep only the genres with the largest totals
    top_genres = genre_totals.sort_values(by='Count', ascending=False).head(top_n)
    genre_gender_top = genre_gender[genre_gender['Genres'].isin(top_genres['Genres'])]

    # One column per gender
    pivot = genre_gender_top.pivot(index='Genres', columns='Gender', values='Count').fillna(0)

    # A gender that never occurs among the top genres has no column after the
    # pivot; add it as zeros so the percentage computation cannot raise KeyError.
    for gender in ('M', 'F'):
        if gender not in pivot.columns:
            pivot[gender] = 0

    pivot['Total'] = pivot.sum(axis=1)
    pivot['M_Percent'] = (pivot['M'] / pivot['Total']) * 100
    pivot['F_Percent'] = (pivot['F'] / pivot['Total']) * 100
    pivot = pivot.reset_index()

    # Long format (one row per genre and gender) for plotting with plotly express
    melted = pivot.melt(id_vars='Genres', value_vars=['M_Percent', 'F_Percent'],
                        var_name='Gender', value_name='Percentage')

    # 'M_Percent'/'F_Percent' -> 'M'/'F' so the legend matches the colour map
    melted['Gender'] = melted['Gender'].str.replace('_Percent', '', regex=False)
    return melted


def plot_genre_gender_influence(df):
    """
    Plots the percentage distribution of influenced baby names by gender
    for the ten genres with the largest total 'Count'.
    Parameters:
    - df (DataFrame): The dataset containing influenced baby names and genres.
      The columns 'Genres' (comma-separated string), 'Gender' and 'Count' are read.
      Rows without 'Genres' are ignored. The frame itself is not modified.
    Returns:
    - None: Displays the plot.
    """
    genre_percentage_melted = _genre_gender_percentages(df, top_n=10)

    # Horizontal stacked bars: one bar per genre, split into the M and F shares
    fig = px.bar(
        genre_percentage_melted,
        x='Percentage',
        y='Genres',
        color='Gender',
        color_discrete_map={'M': 'blue', 'F': 'pink'},
        title='Percentage of Influenced Baby Names by Gender Across Top Genres',
        labels={'Percentage': 'Percentage of Influenced Names', 'Genres': 'Genre', 'Gender': 'Gender'},
        orientation='h'  # Horizontal bars
    )

    fig.update_layout(
        xaxis=dict(range=[0, 100]),  # Since percentages range from 0 to 100
        barmode='stack',  # Stack the bars for gender comparison
        plot_bgcolor='rgba(0,0,0,0)',  # Transparent plot background
        paper_bgcolor='rgba(0,0,0,0)',  # Transparent chart background
    )

    fig.show()

0 comments on commit 8ea7a3a

Please sign in to comment.