Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add_porn_hub_regress_code #1

Draft
wants to merge 3 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
95 changes: 95 additions & 0 deletions pornhub_lineplot_over_years.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt

# Input file and the columns this report reads from it.
CSV_PATH = 'porn-with-dates-2022.csv'
DATE_COLUMN = 'date'
VIEWS_COLUMN = 'views'
TAGS_COLUMN = 'categories'
TOP_N = 10  # how many tags get their own bar; everything else is "Other"


def parse_tags(raw):
    """Parse one cell of the tag column into a list of tag names.

    Each cell holds the string representation of a Python list.
    ``ast.literal_eval`` is used instead of ``eval`` so that a crafted cell
    cannot execute code. Cells that are not strings (e.g. NaN for a missing
    value) yield an empty list.

    Raises:
        ValueError, SyntaxError: if the cell is not a valid Python literal.
    """
    import ast  # imported here so the file's import header stays untouched

    if not isinstance(raw, str):
        return []
    return list(ast.literal_eval(raw))


def percent_of(part, whole):
    """Return *part* as a percentage of *whole*, or 0.0 when *whole* is zero."""
    return (part / whole) * 100 if whole else 0.0


def summarize_top_tags(tag_lists, views, top_n=TOP_N):
    """Aggregate views per tag and split them into "top" and "other".

    Args:
        tag_lists: Series whose elements are lists of tag names (one per video).
        views: Series of view counts sharing ``tag_lists``' index.
        top_n: number of highest-viewed tags to report individually.

    Returns:
        A ``(top, other_views, total_views)`` tuple where ``top`` is a Series
        mapping each of the ``top_n`` tags to its summed views (descending),
        ``other_views`` is the summed views of videos carrying none of those
        tags, and ``total_views`` is the sum of ``views``.

    A video's views are credited to every tag it carries, so the per-tag
    figures can add up to more than ``total_views``.
    """
    total_views = views.sum()

    # One row per (video, tag); videos with an empty tag list drop out here.
    exploded = (
        pd.DataFrame({'tag': tag_lists, 'views': views})
        .explode('tag')
        .dropna(subset=['tag'])
    )
    top = exploded.groupby('tag')['views'].sum().nlargest(top_n)

    # Exact membership test: a substring/regex match on the raw cell would
    # treat e.g. "Anal" as present in "Analingus" and choke on tags that
    # contain regex metacharacters.
    wanted = set(top.index)
    in_top = tag_lists.map(lambda tags: not wanted.isdisjoint(tags)).astype(bool)
    other_views = views[~in_top].sum()

    return top, other_views, total_views


def main():
    """Print and plot the view share of the top tags in the dataset's latest year."""
    df = pd.read_csv(CSV_PATH)
    df[DATE_COLUMN] = pd.to_datetime(df[DATE_COLUMN])

    newest = df[DATE_COLUMN].dt.year.max()
    if pd.isna(newest):
        raise ValueError(f'{CSV_PATH} contains no rows with a usable date')
    latest_year = int(newest)

    df_latest_year = df[df[DATE_COLUMN].dt.year == latest_year]
    tag_lists = df_latest_year[TAGS_COLUMN].map(parse_tags)
    top, other_views, total_views = summarize_top_tags(
        tag_lists, df_latest_year[VIEWS_COLUMN]
    )

    print(f"Top {TOP_N} tags in {latest_year}:")
    top_tags = []
    top_views = []
    for tag, views in top.items():
        percentage = percent_of(views, total_views)
        print(f"{tag}: {views} views ({percentage:.2f}% of total)")
        top_tags.append(tag)
        top_views.append(percentage)

    percentage_other = percent_of(other_views, total_views)
    top_tags.append('Other')
    top_views.append(percentage_other)
    print(f"Other: {other_views} views ({percentage_other:.2f}% of total)")

    # Bar chart of the percentages printed above.
    plt.figure(figsize=(10, 6))
    plt.bar(top_tags, top_views)
    plt.xlabel('Tags')
    plt.ylabel('Number of Views (%)')
    plt.title(f'Distribution of Views Among Top Tags in {latest_year}')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()


if __name__ == '__main__':
    main()
70 changes: 70 additions & 0 deletions pornhub_tags_regression_over_years.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
import pandas as pd
from sklearn.linear_model import LinearRegression
import numpy as np
from scipy.stats import linregress

# Input/output locations and the significance threshold for the per-tag fits.
CSV_PATH = 'porn-with-dates-2022.csv'
OUTPUT_PATH = 'significant_tags.txt'
ALPHA = 0.05


def parse_tags(raw):
    """Parse one cell of the 'categories' column into a list of tag names.

    Each cell holds the string representation of a Python list.
    ``ast.literal_eval`` is used instead of ``eval`` so that a crafted cell
    cannot execute code. Cells that are not strings (e.g. NaN for a missing
    value) yield an empty list.

    Raises:
        ValueError, SyntaxError: if the cell is not a valid Python literal.
    """
    import ast  # imported here so the file's import header stays untouched

    if not isinstance(raw, str):
        return []
    return list(ast.literal_eval(raw))


def yearly_tag_shares(years, tag_lists, views):
    """Compute each tag's share of total views for every year.

    Args:
        years: Series of upload years (rows with a missing year are ignored).
        tag_lists: Series of tag-name lists sharing ``years``' index.
        views: Series of view counts sharing ``years``' index.

    Returns:
        DataFrame indexed by year (ascending) with one column per tag; each
        value is (views of that year's videos carrying the tag) divided by
        (all views of that year), or 0 when the year has no views.
    """
    table = pd.DataFrame({
        'year': years,
        # A video counts once per tag even if the tag is repeated in its list.
        'tag': tag_lists.map(lambda tags: list(dict.fromkeys(tags))),
        'views': views,
    })
    year_totals = table.groupby('year')['views'].sum()

    # One row per (video, tag); a single pass replaces re-filtering the whole
    # frame for every tag/year combination.
    exploded = table.explode('tag').dropna(subset=['tag'])
    if exploded.empty:
        return pd.DataFrame(index=year_totals.index)

    tag_totals = (
        exploded.groupby(['year', 'tag'])['views']
        .sum()
        .unstack('tag', fill_value=0)
        .reindex(year_totals.index, fill_value=0)
    )
    # Years with zero total views get a share of 0 rather than inf/NaN.
    return tag_totals.div(year_totals.where(year_totals != 0), axis=0).fillna(0)


def tag_trends(shares):
    """Fit a linear regression of view share against year for every tag.

    Args:
        shares: DataFrame as returned by ``yearly_tag_shares``.

    Returns:
        Dict mapping tag -> ``{'slope', 'r_value', 'p_value'}``, with tags in
        sorted order so downstream output is reproducible between runs.

    Raises:
        ValueError: if there are tags to fit but fewer than two years of data.
    """
    year_axis = shares.index.to_numpy(dtype=float)
    if shares.shape[1] and len(year_axis) < 2:
        raise ValueError('need at least two distinct years to fit a trend')

    trends = {}
    for tag in sorted(shares.columns, key=str):
        slope, _, r_value, p_value, _ = linregress(
            year_axis, shares[tag].to_numpy(dtype=float)
        )
        trends[tag] = {'slope': slope, 'r_value': r_value, 'p_value': p_value}
    return trends


def select_significant(tag_stats, alpha=ALPHA):
    """Return the tags whose regression p-value is below *alpha*.

    A NaN p-value (e.g. a tag whose share never changes) is never selected.
    NOTE(review): no multiple-comparison correction is applied across tags,
    and fits over very few years carry little statistical weight.
    """
    return [tag for tag, stats in tag_stats.items() if stats['p_value'] < alpha]


def main():
    """Regress every tag's yearly view share on year and report significant tags."""
    df = pd.read_csv(CSV_PATH)
    df['date'] = pd.to_datetime(df['date'])

    shares = yearly_tag_shares(
        df['date'].dt.year, df['categories'].map(parse_tags), df['views']
    )
    tag_stats = tag_trends(shares)
    significant_tags = select_significant(tag_stats)

    # Save the significant tags with their statistics to a tab-delimited file.
    # Explicit UTF-8 so the output does not depend on the platform's locale.
    with open(OUTPUT_PATH, 'w', encoding='utf-8') as file:
        file.write('Tag\tSlope\tR Value\tP Value\n')
        for tag in significant_tags:
            stats = tag_stats[tag]
            file.write(f"{tag}\t{stats['slope']}\t{stats['r_value']}\t{stats['p_value']}\n")

    print("Tags with significant changes in view count over years:")
    for tag in significant_tags:
        print(tag)


if __name__ == '__main__':
    main()
95 changes: 95 additions & 0 deletions total_views_over_years_pornhub.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt

# Input file and the columns this report reads from it.
CSV_PATH = 'porn-with-dates-2022.csv'
DATE_COLUMN = 'date'
VIEWS_COLUMN = 'views'
TAGS_COLUMN = 'categories'
TOP_N = 10  # how many tags get their own bar; everything else is "Other"


def parse_tags(raw):
    """Parse one cell of the tag column into a list of tag names.

    Each cell holds the string representation of a Python list.
    ``ast.literal_eval`` is used instead of ``eval`` so that a crafted cell
    cannot execute code. Cells that are not strings (e.g. NaN for a missing
    value) yield an empty list.

    Raises:
        ValueError, SyntaxError: if the cell is not a valid Python literal.
    """
    import ast  # imported here so the file's import header stays untouched

    if not isinstance(raw, str):
        return []
    return list(ast.literal_eval(raw))


def percent_of(part, whole):
    """Return *part* as a percentage of *whole*, or 0.0 when *whole* is zero."""
    return (part / whole) * 100 if whole else 0.0


def summarize_top_tags(tag_lists, views, top_n=TOP_N):
    """Aggregate views per tag and split them into "top" and "other".

    Args:
        tag_lists: Series whose elements are lists of tag names (one per video).
        views: Series of view counts sharing ``tag_lists``' index.
        top_n: number of highest-viewed tags to report individually.

    Returns:
        A ``(top, other_views, total_views)`` tuple where ``top`` is a Series
        mapping each of the ``top_n`` tags to its summed views (descending),
        ``other_views`` is the summed views of videos carrying none of those
        tags, and ``total_views`` is the sum of ``views``.

    A video's views are credited to every tag it carries, so the per-tag
    figures can add up to more than ``total_views``.
    """
    total_views = views.sum()

    # One row per (video, tag); videos with an empty tag list drop out here.
    exploded = (
        pd.DataFrame({'tag': tag_lists, 'views': views})
        .explode('tag')
        .dropna(subset=['tag'])
    )
    top = exploded.groupby('tag')['views'].sum().nlargest(top_n)

    # Exact membership test: a substring/regex match on the raw cell would
    # treat e.g. "Anal" as present in "Analingus" and choke on tags that
    # contain regex metacharacters.
    wanted = set(top.index)
    in_top = tag_lists.map(lambda tags: not wanted.isdisjoint(tags)).astype(bool)
    other_views = views[~in_top].sum()

    return top, other_views, total_views


def main():
    """Print and plot the view share of the top tags in the dataset's latest year."""
    df = pd.read_csv(CSV_PATH)
    df[DATE_COLUMN] = pd.to_datetime(df[DATE_COLUMN])

    newest = df[DATE_COLUMN].dt.year.max()
    if pd.isna(newest):
        raise ValueError(f'{CSV_PATH} contains no rows with a usable date')
    latest_year = int(newest)

    df_latest_year = df[df[DATE_COLUMN].dt.year == latest_year]
    tag_lists = df_latest_year[TAGS_COLUMN].map(parse_tags)
    top, other_views, total_views = summarize_top_tags(
        tag_lists, df_latest_year[VIEWS_COLUMN]
    )

    print(f"Top {TOP_N} tags in {latest_year}:")
    top_tags = []
    top_views = []
    for tag, views in top.items():
        percentage = percent_of(views, total_views)
        print(f"{tag}: {views} views ({percentage:.2f}% of total)")
        top_tags.append(tag)
        top_views.append(percentage)

    percentage_other = percent_of(other_views, total_views)
    top_tags.append('Other')
    top_views.append(percentage_other)
    print(f"Other: {other_views} views ({percentage_other:.2f}% of total)")

    # Bar chart of the percentages printed above.
    plt.figure(figsize=(10, 6))
    plt.bar(top_tags, top_views)
    plt.xlabel('Tags')
    plt.ylabel('Number of Views (%)')
    plt.title(f'Distribution of Views Among Top Tags in {latest_year}')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()


if __name__ == '__main__':
    main()
95 changes: 95 additions & 0 deletions total_views_over_years_xhamster.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt

# Input file and the columns this report reads from it.
CSV_PATH = 'xhamster.csv'
DATE_COLUMN = 'upload_date'
VIEWS_COLUMN = 'nb_views'
TAGS_COLUMN = 'channels'
TOP_N = 10  # how many tags get their own bar; everything else is "Other"


def parse_tags(raw):
    """Parse one cell of the tag column into a list of tag names.

    Each cell holds the string representation of a Python list.
    ``ast.literal_eval`` is used instead of ``eval`` so that a crafted cell
    cannot execute code. Cells that are not strings (e.g. NaN for a missing
    value) yield an empty list.

    Raises:
        ValueError, SyntaxError: if the cell is not a valid Python literal.
    """
    import ast  # imported here so the file's import header stays untouched

    if not isinstance(raw, str):
        return []
    return list(ast.literal_eval(raw))


def percent_of(part, whole):
    """Return *part* as a percentage of *whole*, or 0.0 when *whole* is zero."""
    return (part / whole) * 100 if whole else 0.0


def summarize_top_tags(tag_lists, views, top_n=TOP_N):
    """Aggregate views per tag and split them into "top" and "other".

    Args:
        tag_lists: Series whose elements are lists of tag names (one per video).
        views: Series of view counts sharing ``tag_lists``' index.
        top_n: number of highest-viewed tags to report individually.

    Returns:
        A ``(top, other_views, total_views)`` tuple where ``top`` is a Series
        mapping each of the ``top_n`` tags to its summed views (descending),
        ``other_views`` is the summed views of videos carrying none of those
        tags, and ``total_views`` is the sum of ``views``.

    A video's views are credited to every tag it carries, so the per-tag
    figures can add up to more than ``total_views``.
    """
    total_views = views.sum()

    # One row per (video, tag); videos with an empty tag list drop out here.
    exploded = (
        pd.DataFrame({'tag': tag_lists, 'views': views})
        .explode('tag')
        .dropna(subset=['tag'])
    )
    top = exploded.groupby('tag')['views'].sum().nlargest(top_n)

    # Exact membership test on the parsed lists (rather than a substring/regex
    # match on the raw cell, which over-matches tags that are prefixes of
    # other tags). Working on a row mask also removes the dependence on the
    # 'id' column being unique.
    wanted = set(top.index)
    in_top = tag_lists.map(lambda tags: not wanted.isdisjoint(tags)).astype(bool)
    other_views = views[~in_top].sum()

    return top, other_views, total_views


def main():
    """Print and plot the view share of the top tags in the dataset's latest year."""
    df = pd.read_csv(CSV_PATH)
    df[DATE_COLUMN] = pd.to_datetime(df[DATE_COLUMN])

    newest = df[DATE_COLUMN].dt.year.max()
    if pd.isna(newest):
        raise ValueError(f'{CSV_PATH} contains no rows with a usable date')
    latest_year = int(newest)

    df_latest_year = df[df[DATE_COLUMN].dt.year == latest_year]
    tag_lists = df_latest_year[TAGS_COLUMN].map(parse_tags)
    top, other_views, total_views = summarize_top_tags(
        tag_lists, df_latest_year[VIEWS_COLUMN]
    )

    print(f"Top {TOP_N} tags in {latest_year}:")
    top_tags = []
    top_views = []
    for tag, views in top.items():
        percentage = percent_of(views, total_views)
        print(f"{tag}: {views} views ({percentage:.2f}% of total)")
        top_tags.append(tag)
        top_views.append(percentage)

    percentage_other = percent_of(other_views, total_views)
    top_tags.append('Other')
    top_views.append(percentage_other)
    print(f"Other: {other_views} views ({percentage_other:.2f}% of total)")

    # Bar chart of the percentages printed above.
    plt.figure(figsize=(10, 6))
    plt.bar(top_tags, top_views)
    plt.xlabel('Tags')
    plt.ylabel('Number of Views (%)')
    plt.title(f'Distribution of Views Among Top Tags in {latest_year}')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()


if __name__ == '__main__':
    main()
Loading