# EDA-distribution_individual.py
# Bar plots of the value distribution of individual DataFrame columns.
import pandas as pd
import openpyxl
import matplotlib.pyplot as plt
from load_data import load_data
def bar_plot_ind(df: pd.DataFrame, col_index: int, top_n: int = 12) -> None:
    """Show a bar chart of the value distribution of one DataFrame column.

    The ``top_n`` most frequent values each get their own bar; every
    remaining value is summed into a final ``'Others'`` bar. Each bar is
    annotated with its share of all counted values as a percentage.

    Args:
        df: Data to plot from.
        col_index: Positional index of the column in ``df.columns``.
        top_n: Number of most frequent values shown individually
            (defaults to 12).

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    column_name = df.columns[col_index]

    def wrap_label(value):
        # One word per line keeps long category names readable under a bar.
        return "\n".join(str(value).split())

    # Missing values are dropped before labelling: value_counts() would ignore
    # them anyway, and calling .split() on NaN raises AttributeError.
    # value_counts() already returns the counts in descending order.
    counts = df[column_name].dropna().map(wrap_label).value_counts()
    top_counts = counts.head(top_n)

    # Everything beyond the top_n values is collapsed into 'Others'.
    others_total = counts.iloc[top_n:].sum()
    grand_total = int(counts.sum())
    plotted = pd.concat([top_counts, pd.Series({'Others': others_total})])

    def as_percentage(count):
        # Guard against a column with no countable values (total of zero).
        if grand_total == 0:
            return "0.0%"
        return str(round(float(count) * 100 / grand_total, 2)) + "%"

    percentages = [as_percentage(count) for count in plotted]

    # Create a bar plot
    plt.figure(figsize=(15, 10))
    graph = plotted.plot(kind='bar')

    # Bars sit at x = 0, 1, 2, ...; iterate positionally rather than using
    # plotted[i], whose integer lookup on a label index is deprecated.
    for position, (count, label) in enumerate(zip(plotted, percentages)):
        graph.text(position, count, label, ha='center', va='bottom')

    plt.title(column_name + " Distribution")
    plt.xlabel(column_name)
    plt.ylabel('Number')
    plt.xticks(rotation=0)  # Keep the tick labels horizontal
    plt.tight_layout()  # Adjust the plot so everything fits without overlapping
    plt.show()
if __name__ == "__main__":
    # Load the data once, then plot the distribution of each chosen column.
    dataset = load_data()
    for column_position in (2, 3, 4, 6):
        bar_plot_ind(dataset, column_position)