-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathKL_divergence.py
100 lines (68 loc) · 3.34 KB
/
KL_divergence.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
# TODO: fix the "apos erro" ("after error")
import counter_occurrences as count_occur
import numpy as np
def KL_divergence(frequency_corpora_reviews, frequency_corpora_BNC):
    """Computes the discrete KL divergence term for a noun present in the corpora review.

    KL value = (frequency_corpora_reviews) * log(frequency_corpora_reviews/frequency_corpora_BNC)

    Args:
        frequency_corpora_reviews (int): The number of occurrences of the noun in the reviews
            corpora; it's the term "ca" in the formula described in the paper
        frequency_corpora_BNC (int): The number of occurrences of the noun in the BNC
            corpora; it's the term "cb" in the formula described in the paper

    Returns:
        KL (double): The KL value calculated for the noun. It is ``np.inf`` when
            the noun never occurs in the BNC corpora, and ``0.0`` when the noun
            occurs in the BNC corpora but never in the reviews corpora.
    """
    # By definition, if the number of occurrences of the noun in the BNC
    # corpora is zero, the KL value for this noun is infinity.
    if frequency_corpora_BNC == 0:
        return np.inf

    # 0 * log(0 / cb) is taken as 0 (the limit of x * log(x) as x -> 0).
    # Evaluating it directly would give 0 * -inf == nan plus runtime warnings.
    if frequency_corpora_reviews == 0:
        return 0.0

    ratio = frequency_corpora_reviews / frequency_corpora_BNC
    return frequency_corpora_reviews * np.log(ratio)
def KL_nouns_values(corpora_BNC, corpora_reviews):
    """Computes the KL value for each noun present in the corpora of reviews.

    Args:
        corpora_BNC: The BNC corpora, handed unchanged to
            ``count_occur.counting_occurrences_list``. NOTE(review): the
            expected form (presumably a list of nouns) is defined by that
            helper -- confirm against it.
        corpora_reviews: The reviews corpora, handed unchanged to
            ``count_occur.counting_occurrences_list``; same caveat as above.

    Returns:
        KL_values (dict): Contains the KL value associated to each noun of the corpora review,
            for all nouns present in the corpora review
    """
    # Count the occurrences of each noun in both corpora.
    count_nouns_reviews = count_occur.counting_occurrences_list(corpora_reviews)
    count_nouns_BNC = count_occur.counting_occurrences_list(corpora_BNC)

    KL_values = {}
    for noun in count_nouns_reviews:
        # Number of occurrences of the current noun in the reviews corpora.
        frequency_corpora_reviews = count_nouns_reviews[noun]
        # Number of occurrences of the current noun in the BNC corpora.
        # A noun that never appears in BNC counts as 0 instead of raising
        # KeyError; KL_divergence handles the zero-count case itself.
        frequency_corpora_BNC = count_nouns_BNC.get(noun, 0)
        KL_values[noun] = KL_divergence(frequency_corpora_reviews, frequency_corpora_BNC)
    return KL_values
def epsilon_aspects_extraction(KL_values, threshold, destiny_file):
    """Given a certain threshold for KL divergence, this function extracts
    aspects from the KL_values dict.

    Args:
        KL_values (dict): dict containing each noun in the corpora review
            associated with its KL value
        threshold (double): The Epsilon cited in the paper; only nouns whose
            KL value is strictly greater than it are kept
        destiny_file (str): the file that is going to contain all the
            aspects extracted. The dict is appended to the file as a single
            printed line. If printing is not desired, this parameter
            should be passed as "none" (``None`` is accepted as well)

    Returns:
        aspects (dict): A dict that relates each aspect to its KL value
    """
    aspects = {
        noun: kl_value
        for noun, kl_value in KL_values.items()
        if kl_value > threshold
    }

    # "none" is the documented sentinel for "do not write"; None is also
    # accepted so that it does not reach open() and raise TypeError.
    if destiny_file is not None and destiny_file != "none":
        # Append mode: results of successive calls accumulate in the file.
        with open(destiny_file, 'a+', encoding="utf-8") as f:
            print(aspects, file=f)
    return aspects