-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmovie_extractor.py
202 lines (140 loc) · 8.68 KB
/
movie_extractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
# Official code. The sentence filtering (test_filtered_sentences.txt) still needs work; I think it only needs to be completed, not fixed!
import xml.etree.ElementTree as ET
import collections
import os
import string
import Sentence
import Review
import Movie
def movie_extractor(directory_path, aspects, set_of_movies, k):
    """
    Build one movie object per review directory and score its aspects.

    The tree under ``directory_path`` is traversed with ``os.walk``. Every
    visited directory whose path does not end in "iews" is treated as one
    movie, identified by the last four characters of its path, and every
    file inside it is loaded as a single review. For each aspect found in
    a review the contribution

        occurrences * KL relevance * review average sentiment / number of reviews

    is added to the movie's ``aspects_score``. Finally the movie's top-k
    aspects are evaluated and the movie is appended to ``set_of_movies``.

    Args:
        directory_path (str): the directory where the single-review xml
            files are kept (one sub-directory per movie).
        aspects (dict): maps each aspect to its KL relevance number.
        set_of_movies (list): list that receives the instantiated movie
            objects; it is mutated in place.
        k (int): number of chosen main aspects (top-k aspects), forwarded
            to ``Movie.top_k_aspects_evaluation``.

    Returns:
        None.
    """
    for dirpath, _, files in os.walk(directory_path):
        # Skip directories ending in "iews" -- presumably the root
        # "single_xml_reviews" folder, which holds no reviews of its own.
        if dirpath.endswith("iews"):
            continue

        new_movie = Movie.Movie(dirpath[-4:])
        for file in files:
            new_review = Review.Review(os.path.join(dirpath, file))
            new_review.review_extractor(aspects)
            new_movie.reviews.append(new_review)
            new_movie.number_of_reviews += 1

        # The review count is final once every file has been loaded.
        number_of_reviews = new_movie.number_of_reviews
        for review in new_movie.reviews:
            # Counter-like mapping: aspect -> occurrences in this review.
            aspects_in_review = review.occurrences_of_each_aspect
            review_sent = review.average_sentiment
            for current_aspect in aspects_in_review:
                current_aspect_count = aspects_in_review[current_aspect]
                aspect_KL_rel = aspects[current_aspect]
                current_aspect_score = current_aspect_count * aspect_KL_rel * review_sent
                if current_aspect == "film":
                    _append_film_debug(
                        new_movie,
                        current_aspect,
                        current_aspect_count,
                        aspect_KL_rel,
                        review_sent,
                        current_aspect_score,
                    )
                current_aspect_score = current_aspect_score / number_of_reviews
                # Membership is tested on the mapping itself instead of on
                # a freshly built list of its keys.
                if current_aspect not in new_movie.aspects_score:
                    new_movie.aspects_score[current_aspect] = 0
                new_movie.aspects_score[current_aspect] = (
                    new_movie.aspects_score[current_aspect] + current_aspect_score
                )

        new_movie.top_k_aspects_evaluation(k)
        # A movie whose reviews never mention "film" (or that has no review
        # files at all) has no such key; guard the debug print so it cannot
        # abort the whole extraction with a KeyError.
        if "film" in new_movie.aspects_score:
            print(new_movie.aspects_score["film"])
        set_of_movies.append(new_movie)


def _append_film_debug(movie, aspect, count, kl_relevance, sentiment, score):
    """Append the raw score terms of one aspect occurrence to the movie's debug file."""
    with open("debug_score"+ movie.xml + ".txt",'a+', encoding="utf-8") as f:
        print(aspect, file=f)
        print("current_aspect_count: ", count, file=f)
        print("aspect_KL_rel: ", kl_relevance, file=f)
        print("review_sent: ", sentiment, file=f)
        print("current_aspect_score: ", score, file=f)
        print("new_movie.number_of_reviews: ", movie.number_of_reviews, file=f)
        print("------------", file=f)
def test_movie_extractor(set_of_movies, k=None):
    """
    Append a readable dump of every movie to "test_movie_extractor<xml>.txt".

    For each movie the report lists its xml identifier, number of reviews,
    the score of each aspect, the main (top-k) aspects and then, review by
    review, the review's file name, average sentiment, sentence count,
    aspect occurrences and every sentence with its token count, sentiment
    value and aspects. Files are opened in append mode, so repeated runs
    accumulate output.

    Args:
        set_of_movies (list): movie objects, e.g. as filled by
            movie_extractor.
        k (int, optional): number shown in the "Top-k aspects" heading.
            When omitted, a module-level ``k`` is used if one exists (the
            behaviour earlier one-argument callers relied on); otherwise
            the number of main aspects stored on the movie is shown.

    Returns:
        None.
    """
    if k is None:
        # This function used to read a global ``k`` unconditionally and
        # raised NameError when none was defined; keep honouring the
        # global when it is there so existing calls print the same heading.
        k = globals().get("k")

    for movie in set_of_movies:
        file_destiny = "test_movie_extractor" + movie.xml + ".txt"
        with open(file_destiny,'a+', encoding="utf-8") as f:
            print("#################################### Beggining of this movie ##########################\n", file=f)
            print("this movie belongs to the file: ",movie.xml, file=f)
            print("\nnumber of reviews of this movie: ", movie.number_of_reviews, file=f)
            print("\nScore of each aspect:", file=f)
            for aspect in movie.aspects_score:
                print("\taspect", aspect, ":", movie.aspects_score[aspect], file=f)

            main_aspects = list(movie.top_k_aspects)
            shown_k = len(main_aspects) if k is None else k
            print("\nTop-" + str(shown_k) + " aspects:", file=f)
            for main_aspect in main_aspects:
                print("\tmain aspect: ", main_aspect, file=f)
            print("\n", file=f)

            for iterator, review in enumerate(movie.reviews, start=1):
                print("\n\tReview " + str(iterator) + ":", file=f)
                print("\n\t\tThis review belongs to the file: ", review.xml_name, file=f)
                print("\t\tAverage sentiment of this review: ", review.average_sentiment, file=f)
                print("\t\tNumber of sentences in this review: ", review.number_of_sentences, file=f)
                print("\t\tAspects and it's occurrence in this review: ", file=f)
                for aspect in review.occurrences_of_each_aspect:
                    print("\t\t\taspect: ", aspect," number of occurrences: ", review.occurrences_of_each_aspect[aspect], file=f)
                print("\n\t\t\t\t---sentences in this review ---\n", file=f)

                for sentence_iterator, sentence in enumerate(review.sentences, start=1):
                    print("\t\t\t\tSentence " + str(sentence_iterator) + ":\n", file=f)
                    print("\t\t\t\tNumber of tokens in this sentence: ",sentence.number_of_tokens, file=f)
                    print("\t\t\t\tSentiment value of this sentence: ", sentence.sentiment_value, file=f)
                    print("\t\t\t\tAspects in this sentence: ", file=f)
                    for aspect in sentence.aspects:
                        print("\t\t\t\t\t-", aspect, file=f)
                    print("\n\t\t\t\t\t" + str(sentence), file=f)
                    print("\t\t\t\t------------------------------------------------\n", file=f)
                print("\n\t\t**************************** end of this review **********************************", file=f)
            print("\n#################################### end of this movie ##########################\n", file=f)
# Disabled debugging helper, kept as an inert string literal rather than live
# code. If re-enabled it would walk ``directory_path`` with os.walk and append
# the joined path of every file found to "test_directory.txt".
'''def test_directory(directory_path):
with open("test_directory.txt", 'a+', encoding="utf-8") as f:
for dirpath, dirnames, files in os.walk(directory_path):
#print("dirpath: ", dirpath, file=f)
for file in files:
filename = os.path.join(dirpath, file)
print("filename: ", filename, file=f)
'''
def test_sentence_filtering(set_of_movies):
    """
    Run sentence filtering on each movie and append the outcome to a report.

    For every movie its xml identifier is echoed to stdout, its
    ``sentence_filtering()`` method is called, and each entry of
    ``movie.filtered_sentences`` (presumably populated by that call) is
    written to "test_sentence_filtering.txt" with its sentiment, sentiment
    value, token count, personal-opinion flag, aspects and text. The report
    is opened in append mode once per movie.

    Args:
        set_of_movies (list): movie objects to filter and report on.

    Returns:
        None.
    """
    for current_movie in set_of_movies:
        print(current_movie.xml)
        current_movie.sentence_filtering()
        with open("test_sentence_filtering.txt", 'a+', encoding="utf-8") as report:

            def emit(*parts):
                # Same as print(), but always targets the report file.
                print(*parts, file=report)

            emit("############ MOVIE: " + current_movie.xml + " ###########")
            for kept in current_movie.filtered_sentences:
                emit("\n\t-------------- in movie "+ current_movie.xml +"-------------------")
                emit("\n\tSentence sentiment: ", kept.sentiment)
                emit("\n\tSentence sentiment_value:", kept.sentiment_value)
                emit("\n\tSentence number of tokens: ", kept.number_of_tokens)
                emit("\n\tSentence personal opinion: ", kept.personal_opinion)
                emit("\t\t\t\tAspects in this sentence: ")
                for aspect_name in kept.aspects:
                    emit("\t\t\t\t\t-", aspect_name)
                emit("\n\t\t\t\t\t" + str(kept) + "\n")
if __name__ == '__main__':
    import sys

    # Directory holding the single-review xml files. The first command-line
    # argument overrides the original machine-specific location, which
    # remains the default so running the script without arguments behaves
    # as before.
    default_directory = "C:\\Users\\User\\Desktop\\ic\\single_xml_reviews"
    directory_path = sys.argv[1] if len(sys.argv) > 1 else default_directory

    set_of_movies = []
    # Aspect -> KL relevance number used when scoring.
    aspects = {'morality': 2, 'film': 3, 'dog': 5, 'movie': 7, 'art': 9, 'mother': 10, 'way': 11}
    # Number of main aspects per movie; kept as a module-level name because
    # test_movie_extractor reads ``k`` from module scope.
    k = 2

    movie_extractor(directory_path, aspects, set_of_movies, k)
    test_movie_extractor(set_of_movies)
    test_sentence_filtering(set_of_movies)
    #test_directory(directory_path)