-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathinfo_extraction.py
139 lines (110 loc) · 5.12 KB
/
info_extraction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
from globals import *
# Person and Movie identifier - Takes in a string and determines whether it is the name of a person or a movie.
# Returns a tuple (type, name): type is "person" or "movie" and name is the official name of the person/movie.
# Returns (False, False) when no sufficiently close match is found.
def personOrMovie(name):
    """Classify ``name`` as a person or a movie using the top IMDb search hits.

    The lower-cased name is looked up with ``ia.search_person`` and
    ``ia.search_movie``; only the first hit of each search is considered.
    The person score is ``similar(person name, query)``; the movie score is
    the average of ``similar`` against the movie's 'long imdb title' and its
    'title'.  The person wins only when its score is strictly greater.

    Returns:
        ``("person", official_name)`` or ``("movie", official_title)``, or
        ``(False, False)`` when the winning score is below 0.65 or neither
        search produced a hit.
    """
    query = name.lower()
    people = ia.search_person(query)
    movies = ia.search_movie(query)
    person = people[0] if people else False
    movie = movies[0] if movies else False

    # NOTE(review): `similar` is assumed to return a 0..1 similarity ratio - confirm in globals.
    person_score = similar(person['name'].lower(), query) if person else 0.0
    if movie:
        long_score = similar(movie['long imdb title'].lower(), query)
        short_score = similar(movie['title'].lower(), query)
    else:
        long_score = short_score = 0.0
    movie_score = (long_score + short_score) / 2

    # Pick the better candidate; ties go to the movie (when one exists).
    if person_score > movie_score:
        kind, entity, score = "person", person['name'], person_score
    elif movie:
        kind, entity, score = "movie", movie['title'], movie_score
    else:
        kind, entity, score = False, False, 0.0

    if score < .65:
        return (False, False)
    return (kind, entity)
# Relation finder - Takes in a text and finds relations between awards and entities.
# Returns a list of relations, which are tuples in the form (type, award, entity), where
# 'type' is one of 'nominees', 'winner', or 'presenters' (currently only 'nominees' is emitted);
# 'award' is the name of the award; and 'entity' is the name of the person/movie.
def findRelations(text):
    """Collect (type, award, entity) relations from ``text``.

    Every token whose lemma is "win" or "nominate" yields exactly one
    relation.  Its direct children are scanned for:

    * an entity - the first NOUN/PROPN child that lies inside one of
      ``Text.nouns``; that noun's text is resolved through ``personOrMovie``;
    * an award - a NOUN child, or the first child of an ADP child, that lies
      inside one of ``Text.nouns``.  Scanning stops once an award is found.

    Returns:
        A list of ``("nominees", award, entity)`` tuples.  The type is always
        "nominees", for both trigger lemmas.  ``award`` / ``entity`` stay
        falsy (False or an empty list) when nothing suitable was found.
    """
    parsed = DecomposedText(text)
    trigger_lemmas = ["win", "nominate"]
    found = []
    # print(parsed.nouns)

    # Nominees and Awards
    for token in parsed.doc:
        if token.lemma_ not in trigger_lemmas:
            continue

        award = False
        entity = False
        for child in token.children:
            if not entity and child.pos_ in ['NOUN', 'PROPN']:
                entity = [chunk for chunk in parsed.nouns if child in chunk]
                if entity:
                    # Only the resolved name is kept; the person/movie kind is discarded.
                    _kind, entity = personOrMovie(entity[0].text)
                continue

            if not award and child.pos_ in ['NOUN', 'ADP']:
                target = child
                if target.pos_ == 'ADP':
                    # For a preposition, look at its first child instead.
                    below = list(target.children)
                    if not below:
                        continue
                    target = below[0]
                award = [chunk for chunk in parsed.nouns if target in chunk]
                if not award:
                    continue
                award = award[0].text
                break

        found.append(("nominees", award, entity))
    return found
# Award finder - This function specifically finds award names for the awardsDict; it returns the
# award name, or False if none is found.
# It's somewhat effective, though it does have a couple of hardcoded words, which isn't ideal.
# Part-of-speech tags a token may carry and still be read as part of an award name.
award_pos = [
    "ADJ",
    "NOUN",
    "ADP",
    "DET",
    "PUNCT",
    "CCONJ",
]
def findAward(text):
    """Extract an award name from ``text``.

    Two strategies are tried in order:

    1. Shortcut - if the tokens contain 'goes' (or 'Goes') immediately
       followed by 'to', and a 'Best' (or 'best') token that has a successor,
       return 'Best' concatenated with that successor token.
    2. Scan - find an ADJ token whose lemma is "good" (presumably how "best"
       is lemmatized) that is followed by a NOUN/ADJ/VERB, then extend the
       span while tokens keep a part of speech listed in ``award_pos``.

    Args:
        text: Raw text handed to ``DecomposedText``.

    Returns:
        The award name as a string, or False when no award is found or the
        scanned span is shorter than three tokens.
    """
    Text = DecomposedText(text)
    # NOTE(review): Text.text is indexed in parallel with Text.pos / Text.lemma,
    # so it is presumably a list of token strings - confirm in DecomposedText.
    tokens = Text.text
    n_tokens = len(tokens)

    # check if 'goes' is in the tweet, make sure 'to' comes right after.
    if 'goes' in tokens or 'Goes' in tokens:
        goesidx = tokens.index('goes') if 'goes' in tokens else tokens.index('Goes')
        # Bounds checks: 'goes' / 'Best' may be the very last token.
        if goesidx + 1 < n_tokens and tokens[goesidx + 1] == 'to':
            if 'Best' in tokens:
                best_idx = tokens.index('Best')
            elif 'best' in tokens:
                best_idx = tokens.index('best')
            else:
                best_idx = None
            if best_idx is not None and best_idx + 1 < n_tokens:
                # NOTE(review): no separator is inserted between 'Best' and the
                # next token (original behavior kept) - confirm this is intended.
                return 'Best' + tokens[best_idx + 1]

    # None (not False/0) marks "not set", so an award starting at token 0 is
    # still recognised as started.
    start = None
    end = None
    dash = False
    for i, pos in enumerate(Text.pos):
        if start is None:
            # An award needs at least two tokens after its first word.
            if i >= n_tokens - 2:
                break
            # Start reading potential award
            if (pos == "ADJ"
                    and Text.pos[i + 1] in ("NOUN", "ADJ", "VERB")
                    and Text.lemma[i] == "good"):
                start = i
            continue

        word = tokens[i]

        # Restrict certain words from counting as award
        if pos in ("ADP", "DET", "PUNCT", "CCONJ") and word not in ("in", "a", "-", "or"):
            end = i
            break

        # Stop reading award if encountering a second dash, or if the word
        # after the first dash is missing or isn't valid.
        if word == "-":
            if (dash
                    or i + 1 >= n_tokens
                    or Text.pos[i + 1] != "NOUN"
                    or tokens[i + 1] not in ("drama", "musical", "comedy", "foreign", "animated")):
                end = i
                break
            dash = True

        # Continue reading until not valid award pos, unless it's only the
        # second token in the sequence.
        if pos not in award_pos:
            if i == start + 1:
                continue
            end = i
            break

    if start is None:
        return False
    # If reading an award and reached the end of the text, stop reading there.
    if end is None:
        end = n_tokens
    # If shorter than 3 words, ignore it (though 2 words would be useful for 'best dressed')
    if end - start < 3:
        return False
    return Text.doc[start:end].text