-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathglobals.py
107 lines (87 loc) · 3.18 KB
/
globals.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import spacy
import re
import en_core_web_sm
from imdb import IMDb
from difflib import SequenceMatcher
import json
import pandas as pd
# This function returns a metric (0 to 1) for how similar strings a and b are.
def similar(a, b):
    """Return a similarity score between 0 and 1 for the sequences *a* and *b*.

    The score is ``difflib.SequenceMatcher.ratio()`` computed with no junk
    filter: 1.0 means the inputs are identical, 0.0 means nothing matched.
    """
    matcher = SequenceMatcher(isjunk=None, a=a, b=b)
    return matcher.ratio()
# Module-level objects created once, when this module is imported:
#   ia  - IMDb client instance (from the `imdb` package)
#   nlp - spaCy pipeline loaded from the "en_core_web_sm" model; DecomposedText
#         calls it to parse text
ia = IMDb()
nlp = spacy.load("en_core_web_sm")
# Load the tweet corpus from JSON once, when this module is imported.
# The file is opened explicitly as UTF-8 so that decoding does not depend on
# the platform's default locale encoding (tweet text routinely contains emoji
# and other non-ASCII characters).
with open('data/gg2013.json', encoding='utf-8') as f:
    tweets = json.load(f)
# Awards class - Contains all extracted awards as keys, whose values are dictionaries containing their
# extraction tally, related presenters, nominees, winners.
class Awards():
    """Tally of extracted awards and the entities related to each of them.

    ``self.dict`` maps an award name to a record of the form::

        {"tally": int, "presenters": {}, "nominees": {}, "winner": {}}

    where ``tally`` counts how many times the award was reported and each of
    the other three entries maps an entity name to the number of times that
    entity was reported for the award in that role.
    """

    def __init__(self):
        # award name -> record (see class docstring)
        self.dict = {}

    def foundRelation(self, type, award_name, entity_name):
        """Record one sighting of *award_name*, optionally tied to an entity.

        The award's tally is always bumped (the award is created on first
        sight). *type* can be 'presenters', 'nominees', 'winner', or False;
        when it is falsy only the tally changes, otherwise the count of
        *entity_name* under that role is incremented as well.
        """
        self.foundAward(award_name)
        if type:
            counts = self.dict[award_name][type]
            counts[entity_name] = counts.get(entity_name, 0) + 1

    def foundAward(self, award_name):
        """Increment the tally of *award_name*, creating its record if new."""
        if award_name not in self.dict:
            # A fresh record already starts with a tally of 1.
            self.newAward(award_name)
            return
        self.dict[award_name]['tally'] += 1

    def newAward(self, award_name):
        """Create (or reset) the record for *award_name* with a tally of 1."""
        self.dict[award_name] = dict(
            tally=1,
            presenters={},
            nominees={},
            winner={},
        )
# Module-level Awards instance, created when this module is imported; it
# starts out empty (no awards recorded yet).
awardsDict = Awards()
# Clean emojis from tweets
# The pattern is compiled once at import time. Every range below lies inside
# the set of code points the function has always removed; the difference is
# that the former catch-all range U+24C2..U+1F251 (which also swallowed CJK,
# Hangul, kana, box-drawing and fullwidth characters) is replaced by just the
# emoji-bearing blocks inside it, so ordinary non-emoji text is preserved.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F000-\U0001F251"  # mahjong/cards, enclosed alphanumerics, flags, enclosed ideographs
    "\U0001F926-\U0001F937"  # facepalm .. shrug gestures
    "\u24C2"                 # circled M
    "\u25A0-\u25FF"          # geometric shapes
    "\u2600-\u27BF"          # miscellaneous symbols & dingbats (incl. gender signs)
    "\u2934\u2935"           # curved arrows
    "\u2B00-\u2BFF"          # miscellaneous symbols and arrows (stars, squares)
    "\u3030\u303D"           # wavy dash, part alternation mark
    "\u3297\u3299"           # circled ideograph congratulation / secret
    "\uFE0F"                 # variation selector-16 (emoji presentation)
    "\u200d"                 # zero-width joiner used in emoji sequences
    "]+",
    flags=re.UNICODE,
)


def demoji(text):
    """Return *text* with emoji and emoji-joining characters removed.

    Each run of matching characters is deleted outright (nothing is inserted
    in its place), so surrounding whitespace is left exactly as it was.
    Characters that are not emoji, including non-Latin scripts, are kept.
    """
    return _EMOJI_PATTERN.sub('', text)
def containsAnyOf(str, strList):
    """Return True if *str* contains at least one element of *strList*.

    Elements are tested in order with ``str.__contains__`` and the scan stops
    at the first hit. An empty *strList* yields False.
    """
    # Parameter names are part of the public signature and are kept as-is,
    # even though `str` shadows the builtin inside this function.
    holds = str.__contains__
    for candidate in strList:
        if holds(candidate):
            return True
    return False
# Text decomposer: Creates an object that allows one to access the tokenized form of a text along with its
# part-of-speech tagging, lemmas, and relations to other words.
class DecomposedText():
    """Lower-cased text parsed by the module-level spaCy pipeline ``nlp``.

    The per-token lists below are parallel: index ``i`` in each refers to the
    ``i``-th token of ``doc``.
    """

    def __init__(self, text):
        """Lower-case *text*, parse it with ``nlp`` and cache token attributes."""
        lowered = text.lower()
        self.full_text = lowered      # the lower-cased input string
        self.doc = nlp(lowered)       # parsed spaCy document

        tokens = list(self.doc)
        self.text = [tok.text for tok in tokens]               # token strings
        self.lemma = [tok.lemma_ for tok in tokens]            # token lemmas
        self.pos = [tok.pos_ for tok in tokens]                # part-of-speech tags
        self.parent = [tok.head for tok in tokens]             # each token's head token
        self.children = [list(tok.children) for tok in tokens] # each token's child tokens
        self.nouns = list(self.doc.noun_chunks)                # noun chunks of the document

    def show(self):
        """Print the per-token table (via pandas) followed by the noun chunks."""
        table = pd.DataFrame({
            'Text': self.text,
            'Lemma': self.lemma,
            'Pos': self.pos,
            'Parent': self.parent,
            'Children': self.children,
        })
        print(table)
        print(self.nouns)