Commit 99317a3 (0 parents): 11 changed files with 2,574 additions and 0 deletions.
@@ -0,0 +1,3 @@
# Brazil Election 2018 - Sentiment Analysis

An approach using deep learning for sentiment analysis of the 2018 Brazilian presidential election. This is the code used to obtain the results reported in my final graduation project in Computer Science at Universidade Paulista.
@@ -0,0 +1,91 @@
import string
import re
import pickle

import numpy as np
import pandas as pd
import unidecode
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

np.random.seed(42)


def save_tokenizer(tokenizer):
    # Persist the fitted tokenizer so the same vocabulary can be reused
    # at prediction time.
    with open('tokenizer.pickle', 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

def load_tokenizer():
    with open('tokenizer.pickle', 'rb') as handle:
        tokenizer = pickle.load(handle)
    return tokenizer


def load_stopwords():
    # Project-specific stopword list, one word per line.
    with open('stopwords.txt', 'r') as f:
        words = [line.rstrip('\n') for line in f]
    return words

def remove_duplicates(x, y):
    df = pd.DataFrame({"x": x, "y": y})
    # keep=False drops every occurrence of a duplicated text, not just
    # the extra copies.
    df = df.drop_duplicates(subset=['x'], keep=False)
    texts = df['x'].tolist()
    labels = df['y'].tolist()
    print('positive samples after dedup:', len([i for i in labels if i == 1]))
    return texts, labels

def under_sampling(x, y):
    # Balance the classes by down-sampling class 0 (assumed to be the
    # majority) to the size of class 1.
    x = pd.DataFrame(x)
    x.insert(loc=0, column='target', value=y)
    count_class_0, count_class_1 = x.target.value_counts()
    df_class_0 = x[x['target'] == 0]
    df_class_1 = x[x['target'] == 1]
    df_class_0_under = df_class_0.sample(count_class_1)
    x_under = pd.concat([df_class_0_under, df_class_1], axis=0)
    y = x_under.target.tolist()
    x = x_under.drop(columns=['target'])
    x = x[0].tolist()
    return x, y
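
def _under_sampling_example():
    # Illustration only (hypothetical helper, not part of the original
    # commit): balances six class-0 samples against two class-1 samples.
    texts = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h']
    targets = [0, 0, 0, 0, 0, 0, 1, 1]
    x_bal, y_bal = under_sampling(texts, targets)
    print(len(x_bal), sorted(y_bal))  # -> 4 [0, 0, 1, 1]
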
def clean_text(texts, labels):
    new_text = []
    new_labels = []
    # NLTK's Portuguese stopwords plus the project's own list; accents
    # are stripped so the words match the unidecoded tokens below.
    stop_words = [unidecode.unidecode(w)
                  for w in stopwords.words('portuguese') + load_stopwords()]
    table = str.maketrans('', '', string.punctuation)
    for i, text in enumerate(texts):
        text = text.lower()
        text = unidecode.unidecode(text)
        # Drop hashtags, @mentions, URLs and any other non-alphanumeric
        # characters.
        text = re.sub(r"(#[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", text)
        text = re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", text)
        text = re.sub(r'^RT[\s]+', '', text)
        text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
        tokens = text.split()
        tokens = [w.translate(table) for w in tokens]
        tokens = [word for word in tokens if word.isalpha()]
        tokens = [w for w in tokens if w not in stop_words]
        tokens = [word for word in tokens if len(word) > 1]
        new_text.append(' '.join(tokens))
        if labels is not None:
            new_labels.append(labels[i])
    return new_text, new_labels
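
def _clean_text_example():
    # Illustration only (hypothetical helper, not part of the original
    # commit); assumes NLTK's Portuguese stopwords are downloaded and
    # stopwords.txt is present. The tweet is made up.
    raw = ['RT @fulano: Eleições 2018!! veja https://t.co/abc #Brasil']
    cleaned, _ = clean_text(raw, None)
    print(cleaned)  # e.g. ['rt fulano eleicoes veja'], depending on stopwords.txt
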
def create_tokenizer(lines, load=None):
    # Either restore a previously fitted tokenizer or fit a new one on
    # the 2,000 most frequent words and save it for later runs.
    if load:
        tokenizer = load_tokenizer()
    else:
        tokenizer = Tokenizer(num_words=2000)
        tokenizer.fit_on_texts(lines)
        save_tokenizer(tokenizer)
    return tokenizer


def max_length(lines):
    return max(len(s.split()) for s in lines)


def encode_text(tokenizer, lines, length):
    encoded = tokenizer.texts_to_sequences(lines)
    # Pad (or truncate) every sequence to the same length.
    padded = pad_sequences(encoded, padding='post', maxlen=length)
    return padded
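
Taken together, the helpers above form the preprocessing pipeline. A minimal sketch of one plausible call order (the commit itself does not show the training script; raw_texts and raw_labels stand in for output from the data loaders in the next file):

    texts, labels = clean_text(raw_texts, raw_labels)
    texts, labels = remove_duplicates(texts, labels)
    texts, labels = under_sampling(texts, labels)
    tokenizer = create_tokenizer(texts)              # fits and saves tokenizer.pickle
    X = encode_text(tokenizer, texts, max_length(texts))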
@@ -0,0 +1,63 @@
import json

import numpy as np
import pandas as pd

np.random.seed(42)

def sent_16():
    # Sentiment140 corpus: column 0 is the polarity (0 = negative,
    # 4 = positive) and column 5 is the tweet text.
    top = 800000
    df = pd.read_csv('training.1600000.processed.noemoticon.csv',
                     encoding='utf-8', header=None, usecols=[0, 5])
    df_head = df.head(top)
    df_tail = df.tail(top)
    database = df_head[5].tolist() + df_tail[5].tolist()
    labels = df_head[0].tolist() + df_tail[0].tolist()
    # Remap the positive polarity from 4 to 1.
    labels = [1 if i == 4 else i for i in labels]
    return database, labels

def sent_16_translate():
    # Translated corpus (translate.csv): column 0 holds the text and
    # column 4 the label, stored as a string.
    df = pd.read_csv('translate.csv', header=None, usecols=[0, 4])
    df_neg = df[df[4] == '0']
    df_pos = df[df[4] == '1']

    database = df_pos[0].tolist() + df_neg[0].tolist()
    labels = df_pos[4].tolist() + df_neg[4].tolist()
    labels = [int(i) for i in labels]
    return database, labels

def political_data():
    # Labeled election tweets with 'text' and 'label' columns.
    df = pd.read_csv('labeled_samples2.csv', index_col=[0])
    database = df['text'].tolist()
    labels = df['label'].tolist()
    return database, labels

def pos_neg():
    # Tweets collected from the streaming API, one JSON object per line,
    # split across a negative dump and a positive dump.
    database = []
    labels = []
    files = ['negativos_stream_download.json', 'positivos_stream_download.json']
    for file in files:
        with open(file, 'r') as f:
            for line in f:
                try:
                    tweet = json.loads(line)
                    # Prefer the full text of a retweet, then the extended
                    # tweet, falling back to the truncated "text" field.
                    try:
                        try:
                            text = tweet["retweeted_status"]["extended_tweet"]["full_text"]
                        except KeyError:
                            text = tweet["retweeted_status"]["text"]
                    except KeyError:
                        try:
                            text = tweet["extended_tweet"]["full_text"]
                        except KeyError:
                            text = tweet["text"]
                    database.append(str(text))
                    labels.append(0 if 'negativo' in file else 1)
                except (json.JSONDecodeError, KeyError):
                    pass
    return database, labels
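
A hedged sketch of how these loaders plug into the preprocessing helpers in the previous file; the module name preprocess is hypothetical, since the commit view does not show the filenames:

    from preprocess import clean_text, create_tokenizer, encode_text, max_length

    texts, labels = pos_neg()      # or sent_16(), sent_16_translate(), political_data()
    texts, labels = clean_text(texts, labels)
    tokenizer = create_tokenizer(texts)
    X = encode_text(tokenizer, texts, max_length(texts))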