Commit 99317a3

Initial commit.

anjosma committed Dec 2, 2018 · 0 parents
Showing 11 changed files with 2,574 additions and 0 deletions.
3 changes: 3 additions & 0 deletions README.md
@@ -0,0 +1,3 @@
# Brazil Election 2018 - Sentiment Analysis

A deep learning approach to sentiment analysis for the 2018 Brazilian presidential election. This code was used to produce the results in my final graduation project in Computer Science at Universidade Paulista.
91 changes: 91 additions & 0 deletions data.py
@@ -0,0 +1,91 @@
import string
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import pandas as pd
import numpy as np
import re
import unidecode
import pickle
np.random.seed(42)

def save_tokenizer(tokenizer):
    # Persist the fitted tokenizer so inference can reuse the same vocabulary.
    with open('tokenizer.pickle', 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

def load_tokenizer():
    with open('tokenizer.pickle', 'rb') as handle:
        tokenizer = pickle.load(handle)
    return tokenizer

def load_stopwords():
    # Project-specific stopwords, one per line.
    with open('stopwords.txt', 'r') as f:
        words = [line.replace('\n', '') for line in f]
    return words

def remove_duplicates(x, y):
    # keep=False drops every copy of a text that appears more than once.
    df = pd.DataFrame({"x": x, "y": y})
    df = df.drop_duplicates(subset=['x'], keep=False)
    texts = df['x'].tolist()
    labels = df['y'].tolist()
    print(len([i for i in labels if i == 1]))  # remaining positive samples
    return texts, labels

def under_sampling(x, y):
    # Balance the classes by sampling the majority class (0) down to the
    # size of the minority class (1).
    x = pd.DataFrame(x)
    x.insert(loc=0, column='target', value=y)
    count_class_1 = x.target.value_counts()[1]
    df_class_0 = x[x['target'] == 0]
    df_class_1 = x[x['target'] == 1]
    df_class_0_under = df_class_0.sample(count_class_1)
    x_under = pd.concat([df_class_0_under, df_class_1], axis=0)
    y = x_under.target.tolist()
    x = x_under.drop(['target'], axis=1)
    x = x[0].tolist()
    return x, y

def clean_text(texts, labels):
    new_text = []
    new_labels = []
    stop_words = stopwords.words('portuguese') + load_stopwords()
    table = str.maketrans('', '', string.punctuation)
    for i, text in enumerate(texts):
        text = text.lower()
        text = unidecode.unidecode(text)  # strip accents
        # Remove hashtags, @mentions, URLs and any non-alphanumeric characters.
        text = re.sub(r"(#[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+://\S+)", " ", text)
        text = re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+://\S+)", " ", text)
        text = re.sub(r'^RT[\s]+', '', text)  # drop the leading retweet marker
        text = re.sub(r'https?://.*[\r\n]*', '', text, flags=re.MULTILINE)
        tokens = text.split()
        tokens = [w.translate(table) for w in tokens]
        tokens = [word for word in tokens if word.isalpha()]
        tokens = [w for w in tokens if w not in stop_words]
        tokens = [word for word in tokens if len(word) > 1]
        new_text.append(' '.join(tokens))
        if labels is not None:
            new_labels.append(labels[i])
    return new_text, new_labels

def create_tokenizer(lines, load=None):
    # Either reload a previously saved tokenizer or fit a new one on `lines`.
    if load:
        tokenizer = load_tokenizer()
    else:
        tokenizer = Tokenizer(num_words=2000)
        tokenizer.fit_on_texts(lines)
        save_tokenizer(tokenizer)
    return tokenizer

def max_length(lines):
    # Length (in tokens) of the longest cleaned text.
    return max([len(s.split()) for s in lines])

def encode_text(tokenizer, lines, length):
    # Map words to integer ids and pad every sequence to the same length.
    encoded = tokenizer.texts_to_sequences(lines)
    padded = pad_sequences(encoded,
                           padding='post',
                           maxlen=length
                           )
    return padded
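
These helpers chain into a small preprocessing pipeline. Below is a minimal usage sketch (not part of the commit), assuming NLTK's Portuguese stopwords are downloaded and a stopwords.txt file sits next to the script; the example tweets and labels are made up:

from data import (clean_text, remove_duplicates, under_sampling,
                  create_tokenizer, max_length, encode_text)

# Hypothetical toy tweets standing in for the scraped data.
texts = ["otimo debate hoje!", "pessimo debate, que decepcao", "nao gostei do debate"]
labels = [1, 0, 0]

texts, labels = clean_text(texts, labels)         # normalize, strip noise and stopwords
texts, labels = remove_duplicates(texts, labels)  # drop repeated tweets
texts, labels = under_sampling(texts, labels)     # balance classes 0 and 1

tokenizer = create_tokenizer(texts)               # fits and saves tokenizer.pickle
X = encode_text(tokenizer, texts, max_length(texts))
print(X.shape)                                    # (n_samples, max sequence length)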

63 changes: 63 additions & 0 deletions dataset.py
@@ -0,0 +1,63 @@
import pandas as pd
import json
import numpy as np
np.random.seed(42)

def sent_16():
    top = 800000
    # Read the Sentiment140 CSV once; column 0 is the label, column 5 the text.
    df = pd.read_csv('training.1600000.processed.noemoticon.csv',
                     encoding='utf-8', header=None, usecols=[0, 5])
    df_pos = df.head(top)
    df_neg = df.tail(top)
    database = df_pos[5].tolist() + df_neg[5].tolist()
    labels = df_pos[0].tolist() + df_neg[0].tolist()
    # Sentiment140 encodes positive as 4; remap it to 1.
    labels = [1 if i == 4 else i for i in labels]
    return database, labels

def sent_16_translate():
    # Machine-translated Sentiment140; labels are stored as strings here.
    df = pd.read_csv('translate.csv', header=None, usecols=[0, 4])
    df_neg = df[df[4] == '0']
    df_pos = df[df[4] == '1']

    database = df_pos[0].tolist() + df_neg[0].tolist()
    labels = df_pos[4].tolist() + df_neg[4].tolist()
    labels = [int(i) for i in labels]
    return database, labels

def political_data():
    df = pd.read_csv('labeled_samples2.csv', index_col=[0])
    database = df['text'].tolist()
    labels = df['label'].tolist()
    return database, labels

def pos_neg():
    # Read raw tweets streamed into two JSON-lines files, one per class.
    database = []
    labels = []
    files = ['negativos_stream_download.json', 'positivos_stream_download.json']
    for file in files:
        with open(file, 'r') as f:
            for line in f:
                try:
                    tweet = json.loads(line)
                    # Prefer the full, untruncated text, falling back through
                    # the places the Twitter API may put it.
                    try:
                        text = tweet["retweeted_status"]["extended_tweet"]["full_text"]
                    except KeyError:
                        try:
                            text = tweet["retweeted_status"]["text"]
                        except KeyError:
                            try:
                                text = tweet["extended_tweet"]["full_text"]
                            except KeyError:
                                text = tweet["text"]
                    database.append(str(text))
                    labels.append(0 if 'negativo' in file else 1)
                except Exception:
                    # Skip lines that are not valid tweet JSON.
                    pass
    return database, labels
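
Each loader returns parallel (texts, labels) lists, so any of them can feed the same pipeline. A short sketch (not part of the commit) wiring the labeled stream files into the preprocessing helpers from data.py, assuming the JSON files named above exist locally:

from data import clean_text, remove_duplicates, under_sampling
from dataset import pos_neg

texts, labels = pos_neg()                         # 0 = negative file, 1 = positive file
texts, labels = clean_text(texts, labels)
texts, labels = remove_duplicates(texts, labels)
texts, labels = under_sampling(texts, labels)
print(len(texts), 'balanced samples ready for tokenization')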