preprocessing.py
# PREPROCESSING
# Adrian Brünger, Stefan Rummer, TUM, summer 2021
# This file contains the preprocessing functions (DataFrame preparation,
# text cleaning, tokenization/padding) to avoid redundant code

# imports
# GENERAL
import pickle
import pandas as pd
# for PREPROCESSING
import re                                       # RegEx for removing non-letter characters
import nltk                                     # natural language toolkit
nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer      # Porter stemming algorithm
import emoji
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

def prepare_dataframe(airline, shuffle=True):
    # takes the fraction of the airline data to be used as input
    # and returns the resulting DataFrame plus one-hot encoded sentiment labels
    # load airline tweets
    df_tweets_air_full = pd.read_csv("tweets_data/Tweets_airlines.csv")
    # print(df_tweets_air_full.head())
    # get DataFrame with text and sentiment
    # text_data = get_text(twitter_data)
    df_tweets_air = df_tweets_air_full[["text", "airline_sentiment"]]
    print(f"Examples in airline data: {df_tweets_air.shape[0]}")
    df_tweets_air = df_tweets_air.rename(columns={"airline_sentiment": "category"})
    df_tweets_air["category"] = df_tweets_air["category"].map({"negative": -1.0, "neutral": 0.0, "positive": 1.0})
    if shuffle:
        df_tweets_air = df_tweets_air.sample(frac=1)    # shuffle dataframe
    # copy() so the inplace dropna below does not operate on a slice of the full frame
    df_tweets = df_tweets_air.head(round(airline * df_tweets_air.shape[0])).copy()  # TODO custom
    print(f"USED examples of airline data: {df_tweets.shape[0]}\n")
    # drop rows/examples where either no text could be fetched or no sentiment was assigned
    print("\nRows with nan/missing entries:")
    print(df_tweets.isnull().sum())
    df_tweets.dropna(axis=0, how="any", inplace=True)
    if shuffle:
        df_tweets = df_tweets.sample(frac=1)            # shuffle dataframe
    # one-hot encode sentiments
    y = pd.get_dummies(df_tweets["category"])
    return df_tweets, y
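
# Illustrative usage sketch (not part of the original module): shows how
# prepare_dataframe might be called, assuming "tweets_data/Tweets_airlines.csv"
# is present with the columns referenced above. The demo function name and the
# 0.8 fraction are hypothetical choices for illustration only.
def _demo_prepare_dataframe():
    # use 80% of the (shuffled) airline tweets
    df_tweets, y = prepare_dataframe(airline=0.8, shuffle=True)
    print(df_tweets.head())     # columns: "text", "category" (-1.0 / 0.0 / 1.0)
    print(y.head())             # one one-hot column per sentiment class
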
def clean_text(tweet_import, remove_stopwords=False, stemming=False):
    # NOTE: the flag is called remove_stopwords so it does not shadow the
    # nltk.corpus stopwords module imported above

    def extract_emojis(text_import):
        # extract a list containing the emojis in the tweet
        emoji_list = [c for c in text_import if c in emoji.UNICODE_EMOJI]
        emoji_list = list(set(emoji_list))              # REMOVE DUPLICATE emojis in the list
        return emoji_list

    def remove_emojis(text_import):
        regex_pattern = re.compile(pattern="["
                                   u"\U0001F600-\U0001F64F"  # emoticons
                                   u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                                   u"\U0001F680-\U0001F6FF"  # transport & map symbols
                                   u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                                   u"\U00002702-\U000027B0"  # dingbats
                                   u"\U000024C2-\U0001F251"
                                   u"\U0001f926-\U0001f937"
                                   u"\U00010000-\U0010ffff"
                                   u"\u2640-\u2642"
                                   u"\u2600-\u2B55"
                                   u"\u200d"
                                   u"\u23cf"
                                   u"\u23e9"
                                   u"\u231a"
                                   u"\ufe0f"                 # variation selector
                                   u"\u3030"
                                   "]+", flags=re.UNICODE)
        return regex_pattern.sub(" ", text_import)

    emojis_in_text = extract_emojis(tweet_import)
    emoji_string = " ".join(emj for emj in emojis_in_text)
    text = remove_emojis(tweet_import)                  # remove all emojis after extraction
    text = text + emoji_string                          # add every emoji ONCE at the end
    text = emoji.demojize(text)                         # translate emojis into words
    text = text.lower()                                 # convert text to lower case
    text = re.sub(r"http\S+", "", text)                 # remove hyperlinks
    text = re.sub(r"#", "", text)                       # remove hashtag symbol
    text = re.sub(r"@\S+", "", text)                    # remove @mentions
    text = "".join(char for char in text
                   if not char.isdigit())               # remove all numbers
    text = re.sub(r"'", "", text)                       # remove apostrophes
    text = re.sub(r"[^a-zA-Z0-9]", " ", text)           # replace remaining non-letters with spaces
    text = re.sub(r"^rt\s+", "", text)                  # remove retweet marker "RT" (text is already lower case)
    words = text.split()
    # remove stopwords
    if remove_stopwords:
        words = [w for w in words if w not in stopwords.words("english")]
    # apply stemming
    if stemming:
        words = [PorterStemmer().stem(w) for w in words]
    text = " ".join(words)                              # collapse multiple white space
    text = text.lstrip()                                # remove space from the left
    return text
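
# Illustrative usage sketch (not part of the original module): demonstrates the
# effect of clean_text on a single raw tweet. The demo function name and the
# example tweet are hypothetical.
def _demo_clean_text():
    raw_tweet = "RT @airline Flight 123 delayed AGAIN... see http://example.com #fail"
    cleaned = clean_text(raw_tweet, remove_stopwords=True, stemming=False)
    print(cleaned)              # lower-case words without mentions, links, digits or stopwords
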
def tokenize_and_pad(texts, vocab_size, max_length):
    # takes "clean" texts as input and returns (and saves) tokenized and padded sequences
    # tokenizing:
    ## assign the vocab_size-1 most frequent words the indices 1 to vocab_size-1,
    ## with index 1 for the most frequent and vocab_size-1 for the least frequent of those words
    tokenizer = Tokenizer(num_words=vocab_size)
    tokenizer.fit_on_texts(texts)
    texts_t = tokenizer.texts_to_sequences(texts)
    # padding:
    ## add 0 token(s) to sequences shorter than max_length and truncate sequences
    ## longer than max_length
    texts_tp = pad_sequences(texts_t, maxlen=max_length, padding="post")
    # possible different approach:
    # vectorizer = TextVectorization(max_tokens=vocab_size, output_mode='int', output_sequence_length=max_length)
    # vectorizer.adapt(texts)
    # return texts.map(vectorizer), vectorizer
    # saving
    # save tokenizer
    with open(r'model_data_keras_embedding\tokenizer_save.pickle', 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
    # save padder/max_length
    with open(r'model_data_keras_embedding\padder_save.pickle', 'wb') as handle:
        pickle.dump(max_length, handle, protocol=pickle.HIGHEST_PROTOCOL)
    return texts_tp, tokenizer
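
# Illustrative end-to-end sketch (not part of the original module): cleans the
# tweets, tokenizes/pads them, and reloads the saved tokenizer and max_length as
# a prediction script might. Assumes the directory "model_data_keras_embedding"
# exists; the demo function name, vocab_size and max_length values are hypothetical.
def _demo_full_preprocessing(vocab_size=10000, max_length=30):
    df_tweets, y = prepare_dataframe(airline=1.0)
    texts_clean = [clean_text(t) for t in df_tweets["text"]]
    x, tokenizer = tokenize_and_pad(texts_clean, vocab_size, max_length)
    print(x.shape)              # (number of tweets, max_length)
    # reload the artifacts saved by tokenize_and_pad
    with open(r'model_data_keras_embedding\tokenizer_save.pickle', 'rb') as handle:
        tokenizer_loaded = pickle.load(handle)
    with open(r'model_data_keras_embedding\padder_save.pickle', 'rb') as handle:
        max_length_loaded = pickle.load(handle)
    # preprocess a new, unseen tweet with the reloaded artifacts
    new_sequence = tokenizer_loaded.texts_to_sequences([clean_text("Great service, thank you!")])
    new_padded = pad_sequences(new_sequence, maxlen=max_length_loaded, padding="post")
    print(new_padded)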