# -*- coding: utf-8 -*-
"""part1 2_21.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1Me67PabGSdyCNiTc3xXtXBE37rNuj98u
"""
import pandas as pd
import numpy as np
from scipy import spatial
import string
import nltk
import re
nltk.download('stopwords')
from nltk.corpus import stopwords
#import and initialize the Porter stemmer
from nltk.stem import PorterStemmer
ps = PorterStemmer()
#import and initialize the WordNet lemmatizer
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')
lemmatizer = WordNetLemmatizer()
#download pretrained GloVe word embeddings
!wget http://nlp.stanford.edu/data/glove.42B.300d.zip
!unzip glove.42B.300d.zip
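# note: the 42B-token GloVe archive is large (roughly 2 GB zipped, around
# 5 GB unpacked), so the download and unzip steps above can take a while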
#TODO: import the csv here and get the column of mission statements
# for now, a single statement copied from the csv file, noise included:
mission_statement = 'Whittier is a place where students /234/ + - 131 faculty become actively involved in the local community as they explore their role in the global community, too. Our residential nature is a key component in defining who we are, as students and professors live and learn in close proximity to one another, fostering an atmosphere where knowledge can be pursued 24/7. This is a community where individuality is valued, tolerance is cherished, and the things that make each of us unique are seen as invaluable tools in seeking greater knowledge and understanding.'
#keep only the first 1000 embeddings (the most frequent words) to limit memory
!head -n 1000 glove.42B.300d.txt > top_1000.txt
embeddings = {}
with open('top_1000.txt', 'r') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], "float32")
        embeddings[word] = vector
words_with_embeddings = set(embeddings)
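# sanity check (illustrative): glove.42B.300d vectors are 300-dimensional,
# so every entry loaded above should have length 300
assert all(len(v) == 300 for v in embeddings.values())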
def clean(text):
    '''
    cleans text by changing it to a list of words: removes punctuation,
    numbers, and unwanted characters (, ' " + ), changes to lower case,
    and removes stop words.
    '''
    # strip punctuation, lower-case, and split on spaces
    text = text.translate(str.maketrans('', '', string.punctuation))
    description = text.lower().strip()
    words = description.split(" ")
    stops = set(stopwords.words('english'))
    #filter out tokens containing numbers/special characters
    new_list = [x for x in words if not any(c.isdigit() or c in ['+', '-', '/', '"', "'"] for c in x)]
    #removes any empty strings
    new_list = list(filter(None, new_list))
    # filter out stop words
    new_list = [w for w in new_list if w not in stops]
    return new_list
clean(mission_statement)
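# illustrative example of clean() on a toy string: punctuation is stripped,
# number-like tokens are dropped, and stop words are removed, e.g.
#   clean('Students + faculty, 24/7!')  ->  ['students', 'faculty']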
def stem(words):
    '''
    takes in a clean list of words and uses the Porter stemmer to return a
    list of stems without duplicates
    '''
    new_list = []
    for w in words:
        x = ps.stem(w)
        if x not in new_list:
            new_list += [x]
    return new_list
#testing out the stemmer
stem(clean(mission_statement))
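# the Porter stemmer truncates words to crude stems, e.g.
#   ps.stem('running')   -> 'run'
#   ps.stem('knowledge') -> 'knowledg'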
def lemma(words):
    '''
    takes in a clean list of words and uses the lemmatizer to return a list
    of lemmas without duplicates
    '''
    new_list = []
    for w in words:
        x = lemmatizer.lemmatize(w)
        if x not in new_list:
            new_list += [x]
    return new_list
#testing out lemma
lemma(clean(mission_statement))
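# unlike the stemmer, the lemmatizer returns dictionary forms, e.g.
#   lemmatizer.lemmatize('communities') -> 'community'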
def calculate_description_embedding(words):
    '''
    takes in a clean list of words, looks up the embedding for each word,
    and returns the average word embedding for the list (or None if no word
    in the list has an embedding).
    '''
    #removes any words not in the embedding vocabulary
    words = [w for w in words if w in words_with_embeddings]
    if len(words) == 0:
        return None
    # average the remaining word vectors and return
    return sum([embeddings[w] for w in words]) / len(words)
#trying everything out
calculate_description_embedding(stem(clean(mission_statement)))
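# a minimal sketch of what the averaged embeddings are for: comparing two
# documents with cosine similarity, using the scipy.spatial import above.
# the second statement is hypothetical, and clean() is used without stem()
# here since crude stems often fall outside the GloVe vocabulary
other_statement = 'Our college fosters knowledge and community among students and faculty.'
v1 = calculate_description_embedding(clean(mission_statement))
v2 = calculate_description_embedding(clean(other_statement))
if v1 is not None and v2 is not None:
    print(1 - spatial.distance.cosine(v1, v2))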