# -*- coding: utf-8 -*-
"""part1 2_21.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1Me67PabGSdyCNiTc3xXtXBE37rNuj98u
"""
import pandas as pd
import numpy as np
from scipy import spatial
import string
import nltk
import re
nltk.download('stopwords')
from nltk.corpus import stopwords
#import and initialize the Porter stemmer
from nltk.stem import PorterStemmer
ps = PorterStemmer()
#import and initialize the WordNet lemmatizer
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')
lemmatizer = WordNetLemmatizer()
#download pretrained GloVe word embeddings
!wget http://nlp.stanford.edu/data/glove.42B.300d.zip
!unzip glove.42B.300d.zip
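# note: the 42B-token GloVe archive is large (roughly 2 GB zipped, around
# 5 GB unpacked), so the download and unzip steps above can take a while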
#TODO: import the csv here and get the column of mission statements
# for now, a single statement copied from the csv file, noise included:
mission_statement = 'Whittier is a place where students /234/ + - 131 faculty become actively involved in the local community as they explore their role in the global community, too. Our residential nature is a key component in defining who we are, as students and professors live and learn in close proximity to one another, fostering an atmosphere where knowledge can be pursued 24/7. This is a community where individuality is valued, tolerance is cherished, and the things that make each of us unique are seen as invaluable tools in seeking greater knowledge and understanding.'
#keep only the first 1000 embeddings (the most frequent words) to limit memory
!head -n 1000 glove.42B.300d.txt > top_1000.txt
embeddings = {}
with open('top_1000.txt', 'r') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], "float32")
        embeddings[word] = vector
words_with_embeddings = set(embeddings)
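# sanity check (illustrative): glove.42B.300d vectors are 300-dimensional,
# so every entry loaded above should have length 300
assert all(len(v) == 300 for v in embeddings.values())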
def clean(text):
    '''
    cleans text by changing it to a list of words: removes punctuation,
    numbers, and unwanted characters (, ' " + ), changes to lower case,
    and removes stop words.
    '''
    # strip punctuation, lower-case, and split on spaces
    text = text.translate(str.maketrans('', '', string.punctuation))
    description = text.lower().strip()
    words = description.split(" ")
    stops = set(stopwords.words('english'))
    #filter out tokens containing numbers/special characters
    new_list = [x for x in words if not any(c.isdigit() or c in ['+', '-', '/', '"', "'"] for c in x)]
    #removes any empty strings
    new_list = list(filter(None, new_list))
    # filter out stop words
    new_list = [w for w in new_list if w not in stops]
    return new_list
clean(mission_statement)
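# illustrative example of clean() on a toy string: punctuation is stripped,
# number-like tokens are dropped, and stop words are removed, e.g.
#   clean('Students + faculty, 24/7!')  ->  ['students', 'faculty']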
def stem(words):
    '''
    takes in a clean list of words and uses the Porter stemmer to return a
    list of stems without duplicates
    '''
    new_list = []
    for w in words:
        x = ps.stem(w)
        if x not in new_list:
            new_list += [x]
    return new_list
#testing out the stemmer
stem(clean(mission_statement))
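# the Porter stemmer truncates words to crude stems, e.g.
#   ps.stem('running')   -> 'run'
#   ps.stem('knowledge') -> 'knowledg'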
def lemma(words):
    '''
    takes in a clean list of words and uses the lemmatizer to return a list
    of lemmas without duplicates
    '''
    new_list = []
    for w in words:
        x = lemmatizer.lemmatize(w)
        if x not in new_list:
            new_list += [x]
    return new_list
#testing out lemma
lemma(clean(mission_statement))
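# unlike the stemmer, the lemmatizer returns dictionary forms, e.g.
#   lemmatizer.lemmatize('communities') -> 'community'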
def calculate_description_embedding(words):
    '''
    takes in a clean list of words, looks up the embedding for each word,
    and returns the average word embedding for the list (or None if no word
    in the list has an embedding).
    '''
    #removes any words not in the embedding vocabulary
    words = [w for w in words if w in words_with_embeddings]
    if len(words) == 0:
        return None
    # average the remaining word vectors and return
    return sum([embeddings[w] for w in words]) / len(words)
#trying everything out
calculate_description_embedding(stem(clean(mission_statement)))
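# a minimal sketch of what the averaged embeddings are for: comparing two
# documents with cosine similarity, using the scipy.spatial import above.
# the second statement is hypothetical, and clean() is used without stem()
# here since crude stems often fall outside the GloVe vocabulary
other_statement = 'Our college fosters knowledge and community among students and faculty.'
v1 = calculate_description_embedding(clean(mission_statement))
v2 = calculate_description_embedding(clean(other_statement))
if v1 is not None and v2 is not None:
    print(1 - spatial.distance.cosine(v1, v2))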