diff --git a/README.md b/README.md
index ab27293..011cb73 100644
--- a/README.md
+++ b/README.md
@@ -47,6 +47,28 @@ While FastText provides several pre-trained word vector datasets trained on Wiki
 ### Featurization Models
 
 We provide the TF-IDF vectorizer built from a 1-percent sample of English tweets posted to Twitter and captured in Twitter's public sample stream between 2013 and 2016.
+This dataset contains 11,715,393 tweets.
 You can download this vectorizer here: [2013to2016_tfidf_vectorizer_20190109.pkl](http://obj.umiacs.umd.edu/trecis_2018/2013to2016_tfidf_vectorizer_20190109.pkl)
 
 We also provide our FastText-trained model on this same set of English tweets, which you can find here: [archived_text_sample_2013to2016_gensim_200.model.tgz](http://obj.umiacs.umd.edu/trecis_2018/archived_text_sample_2013to2016_gensim_200.model.tgz)
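+
+For example, you can load both artifacts with something like the following sketch; it assumes the scikit-learn 0.20.x / gensim 3.x era APIs these files were built with, and that the model archive extracts to `text_sample_2013to2016_gensim_200.model` (the file name used in the training notebook):
+
+```python
+from sklearn.externals import joblib
+from gensim.models.fasttext import FastText
+from nltk.tokenize import TweetTokenizer
+
+# The pickled vectorizer references its custom tokenizer by name, so a
+# compatible tokenizer_wrapper must be defined before unpickling
+local_tokenizer = TweetTokenizer()
+def tokenizer_wrapper(text):
+    return local_tokenizer.tokenize(text)
+
+vectorizer = joblib.load("2013to2016_tfidf_vectorizer_20190109.pkl")
+features = vectorizer.transform(["Flooding reported near the river"])
+
+# Load the gensim FastText model after extracting the .tgz archive
+model = FastText.load("text_sample_2013to2016_gensim_200.model")
+print(model.wv.most_similar("earthquake"))
+```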
"stream", + "text": [ + "[nltk_data] Downloading package stopwords to /home/clb617/nltk_data...\n", + "[nltk_data] Package stopwords is already up-to-date!\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/scratch/clb617/anaconda3/lib/python3.6/site-packages/nltk/twitter/__init__.py:20: UserWarning: The twython library has not been installed. Some functionality from the twitter package will not be available.\n", + " warnings.warn(\"The twython library has not been installed. \"\n" + ] + } + ], + "source": [ + "import nltk\n", + "nltk.download(\"stopwords\")\n", + "from nltk.corpus import stopwords\n", + "\n", + "from nltk.tokenize import TweetTokenizer\n", + "from nltk.sentiment.vader import SentimentIntensityAnalyzer as VS" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "from gensim.models.fasttext import FastText" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "# Skip stop words, retweet signs, @ symbols, and URL headers\n", + "stopList = [\"http\", \"https\", \"rt\", \"@\", \":\", \"t.co\", \"co\", \"amp\", \"&\", \"...\", \"\\n\", \"\\r\"]\n", + "stopList.extend(string.punctuation)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# def tokenizer_wrapper(text):\n", + "# return [t.lemma_ for t in nlp(text)]\n", + "\n", + "local_tokenizer = TweetTokenizer()\n", + "def tokenizer_wrapper(text):\n", + " return local_tokenizer.tokenize(text)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(\n", + " tokenizer=tokenizer_wrapper,\n", + " ngram_range=(1, 1),\n", + " stop_words=stopList, #We do better when we keep stopwords\n", + " use_idf=True,\n", + " smooth_idf=False,\n", + " norm=None, #Applies l2 norm smoothing\n", + " decode_error='replace',\n", + " max_features=10000,\n", + " min_df=4,\n", + " max_df=0.501\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "analyzer = vectorizer.build_analyzer()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "def normalize(s):\n", + " \"\"\"\n", + " Given a text, cleans and normalizes it. 
Feel free to add your own stuff.\n", + " From: https://www.kaggle.com/mschumacher/using-fasttext-models-for-robust-embeddings\n", + " \"\"\"\n", + " s = s.lower()\n", + "\n", + " # Replace numbers and symbols with language\n", + " s = s.replace('&', ' and ')\n", + " s = s.replace('@', ' at ')\n", + " s = s.replace('0', 'zero')\n", + " s = s.replace('1', 'one')\n", + " s = s.replace('2', 'two')\n", + " s = s.replace('3', 'three')\n", + " s = s.replace('4', 'four')\n", + " s = s.replace('5', 'five')\n", + " s = s.replace('6', 'six')\n", + " s = s.replace('7', 'seven')\n", + " s = s.replace('8', 'eight')\n", + " s = s.replace('9', 'nine')\n", + "\n", + " return s" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "trec_tweets = []\n", + "tweet_texts = []" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "with open(\"rehydrated_tweets.json\", \"r\") as in_file:\n", + " for line in in_file:\n", + " tweet = json.loads(line)\n", + " trec_tweets.append(tweet[\"text\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "11717671\n" + ] + } + ], + "source": [ + "# for f in glob.glob(\"/home/clb617/scratch/projects/trecis_proj/data/tweet_random_subset_2013to2016_v2/part*.gz\"):\n", + "# with gzip.open(f, \"r\") as in_file:\n", + "# for line_bytes in in_file:\n", + "# line = line_bytes.decode(\"utf8\")\n", + "# tweet = json.loads(line)\n", + "# if ( tweet[\"lang\"] == \"en\" and \"retweeted_status\" not in tweet ):\n", + "# tweet_texts.append([\"text\"])\n", + "\n", + "# with gzip.open(\"/home/clb617/scratch/projects/trecis_proj/data/english_2015_sample_1m.json.gz\", \"r\") as in_file:\n", + "# with gzip.open(\"/home/clb617/scratch/projects/trecis_proj/data/text_sample_2015.json.gz\", \"r\") as in_file:\n", + "# for line_bytes in in_file:\n", + "# line = line_bytes.decode(\"utf8\")\n", + "# tweet_texts.append(json.loads(line)[\"text\"])\n", + "\n", + "def jsonstr2tweet(jstr):\n", + " try:\n", + " tweet = json.loads(jstr)\n", + " if ( tweet[\"lang\"] == \"en\" and \"retweeted_status\" not in tweet ):\n", + " return tweet\n", + " else:\n", + " return None\n", + " except:\n", + " return None\n", + "\n", + "rdd = sc.textFile(\"tweet_random_subset_2013to2016_v2/\")\n", + "tweet_texts = rdd.map(jsonstr2tweet).filter(lambda x : x != None).map(lambda x: x[\"text\"]).collect()\n", + "\n", + "# [[normalize(t) for t in analyzer(s)] for s in all_texts]\n", + "\n", + "print(len(tweet_texts))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "all_texts = trec_tweets + tweet_texts" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'prayfourparis'" + ] + }, + "execution_count": 17, + "metadata": {}, + 
"output_type": "execute_result" + } + ], + "source": [ + "normalize(\"pray4paris\")" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "cleaned_text = [[normalize(t) for t in analyzer(s)] for s in all_texts]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "FastText(vocab=368359, size=200, alpha=0.025)\n" + ] + } + ], + "source": [ + "model_gensim = FastText(\n", + " size=200,\n", + " min_count=5,\n", + " window=10\n", + ")\n", + "\n", + "# build the vocabulary\n", + "model_gensim.build_vocab(cleaned_text)\n", + "\n", + "# train the model\n", + "model_gensim.train(cleaned_text, total_examples=model_gensim.corpus_count, epochs=model_gensim.epochs)\n", + "\n", + "print(model_gensim)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "model_gensim.save('text_sample_2013to2016_gensim_200.model')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([-0.00251447, 0.22377786, 0.98355085, -0.7715473 , -0.13788459,\n", + " 0.90395945, -0.44666514, -0.42506534, -0.54950446, 1.0771613 ,\n", + " 1.2118708 , 0.34573835, -1.12505 , -1.1896925 , 0.10617373,\n", + " 0.14834636, -0.63678473, 0.5306245 , 0.91713506, 0.78559077,\n", + " 1.677414 , -0.13240121, -0.30925515, -0.3110578 , 0.5169658 ,\n", + " 0.5799523 , -1.0073831 , 0.80370396, -0.40828082, 0.15891187,\n", + " 0.05013 , -0.9099712 , 0.30009922, 1.1096807 , -0.31454656,\n", + " 1.8045816 , 0.31973043, -0.5810332 , -0.05613879, 0.8439649 ,\n", + " -0.5796422 , 0.09544377, 0.5595102 , 0.29849023, -0.5070656 ,\n", + " -0.33554476, 0.2724658 , 0.3301648 , 0.03812361, 1.280918 ,\n", + " -0.8932892 , 1.0181543 , 0.6297721 , -0.79178953, -0.4830722 ,\n", + " 1.900083 , -0.97206956, -1.231961 , 0.519141 , -1.3692759 ,\n", + " 0.14483038, -0.81043893, -0.7811998 , -0.5656443 , 0.68394303,\n", + " -0.00412971, 1.8282131 , 0.38563082, -0.14956062, -0.15799755,\n", + " 0.8279126 , 1.1851251 , -0.60868716, -1.1392959 , 0.01907011,\n", + " -0.1993565 , 0.08864743, 0.73447526, 1.1220739 , 0.15387197,\n", + " -0.23781261, 0.35393322, 0.5229472 , 1.1374369 , 1.1848328 ,\n", + " 1.3268511 , -1.0447361 , 1.595999 , 0.66512877, 0.7180348 ,\n", + " -0.75844324, 0.05856895, -0.61785024, -0.270561 , -1.0492203 ,\n", + " -0.9660757 , 0.13246736, 0.8860084 , 0.04582638, 0.05656901,\n", + " 0.06524241, 0.10691343, -0.40181476, 0.13881405, -1.3615162 ,\n", + " -0.10067749, 0.97370344, -0.9406569 , 0.42231107, 0.505285 ,\n", + " -0.40041313, -0.03177388, 0.8784866 , -0.33346197, 0.0926585 ,\n", + " -0.6843044 , -1.0193583 , -0.34783173, -0.23892027, 0.18805595,\n", + " 0.14845106, 0.14841844, 0.85369676, 0.54138273, 1.148289 ,\n", + " -0.6931565 , -0.80891985, 0.33537978, 0.1751958 , 1.3862312 ,\n", + " -0.51982635, -0.2999697 , 0.12558676, -0.05539849, -0.16308819,\n", + " 0.6570933 , -1.5081487 , 1.197739 , -0.8126156 , 0.74745417,\n", + " -0.943481 , -1.6323334 , 0.29812822, -0.14741744, 0.1782602 ,\n", + " 0.9578688 , -0.2850476 , -1.1810929 , -0.7266627 , -0.13423558,\n", + " 0.8501329 , 1.406546 , 0.6848001 , 0.95838404, 0.35591075,\n", + 
" 1.7670783 , 0.35993043, -0.16310644, -0.23210363, -0.26639834,\n", + " 0.14959098, 0.51991194, -0.30781794, 0.61597764, 0.08290225,\n", + " -0.5098869 , -1.0988191 , -0.04218112, 1.3670919 , 0.5600427 ,\n", + " -0.650471 , 0.48897344, -0.14917164, -0.09997601, -0.07794272,\n", + " 0.7474547 , -0.11964303, 0.7395969 , -0.2718192 , -0.71038485,\n", + " -0.66098285, 0.6040508 , 0.14715323, -0.7527266 , -0.31431022,\n", + " 0.75994515, 0.44979033, 0.8749855 , 0.24492699, 0.29419214,\n", + " 0.5749436 , 0.2710817 , 0.3514359 , -0.48893654, 0.18267944,\n", + " 0.10806478, -0.46188998, 0.04687883, 0.11950479, 0.4838739 ],\n", + " dtype=float32)" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model_gensim.wv[normalize(\"pray4paris\")]" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/scratch/clb617/anaconda3/lib/python3.6/site-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.\n", + " if np.issubdtype(vec.dtype, np.int):\n" + ] + }, + { + "data": { + "text/plain": [ + "[('#prayforparis', 0.7384930849075317),\n", + " ('paris', 0.6991567611694336),\n", + " ('#prayersforparis', 0.6977229118347168),\n", + " ('#prayforjakarta', 0.6635531187057495),\n", + " ('#prayersfourparis', 0.65708327293396),\n", + " ('vogueparis', 0.6510902643203735),\n", + " ('pragya', 0.6476349234580994),\n", + " ('#paris', 0.6376635432243347),\n", + " ('#prayforbangkok', 0.6372296810150146),\n", + " ('#jesuisparis', 0.6351719498634338)]" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model_gensim.wv.most_similar(\"pray4paris\")" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/scratch/clb617/anaconda3/lib/python3.6/site-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.\n", + " if np.issubdtype(vec.dtype, np.int):\n" + ] + }, + { + "data": { + "text/plain": [ + "[('#prayforparis', 0.7739624381065369),\n", + " ('#prayforgaza', 0.674716055393219),\n", + " ('#prayforiraq', 0.6647266149520874),\n", + " ('#prayforsyria', 0.6583042144775391),\n", + " ('#prayforjakarta', 0.6565165519714355),\n", + " ('#prayfororlando', 0.6531145572662354),\n", + " ('#prayforvenezuela', 0.6508985757827759),\n", + " ('#prayforlebanon', 0.6483551263809204),\n", + " ('#prayforpalestine', 0.6478559970855713),\n", + " ('#prayforpalestina', 0.6470707654953003)]" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model_gensim.wv.most_similar(normalize(\"pray4paris\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/scratch/clb617/anaconda3/lib/python3.6/site-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. 
In future, it will be treated as `np.int64 == np.dtype(int).type`.\n", + " if np.issubdtype(vec.dtype, np.int):\n" + ] + }, + { + "data": { + "text/plain": [ + "[('earthquake', 0.917615532875061),\n", + " ('quake-hit', 0.9108307361602783),\n", + " ('cupquake', 0.9038751721382141),\n", + " ('earthquake_rt', 0.8828902244567871),\n", + " ('#earthquake', 0.8807302117347717),\n", + " ('#quake', 0.869563102722168),\n", + " ('earthquakes', 0.8631203174591064),\n", + " ('#nepalearthquake', 0.8421447277069092),\n", + " ('earthqua', 0.830664336681366),\n", + " ('#earthquakeph', 0.8282575011253357)]" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model_gensim.wv.most_similar(normalize(\"quake\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "test_sentences = [\n", + " \"19 dead, 50 injured in #ManchesterArena explosion. Being treated as a terror incident. http://www.bbc.co.uk/news/live/uk-england-manchester-40007967 …\",\n", + " \"EXPLOSION AT MANCHESTER ARENA AND EVERYONE RAN OUT SO SCARY😭\",\n", + " \"Watch live coverage: #ManchesterArena explosion at Ariana Grande concert:\",\n", + " \"Greater Manchester Police says the number of people killed in the Manchester attack has risen to 22\",\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "19 dead, 50 injured in #ManchesterArena explosion. Being treated as a terror incident. http://www.bbc.co.uk/news/live/uk-england-manchester-40007967 …\n", + "('terrestrial', 0.8943253755569458)\n", + "('interceding', 0.8877480030059814)\n", + "('intercostal', 0.8862031698226929)\n", + "('marauding', 0.8860480785369873)\n", + "('indentured', 0.8854652643203735)\n", + "('intercontinental', 0.8847228288650513)\n", + "('sacramentonews', 0.8833929300308228)\n", + "('severing', 0.8816179633140564)\n", + "('derailment', 0.8803422451019287)\n", + "('fouroneactionnews', 0.8803269863128662)\n", + "---\n", + "EXPLOSION AT MANCHESTER ARENA AND EVERYONE RAN OUT SO SCARY😭\n", + "('groundbreaking', 0.8508864045143127)\n", + "('housebreaking', 0.8507399559020996)\n", + "('andover', 0.8389917612075806)\n", + "('ground-breaking', 0.8367241024971008)\n", + "('seaking', 0.8314098119735718)\n", + "('backbreaking', 0.8305997848510742)\n", + "('#groundbreaking', 0.8290548324584961)\n", + "('westinghouse', 0.8275308609008789)\n", + "('creaking', 0.8262030482292175)\n", + "('lansing', 0.8249188661575317)\n", + "---\n", + "Watch live coverage: #ManchesterArena explosion at Ariana Grande concert:\n", + "('performing', 0.855387806892395)\n", + "('outperforming', 0.8498433828353882)\n", + "('live-streaming', 0.8489073514938354)\n", + "('underperforming', 0.8463796377182007)\n", + "('high-performing', 0.8437735438346863)\n", + "('livestreaming', 0.8407323360443115)\n", + "('chantecaille', 0.8391469717025757)\n", + "('perfoming', 0.834829568862915)\n", + "('livening', 0.8326269388198853)\n", + "('performence', 0.8307388424873352)\n", + "---\n", + "Greater Manchester Police says the number of people killed in the Manchester attack has risen to 22\n", + "('worcester', 0.8883395195007324)\n", + "('undistorted', 0.8783309459686279)\n", + "('disenfranchising', 0.8776552081108093)\n", + "('sacramentonews', 0.8775242567062378)\n", + 
"('devestating', 0.876780092716217)\n", + "('interceding', 0.8760107755661011)\n", + "('marauding', 0.8759708404541016)\n", + "('uniden', 0.875922441482544)\n", + "('persepolis', 0.8755065202713013)\n", + "('suster', 0.8751187324523926)\n", + "---\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/scratch/clb617/anaconda3/lib/python3.6/site-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.\n", + " if np.issubdtype(vec.dtype, np.int):\n" + ] + } + ], + "source": [ + "def vectorize(sentence):\n", + " tokenized = [normalize(t) for t in analyzer(sentence)]\n", + " \n", + " wvs = []\n", + " for t in tokenized:\n", + " v = model_gensim.wv[t]\n", + " norm = np.linalg.norm(v)\n", + " normed_v = v / norm\n", + " wvs.append(normed_v)\n", + " \n", + " m = np.array(wvs)\n", + " normed_m = np.mean(m, axis=0)\n", + "\n", + " return normed_m\n", + "\n", + "for s in test_sentences:\n", + " sv = vectorize(s)\n", + " print(s)\n", + " for tup in model_gensim.wv.similar_by_vector(sv):\n", + " print(tup)\n", + " print(\"---\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.0" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/featurization/.ipynb_checkpoints/VectorizerBuilder-checkpoint.ipynb b/featurization/.ipynb_checkpoints/VectorizerBuilder-checkpoint.ipynb new file mode 100644 index 0000000..31d75a5 --- /dev/null +++ b/featurization/.ipynb_checkpoints/VectorizerBuilder-checkpoint.ipynb @@ -0,0 +1,414 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "%matplotlib inline\n", + "\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import string\n", + "import gzip\n", + "import json\n", + "import re" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import scipy\n", + "from scipy import interpolate" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import glob" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/clb617/.local/lib/python3.6/site-packages/sklearn/ensemble/weight_boosting.py:29: DeprecationWarning: numpy.core.umath_tests is an internal NumPy module and should not be imported. 
It will be removed in a future NumPy release.\n", + " from numpy.core.umath_tests import inner1d\n" + ] + } + ], + "source": [ + "import sklearn.cluster\n", + "import sklearn.feature_extraction \n", + "import sklearn.feature_extraction.text\n", + "import sklearn.metrics\n", + "import sklearn.preprocessing\n", + "\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.externals import joblib\n", + "from sklearn.feature_selection import SelectFromModel\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.metrics import accuracy_score\n", + "from sklearn.metrics import auc\n", + "from sklearn.metrics import average_precision_score\n", + "from sklearn.metrics import confusion_matrix\n", + "from sklearn.metrics import f1_score\n", + "from sklearn.metrics import precision_recall_curve\n", + "from sklearn.model_selection import RandomizedSearchCV\n", + "from sklearn.model_selection import StratifiedKFold\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.multiclass import OneVsRestClassifier\n", + "from sklearn.naive_bayes import BernoulliNB\n", + "from sklearn.naive_bayes import MultinomialNB\n", + "from sklearn.pipeline import FeatureUnion\n", + "from sklearn.pipeline import Pipeline\n", + "from sklearn.semi_supervised import LabelPropagation\n", + "from sklearn.semi_supervised import LabelSpreading\n", + "from sklearn.svm import LinearSVC\n", + "from sklearn.svm import SVC\n", + "from sklearn.tree import DecisionTreeClassifier" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[nltk_data] Downloading package stopwords to /home/clb617/nltk_data...\n", + "[nltk_data] Package stopwords is already up-to-date!\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/clb617/.local/lib/python3.6/site-packages/nltk/twitter/__init__.py:20: UserWarning: The twython library has not been installed. Some functionality from the twitter package will not be available.\n", + " warnings.warn(\"The twython library has not been installed. 
\"\n" + ] + } + ], + "source": [ + "import nltk\n", + "nltk.download(\"stopwords\")\n", + "from nltk.corpus import stopwords\n", + "\n", + "from nltk.tokenize import TweetTokenizer\n", + "from nltk.sentiment.vader import SentimentIntensityAnalyzer as VS" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# But first, read in stopwrods\n", + "enStop = stopwords.words('english')\n", + "\n", + "# Skip stop words, retweet signs, @ symbols, and URL headers\n", + "stopList = enStop +\\\n", + " [\"http\", \"https\", \"rt\", \"@\", \":\", \"t.co\", \"co\", \"amp\", \"&\", \"...\", \"\\n\", \"\\r\"]\n", + "stopList.extend(string.punctuation)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# def tokenizer_wrapper(text):\n", + "# return [t.lemma_ for t in nlp(text)]\n", + "\n", + "local_tokenizer = TweetTokenizer()\n", + "def tokenizer_wrapper(text):\n", + " return local_tokenizer.tokenize(text)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(\n", + " tokenizer=tokenizer_wrapper,\n", + " ngram_range=(1, 2),\n", + " stop_words=stopList, #We do better when we keep stopwords\n", + " use_idf=True,\n", + " smooth_idf=False,\n", + " norm=None, #Applies l2 norm smoothing\n", + " decode_error='replace',\n", + " max_features=10000,\n", + " min_df=4,\n", + " max_df=0.501\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "tweet_texts = []\n", + "\n", + "for f in glob.glob(\"tweet_random_subset_2013to2016_v2/part*.gz\"):\n", + " flag = False\n", + " with gzip.open(f, \"r\") as in_file:\n", + " for line_bytes in in_file:\n", + " line = line_bytes.decode(\"utf8\")\n", + "\n", + " if ( len(line.strip()) == 0 ):\n", + " continue\n", + " \n", + " tweet = json.loads(line)\n", + " text = tweet[\"text\"]\n", + "\n", + " tweet_texts.append(text)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tweet Count: 11715393\n" + ] + } + ], + "source": [ + "print(\"Tweet Count:\", len(tweet_texts))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "trec_tweets = []\n", + "with open(\"rehydrated_tweets.json\", \"r\") as in_file:\n", + " for line in in_file:\n", + " tweet = json.loads(line)\n", + " trec_tweets.append(tweet[\"text\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + 
"all_texts = trec_tweets + tweet_texts" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vectorizer.fit(all_texts)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "joblib.dump(vectorizer, \"2013to2016_tfidf_vectorizer_20190109.pkl\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vocab = {v:i for i, v in enumerate(vectorizer.get_feature_names())}\n", + "idf_vals = vectorizer.idf_\n", + "idf_dict = {i:idf_vals[i] for i in vocab.values()} #keys are indices; values are IDF scores" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dictionary Word Count: 10000\n", + "['🚨', '🚨 🚨', '🚫', '🚮', '🚶', '🤑', '🤔', '🤔 🤔', '🤗', '🤘']\n" + ] + } + ], + "source": [ + "print(\"Dictionary Word Count:\", len(vocab))\n", + "print([x[0] for x in vocab.items()][-10:])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Finished\n" + ] + } + ], + "source": [ + "print(\"Finished\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.0" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/featurization/FastTextBuilder.ipynb b/featurization/FastTextBuilder.ipynb new file mode 100644 index 0000000..d1b8e20 --- /dev/null +++ b/featurization/FastTextBuilder.ipynb @@ -0,0 +1,729 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "%matplotlib inline\n", + "\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import string\n", + "import gzip\n", + "import json\n", + "import re\n", + "import glob" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "import scipy\n", + "from scipy import interpolate" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "import sklearn.cluster\n", + "import sklearn.feature_extraction \n", + "import sklearn.feature_extraction.text\n", + "import sklearn.metrics\n", + "import sklearn.preprocessing\n", + "\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.externals import joblib\n", + "from sklearn.feature_selection import SelectFromModel\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.metrics import accuracy_score\n", + "from sklearn.metrics import auc\n", + "from 
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[nltk_data] Downloading package stopwords to /home/clb617/nltk_data...\n",
+      "[nltk_data] Package stopwords is already up-to-date!\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/scratch/clb617/anaconda3/lib/python3.6/site-packages/nltk/twitter/__init__.py:20: UserWarning: The twython library has not been installed. Some functionality from the twitter package will not be available.\n",
+      "  warnings.warn(\"The twython library has not been installed. \"\n"
+     ]
+    }
+   ],
+   "source": [
+    "import nltk\n",
+    "nltk.download(\"stopwords\")\n",
+    "from nltk.corpus import stopwords\n",
+    "\n",
+    "from nltk.tokenize import TweetTokenizer\n",
+    "from nltk.sentiment.vader import SentimentIntensityAnalyzer as VS"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from gensim.models.fasttext import FastText"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Skip retweet signs, @ symbols, URL fragments, and punctuation\n",
+    "stopList = [\"http\", \"https\", \"rt\", \"@\", \":\", \"t.co\", \"co\", \"amp\", \"&\", \"...\", \"\\n\", \"\\r\"]\n",
+    "stopList.extend(string.punctuation)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# def tokenizer_wrapper(text):\n",
+    "#     return [t.lemma_ for t in nlp(text)]\n",
+    "\n",
+    "local_tokenizer = TweetTokenizer()\n",
+    "def tokenizer_wrapper(text):\n",
+    "    return local_tokenizer.tokenize(text)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(\n",
+    "    tokenizer=tokenizer_wrapper,\n",
+    "    ngram_range=(1, 1),\n",
+    "    stop_words=stopList, # Twitter artifacts and punctuation only; we do better keeping English stopwords\n",
+    "    use_idf=True,\n",
+    "    smooth_idf=False,\n",
+    "    norm=None, # No document-length normalization is applied\n",
+    "    decode_error='replace',\n",
+    "    max_features=10000,\n",
+    "    min_df=4,\n",
+    "    max_df=0.501\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "analyzer = vectorizer.build_analyzer()"
+   ]
+  },
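+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Illustrative sanity check (not part of the original run): the analyzer\n",
+    "# lowercases, applies TweetTokenizer, and drops the stopList entries\n",
+    "analyzer(\"RT @user: Flooding in the city! https://t.co/xyz\")"
+   ]
+  },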
"metadata": {}, + "outputs": [], + "source": [ + "def normalize(s):\n", + " \"\"\"\n", + " Given a text, cleans and normalizes it. Feel free to add your own stuff.\n", + " From: https://www.kaggle.com/mschumacher/using-fasttext-models-for-robust-embeddings\n", + " \"\"\"\n", + " s = s.lower()\n", + "\n", + " # Replace numbers and symbols with language\n", + " s = s.replace('&', ' and ')\n", + " s = s.replace('@', ' at ')\n", + " s = s.replace('0', 'zero')\n", + " s = s.replace('1', 'one')\n", + " s = s.replace('2', 'two')\n", + " s = s.replace('3', 'three')\n", + " s = s.replace('4', 'four')\n", + " s = s.replace('5', 'five')\n", + " s = s.replace('6', 'six')\n", + " s = s.replace('7', 'seven')\n", + " s = s.replace('8', 'eight')\n", + " s = s.replace('9', 'nine')\n", + "\n", + " return s" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "trec_tweets = []\n", + "tweet_texts = []" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "with open(\"rehydrated_tweets.json\", \"r\") as in_file:\n", + " for line in in_file:\n", + " tweet = json.loads(line)\n", + " trec_tweets.append(tweet[\"text\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "11717671\n" + ] + } + ], + "source": [ + "# for f in glob.glob(\"/home/clb617/scratch/projects/trecis_proj/data/tweet_random_subset_2013to2016_v2/part*.gz\"):\n", + "# with gzip.open(f, \"r\") as in_file:\n", + "# for line_bytes in in_file:\n", + "# line = line_bytes.decode(\"utf8\")\n", + "# tweet = json.loads(line)\n", + "# if ( tweet[\"lang\"] == \"en\" and \"retweeted_status\" not in tweet ):\n", + "# tweet_texts.append([\"text\"])\n", + "\n", + "# with gzip.open(\"/home/clb617/scratch/projects/trecis_proj/data/english_2015_sample_1m.json.gz\", \"r\") as in_file:\n", + "# with gzip.open(\"/home/clb617/scratch/projects/trecis_proj/data/text_sample_2015.json.gz\", \"r\") as in_file:\n", + "# for line_bytes in in_file:\n", + "# line = line_bytes.decode(\"utf8\")\n", + "# tweet_texts.append(json.loads(line)[\"text\"])\n", + "\n", + "def jsonstr2tweet(jstr):\n", + " try:\n", + " tweet = json.loads(jstr)\n", + " if ( tweet[\"lang\"] == \"en\" and \"retweeted_status\" not in tweet ):\n", + " return tweet\n", + " else:\n", + " return None\n", + " except:\n", + " return None\n", + "\n", + "rdd = sc.textFile(\"tweet_random_subset_2013to2016_v2/\")\n", + "tweet_texts = rdd.map(jsonstr2tweet).filter(lambda x : x != None).map(lambda x: x[\"text\"]).collect()\n", + "\n", + "# [[normalize(t) for t in analyzer(s)] for s in all_texts]\n", + "\n", + "print(len(tweet_texts))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "all_texts = trec_tweets + tweet_texts" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + 
"outputs": [ + { + "data": { + "text/plain": [ + "'prayfourparis'" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "normalize(\"pray4paris\")" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "cleaned_text = [[normalize(t) for t in analyzer(s)] for s in all_texts]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "FastText(vocab=368359, size=200, alpha=0.025)\n" + ] + } + ], + "source": [ + "model_gensim = FastText(\n", + " size=200,\n", + " min_count=5,\n", + " window=10\n", + ")\n", + "\n", + "# build the vocabulary\n", + "model_gensim.build_vocab(cleaned_text)\n", + "\n", + "# train the model\n", + "model_gensim.train(cleaned_text, total_examples=model_gensim.corpus_count, epochs=model_gensim.epochs)\n", + "\n", + "print(model_gensim)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "model_gensim.save('text_sample_2013to2016_gensim_200.model')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([-0.00251447, 0.22377786, 0.98355085, -0.7715473 , -0.13788459,\n", + " 0.90395945, -0.44666514, -0.42506534, -0.54950446, 1.0771613 ,\n", + " 1.2118708 , 0.34573835, -1.12505 , -1.1896925 , 0.10617373,\n", + " 0.14834636, -0.63678473, 0.5306245 , 0.91713506, 0.78559077,\n", + " 1.677414 , -0.13240121, -0.30925515, -0.3110578 , 0.5169658 ,\n", + " 0.5799523 , -1.0073831 , 0.80370396, -0.40828082, 0.15891187,\n", + " 0.05013 , -0.9099712 , 0.30009922, 1.1096807 , -0.31454656,\n", + " 1.8045816 , 0.31973043, -0.5810332 , -0.05613879, 0.8439649 ,\n", + " -0.5796422 , 0.09544377, 0.5595102 , 0.29849023, -0.5070656 ,\n", + " -0.33554476, 0.2724658 , 0.3301648 , 0.03812361, 1.280918 ,\n", + " -0.8932892 , 1.0181543 , 0.6297721 , -0.79178953, -0.4830722 ,\n", + " 1.900083 , -0.97206956, -1.231961 , 0.519141 , -1.3692759 ,\n", + " 0.14483038, -0.81043893, -0.7811998 , -0.5656443 , 0.68394303,\n", + " -0.00412971, 1.8282131 , 0.38563082, -0.14956062, -0.15799755,\n", + " 0.8279126 , 1.1851251 , -0.60868716, -1.1392959 , 0.01907011,\n", + " -0.1993565 , 0.08864743, 0.73447526, 1.1220739 , 0.15387197,\n", + " -0.23781261, 0.35393322, 0.5229472 , 1.1374369 , 1.1848328 ,\n", + " 1.3268511 , -1.0447361 , 1.595999 , 0.66512877, 0.7180348 ,\n", + " -0.75844324, 0.05856895, -0.61785024, -0.270561 , -1.0492203 ,\n", + " -0.9660757 , 0.13246736, 0.8860084 , 0.04582638, 0.05656901,\n", + " 0.06524241, 0.10691343, -0.40181476, 0.13881405, -1.3615162 ,\n", + " -0.10067749, 0.97370344, -0.9406569 , 0.42231107, 0.505285 ,\n", + " -0.40041313, -0.03177388, 0.8784866 , -0.33346197, 0.0926585 ,\n", + " -0.6843044 , -1.0193583 , -0.34783173, -0.23892027, 0.18805595,\n", + " 0.14845106, 0.14841844, 0.85369676, 0.54138273, 1.148289 ,\n", + " -0.6931565 , -0.80891985, 0.33537978, 0.1751958 , 1.3862312 ,\n", + " -0.51982635, -0.2999697 , 0.12558676, -0.05539849, -0.16308819,\n", + " 0.6570933 , -1.5081487 , 1.197739 , -0.8126156 , 0.74745417,\n", + " -0.943481 , -1.6323334 , 0.29812822, -0.14741744, 0.1782602 ,\n", + " 0.9578688 , 
+       " 0.8501329 , 1.406546 , 0.6848001 , 0.95838404, 0.35591075,\n",
+       " 1.7670783 , 0.35993043, -0.16310644, -0.23210363, -0.26639834,\n",
+       " 0.14959098, 0.51991194, -0.30781794, 0.61597764, 0.08290225,\n",
+       " -0.5098869 , -1.0988191 , -0.04218112, 1.3670919 , 0.5600427 ,\n",
+       " -0.650471 , 0.48897344, -0.14917164, -0.09997601, -0.07794272,\n",
+       " 0.7474547 , -0.11964303, 0.7395969 , -0.2718192 , -0.71038485,\n",
+       " -0.66098285, 0.6040508 , 0.14715323, -0.7527266 , -0.31431022,\n",
+       " 0.75994515, 0.44979033, 0.8749855 , 0.24492699, 0.29419214,\n",
+       " 0.5749436 , 0.2710817 , 0.3514359 , -0.48893654, 0.18267944,\n",
+       " 0.10806478, -0.46188998, 0.04687883, 0.11950479, 0.4838739 ],\n",
+       " dtype=float32)"
+      ]
+     },
+     "execution_count": 21,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "model_gensim.wv[normalize(\"pray4paris\")]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/scratch/clb617/anaconda3/lib/python3.6/site-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.\n",
+      "  if np.issubdtype(vec.dtype, np.int):\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "[('#prayforparis', 0.7384930849075317),\n",
+       " ('paris', 0.6991567611694336),\n",
+       " ('#prayersforparis', 0.6977229118347168),\n",
+       " ('#prayforjakarta', 0.6635531187057495),\n",
+       " ('#prayersfourparis', 0.65708327293396),\n",
+       " ('vogueparis', 0.6510902643203735),\n",
+       " ('pragya', 0.6476349234580994),\n",
+       " ('#paris', 0.6376635432243347),\n",
+       " ('#prayforbangkok', 0.6372296810150146),\n",
+       " ('#jesuisparis', 0.6351719498634338)]"
+      ]
+     },
+     "execution_count": 22,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "model_gensim.wv.most_similar(\"pray4paris\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/scratch/clb617/anaconda3/lib/python3.6/site-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.\n",
+      "  if np.issubdtype(vec.dtype, np.int):\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "[('#prayforparis', 0.7739624381065369),\n",
+       " ('#prayforgaza', 0.674716055393219),\n",
+       " ('#prayforiraq', 0.6647266149520874),\n",
+       " ('#prayforsyria', 0.6583042144775391),\n",
+       " ('#prayforjakarta', 0.6565165519714355),\n",
+       " ('#prayfororlando', 0.6531145572662354),\n",
+       " ('#prayforvenezuela', 0.6508985757827759),\n",
+       " ('#prayforlebanon', 0.6483551263809204),\n",
+       " ('#prayforpalestine', 0.6478559970855713),\n",
+       " ('#prayforpalestina', 0.6470707654953003)]"
+      ]
+     },
+     "execution_count": 23,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "model_gensim.wv.most_similar(normalize(\"pray4paris\"))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/scratch/clb617/anaconda3/lib/python3.6/site-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.\n",
+      "  if np.issubdtype(vec.dtype, np.int):\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "[('earthquake', 0.917615532875061),\n",
+       " ('quake-hit', 0.9108307361602783),\n",
+       " ('cupquake', 0.9038751721382141),\n",
+       " ('earthquake_rt', 0.8828902244567871),\n",
+       " ('#earthquake', 0.8807302117347717),\n",
+       " ('#quake', 0.869563102722168),\n",
+       " ('earthquakes', 0.8631203174591064),\n",
+       " ('#nepalearthquake', 0.8421447277069092),\n",
+       " ('earthqua', 0.830664336681366),\n",
+       " ('#earthquakeph', 0.8282575011253357)]"
+      ]
+     },
+     "execution_count": 24,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "model_gensim.wv.most_similar(normalize(\"quake\"))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "test_sentences = [\n",
+    "    \"19 dead, 50 injured in #ManchesterArena explosion. Being treated as a terror incident. http://www.bbc.co.uk/news/live/uk-england-manchester-40007967 …\",\n",
+    "    \"EXPLOSION AT MANCHESTER ARENA AND EVERYONE RAN OUT SO SCARY😭\",\n",
+    "    \"Watch live coverage: #ManchesterArena explosion at Ariana Grande concert:\",\n",
+    "    \"Greater Manchester Police says the number of people killed in the Manchester attack has risen to 22\",\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "19 dead, 50 injured in #ManchesterArena explosion. Being treated as a terror incident. http://www.bbc.co.uk/news/live/uk-england-manchester-40007967 …\n",
+      "('terrestrial', 0.8943253755569458)\n",
+      "('interceding', 0.8877480030059814)\n",
+      "('intercostal', 0.8862031698226929)\n",
+      "('marauding', 0.8860480785369873)\n",
+      "('indentured', 0.8854652643203735)\n",
+      "('intercontinental', 0.8847228288650513)\n",
+      "('sacramentonews', 0.8833929300308228)\n",
+      "('severing', 0.8816179633140564)\n",
+      "('derailment', 0.8803422451019287)\n",
+      "('fouroneactionnews', 0.8803269863128662)\n",
+      "---\n",
+      "EXPLOSION AT MANCHESTER ARENA AND EVERYONE RAN OUT SO SCARY😭\n",
+      "('groundbreaking', 0.8508864045143127)\n",
+      "('housebreaking', 0.8507399559020996)\n",
+      "('andover', 0.8389917612075806)\n",
+      "('ground-breaking', 0.8367241024971008)\n",
+      "('seaking', 0.8314098119735718)\n",
+      "('backbreaking', 0.8305997848510742)\n",
+      "('#groundbreaking', 0.8290548324584961)\n",
+      "('westinghouse', 0.8275308609008789)\n",
+      "('creaking', 0.8262030482292175)\n",
+      "('lansing', 0.8249188661575317)\n",
+      "---\n",
+      "Watch live coverage: #ManchesterArena explosion at Ariana Grande concert:\n",
+      "('performing', 0.855387806892395)\n",
+      "('outperforming', 0.8498433828353882)\n",
+      "('live-streaming', 0.8489073514938354)\n",
+      "('underperforming', 0.8463796377182007)\n",
+      "('high-performing', 0.8437735438346863)\n",
+      "('livestreaming', 0.8407323360443115)\n",
+      "('chantecaille', 0.8391469717025757)\n",
+      "('perfoming', 0.834829568862915)\n",
+      "('livening', 0.8326269388198853)\n",
+      "('performence', 0.8307388424873352)\n",
+      "---\n",
+      "Greater Manchester Police says the number of people killed in the Manchester attack has risen to 22\n",
+      "('worcester', 0.8883395195007324)\n",
+      "('undistorted', 0.8783309459686279)\n",
+      "('disenfranchising', 0.8776552081108093)\n",
+      "('sacramentonews', 0.8775242567062378)\n",
+      "('devestating', 0.876780092716217)\n",
+      "('interceding', 0.8760107755661011)\n",
+      "('marauding', 0.8759708404541016)\n",
+      "('uniden', 0.875922441482544)\n",
+      "('persepolis', 0.8755065202713013)\n",
+      "('suster', 0.8751187324523926)\n",
+      "---\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/scratch/clb617/anaconda3/lib/python3.6/site-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.\n",
+      "  if np.issubdtype(vec.dtype, np.int):\n"
+     ]
+    }
+   ],
+   "source": [
+    "def vectorize(sentence):\n",
+    "    # Embed a sentence as the mean of its L2-normalized token vectors\n",
+    "    tokenized = [normalize(t) for t in analyzer(sentence)]\n",
+    "    \n",
+    "    wvs = []\n",
+    "    for t in tokenized:\n",
+    "        v = model_gensim.wv[t]\n",
+    "        # Scale each token vector to unit length before averaging\n",
+    "        norm = np.linalg.norm(v)\n",
+    "        normed_v = v / norm\n",
+    "        wvs.append(normed_v)\n",
+    "    \n",
+    "    m = np.array(wvs)\n",
+    "    # The sentence vector is the element-wise mean over token vectors\n",
+    "    normed_m = np.mean(m, axis=0)\n",
+    "\n",
+    "    return normed_m\n",
+    "\n",
+    "for s in test_sentences:\n",
+    "    sv = vectorize(s)\n",
+    "    print(s)\n",
+    "    for tup in model_gensim.wv.similar_by_vector(sv):\n",
+    "        print(tup)\n",
+    "    print(\"---\")"
+   ]
+  },
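+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Illustrative (not executed here): the saved model can be reloaded later with\n",
+    "# model_gensim = FastText.load('text_sample_2013to2016_gensim_200.model')"
+   ]
+  },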
"('devestating', 0.876780092716217)\n", + "('interceding', 0.8760107755661011)\n", + "('marauding', 0.8759708404541016)\n", + "('uniden', 0.875922441482544)\n", + "('persepolis', 0.8755065202713013)\n", + "('suster', 0.8751187324523926)\n", + "---\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/scratch/clb617/anaconda3/lib/python3.6/site-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.\n", + " if np.issubdtype(vec.dtype, np.int):\n" + ] + } + ], + "source": [ + "def vectorize(sentence):\n", + " tokenized = [normalize(t) for t in analyzer(sentence)]\n", + " \n", + " wvs = []\n", + " for t in tokenized:\n", + " v = model_gensim.wv[t]\n", + " norm = np.linalg.norm(v)\n", + " normed_v = v / norm\n", + " wvs.append(normed_v)\n", + " \n", + " m = np.array(wvs)\n", + " normed_m = np.mean(m, axis=0)\n", + "\n", + " return normed_m\n", + "\n", + "for s in test_sentences:\n", + " sv = vectorize(s)\n", + " print(s)\n", + " for tup in model_gensim.wv.similar_by_vector(sv):\n", + " print(tup)\n", + " print(\"---\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.0" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/featurization/VectorizerBuilder.ipynb b/featurization/VectorizerBuilder.ipynb new file mode 100644 index 0000000..31d75a5 --- /dev/null +++ b/featurization/VectorizerBuilder.ipynb @@ -0,0 +1,414 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "%matplotlib inline\n", + "\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import string\n", + "import gzip\n", + "import json\n", + "import re" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import scipy\n", + "from scipy import interpolate" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import glob" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/clb617/.local/lib/python3.6/site-packages/sklearn/ensemble/weight_boosting.py:29: DeprecationWarning: numpy.core.umath_tests is an internal NumPy module and should not be imported. 
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/clb617/.local/lib/python3.6/site-packages/sklearn/ensemble/weight_boosting.py:29: DeprecationWarning: numpy.core.umath_tests is an internal NumPy module and should not be imported. It will be removed in a future NumPy release.\n",
+      "  from numpy.core.umath_tests import inner1d\n"
+     ]
+    }
+   ],
+   "source": [
+    "import sklearn.cluster\n",
+    "import sklearn.feature_extraction\n",
+    "import sklearn.feature_extraction.text\n",
+    "import sklearn.metrics\n",
+    "import sklearn.preprocessing\n",
+    "\n",
+    "from sklearn.ensemble import RandomForestClassifier\n",
+    "from sklearn.externals import joblib\n",
+    "from sklearn.feature_selection import SelectFromModel\n",
+    "from sklearn.linear_model import LogisticRegression\n",
+    "from sklearn.metrics import accuracy_score\n",
+    "from sklearn.metrics import auc\n",
+    "from sklearn.metrics import average_precision_score\n",
+    "from sklearn.metrics import confusion_matrix\n",
+    "from sklearn.metrics import f1_score\n",
+    "from sklearn.metrics import precision_recall_curve\n",
+    "from sklearn.model_selection import RandomizedSearchCV\n",
+    "from sklearn.model_selection import StratifiedKFold\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.multiclass import OneVsRestClassifier\n",
+    "from sklearn.naive_bayes import BernoulliNB\n",
+    "from sklearn.naive_bayes import MultinomialNB\n",
+    "from sklearn.pipeline import FeatureUnion\n",
+    "from sklearn.pipeline import Pipeline\n",
+    "from sklearn.semi_supervised import LabelPropagation\n",
+    "from sklearn.semi_supervised import LabelSpreading\n",
+    "from sklearn.svm import LinearSVC\n",
+    "from sklearn.svm import SVC\n",
+    "from sklearn.tree import DecisionTreeClassifier"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[nltk_data] Downloading package stopwords to /home/clb617/nltk_data...\n",
+      "[nltk_data] Package stopwords is already up-to-date!\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/clb617/.local/lib/python3.6/site-packages/nltk/twitter/__init__.py:20: UserWarning: The twython library has not been installed. Some functionality from the twitter package will not be available.\n",
+      "  warnings.warn(\"The twython library has not been installed. \"\n"
+     ]
+    }
+   ],
+   "source": [
+    "import nltk\n",
+    "nltk.download(\"stopwords\")\n",
+    "from nltk.corpus import stopwords\n",
+    "\n",
+    "from nltk.tokenize import TweetTokenizer\n",
+    "from nltk.sentiment.vader import SentimentIntensityAnalyzer as VS"
+   ]
+  },
\"\n" + ] + } + ], + "source": [ + "import nltk\n", + "nltk.download(\"stopwords\")\n", + "from nltk.corpus import stopwords\n", + "\n", + "from nltk.tokenize import TweetTokenizer\n", + "from nltk.sentiment.vader import SentimentIntensityAnalyzer as VS" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# But first, read in stopwrods\n", + "enStop = stopwords.words('english')\n", + "\n", + "# Skip stop words, retweet signs, @ symbols, and URL headers\n", + "stopList = enStop +\\\n", + " [\"http\", \"https\", \"rt\", \"@\", \":\", \"t.co\", \"co\", \"amp\", \"&\", \"...\", \"\\n\", \"\\r\"]\n", + "stopList.extend(string.punctuation)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# def tokenizer_wrapper(text):\n", + "# return [t.lemma_ for t in nlp(text)]\n", + "\n", + "local_tokenizer = TweetTokenizer()\n", + "def tokenizer_wrapper(text):\n", + " return local_tokenizer.tokenize(text)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(\n", + " tokenizer=tokenizer_wrapper,\n", + " ngram_range=(1, 2),\n", + " stop_words=stopList, #We do better when we keep stopwords\n", + " use_idf=True,\n", + " smooth_idf=False,\n", + " norm=None, #Applies l2 norm smoothing\n", + " decode_error='replace',\n", + " max_features=10000,\n", + " min_df=4,\n", + " max_df=0.501\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "tweet_texts = []\n", + "\n", + "for f in glob.glob(\"tweet_random_subset_2013to2016_v2/part*.gz\"):\n", + " flag = False\n", + " with gzip.open(f, \"r\") as in_file:\n", + " for line_bytes in in_file:\n", + " line = line_bytes.decode(\"utf8\")\n", + "\n", + " if ( len(line.strip()) == 0 ):\n", + " continue\n", + " \n", + " tweet = json.loads(line)\n", + " text = tweet[\"text\"]\n", + "\n", + " tweet_texts.append(text)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tweet Count: 11715393\n" + ] + } + ], + "source": [ + "print(\"Tweet Count:\", len(tweet_texts))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "trec_tweets = []\n", + "with open(\"rehydrated_tweets.json\", \"r\") as in_file:\n", + " for line in in_file:\n", + " tweet = json.loads(line)\n", + " trec_tweets.append(tweet[\"text\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + 
"all_texts = trec_tweets + tweet_texts" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vectorizer.fit(all_texts)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "joblib.dump(vectorizer, \"2013to2016_tfidf_vectorizer_20190109.pkl\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vocab = {v:i for i, v in enumerate(vectorizer.get_feature_names())}\n", + "idf_vals = vectorizer.idf_\n", + "idf_dict = {i:idf_vals[i] for i in vocab.values()} #keys are indices; values are IDF scores" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dictionary Word Count: 10000\n", + "['🚨', '🚨 🚨', '🚫', '🚮', '🚶', '🤑', '🤔', '🤔 🤔', '🤗', '🤘']\n" + ] + } + ], + "source": [ + "print(\"Dictionary Word Count:\", len(vocab))\n", + "print([x[0] for x in vocab.items()][-10:])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Finished\n" + ] + } + ], + "source": [ + "print(\"Finished\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.0" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}