diff --git a/README.md b/README.md
index ab27293..011cb73 100644
--- a/README.md
+++ b/README.md
@@ -47,6 +47,28 @@ While FastText provides several pre-trained word vector datasets trained on Wiki
 ### Featurization Models
 
 We provide the TF-IDF vectorizer built from a 1-percent sample of English tweets posted to Twitter and captured in Twitter's public sample stream between 2013 and 2016.
+This dataset contains 11,715,393 tweets.
 You can download this vectorizer here: [2013to2016_tfidf_vectorizer_20190109.pkl](http://obj.umiacs.umd.edu/trecis_2018/2013to2016_tfidf_vectorizer_20190109.pkl)
 
 We also provide our FastText-trained model on this same set of English tweets, which you can find here: [archived_text_sample_2013to2016_gensim_200.model.tgz](http://obj.umiacs.umd.edu/trecis_2018/archived_text_sample_2013to2016_gensim_200.model.tgz)
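+
+For example, you can load both artifacts with something like the following sketch; it assumes the scikit-learn 0.20.x / gensim 3.x era APIs these files were built with, and that the model archive extracts to `text_sample_2013to2016_gensim_200.model` (the file name used in the training notebook):
+
+```python
+from sklearn.externals import joblib
+from gensim.models.fasttext import FastText
+from nltk.tokenize import TweetTokenizer
+
+# The pickled vectorizer references its custom tokenizer by name, so a
+# compatible tokenizer_wrapper must be defined before unpickling
+local_tokenizer = TweetTokenizer()
+def tokenizer_wrapper(text):
+    return local_tokenizer.tokenize(text)
+
+vectorizer = joblib.load("2013to2016_tfidf_vectorizer_20190109.pkl")
+features = vectorizer.transform(["Flooding reported near the river"])
+
+# Load the gensim FastText model after extracting the .tgz archive
+model = FastText.load("text_sample_2013to2016_gensim_200.model")
+print(model.wv.most_similar("earthquake"))
+```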
"stream", + "text": [ + "[nltk_data] Downloading package stopwords to /home/clb617/nltk_data...\n", + "[nltk_data] Package stopwords is already up-to-date!\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/scratch/clb617/anaconda3/lib/python3.6/site-packages/nltk/twitter/__init__.py:20: UserWarning: The twython library has not been installed. Some functionality from the twitter package will not be available.\n", + " warnings.warn(\"The twython library has not been installed. \"\n" + ] + } + ], + "source": [ + "import nltk\n", + "nltk.download(\"stopwords\")\n", + "from nltk.corpus import stopwords\n", + "\n", + "from nltk.tokenize import TweetTokenizer\n", + "from nltk.sentiment.vader import SentimentIntensityAnalyzer as VS" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "from gensim.models.fasttext import FastText" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "# Skip stop words, retweet signs, @ symbols, and URL headers\n", + "stopList = [\"http\", \"https\", \"rt\", \"@\", \":\", \"t.co\", \"co\", \"amp\", \"&\", \"...\", \"\\n\", \"\\r\"]\n", + "stopList.extend(string.punctuation)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# def tokenizer_wrapper(text):\n", + "# return [t.lemma_ for t in nlp(text)]\n", + "\n", + "local_tokenizer = TweetTokenizer()\n", + "def tokenizer_wrapper(text):\n", + " return local_tokenizer.tokenize(text)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(\n", + " tokenizer=tokenizer_wrapper,\n", + " ngram_range=(1, 1),\n", + " stop_words=stopList, #We do better when we keep stopwords\n", + " use_idf=True,\n", + " smooth_idf=False,\n", + " norm=None, #Applies l2 norm smoothing\n", + " decode_error='replace',\n", + " max_features=10000,\n", + " min_df=4,\n", + " max_df=0.501\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "analyzer = vectorizer.build_analyzer()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "def normalize(s):\n", + " \"\"\"\n", + " Given a text, cleans and normalizes it. 
Feel free to add your own stuff.\n", + " From: https://www.kaggle.com/mschumacher/using-fasttext-models-for-robust-embeddings\n", + " \"\"\"\n", + " s = s.lower()\n", + "\n", + " # Replace numbers and symbols with language\n", + " s = s.replace('&', ' and ')\n", + " s = s.replace('@', ' at ')\n", + " s = s.replace('0', 'zero')\n", + " s = s.replace('1', 'one')\n", + " s = s.replace('2', 'two')\n", + " s = s.replace('3', 'three')\n", + " s = s.replace('4', 'four')\n", + " s = s.replace('5', 'five')\n", + " s = s.replace('6', 'six')\n", + " s = s.replace('7', 'seven')\n", + " s = s.replace('8', 'eight')\n", + " s = s.replace('9', 'nine')\n", + "\n", + " return s" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "trec_tweets = []\n", + "tweet_texts = []" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "with open(\"rehydrated_tweets.json\", \"r\") as in_file:\n", + " for line in in_file:\n", + " tweet = json.loads(line)\n", + " trec_tweets.append(tweet[\"text\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "11717671\n" + ] + } + ], + "source": [ + "# for f in glob.glob(\"/home/clb617/scratch/projects/trecis_proj/data/tweet_random_subset_2013to2016_v2/part*.gz\"):\n", + "# with gzip.open(f, \"r\") as in_file:\n", + "# for line_bytes in in_file:\n", + "# line = line_bytes.decode(\"utf8\")\n", + "# tweet = json.loads(line)\n", + "# if ( tweet[\"lang\"] == \"en\" and \"retweeted_status\" not in tweet ):\n", + "# tweet_texts.append([\"text\"])\n", + "\n", + "# with gzip.open(\"/home/clb617/scratch/projects/trecis_proj/data/english_2015_sample_1m.json.gz\", \"r\") as in_file:\n", + "# with gzip.open(\"/home/clb617/scratch/projects/trecis_proj/data/text_sample_2015.json.gz\", \"r\") as in_file:\n", + "# for line_bytes in in_file:\n", + "# line = line_bytes.decode(\"utf8\")\n", + "# tweet_texts.append(json.loads(line)[\"text\"])\n", + "\n", + "def jsonstr2tweet(jstr):\n", + " try:\n", + " tweet = json.loads(jstr)\n", + " if ( tweet[\"lang\"] == \"en\" and \"retweeted_status\" not in tweet ):\n", + " return tweet\n", + " else:\n", + " return None\n", + " except:\n", + " return None\n", + "\n", + "rdd = sc.textFile(\"tweet_random_subset_2013to2016_v2/\")\n", + "tweet_texts = rdd.map(jsonstr2tweet).filter(lambda x : x != None).map(lambda x: x[\"text\"]).collect()\n", + "\n", + "# [[normalize(t) for t in analyzer(s)] for s in all_texts]\n", + "\n", + "print(len(tweet_texts))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "all_texts = trec_tweets + tweet_texts" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'prayfourparis'" + ] + }, + "execution_count": 17, + "metadata": {}, + 
"output_type": "execute_result" + } + ], + "source": [ + "normalize(\"pray4paris\")" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "cleaned_text = [[normalize(t) for t in analyzer(s)] for s in all_texts]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "FastText(vocab=368359, size=200, alpha=0.025)\n" + ] + } + ], + "source": [ + "model_gensim = FastText(\n", + " size=200,\n", + " min_count=5,\n", + " window=10\n", + ")\n", + "\n", + "# build the vocabulary\n", + "model_gensim.build_vocab(cleaned_text)\n", + "\n", + "# train the model\n", + "model_gensim.train(cleaned_text, total_examples=model_gensim.corpus_count, epochs=model_gensim.epochs)\n", + "\n", + "print(model_gensim)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "model_gensim.save('text_sample_2013to2016_gensim_200.model')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([-0.00251447, 0.22377786, 0.98355085, -0.7715473 , -0.13788459,\n", + " 0.90395945, -0.44666514, -0.42506534, -0.54950446, 1.0771613 ,\n", + " 1.2118708 , 0.34573835, -1.12505 , -1.1896925 , 0.10617373,\n", + " 0.14834636, -0.63678473, 0.5306245 , 0.91713506, 0.78559077,\n", + " 1.677414 , -0.13240121, -0.30925515, -0.3110578 , 0.5169658 ,\n", + " 0.5799523 , -1.0073831 , 0.80370396, -0.40828082, 0.15891187,\n", + " 0.05013 , -0.9099712 , 0.30009922, 1.1096807 , -0.31454656,\n", + " 1.8045816 , 0.31973043, -0.5810332 , -0.05613879, 0.8439649 ,\n", + " -0.5796422 , 0.09544377, 0.5595102 , 0.29849023, -0.5070656 ,\n", + " -0.33554476, 0.2724658 , 0.3301648 , 0.03812361, 1.280918 ,\n", + " -0.8932892 , 1.0181543 , 0.6297721 , -0.79178953, -0.4830722 ,\n", + " 1.900083 , -0.97206956, -1.231961 , 0.519141 , -1.3692759 ,\n", + " 0.14483038, -0.81043893, -0.7811998 , -0.5656443 , 0.68394303,\n", + " -0.00412971, 1.8282131 , 0.38563082, -0.14956062, -0.15799755,\n", + " 0.8279126 , 1.1851251 , -0.60868716, -1.1392959 , 0.01907011,\n", + " -0.1993565 , 0.08864743, 0.73447526, 1.1220739 , 0.15387197,\n", + " -0.23781261, 0.35393322, 0.5229472 , 1.1374369 , 1.1848328 ,\n", + " 1.3268511 , -1.0447361 , 1.595999 , 0.66512877, 0.7180348 ,\n", + " -0.75844324, 0.05856895, -0.61785024, -0.270561 , -1.0492203 ,\n", + " -0.9660757 , 0.13246736, 0.8860084 , 0.04582638, 0.05656901,\n", + " 0.06524241, 0.10691343, -0.40181476, 0.13881405, -1.3615162 ,\n", + " -0.10067749, 0.97370344, -0.9406569 , 0.42231107, 0.505285 ,\n", + " -0.40041313, -0.03177388, 0.8784866 , -0.33346197, 0.0926585 ,\n", + " -0.6843044 , -1.0193583 , -0.34783173, -0.23892027, 0.18805595,\n", + " 0.14845106, 0.14841844, 0.85369676, 0.54138273, 1.148289 ,\n", + " -0.6931565 , -0.80891985, 0.33537978, 0.1751958 , 1.3862312 ,\n", + " -0.51982635, -0.2999697 , 0.12558676, -0.05539849, -0.16308819,\n", + " 0.6570933 , -1.5081487 , 1.197739 , -0.8126156 , 0.74745417,\n", + " -0.943481 , -1.6323334 , 0.29812822, -0.14741744, 0.1782602 ,\n", + " 0.9578688 , -0.2850476 , -1.1810929 , -0.7266627 , -0.13423558,\n", + " 0.8501329 , 1.406546 , 0.6848001 , 0.95838404, 0.35591075,\n", + 
" 1.7670783 , 0.35993043, -0.16310644, -0.23210363, -0.26639834,\n", + " 0.14959098, 0.51991194, -0.30781794, 0.61597764, 0.08290225,\n", + " -0.5098869 , -1.0988191 , -0.04218112, 1.3670919 , 0.5600427 ,\n", + " -0.650471 , 0.48897344, -0.14917164, -0.09997601, -0.07794272,\n", + " 0.7474547 , -0.11964303, 0.7395969 , -0.2718192 , -0.71038485,\n", + " -0.66098285, 0.6040508 , 0.14715323, -0.7527266 , -0.31431022,\n", + " 0.75994515, 0.44979033, 0.8749855 , 0.24492699, 0.29419214,\n", + " 0.5749436 , 0.2710817 , 0.3514359 , -0.48893654, 0.18267944,\n", + " 0.10806478, -0.46188998, 0.04687883, 0.11950479, 0.4838739 ],\n", + " dtype=float32)" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model_gensim.wv[normalize(\"pray4paris\")]" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/scratch/clb617/anaconda3/lib/python3.6/site-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.\n", + " if np.issubdtype(vec.dtype, np.int):\n" + ] + }, + { + "data": { + "text/plain": [ + "[('#prayforparis', 0.7384930849075317),\n", + " ('paris', 0.6991567611694336),\n", + " ('#prayersforparis', 0.6977229118347168),\n", + " ('#prayforjakarta', 0.6635531187057495),\n", + " ('#prayersfourparis', 0.65708327293396),\n", + " ('vogueparis', 0.6510902643203735),\n", + " ('pragya', 0.6476349234580994),\n", + " ('#paris', 0.6376635432243347),\n", + " ('#prayforbangkok', 0.6372296810150146),\n", + " ('#jesuisparis', 0.6351719498634338)]" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model_gensim.wv.most_similar(\"pray4paris\")" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/scratch/clb617/anaconda3/lib/python3.6/site-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.\n", + " if np.issubdtype(vec.dtype, np.int):\n" + ] + }, + { + "data": { + "text/plain": [ + "[('#prayforparis', 0.7739624381065369),\n", + " ('#prayforgaza', 0.674716055393219),\n", + " ('#prayforiraq', 0.6647266149520874),\n", + " ('#prayforsyria', 0.6583042144775391),\n", + " ('#prayforjakarta', 0.6565165519714355),\n", + " ('#prayfororlando', 0.6531145572662354),\n", + " ('#prayforvenezuela', 0.6508985757827759),\n", + " ('#prayforlebanon', 0.6483551263809204),\n", + " ('#prayforpalestine', 0.6478559970855713),\n", + " ('#prayforpalestina', 0.6470707654953003)]" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model_gensim.wv.most_similar(normalize(\"pray4paris\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/scratch/clb617/anaconda3/lib/python3.6/site-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. 
In future, it will be treated as `np.int64 == np.dtype(int).type`.\n", + " if np.issubdtype(vec.dtype, np.int):\n" + ] + }, + { + "data": { + "text/plain": [ + "[('earthquake', 0.917615532875061),\n", + " ('quake-hit', 0.9108307361602783),\n", + " ('cupquake', 0.9038751721382141),\n", + " ('earthquake_rt', 0.8828902244567871),\n", + " ('#earthquake', 0.8807302117347717),\n", + " ('#quake', 0.869563102722168),\n", + " ('earthquakes', 0.8631203174591064),\n", + " ('#nepalearthquake', 0.8421447277069092),\n", + " ('earthqua', 0.830664336681366),\n", + " ('#earthquakeph', 0.8282575011253357)]" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model_gensim.wv.most_similar(normalize(\"quake\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "test_sentences = [\n", + " \"19 dead, 50 injured in #ManchesterArena explosion. Being treated as a terror incident. http://www.bbc.co.uk/news/live/uk-england-manchester-40007967 …\",\n", + " \"EXPLOSION AT MANCHESTER ARENA AND EVERYONE RAN OUT SO SCARY😭\",\n", + " \"Watch live coverage: #ManchesterArena explosion at Ariana Grande concert:\",\n", + " \"Greater Manchester Police says the number of people killed in the Manchester attack has risen to 22\",\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "19 dead, 50 injured in #ManchesterArena explosion. Being treated as a terror incident. http://www.bbc.co.uk/news/live/uk-england-manchester-40007967 …\n", + "('terrestrial', 0.8943253755569458)\n", + "('interceding', 0.8877480030059814)\n", + "('intercostal', 0.8862031698226929)\n", + "('marauding', 0.8860480785369873)\n", + "('indentured', 0.8854652643203735)\n", + "('intercontinental', 0.8847228288650513)\n", + "('sacramentonews', 0.8833929300308228)\n", + "('severing', 0.8816179633140564)\n", + "('derailment', 0.8803422451019287)\n", + "('fouroneactionnews', 0.8803269863128662)\n", + "---\n", + "EXPLOSION AT MANCHESTER ARENA AND EVERYONE RAN OUT SO SCARY😭\n", + "('groundbreaking', 0.8508864045143127)\n", + "('housebreaking', 0.8507399559020996)\n", + "('andover', 0.8389917612075806)\n", + "('ground-breaking', 0.8367241024971008)\n", + "('seaking', 0.8314098119735718)\n", + "('backbreaking', 0.8305997848510742)\n", + "('#groundbreaking', 0.8290548324584961)\n", + "('westinghouse', 0.8275308609008789)\n", + "('creaking', 0.8262030482292175)\n", + "('lansing', 0.8249188661575317)\n", + "---\n", + "Watch live coverage: #ManchesterArena explosion at Ariana Grande concert:\n", + "('performing', 0.855387806892395)\n", + "('outperforming', 0.8498433828353882)\n", + "('live-streaming', 0.8489073514938354)\n", + "('underperforming', 0.8463796377182007)\n", + "('high-performing', 0.8437735438346863)\n", + "('livestreaming', 0.8407323360443115)\n", + "('chantecaille', 0.8391469717025757)\n", + "('perfoming', 0.834829568862915)\n", + "('livening', 0.8326269388198853)\n", + "('performence', 0.8307388424873352)\n", + "---\n", + "Greater Manchester Police says the number of people killed in the Manchester attack has risen to 22\n", + "('worcester', 0.8883395195007324)\n", + "('undistorted', 0.8783309459686279)\n", + "('disenfranchising', 0.8776552081108093)\n", + "('sacramentonews', 0.8775242567062378)\n", + 
"('devestating', 0.876780092716217)\n", + "('interceding', 0.8760107755661011)\n", + "('marauding', 0.8759708404541016)\n", + "('uniden', 0.875922441482544)\n", + "('persepolis', 0.8755065202713013)\n", + "('suster', 0.8751187324523926)\n", + "---\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/scratch/clb617/anaconda3/lib/python3.6/site-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.\n", + " if np.issubdtype(vec.dtype, np.int):\n" + ] + } + ], + "source": [ + "def vectorize(sentence):\n", + " tokenized = [normalize(t) for t in analyzer(sentence)]\n", + " \n", + " wvs = []\n", + " for t in tokenized:\n", + " v = model_gensim.wv[t]\n", + " norm = np.linalg.norm(v)\n", + " normed_v = v / norm\n", + " wvs.append(normed_v)\n", + " \n", + " m = np.array(wvs)\n", + " normed_m = np.mean(m, axis=0)\n", + "\n", + " return normed_m\n", + "\n", + "for s in test_sentences:\n", + " sv = vectorize(s)\n", + " print(s)\n", + " for tup in model_gensim.wv.similar_by_vector(sv):\n", + " print(tup)\n", + " print(\"---\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.0" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/featurization/.ipynb_checkpoints/VectorizerBuilder-checkpoint.ipynb b/featurization/.ipynb_checkpoints/VectorizerBuilder-checkpoint.ipynb new file mode 100644 index 0000000..31d75a5 --- /dev/null +++ b/featurization/.ipynb_checkpoints/VectorizerBuilder-checkpoint.ipynb @@ -0,0 +1,414 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "%matplotlib inline\n", + "\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import string\n", + "import gzip\n", + "import json\n", + "import re" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import scipy\n", + "from scipy import interpolate" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import glob" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/clb617/.local/lib/python3.6/site-packages/sklearn/ensemble/weight_boosting.py:29: DeprecationWarning: numpy.core.umath_tests is an internal NumPy module and should not be imported. 
It will be removed in a future NumPy release.\n", + " from numpy.core.umath_tests import inner1d\n" + ] + } + ], + "source": [ + "import sklearn.cluster\n", + "import sklearn.feature_extraction \n", + "import sklearn.feature_extraction.text\n", + "import sklearn.metrics\n", + "import sklearn.preprocessing\n", + "\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.externals import joblib\n", + "from sklearn.feature_selection import SelectFromModel\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.metrics import accuracy_score\n", + "from sklearn.metrics import auc\n", + "from sklearn.metrics import average_precision_score\n", + "from sklearn.metrics import confusion_matrix\n", + "from sklearn.metrics import f1_score\n", + "from sklearn.metrics import precision_recall_curve\n", + "from sklearn.model_selection import RandomizedSearchCV\n", + "from sklearn.model_selection import StratifiedKFold\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.multiclass import OneVsRestClassifier\n", + "from sklearn.naive_bayes import BernoulliNB\n", + "from sklearn.naive_bayes import MultinomialNB\n", + "from sklearn.pipeline import FeatureUnion\n", + "from sklearn.pipeline import Pipeline\n", + "from sklearn.semi_supervised import LabelPropagation\n", + "from sklearn.semi_supervised import LabelSpreading\n", + "from sklearn.svm import LinearSVC\n", + "from sklearn.svm import SVC\n", + "from sklearn.tree import DecisionTreeClassifier" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[nltk_data] Downloading package stopwords to /home/clb617/nltk_data...\n", + "[nltk_data] Package stopwords is already up-to-date!\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/clb617/.local/lib/python3.6/site-packages/nltk/twitter/__init__.py:20: UserWarning: The twython library has not been installed. Some functionality from the twitter package will not be available.\n", + " warnings.warn(\"The twython library has not been installed. 
\"\n" + ] + } + ], + "source": [ + "import nltk\n", + "nltk.download(\"stopwords\")\n", + "from nltk.corpus import stopwords\n", + "\n", + "from nltk.tokenize import TweetTokenizer\n", + "from nltk.sentiment.vader import SentimentIntensityAnalyzer as VS" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# But first, read in stopwrods\n", + "enStop = stopwords.words('english')\n", + "\n", + "# Skip stop words, retweet signs, @ symbols, and URL headers\n", + "stopList = enStop +\\\n", + " [\"http\", \"https\", \"rt\", \"@\", \":\", \"t.co\", \"co\", \"amp\", \"&\", \"...\", \"\\n\", \"\\r\"]\n", + "stopList.extend(string.punctuation)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# def tokenizer_wrapper(text):\n", + "# return [t.lemma_ for t in nlp(text)]\n", + "\n", + "local_tokenizer = TweetTokenizer()\n", + "def tokenizer_wrapper(text):\n", + " return local_tokenizer.tokenize(text)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(\n", + " tokenizer=tokenizer_wrapper,\n", + " ngram_range=(1, 2),\n", + " stop_words=stopList, #We do better when we keep stopwords\n", + " use_idf=True,\n", + " smooth_idf=False,\n", + " norm=None, #Applies l2 norm smoothing\n", + " decode_error='replace',\n", + " max_features=10000,\n", + " min_df=4,\n", + " max_df=0.501\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "tweet_texts = []\n", + "\n", + "for f in glob.glob(\"tweet_random_subset_2013to2016_v2/part*.gz\"):\n", + " flag = False\n", + " with gzip.open(f, \"r\") as in_file:\n", + " for line_bytes in in_file:\n", + " line = line_bytes.decode(\"utf8\")\n", + "\n", + " if ( len(line.strip()) == 0 ):\n", + " continue\n", + " \n", + " tweet = json.loads(line)\n", + " text = tweet[\"text\"]\n", + "\n", + " tweet_texts.append(text)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tweet Count: 11715393\n" + ] + } + ], + "source": [ + "print(\"Tweet Count:\", len(tweet_texts))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "trec_tweets = []\n", + "with open(\"rehydrated_tweets.json\", \"r\") as in_file:\n", + " for line in in_file:\n", + " tweet = json.loads(line)\n", + " trec_tweets.append(tweet[\"text\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + 
"all_texts = trec_tweets + tweet_texts" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vectorizer.fit(all_texts)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "joblib.dump(vectorizer, \"2013to2016_tfidf_vectorizer_20190109.pkl\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vocab = {v:i for i, v in enumerate(vectorizer.get_feature_names())}\n", + "idf_vals = vectorizer.idf_\n", + "idf_dict = {i:idf_vals[i] for i in vocab.values()} #keys are indices; values are IDF scores" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dictionary Word Count: 10000\n", + "['🚨', '🚨 🚨', '🚫', '🚮', '🚶', '🤑', '🤔', '🤔 🤔', '🤗', '🤘']\n" + ] + } + ], + "source": [ + "print(\"Dictionary Word Count:\", len(vocab))\n", + "print([x[0] for x in vocab.items()][-10:])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Finished\n" + ] + } + ], + "source": [ + "print(\"Finished\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.0" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/featurization/FastTextBuilder.ipynb b/featurization/FastTextBuilder.ipynb new file mode 100644 index 0000000..d1b8e20 --- /dev/null +++ b/featurization/FastTextBuilder.ipynb @@ -0,0 +1,729 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "%matplotlib inline\n", + "\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import string\n", + "import gzip\n", + "import json\n", + "import re\n", + "import glob" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "import scipy\n", + "from scipy import interpolate" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "import sklearn.cluster\n", + "import sklearn.feature_extraction \n", + "import sklearn.feature_extraction.text\n", + "import sklearn.metrics\n", + "import sklearn.preprocessing\n", + "\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.externals import joblib\n", + "from sklearn.feature_selection import SelectFromModel\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.metrics import accuracy_score\n", + "from sklearn.metrics import auc\n", + "from 
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[nltk_data] Downloading package stopwords to /home/clb617/nltk_data...\n",
+      "[nltk_data] Package stopwords is already up-to-date!\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/scratch/clb617/anaconda3/lib/python3.6/site-packages/nltk/twitter/__init__.py:20: UserWarning: The twython library has not been installed. Some functionality from the twitter package will not be available.\n",
+      "  warnings.warn(\"The twython library has not been installed. \"\n"
+     ]
+    }
+   ],
+   "source": [
+    "import nltk\n",
+    "nltk.download(\"stopwords\")\n",
+    "from nltk.corpus import stopwords\n",
+    "\n",
+    "from nltk.tokenize import TweetTokenizer\n",
+    "from nltk.sentiment.vader import SentimentIntensityAnalyzer as VS"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from gensim.models.fasttext import FastText"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Skip retweet signs, @ symbols, URL fragments, and punctuation\n",
+    "stopList = [\"http\", \"https\", \"rt\", \"@\", \":\", \"t.co\", \"co\", \"amp\", \"&\", \"...\", \"\\n\", \"\\r\"]\n",
+    "stopList.extend(string.punctuation)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# def tokenizer_wrapper(text):\n",
+    "#     return [t.lemma_ for t in nlp(text)]\n",
+    "\n",
+    "local_tokenizer = TweetTokenizer()\n",
+    "def tokenizer_wrapper(text):\n",
+    "    return local_tokenizer.tokenize(text)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(\n",
+    "    tokenizer=tokenizer_wrapper,\n",
+    "    ngram_range=(1, 1),\n",
+    "    stop_words=stopList, # Twitter artifacts and punctuation only; we do better keeping English stopwords\n",
+    "    use_idf=True,\n",
+    "    smooth_idf=False,\n",
+    "    norm=None, # No document-length normalization is applied\n",
+    "    decode_error='replace',\n",
+    "    max_features=10000,\n",
+    "    min_df=4,\n",
+    "    max_df=0.501\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "analyzer = vectorizer.build_analyzer()"
+   ]
+  },
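+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Illustrative sanity check (not part of the original run): the analyzer\n",
+    "# lowercases, applies TweetTokenizer, and drops the stopList entries\n",
+    "analyzer(\"RT @user: Flooding in the city! https://t.co/xyz\")"
+   ]
+  },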
"metadata": {}, + "outputs": [], + "source": [ + "def normalize(s):\n", + " \"\"\"\n", + " Given a text, cleans and normalizes it. Feel free to add your own stuff.\n", + " From: https://www.kaggle.com/mschumacher/using-fasttext-models-for-robust-embeddings\n", + " \"\"\"\n", + " s = s.lower()\n", + "\n", + " # Replace numbers and symbols with language\n", + " s = s.replace('&', ' and ')\n", + " s = s.replace('@', ' at ')\n", + " s = s.replace('0', 'zero')\n", + " s = s.replace('1', 'one')\n", + " s = s.replace('2', 'two')\n", + " s = s.replace('3', 'three')\n", + " s = s.replace('4', 'four')\n", + " s = s.replace('5', 'five')\n", + " s = s.replace('6', 'six')\n", + " s = s.replace('7', 'seven')\n", + " s = s.replace('8', 'eight')\n", + " s = s.replace('9', 'nine')\n", + "\n", + " return s" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "trec_tweets = []\n", + "tweet_texts = []" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "with open(\"rehydrated_tweets.json\", \"r\") as in_file:\n", + " for line in in_file:\n", + " tweet = json.loads(line)\n", + " trec_tweets.append(tweet[\"text\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "11717671\n" + ] + } + ], + "source": [ + "# for f in glob.glob(\"/home/clb617/scratch/projects/trecis_proj/data/tweet_random_subset_2013to2016_v2/part*.gz\"):\n", + "# with gzip.open(f, \"r\") as in_file:\n", + "# for line_bytes in in_file:\n", + "# line = line_bytes.decode(\"utf8\")\n", + "# tweet = json.loads(line)\n", + "# if ( tweet[\"lang\"] == \"en\" and \"retweeted_status\" not in tweet ):\n", + "# tweet_texts.append([\"text\"])\n", + "\n", + "# with gzip.open(\"/home/clb617/scratch/projects/trecis_proj/data/english_2015_sample_1m.json.gz\", \"r\") as in_file:\n", + "# with gzip.open(\"/home/clb617/scratch/projects/trecis_proj/data/text_sample_2015.json.gz\", \"r\") as in_file:\n", + "# for line_bytes in in_file:\n", + "# line = line_bytes.decode(\"utf8\")\n", + "# tweet_texts.append(json.loads(line)[\"text\"])\n", + "\n", + "def jsonstr2tweet(jstr):\n", + " try:\n", + " tweet = json.loads(jstr)\n", + " if ( tweet[\"lang\"] == \"en\" and \"retweeted_status\" not in tweet ):\n", + " return tweet\n", + " else:\n", + " return None\n", + " except:\n", + " return None\n", + "\n", + "rdd = sc.textFile(\"tweet_random_subset_2013to2016_v2/\")\n", + "tweet_texts = rdd.map(jsonstr2tweet).filter(lambda x : x != None).map(lambda x: x[\"text\"]).collect()\n", + "\n", + "# [[normalize(t) for t in analyzer(s)] for s in all_texts]\n", + "\n", + "print(len(tweet_texts))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "all_texts = trec_tweets + tweet_texts" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + 
"outputs": [ + { + "data": { + "text/plain": [ + "'prayfourparis'" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "normalize(\"pray4paris\")" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "cleaned_text = [[normalize(t) for t in analyzer(s)] for s in all_texts]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "FastText(vocab=368359, size=200, alpha=0.025)\n" + ] + } + ], + "source": [ + "model_gensim = FastText(\n", + " size=200,\n", + " min_count=5,\n", + " window=10\n", + ")\n", + "\n", + "# build the vocabulary\n", + "model_gensim.build_vocab(cleaned_text)\n", + "\n", + "# train the model\n", + "model_gensim.train(cleaned_text, total_examples=model_gensim.corpus_count, epochs=model_gensim.epochs)\n", + "\n", + "print(model_gensim)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "model_gensim.save('text_sample_2013to2016_gensim_200.model')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([-0.00251447, 0.22377786, 0.98355085, -0.7715473 , -0.13788459,\n", + " 0.90395945, -0.44666514, -0.42506534, -0.54950446, 1.0771613 ,\n", + " 1.2118708 , 0.34573835, -1.12505 , -1.1896925 , 0.10617373,\n", + " 0.14834636, -0.63678473, 0.5306245 , 0.91713506, 0.78559077,\n", + " 1.677414 , -0.13240121, -0.30925515, -0.3110578 , 0.5169658 ,\n", + " 0.5799523 , -1.0073831 , 0.80370396, -0.40828082, 0.15891187,\n", + " 0.05013 , -0.9099712 , 0.30009922, 1.1096807 , -0.31454656,\n", + " 1.8045816 , 0.31973043, -0.5810332 , -0.05613879, 0.8439649 ,\n", + " -0.5796422 , 0.09544377, 0.5595102 , 0.29849023, -0.5070656 ,\n", + " -0.33554476, 0.2724658 , 0.3301648 , 0.03812361, 1.280918 ,\n", + " -0.8932892 , 1.0181543 , 0.6297721 , -0.79178953, -0.4830722 ,\n", + " 1.900083 , -0.97206956, -1.231961 , 0.519141 , -1.3692759 ,\n", + " 0.14483038, -0.81043893, -0.7811998 , -0.5656443 , 0.68394303,\n", + " -0.00412971, 1.8282131 , 0.38563082, -0.14956062, -0.15799755,\n", + " 0.8279126 , 1.1851251 , -0.60868716, -1.1392959 , 0.01907011,\n", + " -0.1993565 , 0.08864743, 0.73447526, 1.1220739 , 0.15387197,\n", + " -0.23781261, 0.35393322, 0.5229472 , 1.1374369 , 1.1848328 ,\n", + " 1.3268511 , -1.0447361 , 1.595999 , 0.66512877, 0.7180348 ,\n", + " -0.75844324, 0.05856895, -0.61785024, -0.270561 , -1.0492203 ,\n", + " -0.9660757 , 0.13246736, 0.8860084 , 0.04582638, 0.05656901,\n", + " 0.06524241, 0.10691343, -0.40181476, 0.13881405, -1.3615162 ,\n", + " -0.10067749, 0.97370344, -0.9406569 , 0.42231107, 0.505285 ,\n", + " -0.40041313, -0.03177388, 0.8784866 , -0.33346197, 0.0926585 ,\n", + " -0.6843044 , -1.0193583 , -0.34783173, -0.23892027, 0.18805595,\n", + " 0.14845106, 0.14841844, 0.85369676, 0.54138273, 1.148289 ,\n", + " -0.6931565 , -0.80891985, 0.33537978, 0.1751958 , 1.3862312 ,\n", + " -0.51982635, -0.2999697 , 0.12558676, -0.05539849, -0.16308819,\n", + " 0.6570933 , -1.5081487 , 1.197739 , -0.8126156 , 0.74745417,\n", + " -0.943481 , -1.6323334 , 0.29812822, -0.14741744, 0.1782602 ,\n", + " 0.9578688 , 
+       " 0.8501329 , 1.406546 , 0.6848001 , 0.95838404, 0.35591075,\n",
+       " 1.7670783 , 0.35993043, -0.16310644, -0.23210363, -0.26639834,\n",
+       " 0.14959098, 0.51991194, -0.30781794, 0.61597764, 0.08290225,\n",
+       " -0.5098869 , -1.0988191 , -0.04218112, 1.3670919 , 0.5600427 ,\n",
+       " -0.650471 , 0.48897344, -0.14917164, -0.09997601, -0.07794272,\n",
+       " 0.7474547 , -0.11964303, 0.7395969 , -0.2718192 , -0.71038485,\n",
+       " -0.66098285, 0.6040508 , 0.14715323, -0.7527266 , -0.31431022,\n",
+       " 0.75994515, 0.44979033, 0.8749855 , 0.24492699, 0.29419214,\n",
+       " 0.5749436 , 0.2710817 , 0.3514359 , -0.48893654, 0.18267944,\n",
+       " 0.10806478, -0.46188998, 0.04687883, 0.11950479, 0.4838739 ],\n",
+       " dtype=float32)"
+      ]
+     },
+     "execution_count": 21,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "model_gensim.wv[normalize(\"pray4paris\")]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/scratch/clb617/anaconda3/lib/python3.6/site-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.\n",
+      "  if np.issubdtype(vec.dtype, np.int):\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "[('#prayforparis', 0.7384930849075317),\n",
+       " ('paris', 0.6991567611694336),\n",
+       " ('#prayersforparis', 0.6977229118347168),\n",
+       " ('#prayforjakarta', 0.6635531187057495),\n",
+       " ('#prayersfourparis', 0.65708327293396),\n",
+       " ('vogueparis', 0.6510902643203735),\n",
+       " ('pragya', 0.6476349234580994),\n",
+       " ('#paris', 0.6376635432243347),\n",
+       " ('#prayforbangkok', 0.6372296810150146),\n",
+       " ('#jesuisparis', 0.6351719498634338)]"
+      ]
+     },
+     "execution_count": 22,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "model_gensim.wv.most_similar(\"pray4paris\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/scratch/clb617/anaconda3/lib/python3.6/site-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.\n",
+      "  if np.issubdtype(vec.dtype, np.int):\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "[('#prayforparis', 0.7739624381065369),\n",
+       " ('#prayforgaza', 0.674716055393219),\n",
+       " ('#prayforiraq', 0.6647266149520874),\n",
+       " ('#prayforsyria', 0.6583042144775391),\n",
+       " ('#prayforjakarta', 0.6565165519714355),\n",
+       " ('#prayfororlando', 0.6531145572662354),\n",
+       " ('#prayforvenezuela', 0.6508985757827759),\n",
+       " ('#prayforlebanon', 0.6483551263809204),\n",
+       " ('#prayforpalestine', 0.6478559970855713),\n",
+       " ('#prayforpalestina', 0.6470707654953003)]"
+      ]
+     },
+     "execution_count": 23,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "model_gensim.wv.most_similar(normalize(\"pray4paris\"))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/scratch/clb617/anaconda3/lib/python3.6/site-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.\n",
+      "  if np.issubdtype(vec.dtype, np.int):\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "[('earthquake', 0.917615532875061),\n",
+       " ('quake-hit', 0.9108307361602783),\n",
+       " ('cupquake', 0.9038751721382141),\n",
+       " ('earthquake_rt', 0.8828902244567871),\n",
+       " ('#earthquake', 0.8807302117347717),\n",
+       " ('#quake', 0.869563102722168),\n",
+       " ('earthquakes', 0.8631203174591064),\n",
+       " ('#nepalearthquake', 0.8421447277069092),\n",
+       " ('earthqua', 0.830664336681366),\n",
+       " ('#earthquakeph', 0.8282575011253357)]"
+      ]
+     },
+     "execution_count": 24,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "model_gensim.wv.most_similar(normalize(\"quake\"))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "test_sentences = [\n",
+    "    \"19 dead, 50 injured in #ManchesterArena explosion. Being treated as a terror incident. http://www.bbc.co.uk/news/live/uk-england-manchester-40007967 …\",\n",
+    "    \"EXPLOSION AT MANCHESTER ARENA AND EVERYONE RAN OUT SO SCARY😭\",\n",
+    "    \"Watch live coverage: #ManchesterArena explosion at Ariana Grande concert:\",\n",
+    "    \"Greater Manchester Police says the number of people killed in the Manchester attack has risen to 22\",\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "19 dead, 50 injured in #ManchesterArena explosion. Being treated as a terror incident. http://www.bbc.co.uk/news/live/uk-england-manchester-40007967 …\n",
+      "('terrestrial', 0.8943253755569458)\n",
+      "('interceding', 0.8877480030059814)\n",
+      "('intercostal', 0.8862031698226929)\n",
+      "('marauding', 0.8860480785369873)\n",
+      "('indentured', 0.8854652643203735)\n",
+      "('intercontinental', 0.8847228288650513)\n",
+      "('sacramentonews', 0.8833929300308228)\n",
+      "('severing', 0.8816179633140564)\n",
+      "('derailment', 0.8803422451019287)\n",
+      "('fouroneactionnews', 0.8803269863128662)\n",
+      "---\n",
+      "EXPLOSION AT MANCHESTER ARENA AND EVERYONE RAN OUT SO SCARY😭\n",
+      "('groundbreaking', 0.8508864045143127)\n",
+      "('housebreaking', 0.8507399559020996)\n",
+      "('andover', 0.8389917612075806)\n",
+      "('ground-breaking', 0.8367241024971008)\n",
+      "('seaking', 0.8314098119735718)\n",
+      "('backbreaking', 0.8305997848510742)\n",
+      "('#groundbreaking', 0.8290548324584961)\n",
+      "('westinghouse', 0.8275308609008789)\n",
+      "('creaking', 0.8262030482292175)\n",
+      "('lansing', 0.8249188661575317)\n",
+      "---\n",
+      "Watch live coverage: #ManchesterArena explosion at Ariana Grande concert:\n",
+      "('performing', 0.855387806892395)\n",
+      "('outperforming', 0.8498433828353882)\n",
+      "('live-streaming', 0.8489073514938354)\n",
+      "('underperforming', 0.8463796377182007)\n",
+      "('high-performing', 0.8437735438346863)\n",
+      "('livestreaming', 0.8407323360443115)\n",
+      "('chantecaille', 0.8391469717025757)\n",
+      "('perfoming', 0.834829568862915)\n",
+      "('livening', 0.8326269388198853)\n",
+      "('performence', 0.8307388424873352)\n",
+      "---\n",
+      "Greater Manchester Police says the number of people killed in the Manchester attack has risen to 22\n",
+      "('worcester', 0.8883395195007324)\n",
+      "('undistorted', 0.8783309459686279)\n",
+      "('disenfranchising', 0.8776552081108093)\n",
+      "('sacramentonews', 0.8775242567062378)\n",
+      "('devestating', 0.876780092716217)\n",
+      "('interceding', 0.8760107755661011)\n",
+      "('marauding', 0.8759708404541016)\n",
+      "('uniden', 0.875922441482544)\n",
+      "('persepolis', 0.8755065202713013)\n",
+      "('suster', 0.8751187324523926)\n",
+      "---\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/scratch/clb617/anaconda3/lib/python3.6/site-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.\n",
+      "  if np.issubdtype(vec.dtype, np.int):\n"
+     ]
+    }
+   ],
+   "source": [
+    "def vectorize(sentence):\n",
+    "    # Embed a sentence as the mean of its L2-normalized token vectors\n",
+    "    tokenized = [normalize(t) for t in analyzer(sentence)]\n",
+    "    \n",
+    "    wvs = []\n",
+    "    for t in tokenized:\n",
+    "        v = model_gensim.wv[t]\n",
+    "        # Scale each token vector to unit length before averaging\n",
+    "        norm = np.linalg.norm(v)\n",
+    "        normed_v = v / norm\n",
+    "        wvs.append(normed_v)\n",
+    "    \n",
+    "    m = np.array(wvs)\n",
+    "    # The sentence vector is the element-wise mean over token vectors\n",
+    "    normed_m = np.mean(m, axis=0)\n",
+    "\n",
+    "    return normed_m\n",
+    "\n",
+    "for s in test_sentences:\n",
+    "    sv = vectorize(s)\n",
+    "    print(s)\n",
+    "    for tup in model_gensim.wv.similar_by_vector(sv):\n",
+    "        print(tup)\n",
+    "    print(\"---\")"
+   ]
+  },
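+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Illustrative (not executed here): the saved model can be reloaded later with\n",
+    "# model_gensim = FastText.load('text_sample_2013to2016_gensim_200.model')"
+   ]
+  },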
"('devestating', 0.876780092716217)\n", + "('interceding', 0.8760107755661011)\n", + "('marauding', 0.8759708404541016)\n", + "('uniden', 0.875922441482544)\n", + "('persepolis', 0.8755065202713013)\n", + "('suster', 0.8751187324523926)\n", + "---\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/scratch/clb617/anaconda3/lib/python3.6/site-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.\n", + " if np.issubdtype(vec.dtype, np.int):\n" + ] + } + ], + "source": [ + "def vectorize(sentence):\n", + " tokenized = [normalize(t) for t in analyzer(sentence)]\n", + " \n", + " wvs = []\n", + " for t in tokenized:\n", + " v = model_gensim.wv[t]\n", + " norm = np.linalg.norm(v)\n", + " normed_v = v / norm\n", + " wvs.append(normed_v)\n", + " \n", + " m = np.array(wvs)\n", + " normed_m = np.mean(m, axis=0)\n", + "\n", + " return normed_m\n", + "\n", + "for s in test_sentences:\n", + " sv = vectorize(s)\n", + " print(s)\n", + " for tup in model_gensim.wv.similar_by_vector(sv):\n", + " print(tup)\n", + " print(\"---\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.0" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/featurization/VectorizerBuilder.ipynb b/featurization/VectorizerBuilder.ipynb new file mode 100644 index 0000000..31d75a5 --- /dev/null +++ b/featurization/VectorizerBuilder.ipynb @@ -0,0 +1,414 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "%matplotlib inline\n", + "\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import string\n", + "import gzip\n", + "import json\n", + "import re" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import scipy\n", + "from scipy import interpolate" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import glob" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/clb617/.local/lib/python3.6/site-packages/sklearn/ensemble/weight_boosting.py:29: DeprecationWarning: numpy.core.umath_tests is an internal NumPy module and should not be imported. 
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/clb617/.local/lib/python3.6/site-packages/sklearn/ensemble/weight_boosting.py:29: DeprecationWarning: numpy.core.umath_tests is an internal NumPy module and should not be imported. It will be removed in a future NumPy release.\n",
+      "  from numpy.core.umath_tests import inner1d\n"
+     ]
+    }
+   ],
+   "source": [
+    "import sklearn.cluster\n",
+    "import sklearn.feature_extraction\n",
+    "import sklearn.feature_extraction.text\n",
+    "import sklearn.metrics\n",
+    "import sklearn.preprocessing\n",
+    "\n",
+    "from sklearn.ensemble import RandomForestClassifier\n",
+    "from sklearn.externals import joblib\n",
+    "from sklearn.feature_selection import SelectFromModel\n",
+    "from sklearn.linear_model import LogisticRegression\n",
+    "from sklearn.metrics import accuracy_score\n",
+    "from sklearn.metrics import auc\n",
+    "from sklearn.metrics import average_precision_score\n",
+    "from sklearn.metrics import confusion_matrix\n",
+    "from sklearn.metrics import f1_score\n",
+    "from sklearn.metrics import precision_recall_curve\n",
+    "from sklearn.model_selection import RandomizedSearchCV\n",
+    "from sklearn.model_selection import StratifiedKFold\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.multiclass import OneVsRestClassifier\n",
+    "from sklearn.naive_bayes import BernoulliNB\n",
+    "from sklearn.naive_bayes import MultinomialNB\n",
+    "from sklearn.pipeline import FeatureUnion\n",
+    "from sklearn.pipeline import Pipeline\n",
+    "from sklearn.semi_supervised import LabelPropagation\n",
+    "from sklearn.semi_supervised import LabelSpreading\n",
+    "from sklearn.svm import LinearSVC\n",
+    "from sklearn.svm import SVC\n",
+    "from sklearn.tree import DecisionTreeClassifier"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[nltk_data] Downloading package stopwords to /home/clb617/nltk_data...\n",
+      "[nltk_data] Package stopwords is already up-to-date!\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/clb617/.local/lib/python3.6/site-packages/nltk/twitter/__init__.py:20: UserWarning: The twython library has not been installed. Some functionality from the twitter package will not be available.\n",
+      "  warnings.warn(\"The twython library has not been installed. \"\n"
+     ]
+    }
+   ],
+   "source": [
+    "import nltk\n",
+    "nltk.download(\"stopwords\")\n",
+    "from nltk.corpus import stopwords\n",
+    "\n",
+    "from nltk.tokenize import TweetTokenizer\n",
+    "from nltk.sentiment.vader import SentimentIntensityAnalyzer as VS"
+   ]
+  },
\"\n" + ] + } + ], + "source": [ + "import nltk\n", + "nltk.download(\"stopwords\")\n", + "from nltk.corpus import stopwords\n", + "\n", + "from nltk.tokenize import TweetTokenizer\n", + "from nltk.sentiment.vader import SentimentIntensityAnalyzer as VS" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# But first, read in stopwrods\n", + "enStop = stopwords.words('english')\n", + "\n", + "# Skip stop words, retweet signs, @ symbols, and URL headers\n", + "stopList = enStop +\\\n", + " [\"http\", \"https\", \"rt\", \"@\", \":\", \"t.co\", \"co\", \"amp\", \"&\", \"...\", \"\\n\", \"\\r\"]\n", + "stopList.extend(string.punctuation)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# def tokenizer_wrapper(text):\n", + "# return [t.lemma_ for t in nlp(text)]\n", + "\n", + "local_tokenizer = TweetTokenizer()\n", + "def tokenizer_wrapper(text):\n", + " return local_tokenizer.tokenize(text)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(\n", + " tokenizer=tokenizer_wrapper,\n", + " ngram_range=(1, 2),\n", + " stop_words=stopList, #We do better when we keep stopwords\n", + " use_idf=True,\n", + " smooth_idf=False,\n", + " norm=None, #Applies l2 norm smoothing\n", + " decode_error='replace',\n", + " max_features=10000,\n", + " min_df=4,\n", + " max_df=0.501\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "tweet_texts = []\n", + "\n", + "for f in glob.glob(\"tweet_random_subset_2013to2016_v2/part*.gz\"):\n", + " flag = False\n", + " with gzip.open(f, \"r\") as in_file:\n", + " for line_bytes in in_file:\n", + " line = line_bytes.decode(\"utf8\")\n", + "\n", + " if ( len(line.strip()) == 0 ):\n", + " continue\n", + " \n", + " tweet = json.loads(line)\n", + " text = tweet[\"text\"]\n", + "\n", + " tweet_texts.append(text)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tweet Count: 11715393\n" + ] + } + ], + "source": [ + "print(\"Tweet Count:\", len(tweet_texts))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "trec_tweets = []\n", + "with open(\"rehydrated_tweets.json\", \"r\") as in_file:\n", + " for line in in_file:\n", + " tweet = json.loads(line)\n", + " trec_tweets.append(tweet[\"text\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + 
"all_texts = trec_tweets + tweet_texts" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vectorizer.fit(all_texts)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "joblib.dump(vectorizer, \"2013to2016_tfidf_vectorizer_20190109.pkl\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "vocab = {v:i for i, v in enumerate(vectorizer.get_feature_names())}\n", + "idf_vals = vectorizer.idf_\n", + "idf_dict = {i:idf_vals[i] for i in vocab.values()} #keys are indices; values are IDF scores" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dictionary Word Count: 10000\n", + "['🚨', '🚨 🚨', '🚫', '🚮', '🚶', '🤑', '🤔', '🤔 🤔', '🤗', '🤘']\n" + ] + } + ], + "source": [ + "print(\"Dictionary Word Count:\", len(vocab))\n", + "print([x[0] for x in vocab.items()][-10:])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Finished\n" + ] + } + ], + "source": [ + "print(\"Finished\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.0" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}