From 0f4b45623187e8b649bb9f8d945b92a763ad60a9 Mon Sep 17 00:00:00 2001 From: Isabel Brador <59972166+izzybrador@users.noreply.github.com> Date: Fri, 3 Jan 2025 15:11:23 -0500 Subject: [PATCH] Delete Data Sets/Web Archives/LOC_Election_Dataset_Walkthrough.ipynb Deleting the original US elections data walkthrough notebook; a new one has been created in its place. --- .../LOC_Election_Dataset_Walkthrough.ipynb | 809 ------------------ 1 file changed, 809 deletions(-) delete mode 100644 Data Sets/Web Archives/LOC_Election_Dataset_Walkthrough.ipynb diff --git a/Data Sets/Web Archives/LOC_Election_Dataset_Walkthrough.ipynb b/Data Sets/Web Archives/LOC_Election_Dataset_Walkthrough.ipynb deleted file mode 100644 index 8081dce..0000000 --- a/Data Sets/Web Archives/LOC_Election_Dataset_Walkthrough.ipynb +++ /dev/null @@ -1,809 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Exploring the United States Election Datasets\n", - "\n", - "This notebook demonstrates an intermediate approach to exploring and accessing the United States Election Datasets produced by the Library of Congress.\n", - "\n", - "As an intermediate approach, this notebook uses functions as reusable blocks of code. These functions could potentially be used outside of this notebook to assist in further exploration." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Imports\n", - "Below are the Python packages used in this notebook." 
- ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import requests # Used to make HTTP requests; this is how we will scrape the Wayback\n", - "from bs4 import BeautifulSoup # Used to process the scraped content \n", - "import gzip # Used to decompress the gzipped CDX files\n", - "import pandas as pd # Used to transform the data into dataframes for computation\n", - "from sklearn.feature_extraction.text import CountVectorizer # Used to create a matrix out of a bag of words\n", - "from time import sleep # Used to provide a slight pause between Wayback requests" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Defining Global Variables\n", - "Here we’re defining two global variables that will be used in the functions to gather and scrape the content.\n", - "The represent the base of the URL structure for two of the important web applications we will use: Cloudfront, which is the front end to where the dataset files are location and Wayback, where the actual web resources are accessed." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "CLOUDFRONT_BASE_URL = 'https://d2rxokvmqqcpq7.cloudfront.net/'\n", - "WAYBACK_BASE_URL = 'https://webarchive.loc.gov/all/'" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Functions\n", - "Here we will define the functions in the order that they are used in this notebook. 
" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "def gather_files_from_manifest(year: str):\n", - " \"\"\"\n", - " Function that takes a year (YYYY) as an argument.\n", - " The function collects the locations of the CDX files \n", - " listed by the provided year's manifest.\n", - " \n", - " Args:\n", - " year (str): String of a year YYYY.\n", - "\n", - " Returns:\n", - " :obj:`list` of :obj:`str` of individual CDX file URLs.\n", - " \"\"\"\n", - " \n", - " election_years = [\n", - " \"2000\",\n", - " \"2002\",\n", - " \"2004\",\n", - " \"2006\",\n", - " \"2008\",\n", - " \"2010\",\n", - " \"2012\",\n", - " \"2014\",\n", - " \"2016\",\n", - " \"2018\"\n", - " ]\n", - " \n", - " if year not in election_years:\n", - " return []\n", - " else:\n", - " manifest_url = f\"{CLOUDFRONT_BASE_URL}cdx11-indexes/{year}/el{year[-2:]}-manifest.html\"\n", - " response = requests.get(manifest_url)\n", - " soup = BeautifulSoup(response.content, 'html.parser')\n", - " cdx_files = [link.get('href') for link in soup.find_all('a')]\n", - "\n", - " return cdx_files\n", - "\n", - "def fetch_file(cdx_url: str):\n", - " \"\"\"\n", - " Function that takes a `String` as an argument.\n", - " The `cdx_url` is a singular item from the result\n", - " of the `gather_files_from_manifest` function.\n", - " The function fetches the gzipped CDX file, decompresses it,\n", - " splits it on the newlines, and removes the header. 
\n", - " Args:\n", - " cdx_url (str): Individual item from the result of\n", - " the `gather_files_from_manifest` function.\n", - "\n", - " Returns:\n", - " :obj:`list` of :obj:`str` of individual CDX lines, each representing\n", - " a web object.\n", - " \"\"\"\n", - " response = requests.get(cdx_url)\n", - " \n", - " # Here we decompress the gzipped CDX, decode it, split it on the newline, and remove the header\n", - " cdx_content = gzip.decompress(response.content).decode('utf-8').split('\\n')[1:]\n", - " \n", - " return cdx_content\n", - "\n", - "def create_dataframe(data: list):\n", - " \"\"\"\n", - " Function that takes a :obj:`list` of :obj:`str` as an argument.\n", - " `data` is the contents of the CDX file split on newlines. \n", - " This function takes `data`, applies a schema to it, and transforms it\n", - " into a `pandas.DataFrame`.\n", - " Args:\n", - " data (list): :obj:`list` of :obj:`str`. Each item is a line from\n", - " a CDX file or group of files.\n", - "\n", - " Returns:\n", - " A `pandas.DataFrame` of a CDX file or group of files.\n", - " \"\"\"\n", - " schema = [\n", - " 'urlkey',\n", - " 'timestamp',\n", - " 'original',\n", - " 'mimetype',\n", - " 'statuscode',\n", - " 'digest',\n", - " 'redirect',\n", - " 'metatags',\n", - " 'file_size',\n", - " 'offset',\n", - " 'warc_filename'\n", - " ]\n", - " _data = [row.split() for row in data]\n", - " \n", - " df = pd.DataFrame(_data, columns=schema)\n", - " \n", - " return df\n", - "\n", - "def create_dataframe_from_manifest(manifest: list):\n", - " \"\"\"\n", - " Function that takes a :obj:`list` of :obj:`str` as an argument.\n", - " The `manifest` is a list of all the individual CDX files found\n", - " from an Election year's or group of Election years' HTML manifest.\n", - " This function loops through each file, transforms it into a `pandas.DataFrame`\n", - " by calling the `create_dataframe` function, concats the DataFrames together,\n", - " and then returns the Dataframe representing the 
entire manifest.\n", - " Args:\n", - " manifest (list): :obj:`list` of :obj:`str` of all the individual CDX files found\n", - " from an Election year's or group of Election years' HTML manifest.\n", - "\n", - " Returns:\n", - " `pandas.DataFrame` representing every file present in the `manifest`.\n", - " \"\"\"\n", - " df = pd.DataFrame() \n", - " for index, line in enumerate(manifest[0:2]):\n", - " cdx = fetch_file(line)\n", - " if index == 0:\n", - " df = create_dataframe(cdx)\n", - " else:\n", - " df = pd.concat([df, create_dataframe(cdx)])\n", - " return df\n", - "\n", - "def fetch_text(row: pd.Series):\n", - " \"\"\"\n", - " Function that takes a `pandas.Series`, which is a single row \n", - " from a `pandas.DataFrame`, as an argument.\n", - " The functions uses the timestamp and original fields from the `row`\n", - " to request the specific resource. Once the resource is fetched,\n", - " the Wayback banner div elements are removed so as to not detract from \n", - " the words in the resource itself. \n", - " Args:\n", - " row (pandas.Series): `pandas.Series`, which is a single row \n", - " from a `pandas.DataFrame`.\n", - "\n", - " Returns:\n", - " `String` of the resource's text.\n", - " \"\"\"\n", - " response = requests.get(f\"{WAYBACK_BASE_URL}{row['timestamp']}/{row['original']}\")\n", - " soup = BeautifulSoup(response.content, 'html.parser')\n", - " [el.extract() for el in soup.find_all('div', {'id': 'wm-maximized'})]\n", - " [el.extract() for el in soup.find_all('div', {'id': 'wm-minimized'})]\n", - " return soup.text\n", - "\n", - "def fetch_all_text(df: pd.DataFrame):\n", - " \"\"\"\n", - " Function that takes a `pandas.Dataframe` as an argument.\n", - " This is the most complicated function here. The function first cleans the\n", - " `df` that was passed in by dropping all the rows that do not have a value in the\n", - " mimetype field. Then, it drops all the duplicate digests, which removes resources\n", - " that are exactly the same. 
Finally, it only returns rows that have 'text' in the mimetype\n", - " and have a '200' HTTP status response, meaning the resource was sucessfully capture.\n", - " Once the `df` is cleaned, each resource's text is fetched from the Wayback,\n", - " transformed into a matrix using `sklearn.CountVectorizer`, and then returns a `pandas.DataFrame`\n", - " with words and their occurance per resource. A politeness of 15 seconds is added between Wayback requests.\n", - " Args:\n", - " row (pandas.DataFrame): `pandas.Dataframe` representing web resources as CDX lines.\n", - "\n", - " Returns:\n", - " `pandas.Dataframe` of the resource's words tabulated per web resource.\n", - " \"\"\"\n", - " countvec = CountVectorizer(ngram_range=(1,1), stop_words='english')\n", - " unprocessed_bag_of_words = []\n", - " text_df = df\\\n", - " .dropna(subset=['mimetype'])\\\n", - " .drop_duplicates(subset=['digest'])\\\n", - " .query('mimetype.str.contains(\"text\") and statuscode.str.match(\"200\")', engine='python')\n", - " \n", - " for i, row in text_df.iterrows():\n", - " unprocessed_bag_of_words.append(fetch_text(row))\n", - " sleep(15)\n", - " \n", - " processed_bag_of_words = countvec.fit_transform(unprocessed_bag_of_words)\n", - " \n", - " return pd.DataFrame(processed_bag_of_words.toarray(),columns=countvec.get_feature_names())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Gathering the list of CDX Files\n", - "\n", - "The first step is gathering the list of CDX files. To do that, simply call the `gather_files_from_manifest` function, providing the Election year as an argument." 
- ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "el00_files = gather_files_from_manifest('2000')" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['https://d2rxokvmqqcpq7.cloudfront.net/cdx11-indexes/2000/unique.20010415093936.surt.cdx.gz',\n", - " 'https://d2rxokvmqqcpq7.cloudfront.net/cdx11-indexes/2000/unique.20010415094743.surt.cdx.gz',\n", - " 'https://d2rxokvmqqcpq7.cloudfront.net/cdx11-indexes/2000/unique.20010415095044.surt.cdx.gz',\n", - " 'https://d2rxokvmqqcpq7.cloudfront.net/cdx11-indexes/2000/unique.20010415095244.surt.cdx.gz',\n", - " 'https://d2rxokvmqqcpq7.cloudfront.net/cdx11-indexes/2000/unique.20010415095459.surt.cdx.gz']" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "el00_files[:5]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### CDX File\n", - "Next, we'll demonstrate what a particular CDX File looks like" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "cdx = fetch_file(el00_files[0])" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['com,voter)/home/candidates/info/0,1214,2-11880-,00.html 20001002182124 http://www.voter.com:80/home/candidates/info/0,1214,2-11880-,00.html text/html 200 FYXP43MQC5GVBQMVK3ETWSPXUBR5ICKP - - 5051 149 unique.20010415093936.arc.gz',\n", - " 'com,voter)/home/candidates/info/0,1214,2-18885-,00.html 20001002185814 http://www.voter.com:80/home/candidates/info/0,1214,2-18885-,00.html text/html 200 H6QN5ZULJ6YZP756QNVM3YXKXC7HZUIL - - 4829 5200 unique.20010415093936.arc.gz',\n", - " 'com,voter)/home/candidates/info/0,1214,2-18880-,00.html 20001002185815 http://www.voter.com:80/home/candidates/info/0,1214,2-18880-,00.html text/html 200 
HFG67JI4KBPHFXMQE5DJRHF3OEKKBOO6 - - 4794 10029 unique.20010415093936.arc.gz',\n", - " 'com,voter)/home/officials/general/1,1195,2-2467-,00.html 20001002185815 http://voter.com:80/home/officials/general/1,1195,2-2467-,00.html text/html 200 HZJFLTHZD5MGEPJS2WVGBHQRQUPFBE3O - - 5282 14823 unique.20010415093936.arc.gz',\n", - " 'com,voter)/home/candidates/info/0,1214,2-18886-,00.html 20001002185816 http://www.voter.com:80/home/candidates/info/0,1214,2-18886-,00.html text/html 200 QAM7JW7S4CNYMP6HLA6DASOXTO2SIGWO - - 4823 20105 unique.20010415093936.arc.gz']" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "cdx[:5]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### CDX as a DataFrame\n", - "Now, here is the same CDX transformed into a DataFrame" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "cdx_df = create_dataframe(cdx)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
urlkeytimestamporiginalmimetypestatuscodedigestredirectmetatagsfile_sizeoffsetwarc_filename
0com,voter)/home/candidates/info/0,1214,2-11880...20001002182124http://www.voter.com:80/home/candidates/info/0...text/html200FYXP43MQC5GVBQMVK3ETWSPXUBR5ICKP--5051149unique.20010415093936.arc.gz
1com,voter)/home/candidates/info/0,1214,2-18885...20001002185814http://www.voter.com:80/home/candidates/info/0...text/html200H6QN5ZULJ6YZP756QNVM3YXKXC7HZUIL--48295200unique.20010415093936.arc.gz
2com,voter)/home/candidates/info/0,1214,2-18880...20001002185815http://www.voter.com:80/home/candidates/info/0...text/html200HFG67JI4KBPHFXMQE5DJRHF3OEKKBOO6--479410029unique.20010415093936.arc.gz
3com,voter)/home/officials/general/1,1195,2-246...20001002185815http://voter.com:80/home/officials/general/1,1...text/html200HZJFLTHZD5MGEPJS2WVGBHQRQUPFBE3O--528214823unique.20010415093936.arc.gz
4com,voter)/home/candidates/info/0,1214,2-18886...20001002185816http://www.voter.com:80/home/candidates/info/0...text/html200QAM7JW7S4CNYMP6HLA6DASOXTO2SIGWO--482320105unique.20010415093936.arc.gz
....................................
1096875com,voter)/home/candidates/info/0,1214,2-9118-...20001002183052http://www.voter.com:80/home/candidates/info/0...--3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ--118145323588unique.20010415093936.arc.gz
1096876com,voter)/home/candidates/info/0,1214,2-9115-...20001002183052http://www.voter.com:80/home/candidates/info/0...--3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ--118145323706unique.20010415093936.arc.gz
1096877com,voter)/home/candidates/info/0,1214,2-15361...20001002182249http://www.voter.com:80/home/candidates/info/0...--3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ--119145323824unique.20010415093936.arc.gz
1096878com,voter)/home/candidates/info/0,1214,2-12994...20001002181842http://www.voter.com:80/home/candidates/info/0...text/html404UDSH36NBYWO2X73LNMX2LEHLNQ7FYXHZ--351145323943unique.20010415093936.arc.gz
1096879NoneNoneNoneNoneNoneNoneNoneNoneNoneNoneNone
\n", - "

1096880 rows × 11 columns

\n", - "
" - ], - "text/plain": [ - " urlkey timestamp \\\n", - "0 com,voter)/home/candidates/info/0,1214,2-11880... 20001002182124 \n", - "1 com,voter)/home/candidates/info/0,1214,2-18885... 20001002185814 \n", - "2 com,voter)/home/candidates/info/0,1214,2-18880... 20001002185815 \n", - "3 com,voter)/home/officials/general/1,1195,2-246... 20001002185815 \n", - "4 com,voter)/home/candidates/info/0,1214,2-18886... 20001002185816 \n", - "... ... ... \n", - "1096875 com,voter)/home/candidates/info/0,1214,2-9118-... 20001002183052 \n", - "1096876 com,voter)/home/candidates/info/0,1214,2-9115-... 20001002183052 \n", - "1096877 com,voter)/home/candidates/info/0,1214,2-15361... 20001002182249 \n", - "1096878 com,voter)/home/candidates/info/0,1214,2-12994... 20001002181842 \n", - "1096879 None None \n", - "\n", - " original mimetype \\\n", - "0 http://www.voter.com:80/home/candidates/info/0... text/html \n", - "1 http://www.voter.com:80/home/candidates/info/0... text/html \n", - "2 http://www.voter.com:80/home/candidates/info/0... text/html \n", - "3 http://voter.com:80/home/officials/general/1,1... text/html \n", - "4 http://www.voter.com:80/home/candidates/info/0... text/html \n", - "... ... ... \n", - "1096875 http://www.voter.com:80/home/candidates/info/0... - \n", - "1096876 http://www.voter.com:80/home/candidates/info/0... - \n", - "1096877 http://www.voter.com:80/home/candidates/info/0... - \n", - "1096878 http://www.voter.com:80/home/candidates/info/0... text/html \n", - "1096879 None None \n", - "\n", - " statuscode digest redirect metatags \\\n", - "0 200 FYXP43MQC5GVBQMVK3ETWSPXUBR5ICKP - - \n", - "1 200 H6QN5ZULJ6YZP756QNVM3YXKXC7HZUIL - - \n", - "2 200 HFG67JI4KBPHFXMQE5DJRHF3OEKKBOO6 - - \n", - "3 200 HZJFLTHZD5MGEPJS2WVGBHQRQUPFBE3O - - \n", - "4 200 QAM7JW7S4CNYMP6HLA6DASOXTO2SIGWO - - \n", - "... ... ... ... ... 
\n", - "1096875 - 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - \n", - "1096876 - 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - \n", - "1096877 - 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - \n", - "1096878 404 UDSH36NBYWO2X73LNMX2LEHLNQ7FYXHZ - - \n", - "1096879 None None None None \n", - "\n", - " file_size offset warc_filename \n", - "0 5051 149 unique.20010415093936.arc.gz \n", - "1 4829 5200 unique.20010415093936.arc.gz \n", - "2 4794 10029 unique.20010415093936.arc.gz \n", - "3 5282 14823 unique.20010415093936.arc.gz \n", - "4 4823 20105 unique.20010415093936.arc.gz \n", - "... ... ... ... \n", - "1096875 118 145323588 unique.20010415093936.arc.gz \n", - "1096876 118 145323706 unique.20010415093936.arc.gz \n", - "1096877 119 145323824 unique.20010415093936.arc.gz \n", - "1096878 351 145323943 unique.20010415093936.arc.gz \n", - "1096879 None None None \n", - "\n", - "[1096880 rows x 11 columns]" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "cdx_df" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Election 2000 DataFrame\n", - "Now we'll just create a DataFrame from the whole 2000 Election dataset. To do that, we'll use the `create_dataframe_from_manifest` which loops over the files and calls `create_dataframe` programmatically instead of manually and individually as we did above. " - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "el00_df = create_dataframe_from_manifest(el00_files)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Mimetypes\n", - "For this exercise, we're going to take a brief look at the mimetypes. First, we'll select all the mimetypes in the Dataframe and get their sums by calling `value_counts` which is a method from Pandas. 
" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "el00_mimetypes = el00_df['mimetype'].value_counts()" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "- 1094810\n", - "text/html 6209\n", - "application/pdf 13\n", - "image/gif 11\n", - "image/jpeg 4\n", - "text/plain 3\n", - "application/mac-binhex40 1\n", - "Name: mimetype, dtype: int64" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "el00_mimetypes" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Dropping and Graphing\n", - "You can see from the output above, that the majority of the mimetypes are not listed and show up as `-`. Further research into this could prove interesting. For now, however, we're just going to drop them and graph the remainder." - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "el00_mimetypes.drop(labels=['-']).plot.bar(figsize=(5,5))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Fetching the Text\n", - "Now that we know the majority of the remaining resources in this dataset have a text-based mimetype, we can gather all the text and do some basic analysis. First, we'll fetch all the text from just the first 50 rows. If you have access to a larger machine, you may certainly increase the number or run it across the whole DataFrame." - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "text_df = fetch_all_text(el00_df.head(50))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Top 25 Words\n", - "Now that the text has been fetched, we'll do a simple summation and sorting, displaying the top 25 words from the first 50 rows of the 2000 Election dataset." - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "information 248\n", - "com 150\n", - "voter 150\n", - "terms 121\n", - "contact 119\n", - "state 105\n", - "candidate 95\n", - "service 78\n", - "return 72\n", - "senate 70\n", - "candidates 62\n", - "elected 62\n", - "officials 62\n", - "check 56\n", - "ca 54\n", - "new 53\n", - "capitol 52\n", - "background 52\n", - "general 51\n", - "use 51\n", - "rights 50\n", - "press 50\n", - "2000 50\n", - "privacy 50\n", - "portions 50\n", - "dtype: int64" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "text_df.sum(axis=0).sort_values(ascending=False).head(25)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Next Steps\n", - "Please feel free to use this notebook and the functions found here to help in your exploration of the U.S. 
Election datasets." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.5" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -}