From 6bd679fe4240473d0ad9340082ede98ae50c9a90 Mon Sep 17 00:00:00 2001 From: Brian Foo Date: Thu, 23 Jan 2025 10:34:32 -0500 Subject: [PATCH] Re-build gh pages --- .../LOC_Election_Dataset_Walkthrough.html | 644 +-------------- .../LOC_Election_Dataset_Walkthrough.ipynb | 782 +----------------- searchindex.js | 2 +- 3 files changed, 8 insertions(+), 1420 deletions(-) diff --git a/Data Sets/Web Archives/LOC_Election_Dataset_Walkthrough.html b/Data Sets/Web Archives/LOC_Election_Dataset_Walkthrough.html index dfdcb20..6202991 100644 --- a/Data Sets/Web Archives/LOC_Election_Dataset_Walkthrough.html +++ b/Data Sets/Web Archives/LOC_Election_Dataset_Walkthrough.html @@ -349,9 +349,7 @@ - + @@ -367,29 +365,6 @@

Exploring the United States Election Datasets

@@ -401,590 +376,8 @@


Exploring the United States Election Datasets#


This notebook demonstrates an intermediate approach to exploring and accessing the United States Election Datasets produced by the Library of Congress.


As an intermediate approach, this notebook uses functions as reusable blocks of code. These functions could potentially be used outside of this notebook to assist in further exploration.




Below are the Python packages used in this notebook.

import requests # Used to make HTTP requests; this is how we will scrape the Wayback
-from bs4 import BeautifulSoup # Used to process the scraped content 
-import gzip # Used to decompress the gzipped CDX files
-import pandas as pd # Used to transform the data into dataframes for computation
-from sklearn.feature_extraction.text import CountVectorizer # Used to create a matrix out of a bag of words
-from time import sleep # Used to provide a slight pause between Wayback requests

Defining Global Variables#


Here we’re defining two global variables that will be used in the functions to gather and scrape the content. -They represent the base of the URL structure for two of the important web applications we will use: Cloudfront, which is the front end to where the dataset files are located, and Wayback, where the actual web resources are accessed.




Here we will define the functions in the order that they are used in this notebook.

def gather_files_from_manifest(year: str):
    """
    Collect the CDX file locations listed in an Election year's manifest.

    The manifest is an HTML page; every anchor that carries an ``href``
    is treated as the location of one CDX file.

    Args:
        year (str): String of a year YYYY.
    Returns:
        :obj:`list` of :obj:`str` of individual CDX file URLs. An empty
        list is returned when `year` is not one of the supported
        election years (even years from 2000 through 2018).
    Raises:
        requests.HTTPError: If the manifest request returns an error status.
    """
    # Even years 2000..2018, kept as strings to match the `year` argument.
    election_years = tuple(str(y) for y in range(2000, 2020, 2))
    if year not in election_years:
        return []

    manifest_url = f"{CLOUDFRONT_BASE_URL}cdx11-indexes/{year}/el{year[-2:]}-manifest.html"
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(manifest_url, timeout=60)
    # Fail loudly instead of scraping links out of an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    # href=True skips anchors without an href, which would otherwise
    # show up as None entries in the result.
    return [link['href'] for link in soup.find_all('a', href=True)]
def fetch_file(cdx_url: str):
    """
    Fetch one gzipped CDX file and return its data lines.

    The `cdx_url` is a singular item from the result
    of the `gather_files_from_manifest` function.
    The function fetches the gzipped CDX file, decompresses it,
    splits it on the newlines, removes the header, and drops blank lines.

    Args:
        cdx_url (str): Individual item from the result of
        the `gather_files_from_manifest` function.
    Returns:
        :obj:`list` of :obj:`str` of individual CDX lines, each representing
        a web object.
    Raises:
        requests.HTTPError: If the request returns an error status.
    """
    response = requests.get(cdx_url, timeout=60)
    # Surface HTTP errors here rather than as a confusing gzip error below.
    response.raise_for_status()
    cdx_text = gzip.decompress(response.content).decode('utf-8')
    # The first line is the CDX header. Blank lines (e.g. the empty string
    # left after a trailing newline) are dropped so they do not turn into
    # empty rows downstream.
    return [line for line in cdx_text.split('\n')[1:] if line.strip()]
def create_dataframe(data: list):
    """
    Transform CDX lines into a `pandas.DataFrame`.

    `data` is the contents of the CDX file split on newlines.
    Each line is split on whitespace and the resulting fields are
    labelled with the eleven-column schema below. Blank lines are skipped.

    Args:
        data (list): :obj:`list` of :obj:`str`. Each item is a line from
        a CDX file or group of files.
    Returns:
        A `pandas.DataFrame` of a CDX file or group of files.
    """
    schema = [
        'urlkey',
        'timestamp',
        'original',
        'mimetype',
        'statuscode',
        'digest',
        'redirect',
        'metatags',
        'file_size',
        'offset',
        'warc_filename',
    ]
    # A blank line splits to an empty list, which pandas would turn into
    # a row made entirely of missing values; skip those.
    rows = [fields for fields in (row.split() for row in data) if fields]
    return pd.DataFrame(rows, columns=schema)
def create_dataframe_from_manifest(manifest: list, max_files: "int | None" = 2):
    """
    Build a single `pandas.DataFrame` from the CDX files in a manifest.

    The `manifest` is a list of all the individual CDX files found
    from an Election year's or group of Election years' HTML manifest.
    Each selected file is fetched with `fetch_file`, transformed with
    `create_dataframe`, and the results are concatenated.

    Args:
        manifest (list): :obj:`list` of :obj:`str` of all the individual CDX files found
        from an Election year's or group of Election years' HTML manifest.
        max_files (int | None): Number of files, taken from the start of
        `manifest`, to process. Defaults to 2; pass `None` to process
        every file in the manifest.
    Returns:
        `pandas.DataFrame` representing the processed files, or an empty
        `pandas.DataFrame` when there is nothing to process.
    """
    # Build every frame first and concatenate once; concatenating inside
    # the loop re-copies the accumulated rows on every iteration.
    frames = [create_dataframe(fetch_file(cdx_url)) for cdx_url in manifest[:max_files]]
    if not frames:
        # pd.concat raises on an empty list, so return an empty frame.
        return pd.DataFrame()
    return pd.concat(frames)
def fetch_text(row: pd.Series):
    """
    Fetch the text of a single archived web resource.

    The function uses the timestamp and original fields from the `row`
    to request the specific resource. Once the resource is fetched,
    the Wayback banner div elements are removed so as to not detract from
    the words in the resource itself.

    Args:
        row (pandas.Series): `pandas.Series`, which is a single row
        from a `pandas.DataFrame`.
    Returns:
        `String` of the resource's text.
    """
    # A timeout keeps one stalled request from blocking the whole crawl.
    # The response status is deliberately not checked: whatever body comes
    # back is parsed, as before.
    response = requests.get(
        f"{WAYBACK_BASE_URL}{row['timestamp']}/{row['original']}",
        timeout=60,
    )
    soup = BeautifulSoup(response.content, 'html.parser')
    # Strip both states of the Wayback banner before extracting the text.
    for banner_id in ('wm-maximized', 'wm-minimized'):
        for el in soup.find_all('div', {'id': banner_id}):
            el.extract()
    return soup.text
def fetch_all_text(df: pd.DataFrame, delay: float = 15):
    """
    Fetch the text of every usable resource in `df` and tabulate its words.

    The function first cleans the `df` that was passed in by dropping all
    the rows that do not have a value in the mimetype field. Then, it drops
    all the duplicate digests, which removes resources that are exactly the
    same. Finally, it only keeps rows that have 'text' in the mimetype
    and have a '200' HTTP status response, meaning the resource was
    successfully captured.
    Once the `df` is cleaned, each resource's text is fetched from the Wayback
    and transformed into a matrix using `sklearn.CountVectorizer`.

    Args:
        df (pandas.DataFrame): `pandas.DataFrame` representing web resources as CDX lines.
        delay (float): Politeness pause, in seconds, after each Wayback
        request. Defaults to 15.
    Returns:
        `pandas.DataFrame` of words and their occurrence per web resource.
        An empty `pandas.DataFrame` is returned when no row survives the
        cleaning step.
    """
    text_df = df\
        .dropna(subset=['mimetype'])\
        .drop_duplicates(subset=['digest'])\
        .query('mimetype.str.contains("text") and statuscode.str.match("200")', engine='python')

    unprocessed_bag_of_words = []
    for _, row in text_df.iterrows():
        unprocessed_bag_of_words.append(fetch_text(row))
        sleep(delay)

    if not unprocessed_bag_of_words:
        # CountVectorizer cannot be fitted on zero documents.
        return pd.DataFrame()

    countvec = CountVectorizer(ngram_range=(1, 1), stop_words='english')
    processed_bag_of_words = countvec.fit_transform(unprocessed_bag_of_words)
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on releases that predate it.
    if hasattr(countvec, 'get_feature_names_out'):
        feature_names = countvec.get_feature_names_out()
    else:
        feature_names = countvec.get_feature_names()
    return pd.DataFrame(processed_bag_of_words.toarray(), columns=feature_names)

Gathering the list of CDX Files#


The first step is gathering the list of CDX files. To do that, simply call the gather_files_from_manifest function, providing the Election year as an argument.

el00_files = gather_files_from_manifest('2000')
- '',
- '',
- '',
- '']

CDX File#


Next, we’ll demonstrate what a particular CDX File looks like

cdx = fetch_file(el00_files[0])
['com,voter)/home/candidates/info/0,1214,2-11880-,00.html 20001002182124,1214,2-11880-,00.html text/html 200 FYXP43MQC5GVBQMVK3ETWSPXUBR5ICKP - - 5051 149 unique.20010415093936.arc.gz',
- 'com,voter)/home/candidates/info/0,1214,2-18885-,00.html 20001002185814,1214,2-18885-,00.html text/html 200 H6QN5ZULJ6YZP756QNVM3YXKXC7HZUIL - - 4829 5200 unique.20010415093936.arc.gz',
- 'com,voter)/home/candidates/info/0,1214,2-18880-,00.html 20001002185815,1214,2-18880-,00.html text/html 200 HFG67JI4KBPHFXMQE5DJRHF3OEKKBOO6 - - 4794 10029 unique.20010415093936.arc.gz',
- 'com,voter)/home/officials/general/1,1195,2-2467-,00.html 20001002185815,1195,2-2467-,00.html text/html 200 HZJFLTHZD5MGEPJS2WVGBHQRQUPFBE3O - - 5282 14823 unique.20010415093936.arc.gz',
- 'com,voter)/home/candidates/info/0,1214,2-18886-,00.html 20001002185816,1214,2-18886-,00.html text/html 200 QAM7JW7S4CNYMP6HLA6DASOXTO2SIGWO - - 4823 20105 unique.20010415093936.arc.gz']

CDX as a DataFrame#


Now, here is the same CDX transformed into a DataFrame

cdx_df = create_dataframe(cdx)
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

1096880 rows × 11 columns


Election 2000 DataFrame#


Now we’ll just create a DataFrame from the whole 2000 Election dataset. To do that, we’ll use the create_dataframe_from_manifest which loops over the files and calls create_dataframe programmatically instead of manually and individually as we did above.

el00_df = create_dataframe_from_manifest(el00_files)



For this exercise, we’re going to take a brief look at the mimetypes. First, we’ll select all the mimetypes in the DataFrame and get their counts by calling value_counts, which is a method from Pandas.

el00_mimetypes = el00_df['mimetype'].value_counts()
-                           1094810
-text/html                      6209
-application/pdf                  13
-image/gif                        11
-image/jpeg                        4
-text/plain                        3
-application/mac-binhex40          1
-Name: mimetype, dtype: int64

Dropping and Graphing#


You can see from the output above that the majority of the mimetypes are not listed and show up as -. Further research into this could prove interesting. For now, however, we’re just going to drop them and graph the remainder.

-../../_images/ec0c2f7fa8e647d80d754d22e6c8ec22fa5134d55c4bd17ee340dc44774fedec.png -

Fetching the Text#


Now that we know the majority of the remaining resources in this dataset have a text-based mimetype, we can gather all the text and do some basic analysis. First, we’ll fetch all the text from just the first 50 rows. If you have access to a larger machine, you may certainly increase the number or run it across the whole DataFrame.

text_df = fetch_all_text(el00_df.head(50))

Top 25 Words#


Now that the text has been fetched, we’ll do a simple summation and sorting, displaying the top 25 words from the first 50 rows of the 2000 Election dataset.

information    248
-com            150
-voter          150
-terms          121
-contact        119
-state          105
-candidate       95
-service         78
-return          72
-senate          70
-candidates      62
-elected         62
-officials       62
-check           56
-ca              54
-new             53
-capitol         52
-background      52
-general         51
-use             51
-rights          50
-press           50
-2000            50
-privacy         50
-portions        50
-dtype: int64

Next Steps#


Please feel free to use this notebook and the functions found here to help in your exploration of the U.S. Election datasets.


This notebook has been replaced by an updated version: + Packages/us-elections.ipynb