Skip to content

Commit

Permalink
Refactor data package scripts to use https:// links instead of AWS
Browse files Browse the repository at this point in the history
  • Loading branch information
beefoo committed May 14, 2024
1 parent 3eddf27 commit be4183c
Show file tree
Hide file tree
Showing 12 changed files with 469 additions and 1,040 deletions.
2 changes: 1 addition & 1 deletion Data Packages/README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Resources for using Library of Congress Data Packages

This is a growing collection of resources (Jupyter Notebooks, scripts, workflows) for accessing and using [data packages from the Library of Congress](https://labs.loc.gov/data/explore/).
This is a growing collection of resources (Jupyter Notebooks, scripts, workflows) for accessing and using [data packages from the Library of Congress](https://data.labs.loc.gov/packages/).

To run the Jupyter Notebooks you will need to change into this directory and install the required Python libraries:

Expand Down
126 changes: 28 additions & 98 deletions Data Packages/austro_hungarian_maps.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
"source": [
"# LoC Data Package Tutorial: Austro-Hungarian maps set\n",
"\n",
"This notebook will demonstrate basic usage of the Python library `boto3` for interacting with [data packages from the Library of Congress](https://labs.loc.gov/data/) via the [Austro-Hungarian map set](https://labs.loc.gov/data/explore/austro-hungarian-maps/) which is derived from the Library's [Austria-Hungary Topographic Set Maps](https://guides.loc.gov/maps-genealogy-eastern-europe/topographic-sets/austria-hungary). We will:\n",
"This notebook will demonstrate basic usage of Python for interacting with [data packages from the Library of Congress](https://data.labs.loc.gov/packages/) via the [Austro-Hungarian map set](https://data.labs.loc.gov/austro-hungarian-maps/) which is derived from the Library's [Austria-Hungary Topographic Set Maps](https://guides.loc.gov/maps-genealogy-eastern-europe/topographic-sets/austria-hungary). We will:\n",
"\n",
"1. [Output a summary of the contents of this data package](#Output-data-package-summary)\n",
"2. [Read and query metadata from a data package](#Query-the-metadata-in-a-data-package)\n",
Expand All @@ -28,7 +28,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 4,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -61,112 +61,41 @@
" <tr>\n",
" <th>0</th>\n",
" <td>.tif</td>\n",
" <td>9,976</td>\n",
" <td>1,153.54GB</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>.zip</td>\n",
" <td>3</td>\n",
" <td>636MB</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>.dbf</td>\n",
" <td>1</td>\n",
" <td>8MB</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>.html</td>\n",
" <td>3</td>\n",
" <td>3MB</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>.json</td>\n",
" <td>3</td>\n",
" <td>1,457KB</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>.txt</td>\n",
" <td>5</td>\n",
" <td>1,331KB</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>.csv</td>\n",
" <td>2</td>\n",
" <td>968KB</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>.shp</td>\n",
" <td>1</td>\n",
" <td>678KB</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>.shx</td>\n",
" <td>1</td>\n",
" <td>40KB</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>.md</td>\n",
" <td>2</td>\n",
" <td>24KB</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>.prj</td>\n",
" <td>1</td>\n",
" <td>0KB</td>\n",
" <td>9,881</td>\n",
" <td>1,142.47GB</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" FileType Count Size\n",
"0 .tif 9,976 1,153.54GB\n",
"1 .zip 3 636MB\n",
"2 .dbf 1 8MB\n",
"3 .html 3 3MB\n",
"4 .json 3 1,457KB\n",
"5 .txt 5 1,331KB\n",
"6 .csv 2 968KB\n",
"7 .shp 1 678KB\n",
"8 .shx 1 40KB\n",
"9 .md 2 24KB\n",
"10 .prj 1 0KB"
" FileType Count Size\n",
"0 .tif 9,881 1,142.47GB"
]
},
"execution_count": 1,
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import io\n",
"\n",
"import boto3 # for interacting with Amazon S3 (where the data is stored)\n",
"import pandas as pd # for reading, manipulating, and displaying data\n",
"import requests\n",
"\n",
"from helpers import get_s3_bucket_stats\n",
"\n",
"BUCKET_NAME = 'data.labs.loc.gov' # The name of public S3 bucket\n",
"\n",
"# Connect to Amazon S3\n",
"s3 = boto3.client('s3')\n",
"from helpers import get_file_stats\n",
"\n",
"# This is the name of the data package from the list above; note the trailing slash\n",
"DATA_PACKAGE = 'maps/'\n",
"DATA_URL = 'https://data.labs.loc.gov/austro-hungarian-maps/' # Base URL of this data package\n",
"\n",
"stats = get_s3_bucket_stats(s3, BUCKET_NAME, DATA_PACKAGE)\n",
"# Download the file manifest\n",
"file_manifest_url = f'{DATA_URL}manifest.json'\n",
"response = requests.get(file_manifest_url, timeout=60)\n",
"response_json = response.json()\n",
"files = [dict(zip(response_json[\"cols\"], row)) for row in response_json[\"rows\"]] # zip columns and rows\n",
"\n",
"# Convert to Pandas DataFrame and show table\n",
"# Convert to Pandas DataFrame and show stats table\n",
"stats = get_file_stats(files)\n",
"pd.DataFrame(stats)"
]
},
Expand All @@ -181,7 +110,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 6,
"metadata": {},
"outputs": [
{
Expand All @@ -193,12 +122,12 @@
}
],
"source": [
"obj = s3.get_object(Bucket=BUCKET_NAME, Key=f'{DATA_PACKAGE}metadata.csv')\n",
"contents = obj.get('Body', '').read() # Read contents as raw bytes (decoded to text below)\n",
"metadata_url = f'{DATA_URL}metadata.csv'\n",
"response = requests.get(metadata_url, timeout=60)\n",
"metadata_string = response.text\n",
"\n",
"# Read contents as a csv file\n",
"csv_string = contents.decode(\"utf-8\")\n",
"df = pd.read_csv(io.StringIO(csv_string), dtype=str, keep_default_na=False)\n",
"df = pd.read_csv(io.StringIO(metadata_string), dtype=str, keep_default_na=False)\n",
"print(f'Loaded metadata file with {df.shape[0]:,} entries.')"
]
},
Expand All @@ -211,7 +140,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 7,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -239,7 +168,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 8,
"metadata": {},
"outputs": [
{
Expand All @@ -265,7 +194,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 9,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -301,8 +230,9 @@
"from rasterio.plot import show\n",
"\n",
"filename = f\"data/{item['parent_dir']}/{item['filename']}\"\n",
"obj = s3.get_object(Bucket=BUCKET_NAME, Key=f'{DATA_PACKAGE}{filename}')\n",
"filestream = io.BytesIO(obj.get('Body').read())\n",
"obj_url = f\"{DATA_URL}{filename}\"\n",
"response = requests.get(obj_url, timeout=60)\n",
"filestream = io.BytesIO(response.content)\n",
"\n",
"with MemoryFile(filestream) as memfile:\n",
" with memfile.open() as dataset:\n",
Expand Down
Loading

0 comments on commit be4183c

Please sign in to comment.