Skip to content

Commit

Permalink
Merge pull request #44 from CentreForDigitalHumanities/feature/update…
Browse files Browse the repository at this point in the history
…-readme

Automatically download databases of new readers
  • Loading branch information
tijmenbaarda authored Aug 15, 2024
2 parents 69da52b + 9e950e0 commit dde4090
Show file tree
Hide file tree
Showing 10 changed files with 169 additions and 122 deletions.
24 changes: 20 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,7 @@ print culture. The VRE will rely on `edpop-explorer`.
## Install

**Please note that `edpop-explorer` is still under active development
and that while it might be useful, some important features are still
missing and the public API is not yet stable.**
and that while it might be useful, the public API is not yet stable.**

`edpop-explorer` can easily be installed from PyPI:

Expand All @@ -41,6 +40,20 @@ package. The commandline tool can be run using the `edpopx` command.
on the path. In that case you can also run it using the command `python
-m edpop_explorer`.)

EDPOP Explorer comes with a number of pre-installed readers. Most of these
readers connect to external APIs. Please take into account that there is
always a chance that some readers are (temporarily) not available or that
the public interfaces have changed. In the latter case, you are welcome
to file an issue or create a fix.

A limited number of pre-installed readers do not work with external APIs
but with pre-downloaded databases. Where possible, these databases are
automatically downloaded the first time they are needed. In the case of the
USTC reader, an automatic download is not provided, but the database file
may be obtained from the project team. If this database is not available,
an exception will be raised with an indication as to where to put the
database file.

## Basic usage

### Python API
Expand Down Expand Up @@ -84,8 +97,11 @@ the query you want to perform, such as:

# hpb gruninger

This will give you the number of results and a summary of the first ten
results. To load more results, use the `next` command:
Before executing the query, EDPOP Explorer will show how the query is
transformed for the external API. In many cases, including HPB, the
transformed query is exactly the same as the query entered by the user.
After performing the query, you will see the number of results and a
summary of the first ten results. To load more results, use the `next` command:

# next

Expand Down
5 changes: 3 additions & 2 deletions edpop_explorer/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@
'EDPOPREC', 'RELATORS', 'bind_common_namespaces',
'Field', 'FieldError', 'LocationField',
'Reader', 'ReaderError', 'NotFoundError',
'GetByIdBasedOnQueryMixin', 'BasePreparedQuery', 'PreparedQueryType',
'GetByIdBasedOnQueryMixin', 'DatabaseFileMixin',
'BasePreparedQuery', 'PreparedQueryType',
'Record', 'RawData', 'RecordError', 'BibliographicalRecord',
'BiographicalRecord', 'LazyRecordMixin',
'SRUReader',
Expand All @@ -20,7 +21,7 @@
from .fields import Field, FieldError, LocationField
from .reader import (
Reader, ReaderError, GetByIdBasedOnQueryMixin, BasePreparedQuery,
PreparedQueryType, NotFoundError
PreparedQueryType, NotFoundError, DatabaseFileMixin
)
from .record import (
Record, RawData, RecordError, BibliographicalRecord, BiographicalRecord,
Expand Down
2 changes: 1 addition & 1 deletion edpop_explorer/rdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from rdflib.namespace import Namespace
from rdflib import Graph, RDF, RDFS

EDPOPREC = Namespace('https://dhstatic.hum.uu.nl/edpop-records/latest/')
EDPOPREC = Namespace('https://dhstatic.hum.uu.nl/edpop-records/0.1.0/')
"""EDPOP Record Ontology"""

RELATORS = Namespace('http://id.loc.gov/vocabulary/relators/')
Expand Down
69 changes: 69 additions & 0 deletions edpop_explorer/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,11 @@

from abc import ABC, abstractmethod
from dataclasses import dataclass
from pathlib import Path
from typing import Optional, Union, Dict

import requests
from appdirs import AppDirs
from rdflib import Graph, RDF, URIRef, SDO, Literal
from urllib.parse import quote, unquote

Expand Down Expand Up @@ -312,6 +316,71 @@ def _prepare_get_by_id_query(cls, identifier: str) -> PreparedQueryType:
pass


class DatabaseFileMixin:
    """Mixin that adds a method ``prepare_data`` to a ``Reader`` class,
    which will make the database file available in the ``database_path``
    attribute as a ``pathlib.Path`` object. If the constant attribute
    ``DATABASE_URL`` is given, the database will be downloaded from
    that URL if the data is not yet available. The database file will
    be (expected to be) stored in the application data directory
    using the filename specified in the constant attribute
    ``DATABASE_FILENAME``, which has to be specified by the user of
    this mixin."""
    DATABASE_URL: Optional[str] = None
    """The URL to download the database file from. If this attribute is
    ``None``, automatically downloading the database file is not supported."""
    DATABASE_FILENAME: str
    """The filename (not the full path) under which the database is expected
    to be stored."""
    DATABASE_LICENSE: Optional[str] = None
    """A URL that contains the license of the downloaded database file."""
    DOWNLOAD_TIMEOUT: float = 60
    """Number of seconds to wait for the download server to connect or to
    send data before giving up. May be overridden by subclasses."""
    database_path: Optional[Path] = None
    """The path to the database file. Will be set by the ``prepare_data``
    method."""

    def prepare_data(self) -> None:
        """Prepare the database file by confirming that it is available,
        and if not, by attempting to download it.

        Sets ``database_path`` as a side effect.

        Raises:
            ReaderError: if the database file is missing and either no
                ``DATABASE_URL`` is configured or the download fails.
        """
        self.database_path = Path(
            AppDirs('edpop-explorer', 'cdh').user_data_dir
        ) / self.DATABASE_FILENAME
        if self.database_path.exists():
            return
        if self.DATABASE_URL is not None:
            self._download_database()
            return
        # No database URL is given, so the users have to get the database
        # by themselves.
        # Find database dir with .resolve() because on Windows it is
        # some sort of hidden symlink if Python was installed using
        # the Windows Store...
        db_dir = self.database_path.parent.resolve()
        raise ReaderError(
            f'{self.__class__.__name__} database not found. Please obtain the file '
            f'{self.DATABASE_FILENAME} from the project team and add it '
            f'to the following directory: {db_dir}'
        )

    def _download_database(self) -> None:
        """Download the database from ``DATABASE_URL`` to ``database_path``.

        Expects ``database_path`` to have been set by ``prepare_data``.

        Raises:
            ReaderError: if the request fails, the server responds with an
                error status, or the file cannot be written to disk.
        """
        print('Downloading database...')
        try:
            # Without a timeout a stalled server would block forever.
            response = requests.get(
                self.DATABASE_URL, timeout=self.DOWNLOAD_TIMEOUT
            )
        except requests.RequestException as err:
            # Report connection problems the same way as HTTP errors.
            raise ReaderError(
                f'Error downloading database file from {self.DATABASE_URL}: '
                f'{err}'
            ) from err
        if not response.ok:
            raise ReaderError(
                f'Error downloading database file from {self.DATABASE_URL}'
            )
        # Write to a temporary sibling file first and move it into place
        # afterwards, so that an interrupted write never leaves a truncated
        # file that a later ``prepare_data`` call would accept as valid.
        tmp_path = self.database_path.with_name(
            self.database_path.name + '.part'
        )
        try:
            self.database_path.parent.mkdir(exist_ok=True, parents=True)
            with open(tmp_path, 'wb') as f:
                f.write(response.content)
            tmp_path.replace(self.database_path)
        except OSError as err:
            try:
                tmp_path.unlink()
            except OSError:
                # Best-effort cleanup; the original error is what matters.
                pass
            raise ReaderError(
                f'Error writing database file to disk: {err}'
            ) from err
        print(f'Successfully saved database to {self.database_path}.')
        if self.DATABASE_LICENSE is not None:
            # Only mention a license if the reader actually declares one.
            print(f'See license: {self.DATABASE_LICENSE}')


class ReaderError(Exception):
"""Generic exception for failures in ``Reader`` class. More specific errors
derive from this class."""
Expand Down
17 changes: 10 additions & 7 deletions edpop_explorer/readers/dutch_almanacs.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
import csv
from pathlib import Path
from typing import List
from edpop_explorer import Reader, ReaderError, Field, BibliographicalRecord, BIBLIOGRAPHICAL
from edpop_explorer import Reader, ReaderError, Field, BibliographicalRecord, BIBLIOGRAPHICAL, DatabaseFileMixin
from rdflib import URIRef


class DutchAlmanacsReader(Reader):
class DutchAlmanacsReader(DatabaseFileMixin, Reader):
""" Dutch Almanacs database reader. Access with command 'dutalm'."""
FILENAME = Path(__file__).parent / 'data' / 'biblio_dutchalmanacs.csv'
DATABASE_URL = 'https://dhstatic.hum.uu.nl/edpop/biblio_dutchalmanacs.csv'
DATABASE_FILENAME = 'biblio_dutchalmanacs.csv'
CATALOG_URIREF = URIRef(
'https://edpop.hum.uu.nl/readers/dutch_almanacs'
)
Expand All @@ -21,7 +21,7 @@ class DutchAlmanacsReader(Reader):
def _convert_record(cls, rawrecord: dict) -> BibliographicalRecord:
record = BibliographicalRecord(from_reader=cls)
record.data = rawrecord
record.identifier = Field(rawrecord['ID'])
record.identifier = rawrecord['ID']
record.dating = Field(rawrecord['Jaar'])
record.place_of_publication = Field(rawrecord['Plaats uitgave'])
record.bookseller = Field(rawrecord['Boekverkoper'])
Expand All @@ -39,7 +39,9 @@ def transform_query(cls, query) -> str:

@classmethod
def get_by_id(cls, identifier: str) -> BibliographicalRecord:
with open(cls.FILENAME, 'r', encoding='utf-8-sig') as file:
reader = cls()
reader.prepare_data()
with open(reader.database_path, 'r', encoding='utf-8-sig') as file:
reader = csv.DictReader(file, delimiter=';')
for row in reader:
if row['ID'] == identifier:
Expand All @@ -48,10 +50,11 @@ def get_by_id(cls, identifier: str) -> BibliographicalRecord:

def _perform_query(self) -> List[BibliographicalRecord]:
assert isinstance(self.prepared_query, str)
self.prepare_data()

# Search query in all columns, and fetch results based on query
results = []
with open(self.__class__.FILENAME, 'r', encoding='utf-8-sig') as file:
with open(self.database_path, 'r', encoding='utf-8-sig') as file:
reader = csv.DictReader(file, delimiter=';')
for row in reader:
for key in row.keys():
Expand Down
115 changes: 43 additions & 72 deletions edpop_explorer/readers/fbtee.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,17 @@
from pathlib import Path
import sqlite3
from rdflib import URIRef
import requests
from appdirs import AppDirs
from typing import Optional

from edpop_explorer import (
Reader, BibliographicalRecord, ReaderError, Field, BIBLIOGRAPHICAL
Reader, BibliographicalRecord, ReaderError, Field, BIBLIOGRAPHICAL, DatabaseFileMixin
)
from edpop_explorer.reader import GetByIdBasedOnQueryMixin
from edpop_explorer.sql import SQLPreparedQuery


class FBTEEReader(GetByIdBasedOnQueryMixin, Reader):
class FBTEEReader(DatabaseFileMixin, GetByIdBasedOnQueryMixin, Reader):
DATABASE_URL = 'https://dhstatic.hum.uu.nl/edpop/cl.sqlite3'
DATABASE_FILENAME = 'cl.sqlite3'
DATABASE_LICENSE = 'https://dhstatic.hum.uu.nl/edpop/LICENSE.txt'
FBTEE_LINK = 'http://fbtee.uws.edu.au/stn/interface/browse.php?t=book&' \
'id={}'
Expand All @@ -28,36 +26,6 @@ class FBTEEReader(GetByIdBasedOnQueryMixin, Reader):
DESCRIPTION = "Mapping the Trade of the Société Typographique de " \
"Neuchâtel, 1769-1794"

def __init__(self):
super().__init__()
self.database_file = Path(
AppDirs('edpop-explorer', 'cdh').user_data_dir
) / 'cl.sqlite3'

def prepare_data(self):
if not self.database_file.exists():
self._download_database()
self.con = sqlite3.connect(str(self.database_file))

def _download_database(self):
print('Downloading database...')
response = requests.get(self.DATABASE_URL)
if response.ok:
try:
self.database_file.parent.mkdir(exist_ok=True, parents=True)
with open(self.database_file, 'wb') as f:
f.write(response.content)
except OSError as err:
raise ReaderError(
f'Error writing database file to disk: {err}'
)
else:
raise ReaderError(
f'Error downloading database file from {self.DATABASE_URL}'
)
print(f'Successfully saved database to {self.database_file}.')
print(f'See license: {self.DATABASE_LICENSE}')

@classmethod
def _prepare_get_by_id_query(cls, identifier: str) -> SQLPreparedQuery:
return SQLPreparedQuery(
Expand Down Expand Up @@ -105,41 +73,44 @@ def fetch_range(self, range_to_fetch: range) -> range:
raise ReaderError('First call prepare_query method')
if self.fetching_exhausted:
return range(0)
cur = self.con.cursor()
columns = [x[1] for x in cur.execute('PRAGMA table_info(books)')]
res = cur.execute(
'SELECT B.*, BA.author_code, A.author_name FROM books B '
'LEFT OUTER JOIN books_authors BA on B.book_code=BA.book_code '
'JOIN authors A on BA.author_code=A.author_code '
f'{self.prepared_query.where_statement} '
'ORDER BY B.book_code',
self.prepared_query.arguments
)
last_book_code = ''
i = -1
for row in res:
# Since we are joining with another table, a book may be repeated,
# so check if this is a new item
book_code: str = row[columns.index('book_code')]
if last_book_code != book_code:
# We have a new book, so update i
i += 1
record = BibliographicalRecord(self.__class__)
record.data = {}
for j in range(len(columns)):
record.data[columns[j]] = row[j]
record.identifier = book_code
record.link = self.FBTEE_LINK.format(book_code)
record.data['authors'] = []
self.records[i] = record
last_book_code = book_code
# Add author_code and author_name to the last record
assert len(self.records) > 0
author_code = row[len(columns)]
author_name = row[len(columns) + 1]
assert isinstance(self.records[i].data, dict)
self.records[i].data['authors'].append((author_code, author_name))
for record in self.records:
self._add_fields(record)
self.number_of_results = len(self.records)
with sqlite3.connect(str(self.database_path)) as con:
cur = con.cursor()
columns = [x[1] for x in cur.execute('PRAGMA table_info(books)')]
res = cur.execute(
'SELECT B.*, BA.author_code, A.author_name FROM books B '
'LEFT OUTER JOIN books_authors BA on B.book_code=BA.book_code '
'JOIN authors A on BA.author_code=A.author_code '
f'{self.prepared_query.where_statement} '
'ORDER BY B.book_code',
self.prepared_query.arguments
)
last_book_code = ''
i = -1
for row in res:
# Since we are joining with another table, a book may be repeated,
# so check if this is a new item
book_code: str = row[columns.index('book_code')]
if last_book_code != book_code:
# We have a new book, so update i
i += 1
record = BibliographicalRecord(self.__class__)
record.data = {}
for j in range(len(columns)):
record.data[columns[j]] = row[j]
record.identifier = book_code
record.link = self.FBTEE_LINK.format(book_code)
record.data['authors'] = []
self.records[i] = record
last_book_code = book_code
# Add author_code and author_name to the last record
assert len(self.records) > 0
author_code = row[len(columns)]
author_name = row[len(columns) + 1]
assert isinstance(self.records[i].data, dict)
self.records[i].data['authors'].append((author_code, author_name))
for record_number in self.records:
record = self.records[record_number]
assert isinstance(record, BibliographicalRecord)
self._add_fields(record)
self.number_of_results = len(self.records)
return range(0, len(self.records))
Loading

0 comments on commit dde4090

Please sign in to comment.