Merge pull request #44 from CentreForDigitalHumanities/feature/update-readme

Automatically download databases of new readers
CentreForDigitalHumanities · Aug 15, 2024 · dde4090 · dde4090
2 parents 69da52b + 9e950e0
commit dde4090
Show file tree

Hide file tree

Showing 10 changed files with 169 additions and 122 deletions.
diff --git a/README.md b/README.md
@@ -28,8 +28,7 @@ print culture. The VRE will rely on `edpop-explorer`.
 ## Install
 
 **Please note that `edpop-explorer` is still under active development
-and that while it might be useful, some important features are still
-missing and the public API is not yet stable.**
+and that while it might be useful, the public API is not yet stable.**
 
 `edpop-explorer` can easily be installed from PyPI:
 
@@ -41,6 +40,20 @@ package. The commandline tool can be run using the `edpopx` command.
 on the path. In that case you can also run it using the command `python 
 -m edpop_explorer`.)
 
+EDPOP Explorer comes with a number of pre-installed readers. Most of these
+readers connect to external APIs. Please take into account that there is 
+always a chance that some readers are (temporarily) not available or that
+the public interfaces have changed. In the latter case, you are welcome
+to file an issue or create a fix.
+
+A limited number of pre-installed readers do not work with external APIs
+but with pre-downloaded databases. Where possible, these databases are
+automatically downloaded the first time the reader is used. In the case of
+the USTC reader, an automatic download is not provided, but the database
+file may be obtained from the project team. If this database is not
+available, an exception will be raised indicating where to put the
+database file.
+
 ## Basic usage
 
 ### Python API
@@ -84,8 +97,11 @@ the query you want to perform, such as:
 
     # hpb gruninger
 
-This will give you the number of results and a summary of the first ten
-results. To load more results, use the `next` command:
+Before executing the query, EDPOP Explorer will show how the query is
+transformed for the external API. In many cases, including HPB, the
+transformed query is exactly the same as the query entered by the user.
+After performing the query, you will see the number of results and a
+summary of the first ten results. To load more results, use the `next` command:
 
     # next
 

diff --git a/edpop_explorer/__init__.py b/edpop_explorer/__init__.py
@@ -2,7 +2,8 @@
     'EDPOPREC', 'RELATORS', 'bind_common_namespaces',
     'Field', 'FieldError', 'LocationField',
     'Reader', 'ReaderError', 'NotFoundError',
-    'GetByIdBasedOnQueryMixin', 'BasePreparedQuery', 'PreparedQueryType',
+    'GetByIdBasedOnQueryMixin', 'DatabaseFileMixin',
+    'BasePreparedQuery', 'PreparedQueryType',
     'Record', 'RawData', 'RecordError', 'BibliographicalRecord',
     'BiographicalRecord', 'LazyRecordMixin',
     'SRUReader',
@@ -20,7 +21,7 @@
 from .fields import Field, FieldError, LocationField
 from .reader import (
     Reader, ReaderError, GetByIdBasedOnQueryMixin, BasePreparedQuery,
-    PreparedQueryType, NotFoundError
+    PreparedQueryType, NotFoundError, DatabaseFileMixin
 )
 from .record import (
     Record, RawData, RecordError, BibliographicalRecord, BiographicalRecord,

diff --git a/edpop_explorer/rdf.py b/edpop_explorer/rdf.py
@@ -3,7 +3,7 @@
 from rdflib.namespace import Namespace
 from rdflib import Graph, RDF, RDFS
 
-EDPOPREC = Namespace('https://dhstatic.hum.uu.nl/edpop-records/latest/')
+EDPOPREC = Namespace('https://dhstatic.hum.uu.nl/edpop-records/0.1.0/')
 """EDPOP Record Ontology"""
 
 RELATORS = Namespace('http://id.loc.gov/vocabulary/relators/')

diff --git a/edpop_explorer/reader.py b/edpop_explorer/reader.py
@@ -2,7 +2,11 @@
 
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
+from pathlib import Path
 from typing import Optional, Union, Dict
+
+import requests
+from appdirs import AppDirs
 from rdflib import Graph, RDF, URIRef, SDO, Literal
 from urllib.parse import quote, unquote
 
@@ -312,6 +316,71 @@ def _prepare_get_by_id_query(cls, identifier: str) -> PreparedQueryType:
         pass
 
 
+class DatabaseFileMixin:
+    """Mixin that adds a method ``prepare_data`` to a ``Reader`` class,
+    which will make the database file available in the ``database_path``
+    attribute as a ``pathlib.Path`` object. If the constant attribute
+    ``DATABASE_URL`` is given, the database will be downloaded from
+    that URL if the data is not yet available. The database file will
+    be (expected to be) stored in the application data directory
+    using the filename specified in the constant attribute
+    ``DATABASE_FILENAME``, which has to be specified by the user of
+    this mixin."""
+    DATABASE_URL: Optional[str] = None
+    """The URL to download the database file from. If this attribute is
+    ``None``, automatically downloading the database file is not supported."""
+    DATABASE_FILENAME: str
+    """The filename (not the full path) under which the database is expected
+    to be stored."""
+    DATABASE_LICENSE: Optional[str] = None
+    """A URL that contains the license of the downloaded database file."""
+    DATABASE_DOWNLOAD_TIMEOUT: float = 30.0
+    """Value passed as the ``timeout`` argument of ``requests.get`` when
+    downloading the database, in seconds."""
+    database_path: Optional[Path] = None
+    """The path to the database file. Will be set by the ``prepare_data``
+    method."""
+
+    def prepare_data(self) -> None:
+        """Prepare the database file by confirming that it is available,
+        and if not, by attempting to download it.
+
+        Sets ``database_path`` as a side effect.
+
+        :raises ReaderError: if the database file is missing and either no
+            ``DATABASE_URL`` is configured or the download fails.
+        """
+        self.database_path = Path(
+            AppDirs('edpop-explorer', 'cdh').user_data_dir
+        ) / self.DATABASE_FILENAME
+        if self.database_path.exists():
+            return
+        if self.DATABASE_URL is None:
+            # No database URL is given, so users have to obtain the
+            # database themselves.
+            # Find database dir with .resolve() because on Windows it is
+            # some sort of hidden symlink if Python was installed using
+            # the Windows Store...
+            db_dir = self.database_path.parent.resolve()
+            error_message = (
+                f'{self.__class__.__name__} database not found. Please obtain the file '
+                f'{self.DATABASE_FILENAME} from the project team and add it '
+                f'to the following directory: {db_dir}'
+            )
+            raise ReaderError(error_message)
+        self._download_database()
+
+    def _download_database(self) -> None:
+        """Download the database from ``DATABASE_URL`` to ``database_path``.
+
+        Expects ``database_path`` to have been set by ``prepare_data``.
+
+        :raises ReaderError: if the request fails, the server responds with
+            an error status, or the file cannot be written to disk.
+        """
+        print('Downloading database...')
+        try:
+            # Without a timeout a stalled server would block forever.
+            response = requests.get(
+                self.DATABASE_URL, timeout=self.DATABASE_DOWNLOAD_TIMEOUT
+            )
+        except requests.RequestException as err:
+            # Report connection problems the same way as HTTP errors, so
+            # callers only have to handle ReaderError.
+            raise ReaderError(
+                f'Error downloading database file from {self.DATABASE_URL}'
+                f': {err}'
+            ) from err
+        if not response.ok:
+            raise ReaderError(
+                f'Error downloading database file from {self.DATABASE_URL}'
+            )
+        # Write to a temporary file first and rename afterwards, so that a
+        # failed write never leaves a truncated file at ``database_path``
+        # that a later ``prepare_data`` call would mistake for a valid
+        # database.
+        temporary_path = self.database_path.with_name(
+            self.database_path.name + '.part'
+        )
+        try:
+            self.database_path.parent.mkdir(exist_ok=True, parents=True)
+            with open(temporary_path, 'wb') as f:
+                f.write(response.content)
+            temporary_path.replace(self.database_path)
+        except OSError as err:
+            try:
+                temporary_path.unlink()
+            except OSError:
+                # Best-effort cleanup; the original error is what matters.
+                pass
+            raise ReaderError(
+                f'Error writing database file to disk: {err}'
+            ) from err
+        print(f'Successfully saved database to {self.database_path}.')
+        if self.DATABASE_LICENSE is not None:
+            # Not every reader declares a license URL.
+            print(f'See license: {self.DATABASE_LICENSE}')
+
+
 class ReaderError(Exception):
     """Generic exception for failures in ``Reader`` class. More specific errors
     derive from this class."""

diff --git a/edpop_explorer/readers/dutch_almanacs.py b/edpop_explorer/readers/dutch_almanacs.py
@@ -1,13 +1,13 @@
 import csv
-from pathlib import Path
 from typing import List
-from edpop_explorer import Reader, ReaderError, Field, BibliographicalRecord, BIBLIOGRAPHICAL
+from edpop_explorer import Reader, ReaderError, Field, BibliographicalRecord, BIBLIOGRAPHICAL, DatabaseFileMixin
 from rdflib import URIRef
 
 
-class DutchAlmanacsReader(Reader):
+class DutchAlmanacsReader(DatabaseFileMixin, Reader):
     """ Dutch Almanacs database reader. Access with command 'dutalm'."""
-    FILENAME = Path(__file__).parent / 'data' / 'biblio_dutchalmanacs.csv'
+    DATABASE_URL = 'https://dhstatic.hum.uu.nl/edpop/biblio_dutchalmanacs.csv'
+    DATABASE_FILENAME = 'biblio_dutchalmanacs.csv'
     CATALOG_URIREF = URIRef(
         'https://edpop.hum.uu.nl/readers/dutch_almanacs'
     )
@@ -21,7 +21,7 @@ class DutchAlmanacsReader(Reader):
     def _convert_record(cls, rawrecord: dict) -> BibliographicalRecord:
         record = BibliographicalRecord(from_reader=cls)
         record.data = rawrecord
-        record.identifier = Field(rawrecord['ID'])
+        record.identifier = rawrecord['ID']
         record.dating = Field(rawrecord['Jaar'])
         record.place_of_publication = Field(rawrecord['Plaats uitgave'])
         record.bookseller = Field(rawrecord['Boekverkoper'])
@@ -39,7 +39,9 @@ def transform_query(cls, query) -> str:
 
     @classmethod
     def get_by_id(cls, identifier: str) -> BibliographicalRecord:
-        with open(cls.FILENAME, 'r', encoding='utf-8-sig') as file:
+        reader = cls()
+        reader.prepare_data()
+        with open(reader.database_path, 'r', encoding='utf-8-sig') as file:
             reader = csv.DictReader(file, delimiter=';')
             for row in reader:
                 if row['ID'] == identifier:
@@ -48,10 +50,11 @@ def get_by_id(cls, identifier: str) -> BibliographicalRecord:
 
     def _perform_query(self) -> List[BibliographicalRecord]:
         assert isinstance(self.prepared_query, str)
+        self.prepare_data()
 
         # Search query in all columns, and fetch results based on query
         results = []
-        with open(self.__class__.FILENAME, 'r', encoding='utf-8-sig') as file:
+        with open(self.database_path, 'r', encoding='utf-8-sig') as file:
             reader = csv.DictReader(file, delimiter=';')
             for row in reader:
                 for key in row.keys():

diff --git a/edpop_explorer/readers/fbtee.py b/edpop_explorer/readers/fbtee.py
@@ -1,19 +1,17 @@
-from pathlib import Path
 import sqlite3
 from rdflib import URIRef
-import requests
-from appdirs import AppDirs
 from typing import Optional
 
 from edpop_explorer import (
-    Reader, BibliographicalRecord, ReaderError, Field, BIBLIOGRAPHICAL
+    Reader, BibliographicalRecord, ReaderError, Field, BIBLIOGRAPHICAL, DatabaseFileMixin
 )
 from edpop_explorer.reader import GetByIdBasedOnQueryMixin
 from edpop_explorer.sql import SQLPreparedQuery
 
 
-class FBTEEReader(GetByIdBasedOnQueryMixin, Reader):
+class FBTEEReader(DatabaseFileMixin, GetByIdBasedOnQueryMixin, Reader):
     DATABASE_URL = 'https://dhstatic.hum.uu.nl/edpop/cl.sqlite3'
+    DATABASE_FILENAME = 'cl.sqlite3'
     DATABASE_LICENSE = 'https://dhstatic.hum.uu.nl/edpop/LICENSE.txt'
     FBTEE_LINK = 'http://fbtee.uws.edu.au/stn/interface/browse.php?t=book&' \
         'id={}'
@@ -28,36 +26,6 @@ class FBTEEReader(GetByIdBasedOnQueryMixin, Reader):
     DESCRIPTION = "Mapping the Trade of the Société Typographique de " \
         "Neuchâtel, 1769-1794"
 
-    def __init__(self):
-        super().__init__()
-        self.database_file = Path(
-            AppDirs('edpop-explorer', 'cdh').user_data_dir
-        ) / 'cl.sqlite3'
-
-    def prepare_data(self):
-        if not self.database_file.exists():
-            self._download_database()
-        self.con = sqlite3.connect(str(self.database_file))
-
-    def _download_database(self):
-        print('Downloading database...')
-        response = requests.get(self.DATABASE_URL)
-        if response.ok:
-            try:
-                self.database_file.parent.mkdir(exist_ok=True, parents=True)
-                with open(self.database_file, 'wb') as f:
-                    f.write(response.content)
-            except OSError as err:
-                raise ReaderError(
-                    f'Error writing database file to disk: {err}'
-                )
-        else:
-            raise ReaderError(
-                f'Error downloading database file from {self.DATABASE_URL}'
-            )
-        print(f'Successfully saved database to {self.database_file}.')
-        print(f'See license: {self.DATABASE_LICENSE}')
-
     @classmethod
     def _prepare_get_by_id_query(cls, identifier: str) -> SQLPreparedQuery:
         return SQLPreparedQuery(
@@ -105,41 +73,44 @@ def fetch_range(self, range_to_fetch: range) -> range:
             raise ReaderError('First call prepare_query method')
         if self.fetching_exhausted:
             return range(0)
-        cur = self.con.cursor()
-        columns = [x[1] for x in cur.execute('PRAGMA table_info(books)')]
-        res = cur.execute(
-            'SELECT B.*, BA.author_code, A.author_name FROM books B '
-            'LEFT OUTER JOIN books_authors BA on B.book_code=BA.book_code '
-            'JOIN authors A on BA.author_code=A.author_code '
-            f'{self.prepared_query.where_statement} '
-            'ORDER BY B.book_code',
-            self.prepared_query.arguments
-        )
-        last_book_code = ''
-        i = -1
-        for row in res:
-            # Since we are joining with another table, a book may be repeated,
-            # so check if this is a new item
-            book_code: str = row[columns.index('book_code')]
-            if last_book_code != book_code:
-                # We have a new book, so update i
-                i += 1
-                record = BibliographicalRecord(self.__class__)
-                record.data = {}
-                for j in range(len(columns)):
-                    record.data[columns[j]] = row[j]
-                record.identifier = book_code
-                record.link = self.FBTEE_LINK.format(book_code)
-                record.data['authors'] = []
-                self.records[i] = record
-                last_book_code = book_code
-            # Add author_code and author_name to the last record
-            assert len(self.records) > 0
-            author_code = row[len(columns)]
-            author_name = row[len(columns) + 1]
-            assert isinstance(self.records[i].data, dict)
-            self.records[i].data['authors'].append((author_code, author_name))
-        for record in self.records:
-            self._add_fields(record)
-        self.number_of_results = len(self.records)
+        with sqlite3.connect(str(self.database_path)) as con:
+            cur = con.cursor()
+            columns = [x[1] for x in cur.execute('PRAGMA table_info(books)')]
+            res = cur.execute(
+                'SELECT B.*, BA.author_code, A.author_name FROM books B '
+                'LEFT OUTER JOIN books_authors BA on B.book_code=BA.book_code '
+                'JOIN authors A on BA.author_code=A.author_code '
+                f'{self.prepared_query.where_statement} '
+                'ORDER BY B.book_code',
+                self.prepared_query.arguments
+            )
+            last_book_code = ''
+            i = -1
+            for row in res:
+                # Since we are joining with another table, a book may be repeated,
+                # so check if this is a new item
+                book_code: str = row[columns.index('book_code')]
+                if last_book_code != book_code:
+                    # We have a new book, so update i
+                    i += 1
+                    record = BibliographicalRecord(self.__class__)
+                    record.data = {}
+                    for j in range(len(columns)):
+                        record.data[columns[j]] = row[j]
+                    record.identifier = book_code
+                    record.link = self.FBTEE_LINK.format(book_code)
+                    record.data['authors'] = []
+                    self.records[i] = record
+                    last_book_code = book_code
+                # Add author_code and author_name to the last record
+                assert len(self.records) > 0
+                author_code = row[len(columns)]
+                author_name = row[len(columns) + 1]
+                assert isinstance(self.records[i].data, dict)
+                self.records[i].data['authors'].append((author_code, author_name))
+            for record_number in self.records:
+                record = self.records[record_number]
+                assert isinstance(record, BibliographicalRecord)
+                self._add_fields(record)
+            self.number_of_results = len(self.records)
         return range(0, len(self.records))