Merge pull request #27 from UUDigitalHumanitieslab/feature/fetchinter…

…face Adjustments to fetch API
CentreForDigitalHumanities · Jan 31, 2024 · 48c6206 · 48c6206
2 parents e3ed561 + e8c7ecd
commit 48c6206
Show file tree

Hide file tree

Showing 8 changed files with 104 additions and 65 deletions.
diff --git a/edpop_explorer/edpopxshell.py b/edpop_explorer/edpopxshell.py
@@ -74,7 +74,7 @@ def do_next(self, args) -> None:
             self.perror('All records have been shown')
         else:
             if self.reader.number_fetched - self.shown < self.RECORDS_PER_PAGE:
-                self.reader.fetch_next()
+                self.reader.fetch()
             self.shown += self._show_records(self.reader.records,
                                              self.shown,
                                              self.RECORDS_PER_PAGE)
@@ -195,7 +195,7 @@ def do_kb(self, args) -> None:
         'Koninklijke Bibliotheek'
         self._query(KBReader, args)
 
-    def _show_records(self, records: List[Record],
+    def _show_records(self, records: List[Optional[Record]],
                       start: int,
                       limit=math.inf) -> int:
         """Show the records from start, with limit as the maximum number

diff --git a/edpop_explorer/reader.py b/edpop_explorer/reader.py
@@ -2,6 +2,7 @@
 from dataclasses import dataclass
 from typing import Optional, List, Union
 from rdflib import Graph, RDF, URIRef
+from urllib.parse import quote, unquote
 
 from edpop_explorer import (
     EDPOPREC, BIBLIOGRAPHICAL, BIOGRAPHICAL, bind_common_namespaces
@@ -38,9 +39,10 @@ class Reader(ABC):
     number_of_results: Optional[int] = None
     '''The total number of results for the query, including those
     that have not been fetched yet.'''
-    number_fetched: Optional[int] = None
-    '''The number of results that has been fetched so far.'''
-    records: List[Record]
+    number_fetched: int = 0
+    '''The number of results that has been fetched so far, or 0 if
+    no fetch has been performed yet.'''
+    records: List[Optional[Record]]
     '''The records that have been fetched as instances of
     (a subclass of) ``Record``.'''
     prepared_query: Optional[PreparedQueryType] = None
@@ -57,6 +59,9 @@ class Reader(ABC):
     overridden.'''
     _graph: Optional[Graph] = None
 
+    def __init__(self):
+        self.records = []
+
     @classmethod
     @abstractmethod
     def transform_query(cls, query: str) -> PreparedQueryType:
@@ -77,14 +82,35 @@ def set_query(self, query: PreparedQueryType) -> None:
         attribute.'''
         self.prepared_query = query
 
-    @abstractmethod
-    def fetch(self):
-        '''Perform an initial query.'''
-        pass
+    def adjust_start_record(self, start_number: int) -> None:
+        """Skip the given number of first records and start fetching 
+        afterwards. Should be calling before the first time calling
+        ``fetch()``. The missing records in the ``records`` attribute
+        will be filled by ``None``s. The ``number_fetched`` attribute
+        will be adjusted as if the first records have been fetched.
+        This is mainly useful if the skipped records have already been 
+        fetched but the original ``Reader`` object is not available anymore. 
+        This functionality may be ignored by readers that can only load 
+        all records at once; generally these are readers that return lazy 
+        records."""
+        if self.number_of_results is not None:
+            raise ReaderError(
+                "adjust_start_record should not be called after fetching."
+            )
+        self.number_fetched = start_number
+        self.records = [None for _ in range(start_number)]
 
     @abstractmethod
-    def fetch_next(self):
-        '''Perform a subsequental query.'''
+    def fetch(
+            self, number: Optional[int] = None
+    ):
+        '''Perform an initial or subsequent query. Most readers fetch
+        a limited number of records at once -- this number depends on
+        the reader but it may be adjusted using the ``number`` argument.
+        Other readers fetch all records at once and ignore the ``number``
+        argument. After fetching, the ``records`` and ``number_fetched``
+        attributes are adjusted and the ``number_of_results`` attribute
+        will be available.'''
         pass
 
     @classmethod
@@ -106,7 +132,7 @@ def identifier_to_iri(cls, identifier: str) -> str:
                 f"Cannot convert identifier to IRI: {__class__}.IRI_PREFIX "
                 "not a string."
             )
-        return cls.IRI_PREFIX + identifier
+        return cls.IRI_PREFIX + quote(identifier)
 
     @classmethod
     def iri_to_identifier(cls, iri: str) -> str:
@@ -116,7 +142,7 @@ def iri_to_identifier(cls, iri: str) -> str:
                 "not a string."
             )
         if iri.startswith(cls.IRI_PREFIX):
-            return iri[len(cls.IRI_PREFIX):]
+            return unquote(iri[len(cls.IRI_PREFIX):])
         else:
             raise ReaderError(
                 f"Cannot convert IRI {iri} to identifier: IRI does not start "
@@ -147,6 +173,13 @@ def catalog_to_graph(cls) -> Graph:
 
         return g
 
+    @property
+    def fetching_exhausted(self) -> bool:
+        """Return ``True`` if all results have been fetched. This is currently
+        implemented by simply checking if the ``number_of_results`` and
+        ``number_fetched`` attributes are equal."""
+        return self.number_fetched == self.number_of_results
+
 
 class GetByIdBasedOnQueryMixin(ABC):
     @classmethod

diff --git a/edpop_explorer/readers/fbtee.py b/edpop_explorer/readers/fbtee.py
@@ -92,11 +92,12 @@ def _add_fields(cls, record: BibliographicalRecord) -> None:
             # author is tuple of author code and author name
             record.contributors.append(Field(author[1]))
 
-    def fetch(self) -> None:
+    def fetch(self, number: Optional[int] = None) -> None:
         self.prepare_data()
         if not self.prepared_query:
             raise ReaderError('First call prepare_query method')
-
+        if self.fetching_exhausted:
+            return
         cur = self.con.cursor()
         columns = [x[1] for x in cur.execute('PRAGMA table_info(books)')]
         res = cur.execute(
@@ -133,7 +134,3 @@ def fetch(self) -> None:
             self._add_fields(record)
         self.number_of_results = len(self.records)
         self.number_fetched = self.number_of_results
-        self.fetching_exhausted = True
-
-    def fetch_next(self):
-        pass
diff --git a/edpop_explorer/readers/sbtireader.py b/edpop_explorer/readers/sbtireader.py
@@ -6,8 +6,6 @@
     Reader, Record, ReaderError, BiographicalRecord, Field
 )
 
-RECORDS_PER_PAGE = 10
-
 
 class SBTIReader(Reader):
     api_url = 'https://data.cerl.org/sbti/_search'
@@ -19,6 +17,7 @@ class SBTIReader(Reader):
         'https://edpop.hum.uu.nl/readers/sbti'
     )
     IRI_PREFIX = "https://edpop.hum.uu.nl/readers/sbti/"
+    DEFAULT_RECORDS_PER_PAGE = 10
 
     @classmethod
     def _get_name_field(cls, data: dict) -> Optional[Field]:
@@ -80,14 +79,17 @@ def _convert_record(cls, rawrecord: dict) -> BiographicalRecord:
 
         return record
 
-    def _perform_query(self, start_record: int) -> List[Record]:
+    def _perform_query(self, start_record: int, maximum_records: Optional[int]) -> List[Record]:
+        assert isinstance(self.prepared_query, str)
+        if maximum_records is None:
+            maximum_records = self.DEFAULT_RECORDS_PER_PAGE
         try:
             response = requests.get(
                 self.api_url,
                 params={
                     'query': self.prepared_query,
                     'from': start_record,
-                    'size': RECORDS_PER_PAGE,
+                    'size': maximum_records,
                     'mode': 'default',
                     'sort': 'default'
                 },
@@ -125,23 +127,13 @@ def transform_query(cls, query) -> str:
         # No transformation needed
         return query
 
-    def fetch(self) -> None:
-        self.records = []
+    def fetch(self, number: Optional[int] = None) -> None:
         if self.prepared_query is None:
             raise ReaderError('First call prepare_query')
-        results = self._perform_query(0)
-        self.records.extend(results)
-        self.number_fetched = len(self.records)
-        if self.number_fetched == self.number_of_results:
-            self.fetching_exhausted = True
-
-    def fetch_next(self) -> None:
-        # TODO: can be merged with fetch method
         if self.fetching_exhausted:
             return
-        start_record = len(self.records) + 1
-        results = self._perform_query(start_record)
+        start_record = len(self.records)
+        results = self._perform_query(start_record, number)
         self.records.extend(results)
         self.number_fetched = len(self.records)
-        if self.number_fetched == self.number_of_results:
-            self.fetching_exhausted = True
+
diff --git a/edpop_explorer/readers/ustc.py b/edpop_explorer/readers/ustc.py
@@ -66,14 +66,16 @@ def _prepare_get_by_id_query(cls, identifier: str) -> SQLPreparedQuery:
             arguments=[identifier_int]
         )
 
-    def fetch(self) -> None:
+    def fetch(self, number: Optional[int] = None) -> None:
         self.prepare_data()
 
         # This method fetches all records immediately, because the data is
         # locally stored.
 
         if not self.prepared_query:
             raise ReaderError('No query has been set')
+        if self.fetching_exhausted:
+            return
 
         cur = self.con.cursor()
         columns = [x[1] for x in cur.execute('PRAGMA table_info(editions)')]
@@ -95,10 +97,6 @@ def fetch(self) -> None:
             self.records.append(record)
         self.number_of_results = len(self.records)
         self.number_fetched = self.number_of_results
-        self.fetching_exhausted = True
-
-    def fetch_next(self):
-        pass
 
     def _convert_record(self, data: dict) -> BibliographicalRecord:
         record = BibliographicalRecord(from_reader=self.__class__)

diff --git a/edpop_explorer/sparqlreader.py b/edpop_explorer/sparqlreader.py
@@ -116,9 +116,11 @@ def transform_query(cls, query: str):
     def get_by_id(cls, identifier: str) -> Record:
         return cls._create_lazy_record(identifier)
 
-    def fetch(self):
+    def fetch(self, number: Optional[int] = None):
         if not self.prepared_query:
             raise ReaderError('First call prepare_query method')
+        if self.fetching_exhausted:
+            return
         wrapper = SPARQLWrapper(self.endpoint)
         wrapper.setReturnFormat(JSONFormat)
         wrapper.setQuery(self.prepared_query)
@@ -138,9 +140,6 @@ def fetch(self):
             self.records.append(self._create_lazy_record(iri, name))
         self.number_fetched = self.number_of_results
 
-    def fetch_next(self):
-        pass
-
     @classmethod
     @abstractmethod
     def _convert_record(cls, graph: Graph, record: Record) -> None:

diff --git a/edpop_explorer/srureader.py b/edpop_explorer/srureader.py
@@ -6,8 +6,6 @@
 from edpop_explorer import Reader, Record, ReaderError
 from edpop_explorer.reader import GetByIdBasedOnQueryMixin
 
-RECORDS_PER_PAGE = 10
-
 
 class SRUReader(GetByIdBasedOnQueryMixin, Reader):
     '''Subclass of ``Reader`` that adds basic SRU functionality
@@ -29,11 +27,14 @@ class SRUReader(GetByIdBasedOnQueryMixin, Reader):
     query: Optional[str] = None
     session: requests.Session
     '''The ``Session`` object of the ``requests`` library.'''
+    DEFAULT_RECORDS_PER_PAGE: int = 10
+    '''The number of records to fetch at a time if not determined by user.'''
 
     def __init__(self):
         # Set a session to allow reuse of HTTP sessions and to set additional
         # parameters and settings, which some SRU APIs require -
         # see https://github.com/metaodi/sruthi#custom-parameters-and-settings
+        super().__init__()
         self.session = requests.Session()
 
     @classmethod
@@ -52,13 +53,15 @@ def _convert_record(cls, sruthirecord: dict) -> Record:
     def _prepare_get_by_id_query(cls, identifier: str) -> str:
         return cls.transform_query(identifier)
 
-    def _perform_query(self, start_record: int) -> List[Record]:
+    def _perform_query(self, start_record: int, maximum_records: Optional[int]) -> List[Record]:
+        if maximum_records is None:
+            maximum_records = self.DEFAULT_RECORDS_PER_PAGE
         try:
             response = sruthi.searchretrieve(
                 self.sru_url,
                 self.prepared_query,
                 start_record=start_record,
-                maximum_records=RECORDS_PER_PAGE,
+                maximum_records=maximum_records,
                 sru_version=self.sru_version,
                 session=self.session
             )
@@ -70,27 +73,23 @@ def _perform_query(self, start_record: int) -> List[Record]:
         self.number_of_results = response.count
 
         records: List[Record] = []
-        for sruthirecord in response[0:RECORDS_PER_PAGE]:
+        for sruthirecord in response[0:maximum_records]:
             records.append(self._convert_record(sruthirecord))
 
         return records
 
     def prepare_query(self, query) -> None:
         self.prepared_query = self.transform_query(query)
 
-    def fetch(self) -> None:
-        self.records = []
+    def fetch(self, number: Optional[int] = None) -> None:
+        if self.records is None or self.number_fetched is None:
+            self.records = []
+            self.number_fetched = 0
+        if self.fetching_exhausted:
+            return
         if self.prepared_query is None:
             raise ReaderError('First call prepare_query')
-        results = self._perform_query(1)
-        self.records.extend(results)
-        self.number_fetched = len(self.records)
-
-    def fetch_next(self) -> None:
-        # TODO: can be merged with fetch method
-        if self.number_of_results == self.number_fetched:
-            return
-        start_record = len(self.records) + 1
-        results = self._perform_query(start_record)
+        start_number = self.number_fetched + 1  # SRU starts at 1
+        results = self._perform_query(start_number, number)
         self.records.extend(results)
         self.number_fetched = len(self.records)
diff --git a/tests/test_allreaders.py b/tests/test_allreaders.py
@@ -50,13 +50,17 @@ def test_catalog_to_graph(readercls: Type[Reader]):
 def test_realrequest(readercls: Type[Reader]):
     reader = readercls()
     reader.prepare_query("gruninger")
-    reader.fetch()
-    assert reader.number_fetched is not None
+    reader.fetch(5)
     assert reader.number_of_results is not None
     assert reader.number_fetched == len(reader.records)
+    if not reader.fetching_exhausted:
+        # Assert that maximum number of results is respected if reader does
+        # not fetch all results at once
+        assert reader.number_fetched <= 5
     assert reader.number_of_results >= reader.number_fetched
     if reader.number_fetched > 0:
         record = reader.records[0]
+        assert record is not None
         record.fetch()
         assert isinstance(record.to_graph(), Graph)
         # Take the IRI and check if searching by IRI gives
@@ -69,3 +73,20 @@ def test_realrequest(readercls: Type[Reader]):
             warnings.warn(
                 UserWarning(f"Record {record} has empty IRI")
             )
+    # Perform a second fetch
+    fetched_before = reader.number_fetched
+    reader.fetch()  # Do not pass number of results to test that as well
+    # If not all records had been fetched already, more records
+    # should be available now. Otherwise, nothing should have
+    # changed.
+    if fetched_before < reader.number_of_results:
+        assert reader.number_fetched > fetched_before
+        # Assert that the last record of the previous fetch is not the same as
+        # the first record of the current fetch
+        record1 = reader.records[fetched_before - 1]
+        record2 = reader.records[fetched_before]
+        assert record1 is not None and record2 is not None
+        if record1.identifier is not None:
+            assert record1.identifier != record2.identifier
+    else:
+        assert reader.number_fetched == fetched_before