Skip to content

Commit

Permalink
Merge pull request #27 from UUDigitalHumanitieslab/feature/fetchinter…
Browse files Browse the repository at this point in the history
…face

Adjustments to fetch API
  • Loading branch information
tijmenbaarda authored Jan 31, 2024
2 parents e3ed561 + e8c7ecd commit 48c6206
Show file tree
Hide file tree
Showing 8 changed files with 104 additions and 65 deletions.
4 changes: 2 additions & 2 deletions edpop_explorer/edpopxshell.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ def do_next(self, args) -> None:
self.perror('All records have been shown')
else:
if self.reader.number_fetched - self.shown < self.RECORDS_PER_PAGE:
self.reader.fetch_next()
self.reader.fetch()
self.shown += self._show_records(self.reader.records,
self.shown,
self.RECORDS_PER_PAGE)
Expand Down Expand Up @@ -195,7 +195,7 @@ def do_kb(self, args) -> None:
'Koninklijke Bibliotheek'
self._query(KBReader, args)

def _show_records(self, records: List[Record],
def _show_records(self, records: List[Optional[Record]],
start: int,
limit=math.inf) -> int:
"""Show the records from start, with limit as the maximum number
Expand Down
55 changes: 44 additions & 11 deletions edpop_explorer/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from dataclasses import dataclass
from typing import Optional, List, Union
from rdflib import Graph, RDF, URIRef
from urllib.parse import quote, unquote

from edpop_explorer import (
EDPOPREC, BIBLIOGRAPHICAL, BIOGRAPHICAL, bind_common_namespaces
Expand Down Expand Up @@ -38,9 +39,10 @@ class Reader(ABC):
number_of_results: Optional[int] = None
'''The total number of results for the query, including those
that have not been fetched yet.'''
number_fetched: Optional[int] = None
'''The number of results that has been fetched so far.'''
records: List[Record]
number_fetched: int = 0
'''The number of results that has been fetched so far, or 0 if
no fetch has been performed yet.'''
records: List[Optional[Record]]
'''The records that have been fetched as instances of
(a subclass of) ``Record``.'''
prepared_query: Optional[PreparedQueryType] = None
Expand All @@ -57,6 +59,9 @@ class Reader(ABC):
overridden.'''
_graph: Optional[Graph] = None

def __init__(self):
self.records = []

@classmethod
@abstractmethod
def transform_query(cls, query: str) -> PreparedQueryType:
Expand All @@ -77,14 +82,35 @@ def set_query(self, query: PreparedQueryType) -> None:
attribute.'''
self.prepared_query = query

@abstractmethod
def fetch(self):
'''Perform an initial query.'''
pass
def adjust_start_record(self, start_number: int) -> None:
"""Skip the given number of first records and start fetching
afterwards. Should be calling before the first time calling
``fetch()``. The missing records in the ``records`` attribute
will be filled by ``None``s. The ``number_fetched`` attribute
will be adjusted as if the first records have been fetched.
This is mainly useful if the skipped records have already been
fetched but the original ``Reader`` object is not available anymore.
This functionality may be ignored by readers that can only load
all records at once; generally these are readers that return lazy
records."""
if self.number_of_results is not None:
raise ReaderError(
"adjust_start_record should not be called after fetching."
)
self.number_fetched = start_number
self.records = [None for _ in range(start_number)]

@abstractmethod
def fetch_next(self):
'''Perform a subsequental query.'''
def fetch(
self, number: Optional[int] = None
):
'''Perform an initial or subsequent query. Most readers fetch
a limited number of records at once -- this number depends on
the reader but it may be adjusted using the ``number`` argument.
Other readers fetch all records at once and ignore the ``number``
argument. After fetching, the ``records`` and ``number_fetched``
attributes are adjusted and the ``number_of_results`` attribute
will be available.'''
pass

@classmethod
Expand All @@ -106,7 +132,7 @@ def identifier_to_iri(cls, identifier: str) -> str:
f"Cannot convert identifier to IRI: {__class__}.IRI_PREFIX "
"not a string."
)
return cls.IRI_PREFIX + identifier
return cls.IRI_PREFIX + quote(identifier)

@classmethod
def iri_to_identifier(cls, iri: str) -> str:
Expand All @@ -116,7 +142,7 @@ def iri_to_identifier(cls, iri: str) -> str:
"not a string."
)
if iri.startswith(cls.IRI_PREFIX):
return iri[len(cls.IRI_PREFIX):]
return unquote(iri[len(cls.IRI_PREFIX):])
else:
raise ReaderError(
f"Cannot convert IRI {iri} to identifier: IRI does not start "
Expand Down Expand Up @@ -147,6 +173,13 @@ def catalog_to_graph(cls) -> Graph:

return g

@property
def fetching_exhausted(self) -> bool:
"""Return ``True`` if all results have been fetched. This is currently
implemented by simply checking if the ``number_of_results`` and
``number_fetched`` attributes are equal."""
return self.number_fetched == self.number_of_results


class GetByIdBasedOnQueryMixin(ABC):
@classmethod
Expand Down
9 changes: 3 additions & 6 deletions edpop_explorer/readers/fbtee.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,11 +92,12 @@ def _add_fields(cls, record: BibliographicalRecord) -> None:
# author is tuple of author code and author name
record.contributors.append(Field(author[1]))

def fetch(self) -> None:
def fetch(self, number: Optional[int] = None) -> None:
self.prepare_data()
if not self.prepared_query:
raise ReaderError('First call prepare_query method')

if self.fetching_exhausted:
return
cur = self.con.cursor()
columns = [x[1] for x in cur.execute('PRAGMA table_info(books)')]
res = cur.execute(
Expand Down Expand Up @@ -133,7 +134,3 @@ def fetch(self) -> None:
self._add_fields(record)
self.number_of_results = len(self.records)
self.number_fetched = self.number_of_results
self.fetching_exhausted = True

def fetch_next(self):
pass
28 changes: 10 additions & 18 deletions edpop_explorer/readers/sbtireader.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,6 @@
Reader, Record, ReaderError, BiographicalRecord, Field
)

RECORDS_PER_PAGE = 10


class SBTIReader(Reader):
api_url = 'https://data.cerl.org/sbti/_search'
Expand All @@ -19,6 +17,7 @@ class SBTIReader(Reader):
'https://edpop.hum.uu.nl/readers/sbti'
)
IRI_PREFIX = "https://edpop.hum.uu.nl/readers/sbti/"
DEFAULT_RECORDS_PER_PAGE = 10

@classmethod
def _get_name_field(cls, data: dict) -> Optional[Field]:
Expand Down Expand Up @@ -80,14 +79,17 @@ def _convert_record(cls, rawrecord: dict) -> BiographicalRecord:

return record

def _perform_query(self, start_record: int) -> List[Record]:
def _perform_query(self, start_record: int, maximum_records: Optional[int]) -> List[Record]:
assert isinstance(self.prepared_query, str)
if maximum_records is None:
maximum_records = self.DEFAULT_RECORDS_PER_PAGE
try:
response = requests.get(
self.api_url,
params={
'query': self.prepared_query,
'from': start_record,
'size': RECORDS_PER_PAGE,
'size': maximum_records,
'mode': 'default',
'sort': 'default'
},
Expand Down Expand Up @@ -125,23 +127,13 @@ def transform_query(cls, query) -> str:
# No transformation needed
return query

def fetch(self) -> None:
self.records = []
def fetch(self, number: Optional[int] = None) -> None:
if self.prepared_query is None:
raise ReaderError('First call prepare_query')
results = self._perform_query(0)
self.records.extend(results)
self.number_fetched = len(self.records)
if self.number_fetched == self.number_of_results:
self.fetching_exhausted = True

def fetch_next(self) -> None:
# TODO: can be merged with fetch method
if self.fetching_exhausted:
return
start_record = len(self.records) + 1
results = self._perform_query(start_record)
start_record = len(self.records)
results = self._perform_query(start_record, number)
self.records.extend(results)
self.number_fetched = len(self.records)
if self.number_fetched == self.number_of_results:
self.fetching_exhausted = True

8 changes: 3 additions & 5 deletions edpop_explorer/readers/ustc.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,14 +66,16 @@ def _prepare_get_by_id_query(cls, identifier: str) -> SQLPreparedQuery:
arguments=[identifier_int]
)

def fetch(self) -> None:
def fetch(self, number: Optional[int] = None) -> None:
self.prepare_data()

# This method fetches all records immediately, because the data is
# locally stored.

if not self.prepared_query:
raise ReaderError('No query has been set')
if self.fetching_exhausted:
return

cur = self.con.cursor()
columns = [x[1] for x in cur.execute('PRAGMA table_info(editions)')]
Expand All @@ -95,10 +97,6 @@ def fetch(self) -> None:
self.records.append(record)
self.number_of_results = len(self.records)
self.number_fetched = self.number_of_results
self.fetching_exhausted = True

def fetch_next(self):
pass

def _convert_record(self, data: dict) -> BibliographicalRecord:
record = BibliographicalRecord(from_reader=self.__class__)
Expand Down
7 changes: 3 additions & 4 deletions edpop_explorer/sparqlreader.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,9 +116,11 @@ def transform_query(cls, query: str):
def get_by_id(cls, identifier: str) -> Record:
return cls._create_lazy_record(identifier)

def fetch(self):
def fetch(self, number: Optional[int] = None):
if not self.prepared_query:
raise ReaderError('First call prepare_query method')
if self.fetching_exhausted:
return
wrapper = SPARQLWrapper(self.endpoint)
wrapper.setReturnFormat(JSONFormat)
wrapper.setQuery(self.prepared_query)
Expand All @@ -138,9 +140,6 @@ def fetch(self):
self.records.append(self._create_lazy_record(iri, name))
self.number_fetched = self.number_of_results

def fetch_next(self):
pass

@classmethod
@abstractmethod
def _convert_record(cls, graph: Graph, record: Record) -> None:
Expand Down
33 changes: 16 additions & 17 deletions edpop_explorer/srureader.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,6 @@
from edpop_explorer import Reader, Record, ReaderError
from edpop_explorer.reader import GetByIdBasedOnQueryMixin

RECORDS_PER_PAGE = 10


class SRUReader(GetByIdBasedOnQueryMixin, Reader):
'''Subclass of ``Reader`` that adds basic SRU functionality
Expand All @@ -29,11 +27,14 @@ class SRUReader(GetByIdBasedOnQueryMixin, Reader):
query: Optional[str] = None
session: requests.Session
'''The ``Session`` object of the ``requests`` library.'''
DEFAULT_RECORDS_PER_PAGE: int = 10
'''The number of records to fetch at a time if not determined by user.'''

def __init__(self):
# Set a session to allow reuse of HTTP sessions and to set additional
# parameters and settings, which some SRU APIs require -
# see https://github.com/metaodi/sruthi#custom-parameters-and-settings
super().__init__()
self.session = requests.Session()

@classmethod
Expand All @@ -52,13 +53,15 @@ def _convert_record(cls, sruthirecord: dict) -> Record:
def _prepare_get_by_id_query(cls, identifier: str) -> str:
return cls.transform_query(identifier)

def _perform_query(self, start_record: int) -> List[Record]:
def _perform_query(self, start_record: int, maximum_records: Optional[int]) -> List[Record]:
if maximum_records is None:
maximum_records = self.DEFAULT_RECORDS_PER_PAGE
try:
response = sruthi.searchretrieve(
self.sru_url,
self.prepared_query,
start_record=start_record,
maximum_records=RECORDS_PER_PAGE,
maximum_records=maximum_records,
sru_version=self.sru_version,
session=self.session
)
Expand All @@ -70,27 +73,23 @@ def _perform_query(self, start_record: int) -> List[Record]:
self.number_of_results = response.count

records: List[Record] = []
for sruthirecord in response[0:RECORDS_PER_PAGE]:
for sruthirecord in response[0:maximum_records]:
records.append(self._convert_record(sruthirecord))

return records

def prepare_query(self, query) -> None:
self.prepared_query = self.transform_query(query)

def fetch(self) -> None:
self.records = []
def fetch(self, number: Optional[int] = None) -> None:
if self.records is None or self.number_fetched is None:
self.records = []
self.number_fetched = 0
if self.fetching_exhausted:
return
if self.prepared_query is None:
raise ReaderError('First call prepare_query')
results = self._perform_query(1)
self.records.extend(results)
self.number_fetched = len(self.records)

def fetch_next(self) -> None:
# TODO: can be merged with fetch method
if self.number_of_results == self.number_fetched:
return
start_record = len(self.records) + 1
results = self._perform_query(start_record)
start_number = self.number_fetched + 1 # SRU starts at 1
results = self._perform_query(start_number, number)
self.records.extend(results)
self.number_fetched = len(self.records)
25 changes: 23 additions & 2 deletions tests/test_allreaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,13 +50,17 @@ def test_catalog_to_graph(readercls: Type[Reader]):
def test_realrequest(readercls: Type[Reader]):
reader = readercls()
reader.prepare_query("gruninger")
reader.fetch()
assert reader.number_fetched is not None
reader.fetch(5)
assert reader.number_of_results is not None
assert reader.number_fetched == len(reader.records)
if not reader.fetching_exhausted:
# Assert that maximum number of results is respected if reader does
# not fetch all results at once
assert reader.number_fetched <= 5
assert reader.number_of_results >= reader.number_fetched
if reader.number_fetched > 0:
record = reader.records[0]
assert record is not None
record.fetch()
assert isinstance(record.to_graph(), Graph)
# Take the IRI and check if searching by IRI gives
Expand All @@ -69,3 +73,20 @@ def test_realrequest(readercls: Type[Reader]):
warnings.warn(
UserWarning(f"Record {record} has empty IRI")
)
# Perform a second fetch
fetched_before = reader.number_fetched
reader.fetch() # Do not pass number of results to test that as well
# If not all records had been fetched already, more records
# should be available now. Otherwise, nothing should have
# changed.
if fetched_before < reader.number_of_results:
assert reader.number_fetched > fetched_before
# Assert that the last record of the previous fetch is not the same as
# the first record of the current fetch
record1 = reader.records[fetched_before - 1]
record2 = reader.records[fetched_before]
assert record1 is not None and record2 is not None
if record1.identifier is not None:
assert record1.identifier != record2.identifier
else:
assert reader.number_fetched == fetched_before

0 comments on commit 48c6206

Please sign in to comment.