Skip to content

Commit

Permalink
Implement adjustable maximum number of records per fetch
Browse files Browse the repository at this point in the history
  • Loading branch information
tijmenbaarda committed Jan 3, 2024
1 parent 43eac04 commit 1388d36
Show file tree
Hide file tree
Showing 6 changed files with 26 additions and 18 deletions.
2 changes: 1 addition & 1 deletion edpop_explorer/readers/fbtee.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ def _add_fields(cls, record: BibliographicalRecord) -> None:
# author is a tuple of (author code, author name)
record.contributors.append(Field(author[1]))

def fetch(self) -> None:
def fetch(self, number: Optional[int] = None) -> None:
self.prepare_data()
if not self.prepared_query:
raise ReaderError('First call prepare_query method')
Expand Down
14 changes: 8 additions & 6 deletions edpop_explorer/readers/sbtireader.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,6 @@
Reader, Record, ReaderError, BiographicalRecord, Field
)

RECORDS_PER_PAGE = 10


class SBTIReader(Reader):
api_url = 'https://data.cerl.org/sbti/_search'
Expand All @@ -19,6 +17,7 @@ class SBTIReader(Reader):
'https://edpop.hum.uu.nl/readers/sbti'
)
IRI_PREFIX = "https://edpop.hum.uu.nl/readers/sbti/"
DEFAULT_RECORDS_PER_PAGE = 10

@classmethod
def _get_name_field(cls, data: dict) -> Optional[Field]:
Expand Down Expand Up @@ -80,14 +79,17 @@ def _convert_record(cls, rawrecord: dict) -> BiographicalRecord:

return record

def _perform_query(self, start_record: int) -> List[Record]:
def _perform_query(self, start_record: int, maximum_records: Optional[int]) -> List[Record]:
assert isinstance(self.prepared_query, str)
if maximum_records is None:
maximum_records = self.DEFAULT_RECORDS_PER_PAGE
try:
response = requests.get(
self.api_url,
params={
'query': self.prepared_query,
'from': start_record,
'size': RECORDS_PER_PAGE,
'size': maximum_records,
'mode': 'default',
'sort': 'default'
},
Expand Down Expand Up @@ -125,13 +127,13 @@ def transform_query(cls, query) -> str:
# No transformation needed
return query

def fetch(self) -> None:
def fetch(self, number: Optional[int] = None) -> None:
if self.prepared_query is None:
raise ReaderError('First call prepare_query')
if self.fetching_exhausted:
return
start_record = len(self.records)
results = self._perform_query(start_record)
results = self._perform_query(start_record, number)
self.records.extend(results)
self.number_fetched = len(self.records)

2 changes: 1 addition & 1 deletion edpop_explorer/readers/ustc.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def _prepare_get_by_id_query(cls, identifier: str) -> SQLPreparedQuery:
arguments=[identifier_int]
)

def fetch(self) -> None:
def fetch(self, number: Optional[int] = None) -> None:
self.prepare_data()

# This method fetches all records immediately, because the data is
Expand Down
2 changes: 1 addition & 1 deletion edpop_explorer/sparqlreader.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ def transform_query(cls, query: str):
def get_by_id(cls, identifier: str) -> Record:
return cls._create_lazy_record(identifier)

def fetch(self):
def fetch(self, number: Optional[int] = None):
if not self.prepared_query:
raise ReaderError('First call prepare_query method')
if self.fetching_exhausted:
Expand Down
16 changes: 9 additions & 7 deletions edpop_explorer/srureader.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,6 @@
from edpop_explorer import Reader, Record, ReaderError
from edpop_explorer.reader import GetByIdBasedOnQueryMixin

RECORDS_PER_PAGE = 10


class SRUReader(GetByIdBasedOnQueryMixin, Reader):
'''Subclass of ``Reader`` that adds basic SRU functionality
Expand All @@ -29,6 +27,8 @@ class SRUReader(GetByIdBasedOnQueryMixin, Reader):
query: Optional[str] = None
session: requests.Session
'''The ``Session`` object of the ``requests`` library.'''
DEFAULT_RECORDS_PER_PAGE: int = 10
'''The number of records to fetch at a time if not determined by user.'''

def __init__(self):
# Set a session to allow reuse of HTTP sessions and to set additional
Expand All @@ -53,13 +53,15 @@ def _convert_record(cls, sruthirecord: dict) -> Record:
def _prepare_get_by_id_query(cls, identifier: str) -> str:
return cls.transform_query(identifier)

def _perform_query(self, start_record: int) -> List[Record]:
def _perform_query(self, start_record: int, maximum_records: Optional[int]) -> List[Record]:
if maximum_records is None:
maximum_records = self.DEFAULT_RECORDS_PER_PAGE
try:
response = sruthi.searchretrieve(
self.sru_url,
self.prepared_query,
start_record=start_record,
maximum_records=RECORDS_PER_PAGE,
maximum_records=maximum_records,
sru_version=self.sru_version,
session=self.session
)
Expand All @@ -71,15 +73,15 @@ def _perform_query(self, start_record: int) -> List[Record]:
self.number_of_results = response.count

records: List[Record] = []
for sruthirecord in response[0:RECORDS_PER_PAGE]:
for sruthirecord in response[0:maximum_records]:
records.append(self._convert_record(sruthirecord))

return records

def prepare_query(self, query) -> None:
self.prepared_query = self.transform_query(query)

def fetch(self) -> None:
def fetch(self, number: Optional[int] = None) -> None:
if self.records is None or self.number_fetched is None:
self.records = []
self.number_fetched = 0
Expand All @@ -88,6 +90,6 @@ def fetch(self) -> None:
if self.prepared_query is None:
raise ReaderError('First call prepare_query')
start_number = self.number_fetched + 1 # SRU starts at 1
results = self._perform_query(start_number)
results = self._perform_query(start_number, number)
self.records.extend(results)
self.number_fetched = len(self.records)
8 changes: 6 additions & 2 deletions tests/test_allreaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,13 @@ def test_catalog_to_graph(readercls: Type[Reader]):
def test_realrequest(readercls: Type[Reader]):
reader = readercls()
reader.prepare_query("gruninger")
reader.fetch()
reader.fetch(5)
assert reader.number_of_results is not None
assert reader.number_fetched == len(reader.records)
if not reader.fetching_exhausted:
# Assert that the maximum number of results is respected if the reader
# does not fetch all results at once
assert reader.number_fetched <= 5
assert reader.number_of_results >= reader.number_fetched
if reader.number_fetched > 0:
record = reader.records[0]
Expand All @@ -71,7 +75,7 @@ def test_realrequest(readercls: Type[Reader]):
)
# Perform a second fetch
fetched_before = reader.number_fetched
reader.fetch()
reader.fetch() # Omit the number argument to test the default behaviour as well
# If not all records had been fetched already, more records
# should be available now. Otherwise, nothing should have
# changed.
Expand Down

0 comments on commit 1388d36

Please sign in to comment.