diff --git a/edpop_explorer/edpopxshell.py b/edpop_explorer/edpopxshell.py index e06a8f9..4bd379b 100644 --- a/edpop_explorer/edpopxshell.py +++ b/edpop_explorer/edpopxshell.py @@ -74,7 +74,7 @@ def do_next(self, args) -> None: self.perror('All records have been shown') else: if self.reader.number_fetched - self.shown < self.RECORDS_PER_PAGE: - self.reader.fetch_next() + self.reader.fetch() self.shown += self._show_records(self.reader.records, self.shown, self.RECORDS_PER_PAGE) @@ -195,7 +195,7 @@ def do_kb(self, args) -> None: 'Koninklijke Bibliotheek' self._query(KBReader, args) - def _show_records(self, records: List[Record], + def _show_records(self, records: List[Optional[Record]], start: int, limit=math.inf) -> int: """Show the records from start, with limit as the maximum number diff --git a/edpop_explorer/reader.py b/edpop_explorer/reader.py index b6e2c9e..7237f05 100644 --- a/edpop_explorer/reader.py +++ b/edpop_explorer/reader.py @@ -2,6 +2,7 @@ from dataclasses import dataclass from typing import Optional, List, Union from rdflib import Graph, RDF, URIRef +from urllib.parse import quote, unquote from edpop_explorer import ( EDPOPREC, BIBLIOGRAPHICAL, BIOGRAPHICAL, bind_common_namespaces @@ -38,9 +39,10 @@ class Reader(ABC): number_of_results: Optional[int] = None '''The total number of results for the query, including those that have not been fetched yet.''' - number_fetched: Optional[int] = None - '''The number of results that has been fetched so far.''' - records: List[Record] + number_fetched: int = 0 + '''The number of results that has been fetched so far, or 0 if + no fetch has been performed yet.''' + records: List[Optional[Record]] '''The records that have been fetched as instances of (a subclass of) ``Record``.''' prepared_query: Optional[PreparedQueryType] = None @@ -57,6 +59,9 @@ class Reader(ABC): overridden.''' _graph: Optional[Graph] = None + def __init__(self): + self.records = [] + @classmethod @abstractmethod def transform_query(cls, query: str) -> PreparedQueryType: @@ -77,14 +82,35 @@ def set_query(self, query: PreparedQueryType) -> None: attribute.''' self.prepared_query = query - @abstractmethod - def fetch(self): - '''Perform an initial query.''' - pass + def adjust_start_record(self, start_number: int) -> None: + """Skip the given number of first records and start fetching + afterwards. Should be calling before the first time calling + ``fetch()``. The missing records in the ``records`` attribute + will be filled by ``None``s. The ``number_fetched`` attribute + will be adjusted as if the first records have been fetched. + This is mainly useful if the skipped records have already been + fetched but the original ``Reader`` object is not available anymore. + This functionality may be ignored by readers that can only load + all records at once; generally these are readers that return lazy + records.""" + if self.number_of_results is not None: + raise ReaderError( + "adjust_start_record should not be called after fetching." + ) + self.number_fetched = start_number + self.records = [None for _ in range(start_number)] @abstractmethod - def fetch_next(self): - '''Perform a subsequental query.''' + def fetch( + self, number: Optional[int] = None + ): + '''Perform an initial or subsequent query. Most readers fetch + a limited number of records at once -- this number depends on + the reader but it may be adjusted using the ``number`` argument. + Other readers fetch all records at once and ignore the ``number`` + argument. After fetching, the ``records`` and ``number_fetched`` + attributes are adjusted and the ``number_of_results`` attribute + will be available.''' pass @classmethod @@ -106,7 +132,7 @@ def identifier_to_iri(cls, identifier: str) -> str: f"Cannot convert identifier to IRI: {__class__}.IRI_PREFIX " "not a string." ) - return cls.IRI_PREFIX + identifier + return cls.IRI_PREFIX + quote(identifier) @classmethod def iri_to_identifier(cls, iri: str) -> str: @@ -116,7 +142,7 @@ def iri_to_identifier(cls, iri: str) -> str: "not a string." ) if iri.startswith(cls.IRI_PREFIX): - return iri[len(cls.IRI_PREFIX):] + return unquote(iri[len(cls.IRI_PREFIX):]) else: raise ReaderError( f"Cannot convert IRI {iri} to identifier: IRI does not start " @@ -147,6 +173,13 @@ def catalog_to_graph(cls) -> Graph: return g + @property + def fetching_exhausted(self) -> bool: + """Return ``True`` if all results have been fetched. This is currently + implemented by simply checking if the ``number_of_results`` and + ``number_fetched`` attributes are equal.""" + return self.number_fetched == self.number_of_results + class GetByIdBasedOnQueryMixin(ABC): @classmethod diff --git a/edpop_explorer/readers/fbtee.py b/edpop_explorer/readers/fbtee.py index 02fa9b1..7a3e532 100644 --- a/edpop_explorer/readers/fbtee.py +++ b/edpop_explorer/readers/fbtee.py @@ -92,11 +92,12 @@ def _add_fields(cls, record: BibliographicalRecord) -> None: # author is tuple of author code and author name record.contributors.append(Field(author[1])) - def fetch(self) -> None: + def fetch(self, number: Optional[int] = None) -> None: self.prepare_data() if not self.prepared_query: raise ReaderError('First call prepare_query method') - + if self.fetching_exhausted: + return cur = self.con.cursor() columns = [x[1] for x in cur.execute('PRAGMA table_info(books)')] res = cur.execute( @@ -133,7 +134,3 @@ def fetch(self) -> None: self._add_fields(record) self.number_of_results = len(self.records) self.number_fetched = self.number_of_results - self.fetching_exhausted = True - - def fetch_next(self): - pass diff --git a/edpop_explorer/readers/sbtireader.py b/edpop_explorer/readers/sbtireader.py index aeff797..237722a 100644 --- a/edpop_explorer/readers/sbtireader.py +++ b/edpop_explorer/readers/sbtireader.py @@ -6,8 +6,6 @@ Reader, Record, ReaderError, BiographicalRecord, Field ) -RECORDS_PER_PAGE = 10 - class SBTIReader(Reader): api_url = 'https://data.cerl.org/sbti/_search' @@ -19,6 +17,7 @@ class SBTIReader(Reader): 'https://edpop.hum.uu.nl/readers/sbti' ) IRI_PREFIX = "https://edpop.hum.uu.nl/readers/sbti/" + DEFAULT_RECORDS_PER_PAGE = 10 @classmethod def _get_name_field(cls, data: dict) -> Optional[Field]: @@ -80,14 +79,17 @@ def _convert_record(cls, rawrecord: dict) -> BiographicalRecord: return record - def _perform_query(self, start_record: int) -> List[Record]: + def _perform_query(self, start_record: int, maximum_records: Optional[int]) -> List[Record]: + assert isinstance(self.prepared_query, str) + if maximum_records is None: + maximum_records = self.DEFAULT_RECORDS_PER_PAGE try: response = requests.get( self.api_url, params={ 'query': self.prepared_query, 'from': start_record, - 'size': RECORDS_PER_PAGE, + 'size': maximum_records, 'mode': 'default', 'sort': 'default' }, @@ -125,23 +127,13 @@ def transform_query(cls, query) -> str: # No transformation needed return query - def fetch(self) -> None: - self.records = [] + def fetch(self, number: Optional[int] = None) -> None: if self.prepared_query is None: raise ReaderError('First call prepare_query') - results = self._perform_query(0) - self.records.extend(results) - self.number_fetched = len(self.records) - if self.number_fetched == self.number_of_results: - self.fetching_exhausted = True - - def fetch_next(self) -> None: - # TODO: can be merged with fetch method if self.fetching_exhausted: return - start_record = len(self.records) + 1 - results = self._perform_query(start_record) + start_record = len(self.records) + results = self._perform_query(start_record, number) self.records.extend(results) self.number_fetched = len(self.records) - if self.number_fetched == self.number_of_results: - self.fetching_exhausted = True + diff --git a/edpop_explorer/readers/ustc.py b/edpop_explorer/readers/ustc.py index dc4229c..a4a1c94 100644 --- a/edpop_explorer/readers/ustc.py +++ b/edpop_explorer/readers/ustc.py @@ -66,7 +66,7 @@ def _prepare_get_by_id_query(cls, identifier: str) -> SQLPreparedQuery: arguments=[identifier_int] ) - def fetch(self) -> None: + def fetch(self, number: Optional[int] = None) -> None: self.prepare_data() # This method fetches all records immediately, because the data is @@ -74,6 +74,8 @@ def fetch(self) -> None: if not self.prepared_query: raise ReaderError('No query has been set') + if self.fetching_exhausted: + return cur = self.con.cursor() columns = [x[1] for x in cur.execute('PRAGMA table_info(editions)')] @@ -95,10 +97,6 @@ def fetch(self) -> None: self.records.append(record) self.number_of_results = len(self.records) self.number_fetched = self.number_of_results - self.fetching_exhausted = True - - def fetch_next(self): - pass def _convert_record(self, data: dict) -> BibliographicalRecord: record = BibliographicalRecord(from_reader=self.__class__) diff --git a/edpop_explorer/sparqlreader.py b/edpop_explorer/sparqlreader.py index 53d5b63..4d55656 100644 --- a/edpop_explorer/sparqlreader.py +++ b/edpop_explorer/sparqlreader.py @@ -116,9 +116,11 @@ def transform_query(cls, query: str): def get_by_id(cls, identifier: str) -> Record: return cls._create_lazy_record(identifier) - def fetch(self): + def fetch(self, number: Optional[int] = None): if not self.prepared_query: raise ReaderError('First call prepare_query method') + if self.fetching_exhausted: + return wrapper = SPARQLWrapper(self.endpoint) wrapper.setReturnFormat(JSONFormat) wrapper.setQuery(self.prepared_query) @@ -138,9 +140,6 @@ def fetch(self): self.records.append(self._create_lazy_record(iri, name)) self.number_fetched = self.number_of_results - def fetch_next(self): - pass - @classmethod @abstractmethod def _convert_record(cls, graph: Graph, record: Record) -> None: diff --git a/edpop_explorer/srureader.py b/edpop_explorer/srureader.py index 55a9972..bcde43f 100644 --- a/edpop_explorer/srureader.py +++ b/edpop_explorer/srureader.py @@ -6,8 +6,6 @@ from edpop_explorer import Reader, Record, ReaderError from edpop_explorer.reader import GetByIdBasedOnQueryMixin -RECORDS_PER_PAGE = 10 - class SRUReader(GetByIdBasedOnQueryMixin, Reader): '''Subclass of ``Reader`` that adds basic SRU functionality @@ -29,11 +27,14 @@ class SRUReader(GetByIdBasedOnQueryMixin, Reader): query: Optional[str] = None session: requests.Session '''The ``Session`` object of the ``requests`` library.''' + DEFAULT_RECORDS_PER_PAGE: int = 10 + '''The number of records to fetch at a time if not determined by user.''' def __init__(self): # Set a session to allow reuse of HTTP sessions and to set additional # parameters and settings, which some SRU APIs require - # see https://github.com/metaodi/sruthi#custom-parameters-and-settings + super().__init__() self.session = requests.Session() @classmethod @@ -52,13 +53,15 @@ def _convert_record(cls, sruthirecord: dict) -> Record: def _prepare_get_by_id_query(cls, identifier: str) -> str: return cls.transform_query(identifier) - def _perform_query(self, start_record: int) -> List[Record]: + def _perform_query(self, start_record: int, maximum_records: Optional[int]) -> List[Record]: + if maximum_records is None: + maximum_records = self.DEFAULT_RECORDS_PER_PAGE try: response = sruthi.searchretrieve( self.sru_url, self.prepared_query, start_record=start_record, - maximum_records=RECORDS_PER_PAGE, + maximum_records=maximum_records, sru_version=self.sru_version, session=self.session ) @@ -70,7 +73,7 @@ def _perform_query(self, start_record: int) -> List[Record]: self.number_of_results = response.count records: List[Record] = [] - for sruthirecord in response[0:RECORDS_PER_PAGE]: + for sruthirecord in response[0:maximum_records]: records.append(self._convert_record(sruthirecord)) return records @@ -78,19 +81,15 @@ def _perform_query(self, start_record: int) -> List[Record]: def prepare_query(self, query) -> None: self.prepared_query = self.transform_query(query) - def fetch(self) -> None: - self.records = [] + def fetch(self, number: Optional[int] = None) -> None: + if self.records is None or self.number_fetched is None: + self.records = [] + self.number_fetched = 0 + if self.fetching_exhausted: + return if self.prepared_query is None: raise ReaderError('First call prepare_query') - results = self._perform_query(1) - self.records.extend(results) - self.number_fetched = len(self.records) - - def fetch_next(self) -> None: - # TODO: can be merged with fetch method - if self.number_of_results == self.number_fetched: - return - start_record = len(self.records) + 1 - results = self._perform_query(start_record) + start_number = self.number_fetched + 1 # SRU starts at 1 + results = self._perform_query(start_number, number) self.records.extend(results) self.number_fetched = len(self.records) diff --git a/tests/test_allreaders.py b/tests/test_allreaders.py index 8e0e4a2..0ce1485 100644 --- a/tests/test_allreaders.py +++ b/tests/test_allreaders.py @@ -50,13 +50,17 @@ def test_catalog_to_graph(readercls: Type[Reader]): def test_realrequest(readercls: Type[Reader]): reader = readercls() reader.prepare_query("gruninger") - reader.fetch() - assert reader.number_fetched is not None + reader.fetch(5) assert reader.number_of_results is not None assert reader.number_fetched == len(reader.records) + if not reader.fetching_exhausted: + # Assert that maximum number of results is respected if reader does + # not fetch all results at once + assert reader.number_fetched <= 5 assert reader.number_of_results >= reader.number_fetched if reader.number_fetched > 0: record = reader.records[0] + assert record is not None record.fetch() assert isinstance(record.to_graph(), Graph) # Take the IRI and check if searching by IRI gives @@ -69,3 +73,20 @@ def test_realrequest(readercls: Type[Reader]): warnings.warn( UserWarning(f"Record {record} has empty IRI") ) + # Perform a second fetch + fetched_before = reader.number_fetched + reader.fetch() # Do not pass number of results to test that as well + # If not all records had been fetched already, more records + # should be available now. Otherwise, nothing should have + # changed. + if fetched_before < reader.number_of_results: + assert reader.number_fetched > fetched_before + # Assert that the last record of the previous fetch is not the same as + # the first record of the current fetch + record1 = reader.records[fetched_before - 1] + record2 = reader.records[fetched_before] + assert record1 is not None and record2 is not None + if record1.identifier is not None: + assert record1.identifier != record2.identifier + else: + assert reader.number_fetched == fetched_before