From 8f84a64901758644e3140ec48cc7436d89ab5e68 Mon Sep 17 00:00:00 2001
From: Robert Forkel
Date: Sat, 15 Feb 2025 16:13:31 +0100
Subject: [PATCH] docs

---
 CHANGELOG.md          |  1 +
 README.md             | 26 ++++++++++++++--------
 src/pycldf/dataset.py | 50 ++++++++++++++++++++++++++++++++-----------
 src/pycldf/media.py   | 10 +++++++--
 src/pycldf/util.py    | 21 ++++++++++++++++--
 tests/test_util.py    | 15 +++++++++++++
 6 files changed, 98 insertions(+), 25 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index b302ec4..524af60 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,6 +6,7 @@ The `pycldf` package adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
 ## Unreleased
 
 - Added a utility function to query SQLite DBs using user-defined functions, aggregates or collations.
+- Fixed a bug whereby validation of remote datasets specified by the URL of the metadata file did not work.
 
 ## [1.40.4] - 2025-01-15
 
diff --git a/README.md b/README.md
index aaf35e3..82c6057 100644
--- a/README.md
+++ b/README.md
@@ -69,13 +69,21 @@ sources.bib Sources 2000
 ### Summary statistics
 
 ```shell
-$ cldf stats mydataset/Wordlist-metadata.json
-
-
-Path                   Type        Rows
----------------------  ----------  ------
-forms.csv              Form Table       1
-mydataset/sources.bib  Sources          1
+$ cldf stats tests/data/wordlist_with_cognates/metadata.json
+
+               value
+-------------  --------------------------------------------
+dc:conformsTo  http://cldf.clld.org/v1.0/terms.rdf#Wordlist
+dc:source      sources.bib
+
+                 Type             Rows
+---------------  ---------------  ------
+languages.csv    LanguageTable         2
+parameters.csv   ParameterTable        2
+forms.csv        FormTable             3
+cognates.csv     CognateTable          2
+cognatesets.csv  CognatesetTable       1
+sources.bib      Sources               1
 ```
 
 
@@ -159,7 +167,7 @@ provides a pragmatic solution as follows:
 
 Running
 ```shell
-cldfbench splitmedia
+cldf splitmedia
 ```
 on a dataset will split all media files with sizes bigger than a configurable threshold into multiple
 files, just like [UNIX' split command](https://en.wikipedia.org/wiki/Split_(Unix)) would.
@@ -170,7 +178,7 @@ A file named `audio.wav` will be split into files `audio.wav.aa`, `audio.wav.ab
 In order to restore the files, the corresponding command
 
 ```shell
-cldfbench catmedia
+cldf catmedia
 ```
 can be used.
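The summary printed by `cldf stats` can also be computed programmatically, via the `Dataset.stats` method whose signature this patch touches below. A minimal sketch, assuming the test data path from the README example above (any local path or URL of a CLDF metadata file should work the same way):

```python
from pycldf import Dataset

# Path as assumed in the README example above.
ds = Dataset.from_metadata('tests/data/wordlist_with_cognates/metadata.json')
# stats() yields (path, type, row count) triples, one per table plus sources.
for path, type_, rows in ds.stats():
    print(path, type_, rows)
```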
diff --git a/src/pycldf/dataset.py b/src/pycldf/dataset.py
index e06ce5b..f0aa3f1 100644
--- a/src/pycldf/dataset.py
+++ b/src/pycldf/dataset.py
@@ -36,6 +36,7 @@ ORM_CLASSES = {cls.component_name(): cls for cls in orm.Object.__subclasses__()}
 
 TableType = typing.Union[str, Table]
 ColType = typing.Union[str, Column]
+ColSpecType = typing.Union[str, dict, Column]
 PathType = typing.Union[str, pathlib.Path]
 TableSpecType = typing.Union[str, Link, Table]
 ColSPecType = typing.Union[str, Column]
@@ -96,7 +97,21 @@ def get_modules() -> typing.List[Module]:
     return _modules
 
 
-def make_column(spec: typing.Union[str, dict, Column]) -> Column:
+def make_column(spec: ColSpecType) -> Column:
+    """
+    Create a `Column` instance from `spec`.
+
+    .. code-block:: python
+
+        >>> make_column('id').name
+        'id'
+        >>> make_column('http://cldf.clld.org/v1.0/terms.rdf#id').name
+        'ID'
+        >>> make_column({'name': 'col', 'datatype': 'boolean'}).datatype.base
+        'boolean'
+        >>> type(make_column(make_column('id')))
+        <class 'csvw.metadata.Column'>
+    """
     if isinstance(spec, str):
         if spec in TERMS.by_uri:
             return TERMS.by_uri[spec].to_column()
@@ -109,7 +124,15 @@
 
 
 class GitRepository:
-    def __init__(self, url, clone=None, version=None, **dc):
+    """
+    CLDF datasets are often created from data curated in git repositories. If this is the case, we
+    exploit this to provide better provenance information in the dataset's metadata.
+    """
+    def __init__(self,
+                 url: str,
+                 clone: typing.Optional[typing.Union[str, pathlib.Path]] = None,
+                 version: typing.Optional[str] = None,
+                 **dc):
         # We remove credentials from the URL immediately to make sure this isn't leaked into
         # CLDF metadata. Such credentials might be present in URLs read via gitpython from
         # remotes.
@@ -118,7 +141,7 @@ def __init__(self, url, clone=None, version=None, **dc):
         self.version = version
         self.dc = dc
 
-    def json_ld(self):
+    def json_ld(self) -> typing.Dict[str, str]:
         res = collections.OrderedDict([
             ('rdf:about', self.url),
             ('rdf:type', 'prov:Entity'),
@@ -152,7 +175,7 @@ def __init__(self, tablegroup: csvw.TableGroup):
         self._objects_by_pk = collections.defaultdict(collections.OrderedDict)
 
     @property
-    def sources(self):
+    def sources(self) -> Sources:
         # We load sources only the first time they are accessed, because for datasets like
         # Glottolog - with 40MB zipped BibTeX - this may take ~90secs.
         if self._sources is None:
@@ -160,7 +183,7 @@
         return self._sources
 
     @sources.setter
-    def sources(self, obj):
+    def sources(self, obj: Sources):
         if not isinstance(obj, Sources):
             raise TypeError('Invalid type for Dataset.sources')
         self._sources = obj
@@ -284,7 +307,7 @@ def module(self) -> str:
     def version(self) -> str:
         return self.properties['dc:conformsTo'].split('/')[3]
 
-    def __repr__(self):
+    def __repr__(self) -> str:
         return '<cldf:%s:%s at %s>' % (self.version, self.module, self.directory)
 
     @property
@@ -536,7 +559,7 @@ def to_json(obj):
                     v = old
             self.tablegroup.common_props[k] = v
 
-    def add_table(self, url: str, *cols, **kw) -> csvw.Table:
+    def add_table(self, url: str, *cols: ColSpecType, **kw) -> csvw.Table:
         """
         Add a table description to the Dataset.
 
@@ -566,7 +589,10 @@ def remove_table(self, table: TableType):
         # Now remove the table:
         self.tablegroup.tables = [t for t in self.tablegroup.tables if t.url != table.url]
 
-    def add_component(self, component: typing.Union[str, dict], *cols, **kw) -> csvw.Table:
+    def add_component(self,
+                      component: typing.Union[str, dict],
+                      *cols: ColSpecType,
+                      **kw) -> csvw.Table:
         """
         Add a CLDF component to a dataset.
 
@@ -607,7 +633,7 @@ def add_component(self, component: typing.Union[str, dict], *cols, **kw) -> csvw
         self.auto_constraints(component)
         return component
 
-    def add_columns(self, table: TableType, *cols) -> None:
+    def add_columns(self, table: TableType, *cols: ColSpecType) -> None:
         """
         Add columns specified by `cols` to the table specified by `table`.
         """
@@ -624,7 +650,7 @@ def add_columns(self, table: TableType, *cols) -> None:
             table.tableSchema.columns.append(col)
         self.auto_constraints()
 
-    def remove_columns(self, table: TableType, *cols):
+    def remove_columns(self, table: TableType, *cols: str):
         """
         Remove `cols` from `table`'s schema.
 
@@ -781,7 +807,7 @@ def add_sources(self, *sources, **kw):
     #
     # Methods to read data
     #
-    def iter_rows(self, table: TableType, *cols) -> typing.Iterator[dict]:
+    def iter_rows(self, table: TableType, *cols: str) -> typing.Generator[dict, None, None]:
         """
         Iterate rows in a table, resolving CLDF property names to local column names.
 
@@ -1116,7 +1142,7 @@
 
         return success
 
-    def stats(self, exact=False) -> typing.List[typing.Tuple[str, str, int]]:
+    def stats(self, exact: bool = False) -> typing.List[typing.Tuple[str, str, int]]:
         """
         Compute summary statistics for the dataset.
 
diff --git a/src/pycldf/media.py b/src/pycldf/media.py
index ddefc01..a90bef4 100644
--- a/src/pycldf/media.py
+++ b/src/pycldf/media.py
@@ -91,7 +91,7 @@ def from_dataset(
             MediaTable(ds),
             row_or_object.data if isinstance(row_or_object, orm.Media) else row_or_object)
 
-    def __getitem__(self, item):
+    def __getitem__(self, item) -> dict:
         """
         Access to the underlying row `dict`.
         """
@@ -223,7 +223,10 @@ def __iter__(self) -> typing.Generator[File, None, None]:
         for row in self.table:
             yield File(self, row)
 
-    def split(self, chunksize):
+    def split(self, chunksize: int) -> int:
+        """
+        :return: The number of media files that have been split.
+        """
         res = 0
         for file in self:
             p = file.local_path()
@@ -235,6 +238,9 @@
         return res
 
     def cat(self):
+        """
+        :return: The number of media files that have been re-assembled from chunks.
+        """
         res = 0
         for file in self:
             p = file.local_path()
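The `split`/`cat` methods above can also be called directly from Python, without going through the CLI. A rough sketch, where the metadata path and the 10 MB threshold are illustrative assumptions (it presumes the dataset's media files exist at their local paths):

```python
from pycldf import Dataset
from pycldf.media import MediaTable

# Hypothetical metadata path for a dataset with a MediaTable.
ds = Dataset.from_metadata('mydataset/Generic-metadata.json')
media = MediaTable(ds)

n = media.split(10_000_000)  # split every local media file larger than ~10 MB
print(f'{n} files split')
# ... later, e.g. after cloning the repository elsewhere ...
print(f'{media.cat()} files re-assembled')
```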
diff --git a/src/pycldf/util.py b/src/pycldf/util.py
index 01bc4a5..c15626b 100644
--- a/src/pycldf/util.py
+++ b/src/pycldf/util.py
@@ -17,7 +17,16 @@
     'splitfile', 'catfile']
 
 
-def splitfile(p, chunksize, total):
+def splitfile(p, chunksize: int, total: typing.Optional[int] = None) -> typing.List[pathlib.Path]:
+    """
+    :param p: Path of the file to split.
+    :param chunksize: The maximal size of the chunks the file will be split into.
+    :param total: The size of the input file; computed from `p` if not given.
+    :return: The list of paths of the files that the input has been split into.
+    """
+    total = total or p.stat().st_size
+    if total <= chunksize:  # Nothing to do.
+        return [p]
     nchunks = math.ceil(total / chunksize)
     suffix_length = 2 if nchunks < len(string.ascii_lowercase)**2 else 3
     suffixes = [
@@ -37,7 +46,15 @@
         return res
 
 
-def catfile(p):
+def catfile(p: pathlib.Path) -> bool:
+    """
+    Restore a file that has been split into chunks.
+
+    We determine whether a file has been split by looking for files in the parent directory with
+    suffixes as created by `splitfile`.
+    """
+    if p.exists():  # Nothing to do.
+        return False
     # Check whether the file has been split.
     suffixes = {pp.suffix: pp for pp in p.parent.iterdir() if pp.stem == p.name}
     if {'.aa', '.ab'}.issubset(suffixes) or {'.aaa', '.aab'}.issubset(suffixes):
diff --git a/tests/test_util.py b/tests/test_util.py
index 8321d99..965350d 100644
--- a/tests/test_util.py
+++ b/tests/test_util.py
@@ -52,3 +52,18 @@ def test_metadata2markdown(tmp_path):
     tmp_path.joinpath('languages.csv.zip').unlink()
     md = metadata2markdown(ds, tmp_path / 'Generic-metadata.json')
     assert 'languages.csv.zip' not in md, "Don't link non-existing files"
+
+
+def test_split_and_cat(tmp_path):
+    p = tmp_path / 'testfile'
+    text = 'This is the test content'
+    p.write_text(text)
+    res = splitfile(p, 5)
+    assert not p.exists()
+    assert len(res) == 5
+    assert catfile(p)
+    assert p.exists()
+    assert p.read_text() == text
+
+    assert splitfile(p, 1000) == [p]
+    assert not catfile(p)
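The new test doubles as documentation for the two utilities. Essentially the same flow, written out as a standalone sketch (file name and content are arbitrary):

```python
import pathlib

from pycldf.util import splitfile, catfile

p = pathlib.Path('example.txt')  # hypothetical input file
p.write_text('This is the test content', encoding='utf8')

chunks = splitfile(p, chunksize=5)  # 24 bytes in 5-byte chunks -> 5 chunk files
assert len(chunks) == 5 and not p.exists()
assert catfile(p)                   # finds the .aa/.ab/... chunks and restores p
assert p.read_text(encoding='utf8') == 'This is the test content'

assert splitfile(p, chunksize=1000) == [p]  # files <= chunksize are left alone
assert not catfile(p)                       # nothing to restore
```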