From 8f84a64901758644e3140ec48cc7436d89ab5e68 Mon Sep 17 00:00:00 2001
From: Robert Forkel
Date: Sat, 15 Feb 2025 16:13:31 +0100
Subject: [PATCH] docs

---
 CHANGELOG.md          |  1 +
 README.md             | 26 ++++++++++++++--------
 src/pycldf/dataset.py | 50 ++++++++++++++++++++++++++++++++-----------
 src/pycldf/media.py   | 10 +++++++--
 src/pycldf/util.py    | 21 ++++++++++++++++--
 tests/test_util.py    | 15 +++++++++++++
 6 files changed, 98 insertions(+), 25 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index b302ec4..524af60 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,6 +6,7 @@ The `pycldf` package adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
 ## Unreleased
 
 - Added a utility function to query SQLite DBs using user-defined functions, aggregates or collations.
+- Fixed a bug whereby validation of remote datasets specified by the URL of the metadata file did not work.
 
 ## [1.40.4] - 2025-01-15
 
diff --git a/README.md b/README.md
index aaf35e3..82c6057 100644
--- a/README.md
+++ b/README.md
@@ -69,13 +69,21 @@ sources.bib Sources 2000
 ### Summary statistics
 
 ```shell
-$ cldf stats mydataset/Wordlist-metadata.json
-
-
-Path                   Type        Rows
----------------------  ----------  ------
-forms.csv              Form Table       1
-mydataset/sources.bib  Sources          1
+$ cldf stats tests/data/wordlist_with_cognates/metadata.json
+
+               value
+-------------  --------------------------------------------
+dc:conformsTo  http://cldf.clld.org/v1.0/terms.rdf#Wordlist
+dc:source      sources.bib
+
+                 Type             Rows
+---------------  ---------------  ------
+languages.csv    LanguageTable         2
+parameters.csv   ParameterTable        2
+forms.csv        FormTable             3
+cognates.csv     CognateTable          2
+cognatesets.csv  CognatesetTable       1
+sources.bib      Sources               1
 ```
 
 
@@ -159,7 +167,7 @@ provides a pragmatic solution as follows:
 
 Running
 ```shell
-cldfbench splitmedia
+cldf splitmedia
 ```
 on a dataset will split all media files with sizes bigger than a configurable threshold into multiple
 files, just like [UNIX' split command](https://en.wikipedia.org/wiki/Split_(Unix)) would.
@@ -170,7 +178,7 @@ A file named `audio.wav` will be split into files `audio.wav.aa`, `audio.wav.ab
 In order to restore the files, the corresponding command
 
 ```shell
-cldfbench catmedia
+cldf catmedia
 ```
 can be used.
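The summary printed by `cldf stats` can also be computed programmatically, via the `Dataset.stats` method whose signature this patch touches below. A minimal sketch, assuming the test data path from the README example above (any local path or URL of a CLDF metadata file should work the same way):

```python
from pycldf import Dataset

# Path as assumed in the README example above.
ds = Dataset.from_metadata('tests/data/wordlist_with_cognates/metadata.json')
# stats() yields (path, type, row count) triples, one per table plus sources.
for path, type_, rows in ds.stats():
    print(path, type_, rows)
```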
diff --git a/src/pycldf/dataset.py b/src/pycldf/dataset.py
index e06ce5b..f0aa3f1 100644
--- a/src/pycldf/dataset.py
+++ b/src/pycldf/dataset.py
@@ -36,6 +36,7 @@ ORM_CLASSES = {cls.component_name(): cls for cls in orm.Object.__subclasses__()}
 
 TableType = typing.Union[str, Table]
 ColType = typing.Union[str, Column]
+ColSpecType = typing.Union[str, dict, Column]
 PathType = typing.Union[str, pathlib.Path]
 TableSpecType = typing.Union[str, Link, Table]
 ColSPecType = typing.Union[str, Column]
@@ -96,7 +97,21 @@ def get_modules() -> typing.List[Module]:
     return _modules
 
 
-def make_column(spec: typing.Union[str, dict, Column]) -> Column:
+def make_column(spec: ColSpecType) -> Column:
+    """
+    Create a `Column` instance from `spec`.
+
+    .. code-block:: python
+
+        >>> make_column('id').name
+        'id'
+        >>> make_column('http://cldf.clld.org/v1.0/terms.rdf#id').name
+        'ID'
+        >>> make_column({'name': 'col', 'datatype': 'boolean'}).datatype.base
+        'boolean'
+        >>> type(make_column(make_column('id')))
+        <class 'csvw.metadata.Column'>
+    """
     if isinstance(spec, str):
         if spec in TERMS.by_uri:
             return TERMS.by_uri[spec].to_column()
@@ -109,7 +124,15 @@
 
 
 class GitRepository:
-    def __init__(self, url, clone=None, version=None, **dc):
+    """
+    CLDF datasets are often created from data curated in git repositories. If this is the case, we
+    exploit this to provide better provenance information in the dataset's metadata.
+    """
+    def __init__(self,
+                 url: str,
+                 clone: typing.Optional[typing.Union[str, pathlib.Path]] = None,
+                 version: typing.Optional[str] = None,
+                 **dc):
         # We remove credentials from the URL immediately to make sure this isn't leaked into
         # CLDF metadata. Such credentials might be present in URLs read via gitpython from
         # remotes.
@@ -118,7 +141,7 @@ def __init__(self, url, clone=None, version=None, **dc):
         self.version = version
         self.dc = dc
 
-    def json_ld(self):
+    def json_ld(self) -> typing.Dict[str, str]:
         res = collections.OrderedDict([
             ('rdf:about', self.url),
             ('rdf:type', 'prov:Entity'),
@@ -152,7 +175,7 @@ def __init__(self, tablegroup: csvw.TableGroup):
         self._objects_by_pk = collections.defaultdict(collections.OrderedDict)
 
     @property
-    def sources(self):
+    def sources(self) -> Sources:
         # We load sources only the first time they are accessed, because for datasets like
         # Glottolog - with 40MB zipped BibTeX - this may take ~90secs.
         if self._sources is None:
@@ -160,7 +183,7 @@
         return self._sources
 
     @sources.setter
-    def sources(self, obj):
+    def sources(self, obj: Sources):
         if not isinstance(obj, Sources):
             raise TypeError('Invalid type for Dataset.sources')
         self._sources = obj
@@ -284,7 +307,7 @@ def module(self) -> str:
     def version(self) -> str:
         return self.properties['dc:conformsTo'].split('/')[3]
 
-    def __repr__(self):
+    def __repr__(self) -> str:
         return '<cldf:%s:%s at %s>' % (self.version, self.module, self.directory)
 
     @property
@@ -536,7 +559,7 @@ def to_json(obj):
                     v = old
             self.tablegroup.common_props[k] = v
 
-    def add_table(self, url: str, *cols, **kw) -> csvw.Table:
+    def add_table(self, url: str, *cols: ColSpecType, **kw) -> csvw.Table:
         """
         Add a table description to the Dataset.
 
@@ -566,7 +589,10 @@ def remove_table(self, table: TableType):
         # Now remove the table:
         self.tablegroup.tables = [t for t in self.tablegroup.tables if t.url != table.url]
 
-    def add_component(self, component: typing.Union[str, dict], *cols, **kw) -> csvw.Table:
+    def add_component(self,
+                      component: typing.Union[str, dict],
+                      *cols: ColSpecType,
+                      **kw) -> csvw.Table:
         """
         Add a CLDF component to a dataset.
 
@@ -607,7 +633,7 @@ def add_component(self, component: typing.Union[str, dict], *cols, **kw) -> csvw
         self.auto_constraints(component)
         return component
 
-    def add_columns(self, table: TableType, *cols) -> None:
+    def add_columns(self, table: TableType, *cols: ColSpecType) -> None:
         """
         Add columns specified by `cols` to the table specified by `table`.
         """
@@ -624,7 +650,7 @@ def add_columns(self, table: TableType, *cols) -> None:
             table.tableSchema.columns.append(col)
         self.auto_constraints()
 
-    def remove_columns(self, table: TableType, *cols):
+    def remove_columns(self, table: TableType, *cols: str):
         """
         Remove `cols` from `table`'s schema.
 
@@ -781,7 +807,7 @@ def add_sources(self, *sources, **kw):
     #
     # Methods to read data
     #
-    def iter_rows(self, table: TableType, *cols) -> typing.Iterator[dict]:
+    def iter_rows(self, table: TableType, *cols: str) -> typing.Generator[dict, None, None]:
         """
         Iterate rows in a table, resolving CLDF property names to local column names.
 
@@ -1116,7 +1142,7 @@
 
         return success
 
-    def stats(self, exact=False) -> typing.List[typing.Tuple[str, str, int]]:
+    def stats(self, exact: bool = False) -> typing.List[typing.Tuple[str, str, int]]:
         """
         Compute summary statistics for the dataset.
 
diff --git a/src/pycldf/media.py b/src/pycldf/media.py
index ddefc01..a90bef4 100644
--- a/src/pycldf/media.py
+++ b/src/pycldf/media.py
@@ -91,7 +91,7 @@ def from_dataset(
             MediaTable(ds),
             row_or_object.data if isinstance(row_or_object, orm.Media) else row_or_object)
 
-    def __getitem__(self, item):
+    def __getitem__(self, item) -> dict:
         """
         Access to the underlying row `dict`.
         """
@@ -223,7 +223,10 @@ def __iter__(self) -> typing.Generator[File, None, None]:
         for row in self.table:
             yield File(self, row)
 
-    def split(self, chunksize):
+    def split(self, chunksize: int) -> int:
+        """
+        :return: The number of media files that have been split.
+        """
         res = 0
         for file in self:
             p = file.local_path()
@@ -235,6 +238,9 @@
         return res
 
     def cat(self):
+        """
+        :return: The number of media files that have been re-assembled from chunks.
+        """
         res = 0
         for file in self:
             p = file.local_path()
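The `split`/`cat` methods above can also be called directly from Python, without going through the CLI. A rough sketch, where the metadata path and the 10 MB threshold are illustrative assumptions (it presumes the dataset's media files exist at their local paths):

```python
from pycldf import Dataset
from pycldf.media import MediaTable

# Hypothetical metadata path for a dataset with a MediaTable.
ds = Dataset.from_metadata('mydataset/Generic-metadata.json')
media = MediaTable(ds)

n = media.split(10_000_000)  # split every local media file larger than ~10 MB
print(f'{n} files split')
# ... later, e.g. after cloning the repository elsewhere ...
print(f'{media.cat()} files re-assembled')
```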
diff --git a/src/pycldf/util.py b/src/pycldf/util.py
index 01bc4a5..c15626b 100644
--- a/src/pycldf/util.py
+++ b/src/pycldf/util.py
@@ -17,7 +17,16 @@
     'splitfile', 'catfile']
 
 
-def splitfile(p, chunksize, total):
+def splitfile(p, chunksize: int, total: typing.Optional[int] = None) -> typing.List[pathlib.Path]:
+    """
+    :param p: Path of the file to split.
+    :param chunksize: The maximal size of the chunks the file will be split into.
+    :param total: The size of the input file; computed from `p` if not given.
+    :return: The list of paths of the files that the input has been split into.
+    """
+    total = total or p.stat().st_size
+    if total <= chunksize:  # Nothing to do.
+        return [p]
     nchunks = math.ceil(total / chunksize)
     suffix_length = 2 if nchunks < len(string.ascii_lowercase)**2 else 3
     suffixes = [
@@ -37,7 +46,15 @@
         return res
 
 
-def catfile(p):
+def catfile(p: pathlib.Path) -> bool:
+    """
+    Restore a file that has been split into chunks.
+
+    We determine whether a file has been split by looking for files in the parent directory with
+    suffixes as created by `splitfile`.
+    """
+    if p.exists():  # Nothing to do.
+        return False
     # Check whether the file has been split.
     suffixes = {pp.suffix: pp for pp in p.parent.iterdir() if pp.stem == p.name}
     if {'.aa', '.ab'}.issubset(suffixes) or {'.aaa', '.aab'}.issubset(suffixes):
diff --git a/tests/test_util.py b/tests/test_util.py
index 8321d99..965350d 100644
--- a/tests/test_util.py
+++ b/tests/test_util.py
@@ -52,3 +52,18 @@ def test_metadata2markdown(tmp_path):
     tmp_path.joinpath('languages.csv.zip').unlink()
     md = metadata2markdown(ds, tmp_path / 'Generic-metadata.json')
     assert 'languages.csv.zip' not in md, "Don't link non-existing files"
+
+
+def test_split_and_cat(tmp_path):
+    p = tmp_path / 'testfile'
+    text = 'This is the test content'
+    p.write_text(text)
+    res = splitfile(p, 5)
+    assert not p.exists()
+    assert len(res) == 5
+    assert catfile(p)
+    assert p.exists()
+    assert p.read_text() == text
+
+    assert splitfile(p, 1000) == [p]
+    assert not catfile(p)
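The new test doubles as documentation for the two utilities. Essentially the same flow, written out as a standalone sketch (file name and content are arbitrary):

```python
import pathlib

from pycldf.util import splitfile, catfile

p = pathlib.Path('example.txt')  # hypothetical input file
p.write_text('This is the test content', encoding='utf8')

chunks = splitfile(p, chunksize=5)  # 24 bytes in 5-byte chunks -> 5 chunk files
assert len(chunks) == 5 and not p.exists()
assert catfile(p)                   # finds the .aa/.ab/... chunks and restores p
assert p.read_text(encoding='utf8') == 'This is the test content'

assert splitfile(p, chunksize=1000) == [p]  # files <= chunksize are left alone
assert not catfile(p)                       # nothing to restore
```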