docs
xrotwang committed Feb 15, 2025
1 parent 01b1d1a commit 8f84a64
Showing 6 changed files with 98 additions and 25 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -6,6 +6,7 @@ The `pycldf` package adheres to [Semantic Versioning](http://semver.org/spec/v2.
## Unreleased

- Added a utility function to query SQLite DBs using user-defined functions, aggregates or collations (see the sketch below).
- Fixed a bug whereby validation of remote datasets specified by the URL of their metadata file did not work.
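
A minimal sketch of the kind of query the new utility enables, written against the stdlib `sqlite3` API because the exact name and signature of the new pycldf helper are not shown in this commit; the database path, function name and table/column names are assumptions based on typical `cldf createdb` output:

```python
import sqlite3

# Assumed: a SQLite dump of a CLDF dataset, e.g. created with `cldf createdb`.
conn = sqlite3.connect('wordlist.sqlite')

# Register a user-defined SQL function so it can be used in queries:
conn.create_function('strrev', 1, lambda s: s[::-1] if s else s)

# Table and column names below follow pycldf's usual SQLite naming (assumed here):
for form, reverse in conn.execute("SELECT cldf_form, strrev(cldf_form) FROM FormTable LIMIT 5"):
    print(form, reverse)
```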


## [1.40.4] - 2025-01-15
26 changes: 17 additions & 9 deletions README.md
@@ -69,13 +69,21 @@ sources.bib Sources 2000
### Summary statistics

```shell
$ cldf stats mydataset/Wordlist-metadata.json
<cldf:v1.0:Wordlist at mydataset>

Path Type Rows
--------------------- ---------- ------
forms.csv Form Table 1
mydataset/sources.bib Sources 1
$ cldf stats tests/data/wordlist_with_cognates/metadata.json
<cldf:v1.0:Wordlist at tests/data/wordlist_with_cognates>
value
------------- --------------------------------------------
dc:conformsTo http://cldf.clld.org/v1.0/terms.rdf#Wordlist
dc:source sources.bib

Type Rows
--------------- --------------- ------
languages.csv LanguageTable 2
parameters.csv ParameterTable 2
forms.csv FormTable 3
cognates.csv CognateTable 2
cognatesets.csv CognatesetTable 1
sources.bib Sources 1
```
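
The same numbers are available programmatically; a rough sketch, assuming the test dataset path used above:

```python
from pycldf import Dataset

ds = Dataset.from_metadata('tests/data/wordlist_with_cognates/metadata.json')
# Dataset.stats() returns (path, component type, row count) triples,
# mirroring the table printed by `cldf stats` above.
for path, type_, rows in ds.stats():
    print(path, type_, rows)
```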


@@ -159,7 +167,7 @@ provides a pragmatic solution as follows:

Running
```shell
cldfbench splitmedia
cldf splitmedia <dataset-locator>
```
on a dataset will split all media files with sizes bigger than a configurable threshold into
multiple files, just like [UNIX' split command](https://en.wikipedia.org/wiki/Split_(Unix)) would.
@@ -170,7 +178,7 @@ A file named `audio.wav` will be split into files `audio.wav.aa`, `audio.wav.ab`
In order to restore the files, the corresponding command
```shell
cldfbench catmedia
cldf catmedia <dataset-locator>
```
can be used.
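
These commands build on helper functions in `pycldf.util`; a short sketch of the file-level helpers, with the file path and chunk size chosen for illustration:

```python
import pathlib
from pycldf.util import splitfile, catfile

p = pathlib.Path('audio.wav')  # assumed local media file

# Split into chunks of at most ~1 MB; the original file is replaced by
# audio.wav.aa, audio.wav.ab, ... (nothing happens if the file is small enough).
chunks = splitfile(p, chunksize=1_000_000)

# Later, re-assemble the original file from its chunks:
catfile(p)
assert p.exists()
```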

50 changes: 38 additions & 12 deletions src/pycldf/dataset.py
@@ -36,6 +36,7 @@
ORM_CLASSES = {cls.component_name(): cls for cls in orm.Object.__subclasses__()}
TableType = typing.Union[str, Table]
ColType = typing.Union[str, Column]
ColSpecType = typing.Union[str, dict, Column]
PathType = typing.Union[str, pathlib.Path]
TableSpecType = typing.Union[str, Link, Table]
ColSPecType = typing.Union[str, Column]
@@ -96,7 +97,21 @@ def get_modules() -> typing.List[Module]:
return _modules


def make_column(spec: typing.Union[str, dict, Column]) -> Column:
def make_column(spec: ColSpecType) -> Column:
"""
Create a `Column` instance from `spec`.
.. code-block:: python
>>> make_column('id').name
'id'
>>> make_column('http://cldf.clld.org/v1.0/terms.rdf#id').name
'ID'
>>> make_column({'name': 'col', 'datatype': 'boolean'}).datatype.base
'boolean'
>>> type(make_column(make_column('id')))
<class 'csvw.metadata.Column'>
"""
if isinstance(spec, str):
if spec in TERMS.by_uri:
return TERMS.by_uri[spec].to_column()
@@ -109,7 +124,15 @@ def make_column(spec: typing.Union[str, dict, Column]) -> Column:


class GitRepository:
def __init__(self, url, clone=None, version=None, **dc):
"""
CLDF datasets are often created from data curated in git repositories. If this is the case, we
exploit this to provide better provenance information in the dataset's metadata.
"""
def __init__(self,
url: str,
clone: typing.Optional[typing.Union[str, pathlib.Path]] = None,
version: typing.Optional[str] = None,
**dc):
# We remove credentials from the URL immediately to make sure this isn't leaked into
# CLDF metadata. Such credentials might be present in URLs read via gitpython from
# remotes.
@@ -118,7 +141,7 @@ def __init__(self, url, clone=None, version=None, **dc):
self.version = version
self.dc = dc

def json_ld(self):
def json_ld(self) -> typing.Dict[str, str]:
res = collections.OrderedDict([
('rdf:about', self.url),
('rdf:type', 'prov:Entity'),
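
A quick sketch of how `GitRepository` might be used; the repository URL and version are made up for illustration:

```python
from pycldf.dataset import GitRepository

repo = GitRepository(
    'https://github.com/example-org/example-data',  # hypothetical repository URL
    version='v1.0',
)
# json_ld() returns the provenance record that gets embedded in the CLDF metadata,
# e.g. {'rdf:about': 'https://github.com/example-org/example-data', 'rdf:type': 'prov:Entity', ...}
print(repo.json_ld())
```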
@@ -152,15 +175,15 @@ def __init__(self, tablegroup: csvw.TableGroup):
self._objects_by_pk = collections.defaultdict(collections.OrderedDict)

@property
def sources(self):
def sources(self) -> Sources:
# We load sources only the first time they are accessed, because for datasets like
# Glottolog - with 40MB zipped BibTeX - this may take ~90secs.
if self._sources is None:
self._sources = Sources.from_file(self.bibpath)
return self._sources

@sources.setter
def sources(self, obj):
def sources(self, obj: Sources):
if not isinstance(obj, Sources):
raise TypeError('Invalid type for Dataset.sources')
self._sources = obj
@@ -284,7 +307,7 @@ def module(self) -> str:
def version(self) -> str:
return self.properties['dc:conformsTo'].split('/')[3]

def __repr__(self):
def __repr__(self) -> str:
return '<cldf:%s:%s at %s>' % (self.version, self.module, self.directory)

@property
@@ -536,7 +559,7 @@ def to_json(obj):
v = old
self.tablegroup.common_props[k] = v

def add_table(self, url: str, *cols, **kw) -> csvw.Table:
def add_table(self, url: str, *cols: ColSpecType, **kw) -> csvw.Table:
"""
Add a table description to the Dataset.
@@ -566,7 +589,10 @@ def remove_table(self, table: TableType):
# Now remove the table:
self.tablegroup.tables = [t for t in self.tablegroup.tables if t.url != table.url]

def add_component(self, component: typing.Union[str, dict], *cols, **kw) -> csvw.Table:
def add_component(self,
component: typing.Union[str, dict],
*cols: ColSpecType,
**kw) -> csvw.Table:
"""
Add a CLDF component to a dataset.
@@ -607,7 +633,7 @@ def add_component(self, component: typing.Union[str, dict], *cols, **kw) -> csvw
self.auto_constraints(component)
return component

def add_columns(self, table: TableType, *cols) -> None:
def add_columns(self, table: TableType, *cols: ColSpecType) -> None:
"""
Add columns specified by `cols` to the table specified by `table`.
"""
@@ -624,7 +650,7 @@ def add_columns(self, table: TableType, *cols) -> None:
table.tableSchema.columns.append(col)
self.auto_constraints()

def remove_columns(self, table: TableType, *cols):
def remove_columns(self, table: TableType, *cols: str):
"""
Remove `cols` from `table`'s schema.
@@ -781,7 +807,7 @@ def add_sources(self, *sources, **kw):
#
# Methods to read data
#
def iter_rows(self, table: TableType, *cols) -> typing.Iterator[dict]:
def iter_rows(self, table: TableType, *cols: str) -> typing.Generator[dict, None, None]:
"""
Iterate rows in a table, resolving CLDF property names to local column names.
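
A short sketch of the typical call pattern, assuming `ds` is a `Wordlist` dataset as in the README example:

```python
# Passing CLDF property terms makes the corresponding values accessible under
# those terms, regardless of the local column names used by the dataset:
for row in ds.iter_rows('FormTable', 'id', 'languageReference', 'form'):
    print(row['id'], row['languageReference'], row['form'])
```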
@@ -1116,7 +1142,7 @@ def validate(

return success

def stats(self, exact=False) -> typing.List[typing.Tuple[str, str, int]]:
def stats(self, exact: bool = False) -> typing.List[typing.Tuple[str, str, int]]:
"""
Compute summary statistics for the dataset.
10 changes: 8 additions & 2 deletions src/pycldf/media.py
@@ -91,7 +91,7 @@ def from_dataset(
MediaTable(ds),
row_or_object.data if isinstance(row_or_object, orm.Media) else row_or_object)

def __getitem__(self, item):
def __getitem__(self, item) -> dict:
"""
Access to the underlying row `dict`.
"""
@@ -223,7 +226,10 @@ def __iter__(self) -> typing.Generator[File, None, None]:
for row in self.table:
yield File(self, row)

def split(self, chunksize):
def split(self, chunksize: int) -> int:
"""
:return: The number of media files that have been split.
"""
res = 0
for file in self:
p = file.local_path()
@@ -235,6 +238,9 @@ def split(self, chunksize):
return res

def cat(self):
"""
:return: The number of media files that have been re-assembled from chunks.
"""
res = 0
for file in self:
p = file.local_path()
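
A rough sketch of the corresponding programmatic use of `MediaTable`; the metadata path and chunk size are assumptions, and the media files are expected to be available locally:

```python
from pycldf import Dataset
from pycldf.media import MediaTable

ds = Dataset.from_metadata('Generic-metadata.json')  # assumed dataset with a media table
media = MediaTable(ds)

n_split = media.split(chunksize=10_000_000)   # split every local file larger than the threshold
n_joined = media.cat()                        # later: re-assemble the original files
```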
21 changes: 19 additions & 2 deletions src/pycldf/util.py
@@ -17,7 +17,16 @@
'splitfile', 'catfile']


def splitfile(p, chunksize, total):
def splitfile(p, chunksize: int, total: typing.Optional[int] = None) -> typing.List[pathlib.Path]:
"""
:param p: Path of the file to split.
:param chunksize: The maximal size of the chunks the file will be split into.
:param total: The size of the input file.
:return: The list of paths of files that the input has been split into.
"""
total = total or p.stat().st_size
if total <= chunksize: # Nothing to do.
return [p]
nchunks = math.ceil(total / chunksize)
suffix_length = 2 if nchunks < len(string.ascii_lowercase)**2 else 3
suffixes = [
@@ -37,7 +46,15 @@ def splitfile(p, chunksize, total):
return res


def catfile(p):
def catfile(p: pathlib.Path) -> bool:
"""
Restore a file that has been split into chunks.
We determine if a file has been split by looking for files in the parent directory with suffixes
as created by `splitfile`.
"""
if p.exists(): # Nothing to do.
return False
# Check whether the file has been split.
suffixes = {pp.suffix: pp for pp in p.parent.iterdir() if pp.stem == p.name}
if {'.aa', '.ab'}.issubset(suffixes) or {'.aaa', '.aab'}.issubset(suffixes):
15 changes: 15 additions & 0 deletions tests/test_util.py
@@ -52,3 +52,18 @@ def test_metadata2markdown(tmp_path):
tmp_path.joinpath('languages.csv.zip').unlink()
md = metadata2markdown(ds, tmp_path / 'Generic-metadata.json')
assert 'languages.csv.zip' not in md, "Don't link non-existing files"


def test_split_and_cat(tmp_path):
p = tmp_path / 'testfile'
text = 'This is the test content'
p.write_text(text)
res = splitfile(p, 5)
assert not p.exists()
assert len(res) == 5
assert catfile(p)
assert p.exists()
assert p.read_text() == text

assert splitfile(p, 1000) == [p]
assert not catfile(p)
