Skip to content

Commit

Permalink
Allow more flexible access to CLDF data on GitHub
Browse files Browse the repository at this point in the history
  • Loading branch information
xrotwang committed Oct 25, 2024
1 parent 9a1adb1 commit 67038f6
Show file tree
Hide file tree
Showing 5 changed files with 50 additions and 0 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,11 @@
The `pycldf` package adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).


## unreleased

- Support certain GitHub URLs as dataset locators.


## [1.39.0] - 2024-09-09

- Added option to `downloadmedia` subcommand to customize file naming.
Expand Down
1 change: 1 addition & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ console_scripts =
pycldf_dataset_resolver =
local = pycldf.ext.discovery:LocalResolver
generic_url = pycldf.ext.discovery:GenericUrlResolver
github = pycldf.ext.discovery:GitHubResolver

[options.extras_require]
catalogs =
Expand Down
28 changes: 28 additions & 0 deletions src/pycldf/ext/discovery.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,14 @@
- The `cldfzenodo <https://pypi.org/project/cldfzenodo>`_ package (>=1.0) provides a dataset
resolver for DOI URLs pointing to the Zenodo archive.
"""
import re
import typing
import pathlib
import zipfile
import warnings
import functools
import urllib.parse
import urllib.request
from importlib.metadata import entry_points

from csvw.utils import is_url
Expand Down Expand Up @@ -88,6 +91,31 @@ def __call__(self, loc, download_dir):
pass


class GitHubResolver(DatasetResolver):
    """
    Resolves dataset locators of the form "https://github.com/<org>/<repos>/tree/<tag>", e.g.
    https://github.com/cldf-datasets/petersonsouthasia/tree/v1.1
    or
    https://github.com/cldf-datasets/petersonsouthasia/releases/tag/v1.1
    """
    priority = 3

    def __call__(self, loc, download_dir):
        """
        Download and extract the release archive for a matching GitHub URL.

        :param loc: Dataset locator, a URL string.
        :param download_dir: `pathlib.Path` of the directory to download into.
        :return: Path of the extracted archive directory, or `None` (implicitly) \
        if `loc` is not a matching GitHub URL.
        """
        url = urllib.parse.urlparse(loc)
        # Match any github.com URL whose path ends in a version-like segment,
        # e.g. ".../tree/v1.1", ".../releases/tag/v1.1" or ".../<repos>/v1.1".
        if url.netloc == 'github.com' and re.search(r'/[v\.0-9]+$', url.path):
            # url.path starts with "/", so comps == ['', <org>, <repos>, ..., <tag>].
            comps = url.path.split('/')
            z = download_dir / '{}-{}-{}.zip'.format(comps[1], comps[2], comps[-1])
            url = "https://github.com/{}/{}/archive/refs/tags/{}.zip".format(
                comps[1], comps[2], comps[-1])
            urllib.request.urlretrieve(url, z)
            # Open the archive in a context manager so the file handle is
            # closed before we delete the zip file below (required on Windows).
            with zipfile.ZipFile(z) as zf:
                # GitHub release archives contain a single top-level directory
                # (conventionally "<repos>-<tag>").
                dirs = {info.filename.split('/')[0] for info in zf.infolist()}
                assert len(dirs) == 1
                zf.extractall(download_dir)
            z.unlink()
            return download_dir / dirs.pop()


class DatasetLocator(str):
@functools.cached_property
def parsed_url(self) -> urllib.parse.ParseResult:
Expand Down
Binary file added tests/data/petersonsouthasia-1.1.zip
Binary file not shown.
16 changes: 16 additions & 0 deletions tests/test_ext_discovery.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
import shutil
import urllib.parse

import pytest

from pycldf import Dataset
Expand All @@ -9,6 +12,19 @@ def test_get_dataset_local(data, tmp_path):
assert get_dataset('structuredataset_with_examples', tmp_path, base=data)


def test_get_dataset_github(data, tmp_path, mocker):
    # Stand-in for urllib.request.urlretrieve: verify the resolver computed a
    # github.com release-archive URL, then serve a local fixture zip instead
    # of hitting the network.
    def fake_retrieve(url, dest):
        parsed = urllib.parse.urlparse(url)
        assert parsed.netloc == 'github.com'
        assert parsed.path.startswith('/cldf-datasets/petersonsouthasia')
        shutil.copy(data / 'petersonsouthasia-1.1.zip', dest)

    mocker.patch('pycldf.ext.discovery.urllib.request.urlretrieve', fake_retrieve)
    ds = get_dataset('https://github.com/cldf-datasets/petersonsouthasia/v1.1', tmp_path)
    title = ds.properties["dc:title"]
    assert title == "Towards a linguistic prehistory of eastern-central South Asia"


def test_get_dataset_url(structuredataset_with_examples, tmp_path, mocker):
class DummyDataset(Dataset):
@classmethod
Expand Down

0 comments on commit 67038f6

Please sign in to comment.