diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0fe54bd..8c010d6 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,11 @@
 The `pycldf` package adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
 
 
+## unreleased
+
+- Support certain GitHub URLs as dataset locators.
+
+
 ## [1.39.0] - 2024-09-09
 
 - Added option to `downloadmedia` subcommand to customize file naming.
diff --git a/setup.cfg b/setup.cfg
index 81e344c..25e1893 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -67,6 +67,7 @@ console_scripts =
 pycldf_dataset_resolver =
     local = pycldf.ext.discovery:LocalResolver
     generic_url = pycldf.ext.discovery:GenericUrlResolver
+    github = pycldf.ext.discovery:GitHubResolver
 
 [options.extras_require]
 catalogs =
diff --git a/src/pycldf/ext/discovery.py b/src/pycldf/ext/discovery.py
index 93e81cd..93d6c07 100644
--- a/src/pycldf/ext/discovery.py
+++ b/src/pycldf/ext/discovery.py
@@ -15,11 +15,14 @@
 - The `cldfzenodo `_ package (>=1.0) provides a dataset resolver
   for DOI URLs pointing to the Zenodo archive.
 """
+import re
 import typing
 import pathlib
+import zipfile
 import warnings
 import functools
 import urllib.parse
+import urllib.request
 from importlib.metadata import entry_points
 
 from csvw.utils import is_url
@@ -88,6 +91,48 @@ def __call__(self, loc, download_dir):
         pass
 
 
+class GitHubResolver(DatasetResolver):
+    """
+    Resolves dataset locators of the form "https://github.com/<org>/<repo>/tree/<tag>", e.g.
+    https://github.com/cldf-datasets/petersonsouthasia/tree/v1.1
+    or
+    https://github.com/cldf-datasets/petersonsouthasia/releases/tag/v1.1
+
+    The zip archive of the tag is downloaded and unpacked into the download directory.
+    """
+    priority = 3
+
+    def __call__(self, loc, download_dir):
+        """
+        :param loc: Dataset locator.
+        :param download_dir: `pathlib.Path` of the directory to download and unpack to.
+        :return: Path of the unpacked directory, or `None` if `loc` is not a suitable GitHub URL.
+        :raises ValueError: If the archive does not contain exactly one top-level directory.
+        """
+        url = urllib.parse.urlparse(loc)
+        if url.netloc != 'github.com' or not re.search(r'/[v\.0-9]+$', url.path):
+            return None
+        comps = url.path.split('/')
+        # The path must provide at least "/<org>/<repo>/<tag>"; e.g. "/1.0" matches the regex
+        # above but would raise an IndexError below.
+        if len(comps) < 4 or not (comps[1] and comps[2]):
+            return None
+        org, repo, tag = comps[1], comps[2], comps[-1]
+        z = download_dir / '{}-{}-{}.zip'.format(org, repo, tag)
+        urllib.request.urlretrieve(
+            "https://github.com/{}/{}/archive/refs/tags/{}.zip".format(org, repo, tag), z)
+        try:
+            # Close the archive before deleting it - open files cannot be removed on Windows.
+            with zipfile.ZipFile(z) as zf:
+                dirs = {info.filename.split('/')[0] for info in zf.infolist()}
+                if len(dirs) != 1:  # Not an `assert`, which is stripped under `python -O`.
+                    raise ValueError('Unexpected archive layout: {}'.format(sorted(dirs)))
+                zf.extractall(download_dir)
+        finally:
+            z.unlink()
+        return download_dir / dirs.pop()
+
+
 class DatasetLocator(str):
     @functools.cached_property
     def parsed_url(self) -> urllib.parse.ParseResult:
diff --git a/tests/data/petersonsouthasia-1.1.zip b/tests/data/petersonsouthasia-1.1.zip
new file mode 100644
index 0000000..1ca14c5
Binary files /dev/null and b/tests/data/petersonsouthasia-1.1.zip differ
diff --git a/tests/test_ext_discovery.py b/tests/test_ext_discovery.py
index 3ea1b1d..ebe4404 100644
--- a/tests/test_ext_discovery.py
+++ b/tests/test_ext_discovery.py
@@ -1,3 +1,6 @@
+import shutil
+import urllib.parse
+
 import pytest
 
 from pycldf import Dataset
@@ -9,6 +12,19 @@ def test_get_dataset_local(data, tmp_path):
     assert get_dataset('structuredataset_with_examples', tmp_path, base=data)
 
 
+def test_get_dataset_github(data, tmp_path, mocker):
+    def urlretrieve(url, p):
+        url = urllib.parse.urlparse(url)
+        assert url.netloc == 'github.com'
+        assert url.path.startswith('/cldf-datasets/petersonsouthasia')
+        shutil.copy(data / 'petersonsouthasia-1.1.zip', p)
+
+    mocker.patch('pycldf.ext.discovery.urllib.request.urlretrieve', urlretrieve)
+    ds = get_dataset('https://github.com/cldf-datasets/petersonsouthasia/tree/v1.1', tmp_path)
+    assert (ds.properties["dc:title"] ==
+            "Towards a linguistic prehistory of eastern-central South Asia")
+
+
 def test_get_dataset_url(structuredataset_with_examples, tmp_path, mocker):
     class DummyDataset(Dataset):
         @classmethod