Skip to content

Commit

Permalink
Allow more flexible access to CLDF data on GitHub
Browse files Browse the repository at this point in the history
  • Loading branch information
xrotwang committed Oct 25, 2024
1 parent 9a1adb1 commit 67038f6
Show file tree
Hide file tree
Showing 5 changed files with 50 additions and 0 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,11 @@
The `pycldf` package adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).


## unreleased

- Support certain GitHub URLs as dataset locators.


## [1.39.0] - 2024-09-09

- Added option to `downloadmedia` subcommand to customize file naming.
Expand Down
1 change: 1 addition & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ console_scripts =
pycldf_dataset_resolver =
local = pycldf.ext.discovery:LocalResolver
generic_url = pycldf.ext.discovery:GenericUrlResolver
github = pycldf.ext.discovery:GitHubResolver

[options.extras_require]
catalogs =
Expand Down
28 changes: 28 additions & 0 deletions src/pycldf/ext/discovery.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,14 @@
- The `cldfzenodo <https://pypi.org/project/cldfzenodo>`_ package (>=1.0) provides a dataset
resolver for DOI URLs pointing to the Zenodo archive.
"""
import re
import typing
import pathlib
import zipfile
import warnings
import functools
import urllib.parse
import urllib.request
from importlib.metadata import entry_points

from csvw.utils import is_url
Expand Down Expand Up @@ -88,6 +91,31 @@ def __call__(self, loc, download_dir):
pass


class GitHubResolver(DatasetResolver):
    """
    Resolves dataset locators of the form "https://github.com/<org>/<repos>/tree/<tag>", e.g.
    https://github.com/cldf-datasets/petersonsouthasia/tree/v1.1
    or
    https://github.com/cldf-datasets/petersonsouthasia/releases/tag/v1.1
    """
    priority = 3

    def __call__(self, loc, download_dir):
        """
        Download and extract the release archive for a matching GitHub URL.

        :param loc: Dataset locator, a URL string.
        :param download_dir: `pathlib.Path` of the directory to download into.
        :return: Path of the extracted archive directory, or `None` (implicitly) \
        if `loc` is not a matching GitHub URL.
        """
        url = urllib.parse.urlparse(loc)
        # Match any github.com URL whose path ends in a version-like segment,
        # e.g. ".../tree/v1.1", ".../releases/tag/v1.1" or ".../<repos>/v1.1".
        if url.netloc == 'github.com' and re.search(r'/[v\.0-9]+$', url.path):
            # url.path starts with "/", so comps == ['', <org>, <repos>, ..., <tag>].
            comps = url.path.split('/')
            z = download_dir / '{}-{}-{}.zip'.format(comps[1], comps[2], comps[-1])
            url = "https://github.com/{}/{}/archive/refs/tags/{}.zip".format(
                comps[1], comps[2], comps[-1])
            urllib.request.urlretrieve(url, z)
            # Open the archive in a context manager so the file handle is
            # closed before we delete the zip file below (required on Windows).
            with zipfile.ZipFile(z) as zf:
                # GitHub release archives contain a single top-level directory
                # (conventionally "<repos>-<tag>").
                dirs = {info.filename.split('/')[0] for info in zf.infolist()}
                assert len(dirs) == 1
                zf.extractall(download_dir)
            z.unlink()
            return download_dir / dirs.pop()


class DatasetLocator(str):
@functools.cached_property
def parsed_url(self) -> urllib.parse.ParseResult:
Expand Down
Binary file added tests/data/petersonsouthasia-1.1.zip
Binary file not shown.
16 changes: 16 additions & 0 deletions tests/test_ext_discovery.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
import shutil
import urllib.parse

import pytest

from pycldf import Dataset
Expand All @@ -9,6 +12,19 @@ def test_get_dataset_local(data, tmp_path):
assert get_dataset('structuredataset_with_examples', tmp_path, base=data)


def test_get_dataset_github(data, tmp_path, mocker):
    # Stand-in for urllib.request.urlretrieve: verify the resolver computed a
    # github.com release-archive URL, then serve a local fixture zip instead
    # of hitting the network.
    def fake_retrieve(url, dest):
        parsed = urllib.parse.urlparse(url)
        assert parsed.netloc == 'github.com'
        assert parsed.path.startswith('/cldf-datasets/petersonsouthasia')
        shutil.copy(data / 'petersonsouthasia-1.1.zip', dest)

    mocker.patch('pycldf.ext.discovery.urllib.request.urlretrieve', fake_retrieve)
    ds = get_dataset('https://github.com/cldf-datasets/petersonsouthasia/v1.1', tmp_path)
    title = ds.properties["dc:title"]
    assert title == "Towards a linguistic prehistory of eastern-central South Asia"


def test_get_dataset_url(structuredataset_with_examples, tmp_path, mocker):
class DummyDataset(Dataset):
@classmethod
Expand Down

0 comments on commit 67038f6

Please sign in to comment.