Team result collections (#69)
Enhances the `narps_open.data.results` module by making calls to the Neurovault API and allowing data rectification as done by the NARPS team.
Also adds a script for creating the NARPS results dataset using datalad.
bclenet authored Aug 24, 2023
1 parent 9034678 commit 06a6d42
Showing 26 changed files with 391 additions and 639 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/pipeline_tests.yml
@@ -70,7 +70,7 @@ jobs:
      - name: Execute tests with pytest
        run: |
          if [[ "${{ needs.identify-tests.outputs.tests }}" != "" ]]; then
            pytest -q -m "pipeline_test" ${{ needs.identify-tests.outputs.tests }}
            pytest -s -q -m "pipeline_test" ${{ needs.identify-tests.outputs.tests }}
          fi
      - name: Report results on GitHub
2 changes: 1 addition & 1 deletion .github/workflows/test_changes.yml
@@ -57,5 +57,5 @@ jobs:
      - name: Collect tests with pytest
        run: |
          if [[ "${{ needs.identify-tests.outputs.tests }}" != "" ]]; then
            pytest -q ${{ needs.identify-tests.outputs.tests }}
            pytest -s -q ${{ needs.identify-tests.outputs.tests }}
          fi
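Note: the change in both workflow files is the addition of pytest's `-s` flag, a shorthand for `--capture=no`; it lets the tests' standard output (for example download progress) stream to the CI log instead of being captured.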
34 changes: 26 additions & 8 deletions docs/data.md
@@ -48,31 +48,49 @@ aws s3 sync --no-sign-request s3://openneuro.org/ds001734 ds001734/

Stat maps from teams can be downloaded from [NeuroVault](https://www.neurovault.org) [(Gorgolewski & al, 2015)](https://www.frontiersin.org/articles/10.3389/fninf.2015.00008/full).

The `narps_open.utils.results` module will help you download these collections. Here is how to use it:
The `narps_open.data.results` module will help you download these collections. Note that it is also possible to rectify a collection, i.e. to pre-process its images in the same way the NARPS analysis team did during the study.

Here is how to use the module, either from Python code or from the command line:

```python
# In a python script
from narps_open.utils.results import download_all_result_collections, download_result_collection
from narps_open.data.results import ResultsCollectionFactory

# Either download all collections
download_all_result_collections()
# Create a collection factory
factory = ResultsCollectionFactory()

# Or select the ones you need
teams = ['2T6S', 'C88N', 'L1A8']
# Select the collections you need
teams = ['2T6S', 'C88N', 'L1A8'] # Alternatively, use the keys of narps_open.pipelines.implemented_pipelines to get all team IDs
for team in teams:
    download_result_collection(team)
    collection = factory.get_collection(team)
    collection.download() # Download the collection from Neurovault
    collection.rectify() # Create a rectified version of the images
```

```bash
# From the command line
$ python narps_open/data/results -h
usage: results [-h] (-t TEAMS [TEAMS ...] | -a) [-r]

Get Neurovault collection of results from NARPS teams.

options:
  -h, --help            show this help message and exit
  -t TEAMS [TEAMS ...], --teams TEAMS [TEAMS ...]
                        a list of team IDs
  -a, --all             download results from all teams
  -r, --rectify         rectify the results

# Either download all collections
python narps_open/data/results -a

# Or select the ones you need
python narps_open/data/results -t 2T6S C88N L1A8

# Download and rectify the collections
python narps_open/data/results -r -t 2T6S C88N L1A8
```

The collections are also available as a single downloadable release on Zenodo, [here](https://zenodo.org/record/3528329/).

Each team results collection is kept in the `orig` in folder organized using the pattern `<neurovault_collection_id>_<team_id>` (e.g.: `4881_2T6S` for the 2T6S team).
Each team's results collection is kept in the `data/results/orig` directory, in a folder named after the pattern `<neurovault_collection_id>_<team_id>` (e.g. `4881_2T6S` for team 2T6S).
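For instance, after downloading the 2T6S collection, a single map would be expected at a path like the one below. This is a sketch, assuming the base directory comes from `Configuration()['directories']['narps_results']` and that Neurovault names the image `hypo1_thresh`:

```python
from os.path import join

from narps_open.utils.configuration import Configuration

# Hypothetical example: Neurovault collection 4881 belongs to team 2T6S, and
# the image base name 'hypo1_thresh' is assumed from the team's collection.
results_dir = Configuration()['directories']['narps_results']
file_path = join(results_dir, 'orig', '4881_2T6S', 'hypo1_thresh.nii.gz')
print(file_path)
```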
131 changes: 60 additions & 71 deletions narps_open/data/results/__init__.py
@@ -5,102 +5,91 @@
to results from teams involved in NARPS
"""

from os import remove, makedirs
from os import makedirs
from os.path import join
from importlib import import_module
from json import loads
from zipfile import ZipFile
from urllib.request import urlretrieve
from argparse import ArgumentParser
from importlib_resources import files
from urllib.request import urlretrieve, urlopen

from narps_open.utils.configuration import Configuration
from narps_open.data.description import TeamDescription
from narps_open.pipelines import implemented_pipelines
from narps_open.utils import show_download_progress

class ResultsCollectionFactory():
    """ A factory class to instantiate ResultsCollection objects """
    collections = {
        '2T6S': 'ResultsCollection2T6S'
    }

    def get_collection(self, team_id):
        """ Return a ResultsCollection object or specialized child class if available """
        # Send the default ResultsCollection class
        if team_id not in ResultsCollectionFactory.collections:
            return ResultsCollection(team_id)

        # There is a specialized class for this team id
        collection_class = getattr(
            import_module(f'narps_open.data.results.team_{team_id}'),
            ResultsCollectionFactory.collections[team_id]
        )
        return collection_class()

class ResultsCollection():
    """ Represents a Neurovault collection corresponding
    to results from teams involved in NARPS.
    """

    def __init__(self, team_id: str):

        # Initialize attributes
        self.team_id = team_id
        description = TeamDescription(team_id = self.team_id)
        self.uid = description.general['NV_collection_link'].split('/')[-2]
        self.url = description.general['NV_collection_link'] + 'download'
        self.uid = self.get_uid()
        self.directory = join(
            Configuration()['directories']['narps_results'],
            'orig',
            self.uid + '_' + self.team_id
        )
        self.files = {
            'hypo1_thresh.nii.gz' : 'hypo1_thresh.nii.gz',
            'hypo1_unthresh.nii.gz' : 'hypo1_unthresh.nii.gz',
            'hypo2_thresh.nii.gz' : 'hypo2_thresh.nii.gz',
            'hypo2_unthresh.nii.gz' : 'hypo2_unthresh.nii.gz',
            'hypo3_thresh.nii.gz' : 'hypo3_thresh.nii.gz',
            'hypo3_unthresh.nii.gz' : 'hypo3_unthresh.nii.gz',
            'hypo4_thresh.nii.gz' : 'hypo4_thresh.nii.gz',
            'hypo4_unthresh.nii.gz' : 'hypo4_unthresh.nii.gz',
            'hypo5_thresh.nii.gz' : 'hypo5_thresh.nii.gz',
            'hypo5_unthresh.nii.gz' : 'hypo5_unthresh.nii.gz',
            'hypo6_thresh.nii.gz' : 'hypo6_thresh.nii.gz',
            'hypo6_unthresh.nii.gz' : 'hypo6_unthresh.nii.gz',
            'hypo7_thresh.nii.gz' : 'hypo7_thresh.nii.gz',
            'hypo7_unthresh.nii.gz' : 'hypo7_unthresh.nii.gz',
            'hypo8_thresh.nii.gz' : 'hypo8_thresh.nii.gz',
            'hypo8_unthresh.nii.gz' : 'hypo8_unthresh.nii.gz',
            'hypo9_thresh.nii.gz' : 'hypo9_thresh.nii.gz',
            'hypo9_unthresh.nii.gz' : 'hypo9_unthresh.nii.gz'
        }

        # Make correspondences between the names given by the
        # team in neurovault collections, and the expected names of the hypotheses files.
        if Configuration()['results']['neurovault_naming']:
            with open(
                join(files('narps_open.data.results'),'results.json'),
                'r', encoding = 'utf-8'
            ) as file:
                neurovault_files = loads(file.read())[self.team_id]

            if neurovault_files:
                self.files = neurovault_files
        self.files = self.get_file_urls()

    def get_uid(self):
        """ Return the uid of the collection by browsing the team description """
        return TeamDescription(team_id = self.team_id).general['NV_collection_link'].split('/')[-2]

    def get_file_urls(self):
        """ Return a dict containing the download url for each file of the collection.
        * dict key is the file base name (with no extension)
        * dict value is the download url for the file on Neurovault
        """

        # Get the images data from Neurovault's API
        collection_url = 'https://neurovault.org/api/collections/' + self.uid + '/images/'

        with urlopen(collection_url) as response:
            json = loads(response.read())
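        # Note (assumption): Neurovault's API paginates its responses; the NARPS
        # collections are small enough that a single page should contain all
        # images, so the response's 'next' field is not followed here.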

        file_urls = {}
        for result in json['results']:
            # Get data for a file in the collection
            file_urls[result['name']] = result['file']

        return file_urls

    def download(self):
        """ Download the collection, unzip it and remove zip file. """
        """ Download the collection, file by file. """

        # Create download directory if not existing
        makedirs(self.directory, exist_ok = True)

        # Download dataset
        print('Collecting results for team', self.team_id)
        zip_filename = join(self.directory, 'NARPS-'+self.team_id+'.zip')
        urlretrieve(self.url, zip_filename, show_download_progress)

        # Unzip files directly in the download directory
        with ZipFile(zip_filename, 'r') as zip_file:
            for zip_info in zip_file.infolist():
                zip_info.filename = zip_info.filename.split('/')[-1]
                zip_info.filename = join(self.directory, zip_info.filename)
                zip_file.extract(zip_info)

        # Remove zip file
        remove(zip_filename)

if __name__ == '__main__':
    # Parse arguments
    parser = ArgumentParser(description='Get Neurovault collection of results from NARPS teams.')
    group = parser.add_mutually_exclusive_group(required = True)
    group.add_argument('-t', '--teams', nargs='+', type=str, action='extend',
        help='a list of team IDs')
    group.add_argument('-a', '--all', action='store_true', help='download results from all teams')
    arguments = parser.parse_args()

    if arguments.all:
        for team_id, _ in implemented_pipelines.items():
            ResultsCollection(team_id).download()
    else:
        for team in arguments.teams:
            ResultsCollection(team).download()
        for file_name, file_url in self.files.items():
            urlretrieve(
                file_url,
                join(self.directory, file_name+'.nii.gz'),
                show_download_progress
            )
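        # Note: show_download_progress is passed as urlretrieve's reporthook,
        # which urllib calls with (block_number, block_size, total_size).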

    def rectify(self):
        """ Rectify files in the collection, if needed.
        This method can be overridden by child classes.
        """
        # Nothing to rectify by default
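The factory above maps team `2T6S` to a `ResultsCollection2T6S` class from `narps_open.data.results.team_2T6S`, a file not shown in this diff. Below is a minimal sketch of what such a specialization could look like; the class name and module path follow the factory's convention, but the rectification operation itself (sign-flipping unthresholded maps with nibabel) is an assumption for illustration only:

```python
# Hypothetical sketch of narps_open/data/results/team_2T6S.py (not part of this diff)
from os.path import join

from nibabel import load, save, Nifti1Image

from narps_open.data.results import ResultsCollection

class ResultsCollection2T6S(ResultsCollection):
    """ A specialized ResultsCollection for team 2T6S """

    def __init__(self):
        super().__init__('2T6S')

    def rectify(self):
        """ Assumed rectification: flip the sign of unthresholded maps. """
        for file_name in self.files:
            if 'unthresh' in file_name:
                # Load the downloaded image and negate its values
                image = load(join(self.directory, file_name + '.nii.gz'))
                rectified = Nifti1Image(-image.get_fdata(), image.affine, image.header)
                # Keep the original file; write the rectified version alongside it
                save(rectified, join(self.directory, file_name + '_rectified.nii.gz'))
```

Writing the rectified image to a separate file keeps the original Neurovault data intact.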
34 changes: 34 additions & 0 deletions narps_open/data/results/__main__.py
@@ -0,0 +1,34 @@
#!/usr/bin/python
# coding: utf-8

""" Provide a command-line interface for the package narps_open.data.results """

from argparse import ArgumentParser

from narps_open.data.results import ResultsCollectionFactory
from narps_open.pipelines import implemented_pipelines

# Parse arguments
parser = ArgumentParser(description='Get Neurovault collection of results from NARPS teams.')
group = parser.add_mutually_exclusive_group(required = True)
group.add_argument('-t', '--teams', nargs='+', type=str, action='extend',
    help='a list of team IDs')
group.add_argument('-a', '--all', action='store_true', help='download results from all teams')
parser.add_argument('-r', '--rectify', action='store_true', default = False, required = False,
    help='rectify the results')
arguments = parser.parse_args()

factory = ResultsCollectionFactory()

if arguments.all:
    for team_id, _ in implemented_pipelines.items():
        collection = factory.get_collection(team_id)
        collection.download()
        if arguments.rectify:
            collection.rectify()
else:
    for team in arguments.teams:
        collection = factory.get_collection(team)
        collection.download()
        if arguments.rectify:
            collection.rectify()
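Because the CLI lives in `__main__.py`, it should also be reachable as a module, e.g. `python -m narps_open.data.results -t 2T6S C88N`, equivalent to the `python narps_open/data/results` form used in the documentation (assuming the package is importable from the current environment).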
57 changes: 57 additions & 0 deletions narps_open/data/results/dataset.py
@@ -0,0 +1,57 @@
#!/usr/bin/python
# coding: utf-8

""" Generate a bash script to create the NARPS results dataset
Warning: unfortunately, the script actually downloads the data.
"""

from os.path import join
from argparse import ArgumentParser

from narps_open.utils.configuration import Configuration
from narps_open.data.results import ResultsCollectionFactory
from narps_open.pipelines import implemented_pipelines

if __name__ == '__main__':
    # Parse arguments
    parser = ArgumentParser(
        description='Generate a bash script to create the NARPS results dataset.'
    )
    parser.add_argument('-r', '--repository', type=str, required=True,
        help='address of the repository to which the dataset will be pushed'
    )
    arguments = parser.parse_args()

    # Handle dataset directory
    dataset_dir = Configuration()['directories']['narps_results']

    # Create a new dataset
    print(f'mkdir -p {dataset_dir}')
    print(f'datalad create -D "NARPS results dataset" {dataset_dir}')

    # Add files for each team results collection
    collection_factory = ResultsCollectionFactory()
    for team_id, _ in implemented_pipelines.items():

        # Init collection
        collection = collection_factory.get_collection(team_id)

        # Create download directory if not existing
        print(f'mkdir -p {collection.directory}')

        # Create dataset entries
        for file_name, file_url in collection.files.items():
            complete_file_name = join(collection.directory, file_name+".nii.gz")
            short_file_name = complete_file_name.replace(dataset_dir, '')
            command = 'datalad download-url'
            command += f' -m \"New file {short_file_name}\"'
            command += f' --path \"{complete_file_name}\"'
            command += f' --dataset \"{dataset_dir}\"'
            command += f' \"{file_url}\"'
            print(command)

    # Push dataset
    print(f'cd {dataset_dir}')
    print(f'git remote add origin {arguments.repository}')
    print('git push -u origin master')
    print('datalad push')
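In practice, the generated commands would presumably be redirected to a file and reviewed before execution, e.g. `python narps_open/data/results/dataset.py -r <repository-url> > create_dataset.sh` followed by `bash create_dataset.sh` (the script name is illustrative). A useful property of `datalad download-url` is that it records each file's source URL in git-annex, so the data can later be re-fetched with `datalad get` instead of being stored in full in the repository.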