Skip to content

Commit

Permalink
Jeremy lig 1745 paginate export endpoints and use (#922)
Browse files Browse the repository at this point in the history
closes lig-1745
- new filename2readUrls mapping function `export_filenames_and_read_urls_by_tag_id`
- paginate and correctly type all export endpoints
  • Loading branch information
japrescott authored Sep 13, 2022
1 parent cd11cef commit dc4b51b
Show file tree
Hide file tree
Showing 10 changed files with 982 additions and 206 deletions.
7 changes: 4 additions & 3 deletions lightly/api/api_workflow_datasets.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from typing import List
from xmlrpc.client import Boolean

from lightly.openapi_generated.swagger_client.models.create_entity_response import CreateEntityResponse
from lightly.openapi_generated.swagger_client.models.dataset_create_request import DatasetCreateRequest
Expand All @@ -22,17 +23,17 @@ def _get_current_dataset(self) -> DatasetData:
"""
return self.get_dataset_by_id(self.dataset_id)

def dataset_exists(self, dataset_id: str):
def dataset_exists(self, dataset_id: str) -> bool:
"""Returns True if a dataset with dataset_id exists. """
try:
self.get_dataset_by_id(dataset_id)
return True
except ApiException:
return False

def get_dataset_by_id(self, dataset_id: str):
def get_dataset_by_id(self, dataset_id: str) -> DatasetData:
"""Returns the dataset for the given dataset id. """
dataset = self._datasets_api.get_dataset_by_id(dataset_id)
dataset: DatasetData = self._datasets_api.get_dataset_by_id(dataset_id)
return dataset

def get_datasets(self, shared: bool = False) -> List[DatasetData]:
Expand Down
86 changes: 72 additions & 14 deletions lightly/api/api_workflow_download_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,16 @@
from urllib.request import Request, urlopen
from PIL import Image

from lightly.api.utils import paginate_endpoint, retry
from torch.utils.hipify.hipify_python import bcolors

from concurrent.futures.thread import ThreadPoolExecutor

from lightly.api.bitmask import BitMask
from lightly.openapi_generated.swagger_client.models.image_type import ImageType
from lightly.openapi_generated.swagger_client.models.filename_and_read_url import FilenameAndReadUrl
from lightly.openapi_generated.swagger_client.models.label_box_data_row import LabelBoxDataRow
from lightly.openapi_generated.swagger_client.models.label_studio_task import LabelStudioTask



Expand Down Expand Up @@ -153,7 +157,7 @@ def lambda_(i):
def export_label_studio_tasks_by_tag_id(
self,
tag_id: str,
) -> List[Dict]:
) -> List[LabelStudioTask]:
"""Exports samples in a format compatible with Label Studio.
The format is documented here:
Expand All @@ -167,16 +171,18 @@ def export_label_studio_tasks_by_tag_id(
A list of LabelStudioTask objects in a format compatible with Label Studio.
"""
label_studio_tasks = self._tags_api.export_tag_to_label_studio_tasks(
self.dataset_id,
tag_id
label_studio_tasks = paginate_endpoint(
self._tags_api.export_tag_to_label_studio_tasks,
page_size=20000,
dataset_id=self.dataset_id,
tag_id=tag_id
)
return label_studio_tasks

def export_label_studio_tasks_by_tag_name(
self,
tag_name: str,
) -> List[Dict]:
) -> List[LabelStudioTask]:
"""Exports samples in a format compatible with Label Studio.
The format is documented here:
Expand Down Expand Up @@ -205,7 +211,7 @@ def export_label_studio_tasks_by_tag_name(
def export_label_box_data_rows_by_tag_id(
self,
tag_id: str,
) -> List[Dict]:
) -> List[LabelBoxDataRow]:
"""Exports samples in a format compatible with Labelbox.
The format is documented here:
Expand All @@ -219,16 +225,18 @@ def export_label_box_data_rows_by_tag_id(
A list of LabelBoxDataRow objects in a format compatible with Labelbox.
"""
label_box_data_rows = self._tags_api.export_tag_to_label_box_data_rows(
self.dataset_id,
tag_id,
label_box_data_rows = paginate_endpoint(
self._tags_api.export_tag_to_label_box_data_rows,
page_size=20000,
dataset_id=self.dataset_id,
tag_id=tag_id
)
return label_box_data_rows

def export_label_box_data_rows_by_tag_name(
self,
tag_name: str,
) -> List[Dict]:
) -> List[LabelBoxDataRow]:
"""Exports samples in a format compatible with Labelbox.
The format is documented here:
Expand Down Expand Up @@ -269,9 +277,10 @@ def export_filenames_by_tag_id(
A list of the filenames of the samples within a certain tag.
"""
filenames = self._tags_api.export_tag_to_basic_filenames(
self.dataset_id,
tag_id,
filenames = retry(
self._tags_api.export_tag_to_basic_filenames,
dataset_id=self.dataset_id,
tag_id=tag_id,
)
return filenames

Expand Down Expand Up @@ -299,4 +308,53 @@ def export_filenames_by_tag_name(
"""
tag = self.get_tag_by_name(tag_name)
return self.export_filenames_by_tag_id(tag.id)
return self.export_filenames_by_tag_id(tag.id)


def export_filenames_and_read_urls_by_tag_id(
    self,
    tag_id: str,
) -> List[FilenameAndReadUrl]:
    """Fetches the filename-to-readURL mapping for every sample in a tag.

    Args:
        tag_id:
            Id of the tag which should be exported.

    Returns:
        A list of mappings between sample filenames and their readURLs
        for the given tag.
    """
    # Export is paginated server-side; collect all pages into one list.
    return paginate_endpoint(
        self._tags_api.export_tag_to_basic_filenames_and_read_urls,
        page_size=20000,
        dataset_id=self.dataset_id,
        tag_id=tag_id,
    )

def export_filenames_and_read_urls_by_tag_name(
    self,
    tag_name: str,
) -> List[FilenameAndReadUrl]:
    """Fetches the filename-to-readURL mapping for every sample in a tag.

    Resolves the tag by its name and delegates to
    `export_filenames_and_read_urls_by_tag_id`.

    Args:
        tag_name:
            Name of the tag which should be exported.

    Returns:
        A list of mappings between sample filenames and their readURLs
        for the given tag.

    Examples:
        >>> # write json file which can be used to access the actual file contents.
        >>> mappings = client.export_filenames_and_read_urls_by_tag_name(
        >>>     'initial-tag'
        >>> )
        >>>
        >>> with open('my-readURL-mappings.json', 'w') as f:
        >>>     json.dump(mappings, f)
    """
    matching_tag = self.get_tag_by_name(tag_name)
    return self.export_filenames_and_read_urls_by_tag_id(matching_tag.id)
30 changes: 30 additions & 0 deletions lightly/api/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import time
import random
from enum import Enum
from typing import List

import numpy as np
from PIL import Image, ImageFilter
Expand Down Expand Up @@ -64,6 +65,35 @@ def retry(func, *args, **kwargs):
f'Maximum retries exceeded! Original exception: {type(e)}: {str(e)}') from e



def paginate_endpoint(fn, page_size=5000, *args, **kwargs) -> List:
    """Collects all entries of a paginated API endpoint into a single list.

    Repeatedly calls `fn` (wrapped in `retry`) with an increasing
    `page_offset` until a call returns an empty page.

    Args:
        fn:
            The endpoint to paginate. It must accept `page_offset` and
            `page_size` keyword arguments and return a list-like page
            of entries.
        page_size:
            The number of entries requested per page.
        *args:
            Additional positional arguments forwarded to `fn`. NOTE: a
            positional argument passed after `fn` binds to `page_size`
            first — pass endpoint arguments as keywords instead.
        **kwargs:
            Additional keyword arguments forwarded to `fn`.

    Returns:
        A list with the entries of all pages, in request order.
    """
    entries: List = []
    offset = 0
    has_more = True
    while has_more:
        # Forward positional args before the pagination keywords; placing
        # *args after keyword arguments in the call was confusing although
        # the bindings are identical.
        chunk = retry(
            fn, *args, page_offset=offset * page_size, page_size=page_size, **kwargs
        )
        # An empty page means the previous page was the last one.
        if len(chunk) == 0:
            has_more = False
        else:
            entries.extend(chunk)
            offset += 1

    return entries




def getenv(key: str, default: str):
"""Return the value of the environment variable key if it exists,
or default if it doesn’t.
Expand Down
2 changes: 2 additions & 0 deletions lightly/openapi_generated/swagger_client/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,8 @@
from lightly.openapi_generated.swagger_client.models.embedding_id_is_processed_body import EmbeddingIdIsProcessedBody
from lightly.openapi_generated.swagger_client.models.file_name_format import FileNameFormat
from lightly.openapi_generated.swagger_client.models.file_output_format import FileOutputFormat
from lightly.openapi_generated.swagger_client.models.filename_and_read_url import FilenameAndReadUrl
from lightly.openapi_generated.swagger_client.models.filename_and_read_urls import FilenameAndReadUrls
from lightly.openapi_generated.swagger_client.models.general_job_result import GeneralJobResult
from lightly.openapi_generated.swagger_client.models.image_type import ImageType
from lightly.openapi_generated.swagger_client.models.initial_tag_create_request import InitialTagCreateRequest
Expand Down
Loading

0 comments on commit dc4b51b

Please sign in to comment.