Skip to content

Commit

Permalink
Jeremy lig 1745 paginate export endpoints and use (#922)
Browse files Browse the repository at this point in the history
closes lig-1745
- new filename2readUrls mapping function `export_filenames_and_read_urls_by_tag_id`
- paginate and correctly type all export endpoints
  • Loading branch information
japrescott authored Sep 13, 2022
1 parent cd11cef commit dc4b51b
Show file tree
Hide file tree
Showing 10 changed files with 982 additions and 206 deletions.
7 changes: 4 additions & 3 deletions lightly/api/api_workflow_datasets.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from typing import List
from xmlrpc.client import Boolean

from lightly.openapi_generated.swagger_client.models.create_entity_response import CreateEntityResponse
from lightly.openapi_generated.swagger_client.models.dataset_create_request import DatasetCreateRequest
Expand All @@ -22,17 +23,17 @@ def _get_current_dataset(self) -> DatasetData:
"""
return self.get_dataset_by_id(self.dataset_id)

def dataset_exists(self, dataset_id: str):
def dataset_exists(self, dataset_id: str) -> bool:
"""Returns True if a dataset with dataset_id exists. """
try:
self.get_dataset_by_id(dataset_id)
return True
except ApiException:
return False

def get_dataset_by_id(self, dataset_id: str):
def get_dataset_by_id(self, dataset_id: str) -> DatasetData:
"""Returns the dataset for the given dataset id. """
dataset = self._datasets_api.get_dataset_by_id(dataset_id)
dataset: DatasetData = self._datasets_api.get_dataset_by_id(dataset_id)
return dataset

def get_datasets(self, shared: bool = False) -> List[DatasetData]:
Expand Down
86 changes: 72 additions & 14 deletions lightly/api/api_workflow_download_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,16 @@
from urllib.request import Request, urlopen
from PIL import Image

from lightly.api.utils import paginate_endpoint, retry
from torch.utils.hipify.hipify_python import bcolors

from concurrent.futures.thread import ThreadPoolExecutor

from lightly.api.bitmask import BitMask
from lightly.openapi_generated.swagger_client.models.image_type import ImageType
from lightly.openapi_generated.swagger_client.models.filename_and_read_url import FilenameAndReadUrl
from lightly.openapi_generated.swagger_client.models.label_box_data_row import LabelBoxDataRow
from lightly.openapi_generated.swagger_client.models.label_studio_task import LabelStudioTask



Expand Down Expand Up @@ -153,7 +157,7 @@ def lambda_(i):
def export_label_studio_tasks_by_tag_id(
self,
tag_id: str,
) -> List[Dict]:
) -> List[LabelStudioTask]:
"""Exports samples in a format compatible with Label Studio.
The format is documented here:
Expand All @@ -167,16 +171,18 @@ def export_label_studio_tasks_by_tag_id(
A list of LabelStudioTask objects in a format compatible with Label Studio.
"""
label_studio_tasks = self._tags_api.export_tag_to_label_studio_tasks(
self.dataset_id,
tag_id
label_studio_tasks = paginate_endpoint(
self._tags_api.export_tag_to_label_studio_tasks,
page_size=20000,
dataset_id=self.dataset_id,
tag_id=tag_id
)
return label_studio_tasks

def export_label_studio_tasks_by_tag_name(
self,
tag_name: str,
) -> List[Dict]:
) -> List[LabelStudioTask]:
"""Exports samples in a format compatible with Label Studio.
The format is documented here:
Expand Down Expand Up @@ -205,7 +211,7 @@ def export_label_studio_tasks_by_tag_name(
def export_label_box_data_rows_by_tag_id(
self,
tag_id: str,
) -> List[Dict]:
) -> List[LabelBoxDataRow]:
"""Exports samples in a format compatible with Labelbox.
The format is documented here:
Expand All @@ -219,16 +225,18 @@ def export_label_box_data_rows_by_tag_id(
A list of LabelBoxDataRow objects in a format compatible with Labelbox.
"""
label_box_data_rows = self._tags_api.export_tag_to_label_box_data_rows(
self.dataset_id,
tag_id,
label_box_data_rows = paginate_endpoint(
self._tags_api.export_tag_to_label_box_data_rows,
page_size=20000,
dataset_id=self.dataset_id,
tag_id=tag_id
)
return label_box_data_rows

def export_label_box_data_rows_by_tag_name(
self,
tag_name: str,
) -> List[Dict]:
) -> List[LabelBoxDataRow]:
"""Exports samples in a format compatible with Labelbox.
The format is documented here:
Expand Down Expand Up @@ -269,9 +277,10 @@ def export_filenames_by_tag_id(
A list of the filenames of the samples within a certain tag.
"""
filenames = self._tags_api.export_tag_to_basic_filenames(
self.dataset_id,
tag_id,
filenames = retry(
self._tags_api.export_tag_to_basic_filenames,
dataset_id=self.dataset_id,
tag_id=tag_id,
)
return filenames

Expand Down Expand Up @@ -299,4 +308,53 @@ def export_filenames_by_tag_name(
"""
tag = self.get_tag_by_name(tag_name)
return self.export_filenames_by_tag_id(tag.id)
return self.export_filenames_by_tag_id(tag.id)


def export_filenames_and_read_urls_by_tag_id(
    self,
    tag_id: str,
) -> List[FilenameAndReadUrl]:
    """Fetches the filename-to-readURL mapping for every sample in a tag.

    Args:
        tag_id:
            Id of the tag which should be exported.

    Returns:
        A list of mappings between sample filenames and their readURLs
        for the given tag.
    """
    # Export is paginated server-side; collect all pages into one list.
    return paginate_endpoint(
        self._tags_api.export_tag_to_basic_filenames_and_read_urls,
        page_size=20000,
        dataset_id=self.dataset_id,
        tag_id=tag_id,
    )

def export_filenames_and_read_urls_by_tag_name(
    self,
    tag_name: str,
) -> List[FilenameAndReadUrl]:
    """Fetches the filename-to-readURL mapping for every sample in a tag.

    Resolves the tag by its name and delegates to
    `export_filenames_and_read_urls_by_tag_id`.

    Args:
        tag_name:
            Name of the tag which should be exported.

    Returns:
        A list of mappings between sample filenames and their readURLs
        for the given tag.

    Examples:
        >>> # write json file which can be used to access the actual file contents.
        >>> mappings = client.export_filenames_and_read_urls_by_tag_name(
        >>>     'initial-tag'
        >>> )
        >>>
        >>> with open('my-readURL-mappings.json', 'w') as f:
        >>>     json.dump(mappings, f)
    """
    matching_tag = self.get_tag_by_name(tag_name)
    return self.export_filenames_and_read_urls_by_tag_id(matching_tag.id)
30 changes: 30 additions & 0 deletions lightly/api/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import time
import random
from enum import Enum
from typing import List

import numpy as np
from PIL import Image, ImageFilter
Expand Down Expand Up @@ -64,6 +65,35 @@ def retry(func, *args, **kwargs):
f'Maximum retries exceeded! Original exception: {type(e)}: {str(e)}') from e



def paginate_endpoint(fn, page_size=5000, *args, **kwargs) -> List:
    """Collects all entries of a paginated API endpoint into a single list.

    Repeatedly calls `fn` (wrapped in `retry`) with an increasing
    `page_offset` until a call returns an empty page.

    Args:
        fn:
            The endpoint to paginate. It must accept `page_offset` and
            `page_size` keyword arguments and return a list-like page
            of entries.
        page_size:
            The number of entries requested per page.
        *args:
            Additional positional arguments forwarded to `fn`. NOTE: a
            positional argument passed after `fn` binds to `page_size`
            first — pass endpoint arguments as keywords instead.
        **kwargs:
            Additional keyword arguments forwarded to `fn`.

    Returns:
        A list with the entries of all pages, in request order.
    """
    entries: List = []
    offset = 0
    has_more = True
    while has_more:
        # Forward positional args before the pagination keywords; placing
        # *args after keyword arguments in the call was confusing although
        # the bindings are identical.
        chunk = retry(
            fn, *args, page_offset=offset * page_size, page_size=page_size, **kwargs
        )
        # An empty page means the previous page was the last one.
        if len(chunk) == 0:
            has_more = False
        else:
            entries.extend(chunk)
            offset += 1

    return entries




def getenv(key: str, default: str):
"""Return the value of the environment variable key if it exists,
or default if it doesn’t.
Expand Down
2 changes: 2 additions & 0 deletions lightly/openapi_generated/swagger_client/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,8 @@
from lightly.openapi_generated.swagger_client.models.embedding_id_is_processed_body import EmbeddingIdIsProcessedBody
from lightly.openapi_generated.swagger_client.models.file_name_format import FileNameFormat
from lightly.openapi_generated.swagger_client.models.file_output_format import FileOutputFormat
from lightly.openapi_generated.swagger_client.models.filename_and_read_url import FilenameAndReadUrl
from lightly.openapi_generated.swagger_client.models.filename_and_read_urls import FilenameAndReadUrls
from lightly.openapi_generated.swagger_client.models.general_job_result import GeneralJobResult
from lightly.openapi_generated.swagger_client.models.image_type import ImageType
from lightly.openapi_generated.swagger_client.models.initial_tag_create_request import InitialTagCreateRequest
Expand Down
Loading

0 comments on commit dc4b51b

Please sign in to comment.