From 5a0155235665442a1b850d5071af9e1fcdcef0bb Mon Sep 17 00:00:00 2001
From: Quentin Lhoest
Date: Tue, 29 Oct 2024 19:22:48 +0100
Subject: [PATCH 1/2] render video urls

---
 libs/libcommon/src/libcommon/url_preparator.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/libs/libcommon/src/libcommon/url_preparator.py b/libs/libcommon/src/libcommon/url_preparator.py
index 902199d19..c4a1d1c9f 100644
--- a/libs/libcommon/src/libcommon/url_preparator.py
+++ b/libs/libcommon/src/libcommon/url_preparator.py
@@ -6,7 +6,7 @@
 from dataclasses import dataclass
 from typing import Any, Callable, Literal, Optional, Union
 
-from datasets import Audio, Features, Image
+from datasets import Audio, Features, Image, Video
 from datasets.features.features import FeatureType, Sequence
 
 from libcommon.cloudfront import CloudFrontSigner
@@ -23,7 +23,7 @@ class InvalidFirstRowsError(ValueError):
 
 @dataclass
 class AssetUrlPath:
-    feature_type: Literal["Audio", "Image"]
+    feature_type: Literal["Audio", "Image", "Video"]
     path: VisitPath
 
     def enter(self) -> "AssetUrlPath":
@@ -70,7 +70,10 @@ def classify(feature: FeatureType, visit_path: VisitPath) -> None:
             if isinstance(feature, Image):
                 asset_url_paths.append(AssetUrlPath(feature_type="Image", path=visit_path))
             elif isinstance(feature, Audio):
+                # for audio we give a list in case there are multiple formats available
                 asset_url_paths.append(AssetUrlPath(feature_type="Audio", path=visit_path + [0]))
+            elif isinstance(feature, Video):
+                asset_url_paths.append(AssetUrlPath(feature_type="Video", path=visit_path))
 
         _visit(feature, classify, [column])
     return asset_url_paths

From a83176e5f29670dcbbb969fa04720a2ee75590ee Mon Sep 17 00:00:00 2001
From: Quentin Lhoest
Date: Tue, 29 Oct 2024 19:22:58 +0100
Subject: [PATCH 2/2] disable multithreading for easier debugging

---
 libs/libapi/src/libapi/rows_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libs/libapi/src/libapi/rows_utils.py b/libs/libapi/src/libapi/rows_utils.py
index 90a0bee49..9741b9c86 100644
--- a/libs/libapi/src/libapi/rows_utils.py
+++ b/libs/libapi/src/libapi/rows_utils.py
@@ -66,7 +66,7 @@ async def transform_rows(
         offset=offset,
         row_idx_column=row_idx_column,
     )
-    if "Audio(" in str(features) or "Image(" in str(features) or "Video(" in str(features):
+    if "Audio(" in str(features) or "Image(" in str(features):
         # Use multithreading to parallelize image/audio files uploads.
         # Also multithreading is ok to convert audio data
         # (we use pydub which might spawn one ffmpeg process per conversion, which releases the GIL)