From 3e3f0c8cb0dcfe7be93d601b478c7442928b9425 Mon Sep 17 00:00:00 2001
From: guipenedo
Date: Wed, 6 Dec 2023 13:35:21 +0100
Subject: [PATCH] fix tokenization error on empty data

---
 src/datatrove/pipeline/tokens/tokenizer.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/datatrove/pipeline/tokens/tokenizer.py b/src/datatrove/pipeline/tokens/tokenizer.py
index 3f03dc9e..401ccdd7 100644
--- a/src/datatrove/pipeline/tokens/tokenizer.py
+++ b/src/datatrove/pipeline/tokens/tokenizer.py
@@ -182,6 +182,10 @@ def run(self, data: DocumentsPipeline, rank: int = 0, world_size: int = 1) -> Do
         unshuf_filename = self.get_output_filename(rank, "unshuffled")
         logger.info(f'Tokenizing in "{unshuf_filename}"...')
         outputfile: TokenizedFile = self.write_unshuffled(data, unshuf_filename)
+        if len(outputfile) == 0:
+            self.output_folder.close()
+            logger.warning("No data saved.")
+            return
         if self.shuffle:
             shuffled_filename = self.get_output_filename(rank, "shuffled")
             # get new TokenizedFile, shuffling docs from original one
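
Note for reviewers: a minimal standalone sketch of the control flow this patch
introduces. The TokenizedFile and run() below are simplified stand-ins, not
datatrove's real classes; the point is only that an empty unshuffled output now
triggers cleanup and an early return before any shuffling is attempted.

    import logging

    logger = logging.getLogger(__name__)

    class TokenizedFile:
        # stand-in: len() reports how many documents were written
        def __init__(self):
            self.doc_ends = []

        def write_doc(self, token_count: int):
            self.doc_ends.append(token_count)

        def __len__(self):
            return len(self.doc_ends)

    def run(docs):
        outputfile = TokenizedFile()
        for tokens in docs:
            outputfile.write_doc(len(tokens))
        if len(outputfile) == 0:
            # same guard as the patch: nothing was tokenized, so warn
            # and return early instead of shuffling an empty file
            logger.warning("No data saved.")
            return None
        return outputfile  # non-empty: shuffling would proceed from here

    run([])        # logs "No data saved." and returns None
    run([[1, 2]])  # returns a TokenizedFile holding one document

Without the guard, a rank that receives no documents would still reach the
shuffle step and operate on an empty file, which is the failure mode the
subject line describes.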