Skip to content

Commit

Permalink
fix tokenization error on empty data
Browse files: browse the repository at this point in the history
  • Loading branch information
guipenedo committed Dec 6, 2023
1 parent 7e007ff commit 3e3f0c8
Showing 1 changed file with 4 additions and 0 deletions.
4 changes: 4 additions & 0 deletions src/datatrove/pipeline/tokens/tokenizer.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -182,6 +182,10 @@ def run(self, data: DocumentsPipeline, rank: int = 0, world_size: int = 1) -> Do
unshuf_filename = self.get_output_filename(rank, "unshuffled")
logger.info(f'Tokenizing in "{unshuf_filename}"...')
outputfile: TokenizedFile = self.write_unshuffled(data, unshuf_filename)
if len(outputfile) == 0:
self.output_folder.close()
logger.warning("No data saved.")
return
if self.shuffle:
shuffled_filename = self.get_output_filename(rank, "shuffled")
# get new TokenizedFile, shuffling docs from original one
Expand Down

0 comments on commit 3e3f0c8

Please sign in to comment.