diff --git a/src/datatrove/pipeline/readers/parquet.py b/src/datatrove/pipeline/readers/parquet.py index 8d5bb625..8668440b 100644 --- a/src/datatrove/pipeline/readers/parquet.py +++ b/src/datatrove/pipeline/readers/parquet.py @@ -31,10 +31,12 @@ def read_file(self, datafile: BaseInputDataFile): li = 0 columns = [self.content_key, self.id_key] if not self.read_metadata else None for batch in pqf.iter_batches(batch_size=self.batch_size, columns=columns): - with self.track_time(): + documents = [] + with self.track_time("batch"): for line in batch.to_pylist(): document = self.get_document_from_dict(line, datafile, li) if not document: continue + documents.append(document) li += 1 - yield document + yield from documents