diff --git a/src/datatrove/pipeline/readers/warc.py b/src/datatrove/pipeline/readers/warc.py index 46106b88..7d83ba2d 100644 --- a/src/datatrove/pipeline/readers/warc.py +++ b/src/datatrove/pipeline/readers/warc.py @@ -41,7 +41,7 @@ def read_file(self, datafile: BaseInputDataFile): def process_record(record: ArcWarcRecord) -> dict | None: # record type - if record.rec_type != "response": + if record.rec_type != "response" and record.rec_type != "conversion": # wet files have "conversion" type return # content type filtering