Skip to content

Commit

Permalink
use relative path in id
Browse files Browse the repository at this point in the history
  • Loading branch information
guipenedo committed Nov 10, 2023
1 parent 227314d commit 30af237
Show file tree
Hide file tree
Showing 5 changed files with 7 additions and 7 deletions.
6 changes: 3 additions & 3 deletions src/datatrove/pipeline/readers/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,15 +40,15 @@ def _default_adapter(self, data: dict, path: str, id_in_file: int):
"metadata": data,
}

- def get_document_from_dict(self, data: dict, path: str, id_in_file: int):
- parsed_data = self.adapter(data, path, id_in_file)
+ def get_document_from_dict(self, data: dict, source_file: BaseInputDataFile, id_in_file: int):
+ parsed_data = self.adapter(data, source_file.relative_path, id_in_file)
if not parsed_data.get("content", None):
if not self.empty_warning:
self.empty_warning = True
logger.warning("Found document without content, skipping.")
return None
document = Document(**parsed_data)
- document.metadata.setdefault("file_path", path)
+ document.metadata.setdefault("file_path", source_file.path)
return document

@abstractmethod
Expand Down
2 changes: 1 addition & 1 deletion src/datatrove/pipeline/readers/csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def read_file(self, datafile: BaseInputDataFile):
csv_reader = csv.DictReader(f)
for di, d in enumerate(csv_reader):
with self.stats.time_manager:
- document = self.get_document_from_dict(d, datafile.path, di)
+ document = self.get_document_from_dict(d, datafile, di)
if not document:
continue
yield document
2 changes: 1 addition & 1 deletion src/datatrove/pipeline/readers/jsonl.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def read_file(self, datafile: BaseInputDataFile):
for li, line in enumerate(f):
with self.stats.time_manager:
try:
- document = self.get_document_from_dict(json.loads(line), datafile.path, li)
+ document = self.get_document_from_dict(json.loads(line), datafile, li)
if not document:
continue
except (EOFError, JSONDecodeError) as e:
Expand Down
2 changes: 1 addition & 1 deletion src/datatrove/pipeline/readers/parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def read_file(self, datafile: BaseInputDataFile):
with pq.ParquetFile(f) as pqf:
for li, line in enumerate(pqf.iter_batches(batch_size=1)):
with self.stats.time_manager:
- document = self.get_document_from_dict(line.to_pydict(), datafile.path, li)
+ document = self.get_document_from_dict(line.to_pydict(), datafile, li)
if not document:
continue
yield document
2 changes: 1 addition & 1 deletion src/datatrove/pipeline/readers/warc.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def read_file(self, datafile: BaseInputDataFile):
extracted_data = process_record(record)
if not extracted_data:
continue
- document = self.get_document_from_dict(extracted_data, datafile.path, ri)
+ document = self.get_document_from_dict(extracted_data, datafile, ri)
if not document:
continue
yield document
Expand Down

0 comments on commit 30af237

Please sign in to comment.