Use the source file's relative path in document ids

huggingface · Nov 10, 2023 · 30af237
1 parent 227314d
commit 30af237
Show file tree

Hide file tree

Showing 5 changed files with 7 additions and 7 deletions.
diff --git a/src/datatrove/pipeline/readers/base.py b/src/datatrove/pipeline/readers/base.py
@@ -40,15 +40,15 @@ def _default_adapter(self, data: dict, path: str, id_in_file: int):
             "metadata": data,
         }
 
-    def get_document_from_dict(self, data: dict, path: str, id_in_file: int):
-        parsed_data = self.adapter(data, path, id_in_file)
+    def get_document_from_dict(self, data: dict, source_file: BaseInputDataFile, id_in_file: int):
+        parsed_data = self.adapter(data, source_file.relative_path, id_in_file)
         if not parsed_data.get("content", None):
             if not self.empty_warning:
                 self.empty_warning = True
                 logger.warning("Found document without content, skipping.")
             return None
         document = Document(**parsed_data)
-        document.metadata.setdefault("file_path", path)
+        document.metadata.setdefault("file_path", source_file.path)
         return document
 
     @abstractmethod

diff --git a/src/datatrove/pipeline/readers/csv.py b/src/datatrove/pipeline/readers/csv.py
@@ -23,7 +23,7 @@ def read_file(self, datafile: BaseInputDataFile):
             csv_reader = csv.DictReader(f)
             for di, d in enumerate(csv_reader):
                 with self.stats.time_manager:
-                    document = self.get_document_from_dict(d, datafile.path, di)
+                    document = self.get_document_from_dict(d, datafile, di)
                     if not document:
                         continue
                 yield document
diff --git a/src/datatrove/pipeline/readers/jsonl.py b/src/datatrove/pipeline/readers/jsonl.py
@@ -25,7 +25,7 @@ def read_file(self, datafile: BaseInputDataFile):
             for li, line in enumerate(f):
                 with self.stats.time_manager:
                     try:
-                        document = self.get_document_from_dict(json.loads(line), datafile.path, li)
+                        document = self.get_document_from_dict(json.loads(line), datafile, li)
                         if not document:
                             continue
                     except (EOFError, JSONDecodeError) as e:

diff --git a/src/datatrove/pipeline/readers/parquet.py b/src/datatrove/pipeline/readers/parquet.py
@@ -19,7 +19,7 @@ def read_file(self, datafile: BaseInputDataFile):
             with pq.ParquetFile(f) as pqf:
                 for li, line in enumerate(pqf.iter_batches(batch_size=1)):
                     with self.stats.time_manager:
-                        document = self.get_document_from_dict(line.to_pydict(), datafile.path, li)
+                        document = self.get_document_from_dict(line.to_pydict(), datafile, li)
                         if not document:
                             continue
                     yield document
diff --git a/src/datatrove/pipeline/readers/warc.py b/src/datatrove/pipeline/readers/warc.py
@@ -23,7 +23,7 @@ def read_file(self, datafile: BaseInputDataFile):
                     extracted_data = process_record(record)
                     if not extracted_data:
                         continue
-                    document = self.get_document_from_dict(extracted_data, datafile.path, ri)
+                    document = self.get_document_from_dict(extracted_data, datafile, ri)
                     if not document:
                         continue
                 yield document