Skip to content

Commit

Permalink
Fix download_sources for DAPT tutorial (#569)
Browse files (browse the repository at this point in the history)
Signed-off-by: Sarah Yurick <[email protected]>
  • Loading branch information
sarahyurick authored Feb 25, 2025
1 parent 0158d93 commit e1abd74
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 16 deletions.
17 changes: 8 additions & 9 deletions tutorials/dapt-curation/code/docbuilder.py
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand All @@ -15,7 +15,6 @@
import gzip
import os
import re
from typing import Set, Tuple
from zipfile import ZipFile, ZipInfo

import arxiv as arxiv
Expand All @@ -33,7 +32,7 @@

class WikitxtDownloader(DocumentDownloader):
"""
A class for downloading data from wiki urls.
A class for downloading data from wiki URLs.
"""

def __init__(self, download_dir: str):
Expand Down Expand Up @@ -130,9 +129,9 @@ def split_meta(example):


class WikitxtExtractor(DocumentExtractor):
def extract(self, content: str) -> Tuple[Set, str]:
def extract(self, content: str) -> dict:
# No metadata for the text, just the content.
return {}, content
return {"text": content}


class GitHubDownloader(DocumentDownloader):
Expand Down Expand Up @@ -338,9 +337,9 @@ def iterate(self, file_path: str):


class GitHubExtractor(DocumentExtractor):
def extract(self, content: str):
def extract(self, content: str) -> dict:
# Just return the content.
return {}, content
return {"text": content}


class ArxivDownloader(DocumentDownloader):
Expand Down Expand Up @@ -470,6 +469,6 @@ def iterate(self, file_path: str):


class ArxivExtractor(DocumentExtractor):
def extract(self, content: str):
def extract(self, content: str) -> dict:
# Just return the content.
return {}, content
return {"text": content}
17 changes: 10 additions & 7 deletions tutorials/dapt-curation/code/downloaders.py
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -78,6 +78,7 @@ def download_wikipedia_sources(
"line_count": int,
"size_in_bytes": int,
"path": str,
"file_name": str,
}

downloader = WikitxtDownloader(output_dir)
Expand All @@ -92,8 +93,8 @@ def download_wikipedia_sources(
extractor=extractor,
output_format=output_format,
)
# Force the computation of the dataset
dataset.persist()

dataset.to_json(output_dir, write_to_filename="file_name")
return output_dir


Expand Down Expand Up @@ -151,6 +152,7 @@ def download_github_sources(
"line_count": int,
"size_in_bytes": int,
"path": str,
"file_name": str,
}

dataset = download_and_extract(
Expand All @@ -164,8 +166,8 @@ def download_github_sources(
output_format=output_format,
keep_raw_download=True,
)
# Force the computation of the dataset
dataset.persist()

dataset.to_json(output_jsonl_dir, write_to_filename="file_name")
return output_jsonl_dir


Expand Down Expand Up @@ -225,6 +227,7 @@ def download_pdf_sources(
"line_count": int,
"size_in_bytes": int,
"path": str,
"file_name": str,
}

dataset = download_and_extract(
Expand All @@ -238,6 +241,6 @@ def download_pdf_sources(
output_format=output_format,
keep_raw_download=True,
)
# Force the computation of the dataset
dataset.persist()

dataset.to_json(output_jsonl_dir, write_to_filename="file_name")
return output_jsonl_dir

0 comments on commit e1abd74

Please sign in to comment.