Skip to content

Commit

Permalink
add classifiers and semdedup
Browse files (browse the repository at this point in the history)
Signed-off-by: Sarah Yurick <[email protected]>
  • Loading branch information
sarahyurick committed Oct 24, 2024
1 parent 0027093 commit 994e3bb
Show file tree
Hide file tree
Showing 8 changed files with 26 additions and 7 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
import warnings

os.environ["RAPIDS_NO_INITIALIZE"] = "1"

from nemo_curator import __version__
from nemo_curator.classifiers import AegisClassifier
from nemo_curator.datasets import DocumentDataset

Expand Down Expand Up @@ -103,7 +105,10 @@ def main():


def attach_args():
parser = ArgumentHelper.parse_distributed_classifier_args(max_chars_default=6000)
parser = ArgumentHelper.parse_distributed_classifier_args(
description=f"\nNVIDIA NeMo Curator -- v{__version__}\n\nRun AEGIS classifier inference",
max_chars_default=6000,
)

parser.add_argument(
"--aegis-variant",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
import warnings

os.environ["RAPIDS_NO_INITIALIZE"] = "1"

from nemo_curator import __version__
from nemo_curator.classifiers import DomainClassifier
from nemo_curator.datasets import DocumentDataset

Expand All @@ -29,7 +31,9 @@


def main():
args = ArgumentHelper.parse_distributed_classifier_args().parse_args()
args = ArgumentHelper.parse_distributed_classifier_args(
description=f"\nNVIDIA NeMo Curator -- v{__version__}\n\nRun domain classifier inference"
).parse_args()
print(f"Arguments parsed = {args}", flush=True)
client_args = ArgumentHelper.parse_client_args(args)
client_args["cluster_type"] = "gpu"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
os.environ["RAPIDS_NO_INITIALIZE"] = "1"

from nemo_curator import __version__

from nemo_curator.classifiers import FineWebEduClassifier
from nemo_curator.datasets import DocumentDataset

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
import warnings

os.environ["RAPIDS_NO_INITIALIZE"] = "1"

from nemo_curator import __version__
from nemo_curator.classifiers import QualityClassifier
from nemo_curator.datasets import DocumentDataset

Expand All @@ -29,7 +31,9 @@


def main():
args = ArgumentHelper.parse_distributed_classifier_args().parse_args()
args = ArgumentHelper.parse_distributed_classifier_args(
description=f"\nNVIDIA NeMo Curator -- v{__version__}\n\nRun quality classifier inference"
).parse_args()
print(f"Arguments parsed = {args}", flush=True)

client_args = ArgumentHelper.parse_client_args(args)
Expand Down
2 changes: 2 additions & 0 deletions nemo_curator/scripts/semdedup/clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

import dask_cudf

from nemo_curator import __version__
from nemo_curator.datasets import DocumentDataset
from nemo_curator.log import create_logger
from nemo_curator.modules.config import SemDedupConfig
Expand Down Expand Up @@ -81,6 +82,7 @@ def main(args):
def attach_args():
parser = ArgumentHelper.parse_semdedup_args(
description=(
f"NVIDIA NeMo Curator -- v{__version__} "
"Performs clustering on the computed embeddings of a collection of documents. "
"This script requires that the embeddings have been created beforehand using: "
"semdedup_extract_embeddings"
Expand Down
2 changes: 2 additions & 0 deletions nemo_curator/scripts/semdedup/compute_embeddings.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import os
import time

from nemo_curator import __version__
from nemo_curator.datasets import DocumentDataset
from nemo_curator.log import create_logger
from nemo_curator.modules.config import SemDedupConfig
Expand Down Expand Up @@ -87,6 +88,7 @@ def main(args):
def attach_args():
parser = ArgumentHelper.parse_semdedup_args(
description=(
f"NVIDIA NeMo Curator -- v{__version__} "
"Computes the embeddings of a collection of documents using the specified model. "
"The model is specified in the config file using embedding_model_name_or_path (e.g. 'sentence-transformers/paraphrase-MiniLM-L6-v2'). "
"The embeddings are saved in the specified cache directory under the embeddings_save_loc directory. "
Expand Down
2 changes: 2 additions & 0 deletions nemo_curator/scripts/semdedup/extract_dedup_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import os
from datetime import datetime

from nemo_curator import __version__
from nemo_curator.log import create_logger
from nemo_curator.modules.config import SemDedupConfig
from nemo_curator.modules.semantic_dedup import SemanticClusterLevelDedup
Expand Down Expand Up @@ -62,6 +63,7 @@ def main(args):
def attach_args():
parser = ArgumentHelper.parse_semdedup_args(
description=(
f"NVIDIA NeMo Curator -- v{__version__} "
"Extracts deduplicated data from the clustered embeddings of a collection of documents. "
"This script requires that embeddings and clustering have been performed beforehand using the specified configurations. "
"earlier using semdedup_extract_embeddings and semdedup_cluster_embeddings."
Expand Down
7 changes: 4 additions & 3 deletions nemo_curator/utils/script_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,12 @@ def __init__(self, parser: argparse.ArgumentParser):

def attach_version_arg(self):
self.parser.add_argument(
"--version", "-v",
"--version",
"-v",
action="version",
version=f"NVIDIA NeMo Curator -- v{__version__}",
help="Show the version and exit."
)
help="Show the version and exit.",
)

@staticmethod
def attach_bool_arg(
Expand Down

0 comments on commit 994e3bb

Please sign in to comment.