Add support for parsing WARC (#153)
* testing warc

* ignore

* testing slow

* langdetect

* optional import

* refactoring

* wip

* style

* wip

* test

* wip

* configs

* hash sample

* small improvements

* updated with output

* more details

* updated readme

* decon wip

* new configs

* tagging content

* changed name of file

* fixes

* deal with empty docs/local files

* increased bloom size

* configs for rest of splits

* switching to option2

* forgot to do two more

* finding punctuation

* tokenizer porting

* configs

* books config

* more sources

* configs

* updated paths

* new c4

* cleaned up

* sampling

* sample

* sampling

* added tokenizer

* update all

* style

* updated

* configs

* tokenizer cli wip

* cli

* wip big refactor

* fixed small bugs

* tokenizer log

* fixed tokenizer paths

* added tokenizer small

* fixed glob issue

* removed temporary directory

* added todo

* conversion script

* more writing

* more docs

* more docs

* logos

* pipelines

* datasheet

* wip

* adding script to make wikipedia

* wip

* more text

* more docs!

* new examples.

* documentation

* fixed bug local file

* lint

* added warc back

* using tokens command

* typo

* moving around

* committing versions

* adding more info

* lang test

* new deps

* added language prediction

* wip

* added url normalization, multiple licenses

* language

* registry fixes

* making tests for registry

* added split extension functionality

* added split extension functionality; style

* reorganized

* cli

* first version

* interface

* remove unused

* removed dead code

* ignores

* ignores
soldni authored May 2, 2024
1 parent c1a4430 commit 3bbea39
Showing 23 changed files with 1,583 additions and 152 deletions.
1 change: 0 additions & 1 deletion .gitignore
@@ -62,7 +62,6 @@ target/

# ignoring test output
/tests/work/
-/python/dolma/core/warc

# ignore vscode directory
.vscode
53 changes: 36 additions & 17 deletions pyproject.toml
@@ -16,7 +16,6 @@ dependencies = [
"msgspec>=0.14.2",
"nltk==3.8.1",
"omegaconf>=2.3.0",
"LTpycld2==0.42", # fork of pycld2 that works on Apple Silicon
# "pycld2==0.41",
# "pycld3==0.22", # does not install correctly
"platformdirs>=4.2.0",
@@ -30,7 +29,6 @@ dependencies = [
"uniseg",
"numpy",
"necessary>=0.4.3",
"langdetect>=1.0.9",
"charset-normalizer>=3.2.0",
]
classifiers = [
@@ -115,27 +113,48 @@ dev = [
code = ["detect-secrets==1.4.0", "beautifulsoup4>=4", "pygments", "regex"]
# extension to detect PIIs using presidio
pii = ["presidio_analyzer==2.2.32", "regex"]
-# # extension to parse warc files
-# warc = [
-#     "warcio>=1.7.4",
-#     "trafilatura>=1.6.1",
-#     "justext>=3.0.0",
-#     "goose3>=3.1.17",
-
-#     # following are all for speeding up trafilatura
-#     "brotli",
-#     "cchardet >= 2.1.7; python_version < '3.11'", # build issue
-#     "faust-cchardet >= 2.1.18; python_version >= '3.11'", # fix for build
-#     "htmldate[speed] >= 1.4.3",
-#     "py3langid >= 0.2.2",
-# ]

# language detection; by default, we use fasttext, everything else is optional
lang = [
    "fasttext-wheel==0.9.2",
    "LTpycld2==0.42", # fork of pycld2 that works on Apple Silicon
    "lingua-language-detector>=2.0.0",
    "langdetect>=1.0.9"
]

# extension to parse warc files
warc = [
    "fastwarc",
    "w3lib",
    "url-normalize",
]
trafilatura = [
    # must include warc dependencies
    "dolma[warc]",
    # core package
    "trafilatura>=1.6.1",
    # following are all for speeding up trafilatura
    "brotli",
    "cchardet >= 2.1.7; python_version < '3.11'", # build issue
    "faust-cchardet >= 2.1.18; python_version >= '3.11'", # fix for build
    "htmldate[speed] >= 1.4.3",
    "py3langid >= 0.2.2",
]

resiliparse = [
    "dolma[warc]",
    "resiliparse",
]

# all extensions
all = [
    "dolma[dev]",
    "dolma[code]",
    "dolma[pii]",
-    # "dolma[warc]",
    "dolma[trafilatura]",
    "dolma[resiliparse]",
    "dolma[lang]"
]

[build-system]
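With WARC support split across optional extras like this, downstream code has to guard its imports. A minimal sketch of that optional-import pattern — the wrapper function and error message are illustrative assumptions, not code from this commit; only fastwarc's ArchiveIterator is the real API the `warc` extra installs:

# Hypothetical guard for the `warc` extra.
try:
    from fastwarc.warc import ArchiveIterator
    FASTWARC_AVAILABLE = True
except ImportError:
    FASTWARC_AVAILABLE = False

def iter_warc_records(path: str):
    # fail with an actionable message instead of a bare ImportError
    if not FASTWARC_AVAILABLE:
        raise ImportError("WARC parsing requires extras: pip install 'dolma[warc]'")
    with open(path, "rb") as stream:
        yield from ArchiveIterator(stream)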
2 changes: 2 additions & 0 deletions python/dolma/cli/__main__.py
@@ -15,6 +15,7 @@
from .resolvers import *  # noqa: F401,F403,W0401
from .tagger import ListTaggerCli, TaggerCli
from .tokenizer import TokenizerCli
from .warc import WarcExtractorCli

AVAILABLE_COMMANDS = {
    "dedupe": DeduperCli,
@@ -23,6 +24,7 @@
"list": ListTaggerCli,
"stat": AnalyzerCli,
"tokens": TokenizerCli,
"warc": WarcExtractorCli,
# following functionality is not yet implemented
# "train-ft": None,
# "train-lm": None,
2 changes: 1 addition & 1 deletion python/dolma/cli/tagger.py
@@ -164,7 +164,7 @@ def run(cls, parsed_config: ListTaggerConfig):
        table.add_column("name", justify="left", style="cyan")
        table.add_column("class", justify="left", style="magenta")

-        for tagger_name, tagger_cls in sorted(TaggerRegistry.taggers()):
        for tagger_name, tagger_cls in sorted(TaggerRegistry.items()):
            tagger_repr = f"{tagger_cls.__module__}.{tagger_cls.__name__}"
            table.add_row(tagger_name, tagger_repr)

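The same registry iteration works outside the CLI. A short sketch mirroring what `dolma list` renders in its table; the import path for TaggerRegistry is an assumption based on the registry-related commits above:

from dolma.core.registry import TaggerRegistry  # import path assumed

for tagger_name, tagger_cls in sorted(TaggerRegistry.items()):
    print(f"{tagger_name}\t{tagger_cls.__module__}.{tagger_cls.__name__}")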
119 changes: 119 additions & 0 deletions python/dolma/cli/warc.py
@@ -0,0 +1,119 @@
from dataclasses import dataclass
from typing import List

from dolma.cli import BaseCli, field, print_config
from dolma.cli.shared import WorkDirConfig, make_workdirs
from dolma.core.errors import DolmaConfigError
from dolma.core.loggers import get_logger
from dolma.core.paths import glob_path
from dolma.warc import create_and_run_warc_pipeline


@dataclass
class TaggerConfig:
    taggers: List[str] = field(
        default=[],
        help="List of taggers to run.",
    )
    skip: bool = field(
        default=False,
        help="Whether to skip documents if the taggers return no output.",
    )


@dataclass
class WarcExtractorConfig:
    documents: List[str] = field(
        default=[],
        help="One or more document paths to process; can be either local or S3 paths. Globs are supported.",
    )
    destination: List[str] = field(
        default=[],
        nargs="*",
        help=(
            "Destination paths to save the outputs; should match the number of document paths. "
            "If not provided, destination will be derived from the document path."
        ),
    )
    processes: int = field(
        default=1,
        help="Number of parallel processes to use.",
    )
    ignore_existing: bool = field(
        default=False,
        help="Whether to ignore existing outputs and re-run the taggers.",
    )

    debug: bool = field(
        default=False,
        help="Whether to run in debug mode.",
    )
    source_name: str = field(help="Name to assign to the source.")
    linearizer: str = field(
        default="resiliparse",
        help="Name of the HTML linearizer to use.",
    )
    pre: TaggerConfig = field(default=TaggerConfig(), help="Configuration for pre-extraction taggers.")
    post: TaggerConfig = field(default=TaggerConfig(), help="Configuration for post-extraction taggers.")
    store_html_in_metadata: bool = field(
        default=False,
        help="Whether to store the HTML content in the metadata.",
    )

    work_dir: WorkDirConfig = field(default=WorkDirConfig(), help="Configuration for temporary work directories.")
    dryrun: bool = field(
        default=False,
        help="If true, only print the configuration and exit without running the taggers.",
    )


class WarcExtractorCli(BaseCli):
    CONFIG = WarcExtractorConfig
    DESCRIPTION = "Extract documents from WARC files and parse HTML out."

    @classmethod
    def run(cls, parsed_config: WarcExtractorConfig):
        logger = get_logger("warc")

        with make_workdirs(parsed_config.work_dir) as work_dirs:
            documents = [str(p) for p in parsed_config.documents]
            destination = [str(p) for p in parsed_config.destination]

            source_name = parsed_config.source_name
            if not isinstance(source_name, str):
                raise ValueError(f"source_name must be a string, not {source_name} ({type(source_name)})")

            # perform some path validation to make sure we don't call
            # the extractor with invalid config
            total_matching_documents = 0
            for document in documents:
                current_matching_documents = sum(1 for _ in glob_path(document))
                if current_matching_documents == 0:
                    # only raise a warning if no documents are found for a single path
                    logger.warning("No documents found for path %s", document)
                total_matching_documents += current_matching_documents

            if total_matching_documents == 0:
                # but raise an error if no documents are found for all paths
                raise DolmaConfigError(f"No documents found for paths {documents}.")

            print_config(parsed_config)
            if parsed_config.dryrun:
                logger.info("Exiting due to dryrun.")
                return

            create_and_run_warc_pipeline(
                documents=(documents[0] if len(documents) == 1 else documents),
                destination=(destination[0] if len(destination) == 1 else destination),
                metadata=work_dirs.output,
                num_processes=parsed_config.processes,
                ignore_existing=parsed_config.ignore_existing,
                debug=parsed_config.debug,
                source_name=source_name,
                pre_taggers=parsed_config.pre.taggers,
                skip_no_pre_taggers=parsed_config.pre.skip,
                post_taggers=parsed_config.post.taggers,
                skip_no_post_taggers=parsed_config.post.skip,
                store_html_in_metadata=parsed_config.store_html_in_metadata,
                linearizer_name=parsed_config.linearizer,
            )
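For reference, the pipeline entry point the CLI wraps can also be called directly. A sketch of such an invocation, using the same keyword arguments as the run() method above; the paths, source name, and process count are hypothetical placeholders:

from dolma.warc import create_and_run_warc_pipeline

create_and_run_warc_pipeline(
    documents="s3://my-bucket/crawl/*.warc.gz",  # hypothetical input glob
    destination="s3://my-bucket/extracted",      # hypothetical output prefix
    metadata="/tmp/dolma-warc-metadata",         # scratch dir for bookkeeping
    num_processes=8,
    ignore_existing=False,
    debug=False,
    source_name="my_crawl",                      # recorded as each document's source
    pre_taggers=[],
    skip_no_pre_taggers=False,
    post_taggers=[],
    skip_no_post_taggers=False,
    store_html_in_metadata=False,
    linearizer_name="resiliparse",               # the default linearizer above
)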
61 changes: 60 additions & 1 deletion python/dolma/core/data_types.py
@@ -20,16 +20,22 @@ class InputSpec(Struct):
    id: str
    text: str
    source: str = ""
    created: str = ""
    added: str = ""
    version: Optional[str] = None


class InputSpecWithMetadata(InputSpec):
    metadata: Optional[Dict[str, Any]] = None


class InputSpecWithMetadataAndAttributes(InputSpecWithMetadata):
    attributes: Optional[Dict[str, List[Tuple[int, int, float]]]] = None


class OutputSpec(Struct):
    id: str
-    attributes: Dict[str, List[Tuple[int, int, float]]]
    attributes: Dict[str, List[TaggerOutputValueType]]
    source: Optional[str] = None


@@ -111,6 +117,59 @@ def __str__(self) -> str:
        return repr_.rstrip(")") + f",metadata={'...' if self.metadata else 'none'})"


class DocumentWithMetadataAndAttributes(DocumentWithMetadata):
    def __init__(
        self, *args, attributes: Optional[Dict[str, List[Tuple[int, int, float]]]] = None, **kwargs
    ) -> None:
        super().__init__(*args, **kwargs)
        self.attributes = attributes or {}

    @classmethod
    def from_spec(cls, spec: InputSpecWithMetadataAndAttributes) -> "DocumentWithMetadataAndAttributes":
        return DocumentWithMetadataAndAttributes(
            source=spec.source,
            version=spec.version,
            id=spec.id,
            text=spec.text,
            metadata=spec.metadata,
            attributes=spec.attributes,
        )

    @classmethod
    def from_json(cls, d: Dict) -> "DocumentWithMetadataAndAttributes":
        return DocumentWithMetadataAndAttributes(
            source=d["source"],
            version=d["version"],
            id=d["id"],
            text=d["text"],
            metadata=d["metadata"],
            attributes=d["attributes"],
        )

    def to_json(self) -> Dict:
        return {
            "source": self.source,
            "version": self.version,
            "id": self.id,
            "text": self.text,
            "metadata": self.metadata,
            "attributes": self.attributes,
        }

    def to_spec(self) -> InputSpecWithMetadataAndAttributes:
        return InputSpecWithMetadataAndAttributes(
            source=self.source,
            version=self.version,
            id=self.id,
            text=self.text,
            metadata=self.metadata,
            attributes=self.attributes,
        )

    def __str__(self) -> str:
        return super().__str__().rstrip(")") + f",attributes={'...' if self.attributes else 'none'})"


class Span:
    __slots__ = "start", "end", "type", "score", "experiment", "tagger"

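Since these specs are msgspec Structs (msgspec is a core dolma dependency), a JSONL document line decodes straight into the new type. A minimal round-trip sketch; the document contents here are made up for illustration:

import msgspec
from dolma.core.data_types import InputSpecWithMetadataAndAttributes

line = (
    b'{"id": "doc-1", "text": "Hello, WARC!", "source": "my_crawl", '
    b'"metadata": {"url": "https://example.com"}, "attributes": {"lang": [[0, 12, 0.98]]}}'
)
spec = msgspec.json.decode(line, type=InputSpecWithMetadataAndAttributes)
print(spec.id, spec.attributes)  # doc-1 {'lang': [(0, 12, 0.98)]}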
20 changes: 20 additions & 0 deletions python/dolma/core/paths.py
@@ -1,4 +1,5 @@
import glob
import os
import re
from functools import partial
from hashlib import sha256
@@ -517,3 +518,22 @@ def decompress_path(path: str, dest: Optional[str] = None) -> str:

    # already decompressed or can't be decompressed
    return path


def split_ext(path: str) -> Tuple[str, Tuple[str, ...], str]:
    """
    Split a path into its protocol, its parts (with all extensions stripped
    from the filename), and the combined extensions.
    """
    prot, parts = split_path(path)
    if not parts:
        return prot, (), ""

    filename = parts[-1]
    extensions = []
    while True:
        filename, ext = os.path.splitext(filename)
        if not ext:
            break
        extensions.append(ext)

    return prot, (*parts[:-1], filename), "".join(reversed(extensions))
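A quick usage sketch of the new helper, assuming split_path splits "s3://bucket/data/file.jsonl.gz" into a protocol plus the parts ("bucket", "data", "file.jsonl.gz"), consistent with the rest of dolma.core.paths:

from dolma.core.paths import split_ext

prot, parts, ext = split_ext("s3://bucket/data/file.jsonl.gz")
# all extensions are peeled off the filename and re-joined in order
print(parts[-1], ext)  # file .jsonl.gz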