Add support for parsing WARC (#153)
* testing warc

* ignore

* testing slow

* langdetect

* optional import

* refactoring

* wip

* style

* wip

* test

* wip

* configs

* hash sample

* small improvements

* updated with output

* more details

* updated readme

* decon wip

* new configs

* tagging content

* changed name of file

* fixes

* deal with empty docs/local files

* increased bloom size

* configs for rest of splits

* switching to option2

* forgot to do two more

* finding punctuation

* tokenizer porting

* configs

* books config

* more sources

* configs

* updated paths

* new c4

* cleaned up

* sampling

* sample

* sampling

* added tokenizer

* update all

* style

* updated

* configs

* tokenizer cli wip

* cli

* wip big refactor

* fixed small bugs

* tokenizer log

* fixed tokenizer paths

* added tokenizer small

* fixed glob issue

* removed temporary directory

* added todo

* conversion script

* more writing

* more docs

* more docs

* logos

* pipelines

* datasheet

* wip

* adding script to make wikipedia

* wip

* more text

* more docs!

* new examples.

* documentation

* fixed bug local file

* lint

* added warc back

* using tokens command

* typo

* moving around

* committing versions

* adding more info

* lang test

* new deps

* added language prediction

* wip

* added url normalization, multiple licenses

* language

* registry fixes

* making tests for registry

* added split extension functionality

* added split extension functionality; style

* reorganized

* cli

* first version

* interface

* remove unused

* removed dead code

* ignores

* ignores
soldni authored May 2, 2024
1 parent c1a4430 commit 3bbea39
Showing 23 changed files with 1,583 additions and 152 deletions.
1 change: 0 additions & 1 deletion .gitignore
@@ -62,7 +62,6 @@ target/

# ignoring test output
/tests/work/
-/python/dolma/core/warc

# ignore vscode directory
.vscode
53 changes: 36 additions & 17 deletions pyproject.toml
@@ -16,7 +16,6 @@ dependencies = [
"msgspec>=0.14.2",
"nltk==3.8.1",
"omegaconf>=2.3.0",
"LTpycld2==0.42", # fork of pycld2 that works on Apple Silicon
# "pycld2==0.41",
# "pycld3==0.22", # does not install correctly
"platformdirs>=4.2.0",
@@ -30,7 +29,6 @@ dependencies = [
"uniseg",
"numpy",
"necessary>=0.4.3",
"langdetect>=1.0.9",
"charset-normalizer>=3.2.0",
]
classifiers = [
@@ -115,27 +113,48 @@ dev = [
code = ["detect-secrets==1.4.0", "beautifulsoup4>=4", "pygments", "regex"]
# extension to detect PIIs using presidio
pii = ["presidio_analyzer==2.2.32", "regex"]
-# # extension to parse warc files
-# warc = [
-#     "warcio>=1.7.4",
-#     "trafilatura>=1.6.1",
-#     "justext>=3.0.0",
-#     "goose3>=3.1.17",
-
-#     # following are all for speeding up trafilatura
-#     "brotli",
-#     "cchardet >= 2.1.7; python_version < '3.11'", # build issue
-#     "faust-cchardet >= 2.1.18; python_version >= '3.11'", # fix for build
-#     "htmldate[speed] >= 1.4.3",
-#     "py3langid >= 0.2.2",
-# ]

# language detection; by default, we use fasttext, everything else is optional
lang = [
    "fasttext-wheel==0.9.2",
    "LTpycld2==0.42", # fork of pycld2 that works on Apple Silicon
    "lingua-language-detector>=2.0.0",
    "langdetect>=1.0.9"
]

# extension to parse warc files
warc = [
    "fastwarc",
    "w3lib",
    "url-normalize",
]
trafilatura = [
    # must include warc dependencies
    "dolma[warc]",
    # core package
    "trafilatura>=1.6.1",
    # following are all for speeding up trafilatura
    "brotli",
    "cchardet >= 2.1.7; python_version < '3.11'", # build issue
    "faust-cchardet >= 2.1.18; python_version >= '3.11'", # fix for build
    "htmldate[speed] >= 1.4.3",
    "py3langid >= 0.2.2",
]

resiliparse = [
    "dolma[warc]",
    "resiliparse",
]

# all extensions
all = [
    "dolma[dev]",
    "dolma[code]",
    "dolma[pii]",
-    # "dolma[warc]",
    "dolma[trafilatura]",
    "dolma[resiliparse]",
    "dolma[lang]"
]

[build-system]
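With WARC support split across optional extras like this, downstream code has to guard its imports. A minimal sketch of that optional-import pattern — the wrapper function and error message are illustrative assumptions, not code from this commit; only fastwarc's ArchiveIterator is the real API the `warc` extra installs:

# Hypothetical guard for the `warc` extra.
try:
    from fastwarc.warc import ArchiveIterator
    FASTWARC_AVAILABLE = True
except ImportError:
    FASTWARC_AVAILABLE = False

def iter_warc_records(path: str):
    # fail with an actionable message instead of a bare ImportError
    if not FASTWARC_AVAILABLE:
        raise ImportError("WARC parsing requires extras: pip install 'dolma[warc]'")
    with open(path, "rb") as stream:
        yield from ArchiveIterator(stream)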
2 changes: 2 additions & 0 deletions python/dolma/cli/__main__.py
@@ -15,6 +15,7 @@
from .resolvers import *  # noqa: F401,F403,W0401
from .tagger import ListTaggerCli, TaggerCli
from .tokenizer import TokenizerCli
from .warc import WarcExtractorCli

AVAILABLE_COMMANDS = {
    "dedupe": DeduperCli,
@@ -23,6 +24,7 @@
"list": ListTaggerCli,
"stat": AnalyzerCli,
"tokens": TokenizerCli,
"warc": WarcExtractorCli,
# following functionality is not yet implemented
# "train-ft": None,
# "train-lm": None,
2 changes: 1 addition & 1 deletion python/dolma/cli/tagger.py
@@ -164,7 +164,7 @@ def run(cls, parsed_config: ListTaggerConfig):
        table.add_column("name", justify="left", style="cyan")
        table.add_column("class", justify="left", style="magenta")

-        for tagger_name, tagger_cls in sorted(TaggerRegistry.taggers()):
        for tagger_name, tagger_cls in sorted(TaggerRegistry.items()):
            tagger_repr = f"{tagger_cls.__module__}.{tagger_cls.__name__}"
            table.add_row(tagger_name, tagger_repr)

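The same registry iteration works outside the CLI. A short sketch mirroring what `dolma list` renders in its table; the import path for TaggerRegistry is an assumption based on the registry-related commits above:

from dolma.core.registry import TaggerRegistry  # import path assumed

for tagger_name, tagger_cls in sorted(TaggerRegistry.items()):
    print(f"{tagger_name}\t{tagger_cls.__module__}.{tagger_cls.__name__}")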
119 changes: 119 additions & 0 deletions python/dolma/cli/warc.py
@@ -0,0 +1,119 @@
from dataclasses import dataclass
from typing import List

from dolma.cli import BaseCli, field, print_config
from dolma.cli.shared import WorkDirConfig, make_workdirs
from dolma.core.errors import DolmaConfigError
from dolma.core.loggers import get_logger
from dolma.core.paths import glob_path
from dolma.warc import create_and_run_warc_pipeline


@dataclass
class TaggerConfig:
    taggers: List[str] = field(
        default=[],
        help="List of taggers to run.",
    )
    skip: bool = field(
        default=False,
        help="Whether to skip documents if the taggers return no output.",
    )


@dataclass
class WarcExtractorConfig:
    documents: List[str] = field(
        default=[],
        help="One or more document paths to process; can be either local or S3 paths. Globs are supported.",
    )
    destination: List[str] = field(
        default=[],
        nargs="*",
        help=(
            "Destination paths to save the outputs; should match the number of document paths. "
            "If not provided, destination will be derived from the document path."
        ),
    )
    processes: int = field(
        default=1,
        help="Number of parallel processes to use.",
    )
    ignore_existing: bool = field(
        default=False,
        help="Whether to ignore existing outputs and re-run the taggers.",
    )

    debug: bool = field(
        default=False,
        help="Whether to run in debug mode.",
    )
    source_name: str = field(help="Name to assign to the source.")
    linearizer: str = field(
        default="resiliparse",
        help="Name of the HTML linearizer to use.",
    )
    pre: TaggerConfig = field(default=TaggerConfig(), help="Configuration for pre-extraction taggers.")
    post: TaggerConfig = field(default=TaggerConfig(), help="Configuration for post-extraction taggers.")
    store_html_in_metadata: bool = field(
        default=False,
        help="Whether to store the HTML content in the metadata.",
    )

    work_dir: WorkDirConfig = field(default=WorkDirConfig(), help="Configuration for temporary work directories.")
    dryrun: bool = field(
        default=False,
        help="If true, only print the configuration and exit without running the taggers.",
    )


class WarcExtractorCli(BaseCli):
    CONFIG = WarcExtractorConfig
    DESCRIPTION = "Extract documents from WARC files and parse HTML out."

    @classmethod
    def run(cls, parsed_config: WarcExtractorConfig):
        logger = get_logger("warc")

        with make_workdirs(parsed_config.work_dir) as work_dirs:
            documents = [str(p) for p in parsed_config.documents]
            destination = [str(p) for p in parsed_config.destination]

            source_name = parsed_config.source_name
            if not isinstance(source_name, str):
                raise ValueError(f"source_name must be a string, not {source_name} ({type(source_name)})")

            # perform some path validation to make sure we don't call
            # the extractor with invalid config
            total_matching_documents = 0
            for document in documents:
                current_matching_documents = sum(1 for _ in glob_path(document))
                if current_matching_documents == 0:
                    # only raise a warning if no documents are found for a single path
                    logger.warning("No documents found for path %s", document)
                total_matching_documents += current_matching_documents

            if total_matching_documents == 0:
                # but raise an error if no documents are found for all paths
                raise DolmaConfigError(f"No documents found for paths {documents}.")

            print_config(parsed_config)
            if parsed_config.dryrun:
                logger.info("Exiting due to dryrun.")
                return

            create_and_run_warc_pipeline(
                documents=(documents[0] if len(documents) == 1 else documents),
                destination=(destination[0] if len(destination) == 1 else destination),
                metadata=work_dirs.output,
                num_processes=parsed_config.processes,
                ignore_existing=parsed_config.ignore_existing,
                debug=parsed_config.debug,
                source_name=source_name,
                pre_taggers=parsed_config.pre.taggers,
                skip_no_pre_taggers=parsed_config.pre.skip,
                post_taggers=parsed_config.post.taggers,
                skip_no_post_taggers=parsed_config.post.skip,
                store_html_in_metadata=parsed_config.store_html_in_metadata,
                linearizer_name=parsed_config.linearizer,
            )
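For reference, the pipeline entry point the CLI wraps can also be called directly. A sketch of such an invocation, using the same keyword arguments as the run() method above; the paths, source name, and process count are hypothetical placeholders:

from dolma.warc import create_and_run_warc_pipeline

create_and_run_warc_pipeline(
    documents="s3://my-bucket/crawl/*.warc.gz",  # hypothetical input glob
    destination="s3://my-bucket/extracted",      # hypothetical output prefix
    metadata="/tmp/dolma-warc-metadata",         # scratch dir for bookkeeping
    num_processes=8,
    ignore_existing=False,
    debug=False,
    source_name="my_crawl",                      # recorded as each document's source
    pre_taggers=[],
    skip_no_pre_taggers=False,
    post_taggers=[],
    skip_no_post_taggers=False,
    store_html_in_metadata=False,
    linearizer_name="resiliparse",               # the default linearizer above
)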
61 changes: 60 additions & 1 deletion python/dolma/core/data_types.py
@@ -20,16 +20,22 @@ class InputSpec(Struct):
    id: str
    text: str
    source: str = ""
    created: str = ""
    added: str = ""
    version: Optional[str] = None


class InputSpecWithMetadata(InputSpec):
    metadata: Optional[Dict[str, Any]] = None


class InputSpecWithMetadataAndAttributes(InputSpecWithMetadata):
    attributes: Optional[Dict[str, List[Tuple[int, int, float]]]] = None


class OutputSpec(Struct):
    id: str
-    attributes: Dict[str, List[Tuple[int, int, float]]]
    attributes: Dict[str, List[TaggerOutputValueType]]
    source: Optional[str] = None


@@ -111,6 +117,59 @@ def __str__(self) -> str:
        return repr_.rstrip(")") + f",metadata={'...' if self.metadata else 'none'})"


class DocumentWithMetadataAndAttributes(DocumentWithMetadata):
    def __init__(
        self, *args, attributes: Optional[Dict[str, List[Tuple[int, int, float]]]] = None, **kwargs
    ) -> None:
        super().__init__(*args, **kwargs)
        self.attributes = attributes or {}

    @classmethod
    def from_spec(cls, spec: InputSpecWithMetadataAndAttributes) -> "DocumentWithMetadataAndAttributes":
        return DocumentWithMetadataAndAttributes(
            source=spec.source,
            version=spec.version,
            id=spec.id,
            text=spec.text,
            metadata=spec.metadata,
            attributes=spec.attributes,
        )

    @classmethod
    def from_json(cls, d: Dict) -> "DocumentWithMetadataAndAttributes":
        return DocumentWithMetadataAndAttributes(
            source=d["source"],
            version=d["version"],
            id=d["id"],
            text=d["text"],
            metadata=d["metadata"],
            attributes=d["attributes"],
        )

    def to_json(self) -> Dict:
        return {
            "source": self.source,
            "version": self.version,
            "id": self.id,
            "text": self.text,
            "metadata": self.metadata,
            "attributes": self.attributes,
        }

    def to_spec(self) -> InputSpecWithMetadataAndAttributes:
        return InputSpecWithMetadataAndAttributes(
            source=self.source,
            version=self.version,
            id=self.id,
            text=self.text,
            metadata=self.metadata,
            attributes=self.attributes,
        )

    def __str__(self) -> str:
        return super().__str__().rstrip(")") + f",attributes={'...' if self.attributes else 'none'})"


class Span:
    __slots__ = "start", "end", "type", "score", "experiment", "tagger"

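Since these specs are msgspec Structs (msgspec is a core dolma dependency), a JSONL document line decodes straight into the new type. A minimal round-trip sketch; the document contents here are made up for illustration:

import msgspec
from dolma.core.data_types import InputSpecWithMetadataAndAttributes

line = (
    b'{"id": "doc-1", "text": "Hello, WARC!", "source": "my_crawl", '
    b'"metadata": {"url": "https://example.com"}, "attributes": {"lang": [[0, 12, 0.98]]}}'
)
spec = msgspec.json.decode(line, type=InputSpecWithMetadataAndAttributes)
print(spec.id, spec.attributes)  # doc-1 {'lang': [(0, 12, 0.98)]}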
20 changes: 20 additions & 0 deletions python/dolma/core/paths.py
@@ -1,4 +1,5 @@
import glob
import os
import re
from functools import partial
from hashlib import sha256
@@ -517,3 +518,22 @@ def decompress_path(path: str, dest: Optional[str] = None) -> str:

    # already decompressed or can't be decompressed
    return path


def split_ext(path: str) -> Tuple[str, Tuple[str, ...], str]:
    """
    Split a path into its protocol, its parts (with all extensions stripped
    from the filename), and the combined extensions.
    """
    prot, parts = split_path(path)
    if not parts:
        return prot, (), ""

    filename = parts[-1]
    extensions = []
    while True:
        filename, ext = os.path.splitext(filename)
        if not ext:
            break
        extensions.append(ext)

    return prot, (*parts[:-1], filename), "".join(reversed(extensions))
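A quick usage sketch of the new helper, assuming split_path splits "s3://bucket/data/file.jsonl.gz" into a protocol plus the parts ("bucket", "data", "file.jsonl.gz"), consistent with the rest of dolma.core.paths:

from dolma.core.paths import split_ext

prot, parts, ext = split_ext("s3://bucket/data/file.jsonl.gz")
# all extensions are peeled off the filename and re-joined in order
print(parts[-1], ext)  # file .jsonl.gz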