makes repo compat with dlt 1.0.0
rudolfix committed Dec 4, 2024
1 parent 3d2993a commit 1eff9a7
Showing 75 changed files with 169 additions and 12,060 deletions.
2 changes: 1 addition & 1 deletion Makefile
@@ -31,7 +31,7 @@ lint-dlt-init:
lint-code:
./check-package.sh
poetry run mypy --config-file mypy.ini ./sources
-# poetry run mypy --config-file mypy.ini ./tests/rest_api
+# poetry run mypy --config-file mypy.ini ./tests
poetry run mypy --config-file mypy.ini ./tools
poetry run flake8 --max-line-length=200 --extend-ignore=W503 sources init --show-source
poetry run flake8 --max-line-length=200 --extend-ignore=W503 tests --show-source
47 changes: 35 additions & 12 deletions poetry.lock

Some generated files are not rendered by default.

4 changes: 2 additions & 2 deletions pyproject.toml
@@ -12,11 +12,11 @@ packages = [{include = "sources"}]

[tool.poetry.dependencies]
python = ">=3.8.1,<3.13"
dlt = {version = "0.5.2", allow-prereleases = true, extras = ["redshift", "bigquery", "postgres", "duckdb"]}
dlt = {version = "1.3.0", allow-prereleases = true, extras = ["redshift", "bigquery", "postgres", "duckdb"]}
graphlib-backport = {version = "*", python = "<3.9"}

[tool.poetry.group.dltpure.dependencies]
dlt = {version = "0.5.2", allow-prereleases = true}
dlt = {version = "1.3.0", allow-prereleases = true}

[tool.poetry.group.pytest.dependencies]
pytest = "^7.2.0"
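With the pin moved from 0.5.2 to 1.3.0, a quick sanity check of the installed environment can save a confusing failure later. Below is a minimal sketch using only the standard library; the major-version threshold of 1 mirrors the commit title rather than the exact pin, and the script name is hypothetical.

# check_dlt_version.py - illustrative sanity check that the environment matches this branch
import importlib.metadata

dlt_version = importlib.metadata.version("dlt")
major = int(dlt_version.split(".")[0])
if major < 1:
    raise RuntimeError(
        f"dlt {dlt_version} is installed; this branch of verified sources expects dlt >= 1.x"
    )
print(f"dlt {dlt_version} looks compatible with this branch")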
109 changes: 0 additions & 109 deletions sources/filesystem/README.md

This file was deleted.

100 changes: 1 addition & 99 deletions sources/filesystem/__init__.py
@@ -1,99 +1 @@
"""Reads files in s3, gs or azure buckets using fsspec and provides convenience resources for chunked reading of various file formats"""
from typing import Iterator, List, Optional, Tuple, Union

import dlt
from dlt.common.typing import copy_sig
from dlt.sources import DltResource
from dlt.sources.filesystem import FileItem, FileItemDict, fsspec_filesystem, glob_files
from dlt.sources.credentials import FileSystemCredentials

from .helpers import (
AbstractFileSystem,
FilesystemConfigurationResource,
)
from .readers import (
ReadersSource,
_read_csv,
_read_csv_duckdb,
_read_jsonl,
_read_parquet,
)
from .settings import DEFAULT_CHUNK_SIZE


@dlt.source(_impl_cls=ReadersSource, spec=FilesystemConfigurationResource)
def readers(
bucket_url: str = dlt.secrets.value,
credentials: Union[FileSystemCredentials, AbstractFileSystem] = dlt.secrets.value,
file_glob: Optional[str] = "*",
) -> Tuple[DltResource, ...]:
"""This source provides a few resources that are chunked file readers. Readers can be further parametrized before use
read_csv(chunksize, **pandas_kwargs)
read_jsonl(chunksize)
read_parquet(chunksize)
Args:
bucket_url (str): The url to the bucket.
credentials (FileSystemCredentials | AbstractFileSystem): The credentials to the filesystem, or an fsspec `AbstractFileSystem` instance.
file_glob (str, optional): The filter to apply to the files in glob format. By default, lists all files in bucket_url non-recursively.
"""
return (
filesystem(bucket_url, credentials, file_glob=file_glob)
| dlt.transformer(name="read_csv")(_read_csv),
filesystem(bucket_url, credentials, file_glob=file_glob)
| dlt.transformer(name="read_jsonl")(_read_jsonl),
filesystem(bucket_url, credentials, file_glob=file_glob)
| dlt.transformer(name="read_parquet")(_read_parquet),
filesystem(bucket_url, credentials, file_glob=file_glob)
| dlt.transformer(name="read_csv_duckdb")(_read_csv_duckdb),
)


@dlt.resource(
primary_key="file_url", spec=FilesystemConfigurationResource, standalone=True
)
def filesystem(
bucket_url: str = dlt.secrets.value,
credentials: Union[FileSystemCredentials, AbstractFileSystem] = dlt.secrets.value,
file_glob: Optional[str] = "*",
files_per_page: int = DEFAULT_CHUNK_SIZE,
extract_content: bool = False,
) -> Iterator[List[FileItem]]:
"""This resource lists files in `bucket_url` using `file_glob` pattern. The files are yielded as FileItem which also
provide methods to open and read file data. It should be combined with transformers that further process (ie. load files)
Args:
bucket_url (str): The url to the bucket.
credentials (FileSystemCredentials | AbstractFileSystem): The credentials to the filesystem, or an fsspec `AbstractFileSystem` instance.
file_glob (str, optional): The filter to apply to the files in glob format. By default, lists all files in bucket_url non-recursively.
files_per_page (int, optional): The number of files to process at once, defaults to 100.
extract_content (bool, optional): If True, the content of the file will be extracted;
if False, an fsspec file object is returned. Defaults to False.
Returns:
Iterator[List[FileItem]]: The list of files.
"""
if isinstance(credentials, AbstractFileSystem):
fs_client = credentials
else:
fs_client = fsspec_filesystem(bucket_url, credentials)[0]

files_chunk: List[FileItem] = []
for file_model in glob_files(fs_client, bucket_url, file_glob):
file_dict = FileItemDict(file_model, credentials)
if extract_content:
file_dict["file_content"] = file_dict.read_bytes()
files_chunk.append(file_dict) # type: ignore

# wait for the chunk to be full
if len(files_chunk) >= files_per_page:
yield files_chunk
files_chunk = []
if files_chunk:
yield files_chunk


read_csv = dlt.transformer(standalone=True)(_read_csv)
read_jsonl = dlt.transformer(standalone=True)(_read_jsonl)
read_parquet = dlt.transformer(standalone=True)(_read_parquet)
read_csv_duckdb = dlt.transformer(standalone=True)(_read_csv_duckdb)
"""Please run dlt init filesystem <destination> --branch 0.5 to access legacy version"""
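The standalone `filesystem` resource could also be piped directly into one of the standalone transformers defined at the bottom of the old module. Another sketch along the same lines, again with placeholder names:

import dlt
from sources.filesystem import filesystem, read_jsonl  # legacy module removed by this commit

pipeline = dlt.pipeline(
    pipeline_name="jsonl_example",  # placeholder
    destination="duckdb",
    dataset_name="files_data",      # placeholder
)
# list matching files in pages of files_per_page and stream each page into the reader
jsonl_rows = filesystem(
    bucket_url="s3://example-bucket/logs",  # placeholder
    file_glob="**/*.jsonl",
    files_per_page=50,
) | read_jsonl()
print(pipeline.run(jsonl_rows))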

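After this commit the module body is reduced to the one-line pointer above: run dlt init filesystem <destination> --branch 0.5 to keep using the legacy code. For dlt 1.x itself, the filesystem source ships with the core package, so the equivalent of the sketches above should look roughly like this (verify the import path against your installed dlt version; names other than the imports are placeholders):

import dlt
from dlt.sources.filesystem import filesystem, read_csv  # bundled with dlt 1.x core

pipeline = dlt.pipeline(
    pipeline_name="files_example",  # placeholder
    destination="duckdb",
    dataset_name="files_data",      # placeholder
)
# credentials are resolved by dlt from secrets.toml / environment variables
csv_rows = filesystem(bucket_url="s3://example-bucket/data", file_glob="*.csv") | read_csv()
print(pipeline.run(csv_rows))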