Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add git to filesystem source 301 devel #893

Closed
Closed
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Remove discussion comments
deanja committed Jan 14, 2024
commit 3e93685a670476a99467fc4e28963ecf8c923d97
23 changes: 0 additions & 23 deletions dlt/common/storages/configuration.py
Original file line number Diff line number Diff line change
@@ -112,27 +112,7 @@ def protocol(self) -> str:
return url.scheme

def on_resolved(self) -> None:
#301 this parse will mangle some fsspec urls. Some need to have
# their netloc removed before parsing, usually possible with
# fsspec class method _strip_protocol(). But We dont have a handle
# on the fsspec instance here, so we can't it the fsspec way unless
# we use a class factory for fsspec implementations, which will need
# imports carefully managed.

# So we have Unpredictable basis for evaluating the rules, and any new rules.
url = urlparse(self.bucket_url)
#301
# Do we need this rule at all? without it we get nice error:
# "cannot traverse all of s3" if supply s3://, s3:///
# ToDo: test with google, azure.
#
# The rule is not valid for all fsspec implementations.
# eg, for github, gitpythonfs at least, the netloc
# is stripped out by fsspec and you can ls(), walk() etc the entire
# repo from root.
# May be better to be explicit what protocols _are_ subject to this rule.
# Guessing it is anything bucket-like, because you can't download all
# buckets at once and you can't glob on bucket names?
if not url.scheme in ('gitpythonfs', 'github', 'git', "s3"):
if not url.path and not url.netloc:
raise ConfigurationValueError(
@@ -157,9 +137,6 @@ def __str__(self) -> str:
"""Return displayable destination location"""
url = urlparse(self.bucket_url)
# do not show passwords
#301 Some fsspec implementation put other things in the password@ slot.
# Is hiding the password the more important use case?
# Or add conditions to support both?
if url.password:
new_netloc = f"{url.username}:****@{url.hostname}"
if url.port:
24 changes: 0 additions & 24 deletions dlt/common/storages/fsspec_filesystem.py
Original file line number Diff line number Diff line change
@@ -119,20 +119,6 @@ def fsspec_from_config(config: FilesystemConfiguration) -> Tuple[AbstractFileSys
"filesystem", [f"{version.DLT_PKG_NAME}[{config.protocol}]"]
) from e

#301 Concerns about this class:
# Confirm it is a stateful class/object because it's an output of dlt Resource?
# If yes, could that be made clearer by it inheriting/implementing a dlt Python type?
#
# It has complex responsibilities and not enough scope to fulfull them:
# 1. store file metadata. ok, that's easy.
# 2. lazy fsspec factory, which also exists as functions above, albeit different signature.
# 4. 'file_url' is not enough to instantiate an fs because url params are stripped out. The
# params would need to have been copied into `credentials` object or turned into kwargs.
#
# Possible way to tidy up is to require AbstractFileSystem instance in constructor.
# Mostly, this class is instantiated by filesystem Resource, which has an fs intance it can pass in.
# Edge case is sources/inbox/__init__.py which instantiates it directly. But only accesses simple dict items?
# Are dlt users likely to have instantiated this class directly?
class FileItemDict(DictStrAny):
"""A FileItem dictionary with additional methods to get fsspec filesystem, open and read files."""

@@ -161,10 +147,6 @@ def fsspec(self) -> AbstractFileSystem:
if isinstance(self.credentials, AbstractFileSystem):
return self.credentials
else:
# #301 file_url has two purposes: a) dlt destination field b) fsspec constructor param.
# For fsspec url params to work, the params
# need to still be in the file_url here. Specifically, gitpython needs the repo_path
# from the username: section of the url. But it was removed in ./glob_files().
return fsspec_filesystem(self["file_url"], self.credentials)[0]

def open(self, mode: str = "rb", **kwargs: Any) -> IO[Any]: # noqa: A003
@@ -195,7 +177,6 @@ def open(self, mode: str = "rb", **kwargs: Any) -> IO[Any]: # noqa: A003
**text_kwargs,
)
else:
# #301 ok once fsspec() method file_url fixed?
opened_file = self.fsspec.open(self["file_url"], mode=mode, **kwargs)
return opened_file

@@ -209,7 +190,6 @@ def read_bytes(self) -> bytes:
return ( # type: ignore
self["file_content"]
if "file_content" in self and self["file_content"] is not None
# #301 ok once fsspec() method file_url fixed?
else self.fsspec.read_bytes(self["file_url"])
)

@@ -236,14 +216,10 @@ def glob_files(
import os

protocol = urlparse(bucket_url).scheme
# NOTE: _strip_protocol is internal method. No easy way to do this via
# public url_to_fs() without making breaking changes to filesystem Source/Resource.
fsspec_path = fs_client._strip_protocol(bucket_url)
bucket_url = f"{protocol}://{fsspec_path}"

bucket_url_parsed = urlparse(bucket_url)
# netloc_unquoted = unquote(bucket_url_parsed.netloc)
# bucket_url_parsed = bucket_url_parsed._replace(netloc=netloc_unquoted)

# if this is a file path without a scheme
if not bucket_url_parsed.scheme or (os.path.isabs(bucket_url) and "\\" in bucket_url):