Skip to content

Commit

Permalink
Merge pull request #207 from openlawlibrary/dgreisen/get_file
Browse files Browse the repository at this point in the history
perf: reimplement slow git calls in pygit2
  • Loading branch information
renatav authored Feb 8, 2022
2 parents 85512c9 + d455f97 commit fd99b88
Show file tree
Hide file tree
Showing 6 changed files with 158 additions and 1 deletion.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,13 @@ and this project adheres to [Semantic Versioning][semver].


### Changed
- perf: re-implementing slow git cmds with pygit2 ([207])


### Fixed
- pytest works when taf installed via wheel ([200])

[207]: https://github.com/openlawlibrary/taf/pull/207
[200]: https://github.com/openlawlibrary/taf/pull/200

## [0.14.0] - 01/25/2022
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ def finalize_options(self):
"loguru==0.4.0",
"cryptography==3.2.1",
"pyOpenSSL==20.0.1",
"pygit2==0.28.2",
],
"extras_require": {
"ci": ci_require,
Expand Down
39 changes: 38 additions & 1 deletion taf/git.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,15 @@

import taf.settings as settings
from taf.exceptions import (
TAFError,
CloneRepoException,
FetchException,
InvalidRepositoryError,
GitError,
)
from taf.log import taf_logger
from taf.utils import run
from .pygit import PyGitRepository

EMPTY_TREE = "4b825dc642cb6eb9a060e54bf8d69288fbee4904"

Expand Down Expand Up @@ -84,6 +86,17 @@ def __init__(
self.default_branch = default_branch
self.custom = custom or {}

# Cached PyGitRepository wrapper; created on first access to `pygit`.
_pygit = None

@property
def pygit(self):
    """Return the lazily created ``PyGitRepository`` for this repository.

    The wrapper is built on first access and cached in ``self._pygit``.
    If it cannot be created, the failure is logged and ``None`` is
    returned; nothing is cached, so the next access tries again.
    """
    if self._pygit is None:
        try:
            self._pygit = PyGitRepository(self)
        except Exception as e:
            # Deliberately best effort: a missing wrapper must not break the
            # repository object. Log the reason instead of discarding it.
            taf_logger.debug(
                f"Could not open repository at {self.path} with pygit2: {e}"
            )
    return self._pygit

@classmethod
def from_json_dict(cls, json_data):
"""Create a new instance based on data contained by the `json_data` dictionary,
Expand Down Expand Up @@ -416,6 +429,11 @@ def checkout_orphan_branch(self, branch_name):
def clean(self):
    """Run ``git clean -fd`` in this repository.

    ``-f`` forces the removal of untracked files and ``-d`` extends it to
    untracked directories.
    """
    self._git("clean -fd")

def cleanup(self):
    """Release the cached pygit wrapper, if one was ever created.

    Calls the wrapper's own ``cleanup`` and then clears the cache so a
    later access to ``pygit`` builds a fresh wrapper.
    """
    wrapper = self._pygit
    if wrapper is None:
        # Nothing was opened, so there is nothing to release.
        return
    wrapper.cleanup()
    self._pygit = None

def clone(self, no_checkout=False, bare=False, **kwargs):
self._log_info("cloning repository")
shutil.rmtree(self.path, True)
Expand Down Expand Up @@ -609,7 +627,17 @@ def get_json(self, commit, path, raw=False):

def get_file(self, commit, path, raw=False):
    """Return the contents of the file at ``path`` as of ``commit``.

    ``path`` is normalised to a POSIX-style path first. When ``raw`` is
    true the request always goes through ``git show``. Otherwise the
    pygit implementation is tried first and ``git show`` is used as a
    fallback if it fails unexpectedly.

    Raises:
        TAFError: propagated unchanged when the pygit lookup raises it;
            no fallback is attempted in that case.
    """
    path = Path(path).as_posix()
    if raw:
        return self._git("show {}:{}", commit, path, raw=raw)
    try:
        return self.pygit.get_file(commit, path)
    except TAFError:
        # A TAF-level error is a definitive answer; bare `raise` keeps the
        # original traceback intact.
        raise
    except Exception:
        # Any other failure (including pygit being unavailable) falls back
        # to the git command line.
        return self._git("show {}:{}", commit, path, raw=raw)

def get_first_commit_on_branch(self, branch=None):
branch = branch or self.default_branch
Expand Down Expand Up @@ -714,6 +742,15 @@ def is_remote_branch(self, branch_name):
return False

def list_files_at_revision(self, commit, path=""):
    """List the files under ``path`` as of ``commit``.

    The pygit implementation is tried first; if it fails unexpectedly the
    git command line implementation (``_list_files_at_revision``) is used.

    An empty (or ``None``) ``path`` is passed on as ``""`` rather than
    being normalised, because ``Path("").as_posix()`` yields ``"."``.

    Raises:
        TAFError: propagated unchanged when the pygit lookup raises it;
            no fallback is attempted in that case.
    """
    # Only normalise real paths; keep "" for "no path given".
    # NOTE(review): assumes both implementations treat "" as the repository
    # root -- confirm against `_list_files_at_revision`.
    path = Path(path).as_posix() if path else ""
    try:
        return self.pygit.list_files_at_revision(commit, path)
    except TAFError:
        # Bare `raise` keeps the original traceback intact.
        raise
    except Exception:
        return self._list_files_at_revision(commit, path)

def _list_files_at_revision(self, commit, path):
if path is None:
path = ""
file_names = self._git("ls-tree -r --name-only {}", commit)
Expand Down
114 changes: 114 additions & 0 deletions taf/pygit.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
import pygit2
from taf.log import taf_logger as logger
from taf.exceptions import GitError
import os.path


class PyGitRepository:
    """pygit2-backed reader for the repository wrapped by ``encapsulating_repo``.

    Provides in-process replacements for reading a file and listing files
    at a given commit. Call ``cleanup`` when done so the underlying pygit2
    repository is freed.
    """

    def __init__(
        self,
        encapsulating_repo,
        *args,
        **kwargs,
    ):
        """Open the repository located at ``encapsulating_repo.path``.

        Extra positional and keyword arguments are accepted and ignored.
        """
        self.encapsulating_repo = encapsulating_repo
        self.path = encapsulating_repo.path
        self.repo = pygit2.Repository(str(self.path))

    def _get_child(self, parent, path_part):
        """
        Return the child object of a parent object, or None if the parent
        has no entry named ``path_part``.
        Used for walking a git tree.
        """
        try:
            out = parent[path_part]
        except KeyError:
            return None
        else:
            return self.repo[out.id]

    def _get_object_at_path(self, obj, path):
        """
        for the given commit object,
        get the object at the given path, or None if it does not exist.

        Empty components and "." are skipped, so "", "." and a trailing
        "/" all resolve relative to the commit's root tree.
        """
        working = obj.tree
        for part in path.split("/"):
            if part in ("", "."):
                # Not a real tree entry; stay at the current level.
                continue
            working = self._get_child(working, part)
            if working is None:
                return None
        return working

    def _get_blob_at_path(self, obj, path):
        """
        for the given commit object,
        get the blob at the given path, or None if there is no blob there.
        """
        # Messages are pre-formatted so they read correctly whether the
        # project logger interpolates %-style or {}-style arguments.
        logger.debug(f"Get blob at path {path}")
        working = self._get_object_at_path(obj, path)
        # isinstance alone also rejects None and does not depend on the
        # truthiness of the blob object.
        if isinstance(working, pygit2.Blob):
            logger.debug(f"Found blob at path {path}")
            return working
        logger.debug(f"Blob not found at path {path}")
        return None

    def cleanup(self):
        """
        Must call this function in order to release pygit2 file handles.
        """
        self.repo.free()

    def get_file(self, commit, path):
        """
        for the given commit string,
        return the string contents of the blob at the
        given path, if it exists, otherwise raise GitError
        """
        obj = self.repo.get(commit)
        blob = self._get_blob_at_path(obj, path)
        if blob is None:
            raise GitError(
                self.encapsulating_repo,
                message=f"fatal: Path '{path}' does not exist in '{commit}'",
            )
        return blob.read_raw().decode()

    def _list_files_at_revision(self, tree, path="", results=None):
        """
        recurse through tree and return paths relative to that tree for
        all blobs in that tree.

        Paths are always joined with "/", independent of the host OS.
        Raises NotImplementedError for entries that are neither blobs
        nor trees.
        """
        if results is None:
            results = []

        for entry in tree:
            new_path = f"{path}/{entry.name}" if path else entry.name
            if entry.type == "blob":
                results.append(new_path)
            elif entry.type == "tree":
                obj = self._get_child(tree, entry.name)
                self._list_files_at_revision(obj, new_path, results)
            else:
                raise NotImplementedError(
                    f"object at '{new_path}' of type '{entry.type}' not supported"
                )
        return results

    def list_files_at_revision(self, commit, path):
        """
        for the given commit string,
        return a list of all file paths that are
        descendents of the path string, relative to that path.
        Raises GitError if the path does not exist in the commit.
        """
        obj = self.repo.get(commit)
        root = self._get_object_at_path(obj, path)
        if root is None:
            raise GitError(
                self.encapsulating_repo,
                message=f"fatal: Path '{path}' does not exist in '{commit}'",
            )
        return self._list_files_at_revision(root)
2 changes: 2 additions & 0 deletions taf/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ def _copy_repos(test_dir_path, test_name):
repo_rel_path = Path(root).relative_to(test_dir_path)
dst_path = TEST_DATA_ORIGIN_PATH / test_name / repo_rel_path
# convert dst_path to string in order to support python 3.5
shutil.rmtree(dst_path, ignore_errors=True)
shutil.copytree(root, str(dst_path))
(dst_path / "git").rename(dst_path / ".git")
repo_rel_path = Path(repo_rel_path).as_posix()
Expand All @@ -101,6 +102,7 @@ def _load_key(keystore_path, key_name, scheme):

@fixture(scope="session", autouse=True)
def output_path():
shutil.rmtree(TEST_OUTPUT_PATH, ignore_errors=True)
TEST_OUTPUT_PATH.mkdir()
yield TEST_OUTPUT_PATH
shutil.rmtree(TEST_OUTPUT_PATH, onerror=on_rm_error)
Expand Down
1 change: 1 addition & 0 deletions taf/updater/handlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,6 +258,7 @@ def cleanup(self):
shutil.rmtree(self.current_path)
if self.previous_path.is_dir():
shutil.rmtree(self.previous_path)
self.validation_auth_repo.cleanup()
temp_dir = Path(self.validation_auth_repo.path, os.pardir).parent
shutil.rmtree(str(temp_dir), onerror=on_rm_error)

Expand Down

0 comments on commit fd99b88

Please sign in to comment.