From 6790ff377ecc73e5b0bdfc440c8d530ab19d27ba Mon Sep 17 00:00:00 2001 From: David Greisen Date: Sat, 5 Feb 2022 17:52:40 -0500 Subject: [PATCH 1/4] test: cleanup before and after each test_updater test Before cleanup only happened after. So if tests fail, must manually cleanup before tests can be run. Now run tests before as well to clean up from any previous failed run. --- taf/tests/conftest.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/taf/tests/conftest.py b/taf/tests/conftest.py index 148300f4..5dbb1d96 100644 --- a/taf/tests/conftest.py +++ b/taf/tests/conftest.py @@ -80,6 +80,7 @@ def _copy_repos(test_dir_path, test_name): repo_rel_path = Path(root).relative_to(test_dir_path) dst_path = TEST_DATA_ORIGIN_PATH / test_name / repo_rel_path # convert dst_path to string in order to support python 3.5 + shutil.rmtree(dst_path, ignore_errors=True) shutil.copytree(root, str(dst_path)) (dst_path / "git").rename(dst_path / ".git") repo_rel_path = Path(repo_rel_path).as_posix() @@ -101,6 +102,7 @@ def _load_key(keystore_path, key_name, scheme): @fixture(scope="session", autouse=True) def output_path(): + shutil.rmtree(TEST_OUTPUT_PATH, ignore_errors=True) TEST_OUTPUT_PATH.mkdir() yield TEST_OUTPUT_PATH shutil.rmtree(TEST_OUTPUT_PATH, onerror=on_rm_error) From af81dc63a0fe1130b21e1c21bcf9dc4db8a081f4 Mon Sep 17 00:00:00 2001 From: David Greisen Date: Thu, 3 Feb 2022 23:18:09 -0500 Subject: [PATCH 2/4] perf: implement get_file in pygit2 Git cli operations are slow on windows. Since git.py is just a wrapper around git cli, it is also very slow. By reimplementing get_file using pygit2 cumtime for that call drops from 524 secs to 3.59 secs. This requires a new dependency on pygit2. pinned to a very old version that is available on our old ubuntu boxes and has built wheels for 3.6. Added a new PyGitRepository that hides the complexity of pygit2. I recall that renata was having a lot of problems with pygit2 being flaky on windows, so we always fall back to the git cli. We must cleanup the pygit2 repo because it keeps file handles open, so can't delete it or anything else. --- setup.py | 1 + taf/git.py | 30 +++++++++++++++++- taf/pygit.py | 69 +++++++++++++++++++++++++++++++++++++++++ taf/updater/handlers.py | 1 + 4 files changed, 100 insertions(+), 1 deletion(-) create mode 100644 taf/pygit.py diff --git a/setup.py b/setup.py index af0d524d..e1b87466 100644 --- a/setup.py +++ b/setup.py @@ -64,6 +64,7 @@ def finalize_options(self): "loguru==0.4.0", "cryptography==3.2.1", "pyOpenSSL==20.0.1", + "pygit2==0.28.2", ], extras_require={ "ci": ci_require, diff --git a/taf/git.py b/taf/git.py index bd0c20ff..902c5520 100644 --- a/taf/git.py +++ b/taf/git.py @@ -10,6 +10,7 @@ import taf.settings as settings from taf.exceptions import ( + TAFError, CloneRepoException, FetchException, InvalidRepositoryError, @@ -17,6 +18,7 @@ ) from taf.log import taf_logger from taf.utils import run +from .pygit import PyGitRepository EMPTY_TREE = "4b825dc642cb6eb9a060e54bf8d69288fbee4904" @@ -84,6 +86,17 @@ def __init__( self.default_branch = default_branch self.custom = custom or {} + _pygit = None + + @property + def pygit(self): + if self._pygit is None: + try: + self._pygit = PyGitRepository(self) + except Exception: + pass + return self._pygit + @classmethod def from_json_dict(cls, json_data): """Create a new instance based on data contained by the `json_data` dictionary, @@ -416,6 +429,11 @@ def checkout_orphan_branch(self, branch_name): def clean(self): self._git("clean -fd") + def cleanup(self): + if self._pygit is not None: + self._pygit.cleanup() + self._pygit = None + def clone(self, no_checkout=False, bare=False, **kwargs): self._log_info("cloning repository") shutil.rmtree(self.path, True) @@ -609,7 +627,17 @@ def get_json(self, commit, path, raw=False): def get_file(self, commit, path, raw=False): path = Path(path).as_posix() - return self._git("show {}:{}", commit, path, raw=raw) + if raw: + return self._git("show {}:{}", commit, path, raw=raw) + try: + out = self.pygit.get_file(commit, path) + # if not out: + # import pdb; pdb.set_trace() + return out + except TAFError as e: + raise e + except Exception: + return self._git("show {}:{}", commit, path, raw=raw) def get_first_commit_on_branch(self, branch=None): branch = branch or self.default_branch diff --git a/taf/pygit.py b/taf/pygit.py new file mode 100644 index 00000000..b5c57172 --- /dev/null +++ b/taf/pygit.py @@ -0,0 +1,69 @@ +import pygit2 +from taf.log import taf_logger as logger +from taf.exceptions import GitError + + +class PyGitRepository: + def __init__( + self, + encapsulating_repo, + *args, + **kwargs, + ): + self.encapsulating_repo = encapsulating_repo + self.path = encapsulating_repo.path + self.repo = pygit2.Repository(str(self.path)) + + def _get_child(self, parent, path_part): + """ + Return the child object of a parent object. + Used for walking a git tree. + """ + try: + out = parent[path_part] + except KeyError: + return None + else: + return self.repo[out.id] + + def _get_blob_at_path(self, obj, path): + """ + for the given commit object, + get the blob at the given path + """ + logger.debug("Get blob at path %s", path) + working = obj.tree + if path.endswith("/"): + path = path[:-1] + path = path.split("/") + for part in path: + working = self._get_child(working, part) + if working is None: + return None + if working and isinstance(working, pygit2.Blob): + logger.debug("Found blob at path %s", "/".join(path)) + return working + logger.debug("Blob not found at path %s", "/".join(path)) + return None + + def cleanup(self): + """ + Must call this function in order to release pygit2 file handles. + """ + self.repo.free() + + def get_file(self, commit, path): + """ + for the given commit string, + return the string contents of the blob at the + given path, if it exists, otherwise raise GitError + """ + obj = self.repo.get(commit) + blob = self._get_blob_at_path(obj, path) + if blob is None: + raise GitError( + self.encapsulating_repo, + message=f"fatal: Path '{path}' does not exist in '{commit}'", + ) + else: + return blob.read_raw().decode() diff --git a/taf/updater/handlers.py b/taf/updater/handlers.py index 38d36b67..6adef794 100644 --- a/taf/updater/handlers.py +++ b/taf/updater/handlers.py @@ -258,6 +258,7 @@ def cleanup(self): shutil.rmtree(self.current_path) if self.previous_path.is_dir(): shutil.rmtree(self.previous_path) + self.validation_auth_repo.cleanup() temp_dir = Path(self.validation_auth_repo.path, os.pardir).parent shutil.rmtree(str(temp_dir), onerror=on_rm_error) From ddba6f40a14a959d6483e0f3b3bbeeb16653bc3c Mon Sep 17 00:00:00 2001 From: David Greisen Date: Sat, 5 Feb 2022 18:00:44 -0500 Subject: [PATCH 3/4] perf: Finish implementing slow git calls with pygit2 Next slowest git command after get_file was list_files_at_revision, so reimplemented it with pygit2. split _get_object_at_path out of _get_blob_at_path, so could get use it to get Tree objs as well as blobs. Then _list_files_at_revision just recurses through the tree starting at path This covers the low hanging git fruit. Bulk of time is now spent in TUF methods. --- taf/git.py | 9 +++++++++ taf/pygit.py | 51 ++++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 57 insertions(+), 3 deletions(-) diff --git a/taf/git.py b/taf/git.py index 902c5520..beac295f 100644 --- a/taf/git.py +++ b/taf/git.py @@ -742,6 +742,15 @@ def is_remote_branch(self, branch_name): return False def list_files_at_revision(self, commit, path=""): + path = Path(path).as_posix() + try: + return self.pygit.list_files_at_revision(commit, path) + except TAFError as e: + raise e + except Exception: + return self._list_files_at_revision(commit, path) + + def _list_files_at_revision(self, commit, path): if path is None: path = "" file_names = self._git("ls-tree -r --name-only {}", commit) diff --git a/taf/pygit.py b/taf/pygit.py index b5c57172..56cefee4 100644 --- a/taf/pygit.py +++ b/taf/pygit.py @@ -1,6 +1,7 @@ import pygit2 from taf.log import taf_logger as logger from taf.exceptions import GitError +import os.path class PyGitRepository: @@ -26,12 +27,11 @@ def _get_child(self, parent, path_part): else: return self.repo[out.id] - def _get_blob_at_path(self, obj, path): + def _get_object_at_path(self, obj, path): """ for the given commit object, - get the blob at the given path + get the object at the given path """ - logger.debug("Get blob at path %s", path) working = obj.tree if path.endswith("/"): path = path[:-1] @@ -40,6 +40,15 @@ def _get_blob_at_path(self, obj, path): working = self._get_child(working, part) if working is None: return None + return working + + def _get_blob_at_path(self, obj, path): + """ + for the given commit object, + get the blob at the given path + """ + logger.debug("Get blob at path %s", path) + working = self._get_object_at_path(obj, path) if working and isinstance(working, pygit2.Blob): logger.debug("Found blob at path %s", "/".join(path)) return working @@ -67,3 +76,39 @@ def get_file(self, commit, path): ) else: return blob.read_raw().decode() + + def _list_files_at_revision(self, tree, path="", results=None): + """ + recurse through tree and return paths relative to that tree for + all blobs in that tree. + """ + if results is None: + results = [] + + for entry in tree: + new_path = os.path.join(path, entry.name) + if entry.type == "blob": + results.append(new_path) + elif entry.type == "tree": + obj = self._get_child(tree, entry.name) + self._list_files_at_revision(obj, new_path, results) + else: + raise NotImplementedError( + f"object at '{new_path}' of type '{entry.name}' not supported" + ) + return results + + def list_files_at_revision(self, commit, path): + """ + for the given commit string, + return a list of all file paths that are + descendents of the path string. + """ + obj = self.repo.get(commit) + root = self._get_object_at_path(obj, path) + if root is None: + raise GitError( + self.encapsulating_repo, + message=f"fatal: Path '{path}' does not exist in '{commit}'", + ) + return self._list_files_at_revision(root) From 9fae842f6ec457f570b94ce0f733ba7017d5fd8a Mon Sep 17 00:00:00 2001 From: David Greisen Date: Fri, 4 Feb 2022 21:29:06 -0500 Subject: [PATCH 4/4] update changelog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 37b30fcf..89b43c44 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,10 +13,12 @@ and this project adheres to [Semantic Versioning][semver]. ### Changed +- perf: re-implementing slow git cmds with pygit2 ([207]) ### Fixed +[207]: https://github.com/openlawlibrary/taf/pull/207x ## [0.14.0] - 01/25/2022