Skip to content

Commit

Permalink
YDA-5516: clean up old pre-bucket revisions
Browse files Browse the repository at this point in the history
In the revision cleanup job, remove all revisions before
the defined buckets, but ensure the last revision of a versioned
data object is not removed, even if it is not in a defined bucket.

This ensures that a user can always restore a data object to an earlier
version, but that we don't keep (many) more revisions than specified
by the revision strategy.
  • Loading branch information
stsnel committed Nov 3, 2023
1 parent 81ffef8 commit 487cfce
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 4 deletions.
25 changes: 23 additions & 2 deletions revision_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,10 @@ def get_deletion_candidates(ctx, revision_strategy, revisions, initial_upper_tim
# List of bucket index with per bucket a list of its revisions within that bucket
# [[data_ids0],[data_ids1]]
bucket_revisions = []
non_bucket_revisions = []
revision_found_in_bucket = False

# Sort revisions by bucket
for bucket in buckets:
t1 = t2
t2 = t1 - bucket[0]
Expand All @@ -72,10 +75,16 @@ def get_deletion_candidates(ctx, revision_strategy, revisions, initial_upper_tim
for revision in revisions:
if revision[1] <= t1 and revision[1] > t2:
# Link the bucket and the revision together so it's clear which revisions belong in which bucket
revision_found_in_bucket = True
revision_list.append(revision[0]) # append data-id
# Link the collected data_ids (revision_ids) to the corresponding bucket
bucket_revisions.append(revision_list)

# Get revisions that predate all buckets
for revision in revisions:
if revision[1] < t2:
non_bucket_revisions.append(revision[0])

# Per bucket find the revision candidates for deletion
bucket_counter = 0
for rev_list in bucket_revisions:
Expand Down Expand Up @@ -108,12 +117,24 @@ def get_deletion_candidates(ctx, revision_strategy, revisions, initial_upper_tim

bucket_counter += 1 # To keep coinciding with strategy list

# If there are revisions in any bucket, remove all revisions before defined buckets. If there are
# no revisions in buckets, remove all revisions before defined buckets except the last one.
if len(non_bucket_revisions) > 1 or (len(non_bucket_revisions) == 1 and revision_found_in_bucket):
nr_to_be_removed = len(non_bucket_revisions) - (0 if revision_found_in_bucket else 1)
count = 0
while count < nr_to_be_removed:
index = count + (0 if revision_found_in_bucket else 1)
if verbose:
log.write(ctx, 'Scheduling revision <{}> (older than buckets) for removal.'.format(str(index)))
deletion_candidates.append(non_bucket_revisions[index])
count += 1

return deletion_candidates


def revision_cleanup_prefilter(ctx, revisions_list, revision_strategy_name, verbose):
"""Filters out revisioned data objects from a list if we can easily determine that they don't meet criteria for being removed,
for example if the number of revisions is less than the minimum bucket size.
for example if the number of revisions is at most one, and the minimum bucket size is at least one.
This prefilter is performed in the scan phase. A full check of the remaining versioned data objects will be performed in the
processing phase.
Expand All @@ -139,4 +160,4 @@ def revision_cleanup_prefilter(ctx, revisions_list, revision_strategy_name, verb
if verbose:
log.write(ctx, "Removing following revisioned data objects in prefiltering for cleanup: "
+ str([object for object in revisions_list if len(object) <= minimum_bucket_size]))
return [object for object in revisions_list if len(object) > minimum_bucket_size]
return [object for object in revisions_list if len(object) > min(minimum_bucket_size, 1)]
24 changes: 22 additions & 2 deletions unit-tests/test_revisions.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,10 @@ def test_revision_cleanup_prefilter(self):
self.assertEquals(single_output, []) # Does not exceed min. bucket size for strategy B
two_input = [[(1, 123, "/foo/bar/baz"), (2, 234, "/foo/bar/baz")]]
two_output = revision_cleanup_prefilter(None, two_input, "B", False)
self.assertEquals(two_output, []) # Does not exceed min. bucket size for strategy B
# Does not exceed min. bucket size for strategy B
# But more than 1 revision (so cannot prefilter, because
# revisions could be outside defined buckets)
self.assertEquals(two_output, two_input)
three_input = [[(1, 123, "/foo/bar/baz"), (2, 234, "/foo/bar/baz"), (3, 345, "/foo/bar/baz")]]
three_output = revision_cleanup_prefilter(None, three_input, "B", False)
self.assertEquals(three_output, three_input) # Exceeds min. bucket size for strategy B
Expand Down Expand Up @@ -104,11 +107,28 @@ def test_revision_deletion_1_before_buckets(self):
output = get_deletion_candidates(None, revision_strategy, revisions, 1000000000, False)
self.assertEquals(output, [])

def test_revision_deletion_1_bucket_1_before(self):
dummy_time = 1000000000
revision_strategy = get_revision_strategy("B")
revisions = [(1, dummy_time - 60, "/foo/bar/baz"),
(2, dummy_time - 365 * 24 * 3600, "/foo/bar/baz")]
output = get_deletion_candidates(None, revision_strategy, revisions, 1000000000, False)
self.assertEquals(output, [2])

def test_revision_deletion_1_bucket_2_before(self):
dummy_time = 1000000000
revision_strategy = get_revision_strategy("B")
revisions = [(1, dummy_time - 60, "/foo/bar/baz"),
(2, dummy_time - 365 * 24 * 3600 - 60, "/foo/bar/baz"),
(3, dummy_time - 365 * 24 * 3600 - 90, "/foo/bar/baz")]
output = get_deletion_candidates(None, revision_strategy, revisions, 1000000000, False)
self.assertEquals(output, [2, 3])

def test_revision_deletion_3_before_buckets(self):
dummy_time = 1000000000
revision_strategy = get_revision_strategy("B")
revisions = [(1, dummy_time - 365 * 24 * 3600 - 60, "/foo/bar/baz"),
(2, dummy_time - 365 * 24 * 3600 - 120, "/foo/bar/baz"),
(3, dummy_time - 365 * 24 * 3600 - 180, "/foo/bar/baz")]
output = get_deletion_candidates(None, revision_strategy, revisions, 1000000000, False)
self.assertEquals(output, [])
self.assertEquals(output, [2, 3])

0 comments on commit 487cfce

Please sign in to comment.