diff --git a/revision_utils.py b/revision_utils.py index 51e0b15cd..b81b560de 100644 --- a/revision_utils.py +++ b/revision_utils.py @@ -63,7 +63,10 @@ def get_deletion_candidates(ctx, revision_strategy, revisions, initial_upper_tim # List of bucket index with per bucket a list of its revisions within that bucket # [[data_ids0],[data_ids1]] bucket_revisions = [] + non_bucket_revisions = [] + revision_found_in_bucket = False + # Sort revisions by bucket for bucket in buckets: t1 = t2 t2 = t1 - bucket[0] @@ -72,10 +75,16 @@ def get_deletion_candidates(ctx, revision_strategy, revisions, initial_upper_tim for revision in revisions: if revision[1] <= t1 and revision[1] > t2: # Link the bucket and the revision together so its clear which revisions belong into which bucket + revision_found_in_bucket = True revision_list.append(revision[0]) # append data-id # Link the collected data_ids (revision_ids) to the corresponding bucket bucket_revisions.append(revision_list) + # Get revisions that predate all buckets + for revision in revisions: + if revision[1] < t2: + non_bucket_revisions.append(revision[0]) + # Per bucket find the revision candidates for deletion bucket_counter = 0 for rev_list in bucket_revisions: @@ -108,12 +117,24 @@ def get_deletion_candidates(ctx, revision_strategy, revisions, initial_upper_tim bucket_counter += 1 # To keep conciding with strategy list + # If there are revisions in any bucket, remove all revisions before defined buckets. If there are + # no revisions in buckets, remove all revisions before defined buckets except the last one. + if len(non_bucket_revisions) > 1 or (len(non_bucket_revisions) == 1 and revision_found_in_bucket): + nr_to_be_removed = len(non_bucket_revisions) - (0 if revision_found_in_bucket else 1) + count = 0 + while count < nr_to_be_removed: + index = count + (0 if revision_found_in_bucket else 1) + if verbose: + log.write(ctx, 'Scheduling revision <{}> (older than buckets) for removal.'.format(str(index))) + deletion_candidates.append(non_bucket_revisions[index]) + count += 1 + return deletion_candidates def revision_cleanup_prefilter(ctx, revisions_list, revision_strategy_name, verbose): """Filters out revisioned data objects from a list if we can easily determine that they don't meet criteria for being removed, - for example if the number of revisions is less than the minimum bucket size. + for example if the number of revisions is at most one, and the minimum bucket size is at least one. This prefilter is performed in the scan phase. A full check of the remaining versioned data objects will be performed in the processing phase. @@ -139,4 +160,4 @@ def revision_cleanup_prefilter(ctx, revisions_list, revision_strategy_name, verb if verbose: log.write(ctx, "Removing following revisioned data objects in prefiltering for cleanup: " + str([object for object in revisions_list if len(object) <= minimum_bucket_size])) - return [object for object in revisions_list if len(object) > minimum_bucket_size] + return [object for object in revisions_list if len(object) > min(minimum_bucket_size, 1)] diff --git a/unit-tests/test_revisions.py b/unit-tests/test_revisions.py index 9903c260c..5393b2d7c 100644 --- a/unit-tests/test_revisions.py +++ b/unit-tests/test_revisions.py @@ -31,7 +31,10 @@ def test_revision_cleanup_prefilter(self): self.assertEquals(single_output, []) # Does not exceed min. bucket size for strategy B two_input = [[(1, 123, "/foo/bar/baz"), (2, 234, "/foo/bar/baz")]] two_output = revision_cleanup_prefilter(None, two_input, "B", False) - self.assertEquals(two_output, []) # Does not exceed min. bucket size for strategy B + # Does not exceed min. bucket size for strategy B + # But more than 1 revision (so cannot prefilter, because + # revisions could be outside defined buckets) + self.assertEquals(two_output, two_input) three_input = [[(1, 123, "/foo/bar/baz"), (2, 234, "/foo/bar/baz"), (3, 345, "/foo/bar/baz")]] three_output = revision_cleanup_prefilter(None, three_input, "B", False) self.assertEquals(three_output, three_input) # Exceeds min. bucket size for strategy B @@ -104,6 +107,23 @@ def test_revision_deletion_1_before_buckets(self): output = get_deletion_candidates(None, revision_strategy, revisions, 1000000000, False) self.assertEquals(output, []) + def test_revision_deletion_1_bucket_1_before(self): + dummy_time = 1000000000 + revision_strategy = get_revision_strategy("B") + revisions = [(1, dummy_time - 60, "/foo/bar/baz"), + (2, dummy_time - 365 * 24 * 3600, "/foo/bar/baz")] + output = get_deletion_candidates(None, revision_strategy, revisions, 1000000000, False) + self.assertEquals(output, [2]) + + def test_revision_deletion_1_bucket_2_before(self): + dummy_time = 1000000000 + revision_strategy = get_revision_strategy("B") + revisions = [(1, dummy_time - 60, "/foo/bar/baz"), + (2, dummy_time - 365 * 24 * 3600 - 60, "/foo/bar/baz"), + (3, dummy_time - 365 * 24 * 3600 - 90, "/foo/bar/baz")] + output = get_deletion_candidates(None, revision_strategy, revisions, 1000000000, False) + self.assertEquals(output, [2, 3]) + def test_revision_deletion_3_before_buckets(self): dummy_time = 1000000000 revision_strategy = get_revision_strategy("B") @@ -111,4 +131,4 @@ def test_revision_deletion_3_before_buckets(self): (2, dummy_time - 365 * 24 * 3600 - 120, "/foo/bar/baz"), (3, dummy_time - 365 * 24 * 3600 - 180, "/foo/bar/baz")] output = get_deletion_candidates(None, revision_strategy, revisions, 1000000000, False) - self.assertEquals(output, []) + self.assertEquals(output, [2, 3])