Skip to content

Commit

Permalink
Merge branch 'tb/path-filter-fix'
Browse files Browse the repository at this point in the history
The Bloom filter used for path limited history traversal was broken
on systems whose "char" is unsigned; update the implementation and
bump the format version to 2.

* tb/path-filter-fix:
  bloom: introduce `deinit_bloom_filters()`
  commit-graph: reuse existing Bloom filters where possible
  object.h: fix mis-aligned flag bits table
  commit-graph: new Bloom filter version that fixes murmur3
  commit-graph: unconditionally load Bloom filters
  bloom: prepare to discard incompatible Bloom filters
  bloom: annotate filters with hash version
  repo-settings: introduce commitgraph.changedPathsVersion
  t4216: test changed path filters with high bit paths
  t/helper/test-read-graph: implement `bloom-filters` mode
  bloom.h: make `load_bloom_filter_from_graph()` public
  t/helper/test-read-graph.c: extract `dump_graph_info()`
  gitformat-commit-graph: describe version 2 of BDAT
  commit-graph: ensure Bloom filters are read with consistent settings
  revision.c: consult Bloom filters for root commits
  t/t4216-log-bloom.sh: harden `test_bloom_filters_not_used()`
  • Loading branch information
gitster committed Jul 8, 2024
2 parents 6f75d23 + 9c8a9ec commit ecf7fc6
Show file tree
Hide file tree
Showing 14 changed files with 736 additions and 58 deletions.
29 changes: 26 additions & 3 deletions Documentation/config/commitgraph.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,29 @@ commitGraph.maxNewFilters::
commit-graph write` (c.f., linkgit:git-commit-graph[1]).

commitGraph.readChangedPaths::
If true, then git will use the changed-path Bloom filters in the
commit-graph file (if it exists, and they are present). Defaults to
true. See linkgit:git-commit-graph[1] for more information.
Deprecated. Equivalent to commitGraph.changedPathsVersion=-1 if true, and
commitGraph.changedPathsVersion=0 if false. (If commitGraph.changedPathVersion
is also set, commitGraph.changedPathsVersion takes precedence.)

commitGraph.changedPathsVersion::
Specifies the version of the changed-path Bloom filters that Git will read and
write. May be -1, 0, 1, or 2. Note that values greater than 1 may be
incompatible with older versions of Git which do not yet understand
those versions. Use caution when operating in a mixed-version
environment.
+
Defaults to -1.
+
If -1, Git will use the version of the changed-path Bloom filters in the
repository, defaulting to 1 if there are none.
+
If 0, Git will not read any Bloom filters, and will write version 1 Bloom
filters when instructed to write.
+
If 1, Git will only read version 1 Bloom filters, and will write version 1
Bloom filters.
+
If 2, Git will only read version 2 Bloom filters, and will write version 2
Bloom filters.
+
See linkgit:git-commit-graph[1] for more information.
9 changes: 6 additions & 3 deletions Documentation/gitformat-commit-graph.txt
Original file line number Diff line number Diff line change
Expand Up @@ -142,13 +142,16 @@ All multi-byte numbers are in network byte order.

==== Bloom Filter Data (ID: {'B', 'D', 'A', 'T'}) [Optional]
* It starts with header consisting of three unsigned 32-bit integers:
- Version of the hash algorithm being used. We currently only support
value 1 which corresponds to the 32-bit version of the murmur3 hash
- Version of the hash algorithm being used. We currently support
value 2 which corresponds to the 32-bit version of the murmur3 hash
implemented exactly as described in
https://en.wikipedia.org/wiki/MurmurHash#Algorithm and the double
hashing technique using seed values 0x293ae76f and 0x7e646e2 as
described in https://doi.org/10.1007/978-3-540-30494-4_26 "Bloom Filters
in Probabilistic Verification"
in Probabilistic Verification". Version 1 Bloom filters have a bug that appears
when char is signed and the repository has path names that have characters >=
0x80; Git supports reading and writing them, but this ability will be removed
in a future version of Git.
- The number of times a path is hashed and hence the number of bit positions
that cumulatively determine whether a file is present in the commit.
- The minimum number of bits 'b' per entry in the Bloom filter. If the filter
Expand Down
208 changes: 196 additions & 12 deletions bloom.c
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@
#include "commit-graph.h"
#include "commit.h"
#include "commit-slab.h"
#include "tree.h"
#include "tree-walk.h"
#include "config.h"
#include "repository.h"

define_commit_slab(bloom_filter_slab, struct bloom_filter);
Expand Down Expand Up @@ -49,9 +52,9 @@ static int check_bloom_offset(struct commit_graph *g, uint32_t pos,
return -1;
}

static int load_bloom_filter_from_graph(struct commit_graph *g,
struct bloom_filter *filter,
uint32_t graph_pos)
int load_bloom_filter_from_graph(struct commit_graph *g,
struct bloom_filter *filter,
uint32_t graph_pos)
{
uint32_t lex_pos, start_index, end_index;

Expand Down Expand Up @@ -89,6 +92,8 @@ static int load_bloom_filter_from_graph(struct commit_graph *g,
filter->data = (unsigned char *)(g->chunk_bloom_data +
sizeof(unsigned char) * start_index +
BLOOMDATA_CHUNK_HEADER_SIZE);
filter->version = g->bloom_filter_settings->hash_version;
filter->to_free = NULL;

return 1;
}
Expand All @@ -100,7 +105,64 @@ static int load_bloom_filter_from_graph(struct commit_graph *g,
* Not considered to be cryptographically secure.
* Implemented as described in https://en.wikipedia.org/wiki/MurmurHash#Algorithm
*/
uint32_t murmur3_seeded(uint32_t seed, const char *data, size_t len)
uint32_t murmur3_seeded_v2(uint32_t seed, const char *data, size_t len)
{
const uint32_t c1 = 0xcc9e2d51;
const uint32_t c2 = 0x1b873593;
const uint32_t r1 = 15;
const uint32_t r2 = 13;
const uint32_t m = 5;
const uint32_t n = 0xe6546b64;
int i;
uint32_t k1 = 0;
const char *tail;

int len4 = len / sizeof(uint32_t);

uint32_t k;
for (i = 0; i < len4; i++) {
uint32_t byte1 = (uint32_t)(unsigned char)data[4*i];
uint32_t byte2 = ((uint32_t)(unsigned char)data[4*i + 1]) << 8;
uint32_t byte3 = ((uint32_t)(unsigned char)data[4*i + 2]) << 16;
uint32_t byte4 = ((uint32_t)(unsigned char)data[4*i + 3]) << 24;
k = byte1 | byte2 | byte3 | byte4;
k *= c1;
k = rotate_left(k, r1);
k *= c2;

seed ^= k;
seed = rotate_left(seed, r2) * m + n;
}

tail = (data + len4 * sizeof(uint32_t));

switch (len & (sizeof(uint32_t) - 1)) {
case 3:
k1 ^= ((uint32_t)(unsigned char)tail[2]) << 16;
/*-fallthrough*/
case 2:
k1 ^= ((uint32_t)(unsigned char)tail[1]) << 8;
/*-fallthrough*/
case 1:
k1 ^= ((uint32_t)(unsigned char)tail[0]) << 0;
k1 *= c1;
k1 = rotate_left(k1, r1);
k1 *= c2;
seed ^= k1;
break;
}

seed ^= (uint32_t)len;
seed ^= (seed >> 16);
seed *= 0x85ebca6b;
seed ^= (seed >> 13);
seed *= 0xc2b2ae35;
seed ^= (seed >> 16);

return seed;
}

static uint32_t murmur3_seeded_v1(uint32_t seed, const char *data, size_t len)
{
const uint32_t c1 = 0xcc9e2d51;
const uint32_t c2 = 0x1b873593;
Expand Down Expand Up @@ -165,8 +227,14 @@ void fill_bloom_key(const char *data,
int i;
const uint32_t seed0 = 0x293ae76f;
const uint32_t seed1 = 0x7e646e2c;
const uint32_t hash0 = murmur3_seeded(seed0, data, len);
const uint32_t hash1 = murmur3_seeded(seed1, data, len);
uint32_t hash0, hash1;
if (settings->hash_version == 2) {
hash0 = murmur3_seeded_v2(seed0, data, len);
hash1 = murmur3_seeded_v2(seed1, data, len);
} else {
hash0 = murmur3_seeded_v1(seed0, data, len);
hash1 = murmur3_seeded_v1(seed1, data, len);
}

key->hashes = (uint32_t *)xcalloc(settings->num_hashes, sizeof(uint32_t));
for (i = 0; i < settings->num_hashes; i++)
Expand Down Expand Up @@ -198,6 +266,18 @@ void init_bloom_filters(void)
init_bloom_filter_slab(&bloom_filters);
}

static void free_one_bloom_filter(struct bloom_filter *filter)
{
if (!filter)
return;
free(filter->to_free);
}

void deinit_bloom_filters(void)
{
deep_clear_bloom_filter_slab(&bloom_filters, free_one_bloom_filter);
}

static int pathmap_cmp(const void *hashmap_cmp_fn_data UNUSED,
const struct hashmap_entry *eptr,
const struct hashmap_entry *entry_or_key,
Expand All @@ -211,11 +291,97 @@ static int pathmap_cmp(const void *hashmap_cmp_fn_data UNUSED,
return strcmp(e1->path, e2->path);
}

static void init_truncated_large_filter(struct bloom_filter *filter)
static void init_truncated_large_filter(struct bloom_filter *filter,
int version)
{
filter->data = xmalloc(1);
filter->data = filter->to_free = xmalloc(1);
filter->data[0] = 0xFF;
filter->len = 1;
filter->version = version;
}

#define VISITED (1u<<21)
#define HIGH_BITS (1u<<22)

static int has_entries_with_high_bit(struct repository *r, struct tree *t)
{
if (parse_tree(t))
return 1;

if (!(t->object.flags & VISITED)) {
struct tree_desc desc;
struct name_entry entry;

init_tree_desc(&desc, &t->object.oid, t->buffer, t->size);
while (tree_entry(&desc, &entry)) {
size_t i;
for (i = 0; i < entry.pathlen; i++) {
if (entry.path[i] & 0x80) {
t->object.flags |= HIGH_BITS;
goto done;
}
}

if (S_ISDIR(entry.mode)) {
struct tree *sub = lookup_tree(r, &entry.oid);
if (sub && has_entries_with_high_bit(r, sub)) {
t->object.flags |= HIGH_BITS;
goto done;
}
}

}

done:
t->object.flags |= VISITED;
}

return !!(t->object.flags & HIGH_BITS);
}

static int commit_tree_has_high_bit_paths(struct repository *r,
struct commit *c)
{
struct tree *t;
if (repo_parse_commit(r, c))
return 1;
t = repo_get_commit_tree(r, c);
if (!t)
return 1;
return has_entries_with_high_bit(r, t);
}

static struct bloom_filter *upgrade_filter(struct repository *r, struct commit *c,
struct bloom_filter *filter,
int hash_version)
{
struct commit_list *p = c->parents;
if (commit_tree_has_high_bit_paths(r, c))
return NULL;

if (p && commit_tree_has_high_bit_paths(r, p->item))
return NULL;

filter->version = hash_version;

return filter;
}

struct bloom_filter *get_bloom_filter(struct repository *r, struct commit *c)
{
struct bloom_filter *filter;
int hash_version;

filter = get_or_compute_bloom_filter(r, c, 0, NULL, NULL);
if (!filter)
return NULL;

prepare_repo_settings(r);
hash_version = r->settings.commit_graph_changed_paths_version;

if (!(hash_version == -1 || hash_version == filter->version))
return NULL; /* unusable filter */
return filter;
}

struct bloom_filter *get_or_compute_bloom_filter(struct repository *r,
Expand Down Expand Up @@ -243,8 +409,23 @@ struct bloom_filter *get_or_compute_bloom_filter(struct repository *r,
filter, graph_pos);
}

if (filter->data && filter->len)
return filter;
if (filter->data && filter->len) {
struct bloom_filter *upgrade;
if (!settings || settings->hash_version == filter->version)
return filter;

/* version mismatch, see if we can upgrade */
if (compute_if_not_present &&
git_env_bool("GIT_TEST_UPGRADE_BLOOM_FILTERS", 1)) {
upgrade = upgrade_filter(r, c, filter,
settings->hash_version);
if (upgrade) {
if (computed)
*computed |= BLOOM_UPGRADED;
return upgrade;
}
}
}
if (!compute_if_not_present)
return NULL;

Expand Down Expand Up @@ -300,19 +481,22 @@ struct bloom_filter *get_or_compute_bloom_filter(struct repository *r,
}

if (hashmap_get_size(&pathmap) > settings->max_changed_paths) {
init_truncated_large_filter(filter);
init_truncated_large_filter(filter,
settings->hash_version);
if (computed)
*computed |= BLOOM_TRUNC_LARGE;
goto cleanup;
}

filter->len = (hashmap_get_size(&pathmap) * settings->bits_per_entry + BITS_PER_WORD - 1) / BITS_PER_WORD;
filter->version = settings->hash_version;
if (!filter->len) {
if (computed)
*computed |= BLOOM_TRUNC_EMPTY;
filter->len = 1;
}
CALLOC_ARRAY(filter->data, filter->len);
filter->to_free = filter->data;

hashmap_for_each_entry(&pathmap, &iter, e, entry) {
struct bloom_key key;
Expand All @@ -326,7 +510,7 @@ struct bloom_filter *get_or_compute_bloom_filter(struct repository *r,
} else {
for (i = 0; i < diff_queued_diff.nr; i++)
diff_free_filepair(diff_queued_diff.queue[i]);
init_truncated_large_filter(filter);
init_truncated_large_filter(filter, settings->hash_version);

if (computed)
*computed |= BLOOM_TRUNC_LARGE;
Expand Down
Loading

0 comments on commit ecf7fc6

Please sign in to comment.