commit 4545b700107608ae12d074627db4e14d39f6b1fe from: Stefan Sperling via: Thomas Adam date: Fri Oct 15 19:21:55 2021 UTC use a bloom filter to avoid pointless pack index searches commit - 3333aec6eab7596a83037f6fc7abb9e0c8dee838 commit + 4545b700107608ae12d074627db4e14d39f6b1fe blob - 48637640943824f4455df6330321f83f0ea237ad blob + b68c94d43465ffea4905be005ad51f1b0cd194a8 --- gotweb/Makefile +++ gotweb/Makefile @@ -14,12 +14,13 @@ SRCS = gotweb.c parse.y blame.c commit_graph.c delta. deflate.c object_create.c delta_cache.c gotconfig.c \ diff_main.c diff_atomize_text.c diff_myers.c diff_output.c \ diff_output_plain.c diff_output_unidiff.c \ - diff_output_edscript.c diff_patience.c + diff_output_edscript.c diff_patience.c \ + bloom.c murmurhash2.c MAN = ${PROG}.conf.5 ${PROG}.8 CPPFLAGS += -I${.CURDIR}/../include -I${.CURDIR}/../lib -I${.CURDIR} \ -I${KCGIBASE}/include -LDADD += -L${KCGIBASE}/lib -lkcgihtml -lkcgi -lz +LDADD += -L${KCGIBASE}/lib -lkcgihtml -lkcgi -lz -lm LDSTATIC = ${STATIC} .if ${GOT_RELEASE} != "Yes" blob - /dev/null blob + 05e57b1f7053a73112d4713be722a197307866e2 (mode 644) --- /dev/null +++ lib/bloom.c @@ -0,0 +1,184 @@ +/* + * Copyright (c) 2012-2019, Jyri J. Virkki + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* Obtained from https://github.com/jvirkki/libbloom */ + +/* + * Refer to bloom.h for documentation on the public interfaces. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bloom.h" +#include "murmurhash2.h" + +#define MAKESTRING(n) STRING(n) +#define STRING(n) #n + + +inline static int test_bit_set_bit(unsigned char * buf, + unsigned int x, int set_bit) +{ + unsigned int byte = x >> 3; + unsigned char c = buf[byte]; // expensive memory access + unsigned int mask = 1 << (x % 8); + + if (c & mask) { + return 1; + } else { + if (set_bit) { + buf[byte] = c | mask; + } + return 0; + } +} + + +static int bloom_check_add(struct bloom * bloom, + const void * buffer, int len, int add) +{ + if (bloom->ready == 0) { + printf("bloom at %p not initialized!\n", (void *)bloom); + return -1; + } + + int hits = 0; + register unsigned int a = murmurhash2(buffer, len, 0x9747b28c); + register unsigned int b = murmurhash2(buffer, len, a); + register unsigned int x; + register unsigned int i; + + for (i = 0; i < bloom->hashes; i++) { + x = (a + i*b) % bloom->bits; + if (test_bit_set_bit(bloom->bf, x, add)) { + hits++; + } else if (!add) { + // Don't care about the presence of all the bits. Just our own. + return 0; + } + } + + if (hits == bloom->hashes) { + return 1; // 1 == element already in (or collision) + } + + return 0; +} + + +int bloom_init_size(struct bloom * bloom, int entries, double error, + unsigned int cache_size) +{ + return bloom_init(bloom, entries, error); +} + + +int bloom_init(struct bloom * bloom, int entries, double error) +{ + bloom->ready = 0; + + if (entries < 1000 || error == 0) { + return 1; + } + + bloom->entries = entries; + bloom->error = error; + + double num = log(bloom->error); + double denom = 0.480453013918201; // ln(2)^2 + bloom->bpe = -(num / denom); + + double dentries = (double)entries; + bloom->bits = (int)(dentries * bloom->bpe); + + if (bloom->bits % 8) { + bloom->bytes = (bloom->bits / 8) + 1; + } else { + bloom->bytes = bloom->bits / 8; + } + + bloom->hashes = (int)ceil(0.693147180559945 * bloom->bpe); // ln(2) + + bloom->bf = (unsigned char *)calloc(bloom->bytes, sizeof(unsigned char)); + if (bloom->bf == NULL) { // LCOV_EXCL_START + return 1; + } // LCOV_EXCL_STOP + + bloom->ready = 1; + return 0; +} + + +int bloom_check(struct bloom * bloom, const void * buffer, int len) +{ + return bloom_check_add(bloom, buffer, len, 0); +} + + +int bloom_add(struct bloom * bloom, const void * buffer, int len) +{ + return bloom_check_add(bloom, buffer, len, 1); +} + + +void bloom_print(struct bloom * bloom) +{ + printf("bloom at %p\n", (void *)bloom); + printf(" ->entries = %d\n", bloom->entries); + printf(" ->error = %f\n", bloom->error); + printf(" ->bits = %d\n", bloom->bits); + printf(" ->bits per elem = %f\n", bloom->bpe); + printf(" ->bytes = %d\n", bloom->bytes); + printf(" ->hash functions = %d\n", bloom->hashes); +} + + +void bloom_free(struct bloom * bloom) +{ + if (bloom->ready) { + free(bloom->bf); + } + bloom->ready = 0; +} + + +int bloom_reset(struct bloom * bloom) +{ + if (!bloom->ready) return 1; + memset(bloom->bf, 0, bloom->bytes); + return 0; +} blob - /dev/null blob + 28356e3721b9f7902763322750d5a1b87640cd4d (mode 644) --- /dev/null +++ lib/bloom.h @@ -0,0 +1,187 @@ +/* + * Copyright (c) 2012-2017, Jyri J. Virkki + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* Obtained from https://github.com/jvirkki/libbloom */ + +#ifndef _BLOOM_H +#define _BLOOM_H + +#ifdef __cplusplus +extern "C" { +#endif + + +/** *************************************************************************** + * Structure to keep track of one bloom filter. Caller needs to + * allocate this and pass it to the functions below. First call for + * every struct must be to bloom_init(). + * + */ +struct bloom +{ + // These fields are part of the public interface of this structure. + // Client code may read these values if desired. Client code MUST NOT + // modify any of these. + int entries; + double error; + int bits; + int bytes; + int hashes; + + // Fields below are private to the implementation. These may go away or + // change incompatibly at any moment. Client code MUST NOT access or rely + // on these. + double bpe; + unsigned char * bf; + int ready; +}; + + +/** *************************************************************************** + * Initialize the bloom filter for use. + * + * The filter is initialized with a bit field and number of hash functions + * according to the computations from the wikipedia entry: + * http://en.wikipedia.org/wiki/Bloom_filter + * + * Optimal number of bits is: + * bits = (entries * ln(error)) / ln(2)^2 + * + * Optimal number of hash functions is: + * hashes = bpe * ln(2) + * + * Parameters: + * ----------- + * bloom - Pointer to an allocated struct bloom (see above). + * entries - The expected number of entries which will be inserted. + * Must be at least 1000 (in practice, likely much larger). + * error - Probability of collision (as long as entries are not + * exceeded). + * + * Return: + * ------- + * 0 - on success + * 1 - on failure + * + */ +int bloom_init(struct bloom * bloom, int entries, double error); + + +/** *************************************************************************** + * Deprecated, use bloom_init() + * + */ +int bloom_init_size(struct bloom * bloom, int entries, double error, + unsigned int cache_size); + + +/** *************************************************************************** + * Check if the given element is in the bloom filter. Remember this may + * return false positive if a collision occurred. + * + * Parameters: + * ----------- + * bloom - Pointer to an allocated struct bloom (see above). + * buffer - Pointer to buffer containing element to check. + * len - Size of 'buffer'. + * + * Return: + * ------- + * 0 - element is not present + * 1 - element is present (or false positive due to collision) + * -1 - bloom not initialized + * + */ +int bloom_check(struct bloom * bloom, const void * buffer, int len); + + +/** *************************************************************************** + * Add the given element to the bloom filter. + * The return code indicates if the element (or a collision) was already in, + * so for the common check+add use case, no need to call check separately. + * + * Parameters: + * ----------- + * bloom - Pointer to an allocated struct bloom (see above). + * buffer - Pointer to buffer containing element to add. + * len - Size of 'buffer'. + * + * Return: + * ------- + * 0 - element was not present and was added + * 1 - element (or a collision) had already been added previously + * -1 - bloom not initialized + * + */ +int bloom_add(struct bloom * bloom, const void * buffer, int len); + + +/** *************************************************************************** + * Print (to stdout) info about this bloom filter. Debugging aid. + * + */ +void bloom_print(struct bloom * bloom); + + +/** *************************************************************************** + * Deallocate internal storage. + * + * Upon return, the bloom struct is no longer usable. You may call bloom_init + * again on the same struct to reinitialize it again. + * + * Parameters: + * ----------- + * bloom - Pointer to an allocated struct bloom (see above). + * + * Return: none + * + */ +void bloom_free(struct bloom * bloom); + +/** *************************************************************************** + * Erase internal storage. + * + * Erases all elements. Upon return, the bloom struct returns to its initial + * (initialized) state. + * + * Parameters: + * ----------- + * bloom - Pointer to an allocated struct bloom (see above). + * + * Return: + * 0 - on success + * 1 - on failure + * + */ +int bloom_reset(struct bloom * bloom); + +#ifdef __cplusplus +} +#endif + +#endif blob - 743935e3d21d9086783c3a8645b212424250520e blob + d12c1d0d2ac9ef2b17d24062220b1f89c1f7ae78 --- lib/got_lib_repository.h +++ lib/got_lib_repository.h @@ -30,6 +30,15 @@ #define GOT_PACK_CACHE_SIZE 64 +struct got_packidx_bloom_filter { + char path_packidx[PATH_MAX]; /* on-disk path */ + size_t path_packidx_len; + struct bloom *bloom; + STAILQ_ENTRY(got_packidx_bloom_filter) entry; +}; + +STAILQ_HEAD(got_packidx_bloom_filter_head, got_packidx_bloom_filter); + struct got_repository { char *path; char *path_git_dir; @@ -38,6 +47,13 @@ struct got_repository { /* The pack index cache speeds up search for packed objects. */ struct got_packidx *packidx_cache[GOT_PACK_CACHE_SIZE]; + /* + * List of bloom filters for pack index files. + * Used to avoid opening a pack index in search of an + * object ID which is not contained in this pack index. + */ + struct got_packidx_bloom_filter_head packidx_bloom_filters; + /* Open file handles for pack files. */ struct got_pack packs[GOT_PACK_CACHE_SIZE]; blob - /dev/null blob + ac869f866068f2b1050c52779f2539890a8c877f (mode 644) --- /dev/null +++ lib/murmurhash2.c @@ -0,0 +1,61 @@ +//----------------------------------------------------------------------------- +// MurmurHash2 was written by Austin Appleby, and is placed in the public +// domain. The author hereby disclaims copyright to this source code. + +/* Obtained from https://github.com/aappleby/smhasher */ + +#include + +#include "murmurhash2.h" + +uint32_t +murmurhash2(const void * key, int len, uint32_t seed) +{ + // 'm' and 'r' are mixing constants generated offline. + // They're not really 'magic', they just happen to work well. + + const uint32_t m = 0x5bd1e995; + const int r = 24; + + // Initialize the hash to a 'random' value + + uint32_t h = seed ^ len; + + // Mix 4 bytes at a time into the hash + + const unsigned char *data = (const unsigned char *)key; + + while(len >= 4) + { + uint32_t k = *(uint32_t*)data; + + k *= m; + k ^= k >> r; + k *= m; + + h *= m; + h ^= k; + + data += 4; + len -= 4; + } + + // Handle the last few bytes of the input array + + switch(len) + { + case 3: h ^= data[2] << 16; + case 2: h ^= data[1] << 8; + case 1: h ^= data[0]; + h *= m; + }; + + // Do a few final mixes of the hash to ensure the last few + // bytes are well-incorporated. + + h ^= h >> 13; + h *= m; + h ^= h >> 15; + + return h; +} blob - /dev/null blob + bbd2bf3ef0309efebdfbb79dd75aa100d4fb0b74 (mode 644) --- /dev/null +++ lib/murmurhash2.h @@ -0,0 +1,7 @@ +//----------------------------------------------------------------------------- +// MurmurHash2 was written by Austin Appleby, and is placed in the public +// domain. The author hereby disclaims copyright to this source code. + +/* Obtained from https://github.com/aappleby/smhasher */ + +uint32_t murmurhash2(const void *key, int len, uint32_t seed); blob - a963c0de6158cfedb20185a2af26c5cb109e7cad blob + 38d61ae41b5d5e94177f872aefb961dbaaf3a579 --- lib/repository.c +++ lib/repository.c @@ -37,7 +37,7 @@ #include #include -#include "got_compat.h" +#include "bloom.h" #include "got_error.h" #include "got_reference.h" @@ -632,6 +632,8 @@ got_repo_open(struct got_repository **repop, const cha err = got_error_from_errno("calloc"); goto done; } + + STAILQ_INIT(&repo->packidx_bloom_filters); for (i = 0; i < nitems(repo->privsep_children); i++) { memset(&repo->privsep_children[i], 0, @@ -727,6 +729,14 @@ got_repo_close(struct got_repository *repo) if (repo->packidx_cache[i] == NULL) break; got_packidx_close(repo->packidx_cache[i]); + } + + while (!STAILQ_EMPTY(&repo->packidx_bloom_filters)) { + struct got_packidx_bloom_filter *bf; + bf = STAILQ_FIRST(&repo->packidx_bloom_filters); + STAILQ_REMOVE_HEAD(&repo->packidx_bloom_filters, entry); + free(bf->bloom); + free(bf); } for (i = 0; i < repo->pack_cache_size; i++) { @@ -949,8 +959,82 @@ got_repo_is_packidx_filename(const char *name, size_t if (strcmp(name + strlen(GOT_PACK_PREFIX) + SHA1_DIGEST_STRING_LENGTH - 1, GOT_PACKIDX_SUFFIX) != 0) return 0; + + return 1; +} + +static int +check_packidx_bloom_filter(struct got_repository *repo, + const char *path_packidx, struct got_object_id *id) +{ + struct got_packidx_bloom_filter *bf; + + STAILQ_FOREACH(bf, &repo->packidx_bloom_filters, entry) { + if (got_path_cmp(bf->path_packidx, path_packidx, + bf->path_packidx_len, strlen(path_packidx)) == 0) { + return bloom_check(bf->bloom, id->sha1, + sizeof(id->sha1)); + } + } + /* No bloom filter means this pack index must be searched. */ return 1; +} + +static const struct got_error * +add_packidx_bloom_filter(struct got_repository *repo, + struct got_packidx *packidx, const char *path_packidx) +{ + int i, nobjects = be32toh(packidx->hdr.fanout_table[0xff]); + struct got_packidx_bloom_filter *bf; + size_t len; + + /* + * Don't use bloom filters for very large pack index files. + * Large pack files will contain a relatively large fraction + * of our objects so we will likely need to visit them anyway. + * The more objects a pack file contains the higher the probability + * of a false-positive match from the bloom filter. And reading + * all object IDs from a large pack index file can be expensive. + */ + if (nobjects > 100000) /* cut-off at about 2MB, at 20 bytes per ID */ + return NULL; + + /* Do we already have a filter for this pack index? */ + STAILQ_FOREACH(bf, &repo->packidx_bloom_filters, entry) { + if (got_path_cmp(bf->path_packidx, path_packidx, + bf->path_packidx_len, strlen(path_packidx)) == 0) + return NULL; + } + + bf = calloc(1, sizeof(*bf)); + if (bf == NULL) + return got_error_from_errno("calloc"); + bf->bloom = calloc(1, sizeof(*bf->bloom)); + if (bf->bloom == NULL) { + free(bf); + return got_error_from_errno("calloc"); + } + + + len = strlcpy(bf->path_packidx, path_packidx, sizeof(bf->path_packidx)); + if (len >= sizeof(bf->path_packidx)) { + free(bf->bloom); + free(bf); + return got_error(GOT_ERR_NO_SPACE); + } + bf->path_packidx_len = len; + + /* Minimum size supported by our bloom filter is 1000 entries. */ + bloom_init(bf->bloom, nobjects < 1000 ? 1000 : nobjects, 0.1); + for (i = 0; i < nobjects; i++) { + struct got_packidx_object_id *id; + id = &packidx->hdr.sorted_ids[i]; + bloom_add(bf->bloom, id->sha1, sizeof(id->sha1)); + } + + STAILQ_INSERT_TAIL(&repo->packidx_bloom_filters, bf, entry); + return NULL; } const struct got_error * @@ -968,6 +1052,9 @@ got_repo_search_packidx(struct got_packidx **packidx, for (i = 0; i < repo->pack_cache_size; i++) { if (repo->packidx_cache[i] == NULL) break; + if (!check_packidx_bloom_filter(repo, + repo->packidx_cache[i]->path_packidx, id)) + continue; /* object will not be found in this index */ *idx = got_packidx_get_object_idx(repo->packidx_cache[i], id); if (*idx != -1) { *packidx = repo->packidx_cache[i]; @@ -1017,6 +1104,11 @@ got_repo_search_packidx(struct got_packidx **packidx, goto done; } + if (!check_packidx_bloom_filter(repo, path_packidx, id)) { + free(path_packidx); + continue; /* object will not be found in this index */ + } + for (i = 0; i < repo->pack_cache_size; i++) { if (repo->packidx_cache[i] == NULL) break; @@ -1038,6 +1130,12 @@ got_repo_search_packidx(struct got_packidx **packidx, goto done; } + err = add_packidx_bloom_filter(repo, *packidx, path_packidx); + if (err) { + free(path_packidx); + goto done; + } + err = cache_packidx(repo, *packidx, path_packidx); free(path_packidx); if (err) blob - 10c22ece9a96f897b3b926737aa5bf217b1df8f2 blob + 0215869fd1a3678fe92c416a609faf3e875f0a34 --- regress/fetch/Makefile +++ regress/fetch/Makefile @@ -4,10 +4,10 @@ PROG = fetch_test SRCS = error.c privsep.c reference.c sha1.c object.c object_parse.c path.c \ opentemp.c repository.c lockfile.c object_cache.c pack.c inflate.c \ deflate.c delta.c delta_cache.c object_idset.c object_create.c \ - fetch.c gotconfig.c dial.c fetch_test.c + fetch.c gotconfig.c dial.c fetch_test.c bloom.c murmurhash2.c CPPFLAGS = -I${.CURDIR}/../../include -I${.CURDIR}/../../lib -LDADD = -lutil -lz +LDADD = -lutil -lz -lm NOMAN = yes