commit - 3333aec6eab7596a83037f6fc7abb9e0c8dee838
commit + 4545b700107608ae12d074627db4e14d39f6b1fe
blob - 48637640943824f4455df6330321f83f0ea237ad
blob + b68c94d43465ffea4905be005ad51f1b0cd194a8
--- gotweb/Makefile
+++ gotweb/Makefile
deflate.c object_create.c delta_cache.c gotconfig.c \
diff_main.c diff_atomize_text.c diff_myers.c diff_output.c \
diff_output_plain.c diff_output_unidiff.c \
- diff_output_edscript.c diff_patience.c
+ diff_output_edscript.c diff_patience.c \
+ bloom.c murmurhash2.c
MAN = ${PROG}.conf.5 ${PROG}.8
CPPFLAGS += -I${.CURDIR}/../include -I${.CURDIR}/../lib -I${.CURDIR} \
-I${KCGIBASE}/include
-LDADD += -L${KCGIBASE}/lib -lkcgihtml -lkcgi -lz
+LDADD += -L${KCGIBASE}/lib -lkcgihtml -lkcgi -lz -lm
LDSTATIC = ${STATIC}
.if ${GOT_RELEASE} != "Yes"
blob - /dev/null
blob + 05e57b1f7053a73112d4713be722a197307866e2 (mode 644)
--- /dev/null
+++ lib/bloom.c
+/*
+ * Copyright (c) 2012-2019, Jyri J. Virkki
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* Obtained from https://github.com/jvirkki/libbloom */
+
+/*
+ * Refer to bloom.h for documentation on the public interfaces.
+ */
+
+#include <assert.h>
+#include <fcntl.h>
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "bloom.h"
+#include "murmurhash2.h"
+
+#define MAKESTRING(n) STRING(n)
+#define STRING(n) #n
+
+
+inline static int test_bit_set_bit(unsigned char * buf,
+ unsigned int x, int set_bit)
+{
+ unsigned int byte = x >> 3;
+ unsigned char c = buf[byte]; // expensive memory access
+ unsigned int mask = 1 << (x % 8);
+
+ if (c & mask) {
+ return 1;
+ } else {
+ if (set_bit) {
+ buf[byte] = c | mask;
+ }
+ return 0;
+ }
+}
+
+
+static int bloom_check_add(struct bloom * bloom,
+ const void * buffer, int len, int add)
+{
+ if (bloom->ready == 0) {
+ printf("bloom at %p not initialized!\n", (void *)bloom);
+ return -1;
+ }
+
+ int hits = 0;
+ register unsigned int a = murmurhash2(buffer, len, 0x9747b28c);
+ register unsigned int b = murmurhash2(buffer, len, a);
+ register unsigned int x;
+ register unsigned int i;
+
+ for (i = 0; i < bloom->hashes; i++) {
+ x = (a + i*b) % bloom->bits;
+ if (test_bit_set_bit(bloom->bf, x, add)) {
+ hits++;
+ } else if (!add) {
+ // Don't care about the presence of all the bits. Just our own.
+ return 0;
+ }
+ }
+
+ if (hits == bloom->hashes) {
+ return 1; // 1 == element already in (or collision)
+ }
+
+ return 0;
+}
+
+
+int bloom_init_size(struct bloom * bloom, int entries, double error,
+ unsigned int cache_size)
+{
+ return bloom_init(bloom, entries, error);
+}
+
+
+int bloom_init(struct bloom * bloom, int entries, double error)
+{
+ bloom->ready = 0;
+
+ if (entries < 1000 || error == 0) {
+ return 1;
+ }
+
+ bloom->entries = entries;
+ bloom->error = error;
+
+ double num = log(bloom->error);
+ double denom = 0.480453013918201; // ln(2)^2
+ bloom->bpe = -(num / denom);
+
+ double dentries = (double)entries;
+ bloom->bits = (int)(dentries * bloom->bpe);
+
+ if (bloom->bits % 8) {
+ bloom->bytes = (bloom->bits / 8) + 1;
+ } else {
+ bloom->bytes = bloom->bits / 8;
+ }
+
+ bloom->hashes = (int)ceil(0.693147180559945 * bloom->bpe); // ln(2)
+
+ bloom->bf = (unsigned char *)calloc(bloom->bytes, sizeof(unsigned char));
+ if (bloom->bf == NULL) { // LCOV_EXCL_START
+ return 1;
+ } // LCOV_EXCL_STOP
+
+ bloom->ready = 1;
+ return 0;
+}
+
+
+int bloom_check(struct bloom * bloom, const void * buffer, int len)
+{
+ return bloom_check_add(bloom, buffer, len, 0);
+}
+
+
+int bloom_add(struct bloom * bloom, const void * buffer, int len)
+{
+ return bloom_check_add(bloom, buffer, len, 1);
+}
+
+
+void bloom_print(struct bloom * bloom)
+{
+ printf("bloom at %p\n", (void *)bloom);
+ printf(" ->entries = %d\n", bloom->entries);
+ printf(" ->error = %f\n", bloom->error);
+ printf(" ->bits = %d\n", bloom->bits);
+ printf(" ->bits per elem = %f\n", bloom->bpe);
+ printf(" ->bytes = %d\n", bloom->bytes);
+ printf(" ->hash functions = %d\n", bloom->hashes);
+}
+
+
+void bloom_free(struct bloom * bloom)
+{
+ if (bloom->ready) {
+ free(bloom->bf);
+ }
+ bloom->ready = 0;
+}
+
+
+int bloom_reset(struct bloom * bloom)
+{
+ if (!bloom->ready) return 1;
+ memset(bloom->bf, 0, bloom->bytes);
+ return 0;
+}
blob - /dev/null
blob + 28356e3721b9f7902763322750d5a1b87640cd4d (mode 644)
--- /dev/null
+++ lib/bloom.h
+/*
+ * Copyright (c) 2012-2017, Jyri J. Virkki
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* Obtained from https://github.com/jvirkki/libbloom */
+
+#ifndef _BLOOM_H
+#define _BLOOM_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/** ***************************************************************************
+ * Structure to keep track of one bloom filter. Caller needs to
+ * allocate this and pass it to the functions below. First call for
+ * every struct must be to bloom_init().
+ *
+ */
+struct bloom
+{
+ // These fields are part of the public interface of this structure.
+ // Client code may read these values if desired. Client code MUST NOT
+ // modify any of these.
+ int entries;
+ double error;
+ int bits;
+ int bytes;
+ int hashes;
+
+ // Fields below are private to the implementation. These may go away or
+ // change incompatibly at any moment. Client code MUST NOT access or rely
+ // on these.
+ double bpe;
+ unsigned char * bf;
+ int ready;
+};
+
+
+/** ***************************************************************************
+ * Initialize the bloom filter for use.
+ *
+ * The filter is initialized with a bit field and number of hash functions
+ * according to the computations from the wikipedia entry:
+ * http://en.wikipedia.org/wiki/Bloom_filter
+ *
+ * Optimal number of bits is:
+ * bits = (entries * ln(error)) / ln(2)^2
+ *
+ * Optimal number of hash functions is:
+ * hashes = bpe * ln(2)
+ *
+ * Parameters:
+ * -----------
+ * bloom - Pointer to an allocated struct bloom (see above).
+ * entries - The expected number of entries which will be inserted.
+ * Must be at least 1000 (in practice, likely much larger).
+ * error - Probability of collision (as long as entries are not
+ * exceeded).
+ *
+ * Return:
+ * -------
+ * 0 - on success
+ * 1 - on failure
+ *
+ */
+int bloom_init(struct bloom * bloom, int entries, double error);
+
+
+/** ***************************************************************************
+ * Deprecated, use bloom_init()
+ *
+ */
+int bloom_init_size(struct bloom * bloom, int entries, double error,
+ unsigned int cache_size);
+
+
+/** ***************************************************************************
+ * Check if the given element is in the bloom filter. Remember this may
+ * return false positive if a collision occurred.
+ *
+ * Parameters:
+ * -----------
+ * bloom - Pointer to an allocated struct bloom (see above).
+ * buffer - Pointer to buffer containing element to check.
+ * len - Size of 'buffer'.
+ *
+ * Return:
+ * -------
+ * 0 - element is not present
+ * 1 - element is present (or false positive due to collision)
+ * -1 - bloom not initialized
+ *
+ */
+int bloom_check(struct bloom * bloom, const void * buffer, int len);
+
+
+/** ***************************************************************************
+ * Add the given element to the bloom filter.
+ * The return code indicates if the element (or a collision) was already in,
+ * so for the common check+add use case, no need to call check separately.
+ *
+ * Parameters:
+ * -----------
+ * bloom - Pointer to an allocated struct bloom (see above).
+ * buffer - Pointer to buffer containing element to add.
+ * len - Size of 'buffer'.
+ *
+ * Return:
+ * -------
+ * 0 - element was not present and was added
+ * 1 - element (or a collision) had already been added previously
+ * -1 - bloom not initialized
+ *
+ */
+int bloom_add(struct bloom * bloom, const void * buffer, int len);
+
+
+/** ***************************************************************************
+ * Print (to stdout) info about this bloom filter. Debugging aid.
+ *
+ */
+void bloom_print(struct bloom * bloom);
+
+
+/** ***************************************************************************
+ * Deallocate internal storage.
+ *
+ * Upon return, the bloom struct is no longer usable. You may call bloom_init
+ * again on the same struct to reinitialize it again.
+ *
+ * Parameters:
+ * -----------
+ * bloom - Pointer to an allocated struct bloom (see above).
+ *
+ * Return: none
+ *
+ */
+void bloom_free(struct bloom * bloom);
+
+/** ***************************************************************************
+ * Erase internal storage.
+ *
+ * Erases all elements. Upon return, the bloom struct returns to its initial
+ * (initialized) state.
+ *
+ * Parameters:
+ * -----------
+ * bloom - Pointer to an allocated struct bloom (see above).
+ *
+ * Return:
+ * 0 - on success
+ * 1 - on failure
+ *
+ */
+int bloom_reset(struct bloom * bloom);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
blob - 743935e3d21d9086783c3a8645b212424250520e
blob + d12c1d0d2ac9ef2b17d24062220b1f89c1f7ae78
--- lib/got_lib_repository.h
+++ lib/got_lib_repository.h
#define GOT_PACK_CACHE_SIZE 64
+struct got_packidx_bloom_filter {
+ char path_packidx[PATH_MAX]; /* on-disk path */
+ size_t path_packidx_len;
+ struct bloom *bloom;
+ STAILQ_ENTRY(got_packidx_bloom_filter) entry;
+};
+
+STAILQ_HEAD(got_packidx_bloom_filter_head, got_packidx_bloom_filter);
+
struct got_repository {
char *path;
char *path_git_dir;
/* The pack index cache speeds up search for packed objects. */
struct got_packidx *packidx_cache[GOT_PACK_CACHE_SIZE];
+ /*
+ * List of bloom filters for pack index files.
+ * Used to avoid opening a pack index in search of an
+ * object ID which is not contained in this pack index.
+ */
+ struct got_packidx_bloom_filter_head packidx_bloom_filters;
+
/* Open file handles for pack files. */
struct got_pack packs[GOT_PACK_CACHE_SIZE];
blob - /dev/null
blob + ac869f866068f2b1050c52779f2539890a8c877f (mode 644)
--- /dev/null
+++ lib/murmurhash2.c
+//-----------------------------------------------------------------------------
+// MurmurHash2 was written by Austin Appleby, and is placed in the public
+// domain. The author hereby disclaims copyright to this source code.
+
+/* Obtained from https://github.com/aappleby/smhasher */
+
+#include <stdint.h>
+
+#include "murmurhash2.h"
+
+uint32_t
+murmurhash2(const void * key, int len, uint32_t seed)
+{
+ // 'm' and 'r' are mixing constants generated offline.
+ // They're not really 'magic', they just happen to work well.
+
+ const uint32_t m = 0x5bd1e995;
+ const int r = 24;
+
+ // Initialize the hash to a 'random' value
+
+ uint32_t h = seed ^ len;
+
+ // Mix 4 bytes at a time into the hash
+
+ const unsigned char *data = (const unsigned char *)key;
+
+ while(len >= 4)
+ {
+ uint32_t k = *(uint32_t*)data;
+
+ k *= m;
+ k ^= k >> r;
+ k *= m;
+
+ h *= m;
+ h ^= k;
+
+ data += 4;
+ len -= 4;
+ }
+
+ // Handle the last few bytes of the input array
+
+ switch(len)
+ {
+ case 3: h ^= data[2] << 16;
+ case 2: h ^= data[1] << 8;
+ case 1: h ^= data[0];
+ h *= m;
+ };
+
+ // Do a few final mixes of the hash to ensure the last few
+ // bytes are well-incorporated.
+
+ h ^= h >> 13;
+ h *= m;
+ h ^= h >> 15;
+
+ return h;
+}
blob - /dev/null
blob + bbd2bf3ef0309efebdfbb79dd75aa100d4fb0b74 (mode 644)
--- /dev/null
+++ lib/murmurhash2.h
+//-----------------------------------------------------------------------------
+// MurmurHash2 was written by Austin Appleby, and is placed in the public
+// domain. The author hereby disclaims copyright to this source code.
+
+/* Obtained from https://github.com/aappleby/smhasher */
+
+uint32_t murmurhash2(const void *key, int len, uint32_t seed);
blob - a963c0de6158cfedb20185a2af26c5cb109e7cad
blob + 38d61ae41b5d5e94177f872aefb961dbaaf3a579
--- lib/repository.c
+++ lib/repository.c
#include <stdint.h>
#include <uuid.h>
-#include "got_compat.h"
+#include "bloom.h"
#include "got_error.h"
#include "got_reference.h"
err = got_error_from_errno("calloc");
goto done;
}
+
+ STAILQ_INIT(&repo->packidx_bloom_filters);
for (i = 0; i < nitems(repo->privsep_children); i++) {
memset(&repo->privsep_children[i], 0,
if (repo->packidx_cache[i] == NULL)
break;
got_packidx_close(repo->packidx_cache[i]);
+ }
+
+ while (!STAILQ_EMPTY(&repo->packidx_bloom_filters)) {
+ struct got_packidx_bloom_filter *bf;
+ bf = STAILQ_FIRST(&repo->packidx_bloom_filters);
+ STAILQ_REMOVE_HEAD(&repo->packidx_bloom_filters, entry);
+ free(bf->bloom);
+ free(bf);
}
for (i = 0; i < repo->pack_cache_size; i++) {
if (strcmp(name + strlen(GOT_PACK_PREFIX) +
SHA1_DIGEST_STRING_LENGTH - 1, GOT_PACKIDX_SUFFIX) != 0)
return 0;
+
+ return 1;
+}
+
+static int
+check_packidx_bloom_filter(struct got_repository *repo,
+ const char *path_packidx, struct got_object_id *id)
+{
+ struct got_packidx_bloom_filter *bf;
+
+ STAILQ_FOREACH(bf, &repo->packidx_bloom_filters, entry) {
+ if (got_path_cmp(bf->path_packidx, path_packidx,
+ bf->path_packidx_len, strlen(path_packidx)) == 0) {
+ return bloom_check(bf->bloom, id->sha1,
+ sizeof(id->sha1));
+ }
+ }
+ /* No bloom filter means this pack index must be searched. */
return 1;
+}
+
+static const struct got_error *
+add_packidx_bloom_filter(struct got_repository *repo,
+ struct got_packidx *packidx, const char *path_packidx)
+{
+ int i, nobjects = be32toh(packidx->hdr.fanout_table[0xff]);
+ struct got_packidx_bloom_filter *bf;
+ size_t len;
+
+ /*
+ * Don't use bloom filters for very large pack index files.
+ * Large pack files will contain a relatively large fraction
+ * of our objects so we will likely need to visit them anyway.
+ * The more objects a pack file contains the higher the probability
+ * of a false-positive match from the bloom filter. And reading
+ * all object IDs from a large pack index file can be expensive.
+ */
+ if (nobjects > 100000) /* cut-off at about 2MB, at 20 bytes per ID */
+ return NULL;
+
+ /* Do we already have a filter for this pack index? */
+ STAILQ_FOREACH(bf, &repo->packidx_bloom_filters, entry) {
+ if (got_path_cmp(bf->path_packidx, path_packidx,
+ bf->path_packidx_len, strlen(path_packidx)) == 0)
+ return NULL;
+ }
+
+ bf = calloc(1, sizeof(*bf));
+ if (bf == NULL)
+ return got_error_from_errno("calloc");
+ bf->bloom = calloc(1, sizeof(*bf->bloom));
+ if (bf->bloom == NULL) {
+ free(bf);
+ return got_error_from_errno("calloc");
+ }
+
+
+ len = strlcpy(bf->path_packidx, path_packidx, sizeof(bf->path_packidx));
+ if (len >= sizeof(bf->path_packidx)) {
+ free(bf->bloom);
+ free(bf);
+ return got_error(GOT_ERR_NO_SPACE);
+ }
+ bf->path_packidx_len = len;
+
+ /* Minimum size supported by our bloom filter is 1000 entries. */
+ bloom_init(bf->bloom, nobjects < 1000 ? 1000 : nobjects, 0.1);
+ for (i = 0; i < nobjects; i++) {
+ struct got_packidx_object_id *id;
+ id = &packidx->hdr.sorted_ids[i];
+ bloom_add(bf->bloom, id->sha1, sizeof(id->sha1));
+ }
+
+ STAILQ_INSERT_TAIL(&repo->packidx_bloom_filters, bf, entry);
+ return NULL;
}
const struct got_error *
for (i = 0; i < repo->pack_cache_size; i++) {
if (repo->packidx_cache[i] == NULL)
break;
+ if (!check_packidx_bloom_filter(repo,
+ repo->packidx_cache[i]->path_packidx, id))
+ continue; /* object will not be found in this index */
*idx = got_packidx_get_object_idx(repo->packidx_cache[i], id);
if (*idx != -1) {
*packidx = repo->packidx_cache[i];
goto done;
}
+ if (!check_packidx_bloom_filter(repo, path_packidx, id)) {
+ free(path_packidx);
+ continue; /* object will not be found in this index */
+ }
+
for (i = 0; i < repo->pack_cache_size; i++) {
if (repo->packidx_cache[i] == NULL)
break;
goto done;
}
+ err = add_packidx_bloom_filter(repo, *packidx, path_packidx);
+ if (err) {
+ free(path_packidx);
+ goto done;
+ }
+
err = cache_packidx(repo, *packidx, path_packidx);
free(path_packidx);
if (err)
blob - 10c22ece9a96f897b3b926737aa5bf217b1df8f2
blob + 0215869fd1a3678fe92c416a609faf3e875f0a34
--- regress/fetch/Makefile
+++ regress/fetch/Makefile
SRCS = error.c privsep.c reference.c sha1.c object.c object_parse.c path.c \
opentemp.c repository.c lockfile.c object_cache.c pack.c inflate.c \
deflate.c delta.c delta_cache.c object_idset.c object_create.c \
- fetch.c gotconfig.c dial.c fetch_test.c
+ fetch.c gotconfig.c dial.c fetch_test.c bloom.c murmurhash2.c
CPPFLAGS = -I${.CURDIR}/../../include -I${.CURDIR}/../../lib
-LDADD = -lutil -lz
+LDADD = -lutil -lz -lm
NOMAN = yes