Commit Diff


commit - 3333aec6eab7596a83037f6fc7abb9e0c8dee838
commit + 4545b700107608ae12d074627db4e14d39f6b1fe
blob - 48637640943824f4455df6330321f83f0ea237ad
blob + b68c94d43465ffea4905be005ad51f1b0cd194a8
--- gotweb/Makefile
+++ gotweb/Makefile
@@ -14,12 +14,13 @@ SRCS =		gotweb.c parse.y blame.c commit_graph.c delta.
 		deflate.c object_create.c delta_cache.c gotconfig.c \
 		diff_main.c diff_atomize_text.c diff_myers.c diff_output.c \
 		diff_output_plain.c diff_output_unidiff.c \
-		diff_output_edscript.c diff_patience.c
+		diff_output_edscript.c diff_patience.c \
+		bloom.c murmurhash2.c
 MAN =		${PROG}.conf.5 ${PROG}.8
 
 CPPFLAGS +=	-I${.CURDIR}/../include -I${.CURDIR}/../lib -I${.CURDIR} \
 		-I${KCGIBASE}/include
-LDADD +=	-L${KCGIBASE}/lib -lkcgihtml -lkcgi -lz
+LDADD +=	-L${KCGIBASE}/lib -lkcgihtml -lkcgi -lz -lm
 LDSTATIC =	${STATIC}
 
 .if ${GOT_RELEASE} != "Yes"
blob - /dev/null
blob + 05e57b1f7053a73112d4713be722a197307866e2 (mode 644)
--- /dev/null
+++ lib/bloom.c
@@ -0,0 +1,184 @@
+/*
+ *  Copyright (c) 2012-2019, Jyri J. Virkki
+ *  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ * 
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* Obtained from https://github.com/jvirkki/libbloom */
+
+/*
+ * Refer to bloom.h for documentation on the public interfaces.
+ */
+
+#include <assert.h>
+#include <fcntl.h>
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "bloom.h"
+#include "murmurhash2.h"
+
+#define MAKESTRING(n) STRING(n)
+#define STRING(n) #n
+
+
+inline static int test_bit_set_bit(unsigned char * buf,
+                                   unsigned int x, int set_bit)
+{
+  unsigned int byte = x >> 3;
+  unsigned char c = buf[byte];        // expensive memory access
+  unsigned int mask = 1 << (x % 8);
+
+  if (c & mask) {
+    return 1;
+  } else {
+    if (set_bit) {
+      buf[byte] = c | mask;
+    }
+    return 0;
+  }
+}
+
+
+static int bloom_check_add(struct bloom * bloom,
+                           const void * buffer, int len, int add)
+{
+  if (bloom->ready == 0) {
+    printf("bloom at %p not initialized!\n", (void *)bloom);
+    return -1;
+  }
+
+  int hits = 0;
+  register unsigned int a = murmurhash2(buffer, len, 0x9747b28c);
+  register unsigned int b = murmurhash2(buffer, len, a);
+  register unsigned int x;
+  register unsigned int i;
+
+  for (i = 0; i < bloom->hashes; i++) {
+    x = (a + i*b) % bloom->bits;
+    if (test_bit_set_bit(bloom->bf, x, add)) {
+      hits++;
+    } else if (!add) {
+      // Don't care about the presence of all the bits. Just our own.
+      return 0;
+    }
+  }
+
+  if (hits == bloom->hashes) {
+    return 1;                // 1 == element already in (or collision)
+  }
+
+  return 0;
+}
+
+
+int bloom_init_size(struct bloom * bloom, int entries, double error,
+                    unsigned int cache_size)
+{
+  return bloom_init(bloom, entries, error);
+}
+
+
+int bloom_init(struct bloom * bloom, int entries, double error)
+{
+  bloom->ready = 0;
+
+  if (entries < 1000 || error == 0) {
+    return 1;
+  }
+
+  bloom->entries = entries;
+  bloom->error = error;
+
+  double num = log(bloom->error);
+  double denom = 0.480453013918201; // ln(2)^2
+  bloom->bpe = -(num / denom);
+
+  double dentries = (double)entries;
+  bloom->bits = (int)(dentries * bloom->bpe);
+
+  if (bloom->bits % 8) {
+    bloom->bytes = (bloom->bits / 8) + 1;
+  } else {
+    bloom->bytes = bloom->bits / 8;
+  }
+
+  bloom->hashes = (int)ceil(0.693147180559945 * bloom->bpe);  // ln(2)
+
+  bloom->bf = (unsigned char *)calloc(bloom->bytes, sizeof(unsigned char));
+  if (bloom->bf == NULL) {                                   // LCOV_EXCL_START
+    return 1;
+  }                                                          // LCOV_EXCL_STOP
+
+  bloom->ready = 1;
+  return 0;
+}
+
+
+int bloom_check(struct bloom * bloom, const void * buffer, int len)
+{
+  return bloom_check_add(bloom, buffer, len, 0);
+}
+
+
+int bloom_add(struct bloom * bloom, const void * buffer, int len)
+{
+  return bloom_check_add(bloom, buffer, len, 1);
+}
+
+
+void bloom_print(struct bloom * bloom)
+{
+  printf("bloom at %p\n", (void *)bloom);
+  printf(" ->entries = %d\n", bloom->entries);
+  printf(" ->error = %f\n", bloom->error);
+  printf(" ->bits = %d\n", bloom->bits);
+  printf(" ->bits per elem = %f\n", bloom->bpe);
+  printf(" ->bytes = %d\n", bloom->bytes);
+  printf(" ->hash functions = %d\n", bloom->hashes);
+}
+
+
+void bloom_free(struct bloom * bloom)
+{
+  if (bloom->ready) {
+    free(bloom->bf);
+  }
+  bloom->ready = 0;
+}
+
+
+int bloom_reset(struct bloom * bloom)
+{
+  if (!bloom->ready) return 1;
+  memset(bloom->bf, 0, bloom->bytes);
+  return 0;
+}
blob - /dev/null
blob + 28356e3721b9f7902763322750d5a1b87640cd4d (mode 644)
--- /dev/null
+++ lib/bloom.h
@@ -0,0 +1,187 @@
+/*
+ *  Copyright (c) 2012-2017, Jyri J. Virkki
+ *  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ * 
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* Obtained from https://github.com/jvirkki/libbloom */
+
+#ifndef _BLOOM_H
+#define _BLOOM_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/** ***************************************************************************
+ * Structure to keep track of one bloom filter.  Caller needs to
+ * allocate this and pass it to the functions below. First call for
+ * every struct must be to bloom_init().
+ *
+ */
+struct bloom
+{
+  // These fields are part of the public interface of this structure.
+  // Client code may read these values if desired. Client code MUST NOT
+  // modify any of these.
+  int entries;
+  double error;
+  int bits;
+  int bytes;
+  int hashes;
+
+  // Fields below are private to the implementation. These may go away or
+  // change incompatibly at any moment. Client code MUST NOT access or rely
+  // on these.
+  double bpe;
+  unsigned char * bf;
+  int ready;
+};
+
+
+/** ***************************************************************************
+ * Initialize the bloom filter for use.
+ *
+ * The filter is initialized with a bit field and number of hash functions
+ * according to the computations from the wikipedia entry:
+ *     http://en.wikipedia.org/wiki/Bloom_filter
+ *
+ * Optimal number of bits is:
+ *     bits = (entries * ln(error)) / ln(2)^2
+ *
+ * Optimal number of hash functions is:
+ *     hashes = bpe * ln(2)
+ *
+ * Parameters:
+ * -----------
+ *     bloom   - Pointer to an allocated struct bloom (see above).
+ *     entries - The expected number of entries which will be inserted.
+ *               Must be at least 1000 (in practice, likely much larger).
+ *     error   - Probability of collision (as long as entries are not
+ *               exceeded).
+ *
+ * Return:
+ * -------
+ *     0 - on success
+ *     1 - on failure
+ *
+ */
+int bloom_init(struct bloom * bloom, int entries, double error);
+
+
+/** ***************************************************************************
+ * Deprecated, use bloom_init()
+ *
+ */
+int bloom_init_size(struct bloom * bloom, int entries, double error,
+                    unsigned int cache_size);
+
+
+/** ***************************************************************************
+ * Check if the given element is in the bloom filter. Remember this may
+ * return false positive if a collision occurred.
+ *
+ * Parameters:
+ * -----------
+ *     bloom  - Pointer to an allocated struct bloom (see above).
+ *     buffer - Pointer to buffer containing element to check.
+ *     len    - Size of 'buffer'.
+ *
+ * Return:
+ * -------
+ *     0 - element is not present
+ *     1 - element is present (or false positive due to collision)
+ *    -1 - bloom not initialized
+ *
+ */
+int bloom_check(struct bloom * bloom, const void * buffer, int len);
+
+
+/** ***************************************************************************
+ * Add the given element to the bloom filter.
+ * The return code indicates if the element (or a collision) was already in,
+ * so for the common check+add use case, no need to call check separately.
+ *
+ * Parameters:
+ * -----------
+ *     bloom  - Pointer to an allocated struct bloom (see above).
+ *     buffer - Pointer to buffer containing element to add.
+ *     len    - Size of 'buffer'.
+ *
+ * Return:
+ * -------
+ *     0 - element was not present and was added
+ *     1 - element (or a collision) had already been added previously
+ *    -1 - bloom not initialized
+ *
+ */
+int bloom_add(struct bloom * bloom, const void * buffer, int len);
+
+
+/** ***************************************************************************
+ * Print (to stdout) info about this bloom filter. Debugging aid.
+ *
+ */
+void bloom_print(struct bloom * bloom);
+
+
+/** ***************************************************************************
+ * Deallocate internal storage.
+ *
+ * Upon return, the bloom struct is no longer usable. You may call bloom_init
+ * again on the same struct to reinitialize it again.
+ *
+ * Parameters:
+ * -----------
+ *     bloom  - Pointer to an allocated struct bloom (see above).
+ *
+ * Return: none
+ *
+ */
+void bloom_free(struct bloom * bloom);
+
+/** ***************************************************************************
+ * Erase internal storage.
+ *
+ * Erases all elements. Upon return, the bloom struct returns to its initial
+ * (initialized) state.
+ *
+ * Parameters:
+ * -----------
+ *     bloom  - Pointer to an allocated struct bloom (see above).
+ *
+ * Return:
+ *     0 - on success
+ *     1 - on failure
+ *
+ */
+int bloom_reset(struct bloom * bloom);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
blob - 743935e3d21d9086783c3a8645b212424250520e
blob + d12c1d0d2ac9ef2b17d24062220b1f89c1f7ae78
--- lib/got_lib_repository.h
+++ lib/got_lib_repository.h
@@ -30,6 +30,15 @@
 
 #define GOT_PACK_CACHE_SIZE	64
 
+struct got_packidx_bloom_filter {
+	char path_packidx[PATH_MAX]; /* on-disk path */
+	size_t path_packidx_len;
+	struct bloom *bloom;
+	STAILQ_ENTRY(got_packidx_bloom_filter) entry;
+};
+
+STAILQ_HEAD(got_packidx_bloom_filter_head, got_packidx_bloom_filter);
+
 struct got_repository {
 	char *path;
 	char *path_git_dir;
@@ -38,6 +47,13 @@ struct got_repository {
 	/* The pack index cache speeds up search for packed objects. */
 	struct got_packidx *packidx_cache[GOT_PACK_CACHE_SIZE];
 
+	/*
+	 * List of bloom filters for pack index files.
+	 * Used to avoid opening a pack index in search of an
+	 * object ID which is not contained in this pack index.
+	 */
+	struct got_packidx_bloom_filter_head packidx_bloom_filters;
+
 	/* Open file handles for pack files. */
 	struct got_pack packs[GOT_PACK_CACHE_SIZE];
 
blob - /dev/null
blob + ac869f866068f2b1050c52779f2539890a8c877f (mode 644)
--- /dev/null
+++ lib/murmurhash2.c
@@ -0,0 +1,61 @@
+//-----------------------------------------------------------------------------
+// MurmurHash2 was written by Austin Appleby, and is placed in the public
+// domain. The author hereby disclaims copyright to this source code.
+
+/* Obtained from https://github.com/aappleby/smhasher */
+
+#include <stdint.h>
+
+#include "murmurhash2.h"
+
+uint32_t
+murmurhash2(const void * key, int len, uint32_t seed)
+{
+  // 'm' and 'r' are mixing constants generated offline.
+  // They're not really 'magic', they just happen to work well.
+
+  const uint32_t m = 0x5bd1e995;
+  const int r = 24;
+
+  // Initialize the hash to a 'random' value
+
+  uint32_t h = seed ^ len;
+
+  // Mix 4 bytes at a time into the hash
+
+  const unsigned char *data = (const unsigned char *)key;
+
+  while(len >= 4)
+  {
+    uint32_t k = *(uint32_t*)data;
+
+    k *= m;
+    k ^= k >> r;
+    k *= m;
+
+    h *= m;
+    h ^= k;
+
+    data += 4;
+    len -= 4;
+  }
+
+  // Handle the last few bytes of the input array
+
+  switch(len)
+  {
+  case 3: h ^= data[2] << 16;
+  case 2: h ^= data[1] << 8;
+  case 1: h ^= data[0];
+      h *= m;
+  };
+
+  // Do a few final mixes of the hash to ensure the last few
+  // bytes are well-incorporated.
+
+  h ^= h >> 13;
+  h *= m;
+  h ^= h >> 15;
+
+  return h;
+} 
blob - /dev/null
blob + bbd2bf3ef0309efebdfbb79dd75aa100d4fb0b74 (mode 644)
--- /dev/null
+++ lib/murmurhash2.h
@@ -0,0 +1,7 @@
+//-----------------------------------------------------------------------------
+// MurmurHash2 was written by Austin Appleby, and is placed in the public
+// domain. The author hereby disclaims copyright to this source code.
+
+/* Obtained from https://github.com/aappleby/smhasher */
+
+uint32_t murmurhash2(const void *key, int len, uint32_t seed);
blob - a963c0de6158cfedb20185a2af26c5cb109e7cad
blob + 38d61ae41b5d5e94177f872aefb961dbaaf3a579
--- lib/repository.c
+++ lib/repository.c
@@ -37,7 +37,7 @@
 #include <stdint.h>
 #include <uuid.h>
 
-#include "got_compat.h"
+#include "bloom.h"
 
 #include "got_error.h"
 #include "got_reference.h"
@@ -632,6 +632,8 @@ got_repo_open(struct got_repository **repop, const cha
 		err = got_error_from_errno("calloc");
 		goto done;
 	}
+
+	STAILQ_INIT(&repo->packidx_bloom_filters);
 
 	for (i = 0; i < nitems(repo->privsep_children); i++) {
 		memset(&repo->privsep_children[i], 0,
@@ -727,6 +729,14 @@ got_repo_close(struct got_repository *repo)
 		if (repo->packidx_cache[i] == NULL)
 			break;
 		got_packidx_close(repo->packidx_cache[i]);
+	}
+
+	while (!STAILQ_EMPTY(&repo->packidx_bloom_filters)) {
+		struct got_packidx_bloom_filter *bf;
+		bf = STAILQ_FIRST(&repo->packidx_bloom_filters);
+		STAILQ_REMOVE_HEAD(&repo->packidx_bloom_filters, entry);
+		free(bf->bloom);
+		free(bf);
 	}
 
 	for (i = 0; i < repo->pack_cache_size; i++) {
@@ -949,8 +959,82 @@ got_repo_is_packidx_filename(const char *name, size_t 
 	if (strcmp(name + strlen(GOT_PACK_PREFIX) +
 	    SHA1_DIGEST_STRING_LENGTH - 1, GOT_PACKIDX_SUFFIX) != 0)
 		return 0;
+
+	return 1;
+}
+
+static int
+check_packidx_bloom_filter(struct got_repository *repo,
+    const char *path_packidx, struct got_object_id *id)
+{
+	struct got_packidx_bloom_filter *bf;
+
+	STAILQ_FOREACH(bf, &repo->packidx_bloom_filters, entry) {
+		if (got_path_cmp(bf->path_packidx, path_packidx,
+		    bf->path_packidx_len, strlen(path_packidx)) == 0) {
+			return bloom_check(bf->bloom, id->sha1,
+			    sizeof(id->sha1));
+		}
+	}
 
+	/* No bloom filter means this pack index must be searched. */
 	return 1;
+}
+
+static const struct got_error *
+add_packidx_bloom_filter(struct got_repository *repo,
+    struct got_packidx *packidx, const char *path_packidx)
+{
+	int i, nobjects = be32toh(packidx->hdr.fanout_table[0xff]);
+	struct got_packidx_bloom_filter *bf;
+	size_t len;
+
+	/*
+	 * Don't use bloom filters for very large pack index files.
+	 * Large pack files will contain a relatively large fraction
+	 * of our objects so we will likely need to visit them anyway.
+	 * The more objects a pack file contains the higher the probability
+	 * of a false-positive match from the bloom filter. And reading
+	 * all object IDs from a large pack index file can be expensive.
+	 */
+	if (nobjects > 100000) /* cut-off at about 2MB, at 20 bytes per ID */
+		return NULL;
+
+	/* Do we already have a filter for this pack index? */
+	STAILQ_FOREACH(bf, &repo->packidx_bloom_filters, entry) {
+		if (got_path_cmp(bf->path_packidx, path_packidx,
+		    bf->path_packidx_len, strlen(path_packidx)) == 0)
+			return NULL;
+	}
+
+	bf = calloc(1, sizeof(*bf));
+	if (bf == NULL)
+		return got_error_from_errno("calloc");
+	bf->bloom = calloc(1, sizeof(*bf->bloom));
+	if (bf->bloom == NULL) {
+		free(bf);
+		return got_error_from_errno("calloc");
+	}
+	
+	
+	len = strlcpy(bf->path_packidx, path_packidx, sizeof(bf->path_packidx));
+	if (len >= sizeof(bf->path_packidx)) {
+		free(bf->bloom);
+		free(bf);
+		return got_error(GOT_ERR_NO_SPACE);
+	}
+	bf->path_packidx_len = len;
+
+	/* Minimum size supported by our bloom filter is 1000 entries. */
+	bloom_init(bf->bloom, nobjects < 1000 ? 1000 : nobjects, 0.1);
+	for (i = 0; i < nobjects; i++) {
+		struct got_packidx_object_id *id;
+		id = &packidx->hdr.sorted_ids[i];
+		bloom_add(bf->bloom, id->sha1, sizeof(id->sha1));
+	}
+
+	STAILQ_INSERT_TAIL(&repo->packidx_bloom_filters, bf, entry);
+	return NULL;
 }
 
 const struct got_error *
@@ -968,6 +1052,9 @@ got_repo_search_packidx(struct got_packidx **packidx, 
 	for (i = 0; i < repo->pack_cache_size; i++) {
 		if (repo->packidx_cache[i] == NULL)
 			break;
+		if (!check_packidx_bloom_filter(repo,
+		    repo->packidx_cache[i]->path_packidx, id))
+			continue; /* object will not be found in this index */
 		*idx = got_packidx_get_object_idx(repo->packidx_cache[i], id);
 		if (*idx != -1) {
 			*packidx = repo->packidx_cache[i];
@@ -1017,6 +1104,11 @@ got_repo_search_packidx(struct got_packidx **packidx, 
 			goto done;
 		}
 
+		if (!check_packidx_bloom_filter(repo, path_packidx, id)) {
+			free(path_packidx);
+			continue; /* object will not be found in this index */
+		}
+
 		for (i = 0; i < repo->pack_cache_size; i++) {
 			if (repo->packidx_cache[i] == NULL)
 				break;
@@ -1038,6 +1130,12 @@ got_repo_search_packidx(struct got_packidx **packidx, 
 			goto done;
 		}
 
+		err = add_packidx_bloom_filter(repo, *packidx, path_packidx);
+		if (err) {
+			free(path_packidx);
+			goto done;
+		}
+
 		err = cache_packidx(repo, *packidx, path_packidx);
 		free(path_packidx);
 		if (err)
blob - 10c22ece9a96f897b3b926737aa5bf217b1df8f2
blob + 0215869fd1a3678fe92c416a609faf3e875f0a34
--- regress/fetch/Makefile
+++ regress/fetch/Makefile
@@ -4,10 +4,10 @@ PROG = fetch_test
 SRCS = error.c privsep.c reference.c sha1.c object.c object_parse.c path.c \
 	opentemp.c repository.c lockfile.c object_cache.c pack.c inflate.c \
 	deflate.c delta.c delta_cache.c object_idset.c object_create.c \
-	fetch.c gotconfig.c dial.c fetch_test.c
+	fetch.c gotconfig.c dial.c fetch_test.c bloom.c murmurhash2.c
 
 CPPFLAGS = -I${.CURDIR}/../../include -I${.CURDIR}/../../lib
-LDADD = -lutil -lz
+LDADD = -lutil -lz -lm
 
 NOMAN = yes