commit fcece7180725bba9a781eaa892af379b1986208b from: Omar Polo date: Mon Feb 26 16:25:15 2024 UTC attempt to speed up the deltification for big files The current hash table perform poorly on big files due to a small resize step that pushes the table to its limits continuously. Instead, to have both a better performing hash table and keep the memory consumption low, save the blocks in an array and use the hash table as index. Then, use a more generous resizing scheme that guarantees the good properties of the hash table. To avoid having to rebuild the table when the array is resized, save the indexes in the table, and to further reduce the memory consumption use 32 bit indices. On amd64 this means that each slot is 4 bytes instead of 8 for a pointer or 24 for a struct got_deltify_block. ok stsp@ commit - f9e9269513c7ee687c46d6060a784a9ca11646ce commit + fcece7180725bba9a781eaa892af379b1986208b blob - 99b5d418b8cec83d130d52be4e5022d0310f8c31 blob + a7db1e0a59efcecd2ba2b72df8e1ce42604415d4 --- lib/deltify.c +++ lib/deltify.c @@ -87,6 +87,9 @@ static const uint32_t geartab[256] = { 0xf1f6e72c, 0x5551128a, 0x83af87e2, 0x6f0342ba, }; +static const struct got_error *addblk(struct got_delta_table *, FILE *, + uint8_t *, off_t, off_t, off_t, uint32_t); + static uint32_t hashblk(const unsigned char *p, off_t n, uint32_t seed) { @@ -94,148 +97,153 @@ hashblk(const unsigned char *p, off_t n, uint32_t seed } static const struct got_error * -addblk(struct got_delta_table *dt, FILE *f, off_t file_offset0, off_t len, - off_t offset, uint32_t h) +lookup(int *n, struct got_delta_table *dt, FILE *f, off_t file_offset0, + off_t len, off_t offset, uint32_t h) { - const struct got_error *err = NULL; - int i; + struct got_delta_block *block; uint8_t buf[GOT_DELTIFY_MAXCHUNK]; uint8_t buf2[GOT_DELTIFY_MAXCHUNK]; size_t r = 0; + int i; - if (len == 0) - return NULL; + for (i = h % dt->size; dt->offs[i] != 0; i = (i + 1) % dt->size) { + block = &dt->blocks[dt->offs[i] - 1]; + if (block->len != len || block->hash != h) + continue; - i = h % dt->nalloc; - while (dt->blocks[i].len != 0) { - /* - * Avoid adding duplicate blocks. - * NB: A matching hash is insufficient for detecting equality. - * The hash can only detect inequality. - */ - if (len == dt->blocks[i].len && h == dt->blocks[i].hash) { - if (r == 0) { - if (fseeko(f, file_offset0 + offset, SEEK_SET) == -1) - return got_error_from_errno("fseeko"); - r = fread(buf, 1, len, f); - if (r != len) { - if (!ferror(f)) - return NULL; - return got_ferror(f, GOT_ERR_IO); - } - } - if (fseeko(f, file_offset0 + dt->blocks[i].offset, - SEEK_SET) == -1) + if (r == 0) { + if (fseeko(f, file_offset0 + offset, SEEK_SET) == -1) return got_error_from_errno("fseeko"); - if (fread(buf2, 1, len, f) != len) + r = fread(buf, 1, len, f); + if (r != len) { + if (!ferror(f)) + break; /* why? */ return got_ferror(f, GOT_ERR_IO); - if (memcmp(buf, buf2, len) == 0) - return NULL; + } } - i = (i + 1) % dt->nalloc; + if (fseeko(f, file_offset0 + block->offset, SEEK_SET) == -1) + return got_error_from_errno("fseeko"); + if (fread(buf2, 1, len, f) != len) + return got_ferror(f, GOT_ERR_IO); + if (memcmp(buf, buf2, len) == 0) + break; } - assert(dt->blocks[i].len == 0); - dt->blocks[i].len = len; - dt->blocks[i].offset = offset; - dt->blocks[i].hash = h; - dt->nblocks++; - if (dt->nalloc < dt->nblocks + 256) { - struct got_delta_block *db; - size_t old_size = dt->nalloc; - db = dt->blocks; - dt->blocks = calloc(dt->nalloc + 256, - sizeof(struct got_delta_block)); - if (dt->blocks == NULL) { - err = got_error_from_errno("calloc"); - dt->blocks = db; - return err; - } - dt->nalloc += 256; - /* - * Recompute all block positions. Hash-based indices of blocks - * in the array depend on the allocated length of the array. - */ - dt->nblocks = 0; - for (i = 0; i < old_size; i++) { - if (db[i].len == 0) - continue; - err = addblk(dt, f, file_offset0, db[i].len, - db[i].offset, db[i].hash); - if (err) + + *n = i; + return NULL; +} + +static const struct got_error * +lookup_mem(int *n, struct got_delta_table *dt, uint8_t *basedata, + off_t basefile_offset0, uint8_t *p, off_t len, uint32_t h) +{ + struct got_delta_block *block; + uint8_t *d; + int i; + + for (i = h % dt->size; dt->offs[i] != 0; i = (i + 1) % dt->size) { + block = &dt->blocks[dt->offs[i] - 1]; + if (len == block->len && h == block->hash) { + d = basedata + basefile_offset0 + block->offset; + if (memcmp(d, p, len) == 0) break; } - free(db); } - return err; + *n = i; + return NULL; } static const struct got_error * -addblk_mem(struct got_delta_table *dt, uint8_t *data, off_t file_offset0, - off_t len, off_t offset, uint32_t h) +resizeht(struct got_delta_table *dt, FILE *f, off_t offset0, uint8_t *data, + off_t len) { - const struct got_error *err = NULL; - int i; - uint8_t *block1; - uint8_t *block2; + const struct got_error *err; + struct got_delta_block *b; + size_t newsize, oldsize; + int i, n; - if (len == 0) - return NULL; + if (dt->nblocks == dt->nalloc) { + newsize = dt->nalloc + 256; + b = reallocarray(dt->blocks, newsize, sizeof(*dt->blocks)); + if (b == NULL) + return got_error_from_errno("reallocarray"); + dt->blocks = b; + dt->nalloc = newsize; + } - i = h % dt->nalloc; - while (dt->blocks[i].len != 0) { - /* - * Avoid adding duplicate blocks. - * NB: A matching hash is insufficient for detecting equality. - * The hash can only detect inequality. - */ - if (len == dt->blocks[i].len && h == dt->blocks[i].hash) { - block1 = data + file_offset0 + dt->blocks[i].offset; - block2 = data + file_offset0 + offset; - if (memcmp(block1, block2, len) == 0) - return NULL; + if (dt->blocks == NULL || dt->len * 100 / dt->size > 70) { + oldsize = dt->size; + dt->size *= 2; + free(dt->offs); + dt->offs = calloc(dt->size, sizeof(*dt->offs)); + if (dt->offs == NULL) { + dt->size = oldsize; + return got_error_from_errno("calloc"); } - i = (i + 1) % dt->nalloc; - } - assert(dt->blocks[i].len == 0); - dt->blocks[i].len = len; - dt->blocks[i].offset = offset; - dt->blocks[i].hash = h; - dt->nblocks++; - if (dt->nalloc < dt->nblocks + 256) { - struct got_delta_block *db; - size_t old_size = dt->nalloc; - db = dt->blocks; - dt->blocks = calloc(dt->nalloc + 256, - sizeof(struct got_delta_block)); - if (dt->blocks == NULL) { - err = got_error_from_errno("calloc"); - dt->blocks = db; - return err; + for (i = 0; i < dt->nblocks; ++i) { + b = &dt->blocks[i]; + if (f) + err = lookup(&n, dt, f, offset0, b->len, + b->offset, b->hash); + else + err = lookup_mem(&n, dt, data, offset0, + data + b->offset, b->len, b->hash); + if (err) { + free(dt->offs); + dt->offs = NULL; + dt->size = oldsize; + return err; + } + assert(dt->offs[n] == 0); + dt->offs[n] = i + 1; } - dt->nalloc += 256; - /* - * Recompute all block positions. Hash-based indices of blocks - * in the array depend on the allocated length of the array. - */ - dt->nblocks = 0; - for (i = 0; i < old_size; i++) { - if (db[i].len == 0) - continue; - err = addblk_mem(dt, data, file_offset0, db[i].len, - db[i].offset, db[i].hash); - if (err) - break; - } - free(db); } - return err; + return NULL; } static const struct got_error * +addblk(struct got_delta_table *dt, FILE *f, uint8_t *data, off_t file_offset0, + off_t len, off_t offset, uint32_t h) +{ + const struct got_error *err; + struct got_delta_block *block; + int i; + + if (len == 0) + return NULL; + + err = resizeht(dt, f, file_offset0, data, len); + if (err) + return err; + + if (f) + err = lookup(&i, dt, f, file_offset0, len, offset, h); + else + err = lookup_mem(&i, dt, data, file_offset0, data + offset, + len, h); + if (err) + return err; + + if (dt->offs[i] != 0) + return NULL; /* found */ + + dt->offs[i] = dt->nblocks + 1; + block = &dt->blocks[dt->nblocks]; + block->len = len; + block->offset = offset; + block->hash = h; + + dt->nblocks++; + dt->len++; + + return NULL; +} + +static const struct got_error * lookupblk(struct got_delta_block **block, struct got_delta_table *dt, unsigned char *p, off_t len, uint32_t seed, FILE *basefile, off_t basefile_offset0) @@ -243,24 +251,25 @@ lookupblk(struct got_delta_block **block, struct got_d int i; uint32_t h; uint8_t buf[GOT_DELTIFY_MAXCHUNK]; + struct got_delta_block *b; size_t r; *block = NULL; h = hashblk(p, len, seed); - for (i = h % dt->nalloc; dt->blocks[i].len != 0; - i = (i + 1) % dt->nalloc) { - if (dt->blocks[i].hash != h || - dt->blocks[i].len != len) + + for (i = h % dt->size; dt->offs[i] != 0; i = (i + 1) % dt->size) { + b = &dt->blocks[dt->offs[i] - 1]; + if (b->hash != h || b->len != len) continue; - if (fseeko(basefile, basefile_offset0 + dt->blocks[i].offset, + if (fseeko(basefile, basefile_offset0 + b->offset, SEEK_SET) == -1) return got_error_from_errno("fseeko"); r = fread(buf, 1, len, basefile); if (r != len) return got_ferror(basefile, GOT_ERR_IO); if (memcmp(p, buf, len) == 0) { - *block = &dt->blocks[i]; + *block = b; break; } } @@ -272,24 +281,17 @@ lookupblk_mem(struct got_delta_block **block, struct g unsigned char *p, off_t len, uint32_t seed, uint8_t *basedata, off_t basefile_offset0) { - int i; + const struct got_error *err; + int n; uint32_t h; - uint8_t *b; *block = NULL; - h = hashblk(p, len, seed); - for (i = h % dt->nalloc; dt->blocks[i].len != 0; - i = (i + 1) % dt->nalloc) { - if (dt->blocks[i].hash != h || - dt->blocks[i].len != len) - continue; - b = basedata + basefile_offset0 + dt->blocks[i].offset; - if (memcmp(p, b, len) == 0) { - *block = &dt->blocks[i]; - break; - } - } + err = lookup_mem(&n, dt, basedata, basefile_offset0, p, len, h); + if (err) + return err; + if (dt->offs[n] != 0) + *block = &dt->blocks[dt->offs[n] - 1]; return NULL; } @@ -370,6 +372,13 @@ got_deltify_init(struct got_delta_table **dt, FILE *f, goto done; } + (*dt)->size = 128; + (*dt)->offs = calloc((*dt)->size, sizeof(*(*dt)->offs)); + if ((*dt)->offs == NULL) { + err = got_error_from_errno("calloc"); + goto done; + } + if (fseeko(f, fileoffset, SEEK_SET) == -1) return got_error_from_errno("fseeko"); @@ -382,7 +391,7 @@ got_deltify_init(struct got_delta_table **dt, FILE *f, if (blocklen == 0) break; h = hashblk(buf, blocklen, seed); - err = addblk(*dt, f, offset0, blocklen, + err = addblk(*dt, f, NULL, offset0, blocklen, fileoffset - offset0, h); if (err) goto done; @@ -416,6 +425,13 @@ got_deltify_init_mem(struct got_delta_table **dt, uint (*dt)->nalloc = 128; (*dt)->blocks = calloc((*dt)->nalloc, sizeof(struct got_delta_block)); if ((*dt)->blocks == NULL) { + err = got_error_from_errno("calloc"); + goto done; + } + + (*dt)->size = 128; + (*dt)->offs = calloc((*dt)->size, sizeof(*(*dt)->offs)); + if ((*dt)->offs == NULL) { err = got_error_from_errno("calloc"); goto done; } @@ -428,7 +444,7 @@ got_deltify_init_mem(struct got_delta_table **dt, uint if (blocklen == 0) break; h = hashblk(data + fileoffset, blocklen, seed); - err = addblk_mem(*dt, data, offset0, blocklen, + err = addblk(*dt, NULL, data, offset0, blocklen, fileoffset - offset0, h); if (err) goto done; @@ -450,6 +466,7 @@ got_deltify_free(struct got_delta_table *dt) if (dt == NULL) return; free(dt->blocks); + free(dt->offs); free(dt); } blob - 23d51d665edd5927c4b9ca18e2ebebf3473b1795 blob + 973f1682b45bdf0b31cfe37ef1473caa9819418d --- lib/got_lib_deltify.h +++ lib/got_lib_deltify.h @@ -25,6 +25,14 @@ struct got_delta_table { struct got_delta_block *blocks; int nblocks; int nalloc; + + /* + * Index for blocks. offs[n] is zero when the slot is free, + * otherwise it points to blocks[offs[n] - 1]. + */ + uint32_t *offs; + int len; + int size; }; struct got_delta_instruction {