commit afd0da3878d7acfc3872448325112986efb1dbb9 from: Omar Polo via: Thomas Adam date: Thu Jun 22 13:10:34 2023 UTC gotadmin cleanup: remove redundant pack files improvements and ok stsp@ commit - 0b47703514fc6cbbe5318d575bb1cce7152aadcb commit + afd0da3878d7acfc3872448325112986efb1dbb9 blob - 5278c5830ad6b4af4c8d43dbbe466b6a4851dd25 blob + 030c504a6a789b22f0cf84f810b9b06dd2a011a0 --- gotadmin/gotadmin.1 +++ gotadmin/gotadmin.1 @@ -232,8 +232,9 @@ and a break-down of the number of objects per object t .Op Fl r Ar repository-path .Xc .Dl Pq alias: Cm cl -Purge unreferenced loose objects from the repository and display -the amount of disk space which has been freed as a result. +Purge unreferenced loose objects and redundant pack files from the +repository and display the amount of disk space which has been freed +as a result. .Pp Unreferenced objects are present in the repository but cannot be reached via any reference in the entire @@ -246,12 +247,18 @@ directory, spread across 256 sub-directories named after the 256 possible hexadecimal values of the first byte of an object identifier. .Pp -Packed objects stored in pack files under -.Pa objects/pack/ -will not be purged. -However, if redundant copies of packed objects exist in loose form, -such redundant copies will be purged. +Packed objects are stored in pack files under +.Pa objects/pack/ . +.Pp +If redundant copies of packed objects exist in loose form, such +redundant copies will be purged. +If all the objects of a pack file are present in other pack files, +the redundant pack file will be purged. .Pp +For compatibility with Git, if a matching file +.Pa .keep +exists for a given pack file, this pack file won't be removed. +.Pp Objects will usually become unreferenced as a result of deleting branches or tags with .Cm got branch -d @@ -261,6 +268,11 @@ Deleting arbitrary references with .Cm got ref -d may also leave unreferenced objects behind. 
.Pp +Pack files will usually become redundant as a result of repacking the +repository with +.Nm +.Cm pack Fl a . +.Pp In order to determine the set of objects which are referenced, search all references for commit objects and tag objects, and traverse the corresponding tree object hierarchies. blob - a72a024f791dd1baf5ea67c1f1416a7f42950866 blob + 5be1d1bbffc9287159df30b4dc3e4420bda2539b --- gotadmin/gotadmin.c +++ gotadmin/gotadmin.c @@ -1135,16 +1135,19 @@ struct got_cleanup_progress_arg { int last_nloose; int last_ncommits; int last_npurged; + int last_nredundant; int verbosity; int printed_something; int dry_run; }; static const struct got_error * -cleanup_progress(void *arg, int nloose, int ncommits, int npurged) +cleanup_progress(void *arg, int nloose, int ncommits, int npurged, + int nredundant) { struct got_cleanup_progress_arg *a = arg; int print_loose = 0, print_commits = 0, print_purged = 0; + int print_redundant = 0; if (a->last_nloose != nloose) { print_loose = 1; @@ -1161,11 +1164,15 @@ cleanup_progress(void *arg, int nloose, int ncommits, print_purged = 1; a->last_npurged = npurged; } + if (a->last_nredundant != nredundant) { + print_redundant = 1; + a->last_nredundant = nredundant; + } if (a->verbosity < 0) return NULL; - if (print_loose || print_commits || print_purged) + if (print_loose || print_commits || print_purged || print_redundant) printf("\r"); if (print_loose) printf("%d loose object%s", nloose, nloose == 1 ? "" : "s"); @@ -1181,7 +1188,16 @@ cleanup_progress(void *arg, int nloose, int ncommits, npurged == 1 ? "" : "s"); } } - if (print_loose || print_commits || print_purged) { + if (print_redundant) { + if (a->dry_run) { + printf("%d pack file%s could be purged", nredundant, + nredundant == 1 ? "" : "s"); + } else { + printf("%d pack file%s purged", nredundant, + nredundant == 1 ? 
"" : "s"); + } + } + if (print_loose || print_commits || print_purged || print_redundant) { a->printed_something = 1; fflush(stdout); } @@ -1221,10 +1237,14 @@ cmd_cleanup(int argc, char *argv[]) int remove_lonely_packidx = 0, ignore_mtime = 0; struct got_cleanup_progress_arg cpa; struct got_lonely_packidx_progress_arg lpa; - off_t size_before, size_after; - char scaled_before[FMT_SCALED_STRSIZE]; - char scaled_after[FMT_SCALED_STRSIZE]; - char scaled_diff[FMT_SCALED_STRSIZE]; + off_t loose_before, loose_after; + off_t pack_before, pack_after; + off_t total_size; + char loose_before_scaled[FMT_SCALED_STRSIZE]; + char loose_after_scaled[FMT_SCALED_STRSIZE]; + char pack_before_scaled[FMT_SCALED_STRSIZE]; + char pack_after_scaled[FMT_SCALED_STRSIZE]; + char total_size_scaled[FMT_SCALED_STRSIZE]; int *pack_fds = NULL; #ifndef PROFILE @@ -1298,37 +1318,67 @@ cmd_cleanup(int argc, char *argv[]) memset(&cpa, 0, sizeof(cpa)); cpa.last_ncommits = -1; cpa.last_npurged = -1; + cpa.last_nredundant = -1; cpa.dry_run = dry_run; cpa.verbosity = verbosity; + error = got_repo_purge_unreferenced_loose_objects(repo, - &size_before, &size_after, &npacked, dry_run, ignore_mtime, + &loose_before, &loose_after, &npacked, dry_run, ignore_mtime, cleanup_progress, &cpa, check_cancelled, NULL); if (cpa.printed_something) printf("\n"); + if (error) + goto done; + + cpa.printed_something = 0; + cpa.last_ncommits = -1; + cpa.last_npurged = -1; + cpa.last_nloose = -1; + cpa.last_nredundant = -1; + error = got_repo_purge_redundant_packfiles(repo, &pack_before, + &pack_after, dry_run, cleanup_progress, &cpa, + check_cancelled, NULL); if (error) goto done; + if (cpa.printed_something) + printf("\n"); + + total_size = (loose_before - loose_after) + (pack_before - pack_after); + if (cpa.printed_something) { - if (fmt_scaled(size_before, scaled_before) == -1) { + if (fmt_scaled(loose_before, loose_before_scaled) == -1) { error = got_error_from_errno("fmt_scaled"); goto done; } - if 
(fmt_scaled(size_after, scaled_after) == -1) { + if (fmt_scaled(loose_after, loose_after_scaled) == -1) { error = got_error_from_errno("fmt_scaled"); goto done; } - if (fmt_scaled(size_before - size_after, scaled_diff) == -1) { + if (fmt_scaled(pack_before, pack_before_scaled) == -1) { error = got_error_from_errno("fmt_scaled"); goto done; } - printf("loose total size before: %s\n", scaled_before); - printf("loose total size after: %s\n", scaled_after); + if (fmt_scaled(pack_after, pack_after_scaled) == -1) { + error = got_error_from_errno("fmt_scaled"); + goto done; + } + if (fmt_scaled(total_size, total_size_scaled) == -1) { + error = got_error_from_errno("fmt_scaled"); + goto done; + } + printf("loose total size before: %s\n", loose_before_scaled); + printf("loose total size after: %s\n", loose_after_scaled); + printf("pack files total size before: %s\n", + pack_before_scaled); + printf("pack files total size after: %s\n", pack_after_scaled); if (dry_run) { printf("disk space which would be freed: %s\n", - scaled_diff); + total_size_scaled); } else - printf("disk space freed: %s\n", scaled_diff); + printf("disk space freed: %s\n", total_size_scaled); printf("loose objects also found in pack files: %d\n", npacked); } + done: if (repo) got_repo_close(repo); blob - 5ddd50191b8acfb903877b148bb856de99014b07 blob + 57bb3f85a90ec6fdf48dc338be1a61ccab5a133c --- include/got_repository_admin.h +++ include/got_repository_admin.h @@ -70,7 +70,7 @@ got_repo_list_pack(FILE *packfile, struct got_object_i /* A callback function which gets invoked with cleanup information to print. 
*/ typedef const struct got_error *(*got_cleanup_progress_cb)(void *arg, - int nloose, int ncommits, int npurged); + int nloose, int ncommits, int npurged, int nredundant); /* * Walk objects reachable via references to determine whether any loose @@ -88,6 +88,12 @@ got_repo_purge_unreferenced_loose_objects(struct got_r int ignore_mtime, got_cleanup_progress_cb progress_cb, void *progress_arg, got_cancel_cb cancel_cb, void *cancel_arg); +const struct got_error * +got_repo_purge_redundant_packfiles(struct got_repository *repo, + off_t *before, off_t *size_after, int dry_run, + got_cleanup_progress_cb progress_cb, void *progress_arg, + got_cancel_cb cancel_cb, void *cancel_arg); + /* A callback function which gets invoked with cleanup information to print. */ typedef const struct got_error *(*got_lonely_packidx_progress_cb)(void *arg, const char *path); blob - 96c3b2f9a5dbefe4d4825ddaaa5115c2a7d187ff blob + 2187b12aa9b0522477825183261b30c78d674291 --- lib/repository_admin.c +++ lib/repository_admin.c @@ -625,7 +625,7 @@ report_cleanup_progress(got_cleanup_progress_cb progre if (err || !elapsed) return err; - return progress_cb(progress_arg, nloose, ncommits, npurged); + return progress_cb(progress_arg, nloose, ncommits, npurged, -1); } static const struct got_error * @@ -1150,7 +1150,7 @@ got_repo_purge_unreferenced_loose_objects(struct got_r if (nloose == 0) { got_object_idset_free(loose_ids); if (progress_cb) { - err = progress_cb(progress_arg, 0, 0, 0); + err = progress_cb(progress_arg, 0, 0, 0, -1); if (err) return err; } @@ -1214,13 +1214,205 @@ got_repo_purge_unreferenced_loose_objects(struct got_r /* Produce a final progress report. 
*/
 	if (progress_cb) {
-		err = progress_cb(progress_arg, nloose, ncommits, arg.npurged);
+		err = progress_cb(progress_arg, nloose, ncommits, arg.npurged,
+		    -1);
 		if (err)
 			goto done;
 	}
  done:
 	got_object_idset_free(loose_ids);
 	got_object_idset_free(traversed_ids);
+	return err;
+}
+
+/*
+ * Tally the size of every file which belongs to the pack indexed by
+ * packidx_path and unlink those files if the pack is to be removed.
+ *
+ * On entry *remove tells whether the caller considers the pack redundant.
+ * It is cleared here if a matching .keep file exists, in which case
+ * nothing is deleted.  The size of each file found is added to
+ * *size_before; sizes of files which are kept are also added to
+ * *size_after.  If dry_run is set, sizes are tallied as if the files
+ * had been deleted but nothing is unlinked.
+ *
+ * Paths are resolved relative to got_repo_get_fd(repo).
+ */
+static const struct got_error *
+purge_redundant_pack(struct got_repository *repo, const char *packidx_path,
+    int dry_run, int *remove, off_t *size_before, off_t *size_after)
+{
+	static const char *ext[] = {".idx", ".pack", ".rev", ".bitmap",
+	    ".promisor", ".mtimes"};
+	struct stat sb;
+	char *dot, path[PATH_MAX];
+	size_t i;
+
+	if (strlcpy(path, packidx_path, sizeof(path)) >=
+	    sizeof(path))
+		return got_error(GOT_ERR_NO_SPACE);
+
+	/*
+	 * The suffix is swapped in place below, so the path must have one.
+	 * strrchr() returns NULL if there is no '.' at all; do not
+	 * dereference it in that case.
+	 */
+	dot = strrchr(path, '.');
+	if (dot == NULL)
+		return got_error_path(packidx_path, GOT_ERR_BAD_PATH);
+
+	/*
+	 * For compatibility with Git, if a matching .keep file exists
+	 * don't delete the packfile.
+	 */
+	*dot = '\0';
+	if (strlcat(path, ".keep", sizeof(path)) >= sizeof(path))
+		return got_error(GOT_ERR_NO_SPACE);
+	if (faccessat(got_repo_get_fd(repo), path, F_OK, 0) == 0)
+		*remove = 0;
+
+	for (i = 0; i < nitems(ext); ++i) {
+		/* Cut the previous suffix off before appending the next. */
+		*dot = '\0';
+
+		if (strlcat(path, ext[i], sizeof(path)) >=
+		    sizeof(path))
+			return got_error(GOT_ERR_NO_SPACE);
+
+		if (fstatat(got_repo_get_fd(repo), path, &sb, 0) ==
+		    -1) {
+			/* Only the .pack and .idx files are mandatory. */
+			if (errno == ENOENT &&
+			    strcmp(ext[i], ".pack") != 0 &&
+			    strcmp(ext[i], ".idx") != 0)
+				continue;
+			return got_error_from_errno2("fstatat", path);
+		}
+
+		*size_before += sb.st_size;
+		if (!*remove) {
+			*size_after += sb.st_size;
+			continue;
+		}
+
+		if (dry_run)
+			continue;
+
+		if (unlinkat(got_repo_get_fd(repo), path, 0) == -1) {
+			/* The file vanished after fstatat(); not an error. */
+			if (errno == ENOENT)
+				continue;
+			return got_error_from_errno2("unlinkat",
+			    path);
+		}
+	}
+
+	return NULL;
+}
+
+static const struct got_error *
+pack_is_redundant(int *redundant, struct got_repository *repo,
+    const char *packidx_path, struct got_object_idset *idset)
+{
+	const struct got_error *err;
+	struct got_packidx *packidx;
+	struct got_packidx_object_id *pid;
+	struct got_object_id id;
+	size_t i, nobjects;
+
+	*redundant = 1;
+
+	err = got_repo_get_packidx(&packidx, packidx_path, repo);
+	if (err)
+		return err;
+
+	/* The last fanout table entry is used as the object count. */
+	nobjects = be32toh(packidx->hdr.fanout_table[0xff]);
+	for (i = 0; i < nobjects; ++i) {
+		pid = &packidx->hdr.sorted_ids[i];
+
+		memset(&id, 0, sizeof(id));
+		memcpy(&id.sha1, pid->sha1, sizeof(id.sha1));
+
+		if (got_object_idset_contains(idset, &id))
+			continue;
+
+		/*
+		 * An object which has not been seen yet makes this pack
+		 * non-redundant.  Record it so that packs examined later
+		 * are compared against it as well.
+		 */
+		*redundant = 0;
+		err = got_object_idset_add(idset, &id, NULL);
+		if (err)
+			return err;
+	}
+
+	return NULL;
+}
+
+/* Sort key for pack files: index path and number of objects. */
+struct pack_info {
+	const char	*path;
+	size_t		 nobjects;
+};
+
+/*
+ * qsort(3) comparison function: packs with more objects sort first,
+ * packs with an equal number of objects are ordered by path.
+ */
+static int
+pack_info_cmp(const void *a, const void *b)
+{
+	const struct pack_info *pa, *pb;
+
+	pa = a;
+	pb = b;
+	if (pa->nobjects == pb->nobjects)
+		return strcmp(pa->path, pb->path);
+	if (pa->nobjects > pb->nobjects)
+		return -1;
+	return 1;
+}
+
+/*
+ * Purge pack files all of whose objects were already found in the
+ * pack files examined before them.  Packs are examined in order of
+ * decreasing object count.
+ *
+ * *size_before and *size_after are set to zero and then accumulate
+ * the size of pack-related files found and kept, respectively.
+ * If dry_run is set nothing is deleted.  progress_cb, if not NULL, is
+ * invoked with the running count of purged packs in its last argument
+ * and -1 for the other counters.  cancel_cb, if not NULL, is polled
+ * before each pack is examined; an error it returns is passed back to
+ * the caller.
+ *
+ * Returns NULL without reporting progress if the repository has no
+ * pack index paths.
+ */
+const struct got_error *
+got_repo_purge_redundant_packfiles(struct got_repository *repo,
+    off_t *size_before, off_t *size_after, int dry_run,
+    got_cleanup_progress_cb progress_cb, void *progress_arg,
+    got_cancel_cb cancel_cb, void *cancel_arg)
+{
+	const struct got_error *err = NULL;
+	struct pack_info *pinfo, *sorted = NULL;
+	struct got_packidx *packidx;
+	struct got_object_idset *idset = NULL;
+	struct got_pathlist_entry *pe;
+	size_t i, npacks;
+	int remove, redundant_packs = 0;
+
+	*size_before = 0;
+	*size_after = 0;
+
+	npacks = 0;
+	TAILQ_FOREACH(pe, &repo->packidx_paths, entry)
+		npacks++;
+
+	if (npacks == 0)
+		return NULL;
+
+	sorted = calloc(npacks, sizeof(*sorted));
+	if (sorted == NULL)
+		return got_error_from_errno("calloc");
+
+	i = 0;
+	TAILQ_FOREACH(pe, &repo->packidx_paths, entry) {
+		err = got_repo_get_packidx(&packidx, pe->path, repo);
+		if (err)
+			goto done;
+
+		/* The path is borrowed from the list entry, not copied. */
+		pinfo = &sorted[i++];
+		pinfo->path = pe->path;
+		pinfo->nobjects = be32toh(packidx->hdr.fanout_table[0xff]);
+	}
+	qsort(sorted, npacks, sizeof(*sorted), pack_info_cmp);
+
+	idset = got_object_idset_alloc();
+	if (idset == NULL) {
+		err = got_error_from_errno("got_object_idset_alloc");
+		goto done;
+	}
+
+	for (i = 0; i < npacks; ++i) {
+		if (cancel_cb) {
+			/*
+			 * Jump straight to cleanup so that the final
+			 * progress report below cannot overwrite the
+			 * cancellation error.
+			 */
+			err = (*cancel_cb)(cancel_arg);
+			if (err)
+				goto done;
+		}
+
+		err = pack_is_redundant(&remove, repo, sorted[i].path, idset);
+		if (err)
+			goto done;
+		err = purge_redundant_pack(repo, sorted[i].path, dry_run,
+		    &remove, size_before, size_after);
+		if (err)
+			goto done;
+		if (!remove)
+			continue;
+		redundant_packs++;
+		if (progress_cb) {
+			err = progress_cb(progress_arg, -1, -1, -1,
+			    redundant_packs);
+			if (err)
+				goto done;
+		}
+	}
+
+	/* Produce a final progress report. */
+	if (progress_cb)
+		err = progress_cb(progress_arg, -1, -1, -1, redundant_packs);
+ done:
+	free(sorted);
+	if (idset)
+		got_object_idset_free(idset);
 	return err;
 }
blob - 2381958704d73cb1ebaeb0f7af8d2bf01b7d434f
blob + f6ddbb9657a29cfb443245457d038aee5b06d364
--- regress/cmdline/cleanup.sh
+++ regress/cmdline/cleanup.sh
@@ -236,7 +236,123 @@ test_cleanup_redundant_loose_objects() {
 	test_done "$testroot" "$ret"
 }
+
+test_cleanup_redundant_pack_files() {
+	local testroot=`test_init cleanup_redundant_pack_files`
+
+	# no pack files should exist yet
+
+	n=$(gotadmin info -r "$testroot/repo" | awk '/^pack files/{print $3}')
+	if [ "$n" -ne 0 ]; then
+		echo "expected no pack file to exists, $n found" >&2
+		test_done "$testroot" 1
+		return 1
+	fi
+
+	# create a redundant pack with an associated .keep file
+	hash=$(gotadmin pack -a -r "$testroot/repo" \
+		| awk '/^Indexed/{print $2}')
+	kpack="$testroot/repo/.git/objects/pack/pack-$hash"
+	touch "${kpack%.pack}.keep"
+
+	# create a few pack files with different objects
+	for i in `jot 5`; do
+		echo "alpha $i" > $testroot/repo/alpha
+		git_commit "$testroot/repo" -m "edit #$i"
+		gotadmin pack -r "$testroot/repo" >/dev/null
+	done
+
+	# create two packs with all the objects
+	gotadmin pack -a -r "$testroot/repo" >/dev/null
+	gotadmin pack -a -r "$testroot/repo" >/dev/null
+
+	gotadmin cleanup -r "$testroot/repo" | grep 'pack files? 
purged' \ + | tail -1 > $testroot/stdout + echo "5 pack files purged" > $testroot/stdout.expected + if cmp -s "$testroot/stdout.expected" "$testroot/stdout"; then + diff -u "$testroot/stdout.expected" "$testroot/stdout" + test_done "$testroot" 1 + return 1 + fi + + n=$(gotadmin info -r "$testroot/repo" | awk '/^pack files/{print $3}') + if [ "$n" -ne 2 ]; then + echo "expected 2 pack files left, $n found instead" >&2 + test_done "$testroot" 1 + return 1 + fi + + if [ ! -f "$kpack" ]; then + echo "$kpack disappeared unexpectedly" >&2 + test_done "$testroot" 1 + return 1 + fi + + if [ ! -f "${kpack%.pack}.keep" ]; then + echo "${kpack%.pack}.keep disappeared unexpectedly" >&2 + test_done "$testroot" 1 + return 1 + fi + + # create one more non-redundant pack + for i in `jot 5`; do + echo "alpha again $i" > $testroot/repo/alpha + git_commit "$testroot/repo" -m "edit $i" + done + gotadmin pack -r "$testroot/repo" >/dev/null + + gotadmin cleanup -r "$testroot/repo" | grep 'pack files? purged' \ + | tail -1 > $testroot/stdout + + echo "0 pack files purged" > $testroot/stdout.expected + if cmp -s "$testroot/stdout.expected" "$testroot/stdout"; then + diff -u "$testroot/stdout.expected" "$testroot/stdout" + test_done "$testroot" 1 + return 1 + fi + + n=$(gotadmin info -r "$testroot/repo" | awk '/^pack files/{print $3}') + if [ "$n" -ne 3 ]; then + echo "expected 3 pack files left, $n found instead" >&2 + test_done "$testroot" 1 + return 1 + fi + + # remove the .keep file + rm "${kpack%.pack}.keep" + + # create some commits on a separate branch + (cd "$testroot/repo" && git checkout -q -b newbranch) + + for i in `jot 5`; do + echo "alpha $i" > $testroot/repo/alpha + git_commit "$testroot/repo" -m "edit #$i" + gotadmin pack -r "$testroot/repo" >/dev/null + done + + gotadmin pack -a -x master -r "$testroot/repo" >/dev/null + + gotadmin cleanup -r "$testroot/repo" | grep 'pack files? 
purged' \ + | tail -1 > $testroot/stdout + + echo "6 pack files purged" > $testroot/stdout.expected + if cmp -s "$testroot/stdout.expected" "$testroot/stdout"; then + diff -u "$testroot/stdout.expected" "$testroot/stdout" + test_done "$testroot" 1 + return 1 + fi + + n=$(gotadmin info -r "$testroot/repo" | awk '/^pack files/{print $3}') + if [ "$n" -ne 3 ]; then + echo "expected 3 pack files left, $n found instead" >&2 + test_done "$testroot" 1 + return 1 + fi + + test_done "$testroot" 0 +} + test_cleanup_precious_objects() { local testroot=`test_init cleanup_precious_objects` @@ -365,5 +481,6 @@ test_cleanup_missing_pack_file() { test_parseargs "$@" run_test test_cleanup_unreferenced_loose_objects run_test test_cleanup_redundant_loose_objects +run_test test_cleanup_redundant_pack_files run_test test_cleanup_precious_objects run_test test_cleanup_missing_pack_file