commit - /dev/null
commit + 3b0f3d6191103b52a0619ed00752f7f5e6fa754c
blob - /dev/null
blob + 4ce1b320e06779fbf641c890bdb8a0d6dcea3ed9 (mode 644)
--- /dev/null
+++ .gitignore
+.*.sw?
+diff/diff
+*.o
+*.d
+*.a
+tags
+test/got*.diff
+test/verify.*
blob - /dev/null
blob + d908088b90fc58865c6f9352047c7d52f37c3a48 (mode 644)
--- /dev/null
+++ LICENCE
+ Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
+
+ Permission to use, copy, modify, and distribute this software for any
+ purpose with or without fee is hereby granted, provided that the above
+ copyright notice and this permission notice appear in all copies.
+
+ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
blob - /dev/null
blob + 2ff357ceee0eed252b191258de3fea36fa4ae92b (mode 644)
--- /dev/null
+++ README
+This is a collection of diff algorithms, to test various combinations.
+
+The initial aim was to provide a faster diff implementation for got
+(gameoftrees.org) with a BSD license, at the u2k20 OpenBSD hackathon.
+A side effect could be improving OpenBSD's /usr/bin/diff utility.
+
+At the time of writing, this is little more than a playground / benchmark basis
+/ diff algorithm analysis platform. What could be done:
+- add profiling and test series to rate diff algorithm combinations.
+- interface with / merge into got.
+
+The Myers and Patience Diff algorithm implementations found here are based on
+the explanations found in these blog post series:
+ https://blog.jcoglan.com/2017/02/12/the-myers-diff-algorithm-part-1/ ff.
+and
+ https://blog.jcoglan.com/2017/09/19/the-patience-diff-algorithm/ ff.
+-- possibly the single most comprehensive explanations of these algorithms.
+Many thanks for this valuable door opener!
+The source code itself is not based on the code found in those blogs, but
+written from scratch with the knowledge gained.
+
+Compile:
+ OpenBSD:
+ make -C diff
+ Linux:
+ make -f linux_Makefile -C diff
+
+Test:
+ make -C test/
blob - /dev/null
blob + b93701332fb1013c6f09d620e138075dc268925c (mode 644)
--- /dev/null
+++ diff/Makefile
+.PATH:${.CURDIR}/../lib
+
+.include "../diff-version.mk"
+
+PROG= diff
+SRCS= \
+ diff.c \
+ diff_atomize_text.c \
+ diff_main.c \
+ diff_myers.c \
+ diff_patience.c \
+ diff_output.c \
+ diff_output_plain.c \
+ diff_output_unidiff.c \
+ ${END}
+MAN = ${PROG}.1
+
+CPPFLAGS = -I${.CURDIR}/../include -I${.CURDIR}/../lib
+
+.if defined(PROFILE)
+LDADD = -lutil_p -lz_p -lc_p
+.else
+LDADD = -lutil -lz
+.endif
+
+.if ${DIFF_RELEASE} != "Yes"
+NOMAN = Yes
+.endif
+
+realinstall:
+ ${INSTALL} ${INSTALL_COPY} -o ${BINOWN} -g ${BINGRP} \
+ -m ${BINMODE} ${PROG} ${BINDIR}/${PROG}
+
+dist:
+ mkdir ../diff-${DIFF_VERSION}/diff
+ cp ${SRCS} ${MAN} ../diff-${DIFF_VERSION}/diff
+
+.include <bsd.prog.mk>
blob - /dev/null
blob + be68fea20a6bbf1f0e109ccd5ea2042919055f31 (mode 644)
--- /dev/null
+++ diff/diff.c
+/* Commandline diff utility to test diff implementations. */
+/*
+ * Copyright (c) 2018 Martin Pieuchot
+ * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <sys/mman.h>
+#include <sys/stat.h>
+
+#include <err.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#ifdef __linux__
+/* stupid shims to compile and test on linux */
+#define __dead
+
+static const char *getprogname()
+{
+ return "diff";
+}
+#endif
+
+__dead void usage(void);
+int diffreg(char *, char *, int);
+char *mmapfile(const char *, struct stat *);
+
+__dead void
+usage(void)
+{
+ fprintf(stderr, "usage: %s file1 file2\n", getprogname());
+ exit(1);
+}
+
+int
+main(int argc, char *argv[])
+{
+ int ch;
+
+ while ((ch = getopt(argc, argv, "")) != -1) {
+ switch (ch) {
+ default:
+ usage();
+ }
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ if (argc != 2)
+ usage();
+
+ return diffreg(argv[0], argv[1], 0);
+}
+
+#include <diff/diff_main.h>
+#include <diff/diff_output.h>
+
+const struct diff_algo_config myers, patience, myers_divide;
+
+const struct diff_algo_config myers = (struct diff_algo_config){
+ .impl = diff_algo_myers,
+ .permitted_state_size = 104 * sizeof(int),
+ .fallback_algo = &patience,
+};
+
+const struct diff_algo_config patience = (struct diff_algo_config){
+ .impl = diff_algo_patience,
+ .inner_algo = &patience, // After subdivision, do Patience again.
+ .fallback_algo = &myers_divide, // If subdivision failed, do Myers Divide et Impera.
+};
+
+const struct diff_algo_config myers_divide = (struct diff_algo_config){
+ .impl = diff_algo_myers_divide,
+ .inner_algo = &myers, // When division succeeded, start from the top.
+ // (fallback_algo = NULL implies diff_algo_none).
+};
+
+const struct diff_config diff_config = {
+ .atomize_func = diff_atomize_text_by_line,
+ .algo = &myers,
+};
+
+int
+diffreg(char *file1, char *file2, int flags)
+{
+ char *str1, *str2;
+ struct stat st1, st2;
+ struct diff_input_info info = {
+ .left_path = file1,
+ .right_path = file2,
+ };
+
+ str1 = mmapfile(file1, &st1);
+ str2 = mmapfile(file2, &st2);
+
+ diff_plain(stdout, &diff_config, &info, str1, st1.st_size, str2, st2.st_size);
+
+ munmap(str1, st1.st_size);
+ munmap(str2, st2.st_size);
+
+ return 0;
+}
+
+char *
+mmapfile(const char *path, struct stat *st)
+{
+ int fd;
+ char *p;
+
+ fd = open(path, O_RDONLY);
+ if (fd == -1)
+ err(2, "%s", path);
+
+ if (fstat(fd, st) == -1)
+ err(2, "%s", path);
+
+ if ((uintmax_t)st->st_size > SIZE_MAX)
+ errx(2, "%s: file too big to fit memory", path);
+
+ p = mmap(NULL, st->st_size, PROT_READ, MAP_PRIVATE, fd, 0);
+ if (p == MAP_FAILED)
+ err(2, "mmap");
+
+ close(fd);
+
+ return p;
+}
blob - /dev/null
blob + 62c7103d70ac32f292cc05f62c782305dd8798f6 (mode 644)
--- /dev/null
+++ diff/linux_Makefile
+CFLAGS = -fsanitize=address -fsanitize=undefined -g -O3
+
+diff: diff.c ../lib/libdiff.a
+ gcc $(CFLAGS) -I../include -o $@ $^
+
+
+../lib/libdiff.a: ../lib/*.[hc] ../include/diff/*.h
+ $(MAKE) -f linux_Makefile -C ../lib
+
+.PHONY: clean
+clean:
+ rm diff
+ $(MAKE) -f linux_Makefile -C ../lib clean
blob - /dev/null
blob + 1d14cd7ecfcc0a39e1935164b1480f4dffbf0b28 (mode 644)
--- /dev/null
+++ diff-version.mk
+DIFF_RELEASE=No
+DIFF_VERSION_NUMBER=0.1
+
+.if ${DIFF_RELEASE} == Yes
+DIFF_VERSION=${DIFF_VERSION_NUMBER}
+.else
+DIFF_VERSION=${DIFF_VERSION_NUMBER}-current
+.endif
blob - /dev/null
blob + 395468a8c29ce7c6d5ca5283a1a1aa247acb3d87 (mode 644)
--- /dev/null
+++ include/diff/arraylist.h
+/* Auto-reallocating array for arbitrary member types. */
+/*
+ * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#pragma once
+
+#ifdef __linux__
+/* stupid shims to compile and test on linux */
+#include <strings.h>
+static inline void *reallocarray(void *buf, size_t n, size_t member_size)
+{
+ return realloc(buf, n * member_size);
+}
+static inline void *recallocarray(void *buf, size_t oldn, size_t n, size_t member_size)
+{
+ buf = reallocarray(buf, n, member_size);
+ bzero(((char*)buf) + (oldn * member_size), (n - oldn) * member_size);
+ return buf;
+}
+#endif
+
+
+/* Usage:
+ *
+ * ARRAYLIST(any_type_t) list;
+ * // OR
+ * typedef ARRAYLIST(any_type_t) any_type_list_t;
+ * any_type_list_t list;
+ *
+ * ARRAYLIST_INIT(list, 128); // < number of (at first unused) members to add on each realloc
+ * any_type_t *x;
+ * while (bar) {
+ * ARRAYLIST_ADD(x, list); // < enlarges the allocated array as needed; list.head may change due to realloc
+ * if (!x)
+ * abort();
+ * *x = random_foo_value;
+ * }
+ * for (i = 0; i < list.len; i++)
+ * printf("%s", foo_to_str(list.head[i]));
+ * ARRAYLIST_FREE(list);
+ */
+#define ARRAYLIST(MEMBER_TYPE) \
+ struct { \
+ MEMBER_TYPE *head; \
+ unsigned int len; \
+ unsigned int allocated; \
+ unsigned int alloc_blocksize; \
+ }
+
+#define ARRAYLIST_INIT(ARRAY_LIST, ALLOC_BLOCKSIZE) do { \
+ (ARRAY_LIST).head = NULL; \
+ (ARRAY_LIST).len = 0; \
+ (ARRAY_LIST).allocated = 0; \
+ (ARRAY_LIST).alloc_blocksize = ALLOC_BLOCKSIZE; \
+ } while(0)
+
+#define ARRAYLIST_ADD(NEW_ITEM_P, ARRAY_LIST) do { \
+ if ((ARRAY_LIST).len && !(ARRAY_LIST).allocated) { \
+ NEW_ITEM_P = NULL; \
+ break; \
+ } \
+ if ((ARRAY_LIST).head == NULL || (ARRAY_LIST).allocated < (ARRAY_LIST).len + 1) { \
+ (ARRAY_LIST).allocated += (ARRAY_LIST).alloc_blocksize ? : 8; \
+ (ARRAY_LIST).head = recallocarray((ARRAY_LIST).head, (ARRAY_LIST).len, \
+ (ARRAY_LIST).allocated, sizeof(*(ARRAY_LIST).head)); \
+ }; \
+ if (!(ARRAY_LIST).head || (ARRAY_LIST).allocated < (ARRAY_LIST).len + 1) { \
+ NEW_ITEM_P = NULL; \
+ break; \
+ } \
+ (NEW_ITEM_P) = &(ARRAY_LIST).head[(ARRAY_LIST).len]; \
+ (ARRAY_LIST).len++; \
+ } while (0)
+
+#define ARRAYLIST_CLEAR(ARRAY_LIST) \
+ (ARRAY_LIST).len = 0
+
+#define ARRAYLIST_FREE(ARRAY_LIST) \
+ do { \
+ if ((ARRAY_LIST).head && (ARRAY_LIST).allocated) \
+ free((ARRAY_LIST).head); \
+ ARRAYLIST_INIT(ARRAY_LIST, (ARRAY_LIST).alloc_blocksize); \
+ } while(0)
blob - /dev/null
blob + 073e0a21c0a312a7c09f13092e23da0decdec43a (mode 644)
--- /dev/null
+++ include/diff/diff_main.h
+/* Generic infrastructure to implement various diff algorithms. */
+/*
+ * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#pragma once
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <errno.h>
+
+#include <diff/arraylist.h>
+
+/* List of all possible return codes of a diff invocation. */
+enum diff_rc {
+ DIFF_RC_USE_DIFF_ALGO_FALLBACK = -1,
+ DIFF_RC_OK = 0,
+ DIFF_RC_ENOTSUP = ENOTSUP,
+ DIFF_RC_ENOMEM = ENOMEM,
+ DIFF_RC_EINVAL = EINVAL,
+};
+
+struct diff_atom {
+ const uint8_t *at;
+ size_t len;
+
+ /* This hash is just a very cheap speed up for finding *mismatching* atoms. When hashes match, we still need to
+ * compare entire atoms to find out whether they are indeed identical or not. */
+ unsigned int hash;
+
+ /* State for the Patience Diff algorithm */
+ /* TODO: keep a separate array for the patience state */
+ struct {
+ bool unique_here;
+ bool unique_in_both;
+ struct diff_atom *pos_in_other;
+ struct diff_atom *prev_stack;
+ } patience;
+};
+
+static inline bool diff_atom_same(const struct diff_atom *left, const struct diff_atom *right)
+{
+ return left->hash == right->hash
+ && left->len == right->len
+ && memcmp(left->at, right->at, left->len) == 0;
+}
+
+/* For each file, there is a "root" struct diff_data referencing the entire file, which the atoms are parsed from. In
+ * recursion of diff algorithm, there may be "child" struct diff_data only referencing a subsection of the file,
+ * re-using the atoms parsing. For "root" structs, atoms_allocated will be nonzero, indicating that the array of atoms
+ * is owned by that struct. For "child" structs, atoms_allocated == 0, to indicate that the struct is referencing a
+ * subset of atoms. */
+struct diff_data {
+ const uint8_t *data;
+ size_t len;
+ ARRAYLIST(struct diff_atom) atoms;
+ struct diff_data *root;
+};
+
+void diff_data_free(struct diff_data *diff_data);
+
+/* The atom's index in the entire file. For atoms divided by lines of text, this yields the line number (starting with
+ * 0). Also works for diff_data that reference only a subsection of a file, always reflecting the global position in
+ * the file (and not the relative position within the subsection). */
+#define diff_atom_root_idx(DIFF_DATA, ATOM) \
+ ((ATOM) ? (ATOM) - ((DIFF_DATA)->root->atoms.head) : (DIFF_DATA)->root->atoms.len)
+
+/* The atom's index within DIFF_DATA. For atoms divided by lines of text, this yields the line number (starting with
+ * 0). */
+#define diff_atom_idx(DIFF_DATA, ATOM) \
+ ((ATOM) ? (ATOM) - ((DIFF_DATA)->atoms.head) : (DIFF_DATA)->atoms.len)
+
+#define foreach_diff_atom(ATOM, FIRST_ATOM, COUNT) \
+ for ((ATOM) = (FIRST_ATOM); \
+ (ATOM) && ((ATOM) >= (FIRST_ATOM)) && ((ATOM) - (FIRST_ATOM) < (COUNT)); \
+ (ATOM)++)
+
+#define diff_data_foreach_atom(ATOM, DIFF_DATA) \
+ foreach_diff_atom(ATOM, (DIFF_DATA)->atoms.head, (DIFF_DATA)->atoms.len)
+
+#define diff_data_foreach_atom_from(FROM, ATOM, DIFF_DATA) \
+ for ((ATOM) = (FROM); \
+ (ATOM) && ((ATOM) >= (DIFF_DATA)->atoms.head) && ((ATOM) - (DIFF_DATA)->atoms.head < (DIFF_DATA)->atoms.len); \
+ (ATOM)++)
+
+/* A diff chunk represents a set of atoms on the left and/or a set of atoms on the right.
+ *
+ * If solved == false:
+ * The diff algorithm has divided the source file, and this is a chunk that the inner_algo should run on next.
+ * The lines on the left should be diffed against the lines on the right.
+ * (If there are no left lines or no right lines, it implies solved == true, because there is nothing to diff.)
+ *
+ * If solved == true:
+ * If there are only left atoms, it is a chunk removing atoms from the left ("a minus chunk").
+ * If there are only right atoms, it is a chunk adding atoms from the right ("a plus chunk").
+ * If there are both left and right lines, it is a chunk of equal content on both sides,
+ * and left_count == right_count:
+ *
+ * - foo }
+ * - bar }-- diff_chunk{ left_start = &left.atoms.head[0], left_count = 3,
+ * - baz } right_start = NULL, right_count = 0 }
+ * moo }
+ * goo }-- diff_chunk{ left_start = &left.atoms.head[3], left_count = 3,
+ * zoo } right_start = &right.atoms.head[0], right_count = 3 }
+ * +loo }
+ * +roo }-- diff_chunk{ left_start = NULL, left_count = 0,
+ * +too } right_start = &right.atoms.head[3], right_count = 3 }
+ *
+ */
+struct diff_chunk {
+ bool solved;
+ struct diff_atom *left_start;
+ unsigned int left_count;
+ struct diff_atom *right_start;
+ unsigned int right_count;
+};
+
+typedef ARRAYLIST(struct diff_chunk) diff_chunk_arraylist_t;
+#define DIFF_RESULT_ALLOC_BLOCKSIZE 128
+
+struct diff_result {
+ enum diff_rc rc;
+ struct diff_data left;
+ struct diff_data right;
+ diff_chunk_arraylist_t chunks;
+};
+
+struct diff_state {
+ /* The final result passed to the original diff caller. */
+ struct diff_result *result;
+
+ /* The root diff_data is in result->left,right, these are (possibly) subsections of the root data. */
+ struct diff_data left;
+ struct diff_data right;
+
+ unsigned int recursion_depth_left;
+
+ /* Remaining chunks from one diff algorithm pass, if any solved == false chunks came up. */
+ diff_chunk_arraylist_t temp_result;
+};
+
+struct diff_chunk *diff_state_add_chunk(struct diff_state *state, bool solved,
+ struct diff_atom *left_start, unsigned int left_count,
+ struct diff_atom *right_start, unsigned int right_count);
+
+/* Signature of a utility function to divide both source files into diff atoms.
+ * It is possible that a (future) algorithm requires both source files to decide on atom split points, hence this gets
+ * both left and right to atomize at the same time.
+ * An example is diff_atomize_text_by_line() in diff_atomize_text.c.
+ *
+ * func_data: context pointer (free to be used by implementation).
+ * left: struct diff_data with left->data and left->len already set up, and left->atoms to be created.
+ * right: struct diff_data with right->data and right->len already set up, and right->atoms to be created.
+ */
+typedef enum diff_rc (*diff_atomize_func_t)(void *func_data, struct diff_data *left, struct diff_data *right);
+
+extern enum diff_rc diff_atomize_text_by_line(void *func_data, struct diff_data *left, struct diff_data *right);
+
+struct diff_algo_config;
+typedef enum diff_rc (*diff_algo_impl_t)(const struct diff_algo_config *algo_config, struct diff_state *state);
+
+/* Form a result with all left-side removed and all right-side added, i.e. no actual diff algorithm involved. */
+enum diff_rc diff_algo_none(const struct diff_algo_config *algo_config, struct diff_state *state);
+
+/* Myers Diff tracing from the start all the way through to the end, requiring quadratic amounts of memory. This can
+ * fail if the required space surpasses algo_config->permitted_state_size. */
+extern enum diff_rc diff_algo_myers(const struct diff_algo_config *algo_config, struct diff_state *state);
+
+/* Myers "Divide et Impera": tracing forwards from the start and backwards from the end to find a midpoint that divides
+ * the problem into smaller chunks. Requires only linear amounts of memory. */
+extern enum diff_rc diff_algo_myers_divide(const struct diff_algo_config *algo_config, struct diff_state *state);
+
+/* Patience Diff algorithm, which divides a larger diff into smaller chunks. For very specific scenarios, it may lead to
+ * a complete diff result by itself, but needs a fallback algo to solve chunks that don't have common-unique atoms. */
+extern enum diff_rc diff_algo_patience(const struct diff_algo_config *algo_config, struct diff_state *state);
+
+/* Diff algorithms to use, possibly nested. For example:
+ *
+ * struct diff_algo_config myers, patience, myers_divide;
+ *
+ * myers = (struct diff_algo_config){
+ * .impl = diff_algo_myers,
+ * .permitted_state_size = 32 * 1024 * 1024,
+ * .fallback_algo = &patience, // when too large, do diff_algo_patience
+ * };
+ *
+ * patience = (struct diff_algo_config){
+ * .impl = diff_algo_patience,
+ * .inner_algo = &patience, // After subdivision, do Patience again.
+ * .fallback_algo = &myers_divide, // If subdivision failed, do Myers Divide et Impera.
+ * };
+ *
+ * myers_divide = (struct diff_algo_config){
+ * .impl = diff_algo_myers_divide,
+ * .inner_algo = &myers, // When division succeeded, start from the top.
+ * // (fallback_algo = NULL implies diff_algo_none).
+ * };
+ *
+ * struct diff_config config = {
+ * .algo = &myers,
+ * ...
+ * };
+ * diff_main(&config, ...);
+ */
+struct diff_algo_config {
+ diff_algo_impl_t impl;
+
+ /* Fail this algo if it would use more than this amount of memory, and instead use fallback_algo
+ * (diff_algo_myers). permitted_state_size == 0 means no limitation. */
+ size_t permitted_state_size;
+
+ /* For algorithms that divide into smaller chunks, use this algorithm to solve the divided chunks. */
+ const struct diff_algo_config *inner_algo;
+
+ /* If the algorithm fails (e.g. diff_algo_myers_if_small needs too large state, or diff_algo_patience can't find
+ * any common-unique atoms), then use this algorithm instead. */
+ const struct diff_algo_config *fallback_algo;
+};
+
+struct diff_config {
+ diff_atomize_func_t atomize_func;
+ void *atomize_func_data;
+
+ const struct diff_algo_config *algo;
+
+ /* How deep to step into subdivisions of a source file, a paranoia / safety measure to guard against infinite
+ * loops through diff algorithms. When the maximum recursion is reached, employ diff_algo_none (i.e. remove all
+ * left atoms and add all right atoms). */
+ unsigned int max_recursion_depth;
+};
+
+struct diff_result *diff_main(const struct diff_config *config,
+ const uint8_t *left_data, size_t left_len,
+ const uint8_t *right_data, size_t right_len);
+void diff_result_free(struct diff_result *result);
blob - /dev/null
blob + 5d07e74d868098aab6819c41c9499f0f1b30040c (mode 644)
--- /dev/null
+++ include/diff/diff_output.h
+/* Diff output generators and invocation shims. */
+/*
+ * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <diff/diff_main.h>
+
+struct diff_input_info {
+ const char *arbitrary_info;
+ const char *left_path;
+ const char *right_path;
+};
+
+enum diff_rc diff_output_plain(FILE *dest, const struct diff_input_info *info,
+ const struct diff_result *result);
+enum diff_rc diff_plain(FILE *dest, const struct diff_config *diff_config,
+ const struct diff_input_info *info,
+ const char *left, int left_len, const char *right, int right_len);
+
+enum diff_rc diff_output_unidiff(FILE *dest, const struct diff_input_info *info,
+ const struct diff_result *result, unsigned int context_lines);
+enum diff_rc diff_unidiff(FILE *dest, const struct diff_config *diff_config,
+ const struct diff_input_info *info,
+ const char *left, int left_len, const char *right, int right_len,
+ unsigned int context_lines);
+
+enum diff_rc diff_output_info(FILE *dest, const struct diff_input_info *info);
+void diff_output_lines(FILE *dest, const char *prefix, struct diff_atom *start_atom, unsigned int count);
blob - /dev/null
blob + 72a597bf05c06f25fd94c1421712e178888a0ca3 (mode 644)
--- /dev/null
+++ lib/debug.h
+/*
+ * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#pragma once
+
+#define DEBUG 0
+
+#if DEBUG
+#include <stdio.h>
+#define print(args...) fprintf(stderr, ##args)
+#define debug print
+#define debug_dump dump
+#define debug_dump_atom dump_atom
+#define debug_dump_atoms dump_atoms
+#define debug_dump_myers_graph dump_myers_graph
+
+static inline void dump_atom(const struct diff_data *left, const struct diff_data *right, const struct diff_atom *atom)
+{
+ if (!atom) {
+ print("NULL atom\n");
+ return;
+ }
+ if (left)
+ print(" %3ld", diff_atom_root_idx(left, atom));
+ if (right && atom->patience.pos_in_other)
+ print(" %3ld", diff_atom_root_idx(right, atom->patience.pos_in_other));
+
+ print(" %s%s '", atom->patience.unique_here ? "u" : " ", atom->patience.unique_in_both ? "c" : " ");
+ const char *s;
+ for (s = atom->at; s < (const char*)(atom->at + atom->len); s++) {
+ if (*s == '\r')
+ print("\\r");
+ else if (*s == '\n')
+ print("\\n");
+ else if (*s < 32 || *s >= 127)
+ print("\\x%02x", *s);
+ else
+ print("%c", *s);
+ }
+ print("'\n");
+}
+
+static inline void dump_atoms(const struct diff_data *d, struct diff_atom *atom, unsigned int count)
+{
+ struct diff_atom *i;
+ foreach_diff_atom(i, atom, count) {
+ dump_atom(d, NULL, i);
+ }
+}
+
+static inline void dump(struct diff_data *d)
+{
+ dump_atoms(d, d->atoms.head, d->atoms.len);
+}
+
+static inline void dump_myers_graph(const struct diff_data *l, const struct diff_data *r,
+ int *kd)
+{
+ int x;
+ int y;
+ print(" ");
+ for (x = 0; x <= l->atoms.len; x++)
+ print("%2d", x);
+ print("\n");
+
+ for (y = 0; y <= r->atoms.len; y++) {
+ print("%2d ", y);
+ for (x = 0; x <= l->atoms.len; x++) {
+
+ /* print d advancements from kd, if any. */
+ int d = -1;
+ if (kd) {
+ int max = l->atoms.len + r->atoms.len;
+ size_t kd_len = max + 1 + max;
+ int *kd_pos = kd;
+ int di;
+#define xk_to_y(X, K) ((X) - (K))
+ for (di = 0; di < max; di++) {
+ int ki;
+ for (ki = di; ki >= -di; ki -= 2) {
+ if (x == kd_pos[ki]
+ && y == xk_to_y(x, ki)) {
+ d = di;
+ break;
+ }
+ }
+ if (d >= 0)
+ break;
+ kd_pos += kd_len;
+ }
+ }
+ if (d >= 0)
+ print("%d", d);
+ else
+ print("o");
+ if (x < l->atoms.len && d < 10)
+ print("-");
+ }
+ print("\n");
+ if (y == r->atoms.len)
+ break;
+
+ print(" ");
+ for (x = 0; x < l->atoms.len; x++) {
+ if (diff_atom_same(&l->atoms.head[x], &r->atoms.head[y]))
+ print("|\\");
+ else
+ print("| ");
+ }
+ print("|\n");
+ }
+}
+
+#else
+#define debug(args...)
+#define debug_dump(args...)
+#define debug_dump_atom(args...)
+#define debug_dump_atoms(args...)
+#define debug_dump_myers_graph(args...)
+#endif
blob - /dev/null
blob + 964c1353da1b055ae761ae6961c45ea19e16d032 (mode 644)
--- /dev/null
+++ lib/diff_atomize_text.c
+/* Split source by line breaks, and calculate a simplistic checksum. */
+/*
+ * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <diff/diff_main.h>
+
+static int diff_data_atomize_text_lines(struct diff_data *d)
+{
+ const uint8_t *pos = d->data;
+ const uint8_t *end = pos + d->len;
+ enum diff_rc rc;
+
+ unsigned int array_size_estimate = d->len / 50;
+ unsigned int pow2 = 1;
+ while (array_size_estimate >>= 1)
+ pow2++;
+
+ ARRAYLIST_INIT(d->atoms, 1 << pow2);
+
+ while (pos < end) {
+ const uint8_t *line_end = pos;
+ unsigned int hash = 0;
+
+ while (line_end < end && *line_end != '\r' && *line_end != '\n') {
+ hash = hash * 23 + *line_end;
+ line_end++;
+ }
+
+ /* When not at the end of data, the line ending char ('\r' or '\n') must follow */
+ if (line_end < end)
+ line_end++;
+ /* If that was an '\r', also pull in any following '\n' */
+ if (line_end[0] == '\r' && line_end < end && line_end[1] == '\n')
+ line_end++;
+
+ /* Record the found line as diff atom */
+ struct diff_atom *atom;
+ ARRAYLIST_ADD(atom, d->atoms);
+ if (!atom)
+ return DIFF_RC_ENOMEM;
+
+ *atom = (struct diff_atom){
+ .at = pos,
+ .len = line_end - pos,
+ .hash = hash,
+ };
+
+ /* Starting point for next line: */
+ pos = line_end;
+ }
+
+ return DIFF_RC_OK;
+}
+
+enum diff_rc diff_atomize_text_by_line(void *func_data, struct diff_data *left, struct diff_data *right)
+{
+ enum diff_rc rc;
+ rc = diff_data_atomize_text_lines(left);
+ if (rc != DIFF_RC_OK)
+ return rc;
+ return diff_data_atomize_text_lines(right);
+}
blob - /dev/null
blob + 285bd40d5941fe84c5d9a9bd437a4a9f981ead13 (mode 644)
--- /dev/null
+++ lib/diff_main.c
+/* Generic infrastructure to implement various diff algorithms (implementation). */
+/*
+ * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+
+#include <sys/queue.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <string.h>
+#include <limits.h>
+
+#include <assert.h>
+
+#include <diff/diff_main.h>
+
+#include "debug.h"
+
+/* Even if a left or right side is empty, diff output may need to know the position in that file.
+ * So left_start or right_start must never be NULL -- pass left_count or right_count as zero to indicate staying at that
+ * position without consuming any lines. */
+struct diff_chunk *diff_state_add_chunk(struct diff_state *state, bool solved,
+ struct diff_atom *left_start, unsigned int left_count,
+ struct diff_atom *right_start, unsigned int right_count)
+{
+ struct diff_chunk *chunk;
+ diff_chunk_arraylist_t *result;
+
+ if (solved && !state->temp_result.len)
+ result = &state->result->chunks;
+ else
+ result = &state->temp_result;
+
+ ARRAYLIST_ADD(chunk, *result);
+ if (!chunk)
+ return NULL;
+ *chunk = (struct diff_chunk){
+ .solved = solved,
+ .left_start = left_start,
+ .left_count = left_count,
+ .right_start = right_start,
+ .right_count = right_count,
+ };
+
+ debug("Add %s chunk:\n", chunk->solved ? "solved" : "UNSOLVED");
+ debug("L\n");
+ debug_dump_atoms(&state->left, chunk->left_start, chunk->left_count);
+ debug("R\n");
+ debug_dump_atoms(&state->right, chunk->right_start, chunk->right_count);
+ return chunk;
+}
+
+void diff_data_init_root(struct diff_data *d, const uint8_t *data, size_t len)
+{
+ *d = (struct diff_data){
+ .data = data,
+ .len = len,
+ .root = d,
+ };
+}
+
+void diff_data_init_subsection(struct diff_data *d, struct diff_data *parent,
+ struct diff_atom *from_atom, unsigned int atoms_count)
+{
+ struct diff_atom *last_atom = from_atom + atoms_count - 1;
+ *d = (struct diff_data){
+ .data = from_atom->at,
+ .len = (last_atom->at + last_atom->len) - from_atom->at,
+ .root = parent->root,
+ .atoms.head = from_atom,
+ .atoms.len = atoms_count,
+ };
+
+ debug("subsection:\n");
+ debug_dump(d);
+}
+
+void diff_data_free(struct diff_data *diff_data)
+{
+ if (!diff_data)
+ return;
+ if (diff_data->atoms.allocated)
+ ARRAYLIST_FREE(diff_data->atoms);
+}
+
+enum diff_rc diff_algo_none(const struct diff_algo_config *algo_config, struct diff_state *state)
+{
+ debug("\n** %s\n", __func__);
+ debug("left:\n");
+ debug_dump(&state->left);
+ debug("right:\n");
+ debug_dump(&state->right);
+ debug_dump_myers_graph(&state->left, &state->right, NULL);
+
+ /* Add a chunk of equal lines, if any */
+ unsigned int equal_atoms = 0;
+ while (equal_atoms < state->left.atoms.len && equal_atoms < state->right.atoms.len
+ && diff_atom_same(&state->left.atoms.head[equal_atoms], &state->right.atoms.head[equal_atoms]))
+ equal_atoms++;
+ if (equal_atoms) {
+ if (!diff_state_add_chunk(state, true,
+ &state->left.atoms.head[0], equal_atoms,
+ &state->right.atoms.head[0], equal_atoms))
+ return DIFF_RC_ENOMEM;
+ }
+
+ /* Add a "minus" chunk with all lines from the left. */
+ if (equal_atoms < state->left.atoms.len) {
+ if (!diff_state_add_chunk(state, true,
+ &state->left.atoms.head[equal_atoms],
+ state->left.atoms.len - equal_atoms,
+ NULL, 0))
+ return DIFF_RC_ENOMEM;
+ }
+
+ /* Add a "plus" chunk with all lines from the right. */
+ if (equal_atoms < state->right.atoms.len) {
+ if (!diff_state_add_chunk(state, true,
+ NULL, 0,
+ &state->right.atoms.head[equal_atoms],
+ state->right.atoms.len - equal_atoms))
+ return DIFF_RC_ENOMEM;
+ }
+ return DIFF_RC_OK;
+}
+
+enum diff_rc diff_run_algo(const struct diff_algo_config *algo_config, struct diff_state *state)
+{
+ enum diff_rc rc;
+ ARRAYLIST_FREE(state->temp_result);
+
+ if (!algo_config || !algo_config->impl || !state->recursion_depth_left) {
+ debug("MAX RECURSION REACHED, just dumping diff chunks\n");
+ return diff_algo_none(algo_config, state);
+ }
+
+ ARRAYLIST_INIT(state->temp_result, DIFF_RESULT_ALLOC_BLOCKSIZE);
+ rc = algo_config->impl(algo_config, state);
+ switch (rc) {
+ case DIFF_RC_USE_DIFF_ALGO_FALLBACK:
+ debug("Got DIFF_RC_USE_DIFF_ALGO_FALLBACK (%p)\n", algo_config->fallback_algo);
+ rc = diff_run_algo(algo_config->fallback_algo, state);
+ goto return_rc;
+
+ case DIFF_RC_OK:
+ /* continue below */
+ break;
+
+ default:
+ /* some error happened */
+ goto return_rc;
+ }
+
+ /* Pick up any diff chunks that are still unsolved and feed to inner_algo.
+ * inner_algo will solve unsolved chunks and append to result,
+ * and subsequent solved chunks on this level are then appended to result afterwards. */
+ int i;
+ for (i = 0; i < state->temp_result.len; i++) {
+ struct diff_chunk *c = &state->temp_result.head[i];
+ if (c->solved) {
+ struct diff_chunk *final_c;
+ ARRAYLIST_ADD(final_c, state->result->chunks);
+ if (!final_c) {
+ rc = DIFF_RC_ENOMEM;
+ goto return_rc;
+ }
+ *final_c = *c;
+ continue;
+ }
+
+ /* c is an unsolved chunk, feed to inner_algo */
+ struct diff_state inner_state = {
+ .result = state->result,
+ .recursion_depth_left = state->recursion_depth_left - 1,
+ };
+ diff_data_init_subsection(&inner_state.left, &state->left, c->left_start, c->left_count);
+ diff_data_init_subsection(&inner_state.right, &state->right, c->right_start, c->right_count);
+
+ rc = diff_run_algo(algo_config->inner_algo, &inner_state);
+
+ if (rc != DIFF_RC_OK)
+ goto return_rc;
+ }
+
+ rc = DIFF_RC_OK;
+return_rc:
+ ARRAYLIST_FREE(state->temp_result);
+ return rc;
+}
+
+struct diff_result *diff_main(const struct diff_config *config,
+ const uint8_t *left_data, size_t left_len,
+ const uint8_t *right_data, size_t right_len)
+{
+ struct diff_result *result = malloc(sizeof(struct diff_result));
+ if (!result)
+ return NULL;
+
+ *result = (struct diff_result){};
+ diff_data_init_root(&result->left, left_data, left_len);
+ diff_data_init_root(&result->right, right_data, right_len);
+
+ if (!config->atomize_func) {
+ result->rc = DIFF_RC_EINVAL;
+ return result;
+ }
+
+ result->rc = config->atomize_func(config->atomize_func_data, &result->left, &result->right);
+ if (result->rc != DIFF_RC_OK)
+ return result;
+
+ struct diff_state state = {
+ .result = result,
+ .recursion_depth_left = config->max_recursion_depth ? : 1024,
+ };
+ diff_data_init_subsection(&state.left, &result->left, result->left.atoms.head, result->left.atoms.len);
+ diff_data_init_subsection(&state.right, &result->right, result->right.atoms.head, result->right.atoms.len);
+
+ result->rc = diff_run_algo(config->algo, &state);
+
+ return result;
+}
+
+void diff_result_free(struct diff_result *result)
+{
+ if (!result)
+ return;
+ diff_data_free(&result->left);
+ diff_data_free(&result->right);
+ ARRAYLIST_FREE(result->chunks);
+ free(result);
+}
blob - /dev/null
blob + cba927a4a5813ed3a1afa1247227a9637271e176 (mode 644)
--- /dev/null
+++ lib/diff_myers.c
+/* Myers diff algorithm implementation, invented by Eugene W. Myers [1].
+ * Implementations of both the Myers Divide Et Impera (using linear space)
+ * and the canonical Myers algorithm (using quadratic space). */
+/*
+ * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <diff/diff_main.h>
+
+#include "debug.h"
+
+/* Myers' diff algorithm [1] is nicely explained in [2].
+ * [1] http://www.xmailserver.org/diff2.pdf
+ * [2] https://blog.jcoglan.com/2017/02/12/the-myers-diff-algorithm-part-1/ ff.
+ *
+ * Myers approaches finding the smallest diff as a graph problem.
+ * The crux is that the original algorithm requires quadratic amount of memory:
+ * both sides' lengths added, and that squared. So if we're diffing lines of text, two files with 1000 lines each would
+ * blow up to a matrix of about 2000 * 2000 ints of state, about 16 Mb of RAM to figure out 2 kb of text.
+ * The solution is using Myers' "divide and conquer" extension algorithm, which does the original traversal from both
+ * ends of the files to reach a middle where these "snakes" touch, hence does not need to backtrace the traversal, and
+ * so gets away with only keeping a single column of that huge state matrix in memory.
+ *
+ * Todo: the divide and conquer requires linear *space*, not necessarily linear *time*. It recurses, apparently doing
+ * multiple Myers passes, and also it apparently favors fragmented diffs in cases where chunks of text were moved to a
+ * different place. Up to a given count of diff atoms (text lines), it might be desirable to accept the quadratic memory
+ * usage, get nicer diffs and less re-iteration of the same data?
+ */
+
+struct diff_box {
+ unsigned int left_start;
+ unsigned int left_end;
+ unsigned int right_start;
+ unsigned int right_end;
+};
+
+#define diff_box_empty(DIFF_SNAKE) ((DIFF_SNAKE)->left_end == 0)
+
+
+/* If the two contents of a file are A B C D E and X B C Y,
+ * the Myers diff graph looks like:
+ *
+ * k0 k1
+ * \ \
+ * k-1 0 1 2 3 4 5
+ * \ A B C D E
+ * 0 o-o-o-o-o-o
+ * X | | | | | |
+ * 1 o-o-o-o-o-o
+ * B | |\| | | |
+ * 2 o-o-o-o-o-o
+ * C | | |\| | |
+ * 3 o-o-o-o-o-o
+ * Y | | | | | |\
+ * 4 o-o-o-o-o-o c1
+ * \ \
+ * c-1 c0
+ *
+ * Moving right means delete an atom from the left-hand-side,
+ * Moving down means add an atom from the right-hand-side.
+ * Diagonals indicate identical atoms on both sides, the challenge is to use as many diagonals as possible.
+ *
+ * The original Myers algorithm walks all the way from the top left to the bottom right, remembers all steps, and then
+ * backtraces to find the shortest path. However, that requires keeping the entire graph in memory, which needs
+ * quadratic space.
+ *
+ * Myers adds a variant that uses linear space -- note, not linear time, only linear space: walk forward and backward,
+ * find a meeting point in the middle, and recurse on the two separate sections. This is called "divide and conquer".
+ *
+ * d: the step number, starting with 0, a.k.a. the distance from the starting point.
+ * k: relative index in the state array for the forward scan, indicating on which diagonal through the diff graph we
+ * currently are.
+ * c: relative index in the state array for the backward scan, indicating the diagonal number from the bottom up.
+ *
+ * The "divide and conquer" traversal through the Myers graph looks like this:
+ *
+ * | d= 0 1 2 3 2 1 0
+ * ----+--------------------------------------------
+ * k= | c=
+ * 4 | 3
+ * |
+ * 3 | 3,0 5,2 2
+ * | / \
+ * 2 | 2,0 5,3 1
+ * | / \
+ * 1 | 1,0 4,3 >= 4,3 5,4<-- 0
+ * | / / \ /
+ * 0 | -->0,0 3,3 4,4 -1
+ * | \ / /
+ * -1 | 0,1 1,2 3,4 -2
+ * | \ /
+ * -2 | 0,2 -3
+ * | \
+ * | 0,3
+ * | forward-> <-backward
+ *
+ * x,y pairs here are the coordinates in the Myers graph:
+ * x = atom index in left-side source, y = atom index in the right-side source.
+ *
+ * Only one forward column and one backward column are kept in mem, each need at most left.len + 1 + right.len items.
+ * Note that each d step occupies either the even or the odd items of a column: if e.g. the previous column is in the
+ * odd items, the next column is formed in the even items, without overwriting the previous column's results.
+ *
+ * Also note that from the diagonal index k and the x coordinate, the y coordinate can be derived:
+ * y = x - k
+ * Hence the state array only needs to keep the x coordinate, i.e. the position in the left-hand file, and the y
+ * coordinate, i.e. position in the right-hand file, is derived from the index in the state array.
+ *
+ * The two traces meet at 4,3, the first step (here found in the forward traversal) where a forward position is on or
+ * past a backward traced position on the same diagonal.
+ *
+ * This divides the problem space into:
+ *
+ * 0 1 2 3 4 5
+ * A B C D E
+ * 0 o-o-o-o-o
+ * X | | | | |
+ * 1 o-o-o-o-o
+ * B | |\| | |
+ * 2 o-o-o-o-o
+ * C | | |\| |
+ * 3 o-o-o-o-*-o *: forward and backward meet here
+ * Y | |
+ * 4 o-o
+ *
+ * Doing the same on each section lead to:
+ *
+ * 0 1 2 3 4 5
+ * A B C D E
+ * 0 o-o
+ * X | |
+ * 1 o-b b: backward d=1 first reaches here (sliding up the snake)
+ * B \ f: then forward d=2 reaches here (sliding down the snake)
+ * 2 o As result, the box from b to f is found to be identical;
+ * C \ leaving a top box from 0,0 to 1,1 and a bottom trivial tail 3,3 to 4,3.
+ * 3 f-o
+ *
+ * 3 o-*
+ * Y |
+ * 4 o *: forward and backward meet here
+ *
+ * and solving the last top left box gives:
+ *
+ * 0 1 2 3 4 5
+ * A B C D E -A
+ * 0 o-o +X
+ * X | B
+ * 1 o C
+ * B \ -D
+ * 2 o -E
+ * C \ +Y
+ * 3 o-o-o
+ * Y |
+ * 4 o
+ *
+ */
+
+#define xk_to_y(X, K) ((X) - (K))
+#define xc_to_y(X, C, DELTA) ((X) - (C) + (DELTA))
+#define k_to_c(K, DELTA) ((K) + (DELTA))
+#define c_to_k(C, DELTA) ((C) - (DELTA))
+
+/* Do one forwards step in the "divide and conquer" graph traversal.
+ * left: the left side to diff.
+ * right: the right side to diff against.
+ * kd_forward: the traversal state for forwards traversal, modified by this function.
+ * This is carried over between invocations with increasing d.
+ * kd_forward points at the center of the state array, allowing negative indexes.
+ * kd_backward: the traversal state for backwards traversal, to find a meeting point.
+ * Since forwards is done first, kd_backward will be valid for d - 1, not d.
+ * kd_backward points at the center of the state array, allowing negative indexes.
+ * d: Step or distance counter, indicating for what value of d the kd_forward should be populated.
+ * For d == 0, kd_forward[0] is initialized, i.e. the first invocation should be for d == 0.
+ * meeting_snake: resulting meeting point, if any.
+ */
+static void diff_divide_myers_forward(struct diff_data *left, struct diff_data *right,
+ int *kd_forward, int *kd_backward, int d,
+ struct diff_box *meeting_snake)
+{
+ int delta = (int)right->atoms.len - (int)left->atoms.len;
+ int prev_x;
+ int prev_y;
+ int k;
+ int x;
+
+ debug("-- %s d=%d\n", __func__, d);
+ debug_dump_myers_graph(left, right, NULL);
+
+ for (k = d; k >= -d; k -= 2) {
+ if (k < -(int)right->atoms.len || k > (int)left->atoms.len) {
+ /* This diagonal is completely outside of the Myers graph, don't calculate it. */
+ if (k < -(int)right->atoms.len)
+ debug(" %d k < -(int)right->atoms.len %d\n", k, -(int)right->atoms.len);
+ else
+ debug(" %d k > left->atoms.len %d\n", k, left->atoms.len);
+ if (k < 0) {
+ /* We are traversing negatively, and already below the entire graph, nothing will come
+ * of this. */
+ debug(" break");
+ break;
+ }
+ debug(" continue");
+ continue;
+ }
+ debug("- k = %d\n", k);
+ if (d == 0) {
+ /* This is the initializing step. There is no prev_k yet, get the initial x from the top left of
+ * the Myers graph. */
+ x = 0;
+ }
+ /* Favoring "-" lines first means favoring moving rightwards in the Myers graph.
+ * For this, all k should derive from k - 1, only the bottom most k derive from k + 1:
+ *
+ * | d= 0 1 2
+ * ----+----------------
+ * k= |
+ * 2 | 2,0 <-- from prev_k = 2 - 1 = 1
+ * | /
+ * 1 | 1,0
+ * | /
+ * 0 | -->0,0 3,3
+ * | \\ /
+ * -1 | 0,1 <-- bottom most for d=1 from prev_k = -1 + 1 = 0
+ * | \\
+ * -2 | 0,2 <-- bottom most for d=2 from prev_k = -2 + 1 = -1
+ *
+ * Except when a k + 1 from a previous run already means a further advancement in the graph.
+ * If k == d, there is no k + 1 and k - 1 is the only option.
+ * If k < d, use k + 1 in case that yields a larger x. Also use k + 1 if k - 1 is outside the graph.
+ */
+ else if (k > -d && (k == d
+ || (k - 1 >= -(int)right->atoms.len
+ && kd_forward[k - 1] >= kd_forward[k + 1]))) {
+ /* Advance from k - 1.
+ * From position prev_k, step to the right in the Myers graph: x += 1.
+ */
+ int prev_k = k - 1;
+ prev_x = kd_forward[prev_k];
+ prev_y = xk_to_y(prev_x, prev_k);
+ x = prev_x + 1;
+ } else {
+ /* The bottom most one.
+ * From position prev_k, step to the bottom in the Myers graph: y += 1.
+ * Incrementing y is achieved by decrementing k while keeping the same x.
+ * (since we're deriving y from y = x - k).
+ */
+ int prev_k = k + 1;
+ prev_x = kd_forward[prev_k];
+ prev_y = xk_to_y(prev_x, prev_k);
+ x = prev_x;
+ }
+
+ /* Slide down any snake that we might find here. */
+ while (x < left->atoms.len && xk_to_y(x, k) < right->atoms.len
+ && diff_atom_same(&left->atoms.head[x], &right->atoms.head[xk_to_y(x, k)]))
+ x++;
+ kd_forward[k] = x;
+
+ if (DEBUG) {
+ int fi;
+ for (fi = d; fi >= k; fi--) {
+ debug("kd_forward[%d] = (%d, %d)\n", fi, kd_forward[fi], kd_forward[fi] - fi);
+ /*
+ if (kd_forward[fi] >= 0 && kd_forward[fi] < left->atoms.len)
+ debug_dump_atom(left, right, &left->atoms.head[kd_forward[fi]]);
+ else
+ debug("\n");
+ if (kd_forward[fi]-fi >= 0 && kd_forward[fi]-fi < right->atoms.len)
+ debug_dump_atom(right, left, &right->atoms.head[kd_forward[fi]-fi]);
+ else
+ debug("\n");
+ */
+ }
+ }
+
+ if (x < 0 || x > left->atoms.len
+ || xk_to_y(x, k) < 0 || xk_to_y(x, k) > right->atoms.len)
+ continue;
+
+ /* Figured out a new forwards traversal, see if this has gone onto or even past a preceding backwards
+ * traversal.
+ *
+ * If the delta in length is odd, then d and backwards_d hit the same state indexes:
+ * | d= 0 1 2 1 0
+ * ----+---------------- ----------------
+ * k= | c=
+ * 4 | 3
+ * |
+ * 3 | 2
+ * | same
+ * 2 | 2,0====5,3 1
+ * | / \
+ * 1 | 1,0 5,4<-- 0
+ * | / /
+ * 0 | -->0,0 3,3====4,4 -1
+ * | \ /
+ * -1 | 0,1 -2
+ * | \
+ * -2 | 0,2 -3
+ * |
+ *
+ * If the delta is even, they end up off-by-one, i.e. on different diagonals:
+ *
+ * | d= 0 1 2 1 0
+ * ----+---------------- ----------------
+ * | c=
+ * 3 | 3
+ * |
+ * 2 | 2,0 off 2
+ * | / \\
+ * 1 | 1,0 4,3 1
+ * | / // \
+ * 0 | -->0,0 3,3 4,4<-- 0
+ * | \ / /
+ * -1 | 0,1 3,4 -1
+ * | \ //
+ * -2 | 0,2 -2
+ * |
+ *
+ * So in the forward path, we can only match up diagonals when the delta is odd.
+ */
+ /* Forwards is done first, so the backwards one was still at d - 1. Can't do this for d == 0. */
+ int backwards_d = d - 1;
+ if ((delta & 1) && (backwards_d >= 0)) {
+ debug("backwards_d = %d\n", backwards_d);
+
+ /* If both sides have the same length, forward and backward start on the same diagonal, meaning the
+ * backwards state index c == k.
+ * As soon as the lengths are not the same, the backwards traversal starts on a different diagonal, and
+ * c = k shifted by the difference in length.
+ */
+ int c = k_to_c(k, delta);
+
+ /* When the file sizes are very different, the traversal trees start on far distant diagonals.
+ * They don't necessarily meet straight on. See whether this forward value is on a diagonal that
+ * is also valid in kd_backward[], and match them if so. */
+ if (c >= -backwards_d && c <= backwards_d) {
+ /* Current k is on a diagonal that exists in kd_backward[]. If the two x positions have
+ * met or passed (forward walked onto or past backward), then we've found a midpoint / a
+ * mid-box.
+ *
+ * But we need to avoid matching a situation like this:
+ * 0 1
+ * x y
+ * 0 o-o-o
+ * x |\| |
+ * 1 o-o-o
+ * y | |\|
+ * 2 (B)o-o <--(B) backwards traversal reached here
+ * a | | |
+ * 3 o-o-o<-- prev_x, prev_y
+ * b | | |
+ * 4 o-o(F) <--(F) forwards traversal reached here
+ * x |\| | Now both are on the same diagonal and look like they passed,
+ * 5 o-o-o but actually they have sneaked past each other and have not met.
+ * y | |\|
+ * 6 o-o-o
+ *
+ * The solution is to notice that prev_x,prev_y were also already past (B).
+ */
+ int backward_x = kd_backward[c];
+ int backward_y = xc_to_y(backward_x, c, delta);
+ debug(" prev_x,y = (%d,%d) c%d:backward_x,y = (%d,%d) k%d:x,y = (%d,%d)\n",
+ prev_x, prev_y, c, backward_x, backward_y, k, x, xk_to_y(x, k));
+ if (prev_x <= backward_x && prev_y <= backward_y
+ && x >= backward_x) {
+ *meeting_snake = (struct diff_box){
+ .left_start = backward_x,
+ .left_end = x,
+ .right_start = xc_to_y(backward_x, c, delta),
+ .right_end = xk_to_y(x, k),
+ };
+ debug("HIT x=(%u,%u) - y=(%u,%u)\n",
+ meeting_snake->left_start,
+ meeting_snake->right_start,
+ meeting_snake->left_end,
+ meeting_snake->right_end);
+ return;
+ }
+ }
+ }
+ }
+}
+
+/* Do one backwards step in the "divide and conquer" graph traversal.
+ * left: the left side to diff.
+ * right: the right side to diff against.
+ * kd_forward: the traversal state for forwards traversal, to find a meeting point.
+ * Since forwards is done first, after this, both kd_forward and kd_backward will be valid for d.
+ * kd_forward points at the center of the state array, allowing negative indexes.
+ * kd_backward: the traversal state for backwards traversal, to find a meeting point.
+ * This is carried over between invocations with increasing d.
+ * kd_backward points at the center of the state array, allowing negative indexes.
+ * d: Step or distance counter, indicating for what value of d the kd_backward should be populated.
+ * Before the first invocation, kd_backward[0] shall point at the bottom right of the Myers graph
+ * (left.len, right.len).
+ * The first invocation will be for d == 1.
+ * meeting_snake: resulting meeting point, if any.
+ */
+static void diff_divide_myers_backward(struct diff_data *left, struct diff_data *right,
+ int *kd_forward, int *kd_backward, int d,
+ struct diff_box *meeting_snake)
+{
+ int delta = (int)right->atoms.len - (int)left->atoms.len;
+ int prev_x;
+ int prev_y;
+ int c;
+ int x;
+
+ debug("-- %s d=%d\n", __func__, d);
+ debug_dump_myers_graph(left, right, NULL);
+
+ for (c = d; c >= -d; c -= 2) {
+ if (c < -(int)left->atoms.len || c > (int)right->atoms.len) {
+ /* This diagonal is completely outside of the Myers graph, don't calculate it. */
+ if (c < -(int)left->atoms.len)
+ debug(" %d c < -(int)left->atoms.len %d\n", c, -(int)left->atoms.len);
+ else
+ debug(" %d c > right->atoms.len %d\n", c, right->atoms.len);
+ if (c < 0) {
+ /* We are traversing negatively, and already below the entire graph, nothing will come
+ * of this. */
+ debug(" break");
+ break;
+ }
+ debug(" continue");
+ continue;
+ }
+ debug("- c = %d\n", c);
+ if (d == 0) {
+ /* This is the initializing step. There is no prev_c yet, get the initial x from the bottom
+ * right of the Myers graph. */
+ x = left->atoms.len;
+ }
+ /* Favoring "-" lines first means favoring moving rightwards in the Myers graph.
+ * For this, all c should derive from c - 1, only the bottom most c derive from c + 1:
+ *
+ * 2 1 0
+ * ---------------------------------------------------
+ * c=
+ * 3
+ *
+ * from prev_c = c - 1 --> 5,2 2
+ * \
+ * 5,3 1
+ * \
+ * 4,3 5,4<-- 0
+ * \ /
+ * bottom most for d=1 from c + 1 --> 4,4 -1
+ * /
+ * bottom most for d=2 --> 3,4 -2
+ *
+ * Except when a c + 1 from a previous run already means a further advancement in the graph.
+ * If c == d, there is no c + 1 and c - 1 is the only option.
+ * If c < d, use c + 1 in case that yields a larger x. Also use c + 1 if c - 1 is outside the graph.
+ */
+ else if (c > -d && (c == d
+ || (c - 1 >= -(int)right->atoms.len
+ && kd_backward[c - 1] <= kd_backward[c + 1]))) {
+ /* A top one.
+ * From position prev_c, step upwards in the Myers graph: y -= 1.
+ * Decrementing y is achieved by incrementing c while keeping the same x.
+ * (since we're deriving y from y = x - c + delta).
+ */
+ int prev_c = c - 1;
+ prev_x = kd_backward[prev_c];
+ prev_y = xc_to_y(prev_x, prev_c, delta);
+ x = prev_x;
+ } else {
+ /* The bottom most one.
+ * From position prev_c, step to the left in the Myers graph: x -= 1.
+ */
+ int prev_c = c + 1;
+ prev_x = kd_backward[prev_c];
+ prev_y = xc_to_y(prev_x, prev_c, delta);
+ x = prev_x - 1;
+ }
+
+ /* Slide up any snake that we might find here. */
+ debug("c=%d x-1=%d Yb-1=%d-1=%d\n", c, x-1, xc_to_y(x, c, delta), xc_to_y(x, c, delta)-1);
+ if (x > 0) {
+ debug(" l="); debug_dump_atom(left, right, &left->atoms.head[x-1]);
+ }
+ if (xc_to_y(x, c, delta) > 0) {
+ debug(" r="); debug_dump_atom(right, left, &right->atoms.head[xc_to_y(x, c, delta)-1]);
+ }
+ while (x > 0 && xc_to_y(x, c, delta) > 0
+ && diff_atom_same(&left->atoms.head[x-1], &right->atoms.head[xc_to_y(x, c, delta)-1]))
+ x--;
+ kd_backward[c] = x;
+
+ if (DEBUG) {
+ int fi;
+ for (fi = d; fi >= c; fi--) {
+ debug("kd_backward[%d] = (%d, %d)\n", fi, kd_backward[fi],
+ kd_backward[fi] - fi + delta);
+ /*
+ if (kd_backward[fi] >= 0 && kd_backward[fi] < left->atoms.len)
+ debug_dump_atom(left, right, &left->atoms.head[kd_backward[fi]]);
+ else
+ debug("\n");
+ if (kd_backward[fi]-fi+delta >= 0 && kd_backward[fi]-fi+delta < right->atoms.len)
+ debug_dump_atom(right, left, &right->atoms.head[kd_backward[fi]-fi+delta]);
+ else
+ debug("\n");
+ */
+ }
+ }
+
+ if (x < 0 || x > left->atoms.len
+ || xc_to_y(x, c, delta) < 0 || xc_to_y(x, c, delta) > right->atoms.len)
+ continue;
+
+ /* Figured out a new backwards traversal, see if this has gone onto or even past a preceding forwards
+ * traversal.
+ *
+ * If the delta in length is even, then d and backwards_d hit the same state indexes -- note how this is
+ * different from in the forwards traversal, because now both d are the same:
+ *
+ * | d= 0 1 2 2 1 0
+ * ----+---------------- --------------------
+ * k= | c=
+ * 4 |
+ * |
+ * 3 | 3
+ * | same
+ * 2 | 2,0====5,2 2
+ * | / \
+ * 1 | 1,0 5,3 1
+ * | / / \
+ * 0 | -->0,0 3,3====4,3 5,4<-- 0
+ * | \ / /
+ * -1 | 0,1 4,4 -1
+ * | \
+ * -2 | 0,2 -2
+ * |
+ * -3
+ * If the delta is odd, they end up off-by-one, i.e. on different diagonals.
+ * So in the backward path, we can only match up diagonals when the delta is even.
+ */
+ if ((delta & 1) == 0) {
+ /* Forwards was done first, now both d are the same. */
+ int forwards_d = d;
+
+ /* As soon as the lengths are not the same, the backwards traversal starts on a different diagonal, and
+ * c = k shifted by the difference in length.
+ */
+ int k = c_to_k(c, delta);
+
+ /* When the file sizes are very different, the traversal trees start on far distant diagonals.
+ * They don't necessarily meet straight on. See whether this backward value is also on a valid
+ * diagonal in kd_forward[], and match them if so. */
+ if (k >= -forwards_d && k <= forwards_d) {
+ /* Current c is on a diagonal that exists in kd_forward[]. If the two x positions have
+ * met or passed (backward walked onto or past forward), then we've found a midpoint / a
+ * mid-box. */
+ int forward_x = kd_forward[k];
+ int forward_y = xk_to_y(forward_x, k);
+ debug("Compare %d to %d k=%d (x=%d,y=%d) to (x=%d,y=%d)\n",
+ forward_x, x, k,
+ forward_x, xk_to_y(forward_x, k), x, xc_to_y(x, c, delta));
+ if (forward_x <= prev_x && forward_y <= prev_y
+ && forward_x >= x) {
+ *meeting_snake = (struct diff_box){
+ .left_start = x,
+ .left_end = forward_x,
+ .right_start = xc_to_y(x, c, delta),
+ .right_end = xk_to_y(forward_x, k),
+ };
+ debug("HIT x=%u,%u - y=%u,%u\n",
+ meeting_snake->left_start,
+ meeting_snake->right_start,
+ meeting_snake->left_end,
+ meeting_snake->right_end);
+ return;
+ }
+ }
+ }
+ }
+}
+
+/* Myers "Divide et Impera": tracing forwards from the start and backwards from the end to find a midpoint that divides
+ * the problem into smaller chunks. Requires only linear amounts of memory. */
+enum diff_rc diff_algo_myers_divide(const struct diff_algo_config *algo_config, struct diff_state *state)
+{
+ enum diff_rc rc = DIFF_RC_ENOMEM;
+ struct diff_data *left = &state->left;
+ struct diff_data *right = &state->right;
+
+ debug("\n** %s\n", __func__);
+ debug("left:\n");
+ debug_dump(left);
+ debug("right:\n");
+ debug_dump(right);
+ debug_dump_myers_graph(left, right, NULL);
+
+ /* Allocate two columns of a Myers graph, one for the forward and one for the backward traversal. */
+ unsigned int max = left->atoms.len + right->atoms.len;
+ size_t kd_len = max + 1;
+ size_t kd_buf_size = kd_len << 1;
+ int *kd_buf = reallocarray(NULL, kd_buf_size, sizeof(int));
+ if (!kd_buf)
+ return DIFF_RC_ENOMEM;
+ int i;
+ for (i = 0; i < kd_buf_size; i++)
+ kd_buf[i] = -1;
+ int *kd_forward = kd_buf;
+ int *kd_backward = kd_buf + kd_len;
+
+ /* The 'k' axis in Myers spans positive and negative indexes, so point the kd to the middle.
+ * It is then possible to index from -max/2 .. max/2. */
+ kd_forward += max/2;
+ kd_backward += max/2;
+
+ int d;
+ struct diff_box mid_snake = {};
+ for (d = 0; d <= (max/2); d++) {
+ debug("-- d=%d\n", d);
+ diff_divide_myers_forward(left, right, kd_forward, kd_backward, d, &mid_snake);
+ if (!diff_box_empty(&mid_snake))
+ break;
+ diff_divide_myers_backward(left, right, kd_forward, kd_backward, d, &mid_snake);
+ if (!diff_box_empty(&mid_snake))
+ break;
+ }
+
+ if (diff_box_empty(&mid_snake)) {
+ /* Divide and conquer failed to find a meeting point. Use the fallback_algo defined in the algo_config
+ * (leave this to the caller). This is just paranoia/sanity, we normally should always find a midpoint.
+ */
+ debug(" no midpoint \n");
+ rc = DIFF_RC_USE_DIFF_ALGO_FALLBACK;
+ goto return_rc;
+ } else {
+ debug(" mid snake L: %u to %u of %u R: %u to %u of %u\n",
+ mid_snake.left_start, mid_snake.left_end, left->atoms.len,
+ mid_snake.right_start, mid_snake.right_end, right->atoms.len);
+
+ /* Section before the mid-snake. */
+ debug("Section before the mid-snake\n");
+
+ struct diff_atom *left_atom = &left->atoms.head[0];
+ unsigned int left_section_len = mid_snake.left_start;
+ struct diff_atom *right_atom = &right->atoms.head[0];
+ unsigned int right_section_len = mid_snake.right_start;
+
+ if (left_section_len && right_section_len) {
+ /* Record an unsolved chunk, the caller will apply inner_algo() on this chunk. */
+ if (!diff_state_add_chunk(state, false,
+ left_atom, left_section_len,
+ right_atom, right_section_len))
+ goto return_rc;
+ } else if (left_section_len && !right_section_len) {
+ /* Only left atoms and none on the right, they form a "minus" chunk, then. */
+ if (!diff_state_add_chunk(state, true,
+ left_atom, left_section_len,
+ right_atom, 0))
+ goto return_rc;
+ } else if (!left_section_len && right_section_len) {
+ /* No left atoms, only atoms on the right, they form a "plus" chunk, then. */
+ if (!diff_state_add_chunk(state, true,
+ left_atom, 0,
+ right_atom, right_section_len))
+ goto return_rc;
+ }
+ /* else: left_section_len == 0 and right_section_len == 0, i.e. nothing before the mid-snake. */
+
+ /* the mid-snake, identical data on both sides: */
+ debug("the mid-snake\n");
+ if (!diff_state_add_chunk(state, true,
+ &left->atoms.head[mid_snake.left_start],
+ mid_snake.left_end - mid_snake.left_start,
+ &right->atoms.head[mid_snake.right_start],
+ mid_snake.right_end - mid_snake.right_start))
+ goto return_rc;
+
+ /* Section after the mid-snake. */
+ debug("Section after the mid-snake\n");
+ debug(" left_end %u right_end %u\n", mid_snake.left_end, mid_snake.right_end);
+ debug(" left_count %u right_count %u\n", left->atoms.len, right->atoms.len);
+ left_atom = &left->atoms.head[mid_snake.left_end];
+ left_section_len = left->atoms.len - mid_snake.left_end;
+ right_atom = &right->atoms.head[mid_snake.right_end];
+ right_section_len = right->atoms.len - mid_snake.right_end;
+
+ if (left_section_len && right_section_len) {
+ /* Record an unsolved chunk, the caller will apply inner_algo() on this chunk. */
+ if (!diff_state_add_chunk(state, false,
+ left_atom, left_section_len,
+ right_atom, right_section_len))
+ goto return_rc;
+ } else if (left_section_len && !right_section_len) {
+ /* Only left atoms and none on the right, they form a "minus" chunk, then. */
+ if (!diff_state_add_chunk(state, true,
+ left_atom, left_section_len,
+ right_atom, 0))
+ goto return_rc;
+ } else if (!left_section_len && right_section_len) {
+ /* No left atoms, only atoms on the right, they form a "plus" chunk, then. */
+ if (!diff_state_add_chunk(state, true,
+ left_atom, 0,
+ right_atom, right_section_len))
+ goto return_rc;
+ }
+ /* else: left_section_len == 0 and right_section_len == 0, i.e. nothing after the mid-snake. */
+ }
+
+ rc = DIFF_RC_OK;
+
+return_rc:
+ free(kd_buf);
+ debug("** END %s\n", __func__);
+ return rc;
+}
+
+/* Myers Diff tracing from the start all the way through to the end, requiring quadratic amounts of memory. This can
+ * fail if the required space surpasses algo_config->permitted_state_size. */
+enum diff_rc diff_algo_myers(const struct diff_algo_config *algo_config, struct diff_state *state)
+{
+ /* do a diff_divide_myers_forward() without a _backward(), so that it walks forward across the entire
+ * files to reach the end. Keep each run's state, and do a final backtrace. */
+ enum diff_rc rc = DIFF_RC_ENOMEM;
+ struct diff_data *left = &state->left;
+ struct diff_data *right = &state->right;
+
+ debug("\n** %s\n", __func__);
+ debug("left:\n");
+ debug_dump(left);
+ debug("right:\n");
+ debug_dump(right);
+ debug_dump_myers_graph(left, right, NULL);
+
+ /* Allocate two columns of a Myers graph, one for the forward and one for the backward traversal. */
+ unsigned int max = left->atoms.len + right->atoms.len;
+ size_t kd_len = max + 1 + max;
+ size_t kd_buf_size = kd_len * kd_len;
+ debug("state size: %zu\n", kd_buf_size);
+ if (kd_buf_size < kd_len /* overflow? */
+ || kd_buf_size * sizeof(int) > algo_config->permitted_state_size) {
+ debug("state size %zu > permitted_state_size %zu, use fallback_algo\n",
+ kd_buf_size, algo_config->permitted_state_size);
+ return DIFF_RC_USE_DIFF_ALGO_FALLBACK;
+ }
+
+ int *kd_buf = reallocarray(NULL, kd_buf_size, sizeof(int));
+ if (!kd_buf)
+ return DIFF_RC_ENOMEM;
+ int i;
+ for (i = 0; i < kd_buf_size; i++)
+ kd_buf[i] = -1;
+
+ /* The 'k' axis in Myers spans positive and negative indexes, so point the kd to the middle.
+ * It is then possible to index from -max .. max. */
+ int *kd_origin = kd_buf + max;
+ int *kd_column = kd_origin;
+
+ int d;
+ int backtrack_d = -1;
+ int backtrack_k = 0;
+ int k;
+ int x, y;
+ for (d = 0; d <= max; d++, kd_column += kd_len) {
+ debug("-- d=%d\n", d);
+
+ debug("-- %s d=%d\n", __func__, d);
+
+ for (k = d; k >= -d; k -= 2) {
+ if (k < -(int)right->atoms.len || k > (int)left->atoms.len) {
+ /* This diagonal is completely outside of the Myers graph, don't calculate it. */
+ if (k < -(int)right->atoms.len)
+ debug(" %d k < -(int)right->atoms.len %d\n", k, -(int)right->atoms.len);
+ else
+ debug(" %d k > left->atoms.len %d\n", k, left->atoms.len);
+ if (k < 0) {
+ /* We are traversing negatively, and already below the entire graph, nothing will come
+ * of this. */
+ debug(" break");
+ break;
+ }
+ debug(" continue");
+ continue;
+ }
+
+ debug("- k = %d\n", k);
+ if (d == 0) {
+ /* This is the initializing step. There is no prev_k yet, get the initial x from the top left of
+ * the Myers graph. */
+ x = 0;
+ } else {
+ int *kd_prev_column = kd_column - kd_len;
+
+ /* Favoring "-" lines first means favoring moving rightwards in the Myers graph.
+ * For this, all k should derive from k - 1, only the bottom most k derive from k + 1:
+ *
+ * | d= 0 1 2
+ * ----+----------------
+ * k= |
+ * 2 | 2,0 <-- from prev_k = 2 - 1 = 1
+ * | /
+ * 1 | 1,0
+ * | /
+ * 0 | -->0,0 3,3
+ * | \\ /
+ * -1 | 0,1 <-- bottom most for d=1 from prev_k = -1 + 1 = 0
+ * | \\
+ * -2 | 0,2 <-- bottom most for d=2 from prev_k = -2 + 1 = -1
+ *
+ * Except when a k + 1 from a previous run already means a further advancement in the graph.
+ * If k == d, there is no k + 1 and k - 1 is the only option.
+ * If k < d, use k + 1 in case that yields a larger x. Also use k + 1 if k - 1 is outside the graph.
+ */
+ if (k > -d && (k == d
+ || (k - 1 >= -(int)right->atoms.len
+ && kd_prev_column[k - 1] >= kd_prev_column[k + 1]))) {
+ /* Advance from k - 1.
+ * From position prev_k, step to the right in the Myers graph: x += 1.
+ */
+ int prev_k = k - 1;
+ int prev_x = kd_prev_column[prev_k];
+ x = prev_x + 1;
+ } else {
+ /* The bottom most one.
+ * From position prev_k, step to the bottom in the Myers graph: y += 1.
+ * Incrementing y is achieved by decrementing k while keeping the same x.
+ * (since we're deriving y from y = x - k).
+ */
+ int prev_k = k + 1;
+ int prev_x = kd_prev_column[prev_k];
+ x = prev_x;
+ }
+ }
+
+ /* Slide down any snake that we might find here. */
+ while (x < left->atoms.len && xk_to_y(x, k) < right->atoms.len
+ && diff_atom_same(&left->atoms.head[x], &right->atoms.head[xk_to_y(x, k)]))
+ x++;
+ kd_column[k] = x;
+
+ if (DEBUG) {
+ int fi;
+ for (fi = d; fi >= k; fi-=2) {
+ debug("kd_column[%d] = (%d, %d)\n", fi, kd_column[fi], kd_column[fi] - fi);
+#if 0
+ if (kd_column[fi] >= 0 && kd_column[fi] < left->atoms.len)
+ debug_dump_atom(left, right, &left->atoms.head[kd_column[fi]]);
+ else
+ debug("\n");
+ if (kd_column[fi]-fi >= 0 && kd_column[fi]-fi < right->atoms.len)
+ debug_dump_atom(right, left, &right->atoms.head[kd_column[fi]-fi]);
+ else
+ debug("\n");
+#endif
+ }
+ }
+
+ if (x == left->atoms.len && xk_to_y(x, k) == right->atoms.len) {
+ /* Found a path */
+ backtrack_d = d;
+ backtrack_k = k;
+ debug("Reached the end at d = %d, k = %d\n",
+ backtrack_d, backtrack_k);
+ break;
+ }
+ }
+
+ if (backtrack_d >= 0)
+ break;
+ }
+
+ debug_dump_myers_graph(left, right, kd_origin);
+
+ /* backtrack. A matrix spanning from start to end of the file is ready:
+ *
+ * | d= 0 1 2 3 4
+ * ----+---------------------------------
+ * k= |
+ * 3 |
+ * |
+ * 2 | 2,0
+ * | /
+ * 1 | 1,0 4,3
+ * | / / \
+ * 0 | -->0,0 3,3 4,4 --> backtrack_d = 4, backtrack_k = 0
+ * | \ / \
+ * -1 | 0,1 3,4
+ * | \
+ * -2 | 0,2
+ * |
+ *
+ * From (4,4) backwards, find the previous position that is the largest, and remember it.
+ *
+ */
+ for (d = backtrack_d, k = backtrack_k; d >= 0; d--) {
+ x = kd_column[k];
+ y = xk_to_y(x, k);
+
+ /* When the best position is identified, remember it for that kd_column.
+ * That kd_column is no longer needed otherwise, so just re-purpose kd_column[0] = x and kd_column[1] = y,
+ * so that there is no need to allocate more memory.
+ */
+ kd_column[0] = x;
+ kd_column[1] = y;
+ debug("Backtrack d=%d: xy=(%d, %d)\n",
+ d, kd_column[0], kd_column[1]);
+
+ /* Don't access memory before kd_buf */
+ if (d == 0)
+ break;
+ int *kd_prev_column = kd_column - kd_len;
+
+ /* When y == 0, backtracking downwards (k-1) is the only way.
+ * When x == 0, backtracking upwards (k+1) is the only way.
+ *
+ * | d= 0 1 2 3 4
+ * ----+---------------------------------
+ * k= |
+ * 3 |
+ * | ..y == 0
+ * 2 | 2,0
+ * | /
+ * 1 | 1,0 4,3
+ * | / / \
+ * 0 | -->0,0 3,3 4,4 --> backtrack_d = 4, backtrack_k = 0
+ * | \ / \
+ * -1 | 0,1 3,4
+ * | \
+ * -2 | 0,2__
+ * | x == 0
+ */
+ debug("prev[k-1] = %d,%d prev[k+1] = %d,%d\n",
+ kd_prev_column[k-1], xk_to_y(kd_prev_column[k-1],k-1),
+ kd_prev_column[k+1], xk_to_y(kd_prev_column[k+1],k+1));
+ if (y == 0
+ || (x > 0 && kd_prev_column[k - 1] >= kd_prev_column[k + 1])) {
+ k = k - 1;
+ debug("prev k=k-1=%d x=%d y=%d\n",
+ k, kd_prev_column[k], xk_to_y(kd_prev_column[k], k));
+ } else {
+ k = k + 1;
+ debug("prev k=k+1=%d x=%d y=%d\n",
+ k, kd_prev_column[k], xk_to_y(kd_prev_column[k], k));
+ }
+ kd_column = kd_prev_column;
+ }
+
+ /* Forwards again, this time recording the diff chunks.
+ * Definitely start from 0,0. kd_column[0] may actually point to the bottom of a snake starting at 0,0 */
+ x = 0;
+ y = 0;
+
+ kd_column = kd_origin;
+ for (d = 0; d <= backtrack_d; d++, kd_column += kd_len) {
+ int next_x = kd_column[0];
+ int next_y = kd_column[1];
+ debug("Forward track from xy(%d,%d) to xy(%d,%d)\n",
+ x, y, next_x, next_y);
+
+ struct diff_atom *left_atom = &left->atoms.head[x];
+ int left_section_len = next_x - x;
+ struct diff_atom *right_atom = &right->atoms.head[y];
+ int right_section_len = next_y - y;
+
+ rc = DIFF_RC_ENOMEM;
+ if (left_section_len && right_section_len) {
+ /* This must be a snake slide.
+ * Snake slides have a straight line leading into them (except when starting at (0,0)). Find
+ * out whether the lead-in is horizontal or vertical:
+ *
+ * left
+ * ---------->
+ * |
+ * r| o-o o
+ * i| \ |
+ * g| o o
+ * h| \ \
+ * t| o o
+ * v
+ *
+ * If left_section_len > right_section_len, the lead-in is horizontal, meaning first
+ * remove one atom from the left before sliding down the snake.
+ * If right_section_len > left_section_len, the lead-in is vetical, so add one atom from
+ * the right before sliding down the snake. */
+ if (left_section_len == right_section_len + 1) {
+ if (!diff_state_add_chunk(state, true,
+ left_atom, 1,
+ right_atom, 0))
+ goto return_rc;
+ left_atom++;
+ left_section_len--;
+ } else if (right_section_len == left_section_len + 1) {
+ if (!diff_state_add_chunk(state, true,
+ left_atom, 0,
+ right_atom, 1))
+ goto return_rc;
+ right_atom++;
+ right_section_len--;
+ } else if (left_section_len != right_section_len) {
+ /* The numbers are making no sense. Should never happen. */
+ rc = DIFF_RC_USE_DIFF_ALGO_FALLBACK;
+ goto return_rc;
+ }
+
+ if (!diff_state_add_chunk(state, true,
+ left_atom, left_section_len,
+ right_atom, right_section_len))
+ goto return_rc;
+ } else if (left_section_len && !right_section_len) {
+ /* Only left atoms and none on the right, they form a "minus" chunk, then. */
+ if (!diff_state_add_chunk(state, true,
+ left_atom, left_section_len,
+ right_atom, 0))
+ goto return_rc;
+ } else if (!left_section_len && right_section_len) {
+ /* No left atoms, only atoms on the right, they form a "plus" chunk, then. */
+ if (!diff_state_add_chunk(state, true,
+ left_atom, 0,
+ right_atom, right_section_len))
+ goto return_rc;
+ }
+
+ x = next_x;
+ y = next_y;
+ }
+
+ rc = DIFF_RC_OK;
+
+return_rc:
+ free(kd_buf);
+ debug("** END %s rc=%d\n", __func__, rc);
+ return rc;
+}
blob - /dev/null
blob + 20d1f9a79ec84281efede279f51bd2292a99466a (mode 644)
--- /dev/null
+++ lib/diff_output.c
+/* Common parts for printing diff output */
+/*
+ * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <diff/diff_output.h>
+
+void diff_output_lines(FILE *dest, const char *prefix, struct diff_atom *start_atom, unsigned int count)
+{
+ struct diff_atom *atom;
+ foreach_diff_atom(atom, start_atom, count) {
+ fprintf(dest, "%s", prefix);
+ int i;
+ unsigned int len = atom->len;
+ if (len && atom->at[len - 1] == '\n') {
+ len--;
+ if (len && atom->at[len - 1] == '\r')
+ len--;
+ }
+
+ for (i = 0; i < len; i++) {
+ char c = atom->at[i];
+ if (c < 0x20 || c >= 0x7f)
+ fprintf(dest, "\\x%02x", (unsigned char)c);
+ else
+ fprintf(dest, "%c", c);
+ }
+ fprintf(dest, "\n");
+ }
+}
+
+enum diff_rc diff_output_info(FILE *dest, const struct diff_input_info *info)
+{
+ if (info->arbitrary_info && *info->arbitrary_info)
+ fprintf(dest, "%s", info->arbitrary_info);
+ fprintf(dest, "--- %s\n+++ %s\n",
+ info->left_path ? : "a",
+ info->right_path ? : "b");
+ return DIFF_RC_OK;
+}
blob - /dev/null
blob + 796c3ee69f83bf338c67c8b310b27c2dea7501f1 (mode 644)
--- /dev/null
+++ lib/diff_output_plain.c
+/* Output all lines of a diff_result. */
+/*
+ * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <diff/diff_output.h>
+
+enum diff_rc diff_output_plain(FILE *dest, const struct diff_input_info *info,
+ const struct diff_result *result)
+{
+ if (!result)
+ return DIFF_RC_EINVAL;
+ if (result->rc != DIFF_RC_OK)
+ return result->rc;
+
+ diff_output_info(dest, info);
+
+ int i;
+ for (i = 0; i < result->chunks.len; i++) {
+ struct diff_chunk *c = &result->chunks.head[i];
+ if (c->left_count && c->right_count)
+ diff_output_lines(dest, c->solved ? " " : "?", c->left_start, c->left_count);
+ else if (c->left_count && !c->right_count)
+ diff_output_lines(dest, c->solved ? "-" : "?", c->left_start, c->left_count);
+ else if (c->right_count && !c->left_count)
+ diff_output_lines(dest, c->solved ? "+" : "?", c->right_start, c->right_count);
+ }
+ return DIFF_RC_OK;
+}
+
+enum diff_rc diff_plain(FILE *dest, const struct diff_config *diff_config,
+ const struct diff_input_info *info,
+ const char *left, int left_len, const char *right, int right_len)
+{
+ enum diff_rc rc;
+ left_len = left_len < 0 ? strlen(left) : left_len;
+ right_len = right_len < 0 ? strlen(right) : right_len;
+ struct diff_result *result = diff_main(diff_config, left, left_len, right, right_len);
+ rc = diff_output_plain(dest, info, result);
+ diff_result_free(result);
+ return rc;
+}
blob - /dev/null
blob + d12e4c0b5d67538e6f0dbd89b84c22447ec44d25 (mode 644)
--- /dev/null
+++ lib/diff_output_unidiff.c
+/* Produce a unidiff output from a diff_result. */
+/*
+ * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <diff/diff_output.h>
+
+#include "debug.h"
+
+enum chunk_type {
+ CHUNK_EMPTY,
+ CHUNK_PLUS,
+ CHUNK_MINUS,
+ CHUNK_SAME,
+ CHUNK_WEIRD,
+};
+
+static inline enum chunk_type chunk_type(const struct diff_chunk *chunk)
+{
+ if (!chunk->left_count && !chunk->right_count)
+ return CHUNK_EMPTY;
+ if (!chunk->solved)
+ return CHUNK_WEIRD;
+ if (!chunk->right_count)
+ return CHUNK_MINUS;
+ if (!chunk->left_count)
+ return CHUNK_PLUS;
+ if (chunk->left_count != chunk->right_count)
+ return CHUNK_WEIRD;
+ return CHUNK_SAME;
+}
+
+#define MAX(A,B) ((A)>(B)?(A):(B))
+#define MIN(A,B) ((A)<(B)?(A):(B))
+
+struct range {
+ int start;
+ int end;
+};
+
+static bool range_empty(const struct range *r)
+{
+ return r->start == r->end;
+}
+
+static bool ranges_touch(const struct range *a, const struct range *b)
+{
+ return (a->end >= b->start) && (a->start <= b->end);
+}
+
+static void ranges_merge(struct range *a, const struct range *b)
+{
+ *a = (struct range){
+ .start = MIN(a->start, b->start),
+ .end = MAX(a->end, b->end),
+ };
+}
+
+struct chunk_context {
+ struct range chunk;
+ struct range left, right;
+};
+
+static bool chunk_context_empty(const struct chunk_context *cc)
+{
+ return range_empty(&cc->chunk);
+}
+
+static void chunk_context_get(struct chunk_context *cc, const struct diff_result *r, int chunk_idx,
+ int context_lines)
+{
+ const struct diff_chunk *c = &r->chunks.head[chunk_idx];
+ int left_start = diff_atom_root_idx(&r->left, c->left_start);
+ int right_start = diff_atom_root_idx(&r->right, c->right_start);
+
+ *cc = (struct chunk_context){
+ .chunk = {
+ .start = chunk_idx,
+ .end = chunk_idx + 1,
+ },
+ .left = {
+ .start = MAX(0, left_start - context_lines),
+ .end = MIN(r->left.atoms.len, left_start + c->left_count + context_lines),
+ },
+ .right = {
+ .start = MAX(0, right_start - context_lines),
+ .end = MIN(r->right.atoms.len, right_start + c->right_count + context_lines),
+ },
+ };
+}
+
+static bool chunk_contexts_touch(const struct chunk_context *cc, const struct chunk_context *other)
+{
+ return ranges_touch(&cc->chunk, &other->chunk)
+ || ranges_touch(&cc->left, &other->left)
+ || ranges_touch(&cc->right, &other->right);
+}
+
+static void chunk_contexts_merge(struct chunk_context *cc, const struct chunk_context *other)
+{
+ ranges_merge(&cc->chunk, &other->chunk);
+ ranges_merge(&cc->left, &other->left);
+ ranges_merge(&cc->right, &other->right);
+}
+
+static void diff_output_unidiff_chunk(FILE *dest, bool *info_printed, const struct diff_input_info *info,
+ const struct diff_result *result, const struct chunk_context *cc)
+{
+ if (range_empty(&cc->left) && range_empty(&cc->right))
+ return;
+
+ if (!(*info_printed)) {
+ diff_output_info(dest, info);
+ *info_printed = true;
+ }
+
+ fprintf(dest, "@@ -%d,%d +%d,%d @@\n",
+ cc->left.start + 1, cc->left.end - cc->left.start,
+ cc->right.start + 1, cc->right.end - cc->right.start);
+
+ /* Got the absolute line numbers where to start printing, and the index of the interesting (non-context) chunk.
+ * To print context lines above the interesting chunk, nipping on the previous chunk index may be necessary.
+ * It is guaranteed to be only context lines where left == right, so it suffices to look on the left. */
+ const struct diff_chunk *first_chunk = &result->chunks.head[cc->chunk.start];
+ int chunk_start_line = diff_atom_root_idx(&result->left, first_chunk->left_start);
+ if (cc->left.start < chunk_start_line)
+ diff_output_lines(dest, " ", &result->left.atoms.head[cc->left.start],
+ chunk_start_line - cc->left.start);
+
+ /* Now write out all the joined chunks and contexts between them */
+ int c_idx;
+ for (c_idx = cc->chunk.start; c_idx < cc->chunk.end; c_idx++) {
+ const struct diff_chunk *c = &result->chunks.head[c_idx];
+
+ if (c->left_count && c->right_count)
+ diff_output_lines(dest, c->solved ? " " : "?", c->left_start, c->left_count);
+ else if (c->left_count && !c->right_count)
+ diff_output_lines(dest, c->solved ? "-" : "?", c->left_start, c->left_count);
+ else if (c->right_count && !c->left_count)
+ diff_output_lines(dest, c->solved ? "+" : "?", c->right_start, c->right_count);
+ }
+
+ /* Trailing context? */
+ const struct diff_chunk *last_chunk = &result->chunks.head[cc->chunk.end - 1];
+ int chunk_end_line = diff_atom_root_idx(&result->left, last_chunk->left_start + last_chunk->left_count);
+ if (cc->left.end > chunk_end_line)
+ diff_output_lines(dest, " ", &result->left.atoms.head[chunk_end_line],
+ cc->left.end - chunk_end_line);
+}
+
+enum diff_rc diff_output_unidiff(FILE *dest, const struct diff_input_info *info,
+ const struct diff_result *result, unsigned int context_lines)
+{
+ if (!result)
+ return DIFF_RC_EINVAL;
+ if (result->rc != DIFF_RC_OK)
+ return result->rc;
+
+ struct chunk_context cc = {};
+ bool info_printed = false;
+
+ int i;
+ for (i = 0; i < result->chunks.len; i++) {
+ struct diff_chunk *c = &result->chunks.head[i];
+ enum chunk_type t = chunk_type(c);
+
+ if (t == CHUNK_MINUS || t == CHUNK_PLUS) {
+ if (chunk_context_empty(&cc)) {
+ /* These are the first lines being printed.
+ * Note down the start point, any number of subsequent chunks may be joined up to this
+ * unidiff chunk by context lines or by being directly adjacent. */
+ chunk_context_get(&cc, result, i, context_lines);
+ debug("new chunk to be printed: chunk %d-%d left %d-%d right %d-%d\n",
+ cc.chunk.start, cc.chunk.end,
+ cc.left.start, cc.left.end, cc.right.start, cc.right.end);
+ } else {
+ /* There already is a previous chunk noted down for being printed.
+ * Does it join up with this one? */
+ struct chunk_context next;
+ chunk_context_get(&next, result, i, context_lines);
+ debug("new chunk to be printed: chunk %d-%d left %d-%d right %d-%d\n",
+ next.chunk.start, next.chunk.end,
+ next.left.start, next.left.end, next.right.start, next.right.end);
+ if (chunk_contexts_touch(&cc, &next)) {
+ /* This next context touches or overlaps the previous one, join. */
+ chunk_contexts_merge(&cc, &next);
+ debug("new chunk to be printed touches previous chunk, now: left %d-%d right %d-%d\n",
+ cc.left.start, cc.left.end, cc.right.start, cc.right.end);
+ } else {
+ /* No touching, so the previous context is complete with a gap between it and
+ * this next one. Print the previous one and start fresh here. */
+ debug("new chunk to be printed does not touch previous chunk; print left %d-%d right %d-%d\n",
+ cc.left.start, cc.left.end, cc.right.start, cc.right.end);
+ diff_output_unidiff_chunk(dest, &info_printed, info, result, &cc);
+ cc = next;
+ debug("new unprinted chunk is left %d-%d right %d-%d\n",
+ cc.left.start, cc.left.end, cc.right.start, cc.right.end);
+ }
+ }
+ }
+
+ }
+
+ if (!chunk_context_empty(&cc))
+ diff_output_unidiff_chunk(dest, &info_printed, info, result, &cc);
+ return DIFF_RC_OK;
+}
+
+enum diff_rc diff_unidiff(FILE *dest, const struct diff_config *diff_config,
+ const struct diff_input_info *info,
+ const char *left, int left_len, const char *right, int right_len,
+ unsigned int context_lines)
+{
+ enum diff_rc rc;
+ left_len = left_len < 0 ? strlen(left) : left_len;
+ right_len = right_len < 0 ? strlen(right) : right_len;
+ struct diff_result *result = diff_main(diff_config, left, left_len, right, right_len);
+ rc = diff_output_unidiff(dest, info, result, context_lines);
+ diff_result_free(result);
+ return rc;
+}
blob - /dev/null
blob + 9fcf8196417c1df4fda8effb7cdbb57995974764 (mode 644)
--- /dev/null
+++ lib/diff_patience.c
+/* Implementation of the Patience Diff algorithm invented by Bram Cohen:
+ * Divide a diff problem into smaller chunks by an LCS of common-unique lines. */
+/*
+ * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <assert.h>
+#include <diff/diff_main.h>
+
+#include "debug.h"
+
+/* Set unique_here = true for all atoms that exist exactly once in this list. */
+static void diff_atoms_mark_unique(struct diff_data *d, unsigned int *unique_count)
+{
+ struct diff_atom *i;
+ unsigned int count = 0;
+ diff_data_foreach_atom(i, d) {
+ i->patience.unique_here = true;
+ i->patience.unique_in_both = true;
+ count++;
+ }
+ diff_data_foreach_atom(i, d) {
+ struct diff_atom *j;
+
+ if (!i->patience.unique_here)
+ continue;
+
+ diff_data_foreach_atom_from(i + 1, j, d) {
+ if (diff_atom_same(i, j)) {
+ if (i->patience.unique_here) {
+ i->patience.unique_here = false;
+ i->patience.unique_in_both = false;
+ count--;
+ }
+ j->patience.unique_here = false;
+ j->patience.unique_in_both = false;
+ count--;
+ }
+ }
+ }
+ if (unique_count)
+ *unique_count = count;
+}
+
+/* Mark those lines as atom->patience.unique_in_both = true that appear exactly once in each side. */
+static void diff_atoms_mark_unique_in_both(struct diff_data *left, struct diff_data *right,
+ unsigned int *unique_in_both_count)
+{
+ /* Derive the final unique_in_both count without needing an explicit iteration. So this is just some
+ * optimiziation to save one iteration in the end. */
+ unsigned int unique_in_both;
+
+ diff_atoms_mark_unique(left, &unique_in_both);
+ diff_atoms_mark_unique(right, NULL);
+
+ debug("unique_in_both %u\n", unique_in_both);
+
+ struct diff_atom *i;
+ diff_data_foreach_atom(i, left) {
+ if (!i->patience.unique_here)
+ continue;
+ struct diff_atom *j;
+ int found_in_b = 0;
+ diff_data_foreach_atom(j, right) {
+ if (!diff_atom_same(i, j))
+ continue;
+ if (!j->patience.unique_here) {
+ found_in_b = 2; /* or more */
+ break;
+ } else {
+ found_in_b = 1;
+ j->patience.pos_in_other = i;
+ i->patience.pos_in_other = j;
+ }
+ }
+
+ if (found_in_b == 0 || found_in_b > 1) {
+ i->patience.unique_in_both = false;
+ unique_in_both--;
+ debug("unique_in_both %u (%d) ", unique_in_both, found_in_b);
+ debug_dump_atom(left, NULL, i);
+ }
+ }
+
+ /* Still need to unmark right[*]->patience.unique_in_both for atoms that don't exist in left */
+ diff_data_foreach_atom(i, right) {
+ if (!i->patience.unique_here
+ || !i->patience.unique_in_both)
+ continue;
+ struct diff_atom *j;
+ bool found_in_a = false;
+ diff_data_foreach_atom(j, left) {
+ if (!j->patience.unique_in_both)
+ continue;
+ if (!diff_atom_same(i, j))
+ continue;
+ found_in_a = true;
+ break;
+ }
+
+ if (!found_in_a)
+ i->patience.unique_in_both = false;
+ }
+
+ if (unique_in_both_count)
+ *unique_in_both_count = unique_in_both;
+}
+
+
+/* Among the lines that appear exactly once in each side, find the longest streak that appear in both files in the same
+ * order (with other stuff allowed to interleave). Use patience sort for that, as in the Patience Diff algorithm.
+ * See https://bramcohen.livejournal.com/73318.html and, for a much more detailed explanation,
+ * https://blog.jcoglan.com/2017/09/19/the-patience-diff-algorithm/ */
+enum diff_rc diff_algo_patience(const struct diff_algo_config *algo_config, struct diff_state *state)
+{
+ enum diff_rc rc = DIFF_RC_ENOMEM;
+
+ struct diff_data *left = &state->left;
+ struct diff_data *right = &state->right;
+
+ unsigned int unique_in_both_count;
+
+ debug("\n** %s\n", __func__);
+
+ /* Find those lines that appear exactly once in 'left' and exactly once in 'right'. */
+ diff_atoms_mark_unique_in_both(left, right, &unique_in_both_count);
+
+ debug("unique_in_both_count %u\n", unique_in_both_count);
+ debug("left:\n");
+ debug_dump(left);
+ debug("right:\n");
+ debug_dump(right);
+
+ if (!unique_in_both_count) {
+ /* Cannot apply Patience, tell the caller to use fallback_algo instead. */
+ return DIFF_RC_USE_DIFF_ALGO_FALLBACK;
+ }
+
+ /* An array of Longest Common Sequence is the result of the below subscope: */
+ unsigned int lcs_count = 0;
+ struct diff_atom **lcs = NULL;
+ struct diff_atom *lcs_tail = NULL;
+
+ {
+ /* This subscope marks the lifetime of the atom_pointers allocation */
+
+ /* One chunk of storage for atom pointers */
+ struct diff_atom **atom_pointers = recallocarray(NULL, 0, unique_in_both_count * 2, sizeof(struct diff_atom*));
+
+ /* Half for the list of atoms that still need to be put on stacks */
+ struct diff_atom **uniques = atom_pointers;
+
+ /* Half for the patience sort state's "card stacks" -- we remember only each stack's topmost "card" */
+ struct diff_atom **patience_stacks = atom_pointers + unique_in_both_count;
+ unsigned int patience_stacks_count = 0;
+
+ /* Take all common, unique items from 'left' ... */
+
+ struct diff_atom *atom;
+ struct diff_atom **uniques_end = uniques;
+ diff_data_foreach_atom(atom, left) {
+ if (!atom->patience.unique_in_both)
+ continue;
+ *uniques_end = atom;
+ uniques_end++;
+ }
+
+ /* ...and sort them to the order found in 'right'.
+ * The idea is to find the leftmost stack that has a higher line number and add it to the stack's top.
+ * If there is no such stack, open a new one on the right. The line number is derived from the atom*,
+ * which are array items and hence reflect the relative position in the source file. So we got the
+ * common-uniques from 'left' and sort them according to atom->patience.pos_in_other. */
+ unsigned int i;
+ for (i = 0; i < unique_in_both_count; i++) {
+ atom = uniques[i];
+ unsigned int target_stack;
+
+ if (!patience_stacks_count)
+ target_stack = 0;
+ else {
+ /* binary search to find the stack to put this atom "card" on. */
+ unsigned int lo = 0;
+ unsigned int hi = patience_stacks_count;
+ while (lo < hi) {
+ unsigned int mid = (lo + hi) >> 1;
+ if (patience_stacks[mid]->patience.pos_in_other < atom->patience.pos_in_other)
+ lo = mid + 1;
+ else
+ hi = mid;
+ }
+
+ target_stack = lo;
+ }
+
+ assert(target_stack <= patience_stacks_count);
+ patience_stacks[target_stack] = atom;
+ if (target_stack == patience_stacks_count)
+ patience_stacks_count++;
+
+ /* Record a back reference to the next stack on the left, which will form the final longest sequence
+ * later. */
+ atom->patience.prev_stack = target_stack ? patience_stacks[target_stack - 1] : NULL;
+
+ }
+
+ /* backtrace through prev_stack references to form the final longest common sequence */
+ lcs_tail = patience_stacks[patience_stacks_count - 1];
+ lcs_count = patience_stacks_count;
+
+ /* uniques and patience_stacks are no longer needed. Backpointers are in atom->patience.prev_stack */
+ free(atom_pointers);
+ }
+
+ lcs = recallocarray(NULL, 0, lcs_count, sizeof(struct diff_atom*));
+ struct diff_atom **lcs_backtrace_pos = &lcs[lcs_count - 1];
+ struct diff_atom *atom;
+ for (atom = lcs_tail; atom; atom = atom->patience.prev_stack, lcs_backtrace_pos--) {
+ assert(lcs_backtrace_pos >= lcs);
+ *lcs_backtrace_pos = atom;
+ }
+
+ unsigned int i;
+ if (DEBUG) {
+ debug("\npatience LCS:\n");
+ for (i = 0; i < lcs_count; i++) {
+ debug_dump_atom(left, right, lcs[i]);
+ }
+ }
+
+
+ /* TODO: For each common-unique line found (now listed in lcs), swallow lines upwards and downwards that are
+ * identical on each side. Requires a way to represent atoms being glued to adjacent atoms. */
+
+ debug("\ntraverse LCS, possibly recursing:\n");
+
+ /* Now we have pinned positions in both files at which it makes sense to divide the diff problem into smaller
+ * chunks. Go into the next round: look at each section in turn, trying to again find common-unique lines in
+ * those smaller sections. As soon as no more are found, the remaining smaller sections are solved by Myers. */
+ unsigned int left_pos = 0;
+ unsigned int right_pos = 0;
+ for (i = 0; i <= lcs_count; i++) {
+ struct diff_atom *atom;
+ unsigned int left_idx;
+ unsigned int right_idx;
+
+ if (i < lcs_count) {
+ atom = lcs[i];
+ left_idx = diff_atom_idx(left, atom);
+ right_idx = diff_atom_idx(right, atom->patience.pos_in_other);
+ } else {
+ atom = NULL;
+ left_idx = left->atoms.len;
+ right_idx = right->atoms.len;
+ }
+
+ /* 'atom' now marks an atom that matches on both sides according to patience-diff
+ * (a common-unique identical atom in both files).
+ * Handle the section before and the atom itself; the section after will be handled by the next loop
+ * iteration -- note that i loops to last element + 1 ("i <= lcs_count"), so that there will be another
+ * final iteration to pick up the last remaining items after the last LCS atom.
+ * The sections before might also be empty on left and/or right.
+ * left_pos and right_pos mark the indexes of the first atoms that have not yet been handled in the
+ * previous loop iteration.
+ * left_idx and right_idx mark the indexes of the matching atom on left and right, respectively. */
+
+ debug("iteration %u left_pos %u left_idx %u right_pos %u right_idx %u\n",
+ i, left_pos, left_idx, right_pos, right_idx);
+
+ struct diff_chunk *chunk;
+
+ /* Section before the matching atom */
+ struct diff_atom *left_atom = &left->atoms.head[left_pos];
+ unsigned int left_section_len = left_idx - left_pos;
+
+ struct diff_atom *right_atom = &(right->atoms.head[right_pos]);
+ unsigned int right_section_len = right_idx - right_pos;
+
+ if (left_section_len && right_section_len) {
+ /* Record an unsolved chunk, the caller will apply inner_algo() on this chunk. */
+ if (!diff_state_add_chunk(state, false,
+ left_atom, left_section_len,
+ right_atom, right_section_len))
+ goto return_rc;
+ } else if (left_section_len && !right_section_len) {
+ /* Only left atoms and none on the right, they form a "minus" chunk, then. */
+ if (!diff_state_add_chunk(state, true,
+ left_atom, left_section_len,
+ right_atom, 0))
+ goto return_rc;
+ } else if (!left_section_len && right_section_len) {
+ /* No left atoms, only atoms on the right, they form a "plus" chunk, then. */
+ if (!diff_state_add_chunk(state, true,
+ left_atom, 0,
+ right_atom, right_section_len))
+ goto return_rc;
+ }
+ /* else: left_section_len == 0 and right_section_len == 0, i.e. nothing here. */
+
+ /* The atom found to match on both sides forms a chunk of equals on each side. In the very last
+ * iteration of this loop, there is no matching atom, we were just cleaning out the remaining lines. */
+ if (atom) {
+ if (!diff_state_add_chunk(state, true,
+ atom, 1,
+ atom->patience.pos_in_other, 1))
+ goto return_rc;
+ }
+
+ left_pos = left_idx + 1;
+ right_pos = right_idx + 1;
+ }
+ debug("** END %s\n", __func__);
+
+ rc = DIFF_RC_OK;
+
+return_rc:
+ free(lcs);
+ return rc;
+}
+
blob - /dev/null
blob + e8e14be5621f0f4536f2e0e42b7e9fb59d758cf4 (mode 644)
--- /dev/null
+++ lib/linux_Makefile
+srcs = \
+ diff_atomize_text.c \
+ diff_main.c \
+ diff_myers.c \
+ diff_patience.c \
+ diff_output.c \
+ diff_output_plain.c \
+ diff_output_unidiff.c \
+ $(END)
+
+objs = $(srcs:.c=.o)
+
+libdiff.a: $(objs)
+ ar rcs $@ $^
+
+CFLAGS = -fsanitize=address -fsanitize=undefined -g
+
+%.o: %.c ./*.h ../include/diff/*.h
+ gcc $(CFLAGS) -I../include -o $@ -c $<
+
+.PHONY: clean
+clean:
+ -rm $(objs)
+ -rm libdiff.a
blob - /dev/null
blob + b4a9acb0cdc71e2d714c76d3f510873470cb8236 (mode 644)
--- /dev/null
+++ man/diff.1
+.\" $OpenBSD$
+.\"
+.\" Copyright (c) 2018 Martin Pieuchot
+.\" Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
+.\"
+.\" Permission to use, copy, modify, and distribute this software for any
+.\" purpose with or without fee is hereby granted, provided that the above
+.\" copyright notice and this permission notice appear in all copies.
+.\"
+.\" THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+.\" WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+.\" MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+.\" ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+.\" WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+.\" ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+.\" OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+.\"
+.Dd $Mdocdate: August 28 2017 $
+.Dt DIFF 1
+.Os
+.Sh NAME
+.Nm diff
+.Nd compare files
+.Sh SYNOPSIS
+.Nm diff
+.Ar file1 file2
+.Sh DESCRIPTION
+The
+.Nm
+utility compares the contents of
+.Ar file1
+and
+.Ar file2
+line by line.
+.Sh EXIT STATUS
+The
+.Nm
+utility exits with one of the following values:
+.Pp
+.Bl -tag -width Ds -offset indent -compact
+.It 0
+No differences were found.
+.It 1
+Differences were found.
+.It >1
+An error occurred.
+.El
blob - /dev/null
blob + ad6ffbd9d12fc4f81e7e59ea3b4b7801c0b53887 (mode 644)
--- /dev/null
+++ test/Makefile
+.PHONY: test verify clean
+test: verify clean
+
+verify:
+ ./verify_all.sh
+clean:
+ -rm verify.*
blob - /dev/null
blob + 60764f10614644471b9cae1fdf7787246ef1ed92 (mode 644)
--- /dev/null
+++ test/README
+The test produces a diff, which is successful if it is able to reconstruct the
+original source files from it. It is not tested whether diff output is optimal
+or beautiful.
+
+Since different diff algorithms can produce different diff outputs, the
+expect*.diff files are merely provided for reference and are not part of the
+tests.
blob - /dev/null
blob + 48768c491c8dc456f94d9bd13e9a2f27590bbb0d (mode 644)
--- /dev/null
+++ test/expect001.diff
+--- test001.left.txt
++++ test001.right.txt
+-A
+-B
+ C
+-A
+ B
++A
+ B
+ A
++C
blob - /dev/null
blob + 5a409a01b7a7269424e11fd6a07a054e997dd3c5 (mode 644)
--- /dev/null
+++ test/expect002.diff
+--- test002.left.txt
++++ test002.right.txt
+-A
+-B
+ C
+-A
+ B
++A
+ B
+ A
++C
+ X
+-Y
+ Z
++Q
blob - /dev/null
blob + ff1c8b5c8d3c587b15fe9d9ec83ba6ceb516436e (mode 644)
--- /dev/null
+++ test/expect003.diff
+--- test003.left.txt
++++ test003.right.txt
+-a
++x
+ b
+ c
+-d
+-e
++y
blob - /dev/null
blob + 5989a134be2ab91b93f757600becacc8065b8f94 (mode 644)
--- /dev/null
+++ test/expect004.diff
+--- test004.left.txt
++++ test004.right.txt
++int Chunk_bounds_check(Chunk *chunk, size_t start, size_t n)
++{
++ if (chunk == NULL) return 0;
++
++ return start <= chunk->length && n <= chunk->length - start;
++}
++
+ void Chunk_copy(Chunk *src, size_t src_start, Chunk *dst, size_t dst_start, size_t n)
+ {
+ if (!Chunk_bounds_check(src, src_start, n)) return;
+ if (!Chunk_bounds_check(dst, dst_start, n)) return;
+
+ memcpy(dst->data + dst_start, src->data + src_start, n);
+ }
+-
+-int Chunk_bounds_check(Chunk *chunk, size_t start, size_t n)
+-{
+- if (chunk == NULL) return 0;
+-
+- return start <= chunk->length && n <= chunk->length - start;
+-}
blob - /dev/null
blob + 25e2e0bc35f05d8383e52fad77872be96de5dca0 (mode 644)
--- /dev/null
+++ test/expect005.diff
+--- test005.left.txt
++++ test005.right.txt
++The Slits
++Gil Scott Heron
+ David Axelrod
+ Electric Prunes
+-Gil Scott Heron
+-The Slits
+ Faust
+ The Sonics
+ The Sonics
blob - /dev/null
blob + ea0a5d1c20b159e7dad0631318ef5690875da3ba (mode 644)
--- /dev/null
+++ test/expect006.diff
+--- test006.left.txt
++++ test006.right.txt
+ Below is an example license to be used for new code in OpenBSD,
+ modeled after the ISC license.
+
+ It is important to specify the year of the copyright. Additional years
+ should be separated by a comma, e.g.
+- Copyright (c) 2003, 2004
++ Copyright (c) 2003, 2004, 2005
+
+ If you add extra text to the body of the license, be careful not to
+ add further restrictions.
+
+ /*
+ * Copyright (c) CCYY YOUR NAME HERE <user@your.dom.ain>
+ *
+- * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
++An extra line
blob - /dev/null
blob + 996246e7974d7b25bc76ee14c63114a241be4704 (mode 644)
--- /dev/null
+++ test/expect007.diff
+--- test007.left.txt
++++ test007.right.txt
+-x
++abcdx
blob - /dev/null
blob + c4f9c82c25819151690ffa3bcd44eea1b6e43159 (mode 644)
--- /dev/null
+++ test/expect008.diff
+--- test008.left.txt
++++ test008.right.txt
+ x
++a
++b
++c
++d
++x
blob - /dev/null
blob + 8f75e2ee8130368c1ef12eb5b35b99b786f49ec1 (mode 644)
--- /dev/null
+++ test/expect009.diff
+--- test009.left.txt
++++ test009.right.txt
+ x
+ a
+ b
++c
++d
++e
++f
++x
++a
++b
blob - /dev/null
blob + fd113b0f71509c767b9a4e7d4cc2f70f9d41c459 (mode 644)
--- /dev/null
+++ test/test001.left.txt
+A
+B
+C
+A
+B
+B
+A
blob - /dev/null
blob + 0075e6d23de0d19b7f870a3e82b9b310001b3db9 (mode 644)
--- /dev/null
+++ test/test001.right.txt
+C
+B
+A
+B
+A
+C
blob - /dev/null
blob + de77f56f2d47f324d97bd0e8968589acd31a54d6 (mode 644)
--- /dev/null
+++ test/test002.left.txt
+A
+B
+C
+A
+B
+B
+A
+X
+Y
+Z
blob - /dev/null
blob + a9b4b8b851ddd7eba0955da5c7cd1bf316b7da83 (mode 644)
--- /dev/null
+++ test/test002.right.txt
+C
+B
+A
+B
+A
+C
+X
+Z
+Q
blob - /dev/null
blob + 940532533944dd159bfd11136fac2ee35872de38 (mode 644)
--- /dev/null
+++ test/test003.left.txt
+a
+b
+c
+d
+e
blob - /dev/null
blob + d9f266e145742427cb8635edb7343b2d7cd33b92 (mode 644)
--- /dev/null
+++ test/test003.right.txt
+x
+b
+c
+y
blob - /dev/null
blob + 72d95cf6331a2a7f35baa19863e6dd0250112e23 (mode 644)
--- /dev/null
+++ test/test004.left.txt
+void Chunk_copy(Chunk *src, size_t src_start, Chunk *dst, size_t dst_start, size_t n)
+{
+ if (!Chunk_bounds_check(src, src_start, n)) return;
+ if (!Chunk_bounds_check(dst, dst_start, n)) return;
+
+ memcpy(dst->data + dst_start, src->data + src_start, n);
+}
+
+int Chunk_bounds_check(Chunk *chunk, size_t start, size_t n)
+{
+ if (chunk == NULL) return 0;
+
+ return start <= chunk->length && n <= chunk->length - start;
+}
blob - /dev/null
blob + cfe8f2ed6be03213c89877245b9ce3da3a4802d8 (mode 644)
--- /dev/null
+++ test/test004.right.txt
+int Chunk_bounds_check(Chunk *chunk, size_t start, size_t n)
+{
+ if (chunk == NULL) return 0;
+
+ return start <= chunk->length && n <= chunk->length - start;
+}
+
+void Chunk_copy(Chunk *src, size_t src_start, Chunk *dst, size_t dst_start, size_t n)
+{
+ if (!Chunk_bounds_check(src, src_start, n)) return;
+ if (!Chunk_bounds_check(dst, dst_start, n)) return;
+
+ memcpy(dst->data + dst_start, src->data + src_start, n);
+}
blob - /dev/null
blob + b77fc41c4c871564f41f74ebc9f4a07710681b3e (mode 644)
--- /dev/null
+++ test/test005.left.txt
+David Axelrod
+Electric Prunes
+Gil Scott Heron
+The Slits
+Faust
+The Sonics
+The Sonics
blob - /dev/null
blob + 2480cecfba48d082c6c9c52ba386499542d3e2aa (mode 644)
--- /dev/null
+++ test/test005.right.txt
+The Slits
+Gil Scott Heron
+David Axelrod
+Electric Prunes
+Faust
+The Sonics
+The Sonics
blob - /dev/null
blob + e01fce04d5ef4776a326a510558cfe85af41a22a (mode 644)
--- /dev/null
+++ test/test006.left.txt
+Below is an example license to be used for new code in OpenBSD,
+modeled after the ISC license.
+
+It is important to specify the year of the copyright. Additional years
+should be separated by a comma, e.g.
+ Copyright (c) 2003, 2004
+
+If you add extra text to the body of the license, be careful not to
+add further restrictions.
+
+/*
+ * Copyright (c) CCYY YOUR NAME HERE <user@your.dom.ain>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
blob - /dev/null
blob + 40a0f253dd2e3503d295711fcfe09427f52219b5 (mode 644)
--- /dev/null
+++ test/test006.right.txt
+Below is an example license to be used for new code in OpenBSD,
+modeled after the ISC license.
+
+It is important to specify the year of the copyright. Additional years
+should be separated by a comma, e.g.
+ Copyright (c) 2003, 2004, 2005
+
+If you add extra text to the body of the license, be careful not to
+add further restrictions.
+
+/*
+ * Copyright (c) CCYY YOUR NAME HERE <user@your.dom.ain>
+ *
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+An extra line
blob - /dev/null
blob + 587be6b4c3f93f93c489c0111bba5596147a26cb (mode 644)
--- /dev/null
+++ test/test007.left.txt
+x
blob - /dev/null
blob + 47b0ccb04734aa02fca7a524d6fbc171756a6a5b (mode 644)
--- /dev/null
+++ test/test007.right.txt
+abcdx
blob - /dev/null
blob + 587be6b4c3f93f93c489c0111bba5596147a26cb (mode 644)
--- /dev/null
+++ test/test008.left.txt
+x
blob - /dev/null
blob + 70b67eef4500e1d108c59c18653e56159e5c60ad (mode 644)
--- /dev/null
+++ test/test008.right.txt
+x
+a
+b
+c
+d
+x
blob - /dev/null
blob + e4e5238f8e5d3a5deb0169f1795d18a9a1eebe68 (mode 644)
--- /dev/null
+++ test/test009.left.txt
+x
+a
+b
blob - /dev/null
blob + 3a4386800dba971814434a508e6de9867ebc7abc (mode 644)
--- /dev/null
+++ test/test009.right.txt
+x
+a
+b
+c
+d
+e
+f
+x
+a
+b
blob - /dev/null
blob + 4a0ec1a64add5fccad7d01d0856e38be06629717 (mode 755)
--- /dev/null
+++ test/verify_all.sh
+#!/bin/sh
+
+diff_prog="../diff/diff"
+
+diff_type=plain
+
+verify_diff_script() {
+ orig_left="$1"
+ orig_right="$2"
+ the_diff="$3"
+
+ verify_left="verify.$orig_left"
+ verify_right="verify.$orig_right"
+
+ if [ "x$diff_type" = "xunidiff" ]; then
+ cp "$orig_left" "$verify_right"
+ patch --quiet -u "$verify_right" "$the_diff"
+ if ! cmp "$orig_right" "$verify_right" ; then
+ echo "FAIL: $orig_right != $verify_right"
+ return 1
+ fi
+
+ cp "$orig_right" "$verify_left"
+ patch --quiet -u -R "$verify_left" "$the_diff"
+ if ! cmp "$orig_left" "$verify_left" ; then
+ echo "FAIL: $orig_left != $verify_left"
+ return 1
+ fi
+ else
+ tail -n +3 "$the_diff" | grep -v "^+" | sed 's/^.//' > "$verify_left"
+ tail -n +3 "$the_diff" | grep -v "^-" | sed 's/^.//' > "$verify_right"
+
+ if ! cmp "$orig_left" "$verify_left" ; then
+ echo "FAIL: $orig_left != $verify_left"
+ return 1
+ fi
+ if ! cmp "$orig_right" "$verify_right" ; then
+ echo "FAIL: $orig_right != $verify_right"
+ return 1
+ fi
+ fi
+ echo "OK: $diff_prog $orig_left $orig_right"
+ return 0
+}
+
+for left in test*.left.* ; do
+ right="$(echo "$left" | sed 's/\.left\./.right./')"
+ expected_diff="$(echo "$left" | sed 's/test\([0-9]*\)\..*/expect\1.diff/')"
+ got_diff="verify.$expected_diff"
+
+ "$diff_prog" "$left" "$right" > "$got_diff"
+
+ set -e
+ verify_diff_script "$left" "$right" "$got_diff"
+ set +e
+done