1 /* Generic infrastructure to implement various diff algorithms (implementation). */
3 * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
5 * Permission to use, copy, modify, and distribute this software for any
6 * purpose with or without fee is hereby granted, provided that the above
7 * copyright notice and this permission notice appear in all copies.
9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
19 #include <sys/queue.h>
32 #include <arraylist.h>
33 #include <diff_main.h>
35 #include "diff_internal.h"
36 #include "diff_debug.h"
38 inline enum diff_chunk_type
39 diff_chunk_type(const struct diff_chunk *chunk)
41 if (!chunk->left_count && !chunk->right_count)
45 if (!chunk->right_count)
47 if (!chunk->left_count)
49 if (chunk->left_count != chunk->right_count)
55 read_at(FILE *f, off_t at_pos, unsigned char *buf, size_t len)
58 if (fseeko(f, at_pos, SEEK_SET) == -1)
60 r = fread(buf, sizeof(char), len, f);
61 if ((r == 0 || r < len) && ferror(f))
69 buf_cmp(const unsigned char *left, size_t left_len,
70 const unsigned char *right, size_t right_len,
71 bool ignore_whitespace)
75 if (ignore_whitespace) {
77 while (il < left_len && ir < right_len) {
78 unsigned char cl = left[il];
79 unsigned char cr = right[ir];
81 if (isspace((unsigned char)cl) && il < left_len) {
85 if (isspace((unsigned char)cr) && ir < right_len) {
97 while (il < left_len) {
98 unsigned char cl = left[il++];
99 if (!isspace((unsigned char)cl))
102 while (ir < right_len) {
103 unsigned char cr = right[ir++];
104 if (!isspace((unsigned char)cr))
111 cmp = memcmp(left, right, MIN(left_len, right_len));
114 if (left_len == right_len)
116 return (left_len > right_len) ? 1 : -1;
120 diff_atom_cmp(int *cmp,
121 const struct diff_atom *left,
122 const struct diff_atom *right)
124 off_t remain_left, remain_right;
125 int flags = (left->root->diff_flags | right->root->diff_flags);
126 bool ignore_whitespace = (flags & DIFF_FLAG_IGNORE_WHITESPACE);
128 if (!left->len && !right->len) {
132 if (!ignore_whitespace) {
143 if (left->at != NULL && right->at != NULL) {
144 *cmp = buf_cmp(left->at, left->len, right->at, right->len,
149 remain_left = left->len;
150 remain_right = right->len;
151 while (remain_left > 0 || remain_right > 0) {
152 const size_t chunksz = 8192;
153 unsigned char buf_left[chunksz], buf_right[chunksz];
154 const uint8_t *p_left, *p_right;
155 off_t n_left, n_right;
167 n_left = MIN(chunksz, remain_left);
168 n_right = MIN(chunksz, remain_right);
170 if (left->at == NULL) {
171 r = read_at(left->root->f,
172 left->pos + (left->len - remain_left),
180 p_left = left->at + (left->len - remain_left);
183 if (right->at == NULL) {
184 r = read_at(right->root->f,
185 right->pos + (right->len - remain_right),
193 p_right = right->at + (right->len - remain_right);
196 r = buf_cmp(p_left, n_left, p_right, n_right,
203 remain_left -= n_left;
204 remain_right -= n_right;
212 diff_atom_same(bool *same,
213 const struct diff_atom *left,
214 const struct diff_atom *right)
218 if (left->hash != right->hash) {
222 r = diff_atom_cmp(&cmp, left, right);
231 static struct diff_chunk *
232 diff_state_add_solved_chunk(struct diff_state *state,
233 const struct diff_chunk *chunk)
235 diff_chunk_arraylist_t *result;
236 struct diff_chunk *new_chunk;
237 enum diff_chunk_type last_t;
238 enum diff_chunk_type new_t;
239 struct diff_chunk *last;
241 /* Append to solved chunks; make sure that adjacent chunks of same type are combined, and that a minus chunk
242 * never directly follows a plus chunk. */
243 result = &state->result->chunks;
245 last_t = result->len ? diff_chunk_type(&result->head[result->len - 1])
247 new_t = diff_chunk_type(chunk);
249 debug("ADD %s chunk #%u:\n", chunk->solved ? "solved" : "UNSOLVED",
252 debug_dump_atoms(&state->left, chunk->left_start, chunk->left_count);
254 debug_dump_atoms(&state->right, chunk->right_start, chunk->right_count);
257 last = &result->head[result->len - 1];
258 assert(chunk->left_start
259 == last->left_start + last->left_count);
260 assert(chunk->right_start
261 == last->right_start + last->right_count);
264 if (new_t == last_t) {
265 new_chunk = &result->head[result->len - 1];
266 new_chunk->left_count += chunk->left_count;
267 new_chunk->right_count += chunk->right_count;
268 debug(" - added chunk touches previous one of same type, joined:\n");
270 debug_dump_atoms(&state->left, new_chunk->left_start, new_chunk->left_count);
272 debug_dump_atoms(&state->right, new_chunk->right_start, new_chunk->right_count);
273 } else if (last_t == CHUNK_PLUS && new_t == CHUNK_MINUS) {
274 enum diff_chunk_type prev_last_t =
276 diff_chunk_type(&result->head[result->len - 2])
278 /* If a minus-chunk follows a plus-chunk, place it above the plus-chunk->
279 * Is the one before that also a minus? combine. */
280 if (prev_last_t == CHUNK_MINUS) {
281 new_chunk = &result->head[result->len - 2];
282 new_chunk->left_count += chunk->left_count;
283 new_chunk->right_count += chunk->right_count;
285 debug(" - added minus-chunk follows plus-chunk,"
286 " put before that plus-chunk and joined"
287 " with preceding minus-chunk:\n");
289 debug_dump_atoms(&state->left, new_chunk->left_start, new_chunk->left_count);
291 debug_dump_atoms(&state->right, new_chunk->right_start, new_chunk->right_count);
293 ARRAYLIST_INSERT(new_chunk, *result, result->len - 1);
298 /* The new minus chunk indicates to which position on
299 * the right it corresponds, even though it doesn't add
300 * any lines on the right. By moving above a plus chunk,
301 * that position on the right has shifted. */
302 last = &result->head[result->len - 1];
303 new_chunk->right_start = last->right_start;
305 debug(" - added minus-chunk follows plus-chunk,"
306 " put before that plus-chunk\n");
309 /* That last_t == CHUNK_PLUS indicates to which position on the
310 * left it corresponds, even though it doesn't add any lines on
311 * the left. By inserting/extending the prev_last_t ==
312 * CHUNK_MINUS, that position on the left has shifted. */
313 last = &result->head[result->len - 1];
314 last->left_start = new_chunk->left_start
315 + new_chunk->left_count;
318 ARRAYLIST_ADD(new_chunk, *result);
326 /* Even if a left or right side is empty, diff output may need to know the
327 * position in that file.
328 * So left_start or right_start must never be NULL -- pass left_count or
329 * right_count as zero to indicate staying at that position without consuming
332 diff_state_add_chunk(struct diff_state *state, bool solved,
333 struct diff_atom *left_start, unsigned int left_count,
334 struct diff_atom *right_start, unsigned int right_count)
336 struct diff_chunk *new_chunk;
337 struct diff_chunk chunk = {
339 .left_start = left_start,
340 .left_count = left_count,
341 .right_start = right_start,
342 .right_count = right_count,
345 /* An unsolved chunk means store as intermediate result for later
347 * If there already are intermediate results, that means even a
348 * following solved chunk needs to go to intermediate results, so that
349 * it is later put in the final correct position in solved chunks.
351 if (!solved || state->temp_result.len) {
352 /* Append to temp_result */
353 debug("ADD %s chunk to temp result:\n",
354 chunk.solved ? "solved" : "UNSOLVED");
356 debug_dump_atoms(&state->left, left_start, left_count);
358 debug_dump_atoms(&state->right, right_start, right_count);
359 ARRAYLIST_ADD(new_chunk, state->temp_result);
366 return diff_state_add_solved_chunk(state, &chunk);
370 diff_data_init_root(struct diff_data *d, FILE *f, const uint8_t *data,
371 unsigned long long len, int diff_flags)
373 *d = (struct diff_data){
379 .diff_flags = diff_flags,
384 diff_data_init_subsection(struct diff_data *d, struct diff_data *parent,
385 struct diff_atom *from_atom, unsigned int atoms_count)
387 struct diff_atom *last_atom;
389 debug("diff_data %p parent %p from_atom %p atoms_count %u\n",
390 d, parent, from_atom, atoms_count);
391 debug(" from_atom ");
392 debug_dump_atom(parent, NULL, from_atom);
394 if (atoms_count == 0) {
395 *d = (struct diff_data){
400 .root = parent->root,
402 .atoms.len = atoms_count,
408 last_atom = from_atom + atoms_count - 1;
409 *d = (struct diff_data){
411 .pos = from_atom->pos,
412 .data = from_atom->at,
413 .len = (last_atom->pos + last_atom->len) - from_atom->pos,
414 .root = parent->root,
415 .atoms.head = from_atom,
416 .atoms.len = atoms_count,
419 debug("subsection:\n");
424 diff_data_free(struct diff_data *diff_data)
428 if (diff_data->atoms.allocated)
429 ARRAYLIST_FREE(diff_data->atoms);
433 diff_algo_none(const struct diff_algo_config *algo_config,
434 struct diff_state *state)
436 debug("\n** %s\n", __func__);
438 debug_dump(&state->left);
440 debug_dump(&state->right);
441 debug_dump_myers_graph(&state->left, &state->right, NULL, NULL, 0, NULL,
444 /* Add a chunk of equal lines, if any */
445 struct diff_atom *l = state->left.atoms.head;
446 unsigned int l_len = state->left.atoms.len;
447 struct diff_atom *r = state->right.atoms.head;
448 unsigned int r_len = state->right.atoms.len;
449 unsigned int equal_atoms_start = 0;
450 unsigned int equal_atoms_end = 0;
451 unsigned int l_idx = 0;
452 unsigned int r_idx = 0;
454 while (equal_atoms_start < l_len
455 && equal_atoms_start < r_len) {
458 err = diff_atom_same(&same, &l[equal_atoms_start],
459 &r[equal_atoms_start]);
466 while (equal_atoms_end < (l_len - equal_atoms_start)
467 && equal_atoms_end < (r_len - equal_atoms_start)) {
470 err = diff_atom_same(&same, &l[l_len - 1 - equal_atoms_end],
471 &r[r_len - 1 - equal_atoms_end]);
479 /* Add a chunk of equal lines at the start */
480 if (equal_atoms_start) {
481 if (!diff_state_add_chunk(state, true,
482 l, equal_atoms_start,
483 r, equal_atoms_start))
485 l_idx += equal_atoms_start;
486 r_idx += equal_atoms_start;
489 /* Add a "minus" chunk with all lines from the left. */
490 if (equal_atoms_start + equal_atoms_end < l_len) {
491 unsigned int add_len = l_len - equal_atoms_start - equal_atoms_end;
492 if (!diff_state_add_chunk(state, true,
499 /* Add a "plus" chunk with all lines from the right. */
500 if (equal_atoms_start + equal_atoms_end < r_len) {
501 unsigned int add_len = r_len - equal_atoms_start - equal_atoms_end;
502 if (!diff_state_add_chunk(state, true,
509 /* Add a chunk of equal lines at the end */
510 if (equal_atoms_end) {
511 if (!diff_state_add_chunk(state, true,
512 &l[l_idx], equal_atoms_end,
513 &r[r_idx], equal_atoms_end))
521 diff_run_algo(const struct diff_algo_config *algo_config,
522 struct diff_state *state)
526 if (!algo_config || !algo_config->impl
527 || !state->recursion_depth_left
528 || !state->left.atoms.len || !state->right.atoms.len) {
529 debug("Fall back to diff_algo_none():%s%s%s\n",
530 (!algo_config || !algo_config->impl) ? " no-cfg" : "",
531 (!state->recursion_depth_left) ? " max-depth" : "",
532 (!state->left.atoms.len || !state->right.atoms.len)?
534 return diff_algo_none(algo_config, state);
537 ARRAYLIST_FREE(state->temp_result);
538 ARRAYLIST_INIT(state->temp_result, DIFF_RESULT_ALLOC_BLOCKSIZE);
539 rc = algo_config->impl(algo_config, state);
541 case DIFF_RC_USE_DIFF_ALGO_FALLBACK:
542 debug("Got DIFF_RC_USE_DIFF_ALGO_FALLBACK (%p)\n",
543 algo_config->fallback_algo);
544 rc = diff_run_algo(algo_config->fallback_algo, state);
552 /* some error happened */
556 /* Pick up any diff chunks that are still unsolved and feed to
557 * inner_algo. inner_algo will solve unsolved chunks and append to
558 * result, and subsequent solved chunks on this level are then appended
559 * to result afterwards. */
561 for (i = 0; i < state->temp_result.len; i++) {
562 struct diff_chunk *c = &state->temp_result.head[i];
564 diff_state_add_solved_chunk(state, c);
568 /* c is an unsolved chunk, feed to inner_algo */
569 struct diff_state inner_state = {
570 .result = state->result,
571 .recursion_depth_left = state->recursion_depth_left - 1,
572 .kd_buf = state->kd_buf,
573 .kd_buf_size = state->kd_buf_size,
575 diff_data_init_subsection(&inner_state.left, &state->left,
576 c->left_start, c->left_count);
577 diff_data_init_subsection(&inner_state.right, &state->right,
578 c->right_start, c->right_count);
580 rc = diff_run_algo(algo_config->inner_algo, &inner_state);
581 state->kd_buf = inner_state.kd_buf;
582 state->kd_buf_size = inner_state.kd_buf_size;
583 if (rc != DIFF_RC_OK)
589 ARRAYLIST_FREE(state->temp_result);
594 diff_atomize_file(struct diff_data *d,
595 const struct diff_config *config,
596 FILE *f, const uint8_t *data, off_t len, int diff_flags)
598 if (!config->atomize_func)
601 diff_data_init_root(d, f, data, len, diff_flags);
603 return config->atomize_func(config->atomize_func_data, d);
608 diff_main(const struct diff_config *config, struct diff_data *left,
609 struct diff_data *right)
611 struct diff_result *result = malloc(sizeof(struct diff_result));
615 *result = (struct diff_result){};
617 result->right = right;
619 struct diff_state state = {
621 .recursion_depth_left = config->max_recursion_depth ?
622 config->max_recursion_depth : UINT_MAX,
626 diff_data_init_subsection(&state.left, left,
629 diff_data_init_subsection(&state.right, right,
633 result->rc = diff_run_algo(config->algo, &state);
640 diff_result_free(struct diff_result *result)
644 ARRAYLIST_FREE(result->chunks);
649 diff_result_contains_printable_chunks(struct diff_result *result)
651 struct diff_chunk *c;
652 enum diff_chunk_type t;
655 for (i = 0; i < result->chunks.len; i++) {
656 c = &result->chunks.head[i];
657 t = diff_chunk_type(c);
658 if (t == CHUNK_MINUS || t == CHUNK_PLUS)