Blame


1 3b0f3d61 2020-01-22 neels /* Generic infrastructure to implement various diff algorithms (implementation). */
2 3b0f3d61 2020-01-22 neels /*
3 3b0f3d61 2020-01-22 neels * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
4 3b0f3d61 2020-01-22 neels *
5 3b0f3d61 2020-01-22 neels * Permission to use, copy, modify, and distribute this software for any
6 3b0f3d61 2020-01-22 neels * purpose with or without fee is hereby granted, provided that the above
7 3b0f3d61 2020-01-22 neels * copyright notice and this permission notice appear in all copies.
8 3b0f3d61 2020-01-22 neels *
9 3b0f3d61 2020-01-22 neels * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10 3b0f3d61 2020-01-22 neels * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11 3b0f3d61 2020-01-22 neels * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12 3b0f3d61 2020-01-22 neels * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13 3b0f3d61 2020-01-22 neels * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14 3b0f3d61 2020-01-22 neels * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15 3b0f3d61 2020-01-22 neels * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16 3b0f3d61 2020-01-22 neels */
17 3b0f3d61 2020-01-22 neels
18 3b0f3d61 2020-01-22 neels
19 3b0f3d61 2020-01-22 neels #include <sys/queue.h>
20 732e8ee0 2020-09-20 stsp #include <ctype.h>
21 e10a628a 2020-09-16 stsp #include <errno.h>
22 3b0f3d61 2020-01-22 neels #include <stdint.h>
23 3b0f3d61 2020-01-22 neels #include <stdlib.h>
24 3b0f3d61 2020-01-22 neels #include <stdbool.h>
25 3b0f3d61 2020-01-22 neels #include <stdio.h>
26 3b0f3d61 2020-01-22 neels #include <string.h>
27 3b0f3d61 2020-01-22 neels #include <limits.h>
28 c6eecea3 2020-07-26 stsp #include <unistd.h>
29 3b0f3d61 2020-01-22 neels
30 3b0f3d61 2020-01-22 neels #include <assert.h>
31 3b0f3d61 2020-01-22 neels
32 1dfba055 2020-10-07 stsp #include <arraylist.h>
33 1dfba055 2020-10-07 stsp #include <diff_main.h>
34 3b0f3d61 2020-01-22 neels
35 85ab4559 2020-09-22 stsp #include "diff_internal.h"
36 2a1b94d0 2020-09-26 stsp #include "diff_debug.h"
37 e78a8d73 2023-02-20 mark
38 e78a8d73 2023-02-20 mark inline enum diff_chunk_type
39 e78a8d73 2023-02-20 mark diff_chunk_type(const struct diff_chunk *chunk)
40 e78a8d73 2023-02-20 mark {
41 e78a8d73 2023-02-20 mark if (!chunk->left_count && !chunk->right_count)
42 e78a8d73 2023-02-20 mark return CHUNK_EMPTY;
43 e78a8d73 2023-02-20 mark if (!chunk->solved)
44 e78a8d73 2023-02-20 mark return CHUNK_ERROR;
45 e78a8d73 2023-02-20 mark if (!chunk->right_count)
46 e78a8d73 2023-02-20 mark return CHUNK_MINUS;
47 e78a8d73 2023-02-20 mark if (!chunk->left_count)
48 e78a8d73 2023-02-20 mark return CHUNK_PLUS;
49 e78a8d73 2023-02-20 mark if (chunk->left_count != chunk->right_count)
50 e78a8d73 2023-02-20 mark return CHUNK_ERROR;
51 e78a8d73 2023-02-20 mark return CHUNK_SAME;
52 e78a8d73 2023-02-20 mark }
53 c6eecea3 2020-07-26 stsp
54 b3fb4686 2020-09-20 neels static int
55 7a54ad3a 2020-09-20 stsp read_at(FILE *f, off_t at_pos, unsigned char *buf, size_t len)
56 c6eecea3 2020-07-26 stsp {
57 b3fb4686 2020-09-20 neels int r;
58 7a54ad3a 2020-09-20 stsp if (fseeko(f, at_pos, SEEK_SET) == -1)
59 b3fb4686 2020-09-20 neels return errno;
60 7a54ad3a 2020-09-20 stsp r = fread(buf, sizeof(char), len, f);
61 7a54ad3a 2020-09-20 stsp if ((r == 0 || r < len) && ferror(f))
62 674563ab 2022-10-11 tj return EIO;
63 b3fb4686 2020-09-20 neels if (r != len)
64 b3fb4686 2020-09-20 neels return EIO;
65 b3fb4686 2020-09-20 neels return 0;
66 b3fb4686 2020-09-20 neels }
67 b3fb4686 2020-09-20 neels
68 b3fb4686 2020-09-20 neels static int
69 b3fb4686 2020-09-20 neels buf_cmp(const unsigned char *left, size_t left_len,
70 732e8ee0 2020-09-20 stsp const unsigned char *right, size_t right_len,
71 732e8ee0 2020-09-20 stsp bool ignore_whitespace)
72 b3fb4686 2020-09-20 neels {
73 732e8ee0 2020-09-20 stsp int cmp;
74 732e8ee0 2020-09-20 stsp
75 732e8ee0 2020-09-20 stsp if (ignore_whitespace) {
76 732e8ee0 2020-09-20 stsp int il = 0, ir = 0;
77 732e8ee0 2020-09-20 stsp while (il < left_len && ir < right_len) {
78 732e8ee0 2020-09-20 stsp unsigned char cl = left[il];
79 732e8ee0 2020-09-20 stsp unsigned char cr = right[ir];
80 732e8ee0 2020-09-20 stsp
81 1dce05e8 2022-11-17 op if (isspace((unsigned char)cl) && il < left_len) {
82 732e8ee0 2020-09-20 stsp il++;
83 732e8ee0 2020-09-20 stsp continue;
84 732e8ee0 2020-09-20 stsp }
85 1dce05e8 2022-11-17 op if (isspace((unsigned char)cr) && ir < right_len) {
86 732e8ee0 2020-09-20 stsp ir++;
87 732e8ee0 2020-09-20 stsp continue;
88 732e8ee0 2020-09-20 stsp }
89 732e8ee0 2020-09-20 stsp
90 732e8ee0 2020-09-20 stsp if (cl > cr)
91 732e8ee0 2020-09-20 stsp return 1;
92 732e8ee0 2020-09-20 stsp if (cr > cl)
93 732e8ee0 2020-09-20 stsp return -1;
94 732e8ee0 2020-09-20 stsp il++;
95 732e8ee0 2020-09-20 stsp ir++;
96 732e8ee0 2020-09-20 stsp }
97 732e8ee0 2020-09-20 stsp while (il < left_len) {
98 732e8ee0 2020-09-20 stsp unsigned char cl = left[il++];
99 1dce05e8 2022-11-17 op if (!isspace((unsigned char)cl))
100 732e8ee0 2020-09-20 stsp return 1;
101 732e8ee0 2020-09-20 stsp }
102 732e8ee0 2020-09-20 stsp while (ir < right_len) {
103 732e8ee0 2020-09-20 stsp unsigned char cr = right[ir++];
104 1dce05e8 2022-11-17 op if (!isspace((unsigned char)cr))
105 732e8ee0 2020-09-20 stsp return -1;
106 732e8ee0 2020-09-20 stsp }
107 732e8ee0 2020-09-20 stsp
108 732e8ee0 2020-09-20 stsp return 0;
109 732e8ee0 2020-09-20 stsp }
110 732e8ee0 2020-09-20 stsp
111 732e8ee0 2020-09-20 stsp cmp = memcmp(left, right, MIN(left_len, right_len));
112 b3fb4686 2020-09-20 neels if (cmp)
113 b3fb4686 2020-09-20 neels return cmp;
114 b3fb4686 2020-09-20 neels if (left_len == right_len)
115 b3fb4686 2020-09-20 neels return 0;
116 b3fb4686 2020-09-20 neels return (left_len > right_len) ? 1 : -1;
117 b3fb4686 2020-09-20 neels }
118 b3fb4686 2020-09-20 neels
119 b3fb4686 2020-09-20 neels int
120 b3fb4686 2020-09-20 neels diff_atom_cmp(int *cmp,
121 b3fb4686 2020-09-20 neels const struct diff_atom *left,
122 b3fb4686 2020-09-20 neels const struct diff_atom *right)
123 b3fb4686 2020-09-20 neels {
124 c6eecea3 2020-07-26 stsp off_t remain_left, remain_right;
125 ad5b3f85 2020-10-12 neels int flags = (left->root->diff_flags | right->root->diff_flags);
126 00d5652b 2020-09-22 stsp bool ignore_whitespace = (flags & DIFF_FLAG_IGNORE_WHITESPACE);
127 c6eecea3 2020-07-26 stsp
128 74ad6a69 2020-10-22 neels if (!left->len && !right->len) {
129 74ad6a69 2020-10-22 neels *cmp = 0;
130 74ad6a69 2020-10-22 neels return 0;
131 74ad6a69 2020-10-22 neels }
132 732e8ee0 2020-09-20 stsp if (!ignore_whitespace) {
133 732e8ee0 2020-09-20 stsp if (!right->len) {
134 732e8ee0 2020-09-20 stsp *cmp = 1;
135 732e8ee0 2020-09-20 stsp return 0;
136 732e8ee0 2020-09-20 stsp }
137 732e8ee0 2020-09-20 stsp if (!left->len) {
138 732e8ee0 2020-09-20 stsp *cmp = -1;
139 732e8ee0 2020-09-20 stsp return 0;
140 732e8ee0 2020-09-20 stsp }
141 b3fb4686 2020-09-20 neels }
142 c6eecea3 2020-07-26 stsp
143 b3fb4686 2020-09-20 neels if (left->at != NULL && right->at != NULL) {
144 732e8ee0 2020-09-20 stsp *cmp = buf_cmp(left->at, left->len, right->at, right->len,
145 732e8ee0 2020-09-20 stsp ignore_whitespace);
146 b3fb4686 2020-09-20 neels return 0;
147 b3fb4686 2020-09-20 neels }
148 c6eecea3 2020-07-26 stsp
149 c6eecea3 2020-07-26 stsp remain_left = left->len;
150 c6eecea3 2020-07-26 stsp remain_right = right->len;
151 c6eecea3 2020-07-26 stsp while (remain_left > 0 || remain_right > 0) {
152 c6eecea3 2020-07-26 stsp const size_t chunksz = 8192;
153 c6eecea3 2020-07-26 stsp unsigned char buf_left[chunksz], buf_right[chunksz];
154 c6eecea3 2020-07-26 stsp const uint8_t *p_left, *p_right;
155 c6eecea3 2020-07-26 stsp off_t n_left, n_right;
156 c6eecea3 2020-07-26 stsp ssize_t r;
157 c6eecea3 2020-07-26 stsp
158 b3fb4686 2020-09-20 neels if (!remain_right) {
159 b3fb4686 2020-09-20 neels *cmp = 1;
160 b3fb4686 2020-09-20 neels return 0;
161 b3fb4686 2020-09-20 neels }
162 b3fb4686 2020-09-20 neels if (!remain_left) {
163 b3fb4686 2020-09-20 neels *cmp = -1;
164 b3fb4686 2020-09-20 neels return 0;
165 b3fb4686 2020-09-20 neels }
166 b3fb4686 2020-09-20 neels
167 c6eecea3 2020-07-26 stsp n_left = MIN(chunksz, remain_left);
168 c6eecea3 2020-07-26 stsp n_right = MIN(chunksz, remain_right);
169 c6eecea3 2020-07-26 stsp
170 c6eecea3 2020-07-26 stsp if (left->at == NULL) {
171 ad5b3f85 2020-10-12 neels r = read_at(left->root->f,
172 b3fb4686 2020-09-20 neels left->pos + (left->len - remain_left),
173 b3fb4686 2020-09-20 neels buf_left, n_left);
174 b3fb4686 2020-09-20 neels if (r) {
175 b3fb4686 2020-09-20 neels *cmp = 0;
176 b3fb4686 2020-09-20 neels return r;
177 b3fb4686 2020-09-20 neels }
178 c6eecea3 2020-07-26 stsp p_left = buf_left;
179 c6eecea3 2020-07-26 stsp } else {
180 c6eecea3 2020-07-26 stsp p_left = left->at + (left->len - remain_left);
181 c6eecea3 2020-07-26 stsp }
182 c6eecea3 2020-07-26 stsp
183 c6eecea3 2020-07-26 stsp if (right->at == NULL) {
184 ad5b3f85 2020-10-12 neels r = read_at(right->root->f,
185 b3fb4686 2020-09-20 neels right->pos + (right->len - remain_right),
186 b3fb4686 2020-09-20 neels buf_right, n_right);
187 b3fb4686 2020-09-20 neels if (r) {
188 b3fb4686 2020-09-20 neels *cmp = 0;
189 b3fb4686 2020-09-20 neels return r;
190 b3fb4686 2020-09-20 neels }
191 c6eecea3 2020-07-26 stsp p_right = buf_right;
192 c6eecea3 2020-07-26 stsp } else {
193 c6eecea3 2020-07-26 stsp p_right = right->at + (right->len - remain_right);
194 c6eecea3 2020-07-26 stsp }
195 b3fb4686 2020-09-20 neels
196 732e8ee0 2020-09-20 stsp r = buf_cmp(p_left, n_left, p_right, n_right,
197 732e8ee0 2020-09-20 stsp ignore_whitespace);
198 b3fb4686 2020-09-20 neels if (r) {
199 b3fb4686 2020-09-20 neels *cmp = r;
200 b3fb4686 2020-09-20 neels return 0;
201 c6eecea3 2020-07-26 stsp }
202 c6eecea3 2020-07-26 stsp
203 c6eecea3 2020-07-26 stsp remain_left -= n_left;
204 c6eecea3 2020-07-26 stsp remain_right -= n_right;
205 c6eecea3 2020-07-26 stsp }
206 3b0f3d61 2020-01-22 neels
207 b3fb4686 2020-09-20 neels *cmp = 0;
208 b3fb4686 2020-09-20 neels return 0;
209 b3fb4686 2020-09-20 neels }
210 b3fb4686 2020-09-20 neels
211 b3fb4686 2020-09-20 neels int
212 b3fb4686 2020-09-20 neels diff_atom_same(bool *same,
213 b3fb4686 2020-09-20 neels const struct diff_atom *left,
214 b3fb4686 2020-09-20 neels const struct diff_atom *right)
215 b3fb4686 2020-09-20 neels {
216 b3fb4686 2020-09-20 neels int cmp;
217 2fa08c64 2020-10-22 neels int r;
218 2fa08c64 2020-10-22 neels if (left->hash != right->hash) {
219 2fa08c64 2020-10-22 neels *same = false;
220 2fa08c64 2020-10-22 neels return 0;
221 2fa08c64 2020-10-22 neels }
222 2fa08c64 2020-10-22 neels r = diff_atom_cmp(&cmp, left, right);
223 b3fb4686 2020-09-20 neels if (r) {
224 b3fb4686 2020-09-20 neels *same = true;
225 b3fb4686 2020-09-20 neels return r;
226 b3fb4686 2020-09-20 neels }
227 b3fb4686 2020-09-20 neels *same = (cmp == 0);
228 b3fb4686 2020-09-20 neels return 0;
229 c6eecea3 2020-07-26 stsp }
230 c6eecea3 2020-07-26 stsp
231 93f8150a 2020-10-11 neels static struct diff_chunk *
232 93f8150a 2020-10-11 neels diff_state_add_solved_chunk(struct diff_state *state,
233 93f8150a 2020-10-11 neels const struct diff_chunk *chunk)
234 3b0f3d61 2020-01-22 neels {
235 93f8150a 2020-10-11 neels diff_chunk_arraylist_t *result;
236 8546b045 2020-09-20 neels struct diff_chunk *new_chunk;
237 8546b045 2020-09-20 neels enum diff_chunk_type last_t;
238 8546b045 2020-09-20 neels enum diff_chunk_type new_t;
239 f6db3145 2020-10-30 neels struct diff_chunk *last;
240 8546b045 2020-09-20 neels
241 8546b045 2020-09-20 neels /* Append to solved chunks; make sure that adjacent chunks of same type are combined, and that a minus chunk
242 8546b045 2020-09-20 neels * never directly follows a plus chunk. */
243 8546b045 2020-09-20 neels result = &state->result->chunks;
244 8546b045 2020-09-20 neels
245 486215cf 2020-10-30 neels last_t = result->len ? diff_chunk_type(&result->head[result->len - 1])
246 486215cf 2020-10-30 neels : CHUNK_EMPTY;
247 93f8150a 2020-10-11 neels new_t = diff_chunk_type(chunk);
248 8546b045 2020-09-20 neels
249 486215cf 2020-10-30 neels debug("ADD %s chunk #%u:\n", chunk->solved ? "solved" : "UNSOLVED",
250 486215cf 2020-10-30 neels result->len);
251 93f8150a 2020-10-11 neels debug("L\n");
252 93f8150a 2020-10-11 neels debug_dump_atoms(&state->left, chunk->left_start, chunk->left_count);
253 93f8150a 2020-10-11 neels debug("R\n");
254 93f8150a 2020-10-11 neels debug_dump_atoms(&state->right, chunk->right_start, chunk->right_count);
255 4861c9da 2020-10-30 neels
256 4861c9da 2020-10-30 neels if (result->len) {
257 4861c9da 2020-10-30 neels last = &result->head[result->len - 1];
258 4861c9da 2020-10-30 neels assert(chunk->left_start
259 4861c9da 2020-10-30 neels == last->left_start + last->left_count);
260 4861c9da 2020-10-30 neels assert(chunk->right_start
261 4861c9da 2020-10-30 neels == last->right_start + last->right_count);
262 4861c9da 2020-10-30 neels }
263 93f8150a 2020-10-11 neels
264 8546b045 2020-09-20 neels if (new_t == last_t) {
265 8546b045 2020-09-20 neels new_chunk = &result->head[result->len - 1];
266 93f8150a 2020-10-11 neels new_chunk->left_count += chunk->left_count;
267 93f8150a 2020-10-11 neels new_chunk->right_count += chunk->right_count;
268 bb867e68 2020-10-11 neels debug(" - added chunk touches previous one of same type, joined:\n");
269 bb867e68 2020-10-11 neels debug("L\n");
270 bb867e68 2020-10-11 neels debug_dump_atoms(&state->left, new_chunk->left_start, new_chunk->left_count);
271 bb867e68 2020-10-11 neels debug("R\n");
272 bb867e68 2020-10-11 neels debug_dump_atoms(&state->right, new_chunk->right_start, new_chunk->right_count);
273 8546b045 2020-09-20 neels } else if (last_t == CHUNK_PLUS && new_t == CHUNK_MINUS) {
274 93f8150a 2020-10-11 neels enum diff_chunk_type prev_last_t =
275 93f8150a 2020-10-11 neels result->len > 1 ?
276 93f8150a 2020-10-11 neels diff_chunk_type(&result->head[result->len - 2])
277 93f8150a 2020-10-11 neels : CHUNK_EMPTY;
278 93f8150a 2020-10-11 neels /* If a minus-chunk follows a plus-chunk, place it above the plus-chunk->
279 8546b045 2020-09-20 neels * Is the one before that also a minus? combine. */
280 8546b045 2020-09-20 neels if (prev_last_t == CHUNK_MINUS) {
281 8546b045 2020-09-20 neels new_chunk = &result->head[result->len - 2];
282 93f8150a 2020-10-11 neels new_chunk->left_count += chunk->left_count;
283 93f8150a 2020-10-11 neels new_chunk->right_count += chunk->right_count;
284 bb867e68 2020-10-11 neels
285 bb867e68 2020-10-11 neels debug(" - added minus-chunk follows plus-chunk,"
286 bb867e68 2020-10-11 neels " put before that plus-chunk and joined"
287 bb867e68 2020-10-11 neels " with preceding minus-chunk:\n");
288 bb867e68 2020-10-11 neels debug("L\n");
289 bb867e68 2020-10-11 neels debug_dump_atoms(&state->left, new_chunk->left_start, new_chunk->left_count);
290 bb867e68 2020-10-11 neels debug("R\n");
291 bb867e68 2020-10-11 neels debug_dump_atoms(&state->right, new_chunk->right_start, new_chunk->right_count);
292 8546b045 2020-09-20 neels } else {
293 724967e9 2020-10-11 neels ARRAYLIST_INSERT(new_chunk, *result, result->len - 1);
294 8546b045 2020-09-20 neels if (!new_chunk)
295 8546b045 2020-09-20 neels return NULL;
296 93f8150a 2020-10-11 neels *new_chunk = *chunk;
297 f6db3145 2020-10-30 neels
298 f6db3145 2020-10-30 neels /* The new minus chunk indicates to which position on
299 f6db3145 2020-10-30 neels * the right it corresponds, even though it doesn't add
300 f6db3145 2020-10-30 neels * any lines on the right. By moving above a plus chunk,
301 f6db3145 2020-10-30 neels * that position on the right has shifted. */
302 f6db3145 2020-10-30 neels last = &result->head[result->len - 1];
303 f6db3145 2020-10-30 neels new_chunk->right_start = last->right_start;
304 bb867e68 2020-10-11 neels
305 bb867e68 2020-10-11 neels debug(" - added minus-chunk follows plus-chunk,"
306 bb867e68 2020-10-11 neels " put before that plus-chunk\n");
307 8546b045 2020-09-20 neels }
308 f6db3145 2020-10-30 neels
309 f6db3145 2020-10-30 neels /* That last_t == CHUNK_PLUS indicates to which position on the
310 f6db3145 2020-10-30 neels * left it corresponds, even though it doesn't add any lines on
311 f6db3145 2020-10-30 neels * the left. By inserting/extending the prev_last_t ==
312 f6db3145 2020-10-30 neels * CHUNK_MINUS, that position on the left has shifted. */
313 f6db3145 2020-10-30 neels last = &result->head[result->len - 1];
314 f6db3145 2020-10-30 neels last->left_start = new_chunk->left_start
315 f6db3145 2020-10-30 neels + new_chunk->left_count;
316 f6db3145 2020-10-30 neels
317 8546b045 2020-09-20 neels } else {
318 8546b045 2020-09-20 neels ARRAYLIST_ADD(new_chunk, *result);
319 8546b045 2020-09-20 neels if (!new_chunk)
320 8546b045 2020-09-20 neels return NULL;
321 93f8150a 2020-10-11 neels *new_chunk = *chunk;
322 93f8150a 2020-10-11 neels }
323 93f8150a 2020-10-11 neels return new_chunk;
324 93f8150a 2020-10-11 neels }
325 93f8150a 2020-10-11 neels
326 93f8150a 2020-10-11 neels /* Even if a left or right side is empty, diff output may need to know the
327 93f8150a 2020-10-11 neels * position in that file.
328 93f8150a 2020-10-11 neels * So left_start or right_start must never be NULL -- pass left_count or
329 93f8150a 2020-10-11 neels * right_count as zero to indicate staying at that position without consuming
330 93f8150a 2020-10-11 neels * any lines. */
331 93f8150a 2020-10-11 neels struct diff_chunk *
332 93f8150a 2020-10-11 neels diff_state_add_chunk(struct diff_state *state, bool solved,
333 93f8150a 2020-10-11 neels struct diff_atom *left_start, unsigned int left_count,
334 93f8150a 2020-10-11 neels struct diff_atom *right_start, unsigned int right_count)
335 93f8150a 2020-10-11 neels {
336 93f8150a 2020-10-11 neels struct diff_chunk *new_chunk;
337 93f8150a 2020-10-11 neels struct diff_chunk chunk = {
338 93f8150a 2020-10-11 neels .solved = solved,
339 93f8150a 2020-10-11 neels .left_start = left_start,
340 93f8150a 2020-10-11 neels .left_count = left_count,
341 93f8150a 2020-10-11 neels .right_start = right_start,
342 93f8150a 2020-10-11 neels .right_count = right_count,
343 93f8150a 2020-10-11 neels };
344 93f8150a 2020-10-11 neels
345 486215cf 2020-10-30 neels /* An unsolved chunk means store as intermediate result for later
346 486215cf 2020-10-30 neels * re-iteration.
347 486215cf 2020-10-30 neels * If there already are intermediate results, that means even a
348 486215cf 2020-10-30 neels * following solved chunk needs to go to intermediate results, so that
349 486215cf 2020-10-30 neels * it is later put in the final correct position in solved chunks.
350 486215cf 2020-10-30 neels */
351 93f8150a 2020-10-11 neels if (!solved || state->temp_result.len) {
352 93f8150a 2020-10-11 neels /* Append to temp_result */
353 486215cf 2020-10-30 neels debug("ADD %s chunk to temp result:\n",
354 486215cf 2020-10-30 neels chunk.solved ? "solved" : "UNSOLVED");
355 acfce337 2020-10-12 neels debug("L\n");
356 acfce337 2020-10-12 neels debug_dump_atoms(&state->left, left_start, left_count);
357 acfce337 2020-10-12 neels debug("R\n");
358 acfce337 2020-10-12 neels debug_dump_atoms(&state->right, right_start, right_count);
359 486215cf 2020-10-30 neels ARRAYLIST_ADD(new_chunk, state->temp_result);
360 93f8150a 2020-10-11 neels if (!new_chunk)
361 93f8150a 2020-10-11 neels return NULL;
362 8546b045 2020-09-20 neels *new_chunk = chunk;
363 93f8150a 2020-10-11 neels return new_chunk;
364 8546b045 2020-09-20 neels }
365 8546b045 2020-09-20 neels
366 93f8150a 2020-10-11 neels return diff_state_add_solved_chunk(state, &chunk);
367 3b0f3d61 2020-01-22 neels }
368 3b0f3d61 2020-01-22 neels
369 1f690ebe 2022-08-31 mark static void
370 7a54ad3a 2020-09-20 stsp diff_data_init_root(struct diff_data *d, FILE *f, const uint8_t *data,
371 00d5652b 2020-09-22 stsp unsigned long long len, int diff_flags)
372 3b0f3d61 2020-01-22 neels {
373 3b0f3d61 2020-01-22 neels *d = (struct diff_data){
374 7a54ad3a 2020-09-20 stsp .f = f,
375 c6eecea3 2020-07-26 stsp .pos = 0,
376 3b0f3d61 2020-01-22 neels .data = data,
377 3b0f3d61 2020-01-22 neels .len = len,
378 3b0f3d61 2020-01-22 neels .root = d,
379 00d5652b 2020-09-22 stsp .diff_flags = diff_flags,
380 3b0f3d61 2020-01-22 neels };
381 3b0f3d61 2020-01-22 neels }
382 3b0f3d61 2020-01-22 neels
383 61a7b578 2020-05-06 neels void
384 61a7b578 2020-05-06 neels diff_data_init_subsection(struct diff_data *d, struct diff_data *parent,
385 61a7b578 2020-05-06 neels struct diff_atom *from_atom, unsigned int atoms_count)
386 3b0f3d61 2020-01-22 neels {
387 05b5f01f 2020-09-21 stsp struct diff_atom *last_atom;
388 05b5f01f 2020-09-21 stsp
389 de7a2939 2020-10-24 neels debug("diff_data %p parent %p from_atom %p atoms_count %u\n",
390 de7a2939 2020-10-24 neels d, parent, from_atom, atoms_count);
391 de7a2939 2020-10-24 neels debug(" from_atom ");
392 f5a254cc 2020-10-24 neels debug_dump_atom(parent, NULL, from_atom);
393 de7a2939 2020-10-24 neels
394 05b5f01f 2020-09-21 stsp if (atoms_count == 0) {
395 05b5f01f 2020-09-21 stsp *d = (struct diff_data){
396 05b5f01f 2020-09-21 stsp .f = NULL,
397 05b5f01f 2020-09-21 stsp .pos = 0,
398 34570dbe 2020-10-16 stsp .data = NULL,
399 05b5f01f 2020-09-21 stsp .len = 0,
400 05b5f01f 2020-09-21 stsp .root = parent->root,
401 05b5f01f 2020-09-21 stsp .atoms.head = NULL,
402 05b5f01f 2020-09-21 stsp .atoms.len = atoms_count,
403 05b5f01f 2020-09-21 stsp };
404 05b5f01f 2020-09-21 stsp
405 05b5f01f 2020-09-21 stsp return;
406 05b5f01f 2020-09-21 stsp }
407 05b5f01f 2020-09-21 stsp
408 05b5f01f 2020-09-21 stsp last_atom = from_atom + atoms_count - 1;
409 3b0f3d61 2020-01-22 neels *d = (struct diff_data){
410 7a54ad3a 2020-09-20 stsp .f = NULL,
411 c6eecea3 2020-07-26 stsp .pos = from_atom->pos,
412 3b0f3d61 2020-01-22 neels .data = from_atom->at,
413 c6eecea3 2020-07-26 stsp .len = (last_atom->pos + last_atom->len) - from_atom->pos,
414 3b0f3d61 2020-01-22 neels .root = parent->root,
415 3b0f3d61 2020-01-22 neels .atoms.head = from_atom,
416 3b0f3d61 2020-01-22 neels .atoms.len = atoms_count,
417 3b0f3d61 2020-01-22 neels };
418 3b0f3d61 2020-01-22 neels
419 3b0f3d61 2020-01-22 neels debug("subsection:\n");
420 3b0f3d61 2020-01-22 neels debug_dump(d);
421 3b0f3d61 2020-01-22 neels }
422 3b0f3d61 2020-01-22 neels
423 61a7b578 2020-05-06 neels void
424 61a7b578 2020-05-06 neels diff_data_free(struct diff_data *diff_data)
425 3b0f3d61 2020-01-22 neels {
426 3b0f3d61 2020-01-22 neels if (!diff_data)
427 3b0f3d61 2020-01-22 neels return;
428 3b0f3d61 2020-01-22 neels if (diff_data->atoms.allocated)
429 3b0f3d61 2020-01-22 neels ARRAYLIST_FREE(diff_data->atoms);
430 3b0f3d61 2020-01-22 neels }
431 3b0f3d61 2020-01-22 neels
432 3e6cba3a 2020-08-13 stsp int
433 0d27172a 2020-05-06 neels diff_algo_none(const struct diff_algo_config *algo_config,
434 0d27172a 2020-05-06 neels struct diff_state *state)
435 3b0f3d61 2020-01-22 neels {
436 3b0f3d61 2020-01-22 neels debug("\n** %s\n", __func__);
437 3b0f3d61 2020-01-22 neels debug("left:\n");
438 3b0f3d61 2020-01-22 neels debug_dump(&state->left);
439 3b0f3d61 2020-01-22 neels debug("right:\n");
440 3b0f3d61 2020-01-22 neels debug_dump(&state->right);
441 0d27172a 2020-05-06 neels debug_dump_myers_graph(&state->left, &state->right, NULL, NULL, 0, NULL,
442 0d27172a 2020-05-06 neels 0);
443 3b0f3d61 2020-01-22 neels
444 3b0f3d61 2020-01-22 neels /* Add a chunk of equal lines, if any */
445 467a993d 2020-10-11 neels struct diff_atom *l = state->left.atoms.head;
446 467a993d 2020-10-11 neels unsigned int l_len = state->left.atoms.len;
447 467a993d 2020-10-11 neels struct diff_atom *r = state->right.atoms.head;
448 467a993d 2020-10-11 neels unsigned int r_len = state->right.atoms.len;
449 41ff30f3 2020-10-11 neels unsigned int equal_atoms_start = 0;
450 2146cf12 2020-10-11 neels unsigned int equal_atoms_end = 0;
451 13497fff 2020-10-11 neels unsigned int l_idx = 0;
452 13497fff 2020-10-11 neels unsigned int r_idx = 0;
453 467a993d 2020-10-11 neels
454 41ff30f3 2020-10-11 neels while (equal_atoms_start < l_len
455 41ff30f3 2020-10-11 neels && equal_atoms_start < r_len) {
456 467a993d 2020-10-11 neels int err;
457 b3fb4686 2020-09-20 neels bool same;
458 41ff30f3 2020-10-11 neels err = diff_atom_same(&same, &l[equal_atoms_start],
459 e5447b81 2020-10-12 neels &r[equal_atoms_start]);
460 467a993d 2020-10-11 neels if (err)
461 467a993d 2020-10-11 neels return err;
462 b3fb4686 2020-09-20 neels if (!same)
463 b3fb4686 2020-09-20 neels break;
464 41ff30f3 2020-10-11 neels equal_atoms_start++;
465 b3fb4686 2020-09-20 neels }
466 2146cf12 2020-10-11 neels while (equal_atoms_end < (l_len - equal_atoms_start)
467 2146cf12 2020-10-11 neels && equal_atoms_end < (r_len - equal_atoms_start)) {
468 2146cf12 2020-10-11 neels int err;
469 2146cf12 2020-10-11 neels bool same;
470 2146cf12 2020-10-11 neels err = diff_atom_same(&same, &l[l_len - 1 - equal_atoms_end],
471 2146cf12 2020-10-11 neels &r[r_len - 1 - equal_atoms_end]);
472 2146cf12 2020-10-11 neels if (err)
473 2146cf12 2020-10-11 neels return err;
474 2146cf12 2020-10-11 neels if (!same)
475 2146cf12 2020-10-11 neels break;
476 2146cf12 2020-10-11 neels equal_atoms_end++;
477 2146cf12 2020-10-11 neels }
478 2146cf12 2020-10-11 neels
479 2146cf12 2020-10-11 neels /* Add a chunk of equal lines at the start */
480 41ff30f3 2020-10-11 neels if (equal_atoms_start) {
481 3b0f3d61 2020-01-22 neels if (!diff_state_add_chunk(state, true,
482 e5447b81 2020-10-12 neels l, equal_atoms_start,
483 e5447b81 2020-10-12 neels r, equal_atoms_start))
484 3e6cba3a 2020-08-13 stsp return ENOMEM;
485 13497fff 2020-10-11 neels l_idx += equal_atoms_start;
486 13497fff 2020-10-11 neels r_idx += equal_atoms_start;
487 3b0f3d61 2020-01-22 neels }
488 3b0f3d61 2020-01-22 neels
489 3b0f3d61 2020-01-22 neels /* Add a "minus" chunk with all lines from the left. */
490 2146cf12 2020-10-11 neels if (equal_atoms_start + equal_atoms_end < l_len) {
491 13497fff 2020-10-11 neels unsigned int add_len = l_len - equal_atoms_start - equal_atoms_end;
492 3b0f3d61 2020-01-22 neels if (!diff_state_add_chunk(state, true,
493 13497fff 2020-10-11 neels &l[l_idx], add_len,
494 13497fff 2020-10-11 neels &r[r_idx], 0))
495 13497fff 2020-10-11 neels return ENOMEM;
496 13497fff 2020-10-11 neels l_idx += add_len;
497 3b0f3d61 2020-01-22 neels }
498 3b0f3d61 2020-01-22 neels
499 3b0f3d61 2020-01-22 neels /* Add a "plus" chunk with all lines from the right. */
500 2146cf12 2020-10-11 neels if (equal_atoms_start + equal_atoms_end < r_len) {
501 13497fff 2020-10-11 neels unsigned int add_len = r_len - equal_atoms_start - equal_atoms_end;
502 3b0f3d61 2020-01-22 neels if (!diff_state_add_chunk(state, true,
503 9dc0554f 2020-10-12 neels &l[l_idx], 0,
504 9dc0554f 2020-10-12 neels &r[r_idx], add_len))
505 13497fff 2020-10-11 neels return ENOMEM;
506 13497fff 2020-10-11 neels r_idx += add_len;
507 3b0f3d61 2020-01-22 neels }
508 2146cf12 2020-10-11 neels
509 2146cf12 2020-10-11 neels /* Add a chunk of equal lines at the end */
510 2146cf12 2020-10-11 neels if (equal_atoms_end) {
511 2146cf12 2020-10-11 neels if (!diff_state_add_chunk(state, true,
512 13497fff 2020-10-11 neels &l[l_idx], equal_atoms_end,
513 13497fff 2020-10-11 neels &r[r_idx], equal_atoms_end))
514 2146cf12 2020-10-11 neels return ENOMEM;
515 2146cf12 2020-10-11 neels }
516 2146cf12 2020-10-11 neels
517 3b0f3d61 2020-01-22 neels return DIFF_RC_OK;
518 3b0f3d61 2020-01-22 neels }
519 3b0f3d61 2020-01-22 neels
520 1f690ebe 2022-08-31 mark static int
521 0d27172a 2020-05-06 neels diff_run_algo(const struct diff_algo_config *algo_config,
522 0d27172a 2020-05-06 neels struct diff_state *state)
523 3b0f3d61 2020-01-22 neels {
524 3e6cba3a 2020-08-13 stsp int rc;
525 3b0f3d61 2020-01-22 neels
526 0d27172a 2020-05-06 neels if (!algo_config || !algo_config->impl
527 746d94df 2020-10-12 neels || !state->recursion_depth_left
528 746d94df 2020-10-12 neels || !state->left.atoms.len || !state->right.atoms.len) {
529 746d94df 2020-10-12 neels debug("Fall back to diff_algo_none():%s%s%s\n",
530 746d94df 2020-10-12 neels (!algo_config || !algo_config->impl) ? " no-cfg" : "",
531 746d94df 2020-10-12 neels (!state->recursion_depth_left) ? " max-depth" : "",
532 746d94df 2020-10-12 neels (!state->left.atoms.len || !state->right.atoms.len)?
533 746d94df 2020-10-12 neels " trivial" : "");
534 3b0f3d61 2020-01-22 neels return diff_algo_none(algo_config, state);
535 3b0f3d61 2020-01-22 neels }
536 3b0f3d61 2020-01-22 neels
537 746d94df 2020-10-12 neels ARRAYLIST_FREE(state->temp_result);
538 3b0f3d61 2020-01-22 neels ARRAYLIST_INIT(state->temp_result, DIFF_RESULT_ALLOC_BLOCKSIZE);
539 3b0f3d61 2020-01-22 neels rc = algo_config->impl(algo_config, state);
540 3b0f3d61 2020-01-22 neels switch (rc) {
541 3b0f3d61 2020-01-22 neels case DIFF_RC_USE_DIFF_ALGO_FALLBACK:
542 0d27172a 2020-05-06 neels debug("Got DIFF_RC_USE_DIFF_ALGO_FALLBACK (%p)\n",
543 0d27172a 2020-05-06 neels algo_config->fallback_algo);
544 3b0f3d61 2020-01-22 neels rc = diff_run_algo(algo_config->fallback_algo, state);
545 3b0f3d61 2020-01-22 neels goto return_rc;
546 3b0f3d61 2020-01-22 neels
547 3b0f3d61 2020-01-22 neels case DIFF_RC_OK:
548 3b0f3d61 2020-01-22 neels /* continue below */
549 3b0f3d61 2020-01-22 neels break;
550 3b0f3d61 2020-01-22 neels
551 3b0f3d61 2020-01-22 neels default:
552 3b0f3d61 2020-01-22 neels /* some error happened */
553 3b0f3d61 2020-01-22 neels goto return_rc;
554 3b0f3d61 2020-01-22 neels }
555 3b0f3d61 2020-01-22 neels
556 0d27172a 2020-05-06 neels /* Pick up any diff chunks that are still unsolved and feed to
557 0d27172a 2020-05-06 neels * inner_algo. inner_algo will solve unsolved chunks and append to
558 0d27172a 2020-05-06 neels * result, and subsequent solved chunks on this level are then appended
559 0d27172a 2020-05-06 neels * to result afterwards. */
560 3b0f3d61 2020-01-22 neels int i;
561 3b0f3d61 2020-01-22 neels for (i = 0; i < state->temp_result.len; i++) {
562 3b0f3d61 2020-01-22 neels struct diff_chunk *c = &state->temp_result.head[i];
563 3b0f3d61 2020-01-22 neels if (c->solved) {
564 93f8150a 2020-10-11 neels diff_state_add_solved_chunk(state, c);
565 3b0f3d61 2020-01-22 neels continue;
566 3b0f3d61 2020-01-22 neels }
567 3b0f3d61 2020-01-22 neels
568 3b0f3d61 2020-01-22 neels /* c is an unsolved chunk, feed to inner_algo */
569 3b0f3d61 2020-01-22 neels struct diff_state inner_state = {
570 3b0f3d61 2020-01-22 neels .result = state->result,
571 3b0f3d61 2020-01-22 neels .recursion_depth_left = state->recursion_depth_left - 1,
572 8be88dfa 2020-10-21 stsp .kd_buf = state->kd_buf,
573 8be88dfa 2020-10-21 stsp .kd_buf_size = state->kd_buf_size,
574 3b0f3d61 2020-01-22 neels };
575 0d27172a 2020-05-06 neels diff_data_init_subsection(&inner_state.left, &state->left,
576 0d27172a 2020-05-06 neels c->left_start, c->left_count);
577 0d27172a 2020-05-06 neels diff_data_init_subsection(&inner_state.right, &state->right,
578 0d27172a 2020-05-06 neels c->right_start, c->right_count);
579 3b0f3d61 2020-01-22 neels
580 3b0f3d61 2020-01-22 neels rc = diff_run_algo(algo_config->inner_algo, &inner_state);
581 8be88dfa 2020-10-21 stsp state->kd_buf = inner_state.kd_buf;
582 8be88dfa 2020-10-21 stsp state->kd_buf_size = inner_state.kd_buf_size;
583 3b0f3d61 2020-01-22 neels if (rc != DIFF_RC_OK)
584 3b0f3d61 2020-01-22 neels goto return_rc;
585 3b0f3d61 2020-01-22 neels }
586 3b0f3d61 2020-01-22 neels
587 3b0f3d61 2020-01-22 neels rc = DIFF_RC_OK;
588 3b0f3d61 2020-01-22 neels return_rc:
589 3b0f3d61 2020-01-22 neels ARRAYLIST_FREE(state->temp_result);
590 3b0f3d61 2020-01-22 neels return rc;
591 3b0f3d61 2020-01-22 neels }
592 c16dde50 2020-10-22 stsp
593 c16dde50 2020-10-22 stsp int
594 c16dde50 2020-10-22 stsp diff_atomize_file(struct diff_data *d,
595 c16dde50 2020-10-22 stsp const struct diff_config *config,
596 c16dde50 2020-10-22 stsp FILE *f, const uint8_t *data, off_t len, int diff_flags)
597 c16dde50 2020-10-22 stsp {
598 c16dde50 2020-10-22 stsp if (!config->atomize_func)
599 c16dde50 2020-10-22 stsp return EINVAL;
600 3b0f3d61 2020-01-22 neels
601 c16dde50 2020-10-22 stsp diff_data_init_root(d, f, data, len, diff_flags);
602 c16dde50 2020-10-22 stsp
603 c16dde50 2020-10-22 stsp return config->atomize_func(config->atomize_func_data, d);
604 c16dde50 2020-10-22 stsp
605 c16dde50 2020-10-22 stsp }
606 c16dde50 2020-10-22 stsp
607 61a7b578 2020-05-06 neels struct diff_result *
608 c16dde50 2020-10-22 stsp diff_main(const struct diff_config *config, struct diff_data *left,
609 c16dde50 2020-10-22 stsp struct diff_data *right)
610 3b0f3d61 2020-01-22 neels {
611 3b0f3d61 2020-01-22 neels struct diff_result *result = malloc(sizeof(struct diff_result));
612 3b0f3d61 2020-01-22 neels if (!result)
613 3b0f3d61 2020-01-22 neels return NULL;
614 3b0f3d61 2020-01-22 neels
615 3b0f3d61 2020-01-22 neels *result = (struct diff_result){};
616 c16dde50 2020-10-22 stsp result->left = left;
617 c16dde50 2020-10-22 stsp result->right = right;
618 3b0f3d61 2020-01-22 neels
619 3b0f3d61 2020-01-22 neels struct diff_state state = {
620 3b0f3d61 2020-01-22 neels .result = result,
621 35eae7fa 2022-08-31 mark .recursion_depth_left = config->max_recursion_depth ?
622 35eae7fa 2022-08-31 mark config->max_recursion_depth : UINT_MAX,
623 8be88dfa 2020-10-21 stsp .kd_buf = NULL,
624 8be88dfa 2020-10-21 stsp .kd_buf_size = 0,
625 3b0f3d61 2020-01-22 neels };
626 c16dde50 2020-10-22 stsp diff_data_init_subsection(&state.left, left,
627 c16dde50 2020-10-22 stsp left->atoms.head,
628 c16dde50 2020-10-22 stsp left->atoms.len);
629 c16dde50 2020-10-22 stsp diff_data_init_subsection(&state.right, right,
630 c16dde50 2020-10-22 stsp right->atoms.head,
631 c16dde50 2020-10-22 stsp right->atoms.len);
632 3b0f3d61 2020-01-22 neels
633 3b0f3d61 2020-01-22 neels result->rc = diff_run_algo(config->algo, &state);
634 8be88dfa 2020-10-21 stsp free(state.kd_buf);
635 3b0f3d61 2020-01-22 neels
636 3b0f3d61 2020-01-22 neels return result;
637 3b0f3d61 2020-01-22 neels }
638 3b0f3d61 2020-01-22 neels
639 61a7b578 2020-05-06 neels void
640 61a7b578 2020-05-06 neels diff_result_free(struct diff_result *result)
641 3b0f3d61 2020-01-22 neels {
642 3b0f3d61 2020-01-22 neels if (!result)
643 3b0f3d61 2020-01-22 neels return;
644 3b0f3d61 2020-01-22 neels ARRAYLIST_FREE(result->chunks);
645 3b0f3d61 2020-01-22 neels free(result);
646 9f5da091 2022-10-10 tj }
647 9f5da091 2022-10-10 tj
648 9f5da091 2022-10-10 tj int
649 9f5da091 2022-10-10 tj diff_result_contains_printable_chunks(struct diff_result *result)
650 9f5da091 2022-10-10 tj {
651 9f5da091 2022-10-10 tj struct diff_chunk *c;
652 9f5da091 2022-10-10 tj enum diff_chunk_type t;
653 29c010c8 2022-11-03 stsp int i;
654 9f5da091 2022-10-10 tj
655 29c010c8 2022-11-03 stsp for (i = 0; i < result->chunks.len; i++) {
656 9f5da091 2022-10-10 tj c = &result->chunks.head[i];
657 9f5da091 2022-10-10 tj t = diff_chunk_type(c);
658 9f5da091 2022-10-10 tj if (t == CHUNK_MINUS || t == CHUNK_PLUS)
659 9f5da091 2022-10-10 tj return 1;
660 9f5da091 2022-10-10 tj }
661 9f5da091 2022-10-10 tj
662 9f5da091 2022-10-10 tj return 0;
663 3b0f3d61 2020-01-22 neels }