1use std::collections::hash_map::DefaultHasher;
11use std::convert::TryFrom;
12use std::fmt::Debug;
13use std::fmt::{Error as FmtErr, Formatter};
14use std::hash::{Hash, Hasher};
15
/// A half-open byte range `lo..hi` into some source buffer, together
/// with the precomputed hash of those bytes (see `hash_slice`).
#[derive(Clone, Copy, Debug)]
pub struct HashedSpan {
    /// Start offset of the token (inclusive).
    pub lo: usize,
    /// End offset of the token (exclusive).
    pub hi: usize,
    /// Hash of the bytes in `lo..hi`.
    pub hash: u64,
}
23
/// A borrowed token: its bytes plus their precomputed hash.
/// `PartialEq`/`Eq` are derived, so equality compares both fields;
/// the manual `Hash` impl below reuses the stored hash.
#[derive(PartialEq, Eq)]
pub struct HashedSlice<'a> {
    /// Precomputed hash of `data` (see `hash_slice`).
    pub hash: u64,
    /// The token's bytes.
    pub data: &'a [u8],
}
30
31impl<'a> Debug for HashedSlice<'a> {
32 fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), FmtErr> {
33 let data_pp = String::from_utf8_lossy(self.data);
34 f.debug_tuple("HashedSlice").field(&data_pp).finish()
35 }
36}
37
38impl<'a> Hash for HashedSlice<'a> {
39 fn hash<H: Hasher>(&self, h: &mut H) {
40 h.write_u64(self.hash)
41 }
42}
43
/// A view over a contiguous range of tokens in a tokenized buffer.
/// The visible tokens are `tokens[start_index..one_past_end_index]`.
pub struct Tokenization<'a> {
    // Full underlying byte buffer (not restricted to the view).
    data: &'a [u8],
    // All token spans over `data` (not just the visible range).
    tokens: &'a [HashedSpan],
    // Index of the first visible token.
    start_index: isize,
    // One past the index of the last visible token.
    one_past_end_index: isize,
}
51
52impl<'a> Debug for Tokenization<'a> {
53 fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), FmtErr> {
54 let Self {
55 data,
56 tokens,
57 start_index,
58 one_past_end_index,
59 } = self;
60 let data_pp = String::from_utf8_lossy(data);
61 let tokens_pp = tokens[to_usize(*start_index)..to_usize(*one_past_end_index)]
62 .iter()
63 .map(|sref| String::from_utf8_lossy(&data[sref.lo..sref.hi]))
64 .collect::<Vec<_>>();
65 f.debug_struct("Tokenization")
66 .field("data", &data_pp)
67 .field("tokens", &tokens_pp)
68 .finish()
69 }
70}
71
72impl<'a> Tokenization<'a> {
73 pub fn new(data: &'a [u8], tokens: &'a [HashedSpan]) -> Self {
74 Tokenization {
75 data,
76 tokens,
77 start_index: 0,
78 one_past_end_index: to_isize(tokens.len()),
79 }
80 }
81
82 pub fn data(&self) -> &'a [u8] {
83 self.data
84 }
85
86 pub fn split_at(&self, lo: isize, hi: isize) -> (Self, Self) {
90 let start = self.start_index;
91 let end = self.one_past_end_index;
92 assert!(start <= lo);
93 assert!(lo <= hi);
94 assert!(hi <= end);
95 (
96 Tokenization {
97 one_past_end_index: lo,
98 ..*self
99 },
100 Tokenization {
101 start_index: hi,
102 ..*self
103 },
104 )
105 }
106
107 pub fn nb_tokens(&self) -> usize {
109 to_usize(self.one_past_end_index - self.start_index)
110 }
111
112 pub fn nth_token(&self, n: isize) -> HashedSlice {
114 let HashedSpan { lo, hi, hash } = self.nth_span(n);
115 HashedSlice {
116 hash,
117 data: &self.data[lo..hi],
118 }
119 }
120
121 pub fn nth_span(&self, n: isize) -> HashedSpan {
123 self.tokens[to_usize(self.start_index + n)]
124 }
125}
126
/// The two tokenized sequences being compared: `removed` is the old
/// ("a") side, `added` is the new ("b") side.
#[derive(Debug)]
pub struct DiffInput<'a> {
    pub added: Tokenization<'a>,
    pub removed: Tokenization<'a>,
}
133
134impl<'a> DiffInput<'a> {
135 fn split_at(&self, (x0, y0): (isize, isize), (x1, y1): (isize, isize)) -> (Self, Self) {
136 let (removed1, removed2) = self.removed.split_at(x0, x1);
137 let (added1, added2) = self.added.split_at(y0, y1);
138
139 (
140 DiffInput {
141 added: added1,
142 removed: removed1,
143 },
144 DiffInput {
145 added: added2,
146 removed: removed2,
147 },
148 )
149 }
150
151 fn n(&self) -> usize {
152 self.removed.nb_tokens()
153 }
154
155 fn m(&self) -> usize {
156 self.added.nb_tokens()
157 }
158
159 fn seq_a(&self, index: isize) -> HashedSlice {
160 self.removed.nth_token(index)
161 }
162
163 fn seq_b(&self, index: isize) -> HashedSlice {
164 self.added.nth_token(index)
165 }
166}
167
/// Hashes a byte slice with the standard library's default hasher.
fn hash_slice(data: &[u8]) -> u64 {
    let mut hasher = DefaultHasher::new();
    hasher.write(data);
    hasher.finish()
}
174
/// Working state for one direction of the Myers diff search.
struct DiffTraversal<'a> {
    // Furthest-reaching x value per diagonal k, stored at slot k + max.
    v: &'a mut [isize],
    // Maximum edit distance this traversal may explore.
    max: usize,
    // Target (x, y) coordinates at which the search terminates.
    end: (isize, isize),
}
180
181impl<'a> DiffTraversal<'a> {
182 fn from_slice(input: &'a DiffInput<'a>, v: &'a mut [isize], forward: bool, max: usize) -> Self {
183 let start = (input.removed.start_index, input.added.start_index);
184 let end = (
185 input.removed.one_past_end_index,
186 input.added.one_past_end_index,
187 );
188 assert!(max * 2 + 1 <= v.len());
189 let (start, end) = if forward { (start, end) } else { (end, start) };
190 let mut res = DiffTraversal { v, max, end };
191 if max != 0 {
192 *res.v_mut(1) = start.0 - input.removed.start_index
193 }
194 res
195 }
196
197 fn from_vector(
198 input: &'a DiffInput<'a>,
199 v: &'a mut Vec<isize>,
200 forward: bool,
201 max: usize,
202 ) -> Self {
203 v.resize(max * 2 + 1, 0);
204 Self::from_slice(input, v, forward, max)
205 }
206
207 fn v(&self, index: isize) -> isize {
208 self.v[to_usize(index + to_isize(self.max))]
209 }
210
211 fn v_mut(&mut self, index: isize) -> &mut isize {
212 &mut self.v[to_usize(index + to_isize(self.max))]
213 }
214}
215
/// One round of the forward Myers search at edit distance `d`: for
/// each diagonal `k` in `-d..=d` (step 2) it extends the
/// furthest-reaching path, then follows the diagonal ("snake") while
/// tokens match. Returns `Some(d)` once the traversal's end point is
/// reached, `None` otherwise. Requires `d < ctx.max`.
fn diff_sequences_kernel_forward(
    input: &DiffInput,
    ctx: &mut DiffTraversal,
    d: usize,
) -> Option<usize> {
    let n = to_isize(input.n());
    let m = to_isize(input.m());
    assert!(d < ctx.max);
    let d = to_isize(d);
    for k in (-d..=d).step_by(2) {
        // Standard Myers rule: step down from k+1 or right from k-1,
        // whichever frontier reaches further.
        let mut x = if k == -d || k != d && ctx.v(k - 1) < ctx.v(k + 1) {
            ctx.v(k + 1)
        } else {
            ctx.v(k - 1) + 1
        };
        let mut y = x - k;
        // Follow the snake: advance while the tokens are equal.
        while x < n && y < m && input.seq_a(x) == input.seq_b(y) {
            x += 1;
            y += 1;
        }
        *ctx.v_mut(k) = x;
        if ctx.end == (x, y) {
            return Some(to_usize(d));
        }
    }
    None
}
243
/// One round of the backward Myers search at edit distance `d`: the
/// frontier moves from the bottom-right corner towards the top-left,
/// following snakes backwards. Returns `Some(d)` once the traversal's
/// end point is reached, `None` otherwise. Requires `d < ctx.max`.
fn diff_sequences_kernel_backward(
    input: &DiffInput,
    ctx: &mut DiffTraversal,
    d: usize,
) -> Option<usize> {
    let n = to_isize(input.n());
    let m = to_isize(input.m());
    // Backward diagonals are offset by `delta` relative to forward ones.
    let delta = n - m;
    assert!(d < ctx.max);
    let d = to_isize(d);
    for k in (-d..=d).step_by(2) {
        // Mirror of the forward extension rule: prefer the smaller x
        // when moving backwards.
        let mut x = if k == -d || k != d && ctx.v(k + 1) < ctx.v(k - 1) {
            ctx.v(k + 1)
        } else {
            ctx.v(k - 1) + 1
        };
        let mut y = x - (k + delta);
        // Follow the snake backwards while the preceding tokens match.
        while 0 < x && 0 < y && input.seq_a(x - 1) == input.seq_b(y - 1) {
            x -= 1;
            y -= 1;
        }
        // NOTE(review): stores x - 1 (one left of the frontier); the
        // bidirectional kernel's overlap test reads it with the same
        // convention — confirm against the forward pass before changing.
        *ctx.v_mut(k) = x - 1;
        if ctx.end == (x, y) {
            return Some(to_usize(d));
        }
    }
    None
}
272
/// A byte buffer plus the length of each newline-terminated line it
/// holds, built incrementally with `append_line`.
#[derive(Debug, Default)]
pub struct LineSplit {
    // Raw bytes of all appended fragments, concatenated.
    data: Vec<u8>,
    // Byte length of each logical line in `data` (newline included).
    line_lengths: Vec<usize>,
}

impl LineSplit {
    /// Iterates over the `(start, one_past_end)` byte offsets of each
    /// line in `data`.
    pub fn iter(&self) -> LineSplitIter {
        LineSplitIter {
            line_split: &self,
            index: 0,
            start_of_slice: 0,
        }
    }

    /// The underlying byte buffer.
    pub fn data<'a>(&'a self) -> &'a [u8] {
        &self.data
    }

    /// Appends `line` to the buffer. If the buffer does not currently
    /// end with `b'\n'`, `line` is treated as the continuation of the
    /// last line and merged into it; otherwise it starts a new line.
    pub fn append_line(&mut self, line: &[u8]) {
        if self.data.last().cloned() == Some(b'\n') {
            self.line_lengths.push(line.len());
        } else {
            match self.line_lengths.last_mut() {
                Some(len) => *len += line.len(),
                None => self.line_lengths.push(line.len()),
            }
        }
        self.data.extend_from_slice(line)
    }

    /// Removes all content (allocations are kept for reuse).
    pub fn clear(&mut self) {
        self.data.clear();
        self.line_lengths.clear();
    }

    /// Total number of bytes stored.
    pub fn len(&self) -> usize {
        self.data.len()
    }

    /// True when no bytes are stored. Added to pair with `len`
    /// (clippy: `len_without_is_empty`).
    pub fn is_empty(&self) -> bool {
        self.data.is_empty()
    }
}

/// Iterator over the `(start, one_past_end)` offsets of each line of a
/// `LineSplit`.
pub struct LineSplitIter<'a> {
    line_split: &'a LineSplit,
    // Offset where the next yielded line begins.
    start_of_slice: usize,
    // Index of the next entry of `line_lengths` to consume.
    index: usize,
}

impl<'a> Iterator for LineSplitIter<'a> {
    type Item = (usize, usize);
    fn next(&mut self) -> Option<Self::Item> {
        let &mut LineSplitIter {
            line_split:
                LineSplit {
                    data: _,
                    line_lengths,
                },
            index,
            start_of_slice,
        } = self;
        if index < line_lengths.len() {
            let len = line_lengths[index];
            self.start_of_slice += len;
            self.index += 1;
            Some((start_of_slice, start_of_slice + len))
        } else {
            None
        }
    }
}
343
/// A run of identical tokens shared by the two sides: it starts at
/// `(x0, y0)` in (removed, added) coordinates and extends diagonally
/// for `len` tokens.
#[derive(Clone, Debug, Default)]
pub struct Snake {
    pub x0: isize,

    pub y0: isize,

    pub len: isize,
}

impl Snake {
    // Builder-style setter for the starting point.
    fn from(self, x0: isize, y0: isize) -> Self {
        Snake { x0, y0, ..self }
    }

    // Builder-style setter for the diagonal length.
    fn len(self, len: isize) -> Self {
        Snake { len, ..self }
    }
}
369
/// One simultaneous round of the forward and backward searches at
/// distance `d`. Returns the middle snake and the total edit distance
/// as soon as the two frontiers overlap, `None` otherwise.
/// Coordinates in the returned snake are view-relative.
fn diff_sequences_kernel_bidirectional(
    input: &DiffInput,
    ctx_fwd: &mut DiffTraversal,
    ctx_bwd: &mut DiffTraversal,
    d: usize,
) -> Option<(Snake, isize)> {
    let n = to_isize(input.n());
    let m = to_isize(input.m());
    let delta = n - m;
    // With an odd delta the overlap can only be detected during the
    // forward pass (total distance 2d - 1); with an even delta, during
    // the backward pass (total distance 2d).
    let odd = delta % 2 != 0;
    assert!(d < ctx_fwd.max);
    assert!(d < ctx_bwd.max);
    let d = to_isize(d);
    // Forward pass over diagonals -d..=d.
    for k in (-d..=d).step_by(2) {
        let mut x = if k == -d || k != d && ctx_fwd.v(k - 1) < ctx_fwd.v(k + 1) {
            ctx_fwd.v(k + 1)
        } else {
            ctx_fwd.v(k - 1) + 1
        };
        let mut y = x - k;
        // Remember where the snake starts before following it.
        let (x0, y0) = (x, y);
        while x < n && y < m && input.seq_a(x) == input.seq_b(y) {
            x += 1;
            y += 1;
        }
        if odd && (k - delta).abs() <= d - 1 && x > ctx_bwd.v(k - delta) {
            // Frontiers met: the snake just traversed is the middle one.
            return Some((Snake::default().from(x0, y0).len(x - x0), 2 * d - 1));
        }
        *ctx_fwd.v_mut(k) = x;
    }
    // Backward pass (mirror image of the loop above).
    for k in (-d..=d).step_by(2) {
        let mut x = if k == -d || k != d && ctx_bwd.v(k + 1) < ctx_bwd.v(k - 1) {
            ctx_bwd.v(k + 1)
        } else {
            ctx_bwd.v(k - 1) + 1
        };
        let mut y = x - (k + delta);
        let x1 = x;
        while 0 < x && 0 < y && input.seq_a(x - 1) == input.seq_b(y - 1) {
            x -= 1;
            y -= 1;
        }
        if !odd && (k + delta).abs() <= d && x - 1 < ctx_fwd.v(k + delta) {
            return Some((Snake::default().from(x, y).len(x1 - x), 2 * d));
        }
        // Backward convention: store one left of the frontier.
        *ctx_bwd.v_mut(k) = x - 1;
    }
    None
}
419
/// Edit distance between the two sides of `input` using the forward
/// unidirectional Myers search; `v` is reusable scratch storage.
pub fn diff_sequences_simple_forward(input: &DiffInput, v: &mut Vec<isize>) -> usize {
    diff_sequences_simple(input, v, true)
}
425
/// Edit distance between the two sides of `input` using the backward
/// unidirectional Myers search; `v` is reusable scratch storage.
pub fn diff_sequences_simple_backward(input: &DiffInput, v: &mut Vec<isize>) -> usize {
    diff_sequences_simple(input, v, false)
}
431
432fn diff_sequences_simple(input: &DiffInput, v: &mut Vec<isize>, forward: bool) -> usize {
433 let max_result = input.n() + input.m();
434 let ctx = &mut DiffTraversal::from_vector(input, v, forward, max_result);
435 (0..max_result)
436 .filter_map(|d| {
437 if forward {
438 diff_sequences_kernel_forward(input, ctx, d)
439 } else {
440 diff_sequences_kernel_backward(input, ctx, d)
441 }
442 })
443 .next()
444 .unwrap_or(max_result)
445}
446
/// Computes the diff of `input`: clears `dst`, then fills it with the
/// shared snakes (runs of identical tokens) in order. `v` is reusable
/// scratch storage for the underlying searches.
pub fn diff(input: &DiffInput, v: &mut Vec<isize>, dst: &mut Vec<Snake>) {
    dst.clear();
    diff_rec(input, v, dst)
}
452
/// Divide-and-conquer step of Myers' linear-space diff: find the
/// middle snake, recurse on the two halves, and handle the nearly
/// trivial case (edit distance <= 1) directly.
fn diff_rec(input: &DiffInput, v: &mut Vec<isize>, dst: &mut Vec<Snake>) {
    let n = to_isize(input.n());
    // A side with no visible tokens cannot contribute shared snakes.
    fn trivial_diff(tok: &Tokenization) -> bool {
        tok.one_past_end_index <= tok.start_index
    }

    if trivial_diff(&input.removed) || trivial_diff(&input.added) {
        return;
    }

    let (snake, d) = diff_sequences_bidirectional_snake(input, v);
    let &Snake { x0, y0, len } = &snake;
    if 1 < d {
        // Split around the middle snake and recurse on both halves,
        // emitting the snake itself in between (when nonempty).
        let (input1, input2) = input.split_at((x0, y0), (x0 + len, y0 + len));
        diff_rec(&input1, v, dst);
        if len != 0 {
            dst.push(snake);
        }
        diff_rec(&input2, v, dst);
    } else {
        // d <= 1: the two sides differ by at most one token; emit the
        // common prefix and suffix directly.
        let SplittingPoint { sp, dx, dy } = find_splitting_point(&input);
        let x0 = input.removed.start_index;
        let y0 = input.added.start_index;
        if sp != 0 {
            dst.push(Snake::default().from(x0, y0).len(sp));
        }
        let len = n - sp - dx;
        if len != 0 {
            dst.push(Snake::default().from(x0 + sp + dx, y0 + sp + dy).len(len));
        }
    }
}
485
/// Result of `find_splitting_point`: `sp` is the length of the common
/// prefix, and `(dx, dy)` marks which side holds the single extra
/// token, if any (removed / added respectively).
struct SplittingPoint {
    sp: isize,
    dx: isize,
    dy: isize,
}
491
492fn find_splitting_point(input: &DiffInput) -> SplittingPoint {
494 let n = to_isize(input.n());
495 let m = to_isize(input.m());
496 let (short, long, nb_tokens, dx, dy) = if n < m {
497 (&input.removed, &input.added, n, 0, 1)
498 } else if m < n {
499 (&input.added, &input.removed, m, 1, 0)
500 } else {
501 (&input.added, &input.removed, m, 0, 0)
502 };
503 let mut sp = nb_tokens;
504 for i in 0..nb_tokens {
505 if long.nth_token(i) != short.nth_token(i) {
506 sp = i;
507 break;
508 }
509 }
510 SplittingPoint { sp, dx, dy }
511}
512
/// Edit distance between the two sides of `input` using the
/// bidirectional search; returns 0 immediately for two empty sides.
pub fn diff_sequences_bidirectional(input: &DiffInput, v: &mut Vec<isize>) -> usize {
    if input.n() + input.m() == 0 {
        return 0;
    }
    to_usize(diff_sequences_bidirectional_snake(input, v).1)
}
521
/// Finds the middle snake and the edit distance by running the forward
/// and backward searches simultaneously until their frontiers meet.
/// Panics ("snake not found") if no overlap occurs; callers ensure at
/// least one side is nonempty.
fn diff_sequences_bidirectional_snake(input: &DiffInput, v: &mut Vec<isize>) -> (Snake, isize) {
    // Each direction only needs to explore about half the total distance.
    let max = (input.n() + input.m() + 1) / 2 + 1;
    let iter_len = 2 * max + 1;
    v.resize(2 * iter_len, 0);

    // Independent scratch halves for the two directions.
    let (v1, v2) = v.split_at_mut(iter_len);
    let ctx_fwd = &mut DiffTraversal::from_slice(input, v1, true, max);
    let ctx_bwd = &mut DiffTraversal::from_slice(input, v2, false, max);
    let mut result = (0..max)
        .filter_map(|d| diff_sequences_kernel_bidirectional(input, ctx_fwd, ctx_bwd, d))
        .next()
        .expect("snake not found");
    // Kernel coordinates are view-relative; shift back to absolute
    // token indices.
    result.0.x0 += input.removed.start_index;
    result.0.y0 += input.added.start_index;
    result
}
538
/// Converts a `usize` to `isize`, panicking if the value does not fit.
fn to_isize(input: usize) -> isize {
    isize::try_from(input).unwrap()
}
542
/// Converts an `isize` to `usize`, panicking if the value is negative.
fn to_usize(input: isize) -> usize {
    usize::try_from(input).unwrap()
}
546
/// Coarse classification of a byte, used by `tokenize` to decide
/// token boundaries.
#[derive(PartialEq, Eq, Clone, Copy, Debug)]
enum TokenKind {
    /// Any byte that is neither word-like nor blank; each such byte
    /// forms its own one-byte token.
    Other,
    /// ASCII alphanumerics and `_`.
    Word,
    /// Spaces and tabs.
    Spaces,
}
553
554pub fn tokenize(src: &[u8], ofs: usize, tokens: &mut Vec<HashedSpan>) {
556 let mut push = |lo: usize, hi: usize| {
557 if lo < hi {
558 tokens.push(HashedSpan {
559 lo,
560 hi,
561 hash: hash_slice(&src[lo..hi]),
562 })
563 }
564 };
565 let mut lo = ofs;
566 let mut kind = TokenKind::Other;
567 for hi in ofs..src.len() {
568 let oldkind = kind;
569 kind = classify_byte(src[hi]);
570 if kind != oldkind || oldkind == TokenKind::Other {
571 push(lo, hi);
572 lo = hi
573 }
574 }
575 push(lo, src.len());
576}
577
578fn classify_byte(b: u8) -> TokenKind {
579 match b {
580 b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_' => TokenKind::Word,
581 b'\t' | b' ' => TokenKind::Spaces,
582 _ => TokenKind::Other,
583 }
584}
585
586mod best_projection;
587pub use crate::best_projection::optimize_partition;
588pub use crate::best_projection::{NormalizationResult, SharedSegments};
589
590#[cfg(test)]
591mod test;