imara_diff/
sources.rs

1use std::str::from_utf8_unchecked;
2
3use memchr::memchr;
4
5use crate::TokenSource;
6
7/// Returns a [`TokenSource`] that uses the lines in `data` as Tokens. The newline
8/// separator (`\r\n` or `\n`) is included in the emitted tokens. This means that changing
9/// the newline separator from `\r\n` to `\n` (or omitting it fully on the last line) is
10/// detected by [`Diff`](crate::Diff).
11pub fn lines(data: &str) -> Lines<'_> {
12    Lines(ByteLines(data.as_bytes()))
13}
14
15/// Returns a [`TokenSource`] that uses the lines in `data` as Tokens. The newline
16/// separator (`\r\n` or `\n`) is included in the emitted tokens. This means that changing
17/// the newline separator from `\r\n` to `\n` (or omitting it fully on the last line) is
18/// detected when computing a [`Diff`](crate::Diff).
19pub fn byte_lines(data: &[u8]) -> ByteLines<'_> {
20    ByteLines(data)
21}
22
23/// By default, a line diff is produced for a string
24impl<'a> TokenSource for &'a str {
25    type Token = &'a str;
26
27    type Tokenizer = Lines<'a>;
28
29    fn tokenize(&self) -> Self::Tokenizer {
30        lines(self)
31    }
32
33    fn estimate_tokens(&self) -> u32 {
34        lines(self).estimate_tokens()
35    }
36}
37
38/// By default, a line diff is produced for a bytes
39impl<'a> TokenSource for &'a [u8] {
40    type Token = Self;
41    type Tokenizer = ByteLines<'a>;
42
43    fn tokenize(&self) -> Self::Tokenizer {
44        byte_lines(self)
45    }
46
47    fn estimate_tokens(&self) -> u32 {
48        byte_lines(self).estimate_tokens()
49    }
50}
51
52/// A [`TokenSource`] that returns the lines of a `str` as tokens. See [`lines`] for
53/// details.
54#[derive(Clone, Copy, PartialEq, Eq)]
55pub struct Lines<'a>(ByteLines<'a>);
56
57impl<'a> Iterator for Lines<'a> {
58    type Item = &'a str;
59
60    fn next(&mut self) -> Option<Self::Item> {
61        // safety invariant: this struct may only contain valid utf8
62        // dividing valid utf8 bytes by ascii characters always produces valid utf-8
63        self.0.next().map(|it| unsafe { from_utf8_unchecked(it) })
64    }
65}
66
67/// By default a line diff is produced for a string
68impl<'a> TokenSource for Lines<'a> {
69    type Token = &'a str;
70
71    type Tokenizer = Self;
72
73    fn tokenize(&self) -> Self::Tokenizer {
74        *self
75    }
76
77    fn estimate_tokens(&self) -> u32 {
78        self.0.estimate_tokens()
79    }
80}
81
/// Line tokenizer over a byte slice, usable as a [`TokenSource`]. Construct it with
/// [`byte_lines`]; see that function for how line terminators are handled.
///
/// The wrapped slice is the input that has not been yielded yet.
#[derive(Copy, Clone, Eq, PartialEq)]
pub struct ByteLines<'a>(&'a [u8]);
86
87impl<'a> Iterator for ByteLines<'a> {
88    type Item = &'a [u8];
89
90    fn next(&mut self) -> Option<Self::Item> {
91        if self.0.is_empty() {
92            return None;
93        }
94        let line_len = memchr(b'\n', self.0).map_or(self.0.len(), |len| len + 1);
95        let (line, rem) = self.0.split_at(line_len);
96        self.0 = rem;
97        Some(line)
98    }
99}
100
101/// By default a line diff is produced for a string
102impl<'a> TokenSource for ByteLines<'a> {
103    type Token = &'a [u8];
104
105    type Tokenizer = Self;
106
107    fn tokenize(&self) -> Self::Tokenizer {
108        *self
109    }
110
111    fn estimate_tokens(&self) -> u32 {
112        let len: usize = self.take(20).map(|line| line.len()).sum();
113        if len == 0 {
114            100
115        } else {
116            (self.0.len() * 20 / len) as u32
117        }
118    }
119}