// gix_imara_diff/sources.rs
//! Modified for gitoxide from the upstream imara-diff crate.
//! Upstream source: git cat-file -p 32d1e45d3df061e6ccba6db7fdce92db29e345d8:src/sources.rs
//!
//! Utilities for creating token sources from common data types.
//!
//! This module provides implementations of [`TokenSource`] for
//! strings and byte slices, splitting them into lines by default.

9use std::str::from_utf8_unchecked;
10
11use memchr::memchr;
12
13use crate::TokenSource;
14
15/// Returns a [`TokenSource`] that uses the lines in `data` as Tokens. The newline
16/// separator (`\r\n` or `\n`) is included in the emitted tokens. This means that changing
17/// the newline separator from `\r\n` to `\n` (or omitting it fully on the last line) is
18/// detected by [`Diff`](crate::Diff).
19pub fn lines(data: &str) -> Lines<'_> {
20    Lines(ByteLines(data.as_bytes()))
21}
22
23/// Returns a [`TokenSource`] that uses the words in `data` as Tokens. A word is
24/// a sequence of alphanumeric characters as determined by
25/// `char::is_alphanumeric`, or a sequence of just the space character ' '. Any
26/// other characters are their own word.
27pub fn words(data: &str) -> Words<'_> {
28    Words(data)
29}
30
31/// Returns a [`TokenSource`] that uses the lines in `data` as Tokens. The newline
32/// separator (`\r\n` or `\n`) is included in the emitted tokens. This means that changing
33/// the newline separator from `\r\n` to `\n` (or omitting it fully on the last line) is
34/// detected when computing a [`Diff`](crate::Diff).
35pub fn bstr_lines(data: &bstr::BStr) -> BStrLines<'_> {
36    BStrLines(data)
37}
38
39/// Returns a [`TokenSource`] that uses the lines in `data` as Tokens. The newline
40/// separator (`\r\n` or `\n`) is included in the emitted tokens. This means that changing
41/// the newline separator from `\r\n` to `\n` (or omitting it fully on the last line) is
42/// detected when computing a [`Diff`](crate::Diff).
43pub fn byte_lines(data: &[u8]) -> ByteLines<'_> {
44    ByteLines(data)
45}
46
47/// By default, a line diff is produced for a string
48impl<'a> TokenSource for &'a str {
49    type Token = &'a str;
50
51    type Tokenizer = Lines<'a>;
52
53    fn tokenize(&self) -> Self::Tokenizer {
54        lines(self)
55    }
56
57    fn estimate_tokens(&self) -> u32 {
58        lines(self).estimate_tokens()
59    }
60}
61
62/// By default, a line diff is produced for a `BStr`.
63impl<'a> TokenSource for &'a bstr::BStr {
64    type Token = Self;
65    type Tokenizer = BStrLines<'a>;
66
67    fn tokenize(&self) -> Self::Tokenizer {
68        bstr_lines(self)
69    }
70
71    fn estimate_tokens(&self) -> u32 {
72        bstr_lines(self).estimate_tokens()
73    }
74}
75
76/// By default, a line diff is produced for a bytes
77impl<'a> TokenSource for &'a [u8] {
78    type Token = Self;
79    type Tokenizer = ByteLines<'a>;
80
81    fn tokenize(&self) -> Self::Tokenizer {
82        byte_lines(self)
83    }
84
85    fn estimate_tokens(&self) -> u32 {
86        byte_lines(self).estimate_tokens()
87    }
88}
89
/// A [`TokenSource`] that returns the lines of a `str` as tokens. See [`lines`] for
/// details.
// Invariant: the wrapped bytes are always valid UTF-8 — this type is only
// constructed from a `&str` (see `lines`) — which is what makes the unchecked
// UTF-8 conversion in its `Iterator::next` sound.
#[derive(Clone, Copy, PartialEq, Eq)]
pub struct Lines<'a>(ByteLines<'a>);
94
95impl<'a> Iterator for Lines<'a> {
96    type Item = &'a str;
97
98    fn next(&mut self) -> Option<Self::Item> {
99        // safety invariant: this struct may only contain valid utf8
100        // dividing valid utf8 bytes by ascii characters always produces valid utf-8
101        self.0.next().map(|it| unsafe { from_utf8_unchecked(it) })
102    }
103}
104
105/// By default, a line diff is produced for a string
106impl<'a> TokenSource for Lines<'a> {
107    type Token = &'a str;
108
109    type Tokenizer = Self;
110
111    fn tokenize(&self) -> Self::Tokenizer {
112        *self
113    }
114
115    fn estimate_tokens(&self) -> u32 {
116        self.0.estimate_tokens()
117    }
118}
119
/// A [`TokenSource`] that returns the words of a string as tokens. See
/// [`words`] for details.
#[derive(Clone, Copy, PartialEq, Eq)]
pub struct Words<'a>(&'a str);

impl<'a> Iterator for Words<'a> {
    type Item = &'a str;

    fn next(&mut self) -> Option<Self::Item> {
        // The first character decides what kind of token is produced.
        let first = self.0.chars().next()?;

        let token_len = match first {
            // A run of consecutive spaces forms a single token.
            ' ' => self.0.find(|c: char| c != ' ').unwrap_or(self.0.len()),
            // A word: alphanumerics, with embedded underscores allowed.
            c if c.is_alphanumeric() => self
                .0
                .find(|c: char| !c.is_alphanumeric() && c != '_')
                .unwrap_or(self.0.len()),
            // Any other character stands alone as its own token.
            c => c.len_utf8(),
        };

        let (token, rest) = self.0.split_at(token_len);
        self.0 = rest;
        Some(token)
    }
}
153impl<'a> TokenSource for Words<'a> {
154    type Token = &'a str;
155
156    type Tokenizer = Self;
157
158    fn tokenize(&self) -> Self::Tokenizer {
159        *self
160    }
161
162    fn estimate_tokens(&self) -> u32 {
163        (self.0.len() / 3) as u32
164    }
165}
166
/// A [`TokenSource`] that returns the lines of a `BStr` as tokens. See [`bstr_lines`] for details.
// Unlike `Lines`, no UTF-8 invariant is needed here — tokens stay byte strings.
#[derive(Clone, Copy, PartialEq, Eq)]
pub struct BStrLines<'a>(&'a bstr::BStr);
170
171impl<'a> Iterator for BStrLines<'a> {
172    type Item = &'a bstr::BStr;
173
174    fn next(&mut self) -> Option<Self::Item> {
175        if self.0.is_empty() {
176            return None;
177        }
178        let line_len = memchr(b'\n', self.0).map_or(self.0.len(), |len| len + 1);
179        let (line, rem) = self.0.split_at(line_len);
180        self.0 = rem.into();
181        Some(line.into())
182    }
183}
184
185impl<'a> TokenSource for BStrLines<'a> {
186    type Token = &'a bstr::BStr;
187    type Tokenizer = Self;
188
189    fn tokenize(&self) -> Self::Tokenizer {
190        *self
191    }
192
193    fn estimate_tokens(&self) -> u32 {
194        let len: usize = self.take(20).map(|line| line.len()).sum();
195        (self.0.len() * 20).checked_div(len).unwrap_or(100) as u32
196    }
197}
198
/// A [`TokenSource`] that returns the lines of a byte slice as tokens. See [`byte_lines`]
/// for details.
// Unlike `Lines`, no UTF-8 invariant is needed here — tokens stay byte slices.
#[derive(Clone, Copy, PartialEq, Eq)]
pub struct ByteLines<'a>(&'a [u8]);
203
204impl<'a> Iterator for ByteLines<'a> {
205    type Item = &'a [u8];
206
207    fn next(&mut self) -> Option<Self::Item> {
208        if self.0.is_empty() {
209            return None;
210        }
211        let line_len = memchr(b'\n', self.0).map_or(self.0.len(), |len| len + 1);
212        let (line, rem) = self.0.split_at(line_len);
213        self.0 = rem;
214        Some(line)
215    }
216}
217
218/// By default, a line diff is produced for a string
219impl<'a> TokenSource for ByteLines<'a> {
220    type Token = &'a [u8];
221
222    type Tokenizer = Self;
223
224    fn tokenize(&self) -> Self::Tokenizer {
225        *self
226    }
227
228    fn estimate_tokens(&self) -> u32 {
229        let len: usize = self.take(20).map(|line| line.len()).sum();
230        (self.0.len() * 20).checked_div(len).unwrap_or(100) as u32
231    }
232}