gix_imara_diff/
sources.rs1use std::str::from_utf8_unchecked;
10
11use memchr::memchr;
12
13use crate::TokenSource;
14
15pub fn lines(data: &str) -> Lines<'_> {
20 Lines(ByteLines(data.as_bytes()))
21}
22
/// Returns a [`TokenSource`] that tokenizes `data` into words.
///
/// A word is a run of spaces, a run of alphanumeric/underscore characters
/// (when the run begins with an alphanumeric character), or a single
/// character of any other kind.
pub fn words(data: &str) -> Words<'_> {
    Words(data)
}
30
/// Returns a [`TokenSource`] that tokenizes the byte string `data` into lines,
/// keeping the trailing `\n` on every line that has one.
pub fn bstr_lines(data: &bstr::BStr) -> BStrLines<'_> {
    BStrLines(data)
}
38
/// Returns a [`TokenSource`] that tokenizes the byte slice `data` into lines,
/// keeping the trailing `\n` on every line that has one.
pub fn byte_lines(data: &[u8]) -> ByteLines<'_> {
    ByteLines(data)
}
46
47impl<'a> TokenSource for &'a str {
49 type Token = &'a str;
50
51 type Tokenizer = Lines<'a>;
52
53 fn tokenize(&self) -> Self::Tokenizer {
54 lines(self)
55 }
56
57 fn estimate_tokens(&self) -> u32 {
58 lines(self).estimate_tokens()
59 }
60}
61
62impl<'a> TokenSource for &'a bstr::BStr {
64 type Token = Self;
65 type Tokenizer = BStrLines<'a>;
66
67 fn tokenize(&self) -> Self::Tokenizer {
68 bstr_lines(self)
69 }
70
71 fn estimate_tokens(&self) -> u32 {
72 bstr_lines(self).estimate_tokens()
73 }
74}
75
76impl<'a> TokenSource for &'a [u8] {
78 type Token = Self;
79 type Tokenizer = ByteLines<'a>;
80
81 fn tokenize(&self) -> Self::Tokenizer {
82 byte_lines(self)
83 }
84
85 fn estimate_tokens(&self) -> u32 {
86 byte_lines(self).estimate_tokens()
87 }
88}
89
/// Line tokenizer over `&str` input; yields `&str` lines that keep their
/// trailing `\n` terminator. Constructed via [`lines`].
#[derive(Clone, Copy, PartialEq, Eq)]
pub struct Lines<'a>(ByteLines<'a>);
94
95impl<'a> Iterator for Lines<'a> {
96 type Item = &'a str;
97
98 fn next(&mut self) -> Option<Self::Item> {
99 self.0.next().map(|it| unsafe { from_utf8_unchecked(it) })
102 }
103}
104
105impl<'a> TokenSource for Lines<'a> {
107 type Token = &'a str;
108
109 type Tokenizer = Self;
110
111 fn tokenize(&self) -> Self::Tokenizer {
112 *self
113 }
114
115 fn estimate_tokens(&self) -> u32 {
116 self.0.estimate_tokens()
117 }
118}
119
/// Word tokenizer over `&str` input; see [`words`] for the splitting rules.
#[derive(Clone, Copy, PartialEq, Eq)]
pub struct Words<'a>(&'a str);
124
125impl<'a> Iterator for Words<'a> {
126 type Item = &'a str;
127
128 fn next(&mut self) -> Option<Self::Item> {
129 if self.0.is_empty() {
130 return None;
131 }
132
133 let initial = self.0.chars().next().unwrap();
134 let word_len = if initial == ' ' {
135 self.0
136 .char_indices()
137 .find(|(_, c)| *c != ' ')
138 .map_or(self.0.len(), |(index, _)| index)
139 } else if initial.is_alphanumeric() {
140 self.0
141 .char_indices()
142 .find(|(_, c)| !c.is_alphanumeric() && *c != '_')
143 .map_or(self.0.len(), |(index, _)| index)
144 } else {
145 initial.len_utf8()
146 };
147
148 let (word, rem) = self.0.split_at(word_len);
149 self.0 = rem;
150 Some(word)
151 }
152}
153impl<'a> TokenSource for Words<'a> {
154 type Token = &'a str;
155
156 type Tokenizer = Self;
157
158 fn tokenize(&self) -> Self::Tokenizer {
159 *self
160 }
161
162 fn estimate_tokens(&self) -> u32 {
163 (self.0.len() / 3) as u32
164 }
165}
166
/// Line tokenizer over byte-string input; yields `&bstr::BStr` lines that
/// keep their trailing `\n` terminator. Constructed via [`bstr_lines`].
#[derive(Clone, Copy, PartialEq, Eq)]
pub struct BStrLines<'a>(&'a bstr::BStr);
170
171impl<'a> Iterator for BStrLines<'a> {
172 type Item = &'a bstr::BStr;
173
174 fn next(&mut self) -> Option<Self::Item> {
175 if self.0.is_empty() {
176 return None;
177 }
178 let line_len = memchr(b'\n', self.0).map_or(self.0.len(), |len| len + 1);
179 let (line, rem) = self.0.split_at(line_len);
180 self.0 = rem.into();
181 Some(line.into())
182 }
183}
184
185impl<'a> TokenSource for BStrLines<'a> {
186 type Token = &'a bstr::BStr;
187 type Tokenizer = Self;
188
189 fn tokenize(&self) -> Self::Tokenizer {
190 *self
191 }
192
193 fn estimate_tokens(&self) -> u32 {
194 let len: usize = self.take(20).map(|line| line.len()).sum();
195 (self.0.len() * 20).checked_div(len).unwrap_or(100) as u32
196 }
197}
198
/// Line tokenizer over `&[u8]` input; yields `&[u8]` lines that keep their
/// trailing `\n` terminator. Constructed via [`byte_lines`].
#[derive(Clone, Copy, PartialEq, Eq)]
pub struct ByteLines<'a>(&'a [u8]);
203
204impl<'a> Iterator for ByteLines<'a> {
205 type Item = &'a [u8];
206
207 fn next(&mut self) -> Option<Self::Item> {
208 if self.0.is_empty() {
209 return None;
210 }
211 let line_len = memchr(b'\n', self.0).map_or(self.0.len(), |len| len + 1);
212 let (line, rem) = self.0.split_at(line_len);
213 self.0 = rem;
214 Some(line)
215 }
216}
217
218impl<'a> TokenSource for ByteLines<'a> {
220 type Token = &'a [u8];
221
222 type Tokenizer = Self;
223
224 fn tokenize(&self) -> Self::Tokenizer {
225 *self
226 }
227
228 fn estimate_tokens(&self) -> u32 {
229 let len: usize = self.take(20).map(|line| line.len()).sum();
230 (self.0.len() * 20).checked_div(len).unwrap_or(100) as u32
231 }
232}