harper_core/
token_string_ext.rs1use crate::{Span, Token};
2use itertools::Itertools;
3use paste::paste;
4
/// Declares the family of per-kind query methods on a trait for one token
/// kind.
///
/// For an identifier `foo` this expands to the declarations `first_foo`,
/// `last_foo`, `last_foo_index`, `iter_foo_indices` and `iter_foos`. The
/// matching bodies are produced by `create_fns_for!`.
macro_rules! create_decl_for {
    ($kind:ident) => {
        paste! {
            /// Returns the first token of this kind, if any.
            fn [<first_ $kind>](&self) -> Option<&Token>;

            /// Returns the last token of this kind, if any.
            fn [<last_ $kind>](&self) -> Option<&Token>;

            /// Returns the index of the last token of this kind, if any.
            fn [<last_ $kind _index>](&self) -> Option<usize>;

            /// Iterates over the indices of every token of this kind.
            fn [<iter_ $kind _indices>](&self) -> impl DoubleEndedIterator<Item = usize> + '_;

            /// Iterates over every token of this kind.
            fn [<iter_ $kind s>](&self) -> impl Iterator<Item = &Token> + '_;
        }
    };
}
20
/// Generates the bodies of the per-kind query methods declared by
/// `create_decl_for!`.
///
/// For an identifier `foo`, every generated method selects tokens by calling
/// `is_foo()` on the token's `kind`. The expansion expects `self.iter()` to
/// yield `&Token` from a double-ended, exact-size iterator (as a slice does).
macro_rules! create_fns_for {
    ($kind:ident) => {
        paste! {
            fn [<first_ $kind>](&self) -> Option<&Token> {
                self.iter().find(|tok| tok.kind.[<is_ $kind>]())
            }

            fn [<last_ $kind>](&self) -> Option<&Token> {
                // Search from the back so the first hit is the last match.
                self.iter().rfind(|tok| tok.kind.[<is_ $kind>]())
            }

            fn [<last_ $kind _index>](&self) -> Option<usize> {
                // `rposition` reports the index counted from the front.
                self.iter().rposition(|tok| tok.kind.[<is_ $kind>]())
            }

            fn [<iter_ $kind _indices>](&self) -> impl DoubleEndedIterator<Item = usize> + '_ {
                self.iter()
                    .enumerate()
                    .filter_map(|(idx, tok)| tok.kind.[<is_ $kind>]().then_some(idx))
            }

            fn [<iter_ $kind s>](&self) -> impl Iterator<Item = &Token> + '_ {
                self.iter().filter(|tok| tok.kind.[<is_ $kind>]())
            }
        }
    };
}
49
50mod private {
51 use crate::{Document, Token};
52
53 pub trait Sealed {}
54
55 impl Sealed for [Token] {}
56
57 impl Sealed for Document {}
58}
59
60pub trait TokenStringExt: private::Sealed {
62 fn first_sentence_word(&self) -> Option<&Token>;
63 fn first_non_whitespace(&self) -> Option<&Token>;
64 fn span(&self) -> Option<Span<char>>;
67
68 create_decl_for!(adjective);
69 create_decl_for!(apostrophe);
70 create_decl_for!(at);
71 create_decl_for!(comma);
72 create_decl_for!(conjunction);
73 create_decl_for!(chunk_terminator);
74 create_decl_for!(currency);
75 create_decl_for!(ellipsis);
76 create_decl_for!(hostname);
77 create_decl_for!(likely_homograph);
78 create_decl_for!(number);
79 create_decl_for!(noun);
80 create_decl_for!(paragraph_break);
81 create_decl_for!(pipe);
82 create_decl_for!(preposition);
83 create_decl_for!(punctuation);
84 create_decl_for!(quote);
85 create_decl_for!(sentence_terminator);
86 create_decl_for!(space);
87 create_decl_for!(unlintable);
88 create_decl_for!(verb);
89 create_decl_for!(word);
90 create_decl_for!(word_like);
91 create_decl_for!(heading_start);
92
93 fn iter_linking_verb_indices(&self) -> impl Iterator<Item = usize> + '_;
94 fn iter_linking_verbs(&self) -> impl Iterator<Item = &Token> + '_;
95
96 fn iter_chunks(&self) -> impl Iterator<Item = &'_ [Token]> + '_;
105
106 fn iter_paragraphs(&self) -> impl Iterator<Item = &'_ [Token]> + '_;
109
110 fn iter_headings(&self) -> impl Iterator<Item = &'_ [Token]> + '_;
115
116 fn iter_sentences(&self) -> impl Iterator<Item = &'_ [Token]> + '_;
119
120 fn iter_sentences_mut(&mut self) -> impl Iterator<Item = &'_ mut [Token]> + '_;
123}
124
125impl TokenStringExt for [Token] {
126 create_fns_for!(adjective);
127 create_fns_for!(apostrophe);
128 create_fns_for!(at);
129 create_fns_for!(chunk_terminator);
130 create_fns_for!(comma);
131 create_fns_for!(conjunction);
132 create_fns_for!(currency);
133 create_fns_for!(ellipsis);
134 create_fns_for!(hostname);
135 create_fns_for!(likely_homograph);
136 create_fns_for!(noun);
137 create_fns_for!(number);
138 create_fns_for!(paragraph_break);
139 create_fns_for!(pipe);
140 create_fns_for!(preposition);
141 create_fns_for!(punctuation);
142 create_fns_for!(quote);
143 create_fns_for!(sentence_terminator);
144 create_fns_for!(space);
145 create_fns_for!(unlintable);
146 create_fns_for!(verb);
147 create_fns_for!(word_like);
148 create_fns_for!(word);
149 create_fns_for!(heading_start);
150
151 fn first_non_whitespace(&self) -> Option<&Token> {
152 self.iter().find(|t| !t.kind.is_whitespace())
153 }
154
155 fn first_sentence_word(&self) -> Option<&Token> {
156 let (w_idx, word) = self.iter().find_position(|v| v.kind.is_word())?;
157
158 let Some(u_idx) = self.iter().position(|v| v.kind.is_unlintable()) else {
159 return Some(word);
160 };
161
162 if w_idx < u_idx { Some(word) } else { None }
163 }
164
165 fn span(&self) -> Option<Span<char>> {
166 let min_max = self
167 .iter()
168 .flat_map(|v| [v.span.start, v.span.end].into_iter())
169 .minmax();
170
171 match min_max {
172 itertools::MinMaxResult::NoElements => None,
173 itertools::MinMaxResult::OneElement(min) => Some(Span::new(min, min)),
174 itertools::MinMaxResult::MinMax(min, max) => Some(Span::new(min, max)),
175 }
176 }
177
178 fn iter_linking_verb_indices(&self) -> impl Iterator<Item = usize> + '_ {
179 self.iter_word_indices().filter(|idx| {
180 let word = &self[*idx];
181 let Some(Some(meta)) = word.kind.as_word() else {
182 return false;
183 };
184
185 meta.is_linking_verb()
186 })
187 }
188
189 fn iter_linking_verbs(&self) -> impl Iterator<Item = &Token> + '_ {
190 self.iter_linking_verb_indices().map(|idx| &self[idx])
191 }
192
193 fn iter_chunks(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
194 let first_chunk = self
195 .iter_chunk_terminator_indices()
196 .next()
197 .map(|first_term| &self[0..=first_term]);
198
199 let rest = self
200 .iter_chunk_terminator_indices()
201 .tuple_windows()
202 .map(move |(a, b)| &self[a + 1..=b]);
203
204 let last = if let Some(last_i) = self.last_chunk_terminator_index() {
205 if last_i + 1 < self.len() {
206 Some(&self[last_i + 1..])
207 } else {
208 None
209 }
210 } else {
211 Some(self)
212 };
213
214 first_chunk.into_iter().chain(rest).chain(last)
215 }
216
217 fn iter_paragraphs(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
218 let first_pg = self
219 .iter_paragraph_break_indices()
220 .next()
221 .map(|first_term| &self[0..=first_term]);
222
223 let rest = self
224 .iter_paragraph_break_indices()
225 .tuple_windows()
226 .map(move |(a, b)| &self[a + 1..=b]);
227
228 let last_pg = if let Some(last_i) = self.last_paragraph_break_index() {
229 if last_i + 1 < self.len() {
230 Some(&self[last_i + 1..])
231 } else {
232 None
233 }
234 } else {
235 Some(self)
236 };
237
238 first_pg.into_iter().chain(rest).chain(last_pg)
239 }
240
241 fn iter_headings(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
242 self.iter_heading_start_indices().map(|start| {
243 let end = self[start..]
244 .iter()
245 .position(|t| t.kind.is_paragraph_break())
246 .unwrap_or(self[start..].len() - 1);
247
248 &self[start..=start + end]
249 })
250 }
251
252 fn iter_sentences(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
253 let first_sentence = self
254 .iter_sentence_terminator_indices()
255 .next()
256 .map(|first_term| &self[0..=first_term]);
257
258 let rest = self
259 .iter_sentence_terminator_indices()
260 .tuple_windows()
261 .map(move |(a, b)| &self[a + 1..=b]);
262
263 let last_sentence = if let Some(last_i) = self.last_sentence_terminator_index() {
264 if last_i + 1 < self.len() {
265 Some(&self[last_i + 1..])
266 } else {
267 None
268 }
269 } else {
270 Some(self)
271 };
272
273 first_sentence.into_iter().chain(rest).chain(last_sentence)
274 }
275
276 fn iter_sentences_mut(&mut self) -> impl Iterator<Item = &mut [Token]> + '_ {
277 struct SentIter<'a> {
278 rem: &'a mut [Token],
279 }
280
281 impl<'a> Iterator for SentIter<'a> {
282 type Item = &'a mut [Token];
283
284 fn next(&mut self) -> Option<Self::Item> {
285 if self.rem.is_empty() {
286 return None;
287 }
288 let split = self
289 .rem
290 .iter()
291 .position(|t| t.kind.is_sentence_terminator())
292 .map(|i| i + 1)
293 .unwrap_or(self.rem.len());
294 let tmp = core::mem::take(&mut self.rem);
295 let (sent, rest) = tmp.split_at_mut(split);
296 self.rem = rest;
297 Some(sent)
298 }
299 }
300
301 SentIter { rem: self }
302 }
303}