harper_core/
token_string_ext.rs1use crate::{Span, Token};
2use itertools::Itertools;
3use paste::paste;
4
/// Generates the per-kind lookup helpers for a single token kind.
///
/// `create_fns_for!(word)` expands to `first_word`, `last_word`, `last_word_index`,
/// `iter_word_indices` and `iter_words`. Each one is driven by the matching `is_word`
/// predicate on a token's `kind`. The expansion calls `self.tokens()`, so it must be
/// invoked somewhere that method is available.
macro_rules! create_fns_for {
    ($thing:ident) => {
        paste! {
            /// Returns the earliest token whose kind matches, or `None` if there is none.
            fn [< first_ $thing >](&self) -> Option<&Token> {
                self.tokens()
                    .iter()
                    .find(|tok| tok.kind.[< is_ $thing >]())
            }

            /// Returns the latest token whose kind matches, or `None` if there is none.
            fn [< last_ $thing >](&self) -> Option<&Token> {
                self.tokens()
                    .iter()
                    .rfind(|tok| tok.kind.[< is_ $thing >]())
            }

            /// Returns the index (counted from the front) of the latest matching token.
            fn [< last_ $thing _index >](&self) -> Option<usize> {
                self.tokens()
                    .iter()
                    .rposition(|tok| tok.kind.[< is_ $thing >]())
            }

            /// Iterates, in order, over the indices of every matching token.
            fn [< iter_ $thing _indices >](&self) -> impl DoubleEndedIterator<Item = usize> + '_ {
                self.tokens()
                    .iter()
                    .enumerate()
                    .filter_map(|(idx, tok)| tok.kind.[< is_ $thing >]().then_some(idx))
            }

            /// Iterates, in order, over every matching token.
            fn [< iter_ $thing s >](&self) -> impl Iterator<Item = &Token> + '_ {
                self.tokens()
                    .iter()
                    .filter(|tok| tok.kind.[< is_ $thing >]())
            }
        }
    };
}
37
38mod private {
39 use crate::{Document, Token};
40
41 pub trait Sealed {}
42
43 impl Sealed for [Token] {}
44
45 impl Sealed for Document {}
46}
47
48pub trait TokenStringExt: private::Sealed {
50 fn tokens(&self) -> &[Token];
52
53 fn tokens_mut(&mut self) -> &mut [Token];
55
56 create_fns_for!(adjective);
57 create_fns_for!(apostrophe);
58 create_fns_for!(at);
59 create_fns_for!(comma);
60 create_fns_for!(conjunction);
61 create_fns_for!(chunk_terminator);
62 create_fns_for!(currency);
63 create_fns_for!(ellipsis);
64 create_fns_for!(hostname);
65 create_fns_for!(likely_homograph);
66 create_fns_for!(number);
67 create_fns_for!(noun);
68 create_fns_for!(paragraph_break);
69 create_fns_for!(pipe);
70 create_fns_for!(preposition);
71 create_fns_for!(punctuation);
72 create_fns_for!(quote);
73 create_fns_for!(sentence_terminator);
74 create_fns_for!(space);
75 create_fns_for!(unlintable);
76 create_fns_for!(verb);
77 create_fns_for!(word);
78 create_fns_for!(word_like);
79 create_fns_for!(heading_start);
80
81 fn first_sentence_word(&self) -> Option<&Token> {
82 let tokens = self.tokens();
83
84 let (w_idx, word) = tokens.iter().find_position(|v| v.kind.is_word())?;
85
86 let Some(u_idx) = tokens.iter().position(|v| v.kind.is_unlintable()) else {
87 return Some(word);
88 };
89
90 if w_idx < u_idx { Some(word) } else { None }
91 }
92
93 fn first_non_whitespace(&self) -> Option<&Token> {
94 self.tokens().iter().find(|t| !t.kind.is_whitespace())
95 }
96
97 fn span(&self) -> Option<Span<char>> {
100 let min_max = self
101 .tokens()
102 .iter()
103 .flat_map(|v| [v.span.start, v.span.end].into_iter())
104 .minmax();
105
106 match min_max {
107 itertools::MinMaxResult::NoElements => None,
108 itertools::MinMaxResult::OneElement(min) => Some(Span::new(min, min)),
109 itertools::MinMaxResult::MinMax(min, max) => Some(Span::new(min, max)),
110 }
111 }
112
113 fn get_rel(&self, index: isize) -> Option<&Token>
133 where
134 Self: AsRef<[Token]>,
135 {
136 let slice = self.as_ref();
137 let len = slice.len() as isize;
138
139 if index >= len || -index > len {
140 return None;
141 }
142
143 let idx = if index >= 0 { index } else { len + index } as usize;
144
145 slice.get(idx)
146 }
147
148 fn get_rel_slice(&self, rel_start: isize, inclusive_end: isize) -> Option<&[Token]>
161 where
162 Self: AsRef<[Token]>,
163 {
164 let slice = self.as_ref();
165 let len = slice.len() as isize;
166
167 let start_idx = if rel_start >= 0 {
169 rel_start
170 } else {
171 len + rel_start
172 } as usize;
173
174 let end_idx_plus_one = if inclusive_end >= 0 {
175 inclusive_end + 1 } else {
177 len + inclusive_end + 1
178 } as usize;
179
180 if start_idx >= slice.len()
182 || end_idx_plus_one > slice.len()
183 || start_idx >= end_idx_plus_one
184 {
185 return None;
186 }
187
188 Some(&slice[start_idx..end_idx_plus_one])
189 }
190
191 fn get_ch<'a>(&self, src: &'a [char]) -> Option<&'a [char]> {
193 self.span().map(|s| s.get_content(src))
194 }
195
196 fn get_str(&self, src: &[char]) -> Option<String> {
197 self.span().map(|s| s.get_content_string(src))
198 }
199
200 fn iter_linking_verb_indices(&self) -> impl Iterator<Item = usize> + '_ {
201 let tokens = self.tokens();
202
203 tokens.iter_word_indices().filter(|idx| {
204 let word = &tokens[*idx];
205 let Some(Some(meta)) = word.kind.as_word() else {
206 return false;
207 };
208
209 meta.is_linking_verb()
210 })
211 }
212
213 fn iter_linking_verbs(&self) -> impl Iterator<Item = &Token> + '_ {
214 let tokens = self.tokens();
215
216 tokens.iter_linking_verb_indices().map(|idx| &tokens[idx])
217 }
218
219 fn iter_chunks(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
228 self.tokens()
229 .split_inclusive(|tok| tok.kind.is_chunk_terminator())
230 }
231
232 fn iter_paragraphs(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
235 self.tokens()
236 .split_inclusive(|tok| tok.kind.is_paragraph_break())
237 }
238
239 fn iter_headings(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
244 let tokens = self.tokens();
245
246 tokens.iter_heading_start_indices().map(|start| {
247 let end = tokens[start..]
248 .iter()
249 .position(|t| t.kind.is_paragraph_break())
250 .unwrap_or(tokens[start..].len() - 1);
251
252 &tokens[start..=start + end]
253 })
254 }
255
256 fn iter_sentences(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
259 self.tokens()
260 .split_inclusive(|tok| tok.kind.is_sentence_terminator())
261 }
262
263 fn iter_sentences_mut(&mut self) -> impl Iterator<Item = &'_ mut [Token]> + '_ {
266 struct SentIter<'a> {
267 rem: &'a mut [Token],
268 }
269
270 impl<'a> Iterator for SentIter<'a> {
271 type Item = &'a mut [Token];
272
273 fn next(&mut self) -> Option<Self::Item> {
274 if self.rem.is_empty() {
275 return None;
276 }
277 let split = self
278 .rem
279 .iter()
280 .position(|t| t.kind.is_sentence_terminator())
281 .map(|i| i + 1)
282 .unwrap_or(self.rem.len());
283 let tmp = core::mem::take(&mut self.rem);
284 let (sent, rest) = tmp.split_at_mut(split);
285 self.rem = rest;
286 Some(sent)
287 }
288 }
289
290 let tokens = self.tokens_mut();
291
292 SentIter { rem: tokens }
293 }
294}
295
296impl TokenStringExt for [Token] {
297 fn tokens(&self) -> &[Token] {
298 self
299 }
300
301 fn tokens_mut(&mut self) -> &mut [Token] {
302 self
303 }
304}