harper_core/
token_string_ext.rs

1use crate::{Span, Token};
2use itertools::Itertools;
3use paste::paste;
4
/// Declares, inside a trait body, the family of query methods for one token
/// kind.
///
/// For `$thing = word` this expands to `first_word`, `last_word`,
/// `last_word_index`, `iter_word_indices` and `iter_words`. The matching
/// bodies are produced by `create_fns_for!`.
macro_rules! create_decl_for {
    ($thing:ident) => {
        paste! {
            /// Get the first token of this kind, if there is one.
            fn [< first_ $thing >](&self) -> Option<&Token>;

            /// Get the last token of this kind, if there is one.
            fn [< last_ $thing >](&self) -> Option<&Token>;

            /// Get the index of the last token of this kind, if there is one.
            fn [< last_ $thing _index >](&self) -> Option<usize>;

            /// Iterate over the indices of the tokens of this kind.
            fn [< iter_ $thing _indices >](&self) -> impl DoubleEndedIterator<Item = usize> + '_;

            /// Iterate over the tokens of this kind.
            fn [< iter_ $thing s >](&self) -> impl Iterator<Item = &Token> + '_;
        }
    };
}
20
/// Implements the methods declared by `create_decl_for!` for one token kind.
///
/// Every generated method selects tokens with the `is_$thing` predicate on the
/// token's `kind`, and expects `self` to offer `iter()` over tokens.
macro_rules! create_fns_for {
    ($thing:ident) => {
        paste! {
            fn [< first_ $thing >](&self) -> Option<&Token> {
                self.iter().find(|tok| tok.kind.[< is_ $thing >]())
            }

            fn [< last_ $thing >](&self) -> Option<&Token> {
                // Search from the back so the first hit is the last match.
                self.iter().rfind(|tok| tok.kind.[< is_ $thing >]())
            }

            fn [< last_ $thing _index >](&self) -> Option<usize> {
                // `rposition` searches from the back but reports the index
                // counted from the front, so no manual conversion is needed.
                self.iter().rposition(|tok| tok.kind.[< is_ $thing >]())
            }

            fn [< iter_ $thing _indices >](&self) -> impl DoubleEndedIterator<Item = usize> + '_ {
                self.iter()
                    .enumerate()
                    .filter_map(|(idx, tok)| tok.kind.[< is_ $thing >]().then_some(idx))
            }

            fn [< iter_ $thing s >](&self) -> impl Iterator<Item = &Token> + '_ {
                self.iter().filter(|tok| tok.kind.[< is_ $thing >]())
            }
        }
    };
}
49
50mod private {
51    use crate::{Document, Token};
52
53    pub trait Sealed {}
54
55    impl Sealed for [Token] {}
56
57    impl Sealed for Document {}
58}
59
60/// Extension methods for [`Token`] sequences that make them easier to wrangle and query.
61pub trait TokenStringExt: private::Sealed {
62    fn first_sentence_word(&self) -> Option<&Token>;
63    fn first_non_whitespace(&self) -> Option<&Token>;
64    /// Grab the span that represents the beginning of the first element and the
65    /// end of the last element.
66    fn span(&self) -> Option<Span<char>>;
67
68    create_decl_for!(adjective);
69    create_decl_for!(apostrophe);
70    create_decl_for!(at);
71    create_decl_for!(comma);
72    create_decl_for!(conjunction);
73    create_decl_for!(chunk_terminator);
74    create_decl_for!(currency);
75    create_decl_for!(ellipsis);
76    create_decl_for!(hostname);
77    create_decl_for!(likely_homograph);
78    create_decl_for!(number);
79    create_decl_for!(noun);
80    create_decl_for!(paragraph_break);
81    create_decl_for!(pipe);
82    create_decl_for!(preposition);
83    create_decl_for!(punctuation);
84    create_decl_for!(quote);
85    create_decl_for!(sentence_terminator);
86    create_decl_for!(space);
87    create_decl_for!(unlintable);
88    create_decl_for!(verb);
89    create_decl_for!(word);
90    create_decl_for!(word_like);
91    create_decl_for!(heading_start);
92
93    fn iter_linking_verb_indices(&self) -> impl Iterator<Item = usize> + '_;
94    fn iter_linking_verbs(&self) -> impl Iterator<Item = &Token> + '_;
95
96    /// Iterate over chunks.
97    ///
98    /// For example, the following sentence contains two chunks separated by a
99    /// comma:
100    ///
101    /// ```text
102    /// Here is an example, it is short.
103    /// ```
104    fn iter_chunks(&self) -> impl Iterator<Item = &'_ [Token]> + '_;
105
106    /// Get an iterator over token slices that represent the individual
107    /// paragraphs in a document.
108    fn iter_paragraphs(&self) -> impl Iterator<Item = &'_ [Token]> + '_;
109
110    /// Get an iterator over token slices that represent headings.
111    ///
112    /// A heading begins with a [`TokenKind::HeadingStart`] token and ends with
113    /// the next [`TokenKind::ParagraphBreak`].
114    fn iter_headings(&self) -> impl Iterator<Item = &'_ [Token]> + '_;
115
116    /// Get an iterator over token slices that represent the individual
117    /// sentences in a document.
118    fn iter_sentences(&self) -> impl Iterator<Item = &'_ [Token]> + '_;
119
120    /// Get an iterator over mutable token slices that represent the individual
121    /// sentences in a document.
122    fn iter_sentences_mut(&mut self) -> impl Iterator<Item = &'_ mut [Token]> + '_;
123}
124
125impl TokenStringExt for [Token] {
126    create_fns_for!(adjective);
127    create_fns_for!(apostrophe);
128    create_fns_for!(at);
129    create_fns_for!(chunk_terminator);
130    create_fns_for!(comma);
131    create_fns_for!(conjunction);
132    create_fns_for!(currency);
133    create_fns_for!(ellipsis);
134    create_fns_for!(hostname);
135    create_fns_for!(likely_homograph);
136    create_fns_for!(noun);
137    create_fns_for!(number);
138    create_fns_for!(paragraph_break);
139    create_fns_for!(pipe);
140    create_fns_for!(preposition);
141    create_fns_for!(punctuation);
142    create_fns_for!(quote);
143    create_fns_for!(sentence_terminator);
144    create_fns_for!(space);
145    create_fns_for!(unlintable);
146    create_fns_for!(verb);
147    create_fns_for!(word_like);
148    create_fns_for!(word);
149    create_fns_for!(heading_start);
150
151    fn first_non_whitespace(&self) -> Option<&Token> {
152        self.iter().find(|t| !t.kind.is_whitespace())
153    }
154
155    fn first_sentence_word(&self) -> Option<&Token> {
156        let (w_idx, word) = self.iter().find_position(|v| v.kind.is_word())?;
157
158        let Some(u_idx) = self.iter().position(|v| v.kind.is_unlintable()) else {
159            return Some(word);
160        };
161
162        if w_idx < u_idx { Some(word) } else { None }
163    }
164
165    fn span(&self) -> Option<Span<char>> {
166        let min_max = self
167            .iter()
168            .flat_map(|v| [v.span.start, v.span.end].into_iter())
169            .minmax();
170
171        match min_max {
172            itertools::MinMaxResult::NoElements => None,
173            itertools::MinMaxResult::OneElement(min) => Some(Span::new(min, min)),
174            itertools::MinMaxResult::MinMax(min, max) => Some(Span::new(min, max)),
175        }
176    }
177
178    fn iter_linking_verb_indices(&self) -> impl Iterator<Item = usize> + '_ {
179        self.iter_word_indices().filter(|idx| {
180            let word = &self[*idx];
181            let Some(Some(meta)) = word.kind.as_word() else {
182                return false;
183            };
184
185            meta.is_linking_verb()
186        })
187    }
188
189    fn iter_linking_verbs(&self) -> impl Iterator<Item = &Token> + '_ {
190        self.iter_linking_verb_indices().map(|idx| &self[idx])
191    }
192
193    fn iter_chunks(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
194        let first_chunk = self
195            .iter_chunk_terminator_indices()
196            .next()
197            .map(|first_term| &self[0..=first_term]);
198
199        let rest = self
200            .iter_chunk_terminator_indices()
201            .tuple_windows()
202            .map(move |(a, b)| &self[a + 1..=b]);
203
204        let last = if let Some(last_i) = self.last_chunk_terminator_index() {
205            if last_i + 1 < self.len() {
206                Some(&self[last_i + 1..])
207            } else {
208                None
209            }
210        } else {
211            Some(self)
212        };
213
214        first_chunk.into_iter().chain(rest).chain(last)
215    }
216
217    fn iter_paragraphs(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
218        let first_pg = self
219            .iter_paragraph_break_indices()
220            .next()
221            .map(|first_term| &self[0..=first_term]);
222
223        let rest = self
224            .iter_paragraph_break_indices()
225            .tuple_windows()
226            .map(move |(a, b)| &self[a + 1..=b]);
227
228        let last_pg = if let Some(last_i) = self.last_paragraph_break_index() {
229            if last_i + 1 < self.len() {
230                Some(&self[last_i + 1..])
231            } else {
232                None
233            }
234        } else {
235            Some(self)
236        };
237
238        first_pg.into_iter().chain(rest).chain(last_pg)
239    }
240
241    fn iter_headings(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
242        self.iter_heading_start_indices().map(|start| {
243            let end = self[start..]
244                .iter()
245                .position(|t| t.kind.is_paragraph_break())
246                .unwrap_or(self[start..].len() - 1);
247
248            &self[start..=start + end]
249        })
250    }
251
252    fn iter_sentences(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
253        let first_sentence = self
254            .iter_sentence_terminator_indices()
255            .next()
256            .map(|first_term| &self[0..=first_term]);
257
258        let rest = self
259            .iter_sentence_terminator_indices()
260            .tuple_windows()
261            .map(move |(a, b)| &self[a + 1..=b]);
262
263        let last_sentence = if let Some(last_i) = self.last_sentence_terminator_index() {
264            if last_i + 1 < self.len() {
265                Some(&self[last_i + 1..])
266            } else {
267                None
268            }
269        } else {
270            Some(self)
271        };
272
273        first_sentence.into_iter().chain(rest).chain(last_sentence)
274    }
275
276    fn iter_sentences_mut(&mut self) -> impl Iterator<Item = &mut [Token]> + '_ {
277        struct SentIter<'a> {
278            rem: &'a mut [Token],
279        }
280
281        impl<'a> Iterator for SentIter<'a> {
282            type Item = &'a mut [Token];
283
284            fn next(&mut self) -> Option<Self::Item> {
285                if self.rem.is_empty() {
286                    return None;
287                }
288                let split = self
289                    .rem
290                    .iter()
291                    .position(|t| t.kind.is_sentence_terminator())
292                    .map(|i| i + 1)
293                    .unwrap_or(self.rem.len());
294                let tmp = core::mem::take(&mut self.rem);
295                let (sent, rest) = tmp.split_at_mut(split);
296                self.rem = rest;
297                Some(sent)
298            }
299        }
300
301        SentIter { rem: self }
302    }
303}