harper_core/
token_string_ext.rs

1use crate::{Span, Token};
2use itertools::Itertools;
3use paste::paste;
4
/// Declares, inside the [`TokenStringExt`] trait, the family of query methods
/// for a single token kind.
///
/// For an identifier `thing` this expands to the signatures of `first_thing`,
/// `last_thing`, `last_thing_index`, `iter_thing_indices` and `iter_things`.
/// The plural form is produced by appending a literal `s` to the identifier.
macro_rules! create_decl_for {
    ($thing:ident) => {
        paste! {
            #[doc = "Get the first token whose kind satisfies `is_" $thing "()`, if any."]
            fn [< first_ $thing >](&self) -> Option<&Token>;

            #[doc = "Get the last token whose kind satisfies `is_" $thing "()`, if any."]
            fn [< last_ $thing >](&self) -> Option<&Token>;

            #[doc = "Get the index of the last token whose kind satisfies `is_" $thing "()`, if any."]
            fn [< last_ $thing _index >](&self) -> Option<usize>;

            #[doc = "Iterate over the indices of all tokens whose kind satisfies `is_" $thing "()`."]
            fn [<iter_ $thing _indices>](&self) -> impl DoubleEndedIterator<Item = usize> + '_;

            #[doc = "Iterate over all tokens whose kind satisfies `is_" $thing "()`."]
            fn [<iter_ $thing s>](&self) -> impl Iterator<Item = &Token> + '_;
        }
    };
}
20
/// Implements, for `[Token]`, the family of query methods for a single token
/// kind.
///
/// For an identifier `thing` this expands to `first_thing`, `last_thing`,
/// `last_thing_index`, `iter_thing_indices` and `iter_things`. Every one of
/// them selects tokens by calling `is_thing()` on the token's `kind`.
macro_rules! create_fns_for {
    ($thing:ident) => {
        paste! {
            fn [< first_ $thing >](&self) -> Option<&Token> {
                self.iter().find(|tok| tok.kind.[<is_ $thing>]())
            }

            fn [< last_ $thing >](&self) -> Option<&Token> {
                // Search from the back so the scan stops at the last match.
                self.iter().rfind(|tok| tok.kind.[<is_ $thing>]())
            }

            fn [< last_ $thing _index >](&self) -> Option<usize> {
                // `rposition` searches from the back but reports the index
                // counted from the front, so no manual arithmetic is needed.
                self.iter().rposition(|tok| tok.kind.[<is_ $thing>]())
            }

            fn [<iter_ $thing _indices>](&self) -> impl DoubleEndedIterator<Item = usize> + '_ {
                self.iter()
                    .enumerate()
                    .filter_map(|(idx, tok)| tok.kind.[<is_ $thing>]().then_some(idx))
            }

            fn [<iter_ $thing s>](&self) -> impl Iterator<Item = &Token> + '_ {
                // Filter the tokens directly instead of collecting indices
                // and indexing back into the slice.
                self.iter().filter(|tok| tok.kind.[<is_ $thing>]())
            }
        }
    };
}
49
50/// Extension methods for [`Token`] sequences that make them easier to wrangle and query.
51pub trait TokenStringExt {
52    fn first_sentence_word(&self) -> Option<&Token>;
53    fn first_non_whitespace(&self) -> Option<&Token>;
54    /// Grab the span that represents the beginning of the first element and the
55    /// end of the last element.
56    fn span(&self) -> Option<Span<char>>;
57
58    create_decl_for!(adjective);
59    create_decl_for!(apostrophe);
60    create_decl_for!(at);
61    create_decl_for!(comma);
62    create_decl_for!(conjunction);
63    create_decl_for!(chunk_terminator);
64    create_decl_for!(currency);
65    create_decl_for!(ellipsis);
66    create_decl_for!(hostname);
67    create_decl_for!(likely_homograph);
68    create_decl_for!(number);
69    create_decl_for!(noun);
70    create_decl_for!(paragraph_break);
71    create_decl_for!(pipe);
72    create_decl_for!(preposition);
73    create_decl_for!(punctuation);
74    create_decl_for!(quote);
75    create_decl_for!(sentence_terminator);
76    create_decl_for!(space);
77    create_decl_for!(unlintable);
78    create_decl_for!(verb);
79    create_decl_for!(word);
80    create_decl_for!(word_like);
81
82    fn iter_linking_verb_indices(&self) -> impl Iterator<Item = usize> + '_;
83    fn iter_linking_verbs(&self) -> impl Iterator<Item = &Token> + '_;
84
85    /// Iterate over chunks.
86    ///
87    /// For example, the following sentence contains two chunks separated by a
88    /// comma:
89    ///
90    /// ```text
91    /// Here is an example, it is short.
92    /// ```
93    fn iter_chunks(&self) -> impl Iterator<Item = &'_ [Token]> + '_;
94
95    /// Get an iterator over token slices that represent the individual
96    /// paragraphs in a document.
97    fn iter_paragraphs(&self) -> impl Iterator<Item = &'_ [Token]> + '_;
98
99    /// Get an iterator over token slices that represent the individual
100    /// sentences in a document.
101    fn iter_sentences(&self) -> impl Iterator<Item = &'_ [Token]> + '_;
102
103    /// Get an iterator over mutable token slices that represent the individual
104    /// sentences in a document.
105    fn iter_sentences_mut(&mut self) -> impl Iterator<Item = &'_ mut [Token]> + '_;
106}
107
108impl TokenStringExt for [Token] {
109    create_fns_for!(adjective);
110    create_fns_for!(apostrophe);
111    create_fns_for!(at);
112    create_fns_for!(chunk_terminator);
113    create_fns_for!(comma);
114    create_fns_for!(conjunction);
115    create_fns_for!(currency);
116    create_fns_for!(ellipsis);
117    create_fns_for!(hostname);
118    create_fns_for!(likely_homograph);
119    create_fns_for!(noun);
120    create_fns_for!(number);
121    create_fns_for!(paragraph_break);
122    create_fns_for!(pipe);
123    create_fns_for!(preposition);
124    create_fns_for!(punctuation);
125    create_fns_for!(quote);
126    create_fns_for!(sentence_terminator);
127    create_fns_for!(space);
128    create_fns_for!(unlintable);
129    create_fns_for!(verb);
130    create_fns_for!(word_like);
131    create_fns_for!(word);
132
133    fn first_non_whitespace(&self) -> Option<&Token> {
134        self.iter().find(|t| !t.kind.is_whitespace())
135    }
136
137    fn first_sentence_word(&self) -> Option<&Token> {
138        let (w_idx, word) = self.iter().find_position(|v| v.kind.is_word())?;
139
140        let Some(u_idx) = self.iter().position(|v| v.kind.is_unlintable()) else {
141            return Some(word);
142        };
143
144        if w_idx < u_idx { Some(word) } else { None }
145    }
146
147    fn span(&self) -> Option<Span<char>> {
148        let min_max = self
149            .iter()
150            .flat_map(|v| [v.span.start, v.span.end].into_iter())
151            .minmax();
152
153        match min_max {
154            itertools::MinMaxResult::NoElements => None,
155            itertools::MinMaxResult::OneElement(min) => Some(Span::new(min, min)),
156            itertools::MinMaxResult::MinMax(min, max) => Some(Span::new(min, max)),
157        }
158    }
159
160    fn iter_linking_verb_indices(&self) -> impl Iterator<Item = usize> + '_ {
161        self.iter_word_indices().filter(|idx| {
162            let word = &self[*idx];
163            let Some(Some(meta)) = word.kind.as_word() else {
164                return false;
165            };
166
167            meta.is_linking_verb()
168        })
169    }
170
171    fn iter_linking_verbs(&self) -> impl Iterator<Item = &Token> + '_ {
172        self.iter_linking_verb_indices().map(|idx| &self[idx])
173    }
174
175    fn iter_chunks(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
176        let first_chunk = self
177            .iter_chunk_terminator_indices()
178            .next()
179            .map(|first_term| &self[0..=first_term]);
180
181        let rest = self
182            .iter_chunk_terminator_indices()
183            .tuple_windows()
184            .map(move |(a, b)| &self[a + 1..=b]);
185
186        let last = if let Some(last_i) = self.last_chunk_terminator_index() {
187            if last_i + 1 < self.len() {
188                Some(&self[last_i + 1..])
189            } else {
190                None
191            }
192        } else {
193            Some(self)
194        };
195
196        first_chunk.into_iter().chain(rest).chain(last)
197    }
198
199    fn iter_paragraphs(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
200        let first_pg = self
201            .iter_paragraph_break_indices()
202            .next()
203            .map(|first_term| &self[0..=first_term]);
204
205        let rest = self
206            .iter_paragraph_break_indices()
207            .tuple_windows()
208            .map(move |(a, b)| &self[a + 1..=b]);
209
210        let last_pg = if let Some(last_i) = self.last_paragraph_break_index() {
211            if last_i + 1 < self.len() {
212                Some(&self[last_i + 1..])
213            } else {
214                None
215            }
216        } else {
217            Some(self)
218        };
219
220        first_pg.into_iter().chain(rest).chain(last_pg)
221    }
222
223    fn iter_sentences(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
224        let first_sentence = self
225            .iter_sentence_terminator_indices()
226            .next()
227            .map(|first_term| &self[0..=first_term]);
228
229        let rest = self
230            .iter_sentence_terminator_indices()
231            .tuple_windows()
232            .map(move |(a, b)| &self[a + 1..=b]);
233
234        let last_sentence = if let Some(last_i) = self.last_sentence_terminator_index() {
235            if last_i + 1 < self.len() {
236                Some(&self[last_i + 1..])
237            } else {
238                None
239            }
240        } else {
241            Some(self)
242        };
243
244        first_sentence.into_iter().chain(rest).chain(last_sentence)
245    }
246
247    fn iter_sentences_mut(&mut self) -> impl Iterator<Item = &mut [Token]> + '_ {
248        struct SentIter<'a> {
249            rem: &'a mut [Token],
250        }
251
252        impl<'a> Iterator for SentIter<'a> {
253            type Item = &'a mut [Token];
254
255            fn next(&mut self) -> Option<Self::Item> {
256                if self.rem.is_empty() {
257                    return None;
258                }
259                let split = self
260                    .rem
261                    .iter()
262                    .position(|t| t.kind.is_sentence_terminator())
263                    .map(|i| i + 1)
264                    .unwrap_or(self.rem.len());
265                let tmp = core::mem::take(&mut self.rem);
266                let (sent, rest) = tmp.split_at_mut(split);
267                self.rem = rest;
268                Some(sent)
269            }
270        }
271
272        SentIter { rem: self }
273    }
274}