//! Extension methods for [`Token`] sequences (`harper_core/token_string_ext.rs`).
1use crate::{Span, Token};
2use itertools::Itertools;
3use paste::paste;
4
/// Generates the trait *declarations* for one token-kind predicate.
///
/// For a kind `foo` (tested via `TokenKind::is_foo`), this declares:
/// - `first_foo()` / `last_foo()`: the first/last matching token, if any.
/// - `last_foo_index()`: the index of the last matching token.
/// - `iter_foo_indices()` / `iter_foos()`: iterators over the indices of,
///   or references to, the matching tokens.
///
/// The matching default implementations are produced by `create_fns_for!`.
/// `paste!` is used to splice `$thing` into the method identifiers.
macro_rules! create_decl_for {
    ($thing:ident) => {
        paste! {
            fn [< first_ $thing >](&self) -> Option<&Token>;

            fn [< last_ $thing >](&self) -> Option<&Token>;

            fn [< last_ $thing _index >](&self) -> Option<usize>;

            fn [<iter_ $thing _indices>](&self) -> impl DoubleEndedIterator<Item = usize> + '_;

            fn [<iter_ $thing s>](&self) -> impl Iterator<Item = &Token> + '_;
        }
    };
}
20
/// Generates the implementations for the method family declared by
/// `create_decl_for!`, all driven by the `TokenKind::is_<thing>` predicate.
macro_rules! create_fns_for {
    ($thing:ident) => {
        paste! {
            fn [< first_ $thing >](&self) -> Option<&Token> {
                // The first matching token is just the head of the
                // matching-token iterator.
                self.[<iter_ $thing s>]().next()
            }

            fn [< last_ $thing >](&self) -> Option<&Token> {
                self.iter().rfind(|tok| tok.kind.[<is_ $thing>]())
            }

            fn [< last_ $thing _index >](&self) -> Option<usize> {
                // `rposition` searches from the back but reports the
                // front-relative index directly.
                self.iter().rposition(|tok| tok.kind.[<is_ $thing>]())
            }

            fn [<iter_ $thing _indices>](&self) -> impl DoubleEndedIterator<Item = usize> + '_ {
                self.iter()
                    .enumerate()
                    .filter_map(|(idx, tok)| tok.kind.[<is_ $thing>]().then_some(idx))
            }

            fn [<iter_ $thing s>](&self) -> impl Iterator<Item = &Token> + '_ {
                self.[<iter_ $thing _indices>]().map(|idx| &self[idx])
            }
        }
    };
}
49
/// Implements the sealed-trait pattern: because downstream crates cannot name
/// `private::Sealed`, only the types listed here can ever implement
/// [`TokenStringExt`], letting us add trait methods without a breaking change.
mod private {
    use crate::{Document, Token};

    pub trait Sealed {}

    impl Sealed for [Token] {}

    impl Sealed for Document {}
}
59
60/// Extension methods for [`Token`] sequences that make them easier to wrangle and query.
61pub trait TokenStringExt: private::Sealed {
62    fn first_sentence_word(&self) -> Option<&Token>;
63    fn first_non_whitespace(&self) -> Option<&Token>;
64    /// Grab the span that represents the beginning of the first element and the
65    /// end of the last element.
66    fn span(&self) -> Option<Span<char>>;
67
68    create_decl_for!(adjective);
69    create_decl_for!(apostrophe);
70    create_decl_for!(at);
71    create_decl_for!(comma);
72    create_decl_for!(conjunction);
73    create_decl_for!(chunk_terminator);
74    create_decl_for!(currency);
75    create_decl_for!(ellipsis);
76    create_decl_for!(hostname);
77    create_decl_for!(likely_homograph);
78    create_decl_for!(number);
79    create_decl_for!(noun);
80    create_decl_for!(paragraph_break);
81    create_decl_for!(pipe);
82    create_decl_for!(preposition);
83    create_decl_for!(punctuation);
84    create_decl_for!(quote);
85    create_decl_for!(sentence_terminator);
86    create_decl_for!(space);
87    create_decl_for!(unlintable);
88    create_decl_for!(verb);
89    create_decl_for!(word);
90    create_decl_for!(word_like);
91    create_decl_for!(heading_start);
92
93    /// Get a reference to a token by index, with negative numbers counting from the end.
94    ///
95    /// # Examples
96    /// ```
97    /// # use harper_core::{Token, TokenStringExt, parsers::{Parser, PlainEnglish}};
98    /// # fn main() {
99    /// let source = "The cat sat on the mat.".chars().collect::<Vec<_>>();
100    /// let tokens = PlainEnglish.parse(&source);
101    /// assert_eq!(tokens.get_rel(0).unwrap().span.get_content_string(&source), "The");
102    /// assert_eq!(tokens.get_rel(1).unwrap().kind.is_whitespace(), true);
103    /// assert_eq!(tokens.get_rel(-1).unwrap().kind.is_punctuation(), true);
104    /// assert_eq!(tokens.get_rel(-2).unwrap().span.get_content_string(&source), "mat");
105    /// # }
106    /// ```
107    ///
108    /// # Returns
109    ///
110    /// * `Some(&Token)` - If the index is in bounds
111    /// * `None` - If the index is out of bounds
112    fn get_rel(&self, index: isize) -> Option<&Token>
113    where
114        Self: AsRef<[Token]>,
115    {
116        let slice = self.as_ref();
117        let len = slice.len() as isize;
118
119        if index >= len || -index > len {
120            return None;
121        }
122
123        let idx = if index >= 0 { index } else { len + index } as usize;
124
125        slice.get(idx)
126    }
127
128    /// Get a slice of tokens using relative indices.
129    ///
130    /// # Examples
131    /// ```
132    /// # use harper_core::{Token, TokenStringExt, parsers::{Parser, PlainEnglish}};
133    /// # fn main() {
134    /// let source = "The cat sat on the mat.".chars().collect::<Vec<_>>();
135    /// let tokens = PlainEnglish.parse(&source);
136    /// assert_eq!(tokens.get_rel_slice(0, 2).unwrap().span().unwrap().get_content_string(&source), "The cat");
137    /// assert_eq!(tokens.get_rel_slice(-3, -1).unwrap().span().unwrap().get_content_string(&source), " mat.");
138    /// # }
139    /// ```
140    fn get_rel_slice(&self, rel_start: isize, inclusive_end: isize) -> Option<&[Token]>
141    where
142        Self: AsRef<[Token]>,
143    {
144        let slice = self.as_ref();
145        let len = slice.len() as isize;
146
147        // Convert relative indices to absolute indices
148        let start_idx = if rel_start >= 0 {
149            rel_start
150        } else {
151            len + rel_start
152        } as usize;
153
154        let end_idx_plus_one = if inclusive_end >= 0 {
155            inclusive_end + 1 // +1 to make end exclusive
156        } else {
157            len + inclusive_end + 1
158        } as usize;
159
160        // Check bounds
161        if start_idx >= slice.len()
162            || end_idx_plus_one > slice.len()
163            || start_idx >= end_idx_plus_one
164        {
165            return None;
166        }
167
168        Some(&slice[start_idx..end_idx_plus_one])
169    }
170
171    fn iter_linking_verb_indices(&self) -> impl Iterator<Item = usize> + '_;
172    fn iter_linking_verbs(&self) -> impl Iterator<Item = &Token> + '_;
173
174    /// Iterate over chunks.
175    ///
176    /// For example, the following sentence contains two chunks separated by a
177    /// comma:
178    ///
179    /// ```text
180    /// Here is an example, it is short.
181    /// ```
182    fn iter_chunks(&self) -> impl Iterator<Item = &'_ [Token]> + '_;
183
184    /// Get an iterator over token slices that represent the individual
185    /// paragraphs in a document.
186    fn iter_paragraphs(&self) -> impl Iterator<Item = &'_ [Token]> + '_;
187
188    /// Get an iterator over token slices that represent headings.
189    ///
190    /// A heading begins with a [`TokenKind::HeadingStart`](crate::TokenKind::HeadingStart) token and ends with
191    /// the next [`TokenKind::ParagraphBreak`](crate::TokenKind::ParagraphBreak).
192    fn iter_headings(&self) -> impl Iterator<Item = &'_ [Token]> + '_;
193
194    /// Get an iterator over token slices that represent the individual
195    /// sentences in a document.
196    fn iter_sentences(&self) -> impl Iterator<Item = &'_ [Token]> + '_;
197
198    /// Get an iterator over mutable token slices that represent the individual
199    /// sentences in a document.
200    fn iter_sentences_mut(&mut self) -> impl Iterator<Item = &'_ mut [Token]> + '_;
201}
202
203impl TokenStringExt for [Token] {
204    create_fns_for!(adjective);
205    create_fns_for!(apostrophe);
206    create_fns_for!(at);
207    create_fns_for!(chunk_terminator);
208    create_fns_for!(comma);
209    create_fns_for!(conjunction);
210    create_fns_for!(currency);
211    create_fns_for!(ellipsis);
212    create_fns_for!(hostname);
213    create_fns_for!(likely_homograph);
214    create_fns_for!(noun);
215    create_fns_for!(number);
216    create_fns_for!(paragraph_break);
217    create_fns_for!(pipe);
218    create_fns_for!(preposition);
219    create_fns_for!(punctuation);
220    create_fns_for!(quote);
221    create_fns_for!(sentence_terminator);
222    create_fns_for!(space);
223    create_fns_for!(unlintable);
224    create_fns_for!(verb);
225    create_fns_for!(word_like);
226    create_fns_for!(word);
227    create_fns_for!(heading_start);
228
229    fn first_non_whitespace(&self) -> Option<&Token> {
230        self.iter().find(|t| !t.kind.is_whitespace())
231    }
232
233    fn first_sentence_word(&self) -> Option<&Token> {
234        let (w_idx, word) = self.iter().find_position(|v| v.kind.is_word())?;
235
236        let Some(u_idx) = self.iter().position(|v| v.kind.is_unlintable()) else {
237            return Some(word);
238        };
239
240        if w_idx < u_idx { Some(word) } else { None }
241    }
242
243    fn span(&self) -> Option<Span<char>> {
244        let min_max = self
245            .iter()
246            .flat_map(|v| [v.span.start, v.span.end].into_iter())
247            .minmax();
248
249        match min_max {
250            itertools::MinMaxResult::NoElements => None,
251            itertools::MinMaxResult::OneElement(min) => Some(Span::new(min, min)),
252            itertools::MinMaxResult::MinMax(min, max) => Some(Span::new(min, max)),
253        }
254    }
255
256    fn iter_linking_verb_indices(&self) -> impl Iterator<Item = usize> + '_ {
257        self.iter_word_indices().filter(|idx| {
258            let word = &self[*idx];
259            let Some(Some(meta)) = word.kind.as_word() else {
260                return false;
261            };
262
263            meta.is_linking_verb()
264        })
265    }
266
267    fn iter_linking_verbs(&self) -> impl Iterator<Item = &Token> + '_ {
268        self.iter_linking_verb_indices().map(|idx| &self[idx])
269    }
270
271    fn iter_chunks(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
272        self.split_inclusive(|tok| tok.kind.is_chunk_terminator())
273    }
274
275    fn iter_paragraphs(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
276        self.split_inclusive(|tok| tok.kind.is_paragraph_break())
277    }
278
279    fn iter_headings(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
280        self.iter_heading_start_indices().map(|start| {
281            let end = self[start..]
282                .iter()
283                .position(|t| t.kind.is_paragraph_break())
284                .unwrap_or(self[start..].len() - 1);
285
286            &self[start..=start + end]
287        })
288    }
289
290    fn iter_sentences(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
291        self.split_inclusive(|token| token.kind.is_sentence_terminator())
292    }
293
294    fn iter_sentences_mut(&mut self) -> impl Iterator<Item = &mut [Token]> + '_ {
295        struct SentIter<'a> {
296            rem: &'a mut [Token],
297        }
298
299        impl<'a> Iterator for SentIter<'a> {
300            type Item = &'a mut [Token];
301
302            fn next(&mut self) -> Option<Self::Item> {
303                if self.rem.is_empty() {
304                    return None;
305                }
306                let split = self
307                    .rem
308                    .iter()
309                    .position(|t| t.kind.is_sentence_terminator())
310                    .map(|i| i + 1)
311                    .unwrap_or(self.rem.len());
312                let tmp = core::mem::take(&mut self.rem);
313                let (sent, rest) = tmp.split_at_mut(split);
314                self.rem = rest;
315                Some(sent)
316            }
317        }
318
319        SentIter { rem: self }
320    }
321}