// harper_core/token_string_ext.rs

1use crate::{Span, Token};
2use itertools::Itertools;
3use paste::paste;
4
// Generates the trait-level *declarations* for querying tokens of a given
// kind: `first_*`, `last_*`, `last_*_index`, `iter_*_indices`, and `iter_*s`.
// Uses `paste` to splice `$thing` into the method names; each declaration is
// paired with an implementation emitted by `create_fns_for!` below.
macro_rules! create_decl_for {
    ($thing:ident) => {
        paste! {
            // First token of this kind, if any.
            fn [< first_ $thing >](&self) -> Option<&Token>;

            // Last token of this kind, if any.
            fn [< last_ $thing >](&self) -> Option<&Token>;

            // Index of the last token of this kind, if any.
            fn [< last_ $thing _index >](&self) -> Option<usize>;

            // Indices of every token of this kind, front to back.
            fn [<iter_ $thing _indices>](&self) -> impl DoubleEndedIterator<Item = usize> + '_;

            // Every token of this kind, front to back.
            fn [<iter_ $thing s>](&self) -> impl Iterator<Item = &Token> + '_;
        }
    };
}
20
// Generates the *implementations* matching the declarations produced by
// `create_decl_for!`. Relies on a corresponding `is_$thing()` predicate
// existing on the token's `kind`.
macro_rules! create_fns_for {
    ($thing:ident) => {
        paste! {
            fn [< first_ $thing >](&self) -> Option<&Token> {
                self.iter().find(|v| v.kind.[<is_ $thing>]())
            }

            fn [< last_ $thing >](&self) -> Option<&Token> {
                self.iter().rev().find(|v| v.kind.[<is_ $thing>]())
            }

            fn [< last_ $thing _index >](&self) -> Option<usize> {
                // `position` on the reversed iterator counts from the back,
                // so convert it back to a front-based index.
                self.iter().rev().position(|v| v.kind.[<is_ $thing>]()).map(|i| self.len() - i - 1)
            }

            fn [<iter_ $thing _indices>](&self) -> impl DoubleEndedIterator<Item = usize> + '_ {
                self.iter()
                    .enumerate()
                    .filter(|(_, t)| t.kind.[<is_ $thing>]())
                    .map(|(i, _)| i)
            }

            fn [<iter_ $thing s>](&self) -> impl Iterator<Item = &Token> + '_ {
                self.[<iter_ $thing _indices>]().map(|i| &self[i])
            }
        }
    };
}
49
// The "sealed trait" pattern: `TokenStringExt` requires `private::Sealed`,
// and since downstream crates cannot name this private module, only the
// types listed here ([`Token`] slices and [`Document`]) may implement it.
mod private {
    use crate::{Document, Token};

    pub trait Sealed {}

    impl Sealed for [Token] {}

    impl Sealed for Document {}
}
59
60/// Extension methods for [`Token`] sequences that make them easier to wrangle and query.
61pub trait TokenStringExt: private::Sealed {
62    fn first_sentence_word(&self) -> Option<&Token>;
63    fn first_non_whitespace(&self) -> Option<&Token>;
64    /// Grab the span that represents the beginning of the first element and the
65    /// end of the last element.
66    fn span(&self) -> Option<Span<char>>;
67
68    create_decl_for!(adjective);
69    create_decl_for!(apostrophe);
70    create_decl_for!(at);
71    create_decl_for!(comma);
72    create_decl_for!(conjunction);
73    create_decl_for!(chunk_terminator);
74    create_decl_for!(currency);
75    create_decl_for!(ellipsis);
76    create_decl_for!(hostname);
77    create_decl_for!(likely_homograph);
78    create_decl_for!(number);
79    create_decl_for!(noun);
80    create_decl_for!(paragraph_break);
81    create_decl_for!(pipe);
82    create_decl_for!(preposition);
83    create_decl_for!(punctuation);
84    create_decl_for!(quote);
85    create_decl_for!(sentence_terminator);
86    create_decl_for!(space);
87    create_decl_for!(unlintable);
88    create_decl_for!(verb);
89    create_decl_for!(word);
90    create_decl_for!(word_like);
91    create_decl_for!(heading_start);
92
93    /// Get a reference to a token by index, with negative numbers counting from the end.
94    ///
95    /// # Examples
96    /// ```
97    /// # use harper_core::{Token, TokenStringExt, parsers::{Parser, PlainEnglish}};
98    /// # fn main() {
99    /// let source = "The cat sat on the mat.".chars().collect::<Vec<_>>();
100    /// let tokens = PlainEnglish.parse(&source);
101    /// assert_eq!(tokens.get_rel(0).unwrap().span.get_content_string(&source), "The");
102    /// assert_eq!(tokens.get_rel(1).unwrap().kind.is_whitespace(), true);
103    /// assert_eq!(tokens.get_rel(-1).unwrap().kind.is_punctuation(), true);
104    /// assert_eq!(tokens.get_rel(-2).unwrap().span.get_content_string(&source), "mat");
105    /// # }
106    /// ```
107    ///
108    /// # Returns
109    ///
110    /// * `Some(&Token)` - If the index is in bounds
111    /// * `None` - If the index is out of bounds
112    fn get_rel(&self, index: isize) -> Option<&Token>
113    where
114        Self: AsRef<[Token]>,
115    {
116        let slice = self.as_ref();
117        let len = slice.len() as isize;
118
119        if index >= len || -index > len {
120            return None;
121        }
122
123        let idx = if index >= 0 { index } else { len + index } as usize;
124
125        slice.get(idx)
126    }
127
128    fn iter_linking_verb_indices(&self) -> impl Iterator<Item = usize> + '_;
129    fn iter_linking_verbs(&self) -> impl Iterator<Item = &Token> + '_;
130
131    /// Iterate over chunks.
132    ///
133    /// For example, the following sentence contains two chunks separated by a
134    /// comma:
135    ///
136    /// ```text
137    /// Here is an example, it is short.
138    /// ```
139    fn iter_chunks(&self) -> impl Iterator<Item = &'_ [Token]> + '_;
140
141    /// Get an iterator over token slices that represent the individual
142    /// paragraphs in a document.
143    fn iter_paragraphs(&self) -> impl Iterator<Item = &'_ [Token]> + '_;
144
145    /// Get an iterator over token slices that represent headings.
146    ///
147    /// A heading begins with a [`TokenKind::HeadingStart`](crate::TokenKind::HeadingStart) token and ends with
148    /// the next [`TokenKind::ParagraphBreak`](crate::TokenKind::ParagraphBreak).
149    fn iter_headings(&self) -> impl Iterator<Item = &'_ [Token]> + '_;
150
151    /// Get an iterator over token slices that represent the individual
152    /// sentences in a document.
153    fn iter_sentences(&self) -> impl Iterator<Item = &'_ [Token]> + '_;
154
155    /// Get an iterator over mutable token slices that represent the individual
156    /// sentences in a document.
157    fn iter_sentences_mut(&mut self) -> impl Iterator<Item = &'_ mut [Token]> + '_;
158}
159
impl TokenStringExt for [Token] {
    create_fns_for!(adjective);
    create_fns_for!(apostrophe);
    create_fns_for!(at);
    create_fns_for!(chunk_terminator);
    create_fns_for!(comma);
    create_fns_for!(conjunction);
    create_fns_for!(currency);
    create_fns_for!(ellipsis);
    create_fns_for!(hostname);
    create_fns_for!(likely_homograph);
    create_fns_for!(noun);
    create_fns_for!(number);
    create_fns_for!(paragraph_break);
    create_fns_for!(pipe);
    create_fns_for!(preposition);
    create_fns_for!(punctuation);
    create_fns_for!(quote);
    create_fns_for!(sentence_terminator);
    create_fns_for!(space);
    create_fns_for!(unlintable);
    create_fns_for!(verb);
    create_fns_for!(word_like);
    create_fns_for!(word);
    create_fns_for!(heading_start);

    fn first_non_whitespace(&self) -> Option<&Token> {
        self.iter().find(|t| !t.kind.is_whitespace())
    }

    fn first_sentence_word(&self) -> Option<&Token> {
        // Locate the first word token; bail out with `None` if there is none.
        let (w_idx, word) = self.iter().find_position(|v| v.kind.is_word())?;

        // No unlintable token anywhere: the word qualifies unconditionally.
        let Some(u_idx) = self.iter().position(|v| v.kind.is_unlintable()) else {
            return Some(word);
        };

        // Only yield the word when it appears before the first unlintable token.
        if w_idx < u_idx { Some(word) } else { None }
    }

    fn span(&self) -> Option<Span<char>> {
        // Collapse all token boundaries into a single min/max pass; this also
        // covers tokens whose spans are out of order or overlapping.
        let min_max = self
            .iter()
            .flat_map(|v| [v.span.start, v.span.end].into_iter())
            .minmax();

        match min_max {
            itertools::MinMaxResult::NoElements => None,
            // A single boundary value yields an empty span at that position.
            itertools::MinMaxResult::OneElement(min) => Some(Span::new(min, min)),
            itertools::MinMaxResult::MinMax(min, max) => Some(Span::new(min, max)),
        }
    }

    fn iter_linking_verb_indices(&self) -> impl Iterator<Item = usize> + '_ {
        self.iter_word_indices().filter(|idx| {
            let word = &self[*idx];
            // Words without attached metadata cannot be classified, so they
            // are skipped rather than treated as linking verbs.
            let Some(Some(meta)) = word.kind.as_word() else {
                return false;
            };

            meta.is_linking_verb()
        })
    }

    fn iter_linking_verbs(&self) -> impl Iterator<Item = &Token> + '_ {
        self.iter_linking_verb_indices().map(|idx| &self[idx])
    }

    fn iter_chunks(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        // `split_inclusive` keeps the terminator as the last token of each chunk.
        self.split_inclusive(|tok| tok.kind.is_chunk_terminator())
    }

    fn iter_paragraphs(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        // Each yielded slice ends with its paragraph break (if one exists).
        self.split_inclusive(|tok| tok.kind.is_paragraph_break())
    }

    fn iter_headings(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        self.iter_heading_start_indices().map(|start| {
            // `end` is the offset (relative to `start`) of the terminating
            // paragraph break, or of the final token when the heading runs to
            // the end of the document. The terminator is included in the slice.
            let end = self[start..]
                .iter()
                .position(|t| t.kind.is_paragraph_break())
                .unwrap_or(self[start..].len() - 1);

            &self[start..=start + end]
        })
    }

    fn iter_sentences(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
        // Each yielded slice ends with its sentence terminator (if one exists).
        self.split_inclusive(|token| token.kind.is_sentence_terminator())
    }

    fn iter_sentences_mut(&mut self) -> impl Iterator<Item = &mut [Token]> + '_ {
        // Hand-rolled mutable equivalent of `split_inclusive`: each call to
        // `next` carves the next sentence off the front of the remaining slice.
        struct SentIter<'a> {
            rem: &'a mut [Token],
        }

        impl<'a> Iterator for SentIter<'a> {
            type Item = &'a mut [Token];

            fn next(&mut self) -> Option<Self::Item> {
                if self.rem.is_empty() {
                    return None;
                }
                // Cut just *after* the terminator so it stays with its
                // sentence; with no terminator, take everything left.
                let split = self
                    .rem
                    .iter()
                    .position(|t| t.kind.is_sentence_terminator())
                    .map(|i| i + 1)
                    .unwrap_or(self.rem.len());
                // `mem::take` moves the slice out so it can be re-split with
                // lifetime 'a, sidestepping a reborrow of `self.rem`.
                let tmp = core::mem::take(&mut self.rem);
                let (sent, rest) = tmp.split_at_mut(split);
                self.rem = rest;
                Some(sent)
            }
        }

        SentIter { rem: self }
    }
}