Skip to main content

harper_core/
token_string_ext.rs

1use crate::{Span, Token};
2use itertools::Itertools;
3use paste::paste;
4
/// Generates the per-kind query helpers used inside `TokenStringExt`.
///
/// Invoking `create_fns_for!(word)` expands to five default methods:
/// `first_word`, `last_word`, `last_word_index`, `iter_word_indices` and
/// `iter_words`. Each one walks `self.tokens()` and tests every token with
/// `kind.is_word()`; the same pattern applies to any other identifier passed in.
macro_rules! create_fns_for {
    ($thing:ident) => {
        paste! {
            // Earliest matching token, if any.
            fn [< first_ $thing >](&self) -> Option<&Token> {
                self.[<iter_ $thing s>]().next()
            }

            // Latest matching token, found by scanning from the back.
            fn [< last_ $thing >](&self) -> Option<&Token> {
                self.tokens().iter().rfind(|tok| tok.kind.[<is_ $thing>]())
            }

            // Index (counted from the front) of the latest matching token.
            fn [< last_ $thing _index >](&self) -> Option<usize> {
                self.tokens()
                    .iter()
                    .rposition(|tok| tok.kind.[<is_ $thing>]())
            }

            // Indices of every matching token, in ascending order.
            fn [<iter_ $thing _indices>](&self) -> impl DoubleEndedIterator<Item = usize> + '_ {
                self.tokens()
                    .iter()
                    .enumerate()
                    .filter_map(|(idx, tok)| tok.kind.[<is_ $thing>]().then_some(idx))
            }

            // Every matching token, in the order it appears in the sequence.
            fn [<iter_ $thing s>](&self) -> impl Iterator<Item = &Token> + '_ {
                self.tokens()
                    .iter()
                    .filter(|tok| tok.kind.[<is_ $thing>]())
            }
        }
    };
}
37
38mod private {
39    use crate::{Document, Token};
40
41    pub trait Sealed {}
42
43    impl Sealed for [Token] {}
44
45    impl Sealed for Document {}
46}
47
48/// Extension methods for [`Token`] sequences that make them easier to wrangle and query.
49pub trait TokenStringExt: private::Sealed {
50    // Used by the default implementations.
51    fn tokens(&self) -> &[Token];
52
53    // Used by the default implementations.
54    fn tokens_mut(&mut self) -> &mut [Token];
55
56    create_fns_for!(adjective);
57    create_fns_for!(apostrophe);
58    create_fns_for!(at);
59    create_fns_for!(comma);
60    create_fns_for!(conjunction);
61    create_fns_for!(chunk_terminator);
62    create_fns_for!(currency);
63    create_fns_for!(ellipsis);
64    create_fns_for!(hostname);
65    create_fns_for!(likely_homograph);
66    create_fns_for!(number);
67    create_fns_for!(noun);
68    create_fns_for!(paragraph_break);
69    create_fns_for!(pipe);
70    create_fns_for!(preposition);
71    create_fns_for!(punctuation);
72    create_fns_for!(quote);
73    create_fns_for!(sentence_terminator);
74    create_fns_for!(space);
75    create_fns_for!(unlintable);
76    create_fns_for!(verb);
77    create_fns_for!(word);
78    create_fns_for!(word_like);
79    create_fns_for!(heading_start);
80
81    fn first_sentence_word(&self) -> Option<&Token> {
82        let tokens = self.tokens();
83
84        let (w_idx, word) = tokens.iter().find_position(|v| v.kind.is_word())?;
85
86        let Some(u_idx) = tokens.iter().position(|v| v.kind.is_unlintable()) else {
87            return Some(word);
88        };
89
90        if w_idx < u_idx { Some(word) } else { None }
91    }
92
93    fn first_non_whitespace(&self) -> Option<&Token> {
94        self.tokens().iter().find(|t| !t.kind.is_whitespace())
95    }
96
97    /// Grab the span that represents the beginning of the first element and the
98    /// end of the last element.
99    fn span(&self) -> Option<Span<char>> {
100        let min_max = self
101            .tokens()
102            .iter()
103            .flat_map(|v| [v.span.start, v.span.end].into_iter())
104            .minmax();
105
106        match min_max {
107            itertools::MinMaxResult::NoElements => None,
108            itertools::MinMaxResult::OneElement(min) => Some(Span::new(min, min)),
109            itertools::MinMaxResult::MinMax(min, max) => Some(Span::new(min, max)),
110        }
111    }
112
113    /// Get a reference to a token by index, with negative numbers counting from the end.
114    ///
115    /// # Examples
116    /// ```
117    /// # use harper_core::{Token, TokenStringExt, parsers::{Parser, PlainEnglish}};
118    /// # fn main() {
119    /// let source = "The cat sat on the mat.".chars().collect::<Vec<_>>();
120    /// let tokens = PlainEnglish.parse(&source);
121    /// assert_eq!(tokens.get_rel(0).unwrap().get_str(&source), "The");
122    /// assert_eq!(tokens.get_rel(1).unwrap().kind.is_whitespace(), true);
123    /// assert_eq!(tokens.get_rel(-1).unwrap().kind.is_punctuation(), true);
124    /// assert_eq!(tokens.get_rel(-2).unwrap().get_str(&source), "mat");
125    /// # }
126    /// ```
127    ///
128    /// # Returns
129    ///
130    /// * `Some(&Token)` - If the index is in bounds
131    /// * `None` - If the index is out of bounds
132    fn get_rel(&self, index: isize) -> Option<&Token>
133    where
134        Self: AsRef<[Token]>,
135    {
136        let slice = self.as_ref();
137        let len = slice.len() as isize;
138
139        if index >= len || -index > len {
140            return None;
141        }
142
143        let idx = if index >= 0 { index } else { len + index } as usize;
144
145        slice.get(idx)
146    }
147
148    /// Get a slice of tokens using relative indices.
149    ///
150    /// # Examples
151    /// ```
152    /// # use harper_core::{Token, TokenStringExt, parsers::{Parser, PlainEnglish}};
153    /// # fn main() {
154    /// let source = "The cat sat on the mat.".chars().collect::<Vec<_>>();
155    /// let tokens = PlainEnglish.parse(&source);
156    /// assert_eq!(tokens.get_rel_slice(0, 2).unwrap().span().unwrap().get_content_string(&source), "The cat");
157    /// assert_eq!(tokens.get_rel_slice(-3, -1).unwrap().span().unwrap().get_content_string(&source), " mat.");
158    /// # }
159    /// ```
160    fn get_rel_slice(&self, rel_start: isize, inclusive_end: isize) -> Option<&[Token]>
161    where
162        Self: AsRef<[Token]>,
163    {
164        let slice = self.as_ref();
165        let len = slice.len() as isize;
166
167        // Convert relative indices to absolute indices
168        let start_idx = if rel_start >= 0 {
169            rel_start
170        } else {
171            len + rel_start
172        } as usize;
173
174        let end_idx_plus_one = if inclusive_end >= 0 {
175            inclusive_end + 1 // +1 to make end exclusive
176        } else {
177            len + inclusive_end + 1
178        } as usize;
179
180        // Check bounds
181        if start_idx >= slice.len()
182            || end_idx_plus_one > slice.len()
183            || start_idx >= end_idx_plus_one
184        {
185            return None;
186        }
187
188        Some(&slice[start_idx..end_idx_plus_one])
189    }
190
191    // delegate to span
192    fn get_ch<'a>(&self, src: &'a [char]) -> Option<&'a [char]> {
193        self.span().map(|s| s.get_content(src))
194    }
195
196    fn get_str(&self, src: &[char]) -> Option<String> {
197        self.span().map(|s| s.get_content_string(src))
198    }
199
200    fn iter_linking_verb_indices(&self) -> impl Iterator<Item = usize> + '_ {
201        let tokens = self.tokens();
202
203        tokens.iter_word_indices().filter(|idx| {
204            let word = &tokens[*idx];
205            let Some(Some(meta)) = word.kind.as_word() else {
206                return false;
207            };
208
209            meta.is_linking_verb()
210        })
211    }
212
213    fn iter_linking_verbs(&self) -> impl Iterator<Item = &Token> + '_ {
214        let tokens = self.tokens();
215
216        tokens.iter_linking_verb_indices().map(|idx| &tokens[idx])
217    }
218
219    /// Iterate over chunks.
220    ///
221    /// For example, the following sentence contains two chunks separated by a
222    /// comma:
223    ///
224    /// ```text
225    /// Here is an example, it is short.
226    /// ```
227    fn iter_chunks(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
228        self.tokens()
229            .split_inclusive(|tok| tok.kind.is_chunk_terminator())
230    }
231
232    /// Get an iterator over token slices that represent the individual
233    /// paragraphs in a document.
234    fn iter_paragraphs(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
235        self.tokens()
236            .split_inclusive(|tok| tok.kind.is_paragraph_break())
237    }
238
239    /// Get an iterator over token slices that represent headings.
240    ///
241    /// A heading begins with a [`TokenKind::HeadingStart`](crate::TokenKind::HeadingStart) token and ends with
242    /// the next [`TokenKind::ParagraphBreak`](crate::TokenKind::ParagraphBreak).
243    fn iter_headings(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
244        let tokens = self.tokens();
245
246        tokens.iter_heading_start_indices().map(|start| {
247            let end = tokens[start..]
248                .iter()
249                .position(|t| t.kind.is_paragraph_break())
250                .unwrap_or(tokens[start..].len() - 1);
251
252            &tokens[start..=start + end]
253        })
254    }
255
256    /// Get an iterator over token slices that represent the individual
257    /// sentences in a document.
258    fn iter_sentences(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
259        self.tokens()
260            .split_inclusive(|tok| tok.kind.is_sentence_terminator())
261    }
262
263    /// Get an iterator over mutable token slices that represent the individual
264    /// sentences in a document.
265    fn iter_sentences_mut(&mut self) -> impl Iterator<Item = &'_ mut [Token]> + '_ {
266        struct SentIter<'a> {
267            rem: &'a mut [Token],
268        }
269
270        impl<'a> Iterator for SentIter<'a> {
271            type Item = &'a mut [Token];
272
273            fn next(&mut self) -> Option<Self::Item> {
274                if self.rem.is_empty() {
275                    return None;
276                }
277                let split = self
278                    .rem
279                    .iter()
280                    .position(|t| t.kind.is_sentence_terminator())
281                    .map(|i| i + 1)
282                    .unwrap_or(self.rem.len());
283                let tmp = core::mem::take(&mut self.rem);
284                let (sent, rest) = tmp.split_at_mut(split);
285                self.rem = rest;
286                Some(sent)
287            }
288        }
289
290        let tokens = self.tokens_mut();
291
292        SentIter { rem: tokens }
293    }
294}
295
296impl TokenStringExt for [Token] {
297    fn tokens(&self) -> &[Token] {
298        self
299    }
300
301    fn tokens_mut(&mut self) -> &mut [Token] {
302        self
303    }
304}