harper_core/
token_string_ext.rs

1use crate::{Span, Token, TokenKind};
2use itertools::Itertools;
3use paste::paste;
4
// Declares the per-kind query methods of `TokenStringExt`.
//
// Given a kind name such as `word`, this expands to the signatures
// `first_word`, `last_word`, `last_word_index`, `iter_word_indices` and
// `iter_words`. The matching bodies are generated by `create_fns_for!`, which
// selects tokens with the `is_<kind>` predicate on the token's `kind`.
macro_rules! create_decl_for {
    ($kind:ident) => {
        paste! {
            /// Get the first token of this kind, if there is one.
            fn [<first_ $kind>](&self) -> Option<Token>;

            /// Get the last token of this kind, if there is one.
            fn [<last_ $kind>](&self) -> Option<Token>;

            /// Get the index of the last token of this kind, if there is one.
            fn [<last_ $kind _index>](&self) -> Option<usize>;

            /// Iterate over the indices of all tokens of this kind, in order.
            fn [<iter_ $kind _indices>](&self) -> impl Iterator<Item = usize> + '_;

            /// Iterate over copies of all tokens of this kind, in order.
            fn [<iter_ $kind s>](&self) -> impl Iterator<Item = Token> + '_;
        }
    };
}
20
// Generates the bodies of the per-kind query methods declared by
// `create_decl_for!`.
//
// For a kind name such as `word`, every generated method selects tokens with
// the `is_word` predicate on the token's `kind`. The methods are meant to be
// expanded inside an `impl` for a slice of `Token`s, since they rely on
// `self.iter()` yielding `&Token`.
macro_rules! create_fns_for {
    ($thing:ident) => {
        paste! {
            // First matching token, scanning from the front.
            fn [< first_ $thing >](&self) -> Option<Token> {
                self.iter().find(|tok| tok.kind.[<is_ $thing>]()).copied()
            }

            // Last matching token, scanning from the back.
            fn [< last_ $thing >](&self) -> Option<Token> {
                self.iter().rfind(|tok| tok.kind.[<is_ $thing>]()).copied()
            }

            // Index of the last matching token. `rposition` searches from the
            // back but reports the index counted from the front, so no manual
            // `len - i - 1` arithmetic is needed.
            fn [< last_ $thing _index >](&self) -> Option<usize> {
                self.iter().rposition(|tok| tok.kind.[<is_ $thing>]())
            }

            // Indices of all matching tokens, in ascending order.
            fn [<iter_ $thing _indices>](&self) -> impl Iterator<Item = usize> + '_ {
                self.iter()
                    .enumerate()
                    .filter_map(|(idx, tok)| tok.kind.[<is_ $thing>]().then_some(idx))
            }

            // Copies of all matching tokens, in order. Filtering the slice
            // directly avoids re-indexing (and bounds-checking) every hit.
            fn [<iter_ $thing s>](&self) -> impl Iterator<Item = Token> + '_ {
                self.iter().filter(|tok| tok.kind.[<is_ $thing>]()).copied()
            }
        }
    };
}
49
50/// Extension methods for [`Token`] sequences that make them easier to wrangle and query.
51pub trait TokenStringExt {
52    fn first_sentence_word(&self) -> Option<Token>;
53    fn first_non_whitespace(&self) -> Option<Token>;
54    /// Grab the span that represents the beginning of the first element and the
55    /// end of the last element.
56    fn span(&self) -> Option<Span>;
57
58    create_decl_for!(word);
59    create_decl_for!(word_like);
60    create_decl_for!(conjunction);
61    create_decl_for!(space);
62    create_decl_for!(apostrophe);
63    create_decl_for!(pipe);
64    create_decl_for!(quote);
65    create_decl_for!(number);
66    create_decl_for!(at);
67    create_decl_for!(ellipsis);
68    create_decl_for!(unlintable);
69    create_decl_for!(sentence_terminator);
70    create_decl_for!(paragraph_break);
71    create_decl_for!(chunk_terminator);
72    create_decl_for!(punctuation);
73    create_decl_for!(currency);
74    create_decl_for!(likely_homograph);
75    create_decl_for!(comma);
76
77    fn iter_linking_verb_indices(&self) -> impl Iterator<Item = usize> + '_;
78    fn iter_linking_verbs(&self) -> impl Iterator<Item = Token> + '_;
79
80    /// Iterate over chunks.
81    ///
82    /// For example, the following sentence contains two chunks separated by a
83    /// comma:
84    ///
85    /// ```text
86    /// Here is an example, it is short.
87    /// ```
88    fn iter_chunks(&self) -> impl Iterator<Item = &'_ [Token]> + '_;
89
90    /// Get an iterator over token slices that represent the individual
91    /// paragraphs in a document.
92    fn iter_paragraphs(&self) -> impl Iterator<Item = &'_ [Token]> + '_;
93
94    /// Get an iterator over token slices that represent the individual
95    /// sentences in a document.
96    fn iter_sentences(&self) -> impl Iterator<Item = &'_ [Token]> + '_;
97}
98
99impl TokenStringExt for [Token] {
100    create_fns_for!(word);
101    create_fns_for!(word_like);
102    create_fns_for!(conjunction);
103    create_fns_for!(space);
104    create_fns_for!(apostrophe);
105    create_fns_for!(pipe);
106    create_fns_for!(quote);
107    create_fns_for!(number);
108    create_fns_for!(at);
109    create_fns_for!(punctuation);
110    create_fns_for!(ellipsis);
111    create_fns_for!(unlintable);
112    create_fns_for!(sentence_terminator);
113    create_fns_for!(paragraph_break);
114    create_fns_for!(chunk_terminator);
115    create_fns_for!(currency);
116    create_fns_for!(likely_homograph);
117    create_fns_for!(comma);
118
119    fn first_non_whitespace(&self) -> Option<Token> {
120        self.iter().find(|t| !t.kind.is_whitespace()).copied()
121    }
122
123    fn first_sentence_word(&self) -> Option<Token> {
124        let (w_idx, word) = self.iter().find_position(|v| v.kind.is_word())?;
125
126        let Some(u_idx) = self.iter().position(|v| v.kind.is_unlintable()) else {
127            return Some(*word);
128        };
129
130        if w_idx < u_idx {
131            Some(*word)
132        } else {
133            None
134        }
135    }
136
137    fn span(&self) -> Option<Span> {
138        let min_max = self
139            .iter()
140            .flat_map(|v| [v.span.start, v.span.end].into_iter())
141            .minmax();
142
143        match min_max {
144            itertools::MinMaxResult::NoElements => None,
145            itertools::MinMaxResult::OneElement(min) => Some(Span::new(min, min)),
146            itertools::MinMaxResult::MinMax(min, max) => Some(Span::new(min, max)),
147        }
148    }
149
150    fn iter_linking_verb_indices(&self) -> impl Iterator<Item = usize> + '_ {
151        self.iter_word_indices().filter(|idx| {
152            let word = self[*idx];
153            let TokenKind::Word(word) = word.kind else {
154                panic!("Should be unreachable.");
155            };
156
157            word.is_linking_verb()
158        })
159    }
160
161    fn iter_linking_verbs(&self) -> impl Iterator<Item = Token> + '_ {
162        self.iter_linking_verb_indices().map(|idx| self[idx])
163    }
164
165    fn iter_chunks(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
166        let first_chunk = self
167            .iter_chunk_terminator_indices()
168            .next()
169            .map(|first_term| &self[0..=first_term]);
170
171        let rest = self
172            .iter_chunk_terminator_indices()
173            .tuple_windows()
174            .map(move |(a, b)| &self[a + 1..=b]);
175
176        let last = if let Some(last_i) = self.last_chunk_terminator_index() {
177            if last_i + 1 < self.len() {
178                Some(&self[last_i + 1..])
179            } else {
180                None
181            }
182        } else {
183            Some(self)
184        };
185
186        first_chunk.into_iter().chain(rest).chain(last)
187    }
188
189    fn iter_paragraphs(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
190        let first_pg = self
191            .iter_paragraph_break_indices()
192            .next()
193            .map(|first_term| &self[0..=first_term]);
194
195        let rest = self
196            .iter_paragraph_break_indices()
197            .tuple_windows()
198            .map(move |(a, b)| &self[a + 1..=b]);
199
200        let last_pg = if let Some(last_i) = self.last_paragraph_break_index() {
201            if last_i + 1 < self.len() {
202                Some(&self[last_i + 1..])
203            } else {
204                None
205            }
206        } else {
207            Some(self)
208        };
209
210        first_pg.into_iter().chain(rest).chain(last_pg)
211    }
212
213    fn iter_sentences(&self) -> impl Iterator<Item = &'_ [Token]> + '_ {
214        let first_sentence = self
215            .iter_sentence_terminator_indices()
216            .next()
217            .map(|first_term| &self[0..=first_term]);
218
219        let rest = self
220            .iter_sentence_terminator_indices()
221            .tuple_windows()
222            .map(move |(a, b)| &self[a + 1..=b]);
223
224        let last_sentence = if let Some(last_i) = self.last_sentence_terminator_index() {
225            if last_i + 1 < self.len() {
226                Some(&self[last_i + 1..])
227            } else {
228                None
229            }
230        } else {
231            Some(self)
232        };
233
234        first_sentence.into_iter().chain(rest).chain(last_sentence)
235    }
236}