// use_token — crate root (lib.rs)
1#![forbid(unsafe_code)]
2#![doc = include_str!("../README.md")]
3
4/// A token with its kind and byte span.
5#[derive(Clone, Debug, Eq, PartialEq)]
6pub struct Token {
7    /// The token classification.
8    pub kind: TokenKind,
9    /// The owned token text.
10    pub text: String,
11    /// The token span in the original input.
12    pub span: TokenSpan,
13}
14
15impl Token {
16    fn new(kind: TokenKind, text: String, start: usize, end: usize) -> Self {
17        Self {
18            kind,
19            text,
20            span: TokenSpan { start, end },
21        }
22    }
23}
24
/// The category assigned to a token by the tokenizer that produced it.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum TokenKind {
    /// A run of non-whitespace, as produced by whitespace splitting.
    Text,
    /// A conservatively detected word.
    Word,
    /// A conservatively detected sentence.
    Sentence,
    /// One Unicode scalar value.
    Char,
}
37
/// A half-open byte range (`start..end`) into the original input string.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct TokenSpan {
    /// First byte of the token (inclusive).
    pub start: usize,
    /// One past the last byte of the token (exclusive).
    pub end: usize,
}
46
/// Configuration knobs reserved for future tokenizer extensions.
/// Currently unused by the tokenizer functions in this crate.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct TokenizerOptions {
    /// Drop tokens whose text is empty.
    pub trim_empty: bool,
    /// Keep surrounding whitespace in higher-level flows.
    pub include_whitespace: bool,
}
55
56impl Default for TokenizerOptions {
57    fn default() -> Self {
58        Self {
59            trim_empty: true,
60            include_whitespace: false,
61        }
62    }
63}
64
65/// Splits input on contiguous whitespace.
66pub fn tokenize_whitespace(input: &str) -> Vec<Token> {
67    let mut tokens = Vec::new();
68    let mut start = None;
69
70    for (index, character) in input.char_indices() {
71        if character.is_whitespace() {
72            if let Some(token_start) = start.take() {
73                tokens.push(Token::new(
74                    TokenKind::Text,
75                    input[token_start..index].to_owned(),
76                    token_start,
77                    index,
78                ));
79            }
80        } else if start.is_none() {
81            start = Some(index);
82        }
83    }
84
85    if let Some(token_start) = start {
86        tokens.push(Token::new(
87            TokenKind::Text,
88            input[token_start..].to_owned(),
89            token_start,
90            input.len(),
91        ));
92    }
93
94    tokens
95}
96
97/// Extracts conservative word tokens.
98pub fn tokenize_words(input: &str) -> Vec<Token> {
99    word_ranges(input)
100        .into_iter()
101        .map(|(start, end)| Token::new(TokenKind::Word, input[start..end].to_owned(), start, end))
102        .collect()
103}
104
/// Extracts conservative sentence tokens.
///
/// A sentence ends at `.`, `!`, or `?` — plus any immediately following
/// terminators, quotes, or closing brackets — but only when the next
/// character is whitespace or end of input. Terminators followed directly by
/// other text (e.g. a version number like "1.2.3") do not split. Trailing
/// text without a terminator is emitted as a final sentence, trimmed of
/// trailing whitespace via `last_non_whitespace_end`.
pub fn tokenize_sentences(input: &str) -> Vec<Token> {
    let characters: Vec<(usize, char)> = input.char_indices().collect();
    let mut tokens = Vec::new();
    // Byte offset where the current sentence began; `None` between sentences.
    let mut start = None;
    // End offset of the last non-whitespace char, used to trim the final token.
    let mut last_non_whitespace_end = 0;
    let mut index = 0;

    while index < characters.len() {
        let (byte_index, character) = characters[index];
        let character_end = byte_index + character.len_utf8();

        if start.is_none() {
            // Skip inter-sentence whitespace; the first non-whitespace char
            // opens the next sentence.
            if character.is_whitespace() {
                index += 1;
                continue;
            }

            start = Some(byte_index);
        }

        if !character.is_whitespace() {
            last_non_whitespace_end = character_end;
        }

        if matches!(character, '.' | '!' | '?') {
            let mut sentence_end = character_end;
            let mut lookahead = index + 1;

            // Absorb runs like "?!", '."', or ".)" into the same sentence.
            while let Some((next_byte, next_character)) = characters.get(lookahead).copied() {
                if matches!(
                    next_character,
                    '.' | '!' | '?' | '"' | '\'' | '”' | '’' | ')' | ']'
                ) {
                    sentence_end = next_byte + next_character.len_utf8();
                    lookahead += 1;
                } else {
                    break;
                }
            }

            // Only split when the terminator run is followed by whitespace or
            // end of input — this keeps "3.14" or "e.g.x" intact.
            let next_character = characters.get(lookahead).map(|(_, value)| *value);
            if next_character.is_none() || next_character.is_some_and(char::is_whitespace) {
                let token_start = start.expect("sentence start should exist");
                tokens.push(Token::new(
                    TokenKind::Sentence,
                    input[token_start..sentence_end].to_owned(),
                    token_start,
                    sentence_end,
                ));
                start = None;
                last_non_whitespace_end = sentence_end;
                // Resume scanning after the absorbed terminator run.
                index = lookahead;
                continue;
            }
        }

        index += 1;
    }

    // Unterminated trailing text becomes a final sentence, without any
    // trailing whitespace.
    if let Some(token_start) = start {
        tokens.push(Token::new(
            TokenKind::Sentence,
            input[token_start..last_non_whitespace_end].to_owned(),
            token_start,
            last_non_whitespace_end,
        ));
    }

    tokens
}
176
177/// Splits input into Unicode scalar values.
178pub fn tokenize_chars(input: &str) -> Vec<Token> {
179    input
180        .char_indices()
181        .map(|(start, character)| {
182            let end = start + character.len_utf8();
183            Token::new(TokenKind::Char, character.to_string(), start, end)
184        })
185        .collect()
186}
187
188/// Counts conservative word tokens.
189pub fn token_count(input: &str) -> usize {
190    tokenize_words(input).len()
191}
192
/// Computes the half-open byte ranges of conservative word tokens: maximal
/// runs of alphanumerics, where an apostrophe joins a word only when flanked
/// by alphanumerics on both sides (e.g. "don't").
fn word_ranges(input: &str) -> Vec<(usize, usize)> {
    let chars: Vec<(usize, char)> = input.char_indices().collect();
    let mut ranges = Vec::new();
    // Start offset of the word currently being scanned, if any.
    let mut current: Option<usize> = None;

    for (position, &(offset, ch)) in chars.iter().enumerate() {
        let flanked_by_alnum = position > 0
            && chars[position - 1].1.is_alphanumeric()
            && chars
                .get(position + 1)
                .is_some_and(|&(_, next)| next.is_alphanumeric());
        let in_word =
            ch.is_alphanumeric() || (matches!(ch, '\'' | '’') && flanked_by_alnum);

        match (in_word, current) {
            (true, None) => current = Some(offset),
            (false, Some(begin)) => {
                ranges.push((begin, offset));
                current = None;
            }
            _ => {}
        }
    }

    // A word that runs to the end of the input closes at the byte length.
    if let Some(begin) = current {
        ranges.push((begin, input.len()));
    }

    ranges
}
219
/// Reports whether `current` is an apostrophe acting as part of a word,
/// i.e. an ASCII or typographic apostrophe with alphanumerics on both sides.
fn is_apostrophe(previous: Option<char>, current: char, next: Option<char>) -> bool {
    let is_mark = current == '\'' || current == '’';
    let alnum = |side: Option<char>| side.is_some_and(char::is_alphanumeric);
    is_mark && alnum(previous) && alnum(next)
}
225
#[cfg(test)]
mod tests {
    use super::{
        TokenKind, TokenizerOptions, token_count, tokenize_chars, tokenize_sentences,
        tokenize_whitespace, tokenize_words,
    };

    // Degenerate inputs must produce no tokens rather than empty ones.
    #[test]
    fn handles_empty_and_whitespace_only_input() {
        assert!(tokenize_whitespace("").is_empty());
        assert!(tokenize_words("   \n").is_empty());
        assert_eq!(token_count("\t  "), 0);
    }

    // Spans are byte offsets into the original (untrimmed) input.
    #[test]
    fn tokenizes_whitespace_and_tracks_spans() {
        let tokens = tokenize_whitespace(" hello  world ");
        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].kind, TokenKind::Text);
        assert_eq!(tokens[0].text, "hello");
        assert_eq!(tokens[0].span.start, 1);
        assert_eq!(tokens[1].span.end, 13);
    }

    // Apostrophes inside words are kept; hyphens and punctuation split.
    #[test]
    fn tokenizes_words_with_punctuation_and_apostrophes() {
        let tokens = tokenize_words("Hello, world! don't-stop");
        let texts: Vec<&str> = tokens.iter().map(|token| token.text.as_str()).collect();
        assert_eq!(texts, vec!["Hello", "world", "don't", "stop"]);
        assert!(tokens.iter().all(|token| token.kind == TokenKind::Word));
    }

    // Trailing unterminated text ("Three") still becomes a sentence.
    #[test]
    fn tokenizes_sentences_and_multiline_text() {
        let tokens = tokenize_sentences("One.  Two!\nThree");
        let texts: Vec<&str> = tokens.iter().map(|token| token.text.as_str()).collect();
        assert_eq!(texts, vec!["One.", "Two!", "Three"]);
    }

    // Spans are in bytes: the emoji occupies 4 bytes (1..5).
    #[test]
    fn tokenizes_unicode_characters() {
        let tokens = tokenize_chars("A🙂");
        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[1].text, "🙂");
        assert_eq!(tokens[1].span.start, 1);
        assert_eq!(tokens[1].span.end, 5);
    }

    // Non-ASCII alphabetic characters stay inside a single word token.
    #[test]
    fn tokenizes_unicode_words_conservatively() {
        let tokens = tokenize_words("naïve façade");
        let texts: Vec<&str> = tokens.iter().map(|token| token.text.as_str()).collect();
        assert_eq!(texts, vec!["naïve", "façade"]);
        assert!(TokenizerOptions::default().trim_empty);
    }
}
281}