// canonical/token.rs

use serde::{Deserialize, Serialize};

3/// A token with its UTF-8 byte offsets in the canonical text.
4#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
5pub struct Token {
6    /// The token text content.
7    pub text: String,
8    /// Byte offset (inclusive) in the canonical text.
9    pub start: usize,
10    /// Byte offset (exclusive) in the canonical text.
11    pub end: usize,
12}
13
14impl AsRef<str> for Token {
15    fn as_ref(&self) -> &str {
16        self.text.as_str()
17    }
18}
19
20/// Tokenizes canonical text and produces byte offsets.
21///
22/// This helper assumes that `text` has already been canonicalized and that
23/// tokens are separated by Unicode whitespace. It is deterministic and
24/// cross-platform.
25pub fn tokenize(text: &str) -> Vec<Token> {
26    let mut tokens = Vec::new();
27    let mut start: Option<usize> = None;
28
29    for (idx, ch) in text.char_indices() {
30        if ch.is_whitespace() {
31            if let Some(token_start) = start.take() {
32                tokens.push(Token {
33                    text: text[token_start..idx].to_string(),
34                    start: token_start,
35                    end: idx,
36                });
37            }
38        } else if start.is_none() {
39            start = Some(idx);
40        }
41    }
42
43    if let Some(token_start) = start {
44        tokens.push(Token {
45            text: text[token_start..].to_string(),
46            start: token_start,
47            end: text.len(),
48        });
49    }
50
51    tokens
52}