// oxihuman_core/text_tokenizer.rs

#![allow(dead_code)]
// Copyright (C) 2026 COOLJAPAN OU (Team KitaSan)
// SPDX-License-Identifier: Apache-2.0

//! Simple whitespace/punctuation tokenizer.
//! kind: 0=word, 1=number, 2=punct, 3=whitespace
7
#[allow(dead_code)]
#[derive(Debug, Clone, PartialEq)]
pub struct Token {
    /// Token category: 0 = word, 1 = number, 2 = punctuation, 3 = whitespace.
    pub kind: u8,
    /// The exact slice of input text this token covers.
    pub text: String,
}

#[allow(dead_code)]
/// Splits `text` into a flat sequence of [`Token`]s covering the whole input.
///
/// Classification (first match wins):
/// - whitespace run           -> one kind-3 token
/// - digits, `-` before digit -> kind-1 number; at most one `.` is consumed,
///   so every number token parses as `f64` (`"1.2.3"` yields `1.2`, `.`, `3`)
/// - alphabetic or `_` start  -> kind-0 word of alphanumerics/underscores
/// - anything else            -> single-char kind-2 punctuation token
pub fn tokenize(text: &str) -> Vec<Token> {
    let mut tokens = Vec::new();
    // `chars()` suffices: the byte offset from `char_indices()` was never used
    // (the original needed a `let _ = i;` to silence the unused-variable lint).
    let mut chars = text.chars().peekable();
    while let Some(ch) = chars.next() {
        if ch.is_whitespace() {
            let mut s = ch.to_string();
            while let Some(&nc) = chars.peek() {
                if !nc.is_whitespace() {
                    break;
                }
                s.push(nc);
                chars.next();
            }
            tokens.push(Token { kind: 3, text: s });
        } else if ch.is_ascii_digit()
            || (ch == '-'
                && chars
                    .peek()
                    .map(|&c| c.is_ascii_digit())
                    .unwrap_or(false))
        {
            let mut s = ch.to_string();
            // Accept at most one decimal point so the token is always a
            // parseable f64; a second '.' terminates the number.
            let mut seen_dot = false;
            while let Some(&nc) = chars.peek() {
                if nc.is_ascii_digit() || (nc == '.' && !seen_dot) {
                    seen_dot |= nc == '.';
                    s.push(nc);
                    chars.next();
                } else {
                    break;
                }
            }
            tokens.push(Token { kind: 1, text: s });
        } else if ch.is_alphabetic() || ch == '_' {
            let mut s = ch.to_string();
            while let Some(&nc) = chars.peek() {
                if nc.is_alphanumeric() || nc == '_' {
                    s.push(nc);
                    chars.next();
                } else {
                    break;
                }
            }
            tokens.push(Token { kind: 0, text: s });
        } else {
            // Any other single character is punctuation.
            tokens.push(Token {
                kind: 2,
                text: ch.to_string(),
            });
        }
    }
    tokens
}
69
70#[allow(dead_code)]
71pub fn token_words(tokens: &[Token]) -> Vec<&str> {
72    tokens
73        .iter()
74        .filter(|t| t.kind == 0)
75        .map(|t| t.text.as_str())
76        .collect()
77}
78
79#[allow(dead_code)]
80pub fn token_numbers(tokens: &[Token]) -> Vec<f64> {
81    tokens
82        .iter()
83        .filter(|t| t.kind == 1)
84        .filter_map(|t| t.text.parse::<f64>().ok())
85        .collect()
86}
87
#[allow(dead_code)]
/// Total number of tokens of every kind (words, numbers, punctuation, whitespace).
pub fn token_count(tokens: &[Token]) -> usize {
    tokens.len()
}
92
#[allow(dead_code)]
/// Returns `true` if `t` is a number token (`kind == 1`).
pub fn is_numeric_token(t: &Token) -> bool {
    t.kind == 1
}
97
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn tokenize_words() {
        let words = token_words(&tokenize("hello world"));
        assert!(words.iter().any(|w| *w == "hello"));
        assert!(words.iter().any(|w| *w == "world"));
    }

    #[test]
    fn tokenize_numbers() {
        let nums = token_numbers(&tokenize("abc 42 xyz"));
        assert_eq!(nums.len(), 1);
        assert!((nums[0] - 42.0).abs() < 1e-9);
    }

    #[test]
    fn tokenize_punctuation() {
        let punct_count = tokenize("a,b.c").iter().filter(|t| t.kind == 2).count();
        assert!(punct_count > 0);
    }

    #[test]
    fn tokenize_whitespace() {
        let tokens = tokenize("  ");
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].kind, 3);
    }

    #[test]
    fn token_count_correct() {
        // "one" + space + "2" + space + "three" = 5 tokens
        assert_eq!(token_count(&tokenize("one 2 three")), 5);
    }

    #[test]
    fn is_numeric_token_true() {
        let numeric = Token {
            kind: 1,
            text: String::from("3.14"),
        };
        assert!(is_numeric_token(&numeric));
    }

    #[test]
    fn is_numeric_token_false() {
        let word = Token {
            kind: 0,
            text: String::from("hello"),
        };
        assert!(!is_numeric_token(&word));
    }

    #[test]
    fn tokenize_empty() {
        assert!(tokenize("").is_empty());
    }

    #[test]
    fn float_number() {
        // Integer-like float keeps clippy's approx_constant lint quiet.
        let nums = token_numbers(&tokenize("2.5"));
        assert_eq!(nums.len(), 1);
        assert!((nums[0] - 2.5).abs() < 1e-5);
    }

    #[test]
    fn multiple_numbers() {
        assert_eq!(token_numbers(&tokenize("1 22 333")).len(), 3);
    }
}