oxihuman_core/
text_tokenizer.rs

#![allow(dead_code)]

/// A single lexical token produced by `tokenize`.
///
/// `kind` encodes the token category: 0 = word/identifier, 1 = number,
/// 2 = punctuation, 3 = whitespace.
#[allow(dead_code)]
#[derive(Debug, Clone, PartialEq)]
pub struct Token {
    pub kind: u8,
    pub text: String,
}

#[allow(dead_code)]
pub fn tokenize(text: &str) -> Vec<Token> {
    let mut tokens = Vec::new();
    let mut chars = text.chars().peekable();
    while let Some(ch) = chars.next() {
        if ch.is_whitespace() {
            // Collapse a run of whitespace into a single token (kind 3).
            let mut s = ch.to_string();
            while let Some(&nc) = chars.peek() {
                if nc.is_whitespace() {
                    s.push(nc);
                    chars.next();
                } else {
                    break;
                }
            }
            tokens.push(Token { kind: 3, text: s });
        } else if ch.is_ascii_digit()
            || (ch == '-'
                && chars.peek().map(|&c| c.is_ascii_digit()).unwrap_or(false))
        {
            // Number token (kind 1): digits with an optional leading minus.
            // Dots are consumed greedily with no validation; anything that
            // fails to parse as f64 is dropped later by `token_numbers`.
            let mut s = ch.to_string();
            while let Some(&nc) = chars.peek() {
                if nc.is_ascii_digit() || nc == '.' {
                    s.push(nc);
                    chars.next();
                } else {
                    break;
                }
            }
            tokens.push(Token { kind: 1, text: s });
        } else if ch.is_alphabetic() || ch == '_' {
            // Word/identifier token (kind 0): alphanumerics and underscores.
            let mut s = ch.to_string();
            while let Some(&nc) = chars.peek() {
                if nc.is_alphanumeric() || nc == '_' {
                    s.push(nc);
                    chars.next();
                } else {
                    break;
                }
            }
            tokens.push(Token { kind: 0, text: s });
        } else {
            // Everything else becomes a single-character punctuation token (kind 2).
            tokens.push(Token {
                kind: 2,
                text: ch.to_string(),
            });
        }
    }
    tokens
}
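
// Sketch of typical output (illustrative input, not one of the test cases
// below): tokenize("x = -1.5") produces five tokens, in order:
// word "x", whitespace, punctuation "=", whitespace, number "-1.5".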

/// Returns the text of every word token (kind 0), in order.
#[allow(dead_code)]
pub fn token_words(tokens: &[Token]) -> Vec<&str> {
    tokens
        .iter()
        .filter(|t| t.kind == 0)
        .map(|t| t.text.as_str())
        .collect()
}

/// Parses every number token (kind 1) as f64, skipping any that fail to parse.
#[allow(dead_code)]
pub fn token_numbers(tokens: &[Token]) -> Vec<f64> {
    tokens
        .iter()
        .filter(|t| t.kind == 1)
        .filter_map(|t| t.text.parse::<f64>().ok())
        .collect()
}
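
// Sketch of chained usage over one token stream (assumed example input;
// the expected values follow directly from the two functions above):
//
//     let tokens = tokenize("alpha 1 beta 2.5");
//     assert_eq!(token_words(&tokens), vec!["alpha", "beta"]);
//     assert_eq!(token_numbers(&tokens), vec![1.0, 2.5]);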

/// Total number of tokens, including whitespace tokens.
#[allow(dead_code)]
pub fn token_count(tokens: &[Token]) -> usize {
    tokens.len()
}

/// True if the token is a number token (kind 1).
#[allow(dead_code)]
pub fn is_numeric_token(t: &Token) -> bool {
    t.kind == 1
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn tokenize_words() {
        let tokens = tokenize("hello world");
        let words = token_words(&tokens);
        assert!(words.contains(&"hello"));
        assert!(words.contains(&"world"));
    }

    #[test]
    fn tokenize_numbers() {
        let tokens = tokenize("abc 42 xyz");
        let nums = token_numbers(&tokens);
        assert_eq!(nums.len(), 1);
        assert!((nums[0] - 42.0).abs() < 1e-9);
    }

    #[test]
    fn tokenize_punctuation() {
        let tokens = tokenize("a,b.c");
        let puncts: Vec<_> = tokens.iter().filter(|t| t.kind == 2).collect();
        assert!(!puncts.is_empty());
    }

    #[test]
    fn tokenize_whitespace() {
        let tokens = tokenize(" ");
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].kind, 3);
    }

    #[test]
    fn token_count_correct() {
        let tokens = tokenize("one 2 three");
        assert_eq!(token_count(&tokens), 5);
    }

    #[test]
    fn is_numeric_token_true() {
        let t = Token {
            kind: 1,
            text: "3.14".to_string(),
        };
        assert!(is_numeric_token(&t));
    }

    #[test]
    fn is_numeric_token_false() {
        let t = Token {
            kind: 0,
            text: "hello".to_string(),
        };
        assert!(!is_numeric_token(&t));
    }

    #[test]
    fn tokenize_empty() {
        let tokens = tokenize("");
        assert!(tokens.is_empty());
    }

    #[test]
    fn float_number() {
        let tokens = tokenize("2.5");
        let nums = token_numbers(&tokens);
        assert_eq!(nums.len(), 1);
        assert!((nums[0] - 2.5).abs() < 1e-5);
    }

    #[test]
    fn multiple_numbers() {
        let tokens = tokenize("1 22 333");
        let nums = token_numbers(&tokens);
        assert_eq!(nums.len(), 3);
    }
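
    // Additional sketch (an assumed extra test, not in the original suite):
    // exercises the `-`-lookahead branch in `tokenize`, which treats a minus
    // sign immediately followed by a digit as part of the number.
    #[test]
    fn negative_number() {
        let tokens = tokenize("-3.5");
        let nums = token_numbers(&tokens);
        assert_eq!(nums.len(), 1);
        assert!((nums[0] + 3.5).abs() < 1e-9);
    }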
}