//! XML tokenizer for `oxihuman_core` (`xml_tokenizer.rs`).

#![allow(dead_code)]
/// A single lexical token produced by the XML tokenizer.
#[derive(Debug, Clone, PartialEq)]
pub enum XmlToken {
    /// `<?...?>` processing instruction / XML declaration; holds the text
    /// between the delimiters.
    Declaration(String),
    /// Opening tag `<name ...>`.
    StartTag {
        name: String,
        // (name, value) pairs from the tag's attribute tail.
        attrs: Vec<(String, String)>,
    },
    /// Closing tag `</name>`; holds the trimmed tag name.
    EndTag(String),
    /// Self-closing tag `<name .../>`.
    EmptyTag {
        name: String,
        // (name, value) pairs from the tag's attribute tail.
        attrs: Vec<(String, String)>,
    },
    /// Character data appearing between tags.
    Text(String),
    /// `<!-- ... -->`; holds the comment body.
    Comment(String),
    /// `<![CDATA[ ... ]]>`; holds the section body.
    CData(String),
}
31
/// Error describing a tokenization failure.
#[derive(Debug, Clone, PartialEq)]
pub struct XmlError {
    // Offset into the input where the error was detected
    // (presumably a character index, matching `XmlTokenizer::pos` — confirm
    // when error construction is added; no code in this file builds one yet).
    pub position: usize,
    // Human-readable description of the failure.
    pub message: String,
}
38
39impl std::fmt::Display for XmlError {
40 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
41 write!(
42 f,
43 "XML error at position {}: {}",
44 self.position, self.message
45 )
46 }
47}
48
/// Tokenizer over an in-memory XML document.
#[derive(Debug)]
pub struct XmlTokenizer {
    // Full input decoded to chars, so `pos` indexes whole characters.
    input: Vec<char>,
    // Cursor: index of the next unread char in `input`.
    pos: usize,
}
55
56impl XmlTokenizer {
57 pub fn new(input: &str) -> Self {
59 XmlTokenizer {
60 input: input.chars().collect(),
61 pos: 0,
62 }
63 }
64
65 pub fn is_done(&self) -> bool {
67 self.pos >= self.input.len()
68 }
69
70 pub fn position(&self) -> usize {
72 self.pos
73 }
74
75 fn peek(&self) -> Option<char> {
76 self.input.get(self.pos).copied()
77 }
78
79 fn advance(&mut self) -> Option<char> {
80 let ch = self.input.get(self.pos).copied();
81 self.pos += 1;
82 ch
83 }
84
85 fn consume_until(&mut self, stop: &str) -> String {
86 let stop_chars: Vec<char> = stop.chars().collect();
87 let mut buf = String::new();
88 while self.pos + stop_chars.len() <= self.input.len() {
89 let window: Vec<char> = self.input[self.pos..self.pos + stop_chars.len()].to_vec();
90 if window == stop_chars {
91 self.pos += stop_chars.len();
92 break;
93 }
94 buf.push(self.input[self.pos]);
95 self.pos += 1;
96 }
97 buf
98 }
99
100 pub fn tokenize(&mut self) -> Result<Vec<XmlToken>, XmlError> {
102 let mut tokens = vec![];
103 while !self.is_done() {
104 if self.peek() == Some('<') {
105 self.pos += 1; if self.pos >= self.input.len() {
107 break;
108 }
109 if self.input[self.pos..].starts_with(&['!', '-', '-']) {
111 self.pos += 3;
112 let text = self.consume_until("-->");
113 tokens.push(XmlToken::Comment(text));
114 } else if self.input[self.pos..]
116 .starts_with(&['!', '[', 'C', 'D', 'A', 'T', 'A', '['])
117 {
118 self.pos += 8;
119 let text = self.consume_until("]]>");
120 tokens.push(XmlToken::CData(text));
121 } else if self.peek() == Some('?') {
123 self.pos += 1;
124 let text = self.consume_until("?>");
125 tokens.push(XmlToken::Declaration(text));
126 } else if self.peek() == Some('/') {
128 self.pos += 1;
129 let name: String = self.input[self.pos..]
130 .iter()
131 .take_while(|&&c| c != '>')
132 .collect();
133 self.pos += name.len() + 1;
134 tokens.push(XmlToken::EndTag(name.trim().to_string()));
135 } else {
136 let raw: String = self.input[self.pos..]
138 .iter()
139 .take_while(|&&c| c != '>')
140 .collect();
141 self.pos += raw.len() + 1;
142 let is_empty = raw.ends_with('/');
143 let raw = raw.trim_end_matches('/').trim();
144 let mut parts = raw.splitn(2, char::is_whitespace);
145 let name = parts.next().unwrap_or("").to_string();
146 let attrs = vec![];
148 if is_empty {
149 tokens.push(XmlToken::EmptyTag { name, attrs });
150 } else {
151 tokens.push(XmlToken::StartTag { name, attrs });
152 }
153 }
154 } else {
155 let text: String = self.input[self.pos..]
157 .iter()
158 .take_while(|&&c| c != '<')
159 .collect();
160 self.pos += text.len();
161 if !text.is_empty() {
162 tokens.push(XmlToken::Text(text));
163 }
164 }
165 }
166 Ok(tokens)
167 }
168}
169
170pub fn count_start_tags(tokens: &[XmlToken]) -> usize {
172 tokens
173 .iter()
174 .filter(|t| matches!(t, XmlToken::StartTag { .. }))
175 .count()
176}
177
178pub fn count_end_tags(tokens: &[XmlToken]) -> usize {
180 tokens
181 .iter()
182 .filter(|t| matches!(t, XmlToken::EndTag(_)))
183 .count()
184}
185
186pub fn collect_text(tokens: &[XmlToken]) -> Vec<&str> {
188 tokens
189 .iter()
190 .filter_map(|t| {
191 if let XmlToken::Text(s) = t {
192 Some(s.as_str())
193 } else {
194 None
195 }
196 })
197 .collect()
198}
199
200pub fn is_balanced(tokens: &[XmlToken]) -> bool {
202 count_start_tags(tokens) == count_end_tags(tokens)
203}
204
#[cfg(test)]
mod tests {
    use super::*;

    // Empty input produces an empty token list.
    #[test]
    fn test_empty_input() {
        let mut tok = XmlTokenizer::new("");
        assert!(tok.tokenize().expect("should succeed").is_empty());
    }

    // A start/end tag pair is recognized around text content.
    #[test]
    fn test_simple_element() {
        let mut tok = XmlTokenizer::new("<root>hello</root>");
        let tokens = tok.tokenize().expect("should succeed");
        assert!(count_start_tags(&tokens) > 0);
        assert!(count_end_tags(&tokens) > 0);
    }

    // Start/end counts agree for a well-formed element.
    #[test]
    fn test_balanced() {
        let mut tok = XmlTokenizer::new("<a>text</a>");
        let tokens = tok.tokenize().expect("should succeed");
        assert!(is_balanced(&tokens));
    }

    // `<!-- ... -->` becomes a Comment token.
    #[test]
    fn test_comment_token() {
        let mut tok = XmlTokenizer::new("<!-- hi -->");
        let tokens = tok.tokenize().expect("should succeed");
        assert!(matches!(tokens.first(), Some(XmlToken::Comment(_))));
    }

    // A self-closing tag becomes an EmptyTag token.
    #[test]
    fn test_empty_tag() {
        let mut tok = XmlTokenizer::new("<br/>");
        let tokens = tok.tokenize().expect("should succeed");
        assert!(matches!(tokens.first(), Some(XmlToken::EmptyTag { .. })));
    }

    // Text between tags is surfaced by collect_text.
    #[test]
    fn test_text_collection() {
        let mut tok = XmlTokenizer::new("<x>world</x>");
        let tokens = tok.tokenize().expect("should succeed");
        let texts = collect_text(&tokens);
        assert!(!texts.is_empty());
    }

    // `<?xml ...?>` becomes a Declaration token.
    #[test]
    fn test_declaration_token() {
        let mut tok = XmlTokenizer::new("<?xml version=\"1.0\"?>");
        let tokens = tok.tokenize().expect("should succeed");
        assert!(matches!(tokens.first(), Some(XmlToken::Declaration(_))));
    }

    // Nested elements keep start and end tag counts equal.
    #[test]
    fn test_count_start_end_symmetry() {
        let mut tok = XmlTokenizer::new("<a><b></b></a>");
        let tokens = tok.tokenize().expect("should succeed");
        assert_eq!(count_start_tags(&tokens), count_end_tags(&tokens));
    }

    // Tokenizing moves the cursor forward from zero.
    #[test]
    fn test_position_advances() {
        let mut tok = XmlTokenizer::new("<tag/>");
        tok.tokenize().expect("should succeed");
        assert!(tok.position() > 0);
    }

    // The tokenizer reports done after consuming all input.
    #[test]
    fn test_is_done_after_all_input() {
        let mut tok = XmlTokenizer::new("<x/>");
        tok.tokenize().expect("should succeed");
        assert!(tok.is_done());
    }
}