// oxihuman_core/xml_tokenizer.rs
// Copyright (C) 2026 COOLJAPAN OU (Team KitaSan)
// SPDX-License-Identifier: Apache-2.0
#![allow(dead_code)]

//! XML SAX-style tokenizer stub.
6
/// One lexical unit produced by the XML tokenizer.
#[derive(Debug, Clone, PartialEq)]
pub enum XmlToken {
    /// XML declaration — the content between `<?` and `?>`.
    Declaration(String),
    /// Opening tag such as `<tag attr="v">`.
    StartTag {
        name: String,
        attrs: Vec<(String, String)>,
    },
    /// Closing tag such as `</tag>`.
    EndTag(String),
    /// Self-closing tag such as `<tag/>`.
    EmptyTag {
        name: String,
        attrs: Vec<(String, String)>,
    },
    /// Character data appearing between tags.
    Text(String),
    /// Comment body taken from `<!-- ... -->`.
    Comment(String),
    /// Raw contents of a `<![CDATA[...]]>` section.
    CData(String),
}

/// Error reported by the XML tokenizer: a position in the input plus a
/// human-readable description.
#[derive(Debug, Clone, PartialEq)]
pub struct XmlError {
    /// Position in the input at which the problem was detected.
    pub position: usize,
    /// Description of what went wrong.
    pub message: String,
}

impl std::fmt::Display for XmlError {
    /// Renders as `XML error at position <pos>: <message>`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "XML error at position {}: {}", self.position, self.message)
    }
}

/// A SAX-style XML tokenizer that walks the input one character at a time.
#[derive(Debug)]
pub struct XmlTokenizer {
    /// The document, pre-split into chars so positions are char indices.
    input: Vec<char>,
    /// Cursor: index of the next unread char in `input`.
    pos: usize,
}

56impl XmlTokenizer {
57    /// Create a new tokenizer for the given input.
58    pub fn new(input: &str) -> Self {
59        XmlTokenizer {
60            input: input.chars().collect(),
61            pos: 0,
62        }
63    }
64
65    /// Return `true` if all input has been consumed.
66    pub fn is_done(&self) -> bool {
67        self.pos >= self.input.len()
68    }
69
70    /// Return current position.
71    pub fn position(&self) -> usize {
72        self.pos
73    }
74
75    fn peek(&self) -> Option<char> {
76        self.input.get(self.pos).copied()
77    }
78
79    fn advance(&mut self) -> Option<char> {
80        let ch = self.input.get(self.pos).copied();
81        self.pos += 1;
82        ch
83    }
84
85    fn consume_until(&mut self, stop: &str) -> String {
86        let stop_chars: Vec<char> = stop.chars().collect();
87        let mut buf = String::new();
88        while self.pos + stop_chars.len() <= self.input.len() {
89            let window: Vec<char> = self.input[self.pos..self.pos + stop_chars.len()].to_vec();
90            if window == stop_chars {
91                self.pos += stop_chars.len();
92                break;
93            }
94            buf.push(self.input[self.pos]);
95            self.pos += 1;
96        }
97        buf
98    }
99
100    /// Collect all tokens from the input.
101    pub fn tokenize(&mut self) -> Result<Vec<XmlToken>, XmlError> {
102        let mut tokens = vec![];
103        while !self.is_done() {
104            if self.peek() == Some('<') {
105                self.pos += 1; /* consume '<' */
106                if self.pos >= self.input.len() {
107                    break;
108                }
109                /* comment */
110                if self.input[self.pos..].starts_with(&['!', '-', '-']) {
111                    self.pos += 3;
112                    let text = self.consume_until("-->");
113                    tokens.push(XmlToken::Comment(text));
114                /* CDATA */
115                } else if self.input[self.pos..]
116                    .starts_with(&['!', '[', 'C', 'D', 'A', 'T', 'A', '['])
117                {
118                    self.pos += 8;
119                    let text = self.consume_until("]]>");
120                    tokens.push(XmlToken::CData(text));
121                /* declaration */
122                } else if self.peek() == Some('?') {
123                    self.pos += 1;
124                    let text = self.consume_until("?>");
125                    tokens.push(XmlToken::Declaration(text));
126                /* end tag */
127                } else if self.peek() == Some('/') {
128                    self.pos += 1;
129                    let name: String = self.input[self.pos..]
130                        .iter()
131                        .take_while(|&&c| c != '>')
132                        .collect();
133                    self.pos += name.len() + 1;
134                    tokens.push(XmlToken::EndTag(name.trim().to_string()));
135                } else {
136                    /* start or empty tag — stub: read until '>' */
137                    let raw: String = self.input[self.pos..]
138                        .iter()
139                        .take_while(|&&c| c != '>')
140                        .collect();
141                    self.pos += raw.len() + 1;
142                    let is_empty = raw.ends_with('/');
143                    let raw = raw.trim_end_matches('/').trim();
144                    let mut parts = raw.splitn(2, char::is_whitespace);
145                    let name = parts.next().unwrap_or("").to_string();
146                    /* attrs stub — skip parsing */
147                    let attrs = vec![];
148                    if is_empty {
149                        tokens.push(XmlToken::EmptyTag { name, attrs });
150                    } else {
151                        tokens.push(XmlToken::StartTag { name, attrs });
152                    }
153                }
154            } else {
155                /* text content */
156                let text: String = self.input[self.pos..]
157                    .iter()
158                    .take_while(|&&c| c != '<')
159                    .collect();
160                self.pos += text.len();
161                if !text.is_empty() {
162                    tokens.push(XmlToken::Text(text));
163                }
164            }
165        }
166        Ok(tokens)
167    }
168}
169
170/// Count tokens of a specific variant name in a list.
171pub fn count_start_tags(tokens: &[XmlToken]) -> usize {
172    tokens
173        .iter()
174        .filter(|t| matches!(t, XmlToken::StartTag { .. }))
175        .count()
176}
177
178/// Count end tags.
179pub fn count_end_tags(tokens: &[XmlToken]) -> usize {
180    tokens
181        .iter()
182        .filter(|t| matches!(t, XmlToken::EndTag(_)))
183        .count()
184}
185
186/// Collect all text content strings.
187pub fn collect_text(tokens: &[XmlToken]) -> Vec<&str> {
188    tokens
189        .iter()
190        .filter_map(|t| {
191            if let XmlToken::Text(s) = t {
192                Some(s.as_str())
193            } else {
194                None
195            }
196        })
197        .collect()
198}
199
200/// Return `true` if the token list represents a well-formed document (stub check).
201pub fn is_balanced(tokens: &[XmlToken]) -> bool {
202    count_start_tags(tokens) == count_end_tags(tokens)
203}
204
#[cfg(test)]
mod tests {
    use super::*;

    /// Tokenize `src`, panicking if the tokenizer reports an error.
    fn toks(src: &str) -> Vec<XmlToken> {
        let mut tok = XmlTokenizer::new(src);
        tok.tokenize().expect("should succeed")
    }

    #[test]
    fn test_empty_input() {
        // Empty input yields no tokens at all.
        assert!(toks("").is_empty());
    }

    #[test]
    fn test_simple_element() {
        // A simple element produces at least one start and one end tag.
        let tokens = toks("<root>hello</root>");
        assert!(count_start_tags(&tokens) > 0);
        assert!(count_end_tags(&tokens) > 0);
    }

    #[test]
    fn test_balanced() {
        // Matching open/close tags are reported as balanced.
        assert!(is_balanced(&toks("<a>text</a>")));
    }

    #[test]
    fn test_comment_token() {
        // A comment becomes a Comment token.
        let tokens = toks("<!-- hi -->");
        assert!(matches!(tokens.first(), Some(XmlToken::Comment(_))));
    }

    #[test]
    fn test_empty_tag() {
        // A self-closing tag becomes an EmptyTag token.
        let tokens = toks("<br/>");
        assert!(matches!(tokens.first(), Some(XmlToken::EmptyTag { .. })));
    }

    #[test]
    fn test_text_collection() {
        // collect_text extracts the text node between the tags.
        let texts = collect_text(&toks("<x>world</x>"));
        assert!(!texts.is_empty());
    }

    #[test]
    fn test_declaration_token() {
        // The XML prolog becomes a Declaration token.
        let tokens = toks("<?xml version=\"1.0\"?>");
        assert!(matches!(tokens.first(), Some(XmlToken::Declaration(_))));
    }

    #[test]
    fn test_count_start_end_symmetry() {
        // Nested tags keep the start/end counts in sync.
        let tokens = toks("<a><b></b></a>");
        assert_eq!(count_start_tags(&tokens), count_end_tags(&tokens));
    }

    #[test]
    fn test_position_advances() {
        // Tokenizing moves the cursor forward.
        let mut tok = XmlTokenizer::new("<tag/>");
        tok.tokenize().expect("should succeed");
        assert!(tok.position() > 0);
    }

    #[test]
    fn test_is_done_after_all_input() {
        // After tokenizing everything, the tokenizer is exhausted.
        let mut tok = XmlTokenizer::new("<x/>");
        tok.tokenize().expect("should succeed");
        assert!(tok.is_done());
    }
}
289}