#![allow(dead_code)]
/// One lexical unit produced by the XML tokenizer.
#[derive(Debug, Clone, PartialEq)]
pub enum XmlToken {
    /// `<? ... ?>` — the text between the markers, e.g. `xml version="1.0"`.
    Declaration(String),
    /// `<name attr="v" ...>` opening tag.
    StartTag {
        name: String,
        attrs: Vec<(String, String)>,
    },
    /// `</name>` closing tag (name only, trimmed).
    EndTag(String),
    /// `<name attr="v" .../>` self-closing tag.
    EmptyTag {
        name: String,
        attrs: Vec<(String, String)>,
    },
    /// Character data between tags (whitespace preserved, not entity-decoded).
    Text(String),
    /// `<!-- ... -->` — the text between the markers.
    Comment(String),
    /// `<![CDATA[ ... ]]>` — the raw text between the markers.
    CData(String),
}
/// Error type carried in the `Result` returned by `XmlTokenizer::tokenize`.
///
/// NOTE(review): the visible tokenizer code never constructs this value —
/// `tokenize` always returns `Ok` — so it presumably exists for callers or a
/// future strict mode; confirm before relying on it being raised.
#[derive(Debug, Clone, PartialEq)]
pub struct XmlError {
    /// Offset into the input where the error occurred (presumably a
    /// character index, matching the tokenizer's cursor — no constructor
    /// is visible here to confirm).
    pub position: usize,
    /// Human-readable description of the problem.
    pub message: String,
}
impl std::fmt::Display for XmlError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
f,
"XML error at position {}: {}",
self.position, self.message
)
}
}
/// A simple cursor-based XML tokenizer.
///
/// The input is decoded into a `Vec<char>` up front, so the cursor indexes
/// characters (not bytes).
#[derive(Debug)]
pub struct XmlTokenizer {
    /// The full input, one element per character.
    input: Vec<char>,
    /// Index of the next unread character in `input`.
    pos: usize,
}
impl XmlTokenizer {
    /// Create a tokenizer over `input`. The text is decoded to a `Vec<char>`
    /// up front so the cursor indexes characters, not bytes.
    pub fn new(input: &str) -> Self {
        XmlTokenizer {
            input: input.chars().collect(),
            pos: 0,
        }
    }

    /// True once every input character has been consumed.
    pub fn is_done(&self) -> bool {
        self.pos >= self.input.len()
    }

    /// Current cursor position, in characters from the start of the input.
    pub fn position(&self) -> usize {
        self.pos
    }

    /// Look at the current character without consuming it.
    fn peek(&self) -> Option<char> {
        self.input.get(self.pos).copied()
    }

    /// Consume and return the current character.
    ///
    /// Note: the cursor is incremented even when the input is already
    /// exhausted, so `pos` can move past `input.len()`; `is_done` still
    /// reports true in that case.
    fn advance(&mut self) -> Option<char> {
        let ch = self.input.get(self.pos).copied();
        self.pos += 1;
        ch
    }

    /// Consume characters until the `stop` marker is found; the marker itself
    /// is consumed but not included in the returned text.
    ///
    /// If the marker never appears, the remainder of the input is consumed
    /// and returned. (The previous version stopped `stop.len() - 1`
    /// characters early in that case, leaking the tail of an unterminated
    /// comment/CDATA/declaration back out as a spurious `Text` token.)
    fn consume_until(&mut self, stop: &str) -> String {
        let stop_chars: Vec<char> = stop.chars().collect();
        let mut buf = String::new();
        while self.pos < self.input.len() {
            if self.input[self.pos..].starts_with(&stop_chars[..]) {
                self.pos += stop_chars.len();
                return buf;
            }
            buf.push(self.input[self.pos]);
            self.pos += 1;
        }
        buf
    }

    /// Parse `key="value"` / `key='value'` pairs from the text that follows a
    /// tag name. Unquoted values run to the next whitespace; a bare key with
    /// no `=` yields an empty value. Malformed fragments are skipped rather
    /// than reported (the tokenizer is lenient throughout).
    fn parse_attrs(rest: &str) -> Vec<(String, String)> {
        let chars: Vec<char> = rest.chars().collect();
        let mut attrs = Vec::new();
        let mut i = 0;
        while i < chars.len() {
            // Skip leading whitespace before the key.
            while i < chars.len() && chars[i].is_whitespace() {
                i += 1;
            }
            let key_start = i;
            while i < chars.len() && chars[i] != '=' && !chars[i].is_whitespace() {
                i += 1;
            }
            let key: String = chars[key_start..i].iter().collect();
            if key.is_empty() {
                break; // nothing but trailing whitespace left
            }
            // Optional whitespace around '='.
            while i < chars.len() && chars[i].is_whitespace() {
                i += 1;
            }
            if i < chars.len() && chars[i] == '=' {
                i += 1;
                while i < chars.len() && chars[i].is_whitespace() {
                    i += 1;
                }
                let value = if i < chars.len() && (chars[i] == '"' || chars[i] == '\'') {
                    let quote = chars[i];
                    i += 1;
                    let start = i;
                    while i < chars.len() && chars[i] != quote {
                        i += 1;
                    }
                    let v: String = chars[start..i].iter().collect();
                    if i < chars.len() {
                        i += 1; // consume closing quote
                    }
                    v
                } else {
                    let start = i;
                    while i < chars.len() && !chars[i].is_whitespace() {
                        i += 1;
                    }
                    chars[start..i].iter().collect()
                };
                attrs.push((key, value));
            } else {
                // Bare attribute with no value, e.g. `<input disabled>`.
                attrs.push((key, String::new()));
            }
        }
        attrs
    }

    /// Tokenize the whole input into a flat list of tokens.
    ///
    /// The tokenizer is lenient: malformed or unterminated constructs are
    /// consumed best-effort rather than reported, so this currently always
    /// returns `Ok`. The `Result` signature is kept for API compatibility.
    ///
    /// Fixes vs. the previous version:
    /// - cursor advances used `String::len()` (UTF-8 **byte** length) on text
    ///   collected from a `Vec<char>`, overshooting and silently skipping
    ///   input whenever a tag name or text run contained non-ASCII
    ///   characters; advances now use `chars().count()`.
    /// - start/empty tags now get their attributes parsed instead of always
    ///   receiving an empty `attrs` vector.
    pub fn tokenize(&mut self) -> Result<Vec<XmlToken>, XmlError> {
        let mut tokens = vec![];
        while !self.is_done() {
            if self.peek() == Some('<') {
                self.pos += 1;
                if self.pos >= self.input.len() {
                    // Trailing lone '<' with nothing after it: drop it,
                    // matching the previous lenient behavior.
                    break;
                }
                if self.input[self.pos..].starts_with(&['!', '-', '-']) {
                    // <!-- comment -->
                    self.pos += 3;
                    let text = self.consume_until("-->");
                    tokens.push(XmlToken::Comment(text));
                } else if self.input[self.pos..]
                    .starts_with(&['!', '[', 'C', 'D', 'A', 'T', 'A', '['])
                {
                    // <![CDATA[ ... ]]>
                    self.pos += 8;
                    let text = self.consume_until("]]>");
                    tokens.push(XmlToken::CData(text));
                } else if self.peek() == Some('?') {
                    // <?xml ... ?> declaration / processing instruction
                    self.pos += 1;
                    let text = self.consume_until("?>");
                    tokens.push(XmlToken::Declaration(text));
                } else if self.peek() == Some('/') {
                    // </name>
                    self.pos += 1;
                    let name: String = self.input[self.pos..]
                        .iter()
                        .take_while(|&&c| c != '>')
                        .collect();
                    // Advance by character count (not byte length), then
                    // consume the '>' only if it is actually present.
                    self.pos += name.chars().count();
                    if self.peek() == Some('>') {
                        self.pos += 1;
                    }
                    tokens.push(XmlToken::EndTag(name.trim().to_string()));
                } else {
                    // <name attr="v" ...> or <name ... />
                    let raw: String = self.input[self.pos..]
                        .iter()
                        .take_while(|&&c| c != '>')
                        .collect();
                    self.pos += raw.chars().count();
                    if self.peek() == Some('>') {
                        self.pos += 1;
                    }
                    let is_empty = raw.ends_with('/');
                    let raw = raw.trim_end_matches('/').trim();
                    let mut parts = raw.splitn(2, char::is_whitespace);
                    let name = parts.next().unwrap_or("").to_string();
                    // Everything after the name is the attribute list.
                    let attrs = parts.next().map(Self::parse_attrs).unwrap_or_default();
                    if is_empty {
                        tokens.push(XmlToken::EmptyTag { name, attrs });
                    } else {
                        tokens.push(XmlToken::StartTag { name, attrs });
                    }
                }
            } else {
                // Plain character data up to the next '<'.
                let text: String = self.input[self.pos..]
                    .iter()
                    .take_while(|&&c| c != '<')
                    .collect();
                self.pos += text.chars().count();
                if !text.is_empty() {
                    tokens.push(XmlToken::Text(text));
                }
            }
        }
        Ok(tokens)
    }
}
/// Number of `StartTag` tokens in `tokens`.
pub fn count_start_tags(tokens: &[XmlToken]) -> usize {
    let mut total = 0;
    for token in tokens {
        if let XmlToken::StartTag { .. } = token {
            total += 1;
        }
    }
    total
}
/// Number of `EndTag` tokens in `tokens`.
pub fn count_end_tags(tokens: &[XmlToken]) -> usize {
    tokens.iter().fold(0, |acc, token| match token {
        XmlToken::EndTag(_) => acc + 1,
        _ => acc,
    })
}
/// Borrow the contents of every `Text` token, in input order.
pub fn collect_text(tokens: &[XmlToken]) -> Vec<&str> {
    let mut texts = Vec::new();
    for token in tokens {
        if let XmlToken::Text(s) = token {
            texts.push(s.as_str());
        }
    }
    texts
}
/// True when start and end tags occur in equal numbers.
///
/// Only the counts are compared — nesting order is not checked, so
/// `</a><a>` also reports balanced.
pub fn is_balanced(tokens: &[XmlToken]) -> bool {
    let starts = count_start_tags(tokens);
    let ends = count_end_tags(tokens);
    starts == ends
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Tokenize `src`, panicking on error (none of these inputs should fail).
    fn lex(src: &str) -> Vec<XmlToken> {
        XmlTokenizer::new(src).tokenize().expect("should succeed")
    }

    #[test]
    fn test_empty_input() {
        assert!(lex("").is_empty());
    }

    #[test]
    fn test_simple_element() {
        let tokens = lex("<root>hello</root>");
        assert!(count_start_tags(&tokens) > 0);
        assert!(count_end_tags(&tokens) > 0);
    }

    #[test]
    fn test_balanced() {
        assert!(is_balanced(&lex("<a>text</a>")));
    }

    #[test]
    fn test_comment_token() {
        let tokens = lex("<!-- hi -->");
        assert!(matches!(tokens.first(), Some(XmlToken::Comment(_))));
    }

    #[test]
    fn test_empty_tag() {
        let tokens = lex("<br/>");
        assert!(matches!(tokens.first(), Some(XmlToken::EmptyTag { .. })));
    }

    #[test]
    fn test_text_collection() {
        let tokens = lex("<x>world</x>");
        assert!(!collect_text(&tokens).is_empty());
    }

    #[test]
    fn test_declaration_token() {
        let tokens = lex("<?xml version=\"1.0\"?>");
        assert!(matches!(tokens.first(), Some(XmlToken::Declaration(_))));
    }

    #[test]
    fn test_count_start_end_symmetry() {
        let tokens = lex("<a><b></b></a>");
        assert_eq!(count_start_tags(&tokens), count_end_tags(&tokens));
    }

    #[test]
    fn test_position_advances() {
        let mut tok = XmlTokenizer::new("<tag/>");
        tok.tokenize().expect("should succeed");
        assert!(tok.position() > 0);
    }

    #[test]
    fn test_is_done_after_all_input() {
        let mut tok = XmlTokenizer::new("<x/>");
        tok.tokenize().expect("should succeed");
        assert!(tok.is_done());
    }
}