use std::any::Any;
use std::collections::HashMap;
use std::io::BufRead;
use std::sync::Arc;
use compact_str::CompactString;
use quick_xml::Reader;
use quick_xml::events::Event;
use crate::error::Result;
use crate::namespace::Namespace;
use crate::position::PositionTrackingReader;
/// Deduplicating pool of reference-counted strings.
///
/// The first request for a given string allocates a shared `Arc<str>`; later
/// requests for equal text return a clone of that same `Arc`.
#[derive(Debug, Default)]
struct StringInterner {
    /// Maps the string contents to the shared handle created for them.
    cache: HashMap<Box<str>, Arc<str>>,
}

impl StringInterner {
    /// Builds an interner that holds no strings yet.
    fn new() -> Self {
        Self::default()
    }

    /// Returns the shared handle for `s`, creating and remembering it on the
    /// first request.
    fn intern(&mut self, s: &str) -> Arc<str> {
        // Fast path: the text has been seen before, so only bump the refcount.
        if let Some(existing) = self.cache.get(s) {
            return existing.clone();
        }

        let shared = Arc::<str>::from(s);
        self.cache.insert(Box::from(s), shared.clone());
        shared
    }

    /// Number of distinct strings currently held by the interner.
    #[allow(dead_code)]
    fn len(&self) -> usize {
        self.cache.len()
    }
}
/// One event of the XML stream as delivered to every [`XmlEventHandler`].
///
/// All payloads are owned, so an event can be cloned and kept after the
/// parser has moved on.
#[derive(Debug, Clone)]
pub enum XmlEvent {
/// An opening tag.
StartElement {
/// Name part of the tag as returned by `crate::namespace::split_qname`
/// (interned, so repeated tag names share one allocation).
name: Arc<str>,
/// Prefix part of the tag as returned by `split_qname`, if it reported one.
prefix: Option<Arc<str>>,
/// Namespace of the element. `convert_start_event` in this file always
/// leaves this as `None`.
namespace: Option<String>,
/// `(key, unescaped value)` pairs for every attribute that is not an
/// `xmlns` / `xmlns:*` declaration, in document order.
attributes: Vec<(CompactString, CompactString)>,
/// Namespace declarations (`xmlns` and `xmlns:*` attributes) found on this tag.
namespace_decls: Vec<Namespace>,
/// Line reported by the position-tracking reader, sampled right after the
/// event was read. The parser in this file always supplies `Some`.
line: Option<usize>,
/// Column reported by the position-tracking reader, sampled right after
/// the event was read. The parser in this file always supplies `Some`.
column: Option<usize>,
},
/// A closing tag.
EndElement {
/// Name part of the tag as returned by `split_qname` (interned).
name: Arc<str>,
/// Prefix part of the tag as returned by `split_qname`, if it reported one.
prefix: Option<Arc<str>>,
},
/// Character data after unescaping. The parser does not emit this event
/// for empty text.
Text(String),
/// Content of a CDATA section, decoded as UTF-8 without unescaping.
CData(String),
/// Content of a comment, decoded as UTF-8.
Comment(String),
/// A processing instruction.
ProcessingInstruction {
/// Everything before the first whitespace character of the instruction.
target: String,
/// Trimmed remainder after the first whitespace character; `None` when
/// the instruction contains no whitespace.
content: Option<String>,
},
/// The XML declaration.
Declaration {
/// `version` value, or `None` if the reader could not provide it.
version: Option<String>,
/// `encoding` value, or `None` if absent or unreadable.
encoding: Option<String>,
/// `Some(true)` only when the `standalone` value is exactly `yes`;
/// `None` if absent or unreadable.
standalone: Option<bool>,
},
/// End of input. Dispatched once, just before parsing stops.
Eof,
}
/// Consumer of the [`XmlEvent`]s produced by a [`StreamingParser`].
///
/// Handlers are stored as `Box<dyn XmlEventHandler>` and receive every event
/// in the order the handlers were registered.
pub trait XmlEventHandler: Send + Any {
/// Processes a single event.
///
/// Returning an error stops dispatching to the remaining handlers and
/// makes `StreamingParser::parse` return that error.
fn handle(&mut self, event: &XmlEvent) -> Result<()>;
/// Called by `StreamingParser::parse` on each handler after the
/// [`XmlEvent::Eof`] event has been dispatched. The default does nothing.
fn finish(&mut self) -> Result<()> {
Ok(())
}
/// Converts the boxed handler into `Box<dyn Any>` so the caller can
/// downcast it back to its concrete type.
fn as_any(self: Box<Self>) -> Box<dyn Any>;
}
/// Event-driven XML parser that forwards each parsed event to a list of
/// registered [`XmlEventHandler`]s.
pub struct StreamingParser<R: BufRead> {
/// quick-xml reader over the position-tracking wrapper around the input.
reader: Reader<PositionTrackingReader<R>>,
/// Handlers in registration order; every event is dispatched to each of them.
handlers: Vec<Box<dyn XmlEventHandler>>,
/// Shares element names and prefixes between events.
interner: StringInterner,
}
impl<R: BufRead> StreamingParser<R> {
    /// Creates a parser over `reader` with no handlers registered.
    ///
    /// The input is wrapped in a [`PositionTrackingReader`] so line, column and
    /// byte offset can be queried. Text trimming is disabled and
    /// `expand_empty_elements` is enabled on the underlying quick-xml reader.
    pub fn new(reader: R) -> Self {
        let position_reader = PositionTrackingReader::new(reader);
        let mut xml_reader = Reader::from_reader(position_reader);
        xml_reader.config_mut().trim_text(false);
        xml_reader.config_mut().expand_empty_elements = true;
        Self {
            reader: xml_reader,
            handlers: Vec::new(),
            interner: StringInterner::new(),
        }
    }

    /// Line currently reported by the position-tracking reader.
    fn current_line(&self) -> usize {
        self.reader.get_ref().line()
    }

    /// Column currently reported by the position-tracking reader.
    fn current_column(&self) -> usize {
        self.reader.get_ref().column()
    }

    /// Registers `handler`; handlers receive events in registration order.
    pub fn add_handler(&mut self, handler: Box<dyn XmlEventHandler>) {
        self.handlers.push(handler);
    }

    /// Consumes the parser and returns the registered handlers, in
    /// registration order, so their results can be extracted.
    pub fn into_handlers(self) -> Vec<Box<dyn XmlEventHandler>> {
        self.handlers
    }

    /// Reads the input to the end, dispatching one [`XmlEvent`] per parsed
    /// item to every handler, then calls `finish` on each handler.
    ///
    /// DOCTYPE declarations are skipped and empty text is not reported.
    ///
    /// # Errors
    ///
    /// Returns an error when the reader fails (reported as
    /// `ParseError::AtPosition` with the current byte offset), when a name or
    /// payload is not valid UTF-8, when text cannot be unescaped, or when any
    /// handler's `handle` or `finish` fails. On error the remaining input is
    /// not read and `finish` is not called.
    pub fn parse(&mut self) -> Result<()> {
        let mut buffer = Vec::with_capacity(8 * 1024);
        loop {
            let event_result = self.reader.read_event_into(&mut buffer);
            // Sampled after the read, so this is the reader position once the
            // bytes of the event have been consumed.
            let line = self.current_line();
            let column = self.current_column();
            match event_result {
                Ok(Event::Start(ref e)) => {
                    let event = convert_start_event(e, line, column, &mut self.interner)?;
                    self.dispatch_event(&event)?;
                }
                // NOTE(review): with `expand_empty_elements` enabled in `new`
                // the reader is expected to report self-closing tags as
                // Start + End already; this arm is kept as a fallback.
                Ok(Event::Empty(ref e)) => {
                    let start_event = convert_start_event(e, line, column, &mut self.interner)?;
                    self.dispatch_event(&start_event)?;
                    if let XmlEvent::StartElement { name, prefix, .. } = &start_event {
                        let end_event = XmlEvent::EndElement {
                            name: Arc::clone(name),
                            prefix: prefix.clone(),
                        };
                        self.dispatch_event(&end_event)?;
                    }
                }
                Ok(Event::End(ref e)) => {
                    // Keep the `QName` in a local so its bytes can be borrowed
                    // directly instead of copying the tag name into a `Vec`.
                    let qname = e.name();
                    let full_name = std::str::from_utf8(qname.as_ref())?;
                    let (prefix, name) = crate::namespace::split_qname(full_name);
                    let event = XmlEvent::EndElement {
                        name: self.interner.intern(name),
                        prefix: prefix.map(|p| self.interner.intern(p)),
                    };
                    self.dispatch_event(&event)?;
                }
                Ok(Event::Text(ref e)) => {
                    let text = e.unescape().map_err(|e| {
                        crate::parser::error::ParseError::TextDecodeError {
                            message: e.to_string(),
                        }
                    })?;
                    if !text.is_empty() {
                        let event = XmlEvent::Text(text.into_owned());
                        self.dispatch_event(&event)?;
                    }
                }
                Ok(Event::CData(ref e)) => {
                    let text = std::str::from_utf8(e.as_ref())?;
                    let event = XmlEvent::CData(text.to_string());
                    self.dispatch_event(&event)?;
                }
                Ok(Event::Comment(ref e)) => {
                    let text = std::str::from_utf8(e.as_ref())?;
                    let event = XmlEvent::Comment(text.to_string());
                    self.dispatch_event(&event)?;
                }
                Ok(Event::PI(ref e)) => {
                    let content = std::str::from_utf8(e.as_ref())?;
                    // Target is everything before the first whitespace; the
                    // trimmed remainder (if any) is the instruction content.
                    let (target, pi_content) = match content.split_once(char::is_whitespace) {
                        Some((target, rest)) => (target, Some(rest.trim().to_string())),
                        None => (content, None),
                    };
                    let event = XmlEvent::ProcessingInstruction {
                        target: target.to_string(),
                        content: pi_content,
                    };
                    self.dispatch_event(&event)?;
                }
                Ok(Event::Decl(ref e)) => {
                    // Unreadable declaration fields are reported as `None`
                    // rather than failing the whole parse.
                    let version = e
                        .version()
                        .ok()
                        .map(|v| String::from_utf8_lossy(v.as_ref()).into_owned());
                    let encoding = e
                        .encoding()
                        .and_then(|r| r.ok())
                        .map(|v| String::from_utf8_lossy(v.as_ref()).into_owned());
                    let standalone = e
                        .standalone()
                        .and_then(|r| r.ok())
                        .map(|v| v.as_ref() == b"yes");
                    let event = XmlEvent::Declaration {
                        version,
                        encoding,
                        standalone,
                    };
                    self.dispatch_event(&event)?;
                }
                // DOCTYPE declarations are intentionally not surfaced.
                Ok(Event::DocType(_)) => {}
                Ok(Event::Eof) => {
                    let event = XmlEvent::Eof;
                    self.dispatch_event(&event)?;
                    break;
                }
                Err(e) => {
                    return Err(crate::parser::error::ParseError::AtPosition {
                        position: self.reader.get_ref().byte_offset() as u64,
                        message: e.to_string(),
                    }
                    .into());
                }
            }
            // Reuse the allocation for the next event.
            buffer.clear();
        }
        for handler in &mut self.handlers {
            handler.finish()?;
        }
        Ok(())
    }

    /// Passes `event` to every handler in registration order, stopping at the
    /// first handler that returns an error.
    fn dispatch_event(&mut self, event: &XmlEvent) -> Result<()> {
        self.handlers
            .iter_mut()
            .try_for_each(|handler| handler.handle(event))
    }
}
/// Builds an [`XmlEvent::StartElement`] from a quick-xml start (or empty) tag.
///
/// The tag name is split with `crate::namespace::split_qname` and both parts
/// are interned. `xmlns` and `xmlns:*` attributes become namespace
/// declarations; every other attribute is kept as a `(key, unescaped value)`
/// pair. `line` and `column` are stored on the event unchanged, and
/// `namespace` is left as `None`.
///
/// # Errors
///
/// Fails when the tag name or an attribute key is not valid UTF-8, when an
/// attribute is malformed, or when an attribute value cannot be unescaped
/// (`ParseError::AttributeDecodeError`).
fn convert_start_event(
    e: &quick_xml::events::BytesStart<'_>,
    line: usize,
    column: usize,
    interner: &mut StringInterner,
) -> Result<XmlEvent> {
    // Keep the `QName` in a local so its bytes can be borrowed directly
    // instead of copying the tag name into a temporary `Vec`.
    let qname = e.name();
    let full_name = std::str::from_utf8(qname.as_ref())?;
    let (prefix, name) = crate::namespace::split_qname(full_name);

    let mut namespace_decls = Vec::new();
    let mut attributes = Vec::new();
    for attr_result in e.attributes() {
        let attr = attr_result?;
        let key = std::str::from_utf8(attr.key.as_ref())?;
        let value = attr.unescape_value().map_err(|e| {
            crate::parser::error::ParseError::AttributeDecodeError {
                message: e.to_string(),
            }
        })?;
        if key == "xmlns" {
            namespace_decls.push(Namespace::default_ns(value.as_ref()));
        } else if let Some(ns_prefix) = key.strip_prefix("xmlns:") {
            namespace_decls.push(Namespace::new(ns_prefix, value.as_ref()));
        } else {
            attributes.push((
                CompactString::from(key),
                CompactString::from(value.as_ref()),
            ));
        }
    }

    Ok(XmlEvent::StartElement {
        name: interner.intern(name),
        prefix: prefix.map(|p| interner.intern(p)),
        namespace: None,
        attributes,
        namespace_decls,
        line: Some(line),
        column: Some(column),
    })
}
/// [`XmlEventHandler`] that stores a clone of every event it receives.
pub struct EventCollector {
/// Received events, in arrival order.
events: Vec<XmlEvent>,
}
impl EventCollector {
    /// Creates a collector that has not recorded any events.
    pub fn new() -> Self {
        let events = Vec::new();
        Self { events }
    }

    /// Borrows the events recorded so far.
    pub fn events(&self) -> &[XmlEvent] {
        self.events.as_slice()
    }

    /// Consumes the collector and returns the recorded events.
    pub fn into_events(self) -> Vec<XmlEvent> {
        let Self { events } = self;
        events
    }
}
impl Default for EventCollector {
fn default() -> Self {
Self::new()
}
}
impl XmlEventHandler for EventCollector {
    /// Appends a clone of `event` to the recorded list; never fails.
    fn handle(&mut self, event: &XmlEvent) -> Result<()> {
        let recorded = event.clone();
        self.events.push(recorded);
        Ok(())
    }

    /// Erases the concrete type so the caller can downcast the box back to
    /// `EventCollector`.
    fn as_any(self: Box<Self>) -> Box<dyn Any> {
        let erased: Box<dyn Any> = self;
        erased
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Start event with only a name set, for building fixtures.
    fn start(name: &str) -> XmlEvent {
        XmlEvent::StartElement {
            name: Arc::from(name),
            prefix: None,
            namespace: None,
            attributes: vec![],
            namespace_decls: vec![],
            line: Some(1),
            column: Some(1),
        }
    }

    /// End event with only a name set, for building fixtures.
    fn end(name: &str) -> XmlEvent {
        XmlEvent::EndElement {
            name: Arc::from(name),
            prefix: None,
        }
    }

    /// Parses a small document end to end and checks the events that reached
    /// the registered collector, not just that parsing succeeded.
    #[test]
    fn test_streaming_parser() {
        let xml = r#"<root attr="value"><child>text</child></root>"#;
        let mut parser = StreamingParser::new(xml.as_bytes());
        parser.add_handler(Box::new(EventCollector::new()));
        parser.parse().unwrap();

        // Recover the concrete collector from the type-erased handler list.
        let collector = parser
            .into_handlers()
            .into_iter()
            .next()
            .expect("one handler was registered")
            .as_any()
            .downcast::<EventCollector>()
            .expect("registered handler is an EventCollector");
        let events = collector.into_events();

        assert_eq!(events.len(), 6);
        match &events[0] {
            XmlEvent::StartElement {
                name,
                attributes,
                namespace_decls,
                ..
            } => {
                assert_eq!(&**name, "root");
                assert_eq!(attributes.len(), 1);
                assert_eq!(attributes[0].0.as_str(), "attr");
                assert_eq!(attributes[0].1.as_str(), "value");
                assert!(namespace_decls.is_empty());
            }
            other => panic!("expected StartElement for root, got {:?}", other),
        }
        assert!(matches!(
            &events[1],
            XmlEvent::StartElement { name, .. } if &**name == "child"
        ));
        assert!(matches!(&events[2], XmlEvent::Text(t) if t.as_str() == "text"));
        assert!(matches!(
            &events[3],
            XmlEvent::EndElement { name, .. } if &**name == "child"
        ));
        assert!(matches!(
            &events[4],
            XmlEvent::EndElement { name, .. } if &**name == "root"
        ));
        assert!(matches!(&events[5], XmlEvent::Eof));
    }

    /// Feeds events to the collector directly and checks they are kept in
    /// arrival order.
    #[test]
    fn test_event_collector() {
        let mut collector = EventCollector::new();
        collector.handle(&start("root")).unwrap();
        collector.handle(&start("child")).unwrap();
        collector.handle(&end("child")).unwrap();
        collector.handle(&end("root")).unwrap();
        collector.handle(&XmlEvent::Eof).unwrap();

        assert_eq!(collector.events().len(), 5);
        let events = collector.into_events();
        assert_eq!(events.len(), 5);
        assert!(matches!(
            &events[0],
            XmlEvent::StartElement { name, .. } if &**name == "root"
        ));
        assert!(matches!(
            &events[1],
            XmlEvent::StartElement { name, .. } if &**name == "child"
        ));
        assert!(matches!(
            &events[2],
            XmlEvent::EndElement { name, .. } if &**name == "child"
        ));
        assert!(matches!(
            &events[3],
            XmlEvent::EndElement { name, .. } if &**name == "root"
        ));
        assert!(matches!(&events[4], XmlEvent::Eof));
    }
}