use tokenizer::{XmlTokenizerOpts, XmlTokenizer};
use tree_builder::{TreeSink, XmlTreeBuilder, XmlTreeBuilderOpts};
use std::borrow::Cow;
use std::mem;
use encoding::{self, EncodingRef};
use tendril;
use tendril::{StrTendril, ByteTendril};
use tendril::stream::{TendrilSink, Utf8LossyDecoder, LossyDecoder};
/// Options for an XML parse, grouping the tokenizer and tree-builder settings.
#[derive(Clone, Default)]
pub struct XmlParseOpts {
/// Tokenizer settings; `parse_document` passes them to `XmlTokenizer::new`.
pub tokenizer: XmlTokenizerOpts,
/// Tree-builder settings; `parse_document` passes them to `XmlTreeBuilder::new`.
pub tree_builder: XmlTreeBuilderOpts,
}
pub fn parse_document<Sink>(sink: Sink, opts: XmlParseOpts) -> XmlParser<Sink>
where Sink: TreeSink {
let tb = XmlTreeBuilder::new(sink, opts.tree_builder);
let tok = XmlTokenizer::new(tb, opts.tokenizer);
XmlParser { tokenizer: tok}
}
/// An XML parser: a tokenizer whose token sink is a tree builder, which in
/// turn drives the user-supplied tree sink.
pub struct XmlParser<Sink: TreeSink> {
    /// The tokenizer at the head of the pipeline; the tree builder and the
    /// tree sink are reachable through it.
    pub tokenizer: XmlTokenizer<XmlTreeBuilder<Sink::Handle, Sink>>,
}
/// A value that can be produced from a tree sink.
// NOTE(review): no implementor or caller is visible in this file; presumably
// `get_result` is meant to run once parsing has finished — confirm.
pub trait ParseResult {
/// The sink type the result is built from; it must be default-constructible.
type Sink: TreeSink + Default;
/// Converts `sink` into the result value, consuming the sink.
fn get_result(sink: Self::Sink) -> Self;
}
/// Lets an `XmlParser` be driven as a sink of already-decoded text.
impl<Sink> TendrilSink<tendril::fmt::UTF8> for XmlParser<Sink>
where
    Sink: TreeSink,
{
    type Output = Sink::Output;

    /// Feeds one chunk of text to the tokenizer.
    fn process(&mut self, t: StrTendril) {
        self.tokenizer.feed(t)
    }

    /// Forwards `desc` to the tree sink as a parse error.
    fn error(&mut self, desc: Cow<'static, str>) {
        let tree_builder = self.tokenizer.sink_mut();
        tree_builder.sink_mut().parse_error(desc)
    }

    /// Signals end of input to the tokenizer, then unwraps the pipeline down
    /// to the tree sink and returns whatever the sink's `finish` produces.
    fn finish(mut self) -> Self::Output {
        self.tokenizer.end();
        let tree_builder = self.tokenizer.unwrap();
        let sink = tree_builder.unwrap();
        sink.finish()
    }
}
impl<Sink> XmlParser<Sink>
where
    Sink: TreeSink,
{
    /// Wraps this parser in a `Utf8LossyDecoder`, yielding a sink that is fed
    /// bytes instead of text.
    pub fn from_utf8(self) -> Utf8LossyDecoder<Self> {
        Utf8LossyDecoder::new(self)
    }

    /// Wraps this parser in a `BytesParser` that picks the character encoding
    /// itself, using `opts` when the input carries no byte-order mark.
    ///
    /// The returned parser starts in its initial state: nothing has been
    /// buffered and no decoder has been chosen.
    pub fn from_bytes(self, opts: BytesOpts) -> BytesParser<Sink> {
        let initial = BytesParserState::Initial { parser: self };
        BytesParser {
            state: initial,
            opts: opts,
        }
    }
}
/// Options for parsing from raw bytes.
#[derive(Clone, Default)]
pub struct BytesOpts {
/// Encoding used by `detect_encoding` when the input does not start with a
/// recognised byte-order mark. When `None`, detection falls back to UTF-8.
pub transport_layer_encoding: Option<EncodingRef>,
}
/// A byte-oriented front end for `XmlParser`: it buffers the first few input
/// bytes, chooses an encoding from them, and from then on decodes everything
/// it receives into the wrapped parser.
pub struct BytesParser<Sink: TreeSink> {
    /// Where the parser currently is in the buffer / detect / decode cycle.
    state: BytesParserState<Sink>,
    /// Caller-supplied options consulted during encoding detection.
    opts: BytesOpts,
}
/// Internal state machine of `BytesParser`.
enum BytesParserState<Sink: TreeSink> {
    /// Freshly created by `XmlParser::from_bytes`; no input seen yet.
    Initial {
        parser: XmlParser<Sink>,
    },

    /// Some bytes have arrived but fewer than `PRESCAN_BYTES`, so they are
    /// held back until an encoding can be chosen.
    Buffering {
        parser: XmlParser<Sink>,
        buffer: ByteTendril,
    },

    /// An encoding has been chosen; incoming bytes go straight to the decoder,
    /// which owns the parser.
    Parsing {
        decoder: LossyDecoder<XmlParser<Sink>>,
    },

    /// Placeholder left behind by `mem::replace` while the previous state is
    /// being taken apart; every `match` on the state treats it as unreachable.
    Transient,
}
impl<Sink> BytesParser<Sink>
where
    Sink: TreeSink,
{
    /// Borrows the wrapped text-level parser, wherever the current state
    /// happens to store it.
    pub fn str_parser(&self) -> &XmlParser<Sink> {
        match self.state {
            BytesParserState::Initial { ref parser } |
            BytesParserState::Buffering { ref parser, .. } => parser,
            BytesParserState::Parsing { ref decoder } => decoder.inner_sink(),
            BytesParserState::Transient => unreachable!(),
        }
    }

    /// Mutable counterpart of `str_parser`.
    pub fn str_parser_mut(&mut self) -> &mut XmlParser<Sink> {
        match self.state {
            BytesParserState::Initial { ref mut parser } |
            BytesParserState::Buffering { ref mut parser, .. } => parser,
            BytesParserState::Parsing { ref mut decoder } => decoder.inner_sink_mut(),
            BytesParserState::Transient => unreachable!(),
        }
    }

    /// Feeds already-decoded text to the wrapped parser, bypassing the byte
    /// decoder.
    ///
    /// An empty `t` is ignored and leaves the state untouched. Any non-empty
    /// text forces the parser into the `Parsing` state: from `Initial` the
    /// text is processed and a decoder is then set up with an empty buffer;
    /// from `Buffering` the pending bytes are decoded first so that they reach
    /// the parser ahead of `t`.
    pub fn process_unicode(&mut self, t: StrTendril) {
        if t.is_empty() {
            return;
        }

        // Fast path: a decoder already exists, so hand the text to its sink.
        if let BytesParserState::Parsing { ref mut decoder } = self.state {
            return decoder.inner_sink_mut().process(t);
        }

        let previous = mem::replace(&mut self.state, BytesParserState::Transient);
        match previous {
            BytesParserState::Initial { mut parser } => {
                parser.process(t);
                self.start_parsing(parser, ByteTendril::new())
            }
            BytesParserState::Buffering { parser, buffer } => {
                self.start_parsing(parser, buffer);
                // `start_parsing` always leaves the state as `Parsing`, so
                // this resolves to the decoder's inner sink.
                self.str_parser_mut().process(t)
            }
            BytesParserState::Parsing { .. } | BytesParserState::Transient => unreachable!(),
        }
    }

    /// Chooses an encoding from `buffer`, builds a decoder around `parser`,
    /// pushes `buffer` through it and switches to the `Parsing` state.
    fn start_parsing(&mut self, parser: XmlParser<Sink>, buffer: ByteTendril) {
        let mut decoder = LossyDecoder::new(detect_encoding(&buffer, &self.opts), parser);
        decoder.process(buffer);
        self.state = BytesParserState::Parsing { decoder: decoder }
    }
}
impl<Sink: TreeSink> TendrilSink<tendril::fmt::Bytes> for BytesParser<Sink> {
fn process(&mut self, t: ByteTendril) {
if let &mut BytesParserState::Parsing { ref mut decoder } = &mut self.state {
return decoder.process(t)
}
let (parser, buffer) = match mem::replace(&mut self.state, BytesParserState::Transient) {
BytesParserState::Initial{ parser } => (parser, t),
BytesParserState::Buffering { parser, mut buffer } => {
buffer.push_tendril(&t);
(parser, buffer)
}
BytesParserState::Parsing { .. } | BytesParserState::Transient => unreachable!(),
};
if buffer.len32() >= PRESCAN_BYTES {
self.start_parsing(parser, buffer)
} else {
self.state = BytesParserState::Buffering {
parser: parser,
buffer: buffer,
}
}
}
fn error(&mut self, desc: Cow<'static, str>) {
match self.state {
BytesParserState::Initial { ref mut parser } => parser.error(desc),
BytesParserState::Buffering { ref mut parser, .. } => parser.error(desc),
BytesParserState::Parsing { ref mut decoder } => decoder.error(desc),
BytesParserState::Transient => unreachable!(),
}
}
type Output = Sink::Output;
fn finish(self) -> Self::Output {
match self.state {
BytesParserState::Initial { parser } => parser.finish(),
BytesParserState::Buffering { parser, buffer } => {
let encoding = detect_encoding(&buffer, &self.opts);
let mut decoder = LossyDecoder::new(encoding, parser);
decoder.process(buffer);
decoder.finish()
},
BytesParserState::Parsing { decoder } => decoder.finish(),
BytesParserState::Transient => unreachable!(),
}
}
}
/// Minimum number of buffered bytes before `BytesParser` runs encoding
/// detection. Three bytes is the length of the longest byte-order mark that
/// `detect_encoding` checks for (`EF BB BF`).
const PRESCAN_BYTES: u32 = 3;
/// Picks the encoding used to decode a byte stream that begins with `bytes`.
///
/// Precedence, highest first:
/// 1. a byte-order mark at the start of `bytes` (`EF BB BF` selects UTF-8,
///    `FE FF` selects UTF-16BE, `FF FE` selects UTF-16LE);
/// 2. `opts.transport_layer_encoding`, when set;
/// 3. UTF-8.
fn detect_encoding(bytes: &ByteTendril, opts: &BytesOpts) -> EncodingRef {
    if bytes.starts_with(b"\xEF\xBB\xBF") {
        encoding::all::UTF_8
    } else if bytes.starts_with(b"\xFE\xFF") {
        encoding::all::UTF_16BE
    } else if bytes.starts_with(b"\xFF\xFE") {
        encoding::all::UTF_16LE
    } else if let Some(declared) = opts.transport_layer_encoding {
        declared
    } else {
        encoding::all::UTF_8
    }
}
#[cfg(test)]
mod tests {
    use rcdom::RcDom;
    use serialize::serialize;
    use tendril::TendrilSink;
    use super::*;

    /// Parses `input` through the UTF-8 front end with default options.
    fn parse_utf8(input: &str) -> RcDom {
        parse_document(RcDom::default(), XmlParseOpts::default())
            .from_utf8()
            .one(input.as_bytes())
    }

    /// Parses `input` through the encoding-detecting byte front end with
    /// default options.
    fn parse_bytes(input: &str) -> RcDom {
        parse_document(RcDom::default(), XmlParseOpts::default())
            .from_bytes(BytesOpts::default())
            .one(input.as_bytes())
    }

    /// Serializes the document of `dom` with default serializer options.
    fn to_xml_string(dom: &RcDom) -> String {
        let mut bytes = Vec::new();
        serialize(&mut bytes, &dom.document, Default::default()).unwrap();
        String::from_utf8(bytes).unwrap()
    }

    /// Asserts that `dom` serializes to the same text as the document obtained
    /// by parsing `text` through the byte front end.
    fn assert_eq_serialization(text: &'static str, dom: RcDom) {
        let serialized = to_xml_string(&dom);
        let reserialized = to_xml_string(&parse_bytes(text));
        assert_eq!(serialized, reserialized);
    }

    /// Asserts that `dom` serializes to exactly `text`.
    fn assert_serialization(text: &'static str, dom: RcDom) {
        assert_eq!(to_xml_string(&dom), text);
    }

    #[test]
    fn el_ns_serialize() {
        // NOTE(review): the parsed input closes with `</title>` while the
        // reference text closes with `</a:title>` — presumably deliberate to
        // exercise end-tag recovery; confirm.
        assert_eq_serialization(
            "<a:title xmlns:a=\"http://www.foo.org/\" value=\"test\">Test</a:title>",
            parse_utf8("<a:title xmlns:a=\"http://www.foo.org/\" value=\"test\">Test</title>"));
    }

    #[test]
    fn nested_ns_serialize() {
        assert_eq_serialization(
            "<a:x xmlns:a=\"http://www.foo.org/\" xmlns:b=\"http://www.bar.org/\" value=\"test\"><b:y/></a:x>",
            parse_utf8("<a:x xmlns:a=\"http://www.foo.org/\" xmlns:b=\"http://www.bar.org/\" value=\"test\"><b:y/></a:x>"));
    }

    #[test]
    fn def_ns_serialize() {
        assert_eq_serialization(
            "<table xmlns=\"html4\"><td></td></table>",
            parse_utf8("<table xmlns=\"html4\"><td></td></table>"));
    }

    #[test]
    fn undefine_ns_serialize() {
        // NOTE(review): `</a:y` has no closing `>` in either string; both
        // sides parse the same malformed text, so the comparison still holds.
        // Confirm whether the malformed tag is intentional.
        assert_eq_serialization(
            "<a:x xmlns:a=\"http://www.foo.org\"><a:y xmlns:a=\"\"><a:z/></a:y</a:x>",
            parse_utf8("<a:x xmlns:a=\"http://www.foo.org\"><a:y xmlns:a=\"\"><a:z/></a:y</a:x>"));
    }

    #[test]
    fn redefine_default_ns_serialize() {
        // NOTE(review): `</y` has no closing `>` in either string — same
        // caveat as in `undefine_ns_serialize`.
        assert_eq_serialization(
            "<x xmlns=\"http://www.foo.org\"><y xmlns=\"\"><z/></y</x>",
            parse_utf8("<x xmlns=\"http://www.foo.org\"><y xmlns=\"\"><z/></y</x>"));
    }

    #[test]
    fn attr_serialize() {
        assert_serialization(
            "<title value=\"test\">Test</title>",
            parse_utf8("<title value='test'>Test"));
    }

    #[test]
    fn from_utf8() {
        assert_serialization(
            "<title>Test</title>",
            parse_utf8("<title>Test"));
    }

    #[test]
    fn from_bytes_one() {
        assert_serialization(
            "<title>Test</title>",
            parse_bytes("<title>Test"));
    }
}