xml_syntax_reader/lib.rs
1#![no_std]
2
3//! High-performance, zero-copy, streaming XML syntax reader.
4//!
5//! This crate tokenizes well-formed XML into fine-grained events (start tags,
6//! attributes, text, comments, etc.) delivered through a [`Visitor`] trait.
7//! It does not validate that element or attribute names are legal XML names,
8//! build a tree, resolve namespaces, or expand entity references.
9//!
10//! # Quick start
11//!
12//! Implement [`Visitor`] to receive events, then feed input to a [`Reader`]:
13//!
14//! ```
15//! use xml_syntax_reader::{Reader, Visitor, Span};
16//!
17//! struct Print;
18//! impl Visitor for Print {
19//! type Error = std::convert::Infallible;
20//! fn start_tag_open(&mut self, name: &[u8], _: Span) -> Result<(), Self::Error> {
21//! println!("element: {}", String::from_utf8_lossy(name));
22//! Ok(())
23//! }
24//! }
25//!
26//! let mut reader = Reader::new();
27//! reader.parse_slice(b"<hello/>", &mut Print).unwrap();
28//! ```
29//!
30//! For streaming use, call [`Reader::parse`] in a loop - it returns the number
31//! of bytes consumed so the caller can shift the buffer and append more data.
32//! [`parse_read`] (requires the `std` feature) wraps this loop for [`std::io::Read`] sources.
33//!
34//! # Encoding
35//!
36//! The parser operates on bytes and assumes UTF-8 input. Use
37//! [`probe_encoding`] to detect the transport encoding (BOM / XML
38//! declaration) and transcode if necessary before parsing.
39//!
40//! # Input limits
41//!
42//! The parser enforces hardcoded limits to prevent resource exhaustion:
43//!
44//! - **Names** (element, attribute, PI target, DOCTYPE, entity references):
45//! maximum **1,000 bytes**. Exceeding this produces [`ErrorKind::NameTooLong`].
46//!
47//! - **Character references**: maximum **7 bytes** for the value between
48//! `&#` and `;` (the longest valid reference is `&#x10FFFF;` or
49//! `&#1114111;`). Exceeding this produces [`ErrorKind::CharRefTooLong`].
50//!
51//! - **Text content, attribute values, and content bodies** (comments, CDATA
52//! sections, processing instructions, and DOCTYPE declarations) are all
53//! **streamed in chunks** at buffer boundaries. The visitor receives zero or
54//! more content calls with contiguous spans - zero for empty constructs
55//! (e.g. `<!---->`, `<?target?>`), and more than one when the body spans
56//! buffer boundaries. Text content (`characters`) is additionally
57//! interleaved with `entity_ref` / `char_ref` callbacks at reference
58//! boundaries. Attribute values are chunked at both buffer boundaries and
59//! entity/character reference boundaries, which produce separate
60//! `attribute_entity_ref` and `attribute_char_ref` callbacks. There is no
61//! size limit on any of these. See the [`Visitor`] trait documentation for
62//! the full callback sequences.
63
// The crate is `#![no_std]`; `std` is linked only when the `std` feature is
// enabled, and the `std`-dependent re-exports below are gated the same way.
64#[cfg(feature = "std")]
65extern crate std;
66
// Internal modules. Each of these is declared with `#[forbid(unsafe_code)]`,
// so `unsafe` cannot be used (or re-allowed) inside them.
67#[forbid(unsafe_code)]
68mod types;
69#[forbid(unsafe_code)]
70mod visitor;
71#[forbid(unsafe_code)]
72mod classify;
73#[forbid(unsafe_code)]
74mod state;
75#[forbid(unsafe_code)]
76mod reader;
77#[forbid(unsafe_code)]
78mod encoding;
79
// NOTE(review): `bitstream` is the only module declared without
// `#[forbid(unsafe_code)]` - presumably it needs `unsafe`; confirm, and keep
// any `unsafe` confined to this module.
80mod bitstream;
81
// Public API: core types, the `Visitor` trait, and the `Reader`.
82pub use types::{DeclaredEncoding, Encoding, Error, ErrorKind, ParseError, Span};
83pub use visitor::Visitor;
84pub use reader::Reader;
// Re-exported only when the `std` feature is enabled.
85#[cfg(feature = "std")]
86pub use reader::{parse_read, parse_read_with_capacity, ReadError};
// Encoding detection entry point and its result type.
87pub use encoding::{probe_encoding, ProbeResult};