Skip to main content

xml_syntax_reader/
visitor.rs

1use crate::types::Span;
2
3/// Trait for receiving fine-grained XML parsing events.
4///
5/// All `&[u8]` slices are references into the caller's buffer.
6/// Return `Ok(())` to continue parsing, or `Err(Self::Error)` to abort.
7///
8/// Default implementations do nothing and return `Ok(())`.
9///
10/// # Callback sequences
11///
12/// The parser emits callbacks in these patterns for each XML construct.
13/// `*` means zero or more, `+` means one or more, `|` means alternatives.
14///
15/// ## Start tag
16///
17/// ```text
18/// start_tag_open
19///   (attribute_name  attribute-value-sequence  attribute_end)*
20///   start_tag_close | empty_element_end
21/// ```
22///
23/// `<img src="a.png" alt="pic"/>`:
24/// ```text
25/// start_tag_open("img")
26/// attribute_name("src")
27/// attribute_value("a.png")
28/// attribute_end
29/// attribute_name("alt")
30/// attribute_value("pic")
31/// attribute_end
32/// empty_element_end
33/// ```
34///
35/// `<p>`:
36/// ```text
37/// start_tag_open("p")
38/// start_tag_close
39/// ```
40///
41/// ## Attribute value sequence
42///
43/// Between the quotes of a single attribute:
44/// ```text
45/// (attribute_value | attribute_entity_ref | attribute_char_ref)*
46/// ```
47///
48/// The value is segmented at entity/character reference boundaries and at
49/// buffer boundaries. Empty attribute values and values consisting solely
50/// of references produce zero `attribute_value` calls. `attribute_end`
51/// always fires exactly once per attribute, after the closing quote.
52///
53/// `class="a&amp;b"`:
54/// ```text
55/// attribute_name("class")
56/// attribute_value("a")
57/// attribute_entity_ref("amp")
58/// attribute_value("b")
59/// attribute_end
60/// ```
61///
62/// `v="&amp;"` (ref-only - no `attribute_value` calls):
63/// ```text
64/// attribute_name("v")
65/// attribute_entity_ref("amp")
66/// attribute_end
67/// ```
68///
69/// `v=""` (empty - no `attribute_value` calls):
70/// ```text
71/// attribute_name("v")
72/// attribute_end
73/// ```
74///
75/// ## End tag
76///
77/// `</div>`:
78/// ```text
79/// end_tag("div")
80/// ```
81///
82/// ## Text content
83///
84/// When present between markup:
85/// ```text
86/// (characters | entity_ref | char_ref)+
87/// ```
88///
89/// Not all elements have text content - e.g. `<p></p>` produces no text
90/// events between `start_tag_close` and `end_tag`. When text is present,
91/// it is segmented at entity/character reference boundaries and at buffer
92/// boundaries. There is no trailing `characters` call after a final
93/// reference.
94///
95/// `hello &amp; world`:
96/// ```text
97/// characters("hello ")
98/// entity_ref("amp")
99/// characters(" world")
100/// ```
101///
102/// `&lt;&gt;` (references only - no `characters` calls):
103/// ```text
104/// entity_ref("lt")
105/// entity_ref("gt")
106/// ```
107///
108/// ## CDATA section
109///
110/// `cdata_start → cdata_content* → cdata_end`
111///
112/// `<![CDATA[hello]]>`:
113/// ```text
114/// cdata_start
115/// cdata_content("hello")
116/// cdata_end
117/// ```
118///
119/// `<![CDATA[]]>` (empty - no `cdata_content` call):
120/// ```text
121/// cdata_start
122/// cdata_end
123/// ```
124///
125/// ## Comment
126///
127/// `comment_start → comment_content* → comment_end`
128///
129/// `<!-- hi -->`:
130/// ```text
131/// comment_start
132/// comment_content(" hi ")
133/// comment_end
134/// ```
135///
136/// `<!---->` (empty - no `comment_content` call):
137/// ```text
138/// comment_start
139/// comment_end
140/// ```
141///
142/// ## Processing instruction
143///
144/// `pi_start → pi_content* → pi_end`
145///
146/// Leading whitespace between the target and content is consumed by the
147/// parser and not included in `pi_content`.
148///
149/// `<?pi data?>`:
150/// ```text
151/// pi_start("pi")
152/// pi_content("data")
153/// pi_end
154/// ```
155///
156/// `<?x?>` (no content - no `pi_content` call):
157/// ```text
158/// pi_start("x")
159/// pi_end
160/// ```
161///
162/// ## DOCTYPE declaration
163///
164/// `doctype_start → doctype_content* → doctype_end`
165///
166/// Content is opaque (not further parsed).
167///
168/// `<!DOCTYPE html [<!ENTITY foo "bar">]>`:
169/// ```text
170/// doctype_start("html")
171/// doctype_content(" [<!ENTITY foo \"bar\">]")
172/// doctype_end
173/// ```
174///
175/// `<!DOCTYPE html>` (no content - no `doctype_content` call):
176/// ```text
177/// doctype_start("html")
178/// doctype_end
179/// ```
180///
181/// ## XML declaration
182///
183/// A single `xml_declaration` call (never chunked).
184///
185/// `<?xml version="1.0" encoding="UTF-8"?>`:
186/// ```text
187/// xml_declaration(version="1.0", encoding=Some("UTF-8"), standalone=None)
188/// ```
189///
190/// For CDATA, comments, PIs, and DOCTYPE, the content callback may fire
191/// more than once when the content spans buffer boundaries.
192pub trait Visitor {
193    type Error;
194
195    // --- Element events ---
196
197    /// Start tag opened: `<name`.
198    /// `name` is the element name (may include a namespace prefix and `:`).
199    fn start_tag_open(&mut self, name: &[u8], span: Span) -> Result<(), Self::Error> {
200        let _ = (name, span);
201        Ok(())
202    }
203
204    /// Attribute name within a start tag.
205    fn attribute_name(&mut self, name: &[u8], span: Span) -> Result<(), Self::Error> {
206        let _ = (name, span);
207        Ok(())
208    }
209
210    /// Attribute value text (between entity/char ref boundaries or buffer boundaries).
211    /// The surrounding quotes are **not** included.
212    ///
213    /// Called zero or more times per attribute, segmented at entity/char
214    /// reference boundaries and buffer boundaries. Not called for empty
215    /// segments - an attribute whose value is empty or consists entirely
216    /// of references produces zero `attribute_value` calls.
217    fn attribute_value(&mut self, value: &[u8], span: Span) -> Result<(), Self::Error> {
218        let _ = (value, span);
219        Ok(())
220    }
221
222    /// End of an attribute value (the closing quote was consumed).
223    fn attribute_end(&mut self, span: Span) -> Result<(), Self::Error> {
224        let _ = span;
225        Ok(())
226    }
227
228    /// Entity reference in attribute value: `&name;`.
229    /// `name` is the entity name without `&` and `;`.
230    fn attribute_entity_ref(&mut self, name: &[u8], span: Span) -> Result<(), Self::Error> {
231        let _ = (name, span);
232        Ok(())
233    }
234
235    /// Character reference in attribute value: `&#NNN;` or `&#xHHH;`.
236    /// `value` is the raw text between `&#` and `;` (e.g. `"60"` or `"x3C"`).
237    fn attribute_char_ref(&mut self, value: &[u8], span: Span) -> Result<(), Self::Error> {
238        let _ = (value, span);
239        Ok(())
240    }
241
242    /// Start tag closed with `>`.
243    fn start_tag_close(&mut self, span: Span) -> Result<(), Self::Error> {
244        let _ = span;
245        Ok(())
246    }
247
248    /// Empty element closed with `/>`.
249    fn empty_element_end(&mut self, span: Span) -> Result<(), Self::Error> {
250        let _ = span;
251        Ok(())
252    }
253
254    /// End tag: `</name>`.
255    /// `name` is the element name (may include a namespace prefix and `:`).
256    fn end_tag(&mut self, name: &[u8], span: Span) -> Result<(), Self::Error> {
257        let _ = (name, span);
258        Ok(())
259    }
260
261    // --- Text events ---
262
263    /// Character data between markup.
264    ///
265    /// May be called multiple times for a single run of text content -
266    /// interleaved with [`entity_ref`](Self::entity_ref) and
267    /// [`char_ref`](Self::char_ref) calls at reference boundaries, and
268    /// split at buffer boundaries. For example, `a&amp;b` produces
269    /// `characters("a")`, `entity_ref("amp")`, `characters("b")`.
270    ///
271    /// Each `text` slice is guaranteed to not split a multi-byte UTF-8 character
272    /// at its boundaries (except when `is_final` is true and the document ends
273    /// mid-sequence). If the input is valid UTF-8, `std::str::from_utf8(text)`
274    /// will always succeed.
275    fn characters(&mut self, text: &[u8], span: Span) -> Result<(), Self::Error> {
276        let _ = (text, span);
277        Ok(())
278    }
279
280    /// Entity reference in text content: `&name;`.
281    /// `name` is the entity name without `&` and `;`.
282    fn entity_ref(&mut self, name: &[u8], span: Span) -> Result<(), Self::Error> {
283        let _ = (name, span);
284        Ok(())
285    }
286
287    /// Character reference in text content: `&#NNN;` or `&#xHHH;`.
288    /// `value` is the raw text between `&#` and `;` (e.g. `"60"` or `"x3C"`).
289    fn char_ref(&mut self, value: &[u8], span: Span) -> Result<(), Self::Error> {
290        let _ = (value, span);
291        Ok(())
292    }
293
294    // --- CDATA ---
295
296    /// Start of a CDATA section: `<![CDATA[`.
297    fn cdata_start(&mut self, span: Span) -> Result<(), Self::Error> {
298        let _ = span;
299        Ok(())
300    }
301
302    /// Content within a CDATA section.
303    /// Called zero or more times for a single CDATA section - zero for
304    /// empty sections (`<![CDATA[]]>`), and possibly more than once when
305    /// content spans buffer boundaries. Consecutive calls have contiguous
306    /// spans.
307    fn cdata_content(&mut self, text: &[u8], span: Span) -> Result<(), Self::Error> {
308        let _ = (text, span);
309        Ok(())
310    }
311
312    /// End of a CDATA section: `]]>`.
313    fn cdata_end(&mut self, span: Span) -> Result<(), Self::Error> {
314        let _ = span;
315        Ok(())
316    }
317
318    // --- Comments ---
319
320    /// Start of a comment: `<!--`.
321    fn comment_start(&mut self, span: Span) -> Result<(), Self::Error> {
322        let _ = span;
323        Ok(())
324    }
325
326    /// Content within a comment.
327    /// Called zero or more times for a single comment - zero for empty
328    /// comments (`<!---->`), and possibly more than once when content
329    /// spans buffer boundaries. Consecutive calls have contiguous spans.
330    fn comment_content(&mut self, text: &[u8], span: Span) -> Result<(), Self::Error> {
331        let _ = (text, span);
332        Ok(())
333    }
334
335    /// End of a comment: `-->`.
336    fn comment_end(&mut self, span: Span) -> Result<(), Self::Error> {
337        let _ = span;
338        Ok(())
339    }
340
341    // --- XML Declaration ---
342
343    /// XML declaration: `<?xml version="1.0" encoding="UTF-8" standalone="yes"?>`.
344    ///
345    /// Fired instead of PI callbacks when `<?xml ...?>` appears at the document
346    /// start. Per the XML specification, the XML declaration is NOT a processing
347    /// instruction - it is a distinct construct.
348    ///
349    /// `version` is always present (e.g. `b"1.0"`).
350    /// `encoding` and `standalone` are optional.
351    fn xml_declaration(
352        &mut self,
353        version: &[u8],
354        encoding: Option<&[u8]>,
355        standalone: Option<bool>,
356        span: Span,
357    ) -> Result<(), Self::Error> {
358        let _ = (version, encoding, standalone, span);
359        Ok(())
360    }
361
362    // --- Processing Instructions ---
363
364    /// Start of a processing instruction: `<?target`.
365    /// `target` is the PI target name.
366    fn pi_start(&mut self, target: &[u8], span: Span) -> Result<(), Self::Error> {
367        let _ = (target, span);
368        Ok(())
369    }
370
371    /// Content of a processing instruction (everything between target and `?>`).
372    /// Called zero or more times for a single PI - zero when the PI has no
373    /// content (`<?target?>`), and possibly more than once when content
374    /// spans buffer boundaries. Consecutive calls have contiguous spans.
375    fn pi_content(&mut self, data: &[u8], span: Span) -> Result<(), Self::Error> {
376        let _ = (data, span);
377        Ok(())
378    }
379
380    /// End of a processing instruction: `?>`.
381    fn pi_end(&mut self, span: Span) -> Result<(), Self::Error> {
382        let _ = span;
383        Ok(())
384    }
385
386    // --- DOCTYPE ---
387
388    /// Start of a DOCTYPE declaration: `<!DOCTYPE name`.
389    /// `name` is the root element name.
390    fn doctype_start(&mut self, name: &[u8], span: Span) -> Result<(), Self::Error> {
391        let _ = (name, span);
392        Ok(())
393    }
394
395    /// Content within a DOCTYPE declaration (opaque).
396    /// Called zero or more times for a single DOCTYPE - zero for simple
397    /// declarations (`<!DOCTYPE html>`), and possibly more than once when
398    /// content spans buffer boundaries. Consecutive calls have contiguous
399    /// spans.
400    fn doctype_content(&mut self, content: &[u8], span: Span) -> Result<(), Self::Error> {
401        let _ = (content, span);
402        Ok(())
403    }
404
405    /// End of a DOCTYPE declaration: `>`.
406    fn doctype_end(&mut self, span: Span) -> Result<(), Self::Error> {
407        let _ = span;
408        Ok(())
409    }
410}