xml_syntax_reader/visitor.rs
1use crate::types::Span;
2
3/// Trait for receiving fine-grained XML parsing events.
4///
5/// All `&[u8]` slices are references into the caller's buffer.
6/// Return `Ok(())` to continue parsing, or `Err(Self::Error)` to abort.
7///
8/// Default implementations do nothing and return `Ok(())`.
9///
10/// # Callback sequences
11///
12/// The parser emits callbacks in these patterns for each XML construct.
13/// `*` means zero or more, `+` means one or more, `|` means alternatives.
14///
15/// ## Start tag
16///
17/// ```text
18/// start_tag_open
19/// (attribute_name attribute-value-sequence attribute_end)*
20/// start_tag_close | empty_element_end
21/// ```
22///
23/// `<img src="a.png" alt="pic"/>`:
24/// ```text
25/// start_tag_open("img")
26/// attribute_name("src")
27/// attribute_value("a.png")
28/// attribute_end
29/// attribute_name("alt")
30/// attribute_value("pic")
31/// attribute_end
32/// empty_element_end
33/// ```
34///
35/// `<p>`:
36/// ```text
37/// start_tag_open("p")
38/// start_tag_close
39/// ```
40///
41/// ## Attribute value sequence
42///
43/// Between the quotes of a single attribute:
44/// ```text
45/// (attribute_value | attribute_entity_ref | attribute_char_ref)*
46/// ```
47///
48/// The value is segmented at entity/character reference boundaries and at
49/// buffer boundaries. Empty attribute values and values consisting solely
50/// of references produce zero `attribute_value` calls. `attribute_end`
51/// always fires exactly once per attribute, after the closing quote.
52///
53/// `class="a&b"`:
54/// ```text
55/// attribute_name("class")
56/// attribute_value("a")
57/// attribute_entity_ref("amp")
58/// attribute_value("b")
59/// attribute_end
60/// ```
61///
62/// `v="&"` (ref-only - no `attribute_value` calls):
63/// ```text
64/// attribute_name("v")
65/// attribute_entity_ref("amp")
66/// attribute_end
67/// ```
68///
69/// `v=""` (empty - no `attribute_value` calls):
70/// ```text
71/// attribute_name("v")
72/// attribute_end
73/// ```
74///
75/// ## End tag
76///
77/// `</div>`:
78/// ```text
79/// end_tag("div")
80/// ```
81///
82/// ## Text content
83///
84/// When present between markup:
85/// ```text
86/// (characters | entity_ref | char_ref)+
87/// ```
88///
89/// Not all elements have text content - e.g. `<p></p>` produces no text
90/// events between `start_tag_close` and `end_tag`. When text is present,
91/// it is segmented at entity/character reference boundaries and at buffer
92/// boundaries. There is no trailing `characters` call after a final
93/// reference.
94///
95/// `hello & world`:
96/// ```text
97/// characters("hello ")
98/// entity_ref("amp")
99/// characters(" world")
100/// ```
101///
102/// `<>` (references only - no `characters` calls):
103/// ```text
104/// entity_ref("lt")
105/// entity_ref("gt")
106/// ```
107///
108/// ## CDATA section
109///
110/// `cdata_start → cdata_content* → cdata_end`
111///
112/// `<![CDATA[hello]]>`:
113/// ```text
114/// cdata_start
115/// cdata_content("hello")
116/// cdata_end
117/// ```
118///
119/// `<![CDATA[]]>` (empty - no `cdata_content` call):
120/// ```text
121/// cdata_start
122/// cdata_end
123/// ```
124///
125/// ## Comment
126///
127/// `comment_start → comment_content* → comment_end`
128///
129/// `<!-- hi -->`:
130/// ```text
131/// comment_start
132/// comment_content(" hi ")
133/// comment_end
134/// ```
135///
136/// `<!---->` (empty - no `comment_content` call):
137/// ```text
138/// comment_start
139/// comment_end
140/// ```
141///
142/// ## Processing instruction
143///
144/// `pi_start → pi_content* → pi_end`
145///
146/// Leading whitespace between the target and content is consumed by the
147/// parser and not included in `pi_content`.
148///
149/// `<?pi data?>`:
150/// ```text
151/// pi_start("pi")
152/// pi_content("data")
153/// pi_end
154/// ```
155///
156/// `<?x?>` (no content - no `pi_content` call):
157/// ```text
158/// pi_start("x")
159/// pi_end
160/// ```
161///
162/// ## DOCTYPE declaration
163///
164/// `doctype_start → doctype_content* → doctype_end`
165///
166/// Content is opaque (not further parsed).
167///
168/// `<!DOCTYPE html [<!ENTITY foo "bar">]>`:
169/// ```text
170/// doctype_start("html")
171/// doctype_content(" [<!ENTITY foo \"bar\">]")
172/// doctype_end
173/// ```
174///
175/// `<!DOCTYPE html>` (no content - no `doctype_content` call):
176/// ```text
177/// doctype_start("html")
178/// doctype_end
179/// ```
180///
181/// ## XML declaration
182///
183/// A single `xml_declaration` call (never chunked).
184///
185/// `<?xml version="1.0" encoding="UTF-8"?>`:
186/// ```text
187/// xml_declaration(version="1.0", encoding=Some("UTF-8"), standalone=None)
188/// ```
189///
190/// For CDATA, comments, PIs, and DOCTYPE, the content callback may fire
191/// more than once when the content spans buffer boundaries.
192pub trait Visitor {
193 type Error;
194
195 // --- Element events ---
196
197 /// Start tag opened: `<name`.
198 /// `name` is the element name (may include a namespace prefix and `:`).
199 fn start_tag_open(&mut self, name: &[u8], span: Span) -> Result<(), Self::Error> {
200 let _ = (name, span);
201 Ok(())
202 }
203
204 /// Attribute name within a start tag.
205 fn attribute_name(&mut self, name: &[u8], span: Span) -> Result<(), Self::Error> {
206 let _ = (name, span);
207 Ok(())
208 }
209
210 /// Attribute value text (between entity/char ref boundaries or buffer boundaries).
211 /// The surrounding quotes are **not** included.
212 ///
213 /// Called zero or more times per attribute, segmented at entity/char
214 /// reference boundaries and buffer boundaries. Not called for empty
215 /// segments - an attribute whose value is empty or consists entirely
216 /// of references produces zero `attribute_value` calls.
217 fn attribute_value(&mut self, value: &[u8], span: Span) -> Result<(), Self::Error> {
218 let _ = (value, span);
219 Ok(())
220 }
221
222 /// End of an attribute value (the closing quote was consumed).
223 fn attribute_end(&mut self, span: Span) -> Result<(), Self::Error> {
224 let _ = span;
225 Ok(())
226 }
227
228 /// Entity reference in attribute value: `&name;`.
229 /// `name` is the entity name without `&` and `;`.
230 fn attribute_entity_ref(&mut self, name: &[u8], span: Span) -> Result<(), Self::Error> {
231 let _ = (name, span);
232 Ok(())
233 }
234
235 /// Character reference in attribute value: `&#NNN;` or `&#xHHH;`.
236 /// `value` is the raw text between `&#` and `;` (e.g. `"60"` or `"x3C"`).
237 fn attribute_char_ref(&mut self, value: &[u8], span: Span) -> Result<(), Self::Error> {
238 let _ = (value, span);
239 Ok(())
240 }
241
242 /// Start tag closed with `>`.
243 fn start_tag_close(&mut self, span: Span) -> Result<(), Self::Error> {
244 let _ = span;
245 Ok(())
246 }
247
248 /// Empty element closed with `/>`.
249 fn empty_element_end(&mut self, span: Span) -> Result<(), Self::Error> {
250 let _ = span;
251 Ok(())
252 }
253
254 /// End tag: `</name>`.
255 /// `name` is the element name (may include a namespace prefix and `:`).
256 fn end_tag(&mut self, name: &[u8], span: Span) -> Result<(), Self::Error> {
257 let _ = (name, span);
258 Ok(())
259 }
260
261 // --- Text events ---
262
263 /// Character data between markup.
264 ///
265 /// May be called multiple times for a single run of text content -
266 /// interleaved with [`entity_ref`](Self::entity_ref) and
267 /// [`char_ref`](Self::char_ref) calls at reference boundaries, and
268 /// split at buffer boundaries. For example, `a&b` produces
269 /// `characters("a")`, `entity_ref("amp")`, `characters("b")`.
270 ///
271 /// Each `text` slice is guaranteed to not split a multi-byte UTF-8 character
272 /// at its boundaries (except when `is_final` is true and the document ends
273 /// mid-sequence). If the input is valid UTF-8, `std::str::from_utf8(text)`
274 /// will always succeed.
275 fn characters(&mut self, text: &[u8], span: Span) -> Result<(), Self::Error> {
276 let _ = (text, span);
277 Ok(())
278 }
279
280 /// Entity reference in text content: `&name;`.
281 /// `name` is the entity name without `&` and `;`.
282 fn entity_ref(&mut self, name: &[u8], span: Span) -> Result<(), Self::Error> {
283 let _ = (name, span);
284 Ok(())
285 }
286
287 /// Character reference in text content: `&#NNN;` or `&#xHHH;`.
288 /// `value` is the raw text between `&#` and `;` (e.g. `"60"` or `"x3C"`).
289 fn char_ref(&mut self, value: &[u8], span: Span) -> Result<(), Self::Error> {
290 let _ = (value, span);
291 Ok(())
292 }
293
294 // --- CDATA ---
295
296 /// Start of a CDATA section: `<![CDATA[`.
297 fn cdata_start(&mut self, span: Span) -> Result<(), Self::Error> {
298 let _ = span;
299 Ok(())
300 }
301
302 /// Content within a CDATA section.
303 /// Called zero or more times for a single CDATA section - zero for
304 /// empty sections (`<![CDATA[]]>`), and possibly more than once when
305 /// content spans buffer boundaries. Consecutive calls have contiguous
306 /// spans.
307 fn cdata_content(&mut self, text: &[u8], span: Span) -> Result<(), Self::Error> {
308 let _ = (text, span);
309 Ok(())
310 }
311
312 /// End of a CDATA section: `]]>`.
313 fn cdata_end(&mut self, span: Span) -> Result<(), Self::Error> {
314 let _ = span;
315 Ok(())
316 }
317
318 // --- Comments ---
319
320 /// Start of a comment: `<!--`.
321 fn comment_start(&mut self, span: Span) -> Result<(), Self::Error> {
322 let _ = span;
323 Ok(())
324 }
325
326 /// Content within a comment.
327 /// Called zero or more times for a single comment - zero for empty
328 /// comments (`<!---->`), and possibly more than once when content
329 /// spans buffer boundaries. Consecutive calls have contiguous spans.
330 fn comment_content(&mut self, text: &[u8], span: Span) -> Result<(), Self::Error> {
331 let _ = (text, span);
332 Ok(())
333 }
334
335 /// End of a comment: `-->`.
336 fn comment_end(&mut self, span: Span) -> Result<(), Self::Error> {
337 let _ = span;
338 Ok(())
339 }
340
341 // --- XML Declaration ---
342
343 /// XML declaration: `<?xml version="1.0" encoding="UTF-8" standalone="yes"?>`.
344 ///
345 /// Fired instead of PI callbacks when `<?xml ...?>` appears at the document
346 /// start. Per the XML specification, the XML declaration is NOT a processing
347 /// instruction - it is a distinct construct.
348 ///
349 /// `version` is always present (e.g. `b"1.0"`).
350 /// `encoding` and `standalone` are optional.
351 fn xml_declaration(
352 &mut self,
353 version: &[u8],
354 encoding: Option<&[u8]>,
355 standalone: Option<bool>,
356 span: Span,
357 ) -> Result<(), Self::Error> {
358 let _ = (version, encoding, standalone, span);
359 Ok(())
360 }
361
362 // --- Processing Instructions ---
363
364 /// Start of a processing instruction: `<?target`.
365 /// `target` is the PI target name.
366 fn pi_start(&mut self, target: &[u8], span: Span) -> Result<(), Self::Error> {
367 let _ = (target, span);
368 Ok(())
369 }
370
371 /// Content of a processing instruction (everything between target and `?>`).
372 /// Called zero or more times for a single PI - zero when the PI has no
373 /// content (`<?target?>`), and possibly more than once when content
374 /// spans buffer boundaries. Consecutive calls have contiguous spans.
375 fn pi_content(&mut self, data: &[u8], span: Span) -> Result<(), Self::Error> {
376 let _ = (data, span);
377 Ok(())
378 }
379
380 /// End of a processing instruction: `?>`.
381 fn pi_end(&mut self, span: Span) -> Result<(), Self::Error> {
382 let _ = span;
383 Ok(())
384 }
385
386 // --- DOCTYPE ---
387
388 /// Start of a DOCTYPE declaration: `<!DOCTYPE name`.
389 /// `name` is the root element name.
390 fn doctype_start(&mut self, name: &[u8], span: Span) -> Result<(), Self::Error> {
391 let _ = (name, span);
392 Ok(())
393 }
394
395 /// Content within a DOCTYPE declaration (opaque).
396 /// Called zero or more times for a single DOCTYPE - zero for simple
397 /// declarations (`<!DOCTYPE html>`), and possibly more than once when
398 /// content spans buffer boundaries. Consecutive calls have contiguous
399 /// spans.
400 fn doctype_content(&mut self, content: &[u8], span: Span) -> Result<(), Self::Error> {
401 let _ = (content, span);
402 Ok(())
403 }
404
405 /// End of a DOCTYPE declaration: `>`.
406 fn doctype_end(&mut self, span: Span) -> Result<(), Self::Error> {
407 let _ = span;
408 Ok(())
409 }
410}