1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
/*!
# Restricted XML parsing and encoding

This crate provides "restricted" parsing and encoding of XML 1.0 documents
with namespacing.

## Features (some call them restrictions)

* No external resources
* No custom entities
* No DTD whatsoever
* No processing instructions
* No comments
* UTF-8 only
* Namespacing-well-formedness enforced
* XML 1.0 only
* Streamed parsing (parser emits a subset of SAX events)
* Streamed encoding
* Parser can be driven push- and pull-based
* Tokio-based asynchronicity supported via the `async` feature and [`AsyncParser`].

## Example

```
use rxml::EventRead;
let doc = b"<?xml version='1.0'?><hello>World!</hello>";
let mut fp = rxml::FeedParser::new();
fp.feed(doc.to_vec());
fp.feed_eof();
let result = fp.read_all_eof(|ev| {
	println!("got event: {:?}", ev);
});
// true indicates eof
assert_eq!(result.unwrap(), true);
```

## High-level parser usage

### Push-based usage

The [`FeedParser`] allows to push bits of XML into the parser as they arrive
in the application and process the resulting [`Event`]s as they happen.

### Pull-based usage

If the parser should block while waiting for more data to arrive, a
[`PullParser`] can be used instead. The `PullParser` requires a source which
implements [`io::BufRead`].

### Usage with Tokio

Tokio is supported with the `async` feature. It offers the [`AsyncParser`]
and the [`AsyncEventRead`] trait, which work similar to the `PullParser`.
Instead of blocking, however, the async parser will yield control to other
tasks.
*/
#[allow(unused_imports)]
use std::io;

mod bufq;
mod context;
mod errctx;
pub mod error;
pub mod lexer;
pub mod parser;
pub mod strings;
pub mod writer;

#[cfg(test)]
pub mod tests;

#[doc(inline)]
pub use bufq::BufferQueue;
pub use context::Context;
#[doc(inline)]
pub use error::{Error, Result};
#[doc(inline)]
pub use lexer::{Lexer, LexerOptions};
#[doc(inline)]
pub use parser::{Event, LexerAdapter, Parser, QName, XMLVersion, XMLNS_XML};
pub use strings::{CData, CDataStr, NCName, NCNameStr, Name, NameStr};
#[doc(inline)]
pub use writer::{Encoder, Item};

#[cfg(feature = "macros")]
#[doc(inline)]
#[cfg_attr(docsrs, doc(cfg(feature = "macros")))]
pub use rxml_proc::{xml_cdata, xml_name, xml_ncname};

#[cfg(feature = "async")]
mod future;

#[cfg(feature = "async")]
#[doc(inline)]
#[cfg_attr(docsrs, doc(cfg(feature = "async")))]
pub use future::{AsyncEventRead, AsyncEventReadExt, AsyncParser};

/// Package version
pub const VERSION: &'static str = env!("CARGO_PKG_VERSION");

/**
# Source for individual XML events

This trait is implemented by the different parser frontends. It is analogous
to the [`std::io::Read`] trait, but for [`Event`]s instead of bytes.
*/
pub trait EventRead {
	/// Read a single event from the parser.
	///
	/// If the EOF has been reached with a valid document, `None` is returned.
	///
	/// I/O errors may be retried, all other errors are fatal (and will be
	/// returned again by the parser on the next invocation without reading
	/// further data from the source).
	fn read(&mut self) -> Result<Option<Event>>;

	/// Read all events which can be produced from the data source (at this
	/// point in time).
	///
	/// The given `cb` is invoked for each event.
	///
	/// I/O errors may be retried, all other errors are fatal (and will be
	/// returned again by the parser on the next invocation without reading
	/// further data from the source).
	fn read_all<F>(&mut self, mut cb: F) -> Result<()>
	where
		F: FnMut(Event) -> (),
	{
		loop {
			match self.read()? {
				None => return Ok(()),
				Some(ev) => cb(ev),
			}
		}
	}

	/// Read all events which can be produced from the data source (at this
	/// point in time).
	///
	/// The given `cb` is invoked for each event.
	///
	/// If the data source indicates that it needs to block to read further
	/// data, `false` is returned. If the EOF is reached successfully, `true`
	/// is returned.
	///
	/// I/O errors may be retried, all other errors are fatal (and will be
	/// returned again by the parser on the next invocation without reading
	/// further data from the source).
	fn read_all_eof<F>(&mut self, cb: F) -> Result<bool>
	where
		F: FnMut(Event) -> (),
	{
		as_eof_flag(self.read_all(cb))
	}
}

/**
# Non-blocking parsing

The [`FeedParser`] allows parsing XML documents as they arrive in the
application, giving back control to the caller immediately when not enough
data is available for processing. This is especially useful when streaming
data from sockets.

To read events from the `FeedParser` after feeding data, use its [`EventRead`]
trait.

## Example

```
use rxml::{FeedParser, Error, Event, XMLVersion, EventRead};
use std::io;
let doc = b"<?xml version='1.0'?><hello>World!</hello>";
let mut fp = FeedParser::new();
fp.feed(doc[..10].to_vec());
// We expect a WouldBlock, because the XML declaration is not complete yet
let ev = fp.read();
assert!(matches!(
	ev.err().unwrap(),
	Error::IO(e) if e.kind() == io::ErrorKind::WouldBlock
));

fp.feed(doc[10..25].to_vec());
// Now we passed the XML declaration (and some), so we expect a corresponding
// event
let ev = fp.read();
assert!(matches!(ev.unwrap().unwrap(), Event::XMLDeclaration(_, XMLVersion::V1_0)));
```
*/
pub struct FeedParser<'x> {
	token_source: LexerAdapter<BufferQueue<'x>>,
	parser: Parser,
}

/// Convert end-of-file-ness of a result to a boolean flag.
///
/// If the result is ok, return true (EOF). If the result is not ok, but the
/// error is an I/O error indicating that the data source would have to block
/// to read further data, return false ("Ok, but not at eof yet").
///
/// All other errors are passed through.
pub fn as_eof_flag(r: Result<()>) -> Result<bool> {
	match r {
		Err(Error::IO(ioerr)) if ioerr.kind() == io::ErrorKind::WouldBlock => Ok(false),
		Err(e) => Err(e),
		Ok(()) => Ok(true),
	}
}

impl<'x> FeedParser<'x> {
	/// Create a new default `FeedParser`.
	pub fn new() -> FeedParser<'x> {
		Self::with_context(parser::RcPtr::new(Context::new()))
	}

	pub fn with_context(ctx: parser::RcPtr<Context>) -> FeedParser<'x> {
		FeedParser {
			token_source: LexerAdapter::new(Lexer::new(), BufferQueue::new()),
			parser: Parser::with_context(ctx),
		}
	}

	/// Feed a chunck of data to the parser.
	///
	/// This enqueues the data for processing, but does not process it right
	/// away.
	///
	/// To process data, call [`FeedParser::read()`] or
	/// [`FeedParser::read_all()`].
	///
	/// # Panics
	///
	/// If [`FeedParser::feed_eof()`] has been called before.
	pub fn feed<'a: 'x, T: Into<std::borrow::Cow<'a, [u8]>>>(&mut self, data: T) {
		self.token_source.get_mut().push(data);
	}

	/// Feed the eof marker to the parser.
	///
	/// This is a prerequisite for parsing to terminate with an eof signal
	/// (returning `true`). Otherwise, `false` will be returned indefinitely
	/// without emitting any events.
	///
	/// After the eof marker has been fed to the parser, no further data can
	/// be fed.
	pub fn feed_eof(&mut self) {
		self.token_source.get_mut().push_eof();
	}

	/// Return the amount of bytes which have not been read from the buffer
	/// yet.
	///
	/// This may not reflect the amount of memory used by the buffer
	/// accurately, as memory is only released when an entire chunk (as fed
	/// to `feed()`) has been processed (and only if that chunk is owned by
	/// the parser).
	pub fn buffered(&self) -> usize {
		self.token_source.get_ref().len()
	}

	/// Return a reference to the internal buffer BufferQueue
	///
	/// This can be used to force dropping of all memory in case of error
	/// conditions.
	pub fn get_buffer_mut(&mut self) -> &mut BufferQueue<'x> {
		self.token_source.get_mut()
	}

	/// Release all temporary buffers
	///
	/// This is sensible to call when it is expected that no more data will be
	/// processed by the parser for a while and the memory is better used
	/// elsewhere.
	pub fn release_temporaries(&mut self) {
		self.token_source.get_lexer_mut().release_temporaries();
		self.parser.release_temporaries();
	}
}

impl EventRead for FeedParser<'_> {
	/// Read a single event from the parser.
	///
	/// If the EOF has been reached with a valid document, `None` is returned.
	///
	/// If the buffered data is not sufficient to create an event, an I/O
	/// error of [`std::io::ErrorKind::WouldBlock`] is returned.
	///
	/// I/O errors may be retried, all other errors are fatal (and will be
	/// returned again by the parser on the next invocation without reading
	/// further data from the source).
	fn read(&mut self) -> Result<Option<Event>> {
		self.parser.parse(&mut self.token_source)
	}
}

/**
# Blocking parsing

The [`PullParser`] allows parsing XML documents from a [`io::Read`]
blockingly. The parser will block until the backing [`io::Read`] has enough
data available (or returns an error).

Interaction with a `PullParser` should happen exclusively via the
[`EventRead`] trait.

## Blocking I/O

If the [`PullParser`] is used with blocking I/O and a source which may block for a significant amount of time (e.g. a network socket), some events may be emitted with significant delay. This is due to an edge case where the lexer may emit a token without consuming a byte from the source.

This internal state of the lexer is not observable from the outside, but it affects most importantly closing element tags. In practice, this means that the last closing element tag of a "stanza" of XML is only going to be emitted once the first byte of the next stanza has been made available through the BufRead.

This only affects blocking I/O, because a non-blocking source will return [`std::io::ErrorKind::WouldBlock`] from the read call and yield control back to the parser to emit the event.

In general, for networked operations, it is recommended to use the [`FeedParser`] or [`AsyncParser`] instead of the [`PullParser`].

## Example

```
use rxml::{PullParser, Error, Event, XMLVersion, EventRead};
use std::io;
use std::io::BufRead;
let mut doc = &b"<?xml version='1.0'?><hello>World!</hello>"[..];
// this converts the doc into an io::BufRead
let mut pp = PullParser::new(&mut doc);
// we expect the first event to be the XML declaration
let ev = pp.read();
assert!(matches!(ev.unwrap().unwrap(), Event::XMLDeclaration(_, XMLVersion::V1_0)));
```
*/
pub struct PullParser<T: io::BufRead> {
	parser: Parser,
	token_source: LexerAdapter<T>,
}

impl<T: io::BufRead> PullParser<T> {
	/// Create a new parser with default options, wrapping the given reader.
	pub fn new(inner: T) -> Self {
		Self::with_options(inner, LexerOptions::default())
	}

	/// Create a new parser while configuring the lexer with the given
	/// options.
	pub fn with_options(inner: T, options: LexerOptions) -> Self {
		Self::wrap(inner, Lexer::with_options(options), Parser::new())
	}

	/// Create a fully customized parser from a lexer and a parser component.
	pub fn wrap(inner: T, lexer: Lexer, parser: Parser) -> Self {
		Self {
			token_source: LexerAdapter::new(lexer, inner),
			parser,
		}
	}
}

impl<T: io::BufRead> EventRead for PullParser<T> {
	/// Read a single event from the parser.
	///
	/// If the EOF has been reached with a valid document, `None` is returned.
	///
	/// All I/O errors from the source are passed on without modification.
	///
	/// I/O errors may be retried, all other errors are fatal (and will be
	/// returned again by the parser on the next invocation without reading
	/// further data from the source).
	fn read(&mut self) -> Result<Option<Event>> {
		self.parser.parse(&mut self.token_source)
	}
}