rxml_validation/
lib.rs

1#![deny(missing_docs)]
2#![cfg_attr(not(feature = "std"), no_std)]
3#![cfg_attr(docsrs, feature(doc_cfg))]
4/*!
5# Strongly-typed strings for use with XML 1.0 documents
6
7This crate defines various string- and str-like types which represent pieces
8of text as they may occur in XML documents. These types are checked to contain
9only text which conforms to the respective grammar in the XML specifications.
10
This allows information about checks which were already performed by the
parser to be carried over to the application, avoiding the need to execute
the same checks multiple times.
14
15This is a supplementary crate for [`rxml`](https://docs.rs/rxml). It is
16factored out of the main crate to support
17[`rxml_proc`](https://docs.rs/rxml_proc), a crate of macros which allow
18compile-time validation and typing of XML strings. All types defined in this
19crate are re-exported in `rxml`; if you depend on `rxml`, you can use the
20types from there directly.
21
22If the `std` feature is *not* enabled (it is enabled by default), this crate
23can be used in `no_std` environments.
24
25## Type Overview
26
27- [`Name`] and [`NameStr`] represent the `Name` production and can be used
28  for element and attribute names before namespace prefix expansion.
29- [`NcName`] and [`NcNameStr`] represent the `Name` production but without a
30  colon inside; they are used for localnames after prefix expansion and to
31  carry the prefixes themselves.
32
33## Construction
34
35In general, values are constructed using the [`std::convert::TryInto`]
36trait, from other string types or `str`. Supported source types are:
37
38* [`String`] (copies)
39* [`compact_str::CompactString`] (moves)
40* [`str`] (copies for all types except the slice types)
41
42**Note:** If the `compact_str` feature is *not* enabled, all string types use
43the normal [`std::string::String`] type instead.
44
In addition, converting from [`NcName`] to [`Name`] requires no extra
checking and is thus available through `.into()` (and likewise for the
corresponding str types).
48
49The inverse directions are only available through `try_into`.
50
51## When to use rxml_validation vs. rxml?
52
53You should use this crate (`rxml_validation`) whenever you only need to
54validate strings against rules present in XML, without actually parsing or
55serialising XML data. In that case, this crate is a much lighter choice and
56it can be used in `no_std` environments.
57*/
58use core::fmt;
59
60mod strings;
61
62pub mod selectors;
63
64use selectors::CharSelector;
65
66#[doc(inline)]
67#[cfg(feature = "std")]
68#[cfg_attr(docsrs, doc(cfg(feature = "std")))]
69pub use strings::{CompactString, Name, NcName};
70#[doc(inline)]
71pub use strings::{NameStr, NcNameStr};
72
/**
Error condition from validating an XML string.

The type is small and `Copy`; it can be compared, hashed and passed around
by value.
*/
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Error {
	/// A Name or NCName was empty.
	EmptyName,

	/// An invalid character was encountered.
	///
	/// This variant contains the character as data.
	InvalidChar(char),

	/// One side of the colon in a name was empty.
	EmptyNamePart,

	/// More than one colon encountered in a name.
	MultiColonName,

	/// Local name does not conform to Name production (invalid start char)
	InvalidLocalName,
}
99
100impl fmt::Display for Error {
101	fn fmt<'f>(&self, f: &'f mut fmt::Formatter) -> fmt::Result {
102		match self {
103			Self::EmptyName => f.write_str("Name and NCName must not be empty"),
104			Self::InvalidChar(c) => write!(f, "character U+{:04x} is not allowed", *c as u32),
105			Self::EmptyNamePart => f.write_str("empty string on one side of the colon"),
106			Self::MultiColonName => f.write_str("more than one colon"),
107			Self::InvalidLocalName => f.write_str("local name is invalid"),
108		}
109	}
110}
111
// `std::error::Error` lives in `std`, so this impl is only provided when the
// `std` feature is enabled; the `doc(cfg)` attribute makes that visible in
// the rendered documentation.
#[cfg(feature = "std")]
#[cfg_attr(docsrs, doc(cfg(feature = "std")))]
impl std::error::Error for Error {}
114
115/**
116Check whether a str is a valid XML 1.0 Name
117
118# Example
119
120```rust
121use rxml_validation::{validate_name, Error};
122
123assert!(validate_name("foobar").is_ok());
124assert!(validate_name("foo:bar").is_ok());
125assert!(matches!(validate_name("foo bar"), Err(Error::InvalidChar(' '))));
126assert!(matches!(validate_name(""), Err(Error::EmptyName)));
127*/
128pub fn validate_name(s: &str) -> Result<(), Error> {
129	let mut chars = s.chars();
130	match chars.next() {
131		// must have at least one char
132		None => return Err(Error::EmptyName),
133		Some(c) => {
134			if !selectors::CLASS_XML_NAMESTART.select(c) {
135				return Err(Error::InvalidChar(c));
136			}
137		}
138	}
139	for ch in chars {
140		if !selectors::CLASS_XML_NAME.select(ch) {
141			return Err(Error::InvalidChar(ch));
142		}
143	}
144	Ok(())
145}
146
147/**
148Check whether a str is a valid XML 1.0 Name, without colons.
149
150# Example
151
152```rust
153use rxml_validation::{validate_ncname, Error};
154
155assert!(validate_ncname("foobar").is_ok());
156assert!(matches!(validate_ncname("foo:bar"), Err(Error::InvalidChar(':'))));
157assert!(matches!(validate_ncname(""), Err(Error::EmptyName)));
158*/
159pub fn validate_ncname(s: &str) -> Result<(), Error> {
160	let mut chars = s.chars();
161	match chars.next() {
162		// must have at least one char
163		None => return Err(Error::EmptyName),
164		Some(c) => {
165			if !selectors::CLASS_XML_NAMESTART.select(c) || c == ':' {
166				return Err(Error::InvalidChar(c));
167			}
168		}
169	}
170	for ch in chars {
171		if !selectors::CLASS_XML_NAME.select(ch) || ch == ':' {
172			return Err(Error::InvalidChar(ch));
173		}
174	}
175	Ok(())
176}
177
178/**
179Check whether a str is valid XML 1.0 CData.
180
181There exists no specific string type for CData, because it is almost identical
182to Rust strings and encoding even a validated CData string into XML requires
183extra steps because it may contain characters which need escaping when written
184into an XML document.
185
186# Example
187
188```rust
189use rxml_validation::{validate_cdata, Error};
190
191assert!(validate_cdata("foo bar baz <fnord!>").is_ok());
192assert!(matches!(validate_cdata("\x01"), Err(Error::InvalidChar('\x01'))));
193*/
194pub fn validate_cdata(s: &str) -> Result<(), Error> {
195	let s = s.as_bytes();
196	for i in 0..s.len() {
197		let b = s[i];
198		if b < 0x09 || b == 0x0b || b == 0x0c || (b >= 0x0e && b <= 0x1f) {
199			return Err(Error::InvalidChar(b.into()));
200		}
201		if b == 0xbe || b == 0xbf {
202			if i >= 2 && s[i - 2] == 0xef && s[i - 1] == 0xbf {
203				// U+FFFE or U+FFFF
204				let bit = (b & 0x01) as u32;
205				// SAFETY: we are passing only 16 bits and the upper
206				// nibble is set to all ones, so this is within the bounds
207				// of a unicode code point and not a surrogate.
208				let ch = unsafe { char::from_u32_unchecked(0xfffe | bit) };
209				return Err(Error::InvalidChar(ch));
210			}
211		}
212	}
213	Ok(())
214}
215
#[cfg(test)]
mod tests {
	use super::*;

	#[test]
	fn test_cdata_smoketest() {
		let accepted = validate_cdata("foo bar baz http://<xyz>");
		assert!(accepted.is_ok(), "unexpected rejection: {:?}", accepted);

		let rejected = validate_cdata("\u{ffff}");
		assert!(rejected.is_err(), "U+FFFF must be rejected");
	}

	#[test]
	fn test_name_smoketest() {
		for &good in &["foobar", "foo:bar"] {
			assert!(validate_name(good).is_ok(), "{:?} must be a valid Name", good);
		}
		for &bad in &["", "foo bar baz http://<xyz>", "\u{ffff}"] {
			assert!(validate_name(bad).is_err(), "{:?} must not be a valid Name", bad);
		}
	}

	#[test]
	fn test_ncname_smoketest() {
		for &good in &["foobar"] {
			assert!(validate_ncname(good).is_ok(), "{:?} must be a valid NCName", good);
		}
		for &bad in &["foo:bar", "", "foo bar baz http://<xyz>", "\u{ffff}"] {
			assert!(validate_ncname(bad).is_err(), "{:?} must not be a valid NCName", bad);
		}
	}

	/// Exhaustively compares `validate_cdata` against
	/// `selectors::CLASS_XML_NONCHAR` for every single-character string.
	#[test]
	fn test_validate_cdata_is_equivalent_to_nonchar_class() {
		let mut buf = String::with_capacity(4);
		// `from_u32` yields `None` for the surrogate range, which cannot
		// occur in a `str` anyway.
		for ch in (0x0..=0x10ffffu32).filter_map(std::char::from_u32) {
			buf.clear();
			buf.push(ch);
			match (selectors::CLASS_XML_NONCHAR.select(ch), validate_cdata(&buf)) {
				// Rejected by both: the reported character must be the input.
				(true, Err(Error::InvalidChar(reported))) => assert_eq!(reported, ch),
				(true, other) => panic!("validate_cdata accepts {:?} (ch={:?}) which is rejected by CLASS_XML_NONCHAR: {:?}", buf, ch, other),
				// Accepted by both.
				(false, Ok(())) => (),
				(false, other) => panic!("validate_cdata rejects {:?} (ch={:?}) which is accepted by CLASS_XML_NONCHAR: {:?}", buf, ch, other),
			}
		}
	}
}