rxml_validation/lib.rs
1#![deny(missing_docs)]
2#![cfg_attr(not(feature = "std"), no_std)]
3#![cfg_attr(docsrs, feature(doc_cfg))]
4/*!
5# Strongly-typed strings for use with XML 1.0 documents
6
7This crate defines various string- and str-like types which represent pieces
8of text as they may occur in XML documents. These types are checked to contain
9only text which conforms to the respective grammar in the XML specifications.
10
This allows information about checks which were already performed by the
parser to be carried over to the application, avoiding the need to execute
those checks multiple times.
14
15This is a supplementary crate for [`rxml`](https://docs.rs/rxml). It is
16factored out of the main crate to support
17[`rxml_proc`](https://docs.rs/rxml_proc), a crate of macros which allow
18compile-time validation and typing of XML strings. All types defined in this
19crate are re-exported in `rxml`; if you depend on `rxml`, you can use the
20types from there directly.
21
22If the `std` feature is *not* enabled (it is enabled by default), this crate
23can be used in `no_std` environments.
24
25## Type Overview
26
27- [`Name`] and [`NameStr`] represent the `Name` production and can be used
28 for element and attribute names before namespace prefix expansion.
29- [`NcName`] and [`NcNameStr`] represent the `Name` production but without a
30 colon inside; they are used for localnames after prefix expansion and to
31 carry the prefixes themselves.
32
33## Construction
34
35In general, values are constructed using the [`std::convert::TryInto`]
36trait, from other string types or `str`. Supported source types are:
37
38* [`String`] (copies)
39* [`compact_str::CompactString`] (moves)
40* [`str`] (copies for all types except the slice types)
41
42**Note:** If the `compact_str` feature is *not* enabled, all string types use
43the normal [`std::string::String`] type instead.
44
In addition, converting from [`NcName`] to [`Name`] requires no extra
checking and is thus available through `.into()` (and likewise for the
corresponding str types).
48
49The inverse directions are only available through `try_into`.
50
51## When to use rxml_validation vs. rxml?
52
53You should use this crate (`rxml_validation`) whenever you only need to
54validate strings against rules present in XML, without actually parsing or
55serialising XML data. In that case, this crate is a much lighter choice and
56it can be used in `no_std` environments.
57*/
58use core::fmt;
59
60mod strings;
61
62pub mod selectors;
63
64use selectors::CharSelector;
65
66#[doc(inline)]
67#[cfg(feature = "std")]
68#[cfg_attr(docsrs, doc(cfg(feature = "std")))]
69pub use strings::{CompactString, Name, NcName};
70#[doc(inline)]
71pub use strings::{NameStr, NcNameStr};
72
/**
Reasons for which validating a string against an XML 1.0 grammar rule can
fail.
*/
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum Error {
    /// The input was empty, but a Name or NCName needs at least one
    /// character.
    EmptyName,

    /// An invalid character was encountered.
    ///
    /// The offending character is carried as payload.
    InvalidChar(char),

    /// The part on one side of the colon in a name was empty.
    EmptyNamePart,

    /// A name contained more than one colon.
    MultiColonName,

    /// The local name does not conform to the Name production (invalid
    /// start character).
    InvalidLocalName,
}
99
100impl fmt::Display for Error {
101 fn fmt<'f>(&self, f: &'f mut fmt::Formatter) -> fmt::Result {
102 match self {
103 Self::EmptyName => f.write_str("Name and NCName must not be empty"),
104 Self::InvalidChar(c) => write!(f, "character U+{:04x} is not allowed", *c as u32),
105 Self::EmptyNamePart => f.write_str("empty string on one side of the colon"),
106 Self::MultiColonName => f.write_str("more than one colon"),
107 Self::InvalidLocalName => f.write_str("local name is invalid"),
108 }
109 }
110}
111
112#[cfg(feature = "std")]
113impl std::error::Error for Error {}
114
115/**
116Check whether a str is a valid XML 1.0 Name
117
118# Example
119
120```rust
121use rxml_validation::{validate_name, Error};
122
123assert!(validate_name("foobar").is_ok());
124assert!(validate_name("foo:bar").is_ok());
125assert!(matches!(validate_name("foo bar"), Err(Error::InvalidChar(' '))));
126assert!(matches!(validate_name(""), Err(Error::EmptyName)));
127*/
128pub fn validate_name(s: &str) -> Result<(), Error> {
129 let mut chars = s.chars();
130 match chars.next() {
131 // must have at least one char
132 None => return Err(Error::EmptyName),
133 Some(c) => {
134 if !selectors::CLASS_XML_NAMESTART.select(c) {
135 return Err(Error::InvalidChar(c));
136 }
137 }
138 }
139 for ch in chars {
140 if !selectors::CLASS_XML_NAME.select(ch) {
141 return Err(Error::InvalidChar(ch));
142 }
143 }
144 Ok(())
145}
146
147/**
148Check whether a str is a valid XML 1.0 Name, without colons.
149
150# Example
151
152```rust
153use rxml_validation::{validate_ncname, Error};
154
155assert!(validate_ncname("foobar").is_ok());
156assert!(matches!(validate_ncname("foo:bar"), Err(Error::InvalidChar(':'))));
157assert!(matches!(validate_ncname(""), Err(Error::EmptyName)));
158*/
159pub fn validate_ncname(s: &str) -> Result<(), Error> {
160 let mut chars = s.chars();
161 match chars.next() {
162 // must have at least one char
163 None => return Err(Error::EmptyName),
164 Some(c) => {
165 if !selectors::CLASS_XML_NAMESTART.select(c) || c == ':' {
166 return Err(Error::InvalidChar(c));
167 }
168 }
169 }
170 for ch in chars {
171 if !selectors::CLASS_XML_NAME.select(ch) || ch == ':' {
172 return Err(Error::InvalidChar(ch));
173 }
174 }
175 Ok(())
176}
177
178/**
179Check whether a str is valid XML 1.0 CData.
180
181There exists no specific string type for CData, because it is almost identical
182to Rust strings and encoding even a validated CData string into XML requires
183extra steps because it may contain characters which need escaping when written
184into an XML document.
185
186# Example
187
188```rust
189use rxml_validation::{validate_cdata, Error};
190
191assert!(validate_cdata("foo bar baz <fnord!>").is_ok());
192assert!(matches!(validate_cdata("\x01"), Err(Error::InvalidChar('\x01'))));
193*/
194pub fn validate_cdata(s: &str) -> Result<(), Error> {
195 let s = s.as_bytes();
196 for i in 0..s.len() {
197 let b = s[i];
198 if b < 0x09 || b == 0x0b || b == 0x0c || (b >= 0x0e && b <= 0x1f) {
199 return Err(Error::InvalidChar(b.into()));
200 }
201 if b == 0xbe || b == 0xbf {
202 if i >= 2 && s[i - 2] == 0xef && s[i - 1] == 0xbf {
203 // U+FFFE or U+FFFF
204 let bit = (b & 0x01) as u32;
205 // SAFETY: we are passing only 16 bits and the upper
206 // nibble is set to all ones, so this is within the bounds
207 // of a unicode code point and not a surrogate.
208 let ch = unsafe { char::from_u32_unchecked(0xfffe | bit) };
209 return Err(Error::InvalidChar(ch));
210 }
211 }
212 }
213 Ok(())
214}
215
#[cfg(test)]
mod tests {
    use super::*;

    // Inputs which must be rejected by both the Name and the NCName check.
    const NEVER_A_NAME: &[&str] = &["", "foo bar baz http://<xyz>", "\u{ffff}"];

    // Basic accept/reject behaviour of `validate_cdata`.
    #[test]
    fn test_cdata_smoketest() {
        assert_eq!(validate_cdata("foo bar baz http://<xyz>"), Ok(()));
        assert!(validate_cdata("\u{ffff}").is_err());
    }

    // Basic accept/reject behaviour of `validate_name`.
    #[test]
    fn test_name_smoketest() {
        for good in &["foobar", "foo:bar"] {
            assert!(validate_name(good).is_ok(), "{:?} must be accepted", good);
        }
        for bad in NEVER_A_NAME {
            assert!(validate_name(bad).is_err(), "{:?} must be rejected", bad);
        }
    }

    // Basic accept/reject behaviour of `validate_ncname`; in contrast to
    // `validate_name`, a colon is rejected.
    #[test]
    fn test_ncname_smoketest() {
        assert!(validate_ncname("foobar").is_ok());
        assert!(validate_ncname("foo:bar").is_err());
        for bad in NEVER_A_NAME {
            assert!(validate_ncname(bad).is_err(), "{:?} must be rejected", bad);
        }
    }

    // Exhaustively compares the byte-level scan in `validate_cdata` with the
    // `CLASS_XML_NONCHAR` selector, one code point at a time.
    #[test]
    fn test_validate_cdata_is_equivalent_to_nonchar_class() {
        let mut buf = String::with_capacity(4);
        // `from_u32` returns `None` for values which are not chars; those
        // are skipped.
        for ch in (0x0..=0x10ffffu32).filter_map(std::char::from_u32) {
            buf.clear();
            buf.push(ch);
            let is_nonchar = selectors::CLASS_XML_NONCHAR.select(ch);
            match (is_nonchar, validate_cdata(&buf)) {
                (true, Err(Error::InvalidChar(v))) => assert_eq!(v, ch),
                (true, other) => panic!("validate_cdata accepts {:?} (ch={:?}) which is rejected by CLASS_XML_NONCHAR: {:?}", buf, ch, other),
                (false, Ok(())) => (),
                (false, other) => panic!("validate_cdata rejects {:?} (ch={:?}) which is accepted by CLASS_XML_NONCHAR: {:?}", buf, ch, other),
            }
        }
    }
}