rxml_validation/
selectors.rs

1/*!
2# Constants and Utilities for matching ranges of codepoints
3
4The contents of this module are implementation details of `rxml`,
5`rxml_validation` and `rxml_proc` and should not be relied upon.
6*/
7use core::fmt;
8
/// Predicate over single characters.
///
/// An implementor describes a set of chars; [`CharSelector::select`] tells
/// whether a given `char` belongs to that set.
pub trait CharSelector {
	/// Report whether `c` belongs to the set described by this selector.
	///
	/// Returns `true` if `c` is selected and `false` otherwise.
	fn select(&self, c: char) -> bool;
}
16
/// A selector which accepts every `char` without exception.
#[derive(Clone, Copy, Debug)]
pub struct AllChars();
20
21impl CharSelector for char {
22	fn select(&self, c: char) -> bool {
23		*self == c
24	}
25}
26
27impl CharSelector for &'_ [char] {
28	fn select(&self, c: char) -> bool {
29		for r in self.iter() {
30			if *r == c {
31				return true;
32			}
33		}
34		false
35	}
36}
37
38impl CharSelector for AllChars {
39	fn select(&self, _c: char) -> bool {
40		return true;
41	}
42}
43
// Both ends are inclusive: an exclusive end bound would, for some of the
// ranges used in this module, not be a valid `char` in Rust.
/// Selects all chars from a range (including both ends).
///
/// The first field is the lowest char of the range and the second field is
/// the highest char of the range; both are themselves members of the range.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct CodepointRange(pub char, pub char);
48
49/// Valid codepoints for character data (XML 1.0 § 2.2)
50pub const VALID_XML_CDATA_RANGES: &'static [CodepointRange] = &[
51	CodepointRange('\x09', '\x0a'),
52	CodepointRange('\x0d', '\x0d'),
53	CodepointRange('\u{0020}', '\u{d7ff}'),
54	CodepointRange('\u{e000}', '\u{fffd}'),
55	CodepointRange('\u{10000}', '\u{10ffff}'),
56];
57
58/// Invalid codepoints for character data (XML 1.0 § 2.2)
59///
60/// Note that values which are not valid Rust characters are not included in
61/// these ranges.
62pub const INVALID_XML_CDATA_RANGES: &'static [CodepointRange] = &[
63	CodepointRange('\x00', '\x08'),
64	CodepointRange('\x0b', '\x0c'),
65	CodepointRange('\x0e', '\x1f'),
66	CodepointRange('\u{fffe}', '\u{ffff}'),
67];
68
69const VALID_XML_NAME_START_RANGES: &'static [CodepointRange] = &[
70	CodepointRange(':', ':'),
71	CodepointRange('A', 'Z'),
72	CodepointRange('_', '_'),
73	CodepointRange('a', 'z'),
74	CodepointRange('\u{c0}', '\u{d6}'),
75	CodepointRange('\u{d8}', '\u{f6}'),
76	CodepointRange('\u{f8}', '\u{2ff}'),
77	CodepointRange('\u{370}', '\u{37d}'),
78	CodepointRange('\u{37f}', '\u{1fff}'),
79	CodepointRange('\u{200c}', '\u{200d}'),
80	CodepointRange('\u{2070}', '\u{218f}'),
81	CodepointRange('\u{2c00}', '\u{2fef}'),
82	CodepointRange('\u{3001}', '\u{d7ff}'),
83	CodepointRange('\u{f900}', '\u{fdcf}'),
84	CodepointRange('\u{10000}', '\u{effff}'),
85];
86
87const VALID_XML_NAME_RANGES: &'static [CodepointRange] = &[
88	CodepointRange(':', ':'),
89	CodepointRange('-', '-'),
90	CodepointRange('.', '.'),
91	CodepointRange('A', 'Z'),
92	CodepointRange('_', '_'),
93	CodepointRange('0', '9'),
94	CodepointRange('a', 'z'),
95	CodepointRange('\u{b7}', '\u{b7}'),
96	CodepointRange('\u{c0}', '\u{d6}'),
97	CodepointRange('\u{d8}', '\u{f6}'),
98	CodepointRange('\u{f8}', '\u{2ff}'),
99	CodepointRange('\u{300}', '\u{36f}'),
100	CodepointRange('\u{370}', '\u{37d}'),
101	CodepointRange('\u{37f}', '\u{1fff}'),
102	CodepointRange('\u{200c}', '\u{200d}'),
103	CodepointRange('\u{203f}', '\u{2040}'),
104	CodepointRange('\u{2070}', '\u{218f}'),
105	CodepointRange('\u{2c00}', '\u{2fef}'),
106	CodepointRange('\u{3001}', '\u{d7ff}'),
107	CodepointRange('\u{f900}', '\u{fdcf}'),
108	CodepointRange('\u{10000}', '\u{effff}'),
109];
110
111impl CodepointRange {
112	/// Returns true if `c` is a member of the range of codepoints represented
113	/// by this object.
114	pub fn contains(&self, c: char) -> bool {
115		return (self.0 <= c) && (c <= self.1);
116	}
117}
118
119/// Selects all chars from any of the contained ranges
120#[derive(Copy)]
121pub struct CodepointRanges(pub &'static [CodepointRange]);
122
123/// Valid non-first characters for an XML Name (XML 1.0 § 2.3 \[4a\])
124pub static CLASS_XML_NAME: CodepointRanges = CodepointRanges(VALID_XML_NAME_RANGES);
125
126/// Valid first characters for an XML Name (XML 1.0 § 2.3 \[4\])
127pub static CLASS_XML_NAMESTART: CodepointRanges = CodepointRanges(VALID_XML_NAME_START_RANGES);
128
129/// See [`INVALID_XML_CDATA_RANGES`]
130pub static CLASS_XML_NONCHAR: CodepointRanges = CodepointRanges(INVALID_XML_CDATA_RANGES);
131
132impl CharSelector for CodepointRange {
133	fn select(&self, c: char) -> bool {
134		self.contains(c)
135	}
136}
137
138impl CharSelector for CodepointRanges {
139	fn select(&self, c: char) -> bool {
140		contained_in_ranges(c, self.0)
141	}
142}
143
144/// Returns true if `c` is a member of any of the range of the given codepoint
145/// ranges.
146pub fn contained_in_ranges(c: char, rs: &[CodepointRange]) -> bool {
147	for r in rs.iter() {
148		if r.contains(c) {
149			return true;
150		}
151	}
152	false
153}
154
155impl fmt::Debug for CodepointRanges {
156	fn fmt<'f>(&self, f: &'f mut fmt::Formatter) -> fmt::Result {
157		write!(f, "CodepointRanges(<{} ranges>)", self.0.len())
158	}
159}
160
161impl Clone for CodepointRanges {
162	fn clone(&self) -> Self {
163		CodepointRanges(self.0)
164	}
165}
166
167impl PartialEq for CodepointRanges {
168	fn eq(&self, other: &CodepointRanges) -> bool {
169		core::ptr::eq(&self.0, &other.0)
170	}
171}
172
#[cfg(test)]
mod tests {
	use super::*;

	/// Walks every value from 0 to 0x10ffff which is a Rust `char` and checks
	/// that exactly one of the two CDATA tables claims it.
	#[test]
	fn cdata_inclusion_and_exclusion_are_equivalent() {
		let includer = CodepointRanges(VALID_XML_CDATA_RANGES);
		let excluder = CodepointRanges(INVALID_XML_CDATA_RANGES);
		// `from_u32` returns `None` for values which are not chars; those
		// are skipped.
		for ch in (0x0..=0x10ffffu32).filter_map(core::char::from_u32) {
			let included = includer.select(ch);
			let excluded = excluder.select(ch);
			assert!(
				included != excluded,
				"INVALID_XML_CDATA_RANGES and VALID_XML_CDATA_RANGES have different opinions about U+{:x}",
				ch as u32
			);
		}
	}
}