sxd_document/
str.rs

/// Internal helper for scanning a token that consists of one leading
/// character followed by a run of trailing characters.
trait StrParseExt {
    /// Returns the byte offset just past a token made of one character
    /// accepted by `is_first` followed by zero or more characters accepted by
    /// `is_rest`.
    ///
    /// Returns `None` when the string is empty or when its first character is
    /// rejected by `is_first`.
    fn end_of_start_rest<F1, F2>(&self, is_first: F1, is_rest: F2) -> Option<usize>
    where
        F1: Fn(char) -> bool,
        F2: Fn(char) -> bool;
}

impl<'a> StrParseExt for &'a str {
    fn end_of_start_rest<F1, F2>(&self, is_first: F1, is_rest: F2) -> Option<usize>
    where
        F1: Fn(char) -> bool,
        F2: Fn(char) -> bool,
    {
        let mut indexed = self.char_indices();

        // The token must open with an acceptable first character.
        let opens_token = indexed.next().map_or(false, |(_, c)| is_first(c));
        if !opens_token {
            return None;
        }

        // The token stops at the first character the "rest" predicate rejects.
        for (offset, c) in indexed {
            if !is_rest(c) {
                return Some(offset);
            }
        }

        // Every remaining character belonged to the token.
        Some(self.len())
    }
}
27
/// Scanning helpers that locate where an XML construct at the front of a
/// string ends.
///
/// Each method reports the end as a byte offset into the string, or `None`
/// when the string does not match the construct being scanned for.
pub trait XmlStr {
    /// Locate the end of the text of a quoted attribute value; the closing
    /// `quote` is not included.
    fn end_of_attribute(&self, quote: &str) -> Option<usize>;
    /// Locate the end of a run of direct character data.
    fn end_of_char_data(&self) -> Option<usize>;
    /// Locate the end of a CDATA section's content; the closing `]]>` is not
    /// included.
    fn end_of_cdata(&self) -> Option<usize>;
    /// Locate the end of a run of decimal digits.
    fn end_of_decimal_chars(&self) -> Option<usize>;
    /// Locate the end of a run of hexadecimal digits.
    fn end_of_hex_chars(&self) -> Option<usize>;
    /// Locate the end of a comment's content; the closing `-->` is not
    /// included.
    fn end_of_comment(&self) -> Option<usize>;
    /// Locate the end of a processing instruction's value; the closing `?>`
    /// is not included.
    fn end_of_pi_value(&self) -> Option<usize>;
    /// Locate the end of the [Name](http://www.w3.org/TR/xml/#NT-Name).
    fn end_of_name(&self) -> Option<usize>;
    /// Locate the end of the
    /// [NCName](http://www.w3.org/TR/REC-xml-names/#NT-NCName).
    fn end_of_ncname(&self) -> Option<usize>;
    /// Locate the end of a run of XML space characters.
    fn end_of_space(&self) -> Option<usize>;
    /// Locate the end of the `<` that opens a start tag.
    fn end_of_start_tag(&self) -> Option<usize>;
    /// Locate the end of the encoding name
    /// ([EncName](http://www.w3.org/TR/xml/#NT-EncName)).
    fn end_of_encoding(&self) -> Option<usize>;
    /// Locate the end of the internal doc type declaration; the closing `]`
    /// is not included.
    fn end_of_int_subset(&self) -> Option<usize>;
}
55
56impl<'a> XmlStr for &'a str {
57    fn end_of_attribute(&self, quote: &str) -> Option<usize> {
58        if self.len() == 0 ||
59           self.starts_with('&') ||
60           self.starts_with('<') ||
61           self.starts_with(quote)
62        {
63            return None;
64        }
65
66        let quote_char = quote.chars().next().expect("Cant have null quote");
67
68        self.find(&['&', '<', quote_char][..]).or(Some(self.len()))
69    }
70
71    fn end_of_char_data(&self) -> Option<usize> {
72        fn find_end_of_char_data(bytes: &[u8]) -> Option<usize> {
73            for (i, &b) in bytes.iter().enumerate() {
74                if b == b'<' || b == b'&' { return Some(i) }
75
76                if b == b']' && bytes[i..].starts_with(b"]]>") {
77                    return Some(i)
78                }
79            }
80            None
81        }
82
83        match find_end_of_char_data(self.as_bytes()) {
84            Some(0) => None,
85            Some(v) => Some(v),
86            None => Some(self.len()),
87        }
88    }
89
90    fn end_of_cdata(&self) -> Option<usize> {
91        self.find("]]>")
92    }
93
94    fn end_of_decimal_chars(&self) -> Option<usize> {
95        self.end_of_start_rest(|c| c.is_decimal_char(),
96                               |c| c.is_decimal_char())
97    }
98
99    fn end_of_hex_chars(&self) -> Option<usize> {
100        self.end_of_start_rest(|c| c.is_hex_char(),
101                               |c| c.is_hex_char())
102    }
103
104    fn end_of_comment(&self) -> Option<usize> {
105        // This deliberately does not include the >. -- is not allowed
106        // in a comment, so we can just test the end if it matches the
107        // complete close delimiter.
108        self.find("--")
109    }
110
111    fn end_of_pi_value(&self) -> Option<usize> {
112        self.find("?>")
113    }
114
115    fn end_of_name(&self) -> Option<usize> {
116        self.end_of_start_rest(|c| c.is_name_start_char(), |c| c.is_name_char())
117    }
118
119    fn end_of_ncname(&self) -> Option<usize> {
120        self.end_of_start_rest(|c| c.is_ncname_start_char(), |c| c.is_ncname_char())
121    }
122
123    fn end_of_space(&self) -> Option<usize> {
124        self.end_of_start_rest(|c| c.is_space_char(), |c| c.is_space_char())
125    }
126
127    fn end_of_start_tag(&self) -> Option<usize> {
128        let mut positions = self.char_indices();
129
130        match positions.next() {
131            Some((_, c)) if '<' == c => (),
132            _ => return None,
133        };
134
135        match positions.next() {
136            Some((offset, c)) =>
137                match c {
138                    '?' | '!' | '/' => None,
139                    _ => Some(offset),
140                },
141            None => Some(self.len()),
142        }
143    }
144
145    fn end_of_encoding(&self) -> Option<usize> {
146        self.end_of_start_rest(|c| c.is_encoding_start_char(), |c| c.is_encoding_rest_char())
147    }
148
149    fn end_of_int_subset(&self) -> Option<usize> { self.find("]") }
150}
151
/// Predicates used when parsing characters in an XML document.
pub trait XmlChar {
    /// Is this a [NameStartChar](http://www.w3.org/TR/xml/#NT-NameStartChar)?
    fn is_name_start_char(self) -> bool;
    /// Is this a [NameChar](http://www.w3.org/TR/xml/#NT-NameChar)?
    fn is_name_char(self) -> bool;
    /// Does this start a [NCName](http://www.w3.org/TR/REC-xml-names/#NT-NCName)?
    fn is_ncname_start_char(self) -> bool;
    /// Is this a component of a [NCName](http://www.w3.org/TR/REC-xml-names/#NT-NCName)?
    fn is_ncname_char(self) -> bool;
    /// Is this an [XML space](http://www.w3.org/TR/xml/#NT-S)?
    fn is_space_char(self) -> bool;
    /// Is this an ASCII decimal digit (`0`-`9`)?
    fn is_decimal_char(self) -> bool;
    /// Is this an ASCII hexadecimal digit (`0`-`9`, `a`-`f`, `A`-`F`)?
    fn is_hex_char(self) -> bool;
    /// Can this start an [EncName](http://www.w3.org/TR/xml/#NT-EncName)?
    fn is_encoding_start_char(self) -> bool;
    /// Can this continue an [EncName](http://www.w3.org/TR/xml/#NT-EncName)
    /// after its first character?
    fn is_encoding_rest_char(self) -> bool;
}

impl XmlChar for char {
    fn is_name_start_char(self) -> bool {
        // A NameStartChar is an NCName start character or a colon.
        self == ':' || self.is_ncname_start_char()
    }

    fn is_name_char(self) -> bool {
        // `is_ncname_char` already accepts every NCName start character, so
        // only the colon has to be added to obtain the full NameChar set.
        self == ':' || self.is_ncname_char()
    }

    fn is_ncname_start_char(self) -> bool {
        match self {
            'A'..='Z'                   |
            '_'                         |
            'a'..='z'                   |
            '\u{0000C0}'..='\u{0000D6}' |
            '\u{0000D8}'..='\u{0000F6}' |
            '\u{0000F8}'..='\u{0002FF}' |
            '\u{000370}'..='\u{00037D}' |
            '\u{00037F}'..='\u{001FFF}' |
            '\u{00200C}'..='\u{00200D}' |
            '\u{002070}'..='\u{00218F}' |
            '\u{002C00}'..='\u{002FEF}' |
            '\u{003001}'..='\u{00D7FF}' |
            '\u{00F900}'..='\u{00FDCF}' |
            '\u{00FDF0}'..='\u{00FFFD}' |
            '\u{010000}'..='\u{0EFFFF}' => true,
            _ => false,
        }
    }

    fn is_ncname_char(self) -> bool {
        // Everything that may start an NCName may also continue one.
        if self.is_ncname_start_char() { return true; }
        match self {
            '-'                       |
            '.'                       |
            '0'..='9'                 |
            '\u{00B7}'                |
            '\u{0300}'..='\u{036F}'   |
            '\u{203F}'..='\u{2040}' => true,
            _ => false,
        }
    }

    fn is_space_char(self) -> bool {
        match self {
            '\x20' | // space
            '\x09' | // tab
            '\x0D' | // carriage return
            '\x0A' => true, // line feed
            _ => false,
        }
    }

    fn is_decimal_char(self) -> bool {
        match self {
            '0'..='9' => true,
            _ => false,
        }
    }

    fn is_hex_char(self) -> bool {
        match self {
            '0'..='9' |
            'a'..='f' |
            'A'..='F' => true,
            _ => false,
        }
    }

    fn is_encoding_start_char(self) -> bool {
        match self {
            'A'..='Z' |
            'a'..='z' => true,
            _ => false,
        }
    }

    fn is_encoding_rest_char(self) -> bool {
        match self {
            'A'..='Z' |
            'a'..='z' |
            '0'..='9' |
            '.' |
            '_' |
            '-' => true,
            _ => false,
        }
    }
}
260
#[cfg(test)]
mod test {
    use super::XmlStr;

    /// The leading slice of `input` that is reported as character data.
    fn char_data_prefix(input: &str) -> Option<&str> {
        input.end_of_char_data().map(|end| &input[..end])
    }

    /// The leading slice of `input` that is reported as the internal subset.
    fn int_subset_prefix(input: &str) -> Option<&str> {
        input.end_of_int_subset().map(|end| &input[..end])
    }

    #[test]
    fn end_of_char_data_leading_ampersand() {
        assert_eq!(char_data_prefix("&"), None);
    }

    #[test]
    fn end_of_char_data_leading_less_than() {
        assert_eq!(char_data_prefix("<"), None);
    }

    #[test]
    fn end_of_char_data_leading_cdata_end() {
        assert_eq!(char_data_prefix("]]>"), None);
    }

    #[test]
    fn end_of_char_data_until_ampersand() {
        assert_eq!(char_data_prefix("hello&world"), Some("hello"));
    }

    #[test]
    fn end_of_char_data_until_less_than() {
        assert_eq!(char_data_prefix("hello<world"), Some("hello"));
    }

    #[test]
    fn end_of_char_data_until_cdata_end() {
        assert_eq!(char_data_prefix("hello]]>world"), Some("hello"));
    }

    #[test]
    fn end_of_char_data_includes_right_square() {
        // A single `]` is ordinary text, so the whole input is character data.
        assert_eq!(char_data_prefix("hello]world"), Some("hello]world"));
    }

    #[test]
    fn end_of_char_data_includes_multiple_right_squares() {
        // `]]` without a following `>` is still ordinary text.
        assert_eq!(char_data_prefix("hello]]world"), Some("hello]]world"));
    }

    #[test]
    fn end_of_int_subset_excludes_right_square() {
        assert_eq!(int_subset_prefix("hello]>world"), Some("hello"));
    }
}
306    fn end_of_int_subset_excludes_right_square() {
307        assert_eq!("hello]>world".end_of_int_subset(), Some("hello".len()))
308    }
309}