pub fn parse(
source: &str,
) -> Result<(CodePointInversionListAndStringList<'static>, usize), ParseError>
Expand description
Parses a UnicodeSet pattern and returns a UnicodeSet in the form of a CodePointInversionListAndStringList,
as well as the number of bytes consumed from the source string.
Supports UnicodeSets as described in UTS #35 - Unicode Sets.
The error type of the returned Result can be pretty-printed with ParseError::fmt_with_source.
§Variables
If you need support for variables inside UnicodeSets (e.g., [$start-$end]), use parse_with_variables.
§Limitations
- Currently, we only support the ECMA-262 properties.
The property names must match the exact spelling listed in ECMA-262. Note that we do support UTS35 syntax for elided
General_Category and Script property names, i.e., [:Latn:] and [:Ll:] are both valid, with the former implying the Script property, and the latter the General_Category property.
- We do not support \N{Unicode code point name} character escaping. Use any other escape method described in UTS35.
✨ Enabled with the compiled_data Cargo feature.
§Examples
Parse ranges
use icu_unicodeset_parse::parse;
let source = "[a-zA-Z0-9]";
let (set, consumed) = parse(source).unwrap();
let code_points = set.code_points();
assert!(code_points.contains_range(&('a'..='z')));
assert!(code_points.contains_range(&('A'..='Z')));
assert!(code_points.contains_range(&('0'..='9')));
assert_eq!(consumed, source.len());
Parse properties, set operations, inner sets
use icu_unicodeset_parse::parse;
let (set, _) =
parse("[[:^ll:]-[^][:gc = Lowercase Letter:]&[^[[^]-[a-z]]]]").unwrap();
let elements = 'a'..='z';
assert!(set.code_points().contains_range(&elements));
assert_eq!(elements.count(), set.size());
Inversions remove strings
use icu_unicodeset_parse::parse;
let (set, _) =
parse(r"[[a-z{hello\ world}]&[^a-y{hello\ world}]]").unwrap();
assert!(set.contains_char('z'));
assert_eq!(set.size(), 1);
assert!(!set.has_strings());
Set operators (including the implicit union) have the same precedence and are left-associative
use icu_unicodeset_parse::parse;
let (set, _) = parse("[[ace][bdf] - [abc][def]]").unwrap();
let elements = 'd'..='f';
assert!(set.code_points().contains_range(&elements));
assert_eq!(set.size(), elements.count());
Supports partial parses
use icu_unicodeset_parse::parse;
let (set, consumed) = parse("[a-c][x-z]").unwrap();
let code_points = set.code_points();
let elements = 'a'..='c';
let elements_unparsed = 'x'..='z';
assert!(code_points.contains_range(&elements));
assert!(!code_points.contains_range(&elements_unparsed));
assert_eq!(set.size(), elements.count());
// only the first UnicodeSet is parsed
assert_eq!(consumed, "[a-c]".len());