1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
use unicode_normalization::char::canonical_combining_class;

/// Maximum number of bytes in the UTF-8 encoding of a single scalar value.
///
/// RFC-2279 permitted longer encodings, but it has been obsoleted by RFC-3629,
/// which does not. The same limit is documented in
/// [the relevant section of Rust's documentation].
///
/// [the relevant section of Rust's documentation]: https://doc.rust-lang.org/std/primitive.char.html#method.encode_utf8
pub(crate) const MAX_UTF8_SIZE: usize = 4;

/// Value taken from unicode-normalization.
const MAX_NONSTARTERS: usize = 30;

// Room for a composed start, then a long run of nonstarters, and finally a
// composed end.
//
// TODO: Investigate whether we can avoid considering composed starters and stoppers.
pub(crate) const NORMALIZATION_BUFFER_LEN: usize = 2 + MAX_NONSTARTERS + 2;

/// Smallest buffer size, in bytes, that suffices to perform NFC normalization.
///
/// Consequently this is also the smallest buffer that may be passed to
/// [`TextReader`]'s [`read`]. It is computed as one maximum-length UTF-8
/// encoding for each of the `NORMALIZATION_BUFFER_LEN` scalar values.
///
/// [`TextReader`]: crate::TextReader
/// [`read`]: std::io::Read::read
pub const NORMALIZATION_BUFFER_SIZE: usize = NORMALIZATION_BUFFER_LEN * MAX_UTF8_SIZE;

/// ASCII FF (form feed), written `'\f'` in some contexts.
pub(crate) const FF: char = '\x0C';

/// ASCII BEL.
pub(crate) const BEL: char = '\x07';

/// ASCII CAN.
pub(crate) const CAN: char = '\x18';

/// ASCII ESC, written `'\e'` in some contexts.
pub(crate) const ESC: char = '\x1B';

/// ASCII SUB.
pub(crate) const SUB: char = '\x1A';

/// ASCII DEL. Note that this is not what the "delete" key on the keyboard
/// generates.
pub(crate) const DEL: char = '\x7F';

/// EBCDIC NEXT LINE, which is treated like generic whitespace.
pub(crate) const NEL: char = '\u{85}'; /// COMBINING GRAPHEME JOINER pub(crate) const CGJ: char = '\u{34f}'; /// ZERO WIDTH NO-BREAK SPACE, also known as the byte-order mark, or BOM pub(crate) const BOM: char = '\u{feff}'; /// WORD JOINER pub(crate) const WJ: char = '\u{2060}'; /// REPLACEMENT CHARACTER pub(crate) const REPL: char = '\u{fffd}'; /// OBJECT REPLACEMENT CHARACTER pub(crate) const ORC: char = '\u{fffc}'; /// LINE SEPARATOR pub(crate) const LS: char = '\u{2028}'; /// PARAGRAPH SEPARATOR pub(crate) const PS: char = '\u{2029}'; // TODO: include ZWJ, WJ, ZWNJ, CGJ as non-starters? pub(crate) fn is_normalization_form_starter(c: char) -> bool { canonical_combining_class(c) == 0 }