1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
use canonical_combining_class;
/// The maximum number of bytes in the UTF-8 encoding of a single Unicode
/// scalar value.
///
/// RFC-2279 permitted longer encodings, but it has been obsoleted by
/// RFC-3629, which does not. The same limit is documented in [the relevant
/// section of Rust's documentation].
///
/// [the relevant section of Rust's documentation]: https://doc.rust-lang.org/std/primitive.char.html#method.encode_utf8
pub const MAX_UTF8_SIZE: usize = 4;

/// The longest run of nonstarters accounted for when sizing the
/// normalization buffer. The value is taken from unicode-normalization.
const MAX_NONSTARTERS: usize = 30;

// TODO: Investigate whether we can avoid considering composed starters and
// stoppers.
/// The number of scalar values the normalization buffer has room for: enough
/// for a composed start, a long sequence of nonstarters, followed by a
/// composed end.
pub const NORMALIZATION_BUFFER_LEN: usize = 2 // composed start
    + MAX_NONSTARTERS // run of nonstarters
    + 2; // composed end

/// The smallest buffer size with which NFC normalization can be performed,
/// and therefore the smallest buffer that may be passed to [`TextReader`]'s
/// [`read`].
///
/// Computed as [`NORMALIZATION_BUFFER_LEN`] scalar values of up to
/// [`MAX_UTF8_SIZE`] each.
///
/// [`TextReader`]: https://docs.rs/basic-text/latest/basic_text/struct.TextReader.html
/// [`read`]: std::io::Read::read
pub const NORMALIZATION_BUFFER_SIZE: usize = NORMALIZATION_BUFFER_LEN * MAX_UTF8_SIZE;
/// U+000C, the ASCII FF (form feed) control, written `'\f'` in some contexts.
pub const FF: char = '\x0c';

/// U+0007, the ASCII BEL control.
pub const BEL: char = '\x07';

/// U+0018, the ASCII CAN control.
pub const CAN: char = '\x18';

/// U+001B, the ASCII ESC control, written `'\e'` in some contexts.
pub const ESC: char = '\x1b';

/// U+001A, the ASCII SUB control.
pub const SUB: char = '\x1a';

/// U+007F, the ASCII DEL control. Note that this is not what the "delete"
/// key on the keyboard generates.
pub const DEL: char = '\x7f';

/// U+0085, EBCDIC NEXT LINE, which is treated like generic whitespace.
pub const NEL: char = '\u{0085}';

/// U+034F, COMBINING GRAPHEME JOINER.
pub const CGJ: char = '\u{034F}';

/// U+FEFF, ZERO WIDTH NO-BREAK SPACE, also known as the byte-order mark
/// (BOM).
pub const BOM: char = '\u{FEFF}';

/// U+2060, WORD JOINER.
pub const WJ: char = '\u{2060}';

/// U+200D, ZERO WIDTH JOINER.
pub const ZWJ: char = '\u{200D}';

/// U+FFFD, REPLACEMENT CHARACTER.
pub const REPL: char = '\u{FFFD}';

/// U+FFFC, OBJECT REPLACEMENT CHARACTER.
pub const ORC: char = '\u{FFFC}';

/// U+2028, LINE SEPARATOR.
pub const LS: char = '\u{2028}';

/// U+2029, PARAGRAPH SEPARATOR.
pub const PS: char = '\u{2029}';
// TODO: include ZWJ, WJ, ZWNJ, CGJ as non-starters?