basic_text_internals/unicode.rs
1use unicode_normalization::char::canonical_combining_class;
2
/// The maximum number of bytes in the UTF-8 encoding of a single scalar
/// value. RFC-2279 permitted longer encodings, but it has been obsoleted by
/// RFC-3629, which does not. The same limit is documented in [the relevant
/// section of Rust's documentation].
///
/// [the relevant section of Rust's documentation]: https://doc.rust-lang.org/std/primitive.char.html#method.encode_utf8
pub const MAX_UTF8_SIZE: usize = 4;

/// The nonstarter limit, taken from unicode-normalization.
const MAX_NONSTARTERS: usize = 30;

/// The number of scalar values a normalization buffer must hold: room for a
/// composed start, a long sequence of nonstarters, and a composed end.
//
// TODO: Investigate whether we can avoid considering composed starters and
// stoppers.
pub const NORMALIZATION_BUFFER_LEN: usize = 2 + MAX_NONSTARTERS + 2;

/// The smallest buffer, in bytes, with which NFC normalization can be
/// performed, and therefore the smallest buffer that may be passed to
/// [`TextReader`]'s [`read`].
///
/// [`TextReader`]: https://docs.rs/basic-text/latest/basic_text/struct.TextReader.html
/// [`read`]: std::io::Read::read
pub const NORMALIZATION_BUFFER_SIZE: usize = NORMALIZATION_BUFFER_LEN * MAX_UTF8_SIZE;
27
/// ASCII FF (U+000C), known as '\f' in some contexts.
pub const FF: char = '\u{000C}';

/// ASCII BEL (U+0007).
pub const BEL: char = '\u{0007}';

/// ASCII CAN (U+0018).
pub const CAN: char = '\u{0018}';

/// ASCII ESC (U+001B), known as '\e' in some contexts.
pub const ESC: char = '\u{001B}';

/// ASCII SUB (U+001A).
pub const SUB: char = '\u{001A}';

/// ASCII DEL (U+007F), which is not what's generated by the "delete" key on
/// the keyboard.
pub const DEL: char = '\u{007F}';

/// EBCDIC NEXT LINE (U+0085), which is treated like generic whitespace.
pub const NEL: char = '\u{0085}';

/// COMBINING GRAPHEME JOINER (U+034F)
pub const CGJ: char = '\u{034F}';

/// ZERO WIDTH NO-BREAK SPACE (U+FEFF), also known as the byte-order mark, or
/// BOM
pub const BOM: char = '\u{FEFF}';

/// WORD JOINER (U+2060)
pub const WJ: char = '\u{2060}';

/// ZERO WIDTH JOINER (U+200D)
pub const ZWJ: char = '\u{200D}';

/// REPLACEMENT CHARACTER (U+FFFD)
pub const REPL: char = '\u{FFFD}';

/// OBJECT REPLACEMENT CHARACTER (U+FFFC)
pub const ORC: char = '\u{FFFC}';

/// LINE SEPARATOR (U+2028)
pub const LS: char = '\u{2028}';

/// PARAGRAPH SEPARATOR (U+2029)
pub const PS: char = '\u{2029}';
73
74// TODO: include ZWJ, WJ, ZWNJ, CGJ as non-starters?
75#[inline]
76pub fn is_normalization_form_starter(c: char) -> bool {
77 canonical_combining_class(c) == 0
78}