Skip to main content

rfc9839_rs/
lib.rs

1#![cfg_attr(not(test), no_std)]
2
3//! # RFC9839-rs
4//!
5//! A Rust implementation of RFC 9839 for detecting problematic Unicode code points.
6
7
/// Checks whether `c` is a surrogate code point (`U+D800..=U+DFFF`).
///
/// Both high surrogates (`U+D800..=U+DBFF`) and low surrogates
/// (`U+DC00..=U+DFFF`) match. Surrogates are not Unicode scalar values and
/// should not be encoded as part of a UTF-8 stream.
pub const fn is_unicode_surrogate(c: u32) -> bool {
    // The high and low surrogate ranges are adjacent, so one range covers both.
    matches!(c, 0xd800..=0xdfff)
}

/// Misspelled alias of [`is_unicode_surrogate`].
///
/// Kept so that existing callers of the original (misspelled) name continue
/// to compile; it returns exactly the same result.
pub const fn is_unicode_surrotate(c: u32) -> bool {
    is_unicode_surrogate(c)
}
13
pub mod control {
    //! Predicates for the control code points: the C0 controls
    //! (`U+0000..=U+001F`), DEL (`U+007F`) and the C1 controls
    //! (`U+0080..=U+009F`).

    /// Checks for `b'\n'` (`U+000A`).
    pub const fn is_newline(c: u32) -> bool {
        c == 0xa
    }

    /// Checks for `b'\r'` (`U+000D`).
    pub const fn is_carriage_return(c: u32) -> bool {
        c == 0xd
    }

    /// Checks for `b'\t'` (`U+0009`).
    pub const fn is_horizontal_tab(c: u32) -> bool {
        c == 0x9
    }

    /// Checks for one of the useful controls: `b'\n'`, `b'\r'` or `b'\t'`.
    pub const fn is_useful_control(c: u32) -> bool {
        is_newline(c) || is_carriage_return(c) || is_horizontal_tab(c)
    }

    /// Checks if the value is a C0 control (`U+0000..=U+001F`).
    ///
    /// This range includes the useful controls matched by
    /// [`is_useful_control`].
    pub const fn is_c0_control(c: u32) -> bool {
        matches!(c, 0x0..=0x1f)
    }

    /// Checks if the value is a C1 control (`U+0080..=U+009F`).
    pub const fn is_c1_control(c: u32) -> bool {
        matches!(c, 0x80..=0x9f)
    }

    /// Checks if the value is a legacy control as defined by RFC 9839:
    /// a C0 control, DEL (`U+007F`) or a C1 control that is not one of the
    /// useful controls `b'\n'`, `b'\r'` or `b'\t'`.
    pub const fn is_legacy_control(c: u32) -> bool {
        // DEL sits between the C0 and C1 ranges and is neither, so it has to
        // be listed explicitly.
        !is_useful_control(c) && (is_c0_control(c) || c == 0x7f || is_c1_control(c))
    }
}
58
59
60
61
/// Checks if the value is a Unicode noncharacter.
///
/// Matches the 32 code points `U+FDD0..=U+FDEF` and the last two code points
/// of each of the 17 planes (`U+FFFE`, `U+FFFF`, `U+1FFFE`, `U+1FFFF`, ...,
/// `U+10FFFE`, `U+10FFFF`). Values above `U+10FFFF` never match.
pub const fn is_noncharacter(c: u32) -> bool {
    // Contiguous noncharacter block inside the Basic Multilingual Plane.
    if matches!(c, 0xfdd0..=0xfdef) {
        return true;
    }
    // Plane-final pair: the low 16 bits are FFFE or FFFF. Masking off bit 0
    // folds both onto FFFE; the range guard rejects values past plane 16.
    c <= 0x10ffff && (c & 0xfffe) == 0xfffe
}
85
/// Any Unicode code point except high-surrogate and low-surrogate code points,
/// i.e. the ranges `U+0000..=U+D7FF` and `U+E000..=U+10FFFF`.
/// As specified by Unicode 16
pub struct UnicodeScalars {}

impl UnicodeScalars {
    /// Returns `true` if `c` belongs to this subset.
    ///
    /// Surrogates (`U+D800..=U+DFFF`) are rejected, and so is every value
    /// above `U+10FFFF`, since those are not Unicode code points at all.
    pub const fn contains(c: u32) -> bool {
        matches!(c, 0x0..=0xd7ff | 0xe000..=0x10ffff)
    }
}
95
/// Unicode code points that excludes surrogates, legacy C0 controls, and the
/// noncharacters U+FFFE and U+FFFF. As specified by the XML 1.0 specification.
pub struct XmlCharacters {}

impl XmlCharacters {
    /// Returns `true` if `c` belongs to this subset.
    ///
    /// The accepted ranges are `U+0009`, `U+000A`, `U+000D`,
    /// `U+0020..=U+D7FF`, `U+E000..=U+FFFD` and `U+10000..=U+10FFFF`.
    /// Values above `U+10FFFF` are not Unicode code points and are rejected.
    pub const fn contains(c: u32) -> bool {
        matches!(
            c,
            0x9 | 0xa | 0xd                 // tab, newline, carriage return
            | 0x20..=0xd7ff                 // skips the remaining C0 controls
            | 0xe000..=0xfffd               // skips surrogates, U+FFFE, U+FFFF
            | 0x10000..=0x10ffff            // supplementary planes
        )
    }
}
108
/// Unicode code points that are not problematic. As specified by RFC9839.
pub struct UnicodeAssignables {}

impl UnicodeAssignables {
    /// Returns `true` if `c` belongs to this subset.
    ///
    /// Rejected are the C0 controls other than tab, newline and carriage
    /// return, DEL (`U+007F`), the C1 controls, the surrogates, the
    /// noncharacters, and every value above `U+10FFFF` (those are not
    /// Unicode code points).
    pub const fn contains(c: u32) -> bool {
        match c {
            // Useful controls: tab, newline, carriage return.
            0x9 | 0xa | 0xd => true,
            // Printable ASCII; stops short of DEL and the C1 controls.
            0x20..=0x7e => true,
            // Up to, but not including, the surrogates.
            0xa0..=0xd7ff => true,
            // Rest of the BMP minus U+FDD0..=U+FDEF, U+FFFE and U+FFFF.
            0xe000..=0xfdcf | 0xfdf0..=0xfffd => true,
            // Supplementary planes minus the last two code points of each
            // plane (low 16 bits FFFE or FFFF).
            0x10000..=0x10ffff => (c & 0xfffe) != 0xfffe,
            _ => false,
        }
    }
}
122
#[cfg(test)]
mod test {
    use super::*;
    use core::ops::RangeInclusive;

    /// Exhaustively checks `p` against every value in `0..=char::MAX`.
    ///
    /// `ranges` lists, in ascending order and without overlap, the inclusive
    /// ranges for which `p` must return `true`; for every other value up to
    /// `char::MAX` it must return `false`. Failure messages carry the
    /// location of the calling test.
    #[track_caller]
    fn assert_predicate(p: fn(u32) -> bool, ranges: &[RangeInclusive<u32>]) {
        let caller = core::panic::Location::caller();
        // First value not yet verified; everything below it has been checked.
        let mut cursor = 0;
        for included in ranges {
            // Gap between the previous range and this one must be excluded.
            for gap in cursor..*included.start() {
                assert!(
                    !p(gap),
                    "{}: {:x} should not be included but is",
                    caller,
                    gap
                );
            }
            for member in included.clone() {
                assert!(
                    p(member),
                    "{}: {:x} should be included but isn't",
                    caller,
                    member
                );
            }
            cursor = *included.end() + 1;
        }
        // Tail after the last range, up to and including `char::MAX`.
        for tail in cursor..=(char::MAX as u32) {
            assert!(
                !p(tail),
                "{}: {:x} should not be included but is",
                caller,
                tail
            );
        }
    }

    /// `UnicodeScalars` must accept everything except the surrogates.
    #[test]
    fn test_scalars() {
        let expected = [0x0..=0xd7ff, 0xe000..=0x10ffff];
        assert_predicate(UnicodeScalars::contains, &expected);
    }

    /// `XmlCharacters` must accept exactly the ranges listed here.
    #[test]
    fn test_xml() {
        let expected = [
            0x9_u32..=0x9,
            0xa..=0xa,
            0xd..=0xd,
            0x20..=0xd7ff,
            0xe000..=0xfffd,
            0x10000..=0x10ffff,
        ];
        assert_predicate(XmlCharacters::contains, &expected);
    }

    /// `UnicodeAssignables` must accept exactly the ranges listed here.
    #[test]
    fn test_assignable() {
        let expected = [
            0x9_u32..=0x9,
            0xa..=0xa,
            0xd..=0xd,
            0x20..=0x7e,
            0xa0..=0xd7ff,
            0xe000..=0xfdcf,
            0xfdf0..=0xfffd,
            0x10000..=0x1fffd,
            0x20000..=0x2fffd,
            0x30000..=0x3fffd,
            0x40000..=0x4fffd,
            0x50000..=0x5fffd,
            0x60000..=0x6fffd,
            0x70000..=0x7fffd,
            0x80000..=0x8fffd,
            0x90000..=0x9fffd,
            0xa0000..=0xafffd,
            0xb0000..=0xbfffd,
            0xc0000..=0xcfffd,
            0xd0000..=0xdfffd,
            0xe0000..=0xefffd,
            0xf0000..=0xffffd,
            0x100000..=0x10fffd,
        ];
        assert_predicate(UnicodeAssignables::contains, &expected);
    }
}