/// Returns `true` when `ch` is in the `c-printable` set (YAML 1.2 production name).
///
/// Accepted characters are tab, line feed, carriage return, ASCII space
/// through `~`, NEL (U+0085), U+00A0–U+D7FF, U+E000–U+FFFD and
/// U+10000–U+10FFFF. Everything else — the other C0 controls, DEL, the C1
/// controls other than NEL, U+FFFE and U+FFFF — is rejected.
pub const fn is_c_printable(ch: char) -> bool {
    let ascii_ok = matches!(ch, '\t' | '\n' | '\r' | ' '..='~');
    let bmp_ok = matches!(
        ch,
        '\u{85}' | '\u{A0}'..='\u{D7FF}' | '\u{E000}'..='\u{FFFD}'
    );
    let supplementary_ok = matches!(ch, '\u{10000}'..='\u{10FFFF}');
    ascii_ok || bmp_ok || supplementary_ok
}
/// Returns `true` when `ch` is in the `nb-json` set: tab, or any character at
/// or above U+0020.
///
/// Only compiled for test builds.
#[cfg(test)]
pub const fn is_nb_json(ch: char) -> bool {
    // `char` tops out at U+10FFFF, so a lower bound is all that is needed.
    ch == '\t' || ch >= ' '
}
/// Scans `bytes` for the first character this scanner treats as non-printable
/// and returns its byte offset together with the character.
///
/// Reported characters:
/// - every C0 control byte (`0x00..=0x1F`) except tab — this includes LF and CR
/// - DEL (U+007F)
/// - the C1 controls U+0080–U+009F, except NEL (U+0085)
/// - U+FFFE and U+FFFF
///
/// Returns `None` when none of these occur.
///
/// NOTE(review): [`is_c_printable`] accepts `'\n'` and `'\r'`, whereas this
/// scanner reports them; presumably callers pass content with line breaks
/// already removed — confirm before relying on either behaviour.
///
/// `bytes` is expected to hold UTF-8. Malformed input is tolerated: no
/// `unsafe` decoding is performed, bytes that do not start one of the
/// sequences above are stepped over, and an ASCII byte is never skipped.
#[expect(
    clippy::indexing_slicing,
    reason = "every index and slice start is guarded by an i < bytes.len() check"
)]
pub fn find_non_c_printable(bytes: &[u8]) -> Option<(usize, char)> {
    let mut i = 0;
    while i < bytes.len() {
        let b = bytes[i];
        match b {
            // Tab and printable ASCII: accepted, one byte wide.
            0x09 | 0x20..=0x7E => {
                i += 1;
                continue;
            }
            // Remaining C0 controls and DEL: a single byte equal to its code point.
            0x00..=0x1F | 0x7F => return Some((i, char::from(b))),
            // Non-ASCII byte: inspected below.
            _ => {}
        }

        match bytes[i..] {
            // U+0080..=U+009F encode as `C2 80`..=`C2 9F`; the second byte equals
            // the code point. NEL (`C2 85`) is the one permitted C1 control.
            [0xC2, b2 @ 0x80..=0x9F, ..] if b2 != 0x85 => return Some((i, char::from(b2))),
            // U+FFFE and U+FFFF encode as `EF BF BE` and `EF BF BF`.
            [0xEF, 0xBF, 0xBE, ..] => return Some((i, '\u{FFFE}')),
            [0xEF, 0xBF, 0xBF, ..] => return Some((i, '\u{FFFF}')),
            _ => {}
        }

        // Step over the lead byte and any continuation bytes (`10xxxxxx`).
        // For well-formed UTF-8 this advances by exactly one character.
        i += 1;
        while i < bytes.len() && (bytes[i] & 0xC0) == 0x80 {
            i += 1;
        }
    }
    None
}
/// Scans `bytes` for the first character outside the `nb-json` set and returns
/// its byte offset together with the character.
///
/// The only rejected characters are the C0 control bytes (`0x00..=0x1F`) other
/// than tab; every byte at or above `0x20` is accepted. Returns `None` when no
/// rejected byte is present.
///
/// All rejected characters are single ASCII bytes, and bytes below `0x80` never
/// occur inside a multi-byte UTF-8 sequence, so a plain byte scan is sufficient.
/// No decoding is needed, which also makes the function well-defined for input
/// that is not valid UTF-8.
pub fn find_non_nb_json(bytes: &[u8]) -> Option<(usize, char)> {
    bytes
        .iter()
        .copied()
        .enumerate()
        .find(|&(_, b)| b < 0x20 && b != 0x09)
        .map(|(offset, b)| (offset, char::from(b)))
}
/// Builds the diagnostic text for a non-printable character found in `context`.
///
/// The character is rendered as `U+XXXX`: upper-case hexadecimal, zero-padded
/// to at least four digits. `context` is inserted verbatim at the end of the
/// message.
#[must_use]
pub fn non_printable_error_message(ch: char, context: &str) -> String {
    let code_point = u32::from(ch);
    format!("non-printable character U+{code_point:04X} is not allowed in {context}")
}
/// Returns `true` when `ch` is one of the 19 `c-indicator` characters:
/// ``- ? : , [ ] { } # & * ! | > ' " % @ ` ``.
pub const fn is_c_indicator(ch: char) -> bool {
    // Alternatives are listed in ascending code-point order.
    matches!(
        ch,
        '!' | '"' | '#' | '%' | '&' | '\'' | '*' | ',' | '-' | ':'
            | '>' | '?' | '@' | '[' | ']' | '`' | '{' | '|' | '}'
    )
}
/// Returns `true` when `ch` is one of the five `c-flow-indicator` characters:
/// `,`, `[`, `]`, `{` or `}`.
pub const fn is_c_flow_indicator(ch: char) -> bool {
    let is_bracket_or_brace = matches!(ch, '[' | ']' | '{' | '}');
    is_bracket_or_brace || ch == ','
}
/// Returns `true` when `ch` is an `ns-char`: a printable character that is not
/// white space, not a line break and not the byte order mark.
///
/// Space, tab, LF, CR and U+FEFF are rejected outright. Otherwise the
/// character must be `!` through `~`, NEL (U+0085), or fall in
/// U+00A0–U+D7FF, U+E000–U+FFFD or U+10000–U+10FFFF.
pub const fn is_ns_char(ch: char) -> bool {
    if matches!(ch, ' ' | '\t' | '\n' | '\r' | '\u{FEFF}') {
        return false;
    }
    matches!(
        ch,
        '!'..='~'
            | '\u{85}'
            | '\u{A0}'..='\u{D7FF}'
            | '\u{E000}'..='\u{FFFD}'
            | '\u{10000}'..='\u{10FFFF}'
    )
}
/// Returns `true` when `ch` on its own is a valid `ns-uri-char`.
///
/// Accepted: ASCII letters and digits, plus
/// `- _ . ! ~ * ' ( ) [ ] # ; / ? : @ & = + $ ,`.
///
/// `%` is not accepted here; presumably percent-escapes are validated
/// separately by the caller — confirm.
pub const fn is_ns_uri_char_single(ch: char) -> bool {
    if ch.is_ascii_alphanumeric() {
        return true;
    }
    matches!(
        ch,
        '#' | ';' | '/' | '?' | ':' | '@' | '&' | '=' | '+' | '$' | ','
            | '_' | '.' | '!' | '~' | '*' | '\'' | '(' | ')' | '[' | ']'
            | '-'
    )
}
/// Returns `true` when `ch` on its own is a valid `ns-tag-char`.
///
/// Accepted: ASCII letters and digits, plus
/// `- _ . ~ * ' ( ) # ; / ? : @ & = + $`.
///
/// Compared with [`is_ns_uri_char_single`], `!`, `,`, `[` and `]` are not
/// accepted.
pub const fn is_ns_tag_char_single(ch: char) -> bool {
    if ch.is_ascii_alphanumeric() {
        return true;
    }
    matches!(
        ch,
        '#' | ';' | '/' | '?' | ':' | '@' | '&' | '=' | '+' | '$'
            | '_' | '.' | '~' | '*' | '\'' | '(' | ')'
            | '-'
    )
}
/// Returns `true` when `ch` is an `ns-anchor-char`: a non-space printable
/// character that is not a flow indicator.
///
/// Rejected outright: space, tab, LF, CR, U+FEFF, and anything
/// [`is_c_flow_indicator`] accepts. Otherwise the character must be `!`
/// through `~`, NEL (U+0085), or fall in U+00A0–U+D7FF, U+E000–U+FFFD or
/// U+10000–U+10FFFF.
pub const fn is_ns_anchor_char(ch: char) -> bool {
    if is_c_flow_indicator(ch) || matches!(ch, ' ' | '\t' | '\n' | '\r' | '\u{FEFF}') {
        return false;
    }
    matches!(
        ch,
        '!'..='~'
            | '\u{85}'
            | '\u{A0}'..='\u{D7FF}'
            | '\u{E000}'..='\u{FFFD}'
            | '\u{10000}'..='\u{10FFFF}'
    )
}
/// Decodes one escape sequence whose escape code is the first character of
/// `input` (presumably the text immediately following the backslash).
///
/// Returns the decoded character and the number of bytes of `input` that were
/// consumed, or `None` when `input` is empty, the code is unknown, or a
/// numeric escape is malformed.
///
/// Supported codes:
/// - `0 a b t n v f r e` and a literal tab — the corresponding control characters
/// - space, `"`, `/`, `\` — the character itself
/// - `N` (U+0085), `_` (U+00A0), `L` (U+2028), `P` (U+2029)
/// - `x`, `u`, `U` — followed by exactly 2, 4 or 8 hexadecimal digits
pub fn decode_escape(input: &str) -> Option<(char, usize)> {
    let code = input.chars().next()?;
    let decoded = match code {
        // Numeric escapes consume the code character plus a fixed number of digits.
        'x' => return decode_hex_escape(input, 1, 2),
        'u' => return decode_hex_escape(input, 1, 4),
        'U' => return decode_hex_escape(input, 1, 8),

        // Escapes that stand for themselves.
        ' ' | '"' | '/' | '\\' => code,

        // Control characters.
        '0' => '\x00',
        'a' => '\x07',
        'b' => '\x08',
        't' | '\t' => '\t',
        'n' => '\n',
        'v' => '\x0B',
        'f' => '\x0C',
        'r' => '\r',
        'e' => '\x1B',

        // Non-ASCII line breaks and the non-breaking space.
        'N' => '\u{85}',
        '_' => '\u{A0}',
        'L' => '\u{2028}',
        'P' => '\u{2029}',

        _ => return None,
    };
    // Every single-character escape code above is one byte long.
    Some((decoded, 1))
}
/// Parses exactly `digit_count` hexadecimal digits of `input` beginning at byte
/// offset `start` and converts the value to a `char`.
///
/// On success returns the character and `start + digit_count`, the byte
/// offset just past the digits. Returns `None` when `input` is too short, the
/// range does not fall on character boundaries, any character in the range is
/// not an ASCII hex digit, the value does not fit in a `u32`, or the value is
/// not a valid Unicode scalar value (a surrogate, or above U+10FFFF).
fn decode_hex_escape(input: &str, start: usize, digit_count: usize) -> Option<(char, usize)> {
    let end = start.checked_add(digit_count)?;
    let digits = input.get(start..end)?;
    // Checked explicitly because `from_str_radix` would also accept a leading `+`.
    if !digits.bytes().all(|b| b.is_ascii_hexdigit()) {
        return None;
    }
    u32::from_str_radix(digits, 16)
        .ok()
        .and_then(char::from_u32)
        .map(|ch| (ch, end))
}
// Unit tests for the character-class predicates, the byte scanners and the
// escape decoder defined in the parent module. Table-style cases use `rstest`;
// the subset relations between classes are checked with `proptest`.
#[cfg(test)]
mod tests {
use proptest::prelude::*;
use rstest::rstest;
use super::*;
// ---- is_c_printable ----
#[rstest]
#[case::tab('\t')]
#[case::lf('\n')]
#[case::cr('\r')]
#[case::space(' ')]
#[case::tilde('~')]
#[case::ascii_letter('M')]
#[case::nel('\u{85}')]
#[case::non_breaking_space('\u{A0}')]
fn c_printable_accepts(#[case] ch: char) {
assert!(is_c_printable(ch));
}
#[rstest]
#[case::null('\x00')]
#[case::del('\x7F')]
#[case::soh('\x01')]
#[case::bs('\x08')]
#[case::vt('\x0B')]
#[case::ff('\x0C')]
#[case::so('\x0E')]
#[case::us('\x1F')]
#[case::fffe('\u{FFFE}')]
#[case::ffff('\u{FFFF}')]
fn c_printable_rejects(#[case] ch: char) {
assert!(!is_c_printable(ch));
}
// Edges of the accepted ranges on either side of the surrogate gap, plus the
// last accepted BMP code point.
#[rstest]
#[case::last_before_surrogates('\u{D7FF}')] #[case::first_after_surrogates('\u{E000}')] #[case::fffd('\u{FFFD}')] fn c_printable_accepts_boundary_codepoints(#[case] ch: char) {
assert!(is_c_printable(ch));
}
// U+FFFE is the first code point past the accepted U+E000..=U+FFFD range.
#[test]
fn c_printable_rejects_fffe_boundary() {
assert!(!is_c_printable('\u{FFFE}'));
}
// A literal tab as the escape code decodes to tab, consuming one byte.
#[test]
fn decode_escape_literal_tab_returns_tab() {
assert_eq!(decode_escape("\t"), Some(('\t', 1)));
}
// ---- is_ns_char: supplementary-plane boundaries ----
#[test]
fn ns_char_accepts_first_supplementary_plane_codepoint() {
assert!(is_ns_char('\u{10000}'));
}
#[test]
fn ns_char_accepts_last_unicode_codepoint() {
assert!(is_ns_char('\u{10FFFF}'));
}
// ---- is_c_indicator ----
// NOTE(review): the test name says 21, but the array below lists 19
// characters, which is also the number of alternatives in `is_c_indicator`.
#[test]
fn c_indicator_accepts_all_21_indicator_chars() {
let indicators = [
'-', '?', ':', ',', '[', ']', '{', '}', '#', '&', '*', '!', '|', '>', '\'', '"', '%',
'@', '`',
];
for ch in indicators {
assert!(is_c_indicator(ch), "should accept {ch:?}");
}
}
#[rstest]
#[case::lowercase_letter('a')]
#[case::digit('0')]
#[case::space(' ')]
fn c_indicator_rejects(#[case] ch: char) {
assert!(!is_c_indicator(ch));
}
// ---- is_c_flow_indicator ----
#[test]
fn c_flow_indicator_accepts_exactly_five_chars() {
for ch in [',', '[', ']', '{', '}'] {
assert!(is_c_flow_indicator(ch), "should accept {ch:?}");
}
}
// The indicators from the `is_c_indicator` list that are not flow indicators.
#[test]
fn c_flow_indicator_rejects_non_flow_indicators() {
for ch in [
'-', '?', ':', '#', '&', '*', '!', '|', '>', '\'', '"', '%', '@', '`',
] {
assert!(!is_c_flow_indicator(ch), "should reject {ch:?}");
}
}
// ---- is_ns_char ----
#[rstest]
#[case::lowercase_letter('a')]
#[case::exclamation('!')]
#[case::cjk_ideograph('\u{4E2D}')]
fn ns_char_accepts(#[case] ch: char) {
assert!(is_ns_char(ch));
}
#[rstest]
#[case::space(' ')]
#[case::tab('\t')]
#[case::lf('\n')]
#[case::cr('\r')]
fn ns_char_rejects(#[case] ch: char) {
assert!(!is_ns_char(ch));
}
// ---- is_ns_anchor_char ----
#[rstest]
#[case::lowercase_letter('a')]
#[case::hyphen('-')]
#[case::colon(':')]
fn ns_anchor_char_accepts(#[case] ch: char) {
assert!(is_ns_anchor_char(ch));
}
#[test]
fn ns_anchor_char_rejects_flow_indicators() {
for ch in [',', '[', ']', '{', '}'] {
assert!(!is_ns_anchor_char(ch), "should reject {ch:?}");
}
}
#[rstest]
#[case::space(' ')]
#[case::tab('\t')]
#[case::bom('\u{FEFF}')]
fn ns_anchor_char_rejects(#[case] ch: char) {
assert!(!is_ns_anchor_char(ch));
}
// ---- is_ns_tag_char_single / is_ns_uri_char_single ----
#[rstest]
#[case::comma(',')]
#[case::open_bracket('[')]
#[case::close_bracket(']')]
#[case::open_brace('{')]
#[case::close_brace('}')]
fn ns_tag_char_rejects_flow_indicators(#[case] ch: char) {
assert!(!is_ns_tag_char_single(ch));
}
#[rstest]
#[case::lowercase_letter('a')]
#[case::hyphen('-')]
#[case::digit('9')]
#[case::colon(':')]
fn ns_tag_char_accepts(#[case] ch: char) {
assert!(is_ns_tag_char_single(ch));
}
// `!` is accepted as a URI character but not as a tag character.
#[test]
fn ns_uri_char_accepts_exclamation_but_tag_char_does_not() {
assert!(is_ns_uri_char_single('!'));
assert!(!is_ns_tag_char_single('!'));
}
// ---- decode_escape ----
// Each case: input starting at the escape code, expected character, expected
// number of bytes consumed.
#[rstest]
#[case::null_escape("0", '\x00', 1)]
#[case::newline_escape("n", '\n', 1)]
#[case::tab_escape("t", '\t', 1)]
#[case::backslash_escape("\\", '\\', 1)]
#[case::nel_escape("N", '\u{85}', 1)]
#[case::nbsp_escape("_", '\u{A0}', 1)]
#[case::line_sep_escape("L", '\u{2028}', 1)]
#[case::para_sep_escape("P", '\u{2029}', 1)]
#[case::hex_2digit("x41", 'A', 3)]
#[case::hex_4digit("u0041", 'A', 5)]
#[case::hex_8digit("U00000041", 'A', 9)]
#[case::high_plane_codepoint("U0001F600", '\u{1F600}', 9)]
fn decode_escape_success(
#[case] input: &str,
#[case] expected_char: char,
#[case] expected_len: usize,
) {
assert_eq!(decode_escape(input), Some((expected_char, expected_len)));
}
// Unknown codes, too few or non-hex digits, and values that are not Unicode
// scalar values all yield `None`.
#[rstest]
#[case::unknown_code("q")]
#[case::truncated_hex("x4")]
#[case::non_hex_digits("xGG")]
#[case::surrogate_codepoint("uD800")]
#[case::out_of_range_codepoint("U00110000")]
fn decode_escape_rejects(#[case] input: &str) {
assert_eq!(decode_escape(input), None);
}
// ---- is_nb_json ----
// Includes DEL, C1 controls and U+FFFE/U+FFFF, which `is_c_printable` rejects.
#[rstest]
#[case::tab('\t')] #[case::space(' ')] #[case::printable_ascii('A')] #[case::del('\x7F')] #[case::c1_control_0x80('\u{80}')] #[case::c1_control_0x9f('\u{9F}')] #[case::fffe('\u{FFFE}')] #[case::ffff('\u{FFFF}')] #[case::supplementary('\u{1F600}')] fn nb_json_accepts(#[case] ch: char) {
assert!(is_nb_json(ch));
}
// C0 controls other than tab, including LF and CR.
#[rstest]
#[case::nul('\x00')]
#[case::soh('\x01')]
#[case::stx('\x02')]
#[case::etx('\x03')]
#[case::eot('\x04')]
#[case::enq('\x05')]
#[case::ack('\x06')]
#[case::bel('\x07')]
#[case::bs('\x08')]
#[case::lf('\n')] #[case::vt('\x0B')]
#[case::ff('\x0C')]
#[case::cr('\r')] #[case::so('\x0E')]
#[case::us('\x1F')]
fn nb_json_rejects(#[case] ch: char) {
assert!(!is_nb_json(ch));
}
// ---- find_non_c_printable ----
// In every detection test the offending character follows "foo", so the
// expected byte offset is 3.
#[test]
fn find_non_c_printable_returns_none_for_clean_ascii() {
assert_eq!(find_non_c_printable(b"hello world"), None);
}
#[test]
fn find_non_c_printable_returns_none_for_tab() {
assert_eq!(find_non_c_printable(b"foo\tbar"), None);
}
#[test]
fn find_non_c_printable_detects_c0_control() {
let result = find_non_c_printable(b"foo\x01bar");
assert_eq!(result, Some((3, '\x01')));
}
#[test]
fn find_non_c_printable_detects_nul() {
let result = find_non_c_printable(b"foo\x00bar");
assert_eq!(result, Some((3, '\x00')));
}
#[test]
fn find_non_c_printable_detects_del() {
let result = find_non_c_printable(b"foo\x7Fbar");
assert_eq!(result, Some((3, '\x7F')));
}
#[test]
fn find_non_c_printable_detects_c1_control() {
let input = "foo\u{80}bar";
let result = find_non_c_printable(input.as_bytes());
assert_eq!(result, Some((3, '\u{80}')));
}
// NEL (U+0085) is the one C1 control that is accepted; its neighbours U+0084
// and U+0086 are rejected.
#[test]
fn find_non_c_printable_accepts_nel() {
let input = "foo\u{85}bar";
assert_eq!(find_non_c_printable(input.as_bytes()), None);
}
#[test]
fn find_non_c_printable_rejects_u0084() {
let input = "foo\u{84}bar";
let result = find_non_c_printable(input.as_bytes());
assert_eq!(result, Some((3, '\u{84}')));
}
#[test]
fn find_non_c_printable_rejects_u0086() {
let input = "foo\u{86}bar";
let result = find_non_c_printable(input.as_bytes());
assert_eq!(result, Some((3, '\u{86}')));
}
#[test]
fn find_non_c_printable_detects_fffe() {
let input = "foo\u{FFFE}bar";
let result = find_non_c_printable(input.as_bytes());
assert_eq!(result, Some((3, '\u{FFFE}')));
}
#[test]
fn find_non_c_printable_detects_ffff() {
let input = "foo\u{FFFF}bar";
let result = find_non_c_printable(input.as_bytes());
assert_eq!(result, Some((3, '\u{FFFF}')));
}
#[test]
fn find_non_c_printable_accepts_valid_bmp_multibyte() {
let input = "foo\u{4E2D}bar";
assert_eq!(find_non_c_printable(input.as_bytes()), None);
}
// ---- find_non_nb_json ----
// Only C0 controls other than tab are reported; DEL, C1 controls and U+FFFE
// pass.
#[test]
fn find_non_nb_json_returns_none_for_clean_ascii() {
assert_eq!(find_non_nb_json(b"hello world"), None);
}
#[test]
fn find_non_nb_json_returns_none_for_tab() {
assert_eq!(find_non_nb_json(b"foo\tbar"), None);
}
#[test]
fn find_non_nb_json_returns_none_for_del() {
assert_eq!(find_non_nb_json(b"foo\x7Fbar"), None);
}
#[test]
fn find_non_nb_json_returns_none_for_c1_control() {
let input = "foo\u{80}bar";
assert_eq!(find_non_nb_json(input.as_bytes()), None);
}
#[test]
fn find_non_nb_json_returns_none_for_fffe() {
let input = "foo\u{FFFE}bar";
assert_eq!(find_non_nb_json(input.as_bytes()), None);
}
#[test]
fn find_non_nb_json_detects_c0_control() {
let result = find_non_nb_json(b"foo\x01bar");
assert_eq!(result, Some((3, '\x01')));
}
#[test]
fn find_non_nb_json_detects_nul() {
let result = find_non_nb_json(b"foo\x00bar");
assert_eq!(result, Some((3, '\x00')));
}
// ---- property tests: subset relations between character classes ----
proptest! {
// Every character accepted as a tag character is also a URI character.
#[test]
fn ns_tag_char_implies_ns_uri_char(ch in proptest::char::any()) {
if is_ns_tag_char_single(ch) {
prop_assert!(
is_ns_uri_char_single(ch),
"ns-tag-char(U+{:04X}) -> ns-uri-char must hold",
u32::from(ch)
);
}
}
// Every character accepted as a URI character is also c-printable.
#[test]
fn ns_uri_char_implies_c_printable(ch in proptest::char::any()) {
if is_ns_uri_char_single(ch) {
prop_assert!(
is_c_printable(ch),
"ns-uri-char(U+{:04X}) -> c-printable must hold",
u32::from(ch)
);
}
}
}
}