#![expect(
clippy::panic,
clippy::unwrap_used,
clippy::expect_used,
clippy::indexing_slicing,
clippy::wildcard_enum_match_arm,
missing_docs,
reason = "test code"
)]
use proptest::prelude::*;
use rstest::rstest;
use rlsp_yaml_parser::encoding::{Encoding, EncodingError, decode, detect_encoding};
use rlsp_yaml_parser::{Event, parse_events};
/// Reports whether parsing `input` yields at least one error item.
///
/// The event stream is consumed only up to the first `Err`.
fn has_parse_error(input: &str) -> bool {
    for outcome in parse_events(input) {
        if outcome.is_err() {
            return true;
        }
    }
    false
}
/// Collects the value of every `Event::Scalar` produced while parsing `input`.
///
/// Error items in the event stream are skipped. Values are returned as owned
/// strings in the order the parser emitted them.
fn scalar_values(input: &str) -> Vec<String> {
    let mut scalars = Vec::new();
    for item in parse_events(input) {
        let Ok((event, _span)) = item else {
            continue;
        };
        match event {
            Event::Scalar { value, .. } => scalars.push(value.into_owned()),
            // Every other variant is listed explicitly (no wildcard arm) so that
            // adding a variant to `Event` forces this helper to be revisited.
            Event::StreamStart
            | Event::StreamEnd
            | Event::DocumentStart { .. }
            | Event::DocumentEnd { .. }
            | Event::MappingStart { .. }
            | Event::MappingEnd
            | Event::SequenceStart { .. }
            | Event::SequenceEnd
            | Event::Alias { .. }
            | Event::Comment { .. } => {}
        }
    }
    scalars
}
/// Each case is a byte sequence that `decode` is expected to reject with
/// `EncodingError::InvalidBytes`.
#[rstest]
#[case::lone_continuation(&[0x80u8] as &[u8])]
#[case::high_continuation(&[0xBFu8] as &[u8])]
#[case::incomplete_two_byte(&[0xC3u8] as &[u8])]
#[case::incomplete_three_byte(&[0xE2u8, 0x82] as &[u8])]
#[case::incomplete_four_byte(&[0xF0u8, 0x9F, 0x98] as &[u8])]
#[case::overlong_nul(&[0xC0u8, 0x80] as &[u8])]
#[case::invalid_0xfe(&[0xFEu8, b'x'] as &[u8])]
#[case::invalid_0xff(&[0xFFu8, b'x'] as &[u8])]
#[case::truncated_at_eof(b"hello\xC3" as &[u8])]
fn decode_invalid_bytes_returns_error(#[case] input: &[u8]) {
    let outcome = decode(input);
    assert_eq!(outcome, Err(EncodingError::InvalidBytes));
}
/// Valid multi-byte UTF-8 text must come back from `decode` unchanged.
#[rstest]
#[case::two_byte("café", "café")]
#[case::three_byte("中文", "中文")]
#[case::four_byte("\u{1F600}", "😀")]
#[case::arabic("\u{0639}\u{0631}\u{0628}\u{064A}", "\u{0639}\u{0631}\u{0628}\u{064A}")]
fn decode_valid_multibyte_roundtrip(#[case] input: &str, #[case] expected: &str) {
    let raw = input.as_bytes();
    let decoded = decode(raw).unwrap();
    assert_eq!(decoded, expected);
}
/// For each supported BOM, `decode` must return the text without the BOM.
#[rstest]
#[case::utf8_bom(&[0xEFu8, 0xBB, 0xBF, b'k', b'e', b'y'] as &[u8], "key")]
#[case::utf16_le_bom(&[0xFFu8, 0xFE, 0x68, 0x00, 0x69, 0x00] as &[u8], "hi")]
#[case::utf16_be_bom(&[0xFEu8, 0xFF, 0x00, 0x68, 0x00, 0x69] as &[u8], "hi")]
#[case::utf32_le_bom(&[0xFFu8, 0xFE, 0x00, 0x00, 0x41, 0x00, 0x00, 0x00] as &[u8], "A")]
#[case::utf32_be_bom(&[0x00u8, 0x00, 0xFE, 0xFF, 0x00, 0x00, 0x00, 0x41] as &[u8], "A")]
fn decode_bom_stripping(#[case] input: &[u8], #[case] expected: &str) {
    let decoded = decode(input).unwrap();
    assert_eq!(decoded, expected);
}
/// A UTF-16 LE BOM followed by a single stray byte must be reported as
/// truncated UTF-16.
#[test]
fn decode_rejects_truncated_utf16() {
    let input: &[u8] = &[0xFF, 0xFE, 0x68];
    assert_eq!(decode(input), Err(EncodingError::TruncatedUtf16));
}
/// A UTF-32 BE BOM followed by a single stray byte must be reported as
/// truncated UTF-32.
#[test]
fn decode_rejects_truncated_utf32() {
    let input: &[u8] = &[0x00, 0x00, 0xFE, 0xFF, 0x00];
    assert_eq!(decode(input), Err(EncodingError::TruncatedUtf32));
}
/// A UTF-16 BE BOM followed by the lone code unit `0xD800` must be rejected,
/// and the error must carry that code unit.
#[test]
fn decode_rejects_utf16_unpaired_surrogate() {
    let input: &[u8] = &[0xFE, 0xFF, 0xD8, 0x00];
    let outcome = decode(input);
    assert_eq!(outcome, Err(EncodingError::InvalidCodepoint(0xD800)));
}
/// A UTF-32 BE BOM followed by the value `0x0011_0000` must be rejected, and
/// the error must carry that value.
#[test]
fn decode_rejects_utf32_out_of_range_codepoint() {
    let input: &[u8] = &[0x00, 0x00, 0xFE, 0xFF, 0x00, 0x11, 0x00, 0x00];
    let outcome = decode(input);
    assert_eq!(outcome, Err(EncodingError::InvalidCodepoint(0x0011_0000)));
}
/// A NUL character in a plain scalar, in a comment, or on its own line must
/// make the parser report an error.
#[rstest]
#[case::plain_scalar("key: val\0ue\n")]
#[case::comment("key: value # comment\0here\n")]
#[case::standalone("\0\n")]
fn parse_events_nul_produces_error(#[case] input: &str) {
    let rejected = has_parse_error(input);
    assert!(rejected);
}
/// A NUL character inside a double-quoted scalar must make the parser report
/// an error.
#[test]
fn parse_events_rejects_nul_in_double_quoted_scalar() {
    let input = "key: \"val\0ue\"\n";
    assert!(has_parse_error(input));
}
/// A single BOM at the very start of the stream parses without error and the
/// scalar after it is still reported.
#[test]
fn parse_events_accepts_bom_at_stream_start() {
    let input = "\u{FEFF}key: value\n";
    let parsed_cleanly = !has_parse_error(input);
    assert!(parsed_cleanly);

    let values = scalar_values(input);
    let found = values.iter().any(|v| v == "value");
    assert!(found, "expected scalar 'value', got: {values:?}");
}
/// A BOM in the middle of a plain scalar must make the parser report an error.
#[test]
fn parse_events_rejects_bom_mid_stream() {
    let input = "key: val\u{FEFF}ue\n";
    assert!(has_parse_error(input));
}
/// A BOM on the line directly after a `...` marker is accepted, and scalars
/// from both documents are reported.
#[test]
fn parse_events_accepts_bom_immediately_after_document_end_marker() {
    let input = "key: a\n...\n\u{FEFF}key: b\n";
    let accepted = !has_parse_error(input);
    assert!(accepted, "BOM immediately after '...' must be accepted");

    let values = scalar_values(input);
    let has_first = values.iter().any(|v| v == "a");
    assert!(has_first, "first doc scalar 'a' present");
    let has_second = values.iter().any(|v| v == "b");
    assert!(has_second, "second doc scalar 'b' present");
}
/// A BOM that follows a `...` marker and a blank line is accepted, and scalars
/// from both documents are reported.
#[test]
fn parse_events_accepts_bom_after_doc_end_then_blank_lines() {
    let input = "key: a\n...\n\n\u{FEFF}key: b\n";
    let accepted = !has_parse_error(input);
    assert!(accepted, "BOM after blank line after '...' must be accepted");

    let values = scalar_values(input);
    for expected in ["a", "b"] {
        assert!(values.iter().any(|v| v == expected));
    }
}
/// A BOM that follows a `...` marker and a comment line is accepted; both
/// documents' scalars and the comment itself are reported.
#[test]
fn parse_events_accepts_bom_after_doc_end_then_comment() {
    let input = "key: a\n...\n# comment\n\u{FEFF}key: b\n";
    let accepted = !has_parse_error(input);
    assert!(accepted, "BOM after comment after '...' must be accepted");

    let values = scalar_values(input);
    for expected in ["a", "b"] {
        assert!(values.iter().any(|v| v == expected));
    }

    // The comment between the documents must surface as its own event.
    let mut ok_events = parse_events(input).filter_map(Result::ok);
    let has_comment = ok_events
        .any(|(event, _)| matches!(event, Event::Comment { text } if text.trim() == "comment"));
    assert!(has_comment, "expected Comment event with text 'comment'");
}
/// Three documents, each introduced by its own BOM, parse without error and
/// every document's scalar value is reported.
#[test]
fn parse_events_accepts_multiple_docs_each_with_bom() {
    let input = "\u{FEFF}a: 1\n...\n\u{FEFF}b: 2\n...\n\u{FEFF}c: 3\n";
    let accepted = !has_parse_error(input);
    assert!(accepted, "multiple docs each with BOM must be accepted");

    let values = scalar_values(input);
    for expected in ["1", "2", "3"] {
        assert!(values.iter().any(|v| v == expected));
    }
}
/// Regression guard: a BOM at the start of the stream remains accepted and the
/// scalar after it is still reported.
#[test]
fn parse_events_bom_at_stream_start_still_accepted() {
    let input = "\u{FEFF}key: value\n";
    let parsed_cleanly = !has_parse_error(input);
    assert!(parsed_cleanly);

    let values = scalar_values(input);
    assert!(values.iter().any(|v| v == "value"));
}
/// Regression guard: a BOM in the middle of a plain scalar is still a parse
/// error.
#[test]
fn parse_events_rejects_bom_mid_scalar_regression() {
    let input = "key: val\u{FEFF}ue\n";
    assert!(has_parse_error(input));
}
/// Loading two documents separated by `...` and a BOM yields two documents,
/// each a single-entry mapping from `key` to `a` and `b` respectively.
#[test]
fn load_multidoc_with_bom_between_docs_produces_correct_ast() {
    use rlsp_yaml_parser::Node;

    let input = "key: a\n...\n\u{FEFF}key: b\n";
    let docs = rlsp_yaml_parser::load(input).expect("load must succeed");
    assert_eq!(docs.len(), 2, "expected two documents");

    // Both documents have the same shape; only the mapped value differs.
    for (index, expected_value) in [(0usize, "a"), (1usize, "b")] {
        match &docs[index].root {
            Node::Mapping { entries, .. } => {
                assert_eq!(entries.len(), 1);
                let (k, v) = &entries[0];
                assert!(matches!(k, Node::Scalar { value, .. } if value == "key"));
                assert!(matches!(v, Node::Scalar { value, .. } if value == expected_value));
            }
            other => panic!("expected mapping, got {other:?}"),
        }
    }
}
/// A BOM on the line after a `---` marker must make the parser report an error.
#[test]
fn parse_events_bom_after_directives_end_marker_is_error() {
    let input = "key: a\n...\n---\n\u{FEFF}key: b\n";
    let rejected = has_parse_error(input);
    assert!(
        rejected,
        "BOM after '---' is inside the document body and must produce a parse error"
    );
}
/// Two consecutive BOMs after a `...` marker must make the parser report an
/// error.
#[test]
fn parse_events_rejects_double_bom_at_document_prefix() {
    let input = "key: a\n...\n\u{FEFF}\u{FEFF}key: b\n";
    let rejected = has_parse_error(input);
    assert!(
        rejected,
        "double BOM at document prefix must produce a parse error"
    );
}
/// Two consecutive BOMs at the start of the stream must make the parser report
/// an error.
#[test]
fn parse_events_rejects_double_bom_at_stream_start() {
    let input = "\u{FEFF}\u{FEFF}key: v\n";
    let rejected = has_parse_error(input);
    assert!(
        rejected,
        "double BOM at stream start must produce a parse error"
    );
}
/// Regression guard: exactly one BOM at the start of the stream is accepted
/// and the scalar after it is still reported.
#[test]
fn parse_events_accepts_single_bom_at_stream_start_regression() {
    let input = "\u{FEFF}key: v\n";
    let accepted = !has_parse_error(input);
    assert!(accepted, "single BOM at stream start must be accepted");

    let values = scalar_values(input);
    let found = values.iter().any(|v| v == "v");
    assert!(found, "expected scalar 'v', got: {values:?}");
}
/// Regression guard: two consecutive BOMs between documents are still a parse
/// error.
#[test]
fn parse_events_rejects_double_bom_at_inter_doc_regression() {
    let input = "key: a\n...\n\u{FEFF}\u{FEFF}key: b\n";
    let rejected = has_parse_error(input);
    assert!(
        rejected,
        "double BOM at inter-document prefix must produce a parse error"
    );
}
/// An emoji (U+1F600) inside a double-quoted scalar parses without error and
/// is preserved in the scalar value.
#[test]
fn parse_events_accepts_emoji_in_double_quoted_scalar() {
    let input = "greeting: \"hello\u{1F600}\"\n";
    let parsed_cleanly = !has_parse_error(input);
    assert!(parsed_cleanly);

    let values = scalar_values(input);
    let found = values.iter().any(|v| v == "hello😀");
    assert!(found, "expected scalar with emoji, got: {values:?}");
}
/// CJK text in a plain scalar parses without error and is preserved in the
/// scalar value.
#[test]
fn parse_events_accepts_cjk_in_plain_scalar() {
    let input = "title: 中文\n";
    let parsed_cleanly = !has_parse_error(input);
    assert!(parsed_cleanly);

    let values = scalar_values(input);
    let found = values.iter().any(|v| v == "中文");
    assert!(found, "expected scalar '中文', got: {values:?}");
}
/// Arabic text used as a mapping key parses without error and is reported as a
/// scalar value.
#[test]
fn parse_events_accepts_arabic_in_mapping_key() {
    let arabic = "\u{0639}\u{0631}\u{0628}\u{064A}";
    let input = format!("{arabic}: value\n");
    let parsed_cleanly = !has_parse_error(&input);
    assert!(parsed_cleanly);

    let values = scalar_values(&input);
    let found = values.iter().any(|v| v == arabic);
    assert!(found, "expected Arabic key scalar, got: {values:?}");
}
/// Input starting with the UTF-32 BE BOM must be detected as UTF-32 BE, and
/// the code unit after the BOM must decode to a line feed.
#[test]
fn utf32_be_bom_takes_priority_over_utf16_be_prefix() {
    let input: &[u8] = &[0x00, 0x00, 0xFE, 0xFF, 0x00, 0x00, 0x00, 0x0A];

    let detected = detect_encoding(input);
    assert_eq!(detected, Encoding::Utf32Be);

    let decoded = decode(input).unwrap();
    assert_eq!(decoded, "\n");
}
/// Table-driven check of `detect_encoding`: every encoding with a BOM, every
/// UTF-16/UTF-32 encoding without one, and the UTF-8 default.
#[rstest]
#[case::utf32_be_with_bom(&[0x00u8, 0x00, 0xFE, 0xFF], Encoding::Utf32Be)]
#[case::utf32_le_with_bom(&[0xFFu8, 0xFE, 0x00, 0x00], Encoding::Utf32Le)]
#[case::utf16_be_with_bom(&[0xFEu8, 0xFF, 0x00, 0x41], Encoding::Utf16Be)]
#[case::utf16_le_with_bom(&[0xFFu8, 0xFE, 0x41, 0x00], Encoding::Utf16Le)]
#[case::utf8_with_bom(&[0xEFu8, 0xBB, 0xBF, 0x41], Encoding::Utf8)]
#[case::utf32_be_no_bom(&[0x00u8, 0x00, 0x00, 0x41], Encoding::Utf32Be)]
#[case::utf32_le_no_bom(&[0x41u8, 0x00, 0x00, 0x00], Encoding::Utf32Le)]
#[case::utf16_be_no_bom(&[0x00u8, 0x41, 0x00, 0x42], Encoding::Utf16Be)]
#[case::utf16_le_no_bom(&[0x41u8, 0x00, 0x42, 0x00], Encoding::Utf16Le)]
#[case::utf8_default(&[0x41u8, 0x42, 0x43, 0x44], Encoding::Utf8)]
fn detect_encoding_covers_all_spec_rows(#[case] bytes: &[u8], #[case] expected: Encoding) {
    let detected = detect_encoding(bytes);
    assert_eq!(detected, expected);
}
/// Widens every input byte to a 32-bit code unit and serialises it in the
/// requested byte order.
///
/// Each byte is converted with `u32::from`, so the output is four bytes per
/// input byte: big-endian when `big_endian` is true, little-endian otherwise.
fn encode_ascii_as_utf32(bytes: &[u8], big_endian: bool) -> Vec<u8> {
    bytes
        .iter()
        .flat_map(|&byte| {
            let code_point = u32::from(byte);
            if big_endian {
                code_point.to_be_bytes()
            } else {
                code_point.to_le_bytes()
            }
        })
        .collect()
}
/// Widens every input byte to a 16-bit code unit by pairing it with a zero
/// byte.
///
/// The zero byte comes first when `big_endian` is true and second otherwise,
/// so the output is two bytes per input byte.
fn encode_ascii_as_utf16(bytes: &[u8], big_endian: bool) -> Vec<u8> {
    bytes
        .iter()
        .flat_map(|&byte| {
            if big_endian {
                [0x00, byte]
            } else {
                [byte, 0x00]
            }
        })
        .collect()
}
/// Returns `payload` prefixed with the byte-order mark for `encoding`.
///
/// `Encoding::Utf8` maps to an empty prefix, so the payload is returned
/// unchanged in that case.
fn prepend_bom(encoding: Encoding, payload: &[u8]) -> Vec<u8> {
    let bom: &[u8] = match encoding {
        Encoding::Utf8 => &[],
        Encoding::Utf16Le => &[0xFF, 0xFE],
        Encoding::Utf16Be => &[0xFE, 0xFF],
        Encoding::Utf32Le => &[0xFF, 0xFE, 0x00, 0x00],
        Encoding::Utf32Be => &[0x00, 0x00, 0xFE, 0xFF],
    };
    [bom, payload].concat()
}
/// Input consisting solely of the three-byte UTF-8 BOM is detected as UTF-8
/// and decodes to the empty string.
#[test]
fn decode_utf8_bom_only_three_bytes_detected_as_utf8() {
    let input: &[u8] = &[0xEF, 0xBB, 0xBF];

    let detected = detect_encoding(input);
    assert_eq!(detected, Encoding::Utf8);

    let decoded = decode(input).unwrap();
    assert_eq!(decoded, "");
}
/// BOM-less input detected as UTF-16 LE whose length is odd must be rejected
/// as truncated UTF-16.
#[test]
fn decode_bomless_utf16_le_odd_length_returns_truncated_utf16() {
    // `EncodingError` comes from the file-level import; no local `use` is needed.
    let input: &[u8] = &[0x41, 0x00, 0x42];
    assert_eq!(detect_encoding(input), Encoding::Utf16Le);
    assert_eq!(decode(input), Err(EncodingError::TruncatedUtf16));
}
proptest! {
    // Property: for any char drawn from `proptest::char::range('\u{0080}', '\u{07FF}')`
    // placed inside a double-quoted scalar, parsing yields no error and at
    // least one scalar event contains that char.
    #[test]
    fn encoding_choice_invariant_for_nonascii_utf8_scalars(
        ch in proptest::char::range('\u{0080}', '\u{07FF}')
    ) {
        let yaml = format!("key: \"{ch}\"\n");
        // Spans are dropped; only the events themselves matter here.
        let events: Vec<_> = parse_events(&yaml)
            .map(|item| item.map(|(event, _span)| event))
            .collect();

        let all_ok = events.iter().all(Result::is_ok);
        prop_assert!(
            all_ok,
            "parse error for char U+{:04X}: {:?}",
            u32::from(ch),
            events
        );

        let has_scalar = events
            .iter()
            .any(|item| matches!(item, Ok(Event::Scalar { value, .. }) if value.contains(ch)));
        prop_assert!(
            has_scalar,
            "expected scalar containing U+{:04X}",
            u32::from(ch)
        );
    }
}
proptest! {
    // Property: a small ASCII `key: number` document produces the same parse
    // events whether it is parsed directly or first encoded as UTF-16/UTF-32
    // (either byte order, with or without a BOM) and passed through `decode`.
    #[test]
    fn encoding_choice_invariant_under_parse(
        yaml_str in "[a-z]{1,6}: [0-9]{1,6}\n"
    ) {
        // The generating pattern always yields at least five characters, so no
        // emptiness assumption is needed.

        // Baseline: events from parsing the generated text as-is.
        let utf8_events: Vec<_> = parse_events(yaml_str.as_str())
            .map(|r| r.map(|(e, _)| e))
            .collect();

        // Encode each payload once; the BOM and BOM-less variants share it.
        let ascii = yaml_str.as_bytes();
        let payloads = [
            (Encoding::Utf16Be, encode_ascii_as_utf16(ascii, true)),
            (Encoding::Utf16Le, encode_ascii_as_utf16(ascii, false)),
            (Encoding::Utf32Be, encode_ascii_as_utf32(ascii, true)),
            (Encoding::Utf32Le, encode_ascii_as_utf32(ascii, false)),
        ];

        for (encoding, payload) in &payloads {
            for include_bom in [false, true] {
                let bytes = if include_bom {
                    prepend_bom(*encoding, payload)
                } else {
                    payload.clone()
                };
                let decoded = decode(&bytes).unwrap_or_else(|e| {
                    panic!("decode failed for {encoding:?} bom={include_bom}: {e}");
                });
                let events: Vec<_> = parse_events(&decoded)
                    .map(|r| r.map(|(e, _)| e))
                    .collect();
                prop_assert_eq!(
                    &events,
                    &utf8_events,
                    "encoding {:?} bom={} parse events differ from UTF-8",
                    encoding,
                    include_bom
                );
            }
        }
    }
}