Skip to main content

perl_test_generators/
unicode.rs

1//! Generators for Unicode strings exercising various edge cases.
2//!
3//! Produces strings with BMP characters, supplementary planes, surrogate
4//! boundaries, combining marks, and RTL scripts — useful for testing
5//! UTF-8/UTF-16 position mappers and LSP protocol encoding.
6
7use proptest::prelude::*;
8
9/// Generate a Unicode string suitable for testing UTF-8/UTF-16 conversion.
10///
11/// Mixes ASCII, BMP non-ASCII, supplementary plane characters, and
12/// combining marks.
13pub fn unicode_string() -> impl Strategy<Value = String> {
14    prop_oneof![
15        // Pure ASCII
16        prop::collection::vec(
17            prop_oneof![
18                prop::char::range('a', 'z'),
19                prop::char::range('A', 'Z'),
20                prop::char::range('0', '9'),
21                Just(' '),
22                Just('\t'),
23            ],
24            0..=50_usize,
25        )
26        .prop_map(|chars| chars.into_iter().collect()),
27        // BMP with non-ASCII (Latin-1 supplement, CJK, etc.)
28        prop::collection::vec(prop::char::range('\u{00C0}', '\u{FFFF}'), 0..=50_usize)
29            .prop_map(|chars| chars.into_iter().collect()),
30        // Supplementary plane characters (emoji, rare scripts)
31        prop::collection::vec(prop::char::range('\u{10000}', '\u{10FFFF}'), 0..=30_usize)
32            .prop_map(|chars| chars.into_iter().collect()),
33        // Mix of all ranges
34        prop::collection::vec(
35            prop_oneof![
36                prop::char::range('a', 'z'),
37                prop::char::range('A', 'Z'),
38                prop::char::range('0', '9'),
39                prop::char::range('\u{00C0}', '\u{024F}'),
40                prop::char::range('\u{4E00}', '\u{9FFF}'),
41                prop::char::range('\u{10000}', '\u{10FFFF}'),
42                Just('\t'),
43                Just('\n'),
44            ],
45            0..=100_usize,
46        )
47        .prop_map(|chars| chars.into_iter().collect()),
48    ]
49}
50
#[cfg(test)]
mod tests {
    use super::*;

    proptest! {
        // Sanity check on the generator: the bytes backing a generated
        // `String` must decode back to the very same text.
        #[test]
        fn unicode_string_is_valid_utf8(s in unicode_string()) {
            match std::str::from_utf8(s.as_bytes()) {
                Ok(roundtrip) => prop_assert_eq!(s.as_str(), roundtrip),
                Err(err) => prop_assert!(false, "generated string was not valid UTF-8: {err}"),
            }
        }

        // The number of UTF-16 code units produced by `encode_utf16` must
        // equal the sum of each char's `len_utf16`, and the encoded units
        // must decode back to the original string.
        #[test]
        fn utf16_len_agrees_with_encode_utf16(s in unicode_string()) {
            let encoded: Vec<u16> = s.encode_utf16().collect();

            // The length agreement the test name promises.
            let expected_units: usize = s.chars().map(char::len_utf16).sum();
            prop_assert_eq!(encoded.len(), expected_units);

            // Strict decoding: an unpaired surrogate is reported as an error
            // instead of being silently replaced with U+FFFD.
            match String::from_utf16(&encoded) {
                Ok(decoded) => prop_assert_eq!(s, decoded),
                Err(err) => prop_assert!(false, "encode_utf16 produced invalid UTF-16: {err}"),
            }
        }
    }
}