vtcode-core 0.103.1

Core library for VT Code - a Rust-based terminal coding agent
#![allow(invalid_from_utf8)]
//! Unicode handling test for VT Code
//!
//! This test demonstrates unicode handling issues that can occur in PTY output processing.

#[cfg(test)]
mod tests {
    use std::str;

    #[test]
    fn test_utf8_validation_edge_cases() {
        // Test cases that simulate what push_utf8 might encounter

        // Valid UTF-8 sequences
        let valid_ascii = b"Hello World";
        assert!(str::from_utf8(valid_ascii).is_ok());

        let valid_unicode = "Hello 世界".as_bytes();
        assert!(str::from_utf8(valid_unicode).is_ok());

        let valid_emoji = "🌍🚀✨".as_bytes();
        assert!(str::from_utf8(valid_emoji).is_ok());

        // Invalid UTF-8 sequences that push_utf8 should handle
        let invalid_sequence = &[0xFF, 0xFE, b'X'];
        let result = str::from_utf8(invalid_sequence);
        assert!(result.is_err());

        // Test UTF-8 error handling similar to push_utf8
        match str::from_utf8(invalid_sequence) {
            Ok(_) => panic!("Should not be valid UTF-8"),
            Err(error) => {
                println!("UTF-8 error: {:?}", error);
                println!("Valid up to: {}", error.valid_up_to());
                println!("Error length: {:?}", error.error_len());

                // Simulate push_utf8 behavior
                let valid_up_to = error.valid_up_to();
                if valid_up_to > 0 {
                    let valid_part = &invalid_sequence[..valid_up_to];
                    let valid_str = str::from_utf8(valid_part).unwrap();
                    println!("Valid part: '{:?}'", valid_str);
                }

                if let Some(error_len) = error.error_len() {
                    println!("Invalid sequence length: {}", error_len);
                    // push_utf8 would push "\u{FFFD}" here
                }
            }
        }
    }

    #[test]
    fn test_split_utf8_sequences() {
        // Test what happens when UTF-8 sequences are split (simulating buffer boundaries)

        // "é" is 0xC3 0xA9 in UTF-8
        let full_sequence = [0xC3, 0xA9];
        assert_eq!(str::from_utf8(&full_sequence).unwrap(), "é");

        // Test incomplete sequences
        let incomplete_1 = [0xC3]; // Missing second byte
        assert!(str::from_utf8(&incomplete_1).is_err());

        let incomplete_2 = [0xA9]; // Second byte without first (would be different char)
        assert!(str::from_utf8(&incomplete_2).is_err());

        // Emoji "🌍" is 4 bytes: 0xF0 0x9F 0x8C 0x8D
        let full_emoji = [0xF0, 0x9F, 0x8C, 0x8D];
        assert_eq!(str::from_utf8(&full_emoji).unwrap(), "🌍");

        // Test incomplete emoji sequences
        let incomplete_emoji_1 = [0xF0, 0x9F]; // First 2 bytes
        assert!(str::from_utf8(&incomplete_emoji_1).is_err());

        let incomplete_emoji_2 = [0xF0, 0x9F, 0x8C]; // First 3 bytes
        assert!(str::from_utf8(&incomplete_emoji_2).is_err());
    }

    #[test]
    fn test_utf8_replacement_character() {
        // Test the replacement character that push_utf8 uses for invalid UTF-8
        let replacement = "\u{FFFD}";
        assert_eq!(replacement, "");

        // Test how it appears with other characters
        let mixed = format!("Hello{}World", replacement);
        assert!(mixed.contains(""));
        assert!(mixed.contains("Hello"));
        assert!(mixed.contains("World"));
    }

    #[test]
    fn test_overlong_utf8_encodings() {
        // Test overlong encodings that should be rejected
        // 'A' should be 0x41, but can be encoded as 0xC1 0x81 (overlong)
        let overlong_a = [0xC1, 0x81];

        // This should be invalid/replaced in proper UTF-8 handling
        match str::from_utf8(&overlong_a) {
            Ok(s) => {
                println!("Overlong encoding accepted as: '{:?}'", s);
                // Some UTF-8 decoders might accept this, but they shouldn't
            }
            Err(e) => {
                println!("Overlong encoding correctly rejected: {:?}", e);
            }
        }
    }

    #[test]
    fn test_surrogate_code_points() {
        // UTF-16 surrogate pairs encoded in UTF-8 are invalid
        // High surrogate: 0xD800-0xDBFF
        // Low surrogate: 0xDC00-0xDFFF

        // High surrogate in UTF-8: 0xED 0xA0 0x80 to 0xED 0xAF 0xBF
        let high_surrogate = [0xED, 0xA0, 0x80];
        assert!(str::from_utf8(&high_surrogate).is_err());

        // Low surrogate in UTF-8: 0xED 0xB0 0x80 to 0xED 0xBF 0xBF
        let low_surrogate = [0xED, 0xB0, 0x80];
        assert!(str::from_utf8(&low_surrogate).is_err());
    }

    #[test]
    fn simulate_push_utf8_behavior() {
        // Simulate the behavior of push_utf8 with various inputs

        fn simulate_push_utf8(buffer: &mut Vec<u8>, eof: bool) -> String {
            let mut result = String::new();

            loop {
                match str::from_utf8(buffer) {
                    Ok(valid) => {
                        if !valid.is_empty() {
                            result.push_str(valid);
                        }
                        buffer.clear();
                        break;
                    }
                    Err(error) => {
                        let valid_up_to = error.valid_up_to();
                        if valid_up_to > 0 {
                            if let Ok(valid) = str::from_utf8(&buffer[..valid_up_to])
                                && !valid.is_empty()
                            {
                                result.push_str(valid);
                            }
                            buffer.drain(..valid_up_to);
                            continue;
                        }

                        if let Some(error_len) = error.error_len() {
                            result.push('\u{FFFD}'); // Replacement character
                            buffer.drain(..error_len);
                            continue;
                        }

                        if eof && !buffer.is_empty() {
                            result.push('\u{FFFD}');
                            buffer.clear();
                        }

                        break;
                    }
                }
            }

            result
        }

        // Test cases

        // Valid UTF-8
        let mut buffer1 = b"Hello World".to_vec();
        let result1 = simulate_push_utf8(&mut buffer1, false);
        assert_eq!(result1, "Hello World");
        assert!(buffer1.is_empty());

        // Valid Unicode
        let mut buffer2 = "Hello 世界".as_bytes().to_vec();
        let result2 = simulate_push_utf8(&mut buffer2, false);
        assert_eq!(result2, "Hello 世界");
        assert!(buffer2.is_empty());

        // Invalid UTF-8
        let mut buffer3 = vec![0xFF, 0xFE, b'X'];
        let result3 = simulate_push_utf8(&mut buffer3, false);
        assert!(result3.contains("\u{FFFD}")); // Should contain replacement
        assert!(result3.contains('X'));
        assert!(buffer3.is_empty());

        // Incomplete UTF-8 at EOF
        let mut buffer4 = vec![0xC3]; // Incomplete "é"
        let result4 = simulate_push_utf8(&mut buffer4, true);
        assert!(result4.contains("\u{FFFD}")); // Should replace incomplete sequence
        assert!(buffer4.is_empty());

        // Split UTF-8 simulation
        let mut buffer5a = vec![0xC3]; // First half of "é"
        let result5a = simulate_push_utf8(&mut buffer5a, false);
        assert_eq!(result5a, ""); // No valid text yet
        assert_eq!(buffer5a, vec![0xC3]); // Should keep incomplete byte

        // Now add the second half
        buffer5a.extend_from_slice(&[0xA9, b' ']); // Complete "é" + space
        let result5b = simulate_push_utf8(&mut buffer5a, false);
        assert_eq!(result5b, "é ");
        assert!(buffer5a.is_empty());

        println!("Simulated push_utf8 behavior test completed successfully!");
    }

    #[test]
    fn test_real_world_unicode_scenarios() {
        // Test scenarios that might occur in real PTY output

        // Compiler output with unicode
        let compiler_output = "error: expected one of `!` or `::`, found `🌍`";
        assert!(compiler_output.contains("🌍"));

        // Progress indicators
        let progress = "Building... [wait] Compiling... v Done!";
        assert!(progress.contains("[wait]"));
        assert!(progress.contains("v"));

        // File paths with unicode (common in international projects)
        let unicode_path = "/projects/用户/документы/ファイル.txt";
        assert!(unicode_path.contains("用户"));
        assert!(unicode_path.contains("документы"));
        assert!(unicode_path.contains("ファイル"));

        // Error messages in different languages
        let multilingual_error = "Error/Erreur/错误/エラー/오류";
        assert!(multilingual_error.contains("错误"));
        assert!(multilingual_error.contains("エラー"));

        // Scientific/mathematical symbols
        let math_symbols = "∑ ∏ ∫ ∂ ∇ ≤ ≥ ≠ ≈ ∞";
        assert!(math_symbols.contains(""));
        assert!(math_symbols.contains(""));

        println!("Real-world unicode scenarios test completed!");
    }
}