1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
#![allow(invalid_from_utf8)]
//! Unicode handling test for VT Code
//!
//! These tests exercise UTF-8 edge cases (invalid bytes, sequences split across
//! buffer boundaries, overlong encodings, surrogates) that can occur while
//! processing PTY output.
#[cfg(test)]
mod tests {
use std::str;
/// Checks that `str::from_utf8` accepts well-formed input and reports the exact
/// error position and length for malformed input.
///
/// `valid_up_to()` and `error_len()` are the two values a `push_utf8`-style
/// decoder uses to decide how much text to emit and how many bytes to replace
/// with U+FFFD, so they are asserted rather than only printed.
#[test]
fn test_utf8_validation_edge_cases() {
    // Well-formed inputs: plain ASCII, multi-byte CJK, and 4-byte emoji.
    let valid_ascii = b"Hello World";
    assert!(str::from_utf8(valid_ascii).is_ok());
    let valid_unicode = "Hello 世界".as_bytes();
    assert!(str::from_utf8(valid_unicode).is_ok());
    let valid_emoji = "🌍🚀✨".as_bytes();
    assert!(str::from_utf8(valid_emoji).is_ok());

    // 0xFF and 0xFE never occur in UTF-8, so the very first byte is rejected.
    let invalid_sequence: &[u8] = &[0xFF, 0xFE, b'X'];
    let error = str::from_utf8(invalid_sequence).expect_err("0xFF 0xFE must not be valid UTF-8");
    println!("UTF-8 error: {:?}", error);
    // Nothing before the bad byte is valid text.
    assert_eq!(error.valid_up_to(), 0);
    // Exactly one byte must be skipped (a decoder would emit "\u{FFFD}" for it).
    assert_eq!(error.error_len(), Some(1));

    // A valid prefix followed by an invalid byte: the prefix must be reported
    // so that it can be emitted before the replacement character.
    let prefixed: &[u8] = b"ok\xFFX";
    let error = str::from_utf8(prefixed).expect_err("0xFF must not be valid UTF-8");
    let valid_up_to = error.valid_up_to();
    assert_eq!(valid_up_to, 2);
    assert_eq!(str::from_utf8(&prefixed[..valid_up_to]), Ok("ok"));
    assert_eq!(error.error_len(), Some(1));
}
/// Verifies how `str::from_utf8` reports multi-byte sequences that are cut off,
/// as happens when a read ends in the middle of a character.
///
/// A truncated but so-far-valid sequence yields `error_len() == None` (more
/// input could still complete it), whereas a stray continuation byte yields
/// `Some(1)` (it can never become valid). A streaming decoder needs this
/// distinction to decide between waiting for more bytes and replacing.
#[test]
fn test_split_utf8_sequences() {
    // "é" is 0xC3 0xA9 in UTF-8
    let full_sequence = [0xC3u8, 0xA9];
    assert_eq!(str::from_utf8(&full_sequence).unwrap(), "é");

    // Lead byte only: incomplete, could be completed by a later chunk.
    let lead_only = [0xC3u8];
    let error = str::from_utf8(&lead_only).unwrap_err();
    assert_eq!((error.valid_up_to(), error.error_len()), (0, None));

    // Continuation byte without its lead byte: invalid on its own, not merely
    // incomplete, so exactly one byte has to be skipped.
    let continuation_only = [0xA9u8];
    let error = str::from_utf8(&continuation_only).unwrap_err();
    assert_eq!((error.valid_up_to(), error.error_len()), (0, Some(1)));

    // Emoji "🌍" is 4 bytes: 0xF0 0x9F 0x8C 0x8D
    let full_emoji = [0xF0u8, 0x9F, 0x8C, 0x8D];
    assert_eq!(str::from_utf8(&full_emoji).unwrap(), "🌍");

    // Every proper prefix of the emoji is incomplete rather than invalid.
    for cut in 1..full_emoji.len() {
        let error = str::from_utf8(&full_emoji[..cut]).unwrap_err();
        assert_eq!(error.valid_up_to(), 0, "prefix of {} byte(s)", cut);
        assert_eq!(error.error_len(), None, "prefix of {} byte(s)", cut);
    }
}
/// Confirms that the U+FFFD escape equals the literal replacement character
/// and that it survives being embedded between ordinary text.
#[test]
fn test_utf8_replacement_character() {
    // U+FFFD is what push_utf8 substitutes for undecodable bytes.
    let marker = "\u{FFFD}";
    assert_eq!(marker, "�");

    // Surround the marker with plain text and look for every piece.
    let combined = format!("Hello{}World", marker);
    for needle in ["�", "Hello", "World"] {
        assert!(combined.contains(needle));
    }
}
/// Asserts that overlong encodings (a code point encoded with more bytes than
/// necessary) are rejected by `str::from_utf8`.
///
/// Accepting them would let the same character appear under several byte
/// spellings, so the test fails if any of them decodes successfully.
#[test]
fn test_overlong_utf8_encodings() {
    // 'A' is 0x41; 0xC1 0x81 is its overlong two-byte spelling.
    let overlong_a = [0xC1u8, 0x81];
    let error = str::from_utf8(&overlong_a).expect_err("overlong encoding of 'A' must be rejected");
    println!("Overlong encoding correctly rejected: {:?}", error);
    // 0xC1 is never a valid lead byte, so the error is at offset 0 and spans
    // exactly that one byte.
    assert_eq!(error.valid_up_to(), 0);
    assert_eq!(error.error_len(), Some(1));

    // Overlong spellings of U+0000 in two, three and four bytes.
    let overlong_nul: [&[u8]; 3] = [
        &[0xC0, 0x80],
        &[0xE0, 0x80, 0x80],
        &[0xF0, 0x80, 0x80, 0x80],
    ];
    for bytes in overlong_nul {
        assert!(
            str::from_utf8(bytes).is_err(),
            "overlong encoding {:02X?} must be rejected",
            bytes
        );
    }
}
/// UTF-16 surrogate code points (U+D800..=U+DFFF) have no valid UTF-8 form;
/// their would-be three-byte encodings must fail validation.
#[test]
fn test_surrogate_code_points() {
    let encoded_surrogates: [[u8; 3]; 2] = [
        // U+D800, the first high surrogate (range 0xED 0xA0 0x80 ..= 0xED 0xAF 0xBF).
        [0xED, 0xA0, 0x80],
        // U+DC00, the first low surrogate (range 0xED 0xB0 0x80 ..= 0xED 0xBF 0xBF).
        [0xED, 0xB0, 0x80],
    ];
    for bytes in encoded_surrogates.iter() {
        assert!(str::from_utf8(bytes).is_err());
    }
}
/// Exercises a local stand-in for `push_utf8` with valid, invalid, truncated
/// and chunk-split inputs, asserting the exact decoded text and the bytes left
/// in the buffer after each call.
#[test]
fn simulate_push_utf8_behavior() {
    /// Decodes as much of `buffer` as possible and returns the decoded text.
    ///
    /// * Valid text is appended unchanged.
    /// * Each invalid sequence is replaced by one U+FFFD.
    /// * A truncated sequence at the end is left in `buffer` so a later call
    ///   can complete it, unless `eof` is set, in which case it is replaced
    ///   by one U+FFFD.
    ///
    /// Consumed bytes are removed from `buffer` with a single `drain` at the
    /// end instead of one drain per step, keeping the work linear in the
    /// buffer length.
    fn simulate_push_utf8(buffer: &mut Vec<u8>, eof: bool) -> String {
        let mut result = String::with_capacity(buffer.len());
        // Number of leading bytes of `buffer` that have been handled so far.
        let mut consumed = 0;
        loop {
            let pending = &buffer[consumed..];
            match str::from_utf8(pending) {
                Ok(valid) => {
                    result.push_str(valid);
                    consumed = buffer.len();
                    break;
                }
                Err(error) => {
                    let valid_up_to = error.valid_up_to();
                    let valid = str::from_utf8(&pending[..valid_up_to])
                        .expect("Utf8Error::valid_up_to marks the end of a valid prefix");
                    result.push_str(valid);
                    consumed += valid_up_to;
                    match error.error_len() {
                        // Definitely invalid: replace it and keep decoding.
                        Some(error_len) => {
                            result.push('\u{FFFD}');
                            consumed += error_len;
                        }
                        // Truncated sequence at the end of the input.
                        None => {
                            if eof {
                                // No more input will arrive to complete it.
                                result.push('\u{FFFD}');
                                consumed = buffer.len();
                            }
                            break;
                        }
                    }
                }
            }
        }
        buffer.drain(..consumed);
        result
    }

    // Valid UTF-8
    let mut buffer1 = b"Hello World".to_vec();
    let result1 = simulate_push_utf8(&mut buffer1, false);
    assert_eq!(result1, "Hello World");
    assert!(buffer1.is_empty());

    // Valid Unicode
    let mut buffer2 = "Hello 世界".as_bytes().to_vec();
    let result2 = simulate_push_utf8(&mut buffer2, false);
    assert_eq!(result2, "Hello 世界");
    assert!(buffer2.is_empty());

    // Invalid UTF-8: each bad byte becomes its own replacement character.
    let mut buffer3 = vec![0xFF, 0xFE, b'X'];
    let result3 = simulate_push_utf8(&mut buffer3, false);
    assert_eq!(result3, "\u{FFFD}\u{FFFD}X");
    assert!(buffer3.is_empty());

    // Incomplete UTF-8 at EOF is replaced, not kept.
    let mut buffer4 = vec![0xC3]; // Incomplete "é"
    let result4 = simulate_push_utf8(&mut buffer4, true);
    assert_eq!(result4, "\u{FFFD}");
    assert!(buffer4.is_empty());

    // Split UTF-8 simulation
    let mut buffer5 = vec![0xC3]; // First half of "é"
    let result5a = simulate_push_utf8(&mut buffer5, false);
    assert_eq!(result5a, ""); // No valid text yet
    assert_eq!(buffer5, vec![0xC3]); // Should keep incomplete byte
    // Now add the second half
    buffer5.extend_from_slice(&[0xA9, b' ']); // Complete "é" + space
    let result5b = simulate_push_utf8(&mut buffer5, false);
    assert_eq!(result5b, "é ");
    assert!(buffer5.is_empty());

    // Valid prefix followed by a truncated emoji: the prefix is emitted and
    // only the truncated tail stays buffered.
    let mut buffer6 = vec![b'a', b'b', 0xF0, 0x9F];
    let result6a = simulate_push_utf8(&mut buffer6, false);
    assert_eq!(result6a, "ab");
    assert_eq!(buffer6, vec![0xF0, 0x9F]);
    buffer6.extend_from_slice(&[0x8C, 0x8D]);
    let result6b = simulate_push_utf8(&mut buffer6, false);
    assert_eq!(result6b, "🌍");
    assert!(buffer6.is_empty());

    // Empty input produces nothing, even at EOF.
    let mut buffer7: Vec<u8> = Vec::new();
    assert_eq!(simulate_push_utf8(&mut buffer7, true), "");
    assert!(buffer7.is_empty());

    println!("Simulated push_utf8 behavior test completed successfully!");
}
/// Checks that representative terminal-output strings contain the expected
/// fragments, including non-ASCII ones.
#[test]
fn test_real_world_unicode_scenarios() {
    // Each entry pairs a sample line with the fragments that must occur in it.
    let scenarios: [(&str, &[&str]); 5] = [
        // Compiler diagnostics quoting a unicode token
        ("error: expected one of `!` or `::`, found `🌍`", &["🌍"]),
        // Progress indicators
        ("Building... [wait] Compiling... v Done!", &["[wait]", "v"]),
        // File paths with non-Latin components
        (
            "/projects/用户/документы/ファイル.txt",
            &["用户", "документы", "ファイル"],
        ),
        // The same error word in several languages
        ("Error/Erreur/错误/エラー/오류", &["错误", "エラー"]),
        // Scientific/mathematical symbols
        ("∑ ∏ ∫ ∂ ∇ ≤ ≥ ≠ ≈ ∞", &["∑", "∞"]),
    ];

    for (text, fragments) in scenarios {
        for fragment in fragments {
            assert!(text.contains(*fragment));
        }
    }

    println!("Real-world unicode scenarios test completed!");
}
}