fresh/model/encoding.rs
1//! Text encoding detection and conversion
2//!
3//! This module handles:
4//! - Detecting text encodings from byte content (UTF-8, UTF-16, Latin-1, CJK, etc.)
5//! - Binary file detection (distinguishing text from binary content)
6//! - Converting between encodings (normalizing to UTF-8 on load, converting back on save)
7//!
8//! # Encoding Detection Strategy
9//!
10//! 1. **BOM Detection**: Check for Byte Order Marks (UTF-8 BOM, UTF-16 LE/BE)
11//! 2. **UTF-8 Validation**: Fast path for most modern files
12//! 3. **UTF-16 Heuristics**: Detect UTF-16 without BOM via null byte patterns
13//! 4. **Binary Detection**: Check for control characters that indicate binary content
14//! 5. **Statistical Detection**: Use chardetng for legacy encoding detection
15//! 6. **Fallback**: Default to Windows-1252 for ambiguous cases
16
17use super::encoding_heuristics::has_windows1250_pattern;
18use schemars::JsonSchema;
19use serde::{Deserialize, Serialize};
20
21// ============================================================================
22// Encoding Type
23// ============================================================================
24
25/// Supported text encodings for file I/O
26///
27/// The editor internally uses UTF-8 for all text processing. When loading files,
28/// content is converted from the detected encoding to UTF-8. When saving, content
29/// is converted back to the original (or user-selected) encoding.
/// Supported text encodings for file I/O
///
/// The editor internally uses UTF-8 for all text processing. When loading files,
/// content is converted from the detected encoding to UTF-8. When saving, content
/// is converted back to the original (or user-selected) encoding.
///
/// Variants are serde-serializable (used in config/session state) and carry a
/// JSON schema via `schemars`. `Utf8` is the `Default` variant.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize, JsonSchema)]
pub enum Encoding {
    /// UTF-8 (default, most common)
    #[default]
    Utf8,
    /// UTF-8 with Byte Order Mark
    Utf8Bom,
    /// UTF-16 Little Endian (Windows default for Unicode files)
    Utf16Le,
    /// UTF-16 Big Endian
    Utf16Be,
    /// ASCII (7-bit, subset of UTF-8; decoded as UTF-8 internally)
    Ascii,
    /// Latin-1 / ISO-8859-1 (Western European; mapped to Windows-1252 per WHATWG)
    Latin1,
    /// Windows-1252 / CP-1252 (Windows Western European, often called "ANSI")
    Windows1252,
    /// Windows-1250 / CP-1250 (Windows Central European)
    Windows1250,
    /// GB18030 (Chinese, superset of GBK)
    Gb18030,
    /// GBK (Chinese Simplified, subset of GB18030)
    Gbk,
    /// Shift-JIS (Japanese)
    ShiftJis,
    /// EUC-KR (Korean)
    EucKr,
}
58
59impl Encoding {
60 /// Get the display name for status bar
61 pub fn display_name(&self) -> &'static str {
62 match self {
63 Self::Utf8 => "UTF-8",
64 Self::Utf8Bom => "UTF-8 BOM",
65 Self::Utf16Le => "UTF-16 LE",
66 Self::Utf16Be => "UTF-16 BE",
67 Self::Ascii => "ASCII",
68 Self::Latin1 => "Latin-1",
69 Self::Windows1252 => "Windows-1252",
70 Self::Windows1250 => "Windows-1250",
71 Self::Gb18030 => "GB18030",
72 Self::Gbk => "GBK",
73 Self::ShiftJis => "Shift-JIS",
74 Self::EucKr => "EUC-KR",
75 }
76 }
77
78 /// Get a longer description for UI (e.g., command palette)
79 pub fn description(&self) -> &'static str {
80 match self {
81 Self::Utf8 => "UTF-8",
82 Self::Utf8Bom => "UTF-8 with BOM",
83 Self::Utf16Le => "UTF-16 Little Endian",
84 Self::Utf16Be => "UTF-16 Big Endian",
85 Self::Ascii => "US-ASCII",
86 Self::Latin1 => "ISO-8859-1 / Latin-1 – Western European",
87 Self::Windows1252 => "Windows-1252 / CP1252 – Western European",
88 Self::Windows1250 => "Windows-1250 / CP1250 – Central European",
89 Self::Gb18030 => "GB18030 – Chinese",
90 Self::Gbk => "GBK / CP936 – Simplified Chinese",
91 Self::ShiftJis => "Shift_JIS – Japanese",
92 Self::EucKr => "EUC-KR – Korean",
93 }
94 }
95
96 /// Get the encoding_rs Encoding for this type
97 pub fn to_encoding_rs(&self) -> &'static encoding_rs::Encoding {
98 match self {
99 Self::Utf8 | Self::Utf8Bom | Self::Ascii => encoding_rs::UTF_8,
100 Self::Utf16Le => encoding_rs::UTF_16LE,
101 Self::Utf16Be => encoding_rs::UTF_16BE,
102 Self::Latin1 => encoding_rs::WINDOWS_1252, // ISO-8859-1 maps to Windows-1252 per WHATWG
103 Self::Windows1252 => encoding_rs::WINDOWS_1252,
104 Self::Windows1250 => encoding_rs::WINDOWS_1250,
105 Self::Gb18030 => encoding_rs::GB18030,
106 Self::Gbk => encoding_rs::GBK,
107 Self::ShiftJis => encoding_rs::SHIFT_JIS,
108 Self::EucKr => encoding_rs::EUC_KR,
109 }
110 }
111
112 /// Returns true if this encoding uses a BOM (Byte Order Mark)
113 pub fn has_bom(&self) -> bool {
114 matches!(self, Self::Utf8Bom | Self::Utf16Le | Self::Utf16Be)
115 }
116
117 /// Get the BOM bytes for this encoding (if any)
118 pub fn bom_bytes(&self) -> Option<&'static [u8]> {
119 match self {
120 Self::Utf8Bom => Some(&[0xEF, 0xBB, 0xBF]),
121 Self::Utf16Le => Some(&[0xFF, 0xFE]),
122 Self::Utf16Be => Some(&[0xFE, 0xFF]),
123 _ => None,
124 }
125 }
126
127 /// All available encodings for UI display
128 pub fn all() -> &'static [Encoding] {
129 &[
130 Self::Utf8,
131 Self::Utf8Bom,
132 Self::Utf16Le,
133 Self::Utf16Be,
134 Self::Ascii,
135 Self::Latin1,
136 Self::Windows1252,
137 Self::Windows1250,
138 Self::Gb18030,
139 Self::Gbk,
140 Self::ShiftJis,
141 Self::EucKr,
142 ]
143 }
144
145 /// Returns true if this encoding supports "resynchronization" - the ability to
146 /// find character boundaries when jumping into the middle of a file.
147 ///
148 /// Resynchronizable encodings can be safely used with lazy/streaming file loading
149 /// because you can determine character boundaries from any position.
150 ///
151 /// - **UTF-8**: Excellent - unique bit patterns distinguish lead/continuation bytes
152 /// - **ASCII/Latin-1/Windows-1252**: Trivial - every byte is a character
153 /// - **UTF-16**: Good with 2-byte alignment - can detect surrogate pairs
154 /// - **UTF-32**: Good with 4-byte alignment
155 ///
156 /// Non-resynchronizable encodings (legacy CJK like Shift-JIS, GB18030, GBK, Big5)
157 /// have ambiguous byte sequences where a byte could be either a standalone character
158 /// or part of a multi-byte sequence. You must scan from the beginning to be certain.
159 pub fn is_resynchronizable(&self) -> bool {
160 match self {
161 // Fixed-width single byte - every byte is a character
162 Self::Ascii | Self::Latin1 | Self::Windows1252 | Self::Windows1250 => true,
163
164 // UTF-8 has unique bit patterns for lead vs continuation bytes
165 Self::Utf8 | Self::Utf8Bom => true,
166
167 // UTF-16 is resynchronizable with 2-byte alignment
168 // (can detect surrogate pairs by checking 0xD800-0xDFFF range)
169 Self::Utf16Le | Self::Utf16Be => true,
170
171 // Legacy CJK encodings are NOT resynchronizable
172 // The second byte of a double-byte char can equal a valid single-byte char
173 Self::Gb18030 | Self::Gbk | Self::ShiftJis | Self::EucKr => false,
174 }
175 }
176
177 /// Returns the byte alignment required for this encoding when doing random access.
178 ///
179 /// For lazy loading of large files, reads must be aligned to this boundary.
180 /// Returns None if the encoding is not resynchronizable (requires full file scan).
181 pub fn alignment(&self) -> Option<usize> {
182 match self {
183 // Single-byte encodings - no alignment needed
184 Self::Ascii | Self::Latin1 | Self::Windows1252 | Self::Windows1250 => Some(1),
185
186 // UTF-8 - no alignment needed (self-synchronizing)
187 Self::Utf8 | Self::Utf8Bom => Some(1),
188
189 // UTF-16 - must be 2-byte aligned
190 Self::Utf16Le | Self::Utf16Be => Some(2),
191
192 // Legacy CJK - not resynchronizable, no valid alignment
193 Self::Gb18030 | Self::Gbk | Self::ShiftJis | Self::EucKr => None,
194 }
195 }
196
197 /// Returns true if this encoding requires the entire file to be loaded
198 /// for correct decoding (cannot use lazy/streaming loading).
199 ///
200 /// This is the inverse of `is_resynchronizable()` and indicates that
201 /// the user should be warned before loading large files in this encoding.
202 pub fn requires_full_file_load(&self) -> bool {
203 !self.is_resynchronizable()
204 }
205}
206
207// ============================================================================
208// Encoding Detection
209// ============================================================================
210
211/// Detect the text encoding from a sample of bytes
212///
213/// This function delegates to `detect_encoding_or_binary` and returns only
214/// the encoding, ignoring the binary flag. Use `detect_encoding_or_binary`
215/// when you need to know if the content should be treated as binary.
216pub fn detect_encoding(bytes: &[u8]) -> Encoding {
217 detect_encoding_or_binary(bytes).0
218}
219
220/// Detect the text encoding and whether content is binary.
221///
222/// Returns (Encoding, is_binary) where:
223/// - Encoding is the detected encoding (or default if binary)
224/// - is_binary is true if the content should be treated as raw binary
225///
226/// # Detection Strategy
227///
228/// 1. Check for BOM (Byte Order Mark) - highest priority, definitely not binary
229/// 2. Try UTF-8 validation (fast path for most files), definitely not binary
230/// 3. Check for UTF-16 patterns without BOM, definitely not binary
231/// 4. Check for binary control characters (null bytes, etc.) - if found, it's binary
232/// 5. Use chardetng for statistical detection of legacy encodings
233/// 6. If encoding detection is uncertain, default to Windows-1252
234pub fn detect_encoding_or_binary(bytes: &[u8]) -> (Encoding, bool) {
235 // Only check the first 8KB for encoding detection
236 let check_len = bytes.len().min(8 * 1024);
237 let sample = &bytes[..check_len];
238
239 // 1. Check for BOM (Byte Order Mark) - highest priority, definitely text
240 if sample.starts_with(&[0xEF, 0xBB, 0xBF]) {
241 return (Encoding::Utf8Bom, false);
242 }
243 if sample.starts_with(&[0xFF, 0xFE]) {
244 // Could also be UTF-32 LE, but UTF-16 LE is much more common
245 return (Encoding::Utf16Le, false);
246 }
247 if sample.starts_with(&[0xFE, 0xFF]) {
248 return (Encoding::Utf16Be, false);
249 }
250
251 // 2. Try UTF-8 validation (fast path for most modern files)
252 // Note: When we truncate to 8KB, we may cut in the middle of a multi-byte UTF-8 sequence.
253 // We need to handle this case - if most of the sample is valid UTF-8 and the only error
254 // is an incomplete sequence at the very end, we should still detect it as UTF-8.
255 let utf8_valid_len = match std::str::from_utf8(sample) {
256 Ok(_) => sample.len(),
257 Err(e) => {
258 // error_len() returns None if the error is due to incomplete sequence at end
259 // (i.e., unexpected end of input), vs Some(n) for an invalid byte
260 if e.error_len().is_none() {
261 // Incomplete sequence at end - this is likely due to sample truncation
262 e.valid_up_to()
263 } else {
264 // Invalid byte found - not valid UTF-8
265 0
266 }
267 }
268 };
269
270 // If most of the sample is valid UTF-8 (at least 99% or all but the last few bytes),
271 // treat it as UTF-8. The incomplete sequence at end is just due to sample truncation.
272 if utf8_valid_len > 0 && (utf8_valid_len == sample.len() || utf8_valid_len >= sample.len() - 3)
273 {
274 let valid_sample = &sample[..utf8_valid_len];
275 // Check if it's pure ASCII (subset of UTF-8)
276 // Also check for binary indicators in valid ASCII/UTF-8
277 let has_binary_control = valid_sample.iter().any(|&b| is_binary_control_char(b));
278 if has_binary_control {
279 return (Encoding::Utf8, true);
280 }
281 if valid_sample.iter().all(|&b| b < 128) {
282 return (Encoding::Ascii, false);
283 }
284 return (Encoding::Utf8, false);
285 }
286
287 // 3. Check for UTF-16 without BOM (common in some Windows files)
288 // Heuristic: Look for patterns of null bytes alternating with printable chars
289 // The non-null byte should be printable (0x20-0x7E) or a valid high byte
290 //
291 // Note: Unlike UTF-8 above, this heuristic is robust to sample truncation because:
292 // - We use statistical pattern matching (50% threshold), not strict validation
293 // - chunks(2) naturally handles odd-length samples by dropping the last byte
294 // - Losing 1 pair out of ~4096 doesn't affect the detection threshold
295 if sample.len() >= 4 {
296 let is_printable_or_high = |b: u8| (0x20..=0x7E).contains(&b) || b >= 0x80;
297
298 // Align to even boundary to ensure we only process complete 2-byte pairs
299 let aligned_len = sample.len() & !1; // Round down to even
300 let aligned_sample = &sample[..aligned_len];
301
302 let le_pairs = aligned_sample
303 .chunks(2)
304 .filter(|chunk| chunk[1] == 0 && is_printable_or_high(chunk[0]))
305 .count();
306 let be_pairs = aligned_sample
307 .chunks(2)
308 .filter(|chunk| chunk[0] == 0 && is_printable_or_high(chunk[1]))
309 .count();
310 let pair_count = aligned_len / 2;
311
312 // If more than 50% of pairs look like valid UTF-16 text, it's text
313 if le_pairs > pair_count / 2 {
314 return (Encoding::Utf16Le, false);
315 }
316 if be_pairs > pair_count / 2 {
317 return (Encoding::Utf16Be, false);
318 }
319 }
320
321 // 4. Check for binary indicators EARLY (before chardetng)
322 // Binary files often contain control characters and null bytes that should not
323 // appear in any valid text encoding. Check this before chardetng because
324 // chardetng might still be "confident" about some encoding for binary data.
325 let has_binary_control = sample
326 .iter()
327 .any(|&b| b == 0x00 || is_binary_control_char(b));
328 if has_binary_control {
329 return (Encoding::Utf8, true);
330 }
331
332 // 5. Check for Latin-1 patterns: high bytes followed by invalid CJK trail bytes
333 // In GB18030/GBK, trail bytes must be 0x40-0x7E or 0x80-0xFE
334 // If a high byte is followed by a byte outside these ranges (e.g., space, newline,
335 // punctuation < 0x40), it's likely Latin-1, not CJK
336 let has_latin1_pattern = has_latin1_high_byte_pattern(sample);
337
338 // Also check for bytes in CJK-only range (0x81-0x9F) which can only be CJK lead bytes
339 let has_cjk_only_bytes = sample.iter().any(|&b| (0x81..0xA0).contains(&b));
340
341 // 6. Use chardetng for statistical encoding detection
342 let mut detector = chardetng::EncodingDetector::new();
343 detector.feed(sample, true);
344 let (detected_encoding, confident) = detector.guess_assess(None, true);
345
346 // If chardetng is confident, use that encoding (not binary)
347 if confident {
348 let is_cjk_encoding = detected_encoding == encoding_rs::GB18030
349 || detected_encoding == encoding_rs::GBK
350 || detected_encoding == encoding_rs::SHIFT_JIS
351 || detected_encoding == encoding_rs::EUC_KR;
352
353 // For CJK encodings, prefer Windows-1252 if we have clear Latin-1 indicators:
354 // - Space followed by high byte (0xA0-0xFF) is common in Latin-1 text
355 //
356 // If there are CJK-only bytes (0x81-0x9F), it's definitely CJK (not ambiguous).
357 // If there are Latin-1 patterns (space + high byte), prefer Windows-1252.
358 // Otherwise, trust chardetng's detection.
359 if is_cjk_encoding && !has_cjk_only_bytes && has_latin1_pattern {
360 return (Encoding::Windows1252, false);
361 }
362
363 // GBK is a subset of GB18030. Since we only inspect the first 8KB for
364 // detection, the sample may not contain GB18030-only code points (uncommon
365 // Chinese characters, emoji, etc.). Treating GBK as GB18030 is safer and
366 // ensures proper display of all characters including French, Spanish, and emoji.
367 let encoding =
368 if detected_encoding == encoding_rs::GB18030 || detected_encoding == encoding_rs::GBK {
369 Encoding::Gb18030
370 } else if detected_encoding == encoding_rs::SHIFT_JIS {
371 Encoding::ShiftJis
372 } else if detected_encoding == encoding_rs::EUC_KR {
373 Encoding::EucKr
374 } else if detected_encoding == encoding_rs::WINDOWS_1252
375 || detected_encoding == encoding_rs::WINDOWS_1250
376 {
377 // chardetng often returns Windows-1252 for Central European text
378 // Check for Windows-1250 specific patterns
379 if has_windows1250_pattern(sample) {
380 Encoding::Windows1250
381 } else {
382 Encoding::Windows1252
383 }
384 } else if detected_encoding == encoding_rs::UTF_8 {
385 // chardetng thinks it's UTF-8, but validation failed above
386 // Could still be Windows-1250 if it has Central European patterns
387 if has_windows1250_pattern(sample) {
388 Encoding::Windows1250
389 } else {
390 Encoding::Windows1252
391 }
392 } else {
393 // Unknown encoding - check for Windows-1250 patterns
394 if has_windows1250_pattern(sample) {
395 Encoding::Windows1250
396 } else {
397 Encoding::Windows1252
398 }
399 };
400 return (encoding, false);
401 }
402
403 // 7. chardetng not confident, but no binary indicators - check for Windows-1250 patterns
404 // We already checked for binary control chars earlier, so this is valid text
405 if has_windows1250_pattern(sample) {
406 (Encoding::Windows1250, false)
407 } else {
408 (Encoding::Windows1252, false)
409 }
410}
411
412// ============================================================================
413// Binary Detection Helpers
414// ============================================================================
415
416/// Check if a byte is a binary control character
417///
418/// Returns true for control characters that typically indicate binary content,
419/// excluding common text control chars (tab, newline, CR, form feed, etc.)
/// Check if a byte is a binary control character
///
/// Returns true for control characters that typically indicate binary content,
/// excluding common text control chars (tab, newline, CR, form feed, etc.)
pub fn is_binary_control_char(byte: u8) -> bool {
    // Binary = any C0 control except the usual text controls
    // (0x09 Tab, 0x0A LF, 0x0B VT, 0x0C FF, 0x0D CR, 0x1B ESC),
    // plus DEL (0x7F). Everything >= 0x20 other than DEL is text.
    matches!(byte, 0x00..=0x08 | 0x0E..=0x1A | 0x1C..=0x1F | 0x7F)
}
432
433/// Check if sample has Latin-1 patterns that cannot be valid CJK encoding
434///
435/// In GB18030/GBK, valid sequences are:
436/// - ASCII bytes (0x00-0x7F) as standalone characters
437/// - Lead byte (0x81-0xFE) + Trail byte (0x40-0x7E or 0x80-0xFE)
438///
439/// This function looks for patterns that indicate Latin-1:
440/// 1. High bytes followed by invalid CJK trail bytes (space, newline, etc.)
441/// 2. ASCII word followed by space followed by high byte (like "Hello é")
442/// 3. High byte immediately after ASCII space (like " é")
/// Check if sample has Latin-1 patterns that cannot be valid CJK encoding
///
/// In GB18030/GBK, valid sequences are:
/// - ASCII bytes (0x00-0x7F) as standalone characters
/// - Lead byte (0x81-0xFE) + Trail byte (0x40-0x7E or 0x80-0xFE)
///
/// This function looks for patterns that indicate Latin-1:
/// 1. High bytes followed by invalid CJK trail bytes (space, newline, etc.)
/// 2. ASCII word followed by space followed by high byte (like "Hello é")
/// 3. High byte immediately after ASCII space (like " é")
fn has_latin1_high_byte_pattern(sample: &[u8]) -> bool {
    let mut idx = 0;

    while idx < sample.len() {
        let current = sample[idx];
        let lookahead = sample.get(idx + 1).copied();

        if current < 0x80 {
            // ASCII byte. A space directly followed by a Latin-1 extended char
            // (0xA0-0xFF, never a CJK-only lead byte) is typical of text like
            // "Café résumé" — treat it as a Latin-1 indicator.
            if current == 0x20 && matches!(lookahead, Some(next) if next >= 0xA0) {
                return true;
            }
            idx += 1;
            continue;
        }

        // High byte (0x80-0xFF): could be Latin-1 or a CJK lead byte.
        if let Some(next) = lookahead {
            let cjk_lead = (0x81..=0xFE).contains(&current);
            let cjk_trail = matches!(next, 0x40..=0x7E | 0x80..=0xFE);

            if cjk_lead && cjk_trail {
                // Plausible CJK double-byte pair — consume both bytes.
                idx += 2;
                continue;
            }

            // Not a valid CJK pair: a high byte trailed by low ASCII (< 0x40,
            // e.g. space/newline/punctuation) points at Latin-1.
            if current >= 0xA0 && next < 0x40 {
                return true;
            }
        }

        idx += 1;
    }

    false
}
492
493// ============================================================================
494// Encoding Conversion
495// ============================================================================
496
497/// Detect encoding and convert bytes to UTF-8
498///
499/// Returns the detected encoding and the UTF-8 converted content.
500/// This is the core function for normalizing file content to UTF-8 on load.
501pub fn detect_and_convert(bytes: &[u8]) -> (Encoding, Vec<u8>) {
502 if bytes.is_empty() {
503 return (Encoding::Utf8, Vec::new());
504 }
505
506 let encoding = detect_encoding(bytes);
507
508 // For UTF-8 (with or without BOM), we can use the content directly
509 match encoding {
510 Encoding::Utf8 | Encoding::Ascii => {
511 // Already UTF-8, just clone
512 (encoding, bytes.to_vec())
513 }
514 Encoding::Utf8Bom => {
515 // Skip the BOM (3 bytes) and use the rest
516 let content = if bytes.len() > 3 {
517 bytes[3..].to_vec()
518 } else {
519 Vec::new()
520 };
521 (encoding, content)
522 }
523 Encoding::Utf16Le | Encoding::Utf16Be => {
524 // Decode UTF-16 to UTF-8
525 let enc_rs = encoding.to_encoding_rs();
526 let start_offset =
527 if bytes.starts_with(&[0xFF, 0xFE]) || bytes.starts_with(&[0xFE, 0xFF]) {
528 2 // Skip BOM
529 } else {
530 0
531 };
532 let data = &bytes[start_offset..];
533
534 let (cow, _had_errors) = enc_rs.decode_without_bom_handling(data);
535 (encoding, cow.into_owned().into_bytes())
536 }
537 _ => {
538 // Use encoding_rs to convert to UTF-8
539 let enc_rs = encoding.to_encoding_rs();
540 let (cow, _had_errors) = enc_rs.decode_without_bom_handling(bytes);
541 (encoding, cow.into_owned().into_bytes())
542 }
543 }
544}
545
546/// Convert bytes from a specific encoding to UTF-8
547///
548/// Used when opening a file with a user-specified encoding instead of auto-detection.
549/// Returns the UTF-8 converted content.
550pub fn convert_to_utf8(bytes: &[u8], encoding: Encoding) -> Vec<u8> {
551 if bytes.is_empty() {
552 return Vec::new();
553 }
554
555 match encoding {
556 Encoding::Utf8 | Encoding::Ascii => {
557 // Already UTF-8, just clone
558 bytes.to_vec()
559 }
560 Encoding::Utf8Bom => {
561 // Skip the BOM (3 bytes) if present and use the rest
562 if bytes.starts_with(&[0xEF, 0xBB, 0xBF]) && bytes.len() > 3 {
563 bytes[3..].to_vec()
564 } else {
565 bytes.to_vec()
566 }
567 }
568 Encoding::Utf16Le | Encoding::Utf16Be => {
569 // Decode UTF-16 to UTF-8
570 let enc_rs = encoding.to_encoding_rs();
571 let start_offset =
572 if bytes.starts_with(&[0xFF, 0xFE]) || bytes.starts_with(&[0xFE, 0xFF]) {
573 2 // Skip BOM
574 } else {
575 0
576 };
577 let data = &bytes[start_offset..];
578
579 let (cow, _had_errors) = enc_rs.decode_without_bom_handling(data);
580 cow.into_owned().into_bytes()
581 }
582 _ => {
583 // Use encoding_rs to convert to UTF-8
584 let enc_rs = encoding.to_encoding_rs();
585 let (cow, _had_errors) = enc_rs.decode_without_bom_handling(bytes);
586 cow.into_owned().into_bytes()
587 }
588 }
589}
590
591/// Convert UTF-8 content to the specified encoding for saving
592///
593/// Used when saving files to convert internal UTF-8 representation
594/// back to the original (or user-selected) encoding.
595///
596/// Note: This does NOT add BOM - the BOM should be handled separately.
597pub fn convert_from_utf8(utf8_bytes: &[u8], encoding: Encoding) -> Vec<u8> {
598 match encoding {
599 Encoding::Utf8 | Encoding::Ascii | Encoding::Utf8Bom => {
600 // UTF-8 (with or without BOM) - just clone, BOM added separately
601 utf8_bytes.to_vec()
602 }
603 Encoding::Utf16Le => {
604 // Convert UTF-8 to UTF-16 LE (no BOM - added separately)
605 let text = String::from_utf8_lossy(utf8_bytes);
606 let mut result = Vec::new();
607 for code_unit in text.encode_utf16() {
608 result.extend_from_slice(&code_unit.to_le_bytes());
609 }
610 result
611 }
612 Encoding::Utf16Be => {
613 // Convert UTF-8 to UTF-16 BE (no BOM - added separately)
614 let text = String::from_utf8_lossy(utf8_bytes);
615 let mut result = Vec::new();
616 for code_unit in text.encode_utf16() {
617 result.extend_from_slice(&code_unit.to_be_bytes());
618 }
619 result
620 }
621 _ => {
622 // Use encoding_rs to convert from UTF-8
623 let enc_rs = encoding.to_encoding_rs();
624 let text = String::from_utf8_lossy(utf8_bytes);
625 let (cow, _encoding_used, _had_errors) = enc_rs.encode(&text);
626 cow.into_owned()
627 }
628 }
629}
630
631// ============================================================================
632// Tests
633// ============================================================================
634
635#[cfg(test)]
636mod tests {
637 use super::*;
638
    #[test]
    fn test_encoding_display_names() {
        // Spot-check status-bar names across plain, BOM, UTF-16, CJK, and
        // Central European variants.
        assert_eq!(Encoding::Utf8.display_name(), "UTF-8");
        assert_eq!(Encoding::Utf8Bom.display_name(), "UTF-8 BOM");
        assert_eq!(Encoding::Utf16Le.display_name(), "UTF-16 LE");
        assert_eq!(Encoding::Gb18030.display_name(), "GB18030");
        assert_eq!(Encoding::Windows1250.display_name(), "Windows-1250");
    }
647
    #[test]
    fn test_encoding_bom() {
        // Only the explicit BOM variants (UTF-8 BOM, UTF-16) report a BOM.
        assert!(Encoding::Utf8Bom.has_bom());
        assert!(Encoding::Utf16Le.has_bom());
        assert!(!Encoding::Utf8.has_bom());
        assert!(!Encoding::Windows1252.has_bom());
        assert!(!Encoding::Windows1250.has_bom());
    }
656
    #[test]
    fn test_detect_utf8() {
        // Pure 7-bit content is reported as ASCII; multi-byte content as UTF-8.
        assert_eq!(detect_encoding(b"Hello, world!"), Encoding::Ascii);
        assert_eq!(detect_encoding("Hello, 世界!".as_bytes()), Encoding::Utf8);
    }
662
    #[test]
    fn test_detect_utf8_bom() {
        // The 3-byte UTF-8 BOM (EF BB BF) takes priority over content sniffing.
        let with_bom = [0xEF, 0xBB, 0xBF, b'H', b'i'];
        assert_eq!(detect_encoding(&with_bom), Encoding::Utf8Bom);
    }
668
    #[test]
    fn test_detect_utf16_le() {
        // FF FE BOM followed by little-endian code units → UTF-16 LE.
        let utf16_le_bom = [0xFF, 0xFE, b'H', 0x00, b'i', 0x00];
        assert_eq!(detect_encoding(&utf16_le_bom), Encoding::Utf16Le);
    }
674
    #[test]
    fn test_detect_binary() {
        // Null bytes and low control chars (outside the UTF-16 heuristic's
        // reach for such a short sample) flag the content as binary.
        let binary_data = [0x00, 0x01, 0x02, 0x03];
        let (_, is_binary) = detect_encoding_or_binary(&binary_data);
        assert!(is_binary);
    }
681
    #[test]
    fn test_is_binary_control_char() {
        // Exercises all three classes: binary controls, whitelisted text
        // controls, and ordinary printable bytes.

        // Binary control chars
        assert!(is_binary_control_char(0x00)); // NUL
        assert!(is_binary_control_char(0x01)); // SOH
        assert!(is_binary_control_char(0x02)); // STX
        assert!(is_binary_control_char(0x7F)); // DEL

        // Text control chars (allowed)
        assert!(!is_binary_control_char(0x09)); // Tab
        assert!(!is_binary_control_char(0x0A)); // LF
        assert!(!is_binary_control_char(0x0D)); // CR
        assert!(!is_binary_control_char(0x1B)); // ESC

        // Regular printable chars
        assert!(!is_binary_control_char(b'A'));
        assert!(!is_binary_control_char(b' '));
    }
700
    #[test]
    fn test_convert_roundtrip_utf8() {
        // UTF-8 input should survive load (detect + convert) and save
        // (convert back) without any byte changes.
        let original = "Hello, 世界!";
        let bytes = original.as_bytes();

        let (encoding, utf8_content) = detect_and_convert(bytes);
        assert_eq!(encoding, Encoding::Utf8);
        assert_eq!(utf8_content, bytes);

        let back = convert_from_utf8(&utf8_content, encoding);
        assert_eq!(back, bytes);
    }
713
    #[test]
    fn test_convert_roundtrip_utf16le() {
        // Loading strips the BOM and decodes to UTF-8; saving re-encodes
        // to UTF-16 LE without re-adding the BOM (BOM is handled separately).

        // UTF-16 LE with BOM: "Hi"
        let utf16_le = [0xFF, 0xFE, b'H', 0x00, b'i', 0x00];

        let (encoding, utf8_content) = detect_and_convert(&utf16_le);
        assert_eq!(encoding, Encoding::Utf16Le);
        assert_eq!(utf8_content, b"Hi");

        // Note: convert_from_utf8 doesn't add BOM, so result won't have BOM
        let back = convert_from_utf8(&utf8_content, encoding);
        assert_eq!(back, [b'H', 0x00, b'i', 0x00]);
    }
727
    #[test]
    fn test_encoding_resynchronizable() {
        // Covers every variant: resynchronizability decides whether lazy
        // (streaming) loading is safe for the encoding.

        // Self-synchronizing encodings (can find char boundaries from middle of file)
        assert!(Encoding::Utf8.is_resynchronizable());
        assert!(Encoding::Utf8Bom.is_resynchronizable());
        assert!(Encoding::Ascii.is_resynchronizable());
        assert!(Encoding::Latin1.is_resynchronizable());
        assert!(Encoding::Windows1252.is_resynchronizable());
        assert!(Encoding::Windows1250.is_resynchronizable());

        // UTF-16 is resynchronizable with proper alignment
        assert!(Encoding::Utf16Le.is_resynchronizable());
        assert!(Encoding::Utf16Be.is_resynchronizable());

        // Legacy CJK encodings are NOT resynchronizable
        // (second byte of double-byte char can equal a valid single-byte char)
        assert!(!Encoding::Gb18030.is_resynchronizable());
        assert!(!Encoding::Gbk.is_resynchronizable());
        assert!(!Encoding::ShiftJis.is_resynchronizable());
        assert!(!Encoding::EucKr.is_resynchronizable());
    }
749
    #[test]
    fn test_encoding_alignment() {
        // Alignment mirrors resynchronizability: 1 for byte-oriented, 2 for
        // UTF-16, None for legacy CJK (full scan required).

        // Single-byte encodings have alignment of 1
        assert_eq!(Encoding::Ascii.alignment(), Some(1));
        assert_eq!(Encoding::Latin1.alignment(), Some(1));
        assert_eq!(Encoding::Windows1252.alignment(), Some(1));
        assert_eq!(Encoding::Windows1250.alignment(), Some(1));
        assert_eq!(Encoding::Utf8.alignment(), Some(1));
        assert_eq!(Encoding::Utf8Bom.alignment(), Some(1));

        // UTF-16 requires 2-byte alignment
        assert_eq!(Encoding::Utf16Le.alignment(), Some(2));
        assert_eq!(Encoding::Utf16Be.alignment(), Some(2));

        // Non-resynchronizable encodings have no valid alignment
        assert_eq!(Encoding::Gb18030.alignment(), None);
        assert_eq!(Encoding::Gbk.alignment(), None);
        assert_eq!(Encoding::ShiftJis.alignment(), None);
        assert_eq!(Encoding::EucKr.alignment(), None);
    }
770
    #[test]
    fn test_requires_full_file_load() {
        // requires_full_file_load() is the inverse of is_resynchronizable().

        // Encodings that can be streamed
        assert!(!Encoding::Utf8.requires_full_file_load());
        assert!(!Encoding::Ascii.requires_full_file_load());
        assert!(!Encoding::Latin1.requires_full_file_load());
        assert!(!Encoding::Windows1250.requires_full_file_load());
        assert!(!Encoding::Utf16Le.requires_full_file_load());

        // Encodings that require full loading
        assert!(Encoding::Gb18030.requires_full_file_load());
        assert!(Encoding::Gbk.requires_full_file_load());
        assert!(Encoding::ShiftJis.requires_full_file_load());
        assert!(Encoding::EucKr.requires_full_file_load());
    }
786
    #[test]
    fn test_convert_roundtrip_windows1250() {
        // Decode Windows-1250 bytes to UTF-8 via encoding_rs, verify the
        // Polish characters survive, then re-encode and compare bytes.

        // Windows-1250 encoded text with Central European characters
        // "Zażółć" in Windows-1250: Z(0x5A) a(0x61) ż(0xBF) ó(0xF3) ł(0xB3) ć(0xE6)
        let windows1250_bytes: &[u8] = &[0x5A, 0x61, 0xBF, 0xF3, 0xB3, 0xE6];

        // Convert to UTF-8
        let enc_rs = Encoding::Windows1250.to_encoding_rs();
        let (decoded, _had_errors) = enc_rs.decode_without_bom_handling(windows1250_bytes);
        let utf8_content = decoded.as_bytes();

        // The UTF-8 content should contain the Polish characters
        let utf8_str = std::str::from_utf8(utf8_content).unwrap();
        assert!(utf8_str.contains('ż'), "Should contain ż: {}", utf8_str);
        assert!(utf8_str.contains('ó'), "Should contain ó: {}", utf8_str);
        assert!(utf8_str.contains('ł'), "Should contain ł: {}", utf8_str);
        assert!(utf8_str.contains('ć'), "Should contain ć: {}", utf8_str);

        // Convert back to Windows-1250
        let back = convert_from_utf8(utf8_content, Encoding::Windows1250);
        assert_eq!(back, windows1250_bytes, "Round-trip should preserve bytes");
    }
809
    #[test]
    fn test_windows1250_description() {
        // Pin the user-facing command-palette description (including en dash).
        assert_eq!(
            Encoding::Windows1250.description(),
            "Windows-1250 / CP1250 – Central European"
        );
    }
817
    #[test]
    fn test_detect_windows1250_definitive_bytes() {
        // Bytes 0x8D (Ť), 0x8F (Ź), 0x9D (ť) are undefined in Windows-1252
        // but valid in Windows-1250, so they definitively indicate Windows-1250

        // Czech text with ť (0x9D): "měsťo" (city, archaic)
        let with_t_caron = [0x6D, 0x9D, 0x73, 0x74, 0x6F]; // mťsto
        assert_eq!(
            detect_encoding(&with_t_caron),
            Encoding::Windows1250,
            "Byte 0x9D (ť) should trigger Windows-1250 detection"
        );

        // Polish text with Ź (0x8F): "Źródło" (source)
        let with_z_acute_upper = [0x8F, 0x72, 0xF3, 0x64, 0xB3, 0x6F]; // Źródło
        assert_eq!(
            detect_encoding(&with_z_acute_upper),
            Encoding::Windows1250,
            "Byte 0x8F (Ź) should trigger Windows-1250 detection"
        );
    }
839
    #[test]
    fn test_detect_windows1250_strong_indicators() {
        // Several 0x80-0x9F range characters together should tip detection
        // toward Windows-1250 even without a single definitive byte.

        // Polish text with ś (0x9C) and Ś (0x8C) - strong indicators from 0x80-0x9F range
        let polish_text = [
            0x9C, 0x77, 0x69, 0x65, 0x74, 0x79, 0x20, // "świety "
            0x8C, 0x77, 0x69, 0x61, 0x74, // "Świat"
        ];
        assert_eq!(
            detect_encoding(&polish_text),
            Encoding::Windows1250,
            "Multiple Polish characters (ś, Ś) should trigger Windows-1250"
        );
    }
853
    #[test]
    fn test_detect_ambiguous_bytes_as_windows1252() {
        // High bytes that are valid in both code pages must fall back to the
        // more common Windows-1252 rather than guessing Windows-1250.

        // Bytes in 0xA0-0xFF range are ambiguous and should default to Windows-1252
        // Polish "żółć" - ż(0xBF) ó(0xF3) ł(0xB3) ć(0xE6) - all ambiguous
        let zolc = [0xBF, 0xF3, 0xB3, 0xE6];
        assert_eq!(
            detect_encoding(&zolc),
            Encoding::Windows1252,
            "Ambiguous bytes should default to Windows-1252"
        );

        // ą (0xB9) and ł (0xB3) could be ¹ and ³ in Windows-1252
        let ambiguous = [
            0x6D, 0xB9, 0x6B, 0x61, 0x20, // "mąka " or "m¹ka "
            0x6D, 0xB3, 0x6F, 0x64, 0x79, // "młody" or "m³ody"
        ];
        assert_eq!(
            detect_encoding(&ambiguous),
            Encoding::Windows1252,
            "Ambiguous Polish bytes should default to Windows-1252"
        );
    }
876
877 #[test]
878 fn test_detect_windows1250_czech_pangram() {
879 // "Příliš žluťoučký kůň úpěl ďábelské ódy" - Czech pangram in Windows-1250
880 // Contains ť (0x9D) which is a definitive Windows-1250 indicator
881 let czech_pangram: &[u8] = &[
882 0x50, 0xF8, 0xED, 0x6C, 0x69, 0x9A, 0x20, // "Příliš "
883 0x9E, 0x6C, 0x75, 0x9D, 0x6F, 0x75, 0xE8, 0x6B, 0xFD, 0x20, // "žluťoučký "
884 0x6B, 0xF9, 0xF2, 0x20, // "kůň "
885 0xFA, 0x70, 0xEC, 0x6C, 0x20, // "úpěl "
886 0xEF, 0xE1, 0x62, 0x65, 0x6C, 0x73, 0x6B, 0xE9, 0x20, // "ďábelské "
887 0xF3, 0x64, 0x79, // "ódy"
888 ];
889 assert_eq!(
890 detect_encoding(czech_pangram),
891 Encoding::Windows1250,
892 "Czech pangram should be detected as Windows-1250 (contains ť = 0x9D)"
893 );
894 }
895
896 #[test]
897 fn test_detect_windows1252_not_1250() {
898 // Pure Windows-1252 text without Central European indicators
899 // "Café résumé" in Windows-1252
900 let windows1252_text = [
901 0x43, 0x61, 0x66, 0xE9, 0x20, // "Café "
902 0x72, 0xE9, 0x73, 0x75, 0x6D, 0xE9, // "résumé"
903 ];
904 assert_eq!(
905 detect_encoding(&windows1252_text),
906 Encoding::Windows1252,
907 "French text should remain Windows-1252"
908 );
909 }
910
911 #[test]
912 fn test_detect_utf8_chinese_truncated_sequence() {
913 // Test that UTF-8 Chinese text is correctly detected even when the sample
914 // is truncated in the middle of a multi-byte sequence.
915 //
916 // Bug context: When sampling first 8KB for detection, the boundary may cut
917 // through a multi-byte UTF-8 character. This caused valid UTF-8 Chinese text
918 // to fail std::str::from_utf8() validation and fall through to Windows-1250
919 // detection (because UTF-8 continuation bytes like 0x9C, 0x9D overlap with
920 // Windows-1250 indicator bytes).
921
922 // Chinese text "更多" (more) = [0xE6, 0x9B, 0xB4, 0xE5, 0xA4, 0x9A]
923 // If we truncate after 0xE5, we get an incomplete sequence
924 let utf8_chinese_truncated = [
925 0xE6, 0x9B, 0xB4, // 更
926 0xE5, 0xA4, 0x9A, // 多
927 0xE5, // Start of another character, incomplete
928 ];
929
930 // This should still be detected as UTF-8, not Windows-1250
931 assert_eq!(
932 detect_encoding(&utf8_chinese_truncated),
933 Encoding::Utf8,
934 "Truncated UTF-8 Chinese text should be detected as UTF-8"
935 );
936
937 // Test with 2 bytes of incomplete sequence
938 let utf8_chinese_truncated_2 = [
939 0xE6, 0x9B, 0xB4, // 更
940 0xE5, 0xA4, 0x9A, // 多
941 0xE5, 0xA4, // Incomplete 3-byte sequence (missing last byte)
942 ];
943 assert_eq!(
944 detect_encoding(&utf8_chinese_truncated_2),
945 Encoding::Utf8,
946 "Truncated UTF-8 with 2-byte incomplete sequence should be detected as UTF-8"
947 );
948 }
949
950 #[test]
951 fn test_detect_utf8_chinese_with_high_bytes() {
952 // UTF-8 Chinese text contains many continuation bytes in the 0x80-0xBF range,
953 // including bytes like 0x9C, 0x9D that happen to be Windows-1250 indicators.
954 // These should NOT trigger Windows-1250 detection for valid UTF-8 content.
955
956 // Chinese characters that use continuation bytes that overlap with Windows-1250 indicators:
957 // 集 = E9 9B 86 (contains 0x9B)
958 // 精 = E7 B2 BE (contains 0xB2, 0xBE)
959 // Build a string with many such characters
960 let chinese_text = "更多全本全集精校小说"; // Contains various high continuation bytes
961 let bytes = chinese_text.as_bytes();
962
963 assert_eq!(
964 detect_encoding(bytes),
965 Encoding::Utf8,
966 "UTF-8 Chinese text should be detected as UTF-8, not Windows-1250"
967 );
968
969 // Verify these bytes would have triggered Windows-1250 detection if not valid UTF-8
970 // by checking that the sample contains bytes in the 0x80-0x9F range
971 let has_high_continuation_bytes = bytes.iter().any(|&b| (0x80..0xA0).contains(&b));
972 assert!(
973 has_high_continuation_bytes,
974 "Test should include bytes that could be mistaken for Windows-1250 indicators"
975 );
976 }
977
978 #[test]
979 fn test_detect_utf8_sample_truncation_at_boundary() {
980 // Simulate what happens when we take an 8KB sample that ends mid-character
981 // by creating a buffer that's valid UTF-8 except for the last 1-3 bytes
982
983 // Build a large UTF-8 Chinese text buffer
984 let chinese = "我的美女老师"; // "My Beautiful Teacher"
985 let mut buffer = Vec::new();
986 // Repeat to make it substantial
987 for _ in 0..100 {
988 buffer.extend_from_slice(chinese.as_bytes());
989 }
990
991 // Verify it's valid UTF-8 when complete
992 assert!(std::str::from_utf8(&buffer).is_ok());
993 assert_eq!(detect_encoding(&buffer), Encoding::Utf8);
994
995 // Now truncate at various points that cut through multi-byte sequences
996 // Each Chinese character is 3 bytes in UTF-8
997 for truncate_offset in 1..=3 {
998 let truncated_len = buffer.len() - truncate_offset;
999 let truncated = &buffer[..truncated_len];
1000
1001 // The truncated buffer should fail strict UTF-8 validation
1002 // (unless we happen to cut at a character boundary)
1003 let is_strict_valid = std::str::from_utf8(truncated).is_ok();
1004
1005 // But our encoding detection should still detect it as UTF-8
1006 let detected = detect_encoding(truncated);
1007 assert_eq!(
1008 detected,
1009 Encoding::Utf8,
1010 "Truncated UTF-8 at offset -{} should be detected as UTF-8, strict_valid={}",
1011 truncate_offset,
1012 is_strict_valid
1013 );
1014 }
1015 }
1016}