fresh/model/encoding.rs
1//! Text encoding detection and conversion
2//!
3//! This module handles:
4//! - Detecting text encodings from byte content (UTF-8, UTF-16, Latin-1, CJK, etc.)
5//! - Binary file detection (distinguishing text from binary content)
6//! - Converting between encodings (normalizing to UTF-8 on load, converting back on save)
7//!
8//! # Encoding Detection Strategy
9//!
10//! 1. **BOM Detection**: Check for Byte Order Marks (UTF-8 BOM, UTF-16 LE/BE)
11//! 2. **UTF-8 Validation**: Fast path for most modern files
12//! 3. **UTF-16 Heuristics**: Detect UTF-16 without BOM via null byte patterns
13//! 4. **Binary Detection**: Check for control characters that indicate binary content
14//! 5. **Statistical Detection**: Use chardetng for legacy encoding detection
15//! 6. **Fallback**: Default to Windows-1252 for ambiguous cases
16
17use super::encoding_heuristics::has_windows1250_pattern;
18use schemars::JsonSchema;
19use serde::{Deserialize, Serialize};
20
21// ============================================================================
22// Encoding Type
23// ============================================================================
24
25/// Supported text encodings for file I/O
26///
27/// The editor internally uses UTF-8 for all text processing. When loading files,
28/// content is converted from the detected encoding to UTF-8. When saving, content
29/// is converted back to the original (or user-selected) encoding.
/// Supported text encodings for file I/O
///
/// The editor internally uses UTF-8 for all text processing. When loading files,
/// content is converted from the detected encoding to UTF-8. When saving, content
/// is converted back to the original (or user-selected) encoding.
///
/// Variants are serde-serializable (used in config/session state) and carry a
/// JSON schema via `schemars`. `Utf8` is the `Default` variant.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize, JsonSchema)]
pub enum Encoding {
    /// UTF-8 (default, most common)
    #[default]
    Utf8,
    /// UTF-8 with Byte Order Mark
    Utf8Bom,
    /// UTF-16 Little Endian (Windows default for Unicode files)
    Utf16Le,
    /// UTF-16 Big Endian
    Utf16Be,
    /// ASCII (7-bit, subset of UTF-8; decoded as UTF-8 internally)
    Ascii,
    /// Latin-1 / ISO-8859-1 (Western European; mapped to Windows-1252 per WHATWG)
    Latin1,
    /// Windows-1252 / CP-1252 (Windows Western European, often called "ANSI")
    Windows1252,
    /// Windows-1250 / CP-1250 (Windows Central European)
    Windows1250,
    /// GB18030 (Chinese, superset of GBK)
    Gb18030,
    /// GBK (Chinese Simplified, subset of GB18030)
    Gbk,
    /// Shift-JIS (Japanese)
    ShiftJis,
    /// EUC-KR (Korean)
    EucKr,
}
58
59impl Encoding {
60 /// Get the display name for status bar
61 pub fn display_name(&self) -> &'static str {
62 match self {
63 Self::Utf8 => "UTF-8",
64 Self::Utf8Bom => "UTF-8 BOM",
65 Self::Utf16Le => "UTF-16 LE",
66 Self::Utf16Be => "UTF-16 BE",
67 Self::Ascii => "ASCII",
68 Self::Latin1 => "Latin-1",
69 Self::Windows1252 => "Windows-1252",
70 Self::Windows1250 => "Windows-1250",
71 Self::Gb18030 => "GB18030",
72 Self::Gbk => "GBK",
73 Self::ShiftJis => "Shift-JIS",
74 Self::EucKr => "EUC-KR",
75 }
76 }
77
78 /// Get a longer description for UI (e.g., command palette)
79 pub fn description(&self) -> &'static str {
80 match self {
81 Self::Utf8 => "UTF-8",
82 Self::Utf8Bom => "UTF-8 with BOM",
83 Self::Utf16Le => "UTF-16 Little Endian",
84 Self::Utf16Be => "UTF-16 Big Endian",
85 Self::Ascii => "US-ASCII",
86 Self::Latin1 => "ISO-8859-1 / Latin-1 – Western European",
87 Self::Windows1252 => "Windows-1252 / CP1252 – Western European",
88 Self::Windows1250 => "Windows-1250 / CP1250 – Central European",
89 Self::Gb18030 => "GB18030 – Chinese",
90 Self::Gbk => "GBK / CP936 – Simplified Chinese",
91 Self::ShiftJis => "Shift_JIS – Japanese",
92 Self::EucKr => "EUC-KR – Korean",
93 }
94 }
95
96 /// Get the encoding_rs Encoding for this type
97 pub fn to_encoding_rs(&self) -> &'static encoding_rs::Encoding {
98 match self {
99 Self::Utf8 | Self::Utf8Bom | Self::Ascii => encoding_rs::UTF_8,
100 Self::Utf16Le => encoding_rs::UTF_16LE,
101 Self::Utf16Be => encoding_rs::UTF_16BE,
102 Self::Latin1 => encoding_rs::WINDOWS_1252, // ISO-8859-1 maps to Windows-1252 per WHATWG
103 Self::Windows1252 => encoding_rs::WINDOWS_1252,
104 Self::Windows1250 => encoding_rs::WINDOWS_1250,
105 Self::Gb18030 => encoding_rs::GB18030,
106 Self::Gbk => encoding_rs::GBK,
107 Self::ShiftJis => encoding_rs::SHIFT_JIS,
108 Self::EucKr => encoding_rs::EUC_KR,
109 }
110 }
111
112 /// Returns true if this encoding uses a BOM (Byte Order Mark)
113 pub fn has_bom(&self) -> bool {
114 matches!(self, Self::Utf8Bom | Self::Utf16Le | Self::Utf16Be)
115 }
116
117 /// Get the BOM bytes for this encoding (if any)
118 pub fn bom_bytes(&self) -> Option<&'static [u8]> {
119 match self {
120 Self::Utf8Bom => Some(&[0xEF, 0xBB, 0xBF]),
121 Self::Utf16Le => Some(&[0xFF, 0xFE]),
122 Self::Utf16Be => Some(&[0xFE, 0xFF]),
123 _ => None,
124 }
125 }
126
127 /// All available encodings for UI display
128 pub fn all() -> &'static [Encoding] {
129 &[
130 Self::Utf8,
131 Self::Utf8Bom,
132 Self::Utf16Le,
133 Self::Utf16Be,
134 Self::Ascii,
135 Self::Latin1,
136 Self::Windows1252,
137 Self::Windows1250,
138 Self::Gb18030,
139 Self::Gbk,
140 Self::ShiftJis,
141 Self::EucKr,
142 ]
143 }
144
145 /// Returns true if this encoding supports "resynchronization" - the ability to
146 /// find character boundaries when jumping into the middle of a file.
147 ///
148 /// Resynchronizable encodings can be safely used with lazy/streaming file loading
149 /// because you can determine character boundaries from any position.
150 ///
151 /// - **UTF-8**: Excellent - unique bit patterns distinguish lead/continuation bytes
152 /// - **ASCII/Latin-1/Windows-1252**: Trivial - every byte is a character
153 /// - **UTF-16**: Good with 2-byte alignment - can detect surrogate pairs
154 /// - **UTF-32**: Good with 4-byte alignment
155 ///
156 /// Non-resynchronizable encodings (legacy CJK like Shift-JIS, GB18030, GBK, Big5)
157 /// have ambiguous byte sequences where a byte could be either a standalone character
158 /// or part of a multi-byte sequence. You must scan from the beginning to be certain.
159 pub fn is_resynchronizable(&self) -> bool {
160 match self {
161 // Fixed-width single byte - every byte is a character
162 Self::Ascii | Self::Latin1 | Self::Windows1252 | Self::Windows1250 => true,
163
164 // UTF-8 has unique bit patterns for lead vs continuation bytes
165 Self::Utf8 | Self::Utf8Bom => true,
166
167 // UTF-16 is resynchronizable with 2-byte alignment
168 // (can detect surrogate pairs by checking 0xD800-0xDFFF range)
169 Self::Utf16Le | Self::Utf16Be => true,
170
171 // Legacy CJK encodings are NOT resynchronizable
172 // The second byte of a double-byte char can equal a valid single-byte char
173 Self::Gb18030 | Self::Gbk | Self::ShiftJis | Self::EucKr => false,
174 }
175 }
176
177 /// Returns the byte alignment required for this encoding when doing random access.
178 ///
179 /// For lazy loading of large files, reads must be aligned to this boundary.
180 /// Returns None if the encoding is not resynchronizable (requires full file scan).
181 pub fn alignment(&self) -> Option<usize> {
182 match self {
183 // Single-byte encodings - no alignment needed
184 Self::Ascii | Self::Latin1 | Self::Windows1252 | Self::Windows1250 => Some(1),
185
186 // UTF-8 - no alignment needed (self-synchronizing)
187 Self::Utf8 | Self::Utf8Bom => Some(1),
188
189 // UTF-16 - must be 2-byte aligned
190 Self::Utf16Le | Self::Utf16Be => Some(2),
191
192 // Legacy CJK - not resynchronizable, no valid alignment
193 Self::Gb18030 | Self::Gbk | Self::ShiftJis | Self::EucKr => None,
194 }
195 }
196
197 /// Returns true if this encoding requires the entire file to be loaded
198 /// for correct decoding (cannot use lazy/streaming loading).
199 ///
200 /// This is the inverse of `is_resynchronizable()` and indicates that
201 /// the user should be warned before loading large files in this encoding.
202 pub fn requires_full_file_load(&self) -> bool {
203 !self.is_resynchronizable()
204 }
205}
206
207// ============================================================================
208// Encoding Detection
209// ============================================================================
210
211/// Detect the text encoding from a sample of bytes
212///
213/// This function delegates to `detect_encoding_or_binary` and returns only
214/// the encoding, ignoring the binary flag. Use `detect_encoding_or_binary`
215/// when you need to know if the content should be treated as binary.
216pub fn detect_encoding(bytes: &[u8]) -> Encoding {
217 detect_encoding_or_binary(bytes).0
218}
219
220/// Detect the text encoding and whether content is binary.
221///
222/// Returns (Encoding, is_binary) where:
223/// - Encoding is the detected encoding (or default if binary)
224/// - is_binary is true if the content should be treated as raw binary
225///
226/// # Detection Strategy
227///
228/// 1. Check for BOM (Byte Order Mark) - highest priority, definitely not binary
229/// 2. Try UTF-8 validation (fast path for most files), definitely not binary
230/// 3. Check for UTF-16 patterns without BOM, definitely not binary
231/// 4. Check for binary control characters (null bytes, etc.) - if found, it's binary
232/// 5. Use chardetng for statistical detection of legacy encodings
233/// 6. If encoding detection is uncertain, default to Windows-1252
234pub fn detect_encoding_or_binary(bytes: &[u8]) -> (Encoding, bool) {
235 // Only check the first 8KB for encoding detection
236 let check_len = bytes.len().min(8 * 1024);
237 let sample = &bytes[..check_len];
238
239 // 1. Check for BOM (Byte Order Mark) - highest priority, definitely text
240 if sample.starts_with(&[0xEF, 0xBB, 0xBF]) {
241 return (Encoding::Utf8Bom, false);
242 }
243 if sample.starts_with(&[0xFF, 0xFE]) {
244 // Could also be UTF-32 LE, but UTF-16 LE is much more common
245 return (Encoding::Utf16Le, false);
246 }
247 if sample.starts_with(&[0xFE, 0xFF]) {
248 return (Encoding::Utf16Be, false);
249 }
250
251 // 2. Try UTF-8 validation (fast path for most modern files)
252 // Note: When we truncate to 8KB, we may cut in the middle of a multi-byte UTF-8 sequence.
253 // We need to handle this case - if most of the sample is valid UTF-8 and the only error
254 // is an incomplete sequence at the very end, we should still detect it as UTF-8.
255 let utf8_valid_len = match std::str::from_utf8(sample) {
256 Ok(_) => sample.len(),
257 Err(e) => {
258 // error_len() returns None if the error is due to incomplete sequence at end
259 // (i.e., unexpected end of input), vs Some(n) for an invalid byte
260 if e.error_len().is_none() {
261 // Incomplete sequence at end - this is likely due to sample truncation
262 e.valid_up_to()
263 } else {
264 // Invalid byte found - not valid UTF-8
265 0
266 }
267 }
268 };
269
270 // If most of the sample is valid UTF-8 (at least 99% or all but the last few bytes),
271 // treat it as UTF-8. The incomplete sequence at end is just due to sample truncation.
272 if utf8_valid_len > 0 && (utf8_valid_len == sample.len() || utf8_valid_len >= sample.len() - 3)
273 {
274 let valid_sample = &sample[..utf8_valid_len];
275 // Check if it's pure ASCII (subset of UTF-8)
276 // Also check for binary indicators in valid ASCII/UTF-8
277 let has_binary_control = valid_sample.iter().any(|&b| is_binary_control_char(b));
278 if has_binary_control {
279 return (Encoding::Utf8, true);
280 }
281 if valid_sample.iter().all(|&b| b < 128) {
282 return (Encoding::Ascii, false);
283 }
284 return (Encoding::Utf8, false);
285 }
286
287 // 3. Check for UTF-16 without BOM (common in some Windows files)
288 // Heuristic: Look for patterns of null bytes alternating with printable chars
289 // The non-null byte should be printable (0x20-0x7E) or a valid high byte
290 //
291 // Note: Unlike UTF-8 above, this heuristic is robust to sample truncation because:
292 // - We use statistical pattern matching (50% threshold), not strict validation
293 // - chunks(2) naturally handles odd-length samples by dropping the last byte
294 // - Losing 1 pair out of ~4096 doesn't affect the detection threshold
295 if sample.len() >= 4 {
296 let is_printable_or_high = |b: u8| (0x20..=0x7E).contains(&b) || b >= 0x80;
297
298 // Align to even boundary to ensure we only process complete 2-byte pairs
299 let aligned_len = sample.len() & !1; // Round down to even
300 let aligned_sample = &sample[..aligned_len];
301
302 let le_pairs = aligned_sample
303 .chunks(2)
304 .filter(|chunk| chunk[1] == 0 && is_printable_or_high(chunk[0]))
305 .count();
306 let be_pairs = aligned_sample
307 .chunks(2)
308 .filter(|chunk| chunk[0] == 0 && is_printable_or_high(chunk[1]))
309 .count();
310 let pair_count = aligned_len / 2;
311
312 // If more than 50% of pairs look like valid UTF-16 text, it's text
313 if le_pairs > pair_count / 2 {
314 return (Encoding::Utf16Le, false);
315 }
316 if be_pairs > pair_count / 2 {
317 return (Encoding::Utf16Be, false);
318 }
319 }
320
321 // 4. Check for binary indicators EARLY (before chardetng)
322 // Binary files often contain control characters and null bytes that should not
323 // appear in any valid text encoding. Check this before chardetng because
324 // chardetng might still be "confident" about some encoding for binary data.
325 let has_binary_control = sample
326 .iter()
327 .any(|&b| b == 0x00 || is_binary_control_char(b));
328 if has_binary_control {
329 return (Encoding::Utf8, true);
330 }
331
332 // 5. Check for Latin-1 patterns: high bytes followed by invalid CJK trail bytes
333 // In GB18030/GBK, trail bytes must be 0x40-0x7E or 0x80-0xFE
334 // If a high byte is followed by a byte outside these ranges (e.g., space, newline,
335 // punctuation < 0x40), it's likely Latin-1, not CJK
336 let has_latin1_pattern = has_latin1_high_byte_pattern(sample);
337
338 // Also check for bytes in CJK-only range (0x81-0x9F) which can only be CJK lead bytes
339 let has_cjk_only_bytes = sample.iter().any(|&b| (0x81..0xA0).contains(&b));
340
341 // 6. Use chardetng for statistical encoding detection
342 let mut detector = chardetng::EncodingDetector::new();
343 detector.feed(sample, true);
344 let (detected_encoding, confident) = detector.guess_assess(None, true);
345
346 // If chardetng is confident, use that encoding (not binary)
347 if confident {
348 let is_cjk_encoding = detected_encoding == encoding_rs::GB18030
349 || detected_encoding == encoding_rs::GBK
350 || detected_encoding == encoding_rs::SHIFT_JIS
351 || detected_encoding == encoding_rs::EUC_KR;
352
353 // For CJK encodings, prefer Windows-1252 if we have clear Latin-1 indicators:
354 // - Space followed by high byte (0xA0-0xFF) is common in Latin-1 text
355 //
356 // If there are CJK-only bytes (0x81-0x9F), it's definitely CJK (not ambiguous).
357 // If there are Latin-1 patterns (space + high byte), prefer Windows-1252.
358 // Otherwise, trust chardetng's detection.
359 if is_cjk_encoding && !has_cjk_only_bytes && has_latin1_pattern {
360 return (Encoding::Windows1252, false);
361 }
362
363 // GBK is a subset of GB18030. Since we only inspect the first 8KB for
364 // detection, the sample may not contain GB18030-only code points (uncommon
365 // Chinese characters, emoji, etc.). Treating GBK as GB18030 is safer and
366 // ensures proper display of all characters including French, Spanish, and emoji.
367 let encoding =
368 if detected_encoding == encoding_rs::GB18030 || detected_encoding == encoding_rs::GBK {
369 Encoding::Gb18030
370 } else if detected_encoding == encoding_rs::SHIFT_JIS {
371 Encoding::ShiftJis
372 } else if detected_encoding == encoding_rs::EUC_KR {
373 Encoding::EucKr
374 } else if detected_encoding == encoding_rs::WINDOWS_1252
375 || detected_encoding == encoding_rs::WINDOWS_1250
376 {
377 // chardetng often returns Windows-1252 for Central European text
378 // Check for Windows-1250 specific patterns
379 if has_windows1250_pattern(sample) {
380 Encoding::Windows1250
381 } else {
382 Encoding::Windows1252
383 }
384 } else if detected_encoding == encoding_rs::UTF_8 {
385 // chardetng thinks it's UTF-8, but validation failed above
386 // Could still be Windows-1250 if it has Central European patterns
387 if has_windows1250_pattern(sample) {
388 Encoding::Windows1250
389 } else {
390 Encoding::Windows1252
391 }
392 } else {
393 // Unknown encoding - check for Windows-1250 patterns
394 if has_windows1250_pattern(sample) {
395 Encoding::Windows1250
396 } else {
397 Encoding::Windows1252
398 }
399 };
400 return (encoding, false);
401 }
402
403 // 7. chardetng not confident, but no binary indicators - check for Windows-1250 patterns
404 // We already checked for binary control chars earlier, so this is valid text
405 if has_windows1250_pattern(sample) {
406 (Encoding::Windows1250, false)
407 } else {
408 (Encoding::Windows1252, false)
409 }
410}
411
412// ============================================================================
413// Binary Detection Helpers
414// ============================================================================
415
416/// Check if a byte is a binary control character
417///
418/// Returns true for control characters that typically indicate binary content,
419/// excluding common text control chars (tab, newline, CR, form feed, etc.)
/// Check if a byte is a binary control character
///
/// Returns true for control characters that typically indicate binary content,
/// excluding common text control chars (tab, newline, CR, form feed, etc.)
pub fn is_binary_control_char(byte: u8) -> bool {
    // Binary = any C0 control except the usual text controls
    // (0x09 Tab, 0x0A LF, 0x0B VT, 0x0C FF, 0x0D CR, 0x1B ESC),
    // plus DEL (0x7F). Everything >= 0x20 other than DEL is text.
    matches!(byte, 0x00..=0x08 | 0x0E..=0x1A | 0x1C..=0x1F | 0x7F)
}
432
433/// Check if sample has Latin-1 patterns that cannot be valid CJK encoding
434///
435/// In GB18030/GBK, valid sequences are:
436/// - ASCII bytes (0x00-0x7F) as standalone characters
437/// - Lead byte (0x81-0xFE) + Trail byte (0x40-0x7E or 0x80-0xFE)
438///
439/// This function looks for patterns that indicate Latin-1:
440/// 1. High bytes followed by invalid CJK trail bytes (space, newline, etc.)
441/// 2. ASCII word followed by space followed by high byte (like "Hello é")
442/// 3. High byte immediately after ASCII space (like " é")
/// Check if sample has Latin-1 patterns that cannot be valid CJK encoding
///
/// In GB18030/GBK, valid sequences are:
/// - ASCII bytes (0x00-0x7F) as standalone characters
/// - Lead byte (0x81-0xFE) + Trail byte (0x40-0x7E or 0x80-0xFE)
///
/// This function looks for patterns that indicate Latin-1:
/// 1. High bytes followed by invalid CJK trail bytes (space, newline, etc.)
/// 2. ASCII word followed by space followed by high byte (like "Hello é")
/// 3. High byte immediately after ASCII space (like " é")
fn has_latin1_high_byte_pattern(sample: &[u8]) -> bool {
    let mut idx = 0;

    while idx < sample.len() {
        let current = sample[idx];
        let lookahead = sample.get(idx + 1).copied();

        if current < 0x80 {
            // ASCII byte. A space directly followed by a Latin-1 extended char
            // (0xA0-0xFF, never a CJK-only lead byte) is typical of text like
            // "Café résumé" — treat it as a Latin-1 indicator.
            if current == 0x20 && matches!(lookahead, Some(next) if next >= 0xA0) {
                return true;
            }
            idx += 1;
            continue;
        }

        // High byte (0x80-0xFF): could be Latin-1 or a CJK lead byte.
        if let Some(next) = lookahead {
            let cjk_lead = (0x81..=0xFE).contains(&current);
            let cjk_trail = matches!(next, 0x40..=0x7E | 0x80..=0xFE);

            if cjk_lead && cjk_trail {
                // Plausible CJK double-byte pair — consume both bytes.
                idx += 2;
                continue;
            }

            // Not a valid CJK pair: a high byte trailed by low ASCII (< 0x40,
            // e.g. space/newline/punctuation) points at Latin-1.
            if current >= 0xA0 && next < 0x40 {
                return true;
            }
        }

        idx += 1;
    }

    false
}
492
493// ============================================================================
494// Encoding Conversion
495// ============================================================================
496
497/// Detect encoding and convert bytes to UTF-8
498///
499/// Returns the detected encoding and the UTF-8 converted content.
500/// This is the core function for normalizing file content to UTF-8 on load.
501pub fn detect_and_convert(bytes: &[u8]) -> (Encoding, Vec<u8>) {
502 if bytes.is_empty() {
503 return (Encoding::Utf8, Vec::new());
504 }
505
506 let encoding = detect_encoding(bytes);
507
508 // For UTF-8 (with or without BOM), we can use the content directly
509 match encoding {
510 Encoding::Utf8 | Encoding::Ascii => {
511 // Already UTF-8, just clone
512 (encoding, bytes.to_vec())
513 }
514 Encoding::Utf8Bom => {
515 // Skip the BOM (3 bytes) and use the rest
516 let content = if bytes.len() > 3 {
517 bytes[3..].to_vec()
518 } else {
519 Vec::new()
520 };
521 (encoding, content)
522 }
523 Encoding::Utf16Le | Encoding::Utf16Be => {
524 // Decode UTF-16 to UTF-8
525 let enc_rs = encoding.to_encoding_rs();
526 let start_offset =
527 if bytes.starts_with(&[0xFF, 0xFE]) || bytes.starts_with(&[0xFE, 0xFF]) {
528 2 // Skip BOM
529 } else {
530 0
531 };
532 let data = &bytes[start_offset..];
533
534 let (cow, _had_errors) = enc_rs.decode_without_bom_handling(data);
535 (encoding, cow.into_owned().into_bytes())
536 }
537 _ => {
538 // Use encoding_rs to convert to UTF-8
539 let enc_rs = encoding.to_encoding_rs();
540 let (cow, _had_errors) = enc_rs.decode_without_bom_handling(bytes);
541 (encoding, cow.into_owned().into_bytes())
542 }
543 }
544}
545
546/// Convert bytes from a specific encoding to UTF-8
547///
548/// Used when opening a file with a user-specified encoding instead of auto-detection.
549/// Returns the UTF-8 converted content.
550pub fn convert_to_utf8(bytes: &[u8], encoding: Encoding) -> Vec<u8> {
551 if bytes.is_empty() {
552 return Vec::new();
553 }
554
555 match encoding {
556 Encoding::Utf8 | Encoding::Ascii => {
557 // Already UTF-8, just clone
558 bytes.to_vec()
559 }
560 Encoding::Utf8Bom => {
561 // Skip the BOM (3 bytes) if present and use the rest
562 if bytes.starts_with(&[0xEF, 0xBB, 0xBF]) && bytes.len() > 3 {
563 bytes[3..].to_vec()
564 } else {
565 bytes.to_vec()
566 }
567 }
568 Encoding::Utf16Le | Encoding::Utf16Be => {
569 // Decode UTF-16 to UTF-8
570 let enc_rs = encoding.to_encoding_rs();
571 let start_offset =
572 if bytes.starts_with(&[0xFF, 0xFE]) || bytes.starts_with(&[0xFE, 0xFF]) {
573 2 // Skip BOM
574 } else {
575 0
576 };
577 let data = &bytes[start_offset..];
578
579 let (cow, _had_errors) = enc_rs.decode_without_bom_handling(data);
580 cow.into_owned().into_bytes()
581 }
582 _ => {
583 // Use encoding_rs to convert to UTF-8
584 let enc_rs = encoding.to_encoding_rs();
585 let (cow, _had_errors) = enc_rs.decode_without_bom_handling(bytes);
586 cow.into_owned().into_bytes()
587 }
588 }
589}
590
591/// Convert UTF-8 content to the specified encoding for saving
592///
593/// Used when saving files to convert internal UTF-8 representation
594/// back to the original (or user-selected) encoding.
595///
596/// Note: This does NOT add BOM - the BOM should be handled separately.
597pub fn convert_from_utf8(utf8_bytes: &[u8], encoding: Encoding) -> Vec<u8> {
598 match encoding {
599 Encoding::Utf8 | Encoding::Ascii | Encoding::Utf8Bom => {
600 // UTF-8 (with or without BOM) - just clone, BOM added separately
601 utf8_bytes.to_vec()
602 }
603 Encoding::Utf16Le => {
604 // Convert UTF-8 to UTF-16 LE (no BOM - added separately)
605 let text = String::from_utf8_lossy(utf8_bytes);
606 let mut result = Vec::new();
607 for code_unit in text.encode_utf16() {
608 result.extend_from_slice(&code_unit.to_le_bytes());
609 }
610 result
611 }
612 Encoding::Utf16Be => {
613 // Convert UTF-8 to UTF-16 BE (no BOM - added separately)
614 let text = String::from_utf8_lossy(utf8_bytes);
615 let mut result = Vec::new();
616 for code_unit in text.encode_utf16() {
617 result.extend_from_slice(&code_unit.to_be_bytes());
618 }
619 result
620 }
621 _ => {
622 // Use encoding_rs to convert from UTF-8
623 let enc_rs = encoding.to_encoding_rs();
624 let text = String::from_utf8_lossy(utf8_bytes);
625 let (cow, _encoding_used, _had_errors) = enc_rs.encode(&text);
626 cow.into_owned()
627 }
628 }
629}
630
631// ============================================================================
632// Tests
633// ============================================================================
634
635#[cfg(test)]
636mod tests {
637 use super::*;
638
    #[test]
    fn test_encoding_display_names() {
        // Spot-check status-bar names across plain, BOM, UTF-16, CJK, and
        // Central European variants.
        assert_eq!(Encoding::Utf8.display_name(), "UTF-8");
        assert_eq!(Encoding::Utf8Bom.display_name(), "UTF-8 BOM");
        assert_eq!(Encoding::Utf16Le.display_name(), "UTF-16 LE");
        assert_eq!(Encoding::Gb18030.display_name(), "GB18030");
        assert_eq!(Encoding::Windows1250.display_name(), "Windows-1250");
    }
647
    #[test]
    fn test_encoding_bom() {
        // Only the explicit BOM variants (UTF-8 BOM, UTF-16) report a BOM.
        assert!(Encoding::Utf8Bom.has_bom());
        assert!(Encoding::Utf16Le.has_bom());
        assert!(!Encoding::Utf8.has_bom());
        assert!(!Encoding::Windows1252.has_bom());
        assert!(!Encoding::Windows1250.has_bom());
    }
656
    #[test]
    fn test_detect_utf8() {
        // Pure 7-bit content is reported as ASCII; multi-byte content as UTF-8.
        assert_eq!(detect_encoding(b"Hello, world!"), Encoding::Ascii);
        assert_eq!(detect_encoding("Hello, 世界!".as_bytes()), Encoding::Utf8);
    }
662
    #[test]
    fn test_detect_utf8_bom() {
        // The 3-byte UTF-8 BOM (EF BB BF) takes priority over content sniffing.
        let with_bom = [0xEF, 0xBB, 0xBF, b'H', b'i'];
        assert_eq!(detect_encoding(&with_bom), Encoding::Utf8Bom);
    }
668
    #[test]
    fn test_detect_utf16_le() {
        // FF FE BOM followed by little-endian code units → UTF-16 LE.
        let utf16_le_bom = [0xFF, 0xFE, b'H', 0x00, b'i', 0x00];
        assert_eq!(detect_encoding(&utf16_le_bom), Encoding::Utf16Le);
    }
674
    #[test]
    fn test_detect_binary() {
        // Null bytes and low control chars (outside the UTF-16 heuristic's
        // reach for such a short sample) flag the content as binary.
        let binary_data = [0x00, 0x01, 0x02, 0x03];
        let (_, is_binary) = detect_encoding_or_binary(&binary_data);
        assert!(is_binary);
    }
681
    #[test]
    fn test_is_binary_control_char() {
        // Exercises all three classes: binary controls, whitelisted text
        // controls, and ordinary printable bytes.

        // Binary control chars
        assert!(is_binary_control_char(0x00)); // NUL
        assert!(is_binary_control_char(0x01)); // SOH
        assert!(is_binary_control_char(0x02)); // STX
        assert!(is_binary_control_char(0x7F)); // DEL

        // Text control chars (allowed)
        assert!(!is_binary_control_char(0x09)); // Tab
        assert!(!is_binary_control_char(0x0A)); // LF
        assert!(!is_binary_control_char(0x0D)); // CR
        assert!(!is_binary_control_char(0x1B)); // ESC

        // Regular printable chars
        assert!(!is_binary_control_char(b'A'));
        assert!(!is_binary_control_char(b' '));
    }
700
    #[test]
    fn test_convert_roundtrip_utf8() {
        // UTF-8 input should survive load (detect + convert) and save
        // (convert back) without any byte changes.
        let original = "Hello, 世界!";
        let bytes = original.as_bytes();

        let (encoding, utf8_content) = detect_and_convert(bytes);
        assert_eq!(encoding, Encoding::Utf8);
        assert_eq!(utf8_content, bytes);

        let back = convert_from_utf8(&utf8_content, encoding);
        assert_eq!(back, bytes);
    }
713
    #[test]
    fn test_convert_roundtrip_utf16le() {
        // Loading strips the BOM and decodes to UTF-8; saving re-encodes
        // to UTF-16 LE without re-adding the BOM (BOM is handled separately).

        // UTF-16 LE with BOM: "Hi"
        let utf16_le = [0xFF, 0xFE, b'H', 0x00, b'i', 0x00];

        let (encoding, utf8_content) = detect_and_convert(&utf16_le);
        assert_eq!(encoding, Encoding::Utf16Le);
        assert_eq!(utf8_content, b"Hi");

        // Note: convert_from_utf8 doesn't add BOM, so result won't have BOM
        let back = convert_from_utf8(&utf8_content, encoding);
        assert_eq!(back, [b'H', 0x00, b'i', 0x00]);
    }
727
    #[test]
    fn test_encoding_resynchronizable() {
        // Covers every variant: resynchronizability decides whether lazy
        // (streaming) loading is safe for the encoding.

        // Self-synchronizing encodings (can find char boundaries from middle of file)
        assert!(Encoding::Utf8.is_resynchronizable());
        assert!(Encoding::Utf8Bom.is_resynchronizable());
        assert!(Encoding::Ascii.is_resynchronizable());
        assert!(Encoding::Latin1.is_resynchronizable());
        assert!(Encoding::Windows1252.is_resynchronizable());
        assert!(Encoding::Windows1250.is_resynchronizable());

        // UTF-16 is resynchronizable with proper alignment
        assert!(Encoding::Utf16Le.is_resynchronizable());
        assert!(Encoding::Utf16Be.is_resynchronizable());

        // Legacy CJK encodings are NOT resynchronizable
        // (second byte of double-byte char can equal a valid single-byte char)
        assert!(!Encoding::Gb18030.is_resynchronizable());
        assert!(!Encoding::Gbk.is_resynchronizable());
        assert!(!Encoding::ShiftJis.is_resynchronizable());
        assert!(!Encoding::EucKr.is_resynchronizable());
    }
749
    #[test]
    fn test_encoding_alignment() {
        // Alignment mirrors resynchronizability: 1 for byte-oriented, 2 for
        // UTF-16, None for legacy CJK (full scan required).

        // Single-byte encodings have alignment of 1
        assert_eq!(Encoding::Ascii.alignment(), Some(1));
        assert_eq!(Encoding::Latin1.alignment(), Some(1));
        assert_eq!(Encoding::Windows1252.alignment(), Some(1));
        assert_eq!(Encoding::Windows1250.alignment(), Some(1));
        assert_eq!(Encoding::Utf8.alignment(), Some(1));
        assert_eq!(Encoding::Utf8Bom.alignment(), Some(1));

        // UTF-16 requires 2-byte alignment
        assert_eq!(Encoding::Utf16Le.alignment(), Some(2));
        assert_eq!(Encoding::Utf16Be.alignment(), Some(2));

        // Non-resynchronizable encodings have no valid alignment
        assert_eq!(Encoding::Gb18030.alignment(), None);
        assert_eq!(Encoding::Gbk.alignment(), None);
        assert_eq!(Encoding::ShiftJis.alignment(), None);
        assert_eq!(Encoding::EucKr.alignment(), None);
    }
770
    #[test]
    fn test_requires_full_file_load() {
        // requires_full_file_load() is the inverse of is_resynchronizable().

        // Encodings that can be streamed
        assert!(!Encoding::Utf8.requires_full_file_load());
        assert!(!Encoding::Ascii.requires_full_file_load());
        assert!(!Encoding::Latin1.requires_full_file_load());
        assert!(!Encoding::Windows1250.requires_full_file_load());
        assert!(!Encoding::Utf16Le.requires_full_file_load());

        // Encodings that require full loading
        assert!(Encoding::Gb18030.requires_full_file_load());
        assert!(Encoding::Gbk.requires_full_file_load());
        assert!(Encoding::ShiftJis.requires_full_file_load());
        assert!(Encoding::EucKr.requires_full_file_load());
    }
786
    #[test]
    fn test_convert_roundtrip_windows1250() {
        // Decode Windows-1250 bytes to UTF-8 via encoding_rs, verify the
        // Polish characters survive, then re-encode and compare bytes.

        // Windows-1250 encoded text with Central European characters
        // "Zażółć" in Windows-1250: Z(0x5A) a(0x61) ż(0xBF) ó(0xF3) ł(0xB3) ć(0xE6)
        let windows1250_bytes: &[u8] = &[0x5A, 0x61, 0xBF, 0xF3, 0xB3, 0xE6];

        // Convert to UTF-8
        let enc_rs = Encoding::Windows1250.to_encoding_rs();
        let (decoded, _had_errors) = enc_rs.decode_without_bom_handling(windows1250_bytes);
        let utf8_content = decoded.as_bytes();

        // The UTF-8 content should contain the Polish characters
        let utf8_str = std::str::from_utf8(utf8_content).unwrap();
        assert!(utf8_str.contains('ż'), "Should contain ż: {}", utf8_str);
        assert!(utf8_str.contains('ó'), "Should contain ó: {}", utf8_str);
        assert!(utf8_str.contains('ł'), "Should contain ł: {}", utf8_str);
        assert!(utf8_str.contains('ć'), "Should contain ć: {}", utf8_str);

        // Convert back to Windows-1250
        let back = convert_from_utf8(utf8_content, Encoding::Windows1250);
        assert_eq!(back, windows1250_bytes, "Round-trip should preserve bytes");
    }
809
    #[test]
    fn test_windows1250_description() {
        // Pin the user-facing command-palette description (including en dash).
        assert_eq!(
            Encoding::Windows1250.description(),
            "Windows-1250 / CP1250 – Central European"
        );
    }
817
    #[test]
    fn test_detect_windows1250_definitive_bytes() {
        // Bytes 0x8D (Ť), 0x8F (Ź), 0x9D (ť) are undefined in Windows-1252
        // but valid in Windows-1250, so they definitively indicate Windows-1250

        // Czech text with ť (0x9D): "měsťo" (city, archaic)
        let with_t_caron = [0x6D, 0x9D, 0x73, 0x74, 0x6F]; // mťsto
        assert_eq!(
            detect_encoding(&with_t_caron),
            Encoding::Windows1250,
            "Byte 0x9D (ť) should trigger Windows-1250 detection"
        );

        // Polish text with Ź (0x8F): "Źródło" (source)
        let with_z_acute_upper = [0x8F, 0x72, 0xF3, 0x64, 0xB3, 0x6F]; // Źródło
        assert_eq!(
            detect_encoding(&with_z_acute_upper),
            Encoding::Windows1250,
            "Byte 0x8F (Ź) should trigger Windows-1250 detection"
        );
    }
839
    #[test]
    fn test_detect_windows1250_strong_indicators() {
        // Several 0x80-0x9F range characters together should tip detection
        // toward Windows-1250 even without a single definitive byte.

        // Polish text with ś (0x9C) and Ś (0x8C) - strong indicators from 0x80-0x9F range
        let polish_text = [
            0x9C, 0x77, 0x69, 0x65, 0x74, 0x79, 0x20, // "świety "
            0x8C, 0x77, 0x69, 0x61, 0x74, // "Świat"
        ];
        assert_eq!(
            detect_encoding(&polish_text),
            Encoding::Windows1250,
            "Multiple Polish characters (ś, Ś) should trigger Windows-1250"
        );
    }
853
    #[test]
    fn test_detect_ambiguous_bytes_as_windows1252() {
        // High bytes that are valid in both code pages must fall back to the
        // more common Windows-1252 rather than guessing Windows-1250.

        // Bytes in 0xA0-0xFF range are ambiguous and should default to Windows-1252
        // Polish "żółć" - ż(0xBF) ó(0xF3) ł(0xB3) ć(0xE6) - all ambiguous
        let zolc = [0xBF, 0xF3, 0xB3, 0xE6];
        assert_eq!(
            detect_encoding(&zolc),
            Encoding::Windows1252,
            "Ambiguous bytes should default to Windows-1252"
        );

        // ą (0xB9) and ł (0xB3) could be ¹ and ³ in Windows-1252
        let ambiguous = [
            0x6D, 0xB9, 0x6B, 0x61, 0x20, // "mąka " or "m¹ka "
            0x6D, 0xB3, 0x6F, 0x64, 0x79, // "młody" or "m³ody"
        ];
        assert_eq!(
            detect_encoding(&ambiguous),
            Encoding::Windows1252,
            "Ambiguous Polish bytes should default to Windows-1252"
        );
    }
876
877 #[test]
878 fn test_detect_windows1250_czech_pangram() {
879 // "Příliš žluťoučký kůň úpěl ďábelské ódy" - Czech pangram in Windows-1250
880 // Contains ť (0x9D) which is a definitive Windows-1250 indicator
881 let czech_pangram: &[u8] = &[
882 0x50, 0xF8, 0xED, 0x6C, 0x69, 0x9A, 0x20, // "Příliš "
883 0x9E, 0x6C, 0x75, 0x9D, 0x6F, 0x75, 0xE8, 0x6B, 0xFD, 0x20, // "žluťoučký "
884 0x6B, 0xF9, 0xF2, 0x20, // "kůň "
885 0xFA, 0x70, 0xEC, 0x6C, 0x20, // "úpěl "
886 0xEF, 0xE1, 0x62, 0x65, 0x6C, 0x73, 0x6B, 0xE9, 0x20, // "ďábelské "
887 0xF3, 0x64, 0x79, // "ódy"
888 ];
889 assert_eq!(
890 detect_encoding(czech_pangram),
891 Encoding::Windows1250,
892 "Czech pangram should be detected as Windows-1250 (contains ť = 0x9D)"
893 );
894 }
895
896 #[test]
897 fn test_detect_windows1252_not_1250() {
898 // Pure Windows-1252 text without Central European indicators
899 // "Café résumé" in Windows-1252
900 let windows1252_text = [
901 0x43, 0x61, 0x66, 0xE9, 0x20, // "Café "
902 0x72, 0xE9, 0x73, 0x75, 0x6D, 0xE9, // "résumé"
903 ];
904 assert_eq!(
905 detect_encoding(&windows1252_text),
906 Encoding::Windows1252,
907 "French text should remain Windows-1252"
908 );
909 }
910
911 #[test]
912 fn test_detect_utf8_chinese_truncated_sequence() {
913 // Test that UTF-8 Chinese text is correctly detected even when the sample
914 // is truncated in the middle of a multi-byte sequence.
915 //
916 // Bug context: When sampling first 8KB for detection, the boundary may cut
917 // through a multi-byte UTF-8 character. This caused valid UTF-8 Chinese text
918 // to fail std::str::from_utf8() validation and fall through to Windows-1250
919 // detection (because UTF-8 continuation bytes like 0x9C, 0x9D overlap with
920 // Windows-1250 indicator bytes).
921
922 // Chinese text "更多" (more) = [0xE6, 0x9B, 0xB4, 0xE5, 0xA4, 0x9A]
923 // If we truncate after 0xE5, we get an incomplete sequence
924 let utf8_chinese_truncated = [
925 0xE6, 0x9B, 0xB4, // 更
926 0xE5, 0xA4, 0x9A, // 多
927 0xE5, // Start of another character, incomplete
928 ];
929
930 // This should still be detected as UTF-8, not Windows-1250
931 assert_eq!(
932 detect_encoding(&utf8_chinese_truncated),
933 Encoding::Utf8,
934 "Truncated UTF-8 Chinese text should be detected as UTF-8"
935 );
936
937 // Test with 2 bytes of incomplete sequence
938 let utf8_chinese_truncated_2 = [
939 0xE6, 0x9B, 0xB4, // 更
940 0xE5, 0xA4, 0x9A, // 多
941 0xE5, 0xA4, // Incomplete 3-byte sequence (missing last byte)
942 ];
943 assert_eq!(
944 detect_encoding(&utf8_chinese_truncated_2),
945 Encoding::Utf8,
946 "Truncated UTF-8 with 2-byte incomplete sequence should be detected as UTF-8"
947 );
948 }
949
950 #[test]
951 fn test_detect_utf8_chinese_with_high_bytes() {
952 // UTF-8 Chinese text contains many continuation bytes in the 0x80-0xBF range,
953 // including bytes like 0x9C, 0x9D that happen to be Windows-1250 indicators.
954 // These should NOT trigger Windows-1250 detection for valid UTF-8 content.
955
956 // Chinese characters that use continuation bytes that overlap with Windows-1250 indicators:
957 // 集 = E9 9B 86 (contains 0x9B)
958 // 精 = E7 B2 BE (contains 0xB2, 0xBE)
959 // Build a string with many such characters
960 let chinese_text = "更多全本全集精校小说"; // Contains various high continuation bytes
961 let bytes = chinese_text.as_bytes();
962
963 assert_eq!(
964 detect_encoding(bytes),
965 Encoding::Utf8,
966 "UTF-8 Chinese text should be detected as UTF-8, not Windows-1250"
967 );
968
969 // Verify these bytes would have triggered Windows-1250 detection if not valid UTF-8
970 // by checking that the sample contains bytes in the 0x80-0x9F range
971 let has_high_continuation_bytes = bytes.iter().any(|&b| (0x80..0xA0).contains(&b));
972 assert!(
973 has_high_continuation_bytes,
974 "Test should include bytes that could be mistaken for Windows-1250 indicators"
975 );
976 }
977
978 #[test]
979 fn test_detect_utf8_sample_truncation_at_boundary() {
980 // Simulate what happens when we take an 8KB sample that ends mid-character
981 // by creating a buffer that's valid UTF-8 except for the last 1-3 bytes
982
983 // Build a large UTF-8 Chinese text buffer
984 let chinese = "我的美女老师"; // "My Beautiful Teacher"
985 let mut buffer = Vec::new();
986 // Repeat to make it substantial
987 for _ in 0..100 {
988 buffer.extend_from_slice(chinese.as_bytes());
989 }
990
991 // Verify it's valid UTF-8 when complete
992 assert!(std::str::from_utf8(&buffer).is_ok());
993 assert_eq!(detect_encoding(&buffer), Encoding::Utf8);
994
995 // Now truncate at various points that cut through multi-byte sequences
996 // Each Chinese character is 3 bytes in UTF-8
997 for truncate_offset in 1..=3 {
998 let truncated_len = buffer.len() - truncate_offset;
999 let truncated = &buffer[..truncated_len];
1000
1001 // The truncated buffer should fail strict UTF-8 validation
1002 // (unless we happen to cut at a character boundary)
1003 let is_strict_valid = std::str::from_utf8(truncated).is_ok();
1004
1005 // But our encoding detection should still detect it as UTF-8
1006 let detected = detect_encoding(truncated);
1007 assert_eq!(
1008 detected,
1009 Encoding::Utf8,
1010 "Truncated UTF-8 at offset -{} should be detected as UTF-8, strict_valid={}",
1011 truncate_offset,
1012 is_strict_valid
1013 );
1014 }
1015 }
1016}