fresh/model/encoding.rs
1//! Text encoding detection and conversion
2//!
3//! This module handles:
4//! - Detecting text encodings from byte content (UTF-8, UTF-16, Latin-1, CJK, etc.)
5//! - Binary file detection (distinguishing text from binary content)
6//! - Converting between encodings (normalizing to UTF-8 on load, converting back on save)
7//!
8//! # Encoding Detection Strategy
9//!
10//! 1. **BOM Detection**: Check for Byte Order Marks (UTF-8 BOM, UTF-16 LE/BE)
11//! 2. **UTF-8 Validation**: Fast path for most modern files
12//! 3. **UTF-16 Heuristics**: Detect UTF-16 without BOM via null byte patterns
13//! 4. **Binary Detection**: Check for control characters that indicate binary content
14//! 5. **Statistical Detection**: Use chardetng for legacy encoding detection
15//! 6. **Fallback**: Default to Windows-1252 for ambiguous cases
16
17use super::encoding_heuristics::has_windows1250_pattern;
18use schemars::JsonSchema;
19use serde::{Deserialize, Serialize};
20
21// ============================================================================
22// Encoding Type
23// ============================================================================
24
/// Supported text encodings for file I/O
///
/// The editor internally uses UTF-8 for all text processing. When loading files,
/// content is converted from the detected encoding to UTF-8. When saving, content
/// is converted back to the original (or user-selected) encoding.
///
/// Serialized via the serde derives (by variant name under serde's default
/// representation for unit variants); `Utf8` is the `Default` variant.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize, JsonSchema)]
pub enum Encoding {
    /// UTF-8 (default, most common)
    #[default]
    Utf8,
    /// UTF-8 with Byte Order Mark
    Utf8Bom,
    /// UTF-16 Little Endian (Windows default for Unicode files)
    Utf16Le,
    /// UTF-16 Big Endian
    Utf16Be,
    /// ASCII (7-bit, subset of UTF-8)
    Ascii,
    /// Latin-1 / ISO-8859-1 (Western European)
    Latin1,
    /// Windows-1252 / CP-1252 (Windows Western European, often called "ANSI")
    Windows1252,
    /// Windows-1250 / CP-1250 (Windows Central European)
    Windows1250,
    /// GB18030 (Chinese, superset of GBK)
    Gb18030,
    /// GBK (Chinese Simplified, subset of GB18030)
    Gbk,
    /// Shift-JIS (Japanese)
    ShiftJis,
    /// EUC-KR (Korean)
    EucKr,
}
58
59impl Encoding {
60 /// Get the display name for status bar
61 pub fn display_name(&self) -> &'static str {
62 match self {
63 Self::Utf8 => "UTF-8",
64 Self::Utf8Bom => "UTF-8 BOM",
65 Self::Utf16Le => "UTF-16 LE",
66 Self::Utf16Be => "UTF-16 BE",
67 Self::Ascii => "ASCII",
68 Self::Latin1 => "Latin-1",
69 Self::Windows1252 => "Windows-1252",
70 Self::Windows1250 => "Windows-1250",
71 Self::Gb18030 => "GB18030",
72 Self::Gbk => "GBK",
73 Self::ShiftJis => "Shift-JIS",
74 Self::EucKr => "EUC-KR",
75 }
76 }
77
78 /// Get a longer description for UI (e.g., command palette)
79 pub fn description(&self) -> &'static str {
80 match self {
81 Self::Utf8 => "UTF-8",
82 Self::Utf8Bom => "UTF-8 with BOM",
83 Self::Utf16Le => "UTF-16 Little Endian",
84 Self::Utf16Be => "UTF-16 Big Endian",
85 Self::Ascii => "US-ASCII",
86 Self::Latin1 => "ISO-8859-1 / Latin-1 – Western European",
87 Self::Windows1252 => "Windows-1252 / CP1252 – Western European",
88 Self::Windows1250 => "Windows-1250 / CP1250 – Central European",
89 Self::Gb18030 => "GB18030 – Chinese",
90 Self::Gbk => "GBK / CP936 – Simplified Chinese",
91 Self::ShiftJis => "Shift_JIS – Japanese",
92 Self::EucKr => "EUC-KR – Korean",
93 }
94 }
95
96 /// Get the encoding_rs Encoding for this type
97 pub fn to_encoding_rs(&self) -> &'static encoding_rs::Encoding {
98 match self {
99 Self::Utf8 | Self::Utf8Bom | Self::Ascii => encoding_rs::UTF_8,
100 Self::Utf16Le => encoding_rs::UTF_16LE,
101 Self::Utf16Be => encoding_rs::UTF_16BE,
102 Self::Latin1 => encoding_rs::WINDOWS_1252, // ISO-8859-1 maps to Windows-1252 per WHATWG
103 Self::Windows1252 => encoding_rs::WINDOWS_1252,
104 Self::Windows1250 => encoding_rs::WINDOWS_1250,
105 Self::Gb18030 => encoding_rs::GB18030,
106 Self::Gbk => encoding_rs::GBK,
107 Self::ShiftJis => encoding_rs::SHIFT_JIS,
108 Self::EucKr => encoding_rs::EUC_KR,
109 }
110 }
111
112 /// Returns true if this encoding uses a BOM (Byte Order Mark)
113 pub fn has_bom(&self) -> bool {
114 matches!(self, Self::Utf8Bom | Self::Utf16Le | Self::Utf16Be)
115 }
116
117 /// Get the BOM bytes for this encoding (if any)
118 pub fn bom_bytes(&self) -> Option<&'static [u8]> {
119 match self {
120 Self::Utf8Bom => Some(&[0xEF, 0xBB, 0xBF]),
121 Self::Utf16Le => Some(&[0xFF, 0xFE]),
122 Self::Utf16Be => Some(&[0xFE, 0xFF]),
123 _ => None,
124 }
125 }
126
127 /// All available encodings for UI display
128 pub fn all() -> &'static [Encoding] {
129 &[
130 Self::Utf8,
131 Self::Utf8Bom,
132 Self::Utf16Le,
133 Self::Utf16Be,
134 Self::Ascii,
135 Self::Latin1,
136 Self::Windows1252,
137 Self::Windows1250,
138 Self::Gb18030,
139 Self::Gbk,
140 Self::ShiftJis,
141 Self::EucKr,
142 ]
143 }
144
145 /// Returns true if this encoding supports "resynchronization" - the ability to
146 /// find character boundaries when jumping into the middle of a file.
147 ///
148 /// Resynchronizable encodings can be safely used with lazy/streaming file loading
149 /// because you can determine character boundaries from any position.
150 ///
151 /// - **UTF-8**: Excellent - unique bit patterns distinguish lead/continuation bytes
152 /// - **ASCII/Latin-1/Windows-1252**: Trivial - every byte is a character
153 /// - **UTF-16**: Good with 2-byte alignment - can detect surrogate pairs
154 /// - **UTF-32**: Good with 4-byte alignment
155 ///
156 /// Non-resynchronizable encodings (legacy CJK like Shift-JIS, GB18030, GBK, Big5)
157 /// have ambiguous byte sequences where a byte could be either a standalone character
158 /// or part of a multi-byte sequence. You must scan from the beginning to be certain.
159 pub fn is_resynchronizable(&self) -> bool {
160 match self {
161 // Fixed-width single byte - every byte is a character
162 Self::Ascii | Self::Latin1 | Self::Windows1252 | Self::Windows1250 => true,
163
164 // UTF-8 has unique bit patterns for lead vs continuation bytes
165 Self::Utf8 | Self::Utf8Bom => true,
166
167 // UTF-16 is resynchronizable with 2-byte alignment
168 // (can detect surrogate pairs by checking 0xD800-0xDFFF range)
169 Self::Utf16Le | Self::Utf16Be => true,
170
171 // Legacy CJK encodings are NOT resynchronizable
172 // The second byte of a double-byte char can equal a valid single-byte char
173 Self::Gb18030 | Self::Gbk | Self::ShiftJis | Self::EucKr => false,
174 }
175 }
176
177 /// Returns the byte alignment required for this encoding when doing random access.
178 ///
179 /// For lazy loading of large files, reads must be aligned to this boundary.
180 /// Returns None if the encoding is not resynchronizable (requires full file scan).
181 pub fn alignment(&self) -> Option<usize> {
182 match self {
183 // Single-byte encodings - no alignment needed
184 Self::Ascii | Self::Latin1 | Self::Windows1252 | Self::Windows1250 => Some(1),
185
186 // UTF-8 - no alignment needed (self-synchronizing)
187 Self::Utf8 | Self::Utf8Bom => Some(1),
188
189 // UTF-16 - must be 2-byte aligned
190 Self::Utf16Le | Self::Utf16Be => Some(2),
191
192 // Legacy CJK - not resynchronizable, no valid alignment
193 Self::Gb18030 | Self::Gbk | Self::ShiftJis | Self::EucKr => None,
194 }
195 }
196
197 /// Returns true if this encoding requires the entire file to be loaded
198 /// for correct decoding (cannot use lazy/streaming loading).
199 ///
200 /// This is the inverse of `is_resynchronizable()` and indicates that
201 /// the user should be warned before loading large files in this encoding.
202 pub fn requires_full_file_load(&self) -> bool {
203 !self.is_resynchronizable()
204 }
205}
206
207// ============================================================================
208// Encoding Detection
209// ============================================================================
210
211/// Detect the text encoding from a sample of bytes
212///
213/// This function delegates to `detect_encoding_or_binary` and returns only
214/// the encoding, ignoring the binary flag. Use `detect_encoding_or_binary`
215/// when you need to know if the content should be treated as binary.
216pub fn detect_encoding(bytes: &[u8]) -> Encoding {
217 detect_encoding_or_binary(bytes, false).0
218}
219
220/// Detect the text encoding and whether content is binary.
221///
222/// Returns (Encoding, is_binary) where:
223/// - Encoding is the detected encoding (or default if binary)
224/// - is_binary is true if the content should be treated as raw binary
225///
226/// When `truncated` is true, an incomplete multi-byte UTF-8 sequence at the
227/// end of the sample is tolerated (up to 3 bytes) since it likely results from
228/// the caller truncating a larger stream. When false, such trailing bytes cause
229/// the sample to be rejected as UTF-8.
230///
231/// # Detection Strategy
232///
233/// 1. Check for BOM (Byte Order Mark) - highest priority, definitely not binary
234/// 2. Try UTF-8 validation (fast path for most files), definitely not binary
235/// 3. Check for UTF-16 patterns without BOM, definitely not binary
236/// 4. Check for binary control characters (null bytes, etc.) - if found, it's binary
237/// 5. Use chardetng for statistical detection of legacy encodings
238/// 6. If encoding detection is uncertain, default to Windows-1252
239pub fn detect_encoding_or_binary(bytes: &[u8], truncated: bool) -> (Encoding, bool) {
240 // Only check the first 8KB for encoding detection
241 let check_len = bytes.len().min(8 * 1024);
242 let sample = &bytes[..check_len];
243
244 // 1. Check for BOM (Byte Order Mark) - highest priority, definitely text
245 if sample.starts_with(&[0xEF, 0xBB, 0xBF]) {
246 return (Encoding::Utf8Bom, false);
247 }
248 if sample.starts_with(&[0xFF, 0xFE]) {
249 // Could also be UTF-32 LE, but UTF-16 LE is much more common
250 return (Encoding::Utf16Le, false);
251 }
252 if sample.starts_with(&[0xFE, 0xFF]) {
253 return (Encoding::Utf16Be, false);
254 }
255
256 // 2. Try UTF-8 validation (fast path for most modern files)
257 // Note: When we truncate to 8KB, we may cut in the middle of a multi-byte UTF-8 sequence.
258 // We need to handle this case - if most of the sample is valid UTF-8 and the only error
259 // is an incomplete sequence at the very end, we should still detect it as UTF-8.
260 let utf8_valid_len = match std::str::from_utf8(sample) {
261 Ok(_) => sample.len(),
262 Err(e) => {
263 // error_len() returns None if the error is due to incomplete sequence at end
264 // (i.e., unexpected end of input), vs Some(n) for an invalid byte
265 if e.error_len().is_none() {
266 // Incomplete sequence at end - this is likely due to sample truncation
267 e.valid_up_to()
268 } else {
269 // Invalid byte found - not valid UTF-8
270 0
271 }
272 }
273 };
274
275 // If the sample is valid UTF-8, treat it as UTF-8.
276 // When the caller indicates the sample was truncated from a larger stream,
277 // tolerate up to 3 trailing bytes of an incomplete multi-byte sequence (a
278 // truncation artifact). Without truncation, require exact validity — a
279 // trailing 0xE9 in a short file is a Latin-1 'é', not a truncated codepoint.
280 let is_valid_utf8 = utf8_valid_len == sample.len()
281 || (truncated && utf8_valid_len > 0 && utf8_valid_len >= sample.len() - 3);
282 if is_valid_utf8 {
283 let valid_sample = &sample[..utf8_valid_len];
284 // Check if it's pure ASCII (subset of UTF-8)
285 // Also check for binary indicators in valid ASCII/UTF-8
286 let has_binary_control = valid_sample.iter().any(|&b| is_binary_control_char(b));
287 if has_binary_control {
288 return (Encoding::Utf8, true);
289 }
290 if valid_sample.iter().all(|&b| b < 128) {
291 return (Encoding::Ascii, false);
292 }
293 return (Encoding::Utf8, false);
294 }
295
296 // 3. Check for UTF-16 without BOM (common in some Windows files)
297 // Heuristic: Look for patterns of null bytes alternating with printable chars
298 // The non-null byte should be printable (0x20-0x7E) or a valid high byte
299 //
300 // Note: Unlike UTF-8 above, this heuristic is robust to sample truncation because:
301 // - We use statistical pattern matching (50% threshold), not strict validation
302 // - chunks(2) naturally handles odd-length samples by dropping the last byte
303 // - Losing 1 pair out of ~4096 doesn't affect the detection threshold
304 if sample.len() >= 4 {
305 let is_printable_or_high = |b: u8| (0x20..=0x7E).contains(&b) || b >= 0x80;
306
307 // Align to even boundary to ensure we only process complete 2-byte pairs
308 let aligned_len = sample.len() & !1; // Round down to even
309 let aligned_sample = &sample[..aligned_len];
310
311 let le_pairs = aligned_sample
312 .chunks(2)
313 .filter(|chunk| chunk[1] == 0 && is_printable_or_high(chunk[0]))
314 .count();
315 let be_pairs = aligned_sample
316 .chunks(2)
317 .filter(|chunk| chunk[0] == 0 && is_printable_or_high(chunk[1]))
318 .count();
319 let pair_count = aligned_len / 2;
320
321 // If more than 50% of pairs look like valid UTF-16 text, it's text
322 if le_pairs > pair_count / 2 {
323 return (Encoding::Utf16Le, false);
324 }
325 if be_pairs > pair_count / 2 {
326 return (Encoding::Utf16Be, false);
327 }
328 }
329
330 // 4. Check for binary indicators EARLY (before chardetng)
331 // Binary files often contain control characters and null bytes that should not
332 // appear in any valid text encoding. Check this before chardetng because
333 // chardetng might still be "confident" about some encoding for binary data.
334 let has_binary_control = sample
335 .iter()
336 .any(|&b| b == 0x00 || is_binary_control_char(b));
337 if has_binary_control {
338 return (Encoding::Utf8, true);
339 }
340
341 // 5. Check for Latin-1 patterns: high bytes followed by invalid CJK trail bytes
342 // In GB18030/GBK, trail bytes must be 0x40-0x7E or 0x80-0xFE
343 // If a high byte is followed by a byte outside these ranges (e.g., space, newline,
344 // punctuation < 0x40), it's likely Latin-1, not CJK
345 let has_latin1_pattern = has_latin1_high_byte_pattern(sample);
346
347 // Also check for bytes in CJK-only range (0x81-0x9F) which can only be CJK lead bytes
348 let has_cjk_only_bytes = sample.iter().any(|&b| (0x81..0xA0).contains(&b));
349
350 // 6. Use chardetng for statistical encoding detection
351 let mut detector = chardetng::EncodingDetector::new();
352 detector.feed(sample, true);
353 let (detected_encoding, confident) = detector.guess_assess(None, true);
354
355 // If chardetng is confident, use that encoding (not binary)
356 if confident {
357 let is_cjk_encoding = detected_encoding == encoding_rs::GB18030
358 || detected_encoding == encoding_rs::GBK
359 || detected_encoding == encoding_rs::SHIFT_JIS
360 || detected_encoding == encoding_rs::EUC_KR;
361
362 // For CJK encodings, prefer Windows-1252 if we have clear Latin-1 indicators:
363 // - Space followed by high byte (0xA0-0xFF) is common in Latin-1 text
364 //
365 // If there are CJK-only bytes (0x81-0x9F), it's definitely CJK (not ambiguous).
366 // If there are Latin-1 patterns (space + high byte), prefer Windows-1252.
367 // Otherwise, trust chardetng's detection.
368 if is_cjk_encoding && !has_cjk_only_bytes && has_latin1_pattern {
369 return (Encoding::Windows1252, false);
370 }
371
372 // GBK is a subset of GB18030. Since we only inspect the first 8KB for
373 // detection, the sample may not contain GB18030-only code points (uncommon
374 // Chinese characters, emoji, etc.). Treating GBK as GB18030 is safer and
375 // ensures proper display of all characters including French, Spanish, and emoji.
376 let encoding =
377 if detected_encoding == encoding_rs::GB18030 || detected_encoding == encoding_rs::GBK {
378 Encoding::Gb18030
379 } else if detected_encoding == encoding_rs::SHIFT_JIS {
380 Encoding::ShiftJis
381 } else if detected_encoding == encoding_rs::EUC_KR {
382 Encoding::EucKr
383 } else if detected_encoding == encoding_rs::WINDOWS_1252
384 || detected_encoding == encoding_rs::WINDOWS_1250
385 {
386 // chardetng often returns Windows-1252 for Central European text
387 // Check for Windows-1250 specific patterns
388 if has_windows1250_pattern(sample) {
389 Encoding::Windows1250
390 } else {
391 Encoding::Windows1252
392 }
393 } else if detected_encoding == encoding_rs::UTF_8 {
394 // chardetng thinks it's UTF-8, but validation failed above
395 // Could still be Windows-1250 if it has Central European patterns
396 if has_windows1250_pattern(sample) {
397 Encoding::Windows1250
398 } else {
399 Encoding::Windows1252
400 }
401 } else {
402 // Unknown encoding - check for Windows-1250 patterns
403 if has_windows1250_pattern(sample) {
404 Encoding::Windows1250
405 } else {
406 Encoding::Windows1252
407 }
408 };
409 return (encoding, false);
410 }
411
412 // 7. chardetng not confident, but no binary indicators - check for Windows-1250 patterns
413 // We already checked for binary control chars earlier, so this is valid text
414 if has_windows1250_pattern(sample) {
415 (Encoding::Windows1250, false)
416 } else {
417 (Encoding::Windows1252, false)
418 }
419}
420
421// ============================================================================
422// Binary Detection Helpers
423// ============================================================================
424
/// Check if a byte is a binary control character
///
/// Returns true for control characters that typically indicate binary content.
/// The handful of control bytes that legitimately appear in text files —
/// tab, LF, VT, FF, CR, and ESC — are excluded, as is every printable or
/// high (>= 0x80) byte.
pub fn is_binary_control_char(byte: u8) -> bool {
    match byte {
        // Common text control characters:
        // 0x09 = Tab, 0x0A = LF, 0x0B = Vertical Tab, 0x0C = Form Feed,
        // 0x0D = CR, 0x1B = ESC
        0x09 | 0x0A | 0x0B | 0x0C | 0x0D | 0x1B => false,
        // Remaining C0 controls plus DEL (0x7F) indicate binary content
        0x00..=0x1F | 0x7F => true,
        // Printable ASCII and high bytes are fine
        _ => false,
    }
}
441
/// Check if sample has Latin-1 patterns that cannot be valid CJK encoding
///
/// In GB18030/GBK, valid sequences are:
/// - ASCII bytes (0x00-0x7F) as standalone characters
/// - Lead byte (0x81-0xFE) + Trail byte (0x40-0x7E or 0x80-0xFE)
///
/// Two patterns are treated as Latin-1 indicators:
/// 1. An ASCII space immediately followed by a Latin-1 extended character
///    (0xA0-0xFF), as in "Hello é" or "Café résumé"
/// 2. A high byte (>= 0xA0) followed by a byte below 0x40 (space, newline,
///    most punctuation) — impossible as a CJK trail byte
///
/// Valid-looking CJK lead/trail pairs are skipped as a unit; the function
/// returns true as soon as a single indicator is found.
fn has_latin1_high_byte_pattern(sample: &[u8]) -> bool {
    let mut i = 0;
    while i < sample.len() {
        let byte = sample[i];
        let lookahead = sample.get(i + 1).copied();

        if byte < 0x80 {
            // ASCII byte: a space directly before a Latin-1 extended char
            // (not a CJK-only lead byte) is an indicator.
            if byte == 0x20 {
                if let Some(next) = lookahead {
                    if next >= 0xA0 {
                        return true;
                    }
                }
            }
            i += 1;
        } else if let Some(next) = lookahead {
            // High byte (0x80-0xFF): could be Latin-1 or a CJK lead byte.
            let cjk_lead = (0x81..=0xFE).contains(&byte);
            let cjk_trail = (0x40..=0x7E).contains(&next) || (0x80..=0xFE).contains(&next);

            if cjk_lead && cjk_trail {
                // Plausible CJK double-byte sequence — consume both bytes.
                i += 2;
            } else {
                // Not a valid CJK pair: a high byte followed by low ASCII
                // (< 0x40) is a Latin-1 indicator.
                if byte >= 0xA0 && next < 0x40 {
                    return true;
                }
                i += 1;
            }
        } else {
            // Lone high byte at the very end — no lookahead, no verdict.
            i += 1;
        }
    }

    false
}
501
502// ============================================================================
503// Encoding Conversion
504// ============================================================================
505
506/// Detect encoding and convert bytes to UTF-8
507///
508/// Returns the detected encoding and the UTF-8 converted content.
509/// This is the core function for normalizing file content to UTF-8 on load.
510pub fn detect_and_convert(bytes: &[u8]) -> (Encoding, Vec<u8>) {
511 if bytes.is_empty() {
512 return (Encoding::Utf8, Vec::new());
513 }
514
515 let encoding = detect_encoding(bytes);
516
517 // For UTF-8 (with or without BOM), we can use the content directly
518 match encoding {
519 Encoding::Utf8 | Encoding::Ascii => {
520 // Already UTF-8, just clone
521 (encoding, bytes.to_vec())
522 }
523 Encoding::Utf8Bom => {
524 // Skip the BOM (3 bytes) and use the rest
525 let content = if bytes.len() > 3 {
526 bytes[3..].to_vec()
527 } else {
528 Vec::new()
529 };
530 (encoding, content)
531 }
532 Encoding::Utf16Le | Encoding::Utf16Be => {
533 // Decode UTF-16 to UTF-8
534 let enc_rs = encoding.to_encoding_rs();
535 let start_offset =
536 if bytes.starts_with(&[0xFF, 0xFE]) || bytes.starts_with(&[0xFE, 0xFF]) {
537 2 // Skip BOM
538 } else {
539 0
540 };
541 let data = &bytes[start_offset..];
542
543 let (cow, _had_errors) = enc_rs.decode_without_bom_handling(data);
544 (encoding, cow.into_owned().into_bytes())
545 }
546 _ => {
547 // Use encoding_rs to convert to UTF-8
548 let enc_rs = encoding.to_encoding_rs();
549 let (cow, _had_errors) = enc_rs.decode_without_bom_handling(bytes);
550 (encoding, cow.into_owned().into_bytes())
551 }
552 }
553}
554
555/// Convert bytes from a specific encoding to UTF-8
556///
557/// Used when opening a file with a user-specified encoding instead of auto-detection.
558/// Returns the UTF-8 converted content.
559pub fn convert_to_utf8(bytes: &[u8], encoding: Encoding) -> Vec<u8> {
560 if bytes.is_empty() {
561 return Vec::new();
562 }
563
564 match encoding {
565 Encoding::Utf8 | Encoding::Ascii => {
566 // Already UTF-8, just clone
567 bytes.to_vec()
568 }
569 Encoding::Utf8Bom => {
570 // Skip the BOM (3 bytes) if present and use the rest
571 if bytes.starts_with(&[0xEF, 0xBB, 0xBF]) && bytes.len() > 3 {
572 bytes[3..].to_vec()
573 } else {
574 bytes.to_vec()
575 }
576 }
577 Encoding::Utf16Le | Encoding::Utf16Be => {
578 // Decode UTF-16 to UTF-8
579 let enc_rs = encoding.to_encoding_rs();
580 let start_offset =
581 if bytes.starts_with(&[0xFF, 0xFE]) || bytes.starts_with(&[0xFE, 0xFF]) {
582 2 // Skip BOM
583 } else {
584 0
585 };
586 let data = &bytes[start_offset..];
587
588 let (cow, _had_errors) = enc_rs.decode_without_bom_handling(data);
589 cow.into_owned().into_bytes()
590 }
591 _ => {
592 // Use encoding_rs to convert to UTF-8
593 let enc_rs = encoding.to_encoding_rs();
594 let (cow, _had_errors) = enc_rs.decode_without_bom_handling(bytes);
595 cow.into_owned().into_bytes()
596 }
597 }
598}
599
600/// Convert UTF-8 content to the specified encoding for saving
601///
602/// Used when saving files to convert internal UTF-8 representation
603/// back to the original (or user-selected) encoding.
604///
605/// Note: This does NOT add BOM - the BOM should be handled separately.
606pub fn convert_from_utf8(utf8_bytes: &[u8], encoding: Encoding) -> Vec<u8> {
607 match encoding {
608 Encoding::Utf8 | Encoding::Ascii | Encoding::Utf8Bom => {
609 // UTF-8 (with or without BOM) - just clone, BOM added separately
610 utf8_bytes.to_vec()
611 }
612 Encoding::Utf16Le => {
613 // Convert UTF-8 to UTF-16 LE (no BOM - added separately)
614 let text = String::from_utf8_lossy(utf8_bytes);
615 let mut result = Vec::new();
616 for code_unit in text.encode_utf16() {
617 result.extend_from_slice(&code_unit.to_le_bytes());
618 }
619 result
620 }
621 Encoding::Utf16Be => {
622 // Convert UTF-8 to UTF-16 BE (no BOM - added separately)
623 let text = String::from_utf8_lossy(utf8_bytes);
624 let mut result = Vec::new();
625 for code_unit in text.encode_utf16() {
626 result.extend_from_slice(&code_unit.to_be_bytes());
627 }
628 result
629 }
630 _ => {
631 // Use encoding_rs to convert from UTF-8
632 let enc_rs = encoding.to_encoding_rs();
633 let text = String::from_utf8_lossy(utf8_bytes);
634 let (cow, _encoding_used, _had_errors) = enc_rs.encode(&text);
635 cow.into_owned()
636 }
637 }
638}
639
640// ============================================================================
641// Tests
642// ============================================================================
643
644#[cfg(test)]
645mod tests {
646 use super::*;
647
    // Spot-check the short names surfaced in the status bar.
    #[test]
    fn test_encoding_display_names() {
        assert_eq!(Encoding::Utf8.display_name(), "UTF-8");
        assert_eq!(Encoding::Utf8Bom.display_name(), "UTF-8 BOM");
        assert_eq!(Encoding::Utf16Le.display_name(), "UTF-16 LE");
        assert_eq!(Encoding::Gb18030.display_name(), "GB18030");
        assert_eq!(Encoding::Windows1250.display_name(), "Windows-1250");
    }
656
    // Only the BOM-carrying encodings report has_bom().
    #[test]
    fn test_encoding_bom() {
        assert!(Encoding::Utf8Bom.has_bom());
        assert!(Encoding::Utf16Le.has_bom());
        assert!(!Encoding::Utf8.has_bom());
        assert!(!Encoding::Windows1252.has_bom());
        assert!(!Encoding::Windows1250.has_bom());
    }
665
    // Pure 7-bit content detects as ASCII; multi-byte sequences as UTF-8.
    #[test]
    fn test_detect_utf8() {
        assert_eq!(detect_encoding(b"Hello, world!"), Encoding::Ascii);
        assert_eq!(detect_encoding("Hello, 世界!".as_bytes()), Encoding::Utf8);
    }
671
    // An EF BB BF prefix forces Utf8Bom detection.
    #[test]
    fn test_detect_utf8_bom() {
        let with_bom = [0xEF, 0xBB, 0xBF, b'H', b'i'];
        assert_eq!(detect_encoding(&with_bom), Encoding::Utf8Bom);
    }
677
    // An FF FE prefix forces UTF-16 LE detection.
    #[test]
    fn test_detect_utf16_le() {
        let utf16_le_bom = [0xFF, 0xFE, b'H', 0x00, b'i', 0x00];
        assert_eq!(detect_encoding(&utf16_le_bom), Encoding::Utf16Le);
    }
683
    // Null bytes and low control characters flag the content as binary.
    #[test]
    fn test_detect_binary() {
        let binary_data = [0x00, 0x01, 0x02, 0x03];
        let (_, is_binary) = detect_encoding_or_binary(&binary_data, false);
        assert!(is_binary);
    }
690
    // The binary/text split for individual control bytes.
    #[test]
    fn test_is_binary_control_char() {
        // Binary control chars
        assert!(is_binary_control_char(0x00)); // NUL
        assert!(is_binary_control_char(0x01)); // SOH
        assert!(is_binary_control_char(0x02)); // STX
        assert!(is_binary_control_char(0x7F)); // DEL

        // Text control chars (allowed)
        assert!(!is_binary_control_char(0x09)); // Tab
        assert!(!is_binary_control_char(0x0A)); // LF
        assert!(!is_binary_control_char(0x0D)); // CR
        assert!(!is_binary_control_char(0x1B)); // ESC

        // Regular printable chars
        assert!(!is_binary_control_char(b'A'));
        assert!(!is_binary_control_char(b' '));
    }
709
    // UTF-8 content passes through load (detect_and_convert) and save
    // (convert_from_utf8) without any byte changes.
    #[test]
    fn test_convert_roundtrip_utf8() {
        let original = "Hello, 世界!";
        let bytes = original.as_bytes();

        let (encoding, utf8_content) = detect_and_convert(bytes);
        assert_eq!(encoding, Encoding::Utf8);
        assert_eq!(utf8_content, bytes);

        let back = convert_from_utf8(&utf8_content, encoding);
        assert_eq!(back, bytes);
    }
722
    // UTF-16 LE round trip: BOM is stripped on load and NOT re-added on save.
    #[test]
    fn test_convert_roundtrip_utf16le() {
        // UTF-16 LE with BOM: "Hi"
        let utf16_le = [0xFF, 0xFE, b'H', 0x00, b'i', 0x00];

        let (encoding, utf8_content) = detect_and_convert(&utf16_le);
        assert_eq!(encoding, Encoding::Utf16Le);
        assert_eq!(utf8_content, b"Hi");

        // Note: convert_from_utf8 doesn't add BOM, so result won't have BOM
        let back = convert_from_utf8(&utf8_content, encoding);
        assert_eq!(back, [b'H', 0x00, b'i', 0x00]);
    }
736
    // Exhaustive check of is_resynchronizable across all variants.
    #[test]
    fn test_encoding_resynchronizable() {
        // Self-synchronizing encodings (can find char boundaries from middle of file)
        assert!(Encoding::Utf8.is_resynchronizable());
        assert!(Encoding::Utf8Bom.is_resynchronizable());
        assert!(Encoding::Ascii.is_resynchronizable());
        assert!(Encoding::Latin1.is_resynchronizable());
        assert!(Encoding::Windows1252.is_resynchronizable());
        assert!(Encoding::Windows1250.is_resynchronizable());

        // UTF-16 is resynchronizable with proper alignment
        assert!(Encoding::Utf16Le.is_resynchronizable());
        assert!(Encoding::Utf16Be.is_resynchronizable());

        // Legacy CJK encodings are NOT resynchronizable
        // (second byte of double-byte char can equal a valid single-byte char)
        assert!(!Encoding::Gb18030.is_resynchronizable());
        assert!(!Encoding::Gbk.is_resynchronizable());
        assert!(!Encoding::ShiftJis.is_resynchronizable());
        assert!(!Encoding::EucKr.is_resynchronizable());
    }
758
    // Exhaustive check of alignment() across all variants.
    #[test]
    fn test_encoding_alignment() {
        // Single-byte encodings have alignment of 1
        assert_eq!(Encoding::Ascii.alignment(), Some(1));
        assert_eq!(Encoding::Latin1.alignment(), Some(1));
        assert_eq!(Encoding::Windows1252.alignment(), Some(1));
        assert_eq!(Encoding::Windows1250.alignment(), Some(1));
        assert_eq!(Encoding::Utf8.alignment(), Some(1));
        assert_eq!(Encoding::Utf8Bom.alignment(), Some(1));

        // UTF-16 requires 2-byte alignment
        assert_eq!(Encoding::Utf16Le.alignment(), Some(2));
        assert_eq!(Encoding::Utf16Be.alignment(), Some(2));

        // Non-resynchronizable encodings have no valid alignment
        assert_eq!(Encoding::Gb18030.alignment(), None);
        assert_eq!(Encoding::Gbk.alignment(), None);
        assert_eq!(Encoding::ShiftJis.alignment(), None);
        assert_eq!(Encoding::EucKr.alignment(), None);
    }
779
    // requires_full_file_load is the inverse of is_resynchronizable.
    #[test]
    fn test_requires_full_file_load() {
        // Encodings that can be streamed
        assert!(!Encoding::Utf8.requires_full_file_load());
        assert!(!Encoding::Ascii.requires_full_file_load());
        assert!(!Encoding::Latin1.requires_full_file_load());
        assert!(!Encoding::Windows1250.requires_full_file_load());
        assert!(!Encoding::Utf16Le.requires_full_file_load());

        // Encodings that require full loading
        assert!(Encoding::Gb18030.requires_full_file_load());
        assert!(Encoding::Gbk.requires_full_file_load());
        assert!(Encoding::ShiftJis.requires_full_file_load());
        assert!(Encoding::EucKr.requires_full_file_load());
    }
795
    // Windows-1250 bytes decode to the expected Polish characters and
    // re-encode to the original byte sequence.
    #[test]
    fn test_convert_roundtrip_windows1250() {
        // Windows-1250 encoded text with Central European characters
        // "Zażółć" in Windows-1250: Z(0x5A) a(0x61) ż(0xBF) ó(0xF3) ł(0xB3) ć(0xE6)
        let windows1250_bytes: &[u8] = &[0x5A, 0x61, 0xBF, 0xF3, 0xB3, 0xE6];

        // Convert to UTF-8
        let enc_rs = Encoding::Windows1250.to_encoding_rs();
        let (decoded, _had_errors) = enc_rs.decode_without_bom_handling(windows1250_bytes);
        let utf8_content = decoded.as_bytes();

        // The UTF-8 content should contain the Polish characters
        let utf8_str = std::str::from_utf8(utf8_content).unwrap();
        assert!(utf8_str.contains('ż'), "Should contain ż: {}", utf8_str);
        assert!(utf8_str.contains('ó'), "Should contain ó: {}", utf8_str);
        assert!(utf8_str.contains('ł'), "Should contain ł: {}", utf8_str);
        assert!(utf8_str.contains('ć'), "Should contain ć: {}", utf8_str);

        // Convert back to Windows-1250
        let back = convert_from_utf8(utf8_content, Encoding::Windows1250);
        assert_eq!(back, windows1250_bytes, "Round-trip should preserve bytes");
    }
818
    // The long-form description string for Windows-1250 (shown in UI lists).
    #[test]
    fn test_windows1250_description() {
        assert_eq!(
            Encoding::Windows1250.description(),
            "Windows-1250 / CP1250 – Central European"
        );
    }
826
    // Bytes that are undefined in Windows-1252 but valid in Windows-1250 are
    // definitive evidence for Windows-1250 detection.
    #[test]
    fn test_detect_windows1250_definitive_bytes() {
        // Bytes 0x8D (Ť), 0x8F (Ź), 0x9D (ť) are undefined in Windows-1252
        // but valid in Windows-1250, so they definitively indicate Windows-1250

        // Czech text with ť (0x9D): "měsťo" (city, archaic)
        let with_t_caron = [0x6D, 0x9D, 0x73, 0x74, 0x6F]; // mťsto
        assert_eq!(
            detect_encoding(&with_t_caron),
            Encoding::Windows1250,
            "Byte 0x9D (ť) should trigger Windows-1250 detection"
        );

        // Polish text with Ź (0x8F): "Źródło" (source)
        let with_z_acute_upper = [0x8F, 0x72, 0xF3, 0x64, 0xB3, 0x6F]; // Źródło
        assert_eq!(
            detect_encoding(&with_z_acute_upper),
            Encoding::Windows1250,
            "Byte 0x8F (Ź) should trigger Windows-1250 detection"
        );
    }
848
    // Multiple 0x80-0x9F range characters that are Polish letters in
    // Windows-1250 should tip detection toward Windows-1250.
    #[test]
    fn test_detect_windows1250_strong_indicators() {
        // Polish text with ś (0x9C) and Ś (0x8C) - strong indicators from 0x80-0x9F range
        let polish_text = [
            0x9C, 0x77, 0x69, 0x65, 0x74, 0x79, 0x20, // "świety "
            0x8C, 0x77, 0x69, 0x61, 0x74, // "Świat"
        ];
        assert_eq!(
            detect_encoding(&polish_text),
            Encoding::Windows1250,
            "Multiple Polish characters (ś, Ś) should trigger Windows-1250"
        );
    }
862
863 #[test]
864 fn test_detect_ambiguous_bytes_as_windows1252() {
865 // Bytes in 0xA0-0xFF range are ambiguous and should default to Windows-1252
866 // Polish "żółć" - ż(0xBF) ó(0xF3) ł(0xB3) ć(0xE6) - all ambiguous
867 let zolc = [0xBF, 0xF3, 0xB3, 0xE6];
868 assert_eq!(
869 detect_encoding(&zolc),
870 Encoding::Windows1252,
871 "Ambiguous bytes should default to Windows-1252"
872 );
873
874 // ą (0xB9) and ł (0xB3) could be ¹ and ³ in Windows-1252
875 let ambiguous = [
876 0x6D, 0xB9, 0x6B, 0x61, 0x20, // "mąka " or "m¹ka "
877 0x6D, 0xB3, 0x6F, 0x64, 0x79, // "młody" or "m³ody"
878 ];
879 assert_eq!(
880 detect_encoding(&ambiguous),
881 Encoding::Windows1252,
882 "Ambiguous Polish bytes should default to Windows-1252"
883 );
884 }
885
886 #[test]
887 fn test_detect_windows1250_czech_pangram() {
888 // "Příliš žluťoučký kůň úpěl ďábelské ódy" - Czech pangram in Windows-1250
889 // Contains ť (0x9D) which is a definitive Windows-1250 indicator
890 let czech_pangram: &[u8] = &[
891 0x50, 0xF8, 0xED, 0x6C, 0x69, 0x9A, 0x20, // "Příliš "
892 0x9E, 0x6C, 0x75, 0x9D, 0x6F, 0x75, 0xE8, 0x6B, 0xFD, 0x20, // "žluťoučký "
893 0x6B, 0xF9, 0xF2, 0x20, // "kůň "
894 0xFA, 0x70, 0xEC, 0x6C, 0x20, // "úpěl "
895 0xEF, 0xE1, 0x62, 0x65, 0x6C, 0x73, 0x6B, 0xE9, 0x20, // "ďábelské "
896 0xF3, 0x64, 0x79, // "ódy"
897 ];
898 assert_eq!(
899 detect_encoding(czech_pangram),
900 Encoding::Windows1250,
901 "Czech pangram should be detected as Windows-1250 (contains ť = 0x9D)"
902 );
903 }
904
905 #[test]
906 fn test_detect_windows1252_not_1250() {
907 // Pure Windows-1252 text without Central European indicators
908 // "Café résumé" in Windows-1252
909 let windows1252_text = [
910 0x43, 0x61, 0x66, 0xE9, 0x20, // "Café "
911 0x72, 0xE9, 0x73, 0x75, 0x6D, 0xE9, // "résumé"
912 ];
913 assert_eq!(
914 detect_encoding(&windows1252_text),
915 Encoding::Windows1252,
916 "French text should remain Windows-1252"
917 );
918 }
919
920 #[test]
921 fn test_detect_utf8_chinese_truncated_sequence() {
922 // Test that UTF-8 Chinese text is correctly detected even when the sample
923 // is truncated in the middle of a multi-byte sequence.
924 //
925 // Bug context: When sampling first 8KB for detection, the boundary may cut
926 // through a multi-byte UTF-8 character. This caused valid UTF-8 Chinese text
927 // to fail std::str::from_utf8() validation and fall through to Windows-1250
928 // detection (because UTF-8 continuation bytes like 0x9C, 0x9D overlap with
929 // Windows-1250 indicator bytes).
930
931 // Chinese text "更多" (more) = [0xE6, 0x9B, 0xB4, 0xE5, 0xA4, 0x9A]
932 // If we truncate after 0xE5, we get an incomplete sequence
933 let utf8_chinese_truncated = [
934 0xE6, 0x9B, 0xB4, // 更
935 0xE5, 0xA4, 0x9A, // 多
936 0xE5, // Start of another character, incomplete
937 ];
938
939 // With truncated=true, this should be detected as UTF-8
940 assert_eq!(
941 detect_encoding_or_binary(&utf8_chinese_truncated, true).0,
942 Encoding::Utf8,
943 "Truncated UTF-8 Chinese text should be detected as UTF-8"
944 );
945
946 // Without truncated flag, the incomplete trailing byte is treated as non-UTF-8
947 assert_ne!(
948 detect_encoding_or_binary(&utf8_chinese_truncated, false).0,
949 Encoding::Utf8,
950 "Non-truncated short sample with trailing 0xE5 should not be detected as UTF-8"
951 );
952
953 // Test with 2 bytes of incomplete sequence
954 let utf8_chinese_truncated_2 = [
955 0xE6, 0x9B, 0xB4, // 更
956 0xE5, 0xA4, 0x9A, // 多
957 0xE5, 0xA4, // Incomplete 3-byte sequence (missing last byte)
958 ];
959 assert_eq!(
960 detect_encoding_or_binary(&utf8_chinese_truncated_2, true).0,
961 Encoding::Utf8,
962 "Truncated UTF-8 with 2-byte incomplete sequence should be detected as UTF-8"
963 );
964 }
965
966 #[test]
967 fn test_detect_utf8_chinese_with_high_bytes() {
968 // UTF-8 Chinese text contains many continuation bytes in the 0x80-0xBF range,
969 // including bytes like 0x9C, 0x9D that happen to be Windows-1250 indicators.
970 // These should NOT trigger Windows-1250 detection for valid UTF-8 content.
971
972 // Chinese characters that use continuation bytes that overlap with Windows-1250 indicators:
973 // 集 = E9 9B 86 (contains 0x9B)
974 // 精 = E7 B2 BE (contains 0xB2, 0xBE)
975 // Build a string with many such characters
976 let chinese_text = "更多全本全集精校小说"; // Contains various high continuation bytes
977 let bytes = chinese_text.as_bytes();
978
979 assert_eq!(
980 detect_encoding(bytes),
981 Encoding::Utf8,
982 "UTF-8 Chinese text should be detected as UTF-8, not Windows-1250"
983 );
984
985 // Verify these bytes would have triggered Windows-1250 detection if not valid UTF-8
986 // by checking that the sample contains bytes in the 0x80-0x9F range
987 let has_high_continuation_bytes = bytes.iter().any(|&b| (0x80..0xA0).contains(&b));
988 assert!(
989 has_high_continuation_bytes,
990 "Test should include bytes that could be mistaken for Windows-1250 indicators"
991 );
992 }
993
994 #[test]
995 fn test_detect_utf8_sample_truncation_at_boundary() {
996 // Simulate what happens when we take an 8KB sample that ends mid-character
997 // by creating a buffer that's valid UTF-8 except for the last 1-3 bytes
998
999 // Build a large UTF-8 Chinese text buffer
1000 let chinese = "我的美女老师"; // "My Beautiful Teacher"
1001 let mut buffer = Vec::new();
1002 // Repeat to make it substantial
1003 for _ in 0..100 {
1004 buffer.extend_from_slice(chinese.as_bytes());
1005 }
1006
1007 // Verify it's valid UTF-8 when complete
1008 assert!(std::str::from_utf8(&buffer).is_ok());
1009 assert_eq!(detect_encoding(&buffer), Encoding::Utf8);
1010
1011 // Now truncate at various points that cut through multi-byte sequences
1012 // Each Chinese character is 3 bytes in UTF-8
1013 for truncate_offset in 1..=3 {
1014 let truncated_len = buffer.len() - truncate_offset;
1015 let truncated_buf = &buffer[..truncated_len];
1016
1017 // The truncated buffer should fail strict UTF-8 validation
1018 // (unless we happen to cut at a character boundary)
1019 let is_strict_valid = std::str::from_utf8(truncated_buf).is_ok();
1020
1021 // With truncated=true, our detection should still detect it as UTF-8
1022 let detected = detect_encoding_or_binary(truncated_buf, true).0;
1023 assert_eq!(
1024 detected,
1025 Encoding::Utf8,
1026 "Truncated UTF-8 at offset -{} should be detected as UTF-8, strict_valid={}",
1027 truncate_offset,
1028 is_strict_valid
1029 );
1030 }
1031 }
1032}