// fresh/model/encoding.rs
1//! Text encoding detection and conversion
2//!
3//! This module handles:
4//! - Detecting text encodings from byte content (UTF-8, UTF-16, Latin-1, CJK, etc.)
5//! - Binary file detection (distinguishing text from binary content)
6//! - Converting between encodings (normalizing to UTF-8 on load, converting back on save)
7//!
8//! # Encoding Detection Strategy
9//!
10//! 1. **BOM Detection**: Check for Byte Order Marks (UTF-8 BOM, UTF-16 LE/BE)
11//! 2. **UTF-8 Validation**: Fast path for most modern files
12//! 3. **UTF-16 Heuristics**: Detect UTF-16 without BOM via null byte patterns
13//! 4. **Binary Detection**: Check for control characters that indicate binary content
14//! 5. **Statistical Detection**: Use chardetng for legacy encoding detection
15//! 6. **Fallback**: Default to Windows-1252 for ambiguous cases
16
17use super::encoding_heuristics::{has_windows1250_pattern, has_windows1251_pattern};
18use schemars::JsonSchema;
19use serde::{Deserialize, Serialize};
20
21// ============================================================================
22// Encoding Type
23// ============================================================================
24
/// Supported text encodings for file I/O
///
/// The editor internally uses UTF-8 for all text processing. When loading files,
/// content is converted from the detected encoding to UTF-8. When saving, content
/// is converted back to the original (or user-selected) encoding.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize, JsonSchema)]
pub enum Encoding {
    /// UTF-8 (default, most common)
    #[default]
    Utf8,
    /// UTF-8 with Byte Order Mark
    Utf8Bom,
    /// UTF-16 Little Endian (Windows default for Unicode files)
    Utf16Le,
    /// UTF-16 Big Endian
    Utf16Be,
    /// ASCII (7-bit, strict subset of UTF-8 — decoded via the UTF-8 decoder)
    Ascii,
    /// Latin-1 / ISO-8859-1 (Western European; decoded as Windows-1252 per WHATWG)
    Latin1,
    /// Windows-1252 / CP-1252 (Windows Western European, often called "ANSI")
    Windows1252,
    /// Windows-1250 / CP-1250 (Windows Central European)
    Windows1250,
    /// Windows-1251 / CP-1251 (Windows Cyrillic)
    Windows1251,
    /// GB18030 (Chinese, superset of GBK)
    Gb18030,
    /// GBK (Chinese Simplified, subset of GB18030)
    Gbk,
    /// Shift-JIS (Japanese)
    ShiftJis,
    /// EUC-KR (Korean)
    EucKr,
}
60
61impl Encoding {
62 /// Get the display name for status bar
63 pub fn display_name(&self) -> &'static str {
64 match self {
65 Self::Utf8 => "UTF-8",
66 Self::Utf8Bom => "UTF-8 BOM",
67 Self::Utf16Le => "UTF-16 LE",
68 Self::Utf16Be => "UTF-16 BE",
69 Self::Ascii => "ASCII",
70 Self::Latin1 => "Latin-1",
71 Self::Windows1252 => "Windows-1252",
72 Self::Windows1250 => "Windows-1250",
73 Self::Windows1251 => "Windows-1251",
74 Self::Gb18030 => "GB18030",
75 Self::Gbk => "GBK",
76 Self::ShiftJis => "Shift-JIS",
77 Self::EucKr => "EUC-KR",
78 }
79 }
80
81 /// Get a longer description for UI (e.g., command palette)
82 pub fn description(&self) -> &'static str {
83 match self {
84 Self::Utf8 => "UTF-8",
85 Self::Utf8Bom => "UTF-8 with BOM",
86 Self::Utf16Le => "UTF-16 Little Endian",
87 Self::Utf16Be => "UTF-16 Big Endian",
88 Self::Ascii => "US-ASCII",
89 Self::Latin1 => "ISO-8859-1 / Latin-1 – Western European",
90 Self::Windows1252 => "Windows-1252 / CP1252 – Western European",
91 Self::Windows1250 => "Windows-1250 / CP1250 – Central European",
92 Self::Windows1251 => "Windows-1251 / CP1251 – Cyrillic",
93 Self::Gb18030 => "GB18030 – Chinese",
94 Self::Gbk => "GBK / CP936 – Simplified Chinese",
95 Self::ShiftJis => "Shift_JIS – Japanese",
96 Self::EucKr => "EUC-KR – Korean",
97 }
98 }
99
100 /// Get the encoding_rs Encoding for this type
101 pub fn to_encoding_rs(&self) -> &'static encoding_rs::Encoding {
102 match self {
103 Self::Utf8 | Self::Utf8Bom | Self::Ascii => encoding_rs::UTF_8,
104 Self::Utf16Le => encoding_rs::UTF_16LE,
105 Self::Utf16Be => encoding_rs::UTF_16BE,
106 Self::Latin1 => encoding_rs::WINDOWS_1252, // ISO-8859-1 maps to Windows-1252 per WHATWG
107 Self::Windows1252 => encoding_rs::WINDOWS_1252,
108 Self::Windows1250 => encoding_rs::WINDOWS_1250,
109 Self::Windows1251 => encoding_rs::WINDOWS_1251,
110 Self::Gb18030 => encoding_rs::GB18030,
111 Self::Gbk => encoding_rs::GBK,
112 Self::ShiftJis => encoding_rs::SHIFT_JIS,
113 Self::EucKr => encoding_rs::EUC_KR,
114 }
115 }
116
117 /// Returns true if this encoding uses a BOM (Byte Order Mark)
118 pub fn has_bom(&self) -> bool {
119 matches!(self, Self::Utf8Bom | Self::Utf16Le | Self::Utf16Be)
120 }
121
122 /// Get the BOM bytes for this encoding (if any)
123 pub fn bom_bytes(&self) -> Option<&'static [u8]> {
124 match self {
125 Self::Utf8Bom => Some(&[0xEF, 0xBB, 0xBF]),
126 Self::Utf16Le => Some(&[0xFF, 0xFE]),
127 Self::Utf16Be => Some(&[0xFE, 0xFF]),
128 _ => None,
129 }
130 }
131
132 /// All available encodings for UI display
133 pub fn all() -> &'static [Encoding] {
134 &[
135 Self::Utf8,
136 Self::Utf8Bom,
137 Self::Utf16Le,
138 Self::Utf16Be,
139 Self::Ascii,
140 Self::Latin1,
141 Self::Windows1252,
142 Self::Windows1250,
143 Self::Windows1251,
144 Self::Gb18030,
145 Self::Gbk,
146 Self::ShiftJis,
147 Self::EucKr,
148 ]
149 }
150
151 /// Returns true if this encoding supports "resynchronization" - the ability to
152 /// find character boundaries when jumping into the middle of a file.
153 ///
154 /// Resynchronizable encodings can be safely used with lazy/streaming file loading
155 /// because you can determine character boundaries from any position.
156 ///
157 /// - **UTF-8**: Excellent - unique bit patterns distinguish lead/continuation bytes
158 /// - **ASCII/Latin-1/Windows-1252**: Trivial - every byte is a character
159 /// - **UTF-16**: Good with 2-byte alignment - can detect surrogate pairs
160 /// - **UTF-32**: Good with 4-byte alignment
161 ///
162 /// Non-resynchronizable encodings (legacy CJK like Shift-JIS, GB18030, GBK, Big5)
163 /// have ambiguous byte sequences where a byte could be either a standalone character
164 /// or part of a multi-byte sequence. You must scan from the beginning to be certain.
165 pub fn is_resynchronizable(&self) -> bool {
166 match self {
167 // Fixed-width single byte - every byte is a character
168 Self::Ascii
169 | Self::Latin1
170 | Self::Windows1252
171 | Self::Windows1250
172 | Self::Windows1251 => true,
173
174 // UTF-8 has unique bit patterns for lead vs continuation bytes
175 Self::Utf8 | Self::Utf8Bom => true,
176
177 // UTF-16 is resynchronizable with 2-byte alignment
178 // (can detect surrogate pairs by checking 0xD800-0xDFFF range)
179 Self::Utf16Le | Self::Utf16Be => true,
180
181 // Legacy CJK encodings are NOT resynchronizable
182 // The second byte of a double-byte char can equal a valid single-byte char
183 Self::Gb18030 | Self::Gbk | Self::ShiftJis | Self::EucKr => false,
184 }
185 }
186
187 /// Returns the byte alignment required for this encoding when doing random access.
188 ///
189 /// For lazy loading of large files, reads must be aligned to this boundary.
190 /// Returns None if the encoding is not resynchronizable (requires full file scan).
191 pub fn alignment(&self) -> Option<usize> {
192 match self {
193 // Single-byte encodings - no alignment needed
194 Self::Ascii
195 | Self::Latin1
196 | Self::Windows1252
197 | Self::Windows1250
198 | Self::Windows1251 => Some(1),
199
200 // UTF-8 - no alignment needed (self-synchronizing)
201 Self::Utf8 | Self::Utf8Bom => Some(1),
202
203 // UTF-16 - must be 2-byte aligned
204 Self::Utf16Le | Self::Utf16Be => Some(2),
205
206 // Legacy CJK - not resynchronizable, no valid alignment
207 Self::Gb18030 | Self::Gbk | Self::ShiftJis | Self::EucKr => None,
208 }
209 }
210
211 /// Returns true if this encoding requires the entire file to be loaded
212 /// for correct decoding (cannot use lazy/streaming loading).
213 ///
214 /// This is the inverse of `is_resynchronizable()` and indicates that
215 /// the user should be warned before loading large files in this encoding.
216 pub fn requires_full_file_load(&self) -> bool {
217 !self.is_resynchronizable()
218 }
219}
220
221// ============================================================================
222// Encoding Detection
223// ============================================================================
224
225/// Detect the text encoding from a sample of bytes
226///
227/// This function delegates to `detect_encoding_or_binary` and returns only
228/// the encoding, ignoring the binary flag. Use `detect_encoding_or_binary`
229/// when you need to know if the content should be treated as binary.
230pub fn detect_encoding(bytes: &[u8]) -> Encoding {
231 detect_encoding_or_binary(bytes, false).0
232}
233
234/// Detect the text encoding and whether content is binary.
235///
236/// Returns (Encoding, is_binary) where:
237/// - Encoding is the detected encoding (or default if binary)
238/// - is_binary is true if the content should be treated as raw binary
239///
240/// When `truncated` is true, an incomplete multi-byte UTF-8 sequence at the
241/// end of the sample is tolerated (up to 3 bytes) since it likely results from
242/// the caller truncating a larger stream. When false, such trailing bytes cause
243/// the sample to be rejected as UTF-8.
244///
245/// # Detection Strategy
246///
247/// 1. Check for BOM (Byte Order Mark) - highest priority, definitely not binary
248/// 2. Try UTF-8 validation (fast path for most files), definitely not binary
249/// 3. Check for UTF-16 patterns without BOM, definitely not binary
250/// 4. Check for binary control characters (null bytes, etc.) - if found, it's binary
251/// 5. Use chardetng for statistical detection of legacy encodings
252/// 6. If encoding detection is uncertain, default to Windows-1252
253pub fn detect_encoding_or_binary(bytes: &[u8], truncated: bool) -> (Encoding, bool) {
254 // Only check the first 8KB for encoding detection
255 let check_len = bytes.len().min(8 * 1024);
256 let sample = &bytes[..check_len];
257
258 // 1. Check for BOM (Byte Order Mark) - highest priority, definitely text
259 if sample.starts_with(&[0xEF, 0xBB, 0xBF]) {
260 return (Encoding::Utf8Bom, false);
261 }
262 if sample.starts_with(&[0xFF, 0xFE]) {
263 // Could also be UTF-32 LE, but UTF-16 LE is much more common
264 return (Encoding::Utf16Le, false);
265 }
266 if sample.starts_with(&[0xFE, 0xFF]) {
267 return (Encoding::Utf16Be, false);
268 }
269
270 // 2. Try UTF-8 validation (fast path for most modern files)
271 // Note: When we truncate to 8KB, we may cut in the middle of a multi-byte UTF-8 sequence.
272 // We need to handle this case - if most of the sample is valid UTF-8 and the only error
273 // is an incomplete sequence at the very end, we should still detect it as UTF-8.
274 let utf8_valid_len = match std::str::from_utf8(sample) {
275 Ok(_) => sample.len(),
276 Err(e) => {
277 // error_len() returns None if the error is due to incomplete sequence at end
278 // (i.e., unexpected end of input), vs Some(n) for an invalid byte
279 if e.error_len().is_none() {
280 // Incomplete sequence at end - this is likely due to sample truncation
281 e.valid_up_to()
282 } else {
283 // Invalid byte found - not valid UTF-8
284 0
285 }
286 }
287 };
288
289 // If the sample is valid UTF-8, treat it as UTF-8.
290 // When the caller indicates the sample was truncated from a larger stream,
291 // tolerate up to 3 trailing bytes of an incomplete multi-byte sequence (a
292 // truncation artifact). Without truncation, require exact validity — a
293 // trailing 0xE9 in a short file is a Latin-1 'é', not a truncated codepoint.
294 let is_valid_utf8 = utf8_valid_len == sample.len()
295 || (truncated && utf8_valid_len > 0 && utf8_valid_len >= sample.len() - 3);
296 if is_valid_utf8 {
297 let valid_sample = &sample[..utf8_valid_len];
298 // Check if it's pure ASCII (subset of UTF-8)
299 // Also check for binary indicators in valid ASCII/UTF-8
300 let has_binary_control = valid_sample.iter().any(|&b| is_binary_control_char(b));
301 if has_binary_control {
302 return (Encoding::Utf8, true);
303 }
304 if valid_sample.iter().all(|&b| b < 128) {
305 return (Encoding::Ascii, false);
306 }
307 return (Encoding::Utf8, false);
308 }
309
310 // 3. Check for UTF-16 without BOM (common in some Windows files)
311 // Heuristic: Look for patterns of null bytes alternating with printable chars
312 // The non-null byte should be printable (0x20-0x7E) or a valid high byte
313 //
314 // Note: Unlike UTF-8 above, this heuristic is robust to sample truncation because:
315 // - We use statistical pattern matching (50% threshold), not strict validation
316 // - chunks(2) naturally handles odd-length samples by dropping the last byte
317 // - Losing 1 pair out of ~4096 doesn't affect the detection threshold
318 if sample.len() >= 4 {
319 let is_printable_or_high = |b: u8| (0x20..=0x7E).contains(&b) || b >= 0x80;
320
321 // Align to even boundary to ensure we only process complete 2-byte pairs
322 let aligned_len = sample.len() & !1; // Round down to even
323 let aligned_sample = &sample[..aligned_len];
324
325 let le_pairs = aligned_sample
326 .chunks(2)
327 .filter(|chunk| chunk[1] == 0 && is_printable_or_high(chunk[0]))
328 .count();
329 let be_pairs = aligned_sample
330 .chunks(2)
331 .filter(|chunk| chunk[0] == 0 && is_printable_or_high(chunk[1]))
332 .count();
333 let pair_count = aligned_len / 2;
334
335 // If more than 50% of pairs look like valid UTF-16 text, it's text
336 if le_pairs > pair_count / 2 {
337 return (Encoding::Utf16Le, false);
338 }
339 if be_pairs > pair_count / 2 {
340 return (Encoding::Utf16Be, false);
341 }
342 }
343
344 // 4. Check for binary indicators EARLY (before chardetng)
345 // Binary files often contain control characters and null bytes that should not
346 // appear in any valid text encoding. Check this before chardetng because
347 // chardetng might still be "confident" about some encoding for binary data.
348 let has_binary_control = sample
349 .iter()
350 .any(|&b| b == 0x00 || is_binary_control_char(b));
351 if has_binary_control {
352 return (Encoding::Utf8, true);
353 }
354
355 // 5. Check for Latin-1 patterns: high bytes followed by invalid CJK trail bytes
356 // In GB18030/GBK, trail bytes must be 0x40-0x7E or 0x80-0xFE
357 // If a high byte is followed by a byte outside these ranges (e.g., space, newline,
358 // punctuation < 0x40), it's likely Latin-1, not CJK
359 let has_latin1_pattern = has_latin1_high_byte_pattern(sample);
360
361 // Also check for bytes in CJK-only range (0x81-0x9F) which can only be CJK lead bytes
362 let has_cjk_only_bytes = sample.iter().any(|&b| (0x81..0xA0).contains(&b));
363
364 // 6. Use chardetng for statistical encoding detection
365 let mut detector = chardetng::EncodingDetector::new();
366 detector.feed(sample, true);
367 let (detected_encoding, confident) = detector.guess_assess(None, true);
368
369 // If chardetng is confident, use that encoding (not binary)
370 if confident {
371 let is_cjk_encoding = detected_encoding == encoding_rs::GB18030
372 || detected_encoding == encoding_rs::GBK
373 || detected_encoding == encoding_rs::SHIFT_JIS
374 || detected_encoding == encoding_rs::EUC_KR;
375
376 // For CJK encodings, prefer Windows-1252 if we have clear Latin-1 indicators:
377 // - Space followed by high byte (0xA0-0xFF) is common in Latin-1 text
378 //
379 // If there are CJK-only bytes (0x81-0x9F), it's definitely CJK (not ambiguous).
380 // If there are Latin-1 patterns (space + high byte), prefer Windows-1252.
381 // Otherwise, trust chardetng's detection.
382 if is_cjk_encoding && !has_cjk_only_bytes && has_latin1_pattern {
383 return (Encoding::Windows1252, false);
384 }
385
386 // GBK is a subset of GB18030. Since we only inspect the first 8KB for
387 // detection, the sample may not contain GB18030-only code points (uncommon
388 // Chinese characters, emoji, etc.). Treating GBK as GB18030 is safer and
389 // ensures proper display of all characters including French, Spanish, and emoji.
390 let encoding =
391 if detected_encoding == encoding_rs::GB18030 || detected_encoding == encoding_rs::GBK {
392 Encoding::Gb18030
393 } else if detected_encoding == encoding_rs::SHIFT_JIS {
394 Encoding::ShiftJis
395 } else if detected_encoding == encoding_rs::EUC_KR {
396 Encoding::EucKr
397 } else if detected_encoding == encoding_rs::WINDOWS_1251
398 || detected_encoding == encoding_rs::WINDOWS_1252
399 || detected_encoding == encoding_rs::WINDOWS_1250
400 {
401 // chardetng can't reliably distinguish Latin-1 from Cyrillic for
402 // short samples with ambiguous high bytes — a run like "éééÿ"
403 // (Latin-1) has the same bytes as "еёёя" (Cyrillic) and chardetng
404 // may confidently pick either. Route through the heuristic and
405 // default to Windows-1252 unless there is strong evidence.
406 if has_windows1250_pattern(sample) {
407 Encoding::Windows1250
408 } else if has_windows1251_pattern(sample) {
409 Encoding::Windows1251
410 } else {
411 Encoding::Windows1252
412 }
413 } else if detected_encoding == encoding_rs::UTF_8 {
414 // chardetng thinks it's UTF-8, but validation failed above
415 // Could still be Windows-1250/1251 if it has legacy patterns
416 if has_windows1250_pattern(sample) {
417 Encoding::Windows1250
418 } else if has_windows1251_pattern(sample) {
419 Encoding::Windows1251
420 } else {
421 Encoding::Windows1252
422 }
423 } else {
424 // Unknown encoding - check for Windows-1250/1251 patterns
425 if has_windows1250_pattern(sample) {
426 Encoding::Windows1250
427 } else if has_windows1251_pattern(sample) {
428 Encoding::Windows1251
429 } else {
430 Encoding::Windows1252
431 }
432 };
433 return (encoding, false);
434 }
435
436 // 7. chardetng not confident, but no binary indicators - check for Windows-1250/1251 patterns
437 // We already checked for binary control chars earlier, so this is valid text
438 if has_windows1250_pattern(sample) {
439 (Encoding::Windows1250, false)
440 } else if has_windows1251_pattern(sample) {
441 (Encoding::Windows1251, false)
442 } else {
443 (Encoding::Windows1252, false)
444 }
445}
446
447// ============================================================================
448// Binary Detection Helpers
449// ============================================================================
450
/// Check if a byte is a binary control character.
///
/// Returns true for control characters that typically indicate binary content
/// (including NUL and DEL), excluding the control characters that legitimately
/// appear in text files: Tab, LF, VT, FF, CR, and ESC.
pub fn is_binary_control_char(byte: u8) -> bool {
    match byte {
        // Text-friendly controls: 0x09 Tab, 0x0A LF, 0x0B VT, 0x0C FF,
        // 0x0D CR, and 0x1B ESC (terminal escape sequences).
        0x09..=0x0D | 0x1B => false,
        // Any other C0 control, plus 0x7F (DEL), signals binary content.
        0x00..=0x1F | 0x7F => true,
        // Printable ASCII and all high bytes are fine.
        _ => false,
    }
}
467
/// Check if sample has Latin-1 patterns that cannot be valid CJK encoding.
///
/// In GB18030/GBK, valid sequences are:
/// - ASCII bytes (0x00-0x7F) as standalone characters
/// - Lead byte (0x81-0xFE) + Trail byte (0x40-0x7E or 0x80-0xFE)
///
/// This function counts indicators of Latin-1 text:
/// 1. An ASCII space immediately followed by a high byte >= 0xA0
///    (common in Latin-1 text like "Café résumé")
/// 2. A high byte >= 0xA0 followed by a byte < 0x40 (space, newline,
///    punctuation), which cannot be a CJK trail byte
///
/// Byte pairs that form a plausible CJK lead/trail sequence are skipped as a
/// unit and never counted.
fn has_latin1_high_byte_pattern(sample: &[u8]) -> bool {
    let mut indicators = 0usize;
    let mut idx = 0usize;

    while idx < sample.len() {
        let current = sample[idx];

        if current < 0x80 {
            // ASCII byte. A space directly before a Latin-1 extended character
            // (>= 0xA0, outside the CJK-only lead range) is a Latin-1 signal.
            if current == b' ' {
                if let Some(&follow) = sample.get(idx + 1) {
                    if follow >= 0xA0 {
                        indicators += 1;
                    }
                }
            }
            idx += 1;
        } else if let Some(&follow) = sample.get(idx + 1) {
            // High byte (0x80-0xFF) with a successor: could be a CJK pair.
            let cjk_lead = (0x81..=0xFE).contains(&current);
            let cjk_trail = matches!(follow, 0x40..=0x7E | 0x80..=0xFE);

            if cjk_lead && cjk_trail {
                // Plausible CJK double-byte sequence - consume both bytes.
                idx += 2;
            } else {
                // Not a valid CJK pair; a high byte trailed by low ASCII
                // (< 0x40) is a Latin-1 indicator.
                if current >= 0xA0 && follow < 0x40 {
                    indicators += 1;
                }
                idx += 1;
            }
        } else {
            // Trailing high byte at the very end - nothing to pair with.
            idx += 1;
        }
    }

    // Any indicator at all tips the balance toward Latin-1.
    indicators > 0
}
527
528// ============================================================================
529// Encoding Conversion
530// ============================================================================
531
532/// Detect encoding and convert bytes to UTF-8
533///
534/// Returns the detected encoding and the UTF-8 converted content.
535/// This is the core function for normalizing file content to UTF-8 on load.
536pub fn detect_and_convert(bytes: &[u8]) -> (Encoding, Vec<u8>) {
537 if bytes.is_empty() {
538 return (Encoding::Utf8, Vec::new());
539 }
540
541 let encoding = detect_encoding(bytes);
542
543 // For UTF-8 (with or without BOM), we can use the content directly
544 match encoding {
545 Encoding::Utf8 | Encoding::Ascii => {
546 // Already UTF-8, just clone
547 (encoding, bytes.to_vec())
548 }
549 Encoding::Utf8Bom => {
550 // Skip the BOM (3 bytes) and use the rest
551 let content = if bytes.len() > 3 {
552 bytes[3..].to_vec()
553 } else {
554 Vec::new()
555 };
556 (encoding, content)
557 }
558 Encoding::Utf16Le | Encoding::Utf16Be => {
559 // Decode UTF-16 to UTF-8
560 let enc_rs = encoding.to_encoding_rs();
561 let start_offset =
562 if bytes.starts_with(&[0xFF, 0xFE]) || bytes.starts_with(&[0xFE, 0xFF]) {
563 2 // Skip BOM
564 } else {
565 0
566 };
567 let data = &bytes[start_offset..];
568
569 let (cow, _had_errors) = enc_rs.decode_without_bom_handling(data);
570 (encoding, cow.into_owned().into_bytes())
571 }
572 _ => {
573 // Use encoding_rs to convert to UTF-8
574 let enc_rs = encoding.to_encoding_rs();
575 let (cow, _had_errors) = enc_rs.decode_without_bom_handling(bytes);
576 (encoding, cow.into_owned().into_bytes())
577 }
578 }
579}
580
581/// Convert bytes from a specific encoding to UTF-8
582///
583/// Used when opening a file with a user-specified encoding instead of auto-detection.
584/// Returns the UTF-8 converted content.
585pub fn convert_to_utf8(bytes: &[u8], encoding: Encoding) -> Vec<u8> {
586 if bytes.is_empty() {
587 return Vec::new();
588 }
589
590 match encoding {
591 Encoding::Utf8 | Encoding::Ascii => {
592 // Already UTF-8, just clone
593 bytes.to_vec()
594 }
595 Encoding::Utf8Bom => {
596 // Skip the BOM (3 bytes) if present and use the rest
597 if bytes.starts_with(&[0xEF, 0xBB, 0xBF]) && bytes.len() > 3 {
598 bytes[3..].to_vec()
599 } else {
600 bytes.to_vec()
601 }
602 }
603 Encoding::Utf16Le | Encoding::Utf16Be => {
604 // Decode UTF-16 to UTF-8
605 let enc_rs = encoding.to_encoding_rs();
606 let start_offset =
607 if bytes.starts_with(&[0xFF, 0xFE]) || bytes.starts_with(&[0xFE, 0xFF]) {
608 2 // Skip BOM
609 } else {
610 0
611 };
612 let data = &bytes[start_offset..];
613
614 let (cow, _had_errors) = enc_rs.decode_without_bom_handling(data);
615 cow.into_owned().into_bytes()
616 }
617 _ => {
618 // Use encoding_rs to convert to UTF-8
619 let enc_rs = encoding.to_encoding_rs();
620 let (cow, _had_errors) = enc_rs.decode_without_bom_handling(bytes);
621 cow.into_owned().into_bytes()
622 }
623 }
624}
625
626/// Convert UTF-8 content to the specified encoding for saving
627///
628/// Used when saving files to convert internal UTF-8 representation
629/// back to the original (or user-selected) encoding.
630///
631/// Note: This does NOT add BOM - the BOM should be handled separately.
632pub fn convert_from_utf8(utf8_bytes: &[u8], encoding: Encoding) -> Vec<u8> {
633 match encoding {
634 Encoding::Utf8 | Encoding::Ascii | Encoding::Utf8Bom => {
635 // UTF-8 (with or without BOM) - just clone, BOM added separately
636 utf8_bytes.to_vec()
637 }
638 Encoding::Utf16Le => {
639 // Convert UTF-8 to UTF-16 LE (no BOM - added separately)
640 let text = String::from_utf8_lossy(utf8_bytes);
641 let mut result = Vec::new();
642 for code_unit in text.encode_utf16() {
643 result.extend_from_slice(&code_unit.to_le_bytes());
644 }
645 result
646 }
647 Encoding::Utf16Be => {
648 // Convert UTF-8 to UTF-16 BE (no BOM - added separately)
649 let text = String::from_utf8_lossy(utf8_bytes);
650 let mut result = Vec::new();
651 for code_unit in text.encode_utf16() {
652 result.extend_from_slice(&code_unit.to_be_bytes());
653 }
654 result
655 }
656 _ => {
657 // Use encoding_rs to convert from UTF-8
658 let enc_rs = encoding.to_encoding_rs();
659 let text = String::from_utf8_lossy(utf8_bytes);
660 let (cow, _encoding_used, _had_errors) = enc_rs.encode(&text);
661 cow.into_owned()
662 }
663 }
664}
665
666// ============================================================================
667// Tests
668// ============================================================================
669
670#[cfg(test)]
671mod tests {
672 use super::*;
673
    #[test]
    fn test_encoding_display_names() {
        // Status-bar names must use the canonical spelling of each encoding.
        assert_eq!(Encoding::Utf8.display_name(), "UTF-8");
        assert_eq!(Encoding::Utf8Bom.display_name(), "UTF-8 BOM");
        assert_eq!(Encoding::Utf16Le.display_name(), "UTF-16 LE");
        assert_eq!(Encoding::Gb18030.display_name(), "GB18030");
        assert_eq!(Encoding::Windows1250.display_name(), "Windows-1250");
    }
682
    #[test]
    fn test_encoding_bom() {
        // Only the UTF-8-with-BOM and UTF-16 variants report carrying a BOM.
        assert!(Encoding::Utf8Bom.has_bom());
        assert!(Encoding::Utf16Le.has_bom());
        assert!(!Encoding::Utf8.has_bom());
        assert!(!Encoding::Windows1252.has_bom());
        assert!(!Encoding::Windows1250.has_bom());
    }
691
    #[test]
    fn test_detect_utf8() {
        // Pure 7-bit content is reported as Ascii; multi-byte UTF-8 as Utf8.
        assert_eq!(detect_encoding(b"Hello, world!"), Encoding::Ascii);
        assert_eq!(detect_encoding("Hello, 世界!".as_bytes()), Encoding::Utf8);
    }
697
    #[test]
    fn test_detect_utf8_bom() {
        // The 3-byte UTF-8 BOM (EF BB BF) takes priority over content analysis.
        let with_bom = [0xEF, 0xBB, 0xBF, b'H', b'i'];
        assert_eq!(detect_encoding(&with_bom), Encoding::Utf8Bom);
    }
703
    #[test]
    fn test_detect_utf16_le() {
        // A leading FF FE BOM identifies UTF-16 LE regardless of the payload.
        let utf16_le_bom = [0xFF, 0xFE, b'H', 0x00, b'i', 0x00];
        assert_eq!(detect_encoding(&utf16_le_bom), Encoding::Utf16Le);
    }
709
    #[test]
    fn test_detect_binary() {
        // Null bytes and low control characters mark the content as binary.
        let binary_data = [0x00, 0x01, 0x02, 0x03];
        let (_, is_binary) = detect_encoding_or_binary(&binary_data, false);
        assert!(is_binary);
    }
716
    #[test]
    fn test_is_binary_control_char() {
        // Exercises all three classes: binary controls, text controls, printables.
        // Binary control chars
        assert!(is_binary_control_char(0x00)); // NUL
        assert!(is_binary_control_char(0x01)); // SOH
        assert!(is_binary_control_char(0x02)); // STX
        assert!(is_binary_control_char(0x7F)); // DEL

        // Text control chars (allowed)
        assert!(!is_binary_control_char(0x09)); // Tab
        assert!(!is_binary_control_char(0x0A)); // LF
        assert!(!is_binary_control_char(0x0D)); // CR
        assert!(!is_binary_control_char(0x1B)); // ESC

        // Regular printable chars
        assert!(!is_binary_control_char(b'A'));
        assert!(!is_binary_control_char(b' '));
    }
735
    #[test]
    fn test_convert_roundtrip_utf8() {
        // UTF-8 input should pass through load and save byte-for-byte.
        let original = "Hello, 世界!";
        let bytes = original.as_bytes();

        let (encoding, utf8_content) = detect_and_convert(bytes);
        assert_eq!(encoding, Encoding::Utf8);
        assert_eq!(utf8_content, bytes);

        let back = convert_from_utf8(&utf8_content, encoding);
        assert_eq!(back, bytes);
    }
748
    #[test]
    fn test_convert_roundtrip_utf16le() {
        // Loading strips the BOM and decodes to UTF-8; saving re-encodes without it.
        // UTF-16 LE with BOM: "Hi"
        let utf16_le = [0xFF, 0xFE, b'H', 0x00, b'i', 0x00];

        let (encoding, utf8_content) = detect_and_convert(&utf16_le);
        assert_eq!(encoding, Encoding::Utf16Le);
        assert_eq!(utf8_content, b"Hi");

        // Note: convert_from_utf8 doesn't add BOM, so result won't have BOM
        let back = convert_from_utf8(&utf8_content, encoding);
        assert_eq!(back, [b'H', 0x00, b'i', 0x00]);
    }
762
    #[test]
    fn test_encoding_resynchronizable() {
        // Pins which encodings allow random-access character-boundary recovery.
        // Self-synchronizing encodings (can find char boundaries from middle of file)
        assert!(Encoding::Utf8.is_resynchronizable());
        assert!(Encoding::Utf8Bom.is_resynchronizable());
        assert!(Encoding::Ascii.is_resynchronizable());
        assert!(Encoding::Latin1.is_resynchronizable());
        assert!(Encoding::Windows1252.is_resynchronizable());
        assert!(Encoding::Windows1250.is_resynchronizable());

        // UTF-16 is resynchronizable with proper alignment
        assert!(Encoding::Utf16Le.is_resynchronizable());
        assert!(Encoding::Utf16Be.is_resynchronizable());

        // Legacy CJK encodings are NOT resynchronizable
        // (second byte of double-byte char can equal a valid single-byte char)
        assert!(!Encoding::Gb18030.is_resynchronizable());
        assert!(!Encoding::Gbk.is_resynchronizable());
        assert!(!Encoding::ShiftJis.is_resynchronizable());
        assert!(!Encoding::EucKr.is_resynchronizable());
    }
784
    #[test]
    fn test_encoding_alignment() {
        // Alignment mirrors is_resynchronizable: Some(..) iff resynchronizable.
        // Single-byte encodings have alignment of 1
        assert_eq!(Encoding::Ascii.alignment(), Some(1));
        assert_eq!(Encoding::Latin1.alignment(), Some(1));
        assert_eq!(Encoding::Windows1252.alignment(), Some(1));
        assert_eq!(Encoding::Windows1250.alignment(), Some(1));
        assert_eq!(Encoding::Utf8.alignment(), Some(1));
        assert_eq!(Encoding::Utf8Bom.alignment(), Some(1));

        // UTF-16 requires 2-byte alignment
        assert_eq!(Encoding::Utf16Le.alignment(), Some(2));
        assert_eq!(Encoding::Utf16Be.alignment(), Some(2));

        // Non-resynchronizable encodings have no valid alignment
        assert_eq!(Encoding::Gb18030.alignment(), None);
        assert_eq!(Encoding::Gbk.alignment(), None);
        assert_eq!(Encoding::ShiftJis.alignment(), None);
        assert_eq!(Encoding::EucKr.alignment(), None);
    }
805
    #[test]
    fn test_requires_full_file_load() {
        // requires_full_file_load is the inverse of is_resynchronizable.
        // Encodings that can be streamed
        assert!(!Encoding::Utf8.requires_full_file_load());
        assert!(!Encoding::Ascii.requires_full_file_load());
        assert!(!Encoding::Latin1.requires_full_file_load());
        assert!(!Encoding::Windows1250.requires_full_file_load());
        assert!(!Encoding::Utf16Le.requires_full_file_load());

        // Encodings that require full loading
        assert!(Encoding::Gb18030.requires_full_file_load());
        assert!(Encoding::Gbk.requires_full_file_load());
        assert!(Encoding::ShiftJis.requires_full_file_load());
        assert!(Encoding::EucKr.requires_full_file_load());
    }
821
    #[test]
    fn test_convert_roundtrip_windows1250() {
        // Decode Windows-1250 to UTF-8 and re-encode; bytes must round-trip.
        // Windows-1250 encoded text with Central European characters
        // "Zażółć" in Windows-1250: Z(0x5A) a(0x61) ż(0xBF) ó(0xF3) ł(0xB3) ć(0xE6)
        let windows1250_bytes: &[u8] = &[0x5A, 0x61, 0xBF, 0xF3, 0xB3, 0xE6];

        // Convert to UTF-8
        let enc_rs = Encoding::Windows1250.to_encoding_rs();
        let (decoded, _had_errors) = enc_rs.decode_without_bom_handling(windows1250_bytes);
        let utf8_content = decoded.as_bytes();

        // The UTF-8 content should contain the Polish characters
        let utf8_str = std::str::from_utf8(utf8_content).unwrap();
        assert!(utf8_str.contains('ż'), "Should contain ż: {}", utf8_str);
        assert!(utf8_str.contains('ó'), "Should contain ó: {}", utf8_str);
        assert!(utf8_str.contains('ł'), "Should contain ł: {}", utf8_str);
        assert!(utf8_str.contains('ć'), "Should contain ć: {}", utf8_str);

        // Convert back to Windows-1250
        let back = convert_from_utf8(utf8_content, Encoding::Windows1250);
        assert_eq!(back, windows1250_bytes, "Round-trip should preserve bytes");
    }
844
    #[test]
    fn test_windows1250_description() {
        // Pins the exact UI description string (including the en dash).
        assert_eq!(
            Encoding::Windows1250.description(),
            "Windows-1250 / CP1250 – Central European"
        );
    }
852
853 #[test]
854 fn test_detect_windows1250_definitive_bytes() {
855 // Bytes 0x8D (Ť), 0x8F (Ź), 0x9D (ť) are undefined in Windows-1252
856 // but valid in Windows-1250, so they definitively indicate Windows-1250
857
858 // Czech text with ť (0x9D): "měsťo" (city, archaic)
859 let with_t_caron = [0x6D, 0x9D, 0x73, 0x74, 0x6F]; // mťsto
860 assert_eq!(
861 detect_encoding(&with_t_caron),
862 Encoding::Windows1250,
863 "Byte 0x9D (ť) should trigger Windows-1250 detection"
864 );
865
866 // Polish text with Ź (0x8F): "Źródło" (source)
867 let with_z_acute_upper = [0x8F, 0x72, 0xF3, 0x64, 0xB3, 0x6F]; // Źródło
868 assert_eq!(
869 detect_encoding(&with_z_acute_upper),
870 Encoding::Windows1250,
871 "Byte 0x8F (Ź) should trigger Windows-1250 detection"
872 );
873 }
874
875 #[test]
876 fn test_detect_windows1250_strong_indicators() {
877 // Polish text with ś (0x9C) and Ś (0x8C) - strong indicators from 0x80-0x9F range
878 let polish_text = [
879 0x9C, 0x77, 0x69, 0x65, 0x74, 0x79, 0x20, // "świety "
880 0x8C, 0x77, 0x69, 0x61, 0x74, // "Świat"
881 ];
882 assert_eq!(
883 detect_encoding(&polish_text),
884 Encoding::Windows1250,
885 "Multiple Polish characters (ś, Ś) should trigger Windows-1250"
886 );
887 }
888
889 #[test]
890 fn test_detect_ambiguous_bytes_as_windows1252() {
891 // Bytes in 0xA0-0xFF range are ambiguous and should default to Windows-1252
892 // Polish "żółć" - ż(0xBF) ó(0xF3) ł(0xB3) ć(0xE6) - all ambiguous
893 let zolc = [0xBF, 0xF3, 0xB3, 0xE6];
894 assert_eq!(
895 detect_encoding(&zolc),
896 Encoding::Windows1252,
897 "Ambiguous bytes should default to Windows-1252"
898 );
899
900 // ą (0xB9) and ł (0xB3) could be ¹ and ³ in Windows-1252
901 let ambiguous = [
902 0x6D, 0xB9, 0x6B, 0x61, 0x20, // "mąka " or "m¹ka "
903 0x6D, 0xB3, 0x6F, 0x64, 0x79, // "młody" or "m³ody"
904 ];
905 assert_eq!(
906 detect_encoding(&ambiguous),
907 Encoding::Windows1252,
908 "Ambiguous Polish bytes should default to Windows-1252"
909 );
910 }
911
912 #[test]
913 fn test_detect_windows1250_czech_pangram() {
914 // "Příliš žluťoučký kůň úpěl ďábelské ódy" - Czech pangram in Windows-1250
915 // Contains ť (0x9D) which is a definitive Windows-1250 indicator
916 let czech_pangram: &[u8] = &[
917 0x50, 0xF8, 0xED, 0x6C, 0x69, 0x9A, 0x20, // "Příliš "
918 0x9E, 0x6C, 0x75, 0x9D, 0x6F, 0x75, 0xE8, 0x6B, 0xFD, 0x20, // "žluťoučký "
919 0x6B, 0xF9, 0xF2, 0x20, // "kůň "
920 0xFA, 0x70, 0xEC, 0x6C, 0x20, // "úpěl "
921 0xEF, 0xE1, 0x62, 0x65, 0x6C, 0x73, 0x6B, 0xE9, 0x20, // "ďábelské "
922 0xF3, 0x64, 0x79, // "ódy"
923 ];
924 assert_eq!(
925 detect_encoding(czech_pangram),
926 Encoding::Windows1250,
927 "Czech pangram should be detected as Windows-1250 (contains ť = 0x9D)"
928 );
929 }
930
931 #[test]
932 fn test_detect_windows1252_not_1250() {
933 // Pure Windows-1252 text without Central European indicators
934 // "Café résumé" in Windows-1252
935 let windows1252_text = [
936 0x43, 0x61, 0x66, 0xE9, 0x20, // "Café "
937 0x72, 0xE9, 0x73, 0x75, 0x6D, 0xE9, // "résumé"
938 ];
939 assert_eq!(
940 detect_encoding(&windows1252_text),
941 Encoding::Windows1252,
942 "French text should remain Windows-1252"
943 );
944 }
945
946 #[test]
947 fn test_convert_roundtrip_windows1251() {
948 // Russian "Привет" (Hello) in Windows-1251:
949 // П=0xCF р=0xF0 и=0xE8 в=0xE2 е=0xE5 т=0xF2
950 let windows1251_bytes: &[u8] = &[0xCF, 0xF0, 0xE8, 0xE2, 0xE5, 0xF2];
951
952 // Convert to UTF-8
953 let enc_rs = Encoding::Windows1251.to_encoding_rs();
954 let (decoded, _had_errors) = enc_rs.decode_without_bom_handling(windows1251_bytes);
955 let utf8_content = decoded.as_bytes();
956
957 let utf8_str = std::str::from_utf8(utf8_content).unwrap();
958 assert_eq!(utf8_str, "Привет", "Should decode to Russian 'Привет'");
959
960 // Convert back to Windows-1251
961 let back = convert_from_utf8(utf8_content, Encoding::Windows1251);
962 assert_eq!(back, windows1251_bytes, "Round-trip should preserve bytes");
963 }
964
965 #[test]
966 fn test_windows1251_display_and_description() {
967 assert_eq!(Encoding::Windows1251.display_name(), "Windows-1251");
968 assert_eq!(
969 Encoding::Windows1251.description(),
970 "Windows-1251 / CP1251 – Cyrillic"
971 );
972 }
973
974 #[test]
975 fn test_windows1251_is_resynchronizable() {
976 assert!(Encoding::Windows1251.is_resynchronizable());
977 assert_eq!(Encoding::Windows1251.alignment(), Some(1));
978 assert!(!Encoding::Windows1251.requires_full_file_load());
979 assert!(!Encoding::Windows1251.has_bom());
980 }
981
982 #[test]
983 fn test_detect_windows1251_russian() {
984 // Russian sentence "Привет мир" (Hello world) in Windows-1251
985 let privet_mir: &[u8] = &[
986 0xCF, 0xF0, 0xE8, 0xE2, 0xE5, 0xF2, // Привет
987 0x20, // space
988 0xEC, 0xE8, 0xF0, // мир
989 ];
990 assert_eq!(
991 detect_encoding(privet_mir),
992 Encoding::Windows1251,
993 "Russian sentence should be detected as Windows-1251"
994 );
995 }
996
997 #[test]
998 fn test_detect_windows1251_russian_pangram() {
999 // Russian pangram fragment: "Съешь ещё этих мягких французских булок"
1000 // Contains many Cyrillic letters and the distinctive ё (0xB8) character.
1001 // bytes in Windows-1251:
1002 // С=0xD1 ъ=0xFA е=0xE5 ш=0xF8 ь=0xFC 0x20
1003 // е=0xE5 щ=0xF9 ё=0xB8 0x20
1004 // э=0xFD т=0xF2 и=0xE8 х=0xF5 0x20
1005 // м=0xEC я=0xFF г=0xE3 к=0xEA и=0xE8 х=0xF5 0x20
1006 // ф=0xF4 р=0xF0 а=0xE0 н=0xED ц=0xF6 у=0xF3 з=0xE7 с=0xF1 к=0xEA и=0xE8 х=0xF5 0x20
1007 // б=0xE1 у=0xF3 л=0xEB о=0xEE к=0xEA
1008 let pangram: &[u8] = &[
1009 0xD1, 0xFA, 0xE5, 0xF8, 0xFC, 0x20, 0xE5, 0xF9, 0xB8, 0x20, 0xFD, 0xF2, 0xE8, 0xF5,
1010 0x20, 0xEC, 0xFF, 0xE3, 0xEA, 0xE8, 0xF5, 0x20, 0xF4, 0xF0, 0xE0, 0xED, 0xF6, 0xF3,
1011 0xE7, 0xF1, 0xEA, 0xE8, 0xF5, 0x20, 0xE1, 0xF3, 0xEB, 0xEE, 0xEA,
1012 ];
1013 assert_eq!(
1014 detect_encoding(pangram),
1015 Encoding::Windows1251,
1016 "Russian pangram should be detected as Windows-1251"
1017 );
1018 }
1019
1020 #[test]
1021 fn test_detect_not_windows1251_ambiguous_polish() {
1022 // Regression: 4 consecutive Polish ambiguous bytes must still default
1023 // to Windows-1252, not be mis-detected as Cyrillic by the 1251 heuristic.
1024 let zolc = [0xBF, 0xF3, 0xB3, 0xE6];
1025 assert_eq!(
1026 detect_encoding(&zolc),
1027 Encoding::Windows1252,
1028 "Short ambiguous Polish bytes must not be detected as Windows-1251"
1029 );
1030 }
1031
1032 #[test]
1033 fn test_detect_utf8_chinese_truncated_sequence() {
1034 // Test that UTF-8 Chinese text is correctly detected even when the sample
1035 // is truncated in the middle of a multi-byte sequence.
1036 //
1037 // Bug context: When sampling first 8KB for detection, the boundary may cut
1038 // through a multi-byte UTF-8 character. This caused valid UTF-8 Chinese text
1039 // to fail std::str::from_utf8() validation and fall through to Windows-1250
1040 // detection (because UTF-8 continuation bytes like 0x9C, 0x9D overlap with
1041 // Windows-1250 indicator bytes).
1042
1043 // Chinese text "更多" (more) = [0xE6, 0x9B, 0xB4, 0xE5, 0xA4, 0x9A]
1044 // If we truncate after 0xE5, we get an incomplete sequence
1045 let utf8_chinese_truncated = [
1046 0xE6, 0x9B, 0xB4, // 更
1047 0xE5, 0xA4, 0x9A, // 多
1048 0xE5, // Start of another character, incomplete
1049 ];
1050
1051 // With truncated=true, this should be detected as UTF-8
1052 assert_eq!(
1053 detect_encoding_or_binary(&utf8_chinese_truncated, true).0,
1054 Encoding::Utf8,
1055 "Truncated UTF-8 Chinese text should be detected as UTF-8"
1056 );
1057
1058 // Without truncated flag, the incomplete trailing byte is treated as non-UTF-8
1059 assert_ne!(
1060 detect_encoding_or_binary(&utf8_chinese_truncated, false).0,
1061 Encoding::Utf8,
1062 "Non-truncated short sample with trailing 0xE5 should not be detected as UTF-8"
1063 );
1064
1065 // Test with 2 bytes of incomplete sequence
1066 let utf8_chinese_truncated_2 = [
1067 0xE6, 0x9B, 0xB4, // 更
1068 0xE5, 0xA4, 0x9A, // 多
1069 0xE5, 0xA4, // Incomplete 3-byte sequence (missing last byte)
1070 ];
1071 assert_eq!(
1072 detect_encoding_or_binary(&utf8_chinese_truncated_2, true).0,
1073 Encoding::Utf8,
1074 "Truncated UTF-8 with 2-byte incomplete sequence should be detected as UTF-8"
1075 );
1076 }
1077
1078 #[test]
1079 fn test_detect_utf8_chinese_with_high_bytes() {
1080 // UTF-8 Chinese text contains many continuation bytes in the 0x80-0xBF range,
1081 // including bytes like 0x9C, 0x9D that happen to be Windows-1250 indicators.
1082 // These should NOT trigger Windows-1250 detection for valid UTF-8 content.
1083
1084 // Chinese characters that use continuation bytes that overlap with Windows-1250 indicators:
1085 // 集 = E9 9B 86 (contains 0x9B)
1086 // 精 = E7 B2 BE (contains 0xB2, 0xBE)
1087 // Build a string with many such characters
1088 let chinese_text = "更多全本全集精校小说"; // Contains various high continuation bytes
1089 let bytes = chinese_text.as_bytes();
1090
1091 assert_eq!(
1092 detect_encoding(bytes),
1093 Encoding::Utf8,
1094 "UTF-8 Chinese text should be detected as UTF-8, not Windows-1250"
1095 );
1096
1097 // Verify these bytes would have triggered Windows-1250 detection if not valid UTF-8
1098 // by checking that the sample contains bytes in the 0x80-0x9F range
1099 let has_high_continuation_bytes = bytes.iter().any(|&b| (0x80..0xA0).contains(&b));
1100 assert!(
1101 has_high_continuation_bytes,
1102 "Test should include bytes that could be mistaken for Windows-1250 indicators"
1103 );
1104 }
1105
1106 #[test]
1107 fn test_detect_utf8_sample_truncation_at_boundary() {
1108 // Simulate what happens when we take an 8KB sample that ends mid-character
1109 // by creating a buffer that's valid UTF-8 except for the last 1-3 bytes
1110
1111 // Build a large UTF-8 Chinese text buffer
1112 let chinese = "我的美女老师"; // "My Beautiful Teacher"
1113 let mut buffer = Vec::new();
1114 // Repeat to make it substantial
1115 for _ in 0..100 {
1116 buffer.extend_from_slice(chinese.as_bytes());
1117 }
1118
1119 // Verify it's valid UTF-8 when complete
1120 assert!(std::str::from_utf8(&buffer).is_ok());
1121 assert_eq!(detect_encoding(&buffer), Encoding::Utf8);
1122
1123 // Now truncate at various points that cut through multi-byte sequences
1124 // Each Chinese character is 3 bytes in UTF-8
1125 for truncate_offset in 1..=3 {
1126 let truncated_len = buffer.len() - truncate_offset;
1127 let truncated_buf = &buffer[..truncated_len];
1128
1129 // The truncated buffer should fail strict UTF-8 validation
1130 // (unless we happen to cut at a character boundary)
1131 let is_strict_valid = std::str::from_utf8(truncated_buf).is_ok();
1132
1133 // With truncated=true, our detection should still detect it as UTF-8
1134 let detected = detect_encoding_or_binary(truncated_buf, true).0;
1135 assert_eq!(
1136 detected,
1137 Encoding::Utf8,
1138 "Truncated UTF-8 at offset -{} should be detected as UTF-8, strict_valid={}",
1139 truncate_offset,
1140 is_strict_valid
1141 );
1142 }
1143 }
1144}