fresh/model/
encoding.rs

//! Text encoding detection and conversion
//!
//! This module handles:
//! - Detecting text encodings from byte content (UTF-8, UTF-16, Latin-1, CJK, etc.)
//! - Binary file detection (distinguishing text from binary content)
//! - Converting between encodings (normalizing to UTF-8 on load, converting back on save)
//!
//! # Encoding Detection Strategy
//!
//! 1. **BOM Detection**: Check for Byte Order Marks (UTF-8 BOM, UTF-16 LE/BE)
//! 2. **UTF-8 Validation**: Fast path for most modern files
//! 3. **UTF-16 Heuristics**: Detect UTF-16 without BOM via null byte patterns
//! 4. **Binary Detection**: Check for control characters that indicate binary content
//! 5. **Statistical Detection**: Use chardetng for legacy encoding detection
//! 6. **Fallback**: Default to Windows-1252 for ambiguous cases
16
17use super::encoding_heuristics::{has_windows1250_pattern, has_windows1251_pattern};
18use schemars::JsonSchema;
19use serde::{Deserialize, Serialize};
20
21// ============================================================================
22// Encoding Type
23// ============================================================================
24
25/// Supported text encodings for file I/O
26///
27/// The editor internally uses UTF-8 for all text processing. When loading files,
28/// content is converted from the detected encoding to UTF-8. When saving, content
29/// is converted back to the original (or user-selected) encoding.
30#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize, JsonSchema)]
31pub enum Encoding {
32    /// UTF-8 (default, most common)
33    #[default]
34    Utf8,
35    /// UTF-8 with Byte Order Mark
36    Utf8Bom,
37    /// UTF-16 Little Endian (Windows default for Unicode files)
38    Utf16Le,
39    /// UTF-16 Big Endian
40    Utf16Be,
41    /// ASCII (7-bit, subset of UTF-8)
42    Ascii,
43    /// Latin-1 / ISO-8859-1 (Western European)
44    Latin1,
45    /// Windows-1252 / CP-1252 (Windows Western European, often called "ANSI")
46    Windows1252,
47    /// Windows-1250 / CP-1250 (Windows Central European)
48    Windows1250,
49    /// Windows-1251 / CP-1251 (Windows Cyrillic)
50    Windows1251,
51    /// GB18030 (Chinese, superset of GBK)
52    Gb18030,
53    /// GBK (Chinese Simplified, subset of GB18030)
54    Gbk,
55    /// Shift-JIS (Japanese)
56    ShiftJis,
57    /// EUC-KR (Korean)
58    EucKr,
59}
60
61impl Encoding {
62    /// Get the display name for status bar
63    pub fn display_name(&self) -> &'static str {
64        match self {
65            Self::Utf8 => "UTF-8",
66            Self::Utf8Bom => "UTF-8 BOM",
67            Self::Utf16Le => "UTF-16 LE",
68            Self::Utf16Be => "UTF-16 BE",
69            Self::Ascii => "ASCII",
70            Self::Latin1 => "Latin-1",
71            Self::Windows1252 => "Windows-1252",
72            Self::Windows1250 => "Windows-1250",
73            Self::Windows1251 => "Windows-1251",
74            Self::Gb18030 => "GB18030",
75            Self::Gbk => "GBK",
76            Self::ShiftJis => "Shift-JIS",
77            Self::EucKr => "EUC-KR",
78        }
79    }
80
81    /// Get a longer description for UI (e.g., command palette)
82    pub fn description(&self) -> &'static str {
83        match self {
84            Self::Utf8 => "UTF-8",
85            Self::Utf8Bom => "UTF-8 with BOM",
86            Self::Utf16Le => "UTF-16 Little Endian",
87            Self::Utf16Be => "UTF-16 Big Endian",
88            Self::Ascii => "US-ASCII",
89            Self::Latin1 => "ISO-8859-1 / Latin-1 – Western European",
90            Self::Windows1252 => "Windows-1252 / CP1252 – Western European",
91            Self::Windows1250 => "Windows-1250 / CP1250 – Central European",
92            Self::Windows1251 => "Windows-1251 / CP1251 – Cyrillic",
93            Self::Gb18030 => "GB18030 – Chinese",
94            Self::Gbk => "GBK / CP936 – Simplified Chinese",
95            Self::ShiftJis => "Shift_JIS – Japanese",
96            Self::EucKr => "EUC-KR – Korean",
97        }
98    }
99
100    /// Get the encoding_rs Encoding for this type
101    pub fn to_encoding_rs(&self) -> &'static encoding_rs::Encoding {
102        match self {
103            Self::Utf8 | Self::Utf8Bom | Self::Ascii => encoding_rs::UTF_8,
104            Self::Utf16Le => encoding_rs::UTF_16LE,
105            Self::Utf16Be => encoding_rs::UTF_16BE,
106            Self::Latin1 => encoding_rs::WINDOWS_1252, // ISO-8859-1 maps to Windows-1252 per WHATWG
107            Self::Windows1252 => encoding_rs::WINDOWS_1252,
108            Self::Windows1250 => encoding_rs::WINDOWS_1250,
109            Self::Windows1251 => encoding_rs::WINDOWS_1251,
110            Self::Gb18030 => encoding_rs::GB18030,
111            Self::Gbk => encoding_rs::GBK,
112            Self::ShiftJis => encoding_rs::SHIFT_JIS,
113            Self::EucKr => encoding_rs::EUC_KR,
114        }
115    }
116
117    /// Returns true if this encoding uses a BOM (Byte Order Mark)
118    pub fn has_bom(&self) -> bool {
119        matches!(self, Self::Utf8Bom | Self::Utf16Le | Self::Utf16Be)
120    }
121
122    /// Get the BOM bytes for this encoding (if any)
123    pub fn bom_bytes(&self) -> Option<&'static [u8]> {
124        match self {
125            Self::Utf8Bom => Some(&[0xEF, 0xBB, 0xBF]),
126            Self::Utf16Le => Some(&[0xFF, 0xFE]),
127            Self::Utf16Be => Some(&[0xFE, 0xFF]),
128            _ => None,
129        }
130    }
131
132    /// All available encodings for UI display
133    pub fn all() -> &'static [Encoding] {
134        &[
135            Self::Utf8,
136            Self::Utf8Bom,
137            Self::Utf16Le,
138            Self::Utf16Be,
139            Self::Ascii,
140            Self::Latin1,
141            Self::Windows1252,
142            Self::Windows1250,
143            Self::Windows1251,
144            Self::Gb18030,
145            Self::Gbk,
146            Self::ShiftJis,
147            Self::EucKr,
148        ]
149    }
150
151    /// Returns true if this encoding supports "resynchronization" - the ability to
152    /// find character boundaries when jumping into the middle of a file.
153    ///
154    /// Resynchronizable encodings can be safely used with lazy/streaming file loading
155    /// because you can determine character boundaries from any position.
156    ///
157    /// - **UTF-8**: Excellent - unique bit patterns distinguish lead/continuation bytes
158    /// - **ASCII/Latin-1/Windows-1252**: Trivial - every byte is a character
159    /// - **UTF-16**: Good with 2-byte alignment - can detect surrogate pairs
160    /// - **UTF-32**: Good with 4-byte alignment
161    ///
162    /// Non-resynchronizable encodings (legacy CJK like Shift-JIS, GB18030, GBK, Big5)
163    /// have ambiguous byte sequences where a byte could be either a standalone character
164    /// or part of a multi-byte sequence. You must scan from the beginning to be certain.
165    pub fn is_resynchronizable(&self) -> bool {
166        match self {
167            // Fixed-width single byte - every byte is a character
168            Self::Ascii
169            | Self::Latin1
170            | Self::Windows1252
171            | Self::Windows1250
172            | Self::Windows1251 => true,
173
174            // UTF-8 has unique bit patterns for lead vs continuation bytes
175            Self::Utf8 | Self::Utf8Bom => true,
176
177            // UTF-16 is resynchronizable with 2-byte alignment
178            // (can detect surrogate pairs by checking 0xD800-0xDFFF range)
179            Self::Utf16Le | Self::Utf16Be => true,
180
181            // Legacy CJK encodings are NOT resynchronizable
182            // The second byte of a double-byte char can equal a valid single-byte char
183            Self::Gb18030 | Self::Gbk | Self::ShiftJis | Self::EucKr => false,
184        }
185    }
186
187    /// Returns the byte alignment required for this encoding when doing random access.
188    ///
189    /// For lazy loading of large files, reads must be aligned to this boundary.
190    /// Returns None if the encoding is not resynchronizable (requires full file scan).
191    pub fn alignment(&self) -> Option<usize> {
192        match self {
193            // Single-byte encodings - no alignment needed
194            Self::Ascii
195            | Self::Latin1
196            | Self::Windows1252
197            | Self::Windows1250
198            | Self::Windows1251 => Some(1),
199
200            // UTF-8 - no alignment needed (self-synchronizing)
201            Self::Utf8 | Self::Utf8Bom => Some(1),
202
203            // UTF-16 - must be 2-byte aligned
204            Self::Utf16Le | Self::Utf16Be => Some(2),
205
206            // Legacy CJK - not resynchronizable, no valid alignment
207            Self::Gb18030 | Self::Gbk | Self::ShiftJis | Self::EucKr => None,
208        }
209    }
210
211    /// Returns true if this encoding requires the entire file to be loaded
212    /// for correct decoding (cannot use lazy/streaming loading).
213    ///
214    /// This is the inverse of `is_resynchronizable()` and indicates that
215    /// the user should be warned before loading large files in this encoding.
216    pub fn requires_full_file_load(&self) -> bool {
217        !self.is_resynchronizable()
218    }
219}
220
221// ============================================================================
222// Encoding Detection
223// ============================================================================
224
225/// Detect the text encoding from a sample of bytes
226///
227/// This function delegates to `detect_encoding_or_binary` and returns only
228/// the encoding, ignoring the binary flag. Use `detect_encoding_or_binary`
229/// when you need to know if the content should be treated as binary.
230pub fn detect_encoding(bytes: &[u8]) -> Encoding {
231    detect_encoding_or_binary(bytes, false).0
232}
233
234/// Detect the text encoding and whether content is binary.
235///
236/// Returns (Encoding, is_binary) where:
237/// - Encoding is the detected encoding (or default if binary)
238/// - is_binary is true if the content should be treated as raw binary
239///
240/// When `truncated` is true, an incomplete multi-byte UTF-8 sequence at the
241/// end of the sample is tolerated (up to 3 bytes) since it likely results from
242/// the caller truncating a larger stream. When false, such trailing bytes cause
243/// the sample to be rejected as UTF-8.
244///
245/// # Detection Strategy
246///
247/// 1. Check for BOM (Byte Order Mark) - highest priority, definitely not binary
248/// 2. Try UTF-8 validation (fast path for most files), definitely not binary
249/// 3. Check for UTF-16 patterns without BOM, definitely not binary
250/// 4. Check for binary control characters (null bytes, etc.) - if found, it's binary
251/// 5. Use chardetng for statistical detection of legacy encodings
252/// 6. If encoding detection is uncertain, default to Windows-1252
253pub fn detect_encoding_or_binary(bytes: &[u8], truncated: bool) -> (Encoding, bool) {
254    // Only check the first 8KB for encoding detection
255    let check_len = bytes.len().min(8 * 1024);
256    let sample = &bytes[..check_len];
257
258    // The caller's `truncated` flag says whether the bytes they passed were
259    // already cut from a larger stream. The detector additionally clamps the
260    // sample to 8 KB internally, which is its own source of truncation — a
261    // multi-byte UTF-8 sequence straddling that cutoff would otherwise fail
262    // strict validation even though the full buffer is valid UTF-8 (#1635).
263    let sample_truncated = truncated || check_len < bytes.len();
264
265    // 1. Check for BOM (Byte Order Mark) - highest priority, definitely text
266    if sample.starts_with(&[0xEF, 0xBB, 0xBF]) {
267        return (Encoding::Utf8Bom, false);
268    }
269    if sample.starts_with(&[0xFF, 0xFE]) {
270        // Could also be UTF-32 LE, but UTF-16 LE is much more common
271        return (Encoding::Utf16Le, false);
272    }
273    if sample.starts_with(&[0xFE, 0xFF]) {
274        return (Encoding::Utf16Be, false);
275    }
276
277    // 2. Try UTF-8 validation (fast path for most modern files)
278    // Note: When we truncate to 8KB, we may cut in the middle of a multi-byte UTF-8 sequence.
279    // We need to handle this case - if most of the sample is valid UTF-8 and the only error
280    // is an incomplete sequence at the very end, we should still detect it as UTF-8.
281    let utf8_valid_len = match std::str::from_utf8(sample) {
282        Ok(_) => sample.len(),
283        Err(e) => {
284            // error_len() returns None if the error is due to incomplete sequence at end
285            // (i.e., unexpected end of input), vs Some(n) for an invalid byte
286            if e.error_len().is_none() {
287                // Incomplete sequence at end - this is likely due to sample truncation
288                e.valid_up_to()
289            } else {
290                // Invalid byte found - not valid UTF-8
291                0
292            }
293        }
294    };
295
296    // If the sample is valid UTF-8, treat it as UTF-8.
297    // When the caller indicates the sample was truncated from a larger stream,
298    // tolerate up to 3 trailing bytes of an incomplete multi-byte sequence (a
299    // truncation artifact). Without truncation, require exact validity — a
300    // trailing 0xE9 in a short file is a Latin-1 'é', not a truncated codepoint.
301    let is_valid_utf8 = utf8_valid_len == sample.len()
302        || (sample_truncated && utf8_valid_len > 0 && utf8_valid_len >= sample.len() - 3);
303    if is_valid_utf8 {
304        let valid_sample = &sample[..utf8_valid_len];
305        // Check if it's pure ASCII (subset of UTF-8)
306        // Also check for binary indicators in valid ASCII/UTF-8
307        let has_binary_control = valid_sample.iter().any(|&b| is_binary_control_char(b));
308        if has_binary_control {
309            return (Encoding::Utf8, true);
310        }
311        // If the tolerance branch accepted a trailing incomplete multi-byte
312        // sequence, the file is not pure ASCII — the byte at `utf8_valid_len`
313        // is a UTF-8 lead byte. Classify as UTF-8 in that case.
314        let has_non_ascii_tail = utf8_valid_len < sample.len();
315        if !has_non_ascii_tail && valid_sample.iter().all(|&b| b < 128) {
316            return (Encoding::Ascii, false);
317        }
318        return (Encoding::Utf8, false);
319    }
320
321    // 3. Check for UTF-16 without BOM (common in some Windows files)
322    // Heuristic: Look for patterns of null bytes alternating with printable chars
323    // The non-null byte should be printable (0x20-0x7E) or a valid high byte
324    //
325    // Note: Unlike UTF-8 above, this heuristic is robust to sample truncation because:
326    // - We use statistical pattern matching (50% threshold), not strict validation
327    // - chunks(2) naturally handles odd-length samples by dropping the last byte
328    // - Losing 1 pair out of ~4096 doesn't affect the detection threshold
329    if sample.len() >= 4 {
330        let is_printable_or_high = |b: u8| (0x20..=0x7E).contains(&b) || b >= 0x80;
331
332        // Align to even boundary to ensure we only process complete 2-byte pairs
333        let aligned_len = sample.len() & !1; // Round down to even
334        let aligned_sample = &sample[..aligned_len];
335
336        let le_pairs = aligned_sample
337            .chunks(2)
338            .filter(|chunk| chunk[1] == 0 && is_printable_or_high(chunk[0]))
339            .count();
340        let be_pairs = aligned_sample
341            .chunks(2)
342            .filter(|chunk| chunk[0] == 0 && is_printable_or_high(chunk[1]))
343            .count();
344        let pair_count = aligned_len / 2;
345
346        // If more than 50% of pairs look like valid UTF-16 text, it's text
347        if le_pairs > pair_count / 2 {
348            return (Encoding::Utf16Le, false);
349        }
350        if be_pairs > pair_count / 2 {
351            return (Encoding::Utf16Be, false);
352        }
353    }
354
355    // 4. Check for binary indicators EARLY (before chardetng)
356    // Binary files often contain control characters and null bytes that should not
357    // appear in any valid text encoding. Check this before chardetng because
358    // chardetng might still be "confident" about some encoding for binary data.
359    let has_binary_control = sample
360        .iter()
361        .any(|&b| b == 0x00 || is_binary_control_char(b));
362    if has_binary_control {
363        return (Encoding::Utf8, true);
364    }
365
366    // 5. Check for Latin-1 patterns: high bytes followed by invalid CJK trail bytes
367    // In GB18030/GBK, trail bytes must be 0x40-0x7E or 0x80-0xFE
368    // If a high byte is followed by a byte outside these ranges (e.g., space, newline,
369    // punctuation < 0x40), it's likely Latin-1, not CJK
370    let has_latin1_pattern = has_latin1_high_byte_pattern(sample);
371
372    // Also check for bytes in CJK-only range (0x81-0x9F) which can only be CJK lead bytes
373    let has_cjk_only_bytes = sample.iter().any(|&b| (0x81..0xA0).contains(&b));
374
375    // 6. Use chardetng for statistical encoding detection
376    let mut detector = chardetng::EncodingDetector::new();
377    detector.feed(sample, true);
378    let (detected_encoding, confident) = detector.guess_assess(None, true);
379
380    // If chardetng is confident, use that encoding (not binary)
381    if confident {
382        let is_cjk_encoding = detected_encoding == encoding_rs::GB18030
383            || detected_encoding == encoding_rs::GBK
384            || detected_encoding == encoding_rs::SHIFT_JIS
385            || detected_encoding == encoding_rs::EUC_KR;
386
387        // For CJK encodings, prefer Windows-1252 if we have clear Latin-1 indicators:
388        // - Space followed by high byte (0xA0-0xFF) is common in Latin-1 text
389        //
390        // If there are CJK-only bytes (0x81-0x9F), it's definitely CJK (not ambiguous).
391        // If there are Latin-1 patterns (space + high byte), prefer Windows-1252.
392        // Otherwise, trust chardetng's detection.
393        if is_cjk_encoding && !has_cjk_only_bytes && has_latin1_pattern {
394            return (Encoding::Windows1252, false);
395        }
396
397        // GBK is a subset of GB18030. Since we only inspect the first 8KB for
398        // detection, the sample may not contain GB18030-only code points (uncommon
399        // Chinese characters, emoji, etc.). Treating GBK as GB18030 is safer and
400        // ensures proper display of all characters including French, Spanish, and emoji.
401        let encoding =
402            if detected_encoding == encoding_rs::GB18030 || detected_encoding == encoding_rs::GBK {
403                Encoding::Gb18030
404            } else if detected_encoding == encoding_rs::SHIFT_JIS {
405                Encoding::ShiftJis
406            } else if detected_encoding == encoding_rs::EUC_KR {
407                Encoding::EucKr
408            } else if detected_encoding == encoding_rs::WINDOWS_1251
409                || detected_encoding == encoding_rs::WINDOWS_1252
410                || detected_encoding == encoding_rs::WINDOWS_1250
411            {
412                // chardetng can't reliably distinguish Latin-1 from Cyrillic for
413                // short samples with ambiguous high bytes — a run like "éééÿ"
414                // (Latin-1) has the same bytes as "еёёя" (Cyrillic) and chardetng
415                // may confidently pick either. Route through the heuristic and
416                // default to Windows-1252 unless there is strong evidence.
417                if has_windows1250_pattern(sample) {
418                    Encoding::Windows1250
419                } else if has_windows1251_pattern(sample) {
420                    Encoding::Windows1251
421                } else {
422                    Encoding::Windows1252
423                }
424            } else if detected_encoding == encoding_rs::UTF_8 {
425                // chardetng thinks it's UTF-8, but validation failed above
426                // Could still be Windows-1250/1251 if it has legacy patterns
427                if has_windows1250_pattern(sample) {
428                    Encoding::Windows1250
429                } else if has_windows1251_pattern(sample) {
430                    Encoding::Windows1251
431                } else {
432                    Encoding::Windows1252
433                }
434            } else {
435                // Unknown encoding - check for Windows-1250/1251 patterns
436                if has_windows1250_pattern(sample) {
437                    Encoding::Windows1250
438                } else if has_windows1251_pattern(sample) {
439                    Encoding::Windows1251
440                } else {
441                    Encoding::Windows1252
442                }
443            };
444        return (encoding, false);
445    }
446
447    // 7. chardetng not confident, but no binary indicators - check for Windows-1250/1251 patterns
448    // We already checked for binary control chars earlier, so this is valid text
449    if has_windows1250_pattern(sample) {
450        (Encoding::Windows1250, false)
451    } else if has_windows1251_pattern(sample) {
452        (Encoding::Windows1251, false)
453    } else {
454        (Encoding::Windows1252, false)
455    }
456}
457
458// ============================================================================
459// Binary Detection Helpers
460// ============================================================================
461
/// Report whether `byte` is a control character that signals binary content.
///
/// True for the C0 control range (0x00-0x1F) and DEL (0x7F), except for the
/// control characters that routinely appear in text: Tab (0x09), LF (0x0A),
/// Vertical Tab (0x0B), Form Feed (0x0C), CR (0x0D) and ESC (0x1B).
pub fn is_binary_control_char(byte: u8) -> bool {
    matches!(
        byte,
        // C0 controls below Tab
        0x00..=0x08
        // C0 controls between CR and ESC
        | 0x0E..=0x1A
        // C0 controls after ESC
        | 0x1C..=0x1F
        // DEL
        | 0x7F
    )
}
478
/// Report whether `sample` contains a byte pattern that points at Latin-1
/// rather than a CJK double-byte encoding.
///
/// In GB18030/GBK a valid sequence is either a standalone ASCII byte
/// (0x00-0x7F) or a lead byte (0x81-0xFE) followed by a trail byte
/// (0x40-0x7E or 0x80-0xFE). The sample is walked left to right, skipping
/// every such valid pair, and the function returns true as soon as it sees:
///
/// 1. an ASCII space immediately followed by a byte >= 0xA0 (like " é"), or
/// 2. a byte >= 0xA0 that does not start a valid pair and is followed by a
///    byte below 0x40 (space, newline, digits, low punctuation).
fn has_latin1_high_byte_pattern(sample: &[u8]) -> bool {
    let mut pos = 0;

    while let Some(&current) = sample.get(pos) {
        let following = sample.get(pos + 1).copied();

        if current.is_ascii() {
            // Space followed by a Latin-1 extended character (not a CJK-only
            // lead byte), common in text like "Hello é".
            if current == b' ' && matches!(following, Some(0xA0..=0xFF)) {
                return true;
            }
            pos += 1;
            continue;
        }

        // High byte (0x80-0xFF): either a Latin-1 character or a CJK lead byte.
        if let Some(next) = following {
            let lead_ok = matches!(current, 0x81..=0xFE);
            let trail_ok = matches!(next, 0x40..=0x7E | 0x80..=0xFE);

            if lead_ok && trail_ok {
                // Plausible CJK pair: consume both bytes.
                pos += 2;
                continue;
            }

            // Not a CJK pair: a high byte followed by low ASCII is a Latin-1
            // indicator.
            if current >= 0xA0 && next < 0x40 {
                return true;
            }
        }

        pos += 1;
    }

    false
}
538
539// ============================================================================
540// Encoding Conversion
541// ============================================================================
542
543/// Detect encoding and convert bytes to UTF-8
544///
545/// Returns the detected encoding and the UTF-8 converted content.
546/// This is the core function for normalizing file content to UTF-8 on load.
547pub fn detect_and_convert(bytes: &[u8]) -> (Encoding, Vec<u8>) {
548    if bytes.is_empty() {
549        return (Encoding::Utf8, Vec::new());
550    }
551
552    let encoding = detect_encoding(bytes);
553
554    // For UTF-8 (with or without BOM), we can use the content directly
555    match encoding {
556        Encoding::Utf8 | Encoding::Ascii => {
557            // Already UTF-8, just clone
558            (encoding, bytes.to_vec())
559        }
560        Encoding::Utf8Bom => {
561            // Skip the BOM (3 bytes) and use the rest
562            let content = if bytes.len() > 3 {
563                bytes[3..].to_vec()
564            } else {
565                Vec::new()
566            };
567            (encoding, content)
568        }
569        Encoding::Utf16Le | Encoding::Utf16Be => {
570            // Decode UTF-16 to UTF-8
571            let enc_rs = encoding.to_encoding_rs();
572            let start_offset =
573                if bytes.starts_with(&[0xFF, 0xFE]) || bytes.starts_with(&[0xFE, 0xFF]) {
574                    2 // Skip BOM
575                } else {
576                    0
577                };
578            let data = &bytes[start_offset..];
579
580            let (cow, _had_errors) = enc_rs.decode_without_bom_handling(data);
581            (encoding, cow.into_owned().into_bytes())
582        }
583        _ => {
584            // Use encoding_rs to convert to UTF-8
585            let enc_rs = encoding.to_encoding_rs();
586            let (cow, _had_errors) = enc_rs.decode_without_bom_handling(bytes);
587            (encoding, cow.into_owned().into_bytes())
588        }
589    }
590}
591
592/// Convert bytes from a specific encoding to UTF-8
593///
594/// Used when opening a file with a user-specified encoding instead of auto-detection.
595/// Returns the UTF-8 converted content.
596pub fn convert_to_utf8(bytes: &[u8], encoding: Encoding) -> Vec<u8> {
597    if bytes.is_empty() {
598        return Vec::new();
599    }
600
601    match encoding {
602        Encoding::Utf8 | Encoding::Ascii => {
603            // Already UTF-8, just clone
604            bytes.to_vec()
605        }
606        Encoding::Utf8Bom => {
607            // Skip the BOM (3 bytes) if present and use the rest
608            if bytes.starts_with(&[0xEF, 0xBB, 0xBF]) && bytes.len() > 3 {
609                bytes[3..].to_vec()
610            } else {
611                bytes.to_vec()
612            }
613        }
614        Encoding::Utf16Le | Encoding::Utf16Be => {
615            // Decode UTF-16 to UTF-8
616            let enc_rs = encoding.to_encoding_rs();
617            let start_offset =
618                if bytes.starts_with(&[0xFF, 0xFE]) || bytes.starts_with(&[0xFE, 0xFF]) {
619                    2 // Skip BOM
620                } else {
621                    0
622                };
623            let data = &bytes[start_offset..];
624
625            let (cow, _had_errors) = enc_rs.decode_without_bom_handling(data);
626            cow.into_owned().into_bytes()
627        }
628        _ => {
629            // Use encoding_rs to convert to UTF-8
630            let enc_rs = encoding.to_encoding_rs();
631            let (cow, _had_errors) = enc_rs.decode_without_bom_handling(bytes);
632            cow.into_owned().into_bytes()
633        }
634    }
635}
636
637/// Convert UTF-8 content to the specified encoding for saving
638///
639/// Used when saving files to convert internal UTF-8 representation
640/// back to the original (or user-selected) encoding.
641///
642/// Note: This does NOT add BOM - the BOM should be handled separately.
643pub fn convert_from_utf8(utf8_bytes: &[u8], encoding: Encoding) -> Vec<u8> {
644    match encoding {
645        Encoding::Utf8 | Encoding::Ascii | Encoding::Utf8Bom => {
646            // UTF-8 (with or without BOM) - just clone, BOM added separately
647            utf8_bytes.to_vec()
648        }
649        Encoding::Utf16Le => {
650            // Convert UTF-8 to UTF-16 LE (no BOM - added separately)
651            let text = String::from_utf8_lossy(utf8_bytes);
652            let mut result = Vec::new();
653            for code_unit in text.encode_utf16() {
654                result.extend_from_slice(&code_unit.to_le_bytes());
655            }
656            result
657        }
658        Encoding::Utf16Be => {
659            // Convert UTF-8 to UTF-16 BE (no BOM - added separately)
660            let text = String::from_utf8_lossy(utf8_bytes);
661            let mut result = Vec::new();
662            for code_unit in text.encode_utf16() {
663                result.extend_from_slice(&code_unit.to_be_bytes());
664            }
665            result
666        }
667        _ => {
668            // Use encoding_rs to convert from UTF-8
669            let enc_rs = encoding.to_encoding_rs();
670            let text = String::from_utf8_lossy(utf8_bytes);
671            let (cow, _encoding_used, _had_errors) = enc_rs.encode(&text);
672            cow.into_owned()
673        }
674    }
675}
676
677// ============================================================================
678// Tests
679// ============================================================================
680
681#[cfg(test)]
682mod tests {
683    use super::*;
684
685    #[test]
686    fn test_encoding_display_names() {
687        assert_eq!(Encoding::Utf8.display_name(), "UTF-8");
688        assert_eq!(Encoding::Utf8Bom.display_name(), "UTF-8 BOM");
689        assert_eq!(Encoding::Utf16Le.display_name(), "UTF-16 LE");
690        assert_eq!(Encoding::Gb18030.display_name(), "GB18030");
691        assert_eq!(Encoding::Windows1250.display_name(), "Windows-1250");
692    }
693
694    #[test]
695    fn test_encoding_bom() {
696        assert!(Encoding::Utf8Bom.has_bom());
697        assert!(Encoding::Utf16Le.has_bom());
698        assert!(!Encoding::Utf8.has_bom());
699        assert!(!Encoding::Windows1252.has_bom());
700        assert!(!Encoding::Windows1250.has_bom());
701    }
702
703    #[test]
704    fn test_detect_utf8() {
705        assert_eq!(detect_encoding(b"Hello, world!"), Encoding::Ascii);
706        assert_eq!(detect_encoding("Hello, 世界!".as_bytes()), Encoding::Utf8);
707    }
708
709    #[test]
710    fn test_detect_utf8_bom() {
711        let with_bom = [0xEF, 0xBB, 0xBF, b'H', b'i'];
712        assert_eq!(detect_encoding(&with_bom), Encoding::Utf8Bom);
713    }
714
715    #[test]
716    fn test_detect_utf16_le() {
717        let utf16_le_bom = [0xFF, 0xFE, b'H', 0x00, b'i', 0x00];
718        assert_eq!(detect_encoding(&utf16_le_bom), Encoding::Utf16Le);
719    }
720
721    #[test]
722    fn test_detect_binary() {
723        let binary_data = [0x00, 0x01, 0x02, 0x03];
724        let (_, is_binary) = detect_encoding_or_binary(&binary_data, false);
725        assert!(is_binary);
726    }
727
728    #[test]
729    fn test_is_binary_control_char() {
730        // Binary control chars
731        assert!(is_binary_control_char(0x00)); // NUL
732        assert!(is_binary_control_char(0x01)); // SOH
733        assert!(is_binary_control_char(0x02)); // STX
734        assert!(is_binary_control_char(0x7F)); // DEL
735
736        // Text control chars (allowed)
737        assert!(!is_binary_control_char(0x09)); // Tab
738        assert!(!is_binary_control_char(0x0A)); // LF
739        assert!(!is_binary_control_char(0x0D)); // CR
740        assert!(!is_binary_control_char(0x1B)); // ESC
741
742        // Regular printable chars
743        assert!(!is_binary_control_char(b'A'));
744        assert!(!is_binary_control_char(b' '));
745    }
746
747    #[test]
748    fn test_convert_roundtrip_utf8() {
749        let original = "Hello, 世界!";
750        let bytes = original.as_bytes();
751
752        let (encoding, utf8_content) = detect_and_convert(bytes);
753        assert_eq!(encoding, Encoding::Utf8);
754        assert_eq!(utf8_content, bytes);
755
756        let back = convert_from_utf8(&utf8_content, encoding);
757        assert_eq!(back, bytes);
758    }
759
760    #[test]
761    fn test_convert_roundtrip_utf16le() {
762        // UTF-16 LE with BOM: "Hi"
763        let utf16_le = [0xFF, 0xFE, b'H', 0x00, b'i', 0x00];
764
765        let (encoding, utf8_content) = detect_and_convert(&utf16_le);
766        assert_eq!(encoding, Encoding::Utf16Le);
767        assert_eq!(utf8_content, b"Hi");
768
769        // Note: convert_from_utf8 doesn't add BOM, so result won't have BOM
770        let back = convert_from_utf8(&utf8_content, encoding);
771        assert_eq!(back, [b'H', 0x00, b'i', 0x00]);
772    }
773
774    #[test]
775    fn test_encoding_resynchronizable() {
776        // Self-synchronizing encodings (can find char boundaries from middle of file)
777        assert!(Encoding::Utf8.is_resynchronizable());
778        assert!(Encoding::Utf8Bom.is_resynchronizable());
779        assert!(Encoding::Ascii.is_resynchronizable());
780        assert!(Encoding::Latin1.is_resynchronizable());
781        assert!(Encoding::Windows1252.is_resynchronizable());
782        assert!(Encoding::Windows1250.is_resynchronizable());
783
784        // UTF-16 is resynchronizable with proper alignment
785        assert!(Encoding::Utf16Le.is_resynchronizable());
786        assert!(Encoding::Utf16Be.is_resynchronizable());
787
788        // Legacy CJK encodings are NOT resynchronizable
789        // (second byte of double-byte char can equal a valid single-byte char)
790        assert!(!Encoding::Gb18030.is_resynchronizable());
791        assert!(!Encoding::Gbk.is_resynchronizable());
792        assert!(!Encoding::ShiftJis.is_resynchronizable());
793        assert!(!Encoding::EucKr.is_resynchronizable());
794    }
795
796    #[test]
797    fn test_encoding_alignment() {
798        // Single-byte encodings have alignment of 1
799        assert_eq!(Encoding::Ascii.alignment(), Some(1));
800        assert_eq!(Encoding::Latin1.alignment(), Some(1));
801        assert_eq!(Encoding::Windows1252.alignment(), Some(1));
802        assert_eq!(Encoding::Windows1250.alignment(), Some(1));
803        assert_eq!(Encoding::Utf8.alignment(), Some(1));
804        assert_eq!(Encoding::Utf8Bom.alignment(), Some(1));
805
806        // UTF-16 requires 2-byte alignment
807        assert_eq!(Encoding::Utf16Le.alignment(), Some(2));
808        assert_eq!(Encoding::Utf16Be.alignment(), Some(2));
809
810        // Non-resynchronizable encodings have no valid alignment
811        assert_eq!(Encoding::Gb18030.alignment(), None);
812        assert_eq!(Encoding::Gbk.alignment(), None);
813        assert_eq!(Encoding::ShiftJis.alignment(), None);
814        assert_eq!(Encoding::EucKr.alignment(), None);
815    }
816
817    #[test]
818    fn test_requires_full_file_load() {
819        // Encodings that can be streamed
820        assert!(!Encoding::Utf8.requires_full_file_load());
821        assert!(!Encoding::Ascii.requires_full_file_load());
822        assert!(!Encoding::Latin1.requires_full_file_load());
823        assert!(!Encoding::Windows1250.requires_full_file_load());
824        assert!(!Encoding::Utf16Le.requires_full_file_load());
825
826        // Encodings that require full loading
827        assert!(Encoding::Gb18030.requires_full_file_load());
828        assert!(Encoding::Gbk.requires_full_file_load());
829        assert!(Encoding::ShiftJis.requires_full_file_load());
830        assert!(Encoding::EucKr.requires_full_file_load());
831    }
832
833    #[test]
834    fn test_convert_roundtrip_windows1250() {
835        // Windows-1250 encoded text with Central European characters
836        // "Zażółć" in Windows-1250: Z(0x5A) a(0x61) ż(0xBF) ó(0xF3) ł(0xB3) ć(0xE6)
837        let windows1250_bytes: &[u8] = &[0x5A, 0x61, 0xBF, 0xF3, 0xB3, 0xE6];
838
839        // Convert to UTF-8
840        let enc_rs = Encoding::Windows1250.to_encoding_rs();
841        let (decoded, _had_errors) = enc_rs.decode_without_bom_handling(windows1250_bytes);
842        let utf8_content = decoded.as_bytes();
843
844        // The UTF-8 content should contain the Polish characters
845        let utf8_str = std::str::from_utf8(utf8_content).unwrap();
846        assert!(utf8_str.contains('ż'), "Should contain ż: {}", utf8_str);
847        assert!(utf8_str.contains('ó'), "Should contain ó: {}", utf8_str);
848        assert!(utf8_str.contains('ł'), "Should contain ł: {}", utf8_str);
849        assert!(utf8_str.contains('ć'), "Should contain ć: {}", utf8_str);
850
851        // Convert back to Windows-1250
852        let back = convert_from_utf8(utf8_content, Encoding::Windows1250);
853        assert_eq!(back, windows1250_bytes, "Round-trip should preserve bytes");
854    }
855
856    #[test]
857    fn test_windows1250_description() {
858        assert_eq!(
859            Encoding::Windows1250.description(),
860            "Windows-1250 / CP1250 – Central European"
861        );
862    }
863
864    #[test]
865    fn test_detect_windows1250_definitive_bytes() {
866        // Bytes 0x8D (Ť), 0x8F (Ź), 0x9D (ť) are undefined in Windows-1252
867        // but valid in Windows-1250, so they definitively indicate Windows-1250
868
869        // Czech text with ť (0x9D): "měsťo" (city, archaic)
870        let with_t_caron = [0x6D, 0x9D, 0x73, 0x74, 0x6F]; // mťsto
871        assert_eq!(
872            detect_encoding(&with_t_caron),
873            Encoding::Windows1250,
874            "Byte 0x9D (ť) should trigger Windows-1250 detection"
875        );
876
877        // Polish text with Ź (0x8F): "Źródło" (source)
878        let with_z_acute_upper = [0x8F, 0x72, 0xF3, 0x64, 0xB3, 0x6F]; // Źródło
879        assert_eq!(
880            detect_encoding(&with_z_acute_upper),
881            Encoding::Windows1250,
882            "Byte 0x8F (Ź) should trigger Windows-1250 detection"
883        );
884    }
885
886    #[test]
887    fn test_detect_windows1250_strong_indicators() {
888        // Polish text with ś (0x9C) and Ś (0x8C) - strong indicators from 0x80-0x9F range
889        let polish_text = [
890            0x9C, 0x77, 0x69, 0x65, 0x74, 0x79, 0x20, // "świety "
891            0x8C, 0x77, 0x69, 0x61, 0x74, // "Świat"
892        ];
893        assert_eq!(
894            detect_encoding(&polish_text),
895            Encoding::Windows1250,
896            "Multiple Polish characters (ś, Ś) should trigger Windows-1250"
897        );
898    }
899
900    #[test]
901    fn test_detect_ambiguous_bytes_as_windows1252() {
902        // Bytes in 0xA0-0xFF range are ambiguous and should default to Windows-1252
903        // Polish "żółć" - ż(0xBF) ó(0xF3) ł(0xB3) ć(0xE6) - all ambiguous
904        let zolc = [0xBF, 0xF3, 0xB3, 0xE6];
905        assert_eq!(
906            detect_encoding(&zolc),
907            Encoding::Windows1252,
908            "Ambiguous bytes should default to Windows-1252"
909        );
910
911        // ą (0xB9) and ł (0xB3) could be ¹ and ³ in Windows-1252
912        let ambiguous = [
913            0x6D, 0xB9, 0x6B, 0x61, 0x20, // "mąka " or "m¹ka "
914            0x6D, 0xB3, 0x6F, 0x64, 0x79, // "młody" or "m³ody"
915        ];
916        assert_eq!(
917            detect_encoding(&ambiguous),
918            Encoding::Windows1252,
919            "Ambiguous Polish bytes should default to Windows-1252"
920        );
921    }
922
923    #[test]
924    fn test_detect_windows1250_czech_pangram() {
925        // "Příliš žluťoučký kůň úpěl ďábelské ódy" - Czech pangram in Windows-1250
926        // Contains ť (0x9D) which is a definitive Windows-1250 indicator
927        let czech_pangram: &[u8] = &[
928            0x50, 0xF8, 0xED, 0x6C, 0x69, 0x9A, 0x20, // "Příliš "
929            0x9E, 0x6C, 0x75, 0x9D, 0x6F, 0x75, 0xE8, 0x6B, 0xFD, 0x20, // "žluťoučký "
930            0x6B, 0xF9, 0xF2, 0x20, // "kůň "
931            0xFA, 0x70, 0xEC, 0x6C, 0x20, // "úpěl "
932            0xEF, 0xE1, 0x62, 0x65, 0x6C, 0x73, 0x6B, 0xE9, 0x20, // "ďábelské "
933            0xF3, 0x64, 0x79, // "ódy"
934        ];
935        assert_eq!(
936            detect_encoding(czech_pangram),
937            Encoding::Windows1250,
938            "Czech pangram should be detected as Windows-1250 (contains ť = 0x9D)"
939        );
940    }
941
942    #[test]
943    fn test_detect_windows1252_not_1250() {
944        // Pure Windows-1252 text without Central European indicators
945        // "Café résumé" in Windows-1252
946        let windows1252_text = [
947            0x43, 0x61, 0x66, 0xE9, 0x20, // "Café "
948            0x72, 0xE9, 0x73, 0x75, 0x6D, 0xE9, // "résumé"
949        ];
950        assert_eq!(
951            detect_encoding(&windows1252_text),
952            Encoding::Windows1252,
953            "French text should remain Windows-1252"
954        );
955    }
956
957    #[test]
958    fn test_convert_roundtrip_windows1251() {
959        // Russian "Привет" (Hello) in Windows-1251:
960        // П=0xCF р=0xF0 и=0xE8 в=0xE2 е=0xE5 т=0xF2
961        let windows1251_bytes: &[u8] = &[0xCF, 0xF0, 0xE8, 0xE2, 0xE5, 0xF2];
962
963        // Convert to UTF-8
964        let enc_rs = Encoding::Windows1251.to_encoding_rs();
965        let (decoded, _had_errors) = enc_rs.decode_without_bom_handling(windows1251_bytes);
966        let utf8_content = decoded.as_bytes();
967
968        let utf8_str = std::str::from_utf8(utf8_content).unwrap();
969        assert_eq!(utf8_str, "Привет", "Should decode to Russian 'Привет'");
970
971        // Convert back to Windows-1251
972        let back = convert_from_utf8(utf8_content, Encoding::Windows1251);
973        assert_eq!(back, windows1251_bytes, "Round-trip should preserve bytes");
974    }
975
976    #[test]
977    fn test_windows1251_display_and_description() {
978        assert_eq!(Encoding::Windows1251.display_name(), "Windows-1251");
979        assert_eq!(
980            Encoding::Windows1251.description(),
981            "Windows-1251 / CP1251 – Cyrillic"
982        );
983    }
984
985    #[test]
986    fn test_windows1251_is_resynchronizable() {
987        assert!(Encoding::Windows1251.is_resynchronizable());
988        assert_eq!(Encoding::Windows1251.alignment(), Some(1));
989        assert!(!Encoding::Windows1251.requires_full_file_load());
990        assert!(!Encoding::Windows1251.has_bom());
991    }
992
993    #[test]
994    fn test_detect_windows1251_russian() {
995        // Russian sentence "Привет мир" (Hello world) in Windows-1251
996        let privet_mir: &[u8] = &[
997            0xCF, 0xF0, 0xE8, 0xE2, 0xE5, 0xF2, // Привет
998            0x20, // space
999            0xEC, 0xE8, 0xF0, // мир
1000        ];
1001        assert_eq!(
1002            detect_encoding(privet_mir),
1003            Encoding::Windows1251,
1004            "Russian sentence should be detected as Windows-1251"
1005        );
1006    }
1007
1008    #[test]
1009    fn test_detect_windows1251_russian_pangram() {
1010        // Russian pangram fragment: "Съешь ещё этих мягких французских булок"
1011        // Contains many Cyrillic letters and the distinctive ё (0xB8) character.
1012        // bytes in Windows-1251:
1013        // С=0xD1 ъ=0xFA е=0xE5 ш=0xF8 ь=0xFC 0x20
1014        // е=0xE5 щ=0xF9 ё=0xB8 0x20
1015        // э=0xFD т=0xF2 и=0xE8 х=0xF5 0x20
1016        // м=0xEC я=0xFF г=0xE3 к=0xEA и=0xE8 х=0xF5 0x20
1017        // ф=0xF4 р=0xF0 а=0xE0 н=0xED ц=0xF6 у=0xF3 з=0xE7 с=0xF1 к=0xEA и=0xE8 х=0xF5 0x20
1018        // б=0xE1 у=0xF3 л=0xEB о=0xEE к=0xEA
1019        let pangram: &[u8] = &[
1020            0xD1, 0xFA, 0xE5, 0xF8, 0xFC, 0x20, 0xE5, 0xF9, 0xB8, 0x20, 0xFD, 0xF2, 0xE8, 0xF5,
1021            0x20, 0xEC, 0xFF, 0xE3, 0xEA, 0xE8, 0xF5, 0x20, 0xF4, 0xF0, 0xE0, 0xED, 0xF6, 0xF3,
1022            0xE7, 0xF1, 0xEA, 0xE8, 0xF5, 0x20, 0xE1, 0xF3, 0xEB, 0xEE, 0xEA,
1023        ];
1024        assert_eq!(
1025            detect_encoding(pangram),
1026            Encoding::Windows1251,
1027            "Russian pangram should be detected as Windows-1251"
1028        );
1029    }
1030
1031    #[test]
1032    fn test_detect_not_windows1251_ambiguous_polish() {
1033        // Regression: 4 consecutive Polish ambiguous bytes must still default
1034        // to Windows-1252, not be mis-detected as Cyrillic by the 1251 heuristic.
1035        let zolc = [0xBF, 0xF3, 0xB3, 0xE6];
1036        assert_eq!(
1037            detect_encoding(&zolc),
1038            Encoding::Windows1252,
1039            "Short ambiguous Polish bytes must not be detected as Windows-1251"
1040        );
1041    }
1042
1043    #[test]
1044    fn test_detect_utf8_chinese_truncated_sequence() {
1045        // Test that UTF-8 Chinese text is correctly detected even when the sample
1046        // is truncated in the middle of a multi-byte sequence.
1047        //
1048        // Bug context: When sampling first 8KB for detection, the boundary may cut
1049        // through a multi-byte UTF-8 character. This caused valid UTF-8 Chinese text
1050        // to fail std::str::from_utf8() validation and fall through to Windows-1250
1051        // detection (because UTF-8 continuation bytes like 0x9C, 0x9D overlap with
1052        // Windows-1250 indicator bytes).
1053
1054        // Chinese text "更多" (more) = [0xE6, 0x9B, 0xB4, 0xE5, 0xA4, 0x9A]
1055        // If we truncate after 0xE5, we get an incomplete sequence
1056        let utf8_chinese_truncated = [
1057            0xE6, 0x9B, 0xB4, // 更
1058            0xE5, 0xA4, 0x9A, // 多
1059            0xE5, // Start of another character, incomplete
1060        ];
1061
1062        // With truncated=true, this should be detected as UTF-8
1063        assert_eq!(
1064            detect_encoding_or_binary(&utf8_chinese_truncated, true).0,
1065            Encoding::Utf8,
1066            "Truncated UTF-8 Chinese text should be detected as UTF-8"
1067        );
1068
1069        // Without truncated flag, the incomplete trailing byte is treated as non-UTF-8
1070        assert_ne!(
1071            detect_encoding_or_binary(&utf8_chinese_truncated, false).0,
1072            Encoding::Utf8,
1073            "Non-truncated short sample with trailing 0xE5 should not be detected as UTF-8"
1074        );
1075
1076        // Test with 2 bytes of incomplete sequence
1077        let utf8_chinese_truncated_2 = [
1078            0xE6, 0x9B, 0xB4, // 更
1079            0xE5, 0xA4, 0x9A, // 多
1080            0xE5, 0xA4, // Incomplete 3-byte sequence (missing last byte)
1081        ];
1082        assert_eq!(
1083            detect_encoding_or_binary(&utf8_chinese_truncated_2, true).0,
1084            Encoding::Utf8,
1085            "Truncated UTF-8 with 2-byte incomplete sequence should be detected as UTF-8"
1086        );
1087    }
1088
1089    #[test]
1090    fn test_detect_utf8_chinese_with_high_bytes() {
1091        // UTF-8 Chinese text contains many continuation bytes in the 0x80-0xBF range,
1092        // including bytes like 0x9C, 0x9D that happen to be Windows-1250 indicators.
1093        // These should NOT trigger Windows-1250 detection for valid UTF-8 content.
1094
1095        // Chinese characters that use continuation bytes that overlap with Windows-1250 indicators:
1096        // 集 = E9 9B 86 (contains 0x9B)
1097        // 精 = E7 B2 BE (contains 0xB2, 0xBE)
1098        // Build a string with many such characters
1099        let chinese_text = "更多全本全集精校小说"; // Contains various high continuation bytes
1100        let bytes = chinese_text.as_bytes();
1101
1102        assert_eq!(
1103            detect_encoding(bytes),
1104            Encoding::Utf8,
1105            "UTF-8 Chinese text should be detected as UTF-8, not Windows-1250"
1106        );
1107
1108        // Verify these bytes would have triggered Windows-1250 detection if not valid UTF-8
1109        // by checking that the sample contains bytes in the 0x80-0x9F range
1110        let has_high_continuation_bytes = bytes.iter().any(|&b| (0x80..0xA0).contains(&b));
1111        assert!(
1112            has_high_continuation_bytes,
1113            "Test should include bytes that could be mistaken for Windows-1250 indicators"
1114        );
1115    }
1116
1117    #[test]
1118    fn test_detect_utf8_sample_truncation_at_boundary() {
1119        // Simulate what happens when we take an 8KB sample that ends mid-character
1120        // by creating a buffer that's valid UTF-8 except for the last 1-3 bytes
1121
1122        // Build a large UTF-8 Chinese text buffer
1123        let chinese = "我的美女老师"; // "My Beautiful Teacher"
1124        let mut buffer = Vec::new();
1125        // Repeat to make it substantial
1126        for _ in 0..100 {
1127            buffer.extend_from_slice(chinese.as_bytes());
1128        }
1129
1130        // Verify it's valid UTF-8 when complete
1131        assert!(std::str::from_utf8(&buffer).is_ok());
1132        assert_eq!(detect_encoding(&buffer), Encoding::Utf8);
1133
1134        // Now truncate at various points that cut through multi-byte sequences
1135        // Each Chinese character is 3 bytes in UTF-8
1136        for truncate_offset in 1..=3 {
1137            let truncated_len = buffer.len() - truncate_offset;
1138            let truncated_buf = &buffer[..truncated_len];
1139
1140            // The truncated buffer should fail strict UTF-8 validation
1141            // (unless we happen to cut at a character boundary)
1142            let is_strict_valid = std::str::from_utf8(truncated_buf).is_ok();
1143
1144            // With truncated=true, our detection should still detect it as UTF-8
1145            let detected = detect_encoding_or_binary(truncated_buf, true).0;
1146            assert_eq!(
1147                detected,
1148                Encoding::Utf8,
1149                "Truncated UTF-8 at offset -{} should be detected as UTF-8, strict_valid={}",
1150                truncate_offset,
1151                is_strict_valid
1152            );
1153        }
1154    }
1155
1156    #[test]
1157    fn test_detect_utf8_cjk_across_internal_8kb_boundary() {
1158        // Regression for #1635: when callers pass the full file bytes with
1159        // `truncated=false`, the detector still internally clamps the sample
1160        // to the first 8 KB. If a multi-byte UTF-8 sequence straddles that
1161        // internal 8192-byte cutoff, the strict UTF-8 check fails and the
1162        // CJK continuation bytes get misclassified by the fallback path
1163        // (often as Windows-1250/1251/1252), garbling the whole file.
1164        //
1165        // The full buffer IS valid UTF-8, so detection must return UTF-8.
1166
1167        // Build padding that is pure ASCII and leaves exactly one byte of
1168        // room inside the first 8 KB before we write a 3-byte CJK codepoint.
1169        // "项" = [0xE9, 0xA1, 0xB9] — the first byte sits at offset 8191,
1170        // the remaining two bytes spill past the internal 8192-byte sample.
1171        let mut buffer = Vec::with_capacity(16 * 1024);
1172        buffer.resize(8 * 1024 - 1, b'a');
1173        // CJK text that crosses the boundary and continues after it.
1174        buffer.extend_from_slice("项目设置项目设置项目设置".as_bytes());
1175
1176        assert!(buffer.len() > 8 * 1024);
1177        assert!(std::str::from_utf8(&buffer).is_ok());
1178
1179        // The caller (load_small_file) passes the full content with
1180        // `truncated=false` because no truncation occurred at the call site.
1181        // Detection must still return UTF-8.
1182        let detected = detect_encoding_or_binary(&buffer, false).0;
1183        assert_eq!(
1184            detected,
1185            Encoding::Utf8,
1186            "Valid UTF-8 CJK content must be detected as UTF-8 even when a \
1187             multi-byte sequence straddles the detector's internal 8 KB sample boundary"
1188        );
1189    }
1190}
fresh/model/encoding.rs

fresh/model/
encoding.rs