fresh/model/
encoding.rs

1//! Text encoding detection and conversion
2//!
3//! This module handles:
4//! - Detecting text encodings from byte content (UTF-8, UTF-16, Latin-1, CJK, etc.)
5//! - Binary file detection (distinguishing text from binary content)
6//! - Converting between encodings (normalizing to UTF-8 on load, converting back on save)
7//!
8//! # Encoding Detection Strategy
9//!
10//! 1. **BOM Detection**: Check for Byte Order Marks (UTF-8 BOM, UTF-16 LE/BE)
11//! 2. **UTF-8 Validation**: Fast path for most modern files
12//! 3. **UTF-16 Heuristics**: Detect UTF-16 without BOM via null byte patterns
13//! 4. **Binary Detection**: Check for control characters that indicate binary content
14//! 5. **Statistical Detection**: Use chardetng for legacy encoding detection
15//! 6. **Fallback**: Default to Windows-1252 for ambiguous cases
16
17use super::encoding_heuristics::{has_windows1250_pattern, has_windows1251_pattern};
18use schemars::JsonSchema;
19use serde::{Deserialize, Serialize};
20
21// ============================================================================
22// Encoding Type
23// ============================================================================
24
/// Supported text encodings for file I/O
///
/// The editor internally uses UTF-8 for all text processing. When loading files,
/// content is converted from the detected encoding to UTF-8. When saving, content
/// is converted back to the original (or user-selected) encoding.
///
/// The type is a plain fieldless enum (`Copy`, comparable, serializable) and
/// `Utf8` is its `Default` value.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize, JsonSchema)]
pub enum Encoding {
    /// UTF-8 without a Byte Order Mark (the default and most common case)
    #[default]
    Utf8,
    /// UTF-8 preceded by the 3-byte Byte Order Mark `EF BB BF`
    Utf8Bom,
    /// UTF-16 Little Endian (Windows default for Unicode files); BOM is `FF FE`
    Utf16Le,
    /// UTF-16 Big Endian; BOM is `FE FF`
    Utf16Be,
    /// ASCII (7-bit, subset of UTF-8); converted exactly like UTF-8
    Ascii,
    /// Latin-1 / ISO-8859-1 (Western European); decoded and encoded through the
    /// Windows-1252 codec, following the WHATWG mapping
    Latin1,
    /// Windows-1252 / CP-1252 (Windows Western European, often called "ANSI")
    Windows1252,
    /// Windows-1250 / CP-1250 (Windows Central European)
    Windows1250,
    /// Windows-1251 / CP-1251 (Windows Cyrillic)
    Windows1251,
    /// GB18030 (Chinese, superset of GBK); auto-detection also reports this
    /// variant when the detector guesses GBK
    Gb18030,
    /// GBK (Chinese Simplified, subset of GB18030)
    Gbk,
    /// Shift-JIS (Japanese)
    ShiftJis,
    /// EUC-KR (Korean)
    EucKr,
}
60
61impl Encoding {
62    /// Get the display name for status bar
63    pub fn display_name(&self) -> &'static str {
64        match self {
65            Self::Utf8 => "UTF-8",
66            Self::Utf8Bom => "UTF-8 BOM",
67            Self::Utf16Le => "UTF-16 LE",
68            Self::Utf16Be => "UTF-16 BE",
69            Self::Ascii => "ASCII",
70            Self::Latin1 => "Latin-1",
71            Self::Windows1252 => "Windows-1252",
72            Self::Windows1250 => "Windows-1250",
73            Self::Windows1251 => "Windows-1251",
74            Self::Gb18030 => "GB18030",
75            Self::Gbk => "GBK",
76            Self::ShiftJis => "Shift-JIS",
77            Self::EucKr => "EUC-KR",
78        }
79    }
80
81    /// Get a longer description for UI (e.g., command palette)
82    pub fn description(&self) -> &'static str {
83        match self {
84            Self::Utf8 => "UTF-8",
85            Self::Utf8Bom => "UTF-8 with BOM",
86            Self::Utf16Le => "UTF-16 Little Endian",
87            Self::Utf16Be => "UTF-16 Big Endian",
88            Self::Ascii => "US-ASCII",
89            Self::Latin1 => "ISO-8859-1 / Latin-1 – Western European",
90            Self::Windows1252 => "Windows-1252 / CP1252 – Western European",
91            Self::Windows1250 => "Windows-1250 / CP1250 – Central European",
92            Self::Windows1251 => "Windows-1251 / CP1251 – Cyrillic",
93            Self::Gb18030 => "GB18030 – Chinese",
94            Self::Gbk => "GBK / CP936 – Simplified Chinese",
95            Self::ShiftJis => "Shift_JIS – Japanese",
96            Self::EucKr => "EUC-KR – Korean",
97        }
98    }
99
100    /// Get the encoding_rs Encoding for this type
101    pub fn to_encoding_rs(&self) -> &'static encoding_rs::Encoding {
102        match self {
103            Self::Utf8 | Self::Utf8Bom | Self::Ascii => encoding_rs::UTF_8,
104            Self::Utf16Le => encoding_rs::UTF_16LE,
105            Self::Utf16Be => encoding_rs::UTF_16BE,
106            Self::Latin1 => encoding_rs::WINDOWS_1252, // ISO-8859-1 maps to Windows-1252 per WHATWG
107            Self::Windows1252 => encoding_rs::WINDOWS_1252,
108            Self::Windows1250 => encoding_rs::WINDOWS_1250,
109            Self::Windows1251 => encoding_rs::WINDOWS_1251,
110            Self::Gb18030 => encoding_rs::GB18030,
111            Self::Gbk => encoding_rs::GBK,
112            Self::ShiftJis => encoding_rs::SHIFT_JIS,
113            Self::EucKr => encoding_rs::EUC_KR,
114        }
115    }
116
117    /// Returns true if this encoding uses a BOM (Byte Order Mark)
118    pub fn has_bom(&self) -> bool {
119        matches!(self, Self::Utf8Bom | Self::Utf16Le | Self::Utf16Be)
120    }
121
122    /// Get the BOM bytes for this encoding (if any)
123    pub fn bom_bytes(&self) -> Option<&'static [u8]> {
124        match self {
125            Self::Utf8Bom => Some(&[0xEF, 0xBB, 0xBF]),
126            Self::Utf16Le => Some(&[0xFF, 0xFE]),
127            Self::Utf16Be => Some(&[0xFE, 0xFF]),
128            _ => None,
129        }
130    }
131
132    /// All available encodings for UI display
133    pub fn all() -> &'static [Encoding] {
134        &[
135            Self::Utf8,
136            Self::Utf8Bom,
137            Self::Utf16Le,
138            Self::Utf16Be,
139            Self::Ascii,
140            Self::Latin1,
141            Self::Windows1252,
142            Self::Windows1250,
143            Self::Windows1251,
144            Self::Gb18030,
145            Self::Gbk,
146            Self::ShiftJis,
147            Self::EucKr,
148        ]
149    }
150
151    /// Returns true if this encoding supports "resynchronization" - the ability to
152    /// find character boundaries when jumping into the middle of a file.
153    ///
154    /// Resynchronizable encodings can be safely used with lazy/streaming file loading
155    /// because you can determine character boundaries from any position.
156    ///
157    /// - **UTF-8**: Excellent - unique bit patterns distinguish lead/continuation bytes
158    /// - **ASCII/Latin-1/Windows-1252**: Trivial - every byte is a character
159    /// - **UTF-16**: Good with 2-byte alignment - can detect surrogate pairs
160    /// - **UTF-32**: Good with 4-byte alignment
161    ///
162    /// Non-resynchronizable encodings (legacy CJK like Shift-JIS, GB18030, GBK, Big5)
163    /// have ambiguous byte sequences where a byte could be either a standalone character
164    /// or part of a multi-byte sequence. You must scan from the beginning to be certain.
165    pub fn is_resynchronizable(&self) -> bool {
166        match self {
167            // Fixed-width single byte - every byte is a character
168            Self::Ascii
169            | Self::Latin1
170            | Self::Windows1252
171            | Self::Windows1250
172            | Self::Windows1251 => true,
173
174            // UTF-8 has unique bit patterns for lead vs continuation bytes
175            Self::Utf8 | Self::Utf8Bom => true,
176
177            // UTF-16 is resynchronizable with 2-byte alignment
178            // (can detect surrogate pairs by checking 0xD800-0xDFFF range)
179            Self::Utf16Le | Self::Utf16Be => true,
180
181            // Legacy CJK encodings are NOT resynchronizable
182            // The second byte of a double-byte char can equal a valid single-byte char
183            Self::Gb18030 | Self::Gbk | Self::ShiftJis | Self::EucKr => false,
184        }
185    }
186
187    /// Returns the byte alignment required for this encoding when doing random access.
188    ///
189    /// For lazy loading of large files, reads must be aligned to this boundary.
190    /// Returns None if the encoding is not resynchronizable (requires full file scan).
191    pub fn alignment(&self) -> Option<usize> {
192        match self {
193            // Single-byte encodings - no alignment needed
194            Self::Ascii
195            | Self::Latin1
196            | Self::Windows1252
197            | Self::Windows1250
198            | Self::Windows1251 => Some(1),
199
200            // UTF-8 - no alignment needed (self-synchronizing)
201            Self::Utf8 | Self::Utf8Bom => Some(1),
202
203            // UTF-16 - must be 2-byte aligned
204            Self::Utf16Le | Self::Utf16Be => Some(2),
205
206            // Legacy CJK - not resynchronizable, no valid alignment
207            Self::Gb18030 | Self::Gbk | Self::ShiftJis | Self::EucKr => None,
208        }
209    }
210
211    /// Returns true if this encoding requires the entire file to be loaded
212    /// for correct decoding (cannot use lazy/streaming loading).
213    ///
214    /// This is the inverse of `is_resynchronizable()` and indicates that
215    /// the user should be warned before loading large files in this encoding.
216    pub fn requires_full_file_load(&self) -> bool {
217        !self.is_resynchronizable()
218    }
219}
220
221// ============================================================================
222// Encoding Detection
223// ============================================================================
224
225/// Detect the text encoding from a sample of bytes
226///
227/// This function delegates to `detect_encoding_or_binary` and returns only
228/// the encoding, ignoring the binary flag. Use `detect_encoding_or_binary`
229/// when you need to know if the content should be treated as binary.
230pub fn detect_encoding(bytes: &[u8]) -> Encoding {
231    detect_encoding_or_binary(bytes, false).0
232}
233
234/// Detect the text encoding and whether content is binary.
235///
236/// Returns (Encoding, is_binary) where:
237/// - Encoding is the detected encoding (or default if binary)
238/// - is_binary is true if the content should be treated as raw binary
239///
240/// When `truncated` is true, an incomplete multi-byte UTF-8 sequence at the
241/// end of the sample is tolerated (up to 3 bytes) since it likely results from
242/// the caller truncating a larger stream. When false, such trailing bytes cause
243/// the sample to be rejected as UTF-8.
244///
245/// # Detection Strategy
246///
247/// 1. Check for BOM (Byte Order Mark) - highest priority, definitely not binary
248/// 2. Try UTF-8 validation (fast path for most files), definitely not binary
249/// 3. Check for UTF-16 patterns without BOM, definitely not binary
250/// 4. Check for binary control characters (null bytes, etc.) - if found, it's binary
251/// 5. Use chardetng for statistical detection of legacy encodings
252/// 6. If encoding detection is uncertain, default to Windows-1252
253pub fn detect_encoding_or_binary(bytes: &[u8], truncated: bool) -> (Encoding, bool) {
254    // Only check the first 8KB for encoding detection.
255    let check_len = bytes.len().min(8 * 1024);
256    let sample = &bytes[..check_len];
257
258    // The caller's `truncated` flag says whether the bytes they passed were
259    // already cut from a larger stream. The detector additionally clamps the
260    // sample to 8 KB internally, which is its own source of truncation — a
261    // multi-byte UTF-8 sequence straddling that cutoff would otherwise fail
262    // strict validation even though the full buffer is valid UTF-8 (#1635).
263    let sample_truncated = truncated || check_len < bytes.len();
264
265    // Run the detection phases in priority order, returning at the first one
266    // that reaches a verdict. See the doc comment above for the strategy.
267    if let Some(result) = detect_by_bom(sample) {
268        return result;
269    }
270    if let Some(result) = detect_utf8(sample, sample_truncated) {
271        return result;
272    }
273    if let Some(result) = detect_utf16_without_bom(sample) {
274        return result;
275    }
276    detect_legacy_encoding(sample)
277}
278
279/// Phase 1: detect a leading Byte Order Mark. A BOM is definitive — the content
280/// is text in the marked encoding. Returns `None` when no BOM is present.
281fn detect_by_bom(sample: &[u8]) -> Option<(Encoding, bool)> {
282    if sample.starts_with(&[0xEF, 0xBB, 0xBF]) {
283        Some((Encoding::Utf8Bom, false))
284    } else if sample.starts_with(&[0xFF, 0xFE]) {
285        // Could also be UTF-32 LE, but UTF-16 LE is much more common.
286        Some((Encoding::Utf16Le, false))
287    } else if sample.starts_with(&[0xFE, 0xFF]) {
288        Some((Encoding::Utf16Be, false))
289    } else {
290        None
291    }
292}
293
294/// Phase 2: validate as UTF-8 (the fast path for most modern files). Returns
295/// `None` when the sample is not valid UTF-8, leaving the verdict to later
296/// phases.
297fn detect_utf8(sample: &[u8], sample_truncated: bool) -> Option<(Encoding, bool)> {
298    // When we truncate to 8KB we may cut in the middle of a multi-byte UTF-8
299    // sequence. If the only error is an incomplete sequence at the very end,
300    // treat the valid prefix as UTF-8 rather than rejecting the whole sample.
301    let utf8_valid_len = match std::str::from_utf8(sample) {
302        Ok(_) => sample.len(),
303        // error_len() is None for an incomplete sequence at end-of-input (a
304        // likely truncation artifact), vs Some(n) for a genuinely invalid byte.
305        Err(e) if e.error_len().is_none() => e.valid_up_to(),
306        Err(_) => 0,
307    };
308
309    // Accept exact validity; or, when the caller flagged truncation, tolerate
310    // up to 3 trailing bytes of an incomplete multi-byte sequence. Without
311    // truncation a trailing 0xE9 in a short file is a Latin-1 'é', not a cut
312    // codepoint, so we require exact validity there.
313    let is_valid_utf8 = utf8_valid_len == sample.len()
314        || (sample_truncated && utf8_valid_len > 0 && utf8_valid_len >= sample.len() - 3);
315    if !is_valid_utf8 {
316        return None;
317    }
318
319    let valid_sample = &sample[..utf8_valid_len];
320    if valid_sample.iter().any(|&b| is_binary_control_char(b)) {
321        return Some((Encoding::Utf8, true));
322    }
323    // If the tolerance branch accepted a trailing incomplete multi-byte
324    // sequence, the file is not pure ASCII — the byte at `utf8_valid_len` is a
325    // UTF-8 lead byte — so classify it as UTF-8.
326    let has_non_ascii_tail = utf8_valid_len < sample.len();
327    if !has_non_ascii_tail && valid_sample.iter().all(|&b| b < 128) {
328        return Some((Encoding::Ascii, false));
329    }
330    Some((Encoding::Utf8, false))
331}
332
333/// Phase 3: detect BOM-less UTF-16 (common in some Windows files) by looking
334/// for null bytes alternating with printable characters.
335///
336/// Unlike UTF-8 above, this heuristic is robust to sample truncation: it uses
337/// statistical pattern matching (50% threshold) over complete 2-byte pairs, so
338/// losing one pair out of ~4096 does not affect the verdict. Returns `None`
339/// when neither orientation crosses the threshold.
340fn detect_utf16_without_bom(sample: &[u8]) -> Option<(Encoding, bool)> {
341    if sample.len() < 4 {
342        return None;
343    }
344
345    let is_printable_or_high = |b: u8| (0x20..=0x7E).contains(&b) || b >= 0x80;
346
347    // Align to an even boundary so we only process complete 2-byte pairs.
348    let aligned_len = sample.len() & !1;
349    let aligned_sample = &sample[..aligned_len];
350
351    let le_pairs = aligned_sample
352        .chunks(2)
353        .filter(|chunk| chunk[1] == 0 && is_printable_or_high(chunk[0]))
354        .count();
355    let be_pairs = aligned_sample
356        .chunks(2)
357        .filter(|chunk| chunk[0] == 0 && is_printable_or_high(chunk[1]))
358        .count();
359    let pair_count = aligned_len / 2;
360
361    // More than 50% of pairs looking like UTF-16 text means it is text.
362    if le_pairs > pair_count / 2 {
363        Some((Encoding::Utf16Le, false))
364    } else if be_pairs > pair_count / 2 {
365        Some((Encoding::Utf16Be, false))
366    } else {
367        None
368    }
369}
370
371/// Phase 4-7: the sample is neither valid UTF-8 nor UTF-16. Reject binary
372/// content, then use chardetng plus heuristics to pick a legacy 8-bit (or CJK)
373/// encoding. Always reaches a verdict.
374fn detect_legacy_encoding(sample: &[u8]) -> (Encoding, bool) {
375    // Binary files often contain null bytes and control characters that appear
376    // in no valid text encoding. Check this before chardetng, which can still
377    // be "confident" about an encoding for binary data.
378    if sample
379        .iter()
380        .any(|&b| b == 0x00 || is_binary_control_char(b))
381    {
382        return (Encoding::Utf8, true);
383    }
384
385    // High bytes followed by invalid CJK trail bytes (space, newline,
386    // punctuation < 0x40) indicate Latin-1 rather than GB18030/GBK.
387    let has_latin1_pattern = has_latin1_high_byte_pattern(sample);
388    // Bytes in 0x81-0x9F can only be CJK lead bytes.
389    let has_cjk_only_bytes = sample.iter().any(|&b| (0x81..0xA0).contains(&b));
390
391    let mut detector = chardetng::EncodingDetector::new();
392    detector.feed(sample, true);
393    let (detected_encoding, confident) = detector.guess_assess(None, true);
394
395    if !confident {
396        // No binary indicators (checked above), so this is valid legacy text.
397        return (windows_125x_fallback(sample), false);
398    }
399
400    let is_cjk_encoding = detected_encoding == encoding_rs::GB18030
401        || detected_encoding == encoding_rs::GBK
402        || detected_encoding == encoding_rs::SHIFT_JIS
403        || detected_encoding == encoding_rs::EUC_KR;
404
405    // For CJK encodings with no CJK-only bytes but clear Latin-1 indicators
406    // (a space followed by a high byte), prefer Windows-1252.
407    if is_cjk_encoding && !has_cjk_only_bytes && has_latin1_pattern {
408        return (Encoding::Windows1252, false);
409    }
410
411    // GBK is a subset of GB18030. Because we only inspect the first 8KB, the
412    // sample may lack GB18030-only code points, so treating GBK as GB18030 is
413    // safer and still renders French, Spanish, emoji, etc.
414    let encoding =
415        if detected_encoding == encoding_rs::GB18030 || detected_encoding == encoding_rs::GBK {
416            Encoding::Gb18030
417        } else if detected_encoding == encoding_rs::SHIFT_JIS {
418            Encoding::ShiftJis
419        } else if detected_encoding == encoding_rs::EUC_KR {
420            Encoding::EucKr
421        } else {
422            // chardetng cannot reliably distinguish Latin-1 from Cyrillic for short
423            // samples with ambiguous high bytes — "éééÿ" (Latin-1) shares bytes with
424            // "еёёя" (Cyrillic). It may also report UTF-8 even though validation
425            // failed above. Route every remaining case through the same heuristic,
426            // defaulting to Windows-1252 unless there is strong evidence otherwise.
427            windows_125x_fallback(sample)
428        };
429    (encoding, false)
430}
431
432/// Disambiguate an ambiguous Windows code page from the sample's byte patterns,
433/// defaulting to Windows-1252 (Western European) when no stronger signal is
434/// present.
435fn windows_125x_fallback(sample: &[u8]) -> Encoding {
436    if has_windows1250_pattern(sample) {
437        Encoding::Windows1250
438    } else if has_windows1251_pattern(sample) {
439        Encoding::Windows1251
440    } else {
441        Encoding::Windows1252
442    }
443}
444
445// ============================================================================
446// Binary Detection Helpers
447// ============================================================================
448
/// Check whether a byte is a control character that signals binary content.
///
/// Returns true for the C0 control range (`0x00..=0x1F`) and DEL (`0x7F`),
/// except for the control characters that routinely appear in text: Tab
/// (`0x09`), LF (`0x0A`), Vertical Tab (`0x0B`), Form Feed (`0x0C`), CR (`0x0D`)
/// and ESC (`0x1B`). Every other byte, including all bytes `>= 0x80`, is not a
/// binary indicator.
pub fn is_binary_control_char(byte: u8) -> bool {
    match byte {
        // Tab, LF, VT, FF, CR and ESC are ordinary in text files.
        0x09..=0x0D | 0x1B => false,
        // Remaining C0 controls (including NUL) and DEL.
        0x00..=0x1F | 0x7F => true,
        _ => false,
    }
}
465
/// Check whether the sample contains byte patterns that point to Latin-1 text
/// rather than a CJK double-byte encoding.
///
/// The scan models GB18030/GBK double-byte sequences as a lead byte
/// (`0x81..=0xFE`) followed by a trail byte (`0x40..=0x7E` or `0x80..=0xFE`)
/// and skips such pairs as a unit. It returns true as soon as it sees either:
///
/// 1. an ASCII space immediately followed by a byte `>= 0xA0` (like `" é"`), or
/// 2. a byte `>= 0xA0` that does not start a valid pair and is followed by a
///    byte below `0x40` (space, newline, digit, punctuation, ...).
///
/// Returns false when no such pattern exists, including for empty input.
fn has_latin1_high_byte_pattern(sample: &[u8]) -> bool {
    let mut pos = 0;

    while let Some(&current) = sample.get(pos) {
        let following = sample.get(pos + 1).copied();

        if current.is_ascii() {
            // Space followed by a Latin-1 extended character, as in "Hello é".
            if current == b' ' && matches!(following, Some(next) if next >= 0xA0) {
                return true;
            }
            pos += 1;
            continue;
        }

        // High byte: either a Latin-1 character or a CJK lead byte.
        match following {
            Some(trail)
                if matches!(current, 0x81..=0xFE)
                    && matches!(trail, 0x40..=0x7E | 0x80..=0xFE) =>
            {
                // Plausible CJK pair: consume both bytes.
                pos += 2;
            }
            Some(trail) => {
                // Not a CJK pair; a high byte followed by low ASCII indicates Latin-1.
                if current >= 0xA0 && trail < 0x40 {
                    return true;
                }
                pos += 1;
            }
            // Lone high byte at the end of the sample carries no signal.
            None => pos += 1,
        }
    }

    false
}
525
526// ============================================================================
527// Encoding Conversion
528// ============================================================================
529
530/// Detect encoding and convert bytes to UTF-8
531///
532/// Returns the detected encoding and the UTF-8 converted content.
533/// This is the core function for normalizing file content to UTF-8 on load.
534pub fn detect_and_convert(bytes: &[u8]) -> (Encoding, Vec<u8>) {
535    if bytes.is_empty() {
536        return (Encoding::Utf8, Vec::new());
537    }
538
539    let encoding = detect_encoding(bytes);
540
541    // For UTF-8 (with or without BOM), we can use the content directly
542    match encoding {
543        Encoding::Utf8 | Encoding::Ascii => {
544            // Already UTF-8, just clone
545            (encoding, bytes.to_vec())
546        }
547        Encoding::Utf8Bom => {
548            // Skip the BOM (3 bytes) and use the rest
549            let content = if bytes.len() > 3 {
550                bytes[3..].to_vec()
551            } else {
552                Vec::new()
553            };
554            (encoding, content)
555        }
556        Encoding::Utf16Le | Encoding::Utf16Be => {
557            // Decode UTF-16 to UTF-8
558            let enc_rs = encoding.to_encoding_rs();
559            let start_offset =
560                if bytes.starts_with(&[0xFF, 0xFE]) || bytes.starts_with(&[0xFE, 0xFF]) {
561                    2 // Skip BOM
562                } else {
563                    0
564                };
565            let data = &bytes[start_offset..];
566
567            let (cow, _had_errors) = enc_rs.decode_without_bom_handling(data);
568            (encoding, cow.into_owned().into_bytes())
569        }
570        _ => {
571            // Use encoding_rs to convert to UTF-8
572            let enc_rs = encoding.to_encoding_rs();
573            let (cow, _had_errors) = enc_rs.decode_without_bom_handling(bytes);
574            (encoding, cow.into_owned().into_bytes())
575        }
576    }
577}
578
579/// Convert bytes from a specific encoding to UTF-8
580///
581/// Used when opening a file with a user-specified encoding instead of auto-detection.
582/// Returns the UTF-8 converted content.
583pub fn convert_to_utf8(bytes: &[u8], encoding: Encoding) -> Vec<u8> {
584    if bytes.is_empty() {
585        return Vec::new();
586    }
587
588    match encoding {
589        Encoding::Utf8 | Encoding::Ascii => {
590            // Already UTF-8, just clone
591            bytes.to_vec()
592        }
593        Encoding::Utf8Bom => {
594            // Skip the BOM (3 bytes) if present and use the rest
595            if bytes.starts_with(&[0xEF, 0xBB, 0xBF]) && bytes.len() > 3 {
596                bytes[3..].to_vec()
597            } else {
598                bytes.to_vec()
599            }
600        }
601        Encoding::Utf16Le | Encoding::Utf16Be => {
602            // Decode UTF-16 to UTF-8
603            let enc_rs = encoding.to_encoding_rs();
604            let start_offset =
605                if bytes.starts_with(&[0xFF, 0xFE]) || bytes.starts_with(&[0xFE, 0xFF]) {
606                    2 // Skip BOM
607                } else {
608                    0
609                };
610            let data = &bytes[start_offset..];
611
612            let (cow, _had_errors) = enc_rs.decode_without_bom_handling(data);
613            cow.into_owned().into_bytes()
614        }
615        _ => {
616            // Use encoding_rs to convert to UTF-8
617            let enc_rs = encoding.to_encoding_rs();
618            let (cow, _had_errors) = enc_rs.decode_without_bom_handling(bytes);
619            cow.into_owned().into_bytes()
620        }
621    }
622}
623
624/// Convert UTF-8 content to the specified encoding for saving
625///
626/// Used when saving files to convert internal UTF-8 representation
627/// back to the original (or user-selected) encoding.
628///
629/// Note: This does NOT add BOM - the BOM should be handled separately.
630pub fn convert_from_utf8(utf8_bytes: &[u8], encoding: Encoding) -> Vec<u8> {
631    match encoding {
632        Encoding::Utf8 | Encoding::Ascii | Encoding::Utf8Bom => {
633            // UTF-8 (with or without BOM) - just clone, BOM added separately
634            utf8_bytes.to_vec()
635        }
636        Encoding::Utf16Le => {
637            // Convert UTF-8 to UTF-16 LE (no BOM - added separately)
638            let text = String::from_utf8_lossy(utf8_bytes);
639            let mut result = Vec::new();
640            for code_unit in text.encode_utf16() {
641                result.extend_from_slice(&code_unit.to_le_bytes());
642            }
643            result
644        }
645        Encoding::Utf16Be => {
646            // Convert UTF-8 to UTF-16 BE (no BOM - added separately)
647            let text = String::from_utf8_lossy(utf8_bytes);
648            let mut result = Vec::new();
649            for code_unit in text.encode_utf16() {
650                result.extend_from_slice(&code_unit.to_be_bytes());
651            }
652            result
653        }
654        _ => {
655            // Use encoding_rs to convert from UTF-8
656            let enc_rs = encoding.to_encoding_rs();
657            let text = String::from_utf8_lossy(utf8_bytes);
658            let (cow, _encoding_used, _had_errors) = enc_rs.encode(&text);
659            cow.into_owned()
660        }
661    }
662}
663
664// ============================================================================
665// Tests
666// ============================================================================
667
668#[cfg(test)]
669mod tests {
670    use super::*;
671
672    #[test]
673    fn test_encoding_display_names() {
674        assert_eq!(Encoding::Utf8.display_name(), "UTF-8");
675        assert_eq!(Encoding::Utf8Bom.display_name(), "UTF-8 BOM");
676        assert_eq!(Encoding::Utf16Le.display_name(), "UTF-16 LE");
677        assert_eq!(Encoding::Gb18030.display_name(), "GB18030");
678        assert_eq!(Encoding::Windows1250.display_name(), "Windows-1250");
679    }
680
681    #[test]
682    fn test_encoding_bom() {
683        assert!(Encoding::Utf8Bom.has_bom());
684        assert!(Encoding::Utf16Le.has_bom());
685        assert!(!Encoding::Utf8.has_bom());
686        assert!(!Encoding::Windows1252.has_bom());
687        assert!(!Encoding::Windows1250.has_bom());
688    }
689
690    #[test]
691    fn test_detect_utf8() {
692        assert_eq!(detect_encoding(b"Hello, world!"), Encoding::Ascii);
693        assert_eq!(detect_encoding("Hello, 世界!".as_bytes()), Encoding::Utf8);
694    }
695
696    #[test]
697    fn test_detect_utf8_bom() {
698        let with_bom = [0xEF, 0xBB, 0xBF, b'H', b'i'];
699        assert_eq!(detect_encoding(&with_bom), Encoding::Utf8Bom);
700    }
701
702    #[test]
703    fn test_detect_utf16_le() {
704        let utf16_le_bom = [0xFF, 0xFE, b'H', 0x00, b'i', 0x00];
705        assert_eq!(detect_encoding(&utf16_le_bom), Encoding::Utf16Le);
706    }
707
708    #[test]
709    fn test_detect_binary() {
710        let binary_data = [0x00, 0x01, 0x02, 0x03];
711        let (_, is_binary) = detect_encoding_or_binary(&binary_data, false);
712        assert!(is_binary);
713    }
714
715    #[test]
716    fn test_is_binary_control_char() {
717        // Binary control chars
718        assert!(is_binary_control_char(0x00)); // NUL
719        assert!(is_binary_control_char(0x01)); // SOH
720        assert!(is_binary_control_char(0x02)); // STX
721        assert!(is_binary_control_char(0x7F)); // DEL
722
723        // Text control chars (allowed)
724        assert!(!is_binary_control_char(0x09)); // Tab
725        assert!(!is_binary_control_char(0x0A)); // LF
726        assert!(!is_binary_control_char(0x0D)); // CR
727        assert!(!is_binary_control_char(0x1B)); // ESC
728
729        // Regular printable chars
730        assert!(!is_binary_control_char(b'A'));
731        assert!(!is_binary_control_char(b' '));
732    }
733
734    #[test]
735    fn test_convert_roundtrip_utf8() {
736        let original = "Hello, 世界!";
737        let bytes = original.as_bytes();
738
739        let (encoding, utf8_content) = detect_and_convert(bytes);
740        assert_eq!(encoding, Encoding::Utf8);
741        assert_eq!(utf8_content, bytes);
742
743        let back = convert_from_utf8(&utf8_content, encoding);
744        assert_eq!(back, bytes);
745    }
746
747    #[test]
748    fn test_convert_roundtrip_utf16le() {
749        // UTF-16 LE with BOM: "Hi"
750        let utf16_le = [0xFF, 0xFE, b'H', 0x00, b'i', 0x00];
751
752        let (encoding, utf8_content) = detect_and_convert(&utf16_le);
753        assert_eq!(encoding, Encoding::Utf16Le);
754        assert_eq!(utf8_content, b"Hi");
755
756        // Note: convert_from_utf8 doesn't add BOM, so result won't have BOM
757        let back = convert_from_utf8(&utf8_content, encoding);
758        assert_eq!(back, [b'H', 0x00, b'i', 0x00]);
759    }
760
761    #[test]
762    fn test_encoding_resynchronizable() {
763        // Self-synchronizing encodings (can find char boundaries from middle of file)
764        assert!(Encoding::Utf8.is_resynchronizable());
765        assert!(Encoding::Utf8Bom.is_resynchronizable());
766        assert!(Encoding::Ascii.is_resynchronizable());
767        assert!(Encoding::Latin1.is_resynchronizable());
768        assert!(Encoding::Windows1252.is_resynchronizable());
769        assert!(Encoding::Windows1250.is_resynchronizable());
770
771        // UTF-16 is resynchronizable with proper alignment
772        assert!(Encoding::Utf16Le.is_resynchronizable());
773        assert!(Encoding::Utf16Be.is_resynchronizable());
774
775        // Legacy CJK encodings are NOT resynchronizable
776        // (second byte of double-byte char can equal a valid single-byte char)
777        assert!(!Encoding::Gb18030.is_resynchronizable());
778        assert!(!Encoding::Gbk.is_resynchronizable());
779        assert!(!Encoding::ShiftJis.is_resynchronizable());
780        assert!(!Encoding::EucKr.is_resynchronizable());
781    }
782
783    #[test]
784    fn test_encoding_alignment() {
785        // Single-byte encodings have alignment of 1
786        assert_eq!(Encoding::Ascii.alignment(), Some(1));
787        assert_eq!(Encoding::Latin1.alignment(), Some(1));
788        assert_eq!(Encoding::Windows1252.alignment(), Some(1));
789        assert_eq!(Encoding::Windows1250.alignment(), Some(1));
790        assert_eq!(Encoding::Utf8.alignment(), Some(1));
791        assert_eq!(Encoding::Utf8Bom.alignment(), Some(1));
792
793        // UTF-16 requires 2-byte alignment
794        assert_eq!(Encoding::Utf16Le.alignment(), Some(2));
795        assert_eq!(Encoding::Utf16Be.alignment(), Some(2));
796
797        // Non-resynchronizable encodings have no valid alignment
798        assert_eq!(Encoding::Gb18030.alignment(), None);
799        assert_eq!(Encoding::Gbk.alignment(), None);
800        assert_eq!(Encoding::ShiftJis.alignment(), None);
801        assert_eq!(Encoding::EucKr.alignment(), None);
802    }
803
804    #[test]
805    fn test_requires_full_file_load() {
806        // Encodings that can be streamed
807        assert!(!Encoding::Utf8.requires_full_file_load());
808        assert!(!Encoding::Ascii.requires_full_file_load());
809        assert!(!Encoding::Latin1.requires_full_file_load());
810        assert!(!Encoding::Windows1250.requires_full_file_load());
811        assert!(!Encoding::Utf16Le.requires_full_file_load());
812
813        // Encodings that require full loading
814        assert!(Encoding::Gb18030.requires_full_file_load());
815        assert!(Encoding::Gbk.requires_full_file_load());
816        assert!(Encoding::ShiftJis.requires_full_file_load());
817        assert!(Encoding::EucKr.requires_full_file_load());
818    }
819
820    #[test]
821    fn test_convert_roundtrip_windows1250() {
822        // Windows-1250 encoded text with Central European characters
823        // "Zażółć" in Windows-1250: Z(0x5A) a(0x61) ż(0xBF) ó(0xF3) ł(0xB3) ć(0xE6)
824        let windows1250_bytes: &[u8] = &[0x5A, 0x61, 0xBF, 0xF3, 0xB3, 0xE6];
825
826        // Convert to UTF-8
827        let enc_rs = Encoding::Windows1250.to_encoding_rs();
828        let (decoded, _had_errors) = enc_rs.decode_without_bom_handling(windows1250_bytes);
829        let utf8_content = decoded.as_bytes();
830
831        // The UTF-8 content should contain the Polish characters
832        let utf8_str = std::str::from_utf8(utf8_content).unwrap();
833        assert!(utf8_str.contains('ż'), "Should contain ż: {}", utf8_str);
834        assert!(utf8_str.contains('ó'), "Should contain ó: {}", utf8_str);
835        assert!(utf8_str.contains('ł'), "Should contain ł: {}", utf8_str);
836        assert!(utf8_str.contains('ć'), "Should contain ć: {}", utf8_str);
837
838        // Convert back to Windows-1250
839        let back = convert_from_utf8(utf8_content, Encoding::Windows1250);
840        assert_eq!(back, windows1250_bytes, "Round-trip should preserve bytes");
841    }
842
843    #[test]
844    fn test_windows1250_description() {
845        assert_eq!(
846            Encoding::Windows1250.description(),
847            "Windows-1250 / CP1250 – Central European"
848        );
849    }
850
851    #[test]
852    fn test_detect_windows1250_definitive_bytes() {
853        // Bytes 0x8D (Ť), 0x8F (Ź), 0x9D (ť) are undefined in Windows-1252
854        // but valid in Windows-1250, so they definitively indicate Windows-1250
855
856        // Czech text with ť (0x9D): "měsťo" (city, archaic)
857        let with_t_caron = [0x6D, 0x9D, 0x73, 0x74, 0x6F]; // mťsto
858        assert_eq!(
859            detect_encoding(&with_t_caron),
860            Encoding::Windows1250,
861            "Byte 0x9D (ť) should trigger Windows-1250 detection"
862        );
863
864        // Polish text with Ź (0x8F): "Źródło" (source)
865        let with_z_acute_upper = [0x8F, 0x72, 0xF3, 0x64, 0xB3, 0x6F]; // Źródło
866        assert_eq!(
867            detect_encoding(&with_z_acute_upper),
868            Encoding::Windows1250,
869            "Byte 0x8F (Ź) should trigger Windows-1250 detection"
870        );
871    }
872
873    #[test]
874    fn test_detect_windows1250_strong_indicators() {
875        // Polish text with ś (0x9C) and Ś (0x8C) - strong indicators from 0x80-0x9F range
876        let polish_text = [
877            0x9C, 0x77, 0x69, 0x65, 0x74, 0x79, 0x20, // "świety "
878            0x8C, 0x77, 0x69, 0x61, 0x74, // "Świat"
879        ];
880        assert_eq!(
881            detect_encoding(&polish_text),
882            Encoding::Windows1250,
883            "Multiple Polish characters (ś, Ś) should trigger Windows-1250"
884        );
885    }
886
887    #[test]
888    fn test_detect_ambiguous_bytes_as_windows1252() {
889        // Bytes in 0xA0-0xFF range are ambiguous and should default to Windows-1252
890        // Polish "żółć" - ż(0xBF) ó(0xF3) ł(0xB3) ć(0xE6) - all ambiguous
891        let zolc = [0xBF, 0xF3, 0xB3, 0xE6];
892        assert_eq!(
893            detect_encoding(&zolc),
894            Encoding::Windows1252,
895            "Ambiguous bytes should default to Windows-1252"
896        );
897
898        // ą (0xB9) and ł (0xB3) could be ¹ and ³ in Windows-1252
899        let ambiguous = [
900            0x6D, 0xB9, 0x6B, 0x61, 0x20, // "mąka " or "m¹ka "
901            0x6D, 0xB3, 0x6F, 0x64, 0x79, // "młody" or "m³ody"
902        ];
903        assert_eq!(
904            detect_encoding(&ambiguous),
905            Encoding::Windows1252,
906            "Ambiguous Polish bytes should default to Windows-1252"
907        );
908    }
909
910    #[test]
911    fn test_detect_windows1250_czech_pangram() {
912        // "Příliš žluťoučký kůň úpěl ďábelské ódy" - Czech pangram in Windows-1250
913        // Contains ť (0x9D) which is a definitive Windows-1250 indicator
914        let czech_pangram: &[u8] = &[
915            0x50, 0xF8, 0xED, 0x6C, 0x69, 0x9A, 0x20, // "Příliš "
916            0x9E, 0x6C, 0x75, 0x9D, 0x6F, 0x75, 0xE8, 0x6B, 0xFD, 0x20, // "žluťoučký "
917            0x6B, 0xF9, 0xF2, 0x20, // "kůň "
918            0xFA, 0x70, 0xEC, 0x6C, 0x20, // "úpěl "
919            0xEF, 0xE1, 0x62, 0x65, 0x6C, 0x73, 0x6B, 0xE9, 0x20, // "ďábelské "
920            0xF3, 0x64, 0x79, // "ódy"
921        ];
922        assert_eq!(
923            detect_encoding(czech_pangram),
924            Encoding::Windows1250,
925            "Czech pangram should be detected as Windows-1250 (contains ť = 0x9D)"
926        );
927    }
928
929    #[test]
930    fn test_detect_windows1252_not_1250() {
931        // Pure Windows-1252 text without Central European indicators
932        // "Café résumé" in Windows-1252
933        let windows1252_text = [
934            0x43, 0x61, 0x66, 0xE9, 0x20, // "Café "
935            0x72, 0xE9, 0x73, 0x75, 0x6D, 0xE9, // "résumé"
936        ];
937        assert_eq!(
938            detect_encoding(&windows1252_text),
939            Encoding::Windows1252,
940            "French text should remain Windows-1252"
941        );
942    }
943
944    #[test]
945    fn test_convert_roundtrip_windows1251() {
946        // Russian "Привет" (Hello) in Windows-1251:
947        // П=0xCF р=0xF0 и=0xE8 в=0xE2 е=0xE5 т=0xF2
948        let windows1251_bytes: &[u8] = &[0xCF, 0xF0, 0xE8, 0xE2, 0xE5, 0xF2];
949
950        // Convert to UTF-8
951        let enc_rs = Encoding::Windows1251.to_encoding_rs();
952        let (decoded, _had_errors) = enc_rs.decode_without_bom_handling(windows1251_bytes);
953        let utf8_content = decoded.as_bytes();
954
955        let utf8_str = std::str::from_utf8(utf8_content).unwrap();
956        assert_eq!(utf8_str, "Привет", "Should decode to Russian 'Привет'");
957
958        // Convert back to Windows-1251
959        let back = convert_from_utf8(utf8_content, Encoding::Windows1251);
960        assert_eq!(back, windows1251_bytes, "Round-trip should preserve bytes");
961    }
962
963    #[test]
964    fn test_windows1251_display_and_description() {
965        assert_eq!(Encoding::Windows1251.display_name(), "Windows-1251");
966        assert_eq!(
967            Encoding::Windows1251.description(),
968            "Windows-1251 / CP1251 – Cyrillic"
969        );
970    }
971
972    #[test]
973    fn test_windows1251_is_resynchronizable() {
974        assert!(Encoding::Windows1251.is_resynchronizable());
975        assert_eq!(Encoding::Windows1251.alignment(), Some(1));
976        assert!(!Encoding::Windows1251.requires_full_file_load());
977        assert!(!Encoding::Windows1251.has_bom());
978    }
979
980    #[test]
981    fn test_detect_windows1251_russian() {
982        // Russian sentence "Привет мир" (Hello world) in Windows-1251
983        let privet_mir: &[u8] = &[
984            0xCF, 0xF0, 0xE8, 0xE2, 0xE5, 0xF2, // Привет
985            0x20, // space
986            0xEC, 0xE8, 0xF0, // мир
987        ];
988        assert_eq!(
989            detect_encoding(privet_mir),
990            Encoding::Windows1251,
991            "Russian sentence should be detected as Windows-1251"
992        );
993    }
994
995    #[test]
996    fn test_detect_windows1251_russian_pangram() {
997        // Russian pangram fragment: "Съешь ещё этих мягких французских булок"
998        // Contains many Cyrillic letters and the distinctive ё (0xB8) character.
999        // bytes in Windows-1251:
1000        // С=0xD1 ъ=0xFA е=0xE5 ш=0xF8 ь=0xFC 0x20
1001        // е=0xE5 щ=0xF9 ё=0xB8 0x20
1002        // э=0xFD т=0xF2 и=0xE8 х=0xF5 0x20
1003        // м=0xEC я=0xFF г=0xE3 к=0xEA и=0xE8 х=0xF5 0x20
1004        // ф=0xF4 р=0xF0 а=0xE0 н=0xED ц=0xF6 у=0xF3 з=0xE7 с=0xF1 к=0xEA и=0xE8 х=0xF5 0x20
1005        // б=0xE1 у=0xF3 л=0xEB о=0xEE к=0xEA
1006        let pangram: &[u8] = &[
1007            0xD1, 0xFA, 0xE5, 0xF8, 0xFC, 0x20, 0xE5, 0xF9, 0xB8, 0x20, 0xFD, 0xF2, 0xE8, 0xF5,
1008            0x20, 0xEC, 0xFF, 0xE3, 0xEA, 0xE8, 0xF5, 0x20, 0xF4, 0xF0, 0xE0, 0xED, 0xF6, 0xF3,
1009            0xE7, 0xF1, 0xEA, 0xE8, 0xF5, 0x20, 0xE1, 0xF3, 0xEB, 0xEE, 0xEA,
1010        ];
1011        assert_eq!(
1012            detect_encoding(pangram),
1013            Encoding::Windows1251,
1014            "Russian pangram should be detected as Windows-1251"
1015        );
1016    }
1017
1018    #[test]
1019    fn test_detect_not_windows1251_ambiguous_polish() {
1020        // Regression: 4 consecutive Polish ambiguous bytes must still default
1021        // to Windows-1252, not be mis-detected as Cyrillic by the 1251 heuristic.
1022        let zolc = [0xBF, 0xF3, 0xB3, 0xE6];
1023        assert_eq!(
1024            detect_encoding(&zolc),
1025            Encoding::Windows1252,
1026            "Short ambiguous Polish bytes must not be detected as Windows-1251"
1027        );
1028    }
1029
1030    #[test]
1031    fn test_detect_utf8_chinese_truncated_sequence() {
1032        // Test that UTF-8 Chinese text is correctly detected even when the sample
1033        // is truncated in the middle of a multi-byte sequence.
1034        //
1035        // Bug context: When sampling first 8KB for detection, the boundary may cut
1036        // through a multi-byte UTF-8 character. This caused valid UTF-8 Chinese text
1037        // to fail std::str::from_utf8() validation and fall through to Windows-1250
1038        // detection (because UTF-8 continuation bytes like 0x9C, 0x9D overlap with
1039        // Windows-1250 indicator bytes).
1040
1041        // Chinese text "更多" (more) = [0xE6, 0x9B, 0xB4, 0xE5, 0xA4, 0x9A]
1042        // If we truncate after 0xE5, we get an incomplete sequence
1043        let utf8_chinese_truncated = [
1044            0xE6, 0x9B, 0xB4, // 更
1045            0xE5, 0xA4, 0x9A, // 多
1046            0xE5, // Start of another character, incomplete
1047        ];
1048
1049        // With truncated=true, this should be detected as UTF-8
1050        assert_eq!(
1051            detect_encoding_or_binary(&utf8_chinese_truncated, true).0,
1052            Encoding::Utf8,
1053            "Truncated UTF-8 Chinese text should be detected as UTF-8"
1054        );
1055
1056        // Without truncated flag, the incomplete trailing byte is treated as non-UTF-8
1057        assert_ne!(
1058            detect_encoding_or_binary(&utf8_chinese_truncated, false).0,
1059            Encoding::Utf8,
1060            "Non-truncated short sample with trailing 0xE5 should not be detected as UTF-8"
1061        );
1062
1063        // Test with 2 bytes of incomplete sequence
1064        let utf8_chinese_truncated_2 = [
1065            0xE6, 0x9B, 0xB4, // 更
1066            0xE5, 0xA4, 0x9A, // 多
1067            0xE5, 0xA4, // Incomplete 3-byte sequence (missing last byte)
1068        ];
1069        assert_eq!(
1070            detect_encoding_or_binary(&utf8_chinese_truncated_2, true).0,
1071            Encoding::Utf8,
1072            "Truncated UTF-8 with 2-byte incomplete sequence should be detected as UTF-8"
1073        );
1074    }
1075
1076    #[test]
1077    fn test_detect_utf8_chinese_with_high_bytes() {
1078        // UTF-8 Chinese text contains many continuation bytes in the 0x80-0xBF range,
1079        // including bytes like 0x9C, 0x9D that happen to be Windows-1250 indicators.
1080        // These should NOT trigger Windows-1250 detection for valid UTF-8 content.
1081
1082        // Chinese characters that use continuation bytes that overlap with Windows-1250 indicators:
1083        // 集 = E9 9B 86 (contains 0x9B)
1084        // 精 = E7 B2 BE (contains 0xB2, 0xBE)
1085        // Build a string with many such characters
1086        let chinese_text = "更多全本全集精校小说"; // Contains various high continuation bytes
1087        let bytes = chinese_text.as_bytes();
1088
1089        assert_eq!(
1090            detect_encoding(bytes),
1091            Encoding::Utf8,
1092            "UTF-8 Chinese text should be detected as UTF-8, not Windows-1250"
1093        );
1094
1095        // Verify these bytes would have triggered Windows-1250 detection if not valid UTF-8
1096        // by checking that the sample contains bytes in the 0x80-0x9F range
1097        let has_high_continuation_bytes = bytes.iter().any(|&b| (0x80..0xA0).contains(&b));
1098        assert!(
1099            has_high_continuation_bytes,
1100            "Test should include bytes that could be mistaken for Windows-1250 indicators"
1101        );
1102    }
1103
1104    #[test]
1105    fn test_detect_utf8_sample_truncation_at_boundary() {
1106        // Simulate what happens when we take an 8KB sample that ends mid-character
1107        // by creating a buffer that's valid UTF-8 except for the last 1-3 bytes
1108
1109        // Build a large UTF-8 Chinese text buffer
1110        let chinese = "我的美女老师"; // "My Beautiful Teacher"
1111        let mut buffer = Vec::new();
1112        // Repeat to make it substantial
1113        for _ in 0..100 {
1114            buffer.extend_from_slice(chinese.as_bytes());
1115        }
1116
1117        // Verify it's valid UTF-8 when complete
1118        assert!(std::str::from_utf8(&buffer).is_ok());
1119        assert_eq!(detect_encoding(&buffer), Encoding::Utf8);
1120
1121        // Now truncate at various points that cut through multi-byte sequences
1122        // Each Chinese character is 3 bytes in UTF-8
1123        for truncate_offset in 1..=3 {
1124            let truncated_len = buffer.len() - truncate_offset;
1125            let truncated_buf = &buffer[..truncated_len];
1126
1127            // The truncated buffer should fail strict UTF-8 validation
1128            // (unless we happen to cut at a character boundary)
1129            let is_strict_valid = std::str::from_utf8(truncated_buf).is_ok();
1130
1131            // With truncated=true, our detection should still detect it as UTF-8
1132            let detected = detect_encoding_or_binary(truncated_buf, true).0;
1133            assert_eq!(
1134                detected,
1135                Encoding::Utf8,
1136                "Truncated UTF-8 at offset -{} should be detected as UTF-8, strict_valid={}",
1137                truncate_offset,
1138                is_strict_valid
1139            );
1140        }
1141    }
1142
1143    #[test]
1144    fn test_detect_utf8_cjk_across_internal_8kb_boundary() {
1145        // Regression for #1635: when callers pass the full file bytes with
1146        // `truncated=false`, the detector still internally clamps the sample
1147        // to the first 8 KB. If a multi-byte UTF-8 sequence straddles that
1148        // internal 8192-byte cutoff, the strict UTF-8 check fails and the
1149        // CJK continuation bytes get misclassified by the fallback path
1150        // (often as Windows-1250/1251/1252), garbling the whole file.
1151        //
1152        // The full buffer IS valid UTF-8, so detection must return UTF-8.
1153
1154        // Build padding that is pure ASCII and leaves exactly one byte of
1155        // room inside the first 8 KB before we write a 3-byte CJK codepoint.
1156        // "项" = [0xE9, 0xA1, 0xB9] — the first byte sits at offset 8191,
1157        // the remaining two bytes spill past the internal 8192-byte sample.
1158        let mut buffer = Vec::with_capacity(16 * 1024);
1159        buffer.resize(8 * 1024 - 1, b'a');
1160        // CJK text that crosses the boundary and continues after it.
1161        buffer.extend_from_slice("项目设置项目设置项目设置".as_bytes());
1162
1163        assert!(buffer.len() > 8 * 1024);
1164        assert!(std::str::from_utf8(&buffer).is_ok());
1165
1166        // The caller (load_small_file) passes the full content with
1167        // `truncated=false` because no truncation occurred at the call site.
1168        // Detection must still return UTF-8.
1169        let detected = detect_encoding_or_binary(&buffer, false).0;
1170        assert_eq!(
1171            detected,
1172            Encoding::Utf8,
1173            "Valid UTF-8 CJK content must be detected as UTF-8 even when a \
1174             multi-byte sequence straddles the detector's internal 8 KB sample boundary"
1175        );
1176    }
1177}
fresh/model/encoding.rs

fresh/model/
encoding.rs