fresh/model/encoding.rs
1//! Text encoding detection and conversion
2//!
3//! This module handles:
4//! - Detecting text encodings from byte content (UTF-8, UTF-16, Latin-1, CJK, etc.)
5//! - Binary file detection (distinguishing text from binary content)
6//! - Converting between encodings (normalizing to UTF-8 on load, converting back on save)
7//!
8//! # Encoding Detection Strategy
9//!
10//! 1. **BOM Detection**: Check for Byte Order Marks (UTF-8 BOM, UTF-16 LE/BE)
11//! 2. **UTF-8 Validation**: Fast path for most modern files
12//! 3. **UTF-16 Heuristics**: Detect UTF-16 without BOM via null byte patterns
13//! 4. **Binary Detection**: Check for control characters that indicate binary content
14//! 5. **Statistical Detection**: Use chardetng for legacy encoding detection
15//! 6. **Fallback**: Default to Windows-1252 for ambiguous cases
16
17use super::encoding_heuristics::has_windows1250_pattern;
18use schemars::JsonSchema;
19use serde::{Deserialize, Serialize};
20
21// ============================================================================
22// Encoding Type
23// ============================================================================
24
/// Supported text encodings for file I/O
///
/// The editor internally uses UTF-8 for all text processing. When loading files,
/// content is converted from the detected encoding to UTF-8. When saving, content
/// is converted back to the original (or user-selected) encoding.
///
/// Serialized via the serde derives (by variant name under serde's default
/// representation for unit variants); `Utf8` is the `Default` variant.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize, JsonSchema)]
pub enum Encoding {
    /// UTF-8 (default, most common)
    #[default]
    Utf8,
    /// UTF-8 with Byte Order Mark
    Utf8Bom,
    /// UTF-16 Little Endian (Windows default for Unicode files)
    Utf16Le,
    /// UTF-16 Big Endian
    Utf16Be,
    /// ASCII (7-bit, subset of UTF-8)
    Ascii,
    /// Latin-1 / ISO-8859-1 (Western European)
    Latin1,
    /// Windows-1252 / CP-1252 (Windows Western European, often called "ANSI")
    Windows1252,
    /// Windows-1250 / CP-1250 (Windows Central European)
    Windows1250,
    /// GB18030 (Chinese, superset of GBK)
    Gb18030,
    /// GBK (Chinese Simplified, subset of GB18030)
    Gbk,
    /// Shift-JIS (Japanese)
    ShiftJis,
    /// EUC-KR (Korean)
    EucKr,
}
58
59impl Encoding {
60 /// Get the display name for status bar
61 pub fn display_name(&self) -> &'static str {
62 match self {
63 Self::Utf8 => "UTF-8",
64 Self::Utf8Bom => "UTF-8 BOM",
65 Self::Utf16Le => "UTF-16 LE",
66 Self::Utf16Be => "UTF-16 BE",
67 Self::Ascii => "ASCII",
68 Self::Latin1 => "Latin-1",
69 Self::Windows1252 => "Windows-1252",
70 Self::Windows1250 => "Windows-1250",
71 Self::Gb18030 => "GB18030",
72 Self::Gbk => "GBK",
73 Self::ShiftJis => "Shift-JIS",
74 Self::EucKr => "EUC-KR",
75 }
76 }
77
78 /// Get a longer description for UI (e.g., command palette)
79 pub fn description(&self) -> &'static str {
80 match self {
81 Self::Utf8 => "UTF-8",
82 Self::Utf8Bom => "UTF-8 with BOM",
83 Self::Utf16Le => "UTF-16 Little Endian",
84 Self::Utf16Be => "UTF-16 Big Endian",
85 Self::Ascii => "US-ASCII",
86 Self::Latin1 => "ISO-8859-1 / Latin-1 – Western European",
87 Self::Windows1252 => "Windows-1252 / CP1252 – Western European",
88 Self::Windows1250 => "Windows-1250 / CP1250 – Central European",
89 Self::Gb18030 => "GB18030 – Chinese",
90 Self::Gbk => "GBK / CP936 – Simplified Chinese",
91 Self::ShiftJis => "Shift_JIS – Japanese",
92 Self::EucKr => "EUC-KR – Korean",
93 }
94 }
95
96 /// Get the encoding_rs Encoding for this type
97 pub fn to_encoding_rs(&self) -> &'static encoding_rs::Encoding {
98 match self {
99 Self::Utf8 | Self::Utf8Bom | Self::Ascii => encoding_rs::UTF_8,
100 Self::Utf16Le => encoding_rs::UTF_16LE,
101 Self::Utf16Be => encoding_rs::UTF_16BE,
102 Self::Latin1 => encoding_rs::WINDOWS_1252, // ISO-8859-1 maps to Windows-1252 per WHATWG
103 Self::Windows1252 => encoding_rs::WINDOWS_1252,
104 Self::Windows1250 => encoding_rs::WINDOWS_1250,
105 Self::Gb18030 => encoding_rs::GB18030,
106 Self::Gbk => encoding_rs::GBK,
107 Self::ShiftJis => encoding_rs::SHIFT_JIS,
108 Self::EucKr => encoding_rs::EUC_KR,
109 }
110 }
111
112 /// Returns true if this encoding uses a BOM (Byte Order Mark)
113 pub fn has_bom(&self) -> bool {
114 matches!(self, Self::Utf8Bom | Self::Utf16Le | Self::Utf16Be)
115 }
116
117 /// Get the BOM bytes for this encoding (if any)
118 pub fn bom_bytes(&self) -> Option<&'static [u8]> {
119 match self {
120 Self::Utf8Bom => Some(&[0xEF, 0xBB, 0xBF]),
121 Self::Utf16Le => Some(&[0xFF, 0xFE]),
122 Self::Utf16Be => Some(&[0xFE, 0xFF]),
123 _ => None,
124 }
125 }
126
127 /// All available encodings for UI display
128 pub fn all() -> &'static [Encoding] {
129 &[
130 Self::Utf8,
131 Self::Utf8Bom,
132 Self::Utf16Le,
133 Self::Utf16Be,
134 Self::Ascii,
135 Self::Latin1,
136 Self::Windows1252,
137 Self::Windows1250,
138 Self::Gb18030,
139 Self::Gbk,
140 Self::ShiftJis,
141 Self::EucKr,
142 ]
143 }
144
145 /// Returns true if this encoding supports "resynchronization" - the ability to
146 /// find character boundaries when jumping into the middle of a file.
147 ///
148 /// Resynchronizable encodings can be safely used with lazy/streaming file loading
149 /// because you can determine character boundaries from any position.
150 ///
151 /// - **UTF-8**: Excellent - unique bit patterns distinguish lead/continuation bytes
152 /// - **ASCII/Latin-1/Windows-1252**: Trivial - every byte is a character
153 /// - **UTF-16**: Good with 2-byte alignment - can detect surrogate pairs
154 /// - **UTF-32**: Good with 4-byte alignment
155 ///
156 /// Non-resynchronizable encodings (legacy CJK like Shift-JIS, GB18030, GBK, Big5)
157 /// have ambiguous byte sequences where a byte could be either a standalone character
158 /// or part of a multi-byte sequence. You must scan from the beginning to be certain.
159 pub fn is_resynchronizable(&self) -> bool {
160 match self {
161 // Fixed-width single byte - every byte is a character
162 Self::Ascii | Self::Latin1 | Self::Windows1252 | Self::Windows1250 => true,
163
164 // UTF-8 has unique bit patterns for lead vs continuation bytes
165 Self::Utf8 | Self::Utf8Bom => true,
166
167 // UTF-16 is resynchronizable with 2-byte alignment
168 // (can detect surrogate pairs by checking 0xD800-0xDFFF range)
169 Self::Utf16Le | Self::Utf16Be => true,
170
171 // Legacy CJK encodings are NOT resynchronizable
172 // The second byte of a double-byte char can equal a valid single-byte char
173 Self::Gb18030 | Self::Gbk | Self::ShiftJis | Self::EucKr => false,
174 }
175 }
176
177 /// Returns the byte alignment required for this encoding when doing random access.
178 ///
179 /// For lazy loading of large files, reads must be aligned to this boundary.
180 /// Returns None if the encoding is not resynchronizable (requires full file scan).
181 pub fn alignment(&self) -> Option<usize> {
182 match self {
183 // Single-byte encodings - no alignment needed
184 Self::Ascii | Self::Latin1 | Self::Windows1252 | Self::Windows1250 => Some(1),
185
186 // UTF-8 - no alignment needed (self-synchronizing)
187 Self::Utf8 | Self::Utf8Bom => Some(1),
188
189 // UTF-16 - must be 2-byte aligned
190 Self::Utf16Le | Self::Utf16Be => Some(2),
191
192 // Legacy CJK - not resynchronizable, no valid alignment
193 Self::Gb18030 | Self::Gbk | Self::ShiftJis | Self::EucKr => None,
194 }
195 }
196
197 /// Returns true if this encoding requires the entire file to be loaded
198 /// for correct decoding (cannot use lazy/streaming loading).
199 ///
200 /// This is the inverse of `is_resynchronizable()` and indicates that
201 /// the user should be warned before loading large files in this encoding.
202 pub fn requires_full_file_load(&self) -> bool {
203 !self.is_resynchronizable()
204 }
205}
206
207// ============================================================================
208// Encoding Detection
209// ============================================================================
210
211/// Detect the text encoding from a sample of bytes
212///
213/// This function delegates to `detect_encoding_or_binary` and returns only
214/// the encoding, ignoring the binary flag. Use `detect_encoding_or_binary`
215/// when you need to know if the content should be treated as binary.
216pub fn detect_encoding(bytes: &[u8]) -> Encoding {
217 detect_encoding_or_binary(bytes, false).0
218}
219
220/// Detect the text encoding and whether content is binary.
221///
222/// Returns (Encoding, is_binary) where:
223/// - Encoding is the detected encoding (or default if binary)
224/// - is_binary is true if the content should be treated as raw binary
225///
226/// When `truncated` is true, an incomplete multi-byte UTF-8 sequence at the
227/// end of the sample is tolerated (up to 3 bytes) since it likely results from
228/// the caller truncating a larger stream. When false, such trailing bytes cause
229/// the sample to be rejected as UTF-8.
230///
231/// # Detection Strategy
232///
233/// 1. Check for BOM (Byte Order Mark) - highest priority, definitely not binary
234/// 2. Try UTF-8 validation (fast path for most files), definitely not binary
235/// 3. Check for UTF-16 patterns without BOM, definitely not binary
236/// 4. Check for binary control characters (null bytes, etc.) - if found, it's binary
237/// 5. Use chardetng for statistical detection of legacy encodings
238/// 6. If encoding detection is uncertain, default to Windows-1252
239pub fn detect_encoding_or_binary(bytes: &[u8], truncated: bool) -> (Encoding, bool) {
240 // Only check the first 8KB for encoding detection
241 let check_len = bytes.len().min(8 * 1024);
242 let sample = &bytes[..check_len];
243
244 // 1. Check for BOM (Byte Order Mark) - highest priority, definitely text
245 if sample.starts_with(&[0xEF, 0xBB, 0xBF]) {
246 return (Encoding::Utf8Bom, false);
247 }
248 if sample.starts_with(&[0xFF, 0xFE]) {
249 // Could also be UTF-32 LE, but UTF-16 LE is much more common
250 return (Encoding::Utf16Le, false);
251 }
252 if sample.starts_with(&[0xFE, 0xFF]) {
253 return (Encoding::Utf16Be, false);
254 }
255
256 // 2. Try UTF-8 validation (fast path for most modern files)
257 // Note: When we truncate to 8KB, we may cut in the middle of a multi-byte UTF-8 sequence.
258 // We need to handle this case - if most of the sample is valid UTF-8 and the only error
259 // is an incomplete sequence at the very end, we should still detect it as UTF-8.
260 let utf8_valid_len = match std::str::from_utf8(sample) {
261 Ok(_) => sample.len(),
262 Err(e) => {
263 // error_len() returns None if the error is due to incomplete sequence at end
264 // (i.e., unexpected end of input), vs Some(n) for an invalid byte
265 if e.error_len().is_none() {
266 // Incomplete sequence at end - this is likely due to sample truncation
267 e.valid_up_to()
268 } else {
269 // Invalid byte found - not valid UTF-8
270 0
271 }
272 }
273 };
274
275 // If the sample is valid UTF-8, treat it as UTF-8.
276 // When the caller indicates the sample was truncated from a larger stream,
277 // tolerate up to 3 trailing bytes of an incomplete multi-byte sequence (a
278 // truncation artifact). Without truncation, require exact validity — a
279 // trailing 0xE9 in a short file is a Latin-1 'é', not a truncated codepoint.
280 let is_valid_utf8 = utf8_valid_len == sample.len()
281 || (truncated && utf8_valid_len > 0 && utf8_valid_len >= sample.len() - 3);
282 if is_valid_utf8 {
283 let valid_sample = &sample[..utf8_valid_len];
284 // Check if it's pure ASCII (subset of UTF-8)
285 // Also check for binary indicators in valid ASCII/UTF-8
286 let has_binary_control = valid_sample.iter().any(|&b| is_binary_control_char(b));
287 if has_binary_control {
288 return (Encoding::Utf8, true);
289 }
290 if valid_sample.iter().all(|&b| b < 128) {
291 return (Encoding::Ascii, false);
292 }
293 return (Encoding::Utf8, false);
294 }
295
296 // 3. Check for UTF-16 without BOM (common in some Windows files)
297 // Heuristic: Look for patterns of null bytes alternating with printable chars
298 // The non-null byte should be printable (0x20-0x7E) or a valid high byte
299 //
300 // Note: Unlike UTF-8 above, this heuristic is robust to sample truncation because:
301 // - We use statistical pattern matching (50% threshold), not strict validation
302 // - chunks(2) naturally handles odd-length samples by dropping the last byte
303 // - Losing 1 pair out of ~4096 doesn't affect the detection threshold
304 if sample.len() >= 4 {
305 let is_printable_or_high = |b: u8| (0x20..=0x7E).contains(&b) || b >= 0x80;
306
307 // Align to even boundary to ensure we only process complete 2-byte pairs
308 let aligned_len = sample.len() & !1; // Round down to even
309 let aligned_sample = &sample[..aligned_len];
310
311 let le_pairs = aligned_sample
312 .chunks(2)
313 .filter(|chunk| chunk[1] == 0 && is_printable_or_high(chunk[0]))
314 .count();
315 let be_pairs = aligned_sample
316 .chunks(2)
317 .filter(|chunk| chunk[0] == 0 && is_printable_or_high(chunk[1]))
318 .count();
319 let pair_count = aligned_len / 2;
320
321 // If more than 50% of pairs look like valid UTF-16 text, it's text
322 if le_pairs > pair_count / 2 {
323 return (Encoding::Utf16Le, false);
324 }
325 if be_pairs > pair_count / 2 {
326 return (Encoding::Utf16Be, false);
327 }
328 }
329
330 // 4. Check for binary indicators EARLY (before chardetng)
331 // Binary files often contain control characters and null bytes that should not
332 // appear in any valid text encoding. Check this before chardetng because
333 // chardetng might still be "confident" about some encoding for binary data.
334 let has_binary_control = sample
335 .iter()
336 .any(|&b| b == 0x00 || is_binary_control_char(b));
337 if has_binary_control {
338 return (Encoding::Utf8, true);
339 }
340
341 // 5. Check for Latin-1 patterns: high bytes followed by invalid CJK trail bytes
342 // In GB18030/GBK, trail bytes must be 0x40-0x7E or 0x80-0xFE
343 // If a high byte is followed by a byte outside these ranges (e.g., space, newline,
344 // punctuation < 0x40), it's likely Latin-1, not CJK
345 let has_latin1_pattern = has_latin1_high_byte_pattern(sample);
346
347 // Also check for bytes in CJK-only range (0x81-0x9F) which can only be CJK lead bytes
348 let has_cjk_only_bytes = sample.iter().any(|&b| (0x81..0xA0).contains(&b));
349
350 // 6. Use chardetng for statistical encoding detection
351 let mut detector = chardetng::EncodingDetector::new();
352 detector.feed(sample, true);
353 let (detected_encoding, confident) = detector.guess_assess(None, true);
354
355 // If chardetng is confident, use that encoding (not binary)
356 if confident {
357 let is_cjk_encoding = detected_encoding == encoding_rs::GB18030
358 || detected_encoding == encoding_rs::GBK
359 || detected_encoding == encoding_rs::SHIFT_JIS
360 || detected_encoding == encoding_rs::EUC_KR;
361
362 // For CJK encodings, prefer Windows-1252 if we have clear Latin-1 indicators:
363 // - Space followed by high byte (0xA0-0xFF) is common in Latin-1 text
364 //
365 // If there are CJK-only bytes (0x81-0x9F), it's definitely CJK (not ambiguous).
366 // If there are Latin-1 patterns (space + high byte), prefer Windows-1252.
367 // Otherwise, trust chardetng's detection.
368 if is_cjk_encoding && !has_cjk_only_bytes && has_latin1_pattern {
369 return (Encoding::Windows1252, false);
370 }
371
372 // GBK is a subset of GB18030. Since we only inspect the first 8KB for
373 // detection, the sample may not contain GB18030-only code points (uncommon
374 // Chinese characters, emoji, etc.). Treating GBK as GB18030 is safer and
375 // ensures proper display of all characters including French, Spanish, and emoji.
376 let encoding =
377 if detected_encoding == encoding_rs::GB18030 || detected_encoding == encoding_rs::GBK {
378 Encoding::Gb18030
379 } else if detected_encoding == encoding_rs::SHIFT_JIS {
380 Encoding::ShiftJis
381 } else if detected_encoding == encoding_rs::EUC_KR {
382 Encoding::EucKr
383 } else if detected_encoding == encoding_rs::WINDOWS_1252
384 || detected_encoding == encoding_rs::WINDOWS_1250
385 {
386 // chardetng often returns Windows-1252 for Central European text
387 // Check for Windows-1250 specific patterns
388 if has_windows1250_pattern(sample) {
389 Encoding::Windows1250
390 } else {
391 Encoding::Windows1252
392 }
393 } else if detected_encoding == encoding_rs::UTF_8 {
394 // chardetng thinks it's UTF-8, but validation failed above
395 // Could still be Windows-1250 if it has Central European patterns
396 if has_windows1250_pattern(sample) {
397 Encoding::Windows1250
398 } else {
399 Encoding::Windows1252
400 }
401 } else {
402 // Unknown encoding - check for Windows-1250 patterns
403 if has_windows1250_pattern(sample) {
404 Encoding::Windows1250
405 } else {
406 Encoding::Windows1252
407 }
408 };
409 return (encoding, false);
410 }
411
412 // 7. chardetng not confident, but no binary indicators - check for Windows-1250 patterns
413 // We already checked for binary control chars earlier, so this is valid text
414 if has_windows1250_pattern(sample) {
415 (Encoding::Windows1250, false)
416 } else {
417 (Encoding::Windows1252, false)
418 }
419}
420
421// ============================================================================
422// Binary Detection Helpers
423// ============================================================================
424
/// Check if a byte is a binary control character
///
/// Returns true for control characters that typically indicate binary content.
/// The handful of control bytes that legitimately appear in text files —
/// tab, LF, VT, FF, CR, and ESC — are excluded, as is every printable or
/// high (>= 0x80) byte.
pub fn is_binary_control_char(byte: u8) -> bool {
    match byte {
        // Common text control characters:
        // 0x09 = Tab, 0x0A = LF, 0x0B = Vertical Tab, 0x0C = Form Feed,
        // 0x0D = CR, 0x1B = ESC
        0x09 | 0x0A | 0x0B | 0x0C | 0x0D | 0x1B => false,
        // Remaining C0 controls plus DEL (0x7F) indicate binary content
        0x00..=0x1F | 0x7F => true,
        // Printable ASCII and high bytes are fine
        _ => false,
    }
}
441
/// Check if sample has Latin-1 patterns that cannot be valid CJK encoding
///
/// In GB18030/GBK, valid sequences are:
/// - ASCII bytes (0x00-0x7F) as standalone characters
/// - Lead byte (0x81-0xFE) + Trail byte (0x40-0x7E or 0x80-0xFE)
///
/// Two patterns are treated as Latin-1 indicators:
/// 1. An ASCII space immediately followed by a Latin-1 extended character
///    (0xA0-0xFF), as in "Hello é" or "Café résumé"
/// 2. A high byte (>= 0xA0) followed by a byte below 0x40 (space, newline,
///    most punctuation) — impossible as a CJK trail byte
///
/// Valid-looking CJK lead/trail pairs are skipped as a unit; the function
/// returns true as soon as a single indicator is found.
fn has_latin1_high_byte_pattern(sample: &[u8]) -> bool {
    let mut i = 0;
    while i < sample.len() {
        let byte = sample[i];
        let lookahead = sample.get(i + 1).copied();

        if byte < 0x80 {
            // ASCII byte: a space directly before a Latin-1 extended char
            // (not a CJK-only lead byte) is an indicator.
            if byte == 0x20 {
                if let Some(next) = lookahead {
                    if next >= 0xA0 {
                        return true;
                    }
                }
            }
            i += 1;
        } else if let Some(next) = lookahead {
            // High byte (0x80-0xFF): could be Latin-1 or a CJK lead byte.
            let cjk_lead = (0x81..=0xFE).contains(&byte);
            let cjk_trail = (0x40..=0x7E).contains(&next) || (0x80..=0xFE).contains(&next);

            if cjk_lead && cjk_trail {
                // Plausible CJK double-byte sequence — consume both bytes.
                i += 2;
            } else {
                // Not a valid CJK pair: a high byte followed by low ASCII
                // (< 0x40) is a Latin-1 indicator.
                if byte >= 0xA0 && next < 0x40 {
                    return true;
                }
                i += 1;
            }
        } else {
            // Lone high byte at the very end — no lookahead, no verdict.
            i += 1;
        }
    }

    false
}
501
502// ============================================================================
503// Encoding Conversion
504// ============================================================================
505
506/// Detect encoding and convert bytes to UTF-8
507///
508/// Returns the detected encoding and the UTF-8 converted content.
509/// This is the core function for normalizing file content to UTF-8 on load.
510pub fn detect_and_convert(bytes: &[u8]) -> (Encoding, Vec<u8>) {
511 if bytes.is_empty() {
512 return (Encoding::Utf8, Vec::new());
513 }
514
515 let encoding = detect_encoding(bytes);
516
517 // For UTF-8 (with or without BOM), we can use the content directly
518 match encoding {
519 Encoding::Utf8 | Encoding::Ascii => {
520 // Already UTF-8, just clone
521 (encoding, bytes.to_vec())
522 }
523 Encoding::Utf8Bom => {
524 // Skip the BOM (3 bytes) and use the rest
525 let content = if bytes.len() > 3 {
526 bytes[3..].to_vec()
527 } else {
528 Vec::new()
529 };
530 (encoding, content)
531 }
532 Encoding::Utf16Le | Encoding::Utf16Be => {
533 // Decode UTF-16 to UTF-8
534 let enc_rs = encoding.to_encoding_rs();
535 let start_offset =
536 if bytes.starts_with(&[0xFF, 0xFE]) || bytes.starts_with(&[0xFE, 0xFF]) {
537 2 // Skip BOM
538 } else {
539 0
540 };
541 let data = &bytes[start_offset..];
542
543 let (cow, _had_errors) = enc_rs.decode_without_bom_handling(data);
544 (encoding, cow.into_owned().into_bytes())
545 }
546 _ => {
547 // Use encoding_rs to convert to UTF-8
548 let enc_rs = encoding.to_encoding_rs();
549 let (cow, _had_errors) = enc_rs.decode_without_bom_handling(bytes);
550 (encoding, cow.into_owned().into_bytes())
551 }
552 }
553}
554
555/// Convert bytes from a specific encoding to UTF-8
556///
557/// Used when opening a file with a user-specified encoding instead of auto-detection.
558/// Returns the UTF-8 converted content.
559pub fn convert_to_utf8(bytes: &[u8], encoding: Encoding) -> Vec<u8> {
560 if bytes.is_empty() {
561 return Vec::new();
562 }
563
564 match encoding {
565 Encoding::Utf8 | Encoding::Ascii => {
566 // Already UTF-8, just clone
567 bytes.to_vec()
568 }
569 Encoding::Utf8Bom => {
570 // Skip the BOM (3 bytes) if present and use the rest
571 if bytes.starts_with(&[0xEF, 0xBB, 0xBF]) && bytes.len() > 3 {
572 bytes[3..].to_vec()
573 } else {
574 bytes.to_vec()
575 }
576 }
577 Encoding::Utf16Le | Encoding::Utf16Be => {
578 // Decode UTF-16 to UTF-8
579 let enc_rs = encoding.to_encoding_rs();
580 let start_offset =
581 if bytes.starts_with(&[0xFF, 0xFE]) || bytes.starts_with(&[0xFE, 0xFF]) {
582 2 // Skip BOM
583 } else {
584 0
585 };
586 let data = &bytes[start_offset..];
587
588 let (cow, _had_errors) = enc_rs.decode_without_bom_handling(data);
589 cow.into_owned().into_bytes()
590 }
591 _ => {
592 // Use encoding_rs to convert to UTF-8
593 let enc_rs = encoding.to_encoding_rs();
594 let (cow, _had_errors) = enc_rs.decode_without_bom_handling(bytes);
595 cow.into_owned().into_bytes()
596 }
597 }
598}
599
600/// Convert UTF-8 content to the specified encoding for saving
601///
602/// Used when saving files to convert internal UTF-8 representation
603/// back to the original (or user-selected) encoding.
604///
605/// Note: This does NOT add BOM - the BOM should be handled separately.
606pub fn convert_from_utf8(utf8_bytes: &[u8], encoding: Encoding) -> Vec<u8> {
607 match encoding {
608 Encoding::Utf8 | Encoding::Ascii | Encoding::Utf8Bom => {
609 // UTF-8 (with or without BOM) - just clone, BOM added separately
610 utf8_bytes.to_vec()
611 }
612 Encoding::Utf16Le => {
613 // Convert UTF-8 to UTF-16 LE (no BOM - added separately)
614 let text = String::from_utf8_lossy(utf8_bytes);
615 let mut result = Vec::new();
616 for code_unit in text.encode_utf16() {
617 result.extend_from_slice(&code_unit.to_le_bytes());
618 }
619 result
620 }
621 Encoding::Utf16Be => {
622 // Convert UTF-8 to UTF-16 BE (no BOM - added separately)
623 let text = String::from_utf8_lossy(utf8_bytes);
624 let mut result = Vec::new();
625 for code_unit in text.encode_utf16() {
626 result.extend_from_slice(&code_unit.to_be_bytes());
627 }
628 result
629 }
630 _ => {
631 // Use encoding_rs to convert from UTF-8
632 let enc_rs = encoding.to_encoding_rs();
633 let text = String::from_utf8_lossy(utf8_bytes);
634 let (cow, _encoding_used, _had_errors) = enc_rs.encode(&text);
635 cow.into_owned()
636 }
637 }
638}
639
640// ============================================================================
641// Tests
642// ============================================================================
643
644#[cfg(test)]
645mod tests {
646 use super::*;
647
    // Spot-check the short names surfaced in the status bar.
    #[test]
    fn test_encoding_display_names() {
        assert_eq!(Encoding::Utf8.display_name(), "UTF-8");
        assert_eq!(Encoding::Utf8Bom.display_name(), "UTF-8 BOM");
        assert_eq!(Encoding::Utf16Le.display_name(), "UTF-16 LE");
        assert_eq!(Encoding::Gb18030.display_name(), "GB18030");
        assert_eq!(Encoding::Windows1250.display_name(), "Windows-1250");
    }
656
    // Only the BOM-carrying encodings report has_bom().
    #[test]
    fn test_encoding_bom() {
        assert!(Encoding::Utf8Bom.has_bom());
        assert!(Encoding::Utf16Le.has_bom());
        assert!(!Encoding::Utf8.has_bom());
        assert!(!Encoding::Windows1252.has_bom());
        assert!(!Encoding::Windows1250.has_bom());
    }
665
    // Pure 7-bit content detects as ASCII; multi-byte sequences as UTF-8.
    #[test]
    fn test_detect_utf8() {
        assert_eq!(detect_encoding(b"Hello, world!"), Encoding::Ascii);
        assert_eq!(detect_encoding("Hello, 世界!".as_bytes()), Encoding::Utf8);
    }
671
    // An EF BB BF prefix forces Utf8Bom detection.
    #[test]
    fn test_detect_utf8_bom() {
        let with_bom = [0xEF, 0xBB, 0xBF, b'H', b'i'];
        assert_eq!(detect_encoding(&with_bom), Encoding::Utf8Bom);
    }
677
    // An FF FE prefix forces UTF-16 LE detection.
    #[test]
    fn test_detect_utf16_le() {
        let utf16_le_bom = [0xFF, 0xFE, b'H', 0x00, b'i', 0x00];
        assert_eq!(detect_encoding(&utf16_le_bom), Encoding::Utf16Le);
    }
683
    // Null bytes and low control characters flag the content as binary.
    #[test]
    fn test_detect_binary() {
        let binary_data = [0x00, 0x01, 0x02, 0x03];
        let (_, is_binary) = detect_encoding_or_binary(&binary_data, false);
        assert!(is_binary);
    }
690
    // The binary/text split for individual control bytes.
    #[test]
    fn test_is_binary_control_char() {
        // Binary control chars
        assert!(is_binary_control_char(0x00)); // NUL
        assert!(is_binary_control_char(0x01)); // SOH
        assert!(is_binary_control_char(0x02)); // STX
        assert!(is_binary_control_char(0x7F)); // DEL

        // Text control chars (allowed)
        assert!(!is_binary_control_char(0x09)); // Tab
        assert!(!is_binary_control_char(0x0A)); // LF
        assert!(!is_binary_control_char(0x0D)); // CR
        assert!(!is_binary_control_char(0x1B)); // ESC

        // Regular printable chars
        assert!(!is_binary_control_char(b'A'));
        assert!(!is_binary_control_char(b' '));
    }
709
    // UTF-8 content passes through load (detect_and_convert) and save
    // (convert_from_utf8) without any byte changes.
    #[test]
    fn test_convert_roundtrip_utf8() {
        let original = "Hello, 世界!";
        let bytes = original.as_bytes();

        let (encoding, utf8_content) = detect_and_convert(bytes);
        assert_eq!(encoding, Encoding::Utf8);
        assert_eq!(utf8_content, bytes);

        let back = convert_from_utf8(&utf8_content, encoding);
        assert_eq!(back, bytes);
    }
722
    // UTF-16 LE round trip: BOM is stripped on load and NOT re-added on save.
    #[test]
    fn test_convert_roundtrip_utf16le() {
        // UTF-16 LE with BOM: "Hi"
        let utf16_le = [0xFF, 0xFE, b'H', 0x00, b'i', 0x00];

        let (encoding, utf8_content) = detect_and_convert(&utf16_le);
        assert_eq!(encoding, Encoding::Utf16Le);
        assert_eq!(utf8_content, b"Hi");

        // Note: convert_from_utf8 doesn't add BOM, so result won't have BOM
        let back = convert_from_utf8(&utf8_content, encoding);
        assert_eq!(back, [b'H', 0x00, b'i', 0x00]);
    }
736
    // Exhaustive check of is_resynchronizable across all variants.
    #[test]
    fn test_encoding_resynchronizable() {
        // Self-synchronizing encodings (can find char boundaries from middle of file)
        assert!(Encoding::Utf8.is_resynchronizable());
        assert!(Encoding::Utf8Bom.is_resynchronizable());
        assert!(Encoding::Ascii.is_resynchronizable());
        assert!(Encoding::Latin1.is_resynchronizable());
        assert!(Encoding::Windows1252.is_resynchronizable());
        assert!(Encoding::Windows1250.is_resynchronizable());

        // UTF-16 is resynchronizable with proper alignment
        assert!(Encoding::Utf16Le.is_resynchronizable());
        assert!(Encoding::Utf16Be.is_resynchronizable());

        // Legacy CJK encodings are NOT resynchronizable
        // (second byte of double-byte char can equal a valid single-byte char)
        assert!(!Encoding::Gb18030.is_resynchronizable());
        assert!(!Encoding::Gbk.is_resynchronizable());
        assert!(!Encoding::ShiftJis.is_resynchronizable());
        assert!(!Encoding::EucKr.is_resynchronizable());
    }
758
    // Exhaustive check of alignment() across all variants.
    #[test]
    fn test_encoding_alignment() {
        // Single-byte encodings have alignment of 1
        assert_eq!(Encoding::Ascii.alignment(), Some(1));
        assert_eq!(Encoding::Latin1.alignment(), Some(1));
        assert_eq!(Encoding::Windows1252.alignment(), Some(1));
        assert_eq!(Encoding::Windows1250.alignment(), Some(1));
        assert_eq!(Encoding::Utf8.alignment(), Some(1));
        assert_eq!(Encoding::Utf8Bom.alignment(), Some(1));

        // UTF-16 requires 2-byte alignment
        assert_eq!(Encoding::Utf16Le.alignment(), Some(2));
        assert_eq!(Encoding::Utf16Be.alignment(), Some(2));

        // Non-resynchronizable encodings have no valid alignment
        assert_eq!(Encoding::Gb18030.alignment(), None);
        assert_eq!(Encoding::Gbk.alignment(), None);
        assert_eq!(Encoding::ShiftJis.alignment(), None);
        assert_eq!(Encoding::EucKr.alignment(), None);
    }
779
    // requires_full_file_load is the inverse of is_resynchronizable.
    #[test]
    fn test_requires_full_file_load() {
        // Encodings that can be streamed
        assert!(!Encoding::Utf8.requires_full_file_load());
        assert!(!Encoding::Ascii.requires_full_file_load());
        assert!(!Encoding::Latin1.requires_full_file_load());
        assert!(!Encoding::Windows1250.requires_full_file_load());
        assert!(!Encoding::Utf16Le.requires_full_file_load());

        // Encodings that require full loading
        assert!(Encoding::Gb18030.requires_full_file_load());
        assert!(Encoding::Gbk.requires_full_file_load());
        assert!(Encoding::ShiftJis.requires_full_file_load());
        assert!(Encoding::EucKr.requires_full_file_load());
    }
795
    // Windows-1250 bytes decode to the expected Polish characters and
    // re-encode to the original byte sequence.
    #[test]
    fn test_convert_roundtrip_windows1250() {
        // Windows-1250 encoded text with Central European characters
        // "Zażółć" in Windows-1250: Z(0x5A) a(0x61) ż(0xBF) ó(0xF3) ł(0xB3) ć(0xE6)
        let windows1250_bytes: &[u8] = &[0x5A, 0x61, 0xBF, 0xF3, 0xB3, 0xE6];

        // Convert to UTF-8
        let enc_rs = Encoding::Windows1250.to_encoding_rs();
        let (decoded, _had_errors) = enc_rs.decode_without_bom_handling(windows1250_bytes);
        let utf8_content = decoded.as_bytes();

        // The UTF-8 content should contain the Polish characters
        let utf8_str = std::str::from_utf8(utf8_content).unwrap();
        assert!(utf8_str.contains('ż'), "Should contain ż: {}", utf8_str);
        assert!(utf8_str.contains('ó'), "Should contain ó: {}", utf8_str);
        assert!(utf8_str.contains('ł'), "Should contain ł: {}", utf8_str);
        assert!(utf8_str.contains('ć'), "Should contain ć: {}", utf8_str);

        // Convert back to Windows-1250
        let back = convert_from_utf8(utf8_content, Encoding::Windows1250);
        assert_eq!(back, windows1250_bytes, "Round-trip should preserve bytes");
    }
818
    // The long-form description string for Windows-1250 (shown in UI lists).
    #[test]
    fn test_windows1250_description() {
        assert_eq!(
            Encoding::Windows1250.description(),
            "Windows-1250 / CP1250 – Central European"
        );
    }
826
    // Bytes that are undefined in Windows-1252 but valid in Windows-1250 are
    // definitive evidence for Windows-1250 detection.
    #[test]
    fn test_detect_windows1250_definitive_bytes() {
        // Bytes 0x8D (Ť), 0x8F (Ź), 0x9D (ť) are undefined in Windows-1252
        // but valid in Windows-1250, so they definitively indicate Windows-1250

        // Czech text with ť (0x9D): "měsťo" (city, archaic)
        let with_t_caron = [0x6D, 0x9D, 0x73, 0x74, 0x6F]; // mťsto
        assert_eq!(
            detect_encoding(&with_t_caron),
            Encoding::Windows1250,
            "Byte 0x9D (ť) should trigger Windows-1250 detection"
        );

        // Polish text with Ź (0x8F): "Źródło" (source)
        let with_z_acute_upper = [0x8F, 0x72, 0xF3, 0x64, 0xB3, 0x6F]; // Źródło
        assert_eq!(
            detect_encoding(&with_z_acute_upper),
            Encoding::Windows1250,
            "Byte 0x8F (Ź) should trigger Windows-1250 detection"
        );
    }
848
    // Multiple 0x80-0x9F range characters that are Polish letters in
    // Windows-1250 should tip detection toward Windows-1250.
    #[test]
    fn test_detect_windows1250_strong_indicators() {
        // Polish text with ś (0x9C) and Ś (0x8C) - strong indicators from 0x80-0x9F range
        let polish_text = [
            0x9C, 0x77, 0x69, 0x65, 0x74, 0x79, 0x20, // "świety "
            0x8C, 0x77, 0x69, 0x61, 0x74, // "Świat"
        ];
        assert_eq!(
            detect_encoding(&polish_text),
            Encoding::Windows1250,
            "Multiple Polish characters (ś, Ś) should trigger Windows-1250"
        );
    }
862
863 #[test]
864 fn test_detect_ambiguous_bytes_as_windows1252() {
865 // Bytes in 0xA0-0xFF range are ambiguous and should default to Windows-1252
866 // Polish "żółć" - ż(0xBF) ó(0xF3) ł(0xB3) ć(0xE6) - all ambiguous
867 let zolc = [0xBF, 0xF3, 0xB3, 0xE6];
868 assert_eq!(
869 detect_encoding(&zolc),
870 Encoding::Windows1252,
871 "Ambiguous bytes should default to Windows-1252"
872 );
873
874 // ą (0xB9) and ł (0xB3) could be ¹ and ³ in Windows-1252
875 let ambiguous = [
876 0x6D, 0xB9, 0x6B, 0x61, 0x20, // "mąka " or "m¹ka "
877 0x6D, 0xB3, 0x6F, 0x64, 0x79, // "młody" or "m³ody"
878 ];
879 assert_eq!(
880 detect_encoding(&ambiguous),
881 Encoding::Windows1252,
882 "Ambiguous Polish bytes should default to Windows-1252"
883 );
884 }
885
886 #[test]
887 fn test_detect_windows1250_czech_pangram() {
888 // "Příliš žluťoučký kůň úpěl ďábelské ódy" - Czech pangram in Windows-1250
889 // Contains ť (0x9D) which is a definitive Windows-1250 indicator
890 let czech_pangram: &[u8] = &[
891 0x50, 0xF8, 0xED, 0x6C, 0x69, 0x9A, 0x20, // "Příliš "
892 0x9E, 0x6C, 0x75, 0x9D, 0x6F, 0x75, 0xE8, 0x6B, 0xFD, 0x20, // "žluťoučký "
893 0x6B, 0xF9, 0xF2, 0x20, // "kůň "
894 0xFA, 0x70, 0xEC, 0x6C, 0x20, // "úpěl "
895 0xEF, 0xE1, 0x62, 0x65, 0x6C, 0x73, 0x6B, 0xE9, 0x20, // "ďábelské "
896 0xF3, 0x64, 0x79, // "ódy"
897 ];
898 assert_eq!(
899 detect_encoding(czech_pangram),
900 Encoding::Windows1250,
901 "Czech pangram should be detected as Windows-1250 (contains ť = 0x9D)"
902 );
903 }
904
905 #[test]
906 fn test_detect_windows1252_not_1250() {
907 // Pure Windows-1252 text without Central European indicators
908 // "Café résumé" in Windows-1252
909 let windows1252_text = [
910 0x43, 0x61, 0x66, 0xE9, 0x20, // "Café "
911 0x72, 0xE9, 0x73, 0x75, 0x6D, 0xE9, // "résumé"
912 ];
913 assert_eq!(
914 detect_encoding(&windows1252_text),
915 Encoding::Windows1252,
916 "French text should remain Windows-1252"
917 );
918 }
919
920 #[test]
921 fn test_detect_utf8_chinese_truncated_sequence() {
922 // Test that UTF-8 Chinese text is correctly detected even when the sample
923 // is truncated in the middle of a multi-byte sequence.
924 //
925 // Bug context: When sampling first 8KB for detection, the boundary may cut
926 // through a multi-byte UTF-8 character. This caused valid UTF-8 Chinese text
927 // to fail std::str::from_utf8() validation and fall through to Windows-1250
928 // detection (because UTF-8 continuation bytes like 0x9C, 0x9D overlap with
929 // Windows-1250 indicator bytes).
930
931 // Chinese text "更多" (more) = [0xE6, 0x9B, 0xB4, 0xE5, 0xA4, 0x9A]
932 // If we truncate after 0xE5, we get an incomplete sequence
933 let utf8_chinese_truncated = [
934 0xE6, 0x9B, 0xB4, // 更
935 0xE5, 0xA4, 0x9A, // 多
936 0xE5, // Start of another character, incomplete
937 ];
938
939 // With truncated=true, this should be detected as UTF-8
940 assert_eq!(
941 detect_encoding_or_binary(&utf8_chinese_truncated, true).0,
942 Encoding::Utf8,
943 "Truncated UTF-8 Chinese text should be detected as UTF-8"
944 );
945
946 // Without truncated flag, the incomplete trailing byte is treated as non-UTF-8
947 assert_ne!(
948 detect_encoding_or_binary(&utf8_chinese_truncated, false).0,
949 Encoding::Utf8,
950 "Non-truncated short sample with trailing 0xE5 should not be detected as UTF-8"
951 );
952
953 // Test with 2 bytes of incomplete sequence
954 let utf8_chinese_truncated_2 = [
955 0xE6, 0x9B, 0xB4, // 更
956 0xE5, 0xA4, 0x9A, // 多
957 0xE5, 0xA4, // Incomplete 3-byte sequence (missing last byte)
958 ];
959 assert_eq!(
960 detect_encoding_or_binary(&utf8_chinese_truncated_2, true).0,
961 Encoding::Utf8,
962 "Truncated UTF-8 with 2-byte incomplete sequence should be detected as UTF-8"
963 );
964 }
965
966 #[test]
967 fn test_detect_utf8_chinese_with_high_bytes() {
968 // UTF-8 Chinese text contains many continuation bytes in the 0x80-0xBF range,
969 // including bytes like 0x9C, 0x9D that happen to be Windows-1250 indicators.
970 // These should NOT trigger Windows-1250 detection for valid UTF-8 content.
971
972 // Chinese characters that use continuation bytes that overlap with Windows-1250 indicators:
973 // 集 = E9 9B 86 (contains 0x9B)
974 // 精 = E7 B2 BE (contains 0xB2, 0xBE)
975 // Build a string with many such characters
976 let chinese_text = "更多全本全集精校小说"; // Contains various high continuation bytes
977 let bytes = chinese_text.as_bytes();
978
979 assert_eq!(
980 detect_encoding(bytes),
981 Encoding::Utf8,
982 "UTF-8 Chinese text should be detected as UTF-8, not Windows-1250"
983 );
984
985 // Verify these bytes would have triggered Windows-1250 detection if not valid UTF-8
986 // by checking that the sample contains bytes in the 0x80-0x9F range
987 let has_high_continuation_bytes = bytes.iter().any(|&b| (0x80..0xA0).contains(&b));
988 assert!(
989 has_high_continuation_bytes,
990 "Test should include bytes that could be mistaken for Windows-1250 indicators"
991 );
992 }
993
994 #[test]
995 fn test_detect_utf8_sample_truncation_at_boundary() {
996 // Simulate what happens when we take an 8KB sample that ends mid-character
997 // by creating a buffer that's valid UTF-8 except for the last 1-3 bytes
998
999 // Build a large UTF-8 Chinese text buffer
1000 let chinese = "我的美女老师"; // "My Beautiful Teacher"
1001 let mut buffer = Vec::new();
1002 // Repeat to make it substantial
1003 for _ in 0..100 {
1004 buffer.extend_from_slice(chinese.as_bytes());
1005 }
1006
1007 // Verify it's valid UTF-8 when complete
1008 assert!(std::str::from_utf8(&buffer).is_ok());
1009 assert_eq!(detect_encoding(&buffer), Encoding::Utf8);
1010
1011 // Now truncate at various points that cut through multi-byte sequences
1012 // Each Chinese character is 3 bytes in UTF-8
1013 for truncate_offset in 1..=3 {
1014 let truncated_len = buffer.len() - truncate_offset;
1015 let truncated_buf = &buffer[..truncated_len];
1016
1017 // The truncated buffer should fail strict UTF-8 validation
1018 // (unless we happen to cut at a character boundary)
1019 let is_strict_valid = std::str::from_utf8(truncated_buf).is_ok();
1020
1021 // With truncated=true, our detection should still detect it as UTF-8
1022 let detected = detect_encoding_or_binary(truncated_buf, true).0;
1023 assert_eq!(
1024 detected,
1025 Encoding::Utf8,
1026 "Truncated UTF-8 at offset -{} should be detected as UTF-8, strict_valid={}",
1027 truncate_offset,
1028 is_strict_valid
1029 );
1030 }
1031 }
1032}