fresh-editor 0.1.99

A lightweight, fast terminal-based text editor with LSP support and TypeScript plugins
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
//! Text encoding detection and conversion
//!
//! This module handles:
//! - Detecting text encodings from byte content (UTF-8, UTF-16, Latin-1, CJK, etc.)
//! - Binary file detection (distinguishing text from binary content)
//! - Converting between encodings (normalizing to UTF-8 on load, converting back on save)
//!
//! # Encoding Detection Strategy
//!
//! 1. **BOM Detection**: Check for Byte Order Marks (UTF-8 BOM, UTF-16 LE/BE)
//! 2. **UTF-8 Validation**: Fast path for most modern files
//! 3. **UTF-16 Heuristics**: Detect UTF-16 without BOM via null byte patterns
//! 4. **Binary Detection**: Check for control characters that indicate binary content
//! 5. **Statistical Detection**: Use chardetng for legacy encoding detection
//! 6. **Fallback**: Default to Windows-1252 for ambiguous cases

use super::encoding_heuristics::has_windows1250_pattern;
use schemars::JsonSchema;
use serde::{Deserialize, Serialize};

// ============================================================================
// Encoding Type
// ============================================================================

/// Supported text encodings for file I/O
///
/// The editor internally uses UTF-8 for all text processing. When loading files,
/// content is converted from the detected encoding to UTF-8. When saving, content
/// is converted back to the original (or user-selected) encoding.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize, JsonSchema)]
pub enum Encoding {
    /// UTF-8 (default, most common)
    #[default]
    Utf8,
    /// UTF-8 with Byte Order Mark
    Utf8Bom,
    /// UTF-16 Little Endian (Windows default for Unicode files)
    Utf16Le,
    /// UTF-16 Big Endian
    Utf16Be,
    /// ASCII (7-bit, subset of UTF-8)
    Ascii,
    /// Latin-1 / ISO-8859-1 (Western European)
    Latin1,
    /// Windows-1252 / CP-1252 (Windows Western European, often called "ANSI")
    Windows1252,
    /// Windows-1250 / CP-1250 (Windows Central European)
    Windows1250,
    /// GB18030 (Chinese, superset of GBK)
    Gb18030,
    /// GBK (Chinese Simplified, subset of GB18030)
    Gbk,
    /// Shift-JIS (Japanese)
    ShiftJis,
    /// EUC-KR (Korean)
    EucKr,
}

impl Encoding {
    /// Get the display name for status bar
    pub fn display_name(&self) -> &'static str {
        match self {
            Self::Utf8 => "UTF-8",
            Self::Utf8Bom => "UTF-8 BOM",
            Self::Utf16Le => "UTF-16 LE",
            Self::Utf16Be => "UTF-16 BE",
            Self::Ascii => "ASCII",
            Self::Latin1 => "Latin-1",
            Self::Windows1252 => "Windows-1252",
            Self::Windows1250 => "Windows-1250",
            Self::Gb18030 => "GB18030",
            Self::Gbk => "GBK",
            Self::ShiftJis => "Shift-JIS",
            Self::EucKr => "EUC-KR",
        }
    }

    /// Get a longer description for UI (e.g., command palette)
    pub fn description(&self) -> &'static str {
        match self {
            Self::Utf8 => "UTF-8",
            Self::Utf8Bom => "UTF-8 with BOM",
            Self::Utf16Le => "UTF-16 Little Endian",
            Self::Utf16Be => "UTF-16 Big Endian",
            Self::Ascii => "US-ASCII",
            Self::Latin1 => "ISO-8859-1 / Latin-1 – Western European",
            Self::Windows1252 => "Windows-1252 / CP1252 – Western European",
            Self::Windows1250 => "Windows-1250 / CP1250 – Central European",
            Self::Gb18030 => "GB18030 – Chinese",
            Self::Gbk => "GBK / CP936 – Simplified Chinese",
            Self::ShiftJis => "Shift_JIS – Japanese",
            Self::EucKr => "EUC-KR – Korean",
        }
    }

    /// Get the encoding_rs Encoding for this type
    pub fn to_encoding_rs(&self) -> &'static encoding_rs::Encoding {
        match self {
            Self::Utf8 | Self::Utf8Bom | Self::Ascii => encoding_rs::UTF_8,
            Self::Utf16Le => encoding_rs::UTF_16LE,
            Self::Utf16Be => encoding_rs::UTF_16BE,
            Self::Latin1 => encoding_rs::WINDOWS_1252, // ISO-8859-1 maps to Windows-1252 per WHATWG
            Self::Windows1252 => encoding_rs::WINDOWS_1252,
            Self::Windows1250 => encoding_rs::WINDOWS_1250,
            Self::Gb18030 => encoding_rs::GB18030,
            Self::Gbk => encoding_rs::GBK,
            Self::ShiftJis => encoding_rs::SHIFT_JIS,
            Self::EucKr => encoding_rs::EUC_KR,
        }
    }

    /// Returns true if this encoding uses a BOM (Byte Order Mark)
    pub fn has_bom(&self) -> bool {
        matches!(self, Self::Utf8Bom | Self::Utf16Le | Self::Utf16Be)
    }

    /// Get the BOM bytes for this encoding (if any)
    pub fn bom_bytes(&self) -> Option<&'static [u8]> {
        match self {
            Self::Utf8Bom => Some(&[0xEF, 0xBB, 0xBF]),
            Self::Utf16Le => Some(&[0xFF, 0xFE]),
            Self::Utf16Be => Some(&[0xFE, 0xFF]),
            _ => None,
        }
    }

    /// All available encodings for UI display
    pub fn all() -> &'static [Encoding] {
        &[
            Self::Utf8,
            Self::Utf8Bom,
            Self::Utf16Le,
            Self::Utf16Be,
            Self::Ascii,
            Self::Latin1,
            Self::Windows1252,
            Self::Windows1250,
            Self::Gb18030,
            Self::Gbk,
            Self::ShiftJis,
            Self::EucKr,
        ]
    }

    /// Returns true if this encoding supports "resynchronization" - the ability to
    /// find character boundaries when jumping into the middle of a file.
    ///
    /// Resynchronizable encodings can be safely used with lazy/streaming file loading
    /// because you can determine character boundaries from any position.
    ///
    /// - **UTF-8**: Excellent - unique bit patterns distinguish lead/continuation bytes
    /// - **ASCII/Latin-1/Windows-1252**: Trivial - every byte is a character
    /// - **UTF-16**: Good with 2-byte alignment - can detect surrogate pairs
    /// - **UTF-32**: Good with 4-byte alignment
    ///
    /// Non-resynchronizable encodings (legacy CJK like Shift-JIS, GB18030, GBK, Big5)
    /// have ambiguous byte sequences where a byte could be either a standalone character
    /// or part of a multi-byte sequence. You must scan from the beginning to be certain.
    pub fn is_resynchronizable(&self) -> bool {
        match self {
            // Fixed-width single byte - every byte is a character
            Self::Ascii | Self::Latin1 | Self::Windows1252 | Self::Windows1250 => true,

            // UTF-8 has unique bit patterns for lead vs continuation bytes
            Self::Utf8 | Self::Utf8Bom => true,

            // UTF-16 is resynchronizable with 2-byte alignment
            // (can detect surrogate pairs by checking 0xD800-0xDFFF range)
            Self::Utf16Le | Self::Utf16Be => true,

            // Legacy CJK encodings are NOT resynchronizable
            // The second byte of a double-byte char can equal a valid single-byte char
            Self::Gb18030 | Self::Gbk | Self::ShiftJis | Self::EucKr => false,
        }
    }

    /// Returns the byte alignment required for this encoding when doing random access.
    ///
    /// For lazy loading of large files, reads must be aligned to this boundary.
    /// Returns None if the encoding is not resynchronizable (requires full file scan).
    pub fn alignment(&self) -> Option<usize> {
        match self {
            // Single-byte encodings - no alignment needed
            Self::Ascii | Self::Latin1 | Self::Windows1252 | Self::Windows1250 => Some(1),

            // UTF-8 - no alignment needed (self-synchronizing)
            Self::Utf8 | Self::Utf8Bom => Some(1),

            // UTF-16 - must be 2-byte aligned
            Self::Utf16Le | Self::Utf16Be => Some(2),

            // Legacy CJK - not resynchronizable, no valid alignment
            Self::Gb18030 | Self::Gbk | Self::ShiftJis | Self::EucKr => None,
        }
    }

    /// Returns true if this encoding requires the entire file to be loaded
    /// for correct decoding (cannot use lazy/streaming loading).
    ///
    /// This is the inverse of `is_resynchronizable()` and indicates that
    /// the user should be warned before loading large files in this encoding.
    pub fn requires_full_file_load(&self) -> bool {
        !self.is_resynchronizable()
    }
}

// ============================================================================
// Encoding Detection
// ============================================================================

/// Detect the text encoding from a sample of bytes
///
/// This function delegates to `detect_encoding_or_binary` and returns only
/// the encoding, ignoring the binary flag. Use `detect_encoding_or_binary`
/// when you need to know if the content should be treated as binary.
pub fn detect_encoding(bytes: &[u8]) -> Encoding {
    detect_encoding_or_binary(bytes).0
}

/// Detect the text encoding and whether content is binary.
///
/// Returns (Encoding, is_binary) where:
/// - Encoding is the detected encoding (or default if binary)
/// - is_binary is true if the content should be treated as raw binary
///
/// # Detection Strategy
///
/// 1. Check for BOM (Byte Order Mark) - highest priority, definitely not binary
/// 2. Try UTF-8 validation (fast path for most files), definitely not binary
/// 3. Check for UTF-16 patterns without BOM, definitely not binary
/// 4. Check for binary control characters (null bytes, etc.) - if found, it's binary
/// 5. Use chardetng for statistical detection of legacy encodings
/// 6. If encoding detection is uncertain, default to Windows-1252
pub fn detect_encoding_or_binary(bytes: &[u8]) -> (Encoding, bool) {
    // Only check the first 8KB for encoding detection
    let check_len = bytes.len().min(8 * 1024);
    let sample = &bytes[..check_len];

    // 1. Check for BOM (Byte Order Mark) - highest priority, definitely text
    if sample.starts_with(&[0xEF, 0xBB, 0xBF]) {
        return (Encoding::Utf8Bom, false);
    }
    if sample.starts_with(&[0xFF, 0xFE]) {
        // Could also be UTF-32 LE, but UTF-16 LE is much more common
        return (Encoding::Utf16Le, false);
    }
    if sample.starts_with(&[0xFE, 0xFF]) {
        return (Encoding::Utf16Be, false);
    }

    // 2. Try UTF-8 validation (fast path for most modern files)
    // Note: When we truncate to 8KB, we may cut in the middle of a multi-byte UTF-8 sequence.
    // We need to handle this case - if most of the sample is valid UTF-8 and the only error
    // is an incomplete sequence at the very end, we should still detect it as UTF-8.
    let utf8_valid_len = match std::str::from_utf8(sample) {
        Ok(_) => sample.len(),
        Err(e) => {
            // error_len() returns None if the error is due to incomplete sequence at end
            // (i.e., unexpected end of input), vs Some(n) for an invalid byte
            if e.error_len().is_none() {
                // Incomplete sequence at end - this is likely due to sample truncation
                e.valid_up_to()
            } else {
                // Invalid byte found - not valid UTF-8
                0
            }
        }
    };

    // If most of the sample is valid UTF-8 (at least 99% or all but the last few bytes),
    // treat it as UTF-8. The incomplete sequence at end is just due to sample truncation.
    if utf8_valid_len > 0 && (utf8_valid_len == sample.len() || utf8_valid_len >= sample.len() - 3)
    {
        let valid_sample = &sample[..utf8_valid_len];
        // Check if it's pure ASCII (subset of UTF-8)
        // Also check for binary indicators in valid ASCII/UTF-8
        let has_binary_control = valid_sample.iter().any(|&b| is_binary_control_char(b));
        if has_binary_control {
            return (Encoding::Utf8, true);
        }
        if valid_sample.iter().all(|&b| b < 128) {
            return (Encoding::Ascii, false);
        }
        return (Encoding::Utf8, false);
    }

    // 3. Check for UTF-16 without BOM (common in some Windows files)
    // Heuristic: Look for patterns of null bytes alternating with printable chars
    // The non-null byte should be printable (0x20-0x7E) or a valid high byte
    //
    // Note: Unlike UTF-8 above, this heuristic is robust to sample truncation because:
    // - We use statistical pattern matching (50% threshold), not strict validation
    // - chunks(2) naturally handles odd-length samples by dropping the last byte
    // - Losing 1 pair out of ~4096 doesn't affect the detection threshold
    if sample.len() >= 4 {
        let is_printable_or_high = |b: u8| (0x20..=0x7E).contains(&b) || b >= 0x80;

        // Align to even boundary to ensure we only process complete 2-byte pairs
        let aligned_len = sample.len() & !1; // Round down to even
        let aligned_sample = &sample[..aligned_len];

        let le_pairs = aligned_sample
            .chunks(2)
            .filter(|chunk| chunk[1] == 0 && is_printable_or_high(chunk[0]))
            .count();
        let be_pairs = aligned_sample
            .chunks(2)
            .filter(|chunk| chunk[0] == 0 && is_printable_or_high(chunk[1]))
            .count();
        let pair_count = aligned_len / 2;

        // If more than 50% of pairs look like valid UTF-16 text, it's text
        if le_pairs > pair_count / 2 {
            return (Encoding::Utf16Le, false);
        }
        if be_pairs > pair_count / 2 {
            return (Encoding::Utf16Be, false);
        }
    }

    // 4. Check for binary indicators EARLY (before chardetng)
    // Binary files often contain control characters and null bytes that should not
    // appear in any valid text encoding. Check this before chardetng because
    // chardetng might still be "confident" about some encoding for binary data.
    let has_binary_control = sample
        .iter()
        .any(|&b| b == 0x00 || is_binary_control_char(b));
    if has_binary_control {
        return (Encoding::Utf8, true);
    }

    // 5. Check for Latin-1 patterns: high bytes followed by invalid CJK trail bytes
    // In GB18030/GBK, trail bytes must be 0x40-0x7E or 0x80-0xFE
    // If a high byte is followed by a byte outside these ranges (e.g., space, newline,
    // punctuation < 0x40), it's likely Latin-1, not CJK
    let has_latin1_pattern = has_latin1_high_byte_pattern(sample);

    // Also check for bytes in CJK-only range (0x81-0x9F) which can only be CJK lead bytes
    let has_cjk_only_bytes = sample.iter().any(|&b| (0x81..0xA0).contains(&b));

    // 6. Use chardetng for statistical encoding detection
    let mut detector = chardetng::EncodingDetector::new();
    detector.feed(sample, true);
    let (detected_encoding, confident) = detector.guess_assess(None, true);

    // If chardetng is confident, use that encoding (not binary)
    if confident {
        let is_cjk_encoding = detected_encoding == encoding_rs::GB18030
            || detected_encoding == encoding_rs::GBK
            || detected_encoding == encoding_rs::SHIFT_JIS
            || detected_encoding == encoding_rs::EUC_KR;

        // For CJK encodings, prefer Windows-1252 if we have clear Latin-1 indicators:
        // - Space followed by high byte (0xA0-0xFF) is common in Latin-1 text
        //
        // If there are CJK-only bytes (0x81-0x9F), it's definitely CJK (not ambiguous).
        // If there are Latin-1 patterns (space + high byte), prefer Windows-1252.
        // Otherwise, trust chardetng's detection.
        if is_cjk_encoding && !has_cjk_only_bytes && has_latin1_pattern {
            return (Encoding::Windows1252, false);
        }

        // GBK is a subset of GB18030. Since we only inspect the first 8KB for
        // detection, the sample may not contain GB18030-only code points (uncommon
        // Chinese characters, emoji, etc.). Treating GBK as GB18030 is safer and
        // ensures proper display of all characters including French, Spanish, and emoji.
        let encoding =
            if detected_encoding == encoding_rs::GB18030 || detected_encoding == encoding_rs::GBK {
                Encoding::Gb18030
            } else if detected_encoding == encoding_rs::SHIFT_JIS {
                Encoding::ShiftJis
            } else if detected_encoding == encoding_rs::EUC_KR {
                Encoding::EucKr
            } else if detected_encoding == encoding_rs::WINDOWS_1252
                || detected_encoding == encoding_rs::WINDOWS_1250
            {
                // chardetng often returns Windows-1252 for Central European text
                // Check for Windows-1250 specific patterns
                if has_windows1250_pattern(sample) {
                    Encoding::Windows1250
                } else {
                    Encoding::Windows1252
                }
            } else if detected_encoding == encoding_rs::UTF_8 {
                // chardetng thinks it's UTF-8, but validation failed above
                // Could still be Windows-1250 if it has Central European patterns
                if has_windows1250_pattern(sample) {
                    Encoding::Windows1250
                } else {
                    Encoding::Windows1252
                }
            } else {
                // Unknown encoding - check for Windows-1250 patterns
                if has_windows1250_pattern(sample) {
                    Encoding::Windows1250
                } else {
                    Encoding::Windows1252
                }
            };
        return (encoding, false);
    }

    // 7. chardetng not confident, but no binary indicators - check for Windows-1250 patterns
    // We already checked for binary control chars earlier, so this is valid text
    if has_windows1250_pattern(sample) {
        (Encoding::Windows1250, false)
    } else {
        (Encoding::Windows1252, false)
    }
}

// ============================================================================
// Binary Detection Helpers
// ============================================================================

/// Check if a byte is a binary control character
///
/// Returns true for control characters that typically indicate binary content,
/// excluding common text control chars (tab, newline, CR, form feed, etc.)
pub fn is_binary_control_char(byte: u8) -> bool {
    if byte < 0x20 {
        // Allow common text control characters:
        // 0x09 = Tab, 0x0A = LF, 0x0D = CR, 0x0C = Form Feed, 0x0B = Vertical Tab, 0x1B = ESC
        !matches!(byte, 0x09 | 0x0A | 0x0D | 0x0C | 0x0B | 0x1B)
    } else if byte == 0x7F {
        // DEL character
        true
    } else {
        false
    }
}

/// Check if sample has Latin-1 patterns that cannot be valid CJK encoding
///
/// In GB18030/GBK, valid sequences are:
/// - ASCII bytes (0x00-0x7F) as standalone characters
/// - Lead byte (0x81-0xFE) + Trail byte (0x40-0x7E or 0x80-0xFE)
///
/// This function looks for patterns that indicate Latin-1:
/// 1. High bytes followed by invalid CJK trail bytes (space, newline, etc.)
/// 2. ASCII word followed by space followed by high byte (like "Hello é")
/// 3. High byte immediately after ASCII space (like " é")
fn has_latin1_high_byte_pattern(sample: &[u8]) -> bool {
    let mut latin1_indicators = 0;
    let mut i = 0;

    while i < sample.len() {
        let byte = sample[i];

        if byte < 0x80 {
            // ASCII byte
            // Check for pattern: space followed by high byte (0xA0-0xFF)
            // This is common in Latin-1 text like "Hello é" or "Café résumé"
            if byte == 0x20 && i + 1 < sample.len() {
                let next = sample[i + 1];
                // Space followed by Latin-1 extended char (not CJK-only lead byte)
                if next >= 0xA0 {
                    latin1_indicators += 1;
                }
            }
            i += 1;
            continue;
        }

        // High byte (0x80-0xFF) - could be Latin-1 or CJK lead byte
        if i + 1 < sample.len() {
            let next = sample[i + 1];

            // Check if this could be a valid CJK double-byte sequence
            let is_valid_cjk_lead = (0x81..=0xFE).contains(&byte);
            let is_valid_cjk_trail = (0x40..=0x7E).contains(&next) || (0x80..=0xFE).contains(&next);

            if is_valid_cjk_lead && is_valid_cjk_trail {
                // Valid CJK pair - skip both bytes
                i += 2;
                continue;
            }

            // Not a valid CJK pair - check for Latin-1 indicator
            // High byte followed by space, newline, or other low ASCII
            if byte >= 0xA0 && next < 0x40 {
                latin1_indicators += 1;
            }
        }

        i += 1;
    }

    // Latin-1 is likely if we have indicators
    latin1_indicators > 0
}

// ============================================================================
// Encoding Conversion
// ============================================================================

/// Detect encoding and convert bytes to UTF-8
///
/// Returns the detected encoding and the UTF-8 converted content.
/// This is the core function for normalizing file content to UTF-8 on load.
pub fn detect_and_convert(bytes: &[u8]) -> (Encoding, Vec<u8>) {
    if bytes.is_empty() {
        return (Encoding::Utf8, Vec::new());
    }

    let encoding = detect_encoding(bytes);

    // For UTF-8 (with or without BOM), we can use the content directly
    match encoding {
        Encoding::Utf8 | Encoding::Ascii => {
            // Already UTF-8, just clone
            (encoding, bytes.to_vec())
        }
        Encoding::Utf8Bom => {
            // Skip the BOM (3 bytes) and use the rest
            let content = if bytes.len() > 3 {
                bytes[3..].to_vec()
            } else {
                Vec::new()
            };
            (encoding, content)
        }
        Encoding::Utf16Le | Encoding::Utf16Be => {
            // Decode UTF-16 to UTF-8
            let enc_rs = encoding.to_encoding_rs();
            let start_offset =
                if bytes.starts_with(&[0xFF, 0xFE]) || bytes.starts_with(&[0xFE, 0xFF]) {
                    2 // Skip BOM
                } else {
                    0
                };
            let data = &bytes[start_offset..];

            let (cow, _had_errors) = enc_rs.decode_without_bom_handling(data);
            (encoding, cow.into_owned().into_bytes())
        }
        _ => {
            // Use encoding_rs to convert to UTF-8
            let enc_rs = encoding.to_encoding_rs();
            let (cow, _had_errors) = enc_rs.decode_without_bom_handling(bytes);
            (encoding, cow.into_owned().into_bytes())
        }
    }
}

/// Convert bytes from a specific encoding to UTF-8
///
/// Used when opening a file with a user-specified encoding instead of auto-detection.
/// Returns the UTF-8 converted content.
pub fn convert_to_utf8(bytes: &[u8], encoding: Encoding) -> Vec<u8> {
    if bytes.is_empty() {
        return Vec::new();
    }

    match encoding {
        Encoding::Utf8 | Encoding::Ascii => {
            // Already UTF-8, just clone
            bytes.to_vec()
        }
        Encoding::Utf8Bom => {
            // Skip the BOM (3 bytes) if present and use the rest
            if bytes.starts_with(&[0xEF, 0xBB, 0xBF]) && bytes.len() > 3 {
                bytes[3..].to_vec()
            } else {
                bytes.to_vec()
            }
        }
        Encoding::Utf16Le | Encoding::Utf16Be => {
            // Decode UTF-16 to UTF-8
            let enc_rs = encoding.to_encoding_rs();
            let start_offset =
                if bytes.starts_with(&[0xFF, 0xFE]) || bytes.starts_with(&[0xFE, 0xFF]) {
                    2 // Skip BOM
                } else {
                    0
                };
            let data = &bytes[start_offset..];

            let (cow, _had_errors) = enc_rs.decode_without_bom_handling(data);
            cow.into_owned().into_bytes()
        }
        _ => {
            // Use encoding_rs to convert to UTF-8
            let enc_rs = encoding.to_encoding_rs();
            let (cow, _had_errors) = enc_rs.decode_without_bom_handling(bytes);
            cow.into_owned().into_bytes()
        }
    }
}

/// Convert UTF-8 content to the specified encoding for saving
///
/// Used when saving files to convert internal UTF-8 representation
/// back to the original (or user-selected) encoding.
///
/// Note: This does NOT add BOM - the BOM should be handled separately.
pub fn convert_from_utf8(utf8_bytes: &[u8], encoding: Encoding) -> Vec<u8> {
    match encoding {
        Encoding::Utf8 | Encoding::Ascii | Encoding::Utf8Bom => {
            // UTF-8 (with or without BOM) - just clone, BOM added separately
            utf8_bytes.to_vec()
        }
        Encoding::Utf16Le => {
            // Convert UTF-8 to UTF-16 LE (no BOM - added separately)
            let text = String::from_utf8_lossy(utf8_bytes);
            let mut result = Vec::new();
            for code_unit in text.encode_utf16() {
                result.extend_from_slice(&code_unit.to_le_bytes());
            }
            result
        }
        Encoding::Utf16Be => {
            // Convert UTF-8 to UTF-16 BE (no BOM - added separately)
            let text = String::from_utf8_lossy(utf8_bytes);
            let mut result = Vec::new();
            for code_unit in text.encode_utf16() {
                result.extend_from_slice(&code_unit.to_be_bytes());
            }
            result
        }
        _ => {
            // Use encoding_rs to convert from UTF-8
            let enc_rs = encoding.to_encoding_rs();
            let text = String::from_utf8_lossy(utf8_bytes);
            let (cow, _encoding_used, _had_errors) = enc_rs.encode(&text);
            cow.into_owned()
        }
    }
}

// ============================================================================
// Tests
// ============================================================================

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_encoding_display_names() {
        assert_eq!(Encoding::Utf8.display_name(), "UTF-8");
        assert_eq!(Encoding::Utf8Bom.display_name(), "UTF-8 BOM");
        assert_eq!(Encoding::Utf16Le.display_name(), "UTF-16 LE");
        assert_eq!(Encoding::Gb18030.display_name(), "GB18030");
        assert_eq!(Encoding::Windows1250.display_name(), "Windows-1250");
    }

    #[test]
    fn test_encoding_bom() {
        assert!(Encoding::Utf8Bom.has_bom());
        assert!(Encoding::Utf16Le.has_bom());
        assert!(!Encoding::Utf8.has_bom());
        assert!(!Encoding::Windows1252.has_bom());
        assert!(!Encoding::Windows1250.has_bom());
    }

    #[test]
    fn test_detect_utf8() {
        assert_eq!(detect_encoding(b"Hello, world!"), Encoding::Ascii);
        assert_eq!(detect_encoding("Hello, 世界!".as_bytes()), Encoding::Utf8);
    }

    #[test]
    fn test_detect_utf8_bom() {
        let with_bom = [0xEF, 0xBB, 0xBF, b'H', b'i'];
        assert_eq!(detect_encoding(&with_bom), Encoding::Utf8Bom);
    }

    #[test]
    fn test_detect_utf16_le() {
        let utf16_le_bom = [0xFF, 0xFE, b'H', 0x00, b'i', 0x00];
        assert_eq!(detect_encoding(&utf16_le_bom), Encoding::Utf16Le);
    }

    #[test]
    fn test_detect_binary() {
        let binary_data = [0x00, 0x01, 0x02, 0x03];
        let (_, is_binary) = detect_encoding_or_binary(&binary_data);
        assert!(is_binary);
    }

    #[test]
    fn test_is_binary_control_char() {
        // Binary control chars
        assert!(is_binary_control_char(0x00)); // NUL
        assert!(is_binary_control_char(0x01)); // SOH
        assert!(is_binary_control_char(0x02)); // STX
        assert!(is_binary_control_char(0x7F)); // DEL

        // Text control chars (allowed)
        assert!(!is_binary_control_char(0x09)); // Tab
        assert!(!is_binary_control_char(0x0A)); // LF
        assert!(!is_binary_control_char(0x0D)); // CR
        assert!(!is_binary_control_char(0x1B)); // ESC

        // Regular printable chars
        assert!(!is_binary_control_char(b'A'));
        assert!(!is_binary_control_char(b' '));
    }

    #[test]
    fn test_convert_roundtrip_utf8() {
        let original = "Hello, 世界!";
        let bytes = original.as_bytes();

        let (encoding, utf8_content) = detect_and_convert(bytes);
        assert_eq!(encoding, Encoding::Utf8);
        assert_eq!(utf8_content, bytes);

        let back = convert_from_utf8(&utf8_content, encoding);
        assert_eq!(back, bytes);
    }

    #[test]
    fn test_convert_roundtrip_utf16le() {
        // UTF-16 LE with BOM: "Hi"
        let utf16_le = [0xFF, 0xFE, b'H', 0x00, b'i', 0x00];

        let (encoding, utf8_content) = detect_and_convert(&utf16_le);
        assert_eq!(encoding, Encoding::Utf16Le);
        assert_eq!(utf8_content, b"Hi");

        // Note: convert_from_utf8 doesn't add BOM, so result won't have BOM
        let back = convert_from_utf8(&utf8_content, encoding);
        assert_eq!(back, [b'H', 0x00, b'i', 0x00]);
    }

    #[test]
    fn test_encoding_resynchronizable() {
        // Self-synchronizing encodings (can find char boundaries from middle of file)
        assert!(Encoding::Utf8.is_resynchronizable());
        assert!(Encoding::Utf8Bom.is_resynchronizable());
        assert!(Encoding::Ascii.is_resynchronizable());
        assert!(Encoding::Latin1.is_resynchronizable());
        assert!(Encoding::Windows1252.is_resynchronizable());
        assert!(Encoding::Windows1250.is_resynchronizable());

        // UTF-16 is resynchronizable with proper alignment
        assert!(Encoding::Utf16Le.is_resynchronizable());
        assert!(Encoding::Utf16Be.is_resynchronizable());

        // Legacy CJK encodings are NOT resynchronizable
        // (second byte of double-byte char can equal a valid single-byte char)
        assert!(!Encoding::Gb18030.is_resynchronizable());
        assert!(!Encoding::Gbk.is_resynchronizable());
        assert!(!Encoding::ShiftJis.is_resynchronizable());
        assert!(!Encoding::EucKr.is_resynchronizable());
    }

    #[test]
    fn test_encoding_alignment() {
        // Single-byte encodings have alignment of 1
        assert_eq!(Encoding::Ascii.alignment(), Some(1));
        assert_eq!(Encoding::Latin1.alignment(), Some(1));
        assert_eq!(Encoding::Windows1252.alignment(), Some(1));
        assert_eq!(Encoding::Windows1250.alignment(), Some(1));
        assert_eq!(Encoding::Utf8.alignment(), Some(1));
        assert_eq!(Encoding::Utf8Bom.alignment(), Some(1));

        // UTF-16 requires 2-byte alignment
        assert_eq!(Encoding::Utf16Le.alignment(), Some(2));
        assert_eq!(Encoding::Utf16Be.alignment(), Some(2));

        // Non-resynchronizable encodings have no valid alignment
        assert_eq!(Encoding::Gb18030.alignment(), None);
        assert_eq!(Encoding::Gbk.alignment(), None);
        assert_eq!(Encoding::ShiftJis.alignment(), None);
        assert_eq!(Encoding::EucKr.alignment(), None);
    }

    #[test]
    fn test_requires_full_file_load() {
        // Encodings that can be streamed
        assert!(!Encoding::Utf8.requires_full_file_load());
        assert!(!Encoding::Ascii.requires_full_file_load());
        assert!(!Encoding::Latin1.requires_full_file_load());
        assert!(!Encoding::Windows1250.requires_full_file_load());
        assert!(!Encoding::Utf16Le.requires_full_file_load());

        // Encodings that require full loading
        assert!(Encoding::Gb18030.requires_full_file_load());
        assert!(Encoding::Gbk.requires_full_file_load());
        assert!(Encoding::ShiftJis.requires_full_file_load());
        assert!(Encoding::EucKr.requires_full_file_load());
    }

    #[test]
    fn test_convert_roundtrip_windows1250() {
        // Windows-1250 encoded text with Central European characters
        // "Zażółć" in Windows-1250: Z(0x5A) a(0x61) ż(0xBF) ó(0xF3) ł(0xB3) ć(0xE6)
        let windows1250_bytes: &[u8] = &[0x5A, 0x61, 0xBF, 0xF3, 0xB3, 0xE6];

        // Convert to UTF-8
        let enc_rs = Encoding::Windows1250.to_encoding_rs();
        let (decoded, _had_errors) = enc_rs.decode_without_bom_handling(windows1250_bytes);
        let utf8_content = decoded.as_bytes();

        // The UTF-8 content should contain the Polish characters
        let utf8_str = std::str::from_utf8(utf8_content).unwrap();
        assert!(utf8_str.contains('ż'), "Should contain ż: {}", utf8_str);
        assert!(utf8_str.contains('ó'), "Should contain ó: {}", utf8_str);
        assert!(utf8_str.contains('ł'), "Should contain ł: {}", utf8_str);
        assert!(utf8_str.contains('ć'), "Should contain ć: {}", utf8_str);

        // Convert back to Windows-1250
        let back = convert_from_utf8(utf8_content, Encoding::Windows1250);
        assert_eq!(back, windows1250_bytes, "Round-trip should preserve bytes");
    }

    #[test]
    fn test_windows1250_description() {
        assert_eq!(
            Encoding::Windows1250.description(),
            "Windows-1250 / CP1250 – Central European"
        );
    }

    #[test]
    fn test_detect_windows1250_definitive_bytes() {
        // Bytes 0x8D (Ť), 0x8F (Ź), 0x9D (ť) are undefined in Windows-1252
        // but valid in Windows-1250, so they definitively indicate Windows-1250

        // Czech text with ť (0x9D): "měsťo" (city, archaic)
        let with_t_caron = [0x6D, 0x9D, 0x73, 0x74, 0x6F]; // mťsto
        assert_eq!(
            detect_encoding(&with_t_caron),
            Encoding::Windows1250,
            "Byte 0x9D (ť) should trigger Windows-1250 detection"
        );

        // Polish text with Ź (0x8F): "Źródło" (source)
        let with_z_acute_upper = [0x8F, 0x72, 0xF3, 0x64, 0xB3, 0x6F]; // Źródło
        assert_eq!(
            detect_encoding(&with_z_acute_upper),
            Encoding::Windows1250,
            "Byte 0x8F (Ź) should trigger Windows-1250 detection"
        );
    }

    #[test]
    fn test_detect_windows1250_strong_indicators() {
        // Polish text with ś (0x9C) and Ś (0x8C) - strong indicators from 0x80-0x9F range
        let polish_text = [
            0x9C, 0x77, 0x69, 0x65, 0x74, 0x79, 0x20, // "świety "
            0x8C, 0x77, 0x69, 0x61, 0x74, // "Świat"
        ];
        assert_eq!(
            detect_encoding(&polish_text),
            Encoding::Windows1250,
            "Multiple Polish characters (ś, Ś) should trigger Windows-1250"
        );
    }

    #[test]
    fn test_detect_ambiguous_bytes_as_windows1252() {
        // Bytes in 0xA0-0xFF range are ambiguous and should default to Windows-1252
        // Polish "żółć" - ż(0xBF) ó(0xF3) ł(0xB3) ć(0xE6) - all ambiguous
        let zolc = [0xBF, 0xF3, 0xB3, 0xE6];
        assert_eq!(
            detect_encoding(&zolc),
            Encoding::Windows1252,
            "Ambiguous bytes should default to Windows-1252"
        );

        // ą (0xB9) and ł (0xB3) could be ¹ and ³ in Windows-1252
        let ambiguous = [
            0x6D, 0xB9, 0x6B, 0x61, 0x20, // "mąka " or "m¹ka "
            0x6D, 0xB3, 0x6F, 0x64, 0x79, // "młody" or "m³ody"
        ];
        assert_eq!(
            detect_encoding(&ambiguous),
            Encoding::Windows1252,
            "Ambiguous Polish bytes should default to Windows-1252"
        );
    }

    #[test]
    fn test_detect_windows1250_czech_pangram() {
        // "Příliš žluťoučký kůň úpěl ďábelské ódy" - Czech pangram in Windows-1250
        // Contains ť (0x9D) which is a definitive Windows-1250 indicator
        let czech_pangram: &[u8] = &[
            0x50, 0xF8, 0xED, 0x6C, 0x69, 0x9A, 0x20, // "Příliš "
            0x9E, 0x6C, 0x75, 0x9D, 0x6F, 0x75, 0xE8, 0x6B, 0xFD, 0x20, // "žluťoučký "
            0x6B, 0xF9, 0xF2, 0x20, // "kůň "
            0xFA, 0x70, 0xEC, 0x6C, 0x20, // "úpěl "
            0xEF, 0xE1, 0x62, 0x65, 0x6C, 0x73, 0x6B, 0xE9, 0x20, // "ďábelské "
            0xF3, 0x64, 0x79, // "ódy"
        ];
        assert_eq!(
            detect_encoding(czech_pangram),
            Encoding::Windows1250,
            "Czech pangram should be detected as Windows-1250 (contains ť = 0x9D)"
        );
    }

    #[test]
    fn test_detect_windows1252_not_1250() {
        // Pure Windows-1252 text without Central European indicators
        // "Café résumé" in Windows-1252
        let windows1252_text = [
            0x43, 0x61, 0x66, 0xE9, 0x20, // "Café "
            0x72, 0xE9, 0x73, 0x75, 0x6D, 0xE9, // "résumé"
        ];
        assert_eq!(
            detect_encoding(&windows1252_text),
            Encoding::Windows1252,
            "French text should remain Windows-1252"
        );
    }

    #[test]
    fn test_detect_utf8_chinese_truncated_sequence() {
        // Test that UTF-8 Chinese text is correctly detected even when the sample
        // is truncated in the middle of a multi-byte sequence.
        //
        // Bug context: When sampling first 8KB for detection, the boundary may cut
        // through a multi-byte UTF-8 character. This caused valid UTF-8 Chinese text
        // to fail std::str::from_utf8() validation and fall through to Windows-1250
        // detection (because UTF-8 continuation bytes like 0x9C, 0x9D overlap with
        // Windows-1250 indicator bytes).

        // Chinese text "更多" (more) = [0xE6, 0x9B, 0xB4, 0xE5, 0xA4, 0x9A]
        // If we truncate after 0xE5, we get an incomplete sequence
        let utf8_chinese_truncated = [
            0xE6, 0x9B, 0xB4, //            0xE5, 0xA4, 0x9A, //            0xE5, // Start of another character, incomplete
        ];

        // This should still be detected as UTF-8, not Windows-1250
        assert_eq!(
            detect_encoding(&utf8_chinese_truncated),
            Encoding::Utf8,
            "Truncated UTF-8 Chinese text should be detected as UTF-8"
        );

        // Test with 2 bytes of incomplete sequence
        let utf8_chinese_truncated_2 = [
            0xE6, 0x9B, 0xB4, //            0xE5, 0xA4, 0x9A, //            0xE5, 0xA4, // Incomplete 3-byte sequence (missing last byte)
        ];
        assert_eq!(
            detect_encoding(&utf8_chinese_truncated_2),
            Encoding::Utf8,
            "Truncated UTF-8 with 2-byte incomplete sequence should be detected as UTF-8"
        );
    }

    #[test]
    fn test_detect_utf8_chinese_with_high_bytes() {
        // UTF-8 Chinese text contains many continuation bytes in the 0x80-0xBF range,
        // including bytes like 0x9C, 0x9D that happen to be Windows-1250 indicators.
        // These should NOT trigger Windows-1250 detection for valid UTF-8 content.

        // Chinese characters that use continuation bytes that overlap with Windows-1250 indicators:
        // 集 = E9 9B 86 (contains 0x9B)
        // 精 = E7 B2 BE (contains 0xB2, 0xBE)
        // Build a string with many such characters
        let chinese_text = "更多全本全集精校小说"; // Contains various high continuation bytes
        let bytes = chinese_text.as_bytes();

        assert_eq!(
            detect_encoding(bytes),
            Encoding::Utf8,
            "UTF-8 Chinese text should be detected as UTF-8, not Windows-1250"
        );

        // Verify these bytes would have triggered Windows-1250 detection if not valid UTF-8
        // by checking that the sample contains bytes in the 0x80-0x9F range
        let has_high_continuation_bytes = bytes.iter().any(|&b| (0x80..0xA0).contains(&b));
        assert!(
            has_high_continuation_bytes,
            "Test should include bytes that could be mistaken for Windows-1250 indicators"
        );
    }

    #[test]
    fn test_detect_utf8_sample_truncation_at_boundary() {
        // Simulate what happens when we take an 8KB sample that ends mid-character
        // by creating a buffer that's valid UTF-8 except for the last 1-3 bytes

        // Build a large UTF-8 Chinese text buffer
        let chinese = "我的美女老师"; // "My Beautiful Teacher"
        let mut buffer = Vec::new();
        // Repeat to make it substantial
        for _ in 0..100 {
            buffer.extend_from_slice(chinese.as_bytes());
        }

        // Verify it's valid UTF-8 when complete
        assert!(std::str::from_utf8(&buffer).is_ok());
        assert_eq!(detect_encoding(&buffer), Encoding::Utf8);

        // Now truncate at various points that cut through multi-byte sequences
        // Each Chinese character is 3 bytes in UTF-8
        for truncate_offset in 1..=3 {
            let truncated_len = buffer.len() - truncate_offset;
            let truncated = &buffer[..truncated_len];

            // The truncated buffer should fail strict UTF-8 validation
            // (unless we happen to cut at a character boundary)
            let is_strict_valid = std::str::from_utf8(truncated).is_ok();

            // But our encoding detection should still detect it as UTF-8
            let detected = detect_encoding(truncated);
            assert_eq!(
                detected,
                Encoding::Utf8,
                "Truncated UTF-8 at offset -{} should be detected as UTF-8, strict_valid={}",
                truncate_offset,
                is_strict_valid
            );
        }
    }
}