subx_cli/core/formats/encoding/
charset.rs

/// Character encodings recognised by the subtitle processing system.
///
/// The variants cover the text encodings most commonly encountered in
/// subtitle files across different languages and regions.
/// [`Charset::Unknown`] stands in for an encoding that is unknown or could
/// not be detected.
///
/// The enum carries no payload, so it is `Copy`: values can be passed and
/// stored by value without an explicit `.clone()`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Charset {
    /// UTF-8 encoding (Unicode)
    Utf8,
    /// UTF-16 Little Endian encoding
    Utf16Le,
    /// UTF-16 Big Endian encoding
    Utf16Be,
    /// UTF-32 Little Endian encoding
    Utf32Le,
    /// UTF-32 Big Endian encoding
    Utf32Be,
    /// GBK encoding (Chinese Simplified)
    Gbk,
    /// Shift JIS encoding (Japanese)
    ShiftJis,
    /// ISO 8859-1 encoding (Latin-1)
    Iso88591,
    /// Windows-1252 encoding (Western European)
    Windows1252,
    /// Big5 encoding (Chinese Traditional)
    Big5,
    /// EUC-KR encoding (Korean)
    Euckr,
    /// Unknown or undetectable encoding
    Unknown,
}
32
33/// Encoding detection result information
34#[derive(Debug, Clone)]
35pub struct EncodingInfo {
36    /// Detected character set
37    pub charset: Charset,
38    /// Detection confidence (0.0-1.0)
39    pub confidence: f32,
40    /// Whether BOM was detected
41    pub bom_detected: bool,
42    /// Decoded sample text
43    pub sample_text: String,
44}