subx_cli/core/formats/encoding/charset.rs
/// Character encodings recognised by the subtitle processing system.
///
/// The variants cover the text encodings most commonly encountered in
/// subtitle files across different languages and regions, plus an
/// [`Charset::Unknown`] fallback.
///
/// The enum is fieldless, so it derives `Copy` in addition to `Clone`:
/// values can be passed and stored by value without an explicit `.clone()`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Charset {
    /// UTF-8 (Unicode).
    Utf8,
    /// UTF-16, little-endian byte order.
    Utf16Le,
    /// UTF-16, big-endian byte order.
    Utf16Be,
    /// UTF-32, little-endian byte order.
    Utf32Le,
    /// UTF-32, big-endian byte order.
    Utf32Be,
    /// GBK (Simplified Chinese).
    Gbk,
    /// Shift JIS (Japanese).
    ShiftJis,
    /// ISO 8859-1, also known as Latin-1.
    Iso88591,
    /// Windows-1252 (Western European).
    Windows1252,
    /// Big5 (Traditional Chinese).
    Big5,
    /// EUC-KR (Korean).
    Euckr,
    /// The encoding is unknown or could not be detected.
    Unknown,
}
32
33/// Encoding detection result information
34#[derive(Debug, Clone)]
35pub struct EncodingInfo {
36 /// Detected character set
37 pub charset: Charset,
38 /// Detection confidence (0.0-1.0)
39 pub confidence: f32,
40 /// Whether BOM was detected
41 pub bom_detected: bool,
42 /// Decoded sample text
43 pub sample_text: String,
44}