compact_enc_det/
lib.rs

1#![allow(non_camel_case_types)]
2
3use std::fmt;
4
/// Error reported when an `i32` does not correspond to any variant of the
/// enum it was being converted into.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct InvalidEnumValue {
    /// The integer that was rejected, exactly as it was supplied.
    pub raw: i32,
}

impl fmt::Display for InvalidEnumValue {
    /// Formats the error as `invalid enum value: <raw>`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Self { raw } = *self;
        write!(f, "invalid enum value: {}", raw)
    }
}

// No methods are overridden: the trait's defaults are used as-is.
impl std::error::Error for InvalidEnumValue {}
17
/// Declares a fieldless `#[repr(i32)]` enum with explicit discriminants and
/// generates its integer conversions.
///
/// The expansion consists of:
/// - the enum itself, deriving `Clone`, `Copy`, `Debug`, `Eq`, `Hash` and
///   `PartialEq`, with any attributes written on the declaration forwarded;
/// - an inherent `as_raw` `const fn` returning the variant's `i32` discriminant;
/// - a `TryFrom<i32>` impl that yields `InvalidEnumValue` for any integer that
///   is not one of the listed discriminants.
///
/// At least one variant is required, and every variant (including the last)
/// must be followed by a comma.
macro_rules! define_enum {
    (
        $(#[$attr:meta])*
        $vis:vis enum $name:ident {
            $(
                $member:ident = $disc:expr,
            )+
        }
    ) => {
        $(#[$attr])*
        #[repr(i32)]
        #[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
        $vis enum $name {
            $(
                $member = $disc,
            )+
        }

        impl $name {
            /// Returns the `i32` discriminant of this variant.
            pub const fn as_raw(self) -> i32 {
                self as i32
            }
        }

        impl TryFrom<i32> for $name {
            type Error = InvalidEnumValue;

            /// Maps a raw integer back to the variant declared with that
            /// discriminant, or reports the integer in an `InvalidEnumValue`.
            fn try_from(raw: i32) -> Result<Self, Self::Error> {
                let variant = match raw {
                    $(
                        $disc => $name::$member,
                    )+
                    // Anything not listed in the declaration is rejected.
                    _ => return Err(InvalidEnumValue { raw }),
                };
                Ok(variant)
            }
        }
    };
}
56
57define_enum! {
58    /// `util/encodings/encodings.pb.h`
59    pub enum Encoding {
60        ISO_8859_1 = 0,
61        ISO_8859_2 = 1,
62        ISO_8859_3 = 2,
63        ISO_8859_4 = 3,
64        ISO_8859_5 = 4,
65        ISO_8859_6 = 5,
66        ISO_8859_7 = 6,
67        ISO_8859_8 = 7,
68        ISO_8859_9 = 8,
69        ISO_8859_10 = 9,
70        JAPANESE_EUC_JP = 10,
71        JAPANESE_SHIFT_JIS = 11,
72        JAPANESE_JIS = 12,
73        CHINESE_BIG5 = 13,
74        CHINESE_GB = 14,
75        CHINESE_EUC_CN = 15,
76        KOREAN_EUC_KR = 16,
77        UNICODE = 17,
78        CHINESE_EUC_DEC = 18,
79        CHINESE_CNS = 19,
80        CHINESE_BIG5_CP950 = 20,
81        JAPANESE_CP932 = 21,
82        UTF8 = 22,
83        UNKNOWN_ENCODING = 23,
84        ASCII_7BIT = 24,
85        RUSSIAN_KOI8_R = 25,
86        RUSSIAN_CP1251 = 26,
87        MSFT_CP1252 = 27,
88        RUSSIAN_KOI8_RU = 28,
89        MSFT_CP1250 = 29,
90        ISO_8859_15 = 30,
91        MSFT_CP1254 = 31,
92        MSFT_CP1257 = 32,
93        ISO_8859_11 = 33,
94        MSFT_CP874 = 34,
95        MSFT_CP1256 = 35,
96        MSFT_CP1255 = 36,
97        ISO_8859_8_I = 37,
98        HEBREW_VISUAL = 38,
99        CZECH_CP852 = 39,
100        CZECH_CSN_369103 = 40,
101        MSFT_CP1253 = 41,
102        RUSSIAN_CP866 = 42,
103        ISO_8859_13 = 43,
104        ISO_2022_KR = 44,
105        GBK = 45,
106        GB18030 = 46,
107        BIG5_HKSCS = 47,
108        ISO_2022_CN = 48,
109        TSCII = 49,
110        TAMIL_MONO = 50,
111        TAMIL_BI = 51,
112        JAGRAN = 52,
113        MACINTOSH_ROMAN = 53,
114        UTF7 = 54,
115        BHASKAR = 55,
116        HTCHANAKYA = 56,
117        UTF16BE = 57,
118        UTF16LE = 58,
119        UTF32BE = 59,
120        UTF32LE = 60,
121        BINARYENC = 61,
122        HZ_GB_2312 = 62,
123        UTF8UTF8 = 63,
124        TAM_ELANGO = 64,
125        TAM_LTTMBARANI = 65,
126        TAM_SHREE = 66,
127        TAM_TBOOMIS = 67,
128        TAM_TMNEWS = 68,
129        TAM_WEBTAMIL = 69,
130        KDDI_SHIFT_JIS = 70,
131        DOCOMO_SHIFT_JIS = 71,
132        SOFTBANK_SHIFT_JIS = 72,
133        KDDI_ISO_2022_JP = 73,
134        SOFTBANK_ISO_2022_JP = 74,
135        NUM_ENCODINGS = 75,
136    }
137}
138
139define_enum! {
140    /// `util/languages/languages.pb.h`
141    pub enum Language {
142        ENGLISH = 0,
143        DANISH = 1,
144        DUTCH = 2,
145        FINNISH = 3,
146        FRENCH = 4,
147        GERMAN = 5,
148        HEBREW = 6,
149        ITALIAN = 7,
150        JAPANESE = 8,
151        KOREAN = 9,
152        NORWEGIAN = 10,
153        POLISH = 11,
154        PORTUGUESE = 12,
155        RUSSIAN = 13,
156        SPANISH = 14,
157        SWEDISH = 15,
158        CHINESE = 16,
159        CZECH = 17,
160        GREEK = 18,
161        ICELANDIC = 19,
162        LATVIAN = 20,
163        LITHUANIAN = 21,
164        ROMANIAN = 22,
165        HUNGARIAN = 23,
166        ESTONIAN = 24,
167        TG_UNKNOWN_LANGUAGE = 25,
168        UNKNOWN_LANGUAGE = 26,
169        BULGARIAN = 27,
170        CROATIAN = 28,
171        SERBIAN = 29,
172        IRISH = 30,
173        GALICIAN = 31,
174        TAGALOG = 32,
175        TURKISH = 33,
176        UKRAINIAN = 34,
177        HINDI = 35,
178        MACEDONIAN = 36,
179        BENGALI = 37,
180        INDONESIAN = 38,
181        LATIN = 39,
182        MALAY = 40,
183        MALAYALAM = 41,
184        WELSH = 42,
185        NEPALI = 43,
186        TELUGU = 44,
187        ALBANIAN = 45,
188        TAMIL = 46,
189        BELARUSIAN = 47,
190        JAVANESE = 48,
191        OCCITAN = 49,
192        URDU = 50,
193        BIHARI = 51,
194        GUJARATI = 52,
195        THAI = 53,
196        ARABIC = 54,
197        CATALAN = 55,
198        ESPERANTO = 56,
199        BASQUE = 57,
200        INTERLINGUA = 58,
201        KANNADA = 59,
202        PUNJABI = 60,
203        SCOTS_GAELIC = 61,
204        SWAHILI = 62,
205        SLOVENIAN = 63,
206        MARATHI = 64,
207        MALTESE = 65,
208        VIETNAMESE = 66,
209        FRISIAN = 67,
210        SLOVAK = 68,
211        CHINESE_T = 69,
212        FAROESE = 70,
213        SUNDANESE = 71,
214        UZBEK = 72,
215        AMHARIC = 73,
216        AZERBAIJANI = 74,
217        GEORGIAN = 75,
218        TIGRINYA = 76,
219        PERSIAN = 77,
220        BOSNIAN = 78,
221        SINHALESE = 79,
222        NORWEGIAN_N = 80,
223        PORTUGUESE_P = 81,
224        PORTUGUESE_B = 82,
225        XHOSA = 83,
226        ZULU = 84,
227        GUARANI = 85,
228        SESOTHO = 86,
229        TURKMEN = 87,
230        KYRGYZ = 88,
231        BRETON = 89,
232        TWI = 90,
233        YIDDISH = 91,
234        SERBO_CROATIAN = 92,
235        SOMALI = 93,
236        UIGHUR = 94,
237        KURDISH = 95,
238        MONGOLIAN = 96,
239        ARMENIAN = 97,
240        LAOTHIAN = 98,
241        SINDHI = 99,
242        RHAETO_ROMANCE = 100,
243        AFRIKAANS = 101,
244        LUXEMBOURGISH = 102,
245        BURMESE = 103,
246        KHMER = 104,
247        TIBETAN = 105,
248        DHIVEHI = 106,
249        CHEROKEE = 107,
250        SYRIAC = 108,
251        LIMBU = 109,
252        ORIYA = 110,
253        ASSAMESE = 111,
254        CORSICAN = 112,
255        INTERLINGUE = 113,
256        KAZAKH = 114,
257        LINGALA = 115,
258        MOLDAVIAN = 116,
259        PASHTO = 117,
260        QUECHUA = 118,
261        SHONA = 119,
262        TAJIK = 120,
263        TATAR = 121,
264        TONGA = 122,
265        YORUBA = 123,
266        CREOLES_AND_PIDGINS_ENGLISH_BASED = 124,
267        CREOLES_AND_PIDGINS_FRENCH_BASED = 125,
268        CREOLES_AND_PIDGINS_PORTUGUESE_BASED = 126,
269        CREOLES_AND_PIDGINS_OTHER = 127,
270        MAORI = 128,
271        WOLOF = 129,
272        ABKHAZIAN = 130,
273        AFAR = 131,
274        AYMARA = 132,
275        BASHKIR = 133,
276        BISLAMA = 134,
277        DZONGKHA = 135,
278        FIJIAN = 136,
279        GREENLANDIC = 137,
280        HAUSA = 138,
281        HAITIAN_CREOLE = 139,
282        INUPIAK = 140,
283        INUKTITUT = 141,
284        KASHMIRI = 142,
285        KINYARWANDA = 143,
286        MALAGASY = 144,
287        NAURU = 145,
288        OROMO = 146,
289        RUNDI = 147,
290        SAMOAN = 148,
291        SANGO = 149,
292        SANSKRIT = 150,
293        SISWANT = 151,
294        TSONGA = 152,
295        TSWANA = 153,
296        VOLAPUK = 154,
297        ZHUANG = 155,
298        KHASI = 156,
299        SCOTS = 157,
300        GANDA = 158,
301        MANX = 159,
302        MONTENEGRIN = 160,
303        NUM_LANGUAGES = 161,
304    }
305}
306
307define_enum! {
308    /// `CompactEncDet::TextCorpusType` 
309    pub enum TextCorpusType {
310        WEB_CORPUS = 0,
311        XML_CORPUS = 1,
312        QUERY_CORPUS = 2,
313        EMAIL_CORPUS = 3,
314        NUM_CORPA = 4,
315    }
316}
317
318#[derive(Debug, Clone, PartialEq, Eq)]
319pub struct Detection {
320    pub mime_name: String,
321    pub encoding: Encoding,
322    pub bytes_consumed: usize,
323    pub is_reliable: bool,
324}
325
326#[derive(Debug, Clone)]
327pub struct DetectHints<'a> {
328    pub url_hint: &'a str,
329    pub http_charset_hint: &'a str,
330    pub meta_charset_hint: &'a str,
331    pub encoding_hint: Option<Encoding>,
332    pub language_hint: Option<Language>,
333    pub corpus_type: TextCorpusType,
334    pub ignore_7bit_mail_encodings: bool,
335}
336
337impl<'a> Default for DetectHints<'a> {
338    fn default() -> Self {
339        Self {
340            url_hint: "",
341            http_charset_hint: "",
342            meta_charset_hint: "",
343            encoding_hint: None,
344            language_hint: None,
345            corpus_type: TextCorpusType::QUERY_CORPUS,
346            ignore_7bit_mail_encodings: true,
347        }
348    }
349}
350
351pub fn detect_encoding(bytes: &[u8], hints: DetectHints<'_>) -> Detection {
352    let encoding_hint = hints.encoding_hint.map(|e| e.as_raw()).unwrap_or(-1);
353    let language_hint = hints.language_hint.map(|l| l.as_raw()).unwrap_or(-1);
354
355    let result = compact_enc_det_sys::ced_detect_encoding(
356        bytes,
357        hints.url_hint,
358        hints.http_charset_hint,
359        hints.meta_charset_hint,
360        encoding_hint,
361        language_hint,
362        hints.corpus_type.as_raw(),
363        hints.ignore_7bit_mail_encodings,
364    );
365
366    Detection {
367        mime_name: result.mime_name,
368        encoding: Encoding::try_from(result.encoding).unwrap_or(Encoding::UNKNOWN_ENCODING),
369        bytes_consumed: result.bytes_consumed as usize,
370        is_reliable: result.is_reliable,
371    }
372}
373
#[cfg(test)]
mod tests {
    use super::*;

    /// Repeated text mixing ASCII with multi-byte UTF-8 characters, run with
    /// default hints, must come back as a reliable UTF-8 detection that
    /// consumed at least one byte.
    #[test]
    fn detect_utf8_plain_text() {
        let sample = "Rust makes FFI safer. 编码检测 UTF-8 mixed text.".repeat(50);
        let input = sample.as_bytes();

        let outcome = detect_encoding(input, DetectHints::default());

        assert_eq!(outcome.encoding, Encoding::UTF8);
        assert!(outcome.is_reliable);
        assert!(outcome.bytes_consumed > 0);
    }
}