1#![allow(non_camel_case_types)]
2
3use std::fmt;
4
/// Error produced when a raw `i32` does not correspond to any variant of an
/// enum generated by `define_enum!`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct InvalidEnumValue {
    /// The integer that failed to convert.
    pub raw: i32,
}

impl fmt::Display for InvalidEnumValue {
    /// Renders the error as `invalid enum value: <raw>`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Self { raw } = *self;
        f.write_fmt(format_args!("invalid enum value: {}", raw))
    }
}

// No underlying source error: the default `Error` methods are sufficient.
impl std::error::Error for InvalidEnumValue {}
17
/// Declares a `#[repr(i32)]` enum together with conversions to and from its
/// raw integer value.
///
/// Each invocation expands to:
/// * the enum itself, deriving `Copy`, `Clone`, `Debug`, `PartialEq`, `Eq`
///   and `Hash` (plus any attributes written on the enum);
/// * an inherent `const fn as_raw(self) -> i32`;
/// * `impl TryFrom<i32>` whose error type is `InvalidEnumValue`, returned for
///   any integer that equals no variant's discriminant.
///
/// Every variant needs an explicit discriminant, which may be any constant
/// expression valid as an enum discriminant. Variants may carry attributes
/// (including doc comments), and the final trailing comma is optional.
macro_rules! define_enum {
    (
        $(#[$meta:meta])*
        $vis:vis enum $name:ident {
            $(
                $(#[$variant_meta:meta])*
                $variant:ident = $value:expr
            ),+
            $(,)?
        }
    ) => {
        $(#[$meta])*
        #[repr(i32)]
        #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
        $vis enum $name {
            $(
                $(#[$variant_meta])*
                $variant = $value,
            )+
        }

        impl $name {
            /// Returns the raw `i32` discriminant of this variant.
            pub const fn as_raw(self) -> i32 {
                self as i32
            }
        }

        // Fully qualified so the expansion does not depend on which prelude
        // the invoking crate's edition provides.
        impl ::std::convert::TryFrom<i32> for $name {
            type Error = InvalidEnumValue;

            fn try_from(value: i32) -> ::std::result::Result<Self, Self::Error> {
                match value {
                    $(
                        // Compare against the variant's own discriminant
                        // instead of splicing `$value` into pattern position:
                        // an `expr` fragment is only accepted as a pattern
                        // when it is a literal, which would rule out
                        // discriminants such as `1 << 2`.
                        raw if raw == Self::$variant as i32 => Ok(Self::$variant),
                    )+
                    _ => Err(InvalidEnumValue { raw: value }),
                }
            }
        }
    };
}
56
define_enum! {
    /// Character encodings, identified by their raw `i32` value.
    ///
    /// `detect_encoding` passes these raw values to `compact_enc_det_sys` as
    /// the encoding hint and converts the detector's raw result back into this
    /// enum. The numbering presumably mirrors the upstream Compact Encoding
    /// Detection `Encoding` enum, so values must not be renumbered — confirm
    /// against the sys crate before changing anything here.
    pub enum Encoding {
        ISO_8859_1 = 0,
        ISO_8859_2 = 1,
        ISO_8859_3 = 2,
        ISO_8859_4 = 3,
        ISO_8859_5 = 4,
        ISO_8859_6 = 5,
        ISO_8859_7 = 6,
        ISO_8859_8 = 7,
        ISO_8859_9 = 8,
        ISO_8859_10 = 9,
        JAPANESE_EUC_JP = 10,
        JAPANESE_SHIFT_JIS = 11,
        JAPANESE_JIS = 12,
        CHINESE_BIG5 = 13,
        CHINESE_GB = 14,
        CHINESE_EUC_CN = 15,
        KOREAN_EUC_KR = 16,
        UNICODE = 17,
        CHINESE_EUC_DEC = 18,
        CHINESE_CNS = 19,
        CHINESE_BIG5_CP950 = 20,
        JAPANESE_CP932 = 21,
        UTF8 = 22,
        // Used by `detect_encoding` as the fallback when the detector returns
        // a raw value that matches no variant.
        UNKNOWN_ENCODING = 23,
        ASCII_7BIT = 24,
        RUSSIAN_KOI8_R = 25,
        RUSSIAN_CP1251 = 26,
        MSFT_CP1252 = 27,
        RUSSIAN_KOI8_RU = 28,
        MSFT_CP1250 = 29,
        ISO_8859_15 = 30,
        MSFT_CP1254 = 31,
        MSFT_CP1257 = 32,
        ISO_8859_11 = 33,
        MSFT_CP874 = 34,
        MSFT_CP1256 = 35,
        MSFT_CP1255 = 36,
        ISO_8859_8_I = 37,
        HEBREW_VISUAL = 38,
        CZECH_CP852 = 39,
        CZECH_CSN_369103 = 40,
        MSFT_CP1253 = 41,
        RUSSIAN_CP866 = 42,
        ISO_8859_13 = 43,
        ISO_2022_KR = 44,
        GBK = 45,
        GB18030 = 46,
        BIG5_HKSCS = 47,
        ISO_2022_CN = 48,
        TSCII = 49,
        TAMIL_MONO = 50,
        TAMIL_BI = 51,
        JAGRAN = 52,
        MACINTOSH_ROMAN = 53,
        UTF7 = 54,
        BHASKAR = 55,
        HTCHANAKYA = 56,
        UTF16BE = 57,
        UTF16LE = 58,
        UTF32BE = 59,
        UTF32LE = 60,
        BINARYENC = 61,
        HZ_GB_2312 = 62,
        UTF8UTF8 = 63,
        TAM_ELANGO = 64,
        TAM_LTTMBARANI = 65,
        TAM_SHREE = 66,
        TAM_TBOOMIS = 67,
        TAM_TMNEWS = 68,
        TAM_WEBTAMIL = 69,
        KDDI_SHIFT_JIS = 70,
        DOCOMO_SHIFT_JIS = 71,
        SOFTBANK_SHIFT_JIS = 72,
        KDDI_ISO_2022_JP = 73,
        SOFTBANK_ISO_2022_JP = 74,
        // Its value equals the number of variants listed before it
        // (presumably a count sentinel rather than a real encoding — confirm).
        NUM_ENCODINGS = 75,
    }
}
138
define_enum! {
    /// Natural languages, identified by their raw `i32` value.
    ///
    /// `detect_encoding` passes the raw value to `compact_enc_det_sys` as the
    /// language hint. The numbering presumably mirrors the upstream Compact
    /// Encoding Detection `Language` enum, so values must not be renumbered —
    /// confirm against the sys crate before changing anything here.
    pub enum Language {
        ENGLISH = 0,
        DANISH = 1,
        DUTCH = 2,
        FINNISH = 3,
        FRENCH = 4,
        GERMAN = 5,
        HEBREW = 6,
        ITALIAN = 7,
        JAPANESE = 8,
        KOREAN = 9,
        NORWEGIAN = 10,
        POLISH = 11,
        PORTUGUESE = 12,
        RUSSIAN = 13,
        SPANISH = 14,
        SWEDISH = 15,
        CHINESE = 16,
        CZECH = 17,
        GREEK = 18,
        ICELANDIC = 19,
        LATVIAN = 20,
        LITHUANIAN = 21,
        ROMANIAN = 22,
        HUNGARIAN = 23,
        ESTONIAN = 24,
        TG_UNKNOWN_LANGUAGE = 25,
        UNKNOWN_LANGUAGE = 26,
        BULGARIAN = 27,
        CROATIAN = 28,
        SERBIAN = 29,
        IRISH = 30,
        GALICIAN = 31,
        TAGALOG = 32,
        TURKISH = 33,
        UKRAINIAN = 34,
        HINDI = 35,
        MACEDONIAN = 36,
        BENGALI = 37,
        INDONESIAN = 38,
        LATIN = 39,
        MALAY = 40,
        MALAYALAM = 41,
        WELSH = 42,
        NEPALI = 43,
        TELUGU = 44,
        ALBANIAN = 45,
        TAMIL = 46,
        BELARUSIAN = 47,
        JAVANESE = 48,
        OCCITAN = 49,
        URDU = 50,
        BIHARI = 51,
        GUJARATI = 52,
        THAI = 53,
        ARABIC = 54,
        CATALAN = 55,
        ESPERANTO = 56,
        BASQUE = 57,
        INTERLINGUA = 58,
        KANNADA = 59,
        PUNJABI = 60,
        SCOTS_GAELIC = 61,
        SWAHILI = 62,
        SLOVENIAN = 63,
        MARATHI = 64,
        MALTESE = 65,
        VIETNAMESE = 66,
        FRISIAN = 67,
        SLOVAK = 68,
        CHINESE_T = 69,
        FAROESE = 70,
        SUNDANESE = 71,
        UZBEK = 72,
        AMHARIC = 73,
        AZERBAIJANI = 74,
        GEORGIAN = 75,
        TIGRINYA = 76,
        PERSIAN = 77,
        BOSNIAN = 78,
        SINHALESE = 79,
        NORWEGIAN_N = 80,
        PORTUGUESE_P = 81,
        PORTUGUESE_B = 82,
        XHOSA = 83,
        ZULU = 84,
        GUARANI = 85,
        SESOTHO = 86,
        TURKMEN = 87,
        KYRGYZ = 88,
        BRETON = 89,
        TWI = 90,
        YIDDISH = 91,
        SERBO_CROATIAN = 92,
        SOMALI = 93,
        UIGHUR = 94,
        KURDISH = 95,
        MONGOLIAN = 96,
        ARMENIAN = 97,
        LAOTHIAN = 98,
        SINDHI = 99,
        RHAETO_ROMANCE = 100,
        AFRIKAANS = 101,
        LUXEMBOURGISH = 102,
        BURMESE = 103,
        KHMER = 104,
        TIBETAN = 105,
        DHIVEHI = 106,
        CHEROKEE = 107,
        SYRIAC = 108,
        LIMBU = 109,
        ORIYA = 110,
        ASSAMESE = 111,
        CORSICAN = 112,
        INTERLINGUE = 113,
        KAZAKH = 114,
        LINGALA = 115,
        MOLDAVIAN = 116,
        PASHTO = 117,
        QUECHUA = 118,
        SHONA = 119,
        TAJIK = 120,
        TATAR = 121,
        TONGA = 122,
        YORUBA = 123,
        CREOLES_AND_PIDGINS_ENGLISH_BASED = 124,
        CREOLES_AND_PIDGINS_FRENCH_BASED = 125,
        CREOLES_AND_PIDGINS_PORTUGUESE_BASED = 126,
        CREOLES_AND_PIDGINS_OTHER = 127,
        MAORI = 128,
        WOLOF = 129,
        ABKHAZIAN = 130,
        AFAR = 131,
        AYMARA = 132,
        BASHKIR = 133,
        BISLAMA = 134,
        DZONGKHA = 135,
        FIJIAN = 136,
        GREENLANDIC = 137,
        HAUSA = 138,
        HAITIAN_CREOLE = 139,
        INUPIAK = 140,
        INUKTITUT = 141,
        KASHMIRI = 142,
        KINYARWANDA = 143,
        MALAGASY = 144,
        NAURU = 145,
        OROMO = 146,
        RUNDI = 147,
        SAMOAN = 148,
        SANGO = 149,
        SANSKRIT = 150,
        // NOTE(review): spelling kept as-is; it is part of the public API and
        // presumably matches the upstream identifier — confirm before renaming.
        SISWANT = 151,
        TSONGA = 152,
        TSWANA = 153,
        VOLAPUK = 154,
        ZHUANG = 155,
        KHASI = 156,
        SCOTS = 157,
        GANDA = 158,
        MANX = 159,
        MONTENEGRIN = 160,
        // Its value equals the number of variants listed before it
        // (presumably a count sentinel rather than a real language — confirm).
        NUM_LANGUAGES = 161,
    }
}
306
define_enum! {
    /// Kind of text being analysed, passed to the native detector as a raw
    /// `i32` by `detect_encoding`.
    ///
    /// `DetectHints::default()` selects `QUERY_CORPUS`.
    pub enum TextCorpusType {
        WEB_CORPUS = 0,
        XML_CORPUS = 1,
        QUERY_CORPUS = 2,
        EMAIL_CORPUS = 3,
        // Its value equals the number of variants listed before it
        // (presumably a count sentinel rather than a real corpus type — confirm).
        NUM_CORPA = 4,
    }
}
317
/// Result returned by `detect_encoding`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Detection {
    /// Name string copied from the native detector's result
    /// (presumably the MIME charset name of the detected encoding — confirm).
    pub mime_name: String,
    /// Detected encoding; `Encoding::UNKNOWN_ENCODING` when the detector's raw
    /// value matches no `Encoding` variant.
    pub encoding: Encoding,
    /// Byte count reported by the native detector
    /// (presumably how much of the input it examined — confirm).
    pub bytes_consumed: usize,
    /// Reliability flag reported by the native detector.
    pub is_reliable: bool,
}
325
/// Optional side information forwarded to the native detector by
/// `detect_encoding`.
///
/// `DetectHints::default()` supplies empty string hints, no encoding or
/// language hint, `TextCorpusType::QUERY_CORPUS`, and
/// `ignore_7bit_mail_encodings = true`.
#[derive(Debug, Clone)]
pub struct DetectHints<'a> {
    /// URL hint string; empty by default.
    pub url_hint: &'a str,
    /// HTTP charset hint string; empty by default.
    pub http_charset_hint: &'a str,
    /// Meta charset hint string; empty by default.
    pub meta_charset_hint: &'a str,
    /// Encoding hint, or `None` when there is none.
    pub encoding_hint: Option<Encoding>,
    /// Language hint, or `None` when there is none.
    pub language_hint: Option<Language>,
    /// Kind of text being analysed.
    pub corpus_type: TextCorpusType,
    /// Forwarded unchanged to the native detector
    /// (presumably excludes 7-bit mail encodings from the candidates — confirm).
    pub ignore_7bit_mail_encodings: bool,
}
336
337impl<'a> Default for DetectHints<'a> {
338 fn default() -> Self {
339 Self {
340 url_hint: "",
341 http_charset_hint: "",
342 meta_charset_hint: "",
343 encoding_hint: None,
344 language_hint: None,
345 corpus_type: TextCorpusType::QUERY_CORPUS,
346 ignore_7bit_mail_encodings: true,
347 }
348 }
349}
350
351pub fn detect_encoding(bytes: &[u8], hints: DetectHints<'_>) -> Detection {
352 let encoding_hint = hints.encoding_hint.map(|e| e.as_raw()).unwrap_or(-1);
353 let language_hint = hints.language_hint.map(|l| l.as_raw()).unwrap_or(-1);
354
355 let result = compact_enc_det_sys::ced_detect_encoding(
356 bytes,
357 hints.url_hint,
358 hints.http_charset_hint,
359 hints.meta_charset_hint,
360 encoding_hint,
361 language_hint,
362 hints.corpus_type.as_raw(),
363 hints.ignore_7bit_mail_encodings,
364 );
365
366 Detection {
367 mime_name: result.mime_name,
368 encoding: Encoding::try_from(result.encoding).unwrap_or(Encoding::UNKNOWN_ENCODING),
369 bytes_consumed: result.bytes_consumed as usize,
370 is_reliable: result.is_reliable,
371 }
372}
373
#[cfg(test)]
mod tests {
    use super::*;

    /// A repeated sentence mixing ASCII with multi-byte CJK characters must be
    /// reported as reliable UTF-8 with a non-zero consumed byte count.
    #[test]
    fn detect_utf8_plain_text() {
        let sample_text = "Rust makes FFI safer. 编码检测 UTF-8 mixed text.".repeat(50);

        let Detection {
            encoding,
            is_reliable,
            bytes_consumed,
            ..
        } = detect_encoding(sample_text.as_bytes(), DetectHints::default());

        assert_eq!(encoding, Encoding::UTF8);
        assert!(is_reliable);
        assert!(bytes_consumed > 0);
    }
}