// cxx bridge: declares the struct and the function shared between Rust and the
// C++ wrapper whose declarations live in `ced_wrapper.h`.
#[cxx::bridge]
// `ced_detect_encoding` takes eight parameters, which is more than clippy's
// `too_many_arguments` lint allows by default, so the lint is silenced here.
#[allow(clippy::too_many_arguments)]
mod ffi {
 /// Value returned by [`ced_detect_encoding`].
 ///
 /// NOTE(review): the field descriptions below are inferred from the field
 /// names and from how the tests in this file use them; confirm against the
 /// C++ wrapper.
 #[derive(Debug, Clone, PartialEq)]
 pub struct CedResult {
 /// Name of the detected encoding. The tests compare it case-insensitively
 /// against charset names such as `UTF-8` or `Shift_JIS`.
 mime_name: String,
 /// Numeric identifier of the detected encoding. The tests expect, for
 /// example, 22 for UTF-8 and 11 for Shift_JIS.
 encoding: i32,
 /// NOTE(review): presumably the number of input bytes the detector
 /// consumed — confirm.
 bytes_consumed: i32,
 /// NOTE(review): presumably whether the detector considers its answer
 /// reliable — confirm.
 is_reliable: bool,
 }

 // Functions implemented on the C++ side. Declaring the block `unsafe` is what
 // allows the functions inside it to be called from safe Rust.
 unsafe extern "C++" {
 // Header that declares the C++ function listed below.
 include!("ced_wrapper.h");

 /// Runs encoding detection on `bytes` and returns a [`CedResult`].
 ///
 /// `url_hint`, `http_charset_hint`, `meta_charset_hint`, `encoding_hint`
 /// and `language_hint` are optional hints; the tests in this file pass
 /// empty strings and `-1` for them (presumably meaning "no hint" —
 /// TODO confirm against the wrapper). `corpus_type` and
 /// `ignore_7bit_mail_encodings` are forwarded to the detector; their
 /// accepted values are defined on the C++ side.
 fn ced_detect_encoding(
 bytes: &[u8],
 url_hint: &str,
 http_charset_hint: &str,
 meta_charset_hint: &str,
 encoding_hint: i32,
 language_hint: i32,
 corpus_type: i32,
 ignore_7bit_mail_encodings: bool,
 ) -> CedResult;
 }
}
27
28pub use ffi::*;
29
#[cfg(test)]
mod tests {
    //! End-to-end checks that run the encoding detector on text encoded with
    //! `encoding_rs` and compare the reported encoding with the one used.

    use super::*;

    use encoding_rs::{EUC_KR, Encoding, GB18030, SHIFT_JIS, UTF_8, WINDOWS_1251};

    /// Minimum size, in bytes, of every encoded payload handed to the detector.
    const MIN_PAYLOAD_BYTES: usize = 4 * 1024 + 256;
    /// Value passed as the `corpus_type` argument of `ced_detect_encoding`.
    const QUERY_CORPUS: i32 = 2;

    // Encoding identifiers expected in `CedResult::encoding`.
    // NOTE(review): presumably these mirror the detector's C++ encoding enum —
    // confirm against the wrapper before adding new values.
    const ENC_UTF8: i32 = 22;
    const ENC_GB2312: i32 = 14;
    const ENC_SHIFT_JIS: i32 = 11;
    const ENC_WINDOWS_1251: i32 = 26;
    const ENC_EUC_KR: i32 = 16;

    /// One detection scenario: a text sample, the encoding used to serialize
    /// it, and the result the detector is expected to report.
    struct TestCase {
        /// Label used in assertion messages.
        name: &'static str,
        /// Encoding used to turn `sample` into bytes.
        encoding: &'static Encoding,
        /// Expected `CedResult::mime_name` (compared ASCII-case-insensitively).
        expected_mime: &'static str,
        /// Expected `CedResult::encoding`.
        expected_id: i32,
        /// Text that is repeated and encoded to build the payload.
        sample: &'static str,
    }

    /// Repeats `sample` and encodes it with `encoding`, returning a payload of
    /// at least [`MIN_PAYLOAD_BYTES`] bytes.
    ///
    /// The sample is always repeated at least three times; one more repetition
    /// is added per attempt until the encoded output is large enough.
    ///
    /// # Panics
    ///
    /// Panics if `sample` is empty, if it contains characters that `encoding`
    /// cannot represent, or if `encoding_rs` substitutes a different output
    /// encoding for the requested one.
    fn build_payload(sample: &str, encoding: &'static Encoding) -> Vec<u8> {
        // An empty sample would divide by zero below and could never reach the
        // minimum size, so fail early with a readable message instead.
        assert!(!sample.is_empty(), "样本文本不能为空");

        // First guess based on the UTF-8 length; the encoded form may be
        // shorter, in which case the loop below adds repetitions.
        let mut repeats = (MIN_PAYLOAD_BYTES / sample.len()).max(2) + 1;
        loop {
            let text = sample.repeat(repeats);

            let (encoded, actual_encoding, had_errors) = encoding.encode(&text);
            assert!(
                !had_errors,
                "无法用 {} 编码样本文本({})",
                encoding.name(),
                sample
            );
            // `encode` reports which encoding was really used for the output;
            // make sure it is exactly the one that was requested.
            assert!(
                std::ptr::eq(actual_encoding, encoding),
                "编码检测用到的实际编码 {:?} 与预期 {:?} 不一致",
                actual_encoding.name(),
                encoding.name()
            );

            // Check the size on the (possibly borrowed) output first so that an
            // owned copy is only made for the payload that is returned.
            if encoded.len() >= MIN_PAYLOAD_BYTES {
                return encoded.into_owned();
            }

            repeats += 1;
        }
    }

    /// Builds the payload for `case`, runs the detector on it and asserts that
    /// both the reported MIME name and the numeric encoding id match.
    fn run_case(case: &TestCase) {
        let payload = build_payload(case.sample, case.encoding);

        // Empty strings and `-1` are passed for all hint arguments.
        // NOTE(review): presumably these mean "no hint" — confirm against the
        // C++ wrapper.
        let result = ced_detect_encoding(&payload, "", "", "", -1, -1, QUERY_CORPUS, true);

        // Only shown by the test harness when the test fails or with
        // `--nocapture`; helps diagnose mismatches.
        println!("result: {:?}", result);

        assert!(
            result.mime_name.eq_ignore_ascii_case(case.expected_mime),
            "{} 检测结果 mime_name = {},期望 {}",
            case.name,
            result.mime_name,
            case.expected_mime
        );
        assert_eq!(
            result.encoding, case.expected_id,
            "{} 检测出的编码枚举不符",
            case.name
        );
    }

    /// Runs the detector against large payloads in several encodings.
    ///
    /// The multi-line samples are ordinary string literals, so their line
    /// breaks and leading whitespace are part of the encoded text.
    #[test]
    fn detects_multiple_encodings_with_large_payloads() {
        let cases = [
            TestCase {
                name: "utf8_mixed",
                encoding: UTF_8,
                expected_mime: "UTF-8",
                expected_id: ENC_UTF8,
                sample: "Rust 提供了安全与性能兼顾的编程体验,同时可以混合多种语言字符,例如中文、Русский текст、日本語、한국어,以及 emoji 😊🚀。",
            },
            TestCase {
                name: "GBK",
                encoding: GB18030,
                expected_mime: "GB2312",
                expected_id: ENC_GB2312,
                sample: "熟悉的车轴声,明媚的春日阳光沿着林荫之中的缝隙顽强地坠落在地面上,两侧森林中间的泥土路上布满了腐烂的落叶,发出难听的声音。
 偶尔车轮会碰上一个小石头,整架廉价的运货马车马上热情响应,上下弹动,老旧的木头结构已经不太稳定,每一次弹跳都似乎令其距离宣布解体更近一分。
 空气中弥散着一股芳草的清香,混合着泥土与树叶的气息,柔软如丝巾般的微风触摸着马车乘客的肌肤,不禁令人生出一丝惬意之感。
 “唔!”躺在马车上的男人用力地弓起自己的腰身,好像一位破伤风病人,不受控制地抽搐着,震动着,肌肉紧绷,扭曲,他的头跟脚顶在不太结实的木板上,顶得车厢结构发出痛苦的声响。",
            },
            TestCase {
                name: "shift_jis_japanese",
                encoding: SHIFT_JIS,
                expected_mime: "Shift_JIS",
                expected_id: ENC_SHIFT_JIS,
                sample: "これは日本語で書かれた長い文章で、文字コード検出のテストを行います。東京の風景や四季の移ろい、開発に関する説明を繰り返します。",
            },
            TestCase {
                name: "windows_1251_russian",
                encoding: WINDOWS_1251,
                expected_mime: "windows-1251",
                expected_id: ENC_WINDOWS_1251,
                sample: "Это большой русский текст, который покрывает разные буквы алфавита,
 подробно описывает тестирование кодировок и многократно повторяет фразы,
 чтобы сделать данные длиннее.Это большой русский текст,
 который покрывает разные буквы алфавита,
 подробно описывает тестирование кодировок и многократно повторяет фразы,
 чтобы сделать данные длиннее.",
            },
            TestCase {
                name: "euc_kr_korean",
                encoding: EUC_KR,
                expected_mime: "EUC-KR",
                expected_id: ENC_EUC_KR,
                sample: "이 문장은 한국어 EUC-KR 인코딩으로 작성되었으며, 코드 검출 테스트를 위해 문화와 기술 이야기를 반복하여 길이를 늘립니다.",
            },
        ];

        // Cases run in order; the first failing assertion aborts the test.
        for case in &cases {
            run_case(case);
        }
    }
}