compact_enc_det_sys/
lib.rs

1#[cxx::bridge]
2#[allow(clippy::too_many_arguments)]
3mod ffi {
4    #[derive(Debug, Clone, PartialEq)]
5    pub struct CedResult {
6        mime_name: String,
7        encoding: i32,
8        bytes_consumed: i32,
9        is_reliable: bool,
10    }
11
12    unsafe extern "C++" {
13        include!("ced_wrapper.h");
14
15        fn ced_detect_encoding(
16            bytes: &[u8],
17            url_hint: &str,
18            http_charset_hint: &str,
19            meta_charset_hint: &str,
20            encoding_hint: i32,
21            language_hint: i32,
22            corpus_type: i32,
23            ignore_7bit_mail_encodings: bool,
24        ) -> CedResult;
25    }
26}
27
28pub use ffi::*;
29
#[cfg(test)]
mod tests {
    use super::*;

    use encoding_rs::{EUC_KR, Encoding, GB18030, SHIFT_JIS, UTF_8, WINDOWS_1251};

    /// Minimum size, in encoded bytes, of every payload handed to the detector
    /// (4 KiB plus some slack).
    const MIN_PAYLOAD_BYTES: usize = 4 * 1024 + 256;
    // CompactEncDet::TextCorpusType::QUERY_CORPUS in compact_enc_det.h
    const QUERY_CORPUS: i32 = 2;
    // Encoding ids from util/encodings/encodings.pb.h
    const ENC_UTF8: i32 = 22;
    const ENC_GB2312: i32 = 14;
    const ENC_SHIFT_JIS: i32 = 11;
    const ENC_WINDOWS_1251: i32 = 26;
    const ENC_EUC_KR: i32 = 16;

    /// One detection scenario: a text sample, the encoding used to serialise it,
    /// and the verdict the detector is expected to return.
    struct TestCase {
        /// Label used in assertion messages.
        name: &'static str,
        /// `encoding_rs` encoding used to turn `sample` into bytes.
        encoding: &'static Encoding,
        /// Expected `CedResult::mime_name` (compared ASCII case-insensitively).
        expected_mime: &'static str,
        /// Expected `CedResult::encoding` id.
        expected_id: i32,
        /// Text that is repeated and encoded to build the payload.
        sample: &'static str,
    }

    /// Repeats `sample` and encodes it with `encoding` until the encoded payload
    /// is at least `MIN_PAYLOAD_BYTES` long, then returns those bytes.
    ///
    /// # Panics
    ///
    /// Panics if `sample` is empty, if the text contains characters the target
    /// encoding cannot represent, or if `encoding_rs` reports a different output
    /// encoding than the one requested.
    fn build_payload(sample: &str, encoding: &'static Encoding) -> Vec<u8> {
        // An empty sample would divide by zero below and could never reach the
        // size threshold, so reject it with a clear message up front.
        assert!(
            !sample.is_empty(),
            "sample text must not be empty: it can never reach the minimum payload size"
        );

        // First guess from the UTF-8 length. The encoded form can be shorter
        // (e.g. two bytes per CJK character instead of three), so the loop below
        // tops the text up one sample at a time until the threshold is met.
        let initial_repeats = (MIN_PAYLOAD_BYTES / sample.len()).max(2) + 1;
        let mut text = sample.repeat(initial_repeats);

        loop {
            let (encoded, actual_encoding, had_errors) = encoding.encode(&text);
            assert!(
                !had_errors,
                "无法用 {} 编码样本文本({})",
                encoding.name(),
                sample
            );
            // `encode` reports the encoding it actually produced; make sure it
            // is the very same static instance that was requested.
            assert!(
                std::ptr::eq(actual_encoding, encoding),
                "编码检测用到的实际编码 {:?} 与预期 {:?} 不一致",
                actual_encoding.name(),
                encoding.name()
            );

            let bytes = encoded.into_owned();
            if bytes.len() >= MIN_PAYLOAD_BYTES {
                return bytes;
            }

            // Still below the required length: append one more copy and retry.
            text.push_str(sample);
        }
    }

    /// Builds the payload for `case`, runs the detector on it and asserts that
    /// both the reported MIME name and the numeric encoding id match.
    fn run_case(case: &TestCase) {
        let payload = build_payload(case.sample, case.encoding);

        // No URL/HTTP/meta hints are supplied; `-1` is passed for the encoding
        // and language hints.
        // NOTE(review): presumably the wrapper maps `-1` to "unknown" -- confirm
        // against ced_wrapper.h.
        let result = ced_detect_encoding(&payload, "", "", "", -1, -1, QUERY_CORPUS, true);

        println!("result: {:?}", result);

        assert!(
            result.mime_name.eq_ignore_ascii_case(case.expected_mime),
            "{} 检测结果 mime_name = {},期望 {}",
            case.name,
            result.mime_name,
            case.expected_mime
        );
        assert_eq!(
            result.encoding, case.expected_id,
            "{} 检测出的编码枚举不符",
            case.name
        );
    }

    /// Runs detection over large payloads in several encodings and checks the
    /// detector's verdict for each of them.
    #[test]
    fn detects_multiple_encodings_with_large_payloads() {
        let cases = [
            TestCase {
                name: "utf8_mixed",
                encoding: UTF_8,
                expected_mime: "UTF-8",
                expected_id: ENC_UTF8,
                sample: "Rust 提供了安全与性能兼顾的编程体验,同时可以混合多种语言字符,例如中文、Русский текст、日本語、한국어,以及 emoji 😊🚀。",
            },
            TestCase {
                name: "GBK",
                encoding: GB18030,
                expected_mime: "GB2312",
                expected_id: ENC_GB2312,
                sample: "熟悉的车轴声,明媚的春日阳光沿着林荫之中的缝隙顽强地坠落在地面上,两侧森林中间的泥土路上布满了腐烂的落叶,发出难听的声音。
  偶尔车轮会碰上一个小石头,整架廉价的运货马车马上热情响应,上下弹动,老旧的木头结构已经不太稳定,每一次弹跳都似乎令其距离宣布解体更近一分。
  空气中弥散着一股芳草的清香,混合着泥土与树叶的气息,柔软如丝巾般的微风触摸着马车乘客的肌肤,不禁令人生出一丝惬意之感。
  “唔!”躺在马车上的男人用力地弓起自己的腰身,好像一位破伤风病人,不受控制地抽搐着,震动着,肌肉紧绷,扭曲,他的头跟脚顶在不太结实的木板上,顶得车厢结构发出痛苦的声响。",
            },
            TestCase {
                name: "shift_jis_japanese",
                encoding: SHIFT_JIS,
                expected_mime: "Shift_JIS",
                expected_id: ENC_SHIFT_JIS,
                sample: "これは日本語で書かれた長い文章で、文字コード検出のテストを行います。東京の風景や四季の移ろい、開発に関する説明を繰り返します。",
            },
            TestCase {
                name: "windows_1251_russian",
                encoding: WINDOWS_1251,
                expected_mime: "windows-1251",
                expected_id: ENC_WINDOWS_1251,
                sample: "Это большой русский текст, который покрывает разные буквы алфавита,
                подробно описывает тестирование кодировок и многократно повторяет фразы,
                чтобы сделать данные длиннее.Это большой русский текст,
                который покрывает разные буквы алфавита,
                подробно описывает тестирование кодировок и многократно повторяет фразы,
                чтобы сделать данные длиннее.",
            },
            TestCase {
                name: "euc_kr_korean",
                encoding: EUC_KR,
                expected_mime: "EUC-KR",
                expected_id: ENC_EUC_KR,
                sample: "이 문장은 한국어 EUC-KR 인코딩으로 작성되었으며, 코드 검출 테스트를 위해 문화와 기술 이야기를 반복하여 길이를 늘립니다.",
            },
        ];

        for case in &cases {
            run_case(case);
        }
    }
}