encoding/codec/
tradchinese.rs

1// This is a part of encoding-next.
2// Copyright (c) 2013-2015, Kang Seonghoon.
3// See README.md and LICENSE.txt for details.
4
5//! Legacy traditional Chinese encodings.
6
7use crate::index_tradchinese as index;
8use crate::types::*;
9use crate::util::StrCharIndex;
10use std::convert::Into;
11use std::default::Default;
12
13/**
14 * Big5-2003 with common extensions. (XXX with asymmetric HKSCS-2008 support)
15 *
16 * This is a traditional Chinese encoding spanning the region `[81-FE] [40-7E A1-FE]`.
17 * Originally a proprietary encoding by the consortium of five companies (hence the name),
18 * the Republic of China government standardized Big5-2003 in an appendix of CNS 11643
19 * so that CNS 11643 plane 1 and plane 2 have
20 * an almost identical set of characters as Big5 (but with a different mapping).
21 * The Hong Kong government has an official extension to Big5
22 * named Hong Kong Supplementary Character Set (HKSCS).
23 *
24 * This particular implementation of Big5 includes the widespread ETEN and HKSCS extensions,
25 * but excludes less common extensions such as Big5+, Big-5E and Unicode-at-on.
26 */
27#[derive(Clone, Copy)]
28pub struct BigFive2003Encoding;
29
30impl Encoding for BigFive2003Encoding {
31    fn name(&self) -> &'static str {
32        "big5-2003"
33    }
34    fn whatwg_name(&self) -> Option<&'static str> {
35        Some("big5")
36    } // WHATWG compatibility
37    fn raw_encoder(&self) -> Box<dyn RawEncoder> {
38        BigFive2003Encoder::new()
39    }
40    fn raw_decoder(&self) -> Box<dyn RawDecoder> {
41        BigFive2003HKSCS2008Decoder::new()
42    }
43}
44
45/// An encoder for Big5-2003.
46#[derive(Clone, Copy)]
47pub struct BigFive2003Encoder;
48
49impl BigFive2003Encoder {
50    #[allow(clippy::new_ret_no_self)]
51    pub fn new() -> Box<dyn RawEncoder> {
52        Box::new(BigFive2003Encoder)
53    }
54}
55
56impl RawEncoder for BigFive2003Encoder {
57    fn from_self(&self) -> Box<dyn RawEncoder> {
58        BigFive2003Encoder::new()
59    }
60    fn is_ascii_compatible(&self) -> bool {
61        true
62    }
63
64    fn raw_feed(
65        &mut self,
66        input: &str,
67        output: &mut dyn ByteWriter,
68    ) -> (usize, Option<CodecError>) {
69        output.writer_hint(input.len());
70
71        for ((i, j), ch) in input.index_iter() {
72            if ch < '\u{80}' {
73                output.write_byte(ch as u8);
74            } else {
75                let ptr = index::big5::backward(ch as u32);
76                if ptr == 0xffff {
77                    return (
78                        i,
79                        Some(CodecError {
80                            upto: j as isize,
81                            cause: "unrepresentable character".into(),
82                        }),
83                    );
84                }
85                let lead = ptr / 157 + 0x81;
86                let trail = ptr % 157;
87                let trailoffset = if trail < 0x3f { 0x40 } else { 0x62 };
88                output.write_byte(lead as u8);
89                output.write_byte((trail + trailoffset) as u8);
90            }
91        }
92        (input.len(), None)
93    }
94
95    fn raw_finish(&mut self, _output: &mut dyn ByteWriter) -> Option<CodecError> {
96        None
97    }
98}
99
100/// A decoder for Big5-2003 with HKSCS-2008 extension.
101#[derive(Clone, Copy)]
102struct BigFive2003HKSCS2008Decoder {
103    st: bigfive2003::State,
104}
105
106impl BigFive2003HKSCS2008Decoder {
107    #[allow(clippy::new_ret_no_self)]
108    pub fn new() -> Box<dyn RawDecoder> {
109        Box::new(BigFive2003HKSCS2008Decoder {
110            st: Default::default(),
111        })
112    }
113}
114
115impl RawDecoder for BigFive2003HKSCS2008Decoder {
116    fn from_self(&self) -> Box<dyn RawDecoder> {
117        BigFive2003HKSCS2008Decoder::new()
118    }
119    fn is_ascii_compatible(&self) -> bool {
120        true
121    }
122
123    fn raw_feed(
124        &mut self,
125        input: &[u8],
126        output: &mut dyn StringWriter,
127    ) -> (usize, Option<CodecError>) {
128        let (st, processed, err) = bigfive2003::raw_feed(self.st, input, output, &());
129        self.st = st;
130        (processed, err)
131    }
132
133    fn raw_finish(&mut self, output: &mut dyn StringWriter) -> Option<CodecError> {
134        let (st, err) = bigfive2003::raw_finish(self.st, output, &());
135        self.st = st;
136        err
137    }
138}
139
// Decoder state machine for Big5, expanded by the crate's `stateful_decoder!` macro into
// the `bigfive2003` module (its `State`, `raw_feed` and `raw_finish` are used by
// `BigFive2003HKSCS2008Decoder`). `S0` handles the first byte of a character and
// `S1` handles the byte following a lead byte.
stateful_decoder! {
    module bigfive2003;

    // Converts a (lead, trail) byte pair into a pointer into the Big5 index and looks it
    // up. Each lead byte owns 157 consecutive pointers: trails 0x40..=0x7e take cells
    // 0..=62 (offset 0x40) and trails 0xa1..=0xfe take cells 63..=156 (offset 0x62).
    // Any pair outside those ranges is looked up as pointer 0xffff; the caller treats a
    // 0xffff result as "no mapping" (presumably `forward(0xffff)` yields 0xffff — confirm).
    internal pub fn map_two_bytes(lead: u8, trail: u8) -> u32 {
        use crate::index_tradchinese as index;

        // Widen to u16 so the pointer arithmetic below cannot overflow a byte.
        let lead = lead as u16;
        let trail = trail as u16;
        let index = match (lead, trail) {
            (0x81..=0xfe, 0x40..=0x7e) | (0x81..=0xfe, 0xa1..=0xfe) => {
                let trailoffset = if trail < 0x7f {0x40} else {0x62};
                (lead - 0x81) * 157 + trail - trailoffset
            }
            _ => 0xffff,
        };
        index::big5::forward(index) // may return two-letter replacements 0..3
    }

initial:
    // big5 lead = 0x00
    // ASCII bytes are emitted directly, 0x81..=0xfe start a two-byte sequence, and the
    // remaining bytes (0x80 and 0xff) are rejected on the spot.
    state S0(ctx: Context) {
        case b @ 0x00..=0x7f => ctx.emit(b as u32);
        case b @ 0x81..=0xfe => S1(ctx, b);
        case _ => ctx.err("invalid sequence");
    }

transient:
    // big5 lead != 0x00
    state S1(ctx: Context, lead: u8) {
        case b => match map_two_bytes(lead, b) {
            0xffff => {
                // An ASCII trail byte is backed up by one byte, while a trail byte with
                // the high bit set is counted as part of the invalid sequence.
                let backup = if b < 0x80 {1} else {0};
                ctx.backup_and_err(backup, "invalid sequence")
            },
            // Results 0..=3 are markers for the four pointers that decode to a base
            // letter followed by a combining mark, i.e. two code points.
            0 /*index=1133*/ => ctx.emit_str("\u{ca}\u{304}"),
            1 /*index=1135*/ => ctx.emit_str("\u{ca}\u{30c}"),
            2 /*index=1164*/ => ctx.emit_str("\u{ea}\u{304}"),
            3 /*index=1166*/ => ctx.emit_str("\u{ea}\u{30c}"),
            ch => ctx.emit(ch),
        };
    }
}
182
// Unit tests and benchmarks for the Big5-2003 encoder and decoder.
//
// NOTE(review): judging from their use below, `assert_feed_ok!` takes
// (codec, processed input, unprocessed input, expected output) and `assert_feed_err!`
// takes (codec, processed input, problematic input, remaining input, expected output);
// the macros are defined elsewhere in the crate — confirm against their definition.
#[cfg(test)]
mod bigfive2003_tests {
    // The `test` crate supplies `Bencher` and `black_box` for the `#[bench]` functions.
    extern crate test;
    use super::BigFive2003Encoding;
    use crate::testutils;
    use crate::types::*;

    // ASCII, common ideographs and symbols all encode without error.
    #[test]
    fn test_encoder_valid() {
        let mut e = BigFive2003Encoding.raw_encoder();
        assert_feed_ok!(e, "A", "", [0x41]);
        assert_feed_ok!(e, "BC", "", [0x42, 0x43]);
        assert_feed_ok!(e, "", "", []);
        assert_feed_ok!(
            e,
            "\u{4e2d}\u{83ef}\u{6c11}\u{570b}",
            "",
            [0xa4, 0xa4, 0xb5, 0xd8, 0xa5, 0xc1, 0xb0, 0xea]
        );
        assert_feed_ok!(e, "1\u{20ac}/m", "", [0x31, 0xa3, 0xe1, 0x2f, 0x6d]);
        assert_feed_ok!(e, "\u{ffed}", "", [0xf9, 0xfe]);
        // U+2550 has two byte sequences (the decoder test accepts both); the encoder
        // must pick F9 F9.
        assert_feed_ok!(e, "\u{2550}", "", [0xf9, 0xf9]); // not [0xa2, 0xa4]
        assert_finish_ok!(e, []);
    }

    // Characters without a mapping are reported as errors, including characters that
    // only the decoder understands (the HKSCS-2008 additions).
    #[test]
    fn test_encoder_invalid() {
        let mut e = BigFive2003Encoding.raw_encoder();
        assert_feed_err!(e, "", "\u{ffff}", "", []);
        assert_feed_err!(e, "?", "\u{ffff}", "!", [0x3f]);
        assert_feed_err!(e, "", "\u{3eec}", "\u{4e00}", []); // HKSCS-2008 addition
        assert_finish_ok!(e, []);
    }

    // Valid input decodes correctly, including sequences split across feeds and the
    // four sequences that expand to two code points.
    #[test]
    fn test_decoder_valid() {
        let mut d = BigFive2003Encoding.raw_decoder();
        assert_feed_ok!(d, [0x41], [], "A");
        assert_feed_ok!(d, [0x42, 0x43], [], "BC");
        assert_feed_ok!(d, [], [], "");
        assert_feed_ok!(
            d,
            [0xa4, 0xa4, 0xb5, 0xd8, 0xa5, 0xc1, 0xb0, 0xea],
            [],
            "\u{4e2d}\u{83ef}\u{6c11}\u{570b}"
        );
        // A lead byte at the end of one feed is completed by the next feed.
        assert_feed_ok!(d, [], [0xa4], "");
        assert_feed_ok!(d, [0xa4, 0xb5, 0xd8], [0xa5], "\u{4e2d}\u{83ef}");
        assert_feed_ok!(d, [0xc1, 0xb0, 0xea], [], "\u{6c11}\u{570b}");
        assert_feed_ok!(d, [0x31, 0xa3, 0xe1, 0x2f, 0x6d], [], "1\u{20ac}/m");
        assert_feed_ok!(d, [0xf9, 0xfe], [], "\u{ffed}");
        // Both byte sequences for U+2550 are accepted when decoding.
        assert_feed_ok!(d, [0xf9, 0xf9], [], "\u{2550}");
        assert_feed_ok!(d, [0xa2, 0xa4], [], "\u{2550}");
        assert_feed_ok!(d, [0x87, 0x7e], [], "\u{3eec}"); // HKSCS-2008 addition
        assert_feed_ok!(
            d,
            [0x88, 0x62, 0x88, 0x64, 0x88, 0xa3, 0x88, 0xa5],
            [],
            "\u{ca}\u{304}\u{00ca}\u{30c}\u{ea}\u{304}\u{ea}\u{30c}"
        ); // 2-byte output
        assert_finish_ok!(d, "");
    }

    // A lead byte with nothing after it is only an error once the input is finished.
    #[test]
    fn test_decoder_invalid_lone_lead_immediate_test_finish() {
        for i in 0x81..0xff {
            let mut d = BigFive2003Encoding.raw_decoder();
            assert_feed_ok!(d, [], [i], ""); // wait for a trail
            assert_finish_err!(d, "");
        }

        // 80/FF: immediate failure
        let mut d = BigFive2003Encoding.raw_decoder();
        assert_feed_err!(d, [], [0x80], [], "");
        assert_feed_err!(d, [], [0xff], [], "");
        assert_finish_ok!(d, "");
    }

    // A non-ASCII byte followed by a space is an error, and the space is left
    // unconsumed so that it can be decoded afterwards.
    #[test]
    fn test_decoder_invalid_lone_lead_followed_by_space() {
        // Iterate in a wider integer type so that 0xff is included, then narrow to u8.
        for i in 0x80..0x100 {
            let i = i as u8;
            let mut d = BigFive2003Encoding.raw_decoder();
            assert_feed_err!(d, [], [i], [0x20], "");
            assert_finish_ok!(d, "");
        }
    }

    #[test]
    fn test_decoder_invalid_lead_followed_by_invalid_trail() {
        // unlike most other cases, valid lead + invalid MSB-set trail are entirely consumed.
        // https://www.w3.org/Bugs/Public/show_bug.cgi?id=16771
        for i in 0x81..0xff {
            // Lead and invalid trail arrive in the same feed.
            let mut d = BigFive2003Encoding.raw_decoder();
            assert_feed_err!(d, [], [i, 0x80], [0x20], "");
            assert_feed_err!(d, [], [i, 0xff], [0x20], "");
            assert_finish_ok!(d, "");

            // Lead and invalid trail arrive in separate feeds.
            let mut d = BigFive2003Encoding.raw_decoder();
            assert_feed_ok!(d, [], [i], "");
            assert_feed_err!(d, [], [0x80], [0x20], "");
            assert_feed_ok!(d, [], [i], "");
            assert_feed_err!(d, [], [0xff], [0x20], "");
            assert_finish_ok!(d, "");
        }

        // 80/FF is not a valid lead and the trail is not consumed
        let mut d = BigFive2003Encoding.raw_decoder();
        assert_feed_err!(d, [], [0x80], [0x80], "");
        assert_feed_err!(d, [], [0x80], [0xff], "");
        assert_feed_err!(d, [], [0xff], [0x80], "");
        assert_feed_err!(d, [], [0xff], [0xff], "");
        assert_finish_ok!(d, "");
    }

    // After a failed finish (pending lead byte) the decoder can be fed again and
    // decodes as if freshly created.
    #[test]
    fn test_decoder_feed_after_finish() {
        let mut d = BigFive2003Encoding.raw_decoder();
        assert_feed_ok!(d, [0xa4, 0x40], [0xa4], "\u{4e00}");
        assert_finish_err!(d, "");
        assert_feed_ok!(d, [0xa4, 0x40], [], "\u{4e00}");
        assert_finish_ok!(d, "");
    }

    // Measures strict encoding of the shared traditional Chinese sample text.
    #[bench]
    fn bench_encode_short_text(bencher: &mut test::Bencher) {
        let s = testutils::TRADITIONAL_CHINESE_TEXT;
        bencher.bytes = s.len() as u64;
        bencher.iter(|| test::black_box(BigFive2003Encoding.encode(s, EncoderTrap::Strict)))
    }

    // Measures strict decoding of the same sample text, encoded once up front.
    #[bench]
    fn bench_decode_short_text(bencher: &mut test::Bencher) {
        let s = BigFive2003Encoding
            .encode(testutils::TRADITIONAL_CHINESE_TEXT, EncoderTrap::Strict)
            .ok()
            .unwrap();
        bencher.bytes = s.len() as u64;
        bencher.iter(|| test::black_box(BigFive2003Encoding.decode(&s, DecoderTrap::Strict)))
    }
}