encoding/codec/
simpchinese.rs

1// This is a part of encoding-next.
2// Copyright (c) 2013-2015, Kang Seonghoon.
3// See README.md and LICENSE.txt for details.
4
5//! Legacy simplified Chinese encodings based on GB 2312 and GB 18030.
6
7use crate::index_simpchinese as index;
8use crate::types::*;
9use crate::util::StrCharIndex;
10use std::convert::Into;
11use std::default::Default;
12
/// GB 18030.
///
/// GB 18030-2005 is a simplified Chinese encoding which extends GBK 1.0 to a pan-Unicode encoding.
/// It assigns four-byte sequences to every Unicode codepoint missing from the GBK area,
/// lexicographically ordered with occasional "gaps" for codepoints in the GBK area.
/// Due to this compatibility decision,
/// there is no simple relationship between these four-byte sequences and Unicode codepoints,
/// though there *exists* a relatively simple mapping algorithm with a small lookup table.
///
/// # Relationship to GBK
///
/// The original GBK 1.0 region spans `[81-FE] [40-7E 80-FE]`, and is derived from
/// several different revisions of a family of encodings named "GBK":
///
/// - GBK as specified in the normative annex of GB 13000.1-93,
///   the domestic standard equivalent to Unicode 1.1,
///   consisted of characters included in Unicode 1.1 and not in GB 2312-80.
///
/// - Windows code page 936 is the widespread extension to GBK.
///
/// - Due to the popularity of Windows code page 936,
///   a formal encoding based on Windows code page 936 (while adding new characters)
///   was standardized into GBK 1.0.
///
/// - Finally, GB 18030 added four-byte sequences to GBK for becoming a pan-Unicode encoding,
///   while adding new characters to the (former) GBK region again.
///
/// # Implementation notes
///
/// Encoding is performed by `GB18030Encoder`. The decoder handed out by `raw_decoder` is the
/// same decoder type that `GBKEncoding` hands out.
#[derive(Clone, Copy)]
pub struct GB18030Encoding;
39
40impl Encoding for GB18030Encoding {
41    fn name(&self) -> &'static str {
42        "gb18030"
43    }
44    fn whatwg_name(&self) -> Option<&'static str> {
45        Some("gb18030")
46    }
47    fn raw_encoder(&self) -> Box<dyn RawEncoder> {
48        GB18030Encoder::new()
49    }
50    fn raw_decoder(&self) -> Box<dyn RawDecoder> {
51        GB18030Decoder::new()
52    }
53}
54
55/// An encoder for GB 18030.
56#[derive(Clone, Copy)]
57pub struct GB18030Encoder;
58
59impl GB18030Encoder {
60    #[allow(clippy::new_ret_no_self)]
61    pub fn new() -> Box<dyn RawEncoder> {
62        Box::new(GB18030Encoder)
63    }
64}
65
66impl RawEncoder for GB18030Encoder {
67    fn from_self(&self) -> Box<dyn RawEncoder> {
68        GB18030Encoder::new()
69    }
70    fn is_ascii_compatible(&self) -> bool {
71        true
72    }
73    fn raw_feed(
74        &mut self,
75        input: &str,
76        output: &mut dyn ByteWriter,
77    ) -> (usize, Option<CodecError>) {
78        GBEncoder.raw_feed(input, output, false)
79    }
80    fn raw_finish(&mut self, _output: &mut dyn ByteWriter) -> Option<CodecError> {
81        None
82    }
83}
84
/// GBK, as a subset of GB 18030.
///
/// The original GBK 1.0 region spans `[81-FE] [40-7E 80-FE]`, and is derived from
/// several different revisions of a family of encodings named "GBK":
///
/// - GBK as specified in the normative annex of GB 13000.1-93,
///   the domestic standard equivalent to Unicode 1.1,
///   consisted of characters included in Unicode 1.1 and not in GB 2312-80.
///
/// - Windows code page 936 is the widespread extension to GBK.
///
/// - Due to the popularity of Windows code page 936,
///   a formal encoding based on Windows code page 936 (while adding new characters)
///   was standardized into GBK 1.0.
///
/// - Finally, GB 18030 added four-byte sequences to GBK for becoming a pan-Unicode encoding,
///   while adding new characters to the (former) GBK region again.
///
/// # Implementation notes
///
/// Decoding reuses the GB 18030 decoder, so only the encoder differs from `GB18030Encoding`:
/// it writes U+20AC as the single byte `0x80`, and it reports an error for any character
/// without an entry in the two-byte table instead of emitting a four-byte sequence.
#[derive(Clone, Copy)]
pub struct GBKEncoding;
104
105impl Encoding for GBKEncoding {
106    fn name(&self) -> &'static str {
107        "gbk"
108    }
109    fn whatwg_name(&self) -> Option<&'static str> {
110        Some("gbk")
111    }
112    fn raw_encoder(&self) -> Box<dyn RawEncoder> {
113        GBKEncoder::new()
114    }
115    fn raw_decoder(&self) -> Box<dyn RawDecoder> {
116        GB18030Decoder::new()
117    }
118}
119
120/// An encoder for GBK.
121#[derive(Clone, Copy)]
122pub struct GBKEncoder;
123
124impl GBKEncoder {
125    #[allow(clippy::new_ret_no_self)]
126    pub fn new() -> Box<dyn RawEncoder> {
127        Box::new(GBKEncoder)
128    }
129}
130
131impl RawEncoder for GBKEncoder {
132    fn from_self(&self) -> Box<dyn RawEncoder> {
133        GBKEncoder::new()
134    }
135    fn is_ascii_compatible(&self) -> bool {
136        true
137    }
138    fn raw_feed(
139        &mut self,
140        input: &str,
141        output: &mut dyn ByteWriter,
142    ) -> (usize, Option<CodecError>) {
143        GBEncoder.raw_feed(input, output, true)
144    }
145    fn raw_finish(&mut self, _output: &mut dyn ByteWriter) -> Option<CodecError> {
146        None
147    }
148}
149
150/// A shared encoder logic for GBK and GB 18030.
151#[derive(Clone, Copy)]
152struct GBEncoder;
153
154impl GBEncoder {
155    fn raw_feed(
156        &mut self,
157        input: &str,
158        output: &mut dyn ByteWriter,
159        gbk_flag: bool,
160    ) -> (usize, Option<CodecError>) {
161        output.writer_hint(input.len());
162
163        for ((i, j), ch) in input.index_iter() {
164            if ch < '\u{80}' {
165                output.write_byte(ch as u8);
166            } else if ch == '\u{e5e5}' {
167                return (
168                    i,
169                    Some(CodecError {
170                        upto: j as isize,
171                        cause: "no legacy private-use character supported".into(),
172                    }),
173                );
174            } else if gbk_flag && ch == '\u{20AC}' {
175                output.write_byte(b'\x80')
176            } else {
177                let ptr = index::gb18030::backward(ch as u32);
178                if ptr == 0xffff {
179                    if gbk_flag {
180                        return (
181                            i,
182                            Some(CodecError {
183                                upto: j as isize,
184                                cause: "gbk doesn't support gb18030 extensions".into(),
185                            }),
186                        );
187                    }
188                    let ptr = index::gb18030_ranges::backward(ch as u32);
189                    assert!(ptr != 0xffffffff);
190                    let (ptr, byte4) = (ptr / 10, ptr % 10);
191                    let (ptr, byte3) = (ptr / 126, ptr % 126);
192                    let (byte1, byte2) = (ptr / 10, ptr % 10);
193                    output.write_byte((byte1 + 0x81) as u8);
194                    output.write_byte((byte2 + 0x30) as u8);
195                    output.write_byte((byte3 + 0x81) as u8);
196                    output.write_byte((byte4 + 0x30) as u8);
197                } else {
198                    let lead = ptr / 190 + 0x81;
199                    let trail = ptr % 190;
200                    let trailoffset = if trail < 0x3f { 0x40 } else { 0x41 };
201                    output.write_byte(lead as u8);
202                    output.write_byte((trail + trailoffset) as u8);
203                }
204            }
205        }
206        (input.len(), None)
207    }
208}
209
210/// A decoder for GB 18030.
211#[derive(Clone, Copy)]
212struct GB18030Decoder {
213    st: gb18030::State,
214}
215
216impl GB18030Decoder {
217    #[allow(clippy::new_ret_no_self)]
218    pub fn new() -> Box<dyn RawDecoder> {
219        Box::new(GB18030Decoder {
220            st: Default::default(),
221        })
222    }
223}
224
225impl RawDecoder for GB18030Decoder {
226    fn from_self(&self) -> Box<dyn RawDecoder> {
227        GB18030Decoder::new()
228    }
229    fn is_ascii_compatible(&self) -> bool {
230        true
231    }
232
233    fn raw_feed(
234        &mut self,
235        input: &[u8],
236        output: &mut dyn StringWriter,
237    ) -> (usize, Option<CodecError>) {
238        let (st, processed, err) = gb18030::raw_feed(self.st, input, output, &());
239        self.st = st;
240        (processed, err)
241    }
242
243    fn raw_finish(&mut self, output: &mut dyn StringWriter) -> Option<CodecError> {
244        let (st, err) = gb18030::raw_finish(self.st, output, &());
245        self.st = st;
246        err
247    }
248}
249
// The GB 18030 decoder state machine, generated by `stateful_decoder!` as module `gb18030`.
// A sequence is one byte (S0), two bytes (S0 -> S1) or four bytes (S0 -> S1 -> S2 -> S3).
stateful_decoder! {
    module gb18030;

    // Maps a two-byte sequence through the two-byte table.
    // Bytes outside `[81-FE] [40-7E 80-FE]` are looked up with the pointer 0xffff;
    // state S1 treats a result of 0xffff as "no mapping".
    internal pub fn map_two_bytes(lead: u8, trail: u8) -> u32 {
        use crate::index_simpchinese as index;

        let lead = lead as u16;
        let trail = trail as u16;
        let index = match (lead, trail) {
            (0x81..=0xfe, 0x40..=0x7e) | (0x81..=0xfe, 0x80..=0xfe) => {
                // 0x7f is not a valid trail byte, so trail bytes above it shift down by one more.
                let trailoffset = if trail < 0x7f {0x40} else {0x41};
                (lead - 0x81) * 190 + trail - trailoffset
            }
            _ => 0xffff,
        };
        index::gb18030::forward(index)
    }

    // Maps a four-byte sequence through the ranges table.
    // The four bytes act as mixed-radix digits (126, 10, 126, 10) of a linear pointer;
    // state S3 treats a result of 0xffffffff as "no mapping".
    internal pub fn map_four_bytes(b1: u8, b2: u8, b3: u8, b4: u8) -> u32 {
        use crate::index_simpchinese as index;

        // no range check here, caller should have done all checks
        let index = (b1 as u32 - 0x81) * 12600 + (b2 as u32 - 0x30) * 1260 +
                    (b3 as u32 - 0x81) * 10 + (b4 as u32 - 0x30);
        index::gb18030_ranges::forward(index)
    }

initial:
    // gb18030 first = 0x00, gb18030 second = 0x00, gb18030 third = 0x00
    // No bytes pending: ASCII is emitted as is, 0x80 is emitted as U+20AC,
    // 0x81-0xFE starts a multi-byte sequence, and anything else (0xFF) is an error.
    state S0(ctx: Context) {
        case b @ 0x00..=0x7f => ctx.emit(b as u32);
        case 0x80 => ctx.emit(0x20ac);
        case b @ 0x81..=0xfe => S1(ctx, b);
        case _ => ctx.err("invalid sequence");
    }

transient:
    // gb18030 first != 0x00, gb18030 second = 0x00, gb18030 third = 0x00
    // One byte pending: a digit byte (0x30-0x39) continues a four-byte sequence,
    // any other byte is tried as the trail byte of a two-byte sequence.
    state S1(ctx: Context, first: u8) {
        case b @ 0x30..=0x39 => S2(ctx, first, b);
        case b => match map_two_bytes(first, b) {
            0xffff => ctx.backup_and_err(1, "invalid sequence"), // unconditional
            ch => ctx.emit(ch)
        };
    }

    // gb18030 first != 0x00, gb18030 second != 0x00, gb18030 third = 0x00
    // Two bytes pending: the third byte must be 0x81-0xFE.
    state S2(ctx: Context, first: u8, second: u8) {
        case b @ 0x81..=0xfe => S3(ctx, first, second, b);
        case _ => ctx.backup_and_err(2, "invalid sequence");
    }

    // gb18030 first != 0x00, gb18030 second != 0x00, gb18030 third != 0x00
    // Three bytes pending: the fourth byte must be a digit byte (0x30-0x39),
    // and the completed sequence must have a mapping.
    state S3(ctx: Context, first: u8, second: u8, third: u8) {
        case b @ 0x30..=0x39 => match map_four_bytes(first, second, third, b) {
            0xffffffff => ctx.backup_and_err(3, "invalid sequence"), // unconditional
            ch => ctx.emit(ch)
        };
        case _ => ctx.backup_and_err(3, "invalid sequence");
    }
}
311
// Unit tests and benchmarks for the GB 18030 encoder and decoder.
#[cfg(test)]
mod gb18030_tests {
    extern crate test;
    use super::GB18030Encoding;
    use crate::testutils;
    use crate::types::*;

    // ASCII, two-byte and four-byte outputs; note that U+20AC encodes to A2 E3 here, not 0x80.
    #[test]
    fn test_encoder_valid() {
        let mut e = GB18030Encoding.raw_encoder();
        assert_feed_ok!(e, "A", "", [0x41]);
        assert_feed_ok!(e, "BC", "", [0x42, 0x43]);
        assert_feed_ok!(e, "", "", []);
        assert_feed_ok!(
            e,
            "\u{4e2d}\u{534e}\u{4eba}\u{6c11}\u{5171}\u{548c}\u{56fd}",
            "",
            [0xd6, 0xd0, 0xbb, 0xaa, 0xc8, 0xcb, 0xc3, 0xf1, 0xb9, 0xb2, 0xba, 0xcd, 0xb9, 0xfa]
        );
        assert_feed_ok!(e, "1\u{20ac}/m", "", [0x31, 0xa2, 0xe3, 0x2f, 0x6d]);
        assert_feed_ok!(
            e,
            "\u{ff21}\u{ff22}\u{ff23}",
            "",
            [0xa3, 0xc1, 0xa3, 0xc2, 0xa3, 0xc3]
        );
        assert_feed_ok!(e, "\u{80}", "", [0x81, 0x30, 0x81, 0x30]);
        assert_feed_ok!(e, "\u{81}", "", [0x81, 0x30, 0x81, 0x31]);
        assert_feed_ok!(e, "\u{a3}", "", [0x81, 0x30, 0x84, 0x35]);
        assert_feed_ok!(e, "\u{a4}", "", [0xa1, 0xe8]);
        assert_feed_ok!(e, "\u{a5}", "", [0x81, 0x30, 0x84, 0x36]);
        assert_feed_ok!(e, "\u{10ffff}", "", [0xe3, 0x32, 0x9a, 0x35]);
        assert_feed_ok!(
            e,
            "\u{2a6a5}\u{3007}",
            "",
            [0x98, 0x35, 0xee, 0x37, 0xa9, 0x96]
        );
        assert_finish_ok!(e, []);
    }

    #[test]
    fn test_encoder_invalid() {
        let mut e = GB18030Encoding.raw_encoder();
        // U+E5E5 is the only character that is forbidden from GB 18030
        assert_feed_err!(e, "", "\u{e5e5}", "", []);
        assert_finish_ok!(e, []);
    }

    // Mirror of `test_encoder_valid`, plus the decoder-only inputs 0x80 and A3 A0.
    #[test]
    fn test_decoder_valid() {
        let mut d = GB18030Encoding.raw_decoder();
        assert_feed_ok!(d, [0x41], [], "A");
        assert_feed_ok!(d, [0x42, 0x43], [], "BC");
        assert_feed_ok!(d, [], [], "");
        assert_feed_ok!(
            d,
            [0xd6, 0xd0, 0xbb, 0xaa, 0xc8, 0xcb, 0xc3, 0xf1, 0xb9, 0xb2, 0xba, 0xcd, 0xb9, 0xfa],
            [],
            "\u{4e2d}\u{534e}\u{4eba}\u{6c11}\u{5171}\u{548c}\u{56fd}"
        );
        assert_feed_ok!(d, [0x31, 0x80, 0x2f, 0x6d], [], "1\u{20ac}/m");
        assert_feed_ok!(
            d,
            [0xa3, 0xc1, 0xa3, 0xc2, 0xa3, 0xc3],
            [],
            "\u{ff21}\u{ff22}\u{ff23}"
        );
        assert_feed_ok!(d, [0x81, 0x30, 0x81, 0x30], [], "\u{80}");
        assert_feed_ok!(d, [0x81, 0x30, 0x81, 0x31], [], "\u{81}");
        assert_feed_ok!(d, [0x81, 0x30, 0x84, 0x35], [], "\u{a3}");
        assert_feed_ok!(d, [0xa1, 0xe8], [], "\u{a4}");
        assert_feed_ok!(d, [0x81, 0x30, 0x84, 0x36], [], "\u{a5}");
        assert_feed_ok!(d, [0xe3, 0x32, 0x9a, 0x35], [], "\u{10ffff}");
        assert_feed_ok!(
            d,
            [0x98, 0x35, 0xee, 0x37, 0xa9, 0x96],
            [],
            "\u{2a6a5}\u{3007}"
        );
        assert_feed_ok!(d, [0xa3, 0xa0], [], "\u{3000}");
        assert_finish_ok!(d, "");
    }

    // Sequences split across several feeds at every possible position.
    #[test]
    fn test_decoder_valid_partial() {
        let mut d = GB18030Encoding.raw_decoder();
        assert_feed_ok!(d, [], [0xa1], "");
        assert_feed_ok!(d, [0xa1], [], "\u{3000}");
        assert_feed_ok!(d, [], [0x81], "");
        assert_feed_ok!(d, [], [0x30], "");
        assert_feed_ok!(d, [], [0x81], "");
        assert_feed_ok!(d, [0x30], [], "\u{80}");
        assert_feed_ok!(d, [], [0x81], "");
        assert_feed_ok!(d, [], [0x30], "");
        assert_feed_ok!(d, [0x81, 0x31], [], "\u{81}");
        assert_feed_ok!(d, [], [0x81], "");
        assert_feed_ok!(d, [0x30, 0x81, 0x32], [], "\u{82}");
        assert_feed_ok!(d, [], [0x81], "");
        assert_feed_ok!(d, [], [0x30, 0x81], "");
        assert_feed_ok!(d, [0x33], [], "\u{83}");
        assert_feed_ok!(d, [], [0x81, 0x30], "");
        assert_feed_ok!(d, [], [0x81], "");
        assert_feed_ok!(d, [0x34], [], "\u{84}");
        assert_feed_ok!(d, [], [0x81, 0x30], "");
        assert_feed_ok!(d, [0x81, 0x35], [], "\u{85}");
        assert_feed_ok!(d, [], [0x81, 0x30, 0x81], "");
        assert_feed_ok!(d, [0x36], [], "\u{86}");
        assert_finish_ok!(d, "");
    }

    // Finishing in the middle of a sequence is an error, whatever the number of pending bytes.
    #[test]
    fn test_decoder_invalid_partial() {
        let mut d = GB18030Encoding.raw_decoder();
        assert_feed_ok!(d, [], [0xa1], "");
        assert_finish_err!(d, "");

        let mut d = GB18030Encoding.raw_decoder();
        assert_feed_ok!(d, [], [0x81], "");
        assert_finish_err!(d, "");

        let mut d = GB18030Encoding.raw_decoder();
        assert_feed_ok!(d, [], [0x81, 0x30], "");
        assert_finish_err!(d, "");

        let mut d = GB18030Encoding.raw_decoder();
        assert_feed_ok!(d, [], [0x81, 0x30, 0x81], "");
        assert_finish_err!(d, "");
    }

    // Bytes outside the allowed range at each position of a sequence.
    #[test]
    fn test_decoder_invalid_out_of_range() {
        let mut d = GB18030Encoding.raw_decoder();
        assert_feed_err!(d, [], [0xff], [], "");
        assert_feed_err!(d, [], [0x81], [0x00], "");
        assert_feed_err!(d, [], [0x81], [0x7f], "");
        assert_feed_err!(d, [], [0x81], [0xff], "");
        assert_feed_err!(d, [], [0x81], [0x31, 0x00], "");
        assert_feed_err!(d, [], [0x81], [0x31, 0x80], "");
        assert_feed_err!(d, [], [0x81], [0x31, 0xff], "");
        assert_feed_err!(d, [], [0x81], [0x31, 0x81, 0x00], "");
        assert_feed_err!(d, [], [0x81], [0x31, 0x81, 0x2f], "");
        assert_feed_err!(d, [], [0x81], [0x31, 0x81, 0x3a], "");
        assert_feed_err!(d, [], [0x81], [0x31, 0x81, 0xff], "");
        assert_finish_ok!(d, "");
    }

    #[test]
    fn test_decoder_invalid_boundary() {
        // U+10FFFF (E3 32 9A 35) is the last Unicode codepoint, E3 32 9A 36 is invalid.
        // note that since the 2nd to 4th bytes may coincide with ASCII, bytes 32 9A 36 is
        // not considered to be in the problem. this is compatible to WHATWG Encoding standard.
        let mut d = GB18030Encoding.raw_decoder();
        assert_feed_ok!(d, [], [0xe3], "");
        assert_feed_err!(d, [], [], [0x32, 0x9a, 0x36], "");
        assert_finish_ok!(d, "");

        let mut d = GB18030Encoding.raw_decoder();
        assert_feed_ok!(d, [], [0xe3], "");
        assert_feed_ok!(d, [], [0x32, 0x9a], "");
        assert_feed_err!(d, -2, [], [], [0x32, 0x9a, 0x36], "");
        assert_finish_ok!(d, "");
    }

    // A decoder can be fed again after a failed `raw_finish`; leftover bytes are not kept.
    #[test]
    fn test_decoder_feed_after_finish() {
        let mut d = GB18030Encoding.raw_decoder();
        assert_feed_ok!(d, [0xd2, 0xbb], [0xd2], "\u{4e00}");
        assert_finish_err!(d, "");
        assert_feed_ok!(d, [0xd2, 0xbb], [], "\u{4e00}");
        assert_finish_ok!(d, "");

        let mut d = GB18030Encoding.raw_decoder();
        assert_feed_ok!(d, [0x98, 0x35, 0xee, 0x37], [0x98, 0x35, 0xee], "\u{2a6a5}");
        assert_finish_err!(d, "");
        assert_feed_ok!(d, [0x98, 0x35, 0xee, 0x37], [0x98, 0x35], "\u{2a6a5}");
        assert_finish_err!(d, "");
        assert_feed_ok!(d, [0x98, 0x35, 0xee, 0x37], [0x98], "\u{2a6a5}");
        assert_finish_err!(d, "");
        assert_feed_ok!(d, [0x98, 0x35, 0xee, 0x37], [], "\u{2a6a5}");
        assert_finish_ok!(d, "");
    }

    // Benchmarks strict encoding of the shared simplified Chinese sample text.
    #[bench]
    fn bench_encode_short_text(bencher: &mut test::Bencher) {
        let s = testutils::SIMPLIFIED_CHINESE_TEXT;
        bencher.bytes = s.len() as u64;
        bencher.iter(|| test::black_box(GB18030Encoding.encode(s, EncoderTrap::Strict)))
    }

    // Benchmarks strict decoding of the same text, encoded once up front.
    #[bench]
    fn bench_decode_short_text(bencher: &mut test::Bencher) {
        let s = GB18030Encoding
            .encode(testutils::SIMPLIFIED_CHINESE_TEXT, EncoderTrap::Strict)
            .ok()
            .unwrap();
        bencher.bytes = s.len() as u64;
        bencher.iter(|| test::black_box(GB18030Encoding.decode(&s, DecoderTrap::Strict)))
    }
}
512
// Unit tests and benchmarks for the GBK encoder.
#[cfg(test)]
mod gbk_tests {
    extern crate test;
    use super::GBKEncoding;
    use crate::testutils;
    use crate::types::*;

    // GBK and GB 18030 share the same decoder logic.

    // Same inputs as the GB 18030 encoder test: U+20AC becomes the single byte 0x80,
    // and every character that needs a four-byte sequence in GB 18030 is an error.
    #[test]
    fn test_encoder() {
        let mut e = GBKEncoding.raw_encoder();
        assert_feed_ok!(e, "A", "", [0x41]);
        assert_feed_ok!(e, "BC", "", [0x42, 0x43]);
        assert_feed_ok!(e, "", "", []);
        assert_feed_ok!(
            e,
            "\u{4e2d}\u{534e}\u{4eba}\u{6c11}\u{5171}\u{548c}\u{56fd}",
            "",
            [0xd6, 0xd0, 0xbb, 0xaa, 0xc8, 0xcb, 0xc3, 0xf1, 0xb9, 0xb2, 0xba, 0xcd, 0xb9, 0xfa]
        );
        assert_feed_ok!(e, "1\u{20ac}/m", "", [0x31, 0x80, 0x2f, 0x6d]);
        assert_feed_ok!(
            e,
            "\u{ff21}\u{ff22}\u{ff23}",
            "",
            [0xa3, 0xc1, 0xa3, 0xc2, 0xa3, 0xc3]
        );
        assert_feed_err!(e, "", "\u{80}", "", []);
        assert_feed_err!(e, "", "\u{81}", "", []);
        assert_feed_err!(e, "", "\u{a3}", "", []);
        assert_feed_ok!(e, "\u{a4}", "", [0xa1, 0xe8]);
        assert_feed_err!(e, "", "\u{a5}", "", []);
        assert_feed_err!(e, "", "\u{10ffff}", "", []);
        assert_feed_err!(e, "", "\u{2a6a5}", "\u{3007}", []);
        assert_feed_err!(e, "\u{3007}", "\u{2a6a5}", "", [0xa9, 0x96]);
        assert_finish_ok!(e, []);
    }

    // Benchmarks strict encoding of the shared simplified Chinese sample text.
    #[bench]
    fn bench_encode_short_text(bencher: &mut test::Bencher) {
        let s = testutils::SIMPLIFIED_CHINESE_TEXT;
        bencher.bytes = s.len() as u64;
        bencher.iter(|| test::black_box(GBKEncoding.encode(s, EncoderTrap::Strict)))
    }
}
559
/// HZ. (RFC 1843)
///
/// This is a simplified Chinese encoding based on GB 2312.
/// It resembles ISO 2022 encodings in that the printable escape sequences `~{`
/// and `~}` are used to delimit a run of 7-bit-safe GB 2312 sequences. For comparison,
/// they are equivalent to the ISO-2022-CN escape sequences `ESC $ ) A` and `ESC ( B`.
/// Additional escape sequences `~~` (for a literal `~`) and `~\n` (ignored) are also supported.
#[derive(Clone, Copy)]
pub struct HZEncoding;
571
572impl Encoding for HZEncoding {
573    fn name(&self) -> &'static str {
574        "hz"
575    }
576    fn whatwg_name(&self) -> Option<&'static str> {
577        None
578    }
579    fn raw_encoder(&self) -> Box<dyn RawEncoder> {
580        HZEncoder::new()
581    }
582    fn raw_decoder(&self) -> Box<dyn RawDecoder> {
583        HZDecoder::new()
584    }
585}
586
587/// An encoder for HZ.
588#[derive(Clone, Copy)]
589pub struct HZEncoder {
590    escaped: bool,
591}
592
593impl HZEncoder {
594    #[allow(clippy::new_ret_no_self)]
595    pub fn new() -> Box<dyn RawEncoder> {
596        Box::new(HZEncoder { escaped: false })
597    }
598}
599
600impl RawEncoder for HZEncoder {
601    fn from_self(&self) -> Box<dyn RawEncoder> {
602        HZEncoder::new()
603    }
604    fn is_ascii_compatible(&self) -> bool {
605        false
606    }
607
608    fn raw_feed(
609        &mut self,
610        input: &str,
611        output: &mut dyn ByteWriter,
612    ) -> (usize, Option<CodecError>) {
613        output.writer_hint(input.len());
614
615        let mut escaped = self.escaped;
616        macro_rules! ensure_escaped(
617            () => (if !escaped { output.write_bytes(b"~{"); escaped = true; })
618        );
619        macro_rules! ensure_unescaped(
620            () => (if escaped { output.write_bytes(b"~}"); escaped = false; })
621        );
622
623        for ((i, j), ch) in input.index_iter() {
624            if ch < '\u{80}' {
625                ensure_unescaped!();
626                output.write_byte(ch as u8);
627                if ch == '~' {
628                    output.write_byte(b'~');
629                }
630            } else {
631                let ptr = index::gb18030::backward(ch as u32);
632                if ptr == 0xffff {
633                    self.escaped = escaped; // do NOT reset the state!
634                    return (
635                        i,
636                        Some(CodecError {
637                            upto: j as isize,
638                            cause: "unrepresentable character".into(),
639                        }),
640                    );
641                } else {
642                    let lead = ptr / 190;
643                    let trail = ptr % 190;
644                    if lead < 0x21 - 1 || trail < 0x21 + 0x3f {
645                        // GBK extension, ignored
646                        self.escaped = escaped; // do NOT reset the state!
647                        return (
648                            i,
649                            Some(CodecError {
650                                upto: j as isize,
651                                cause: "unrepresentable character".into(),
652                            }),
653                        );
654                    } else {
655                        ensure_escaped!();
656                        output.write_byte((lead + 1) as u8);
657                        output.write_byte((trail - 0x3f) as u8);
658                    }
659                }
660            }
661        }
662
663        self.escaped = escaped;
664        (input.len(), None)
665    }
666
667    fn raw_finish(&mut self, _output: &mut dyn ByteWriter) -> Option<CodecError> {
668        None
669    }
670}
671
672/// A decoder for HZ.
673#[derive(Clone, Copy)]
674struct HZDecoder {
675    st: hz::State,
676}
677
678impl HZDecoder {
679    #[allow(clippy::new_ret_no_self)]
680    pub fn new() -> Box<dyn RawDecoder> {
681        Box::new(HZDecoder {
682            st: Default::default(),
683        })
684    }
685}
686
687impl RawDecoder for HZDecoder {
688    fn from_self(&self) -> Box<dyn RawDecoder> {
689        HZDecoder::new()
690    }
691    fn is_ascii_compatible(&self) -> bool {
692        true
693    }
694
695    fn raw_feed(
696        &mut self,
697        input: &[u8],
698        output: &mut dyn StringWriter,
699    ) -> (usize, Option<CodecError>) {
700        let (st, processed, err) = hz::raw_feed(self.st, input, output, &());
701        self.st = st;
702        (processed, err)
703    }
704
705    fn raw_finish(&mut self, output: &mut dyn StringWriter) -> Option<CodecError> {
706        let (st, err) = hz::raw_finish(self.st, output, &());
707        self.st = st;
708        err
709    }
710}
711
// The HZ decoder state machine, generated by `stateful_decoder!` as module `hz`.
// States named A* run with the multi-byte flag unset (ASCII mode),
// states named B* run with the flag set (between `~{` and `~}`).
stateful_decoder! {
    module hz;

    // Maps a 7-bit byte pair to the pointer of the two-byte GB 18030 table and looks it up.
    // Pairs outside the accepted ranges are looked up with the pointer 0xffff;
    // state B2 treats a result of 0xffff as "no mapping".
    internal pub fn map_two_bytes(lead: u8, trail: u8) -> u32 {
        use crate::index_simpchinese as index;

        let lead = lead as u16;
        let trail = trail as u16;
        let index = match (lead, trail) {
            (0x20..=0x7f, 0x21..=0x7e) => (lead - 1) * 190 + (trail + 0x3f),
            _ => 0xffff,
        };
        index::gb18030::forward(index)
    }

initial:
    // hz-gb-2312 flag = unset, hz-gb-2312 lead = 0x00
    // ASCII mode: `~` (0x7e) starts an escape, other 7-bit bytes are emitted as is,
    // and bytes from 0x80 upwards are errors.
    state A0(ctx: Context) {
        case 0x7e => A1(ctx);
        case b @ 0x00..=0x7f => ctx.emit(b as u32);
        case _ => ctx.err("invalid sequence");
        final => ctx.reset();
    }

checkpoint:
    // hz-gb-2312 flag = set, hz-gb-2312 lead = 0x00
    // Multi-byte mode, no lead byte pending: `~` starts an escape and 0x20-0x7f is taken
    // as a lead byte. A line feed (0x0a) is an error that also leaves multi-byte mode;
    // any other byte is an error that stays in it.
    state B0(ctx: Context) {
        case 0x7e => B1(ctx);
        case b @ 0x20..=0x7f => B2(ctx, b);
        case 0x0a => ctx.err("invalid sequence"); // error *and* reset
        case _ => ctx.err("invalid sequence"), B0(ctx);
        final => ctx.reset();
    }

transient:
    // hz-gb-2312 flag = unset, hz-gb-2312 lead = 0x7e
    // After `~` in ASCII mode: `{` enters multi-byte mode, `}` and a line feed stay in
    // ASCII mode without output, and `~` emits a literal `~`.
    state A1(ctx: Context) {
        case 0x7b => B0(ctx);
        case 0x7d => A0(ctx);
        case 0x7e => ctx.emit(0x7e), A0(ctx);
        case 0x0a => A0(ctx);
        case _ => ctx.backup_and_err(1, "invalid sequence");
        final => ctx.err("incomplete sequence");
    }

    // hz-gb-2312 flag = set, hz-gb-2312 lead = 0x7e
    // After `~` in multi-byte mode: `{` stays in multi-byte mode, `}` and a line feed
    // leave it, and `~` emits a literal `~`. An invalid escape stays in multi-byte mode.
    state B1(ctx: Context) {
        case 0x7b => B0(ctx);
        case 0x7d => A0(ctx);
        case 0x7e => ctx.emit(0x7e), B0(ctx);
        case 0x0a => A0(ctx);
        case _ => ctx.backup_and_err(1, "invalid sequence"), B0(ctx);
        final => ctx.err("incomplete sequence");
    }

    // hz-gb-2312 flag = set, hz-gb-2312 lead != 0 & != 0x7e
    // Lead byte pending: the next byte completes the pair. An unmapped pair is an error
    // that stays in multi-byte mode, while a line feed is an error that leaves it.
    state B2(ctx: Context, lead: u8) {
        case 0x0a => ctx.err("invalid sequence"); // should reset the state!
        case b =>
            match map_two_bytes(lead, b) {
                0xffff => ctx.err("invalid sequence"),
                ch => ctx.emit(ch)
            },
            B0(ctx);
        final => ctx.err("incomplete sequence");
    }
}
779
// Unit tests and benchmarks for the HZ encoder and decoder.
#[cfg(test)]
mod hz_tests {
    extern crate test;
    use super::HZEncoding;
    use crate::testutils;
    use crate::types::*;

    // The escape state carries over between feeds: `~{` is written once and only closed
    // by `~}` when ASCII follows. Literal `~` is doubled.
    #[test]
    fn test_encoder_valid() {
        let mut e = HZEncoding.raw_encoder();
        assert_feed_ok!(e, "A", "", *b"A");
        assert_feed_ok!(e, "BC", "", *b"BC");
        assert_feed_ok!(e, "", "", *b"");
        assert_feed_ok!(
            e,
            "\u{4e2d}\u{534e}\u{4eba}\u{6c11}\u{5171}\u{548c}\u{56fd}",
            "",
            *b"~{VP;*HKCq92:M9z"
        );
        assert_feed_ok!(e, "\u{ff21}\u{ff22}\u{ff23}", "", *b"#A#B#C");
        assert_feed_ok!(e, "1\u{20ac}/m", "", *b"~}1~{\"c~}/m");
        assert_feed_ok!(e, "~<\u{a4}~\u{0a4}>~", "", *b"~~<~{!h~}~~~{!h~}>~~");
        assert_finish_ok!(e, []);
    }

    #[test]
    fn test_encoder_invalid() {
        let mut e = HZEncoding.raw_encoder();
        assert_feed_err!(e, "", "\u{ffff}", "", []);
        assert_feed_err!(e, "?", "\u{ffff}", "!", [0x3f]);
        // no support for GBK extension
        assert_feed_err!(e, "", "\u{3007}", "", []);
        assert_finish_ok!(e, []);
    }

    // Escapes split across feeds, `~~`, `~\n`, and redundant `~{` / `~}` switches.
    #[test]
    fn test_decoder_valid() {
        let mut d = HZEncoding.raw_decoder();
        assert_feed_ok!(d, *b"A", *b"", "A");
        assert_feed_ok!(d, *b"BC", *b"", "BC");
        assert_feed_ok!(d, *b"D~~E", *b"~", "D~E");
        assert_feed_ok!(d, *b"~F~\nG", *b"~", "~FG");
        assert_feed_ok!(d, *b"", *b"", "");
        assert_feed_ok!(d, *b"\nH", *b"~", "H");
        assert_feed_ok!(
            d,
            *b"{VP~}~{;*~{HKCq92:M9z",
            *b"",
            "\u{4e2d}\u{534e}\u{4eba}\u{6c11}\u{5171}\u{548c}\u{56fd}"
        );
        assert_feed_ok!(d, *b"", *b"#", "");
        assert_feed_ok!(d, *b"A", *b"~", "\u{ff21}");
        assert_feed_ok!(d, *b"~#B~~#C", *b"~", "~\u{ff22}~\u{ff23}");
        assert_feed_ok!(d, *b"", *b"", "");
        assert_feed_ok!(d, *b"\n#D~{#E~\n#F~{#G", *b"~", "#D\u{ff25}#F\u{ff27}");
        assert_feed_ok!(d, *b"}X~}YZ", *b"", "XYZ");
        assert_finish_ok!(d, "");
    }

    // Byte pairs outside the mapped range, fed while in multi-byte mode.
    // (The function name reads "out_or_range"; "out_of_range" is presumably what was meant.)
    #[test]
    fn test_decoder_invalid_out_or_range() {
        let mut d = HZEncoding.raw_decoder();
        assert_feed_ok!(d, *b"~{", *b"", "");
        assert_feed_err!(d, *b"", *b"\x20\x20", *b"", "");
        assert_feed_err!(d, *b"", *b"\x20\x7f", *b"", ""); // do not reset the state (except for LF)
        assert_feed_err!(d, *b"", *b"\x21\x7f", *b"", "");
        assert_feed_err!(d, *b"", *b"\x7f\x20", *b"", "");
        assert_feed_err!(d, *b"", *b"\x7f\x21", *b"", "");
        assert_feed_err!(d, *b"", *b"\x7f\x7f", *b"", "");
        assert_finish_ok!(d, "");
    }

    #[test]
    fn test_decoder_invalid_carriage_return() {
        // a line feed (`\n`, 0x0a; despite the "carriage return" in the function name)
        // in the multibyte mode is invalid but *also* resets the state
        let mut d = HZEncoding.raw_decoder();
        assert_feed_ok!(d, *b"~{#A", *b"", "\u{ff21}");
        assert_feed_err!(d, *b"", *b"\n", *b"", "");
        assert_feed_ok!(d, *b"#B~{#C", *b"", "#B\u{ff23}");
        assert_feed_err!(d, *b"", *b"#\n", *b"", "");
        assert_feed_ok!(d, *b"#D", *b"", "#D");
        assert_finish_ok!(d, "");
    }

    // Finishing after a lone `~` or a lone lead byte is an error.
    #[test]
    fn test_decoder_invalid_partial() {
        let mut d = HZEncoding.raw_decoder();
        assert_feed_ok!(d, *b"", *b"~", "");
        assert_finish_err!(d, "");

        let mut d = HZEncoding.raw_decoder();
        assert_feed_ok!(d, *b"~{", *b"#", "");
        assert_finish_err!(d, "");

        let mut d = HZEncoding.raw_decoder();
        assert_feed_ok!(d, *b"~{#A", *b"~", "\u{ff21}");
        assert_finish_err!(d, "");
    }

    // An unknown escape is an error in either mode, and does not change the mode.
    #[test]
    fn test_decoder_invalid_escape() {
        let mut d = HZEncoding.raw_decoder();
        assert_feed_ok!(d, *b"#A", *b"", "#A");
        assert_feed_err!(d, *b"", *b"~", *b"xy", "");
        assert_feed_ok!(d, *b"#B", *b"", "#B");
        assert_feed_ok!(d, *b"", *b"~", "");
        assert_feed_err!(d, *b"", *b"", *b"xy", "");
        assert_feed_ok!(d, *b"#C~{#D", *b"", "#C\u{ff24}");
        assert_feed_err!(d, *b"", *b"~", *b"xy", "");
        assert_feed_ok!(d, *b"#E", *b"", "\u{ff25}"); // does not reset to ASCII
        assert_feed_ok!(d, *b"", *b"~", "");
        assert_feed_err!(d, *b"", *b"", *b"xy", "");
        assert_feed_ok!(d, *b"#F~}#G", *b"", "\u{ff26}#G");
        assert_finish_ok!(d, "");
    }

    // After a failed `raw_finish` the decoder is back in ASCII mode with nothing pending.
    #[test]
    fn test_decoder_feed_after_finish() {
        let mut d = HZEncoding.raw_decoder();
        assert_feed_ok!(d, *b"R;~{R;", *b"R", "R;\u{4e00}");
        assert_finish_err!(d, "");
        assert_feed_ok!(d, *b"R;~{R;", *b"", "R;\u{4e00}");
        assert_finish_ok!(d, "");
    }

    // Benchmarks strict encoding of the shared simplified Chinese sample text.
    #[bench]
    fn bench_encode_short_text(bencher: &mut test::Bencher) {
        let s = testutils::SIMPLIFIED_CHINESE_TEXT;
        bencher.bytes = s.len() as u64;
        bencher.iter(|| test::black_box(HZEncoding.encode(s, EncoderTrap::Strict)))
    }

    // Benchmarks strict decoding of the same text, encoded once up front.
    #[bench]
    fn bench_decode_short_text(bencher: &mut test::Bencher) {
        let s = HZEncoding
            .encode(testutils::SIMPLIFIED_CHINESE_TEXT, EncoderTrap::Strict)
            .ok()
            .unwrap();
        bencher.bytes = s.len() as u64;
        bencher.iter(|| test::black_box(HZEncoding.decode(&s, DecoderTrap::Strict)))
    }
}