1use crate::index_simpchinese as index;
8use crate::types::*;
9use crate::util::StrCharIndex;
10use std::convert::Into;
11use std::default::Default;
12
13#[derive(Clone, Copy)]
38pub struct GB18030Encoding;
39
40impl Encoding for GB18030Encoding {
41 fn name(&self) -> &'static str {
42 "gb18030"
43 }
44 fn whatwg_name(&self) -> Option<&'static str> {
45 Some("gb18030")
46 }
47 fn raw_encoder(&self) -> Box<dyn RawEncoder> {
48 GB18030Encoder::new()
49 }
50 fn raw_decoder(&self) -> Box<dyn RawDecoder> {
51 GB18030Decoder::new()
52 }
53}
54
55#[derive(Clone, Copy)]
57pub struct GB18030Encoder;
58
59impl GB18030Encoder {
60 #[allow(clippy::new_ret_no_self)]
61 pub fn new() -> Box<dyn RawEncoder> {
62 Box::new(GB18030Encoder)
63 }
64}
65
66impl RawEncoder for GB18030Encoder {
67 fn from_self(&self) -> Box<dyn RawEncoder> {
68 GB18030Encoder::new()
69 }
70 fn is_ascii_compatible(&self) -> bool {
71 true
72 }
73 fn raw_feed(
74 &mut self,
75 input: &str,
76 output: &mut dyn ByteWriter,
77 ) -> (usize, Option<CodecError>) {
78 GBEncoder.raw_feed(input, output, false)
79 }
80 fn raw_finish(&mut self, _output: &mut dyn ByteWriter) -> Option<CodecError> {
81 None
82 }
83}
84
85#[derive(Clone, Copy)]
103pub struct GBKEncoding;
104
105impl Encoding for GBKEncoding {
106 fn name(&self) -> &'static str {
107 "gbk"
108 }
109 fn whatwg_name(&self) -> Option<&'static str> {
110 Some("gbk")
111 }
112 fn raw_encoder(&self) -> Box<dyn RawEncoder> {
113 GBKEncoder::new()
114 }
115 fn raw_decoder(&self) -> Box<dyn RawDecoder> {
116 GB18030Decoder::new()
117 }
118}
119
120#[derive(Clone, Copy)]
122pub struct GBKEncoder;
123
124impl GBKEncoder {
125 #[allow(clippy::new_ret_no_self)]
126 pub fn new() -> Box<dyn RawEncoder> {
127 Box::new(GBKEncoder)
128 }
129}
130
131impl RawEncoder for GBKEncoder {
132 fn from_self(&self) -> Box<dyn RawEncoder> {
133 GBKEncoder::new()
134 }
135 fn is_ascii_compatible(&self) -> bool {
136 true
137 }
138 fn raw_feed(
139 &mut self,
140 input: &str,
141 output: &mut dyn ByteWriter,
142 ) -> (usize, Option<CodecError>) {
143 GBEncoder.raw_feed(input, output, true)
144 }
145 fn raw_finish(&mut self, _output: &mut dyn ByteWriter) -> Option<CodecError> {
146 None
147 }
148}
149
150#[derive(Clone, Copy)]
152struct GBEncoder;
153
154impl GBEncoder {
155 fn raw_feed(
156 &mut self,
157 input: &str,
158 output: &mut dyn ByteWriter,
159 gbk_flag: bool,
160 ) -> (usize, Option<CodecError>) {
161 output.writer_hint(input.len());
162
163 for ((i, j), ch) in input.index_iter() {
164 if ch < '\u{80}' {
165 output.write_byte(ch as u8);
166 } else if ch == '\u{e5e5}' {
167 return (
168 i,
169 Some(CodecError {
170 upto: j as isize,
171 cause: "no legacy private-use character supported".into(),
172 }),
173 );
174 } else if gbk_flag && ch == '\u{20AC}' {
175 output.write_byte(b'\x80')
176 } else {
177 let ptr = index::gb18030::backward(ch as u32);
178 if ptr == 0xffff {
179 if gbk_flag {
180 return (
181 i,
182 Some(CodecError {
183 upto: j as isize,
184 cause: "gbk doesn't support gb18030 extensions".into(),
185 }),
186 );
187 }
188 let ptr = index::gb18030_ranges::backward(ch as u32);
189 assert!(ptr != 0xffffffff);
190 let (ptr, byte4) = (ptr / 10, ptr % 10);
191 let (ptr, byte3) = (ptr / 126, ptr % 126);
192 let (byte1, byte2) = (ptr / 10, ptr % 10);
193 output.write_byte((byte1 + 0x81) as u8);
194 output.write_byte((byte2 + 0x30) as u8);
195 output.write_byte((byte3 + 0x81) as u8);
196 output.write_byte((byte4 + 0x30) as u8);
197 } else {
198 let lead = ptr / 190 + 0x81;
199 let trail = ptr % 190;
200 let trailoffset = if trail < 0x3f { 0x40 } else { 0x41 };
201 output.write_byte(lead as u8);
202 output.write_byte((trail + trailoffset) as u8);
203 }
204 }
205 }
206 (input.len(), None)
207 }
208}
209
210#[derive(Clone, Copy)]
212struct GB18030Decoder {
213 st: gb18030::State,
214}
215
216impl GB18030Decoder {
217 #[allow(clippy::new_ret_no_self)]
218 pub fn new() -> Box<dyn RawDecoder> {
219 Box::new(GB18030Decoder {
220 st: Default::default(),
221 })
222 }
223}
224
225impl RawDecoder for GB18030Decoder {
226 fn from_self(&self) -> Box<dyn RawDecoder> {
227 GB18030Decoder::new()
228 }
229 fn is_ascii_compatible(&self) -> bool {
230 true
231 }
232
233 fn raw_feed(
234 &mut self,
235 input: &[u8],
236 output: &mut dyn StringWriter,
237 ) -> (usize, Option<CodecError>) {
238 let (st, processed, err) = gb18030::raw_feed(self.st, input, output, &());
239 self.st = st;
240 (processed, err)
241 }
242
243 fn raw_finish(&mut self, output: &mut dyn StringWriter) -> Option<CodecError> {
244 let (st, err) = gb18030::raw_finish(self.st, output, &());
245 self.st = st;
246 err
247 }
248}
249
// State machine for decoding GB 18030 (and GBK) byte streams.
// S0 is the idle state; S1, S2 and S3 hold the first one, two and three bytes of a
// multi-byte sequence respectively.
stateful_decoder! {
    module gb18030;

    // Maps a two-byte sequence to a code point through the `gb18030` index.
    // Accepts lead bytes 0x81..=0xfe with trail bytes 0x40..=0x7e or 0x80..=0xfe; the gap
    // at 0x7f is why two different trail offsets are used. Any other byte pair is looked
    // up with the 0xffff sentinel pointer. The caller (S1) treats a result of 0xffff as
    // "no mapping".
    internal pub fn map_two_bytes(lead: u8, trail: u8) -> u32 {
        use crate::index_simpchinese as index;

        let lead = lead as u16;
        let trail = trail as u16;
        let index = match (lead, trail) {
            (0x81..=0xfe, 0x40..=0x7e) | (0x81..=0xfe, 0x80..=0xfe) => {
                let trailoffset = if trail < 0x7f {0x40} else {0x41};
                (lead - 0x81) * 190 + trail - trailoffset
            }
            _ => 0xffff,
        };
        index::gb18030::forward(index)
    }

    // Maps a four-byte sequence to a code point through the `gb18030_ranges` index.
    // No range checks are made here: the subtractions rely on the states below having
    // already restricted b1/b3 to 0x81..=0xfe and b2/b4 to 0x30..=0x39. The caller (S3)
    // treats a result of 0xffffffff as "no mapping".
    internal pub fn map_four_bytes(b1: u8, b2: u8, b3: u8, b4: u8) -> u32 {
        use crate::index_simpchinese as index;

        let index = (b1 as u32 - 0x81) * 12600 + (b2 as u32 - 0x30) * 1260 +
                    (b3 as u32 - 0x81) * 10 + (b4 as u32 - 0x30);
        index::gb18030_ranges::forward(index)
    }

initial:
    // No bytes pending.
    state S0(ctx: Context) {
        // ASCII is emitted unchanged.
        case b @ 0x00..=0x7f => ctx.emit(b as u32);
        // The lone byte 0x80 decodes to U+20AC (the GBK form of the euro sign).
        case 0x80 => ctx.emit(0x20ac);
        // Lead byte of a two- or four-byte sequence.
        case b @ 0x81..=0xfe => S1(ctx, b);
        // Only 0xff is left at this point.
        case _ => ctx.err("invalid sequence");
    }

transient:
    // First byte seen.
    state S1(ctx: Context, first: u8) {
        // An ASCII digit in second position announces a four-byte sequence.
        case b @ 0x30..=0x39 => S2(ctx, first, b);
        // Otherwise try to complete a two-byte sequence. On failure the error is raised
        // with a backup of 1 (presumably so that the current byte is not counted as part
        // of the erroneous sequence -- see `backup_and_err` in the macro definition).
        case b => match map_two_bytes(first, b) {
            0xffff => ctx.backup_and_err(1, "invalid sequence"),
            ch => ctx.emit(ch)
        };
    }

    // First and second bytes seen (four-byte sequence in progress).
    state S2(ctx: Context, first: u8, second: u8) {
        case b @ 0x81..=0xfe => S3(ctx, first, second, b);
        case _ => ctx.backup_and_err(2, "invalid sequence");
    }

    // First three bytes seen; the fourth byte must be an ASCII digit.
    state S3(ctx: Context, first: u8, second: u8, third: u8) {
        case b @ 0x30..=0x39 => match map_four_bytes(first, second, third, b) {
            0xffffffff => ctx.backup_and_err(3, "invalid sequence"),
            ch => ctx.emit(ch)
        };
        case _ => ctx.backup_and_err(3, "invalid sequence");
    }
}
311
// Unit tests and benchmarks for the GB 18030 encoder and decoder.
#[cfg(test)]
mod gb18030_tests {
    extern crate test;
    use super::GB18030Encoding;
    use crate::testutils;
    use crate::types::*;

    // Encoder: ASCII, two-byte and four-byte output, including both ends of the
    // four-byte range exercised here (U+0080 and U+10FFFF).
    #[test]
    fn test_encoder_valid() {
        let mut e = GB18030Encoding.raw_encoder();
        assert_feed_ok!(e, "A", "", [0x41]);
        assert_feed_ok!(e, "BC", "", [0x42, 0x43]);
        assert_feed_ok!(e, "", "", []);
        assert_feed_ok!(
            e,
            "\u{4e2d}\u{534e}\u{4eba}\u{6c11}\u{5171}\u{548c}\u{56fd}",
            "",
            [0xd6, 0xd0, 0xbb, 0xaa, 0xc8, 0xcb, 0xc3, 0xf1, 0xb9, 0xb2, 0xba, 0xcd, 0xb9, 0xfa]
        );
        // Unlike GBK, GB 18030 writes U+20AC as the two-byte sequence A2 E3.
        assert_feed_ok!(e, "1\u{20ac}/m", "", [0x31, 0xa2, 0xe3, 0x2f, 0x6d]);
        assert_feed_ok!(
            e,
            "\u{ff21}\u{ff22}\u{ff23}",
            "",
            [0xa3, 0xc1, 0xa3, 0xc2, 0xa3, 0xc3]
        );
        assert_feed_ok!(e, "\u{80}", "", [0x81, 0x30, 0x81, 0x30]);
        assert_feed_ok!(e, "\u{81}", "", [0x81, 0x30, 0x81, 0x31]);
        assert_feed_ok!(e, "\u{a3}", "", [0x81, 0x30, 0x84, 0x35]);
        assert_feed_ok!(e, "\u{a4}", "", [0xa1, 0xe8]);
        assert_feed_ok!(e, "\u{a5}", "", [0x81, 0x30, 0x84, 0x36]);
        assert_feed_ok!(e, "\u{10ffff}", "", [0xe3, 0x32, 0x9a, 0x35]);
        assert_feed_ok!(
            e,
            "\u{2a6a5}\u{3007}",
            "",
            [0x98, 0x35, 0xee, 0x37, 0xa9, 0x96]
        );
        assert_finish_ok!(e, []);
    }

    // Encoder: U+E5E5 is the one character the encoder refuses explicitly.
    #[test]
    fn test_encoder_invalid() {
        let mut e = GB18030Encoding.raw_encoder();
        assert_feed_err!(e, "", "\u{e5e5}", "", []);
        assert_finish_ok!(e, []);
    }

    // Decoder: mirror image of `test_encoder_valid`, plus the inputs that only the
    // decoder accepts (the lone byte 0x80 and the pair A3 A0).
    #[test]
    fn test_decoder_valid() {
        let mut d = GB18030Encoding.raw_decoder();
        assert_feed_ok!(d, [0x41], [], "A");
        assert_feed_ok!(d, [0x42, 0x43], [], "BC");
        assert_feed_ok!(d, [], [], "");
        assert_feed_ok!(
            d,
            [0xd6, 0xd0, 0xbb, 0xaa, 0xc8, 0xcb, 0xc3, 0xf1, 0xb9, 0xb2, 0xba, 0xcd, 0xb9, 0xfa],
            [],
            "\u{4e2d}\u{534e}\u{4eba}\u{6c11}\u{5171}\u{548c}\u{56fd}"
        );
        assert_feed_ok!(d, [0x31, 0x80, 0x2f, 0x6d], [], "1\u{20ac}/m");
        assert_feed_ok!(
            d,
            [0xa3, 0xc1, 0xa3, 0xc2, 0xa3, 0xc3],
            [],
            "\u{ff21}\u{ff22}\u{ff23}"
        );
        assert_feed_ok!(d, [0x81, 0x30, 0x81, 0x30], [], "\u{80}");
        assert_feed_ok!(d, [0x81, 0x30, 0x81, 0x31], [], "\u{81}");
        assert_feed_ok!(d, [0x81, 0x30, 0x84, 0x35], [], "\u{a3}");
        assert_feed_ok!(d, [0xa1, 0xe8], [], "\u{a4}");
        assert_feed_ok!(d, [0x81, 0x30, 0x84, 0x36], [], "\u{a5}");
        assert_feed_ok!(d, [0xe3, 0x32, 0x9a, 0x35], [], "\u{10ffff}");
        assert_feed_ok!(
            d,
            [0x98, 0x35, 0xee, 0x37, 0xa9, 0x96],
            [],
            "\u{2a6a5}\u{3007}"
        );
        assert_feed_ok!(d, [0xa3, 0xa0], [], "\u{3000}");
        assert_finish_ok!(d, "");
    }

    // Decoder: two- and four-byte sequences split across feeds at every possible point.
    #[test]
    fn test_decoder_valid_partial() {
        let mut d = GB18030Encoding.raw_decoder();
        assert_feed_ok!(d, [], [0xa1], "");
        assert_feed_ok!(d, [0xa1], [], "\u{3000}");
        assert_feed_ok!(d, [], [0x81], "");
        assert_feed_ok!(d, [], [0x30], "");
        assert_feed_ok!(d, [], [0x81], "");
        assert_feed_ok!(d, [0x30], [], "\u{80}");
        assert_feed_ok!(d, [], [0x81], "");
        assert_feed_ok!(d, [], [0x30], "");
        assert_feed_ok!(d, [0x81, 0x31], [], "\u{81}");
        assert_feed_ok!(d, [], [0x81], "");
        assert_feed_ok!(d, [0x30, 0x81, 0x32], [], "\u{82}");
        assert_feed_ok!(d, [], [0x81], "");
        assert_feed_ok!(d, [], [0x30, 0x81], "");
        assert_feed_ok!(d, [0x33], [], "\u{83}");
        assert_feed_ok!(d, [], [0x81, 0x30], "");
        assert_feed_ok!(d, [], [0x81], "");
        assert_feed_ok!(d, [0x34], [], "\u{84}");
        assert_feed_ok!(d, [], [0x81, 0x30], "");
        assert_feed_ok!(d, [0x81, 0x35], [], "\u{85}");
        assert_feed_ok!(d, [], [0x81, 0x30, 0x81], "");
        assert_feed_ok!(d, [0x36], [], "\u{86}");
        assert_finish_ok!(d, "");
    }

    // Decoder: finishing in the middle of a sequence (1, 2 or 3 bytes pending) is an error.
    #[test]
    fn test_decoder_invalid_partial() {
        let mut d = GB18030Encoding.raw_decoder();
        assert_feed_ok!(d, [], [0xa1], "");
        assert_finish_err!(d, "");

        let mut d = GB18030Encoding.raw_decoder();
        assert_feed_ok!(d, [], [0x81], "");
        assert_finish_err!(d, "");

        let mut d = GB18030Encoding.raw_decoder();
        assert_feed_ok!(d, [], [0x81, 0x30], "");
        assert_finish_err!(d, "");

        let mut d = GB18030Encoding.raw_decoder();
        assert_feed_ok!(d, [], [0x81, 0x30, 0x81], "");
        assert_finish_err!(d, "");
    }

    // Decoder: bytes just outside the accepted range at each position of a sequence.
    // In every case only the lead byte is reported as erroneous.
    #[test]
    fn test_decoder_invalid_out_of_range() {
        let mut d = GB18030Encoding.raw_decoder();
        assert_feed_err!(d, [], [0xff], [], "");
        assert_feed_err!(d, [], [0x81], [0x00], "");
        assert_feed_err!(d, [], [0x81], [0x7f], "");
        assert_feed_err!(d, [], [0x81], [0xff], "");
        assert_feed_err!(d, [], [0x81], [0x31, 0x00], "");
        assert_feed_err!(d, [], [0x81], [0x31, 0x80], "");
        assert_feed_err!(d, [], [0x81], [0x31, 0xff], "");
        assert_feed_err!(d, [], [0x81], [0x31, 0x81, 0x00], "");
        assert_feed_err!(d, [], [0x81], [0x31, 0x81, 0x2f], "");
        assert_feed_err!(d, [], [0x81], [0x31, 0x81, 0x3a], "");
        assert_feed_err!(d, [], [0x81], [0x31, 0x81, 0xff], "");
        assert_finish_ok!(d, "");
    }

    // Decoder: E3 32 9A 35 is U+10FFFF (see `test_decoder_valid`), so E3 32 9A 36 lies
    // just past the last mapped four-byte sequence and must be rejected. Only the lead
    // byte E3 is treated as erroneous; 32 9A 36 is listed as remaining input.
    #[test]
    fn test_decoder_invalid_boundary() {
        let mut d = GB18030Encoding.raw_decoder();
        assert_feed_ok!(d, [], [0xe3], "");
        assert_feed_err!(d, [], [], [0x32, 0x9a, 0x36], "");
        assert_finish_ok!(d, "");

        // Same sequence, but with the first three bytes already fed before the error.
        // NOTE(review): the exact meaning of the -2 backup argument is defined by
        // `assert_feed_err!`, which is not visible in this file.
        let mut d = GB18030Encoding.raw_decoder();
        assert_feed_ok!(d, [], [0xe3], "");
        assert_feed_ok!(d, [], [0x32, 0x9a], "");
        assert_feed_err!(d, -2, [], [], [0x32, 0x9a, 0x36], "");
        assert_finish_ok!(d, "");
    }

    // Decoder: after `raw_finish` reports an incomplete sequence, the decoder can be fed
    // again and starts from a clean state.
    #[test]
    fn test_decoder_feed_after_finish() {
        let mut d = GB18030Encoding.raw_decoder();
        assert_feed_ok!(d, [0xd2, 0xbb], [0xd2], "\u{4e00}");
        assert_finish_err!(d, "");
        assert_feed_ok!(d, [0xd2, 0xbb], [], "\u{4e00}");
        assert_finish_ok!(d, "");

        let mut d = GB18030Encoding.raw_decoder();
        assert_feed_ok!(d, [0x98, 0x35, 0xee, 0x37], [0x98, 0x35, 0xee], "\u{2a6a5}");
        assert_finish_err!(d, "");
        assert_feed_ok!(d, [0x98, 0x35, 0xee, 0x37], [0x98, 0x35], "\u{2a6a5}");
        assert_finish_err!(d, "");
        assert_feed_ok!(d, [0x98, 0x35, 0xee, 0x37], [0x98], "\u{2a6a5}");
        assert_finish_err!(d, "");
        assert_feed_ok!(d, [0x98, 0x35, 0xee, 0x37], [], "\u{2a6a5}");
        assert_finish_ok!(d, "");
    }

    // Benchmark: strict encoding of the shared simplified Chinese sample text.
    #[bench]
    fn bench_encode_short_text(bencher: &mut test::Bencher) {
        let s = testutils::SIMPLIFIED_CHINESE_TEXT;
        bencher.bytes = s.len() as u64;
        bencher.iter(|| test::black_box(GB18030Encoding.encode(s, EncoderTrap::Strict)))
    }

    // Benchmark: strict decoding of the sample text after encoding it once up front.
    #[bench]
    fn bench_decode_short_text(bencher: &mut test::Bencher) {
        let s = GB18030Encoding
            .encode(testutils::SIMPLIFIED_CHINESE_TEXT, EncoderTrap::Strict)
            .ok()
            .unwrap();
        bencher.bytes = s.len() as u64;
        bencher.iter(|| test::black_box(GB18030Encoding.decode(&s, DecoderTrap::Strict)))
    }
}
512
// Unit tests and benchmark for the GBK encoder.
// There are no decoder tests here: `GBKEncoding` hands out the GB 18030 decoder, which
// has its own test module.
#[cfg(test)]
mod gbk_tests {
    extern crate test;
    use super::GBKEncoding;
    use crate::testutils;
    use crate::types::*;

    // Encoder: two-byte output matches GB 18030, U+20AC becomes the single byte 0x80,
    // and every character that GB 18030 would write as four bytes is rejected.
    #[test]
    fn test_encoder() {
        let mut e = GBKEncoding.raw_encoder();
        assert_feed_ok!(e, "A", "", [0x41]);
        assert_feed_ok!(e, "BC", "", [0x42, 0x43]);
        assert_feed_ok!(e, "", "", []);
        assert_feed_ok!(
            e,
            "\u{4e2d}\u{534e}\u{4eba}\u{6c11}\u{5171}\u{548c}\u{56fd}",
            "",
            [0xd6, 0xd0, 0xbb, 0xaa, 0xc8, 0xcb, 0xc3, 0xf1, 0xb9, 0xb2, 0xba, 0xcd, 0xb9, 0xfa]
        );
        assert_feed_ok!(e, "1\u{20ac}/m", "", [0x31, 0x80, 0x2f, 0x6d]);
        assert_feed_ok!(
            e,
            "\u{ff21}\u{ff22}\u{ff23}",
            "",
            [0xa3, 0xc1, 0xa3, 0xc2, 0xa3, 0xc3]
        );
        assert_feed_err!(e, "", "\u{80}", "", []);
        assert_feed_err!(e, "", "\u{81}", "", []);
        assert_feed_err!(e, "", "\u{a3}", "", []);
        assert_feed_ok!(e, "\u{a4}", "", [0xa1, 0xe8]);
        assert_feed_err!(e, "", "\u{a5}", "", []);
        assert_feed_err!(e, "", "\u{10ffff}", "", []);
        assert_feed_err!(e, "", "\u{2a6a5}", "\u{3007}", []);
        assert_feed_err!(e, "\u{3007}", "\u{2a6a5}", "", [0xa9, 0x96]);
        assert_finish_ok!(e, []);
    }

    // Benchmark: strict encoding of the shared simplified Chinese sample text.
    #[bench]
    fn bench_encode_short_text(bencher: &mut test::Bencher) {
        let s = testutils::SIMPLIFIED_CHINESE_TEXT;
        bencher.bytes = s.len() as u64;
        bencher.iter(|| test::black_box(GBKEncoding.encode(s, EncoderTrap::Strict)))
    }
}
559
560#[derive(Clone, Copy)]
570pub struct HZEncoding;
571
572impl Encoding for HZEncoding {
573 fn name(&self) -> &'static str {
574 "hz"
575 }
576 fn whatwg_name(&self) -> Option<&'static str> {
577 None
578 }
579 fn raw_encoder(&self) -> Box<dyn RawEncoder> {
580 HZEncoder::new()
581 }
582 fn raw_decoder(&self) -> Box<dyn RawDecoder> {
583 HZDecoder::new()
584 }
585}
586
587#[derive(Clone, Copy)]
589pub struct HZEncoder {
590 escaped: bool,
591}
592
593impl HZEncoder {
594 #[allow(clippy::new_ret_no_self)]
595 pub fn new() -> Box<dyn RawEncoder> {
596 Box::new(HZEncoder { escaped: false })
597 }
598}
599
600impl RawEncoder for HZEncoder {
601 fn from_self(&self) -> Box<dyn RawEncoder> {
602 HZEncoder::new()
603 }
604 fn is_ascii_compatible(&self) -> bool {
605 false
606 }
607
608 fn raw_feed(
609 &mut self,
610 input: &str,
611 output: &mut dyn ByteWriter,
612 ) -> (usize, Option<CodecError>) {
613 output.writer_hint(input.len());
614
615 let mut escaped = self.escaped;
616 macro_rules! ensure_escaped(
617 () => (if !escaped { output.write_bytes(b"~{"); escaped = true; })
618 );
619 macro_rules! ensure_unescaped(
620 () => (if escaped { output.write_bytes(b"~}"); escaped = false; })
621 );
622
623 for ((i, j), ch) in input.index_iter() {
624 if ch < '\u{80}' {
625 ensure_unescaped!();
626 output.write_byte(ch as u8);
627 if ch == '~' {
628 output.write_byte(b'~');
629 }
630 } else {
631 let ptr = index::gb18030::backward(ch as u32);
632 if ptr == 0xffff {
633 self.escaped = escaped; return (
635 i,
636 Some(CodecError {
637 upto: j as isize,
638 cause: "unrepresentable character".into(),
639 }),
640 );
641 } else {
642 let lead = ptr / 190;
643 let trail = ptr % 190;
644 if lead < 0x21 - 1 || trail < 0x21 + 0x3f {
645 self.escaped = escaped; return (
648 i,
649 Some(CodecError {
650 upto: j as isize,
651 cause: "unrepresentable character".into(),
652 }),
653 );
654 } else {
655 ensure_escaped!();
656 output.write_byte((lead + 1) as u8);
657 output.write_byte((trail - 0x3f) as u8);
658 }
659 }
660 }
661 }
662
663 self.escaped = escaped;
664 (input.len(), None)
665 }
666
667 fn raw_finish(&mut self, _output: &mut dyn ByteWriter) -> Option<CodecError> {
668 None
669 }
670}
671
672#[derive(Clone, Copy)]
674struct HZDecoder {
675 st: hz::State,
676}
677
678impl HZDecoder {
679 #[allow(clippy::new_ret_no_self)]
680 pub fn new() -> Box<dyn RawDecoder> {
681 Box::new(HZDecoder {
682 st: Default::default(),
683 })
684 }
685}
686
687impl RawDecoder for HZDecoder {
688 fn from_self(&self) -> Box<dyn RawDecoder> {
689 HZDecoder::new()
690 }
691 fn is_ascii_compatible(&self) -> bool {
692 true
693 }
694
695 fn raw_feed(
696 &mut self,
697 input: &[u8],
698 output: &mut dyn StringWriter,
699 ) -> (usize, Option<CodecError>) {
700 let (st, processed, err) = hz::raw_feed(self.st, input, output, &());
701 self.st = st;
702 (processed, err)
703 }
704
705 fn raw_finish(&mut self, output: &mut dyn StringWriter) -> Option<CodecError> {
706 let (st, err) = hz::raw_finish(self.st, output, &());
707 self.st = st;
708 err
709 }
710}
711
// State machine for decoding HZ byte streams.
// States named A* are in ASCII mode, states named B* are in two-byte mode (after `~{`).
// A1/B1 have just seen a `~`; B2 holds the lead byte of a two-byte character.
stateful_decoder! {
    module hz;

    // Maps a two-byte pair read in two-byte mode to a code point through the `gb18030`
    // index. This is the inverse of what `HZEncoder` writes (`lead + 1`, `trail - 0x3f`).
    // Pairs outside lead 0x20..=0x7f / trail 0x21..=0x7e are looked up with the 0xffff
    // sentinel pointer; the caller (B2) treats a result of 0xffff as "no mapping".
    internal pub fn map_two_bytes(lead: u8, trail: u8) -> u32 {
        use crate::index_simpchinese as index;

        let lead = lead as u16;
        let trail = trail as u16;
        let index = match (lead, trail) {
            (0x20..=0x7f, 0x21..=0x7e) => (lead - 1) * 190 + (trail + 0x3f),
            _ => 0xffff,
        };
        index::gb18030::forward(index)
    }

initial:
    // ASCII mode, nothing pending.
    state A0(ctx: Context) {
        // `~` starts an escape; it must be matched before the general ASCII arm.
        case 0x7e => A1(ctx);
        case b @ 0x00..=0x7f => ctx.emit(b as u32);
        case _ => ctx.err("invalid sequence");
        final => ctx.reset();
    }

checkpoint:
    // Two-byte mode, nothing pending.
    state B0(ctx: Context) {
        case 0x7e => B1(ctx);
        case b @ 0x20..=0x7f => B2(ctx, b);
        // A newline is an error and names no follow-up state; the tests show that the
        // decoder is back in ASCII mode afterwards.
        case 0x0a => ctx.err("invalid sequence");
        // Any other byte is an error too, but the decoder stays in two-byte mode.
        case _ => ctx.err("invalid sequence"), B0(ctx);
        final => ctx.reset();
    }

transient:
    // ASCII mode, `~` seen.
    state A1(ctx: Context) {
        // `~{` enters two-byte mode.
        case 0x7b => B0(ctx);
        // `~}` leaves two-byte mode (already in ASCII mode here).
        case 0x7d => A0(ctx);
        // `~~` is a literal tilde.
        case 0x7e => ctx.emit(0x7e), A0(ctx);
        // `~` followed by a newline produces no output.
        case 0x0a => A0(ctx);
        case _ => ctx.backup_and_err(1, "invalid sequence");
        final => ctx.err("incomplete sequence");
    }

    // Two-byte mode, `~` seen. Same escapes as A1, except that `~~` and an invalid
    // escape keep the decoder in two-byte mode.
    state B1(ctx: Context) {
        case 0x7b => B0(ctx);
        case 0x7d => A0(ctx);
        case 0x7e => ctx.emit(0x7e), B0(ctx);
        case 0x0a => A0(ctx);
        case _ => ctx.backup_and_err(1, "invalid sequence"), B0(ctx);
        final => ctx.err("incomplete sequence");
    }

    // Two-byte mode, lead byte seen.
    state B2(ctx: Context, lead: u8) {
        // A newline in trail position is an error and names no follow-up state; the
        // tests show that the decoder is back in ASCII mode afterwards.
        case 0x0a => ctx.err("invalid sequence");
        // Anything else either completes a character or is an error; both continue in
        // two-byte mode.
        case b =>
            match map_two_bytes(lead, b) {
                0xffff => ctx.err("invalid sequence"),
                ch => ctx.emit(ch)
            },
            B0(ctx);
        final => ctx.err("incomplete sequence");
    }
}
779
// Unit tests and benchmarks for the HZ encoder and decoder.
#[cfg(test)]
mod hz_tests {
    extern crate test;
    use super::HZEncoding;
    use crate::testutils;
    use crate::types::*;

    // Encoder: mode switches (`~{` / `~}`) are emitted only when the mode actually
    // changes, the mode persists across feeds, and `~` is doubled.
    #[test]
    fn test_encoder_valid() {
        let mut e = HZEncoding.raw_encoder();
        assert_feed_ok!(e, "A", "", *b"A");
        assert_feed_ok!(e, "BC", "", *b"BC");
        assert_feed_ok!(e, "", "", *b"");
        assert_feed_ok!(
            e,
            "\u{4e2d}\u{534e}\u{4eba}\u{6c11}\u{5171}\u{548c}\u{56fd}",
            "",
            *b"~{VP;*HKCq92:M9z"
        );
        // Still in two-byte mode from the previous feed, so no `~{` is written here.
        assert_feed_ok!(e, "\u{ff21}\u{ff22}\u{ff23}", "", *b"#A#B#C");
        assert_feed_ok!(e, "1\u{20ac}/m", "", *b"~}1~{\"c~}/m");
        assert_feed_ok!(e, "~<\u{a4}~\u{0a4}>~", "", *b"~~<~{!h~}~~~{!h~}>~~");
        assert_finish_ok!(e, []);
    }

    // Encoder: characters missing from the index, and characters rejected by the
    // lead/trail window check (U+3007), are reported as errors.
    #[test]
    fn test_encoder_invalid() {
        let mut e = HZEncoding.raw_encoder();
        assert_feed_err!(e, "", "\u{ffff}", "", []);
        assert_feed_err!(e, "?", "\u{ffff}", "!", [0x3f]);
        assert_feed_err!(e, "", "\u{3007}", "", []);
        assert_finish_ok!(e, []);
    }

    // Decoder: escapes (`~~`, `~{`, `~}`, `~` + newline) in both modes, including
    // escapes that are split across feeds.
    #[test]
    fn test_decoder_valid() {
        let mut d = HZEncoding.raw_decoder();
        assert_feed_ok!(d, *b"A", *b"", "A");
        assert_feed_ok!(d, *b"BC", *b"", "BC");
        assert_feed_ok!(d, *b"D~~E", *b"~", "D~E");
        assert_feed_ok!(d, *b"~F~\nG", *b"~", "~FG");
        assert_feed_ok!(d, *b"", *b"", "");
        assert_feed_ok!(d, *b"\nH", *b"~", "H");
        assert_feed_ok!(
            d,
            *b"{VP~}~{;*~{HKCq92:M9z",
            *b"",
            "\u{4e2d}\u{534e}\u{4eba}\u{6c11}\u{5171}\u{548c}\u{56fd}"
        );
        assert_feed_ok!(d, *b"", *b"#", "");
        assert_feed_ok!(d, *b"A", *b"~", "\u{ff21}");
        assert_feed_ok!(d, *b"~#B~~#C", *b"~", "~\u{ff22}~\u{ff23}");
        assert_feed_ok!(d, *b"", *b"", "");
        assert_feed_ok!(d, *b"\n#D~{#E~\n#F~{#G", *b"~", "#D\u{ff25}#F\u{ff27}");
        assert_feed_ok!(d, *b"}X~}YZ", *b"", "XYZ");
        assert_finish_ok!(d, "");
    }

    // Decoder: byte pairs at the edges of the accepted lead/trail ranges are rejected
    // in two-byte mode. The decoder stays in two-byte mode after each error, which is
    // why the pairs can be fed back to back.
    #[test]
    fn test_decoder_invalid_out_or_range() {
        let mut d = HZEncoding.raw_decoder();
        assert_feed_ok!(d, *b"~{", *b"", "");
        assert_feed_err!(d, *b"", *b"\x20\x20", *b"", "");
        assert_feed_err!(d, *b"", *b"\x20\x7f", *b"", "");
        assert_feed_err!(d, *b"", *b"\x21\x7f", *b"", "");
        assert_feed_err!(d, *b"", *b"\x7f\x20", *b"", "");
        assert_feed_err!(d, *b"", *b"\x7f\x21", *b"", "");
        assert_feed_err!(d, *b"", *b"\x7f\x7f", *b"", "");
        assert_finish_ok!(d, "");
    }

    // Decoder: a newline in two-byte mode (alone or in trail position) is an error that
    // also drops the decoder back to ASCII mode, as shown by `#B` / `#D` decoding as
    // plain ASCII right afterwards.
    #[test]
    fn test_decoder_invalid_carriage_return() {
        let mut d = HZEncoding.raw_decoder();
        assert_feed_ok!(d, *b"~{#A", *b"", "\u{ff21}");
        assert_feed_err!(d, *b"", *b"\n", *b"", "");
        assert_feed_ok!(d, *b"#B~{#C", *b"", "#B\u{ff23}");
        assert_feed_err!(d, *b"", *b"#\n", *b"", "");
        assert_feed_ok!(d, *b"#D", *b"", "#D");
        assert_finish_ok!(d, "");
    }

    // Decoder: finishing with a pending `~` or a pending lead byte is an error.
    #[test]
    fn test_decoder_invalid_partial() {
        let mut d = HZEncoding.raw_decoder();
        assert_feed_ok!(d, *b"", *b"~", "");
        assert_finish_err!(d, "");

        let mut d = HZEncoding.raw_decoder();
        assert_feed_ok!(d, *b"~{", *b"#", "");
        assert_finish_err!(d, "");

        let mut d = HZEncoding.raw_decoder();
        assert_feed_ok!(d, *b"~{#A", *b"~", "\u{ff21}");
        assert_finish_err!(d, "");
    }

    // Decoder: an unknown escape (`~x`) is an error in either mode, whether or not the
    // `~` arrived in an earlier feed.
    #[test]
    fn test_decoder_invalid_escape() {
        let mut d = HZEncoding.raw_decoder();
        assert_feed_ok!(d, *b"#A", *b"", "#A");
        assert_feed_err!(d, *b"", *b"~", *b"xy", "");
        assert_feed_ok!(d, *b"#B", *b"", "#B");
        assert_feed_ok!(d, *b"", *b"~", "");
        assert_feed_err!(d, *b"", *b"", *b"xy", "");
        assert_feed_ok!(d, *b"#C~{#D", *b"", "#C\u{ff24}");
        assert_feed_err!(d, *b"", *b"~", *b"xy", "");
        // The invalid escape did not leave two-byte mode: `#E` still decodes as U+FF25.
        assert_feed_ok!(d, *b"#E", *b"", "\u{ff25}");
        assert_feed_ok!(d, *b"", *b"~", "");
        assert_feed_err!(d, *b"", *b"", *b"xy", "");
        assert_feed_ok!(d, *b"#F~}#G", *b"", "\u{ff26}#G");
        assert_finish_ok!(d, "");
    }

    // Decoder: after `raw_finish` reports an incomplete sequence, the decoder can be fed
    // again and starts over in ASCII mode.
    #[test]
    fn test_decoder_feed_after_finish() {
        let mut d = HZEncoding.raw_decoder();
        assert_feed_ok!(d, *b"R;~{R;", *b"R", "R;\u{4e00}");
        assert_finish_err!(d, "");
        assert_feed_ok!(d, *b"R;~{R;", *b"", "R;\u{4e00}");
        assert_finish_ok!(d, "");
    }

    // Benchmark: strict encoding of the shared simplified Chinese sample text.
    #[bench]
    fn bench_encode_short_text(bencher: &mut test::Bencher) {
        let s = testutils::SIMPLIFIED_CHINESE_TEXT;
        bencher.bytes = s.len() as u64;
        bencher.iter(|| test::black_box(HZEncoding.encode(s, EncoderTrap::Strict)))
    }

    // Benchmark: strict decoding of the sample text after encoding it once up front.
    #[bench]
    fn bench_decode_short_text(bencher: &mut test::Bencher) {
        let s = HZEncoding
            .encode(testutils::SIMPLIFIED_CHINESE_TEXT, EncoderTrap::Strict)
            .ok()
            .unwrap();
        bencher.bytes = s.len() as u64;
        bencher.iter(|| test::black_box(HZEncoding.decode(&s, DecoderTrap::Strict)))
    }
}