encoding/codec/
japanese.rs

1// This is a part of encoding-next.
2// Copyright (c) 2013-2015, Kang Seonghoon.
3// See README.md and LICENSE.txt for details.
4
5//! Legacy Japanese encodings based on JIS X 0208 and JIS X 0212.
6
7use self::ISO2022JPState::{Katakana, Lead, ASCII};
8use crate::index_japanese as index;
9use crate::types::*;
10use crate::util::StrCharIndex;
11use std::convert::Into;
12use std::default::Default;
13
/**
 * EUC-JP. (XXX with asymmetric JIS X 0212 support)
 *
 * This is a Japanese encoding created from three JIS character sets:
 *
 * - JIS X 0201, whose lower half is ISO/IEC 646:JP (US-ASCII with yen sign and overline)
 *   and whose upper half contains legacy half-width Katakana.
 * - JIS X 0208, a primary graphic character set (94x94).
 * - JIS X 0212, a supplementary graphic character set (94x94).
 *
 * EUC-JP contains the lower half of JIS X 0201 in G0 (`[21-7E]`),
 * JIS X 0208 in G1 (`[A1-FE] [A1-FE]`),
 * the upper half of JIS X 0201 in G2 (`8E [A1-DF]`), and
 * JIS X 0212 in G3 (`8F [A1-FE] [A1-FE]`).
 *
 * The support is "asymmetric" in that the decoder accepts JIS X 0212 sequences
 * while the encoder never produces them.
 */
#[derive(Clone, Copy)]
pub struct EUCJPEncoding;
31
32impl Encoding for EUCJPEncoding {
33    fn name(&self) -> &'static str {
34        "euc-jp"
35    }
36    fn whatwg_name(&self) -> Option<&'static str> {
37        Some("euc-jp")
38    }
39    fn raw_encoder(&self) -> Box<dyn RawEncoder> {
40        EUCJPEncoder::new()
41    }
42    fn raw_decoder(&self) -> Box<dyn RawDecoder> {
43        EUCJP0212Decoder::new()
44    }
45}
46
/// An encoder for EUC-JP with unused G3 character set.
///
/// It is stateless: characters are emitted from JIS X 0201 (G0/G2) and JIS X 0208 (G1) only,
/// so JIS X 0212 (G3) sequences are never produced.
#[derive(Clone, Copy)]
pub struct EUCJPEncoder;
50
51impl EUCJPEncoder {
52    #[allow(clippy::new_ret_no_self)]
53    pub fn new() -> Box<dyn RawEncoder> {
54        Box::new(EUCJPEncoder)
55    }
56}
57
58impl RawEncoder for EUCJPEncoder {
59    fn from_self(&self) -> Box<dyn RawEncoder> {
60        EUCJPEncoder::new()
61    }
62    fn is_ascii_compatible(&self) -> bool {
63        true
64    }
65
66    fn raw_feed(
67        &mut self,
68        input: &str,
69        output: &mut dyn ByteWriter,
70    ) -> (usize, Option<CodecError>) {
71        output.writer_hint(input.len());
72
73        for ((i, j), ch) in input.index_iter() {
74            match ch {
75                '\u{0}'..='\u{7f}' => {
76                    output.write_byte(ch as u8);
77                }
78                '\u{a5}' => {
79                    output.write_byte(0x5c);
80                }
81                '\u{203e}' => {
82                    output.write_byte(0x7e);
83                }
84                '\u{ff61}'..='\u{ff9f}' => {
85                    output.write_byte(0x8e);
86                    output.write_byte((ch as usize - 0xff61 + 0xa1) as u8);
87                }
88                _ => {
89                    let ptr = index::jis0208::backward(ch as u32);
90                    if ptr == 0xffff {
91                        return (
92                            i,
93                            Some(CodecError {
94                                upto: j as isize,
95                                cause: "unrepresentable character".into(),
96                            }),
97                        );
98                    } else {
99                        let lead = ptr / 94 + 0xa1;
100                        let trail = ptr % 94 + 0xa1;
101                        output.write_byte(lead as u8);
102                        output.write_byte(trail as u8);
103                    }
104                }
105            }
106        }
107        (input.len(), None)
108    }
109
110    fn raw_finish(&mut self, _output: &mut dyn ByteWriter) -> Option<CodecError> {
111        None
112    }
113}
114
/// A decoder for EUC-JP with JIS X 0212 in G3.
#[derive(Clone, Copy)]
struct EUCJP0212Decoder {
    // Current state of the `eucjp` state machine, carried across `raw_feed` calls.
    st: eucjp::State,
}
120
121impl EUCJP0212Decoder {
122    #[allow(clippy::new_ret_no_self)]
123    pub fn new() -> Box<dyn RawDecoder> {
124        Box::new(EUCJP0212Decoder {
125            st: Default::default(),
126        })
127    }
128}
129
130impl RawDecoder for EUCJP0212Decoder {
131    fn from_self(&self) -> Box<dyn RawDecoder> {
132        EUCJP0212Decoder::new()
133    }
134    fn is_ascii_compatible(&self) -> bool {
135        true
136    }
137
138    fn raw_feed(
139        &mut self,
140        input: &[u8],
141        output: &mut dyn StringWriter,
142    ) -> (usize, Option<CodecError>) {
143        let (st, processed, err) = eucjp::raw_feed(self.st, input, output, &());
144        self.st = st;
145        (processed, err)
146    }
147
148    fn raw_finish(&mut self, output: &mut dyn StringWriter) -> Option<CodecError> {
149        let (st, err) = eucjp::raw_finish(self.st, output, &());
150        self.st = st;
151        err
152    }
153}
154
// State machine for the EUC-JP decoder, generated by `stateful_decoder!`.
//
// Sequences handled:
// - `[00-7F]`                  single byte, emitted as-is
// - `8E [A1-DF]`               JIS X 0201 half-width katakana
// - `[A1-FE] [A1-FE]`          JIS X 0208
// - `8F [A1-FE] [A1-FE]`       JIS X 0212
stateful_decoder! {
    module eucjp;

    // Maps a two-byte JIS X 0208 sequence to a code point via the index.
    // Out-of-range bytes are looked up as index 0xffff; callers treat a 0xffff result as
    // "no mapping".
    internal pub fn map_two_0208_bytes(lead: u8, trail: u8) -> u32 {
        use crate::index_japanese as index;

        let lead = lead as u16;
        let trail = trail as u16;
        let index = match (lead, trail) {
            // row-major position within the 94x94 table
            (0xa1..=0xfe, 0xa1..=0xfe) => (lead - 0xa1) * 94 + trail - 0xa1,
            _ => 0xffff,
        };
        index::jis0208::forward(index)
    }

    // Same as `map_two_0208_bytes`, but for the JIS X 0212 index
    // (the two bytes following the 0x8f prefix).
    internal pub fn map_two_0212_bytes(lead: u8, trail: u8) -> u32 {
        use crate::index_japanese as index;

        let lead = lead as u16;
        let trail = trail as u16;
        let index = match (lead, trail) {
            // row-major position within the 94x94 table
            (0xa1..=0xfe, 0xa1..=0xfe) => (lead - 0xa1) * 94 + trail - 0xa1,
            _ => 0xffff,
        };
        index::jis0212::forward(index)
    }

initial:
    // euc-jp lead = 0x00
    state S0(ctx: Context) {
        case b @ 0x00..=0x7f => ctx.emit(b as u32);
        case 0x8e => S1(ctx);
        case 0x8f => S2(ctx);
        case b @ 0xa1..=0xfe => S3(ctx, b);
        case _ => ctx.err("invalid sequence");
    }

transient:
    // euc-jp lead = 0x8e
    // JIS X 0201 half-width katakana
    state S1(ctx: Context) {
        case b @ 0xa1..=0xdf => ctx.emit(0xff61 + b as u32 - 0xa1);
        // reached only for 0xe0..=0xfe: shaped like a trail byte but unassigned,
        // so it is consumed together with the lead.
        case 0xa1..=0xfe => ctx.err("invalid sequence");
        // not a trail byte at all: report the lead only and leave this byte unconsumed.
        case _ => ctx.backup_and_err(1, "invalid sequence");
    }

    // euc-jp lead = 0x8f
    // JIS X 0212 prefix; the next byte is the first byte of the JIS X 0212 pair
    state S2(ctx: Context) {
        case b @ 0xa1..=0xfe => S4(ctx, b);
        case _ => ctx.backup_and_err(1, "invalid sequence");
    }

    // euc-jp lead != 0x00, euc-jp jis0212 flag = unset
    // JIS X 0208 two-byte sequence
    state S3(ctx: Context, lead: u8) {
        case b @ 0xa1..=0xfe => match map_two_0208_bytes(lead, b) {
            // do NOT backup, we only backup for out-of-range trails.
            0xffff => ctx.err("invalid sequence"),
            ch => ctx.emit(ch as u32)
        };
        case _ => ctx.backup_and_err(1, "invalid sequence");
    }

    // euc-jp lead != 0x00, euc-jp jis0212 flag = set
    // JIS X 0212 three-byte sequence
    state S4(ctx: Context, lead: u8) {
        case b @ 0xa1..=0xfe => match map_two_0212_bytes(lead, b) {
            // do NOT backup, we only backup for out-of-range trails.
            0xffff => ctx.err("invalid sequence"),
            ch => ctx.emit(ch as u32)
        };
        case _ => ctx.backup_and_err(1, "invalid sequence");
    }
}
229
// Tests and benchmarks for the EUC-JP encoder and decoder.
//
// NOTE(review): the assertion macros are defined elsewhere. As used here, their arguments
// appear to be `assert_feed_ok!(coder, processed, unprocessed, output)` and
// `assert_feed_err!(coder, processed, problem, remaining, output)` -- confirm against
// the macro definitions.
#[cfg(test)]
mod eucjp_tests {
    extern crate test;
    use super::EUCJPEncoding;
    use crate::testutils;
    use crate::types::*;

    // ASCII, the yen/overline remapping, JIS X 0208 and half-width katakana all encode.
    #[test]
    fn test_encoder_valid() {
        let mut e = EUCJPEncoding.raw_encoder();
        assert_feed_ok!(e, "A", "", [0x41]);
        assert_feed_ok!(e, "BC", "", [0x42, 0x43]);
        assert_feed_ok!(e, "", "", []);
        assert_feed_ok!(e, "\u{a5}", "", [0x5c]);
        assert_feed_ok!(e, "\u{203e}", "", [0x7e]);
        assert_feed_ok!(
            e,
            "\u{306b}\u{307b}\u{3093}",
            "",
            [0xa4, 0xcb, 0xa4, 0xdb, 0xa4, 0xf3]
        );
        assert_feed_ok!(
            e,
            "\u{ff86}\u{ff8e}\u{ff9d}",
            "",
            [0x8e, 0xc6, 0x8e, 0xce, 0x8e, 0xdd]
        );
        assert_feed_ok!(e, "\u{65e5}\u{672c}", "", [0xc6, 0xfc, 0xcb, 0xdc]);
        assert_finish_ok!(e, []);
    }

    #[test]
    fn test_encoder_double_mapped() {
        // these characters are double-mapped to both EUDC area and Shift_JIS extension area
        // but only the former should be used. (note that U+FFE2 is triple-mapped!)
        let mut e = EUCJPEncoding.raw_encoder();
        assert_feed_ok!(
            e,
            "\u{9ed1}\u{2170}\u{ffe2}",
            "",
            [0xfc, 0xee, 0xfc, 0xf1, 0xa2, 0xcc]
        );
        assert_finish_ok!(e, []);
    }

    // Unrepresentable characters stop the encoder at the offending character.
    #[test]
    fn test_encoder_invalid() {
        let mut e = EUCJPEncoding.raw_encoder();
        assert_feed_err!(e, "", "\u{ffff}", "", []);
        assert_feed_err!(e, "?", "\u{ffff}", "!", [0x3f]);
        // JIS X 0212 is not supported in the encoder
        assert_feed_err!(e, "", "\u{736c}", "\u{8c78}", []);
        assert_finish_ok!(e, []);
    }

    // Whole sequences fed at once, including a three-byte JIS X 0212 sequence.
    #[test]
    fn test_decoder_valid() {
        let mut d = EUCJPEncoding.raw_decoder();
        assert_feed_ok!(d, [0x41], [], "A");
        assert_feed_ok!(d, [0x42, 0x43], [], "BC");
        assert_feed_ok!(d, [], [], "");
        assert_feed_ok!(d, [0x5c], [], "\\");
        assert_feed_ok!(d, [0x7e], [], "~");
        assert_feed_ok!(
            d,
            [0xa4, 0xcb, 0xa4, 0xdb, 0xa4, 0xf3],
            [],
            "\u{306b}\u{307b}\u{3093}"
        );
        assert_feed_ok!(
            d,
            [0x8e, 0xc6, 0x8e, 0xce, 0x8e, 0xdd],
            [],
            "\u{ff86}\u{ff8e}\u{ff9d}"
        );
        assert_feed_ok!(d, [0xc6, 0xfc, 0xcb, 0xdc], [], "\u{65e5}\u{672c}");
        assert_feed_ok!(d, [0x8f, 0xcb, 0xc6, 0xec, 0xb8], [], "\u{736c}\u{8c78}");
        assert_finish_ok!(d, "");
    }

    // Sequences split across feeds: the decoder must carry pending lead bytes between calls.
    #[test]
    fn test_decoder_valid_partial() {
        let mut d = EUCJPEncoding.raw_decoder();
        assert_feed_ok!(d, [], [0xa4], "");
        assert_feed_ok!(d, [0xcb], [0xa4], "\u{306b}");
        assert_feed_ok!(d, [0xdb], [0xa4], "\u{307b}");
        assert_feed_ok!(d, [0xf3], [], "\u{3093}");
        assert_feed_ok!(d, [], [0x8e], "");
        assert_feed_ok!(d, [0xc6], [0x8e], "\u{ff86}");
        assert_feed_ok!(d, [0xce], [0x8e], "\u{ff8e}");
        assert_feed_ok!(d, [0xdd], [], "\u{ff9d}");
        assert_feed_ok!(d, [], [0xc6], "");
        assert_feed_ok!(d, [0xfc], [0xcb], "\u{65e5}");
        assert_feed_ok!(d, [0xdc], [], "\u{672c}");
        assert_feed_ok!(d, [], [0x8f], "");
        assert_feed_ok!(d, [], [0xcb], "");
        assert_feed_ok!(d, [0xc6], [0xec], "\u{736c}");
        assert_feed_ok!(d, [0xb8], [], "\u{8c78}");
        assert_feed_ok!(d, [], [0x8f, 0xcb], "");
        assert_feed_ok!(d, [0xc6, 0xec, 0xb8], [], "\u{736c}\u{8c78}");
        assert_finish_ok!(d, "");
    }

    // A lone lead byte is only an error once the stream finishes;
    // bytes that can never start a sequence fail immediately.
    #[test]
    fn test_decoder_invalid_lone_lead_immediate_test_finish() {
        for i in 0x8e..0x90 {
            let mut d = EUCJPEncoding.raw_decoder();
            assert_feed_ok!(d, [], [i], ""); // wait for a trail
            assert_finish_err!(d, "");
        }

        for i in 0xa1..0xff {
            let mut d = EUCJPEncoding.raw_decoder();
            assert_feed_ok!(d, [], [i], ""); // wait for a trail
            assert_finish_err!(d, "");
        }

        // immediate failures
        let mut d = EUCJPEncoding.raw_decoder();
        for i in 0x80..0x8e {
            assert_feed_err!(d, [], [i], [], "");
        }
        for i in 0x90..0xa1 {
            assert_feed_err!(d, [], [i], [], "");
        }
        assert_feed_err!(d, [], [0xff], [], "");
        assert_finish_ok!(d, "");
    }

    // An ASCII byte after any non-ASCII byte must stay unconsumed so it can be decoded next.
    #[test]
    fn test_decoder_invalid_lone_lead_followed_by_space() {
        for i in 0x80..0x100 {
            let i = i as u8;
            let mut d = EUCJPEncoding.raw_decoder();
            assert_feed_err!(d, [], [i], [0x20], "");
            assert_finish_ok!(d, "");
        }
    }

    // Bytes 0x80 and 0xff are never valid trails; only the first byte is reported as the problem.
    #[test]
    fn test_decoder_invalid_lead_followed_by_invalid_trail() {
        for i in 0x80..0x100 {
            let i = i as u8;
            let mut d = EUCJPEncoding.raw_decoder();
            assert_feed_err!(d, [], [i], [0x80], "");
            assert_feed_err!(d, [], [i], [0xff], "");
            assert_finish_ok!(d, "");
        }
    }

    // `8F xx` without the final byte is an error only at finish.
    #[test]
    fn test_decoder_invalid_lone_lead_for_0212_immediate_test_finish() {
        for i in 0xa1..0xff {
            let mut d = EUCJPEncoding.raw_decoder();
            assert_feed_ok!(d, [], [0x8f, i], ""); // wait for a trail
            assert_finish_err!(d, "");
        }
    }

    // Same as above, with the two pending bytes arriving in separate feeds.
    #[test]
    fn test_decoder_invalid_lone_lead_for_0212_immediate_test_finish_partial() {
        for i in 0xa1..0xff {
            let mut d = EUCJPEncoding.raw_decoder();
            assert_feed_ok!(d, [], [0x8f], "");
            assert_feed_ok!(d, [], [i], ""); // wait for a trail
            assert_finish_err!(d, "");
        }
    }

    // After 0x8e: bytes below 0xa1 are left unconsumed, while 0xe0..=0xfe are consumed
    // as part of the invalid sequence.
    #[test]
    fn test_decoder_invalid_trail_for_0201() {
        for i in 0..0xa1 {
            let mut d = EUCJPEncoding.raw_decoder();
            assert_feed_err!(d, [], [0x8e], [i], "");
            assert_finish_ok!(d, "");
        }

        for i in 0xe0..0xff {
            let mut d = EUCJPEncoding.raw_decoder();
            assert_feed_err!(d, [], [0x8e, i], [], "");
            assert_finish_ok!(d, "");
        }
    }

    // Same as above, with the 0x8e lead fed separately from the trail.
    #[test]
    fn test_decoder_invalid_trail_for_0201_partial() {
        for i in 0..0xa1 {
            let mut d = EUCJPEncoding.raw_decoder();
            assert_feed_ok!(d, [], [0x8e], "");
            assert_feed_err!(d, [], [], [i], "");
            assert_finish_ok!(d, "");
        }

        for i in 0xe0..0xff {
            let mut d = EUCJPEncoding.raw_decoder();
            assert_feed_ok!(d, [], [0x8e], "");
            assert_feed_err!(d, [], [i], [], "");
            assert_finish_ok!(d, "");
        }
    }

    // After 0x8f, a byte below 0xa1 is invalid and is left unconsumed.
    #[test]
    fn test_decoder_invalid_middle_for_0212() {
        for i in 0..0xa1 {
            let mut d = EUCJPEncoding.raw_decoder();
            assert_feed_err!(d, [], [0x8f], [i], "");
            assert_finish_ok!(d, "");
        }
    }

    // Same as above, with the 0x8f prefix fed separately.
    #[test]
    fn test_decoder_invalid_middle_for_0212_partial() {
        for i in 0..0xa1 {
            let mut d = EUCJPEncoding.raw_decoder();
            assert_feed_ok!(d, [], [0x8f], "");
            assert_feed_err!(d, [], [], [i], "");
            assert_finish_ok!(d, "");
        }
    }

    // After `8F A1`, a byte below 0xa1 is invalid and is left unconsumed.
    #[test]
    fn test_decoder_invalid_trail_for_0212() {
        for i in 0..0xa1 {
            let mut d = EUCJPEncoding.raw_decoder();
            assert_feed_err!(d, [], [0x8f, 0xa1], [i], "");
            assert_finish_ok!(d, "");
        }
    }

    // Same as above, with every byte arriving in its own feed.
    #[test]
    fn test_decoder_invalid_trail_for_0212_partial() {
        for i in 0..0xa1 {
            let mut d = EUCJPEncoding.raw_decoder();
            assert_feed_ok!(d, [], [0x8f], "");
            assert_feed_ok!(d, [], [0xa1], "");
            assert_feed_err!(d, [], [], [i], "");
            assert_finish_ok!(d, "");
        }
    }

    // A failed finish must reset the decoder so that it can be reused.
    #[test]
    fn test_decoder_feed_after_finish() {
        let mut d = EUCJPEncoding.raw_decoder();
        assert_feed_ok!(d, [0xa4, 0xa2], [0xa4], "\u{3042}");
        assert_finish_err!(d, "");
        assert_feed_ok!(d, [0xa4, 0xa2], [], "\u{3042}");
        assert_finish_ok!(d, "");
    }

    // Throughput of encoding the shared Japanese sample text.
    #[bench]
    fn bench_encode_short_text(bencher: &mut test::Bencher) {
        let s = testutils::JAPANESE_TEXT;
        bencher.bytes = s.len() as u64;
        bencher.iter(|| test::black_box(EUCJPEncoding.encode(s, EncoderTrap::Strict)))
    }

    // Throughput of decoding the same text after encoding it to EUC-JP once.
    #[bench]
    fn bench_decode_short_text(bencher: &mut test::Bencher) {
        let s = EUCJPEncoding
            .encode(testutils::JAPANESE_TEXT, EncoderTrap::Strict)
            .ok()
            .unwrap();
        bencher.bytes = s.len() as u64;
        bencher.iter(|| test::black_box(EUCJPEncoding.decode(&s, DecoderTrap::Strict)))
    }
}
496
/**
 * Windows code page 932, i.e. Shift_JIS with IBM/NEC extensions.
 *
 * This is a Japanese encoding for JIS X 0208
 * compatible with the original assignments of JIS X 0201 (`[21-7E A1-DF]`).
 * The 94 by 94 region of JIS X 0208 is sliced, or rather "shifted" into
 * the odd half (odd row number) and even half (even row number),
 * and merged into the 188 by 47 region mapped to `[81-9F E0-EF] [40-7E 80-FC]`.
 * The remaining area, `[80 A0 F0-FF] [40-7E 80-FC]`, has been subjected to
 * numerous extensions incompatible with each other.
 * This particular implementation uses IBM/NEC extensions
 * which assign more characters to `[F0-FC 80-FC]` and also to the Private Use Area (PUA).
 * It requires some care to handle
 * since the second byte of JIS X 0208 can have its MSB unset.
 */
#[derive(Clone, Copy)]
pub struct Windows31JEncoding;
514
515impl Encoding for Windows31JEncoding {
516    fn name(&self) -> &'static str {
517        "windows-31j"
518    }
519    fn whatwg_name(&self) -> Option<&'static str> {
520        Some("shift_jis")
521    } // WHATWG compatibility
522    fn raw_encoder(&self) -> Box<dyn RawEncoder> {
523        Windows31JEncoder::new()
524    }
525    fn raw_decoder(&self) -> Box<dyn RawDecoder> {
526        Windows31JDecoder::new()
527    }
528}
529
/// An encoder for Shift_JIS with IBM/NEC extensions.
///
/// The encoder carries no state between calls.
#[derive(Clone, Copy)]
pub struct Windows31JEncoder;
533
534impl Windows31JEncoder {
535    #[allow(clippy::new_ret_no_self)]
536    pub fn new() -> Box<dyn RawEncoder> {
537        Box::new(Windows31JEncoder)
538    }
539}
540
541impl RawEncoder for Windows31JEncoder {
542    fn from_self(&self) -> Box<dyn RawEncoder> {
543        Windows31JEncoder::new()
544    }
545    fn is_ascii_compatible(&self) -> bool {
546        true
547    }
548
549    fn raw_feed(
550        &mut self,
551        input: &str,
552        output: &mut dyn ByteWriter,
553    ) -> (usize, Option<CodecError>) {
554        output.writer_hint(input.len());
555
556        for ((i, j), ch) in input.index_iter() {
557            match ch {
558                '\u{0}'..='\u{80}' => {
559                    output.write_byte(ch as u8);
560                }
561                '\u{a5}' => {
562                    output.write_byte(0x5c);
563                }
564                '\u{203e}' => {
565                    output.write_byte(0x7e);
566                }
567                '\u{ff61}'..='\u{ff9f}' => {
568                    output.write_byte((ch as usize - 0xff61 + 0xa1) as u8);
569                }
570                _ => {
571                    // corresponds to the "index shift_jis pointer" in the WHATWG spec
572                    let ptr = index::jis0208::backward_remapped(ch as u32);
573                    if ptr == 0xffff {
574                        return (
575                            i,
576                            Some(CodecError {
577                                upto: j as isize,
578                                cause: "unrepresentable character".into(),
579                            }),
580                        );
581                    } else {
582                        let lead = ptr / 188;
583                        let leadoffset = if lead < 0x1f { 0x81 } else { 0xc1 };
584                        let trail = ptr % 188;
585                        let trailoffset = if trail < 0x3f { 0x40 } else { 0x41 };
586                        output.write_byte((lead + leadoffset) as u8);
587                        output.write_byte((trail + trailoffset) as u8);
588                    }
589                }
590            }
591        }
592        (input.len(), None)
593    }
594
595    fn raw_finish(&mut self, _output: &mut dyn ByteWriter) -> Option<CodecError> {
596        None
597    }
598}
599
/// A decoder for Shift_JIS with IBM/NEC extensions.
#[derive(Clone, Copy)]
struct Windows31JDecoder {
    // Current state of the `windows31j` state machine, carried across `raw_feed` calls.
    st: windows31j::State,
}
605
606impl Windows31JDecoder {
607    #[allow(clippy::new_ret_no_self)]
608    pub fn new() -> Box<dyn RawDecoder> {
609        Box::new(Windows31JDecoder {
610            st: Default::default(),
611        })
612    }
613}
614
615impl RawDecoder for Windows31JDecoder {
616    fn from_self(&self) -> Box<dyn RawDecoder> {
617        Windows31JDecoder::new()
618    }
619    fn is_ascii_compatible(&self) -> bool {
620        true
621    }
622
623    fn raw_feed(
624        &mut self,
625        input: &[u8],
626        output: &mut dyn StringWriter,
627    ) -> (usize, Option<CodecError>) {
628        let (st, processed, err) = windows31j::raw_feed(self.st, input, output, &());
629        self.st = st;
630        (processed, err)
631    }
632
633    fn raw_finish(&mut self, output: &mut dyn StringWriter) -> Option<CodecError> {
634        let (st, err) = windows31j::raw_finish(self.st, output, &());
635        self.st = st;
636        err
637    }
638}
639
// State machine for the Windows-31J (Shift_JIS) decoder, generated by `stateful_decoder!`.
stateful_decoder! {
    module windows31j;

    // Maps a lead/trail byte pair to a code point.
    // Leads `F0-F9` with a valid trail map straight into the Private Use Area from U+E000;
    // other valid pairs go through the JIS X 0208 index. Invalid pairs are looked up as
    // index 0xffff; the caller treats a 0xffff result as "no mapping".
    internal pub fn map_two_0208_bytes(lead: u8, trail: u8) -> u32 {
        use crate::index_japanese as index;

        let lead = lead as u16;
        let trail = trail as u16;
        // Both byte ranges have a gap (A0-DF for leads, 7F for trails),
        // so the base offset depends on which side of the gap the byte is on.
        let leadoffset = if lead < 0xa0 {0x81} else {0xc1};
        let trailoffset = if trail < 0x7f {0x40} else {0x41};
        let index = match (lead, trail) {
            // This arm must precede the general one, whose `E0-FC` lead range overlaps it.
            (0xf0..=0xf9, 0x40..=0x7e) | (0xf0..=0xf9, 0x80..=0xfc) =>
                return (0xe000 + (lead - 0xf0) * 188 + trail - trailoffset) as u32,
            (0x81..=0x9f, 0x40..=0x7e) | (0x81..=0x9f, 0x80..=0xfc) |
            (0xe0..=0xfc, 0x40..=0x7e) | (0xe0..=0xfc, 0x80..=0xfc) =>
                (lead - leadoffset) * 188 + trail - trailoffset,
            _ => 0xffff,
        };
        index::jis0208::forward(index)
    }

initial:
    // shift_jis lead = 0x00
    state S0(ctx: Context) {
        // ASCII, plus 0x80 which is passed through as U+0080
        case b @ 0x00..=0x80 => ctx.emit(b as u32);
        // single-byte half-width katakana
        case b @ 0xa1..=0xdf => ctx.emit(0xff61 + b as u32 - 0xa1);
        case b @ 0x81..=0x9f, b @ 0xe0..=0xfc => S1(ctx, b);
        case _ => ctx.err("invalid sequence");
    }

transient:
    // shift_jis lead != 0x00
    state S1(ctx: Context, lead: u8) {
        case b => match map_two_0208_bytes(lead, b) {
            // The trail byte is always backed up, even when it lies in the valid trail range.
            0xffff => ctx.backup_and_err(1, "invalid sequence"), // unconditional
            ch => ctx.emit(ch)
        };
    }
}
679
// Tests and benchmarks for the Windows-31J encoder and decoder.
//
// NOTE(review): the assertion macros are defined elsewhere. As used here, their arguments
// appear to be `assert_feed_ok!(coder, processed, unprocessed, output)` and
// `assert_feed_err!(coder, processed, problem, remaining, output)` -- confirm against
// the macro definitions.
#[cfg(test)]
mod windows31j_tests {
    extern crate test;
    use super::Windows31JEncoding;
    use crate::testutils;
    use crate::types::*;

    // ASCII, the yen/overline remapping, JIS X 0208 and single-byte katakana all encode.
    #[test]
    fn test_encoder_valid() {
        let mut e = Windows31JEncoding.raw_encoder();
        assert_feed_ok!(e, "A", "", [0x41]);
        assert_feed_ok!(e, "BC", "", [0x42, 0x43]);
        assert_feed_ok!(e, "", "", []);
        assert_feed_ok!(e, "\u{a5}", "", [0x5c]);
        assert_feed_ok!(e, "\u{203e}", "", [0x7e]);
        assert_feed_ok!(
            e,
            "\u{306b}\u{307b}\u{3093}",
            "",
            [0x82, 0xc9, 0x82, 0xd9, 0x82, 0xf1]
        );
        assert_feed_ok!(e, "\u{ff86}\u{ff8e}\u{ff9d}", "", [0xc6, 0xce, 0xdd]);
        assert_feed_ok!(e, "\u{65e5}\u{672c}", "", [0x93, 0xfa, 0x96, 0x7b]);
        assert_finish_ok!(e, []);
    }

    // Private Use Area code points are decodable (see `test_decoder_eudc`) but never encoded.
    #[test]
    fn test_encoder_no_eudc() {
        let mut e = Windows31JEncoding.raw_encoder();
        assert_feed_err!(e, "", "\u{e000}", "", []);
        assert_feed_err!(e, "", "\u{e757}", "", []);
        assert_feed_err!(e, "", "\u{e758}", "", []);
        assert_finish_ok!(e, []);
    }

    #[test]
    fn test_encoder_double_mapped() {
        // these characters are double-mapped to both EUDC area and Shift_JIS extension area
        // but only the latter should be used. (note that U+FFE2 is triple-mapped!)
        let mut e = Windows31JEncoding.raw_encoder();
        assert_feed_ok!(
            e,
            "\u{9ed1}\u{2170}\u{ffe2}",
            "",
            [0xfc, 0x4b, 0xfa, 0x40, 0x81, 0xca]
        );
        assert_finish_ok!(e, []);
    }

    // Unrepresentable characters stop the encoder at the offending character.
    #[test]
    fn test_encoder_invalid() {
        let mut e = Windows31JEncoding.raw_encoder();
        assert_feed_err!(e, "", "\u{ffff}", "", []);
        assert_feed_err!(e, "?", "\u{ffff}", "!", [0x3f]);
        assert_feed_err!(e, "", "\u{736c}", "\u{8c78}", []);
        assert_finish_ok!(e, []);
    }

    // Whole sequences fed at once.
    #[test]
    fn test_decoder_valid() {
        let mut d = Windows31JEncoding.raw_decoder();
        assert_feed_ok!(d, [0x41], [], "A");
        assert_feed_ok!(d, [0x42, 0x43], [], "BC");
        assert_feed_ok!(d, [], [], "");
        assert_feed_ok!(d, [0x5c], [], "\\");
        assert_feed_ok!(d, [0x7e], [], "~");
        assert_feed_ok!(d, [0x80], [], "\u{80}"); // compatibility
        assert_feed_ok!(
            d,
            [0x82, 0xc9, 0x82, 0xd9, 0x82, 0xf1],
            [],
            "\u{306b}\u{307b}\u{3093}"
        );
        assert_feed_ok!(d, [0xc6, 0xce, 0xdd], [], "\u{ff86}\u{ff8e}\u{ff9d}");
        assert_feed_ok!(d, [0x93, 0xfa, 0x96, 0x7b], [], "\u{65e5}\u{672c}");
        assert_finish_ok!(d, "");
    }

    // Leads `F0-F9` decode into the Private Use Area, U+E000 through U+E757.
    #[test]
    fn test_decoder_eudc() {
        let mut d = Windows31JEncoding.raw_decoder();
        assert_feed_ok!(d, [], [0xf0], "");
        assert_feed_ok!(d, [0x40], [], "\u{e000}");
        assert_feed_ok!(d, [0xf9, 0xfc], [], "\u{e757}");
        assert_feed_err!(d, [], [0xf0], [0x00], "");
        assert_feed_err!(d, [], [0xf0], [0xff], "");
        assert_finish_ok!(d, "");
    }

    // A lone lead byte is only an error once the stream finishes;
    // bytes that can never start a sequence fail immediately.
    #[test]
    fn test_decoder_invalid_lone_lead_immediate_test_finish() {
        for i in 0x81..0xa0 {
            let mut d = Windows31JEncoding.raw_decoder();
            assert_feed_ok!(d, [], [i], ""); // wait for a trail
            assert_finish_err!(d, "");
        }

        for i in 0xe0..0xfd {
            let mut d = Windows31JEncoding.raw_decoder();
            assert_feed_ok!(d, [], [i], ""); // wait for a trail
            assert_finish_err!(d, "");
        }

        // A0/FD/FE/FF: immediate failure
        let mut d = Windows31JEncoding.raw_decoder();
        assert_feed_err!(d, [], [0xa0], [], "");
        assert_feed_err!(d, [], [0xfd], [], "");
        assert_feed_err!(d, [], [0xfe], [], "");
        assert_feed_err!(d, [], [0xff], [], "");
        assert_finish_ok!(d, "");
    }

    // An ASCII byte after a lead must stay unconsumed so it can be decoded next.
    #[test]
    fn test_decoder_invalid_lone_lead_followed_by_space() {
        for i in 0x81..0xa0 {
            let mut d = Windows31JEncoding.raw_decoder();
            assert_feed_err!(d, [], [i], [0x20], "");
            assert_finish_ok!(d, "");
        }

        for i in 0xe0..0xfd {
            let mut d = Windows31JEncoding.raw_decoder();
            assert_feed_err!(d, [], [i], [0x20], "");
            assert_finish_ok!(d, "");
        }
    }

    // Bytes just outside the trail ranges (3F, 7F, FD-FF) are rejected and left unconsumed.
    #[test]
    fn test_decoder_invalid_lead_followed_by_invalid_trail() {
        for i in 0x81..0xa0 {
            let mut d = Windows31JEncoding.raw_decoder();
            assert_feed_err!(d, [], [i], [0x3f], "");
            assert_feed_err!(d, [], [i], [0x7f], "");
            assert_feed_err!(d, [], [i], [0xfd], "");
            assert_feed_err!(d, [], [i], [0xfe], "");
            assert_feed_err!(d, [], [i], [0xff], "");
            assert_finish_ok!(d, "");
        }

        for i in 0xe0..0xfd {
            let mut d = Windows31JEncoding.raw_decoder();
            assert_feed_err!(d, [], [i], [0x3f], "");
            assert_feed_err!(d, [], [i], [0x7f], "");
            assert_feed_err!(d, [], [i], [0xfd], "");
            assert_feed_err!(d, [], [i], [0xfe], "");
            assert_feed_err!(d, [], [i], [0xff], "");
            assert_finish_ok!(d, "");
        }
    }

    // Same as above, with the lead fed separately from the invalid trail.
    #[test]
    fn test_decoder_invalid_lead_followed_by_invalid_trail_partial() {
        for i in 0x81..0xa0 {
            let mut d = Windows31JEncoding.raw_decoder();
            assert_feed_ok!(d, [], [i], "");
            assert_feed_err!(d, [], [], [0xff], "");
            assert_finish_ok!(d, "");
        }

        for i in 0xe0..0xfd {
            let mut d = Windows31JEncoding.raw_decoder();
            assert_feed_ok!(d, [], [i], "");
            assert_feed_err!(d, [], [], [0xff], "");
            assert_finish_ok!(d, "");
        }
    }

    // A failed finish must reset the decoder so that it can be reused.
    #[test]
    fn test_decoder_feed_after_finish() {
        let mut d = Windows31JEncoding.raw_decoder();
        assert_feed_ok!(d, [0x82, 0xa0], [0x82], "\u{3042}");
        assert_finish_err!(d, "");
        assert_feed_ok!(d, [0x82, 0xa0], [], "\u{3042}");
        assert_finish_ok!(d, "");
    }

    // Throughput of encoding the shared Japanese sample text.
    #[bench]
    fn bench_encode_short_text(bencher: &mut test::Bencher) {
        let s = testutils::JAPANESE_TEXT;
        bencher.bytes = s.len() as u64;
        bencher.iter(|| test::black_box(Windows31JEncoding.encode(s, EncoderTrap::Strict)))
    }

    // Throughput of decoding the same text after encoding it to Windows-31J once.
    #[bench]
    fn bench_decode_short_text(bencher: &mut test::Bencher) {
        let s = Windows31JEncoding
            .encode(testutils::JAPANESE_TEXT, EncoderTrap::Strict)
            .ok()
            .unwrap();
        bencher.bytes = s.len() as u64;
        bencher.iter(|| test::black_box(Windows31JEncoding.decode(&s, DecoderTrap::Strict)))
    }
}
873
/**
 * ISO-2022-JP.
 *
 * This version of ISO-2022-JP does not correspond to any standardized repertoire of character sets
 * due to widespread implementation differences. The following character sets are supported:
 *
 * - JIS X 0201-1976 roman (`ESC ( J` or `ESC ( B`; the latter is originally allocated to ASCII
 *   but willfully violated)
 * - JIS X 0201-1976 kana (`ESC ( I`)
 * - JIS X 0208-1983 (`ESC $ B` or `ESC $ @`; the latter is originally allocated to JIS X 0208-1978
 *   but willfully violated)
 * - JIS X 0212-1990 (`ESC $ ( D`, XXX asymmetric support)
 */
#[derive(Clone, Copy)]
pub struct ISO2022JPEncoding;
889
890impl Encoding for ISO2022JPEncoding {
891    fn name(&self) -> &'static str {
892        "iso-2022-jp"
893    }
894    fn whatwg_name(&self) -> Option<&'static str> {
895        Some("iso-2022-jp")
896    }
897    fn raw_encoder(&self) -> Box<dyn RawEncoder> {
898        ISO2022JPEncoder::new()
899    }
900    fn raw_decoder(&self) -> Box<dyn RawDecoder> {
901        ISO2022JPDecoder::new()
902    }
903}
904
// The character set currently selected in the ISO-2022-JP encoder's output.
// The variant names are acronyms and character-set names, hence the lint exemption.
#[allow(clippy::upper_case_acronyms)]
#[derive(PartialEq, Clone, Copy)]
enum ISO2022JPState {
    ASCII,    // U+0000..007F, U+00A5, U+203E
    Katakana, // JIS X 0201: U+FF61..FF9F
    Lead,     // JIS X 0208
}
912
913/// An encoder for ISO-2022-JP without JIS X 0212/0213 support.
914#[derive(Clone, Copy)]
915pub struct ISO2022JPEncoder {
916    st: ISO2022JPState,
917}
918
919impl ISO2022JPEncoder {
920    #[allow(clippy::new_ret_no_self)]
921    pub fn new() -> Box<dyn RawEncoder> {
922        Box::new(ISO2022JPEncoder { st: ASCII })
923    }
924}
925
926impl RawEncoder for ISO2022JPEncoder {
927    fn from_self(&self) -> Box<dyn RawEncoder> {
928        ISO2022JPEncoder::new()
929    }
930    fn is_ascii_compatible(&self) -> bool {
931        true
932    }
933
934    fn raw_feed(
935        &mut self,
936        input: &str,
937        output: &mut dyn ByteWriter,
938    ) -> (usize, Option<CodecError>) {
939        output.writer_hint(input.len());
940
941        let mut st = self.st;
942        macro_rules! ensure_ASCII(
943            () => (if st != ASCII { output.write_bytes(b"\x1b(B"); st = ASCII; })
944        );
945        macro_rules! ensure_Katakana(
946            () => (if st != Katakana { output.write_bytes(b"\x1b(I"); st = Katakana; })
947        );
948        macro_rules! ensure_Lead(
949            () => (if st != Lead { output.write_bytes(b"\x1b$B"); st = Lead; })
950        );
951
952        for ((i, j), ch) in input.index_iter() {
953            match ch {
954                '\u{0}'..='\u{7f}' => {
955                    ensure_ASCII!();
956                    output.write_byte(ch as u8);
957                }
958                '\u{a5}' => {
959                    ensure_ASCII!();
960                    output.write_byte(0x5c);
961                }
962                '\u{203e}' => {
963                    ensure_ASCII!();
964                    output.write_byte(0x7e);
965                }
966                '\u{ff61}'..='\u{ff9f}' => {
967                    ensure_Katakana!();
968                    output.write_byte((ch as usize - 0xff61 + 0x21) as u8);
969                }
970                _ => {
971                    let ptr = index::jis0208::backward(ch as u32);
972                    if ptr == 0xffff {
973                        self.st = st; // do NOT reset the state!
974                        return (
975                            i,
976                            Some(CodecError {
977                                upto: j as isize,
978                                cause: "unrepresentable character".into(),
979                            }),
980                        );
981                    } else {
982                        ensure_Lead!();
983                        let lead = ptr / 94 + 0x21;
984                        let trail = ptr % 94 + 0x21;
985                        output.write_byte(lead as u8);
986                        output.write_byte(trail as u8);
987                    }
988                }
989            }
990        }
991
992        self.st = st;
993        (input.len(), None)
994    }
995
996    fn raw_finish(&mut self, _output: &mut dyn ByteWriter) -> Option<CodecError> {
997        None
998    }
999}
1000
1001/// A decoder for ISO-2022-JP with JIS X 0212 support.
1002#[derive(Clone, Copy)]
1003struct ISO2022JPDecoder {
1004    st: iso2022jp::State,
1005}
1006
1007impl ISO2022JPDecoder {
1008    #[allow(clippy::new_ret_no_self)]
1009    pub fn new() -> Box<dyn RawDecoder> {
1010        Box::new(ISO2022JPDecoder {
1011            st: Default::default(),
1012        })
1013    }
1014}
1015
1016impl RawDecoder for ISO2022JPDecoder {
1017    fn from_self(&self) -> Box<dyn RawDecoder> {
1018        ISO2022JPDecoder::new()
1019    }
1020    fn is_ascii_compatible(&self) -> bool {
1021        false
1022    }
1023
1024    fn raw_feed(
1025        &mut self,
1026        input: &[u8],
1027        output: &mut dyn StringWriter,
1028    ) -> (usize, Option<CodecError>) {
1029        let (st, processed, err) = iso2022jp::raw_feed(self.st, input, output, &());
1030        self.st = st;
1031        (processed, err)
1032    }
1033
1034    fn raw_finish(&mut self, output: &mut dyn StringWriter) -> Option<CodecError> {
1035        let (st, err) = iso2022jp::raw_finish(self.st, output, &());
1036        self.st = st;
1037        err
1038    }
1039}
1040
1041stateful_decoder! {
1042    module iso2022jp;
1043
1044    internal pub fn map_two_0208_bytes(lead: u8, trail: u8) -> u32 {
1045        use crate::index_japanese as index;
1046
1047        let lead = lead as u16;
1048        let trail = trail as u16;
1049        let index = match (lead, trail) {
1050            (0x21..=0x7e, 0x21..=0x7e) => (lead - 0x21) * 94 + trail - 0x21,
1051            _ => 0xffff,
1052        };
1053        index::jis0208::forward(index)
1054    }
1055
1056    internal pub fn map_two_0212_bytes(lead: u8, trail: u8) -> u32 {
1057        use crate::index_japanese as index;
1058
1059        let lead = lead as u16;
1060        let trail = trail as u16;
1061        let index = match (lead, trail) {
1062            (0x21..=0x7e, 0x21..=0x7e) => (lead - 0x21) * 94 + trail - 0x21,
1063            _ => 0xffff,
1064        };
1065        index::jis0212::forward(index)
1066    }
1067
1068initial:
1069    // iso-2022-jp state = ASCII, iso-2022-jp jis0212 flag = unset, iso-2022-jp lead = 0x00
1070    state ASCII(ctx: Context) {
1071        case 0x1b => EscapeStart(ctx);
1072        case b @ 0x00..=0x7f => ctx.emit(b as u32), ASCII(ctx);
1073        case _ => ctx.err("invalid sequence"), ASCII(ctx);
1074        final => ctx.reset();
1075    }
1076
1077checkpoint:
1078    // iso-2022-jp state = Lead, iso-2022-jp jis0212 flag = unset
1079    state Lead0208(ctx: Context) {
1080        case 0x0a => ctx.emit(0x000a); // return to ASCII
1081        case 0x1b => EscapeStart(ctx);
1082        case b => Trail0208(ctx, b);
1083        final => ctx.reset();
1084    }
1085
1086    // iso-2022-jp state = Lead, iso-2022-jp jis0212 flag = set
1087    state Lead0212(ctx: Context) {
1088        case 0x0a => ctx.emit(0x000a); // return to ASCII
1089        case 0x1b => EscapeStart(ctx);
1090        case b => Trail0212(ctx, b);
1091        final => ctx.reset();
1092    }
1093
1094    // iso-2022-jp state = Katakana
1095    state Katakana(ctx: Context) {
1096        case 0x1b => EscapeStart(ctx);
1097        case b @ 0x21..=0x5f => ctx.emit(0xff61 + b as u32 - 0x21), Katakana(ctx);
1098        case _ => ctx.err("invalid sequence"), Katakana(ctx);
1099        final => ctx.reset();
1100    }
1101
1102transient:
1103    // iso-2022-jp state = EscapeStart
1104    // ESC
1105    state EscapeStart(ctx: Context) {
1106        case 0x24 => EscapeMiddle24(ctx); // ESC $
1107        case 0x28 => EscapeMiddle28(ctx); // ESC (
1108        case _ => ctx.backup_and_err(1, "invalid sequence");
1109        final => ctx.err("incomplete sequence");
1110    }
1111
1112    // iso-2022-jp state = EscapeMiddle, iso-2022-jp lead = 0x24
1113    // ESC $
1114    state EscapeMiddle24(ctx: Context) {
1115        case 0x40, 0x42 => Lead0208(ctx); // ESC $ @ (JIS X 0208-1978) or ESC $ B (-1983)
1116        case 0x28 => EscapeFinal(ctx); // ESC $ (
1117        case _ => ctx.backup_and_err(2, "invalid sequence");
1118        final => ctx.err("incomplete sequence");
1119    }
1120
1121    // iso-2022-jp state = EscapeMiddle, iso-2022-jp lead = 0x28
1122    // ESC (
1123    state EscapeMiddle28(ctx: Context) {
1124        case 0x42, 0x4a => ctx.reset(); // ESC ( B (ASCII) or ESC ( J (JIS X 0201-1976 roman)
1125        case 0x49 => Katakana(ctx); // ESC ( I (JIS X 0201-1976 kana)
1126        case _ => ctx.backup_and_err(2, "invalid sequence");
1127        final => ctx.err("incomplete sequence");
1128    }
1129
1130    // iso-2022-jp state = EscapeFinal
1131    // ESC $ (
1132    state EscapeFinal(ctx: Context) {
1133        case 0x44 => Lead0212(ctx); // ESC $ ( D (JIS X 0212-1990)
1134        case _ => ctx.backup_and_err(3, "invalid sequence");
1135        final => ctx.backup_and_err(1, "incomplete sequence");
1136    }
1137
1138    // iso-2022-jp state = Trail, iso-2022-jp jis0212 flag = unset
1139    state Trail0208(ctx: Context, lead: u8) {
1140        case b =>
1141            match map_two_0208_bytes(lead, b) {
1142                0xffff => ctx.err("invalid sequence"),
1143                ch => ctx.emit(ch as u32)
1144            },
1145            Lead0208(ctx);
1146        final => ctx.err("incomplete sequence");
1147    }
1148
1149    // iso-2022-jp state = Trail, iso-2022-jp jis0212 flag = set
1150    state Trail0212(ctx: Context, lead: u8) {
1151        case b =>
1152            match map_two_0212_bytes(lead, b) {
1153                0xffff => ctx.err("invalid sequence"),
1154                ch => ctx.emit(ch as u32)
1155            },
1156            Lead0212(ctx);
1157        final => ctx.err("incomplete sequence");
1158    }
1159}
1160
/// Tests and benchmarks for the ISO-2022-JP encoder and decoder.
#[cfg(test)]
mod iso2022jp_tests {
    extern crate test;
    use super::ISO2022JPEncoding;
    use crate::testutils;
    use crate::types::*;

    /// Encodable input produces the expected bytes, with escapes only on state changes.
    #[test]
    fn test_encoder_valid() {
        let mut e = ISO2022JPEncoding.raw_encoder();
        assert_feed_ok!(e, "A", "", [0x41]);
        assert_feed_ok!(e, "BC", "", [0x42, 0x43]);
        assert_feed_ok!(e, "\x1b\x24\x42", "", [0x1b, 0x24, 0x42]); // no round-trip guarantee
        assert_feed_ok!(e, "", "", []);
        assert_feed_ok!(e, "\u{a5}", "", [0x5c]);
        assert_feed_ok!(e, "\u{203e}", "", [0x7e]);
        assert_feed_ok!(
            e,
            "\u{306b}\u{307b}\u{3093}",
            "",
            [0x1b, 0x24, 0x42, 0x24, 0x4b, 0x24, 0x5b, 0x24, 0x73]
        );
        assert_feed_ok!(e, "\u{65e5}\u{672c}", "", [0x46, 0x7c, 0x4b, 0x5c]);
        assert_feed_ok!(
            e,
            "\u{ff86}\u{ff8e}\u{ff9d}",
            "",
            [0x1b, 0x28, 0x49, 0x46, 0x4e, 0x5d]
        );
        assert_feed_ok!(e, "XYZ", "", [0x1b, 0x28, 0x42, 0x58, 0x59, 0x5a]);
        assert_finish_ok!(e, []);

        // one ASCII character and two similarly looking characters:
        // - A: U+0020 SPACE (requires ASCII state)
        // - B: U+30CD KATAKANA LETTER NE (requires JIS X 0208 Lead state)
        // - C: U+FF88 HALFWIDTH KATAKANA LETTER NE (requires Katakana state)
        // - D is omitted as the encoder does not support JIS X 0212.
        // a (3,2) De Bruijn near-sequence "ABCACBA" is used to test all possible cases.
        const AD: &str = "\x20";
        const BD: &str = "\u{30cd}";
        const CD: &str = "\u{ff88}";
        const AE: &[u8] = &[0x1b, 0x28, 0x42, 0x20];
        const BE: &[u8] = &[0x1b, 0x24, 0x42, 0x25, 0x4d];
        const CE: &[u8] = &[0x1b, 0x28, 0x49, 0x48];
        let mut e = ISO2022JPEncoding.raw_encoder();
        // the very first character needs no escape: the encoder starts in the ASCII state.
        let decoded: String = ["\x20", BD, CD, AD, CD, BD, AD].concat();
        let encoded: Vec<_> = [&[0x20][..], BE, CE, AE, CE, BE, AE].concat();
        assert_feed_ok!(e, decoded, "", encoded);
        assert_finish_ok!(e, []);
    }

    /// Characters without a mapping are reported as errors and produce no output.
    #[test]
    fn test_encoder_invalid() {
        let mut e = ISO2022JPEncoding.raw_encoder();
        assert_feed_err!(e, "", "\u{ffff}", "", []);
        assert_feed_err!(e, "?", "\u{ffff}", "!", [0x3f]);
        // JIS X 0212 is not supported in the encoder
        assert_feed_err!(e, "", "\u{736c}", "\u{8c78}", []);
        assert_finish_ok!(e, []);
    }

    /// Well-formed input in every supported character set decodes as expected.
    #[test]
    fn test_decoder_valid() {
        let mut d = ISO2022JPEncoding.raw_decoder();
        assert_feed_ok!(d, [0x41], [], "A");
        assert_feed_ok!(d, [0x42, 0x43], [], "BC");
        assert_feed_ok!(d, [0x1b, 0x28, 0x4a, 0x44, 0x45, 0x46], [], "DEF");
        assert_feed_ok!(d, [], [], "");
        assert_feed_ok!(d, [0x5c], [], "\\");
        assert_feed_ok!(d, [0x7e], [], "~");
        assert_feed_ok!(
            d,
            [0x1b, 0x24, 0x42, 0x24, 0x4b, 0x1b, 0x24, 0x42, 0x24, 0x5b, 0x24, 0x73],
            [],
            "\u{306b}\u{307b}\u{3093}"
        );
        assert_feed_ok!(d, [0x46, 0x7c, 0x4b, 0x5c], [], "\u{65e5}\u{672c}");
        assert_feed_ok!(
            d,
            [0x1b, 0x28, 0x49, 0x46, 0x4e, 0x5d],
            [],
            "\u{ff86}\u{ff8e}\u{ff9d}"
        );
        assert_feed_ok!(
            d,
            [0x1b, 0x24, 0x28, 0x44, 0x4b, 0x46, 0x1b, 0x24, 0x40, 0x6c, 0x38],
            [],
            "\u{736c}\u{8c78}"
        );
        assert_feed_ok!(d, [0x1b, 0x28, 0x42, 0x58, 0x59, 0x5a], [], "XYZ");
        assert_finish_ok!(d, "");

        let mut d = ISO2022JPEncoding.raw_decoder();
        assert_feed_ok!(
            d,
            [0x1b, 0x24, 0x42, 0x24, 0x4b, 0x24, 0x5b, 0x24, 0x73],
            [],
            "\u{306b}\u{307b}\u{3093}"
        );
        assert_finish_ok!(d, "");

        let mut d = ISO2022JPEncoding.raw_decoder();
        assert_feed_ok!(
            d,
            [0x1b, 0x28, 0x49, 0x46, 0x4e, 0x5d],
            [],
            "\u{ff86}\u{ff8e}\u{ff9d}"
        );
        assert_finish_ok!(d, "");

        let mut d = ISO2022JPEncoding.raw_decoder();
        assert_feed_ok!(d, [0x1b, 0x24, 0x28, 0x44, 0x4b, 0x46], [], "\u{736c}");
        assert_finish_ok!(d, "");

        // one ASCII character and three similarly looking characters:
        // - A: U+0020 SPACE (requires ASCII state)
        // - B: U+30CD KATAKANA LETTER NE (requires JIS X 0208 Lead state)
        // - C: U+FF88 HALFWIDTH KATAKANA LETTER NE (requires Katakana state)
        // - D: U+793B CJK UNIFIED IDEOGRAPH-793B (requires JIS X 0212 Lead state)
        // a (4,2) De Bruijn sequence "AABBCCACBADDBDCDA" is used to test all possible cases.
        const AD: &str = "\x20";
        const BD: &str = "\u{30cd}";
        const CD: &str = "\u{ff88}";
        const DD: &str = "\u{793b}";
        const AE: &[u8] = &[0x1b, 0x28, 0x42, 0x20];
        const BE: &[u8] = &[0x1b, 0x24, 0x42, 0x25, 0x4d];
        const CE: &[u8] = &[0x1b, 0x28, 0x49, 0x48];
        const DE: &[u8] = &[0x1b, 0x24, 0x28, 0x44, 0x50, 0x4b];
        let mut d = ISO2022JPEncoding.raw_decoder();
        let dec: String = [
            "\x20", AD, BD, BD, CD, CD, AD, CD, BD, AD, DD, DD, BD, DD, CD, DD, AD,
        ]
        .concat();
        let enc: Vec<_> = [
            &[0x20][..],
            AE,
            BE,
            BE,
            CE,
            CE,
            AE,
            CE,
            BE,
            AE,
            DE,
            DE,
            BE,
            DE,
            CE,
            DE,
            AE,
        ]
        .concat();
        assert_feed_ok!(d, enc, [], dec);
        assert_finish_ok!(d, "");
    }

    /// Sequences split across feeds at arbitrary points are carried over correctly.
    #[test]
    fn test_decoder_valid_partial() {
        let mut d = ISO2022JPEncoding.raw_decoder();

        assert_feed_ok!(d, [], [0x1b], "");
        assert_feed_ok!(d, [], [0x28], "");
        assert_feed_ok!(d, [0x4a, 0x41], [], "A");
        assert_feed_ok!(d, [], [0x1b, 0x28], "");
        assert_feed_ok!(d, [0x4a, 0x42], [0x1b], "B");
        assert_feed_ok!(d, [0x28, 0x4a, 0x43], [], "C");

        assert_feed_ok!(d, [], [0x1b], "");
        assert_feed_ok!(d, [], [0x24], "");
        assert_feed_ok!(d, [0x42], [0x24], "");
        assert_feed_ok!(d, [0x4b], [0x1b, 0x24], "\u{306b}");
        assert_feed_ok!(d, [0x42, 0x24, 0x5b], [], "\u{307b}");
        assert_feed_ok!(d, [], [0x1b], "");
        assert_feed_ok!(d, [0x24, 0x42, 0x24, 0x73], [], "\u{3093}");

        assert_feed_ok!(d, [], [0x1b], "");
        assert_feed_ok!(d, [], [0x28], "");
        assert_feed_ok!(d, [0x49, 0x46], [], "\u{ff86}");
        assert_feed_ok!(d, [], [0x1b, 0x28], "");
        assert_feed_ok!(d, [0x49, 0x4e], [0x1b], "\u{ff8e}");
        assert_feed_ok!(d, [0x28, 0x49, 0x5d], [], "\u{ff9d}");

        assert_feed_ok!(d, [], [0x1b, 0x24], "");
        assert_feed_ok!(d, [], [0x28], "");
        assert_feed_ok!(d, [0x44], [0x4b], "");
        assert_feed_ok!(d, [0x46], [0x1b, 0x24, 0x28], "\u{736c}");
        assert_feed_ok!(d, [0x44, 0x4b, 0x46], [], "\u{736c}");

        assert_finish_ok!(d, "");
    }

    /// A line feed (0x0a) is only accepted where a lead byte is expected.
    #[test]
    fn test_decoder_line_feed() {
        // LF in Lead state "resets to ASCII"
        let mut d = ISO2022JPEncoding.raw_decoder();
        assert_feed_ok!(
            d,
            [0x1b, 0x24, 0x42, 0x25, 0x4d, 0x0a, 0x25, 0x4d],
            [],
            "\u{30cd}\n\x25\x4d"
        );
        assert_feed_ok!(
            d,
            [0x1b, 0x24, 0x28, 0x44, 0x50, 0x4b, 0x0a, 0x50, 0x4b],
            [],
            "\u{793b}\n\x50\x4b"
        );
        assert_finish_ok!(d, "");

        // other states don't allow LF
        let mut d = ISO2022JPEncoding.raw_decoder();
        assert_feed_err!(d, [0x1b, 0x28, 0x49, 0x48], [0x0a], [], "\u{ff88}"); // Katakana
        assert_feed_err!(d, [0x1b, 0x24, 0x42], [0x25, 0x0a], [], ""); // Trail
        assert_finish_ok!(d, "");
    }

    /// Finishing while a lead byte is still pending is an error.
    #[test]
    fn test_decoder_invalid_partial() {
        let mut d = ISO2022JPEncoding.raw_decoder();
        assert_feed_ok!(d, [0x1b, 0x24, 0x42, 0x24, 0x4b], [0x24], "\u{306b}");
        assert_finish_err!(d, "");

        let mut d = ISO2022JPEncoding.raw_decoder();
        assert_feed_ok!(d, [0x1b, 0x24, 0x28, 0x44, 0x4b, 0x46], [0x50], "\u{736c}");
        assert_finish_err!(d, "");
    }

    /// Finishing in the middle of an escape sequence is an error.
    #[test]
    fn test_decoder_invalid_partial_escape() {
        let mut d = ISO2022JPEncoding.raw_decoder();
        assert_feed_ok!(d, [], [0x1b], "");
        assert_finish_err!(d, "");

        let mut d = ISO2022JPEncoding.raw_decoder();
        assert_feed_ok!(d, [], [0x1b, 0x24], "");
        assert_finish_err!(d, ""); // no backup

        let mut d = ISO2022JPEncoding.raw_decoder();
        assert_feed_ok!(d, [], [0x1b, 0x24, 0x28], "");
        assert_finish_err!(d, -1, ""); // backup of -1, not -2

        let mut d = ISO2022JPEncoding.raw_decoder();
        assert_feed_ok!(d, [], [0x1b, 0x28], "");
        assert_finish_err!(d, ""); // no backup

        assert_eq!(
            ISO2022JPEncoding.decode(&[0x1b], DecoderTrap::Replace),
            Ok("\u{fffd}".to_string())
        );
        assert_eq!(
            ISO2022JPEncoding.decode(&[0x1b, 0x24], DecoderTrap::Replace),
            Ok("\u{fffd}".to_string())
        );
        assert_eq!(
            ISO2022JPEncoding.decode(&[0x1b, 0x24, 0x28], DecoderTrap::Replace),
            Ok("\u{fffd}\x28".to_string())
        );
        assert_eq!(
            ISO2022JPEncoding.decode(&[0x1b, 0x28], DecoderTrap::Replace),
            Ok("\u{fffd}".to_string())
        );
    }

    /// Unsupported escape sequences are rejected without consuming the bytes after ESC.
    #[test]
    fn test_decoder_invalid_escape() {
        // also tests allowed but never used escape codes in ISO 2022
        let mut d = ISO2022JPEncoding.raw_decoder();
        // feeds a known-good sequence, leaving the decoder in the JIS X 0208 state.
        macro_rules! reset(() => (
            assert_feed_ok!(d, [0x41, 0x42, 0x43, 0x1b, 0x24, 0x42, 0x21, 0x21], [],
                            "ABC\u{3000}")
        ));

        reset!();
        assert_feed_ok!(d, [], [0x1b], "");
        assert_feed_err!(d, [], [], [0x00], "");
        reset!();
        assert_feed_err!(d, [], [0x1b], [0x0a], "");
        reset!();
        assert_feed_err!(d, [], [0x1b], [0x20], "");
        reset!();
        assert_feed_err!(d, [], [0x1b], [0x21, 0x5a], ""); // ESC ! Z (CZD)
        reset!();
        assert_feed_err!(d, [], [0x1b], [0x22, 0x5a], ""); // ESC " Z (C1D)
        reset!();
        assert_feed_err!(d, [], [0x1b], [0x24, 0x5a], ""); // ESC $ Z (GZDM4)
        reset!();
        assert_feed_ok!(d, [], [0x1b, 0x24], "");
        assert_feed_err!(d, -1, [], [], [0x24, 0x5a], "");
        reset!();
        assert_feed_err!(d, [], [0x1b], [0x24, 0x28, 0x5a], ""); // ESC $ ( Z (GZDM4)
        reset!();
        assert_feed_ok!(d, [], [0x1b, 0x24, 0x28], "");
        assert_feed_err!(d, -2, [], [], [0x24, 0x28, 0x5a], "");
        reset!();
        assert_feed_err!(d, [], [0x1b], [0x24, 0x29, 0x5a], ""); // ESC $ ) Z (G1DM4)
        reset!();
        assert_feed_err!(d, [], [0x1b], [0x24, 0x2a, 0x5a], ""); // ESC $ * Z (G2DM4)
        reset!();
        assert_feed_err!(d, [], [0x1b], [0x24, 0x2b, 0x5a], ""); // ESC $ + Z (G3DM4)
        reset!();
        assert_feed_err!(d, [], [0x1b], [0x24, 0x2d, 0x5a], ""); // ESC $ - Z (G1DM6)
        reset!();
        assert_feed_err!(d, [], [0x1b], [0x24, 0x2e, 0x5a], ""); // ESC $ . Z (G2DM6)
        reset!();
        assert_feed_err!(d, [], [0x1b], [0x24, 0x2f, 0x5a], ""); // ESC $ / Z (G3DM6)
        reset!();
        assert_feed_err!(d, [], [0x1b], [0x25, 0x5a], ""); // ESC % Z (DOCS)
        reset!();
        assert_feed_err!(d, [], [0x1b], [0x25, 0x2f, 0x5a], ""); // ESC % / Z (DOCS)
        reset!();
        assert_feed_err!(d, [], [0x1b], [0x28, 0x5a], ""); // ESC ( Z (GZD4)
        reset!();
        assert_feed_ok!(d, [], [0x1b, 0x28], "");
        assert_feed_err!(d, -1, [], [], [0x28, 0x5a], "");
        reset!();
        assert_feed_err!(d, [], [0x1b], [0x29, 0x5a], ""); // ESC ) Z (G1D4)
        reset!();
        assert_feed_err!(d, [], [0x1b], [0x2a, 0x5a], ""); // ESC * Z (G2D4)
        reset!();
        assert_feed_err!(d, [], [0x1b], [0x2b, 0x5a], ""); // ESC + Z (G3D4)
        reset!();
        assert_feed_err!(d, [], [0x1b], [0x2d, 0x5a], ""); // ESC - Z (G1D6)
        reset!();
        assert_feed_err!(d, [], [0x1b], [0x2e, 0x5a], ""); // ESC . Z (G2D6)
        reset!();
        assert_feed_err!(d, [], [0x1b], [0x2f, 0x5a], ""); // ESC / Z (G3D6)
        reset!();
        assert_feed_err!(d, [], [0x1b], [0x4e], ""); // ESC N (SS2)
        reset!();
        assert_feed_err!(d, [], [0x1b], [0x4f], ""); // ESC O (SS3)
        reset!();
        assert_feed_err!(d, [], [0x1b], [0x6e], ""); // ESC n (LS2)
        reset!();
        assert_feed_err!(d, [], [0x1b], [0x6f], ""); // ESC o (LS3)
        reset!();
        assert_feed_err!(d, [], [0x1b], [0x7c], ""); // ESC | (LS3R)
        reset!();
        assert_feed_err!(d, [], [0x1b], [0x7d], ""); // ESC } (LS2R)
        reset!();
        assert_feed_err!(d, [], [0x1b], [0x7e], ""); // ESC ~ (LS1R)
        reset!();
        assert_feed_err!(d, [], [0x1b], [0xff], "");
        reset!();
        assert_finish_ok!(d, "");
    }

    /// Bytes outside the valid range of the selected character set are rejected.
    #[test]
    fn test_decoder_invalid_out_of_range() {
        let mut d = ISO2022JPEncoding.raw_decoder();
        assert_feed_err!(d, [], [0x80], [], "");
        assert_feed_err!(d, [], [0xff], [], "");
        assert_feed_err!(d, [0x1b, 0x24, 0x42], [0x80, 0x21], [], "");
        assert_feed_err!(d, [0x1b, 0x24, 0x42], [0x21, 0x80], [], "");
        assert_feed_err!(d, [0x1b, 0x24, 0x42], [0x20, 0x21], [], "");
        assert_feed_err!(d, [0x1b, 0x24, 0x42], [0x21, 0x20], [], "");
        assert_feed_err!(d, [0x1b, 0x28, 0x49], [0x20], [], "");
        assert_feed_err!(d, [0x1b, 0x28, 0x49], [0x60], [], "");
        assert_feed_err!(d, [0x1b, 0x24, 0x28, 0x44], [0x80, 0x21], [], "");
        assert_feed_err!(d, [0x1b, 0x24, 0x28, 0x44], [0x21, 0x80], [], "");
        assert_feed_err!(d, [0x1b, 0x24, 0x28, 0x44], [0x20, 0x21], [], "");
        assert_feed_err!(d, [0x1b, 0x24, 0x28, 0x44], [0x21, 0x20], [], "");
        assert_finish_ok!(d, "");
    }

    /// A failed finish returns the decoder to its initial state, so it can be fed again.
    #[test]
    fn test_decoder_feed_after_finish() {
        let mut d = ISO2022JPEncoding.raw_decoder();
        assert_feed_ok!(
            d,
            [0x24, 0x22, 0x1b, 0x24, 0x42, 0x24, 0x22],
            [0x24],
            "\x24\x22\u{3042}"
        );
        assert_finish_err!(d, "");
        assert_feed_ok!(
            d,
            [0x24, 0x22, 0x1b, 0x24, 0x42, 0x24, 0x22],
            [],
            "\x24\x22\u{3042}"
        );
        assert_finish_ok!(d, "");
    }

    /// Benchmarks strict-mode encoding of the shared Japanese sample text.
    #[bench]
    fn bench_encode_short_text(bencher: &mut test::Bencher) {
        let s = testutils::JAPANESE_TEXT;
        bencher.bytes = s.len() as u64;
        bencher.iter(|| test::black_box(ISO2022JPEncoding.encode(s, EncoderTrap::Strict)))
    }

    /// Benchmarks strict-mode decoding of the encoded Japanese sample text.
    #[bench]
    fn bench_decode_short_text(bencher: &mut test::Bencher) {
        let s = ISO2022JPEncoding
            .encode(testutils::JAPANESE_TEXT, EncoderTrap::Strict)
            .ok()
            .unwrap();
        bencher.bytes = s.len() as u64;
        bencher.iter(|| test::black_box(ISO2022JPEncoding.decode(&s, DecoderTrap::Strict)))
    }
}