encoding/codec/
korean.rs

1// This is a part of encoding-next.
2// Copyright (c) 2013-2015, Kang Seonghoon.
3// See README.md and LICENSE.txt for details.
4
5//! Legacy Korean encodings based on KS X 1001.
6
7use crate::index_korean as index;
8use crate::types::*;
9use crate::util::StrCharIndex;
10use std::convert::Into;
11use std::default::Default;
12
/// The Windows code page 949 encoding.
///
/// A Korean encoding derived from EUC-KR. It is so widespread that text labelled
/// as EUC-KR is, in most cases, actually in this encoding.
///
/// KS X 1001 (and hence EUC-KR) only contains a set of 2,350 common Hangul syllables.
/// This encoding assigns the remaining 8,822 Hangul syllables to two-byte sequences
/// of the form `[81-C6] [41-5A 61-7A 81-FE]`, i.e. sequences whose second byte
/// may have its MSB unset.
///
/// The design strongly resembles that of Shift_JIS, but it is less prone to errors
/// because the set of MSB-unset second bytes is much more limited than in Shift_JIS.
#[derive(Copy, Clone)]
pub struct Windows949Encoding;
26
27impl Encoding for Windows949Encoding {
28    fn name(&self) -> &'static str {
29        "windows-949"
30    }
31    fn whatwg_name(&self) -> Option<&'static str> {
32        Some("euc-kr")
33    } // WHATWG compatibility
34    fn raw_encoder(&self) -> Box<dyn RawEncoder> {
35        Windows949Encoder::new()
36    }
37    fn raw_decoder(&self) -> Box<dyn RawDecoder> {
38        Windows949Decoder::new()
39    }
40}
41
/// The encoder half of Windows code page 949.
///
/// This is a unit struct: it has no fields and therefore keeps no data between calls.
#[derive(Copy, Clone)]
pub struct Windows949Encoder;
45
46impl Windows949Encoder {
47    #[allow(clippy::new_ret_no_self)]
48    pub fn new() -> Box<dyn RawEncoder> {
49        Box::new(Windows949Encoder)
50    }
51}
52
53impl RawEncoder for Windows949Encoder {
54    fn from_self(&self) -> Box<dyn RawEncoder> {
55        Windows949Encoder::new()
56    }
57    fn is_ascii_compatible(&self) -> bool {
58        true
59    }
60
61    fn raw_feed(
62        &mut self,
63        input: &str,
64        output: &mut dyn ByteWriter,
65    ) -> (usize, Option<CodecError>) {
66        output.writer_hint(input.len());
67
68        for ((i, j), ch) in input.index_iter() {
69            if ch <= '\u{7f}' {
70                output.write_byte(ch as u8);
71            } else {
72                let ptr = index::euc_kr::backward(ch as u32);
73                if ptr == 0xffff {
74                    return (
75                        i,
76                        Some(CodecError {
77                            upto: j as isize,
78                            cause: "unrepresentable character".into(),
79                        }),
80                    );
81                } else {
82                    output.write_byte((ptr / 190 + 0x81) as u8);
83                    output.write_byte((ptr % 190 + 0x41) as u8);
84                }
85            }
86        }
87        (input.len(), None)
88    }
89
90    fn raw_finish(&mut self, _output: &mut dyn ByteWriter) -> Option<CodecError> {
91        None
92    }
93}
94
95/// A decoder for Windows code page 949.
96#[derive(Clone, Copy)]
97struct Windows949Decoder {
98    st: windows949::State,
99}
100
101impl Windows949Decoder {
102    #[allow(clippy::new_ret_no_self)]
103    pub fn new() -> Box<dyn RawDecoder> {
104        Box::new(Windows949Decoder {
105            st: Default::default(),
106        })
107    }
108}
109
110impl RawDecoder for Windows949Decoder {
111    fn from_self(&self) -> Box<dyn RawDecoder> {
112        Windows949Decoder::new()
113    }
114    fn is_ascii_compatible(&self) -> bool {
115        true
116    }
117
118    fn raw_feed(
119        &mut self,
120        input: &[u8],
121        output: &mut dyn StringWriter,
122    ) -> (usize, Option<CodecError>) {
123        let (st, processed, err) = windows949::raw_feed(self.st, input, output, &());
124        self.st = st;
125        (processed, err)
126    }
127
128    fn raw_finish(&mut self, output: &mut dyn StringWriter) -> Option<CodecError> {
129        let (st, err) = windows949::raw_finish(self.st, output, &());
130        self.st = st;
131        err
132    }
133}
134
135stateful_decoder! {
136    module windows949;
137
138    internal pub fn map_two_bytes(lead: u8, trail: u8) -> u32 {
139        use crate::index_korean as index;
140
141        let lead = lead as u16;
142        let trail = trail as u16;
143        let index = match (lead, trail) {
144            (0x81..=0xfe, 0x41..=0xfe) => (lead - 0x81) * 190 + (trail - 0x41),
145            (_, _) => 0xffff,
146        };
147        index::euc_kr::forward(index)
148    }
149
150initial:
151    // euc-kr lead = 0x00
152    state S0(ctx: Context) {
153        case b @ 0x00..=0x7f => ctx.emit(b as u32);
154        case b @ 0x81..=0xfe => S1(ctx, b);
155        case _ => ctx.err("invalid sequence");
156    }
157
158transient:
159    // euc-kr lead != 0x00
160    state S1(ctx: Context, lead: u8) {
161        case b => match map_two_bytes(lead, b) {
162            0xffff => {
163                let backup = if b < 0x80 {1} else {0};
164                ctx.backup_and_err(backup, "invalid sequence")
165            },
166            ch => ctx.emit(ch as u32)
167        };
168    }
169}
170
#[cfg(test)]
mod windows949_tests {
    extern crate test;
    use super::Windows949Encoding;
    use crate::testutils;
    use crate::types::*;

    // ASCII, KS X 1001 Hangul and extended Hangul all encode to the expected bytes.
    #[test]
    fn test_encoder_valid() {
        let mut enc = Windows949Encoding.raw_encoder();
        assert_feed_ok!(enc, "A", "", [0x41]);
        assert_feed_ok!(enc, "BC", "", [0x42, 0x43]);
        assert_feed_ok!(enc, "", "", []);
        assert_feed_ok!(enc, "\u{ac00}", "", [0xb0, 0xa1]);
        assert_feed_ok!(enc, "\u{b098}\u{b2e4}", "", [0xb3, 0xaa, 0xb4, 0xd9]);
        assert_feed_ok!(
            enc,
            "\u{bdc1}\u{314b}\u{d7a3}",
            "",
            [0x94, 0xee, 0xa4, 0xbb, 0xc6, 0x52]
        );
        assert_finish_ok!(enc, []);
    }

    // Characters without a mapping stop the encoder right before them.
    #[test]
    fn test_encoder_invalid() {
        let mut enc = Windows949Encoding.raw_encoder();
        assert_feed_err!(enc, "", "\u{ffff}", "", []);
        assert_feed_err!(enc, "?", "\u{ffff}", "!", [0x3f]);
        assert_feed_err!(enc, "?", "\u{fffd}", "!", [0x3f]); // for invalid table entries
        assert_finish_ok!(enc, []);
    }

    // Complete byte sequences decode to the expected characters.
    #[test]
    fn test_decoder_valid() {
        let mut dec = Windows949Encoding.raw_decoder();
        assert_feed_ok!(dec, [0x41], [], "A");
        assert_feed_ok!(dec, [0x42, 0x43], [], "BC");
        assert_feed_ok!(dec, [], [], "");
        assert_feed_ok!(dec, [0xb0, 0xa1], [], "\u{ac00}");
        assert_feed_ok!(dec, [0xb3, 0xaa, 0xb4, 0xd9], [], "\u{b098}\u{b2e4}");
        assert_feed_ok!(
            dec,
            [0x94, 0xee, 0xa4, 0xbb, 0xc6, 0x52, 0xc1, 0x64],
            [],
            "\u{bdc1}\u{314b}\u{d7a3}\u{d58f}"
        );
        assert_finish_ok!(dec, "");
    }

    // A pair split across two feeds is completed by the second feed.
    #[test]
    fn test_decoder_valid_partial() {
        let mut dec = Windows949Encoding.raw_decoder();
        assert_feed_ok!(dec, [], [0xb0], "");
        assert_feed_ok!(dec, [0xa1], [], "\u{ac00}");
        assert_feed_ok!(dec, [0xb3, 0xaa], [0xb4], "\u{b098}");
        assert_feed_ok!(dec, [0xd9], [0x94], "\u{b2e4}");
        assert_feed_ok!(dec, [0xee, 0xa4, 0xbb], [0xc6], "\u{bdc1}\u{314b}");
        assert_feed_ok!(dec, [0x52, 0xc1, 0x64], [], "\u{d7a3}\u{d58f}");
        assert_finish_ok!(dec, "");
    }

    // A lone lead byte is only reported at finish; 0x80 and 0xff fail immediately.
    #[test]
    fn test_decoder_invalid_lone_lead_immediate_test_finish() {
        for lead in 0x81..=0xfe {
            let mut dec = Windows949Encoding.raw_decoder();
            assert_feed_ok!(dec, [], [lead], ""); // wait for a trail
            assert_finish_err!(dec, "");
        }

        // 80/FF: immediate failure
        let mut dec = Windows949Encoding.raw_decoder();
        assert_feed_err!(dec, [], [0x80], [], "");
        assert_feed_err!(dec, [], [0xff], [], "");
        assert_finish_ok!(dec, "");
    }

    // Any non-ASCII byte followed by a space is an error that leaves the space unprocessed.
    #[test]
    fn test_decoder_invalid_lone_lead_followed_by_space() {
        for byte in 0x80..=0xffu8 {
            let mut dec = Windows949Encoding.raw_decoder();
            assert_feed_err!(dec, [], [byte], [0x20], "");
            assert_finish_ok!(dec, "");
        }
    }

    // A lead byte followed by 0x80 or 0xff is reported together with that trail byte.
    #[test]
    fn test_decoder_invalid_lead_followed_by_invalid_trail() {
        // should behave similarly to Big5.
        // https://www.w3.org/Bugs/Public/show_bug.cgi?id=16691
        for lead in 0x81..=0xfe {
            // both bytes in a single feed
            let mut dec = Windows949Encoding.raw_decoder();
            assert_feed_err!(dec, [], [lead, 0x80], [0x20], "");
            assert_feed_err!(dec, [], [lead, 0xff], [0x20], "");
            assert_finish_ok!(dec, "");

            // lead and trail split across feeds
            let mut dec = Windows949Encoding.raw_decoder();
            assert_feed_ok!(dec, [], [lead], "");
            assert_feed_err!(dec, [], [0x80], [0x20], "");
            assert_feed_ok!(dec, [], [lead], "");
            assert_feed_err!(dec, [], [0xff], [0x20], "");
            assert_finish_ok!(dec, "");
        }

        // 0x80 and 0xff are not lead bytes, so each one fails on its own
        let mut dec = Windows949Encoding.raw_decoder();
        assert_feed_err!(dec, [], [0x80], [0x80], "");
        assert_feed_err!(dec, [], [0x80], [0xff], "");
        assert_feed_err!(dec, [], [0xff], [0x80], "");
        assert_feed_err!(dec, [], [0xff], [0xff], "");
        assert_finish_ok!(dec, "");
    }

    #[test]
    fn test_decoder_invalid_boundary() {
        // U+D7A3 (C6 52) is the last Hangul syllable not in KS X 1001; C6 53 is invalid.
        // Since the trail byte may coincide with ASCII, the trail byte 53 is not
        // considered part of the problem. This is compatible with the WHATWG Encoding
        // standard.
        let mut dec = Windows949Encoding.raw_decoder();
        assert_feed_ok!(dec, [], [0xc6], "");
        assert_feed_err!(dec, [], [], [0x53], "");
        assert_finish_ok!(dec, "");
    }

    // After a failed finish the decoder can be fed again from a clean state.
    #[test]
    fn test_decoder_feed_after_finish() {
        let mut dec = Windows949Encoding.raw_decoder();
        assert_feed_ok!(dec, [0xb0, 0xa1], [0xb0], "\u{ac00}");
        assert_finish_err!(dec, "");
        assert_feed_ok!(dec, [0xb0, 0xa1], [], "\u{ac00}");
        assert_finish_ok!(dec, "");
    }

    // Throughput of strict encoding of the shared Korean sample text.
    #[bench]
    fn bench_encode_short_text(bencher: &mut test::Bencher) {
        let text = testutils::KOREAN_TEXT;
        bencher.bytes = text.len() as u64;
        bencher.iter(|| test::black_box(Windows949Encoding.encode(text, EncoderTrap::Strict)))
    }

    // Throughput of strict decoding of the encoded Korean sample text.
    #[bench]
    fn bench_decode_short_text(bencher: &mut test::Bencher) {
        let encoded = Windows949Encoding
            .encode(testutils::KOREAN_TEXT, EncoderTrap::Strict)
            .ok()
            .unwrap();
        bencher.bytes = encoded.len() as u64;
        bencher.iter(|| test::black_box(Windows949Encoding.decode(&encoded, DecoderTrap::Strict)))
    }
}