1use crate::index_tradchinese as index;
8use crate::types::*;
9use crate::util::StrCharIndex;
10use std::convert::Into;
11use std::default::Default;
12
/// Zero-sized handle for the Big5 codec implemented in this module.
///
/// Its `Encoding` implementation reports the name `big5-2003` and the
/// WHATWG label `big5`, and hands out the raw encoder/decoder defined below.
#[derive(Copy, Clone)]
pub struct BigFive2003Encoding;
29
30impl Encoding for BigFive2003Encoding {
31 fn name(&self) -> &'static str {
32 "big5-2003"
33 }
34 fn whatwg_name(&self) -> Option<&'static str> {
35 Some("big5")
36 } fn raw_encoder(&self) -> Box<dyn RawEncoder> {
38 BigFive2003Encoder::new()
39 }
40 fn raw_decoder(&self) -> Box<dyn RawDecoder> {
41 BigFive2003HKSCS2008Decoder::new()
42 }
43}
44
/// Encoder half of the Big5-2003 codec.
///
/// It carries no state: every character is converted independently.
#[derive(Copy, Clone)]
pub struct BigFive2003Encoder;
48
49impl BigFive2003Encoder {
50 #[allow(clippy::new_ret_no_self)]
51 pub fn new() -> Box<dyn RawEncoder> {
52 Box::new(BigFive2003Encoder)
53 }
54}
55
56impl RawEncoder for BigFive2003Encoder {
57 fn from_self(&self) -> Box<dyn RawEncoder> {
58 BigFive2003Encoder::new()
59 }
60 fn is_ascii_compatible(&self) -> bool {
61 true
62 }
63
64 fn raw_feed(
65 &mut self,
66 input: &str,
67 output: &mut dyn ByteWriter,
68 ) -> (usize, Option<CodecError>) {
69 output.writer_hint(input.len());
70
71 for ((i, j), ch) in input.index_iter() {
72 if ch < '\u{80}' {
73 output.write_byte(ch as u8);
74 } else {
75 let ptr = index::big5::backward(ch as u32);
76 if ptr == 0xffff {
77 return (
78 i,
79 Some(CodecError {
80 upto: j as isize,
81 cause: "unrepresentable character".into(),
82 }),
83 );
84 }
85 let lead = ptr / 157 + 0x81;
86 let trail = ptr % 157;
87 let trailoffset = if trail < 0x3f { 0x40 } else { 0x62 };
88 output.write_byte(lead as u8);
89 output.write_byte((trail + trailoffset) as u8);
90 }
91 }
92 (input.len(), None)
93 }
94
95 fn raw_finish(&mut self, _output: &mut dyn ByteWriter) -> Option<CodecError> {
96 None
97 }
98}
99
100#[derive(Clone, Copy)]
102struct BigFive2003HKSCS2008Decoder {
103 st: bigfive2003::State,
104}
105
106impl BigFive2003HKSCS2008Decoder {
107 #[allow(clippy::new_ret_no_self)]
108 pub fn new() -> Box<dyn RawDecoder> {
109 Box::new(BigFive2003HKSCS2008Decoder {
110 st: Default::default(),
111 })
112 }
113}
114
115impl RawDecoder for BigFive2003HKSCS2008Decoder {
116 fn from_self(&self) -> Box<dyn RawDecoder> {
117 BigFive2003HKSCS2008Decoder::new()
118 }
119 fn is_ascii_compatible(&self) -> bool {
120 true
121 }
122
123 fn raw_feed(
124 &mut self,
125 input: &[u8],
126 output: &mut dyn StringWriter,
127 ) -> (usize, Option<CodecError>) {
128 let (st, processed, err) = bigfive2003::raw_feed(self.st, input, output, &());
129 self.st = st;
130 (processed, err)
131 }
132
133 fn raw_finish(&mut self, output: &mut dyn StringWriter) -> Option<CodecError> {
134 let (st, err) = bigfive2003::raw_finish(self.st, output, &());
135 self.st = st;
136 err
137 }
138}
139
140stateful_decoder! {
141 module bigfive2003;
142
143 internal pub fn map_two_bytes(lead: u8, trail: u8) -> u32 {
144 use crate::index_tradchinese as index;
145
146 let lead = lead as u16;
147 let trail = trail as u16;
148 let index = match (lead, trail) {
149 (0x81..=0xfe, 0x40..=0x7e) | (0x81..=0xfe, 0xa1..=0xfe) => {
150 let trailoffset = if trail < 0x7f {0x40} else {0x62};
151 (lead - 0x81) * 157 + trail - trailoffset
152 }
153 _ => 0xffff,
154 };
155 index::big5::forward(index) }
157
158initial:
159 state S0(ctx: Context) {
161 case b @ 0x00..=0x7f => ctx.emit(b as u32);
162 case b @ 0x81..=0xfe => S1(ctx, b);
163 case _ => ctx.err("invalid sequence");
164 }
165
166transient:
167 state S1(ctx: Context, lead: u8) {
169 case b => match map_two_bytes(lead, b) {
170 0xffff => {
171 let backup = if b < 0x80 {1} else {0};
172 ctx.backup_and_err(backup, "invalid sequence")
173 },
174 0 => ctx.emit_str("\u{ca}\u{304}"),
175 1 => ctx.emit_str("\u{ca}\u{30c}"),
176 2 => ctx.emit_str("\u{ea}\u{304}"),
177 3 => ctx.emit_str("\u{ea}\u{30c}"),
178 ch => ctx.emit(ch),
179 };
180 }
181}
182
#[cfg(test)]
mod bigfive2003_tests {
    extern crate test;
    use super::BigFive2003Encoding;
    use crate::testutils;
    use crate::types::*;

    /// Representable text encodes to the expected byte sequences.
    #[test]
    fn test_encoder_valid() {
        let mut encoder = BigFive2003Encoding.raw_encoder();
        assert_feed_ok!(encoder, "A", "", [0x41]);
        assert_feed_ok!(encoder, "BC", "", [0x42, 0x43]);
        assert_feed_ok!(encoder, "", "", []);
        assert_feed_ok!(
            encoder,
            "\u{4e2d}\u{83ef}\u{6c11}\u{570b}",
            "",
            [0xa4, 0xa4, 0xb5, 0xd8, 0xa5, 0xc1, 0xb0, 0xea]
        );
        assert_feed_ok!(encoder, "1\u{20ac}/m", "", [0x31, 0xa3, 0xe1, 0x2f, 0x6d]);
        assert_feed_ok!(encoder, "\u{ffed}", "", [0xf9, 0xfe]);
        // U+2550 also decodes from [0xa2, 0xa4], but encodes to [0xf9, 0xf9].
        assert_feed_ok!(encoder, "\u{2550}", "", [0xf9, 0xf9]);
        assert_finish_ok!(encoder, []);
    }

    /// Unrepresentable characters stop the encoder with an error.
    #[test]
    fn test_encoder_invalid() {
        let mut encoder = BigFive2003Encoding.raw_encoder();
        assert_feed_err!(encoder, "", "\u{ffff}", "", []);
        assert_feed_err!(encoder, "?", "\u{ffff}", "!", [0x3f]);
        // U+3EEC is decodable (see test_decoder_valid) but must not be encoded.
        assert_feed_err!(encoder, "", "\u{3eec}", "\u{4e00}", []);
        assert_finish_ok!(encoder, []);
    }

    /// Well-formed input decodes correctly, including across feed boundaries.
    #[test]
    fn test_decoder_valid() {
        let mut decoder = BigFive2003Encoding.raw_decoder();
        assert_feed_ok!(decoder, [0x41], [], "A");
        assert_feed_ok!(decoder, [0x42, 0x43], [], "BC");
        assert_feed_ok!(decoder, [], [], "");
        assert_feed_ok!(
            decoder,
            [0xa4, 0xa4, 0xb5, 0xd8, 0xa5, 0xc1, 0xb0, 0xea],
            [],
            "\u{4e2d}\u{83ef}\u{6c11}\u{570b}"
        );
        // A lead byte at the end of a chunk is held until the next feed.
        assert_feed_ok!(decoder, [], [0xa4], "");
        assert_feed_ok!(decoder, [0xa4, 0xb5, 0xd8], [0xa5], "\u{4e2d}\u{83ef}");
        assert_feed_ok!(decoder, [0xc1, 0xb0, 0xea], [], "\u{6c11}\u{570b}");
        assert_feed_ok!(decoder, [0x31, 0xa3, 0xe1, 0x2f, 0x6d], [], "1\u{20ac}/m");
        assert_feed_ok!(decoder, [0xf9, 0xfe], [], "\u{ffed}");
        assert_feed_ok!(decoder, [0xf9, 0xf9], [], "\u{2550}");
        assert_feed_ok!(decoder, [0xa2, 0xa4], [], "\u{2550}");
        // Lead byte below 0xa1: accepted by the decoder only.
        assert_feed_ok!(decoder, [0x87, 0x7e], [], "\u{3eec}");
        // Four byte pairs that each decode to two characters.
        assert_feed_ok!(
            decoder,
            [0x88, 0x62, 0x88, 0x64, 0x88, 0xa3, 0x88, 0xa5],
            [],
            "\u{ca}\u{304}\u{00ca}\u{30c}\u{ea}\u{304}\u{ea}\u{30c}"
        );
        assert_finish_ok!(decoder, "");
    }

    /// A lead byte with nothing after it fails at finish time.
    #[test]
    fn test_decoder_invalid_lone_lead_immediate_test_finish() {
        for lead in 0x81..0xff {
            let mut decoder = BigFive2003Encoding.raw_decoder();
            // The lead is accepted while the decoder waits for a trail byte.
            assert_feed_ok!(decoder, [], [lead], "");
            assert_finish_err!(decoder, "");
        }

        // 0x80 and 0xff are rejected immediately, so nothing is left pending.
        let mut decoder = BigFive2003Encoding.raw_decoder();
        assert_feed_err!(decoder, [], [0x80], [], "");
        assert_feed_err!(decoder, [], [0xff], [], "");
        assert_finish_ok!(decoder, "");
    }

    /// A high byte followed by a space errors and leaves the space unprocessed.
    #[test]
    fn test_decoder_invalid_lone_lead_followed_by_space() {
        for code in 0x80..0x100 {
            let lead = code as u8;
            let mut decoder = BigFive2003Encoding.raw_decoder();
            assert_feed_err!(decoder, [], [lead], [0x20], "");
            assert_finish_ok!(decoder, "");
        }
    }

    /// Trail bytes 0x80 and 0xff are invalid after any lead byte.
    #[test]
    fn test_decoder_invalid_lead_followed_by_invalid_trail() {
        for lead in 0x81..0xff {
            // Lead and bad trail delivered in a single feed.
            let mut decoder = BigFive2003Encoding.raw_decoder();
            assert_feed_err!(decoder, [], [lead, 0x80], [0x20], "");
            assert_feed_err!(decoder, [], [lead, 0xff], [0x20], "");
            assert_finish_ok!(decoder, "");

            // Same sequences, split across two feeds.
            let mut decoder = BigFive2003Encoding.raw_decoder();
            assert_feed_ok!(decoder, [], [lead], "");
            assert_feed_err!(decoder, [], [0x80], [0x20], "");
            assert_feed_ok!(decoder, [], [lead], "");
            assert_feed_err!(decoder, [], [0xff], [0x20], "");
            assert_finish_ok!(decoder, "");
        }

        // 0x80 and 0xff fail as leads, leaving the following byte unprocessed.
        let mut decoder = BigFive2003Encoding.raw_decoder();
        assert_feed_err!(decoder, [], [0x80], [0x80], "");
        assert_feed_err!(decoder, [], [0x80], [0xff], "");
        assert_feed_err!(decoder, [], [0xff], [0x80], "");
        assert_feed_err!(decoder, [], [0xff], [0xff], "");
        assert_finish_ok!(decoder, "");
    }

    /// The decoder is usable again after a failed finish.
    #[test]
    fn test_decoder_feed_after_finish() {
        let mut decoder = BigFive2003Encoding.raw_decoder();
        assert_feed_ok!(decoder, [0xa4, 0x40], [0xa4], "\u{4e00}");
        assert_finish_err!(decoder, "");
        assert_feed_ok!(decoder, [0xa4, 0x40], [], "\u{4e00}");
        assert_finish_ok!(decoder, "");
    }

    #[bench]
    fn bench_encode_short_text(bencher: &mut test::Bencher) {
        let text = testutils::TRADITIONAL_CHINESE_TEXT;
        bencher.bytes = text.len() as u64;
        bencher.iter(|| test::black_box(BigFive2003Encoding.encode(text, EncoderTrap::Strict)))
    }

    #[bench]
    fn bench_decode_short_text(bencher: &mut test::Bencher) {
        let encoded = BigFive2003Encoding
            .encode(testutils::TRADITIONAL_CHINESE_TEXT, EncoderTrap::Strict)
            .ok()
            .unwrap();
        bencher.bytes = encoded.len() as u64;
        bencher.iter(|| test::black_box(BigFive2003Encoding.decode(&encoded, DecoderTrap::Strict)))
    }
}