encoding_types/lib.rs
1// This is a part of encoding-next.
2// Copyright (c) 2013-2015, Kang Seonghoon.
3// See README.md and LICENSE.txt for details.
4
5/*!
6 * Interface to the character encoding.
7 *
8 * # Raw incremental interface
9 *
 * Methods whose names start with `raw_` constitute the raw incremental interface,
 * the lowest-level API available for encoders and decoders.
 * This interface divides the entire input into four parts:
13 *
14 * - **Processed** bytes do not affect the future result.
15 * - **Unprocessed** bytes may affect the future result
16 * and can be a part of problematic sequence according to the future input.
17 * - **Problematic** byte is the first byte that causes an error condition.
18 * - **Remaining** bytes are not yet processed nor read,
19 * so the caller should feed any remaining bytes again.
20 *
21 * The following figure illustrates an example of successive `raw_feed` calls:
22 *
 * ````notrust
 * 1st raw_feed   :2nd raw_feed   :3rd raw_feed
 * ----------+----:---------------:--+--+---------
 *           |    :               :  |  |
 * ----------+----:---------------:--+--+---------
 * processed  unprocessed             |  remaining
 *                               problematic
 * ````
31 *
 * Since these parts can span multiple input sequences given to `raw_feed`,
 * `raw_feed` returns two offsets (one optional)
 * with which the caller can track the problematic sequence.
35 * The first offset (the first `usize` in the tuple) points to the first unprocessed bytes,
36 * or is zero when unprocessed bytes have started before the current call.
37 * (The first unprocessed byte can also be at offset 0,
38 * which doesn't make a difference for the caller.)
39 * The second offset (`upto` field in the `CodecError` struct), if any,
40 * points to the first remaining bytes.
41 *
42 * If the caller needs to recover the error via the problematic sequence,
43 * then the caller starts to save the unprocessed bytes when the first offset < the input length,
44 * appends any new unprocessed bytes while the first offset is zero,
45 * and discards unprocessed bytes when first offset becomes non-zero
46 * while saving new unprocessed bytes when the first offset < the input length.
47 * Then the caller checks for the error condition
48 * and can use the saved unprocessed bytes for error recovery.
49 * Alternatively, if the caller only wants to replace the problematic sequence
50 * with a fixed string (like U+FFFD),
51 * then it can just discard the first sequence and can emit the fixed string on an error.
52 * It still has to feed the input bytes starting at the second offset again.
53 */
54
55use std::borrow::Cow;
56use std::fmt;
57
/// Error information from either encoder or decoder.
///
/// The common traits are derived so that the error can be printed with `{:?}`,
/// duplicated, and compared in tests.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CodecError {
    /// The byte position of the first remaining byte, with respect to the *current* input.
    /// For the `finish` call, this should be no more than zero (since there is no input).
    /// It can be negative if the remaining byte is in the prior inputs,
    /// as long as the remaining byte is not yet processed.
    /// The caller should feed the bytes starting from this point again
    /// in order to continue encoding or decoding after an error.
    pub upto: isize,
    /// A human-readable cause of the error.
    pub cause: Cow<'static, str>,
}
70
/// Sink for the bytes produced by encoders.
/// The usual implementor is an owned `Vec<u8>`.
pub trait ByteWriter {
    /// Announces an expected lower bound on the number of bytes that will be written
    /// before the next call to `writer_hint`, giving the writer a chance to reserve memory.
    /// `RawEncoder`s are encouraged, but not obliged, to call this with a sensible estimate.
    /// The default implementation ignores the hint.
    fn writer_hint(&mut self, _expectedlen: usize) {}

    /// Appends one byte to the output.
    fn write_byte(&mut self, b: u8);

    /// Appends all bytes of `v` to the output.
    fn write_bytes(&mut self, v: &[u8]);

    /// Exposes the writer as `Some(&mut Vec<u8>)` when it actually is a `Vec<u8>`.
    /// The default implementation returns `None`.
    fn as_mut_vec(&mut self) -> Option<&mut Vec<u8>> {
        None
    }
}
93
94impl ByteWriter for Vec<u8> {
95 fn writer_hint(&mut self, expectedlen: usize) {
96 self.reserve(expectedlen);
97 }
98
99 fn write_byte(&mut self, b: u8) {
100 self.push(b);
101 }
102
103 fn write_bytes(&mut self, v: &[u8]) {
104 self.extend_from_slice(v);
105 }
106
107 fn as_mut_vec(&mut self) -> Option<&mut Vec<u8>> {
108 Some(self)
109 }
110}
111
/// Sink for the text produced by decoders.
/// The usual implementor is an owned `String`.
pub trait StringWriter {
    /// Announces an expected lower bound on the number of bytes that will be written
    /// before the next call to `writer_hint`, giving the writer a chance to reserve memory.
    /// `RawDecoder`s are encouraged, but not obliged, to call this with a sensible estimate.
    /// The default implementation ignores the hint.
    fn writer_hint(&mut self, _expectedlen: usize) {}

    /// Appends one character to the output.
    fn write_char(&mut self, c: char);

    /// Appends the whole string `s` to the output.
    fn write_str(&mut self, s: &str);

    /// Exposes the writer as `Some(&mut String)` when it actually is a `String`.
    /// The default implementation returns `None`.
    fn as_mut_string(&mut self) -> Option<&mut String> {
        None
    }
}
134
135impl StringWriter for String {
136 fn writer_hint(&mut self, expectedlen: usize) {
137 self.reserve(expectedlen);
138 }
139
140 fn write_char(&mut self, c: char) {
141 self.push(c);
142 }
143
144 fn write_str(&mut self, s: &str) {
145 self.push_str(s);
146 }
147
148 fn as_mut_string(&mut self) -> Option<&mut String> {
149 Some(self)
150 }
151}
152
153/// Encoder converting a Unicode string into a byte sequence.
154/// This is a lower level interface, and normally `Encoding::encode` should be used instead.
155pub trait RawEncoder: Send + 'static {
156 /// Creates a fresh `RawEncoder` instance which parameters are same as `self`.
157 #[allow(clippy::wrong_self_convention)]
158 fn from_self(&self) -> Box<dyn RawEncoder>;
159
160 /// Returns true if this encoding is compatible to ASCII,
161 /// i.e. U+0000 through U+007F always map to bytes 00 through 7F and nothing else.
162 fn is_ascii_compatible(&self) -> bool {
163 false
164 }
165
166 /// Feeds given portion of string to the encoder,
167 /// pushes the an encoded byte sequence at the end of the given output,
168 /// and returns a byte offset to the first unprocessed character
169 /// (that can be zero when the first such character appeared in the prior calls to `raw_feed`)
170 /// and optional error information (None means success).
171 fn raw_feed(&mut self, input: &str, output: &mut dyn ByteWriter)
172 -> (usize, Option<CodecError>);
173
174 /// Finishes the encoder,
175 /// pushes the an encoded byte sequence at the end of the given output,
176 /// and returns optional error information (None means success).
177 /// `remaining` value of the error information, if any, is always an empty string.
178 fn raw_finish(&mut self, output: &mut dyn ByteWriter) -> Option<CodecError>;
179}
180
181/// Decoder converting a byte sequence into a Unicode string.
182/// This is a lower level interface, and normally `Encoding::decode` should be used instead.
183pub trait RawDecoder: Send + 'static {
184 /// Creates a fresh `RawDecoder` instance which parameters are same as `self`.
185 #[allow(clippy::wrong_self_convention)]
186 fn from_self(&self) -> Box<dyn RawDecoder>;
187
188 /// Returns true if this encoding is compatible to ASCII,
189 /// i.e. bytes 00 through 7F always map to U+0000 through U+007F and nothing else.
190 fn is_ascii_compatible(&self) -> bool {
191 false
192 }
193
194 /// Feeds given portion of byte sequence to the encoder,
195 /// pushes the a decoded string at the end of the given output,
196 /// and returns an offset to the first unprocessed byte
197 /// (that can be zero when the first such byte appeared in the prior calls to `raw_feed`)
198 /// and optional error information (None means success).
199 fn raw_feed(
200 &mut self,
201 input: &[u8],
202 output: &mut dyn StringWriter,
203 ) -> (usize, Option<CodecError>);
204
205 /// Finishes the decoder,
206 /// pushes the a decoded string at the end of the given output,
207 /// and returns optional error information (None means success).
208 fn raw_finish(&mut self, output: &mut dyn StringWriter) -> Option<CodecError>;
209}
210
211/// A trait object using dynamic dispatch which is a sendable reference to the encoding,
212/// for code where the encoding is not known at compile-time.
213pub type EncodingRef = &'static (dyn Encoding + Send + Sync);
214
215/// Character encoding.
216pub trait Encoding {
217 /// Returns the canonical name of given encoding.
218 /// This name is guaranteed to be unique across built-in encodings,
219 /// but it is not normative and would be at most arbitrary.
220 fn name(&self) -> &'static str;
221
222 /// Returns a name of given encoding defined in the WHATWG Encoding standard, if any.
223 /// This name often differs from `name` due to the compatibility reason.
224 fn whatwg_name(&self) -> Option<&'static str> {
225 None
226 }
227
228 /// Creates a new encoder.
229 fn raw_encoder(&self) -> Box<dyn RawEncoder>;
230
231 /// Creates a new decoder.
232 fn raw_decoder(&self) -> Box<dyn RawDecoder>;
233
234 /// An easy-to-use interface to `RawEncoder`.
235 /// On the encoder error `trap` is called,
236 /// which may return a replacement sequence to continue processing,
237 /// or a failure to return the error.
238 fn encode(&self, input: &str, trap: EncoderTrap) -> Result<Vec<u8>, Cow<'static, str>> {
239 let mut ret = Vec::new();
240 self.encode_to(input, trap, &mut ret).map(|_| ret)
241 }
242
243 /// Encode into a `dyn ByteWriter`.
244 fn encode_to(
245 &self,
246 input: &str,
247 trap: EncoderTrap,
248 ret: &mut dyn ByteWriter,
249 ) -> Result<(), Cow<'static, str>> {
250 // we don't need to keep `unprocessed` here;
251 // `raw_feed` should process as much input as possible.
252 let mut encoder = self.raw_encoder();
253 let mut remaining = 0;
254
255 loop {
256 let (offset, err) = encoder.raw_feed(&input[remaining..], ret);
257 let unprocessed = remaining + offset;
258 match err {
259 Some(err) => {
260 remaining = (remaining as isize + err.upto) as usize;
261 if !trap.trap(&mut *encoder, &input[unprocessed..remaining], ret) {
262 return Err(err.cause);
263 }
264 }
265 None => {
266 remaining = input.len();
267 match encoder.raw_finish(ret) {
268 Some(err) => {
269 remaining = (remaining as isize + err.upto) as usize;
270 if !trap.trap(&mut *encoder, &input[unprocessed..remaining], ret) {
271 return Err(err.cause);
272 }
273 }
274 None => {}
275 }
276 if remaining >= input.len() {
277 return Ok(());
278 }
279 }
280 }
281 }
282 }
283
284 /// An easy-to-use interface to `RawDecoder`.
285 /// On the decoder error `trap` is called,
286 /// which may return a replacement string to continue processing,
287 /// or a failure to return the error.
288 fn decode(&self, input: &[u8], trap: DecoderTrap) -> Result<String, Cow<'static, str>> {
289 let mut ret = String::new();
290 self.decode_to(input, trap, &mut ret).map(|_| ret)
291 }
292
293 /// Decode into a `StringWriter`.
294 ///
295 /// This does *not* handle partial characters at the beginning or end of `input`!
296 /// Use `RawDecoder` for incremental decoding.
297 fn decode_to(
298 &self,
299 input: &[u8],
300 trap: DecoderTrap,
301 ret: &mut dyn StringWriter,
302 ) -> Result<(), Cow<'static, str>> {
303 // we don't need to keep `unprocessed` here;
304 // `raw_feed` should process as much input as possible.
305 let mut decoder = self.raw_decoder();
306 let mut remaining = 0;
307
308 loop {
309 let (offset, err) = decoder.raw_feed(&input[remaining..], ret);
310 let unprocessed = remaining + offset;
311 match err {
312 Some(err) => {
313 remaining = (remaining as isize + err.upto) as usize;
314 if !trap.trap(&mut *decoder, &input[unprocessed..remaining], ret) {
315 return Err(err.cause);
316 }
317 }
318 None => {
319 remaining = input.len();
320 match decoder.raw_finish(ret) {
321 Some(err) => {
322 remaining = (remaining as isize + err.upto) as usize;
323 if !trap.trap(&mut *decoder, &input[unprocessed..remaining], ret) {
324 return Err(err.cause);
325 }
326 }
327 None => {}
328 }
329 if remaining >= input.len() {
330 return Ok(());
331 }
332 }
333 }
334 }
335 }
336}
337
338impl<'a> fmt::Debug for &'a dyn Encoding {
339 fn fmt(&self, fmt: &mut fmt::Formatter) -> Result<(), fmt::Error> {
340 fmt.write_str("Encoding(")?;
341 fmt.write_str(self.name())?;
342 fmt.write_str(")")?;
343 Ok(())
344 }
345}
346
347/// A type of the bare function in `EncoderTrap` values.
348pub type EncoderTrapFunc =
349 fn(encoder: &mut dyn RawEncoder, input: &str, output: &mut dyn ByteWriter) -> bool;
350
351/// A type of the bare function in `DecoderTrap` values.
352pub type DecoderTrapFunc =
353 fn(decoder: &mut dyn RawDecoder, input: &[u8], output: &mut dyn StringWriter) -> bool;
354
355/// Trap, which handles decoder errors.
356#[derive(Copy)]
357pub enum DecoderTrap {
358 /// Immediately fails on errors.
359 /// Corresponds to WHATWG "fatal" error algorithm.
360 Strict,
361 /// Replaces an error with a U+FFFD (decoder).
362 /// Corresponds to WHATWG "replacement" error algorithm.
363 Replace,
364 /// Silently ignores an error, effectively replacing it with an empty sequence.
365 Ignore,
366 /// Calls given function to handle decoder errors.
367 /// The function is given the current decoder, input and output writer,
368 /// and should return true only when it is fine to keep going.
369 Call(DecoderTrapFunc),
370}
371
372impl DecoderTrap {
373 /// Handles a decoder error. May write to the output writer.
374 /// Returns true only when it is fine to keep going.
375 pub fn trap(
376 &self,
377 decoder: &mut dyn RawDecoder,
378 input: &[u8],
379 output: &mut dyn StringWriter,
380 ) -> bool {
381 match *self {
382 DecoderTrap::Strict => false,
383 DecoderTrap::Replace => {
384 output.write_char('\u{fffd}');
385 true
386 }
387 DecoderTrap::Ignore => true,
388 DecoderTrap::Call(func) => func(decoder, input, output),
389 }
390 }
391}
392
393impl Clone for DecoderTrap {
394 fn clone(&self) -> DecoderTrap {
395 match *self {
396 DecoderTrap::Strict => DecoderTrap::Strict,
397 DecoderTrap::Replace => DecoderTrap::Replace,
398 DecoderTrap::Ignore => DecoderTrap::Ignore,
399 DecoderTrap::Call(f) => DecoderTrap::Call(f),
400 }
401 }
402}
403
404#[derive(Copy)]
405pub enum EncoderTrap {
406 /// Immediately fails on errors.
407 /// Corresponds to WHATWG "fatal" error algorithm.
408 Strict,
409 /// Replaces an error with `?` in given encoding.
410 /// Note that this fails when `?` cannot be represented in given encoding.
411 /// Corresponds to WHATWG "URL" error algorithms.
412 Replace,
413 /// Silently ignores an error, effectively replacing it with an empty sequence.
414 Ignore,
415 /// Replaces an error with XML numeric character references (e.g. `Ӓ`).
416 /// The encoder trap fails when NCRs cannot be represented in given encoding.
417 /// Corresponds to WHATWG "<form>" error algorithms.
418 NcrEscape,
419 /// Calls given function to handle encoder errors.
420 /// The function is given the current encoder, input and output writer,
421 /// and should return true only when it is fine to keep going.
422 Call(EncoderTrapFunc),
423}
424
425impl EncoderTrap {
426 /// Handles an encoder error. May write to the output writer.
427 /// Returns true only when it is fine to keep going.
428 pub fn trap(
429 &self,
430 encoder: &mut dyn RawEncoder,
431 input: &str,
432 output: &mut dyn ByteWriter,
433 ) -> bool {
434 fn reencode(
435 encoder: &mut dyn RawEncoder,
436 input: &str,
437 output: &mut dyn ByteWriter,
438 trapname: &str,
439 ) -> bool {
440 if encoder.is_ascii_compatible() {
441 // optimization!
442 output.write_bytes(input.as_bytes());
443 } else {
444 let (_, err) = encoder.raw_feed(input, output);
445 if err.is_some() {
446 panic!("{} cannot reencode a replacement string", trapname);
447 }
448 }
449 true
450 }
451
452 match *self {
453 EncoderTrap::Strict => false,
454 EncoderTrap::Replace => reencode(encoder, "?", output, "Replace"),
455 EncoderTrap::Ignore => true,
456 EncoderTrap::NcrEscape => {
457 let mut escapes = String::new();
458 for ch in input.chars() {
459 escapes.push_str(&format!("&#{};", ch as isize));
460 }
461 reencode(encoder, &escapes, output, "NcrEscape")
462 }
463 EncoderTrap::Call(func) => func(encoder, input, output),
464 }
465 }
466}
467
468impl Clone for EncoderTrap {
469 fn clone(&self) -> EncoderTrap {
470 match *self {
471 EncoderTrap::Strict => EncoderTrap::Strict,
472 EncoderTrap::Replace => EncoderTrap::Replace,
473 EncoderTrap::Ignore => EncoderTrap::Ignore,
474 EncoderTrap::NcrEscape => EncoderTrap::NcrEscape,
475 EncoderTrap::Call(f) => EncoderTrap::Call(f),
476 }
477 }
478}